提交 dc92b1f9 编写于 作者: L Linus Torvalds

Merge branch 'virtio-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux

Pull virtio changes from Rusty Russell:
 "New workflow: same git trees pulled by linux-next get sent straight to
  Linus.  Git is awkward at shuffling patches compared with quilt or mq,
  but that doesn't happen often once things get into my -next branch."

* 'virtio-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (24 commits)
  lguest: fix occasional crash in example launcher.
  virtio-blk: Disable callback in virtblk_done()
  virtio_mmio: Don't attempt to create empty virtqueues
  virtio_mmio: fix off by one error allocating queue
  drivers/virtio/virtio_pci.c: fix error return code
  virtio: don't crash when device is buggy
  virtio: remove CONFIG_VIRTIO_RING
  virtio: add help to CONFIG_VIRTIO option.
  virtio: support reserved vqs
  virtio: introduce an API to set affinity for a virtqueue
  virtio-ring: move queue_index to vring_virtqueue
  virtio_balloon: not EXPERIMENTAL any more.
  virtio-balloon: dependency fix
  virtio-blk: fix NULL checking in virtblk_alloc_req()
  virtio-blk: Add REQ_FLUSH and REQ_FUA support to bio path
  virtio-blk: Add bio-based IO path for virtio-blk
  virtio: console: fix error handling in init() function
  tools: Fix pthread flag for Makefile of trace-agent used by virtio-trace
  tools: Add guest trace agent as a user tool
  virtio/console: Allocate scatterlist according to the current pipe size
  ...
......@@ -656,7 +656,6 @@ config S390_GUEST
depends on 64BIT && EXPERIMENTAL
select VIRTUALIZATION
select VIRTIO
select VIRTIO_RING
select VIRTIO_CONSOLE
help
Enabling this option adds support for virtio based paravirtual device
......
......@@ -4,7 +4,6 @@ config LGUEST_GUEST
depends on X86_32
select VIRTUALIZATION
select VIRTIO
select VIRTIO_RING
select VIRTIO_CONSOLE
help
Lguest is a tiny in-kernel hypervisor. Selecting this will
......
......@@ -14,6 +14,9 @@
#define PART_BITS 4
static bool use_bio;
module_param(use_bio, bool, S_IRUGO);
static int major;
static DEFINE_IDA(vd_index_ida);
......@@ -23,6 +26,7 @@ struct virtio_blk
{
struct virtio_device *vdev;
struct virtqueue *vq;
wait_queue_head_t queue_wait;
/* The disk structure for the kernel. */
struct gendisk *disk;
......@@ -51,53 +55,244 @@ struct virtio_blk
/*
 * Per-I/O bookkeeping, allocated from vblk->pool and passed to the
 * virtqueue as the buffer token; virtblk_done() gets it back from
 * virtqueue_get_buf().
 */
struct virtblk_req
{
/* Block-layer request for the request-based path (set by do_req()). */
struct request *req;
/* Bio for the bio-based path; NULL on the request-based path, which is
 * how virtblk_done() tells the two paths apart. */
struct bio *bio;
/* Header placed in the first ("out") scatterlist entry. */
struct virtio_blk_outhdr out_hdr;
/* residual/sense_len/errors are copied from here into the request for
 * REQ_TYPE_BLOCK_PC commands. */
struct virtio_scsi_inhdr in_hdr;
/* Used to defer the follow-up data/flush submission to virtblk_wq. */
struct work_struct work;
/* Owning device, filled in by virtblk_alloc_req(). */
struct virtio_blk *vblk;
/* Bitmask of VBLK_* values. */
int flags;
/* Completion status, interpreted by virtblk_result(). */
u8 status;
/* Per-request scatterlist; the pool only reserves room for it (sg_elems
 * entries) when use_bio is set. */
struct scatterlist sg[];
};
/* Bits stored in virtblk_req.flags (bio-based path). */
enum {
/* The command currently in flight for this vbr is a flush. */
VBLK_IS_FLUSH = 1,
/* The bio had REQ_FLUSH set: a flush is sent first. */
VBLK_REQ_FLUSH = 2,
/* The bio has a non-zero bi_size: there is data to send. */
VBLK_REQ_DATA = 4,
/* The bio had REQ_FUA set: a flush is sent after the data completes. */
VBLK_REQ_FUA = 8,
};
static void blk_done(struct virtqueue *vq)
/*
 * Map the status byte stored in @vbr to an errno-style result:
 * 0 for VIRTIO_BLK_S_OK, -ENOTTY for VIRTIO_BLK_S_UNSUPP, and -EIO for
 * every other value.
 */
static inline int virtblk_result(struct virtblk_req *vbr)
{
	if (vbr->status == VIRTIO_BLK_S_OK)
		return 0;

	if (vbr->status == VIRTIO_BLK_S_UNSUPP)
		return -ENOTTY;

	return -EIO;
}
static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
gfp_t gfp_mask)
{
struct virtio_blk *vblk = vq->vdev->priv;
struct virtblk_req *vbr;
unsigned int len;
unsigned long flags;
spin_lock_irqsave(vblk->disk->queue->queue_lock, flags);
while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
int error;
vbr = mempool_alloc(vblk->pool, gfp_mask);
if (!vbr)
return NULL;
switch (vbr->status) {
case VIRTIO_BLK_S_OK:
error = 0;
break;
case VIRTIO_BLK_S_UNSUPP:
error = -ENOTTY;
break;
default:
error = -EIO;
vbr->vblk = vblk;
if (use_bio)
sg_init_table(vbr->sg, vblk->sg_elems);
return vbr;
}
static void virtblk_add_buf_wait(struct virtio_blk *vblk,
struct virtblk_req *vbr,
unsigned long out,
unsigned long in)
{
DEFINE_WAIT(wait);
for (;;) {
prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
TASK_UNINTERRUPTIBLE);
spin_lock_irq(vblk->disk->queue->queue_lock);
if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
GFP_ATOMIC) < 0) {
spin_unlock_irq(vblk->disk->queue->queue_lock);
io_schedule();
} else {
virtqueue_kick(vblk->vq);
spin_unlock_irq(vblk->disk->queue->queue_lock);
break;
}
switch (vbr->req->cmd_type) {
case REQ_TYPE_BLOCK_PC:
vbr->req->resid_len = vbr->in_hdr.residual;
vbr->req->sense_len = vbr->in_hdr.sense_len;
vbr->req->errors = vbr->in_hdr.errors;
break;
case REQ_TYPE_SPECIAL:
vbr->req->errors = (error != 0);
break;
default:
break;
}
finish_wait(&vblk->queue_wait, &wait);
}
/*
 * Queue @vbr's scatterlist (@out readable entries followed by @in writable
 * entries) on the device virtqueue and kick it.  The queue lock is held
 * around the add and the kick.  If the add fails, the lock is dropped and
 * the request is handed to virtblk_add_buf_wait() instead.
 */
static inline void virtblk_add_req(struct virtblk_req *vbr,
				   unsigned int out, unsigned int in)
{
	struct virtio_blk *vblk = vbr->vblk;
	int added;

	spin_lock_irq(vblk->disk->queue->queue_lock);
	added = virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, GFP_ATOMIC);
	if (likely(added >= 0)) {
		virtqueue_kick(vblk->vq);
		spin_unlock_irq(vblk->disk->queue->queue_lock);
		return;
	}
	spin_unlock_irq(vblk->disk->queue->queue_lock);

	/* Slow path: hand over to the waiting variant. */
	virtblk_add_buf_wait(vblk, vbr, out, in);
}
/*
 * Submit a flush command for @vbr: one readable entry carrying the
 * header and one writable entry for the status byte.  Marks the request
 * with VBLK_IS_FLUSH so the completion path knows which phase finished.
 * Always returns 0.
 */
static int virtblk_bio_send_flush(struct virtblk_req *vbr)
{
	struct scatterlist *hdr_sg = &vbr->sg[0];
	struct scatterlist *status_sg = &vbr->sg[1];

	vbr->flags |= VBLK_IS_FLUSH;

	vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
	vbr->out_hdr.sector = 0;
	vbr->out_hdr.ioprio = 0;

	sg_set_buf(hdr_sg, &vbr->out_hdr, sizeof(vbr->out_hdr));
	sg_set_buf(status_sg, &vbr->status, sizeof(vbr->status));

	virtblk_add_req(vbr, 1, 1);

	return 0;
}
/*
 * Submit the data phase of @vbr's bio.  Scatterlist layout: header at
 * index 0, the mapped bio segments after it, and the status byte right
 * behind the last segment.  For a write the segments count as readable
 * ("out") entries, otherwise as writable ("in") entries.  Clears
 * VBLK_IS_FLUSH so the completion is treated as a data completion.
 * Always returns 0.
 */
static int virtblk_bio_send_data(struct virtblk_req *vbr)
{
	struct virtio_blk *vblk = vbr->vblk;
	struct bio *bio = vbr->bio;
	unsigned int nsegs;
	unsigned int out = 1;	/* the header */
	unsigned int in = 1;	/* the status byte */

	vbr->flags &= ~VBLK_IS_FLUSH;

	vbr->out_hdr.type = 0;
	vbr->out_hdr.sector = bio->bi_sector;
	vbr->out_hdr.ioprio = bio_prio(bio);

	sg_set_buf(&vbr->sg[0], &vbr->out_hdr, sizeof(vbr->out_hdr));

	nsegs = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + 1);

	sg_set_buf(&vbr->sg[nsegs + 1], &vbr->status, sizeof(vbr->status));

	if (nsegs) {
		if (bio->bi_rw & REQ_WRITE) {
			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
			out += nsegs;
		} else {
			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
			in += nsegs;
		}
	}

	virtblk_add_req(vbr, out, in);

	return 0;
}
/* Work handler: send the data phase of the request embedding @work. */
static void virtblk_bio_send_data_work(struct work_struct *work)
{
	virtblk_bio_send_data(container_of(work, struct virtblk_req, work));
}
/* Work handler: send the flush phase of the request embedding @work. */
static void virtblk_bio_send_flush_work(struct work_struct *work)
{
	virtblk_bio_send_flush(container_of(work, struct virtblk_req, work));
}
/*
 * Complete a request-based I/O: copy the type-specific result fields
 * into the request, end the whole request with the translated status,
 * and return @vbr to the pool.
 */
static inline void virtblk_request_done(struct virtblk_req *vbr)
{
	struct request *req = vbr->req;
	struct virtio_blk *vblk = vbr->vblk;
	int error = virtblk_result(vbr);

	switch (req->cmd_type) {
	case REQ_TYPE_BLOCK_PC:
		req->resid_len = vbr->in_hdr.residual;
		req->sense_len = vbr->in_hdr.sense_len;
		req->errors = vbr->in_hdr.errors;
		break;
	case REQ_TYPE_SPECIAL:
		req->errors = (error != 0);
		break;
	default:
		break;
	}

	__blk_end_request_all(req, error);
	mempool_free(vbr, vblk->pool);
}
/*
 * A flush for @vbr has completed.  If the bio carries no data, end it
 * with the translated status and free @vbr; otherwise schedule the data
 * phase on virtblk_wq.
 */
static inline void virtblk_bio_flush_done(struct virtblk_req *vbr)
{
	struct virtio_blk *vblk = vbr->vblk;

	if (!(vbr->flags & VBLK_REQ_DATA)) {
		bio_endio(vbr->bio, virtblk_result(vbr));
		mempool_free(vbr, vblk->pool);
		return;
	}

	/* Send out the actual write data */
	INIT_WORK(&vbr->work, virtblk_bio_send_data_work);
	queue_work(virtblk_wq, &vbr->work);
}
static inline void virtblk_bio_data_done(struct virtblk_req *vbr)
{
struct virtio_blk *vblk = vbr->vblk;
__blk_end_request_all(vbr->req, error);
if (unlikely(vbr->flags & VBLK_REQ_FUA)) {
/* Send out a flush before end the bio */
vbr->flags &= ~VBLK_REQ_DATA;
INIT_WORK(&vbr->work, virtblk_bio_send_flush_work);
queue_work(virtblk_wq, &vbr->work);
} else {
bio_endio(vbr->bio, virtblk_result(vbr));
mempool_free(vbr, vblk->pool);
}
}
/* Dispatch a completed bio-based request to its flush or data handler. */
static inline void virtblk_bio_done(struct virtblk_req *vbr)
{
	if (unlikely(vbr->flags & VBLK_IS_FLUSH)) {
		virtblk_bio_flush_done(vbr);
		return;
	}

	virtblk_bio_data_done(vbr);
}
static void virtblk_done(struct virtqueue *vq)
{
struct virtio_blk *vblk = vq->vdev->priv;
bool bio_done = false, req_done = false;
struct virtblk_req *vbr;
unsigned long flags;
unsigned int len;
spin_lock_irqsave(vblk->disk->queue->queue_lock, flags);
do {
virtqueue_disable_cb(vq);
while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
if (vbr->bio) {
virtblk_bio_done(vbr);
bio_done = true;
} else {
virtblk_request_done(vbr);
req_done = true;
}
}
} while (!virtqueue_enable_cb(vq));
/* In case queue is stopped waiting for more buffers. */
blk_start_queue(vblk->disk->queue);
if (req_done)
blk_start_queue(vblk->disk->queue);
spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags);
if (bio_done)
wake_up(&vblk->queue_wait);
}
static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
......@@ -106,13 +301,13 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
unsigned long num, out = 0, in = 0;
struct virtblk_req *vbr;
vbr = mempool_alloc(vblk->pool, GFP_ATOMIC);
vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
if (!vbr)
/* When another request finishes we'll try again. */
return false;
vbr->req = req;
vbr->bio = NULL;
if (req->cmd_flags & REQ_FLUSH) {
vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
vbr->out_hdr.sector = 0;
......@@ -172,7 +367,8 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
}
}
if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, GFP_ATOMIC)<0) {
if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr,
GFP_ATOMIC) < 0) {
mempool_free(vbr, vblk->pool);
return false;
}
......@@ -180,7 +376,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
return true;
}
static void do_virtblk_request(struct request_queue *q)
static void virtblk_request(struct request_queue *q)
{
struct virtio_blk *vblk = q->queuedata;
struct request *req;
......@@ -203,6 +399,34 @@ static void do_virtblk_request(struct request_queue *q)
virtqueue_kick(vblk->vq);
}
/*
 * Entry point of the bio-based path.  Allocates a virtblk_req for @bio
 * (ending the bio with -ENOMEM if that fails), records which phases the
 * bio needs in vbr->flags, and starts with the flush phase when
 * REQ_FLUSH is set or the data phase otherwise.
 */
static void virtblk_make_request(struct request_queue *q, struct bio *bio)
{
	struct virtio_blk *vblk = q->queuedata;
	struct virtblk_req *vbr;
	int flags = 0;

	/* Header and status entries must fit next to the bio's segments. */
	BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems);

	vbr = virtblk_alloc_req(vblk, GFP_NOIO);
	if (!vbr) {
		bio_endio(bio, -ENOMEM);
		return;
	}

	if (bio->bi_rw & REQ_FLUSH)
		flags |= VBLK_REQ_FLUSH;
	if (bio->bi_rw & REQ_FUA)
		flags |= VBLK_REQ_FUA;
	if (bio->bi_size)
		flags |= VBLK_REQ_DATA;

	vbr->bio = bio;
	vbr->flags = flags;

	if (unlikely(flags & VBLK_REQ_FLUSH))
		virtblk_bio_send_flush(vbr);
	else
		virtblk_bio_send_data(vbr);
}
/* return id (s/n) string for *disk to *id_str
*/
static int virtblk_get_id(struct gendisk *disk, char *id_str)
......@@ -360,7 +584,7 @@ static int init_vq(struct virtio_blk *vblk)
int err = 0;
/* We expect one virtqueue, for output. */
vblk->vq = virtio_find_single_vq(vblk->vdev, blk_done, "requests");
vblk->vq = virtio_find_single_vq(vblk->vdev, virtblk_done, "requests");
if (IS_ERR(vblk->vq))
err = PTR_ERR(vblk->vq);
......@@ -477,6 +701,8 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
struct virtio_blk *vblk;
struct request_queue *q;
int err, index;
int pool_size;
u64 cap;
u32 v, blk_size, sg_elems, opt_io_size;
u16 min_io_size;
......@@ -506,10 +732,12 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
goto out_free_index;
}
init_waitqueue_head(&vblk->queue_wait);
vblk->vdev = vdev;
vblk->sg_elems = sg_elems;
sg_init_table(vblk->sg, vblk->sg_elems);
mutex_init(&vblk->config_lock);
INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
vblk->config_enable = true;
......@@ -517,7 +745,10 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
if (err)
goto out_free_vblk;
vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req));
pool_size = sizeof(struct virtblk_req);
if (use_bio)
pool_size += sizeof(struct scatterlist) * sg_elems;
vblk->pool = mempool_create_kmalloc_pool(1, pool_size);
if (!vblk->pool) {
err = -ENOMEM;
goto out_free_vq;
......@@ -530,12 +761,14 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
goto out_mempool;
}
q = vblk->disk->queue = blk_init_queue(do_virtblk_request, NULL);
q = vblk->disk->queue = blk_init_queue(virtblk_request, NULL);
if (!q) {
err = -ENOMEM;
goto out_put_disk;
}
if (use_bio)
blk_queue_make_request(q, virtblk_make_request);
q->queuedata = vblk;
virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
......@@ -620,7 +853,6 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
if (!err && opt_io_size)
blk_queue_io_opt(q, blk_size * opt_io_size);
add_disk(vblk->disk);
err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
if (err)
......
......@@ -24,6 +24,8 @@
#include <linux/err.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/splice.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/poll.h>
......@@ -474,26 +476,53 @@ static ssize_t send_control_msg(struct port *port, unsigned int event,
return 0;
}
/*
 * Token queued on the output virtqueue alongside each buffer, so that
 * reclaim_consumed_buffers() knows how to release what was sent.
 */
struct buffer_token {
union {
/* Plain buffer; released with kfree(). */
void *buf;
/* Scatterlist of pages; released with reclaim_sg_pages(). */
struct scatterlist *sg;
} u;
/* If sgpages == 0 then buf is used, else sg is used */
unsigned int sgpages;
};
/*
 * Drop one reference on each page in @sg, stopping at the first entry
 * with no page or after @nrpages entries, then free the @sg array itself.
 */
static void reclaim_sg_pages(struct scatterlist *sg, unsigned int nrpages)
{
	unsigned int idx = 0;

	while (idx < nrpages) {
		struct page *pg = sg_page(&sg[idx]);

		if (!pg)
			break;
		put_page(pg);
		idx++;
	}

	kfree(sg);
}
/* Callers must take the port->outvq_lock */
static void reclaim_consumed_buffers(struct port *port)
{
void *buf;
struct buffer_token *tok;
unsigned int len;
if (!port->portdev) {
/* Device has been unplugged. vqs are already gone. */
return;
}
while ((buf = virtqueue_get_buf(port->out_vq, &len))) {
kfree(buf);
while ((tok = virtqueue_get_buf(port->out_vq, &len))) {
if (tok->sgpages)
reclaim_sg_pages(tok->u.sg, tok->sgpages);
else
kfree(tok->u.buf);
kfree(tok);
port->outvq_full = false;
}
}
static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count,
bool nonblock)
static ssize_t __send_to_port(struct port *port, struct scatterlist *sg,
int nents, size_t in_count,
struct buffer_token *tok, bool nonblock)
{
struct scatterlist sg[1];
struct virtqueue *out_vq;
ssize_t ret;
unsigned long flags;
......@@ -505,8 +534,7 @@ static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count,
reclaim_consumed_buffers(port);
sg_init_one(sg, in_buf, in_count);
ret = virtqueue_add_buf(out_vq, sg, 1, 0, in_buf, GFP_ATOMIC);
ret = virtqueue_add_buf(out_vq, sg, nents, 0, tok, GFP_ATOMIC);
/* Tell Host to go! */
virtqueue_kick(out_vq);
......@@ -544,6 +572,37 @@ static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count,
return in_count;
}
/*
 * Send a single contiguous buffer to @port.  A buffer_token recording
 * @in_buf (with sgpages == 0) is allocated and queued with it.
 * Returns -ENOMEM if the token cannot be allocated, otherwise whatever
 * __send_to_port() returns.
 */
static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count,
			bool nonblock)
{
	struct buffer_token *tok = kmalloc(sizeof(*tok), GFP_ATOMIC);
	struct scatterlist sg[1];

	if (!tok)
		return -ENOMEM;

	/* sgpages == 0 marks this token as a plain buffer. */
	tok->u.buf = in_buf;
	tok->sgpages = 0;

	sg_init_one(sg, in_buf, in_count);

	return __send_to_port(port, sg, 1, in_count, tok, nonblock);
}
/*
 * Send a scatterlist of @nents pages to @port.  A buffer_token recording
 * @sg and its entry count is allocated and queued with it.
 * Returns -ENOMEM if the token cannot be allocated, otherwise whatever
 * __send_to_port() returns.
 */
static ssize_t send_pages(struct port *port, struct scatterlist *sg, int nents,
			  size_t in_count, bool nonblock)
{
	struct buffer_token *tok = kmalloc(sizeof(*tok), GFP_ATOMIC);

	if (!tok)
		return -ENOMEM;

	/* Non-zero sgpages marks this token as a page scatterlist. */
	tok->u.sg = sg;
	tok->sgpages = nents;

	return __send_to_port(port, sg, nents, in_count, tok, nonblock);
}
/*
* Give out the data that's requested from the buffer that we have
* queued up.
......@@ -665,6 +724,26 @@ static ssize_t port_fops_read(struct file *filp, char __user *ubuf,
return fill_readbuf(port, ubuf, count, true);
}
/*
 * Wait until a write to @port would not block.
 *
 * Returns -EAGAIN if the write would block and @nonblock is set, a
 * negative value if the wait itself fails, -ENODEV if the port is no
 * longer guest-connected afterwards, and 0 otherwise.
 */
static int wait_port_writable(struct port *port, bool nonblock)
{
	if (will_write_block(port)) {
		int ret;

		if (nonblock)
			return -EAGAIN;

		ret = wait_event_freezable(port->waitqueue,
					   !will_write_block(port));
		if (ret < 0)
			return ret;
	}

	/* Port got hot-unplugged. */
	return port->guest_connected ? 0 : -ENODEV;
}
static ssize_t port_fops_write(struct file *filp, const char __user *ubuf,
size_t count, loff_t *offp)
{
......@@ -681,18 +760,9 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf,
nonblock = filp->f_flags & O_NONBLOCK;
if (will_write_block(port)) {
if (nonblock)
return -EAGAIN;
ret = wait_event_freezable(port->waitqueue,
!will_write_block(port));
if (ret < 0)
return ret;
}
/* Port got hot-unplugged. */
if (!port->guest_connected)
return -ENODEV;
ret = wait_port_writable(port, nonblock);
if (ret < 0)
return ret;
count = min((size_t)(32 * 1024), count);
......@@ -725,6 +795,93 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf,
return ret;
}
/* Scatterlist being filled by pipe_to_sg() during a splice write. */
struct sg_list {
/* Number of entries of sg[] filled in so far. */
unsigned int n;
/* Capacity of sg[] in entries (taken from pipe->nrbufs). */
unsigned int size;
/* Total number of bytes described by the filled entries. */
size_t len;
/* The scatterlist array itself. */
struct scatterlist *sg;
};
/*
 * splice actor: append one pipe buffer to the scatterlist in sd->u.data.
 *
 * If the page can be stolen from the pipe it is referenced and placed in
 * the scatterlist directly; otherwise its contents are copied into a
 * newly allocated page.
 *
 * Returns the number of bytes consumed, 0 if the scatterlist is already
 * full, or -ENOMEM if the fallback page cannot be allocated.
 */
static int pipe_to_sg(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
		      struct splice_desc *sd)
{
	struct sg_list *sgl = sd->u.data;
	unsigned int offset, len;

	if (sgl->n == sgl->size)
		return 0;

	/* Try lock this page */
	if (buf->ops->steal(pipe, buf) == 0) {
		/* Get reference and unlock page for moving */
		get_page(buf->page);
		unlock_page(buf->page);

		len = min(buf->len, sd->len);
		sg_set_page(&(sgl->sg[sgl->n]), buf->page, len, buf->offset);
	} else {
		/* Fall back to copying the data into a new page */
		struct page *page = alloc_page(GFP_KERNEL);
		char *src;
		char *dst;

		/*
		 * Check the allocation before mapping the pipe buffer, so
		 * the error return does not leave a mapping behind.
		 */
		if (!page)
			return -ENOMEM;

		/*
		 * Take the destination mapping first and the pipe buffer
		 * mapping (requested with the atomic argument set) last, and
		 * release them in the opposite order.
		 */
		dst = kmap(page);
		src = buf->ops->map(pipe, buf, 1);

		offset = sd->pos & ~PAGE_MASK;

		len = sd->len;
		if (len + offset > PAGE_SIZE)
			len = PAGE_SIZE - offset;

		memcpy(dst + offset, src + buf->offset, len);

		buf->ops->unmap(pipe, buf, src);
		kunmap(page);

		sg_set_page(&(sgl->sg[sgl->n]), page, len, offset);
	}
	sgl->n++;
	sgl->len += len;

	return len;
}
/* Faster zero-copy write by splicing */
/*
 * Zero-copy write: gather the pipe's pages into a scatterlist and queue
 * them on the port in one go.
 *
 * Returns the result of send_pages() on success, 0 if the pipe holds no
 * buffers, a negative value from wait_port_writable() or
 * __splice_from_pipe(), or -ENOMEM if the scatterlist cannot be
 * allocated.
 */
static ssize_t port_fops_splice_write(struct pipe_inode_info *pipe,
				      struct file *filp, loff_t *ppos,
				      size_t len, unsigned int flags)
{
	struct port *port = filp->private_data;
	struct sg_list sgl;
	ssize_t ret;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.data = &sgl,
	};

	ret = wait_port_writable(port, filp->f_flags & O_NONBLOCK);
	if (ret < 0)
		return ret;

	/* Nothing to send; also keeps sg_init_table() away from 0 entries. */
	if (!pipe->nrbufs)
		return 0;

	sgl.n = 0;
	sgl.len = 0;
	sgl.size = pipe->nrbufs;
	sgl.sg = kmalloc(sizeof(struct scatterlist) * sgl.size, GFP_KERNEL);
	if (unlikely(!sgl.sg))
		return -ENOMEM;

	sg_init_table(sgl.sg, sgl.size);
	ret = __splice_from_pipe(pipe, &sd, pipe_to_sg);
	if (unlikely(ret <= 0)) {
		/*
		 * Nothing was handed to the port, so the array and any pages
		 * pipe_to_sg() already collected are still ours to release.
		 */
		reclaim_sg_pages(sgl.sg, sgl.n);
		return ret;
	}

	return send_pages(port, sgl.sg, sgl.n, sgl.len, true);
}
static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
{
struct port *port;
......@@ -856,6 +1013,7 @@ static const struct file_operations port_fops = {
.open = port_fops_open,
.read = port_fops_read,
.write = port_fops_write,
.splice_write = port_fops_splice_write,
.poll = port_fops_poll,
.release = port_fops_release,
.fasync = port_fops_fasync,
......
......@@ -263,6 +263,9 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
struct virtqueue *vq;
int err;
if (!name)
return NULL;
/* We must have this many virtqueues. */
if (index >= ldev->desc->num_vq)
return ERR_PTR(-ENOENT);
......@@ -296,7 +299,7 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
* to 'true': the host just a(nother) SMP CPU, so we only need inter-cpu
* barriers.
*/
vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, vdev,
vq = vring_new_virtqueue(index, lvq->config.num, LGUEST_VRING_ALIGN, vdev,
true, lvq->pages, lg_notify, callback, name);
if (!vq) {
err = -ENOMEM;
......
......@@ -84,6 +84,9 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev,
if (id >= ARRAY_SIZE(rvdev->vring))
return ERR_PTR(-EINVAL);
if (!name)
return NULL;
ret = rproc_alloc_vring(rvdev, id);
if (ret)
return ERR_PTR(ret);
......@@ -103,7 +106,7 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev,
* Create the new vq, and tell virtio we're not interested in
* the 'weak' smp barriers, since we're talking with a real device.
*/
vq = vring_new_virtqueue(len, rvring->align, vdev, false, addr,
vq = vring_new_virtqueue(id, len, rvring->align, vdev, false, addr,
rproc_virtio_notify, callback, name);
if (!vq) {
dev_err(dev, "vring_new_virtqueue %s failed\n", name);
......
......@@ -4,7 +4,6 @@ menu "Rpmsg drivers (EXPERIMENTAL)"
config RPMSG
tristate
select VIRTIO
select VIRTIO_RING
depends on EXPERIMENTAL
endmenu
......@@ -190,6 +190,9 @@ static struct virtqueue *kvm_find_vq(struct virtio_device *vdev,
if (index >= kdev->desc->num_vq)
return ERR_PTR(-ENOENT);
if (!name)
return NULL;
config = kvm_vq_config(kdev->desc)+index;
err = vmem_add_mapping(config->address,
......@@ -198,7 +201,7 @@ static struct virtqueue *kvm_find_vq(struct virtio_device *vdev,
if (err)
goto out;
vq = vring_new_virtqueue(config->num, KVM_S390_VIRTIO_RING_ALIGN,
vq = vring_new_virtqueue(index, config->num, KVM_S390_VIRTIO_RING_ALIGN,
vdev, true, (void *) config->address,
kvm_notify, callback, name);
if (!vq) {
......
# Virtio always gets selected by whoever wants it.
config VIRTIO
tristate
# Similarly the virtio ring implementation.
config VIRTIO_RING
tristate
depends on VIRTIO
---help---
This option is selected by any driver which implements the virtio
bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST,
CONFIG_RPMSG or CONFIG_S390_GUEST.
menu "Virtio drivers"
......@@ -13,7 +11,6 @@ config VIRTIO_PCI
tristate "PCI driver for virtio devices (EXPERIMENTAL)"
depends on PCI && EXPERIMENTAL
select VIRTIO
select VIRTIO_RING
---help---
This driver provides support for virtio based paravirtual device
drivers over PCI. This requires that your VMM has appropriate PCI
......@@ -26,9 +23,8 @@ config VIRTIO_PCI
If unsure, say M.
config VIRTIO_BALLOON
tristate "Virtio balloon driver (EXPERIMENTAL)"
select VIRTIO
select VIRTIO_RING
tristate "Virtio balloon driver"
depends on VIRTIO
---help---
This driver supports increasing and decreasing the amount
of memory within a KVM guest.
......@@ -39,7 +35,6 @@ config VIRTIO_BALLOON
tristate "Platform bus driver for memory mapped virtio devices (EXPERIMENTAL)"
depends on HAS_IOMEM && EXPERIMENTAL
select VIRTIO
select VIRTIO_RING
---help---
This driver provides support for memory mapped virtio
platform device driver.
......
obj-$(CONFIG_VIRTIO) += virtio.o
obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o
obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o
obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o
obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
......@@ -159,7 +159,7 @@ static int virtio_dev_remove(struct device *_d)
drv->remove(dev);
/* Driver should have reset device. */
BUG_ON(dev->config->get_status(dev));
WARN_ON_ONCE(dev->config->get_status(dev));
/* Acknowledge the device's existence again. */
add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
......
......@@ -131,9 +131,6 @@ struct virtio_mmio_vq_info {
/* the number of entries in the queue */
unsigned int num;
/* the index of the queue */
int queue_index;
/* the virtual address of the ring queue */
void *queue;
......@@ -225,11 +222,10 @@ static void vm_reset(struct virtio_device *vdev)
static void vm_notify(struct virtqueue *vq)
{
struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
struct virtio_mmio_vq_info *info = vq->priv;
/* We write the queue's selector into the notification register to
* signal the other end */
writel(info->queue_index, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY);
writel(virtqueue_get_queue_index(vq), vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY);
}
/* Notify all virtqueues on an interrupt. */
......@@ -270,6 +266,7 @@ static void vm_del_vq(struct virtqueue *vq)
struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
struct virtio_mmio_vq_info *info = vq->priv;
unsigned long flags, size;
unsigned int index = virtqueue_get_queue_index(vq);
spin_lock_irqsave(&vm_dev->lock, flags);
list_del(&info->node);
......@@ -278,7 +275,7 @@ static void vm_del_vq(struct virtqueue *vq)
vring_del_virtqueue(vq);
/* Select and deactivate the queue */
writel(info->queue_index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL);
writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL);
writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN);
size = PAGE_ALIGN(vring_size(info->num, VIRTIO_MMIO_VRING_ALIGN));
......@@ -309,6 +306,9 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
unsigned long flags, size;
int err;
if (!name)
return NULL;
/* Select the queue we're interested in */
writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL);
......@@ -324,7 +324,6 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
err = -ENOMEM;
goto error_kmalloc;
}
info->queue_index = index;
/* Allocate pages for the queue - start with a queue as big as
* possible (limited by maximum size allowed by device), drop down
......@@ -332,11 +331,21 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
* and two rings (which makes it "alignment_size * 2")
*/
info->num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX);
/* If the device reports a 0 entry queue, we won't be able to
* use it to perform I/O, and vring_new_virtqueue() can't create
* empty queues anyway, so don't bother to set up the device.
*/
if (info->num == 0) {
err = -ENOENT;
goto error_alloc_pages;
}
while (1) {
size = PAGE_ALIGN(vring_size(info->num,
VIRTIO_MMIO_VRING_ALIGN));
/* Already smallest possible allocation? */
if (size <= VIRTIO_MMIO_VRING_ALIGN * 2) {
/* Did the last iter shrink the queue below minimum size? */
if (size < VIRTIO_MMIO_VRING_ALIGN * 2) {
err = -ENOMEM;
goto error_alloc_pages;
}
......@@ -356,7 +365,7 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
vm_dev->base + VIRTIO_MMIO_QUEUE_PFN);
/* Create the vring */
vq = vring_new_virtqueue(info->num, VIRTIO_MMIO_VRING_ALIGN, vdev,
vq = vring_new_virtqueue(index, info->num, VIRTIO_MMIO_VRING_ALIGN, vdev,
true, info->queue, vm_notify, callback, name);
if (!vq) {
err = -ENOMEM;
......
......@@ -48,6 +48,7 @@ struct virtio_pci_device
int msix_enabled;
int intx_enabled;
struct msix_entry *msix_entries;
cpumask_var_t *msix_affinity_masks;
/* Name strings for interrupts. This size should be enough,
* and I'm too lazy to allocate each name separately. */
char (*msix_names)[256];
......@@ -79,9 +80,6 @@ struct virtio_pci_vq_info
/* the number of entries in the queue */
int num;
/* the index of the queue */
int queue_index;
/* the virtual address of the ring queue */
void *queue;
......@@ -202,11 +200,11 @@ static void vp_reset(struct virtio_device *vdev)
static void vp_notify(struct virtqueue *vq)
{
struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
struct virtio_pci_vq_info *info = vq->priv;
/* we write the queue's selector into the notification register to
* signal the other end */
iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
iowrite16(virtqueue_get_queue_index(vq),
vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
}
/* Handle a configuration change: Tell driver if it wants to know. */
......@@ -279,6 +277,10 @@ static void vp_free_vectors(struct virtio_device *vdev)
for (i = 0; i < vp_dev->msix_used_vectors; ++i)
free_irq(vp_dev->msix_entries[i].vector, vp_dev);
for (i = 0; i < vp_dev->msix_vectors; i++)
if (vp_dev->msix_affinity_masks[i])
free_cpumask_var(vp_dev->msix_affinity_masks[i]);
if (vp_dev->msix_enabled) {
/* Disable the vector used for configuration */
iowrite16(VIRTIO_MSI_NO_VECTOR,
......@@ -296,6 +298,8 @@ static void vp_free_vectors(struct virtio_device *vdev)
vp_dev->msix_names = NULL;
kfree(vp_dev->msix_entries);
vp_dev->msix_entries = NULL;
kfree(vp_dev->msix_affinity_masks);
vp_dev->msix_affinity_masks = NULL;
}
static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
......@@ -314,6 +318,15 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
GFP_KERNEL);
if (!vp_dev->msix_names)
goto error;
vp_dev->msix_affinity_masks
= kzalloc(nvectors * sizeof *vp_dev->msix_affinity_masks,
GFP_KERNEL);
if (!vp_dev->msix_affinity_masks)
goto error;
for (i = 0; i < nvectors; ++i)
if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i],
GFP_KERNEL))
goto error;
for (i = 0; i < nvectors; ++i)
vp_dev->msix_entries[i].entry = i;
......@@ -402,7 +415,6 @@ static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
if (!info)
return ERR_PTR(-ENOMEM);
info->queue_index = index;
info->num = num;
info->msix_vector = msix_vec;
......@@ -418,7 +430,7 @@ static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
/* create the vring */
vq = vring_new_virtqueue(info->num, VIRTIO_PCI_VRING_ALIGN, vdev,
vq = vring_new_virtqueue(index, info->num, VIRTIO_PCI_VRING_ALIGN, vdev,
true, info->queue, vp_notify, callback, name);
if (!vq) {
err = -ENOMEM;
......@@ -467,7 +479,8 @@ static void vp_del_vq(struct virtqueue *vq)
list_del(&info->node);
spin_unlock_irqrestore(&vp_dev->lock, flags);
iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
iowrite16(virtqueue_get_queue_index(vq),
vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
if (vp_dev->msix_enabled) {
iowrite16(VIRTIO_MSI_NO_VECTOR,
......@@ -542,7 +555,10 @@ static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs,
vp_dev->per_vq_vectors = per_vq_vectors;
allocated_vectors = vp_dev->msix_used_vectors;
for (i = 0; i < nvqs; ++i) {
if (!callbacks[i] || !vp_dev->msix_enabled)
if (!names[i]) {
vqs[i] = NULL;
continue;
} else if (!callbacks[i] || !vp_dev->msix_enabled)
msix_vec = VIRTIO_MSI_NO_VECTOR;
else if (vp_dev->per_vq_vectors)
msix_vec = allocated_vectors++;
......@@ -609,6 +625,35 @@ static const char *vp_bus_name(struct virtio_device *vdev)
return pci_name(vp_dev->pci_dev);
}
/* Setup the affinity for a virtqueue:
* - force the affinity for per vq vector
* - OR over all affinities for shared MSI
* - ignore the affinity request if we're using INTX
*/
/*
 * Set (or, with @cpu == -1, clear) the irq affinity hint for the MSI-X
 * vector serving @vq.  Returns -EINVAL for a virtqueue without a
 * callback and 0 otherwise; when MSI-X is not enabled nothing is done.
 * The per-vector mask is only ever added to here, never cleared.
 */
static int vp_set_vq_affinity(struct virtqueue *vq, int cpu)
{
	struct virtio_device *vdev = vq->vdev;
	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
	struct virtio_pci_vq_info *info = vq->priv;
	struct cpumask *mask;
	unsigned int irq;

	if (!vq->callback)
		return -EINVAL;

	if (!vp_dev->msix_enabled)
		return 0;

	mask = vp_dev->msix_affinity_masks[info->msix_vector];
	irq = vp_dev->msix_entries[info->msix_vector].vector;

	if (cpu != -1) {
		cpumask_set_cpu(cpu, mask);
		irq_set_affinity_hint(irq, mask);
	} else {
		irq_set_affinity_hint(irq, NULL);
	}

	return 0;
}
static struct virtio_config_ops virtio_pci_config_ops = {
.get = vp_get,
.set = vp_set,
......@@ -620,6 +665,7 @@ static struct virtio_config_ops virtio_pci_config_ops = {
.get_features = vp_get_features,
.finalize_features = vp_finalize_features,
.bus_name = vp_bus_name,
.set_vq_affinity = vp_set_vq_affinity,
};
static void virtio_pci_release_dev(struct device *_d)
......@@ -673,8 +719,10 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev,
goto out_enable_device;
vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0);
if (vp_dev->ioaddr == NULL)
if (vp_dev->ioaddr == NULL) {
err = -ENOMEM;
goto out_req_regions;
}
pci_set_drvdata(pci_dev, vp_dev);
pci_set_master(pci_dev);
......
......@@ -106,6 +106,9 @@ struct vring_virtqueue
/* How to notify other side. FIXME: commonalize hcalls! */
void (*notify)(struct virtqueue *vq);
/* Index of the queue */
int queue_index;
#ifdef DEBUG
/* They're supposed to lock for us. */
unsigned int in_use;
......@@ -171,6 +174,13 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
return head;
}
int virtqueue_get_queue_index(struct virtqueue *_vq)
{
struct vring_virtqueue *vq = to_vvq(_vq);
return vq->queue_index;
}
EXPORT_SYMBOL_GPL(virtqueue_get_queue_index);
/**
* virtqueue_add_buf - expose buffer to other end
* @vq: the struct virtqueue we're talking about.
......@@ -616,7 +626,8 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
}
EXPORT_SYMBOL_GPL(vring_interrupt);
struct virtqueue *vring_new_virtqueue(unsigned int num,
struct virtqueue *vring_new_virtqueue(unsigned int index,
unsigned int num,
unsigned int vring_align,
struct virtio_device *vdev,
bool weak_barriers,
......@@ -647,6 +658,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
vq->broken = false;
vq->last_used_idx = 0;
vq->num_added = 0;
vq->queue_index = index;
list_add_tail(&vq->vq.list, &vdev->vqs);
#ifdef DEBUG
vq->in_use = false;
......
......@@ -50,6 +50,8 @@ void *virtqueue_detach_unused_buf(struct virtqueue *vq);
unsigned int virtqueue_get_vring_size(struct virtqueue *vq);
int virtqueue_get_queue_index(struct virtqueue *vq);
/**
* virtio_device - representation of a device using virtio
* @index: unique position on the virtio bus
......
......@@ -84,7 +84,9 @@
* nvqs: the number of virtqueues to find
* vqs: on success, includes new virtqueues
* callbacks: array of callbacks, for each virtqueue
* include a NULL entry for vqs that do not need a callback
* names: array of virtqueue names (mainly for debugging)
* include a NULL entry for vqs unused by driver
* Returns 0 on success or error status
* @del_vqs: free virtqueues found by find_vqs().
* @get_features: get the array of feature bits for this device.
......@@ -98,6 +100,7 @@
* vdev: the virtio_device
* This returns a pointer to the bus name a la pci_name from which
* the caller can then copy.
* @set_vq_affinity: set the affinity for a virtqueue.
*/
typedef void vq_callback_t(struct virtqueue *);
struct virtio_config_ops {
......@@ -116,6 +119,7 @@ struct virtio_config_ops {
u32 (*get_features)(struct virtio_device *vdev);
void (*finalize_features)(struct virtio_device *vdev);
const char *(*bus_name)(struct virtio_device *vdev);
int (*set_vq_affinity)(struct virtqueue *vq, int cpu);
};
/* If driver didn't advertise the feature, it will never appear. */
......@@ -190,5 +194,24 @@ const char *virtio_bus_name(struct virtio_device *vdev)
return vdev->config->bus_name(vdev);
}
/**
 * virtqueue_set_affinity - set the CPU affinity for a virtqueue
 * @vq: the virtqueue
 * @cpu: the cpu number
 *
 * Note that this function is best-effort: the affinity hint may not be set,
 * depending on config support, irq type and sharing.
 *
 * Returns whatever the config's set_vq_affinity() hook returns, or 0 when
 * the config does not provide that hook.
 */
static inline
int virtqueue_set_affinity(struct virtqueue *vq, int cpu)
{
	struct virtio_device *vdev = vq->vdev;

	/* Nothing to do when the hook is not provided */
	if (!vdev->config->set_vq_affinity)
		return 0;

	return vdev->config->set_vq_affinity(vq, cpu);
}
#endif /* __KERNEL__ */
#endif /* _LINUX_VIRTIO_CONFIG_H */
......@@ -165,7 +165,8 @@ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
struct virtio_device;
struct virtqueue;
struct virtqueue *vring_new_virtqueue(unsigned int num,
struct virtqueue *vring_new_virtqueue(unsigned int index,
unsigned int num,
unsigned int vring_align,
struct virtio_device *vdev,
bool weak_barriers,
......
......@@ -4200,12 +4200,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
buf->private = 0;
}
/*
 * Steal hook used by buffer_pipe_buf_ops: it touches neither argument
 * and unconditionally returns 1.
 */
static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
				 struct pipe_buffer *buf)
{
	return 1;
}
static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
......@@ -4221,7 +4215,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = {
.unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = buffer_pipe_buf_release,
.steal = buffer_pipe_buf_steal,
.steal = generic_pipe_buf_steal,
.get = buffer_pipe_buf_get,
};
......
......@@ -1299,6 +1299,7 @@ static struct device *new_device(const char *name, u16 type)
dev->feature_len = 0;
dev->num_vq = 0;
dev->running = false;
dev->next = NULL;
/*
* Append to device list. Prepending to a single-linked list is
......
# Build the trace agent for virtio-trace.
CC = gcc
CFLAGS = -O2 -Wall -pthread
OBJS = trace-agent.o trace-agent-ctl.o trace-agent-rw.o

# Neither target names a real file; do not let a stray file shadow them.
.PHONY: all clean

all: trace-agent

# Compile only the source file ($<); $^ would also pass any extra
# prerequisites (such as the header below) to the compiler.
.c.o:
	$(CC) $(CFLAGS) -c $< -o $@

# Every source includes trace-agent.h, so rebuild when it changes.
$(OBJS): trace-agent.h

trace-agent: $(OBJS)
	$(CC) $(CFLAGS) -o $@ $^

clean:
	rm -f *.o trace-agent
Trace Agent for virtio-trace
============================
Trace agent is a user tool for sending trace data of a guest to a host with low
overhead. Trace agent has the following functions:
- splice a page of ring-buffer to read_pipe without memory copying
- splice the page from write_pipe to virtio-console without memory copying
- write trace data to stdout by using -o option
- controlled by start/stop orders from a Host
The trace agent operates as follows:
1) Initialize all structures.
2) Create a read/write thread per CPU. Each thread is bound to a CPU.
The read/write threads then wait for a start order.
3) A controller thread does poll() for a start order of a host.
4) After the controller of the trace agent receives a start order from a host,
the controller wakes the read/write threads.
5) The read/write threads start to read trace data from ring-buffers and
write the data to virtio-serial.
6) If the controller receives a stop order from a host, the read/write threads
stop to read trace data.
Files
=====
README: this file
Makefile: Makefile of trace agent for virtio-trace
trace-agent.c: includes main function, sets up for operating trace agent
trace-agent.h: includes all structures and some macros
trace-agent-ctl.c: includes controller function for read/write threads
trace-agent-rw.c: includes read/write threads function
Setup
=====
To use this trace agent for virtio-trace, we need to prepare some virtio-serial
I/Fs.
1) Make FIFO in a host
virtio-trace uses one virtio-serial pipe per CPU as a trace data path, plus one
pipe as a control path, so FIFOs (named pipes) should be created as follows:
# mkdir /tmp/virtio-trace/
# mkfifo /tmp/virtio-trace/trace-path-cpu{0,1,2,...,X}.{in,out}
# mkfifo /tmp/virtio-trace/agent-ctl-path.{in,out}
For example, if a guest uses three CPUs, the names are
trace-path-cpu{0,1,2}.{in,out}
and
agent-ctl-path.{in,out}.
2) Set up of virtio-serial pipe in a host
Add qemu option to use virtio-serial pipe.
##virtio-serial device##
-device virtio-serial-pci,id=virtio-serial0\
##control path##
-chardev pipe,id=charchannel0,path=/tmp/virtio-trace/agent-ctl-path\
-device virtserialport,bus=virtio-serial0.0,nr=1,chardev=charchannel0,\
id=channel0,name=agent-ctl-path\
##data path##
-chardev pipe,id=charchannel1,path=/tmp/virtio-trace/trace-path-cpu0\
-device virtserialport,bus=virtio-serial0.0,nr=2,chardev=charchannel1,\
id=channel1,name=trace-path-cpu0\
...
If you manage guests with libvirt, add the following tags to domain XML files.
Then, libvirt passes the same command option to qemu.
<channel type='pipe'>
<source path='/tmp/virtio-trace/agent-ctl-path'/>
<target type='virtio' name='agent-ctl-path'/>
<address type='virtio-serial' controller='0' bus='0' port='0'/>
</channel>
<channel type='pipe'>
<source path='/tmp/virtio-trace/trace-path-cpu0'/>
<target type='virtio' name='trace-path-cpu0'/>
<address type='virtio-serial' controller='0' bus='0' port='1'/>
</channel>
...
Here, chardev names are restricted to trace-path-cpuX and agent-ctl-path. For
example, if a guest uses three CPUs, chardev names should be trace-path-cpu0,
trace-path-cpu1, trace-path-cpu2, and agent-ctl-path.
3) Boot the guest
You can find some chardev in /dev/virtio-ports/ in the guest.
Run
===
0) Build trace agent in a guest
$ make
1) Enable ftrace in the guest
<Example>
# echo 1 > /sys/kernel/debug/tracing/events/sched/enable
2) Run trace agent in the guest
This agent must be operated as root.
# ./trace-agent
read/write threads in the agent wait for start order from host. If you add -o
option, trace data are output via stdout in the guest.
3) Open FIFO in a host
# cat /tmp/virtio-trace/trace-path-cpu0.out
If a host does not open these FIFOs, trace data get stuck in the virtio buffers.
The guest then stalls, because the chardev in QEMU does not support non-blocking
mode. This blocking behavior may be resolved in the future.
4) Start to read trace data by ordering from a host
A host injects read start order to the guest via virtio-serial.
# echo 1 > /tmp/virtio-trace/agent-ctl-path.in
5) Stop to read trace data by ordering from a host
A host injects read stop order to the guest via virtio-serial.
# echo 0 > /tmp/virtio-trace/agent-ctl-path.in
/*
* Controller of read/write threads for virtio-trace
*
* Copyright (C) 2012 Hitachi, Ltd.
* Created by Yoshihiro Yunomae <yoshihiro.yunomae.ez@hitachi.com>
* Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
*
* Licensed under GPL version 2 only.
*
*/
#define _GNU_SOURCE
#include <fcntl.h>
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "trace-agent.h"
/* Size in bytes of the buffer that receives one control message */
#define HOST_MSG_SIZE 256
/* Timeout in milliseconds handed to poll() while waiting for an order */
#define EVENT_WAIT_MSEC 100
/* Signal number stored by signal_handler(); zero until a signal arrives */
static volatile sig_atomic_t global_signal_val;
/* Set by wait_order() once a recorded signal has been noticed */
bool global_sig_receive; /* default false */
/* True after a start order ('1'), false after a stop order ('0') */
bool global_run_operation; /* default false*/
/*
 * Handler installed for SIGTERM/SIGINT/SIGQUIT. It only records which
 * signal arrived; the recorded value is examined later in wait_order().
 */
static void signal_handler(int sig)
{
	global_signal_val = sig;
}
/*
 * rw_ctl_init - open the control path for reading
 * @ctl_path: path of the control file
 *
 * Returns the file descriptor of the opened control path. When the open
 * fails an error is printed and the process exits with EXIT_FAILURE.
 */
int rw_ctl_init(const char *ctl_path)
{
	int fd = open(ctl_path, O_RDONLY);

	if (fd != -1)
		return fd;

	pr_err("Cannot open ctl_fd\n");
	exit(EXIT_FAILURE);
}
/*
 * wait_order - wait until the control path has an event or a signal arrives
 * @ctl_fd: fd of the control path
 *
 * Polls @ctl_fd with a timeout of EVENT_WAIT_MSEC and retries on timeout.
 * Returns the positive poll() result when an event is pending, -1 when a
 * signal was recorded by signal_handler(), and 0 if global_sig_receive was
 * already set on entry. A poll() failure terminates the process.
 */
static int wait_order(int ctl_fd)
{
	struct pollfd pfd;
	int nready = 0;

	for (;;) {
		if (global_sig_receive)
			return nready;

		pfd.fd = ctl_fd;
		pfd.events = POLLIN;
		nready = poll(&pfd, 1, EVENT_WAIT_MSEC);

		/* A recorded signal takes precedence over the poll() result */
		if (global_signal_val) {
			global_sig_receive = true;
			pr_info("Receive interrupt %d\n", global_signal_val);

			/* Wakes rw-threads when they are sleeping */
			if (!global_run_operation)
				pthread_cond_broadcast(&cond_wakeup);

			return -1;
		}

		if (nready < 0) {
			pr_err("Polling error\n");
			exit(EXIT_FAILURE);
		}

		/* nready == 0 means timeout: poll again */
		if (nready != 0)
			return nready;
	}
}
/*
 * rw_ctl_loop - control read/write threads by handling global_run_operation
 * @ctl_fd: fd of the control path
 *
 * Installs the handlers for SIGTERM/SIGINT/SIGQUIT, then loops: waits for
 * data on @ctl_fd and interprets each message. A two-byte message starting
 * with '1' starts the read/write threads, a two-byte message starting with
 * '0' stops them, anything else is reported as invalid.
 *
 * Returns NULL after a signal has been received. A read() failure
 * terminates the process.
 */
void *rw_ctl_loop(int ctl_fd)
{
	ssize_t rlen;
	char buf[HOST_MSG_SIZE];
	int ret;

	/* Setup signal handlers */
	signal(SIGTERM, signal_handler);
	signal(SIGINT, signal_handler);
	signal(SIGQUIT, signal_handler);

	while (!global_sig_receive) {
		ret = wait_order(ctl_fd);
		if (ret < 0)
			break;

		/* Leave room for the terminator added below */
		rlen = read(ctl_fd, buf, sizeof(buf) - 1);
		if (rlen < 0) {
			pr_err("read data error in ctl thread\n");
			goto error;
		}
		/* read() does not terminate the data, but "%s" below needs it */
		buf[rlen] = '\0';

		if (rlen == 2 && buf[0] == '1') {
			/*
			 * If host writes '1' to a control path,
			 * this controller wakes all read/write threads.
			 */
			global_run_operation = true;
			pthread_cond_broadcast(&cond_wakeup);
			pr_debug("Wake up all read/write threads\n");
		} else if (rlen == 2 && buf[0] == '0') {
			/*
			 * If host writes '0' to a control path, read/write
			 * threads will wait for notification from Host.
			 */
			global_run_operation = false;
			pr_debug("Stop all read/write threads\n");
		} else {
			pr_info("Invalid host notification: %s\n", buf);
		}
	}

	return NULL;

error:
	exit(EXIT_FAILURE);
}
/*
* Read/write thread of a guest agent for virtio-trace
*
* Copyright (C) 2012 Hitachi, Ltd.
* Created by Yoshihiro Yunomae <yoshihiro.yunomae.ez@hitachi.com>
* Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
*
* Licensed under GPL version 2 only.
*
*/
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include "trace-agent.h"
/* Delay in microseconds before retrying after a splice read returned 0 */
#define READ_WAIT_USEC 100000
/*
 * rw_thread_info_new - allocate a rw_thread_info with placeholder values
 *
 * Returns the new structure with the CPU number and every fd set to -1 and
 * pipe_size set to PIPE_INIT. Exits the process if the allocation fails.
 */
void *rw_thread_info_new(void)
{
	struct rw_thread_info *ti = zalloc(sizeof(struct rw_thread_info));

	if (!ti) {
		pr_err("rw_thread_info zalloc error\n");
		exit(EXIT_FAILURE);
	}

	/* Nothing is assigned or opened yet */
	ti->cpu_num = -1;
	ti->in_fd = -1;
	ti->out_fd = -1;
	ti->read_pipe = -1;
	ti->write_pipe = -1;
	ti->pipe_size = PIPE_INIT;

	return ti;
}
/*
 * rw_thread_init - open the descriptors one read/write thread works with
 * @cpu: CPU number stored for the thread
 * @in_path: path opened read-only as the input
 * @out_path: path opened write-only as the output (unused in stdout mode)
 * @stdout_flag: when true the output is STDOUT_FILENO instead of @out_path
 * @pipe_size: size requested for the intermediate pipe
 * @rw_ti: structure that receives the results
 *
 * Returns NULL. Any failure prints an error and terminates the process.
 */
void *rw_thread_init(int cpu, const char *in_path, const char *out_path,
				bool stdout_flag, unsigned long pipe_size,
				struct rw_thread_info *rw_ti)
{
	int fds[2];

	rw_ti->cpu_num = cpu;

	/* input side */
	rw_ti->in_fd = open(in_path, O_RDONLY);
	if (rw_ti->in_fd == -1) {
		pr_err("Could not open in_fd (CPU:%d)\n", cpu);
		exit(EXIT_FAILURE);
	}

	/* output side: either stdout or the given output path */
	if (stdout_flag) {
		rw_ti->out_fd = STDOUT_FILENO;
	} else {
		rw_ti->out_fd = open(out_path, O_WRONLY);
		if (rw_ti->out_fd == -1) {
			pr_err("Could not open out_fd (CPU:%d)\n", cpu);
			exit(EXIT_FAILURE);
		}
	}

	if (pipe2(fds, O_NONBLOCK) < 0) {
		pr_err("Could not create pipe in rw-thread(%d)\n", cpu);
		exit(EXIT_FAILURE);
	}

	/*
	 * Size of pipe is 64kB in default based on fs/pipe.c.
	 * The pipe is resized so that trace data can be moved quickly.
	 */
	if (fcntl(fds[0], F_SETPIPE_SZ, pipe_size) < 0) {
		pr_err("Could not change pipe size in rw-thread(%d)\n", cpu);
		exit(EXIT_FAILURE);
	}

	/*
	 * read_pipe is the pipe's write end (filled while reading trace
	 * data); write_pipe is the pipe's read end (drained while writing).
	 */
	rw_ti->read_pipe = fds[1];
	rw_ti->write_pipe = fds[0];
	rw_ti->pipe_size = pipe_size;

	return NULL;
}
/*
 * bind_cpu - restrict the calling thread to one CPU
 * @cpu_num: the only CPU put into the affinity mask
 *
 * A failure is reported but is not fatal.
 */
static void bind_cpu(int cpu_num)
{
	cpu_set_t cpus;

	CPU_ZERO(&cpus);
	CPU_SET(cpu_num, &cpus);

	/* a zero first argument selects the calling thread */
	if (sched_setaffinity(0, sizeof(cpus), &cpus) != -1)
		return;

	pr_err("Could not set CPU#%d affinity\n", cpu_num);
}
/*
 * rw_thread_main - body of one read/write thread
 * @thread_info: the struct rw_thread_info describing this thread
 *
 * Binds the thread to its CPU. While global_run_operation is set it splices
 * data from in_fd into the pipe and from the pipe to out_fd; otherwise it
 * sleeps on cond_wakeup.
 *
 * Returns NULL once global_sig_receive is set. A splice() failure
 * terminates the whole process.
 */
static void *rw_thread_main(void *thread_info)
{
	ssize_t rlen, wlen;
	ssize_t ret;
	struct rw_thread_info *ts = (struct rw_thread_info *)thread_info;

	bind_cpu(ts->cpu_num);

	while (1) {
		/* Wait for a read order of trace data by Host OS */
		if (!global_run_operation) {
			pthread_mutex_lock(&mutex_notify);
			/*
			 * pthread_cond_wait() may wake up spuriously, so wait
			 * until there really is a start order or a shutdown
			 * request. Re-checking first also avoids sleeping
			 * when the shutdown was flagged before we got here.
			 */
			while (!global_run_operation && !global_sig_receive)
				pthread_cond_wait(&cond_wakeup, &mutex_notify);
			pthread_mutex_unlock(&mutex_notify);
		}

		if (global_sig_receive)
			break;

		/*
		 * Each thread reads the trace_pipe_raw of the cpu it is bound
		 * to, so contention between threads does not occur.
		 */
		rlen = splice(ts->in_fd, NULL, ts->read_pipe, NULL,
				ts->pipe_size, SPLICE_F_MOVE | SPLICE_F_MORE);

		if (rlen < 0) {
			pr_err("Splice_read in rw-thread(%d)\n", ts->cpu_num);
			goto error;
		} else if (rlen == 0) {
			/*
			 * If trace data do not exist or are not yet readable
			 * because they do not fill a page, splice_read
			 * returns 0. Then, this waits for the ring-buffer
			 * to be filled.
			 */
			usleep(READ_WAIT_USEC);
			pr_debug("Read retry(cpu:%d)\n", ts->cpu_num);
			continue;
		}

		wlen = 0;

		/* Drain everything that was just put into the pipe */
		do {
			ret = splice(ts->write_pipe, NULL, ts->out_fd, NULL,
					rlen - wlen,
					SPLICE_F_MOVE | SPLICE_F_MORE);

			if (ret < 0) {
				pr_err("Splice_write in rw-thread(%d)\n",
								ts->cpu_num);
				goto error;
			} else if (ret == 0)
				/*
				 * When host reader is not in time for reading
				 * trace data, guest will be stopped. This is
				 * because char dev in QEMU does not support
				 * non-blocking mode. Then, writer might be
				 * sleep in that case.
				 * This sleep will be removed by supporting
				 * non-blocking mode.
				 */
				sleep(1);
			wlen += ret;
		} while (wlen < rlen);
	}

	return NULL;

error:
	exit(EXIT_FAILURE);
}
/*
 * rw_thread_run - start the read/write thread described by @rw_ti
 * @rw_ti: information handed to rw_thread_main()
 *
 * Returns the id of the created thread. Exits the process when the thread
 * cannot be created.
 */
pthread_t rw_thread_run(struct rw_thread_info *rw_ti)
{
	pthread_t tid;

	if (pthread_create(&tid, NULL, rw_thread_main, rw_ti) != 0) {
		pr_err("Could not create a rw thread(%d)\n", rw_ti->cpu_num);
		exit(EXIT_FAILURE);
	}

	return tid;
}
/*
* Guest agent for virtio-trace
*
* Copyright (C) 2012 Hitachi, Ltd.
* Created by Yoshihiro Yunomae <yoshihiro.yunomae.ez@hitachi.com>
* Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
*
* Licensed under GPL version 2 only.
*
*/
#define _GNU_SOURCE
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "trace-agent.h"
/* Page size queried with sysconf() each time the macro is expanded */
#define PAGE_SIZE (sysconf(_SC_PAGE_SIZE))
/* Number of pages that make up the smallest accepted pipe size */
#define PIPE_DEF_BUFS 16
/* Smallest pipe size accepted by parse_size() */
#define PIPE_MIN_SIZE (PAGE_SIZE*PIPE_DEF_BUFS)
/* Largest pipe size accepted by parse_size() */
#define PIPE_MAX_SIZE (1024*1024)
/* printf-style format of the per-CPU input path; takes the CPU number */
#define READ_PATH_FMT \
"/sys/kernel/debug/tracing/per_cpu/cpu%d/trace_pipe_raw"
/* printf-style format of the per-CPU output path; takes the CPU number */
#define WRITE_PATH_FMT "/dev/virtio-ports/trace-path-cpu%d"
/* Path of the control file opened by rw_ctl_init() */
#define CTL_PATH "/dev/virtio-ports/agent-ctl-path"
/* Mutex paired with cond_wakeup in the read/write threads */
pthread_mutex_t mutex_notify = PTHREAD_MUTEX_INITIALIZER;
/* Condition variable the read/write threads sleep on until woken */
pthread_cond_t cond_wakeup = PTHREAD_COND_INITIALIZER;
/*
 * get_total_cpus - query the number of configured processors
 *
 * Returns the count reported by sysconf(_SC_NPROCESSORS_CONF). Exits the
 * process when the count is not positive or exceeds MAX_CPUS.
 */
static int get_total_cpus(void)
{
	int nr_cpus = (int)sysconf(_SC_NPROCESSORS_CONF);

	if (nr_cpus > 0 && nr_cpus <= MAX_CPUS)
		return nr_cpus;

	if (nr_cpus <= 0)
		pr_err("Could not read cpus\n");
	else
		pr_err("Exceed max cpus(%d)\n", (int)MAX_CPUS);

	exit(EXIT_FAILURE);
}
/*
 * agent_info_new - allocate the agent state and fill in the defaults
 *
 * Returns the new agent_info: pipe size PIPE_INIT, stdout mode off, the
 * CPU count from get_total_cpus(), no control fd yet, and one fresh
 * rw_thread_info per CPU. Exits the process if the allocation fails.
 */
static void *agent_info_new(void)
{
	struct agent_info *info = zalloc(sizeof(struct agent_info));
	int cpu;

	if (!info) {
		pr_err("agent_info zalloc error\n");
		exit(EXIT_FAILURE);
	}

	info->pipe_size = PIPE_INIT;
	info->use_stdout = false;
	info->cpus = get_total_cpus();
	info->ctl_fd = -1;

	/* one read/write thread description per CPU */
	for (cpu = 0; cpu < info->cpus; cpu++)
		info->rw_ti[cpu] = rw_thread_info_new();

	return info;
}
/*
 * parse_size - parse the pipe size given on the command line
 * @arg: decimal number, optionally followed by K/k or M/m
 *
 * Returns the size rounded down to a multiple of the page size, or 0 when
 * the size lies outside [PIPE_MIN_SIZE, PIPE_MAX_SIZE].
 */
static unsigned long parse_size(const char *arg)
{
	unsigned long value, round;
	unsigned int shift = 0;
	char *ptr;

	value = strtoul(arg, &ptr, 10);
	switch (*ptr) {
	case 'K': case 'k':
		shift = 10;
		break;
	case 'M': case 'm':
		shift = 20;
		break;
	default:
		break;
	}

	/*
	 * Check the upper bound before scaling: shifting first could wrap
	 * around and let an oversized request slip through the check.
	 */
	if (value > ((unsigned long)PIPE_MAX_SIZE >> shift)) {
		pr_err("Pipe size must be less than 1MB\n");
		goto error;
	}
	value <<= shift;

	if (value < PIPE_MIN_SIZE) {
		pr_err("Pipe size must be over 64KB\n");
		goto error;
	}

	/* Align buffer size with page unit */
	round = value & (PAGE_SIZE - 1);
	value = value - round;

	return value;

error:
	return 0;
}
/*
 * usage - print the command line synopsis to stderr
 * @prg: program name shown in the synopsis
 */
static void usage(char const *prg)
{
	pr_err("usage: %s [-h] [-o] [-s <size of pipe>]\n", prg);
}
/*
 * make_path - build the per-CPU input or output path
 * @cpu_num: CPU number substituted into the path
 * @this_is_write_path: true for WRITE_PATH_FMT, false for READ_PATH_FMT
 *
 * Returns a heap-allocated string of up to PATH_MAX bytes that the caller
 * must free(), or NULL when the allocation fails or the path cannot be
 * generated without truncation.
 */
static const char *make_path(int cpu_num, bool this_is_write_path)
{
	int ret;
	char *buf;

	buf = zalloc(PATH_MAX);
	if (buf == NULL) {
		pr_err("Could not allocate buffer\n");
		goto error;
	}

	if (this_is_write_path)
		/* write(output) path */
		ret = snprintf(buf, PATH_MAX, WRITE_PATH_FMT, cpu_num);
	else
		/* read(input) path */
		ret = snprintf(buf, PATH_MAX, READ_PATH_FMT, cpu_num);

	/* ret >= PATH_MAX means snprintf() had to truncate the path */
	if (ret <= 0 || ret >= PATH_MAX) {
		pr_err("Failed to generate %s path(CPU#%d):%d\n",
			this_is_write_path ? "write" : "read", cpu_num, ret);
		goto error;
	}

	return buf;

error:
	free(buf);
	return NULL;
}
/* Build the read(input) path for @cpu_num; see make_path() */
static const char *make_input_path(int cpu_num)
{
	const bool this_is_write_path = false;

	return make_path(cpu_num, this_is_write_path);
}
/* Build the write(output) path for @cpu_num; see make_path() */
static const char *make_output_path(int cpu_num)
{
	const bool this_is_write_path = true;

	return make_path(cpu_num, this_is_write_path);
}
/*
 * agent_info_init - open every data path and the control path
 * @s: agent state; cpus, use_stdout and pipe_size must already be set
 *
 * Initialises the read/write thread information of every CPU and stores
 * the control fd in @s. Returns NULL. Exits the process when a path
 * cannot be generated.
 */
static void *agent_info_init(struct agent_info *s)
{
	int cpu;

	/* init read/write threads */
	for (cpu = 0; cpu < s->cpus; cpu++) {
		const char *in_path;
		const char *out_path = NULL;

		/* set read(input) path per read/write thread */
		in_path = make_input_path(cpu);
		if (in_path == NULL)
			goto error;

		/* set write(output) path per read/write thread*/
		if (!s->use_stdout) {
			out_path = make_output_path(cpu);
			if (out_path == NULL)
				goto error;
		} else {
			/* stdout mode */
			pr_debug("stdout mode\n");
		}

		rw_thread_init(cpu, in_path, out_path, s->use_stdout,
						s->pipe_size, s->rw_ti[cpu]);

		/*
		 * rw_thread_init() only opens the paths and keeps the fds,
		 * so the heap strings from make_path() can be released now.
		 */
		free((void *)in_path);
		free((void *)out_path);
	}

	/* init controller of read/write threads */
	s->ctl_fd = rw_ctl_init((const char *)CTL_PATH);

	return NULL;

error:
	exit(EXIT_FAILURE);
}
/*
 * parse_args - apply the command line options, then initialise the agent
 * @argc: argument count as passed to main()
 * @argv: argument vector as passed to main()
 * @s: agent state updated from the options
 *
 * -o selects stdout mode and -s sets the pipe size. -h, an unknown
 * option or a rejected size terminates the process with EXIT_FAILURE.
 * Returns NULL after agent_info_init() has run.
 */
static void *parse_args(int argc, char *argv[], struct agent_info *s)
{
	unsigned long pipe_size;
	int opt;

	for (;;) {
		opt = getopt(argc, argv, "hos:");
		if (opt == -1)
			break;

		if (opt == 'o') {
			/* stdout mode */
			s->use_stdout = true;
		} else if (opt == 's') {
			/* size of pipe */
			pipe_size = parse_size(optarg);
			if (pipe_size == 0)
				exit(EXIT_FAILURE);
			s->pipe_size = pipe_size;
		} else {
			/* 'h' and everything getopt() did not recognise */
			usage(argv[0]);
			exit(EXIT_FAILURE);
		}
	}

	agent_info_init(s);

	return NULL;
}
static void agent_main_loop(struct agent_info *s)
{
int cpu;
pthread_t rw_thread_per_cpu[MAX_CPUS];
/* Start all read/write threads */
for (cpu = 0; cpu < s->cpus; cpu++)
rw_thread_per_cpu[cpu] = rw_thread_run(s->rw_ti[cpu]);
rw_ctl_loop(s->ctl_fd);
/* Finish all read/write threads */
for (cpu = 0; cpu < s->cpus; cpu++) {
int ret;
ret = pthread_join(rw_thread_per_cpu[cpu], NULL);
if (ret != 0) {
pr_err("pthread_join() error:%d (cpu %d)\n", ret, cpu);
exit(EXIT_FAILURE);
}
}
}
static void agent_info_free(struct agent_info *s)
{
int i;
close(s->ctl_fd);
for (i = 0; i < s->cpus; i++) {
close(s->rw_ti[i]->in_fd);
close(s->rw_ti[i]->out_fd);
close(s->rw_ti[i]->read_pipe);
close(s->rw_ti[i]->write_pipe);
free(s->rw_ti[i]);
}
free(s);
}
/*
 * Entry point: create the agent state, apply the command line, run the
 * agent until it is told to stop, then release everything.
 */
int main(int argc, char *argv[])
{
	struct agent_info *agent = agent_info_new();

	parse_args(argc, argv, agent);
	agent_main_loop(agent);
	agent_info_free(agent);

	return 0;
}
#ifndef __TRACE_AGENT_H__
#define __TRACE_AGENT_H__
#include <pthread.h>
#include <stdbool.h>
/* Upper bound on the CPU count; also the length of agent_info.rw_ti[] */
#define MAX_CPUS 256
/* Initial pipe size in bytes (1MB) stored in new agent/thread structures */
#define PIPE_INIT (1024*1024)
/*
 * agent_info - structure managing total information of guest agent
 * @pipe_size: size of pipe (default 1MB)
 * @use_stdout: set to true when o option is added (default false)
 * @cpus: total number of CPUs
 * @ctl_fd: fd of control path, /dev/virtio-ports/agent-ctl-path
 * @rw_ti: per-CPU information of the read/write threads, indexed by
 *         CPU number; only the first @cpus entries are used
 */
struct agent_info {
unsigned long pipe_size;
bool use_stdout;
int cpus;
int ctl_fd;
struct rw_thread_info *rw_ti[MAX_CPUS];
};
/*
 * rw_thread_info - structure managing the read/write thread of one cpu
 * @cpu_num: cpu number operating this read/write thread
 * @in_fd: fd of reading trace data path in cpu_num
 * @out_fd: fd of writing trace data path in cpu_num
 * @read_pipe: fd of the pipe end that trace data read from in_fd are
 *             spliced into (the write end of the pipe)
 * @write_pipe: fd of the pipe end that data written to out_fd are spliced
 *              from (the read end of the pipe)
 * @pipe_size: size of pipe (default 1MB)
 */
struct rw_thread_info {
int cpu_num;
int in_fd;
int out_fd;
int read_pipe;
int write_pipe;
unsigned long pipe_size;
};
/* set once a termination signal was noticed; makes the rw threads exit */
extern bool global_sig_receive;
/* true while the rw threads are ordered to transfer trace data */
extern bool global_run_operation;
/* mutex and condition variable the rw threads sleep on until woken */
extern pthread_mutex_t mutex_notify;
extern pthread_cond_t cond_wakeup;
/* for controller of read/write threads */
/* opens the control path and returns its fd; exits on failure */
extern int rw_ctl_init(const char *ctl_path);
/* handles host orders read from ctl_fd until a signal is received */
extern void *rw_ctl_loop(int ctl_fd);
/* for trace read/write thread */
/* allocates a rw_thread_info filled with placeholder values */
extern void *rw_thread_info_new(void);
/* opens the input/output fds and the pipe of one read/write thread */
extern void *rw_thread_init(int cpu, const char *in_path, const char *out_path,
bool stdout_flag, unsigned long pipe_size,
struct rw_thread_info *rw_ti);
/* creates the read/write thread for rw_ti and returns its thread id */
extern pthread_t rw_thread_run(struct rw_thread_info *rw_ti);
/*
 * zalloc - allocate @size bytes of zero-filled memory
 *
 * Returns the calloc() result, which is NULL when the allocation fails.
 */
static inline void *zalloc(size_t size)
{
	void *mem = calloc(1, size);

	return mem;
}
/* Error messages go to stderr */
#define pr_err(format, ...) fprintf(stderr, format, ## __VA_ARGS__)
/* Informational messages go to stdout */
#define pr_info(format, ...) fprintf(stdout, format, ## __VA_ARGS__)
/* Debug messages go to stderr, and only when built with DEBUG defined */
#ifdef DEBUG
#define pr_debug(format, ...) fprintf(stderr, format, ## __VA_ARGS__)
#else
/* Expands to an empty statement, so the arguments are not evaluated */
#define pr_debug(format, ...) do {} while (0)
#endif
#endif /*__TRACE_AGENT_H__*/
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册