提交 90e98c52 编写于 作者: G Guangliang Zhao 提交者: Ilya Dryomov

rbd: initial discard bits from Guangliang Zhao

This patch add the discard support for rbd driver.

There are three types operation in the driver:
1. The objects would be removed if they completely contained
   within the discard range.
2. The objects would be truncated if they partly contained within
   the discard range, and align with their boundary.
3. Others would be zeroed.

A discard request from blkdev_issue_discard() is defined which
REQ_WRITE and REQ_DISCARD both marked and no data, so we must
check the REQ_DISCARD first when getting the request type.

This resolve:
	http://tracker.ceph.com/issues/190

[ Ilya Dryomov: This is incomplete and somewhat buggy, see follow up
  commits by Josh Durgin for refinements and fixes which weren't
  folded in to preserve authorship. ]
Signed-off-by: NGuangliang Zhao <lucienchao@gmail.com>
Reviewed-by: NJosh Durgin <josh.durgin@inktank.com>
Reviewed-by: NAlex Elder <elder@linaro.org>
上级 6d2940c8
...@@ -213,6 +213,7 @@ enum obj_request_type { ...@@ -213,6 +213,7 @@ enum obj_request_type {
enum obj_operation_type { enum obj_operation_type {
OBJ_OP_WRITE, OBJ_OP_WRITE,
OBJ_OP_READ, OBJ_OP_READ,
OBJ_OP_DISCARD,
}; };
enum obj_req_flags { enum obj_req_flags {
...@@ -281,6 +282,7 @@ enum img_req_flags { ...@@ -281,6 +282,7 @@ enum img_req_flags {
IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */
IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */
IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */
IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */
}; };
struct rbd_img_request { struct rbd_img_request {
...@@ -797,6 +799,8 @@ static char* obj_op_name(enum obj_operation_type op_type) ...@@ -797,6 +799,8 @@ static char* obj_op_name(enum obj_operation_type op_type)
return "read"; return "read";
case OBJ_OP_WRITE: case OBJ_OP_WRITE:
return "write"; return "write";
case OBJ_OP_DISCARD:
return "discard";
default: default:
return "???"; return "???";
} }
...@@ -1617,6 +1621,21 @@ static bool img_request_write_test(struct rbd_img_request *img_request) ...@@ -1617,6 +1621,21 @@ static bool img_request_write_test(struct rbd_img_request *img_request)
return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
} }
/*
* Set the discard flag when the img_request is an discard request
*/
static void img_request_discard_set(struct rbd_img_request *img_request)
{
set_bit(IMG_REQ_DISCARD, &img_request->flags);
smp_mb();
}
static bool img_request_discard_test(struct rbd_img_request *img_request)
{
smp_mb();
return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
}
static void img_request_child_set(struct rbd_img_request *img_request) static void img_request_child_set(struct rbd_img_request *img_request)
{ {
set_bit(IMG_REQ_CHILD, &img_request->flags); set_bit(IMG_REQ_CHILD, &img_request->flags);
...@@ -1739,6 +1758,18 @@ static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) ...@@ -1739,6 +1758,18 @@ static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
obj_request_done_set(obj_request); obj_request_done_set(obj_request);
} }
static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
{
dout("%s: obj %p result %d %llu\n", __func__, obj_request,
obj_request->result, obj_request->length);
/*
* There is no such thing as a successful short discard. Set
* it to our originally-requested length.
*/
obj_request->xferred = obj_request->length;
obj_request_done_set(obj_request);
}
/* /*
* For a simple stat call there's nothing to do. We'll do more if * For a simple stat call there's nothing to do. We'll do more if
* this is part of a write sequence for a layered image. * this is part of a write sequence for a layered image.
...@@ -1790,6 +1821,11 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, ...@@ -1790,6 +1821,11 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
case CEPH_OSD_OP_STAT: case CEPH_OSD_OP_STAT:
rbd_osd_stat_callback(obj_request); rbd_osd_stat_callback(obj_request);
break; break;
case CEPH_OSD_OP_DELETE:
case CEPH_OSD_OP_TRUNCATE:
case CEPH_OSD_OP_ZERO:
rbd_osd_discard_callback(obj_request);
break;
case CEPH_OSD_OP_CALL: case CEPH_OSD_OP_CALL:
case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_NOTIFY_ACK:
case CEPH_OSD_OP_WATCH: case CEPH_OSD_OP_WATCH:
...@@ -1848,10 +1884,14 @@ static struct ceph_osd_request *rbd_osd_req_create( ...@@ -1848,10 +1884,14 @@ static struct ceph_osd_request *rbd_osd_req_create(
struct ceph_osd_client *osdc; struct ceph_osd_client *osdc;
struct ceph_osd_request *osd_req; struct ceph_osd_request *osd_req;
if (obj_request_img_data_test(obj_request) && op_type == OBJ_OP_WRITE) { if (obj_request_img_data_test(obj_request) &&
(op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
struct rbd_img_request *img_request = obj_request->img_request; struct rbd_img_request *img_request = obj_request->img_request;
if (op_type == OBJ_OP_WRITE) {
rbd_assert(img_request_write_test(img_request)); rbd_assert(img_request_write_test(img_request));
} else {
rbd_assert(img_request_discard_test(img_request));
}
snapc = img_request->snapc; snapc = img_request->snapc;
} }
...@@ -1865,7 +1905,7 @@ static struct ceph_osd_request *rbd_osd_req_create( ...@@ -1865,7 +1905,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
if (!osd_req) if (!osd_req)
return NULL; /* ENOMEM */ return NULL; /* ENOMEM */
if (op_type == OBJ_OP_WRITE) if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
else else
osd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req->r_flags = CEPH_OSD_FLAG_READ;
...@@ -2086,7 +2126,10 @@ static struct rbd_img_request *rbd_img_request_create( ...@@ -2086,7 +2126,10 @@ static struct rbd_img_request *rbd_img_request_create(
img_request->offset = offset; img_request->offset = offset;
img_request->length = length; img_request->length = length;
img_request->flags = 0; img_request->flags = 0;
if (op_type == OBJ_OP_WRITE) { if (op_type == OBJ_OP_DISCARD) {
img_request_discard_set(img_request);
img_request->snapc = snapc;
} else if (op_type == OBJ_OP_WRITE) {
img_request_write_set(img_request); img_request_write_set(img_request);
img_request->snapc = snapc; img_request->snapc = snapc;
} else { } else {
...@@ -2187,8 +2230,12 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) ...@@ -2187,8 +2230,12 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
struct rbd_device *rbd_dev = img_request->rbd_dev; struct rbd_device *rbd_dev = img_request->rbd_dev;
enum obj_operation_type op_type; enum obj_operation_type op_type;
op_type = img_request_write_test(img_request) ? OBJ_OP_WRITE : if (img_request_discard_test(img_request))
OBJ_OP_READ; op_type = OBJ_OP_DISCARD;
else if (img_request_write_test(img_request))
op_type = OBJ_OP_WRITE;
else
op_type = OBJ_OP_READ;
rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
obj_op_name(op_type), obj_request->length, obj_op_name(op_type), obj_request->length,
...@@ -2275,7 +2322,9 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, ...@@ -2275,7 +2322,9 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
unsigned int bio_offset = 0; unsigned int bio_offset = 0;
struct page **pages = NULL; struct page **pages = NULL;
enum obj_operation_type op_type; enum obj_operation_type op_type;
u64 object_size = rbd_obj_bytes(&rbd_dev->header);
u64 img_offset; u64 img_offset;
u64 img_end;
u64 resid; u64 resid;
u16 opcode; u16 opcode;
...@@ -2283,6 +2332,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, ...@@ -2283,6 +2332,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
(int)type, data_desc); (int)type, data_desc);
img_offset = img_request->offset; img_offset = img_request->offset;
img_end = rbd_dev->header.image_size;
resid = img_request->length; resid = img_request->length;
rbd_assert(resid > 0); rbd_assert(resid > 0);
...@@ -2290,8 +2340,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, ...@@ -2290,8 +2340,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
bio_list = data_desc; bio_list = data_desc;
rbd_assert(img_offset == rbd_assert(img_offset ==
bio_list->bi_iter.bi_sector << SECTOR_SHIFT); bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
} else { } else if (type == OBJ_REQUEST_PAGES) {
rbd_assert(type == OBJ_REQUEST_PAGES);
pages = data_desc; pages = data_desc;
} }
...@@ -2332,7 +2381,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, ...@@ -2332,7 +2381,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
GFP_ATOMIC); GFP_ATOMIC);
if (!obj_request->bio_list) if (!obj_request->bio_list)
goto out_unwind; goto out_unwind;
} else { } else if (type == OBJ_REQUEST_PAGES) {
unsigned int page_count; unsigned int page_count;
obj_request->pages = pages; obj_request->pages = pages;
...@@ -2343,7 +2392,19 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, ...@@ -2343,7 +2392,19 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
pages += page_count; pages += page_count;
} }
if (img_request_write_test(img_request)) { if (img_request_discard_test(img_request)) {
op_type = OBJ_OP_DISCARD;
if (!offset && (length == object_size)
&& (!img_request_layered_test(img_request) ||
(rbd_dev->parent_overlap <=
obj_request->img_offset)))
opcode = CEPH_OSD_OP_DELETE;
else if ((offset + length == object_size) ||
(obj_request->img_offset + length == img_end))
opcode = CEPH_OSD_OP_TRUNCATE;
else
opcode = CEPH_OSD_OP_ZERO;
} else if (img_request_write_test(img_request)) {
op_type = OBJ_OP_WRITE; op_type = OBJ_OP_WRITE;
opcode = CEPH_OSD_OP_WRITE; opcode = CEPH_OSD_OP_WRITE;
} else { } else {
...@@ -2372,12 +2433,13 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, ...@@ -2372,12 +2433,13 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
if (type == OBJ_REQUEST_BIO) if (type == OBJ_REQUEST_BIO)
osd_req_op_extent_osd_data_bio(osd_req, which, osd_req_op_extent_osd_data_bio(osd_req, which,
obj_request->bio_list, length); obj_request->bio_list, length);
else else if (type == OBJ_REQUEST_PAGES)
osd_req_op_extent_osd_data_pages(osd_req, which, osd_req_op_extent_osd_data_pages(osd_req, which,
obj_request->pages, length, obj_request->pages, length,
offset & ~PAGE_MASK, false, false); offset & ~PAGE_MASK, false, false);
if (op_type == OBJ_OP_WRITE) /* Discards are also writes */
if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
rbd_osd_req_format_write(obj_request); rbd_osd_req_format_write(obj_request);
else else
rbd_osd_req_format_read(obj_request); rbd_osd_req_format_read(obj_request);
...@@ -3229,7 +3291,9 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) ...@@ -3229,7 +3291,9 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
u64 mapping_size; u64 mapping_size;
int result; int result;
if (rq->cmd_flags & REQ_WRITE) if (rq->cmd_flags & REQ_DISCARD)
op_type = OBJ_OP_DISCARD;
else if (rq->cmd_flags & REQ_WRITE)
op_type = OBJ_OP_WRITE; op_type = OBJ_OP_WRITE;
else else
op_type = OBJ_OP_READ; op_type = OBJ_OP_READ;
...@@ -3295,7 +3359,12 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) ...@@ -3295,7 +3359,12 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
} }
img_request->rq = rq; img_request->rq = rq;
result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, rq->bio); if (op_type == OBJ_OP_DISCARD)
result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
NULL);
else
result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
rq->bio);
if (result) if (result)
goto err_img_request; goto err_img_request;
...@@ -3667,6 +3736,11 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) ...@@ -3667,6 +3736,11 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
blk_queue_io_min(q, segment_size); blk_queue_io_min(q, segment_size);
blk_queue_io_opt(q, segment_size); blk_queue_io_opt(q, segment_size);
/* enable the discard support */
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
q->limits.discard_granularity = segment_size;
q->limits.discard_alignment = segment_size;
blk_queue_merge_bvec(q, rbd_merge_bvec); blk_queue_merge_bvec(q, rbd_merge_bvec);
disk->queue = q; disk->queue = q;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册