Commit 53d8ab29 authored by Linus Torvalds

Merge branch 'for-3.14/drivers' of git://git.kernel.dk/linux-block

Pull block IO driver changes from Jens Axboe:

 - bcache update from Kent Overstreet.

 - two bcache fixes from Nicholas Swenson.

 - cciss pci init error fix from Andrew.

 - underflow fix in the parallel IDE pg_write code from Dan Carpenter.
   I'm sure the 1 (or 0) users of that are now happy.

 - two PCI related fixes for sx8 from Jingoo Han.

 - floppy init fix for first block read from Jiri Kosina.

 - pktcdvd error return miss fix from Julia Lawall.

 - removal of IRQF_DISABLED from the SEGA Dreamcast CD-ROM code from
   Michael Opdenacker.

 - comment typo fix for the loop driver from Olaf Hering.

 - potential oops fix for null_blk from Raghavendra K T.

 - two fixes from Sam Bradshaw (Micron) for the mtip32xx driver, fixing
   an OOM problem and a problem with handling security locked conditions.

* 'for-3.14/drivers' of git://git.kernel.dk/linux-block: (47 commits)
  mg_disk: Spelling s/finised/finished/
  null_blk: Null pointer deference problem in alloc_page_buffers
  mtip32xx: Correctly handle security locked condition
  mtip32xx: Make SGL container per-command to eliminate high order dma allocation
  drivers/block/loop.c: fix comment typo in loop_config_discard
  drivers/block/cciss.c:cciss_init_one(): use proper errnos
  drivers/block/paride/pg.c: underflow bug in pg_write()
  drivers/block/sx8.c: remove unnecessary pci_set_drvdata()
  drivers/block/sx8.c: use module_pci_driver()
  floppy: bail out in open() if drive is not responding to block0 read
  bcache: Fix auxiliary search trees for key size > cacheline size
  bcache: Don't return -EINTR when insert finished
  bcache: Improve bucket_prio() calculation
  bcache: Add bch_bkey_equal_header()
  bcache: update bch_bkey_try_merge
  bcache: Move insert_fixup() to btree_keys_ops
  bcache: Convert sorting to btree_keys
  bcache: Convert debug code to btree_keys
  bcache: Convert btree_iter to struct btree_keys
  bcache: Refactor bset_tree sysfs stats
  ...
......@@ -592,6 +592,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
ret = -1;
}
t->raid_partial_stripes_expensive =
max(t->raid_partial_stripes_expensive,
b->raid_partial_stripes_expensive);
/* Find lowest common alignment_offset */
t->alignment_offset = lcm(t->alignment_offset, alignment)
& (max(t->physical_block_size, t->io_min) - 1);
......
......@@ -5004,7 +5004,7 @@ static int cciss_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
i = alloc_cciss_hba(pdev);
if (i < 0)
return -1;
return -ENOMEM;
h = hba[i];
h->pdev = pdev;
......@@ -5205,7 +5205,7 @@ static int cciss_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
*/
pci_set_drvdata(pdev, NULL);
free_hba(h);
return -1;
return -ENODEV;
}
static void cciss_shutdown(struct pci_dev *pdev)
......
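The cciss hunks above replace bare -1 returns from cciss_init_one() with real errno values. A bare -1 is indistinguishable from -EPERM to callers that interpret a probe's return value as a negative errno, which hides the actual failure cause. A minimal standalone sketch of that ambiguity (a userspace demo only, not driver code; the errno names are the ones used in the hunk above):

#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* A probe routine returning a bare -1 is read as -EPERM by callers
	 * that expect a negative errno, which hides the real failure cause. */
	int bare  = -1;
	int oom   = -ENOMEM;
	int nodev = -ENODEV;

	printf("-1      -> %s\n", strerror(-bare));   /* Operation not permitted */
	printf("-ENOMEM -> %s\n", strerror(-oom));    /* Cannot allocate memory */
	printf("-ENODEV -> %s\n", strerror(-nodev));  /* No such device */
	return 0;
}

Returning -ENOMEM versus -ENODEV also lets logs and callers distinguish an allocation failure from a device that never came up.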
......@@ -3691,9 +3691,12 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
if (!(mode & FMODE_NDELAY)) {
if (mode & (FMODE_READ|FMODE_WRITE)) {
UDRS->last_checked = 0;
clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
check_disk_change(bdev);
if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags))
goto out;
if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags))
goto out;
}
res = -EROFS;
if ((mode & FMODE_WRITE) &&
......@@ -3746,17 +3749,29 @@ static unsigned int floppy_check_events(struct gendisk *disk,
* a disk in the drive, and whether that disk is writable.
*/
static void floppy_rb0_complete(struct bio *bio, int err)
struct rb0_cbdata {
int drive;
struct completion complete;
};
static void floppy_rb0_cb(struct bio *bio, int err)
{
complete((struct completion *)bio->bi_private);
struct rb0_cbdata *cbdata = (struct rb0_cbdata *)bio->bi_private;
int drive = cbdata->drive;
if (err) {
pr_info("floppy: error %d while reading block 0", err);
set_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
}
complete(&cbdata->complete);
}
static int __floppy_read_block_0(struct block_device *bdev)
static int __floppy_read_block_0(struct block_device *bdev, int drive)
{
struct bio bio;
struct bio_vec bio_vec;
struct completion complete;
struct page *page;
struct rb0_cbdata cbdata;
size_t size;
page = alloc_page(GFP_NOIO);
......@@ -3769,6 +3784,8 @@ static int __floppy_read_block_0(struct block_device *bdev)
if (!size)
size = 1024;
cbdata.drive = drive;
bio_init(&bio);
bio.bi_io_vec = &bio_vec;
bio_vec.bv_page = page;
......@@ -3779,13 +3796,14 @@ static int __floppy_read_block_0(struct block_device *bdev)
bio.bi_bdev = bdev;
bio.bi_iter.bi_sector = 0;
bio.bi_flags = (1 << BIO_QUIET);
init_completion(&complete);
bio.bi_private = &complete;
bio.bi_end_io = floppy_rb0_complete;
bio.bi_private = &cbdata;
bio.bi_end_io = floppy_rb0_cb;
submit_bio(READ, &bio);
process_fd_request();
wait_for_completion(&complete);
init_completion(&cbdata.complete);
wait_for_completion(&cbdata.complete);
__free_page(page);
......@@ -3827,7 +3845,7 @@ static int floppy_revalidate(struct gendisk *disk)
UDRS->generation++;
if (drive_no_geom(drive)) {
/* auto-sensing */
res = __floppy_read_block_0(opened_bdev[drive]);
res = __floppy_read_block_0(opened_bdev[drive], drive);
} else {
if (cf)
poll_drive(false, FD_RAW_NEED_DISK);
......
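The floppy change above bundles the completion and the drive number into struct rb0_cbdata, so the bio end_io callback can record a block-0 read failure against the right drive via FD_OPEN_SHOULD_FAIL_BIT, and floppy_open() then refuses the open. Below is a rough userspace analog of that pattern, assuming pthread primitives stand in for the kernel's bio/completion machinery and -5 is just an illustrative stand-in for -EIO:

#include <pthread.h>
#include <stdio.h>

struct rb0_cbdata {            /* mirrors the struct added in the hunk above */
	int drive;
	int error;
	int completed;
	pthread_mutex_t lock;
	pthread_cond_t done;
};

/* Completion callback: all per-request context arrives via one opaque pointer. */
static void rb0_complete(void *private, int err)
{
	struct rb0_cbdata *cb = private;

	if (err)
		printf("drive %d: error %d while reading block 0\n", cb->drive, err);
	cb->error = err;

	pthread_mutex_lock(&cb->lock);
	cb->completed = 1;
	pthread_cond_signal(&cb->done);
	pthread_mutex_unlock(&cb->lock);
}

static void *fake_io(void *arg)
{
	rb0_complete(arg, -5);	/* pretend the read failed */
	return NULL;
}

int main(void)
{
	struct rb0_cbdata cb = { .drive = 0 };
	pthread_t t;

	pthread_mutex_init(&cb.lock, NULL);
	pthread_cond_init(&cb.done, NULL);

	pthread_create(&t, NULL, fake_io, &cb);

	/* Equivalent of wait_for_completion(&cbdata.complete). */
	pthread_mutex_lock(&cb.lock);
	while (!cb.completed)
		pthread_cond_wait(&cb.done, &cb.lock);
	pthread_mutex_unlock(&cb.lock);
	pthread_join(t, NULL);

	printf("drive %d finished, error=%d\n", cb.drive, cb.error);
	return 0;
}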
......@@ -799,7 +799,7 @@ static void loop_config_discard(struct loop_device *lo)
/*
* We use punch hole to reclaim the free space used by the
* image a.k.a. discard. However we do support discard if
* image a.k.a. discard. However we do not support discard if
* encryption is enabled, because it may give an attacker
* useful information.
*/
......
......@@ -915,7 +915,7 @@ static int mg_probe(struct platform_device *plat_dev)
/* disk reset */
if (prv_data->dev_attr == MG_STORAGE_DEV) {
/* If POR seq. not yet finised, wait */
/* If POR seq. not yet finished, wait */
err = mg_wait_rstout(host->rstout, MG_TMAX_RSTOUT);
if (err)
goto probe_err_3b;
......
......@@ -41,10 +41,31 @@
#include "mtip32xx.h"
#define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32)
#define HW_CMD_TBL_SZ (AHCI_CMD_TBL_HDR_SZ + (MTIP_MAX_SG * 16))
#define HW_CMD_TBL_AR_SZ (HW_CMD_TBL_SZ * MTIP_MAX_COMMAND_SLOTS)
#define HW_PORT_PRIV_DMA_SZ \
(HW_CMD_SLOT_SZ + HW_CMD_TBL_AR_SZ + AHCI_RX_FIS_SZ)
/* DMA region containing RX Fis, Identify, RLE10, and SMART buffers */
#define AHCI_RX_FIS_SZ 0x100
#define AHCI_RX_FIS_OFFSET 0x0
#define AHCI_IDFY_SZ ATA_SECT_SIZE
#define AHCI_IDFY_OFFSET 0x400
#define AHCI_SECTBUF_SZ ATA_SECT_SIZE
#define AHCI_SECTBUF_OFFSET 0x800
#define AHCI_SMARTBUF_SZ ATA_SECT_SIZE
#define AHCI_SMARTBUF_OFFSET 0xC00
/* 0x100 + 0x200 + 0x200 + 0x200 is smaller than 4k but we pad it out */
#define BLOCK_DMA_ALLOC_SZ 4096
/* DMA region containing command table (should be 8192 bytes) */
#define AHCI_CMD_SLOT_SZ sizeof(struct mtip_cmd_hdr)
#define AHCI_CMD_TBL_SZ (MTIP_MAX_COMMAND_SLOTS * AHCI_CMD_SLOT_SZ)
#define AHCI_CMD_TBL_OFFSET 0x0
/* DMA region per command (contains header and SGL) */
#define AHCI_CMD_TBL_HDR_SZ 0x80
#define AHCI_CMD_TBL_HDR_OFFSET 0x0
#define AHCI_CMD_TBL_SGL_SZ (MTIP_MAX_SG * sizeof(struct mtip_cmd_sg))
#define AHCI_CMD_TBL_SGL_OFFSET AHCI_CMD_TBL_HDR_SZ
#define CMD_DMA_ALLOC_SZ (AHCI_CMD_TBL_SGL_SZ + AHCI_CMD_TBL_HDR_SZ)
#define HOST_CAP_NZDMA (1 << 19)
#define HOST_HSORG 0xFC
......@@ -899,8 +920,9 @@ static void mtip_handle_tfe(struct driver_data *dd)
fail_reason = "thermal shutdown";
}
if (buf[288] == 0xBF) {
set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag);
dev_info(&dd->pdev->dev,
"Drive indicates rebuild has failed.\n");
"Drive indicates rebuild has failed. Secure erase required.\n");
fail_all_ncq_cmds = 1;
fail_reason = "rebuild failed";
}
......@@ -1566,6 +1588,12 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
}
#endif
/* Check security locked state */
if (port->identify[128] & 0x4)
set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
else
clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
#ifdef MTIP_TRIM /* Disabling TRIM support temporarily */
/* Demux ID.DRAT & ID.RZAT to determine trim support */
if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5))
......@@ -1887,6 +1915,10 @@ static void mtip_dump_identify(struct mtip_port *port)
strlcpy(cbuf, (char *)(port->identify+27), 41);
dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf);
dev_info(&port->dd->pdev->dev, "Security: %04x %s\n",
port->identify[128],
port->identify[128] & 0x4 ? "(LOCKED)" : "");
if (mtip_hw_get_capacity(port->dd, &sectors))
dev_info(&port->dd->pdev->dev,
"Capacity: %llu sectors (%llu MB)\n",
......@@ -3312,6 +3344,118 @@ static int mtip_service_thread(void *data)
return 0;
}
/*
* DMA region teardown
*
* @dd Pointer to driver_data structure
*
* return value
* None
*/
static void mtip_dma_free(struct driver_data *dd)
{
int i;
struct mtip_port *port = dd->port;
if (port->block1)
dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
port->block1, port->block1_dma);
if (port->command_list) {
dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
port->command_list, port->command_list_dma);
}
for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) {
if (port->commands[i].command)
dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
port->commands[i].command,
port->commands[i].command_dma);
}
}
/*
* DMA region setup
*
* @dd Pointer to driver_data structure
*
* return value
* -ENOMEM Not enough free DMA region space to initialize driver
*/
static int mtip_dma_alloc(struct driver_data *dd)
{
struct mtip_port *port = dd->port;
int i, rv = 0;
u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
/* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */
port->block1 =
dmam_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
&port->block1_dma, GFP_KERNEL);
if (!port->block1)
return -ENOMEM;
memset(port->block1, 0, BLOCK_DMA_ALLOC_SZ);
/* Allocate dma memory for command list */
port->command_list =
dmam_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
&port->command_list_dma, GFP_KERNEL);
if (!port->command_list) {
dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
port->block1, port->block1_dma);
port->block1 = NULL;
port->block1_dma = 0;
return -ENOMEM;
}
memset(port->command_list, 0, AHCI_CMD_TBL_SZ);
/* Setup all pointers into first DMA region */
port->rxfis = port->block1 + AHCI_RX_FIS_OFFSET;
port->rxfis_dma = port->block1_dma + AHCI_RX_FIS_OFFSET;
port->identify = port->block1 + AHCI_IDFY_OFFSET;
port->identify_dma = port->block1_dma + AHCI_IDFY_OFFSET;
port->log_buf = port->block1 + AHCI_SECTBUF_OFFSET;
port->log_buf_dma = port->block1_dma + AHCI_SECTBUF_OFFSET;
port->smart_buf = port->block1 + AHCI_SMARTBUF_OFFSET;
port->smart_buf_dma = port->block1_dma + AHCI_SMARTBUF_OFFSET;
/* Setup per command SGL DMA region */
/* Point the command headers at the command tables */
for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) {
port->commands[i].command =
dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
&port->commands[i].command_dma, GFP_KERNEL);
if (!port->commands[i].command) {
rv = -ENOMEM;
mtip_dma_free(dd);
return rv;
}
memset(port->commands[i].command, 0, CMD_DMA_ALLOC_SZ);
port->commands[i].command_header = port->command_list +
(sizeof(struct mtip_cmd_hdr) * i);
port->commands[i].command_header_dma =
dd->port->command_list_dma +
(sizeof(struct mtip_cmd_hdr) * i);
if (host_cap_64)
port->commands[i].command_header->ctbau =
__force_bit2int cpu_to_le32(
(port->commands[i].command_dma >> 16) >> 16);
port->commands[i].command_header->ctba =
__force_bit2int cpu_to_le32(
port->commands[i].command_dma & 0xFFFFFFFF);
sg_init_table(port->commands[i].sg, MTIP_MAX_SG);
/* Mark command as currently inactive */
atomic_set(&dd->port->commands[i].active, 0);
}
return 0;
}
/*
* Called once for each card.
*
......@@ -3370,83 +3514,10 @@ static int mtip_hw_init(struct driver_data *dd)
dd->port->mmio = dd->mmio + PORT_OFFSET;
dd->port->dd = dd;
/* Allocate memory for the command list. */
dd->port->command_list =
dmam_alloc_coherent(&dd->pdev->dev,
HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
&dd->port->command_list_dma,
GFP_KERNEL);
if (!dd->port->command_list) {
dev_err(&dd->pdev->dev,
"Memory allocation: command list\n");
rv = -ENOMEM;
/* DMA allocations */
rv = mtip_dma_alloc(dd);
if (rv < 0)
goto out1;
}
/* Clear the memory we have allocated. */
memset(dd->port->command_list,
0,
HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4));
/* Setup the addresse of the RX FIS. */
dd->port->rxfis = dd->port->command_list + HW_CMD_SLOT_SZ;
dd->port->rxfis_dma = dd->port->command_list_dma + HW_CMD_SLOT_SZ;
/* Setup the address of the command tables. */
dd->port->command_table = dd->port->rxfis + AHCI_RX_FIS_SZ;
dd->port->command_tbl_dma = dd->port->rxfis_dma + AHCI_RX_FIS_SZ;
/* Setup the address of the identify data. */
dd->port->identify = dd->port->command_table +
HW_CMD_TBL_AR_SZ;
dd->port->identify_dma = dd->port->command_tbl_dma +
HW_CMD_TBL_AR_SZ;
/* Setup the address of the sector buffer - for some non-ncq cmds */
dd->port->sector_buffer = (void *) dd->port->identify + ATA_SECT_SIZE;
dd->port->sector_buffer_dma = dd->port->identify_dma + ATA_SECT_SIZE;
/* Setup the address of the log buf - for read log command */
dd->port->log_buf = (void *)dd->port->sector_buffer + ATA_SECT_SIZE;
dd->port->log_buf_dma = dd->port->sector_buffer_dma + ATA_SECT_SIZE;
/* Setup the address of the smart buf - for smart read data command */
dd->port->smart_buf = (void *)dd->port->log_buf + ATA_SECT_SIZE;
dd->port->smart_buf_dma = dd->port->log_buf_dma + ATA_SECT_SIZE;
/* Point the command headers at the command tables. */
for (i = 0; i < num_command_slots; i++) {
dd->port->commands[i].command_header =
dd->port->command_list +
(sizeof(struct mtip_cmd_hdr) * i);
dd->port->commands[i].command_header_dma =
dd->port->command_list_dma +
(sizeof(struct mtip_cmd_hdr) * i);
dd->port->commands[i].command =
dd->port->command_table + (HW_CMD_TBL_SZ * i);
dd->port->commands[i].command_dma =
dd->port->command_tbl_dma + (HW_CMD_TBL_SZ * i);
if (readl(dd->mmio + HOST_CAP) & HOST_CAP_64)
dd->port->commands[i].command_header->ctbau =
__force_bit2int cpu_to_le32(
(dd->port->commands[i].command_dma >> 16) >> 16);
dd->port->commands[i].command_header->ctba =
__force_bit2int cpu_to_le32(
dd->port->commands[i].command_dma & 0xFFFFFFFF);
/*
* If this is not done, a bug is reported by the stock
* FC11 i386. Due to the fact that it has lots of kernel
* debugging enabled.
*/
sg_init_table(dd->port->commands[i].sg, MTIP_MAX_SG);
/* Mark all commands as currently inactive.*/
atomic_set(&dd->port->commands[i].active, 0);
}
/* Setup the pointers to the extended s_active and CI registers. */
for (i = 0; i < dd->slot_groups; i++) {
......@@ -3594,12 +3665,8 @@ static int mtip_hw_init(struct driver_data *dd)
out2:
mtip_deinit_port(dd->port);
mtip_dma_free(dd);
/* Free the command/command header memory. */
dmam_free_coherent(&dd->pdev->dev,
HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
dd->port->command_list,
dd->port->command_list_dma);
out1:
/* Free the memory allocated for the for structure. */
kfree(dd->port);
......@@ -3622,7 +3689,8 @@ static int mtip_hw_exit(struct driver_data *dd)
* saves its state.
*/
if (!dd->sr) {
if (!test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))
if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) &&
!test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))
if (mtip_standby_immediate(dd->port))
dev_warn(&dd->pdev->dev,
"STANDBY IMMEDIATE failed\n");
......@@ -3641,11 +3709,9 @@ static int mtip_hw_exit(struct driver_data *dd)
irq_set_affinity_hint(dd->pdev->irq, NULL);
devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
/* Free the command/command header memory. */
dmam_free_coherent(&dd->pdev->dev,
HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
dd->port->command_list,
dd->port->command_list_dma);
/* Free dma regions */
mtip_dma_free(dd);
/* Free the memory allocated for the for structure. */
kfree(dd->port);
dd->port = NULL;
......
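The identify-data check added earlier in this file tests bit 2 of IDENTIFY word 128 to decide whether MTIP_DDF_SEC_LOCK_BIT should be set. A small standalone decode of that word; the full bit layout shown here follows the usual ATA security status word and is an assumption beyond the single bit the driver actually tests:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Example IDENTIFY DEVICE word 128 value (invented for illustration). */
	uint16_t w128 = 0x0007;

	printf("security supported: %u\n", (unsigned)(w128 & 0x1));
	printf("security enabled:   %u\n", (unsigned)((w128 >> 1) & 0x1));
	printf("security locked:    %u\n", (unsigned)((w128 >> 2) & 0x1));
	printf("security frozen:    %u\n", (unsigned)((w128 >> 3) & 0x1));

	if (w128 & 0x4)
		printf("-> driver would set MTIP_DDF_SEC_LOCK_BIT\n");
	return 0;
}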
......@@ -69,7 +69,7 @@
* Maximum number of scatter gather entries
* a single command may have.
*/
#define MTIP_MAX_SG 128
#define MTIP_MAX_SG 504
/*
* Maximum number of slot groups (Command Issue & s_active registers)
......@@ -92,7 +92,7 @@
/* Driver name and version strings */
#define MTIP_DRV_NAME "mtip32xx"
#define MTIP_DRV_VERSION "1.2.6os3"
#define MTIP_DRV_VERSION "1.3.0"
/* Maximum number of minor device numbers per device. */
#define MTIP_MAX_MINORS 16
......@@ -391,15 +391,13 @@ struct mtip_port {
*/
dma_addr_t rxfis_dma;
/*
* Pointer to the beginning of the command table memory as used
* by the driver.
* Pointer to the DMA region for RX Fis, Identify, RLE10, and SMART
*/
void *command_table;
void *block1;
/*
* Pointer to the beginning of the command table memory as used
* by the DMA.
* DMA address of region for RX Fis, Identify, RLE10, and SMART
*/
dma_addr_t command_tbl_dma;
dma_addr_t block1_dma;
/*
* Pointer to the beginning of the identify data memory as used
* by the driver.
......
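The mtip32xx changes above split one large per-port coherent allocation into a 4 KB shared block plus a small region per command slot, while raising MTIP_MAX_SG from 128 to 504. The sketch below recomputes the sizes from the constants visible in this diff; the 32-slot count and the 16-byte scatter/gather entry size are assumptions inferred from the HW_CMD_SLOT_SZ and HW_CMD_TBL_SZ definitions:

#include <stdio.h>

int main(void)
{
	/* Constants copied from the hunks above; the slot count is an
	 * assumption based on the AHCI command-slot limit. */
	const unsigned slots          = 32;
	const unsigned sg_entry_bytes = 16;
	const unsigned hdr_sz         = 0x80;   /* AHCI_CMD_TBL_HDR_SZ */
	const unsigned rx_fis_sz      = 0x100;  /* AHCI_RX_FIS_SZ */

	/* Old layout: everything in one coherent allocation per port. */
	unsigned old_sg     = 128;              /* old MTIP_MAX_SG */
	unsigned old_tbl    = hdr_sz + old_sg * sg_entry_bytes;   /* 2176 */
	unsigned old_region = slots * 32        /* command headers */
			    + slots * old_tbl   /* command tables  */
			    + rx_fis_sz;        /* RX FIS          */

	/* New layout: small shared block plus one allocation per command slot. */
	unsigned new_sg      = 504;             /* new MTIP_MAX_SG */
	unsigned new_block1  = 4096;            /* BLOCK_DMA_ALLOC_SZ */
	unsigned new_cmdlist = slots * 32;      /* AHCI_CMD_TBL_SZ */
	unsigned new_per_cmd = hdr_sz + new_sg * sg_entry_bytes;   /* 8192 */

	printf("old single region: %u bytes (plus four sector buffers)\n", old_region);
	printf("new shared block:  %u + %u bytes\n", new_block1, new_cmdlist);
	printf("new per command:   %u bytes x %u slots\n", new_per_cmd, slots);
	return 0;
}

Each per-command region comes out at 8192 bytes, matching the "(should be 8192 bytes)" comment above, so every DMA allocation is small, while the old scheme needed one much larger contiguous coherent region per port even though MTIP_MAX_SG was a quarter of its new value.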
......@@ -616,6 +616,11 @@ static int __init null_init(void)
irqmode = NULL_IRQ_NONE;
}
#endif
if (bs > PAGE_SIZE) {
pr_warn("null_blk: invalid block size\n");
pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);
bs = PAGE_SIZE;
}
if (queue_mode == NULL_Q_MQ && use_per_node_hctx) {
if (submit_queues < nr_online_nodes) {
......
......@@ -581,7 +581,7 @@ static ssize_t pg_write(struct file *filp, const char __user *buf, size_t count,
if (hdr.magic != PG_MAGIC)
return -EINVAL;
if (hdr.dlen > PG_MAX_DATA)
if (hdr.dlen < 0 || hdr.dlen > PG_MAX_DATA)
return -EINVAL;
if ((count - hs) > PG_MAX_DATA)
return -EINVAL;
......
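The pg_write() hunk above adds the missing negative check on hdr.dlen. A small standalone demonstration of the underflow class being fixed; PG_MAX_DATA's value and the later use of dlen as an unsigned copy length are assumptions for illustration, while the signedness of dlen is implied by the added "< 0" test:

#include <stdio.h>
#include <stddef.h>

#define PG_MAX_DATA 32768	/* value assumed for illustration */

int main(void)
{
	int dlen = -1;		/* user-chosen header field */

	/* Old check: a negative dlen is not "> PG_MAX_DATA", so it passes. */
	if (dlen > PG_MAX_DATA)
		return 1;

	/* But once it is used as a size, it wraps to a huge value. */
	size_t copy_len = dlen;
	printf("copy length would be %zu bytes\n", copy_len);

	/* The fixed check from the hunk above rejects it. */
	if (dlen < 0 || dlen > PG_MAX_DATA)
		printf("rejected with -EINVAL\n");
	return 0;
}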
......@@ -706,7 +706,9 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
WRITE : READ, __GFP_WAIT);
if (cgc->buflen) {
if (blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, __GFP_WAIT))
ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
__GFP_WAIT);
if (ret)
goto out;
}
......
......@@ -1744,20 +1744,6 @@ static void carm_remove_one (struct pci_dev *pdev)
kfree(host);
pci_release_regions(pdev);
pci_disable_device(pdev);
pci_set_drvdata(pdev, NULL);
}
static int __init carm_init(void)
{
return pci_register_driver(&carm_driver);
}
static void __exit carm_exit(void)
{
pci_unregister_driver(&carm_driver);
}
module_init(carm_init);
module_exit(carm_exit);
module_pci_driver(carm_driver);
......@@ -561,11 +561,11 @@ static int gdrom_set_interrupt_handlers(void)
int err;
err = request_irq(HW_EVENT_GDROM_CMD, gdrom_command_interrupt,
IRQF_DISABLED, "gdrom_command", &gd);
0, "gdrom_command", &gd);
if (err)
return err;
err = request_irq(HW_EVENT_GDROM_DMA, gdrom_dma_interrupt,
IRQF_DISABLED, "gdrom_dma", &gd);
0, "gdrom_dma", &gd);
if (err)
free_irq(HW_EVENT_GDROM_CMD, &gd);
return err;
......
obj-$(CONFIG_BCACHE) += bcache.o
bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\
movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o
bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
util.o writeback.o
CFLAGS_request.o += -Iblock
......@@ -132,10 +132,16 @@ bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
{
BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] &&
CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO)
return false;
if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
unsigned i;
for (i = 0; i < RESERVE_NONE; i++)
if (!fifo_full(&ca->free[i]))
goto add;
return false;
}
add:
b->prio = 0;
if (can_inc_bucket_gen(b) &&
......@@ -162,8 +168,21 @@ static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
fifo_push(&ca->free_inc, b - ca->buckets);
}
#define bucket_prio(b) \
(((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b))
/*
* Determines what order we're going to reuse buckets, smallest bucket_prio()
* first: we also take into account the number of sectors of live data in that
* bucket, and in order for that multiply to make sense we have to scale bucket
*
* Thus, we scale the bucket priorities so that the bucket with the smallest
* prio is worth 1/8th of what INITIAL_PRIO is worth.
*/
#define bucket_prio(b) \
({ \
unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \
\
(b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \
})
#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r))
#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r))
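The new bucket_prio() above adds a floor of (INITIAL_PRIO - min_prio) / 8 before multiplying by the live-sector count, so a bucket sitting at the set's minimum prio no longer computes to priority 0 regardless of how much live data it holds. A small sketch comparing the two formulas; INITIAL_PRIO = 32768 is taken from elsewhere in this series, and the example prio/sector values are invented:

#include <stdio.h>

#define INITIAL_PRIO 32768U

static unsigned old_prio(unsigned prio, unsigned min_prio, unsigned sectors)
{
	return (prio - min_prio) * sectors;
}

static unsigned new_prio(unsigned prio, unsigned min_prio, unsigned sectors)
{
	unsigned floor = (INITIAL_PRIO - min_prio) / 8;

	return (prio - min_prio + floor) * sectors;
}

int main(void)
{
	unsigned min_prio = 100;	/* invented example values */

	/* Bucket A: prio at the minimum, but nearly full of live data. */
	printf("A old=%u new=%u\n",
	       old_prio(100, min_prio, 1000), new_prio(100, min_prio, 1000));

	/* Bucket B: slightly higher prio, almost empty. */
	printf("B old=%u new=%u\n",
	       old_prio(101, min_prio, 2), new_prio(101, min_prio, 2));
	return 0;
}

With the old formula the full bucket A sorts to priority 0 and is invalidated before the nearly empty bucket B; with the floor, B is reused first, which is the behaviour the comment above describes.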
......@@ -304,6 +323,21 @@ do { \
__set_current_state(TASK_RUNNING); \
} while (0)
static int bch_allocator_push(struct cache *ca, long bucket)
{
unsigned i;
/* Prios/gens are actually the most important reserve */
if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
return true;
for (i = 0; i < RESERVE_NR; i++)
if (fifo_push(&ca->free[i], bucket))
return true;
return false;
}
static int bch_allocator_thread(void *arg)
{
struct cache *ca = arg;
......@@ -336,9 +370,7 @@ static int bch_allocator_thread(void *arg)
mutex_lock(&ca->set->bucket_lock);
}
allocator_wait(ca, !fifo_full(&ca->free));
fifo_push(&ca->free, bucket);
allocator_wait(ca, bch_allocator_push(ca, bucket));
wake_up(&ca->set->bucket_wait);
}
......@@ -365,34 +397,29 @@ static int bch_allocator_thread(void *arg)
}
}
long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait)
long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
{
DEFINE_WAIT(w);
struct bucket *b;
long r;
/* fastpath */
if (fifo_used(&ca->free) > ca->watermark[watermark]) {
fifo_pop(&ca->free, r);
if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
fifo_pop(&ca->free[reserve], r))
goto out;
}
if (!wait)
return -1;
while (1) {
if (fifo_used(&ca->free) > ca->watermark[watermark]) {
fifo_pop(&ca->free, r);
break;
}
do {
prepare_to_wait(&ca->set->bucket_wait, &w,
TASK_UNINTERRUPTIBLE);
mutex_unlock(&ca->set->bucket_lock);
schedule();
mutex_lock(&ca->set->bucket_lock);
}
} while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
!fifo_pop(&ca->free[reserve], r));
finish_wait(&ca->set->bucket_wait, &w);
out:
......@@ -401,12 +428,14 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait)
if (expensive_debug_checks(ca->set)) {
size_t iter;
long i;
unsigned j;
for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
fifo_for_each(i, &ca->free, iter)
BUG_ON(i == r);
for (j = 0; j < RESERVE_NR; j++)
fifo_for_each(i, &ca->free[j], iter)
BUG_ON(i == r);
fifo_for_each(i, &ca->free_inc, iter)
BUG_ON(i == r);
fifo_for_each(i, &ca->unused, iter)
......@@ -419,7 +448,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait)
SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
if (watermark <= WATERMARK_METADATA) {
if (reserve <= RESERVE_PRIO) {
SET_GC_MARK(b, GC_MARK_METADATA);
SET_GC_MOVE(b, 0);
b->prio = BTREE_PRIO;
......@@ -445,7 +474,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
}
}
int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
struct bkey *k, int n, bool wait)
{
int i;
......@@ -459,7 +488,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
for (i = 0; i < n; i++) {
struct cache *ca = c->cache_by_alloc[i];
long b = bch_bucket_alloc(ca, watermark, wait);
long b = bch_bucket_alloc(ca, reserve, wait);
if (b == -1)
goto err;
......@@ -478,12 +507,12 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
return -1;
}
int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
struct bkey *k, int n, bool wait)
{
int ret;
mutex_lock(&c->bucket_lock);
ret = __bch_bucket_alloc_set(c, watermark, k, n, wait);
ret = __bch_bucket_alloc_set(c, reserve, k, n, wait);
mutex_unlock(&c->bucket_lock);
return ret;
}
......@@ -573,8 +602,8 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
unsigned watermark = write_prio
? WATERMARK_MOVINGGC
: WATERMARK_NONE;
? RESERVE_MOVINGGC
: RESERVE_NONE;
spin_unlock(&c->data_bucket_lock);
......@@ -689,7 +718,7 @@ int bch_cache_allocator_init(struct cache *ca)
* Then 8 for btree allocations
* Then half for the moving garbage collector
*/
#if 0
ca->watermark[WATERMARK_PRIO] = 0;
ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
......@@ -699,6 +728,6 @@ int bch_cache_allocator_init(struct cache *ca)
ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
ca->watermark[WATERMARK_MOVINGGC];
#endif
return 0;
}
......@@ -187,6 +187,7 @@
#include <linux/types.h>
#include <linux/workqueue.h>
#include "bset.h"
#include "util.h"
#include "closure.h"
......@@ -309,7 +310,8 @@ struct cached_dev {
struct cache_sb sb;
struct bio sb_bio;
struct bio_vec sb_bv[1];
struct closure_with_waitlist sb_write;
struct closure sb_write;
struct semaphore sb_write_mutex;
/* Refcount on the cache set. Always nonzero when we're caching. */
atomic_t count;
......@@ -382,12 +384,12 @@ struct cached_dev {
unsigned writeback_rate_p_term_inverse;
};
enum alloc_watermarks {
WATERMARK_PRIO,
WATERMARK_METADATA,
WATERMARK_MOVINGGC,
WATERMARK_NONE,
WATERMARK_MAX
enum alloc_reserve {
RESERVE_BTREE,
RESERVE_PRIO,
RESERVE_MOVINGGC,
RESERVE_NONE,
RESERVE_NR,
};
struct cache {
......@@ -399,8 +401,6 @@ struct cache {
struct kobject kobj;
struct block_device *bdev;
unsigned watermark[WATERMARK_MAX];
struct task_struct *alloc_thread;
struct closure prio;
......@@ -429,7 +429,7 @@ struct cache {
* because all the data they contained was overwritten), so we only
* need to discard them before they can be moved to the free list.
*/
DECLARE_FIFO(long, free);
DECLARE_FIFO(long, free)[RESERVE_NR];
DECLARE_FIFO(long, free_inc);
DECLARE_FIFO(long, unused);
......@@ -514,7 +514,8 @@ struct cache_set {
uint64_t cached_dev_sectors;
struct closure caching;
struct closure_with_waitlist sb_write;
struct closure sb_write;
struct semaphore sb_write_mutex;
mempool_t *search;
mempool_t *bio_meta;
......@@ -629,13 +630,15 @@ struct cache_set {
#ifdef CONFIG_BCACHE_DEBUG
struct btree *verify_data;
struct bset *verify_ondisk;
struct mutex verify_lock;
#endif
unsigned nr_uuids;
struct uuid_entry *uuids;
BKEY_PADDED(uuid_bucket);
struct closure_with_waitlist uuid_write;
struct closure uuid_write;
struct semaphore uuid_write_mutex;
/*
* A btree node on disk could have too many bsets for an iterator to fit
......@@ -643,13 +646,7 @@ struct cache_set {
*/
mempool_t *fill_iter;
/*
* btree_sort() is a merge sort and requires temporary space - single
* element mempool
*/
struct mutex sort_lock;
struct bset *sort;
unsigned sort_crit_factor;
struct bset_sort_state sort;
/* List of buckets we're currently writing data to */
struct list_head data_buckets;
......@@ -665,7 +662,6 @@ struct cache_set {
unsigned congested_read_threshold_us;
unsigned congested_write_threshold_us;
struct time_stats sort_time;
struct time_stats btree_gc_time;
struct time_stats btree_split_time;
struct time_stats btree_read_time;
......@@ -683,9 +679,9 @@ struct cache_set {
unsigned error_decay;
unsigned short journal_delay_ms;
bool expensive_debug_checks;
unsigned verify:1;
unsigned key_merging_disabled:1;
unsigned expensive_debug_checks:1;
unsigned gc_always_rewrite:1;
unsigned shrinker_disabled:1;
unsigned copy_gc_enabled:1;
......@@ -707,13 +703,8 @@ struct bbio {
struct bio bio;
};
static inline unsigned local_clock_us(void)
{
return local_clock() >> 10;
}
#define BTREE_PRIO USHRT_MAX
#define INITIAL_PRIO 32768
#define INITIAL_PRIO 32768U
#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE)
#define btree_blocks(b) \
......@@ -726,21 +717,6 @@ static inline unsigned local_clock_us(void)
#define bucket_bytes(c) ((c)->sb.bucket_size << 9)
#define block_bytes(c) ((c)->sb.block_size << 9)
#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t))
#define set_bytes(i) __set_bytes(i, i->keys)
#define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c))
#define set_blocks(i, c) __set_blocks(i, (i)->keys, c)
#define node(i, j) ((struct bkey *) ((i)->d + (j)))
#define end(i) node(i, (i)->keys)
#define index(i, b) \
((size_t) (((void *) i - (void *) (b)->sets[0].data) / \
block_bytes(b->c)))
#define btree_data_space(b) (PAGE_SIZE << (b)->page_order)
#define prios_per_bucket(c) \
((bucket_bytes(c) - sizeof(struct prio_set)) / \
sizeof(struct bucket_disk))
......@@ -783,20 +759,34 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
}
/* Btree key macros */
static inline uint8_t gen_after(uint8_t a, uint8_t b)
{
uint8_t r = a - b;
return r > 128U ? 0 : r;
}
static inline void bkey_init(struct bkey *k)
static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
unsigned i)
{
*k = ZERO_KEY;
return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
}
static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
unsigned i)
{
return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
}
/* Btree key macros */
/*
* This is used for various on disk data structures - cache_sb, prio_set, bset,
* jset: The checksum is _always_ the first 8 bytes of these structs
*/
#define csum_set(i) \
bch_crc64(((void *) (i)) + sizeof(uint64_t), \
((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t)))
((void *) bset_bkey_last(i)) - \
(((void *) (i)) + sizeof(uint64_t)))
/* Error handling macros */
......
......@@ -5,30 +5,134 @@
* Copyright 2012 Google, Inc.
*/
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
#include "util.h"
#include "bset.h"
#include <linux/console.h>
#include <linux/random.h>
#include <linux/prefetch.h>
#ifdef CONFIG_BCACHE_DEBUG
void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
{
struct bkey *k, *next;
for (k = i->start; k < bset_bkey_last(i); k = next) {
next = bkey_next(k);
printk(KERN_ERR "block %u key %zi/%u: ", set,
(uint64_t *) k - i->d, i->keys);
if (b->ops->key_dump)
b->ops->key_dump(b, k);
else
printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k));
if (next < bset_bkey_last(i) &&
bkey_cmp(k, b->ops->is_extents ?
&START_KEY(next) : next) > 0)
printk(KERN_ERR "Key skipped backwards\n");
}
}
void bch_dump_bucket(struct btree_keys *b)
{
unsigned i;
console_lock();
for (i = 0; i <= b->nsets; i++)
bch_dump_bset(b, b->set[i].data,
bset_sector_offset(b, b->set[i].data));
console_unlock();
}
int __bch_count_data(struct btree_keys *b)
{
unsigned ret = 0;
struct btree_iter iter;
struct bkey *k;
if (b->ops->is_extents)
for_each_key(b, k, &iter)
ret += KEY_SIZE(k);
return ret;
}
void __bch_check_keys(struct btree_keys *b, const char *fmt, ...)
{
va_list args;
struct bkey *k, *p = NULL;
struct btree_iter iter;
const char *err;
for_each_key(b, k, &iter) {
if (b->ops->is_extents) {
err = "Keys out of order";
if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
goto bug;
if (bch_ptr_invalid(b, k))
continue;
err = "Overlapping keys";
if (p && bkey_cmp(p, &START_KEY(k)) > 0)
goto bug;
} else {
if (bch_ptr_bad(b, k))
continue;
err = "Duplicate keys";
if (p && !bkey_cmp(p, k))
goto bug;
}
p = k;
}
#if 0
err = "Key larger than btree node key";
if (p && bkey_cmp(p, &b->key) > 0)
goto bug;
#endif
return;
bug:
bch_dump_bucket(b);
va_start(args, fmt);
vprintk(fmt, args);
va_end(args);
panic("bch_check_keys error: %s:\n", err);
}
static void bch_btree_iter_next_check(struct btree_iter *iter)
{
struct bkey *k = iter->data->k, *next = bkey_next(k);
if (next < iter->data->end &&
bkey_cmp(k, iter->b->ops->is_extents ?
&START_KEY(next) : next) > 0) {
bch_dump_bucket(iter->b);
panic("Key skipped backwards\n");
}
}
#else
static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
#endif
/* Keylists */
int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
int __bch_keylist_realloc(struct keylist *l, unsigned u64s)
{
size_t oldsize = bch_keylist_nkeys(l);
size_t newsize = oldsize + 2 + nptrs;
size_t newsize = oldsize + u64s;
uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
uint64_t *new_keys;
/* The journalling code doesn't handle the case where the keys to insert
* is bigger than an empty write: If we just return -ENOMEM here,
* bio_insert() and bio_invalidate() will insert the keys created so far
* and finish the rest when the keylist is empty.
*/
if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
return -ENOMEM;
newsize = roundup_pow_of_two(newsize);
if (newsize <= KEYLIST_INLINE ||
......@@ -71,136 +175,6 @@ void bch_keylist_pop_front(struct keylist *l)
bch_keylist_bytes(l));
}
/* Pointer validation */
static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
{
unsigned i;
for (i = 0; i < KEY_PTRS(k); i++)
if (ptr_available(c, k, i)) {
struct cache *ca = PTR_CACHE(c, k, i);
size_t bucket = PTR_BUCKET_NR(c, k, i);
size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
if (KEY_SIZE(k) + r > c->sb.bucket_size ||
bucket < ca->sb.first_bucket ||
bucket >= ca->sb.nbuckets)
return true;
}
return false;
}
bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
{
char buf[80];
if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
goto bad;
if (__ptr_invalid(c, k))
goto bad;
return false;
bad:
bch_bkey_to_text(buf, sizeof(buf), k);
cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
return true;
}
bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k)
{
char buf[80];
if (!KEY_SIZE(k))
return true;
if (KEY_SIZE(k) > KEY_OFFSET(k))
goto bad;
if (__ptr_invalid(c, k))
goto bad;
return false;
bad:
bch_bkey_to_text(buf, sizeof(buf), k);
cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
return true;
}
static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k,
unsigned ptr)
{
struct bucket *g = PTR_BUCKET(b->c, k, ptr);
char buf[80];
if (mutex_trylock(&b->c->bucket_lock)) {
if (b->level) {
if (KEY_DIRTY(k) ||
g->prio != BTREE_PRIO ||
(b->c->gc_mark_valid &&
GC_MARK(g) != GC_MARK_METADATA))
goto err;
} else {
if (g->prio == BTREE_PRIO)
goto err;
if (KEY_DIRTY(k) &&
b->c->gc_mark_valid &&
GC_MARK(g) != GC_MARK_DIRTY)
goto err;
}
mutex_unlock(&b->c->bucket_lock);
}
return false;
err:
mutex_unlock(&b->c->bucket_lock);
bch_bkey_to_text(buf, sizeof(buf), k);
btree_bug(b,
"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
return true;
}
bool bch_ptr_bad(struct btree *b, const struct bkey *k)
{
struct bucket *g;
unsigned i, stale;
if (!bkey_cmp(k, &ZERO_KEY) ||
!KEY_PTRS(k) ||
bch_ptr_invalid(b, k))
return true;
for (i = 0; i < KEY_PTRS(k); i++) {
if (!ptr_available(b->c, k, i))
return true;
g = PTR_BUCKET(b->c, k, i);
stale = ptr_stale(b->c, k, i);
btree_bug_on(stale > 96, b,
"key too stale: %i, need_gc %u",
stale, b->c->need_gc);
btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
b, "stale dirty pointer");
if (stale)
return true;
if (expensive_debug_checks(b->c) &&
ptr_bad_expensive_checks(b, k, i))
return true;
}
return false;
}
/* Key/pointer manipulation */
void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
......@@ -255,56 +229,138 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k)
return true;
}
static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
/* Auxiliary search trees */
/* 32 bits total: */
#define BKEY_MID_BITS 3
#define BKEY_EXPONENT_BITS 7
#define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS)
#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
struct bkey_float {
unsigned exponent:BKEY_EXPONENT_BITS;
unsigned m:BKEY_MID_BITS;
unsigned mantissa:BKEY_MANTISSA_BITS;
} __packed;
/*
* BSET_CACHELINE was originally intended to match the hardware cacheline size -
* it used to be 64, but I realized the lookup code would touch slightly less
* memory if it was 128.
*
* It definites the number of bytes (in struct bset) per struct bkey_float in
* the auxiliar search tree - when we're done searching the bset_float tree we
* have this many bytes left that we do a linear search over.
*
* Since (after level 5) every level of the bset_tree is on a new cacheline,
* we're touching one fewer cacheline in the bset tree in exchange for one more
* cacheline in the linear search - but the linear search might stop before it
* gets to the second cacheline.
*/
#define BSET_CACHELINE 128
/* Space required for the btree node keys */
static inline size_t btree_keys_bytes(struct btree_keys *b)
{
return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
~((uint64_t)1 << 63);
return PAGE_SIZE << b->page_order;
}
/* Tries to merge l and r: l should be lower than r
* Returns true if we were able to merge. If we did merge, l will be the merged
* key, r will be untouched.
*/
bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r)
static inline size_t btree_keys_cachelines(struct btree_keys *b)
{
unsigned i;
return btree_keys_bytes(b) / BSET_CACHELINE;
}
if (key_merging_disabled(b->c))
return false;
/* Space required for the auxiliary search trees */
static inline size_t bset_tree_bytes(struct btree_keys *b)
{
return btree_keys_cachelines(b) * sizeof(struct bkey_float);
}
if (KEY_PTRS(l) != KEY_PTRS(r) ||
KEY_DIRTY(l) != KEY_DIRTY(r) ||
bkey_cmp(l, &START_KEY(r)))
return false;
/* Space required for the prev pointers */
static inline size_t bset_prev_bytes(struct btree_keys *b)
{
return btree_keys_cachelines(b) * sizeof(uint8_t);
}
for (i = 0; i < KEY_PTRS(l); i++)
if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
return false;
/* Memory allocation */
/* Keys with no pointers aren't restricted to one bucket and could
* overflow KEY_SIZE
*/
if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
SET_KEY_SIZE(l, USHRT_MAX);
void bch_btree_keys_free(struct btree_keys *b)
{
struct bset_tree *t = b->set;
bch_cut_front(l, r);
return false;
}
if (bset_prev_bytes(b) < PAGE_SIZE)
kfree(t->prev);
else
free_pages((unsigned long) t->prev,
get_order(bset_prev_bytes(b)));
if (KEY_CSUM(l)) {
if (KEY_CSUM(r))
l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
else
SET_KEY_CSUM(l, 0);
}
if (bset_tree_bytes(b) < PAGE_SIZE)
kfree(t->tree);
else
free_pages((unsigned long) t->tree,
get_order(bset_tree_bytes(b)));
SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
free_pages((unsigned long) t->data, b->page_order);
return true;
t->prev = NULL;
t->tree = NULL;
t->data = NULL;
}
EXPORT_SYMBOL(bch_btree_keys_free);
int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp)
{
struct bset_tree *t = b->set;
BUG_ON(t->data);
b->page_order = page_order;
t->data = (void *) __get_free_pages(gfp, b->page_order);
if (!t->data)
goto err;
t->tree = bset_tree_bytes(b) < PAGE_SIZE
? kmalloc(bset_tree_bytes(b), gfp)
: (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
if (!t->tree)
goto err;
t->prev = bset_prev_bytes(b) < PAGE_SIZE
? kmalloc(bset_prev_bytes(b), gfp)
: (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
if (!t->prev)
goto err;
return 0;
err:
bch_btree_keys_free(b);
return -ENOMEM;
}
EXPORT_SYMBOL(bch_btree_keys_alloc);
void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
bool *expensive_debug_checks)
{
unsigned i;
b->ops = ops;
b->expensive_debug_checks = expensive_debug_checks;
b->nsets = 0;
b->last_set_unwritten = 0;
/* XXX: shouldn't be needed */
for (i = 0; i < MAX_BSETS; i++)
b->set[i].size = 0;
/*
* Second loop starts at 1 because b->keys[0]->data is the memory we
* allocated
*/
for (i = 1; i < MAX_BSETS; i++)
b->set[i].data = NULL;
}
EXPORT_SYMBOL(bch_btree_keys_init);
/* Binary tree stuff for auxiliary search trees */
......@@ -455,9 +511,11 @@ static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
return ((void *) k - (void *) t->data) / BSET_CACHELINE;
}
static unsigned bkey_to_cacheline_offset(struct bkey *k)
static unsigned bkey_to_cacheline_offset(struct bset_tree *t,
unsigned cacheline,
struct bkey *k)
{
return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t);
return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0);
}
static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
......@@ -504,7 +562,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
: tree_to_prev_bkey(t, j >> ffs(j));
struct bkey *r = is_power_of_2(j + 1)
? node(t->data, t->data->keys - bkey_u64s(&t->end))
? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end))
: tree_to_bkey(t, j >> (ffz(j) + 1));
BUG_ON(m < l || m > r);
......@@ -528,9 +586,9 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
f->exponent = 127;
}
static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t)
{
if (t != b->sets) {
if (t != b->set) {
unsigned j = roundup(t[-1].size,
64 / sizeof(struct bkey_float));
......@@ -538,33 +596,54 @@ static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
t->prev = t[-1].prev + j;
}
while (t < b->sets + MAX_BSETS)
while (t < b->set + MAX_BSETS)
t++->size = 0;
}
static void bset_build_unwritten_tree(struct btree *b)
static void bch_bset_build_unwritten_tree(struct btree_keys *b)
{
struct bset_tree *t = b->sets + b->nsets;
struct bset_tree *t = bset_tree_last(b);
BUG_ON(b->last_set_unwritten);
b->last_set_unwritten = 1;
bset_alloc_tree(b, t);
if (t->tree != b->sets->tree + bset_tree_space(b)) {
t->prev[0] = bkey_to_cacheline_offset(t->data->start);
if (t->tree != b->set->tree + btree_keys_cachelines(b)) {
t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start);
t->size = 1;
}
}
static void bset_build_written_tree(struct btree *b)
void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
{
if (i != b->set->data) {
b->set[++b->nsets].data = i;
i->seq = b->set->data->seq;
} else
get_random_bytes(&i->seq, sizeof(uint64_t));
i->magic = magic;
i->version = 0;
i->keys = 0;
bch_bset_build_unwritten_tree(b);
}
EXPORT_SYMBOL(bch_bset_init_next);
void bch_bset_build_written_tree(struct btree_keys *b)
{
struct bset_tree *t = b->sets + b->nsets;
struct bkey *k = t->data->start;
struct bset_tree *t = bset_tree_last(b);
struct bkey *prev = NULL, *k = t->data->start;
unsigned j, cacheline = 1;
b->last_set_unwritten = 0;
bset_alloc_tree(b, t);
t->size = min_t(unsigned,
bkey_to_cacheline(t, end(t->data)),
b->sets->tree + bset_tree_space(b) - t->tree);
bkey_to_cacheline(t, bset_bkey_last(t->data)),
b->set->tree + btree_keys_cachelines(b) - t->tree);
if (t->size < 2) {
t->size = 0;
......@@ -577,16 +656,14 @@ static void bset_build_written_tree(struct btree *b)
for (j = inorder_next(0, t->size);
j;
j = inorder_next(j, t->size)) {
while (bkey_to_cacheline(t, k) != cacheline)
k = bkey_next(k);
while (bkey_to_cacheline(t, k) < cacheline)
prev = k, k = bkey_next(k);
t->prev[j] = bkey_u64s(k);
k = bkey_next(k);
cacheline++;
t->tree[j].m = bkey_to_cacheline_offset(k);
t->prev[j] = bkey_u64s(prev);
t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k);
}
while (bkey_next(k) != end(t->data))
while (bkey_next(k) != bset_bkey_last(t->data))
k = bkey_next(k);
t->end = *k;
......@@ -597,14 +674,17 @@ static void bset_build_written_tree(struct btree *b)
j = inorder_next(j, t->size))
make_bfloat(t, j);
}
EXPORT_SYMBOL(bch_bset_build_written_tree);
void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k)
/* Insert */
void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k)
{
struct bset_tree *t;
unsigned inorder, j = 1;
for (t = b->sets; t <= &b->sets[b->nsets]; t++)
if (k < end(t->data))
for (t = b->set; t <= bset_tree_last(b); t++)
if (k < bset_bkey_last(t->data))
goto found_set;
BUG();
......@@ -617,7 +697,7 @@ void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k)
if (k == t->data->start)
goto fix_left;
if (bkey_next(k) == end(t->data)) {
if (bkey_next(k) == bset_bkey_last(t->data)) {
t->end = *k;
goto fix_right;
}
......@@ -642,10 +722,12 @@ fix_right: do {
j = j * 2 + 1;
} while (j < t->size);
}
EXPORT_SYMBOL(bch_bset_fix_invalidated_key);
void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
static void bch_bset_fix_lookup_table(struct btree_keys *b,
struct bset_tree *t,
struct bkey *k)
{
struct bset_tree *t = &b->sets[b->nsets];
unsigned shift = bkey_u64s(k);
unsigned j = bkey_to_cacheline(t, k);
......@@ -657,8 +739,8 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
* lookup table for the first key that is strictly greater than k:
* it's either k's cacheline or the next one
*/
if (j < t->size &&
table_to_bkey(t, j) <= k)
while (j < t->size &&
table_to_bkey(t, j) <= k)
j++;
/* Adjust all the lookup table entries, and find a new key for any that
......@@ -673,54 +755,124 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
while (k < cacheline_to_bkey(t, j, 0))
k = bkey_next(k);
t->prev[j] = bkey_to_cacheline_offset(k);
t->prev[j] = bkey_to_cacheline_offset(t, j, k);
}
}
if (t->size == b->sets->tree + bset_tree_space(b) - t->tree)
if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree)
return;
/* Possibly add a new entry to the end of the lookup table */
for (k = table_to_bkey(t, t->size - 1);
k != end(t->data);
k != bset_bkey_last(t->data);
k = bkey_next(k))
if (t->size == bkey_to_cacheline(t, k)) {
t->prev[t->size] = bkey_to_cacheline_offset(k);
t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k);
t->size++;
}
}
void bch_bset_init_next(struct btree *b)
/*
* Tries to merge l and r: l should be lower than r
* Returns true if we were able to merge. If we did merge, l will be the merged
* key, r will be untouched.
*/
bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
{
struct bset *i = write_block(b);
if (!b->ops->key_merge)
return false;
if (i != b->sets[0].data) {
b->sets[++b->nsets].data = i;
i->seq = b->sets[0].data->seq;
} else
get_random_bytes(&i->seq, sizeof(uint64_t));
/*
* Generic header checks
* Assumes left and right are in order
* Left and right must be exactly aligned
*/
if (!bch_bkey_equal_header(l, r) ||
bkey_cmp(l, &START_KEY(r)))
return false;
i->magic = bset_magic(&b->c->sb);
i->version = 0;
i->keys = 0;
return b->ops->key_merge(b, l, r);
}
EXPORT_SYMBOL(bch_bkey_try_merge);
bset_build_unwritten_tree(b);
void bch_bset_insert(struct btree_keys *b, struct bkey *where,
struct bkey *insert)
{
struct bset_tree *t = bset_tree_last(b);
BUG_ON(!b->last_set_unwritten);
BUG_ON(bset_byte_offset(b, t->data) +
__set_bytes(t->data, t->data->keys + bkey_u64s(insert)) >
PAGE_SIZE << b->page_order);
memmove((uint64_t *) where + bkey_u64s(insert),
where,
(void *) bset_bkey_last(t->data) - (void *) where);
t->data->keys += bkey_u64s(insert);
bkey_copy(where, insert);
bch_bset_fix_lookup_table(b, t, where);
}
EXPORT_SYMBOL(bch_bset_insert);
unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
struct bkey *replace_key)
{
unsigned status = BTREE_INSERT_STATUS_NO_INSERT;
struct bset *i = bset_tree_last(b)->data;
struct bkey *m, *prev = NULL;
struct btree_iter iter;
BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
m = bch_btree_iter_init(b, &iter, b->ops->is_extents
? PRECEDING_KEY(&START_KEY(k))
: PRECEDING_KEY(k));
if (b->ops->insert_fixup(b, k, &iter, replace_key))
return status;
status = BTREE_INSERT_STATUS_INSERT;
while (m != bset_bkey_last(i) &&
bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0)
prev = m, m = bkey_next(m);
/* prev is in the tree, if we merge we're done */
status = BTREE_INSERT_STATUS_BACK_MERGE;
if (prev &&
bch_bkey_try_merge(b, prev, k))
goto merged;
#if 0
status = BTREE_INSERT_STATUS_OVERWROTE;
if (m != bset_bkey_last(i) &&
KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
goto copy;
#endif
status = BTREE_INSERT_STATUS_FRONT_MERGE;
if (m != bset_bkey_last(i) &&
bch_bkey_try_merge(b, k, m))
goto copy;
bch_bset_insert(b, m, k);
copy: bkey_copy(m, k);
merged:
return status;
}
EXPORT_SYMBOL(bch_btree_insert_key);
/* Lookup */
struct bset_search_iter {
struct bkey *l, *r;
};
static struct bset_search_iter bset_search_write_set(struct btree *b,
struct bset_tree *t,
static struct bset_search_iter bset_search_write_set(struct bset_tree *t,
const struct bkey *search)
{
unsigned li = 0, ri = t->size;
BUG_ON(!b->nsets &&
t->size < bkey_to_cacheline(t, end(t->data)));
while (li + 1 != ri) {
unsigned m = (li + ri) >> 1;
......@@ -732,12 +884,11 @@ static struct bset_search_iter bset_search_write_set(struct btree *b,
return (struct bset_search_iter) {
table_to_bkey(t, li),
ri < t->size ? table_to_bkey(t, ri) : end(t->data)
ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data)
};
}
static struct bset_search_iter bset_search_tree(struct btree *b,
struct bset_tree *t,
static struct bset_search_iter bset_search_tree(struct bset_tree *t,
const struct bkey *search)
{
struct bkey *l, *r;
......@@ -784,7 +935,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b,
f = &t->tree[inorder_next(j, t->size)];
r = cacheline_to_bkey(t, inorder, f->m);
} else
r = end(t->data);
r = bset_bkey_last(t->data);
} else {
r = cacheline_to_bkey(t, inorder, f->m);
......@@ -798,7 +949,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b,
return (struct bset_search_iter) {l, r};
}
struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
const struct bkey *search)
{
struct bset_search_iter i;
......@@ -820,7 +971,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
if (unlikely(!t->size)) {
i.l = t->data->start;
i.r = end(t->data);
i.r = bset_bkey_last(t->data);
} else if (bset_written(b, t)) {
/*
* Each node in the auxiliary search tree covers a certain range
......@@ -830,23 +981,27 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
*/
if (unlikely(bkey_cmp(search, &t->end) >= 0))
return end(t->data);
return bset_bkey_last(t->data);
if (unlikely(bkey_cmp(search, t->data->start) < 0))
return t->data->start;
i = bset_search_tree(b, t, search);
} else
i = bset_search_write_set(b, t, search);
i = bset_search_tree(t, search);
} else {
BUG_ON(!b->nsets &&
t->size < bkey_to_cacheline(t, bset_bkey_last(t->data)));
if (expensive_debug_checks(b->c)) {
i = bset_search_write_set(t, search);
}
if (btree_keys_expensive_checks(b)) {
BUG_ON(bset_written(b, t) &&
i.l != t->data->start &&
bkey_cmp(tree_to_prev_bkey(t,
inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
search) > 0);
BUG_ON(i.r != end(t->data) &&
BUG_ON(i.r != bset_bkey_last(t->data) &&
bkey_cmp(i.r, search) <= 0);
}
......@@ -856,22 +1011,17 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
return i.l;
}
EXPORT_SYMBOL(__bch_bset_search);
/* Btree iterator */
/*
* Returns true if l > r - unless l == r, in which case returns true if l is
* older than r.
*
* Necessary for btree_sort_fixup() - if there are multiple keys that compare
* equal in different sets, we have to process them newest to oldest.
*/
typedef bool (btree_iter_cmp_fn)(struct btree_iter_set,
struct btree_iter_set);
static inline bool btree_iter_cmp(struct btree_iter_set l,
struct btree_iter_set r)
{
int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
return c ? c > 0 : l.k < r.k;
return bkey_cmp(l.k, r.k) > 0;
}
static inline bool btree_iter_end(struct btree_iter *iter)
......@@ -888,8 +1038,10 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
btree_iter_cmp));
}
struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter,
struct bkey *search, struct bset_tree *start)
static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
struct btree_iter *iter,
struct bkey *search,
struct bset_tree *start)
{
struct bkey *ret = NULL;
iter->size = ARRAY_SIZE(iter->data);
......@@ -899,15 +1051,24 @@ struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter,
iter->b = b;
#endif
for (; start <= &b->sets[b->nsets]; start++) {
for (; start <= bset_tree_last(b); start++) {
ret = bch_bset_search(b, start, search);
bch_btree_iter_push(iter, ret, end(start->data));
bch_btree_iter_push(iter, ret, bset_bkey_last(start->data));
}
return ret;
}
struct bkey *bch_btree_iter_next(struct btree_iter *iter)
struct bkey *bch_btree_iter_init(struct btree_keys *b,
struct btree_iter *iter,
struct bkey *search)
{
return __bch_btree_iter_init(b, iter, search, b->set);
}
EXPORT_SYMBOL(bch_btree_iter_init);
static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
btree_iter_cmp_fn *cmp)
{
struct btree_iter_set unused;
struct bkey *ret = NULL;
......@@ -924,16 +1085,23 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter)
}
if (iter->data->k == iter->data->end)
heap_pop(iter, unused, btree_iter_cmp);
heap_pop(iter, unused, cmp);
else
heap_sift(iter, 0, btree_iter_cmp);
heap_sift(iter, 0, cmp);
}
return ret;
}
struct bkey *bch_btree_iter_next(struct btree_iter *iter)
{
return __bch_btree_iter_next(iter, btree_iter_cmp);
}
EXPORT_SYMBOL(bch_btree_iter_next);
struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
struct btree *b, ptr_filter_fn fn)
struct btree_keys *b, ptr_filter_fn fn)
{
struct bkey *ret;
......@@ -946,70 +1114,58 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
/* Mergesort */
static void sort_key_next(struct btree_iter *iter,
struct btree_iter_set *i)
void bch_bset_sort_state_free(struct bset_sort_state *state)
{
i->k = bkey_next(i->k);
if (i->k == i->end)
*i = iter->data[--iter->used];
if (state->pool)
mempool_destroy(state->pool);
}
static void btree_sort_fixup(struct btree_iter *iter)
int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
{
while (iter->used > 1) {
struct btree_iter_set *top = iter->data, *i = top + 1;
spin_lock_init(&state->time.lock);
if (iter->used > 2 &&
btree_iter_cmp(i[0], i[1]))
i++;
state->page_order = page_order;
state->crit_factor = int_sqrt(1 << page_order);
if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
break;
if (!KEY_SIZE(i->k)) {
sort_key_next(iter, i);
heap_sift(iter, i - top, btree_iter_cmp);
continue;
}
if (top->k > i->k) {
if (bkey_cmp(top->k, i->k) >= 0)
sort_key_next(iter, i);
else
bch_cut_front(top->k, i->k);
state->pool = mempool_create_page_pool(1, page_order);
if (!state->pool)
return -ENOMEM;
heap_sift(iter, i - top, btree_iter_cmp);
} else {
/* can't happen because of comparison func */
BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
bch_cut_back(&START_KEY(i->k), top->k);
}
}
return 0;
}
EXPORT_SYMBOL(bch_bset_sort_state_init);
static void btree_mergesort(struct btree *b, struct bset *out,
static void btree_mergesort(struct btree_keys *b, struct bset *out,
struct btree_iter *iter,
bool fixup, bool remove_stale)
{
int i;
struct bkey *k, *last = NULL;
bool (*bad)(struct btree *, const struct bkey *) = remove_stale
BKEY_PADDED(k) tmp;
bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale
? bch_ptr_bad
: bch_ptr_invalid;
/* Heapify the iterator, using our comparison function */
for (i = iter->used / 2 - 1; i >= 0; --i)
heap_sift(iter, i, b->ops->sort_cmp);
while (!btree_iter_end(iter)) {
if (fixup && !b->level)
btree_sort_fixup(iter);
if (b->ops->sort_fixup && fixup)
k = b->ops->sort_fixup(iter, &tmp.k);
else
k = NULL;
if (!k)
k = __bch_btree_iter_next(iter, b->ops->sort_cmp);
k = bch_btree_iter_next(iter);
if (bad(b, k))
continue;
if (!last) {
last = out->start;
bkey_copy(last, k);
} else if (b->level ||
!bch_bkey_try_merge(b, last, k)) {
} else if (!bch_bkey_try_merge(b, last, k)) {
last = bkey_next(last);
bkey_copy(last, k);
}
......@@ -1020,27 +1176,27 @@ static void btree_mergesort(struct btree *b, struct bset *out,
pr_debug("sorted %i keys", out->keys);
}
static void __btree_sort(struct btree *b, struct btree_iter *iter,
unsigned start, unsigned order, bool fixup)
static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
unsigned start, unsigned order, bool fixup,
struct bset_sort_state *state)
{
uint64_t start_time;
bool remove_stale = !b->written;
bool used_mempool = false;
struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
order);
if (!out) {
mutex_lock(&b->c->sort_lock);
out = b->c->sort;
order = ilog2(bucket_pages(b->c));
BUG_ON(order > state->page_order);
out = page_address(mempool_alloc(state->pool, GFP_NOIO));
used_mempool = true;
order = state->page_order;
}
start_time = local_clock();
btree_mergesort(b, out, iter, fixup, remove_stale);
btree_mergesort(b, out, iter, fixup, false);
b->nsets = start;
if (!fixup && !start && b->written)
bch_btree_verify(b, out);
if (!start && order == b->page_order) {
/*
* Our temporary buffer is the same size as the btree node's
......@@ -1048,84 +1204,76 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter,
* memcpy()
*/
out->magic = bset_magic(&b->c->sb);
out->seq = b->sets[0].data->seq;
out->version = b->sets[0].data->version;
swap(out, b->sets[0].data);
if (b->c->sort == b->sets[0].data)
b->c->sort = out;
out->magic = b->set->data->magic;
out->seq = b->set->data->seq;
out->version = b->set->data->version;
swap(out, b->set->data);
} else {
b->sets[start].data->keys = out->keys;
memcpy(b->sets[start].data->start, out->start,
(void *) end(out) - (void *) out->start);
b->set[start].data->keys = out->keys;
memcpy(b->set[start].data->start, out->start,
(void *) bset_bkey_last(out) - (void *) out->start);
}
if (out == b->c->sort)
mutex_unlock(&b->c->sort_lock);
if (used_mempool)
mempool_free(virt_to_page(out), state->pool);
else
free_pages((unsigned long) out, order);
if (b->written)
bset_build_written_tree(b);
bch_bset_build_written_tree(b);
if (!start)
bch_time_stats_update(&b->c->sort_time, start_time);
bch_time_stats_update(&state->time, start_time);
}
void bch_btree_sort_partial(struct btree *b, unsigned start)
void bch_btree_sort_partial(struct btree_keys *b, unsigned start,
struct bset_sort_state *state)
{
size_t order = b->page_order, keys = 0;
struct btree_iter iter;
int oldsize = bch_count_data(b);
__bch_btree_iter_init(b, &iter, NULL, &b->sets[start]);
BUG_ON(b->sets[b->nsets].data == write_block(b) &&
(b->sets[b->nsets].size || b->nsets));
__bch_btree_iter_init(b, &iter, NULL, &b->set[start]);
if (start) {
unsigned i;
for (i = start; i <= b->nsets; i++)
keys += b->sets[i].data->keys;
keys += b->set[i].data->keys;
order = roundup_pow_of_two(__set_bytes(b->sets->data,
keys)) / PAGE_SIZE;
if (order)
order = ilog2(order);
order = get_order(__set_bytes(b->set->data, keys));
}
__btree_sort(b, &iter, start, order, false);
__btree_sort(b, &iter, start, order, false, state);
EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize);
EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
}
EXPORT_SYMBOL(bch_btree_sort_partial);
void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter)
void bch_btree_sort_and_fix_extents(struct btree_keys *b,
struct btree_iter *iter,
struct bset_sort_state *state)
{
BUG_ON(!b->written);
__btree_sort(b, iter, 0, b->page_order, true);
__btree_sort(b, iter, 0, b->page_order, true, state);
}
void bch_btree_sort_into(struct btree *b, struct btree *new)
void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
struct bset_sort_state *state)
{
uint64_t start_time = local_clock();
struct btree_iter iter;
bch_btree_iter_init(b, &iter, NULL);
btree_mergesort(b, new->sets->data, &iter, false, true);
btree_mergesort(b, new->set->data, &iter, false, true);
bch_time_stats_update(&b->c->sort_time, start_time);
bch_time_stats_update(&state->time, start_time);
bkey_copy_key(&new->key, &b->key);
new->sets->size = 0;
new->set->size = 0; // XXX: why?
}
#define SORT_CRIT (4096 / sizeof(uint64_t))
void bch_btree_sort_lazy(struct btree *b)
void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state)
{
unsigned crit = SORT_CRIT;
int i;
......@@ -1134,50 +1282,32 @@ void bch_btree_sort_lazy(struct btree *b)
if (!b->nsets)
goto out;
/* If not a leaf node, always sort */
if (b->level) {
bch_btree_sort(b);
return;
}
for (i = b->nsets - 1; i >= 0; --i) {
crit *= b->c->sort_crit_factor;
crit *= state->crit_factor;
if (b->sets[i].data->keys < crit) {
bch_btree_sort_partial(b, i);
if (b->set[i].data->keys < crit) {
bch_btree_sort_partial(b, i, state);
return;
}
}
/* Sort if we'd overflow */
if (b->nsets + 1 == MAX_BSETS) {
bch_btree_sort(b);
bch_btree_sort(b, state);
return;
}
out:
bset_build_written_tree(b);
bch_bset_build_written_tree(b);
}
EXPORT_SYMBOL(bch_btree_sort_lazy);
/* Sysfs stuff */
struct bset_stats {
struct btree_op op;
size_t nodes;
size_t sets_written, sets_unwritten;
size_t bytes_written, bytes_unwritten;
size_t floats, failed;
};
static int btree_bset_stats(struct btree_op *op, struct btree *b)
void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats)
{
struct bset_stats *stats = container_of(op, struct bset_stats, op);
unsigned i;
stats->nodes++;
for (i = 0; i <= b->nsets; i++) {
struct bset_tree *t = &b->sets[i];
struct bset_tree *t = &b->set[i];
size_t bytes = t->data->keys * sizeof(uint64_t);
size_t j;
......@@ -1195,32 +1325,4 @@ static int btree_bset_stats(struct btree_op *op, struct btree *b)
stats->bytes_unwritten += bytes;
}
}
return MAP_CONTINUE;
}
int bch_bset_print_stats(struct cache_set *c, char *buf)
{
struct bset_stats t;
int ret;
memset(&t, 0, sizeof(struct bset_stats));
bch_btree_op_init(&t.op, -1);
ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats);
if (ret < 0)
return ret;
return snprintf(buf, PAGE_SIZE,
"btree nodes: %zu\n"
"written sets: %zu\n"
"unwritten sets: %zu\n"
"written key bytes: %zu\n"
"unwritten key bytes: %zu\n"
"floats: %zu\n"
"failed: %zu\n",
t.nodes,
t.sets_written, t.sets_unwritten,
t.bytes_written, t.bytes_unwritten,
t.floats, t.failed);
}
#ifndef _BCACHE_BSET_H
#define _BCACHE_BSET_H
#include <linux/slab.h>
#include <linux/bcache.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include "util.h" /* for time_stats */
/*
* BKEYS:
......@@ -142,20 +146,13 @@
* first key in that range of bytes again.
*/
/* Btree key comparison/iteration */
struct btree_keys;
struct btree_iter;
struct btree_iter_set;
struct bkey_float;
#define MAX_BSETS 4U
struct btree_iter {
size_t size, used;
#ifdef CONFIG_BCACHE_DEBUG
struct btree *b;
#endif
struct btree_iter_set {
struct bkey *k, *end;
} data[MAX_BSETS];
};
struct bset_tree {
/*
* We construct a binary tree in an array as if the array
......@@ -165,14 +162,14 @@ struct bset_tree {
*/
/* size of the binary tree and prev array */
unsigned size;
unsigned size;
/* function of size - precalculated for to_inorder() */
unsigned extra;
unsigned extra;
/* copy of the last key in the set */
struct bkey end;
struct bkey_float *tree;
struct bkey end;
struct bkey_float *tree;
/*
* The nodes in the bset tree point to specific keys - this
......@@ -182,12 +179,219 @@ struct bset_tree {
* to keep bkey_float to 4 bytes and prev isn't used in the fast
* path.
*/
uint8_t *prev;
uint8_t *prev;
/* The actual btree node, with pointers to each sorted set */
struct bset *data;
struct bset *data;
};
struct btree_keys_ops {
bool (*sort_cmp)(struct btree_iter_set,
struct btree_iter_set);
struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *);
bool (*insert_fixup)(struct btree_keys *, struct bkey *,
struct btree_iter *, struct bkey *);
bool (*key_invalid)(struct btree_keys *,
const struct bkey *);
bool (*key_bad)(struct btree_keys *, const struct bkey *);
bool (*key_merge)(struct btree_keys *,
struct bkey *, struct bkey *);
void (*key_to_text)(char *, size_t, const struct bkey *);
void (*key_dump)(struct btree_keys *, const struct bkey *);
/*
* Only used for deciding whether to use START_KEY(k) or just the key
* itself in a couple places
*/
bool is_extents;
};
struct btree_keys {
const struct btree_keys_ops *ops;
uint8_t page_order;
uint8_t nsets;
unsigned last_set_unwritten:1;
bool *expensive_debug_checks;
/*
* Sets of sorted keys - the real btree node - plus a binary search tree
*
* set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
* to the memory we have allocated for this btree node. Additionally,
* set[0]->data points to the entire btree node as it exists on disk.
*/
struct bset_tree set[MAX_BSETS];
};
static inline struct bset_tree *bset_tree_last(struct btree_keys *b)
{
return b->set + b->nsets;
}
static inline bool bset_written(struct btree_keys *b, struct bset_tree *t)
{
return t <= b->set + b->nsets - b->last_set_unwritten;
}
static inline bool bkey_written(struct btree_keys *b, struct bkey *k)
{
return !b->last_set_unwritten || k < b->set[b->nsets].data->start;
}
static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i)
{
return ((size_t) i) - ((size_t) b->set->data);
}
static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i)
{
return bset_byte_offset(b, i) >> 9;
}
#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t))
#define set_bytes(i) __set_bytes(i, i->keys)
#define __set_blocks(i, k, block_bytes) \
DIV_ROUND_UP(__set_bytes(i, k), block_bytes)
#define set_blocks(i, block_bytes) \
__set_blocks(i, (i)->keys, block_bytes)
static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b)
{
struct bset_tree *t = bset_tree_last(b);
BUG_ON((PAGE_SIZE << b->page_order) <
(bset_byte_offset(b, t->data) + set_bytes(t->data)));
if (!b->last_set_unwritten)
return 0;
return ((PAGE_SIZE << b->page_order) -
(bset_byte_offset(b, t->data) + set_bytes(t->data))) /
sizeof(u64);
}
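Illustrative arithmetic (numbers assumed, not from this patch): in a 4 KiB node (page_order 0) whose unwritten last bset starts 3008 bytes into the node and currently holds 64 bytes of keys, this returns (4096 - (3008 + 64)) / sizeof(u64) = 128 insertable u64s; once the last set has been written it returns 0.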
static inline struct bset *bset_next_set(struct btree_keys *b,
unsigned block_bytes)
{
struct bset *i = bset_tree_last(b)->data;
return ((void *) i) + roundup(set_bytes(i), block_bytes);
}
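Similarly for bset_next_set(): with 512-byte blocks, a last bset occupying 700 bytes places the next bset at roundup(700, 512) = 1024 bytes past the start of the current one (figures illustrative).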
void bch_btree_keys_free(struct btree_keys *);
int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t);
void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *,
bool *);
void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t);
void bch_bset_build_written_tree(struct btree_keys *);
void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *);
bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *);
void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *);
unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *,
struct bkey *);
enum {
BTREE_INSERT_STATUS_NO_INSERT = 0,
BTREE_INSERT_STATUS_INSERT,
BTREE_INSERT_STATUS_BACK_MERGE,
BTREE_INSERT_STATUS_OVERWROTE,
BTREE_INSERT_STATUS_FRONT_MERGE,
};
/* Btree key iteration */
struct btree_iter {
size_t size, used;
#ifdef CONFIG_BCACHE_DEBUG
struct btree_keys *b;
#endif
struct btree_iter_set {
struct bkey *k, *end;
} data[MAX_BSETS];
};
typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *);
struct bkey *bch_btree_iter_next(struct btree_iter *);
struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
struct btree_keys *, ptr_filter_fn);
void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *,
struct bkey *);
struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *,
const struct bkey *);
/*
* Returns the first key that is strictly greater than search
*/
static inline struct bkey *bch_bset_search(struct btree_keys *b,
struct bset_tree *t,
const struct bkey *search)
{
return search ? __bch_bset_search(b, t, search) : t->data->start;
}
#define for_each_key_filter(b, k, iter, filter) \
for (bch_btree_iter_init((b), (iter), NULL); \
((k) = bch_btree_iter_next_filter((iter), (b), filter));)
#define for_each_key(b, k, iter) \
for (bch_btree_iter_init((b), (iter), NULL); \
((k) = bch_btree_iter_next(iter));)
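A minimal usage sketch of the iterator macros above (illustrative only; b is a struct btree_keys * and only helpers declared in this header are used):

	struct btree_iter iter;
	struct bkey *k;
	unsigned nr = 0;

	/* visit every key in the node, skipping keys whose pointers are bad */
	for_each_key_filter(b, k, &iter, bch_ptr_bad)
		nr++;
	pr_debug("%u good keys", nr);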
/* Sorting */
struct bset_sort_state {
mempool_t *pool;
unsigned page_order;
unsigned crit_factor;
struct time_stats time;
};
void bch_bset_sort_state_free(struct bset_sort_state *);
int bch_bset_sort_state_init(struct bset_sort_state *, unsigned);
void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *);
void bch_btree_sort_into(struct btree_keys *, struct btree_keys *,
struct bset_sort_state *);
void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *,
struct bset_sort_state *);
void bch_btree_sort_partial(struct btree_keys *, unsigned,
struct bset_sort_state *);
static inline void bch_btree_sort(struct btree_keys *b,
struct bset_sort_state *state)
{
bch_btree_sort_partial(b, 0, state);
}
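The sort entry points above all take this shared, per-cache-set state; a minimal sketch of the expected wiring (mirroring the c->sort member used elsewhere in this series):

	/* once, when the cache set is brought up */
	if (bch_bset_sort_state_init(&c->sort, ilog2(bucket_pages(c))))
		return -ENOMEM;

	/* whenever a node has accumulated extra sets */
	bch_btree_sort_lazy(&b->keys, &c->sort);

	/* once, at cache set teardown */
	bch_bset_sort_state_free(&c->sort);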
struct bset_stats {
size_t sets_written, sets_unwritten;
size_t bytes_written, bytes_unwritten;
size_t floats, failed;
};
void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *);
/* Bkey utility code */
#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys)
static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx)
{
return bkey_idx(i->start, idx);
}
static inline void bkey_init(struct bkey *k)
{
*k = ZERO_KEY;
}
static __always_inline int64_t bkey_cmp(const struct bkey *l,
const struct bkey *r)
{
......@@ -196,6 +400,62 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l,
: (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
}
void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
unsigned);
bool __bch_cut_front(const struct bkey *, struct bkey *);
bool __bch_cut_back(const struct bkey *, struct bkey *);
static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
{
BUG_ON(bkey_cmp(where, k) > 0);
return __bch_cut_front(where, k);
}
static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
{
BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
return __bch_cut_back(where, k);
}
#define PRECEDING_KEY(_k) \
({ \
struct bkey *_ret = NULL; \
\
if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
_ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
\
if (!_ret->low) \
_ret->high--; \
_ret->low--; \
} \
\
_ret; \
})
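Roughly, PRECEDING_KEY(k) builds on the stack the key that compares immediately before k: same inode, offset minus one, size zero, with the decrement done on the (high, low) word pair as one wide integer so that offset 0 borrows into the previous inode; the all-zero key yields NULL since nothing precedes it. The insert path uses it to seed its iterator just before the start of the key being inserted, as in:

	m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));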
static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
{
return b->ops->key_invalid(b, k);
}
static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k)
{
return b->ops->key_bad(b, k);
}
static inline void bch_bkey_to_text(struct btree_keys *b, char *buf,
size_t size, const struct bkey *k)
{
return b->ops->key_to_text(buf, size, k);
}
static inline bool bch_bkey_equal_header(const struct bkey *l,
const struct bkey *r)
{
return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
KEY_PTRS(l) == KEY_PTRS(r) &&
KEY_CSUM(l) == KEY_CSUM(r));
}
/* Keylists */
struct keylist {
......@@ -257,136 +517,44 @@ static inline size_t bch_keylist_bytes(struct keylist *l)
struct bkey *bch_keylist_pop(struct keylist *);
void bch_keylist_pop_front(struct keylist *);
int bch_keylist_realloc(struct keylist *, int, struct cache_set *);
void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
unsigned);
bool __bch_cut_front(const struct bkey *, struct bkey *);
bool __bch_cut_back(const struct bkey *, struct bkey *);
int __bch_keylist_realloc(struct keylist *, unsigned);
static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
{
BUG_ON(bkey_cmp(where, k) > 0);
return __bch_cut_front(where, k);
}
/* Debug stuff */
static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
{
BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
return __bch_cut_back(where, k);
}
const char *bch_ptr_status(struct cache_set *, const struct bkey *);
bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *);
bool bch_ptr_bad(struct btree *, const struct bkey *);
static inline uint8_t gen_after(uint8_t a, uint8_t b)
{
uint8_t r = a - b;
return r > 128U ? 0 : r;
}
static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
unsigned i)
{
return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
}
static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
unsigned i)
{
return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
}
typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
struct bkey *bch_btree_iter_next(struct btree_iter *);
struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
struct btree *, ptr_filter_fn);
void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *,
struct bkey *, struct bset_tree *);
/* 32 bits total: */
#define BKEY_MID_BITS 3
#define BKEY_EXPONENT_BITS 7
#define BKEY_MANTISSA_BITS 22
#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
struct bkey_float {
unsigned exponent:BKEY_EXPONENT_BITS;
unsigned m:BKEY_MID_BITS;
unsigned mantissa:BKEY_MANTISSA_BITS;
} __packed;
/*
* BSET_CACHELINE was originally intended to match the hardware cacheline size -
* it used to be 64, but I realized the lookup code would touch slightly less
* memory if it was 128.
*
* It defines the number of bytes (in struct bset) per struct bkey_float in
* the auxiliary search tree - when we're done searching the bset_float tree we
* have this many bytes left that we do a linear search over.
*
* Since (after level 5) every level of the bset_tree is on a new cacheline,
* we're touching one fewer cacheline in the bset tree in exchange for one more
* cacheline in the linear search - but the linear search might stop before it
* gets to the second cacheline.
*/
#define BSET_CACHELINE 128
#define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE)
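Illustrative arithmetic (node size assumed): with BSET_CACHELINE = 128, a node whose data area is 8 KiB gets bset_tree_space = 8192 / 128 = 64 bkey_float slots, so a lookup descends roughly log2(64) = 6 tree entries and then finishes with a linear scan bounded by one 128-byte span of keys.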
#ifdef CONFIG_BCACHE_DEBUG
#define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float))
#define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t))
int __bch_count_data(struct btree_keys *);
void __bch_check_keys(struct btree_keys *, const char *, ...);
void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
void bch_dump_bucket(struct btree_keys *);
void bch_bset_init_next(struct btree *);
#else
void bch_bset_fix_invalidated_key(struct btree *, struct bkey *);
void bch_bset_fix_lookup_table(struct btree *, struct bkey *);
static inline int __bch_count_data(struct btree_keys *b) { return -1; }
static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
static inline void bch_dump_bucket(struct btree_keys *b) {}
void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
struct bkey *__bch_bset_search(struct btree *, struct bset_tree *,
const struct bkey *);
#endif
/*
* Returns the first key that is strictly greater than search
*/
static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
const struct bkey *search)
static inline bool btree_keys_expensive_checks(struct btree_keys *b)
{
return search ? __bch_bset_search(b, t, search) : t->data->start;
#ifdef CONFIG_BCACHE_DEBUG
return *b->expensive_debug_checks;
#else
return false;
#endif
}
#define PRECEDING_KEY(_k) \
({ \
struct bkey *_ret = NULL; \
\
if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
_ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
\
if (!_ret->low) \
_ret->high--; \
_ret->low--; \
} \
\
_ret; \
})
bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
void bch_btree_sort_lazy(struct btree *);
void bch_btree_sort_into(struct btree *, struct btree *);
void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *);
void bch_btree_sort_partial(struct btree *, unsigned);
static inline void bch_btree_sort(struct btree *b)
static inline int bch_count_data(struct btree_keys *b)
{
bch_btree_sort_partial(b, 0);
return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1;
}
int bch_bset_print_stats(struct cache_set *, char *);
#define bch_check_keys(b, ...) \
do { \
if (btree_keys_expensive_checks(b)) \
__bch_check_keys(b, __VA_ARGS__); \
} while (0)
#endif
......@@ -23,7 +23,7 @@
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"
#include "extents.h"
#include <linux/slab.h>
#include <linux/bitops.h>
......@@ -89,13 +89,6 @@
* Test module load/unload
*/
enum {
BTREE_INSERT_STATUS_INSERT,
BTREE_INSERT_STATUS_BACK_MERGE,
BTREE_INSERT_STATUS_OVERWROTE,
BTREE_INSERT_STATUS_FRONT_MERGE,
};
#define MAX_NEED_GC 64
#define MAX_SAVE_PRIO 72
......@@ -106,14 +99,6 @@ enum {
static struct workqueue_struct *btree_io_wq;
static inline bool should_split(struct btree *b)
{
struct bset *i = write_block(b);
return b->written >= btree_blocks(b) ||
(b->written + __set_blocks(i, i->keys + 15, b->c)
> btree_blocks(b));
}
#define insert_lock(s, b) ((b)->level <= (s)->lock)
/*
......@@ -167,6 +152,8 @@ static inline bool should_split(struct btree *b)
_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
} \
rw_unlock(_w, _b); \
if (_r == -EINTR) \
schedule(); \
bch_cannibalize_unlock(c); \
if (_r == -ENOSPC) { \
wait_event((c)->try_wait, \
......@@ -175,9 +162,15 @@ static inline bool should_split(struct btree *b)
} \
} while (_r == -EINTR); \
\
finish_wait(&(c)->bucket_wait, &(op)->wait); \
_r; \
})
static inline struct bset *write_block(struct btree *b)
{
return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
}
/* Btree key manipulation */
void bkey_put(struct cache_set *c, struct bkey *k)
......@@ -194,16 +187,16 @@ void bkey_put(struct cache_set *c, struct bkey *k)
static uint64_t btree_csum_set(struct btree *b, struct bset *i)
{
uint64_t crc = b->key.ptr[0];
void *data = (void *) i + 8, *end = end(i);
void *data = (void *) i + 8, *end = bset_bkey_last(i);
crc = bch_crc64_update(crc, data, end - data);
return crc ^ 0xffffffffffffffffULL;
}
static void bch_btree_node_read_done(struct btree *b)
void bch_btree_node_read_done(struct btree *b)
{
const char *err = "bad btree header";
struct bset *i = b->sets[0].data;
struct bset *i = btree_bset_first(b);
struct btree_iter *iter;
iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
......@@ -211,21 +204,22 @@ static void bch_btree_node_read_done(struct btree *b)
iter->used = 0;
#ifdef CONFIG_BCACHE_DEBUG
iter->b = b;
iter->b = &b->keys;
#endif
if (!i->seq)
goto err;
for (;
b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
i = write_block(b)) {
err = "unsupported bset version";
if (i->version > BCACHE_BSET_VERSION)
goto err;
err = "bad btree header";
if (b->written + set_blocks(i, b->c) > btree_blocks(b))
if (b->written + set_blocks(i, block_bytes(b->c)) >
btree_blocks(b))
goto err;
err = "bad magic";
......@@ -245,39 +239,40 @@ static void bch_btree_node_read_done(struct btree *b)
}
err = "empty set";
if (i != b->sets[0].data && !i->keys)
if (i != b->keys.set[0].data && !i->keys)
goto err;
bch_btree_iter_push(iter, i->start, end(i));
bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
b->written += set_blocks(i, b->c);
b->written += set_blocks(i, block_bytes(b->c));
}
err = "corrupted btree";
for (i = write_block(b);
index(i, b) < btree_blocks(b);
bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
i = ((void *) i) + block_bytes(b->c))
if (i->seq == b->sets[0].data->seq)
if (i->seq == b->keys.set[0].data->seq)
goto err;
bch_btree_sort_and_fix_extents(b, iter);
bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
i = b->sets[0].data;
i = b->keys.set[0].data;
err = "short btree key";
if (b->sets[0].size &&
bkey_cmp(&b->key, &b->sets[0].end) < 0)
if (b->keys.set[0].size &&
bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
goto err;
if (b->written < btree_blocks(b))
bch_bset_init_next(b);
bch_bset_init_next(&b->keys, write_block(b),
bset_magic(&b->c->sb));
out:
mempool_free(iter, b->c->fill_iter);
return;
err:
set_btree_node_io_error(b);
bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
err, PTR_BUCKET_NR(b->c, &b->key, 0),
index(i, b), i->keys);
bset_block_offset(b, i), i->keys);
goto out;
}
......@@ -287,7 +282,7 @@ static void btree_node_read_endio(struct bio *bio, int error)
closure_put(cl);
}
void bch_btree_node_read(struct btree *b)
static void bch_btree_node_read(struct btree *b)
{
uint64_t start_time = local_clock();
struct closure cl;
......@@ -303,7 +298,7 @@ void bch_btree_node_read(struct btree *b)
bio->bi_end_io = btree_node_read_endio;
bio->bi_private = &cl;
bch_bio_map(bio, b->sets[0].data);
bch_bio_map(bio, b->keys.set[0].data);
bch_submit_bbio(bio, b->c, &b->key, 0);
closure_sync(&cl);
......@@ -340,9 +335,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w)
w->journal = NULL;
}
static void btree_node_write_unlock(struct closure *cl)
{
struct btree *b = container_of(cl, struct btree, io);
up(&b->io_mutex);
}
static void __btree_node_write_done(struct closure *cl)
{
struct btree *b = container_of(cl, struct btree, io.cl);
struct btree *b = container_of(cl, struct btree, io);
struct btree_write *w = btree_prev_write(b);
bch_bbio_free(b->bio, b->c);
......@@ -353,12 +355,12 @@ static void __btree_node_write_done(struct closure *cl)
queue_delayed_work(btree_io_wq, &b->work,
msecs_to_jiffies(30000));
closure_return(cl);
closure_return_with_destructor(cl, btree_node_write_unlock);
}
static void btree_node_write_done(struct closure *cl)
{
struct btree *b = container_of(cl, struct btree, io.cl);
struct btree *b = container_of(cl, struct btree, io);
struct bio_vec *bv;
int n;
......@@ -371,7 +373,7 @@ static void btree_node_write_done(struct closure *cl)
static void btree_node_write_endio(struct bio *bio, int error)
{
struct closure *cl = bio->bi_private;
struct btree *b = container_of(cl, struct btree, io.cl);
struct btree *b = container_of(cl, struct btree, io);
if (error)
set_btree_node_io_error(b);
......@@ -382,8 +384,8 @@ static void btree_node_write_endio(struct bio *bio, int error)
static void do_btree_node_write(struct btree *b)
{
struct closure *cl = &b->io.cl;
struct bset *i = b->sets[b->nsets].data;
struct closure *cl = &b->io;
struct bset *i = btree_bset_last(b);
BKEY_PADDED(key) k;
i->version = BCACHE_BSET_VERSION;
......@@ -395,7 +397,7 @@ static void do_btree_node_write(struct btree *b)
b->bio->bi_end_io = btree_node_write_endio;
b->bio->bi_private = cl;
b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
b->bio->bi_iter.bi_size = set_blocks(i, b->c) * block_bytes(b->c);
b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
bch_bio_map(b->bio, i);
/*
......@@ -414,7 +416,8 @@ static void do_btree_node_write(struct btree *b)
*/
bkey_copy(&k.key, &b->key);
SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
bset_sector_offset(&b->keys, i));
if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
int j;
......@@ -435,40 +438,54 @@ static void do_btree_node_write(struct btree *b)
bch_submit_bbio(b->bio, b->c, &k.key, 0);
closure_sync(cl);
__btree_node_write_done(cl);
continue_at_nobarrier(cl, __btree_node_write_done, NULL);
}
}
void bch_btree_node_write(struct btree *b, struct closure *parent)
{
struct bset *i = b->sets[b->nsets].data;
struct bset *i = btree_bset_last(b);
trace_bcache_btree_write(b);
BUG_ON(current->bio_list);
BUG_ON(b->written >= btree_blocks(b));
BUG_ON(b->written && !i->keys);
BUG_ON(b->sets->data->seq != i->seq);
bch_check_keys(b, "writing");
BUG_ON(btree_bset_first(b)->seq != i->seq);
bch_check_keys(&b->keys, "writing");
cancel_delayed_work(&b->work);
/* If caller isn't waiting for write, parent refcount is cache set */
closure_lock(&b->io, parent ?: &b->c->cl);
down(&b->io_mutex);
closure_init(&b->io, parent ?: &b->c->cl);
clear_bit(BTREE_NODE_dirty, &b->flags);
change_bit(BTREE_NODE_write_idx, &b->flags);
do_btree_node_write(b);
b->written += set_blocks(i, b->c);
atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
bch_btree_sort_lazy(b);
b->written += set_blocks(i, block_bytes(b->c));
/* If not a leaf node, always sort */
if (b->level && b->keys.nsets)
bch_btree_sort(&b->keys, &b->c->sort);
else
bch_btree_sort_lazy(&b->keys, &b->c->sort);
/*
* do verify if there was more than one set initially (i.e. we did a
* sort) and we sorted down to a single set:
*/
if (i != b->keys.set->data && !b->keys.nsets)
bch_btree_verify(b);
if (b->written < btree_blocks(b))
bch_bset_init_next(b);
bch_bset_init_next(&b->keys, write_block(b),
bset_magic(&b->c->sb));
}
static void bch_btree_node_write_sync(struct btree *b)
......@@ -493,7 +510,7 @@ static void btree_node_write_work(struct work_struct *w)
static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
{
struct bset *i = b->sets[b->nsets].data;
struct bset *i = btree_bset_last(b);
struct btree_write *w = btree_current_write(b);
BUG_ON(!b->written);
......@@ -528,24 +545,6 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
* mca -> memory cache
*/
static void mca_reinit(struct btree *b)
{
unsigned i;
b->flags = 0;
b->written = 0;
b->nsets = 0;
for (i = 0; i < MAX_BSETS; i++)
b->sets[i].size = 0;
/*
* Second loop starts at 1 because b->sets[0]->data is the memory we
* allocated
*/
for (i = 1; i < MAX_BSETS; i++)
b->sets[i].data = NULL;
}
#define mca_reserve(c) (((c->root && c->root->level) \
? c->root->level : 1) * 8 + 16)
#define mca_can_free(c) \
......@@ -553,28 +552,12 @@ static void mca_reinit(struct btree *b)
static void mca_data_free(struct btree *b)
{
struct bset_tree *t = b->sets;
BUG_ON(!closure_is_unlocked(&b->io.cl));
BUG_ON(b->io_mutex.count != 1);
if (bset_prev_bytes(b) < PAGE_SIZE)
kfree(t->prev);
else
free_pages((unsigned long) t->prev,
get_order(bset_prev_bytes(b)));
bch_btree_keys_free(&b->keys);
if (bset_tree_bytes(b) < PAGE_SIZE)
kfree(t->tree);
else
free_pages((unsigned long) t->tree,
get_order(bset_tree_bytes(b)));
free_pages((unsigned long) t->data, b->page_order);
t->prev = NULL;
t->tree = NULL;
t->data = NULL;
list_move(&b->list, &b->c->btree_cache_freed);
b->c->bucket_cache_used--;
list_move(&b->list, &b->c->btree_cache_freed);
}
static void mca_bucket_free(struct btree *b)
......@@ -593,34 +576,16 @@ static unsigned btree_order(struct bkey *k)
static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
{
struct bset_tree *t = b->sets;
BUG_ON(t->data);
b->page_order = max_t(unsigned,
ilog2(b->c->btree_pages),
btree_order(k));
t->data = (void *) __get_free_pages(gfp, b->page_order);
if (!t->data)
goto err;
t->tree = bset_tree_bytes(b) < PAGE_SIZE
? kmalloc(bset_tree_bytes(b), gfp)
: (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
if (!t->tree)
goto err;
t->prev = bset_prev_bytes(b) < PAGE_SIZE
? kmalloc(bset_prev_bytes(b), gfp)
: (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
if (!t->prev)
goto err;
list_move(&b->list, &b->c->btree_cache);
b->c->bucket_cache_used++;
return;
err:
mca_data_free(b);
if (!bch_btree_keys_alloc(&b->keys,
max_t(unsigned,
ilog2(b->c->btree_pages),
btree_order(k)),
gfp)) {
b->c->bucket_cache_used++;
list_move(&b->list, &b->c->btree_cache);
} else {
list_move(&b->list, &b->c->btree_cache_freed);
}
}
static struct btree *mca_bucket_alloc(struct cache_set *c,
......@@ -635,7 +600,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
INIT_LIST_HEAD(&b->list);
INIT_DELAYED_WORK(&b->work, btree_node_write_work);
b->c = c;
closure_init_unlocked(&b->io);
sema_init(&b->io_mutex, 1);
mca_data_alloc(b, k, gfp);
return b;
......@@ -651,24 +616,31 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
if (!down_write_trylock(&b->lock))
return -ENOMEM;
BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);
if (b->page_order < min_order ||
(!flush &&
(btree_node_dirty(b) ||
atomic_read(&b->io.cl.remaining) != -1))) {
rw_unlock(true, b);
return -ENOMEM;
if (b->keys.page_order < min_order)
goto out_unlock;
if (!flush) {
if (btree_node_dirty(b))
goto out_unlock;
if (down_trylock(&b->io_mutex))
goto out_unlock;
up(&b->io_mutex);
}
if (btree_node_dirty(b))
bch_btree_node_write_sync(b);
/* wait for any in flight btree write */
closure_wait_event(&b->io.wait, &cl,
atomic_read(&b->io.cl.remaining) == -1);
down(&b->io_mutex);
up(&b->io_mutex);
return 0;
out_unlock:
rw_unlock(true, b);
return -ENOMEM;
}
static unsigned long bch_mca_scan(struct shrinker *shrink,
......@@ -714,14 +686,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
}
}
/*
* Can happen right when we first start up, before we've read in any
* btree nodes
*/
if (list_empty(&c->btree_cache))
goto out;
for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
if (list_empty(&c->btree_cache))
goto out;
b = list_first_entry(&c->btree_cache, struct btree, list);
list_rotate_left(&c->btree_cache);
......@@ -767,6 +735,8 @@ void bch_btree_cache_free(struct cache_set *c)
#ifdef CONFIG_BCACHE_DEBUG
if (c->verify_data)
list_move(&c->verify_data->list, &c->btree_cache);
free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
#endif
list_splice(&c->btree_cache_freeable,
......@@ -807,10 +777,13 @@ int bch_btree_cache_alloc(struct cache_set *c)
#ifdef CONFIG_BCACHE_DEBUG
mutex_init(&c->verify_lock);
c->verify_ondisk = (void *)
__get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
if (c->verify_data &&
c->verify_data->sets[0].data)
c->verify_data->keys.set->data)
list_del_init(&c->verify_data->list);
else
c->verify_data = NULL;
......@@ -908,7 +881,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
list_for_each_entry(b, &c->btree_cache_freed, list)
if (!mca_reap(b, 0, false)) {
mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
if (!b->sets[0].data)
if (!b->keys.set[0].data)
goto err;
else
goto out;
......@@ -919,10 +892,10 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
goto err;
BUG_ON(!down_write_trylock(&b->lock));
if (!b->sets->data)
if (!b->keys.set->data)
goto err;
out:
BUG_ON(!closure_is_unlocked(&b->io.cl));
BUG_ON(b->io_mutex.count != 1);
bkey_copy(&b->key, k);
list_move(&b->list, &c->btree_cache);
......@@ -930,10 +903,17 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
hlist_add_head_rcu(&b->hash, mca_hash(c, k));
lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
b->level = level;
b->parent = (void *) ~0UL;
b->flags = 0;
b->written = 0;
b->level = level;
mca_reinit(b);
if (!b->level)
bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
&b->c->expensive_debug_checks);
else
bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
&b->c->expensive_debug_checks);
return b;
err:
......@@ -994,13 +974,13 @@ struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
b->accessed = 1;
for (; i <= b->nsets && b->sets[i].size; i++) {
prefetch(b->sets[i].tree);
prefetch(b->sets[i].data);
for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
prefetch(b->keys.set[i].tree);
prefetch(b->keys.set[i].data);
}
for (; i <= b->nsets; i++)
prefetch(b->sets[i].data);
for (; i <= b->keys.nsets; i++)
prefetch(b->keys.set[i].data);
if (btree_node_io_error(b)) {
rw_unlock(write, b);
......@@ -1063,7 +1043,7 @@ struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
mutex_lock(&c->bucket_lock);
retry:
if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait))
if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
goto err;
bkey_put(c, &k.key);
......@@ -1080,7 +1060,7 @@ struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
}
b->accessed = 1;
bch_bset_init_next(b);
bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
mutex_unlock(&c->bucket_lock);
......@@ -1098,8 +1078,10 @@ struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
{
struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
if (!IS_ERR_OR_NULL(n))
bch_btree_sort_into(b, n);
if (!IS_ERR_OR_NULL(n)) {
bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
bkey_copy_key(&n->key, &b->key);
}
return n;
}
......@@ -1120,6 +1102,28 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)
atomic_inc(&b->c->prio_blocked);
}
static int btree_check_reserve(struct btree *b, struct btree_op *op)
{
struct cache_set *c = b->c;
struct cache *ca;
unsigned i, reserve = c->root->level * 2 + 1;
int ret = 0;
mutex_lock(&c->bucket_lock);
for_each_cache(ca, c, i)
if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
if (op)
prepare_to_wait(&c->bucket_wait, &op->wait,
TASK_UNINTERRUPTIBLE);
ret = -EINTR;
break;
}
mutex_unlock(&c->bucket_lock);
return ret;
}
/* Garbage collection */
uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
......@@ -1183,11 +1187,11 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
gc->nodes++;
for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
stale = max(stale, btree_mark_key(b, k));
keys++;
if (bch_ptr_bad(b, k))
if (bch_ptr_bad(&b->keys, k))
continue;
gc->key_bytes += bkey_u64s(k);
......@@ -1197,9 +1201,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
gc->data += KEY_SIZE(k);
}
for (t = b->sets; t <= &b->sets[b->nsets]; t++)
for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
btree_bug_on(t->size &&
bset_written(b, t) &&
bset_written(&b->keys, t) &&
bkey_cmp(&b->key, &t->end) < 0,
b, "found short btree key in gc");
......@@ -1243,7 +1247,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
blocks = btree_default_blocks(b->c) * 2 / 3;
if (nodes < 2 ||
__set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
__set_blocks(b->keys.set[0].data, keys,
block_bytes(b->c)) > blocks * (nodes - 1))
return 0;
for (i = 0; i < nodes; i++) {
......@@ -1253,18 +1258,19 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
}
for (i = nodes - 1; i > 0; --i) {
struct bset *n1 = new_nodes[i]->sets->data;
struct bset *n2 = new_nodes[i - 1]->sets->data;
struct bset *n1 = btree_bset_first(new_nodes[i]);
struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
struct bkey *k, *last = NULL;
keys = 0;
if (i > 1) {
for (k = n2->start;
k < end(n2);
k < bset_bkey_last(n2);
k = bkey_next(k)) {
if (__set_blocks(n1, n1->keys + keys +
bkey_u64s(k), b->c) > blocks)
bkey_u64s(k),
block_bytes(b->c)) > blocks)
break;
last = k;
......@@ -1280,7 +1286,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
* though)
*/
if (__set_blocks(n1, n1->keys + n2->keys,
b->c) > btree_blocks(new_nodes[i]))
block_bytes(b->c)) >
btree_blocks(new_nodes[i]))
goto out_nocoalesce;
keys = n2->keys;
......@@ -1288,27 +1295,28 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
last = &r->b->key;
}
BUG_ON(__set_blocks(n1, n1->keys + keys,
b->c) > btree_blocks(new_nodes[i]));
BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
btree_blocks(new_nodes[i]));
if (last)
bkey_copy_key(&new_nodes[i]->key, last);
memcpy(end(n1),
memcpy(bset_bkey_last(n1),
n2->start,
(void *) node(n2, keys) - (void *) n2->start);
(void *) bset_bkey_idx(n2, keys) - (void *) n2->start);
n1->keys += keys;
r[i].keys = n1->keys;
memmove(n2->start,
node(n2, keys),
(void *) end(n2) - (void *) node(n2, keys));
bset_bkey_idx(n2, keys),
(void *) bset_bkey_last(n2) -
(void *) bset_bkey_idx(n2, keys));
n2->keys -= keys;
if (bch_keylist_realloc(keylist,
KEY_PTRS(&new_nodes[i]->key), b->c))
if (__bch_keylist_realloc(keylist,
bkey_u64s(&new_nodes[i]->key)))
goto out_nocoalesce;
bch_btree_node_write(new_nodes[i], &cl);
......@@ -1316,7 +1324,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
}
for (i = 0; i < nodes; i++) {
if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c))
if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key)))
goto out_nocoalesce;
make_btree_freeing_key(r[i].b, keylist->top);
......@@ -1324,7 +1332,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
}
/* We emptied out this node */
BUG_ON(new_nodes[0]->sets->data->keys);
BUG_ON(btree_bset_first(new_nodes[0])->keys);
btree_node_free(new_nodes[0]);
rw_unlock(true, new_nodes[0]);
......@@ -1370,7 +1378,7 @@ static unsigned btree_gc_count_keys(struct btree *b)
struct btree_iter iter;
unsigned ret = 0;
for_each_key_filter(b, k, &iter, bch_ptr_bad)
for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
ret += bkey_u64s(k);
return ret;
......@@ -1390,13 +1398,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
bch_keylist_init(&keys);
bch_btree_iter_init(b, &iter, &b->c->gc_done);
bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
for (i = 0; i < GC_MERGE_NODES; i++)
r[i].b = ERR_PTR(-EINTR);
while (1) {
k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
if (k) {
r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
if (IS_ERR(r->b)) {
......@@ -1416,7 +1424,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
if (!IS_ERR(last->b)) {
should_rewrite = btree_gc_mark_node(last->b, gc);
if (should_rewrite) {
if (should_rewrite &&
!btree_check_reserve(b, NULL)) {
n = btree_node_alloc_replacement(last->b,
false);
......@@ -1705,7 +1714,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
struct bucket *g;
struct btree_iter iter;
for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
for (i = 0; i < KEY_PTRS(k); i++) {
if (!ptr_available(b->c, k, i))
continue;
......@@ -1728,10 +1737,11 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
}
if (b->level) {
bch_btree_iter_init(b, &iter, NULL);
bch_btree_iter_init(&b->keys, &iter, NULL);
do {
k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
k = bch_btree_iter_next_filter(&iter, &b->keys,
bch_ptr_bad);
if (k)
btree_node_prefetch(b->c, k, b->level - 1);
......@@ -1774,235 +1784,36 @@ int bch_btree_check(struct cache_set *c)
/* Btree insertion */
static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
{
struct bset *i = b->sets[b->nsets].data;
memmove((uint64_t *) where + bkey_u64s(insert),
where,
(void *) end(i) - (void *) where);
i->keys += bkey_u64s(insert);
bkey_copy(where, insert);
bch_bset_fix_lookup_table(b, where);
}
static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
struct btree_iter *iter,
struct bkey *replace_key)
static bool btree_insert_key(struct btree *b, struct bkey *k,
struct bkey *replace_key)
{
void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
{
if (KEY_DIRTY(k))
bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
offset, -sectors);
}
uint64_t old_offset;
unsigned old_size, sectors_found = 0;
while (1) {
struct bkey *k = bch_btree_iter_next(iter);
if (!k ||
bkey_cmp(&START_KEY(k), insert) >= 0)
break;
if (bkey_cmp(k, &START_KEY(insert)) <= 0)
continue;
old_offset = KEY_START(k);
old_size = KEY_SIZE(k);
/*
* We might overlap with 0 size extents; we can't skip these
* because if they're in the set we're inserting to we have to
* adjust them so they don't overlap with the key we're
* inserting. But we don't want to check them for replace
* operations.
*/
if (replace_key && KEY_SIZE(k)) {
/*
* k might have been split since we inserted/found the
* key we're replacing
*/
unsigned i;
uint64_t offset = KEY_START(k) -
KEY_START(replace_key);
/* But it must be a subset of the replace key */
if (KEY_START(k) < KEY_START(replace_key) ||
KEY_OFFSET(k) > KEY_OFFSET(replace_key))
goto check_failed;
/* We didn't find a key that we were supposed to */
if (KEY_START(k) > KEY_START(insert) + sectors_found)
goto check_failed;
if (KEY_PTRS(k) != KEY_PTRS(replace_key) ||
KEY_DIRTY(k) != KEY_DIRTY(replace_key))
goto check_failed;
/* skip past gen */
offset <<= 8;
BUG_ON(!KEY_PTRS(replace_key));
unsigned status;
for (i = 0; i < KEY_PTRS(replace_key); i++)
if (k->ptr[i] != replace_key->ptr[i] + offset)
goto check_failed;
sectors_found = KEY_OFFSET(k) - KEY_START(insert);
}
if (bkey_cmp(insert, k) < 0 &&
bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
/*
* We overlapped in the middle of an existing key: that
* means we have to split the old key. But we have to do
* slightly different things depending on whether the
* old key has been written out yet.
*/
struct bkey *top;
subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
if (bkey_written(b, k)) {
/*
* We insert a new key to cover the top of the
* old key, and the old key is modified in place
* to represent the bottom split.
*
* It's completely arbitrary whether the new key
* is the top or the bottom, but it has to match
* up with what btree_sort_fixup() does - it
* doesn't check for this kind of overlap, it
* depends on us inserting a new key for the top
* here.
*/
top = bch_bset_search(b, &b->sets[b->nsets],
insert);
shift_keys(b, top, k);
} else {
BKEY_PADDED(key) temp;
bkey_copy(&temp.key, k);
shift_keys(b, k, &temp.key);
top = bkey_next(k);
}
bch_cut_front(insert, top);
bch_cut_back(&START_KEY(insert), k);
bch_bset_fix_invalidated_key(b, k);
return false;
}
if (bkey_cmp(insert, k) < 0) {
bch_cut_front(insert, k);
} else {
if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
old_offset = KEY_START(insert);
if (bkey_written(b, k) &&
bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
/*
* Completely overwrote, so we don't have to
* invalidate the binary search tree
*/
bch_cut_front(k, k);
} else {
__bch_cut_back(&START_KEY(insert), k);
bch_bset_fix_invalidated_key(b, k);
}
}
subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
}
BUG_ON(bkey_cmp(k, &b->key) > 0);
check_failed:
if (replace_key) {
if (!sectors_found) {
return true;
} else if (sectors_found < KEY_SIZE(insert)) {
SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
(KEY_SIZE(insert) - sectors_found));
SET_KEY_SIZE(insert, sectors_found);
}
}
status = bch_btree_insert_key(&b->keys, k, replace_key);
if (status != BTREE_INSERT_STATUS_NO_INSERT) {
bch_check_keys(&b->keys, "%u for %s", status,
replace_key ? "replace" : "insert");
return false;
trace_bcache_btree_insert_key(b, k, replace_key != NULL,
status);
return true;
} else
return false;
}
static bool btree_insert_key(struct btree *b, struct btree_op *op,
struct bkey *k, struct bkey *replace_key)
static size_t insert_u64s_remaining(struct btree *b)
{
struct bset *i = b->sets[b->nsets].data;
struct bkey *m, *prev;
unsigned status = BTREE_INSERT_STATUS_INSERT;
BUG_ON(bkey_cmp(k, &b->key) > 0);
BUG_ON(b->level && !KEY_PTRS(k));
BUG_ON(!b->level && !KEY_OFFSET(k));
if (!b->level) {
struct btree_iter iter;
/*
* bset_search() returns the first key that is strictly greater
* than the search key - but for back merging, we want to find
* the previous key.
*/
prev = NULL;
m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));
ssize_t ret = bch_btree_keys_u64s_remaining(&b->keys);
if (fix_overlapping_extents(b, k, &iter, replace_key)) {
op->insert_collision = true;
return false;
}
if (KEY_DIRTY(k))
bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
KEY_START(k), KEY_SIZE(k));
while (m != end(i) &&
bkey_cmp(k, &START_KEY(m)) > 0)
prev = m, m = bkey_next(m);
if (key_merging_disabled(b->c))
goto insert;
/* prev is in the tree, if we merge we're done */
status = BTREE_INSERT_STATUS_BACK_MERGE;
if (prev &&
bch_bkey_try_merge(b, prev, k))
goto merged;
status = BTREE_INSERT_STATUS_OVERWROTE;
if (m != end(i) &&
KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
goto copy;
status = BTREE_INSERT_STATUS_FRONT_MERGE;
if (m != end(i) &&
bch_bkey_try_merge(b, k, m))
goto copy;
} else {
BUG_ON(replace_key);
m = bch_bset_search(b, &b->sets[b->nsets], k);
}
insert: shift_keys(b, m, k);
copy: bkey_copy(m, k);
merged:
bch_check_keys(b, "%u for %s", status,
replace_key ? "replace" : "insert");
if (b->level && !KEY_OFFSET(k))
btree_current_write(b)->prio_blocked++;
trace_bcache_btree_insert_key(b, k, replace_key != NULL, status);
/*
* Might land in the middle of an existing extent and have to split it
*/
if (b->keys.ops->is_extents)
ret -= KEY_MAX_U64S;
return true;
return max(ret, 0L);
}
static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
......@@ -2010,21 +1821,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
struct bkey *replace_key)
{
bool ret = false;
int oldsize = bch_count_data(b);
int oldsize = bch_count_data(&b->keys);
while (!bch_keylist_empty(insert_keys)) {
struct bset *i = write_block(b);
struct bkey *k = insert_keys->keys;
if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
> btree_blocks(b))
if (bkey_u64s(k) > insert_u64s_remaining(b))
break;
if (bkey_cmp(k, &b->key) <= 0) {
if (!b->level)
bkey_put(b->c, k);
ret |= btree_insert_key(b, op, k, replace_key);
ret |= btree_insert_key(b, k, replace_key);
bch_keylist_pop_front(insert_keys);
} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
BKEY_PADDED(key) temp;
......@@ -2033,16 +1842,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
bch_cut_back(&b->key, &temp.key);
bch_cut_front(&b->key, insert_keys->keys);
ret |= btree_insert_key(b, op, &temp.key, replace_key);
ret |= btree_insert_key(b, &temp.key, replace_key);
break;
} else {
break;
}
}
if (!ret)
op->insert_collision = true;
BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
BUG_ON(bch_count_data(b) < oldsize);
BUG_ON(bch_count_data(&b->keys) < oldsize);
return ret;
}
......@@ -2059,16 +1871,21 @@ static int btree_split(struct btree *b, struct btree_op *op,
closure_init_stack(&cl);
bch_keylist_init(&parent_keys);
if (!b->level &&
btree_check_reserve(b, op))
return -EINTR;
n1 = btree_node_alloc_replacement(b, true);
if (IS_ERR(n1))
goto err;
split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
split = set_blocks(btree_bset_first(n1),
block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
if (split) {
unsigned keys = 0;
trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
n2 = bch_btree_node_alloc(b->c, b->level, true);
if (IS_ERR(n2))
......@@ -2087,18 +1904,20 @@ static int btree_split(struct btree *b, struct btree_op *op,
* search tree yet
*/
while (keys < (n1->sets[0].data->keys * 3) / 5)
keys += bkey_u64s(node(n1->sets[0].data, keys));
while (keys < (btree_bset_first(n1)->keys * 3) / 5)
keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
keys));
bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
keys += bkey_u64s(node(n1->sets[0].data, keys));
bkey_copy_key(&n1->key,
bset_bkey_idx(btree_bset_first(n1), keys));
keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));
n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
n1->sets[0].data->keys = keys;
btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
btree_bset_first(n1)->keys = keys;
memcpy(n2->sets[0].data->start,
end(n1->sets[0].data),
n2->sets[0].data->keys * sizeof(uint64_t));
memcpy(btree_bset_first(n2)->start,
bset_bkey_last(btree_bset_first(n1)),
btree_bset_first(n2)->keys * sizeof(uint64_t));
bkey_copy_key(&n2->key, &b->key);
......@@ -2106,7 +1925,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
bch_btree_node_write(n2, &cl);
rw_unlock(true, n2);
} else {
trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
bch_btree_insert_keys(n1, op, insert_keys, replace_key);
}
......@@ -2149,18 +1968,21 @@ static int btree_split(struct btree *b, struct btree_op *op,
return 0;
err_free2:
bkey_put(b->c, &n2->key);
btree_node_free(n2);
rw_unlock(true, n2);
err_free1:
bkey_put(b->c, &n1->key);
btree_node_free(n1);
rw_unlock(true, n1);
err:
WARN(1, "bcache: btree split failed");
if (n3 == ERR_PTR(-EAGAIN) ||
n2 == ERR_PTR(-EAGAIN) ||
n1 == ERR_PTR(-EAGAIN))
return -EAGAIN;
pr_warn("couldn't split");
return -ENOMEM;
}
......@@ -2171,7 +1993,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
{
BUG_ON(b->level && replace_key);
if (should_split(b)) {
if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
if (current->bio_list) {
op->lock = b->c->root->level + 1;
return -EAGAIN;
......@@ -2180,11 +2002,13 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
return -EINTR;
} else {
/* Invalidated all iterators */
return btree_split(b, op, insert_keys, replace_key) ?:
-EINTR;
int ret = btree_split(b, op, insert_keys, replace_key);
return bch_keylist_empty(insert_keys) ?
0 : ret ?: -EINTR;
}
} else {
BUG_ON(write_block(b) != b->sets[b->nsets].data);
BUG_ON(write_block(b) != btree_bset_last(b));
if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
if (!b->level)
......@@ -2323,9 +2147,9 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
struct bkey *k;
struct btree_iter iter;
bch_btree_iter_init(b, &iter, from);
bch_btree_iter_init(&b->keys, &iter, from);
while ((k = bch_btree_iter_next_filter(&iter, b,
while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
bch_ptr_bad))) {
ret = btree(map_nodes_recurse, k, b,
op, from, fn, flags);
......@@ -2356,9 +2180,9 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
struct bkey *k;
struct btree_iter iter;
bch_btree_iter_init(b, &iter, from);
bch_btree_iter_init(&b->keys, &iter, from);
while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) {
while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
ret = !b->level
? fn(op, b, k)
: btree(map_keys_recurse, k, b, op, from, fn, flags);
......
......@@ -130,20 +130,12 @@ struct btree {
unsigned long flags;
uint16_t written; /* would be nice to kill */
uint8_t level;
uint8_t nsets;
uint8_t page_order;
/*
* Set of sorted keys - the real btree node - plus a binary search tree
*
* sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
* to the memory we have allocated for this btree node. Additionally,
* set[0]->data points to the entire btree node as it exists on disk.
*/
struct bset_tree sets[MAX_BSETS];
struct btree_keys keys;
/* For outstanding btree writes, used as a lock - protects write_idx */
struct closure_with_waitlist io;
struct closure io;
struct semaphore io_mutex;
struct list_head list;
struct delayed_work work;
......@@ -179,24 +171,19 @@ static inline struct btree_write *btree_prev_write(struct btree *b)
return b->writes + (btree_node_write_idx(b) ^ 1);
}
static inline unsigned bset_offset(struct btree *b, struct bset *i)
static inline struct bset *btree_bset_first(struct btree *b)
{
return (((size_t) i) - ((size_t) b->sets->data)) >> 9;
return b->keys.set->data;
}
static inline struct bset *write_block(struct btree *b)
static inline struct bset *btree_bset_last(struct btree *b)
{
return ((void *) b->sets[0].data) + b->written * block_bytes(b->c);
return bset_tree_last(&b->keys)->data;
}
static inline bool bset_written(struct btree *b, struct bset_tree *t)
static inline unsigned bset_block_offset(struct btree *b, struct bset *i)
{
return t->data < write_block(b);
}
static inline bool bkey_written(struct btree *b, struct bkey *k)
{
return k < write_block(b)->start;
return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
}
static inline void set_gc_sectors(struct cache_set *c)
......@@ -204,21 +191,6 @@ static inline void set_gc_sectors(struct cache_set *c)
atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
}
static inline struct bkey *bch_btree_iter_init(struct btree *b,
struct btree_iter *iter,
struct bkey *search)
{
return __bch_btree_iter_init(b, iter, search, b->sets);
}
static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
{
if (b->level)
return bch_btree_ptr_invalid(b->c, k);
else
return bch_extent_ptr_invalid(b->c, k);
}
void bkey_put(struct cache_set *c, struct bkey *k);
/* Looping macros */
......@@ -229,17 +201,12 @@ void bkey_put(struct cache_set *c, struct bkey *k);
iter++) \
hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)
#define for_each_key_filter(b, k, iter, filter) \
for (bch_btree_iter_init((b), (iter), NULL); \
((k) = bch_btree_iter_next_filter((iter), b, filter));)
#define for_each_key(b, k, iter) \
for (bch_btree_iter_init((b), (iter), NULL); \
((k) = bch_btree_iter_next(iter));)
/* Recursing down the btree */
struct btree_op {
/* for waiting on btree reserve in btree_split() */
wait_queue_t wait;
/* Btree level at which we start taking write locks */
short lock;
......@@ -249,6 +216,7 @@ struct btree_op {
static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
{
memset(op, 0, sizeof(struct btree_op));
init_wait(&op->wait);
op->lock = write_lock_level;
}
......@@ -267,7 +235,7 @@ static inline void rw_unlock(bool w, struct btree *b)
(w ? up_write : up_read)(&b->lock);
}
void bch_btree_node_read(struct btree *);
void bch_btree_node_read_done(struct btree *);
void bch_btree_node_write(struct btree *, struct closure *);
void bch_btree_set_root(struct btree *);
......
......@@ -11,19 +11,6 @@
#include "closure.h"
#define CL_FIELD(type, field) \
case TYPE_ ## type: \
return &container_of(cl, struct type, cl)->field
static struct closure_waitlist *closure_waitlist(struct closure *cl)
{
switch (cl->type) {
CL_FIELD(closure_with_waitlist, wait);
default:
return NULL;
}
}
static inline void closure_put_after_sub(struct closure *cl, int flags)
{
int r = flags & CLOSURE_REMAINING_MASK;
......@@ -42,17 +29,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
closure_queue(cl);
} else {
struct closure *parent = cl->parent;
struct closure_waitlist *wait = closure_waitlist(cl);
closure_fn *destructor = cl->fn;
closure_debug_destroy(cl);
smp_mb();
atomic_set(&cl->remaining, -1);
if (wait)
closure_wake_up(wait);
if (destructor)
destructor(cl);
......@@ -69,19 +49,18 @@ void closure_sub(struct closure *cl, int v)
}
EXPORT_SYMBOL(closure_sub);
/**
* closure_put - decrement a closure's refcount
*/
void closure_put(struct closure *cl)
{
closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
}
EXPORT_SYMBOL(closure_put);
static void set_waiting(struct closure *cl, unsigned long f)
{
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
cl->waiting_on = f;
#endif
}
/**
* closure_wake_up - wake up all closures on a wait list, without memory barrier
*/
void __closure_wake_up(struct closure_waitlist *wait_list)
{
struct llist_node *list;
......@@ -106,27 +85,34 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
cl = container_of(reverse, struct closure, list);
reverse = llist_next(reverse);
set_waiting(cl, 0);
closure_set_waiting(cl, 0);
closure_sub(cl, CLOSURE_WAITING + 1);
}
}
EXPORT_SYMBOL(__closure_wake_up);
bool closure_wait(struct closure_waitlist *list, struct closure *cl)
/**
* closure_wait - add a closure to a waitlist
*
* @waitlist will own a ref on @cl, which will be released when
* closure_wake_up() is called on @waitlist.
*
*/
bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
{
if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
return false;
set_waiting(cl, _RET_IP_);
closure_set_waiting(cl, _RET_IP_);
atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
llist_add(&cl->list, &list->list);
llist_add(&cl->list, &waitlist->list);
return true;
}
EXPORT_SYMBOL(closure_wait);
/**
 * closure_sync() - sleep until a closure has nothing left to wait on
 * closure_sync - sleep until a closure has nothing left to wait on
*
* Sleeps until the refcount hits 1 - the thread that's running the closure owns
* the last refcount.
......@@ -148,46 +134,6 @@ void closure_sync(struct closure *cl)
}
EXPORT_SYMBOL(closure_sync);
/**
* closure_trylock() - try to acquire the closure, without waiting
* @cl: closure to lock
*
 * Returns true if the closure was successfully locked.
*/
bool closure_trylock(struct closure *cl, struct closure *parent)
{
if (atomic_cmpxchg(&cl->remaining, -1,
CLOSURE_REMAINING_INITIALIZER) != -1)
return false;
smp_mb();
cl->parent = parent;
if (parent)
closure_get(parent);
closure_set_ret_ip(cl);
closure_debug_create(cl);
return true;
}
EXPORT_SYMBOL(closure_trylock);
void __closure_lock(struct closure *cl, struct closure *parent,
struct closure_waitlist *wait_list)
{
struct closure wait;
closure_init_stack(&wait);
while (1) {
if (closure_trylock(cl, parent))
return;
closure_wait_event(wait_list, &wait,
atomic_read(&cl->remaining) == -1);
}
}
EXPORT_SYMBOL(__closure_lock);
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
static LIST_HEAD(closure_list);
......
......@@ -72,30 +72,6 @@
* closure - _always_ use continue_at(). Doing so consistently will help
* eliminate an entire class of particularly pernicious races.
*
* For a closure to wait on an arbitrary event, we need to introduce waitlists:
*
* struct closure_waitlist list;
* closure_wait_event(list, cl, condition);
* closure_wake_up(wait_list);
*
 * These work analogously to wait_event() and wake_up() - except that instead of
* operating on the current thread (for wait_event()) and lists of threads, they
* operate on an explicit closure and lists of closures.
*
* Because it's a closure we can now wait either synchronously or
* asynchronously. closure_wait_event() returns the current value of the
* condition, and if it returned false continue_at() or closure_sync() can be
* used to wait for it to become true.
*
* It's useful for waiting on things when you can't sleep in the context in
* which you must check the condition (perhaps a spinlock held, or you might be
* beneath generic_make_request() - in which case you can't sleep on IO).
*
* closure_wait_event() will wait either synchronously or asynchronously,
* depending on whether the closure is in blocking mode or not. You can pick a
* mode explicitly with closure_wait_event_sync() and
* closure_wait_event_async(), which do just what you might expect.
*
* Lastly, you might have a wait list dedicated to a specific event, and have no
* need for specifying the condition - you just want to wait until someone runs
* closure_wake_up() on the appropriate wait list. In that case, just use
......@@ -121,40 +97,6 @@
* All this implies that a closure should typically be embedded in a particular
* struct (which its refcount will normally control the lifetime of), and that
* struct can very much be thought of as a stack frame.
*
* Locking:
*
* Closures are based on work items but they can be thought of as more like
* threads - in that like threads and unlike work items they have a well
* defined lifetime; they are created (with closure_init()) and eventually
* complete after a continue_at(cl, NULL, NULL).
*
* Suppose you've got some larger structure with a closure embedded in it that's
* used for periodically doing garbage collection. You only want one garbage
* collection happening at a time, so the natural thing to do is protect it with
* a lock. However, it's difficult to use a lock protecting a closure correctly
 * because the unlock should come after the last continue_at() (additionally, if
* you're using the closure asynchronously a mutex won't work since a mutex has
* to be unlocked by the same process that locked it).
*
* So to make it less error prone and more efficient, we also have the ability
* to use closures as locks:
*
* closure_init_unlocked();
* closure_trylock();
*
* That's all we need for trylock() - the last closure_put() implicitly unlocks
* it for you. But for closure_lock(), we also need a wait list:
*
* struct closure_with_waitlist frobnicator_cl;
*
* closure_init_unlocked(&frobnicator_cl);
* closure_lock(&frobnicator_cl);
*
* A closure_with_waitlist embeds a closure and a wait list - much like struct
* delayed_work embeds a work item and a timer_list. The important thing is, use
* it exactly like you would a regular closure and closure_put() will magically
* handle everything for you.
*/
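(Illustrative sketch, not part of the patch: the embedding pattern the comment above describes - a closure inside a containing struct whose lifetime its refcount controls. The struct and function names here are invented; only primitives declared in this header and in <linux/kernel.h> are used.)

#include <linux/kernel.h>
#include "closure.h"

/* Hypothetical op with an embedded closure; the op lives for as long as
 * someone holds a ref on op->cl. */
struct frobnicate_op {
	struct closure	cl;
	int		result;
};

static void frobnicate_done(struct closure *cl)
{
	/* Recover the containing struct - the closure's "stack frame". */
	struct frobnicate_op *op = container_of(cl, struct frobnicate_op, cl);

	pr_info("frobnicate finished: %d\n", op->result);
	closure_return(cl);	/* drops the ref on the parent, if any */
}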
struct closure;
......@@ -164,12 +106,6 @@ struct closure_waitlist {
struct llist_head list;
};
enum closure_type {
TYPE_closure = 0,
TYPE_closure_with_waitlist = 1,
MAX_CLOSURE_TYPE = 1,
};
enum closure_state {
/*
* CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
......@@ -224,8 +160,6 @@ struct closure {
atomic_t remaining;
enum closure_type type;
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
#define CLOSURE_MAGIC_DEAD 0xc054dead
#define CLOSURE_MAGIC_ALIVE 0xc054a11e
......@@ -237,34 +171,12 @@ struct closure {
#endif
};
struct closure_with_waitlist {
struct closure cl;
struct closure_waitlist wait;
};
extern unsigned invalid_closure_type(void);
#define __CLOSURE_TYPE(cl, _t) \
__builtin_types_compatible_p(typeof(cl), struct _t) \
? TYPE_ ## _t : \
#define __closure_type(cl) \
( \
__CLOSURE_TYPE(cl, closure) \
__CLOSURE_TYPE(cl, closure_with_waitlist) \
invalid_closure_type() \
)
void closure_sub(struct closure *cl, int v);
void closure_put(struct closure *cl);
void __closure_wake_up(struct closure_waitlist *list);
bool closure_wait(struct closure_waitlist *list, struct closure *cl);
void closure_sync(struct closure *cl);
bool closure_trylock(struct closure *cl, struct closure *parent);
void __closure_lock(struct closure *cl, struct closure *parent,
struct closure_waitlist *wait_list);
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
void closure_debug_init(void);
......@@ -293,134 +205,97 @@ static inline void closure_set_ret_ip(struct closure *cl)
#endif
}
static inline void closure_get(struct closure *cl)
static inline void closure_set_waiting(struct closure *cl, unsigned long f)
{
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
BUG_ON((atomic_inc_return(&cl->remaining) &
CLOSURE_REMAINING_MASK) <= 1);
#else
atomic_inc(&cl->remaining);
cl->waiting_on = f;
#endif
}
static inline void closure_set_stopped(struct closure *cl)
static inline void __closure_end_sleep(struct closure *cl)
{
atomic_sub(CLOSURE_RUNNING, &cl->remaining);
__set_current_state(TASK_RUNNING);
if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
}
static inline bool closure_is_unlocked(struct closure *cl)
static inline void __closure_start_sleep(struct closure *cl)
{
return atomic_read(&cl->remaining) == -1;
closure_set_ip(cl);
cl->task = current;
set_current_state(TASK_UNINTERRUPTIBLE);
if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
atomic_add(CLOSURE_SLEEPING, &cl->remaining);
}
static inline void do_closure_init(struct closure *cl, struct closure *parent,
bool running)
static inline void closure_set_stopped(struct closure *cl)
{
cl->parent = parent;
if (parent)
closure_get(parent);
if (running) {
closure_debug_create(cl);
atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
} else
atomic_set(&cl->remaining, -1);
atomic_sub(CLOSURE_RUNNING, &cl->remaining);
}
static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
struct workqueue_struct *wq)
{
BUG_ON(object_is_on_stack(cl));
closure_set_ip(cl);
cl->fn = fn;
cl->wq = wq;
/* pairs with the atomic_dec() in closure_put() */
smp_mb__before_atomic_dec();
}
/*
* Hack to get at the embedded closure if there is one, by doing an unsafe cast:
* the result of __closure_type() is thrown away, it's used merely for type
* checking.
*/
#define __to_internal_closure(cl) \
({ \
BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \
(struct closure *) cl; \
})
#define closure_init_type(cl, parent, running) \
do { \
struct closure *_cl = __to_internal_closure(cl); \
_cl->type = __closure_type(*(cl)); \
do_closure_init(_cl, parent, running); \
} while (0)
static inline void closure_queue(struct closure *cl)
{
struct workqueue_struct *wq = cl->wq;
if (wq) {
INIT_WORK(&cl->work, cl->work.func);
BUG_ON(!queue_work(wq, &cl->work));
} else
cl->fn(cl);
}
/**
* __closure_init() - Initialize a closure, skipping the memset()
*
* May be used instead of closure_init() when memory has already been zeroed.
* closure_get - increment a closure's refcount
*/
#define __closure_init(cl, parent) \
closure_init_type(cl, parent, true)
static inline void closure_get(struct closure *cl)
{
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
BUG_ON((atomic_inc_return(&cl->remaining) &
CLOSURE_REMAINING_MASK) <= 1);
#else
atomic_inc(&cl->remaining);
#endif
}
/**
* closure_init() - Initialize a closure, setting the refcount to 1
* closure_init - Initialize a closure, setting the refcount to 1
* @cl: closure to initialize
* @parent: parent of the new closure. cl will take a refcount on it for its
* lifetime; may be NULL.
*/
#define closure_init(cl, parent) \
do { \
memset((cl), 0, sizeof(*(cl))); \
__closure_init(cl, parent); \
} while (0)
static inline void closure_init_stack(struct closure *cl)
static inline void closure_init(struct closure *cl, struct closure *parent)
{
memset(cl, 0, sizeof(struct closure));
atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
}
/**
* closure_init_unlocked() - Initialize a closure but leave it unlocked.
* @cl: closure to initialize
*
* For when the closure will be used as a lock. The closure may not be used
* until after a closure_lock() or closure_trylock().
*/
#define closure_init_unlocked(cl) \
do { \
memset((cl), 0, sizeof(*(cl))); \
closure_init_type(cl, NULL, false); \
} while (0)
/**
* closure_lock() - lock and initialize a closure.
* @cl: the closure to lock
* @parent: the new parent for this closure
*
* The closure must be of one of the types that has a waitlist (otherwise we
* wouldn't be able to sleep on contention).
*
* @parent has exactly the same meaning as in closure_init(); if non null, the
* closure will take a reference on @parent which will be released when it is
* unlocked.
*/
#define closure_lock(cl, parent) \
__closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)
cl->parent = parent;
if (parent)
closure_get(parent);
static inline void __closure_end_sleep(struct closure *cl)
{
__set_current_state(TASK_RUNNING);
atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
closure_debug_create(cl);
closure_set_ip(cl);
}
static inline void __closure_start_sleep(struct closure *cl)
static inline void closure_init_stack(struct closure *cl)
{
closure_set_ip(cl);
cl->task = current;
set_current_state(TASK_UNINTERRUPTIBLE);
if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
atomic_add(CLOSURE_SLEEPING, &cl->remaining);
memset(cl, 0, sizeof(struct closure));
atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
}
/**
* closure_wake_up() - wake up all closures on a wait list.
* closure_wake_up - wake up all closures on a wait list.
*/
static inline void closure_wake_up(struct closure_waitlist *list)
{
......@@ -428,69 +303,19 @@ static inline void closure_wake_up(struct closure_waitlist *list)
__closure_wake_up(list);
}
/*
* Wait on an event, synchronously or asynchronously - analogous to wait_event()
* but for closures.
*
* The loop is oddly structured so as to avoid a race; we must check the
 * condition again after we've added ourselves to the waitlist. We know if we
 * were already on the waitlist because closure_wait() returns false; thus, we
 * only schedule or break if closure_wait() returns false. If it returns true,
 * we just loop again - rechecking the condition.
 *
 * The __closure_wake_up() is necessary because we may race with the event
 * becoming true; i.e. we see event false -> wait -> recheck condition, but the
 * thread that made the event true may have called closure_wake_up() before we
 * added ourselves to the wait list.
 *
 * We have to call closure_sync() at the end instead of just
 * __closure_end_sleep() because a different thread might've called
 * closure_wake_up() before us and gotten preempted before they dropped the
 * refcount on our closure. If this were a stack-allocated closure, that would
 * be bad.
/**
* continue_at - jump to another function with barrier
*
* After @cl is no longer waiting on anything (i.e. all outstanding refs have
* been dropped with closure_put()), it will resume execution at @fn running out
* of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
*
* NOTE: This macro expands to a return in the calling function!
*
* This is because after calling continue_at() you no longer have a ref on @cl,
* and whatever @cl owns may be freed out from under you - a running closure fn
* has a ref on its own closure which continue_at() drops.
*/
#define closure_wait_event(list, cl, condition) \
({ \
typeof(condition) ret; \
\
while (1) { \
ret = (condition); \
if (ret) { \
__closure_wake_up(list); \
closure_sync(cl); \
break; \
} \
\
__closure_start_sleep(cl); \
\
if (!closure_wait(list, cl)) \
schedule(); \
} \
\
ret; \
})
static inline void closure_queue(struct closure *cl)
{
struct workqueue_struct *wq = cl->wq;
if (wq) {
INIT_WORK(&cl->work, cl->work.func);
BUG_ON(!queue_work(wq, &cl->work));
} else
cl->fn(cl);
}
static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
struct workqueue_struct *wq)
{
BUG_ON(object_is_on_stack(cl));
closure_set_ip(cl);
cl->fn = fn;
cl->wq = wq;
/* pairs with the atomic_dec() in closure_put() */
smp_mb__before_atomic_dec();
}
#define continue_at(_cl, _fn, _wq) \
do { \
set_closure_fn(_cl, _fn, _wq); \
......@@ -498,8 +323,28 @@ do { \
return; \
} while (0)
/**
* closure_return - finish execution of a closure
*
* This is used to indicate that @cl is finished: when all outstanding refs on
 * @cl have been dropped, @cl's ref on its parent closure (as passed to
* closure_init()) will be dropped, if one was specified - thus this can be
* thought of as returning to the parent closure.
*/
#define closure_return(_cl) continue_at((_cl), NULL, NULL)
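(Illustrative sketch, not part of the patch: chaining two stages with continue_at() and finishing with closure_return(). The function names are hypothetical; system_wq is the stock kernel workqueue.)

static void my_op_stage2(struct closure *cl)
{
	/* Runs only once every ref taken during stage 1 has been dropped. */
	closure_return(cl);
}

static void my_op_stage1(struct closure *cl)
{
	/* ... submit IO or queue work that holds its own refs on cl ... */
	continue_at(cl, my_op_stage2, system_wq);
	/* never reached: continue_at() expands to a return */
}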
/**
* continue_at_nobarrier - jump to another function without barrier
*
* Causes @fn to be executed out of @cl, in @wq context (or called directly if
* @wq is NULL).
*
* NOTE: like continue_at(), this macro expands to a return in the caller!
*
* The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
* thus it's not safe to touch anything protected by @cl after a
* continue_at_nobarrier().
*/
#define continue_at_nobarrier(_cl, _fn, _wq) \
do { \
set_closure_fn(_cl, _fn, _wq); \
......@@ -507,6 +352,15 @@ do { \
return; \
} while (0)
/**
* closure_return - finish execution of a closure, with destructor
*
* Works like closure_return(), except @destructor will be called when all
* outstanding refs on @cl have been dropped; @destructor may be used to safely
* free the memory occupied by @cl, and it is called with the ref on the parent
* closure still held - so @destructor could safely return an item to a
* freelist protected by @cl's parent.
*/
#define closure_return_with_destructor(_cl, _destructor) \
do { \
set_closure_fn(_cl, _destructor, NULL); \
......@@ -514,6 +368,13 @@ do { \
return; \
} while (0)
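(Illustrative sketch, not part of the patch: a heap-allocated op that frees itself once the last ref on its closure is dropped. struct my_op and the function names are hypothetical; kfree() is from <linux/slab.h>.)

struct my_op {
	struct closure	cl;
	/* ... */
};

static void my_op_free(struct closure *cl)
{
	struct my_op *op = container_of(cl, struct my_op, cl);

	kfree(op);	/* safe: runs only after the final closure_put() */
}

static void my_op_finish(struct closure *cl)
{
	closure_return_with_destructor(cl, my_op_free);
}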
/**
* closure_call - execute @fn out of a new, uninitialized closure
*
* Typically used when running out of one closure, and we want to run @fn
* asynchronously out of a new closure - @parent will then wait for @cl to
* finish.
*/
static inline void closure_call(struct closure *cl, closure_fn fn,
struct workqueue_struct *wq,
struct closure *parent)
......@@ -522,12 +383,4 @@ static inline void closure_call(struct closure *cl, closure_fn fn,
continue_at_nobarrier(cl, fn, wq);
}
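(Illustrative sketch, not part of the patch, reusing the hypothetical struct my_op from the sketch above: starting asynchronous work out of a fresh closure whose parent is woken when it finishes. This is the same shape as the journal_try_write() change further down, which calls closure_call() on the journal's closure with &c->cl as parent.)

static void do_index_write(struct closure *cl)
{
	/* ... issue IO that takes its own refs on cl ... */
	closure_return(cl);	/* releases the ref held on the parent */
}

static void start_index_write(struct my_op *op, struct closure *parent)
{
	closure_call(&op->cl, do_index_write, system_wq, parent);
}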
static inline void closure_trylock_call(struct closure *cl, closure_fn fn,
struct workqueue_struct *wq,
struct closure *parent)
{
if (closure_trylock(cl, parent))
continue_at_nobarrier(cl, fn, wq);
}
#endif /* _LINUX_CLOSURE_H */
......@@ -8,6 +8,7 @@
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"
#include <linux/console.h>
#include <linux/debugfs.h>
......@@ -17,156 +18,88 @@
static struct dentry *debug;
const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
{
unsigned i;
for (i = 0; i < KEY_PTRS(k); i++)
if (ptr_available(c, k, i)) {
struct cache *ca = PTR_CACHE(c, k, i);
size_t bucket = PTR_BUCKET_NR(c, k, i);
size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
if (KEY_SIZE(k) + r > c->sb.bucket_size)
return "bad, length too big";
if (bucket < ca->sb.first_bucket)
return "bad, short offset";
if (bucket >= ca->sb.nbuckets)
return "bad, offset past end of device";
if (ptr_stale(c, k, i))
return "stale";
}
if (!bkey_cmp(k, &ZERO_KEY))
return "bad, null key";
if (!KEY_PTRS(k))
return "bad, no pointers";
if (!KEY_SIZE(k))
return "zeroed key";
return "";
}
int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
{
unsigned i = 0;
char *out = buf, *end = buf + size;
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k));
if (KEY_PTRS(k))
while (1) {
p("%llu:%llu gen %llu",
PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i));
if (++i == KEY_PTRS(k))
break;
p(", ");
}
p("]");
if (KEY_DIRTY(k))
p(" dirty");
if (KEY_CSUM(k))
p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
#undef p
return out - buf;
}
#ifdef CONFIG_BCACHE_DEBUG
static void dump_bset(struct btree *b, struct bset *i)
{
struct bkey *k, *next;
unsigned j;
char buf[80];
for (k = i->start; k < end(i); k = next) {
next = bkey_next(k);
bch_bkey_to_text(buf, sizeof(buf), k);
printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
(uint64_t *) k - i->d, i->keys, buf);
for (j = 0; j < KEY_PTRS(k); j++) {
size_t n = PTR_BUCKET_NR(b->c, k, j);
printk(" bucket %zu", n);
if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
printk(" prio %i",
PTR_BUCKET(b->c, k, j)->prio);
}
#define for_each_written_bset(b, start, i) \
for (i = (start); \
(void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\
i->seq == (start)->seq; \
i = (void *) i + set_blocks(i, block_bytes(b->c)) * \
block_bytes(b->c))
printk(" %s\n", bch_ptr_status(b->c, k));
if (next < end(i) &&
bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0)
printk(KERN_ERR "Key skipped backwards\n");
}
}
static void bch_dump_bucket(struct btree *b)
{
unsigned i;
console_lock();
for (i = 0; i <= b->nsets; i++)
dump_bset(b, b->sets[i].data);
console_unlock();
}
void bch_btree_verify(struct btree *b, struct bset *new)
void bch_btree_verify(struct btree *b)
{
struct btree *v = b->c->verify_data;
struct closure cl;
closure_init_stack(&cl);
struct bset *ondisk, *sorted, *inmemory;
struct bio *bio;
if (!b->c->verify)
if (!b->c->verify || !b->c->verify_ondisk)
return;
closure_wait_event(&b->io.wait, &cl,
atomic_read(&b->io.cl.remaining) == -1);
down(&b->io_mutex);
mutex_lock(&b->c->verify_lock);
ondisk = b->c->verify_ondisk;
sorted = b->c->verify_data->keys.set->data;
inmemory = b->keys.set->data;
bkey_copy(&v->key, &b->key);
v->written = 0;
v->level = b->level;
v->keys.ops = b->keys.ops;
bio = bch_bbio_alloc(b->c);
bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev;
bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0);
bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9;
bch_bio_map(bio, sorted);
bch_btree_node_read(v);
closure_wait_event(&v->io.wait, &cl,
atomic_read(&b->io.cl.remaining) == -1);
submit_bio_wait(REQ_META|READ_SYNC, bio);
bch_bbio_free(bio, b->c);
if (new->keys != v->sets[0].data->keys ||
memcmp(new->start,
v->sets[0].data->start,
(void *) end(new) - (void *) new->start)) {
unsigned i, j;
memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9);
bch_btree_node_read_done(v);
sorted = v->keys.set->data;
if (inmemory->keys != sorted->keys ||
memcmp(inmemory->start,
sorted->start,
(void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) {
struct bset *i;
unsigned j;
console_lock();
printk(KERN_ERR "*** original memory node:\n");
for (i = 0; i <= b->nsets; i++)
dump_bset(b, b->sets[i].data);
printk(KERN_ERR "*** in memory:\n");
bch_dump_bset(&b->keys, inmemory, 0);
printk(KERN_ERR "*** sorted memory node:\n");
dump_bset(b, new);
printk(KERN_ERR "*** read back in:\n");
bch_dump_bset(&v->keys, sorted, 0);
printk(KERN_ERR "*** on disk node:\n");
dump_bset(v, v->sets[0].data);
for_each_written_bset(b, ondisk, i) {
unsigned block = ((void *) i - (void *) ondisk) /
block_bytes(b->c);
printk(KERN_ERR "*** on disk block %u:\n", block);
bch_dump_bset(&b->keys, i, block);
}
for (j = 0; j < new->keys; j++)
if (new->d[j] != v->sets[0].data->d[j])
printk(KERN_ERR "*** block %zu not written\n",
((void *) i - (void *) ondisk) / block_bytes(b->c));
for (j = 0; j < inmemory->keys; j++)
if (inmemory->d[j] != sorted->d[j])
break;
printk(KERN_ERR "b->written %u\n", b->written);
console_unlock();
panic("verify failed at %u\n", j);
}
mutex_unlock(&b->c->verify_lock);
up(&b->io_mutex);
}
void bch_data_verify(struct cached_dev *dc, struct bio *bio)
......@@ -207,74 +140,6 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
bio_put(check);
}
int __bch_count_data(struct btree *b)
{
unsigned ret = 0;
struct btree_iter iter;
struct bkey *k;
if (!b->level)
for_each_key(b, k, &iter)
ret += KEY_SIZE(k);
return ret;
}
void __bch_check_keys(struct btree *b, const char *fmt, ...)
{
va_list args;
struct bkey *k, *p = NULL;
struct btree_iter iter;
const char *err;
for_each_key(b, k, &iter) {
if (!b->level) {
err = "Keys out of order";
if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
goto bug;
if (bch_ptr_invalid(b, k))
continue;
err = "Overlapping keys";
if (p && bkey_cmp(p, &START_KEY(k)) > 0)
goto bug;
} else {
if (bch_ptr_bad(b, k))
continue;
err = "Duplicate keys";
if (p && !bkey_cmp(p, k))
goto bug;
}
p = k;
}
err = "Key larger than btree node key";
if (p && bkey_cmp(p, &b->key) > 0)
goto bug;
return;
bug:
bch_dump_bucket(b);
va_start(args, fmt);
vprintk(fmt, args);
va_end(args);
panic("bcache error: %s:\n", err);
}
void bch_btree_iter_next_check(struct btree_iter *iter)
{
struct bkey *k = iter->data->k, *next = bkey_next(k);
if (next < iter->data->end &&
bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) {
bch_dump_bucket(iter->b);
panic("Key skipped backwards\n");
}
}
#endif
#ifdef CONFIG_DEBUG_FS
......@@ -321,7 +186,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
if (!w)
break;
bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key);
bch_extent_to_text(kbuf, sizeof(kbuf), &w->key);
i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
bch_keybuf_del(&i->keys, w);
}
......
#ifndef _BCACHE_DEBUG_H
#define _BCACHE_DEBUG_H
/* Btree/bkey debug printing */
int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k);
struct bio;
struct cached_dev;
struct cache_set;
#ifdef CONFIG_BCACHE_DEBUG
void bch_btree_verify(struct btree *, struct bset *);
void bch_btree_verify(struct btree *);
void bch_data_verify(struct cached_dev *, struct bio *);
int __bch_count_data(struct btree *);
void __bch_check_keys(struct btree *, const char *, ...);
void bch_btree_iter_next_check(struct btree_iter *);
#define EBUG_ON(cond) BUG_ON(cond)
#define expensive_debug_checks(c) ((c)->expensive_debug_checks)
#define key_merging_disabled(c) ((c)->key_merging_disabled)
#define bypass_torture_test(d) ((d)->bypass_torture_test)
#else /* DEBUG */
static inline void bch_btree_verify(struct btree *b, struct bset *i) {}
static inline void bch_btree_verify(struct btree *b) {}
static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
static inline int __bch_count_data(struct btree *b) { return -1; }
static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {}
static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
#define EBUG_ON(cond) do { if (cond); } while (0)
#define expensive_debug_checks(c) 0
#define key_merging_disabled(c) 0
#define bypass_torture_test(d) 0
#endif
#define bch_count_data(b) \
(expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1)
#define bch_check_keys(b, ...) \
do { \
if (expensive_debug_checks((b)->c)) \
__bch_check_keys(b, __VA_ARGS__); \
} while (0)
#ifdef CONFIG_DEBUG_FS
void bch_debug_init_cache_set(struct cache_set *);
#else
......
/*
* Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
*
* Uses a block device as cache for other block devices; optimized for SSDs.
* All allocation is done in buckets, which should match the erase block size
* of the device.
*
* Buckets containing cached data are kept on a heap sorted by priority;
* bucket priority is increased on cache hit, and periodically all the buckets
* on the heap have their priority scaled down. This currently is just used as
* an LRU but in the future should allow for more intelligent heuristics.
*
* Buckets have an 8 bit counter; freeing is accomplished by incrementing the
* counter. Garbage collection is used to remove stale pointers.
*
* Indexing is done via a btree; nodes are not necessarily fully sorted, rather
* as keys are inserted we only sort the pages that have not yet been written.
* When garbage collection is run, we resort the entire node.
*
* All configuration is done via sysfs; see Documentation/bcache.txt.
*/
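(Illustrative sketch, not part of the patch: the generation-counter idea from the comment above, reduced to a standalone user-space program. None of this is bcache code - it only shows that bumping a bucket's generation invalidates every pointer that recorded the old generation, without touching the pointers themselves.)

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct bucket { uint8_t gen; };
struct ptr    { unsigned bucket; uint8_t gen; };

static bool ptr_is_stale(const struct bucket *buckets, struct ptr p)
{
	return buckets[p.bucket].gen != p.gen;
}

int main(void)
{
	struct bucket buckets[1] = { { .gen = 3 } };
	struct ptr p = { .bucket = 0, .gen = 3 };

	printf("stale before free: %d\n", ptr_is_stale(buckets, p));	/* 0 */
	buckets[0].gen++;	/* "freeing" just bumps the generation */
	printf("stale after free:  %d\n", ptr_is_stale(buckets, p));	/* 1 */
	return 0;
}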
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"
#include "writeback.h"
static void sort_key_next(struct btree_iter *iter,
struct btree_iter_set *i)
{
i->k = bkey_next(i->k);
if (i->k == i->end)
*i = iter->data[--iter->used];
}
static bool bch_key_sort_cmp(struct btree_iter_set l,
struct btree_iter_set r)
{
int64_t c = bkey_cmp(l.k, r.k);
return c ? c > 0 : l.k < r.k;
}
static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
{
unsigned i;
for (i = 0; i < KEY_PTRS(k); i++)
if (ptr_available(c, k, i)) {
struct cache *ca = PTR_CACHE(c, k, i);
size_t bucket = PTR_BUCKET_NR(c, k, i);
size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
if (KEY_SIZE(k) + r > c->sb.bucket_size ||
bucket < ca->sb.first_bucket ||
bucket >= ca->sb.nbuckets)
return true;
}
return false;
}
/* Common among btree and extent ptrs */
static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
{
unsigned i;
for (i = 0; i < KEY_PTRS(k); i++)
if (ptr_available(c, k, i)) {
struct cache *ca = PTR_CACHE(c, k, i);
size_t bucket = PTR_BUCKET_NR(c, k, i);
size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
if (KEY_SIZE(k) + r > c->sb.bucket_size)
return "bad, length too big";
if (bucket < ca->sb.first_bucket)
return "bad, short offset";
if (bucket >= ca->sb.nbuckets)
return "bad, offset past end of device";
if (ptr_stale(c, k, i))
return "stale";
}
if (!bkey_cmp(k, &ZERO_KEY))
return "bad, null key";
if (!KEY_PTRS(k))
return "bad, no pointers";
if (!KEY_SIZE(k))
return "zeroed key";
return "";
}
void bch_extent_to_text(char *buf, size_t size, const struct bkey *k)
{
unsigned i = 0;
char *out = buf, *end = buf + size;
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k));
for (i = 0; i < KEY_PTRS(k); i++) {
if (i)
p(", ");
if (PTR_DEV(k, i) == PTR_CHECK_DEV)
p("check dev");
else
p("%llu:%llu gen %llu", PTR_DEV(k, i),
PTR_OFFSET(k, i), PTR_GEN(k, i));
}
p("]");
if (KEY_DIRTY(k))
p(" dirty");
if (KEY_CSUM(k))
p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
#undef p
}
static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
{
struct btree *b = container_of(keys, struct btree, keys);
unsigned j;
char buf[80];
bch_extent_to_text(buf, sizeof(buf), k);
printk(" %s", buf);
for (j = 0; j < KEY_PTRS(k); j++) {
size_t n = PTR_BUCKET_NR(b->c, k, j);
printk(" bucket %zu", n);
if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
printk(" prio %i",
PTR_BUCKET(b->c, k, j)->prio);
}
printk(" %s\n", bch_ptr_status(b->c, k));
}
/* Btree ptrs */
bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
{
char buf[80];
if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
goto bad;
if (__ptr_invalid(c, k))
goto bad;
return false;
bad:
bch_extent_to_text(buf, sizeof(buf), k);
cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
return true;
}
static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
{
struct btree *b = container_of(bk, struct btree, keys);
return __bch_btree_ptr_invalid(b->c, k);
}
static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
{
unsigned i;
char buf[80];
struct bucket *g;
if (mutex_trylock(&b->c->bucket_lock)) {
for (i = 0; i < KEY_PTRS(k); i++)
if (ptr_available(b->c, k, i)) {
g = PTR_BUCKET(b->c, k, i);
if (KEY_DIRTY(k) ||
g->prio != BTREE_PRIO ||
(b->c->gc_mark_valid &&
GC_MARK(g) != GC_MARK_METADATA))
goto err;
}
mutex_unlock(&b->c->bucket_lock);
}
return false;
err:
mutex_unlock(&b->c->bucket_lock);
bch_extent_to_text(buf, sizeof(buf), k);
btree_bug(b,
"inconsistent btree pointer %s: bucket %li pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
return true;
}
static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k)
{
struct btree *b = container_of(bk, struct btree, keys);
unsigned i;
if (!bkey_cmp(k, &ZERO_KEY) ||
!KEY_PTRS(k) ||
bch_ptr_invalid(bk, k))
return true;
for (i = 0; i < KEY_PTRS(k); i++)
if (!ptr_available(b->c, k, i) ||
ptr_stale(b->c, k, i))
return true;
if (expensive_debug_checks(b->c) &&
btree_ptr_bad_expensive(b, k))
return true;
return false;
}
static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
struct bkey *insert,
struct btree_iter *iter,
struct bkey *replace_key)
{
struct btree *b = container_of(bk, struct btree, keys);
if (!KEY_OFFSET(insert))
btree_current_write(b)->prio_blocked++;
return false;
}
const struct btree_keys_ops bch_btree_keys_ops = {
.sort_cmp = bch_key_sort_cmp,
.insert_fixup = bch_btree_ptr_insert_fixup,
.key_invalid = bch_btree_ptr_invalid,
.key_bad = bch_btree_ptr_bad,
.key_to_text = bch_extent_to_text,
.key_dump = bch_bkey_dump,
};
/* Extents */
/*
* Returns true if l > r - unless l == r, in which case returns true if l is
* older than r.
*
* Necessary for btree_sort_fixup() - if there are multiple keys that compare
* equal in different sets, we have to process them newest to oldest.
*/
static bool bch_extent_sort_cmp(struct btree_iter_set l,
struct btree_iter_set r)
{
int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
return c ? c > 0 : l.k < r.k;
}
static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
struct bkey *tmp)
{
while (iter->used > 1) {
struct btree_iter_set *top = iter->data, *i = top + 1;
if (iter->used > 2 &&
bch_extent_sort_cmp(i[0], i[1]))
i++;
if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
break;
if (!KEY_SIZE(i->k)) {
sort_key_next(iter, i);
heap_sift(iter, i - top, bch_extent_sort_cmp);
continue;
}
if (top->k > i->k) {
if (bkey_cmp(top->k, i->k) >= 0)
sort_key_next(iter, i);
else
bch_cut_front(top->k, i->k);
heap_sift(iter, i - top, bch_extent_sort_cmp);
} else {
/* can't happen because of comparison func */
BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
if (bkey_cmp(i->k, top->k) < 0) {
bkey_copy(tmp, top->k);
bch_cut_back(&START_KEY(i->k), tmp);
bch_cut_front(i->k, top->k);
heap_sift(iter, 0, bch_extent_sort_cmp);
return tmp;
} else {
bch_cut_back(&START_KEY(i->k), top->k);
}
}
}
return NULL;
}
static bool bch_extent_insert_fixup(struct btree_keys *b,
struct bkey *insert,
struct btree_iter *iter,
struct bkey *replace_key)
{
struct cache_set *c = container_of(b, struct btree, keys)->c;
void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
{
if (KEY_DIRTY(k))
bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
offset, -sectors);
}
uint64_t old_offset;
unsigned old_size, sectors_found = 0;
BUG_ON(!KEY_OFFSET(insert));
BUG_ON(!KEY_SIZE(insert));
while (1) {
struct bkey *k = bch_btree_iter_next(iter);
if (!k)
break;
if (bkey_cmp(&START_KEY(k), insert) >= 0) {
if (KEY_SIZE(k))
break;
else
continue;
}
if (bkey_cmp(k, &START_KEY(insert)) <= 0)
continue;
old_offset = KEY_START(k);
old_size = KEY_SIZE(k);
/*
* We might overlap with 0 size extents; we can't skip these
* because if they're in the set we're inserting to we have to
* adjust them so they don't overlap with the key we're
* inserting. But we don't want to check them for replace
* operations.
*/
if (replace_key && KEY_SIZE(k)) {
/*
* k might have been split since we inserted/found the
* key we're replacing
*/
unsigned i;
uint64_t offset = KEY_START(k) -
KEY_START(replace_key);
/* But it must be a subset of the replace key */
if (KEY_START(k) < KEY_START(replace_key) ||
KEY_OFFSET(k) > KEY_OFFSET(replace_key))
goto check_failed;
/* We didn't find a key that we were supposed to */
if (KEY_START(k) > KEY_START(insert) + sectors_found)
goto check_failed;
if (!bch_bkey_equal_header(k, replace_key))
goto check_failed;
/* skip past gen */
offset <<= 8;
BUG_ON(!KEY_PTRS(replace_key));
for (i = 0; i < KEY_PTRS(replace_key); i++)
if (k->ptr[i] != replace_key->ptr[i] + offset)
goto check_failed;
sectors_found = KEY_OFFSET(k) - KEY_START(insert);
}
if (bkey_cmp(insert, k) < 0 &&
bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
/*
* We overlapped in the middle of an existing key: that
* means we have to split the old key. But we have to do
* slightly different things depending on whether the
* old key has been written out yet.
*/
struct bkey *top;
subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
if (bkey_written(b, k)) {
/*
* We insert a new key to cover the top of the
* old key, and the old key is modified in place
* to represent the bottom split.
*
* It's completely arbitrary whether the new key
* is the top or the bottom, but it has to match
* up with what btree_sort_fixup() does - it
* doesn't check for this kind of overlap, it
* depends on us inserting a new key for the top
* here.
*/
top = bch_bset_search(b, bset_tree_last(b),
insert);
bch_bset_insert(b, top, k);
} else {
BKEY_PADDED(key) temp;
bkey_copy(&temp.key, k);
bch_bset_insert(b, k, &temp.key);
top = bkey_next(k);
}
bch_cut_front(insert, top);
bch_cut_back(&START_KEY(insert), k);
bch_bset_fix_invalidated_key(b, k);
goto out;
}
if (bkey_cmp(insert, k) < 0) {
bch_cut_front(insert, k);
} else {
if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
old_offset = KEY_START(insert);
if (bkey_written(b, k) &&
bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
/*
* Completely overwrote, so we don't have to
* invalidate the binary search tree
*/
bch_cut_front(k, k);
} else {
__bch_cut_back(&START_KEY(insert), k);
bch_bset_fix_invalidated_key(b, k);
}
}
subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
}
check_failed:
if (replace_key) {
if (!sectors_found) {
return true;
} else if (sectors_found < KEY_SIZE(insert)) {
SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
(KEY_SIZE(insert) - sectors_found));
SET_KEY_SIZE(insert, sectors_found);
}
}
out:
if (KEY_DIRTY(insert))
bcache_dev_sectors_dirty_add(c, KEY_INODE(insert),
KEY_START(insert),
KEY_SIZE(insert));
return false;
}
static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
{
struct btree *b = container_of(bk, struct btree, keys);
char buf[80];
if (!KEY_SIZE(k))
return true;
if (KEY_SIZE(k) > KEY_OFFSET(k))
goto bad;
if (__ptr_invalid(b->c, k))
goto bad;
return false;
bad:
bch_extent_to_text(buf, sizeof(buf), k);
cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k));
return true;
}
static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
unsigned ptr)
{
struct bucket *g = PTR_BUCKET(b->c, k, ptr);
char buf[80];
if (mutex_trylock(&b->c->bucket_lock)) {
if (b->c->gc_mark_valid &&
((GC_MARK(g) != GC_MARK_DIRTY &&
KEY_DIRTY(k)) ||
GC_MARK(g) == GC_MARK_METADATA))
goto err;
if (g->prio == BTREE_PRIO)
goto err;
mutex_unlock(&b->c->bucket_lock);
}
return false;
err:
mutex_unlock(&b->c->bucket_lock);
bch_extent_to_text(buf, sizeof(buf), k);
btree_bug(b,
"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
return true;
}
static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
{
struct btree *b = container_of(bk, struct btree, keys);
struct bucket *g;
unsigned i, stale;
if (!KEY_PTRS(k) ||
bch_extent_invalid(bk, k))
return true;
for (i = 0; i < KEY_PTRS(k); i++)
if (!ptr_available(b->c, k, i))
return true;
if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
return false;
for (i = 0; i < KEY_PTRS(k); i++) {
g = PTR_BUCKET(b->c, k, i);
stale = ptr_stale(b->c, k, i);
btree_bug_on(stale > 96, b,
"key too stale: %i, need_gc %u",
stale, b->c->need_gc);
btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
b, "stale dirty pointer");
if (stale)
return true;
if (expensive_debug_checks(b->c) &&
bch_extent_bad_expensive(b, k, i))
return true;
}
return false;
}
static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
{
return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
~((uint64_t)1 << 63);
}
static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r)
{
struct btree *b = container_of(bk, struct btree, keys);
unsigned i;
if (key_merging_disabled(b->c))
return false;
for (i = 0; i < KEY_PTRS(l); i++)
if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
return false;
/* Keys with no pointers aren't restricted to one bucket and could
* overflow KEY_SIZE
*/
if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
SET_KEY_SIZE(l, USHRT_MAX);
bch_cut_front(l, r);
return false;
}
if (KEY_CSUM(l)) {
if (KEY_CSUM(r))
l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
else
SET_KEY_CSUM(l, 0);
}
SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
return true;
}
const struct btree_keys_ops bch_extent_keys_ops = {
.sort_cmp = bch_extent_sort_cmp,
.sort_fixup = bch_extent_sort_fixup,
.insert_fixup = bch_extent_insert_fixup,
.key_invalid = bch_extent_invalid,
.key_bad = bch_extent_bad,
.key_merge = bch_extent_merge,
.key_to_text = bch_extent_to_text,
.key_dump = bch_bkey_dump,
.is_extents = true,
};
#ifndef _BCACHE_EXTENTS_H
#define _BCACHE_EXTENTS_H
extern const struct btree_keys_ops bch_btree_keys_ops;
extern const struct btree_keys_ops bch_extent_keys_ops;
struct bkey;
struct cache_set;
void bch_extent_to_text(char *, size_t, const struct bkey *);
bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
#endif /* _BCACHE_EXTENTS_H */
......@@ -44,11 +44,11 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list,
closure_init_stack(&cl);
pr_debug("reading %llu", (uint64_t) bucket);
pr_debug("reading %u", bucket_index);
while (offset < ca->sb.bucket_size) {
reread: left = ca->sb.bucket_size - offset;
len = min_t(unsigned, left, PAGE_SECTORS * 8);
len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS);
bio_reset(bio);
bio->bi_iter.bi_sector = bucket + offset;
......@@ -74,19 +74,28 @@ reread: left = ca->sb.bucket_size - offset;
struct list_head *where;
size_t blocks, bytes = set_bytes(j);
if (j->magic != jset_magic(&ca->sb))
if (j->magic != jset_magic(&ca->sb)) {
pr_debug("%u: bad magic", bucket_index);
return ret;
}
if (bytes > left << 9)
if (bytes > left << 9 ||
bytes > PAGE_SIZE << JSET_BITS) {
pr_info("%u: too big, %zu bytes, offset %u",
bucket_index, bytes, offset);
return ret;
}
if (bytes > len << 9)
goto reread;
if (j->csum != csum_set(j))
if (j->csum != csum_set(j)) {
pr_info("%u: bad csum, %zu bytes, offset %u",
bucket_index, bytes, offset);
return ret;
}
blocks = set_blocks(j, ca->set);
blocks = set_blocks(j, block_bytes(ca->set));
while (!list_empty(list)) {
i = list_first_entry(list,
......@@ -275,7 +284,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
}
for (k = i->j.start;
k < end(&i->j);
k < bset_bkey_last(&i->j);
k = bkey_next(k)) {
unsigned j;
......@@ -313,7 +322,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
n, i->j.seq - 1, start, end);
for (k = i->j.start;
k < end(&i->j);
k < bset_bkey_last(&i->j);
k = bkey_next(k)) {
trace_bcache_journal_replay_key(k);
......@@ -555,6 +564,14 @@ static void journal_write_done(struct closure *cl)
continue_at_nobarrier(cl, journal_write, system_wq);
}
static void journal_write_unlock(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, journal.io);
c->journal.io_in_flight = 0;
spin_unlock(&c->journal.lock);
}
static void journal_write_unlocked(struct closure *cl)
__releases(c->journal.lock)
{
......@@ -562,22 +579,15 @@ static void journal_write_unlocked(struct closure *cl)
struct cache *ca;
struct journal_write *w = c->journal.cur;
struct bkey *k = &c->journal.key;
unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size;
unsigned i, sectors = set_blocks(w->data, block_bytes(c)) *
c->sb.block_size;
struct bio *bio;
struct bio_list list;
bio_list_init(&list);
if (!w->need_write) {
/*
* XXX: have to unlock closure before we unlock journal lock,
* else we race with bch_journal(). But this way we race
* against cache set unregister. Doh.
*/
set_closure_fn(cl, NULL, NULL);
closure_sub(cl, CLOSURE_RUNNING + 1);
spin_unlock(&c->journal.lock);
return;
closure_return_with_destructor(cl, journal_write_unlock);
} else if (journal_full(&c->journal)) {
journal_reclaim(c);
spin_unlock(&c->journal.lock);
......@@ -586,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl)
continue_at(cl, journal_write, system_wq);
}
c->journal.blocks_free -= set_blocks(w->data, c);
c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));
w->data->btree_level = c->root->level;
......@@ -653,10 +663,12 @@ static void journal_try_write(struct cache_set *c)
w->need_write = true;
if (closure_trylock(cl, &c->cl))
journal_write_unlocked(cl);
else
if (!c->journal.io_in_flight) {
c->journal.io_in_flight = 1;
closure_call(cl, journal_write_unlocked, NULL, &c->cl);
} else {
spin_unlock(&c->journal.lock);
}
}
static struct journal_write *journal_wait_for_write(struct cache_set *c,
......@@ -664,6 +676,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
{
size_t sectors;
struct closure cl;
bool wait = false;
closure_init_stack(&cl);
......@@ -673,16 +686,19 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
struct journal_write *w = c->journal.cur;
sectors = __set_blocks(w->data, w->data->keys + nkeys,
c) * c->sb.block_size;
block_bytes(c)) * c->sb.block_size;
if (sectors <= min_t(size_t,
c->journal.blocks_free * c->sb.block_size,
PAGE_SECTORS << JSET_BITS))
return w;
/* XXX: tracepoint */
if (wait)
closure_wait(&c->journal.wait, &cl);
if (!journal_full(&c->journal)) {
trace_bcache_journal_entry_full(c);
if (wait)
trace_bcache_journal_entry_full(c);
/*
* XXX: If we were inserting so many keys that they
......@@ -692,12 +708,11 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
*/
BUG_ON(!w->data->keys);
closure_wait(&w->wait, &cl);
journal_try_write(c); /* unlocks */
} else {
trace_bcache_journal_full(c);
if (wait)
trace_bcache_journal_full(c);
closure_wait(&c->journal.wait, &cl);
journal_reclaim(c);
spin_unlock(&c->journal.lock);
......@@ -706,6 +721,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
closure_sync(&cl);
spin_lock(&c->journal.lock);
wait = true;
}
}
......@@ -736,7 +752,7 @@ atomic_t *bch_journal(struct cache_set *c,
w = journal_wait_for_write(c, bch_keylist_nkeys(keys));
memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys));
memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys));
w->data->keys += bch_keylist_nkeys(keys);
ret = &fifo_back(&c->journal.pin);
......@@ -780,7 +796,6 @@ int bch_journal_alloc(struct cache_set *c)
{
struct journal *j = &c->journal;
closure_init_unlocked(&j->io);
spin_lock_init(&j->lock);
INIT_DELAYED_WORK(&j->work, journal_write_work);
......
......@@ -104,6 +104,7 @@ struct journal {
/* used when waiting because the journal was full */
struct closure_waitlist wait;
struct closure io;
int io_in_flight;
struct delayed_work work;
/* Number of blocks free in the bucket(s) we're currently writing to */
......
......@@ -211,7 +211,7 @@ void bch_moving_gc(struct cache_set *c)
for_each_cache(ca, c, i) {
unsigned sectors_to_move = 0;
unsigned reserve_sectors = ca->sb.bucket_size *
min(fifo_used(&ca->free), ca->free.size / 2);
fifo_used(&ca->free[RESERVE_MOVINGGC]);
ca->heap.used = 0;
......
......@@ -254,6 +254,24 @@ static void bch_data_insert_keys(struct closure *cl)
closure_return(cl);
}
static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
struct cache_set *c)
{
size_t oldsize = bch_keylist_nkeys(l);
size_t newsize = oldsize + u64s;
/*
* The journalling code doesn't handle the case where the keys to insert
 * are bigger than an empty write: if we just return -ENOMEM here,
* bio_insert() and bio_invalidate() will insert the keys created so far
* and finish the rest when the keylist is empty.
*/
if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
return -ENOMEM;
return __bch_keylist_realloc(l, u64s);
}
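(Illustrative arithmetic, not part of the patch: the limit enforced above means one insert's keys must fit in an otherwise-empty journal write, i.e. one btree block minus the jset header. The 4096-byte block and 64-byte header below are assumed figures, not values taken from the source.)

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	size_t block_bytes = 4096;	/* assumed block_bytes(c) */
	size_t jset_header = 64;	/* assumed sizeof(struct jset) */

	printf("max u64s per insert: %zu\n",
	       (block_bytes - jset_header) / sizeof(uint64_t));
	return 0;
}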
static void bch_data_invalidate(struct closure *cl)
{
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
......@@ -266,7 +284,7 @@ static void bch_data_invalidate(struct closure *cl)
unsigned sectors = min(bio_sectors(bio),
1U << (KEY_SIZE_BITS - 1));
if (bch_keylist_realloc(&op->insert_keys, 0, op->c))
if (bch_keylist_realloc(&op->insert_keys, 2, op->c))
goto out;
bio->bi_iter.bi_sector += sectors;
......@@ -356,7 +374,7 @@ static void bch_data_insert_start(struct closure *cl)
/* 1 for the device pointer and 1 for the chksum */
if (bch_keylist_realloc(&op->insert_keys,
1 + (op->csum ? 1 : 0),
3 + (op->csum ? 1 : 0),
op->c))
continue_at(cl, bch_data_insert_keys, bcache_wq);
......@@ -596,14 +614,12 @@ struct search {
/* Stack frame for bio_complete */
struct closure cl;
struct bcache_device *d;
struct bbio bio;
struct bio *orig_bio;
struct bio *cache_miss;
struct bcache_device *d;
unsigned insert_bio_sectors;
unsigned recoverable:1;
unsigned write:1;
unsigned read_dirty_data:1;
......@@ -629,7 +645,8 @@ static void bch_cache_read_endio(struct bio *bio, int error)
if (error)
s->iop.error = error;
else if (ptr_stale(s->iop.c, &b->key, 0)) {
else if (!KEY_DIRTY(&b->key) &&
ptr_stale(s->iop.c, &b->key, 0)) {
atomic_long_inc(&s->iop.c->cache_read_races);
s->iop.error = -EINTR;
}
......@@ -710,10 +727,13 @@ static void cache_lookup(struct closure *cl)
{
struct search *s = container_of(cl, struct search, iop.cl);
struct bio *bio = &s->bio.bio;
int ret;
bch_btree_op_init(&s->op, -1);
int ret = bch_btree_map_keys(&s->op, s->iop.c,
&KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
cache_lookup_fn, MAP_END_KEY);
ret = bch_btree_map_keys(&s->op, s->iop.c,
&KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
cache_lookup_fn, MAP_END_KEY);
if (ret == -EAGAIN)
continue_at(cl, cache_lookup, bcache_wq);
......@@ -754,12 +774,12 @@ static void bio_complete(struct search *s)
}
}
static void do_bio_hook(struct search *s)
static void do_bio_hook(struct search *s, struct bio *orig_bio)
{
struct bio *bio = &s->bio.bio;
bio_init(bio);
__bio_clone_fast(bio, s->orig_bio);
__bio_clone_fast(bio, orig_bio);
bio->bi_end_io = request_endio;
bio->bi_private = &s->cl;
......@@ -778,26 +798,32 @@ static void search_free(struct closure *cl)
mempool_free(s, s->d->c->search);
}
static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
static inline struct search *search_alloc(struct bio *bio,
struct bcache_device *d)
{
struct search *s;
s = mempool_alloc(d->c->search, GFP_NOIO);
memset(s, 0, offsetof(struct search, iop.insert_keys));
__closure_init(&s->cl, NULL);
closure_init(&s->cl, NULL);
do_bio_hook(s, bio);
s->iop.inode = d->id;
s->iop.c = d->c;
s->d = d;
s->op.lock = -1;
s->iop.write_point = hash_long((unsigned long) current, 16);
s->orig_bio = bio;
s->write = (bio->bi_rw & REQ_WRITE) != 0;
s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
s->cache_miss = NULL;
s->d = d;
s->recoverable = 1;
s->write = (bio->bi_rw & REQ_WRITE) != 0;
s->read_dirty_data = 0;
s->start_time = jiffies;
do_bio_hook(s);
s->iop.c = d->c;
s->iop.bio = NULL;
s->iop.inode = d->id;
s->iop.write_point = hash_long((unsigned long) current, 16);
s->iop.write_prio = 0;
s->iop.error = 0;
s->iop.flags = 0;
s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
return s;
}
......@@ -843,7 +869,7 @@ static void cached_dev_read_error(struct closure *cl)
trace_bcache_read_retry(s->orig_bio);
s->iop.error = 0;
do_bio_hook(s);
do_bio_hook(s, s->orig_bio);
/* XXX: invalidate cache */
......
......@@ -13,17 +13,22 @@ struct data_insert_op {
uint16_t write_prio;
short error;
unsigned bypass:1;
unsigned writeback:1;
unsigned flush_journal:1;
unsigned csum:1;
union {
uint16_t flags;
unsigned replace:1;
unsigned replace_collision:1;
struct {
unsigned bypass:1;
unsigned writeback:1;
unsigned flush_journal:1;
unsigned csum:1;
unsigned insert_data_done:1;
unsigned replace:1;
unsigned replace_collision:1;
unsigned insert_data_done:1;
};
};
/* Anything past this point won't get zeroed in search_alloc() */
struct keylist insert_keys;
BKEY_PADDED(replace_key);
};
......
......@@ -9,6 +9,7 @@
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"
#include "request.h"
#include "writeback.h"
......@@ -225,7 +226,7 @@ static void write_bdev_super_endio(struct bio *bio, int error)
struct cached_dev *dc = bio->bi_private;
/* XXX: error checking */
closure_put(&dc->sb_write.cl);
closure_put(&dc->sb_write);
}
static void __write_super(struct cache_sb *sb, struct bio *bio)
......@@ -263,12 +264,20 @@ static void __write_super(struct cache_sb *sb, struct bio *bio)
submit_bio(REQ_WRITE, bio);
}
static void bch_write_bdev_super_unlock(struct closure *cl)
{
struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
up(&dc->sb_write_mutex);
}
void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
{
struct closure *cl = &dc->sb_write.cl;
struct closure *cl = &dc->sb_write;
struct bio *bio = &dc->sb_bio;
closure_lock(&dc->sb_write, parent);
down(&dc->sb_write_mutex);
closure_init(cl, parent);
bio_reset(bio);
bio->bi_bdev = dc->bdev;
......@@ -278,7 +287,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
closure_get(cl);
__write_super(&dc->sb, bio);
closure_return(cl);
closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
}
static void write_super_endio(struct bio *bio, int error)
......@@ -286,16 +295,24 @@ static void write_super_endio(struct bio *bio, int error)
struct cache *ca = bio->bi_private;
bch_count_io_errors(ca, error, "writing superblock");
closure_put(&ca->set->sb_write.cl);
closure_put(&ca->set->sb_write);
}
static void bcache_write_super_unlock(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, sb_write);
up(&c->sb_write_mutex);
}
void bcache_write_super(struct cache_set *c)
{
struct closure *cl = &c->sb_write.cl;
struct closure *cl = &c->sb_write;
struct cache *ca;
unsigned i;
closure_lock(&c->sb_write, &c->cl);
down(&c->sb_write_mutex);
closure_init(cl, &c->cl);
c->sb.seq++;
......@@ -317,7 +334,7 @@ void bcache_write_super(struct cache_set *c)
__write_super(&ca->sb, bio);
}
closure_return(cl);
closure_return_with_destructor(cl, bcache_write_super_unlock);
}
/* UUID io */
......@@ -325,23 +342,31 @@ void bcache_write_super(struct cache_set *c)
static void uuid_endio(struct bio *bio, int error)
{
struct closure *cl = bio->bi_private;
struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl);
struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
cache_set_err_on(error, c, "accessing uuids");
bch_bbio_free(bio, c);
closure_put(cl);
}
static void uuid_io_unlock(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
up(&c->uuid_write_mutex);
}
static void uuid_io(struct cache_set *c, unsigned long rw,
struct bkey *k, struct closure *parent)
{
struct closure *cl = &c->uuid_write.cl;
struct closure *cl = &c->uuid_write;
struct uuid_entry *u;
unsigned i;
char buf[80];
BUG_ON(!parent);
closure_lock(&c->uuid_write, parent);
down(&c->uuid_write_mutex);
closure_init(cl, parent);
for (i = 0; i < KEY_PTRS(k); i++) {
struct bio *bio = bch_bbio_alloc(c);
......@@ -359,7 +384,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
break;
}
bch_bkey_to_text(buf, sizeof(buf), k);
bch_extent_to_text(buf, sizeof(buf), k);
pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);
for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
......@@ -368,14 +393,14 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
u - c->uuids, u->uuid, u->label,
u->first_reg, u->last_reg, u->invalidated);
closure_return(cl);
closure_return_with_destructor(cl, uuid_io_unlock);
}
static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
{
struct bkey *k = &j->uuid_bucket;
if (bch_btree_ptr_invalid(c, k))
if (__bch_btree_ptr_invalid(c, k))
return "bad uuid pointer";
bkey_copy(&c->uuid_bucket, k);
......@@ -420,7 +445,7 @@ static int __uuid_write(struct cache_set *c)
lockdep_assert_held(&bch_register_lock);
if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true))
if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
return 1;
SET_KEY_SIZE(&k.key, c->sb.bucket_size);
......@@ -538,8 +563,8 @@ void bch_prio_write(struct cache *ca)
atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
&ca->meta_sectors_written);
pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
fifo_used(&ca->free_inc), fifo_used(&ca->unused));
//pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
// fifo_used(&ca->free_inc), fifo_used(&ca->unused));
for (i = prio_buckets(ca) - 1; i >= 0; --i) {
long bucket;
......@@ -558,7 +583,7 @@ void bch_prio_write(struct cache *ca)
p->magic = pset_magic(&ca->sb);
p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true);
bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
BUG_ON(bucket == -1);
mutex_unlock(&ca->set->bucket_lock);
......@@ -1098,7 +1123,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
INIT_WORK(&dc->detach, cached_dev_detach_finish);
closure_init_unlocked(&dc->sb_write);
sema_init(&dc->sb_write_mutex, 1);
INIT_LIST_HEAD(&dc->io_lru);
spin_lock_init(&dc->io_lock);
bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
......@@ -1110,6 +1135,12 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
}
dc->disk.stripe_size = q->limits.io_opt >> 9;
if (dc->disk.stripe_size)
dc->partial_stripes_expensive =
q->limits.raid_partial_stripes_expensive;
ret = bcache_device_init(&dc->disk, block_size,
dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
if (ret)
......@@ -1321,8 +1352,8 @@ static void cache_set_free(struct closure *cl)
if (ca)
kobject_put(&ca->kobj);
bch_bset_sort_state_free(&c->sort);
free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
if (c->bio_split)
bioset_free(c->bio_split);
......@@ -1447,21 +1478,17 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
c->block_bits = ilog2(sb->block_size);
c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
c->btree_pages = c->sb.bucket_size / PAGE_SECTORS;
c->btree_pages = bucket_pages(c);
if (c->btree_pages > BTREE_MAX_PAGES)
c->btree_pages = max_t(int, c->btree_pages / 4,
BTREE_MAX_PAGES);
c->sort_crit_factor = int_sqrt(c->btree_pages);
closure_init_unlocked(&c->sb_write);
sema_init(&c->sb_write_mutex, 1);
mutex_init(&c->bucket_lock);
init_waitqueue_head(&c->try_wait);
init_waitqueue_head(&c->bucket_wait);
closure_init_unlocked(&c->uuid_write);
mutex_init(&c->sort_lock);
sema_init(&c->uuid_write_mutex, 1);
spin_lock_init(&c->sort_time.lock);
spin_lock_init(&c->btree_gc_time.lock);
spin_lock_init(&c->btree_split_time.lock);
spin_lock_init(&c->btree_read_time.lock);
......@@ -1489,11 +1516,11 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
bucket_pages(c))) ||
!(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
!(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
!(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
!(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
bch_journal_alloc(c) ||
bch_btree_cache_alloc(c) ||
bch_open_buckets_alloc(c))
bch_open_buckets_alloc(c) ||
bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
goto err;
c->congested_read_threshold_us = 2000;
......@@ -1549,7 +1576,7 @@ static void run_cache_set(struct cache_set *c)
k = &j->btree_root;
err = "bad btree root";
if (bch_btree_ptr_invalid(c, k))
if (__bch_btree_ptr_invalid(c, k))
goto err;
err = "error reading btree root";
......@@ -1743,6 +1770,7 @@ static const char *register_cache_set(struct cache *ca)
void bch_cache_release(struct kobject *kobj)
{
struct cache *ca = container_of(kobj, struct cache, kobj);
unsigned i;
if (ca->set)
ca->set->cache[ca->sb.nr_this_dev] = NULL;
......@@ -1756,7 +1784,9 @@ void bch_cache_release(struct kobject *kobj)
free_heap(&ca->heap);
free_fifo(&ca->unused);
free_fifo(&ca->free_inc);
free_fifo(&ca->free);
for (i = 0; i < RESERVE_NR; i++)
free_fifo(&ca->free[i]);
if (ca->sb_bio.bi_inline_vecs[0].bv_page)
put_page(ca->sb_bio.bi_io_vec[0].bv_page);
......@@ -1782,10 +1812,12 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
ca->journal.bio.bi_max_vecs = 8;
ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
free = roundup_pow_of_two(ca->sb.nbuckets) >> 9;
free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2);
free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
if (!init_fifo(&ca->free, free, GFP_KERNEL) ||
if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
!init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
!init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
......@@ -2030,7 +2062,8 @@ static void bcache_exit(void)
kobject_put(bcache_kobj);
if (bcache_wq)
destroy_workqueue(bcache_wq);
unregister_blkdev(bcache_major, "bcache");
if (bcache_major)
unregister_blkdev(bcache_major, "bcache");
unregister_reboot_notifier(&reboot);
}
......
......@@ -102,7 +102,6 @@ rw_attribute(bypass_torture_test);
rw_attribute(key_merging_disabled);
rw_attribute(gc_always_rewrite);
rw_attribute(expensive_debug_checks);
rw_attribute(freelist_percent);
rw_attribute(cache_replacement_policy);
rw_attribute(btree_shrinker_disabled);
rw_attribute(copy_gc_enabled);
......@@ -401,6 +400,48 @@ static struct attribute *bch_flash_dev_files[] = {
};
KTYPE(bch_flash_dev);
struct bset_stats_op {
struct btree_op op;
size_t nodes;
struct bset_stats stats;
};
static int btree_bset_stats(struct btree_op *b_op, struct btree *b)
{
struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op);
op->nodes++;
bch_btree_keys_stats(&b->keys, &op->stats);
return MAP_CONTINUE;
}
int bch_bset_print_stats(struct cache_set *c, char *buf)
{
struct bset_stats_op op;
int ret;
memset(&op, 0, sizeof(op));
bch_btree_op_init(&op.op, -1);
ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats);
if (ret < 0)
return ret;
return snprintf(buf, PAGE_SIZE,
"btree nodes: %zu\n"
"written sets: %zu\n"
"unwritten sets: %zu\n"
"written key bytes: %zu\n"
"unwritten key bytes: %zu\n"
"floats: %zu\n"
"failed: %zu\n",
op.nodes,
op.stats.sets_written, op.stats.sets_unwritten,
op.stats.bytes_written, op.stats.bytes_unwritten,
op.stats.floats, op.stats.failed);
}
SHOW(__bch_cache_set)
{
unsigned root_usage(struct cache_set *c)
......@@ -419,7 +460,7 @@ SHOW(__bch_cache_set)
rw_lock(false, b, b->level);
} while (b != c->root);
for_each_key_filter(b, k, &iter, bch_ptr_bad)
for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
bytes += bkey_bytes(k);
rw_unlock(false, b);
......@@ -434,7 +475,7 @@ SHOW(__bch_cache_set)
mutex_lock(&c->bucket_lock);
list_for_each_entry(b, &c->btree_cache, list)
ret += 1 << (b->page_order + PAGE_SHIFT);
ret += 1 << (b->keys.page_order + PAGE_SHIFT);
mutex_unlock(&c->bucket_lock);
return ret;
......@@ -491,7 +532,7 @@ SHOW(__bch_cache_set)
sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us);
sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us);
sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us);
......@@ -711,9 +752,6 @@ SHOW(__bch_cache)
sysfs_print(io_errors,
atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);
sysfs_print(freelist_percent, ca->free.size * 100 /
((size_t) ca->sb.nbuckets));
if (attr == &sysfs_cache_replacement_policy)
return bch_snprint_string_list(buf, PAGE_SIZE,
cache_replacement_policies,
......@@ -820,32 +858,6 @@ STORE(__bch_cache)
}
}
if (attr == &sysfs_freelist_percent) {
DECLARE_FIFO(long, free);
long i;
size_t p = strtoul_or_return(buf);
p = clamp_t(size_t,
((size_t) ca->sb.nbuckets * p) / 100,
roundup_pow_of_two(ca->sb.nbuckets) >> 9,
ca->sb.nbuckets / 2);
if (!init_fifo_exact(&free, p, GFP_KERNEL))
return -ENOMEM;
mutex_lock(&ca->set->bucket_lock);
fifo_move(&free, &ca->free);
fifo_swap(&free, &ca->free);
mutex_unlock(&ca->set->bucket_lock);
while (fifo_pop(&free, i))
atomic_dec(&ca->buckets[i].pin);
free_fifo(&free);
}
if (attr == &sysfs_clear_stats) {
atomic_long_set(&ca->sectors_written, 0);
atomic_long_set(&ca->btree_sectors_written, 0);
......@@ -869,7 +881,6 @@ static struct attribute *bch_cache_files[] = {
&sysfs_metadata_written,
&sysfs_io_errors,
&sysfs_clear_stats,
&sysfs_freelist_percent,
&sysfs_cache_replacement_policy,
NULL
};
......
......@@ -2,6 +2,7 @@
#ifndef _BCACHE_UTIL_H
#define _BCACHE_UTIL_H
#include <linux/blkdev.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/llist.h>
......@@ -17,11 +18,13 @@ struct closure;
#ifdef CONFIG_BCACHE_DEBUG
#define EBUG_ON(cond) BUG_ON(cond)
#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
#else /* DEBUG */
#define EBUG_ON(cond) do { if (cond); } while (0)
#define atomic_dec_bug(v) atomic_dec(v)
#define atomic_inc_bug(v, i) atomic_inc(v)
......@@ -391,6 +394,11 @@ struct time_stats {
void bch_time_stats_update(struct time_stats *stats, uint64_t time);
static inline unsigned local_clock_us(void)
{
return local_clock() >> 10;
}
#define NSEC_PER_ns 1L
#define NSEC_PER_us NSEC_PER_USEC
#define NSEC_PER_ms NSEC_PER_MSEC
......
......@@ -6103,6 +6103,7 @@ static int run(struct mddev *mddev)
blk_queue_io_min(mddev->queue, chunk_size);
blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks - conf->max_degraded));
mddev->queue->limits.raid_partial_stripes_expensive = 1;
/*
* We can only discard a whole stripe. It doesn't make sense to
* discard data disk but write parity disk
......
......@@ -1312,7 +1312,7 @@ static void bh_lru_install(struct buffer_head *bh)
}
while (out < BH_LRU_SIZE)
bhs[out++] = NULL;
memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
}
bh_lru_unlock();
......
......@@ -291,6 +291,7 @@ struct queue_limits {
unsigned char discard_misaligned;
unsigned char cluster;
unsigned char discard_zeroes_data;
unsigned char raid_partial_stripes_expensive;
};
struct request_queue {
......
......@@ -247,7 +247,7 @@ TRACE_EVENT(bcache_btree_write,
TP_fast_assign(
__entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0);
__entry->block = b->written;
__entry->keys = b->sets[b->nsets].data->keys;
__entry->keys = b->keys.set[b->keys.nsets].data->keys;
),
TP_printk("bucket %zu", __entry->bucket)
......@@ -411,7 +411,7 @@ TRACE_EVENT(bcache_alloc_invalidate,
),
TP_fast_assign(
__entry->free = fifo_used(&ca->free);
__entry->free = fifo_used(&ca->free[RESERVE_NONE]);
__entry->free_inc = fifo_used(&ca->free_inc);
__entry->free_inc_size = ca->free_inc.size;
__entry->unused = fifo_used(&ca->unused);
......@@ -422,8 +422,8 @@ TRACE_EVENT(bcache_alloc_invalidate,
);
TRACE_EVENT(bcache_alloc_fail,
TP_PROTO(struct cache *ca),
TP_ARGS(ca),
TP_PROTO(struct cache *ca, unsigned reserve),
TP_ARGS(ca, reserve),
TP_STRUCT__entry(
__field(unsigned, free )
......@@ -433,7 +433,7 @@ TRACE_EVENT(bcache_alloc_fail,
),
TP_fast_assign(
__entry->free = fifo_used(&ca->free);
__entry->free = fifo_used(&ca->free[reserve]);
__entry->free_inc = fifo_used(&ca->free_inc);
__entry->unused = fifo_used(&ca->unused);
__entry->blocked = atomic_read(&ca->set->prio_blocked);
......
......@@ -39,6 +39,7 @@ static inline void SET_##name(struct bkey *k, unsigned i, __u64 v) \
}
#define KEY_SIZE_BITS 16
#define KEY_MAX_U64S 8
KEY_FIELD(KEY_PTRS, high, 60, 3)
KEY_FIELD(HEADER_SIZE, high, 58, 2)
......@@ -118,7 +119,7 @@ static inline struct bkey *bkey_next(const struct bkey *k)
return (struct bkey *) (d + bkey_u64s(k));
}
static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys)
static inline struct bkey *bkey_idx(const struct bkey *k, unsigned nr_keys)
{
__u64 *d = (void *) k;
return (struct bkey *) (d + nr_keys);
......
......@@ -185,7 +185,8 @@ enum {
* to clear media change status */
FD_UNUSED_BIT,
FD_DISK_CHANGED_BIT, /* disk has been changed since last i/o */
FD_DISK_WRITABLE_BIT /* disk is writable */
FD_DISK_WRITABLE_BIT, /* disk is writable */
FD_OPEN_SHOULD_FAIL_BIT
};
#define FDSETDRVPRM _IOW(2, 0x90, struct floppy_drive_params)
......