• F
    Btrfs: ensure readers see new data after a clone operation · c125b8bf
    Filipe Manana 提交于
    We were cleaning the clone target file range from the page cache before
    we replaced the file extent items in the fs tree. This was racy,
    as right after cleaning the relevant range from the page cache and before
    replacing the file extent items, a read against that range could be
    performed by another task and populate the page cache again with stale
    data (stale after the cloning finishes). This would result in reads
    performed after the clone operation successfully finishes getting old
    data (potentially for a very long time). Therefore evict the pages after
    replacing the file extent items, so that subsequent reads will always
    get the new data.
    
    Similarly, we were prone to races while cloning the file extent items
    because we weren't locking the target range and waiting for any existing
    ordered extents against that range to complete. It was possible that,
    after cloning the extent items, a write operation that was performed
    before the clone operation and overlapped the same range would end up
    undoing all or part of the work the clone operation did (via a worker
    task running inode.c:btrfs_finish_ordered_io). Therefore lock the target
    range in the io tree, wait for all pending ordered extents against that
    range to finish and then safely perform the cloning.
    
    The issue of reading stale data after the clone operation is easy to
    reproduce by running the following C program in a loop until it exits
    with return value 1.
    
     #include <unistd.h>
     #include <stdio.h>
     #include <stdlib.h>
     #include <string.h>
     #include <errno.h>
     #include <pthread.h>
     #include <fcntl.h>
     #include <assert.h>
     #include <asm/types.h>
     #include <linux/ioctl.h>
     #include <sys/stat.h>
     #include <sys/types.h>
     #include <sys/ioctl.h>
    
     /*
      * Test parameters: the clone source and destination files, the number
      * of bytes written to each of them, and the byte each one is filled with.
      */
     #define SRC_FILE "/mnt/sdd/foo"
     #define DST_FILE "/mnt/sdd/bar"
     #define FILE_SIZE (16 * 1024)
     #define PATTERN_SRC 'X'
     #define PATTERN_DST 'Y'
    
    /*
     * Argument block handed to the BTRFS_IOC_CLONE_RANGE ioctl: names the
     * source file and the ranges involved in the clone.
     */
    struct btrfs_ioctl_clone_range_args {
    	__s64 src_fd;		/* descriptor of the file to clone from */
    	__u64 src_offset;	/* start of the range in the source file */
    	__u64 src_length;	/* length of the range to clone */
    	__u64 dest_offset;	/* start of the range in the target file */
    };
    
     /*
      * Request code passed to ioctl() on the destination file descriptor in
      * main(); its argument is a struct btrfs_ioctl_clone_range_args.
      */
     #define BTRFS_IOCTL_MAGIC 0x94
     #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
    				   struct btrfs_ioctl_clone_range_args)
    
    /* State shared by main() and the reader thread; always accessed under 'mutex'. */
    static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
    static int reader_ready;	/* set by the reader thread once it has started */
    static int clone_done;		/* set by main() after the clone ioctl returned */
    static int stale_data;		/* set by the reader when it read the old contents */
    
    /*
     * Reader thread body: keeps re-opening DST_FILE and reading its first
     * 4096 bytes.  clone_done is sampled before each read, so the first read
     * issued after main() reported completion is expected to return the
     * source pattern; any other content is recorded in stale_data.
     * Always returns NULL; 'arg' is unused.
     */
    static void *reader_loop(void *arg)
    {
    	char expected[4096];
    	char actual[4096];
    
    	memset(expected, PATTERN_SRC, sizeof(expected));
    
    	/* Tell main() that the reader is up and the clone may start. */
    	pthread_mutex_lock(&mutex);
    	reader_ready = 1;
    	pthread_mutex_unlock(&mutex);
    
    	for (;;) {
    		int finished;
    		int dst_fd;
    		int nread;
    
    		dst_fd = open(DST_FILE, O_RDONLY);
    		assert(dst_fd != -1);
    
    		/* Sample the flag before reading, never after. */
    		pthread_mutex_lock(&mutex);
    		finished = clone_done;
    		pthread_mutex_unlock(&mutex);
    
    		nread = read(dst_fd, actual, sizeof(actual));
    		assert(nread == 4096);
    		close(dst_fd);
    
    		/* The clone had not been reported yet: this read proves nothing. */
    		if (!finished)
    			continue;
    
    		if (memcmp(actual, expected, sizeof(actual)) != 0) {
    			printf("Found old content\n");
    			pthread_mutex_lock(&mutex);
    			stale_data = 1;
    			pthread_mutex_unlock(&mutex);
    		} else {
    			printf("Found new content\n");
    		}
    		break;
    	}
    
    	return NULL;
    }
    
    /*
     * Create (or truncate) 'path' and fill it with FILE_SIZE copies of
     * 'pattern', issuing one single-byte write per byte.
     */
    static void fill_file(const char *path, char pattern)
    {
    	int out_fd, written, n;
    
    	out_fd = open(path, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU);
    	assert(out_fd != -1);
    	for (written = 0; written < FILE_SIZE; written++) {
    		n = write(out_fd, &pattern, 1);
    		assert(n == 1);
    	}
    	close(out_fd);
    }
    
    /*
     * Recreate both test files, start the reader thread, clone the first
     * 4096 bytes of SRC_FILE over the start of DST_FILE and then tell the
     * reader the clone is done.  Returns 1 if the reader saw stale data or
     * if removing/opening a test file failed, 0 otherwise.
     */
    int main(int argc, char *argv[])
    {
    	struct btrfs_ioctl_clone_range_args clone_args;
    	pthread_t reader;
    	int src_fd, dst_fd;
    	int ready = 0;
    	int rc;
    
    	/* Start from scratch; a file that does not exist yet is not an error. */
    	rc = remove(SRC_FILE);
    	if (rc == -1 && errno != ENOENT) {
    		fprintf(stderr, "Error deleting src file: %s\n", strerror(errno));
    		return 1;
    	}
    	rc = remove(DST_FILE);
    	if (rc == -1 && errno != ENOENT) {
    		fprintf(stderr, "Error deleting dst file: %s\n", strerror(errno));
    		return 1;
    	}
    
    	fill_file(SRC_FILE, PATTERN_SRC);
    	fill_file(DST_FILE, PATTERN_DST);
    	sync();
    
    	rc = pthread_create(&reader, NULL, reader_loop, NULL);
    	assert(rc == 0);
    
    	/* Poll until the reader thread has announced itself. */
    	while (!ready) {
    		pthread_mutex_lock(&mutex);
    		ready = reader_ready;
    		pthread_mutex_unlock(&mutex);
    	}
    
    	src_fd = open(SRC_FILE, O_RDONLY);
    	if (src_fd < 0) {
    		fprintf(stderr, "Error open src file: %s\n", strerror(errno));
    		return 1;
    	}
    	dst_fd = open(DST_FILE, O_RDWR);
    	if (dst_fd < 0) {
    		fprintf(stderr, "Error open dst file: %s\n", strerror(errno));
    		return 1;
    	}
    
    	/* Clone the first 4096 bytes of the source over the start of the target. */
    	clone_args.src_fd = src_fd;
    	clone_args.src_offset = 0;
    	clone_args.src_length = 4096;
    	clone_args.dest_offset = 0;
    	rc = ioctl(dst_fd, BTRFS_IOC_CLONE_RANGE, &clone_args);
    	assert(rc == 0);
    	close(src_fd);
    	close(dst_fd);
    
    	/* Only now may the reader treat what it reads as post-clone data. */
    	pthread_mutex_lock(&mutex);
    	clone_done = 1;
    	pthread_mutex_unlock(&mutex);
    
    	rc = pthread_join(reader, NULL);
    	assert(rc == 0);
    
    	pthread_mutex_lock(&mutex);
    	rc = (stale_data != 0);
    	pthread_mutex_unlock(&mutex);
    
    	return rc;
    }
    Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
    Signed-off-by: Chris Mason <clm@fb.com>
    c125b8bf
ioctl.c 121.8 KB