md_k.h 9.4 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
   md_k.h : kernel internal structure of the Linux MD driver
          Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
	  
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.
   
   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
*/

#ifndef _MD_K_H
#define _MD_K_H

18 19 20
/* and dm-bio-list.h is not under include/linux because.... ??? */
#include "../../../drivers/md/dm-bio-list.h"

L
Linus Torvalds 已提交
21 22 23 24
#define	LEVEL_MULTIPATH		(-4)
#define	LEVEL_LINEAR		(-1)
#define	LEVEL_FAULTY		(-5)

25 26 27 28 29 30
/* we need a value for 'no level specified' and 0
 * means 'raid0', so we need something else.  This is
 * for internal use only
 */
#define	LEVEL_NONE		(-1000000)

L
Linus Torvalds 已提交
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
#define MaxSector (~(sector_t)0)
#define MD_THREAD_NAME_MAX 14

typedef struct mddev_s mddev_t;
typedef struct mdk_rdev_s mdk_rdev_t;

#define MAX_MD_DEVS  256	/* Max number of md dev */

/*
 * options passed in raidrun:
 */

#define MAX_CHUNK_SIZE (4096*1024)

/*
 * MD's 'extended' device
 */
struct mdk_rdev_s
{
	struct list_head same_set;	/* RAID devices within the same set */

	sector_t size;			/* Device size (in blocks) */
	mddev_t *mddev;			/* RAID array if running */
	unsigned long last_events;	/* IO event timestamp */

	struct block_device *bdev;	/* block device handle */

	struct page	*sb_page;
	int		sb_loaded;
	sector_t	data_offset;	/* start of data in array */
	sector_t	sb_offset;
62
	int		sb_size;	/* bytes in the superblock */
L
Linus Torvalds 已提交
63 64
	int		preferred_minor;	/* autorun support */

65 66
	struct kobject	kobj;

L
Linus Torvalds 已提交
67 68 69 70 71 72 73 74 75 76 77
	/* A device can be in one of three states based on two flags:
	 * Not working:   faulty==1 in_sync==0
	 * Fully working: faulty==0 in_sync==1
	 * Working, but not
	 * in sync with array
	 *                faulty==0 in_sync==0
	 *
	 * It can never have faulty==1, in_sync==1
	 * This reduces the burden of testing multiple flags in many cases
	 */

78 79 80
	unsigned long	flags;
#define	Faulty		1		/* device is known to have a fault */
#define	In_sync		2		/* device is in_sync with rest of array */
81
#define	WriteMostly	4		/* Avoid reading if at all possible */
82
#define	BarriersNotsupp	5		/* BIO_RW_BARRIER is not supported */
83

L
Linus Torvalds 已提交
84 85
	int desc_nr;			/* descriptor index in the superblock */
	int raid_disk;			/* role of device in array */
86 87 88 89
	int saved_raid_disk;		/* role that device used to have in the
					 * array and could again if we did a partial
					 * resync from the bitmap
					 */
L
Linus Torvalds 已提交
90 91 92 93 94

	atomic_t	nr_pending;	/* number of pending requests.
					 * only maintained for arrays that
					 * support hot removal
					 */
95 96 97
	atomic_t	read_errors;	/* number of consecutive read errors that
					 * we have tried to ignore.
					 */
L
Linus Torvalds 已提交
98 99 100 101 102
};

struct mddev_s
{
	void				*private;
103
	struct mdk_personality		*pers;
L
Linus Torvalds 已提交
104 105 106 107 108 109 110 111
	dev_t				unit;
	int				md_minor;
	struct list_head 		disks;
	int				sb_dirty;
	int				ro;

	struct gendisk			*gendisk;

112 113
	struct kobject			kobj;

L
Linus Torvalds 已提交
114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
	/* Superblock information */
	int				major_version,
					minor_version,
					patch_version;
	int				persistent;
	int				chunk_size;
	time_t				ctime, utime;
	int				level, layout;
	int				raid_disks;
	int				max_disks;
	sector_t			size; /* used size of component devices */
	sector_t			array_size; /* exported array size */
	__u64				events;

	char				uuid[16];

	struct mdk_thread_s		*thread;	/* management thread */
	struct mdk_thread_s		*sync_thread;	/* doing resync or reconstruct */
	sector_t			curr_resync;	/* blocks scheduled */
	unsigned long			resync_mark;	/* a recent timestamp */
	sector_t			resync_mark_cnt;/* blocks written at resync_mark */

	sector_t			resync_max_sectors; /* may be set by personality */
137 138 139 140

	sector_t			resync_mismatches; /* count of sectors where
							    * parity/replica mismatch found
							    */
141
	int				ok_start_degraded;
L
Linus Torvalds 已提交
142 143 144 145 146 147 148
	/* recovery/resync flags 
	 * NEEDED:   we might need to start a resync/recover
	 * RUNNING:  a thread is running, or about to be started
	 * SYNC:     actually doing a resync, not a recovery
	 * ERR:      and IO error was detected - abort the resync/recovery
	 * INTR:     someone requested a (clean) early abort.
	 * DONE:     thread is done and is waiting to be reaped
149 150
	 * REQUEST:  user-space has requested a sync (used with SYNC)
	 * CHECK:    user-space request for for check-only, no repair
L
Linus Torvalds 已提交
151 152 153 154 155 156 157
	 */
#define	MD_RECOVERY_RUNNING	0
#define	MD_RECOVERY_SYNC	1
#define	MD_RECOVERY_ERR		2
#define	MD_RECOVERY_INTR	3
#define	MD_RECOVERY_DONE	4
#define	MD_RECOVERY_NEEDED	5
158 159
#define	MD_RECOVERY_REQUESTED	6
#define	MD_RECOVERY_CHECK	7
L
Linus Torvalds 已提交
160 161 162 163 164 165 166 167 168 169
	unsigned long			recovery;

	int				in_sync;	/* know to not need resync */
	struct semaphore		reconfig_sem;
	atomic_t			active;

	int				changed;	/* true if we might need to reread partition info */
	int				degraded;	/* whether md should consider
							 * adding a spare
							 */
170 171 172 173 174 175 176
	int				barriers_work;	/* initialised to true, cleared as soon
							 * as a barrier request to slave
							 * fails.  Only supported
							 */
	struct bio			*biolist; 	/* bios that need to be retried
							 * because BIO_RW_BARRIER is not supported
							 */
L
Linus Torvalds 已提交
177 178 179 180

	atomic_t			recovery_active; /* blocks scheduled, but not written */
	wait_queue_head_t		recovery_wait;
	sector_t			recovery_cp;
181 182

	spinlock_t			write_lock;
183
	wait_queue_head_t		sb_wait;	/* for waiting on superblock updates */
184
	atomic_t			pending_writes;	/* number of active superblock writes */
185

L
Linus Torvalds 已提交
186 187 188 189 190 191 192 193
	unsigned int			safemode;	/* if set, update "clean" superblock
							 * when no writes pending.
							 */ 
	unsigned int			safemode_delay;
	struct timer_list		safemode_timer;
	atomic_t			writes_pending; 
	request_queue_t			*queue;	/* for plugging ... */

194 195 196
	atomic_t                        write_behind; /* outstanding async IO */
	unsigned int                    max_write_behind; /* 0 = sync */

197 198
	struct bitmap                   *bitmap; /* the bitmap for the device */
	struct file			*bitmap_file; /* the bitmap file */
199 200 201 202
	long				bitmap_offset; /* offset from superblock of
							* start of bitmap. May be
							* negative, but not '0'
							*/
203 204 205 206
	long				default_bitmap_offset; /* this is the offset to use when
								* hot-adding a bitmap.  It should
								* eventually be settable by sysfs.
								*/
207

L
Linus Torvalds 已提交
208 209 210 211 212 213
	struct list_head		all_mddevs;
};


static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev)
{
214
	int faulty = test_bit(Faulty, &rdev->flags);
L
Linus Torvalds 已提交
215 216 217 218 219 220 221 222 223
	if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}

static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
{
        atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
}

224
struct mdk_personality
L
Linus Torvalds 已提交
225 226
{
	char *name;
227 228
	int level;
	struct list_head list;
L
Linus Torvalds 已提交
229 230 231 232 233 234 235 236 237 238 239 240
	struct module *owner;
	int (*make_request)(request_queue_t *q, struct bio *bio);
	int (*run)(mddev_t *mddev);
	int (*stop)(mddev_t *mddev);
	void (*status)(struct seq_file *seq, mddev_t *mddev);
	/* error_handler must set ->faulty and clear ->in_sync
	 * if appropriate, and should abort recovery if needed 
	 */
	void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
	int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
	int (*hot_remove_disk) (mddev_t *mddev, int number);
	int (*spare_active) (mddev_t *mddev);
241
	sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
L
Linus Torvalds 已提交
242 243 244
	int (*resize) (mddev_t *mddev, sector_t sectors);
	int (*reshape) (mddev_t *mddev, int raid_disks);
	int (*reconfig) (mddev_t *mddev, int layout, int chunk_size);
245 246 247 248 249 250
	/* quiesce moves between quiescence states
	 * 0 - fully active
	 * 1 - no new requests allowed
	 * others - reserved
	 */
	void (*quiesce) (mddev_t *mddev, int state);
L
Linus Torvalds 已提交
251 252 253
};


254 255 256 257 258 259 260
struct md_sysfs_entry {
	struct attribute attr;
	ssize_t (*show)(mddev_t *, char *);
	ssize_t (*store)(mddev_t *, const char *, size_t);
};


L
Linus Torvalds 已提交
261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295
static inline char * mdname (mddev_t * mddev)
{
	return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
}

extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);

/*
 * iterates through some rdev ringlist. It's safe to remove the
 * current 'rdev'. Dont touch 'tmp' though.
 */
#define ITERATE_RDEV_GENERIC(head,rdev,tmp)				\
									\
	for ((tmp) = (head).next;					\
		(rdev) = (list_entry((tmp), mdk_rdev_t, same_set)),	\
			(tmp) = (tmp)->next, (tmp)->prev != &(head)	\
		; )
/*
 * iterates through the 'same array disks' ringlist
 */
#define ITERATE_RDEV(mddev,rdev,tmp)					\
	ITERATE_RDEV_GENERIC((mddev)->disks,rdev,tmp)

/*
 * Iterates through 'pending RAID disks'
 */
#define ITERATE_RDEV_PENDING(rdev,tmp)					\
	ITERATE_RDEV_GENERIC(pending_raid_disks,rdev,tmp)

typedef struct mdk_thread_s {
	void			(*run) (mddev_t *mddev);
	mddev_t			*mddev;
	wait_queue_head_t	wqueue;
	unsigned long           flags;
	struct task_struct	*tsk;
296
	unsigned long		timeout;
L
Linus Torvalds 已提交
297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328
} mdk_thread_t;

#define THREAD_WAKEUP  0

#define __wait_event_lock_irq(wq, condition, lock, cmd) 		\
do {									\
	wait_queue_t __wait;						\
	init_waitqueue_entry(&__wait, current);				\
									\
	add_wait_queue(&wq, &__wait);					\
	for (;;) {							\
		set_current_state(TASK_UNINTERRUPTIBLE);		\
		if (condition)						\
			break;						\
		spin_unlock_irq(&lock);					\
		cmd;							\
		schedule();						\
		spin_lock_irq(&lock);					\
	}								\
	current->state = TASK_RUNNING;					\
	remove_wait_queue(&wq, &__wait);				\
} while (0)

#define wait_event_lock_irq(wq, condition, lock, cmd) 			\
do {									\
	if (condition)	 						\
		break;							\
	__wait_event_lock_irq(wq, condition, lock, cmd);		\
} while (0)

#endif