scsi_dh_alua.c 25.6 KB
Newer Older
1 2 3
/*
 * Generic SCSI-3 ALUA SCSI Device Handler
 *
4
 * Copyright (C) 2007-2010 Hannes Reinecke, SUSE Linux Products GmbH.
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */
22
#include <linux/slab.h>
23
#include <linux/delay.h>
24
#include <linux/module.h>
25
#include <asm/unaligned.h>
26
#include <scsi/scsi.h>
27
#include <scsi/scsi_dbg.h>
28 29 30 31
#include <scsi/scsi_eh.h>
#include <scsi/scsi_dh.h>

#define ALUA_DH_NAME "alua"
32
#define ALUA_DH_VER "1.3"
33 34 35 36 37

#define TPGS_STATE_OPTIMIZED		0x0
#define TPGS_STATE_NONOPTIMIZED		0x1
#define TPGS_STATE_STANDBY		0x2
#define TPGS_STATE_UNAVAILABLE		0x3
38
#define TPGS_STATE_LBA_DEPENDENT	0x4
39 40 41 42 43 44 45 46
#define TPGS_STATE_OFFLINE		0xe
#define TPGS_STATE_TRANSITIONING	0xf

#define TPGS_SUPPORT_NONE		0x00
#define TPGS_SUPPORT_OPTIMIZED		0x01
#define TPGS_SUPPORT_NONOPTIMIZED	0x02
#define TPGS_SUPPORT_STANDBY		0x04
#define TPGS_SUPPORT_UNAVAILABLE	0x08
47
#define TPGS_SUPPORT_LBA_DEPENDENT	0x10
48 49 50
#define TPGS_SUPPORT_OFFLINE		0x40
#define TPGS_SUPPORT_TRANSITION		0x80

51 52 53
#define RTPG_FMT_MASK			0x70
#define RTPG_FMT_EXT_HDR		0x10

54 55 56 57 58
#define TPGS_MODE_UNINITIALIZED		 -1
#define TPGS_MODE_NONE			0x0
#define TPGS_MODE_IMPLICIT		0x1
#define TPGS_MODE_EXPLICIT		0x2

59
#define ALUA_RTPG_SIZE			128
60
#define ALUA_FAILOVER_TIMEOUT		60
61
#define ALUA_FAILOVER_RETRIES		5
62
#define ALUA_RTPG_DELAY_MSECS		5
63

64
/* device handler flags */
65 66 67 68 69 70
#define ALUA_OPTIMIZE_STPG		0x01
#define ALUA_RTPG_EXT_HDR_UNSUPP	0x02
/* State machine flags */
#define ALUA_PG_RUN_RTPG		0x10
#define ALUA_PG_RUN_STPG		0x20
#define ALUA_PG_RUNNING			0x40
71

72 73 74 75
static uint optimize_stpg;
module_param(optimize_stpg, uint, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(optimize_stpg, "Allow use of a non-optimized path, rather than sending a STPG, when implicit TPGS is supported (0=No,1=Yes). Default is 0.");

76 77
static LIST_HEAD(port_group_list);
static DEFINE_SPINLOCK(port_group_lock);
78
static struct workqueue_struct *kaluad_wq;
79 80 81

struct alua_port_group {
	struct kref		kref;
82
	struct rcu_head		rcu;
83
	struct list_head	node;
84 85
	unsigned char		device_id_str[256];
	int			device_id_len;
86 87 88
	int			group_id;
	int			tpgs;
	int			state;
89
	int			pref;
90
	unsigned		flags; /* used for optimizing STPG */
91
	unsigned char		transition_tmo;
92 93 94 95 96 97
	unsigned long		expiry;
	unsigned long		interval;
	struct delayed_work	rtpg_work;
	spinlock_t		lock;
	struct list_head	rtpg_list;
	struct scsi_device	*rtpg_sdev;
98 99 100 101 102
};

struct alua_dh_data {
	struct alua_port_group	*pg;
	int			group_id;
103
	spinlock_t		pg_lock;
104
	struct scsi_device	*sdev;
105 106 107 108 109 110
	int			init_error;
	struct mutex		init_mutex;
};

struct alua_queue_data {
	struct list_head	entry;
111 112
	activate_complete	callback_fn;
	void			*callback_data;
113 114 115 116 117
};

#define ALUA_POLICY_SWITCH_CURRENT	0
#define ALUA_POLICY_SWITCH_ALL		1

118 119 120 121
static void alua_rtpg_work(struct work_struct *work);
static void alua_rtpg_queue(struct alua_port_group *pg,
			    struct scsi_device *sdev,
			    struct alua_queue_data *qdata);
122

123 124 125 126 127
static void release_port_group(struct kref *kref)
{
	struct alua_port_group *pg;

	pg = container_of(kref, struct alua_port_group, kref);
128 129
	if (pg->rtpg_sdev)
		flush_delayed_work(&pg->rtpg_work);
130 131 132
	spin_lock(&port_group_lock);
	list_del(&pg->node);
	spin_unlock(&port_group_lock);
133
	kfree_rcu(pg, rcu);
134 135
}

136 137 138 139
/*
 * submit_rtpg - Issue a REPORT TARGET GROUP STATES command
 * @sdev: sdev the command should be sent to
 */
140 141
static int submit_rtpg(struct scsi_device *sdev, unsigned char *buff,
		       int bufflen, struct scsi_sense_hdr *sshdr, int flags)
142
{
143 144 145
	u8 cdb[COMMAND_SIZE(MAINTENANCE_IN)];
	int req_flags = REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
		REQ_FAILFAST_DRIVER;
146 147

	/* Prepare the command. */
148 149
	memset(cdb, 0x0, COMMAND_SIZE(MAINTENANCE_IN));
	cdb[0] = MAINTENANCE_IN;
150
	if (!(flags & ALUA_RTPG_EXT_HDR_UNSUPP))
151
		cdb[1] = MI_REPORT_TARGET_PGS | MI_EXT_HDR_PARAM_FMT;
152
	else
153 154 155 156 157 158 159
		cdb[1] = MI_REPORT_TARGET_PGS;
	put_unaligned_be32(bufflen, &cdb[6]);

	return scsi_execute_req_flags(sdev, cdb, DMA_FROM_DEVICE,
				      buff, bufflen, sshdr,
				      ALUA_FAILOVER_TIMEOUT * HZ,
				      ALUA_FAILOVER_RETRIES, NULL, req_flags);
160 161
}

162
/*
163
 * submit_stpg - Issue a SET TARGET PORT GROUP command
164 165 166 167 168
 *
 * Currently we're only setting the current target port group state
 * to 'active/optimized' and let the array firmware figure out
 * the states of the remaining groups.
 */
169 170
static int submit_stpg(struct scsi_device *sdev, int group_id,
		       struct scsi_sense_hdr *sshdr)
171
{
172
	u8 cdb[COMMAND_SIZE(MAINTENANCE_OUT)];
173
	unsigned char stpg_data[8];
174
	int stpg_len = 8;
175 176
	int req_flags = REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
		REQ_FAILFAST_DRIVER;
177 178

	/* Prepare the data buffer */
179 180 181
	memset(stpg_data, 0, stpg_len);
	stpg_data[4] = TPGS_STATE_OPTIMIZED & 0x0f;
	put_unaligned_be16(group_id, &stpg_data[6]);
182 183

	/* Prepare the command. */
184 185 186 187 188 189 190 191 192
	memset(cdb, 0x0, COMMAND_SIZE(MAINTENANCE_OUT));
	cdb[0] = MAINTENANCE_OUT;
	cdb[1] = MO_SET_TARGET_PGS;
	put_unaligned_be32(stpg_len, &cdb[6]);

	return scsi_execute_req_flags(sdev, cdb, DMA_TO_DEVICE,
				      stpg_data, stpg_len,
				      sshdr, ALUA_FAILOVER_TIMEOUT * HZ,
				      ALUA_FAILOVER_RETRIES, NULL, req_flags);
193 194
}

195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214
/*
 * alua_find_get_pg - look up a port group on the global list
 * @id_str: device identification string to match
 * @id_size: length of @id_str
 * @group_id: target port group id to match
 *
 * Returns the matching group with an elevated refcount, or NULL.
 * Caller must hold port_group_lock or otherwise keep the list stable.
 */
struct alua_port_group *alua_find_get_pg(char *id_str, size_t id_size,
					 int group_id)
{
	struct alua_port_group *pg;

	list_for_each_entry(pg, &port_group_list, node) {
		if (pg->group_id == group_id &&
		    pg->device_id_len == id_size &&
		    !strncmp(pg->device_id_str, id_str, id_size) &&
		    kref_get_unless_zero(&pg->kref))
			return pg;
	}

	return NULL;
}

215 216 217 218 219 220 221 222 223 224 225 226
/*
 * alua_alloc_pg - Allocate a new port_group structure
 * @sdev: scsi device
 * @h: alua device_handler data
 * @group_id: port group id
 *
 * Allocate a new port_group structure for a given
 * device.
 */
struct alua_port_group *alua_alloc_pg(struct scsi_device *sdev,
				      int group_id, int tpgs)
{
227
	struct alua_port_group *pg, *tmp_pg;
228 229 230

	pg = kzalloc(sizeof(struct alua_port_group), GFP_KERNEL);
	if (!pg)
231
		return ERR_PTR(-ENOMEM);
232

233 234 235 236 237 238 239 240 241 242 243 244 245
	pg->device_id_len = scsi_vpd_lun_id(sdev, pg->device_id_str,
					    sizeof(pg->device_id_str));
	if (pg->device_id_len <= 0) {
		/*
		 * Internal error: TPGS supported but no device
		 * identifcation found. Disable ALUA support.
		 */
		kfree(pg);
		sdev_printk(KERN_INFO, sdev,
			    "%s: No device descriptors found\n",
			    ALUA_DH_NAME);
		return ERR_PTR(-ENXIO);
	}
246 247 248
	pg->group_id = group_id;
	pg->tpgs = tpgs;
	pg->state = TPGS_STATE_OPTIMIZED;
249 250
	if (optimize_stpg)
		pg->flags |= ALUA_OPTIMIZE_STPG;
251
	kref_init(&pg->kref);
252 253 254 255
	INIT_DELAYED_WORK(&pg->rtpg_work, alua_rtpg_work);
	INIT_LIST_HEAD(&pg->rtpg_list);
	INIT_LIST_HEAD(&pg->node);
	spin_lock_init(&pg->lock);
256

257
	spin_lock(&port_group_lock);
258 259 260 261 262 263 264 265
	tmp_pg = alua_find_get_pg(pg->device_id_str, pg->device_id_len,
				  group_id);
	if (tmp_pg) {
		spin_unlock(&port_group_lock);
		kfree(pg);
		return tmp_pg;
	}

266 267 268 269 270 271
	list_add(&pg->node, &port_group_list);
	spin_unlock(&port_group_lock);

	return pg;
}

272
/*
273
 * alua_check_tpgs - Evaluate TPGS setting
274 275
 * @sdev: device to be checked
 *
276
 * Examine the TPGS setting of the sdev to find out if ALUA
277 278
 * is supported.
 */
279
static int alua_check_tpgs(struct scsi_device *sdev)
280
{
281
	int tpgs = TPGS_MODE_NONE;
282

283 284 285 286 287 288 289 290
	/*
	 * ALUA support for non-disk devices is fraught with
	 * difficulties, so disable it for now.
	 */
	if (sdev->type != TYPE_DISK) {
		sdev_printk(KERN_INFO, sdev,
			    "%s: disable for non-disk devices\n",
			    ALUA_DH_NAME);
291
		return tpgs;
292 293
	}

294 295
	tpgs = scsi_device_tpgs(sdev);
	switch (tpgs) {
296 297 298 299 300 301 302 303 304 305 306 307 308
	case TPGS_MODE_EXPLICIT|TPGS_MODE_IMPLICIT:
		sdev_printk(KERN_INFO, sdev,
			    "%s: supports implicit and explicit TPGS\n",
			    ALUA_DH_NAME);
		break;
	case TPGS_MODE_EXPLICIT:
		sdev_printk(KERN_INFO, sdev, "%s: supports explicit TPGS\n",
			    ALUA_DH_NAME);
		break;
	case TPGS_MODE_IMPLICIT:
		sdev_printk(KERN_INFO, sdev, "%s: supports implicit TPGS\n",
			    ALUA_DH_NAME);
		break;
H
Hannes Reinecke 已提交
309
	case TPGS_MODE_NONE:
310 311 312
		sdev_printk(KERN_INFO, sdev, "%s: not supported\n",
			    ALUA_DH_NAME);
		break;
H
Hannes Reinecke 已提交
313 314 315
	default:
		sdev_printk(KERN_INFO, sdev,
			    "%s: unsupported TPGS setting %d\n",
316 317
			    ALUA_DH_NAME, tpgs);
		tpgs = TPGS_MODE_NONE;
H
Hannes Reinecke 已提交
318
		break;
319 320
	}

321
	return tpgs;
322 323 324
}

/*
325
 * alua_check_vpd - Evaluate INQUIRY vpd page 0x83
326 327 328 329 330
 * @sdev: device to be checked
 *
 * Extract the relative target port and the target port group
 * descriptor from the list of identificators.
 */
331 332
static int alua_check_vpd(struct scsi_device *sdev, struct alua_dh_data *h,
			  int tpgs)
333
{
334
	int rel_port = -1, group_id;
335
	struct alua_port_group *pg, *old_pg = NULL;
336

337 338
	group_id = scsi_vpd_tpg_id(sdev, &rel_port);
	if (group_id < 0) {
339 340 341 342 343 344 345 346
		/*
		 * Internal error; TPGS supported but required
		 * VPD identification descriptors not present.
		 * Disable ALUA support
		 */
		sdev_printk(KERN_INFO, sdev,
			    "%s: No target port descriptors found\n",
			    ALUA_DH_NAME);
347
		return SCSI_DH_DEV_UNSUPP;
348
	}
349

350 351 352
	pg = alua_alloc_pg(sdev, group_id, tpgs);
	if (IS_ERR(pg)) {
		if (PTR_ERR(pg) == -ENOMEM)
353 354 355
			return SCSI_DH_NOMEM;
		return SCSI_DH_DEV_UNSUPP;
	}
356
	sdev_printk(KERN_INFO, sdev,
357
		    "%s: device %s port group %x rel port %x\n",
358 359 360 361 362 363 364 365 366 367 368 369 370 371
		    ALUA_DH_NAME, pg->device_id_str, group_id, rel_port);

	/* Check for existing port group references */
	spin_lock(&h->pg_lock);
	old_pg = h->pg;
	if (old_pg != pg) {
		/* port group has changed. Update to new port group */
		rcu_assign_pointer(h->pg, pg);
	}
	alua_rtpg_queue(h->pg, sdev, NULL);
	spin_unlock(&h->pg_lock);

	if (old_pg)
		kref_put(&old_pg->kref, release_port_group);
372

373
	return SCSI_DH_OK;
374 375 376 377 378 379 380 381 382 383 384 385 386
}

static char print_alua_state(int state)
{
	switch (state) {
	case TPGS_STATE_OPTIMIZED:
		return 'A';
	case TPGS_STATE_NONOPTIMIZED:
		return 'N';
	case TPGS_STATE_STANDBY:
		return 'S';
	case TPGS_STATE_UNAVAILABLE:
		return 'U';
387 388
	case TPGS_STATE_LBA_DEPENDENT:
		return 'L';
389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406
	case TPGS_STATE_OFFLINE:
		return 'O';
	case TPGS_STATE_TRANSITIONING:
		return 'T';
	default:
		return 'X';
	}
}

static int alua_check_sense(struct scsi_device *sdev,
			    struct scsi_sense_hdr *sense_hdr)
{
	switch (sense_hdr->sense_key) {
	case NOT_READY:
		if (sense_hdr->asc == 0x04 && sense_hdr->ascq == 0x0a)
			/*
			 * LUN Not Accessible - ALUA state transition
			 */
407
			return ADD_TO_MLQUEUE;
408 409 410 411 412 413
		break;
	case UNIT_ATTENTION:
		if (sense_hdr->asc == 0x29 && sense_hdr->ascq == 0x00)
			/*
			 * Power On, Reset, or Bus Device Reset, just retry.
			 */
414
			return ADD_TO_MLQUEUE;
415 416 417 418 419
		if (sense_hdr->asc == 0x29 && sense_hdr->ascq == 0x04)
			/*
			 * Device internal reset
			 */
			return ADD_TO_MLQUEUE;
420 421 422 423 424
		if (sense_hdr->asc == 0x2a && sense_hdr->ascq == 0x01)
			/*
			 * Mode Parameters Changed
			 */
			return ADD_TO_MLQUEUE;
425
		if (sense_hdr->asc == 0x2a && sense_hdr->ascq == 0x06)
426 427 428
			/*
			 * ALUA state changed
			 */
429
			return ADD_TO_MLQUEUE;
430
		if (sense_hdr->asc == 0x2a && sense_hdr->ascq == 0x07)
431 432 433
			/*
			 * Implicit ALUA state transition failed
			 */
434
			return ADD_TO_MLQUEUE;
435 436 437 438 439 440
		if (sense_hdr->asc == 0x3f && sense_hdr->ascq == 0x03)
			/*
			 * Inquiry data has changed
			 */
			return ADD_TO_MLQUEUE;
		if (sense_hdr->asc == 0x3f && sense_hdr->ascq == 0x0e)
441 442 443 444 445 446
			/*
			 * REPORTED_LUNS_DATA_HAS_CHANGED is reported
			 * when switching controllers on targets like
			 * Intel Multi-Flex. We can just retry.
			 */
			return ADD_TO_MLQUEUE;
447 448 449 450 451 452 453 454 455 456 457 458
		break;
	}

	return SCSI_RETURN_NOT_HANDLED;
}

/*
 * alua_rtpg - Evaluate REPORT TARGET GROUP STATES
 * @sdev: the device to be evaluated.
 *
 * Evaluate the Target Port Group State.
 * Returns SCSI_DH_DEV_OFFLINED if the path is
L
Lucas De Marchi 已提交
459
 * found to be unusable.
460
 */
461
static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg)
462 463
{
	struct scsi_sense_hdr sense_hdr;
464 465
	int len, k, off, valid_states = 0, bufflen = ALUA_RTPG_SIZE;
	unsigned char *ucp, *buff;
466
	unsigned err, retval;
467 468 469
	unsigned int tpg_desc_tbl_off;
	unsigned char orig_transition_tmo;

470 471 472 473 474 475 476 477
	if (!pg->expiry) {
		unsigned long transition_tmo = ALUA_FAILOVER_TIMEOUT * HZ;

		if (pg->transition_tmo)
			transition_tmo = pg->transition_tmo * HZ;

		pg->expiry = round_jiffies_up(jiffies + transition_tmo);
	}
478

479 480 481 482
	buff = kzalloc(bufflen, GFP_KERNEL);
	if (!buff)
		return SCSI_DH_DEV_TEMP_BUSY;

483
 retry:
484
	retval = submit_rtpg(sdev, buff, bufflen, &sense_hdr, pg->flags);
485

486
	if (retval) {
487
		if (!scsi_sense_valid(&sense_hdr)) {
488 489 490
			sdev_printk(KERN_INFO, sdev,
				    "%s: rtpg failed, result %d\n",
				    ALUA_DH_NAME, retval);
491
			kfree(buff);
492
			if (driver_byte(retval) == DRIVER_ERROR)
493
				return SCSI_DH_DEV_TEMP_BUSY;
494
			return SCSI_DH_IO;
495
		}
496

497 498 499 500 501 502 503 504
		/*
		 * submit_rtpg() has failed on existing arrays
		 * when requesting extended header info, and
		 * the array doesn't support extended headers,
		 * even though it shouldn't according to T10.
		 * The retry without rtpg_ext_hdr_req set
		 * handles this.
		 */
505
		if (!(pg->flags & ALUA_RTPG_EXT_HDR_UNSUPP) &&
506 507
		    sense_hdr.sense_key == ILLEGAL_REQUEST &&
		    sense_hdr.asc == 0x24 && sense_hdr.ascq == 0) {
508
			pg->flags |= ALUA_RTPG_EXT_HDR_UNSUPP;
509 510
			goto retry;
		}
511 512 513 514 515 516 517 518 519
		/*
		 * Retry on ALUA state transition or if any
		 * UNIT ATTENTION occurred.
		 */
		if (sense_hdr.sense_key == NOT_READY &&
		    sense_hdr.asc == 0x04 && sense_hdr.ascq == 0x0a)
			err = SCSI_DH_RETRY;
		else if (sense_hdr.sense_key == UNIT_ATTENTION)
			err = SCSI_DH_RETRY;
520 521
		if (err == SCSI_DH_RETRY &&
		    pg->expiry != 0 && time_before(jiffies, pg->expiry)) {
522 523 524
			sdev_printk(KERN_ERR, sdev, "%s: rtpg retry\n",
				    ALUA_DH_NAME);
			scsi_print_sense_hdr(sdev, ALUA_DH_NAME, &sense_hdr);
525
			return err;
526 527 528 529
		}
		sdev_printk(KERN_ERR, sdev, "%s: rtpg failed\n",
			    ALUA_DH_NAME);
		scsi_print_sense_hdr(sdev, ALUA_DH_NAME, &sense_hdr);
530
		kfree(buff);
531
		pg->expiry = 0;
532
		return SCSI_DH_IO;
533 534
	}

535
	len = get_unaligned_be32(&buff[0]) + 4;
536

537
	if (len > bufflen) {
538
		/* Resubmit with the correct length */
539 540 541 542
		kfree(buff);
		bufflen = len;
		buff = kmalloc(bufflen, GFP_KERNEL);
		if (!buff) {
543
			sdev_printk(KERN_WARNING, sdev,
544
				    "%s: kmalloc buffer failed\n",__func__);
545
			/* Temporary failure, bypass */
546
			pg->expiry = 0;
547 548 549 550 551
			return SCSI_DH_DEV_TEMP_BUSY;
		}
		goto retry;
	}

552
	orig_transition_tmo = pg->transition_tmo;
553
	if ((buff[4] & RTPG_FMT_MASK) == RTPG_FMT_EXT_HDR && buff[5] != 0)
554
		pg->transition_tmo = buff[5];
555
	else
556
		pg->transition_tmo = ALUA_FAILOVER_TIMEOUT;
557

558
	if (orig_transition_tmo != pg->transition_tmo) {
559 560
		sdev_printk(KERN_INFO, sdev,
			    "%s: transition timeout set to %d seconds\n",
561
			    ALUA_DH_NAME, pg->transition_tmo);
562
		pg->expiry = jiffies + pg->transition_tmo * HZ;
563 564
	}

565
	if ((buff[4] & RTPG_FMT_MASK) == RTPG_FMT_EXT_HDR)
566 567 568 569
		tpg_desc_tbl_off = 8;
	else
		tpg_desc_tbl_off = 4;

570
	for (k = tpg_desc_tbl_off, ucp = buff + tpg_desc_tbl_off;
571 572 573
	     k < len;
	     k += off, ucp += off) {

574 575 576
		if (pg->group_id == get_unaligned_be16(&ucp[2])) {
			pg->state = ucp[0] & 0x0f;
			pg->pref = ucp[0] >> 7;
577 578 579 580 581 582
			valid_states = ucp[1];
		}
		off = 8 + (ucp[7] * 4);
	}

	sdev_printk(KERN_INFO, sdev,
583
		    "%s: port group %02x state %c %s supports %c%c%c%c%c%c%c\n",
584 585
		    ALUA_DH_NAME, pg->group_id, print_alua_state(pg->state),
		    pg->pref ? "preferred" : "non-preferred",
586 587
		    valid_states&TPGS_SUPPORT_TRANSITION?'T':'t',
		    valid_states&TPGS_SUPPORT_OFFLINE?'O':'o',
588
		    valid_states&TPGS_SUPPORT_LBA_DEPENDENT?'L':'l',
589 590 591 592 593
		    valid_states&TPGS_SUPPORT_UNAVAILABLE?'U':'u',
		    valid_states&TPGS_SUPPORT_STANDBY?'S':'s',
		    valid_states&TPGS_SUPPORT_NONOPTIMIZED?'N':'n',
		    valid_states&TPGS_SUPPORT_OPTIMIZED?'A':'a');

594
	switch (pg->state) {
595
	case TPGS_STATE_TRANSITIONING:
596
		if (time_before(jiffies, pg->expiry)) {
597
			/* State transition, retry */
598 599 600 601 602 603 604
			pg->interval = 2;
			err = SCSI_DH_RETRY;
		} else {
			/* Transitioning time exceeded, set port to standby */
			err = SCSI_DH_IO;
			pg->state = TPGS_STATE_STANDBY;
			pg->expiry = 0;
605
		}
606 607
		break;
	case TPGS_STATE_OFFLINE:
608
		/* Path unusable */
609
		err = SCSI_DH_DEV_OFFLINED;
610
		pg->expiry = 0;
611 612 613 614
		break;
	default:
		/* Useable path if active */
		err = SCSI_DH_OK;
615
		pg->expiry = 0;
616
		break;
617
	}
618
	kfree(buff);
619 620 621
	return err;
}

622 623 624 625
/*
 * alua_stpg - Issue a SET TARGET PORT GROUP command
 *
 * Issue a SET TARGET PORT GROUP command and evaluate the
626 627 628
 * response. Returns SCSI_DH_RETRY per default to trigger
 * a re-evaluation of the target group state or SCSI_DH_OK
 * if no further action needs to be taken.
629
 */
630
static unsigned alua_stpg(struct scsi_device *sdev, struct alua_port_group *pg)
631
{
632 633
	int retval;
	struct scsi_sense_hdr sense_hdr;
634

635
	if (!(pg->tpgs & TPGS_MODE_EXPLICIT)) {
636 637 638
		/* Only implicit ALUA supported, retry */
		return SCSI_DH_RETRY;
	}
639
	switch (pg->state) {
640 641
	case TPGS_STATE_OPTIMIZED:
		return SCSI_DH_OK;
642
	case TPGS_STATE_NONOPTIMIZED:
643 644 645
		if ((pg->flags & ALUA_OPTIMIZE_STPG) &&
		    !pg->pref &&
		    (pg->tpgs & TPGS_MODE_IMPLICIT))
646
			return SCSI_DH_OK;
647 648 649 650 651
		break;
	case TPGS_STATE_STANDBY:
	case TPGS_STATE_UNAVAILABLE:
		break;
	case TPGS_STATE_OFFLINE:
652
		return SCSI_DH_IO;
653 654 655
	case TPGS_STATE_TRANSITIONING:
		break;
	default:
656 657
		sdev_printk(KERN_INFO, sdev,
			    "%s: stpg failed, unhandled TPGS state %d",
658
			    ALUA_DH_NAME, pg->state);
659
		return SCSI_DH_NOSYS;
660
	}
661
	retval = submit_stpg(sdev, pg->group_id, &sense_hdr);
662

663
	if (retval) {
664
		if (!scsi_sense_valid(&sense_hdr)) {
665 666 667
			sdev_printk(KERN_INFO, sdev,
				    "%s: stpg failed, result %d",
				    ALUA_DH_NAME, retval);
668
			if (driver_byte(retval) == DRIVER_ERROR)
669 670
				return SCSI_DH_DEV_TEMP_BUSY;
		} else {
671
			sdev_printk(KERN_INFO, sdev, "%s: stpg failed\n",
672 673 674
				    ALUA_DH_NAME);
			scsi_print_sense_hdr(sdev, ALUA_DH_NAME, &sense_hdr);
		}
675
	}
676 677
	/* Retry RTPG */
	return SCSI_DH_RETRY;
678 679
}

680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780
/*
 * alua_rtpg_work - workqueue handler running RTPG and (optionally) STPG
 * @work: embedded delayed_work of the port group
 *
 * Runs the RTPG/STPG state machine for one port group: executes a
 * queued RTPG (and STPG, if requested) against pg->rtpg_sdev, requeues
 * itself on SCSI_DH_RETRY, and finally completes all queued activation
 * callbacks with the resulting status.  pg->flags and pg->rtpg_list
 * are manipulated only under pg->lock; the lock is dropped around the
 * actual SCSI commands.
 */
static void alua_rtpg_work(struct work_struct *work)
{
	struct alua_port_group *pg =
		container_of(work, struct alua_port_group, rtpg_work.work);
	struct scsi_device *sdev;
	LIST_HEAD(qdata_list);
	int err = SCSI_DH_OK;
	struct alua_queue_data *qdata, *tmp;
	unsigned long flags;

	spin_lock_irqsave(&pg->lock, flags);
	sdev = pg->rtpg_sdev;
	if (!sdev) {
		/* No device: nothing may be scheduled (see alua_rtpg_queue) */
		WARN_ON(pg->flags & ALUA_PG_RUN_RTPG);
		WARN_ON(pg->flags & ALUA_PG_RUN_STPG);
		spin_unlock_irqrestore(&pg->lock, flags);
		return;
	}
	pg->flags |= ALUA_PG_RUNNING;
	if (pg->flags & ALUA_PG_RUN_RTPG) {
		pg->flags &= ~ALUA_PG_RUN_RTPG;
		spin_unlock_irqrestore(&pg->lock, flags);
		err = alua_rtpg(sdev, pg);
		spin_lock_irqsave(&pg->lock, flags);
		if (err == SCSI_DH_RETRY) {
			/* Re-arm RTPG and requeue after pg->interval seconds */
			pg->flags &= ~ALUA_PG_RUNNING;
			pg->flags |= ALUA_PG_RUN_RTPG;
			spin_unlock_irqrestore(&pg->lock, flags);
			queue_delayed_work(kaluad_wq, &pg->rtpg_work,
					   pg->interval * HZ);
			return;
		}
		/* RTPG failure cancels any pending STPG */
		if (err != SCSI_DH_OK)
			pg->flags &= ~ALUA_PG_RUN_STPG;
	}
	if (pg->flags & ALUA_PG_RUN_STPG) {
		pg->flags &= ~ALUA_PG_RUN_STPG;
		spin_unlock_irqrestore(&pg->lock, flags);
		err = alua_stpg(sdev, pg);
		spin_lock_irqsave(&pg->lock, flags);
		if (err == SCSI_DH_RETRY) {
			/* Re-run RTPG immediately (interval reset to 0) */
			pg->flags |= ALUA_PG_RUN_RTPG;
			pg->interval = 0;
			pg->flags &= ~ALUA_PG_RUNNING;
			spin_unlock_irqrestore(&pg->lock, flags);
			queue_delayed_work(kaluad_wq, &pg->rtpg_work,
					   pg->interval * HZ);
			return;
		}
	}

	/* Done: detach the sdev and complete all queued callbacks */
	list_splice_init(&pg->rtpg_list, &qdata_list);
	pg->rtpg_sdev = NULL;
	spin_unlock_irqrestore(&pg->lock, flags);

	list_for_each_entry_safe(qdata, tmp, &qdata_list, entry) {
		list_del(&qdata->entry);
		if (qdata->callback_fn)
			qdata->callback_fn(qdata->callback_data, err);
		kfree(qdata);
	}
	spin_lock_irqsave(&pg->lock, flags);
	pg->flags &= ~ALUA_PG_RUNNING;
	spin_unlock_irqrestore(&pg->lock, flags);
	/* Drop the references taken in alua_rtpg_queue() */
	scsi_device_put(sdev);
	kref_put(&pg->kref, release_port_group);
}

/*
 * alua_rtpg_queue - queue RTPG (and optionally STPG) for a port group
 * @pg: port group to evaluate (may be NULL, then this is a no-op)
 * @sdev: device to issue the commands through
 * @qdata: optional completion callback; queuing one also requests STPG
 *
 * If no evaluation is in flight, takes a reference on @pg and @sdev
 * and schedules alua_rtpg_work(); otherwise the already-scheduled run
 * picks up the newly set flags and queued callback.
 */
static void alua_rtpg_queue(struct alua_port_group *pg,
			    struct scsi_device *sdev,
			    struct alua_queue_data *qdata)
{
	int start_queue = 0;
	unsigned long flags;

	if (!pg)
		return;

	spin_lock_irqsave(&pg->lock, flags);
	if (qdata) {
		list_add_tail(&qdata->entry, &pg->rtpg_list);
		pg->flags |= ALUA_PG_RUN_STPG;
	}
	if (pg->rtpg_sdev == NULL) {
		/* No run in flight: claim @sdev and schedule the work */
		pg->interval = 0;
		pg->flags |= ALUA_PG_RUN_RTPG;
		kref_get(&pg->kref);
		pg->rtpg_sdev = sdev;
		/* NOTE(review): scsi_device_get() return value is ignored
		 * here - confirm @sdev cannot be in the process of being
		 * deleted when this is called. */
		scsi_device_get(sdev);
		start_queue = 1;
	}
	spin_unlock_irqrestore(&pg->lock, flags);

	if (start_queue &&
	    !queue_delayed_work(kaluad_wq, &pg->rtpg_work,
				msecs_to_jiffies(ALUA_RTPG_DELAY_MSECS))) {
		/* Work was already queued; drop the references taken above */
		scsi_device_put(sdev);
		kref_put(&pg->kref, release_port_group);
	}
}

781 782 783 784 785 786 787 788 789
/*
 * alua_initialize - Initialize ALUA state
 * @sdev: the device to be initialized
 *
 * For the prep_fn to work correctly we have
 * to initialize the ALUA state for the device.
 */
static int alua_initialize(struct scsi_device *sdev, struct alua_dh_data *h)
{
790
	int err = SCSI_DH_DEV_UNSUPP, tpgs;
791

792
	mutex_lock(&h->init_mutex);
793
	tpgs = alua_check_tpgs(sdev);
794 795
	if (tpgs != TPGS_MODE_NONE)
		err = alua_check_vpd(sdev, h, tpgs);
796 797
	h->init_error = err;
	mutex_unlock(&h->init_mutex);
798 799
	return err;
}
800 801 802 803 804 805 806 807 808 809 810
/*
 * alua_set_params - set/unset the optimize flag
 * @sdev: device on the path to be activated
 * params - parameters in the following format
 *      "no_of_params\0param1\0param2\0param3\0...\0"
 * For example, to set the flag pass the following parameters
 * from multipath.conf
 *     hardware_handler        "2 alua 1"
 */
static int alua_set_params(struct scsi_device *sdev, const char *params)
{
811
	struct alua_dh_data *h = sdev->handler_data;
812
	struct alua_port_group __rcu *pg = NULL;
813 814 815
	unsigned int optimize = 0, argc;
	const char *p = params;
	int result = SCSI_DH_OK;
816
	unsigned long flags;
817 818 819 820 821 822 823 824 825

	if ((sscanf(params, "%u", &argc) != 1) || (argc != 1))
		return -EINVAL;

	while (*p++)
		;
	if ((sscanf(p, "%u", &optimize) != 1) || (optimize > 1))
		return -EINVAL;

826 827 828 829
	rcu_read_lock();
	pg = rcu_dereference(h->pg);
	if (!pg) {
		rcu_read_unlock();
830
		return -ENXIO;
831 832
	}
	spin_lock_irqsave(&pg->lock, flags);
833
	if (optimize)
834
		pg->flags |= ALUA_OPTIMIZE_STPG;
835
	else
836
		pg->flags &= ~ALUA_OPTIMIZE_STPG;
837 838
	spin_unlock_irqrestore(&pg->lock, flags);
	rcu_read_unlock();
839 840 841

	return result;
}
842 843 844 845 846 847 848 849 850 851 852

/*
 * alua_activate - activate a path
 * @sdev: device on the path to be activated
 *
 * We're currently switching the port group to be activated only and
 * let the array figure out the rest.
 * There may be other arrays which require us to switch all port groups
 * based on a certain policy. But until we actually encounter them it
 * should be okay.
 */
853 854
static int alua_activate(struct scsi_device *sdev,
			activate_complete fn, void *data)
855
{
856
	struct alua_dh_data *h = sdev->handler_data;
857
	int err = SCSI_DH_OK;
858 859
	struct alua_queue_data *qdata;
	struct alua_port_group __rcu *pg;
860

861 862 863
	qdata = kzalloc(sizeof(*qdata), GFP_KERNEL);
	if (!qdata) {
		err = SCSI_DH_RES_TEMP_UNAVAIL;
864
		goto out;
865 866 867 868 869 870 871 872 873 874 875 876
	}
	qdata->callback_fn = fn;
	qdata->callback_data = data;

	mutex_lock(&h->init_mutex);
	rcu_read_lock();
	pg = rcu_dereference(h->pg);
	if (!pg || !kref_get_unless_zero(&pg->kref)) {
		rcu_read_unlock();
		kfree(qdata);
		err = h->init_error;
		mutex_unlock(&h->init_mutex);
877 878
		goto out;
	}
879 880 881 882 883 884
	fn = NULL;
	rcu_read_unlock();
	mutex_unlock(&h->init_mutex);

	alua_rtpg_queue(pg, sdev, qdata);
	kref_put(&pg->kref, release_port_group);
885
out:
886
	if (fn)
887 888
		fn(data, err);
	return 0;
889 890 891 892 893 894 895 896 897 898
}

/*
 * alua_prep_fn - request callback
 *
 * Fail I/O to all paths not in state
 * active/optimized or active/non-optimized.
 */
static int alua_prep_fn(struct scsi_device *sdev, struct request *req)
{
899
	struct alua_dh_data *h = sdev->handler_data;
900 901
	struct alua_port_group __rcu *pg;
	int state = TPGS_STATE_OPTIMIZED;
902 903
	int ret = BLKPREP_OK;

904 905 906 907 908
	rcu_read_lock();
	pg = rcu_dereference(h->pg);
	if (pg)
		state = pg->state;
	rcu_read_unlock();
909
	if (state == TPGS_STATE_TRANSITIONING)
910
		ret = BLKPREP_DEFER;
911 912 913
	else if (state != TPGS_STATE_OPTIMIZED &&
		 state != TPGS_STATE_NONOPTIMIZED &&
		 state != TPGS_STATE_LBA_DEPENDENT) {
914 915 916 917 918 919 920 921 922 923 924
		ret = BLKPREP_KILL;
		req->cmd_flags |= REQ_QUIET;
	}
	return ret;

}

/*
 * alua_bus_attach - Attach device handler
 * @sdev: device to be attached to
 */
925
static int alua_bus_attach(struct scsi_device *sdev)
926 927
{
	struct alua_dh_data *h;
928
	int err, ret = -EINVAL;
929

930
	h = kzalloc(sizeof(*h) , GFP_KERNEL);
931
	if (!h)
932
		return -ENOMEM;
933 934 935
	spin_lock_init(&h->pg_lock);
	rcu_assign_pointer(h->pg, NULL);
	h->init_error = SCSI_DH_OK;
936
	h->sdev = sdev;
937

938
	mutex_init(&h->init_mutex);
939
	err = alua_initialize(sdev, h);
940 941
	if (err == SCSI_DH_NOMEM)
		ret = -ENOMEM;
942
	if (err != SCSI_DH_OK && err != SCSI_DH_DEV_OFFLINED)
943 944
		goto failed;

945 946
	sdev->handler_data = h;
	return 0;
947
failed:
948
	kfree(h);
949
	return ret;
950 951 952 953 954 955 956 957
}

/*
 * alua_bus_detach - Detach device handler
 * @sdev: device to be detached from
 */
static void alua_bus_detach(struct scsi_device *sdev)
{
958
	struct alua_dh_data *h = sdev->handler_data;
959 960 961 962 963 964 965 966 967
	struct alua_port_group *pg;

	spin_lock(&h->pg_lock);
	pg = h->pg;
	rcu_assign_pointer(h->pg, NULL);
	h->sdev = NULL;
	spin_unlock(&h->pg_lock);
	if (pg)
		kref_put(&pg->kref, release_port_group);
968

969
	sdev->handler_data = NULL;
970
	kfree(h);
971 972
}

973 974 975 976 977 978 979 980 981 982 983
/* Device handler registration: hooks into the SCSI midlayer. */
static struct scsi_device_handler alua_dh = {
	.name = ALUA_DH_NAME,
	.module = THIS_MODULE,
	.attach = alua_bus_attach,
	.detach = alua_bus_detach,
	.prep_fn = alua_prep_fn,
	.check_sense = alua_check_sense,
	.activate = alua_activate,
	.set_params = alua_set_params,
};

984 985 986 987
/*
 * alua_init - module initialization
 *
 * Creates the RTPG workqueue and registers the device handler.
 * Returns 0 on success or a negative errno.
 *
 * Fix: previously returned SCSI_DH_DEV_TEMP_BUSY (a positive
 * device-handler status) on workqueue allocation failure; module
 * init routines must return 0 or a negative errno.
 */
static int __init alua_init(void)
{
	int r;

	kaluad_wq = alloc_workqueue("kaluad", WQ_MEM_RECLAIM, 0);
	if (!kaluad_wq)
		return -ENOMEM;

	r = scsi_register_device_handler(&alua_dh);
	if (r != 0) {
		printk(KERN_ERR "%s: Failed to register scsi device handler",
			ALUA_DH_NAME);
		destroy_workqueue(kaluad_wq);
	}
	return r;
}

/* Module teardown: unregister the handler, then drop the workqueue. */
static void __exit alua_exit(void)
{
	scsi_unregister_device_handler(&alua_dh);
	destroy_workqueue(kaluad_wq);
}

module_init(alua_init);
module_exit(alua_exit);

MODULE_DESCRIPTION("DM Multipath ALUA support");
MODULE_AUTHOR("Hannes Reinecke <hare@suse.de>");
MODULE_LICENSE("GPL");
MODULE_VERSION(ALUA_DH_VER);