/*
 * IOMMU implementation for Cell Broadband Processor Architecture
 *
 * (C) Copyright IBM Corporation 2006-2008
 *
 * Author: Jeremy Kerr <jk@ozlabs.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#undef DEBUG

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/of.h>
#include <linux/of_platform.h>
#include <linux/slab.h>
#include <linux/memblock.h>

#include <asm/prom.h>
#include <asm/iommu.h>
#include <asm/machdep.h>
#include <asm/pci-bridge.h>
#include <asm/udbg.h>
#include <asm/firmware.h>
#include <asm/cell-regs.h>

#include "cell.h"
#include "interrupt.h"

/* Define CELL_IOMMU_REAL_UNMAP to actually unmap non-used pages
 * instead of leaving them mapped to some dummy page. This can be
 * enabled once the appropriate workarounds for spider bugs are in
 * place.
 */
#define CELL_IOMMU_REAL_UNMAP

/* Define CELL_IOMMU_STRICT_PROTECTION to enforce protection of
 * IO PTEs based on the transfer direction. That can be enabled
 * once spider-net has been fixed to pass the correct direction
 * to the DMA mapping functions
 */
#define CELL_IOMMU_STRICT_PROTECTION


#define NR_IOMMUS			2

/* IOC mmap registers */
#define IOC_Reg_Size			0x2000

#define IOC_IOPT_CacheInvd		0x908
#define IOC_IOPT_CacheInvd_NE_Mask	0xffe0000000000000ul
#define IOC_IOPT_CacheInvd_IOPTE_Mask	0x000003fffffffff8ul
#define IOC_IOPT_CacheInvd_Busy		0x0000000000000001ul

#define IOC_IOST_Origin			0x918
#define IOC_IOST_Origin_E		0x8000000000000000ul
#define IOC_IOST_Origin_HW		0x0000000000000800ul
#define IOC_IOST_Origin_HL		0x0000000000000400ul

#define IOC_IO_ExcpStat			0x920
#define IOC_IO_ExcpStat_V		0x8000000000000000ul
#define IOC_IO_ExcpStat_SPF_Mask	0x6000000000000000ul
#define IOC_IO_ExcpStat_SPF_S		0x6000000000000000ul
#define IOC_IO_ExcpStat_SPF_P		0x2000000000000000ul
#define IOC_IO_ExcpStat_ADDR_Mask	0x00000007fffff000ul
#define IOC_IO_ExcpStat_RW_Mask		0x0000000000000800ul
#define IOC_IO_ExcpStat_IOID_Mask	0x00000000000007fful

#define IOC_IO_ExcpMask			0x928
#define IOC_IO_ExcpMask_SFE		0x4000000000000000ul
#define IOC_IO_ExcpMask_PFE		0x2000000000000000ul

#define IOC_IOCmd_Offset		0x1000

#define IOC_IOCmd_Cfg			0xc00
#define IOC_IOCmd_Cfg_TE		0x0000800000000000ul


/* Segment table entries */
#define IOSTE_V			0x8000000000000000ul /* valid */
#define IOSTE_H			0x4000000000000000ul /* cache hint */
#define IOSTE_PT_Base_RPN_Mask  0x3ffffffffffff000ul /* base RPN of IOPT */
#define IOSTE_NPPT_Mask		0x0000000000000fe0ul /* no. pages in IOPT */
#define IOSTE_PS_Mask		0x0000000000000007ul /* page size */
#define IOSTE_PS_4K		0x0000000000000001ul /*   - 4kB  */
#define IOSTE_PS_64K		0x0000000000000003ul /*   - 64kB */
#define IOSTE_PS_1M		0x0000000000000005ul /*   - 1MB  */
#define IOSTE_PS_16M		0x0000000000000007ul /*   - 16MB */


/* IOMMU sizing */
#define IO_SEGMENT_SHIFT	28
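/* Number of IOPTE page-number bits within one 256MB (1 << IO_SEGMENT_SHIFT)
 * segment for a given IOMMU page shift.
 */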
#define IO_PAGENO_BITS(shift)	(IO_SEGMENT_SHIFT - (shift))

/* The high bit needs to be set on every DMA address */
#define SPIDER_DMA_OFFSET	0x80000000ul

struct iommu_window {
	struct list_head list;
	struct cbe_iommu *iommu;
	unsigned long offset;
	unsigned long size;
	unsigned int ioid;
	struct iommu_table table;
};

#define NAMESIZE 8
struct cbe_iommu {
	int nid;
	char name[NAMESIZE];
	void __iomem *xlate_regs;
	void __iomem *cmd_regs;
	unsigned long *stab;
	unsigned long *ptab;
	void *pad_page;
	struct list_head windows;
};

/* Static array of iommus, one per node
 *   each contains a list of windows, keyed from dma_window property
 *   - on bus setup, look for a matching window, or create one
 *   - on dev setup, assign iommu_table ptr
 */
static struct cbe_iommu iommus[NR_IOMMUS];
static int cbe_nr_iommus;

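/* Flush the IOPTEs at 'pte' from the IOC's IOPT cache, in chunks of at most
 * 2048 (1 << 11) entries per invalidate command.
 */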
static void invalidate_tce_cache(struct cbe_iommu *iommu, unsigned long *pte,
		long n_ptes)
{
	u64 __iomem *reg;
	u64 val;
	long n;

	reg = iommu->xlate_regs + IOC_IOPT_CacheInvd;

	while (n_ptes > 0) {
		/* we can invalidate up to 1 << 11 PTEs at once */
		n = min(n_ptes, 1l << 11);
		val = (((n /*- 1*/) << 53) & IOC_IOPT_CacheInvd_NE_Mask)
			| (__pa(pte) & IOC_IOPT_CacheInvd_IOPTE_Mask)
			| IOC_IOPT_CacheInvd_Busy;

		out_be64(reg, val);
		while (in_be64(reg) & IOC_IOPT_CacheInvd_Busy)
			;

		n_ptes -= n;
		pte += n;
	}
}

static int tce_build_cell(struct iommu_table *tbl, long index, long npages,
		unsigned long uaddr, enum dma_data_direction direction,
		unsigned long attrs)
{
	int i;
	unsigned long *io_pte, base_pte;
	struct iommu_window *window =
		container_of(tbl, struct iommu_window, table);

	/* implementing proper protection causes problems with the spidernet
	 * driver - check mapping directions later, but allow read & write by
	 * default for now. */
#ifdef CELL_IOMMU_STRICT_PROTECTION
	/* to avoid referencing a global, we use a trick here to setup the
	 * protection bit. "prot" is set up to be 3 fields of 4 bits appended
	 * together for each of the 3 supported direction values. It is then
	 * shifted left so that the fields matching the desired direction
	 * land on the appropriate bits, and other bits are masked out.
	 */
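	/* 0xc48 decodes, per 4-bit field, to R+W for DMA_BIDIRECTIONAL (0),
	 * R only for DMA_TO_DEVICE (1) and W only for DMA_FROM_DEVICE (2)
	 * once shifted and masked below.
	 */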
	const unsigned long prot = 0xc48;
	base_pte =
		((prot << (52 + 4 * direction)) &
		 (CBE_IOPTE_PP_W | CBE_IOPTE_PP_R)) |
		CBE_IOPTE_M | CBE_IOPTE_SO_RW |
		(window->ioid & CBE_IOPTE_IOID_Mask);
#else
	base_pte = CBE_IOPTE_PP_W | CBE_IOPTE_PP_R | CBE_IOPTE_M |
		CBE_IOPTE_SO_RW | (window->ioid & CBE_IOPTE_IOID_Mask);
#endif
	if (unlikely(attrs & DMA_ATTR_WEAK_ORDERING))
		base_pte &= ~CBE_IOPTE_SO_RW;

	io_pte = (unsigned long *)tbl->it_base + (index - tbl->it_offset);

	for (i = 0; i < npages; i++, uaddr += (1 << tbl->it_page_shift))
		io_pte[i] = base_pte | (__pa(uaddr) & CBE_IOPTE_RPN_Mask);

	mb();

	invalidate_tce_cache(window->iommu, io_pte, npages);

	pr_debug("tce_build_cell(index=%lx,n=%lx,dir=%d,base_pte=%lx)\n",
		 index, npages, direction, base_pte);
	return 0;
}

static void tce_free_cell(struct iommu_table *tbl, long index, long npages)
{
	int i;
	unsigned long *io_pte, pte;
	struct iommu_window *window =
		container_of(tbl, struct iommu_window, table);

	pr_debug("tce_free_cell(index=%lx,n=%lx)\n", index, npages);

#ifdef CELL_IOMMU_REAL_UNMAP
	pte = 0;
#else
	/* spider bridge does PCI reads after freeing - insert a mapping
	 * to a scratch page instead of an invalid entry */
	pte = CBE_IOPTE_PP_R | CBE_IOPTE_M | CBE_IOPTE_SO_RW |
		__pa(window->iommu->pad_page) |
		(window->ioid & CBE_IOPTE_IOID_Mask);
#endif

	io_pte = (unsigned long *)tbl->it_base + (index - tbl->it_offset);

	for (i = 0; i < npages; i++)
		io_pte[i] = pte;

	mb();

	invalidate_tce_cache(window->iommu, io_pte, npages);
}

static irqreturn_t ioc_interrupt(int irq, void *data)
{
	unsigned long stat, spf;
	struct cbe_iommu *iommu = data;

	stat = in_be64(iommu->xlate_regs + IOC_IO_ExcpStat);
	spf = stat & IOC_IO_ExcpStat_SPF_Mask;

	/* Might want to rate limit it */
	printk(KERN_ERR "iommu: DMA exception 0x%016lx\n", stat);
	printk(KERN_ERR "  V=%d, SPF=[%c%c], RW=%s, IOID=0x%04x\n",
	       !!(stat & IOC_IO_ExcpStat_V),
	       (spf == IOC_IO_ExcpStat_SPF_S) ? 'S' : ' ',
	       (spf == IOC_IO_ExcpStat_SPF_P) ? 'P' : ' ',
	       (stat & IOC_IO_ExcpStat_RW_Mask) ? "Read" : "Write",
	       (unsigned int)(stat & IOC_IO_ExcpStat_IOID_Mask));
	printk(KERN_ERR "  page=0x%016lx\n",
	       stat & IOC_IO_ExcpStat_ADDR_Mask);

	/* clear interrupt */
	stat &= ~IOC_IO_ExcpStat_V;
	out_be64(iommu->xlate_regs + IOC_IO_ExcpStat, stat);

	return IRQ_HANDLED;
}

static int cell_iommu_find_ioc(int nid, unsigned long *base)
{
	struct device_node *np;
	struct resource r;

	*base = 0;

	/* First look for new style /be nodes */
	for_each_node_by_name(np, "ioc") {
		if (of_node_to_nid(np) != nid)
			continue;
		if (of_address_to_resource(np, 0, &r)) {
			printk(KERN_ERR "iommu: can't get address for %pOF\n",
			       np);
			continue;
		}
		*base = r.start;
		of_node_put(np);
		return 0;
	}

	/* Ok, let's try the old way */
	for_each_node_by_type(np, "cpu") {
		const unsigned int *nidp;
		const unsigned long *tmp;

		nidp = of_get_property(np, "node-id", NULL);
		if (nidp && *nidp == nid) {
			tmp = of_get_property(np, "ioc-translation", NULL);
			if (tmp) {
				*base = *tmp;
				of_node_put(np);
				return 0;
			}
		}
	}

	return -ENODEV;
}

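/* Allocate and zero the IO segment table, sized to cover both the dynamic
 * window and (when used) the fixed mapping window.
 */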
static void cell_iommu_setup_stab(struct cbe_iommu *iommu,
				unsigned long dbase, unsigned long dsize,
				unsigned long fbase, unsigned long fsize)
{
	struct page *page;
	unsigned long segments, stab_size;

	segments = max(dbase + dsize, fbase + fsize) >> IO_SEGMENT_SHIFT;

	pr_debug("%s: iommu[%d]: segments: %lu\n",
			__func__, iommu->nid, segments);

	/* set up the segment table */
	stab_size = segments * sizeof(unsigned long);
	page = alloc_pages_node(iommu->nid, GFP_KERNEL, get_order(stab_size));
	BUG_ON(!page);
	iommu->stab = page_address(page);
	memset(iommu->stab, 0, stab_size);
}

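/* Allocate a linear IO page table for [base, base + size) and point the
 * matching segment table entries at it. Segments inside
 * [gap_base, gap_base + gap_size) are left alone so that another window
 * (the dynamic one, in the fixed-mapping case) can own them.
 */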
static unsigned long *cell_iommu_alloc_ptab(struct cbe_iommu *iommu,
		unsigned long base, unsigned long size, unsigned long gap_base,
		unsigned long gap_size, unsigned long page_shift)
{
	struct page *page;
	int i;
	unsigned long reg, segments, pages_per_segment, ptab_size,
		      n_pte_pages, start_seg, *ptab;

	start_seg = base >> IO_SEGMENT_SHIFT;
	segments  = size >> IO_SEGMENT_SHIFT;
	pages_per_segment = 1ull << IO_PAGENO_BITS(page_shift);
	/* PTEs for each segment must start on a 4K boundary */
	pages_per_segment = max(pages_per_segment,
				(1 << 12) / sizeof(unsigned long));

	ptab_size = segments * pages_per_segment * sizeof(unsigned long);
	pr_debug("%s: iommu[%d]: ptab_size: %lu, order: %d\n", __func__,
			iommu->nid, ptab_size, get_order(ptab_size));
	page = alloc_pages_node(iommu->nid, GFP_KERNEL, get_order(ptab_size));
	BUG_ON(!page);

	ptab = page_address(page);
	memset(ptab, 0, ptab_size);

	/* number of 4K pages needed for a page table */
	n_pte_pages = (pages_per_segment * sizeof(unsigned long)) >> 12;

	pr_debug("%s: iommu[%d]: stab at %p, ptab at %p, n_pte_pages: %lu\n",
			__func__, iommu->nid, iommu->stab, ptab,
			n_pte_pages);

	/* initialise the STEs */
	reg = IOSTE_V | ((n_pte_pages - 1) << 5);

	switch (page_shift) {
	case 12: reg |= IOSTE_PS_4K;  break;
	case 16: reg |= IOSTE_PS_64K; break;
	case 20: reg |= IOSTE_PS_1M;  break;
	case 24: reg |= IOSTE_PS_16M; break;
	default: BUG();
	}

	gap_base = gap_base >> IO_SEGMENT_SHIFT;
	gap_size = gap_size >> IO_SEGMENT_SHIFT;

	pr_debug("Setting up IOMMU stab:\n");
	for (i = start_seg; i < (start_seg + segments); i++) {
		if (i >= gap_base && i < (gap_base + gap_size)) {
			pr_debug("\toverlap at %d, skipping\n", i);
			continue;
		}
		iommu->stab[i] = reg | (__pa(ptab) + (n_pte_pages << 12) *
					(i - start_seg));
		pr_debug("\t[%d] 0x%016lx\n", i, iommu->stab[i]);
	}

	return ptab;
}

static void cell_iommu_enable_hardware(struct cbe_iommu *iommu)
{
	int ret;
	unsigned long reg, xlate_base;
	unsigned int virq;

	if (cell_iommu_find_ioc(iommu->nid, &xlate_base))
		panic("%s: missing IOC register mappings for node %d\n",
		      __func__, iommu->nid);

	iommu->xlate_regs = ioremap(xlate_base, IOC_Reg_Size);
	iommu->cmd_regs = iommu->xlate_regs + IOC_IOCmd_Offset;

	/* ensure that the STEs have updated */
	mb();

	/* setup interrupts for the iommu. */
	reg = in_be64(iommu->xlate_regs + IOC_IO_ExcpStat);
	out_be64(iommu->xlate_regs + IOC_IO_ExcpStat,
			reg & ~IOC_IO_ExcpStat_V);
	out_be64(iommu->xlate_regs + IOC_IO_ExcpMask,
			IOC_IO_ExcpMask_PFE | IOC_IO_ExcpMask_SFE);

	virq = irq_create_mapping(NULL,
			IIC_IRQ_IOEX_ATI | (iommu->nid << IIC_IRQ_NODE_SHIFT));
	BUG_ON(!virq);

	ret = request_irq(virq, ioc_interrupt, 0, iommu->name, iommu);
	BUG_ON(ret);

	/* set the IOC segment table origin register (and turn on the iommu) */
	reg = IOC_IOST_Origin_E | __pa(iommu->stab) | IOC_IOST_Origin_HW;
	out_be64(iommu->xlate_regs + IOC_IOST_Origin, reg);
	in_be64(iommu->xlate_regs + IOC_IOST_Origin);

	/* turn on IO translation */
	reg = in_be64(iommu->cmd_regs + IOC_IOCmd_Cfg) | IOC_IOCmd_Cfg_TE;
	out_be64(iommu->cmd_regs + IOC_IOCmd_Cfg, reg);
}

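/* Dynamic-window-only setup: one segment/page table pair covering
 * [base, base + size) with 4K IOMMU pages, then enable the hardware.
 */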
static void cell_iommu_setup_hardware(struct cbe_iommu *iommu,
	unsigned long base, unsigned long size)
{
	cell_iommu_setup_stab(iommu, base, size, 0, 0);
	iommu->ptab = cell_iommu_alloc_ptab(iommu, base, size, 0, 0,
					    IOMMU_PAGE_SHIFT_4K);
	cell_iommu_enable_hardware(iommu);
}

#if 0/* Unused for now */
static struct iommu_window *find_window(struct cbe_iommu *iommu,
		unsigned long offset, unsigned long size)
{
	struct iommu_window *window;

	/* todo: check for overlapping (but not equal) windows */

	list_for_each_entry(window, &(iommu->windows), list) {
		if (window->offset == offset && window->size == size)
			return window;
	}

	return NULL;
}
#endif

static inline u32 cell_iommu_get_ioid(struct device_node *np)
{
	const u32 *ioid;

	ioid = of_get_property(np, "ioid", NULL);
	if (ioid == NULL) {
		printk(KERN_WARNING "iommu: missing ioid for %pOF using 0\n",
		       np);
		return 0;
	}

	return *ioid;
}

static struct iommu_table_ops cell_iommu_ops = {
	.set = tce_build_cell,
	.clear = tce_free_cell
};

static struct iommu_window * __init
cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
			unsigned long offset, unsigned long size,
			unsigned long pte_offset)
{
	struct iommu_window *window;
	struct page *page;
	u32 ioid;

	ioid = cell_iommu_get_ioid(np);

	window = kzalloc_node(sizeof(*window), GFP_KERNEL, iommu->nid);
	BUG_ON(window == NULL);

	window->offset = offset;
	window->size = size;
	window->ioid = ioid;
	window->iommu = iommu;

	window->table.it_blocksize = 16;
	window->table.it_base = (unsigned long)iommu->ptab;
	window->table.it_index = iommu->nid;
	window->table.it_page_shift = IOMMU_PAGE_SHIFT_4K;
	window->table.it_offset =
		(offset >> window->table.it_page_shift) + pte_offset;
	window->table.it_size = size >> window->table.it_page_shift;
	window->table.it_ops = &cell_iommu_ops;

	iommu_init_table(&window->table, iommu->nid);

	pr_debug("\tioid      %d\n", window->ioid);
	pr_debug("\tblocksize %ld\n", window->table.it_blocksize);
	pr_debug("\tbase      0x%016lx\n", window->table.it_base);
	pr_debug("\toffset    0x%lx\n", window->table.it_offset);
	pr_debug("\tsize      %ld\n", window->table.it_size);

	list_add(&window->list, &iommu->windows);

	if (offset != 0)
		return window;

	/* We need to map and reserve the first IOMMU page since it's used
	 * by the spider workaround. In theory, we only need to do that when
	 * running on spider but it doesn't really matter.
	 *
	 * This code also assumes that we have a window that starts at 0,
	 * which is the case on all spider based blades.
	 */
	page = alloc_pages_node(iommu->nid, GFP_KERNEL, 0);
	BUG_ON(!page);
	iommu->pad_page = page_address(page);
	clear_page(iommu->pad_page);

	__set_bit(0, window->table.it_map);
	tce_build_cell(&window->table, window->table.it_offset, 1,
		       (unsigned long)iommu->pad_page, DMA_TO_DEVICE, 0);

	return window;
}

static struct cbe_iommu *cell_iommu_for_node(int nid)
{
	int i;

	for (i = 0; i < cbe_nr_iommus; i++)
		if (iommus[i].nid == nid)
			return &iommus[i];
	return NULL;
}

static unsigned long cell_dma_nommu_offset;

static unsigned long dma_iommu_fixed_base;
static bool cell_iommu_enabled;

/* iommu_fixed_is_weak is set if booted with iommu_fixed=weak */
bool iommu_fixed_is_weak;

static struct iommu_table *cell_get_iommu_table(struct device *dev)
{
	struct iommu_window *window;
	struct cbe_iommu *iommu;

	/* Current implementation uses the first window available in that
	 * node's iommu. We -might- do something smarter later though it may
	 * never be necessary
	 */
	iommu = cell_iommu_for_node(dev_to_node(dev));
	if (iommu == NULL || list_empty(&iommu->windows)) {
		dev_err(dev, "iommu: missing iommu for %pOF (node %d)\n",
		       dev->of_node, dev_to_node(dev));
		return NULL;
	}
	window = list_entry(iommu->windows.next, struct iommu_window, list);

	return &window->table;
}

static u64 cell_iommu_get_fixed_address(struct device *dev);

static void cell_dma_dev_setup(struct device *dev)
{
	if (cell_iommu_enabled) {
		u64 addr = cell_iommu_get_fixed_address(dev);

		if (addr != OF_BAD_ADDR)
			dev->archdata.dma_offset = addr + dma_iommu_fixed_base;
		set_iommu_table_base(dev, cell_get_iommu_table(dev));
	} else {
		dev->archdata.dma_offset = cell_dma_nommu_offset;
	}
}

static void cell_pci_dma_dev_setup(struct pci_dev *dev)
{
	cell_dma_dev_setup(&dev->dev);
}

static int cell_of_bus_notify(struct notifier_block *nb, unsigned long action,
			      void *data)
{
	struct device *dev = data;

	/* We are only interested in device addition */
	if (action != BUS_NOTIFY_ADD_DEVICE)
		return 0;

	if (cell_iommu_enabled)
		dev->dma_ops = &dma_iommu_ops;
	cell_dma_dev_setup(dev);
	return 0;
}

static struct notifier_block cell_of_bus_notifier = {
	.notifier_call = cell_of_bus_notify
};

static int __init cell_iommu_get_window(struct device_node *np,
					 unsigned long *base,
					 unsigned long *size)
{
	const __be32 *dma_window;
	unsigned long index;

	/* Use ibm,dma-window if available, else, hard code ! */
	dma_window = of_get_property(np, "ibm,dma-window", NULL);
	if (dma_window == NULL) {
		*base = 0;
		*size = 0x80000000u;
		return -ENODEV;
	}

	of_parse_dma_window(np, dma_window, &index, base, size);
	return 0;
}

static struct cbe_iommu * __init cell_iommu_alloc(struct device_node *np)
{
	struct cbe_iommu *iommu;
	int nid, i;

	/* Get node ID */
	nid = of_node_to_nid(np);
	if (nid < 0) {
		printk(KERN_ERR "iommu: failed to get node for %pOF\n",
		       np);
		return NULL;
	}
	pr_debug("iommu: setting up iommu for node %d (%pOF)\n",
		 nid, np);

	/* XXX todo: If we can have multiple windows on the same IOMMU, which
	 * isn't the case today, we probably want to check here whether the
	 * iommu for that node is already set up.
	 * However, there might be an issue with getting the size right so let's
	 * ignore that for now. We might want to completely get rid of the
	 * multiple window support since the cell iommu supports per-page ioids
	 */

	if (cbe_nr_iommus >= NR_IOMMUS) {
		printk(KERN_ERR "iommu: too many IOMMUs detected ! (%pOF)\n",
		       np);
		return NULL;
	}

	/* Init base fields */
	i = cbe_nr_iommus++;
	iommu = &iommus[i];
	iommu->stab = NULL;
	iommu->nid = nid;
	snprintf(iommu->name, sizeof(iommu->name), "iommu%d", i);
	INIT_LIST_HEAD(&iommu->windows);

	return iommu;
}

static void __init cell_iommu_init_one(struct device_node *np,
				       unsigned long offset)
{
	struct cbe_iommu *iommu;
	unsigned long base, size;

	iommu = cell_iommu_alloc(np);
	if (!iommu)
		return;

	/* Obtain a window for it */
	cell_iommu_get_window(np, &base, &size);

	pr_debug("\ttranslating window 0x%lx...0x%lx\n",
		 base, base + size - 1);

	/* Initialize the hardware */
	cell_iommu_setup_hardware(iommu, base, size);

	/* Setup the iommu_table */
	cell_iommu_setup_window(iommu, np, base, size,
				offset >> IOMMU_PAGE_SHIFT_4K);
}

static void __init cell_disable_iommus(void)
{
	int node;
	unsigned long base, val;
	void __iomem *xregs, *cregs;

	/* Make sure IOC translation is disabled on all nodes */
	for_each_online_node(node) {
		if (cell_iommu_find_ioc(node, &base))
			continue;
		xregs = ioremap(base, IOC_Reg_Size);
		if (xregs == NULL)
			continue;
		cregs = xregs + IOC_IOCmd_Offset;

		pr_debug("iommu: cleaning up iommu on node %d\n", node);

		out_be64(xregs + IOC_IOST_Origin, 0);
		(void)in_be64(xregs + IOC_IOST_Origin);
		val = in_be64(cregs + IOC_IOCmd_Cfg);
		val &= ~IOC_IOCmd_Cfg_TE;
		out_be64(cregs + IOC_IOCmd_Cfg, val);
		(void)in_be64(cregs + IOC_IOCmd_Cfg);

		iounmap(xregs);
	}
}

static int __init cell_iommu_init_disabled(void)
{
	struct device_node *np = NULL;
	unsigned long base = 0, size;

	/* When no iommu is present, we use direct DMA ops */

	/* First make sure all IOC translation is turned off */
	cell_disable_iommus();

	/* If we have no Axon, we set up the spider DMA magic offset */
	if (of_find_node_by_name(NULL, "axon") == NULL)
		cell_dma_nommu_offset = SPIDER_DMA_OFFSET;

	/* Now we need to check to see where the memory is mapped
	 * in PCI space. We assume that all busses use the same dma
	 * window which is always the case so far on Cell, thus we
	 * pick up the first pci-internal node we can find and check
	 * the DMA window from there.
	 */
	for_each_node_by_name(np, "axon") {
		if (np->parent == NULL || np->parent->parent != NULL)
			continue;
		if (cell_iommu_get_window(np, &base, &size) == 0)
			break;
	}
	if (np == NULL) {
		for_each_node_by_name(np, "pci-internal") {
			if (np->parent == NULL || np->parent->parent != NULL)
				continue;
			if (cell_iommu_get_window(np, &base, &size) == 0)
				break;
		}
	}
	of_node_put(np);

	/* If we found a DMA window, we check if it's big enough to enclose
	 * all of physical memory. If not, we force enable IOMMU
	 */
	if (np && size < memblock_end_of_DRAM()) {
		printk(KERN_WARNING "iommu: force-enabled, dma window"
		       " (%ldMB) smaller than total memory (%lldMB)\n",
		       size >> 20, memblock_end_of_DRAM() >> 20);
		return -ENODEV;
	}

	cell_dma_nommu_offset += base;

	if (cell_dma_nommu_offset != 0)
		cell_pci_controller_ops.dma_dev_setup = cell_pci_dma_dev_setup;

	printk("iommu: disabled, direct DMA offset is 0x%lx\n",
	       cell_dma_nommu_offset);

	return 0;
}

/*
 *  Fixed IOMMU mapping support
 *
 *  This code adds support for setting up a fixed IOMMU mapping on certain
 *  cell machines. For 64-bit devices this avoids the performance overhead of
 *  mapping and unmapping pages at runtime. 32-bit devices are unable to use
 *  the fixed mapping.
 *
 *  The fixed mapping is established at boot, and maps all of physical memory
 *  1:1 into device space at some offset. On machines with < 30 GB of memory
 *  we set up the fixed mapping immediately above the normal IOMMU window.
 *
 *  For example a machine with 4GB of memory would end up with the normal
 *  IOMMU window from 0-2GB and the fixed mapping window from 2GB to 6GB. In
 *  this case a 64-bit device wishing to DMA to 1GB would be told to DMA to
 *  3GB, plus any offset required by firmware. The firmware offset is encoded
 *  in the "dma-ranges" property.
 *
 *  On machines with 30GB or more of memory, we are unable to place the fixed
 *  mapping above the normal IOMMU window as we would run out of address space.
 *  Instead we move the normal IOMMU window to coincide with the hash page
 *  table; this region does not need to be part of the fixed mapping as no
 *  device should ever be DMA'ing to it. We then set up the fixed mapping
 *  from 0 to 32GB.
 */

static u64 cell_iommu_get_fixed_address(struct device *dev)
{
	u64 cpu_addr, size, best_size, dev_addr = OF_BAD_ADDR;
	struct device_node *np;
	const u32 *ranges = NULL;
	int i, len, best, naddr, nsize, pna, range_size;

	/* We can be called for platform devices that have no of_node */
	np = of_node_get(dev->of_node);
	if (!np)
		goto out;

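	/* Walk up the device tree until we find a parent with a non-empty
	 * dma-ranges property. */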
	while (1) {
		naddr = of_n_addr_cells(np);
		nsize = of_n_size_cells(np);
		np = of_get_next_parent(np);
		if (!np)
			break;

		ranges = of_get_property(np, "dma-ranges", &len);

		/* Ignore empty ranges, they imply no translation required */
		if (ranges && len > 0)
			break;
	}

	if (!ranges) {
		dev_dbg(dev, "iommu: no dma-ranges found\n");
		goto out;
	}

	len /= sizeof(u32);

	pna = of_n_addr_cells(np);
	range_size = naddr + nsize + pna;

	/* dma-ranges format:
	 * child addr	: naddr cells
	 * parent addr	: pna cells
	 * size		: nsize cells
	 */
	for (i = 0, best = -1, best_size = 0; i < len; i += range_size) {
		cpu_addr = of_translate_dma_address(np, ranges + i + naddr);
		size = of_read_number(ranges + i + naddr + pna, nsize);

		if (cpu_addr == 0 && size > best_size) {
			best = i;
			best_size = size;
		}
	}

	if (best >= 0) {
		dev_addr = of_read_number(ranges + best, naddr);
	} else
		dev_dbg(dev, "iommu: no suitable range found!\n");

out:
	of_node_put(np);

	return dev_addr;
}

static bool cell_pci_iommu_bypass_supported(struct pci_dev *pdev, u64 mask)
{
	return mask == DMA_BIT_MASK(64) &&
		cell_iommu_get_fixed_address(&pdev->dev) != OF_BAD_ADDR;
}

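/* Write one 16MB IOPTE for 'addr' into the fixed-mapping page table: index
 * by 256MB segment, then by 16MB page number within that segment.
 */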
static void insert_16M_pte(unsigned long addr, unsigned long *ptab,
			   unsigned long base_pte)
{
	unsigned long segment, offset;

	segment = addr >> IO_SEGMENT_SHIFT;
	offset = (addr >> 24) - (segment << IO_PAGENO_BITS(24));
	ptab = ptab + (segment * (1 << 12) / sizeof(unsigned long));

	pr_debug("iommu: addr %lx ptab %p segment %lx offset %lx\n",
		  addr, ptab, segment, offset);

	ptab[offset] = base_pte | (__pa(addr) & CBE_IOPTE_RPN_Mask);
}

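/* Fill the fixed-mapping page table with 16MB entries covering
 * [fbase, fbase + fsize), skipping anything that overlaps the dynamic
 * window at [dbase, dbase + dsize).
 */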
static void cell_iommu_setup_fixed_ptab(struct cbe_iommu *iommu,
	struct device_node *np, unsigned long dbase, unsigned long dsize,
	unsigned long fbase, unsigned long fsize)
{
	unsigned long base_pte, uaddr, ioaddr, *ptab;

	ptab = cell_iommu_alloc_ptab(iommu, fbase, fsize, dbase, dsize, 24);

	dma_iommu_fixed_base = fbase;

	pr_debug("iommu: mapping 0x%lx pages from 0x%lx\n", fsize, fbase);

	base_pte = CBE_IOPTE_PP_W | CBE_IOPTE_PP_R | CBE_IOPTE_M |
		(cell_iommu_get_ioid(np) & CBE_IOPTE_IOID_Mask);

	if (iommu_fixed_is_weak)
		pr_info("IOMMU: Using weak ordering for fixed mapping\n");
	else {
		pr_info("IOMMU: Using strong ordering for fixed mapping\n");
		base_pte |= CBE_IOPTE_SO_RW;
	}

	for (uaddr = 0; uaddr < fsize; uaddr += (1 << 24)) {
		/* Don't touch the dynamic region */
		ioaddr = uaddr + fbase;
		if (ioaddr >= dbase && ioaddr < (dbase + dsize)) {
			pr_debug("iommu: fixed/dynamic overlap, skipping\n");
			continue;
		}

		insert_16M_pte(uaddr, ptab, base_pte);
	}

	mb();
}

static int __init cell_iommu_fixed_mapping_init(void)
{
	unsigned long dbase, dsize, fbase, fsize, hbase, hend;
	struct cbe_iommu *iommu;
	struct device_node *np;

	/* The fixed mapping is only supported on axon machines */
	np = of_find_node_by_name(NULL, "axon");
	of_node_put(np);

	if (!np) {
		pr_debug("iommu: fixed mapping disabled, no axons found\n");
		return -1;
	}

	/* We must have dma-ranges properties for fixed mapping to work */
	np = of_find_node_with_property(NULL, "dma-ranges");
	of_node_put(np);

	if (!np) {
		pr_debug("iommu: no dma-ranges found, no fixed mapping\n");
		return -1;
	}

	/* The default setup is to have the fixed mapping sit after the
	 * dynamic region, so find the top of the largest IOMMU window
	 * on any axon, then add the size of RAM and that's our max value.
	 * If that is > 32GB we have to do other shenanigans.
	 */
	fbase = 0;
	for_each_node_by_name(np, "axon") {
		cell_iommu_get_window(np, &dbase, &dsize);
		fbase = max(fbase, dbase + dsize);
	}

	fbase = _ALIGN_UP(fbase, 1 << IO_SEGMENT_SHIFT);
	fsize = memblock_phys_mem_size();

	if ((fbase + fsize) <= 0x800000000ul)
		hbase = 0; /* use the device tree window */
	else {
		/* If we're over 32 GB we need to cheat. We can't map all of
		 * RAM with the fixed mapping, and also fit the dynamic
		 * region. So try to place the dynamic region where the hash
		 * table sits, drivers never need to DMA to it, we don't
		 * need a fixed mapping for that area.
		 */
		if (!htab_address) {
			pr_debug("iommu: htab is NULL, on LPAR? Huh?\n");
			return -1;
		}
		hbase = __pa(htab_address);
		hend  = hbase + htab_size_bytes;

		/* The window must start and end on a segment boundary */
		if ((hbase != _ALIGN_UP(hbase, 1 << IO_SEGMENT_SHIFT)) ||
		    (hend != _ALIGN_UP(hend, 1 << IO_SEGMENT_SHIFT))) {
			pr_debug("iommu: hash window not segment aligned\n");
			return -1;
		}

		/* Check the hash window fits inside the real DMA window */
		for_each_node_by_name(np, "axon") {
			cell_iommu_get_window(np, &dbase, &dsize);

			if (hbase < dbase || (hend > (dbase + dsize))) {
				pr_debug("iommu: hash window doesn't fit in"
					 "real DMA window\n");
				return -1;
			}
		}

		fbase = 0;
	}

	/* Setup the dynamic regions */
	for_each_node_by_name(np, "axon") {
		iommu = cell_iommu_alloc(np);
		BUG_ON(!iommu);

		if (hbase == 0)
			cell_iommu_get_window(np, &dbase, &dsize);
		else {
			dbase = hbase;
			dsize = htab_size_bytes;
		}

		printk(KERN_DEBUG "iommu: node %d, dynamic window 0x%lx-0x%lx "
			"fixed window 0x%lx-0x%lx\n", iommu->nid, dbase,
			 dbase + dsize, fbase, fbase + fsize);

		cell_iommu_setup_stab(iommu, dbase, dsize, fbase, fsize);
		iommu->ptab = cell_iommu_alloc_ptab(iommu, dbase, dsize, 0, 0,
						    IOMMU_PAGE_SHIFT_4K);
		cell_iommu_setup_fixed_ptab(iommu, np, dbase, dsize,
					     fbase, fsize);
		cell_iommu_enable_hardware(iommu);
		cell_iommu_setup_window(iommu, np, dbase, dsize, 0);
	}

	cell_pci_controller_ops.iommu_bypass_supported =
		cell_pci_iommu_bypass_supported;
	return 0;
}

static int iommu_fixed_disabled;

static int __init setup_iommu_fixed(char *str)
{
	struct device_node *pciep;

	if (strcmp(str, "off") == 0)
		iommu_fixed_disabled = 1;

	/* If we can find a pcie-endpoint in the device tree assume that
	 * we're on a triblade or a CAB so by default the fixed mapping
	 * should be set to be weakly ordered; but only if the boot
	 * option WASN'T set for strong ordering
	 */
	pciep = of_find_node_by_type(NULL, "pcie-endpoint");

	if (strcmp(str, "weak") == 0 || (pciep && strcmp(str, "strong") != 0))
		iommu_fixed_is_weak = true;

	of_node_put(pciep);

	return 1;
}
__setup("iommu_fixed=", setup_iommu_fixed);

static int __init cell_iommu_init(void)
{
	struct device_node *np;

	/* If IOMMU is disabled or we have little enough RAM to not need
	 * to enable it, we setup a direct mapping.
	 *
	 * Note: should we make sure we have the IOMMU actually disabled ?
	 */
	if (iommu_is_off ||
	    (!iommu_force_on && memblock_end_of_DRAM() <= 0x80000000ull))
		if (cell_iommu_init_disabled() == 0)
			goto bail;

	/* Setup various callbacks */
	cell_pci_controller_ops.dma_dev_setup = cell_pci_dma_dev_setup;

	if (!iommu_fixed_disabled && cell_iommu_fixed_mapping_init() == 0)
		goto done;

	/* Create an iommu for each /axon node.  */
	for_each_node_by_name(np, "axon") {
		if (np->parent == NULL || np->parent->parent != NULL)
			continue;
		cell_iommu_init_one(np, 0);
	}

	/* Create an iommu for each toplevel /pci-internal node for
	 * old hardware/firmware
	 */
	for_each_node_by_name(np, "pci-internal") {
		if (np->parent == NULL || np->parent->parent != NULL)
			continue;
		cell_iommu_init_one(np, SPIDER_DMA_OFFSET);
	}
 done:
	/* Setup default PCI iommu ops */
	set_pci_dma_ops(&dma_iommu_ops);
	cell_iommu_enabled = true;
 bail:
	/* Register callbacks on OF platform device addition/removal
	 * to handle linking them to the right DMA operations
	 */
	bus_register_notifier(&platform_bus_type, &cell_of_bus_notifier);

	return 0;
}
machine_arch_initcall(cell, cell_iommu_init);