提交 00b65985 编写于 作者: C Chen, Kenneth W 提交者: Tony Luck

[IA64] relax per-cpu TLB requirement to DTC

Instead of pinning the per-cpu TLB entry into a DTR, use a DTC entry.  This
will free up one TLB entry for applications, or even for the kernel if the
access pattern to the per-cpu data area has high temporal locality.

Since the per-cpu area is mapped at the top of the region 7 address space,
we just need to add a special case in alt_dtlb_miss.  The physical address
of the per-cpu data is already conveniently stored in IA64_KR(PER_CPU_DATA).
The latency of alt_dtlb_miss is not affected, as we can hide all of the added
latency.  The alt_dtlb_miss handler was measured to have a latency of 23
cycles both before and after the patch.

The performance effect is massive for applications that put a lot of TLB
pressure on the CPU.  Workloads such as database online transaction
processing, or applications that use terabytes of memory, would benefit the
most.  Measurements with an industry-standard database benchmark showed a
gain of upwards of 1.6%, while smaller workloads like cpu and java also
showed a small improvement.
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
上级 a0776ec8
...@@ -374,6 +374,7 @@ ENTRY(alt_dtlb_miss) ...@@ -374,6 +374,7 @@ ENTRY(alt_dtlb_miss)
movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff) movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
mov r21=cr.ipsr mov r21=cr.ipsr
mov r31=pr mov r31=pr
mov r24=PERCPU_ADDR
;; ;;
#ifdef CONFIG_DISABLE_VHPT #ifdef CONFIG_DISABLE_VHPT
shr.u r22=r16,61 // get the region number into r21 shr.u r22=r16,61 // get the region number into r21
...@@ -386,22 +387,30 @@ ENTRY(alt_dtlb_miss) ...@@ -386,22 +387,30 @@ ENTRY(alt_dtlb_miss)
(p8) mov r29=b0 // save b0 (p8) mov r29=b0 // save b0
(p8) br.cond.dptk dtlb_fault (p8) br.cond.dptk dtlb_fault
#endif #endif
cmp.ge p10,p11=r16,r24 // access to per_cpu_data?
tbit.z p12,p0=r16,61 // access to region 6?
mov r25=PERCPU_PAGE_SHIFT << 2
mov r26=PERCPU_PAGE_SIZE
nop.m 0
nop.b 0
;;
(p10) mov r19=IA64_KR(PER_CPU_DATA)
(p11) and r19=r19,r16 // clear non-ppn fields
extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl
and r22=IA64_ISR_CODE_MASK,r20 // get the isr.code field and r22=IA64_ISR_CODE_MASK,r20 // get the isr.code field
tbit.nz p6,p7=r20,IA64_ISR_SP_BIT // is speculation bit on? tbit.nz p6,p7=r20,IA64_ISR_SP_BIT // is speculation bit on?
shr.u r18=r16,57 // move address bit 61 to bit 4
and r19=r19,r16 // clear ed, reserved bits, and PTE control bits
tbit.nz p9,p0=r20,IA64_ISR_NA_BIT // is non-access bit on? tbit.nz p9,p0=r20,IA64_ISR_NA_BIT // is non-access bit on?
;; ;;
andcm r18=0x10,r18 // bit 4=~address-bit(61) (p10) sub r19=r19,r26
(p10) mov cr.itir=r25
cmp.ne p8,p0=r0,r23 cmp.ne p8,p0=r0,r23
(p9) cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22 // check isr.code field (p9) cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22 // check isr.code field
(p12) dep r17=-1,r17,4,1 // set ma=UC for region 6 addr
(p8) br.cond.spnt page_fault (p8) br.cond.spnt page_fault
dep r21=-1,r21,IA64_PSR_ED_BIT,1 dep r21=-1,r21,IA64_PSR_ED_BIT,1
or r19=r19,r17 // insert PTE control bits into r19
;; ;;
or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6 or r19=r19,r17 // insert PTE control bits into r19
(p6) mov cr.ipsr=r21 (p6) mov cr.ipsr=r21
;; ;;
(p7) itc.d r19 // insert the TLB entry (p7) itc.d r19 // insert the TLB entry
......
...@@ -101,14 +101,6 @@ ia64_do_tlb_purge: ...@@ -101,14 +101,6 @@ ia64_do_tlb_purge:
;; ;;
srlz.d srlz.d
;; ;;
// 2. Purge DTR for PERCPU data.
movl r16=PERCPU_ADDR
mov r18=PERCPU_PAGE_SHIFT<<2
;;
ptr.d r16,r18
;;
srlz.d
;;
// 3. Purge ITR for PAL code. // 3. Purge ITR for PAL code.
GET_THIS_PADDR(r2, ia64_mca_pal_base) GET_THIS_PADDR(r2, ia64_mca_pal_base)
;; ;;
...@@ -196,22 +188,6 @@ ia64_reload_tr: ...@@ -196,22 +188,6 @@ ia64_reload_tr:
srlz.i srlz.i
srlz.d srlz.d
;; ;;
// 2. Reload DTR register for PERCPU data.
GET_THIS_PADDR(r2, ia64_mca_per_cpu_pte)
;;
movl r16=PERCPU_ADDR // vaddr
movl r18=PERCPU_PAGE_SHIFT<<2
;;
mov cr.itir=r18
mov cr.ifa=r16
;;
ld8 r18=[r2] // load per-CPU PTE
mov r16=IA64_TR_PERCPU_DATA;
;;
itr.d dtr[r16]=r18
;;
srlz.d
;;
// 3. Reload ITR for PAL code. // 3. Reload ITR for PAL code.
GET_THIS_PADDR(r2, ia64_mca_pal_pte) GET_THIS_PADDR(r2, ia64_mca_pal_pte)
;; ;;
......
...@@ -337,7 +337,7 @@ setup_gate (void) ...@@ -337,7 +337,7 @@ setup_gate (void)
void __devinit void __devinit
ia64_mmu_init (void *my_cpu_data) ia64_mmu_init (void *my_cpu_data)
{ {
unsigned long psr, pta, impl_va_bits; unsigned long pta, impl_va_bits;
extern void __devinit tlb_init (void); extern void __devinit tlb_init (void);
#ifdef CONFIG_DISABLE_VHPT #ifdef CONFIG_DISABLE_VHPT
...@@ -346,15 +346,6 @@ ia64_mmu_init (void *my_cpu_data) ...@@ -346,15 +346,6 @@ ia64_mmu_init (void *my_cpu_data)
# define VHPT_ENABLE_BIT 1 # define VHPT_ENABLE_BIT 1
#endif #endif
/* Pin mapping for percpu area into TLB */
psr = ia64_clear_ic();
ia64_itr(0x2, IA64_TR_PERCPU_DATA, PERCPU_ADDR,
pte_val(pfn_pte(__pa(my_cpu_data) >> PAGE_SHIFT, PAGE_KERNEL)),
PERCPU_PAGE_SHIFT);
ia64_set_psr(psr);
ia64_srlz_i();
/* /*
* Check if the virtually mapped linear page table (VMLPT) overlaps with a mapped * Check if the virtually mapped linear page table (VMLPT) overlaps with a mapped
* address space. The IA-64 architecture guarantees that at least 50 bits of * address space. The IA-64 architecture guarantees that at least 50 bits of
......
...@@ -29,8 +29,7 @@ ...@@ -29,8 +29,7 @@
*/ */
#define IA64_TR_KERNEL 0 /* itr0, dtr0: maps kernel image (code & data) */ #define IA64_TR_KERNEL 0 /* itr0, dtr0: maps kernel image (code & data) */
#define IA64_TR_PALCODE 1 /* itr1: maps PALcode as required by EFI */ #define IA64_TR_PALCODE 1 /* itr1: maps PALcode as required by EFI */
#define IA64_TR_PERCPU_DATA 1 /* dtr1: percpu data */ #define IA64_TR_CURRENT_STACK 1 /* dtr1: maps kernel's memory- & register-stacks */
#define IA64_TR_CURRENT_STACK 2 /* dtr2: maps kernel's memory- & register-stacks */
/* Processor status register bits: */ /* Processor status register bits: */
#define IA64_PSR_BE_BIT 1 #define IA64_PSR_BE_BIT 1
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册