提交 77ed23f8 编写于 作者: C Cliff Wickman 提交者: Ingo Molnar

x86: Fix UV BAU for non-consecutive nasids

This is a fix for the SGI Altix-UV Broadcast Assist Unit code,
which is used for TLB flushing.

Certain hardware configurations (that customers are ordering)
cause nasids (numa address space id's) to be non-consecutive.
Specifically, once you have more than 4 blades in a IRU
(Individual Rack Unit - or 1/2 rack) but less than the maximum
of 16, the nasid numbering becomes non-consecutive.  This
currently results in a 'catastrophic error' (CATERR) detected by
the firmware during OS boot.  The BAU is generating an 'INTD'
request that is targeting a non-existent nasid value. Such
configurations may also occur when a blade is configured off
because of hardware errors. (There is one UV hub per blade.)

This patch is required to support such configurations.

The problem with the tlb_uv.c code is that is using the
consecutive hub numbers as indices to the BAU distribution bit
map. These are simply the ordinal position of the hub or blade
within its partition.  It should be using physical node numbers
(pnodes), which correspond to the physical nasid values. Use of
the hub number only works as long as the nasids in the partition
are consecutive and increase with a stride of 1.

This patch changes the index to be the pnode number, thus
allowing nasids to be non-consecutive.
It also provides a table in local memory for each cpu to
translate target cpu number to target pnode and nasid.
And it improves naming to properly reflect 'node' and 'uvhub'
versus 'nasid'.
Signed-off-by: NCliff Wickman <cpw@sgi.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/r/E1QJmxX-0002Mz-Fk@eag09.americas.sgi.comSigned-off-by: NIngo Molnar <mingo@elte.hu>
上级 1d44e828
...@@ -94,6 +94,8 @@ ...@@ -94,6 +94,8 @@
/* after this # consecutive successes, bump up the throttle if it was lowered */ /* after this # consecutive successes, bump up the throttle if it was lowered */
#define COMPLETE_THRESHOLD 5 #define COMPLETE_THRESHOLD 5
#define UV_LB_SUBNODEID 0x10
/* /*
* number of entries in the destination side payload queue * number of entries in the destination side payload queue
*/ */
...@@ -124,7 +126,7 @@ ...@@ -124,7 +126,7 @@
* The distribution specification (32 bytes) is interpreted as a 256-bit * The distribution specification (32 bytes) is interpreted as a 256-bit
* distribution vector. Adjacent bits correspond to consecutive even numbered * distribution vector. Adjacent bits correspond to consecutive even numbered
* nodeIDs. The result of adding the index of a given bit to the 15-bit * nodeIDs. The result of adding the index of a given bit to the 15-bit
* 'base_dest_nodeid' field of the header corresponds to the * 'base_dest_nasid' field of the header corresponds to the
* destination nodeID associated with that specified bit. * destination nodeID associated with that specified bit.
*/ */
struct bau_target_uvhubmask { struct bau_target_uvhubmask {
...@@ -176,7 +178,7 @@ struct bau_msg_payload { ...@@ -176,7 +178,7 @@ struct bau_msg_payload {
struct bau_msg_header { struct bau_msg_header {
unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */
/* bits 5:0 */ /* bits 5:0 */
unsigned int base_dest_nodeid:15; /* nasid of the */ unsigned int base_dest_nasid:15; /* nasid of the */
/* bits 20:6 */ /* first bit in uvhub map */ /* bits 20:6 */ /* first bit in uvhub map */
unsigned int command:8; /* message type */ unsigned int command:8; /* message type */
/* bits 28:21 */ /* bits 28:21 */
...@@ -378,6 +380,10 @@ struct ptc_stats { ...@@ -378,6 +380,10 @@ struct ptc_stats {
unsigned long d_rcanceled; /* number of messages canceled by resets */ unsigned long d_rcanceled; /* number of messages canceled by resets */
}; };
struct hub_and_pnode {
short uvhub;
short pnode;
};
/* /*
* one per-cpu; to locate the software tables * one per-cpu; to locate the software tables
*/ */
...@@ -399,10 +405,12 @@ struct bau_control { ...@@ -399,10 +405,12 @@ struct bau_control {
int baudisabled; int baudisabled;
int set_bau_off; int set_bau_off;
short cpu; short cpu;
short osnode;
short uvhub_cpu; short uvhub_cpu;
short uvhub; short uvhub;
short cpus_in_socket; short cpus_in_socket;
short cpus_in_uvhub; short cpus_in_uvhub;
short partition_base_pnode;
unsigned short message_number; unsigned short message_number;
unsigned short uvhub_quiesce; unsigned short uvhub_quiesce;
short socket_acknowledge_count[DEST_Q_SIZE]; short socket_acknowledge_count[DEST_Q_SIZE];
...@@ -422,15 +430,16 @@ struct bau_control { ...@@ -422,15 +430,16 @@ struct bau_control {
int congested_period; int congested_period;
cycles_t period_time; cycles_t period_time;
long period_requests; long period_requests;
struct hub_and_pnode *target_hub_and_pnode;
}; };
static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp) static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp)
{ {
return constant_test_bit(uvhub, &dstp->bits[0]); return constant_test_bit(uvhub, &dstp->bits[0]);
} }
static inline void bau_uvhub_set(int uvhub, struct bau_target_uvhubmask *dstp) static inline void bau_uvhub_set(int pnode, struct bau_target_uvhubmask *dstp)
{ {
__set_bit(uvhub, &dstp->bits[0]); __set_bit(pnode, &dstp->bits[0]);
} }
static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp, static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp,
int nbits) int nbits)
......
...@@ -699,16 +699,17 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, ...@@ -699,16 +699,17 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm, struct mm_struct *mm,
unsigned long va, unsigned int cpu) unsigned long va, unsigned int cpu)
{ {
int tcpu;
int uvhub;
int locals = 0; int locals = 0;
int remotes = 0; int remotes = 0;
int hubs = 0; int hubs = 0;
int tcpu;
int tpnode;
struct bau_desc *bau_desc; struct bau_desc *bau_desc;
struct cpumask *flush_mask; struct cpumask *flush_mask;
struct ptc_stats *stat; struct ptc_stats *stat;
struct bau_control *bcp; struct bau_control *bcp;
struct bau_control *tbcp; struct bau_control *tbcp;
struct hub_and_pnode *hpp;
/* kernel was booted 'nobau' */ /* kernel was booted 'nobau' */
if (nobau) if (nobau)
...@@ -750,11 +751,18 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, ...@@ -750,11 +751,18 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
/* cpu statistics */
for_each_cpu(tcpu, flush_mask) { for_each_cpu(tcpu, flush_mask) {
uvhub = uv_cpu_to_blade_id(tcpu); /*
bau_uvhub_set(uvhub, &bau_desc->distribution); * The distribution vector is a bit map of pnodes, relative
if (uvhub == bcp->uvhub) * to the partition base pnode (and the partition base nasid
* in the header).
* Translate cpu to pnode and hub using an array stored
* in local memory.
*/
hpp = &bcp->socket_master->target_hub_and_pnode[tcpu];
tpnode = hpp->pnode - bcp->partition_base_pnode;
bau_uvhub_set(tpnode, &bau_desc->distribution);
if (hpp->uvhub == bcp->uvhub)
locals++; locals++;
else else
remotes++; remotes++;
...@@ -855,7 +863,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) ...@@ -855,7 +863,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
* an interrupt, but causes an error message to be returned to * an interrupt, but causes an error message to be returned to
* the sender. * the sender.
*/ */
static void uv_enable_timeouts(void) static void __init uv_enable_timeouts(void)
{ {
int uvhub; int uvhub;
int nuvhubs; int nuvhubs;
...@@ -1326,10 +1334,10 @@ static int __init uv_ptc_init(void) ...@@ -1326,10 +1334,10 @@ static int __init uv_ptc_init(void)
} }
/* /*
* initialize the sending side's sending buffers * Initialize the sending side's sending buffers.
*/ */
static void static void
uv_activation_descriptor_init(int node, int pnode) uv_activation_descriptor_init(int node, int pnode, int base_pnode)
{ {
int i; int i;
int cpu; int cpu;
...@@ -1352,11 +1360,11 @@ uv_activation_descriptor_init(int node, int pnode) ...@@ -1352,11 +1360,11 @@ uv_activation_descriptor_init(int node, int pnode)
n = pa >> uv_nshift; n = pa >> uv_nshift;
m = pa & uv_mmask; m = pa & uv_mmask;
/* the 14-bit pnode */
uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
(n << UV_DESC_BASE_PNODE_SHIFT | m)); (n << UV_DESC_BASE_PNODE_SHIFT | m));
/* /*
* initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each * Initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
* cpu even though we only use the first one; one descriptor can * cpu even though we only use the first one; one descriptor can
* describe a broadcast to 256 uv hubs. * describe a broadcast to 256 uv hubs.
*/ */
...@@ -1365,12 +1373,13 @@ uv_activation_descriptor_init(int node, int pnode) ...@@ -1365,12 +1373,13 @@ uv_activation_descriptor_init(int node, int pnode)
memset(bd2, 0, sizeof(struct bau_desc)); memset(bd2, 0, sizeof(struct bau_desc));
bd2->header.sw_ack_flag = 1; bd2->header.sw_ack_flag = 1;
/* /*
* base_dest_nodeid is the nasid of the first uvhub * The base_dest_nasid set in the message header is the nasid
* in the partition. The bit map will indicate uvhub numbers, * of the first uvhub in the partition. The bit map will
* which are 0-N in a partition. Pnodes are unique system-wide. * indicate destination pnode numbers relative to that base.
* They may not be consecutive if nasid striding is being used.
*/ */
bd2->header.base_dest_nodeid = UV_PNODE_TO_NASID(uv_partition_base_pnode); bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode);
bd2->header.dest_subnodeid = 0x10; /* the LB */ bd2->header.dest_subnodeid = UV_LB_SUBNODEID;
bd2->header.command = UV_NET_ENDPOINT_INTD; bd2->header.command = UV_NET_ENDPOINT_INTD;
bd2->header.int_both = 1; bd2->header.int_both = 1;
/* /*
...@@ -1442,7 +1451,7 @@ uv_payload_queue_init(int node, int pnode) ...@@ -1442,7 +1451,7 @@ uv_payload_queue_init(int node, int pnode)
/* /*
* Initialization of each UV hub's structures * Initialization of each UV hub's structures
*/ */
static void __init uv_init_uvhub(int uvhub, int vector) static void __init uv_init_uvhub(int uvhub, int vector, int base_pnode)
{ {
int node; int node;
int pnode; int pnode;
...@@ -1450,11 +1459,11 @@ static void __init uv_init_uvhub(int uvhub, int vector) ...@@ -1450,11 +1459,11 @@ static void __init uv_init_uvhub(int uvhub, int vector)
node = uvhub_to_first_node(uvhub); node = uvhub_to_first_node(uvhub);
pnode = uv_blade_to_pnode(uvhub); pnode = uv_blade_to_pnode(uvhub);
uv_activation_descriptor_init(node, pnode); uv_activation_descriptor_init(node, pnode, base_pnode);
uv_payload_queue_init(node, pnode); uv_payload_queue_init(node, pnode);
/* /*
* the below initialization can't be in firmware because the * The below initialization can't be in firmware because the
* messaging IRQ will be determined by the OS * messaging IRQ will be determined by the OS.
*/ */
apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits; apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
...@@ -1491,10 +1500,11 @@ calculate_destination_timeout(void) ...@@ -1491,10 +1500,11 @@ calculate_destination_timeout(void)
/* /*
* initialize the bau_control structure for each cpu * initialize the bau_control structure for each cpu
*/ */
static int __init uv_init_per_cpu(int nuvhubs) static int __init uv_init_per_cpu(int nuvhubs, int base_part_pnode)
{ {
int i; int i;
int cpu; int cpu;
int tcpu;
int pnode; int pnode;
int uvhub; int uvhub;
int have_hmaster; int have_hmaster;
...@@ -1528,6 +1538,15 @@ static int __init uv_init_per_cpu(int nuvhubs) ...@@ -1528,6 +1538,15 @@ static int __init uv_init_per_cpu(int nuvhubs)
bcp = &per_cpu(bau_control, cpu); bcp = &per_cpu(bau_control, cpu);
memset(bcp, 0, sizeof(struct bau_control)); memset(bcp, 0, sizeof(struct bau_control));
pnode = uv_cpu_hub_info(cpu)->pnode; pnode = uv_cpu_hub_info(cpu)->pnode;
if ((pnode - base_part_pnode) >= UV_DISTRIBUTION_SIZE) {
printk(KERN_EMERG
"cpu %d pnode %d-%d beyond %d; BAU disabled\n",
cpu, pnode, base_part_pnode,
UV_DISTRIBUTION_SIZE);
return 1;
}
bcp->osnode = cpu_to_node(cpu);
bcp->partition_base_pnode = uv_partition_base_pnode;
uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
*(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
bdp = &uvhub_descs[uvhub]; bdp = &uvhub_descs[uvhub];
...@@ -1536,7 +1555,7 @@ static int __init uv_init_per_cpu(int nuvhubs) ...@@ -1536,7 +1555,7 @@ static int __init uv_init_per_cpu(int nuvhubs)
bdp->pnode = pnode; bdp->pnode = pnode;
/* kludge: 'assuming' one node per socket, and assuming that /* kludge: 'assuming' one node per socket, and assuming that
disabling a socket just leaves a gap in node numbers */ disabling a socket just leaves a gap in node numbers */
socket = (cpu_to_node(cpu) & 1); socket = bcp->osnode & 1;
bdp->socket_mask |= (1 << socket); bdp->socket_mask |= (1 << socket);
sdp = &bdp->socket[socket]; sdp = &bdp->socket[socket];
sdp->cpu_number[sdp->num_cpus] = cpu; sdp->cpu_number[sdp->num_cpus] = cpu;
...@@ -1585,6 +1604,20 @@ static int __init uv_init_per_cpu(int nuvhubs) ...@@ -1585,6 +1604,20 @@ static int __init uv_init_per_cpu(int nuvhubs)
nextsocket: nextsocket:
socket++; socket++;
socket_mask = (socket_mask >> 1); socket_mask = (socket_mask >> 1);
/* each socket gets a local array of pnodes/hubs */
bcp = smaster;
bcp->target_hub_and_pnode = kmalloc_node(
sizeof(struct hub_and_pnode) *
num_possible_cpus(), GFP_KERNEL, bcp->osnode);
memset(bcp->target_hub_and_pnode, 0,
sizeof(struct hub_and_pnode) *
num_possible_cpus());
for_each_present_cpu(tcpu) {
bcp->target_hub_and_pnode[tcpu].pnode =
uv_cpu_hub_info(tcpu)->pnode;
bcp->target_hub_and_pnode[tcpu].uvhub =
uv_cpu_hub_info(tcpu)->numa_blade_id;
}
} }
} }
kfree(uvhub_descs); kfree(uvhub_descs);
...@@ -1637,21 +1670,22 @@ static int __init uv_bau_init(void) ...@@ -1637,21 +1670,22 @@ static int __init uv_bau_init(void)
spin_lock_init(&disable_lock); spin_lock_init(&disable_lock);
congested_cycles = microsec_2_cycles(congested_response_us); congested_cycles = microsec_2_cycles(congested_response_us);
if (uv_init_per_cpu(nuvhubs)) {
nobau = 1;
return 0;
}
uv_partition_base_pnode = 0x7fffffff; uv_partition_base_pnode = 0x7fffffff;
for (uvhub = 0; uvhub < nuvhubs; uvhub++) for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
if (uv_blade_nr_possible_cpus(uvhub) && if (uv_blade_nr_possible_cpus(uvhub) &&
(uv_blade_to_pnode(uvhub) < uv_partition_base_pnode)) (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
uv_partition_base_pnode = uv_blade_to_pnode(uvhub); uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
}
if (uv_init_per_cpu(nuvhubs, uv_partition_base_pnode)) {
nobau = 1;
return 0;
}
vector = UV_BAU_MESSAGE; vector = UV_BAU_MESSAGE;
for_each_possible_blade(uvhub) for_each_possible_blade(uvhub)
if (uv_blade_nr_possible_cpus(uvhub)) if (uv_blade_nr_possible_cpus(uvhub))
uv_init_uvhub(uvhub, vector); uv_init_uvhub(uvhub, vector, uv_partition_base_pnode);
uv_enable_timeouts(); uv_enable_timeouts();
alloc_intr_gate(vector, uv_bau_message_intr1); alloc_intr_gate(vector, uv_bau_message_intr1);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册