// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2016-2019 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include "habanalabs.h"
#include "../include/common/hl_boot_if.h"

#include <linux/firmware.h>
#include <linux/slab.h>

#define FW_FILE_MAX_SIZE	0x1400000 /* maximum size of 20MB */
/**
 * hl_fw_load_fw_to_device() - Load F/W code to device's memory.
 *
 * @hdev: pointer to hl_device structure.
 * @fw_name: the firmware image name
 * @dst: IO memory mapped address space to copy firmware to
 * @src_offset: offset in src FW to copy from
 * @size: amount of bytes to copy (0 to copy the whole binary)
 *
 * Copy fw code from firmware file to device memory.
 *
 * Return: 0 on success, non-zero for failure.
 */
int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
				void __iomem *dst, u32 src_offset, u32 size)
{
	const struct firmware *fw;
	const void *fw_data;
	size_t fw_size;
	int rc;

	rc = request_firmware(&fw, fw_name, hdev->dev);
	if (rc) {
		dev_err(hdev->dev, "Firmware file %s is not found!\n", fw_name);
		goto out;
	}

	fw_size = fw->size;
	if ((fw_size % 4) != 0) {
		dev_err(hdev->dev, "Illegal %s firmware size %zu\n",
			fw_name, fw_size);
		rc = -EINVAL;
		goto out;
	}

	dev_dbg(hdev->dev, "%s firmware size == %zu\n", fw_name, fw_size);

	if (fw_size > FW_FILE_MAX_SIZE) {
		dev_err(hdev->dev,
			"FW file size %zu exceeds maximum of %u bytes\n",
			fw_size, FW_FILE_MAX_SIZE);
		rc = -EINVAL;
		goto out;
	}

	if (size - src_offset > fw_size) {
		dev_err(hdev->dev,
			"size to copy(%u) and offset(%u) are invalid\n",
			size, src_offset);
		rc = -EINVAL;
		goto out;
	}

	if (size)
		fw_size = size;

	fw_data = (const void *) fw->data;

	memcpy_toio(dst, fw_data + src_offset, fw_size);

out:
	release_firmware(fw);
	return rc;
}

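/**
 * hl_fw_send_pci_access_msg() - Notify F/W about a change in PCI access.
 *
 * @hdev: pointer to hl_device structure.
 * @opcode: CPU-CP packet opcode to send.
 *
 * Build a CPU-CP packet that carries only the given opcode and send it on the
 * CPU queue without waiting for a result.
 *
 * Return: 0 on success, non-zero for failure.
 */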
int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
{
	struct cpucp_packet pkt = {};

	pkt.ctl = cpu_to_le32(opcode << CPUCP_PKT_CTL_OPCODE_SHIFT);

	return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt,
						sizeof(pkt), 0, NULL);
}

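/**
 * hl_fw_send_cpu_message() - Send a packet to the device CPU.
 *
 * @hdev: pointer to hl_device structure.
 * @hw_queue_id: ID of the H/W queue that goes to the device CPU.
 * @msg: the CPU-CP packet to send.
 * @len: size of the packet in bytes.
 * @timeout: timeout in usec for polling on the packet's fence.
 * @result: optional pointer to receive the 64-bit result from the F/W.
 *
 * Copy the packet to CPU-accessible DMA memory, submit it on the queue and
 * poll the packet's fence until the F/W acknowledges it. An F/W error code in
 * the returned control word is translated to -EIO.
 *
 * An illustrative caller (this mirrors hl_fw_test_cpu_queue() below; callers
 * normally go through the hdev->asic_funcs->send_cpu_message() hook):
 *
 *	struct cpucp_packet pkt = {};
 *	u64 result;
 *	int rc;
 *
 *	pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST << CPUCP_PKT_CTL_OPCODE_SHIFT);
 *	pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL);
 *	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt,
 *						sizeof(pkt), 0, &result);
 *
 * Return: 0 on success, negative errno for failure.
 */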
int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
				u16 len, u32 timeout, u64 *result)
{
	struct hl_hw_queue *queue = &hdev->kernel_queues[hw_queue_id];
	struct cpucp_packet *pkt;
	dma_addr_t pkt_dma_addr;
	u32 tmp, expected_ack_val;
	int rc = 0;

	pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len,
								&pkt_dma_addr);
	if (!pkt) {
		dev_err(hdev->dev,
			"Failed to allocate DMA memory for packet to CPU\n");
		return -ENOMEM;
	}

	memcpy(pkt, msg, len);

	mutex_lock(&hdev->send_cpu_message_lock);

	if (hdev->disabled)
		goto out;

	if (hdev->device_cpu_disabled) {
		rc = -EIO;
		goto out;
	}

	/* set fence to an invalid value */
	pkt->fence = UINT_MAX;

	rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, len, pkt_dma_addr);
	if (rc) {
		dev_err(hdev->dev, "Failed to send CB on CPU PQ (%d)\n", rc);
		goto out;
	}

	if (hdev->asic_prop.fw_app_security_map &
			CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN)
		expected_ack_val = queue->pi;
	else
		expected_ack_val = CPUCP_PACKET_FENCE_VAL;

	rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
				(tmp == expected_ack_val), 1000,
				timeout, true);

	hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);

	if (rc == -ETIMEDOUT) {
		dev_err(hdev->dev, "Device CPU packet timeout (0x%x)\n", tmp);
		hdev->device_cpu_disabled = true;
		goto out;
	}

	tmp = le32_to_cpu(pkt->ctl);

	rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT;
	if (rc) {
		dev_err(hdev->dev, "F/W ERROR %d for CPU packet %d\n",
			rc,
			(tmp & CPUCP_PKT_CTL_OPCODE_MASK)
						>> CPUCP_PKT_CTL_OPCODE_SHIFT);
		rc = -EIO;
	} else if (result) {
		*result = le64_to_cpu(pkt->result);
	}

out:
	mutex_unlock(&hdev->send_cpu_message_lock);

	hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, len, pkt);

	return rc;
}

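/**
 * hl_fw_unmask_irq() - Ask the F/W to unmask a single event interrupt.
 *
 * @hdev: pointer to hl_device structure.
 * @event_type: event (RAZWI IRQ) number to unmask.
 *
 * Return: 0 on success, non-zero for failure.
 */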
int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
{
	struct cpucp_packet pkt;
	u64 result;
	int rc;

	memset(&pkt, 0, sizeof(pkt));

	pkt.ctl = cpu_to_le32(CPUCP_PACKET_UNMASK_RAZWI_IRQ <<
				CPUCP_PKT_CTL_OPCODE_SHIFT);
	pkt.value = cpu_to_le64(event_type);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
						0, &result);

	if (rc)
		dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);

	return rc;
}

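/**
 * hl_fw_unmask_irq_arr() - Ask the F/W to unmask an array of event interrupts.
 *
 * @hdev: pointer to hl_device structure.
 * @irq_arr: array of event (IRQ) numbers to unmask.
 * @irq_arr_size: size of @irq_arr in bytes.
 *
 * Return: 0 on success, non-zero for failure.
 */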
int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
		size_t irq_arr_size)
{
	struct cpucp_unmask_irq_arr_packet *pkt;
	size_t total_pkt_size;
	u64 result;
	int rc;

	total_pkt_size = sizeof(struct cpucp_unmask_irq_arr_packet) +
			irq_arr_size;

	/* data should be aligned to 8 bytes so that CPU-CP can copy it */
	total_pkt_size = (total_pkt_size + 0x7) & ~0x7;

	/* total_pkt_size is cast to u16 later on */
	if (total_pkt_size > USHRT_MAX) {
		dev_err(hdev->dev, "too many elements in IRQ array\n");
		return -EINVAL;
	}

	pkt = kzalloc(total_pkt_size, GFP_KERNEL);
	if (!pkt)
		return -ENOMEM;

	pkt->length = cpu_to_le32(irq_arr_size / sizeof(irq_arr[0]));
	memcpy(&pkt->irqs, irq_arr, irq_arr_size);

	pkt->cpucp_pkt.ctl = cpu_to_le32(CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
						CPUCP_PKT_CTL_OPCODE_SHIFT);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
						total_pkt_size, 0, &result);

	if (rc)
		dev_err(hdev->dev, "failed to unmask IRQ array\n");

	kfree(pkt);

	return rc;
}

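/**
 * hl_fw_test_cpu_queue() - Sanity test of the CPU queue.
 *
 * @hdev: pointer to hl_device structure.
 *
 * Send a CPUCP_PACKET_TEST packet and check that the F/W echoes the expected
 * fence value back in the result (a wrong value is only logged).
 *
 * Return: 0 if the packet was sent successfully, non-zero otherwise.
 */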
int hl_fw_test_cpu_queue(struct hl_device *hdev)
{
	struct cpucp_packet test_pkt = {};
	u64 result;
	int rc;

	test_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST <<
					CPUCP_PKT_CTL_OPCODE_SHIFT);
	test_pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &test_pkt,
						sizeof(test_pkt), 0, &result);

	if (!rc) {
		if (result != CPUCP_PACKET_FENCE_VAL)
			dev_err(hdev->dev,
				"CPU queue test failed (%#08llx)\n", result);
	} else {
		dev_err(hdev->dev, "CPU queue test failed, error %d\n", rc);
	}

	return rc;
}

void *hl_fw_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
						dma_addr_t *dma_handle)
{
	u64 kernel_addr;

	kernel_addr = gen_pool_alloc(hdev->cpu_accessible_dma_pool, size);

	*dma_handle = hdev->cpu_accessible_dma_address +
		(kernel_addr - (u64) (uintptr_t) hdev->cpu_accessible_dma_mem);

	return (void *) (uintptr_t) kernel_addr;
}

void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
					void *vaddr)
{
	gen_pool_free(hdev->cpu_accessible_dma_pool, (u64) (uintptr_t) vaddr,
			size);
}

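/**
 * hl_fw_send_heartbeat() - Send a heartbeat packet to the device CPU.
 *
 * @hdev: pointer to hl_device structure.
 *
 * Return: 0 if the F/W answered with the expected fence value, -EIO otherwise.
 */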
int hl_fw_send_heartbeat(struct hl_device *hdev)
{
	struct cpucp_packet hb_pkt = {};
	u64 result;
	int rc;

	hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST <<
					CPUCP_PKT_CTL_OPCODE_SHIFT);
	hb_pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt,
						sizeof(hb_pkt), 0, &result);

	if ((rc) || (result != CPUCP_PACKET_FENCE_VAL))
		rc = -EIO;

	return rc;
}

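/**
 * fw_read_errors() - Read and report boot errors published by the F/W.
 *
 * @hdev: pointer to hl_device structure.
 * @boot_err0_reg: register holding the boot error bitmask.
 * @cpu_security_boot_status_reg: register holding the security boot status.
 *
 * Print a dedicated message for every known error bit; warning bits are
 * reported but are not treated as fatal.
 *
 * Return: 0 if no fatal error was detected, -EIO otherwise.
 */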
static int fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg,
		u32 cpu_security_boot_status_reg)
{
	u32 err_val, security_val;
	bool err_exists = false;

	/* Some of the firmware status codes are deprecated in newer f/w
	 * versions. In those versions, the errors are reported
	 * in different registers. Therefore, we need to check those
	 * registers and print the exact errors. Moreover, there
	 * may be multiple errors, so we need to report on each error
	 * separately. Some of the error codes might indicate a state
	 * that is not an error per se, but is considered an error in
	 * a production environment
	 */
	err_val = RREG32(boot_err0_reg);
	if (!(err_val & CPU_BOOT_ERR0_ENABLED))
		return 0;

	if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) {
		dev_err(hdev->dev,
			"Device boot error - DRAM initialization failed\n");
		err_exists = true;
	}

	if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) {
		dev_err(hdev->dev, "Device boot error - FIT image corrupted\n");
		err_exists = true;
	}

	if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) {
		dev_err(hdev->dev,
			"Device boot error - Thermal Sensor initialization failed\n");
		err_exists = true;
	}

	if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) {
		dev_warn(hdev->dev,
			"Device boot warning - Skipped DRAM initialization\n");
		/* This is a warning so we don't want it to disable the
		 * device
		 */
		err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED;
	}

	if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) {
		if (hdev->bmc_enable) {
			dev_err(hdev->dev,
				"Device boot error - Skipped waiting for BMC\n");
			err_exists = true;
		} else {
			dev_info(hdev->dev,
				"Device boot message - Skipped waiting for BMC\n");
			/* This is an info so we don't want it to disable the
			 * device
			 */
			err_val &= ~CPU_BOOT_ERR0_BMC_WAIT_SKIPPED;
		}
	}

	if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) {
		dev_err(hdev->dev,
			"Device boot error - Serdes data from BMC not available\n");
		err_exists = true;
	}

	if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) {
		dev_err(hdev->dev,
			"Device boot error - NIC F/W initialization failed\n");
		err_exists = true;
	}

	if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) {
		dev_warn(hdev->dev,
			"Device boot warning - security not ready\n");
		/* This is a warning so we don't want it to disable the
		 * device
		 */
		err_val &= ~CPU_BOOT_ERR0_SECURITY_NOT_RDY;
	}

	if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) {
		dev_err(hdev->dev, "Device boot error - security failure\n");
		err_exists = true;
	}

	if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) {
		dev_err(hdev->dev, "Device boot error - eFuse failure\n");
		err_exists = true;
	}

	if (err_val & CPU_BOOT_ERR0_PLL_FAIL) {
		dev_err(hdev->dev, "Device boot error - PLL failure\n");
		err_exists = true;
	}

	if (err_val & CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL) {
		dev_err(hdev->dev,
			"Device boot error - device unusable\n");
		err_exists = true;
	}

	security_val = RREG32(cpu_security_boot_status_reg);
	if (security_val & CPU_BOOT_DEV_STS0_ENABLED)
		dev_dbg(hdev->dev, "Device security status %#x\n",
				security_val);

	if (!err_exists && (err_val & ~CPU_BOOT_ERR0_ENABLED)) {
		dev_err(hdev->dev,
			"Device boot error - unknown error 0x%08x\n",
			err_val);
		err_exists = true;
	}

	if (err_exists)
		return -EIO;

	return 0;
}

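/**
 * hl_fw_cpucp_info_get() - Fetch the cpucp_info structure from the F/W.
 *
 * @hdev: pointer to hl_device structure.
 * @cpu_security_boot_status_reg: register holding the security boot status.
 * @boot_err0_reg: register holding the boot error bitmask.
 *
 * Copy the cpucp_info reported by the F/W into the ASIC properties, build the
 * hwmon channel info from the reported sensors and re-read the F/W
 * application security bits if they are valid.
 *
 * Return: 0 on success, negative errno for failure.
 */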
int hl_fw_cpucp_info_get(struct hl_device *hdev,
			u32 cpu_security_boot_status_reg,
			u32 boot_err0_reg)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	struct cpucp_packet pkt = {};
	void *cpucp_info_cpu_addr;
	dma_addr_t cpucp_info_dma_addr;
	u64 result;
	int rc;

	cpucp_info_cpu_addr =
			hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev,
					sizeof(struct cpucp_info),
					&cpucp_info_dma_addr);
	if (!cpucp_info_cpu_addr) {
		dev_err(hdev->dev,
			"Failed to allocate DMA memory for CPU-CP info packet\n");
		return -ENOMEM;
	}

	memset(cpucp_info_cpu_addr, 0, sizeof(struct cpucp_info));

	pkt.ctl = cpu_to_le32(CPUCP_PACKET_INFO_GET <<
				CPUCP_PKT_CTL_OPCODE_SHIFT);
	pkt.addr = cpu_to_le64(cpucp_info_dma_addr);
	pkt.data_max_size = cpu_to_le32(sizeof(struct cpucp_info));

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
					HL_CPUCP_INFO_TIMEOUT_USEC, &result);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to handle CPU-CP info pkt, error %d\n", rc);
		goto out;
	}

	rc = fw_read_errors(hdev, boot_err0_reg, cpu_security_boot_status_reg);
	if (rc) {
		dev_err(hdev->dev, "Errors in device boot\n");
		goto out;
	}

	memcpy(&prop->cpucp_info, cpucp_info_cpu_addr,
			sizeof(prop->cpucp_info));

	rc = hl_build_hwmon_channel_info(hdev, prop->cpucp_info.sensors);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to build hwmon channel info, error %d\n", rc);
		rc = -EFAULT;
		goto out;
	}

	/* Read FW application security bits again */
	if (hdev->asic_prop.fw_security_status_valid)
		hdev->asic_prop.fw_app_security_map =
				RREG32(cpu_security_boot_status_reg);

out:
	hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
			sizeof(struct cpucp_info), cpucp_info_cpu_addr);

	return rc;
}

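/**
 * hl_fw_send_msi_info_msg() - Send the host MSI information to the F/W.
 *
 * @hdev: pointer to hl_device structure.
 *
 * Skipped on ASICs that do not implement get_msi_info(). If the F/W replies
 * with an invalid-packet result, the feature is not supported and the boot
 * continues with the default MSI values.
 *
 * Return: 0 on success, negative errno for failure.
 */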
static int hl_fw_send_msi_info_msg(struct hl_device *hdev)
{
	struct cpucp_array_data_packet *pkt;
	size_t total_pkt_size, data_size;
	u64 result;
	int rc;

	/* skip sending this info for unsupported ASICs */
	if (!hdev->asic_funcs->get_msi_info)
		return 0;

	data_size = CPUCP_NUM_OF_MSI_TYPES * sizeof(u32);
	total_pkt_size = sizeof(struct cpucp_array_data_packet) + data_size;

	/* data should be aligned to 8 bytes so that CPU-CP can copy it */
	total_pkt_size = (total_pkt_size + 0x7) & ~0x7;

	/* total_pkt_size is cast to u16 later on */
	if (total_pkt_size > USHRT_MAX) {
		dev_err(hdev->dev, "CPUCP array data is too big\n");
		return -EINVAL;
	}

	pkt = kzalloc(total_pkt_size, GFP_KERNEL);
	if (!pkt)
		return -ENOMEM;

	pkt->length = cpu_to_le32(CPUCP_NUM_OF_MSI_TYPES);

	hdev->asic_funcs->get_msi_info((u32 *)&pkt->data);

	pkt->cpucp_pkt.ctl = cpu_to_le32(CPUCP_PACKET_MSI_INFO_SET <<
						CPUCP_PKT_CTL_OPCODE_SHIFT);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *)pkt,
						total_pkt_size, 0, &result);

	/*
	 * In case the packet result is invalid, it means the FW does not
	 * support this feature and will use default/hard-coded MSI values.
	 * There is no reason to stop the boot.
	 */
	if (rc && result == cpucp_packet_invalid)
		rc = 0;

	if (rc)
		dev_err(hdev->dev, "failed to send CPUCP array data\n");

	kfree(pkt);

	return rc;
}

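/**
 * hl_fw_cpucp_handshake() - Perform the initial handshake with CPU-CP.
 *
 * @hdev: pointer to hl_device structure.
 * @cpu_security_boot_status_reg: register holding the security boot status.
 * @boot_err0_reg: register holding the boot error bitmask.
 *
 * Fetch the cpucp_info and then publish the host MSI information to the F/W.
 *
 * Return: 0 on success, negative errno for failure.
 */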
int hl_fw_cpucp_handshake(struct hl_device *hdev,
			u32 cpu_security_boot_status_reg,
			u32 boot_err0_reg)
{
	int rc;

	rc = hl_fw_cpucp_info_get(hdev, cpu_security_boot_status_reg,
					boot_err0_reg);
	if (rc)
		return rc;

	return hl_fw_send_msi_info_msg(hdev);
}

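/**
 * hl_fw_get_eeprom_data() - Read the device EEPROM data through CPU-CP.
 *
 * @hdev: pointer to hl_device structure.
 * @data: buffer to copy the EEPROM data to.
 * @max_size: size of the caller's buffer in bytes.
 *
 * The F/W returns the actual EEPROM data size in the packet result; at most
 * @max_size bytes are copied to @data.
 *
 * Return: 0 on success, negative errno for failure.
 */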
int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
{
	struct cpucp_packet pkt = {};
	void *eeprom_info_cpu_addr;
	dma_addr_t eeprom_info_dma_addr;
	u64 result;
	int rc;

	eeprom_info_cpu_addr =
			hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev,
					max_size, &eeprom_info_dma_addr);
	if (!eeprom_info_cpu_addr) {
		dev_err(hdev->dev,
			"Failed to allocate DMA memory for CPU-CP EEPROM packet\n");
		return -ENOMEM;
	}

	memset(eeprom_info_cpu_addr, 0, max_size);

	pkt.ctl = cpu_to_le32(CPUCP_PACKET_EEPROM_DATA_GET <<
				CPUCP_PKT_CTL_OPCODE_SHIFT);
	pkt.addr = cpu_to_le64(eeprom_info_dma_addr);
	pkt.data_max_size = cpu_to_le32(max_size);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
			HL_CPUCP_EEPROM_TIMEOUT_USEC, &result);

	if (rc) {
		dev_err(hdev->dev,
			"Failed to handle CPU-CP EEPROM packet, error %d\n",
			rc);
		goto out;
	}

	/* result contains the actual size */
	memcpy(data, eeprom_info_cpu_addr, min((size_t)result, max_size));

out:
	hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, max_size,
			eeprom_info_cpu_addr);

	return rc;
}

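/**
 * hl_fw_cpucp_pci_counters_get() - Read the PCIe counters from the F/W.
 *
 * @hdev: pointer to hl_device structure.
 * @counters: output structure for the rx/tx throughput and replay counters.
 *
 * Return: 0 on success, negative errno for failure.
 */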
int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
		struct hl_info_pci_counters *counters)
{
	struct cpucp_packet pkt = {};
	u64 result;
	int rc;

	pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_THROUGHPUT_GET <<
			CPUCP_PKT_CTL_OPCODE_SHIFT);

	/* Fetch PCI rx counter */
	pkt.index = cpu_to_le32(cpucp_pcie_throughput_rx);
	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
					HL_CPUCP_INFO_TIMEOUT_USEC, &result);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
		return rc;
	}
	counters->rx_throughput = result;

	memset(&pkt, 0, sizeof(pkt));
	pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_THROUGHPUT_GET <<
			CPUCP_PKT_CTL_OPCODE_SHIFT);

	/* Fetch PCI tx counter */
	pkt.index = cpu_to_le32(cpucp_pcie_throughput_tx);
	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
					HL_CPUCP_INFO_TIMEOUT_USEC, &result);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
		return rc;
	}
	counters->tx_throughput = result;

	/* Fetch PCI replay counter */
	memset(&pkt, 0, sizeof(pkt));
	pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_REPLAY_CNT_GET <<
			CPUCP_PKT_CTL_OPCODE_SHIFT);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
			HL_CPUCP_INFO_TIMEOUT_USEC, &result);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
		return rc;
	}
	counters->replay_cnt = (u32) result;

	return rc;
}

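/**
 * hl_fw_cpucp_total_energy_get() - Read the total energy consumption.
 *
 * @hdev: pointer to hl_device structure.
 * @total_energy: output parameter for the energy value reported by the F/W.
 *
 * Return: 0 on success, negative errno for failure.
 */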
int hl_fw_cpucp_total_energy_get(struct hl_device *hdev, u64 *total_energy)
{
	struct cpucp_packet pkt = {};
	u64 result;
	int rc;

	pkt.ctl = cpu_to_le32(CPUCP_PACKET_TOTAL_ENERGY_GET <<
				CPUCP_PKT_CTL_OPCODE_SHIFT);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
					HL_CPUCP_INFO_TIMEOUT_USEC, &result);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to handle CpuCP total energy pkt, error %d\n",
				rc);
		return rc;
	}

	*total_energy = result;

	return rc;
}

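/**
 * get_used_pll_index() - Translate a driver PLL index to the F/W PLL index.
 *
 * @hdev: pointer to hl_device structure.
 * @input_pll_index: driver (ASIC specific) PLL index.
 * @pll_index: output parameter for the index understood by the F/W.
 *
 * With legacy F/W the driver index is used as-is. With dynamic PLL support,
 * the index is mapped through the ASIC callback and validated against the
 * PLL map reported in the cpucp_info.
 *
 * Return: 0 on success, -EINVAL if the index cannot be mapped or is not
 * supported.
 */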
int get_used_pll_index(struct hl_device *hdev, u32 input_pll_index,
						enum pll_index *pll_index)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	u8 pll_byte, pll_bit_off;
	bool dynamic_pll;
	int fw_pll_idx;

	dynamic_pll = prop->fw_security_status_valid &&
		(prop->fw_app_security_map & CPU_BOOT_DEV_STS0_DYN_PLL_EN);

	if (!dynamic_pll) {
		/*
		 * In case we are working with legacy FW (each ASIC has a
		 * unique PLL numbering), use the driver-based index as it is
		 * aligned with the FW legacy numbering
		 */
		*pll_index = input_pll_index;
		return 0;
	}

	/* retrieve a FW compatible PLL index based on
	 * ASIC specific user request
	 */
	fw_pll_idx = hdev->asic_funcs->map_pll_idx_to_fw_idx(input_pll_index);
	if (fw_pll_idx < 0) {
		dev_err(hdev->dev, "Invalid PLL index (%u) error %d\n",
			input_pll_index, fw_pll_idx);
		return -EINVAL;
	}

	/* PLL map is a u8 array */
	pll_byte = prop->cpucp_info.pll_map[fw_pll_idx >> 3];
	pll_bit_off = fw_pll_idx & 0x7;

	if (!(pll_byte & BIT(pll_bit_off))) {
		dev_err(hdev->dev, "PLL index %d is not supported\n",
			fw_pll_idx);
		return -EINVAL;
	}

	*pll_index = fw_pll_idx;

	return 0;
}

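/**
 * hl_fw_cpucp_pll_info_get() - Read the output frequencies of a PLL.
 *
 * @hdev: pointer to hl_device structure.
 * @pll_index: driver PLL index, translated via get_used_pll_index().
 * @pll_freq_arr: output array, must hold at least four entries (one per PLL
 *                output).
 *
 * Return: 0 on success, negative errno for failure.
 */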
int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u32 pll_index,
		u16 *pll_freq_arr)
{
	struct cpucp_packet pkt;
	enum pll_index used_pll_idx;
	u64 result;
	int rc;

	rc = get_used_pll_index(hdev, pll_index, &used_pll_idx);
	if (rc)
		return rc;

	memset(&pkt, 0, sizeof(pkt));

	pkt.ctl = cpu_to_le32(CPUCP_PACKET_PLL_INFO_GET <<
				CPUCP_PKT_CTL_OPCODE_SHIFT);
	pkt.pll_type = __cpu_to_le16((u16)used_pll_idx);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
			HL_CPUCP_INFO_TIMEOUT_USEC, &result);
	if (rc)
		dev_err(hdev->dev, "Failed to read PLL info, error %d\n", rc);

	pll_freq_arr[0] = FIELD_GET(CPUCP_PKT_RES_PLL_OUT0_MASK, result);
	pll_freq_arr[1] = FIELD_GET(CPUCP_PKT_RES_PLL_OUT1_MASK, result);
	pll_freq_arr[2] = FIELD_GET(CPUCP_PKT_RES_PLL_OUT2_MASK, result);
	pll_freq_arr[3] = FIELD_GET(CPUCP_PKT_RES_PLL_OUT3_MASK, result);

	return rc;
}

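/**
 * hl_fw_cpucp_power_get() - Read the current device power from the F/W.
 *
 * @hdev: pointer to hl_device structure.
 * @power: output parameter for the power value reported by the F/W.
 *
 * Return: 0 on success, negative errno for failure.
 */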
int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power)
{
	struct cpucp_packet pkt;
	u64 result;
	int rc;

	memset(&pkt, 0, sizeof(pkt));

	pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_GET <<
				CPUCP_PKT_CTL_OPCODE_SHIFT);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
			HL_CPUCP_INFO_TIMEOUT_USEC, &result);
	if (rc) {
		dev_err(hdev->dev, "Failed to read power, error %d\n", rc);
		return rc;
	}

	*power = result;

	return rc;
}

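/**
 * detect_cpu_boot_status() - Print a message matching the CPU boot status.
 *
 * @hdev: pointer to hl_device structure.
 * @status: value read from the CPU boot status register.
 */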
static void detect_cpu_boot_status(struct hl_device *hdev, u32 status)
{
	/* Some of the status codes below are deprecated in newer f/w
	 * versions but we keep them here for backward compatibility
	 */
	switch (status) {
	case CPU_BOOT_STATUS_NA:
		dev_err(hdev->dev,
			"Device boot error - BTL did NOT run\n");
		break;
	case CPU_BOOT_STATUS_IN_WFE:
		dev_err(hdev->dev,
			"Device boot error - Stuck inside WFE loop\n");
		break;
	case CPU_BOOT_STATUS_IN_BTL:
		dev_err(hdev->dev,
			"Device boot error - Stuck in BTL\n");
		break;
	case CPU_BOOT_STATUS_IN_PREBOOT:
		dev_err(hdev->dev,
			"Device boot error - Stuck in Preboot\n");
		break;
	case CPU_BOOT_STATUS_IN_SPL:
		dev_err(hdev->dev,
			"Device boot error - Stuck in SPL\n");
		break;
	case CPU_BOOT_STATUS_IN_UBOOT:
		dev_err(hdev->dev,
			"Device boot error - Stuck in u-boot\n");
		break;
	case CPU_BOOT_STATUS_DRAM_INIT_FAIL:
		dev_err(hdev->dev,
			"Device boot error - DRAM initialization failed\n");
		break;
	case CPU_BOOT_STATUS_UBOOT_NOT_READY:
		dev_err(hdev->dev,
			"Device boot error - u-boot stopped by user\n");
		break;
	case CPU_BOOT_STATUS_TS_INIT_FAIL:
		dev_err(hdev->dev,
			"Device boot error - Thermal Sensor initialization failed\n");
		break;
	default:
		dev_err(hdev->dev,
			"Device boot error - Invalid status code %d\n",
			status);
		break;
	}
}

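/**
 * hl_fw_read_preboot_status() - Wait for preboot and read its status.
 *
 * @hdev: pointer to hl_device structure.
 * @cpu_boot_status_reg: register holding the CPU boot status.
 * @cpu_security_boot_status_reg: register holding the security boot status.
 * @boot_err0_reg: register holding the boot error bitmask.
 * @timeout: timeout in usec for polling the boot status.
 *
 * Wait for preboot to reach a ready state, read the preboot F/W version and
 * derive the security properties (security enabled, hard reset done by F/W)
 * from the security boot status register.
 *
 * Return: 0 on success, negative errno for failure.
 */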
int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg,
		u32 cpu_security_boot_status_reg, u32 boot_err0_reg,
		u32 timeout)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	u32 status, security_status;
	int rc;

	/* pldm was added for cases in which we use preboot on pldm and want
	 * to load boot fit, but we can't wait for preboot because it runs
	 * very slowly
	 */
	if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) || hdev->pldm)
		return 0;

	/* Need to check two possible scenarios:
	 *
	 * CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT - for newer firmwares where
	 * the preboot is waiting for the boot fit
	 *
	 * All other status values - for older firmwares where the uboot was
	 * loaded from the FLASH
	 */
	rc = hl_poll_timeout(
		hdev,
		cpu_boot_status_reg,
		status,
		(status == CPU_BOOT_STATUS_IN_UBOOT) ||
		(status == CPU_BOOT_STATUS_DRAM_RDY) ||
		(status == CPU_BOOT_STATUS_NIC_FW_RDY) ||
		(status == CPU_BOOT_STATUS_READY_TO_BOOT) ||
		(status == CPU_BOOT_STATUS_SRAM_AVAIL) ||
		(status == CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT),
		10000,
		timeout);

	if (rc) {
		dev_err(hdev->dev, "Failed to read preboot version\n");
		detect_cpu_boot_status(hdev, status);
		fw_read_errors(hdev, boot_err0_reg,
				cpu_security_boot_status_reg);
		return -EIO;
	}

	rc = hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_PREBOOT);
	if (rc)
		return rc;

	security_status = RREG32(cpu_security_boot_status_reg);

	/* We read security status multiple times during boot:
	 * 1. preboot - a. Check whether the security status bits are valid
	 *              b. Check whether fw security is enabled
	 *              c. Check whether hard reset is done by preboot
	 * 2. boot cpu - a. Fetch boot cpu security status
	 *               b. Check whether hard reset is done by boot cpu
	 * 3. FW application - a. Fetch fw application security status
	 *                     b. Check whether hard reset is done by fw app
	 *
	 * Preboot:
	 * Check security status bit (CPU_BOOT_DEV_STS0_ENABLED), if it is set
	 * check security enabled bit (CPU_BOOT_DEV_STS0_SECURITY_EN)
	 */
	if (security_status & CPU_BOOT_DEV_STS0_ENABLED) {
		prop->fw_security_status_valid = 1;

		/* FW security should be derived from PCI ID, we keep this
		 * check for backward compatibility
		 */
		if (security_status & CPU_BOOT_DEV_STS0_SECURITY_EN)
			prop->fw_security_disabled = false;

		if (security_status & CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
			prop->hard_reset_done_by_fw = true;
	} else {
		prop->fw_security_status_valid = 0;
	}

	dev_dbg(hdev->dev, "Firmware preboot security status %#x\n",
			security_status);

	dev_dbg(hdev->dev, "Firmware preboot hard-reset is %s\n",
			prop->hard_reset_done_by_fw ? "enabled" : "disabled");

	dev_info(hdev->dev, "firmware-level security is %s\n",
			prop->fw_security_disabled ? "disabled" : "enabled");

	return 0;
}

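/**
 * hl_fw_init_cpu() - Load and boot the device CPU firmware.
 *
 * @hdev: pointer to hl_device structure.
 * @cpu_boot_status_reg: register holding the CPU boot status.
 * @msg_to_cpu_reg: register used to send messages to the device CPU.
 * @cpu_msg_status_reg: register holding the device CPU message status.
 * @cpu_security_boot_status_reg: register holding the security boot status.
 * @boot_err0_reg: register holding the boot error bitmask.
 * @skip_bmc: true to instruct the device CPU to skip waiting for the BMC.
 * @cpu_timeout: timeout in usec for the CPU boot polling.
 * @boot_fit_timeout: timeout in usec for the boot fit load handshake.
 *
 * Optionally load the boot fit, wait for the CPU boot loader, then load the
 * Linux F/W image (unless FW_TYPE_LINUX was not requested) and collect the
 * security status published by each boot stage.
 *
 * Return: 0 on success, negative errno for failure.
 */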
int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
			u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
			u32 cpu_security_boot_status_reg, u32 boot_err0_reg,
			bool skip_bmc, u32 cpu_timeout, u32 boot_fit_timeout)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	u32 status;
	int rc;

	if (!(hdev->fw_components & FW_TYPE_BOOT_CPU))
		return 0;

	dev_info(hdev->dev, "Going to wait for device boot (up to %lds)\n",
		cpu_timeout / USEC_PER_SEC);

	/* Wait for boot FIT request */
	rc = hl_poll_timeout(
		hdev,
		cpu_boot_status_reg,
		status,
		status == CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT,
		10000,
		boot_fit_timeout);

	if (rc) {
		dev_dbg(hdev->dev,
			"No boot fit request received, resuming boot\n");
	} else {
		rc = hdev->asic_funcs->load_boot_fit_to_device(hdev);
		if (rc)
			goto out;

		/* Clear device CPU message status */
		WREG32(cpu_msg_status_reg, CPU_MSG_CLR);

		/* Signal device CPU that boot loader is ready */
		WREG32(msg_to_cpu_reg, KMD_MSG_FIT_RDY);

		/* Poll for CPU device ack */
		rc = hl_poll_timeout(
			hdev,
			cpu_msg_status_reg,
			status,
			status == CPU_MSG_OK,
			10000,
			boot_fit_timeout);

		if (rc) {
			dev_err(hdev->dev,
				"Timeout waiting for boot fit load ack\n");
			goto out;
		}

		/* Clear message */
		WREG32(msg_to_cpu_reg, KMD_MSG_NA);
	}

	/* Make sure CPU boot-loader is running */
	rc = hl_poll_timeout(
		hdev,
		cpu_boot_status_reg,
		status,
		(status == CPU_BOOT_STATUS_DRAM_RDY) ||
		(status == CPU_BOOT_STATUS_NIC_FW_RDY) ||
		(status == CPU_BOOT_STATUS_READY_TO_BOOT) ||
		(status == CPU_BOOT_STATUS_SRAM_AVAIL),
		10000,
		cpu_timeout);

	dev_dbg(hdev->dev, "uboot status = %d\n", status);

	/* Read U-Boot version now in case we will later fail */
	hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_UBOOT);

	/* Clear reset status since we need to read it again from boot CPU */
	prop->hard_reset_done_by_fw = false;

	/* Read boot_cpu security bits */
	if (prop->fw_security_status_valid) {
		prop->fw_boot_cpu_security_map =
				RREG32(cpu_security_boot_status_reg);

		if (prop->fw_boot_cpu_security_map &
				CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
			prop->hard_reset_done_by_fw = true;

		dev_dbg(hdev->dev,
			"Firmware boot CPU security status %#x\n",
			prop->fw_boot_cpu_security_map);
	}

	dev_dbg(hdev->dev, "Firmware boot CPU hard-reset is %s\n",
			prop->hard_reset_done_by_fw ? "enabled" : "disabled");

	if (rc) {
		detect_cpu_boot_status(hdev, status);
		rc = -EIO;
		goto out;
	}

	if (!(hdev->fw_components & FW_TYPE_LINUX)) {
		dev_info(hdev->dev, "Skip loading Linux F/W\n");
		goto out;
	}

	if (status == CPU_BOOT_STATUS_SRAM_AVAIL)
		goto out;

	dev_info(hdev->dev,
		"Loading firmware to device, may take some time...\n");

	rc = hdev->asic_funcs->load_firmware_to_device(hdev);
	if (rc)
		goto out;

	if (skip_bmc) {
		WREG32(msg_to_cpu_reg, KMD_MSG_SKIP_BMC);

		rc = hl_poll_timeout(
			hdev,
			cpu_boot_status_reg,
			status,
			(status == CPU_BOOT_STATUS_BMC_WAITING_SKIPPED),
			10000,
			cpu_timeout);

		if (rc) {
			dev_err(hdev->dev,
				"Failed to get ACK on skipping BMC, %d\n",
				status);
			WREG32(msg_to_cpu_reg, KMD_MSG_NA);
			rc = -EIO;
			goto out;
		}
	}

	WREG32(msg_to_cpu_reg, KMD_MSG_FIT_RDY);

	rc = hl_poll_timeout(
		hdev,
		cpu_boot_status_reg,
		status,
		(status == CPU_BOOT_STATUS_SRAM_AVAIL),
		10000,
		cpu_timeout);

	/* Clear message */
	WREG32(msg_to_cpu_reg, KMD_MSG_NA);

	if (rc) {
		if (status == CPU_BOOT_STATUS_FIT_CORRUPTED)
			dev_err(hdev->dev,
				"Device reports FIT image is corrupted\n");
		else
			dev_err(hdev->dev,
				"Failed to load firmware to device, %d\n",
				status);

		rc = -EIO;
		goto out;
	}

	rc = fw_read_errors(hdev, boot_err0_reg, cpu_security_boot_status_reg);
	if (rc)
		return rc;

	/* Clear reset status since we need to read again from app */
	prop->hard_reset_done_by_fw = false;

	/* Read FW application security bits */
	if (prop->fw_security_status_valid) {
		prop->fw_app_security_map =
				RREG32(cpu_security_boot_status_reg);

		if (prop->fw_app_security_map &
				CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
			prop->hard_reset_done_by_fw = true;

		dev_dbg(hdev->dev,
			"Firmware application CPU security status %#x\n",
			prop->fw_app_security_map);
	}

	dev_dbg(hdev->dev, "Firmware application CPU hard-reset is %s\n",
			prop->hard_reset_done_by_fw ? "enabled" : "disabled");

	dev_info(hdev->dev, "Successfully loaded firmware to device\n");

	return 0;

out:
	fw_read_errors(hdev, boot_err0_reg, cpu_security_boot_status_reg);

	return rc;
}