提交 cc5ac1ca 编写于 作者: D Dave Airlie

Merge branch 'amdkfd-v6' of git://people.freedesktop.org/~gabbayo/linux into drm-next

Merge AMDKFD it seems clean enough.
* 'amdkfd-v6' of git://people.freedesktop.org/~gabbayo/linux: (29 commits)
  amdkfd: Implement the Get Version IOCTL
  amdkfd: Implement the Get Process Aperture IOCTL
  amdkfd: Implement the Get Clock Counters IOCTL
  amdkfd: Implement the Set Memory Policy IOCTL
  amdkfd: Implement the create/destroy/update queue IOCTLs
  amdkfd: Add interrupt handling module
  amdkfd: Add device queue manager module
  amdkfd: Add process queue manager module
  amdkfd: Add packet manager module
  amdkfd: Add module parameter of scheduling policy
  amdkfd: Add kernel queue module
  amdkfd: Add mqd_manager module
  amdkfd: Add queue module
  amdkfd: Add binding/unbinding calls to amd_iommu driver
  amdkfd: Add basic modules to amdkfd
  amdkfd: Add topology module to amdkfd
  amdkfd: Add amdkfd skeleton driver
  amdkfd: Add IOCTL set definitions of amdkfd
  Update MAINTAINERS and CREDITS files with amdkfd info
  drm/radeon: Add radeon <--> amdkfd interface
  ...
......@@ -1197,6 +1197,13 @@ S: R. Tocantins, 89 - Cristo Rei
S: 80050-430 - Curitiba - Paraná
S: Brazil
N: Oded Gabbay
E: oded.gabbay@gmail.com
D: AMD KFD maintainer
S: 12 Shraga Raphaeli
S: Petah-Tikva, 4906418
S: Israel
N: Kumar Gala
E: galak@kernel.crashing.org
D: Embedded PowerPC 6xx/7xx/74xx/82xx/83xx/85xx support
......
......@@ -618,6 +618,16 @@ S: Maintained
F: drivers/iommu/amd_iommu*.[ch]
F: include/linux/amd-iommu.h
AMD KFD
M: Oded Gabbay <oded.gabbay@amd.com>
L: dri-devel@lists.freedesktop.org
T: git git://people.freedesktop.org/~gabbayo/linux.git
S: Supported
F: drivers/gpu/drm/amd/amdkfd/
F: drivers/gpu/drm/radeon/radeon_kfd.c
F: drivers/gpu/drm/radeon/radeon_kfd.h
F: include/uapi/linux/kfd_ioctl.h
AMD MICROCODE UPDATE SUPPORT
M: Andreas Herrmann <herrmann.der.user@googlemail.com>
L: amd64-microcode@amd64.org
......
......@@ -200,3 +200,5 @@ source "drivers/gpu/drm/tegra/Kconfig"
source "drivers/gpu/drm/panel/Kconfig"
source "drivers/gpu/drm/sti/Kconfig"
source "drivers/gpu/drm/amd/amdkfd/Kconfig"
......@@ -65,3 +65,4 @@ obj-$(CONFIG_DRM_STI) += sti/
obj-y += i2c/
obj-y += panel/
obj-y += bridge/
obj-$(CONFIG_HSA_AMD) += amd/amdkfd/
#
# Heterogenous system architecture configuration
#
config HSA_AMD
tristate "HSA kernel driver for AMD GPU devices"
depends on (DRM_RADEON || DRM_AMDGPU) && AMD_IOMMU_V2 && X86_64
help
Enable this if you want to use HSA features on AMD GPU devices.
#
# Makefile for Heterogenous System Architecture support for AMD GPU devices
#
ccflags-y := -Iinclude/drm -Idrivers/gpu/drm/amd/include/
amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \
kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \
kfd_process.o kfd_queue.o kfd_mqd_manager.o \
kfd_kernel_queue.o kfd_packet_manager.o \
kfd_process_queue_manager.o kfd_device_queue_manager.o \
kfd_interrupt.o
obj-$(CONFIG_HSA_AMD) += amdkfd.o
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef CIK_REGS_H
#define CIK_REGS_H
#define IH_VMID_0_LUT 0x3D40u
#define BIF_DOORBELL_CNTL 0x530Cu
#define SRBM_GFX_CNTL 0xE44
#define PIPEID(x) ((x) << 0)
#define MEID(x) ((x) << 2)
#define VMID(x) ((x) << 4)
#define QUEUEID(x) ((x) << 8)
#define SQ_CONFIG 0x8C00
#define SH_MEM_BASES 0x8C28
/* if PTR32, these are the bases for scratch and lds */
#define PRIVATE_BASE(x) ((x) << 0) /* scratch */
#define SHARED_BASE(x) ((x) << 16) /* LDS */
#define SH_MEM_APE1_BASE 0x8C2C
/* if PTR32, this is the base location of GPUVM */
#define SH_MEM_APE1_LIMIT 0x8C30
/* if PTR32, this is the upper limit of GPUVM */
#define SH_MEM_CONFIG 0x8C34
#define PTR32 (1 << 0)
#define PRIVATE_ATC (1 << 1)
#define ALIGNMENT_MODE(x) ((x) << 2)
#define SH_MEM_ALIGNMENT_MODE_DWORD 0
#define SH_MEM_ALIGNMENT_MODE_DWORD_STRICT 1
#define SH_MEM_ALIGNMENT_MODE_STRICT 2
#define SH_MEM_ALIGNMENT_MODE_UNALIGNED 3
#define DEFAULT_MTYPE(x) ((x) << 4)
#define APE1_MTYPE(x) ((x) << 7)
/* valid for both DEFAULT_MTYPE and APE1_MTYPE */
#define MTYPE_CACHED 0
#define MTYPE_NONCACHED 3
#define SH_STATIC_MEM_CONFIG 0x9604u
#define TC_CFG_L1_LOAD_POLICY0 0xAC68
#define TC_CFG_L1_LOAD_POLICY1 0xAC6C
#define TC_CFG_L1_STORE_POLICY 0xAC70
#define TC_CFG_L2_LOAD_POLICY0 0xAC74
#define TC_CFG_L2_LOAD_POLICY1 0xAC78
#define TC_CFG_L2_STORE_POLICY0 0xAC7C
#define TC_CFG_L2_STORE_POLICY1 0xAC80
#define TC_CFG_L2_ATOMIC_POLICY 0xAC84
#define TC_CFG_L1_VOLATILE 0xAC88
#define TC_CFG_L2_VOLATILE 0xAC8C
#define CP_PQ_WPTR_POLL_CNTL 0xC20C
#define WPTR_POLL_EN (1 << 31)
#define CPC_INT_CNTL 0xC2D0
#define CP_ME1_PIPE0_INT_CNTL 0xC214
#define CP_ME1_PIPE1_INT_CNTL 0xC218
#define CP_ME1_PIPE2_INT_CNTL 0xC21C
#define CP_ME1_PIPE3_INT_CNTL 0xC220
#define CP_ME2_PIPE0_INT_CNTL 0xC224
#define CP_ME2_PIPE1_INT_CNTL 0xC228
#define CP_ME2_PIPE2_INT_CNTL 0xC22C
#define CP_ME2_PIPE3_INT_CNTL 0xC230
#define DEQUEUE_REQUEST_INT_ENABLE (1 << 13)
#define WRM_POLL_TIMEOUT_INT_ENABLE (1 << 17)
#define PRIV_REG_INT_ENABLE (1 << 23)
#define TIME_STAMP_INT_ENABLE (1 << 26)
#define GENERIC2_INT_ENABLE (1 << 29)
#define GENERIC1_INT_ENABLE (1 << 30)
#define GENERIC0_INT_ENABLE (1 << 31)
#define CP_ME1_PIPE0_INT_STATUS 0xC214
#define CP_ME1_PIPE1_INT_STATUS 0xC218
#define CP_ME1_PIPE2_INT_STATUS 0xC21C
#define CP_ME1_PIPE3_INT_STATUS 0xC220
#define CP_ME2_PIPE0_INT_STATUS 0xC224
#define CP_ME2_PIPE1_INT_STATUS 0xC228
#define CP_ME2_PIPE2_INT_STATUS 0xC22C
#define CP_ME2_PIPE3_INT_STATUS 0xC230
#define DEQUEUE_REQUEST_INT_STATUS (1 << 13)
#define WRM_POLL_TIMEOUT_INT_STATUS (1 << 17)
#define PRIV_REG_INT_STATUS (1 << 23)
#define TIME_STAMP_INT_STATUS (1 << 26)
#define GENERIC2_INT_STATUS (1 << 29)
#define GENERIC1_INT_STATUS (1 << 30)
#define GENERIC0_INT_STATUS (1 << 31)
#define CP_HPD_EOP_BASE_ADDR 0xC904
#define CP_HPD_EOP_BASE_ADDR_HI 0xC908
#define CP_HPD_EOP_VMID 0xC90C
#define CP_HPD_EOP_CONTROL 0xC910
#define EOP_SIZE(x) ((x) << 0)
#define EOP_SIZE_MASK (0x3f << 0)
#define CP_MQD_BASE_ADDR 0xC914
#define CP_MQD_BASE_ADDR_HI 0xC918
#define CP_HQD_ACTIVE 0xC91C
#define CP_HQD_VMID 0xC920
#define CP_HQD_PERSISTENT_STATE 0xC924u
#define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8)
#define PRELOAD_REQ (1 << 0)
#define CP_HQD_PIPE_PRIORITY 0xC928u
#define CP_HQD_QUEUE_PRIORITY 0xC92Cu
#define CP_HQD_QUANTUM 0xC930u
#define QUANTUM_EN 1U
#define QUANTUM_SCALE_1MS (1U << 4)
#define QUANTUM_DURATION(x) ((x) << 8)
#define CP_HQD_PQ_BASE 0xC934
#define CP_HQD_PQ_BASE_HI 0xC938
#define CP_HQD_PQ_RPTR 0xC93C
#define CP_HQD_PQ_RPTR_REPORT_ADDR 0xC940
#define CP_HQD_PQ_RPTR_REPORT_ADDR_HI 0xC944
#define CP_HQD_PQ_WPTR_POLL_ADDR 0xC948
#define CP_HQD_PQ_WPTR_POLL_ADDR_HI 0xC94C
#define CP_HQD_PQ_DOORBELL_CONTROL 0xC950
#define DOORBELL_OFFSET(x) ((x) << 2)
#define DOORBELL_OFFSET_MASK (0x1fffff << 2)
#define DOORBELL_SOURCE (1 << 28)
#define DOORBELL_SCHD_HIT (1 << 29)
#define DOORBELL_EN (1 << 30)
#define DOORBELL_HIT (1 << 31)
#define CP_HQD_PQ_WPTR 0xC954
#define CP_HQD_PQ_CONTROL 0xC958
#define QUEUE_SIZE(x) ((x) << 0)
#define QUEUE_SIZE_MASK (0x3f << 0)
#define RPTR_BLOCK_SIZE(x) ((x) << 8)
#define RPTR_BLOCK_SIZE_MASK (0x3f << 8)
#define MIN_AVAIL_SIZE(x) ((x) << 20)
#define PQ_ATC_EN (1 << 23)
#define PQ_VOLATILE (1 << 26)
#define NO_UPDATE_RPTR (1 << 27)
#define UNORD_DISPATCH (1 << 28)
#define ROQ_PQ_IB_FLIP (1 << 29)
#define PRIV_STATE (1 << 30)
#define KMD_QUEUE (1 << 31)
#define DEFAULT_RPTR_BLOCK_SIZE RPTR_BLOCK_SIZE(5)
#define DEFAULT_MIN_AVAIL_SIZE MIN_AVAIL_SIZE(3)
#define CP_HQD_IB_BASE_ADDR 0xC95Cu
#define CP_HQD_IB_BASE_ADDR_HI 0xC960u
#define CP_HQD_IB_RPTR 0xC964u
#define CP_HQD_IB_CONTROL 0xC968u
#define IB_ATC_EN (1U << 23)
#define DEFAULT_MIN_IB_AVAIL_SIZE (3U << 20)
#define CP_HQD_DEQUEUE_REQUEST 0xC974
#define DEQUEUE_REQUEST_DRAIN 1
#define DEQUEUE_REQUEST_RESET 2
#define DEQUEUE_INT (1U << 8)
#define CP_HQD_SEMA_CMD 0xC97Cu
#define CP_HQD_MSG_TYPE 0xC980u
#define CP_HQD_ATOMIC0_PREOP_LO 0xC984u
#define CP_HQD_ATOMIC0_PREOP_HI 0xC988u
#define CP_HQD_ATOMIC1_PREOP_LO 0xC98Cu
#define CP_HQD_ATOMIC1_PREOP_HI 0xC990u
#define CP_HQD_HQ_SCHEDULER0 0xC994u
#define CP_HQD_HQ_SCHEDULER1 0xC998u
#define CP_MQD_CONTROL 0xC99C
#define MQD_VMID(x) ((x) << 0)
#define MQD_VMID_MASK (0xf << 0)
#define MQD_CONTROL_PRIV_STATE_EN (1U << 8)
#define GRBM_GFX_INDEX 0x30800
#define INSTANCE_INDEX(x) ((x) << 0)
#define SH_INDEX(x) ((x) << 8)
#define SE_INDEX(x) ((x) << 16)
#define SH_BROADCAST_WRITES (1 << 29)
#define INSTANCE_BROADCAST_WRITES (1 << 30)
#define SE_BROADCAST_WRITES (1 << 31)
#define SQC_CACHES 0x30d20
#define SQC_POLICY 0x8C38u
#define SQC_VOLATILE 0x8C3Cu
#define CP_PERFMON_CNTL 0x36020
#define ATC_VMID0_PASID_MAPPING 0x339Cu
#define ATC_VMID_PASID_MAPPING_UPDATE_STATUS 0x3398u
#define ATC_VMID_PASID_MAPPING_VALID (1U << 31)
#define ATC_VM_APERTURE0_CNTL 0x3310u
#define ATS_ACCESS_MODE_NEVER 0
#define ATS_ACCESS_MODE_ALWAYS 1
#define ATC_VM_APERTURE0_CNTL2 0x3318u
#define ATC_VM_APERTURE0_HIGH_ADDR 0x3308u
#define ATC_VM_APERTURE0_LOW_ADDR 0x3300u
#define ATC_VM_APERTURE1_CNTL 0x3314u
#define ATC_VM_APERTURE1_CNTL2 0x331Cu
#define ATC_VM_APERTURE1_HIGH_ADDR 0x330Cu
#define ATC_VM_APERTURE1_LOW_ADDR 0x3304u
#endif
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <linux/device.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/compat.h>
#include <uapi/linux/kfd_ioctl.h>
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <uapi/asm-generic/mman-common.h>
#include <asm/processor.h>
#include "kfd_priv.h"
#include "kfd_device_queue_manager.h"
static long kfd_ioctl(struct file *, unsigned int, unsigned long);
static int kfd_open(struct inode *, struct file *);
static int kfd_mmap(struct file *, struct vm_area_struct *);
static const char kfd_dev_name[] = "kfd";
static const struct file_operations kfd_fops = {
.owner = THIS_MODULE,
.unlocked_ioctl = kfd_ioctl,
.compat_ioctl = kfd_ioctl,
.open = kfd_open,
.mmap = kfd_mmap,
};
static int kfd_char_dev_major = -1;
static struct class *kfd_class;
struct device *kfd_device;
int kfd_chardev_init(void)
{
int err = 0;
kfd_char_dev_major = register_chrdev(0, kfd_dev_name, &kfd_fops);
err = kfd_char_dev_major;
if (err < 0)
goto err_register_chrdev;
kfd_class = class_create(THIS_MODULE, kfd_dev_name);
err = PTR_ERR(kfd_class);
if (IS_ERR(kfd_class))
goto err_class_create;
kfd_device = device_create(kfd_class, NULL,
MKDEV(kfd_char_dev_major, 0),
NULL, kfd_dev_name);
err = PTR_ERR(kfd_device);
if (IS_ERR(kfd_device))
goto err_device_create;
return 0;
err_device_create:
class_destroy(kfd_class);
err_class_create:
unregister_chrdev(kfd_char_dev_major, kfd_dev_name);
err_register_chrdev:
return err;
}
void kfd_chardev_exit(void)
{
device_destroy(kfd_class, MKDEV(kfd_char_dev_major, 0));
class_destroy(kfd_class);
unregister_chrdev(kfd_char_dev_major, kfd_dev_name);
}
struct device *kfd_chardev(void)
{
return kfd_device;
}
static int kfd_open(struct inode *inode, struct file *filep)
{
struct kfd_process *process;
if (iminor(inode) != 0)
return -ENODEV;
process = kfd_create_process(current);
if (IS_ERR(process))
return PTR_ERR(process);
process->is_32bit_user_mode = is_compat_task();
dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
process->pasid, process->is_32bit_user_mode);
kfd_init_apertures(process);
return 0;
}
static long kfd_ioctl_get_version(struct file *filep, struct kfd_process *p,
void __user *arg)
{
struct kfd_ioctl_get_version_args args;
int err = 0;
args.major_version = KFD_IOCTL_MAJOR_VERSION;
args.minor_version = KFD_IOCTL_MINOR_VERSION;
if (copy_to_user(arg, &args, sizeof(args)))
err = -EFAULT;
return err;
}
static int set_queue_properties_from_user(struct queue_properties *q_properties,
struct kfd_ioctl_create_queue_args *args)
{
if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) {
pr_err("kfd: queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n");
return -EINVAL;
}
if (args->queue_priority > KFD_MAX_QUEUE_PRIORITY) {
pr_err("kfd: queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n");
return -EINVAL;
}
if ((args->ring_base_address) &&
(!access_ok(VERIFY_WRITE, args->ring_base_address, sizeof(uint64_t)))) {
pr_err("kfd: can't access ring base address\n");
return -EFAULT;
}
if (!is_power_of_2(args->ring_size) && (args->ring_size != 0)) {
pr_err("kfd: ring size must be a power of 2 or 0\n");
return -EINVAL;
}
if (!access_ok(VERIFY_WRITE, args->read_pointer_address, sizeof(uint32_t))) {
pr_err("kfd: can't access read pointer\n");
return -EFAULT;
}
if (!access_ok(VERIFY_WRITE, args->write_pointer_address, sizeof(uint32_t))) {
pr_err("kfd: can't access write pointer\n");
return -EFAULT;
}
q_properties->is_interop = false;
q_properties->queue_percent = args->queue_percentage;
q_properties->priority = args->queue_priority;
q_properties->queue_address = args->ring_base_address;
q_properties->queue_size = args->ring_size;
q_properties->read_ptr = (uint32_t *) args->read_pointer_address;
q_properties->write_ptr = (uint32_t *) args->write_pointer_address;
if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
else
return -ENOTSUPP;
if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
q_properties->format = KFD_QUEUE_FORMAT_AQL;
else
q_properties->format = KFD_QUEUE_FORMAT_PM4;
pr_debug("Queue Percentage (%d, %d)\n",
q_properties->queue_percent, args->queue_percentage);
pr_debug("Queue Priority (%d, %d)\n",
q_properties->priority, args->queue_priority);
pr_debug("Queue Address (0x%llX, 0x%llX)\n",
q_properties->queue_address, args->ring_base_address);
pr_debug("Queue Size (0x%llX, %u)\n",
q_properties->queue_size, args->ring_size);
pr_debug("Queue r/w Pointers (0x%llX, 0x%llX)\n",
(uint64_t) q_properties->read_ptr,
(uint64_t) q_properties->write_ptr);
pr_debug("Queue Format (%d)\n", q_properties->format);
return 0;
}
static long kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
void __user *arg)
{
struct kfd_ioctl_create_queue_args args;
struct kfd_dev *dev;
int err = 0;
unsigned int queue_id;
struct kfd_process_device *pdd;
struct queue_properties q_properties;
memset(&q_properties, 0, sizeof(struct queue_properties));
if (copy_from_user(&args, arg, sizeof(args)))
return -EFAULT;
pr_debug("kfd: creating queue ioctl\n");
err = set_queue_properties_from_user(&q_properties, &args);
if (err)
return err;
dev = kfd_device_by_id(args.gpu_id);
if (dev == NULL)
return -EINVAL;
mutex_lock(&p->mutex);
pdd = kfd_bind_process_to_device(dev, p);
if (IS_ERR(pdd) < 0) {
err = PTR_ERR(pdd);
goto err_bind_process;
}
pr_debug("kfd: creating queue for PASID %d on GPU 0x%x\n",
p->pasid,
dev->id);
err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, 0,
KFD_QUEUE_TYPE_COMPUTE, &queue_id);
if (err != 0)
goto err_create_queue;
args.queue_id = queue_id;
/* Return gpu_id as doorbell offset for mmap usage */
args.doorbell_offset = args.gpu_id << PAGE_SHIFT;
if (copy_to_user(arg, &args, sizeof(args))) {
err = -EFAULT;
goto err_copy_args_out;
}
mutex_unlock(&p->mutex);
pr_debug("kfd: queue id %d was created successfully\n", args.queue_id);
pr_debug("ring buffer address == 0x%016llX\n",
args.ring_base_address);
pr_debug("read ptr address == 0x%016llX\n",
args.read_pointer_address);
pr_debug("write ptr address == 0x%016llX\n",
args.write_pointer_address);
return 0;
err_copy_args_out:
pqm_destroy_queue(&p->pqm, queue_id);
err_create_queue:
err_bind_process:
mutex_unlock(&p->mutex);
return err;
}
static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p,
void __user *arg)
{
int retval;
struct kfd_ioctl_destroy_queue_args args;
if (copy_from_user(&args, arg, sizeof(args)))
return -EFAULT;
pr_debug("kfd: destroying queue id %d for PASID %d\n",
args.queue_id,
p->pasid);
mutex_lock(&p->mutex);
retval = pqm_destroy_queue(&p->pqm, args.queue_id);
mutex_unlock(&p->mutex);
return retval;
}
static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p,
void __user *arg)
{
int retval;
struct kfd_ioctl_update_queue_args args;
struct queue_properties properties;
if (copy_from_user(&args, arg, sizeof(args)))
return -EFAULT;
if (args.queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) {
pr_err("kfd: queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n");
return -EINVAL;
}
if (args.queue_priority > KFD_MAX_QUEUE_PRIORITY) {
pr_err("kfd: queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n");
return -EINVAL;
}
if ((args.ring_base_address) &&
(!access_ok(VERIFY_WRITE, args.ring_base_address, sizeof(uint64_t)))) {
pr_err("kfd: can't access ring base address\n");
return -EFAULT;
}
if (!is_power_of_2(args.ring_size) && (args.ring_size != 0)) {
pr_err("kfd: ring size must be a power of 2 or 0\n");
return -EINVAL;
}
properties.queue_address = args.ring_base_address;
properties.queue_size = args.ring_size;
properties.queue_percent = args.queue_percentage;
properties.priority = args.queue_priority;
pr_debug("kfd: updating queue id %d for PASID %d\n",
args.queue_id, p->pasid);
mutex_lock(&p->mutex);
retval = pqm_update_queue(&p->pqm, args.queue_id, &properties);
mutex_unlock(&p->mutex);
return retval;
}
static long kfd_ioctl_set_memory_policy(struct file *filep,
struct kfd_process *p, void __user *arg)
{
struct kfd_ioctl_set_memory_policy_args args;
struct kfd_dev *dev;
int err = 0;
struct kfd_process_device *pdd;
enum cache_policy default_policy, alternate_policy;
if (copy_from_user(&args, arg, sizeof(args)))
return -EFAULT;
if (args.default_policy != KFD_IOC_CACHE_POLICY_COHERENT
&& args.default_policy != KFD_IOC_CACHE_POLICY_NONCOHERENT) {
return -EINVAL;
}
if (args.alternate_policy != KFD_IOC_CACHE_POLICY_COHERENT
&& args.alternate_policy != KFD_IOC_CACHE_POLICY_NONCOHERENT) {
return -EINVAL;
}
dev = kfd_device_by_id(args.gpu_id);
if (dev == NULL)
return -EINVAL;
mutex_lock(&p->mutex);
pdd = kfd_bind_process_to_device(dev, p);
if (IS_ERR(pdd) < 0) {
err = PTR_ERR(pdd);
goto out;
}
default_policy = (args.default_policy == KFD_IOC_CACHE_POLICY_COHERENT)
? cache_policy_coherent : cache_policy_noncoherent;
alternate_policy =
(args.alternate_policy == KFD_IOC_CACHE_POLICY_COHERENT)
? cache_policy_coherent : cache_policy_noncoherent;
if (!dev->dqm->set_cache_memory_policy(dev->dqm,
&pdd->qpd,
default_policy,
alternate_policy,
(void __user *)args.alternate_aperture_base,
args.alternate_aperture_size))
err = -EINVAL;
out:
mutex_unlock(&p->mutex);
return err;
}
static long kfd_ioctl_get_clock_counters(struct file *filep,
struct kfd_process *p, void __user *arg)
{
struct kfd_ioctl_get_clock_counters_args args;
struct kfd_dev *dev;
struct timespec time;
if (copy_from_user(&args, arg, sizeof(args)))
return -EFAULT;
dev = kfd_device_by_id(args.gpu_id);
if (dev == NULL)
return -EINVAL;
/* Reading GPU clock counter from KGD */
args.gpu_clock_counter = kfd2kgd->get_gpu_clock_counter(dev->kgd);
/* No access to rdtsc. Using raw monotonic time */
getrawmonotonic(&time);
args.cpu_clock_counter = (uint64_t)timespec_to_ns(&time);
get_monotonic_boottime(&time);
args.system_clock_counter = (uint64_t)timespec_to_ns(&time);
/* Since the counter is in nano-seconds we use 1GHz frequency */
args.system_clock_freq = 1000000000;
if (copy_to_user(arg, &args, sizeof(args)))
return -EFAULT;
return 0;
}
static int kfd_ioctl_get_process_apertures(struct file *filp,
struct kfd_process *p, void __user *arg)
{
struct kfd_ioctl_get_process_apertures_args args;
struct kfd_process_device_apertures *pAperture;
struct kfd_process_device *pdd;
dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid);
if (copy_from_user(&args, arg, sizeof(args)))
return -EFAULT;
args.num_of_nodes = 0;
mutex_lock(&p->mutex);
/*if the process-device list isn't empty*/
if (kfd_has_process_device_data(p)) {
/* Run over all pdd of the process */
pdd = kfd_get_first_process_device_data(p);
do {
pAperture = &args.process_apertures[args.num_of_nodes];
pAperture->gpu_id = pdd->dev->id;
pAperture->lds_base = pdd->lds_base;
pAperture->lds_limit = pdd->lds_limit;
pAperture->gpuvm_base = pdd->gpuvm_base;
pAperture->gpuvm_limit = pdd->gpuvm_limit;
pAperture->scratch_base = pdd->scratch_base;
pAperture->scratch_limit = pdd->scratch_limit;
dev_dbg(kfd_device,
"node id %u\n", args.num_of_nodes);
dev_dbg(kfd_device,
"gpu id %u\n", pdd->dev->id);
dev_dbg(kfd_device,
"lds_base %llX\n", pdd->lds_base);
dev_dbg(kfd_device,
"lds_limit %llX\n", pdd->lds_limit);
dev_dbg(kfd_device,
"gpuvm_base %llX\n", pdd->gpuvm_base);
dev_dbg(kfd_device,
"gpuvm_limit %llX\n", pdd->gpuvm_limit);
dev_dbg(kfd_device,
"scratch_base %llX\n", pdd->scratch_base);
dev_dbg(kfd_device,
"scratch_limit %llX\n", pdd->scratch_limit);
args.num_of_nodes++;
} while ((pdd = kfd_get_next_process_device_data(p, pdd)) != NULL &&
(args.num_of_nodes < NUM_OF_SUPPORTED_GPUS));
}
mutex_unlock(&p->mutex);
if (copy_to_user(arg, &args, sizeof(args)))
return -EFAULT;
return 0;
}
static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
{
struct kfd_process *process;
long err = -EINVAL;
dev_dbg(kfd_device,
"ioctl cmd 0x%x (#%d), arg 0x%lx\n",
cmd, _IOC_NR(cmd), arg);
process = kfd_get_process(current);
if (IS_ERR(process))
return PTR_ERR(process);
switch (cmd) {
case KFD_IOC_GET_VERSION:
err = kfd_ioctl_get_version(filep, process, (void __user *)arg);
break;
case KFD_IOC_CREATE_QUEUE:
err = kfd_ioctl_create_queue(filep, process,
(void __user *)arg);
break;
case KFD_IOC_DESTROY_QUEUE:
err = kfd_ioctl_destroy_queue(filep, process,
(void __user *)arg);
break;
case KFD_IOC_SET_MEMORY_POLICY:
err = kfd_ioctl_set_memory_policy(filep, process,
(void __user *)arg);
break;
case KFD_IOC_GET_CLOCK_COUNTERS:
err = kfd_ioctl_get_clock_counters(filep, process,
(void __user *)arg);
break;
case KFD_IOC_GET_PROCESS_APERTURES:
err = kfd_ioctl_get_process_apertures(filep, process,
(void __user *)arg);
break;
case KFD_IOC_UPDATE_QUEUE:
err = kfd_ioctl_update_queue(filep, process,
(void __user *)arg);
break;
default:
dev_err(kfd_device,
"unknown ioctl cmd 0x%x, arg 0x%lx)\n",
cmd, arg);
err = -EINVAL;
break;
}
if (err < 0)
dev_err(kfd_device,
"ioctl error %ld for ioctl cmd 0x%x (#%d)\n",
err, cmd, _IOC_NR(cmd));
return err;
}
static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct kfd_process *process;
process = kfd_get_process(current);
if (IS_ERR(process))
return PTR_ERR(process);
return kfd_doorbell_mmap(process, vma);
}
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef KFD_CRAT_H_INCLUDED
#define KFD_CRAT_H_INCLUDED
#include <linux/types.h>
#pragma pack(1)
/*
* 4CC signature values for the CRAT and CDIT ACPI tables
*/
#define CRAT_SIGNATURE "CRAT"
#define CDIT_SIGNATURE "CDIT"
/*
* Component Resource Association Table (CRAT)
*/
#define CRAT_OEMID_LENGTH 6
#define CRAT_OEMTABLEID_LENGTH 8
#define CRAT_RESERVED_LENGTH 6
#define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1)
struct crat_header {
uint32_t signature;
uint32_t length;
uint8_t revision;
uint8_t checksum;
uint8_t oem_id[CRAT_OEMID_LENGTH];
uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH];
uint32_t oem_revision;
uint32_t creator_id;
uint32_t creator_revision;
uint32_t total_entries;
uint16_t num_domains;
uint8_t reserved[CRAT_RESERVED_LENGTH];
};
/*
* The header structure is immediately followed by total_entries of the
* data definitions
*/
/*
* The currently defined subtype entries in the CRAT
*/
#define CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY 0
#define CRAT_SUBTYPE_MEMORY_AFFINITY 1
#define CRAT_SUBTYPE_CACHE_AFFINITY 2
#define CRAT_SUBTYPE_TLB_AFFINITY 3
#define CRAT_SUBTYPE_CCOMPUTE_AFFINITY 4
#define CRAT_SUBTYPE_IOLINK_AFFINITY 5
#define CRAT_SUBTYPE_MAX 6
#define CRAT_SIBLINGMAP_SIZE 32
/*
* ComputeUnit Affinity structure and definitions
*/
#define CRAT_CU_FLAGS_ENABLED 0x00000001
#define CRAT_CU_FLAGS_HOT_PLUGGABLE 0x00000002
#define CRAT_CU_FLAGS_CPU_PRESENT 0x00000004
#define CRAT_CU_FLAGS_GPU_PRESENT 0x00000008
#define CRAT_CU_FLAGS_IOMMU_PRESENT 0x00000010
#define CRAT_CU_FLAGS_RESERVED 0xffffffe0
#define CRAT_COMPUTEUNIT_RESERVED_LENGTH 4
struct crat_subtype_computeunit {
uint8_t type;
uint8_t length;
uint16_t reserved;
uint32_t flags;
uint32_t proximity_domain;
uint32_t processor_id_low;
uint16_t num_cpu_cores;
uint16_t num_simd_cores;
uint16_t max_waves_simd;
uint16_t io_count;
uint16_t hsa_capability;
uint16_t lds_size_in_kb;
uint8_t wave_front_size;
uint8_t num_banks;
uint16_t micro_engine_id;
uint8_t num_arrays;
uint8_t num_cu_per_array;
uint8_t num_simd_per_cu;
uint8_t max_slots_scatch_cu;
uint8_t reserved2[CRAT_COMPUTEUNIT_RESERVED_LENGTH];
};
/*
* HSA Memory Affinity structure and definitions
*/
#define CRAT_MEM_FLAGS_ENABLED 0x00000001
#define CRAT_MEM_FLAGS_HOT_PLUGGABLE 0x00000002
#define CRAT_MEM_FLAGS_NON_VOLATILE 0x00000004
#define CRAT_MEM_FLAGS_RESERVED 0xfffffff8
#define CRAT_MEMORY_RESERVED_LENGTH 8
struct crat_subtype_memory {
uint8_t type;
uint8_t length;
uint16_t reserved;
uint32_t flags;
uint32_t promixity_domain;
uint32_t base_addr_low;
uint32_t base_addr_high;
uint32_t length_low;
uint32_t length_high;
uint32_t width;
uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH];
};
/*
* HSA Cache Affinity structure and definitions
*/
#define CRAT_CACHE_FLAGS_ENABLED 0x00000001
#define CRAT_CACHE_FLAGS_DATA_CACHE 0x00000002
#define CRAT_CACHE_FLAGS_INST_CACHE 0x00000004
#define CRAT_CACHE_FLAGS_CPU_CACHE 0x00000008
#define CRAT_CACHE_FLAGS_SIMD_CACHE 0x00000010
#define CRAT_CACHE_FLAGS_RESERVED 0xffffffe0
#define CRAT_CACHE_RESERVED_LENGTH 8
struct crat_subtype_cache {
uint8_t type;
uint8_t length;
uint16_t reserved;
uint32_t flags;
uint32_t processor_id_low;
uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE];
uint32_t cache_size;
uint8_t cache_level;
uint8_t lines_per_tag;
uint16_t cache_line_size;
uint8_t associativity;
uint8_t cache_properties;
uint16_t cache_latency;
uint8_t reserved2[CRAT_CACHE_RESERVED_LENGTH];
};
/*
* HSA TLB Affinity structure and definitions
*/
#define CRAT_TLB_FLAGS_ENABLED 0x00000001
#define CRAT_TLB_FLAGS_DATA_TLB 0x00000002
#define CRAT_TLB_FLAGS_INST_TLB 0x00000004
#define CRAT_TLB_FLAGS_CPU_TLB 0x00000008
#define CRAT_TLB_FLAGS_SIMD_TLB 0x00000010
#define CRAT_TLB_FLAGS_RESERVED 0xffffffe0
#define CRAT_TLB_RESERVED_LENGTH 4
struct crat_subtype_tlb {
uint8_t type;
uint8_t length;
uint16_t reserved;
uint32_t flags;
uint32_t processor_id_low;
uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE];
uint32_t tlb_level;
uint8_t data_tlb_associativity_2mb;
uint8_t data_tlb_size_2mb;
uint8_t instruction_tlb_associativity_2mb;
uint8_t instruction_tlb_size_2mb;
uint8_t data_tlb_associativity_4k;
uint8_t data_tlb_size_4k;
uint8_t instruction_tlb_associativity_4k;
uint8_t instruction_tlb_size_4k;
uint8_t data_tlb_associativity_1gb;
uint8_t data_tlb_size_1gb;
uint8_t instruction_tlb_associativity_1gb;
uint8_t instruction_tlb_size_1gb;
uint8_t reserved2[CRAT_TLB_RESERVED_LENGTH];
};
/*
* HSA CCompute/APU Affinity structure and definitions
*/
#define CRAT_CCOMPUTE_FLAGS_ENABLED 0x00000001
#define CRAT_CCOMPUTE_FLAGS_RESERVED 0xfffffffe
#define CRAT_CCOMPUTE_RESERVED_LENGTH 16
struct crat_subtype_ccompute {
uint8_t type;
uint8_t length;
uint16_t reserved;
uint32_t flags;
uint32_t processor_id_low;
uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE];
uint32_t apu_size;
uint8_t reserved2[CRAT_CCOMPUTE_RESERVED_LENGTH];
};
/*
* HSA IO Link Affinity structure and definitions
*/
#define CRAT_IOLINK_FLAGS_ENABLED 0x00000001
#define CRAT_IOLINK_FLAGS_COHERENCY 0x00000002
#define CRAT_IOLINK_FLAGS_RESERVED 0xfffffffc
/*
* IO interface types
*/
#define CRAT_IOLINK_TYPE_UNDEFINED 0
#define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1
#define CRAT_IOLINK_TYPE_PCIEXPRESS 2
#define CRAT_IOLINK_TYPE_OTHER 3
#define CRAT_IOLINK_TYPE_MAX 255
#define CRAT_IOLINK_RESERVED_LENGTH 24
struct crat_subtype_iolink {
uint8_t type;
uint8_t length;
uint16_t reserved;
uint32_t flags;
uint32_t proximity_domain_from;
uint32_t proximity_domain_to;
uint8_t io_interface_type;
uint8_t version_major;
uint16_t version_minor;
uint32_t minimum_latency;
uint32_t maximum_latency;
uint32_t minimum_bandwidth_mbs;
uint32_t maximum_bandwidth_mbs;
uint32_t recommended_transfer_size;
uint8_t reserved2[CRAT_IOLINK_RESERVED_LENGTH];
};
/*
* HSA generic sub-type header
*/
#define CRAT_SUBTYPE_FLAGS_ENABLED 0x00000001
struct crat_subtype_generic {
uint8_t type;
uint8_t length;
uint16_t reserved;
uint32_t flags;
};
/*
* Component Locality Distance Information Table (CDIT)
*/
#define CDIT_OEMID_LENGTH 6
#define CDIT_OEMTABLEID_LENGTH 8
struct cdit_header {
uint32_t signature;
uint32_t length;
uint8_t revision;
uint8_t checksum;
uint8_t oem_id[CDIT_OEMID_LENGTH];
uint8_t oem_table_id[CDIT_OEMTABLEID_LENGTH];
uint32_t oem_revision;
uint32_t creator_id;
uint32_t creator_revision;
uint32_t total_entries;
uint16_t num_domains;
uint8_t entry[1];
};
#pragma pack()
#endif /* KFD_CRAT_H_INCLUDED */
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <linux/amd-iommu.h>
#include <linux/bsearch.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include "kfd_priv.h"
#include "kfd_device_queue_manager.h"
#define MQD_SIZE_ALIGNED 768
static const struct kfd_device_info kaveri_device_info = {
.max_pasid_bits = 16,
.ih_ring_entry_size = 4 * sizeof(uint32_t),
.mqd_size_aligned = MQD_SIZE_ALIGNED
};
struct kfd_deviceid {
unsigned short did;
const struct kfd_device_info *device_info;
};
/* Please keep this sorted by increasing device id. */
static const struct kfd_deviceid supported_devices[] = {
{ 0x1304, &kaveri_device_info }, /* Kaveri */
{ 0x1305, &kaveri_device_info }, /* Kaveri */
{ 0x1306, &kaveri_device_info }, /* Kaveri */
{ 0x1307, &kaveri_device_info }, /* Kaveri */
{ 0x1309, &kaveri_device_info }, /* Kaveri */
{ 0x130A, &kaveri_device_info }, /* Kaveri */
{ 0x130B, &kaveri_device_info }, /* Kaveri */
{ 0x130C, &kaveri_device_info }, /* Kaveri */
{ 0x130D, &kaveri_device_info }, /* Kaveri */
{ 0x130E, &kaveri_device_info }, /* Kaveri */
{ 0x130F, &kaveri_device_info }, /* Kaveri */
{ 0x1310, &kaveri_device_info }, /* Kaveri */
{ 0x1311, &kaveri_device_info }, /* Kaveri */
{ 0x1312, &kaveri_device_info }, /* Kaveri */
{ 0x1313, &kaveri_device_info }, /* Kaveri */
{ 0x1315, &kaveri_device_info }, /* Kaveri */
{ 0x1316, &kaveri_device_info }, /* Kaveri */
{ 0x1317, &kaveri_device_info }, /* Kaveri */
{ 0x1318, &kaveri_device_info }, /* Kaveri */
{ 0x131B, &kaveri_device_info }, /* Kaveri */
{ 0x131C, &kaveri_device_info }, /* Kaveri */
{ 0x131D, &kaveri_device_info }, /* Kaveri */
};
static const struct kfd_device_info *lookup_device_info(unsigned short did)
{
size_t i;
for (i = 0; i < ARRAY_SIZE(supported_devices); i++) {
if (supported_devices[i].did == did) {
BUG_ON(supported_devices[i].device_info == NULL);
return supported_devices[i].device_info;
}
}
return NULL;
}
struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev)
{
struct kfd_dev *kfd;
const struct kfd_device_info *device_info =
lookup_device_info(pdev->device);
if (!device_info)
return NULL;
kfd = kzalloc(sizeof(*kfd), GFP_KERNEL);
if (!kfd)
return NULL;
kfd->kgd = kgd;
kfd->device_info = device_info;
kfd->pdev = pdev;
kfd->init_complete = false;
return kfd;
}
static bool device_iommu_pasid_init(struct kfd_dev *kfd)
{
const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP |
AMD_IOMMU_DEVICE_FLAG_PRI_SUP |
AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
struct amd_iommu_device_info iommu_info;
unsigned int pasid_limit;
int err;
err = amd_iommu_device_info(kfd->pdev, &iommu_info);
if (err < 0) {
dev_err(kfd_device,
"error getting iommu info. is the iommu enabled?\n");
return false;
}
if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) {
dev_err(kfd_device, "error required iommu flags ats(%i), pri(%i), pasid(%i)\n",
(iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP) != 0,
(iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) != 0,
(iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) != 0);
return false;
}
pasid_limit = min_t(unsigned int,
(unsigned int)1 << kfd->device_info->max_pasid_bits,
iommu_info.max_pasids);
/*
* last pasid is used for kernel queues doorbells
* in the future the last pasid might be used for a kernel thread.
*/
pasid_limit = min_t(unsigned int,
pasid_limit,
kfd->doorbell_process_limit - 1);
err = amd_iommu_init_device(kfd->pdev, pasid_limit);
if (err < 0) {
dev_err(kfd_device, "error initializing iommu device\n");
return false;
}
if (!kfd_set_pasid_limit(pasid_limit)) {
dev_err(kfd_device, "error setting pasid limit\n");
amd_iommu_free_device(kfd->pdev);
return false;
}
return true;
}
static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
{
struct kfd_dev *dev = kfd_device_by_pci_dev(pdev);
if (dev)
kfd_unbind_process_from_device(dev, pasid);
}
bool kgd2kfd_device_init(struct kfd_dev *kfd,
const struct kgd2kfd_shared_resources *gpu_resources)
{
unsigned int size;
kfd->shared_resources = *gpu_resources;
/* calculate max size of mqds needed for queues */
size = max_num_of_processes *
max_num_of_queues_per_process *
kfd->device_info->mqd_size_aligned;
/* add another 512KB for all other allocations on gart */
size += 512 * 1024;
if (kfd2kgd->init_sa_manager(kfd->kgd, size)) {
dev_err(kfd_device,
"Error initializing sa manager for device (%x:%x)\n",
kfd->pdev->vendor, kfd->pdev->device);
goto out;
}
kfd_doorbell_init(kfd);
if (kfd_topology_add_device(kfd) != 0) {
dev_err(kfd_device,
"Error adding device (%x:%x) to topology\n",
kfd->pdev->vendor, kfd->pdev->device);
goto kfd_topology_add_device_error;
}
if (kfd_interrupt_init(kfd)) {
dev_err(kfd_device,
"Error initializing interrupts for device (%x:%x)\n",
kfd->pdev->vendor, kfd->pdev->device);
goto kfd_interrupt_error;
}
if (!device_iommu_pasid_init(kfd)) {
dev_err(kfd_device,
"Error initializing iommuv2 for device (%x:%x)\n",
kfd->pdev->vendor, kfd->pdev->device);
goto device_iommu_pasid_error;
}
amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
iommu_pasid_shutdown_callback);
kfd->dqm = device_queue_manager_init(kfd);
if (!kfd->dqm) {
dev_err(kfd_device,
"Error initializing queue manager for device (%x:%x)\n",
kfd->pdev->vendor, kfd->pdev->device);
goto device_queue_manager_error;
}
if (kfd->dqm->start(kfd->dqm) != 0) {
dev_err(kfd_device,
"Error starting queuen manager for device (%x:%x)\n",
kfd->pdev->vendor, kfd->pdev->device);
goto dqm_start_error;
}
kfd->init_complete = true;
dev_info(kfd_device, "added device (%x:%x)\n", kfd->pdev->vendor,
kfd->pdev->device);
pr_debug("kfd: Starting kfd with the following scheduling policy %d\n",
sched_policy);
goto out;
dqm_start_error:
device_queue_manager_uninit(kfd->dqm);
device_queue_manager_error:
amd_iommu_free_device(kfd->pdev);
device_iommu_pasid_error:
kfd_interrupt_exit(kfd);
kfd_interrupt_error:
kfd_topology_remove_device(kfd);
kfd_topology_add_device_error:
kfd2kgd->fini_sa_manager(kfd->kgd);
dev_err(kfd_device,
"device (%x:%x) NOT added due to errors\n",
kfd->pdev->vendor, kfd->pdev->device);
out:
return kfd->init_complete;
}
void kgd2kfd_device_exit(struct kfd_dev *kfd)
{
if (kfd->init_complete) {
device_queue_manager_uninit(kfd->dqm);
amd_iommu_free_device(kfd->pdev);
kfd_interrupt_exit(kfd);
kfd_topology_remove_device(kfd);
}
kfree(kfd);
}
void kgd2kfd_suspend(struct kfd_dev *kfd)
{
BUG_ON(kfd == NULL);
if (kfd->init_complete) {
kfd->dqm->stop(kfd->dqm);
amd_iommu_free_device(kfd->pdev);
}
}
int kgd2kfd_resume(struct kfd_dev *kfd)
{
unsigned int pasid_limit;
int err;
BUG_ON(kfd == NULL);
pasid_limit = kfd_get_pasid_limit();
if (kfd->init_complete) {
err = amd_iommu_init_device(kfd->pdev, pasid_limit);
if (err < 0)
return -ENXIO;
amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
iommu_pasid_shutdown_callback);
kfd->dqm->start(kfd->dqm);
}
return 0;
}
/* This is called directly from KGD at ISR. */
void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
{
if (kfd->init_complete) {
spin_lock(&kfd->interrupt_lock);
if (kfd->interrupts_active
&& enqueue_ih_ring_entry(kfd, ih_ring_entry))
schedule_work(&kfd->interrupt_work);
spin_unlock(&kfd->interrupt_lock);
}
}
此差异已折叠。
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef KFD_DEVICE_QUEUE_MANAGER_H_
#define KFD_DEVICE_QUEUE_MANAGER_H_
#include <linux/rwsem.h>
#include <linux/list.h>
#include "kfd_priv.h"
#include "kfd_mqd_manager.h"
#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (500)
#define QUEUES_PER_PIPE (8)
#define PIPE_PER_ME_CP_SCHEDULING (3)
#define CIK_VMID_NUM (8)
#define KFD_VMID_START_OFFSET (8)
#define VMID_PER_DEVICE CIK_VMID_NUM
#define KFD_DQM_FIRST_PIPE (0)
struct device_process_node {
struct qcm_process_device *qpd;
struct list_head list;
};
/**
* struct device_queue_manager
*
* @create_queue: Queue creation routine.
*
* @destroy_queue: Queue destruction routine.
*
* @update_queue: Queue update routine.
*
* @get_mqd_manager: Returns the mqd manager according to the mqd type.
*
* @exeute_queues: Dispatches the queues list to the H/W.
*
* @register_process: This routine associates a specific process with device.
*
* @unregister_process: destroys the associations between process to device.
*
* @initialize: Initializes the pipelines and memory module for that device.
*
* @start: Initializes the resources/modules the the device needs for queues
* execution. This function is called on device initialization and after the
* system woke up after suspension.
*
* @stop: This routine stops execution of all the active queue running on the
* H/W and basically this function called on system suspend.
*
* @uninitialize: Destroys all the device queue manager resources allocated in
* initialize routine.
*
* @create_kernel_queue: Creates kernel queue. Used for debug queue.
*
* @destroy_kernel_queue: Destroys kernel queue. Used for debug queue.
*
* @set_cache_memory_policy: Sets memory policy (cached/ non cached) for the
* memory apertures.
*
* This struct is a base class for the kfd queues scheduler in the
* device level. The device base class should expose the basic operations
* for queue creation and queue destruction. This base class hides the
* scheduling mode of the driver and the specific implementation of the
* concrete device. This class is the only class in the queues scheduler
* that configures the H/W.
*/
struct device_queue_manager {
int (*create_queue)(struct device_queue_manager *dqm,
struct queue *q,
struct qcm_process_device *qpd,
int *allocate_vmid);
int (*destroy_queue)(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
struct queue *q);
int (*update_queue)(struct device_queue_manager *dqm,
struct queue *q);
struct mqd_manager * (*get_mqd_manager)
(struct device_queue_manager *dqm,
enum KFD_MQD_TYPE type);
int (*register_process)(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
int (*unregister_process)(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
int (*initialize)(struct device_queue_manager *dqm);
int (*start)(struct device_queue_manager *dqm);
int (*stop)(struct device_queue_manager *dqm);
void (*uninitialize)(struct device_queue_manager *dqm);
int (*create_kernel_queue)(struct device_queue_manager *dqm,
struct kernel_queue *kq,
struct qcm_process_device *qpd);
void (*destroy_kernel_queue)(struct device_queue_manager *dqm,
struct kernel_queue *kq,
struct qcm_process_device *qpd);
bool (*set_cache_memory_policy)(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
enum cache_policy default_policy,
enum cache_policy alternate_policy,
void __user *alternate_aperture_base,
uint64_t alternate_aperture_size);
struct mqd_manager *mqds[KFD_MQD_TYPE_MAX];
struct packet_manager packets;
struct kfd_dev *dev;
struct mutex lock;
struct list_head queues;
unsigned int processes_count;
unsigned int queue_count;
unsigned int next_pipe_to_allocate;
unsigned int *allocated_queues;
unsigned int vmid_bitmap;
uint64_t pipelines_addr;
struct kfd_mem_obj *pipeline_mem;
uint64_t fence_gpu_addr;
unsigned int *fence_addr;
struct kfd_mem_obj *fence_mem;
bool active_runlist;
};
#endif /* KFD_DEVICE_QUEUE_MANAGER_H_ */
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "kfd_priv.h"
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
/*
* This extension supports a kernel level doorbells management for
* the kernel queues.
* Basically the last doorbells page is devoted to kernel queues
* and that's assures that any user process won't get access to the
* kernel doorbells page
*/
static DEFINE_MUTEX(doorbell_mutex);
static unsigned long doorbell_available_index[
DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)] = { 0 };
#define KERNEL_DOORBELL_PASID 1
#define KFD_SIZE_OF_DOORBELL_IN_BYTES 4
/*
* Each device exposes a doorbell aperture, a PCI MMIO aperture that
* receives 32-bit writes that are passed to queues as wptr values.
* The doorbells are intended to be written by applications as part
* of queueing work on user-mode queues.
* We assign doorbells to applications in PAGE_SIZE-sized and aligned chunks.
* We map the doorbell address space into user-mode when a process creates
* its first queue on each device.
* Although the mapping is done by KFD, it is equivalent to an mmap of
* the /dev/kfd with the particular device encoded in the mmap offset.
* There will be other uses for mmap of /dev/kfd, so only a range of
* offsets (KFD_MMAP_DOORBELL_START-END) is used for doorbells.
*/
/* # of doorbell bytes allocated for each process. */
static inline size_t doorbell_process_allocation(void)
{
return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES *
KFD_MAX_NUM_OF_QUEUES_PER_PROCESS,
PAGE_SIZE);
}
/* Doorbell calculations for device init. */
void kfd_doorbell_init(struct kfd_dev *kfd)
{
size_t doorbell_start_offset;
size_t doorbell_aperture_size;
size_t doorbell_process_limit;
/*
* We start with calculations in bytes because the input data might
* only be byte-aligned.
* Only after we have done the rounding can we assume any alignment.
*/
doorbell_start_offset =
roundup(kfd->shared_resources.doorbell_start_offset,
doorbell_process_allocation());
doorbell_aperture_size =
rounddown(kfd->shared_resources.doorbell_aperture_size,
doorbell_process_allocation());
if (doorbell_aperture_size > doorbell_start_offset)
doorbell_process_limit =
(doorbell_aperture_size - doorbell_start_offset) /
doorbell_process_allocation();
else
doorbell_process_limit = 0;
kfd->doorbell_base = kfd->shared_resources.doorbell_physical_address +
doorbell_start_offset;
kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32);
kfd->doorbell_process_limit = doorbell_process_limit - 1;
kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base,
doorbell_process_allocation());
BUG_ON(!kfd->doorbell_kernel_ptr);
pr_debug("kfd: doorbell initialization:\n");
pr_debug("kfd: doorbell base == 0x%08lX\n",
(uintptr_t)kfd->doorbell_base);
pr_debug("kfd: doorbell_id_offset == 0x%08lX\n",
kfd->doorbell_id_offset);
pr_debug("kfd: doorbell_process_limit == 0x%08lX\n",
doorbell_process_limit);
pr_debug("kfd: doorbell_kernel_offset == 0x%08lX\n",
(uintptr_t)kfd->doorbell_base);
pr_debug("kfd: doorbell aperture size == 0x%08lX\n",
kfd->shared_resources.doorbell_aperture_size);
pr_debug("kfd: doorbell kernel address == 0x%08lX\n",
(uintptr_t)kfd->doorbell_kernel_ptr);
}
int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma)
{
phys_addr_t address;
struct kfd_dev *dev;
/*
* For simplicitly we only allow mapping of the entire doorbell
* allocation of a single device & process.
*/
if (vma->vm_end - vma->vm_start != doorbell_process_allocation())
return -EINVAL;
/* Find kfd device according to gpu id */
dev = kfd_device_by_id(vma->vm_pgoff);
if (dev == NULL)
return -EINVAL;
/* Find if pdd exists for combination of process and gpu id */
if (!kfd_get_process_device_data(dev, process, 0))
return -EINVAL;
/* Calculate physical address of doorbell */
address = kfd_get_process_doorbells(dev, process);
vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE |
VM_DONTDUMP | VM_PFNMAP;
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
pr_debug("kfd: mapping doorbell page in kfd_doorbell_mmap\n"
" target user address == 0x%08llX\n"
" physical address == 0x%08llX\n"
" vm_flags == 0x%04lX\n"
" size == 0x%04lX\n",
(unsigned long long) vma->vm_start, address, vma->vm_flags,
doorbell_process_allocation());
return io_remap_pfn_range(vma,
vma->vm_start,
address >> PAGE_SHIFT,
doorbell_process_allocation(),
vma->vm_page_prot);
}
/* get kernel iomem pointer for a doorbell */
u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
unsigned int *doorbell_off)
{
u32 inx;
BUG_ON(!kfd || !doorbell_off);
mutex_lock(&doorbell_mutex);
inx = find_first_zero_bit(doorbell_available_index,
KFD_MAX_NUM_OF_QUEUES_PER_PROCESS);
__set_bit(inx, doorbell_available_index);
mutex_unlock(&doorbell_mutex);
if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)
return NULL;
/*
* Calculating the kernel doorbell offset using "faked" kernel
* pasid that allocated for kernel queues only
*/
*doorbell_off = KERNEL_DOORBELL_PASID * (doorbell_process_allocation() /
sizeof(u32)) + inx;
pr_debug("kfd: get kernel queue doorbell\n"
" doorbell offset == 0x%08d\n"
" kernel address == 0x%08lX\n",
*doorbell_off, (uintptr_t)(kfd->doorbell_kernel_ptr + inx));
return kfd->doorbell_kernel_ptr + inx;
}
void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr)
{
unsigned int inx;
BUG_ON(!kfd || !db_addr);
inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr);
mutex_lock(&doorbell_mutex);
__clear_bit(inx, doorbell_available_index);
mutex_unlock(&doorbell_mutex);
}
inline void write_kernel_doorbell(u32 __iomem *db, u32 value)
{
if (db) {
writel(value, db);
pr_debug("writing %d to doorbell address 0x%p\n", value, db);
}
}
/*
* queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1
* to doorbells with the process's doorbell page
*/
unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd,
struct kfd_process *process,
unsigned int queue_id)
{
/*
* doorbell_id_offset accounts for doorbells taken by KGD.
* pasid * doorbell_process_allocation/sizeof(u32) adjusts
* to the process's doorbells
*/
return kfd->doorbell_id_offset +
process->pasid * (doorbell_process_allocation()/sizeof(u32)) +
queue_id;
}
uint64_t kfd_get_number_elems(struct kfd_dev *kfd)
{
uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size -
kfd->shared_resources.doorbell_start_offset) /
doorbell_process_allocation() + 1;
return num_of_elems;
}
phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev,
struct kfd_process *process)
{
return dev->doorbell_base +
process->pasid * doorbell_process_allocation();
}
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include <linux/device.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/compat.h>
#include <uapi/linux/kfd_ioctl.h>
#include <linux/time.h>
#include "kfd_priv.h"
#include <linux/mm.h>
#include <uapi/asm-generic/mman-common.h>
#include <asm/processor.h>
/*
* The primary memory I/O features being added for revisions of gfxip
* beyond 7.0 (Kaveri) are:
*
* Access to ATC/IOMMU mapped memory w/ associated extension of VA to 48b
*
* “Flat” shader memory access – These are new shader vector memory
* operations that do not reference a T#/V# so a “pointer” is what is
* sourced from the vector gprs for direct access to memory.
* This pointer space has the Shared(LDS) and Private(Scratch) memory
* mapped into this pointer space as apertures.
* The hardware then determines how to direct the memory request
* based on what apertures the request falls in.
*
* Unaligned support and alignment check
*
*
* System Unified Address - SUA
*
* The standard usage for GPU virtual addresses are that they are mapped by
* a set of page tables we call GPUVM and these page tables are managed by
* a combination of vidMM/driver software components. The current virtual
* address (VA) range for GPUVM is 40b.
*
* As of gfxip7.1 and beyond we’re adding the ability for compute memory
* clients (CP/RLC, DMA, SHADER(ifetch, scalar, and vector ops)) to access
* the same page tables used by host x86 processors and that are managed by
* the operating system. This is via a technique and hardware called ATC/IOMMU.
* The GPU has the capability of accessing both the GPUVM and ATC address
* spaces for a given VMID (process) simultaneously and we call this feature
* system unified address (SUA).
*
* There are three fundamental address modes of operation for a given VMID
* (process) on the GPU:
*
* HSA64 – 64b pointers and the default address space is ATC
* HSA32 – 32b pointers and the default address space is ATC
* GPUVM – 64b pointers and the default address space is GPUVM (driver
* model mode)
*
*
* HSA64 - ATC/IOMMU 64b
*
* A 64b pointer in the AMD64/IA64 CPU architecture is not fully utilized
* by the CPU so an AMD CPU can only access the high area
* (VA[63:47] == 0x1FFFF) and low area (VA[63:47 == 0) of the address space
* so the actual VA carried to translation is 48b. There is a “hole” in
* the middle of the 64b VA space.
*
* The GPU not only has access to all of the CPU accessible address space via
* ATC/IOMMU, but it also has access to the GPUVM address space. The “system
* unified address” feature (SUA) is the mapping of GPUVM and ATC address
* spaces into a unified pointer space. The method we take for 64b mode is
* to map the full 40b GPUVM address space into the hole of the 64b address
* space.
* The GPUVM_Base/GPUVM_Limit defines the aperture in the 64b space where we
* direct requests to be translated via GPUVM page tables instead of the
* IOMMU path.
*
*
* 64b to 49b Address conversion
*
* Note that there are still significant portions of unused regions (holes)
* in the 64b address space even for the GPU. There are several places in
* the pipeline (sw and hw), we wish to compress the 64b virtual address
* to a 49b address. This 49b address is constituted of an “ATC” bit
* plus a 48b virtual address. This 49b address is what is passed to the
* translation hardware. ATC==0 means the 48b address is a GPUVM address
* (max of 2^40 – 1) intended to be translated via GPUVM page tables.
* ATC==1 means the 48b address is intended to be translated via IOMMU
* page tables.
*
* A 64b pointer is compared to the apertures that are defined (Base/Limit), in
* this case the GPUVM aperture (red) is defined and if a pointer falls in this
* aperture, we subtract the GPUVM_Base address and set the ATC bit to zero
* as part of the 64b to 49b conversion.
*
* Where this 64b to 49b conversion is done is a function of the usage.
* Most GPU memory access is via memory objects where the driver builds
* a descriptor which consists of a base address and a memory access by
* the GPU usually consists of some kind of an offset or Cartesian coordinate
* that references this memory descriptor. This is the case for shader
* instructions that reference the T# or V# constants, or for specified
* locations of assets (ex. the shader program location). In these cases
* the driver is what handles the 64b to 49b conversion and the base
* address in the descriptor (ex. V# or T# or shader program location)
* is defined as a 48b address w/ an ATC bit. For this usage a given
* memory object cannot straddle multiple apertures in the 64b address
* space. For example a shader program cannot jump in/out between ATC
* and GPUVM space.
*
* In some cases we wish to pass a 64b pointer to the GPU hardware and
* the GPU hw does the 64b to 49b conversion before passing memory
* requests to the cache/memory system. This is the case for the
* S_LOAD and FLAT_* shader memory instructions where we have 64b pointers
* in scalar and vector GPRs respectively.
*
* In all cases (no matter where the 64b -> 49b conversion is done), the gfxip
* hardware sends a 48b address along w/ an ATC bit, to the memory controller
* on the memory request interfaces.
*
* <client>_MC_rdreq_atc // read request ATC bit
*
* 0 : <client>_MC_rdreq_addr is a GPUVM VA
*
* 1 : <client>_MC_rdreq_addr is a ATC VA
*
*
* “Spare” aperture (APE1)
*
* We use the GPUVM aperture to differentiate ATC vs. GPUVM, but we also use
* apertures to set the Mtype field for S_LOAD/FLAT_* ops which is input to the
* config tables for setting cache policies. The “spare” (APE1) aperture is
* motivated by getting a different Mtype from the default.
* The default aperture isn’t an actual base/limit aperture; it is just the
* address space that doesn’t hit any defined base/limit apertures.
* The following diagram is a complete picture of the gfxip7.x SUA apertures.
* The APE1 can be placed either below or above
* the hole (cannot be in the hole).
*
*
* General Aperture definitions and rules
*
* An aperture register definition consists of a Base, Limit, Mtype, and
* usually an ATC bit indicating which translation tables that aperture uses.
* In all cases (for SUA and DUA apertures discussed later), aperture base
* and limit definitions are 64KB aligned.
*
* <ape>_Base[63:0] = { <ape>_Base_register[63:16], 0x0000 }
*
* <ape>_Limit[63:0] = { <ape>_Limit_register[63:16], 0xFFFF }
*
* The base and limit are considered inclusive to an aperture so being
* inside an aperture means (address >= Base) AND (address <= Limit).
*
* In no case is a payload that straddles multiple apertures expected to work.
* For example a load_dword_x4 that starts in one aperture and ends in another,
* does not work. For the vector FLAT_* ops we have detection capability in
* the shader for reporting a “memory violation” back to the
* SQ block for use in traps.
* A memory violation results when an op falls into the hole,
* or a payload straddles multiple apertures. The S_LOAD instruction
* does not have this detection.
*
* Apertures cannot overlap.
*
*
*
* HSA32 - ATC/IOMMU 32b
*
* For HSA32 mode, the pointers are interpreted as 32 bits and use a single GPR
* instead of two for the S_LOAD and FLAT_* ops. The entire GPUVM space of 40b
* will not fit so there is only partial visibility to the GPUVM
* space (defined by the aperture) for S_LOAD and FLAT_* ops.
* There is no spare (APE1) aperture for HSA32 mode.
*
*
* GPUVM 64b mode (driver model)
*
* This mode is related to HSA64 in that the difference really is that
* the default aperture is GPUVM (ATC==0) and not ATC space.
* We have gfxip7.x hardware that has FLAT_* and S_LOAD support for
* SUA GPUVM mode, but does not support HSA32/HSA64.
*
*
* Device Unified Address - DUA
*
* Device unified address (DUA) is the name of the feature that maps the
* Shared(LDS) memory and Private(Scratch) memory into the overall address
* space for use by the new FLAT_* vector memory ops. The Shared and
* Private memories are mapped as apertures into the address space,
* and the hardware detects when a FLAT_* memory request is to be redirected
* to the LDS or Scratch memory when it falls into one of these apertures.
* Like the SUA apertures, the Shared/Private apertures are 64KB aligned and
* the base/limit is “in” the aperture. For both HSA64 and GPUVM SUA modes,
* the Shared/Private apertures are always placed in a limited selection of
* options in the hole of the 64b address space. For HSA32 mode, the
* Shared/Private apertures can be placed anywhere in the 32b space
* except at 0.
*
*
* HSA64 Apertures for FLAT_* vector ops
*
* For HSA64 SUA mode, the Shared and Private apertures are always placed
* in the hole w/ a limited selection of possible locations. The requests
* that fall in the private aperture are expanded as a function of the
* work-item id (tid) and redirected to the location of the
* “hidden private memory”. The hidden private can be placed in either GPUVM
* or ATC space. The addresses that fall in the shared aperture are
* re-directed to the on-chip LDS memory hardware.
*
*
* HSA32 Apertures for FLAT_* vector ops
*
* In HSA32 mode, the Private and Shared apertures can be placed anywhere
* in the 32b space except at 0 (Private or Shared Base at zero disables
* the apertures). If the base address of the apertures are non-zero
* (ie apertures exists), the size is always 64KB.
*
*
* GPUVM Apertures for FLAT_* vector ops
*
* In GPUVM mode, the Shared/Private apertures are specified identically
* to HSA64 mode where they are always in the hole at a limited selection
* of locations.
*
*
* Aperture Definitions for SUA and DUA
*
* The interpretation of the aperture register definitions for a given
* VMID is a function of the “SUA Mode” which is one of HSA64, HSA32, or
* GPUVM64 discussed in previous sections. The mode is first decoded, and
* then the remaining register decode is a function of the mode.
*
*
* SUA Mode Decode
*
* For the S_LOAD and FLAT_* shader operations, the SUA mode is decoded from
* the COMPUTE_DISPATCH_INITIATOR:DATA_ATC bit and
* the SH_MEM_CONFIG:PTR32 bits.
*
* COMPUTE_DISPATCH_INITIATOR:DATA_ATC SH_MEM_CONFIG:PTR32 Mode
*
* 1 0 HSA64
*
* 1 1 HSA32
*
* 0 X GPUVM64
*
* In general the hardware will ignore the PTR32 bit and treat
* as “0” whenever DATA_ATC = “0”, but sw should set PTR32=0
* when DATA_ATC=0.
*
* The DATA_ATC bit is only set for compute dispatches.
* All “Draw” dispatches are hardcoded to GPUVM64 mode
* for FLAT_* / S_LOAD operations.
*/
#define MAKE_GPUVM_APP_BASE(gpu_num) \
(((uint64_t)(gpu_num) << 61) + 0x1000000000000)
#define MAKE_GPUVM_APP_LIMIT(base) \
(((uint64_t)(base) & 0xFFFFFF0000000000) | 0xFFFFFFFFFF)
#define MAKE_SCRATCH_APP_BASE(gpu_num) \
(((uint64_t)(gpu_num) << 61) + 0x100000000)
#define MAKE_SCRATCH_APP_LIMIT(base) \
(((uint64_t)base & 0xFFFFFFFF00000000) | 0xFFFFFFFF)
#define MAKE_LDS_APP_BASE(gpu_num) \
(((uint64_t)(gpu_num) << 61) + 0x0)
#define MAKE_LDS_APP_LIMIT(base) \
(((uint64_t)(base) & 0xFFFFFFFF00000000) | 0xFFFFFFFF)
int kfd_init_apertures(struct kfd_process *process)
{
uint8_t id = 0;
struct kfd_dev *dev;
struct kfd_process_device *pdd;
mutex_lock(&process->mutex);
/*Iterating over all devices*/
while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL &&
id < NUM_OF_SUPPORTED_GPUS) {
pdd = kfd_get_process_device_data(dev, process, 1);
/*
* For 64 bit process aperture will be statically reserved in
* the x86_64 non canonical process address space
* amdkfd doesn't currently support apertures for 32 bit process
*/
if (process->is_32bit_user_mode) {
pdd->lds_base = pdd->lds_limit = 0;
pdd->gpuvm_base = pdd->gpuvm_limit = 0;
pdd->scratch_base = pdd->scratch_limit = 0;
} else {
/*
* node id couldn't be 0 - the three MSB bits of
* aperture shoudn't be 0
*/
pdd->lds_base = MAKE_LDS_APP_BASE(id + 1);
pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1);
pdd->gpuvm_limit =
MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base);
pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1);
pdd->scratch_limit =
MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
}
dev_dbg(kfd_device, "node id %u\n", id);
dev_dbg(kfd_device, "gpu id %u\n", pdd->dev->id);
dev_dbg(kfd_device, "lds_base %llX\n", pdd->lds_base);
dev_dbg(kfd_device, "lds_limit %llX\n", pdd->lds_limit);
dev_dbg(kfd_device, "gpuvm_base %llX\n", pdd->gpuvm_base);
dev_dbg(kfd_device, "gpuvm_limit %llX\n", pdd->gpuvm_limit);
dev_dbg(kfd_device, "scratch_base %llX\n", pdd->scratch_base);
dev_dbg(kfd_device, "scratch_limit %llX\n", pdd->scratch_limit);
id++;
}
mutex_unlock(&process->mutex);
return 0;
}
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/*
* KFD Interrupts.
*
* AMD GPUs deliver interrupts by pushing an interrupt description onto the
* interrupt ring and then sending an interrupt. KGD receives the interrupt
* in ISR and sends us a pointer to each new entry on the interrupt ring.
*
* We generally can't process interrupt-signaled events from ISR, so we call
* out to each interrupt client module (currently only the scheduler) to ask if
* each interrupt is interesting. If they return true, then it requires further
* processing so we copy it to an internal interrupt ring and call each
* interrupt client again from a work-queue.
*
* There's no acknowledgment for the interrupts we use. The hardware simply
* queues a new interrupt each time without waiting.
*
* The fixed-size internal queue means that it's possible for us to lose
* interrupts because we have no back-pressure to the hardware.
*/
#include <linux/slab.h>
#include <linux/device.h>
#include "kfd_priv.h"
#define KFD_INTERRUPT_RING_SIZE 256
static void interrupt_wq(struct work_struct *);
int kfd_interrupt_init(struct kfd_dev *kfd)
{
void *interrupt_ring = kmalloc_array(KFD_INTERRUPT_RING_SIZE,
kfd->device_info->ih_ring_entry_size,
GFP_KERNEL);
if (!interrupt_ring)
return -ENOMEM;
kfd->interrupt_ring = interrupt_ring;
kfd->interrupt_ring_size =
KFD_INTERRUPT_RING_SIZE * kfd->device_info->ih_ring_entry_size;
atomic_set(&kfd->interrupt_ring_wptr, 0);
atomic_set(&kfd->interrupt_ring_rptr, 0);
spin_lock_init(&kfd->interrupt_lock);
INIT_WORK(&kfd->interrupt_work, interrupt_wq);
kfd->interrupts_active = true;
/*
* After this function returns, the interrupt will be enabled. This
* barrier ensures that the interrupt running on a different processor
* sees all the above writes.
*/
smp_wmb();
return 0;
}
void kfd_interrupt_exit(struct kfd_dev *kfd)
{
/*
* Stop the interrupt handler from writing to the ring and scheduling
* workqueue items. The spinlock ensures that any interrupt running
* after we have unlocked sees interrupts_active = false.
*/
unsigned long flags;
spin_lock_irqsave(&kfd->interrupt_lock, flags);
kfd->interrupts_active = false;
spin_unlock_irqrestore(&kfd->interrupt_lock, flags);
/*
* Flush_scheduled_work ensures that there are no outstanding
* work-queue items that will access interrupt_ring. New work items
* can't be created because we stopped interrupt handling above.
*/
flush_scheduled_work();
kfree(kfd->interrupt_ring);
}
/*
* This assumes that it can't be called concurrently with itself
* but only with dequeue_ih_ring_entry.
*/
bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry)
{
unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr);
unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr);
if ((rptr - wptr) % kfd->interrupt_ring_size ==
kfd->device_info->ih_ring_entry_size) {
/* This is very bad, the system is likely to hang. */
dev_err_ratelimited(kfd_chardev(),
"Interrupt ring overflow, dropping interrupt.\n");
return false;
}
memcpy(kfd->interrupt_ring + wptr, ih_ring_entry,
kfd->device_info->ih_ring_entry_size);
wptr = (wptr + kfd->device_info->ih_ring_entry_size) %
kfd->interrupt_ring_size;
smp_wmb(); /* Ensure memcpy'd data is visible before wptr update. */
atomic_set(&kfd->interrupt_ring_wptr, wptr);
return true;
}
/*
* This assumes that it can't be called concurrently with itself
* but only with enqueue_ih_ring_entry.
*/
static bool dequeue_ih_ring_entry(struct kfd_dev *kfd, void *ih_ring_entry)
{
/*
* Assume that wait queues have an implicit barrier, i.e. anything that
* happened in the ISR before it queued work is visible.
*/
unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr);
unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr);
if (rptr == wptr)
return false;
memcpy(ih_ring_entry, kfd->interrupt_ring + rptr,
kfd->device_info->ih_ring_entry_size);
rptr = (rptr + kfd->device_info->ih_ring_entry_size) %
kfd->interrupt_ring_size;
/*
* Ensure the rptr write update is not visible until
* memcpy has finished reading.
*/
smp_mb();
atomic_set(&kfd->interrupt_ring_rptr, rptr);
return true;
}
static void interrupt_wq(struct work_struct *work)
{
struct kfd_dev *dev = container_of(work, struct kfd_dev,
interrupt_work);
uint32_t ih_ring_entry[DIV_ROUND_UP(
dev->device_info->ih_ring_entry_size,
sizeof(uint32_t))];
while (dequeue_ih_ring_entry(dev, ih_ring_entry))
;
}
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include <linux/types.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/printk.h>
#include "kfd_kernel_queue.h"
#include "kfd_priv.h"
#include "kfd_device_queue_manager.h"
#include "kfd_pm4_headers.h"
#include "kfd_pm4_opcodes.h"
#define PM4_COUNT_ZERO (((1 << 15) - 1) << 16)
static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
enum kfd_queue_type type, unsigned int queue_size)
{
struct queue_properties prop;
int retval;
union PM4_MES_TYPE_3_HEADER nop;
BUG_ON(!kq || !dev);
BUG_ON(type != KFD_QUEUE_TYPE_DIQ && type != KFD_QUEUE_TYPE_HIQ);
pr_debug("kfd: In func %s initializing queue type %d size %d\n",
__func__, KFD_QUEUE_TYPE_HIQ, queue_size);
nop.opcode = IT_NOP;
nop.type = PM4_TYPE_3;
nop.u32all |= PM4_COUNT_ZERO;
kq->dev = dev;
kq->nop_packet = nop.u32all;
switch (type) {
case KFD_QUEUE_TYPE_DIQ:
case KFD_QUEUE_TYPE_HIQ:
kq->mqd = dev->dqm->get_mqd_manager(dev->dqm,
KFD_MQD_TYPE_CIK_HIQ);
break;
default:
BUG();
break;
}
if (kq->mqd == NULL)
return false;
prop.doorbell_ptr =
(uint32_t *)kfd_get_kernel_doorbell(dev, &prop.doorbell_off);
if (prop.doorbell_ptr == NULL)
goto err_get_kernel_doorbell;
retval = kfd2kgd->allocate_mem(dev->kgd,
queue_size,
PAGE_SIZE,
KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
(struct kgd_mem **) &kq->pq);
if (retval != 0)
goto err_pq_allocate_vidmem;
kq->pq_kernel_addr = kq->pq->cpu_ptr;
kq->pq_gpu_addr = kq->pq->gpu_addr;
retval = kfd2kgd->allocate_mem(dev->kgd,
sizeof(*kq->rptr_kernel),
32,
KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
(struct kgd_mem **) &kq->rptr_mem);
if (retval != 0)
goto err_rptr_allocate_vidmem;
kq->rptr_kernel = kq->rptr_mem->cpu_ptr;
kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr;
retval = kfd2kgd->allocate_mem(dev->kgd,
sizeof(*kq->wptr_kernel),
32,
KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
(struct kgd_mem **) &kq->wptr_mem);
if (retval != 0)
goto err_wptr_allocate_vidmem;
kq->wptr_kernel = kq->wptr_mem->cpu_ptr;
kq->wptr_gpu_addr = kq->wptr_mem->gpu_addr;
memset(kq->pq_kernel_addr, 0, queue_size);
memset(kq->rptr_kernel, 0, sizeof(*kq->rptr_kernel));
memset(kq->wptr_kernel, 0, sizeof(*kq->wptr_kernel));
prop.queue_size = queue_size;
prop.is_interop = false;
prop.priority = 1;
prop.queue_percent = 100;
prop.type = type;
prop.vmid = 0;
prop.queue_address = kq->pq_gpu_addr;
prop.read_ptr = (uint32_t *) kq->rptr_gpu_addr;
prop.write_ptr = (uint32_t *) kq->wptr_gpu_addr;
if (init_queue(&kq->queue, prop) != 0)
goto err_init_queue;
kq->queue->device = dev;
kq->queue->process = kfd_get_process(current);
retval = kq->mqd->init_mqd(kq->mqd, &kq->queue->mqd,
&kq->queue->mqd_mem_obj,
&kq->queue->gart_mqd_addr,
&kq->queue->properties);
if (retval != 0)
goto err_init_mqd;
/* assign HIQ to HQD */
if (type == KFD_QUEUE_TYPE_HIQ) {
pr_debug("assigning hiq to hqd\n");
kq->queue->pipe = KFD_CIK_HIQ_PIPE;
kq->queue->queue = KFD_CIK_HIQ_QUEUE;
kq->mqd->load_mqd(kq->mqd, kq->queue->mqd, kq->queue->pipe,
kq->queue->queue, NULL);
} else {
/* allocate fence for DIQ */
retval = kfd2kgd->allocate_mem(dev->kgd,
sizeof(uint32_t),
32,
KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
(struct kgd_mem **) &kq->fence_mem_obj);
if (retval != 0)
goto err_alloc_fence;
kq->fence_kernel_address = kq->fence_mem_obj->cpu_ptr;
kq->fence_gpu_addr = kq->fence_mem_obj->gpu_addr;
}
print_queue(kq->queue);
return true;
err_alloc_fence:
err_init_mqd:
uninit_queue(kq->queue);
err_init_queue:
kfd2kgd->free_mem(dev->kgd, (struct kgd_mem *) kq->wptr_mem);
err_wptr_allocate_vidmem:
kfd2kgd->free_mem(dev->kgd, (struct kgd_mem *) kq->rptr_mem);
err_rptr_allocate_vidmem:
kfd2kgd->free_mem(dev->kgd, (struct kgd_mem *) kq->pq);
err_pq_allocate_vidmem:
pr_err("kfd: error init pq\n");
kfd_release_kernel_doorbell(dev, (u32 *)prop.doorbell_ptr);
err_get_kernel_doorbell:
pr_err("kfd: error init doorbell");
return false;
}
static void uninitialize(struct kernel_queue *kq)
{
BUG_ON(!kq);
if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
kq->mqd->destroy_mqd(kq->mqd,
NULL,
false,
QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS,
kq->queue->pipe,
kq->queue->queue);
kfd2kgd->free_mem(kq->dev->kgd, (struct kgd_mem *) kq->rptr_mem);
kfd2kgd->free_mem(kq->dev->kgd, (struct kgd_mem *) kq->wptr_mem);
kfd2kgd->free_mem(kq->dev->kgd, (struct kgd_mem *) kq->pq);
kfd_release_kernel_doorbell(kq->dev,
(u32 *)kq->queue->properties.doorbell_ptr);
uninit_queue(kq->queue);
}
static int acquire_packet_buffer(struct kernel_queue *kq,
size_t packet_size_in_dwords, unsigned int **buffer_ptr)
{
size_t available_size;
size_t queue_size_dwords;
uint32_t wptr, rptr;
unsigned int *queue_address;
BUG_ON(!kq || !buffer_ptr);
rptr = *kq->rptr_kernel;
wptr = *kq->wptr_kernel;
queue_address = (unsigned int *)kq->pq_kernel_addr;
queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t);
pr_debug("kfd: In func %s\nrptr: %d\nwptr: %d\nqueue_address 0x%p\n",
__func__, rptr, wptr, queue_address);
available_size = (rptr - 1 - wptr + queue_size_dwords) %
queue_size_dwords;
if (packet_size_in_dwords >= queue_size_dwords ||
packet_size_in_dwords >= available_size)
return -ENOMEM;
if (wptr + packet_size_in_dwords >= queue_size_dwords) {
while (wptr > 0) {
queue_address[wptr] = kq->nop_packet;
wptr = (wptr + 1) % queue_size_dwords;
}
}
*buffer_ptr = &queue_address[wptr];
kq->pending_wptr = wptr + packet_size_in_dwords;
return 0;
}
static void submit_packet(struct kernel_queue *kq)
{
#ifdef DEBUG
int i;
#endif
BUG_ON(!kq);
#ifdef DEBUG
for (i = *kq->wptr_kernel; i < kq->pending_wptr; i++) {
pr_debug("0x%2X ", kq->pq_kernel_addr[i]);
if (i % 15 == 0)
pr_debug("\n");
}
pr_debug("\n");
#endif
*kq->wptr_kernel = kq->pending_wptr;
write_kernel_doorbell((u32 *)kq->queue->properties.doorbell_ptr,
kq->pending_wptr);
}
static int sync_with_hw(struct kernel_queue *kq, unsigned long timeout_ms)
{
unsigned long org_timeout_ms;
BUG_ON(!kq);
org_timeout_ms = timeout_ms;
timeout_ms += jiffies * 1000 / HZ;
while (*kq->wptr_kernel != *kq->rptr_kernel) {
if (time_after(jiffies * 1000 / HZ, timeout_ms)) {
pr_err("kfd: kernel_queue %s timeout expired %lu\n",
__func__, org_timeout_ms);
pr_err("kfd: wptr: %d rptr: %d\n",
*kq->wptr_kernel, *kq->rptr_kernel);
return -ETIME;
}
cpu_relax();
}
return 0;
}
static void rollback_packet(struct kernel_queue *kq)
{
BUG_ON(!kq);
kq->pending_wptr = *kq->queue->properties.write_ptr;
}
struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
enum kfd_queue_type type)
{
struct kernel_queue *kq;
BUG_ON(!dev);
kq = kzalloc(sizeof(struct kernel_queue), GFP_KERNEL);
if (!kq)
return NULL;
kq->initialize = initialize;
kq->uninitialize = uninitialize;
kq->acquire_packet_buffer = acquire_packet_buffer;
kq->submit_packet = submit_packet;
kq->sync_with_hw = sync_with_hw;
kq->rollback_packet = rollback_packet;
if (kq->initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE) == false) {
pr_err("kfd: failed to init kernel queue\n");
kfree(kq);
return NULL;
}
return kq;
}
void kernel_queue_uninit(struct kernel_queue *kq)
{
BUG_ON(!kq);
kq->uninitialize(kq);
kfree(kq);
}
void test_kq(struct kfd_dev *dev)
{
struct kernel_queue *kq;
uint32_t *buffer, i;
int retval;
BUG_ON(!dev);
pr_debug("kfd: starting kernel queue test\n");
kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ);
BUG_ON(!kq);
retval = kq->acquire_packet_buffer(kq, 5, &buffer);
BUG_ON(retval != 0);
for (i = 0; i < 5; i++)
buffer[i] = kq->nop_packet;
kq->submit_packet(kq);
kq->sync_with_hw(kq, 1000);
pr_debug("kfd: ending kernel queue test\n");
}
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef KFD_KERNEL_QUEUE_H_
#define KFD_KERNEL_QUEUE_H_
#include <linux/list.h>
#include <linux/types.h>
#include "kfd_priv.h"
struct kernel_queue {
/* interface */
bool (*initialize)(struct kernel_queue *kq, struct kfd_dev *dev,
enum kfd_queue_type type, unsigned int queue_size);
void (*uninitialize)(struct kernel_queue *kq);
int (*acquire_packet_buffer)(struct kernel_queue *kq,
size_t packet_size_in_dwords,
unsigned int **buffer_ptr);
void (*submit_packet)(struct kernel_queue *kq);
int (*sync_with_hw)(struct kernel_queue *kq,
unsigned long timeout_ms);
void (*rollback_packet)(struct kernel_queue *kq);
/* data */
struct kfd_dev *dev;
struct mqd_manager *mqd;
struct queue *queue;
uint32_t pending_wptr;
unsigned int nop_packet;
struct kfd_mem_obj *rptr_mem;
uint32_t *rptr_kernel;
uint64_t rptr_gpu_addr;
struct kfd_mem_obj *wptr_mem;
uint32_t *wptr_kernel;
uint64_t wptr_gpu_addr;
struct kfd_mem_obj *pq;
uint64_t pq_gpu_addr;
uint32_t *pq_kernel_addr;
struct kfd_mem_obj *fence_mem_obj;
uint64_t fence_gpu_addr;
void *fence_kernel_address;
struct list_head list;
};
#endif /* KFD_KERNEL_QUEUE_H_ */
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <linux/device.h>
#include "kfd_priv.h"
#define KFD_DRIVER_AUTHOR "AMD Inc. and others"
#define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs"
#define KFD_DRIVER_DATE "20141113"
#define KFD_DRIVER_MAJOR 0
#define KFD_DRIVER_MINOR 7
#define KFD_DRIVER_PATCHLEVEL 0
const struct kfd2kgd_calls *kfd2kgd;
static const struct kgd2kfd_calls kgd2kfd = {
.exit = kgd2kfd_exit,
.probe = kgd2kfd_probe,
.device_init = kgd2kfd_device_init,
.device_exit = kgd2kfd_device_exit,
.interrupt = kgd2kfd_interrupt,
.suspend = kgd2kfd_suspend,
.resume = kgd2kfd_resume,
};
int sched_policy = KFD_SCHED_POLICY_HWS;
module_param(sched_policy, int, 0444);
MODULE_PARM_DESC(sched_policy,
"Kernel cmdline parameter that defines the amdkfd scheduling policy");
int max_num_of_processes = KFD_MAX_NUM_OF_PROCESSES_DEFAULT;
module_param(max_num_of_processes, int, 0444);
MODULE_PARM_DESC(max_num_of_processes,
"Kernel cmdline parameter that defines the amdkfd maximum number of supported processes");
int max_num_of_queues_per_process = KFD_MAX_NUM_OF_QUEUES_PER_PROCESS_DEFAULT;
module_param(max_num_of_queues_per_process, int, 0444);
MODULE_PARM_DESC(max_num_of_queues_per_process,
"Kernel cmdline parameter that defines the amdkfd maximum number of supported queues per process");
bool kgd2kfd_init(unsigned interface_version,
const struct kfd2kgd_calls *f2g,
const struct kgd2kfd_calls **g2f)
{
/*
* Only one interface version is supported,
* no kfd/kgd version skew allowed.
*/
if (interface_version != KFD_INTERFACE_VERSION)
return false;
/* Protection against multiple amd kgd loads */
if (kfd2kgd)
return true;
kfd2kgd = f2g;
*g2f = &kgd2kfd;
return true;
}
EXPORT_SYMBOL(kgd2kfd_init);
void kgd2kfd_exit(void)
{
}
static int __init kfd_module_init(void)
{
int err;
kfd2kgd = NULL;
/* Verify module parameters */
if ((sched_policy < KFD_SCHED_POLICY_HWS) ||
(sched_policy > KFD_SCHED_POLICY_NO_HWS)) {
pr_err("kfd: sched_policy has invalid value\n");
return -1;
}
/* Verify module parameters */
if ((max_num_of_processes < 0) ||
(max_num_of_processes > KFD_MAX_NUM_OF_PROCESSES)) {
pr_err("kfd: max_num_of_processes must be between 0 to KFD_MAX_NUM_OF_PROCESSES\n");
return -1;
}
if ((max_num_of_queues_per_process < 0) ||
(max_num_of_queues_per_process >
KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)) {
pr_err("kfd: max_num_of_queues_per_process must be between 0 to KFD_MAX_NUM_OF_QUEUES_PER_PROCESS\n");
return -1;
}
err = kfd_pasid_init();
if (err < 0)
goto err_pasid;
err = kfd_chardev_init();
if (err < 0)
goto err_ioctl;
err = kfd_topology_init();
if (err < 0)
goto err_topology;
kfd_process_create_wq();
dev_info(kfd_device, "Initialized module\n");
return 0;
err_topology:
kfd_chardev_exit();
err_ioctl:
kfd_pasid_exit();
err_pasid:
return err;
}
static void __exit kfd_module_exit(void)
{
kfd_process_destroy_wq();
kfd_topology_shutdown();
kfd_chardev_exit();
kfd_pasid_exit();
dev_info(kfd_device, "Removed module\n");
}
module_init(kfd_module_init);
module_exit(kfd_module_exit);
MODULE_AUTHOR(KFD_DRIVER_AUTHOR);
MODULE_DESCRIPTION(KFD_DRIVER_DESC);
MODULE_LICENSE("GPL and additional rights");
MODULE_VERSION(__stringify(KFD_DRIVER_MAJOR) "."
__stringify(KFD_DRIVER_MINOR) "."
__stringify(KFD_DRIVER_PATCHLEVEL));
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include <linux/printk.h>
#include <linux/slab.h>
#include "kfd_priv.h"
#include "kfd_mqd_manager.h"
#include "cik_regs.h"
#include "../../radeon/cik_reg.h"
inline void busy_wait(unsigned long ms)
{
while (time_before(jiffies, ms))
cpu_relax();
}
static inline struct cik_mqd *get_mqd(void *mqd)
{
return (struct cik_mqd *)mqd;
}
static int init_mqd(struct mqd_manager *mm, void **mqd,
struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
uint64_t addr;
struct cik_mqd *m;
int retval;
BUG_ON(!mm || !q || !mqd);
pr_debug("kfd: In func %s\n", __func__);
retval = kfd2kgd->allocate_mem(mm->dev->kgd,
sizeof(struct cik_mqd),
256,
KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
(struct kgd_mem **) mqd_mem_obj);
if (retval != 0)
return -ENOMEM;
m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr;
addr = (*mqd_mem_obj)->gpu_addr;
memset(m, 0, ALIGN(sizeof(struct cik_mqd), 256));
m->header = 0xC0310800;
m->compute_pipelinestat_enable = 1;
m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
/*
* Make sure to use the last queue state saved on mqd when the cp
* reassigns the queue, so when queue is switched on/off (e.g over
* subscription or quantum timeout) the context will be consistent
*/
m->cp_hqd_persistent_state =
DEFAULT_CP_HQD_PERSISTENT_STATE | PRELOAD_REQ;
m->cp_mqd_control = MQD_CONTROL_PRIV_STATE_EN;
m->cp_mqd_base_addr_lo = lower_32_bits(addr);
m->cp_mqd_base_addr_hi = upper_32_bits(addr);
m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE | IB_ATC_EN;
/* Although WinKFD writes this, I suspect it should not be necessary */
m->cp_hqd_ib_control = IB_ATC_EN | DEFAULT_MIN_IB_AVAIL_SIZE;
m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS |
QUANTUM_DURATION(10);
/*
* Pipe Priority
* Identifies the pipe relative priority when this queue is connected
* to the pipeline. The pipe priority is against the GFX pipe and HP3D.
* In KFD we are using a fixed pipe priority set to CS_MEDIUM.
* 0 = CS_LOW (typically below GFX)
* 1 = CS_MEDIUM (typically between HP3D and GFX
* 2 = CS_HIGH (typically above HP3D)
*/
m->cp_hqd_pipe_priority = 1;
m->cp_hqd_queue_priority = 15;
*mqd = m;
if (gart_addr != NULL)
*gart_addr = addr;
retval = mm->update_mqd(mm, m, q);
return retval;
}
static void uninit_mqd(struct mqd_manager *mm, void *mqd,
struct kfd_mem_obj *mqd_mem_obj)
{
BUG_ON(!mm || !mqd);
kfd2kgd->free_mem(mm->dev->kgd, (struct kgd_mem *) mqd_mem_obj);
}
static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id,
uint32_t queue_id, uint32_t __user *wptr)
{
return kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, wptr);
}
static int update_mqd(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
struct cik_mqd *m;
BUG_ON(!mm || !q || !mqd);
pr_debug("kfd: In func %s\n", __func__);
m = get_mqd(mqd);
m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE |
DEFAULT_MIN_AVAIL_SIZE | PQ_ATC_EN;
/*
* Calculating queue size which is log base 2 of actual queue size -1
* dwords and another -1 for ffs
*/
m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int))
- 1 - 1;
m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
m->cp_hqd_pq_doorbell_control = DOORBELL_EN |
DOORBELL_OFFSET(q->doorbell_off);
m->cp_hqd_vmid = q->vmid;
if (q->format == KFD_QUEUE_FORMAT_AQL) {
m->cp_hqd_iq_rptr = AQL_ENABLE;
m->cp_hqd_pq_control |= NO_UPDATE_RPTR;
}
m->cp_hqd_active = 0;
q->is_active = false;
if (q->queue_size > 0 &&
q->queue_address != 0 &&
q->queue_percent > 0) {
m->cp_hqd_active = 1;
q->is_active = true;
}
return 0;
}
static int destroy_mqd(struct mqd_manager *mm, void *mqd,
enum kfd_preempt_type type,
unsigned int timeout, uint32_t pipe_id,
uint32_t queue_id)
{
return kfd2kgd->hqd_destroy(mm->dev->kgd, type, timeout,
pipe_id, queue_id);
}
bool is_occupied(struct mqd_manager *mm, void *mqd,
uint64_t queue_address, uint32_t pipe_id,
uint32_t queue_id)
{
return kfd2kgd->hqd_is_occupies(mm->dev->kgd, queue_address,
pipe_id, queue_id);
}
/*
* HIQ MQD Implementation, concrete implementation for HIQ MQD implementation.
* The HIQ queue in Kaveri is using the same MQD structure as all the user mode
* queues but with different initial values.
*/
static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
uint64_t addr;
struct cik_mqd *m;
int retval;
BUG_ON(!mm || !q || !mqd || !mqd_mem_obj);
pr_debug("kfd: In func %s\n", __func__);
retval = kfd2kgd->allocate_mem(mm->dev->kgd,
sizeof(struct cik_mqd),
256,
KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
(struct kgd_mem **) mqd_mem_obj);
if (retval != 0)
return -ENOMEM;
m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr;
addr = (*mqd_mem_obj)->gpu_addr;
memset(m, 0, ALIGN(sizeof(struct cik_mqd), 256));
m->header = 0xC0310800;
m->compute_pipelinestat_enable = 1;
m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
m->cp_hqd_persistent_state = DEFAULT_CP_HQD_PERSISTENT_STATE |
PRELOAD_REQ;
m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS |
QUANTUM_DURATION(10);
m->cp_mqd_control = MQD_CONTROL_PRIV_STATE_EN;
m->cp_mqd_base_addr_lo = lower_32_bits(addr);
m->cp_mqd_base_addr_hi = upper_32_bits(addr);
m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE;
/*
* Pipe Priority
* Identifies the pipe relative priority when this queue is connected
* to the pipeline. The pipe priority is against the GFX pipe and HP3D.
* In KFD we are using a fixed pipe priority set to CS_MEDIUM.
* 0 = CS_LOW (typically below GFX)
* 1 = CS_MEDIUM (typically between HP3D and GFX
* 2 = CS_HIGH (typically above HP3D)
*/
m->cp_hqd_pipe_priority = 1;
m->cp_hqd_queue_priority = 15;
*mqd = m;
if (gart_addr)
*gart_addr = addr;
retval = mm->update_mqd(mm, m, q);
return retval;
}
static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
struct cik_mqd *m;
BUG_ON(!mm || !q || !mqd);
pr_debug("kfd: In func %s\n", __func__);
m = get_mqd(mqd);
m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE |
DEFAULT_MIN_AVAIL_SIZE |
PRIV_STATE |
KMD_QUEUE;
/*
* Calculating queue size which is log base 2 of actual queue
* size -1 dwords
*/
m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int))
- 1 - 1;
m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
m->cp_hqd_pq_doorbell_control = DOORBELL_EN |
DOORBELL_OFFSET(q->doorbell_off);
m->cp_hqd_vmid = q->vmid;
m->cp_hqd_active = 0;
q->is_active = false;
if (q->queue_size > 0 &&
q->queue_address != 0 &&
q->queue_percent > 0) {
m->cp_hqd_active = 1;
q->is_active = true;
}
return 0;
}
struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
struct kfd_dev *dev)
{
struct mqd_manager *mqd;
BUG_ON(!dev);
BUG_ON(type >= KFD_MQD_TYPE_MAX);
pr_debug("kfd: In func %s\n", __func__);
mqd = kzalloc(sizeof(struct mqd_manager), GFP_KERNEL);
if (!mqd)
return NULL;
mqd->dev = dev;
switch (type) {
case KFD_MQD_TYPE_CIK_CP:
case KFD_MQD_TYPE_CIK_COMPUTE:
mqd->init_mqd = init_mqd;
mqd->uninit_mqd = uninit_mqd;
mqd->load_mqd = load_mqd;
mqd->update_mqd = update_mqd;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
break;
case KFD_MQD_TYPE_CIK_HIQ:
mqd->init_mqd = init_mqd_hiq;
mqd->uninit_mqd = uninit_mqd;
mqd->load_mqd = load_mqd;
mqd->update_mqd = update_mqd_hiq;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
break;
default:
kfree(mqd);
return NULL;
}
return mqd;
}
/* SDMA queues should be implemented here when the cp will supports them */
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef KFD_MQD_MANAGER_H_
#define KFD_MQD_MANAGER_H_
#include "kfd_priv.h"
/**
* struct mqd_manager
*
* @init_mqd: Allocates the mqd buffer on local gpu memory and initialize it.
*
* @load_mqd: Loads the mqd to a concrete hqd slot. Used only for no cp
* scheduling mode.
*
* @update_mqd: Handles a update call for the MQD
*
* @destroy_mqd: Destroys the HQD slot and by that preempt the relevant queue.
* Used only for no cp scheduling.
*
* @uninit_mqd: Releases the mqd buffer from local gpu memory.
*
* @is_occupied: Checks if the relevant HQD slot is occupied.
*
* @mqd_mutex: Mqd manager mutex.
*
* @dev: The kfd device structure coupled with this module.
*
* MQD stands for Memory Queue Descriptor which represents the current queue
* state in the memory and initiate the HQD (Hardware Queue Descriptor) state.
* This structure is actually a base class for the different types of MQDs
* structures for the variant ASICs that should be supported in the future.
* This base class is also contains all the MQD specific operations.
* Another important thing to mention is that each queue has a MQD that keeps
* his state (or context) after each preemption or reassignment.
* Basically there are a instances of the mqd manager class per MQD type per
* ASIC. Currently the kfd driver supports only Kaveri so there are instances
* per KFD_MQD_TYPE for each device.
*
*/
struct mqd_manager {
int (*init_mqd)(struct mqd_manager *mm, void **mqd,
struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q);
int (*load_mqd)(struct mqd_manager *mm, void *mqd,
uint32_t pipe_id, uint32_t queue_id,
uint32_t __user *wptr);
int (*update_mqd)(struct mqd_manager *mm, void *mqd,
struct queue_properties *q);
int (*destroy_mqd)(struct mqd_manager *mm, void *mqd,
enum kfd_preempt_type type,
unsigned int timeout, uint32_t pipe_id,
uint32_t queue_id);
void (*uninit_mqd)(struct mqd_manager *mm, void *mqd,
struct kfd_mem_obj *mqd_mem_obj);
bool (*is_occupied)(struct mqd_manager *mm, void *mqd,
uint64_t queue_address, uint32_t pipe_id,
uint32_t queue_id);
struct mutex mqd_mutex;
struct kfd_dev *dev;
};
#endif /* KFD_MQD_MANAGER_H_ */
此差异已折叠。
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <linux/slab.h>
#include <linux/types.h>
#include "kfd_priv.h"
static unsigned long *pasid_bitmap;
static unsigned int pasid_limit;
static DEFINE_MUTEX(pasid_mutex);
int kfd_pasid_init(void)
{
pasid_limit = max_num_of_processes;
pasid_bitmap = kzalloc(DIV_ROUND_UP(pasid_limit, BITS_PER_BYTE),
GFP_KERNEL);
if (!pasid_bitmap)
return -ENOMEM;
set_bit(0, pasid_bitmap); /* PASID 0 is reserved. */
return 0;
}
void kfd_pasid_exit(void)
{
kfree(pasid_bitmap);
}
bool kfd_set_pasid_limit(unsigned int new_limit)
{
if (new_limit < pasid_limit) {
bool ok;
mutex_lock(&pasid_mutex);
/* ensure that no pasids >= new_limit are in-use */
ok = (find_next_bit(pasid_bitmap, pasid_limit, new_limit) ==
pasid_limit);
if (ok)
pasid_limit = new_limit;
mutex_unlock(&pasid_mutex);
return ok;
}
return true;
}
inline unsigned int kfd_get_pasid_limit(void)
{
return pasid_limit;
}
unsigned int kfd_pasid_alloc(void)
{
unsigned int found;
mutex_lock(&pasid_mutex);
found = find_first_zero_bit(pasid_bitmap, pasid_limit);
if (found == pasid_limit)
found = 0;
else
set_bit(found, pasid_bitmap);
mutex_unlock(&pasid_mutex);
return found;
}
void kfd_pasid_free(unsigned int pasid)
{
BUG_ON(pasid == 0 || pasid >= pasid_limit);
clear_bit(pasid, pasid_bitmap);
}
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef KFD_PM4_HEADERS_H_
#define KFD_PM4_HEADERS_H_
#ifndef PM4_MES_HEADER_DEFINED
#define PM4_MES_HEADER_DEFINED
union PM4_MES_TYPE_3_HEADER {
struct {
uint32_t reserved1:8; /* < reserved */
uint32_t opcode:8; /* < IT opcode */
uint32_t count:14; /* < number of DWORDs - 1
* in the information body.
*/
uint32_t type:2; /* < packet identifier.
* It should be 3 for type 3 packets
*/
};
uint32_t u32all;
};
#endif /* PM4_MES_HEADER_DEFINED */
/* --------------------MES_SET_RESOURCES-------------------- */
#ifndef PM4_MES_SET_RESOURCES_DEFINED
#define PM4_MES_SET_RESOURCES_DEFINED
enum set_resources_queue_type_enum {
queue_type__mes_set_resources__kernel_interface_queue_kiq = 0,
queue_type__mes_set_resources__hsa_interface_queue_hiq = 1,
queue_type__mes_set_resources__hsa_debug_interface_queue = 4
};
struct pm4_set_resources {
union {
union PM4_MES_TYPE_3_HEADER header; /* header */
uint32_t ordinal1;
};
union {
struct {
uint32_t vmid_mask:16;
uint32_t unmap_latency:8;
uint32_t reserved1:5;
enum set_resources_queue_type_enum queue_type:3;
} bitfields2;
uint32_t ordinal2;
};
uint32_t queue_mask_lo;
uint32_t queue_mask_hi;
uint32_t gws_mask_lo;
uint32_t gws_mask_hi;
union {
struct {
uint32_t oac_mask:16;
uint32_t reserved2:16;
} bitfields7;
uint32_t ordinal7;
};
union {
struct {
uint32_t gds_heap_base:6;
uint32_t reserved3:5;
uint32_t gds_heap_size:6;
uint32_t reserved4:15;
} bitfields8;
uint32_t ordinal8;
};
};
#endif
/*--------------------MES_RUN_LIST-------------------- */
#ifndef PM4_MES_RUN_LIST_DEFINED
#define PM4_MES_RUN_LIST_DEFINED
struct pm4_runlist {
union {
union PM4_MES_TYPE_3_HEADER header; /* header */
uint32_t ordinal1;
};
union {
struct {
uint32_t reserved1:2;
uint32_t ib_base_lo:30;
} bitfields2;
uint32_t ordinal2;
};
union {
struct {
uint32_t ib_base_hi:16;
uint32_t reserved2:16;
} bitfields3;
uint32_t ordinal3;
};
union {
struct {
uint32_t ib_size:20;
uint32_t chain:1;
uint32_t offload_polling:1;
uint32_t reserved3:1;
uint32_t valid:1;
uint32_t reserved4:8;
} bitfields4;
uint32_t ordinal4;
};
};
#endif
/*--------------------MES_MAP_PROCESS-------------------- */
#ifndef PM4_MES_MAP_PROCESS_DEFINED
#define PM4_MES_MAP_PROCESS_DEFINED
struct pm4_map_process {
union {
union PM4_MES_TYPE_3_HEADER header; /* header */
uint32_t ordinal1;
};
union {
struct {
uint32_t pasid:16;
uint32_t reserved1:8;
uint32_t diq_enable:1;
uint32_t process_quantum:7;
} bitfields2;
uint32_t ordinal2;
};
union {
struct {
uint32_t page_table_base:28;
uint32_t reserved3:4;
} bitfields3;
uint32_t ordinal3;
};
uint32_t sh_mem_bases;
uint32_t sh_mem_ape1_base;
uint32_t sh_mem_ape1_limit;
uint32_t sh_mem_config;
uint32_t gds_addr_lo;
uint32_t gds_addr_hi;
union {
struct {
uint32_t num_gws:6;
uint32_t reserved4:2;
uint32_t num_oac:4;
uint32_t reserved5:4;
uint32_t gds_size:6;
uint32_t num_queues:10;
} bitfields10;
uint32_t ordinal10;
};
};
#endif
/*--------------------MES_MAP_QUEUES--------------------*/
#ifndef PM4_MES_MAP_QUEUES_DEFINED
#define PM4_MES_MAP_QUEUES_DEFINED
enum map_queues_queue_sel_enum {
queue_sel__mes_map_queues__map_to_specified_queue_slots = 0,
queue_sel__mes_map_queues__map_to_hws_determined_queue_slots = 1,
queue_sel__mes_map_queues__enable_process_queues = 2
};
enum map_queues_vidmem_enum {
vidmem__mes_map_queues__uses_no_video_memory = 0,
vidmem__mes_map_queues__uses_video_memory = 1
};
enum map_queues_alloc_format_enum {
alloc_format__mes_map_queues__one_per_pipe = 0,
alloc_format__mes_map_queues__all_on_one_pipe = 1
};
enum map_queues_engine_sel_enum {
engine_sel__mes_map_queues__compute = 0,
engine_sel__mes_map_queues__sdma0 = 2,
engine_sel__mes_map_queues__sdma1 = 3
};
struct pm4_map_queues {
union {
union PM4_MES_TYPE_3_HEADER header; /* header */
uint32_t ordinal1;
};
union {
struct {
uint32_t reserved1:4;
enum map_queues_queue_sel_enum queue_sel:2;
uint32_t reserved2:2;
uint32_t vmid:4;
uint32_t reserved3:4;
enum map_queues_vidmem_enum vidmem:2;
uint32_t reserved4:6;
enum map_queues_alloc_format_enum alloc_format:2;
enum map_queues_engine_sel_enum engine_sel:3;
uint32_t num_queues:3;
} bitfields2;
uint32_t ordinal2;
};
struct {
union {
struct {
uint32_t reserved5:2;
uint32_t doorbell_offset:21;
uint32_t reserved6:3;
uint32_t queue:6;
} bitfields3;
uint32_t ordinal3;
};
uint32_t mqd_addr_lo;
uint32_t mqd_addr_hi;
uint32_t wptr_addr_lo;
uint32_t wptr_addr_hi;
} mes_map_queues_ordinals[1]; /* 1..N of these ordinal groups */
};
#endif
/*--------------------MES_QUERY_STATUS--------------------*/
#ifndef PM4_MES_QUERY_STATUS_DEFINED
#define PM4_MES_QUERY_STATUS_DEFINED
enum query_status_interrupt_sel_enum {
interrupt_sel__mes_query_status__completion_status = 0,
interrupt_sel__mes_query_status__process_status = 1,
interrupt_sel__mes_query_status__queue_status = 2
};
enum query_status_command_enum {
command__mes_query_status__interrupt_only = 0,
command__mes_query_status__fence_only_immediate = 1,
command__mes_query_status__fence_only_after_write_ack = 2,
command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3
};
enum query_status_engine_sel_enum {
engine_sel__mes_query_status__compute = 0,
engine_sel__mes_query_status__sdma0_queue = 2,
engine_sel__mes_query_status__sdma1_queue = 3
};
struct pm4_query_status {
union {
union PM4_MES_TYPE_3_HEADER header; /* header */
uint32_t ordinal1;
};
union {
struct {
uint32_t context_id:28;
enum query_status_interrupt_sel_enum interrupt_sel:2;
enum query_status_command_enum command:2;
} bitfields2;
uint32_t ordinal2;
};
union {
struct {
uint32_t pasid:16;
uint32_t reserved1:16;
} bitfields3a;
struct {
uint32_t reserved2:2;
uint32_t doorbell_offset:21;
uint32_t reserved3:3;
enum query_status_engine_sel_enum engine_sel:3;
uint32_t reserved4:3;
} bitfields3b;
uint32_t ordinal3;
};
uint32_t addr_lo;
uint32_t addr_hi;
uint32_t data_lo;
uint32_t data_hi;
};
#endif
/*--------------------MES_UNMAP_QUEUES--------------------*/
#ifndef PM4_MES_UNMAP_QUEUES_DEFINED
#define PM4_MES_UNMAP_QUEUES_DEFINED
enum unmap_queues_action_enum {
action__mes_unmap_queues__preempt_queues = 0,
action__mes_unmap_queues__reset_queues = 1,
action__mes_unmap_queues__disable_process_queues = 2
};
enum unmap_queues_queue_sel_enum {
queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0,
queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1,
queue_sel__mes_unmap_queues__perform_request_on_all_active_queues = 2
};
enum unmap_queues_engine_sel_enum {
engine_sel__mes_unmap_queues__compute = 0,
engine_sel__mes_unmap_queues__sdma0 = 2,
engine_sel__mes_unmap_queues__sdma1 = 3
};
struct pm4_unmap_queues {
union {
union PM4_MES_TYPE_3_HEADER header; /* header */
uint32_t ordinal1;
};
union {
struct {
enum unmap_queues_action_enum action:2;
uint32_t reserved1:2;
enum unmap_queues_queue_sel_enum queue_sel:2;
uint32_t reserved2:20;
enum unmap_queues_engine_sel_enum engine_sel:3;
uint32_t num_queues:3;
} bitfields2;
uint32_t ordinal2;
};
union {
struct {
uint32_t pasid:16;
uint32_t reserved3:16;
} bitfields3a;
struct {
uint32_t reserved4:2;
uint32_t doorbell_offset0:21;
uint32_t reserved5:9;
} bitfields3b;
uint32_t ordinal3;
};
union {
struct {
uint32_t reserved6:2;
uint32_t doorbell_offset1:21;
uint32_t reserved7:9;
} bitfields4;
uint32_t ordinal4;
};
union {
struct {
uint32_t reserved8:2;
uint32_t doorbell_offset2:21;
uint32_t reserved9:9;
} bitfields5;
uint32_t ordinal5;
};
union {
struct {
uint32_t reserved10:2;
uint32_t doorbell_offset3:21;
uint32_t reserved11:9;
} bitfields6;
uint32_t ordinal6;
};
};
#endif
enum {
CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014
};
#endif /* KFD_PM4_HEADERS_H_ */
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
......@@ -104,6 +104,7 @@ radeon-y += \
radeon_vce.o \
vce_v1_0.o \
vce_v2_0.o \
radeon_kfd.o
radeon-$(CONFIG_COMPAT) += radeon_ioc32.o
radeon-$(CONFIG_VGA_SWITCHEROO) += radeon_atpx_handler.o
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
......@@ -701,6 +701,10 @@ struct radeon_doorbell {
int radeon_doorbell_get(struct radeon_device *rdev, u32 *page);
void radeon_doorbell_free(struct radeon_device *rdev, u32 doorbell);
void radeon_doorbell_get_kfd_info(struct radeon_device *rdev,
phys_addr_t *aperture_base,
size_t *aperture_size,
size_t *start_offset);
/*
* IRQS.
......@@ -2393,6 +2397,8 @@ struct radeon_device {
struct radeon_atcs atcs;
/* srbm instance registers */
struct mutex srbm_mutex;
/* GRBM index mutex. Protects concurrents access to GRBM index */
struct mutex grbm_idx_mutex;
/* clock, powergating flags */
u32 cg_flags;
u32 pg_flags;
......@@ -2405,6 +2411,10 @@ struct radeon_device {
u64 vram_pin_size;
u64 gart_pin_size;
/* amdkfd interface */
struct kfd_dev *kfd;
struct radeon_sa_manager kfd_bo;
struct mutex mn_lock;
DECLARE_HASHTABLE(mn_hash, 7);
};
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册
新手
引导
客服 返回
顶部