diff options
author | Dave Airlie <airlied@redhat.com> | 2015-01-21 00:14:41 (GMT) |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2015-01-21 00:14:41 (GMT) |
commit | 4f4d89af78682f2ed1cf6745794804817f867bba (patch) | |
tree | df1c6c3225743d1cc9ab483ebe000af042246ad2 | |
parent | 426959c945538bace94ccc590d8630c02ecd6b0f (diff) | |
parent | 6bbcde9803a8e385d18c5a235c961e11a8e20601 (diff) | |
download | linux-4f4d89af78682f2ed1cf6745794804817f867bba.tar.xz |
Merge tag 'drm-amdkfd-next-2015-01-09' of git://people.freedesktop.org/~gabbayo/linux into drm-next
- Add support for SDMA usermode queues
- Replace logic of sub-allocating from GART buffer in amdkfd. Instead
of using radeon_sa module, use a new module that is more suited for
this purpose
- Add the number of watch points to amdkfd topology
- Split a function that did two things into two seperate functions.
* tag 'drm-amdkfd-next-2015-01-09' of git://people.freedesktop.org/~gabbayo/linux:
drm/amd: Remove old radeon_sa funcs from kfd-->kgd interface
drm/radeon: Remove old radeon_sa usage from kfd-->kgd interface
drm/amdkfd: Using new gtt sa in amdkfd
drm/amdkfd: Allocate gart memory using new interface
drm/amdkfd: Fixed calculation of gart buffer size
drm/amdkfd: Add kfd gtt sub-allocator functions
drm/amdkfd: Add gtt sa related data to kfd_dev struct
drm/radeon: Impl. new gtt allocate/free functions
drm/amd: Add new kfd-->kgd interface for gart usage
drm/radeon: Enable sdma preemption
drm/amdkfd: Pass queue type to pqm_create_queue()
drm/amdkfd: Identify SDMA queue in create queue ioctl
drm/amdkfd: Add SDMA user-mode queues support to QCM
drm/amdkfd: Add SDMA mqd support
drm/radeon: Implement SDMA interface functions
drm/amd: Add SDMA functions to kfd-->kgd interface
drm/amdkfd: Process-device data creation and lookup split
drm/amdkfd: Add number of watch points to topology
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 6 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_device.c | 225 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 183 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 5 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 4 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 7 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 41 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 136 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 10 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 42 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_process.c | 40 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 16 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 155 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 45 | ||||
-rw-r--r-- | drivers/gpu/drm/radeon/cik_reg.h | 169 | ||||
-rw-r--r-- | drivers/gpu/drm/radeon/cik_sdma.c | 29 | ||||
-rw-r--r-- | drivers/gpu/drm/radeon/radeon_kfd.c | 243 |
17 files changed, 1078 insertions, 278 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 7d4974b..4c0b1e4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -192,6 +192,8 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE || args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) q_properties->type = KFD_QUEUE_TYPE_COMPUTE; + else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA) + q_properties->type = KFD_QUEUE_TYPE_SDMA; else return -ENOTSUPP; @@ -258,8 +260,8 @@ static long kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, p->pasid, dev->id); - err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, 0, - KFD_QUEUE_TYPE_COMPUTE, &queue_id); + err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, + 0, q_properties.type, &queue_id); if (err != 0) goto err_create_queue; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 43884eb..994a9c1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -26,12 +26,14 @@ #include <linux/slab.h> #include "kfd_priv.h" #include "kfd_device_queue_manager.h" +#include "kfd_pm4_headers.h" #define MQD_SIZE_ALIGNED 768 static const struct kfd_device_info kaveri_device_info = { .max_pasid_bits = 16, .ih_ring_entry_size = 4 * sizeof(uint32_t), + .num_of_watch_points = 4, .mqd_size_aligned = MQD_SIZE_ALIGNED }; @@ -66,6 +68,10 @@ static const struct kfd_deviceid supported_devices[] = { { 0x131D, &kaveri_device_info }, /* Kaveri */ }; +static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, + unsigned int chunk_size); +static void kfd_gtt_sa_fini(struct kfd_dev *kfd); + static const struct kfd_device_info *lookup_device_info(unsigned short did) { size_t i; @@ -173,16 +179,39 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, max_num_of_queues_per_process * kfd->device_info->mqd_size_aligned; - /* add another 512KB for all other allocations on gart */ + /* + * calculate max size of runlist packet. + * There can be only 2 packets at once + */ + size += (max_num_of_processes * sizeof(struct pm4_map_process) + + max_num_of_processes * max_num_of_queues_per_process * + sizeof(struct pm4_map_queues) + sizeof(struct pm4_runlist)) * 2; + + /* Add size of HIQ & DIQ */ + size += KFD_KERNEL_QUEUE_SIZE * 2; + + /* add another 512KB for all other allocations on gart (HPD, fences) */ size += 512 * 1024; - if (kfd2kgd->init_sa_manager(kfd->kgd, size)) { + if (kfd2kgd->init_gtt_mem_allocation(kfd->kgd, size, &kfd->gtt_mem, + &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr)) { dev_err(kfd_device, - "Error initializing sa manager for device (%x:%x)\n", - kfd->pdev->vendor, kfd->pdev->device); + "Could not allocate %d bytes for device (%x:%x)\n", + size, kfd->pdev->vendor, kfd->pdev->device); goto out; } + dev_info(kfd_device, + "Allocated %d bytes on gart for device(%x:%x)\n", + size, kfd->pdev->vendor, kfd->pdev->device); + + /* Initialize GTT sa with 512 byte chunk size */ + if (kfd_gtt_sa_init(kfd, size, 512) != 0) { + dev_err(kfd_device, + "Error initializing gtt sub-allocator\n"); + goto kfd_gtt_sa_init_error; + } + kfd_doorbell_init(kfd); if (kfd_topology_add_device(kfd) != 0) { @@ -241,7 +270,9 @@ device_iommu_pasid_error: kfd_interrupt_error: kfd_topology_remove_device(kfd); kfd_topology_add_device_error: - kfd2kgd->fini_sa_manager(kfd->kgd); + kfd_gtt_sa_fini(kfd); +kfd_gtt_sa_init_error: + kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem); dev_err(kfd_device, "device (%x:%x) NOT added due to errors\n", kfd->pdev->vendor, kfd->pdev->device); @@ -256,6 +287,8 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) amd_iommu_free_device(kfd->pdev); kfd_interrupt_exit(kfd); kfd_topology_remove_device(kfd); + kfd_gtt_sa_fini(kfd); + kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem); } kfree(kfd); @@ -306,3 +339,185 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) spin_unlock(&kfd->interrupt_lock); } } + +static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, + unsigned int chunk_size) +{ + unsigned int num_of_bits; + + BUG_ON(!kfd); + BUG_ON(!kfd->gtt_mem); + BUG_ON(buf_size < chunk_size); + BUG_ON(buf_size == 0); + BUG_ON(chunk_size == 0); + + kfd->gtt_sa_chunk_size = chunk_size; + kfd->gtt_sa_num_of_chunks = buf_size / chunk_size; + + num_of_bits = kfd->gtt_sa_num_of_chunks / BITS_PER_BYTE; + BUG_ON(num_of_bits == 0); + + kfd->gtt_sa_bitmap = kzalloc(num_of_bits, GFP_KERNEL); + + if (!kfd->gtt_sa_bitmap) + return -ENOMEM; + + pr_debug("kfd: gtt_sa_num_of_chunks = %d, gtt_sa_bitmap = %p\n", + kfd->gtt_sa_num_of_chunks, kfd->gtt_sa_bitmap); + + mutex_init(&kfd->gtt_sa_lock); + + return 0; + +} + +static void kfd_gtt_sa_fini(struct kfd_dev *kfd) +{ + mutex_destroy(&kfd->gtt_sa_lock); + kfree(kfd->gtt_sa_bitmap); +} + +static inline uint64_t kfd_gtt_sa_calc_gpu_addr(uint64_t start_addr, + unsigned int bit_num, + unsigned int chunk_size) +{ + return start_addr + bit_num * chunk_size; +} + +static inline uint32_t *kfd_gtt_sa_calc_cpu_addr(void *start_addr, + unsigned int bit_num, + unsigned int chunk_size) +{ + return (uint32_t *) ((uint64_t) start_addr + bit_num * chunk_size); +} + +int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, + struct kfd_mem_obj **mem_obj) +{ + unsigned int found, start_search, cur_size; + + BUG_ON(!kfd); + + if (size == 0) + return -EINVAL; + + if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) + return -ENOMEM; + + *mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + if ((*mem_obj) == NULL) + return -ENOMEM; + + pr_debug("kfd: allocated mem_obj = %p for size = %d\n", *mem_obj, size); + + start_search = 0; + + mutex_lock(&kfd->gtt_sa_lock); + +kfd_gtt_restart_search: + /* Find the first chunk that is free */ + found = find_next_zero_bit(kfd->gtt_sa_bitmap, + kfd->gtt_sa_num_of_chunks, + start_search); + + pr_debug("kfd: found = %d\n", found); + + /* If there wasn't any free chunk, bail out */ + if (found == kfd->gtt_sa_num_of_chunks) + goto kfd_gtt_no_free_chunk; + + /* Update fields of mem_obj */ + (*mem_obj)->range_start = found; + (*mem_obj)->range_end = found; + (*mem_obj)->gpu_addr = kfd_gtt_sa_calc_gpu_addr( + kfd->gtt_start_gpu_addr, + found, + kfd->gtt_sa_chunk_size); + (*mem_obj)->cpu_ptr = kfd_gtt_sa_calc_cpu_addr( + kfd->gtt_start_cpu_ptr, + found, + kfd->gtt_sa_chunk_size); + + pr_debug("kfd: gpu_addr = %p, cpu_addr = %p\n", + (uint64_t *) (*mem_obj)->gpu_addr, (*mem_obj)->cpu_ptr); + + /* If we need only one chunk, mark it as allocated and get out */ + if (size <= kfd->gtt_sa_chunk_size) { + pr_debug("kfd: single bit\n"); + set_bit(found, kfd->gtt_sa_bitmap); + goto kfd_gtt_out; + } + + /* Otherwise, try to see if we have enough contiguous chunks */ + cur_size = size - kfd->gtt_sa_chunk_size; + do { + (*mem_obj)->range_end = + find_next_zero_bit(kfd->gtt_sa_bitmap, + kfd->gtt_sa_num_of_chunks, ++found); + /* + * If next free chunk is not contiguous than we need to + * restart our search from the last free chunk we found (which + * wasn't contiguous to the previous ones + */ + if ((*mem_obj)->range_end != found) { + start_search = found; + goto kfd_gtt_restart_search; + } + + /* + * If we reached end of buffer, bail out with error + */ + if (found == kfd->gtt_sa_num_of_chunks) + goto kfd_gtt_no_free_chunk; + + /* Check if we don't need another chunk */ + if (cur_size <= kfd->gtt_sa_chunk_size) + cur_size = 0; + else + cur_size -= kfd->gtt_sa_chunk_size; + + } while (cur_size > 0); + + pr_debug("kfd: range_start = %d, range_end = %d\n", + (*mem_obj)->range_start, (*mem_obj)->range_end); + + /* Mark the chunks as allocated */ + for (found = (*mem_obj)->range_start; + found <= (*mem_obj)->range_end; + found++) + set_bit(found, kfd->gtt_sa_bitmap); + +kfd_gtt_out: + mutex_unlock(&kfd->gtt_sa_lock); + return 0; + +kfd_gtt_no_free_chunk: + pr_debug("kfd: allocation failed with mem_obj = %p\n", mem_obj); + mutex_unlock(&kfd->gtt_sa_lock); + kfree(mem_obj); + return -ENOMEM; +} + +int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj) +{ + unsigned int bit; + + BUG_ON(!kfd); + BUG_ON(!mem_obj); + + pr_debug("kfd: free mem_obj = %p, range_start = %d, range_end = %d\n", + mem_obj, mem_obj->range_start, mem_obj->range_end); + + mutex_lock(&kfd->gtt_sa_lock); + + /* Mark the chunks as free */ + for (bit = mem_obj->range_start; + bit <= mem_obj->range_end; + bit++) + clear_bit(bit, kfd->gtt_sa_bitmap); + + mutex_unlock(&kfd->gtt_sa_lock); + + kfree(mem_obj); + return 0; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 924e90c..6806e64 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -46,9 +46,24 @@ static int set_pasid_vmid_mapping(struct device_queue_manager *dqm, static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); + static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock); static int destroy_queues_cpsch(struct device_queue_manager *dqm, bool lock); +static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd); + +static void deallocate_sdma_queue(struct device_queue_manager *dqm, + unsigned int sdma_queue_id); + +static inline +enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) +{ + if (type == KFD_QUEUE_TYPE_SDMA) + return KFD_MQD_TYPE_CIK_SDMA; + return KFD_MQD_TYPE_CIK_CP; +} static inline unsigned int get_pipes_num(struct device_queue_manager *dqm) { @@ -75,7 +90,6 @@ get_sh_mem_bases_nybble_64(struct kfd_process_device *pdd) nybble = (pdd->lds_base >> 60) & 0x0E; return nybble; - } static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) @@ -190,7 +204,10 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, *allocated_vmid = qpd->vmid; q->properties.vmid = qpd->vmid; - retval = create_compute_queue_nocpsch(dqm, q, qpd); + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) + retval = create_compute_queue_nocpsch(dqm, q, qpd); + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + retval = create_sdma_queue_nocpsch(dqm, q, qpd); if (retval != 0) { if (list_empty(&qpd->queues_list)) { @@ -203,7 +220,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, list_add(&q->list, &qpd->queues_list); dqm->queue_count++; - + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + dqm->sdma_queue_count++; mutex_unlock(&dqm->lock); return 0; } @@ -280,8 +298,7 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q) { int retval; - struct mqd_manager *mqd; - + struct mqd_manager *mqd, *mqd_sdma; BUG_ON(!dqm || !q || !q->mqd || !qpd); retval = 0; @@ -295,6 +312,12 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm, goto out; } + mqd_sdma = dqm->get_mqd_manager(dqm, KFD_MQD_TYPE_CIK_SDMA); + if (mqd_sdma == NULL) { + mutex_unlock(&dqm->lock); + return -ENOMEM; + } + retval = mqd->destroy_mqd(mqd, q->mqd, KFD_PREEMPT_TYPE_WAVEFRONT, QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, @@ -303,7 +326,12 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm, if (retval != 0) goto out; - deallocate_hqd(dqm, q); + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) + deallocate_hqd(dqm, q); + else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + dqm->sdma_queue_count--; + deallocate_sdma_queue(dqm, q->sdma_id); + } mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); @@ -324,7 +352,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) BUG_ON(!dqm || !q || !q->mqd); mutex_lock(&dqm->lock); - mqd = dqm->get_mqd_manager(dqm, KFD_MQD_TYPE_CIK_COMPUTE); + mqd = dqm->get_mqd_manager(dqm, q->properties.type); if (mqd == NULL) { mutex_unlock(&dqm->lock); return -ENOMEM; @@ -491,11 +519,8 @@ static int init_pipelines(struct device_queue_manager *dqm, * because it contains no data when there are no active queues. */ - err = kfd2kgd->allocate_mem(dqm->dev->kgd, - CIK_HPD_EOP_BYTES * pipes_num, - PAGE_SIZE, - KFD_MEMPOOL_SYSTEM_WRITECOMBINE, - (struct kgd_mem **) &dqm->pipeline_mem); + err = kfd_gtt_sa_allocate(dqm->dev, CIK_HPD_EOP_BYTES * pipes_num, + &dqm->pipeline_mem); if (err) { pr_err("kfd: error allocate vidmem num pipes: %d\n", @@ -510,8 +535,7 @@ static int init_pipelines(struct device_queue_manager *dqm, mqd = dqm->get_mqd_manager(dqm, KFD_MQD_TYPE_CIK_COMPUTE); if (mqd == NULL) { - kfd2kgd->free_mem(dqm->dev->kgd, - (struct kgd_mem *) dqm->pipeline_mem); + kfd_gtt_sa_free(dqm->dev, dqm->pipeline_mem); return -ENOMEM; } @@ -527,7 +551,6 @@ static int init_pipelines(struct device_queue_manager *dqm, return 0; } - static int init_scheduler(struct device_queue_manager *dqm) { int retval; @@ -557,6 +580,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) mutex_init(&dqm->lock); INIT_LIST_HEAD(&dqm->queues); dqm->queue_count = dqm->next_pipe_to_allocate = 0; + dqm->sdma_queue_count = 0; dqm->allocated_queues = kcalloc(get_pipes_num(dqm), sizeof(unsigned int), GFP_KERNEL); if (!dqm->allocated_queues) { @@ -568,6 +592,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) dqm->allocated_queues[i] = (1 << QUEUES_PER_PIPE) - 1; dqm->vmid_bitmap = (1 << VMID_PER_DEVICE) - 1; + dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; init_scheduler(dqm); return 0; @@ -585,8 +610,7 @@ static void uninitialize_nocpsch(struct device_queue_manager *dqm) for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++) kfree(dqm->mqds[i]); mutex_destroy(&dqm->lock); - kfd2kgd->free_mem(dqm->dev->kgd, - (struct kgd_mem *) dqm->pipeline_mem); + kfd_gtt_sa_free(dqm->dev, dqm->pipeline_mem); } static int start_nocpsch(struct device_queue_manager *dqm) @@ -599,6 +623,77 @@ static int stop_nocpsch(struct device_queue_manager *dqm) return 0; } +static int allocate_sdma_queue(struct device_queue_manager *dqm, + unsigned int *sdma_queue_id) +{ + int bit; + + if (dqm->sdma_bitmap == 0) + return -ENOMEM; + + bit = find_first_bit((unsigned long *)&dqm->sdma_bitmap, + CIK_SDMA_QUEUES); + + clear_bit(bit, (unsigned long *)&dqm->sdma_bitmap); + *sdma_queue_id = bit; + + return 0; +} + +static void deallocate_sdma_queue(struct device_queue_manager *dqm, + unsigned int sdma_queue_id) +{ + if (sdma_queue_id < 0 || sdma_queue_id >= CIK_SDMA_QUEUES) + return; + set_bit(sdma_queue_id, (unsigned long *)&dqm->sdma_bitmap); +} + +static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + uint32_t value = SDMA_ATC; + + if (q->process->is_32bit_user_mode) + value |= SDMA_VA_PTR32 | get_sh_mem_bases_32(qpd_to_pdd(qpd)); + else + value |= SDMA_VA_SHARED_BASE(get_sh_mem_bases_nybble_64( + qpd_to_pdd(qpd))); + q->properties.sdma_vm_addr = value; +} + +static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd) +{ + struct mqd_manager *mqd; + int retval; + + mqd = dqm->get_mqd_manager(dqm, KFD_MQD_TYPE_CIK_SDMA); + if (!mqd) + return -ENOMEM; + + retval = allocate_sdma_queue(dqm, &q->sdma_id); + if (retval != 0) + return retval; + + q->properties.sdma_queue_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; + q->properties.sdma_engine_id = q->sdma_id / CIK_SDMA_ENGINE_NUM; + + pr_debug("kfd: sdma id is: %d\n", q->sdma_id); + pr_debug(" sdma queue id: %d\n", q->properties.sdma_queue_id); + pr_debug(" sdma engine id: %d\n", q->properties.sdma_engine_id); + + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval != 0) { + deallocate_sdma_queue(dqm, q->sdma_id); + return retval; + } + + init_sdma_vm(dqm, q, qpd); + return 0; +} + /* * Device Queue Manager implementation for cp scheduler */ @@ -640,6 +735,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm) mutex_init(&dqm->lock); INIT_LIST_HEAD(&dqm->queues); dqm->queue_count = dqm->processes_count = 0; + dqm->sdma_queue_count = 0; dqm->active_runlist = false; retval = init_pipelines(dqm, get_pipes_num(dqm), 0); if (retval != 0) @@ -672,18 +768,14 @@ static int start_cpsch(struct device_queue_manager *dqm) pr_debug("kfd: allocating fence memory\n"); /* allocate fence memory on the gart */ - retval = kfd2kgd->allocate_mem(dqm->dev->kgd, - sizeof(*dqm->fence_addr), - 32, - KFD_MEMPOOL_SYSTEM_WRITECOMBINE, - (struct kgd_mem **) &dqm->fence_mem); + retval = kfd_gtt_sa_allocate(dqm->dev, sizeof(*dqm->fence_addr), + &dqm->fence_mem); if (retval != 0) goto fail_allocate_vidmem; dqm->fence_addr = dqm->fence_mem->cpu_ptr; dqm->fence_gpu_addr = dqm->fence_mem->gpu_addr; - list_for_each_entry(node, &dqm->queues, list) if (node->qpd->pqm->process && dqm->dev) kfd_bind_process_to_device(dqm->dev, @@ -712,8 +804,7 @@ static int stop_cpsch(struct device_queue_manager *dqm) pdd = qpd_to_pdd(node->qpd); pdd->bound = false; } - kfd2kgd->free_mem(dqm->dev->kgd, - (struct kgd_mem *) dqm->fence_mem); + kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); pm_uninit(&dqm->packets); return 0; @@ -754,6 +845,14 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, mutex_unlock(&dqm->lock); } +static void select_sdma_engine_id(struct queue *q) +{ + static int sdma_id; + + q->sdma_id = sdma_id; + sdma_id = (sdma_id + 1) % 2; +} + static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd, int *allocate_vmid) { @@ -769,7 +868,12 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, mutex_lock(&dqm->lock); - mqd = dqm->get_mqd_manager(dqm, KFD_MQD_TYPE_CIK_CP); + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + select_sdma_engine_id(q); + + mqd = dqm->get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (mqd == NULL) { mutex_unlock(&dqm->lock); return -ENOMEM; @@ -786,6 +890,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, retval = execute_queues_cpsch(dqm, false); } + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + dqm->sdma_queue_count++; + out: mutex_unlock(&dqm->lock); return retval; @@ -809,6 +916,14 @@ static int fence_wait_timeout(unsigned int *fence_addr, return 0; } +static int destroy_sdma_queues(struct device_queue_manager *dqm, + unsigned int sdma_engine) +{ + return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA, + KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES, 0, false, + sdma_engine); +} + static int destroy_queues_cpsch(struct device_queue_manager *dqm, bool lock) { int retval; @@ -821,6 +936,15 @@ static int destroy_queues_cpsch(struct device_queue_manager *dqm, bool lock) mutex_lock(&dqm->lock); if (dqm->active_runlist == false) goto out; + + pr_debug("kfd: Before destroying queues, sdma queue count is : %u\n", + dqm->sdma_queue_count); + + if (dqm->sdma_queue_count > 0) { + destroy_sdma_queues(dqm, 0); + destroy_sdma_queues(dqm, 1); + } + retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE, KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES, 0, false, 0); if (retval != 0) @@ -892,13 +1016,16 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, /* remove queue from list to prevent rescheduling after preemption */ mutex_lock(&dqm->lock); - - mqd = dqm->get_mqd_manager(dqm, KFD_MQD_TYPE_CIK_CP); + mqd = dqm->get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); if (!mqd) { retval = -ENOMEM; goto failed; } + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + dqm->sdma_queue_count--; + list_del(&q->list); dqm->queue_count--; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index c3f189e8..554c06e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -36,6 +36,9 @@ #define KFD_VMID_START_OFFSET (8) #define VMID_PER_DEVICE CIK_VMID_NUM #define KFD_DQM_FIRST_PIPE (0) +#define CIK_SDMA_QUEUES (4) +#define CIK_SDMA_QUEUES_PER_ENGINE (2) +#define CIK_SDMA_ENGINE_NUM (2) struct device_process_node { struct qcm_process_device *qpd; @@ -130,8 +133,10 @@ struct device_queue_manager { struct list_head queues; unsigned int processes_count; unsigned int queue_count; + unsigned int sdma_queue_count; unsigned int next_pipe_to_allocate; unsigned int *allocated_queues; + unsigned int sdma_bitmap; unsigned int vmid_bitmap; uint64_t pipelines_addr; struct kfd_mem_obj *pipeline_mem; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c index b5791a5..1a9b355 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -137,10 +137,6 @@ int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) if (dev == NULL) return -EINVAL; - /* Find if pdd exists for combination of process and gpu id */ - if (!kfd_get_process_device_data(dev, process, 0)) - return -EINVAL; - /* Calculate physical address of doorbell */ address = kfd_get_process_doorbells(dev, process); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index e64aa99..35b9875 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -303,10 +303,11 @@ int kfd_init_apertures(struct kfd_process *process) while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL && id < NUM_OF_SUPPORTED_GPUS) { - pdd = kfd_get_process_device_data(dev, process, 1); - if (!pdd) + pdd = kfd_create_process_device_data(dev, process); + if (pdd == NULL) { + pr_err("Failed to create process device data\n"); return -1; - + } /* * For 64 bit process aperture will be statically reserved in * the x86_64 non canonical process address space diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 9350714..0fd8bb7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -72,11 +72,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, if (prop.doorbell_ptr == NULL) goto err_get_kernel_doorbell; - retval = kfd2kgd->allocate_mem(dev->kgd, - queue_size, - PAGE_SIZE, - KFD_MEMPOOL_SYSTEM_WRITECOMBINE, - (struct kgd_mem **) &kq->pq); + retval = kfd_gtt_sa_allocate(dev, queue_size, &kq->pq); if (retval != 0) goto err_pq_allocate_vidmem; @@ -84,11 +80,8 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, kq->pq_kernel_addr = kq->pq->cpu_ptr; kq->pq_gpu_addr = kq->pq->gpu_addr; - retval = kfd2kgd->allocate_mem(dev->kgd, - sizeof(*kq->rptr_kernel), - 32, - KFD_MEMPOOL_SYSTEM_WRITECOMBINE, - (struct kgd_mem **) &kq->rptr_mem); + retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->rptr_kernel), + &kq->rptr_mem); if (retval != 0) goto err_rptr_allocate_vidmem; @@ -96,11 +89,8 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, kq->rptr_kernel = kq->rptr_mem->cpu_ptr; kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr; - retval = kfd2kgd->allocate_mem(dev->kgd, - sizeof(*kq->wptr_kernel), - 32, - KFD_MEMPOOL_SYSTEM_WRITECOMBINE, - (struct kgd_mem **) &kq->wptr_mem); + retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->wptr_kernel), + &kq->wptr_mem); if (retval != 0) goto err_wptr_allocate_vidmem; @@ -145,11 +135,8 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, } else { /* allocate fence for DIQ */ - retval = kfd2kgd->allocate_mem(dev->kgd, - sizeof(uint32_t), - 32, - KFD_MEMPOOL_SYSTEM_WRITECOMBINE, - (struct kgd_mem **) &kq->fence_mem_obj); + retval = kfd_gtt_sa_allocate(dev, sizeof(uint32_t), + &kq->fence_mem_obj); if (retval != 0) goto err_alloc_fence; @@ -165,11 +152,11 @@ err_alloc_fence: err_init_mqd: uninit_queue(kq->queue); err_init_queue: - kfd2kgd->free_mem(dev->kgd, (struct kgd_mem *) kq->wptr_mem); + kfd_gtt_sa_free(dev, kq->wptr_mem); err_wptr_allocate_vidmem: - kfd2kgd->free_mem(dev->kgd, (struct kgd_mem *) kq->rptr_mem); + kfd_gtt_sa_free(dev, kq->rptr_mem); err_rptr_allocate_vidmem: - kfd2kgd->free_mem(dev->kgd, (struct kgd_mem *) kq->pq); + kfd_gtt_sa_free(dev, kq->pq); err_pq_allocate_vidmem: pr_err("kfd: error init pq\n"); kfd_release_kernel_doorbell(dev, prop.doorbell_ptr); @@ -190,10 +177,12 @@ static void uninitialize(struct kernel_queue *kq) QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, kq->queue->pipe, kq->queue->queue); + else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ) + kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj); - kfd2kgd->free_mem(kq->dev->kgd, (struct kgd_mem *) kq->rptr_mem); - kfd2kgd->free_mem(kq->dev->kgd, (struct kgd_mem *) kq->wptr_mem); - kfd2kgd->free_mem(kq->dev->kgd, (struct kgd_mem *) kq->pq); + kfd_gtt_sa_free(kq->dev, kq->rptr_mem); + kfd_gtt_sa_free(kq->dev, kq->wptr_mem); + kfd_gtt_sa_free(kq->dev, kq->pq); kfd_release_kernel_doorbell(kq->dev, kq->queue->properties.doorbell_ptr); uninit_queue(kq->queue); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index adc3147..678c33f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -26,6 +26,7 @@ #include "kfd_priv.h" #include "kfd_mqd_manager.h" #include "cik_regs.h" +#include "../../radeon/cikd.h" #include "../../radeon/cik_reg.h" inline void busy_wait(unsigned long ms) @@ -51,11 +52,8 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, pr_debug("kfd: In func %s\n", __func__); - retval = kfd2kgd->allocate_mem(mm->dev->kgd, - sizeof(struct cik_mqd), - 256, - KFD_MEMPOOL_SYSTEM_WRITECOMBINE, - (struct kgd_mem **) mqd_mem_obj); + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd), + mqd_mem_obj); if (retval != 0) return -ENOMEM; @@ -111,18 +109,60 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, return retval; } +static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + struct cik_sdma_rlc_registers *m; + + BUG_ON(!mm || !mqd || !mqd_mem_obj); + + retval = kfd_gtt_sa_allocate(mm->dev, + sizeof(struct cik_sdma_rlc_registers), + mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct cik_sdma_rlc_registers *) (*mqd_mem_obj)->cpu_ptr; + + memset(m, 0, sizeof(struct cik_sdma_rlc_registers)); + + *mqd = m; + if (gart_addr != NULL) + *gart_addr = (*mqd_mem_obj)->gpu_addr; + + retval = mm->update_mqd(mm, m, q); + + return retval; +} + static void uninit_mqd(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj) { BUG_ON(!mm || !mqd); - kfd2kgd->free_mem(mm->dev->kgd, (struct kgd_mem *) mqd_mem_obj); + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + BUG_ON(!mm || !mqd); + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); } static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, uint32_t queue_id, uint32_t __user *wptr) { return kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, wptr); +} +static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t __user *wptr) +{ + return kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd); } static int update_mqd(struct mqd_manager *mm, void *mqd, @@ -170,6 +210,41 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, return 0; } +static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct cik_sdma_rlc_registers *m; + + BUG_ON(!mm || !mqd || !q); + + m = get_sdma_mqd(mqd); + m->sdma_rlc_rb_cntl = + SDMA_RB_SIZE((ffs(q->queue_size / sizeof(unsigned int)))) | + SDMA_RB_VMID(q->vmid) | + SDMA_RPTR_WRITEBACK_ENABLE | + SDMA_RPTR_WRITEBACK_TIMER(6); + + m->sdma_rlc_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdma_rlc_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdma_rlc_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdma_rlc_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdma_rlc_doorbell = SDMA_OFFSET(q->doorbell_off) | SDMA_DB_ENABLE; + m->sdma_rlc_virtual_addr = q->sdma_vm_addr; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0) { + m->sdma_rlc_rb_cntl |= SDMA_RB_ENABLE; + q->is_active = true; + } + + return 0; +} + static int destroy_mqd(struct mqd_manager *mm, void *mqd, enum kfd_preempt_type type, unsigned int timeout, uint32_t pipe_id, @@ -179,6 +254,18 @@ static int destroy_mqd(struct mqd_manager *mm, void *mqd, pipe_id, queue_id); } +/* + * preempt type here is ignored because there is only one way + * to preempt sdma queue + */ +static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +} + static bool is_occupied(struct mqd_manager *mm, void *mqd, uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id) @@ -189,6 +276,13 @@ static bool is_occupied(struct mqd_manager *mm, void *mqd, } +static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +} + /* * HIQ MQD Implementation, concrete implementation for HIQ MQD implementation. * The HIQ queue in Kaveri is using the same MQD structure as all the user mode @@ -207,11 +301,8 @@ static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, pr_debug("kfd: In func %s\n", __func__); - retval = kfd2kgd->allocate_mem(mm->dev->kgd, - sizeof(struct cik_mqd), - 256, - KFD_MEMPOOL_SYSTEM_WRITECOMBINE, - (struct kgd_mem **) mqd_mem_obj); + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd), + mqd_mem_obj); if (retval != 0) return -ENOMEM; @@ -301,6 +392,21 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, return 0; } +/* + * SDMA MQD Implementation + */ + +struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) +{ + struct cik_sdma_rlc_registers *m; + + BUG_ON(!mqd); + + m = (struct cik_sdma_rlc_registers *)mqd; + + return m; +} + struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, struct kfd_dev *dev) { @@ -335,6 +441,14 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; break; + case KFD_MQD_TYPE_CIK_SDMA: + mqd->init_mqd = init_mqd_sdma; + mqd->uninit_mqd = uninit_mqd_sdma; + mqd->load_mqd = load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = destroy_mqd_sdma; + mqd->is_occupied = is_occupied_sdma; + break; default: kfree(mqd); return NULL; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index 5ce9233..3cda952 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -97,11 +97,8 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); - retval = kfd2kgd->allocate_mem(pm->dqm->dev->kgd, - *rl_buffer_size, - PAGE_SIZE, - KFD_MEMPOOL_SYSTEM_WRITECOMBINE, - (struct kgd_mem **) &pm->ib_buffer_obj); + retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, + &pm->ib_buffer_obj); if (retval != 0) { pr_err("kfd: failed to allocate runlist IB\n"); @@ -557,8 +554,7 @@ void pm_release_ib(struct packet_manager *pm) mutex_lock(&pm->lock); if (pm->allocated) { - kfd2kgd->free_mem(pm->dqm->dev->kgd, - (struct kgd_mem *) pm->ib_buffer_obj); + kfd_gtt_sa_free(pm->dqm->dev, pm->ib_buffer_obj); pm->allocated = false; } mutex_unlock(&pm->lock); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index f9fb81e3..a79c217 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -107,9 +107,17 @@ enum cache_policy { struct kfd_device_info { unsigned int max_pasid_bits; size_t ih_ring_entry_size; + uint8_t num_of_watch_points; uint16_t mqd_size_aligned; }; +struct kfd_mem_obj { + uint32_t range_start; + uint32_t range_end; + uint64_t gpu_addr; + uint32_t *cpu_ptr; +}; + struct kfd_dev { struct kgd_dev *kgd; @@ -135,6 +143,14 @@ struct kfd_dev { struct kgd2kfd_shared_resources shared_resources; + void *gtt_mem; + uint64_t gtt_start_gpu_addr; + void *gtt_start_cpu_ptr; + void *gtt_sa_bitmap; + struct mutex gtt_sa_lock; + unsigned int gtt_sa_chunk_size; + unsigned int gtt_sa_num_of_chunks; + void *interrupt_ring; size_t interrupt_ring_size; atomic_t interrupt_ring_rptr; @@ -162,12 +178,6 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd); extern const struct kfd2kgd_calls *kfd2kgd; -struct kfd_mem_obj { - void *bo; - uint64_t gpu_addr; - uint32_t *cpu_ptr; -}; - enum kfd_mempool { KFD_MEMPOOL_SYSTEM_CACHEABLE = 1, KFD_MEMPOOL_SYSTEM_WRITECOMBINE = 2, @@ -285,6 +295,10 @@ struct queue_properties { bool is_active; /* Not relevant for user mode queues in cp scheduling */ unsigned int vmid; + /* Relevant only for sdma queues*/ + uint32_t sdma_engine_id; + uint32_t sdma_queue_id; + uint32_t sdma_vm_addr; }; /** @@ -327,6 +341,8 @@ struct queue { uint32_t pipe; uint32_t queue; + unsigned int sdma_id; + struct kfd_process *process; struct kfd_dev *device; }; @@ -472,8 +488,9 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, struct kfd_process *p); void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid); struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, - struct kfd_process *p, - int create_pdd); + struct kfd_process *p); +struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + struct kfd_process *p); /* Process device data iterator */ struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p); @@ -501,6 +518,13 @@ unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, struct kfd_process *process, unsigned int queue_id); +/* GTT Sub-Allocator */ + +int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, + struct kfd_mem_obj **mem_obj); + +int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj); + extern struct device *kfd_device; /* Topology */ @@ -528,6 +552,8 @@ int kfd_init_apertures(struct kfd_process *process); /* Queue Context Management */ inline uint32_t lower_32(uint64_t x); inline uint32_t upper_32(uint64_t x); +struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd); +inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m); int init_queue(struct queue **q, struct queue_properties properties); void uninit_queue(struct queue *q); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 3c76ef0..a369c14 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -311,24 +311,29 @@ err_alloc_process: } struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, - struct kfd_process *p, - int create_pdd) + struct kfd_process *p) { struct kfd_process_device *pdd = NULL; list_for_each_entry(pdd, &p->per_device_data, per_device_list) if (pdd->dev == dev) - return pdd; - - if (create_pdd) { - pdd = kzalloc(sizeof(*pdd), GFP_KERNEL); - if (pdd != NULL) { - pdd->dev = dev; - INIT_LIST_HEAD(&pdd->qpd.queues_list); - INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); - pdd->qpd.dqm = dev->dqm; - list_add(&pdd->per_device_list, &p->per_device_data); - } + break; + + return pdd; +} + +struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + struct kfd_process *p) +{ + struct kfd_process_device *pdd = NULL; + + pdd = kzalloc(sizeof(*pdd), GFP_KERNEL); + if (pdd != NULL) { + pdd->dev = dev; + INIT_LIST_HEAD(&pdd->qpd.queues_list); + INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); + pdd->qpd.dqm = dev->dqm; + list_add(&pdd->per_device_list, &p->per_device_data); } return pdd; @@ -344,11 +349,14 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, struct kfd_process *p) { - struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p, 1); + struct kfd_process_device *pdd; int err; - if (pdd == NULL) + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); return ERR_PTR(-ENOMEM); + } if (pdd->bound) return pdd; @@ -384,7 +392,7 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) pqm_uninit(&p->pqm); - pdd = kfd_get_process_device_data(dev, p, 0); + pdd = kfd_get_process_device_data(dev, p); /* * Just mark pdd as unbound, because we still need it to call diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 4752678..948b1ca 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -128,7 +128,6 @@ static int create_cp_queue(struct process_queue_manager *pqm, /* let DQM handle it*/ q_properties->vmid = 0; q_properties->queue_id = qid; - q_properties->type = KFD_QUEUE_TYPE_COMPUTE; retval = init_queue(q, *q_properties); if (retval != 0) @@ -167,8 +166,11 @@ int pqm_create_queue(struct process_queue_manager *pqm, q = NULL; kq = NULL; - pdd = kfd_get_process_device_data(dev, pqm->process, 1); - BUG_ON(!pdd); + pdd = kfd_get_process_device_data(dev, pqm->process); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + return -1; + } retval = find_available_queue_slot(pqm, qid); if (retval != 0) @@ -186,6 +188,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, } switch (type) { + case KFD_QUEUE_TYPE_SDMA: case KFD_QUEUE_TYPE_COMPUTE: /* check if there is over subscription */ if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && @@ -273,8 +276,11 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) dev = pqn->q->device; BUG_ON(!dev); - pdd = kfd_get_process_device_data(dev, pqm->process, 1); - BUG_ON(!pdd); + pdd = kfd_get_process_device_data(dev, pqm->process); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + return -1; + } if (pqn->kq) { /* destroy kernel queue (DIQ) */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index b11792d..4886dde 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -27,6 +27,7 @@ #include <linux/acpi.h> #include <linux/hash.h> #include <linux/cpufreq.h> +#include <linux/log2.h> #include "kfd_priv.h" #include "kfd_crat.h" @@ -630,10 +631,10 @@ static struct kobj_type cache_type = { static ssize_t node_show(struct kobject *kobj, struct attribute *attr, char *buffer) { - ssize_t ret; struct kfd_topology_device *dev; char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; uint32_t i; + uint32_t log_max_watch_addr; /* Making sure that the buffer is an empty string */ buffer[0] = 0; @@ -641,8 +642,10 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, if (strcmp(attr->name, "gpu_id") == 0) { dev = container_of(attr, struct kfd_topology_device, attr_gpuid); - ret = sysfs_show_32bit_val(buffer, dev->gpu_id); - } else if (strcmp(attr->name, "name") == 0) { + return sysfs_show_32bit_val(buffer, dev->gpu_id); + } + + if (strcmp(attr->name, "name") == 0) { dev = container_of(attr, struct kfd_topology_device, attr_name); for (i = 0; i < KFD_TOPOLOGY_PUBLIC_NAME_SIZE; i++) { @@ -652,80 +655,90 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, break; } public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE-1] = 0x0; - ret = sysfs_show_str_val(buffer, public_name); - } else { - dev = container_of(attr, struct kfd_topology_device, - attr_props); - sysfs_show_32bit_prop(buffer, "cpu_cores_count", - dev->node_props.cpu_cores_count); - sysfs_show_32bit_prop(buffer, "simd_count", - dev->node_props.simd_count); - - if (dev->mem_bank_count < dev->node_props.mem_banks_count) { - pr_warn("kfd: mem_banks_count truncated from %d to %d\n", - dev->node_props.mem_banks_count, - dev->mem_bank_count); - sysfs_show_32bit_prop(buffer, "mem_banks_count", - dev->mem_bank_count); - } else { - sysfs_show_32bit_prop(buffer, "mem_banks_count", - dev->node_props.mem_banks_count); - } + return sysfs_show_str_val(buffer, public_name); + } - sysfs_show_32bit_prop(buffer, "caches_count", - dev->node_props.caches_count); - sysfs_show_32bit_prop(buffer, "io_links_count", - dev->node_props.io_links_count); - sysfs_show_32bit_prop(buffer, "cpu_core_id_base", - dev->node_props.cpu_core_id_base); - sysfs_show_32bit_prop(buffer, "simd_id_base", - dev->node_props.simd_id_base); - sysfs_show_32bit_prop(buffer, "capability", - dev->node_props.capability); - sysfs_show_32bit_prop(buffer, "max_waves_per_simd", - dev->node_props.max_waves_per_simd); - sysfs_show_32bit_prop(buffer, "lds_size_in_kb", - dev->node_props.lds_size_in_kb); - sysfs_show_32bit_prop(buffer, "gds_size_in_kb", - dev->node_props.gds_size_in_kb); - sysfs_show_32bit_prop(buffer, "wave_front_size", - dev->node_props.wave_front_size); - sysfs_show_32bit_prop(buffer, "array_count", - dev->node_props.array_count); - sysfs_show_32bit_prop(buffer, "simd_arrays_per_engine", - dev->node_props.simd_arrays_per_engine); - sysfs_show_32bit_prop(buffer, "cu_per_simd_array", - dev->node_props.cu_per_simd_array); - sysfs_show_32bit_prop(buffer, "simd_per_cu", - dev->node_props.simd_per_cu); - sysfs_show_32bit_prop(buffer, "max_slots_scratch_cu", - dev->node_props.max_slots_scratch_cu); - sysfs_show_32bit_prop(buffer, "vendor_id", - dev->node_props.vendor_id); - sysfs_show_32bit_prop(buffer, "device_id", - dev->node_props.device_id); - sysfs_show_32bit_prop(buffer, "location_id", - dev->node_props.location_id); - - if (dev->gpu) { - sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", - kfd2kgd->get_max_engine_clock_in_mhz( - dev->gpu->kgd)); - sysfs_show_64bit_prop(buffer, "local_mem_size", - kfd2kgd->get_vmem_size(dev->gpu->kgd)); - - sysfs_show_32bit_prop(buffer, "fw_version", - kfd2kgd->get_fw_version( - dev->gpu->kgd, - KGD_ENGINE_MEC1)); + dev = container_of(attr, struct kfd_topology_device, + attr_props); + sysfs_show_32bit_prop(buffer, "cpu_cores_count", + dev->node_props.cpu_cores_count); + sysfs_show_32bit_prop(buffer, "simd_count", + dev->node_props.simd_count); + + if (dev->mem_bank_count < dev->node_props.mem_banks_count) { + pr_warn("kfd: mem_banks_count truncated from %d to %d\n", + dev->node_props.mem_banks_count, + dev->mem_bank_count); + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->mem_bank_count); + } else { + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->node_props.mem_banks_count); + } + sysfs_show_32bit_prop(buffer, "caches_count", + dev->node_props.caches_count); + sysfs_show_32bit_prop(buffer, "io_links_count", + dev->node_props.io_links_count); + sysfs_show_32bit_prop(buffer, "cpu_core_id_base", + dev->node_props.cpu_core_id_base); + sysfs_show_32bit_prop(buffer, "simd_id_base", + dev->node_props.simd_id_base); + sysfs_show_32bit_prop(buffer, "capability", + dev->node_props.capability); + sysfs_show_32bit_prop(buffer, "max_waves_per_simd", + dev->node_props.max_waves_per_simd); + sysfs_show_32bit_prop(buffer, "lds_size_in_kb", + dev->node_props.lds_size_in_kb); + sysfs_show_32bit_prop(buffer, "gds_size_in_kb", + dev->node_props.gds_size_in_kb); + sysfs_show_32bit_prop(buffer, "wave_front_size", + dev->node_props.wave_front_size); + sysfs_show_32bit_prop(buffer, "array_count", + dev->node_props.array_count); + sysfs_show_32bit_prop(buffer, "simd_arrays_per_engine", + dev->node_props.simd_arrays_per_engine); + sysfs_show_32bit_prop(buffer, "cu_per_simd_array", + dev->node_props.cu_per_simd_array); + sysfs_show_32bit_prop(buffer, "simd_per_cu", + dev->node_props.simd_per_cu); + sysfs_show_32bit_prop(buffer, "max_slots_scratch_cu", + dev->node_props.max_slots_scratch_cu); + sysfs_show_32bit_prop(buffer, "vendor_id", + dev->node_props.vendor_id); + sysfs_show_32bit_prop(buffer, "device_id", + dev->node_props.device_id); + sysfs_show_32bit_prop(buffer, "location_id", + dev->node_props.location_id); + + if (dev->gpu) { + log_max_watch_addr = + __ilog2_u32(dev->gpu->device_info->num_of_watch_points); + + if (log_max_watch_addr) { + dev->node_props.capability |= + HSA_CAP_WATCH_POINTS_SUPPORTED; + + dev->node_props.capability |= + ((log_max_watch_addr << + HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT) & + HSA_CAP_WATCH_POINTS_TOTALBITS_MASK); } - ret = sysfs_show_32bit_prop(buffer, "max_engine_clk_ccompute", - cpufreq_quick_get_max(0)/1000); + sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", + kfd2kgd->get_max_engine_clock_in_mhz( + dev->gpu->kgd)); + sysfs_show_64bit_prop(buffer, "local_mem_size", + kfd2kgd->get_vmem_size(dev->gpu->kgd)); + + sysfs_show_32bit_prop(buffer, "fw_version", + kfd2kgd->get_fw_version( + dev->gpu->kgd, + KGD_ENGINE_MEC1)); } - return ret; + return sysfs_show_32bit_prop(buffer, "max_engine_clk_ccompute", + cpufreq_quick_get_max(0)/1000); } static const struct sysfs_ops node_ops = { diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h index 47b5519..cd3878f 100644 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -110,17 +110,10 @@ struct kgd2kfd_calls { /** * struct kfd2kgd_calls * - * @init_sa_manager: Initialize an instance of the sa manager, used by - * amdkfd for all system memory allocations that are mapped to the GART - * address space + * @init_gtt_mem_allocation: Allocate a buffer on the gart aperture. + * The buffer can be used for mqds, hpds, kernel queue, fence and runlists * - * @fini_sa_manager: Releases all memory allocations for amdkfd that are - * handled by kgd sa manager - * - * @allocate_mem: Allocate a buffer from amdkfd's sa manager. The buffer can - * be used for mqds, hpds, kernel queue, fence and runlists - * - * @free_mem: Frees a buffer that was allocated by amdkfd's sa manager + * @free_gtt_mem: Frees a buffer that was allocated on the gart aperture * * @get_vmem_size: Retrieves (physical) size of VRAM * @@ -144,10 +137,18 @@ struct kgd2kfd_calls { * @hqd_load: Loads the mqd structure to a H/W hqd slot. used only for no cp * sceduling mode. * + * @hqd_sdma_load: Loads the SDMA mqd structure to a H/W SDMA hqd slot. + * used only for no HWS mode. + * * @hqd_is_occupies: Checks if a hqd slot is occupied. * * @hqd_destroy: Destructs and preempts the queue assigned to that hqd slot. * + * @hqd_sdma_is_occupied: Checks if an SDMA hqd slot is occupied. + * + * @hqd_sdma_destroy: Destructs and preempts the SDMA queue assigned to that + * SDMA hqd slot. + * * @get_fw_version: Returns FW versions from the header * * This structure contains function pointers to services that the kgd driver @@ -155,13 +156,11 @@ struct kgd2kfd_calls { * */ struct kfd2kgd_calls { - /* Memory management. */ - int (*init_sa_manager)(struct kgd_dev *kgd, unsigned int size); - void (*fini_sa_manager)(struct kgd_dev *kgd); - int (*allocate_mem)(struct kgd_dev *kgd, size_t size, size_t alignment, - enum kgd_memory_pool pool, struct kgd_mem **mem); + int (*init_gtt_mem_allocation)(struct kgd_dev *kgd, size_t size, + void **mem_obj, uint64_t *gpu_addr, + void **cpu_ptr); - void (*free_mem)(struct kgd_dev *kgd, struct kgd_mem *mem); + void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj); uint64_t (*get_vmem_size)(struct kgd_dev *kgd); uint64_t (*get_gpu_clock_counter)(struct kgd_dev *kgd); @@ -183,18 +182,26 @@ struct kfd2kgd_calls { int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, uint32_t queue_id, uint32_t __user *wptr); + int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd); + bool (*hqd_is_occupies)(struct kgd_dev *kgd, uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id); int (*hqd_destroy)(struct kgd_dev *kgd, uint32_t reset_type, unsigned int timeout, uint32_t pipe_id, uint32_t queue_id); + + bool (*hqd_sdma_is_occupied)(struct kgd_dev *kgd, void *mqd); + + int (*hqd_sdma_destroy)(struct kgd_dev *kgd, void *mqd, + unsigned int timeout); + uint16_t (*get_fw_version)(struct kgd_dev *kgd, enum kgd_engine_type type); }; bool kgd2kfd_init(unsigned interface_version, - const struct kfd2kgd_calls *f2g, - const struct kgd2kfd_calls **g2f); + const struct kfd2kgd_calls *f2g, + const struct kgd2kfd_calls **g2f); -#endif /* KGD_KFD_INTERFACE_H_INCLUDED */ +#endif /* KGD_KFD_INTERFACE_H_INCLUDED */ diff --git a/drivers/gpu/drm/radeon/cik_reg.h b/drivers/gpu/drm/radeon/cik_reg.h index 79c45e8..bbb8f2e 100644 --- a/drivers/gpu/drm/radeon/cik_reg.h +++ b/drivers/gpu/drm/radeon/cik_reg.h @@ -147,10 +147,42 @@ #define CIK_LB_DESKTOP_HEIGHT 0x6b0c +#define KFD_CIK_SDMA_QUEUE_OFFSET 0x200 + #define CP_HQD_IQ_RPTR 0xC970u #define AQL_ENABLE (1U << 0) - -#define IDLE (1 << 2) +#define SDMA0_RLC0_RB_CNTL 0xD400u +#define SDMA_RB_VMID(x) (x << 24) +#define SDMA0_RLC0_RB_BASE 0xD404u +#define SDMA0_RLC0_RB_BASE_HI 0xD408u +#define SDMA0_RLC0_RB_RPTR 0xD40Cu +#define SDMA0_RLC0_RB_WPTR 0xD410u +#define SDMA0_RLC0_RB_WPTR_POLL_CNTL 0xD414u +#define SDMA0_RLC0_RB_WPTR_POLL_ADDR_HI 0xD418u +#define SDMA0_RLC0_RB_WPTR_POLL_ADDR_LO 0xD41Cu +#define SDMA0_RLC0_RB_RPTR_ADDR_HI 0xD420u +#define SDMA0_RLC0_RB_RPTR_ADDR_LO 0xD424u +#define SDMA0_RLC0_IB_CNTL 0xD428u +#define SDMA0_RLC0_IB_RPTR 0xD42Cu +#define SDMA0_RLC0_IB_OFFSET 0xD430u +#define SDMA0_RLC0_IB_BASE_LO 0xD434u +#define SDMA0_RLC0_IB_BASE_HI 0xD438u +#define SDMA0_RLC0_IB_SIZE 0xD43Cu +#define SDMA0_RLC0_SKIP_CNTL 0xD440u +#define SDMA0_RLC0_CONTEXT_STATUS 0xD444u +#define SDMA_RLC_IDLE (1 << 2) +#define SDMA0_RLC0_DOORBELL 0xD448u +#define SDMA_OFFSET(x) (x << 0) +#define SDMA_DB_ENABLE (1 << 28) +#define SDMA0_RLC0_VIRTUAL_ADDR 0xD49Cu +#define SDMA_ATC (1 << 0) +#define SDMA_VA_PTR32 (1 << 4) +#define SDMA_VA_SHARED_BASE(x) (x << 8) +#define SDMA0_RLC0_APE1_CNTL 0xD4A0u +#define SDMA0_RLC0_DOORBELL_LOG 0xD4A4u +#define SDMA0_RLC0_WATERMARK 0xD4A8u +#define SDMA0_CNTL 0xD010 +#define SDMA1_CNTL 0xD810 struct cik_mqd { uint32_t header; @@ -283,4 +315,137 @@ struct cik_mqd { uint32_t queue_doorbell_id15; }; +struct cik_sdma_rlc_registers { + uint32_t sdma_rlc_rb_cntl; + uint32_t sdma_rlc_rb_base; + uint32_t sdma_rlc_rb_base_hi; + uint32_t sdma_rlc_rb_rptr; + uint32_t sdma_rlc_rb_wptr; + uint32_t sdma_rlc_rb_wptr_poll_cntl; + uint32_t sdma_rlc_rb_wptr_poll_addr_hi; + uint32_t sdma_rlc_rb_wptr_poll_addr_lo; + uint32_t sdma_rlc_rb_rptr_addr_hi; + uint32_t sdma_rlc_rb_rptr_addr_lo; + uint32_t sdma_rlc_ib_cntl; + uint32_t sdma_rlc_ib_rptr; + uint32_t sdma_rlc_ib_offset; + uint32_t sdma_rlc_ib_base_lo; + uint32_t sdma_rlc_ib_base_hi; + uint32_t sdma_rlc_ib_size; + uint32_t sdma_rlc_skip_cntl; + uint32_t sdma_rlc_context_status; + uint32_t sdma_rlc_doorbell; + uint32_t sdma_rlc_virtual_addr; + uint32_t sdma_rlc_ape1_cntl; + uint32_t sdma_rlc_doorbell_log; + uint32_t reserved_22; + uint32_t reserved_23; + uint32_t reserved_24; + uint32_t reserved_25; + uint32_t reserved_26; + uint32_t reserved_27; + uint32_t reserved_28; + uint32_t reserved_29; + uint32_t reserved_30; + uint32_t reserved_31; + uint32_t reserved_32; + uint32_t reserved_33; + uint32_t reserved_34; + uint32_t reserved_35; + uint32_t reserved_36; + uint32_t reserved_37; + uint32_t reserved_38; + uint32_t reserved_39; + uint32_t reserved_40; + uint32_t reserved_41; + uint32_t reserved_42; + uint32_t reserved_43; + uint32_t reserved_44; + uint32_t reserved_45; + uint32_t reserved_46; + uint32_t reserved_47; + uint32_t reserved_48; + uint32_t reserved_49; + uint32_t reserved_50; + uint32_t reserved_51; + uint32_t reserved_52; + uint32_t reserved_53; + uint32_t reserved_54; + uint32_t reserved_55; + uint32_t reserved_56; + uint32_t reserved_57; + uint32_t reserved_58; + uint32_t reserved_59; + uint32_t reserved_60; + uint32_t reserved_61; + uint32_t reserved_62; + uint32_t reserved_63; + uint32_t reserved_64; + uint32_t reserved_65; + uint32_t reserved_66; + uint32_t reserved_67; + uint32_t reserved_68; + uint32_t reserved_69; + uint32_t reserved_70; + uint32_t reserved_71; + uint32_t reserved_72; + uint32_t reserved_73; + uint32_t reserved_74; + uint32_t reserved_75; + uint32_t reserved_76; + uint32_t reserved_77; + uint32_t reserved_78; + uint32_t reserved_79; + uint32_t reserved_80; + uint32_t reserved_81; + uint32_t reserved_82; + uint32_t reserved_83; + uint32_t reserved_84; + uint32_t reserved_85; + uint32_t reserved_86; + uint32_t reserved_87; + uint32_t reserved_88; + uint32_t reserved_89; + uint32_t reserved_90; + uint32_t reserved_91; + uint32_t reserved_92; + uint32_t reserved_93; + uint32_t reserved_94; + uint32_t reserved_95; + uint32_t reserved_96; + uint32_t reserved_97; + uint32_t reserved_98; + uint32_t reserved_99; + uint32_t reserved_100; + uint32_t reserved_101; + uint32_t reserved_102; + uint32_t reserved_103; + uint32_t reserved_104; + uint32_t reserved_105; + uint32_t reserved_106; + uint32_t reserved_107; + uint32_t reserved_108; + uint32_t reserved_109; + uint32_t reserved_110; + uint32_t reserved_111; + uint32_t reserved_112; + uint32_t reserved_113; + uint32_t reserved_114; + uint32_t reserved_115; + uint32_t reserved_116; + uint32_t reserved_117; + uint32_t reserved_118; + uint32_t reserved_119; + uint32_t reserved_120; + uint32_t reserved_121; + uint32_t reserved_122; + uint32_t reserved_123; + uint32_t reserved_124; + uint32_t reserved_125; + uint32_t reserved_126; + uint32_t reserved_127; + uint32_t sdma_engine_id; + uint32_t sdma_queue_id; +}; + #endif diff --git a/drivers/gpu/drm/radeon/cik_sdma.c b/drivers/gpu/drm/radeon/cik_sdma.c index dde5c7e..1f4ded1 100644 --- a/drivers/gpu/drm/radeon/cik_sdma.c +++ b/drivers/gpu/drm/radeon/cik_sdma.c @@ -283,6 +283,33 @@ static void cik_sdma_rlc_stop(struct radeon_device *rdev) } /** + * cik_sdma_ctx_switch_enable - enable/disable sdma engine preemption + * + * @rdev: radeon_device pointer + * @enable: enable/disable preemption. + * + * Halt or unhalt the async dma engines (CIK). + */ +void cik_sdma_ctx_switch_enable(struct radeon_device *rdev, bool enable) +{ + uint32_t reg_offset, value; + int i; + + for (i = 0; i < 2; i++) { + if (i == 0) + reg_offset = SDMA0_REGISTER_OFFSET; + else + reg_offset = SDMA1_REGISTER_OFFSET; + value = RREG32(SDMA0_CNTL + reg_offset); + if (enable) + value |= AUTO_CTXSW_ENABLE; + else + value &= ~AUTO_CTXSW_ENABLE; + WREG32(SDMA0_CNTL + reg_offset, value); + } +} + +/** * cik_sdma_enable - stop the async dma engines * * @rdev: radeon_device pointer @@ -312,6 +339,8 @@ void cik_sdma_enable(struct radeon_device *rdev, bool enable) me_cntl |= SDMA_HALT; WREG32(SDMA0_ME_CNTL + reg_offset, me_cntl); } + + cik_sdma_ctx_switch_enable(rdev, enable); } /** diff --git a/drivers/gpu/drm/radeon/radeon_kfd.c b/drivers/gpu/drm/radeon/radeon_kfd.c index 242fd8b..cae11ee 100644 --- a/drivers/gpu/drm/radeon/radeon_kfd.c +++ b/drivers/gpu/drm/radeon/radeon_kfd.c @@ -34,18 +34,17 @@ #define CIK_PIPE_PER_MEC (4) struct kgd_mem { - struct radeon_sa_bo *sa_bo; + struct radeon_bo *bo; uint64_t gpu_addr; - void *ptr; + void *cpu_ptr; }; -static int init_sa_manager(struct kgd_dev *kgd, unsigned int size); -static void fini_sa_manager(struct kgd_dev *kgd); -static int allocate_mem(struct kgd_dev *kgd, size_t size, size_t alignment, - enum kgd_memory_pool pool, struct kgd_mem **mem); +static int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, + void **mem_obj, uint64_t *gpu_addr, + void **cpu_ptr); -static void free_mem(struct kgd_dev *kgd, struct kgd_mem *mem); +static void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj); static uint64_t get_vmem_size(struct kgd_dev *kgd); static uint64_t get_gpu_clock_counter(struct kgd_dev *kgd); @@ -71,19 +70,20 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, uint32_t queue_id, uint32_t __user *wptr); - +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); static bool kgd_hqd_is_occupies(struct kgd_dev *kgd, uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id); static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, unsigned int timeout, uint32_t pipe_id, uint32_t queue_id); +static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); +static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + unsigned int timeout); static const struct kfd2kgd_calls kfd2kgd = { - .init_sa_manager = init_sa_manager, - .fini_sa_manager = fini_sa_manager, - .allocate_mem = allocate_mem, - .free_mem = free_mem, + .init_gtt_mem_allocation = alloc_gtt_mem, + .free_gtt_mem = free_gtt_mem, .get_vmem_size = get_vmem_size, .get_gpu_clock_counter = get_gpu_clock_counter, .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, @@ -92,8 +92,11 @@ static const struct kfd2kgd_calls kfd2kgd = { .init_memory = kgd_init_memory, .init_pipeline = kgd_init_pipeline, .hqd_load = kgd_hqd_load, + .hqd_sdma_load = kgd_hqd_sdma_load, .hqd_is_occupies = kgd_hqd_is_occupies, + .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, .hqd_destroy = kgd_hqd_destroy, + .hqd_sdma_destroy = kgd_hqd_sdma_destroy, .get_fw_version = get_fw_version }; @@ -182,87 +185,78 @@ int radeon_kfd_resume(struct radeon_device *rdev) return r; } -static u32 pool_to_domain(enum kgd_memory_pool p) -{ - switch (p) { - case KGD_POOL_FRAMEBUFFER: return RADEON_GEM_DOMAIN_VRAM; - default: return RADEON_GEM_DOMAIN_GTT; - } -} - -static int init_sa_manager(struct kgd_dev *kgd, unsigned int size) +static int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, + void **mem_obj, uint64_t *gpu_addr, + void **cpu_ptr) { struct radeon_device *rdev = (struct radeon_device *)kgd; + struct kgd_mem **mem = (struct kgd_mem **) mem_obj; int r; BUG_ON(kgd == NULL); + BUG_ON(gpu_addr == NULL); + BUG_ON(cpu_ptr == NULL); - r = radeon_sa_bo_manager_init(rdev, &rdev->kfd_bo, - size, - RADEON_GPU_PAGE_SIZE, - RADEON_GEM_DOMAIN_GTT, - RADEON_GEM_GTT_WC); + *mem = kmalloc(sizeof(struct kgd_mem), GFP_KERNEL); + if ((*mem) == NULL) + return -ENOMEM; - if (r) + r = radeon_bo_create(rdev, size, PAGE_SIZE, true, RADEON_GEM_DOMAIN_GTT, + RADEON_GEM_GTT_WC, NULL, NULL, &(*mem)->bo); + if (r) { + dev_err(rdev->dev, + "failed to allocate BO for amdkfd (%d)\n", r); return r; + } - r = radeon_sa_bo_manager_start(rdev, &rdev->kfd_bo); - if (r) - radeon_sa_bo_manager_fini(rdev, &rdev->kfd_bo); - - return r; -} - -static void fini_sa_manager(struct kgd_dev *kgd) -{ - struct radeon_device *rdev = (struct radeon_device *)kgd; - - BUG_ON(kgd == NULL); - - radeon_sa_bo_manager_suspend(rdev, &rdev->kfd_bo); - radeon_sa_bo_manager_fini(rdev, &rdev->kfd_bo); -} - -static int allocate_mem(struct kgd_dev *kgd, size_t size, size_t alignment, - enum kgd_memory_pool pool, struct kgd_mem **mem) -{ - struct radeon_device *rdev = (struct radeon_device *)kgd; - u32 domain; - int r; - - BUG_ON(kgd == NULL); - - domain = pool_to_domain(pool); - if (domain != RADEON_GEM_DOMAIN_GTT) { - dev_err(rdev->dev, - "Only allowed to allocate gart memory for kfd\n"); - return -EINVAL; + /* map the buffer */ + r = radeon_bo_reserve((*mem)->bo, true); + if (r) { + dev_err(rdev->dev, "(%d) failed to reserve bo for amdkfd\n", r); + goto allocate_mem_reserve_bo_failed; } - *mem = kmalloc(sizeof(struct kgd_mem), GFP_KERNEL); - if ((*mem) == NULL) - return -ENOMEM; + r = radeon_bo_pin((*mem)->bo, RADEON_GEM_DOMAIN_GTT, + &(*mem)->gpu_addr); + if (r) { + dev_err(rdev->dev, "(%d) failed to pin bo for amdkfd\n", r); + goto allocate_mem_pin_bo_failed; + } + *gpu_addr = (*mem)->gpu_addr; - r = radeon_sa_bo_new(rdev, &rdev->kfd_bo, &(*mem)->sa_bo, size, - alignment); + r = radeon_bo_kmap((*mem)->bo, &(*mem)->cpu_ptr); if (r) { - dev_err(rdev->dev, "failed to get memory for kfd (%d)\n", r); - return r; + dev_err(rdev->dev, + "(%d) failed to map bo to kernel for amdkfd\n", r); + goto allocate_mem_kmap_bo_failed; } + *cpu_ptr = (*mem)->cpu_ptr; - (*mem)->ptr = radeon_sa_bo_cpu_addr((*mem)->sa_bo); - (*mem)->gpu_addr = radeon_sa_bo_gpu_addr((*mem)->sa_bo); + radeon_bo_unreserve((*mem)->bo); return 0; + +allocate_mem_kmap_bo_failed: + radeon_bo_unpin((*mem)->bo); +allocate_mem_pin_bo_failed: + radeon_bo_unreserve((*mem)->bo); +allocate_mem_reserve_bo_failed: + radeon_bo_unref(&(*mem)->bo); + + return r; } -static void free_mem(struct kgd_dev *kgd, struct kgd_mem *mem) +static void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj) { - struct radeon_device *rdev = (struct radeon_device *)kgd; + struct kgd_mem *mem = (struct kgd_mem *) mem_obj; - BUG_ON(kgd == NULL); + BUG_ON(mem == NULL); - radeon_sa_bo_free(rdev, &mem->sa_bo, NULL); + radeon_bo_reserve(mem->bo, true); + radeon_bo_kunmap(mem->bo); + radeon_bo_unpin(mem->bo); + radeon_bo_unreserve(mem->bo); + radeon_bo_unref(&(mem->bo)); kfree(mem); } @@ -435,11 +429,28 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, return 0; } +static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) +{ + uint32_t retval; + + retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + + m->sdma_queue_id * KFD_CIK_SDMA_QUEUE_OFFSET; + + pr_debug("kfd: sdma base address: 0x%x\n", retval); + + return retval; +} + static inline struct cik_mqd *get_mqd(void *mqd) { return (struct cik_mqd *)mqd; } +static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) +{ + return (struct cik_sdma_rlc_registers *)mqd; +} + static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, uint32_t queue_id, uint32_t __user *wptr) { @@ -517,6 +528,45 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, return 0; } +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) +{ + struct cik_sdma_rlc_registers *m; + uint32_t sdma_base_addr; + + m = get_sdma_mqd(mqd); + sdma_base_addr = get_sdma_base_addr(m); + + write_register(kgd, + sdma_base_addr + SDMA0_RLC0_VIRTUAL_ADDR, + m->sdma_rlc_virtual_addr); + + write_register(kgd, + sdma_base_addr + SDMA0_RLC0_RB_BASE, + m->sdma_rlc_rb_base); + + write_register(kgd, + sdma_base_addr + SDMA0_RLC0_RB_BASE_HI, + m->sdma_rlc_rb_base_hi); + + write_register(kgd, + sdma_base_addr + SDMA0_RLC0_RB_RPTR_ADDR_LO, + m->sdma_rlc_rb_rptr_addr_lo); + + write_register(kgd, + sdma_base_addr + SDMA0_RLC0_RB_RPTR_ADDR_HI, + m->sdma_rlc_rb_rptr_addr_hi); + + write_register(kgd, + sdma_base_addr + SDMA0_RLC0_DOORBELL, + m->sdma_rlc_doorbell); + + write_register(kgd, + sdma_base_addr + SDMA0_RLC0_RB_CNTL, + m->sdma_rlc_rb_cntl); + + return 0; +} + static bool kgd_hqd_is_occupies(struct kgd_dev *kgd, uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id) { @@ -538,6 +588,24 @@ static bool kgd_hqd_is_occupies(struct kgd_dev *kgd, uint64_t queue_address, return retval; } +static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) +{ + struct cik_sdma_rlc_registers *m; + uint32_t sdma_base_addr; + uint32_t sdma_rlc_rb_cntl; + + m = get_sdma_mqd(mqd); + sdma_base_addr = get_sdma_base_addr(m); + + sdma_rlc_rb_cntl = read_register(kgd, + sdma_base_addr + SDMA0_RLC0_RB_CNTL); + + if (sdma_rlc_rb_cntl & SDMA_RB_ENABLE) + return true; + + return false; +} + static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, unsigned int timeout, uint32_t pipe_id, uint32_t queue_id) @@ -566,6 +634,39 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, return 0; } +static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + unsigned int timeout) +{ + struct cik_sdma_rlc_registers *m; + uint32_t sdma_base_addr; + uint32_t temp; + + m = get_sdma_mqd(mqd); + sdma_base_addr = get_sdma_base_addr(m); + + temp = read_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_CNTL); + temp = temp & ~SDMA_RB_ENABLE; + write_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_CNTL, temp); + + while (true) { + temp = read_register(kgd, sdma_base_addr + + SDMA0_RLC0_CONTEXT_STATUS); + if (temp & SDMA_RLC_IDLE) + break; + if (timeout == 0) + return -ETIME; + msleep(20); + timeout -= 20; + } + + write_register(kgd, sdma_base_addr + SDMA0_RLC0_DOORBELL, 0); + write_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_RPTR, 0); + write_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_WPTR, 0); + write_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_BASE, 0); + + return 0; +} + static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) { struct radeon_device *rdev = (struct radeon_device *) kgd; |