// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2014-2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include <linux/slab.h>
#include <linux/mutex.h>
#include "kfd_device_queue_manager.h"
#include "kfd_kernel_queue.h"
#include "kfd_priv.h"

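/*
 * Advance the runlist write pointer (tracked in dwords) by
 * increment_bytes, warning if the write would run past the end of the
 * runlist IB allocation.
 */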
static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
                unsigned int buffer_size_bytes)
{
    unsigned int temp = *wptr + increment_bytes / sizeof(uint32_t);

    WARN((temp * sizeof(uint32_t)) > buffer_size_bytes,
         "Runlist IB overflow");
    *wptr = temp;
}

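/*
 * Compute the runlist IB size needed for all active processes and
 * queues, and report over-subscription: more processes, compute queues
 * or GWS-using queues than the scheduler can map concurrently.
 */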
static void pm_calc_rlib_size(struct packet_manager *pm,
                unsigned int *rlib_size,
                bool *over_subscription)
{
    unsigned int process_count, queue_count, compute_queue_count, gws_queue_count;
    unsigned int map_queue_size;
    unsigned int max_proc_per_quantum = 1;
    struct kfd_dev *dev = pm->dqm->dev;

    process_count = pm->dqm->processes_count;
    queue_count = pm->dqm->active_queue_count;
    compute_queue_count = pm->dqm->active_cp_queue_count;
    gws_queue_count = pm->dqm->gws_queue_count;

    /* check if there is over subscription
     * Note: the arbitration between the number of VMIDs and
     * hws_max_conc_proc has been done in
     * kgd2kfd_device_init().
     */
    *over_subscription = false;

    if (dev->max_proc_per_quantum > 1)
        max_proc_per_quantum = dev->max_proc_per_quantum;

    if ((process_count > max_proc_per_quantum) ||
        compute_queue_count > get_cp_queues_num(pm->dqm) ||
        gws_queue_count > 1) {
        *over_subscription = true;
        pr_debug("Over subscribed runlist\n");
    }

    map_queue_size = pm->pmf->map_queues_size;
    /* calculate run list ib allocation size */
    *rlib_size = process_count * pm->pmf->map_process_size +
             queue_count * map_queue_size;

    /*
     * Increase the allocation size in case we need a chained run list
     * when over subscription
     */
    if (*over_subscription)
        *rlib_size += pm->pmf->runlist_size;

    pr_debug("runlist ib size %d\n", *rlib_size);
}

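/*
 * Allocate and zero-fill the runlist IB from the device's GTT
 * sub-allocator, returning both its CPU and GPU addresses.
 */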
static int pm_allocate_runlist_ib(struct packet_manager *pm,
                unsigned int **rl_buffer,
                uint64_t *rl_gpu_buffer,
                unsigned int *rl_buffer_size,
                bool *is_over_subscription)
{
    int retval;

    if (WARN_ON(pm->allocated))
        return -EINVAL;

    pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription);

    mutex_lock(&pm->lock);

    retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size,
                    &pm->ib_buffer_obj);

    if (retval) {
        pr_err("Failed to allocate runlist IB\n");
        goto out;
    }

    *(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr;
    *rl_gpu_buffer = pm->ib_buffer_obj->gpu_addr;

    memset(*rl_buffer, 0, *rl_buffer_size);
    pm->allocated = true;

out:
    mutex_unlock(&pm->lock);
    return retval;
}

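/*
 * Build the runlist IB: a map-process packet for each process, each
 * followed by map-queues packets for its active kernel and user queues.
 * When over-subscribed, a runlist packet that chains back to the start
 * of this IB is appended so the scheduler keeps cycling through it.
 */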
static int pm_create_runlist_ib(struct packet_manager *pm,
                struct list_head *queues,
                uint64_t *rl_gpu_addr,
                size_t *rl_size_bytes)
{
    unsigned int alloc_size_bytes;
    unsigned int *rl_buffer, rl_wptr, i;
    int retval, processes_mapped;
    struct device_process_node *cur;
    struct qcm_process_device *qpd;
    struct queue *q;
    struct kernel_queue *kq;
    bool is_over_subscription;

    rl_wptr = retval = processes_mapped = 0;

    retval = pm_allocate_runlist_ib(pm, &rl_buffer, rl_gpu_addr,
                &alloc_size_bytes, &is_over_subscription);
    if (retval)
        return retval;

    *rl_size_bytes = alloc_size_bytes;
    pm->ib_size_bytes = alloc_size_bytes;

    pr_debug("Building runlist ib process count: %d queues count %d\n",
        pm->dqm->processes_count, pm->dqm->active_queue_count);

    /* build the run list ib packet */
    list_for_each_entry(cur, queues, list) {
        qpd = cur->qpd;
        /* build map process packet */
        if (processes_mapped >= pm->dqm->processes_count) {
            pr_debug("Not enough space left in runlist IB\n");
            pm_release_ib(pm);
            return -ENOMEM;
        }

        retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd);
        if (retval)
            return retval;

        processes_mapped++;
        inc_wptr(&rl_wptr, pm->pmf->map_process_size,
                alloc_size_bytes);

        list_for_each_entry(kq, &qpd->priv_queue_list, list) {
            if (!kq->queue->properties.is_active)
                continue;

            pr_debug("static_queue, mapping kernel q %d, is debug status %d\n",
                kq->queue->queue, qpd->is_debug);

            retval = pm->pmf->map_queues(pm,
                        &rl_buffer[rl_wptr],
                        kq->queue,
                        qpd->is_debug);
            if (retval)
                return retval;

            inc_wptr(&rl_wptr,
                pm->pmf->map_queues_size,
                alloc_size_bytes);
        }

        list_for_each_entry(q, &qpd->queues_list, list) {
            if (!q->properties.is_active)
                continue;

            pr_debug("static_queue, mapping user queue %d, is debug status %d\n",
                q->queue, qpd->is_debug);

            retval = pm->pmf->map_queues(pm,
                        &rl_buffer[rl_wptr],
                        q,
                        qpd->is_debug);

            if (retval)
                return retval;

            inc_wptr(&rl_wptr,
                pm->pmf->map_queues_size,
                alloc_size_bytes);
        }
    }

    pr_debug("Finished map process and queues to runlist\n");

    if (is_over_subscription) {
        if (!pm->is_over_subscription)
            pr_warn("Runlist is getting oversubscribed. Expect reduced ROCm performance.\n");
        retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr],
                    *rl_gpu_addr,
                    alloc_size_bytes / sizeof(uint32_t),
                    true);
    }
    pm->is_over_subscription = is_over_subscription;

    for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++)
        pr_debug("0x%2X ", rl_buffer[i]);
    pr_debug("\n");

    return retval;
}

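/*
 * Pick the ASIC-specific packet writers and create the HIQ kernel
 * queue through which the packet manager talks to the HWS.
 */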
int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
{
    switch (dqm->dev->adev->asic_type) {
    case CHIP_KAVERI:
    case CHIP_HAWAII:
        /* PM4 packet structures on CIK are the same as on VI */
    case CHIP_CARRIZO:
    case CHIP_TONGA:
    case CHIP_FIJI:
    case CHIP_POLARIS10:
    case CHIP_POLARIS11:
    case CHIP_POLARIS12:
    case CHIP_VEGAM:
        pm->pmf = &kfd_vi_pm_funcs;
        break;
    default:
        if (KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 2))
            pm->pmf = &kfd_aldebaran_pm_funcs;
        else if (KFD_GC_VERSION(dqm->dev) >= IP_VERSION(9, 0, 1))
            pm->pmf = &kfd_v9_pm_funcs;
        else {
            WARN(1, "Unexpected ASIC family %u",
                 dqm->dev->adev->asic_type);
            return -EINVAL;
        }
    }

    pm->dqm = dqm;
    mutex_init(&pm->lock);
    pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ);
    if (!pm->priv_queue) {
        mutex_destroy(&pm->lock);
        return -ENOMEM;
    }
    pm->allocated = false;

    return 0;
}

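/* Tear down the packet manager and its HIQ kernel queue. */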
void pm_uninit(struct packet_manager *pm, bool hanging)
{
    mutex_destroy(&pm->lock);
    kernel_queue_uninit(pm->priv_queue, hanging);
    pm->priv_queue = NULL;
}

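/*
 * Build and submit a set-resources packet on the HIQ, describing the
 * scheduling resources (e.g. queue and VMID masks) handed to the HWS.
 */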
int pm_send_set_resources(struct packet_manager *pm,
                struct scheduling_resources *res)
{
    uint32_t *buffer, size;
    int retval = 0;

    size = pm->pmf->set_resources_size;
    mutex_lock(&pm->lock);
    kq_acquire_packet_buffer(pm->priv_queue,
                    size / sizeof(uint32_t),
                    (unsigned int **)&buffer);
    if (!buffer) {
        pr_err("Failed to allocate buffer on kernel queue\n");
        retval = -ENOMEM;
        goto out;
    }

    retval = pm->pmf->set_resources(pm, buffer, res);
    if (!retval)
        kq_submit_packet(pm->priv_queue);
    else
        kq_rollback_packet(pm->priv_queue);

out:
    mutex_unlock(&pm->lock);

    return retval;
}

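/*
 * Build the runlist IB for the given queue list and submit a runlist
 * packet on the HIQ pointing the HWS at it.
 */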
int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
{
    uint64_t rl_gpu_ib_addr;
    uint32_t *rl_buffer;
    size_t rl_ib_size, packet_size_dwords;
    int retval;

    retval = pm_create_runlist_ib(pm, dqm_queues, &rl_gpu_ib_addr,
                    &rl_ib_size);
    if (retval)
        goto fail_create_runlist_ib;

    pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr);

    packet_size_dwords = pm->pmf->runlist_size / sizeof(uint32_t);
    mutex_lock(&pm->lock);

    retval = kq_acquire_packet_buffer(pm->priv_queue,
                    packet_size_dwords, &rl_buffer);
    if (retval)
        goto fail_acquire_packet_buffer;

    retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr,
                    rl_ib_size / sizeof(uint32_t), false);
    if (retval)
        goto fail_create_runlist;

    kq_submit_packet(pm->priv_queue);

    mutex_unlock(&pm->lock);

    return retval;

fail_create_runlist:
    kq_rollback_packet(pm->priv_queue);
fail_acquire_packet_buffer:
    mutex_unlock(&pm->lock);
fail_create_runlist_ib:
    pm_release_ib(pm);
    return retval;
}

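/*
 * Submit a query-status packet asking the HWS to write fence_value to
 * fence_address, which callers poll to detect completion.
 */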
int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
            uint64_t fence_value)
{
    uint32_t *buffer, size;
    int retval = 0;

    if (WARN_ON(!fence_address))
        return -EFAULT;

    size = pm->pmf->query_status_size;
    mutex_lock(&pm->lock);
    kq_acquire_packet_buffer(pm->priv_queue,
            size / sizeof(uint32_t), (unsigned int **)&buffer);
    if (!buffer) {
        pr_err("Failed to allocate buffer on kernel queue\n");
        retval = -ENOMEM;
        goto out;
    }

    retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value);
    if (!retval)
        kq_submit_packet(pm->priv_queue);
    else
        kq_rollback_packet(pm->priv_queue);

out:
    mutex_unlock(&pm->lock);
    return retval;
}

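/*
 * Submit an unmap-queues packet to preempt the queues selected by the
 * given filter, or to reset them when reset is true.
 */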
int pm_send_unmap_queue(struct packet_manager *pm,
            enum kfd_unmap_queues_filter filter,
            uint32_t filter_param, bool reset)
{
    uint32_t *buffer, size;
    int retval = 0;

    size = pm->pmf->unmap_queues_size;
    mutex_lock(&pm->lock);
    kq_acquire_packet_buffer(pm->priv_queue,
            size / sizeof(uint32_t), (unsigned int **)&buffer);
    if (!buffer) {
        pr_err("Failed to allocate buffer on kernel queue\n");
        retval = -ENOMEM;
        goto out;
    }

    retval = pm->pmf->unmap_queues(pm, buffer, filter, filter_param, reset);
    if (!retval)
        kq_submit_packet(pm->priv_queue);
    else
        kq_rollback_packet(pm->priv_queue);

out:
    mutex_unlock(&pm->lock);
    return retval;
}

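/* Free the runlist IB, if one is currently allocated. */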
void pm_release_ib(struct packet_manager *pm)
{
    mutex_lock(&pm->lock);
    if (pm->allocated) {
        kfd_gtt_sa_free(pm->dqm->dev, pm->ib_buffer_obj);
        pm->allocated = false;
    }
    mutex_unlock(&pm->lock);
}

#if defined(CONFIG_DEBUG_FS)

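/* Dump the active runlist IB contents to a debugfs seq_file. */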
int pm_debugfs_runlist(struct seq_file *m, void *data)
{
    struct packet_manager *pm = data;

    mutex_lock(&pm->lock);

    if (!pm->allocated) {
        seq_puts(m, "  No active runlist\n");
        goto out;
    }

    seq_hex_dump(m, "  ", DUMP_PREFIX_OFFSET, 32, 4,
             pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false);

out:
    mutex_unlock(&pm->lock);
    return 0;
}

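/*
 * Deliberately hang the HWS by submitting a garbage-filled (0x55)
 * packet on the HIQ, a debugfs aid for exercising hang handling.
 */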
int pm_debugfs_hang_hws(struct packet_manager *pm)
{
    uint32_t *buffer, size;
    int r = 0;

    if (!pm->priv_queue)
        return -EAGAIN;

    size = pm->pmf->query_status_size;
    mutex_lock(&pm->lock);
    kq_acquire_packet_buffer(pm->priv_queue,
            size / sizeof(uint32_t), (unsigned int **)&buffer);
    if (!buffer) {
        pr_err("Failed to allocate buffer on kernel queue\n");
        r = -ENOMEM;
        goto out;
    }
    memset(buffer, 0x55, size);
    kq_submit_packet(pm->priv_queue);

    pr_info("Submitting %x %x %x %x %x %x %x to HIQ to hang the HWS.",
        buffer[0], buffer[1], buffer[2], buffer[3],
        buffer[4], buffer[5], buffer[6]);
out:
    mutex_unlock(&pm->lock);
    return r;
}


#endif