Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0 OR MIT
0002 /*
0003  * Copyright 2020-2022 Advanced Micro Devices, Inc.
0004  *
0005  * Permission is hereby granted, free of charge, to any person obtaining a
0006  * copy of this software and associated documentation files (the "Software"),
0007  * to deal in the Software without restriction, including without limitation
0008  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
0009  * and/or sell copies of the Software, and to permit persons to whom the
0010  * Software is furnished to do so, subject to the following conditions:
0011  *
0012  * The above copyright notice and this permission notice shall be included in
0013  * all copies or substantial portions of the Software.
0014  *
0015  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
0016  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
0017  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
0018  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
0019  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
0020  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
0021  * OTHER DEALINGS IN THE SOFTWARE.
0022  */
0023 
0024 #include <linux/poll.h>
0025 #include <linux/wait.h>
0026 #include <linux/anon_inodes.h>
0027 #include <uapi/linux/kfd_ioctl.h>
0028 #include "amdgpu.h"
0029 #include "amdgpu_vm.h"
0030 #include "kfd_priv.h"
0031 #include "kfd_smi_events.h"
0032 
/*
 * Per-open-fd state for one SMI (system management interface) event
 * listener.  Allocated in kfd_smi_event_open(), linked into
 * dev->smi_clients (RCU list), and freed via call_rcu() on release.
 */
struct kfd_smi_client {
	struct list_head list;		/* entry in kfd_dev.smi_clients, RCU-protected */
	struct kfifo fifo;		/* queued event messages (MAX_KFIFO_SIZE bytes) */
	wait_queue_head_t wait_queue;	/* readers/pollers sleep here until data arrives */
	/* events enabled */
	uint64_t events;		/* KFD_SMI_EVENT_* bitmask, updated via write(2) */
	struct kfd_dev *dev;		/* device this client listens on */
	spinlock_t lock;		/* serializes access to fifo */
	struct rcu_head rcu;		/* for deferred free after list_del_rcu() */
	pid_t pid;			/* tgid of the opening process */
	bool suser;			/* opener had CAP_SYS_ADMIN at open time */
};
0045 
/* Capacity in bytes of each client's event kfifo */
#define MAX_KFIFO_SIZE  1024

static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
static ssize_t kfd_smi_ev_write(struct file *, const char __user *, size_t,
                loff_t *);
static int kfd_smi_ev_release(struct inode *, struct file *);

/* Name shown for the anonymous inode backing an SMI event fd */
static const char kfd_smi_name[] = "kfd_smi_ev";
0055 
/* File operations for the anonymous inode handed out by kfd_smi_event_open() */
static const struct file_operations kfd_smi_ev_fops = {
	.owner = THIS_MODULE,
	.poll = kfd_smi_ev_poll,
	.read = kfd_smi_ev_read,
	.write = kfd_smi_ev_write,
	.release = kfd_smi_ev_release
};
0063 
0064 static __poll_t kfd_smi_ev_poll(struct file *filep,
0065                 struct poll_table_struct *wait)
0066 {
0067     struct kfd_smi_client *client = filep->private_data;
0068     __poll_t mask = 0;
0069 
0070     poll_wait(filep, &client->wait_queue, wait);
0071 
0072     spin_lock(&client->lock);
0073     if (!kfifo_is_empty(&client->fifo))
0074         mask = EPOLLIN | EPOLLRDNORM;
0075     spin_unlock(&client->lock);
0076 
0077     return mask;
0078 }
0079 
/*
 * read(2) handler: drain up to @size bytes of queued event messages into
 * the user buffer.  Non-blocking semantics: returns -EAGAIN when the fifo
 * is empty rather than sleeping.
 */
static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
			       size_t size, loff_t *offset)
{
	int ret;
	size_t to_copy;
	struct kfd_smi_client *client = filep->private_data;
	unsigned char *buf;

	/* never stage more than the fifo can possibly hold */
	size = min_t(size_t, size, MAX_KFIFO_SIZE);
	buf = kmalloc(size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/* kfifo_to_user can sleep so we can't use spinlock protection around
	 * it. Instead, we kfifo out as spinlocked then copy them to the user.
	 */
	spin_lock(&client->lock);
	to_copy = kfifo_len(&client->fifo);
	if (!to_copy) {
		spin_unlock(&client->lock);
		ret = -EAGAIN;
		goto ret_err;
	}
	to_copy = min(size, to_copy);
	ret = kfifo_out(&client->fifo, buf, to_copy);
	spin_unlock(&client->lock);
	if (ret <= 0) {
		ret = -EAGAIN;
		goto ret_err;
	}

	/* NOTE: bytes already popped from the fifo are lost if this faults */
	ret = copy_to_user(user, buf, to_copy);
	if (ret) {
		ret = -EFAULT;
		goto ret_err;
	}

	kfree(buf);
	return to_copy;

ret_err:
	kfree(buf);
	return ret;
}
0124 
0125 static ssize_t kfd_smi_ev_write(struct file *filep, const char __user *user,
0126                 size_t size, loff_t *offset)
0127 {
0128     struct kfd_smi_client *client = filep->private_data;
0129     uint64_t events;
0130 
0131     if (!access_ok(user, size) || size < sizeof(events))
0132         return -EFAULT;
0133     if (copy_from_user(&events, user, sizeof(events)))
0134         return -EFAULT;
0135 
0136     WRITE_ONCE(client->events, events);
0137 
0138     return sizeof(events);
0139 }
0140 
0141 static void kfd_smi_ev_client_free(struct rcu_head *p)
0142 {
0143     struct kfd_smi_client *ev = container_of(p, struct kfd_smi_client, rcu);
0144 
0145     kfifo_free(&ev->fifo);
0146     kfree(ev);
0147 }
0148 
/*
 * release handler: unlink the client from the device list, then free it
 * after the RCU grace period so concurrent add_event_to_kfifo() walkers
 * never touch freed memory.
 */
static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
{
	struct kfd_smi_client *client = filep->private_data;
	struct kfd_dev *dev = client->dev;

	spin_lock(&dev->smi_lock);
	list_del_rcu(&client->list);
	spin_unlock(&dev->smi_lock);

	/* deferred free; see kfd_smi_ev_client_free() */
	call_rcu(&client->rcu, kfd_smi_ev_client_free);
	return 0;
}
0161 
0162 static bool kfd_smi_ev_enabled(pid_t pid, struct kfd_smi_client *client,
0163                    unsigned int event)
0164 {
0165     uint64_t all = KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_ALL_PROCESS);
0166     uint64_t events = READ_ONCE(client->events);
0167 
0168     if (pid && client->pid != pid && !(client->suser && (events & all)))
0169         return false;
0170 
0171     return events & KFD_SMI_EVENT_MASK_FROM_INDEX(event);
0172 }
0173 
/*
 * Deliver one formatted event message (@event_msg, @len bytes) to every
 * client on @dev that has @smi_event enabled.  The client list is walked
 * under rcu_read_lock() so this can run concurrently with open/release;
 * each client's fifo is protected by its own spinlock.
 */
static void add_event_to_kfifo(pid_t pid, struct kfd_dev *dev,
			       unsigned int smi_event, char *event_msg, int len)
{
	struct kfd_smi_client *client;

	rcu_read_lock();

	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
		if (!kfd_smi_ev_enabled(pid, client, smi_event))
			continue;
		spin_lock(&client->lock);
		/* drop the whole message rather than truncating it */
		if (kfifo_avail(&client->fifo) >= len) {
			kfifo_in(&client->fifo, event_msg, len);
			wake_up_all(&client->wait_queue);
		} else {
			pr_debug("smi_event(EventID: %u): no space left\n",
					smi_event);
		}
		spin_unlock(&client->lock);
	}

	rcu_read_unlock();
}
0197 
0198 __printf(4, 5)
0199 static void kfd_smi_event_add(pid_t pid, struct kfd_dev *dev,
0200                   unsigned int event, char *fmt, ...)
0201 {
0202     char fifo_in[KFD_SMI_EVENT_MSG_SIZE];
0203     int len;
0204     va_list args;
0205 
0206     if (list_empty(&dev->smi_clients))
0207         return;
0208 
0209     len = snprintf(fifo_in, sizeof(fifo_in), "%x ", event);
0210 
0211     va_start(args, fmt);
0212     len += vsnprintf(fifo_in + len, sizeof(fifo_in) - len, fmt, args);
0213     va_end(args);
0214 
0215     add_event_to_kfifo(pid, dev, event, fifo_in, len);
0216 }
0217 
0218 void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset)
0219 {
0220     unsigned int event;
0221 
0222     if (post_reset) {
0223         event = KFD_SMI_EVENT_GPU_POST_RESET;
0224     } else {
0225         event = KFD_SMI_EVENT_GPU_PRE_RESET;
0226         ++(dev->reset_seq_num);
0227     }
0228     kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num);
0229 }
0230 
0231 void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
0232                          uint64_t throttle_bitmask)
0233 {
0234     kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n",
0235               throttle_bitmask,
0236               amdgpu_dpm_get_thermal_throttling_counter(dev->adev));
0237 }
0238 
0239 void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
0240 {
0241     struct amdgpu_task_info task_info;
0242 
0243     memset(&task_info, 0, sizeof(struct amdgpu_task_info));
0244     amdgpu_vm_get_task_info(dev->adev, pasid, &task_info);
0245     /* Report VM faults from user applications, not retry from kernel */
0246     if (!task_info.pid)
0247         return;
0248 
0249     kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
0250               task_info.pid, task_info.task_name);
0251 }
0252 
0253 void kfd_smi_event_page_fault_start(struct kfd_dev *dev, pid_t pid,
0254                     unsigned long address, bool write_fault,
0255                     ktime_t ts)
0256 {
0257     kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_START,
0258               "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
0259               address, dev->id, write_fault ? 'W' : 'R');
0260 }
0261 
0262 void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t pid,
0263                   unsigned long address, bool migration)
0264 {
0265     kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_END,
0266               "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(),
0267               pid, address, dev->id, migration ? 'M' : 'U');
0268 }
0269 
0270 void kfd_smi_event_migration_start(struct kfd_dev *dev, pid_t pid,
0271                    unsigned long start, unsigned long end,
0272                    uint32_t from, uint32_t to,
0273                    uint32_t prefetch_loc, uint32_t preferred_loc,
0274                    uint32_t trigger)
0275 {
0276     kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_MIGRATE_START,
0277               "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n",
0278               ktime_get_boottime_ns(), pid, start, end - start,
0279               from, to, prefetch_loc, preferred_loc, trigger);
0280 }
0281 
0282 void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid,
0283                  unsigned long start, unsigned long end,
0284                  uint32_t from, uint32_t to, uint32_t trigger)
0285 {
0286     kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_MIGRATE_END,
0287               "%lld -%d @%lx(%lx) %x->%x %d\n",
0288               ktime_get_boottime_ns(), pid, start, end - start,
0289               from, to, trigger);
0290 }
0291 
0292 void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
0293                   uint32_t trigger)
0294 {
0295     kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_QUEUE_EVICTION,
0296               "%lld -%d %x %d\n", ktime_get_boottime_ns(), pid,
0297               dev->id, trigger);
0298 }
0299 
0300 void kfd_smi_event_queue_restore(struct kfd_dev *dev, pid_t pid)
0301 {
0302     kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_QUEUE_RESTORE,
0303               "%lld -%d %x\n", ktime_get_boottime_ns(), pid,
0304               dev->id);
0305 }
0306 
/*
 * Report a rescheduled ('R' suffix) queue-restore for every device the
 * process owning @mm has a per-device data entry on.  Takes a temporary
 * process reference via kfd_lookup_process_by_mm() and drops it with
 * kfd_unref_process() before returning.
 */
void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
{
	struct kfd_process *p;
	int i;

	p = kfd_lookup_process_by_mm(mm);
	if (!p)
		return;

	for (i = 0; i < p->n_pdds; i++) {
		struct kfd_process_device *pdd = p->pdds[i];

		kfd_smi_event_add(p->lead_thread->pid, pdd->dev,
				  KFD_SMI_EVENT_QUEUE_RESTORE,
				  "%lld -%d %x %c\n", ktime_get_boottime_ns(),
				  p->lead_thread->pid, pdd->dev->id, 'R');
	}
	kfd_unref_process(p);
}
0326 
0327 void kfd_smi_event_unmap_from_gpu(struct kfd_dev *dev, pid_t pid,
0328                   unsigned long address, unsigned long last,
0329                   uint32_t trigger)
0330 {
0331     kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_UNMAP_FROM_GPU,
0332               "%lld -%d @%lx(%lx) %x %d\n", ktime_get_boottime_ns(),
0333               pid, address, last - address + 1, dev->id, trigger);
0334 }
0335 
/*
 * Create a new SMI event client on @dev and return an anonymous-inode fd
 * for it in *@fd.  The client starts with no events enabled; userspace
 * enables them by writing a KFD_SMI_EVENT_* mask to the fd.
 *
 * Returns 0 on success or a negative errno.
 */
int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
{
	struct kfd_smi_client *client;
	int ret;

	client = kzalloc(sizeof(struct kfd_smi_client), GFP_KERNEL);
	if (!client)
		return -ENOMEM;
	INIT_LIST_HEAD(&client->list);

	ret = kfifo_alloc(&client->fifo, MAX_KFIFO_SIZE, GFP_KERNEL);
	if (ret) {
		kfree(client);
		return ret;
	}

	init_waitqueue_head(&client->wait_queue);
	spin_lock_init(&client->lock);
	client->events = 0;
	client->dev = dev;
	client->pid = current->tgid;
	/* privileged openers may later subscribe to all-process events */
	client->suser = capable(CAP_SYS_ADMIN);

	spin_lock(&dev->smi_lock);
	list_add_rcu(&client->list, &dev->smi_clients);
	spin_unlock(&dev->smi_lock);

	ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, (void *)client,
			       O_RDWR);
	if (ret < 0) {
		spin_lock(&dev->smi_lock);
		list_del_rcu(&client->list);
		spin_unlock(&dev->smi_lock);

		/* wait out concurrent RCU list walkers before freeing */
		synchronize_rcu();

		kfifo_free(&client->fifo);
		kfree(client);
		return ret;
	}
	*fd = ret;

	return 0;
}