0001 // SPDX-License-Identifier: GPL-2.0 OR MIT
0002 /*
0003  * Copyright 2014-2022 Advanced Micro Devices, Inc.
0004  *
0005  * Permission is hereby granted, free of charge, to any person obtaining a
0006  * copy of this software and associated documentation files (the "Software"),
0007  * to deal in the Software without restriction, including without limitation
0008  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
0009  * and/or sell copies of the Software, and to permit persons to whom the
0010  * Software is furnished to do so, subject to the following conditions:
0011  *
0012  * The above copyright notice and this permission notice shall be included in
0013  * all copies or substantial portions of the Software.
0014  *
0015  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
0016  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
0017  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
0018  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
0019  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
0020  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
0021  * OTHER DEALINGS IN THE SOFTWARE.
0022  */
0023 
0024 #include <linux/mm_types.h>
0025 #include <linux/slab.h>
0026 #include <linux/types.h>
0027 #include <linux/sched/signal.h>
0028 #include <linux/sched/mm.h>
0029 #include <linux/uaccess.h>
0030 #include <linux/mman.h>
0031 #include <linux/memory.h>
0032 #include "kfd_priv.h"
0033 #include "kfd_events.h"
0034 #include "kfd_iommu.h"
0035 #include <linux/device.h>
0036 
0037 /*
0038  * Wrapper around wait_queue_entry_t
0039  */
0040 struct kfd_event_waiter {
0041     wait_queue_entry_t wait;
0042     struct kfd_event *event; /* Event to wait for */
0043     bool activated;      /* Becomes true when event is signaled */
0044 };
0045 
0046 /*
0047  * Each signal event needs a 64-bit signal slot where the signaler will write
0048  * a 1 before sending an interrupt. (This is needed because some interrupts
0049  * do not contain enough spare data bits to identify an event.)
0050  * We get whole pages and map them to the process VA.
0051  * Individual signal events use their event_id as slot index.
0052  */
0053 struct kfd_signal_page {
0054     uint64_t *kernel_address;
0055     uint64_t __user *user_address;
0056     bool need_to_free_pages;
0057 };
0058 
0059 static uint64_t *page_slots(struct kfd_signal_page *page)
0060 {
0061     return page->kernel_address;
0062 }
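/*
 * A minimal sketch (not part of the driver) of how a signaler is expected
 * to use a slot: writing any value other than UNSIGNALED_EVENT_SLOT into
 * slots[event_id] marks the event as signaled, and the interrupt that
 * follows carries (part of) the event ID so the handler further below can
 * find it:
 *
 *	slots[event_id] = 1;		-- mark the slot signaled
 *	...send interrupt carrying event_id...
 */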
0063 
0064 static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p)
0065 {
0066     void *backing_store;
0067     struct kfd_signal_page *page;
0068 
0069     page = kzalloc(sizeof(*page), GFP_KERNEL);
0070     if (!page)
0071         return NULL;
0072 
0073     backing_store = (void *) __get_free_pages(GFP_KERNEL,
0074                     get_order(KFD_SIGNAL_EVENT_LIMIT * 8));
0075     if (!backing_store)
0076         goto fail_alloc_signal_store;
0077 
0078     /* Initialize all events to unsignaled */
0079     memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT,
0080            KFD_SIGNAL_EVENT_LIMIT * 8);
0081 
0082     page->kernel_address = backing_store;
0083     page->need_to_free_pages = true;
0084     pr_debug("Allocated new event signal page at %p, for process %p\n",
0085             page, p);
0086 
0087     return page;
0088 
0089 fail_alloc_signal_store:
0090     kfree(page);
0091     return NULL;
0092 }
0093 
0094 static int allocate_event_notification_slot(struct kfd_process *p,
0095                         struct kfd_event *ev,
0096                         const int *restore_id)
0097 {
0098     int id;
0099 
0100     if (!p->signal_page) {
0101         p->signal_page = allocate_signal_page(p);
0102         if (!p->signal_page)
0103             return -ENOMEM;
0104         /* Oldest user mode expects 256 event slots */
0105         p->signal_mapped_size = 256*8;
0106     }
0107 
0108     if (restore_id) {
0109         id = idr_alloc(&p->event_idr, ev, *restore_id, *restore_id + 1,
0110                 GFP_KERNEL);
0111     } else {
0112         /*
0113          * Compatibility with old user mode: Only use signal slots
0114          * that user mode has mapped, which may be fewer than
0115          * KFD_SIGNAL_EVENT_LIMIT. This also allows future increase
0116          * of the event limit without breaking user mode.
0117          */
0118         id = idr_alloc(&p->event_idr, ev, 0, p->signal_mapped_size / 8,
0119                 GFP_KERNEL);
0120     }
0121     if (id < 0)
0122         return id;
0123 
0124     ev->event_id = id;
0125     page_slots(p->signal_page)[id] = UNSIGNALED_EVENT_SLOT;
0126 
0127     return 0;
0128 }
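/*
 * Worked example of the slot arithmetic above: before user mode has
 * mmapped the event page, signal_mapped_size is 256 * 8 = 2048 bytes,
 * so idr_alloc() hands out signal event IDs in [0, 2048 / 8) = [0, 256).
 * Once user mode maps a larger range, signal_mapped_size / 8 grows,
 * up to KFD_SIGNAL_EVENT_LIMIT slots.
 */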
0129 
0130 /*
0131  * Assumes that p->event_mutex or the RCU read lock is held and of course that p is
0132  * not going away.
0133  */
0134 static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id)
0135 {
0136     return idr_find(&p->event_idr, id);
0137 }
0138 
0139 /**
0140  * lookup_signaled_event_by_partial_id - Lookup signaled event from partial ID
0141  * @p:     Pointer to struct kfd_process
0142  * @id:    ID to look up
0143  * @bits:  Number of valid bits in @id
0144  *
0145  * Finds the first signaled event with a matching partial ID. If no
0146  * matching signaled event is found, returns NULL. In that case the
0147  * caller should assume that the partial ID is invalid and do an
0148  * exhaustive search of all signaled events.
0149  *
0150  * If multiple events with the same partial ID signal at the same
0151  * time, they will be found one interrupt at a time, not necessarily
0152  * in the same order the interrupts occurred. As long as the number of
0153  * interrupts is correct, all signaled events will be seen by the
0154  * driver.
0155  */
0156 static struct kfd_event *lookup_signaled_event_by_partial_id(
0157     struct kfd_process *p, uint32_t id, uint32_t bits)
0158 {
0159     struct kfd_event *ev;
0160 
0161     if (!p->signal_page || id >= KFD_SIGNAL_EVENT_LIMIT)
0162         return NULL;
0163 
0164     /* Fast path for the common case that @id is not a partial ID
0165      * and we only need a single lookup.
0166      */
0167     if (bits > 31 || (1U << bits) >= KFD_SIGNAL_EVENT_LIMIT) {
0168         if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT)
0169             return NULL;
0170 
0171         return idr_find(&p->event_idr, id);
0172     }
0173 
0174     /* General case for partial IDs: Iterate over all matching IDs
0175      * and find the first one that has signaled.
0176      */
0177     for (ev = NULL; id < KFD_SIGNAL_EVENT_LIMIT && !ev; id += 1U << bits) {
0178         if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT)
0179             continue;
0180 
0181         ev = idr_find(&p->event_idr, id);
0182     }
0183 
0184     return ev;
0185 }
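/*
 * Worked example of the partial-ID iteration above: with bits = 8 and
 * id = 0x12, the candidate slots are 0x012, 0x112, 0x212, ... visited in
 * steps of 1U << bits = 0x100 until KFD_SIGNAL_EVENT_LIMIT, and the first
 * candidate whose slot is not UNSIGNALED_EVENT_SLOT is returned.
 */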
0186 
0187 static int create_signal_event(struct file *devkfd, struct kfd_process *p,
0188                 struct kfd_event *ev, const int *restore_id)
0189 {
0190     int ret;
0191 
0192     if (p->signal_mapped_size &&
0193         p->signal_event_count == p->signal_mapped_size / 8) {
0194         if (!p->signal_event_limit_reached) {
0195             pr_debug("Signal event wasn't created because limit was reached\n");
0196             p->signal_event_limit_reached = true;
0197         }
0198         return -ENOSPC;
0199     }
0200 
0201     ret = allocate_event_notification_slot(p, ev, restore_id);
0202     if (ret) {
0203         pr_warn("Signal event wasn't created because out of kernel memory\n");
0204         return ret;
0205     }
0206 
0207     p->signal_event_count++;
0208 
0209     ev->user_signal_address = &p->signal_page->user_address[ev->event_id];
0210     pr_debug("Signal event number %zu created with id %d, address %p\n",
0211             p->signal_event_count, ev->event_id,
0212             ev->user_signal_address);
0213 
0214     return 0;
0215 }
0216 
0217 static int create_other_event(struct kfd_process *p, struct kfd_event *ev, const int *restore_id)
0218 {
0219     int id;
0220 
0221     if (restore_id)
0222         id = idr_alloc(&p->event_idr, ev, *restore_id, *restore_id + 1,
0223             GFP_KERNEL);
0224     else
0225         /* Cast KFD_LAST_NONSIGNAL_EVENT to uint32_t. This allows an
0226          * intentional integer overflow to -1 without a compiler
0227          * warning. idr_alloc treats a negative value as "maximum
0228          * signed integer".
0229          */
0230         id = idr_alloc(&p->event_idr, ev, KFD_FIRST_NONSIGNAL_EVENT_ID,
0231                 (uint32_t)KFD_LAST_NONSIGNAL_EVENT_ID + 1,
0232                 GFP_KERNEL);
0233 
0234     if (id < 0)
0235         return id;
0236     ev->event_id = id;
0237 
0238     return 0;
0239 }
0240 
0241 int kfd_event_init_process(struct kfd_process *p)
0242 {
0243     int id;
0244 
0245     mutex_init(&p->event_mutex);
0246     idr_init(&p->event_idr);
0247     p->signal_page = NULL;
0248     p->signal_event_count = 1;
0249     /* Allocate event ID 0. It is used for a fast path to ignore bogus events
0250      * that are sent by the CP without a context ID
0251      */
0252     id = idr_alloc(&p->event_idr, NULL, 0, 1, GFP_KERNEL);
0253     if (id < 0) {
0254         idr_destroy(&p->event_idr);
0255         mutex_destroy(&p->event_mutex);
0256         return id;
0257     }
0258     return 0;
0259 }
0260 
0261 static void destroy_event(struct kfd_process *p, struct kfd_event *ev)
0262 {
0263     struct kfd_event_waiter *waiter;
0264 
0265     /* Wake up pending waiters. They will return failure */
0266     spin_lock(&ev->lock);
0267     list_for_each_entry(waiter, &ev->wq.head, wait.entry)
0268         WRITE_ONCE(waiter->event, NULL);
0269     wake_up_all(&ev->wq);
0270     spin_unlock(&ev->lock);
0271 
0272     if (ev->type == KFD_EVENT_TYPE_SIGNAL ||
0273         ev->type == KFD_EVENT_TYPE_DEBUG)
0274         p->signal_event_count--;
0275 
0276     idr_remove(&p->event_idr, ev->event_id);
0277     kfree_rcu(ev, rcu);
0278 }
0279 
0280 static void destroy_events(struct kfd_process *p)
0281 {
0282     struct kfd_event *ev;
0283     uint32_t id;
0284 
0285     idr_for_each_entry(&p->event_idr, ev, id)
0286         if (ev)
0287             destroy_event(p, ev);
0288     idr_destroy(&p->event_idr);
0289     mutex_destroy(&p->event_mutex);
0290 }
0291 
0292 /*
0293  * We assume that the process is being destroyed and there is no need to
0294  * unmap the pages or keep bookkeeping data in order.
0295  */
0296 static void shutdown_signal_page(struct kfd_process *p)
0297 {
0298     struct kfd_signal_page *page = p->signal_page;
0299 
0300     if (page) {
0301         if (page->need_to_free_pages)
0302             free_pages((unsigned long)page->kernel_address,
0303                    get_order(KFD_SIGNAL_EVENT_LIMIT * 8));
0304         kfree(page);
0305     }
0306 }
0307 
0308 void kfd_event_free_process(struct kfd_process *p)
0309 {
0310     destroy_events(p);
0311     shutdown_signal_page(p);
0312 }
0313 
0314 static bool event_can_be_gpu_signaled(const struct kfd_event *ev)
0315 {
0316     return ev->type == KFD_EVENT_TYPE_SIGNAL ||
0317                     ev->type == KFD_EVENT_TYPE_DEBUG;
0318 }
0319 
0320 static bool event_can_be_cpu_signaled(const struct kfd_event *ev)
0321 {
0322     return ev->type == KFD_EVENT_TYPE_SIGNAL;
0323 }
0324 
0325 static int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
0326                uint64_t size, uint64_t user_handle)
0327 {
0328     struct kfd_signal_page *page;
0329 
0330     if (p->signal_page)
0331         return -EBUSY;
0332 
0333     page = kzalloc(sizeof(*page), GFP_KERNEL);
0334     if (!page)
0335         return -ENOMEM;
0336 
0337     /* Initialize all events to unsignaled */
0338     memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT,
0339            KFD_SIGNAL_EVENT_LIMIT * 8);
0340 
0341     page->kernel_address = kernel_address;
0342 
0343     p->signal_page = page;
0344     p->signal_mapped_size = size;
0345     p->signal_handle = user_handle;
0346     return 0;
0347 }
0348 
0349 int kfd_kmap_event_page(struct kfd_process *p, uint64_t event_page_offset)
0350 {
0351     struct kfd_dev *kfd;
0352     struct kfd_process_device *pdd;
0353     void *mem, *kern_addr;
0354     uint64_t size;
0355     int err = 0;
0356 
0357     if (p->signal_page) {
0358         pr_err("Event page is already set\n");
0359         return -EINVAL;
0360     }
0361 
0362     pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(event_page_offset));
0363     if (!pdd) {
0364         pr_err("Getting device by id failed in %s\n", __func__);
0365         return -EINVAL;
0366     }
0367     kfd = pdd->dev;
0368 
0369     pdd = kfd_bind_process_to_device(kfd, p);
0370     if (IS_ERR(pdd))
0371         return PTR_ERR(pdd);
0372 
0373     mem = kfd_process_device_translate_handle(pdd,
0374             GET_IDR_HANDLE(event_page_offset));
0375     if (!mem) {
0376         pr_err("Can't find BO, offset is 0x%llx\n", event_page_offset);
0377         return -EINVAL;
0378     }
0379 
0380     err = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(mem, &kern_addr, &size);
0381     if (err) {
0382         pr_err("Failed to map event page to kernel\n");
0383         return err;
0384     }
0385 
0386     err = kfd_event_page_set(p, kern_addr, size, event_page_offset);
0387     if (err) {
0388         pr_err("Failed to set event page\n");
0389         amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(mem);
0390         return err;
0391     }
0392     return err;
0393 }
0394 
0395 int kfd_event_create(struct file *devkfd, struct kfd_process *p,
0396              uint32_t event_type, bool auto_reset, uint32_t node_id,
0397              uint32_t *event_id, uint32_t *event_trigger_data,
0398              uint64_t *event_page_offset, uint32_t *event_slot_index)
0399 {
0400     int ret = 0;
0401     struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL);
0402 
0403     if (!ev)
0404         return -ENOMEM;
0405 
0406     ev->type = event_type;
0407     ev->auto_reset = auto_reset;
0408     ev->signaled = false;
0409 
0410     spin_lock_init(&ev->lock);
0411     init_waitqueue_head(&ev->wq);
0412 
0413     *event_page_offset = 0;
0414 
0415     mutex_lock(&p->event_mutex);
0416 
0417     switch (event_type) {
0418     case KFD_EVENT_TYPE_SIGNAL:
0419     case KFD_EVENT_TYPE_DEBUG:
0420         ret = create_signal_event(devkfd, p, ev, NULL);
0421         if (!ret) {
0422             *event_page_offset = KFD_MMAP_TYPE_EVENTS;
0423             *event_slot_index = ev->event_id;
0424         }
0425         break;
0426     default:
0427         ret = create_other_event(p, ev, NULL);
0428         break;
0429     }
0430 
0431     if (!ret) {
0432         *event_id = ev->event_id;
0433         *event_trigger_data = ev->event_id;
0434     } else {
0435         kfree(ev);
0436     }
0437 
0438     mutex_unlock(&p->event_mutex);
0439 
0440     return ret;
0441 }
0442 
0443 int kfd_criu_restore_event(struct file *devkfd,
0444                struct kfd_process *p,
0445                uint8_t __user *user_priv_ptr,
0446                uint64_t *priv_data_offset,
0447                uint64_t max_priv_data_size)
0448 {
0449     struct kfd_criu_event_priv_data *ev_priv;
0450     struct kfd_event *ev = NULL;
0451     int ret = 0;
0452 
0453     ev_priv = kmalloc(sizeof(*ev_priv), GFP_KERNEL);
0454     if (!ev_priv)
0455         return -ENOMEM;
0456 
0457     ev = kzalloc(sizeof(*ev), GFP_KERNEL);
0458     if (!ev) {
0459         ret = -ENOMEM;
0460         goto exit;
0461     }
0462 
0463     if (*priv_data_offset + sizeof(*ev_priv) > max_priv_data_size) {
0464         ret = -EINVAL;
0465         goto exit;
0466     }
0467 
0468     ret = copy_from_user(ev_priv, user_priv_ptr + *priv_data_offset, sizeof(*ev_priv));
0469     if (ret) {
0470         ret = -EFAULT;
0471         goto exit;
0472     }
0473     *priv_data_offset += sizeof(*ev_priv);
0474 
0475     if (ev_priv->user_handle) {
0476         ret = kfd_kmap_event_page(p, ev_priv->user_handle);
0477         if (ret)
0478             goto exit;
0479     }
0480 
0481     ev->type = ev_priv->type;
0482     ev->auto_reset = ev_priv->auto_reset;
0483     ev->signaled = ev_priv->signaled;
0484 
0485     spin_lock_init(&ev->lock);
0486     init_waitqueue_head(&ev->wq);
0487 
0488     mutex_lock(&p->event_mutex);
0489     switch (ev->type) {
0490     case KFD_EVENT_TYPE_SIGNAL:
0491     case KFD_EVENT_TYPE_DEBUG:
0492         ret = create_signal_event(devkfd, p, ev, &ev_priv->event_id);
0493         break;
0494     case KFD_EVENT_TYPE_MEMORY:
0495         memcpy(&ev->memory_exception_data,
0496             &ev_priv->memory_exception_data,
0497             sizeof(struct kfd_hsa_memory_exception_data));
0498 
0499         ret = create_other_event(p, ev, &ev_priv->event_id);
0500         break;
0501     case KFD_EVENT_TYPE_HW_EXCEPTION:
0502         memcpy(&ev->hw_exception_data,
0503             &ev_priv->hw_exception_data,
0504             sizeof(struct kfd_hsa_hw_exception_data));
0505 
0506         ret = create_other_event(p, ev, &ev_priv->event_id);
0507         break;
0508     }
0509 
0510 exit:
0511     if (ret)
0512         kfree(ev);
0513 
0514     kfree(ev_priv);
0515 
0516     mutex_unlock(&p->event_mutex);
0517 
0518     return ret;
0519 }
0520 
0521 int kfd_criu_checkpoint_events(struct kfd_process *p,
0522              uint8_t __user *user_priv_data,
0523              uint64_t *priv_data_offset)
0524 {
0525     struct kfd_criu_event_priv_data *ev_privs;
0526     int i = 0;
0527     int ret =  0;
0528     struct kfd_event *ev;
0529     uint32_t ev_id;
0530 
0531     uint32_t num_events = kfd_get_num_events(p);
0532 
0533     if (!num_events)
0534         return 0;
0535 
0536     ev_privs = kvzalloc(num_events * sizeof(*ev_privs), GFP_KERNEL);
0537     if (!ev_privs)
0538         return -ENOMEM;
0539 
0540 
0541     idr_for_each_entry(&p->event_idr, ev, ev_id) {
0542         struct kfd_criu_event_priv_data *ev_priv;
0543 
0544         /*
0545          * Currently, all events have the same size of private_data, but the current ioctls
0546          * and the CRIU plugin support private_data of variable sizes
0547          */
0548         ev_priv = &ev_privs[i];
0549 
0550         ev_priv->object_type = KFD_CRIU_OBJECT_TYPE_EVENT;
0551 
0552         /* We store the user_handle with the first event */
0553         if (i == 0 && p->signal_page)
0554             ev_priv->user_handle = p->signal_handle;
0555 
0556         ev_priv->event_id = ev->event_id;
0557         ev_priv->auto_reset = ev->auto_reset;
0558         ev_priv->type = ev->type;
0559         ev_priv->signaled = ev->signaled;
0560 
0561         if (ev_priv->type == KFD_EVENT_TYPE_MEMORY)
0562             memcpy(&ev_priv->memory_exception_data,
0563                 &ev->memory_exception_data,
0564                 sizeof(struct kfd_hsa_memory_exception_data));
0565         else if (ev_priv->type == KFD_EVENT_TYPE_HW_EXCEPTION)
0566             memcpy(&ev_priv->hw_exception_data,
0567                 &ev->hw_exception_data,
0568                 sizeof(struct kfd_hsa_hw_exception_data));
0569 
0570         pr_debug("Checkpointed event[%d] id = 0x%08x auto_reset = %x type = %x signaled = %x\n",
0571               i,
0572               ev_priv->event_id,
0573               ev_priv->auto_reset,
0574               ev_priv->type,
0575               ev_priv->signaled);
0576         i++;
0577     }
0578 
0579     ret = copy_to_user(user_priv_data + *priv_data_offset,
0580                ev_privs, num_events * sizeof(*ev_privs));
0581     if (ret) {
0582         pr_err("Failed to copy events priv to user\n");
0583         ret = -EFAULT;
0584     }
0585 
0586     *priv_data_offset += num_events * sizeof(*ev_privs);
0587 
0588     kvfree(ev_privs);
0589     return ret;
0590 }
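/*
 * Layout produced above (a sketch): the checkpoint data copied to
 * user_priv_data + *priv_data_offset is a packed array of num_events
 * struct kfd_criu_event_priv_data entries. Only ev_privs[0].user_handle
 * carries the signal-page handle, which kfd_criu_restore_event() later
 * feeds back into kfd_kmap_event_page() on restore.
 */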
0591 
0592 int kfd_get_num_events(struct kfd_process *p)
0593 {
0594     struct kfd_event *ev;
0595     uint32_t id;
0596     u32 num_events = 0;
0597 
0598     idr_for_each_entry(&p->event_idr, ev, id)
0599         num_events++;
0600 
0601     return num_events;
0602 }
0603 
0604 /* Assumes that p is current. */
0605 int kfd_event_destroy(struct kfd_process *p, uint32_t event_id)
0606 {
0607     struct kfd_event *ev;
0608     int ret = 0;
0609 
0610     mutex_lock(&p->event_mutex);
0611 
0612     ev = lookup_event_by_id(p, event_id);
0613 
0614     if (ev)
0615         destroy_event(p, ev);
0616     else
0617         ret = -EINVAL;
0618 
0619     mutex_unlock(&p->event_mutex);
0620     return ret;
0621 }
0622 
0623 static void set_event(struct kfd_event *ev)
0624 {
0625     struct kfd_event_waiter *waiter;
0626 
0627     /* Auto reset if the list is non-empty and we're waking
0628      * someone. waitqueue_active is safe here because we're
0629      * protected by the ev->lock, which is also held when
0630      * updating the wait queues in kfd_wait_on_events.
0631      */
0632     ev->signaled = !ev->auto_reset || !waitqueue_active(&ev->wq);
0633 
0634     list_for_each_entry(waiter, &ev->wq.head, wait.entry)
0635         WRITE_ONCE(waiter->activated, true);
0636 
0637     wake_up_all(&ev->wq);
0638 }
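/*
 * Summary of the auto-reset expression above:
 *
 *	auto_reset  waiters present  resulting ev->signaled
 *	false       no               true
 *	false       yes              true
 *	true        no               true
 *	true        yes              false
 *
 * i.e. an auto-reset event that actually wakes someone is consumed
 * immediately; the woken waiters still observe waiter->activated == true.
 */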
0639 
0640 /* Assumes that p is current. */
0641 int kfd_set_event(struct kfd_process *p, uint32_t event_id)
0642 {
0643     int ret = 0;
0644     struct kfd_event *ev;
0645 
0646     rcu_read_lock();
0647 
0648     ev = lookup_event_by_id(p, event_id);
0649     if (!ev) {
0650         ret = -EINVAL;
0651         goto unlock_rcu;
0652     }
0653     spin_lock(&ev->lock);
0654 
0655     if (event_can_be_cpu_signaled(ev))
0656         set_event(ev);
0657     else
0658         ret = -EINVAL;
0659 
0660     spin_unlock(&ev->lock);
0661 unlock_rcu:
0662     rcu_read_unlock();
0663     return ret;
0664 }
0665 
0666 static void reset_event(struct kfd_event *ev)
0667 {
0668     ev->signaled = false;
0669 }
0670 
0671 /* Assumes that p is current. */
0672 int kfd_reset_event(struct kfd_process *p, uint32_t event_id)
0673 {
0674     int ret = 0;
0675     struct kfd_event *ev;
0676 
0677     rcu_read_lock();
0678 
0679     ev = lookup_event_by_id(p, event_id);
0680     if (!ev) {
0681         ret = -EINVAL;
0682         goto unlock_rcu;
0683     }
0684     spin_lock(&ev->lock);
0685 
0686     if (event_can_be_cpu_signaled(ev))
0687         reset_event(ev);
0688     else
0689         ret = -EINVAL;
0690 
0691     spin_unlock(&ev->lock);
0692 unlock_rcu:
0693     rcu_read_unlock();
0694     return ret;
0695 
0696 }
0697 
0698 static void acknowledge_signal(struct kfd_process *p, struct kfd_event *ev)
0699 {
0700     WRITE_ONCE(page_slots(p->signal_page)[ev->event_id], UNSIGNALED_EVENT_SLOT);
0701 }
0702 
0703 static void set_event_from_interrupt(struct kfd_process *p,
0704                     struct kfd_event *ev)
0705 {
0706     if (ev && event_can_be_gpu_signaled(ev)) {
0707         acknowledge_signal(p, ev);
0708         spin_lock(&ev->lock);
0709         set_event(ev);
0710         spin_unlock(&ev->lock);
0711     }
0712 }
0713 
0714 void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
0715                 uint32_t valid_id_bits)
0716 {
0717     struct kfd_event *ev = NULL;
0718 
0719     /*
0720      * Because we are called from arbitrary context (workqueue) as opposed
0721      * to process context, kfd_process could attempt to exit while we are
0722      * running so the lookup function increments the process ref count.
0723      */
0724     struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
0725 
0726     if (!p)
0727         return; /* Presumably process exited. */
0728 
0729     rcu_read_lock();
0730 
0731     if (valid_id_bits)
0732         ev = lookup_signaled_event_by_partial_id(p, partial_id,
0733                              valid_id_bits);
0734     if (ev) {
0735         set_event_from_interrupt(p, ev);
0736     } else if (p->signal_page) {
0737         /*
0738          * Partial ID lookup failed. Assume that the event ID
0739          * in the interrupt payload was invalid and do an
0740          * exhaustive search of signaled events.
0741          */
0742         uint64_t *slots = page_slots(p->signal_page);
0743         uint32_t id;
0744 
0745         if (valid_id_bits)
0746             pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n",
0747                          partial_id, valid_id_bits);
0748 
0749         if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT / 64) {
0750             /* With relatively few events, it's faster to
0751              * iterate over the event IDR
0752              */
0753             idr_for_each_entry(&p->event_idr, ev, id) {
0754                 if (id >= KFD_SIGNAL_EVENT_LIMIT)
0755                     break;
0756 
0757                 if (READ_ONCE(slots[id]) != UNSIGNALED_EVENT_SLOT)
0758                     set_event_from_interrupt(p, ev);
0759             }
0760         } else {
0761             /* With relatively many events, it's faster to
0762              * iterate over the signal slots and lookup
0763              * only signaled events from the IDR.
0764              */
0765             for (id = 1; id < KFD_SIGNAL_EVENT_LIMIT; id++)
0766                 if (READ_ONCE(slots[id]) != UNSIGNALED_EVENT_SLOT) {
0767                     ev = lookup_event_by_id(p, id);
0768                     set_event_from_interrupt(p, ev);
0769                 }
0770         }
0771     }
0772 
0773     rcu_read_unlock();
0774     kfd_unref_process(p);
0775 }
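/*
 * Note on the fallback search above (a sketch of the cost tradeoff):
 * KFD_SIGNAL_EVENT_LIMIT / 64 is the crossover point between walking the
 * (small) event IDR and scanning every signal slot linearly. For example,
 * if the limit were 4096 slots, a process with fewer than 64 signal events
 * would iterate its IDR, while one with more would scan all 4096 slots and
 * look up only the signaled ones.
 */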
0776 
0777 static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)
0778 {
0779     struct kfd_event_waiter *event_waiters;
0780     uint32_t i;
0781 
0782     event_waiters = kmalloc_array(num_events,
0783                     sizeof(struct kfd_event_waiter),
0784                     GFP_KERNEL);
0785     if (!event_waiters)
0786         return NULL;
0787 
0788     for (i = 0; i < num_events; i++) {
0789         init_wait(&event_waiters[i].wait);
0790         event_waiters[i].activated = false;
0791     }
0792 
0793     return event_waiters;
0794 }
0795 
0796 static int init_event_waiter(struct kfd_process *p,
0797         struct kfd_event_waiter *waiter,
0798         uint32_t event_id)
0799 {
0800     struct kfd_event *ev = lookup_event_by_id(p, event_id);
0801 
0802     if (!ev)
0803         return -EINVAL;
0804 
0805     spin_lock(&ev->lock);
0806     waiter->event = ev;
0807     waiter->activated = ev->signaled;
0808     ev->signaled = ev->signaled && !ev->auto_reset;
0809     if (!waiter->activated)
0810         add_wait_queue(&ev->wq, &waiter->wait);
0811     spin_unlock(&ev->lock);
0812 
0813     return 0;
0814 }
0815 
0816 /* test_event_condition - Test condition of events being waited for
0817  * @all:           Return completion only if all events have signaled
0818  * @num_events:    Number of events to wait for
0819  * @event_waiters: Array of event waiters, one per event
0820  *
0821  * Returns KFD_IOC_WAIT_RESULT_COMPLETE if all (or one) event(s) have
0822  * signaled. Returns KFD_IOC_WAIT_RESULT_TIMEOUT if no (or not all)
0823  * events have signaled. Returns KFD_IOC_WAIT_RESULT_FAIL if any of
0824  * the events have been destroyed.
0825  */
0826 static uint32_t test_event_condition(bool all, uint32_t num_events,
0827                 struct kfd_event_waiter *event_waiters)
0828 {
0829     uint32_t i;
0830     uint32_t activated_count = 0;
0831 
0832     for (i = 0; i < num_events; i++) {
0833         if (!READ_ONCE(event_waiters[i].event))
0834             return KFD_IOC_WAIT_RESULT_FAIL;
0835 
0836         if (READ_ONCE(event_waiters[i].activated)) {
0837             if (!all)
0838                 return KFD_IOC_WAIT_RESULT_COMPLETE;
0839 
0840             activated_count++;
0841         }
0842     }
0843 
0844     return activated_count == num_events ?
0845         KFD_IOC_WAIT_RESULT_COMPLETE : KFD_IOC_WAIT_RESULT_TIMEOUT;
0846 }
0847 
0848 /*
0849  * Copy event specific data, if defined.
0850  * Currently only memory exception events have additional data to copy to user
0851  */
0852 static int copy_signaled_event_data(uint32_t num_events,
0853         struct kfd_event_waiter *event_waiters,
0854         struct kfd_event_data __user *data)
0855 {
0856     struct kfd_hsa_memory_exception_data *src;
0857     struct kfd_hsa_memory_exception_data __user *dst;
0858     struct kfd_event_waiter *waiter;
0859     struct kfd_event *event;
0860     uint32_t i;
0861 
0862     for (i = 0; i < num_events; i++) {
0863         waiter = &event_waiters[i];
0864         event = waiter->event;
0865         if (!event)
0866             return -EINVAL; /* event was destroyed */
0867         if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) {
0868             dst = &data[i].memory_exception_data;
0869             src = &event->memory_exception_data;
0870             if (copy_to_user(dst, src,
0871                 sizeof(struct kfd_hsa_memory_exception_data)))
0872                 return -EFAULT;
0873         }
0874     }
0875 
0876     return 0;
0877 }
0878 
0879 static long user_timeout_to_jiffies(uint32_t user_timeout_ms)
0880 {
0881     if (user_timeout_ms == KFD_EVENT_TIMEOUT_IMMEDIATE)
0882         return 0;
0883 
0884     if (user_timeout_ms == KFD_EVENT_TIMEOUT_INFINITE)
0885         return MAX_SCHEDULE_TIMEOUT;
0886 
0887     /*
0888      * msecs_to_jiffies interprets all values above 2^31-1 as infinite,
0889      * but we consider them finite.
0890      * This hack is wrong, but nobody is likely to notice.
0891      */
0892     user_timeout_ms = min_t(uint32_t, user_timeout_ms, 0x7FFFFFFF);
0893 
0894     return msecs_to_jiffies(user_timeout_ms) + 1;
0895 }
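/*
 * Worked example for the conversion above (assuming HZ = 250):
 * KFD_EVENT_TIMEOUT_IMMEDIATE maps to 0 jiffies (check once, don't sleep),
 * KFD_EVENT_TIMEOUT_INFINITE maps to MAX_SCHEDULE_TIMEOUT, and a plain
 * 1000 ms timeout becomes msecs_to_jiffies(1000) + 1 = 250 + 1 = 251
 * jiffies.
 */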
0896 
0897 static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters,
0898              bool undo_auto_reset)
0899 {
0900     uint32_t i;
0901 
0902     for (i = 0; i < num_events; i++)
0903         if (waiters[i].event) {
0904             spin_lock(&waiters[i].event->lock);
0905             remove_wait_queue(&waiters[i].event->wq,
0906                       &waiters[i].wait);
0907             if (undo_auto_reset && waiters[i].activated &&
0908                 waiters[i].event && waiters[i].event->auto_reset)
0909                 set_event(waiters[i].event);
0910             spin_unlock(&waiters[i].event->lock);
0911         }
0912 
0913     kfree(waiters);
0914 }
0915 
0916 int kfd_wait_on_events(struct kfd_process *p,
0917                uint32_t num_events, void __user *data,
0918                bool all, uint32_t *user_timeout_ms,
0919                uint32_t *wait_result)
0920 {
0921     struct kfd_event_data __user *events =
0922             (struct kfd_event_data __user *) data;
0923     uint32_t i;
0924     int ret = 0;
0925 
0926     struct kfd_event_waiter *event_waiters = NULL;
0927     long timeout = user_timeout_to_jiffies(*user_timeout_ms);
0928 
0929     event_waiters = alloc_event_waiters(num_events);
0930     if (!event_waiters) {
0931         ret = -ENOMEM;
0932         goto out;
0933     }
0934 
0935     /* Use p->event_mutex here to protect against concurrent creation and
0936      * destruction of events while we initialize event_waiters.
0937      */
0938     mutex_lock(&p->event_mutex);
0939 
0940     for (i = 0; i < num_events; i++) {
0941         struct kfd_event_data event_data;
0942 
0943         if (copy_from_user(&event_data, &events[i],
0944                 sizeof(struct kfd_event_data))) {
0945             ret = -EFAULT;
0946             goto out_unlock;
0947         }
0948 
0949         ret = init_event_waiter(p, &event_waiters[i],
0950                     event_data.event_id);
0951         if (ret)
0952             goto out_unlock;
0953     }
0954 
0955     /* Check condition once. */
0956     *wait_result = test_event_condition(all, num_events, event_waiters);
0957     if (*wait_result == KFD_IOC_WAIT_RESULT_COMPLETE) {
0958         ret = copy_signaled_event_data(num_events,
0959                            event_waiters, events);
0960         goto out_unlock;
0961     } else if (WARN_ON(*wait_result == KFD_IOC_WAIT_RESULT_FAIL)) {
0962         /* This should not happen. Events shouldn't be
0963          * destroyed while we're holding the event_mutex
0964          */
0965         goto out_unlock;
0966     }
0967 
0968     mutex_unlock(&p->event_mutex);
0969 
0970     while (true) {
0971         if (fatal_signal_pending(current)) {
0972             ret = -EINTR;
0973             break;
0974         }
0975 
0976         if (signal_pending(current)) {
0977             ret = -ERESTARTSYS;
0978             if (*user_timeout_ms != KFD_EVENT_TIMEOUT_IMMEDIATE &&
0979                 *user_timeout_ms != KFD_EVENT_TIMEOUT_INFINITE)
0980                 *user_timeout_ms = jiffies_to_msecs(
0981                     max(0l, timeout-1));
0982             break;
0983         }
0984 
0985         /* Set task state to interruptible sleep before
0986          * checking wake-up conditions. A concurrent wake-up
0987          * will put the task back into runnable state. In that
0988          * case schedule_timeout will not put the task to
0989          * sleep and we'll get a chance to re-check the
0990          * updated conditions almost immediately. Otherwise,
0991          * this race condition would lead to a soft hang or a
0992          * very long sleep.
0993          */
0994         set_current_state(TASK_INTERRUPTIBLE);
0995 
0996         *wait_result = test_event_condition(all, num_events,
0997                             event_waiters);
0998         if (*wait_result != KFD_IOC_WAIT_RESULT_TIMEOUT)
0999             break;
1000 
1001         if (timeout <= 0)
1002             break;
1003 
1004         timeout = schedule_timeout(timeout);
1005     }
1006     __set_current_state(TASK_RUNNING);
1007 
1008     mutex_lock(&p->event_mutex);
1009     /* copy_signaled_event_data may sleep. So this has to happen
1010      * after the task state is set back to RUNNING.
1011      *
1012      * The event may also have been destroyed after signaling. So
1013      * copy_signaled_event_data also must confirm that the event
1014      * still exists. Therefore this must be under the p->event_mutex
1015      * which is also held when events are destroyed.
1016      */
1017     if (!ret && *wait_result == KFD_IOC_WAIT_RESULT_COMPLETE)
1018         ret = copy_signaled_event_data(num_events,
1019                            event_waiters, events);
1020 
1021 out_unlock:
1022     free_waiters(num_events, event_waiters, ret == -ERESTARTSYS);
1023     mutex_unlock(&p->event_mutex);
1024 out:
1025     if (ret)
1026         *wait_result = KFD_IOC_WAIT_RESULT_FAIL;
1027     else if (*wait_result == KFD_IOC_WAIT_RESULT_FAIL)
1028         ret = -EIO;
1029 
1030     return ret;
1031 }
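/*
 * The sleep loop above follows the standard prepare-to-wait pattern,
 * roughly (a condensed sketch):
 *
 *	while (true) {
 *		set_current_state(TASK_INTERRUPTIBLE);
 *		if (condition_met() || timeout <= 0 || signal_pending(current))
 *			break;
 *		timeout = schedule_timeout(timeout);
 *	}
 *	__set_current_state(TASK_RUNNING);
 *
 * Setting the task state before re-checking the condition is what makes a
 * concurrent wake_up_all() in set_event() race-free: the wake-up moves the
 * task back to TASK_RUNNING, so schedule_timeout() returns promptly instead
 * of sleeping through the signal.
 */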
1032 
1033 int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma)
1034 {
1035     unsigned long pfn;
1036     struct kfd_signal_page *page;
1037     int ret;
1038 
1039     /* check required size doesn't exceed the allocated size */
1040     if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) <
1041             get_order(vma->vm_end - vma->vm_start)) {
1042         pr_err("Event page mmap requested illegal size\n");
1043         return -EINVAL;
1044     }
1045 
1046     page = p->signal_page;
1047     if (!page) {
1048         /* Probably KFD bug, but mmap is user-accessible. */
1049         pr_debug("Signal page could not be found\n");
1050         return -EINVAL;
1051     }
1052 
1053     pfn = __pa(page->kernel_address);
1054     pfn >>= PAGE_SHIFT;
1055 
1056     vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE
1057                | VM_DONTDUMP | VM_PFNMAP;
1058 
1059     pr_debug("Mapping signal page\n");
1060     pr_debug("     start user address  == 0x%08lx\n", vma->vm_start);
1061     pr_debug("     end user address    == 0x%08lx\n", vma->vm_end);
1062     pr_debug("     pfn                 == 0x%016lX\n", pfn);
1063     pr_debug("     vm_flags            == 0x%08lX\n", vma->vm_flags);
1064     pr_debug("     size                == 0x%08lX\n",
1065             vma->vm_end - vma->vm_start);
1066 
1067     page->user_address = (uint64_t __user *)vma->vm_start;
1068 
1069     /* mapping the page to user process */
1070     ret = remap_pfn_range(vma, vma->vm_start, pfn,
1071             vma->vm_end - vma->vm_start, vma->vm_page_prot);
1072     if (!ret)
1073         p->signal_mapped_size = vma->vm_end - vma->vm_start;
1074 
1075     return ret;
1076 }
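/*
 * User-space counterpart of the mapping above (a minimal sketch; assumes
 * the event_page_offset returned by the create-event ioctl is used as the
 * mmap offset on the /dev/kfd file descriptor):
 *
 *	uint64_t *slots = mmap(NULL, slot_count * 8,
 *			       PROT_READ | PROT_WRITE, MAP_SHARED,
 *			       kfd_fd, event_page_offset);
 *
 * After this call, slots[event_id] is the 64-bit signal slot backing the
 * event, i.e. page->user_address above points at the same mapping.
 */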
1077 
1078 /*
1079  * Assumes that p is not going away.
1080  */
1081 static void lookup_events_by_type_and_signal(struct kfd_process *p,
1082         int type, void *event_data)
1083 {
1084     struct kfd_hsa_memory_exception_data *ev_data;
1085     struct kfd_event *ev;
1086     uint32_t id;
1087     bool send_signal = true;
1088 
1089     ev_data = (struct kfd_hsa_memory_exception_data *) event_data;
1090 
1091     rcu_read_lock();
1092 
1093     id = KFD_FIRST_NONSIGNAL_EVENT_ID;
1094     idr_for_each_entry_continue(&p->event_idr, ev, id)
1095         if (ev->type == type) {
1096             send_signal = false;
1097             dev_dbg(kfd_device,
1098                     "Event found: id %X type %d",
1099                     ev->event_id, ev->type);
1100             spin_lock(&ev->lock);
1101             set_event(ev);
1102             if (ev->type == KFD_EVENT_TYPE_MEMORY && ev_data)
1103                 ev->memory_exception_data = *ev_data;
1104             spin_unlock(&ev->lock);
1105         }
1106 
1107     if (type == KFD_EVENT_TYPE_MEMORY) {
1108         dev_warn(kfd_device,
1109             "Sending SIGSEGV to process %d (pasid 0x%x)",
1110                 p->lead_thread->pid, p->pasid);
1111         send_sig(SIGSEGV, p->lead_thread, 0);
1112     }
1113 
1114     /* Send SIGTERM no event of type "type" has been found*/
1115     if (send_signal) {
1116         if (send_sigterm) {
1117             dev_warn(kfd_device,
1118                 "Sending SIGTERM to process %d (pasid 0x%x)",
1119                     p->lead_thread->pid, p->pasid);
1120             send_sig(SIGTERM, p->lead_thread, 0);
1121         } else {
1122             dev_err(kfd_device,
1123                 "Process %d (pasid 0x%x) got unhandled exception",
1124                 p->lead_thread->pid, p->pasid);
1125         }
1126     }
1127 
1128     rcu_read_unlock();
1129 }
1130 
1131 #ifdef KFD_SUPPORT_IOMMU_V2
1132 void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid,
1133         unsigned long address, bool is_write_requested,
1134         bool is_execute_requested)
1135 {
1136     struct kfd_hsa_memory_exception_data memory_exception_data;
1137     struct vm_area_struct *vma;
1138     int user_gpu_id;
1139 
1140     /*
1141      * Because we are called from arbitrary context (workqueue) as opposed
1142      * to process context, kfd_process could attempt to exit while we are
1143      * running so the lookup function increments the process ref count.
1144      */
1145     struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
1146     struct mm_struct *mm;
1147 
1148     if (!p)
1149         return; /* Presumably process exited. */
1150 
1151     /* Take a safe reference to the mm_struct, which may otherwise
1152      * disappear even while the kfd_process is still referenced.
1153      */
1154     mm = get_task_mm(p->lead_thread);
1155     if (!mm) {
1156         kfd_unref_process(p);
1157         return; /* Process is exiting */
1158     }
1159 
1160     user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
1161     if (unlikely(user_gpu_id == -EINVAL)) {
1162         WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
1163         return;
1164     }
1165     memset(&memory_exception_data, 0, sizeof(memory_exception_data));
1166 
1167     mmap_read_lock(mm);
1168     vma = find_vma(mm, address);
1169 
1170     memory_exception_data.gpu_id = user_gpu_id;
1171     memory_exception_data.va = address;
1172     /* Set failure reason */
1173     memory_exception_data.failure.NotPresent = 1;
1174     memory_exception_data.failure.NoExecute = 0;
1175     memory_exception_data.failure.ReadOnly = 0;
1176     if (vma && address >= vma->vm_start) {
1177         memory_exception_data.failure.NotPresent = 0;
1178 
1179         if (is_write_requested && !(vma->vm_flags & VM_WRITE))
1180             memory_exception_data.failure.ReadOnly = 1;
1181         else
1182             memory_exception_data.failure.ReadOnly = 0;
1183 
1184         if (is_execute_requested && !(vma->vm_flags & VM_EXEC))
1185             memory_exception_data.failure.NoExecute = 1;
1186         else
1187             memory_exception_data.failure.NoExecute = 0;
1188     }
1189 
1190     mmap_read_unlock(mm);
1191     mmput(mm);
1192 
1193     pr_debug("notpresent %d, noexecute %d, readonly %d\n",
1194             memory_exception_data.failure.NotPresent,
1195             memory_exception_data.failure.NoExecute,
1196             memory_exception_data.failure.ReadOnly);
1197 
1198     /* Workaround on Raven to not kill the process when memory is freed
1199      * before IOMMU is able to finish processing all the excessive PPRs
1200      */
1201 
1202     if (KFD_GC_VERSION(dev) != IP_VERSION(9, 1, 0) &&
1203         KFD_GC_VERSION(dev) != IP_VERSION(9, 2, 2) &&
1204         KFD_GC_VERSION(dev) != IP_VERSION(9, 3, 0))
1205         lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_MEMORY,
1206                 &memory_exception_data);
1207 
1208     kfd_unref_process(p);
1209 }
1210 #endif /* KFD_SUPPORT_IOMMU_V2 */
1211 
1212 void kfd_signal_hw_exception_event(u32 pasid)
1213 {
1214     /*
1215      * Because we are called from arbitrary context (workqueue) as opposed
1216      * to process context, kfd_process could attempt to exit while we are
1217      * running so the lookup function increments the process ref count.
1218      */
1219     struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
1220 
1221     if (!p)
1222         return; /* Presumably process exited. */
1223 
1224     lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_HW_EXCEPTION, NULL);
1225     kfd_unref_process(p);
1226 }
1227 
1228 void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid,
1229                 struct kfd_vm_fault_info *info)
1230 {
1231     struct kfd_event *ev;
1232     uint32_t id;
1233     struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
1234     struct kfd_hsa_memory_exception_data memory_exception_data;
1235     int user_gpu_id;
1236 
1237     if (!p)
1238         return; /* Presumably process exited. */
1239 
1240     user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
1241     if (unlikely(user_gpu_id == -EINVAL)) {
1242         WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
1243         return;
1244     }
1245 
1246     memset(&memory_exception_data, 0, sizeof(memory_exception_data));
1247     memory_exception_data.gpu_id = user_gpu_id;
1248     memory_exception_data.failure.imprecise = true;
1249     /* Set failure reason */
1250     if (info) {
1251         memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
1252         memory_exception_data.failure.NotPresent =
1253             info->prot_valid ? 1 : 0;
1254         memory_exception_data.failure.NoExecute =
1255             info->prot_exec ? 1 : 0;
1256         memory_exception_data.failure.ReadOnly =
1257             info->prot_write ? 1 : 0;
1258         memory_exception_data.failure.imprecise = 0;
1259     }
1260 
1261     rcu_read_lock();
1262 
1263     id = KFD_FIRST_NONSIGNAL_EVENT_ID;
1264     idr_for_each_entry_continue(&p->event_idr, ev, id)
1265         if (ev->type == KFD_EVENT_TYPE_MEMORY) {
1266             spin_lock(&ev->lock);
1267             ev->memory_exception_data = memory_exception_data;
1268             set_event(ev);
1269             spin_unlock(&ev->lock);
1270         }
1271 
1272     rcu_read_unlock();
1273     kfd_unref_process(p);
1274 }
1275 
1276 void kfd_signal_reset_event(struct kfd_dev *dev)
1277 {
1278     struct kfd_hsa_hw_exception_data hw_exception_data;
1279     struct kfd_hsa_memory_exception_data memory_exception_data;
1280     struct kfd_process *p;
1281     struct kfd_event *ev;
1282     unsigned int temp;
1283     uint32_t id, idx;
1284     int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
1285             KFD_HW_EXCEPTION_ECC :
1286             KFD_HW_EXCEPTION_GPU_HANG;
1287 
1288     /* Whole gpu reset caused by GPU hang and memory is lost */
1289     memset(&hw_exception_data, 0, sizeof(hw_exception_data));
1290     hw_exception_data.memory_lost = 1;
1291     hw_exception_data.reset_cause = reset_cause;
1292 
1293     memset(&memory_exception_data, 0, sizeof(memory_exception_data));
1294     memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
1295     memory_exception_data.failure.imprecise = true;
1296 
1297     idx = srcu_read_lock(&kfd_processes_srcu);
1298     hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
1299         int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
1300 
1301         if (unlikely(user_gpu_id == -EINVAL)) {
1302             WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
1303             continue;
1304         }
1305 
1306         rcu_read_lock();
1307 
1308         id = KFD_FIRST_NONSIGNAL_EVENT_ID;
1309         idr_for_each_entry_continue(&p->event_idr, ev, id) {
1310             if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
1311                 spin_lock(&ev->lock);
1312                 ev->hw_exception_data = hw_exception_data;
1313                 ev->hw_exception_data.gpu_id = user_gpu_id;
1314                 set_event(ev);
1315                 spin_unlock(&ev->lock);
1316             }
1317             if (ev->type == KFD_EVENT_TYPE_MEMORY &&
1318                 reset_cause == KFD_HW_EXCEPTION_ECC) {
1319                 spin_lock(&ev->lock);
1320                 ev->memory_exception_data = memory_exception_data;
1321                 ev->memory_exception_data.gpu_id = user_gpu_id;
1322                 set_event(ev);
1323                 spin_unlock(&ev->lock);
1324             }
1325         }
1326 
1327         rcu_read_unlock();
1328     }
1329     srcu_read_unlock(&kfd_processes_srcu, idx);
1330 }
1331 
1332 void kfd_signal_poison_consumed_event(struct kfd_dev *dev, u32 pasid)
1333 {
1334     struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
1335     struct kfd_hsa_memory_exception_data memory_exception_data;
1336     struct kfd_hsa_hw_exception_data hw_exception_data;
1337     struct kfd_event *ev;
1338     uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
1339     int user_gpu_id;
1340 
1341     if (!p)
1342         return; /* Presumably process exited. */
1343 
1344     user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
1345     if (unlikely(user_gpu_id == -EINVAL)) {
1346         WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
1347         return;
1348     }
1349 
1350     memset(&hw_exception_data, 0, sizeof(hw_exception_data));
1351     hw_exception_data.gpu_id = user_gpu_id;
1352     hw_exception_data.memory_lost = 1;
1353     hw_exception_data.reset_cause = KFD_HW_EXCEPTION_ECC;
1354 
1355     memset(&memory_exception_data, 0, sizeof(memory_exception_data));
1356     memory_exception_data.ErrorType = KFD_MEM_ERR_POISON_CONSUMED;
1357     memory_exception_data.gpu_id = user_gpu_id;
1358     memory_exception_data.failure.imprecise = true;
1359 
1360     rcu_read_lock();
1361 
1362     idr_for_each_entry_continue(&p->event_idr, ev, id) {
1363         if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
1364             spin_lock(&ev->lock);
1365             ev->hw_exception_data = hw_exception_data;
1366             set_event(ev);
1367             spin_unlock(&ev->lock);
1368         }
1369 
1370         if (ev->type == KFD_EVENT_TYPE_MEMORY) {
1371             spin_lock(&ev->lock);
1372             ev->memory_exception_data = memory_exception_data;
1373             set_event(ev);
1374             spin_unlock(&ev->lock);
1375         }
1376     }
1377 
1378     rcu_read_unlock();
1379 
1380     /* user application will handle SIGBUS signal */
1381     send_sig(SIGBUS, p->lead_thread, 0);
1382 
1383     kfd_unref_process(p);
1384 }