0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /* sched.c - SPU scheduler.
0003  *
0004  * Copyright (C) IBM 2005
0005  * Author: Mark Nutter <mnutter@us.ibm.com>
0006  *
0007  * 2006-03-31   NUMA domains added.
0008  */
0009 
0010 #undef DEBUG
0011 
0012 #include <linux/errno.h>
0013 #include <linux/sched/signal.h>
0014 #include <linux/sched/loadavg.h>
0015 #include <linux/sched/rt.h>
0016 #include <linux/kernel.h>
0017 #include <linux/mm.h>
0018 #include <linux/slab.h>
0019 #include <linux/completion.h>
0020 #include <linux/vmalloc.h>
0021 #include <linux/smp.h>
0022 #include <linux/stddef.h>
0023 #include <linux/unistd.h>
0024 #include <linux/numa.h>
0025 #include <linux/mutex.h>
0026 #include <linux/notifier.h>
0027 #include <linux/kthread.h>
0028 #include <linux/pid_namespace.h>
0029 #include <linux/proc_fs.h>
0030 #include <linux/seq_file.h>
0031 
0032 #include <asm/io.h>
0033 #include <asm/mmu_context.h>
0034 #include <asm/spu.h>
0035 #include <asm/spu_csa.h>
0036 #include <asm/spu_priv1.h>
0037 #include "spufs.h"
0038 #define CREATE_TRACE_POINTS
0039 #include "sputrace.h"
0040 
0041 struct spu_prio_array {
0042     DECLARE_BITMAP(bitmap, MAX_PRIO);
0043     struct list_head runq[MAX_PRIO];
0044     spinlock_t runq_lock;
0045     int nr_waiting;
0046 };
0047 
0048 static unsigned long spu_avenrun[3];
0049 static struct spu_prio_array *spu_prio;
0050 static struct task_struct *spusched_task;
0051 static struct timer_list spusched_timer;
0052 static struct timer_list spuloadavg_timer;
0053 
0054 /*
0055  * Priority of a normal, non-rt, non-niced process (aka nice level 0).
0056  */
0057 #define NORMAL_PRIO     120
0058 
0059 /*
0060  * Frequency of the spu scheduler tick.  By default we do one SPU scheduler
0061  * tick for every 10 CPU scheduler ticks.
0062  */
0063 #define SPUSCHED_TICK       (10)
0064 
0065 /*
0066  * These are the 'tuning knobs' of the scheduler:
0067  *
0068  * Minimum timeslice is 5 msecs (or 1 spu scheduler tick, whichever is
0069  * larger), default timeslice is 100 msecs, maximum timeslice is 800 msecs.
0070  */
0071 #define MIN_SPU_TIMESLICE   max(5 * HZ / (1000 * SPUSCHED_TICK), 1)
0072 #define DEF_SPU_TIMESLICE   (100 * HZ / (1000 * SPUSCHED_TICK))
0073 
0074 #define SCALE_PRIO(x, prio) \
0075     max(x * (MAX_PRIO - prio) / (NICE_WIDTH / 2), MIN_SPU_TIMESLICE)
0076 
0077 /*
0078  * scale user-nice values [ -20 ... 0 ... 19 ] to time slice values:
0079  * [800ms ... 100ms ... 5ms]
0080  *
0081  * The higher a thread's priority, the bigger timeslices
0082  * it gets during one round of execution. But even the lowest
0083  * priority thread gets MIN_TIMESLICE worth of execution time.
0084  */
0085 void spu_set_timeslice(struct spu_context *ctx)
0086 {
0087     if (ctx->prio < NORMAL_PRIO)
0088         ctx->time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE * 4, ctx->prio);
0089     else
0090         ctx->time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE, ctx->prio);
0091 }
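/*
 * Worked example of the scaling above (a sketch only: the exact figures
 * depend on CONFIG_HZ, assumed here to be 1000 for round numbers; the
 * macro expands with MAX_PRIO = 140 and NICE_WIDTH / 2 = 20):
 *
 *   one SPU tick      = SPUSCHED_TICK jiffies                 = 10 ms
 *   DEF_SPU_TIMESLICE = 100 * HZ / (1000 * SPUSCHED_TICK)     = 10 ticks
 *
 *   nice   0 (prio 120): SCALE_PRIO(10, 120) = 10 * 20 / 20 = 10 ticks (100 ms)
 *   nice -20 (prio 100): SCALE_PRIO(40, 100) = 40 * 40 / 20 = 80 ticks (800 ms)
 *   nice +19 (prio 139): SCALE_PRIO(10, 139) = 10 *  1 / 20 =  0, clamped to
 *                        MIN_SPU_TIMESLICE   = 1 tick (10 ms)
 *
 * which matches the [800ms ... 100ms ... 5ms] range quoted above, with the
 * lower end rounded up to one scheduler tick.
 */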
0092 
0093 /*
0094  * Update scheduling information from the owning thread.
0095  */
0096 void __spu_update_sched_info(struct spu_context *ctx)
0097 {
0098     /*
0099      * assert that the context is not on the runqueue, so it is safe
0100      * to change its scheduling parameters.
0101      */
0102     BUG_ON(!list_empty(&ctx->rq));
0103 
0104     /*
0105      * 32-bit assignments are atomic on powerpc, and we don't care about
0106      * memory ordering here because retrieving the controlling thread is
0107      * by definition racy.
0108      */
0109     ctx->tid = current->pid;
0110 
0111     /*
0112      * We do our own priority calculations, so we normally want
0113      * ->static_prio to start with. Unfortunately this field
0114      * contains junk for threads with a realtime scheduling
0115      * policy so we have to look at ->prio in this case.
0116      */
0117     if (rt_prio(current->prio))
0118         ctx->prio = current->prio;
0119     else
0120         ctx->prio = current->static_prio;
0121     ctx->policy = current->policy;
0122 
0123     /*
0124      * TO DO: the context may be loaded, so we may need to activate
0125      * it again on a different node. But it shouldn't hurt anything
0126      * to update its parameters, because we know that the scheduler
0127      * is not actively looking at this field, since it is not on the
0128      * runqueue. The context will be rescheduled on the proper node
0129      * if it is timesliced or preempted.
0130      */
0131     cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr);
0132 
0133     /* Save the current cpu id for spu interrupt routing. */
0134     ctx->last_ran = raw_smp_processor_id();
0135 }
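/*
 * Illustration of the priority snapshot above, assuming the usual kernel
 * priority mapping (these numbers are examples, not spufs-specific):
 *
 *   SCHED_OTHER, nice   0:       ->static_prio = 120  ->  ctx->prio = 120
 *   SCHED_OTHER, nice -20:       ->static_prio = 100  ->  ctx->prio = 100
 *   SCHED_FIFO,  rt_priority 50: ->prio = 99 - 50     ->  ctx->prio = 49
 *
 * Throughout this file a lower ctx->prio value means a more important
 * context, matching the CPU scheduler's convention.
 */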
0136 
0137 void spu_update_sched_info(struct spu_context *ctx)
0138 {
0139     int node;
0140 
0141     if (ctx->state == SPU_STATE_RUNNABLE) {
0142         node = ctx->spu->node;
0143 
0144         /*
0145          * Take list_mutex to sync with find_victim().
0146          */
0147         mutex_lock(&cbe_spu_info[node].list_mutex);
0148         __spu_update_sched_info(ctx);
0149         mutex_unlock(&cbe_spu_info[node].list_mutex);
0150     } else {
0151         __spu_update_sched_info(ctx);
0152     }
0153 }
0154 
0155 static int __node_allowed(struct spu_context *ctx, int node)
0156 {
0157     if (nr_cpus_node(node)) {
0158         const struct cpumask *mask = cpumask_of_node(node);
0159 
0160         if (cpumask_intersects(mask, &ctx->cpus_allowed))
0161             return 1;
0162     }
0163 
0164     return 0;
0165 }
0166 
0167 static int node_allowed(struct spu_context *ctx, int node)
0168 {
0169     int rval;
0170 
0171     spin_lock(&spu_prio->runq_lock);
0172     rval = __node_allowed(ctx, node);
0173     spin_unlock(&spu_prio->runq_lock);
0174 
0175     return rval;
0176 }
0177 
0178 void do_notify_spus_active(void)
0179 {
0180     int node;
0181 
0182     /*
0183      * Wake up the active spu_contexts.
0184      */
0185     for_each_online_node(node) {
0186         struct spu *spu;
0187 
0188         mutex_lock(&cbe_spu_info[node].list_mutex);
0189         list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
0190             if (spu->alloc_state != SPU_FREE) {
0191                 struct spu_context *ctx = spu->ctx;
0192                 set_bit(SPU_SCHED_NOTIFY_ACTIVE,
0193                     &ctx->sched_flags);
0194                 mb();
0195                 wake_up_all(&ctx->stop_wq);
0196             }
0197         }
0198         mutex_unlock(&cbe_spu_info[node].list_mutex);
0199     }
0200 }
0201 
0202 /**
0203  * spu_bind_context - bind spu context to physical spu
0204  * @spu:    physical spu to bind to
0205  * @ctx:    context to bind
0206  */
0207 static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
0208 {
0209     spu_context_trace(spu_bind_context__enter, ctx, spu);
0210 
0211     spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
0212 
0213     if (ctx->flags & SPU_CREATE_NOSCHED)
0214         atomic_inc(&cbe_spu_info[spu->node].reserved_spus);
0215 
0216     ctx->stats.slb_flt_base = spu->stats.slb_flt;
0217     ctx->stats.class2_intr_base = spu->stats.class2_intr;
0218 
0219     spu_associate_mm(spu, ctx->owner);
0220 
0221     spin_lock_irq(&spu->register_lock);
0222     spu->ctx = ctx;
0223     spu->flags = 0;
0224     ctx->spu = spu;
0225     ctx->ops = &spu_hw_ops;
0226     spu->pid = current->pid;
0227     spu->tgid = current->tgid;
0228     spu->ibox_callback = spufs_ibox_callback;
0229     spu->wbox_callback = spufs_wbox_callback;
0230     spu->stop_callback = spufs_stop_callback;
0231     spu->mfc_callback = spufs_mfc_callback;
0232     spin_unlock_irq(&spu->register_lock);
0233 
0234     spu_unmap_mappings(ctx);
0235 
0236     spu_switch_log_notify(spu, ctx, SWITCH_LOG_START, 0);
0237     spu_restore(&ctx->csa, spu);
0238     spu->timestamp = jiffies;
0239     ctx->state = SPU_STATE_RUNNABLE;
0240 
0241     spuctx_switch_state(ctx, SPU_UTIL_USER);
0242 }
0243 
0244 /*
0245  * Must be used with the list_mutex held.
0246  */
0247 static inline int sched_spu(struct spu *spu)
0248 {
0249     BUG_ON(!mutex_is_locked(&cbe_spu_info[spu->node].list_mutex));
0250 
0251     return (!spu->ctx || !(spu->ctx->flags & SPU_CREATE_NOSCHED));
0252 }
0253 
0254 static void aff_merge_remaining_ctxs(struct spu_gang *gang)
0255 {
0256     struct spu_context *ctx;
0257 
0258     list_for_each_entry(ctx, &gang->aff_list_head, aff_list) {
0259         if (list_empty(&ctx->aff_list))
0260             list_add(&ctx->aff_list, &gang->aff_list_head);
0261     }
0262     gang->aff_flags |= AFF_MERGED;
0263 }
0264 
0265 static void aff_set_offsets(struct spu_gang *gang)
0266 {
0267     struct spu_context *ctx;
0268     int offset;
0269 
0270     offset = -1;
0271     list_for_each_entry_reverse(ctx, &gang->aff_ref_ctx->aff_list,
0272                                 aff_list) {
0273         if (&ctx->aff_list == &gang->aff_list_head)
0274             break;
0275         ctx->aff_offset = offset--;
0276     }
0277 
0278     offset = 0;
0279     list_for_each_entry(ctx, gang->aff_ref_ctx->aff_list.prev, aff_list) {
0280         if (&ctx->aff_list == &gang->aff_list_head)
0281             break;
0282         ctx->aff_offset = offset++;
0283     }
0284 
0285     gang->aff_flags |= AFF_OFFSETS_SET;
0286 }
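/*
 * Example of the offsets assigned above: for a gang whose aff_list order
 * is A - B - C - D - E with C as the aff_ref_ctx, the two loops yield
 *
 *   A: -2   B: -1   C: 0   D: +1   E: +2
 *
 * i.e. negative offsets count from the reference context towards the list
 * head, non-negative ones from it towards the tail; ctx_location() later
 * walks that many schedulable spus away from the gang's reference spu.
 */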
0287 
0288 static struct spu *aff_ref_location(struct spu_context *ctx, int mem_aff,
0289          int group_size, int lowest_offset)
0290 {
0291     struct spu *spu;
0292     int node, n;
0293 
0294     /*
0295      * TODO: A better algorithm could be used to find a good spu to be
0296      *       used as reference location for the ctxs chain.
0297      */
0298     node = cpu_to_node(raw_smp_processor_id());
0299     for (n = 0; n < MAX_NUMNODES; n++, node++) {
0300         /*
0301          * "available_spus" counts how many spus are not potentially
0302          * going to be used by other affinity gangs whose reference
0303          * context is already in place. Although this code seeks to
0304          * avoid having affinity gangs whose total number of
0305          * contexts exceeds the number of spus in the node,
0306          * this may happen sporadically. In this case, available_spus
0307          * becomes negative, which is harmless.
0308          */
0309         int available_spus;
0310 
0311         node = (node < MAX_NUMNODES) ? node : 0;
0312         if (!node_allowed(ctx, node))
0313             continue;
0314 
0315         available_spus = 0;
0316         mutex_lock(&cbe_spu_info[node].list_mutex);
0317         list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
0318             if (spu->ctx && spu->ctx->gang && !spu->ctx->aff_offset
0319                     && spu->ctx->gang->aff_ref_spu)
0320                 available_spus -= spu->ctx->gang->contexts;
0321             available_spus++;
0322         }
0323         if (available_spus < ctx->gang->contexts) {
0324             mutex_unlock(&cbe_spu_info[node].list_mutex);
0325             continue;
0326         }
0327 
0328         list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
0329             if ((!mem_aff || spu->has_mem_affinity) &&
0330                             sched_spu(spu)) {
0331                 mutex_unlock(&cbe_spu_info[node].list_mutex);
0332                 return spu;
0333             }
0334         }
0335         mutex_unlock(&cbe_spu_info[node].list_mutex);
0336     }
0337     return NULL;
0338 }
0339 
0340 static void aff_set_ref_point_location(struct spu_gang *gang)
0341 {
0342     int mem_aff, gs, lowest_offset;
0343     struct spu_context *tmp, *ctx;
0344 
0345     mem_aff = gang->aff_ref_ctx->flags & SPU_CREATE_AFFINITY_MEM;
0346     lowest_offset = 0;
0347     gs = 0;
0348 
0349     list_for_each_entry(tmp, &gang->aff_list_head, aff_list)
0350         gs++;
0351 
0352     list_for_each_entry_reverse(ctx, &gang->aff_ref_ctx->aff_list,
0353                                 aff_list) {
0354         if (&ctx->aff_list == &gang->aff_list_head)
0355             break;
0356         lowest_offset = ctx->aff_offset;
0357     }
0358 
0359     gang->aff_ref_spu = aff_ref_location(gang->aff_ref_ctx, mem_aff, gs,
0360                             lowest_offset);
0361 }
0362 
0363 static struct spu *ctx_location(struct spu *ref, int offset, int node)
0364 {
0365     struct spu *spu;
0366 
0367     spu = NULL;
0368     if (offset >= 0) {
0369         list_for_each_entry(spu, ref->aff_list.prev, aff_list) {
0370             BUG_ON(spu->node != node);
0371             if (offset == 0)
0372                 break;
0373             if (sched_spu(spu))
0374                 offset--;
0375         }
0376     } else {
0377         list_for_each_entry_reverse(spu, ref->aff_list.next, aff_list) {
0378             BUG_ON(spu->node != node);
0379             if (offset == 0)
0380                 break;
0381             if (sched_spu(spu))
0382                 offset++;
0383         }
0384     }
0385 
0386     return spu;
0387 }
0388 
0389 /*
0390  * has_affinity() is called each time a context is about to be scheduled.
0391  * It returns non-zero if the gang has (or can now set up) a reference spu.
0392  */
0393 static int has_affinity(struct spu_context *ctx)
0394 {
0395     struct spu_gang *gang = ctx->gang;
0396 
0397     if (list_empty(&ctx->aff_list))
0398         return 0;
0399 
0400     if (atomic_read(&gang->aff_sched_count) == 0)
0401         gang->aff_ref_spu = NULL;
0402 
0403     if (!gang->aff_ref_spu) {
0404         if (!(gang->aff_flags & AFF_MERGED))
0405             aff_merge_remaining_ctxs(gang);
0406         if (!(gang->aff_flags & AFF_OFFSETS_SET))
0407             aff_set_offsets(gang);
0408         aff_set_ref_point_location(gang);
0409     }
0410 
0411     return gang->aff_ref_spu != NULL;
0412 }
0413 
0414 /**
0415  * spu_unbind_context - unbind spu context from physical spu
0416  * @spu:    physical spu to unbind from
0417  * @ctx:    context to unbind
0418  */
0419 static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
0420 {
0421     u32 status;
0422 
0423     spu_context_trace(spu_unbind_context__enter, ctx, spu);
0424 
0425     spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
0426 
0427     if (spu->ctx->flags & SPU_CREATE_NOSCHED)
0428         atomic_dec(&cbe_spu_info[spu->node].reserved_spus);
0429 
0430     if (ctx->gang)
0431         /*
0432          * If ctx->gang->aff_sched_count is positive, SPU affinity is
0433          * being considered in this gang. Using atomic_dec_if_positive
0434          * allows us to skip an explicit check for affinity in this gang.
0435          */
0436         atomic_dec_if_positive(&ctx->gang->aff_sched_count);
0437 
0438     spu_unmap_mappings(ctx);
0439     spu_save(&ctx->csa, spu);
0440     spu_switch_log_notify(spu, ctx, SWITCH_LOG_STOP, 0);
0441 
0442     spin_lock_irq(&spu->register_lock);
0443     spu->timestamp = jiffies;
0444     ctx->state = SPU_STATE_SAVED;
0445     spu->ibox_callback = NULL;
0446     spu->wbox_callback = NULL;
0447     spu->stop_callback = NULL;
0448     spu->mfc_callback = NULL;
0449     spu->pid = 0;
0450     spu->tgid = 0;
0451     ctx->ops = &spu_backing_ops;
0452     spu->flags = 0;
0453     spu->ctx = NULL;
0454     spin_unlock_irq(&spu->register_lock);
0455 
0456     spu_associate_mm(spu, NULL);
0457 
0458     ctx->stats.slb_flt +=
0459         (spu->stats.slb_flt - ctx->stats.slb_flt_base);
0460     ctx->stats.class2_intr +=
0461         (spu->stats.class2_intr - ctx->stats.class2_intr_base);
0462 
0463     /* This maps the underlying spu state to idle */
0464     spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
0465     ctx->spu = NULL;
0466 
0467     if (spu_stopped(ctx, &status))
0468         wake_up_all(&ctx->stop_wq);
0469 }
0470 
0471 /**
0472  * __spu_add_to_rq - add a context to the runqueue (runq_lock must be held)
0473  * @ctx:       context to add
0474  */
0475 static void __spu_add_to_rq(struct spu_context *ctx)
0476 {
0477     /*
0478      * Unfortunately this code path can be called from multiple threads
0479      * on behalf of a single context due to the way the problem state
0480      * mmap support works.
0481      *
0482      * Fortunately we need to wake up all these threads at the same time
0483      * and can simply skip the runqueue addition for all but the first
0484      * thread getting into this codepath.
0485      *
0486      * It's still quite hacky, and long-term we should proxy all other
0487      * threads through the owner thread so that spu_run is in control
0488      * of all the scheduling activity for a given context.
0489      */
0490     if (list_empty(&ctx->rq)) {
0491         list_add_tail(&ctx->rq, &spu_prio->runq[ctx->prio]);
0492         set_bit(ctx->prio, spu_prio->bitmap);
0493         if (!spu_prio->nr_waiting++)
0494             mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
0495     }
0496 }
0497 
0498 static void spu_add_to_rq(struct spu_context *ctx)
0499 {
0500     spin_lock(&spu_prio->runq_lock);
0501     __spu_add_to_rq(ctx);
0502     spin_unlock(&spu_prio->runq_lock);
0503 }
0504 
0505 static void __spu_del_from_rq(struct spu_context *ctx)
0506 {
0507     int prio = ctx->prio;
0508 
0509     if (!list_empty(&ctx->rq)) {
0510         if (!--spu_prio->nr_waiting)
0511             del_timer(&spusched_timer);
0512         list_del_init(&ctx->rq);
0513 
0514         if (list_empty(&spu_prio->runq[prio]))
0515             clear_bit(prio, spu_prio->bitmap);
0516     }
0517 }
0518 
0519 void spu_del_from_rq(struct spu_context *ctx)
0520 {
0521     spin_lock(&spu_prio->runq_lock);
0522     __spu_del_from_rq(ctx);
0523     spin_unlock(&spu_prio->runq_lock);
0524 }
0525 
0526 static void spu_prio_wait(struct spu_context *ctx)
0527 {
0528     DEFINE_WAIT(wait);
0529 
0530     /*
0531      * The caller must explicitly wait for a context to be loaded
0532      * if the nosched flag is set.  If NOSCHED is not set, the caller
0533      * queues the context and waits for an spu event or error.
0534      */
0535     BUG_ON(!(ctx->flags & SPU_CREATE_NOSCHED));
0536 
0537     spin_lock(&spu_prio->runq_lock);
0538     prepare_to_wait_exclusive(&ctx->stop_wq, &wait, TASK_INTERRUPTIBLE);
0539     if (!signal_pending(current)) {
0540         __spu_add_to_rq(ctx);
0541         spin_unlock(&spu_prio->runq_lock);
0542         mutex_unlock(&ctx->state_mutex);
0543         schedule();
0544         mutex_lock(&ctx->state_mutex);
0545         spin_lock(&spu_prio->runq_lock);
0546         __spu_del_from_rq(ctx);
0547     }
0548     spin_unlock(&spu_prio->runq_lock);
0549     __set_current_state(TASK_RUNNING);
0550     remove_wait_queue(&ctx->stop_wq, &wait);
0551 }
0552 
0553 static struct spu *spu_get_idle(struct spu_context *ctx)
0554 {
0555     struct spu *spu, *aff_ref_spu;
0556     int node, n;
0557 
0558     spu_context_nospu_trace(spu_get_idle__enter, ctx);
0559 
0560     if (ctx->gang) {
0561         mutex_lock(&ctx->gang->aff_mutex);
0562         if (has_affinity(ctx)) {
0563             aff_ref_spu = ctx->gang->aff_ref_spu;
0564             atomic_inc(&ctx->gang->aff_sched_count);
0565             mutex_unlock(&ctx->gang->aff_mutex);
0566             node = aff_ref_spu->node;
0567 
0568             mutex_lock(&cbe_spu_info[node].list_mutex);
0569             spu = ctx_location(aff_ref_spu, ctx->aff_offset, node);
0570             if (spu && spu->alloc_state == SPU_FREE)
0571                 goto found;
0572             mutex_unlock(&cbe_spu_info[node].list_mutex);
0573 
0574             atomic_dec(&ctx->gang->aff_sched_count);
0575             goto not_found;
0576         }
0577         mutex_unlock(&ctx->gang->aff_mutex);
0578     }
0579     node = cpu_to_node(raw_smp_processor_id());
0580     for (n = 0; n < MAX_NUMNODES; n++, node++) {
0581         node = (node < MAX_NUMNODES) ? node : 0;
0582         if (!node_allowed(ctx, node))
0583             continue;
0584 
0585         mutex_lock(&cbe_spu_info[node].list_mutex);
0586         list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
0587             if (spu->alloc_state == SPU_FREE)
0588                 goto found;
0589         }
0590         mutex_unlock(&cbe_spu_info[node].list_mutex);
0591     }
0592 
0593  not_found:
0594     spu_context_nospu_trace(spu_get_idle__not_found, ctx);
0595     return NULL;
0596 
0597  found:
0598     spu->alloc_state = SPU_USED;
0599     mutex_unlock(&cbe_spu_info[node].list_mutex);
0600     spu_context_trace(spu_get_idle__found, ctx, spu);
0601     spu_init_channels(spu);
0602     return spu;
0603 }
0604 
0605 /**
0606  * find_victim - find a lower priority context to preempt
0607  * @ctx:    candidate context for running
0608  *
0609  * Returns the freed physical spu to run the new context on.
0610  */
0611 static struct spu *find_victim(struct spu_context *ctx)
0612 {
0613     struct spu_context *victim = NULL;
0614     struct spu *spu;
0615     int node, n;
0616 
0617     spu_context_nospu_trace(spu_find_victim__enter, ctx);
0618 
0619     /*
0620      * Look for a possible preemption candidate on the local node first.
0621      * If there is no candidate look at the other nodes.  This isn't
0622      * exactly fair, but so far the whole spu scheduler tries to keep
0623      * a strong node affinity.  We might want to fine-tune this in
0624      * the future.
0625      */
0626  restart:
0627     node = cpu_to_node(raw_smp_processor_id());
0628     for (n = 0; n < MAX_NUMNODES; n++, node++) {
0629         node = (node < MAX_NUMNODES) ? node : 0;
0630         if (!node_allowed(ctx, node))
0631             continue;
0632 
0633         mutex_lock(&cbe_spu_info[node].list_mutex);
0634         list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
0635             struct spu_context *tmp = spu->ctx;
0636 
0637             if (tmp && tmp->prio > ctx->prio &&
0638                 !(tmp->flags & SPU_CREATE_NOSCHED) &&
0639                 (!victim || tmp->prio > victim->prio)) {
0640                 victim = spu->ctx;
0641             }
0642         }
0643         if (victim)
0644             get_spu_context(victim);
0645         mutex_unlock(&cbe_spu_info[node].list_mutex);
0646 
0647         if (victim) {
0648             /*
0649              * This nests ctx->state_mutex, but we always lock
0650              * higher priority contexts before lower priority
0651              * ones, so this is safe until we introduce
0652              * priority inheritance schemes.
0653              *
0654              * XXX if the highest priority context is locked,
0655              * this can loop a long time.  Might be better to
0656              * look at another context or give up after X retries.
0657              */
0658             if (!mutex_trylock(&victim->state_mutex)) {
0659                 put_spu_context(victim);
0660                 victim = NULL;
0661                 goto restart;
0662             }
0663 
0664             spu = victim->spu;
0665             if (!spu || victim->prio <= ctx->prio) {
0666                 /*
0667                  * This race can happen because we've dropped
0668                  * the active list mutex.  Not a problem, just
0669                  * restart the search.
0670                  */
0671                 mutex_unlock(&victim->state_mutex);
0672                 put_spu_context(victim);
0673                 victim = NULL;
0674                 goto restart;
0675             }
0676 
0677             spu_context_trace(__spu_deactivate__unload, ctx, spu);
0678 
0679             mutex_lock(&cbe_spu_info[node].list_mutex);
0680             cbe_spu_info[node].nr_active--;
0681             spu_unbind_context(spu, victim);
0682             mutex_unlock(&cbe_spu_info[node].list_mutex);
0683 
0684             victim->stats.invol_ctx_switch++;
0685             spu->stats.invol_ctx_switch++;
0686             if (test_bit(SPU_SCHED_SPU_RUN, &victim->sched_flags))
0687                 spu_add_to_rq(victim);
0688 
0689             mutex_unlock(&victim->state_mutex);
0690             put_spu_context(victim);
0691 
0692             return spu;
0693         }
0694     }
0695 
0696     return NULL;
0697 }
0698 
0699 static void __spu_schedule(struct spu *spu, struct spu_context *ctx)
0700 {
0701     int node = spu->node;
0702     int success = 0;
0703 
0704     spu_set_timeslice(ctx);
0705 
0706     mutex_lock(&cbe_spu_info[node].list_mutex);
0707     if (spu->ctx == NULL) {
0708         spu_bind_context(spu, ctx);
0709         cbe_spu_info[node].nr_active++;
0710         spu->alloc_state = SPU_USED;
0711         success = 1;
0712     }
0713     mutex_unlock(&cbe_spu_info[node].list_mutex);
0714 
0715     if (success)
0716         wake_up_all(&ctx->run_wq);
0717     else
0718         spu_add_to_rq(ctx);
0719 }
0720 
0721 static void spu_schedule(struct spu *spu, struct spu_context *ctx)
0722 {
0723     /* not a candidate for interruptible because it's called either
0724        from the scheduler thread or from spu_deactivate */
0725     mutex_lock(&ctx->state_mutex);
0726     if (ctx->state == SPU_STATE_SAVED)
0727         __spu_schedule(spu, ctx);
0728     spu_release(ctx);
0729 }
0730 
0731 /**
0732  * spu_unschedule - remove a context from a spu, and possibly release it.
0733  * @spu:    The SPU to unschedule from
0734  * @ctx:    The context currently scheduled on the SPU
0735  * @free_spu:  Whether to free the SPU for other contexts
0736  *
0737  * Unbinds the context @ctx from the SPU @spu. If @free_spu is non-zero, the
0738  * SPU is made available for other contexts (ie, may be returned by
0739  * spu_get_idle). If this is zero, the caller is expected to schedule another
0740  * context to this spu.
0741  *
0742  * Should be called with ctx->state_mutex held.
0743  */
0744 static void spu_unschedule(struct spu *spu, struct spu_context *ctx,
0745         int free_spu)
0746 {
0747     int node = spu->node;
0748 
0749     mutex_lock(&cbe_spu_info[node].list_mutex);
0750     cbe_spu_info[node].nr_active--;
0751     if (free_spu)
0752         spu->alloc_state = SPU_FREE;
0753     spu_unbind_context(spu, ctx);
0754     ctx->stats.invol_ctx_switch++;
0755     spu->stats.invol_ctx_switch++;
0756     mutex_unlock(&cbe_spu_info[node].list_mutex);
0757 }
0758 
0759 /**
0760  * spu_activate - find a free spu for a context and execute it
0761  * @ctx:    spu context to schedule
0762  * @flags:  flags (currently ignored)
0763  *
0764  * Tries to find a free spu to run @ctx.  If no free spu is available
0765  * add the context to the runqueue so it gets woken up once an spu
0766  * is available.
0767  */
0768 int spu_activate(struct spu_context *ctx, unsigned long flags)
0769 {
0770     struct spu *spu;
0771 
0772     /*
0773      * If there are multiple threads waiting for a single context,
0774      * only one actually binds the context, while the others will
0775      * only be able to acquire the state_mutex once the context
0776      * is already in runnable state.
0777      */
0778     if (ctx->spu)
0779         return 0;
0780 
0781 spu_activate_top:
0782     if (signal_pending(current))
0783         return -ERESTARTSYS;
0784 
0785     spu = spu_get_idle(ctx);
0786     /*
0787      * If this is a realtime thread we try to get it running by
0788      * preempting a lower priority thread.
0789      */
0790     if (!spu && rt_prio(ctx->prio))
0791         spu = find_victim(ctx);
0792     if (spu) {
0793         unsigned long runcntl;
0794 
0795         runcntl = ctx->ops->runcntl_read(ctx);
0796         __spu_schedule(spu, ctx);
0797         if (runcntl & SPU_RUNCNTL_RUNNABLE)
0798             spuctx_switch_state(ctx, SPU_UTIL_USER);
0799 
0800         return 0;
0801     }
0802 
0803     if (ctx->flags & SPU_CREATE_NOSCHED) {
0804         spu_prio_wait(ctx);
0805         goto spu_activate_top;
0806     }
0807 
0808     spu_add_to_rq(ctx);
0809 
0810     return 0;
0811 }
0812 
0813 /**
0814  * grab_runnable_context - try to find a runnable context
0815  *
0816  * Remove and return the highest priority context on the runqueue that has
0817  * priority better than @prio and may run on @node; returns %NULL otherwise.
0818  */
0819 static struct spu_context *grab_runnable_context(int prio, int node)
0820 {
0821     struct spu_context *ctx;
0822     int best;
0823 
0824     spin_lock(&spu_prio->runq_lock);
0825     best = find_first_bit(spu_prio->bitmap, prio);
0826     while (best < prio) {
0827         struct list_head *rq = &spu_prio->runq[best];
0828 
0829         list_for_each_entry(ctx, rq, rq) {
0830             /* XXX(hch): check for affinity here as well */
0831             if (__node_allowed(ctx, node)) {
0832                 __spu_del_from_rq(ctx);
0833                 goto found;
0834             }
0835         }
0836         best++;
0837     }
0838     ctx = NULL;
0839  found:
0840     spin_unlock(&spu_prio->runq_lock);
0841     return ctx;
0842 }
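/*
 * Example of the pick above: with contexts queued at priorities 110 and
 * 125 and a caller passing prio = 120, find_first_bit() returns 110 and
 * the prio-110 context is dequeued; a caller passing prio = 105 gets
 * %NULL back because no set bit lies below 105.  The priority bitmap
 * keeps this a short bit scan instead of a walk over every waiter.
 */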
0843 
0844 static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio)
0845 {
0846     struct spu *spu = ctx->spu;
0847     struct spu_context *new = NULL;
0848 
0849     if (spu) {
0850         new = grab_runnable_context(max_prio, spu->node);
0851         if (new || force) {
0852             spu_unschedule(spu, ctx, new == NULL);
0853             if (new) {
0854                 if (new->flags & SPU_CREATE_NOSCHED)
0855                     wake_up(&new->stop_wq);
0856                 else {
0857                     spu_release(ctx);
0858                     spu_schedule(spu, new);
0859                     /* this one can't easily be made
0860                        interruptible */
0861                     mutex_lock(&ctx->state_mutex);
0862                 }
0863             }
0864         }
0865     }
0866 
0867     return new != NULL;
0868 }
0869 
0870 /**
0871  * spu_deactivate - unbind a context from its physical spu
0872  * @ctx:    spu context to unbind
0873  *
0874  * Unbind @ctx from the physical spu it is running on and schedule
0875  * the highest priority context to run on the freed physical spu.
0876  */
0877 void spu_deactivate(struct spu_context *ctx)
0878 {
0879     spu_context_nospu_trace(spu_deactivate__enter, ctx);
0880     __spu_deactivate(ctx, 1, MAX_PRIO);
0881 }
0882 
0883 /**
0884  * spu_yield -  yield a physical spu if others are waiting
0885  * @ctx:    spu context to yield
0886  *
0887  * Check if there is a higher priority context waiting and, if so,
0888  * unbind @ctx from the physical spu and schedule the highest
0889  * priority context to run on the freed physical spu instead.
0890  */
0891 void spu_yield(struct spu_context *ctx)
0892 {
0893     spu_context_nospu_trace(spu_yield__enter, ctx);
0894     if (!(ctx->flags & SPU_CREATE_NOSCHED)) {
0895         mutex_lock(&ctx->state_mutex);
0896         __spu_deactivate(ctx, 0, MAX_PRIO);
0897         mutex_unlock(&ctx->state_mutex);
0898     }
0899 }
0900 
0901 static noinline void spusched_tick(struct spu_context *ctx)
0902 {
0903     struct spu_context *new = NULL;
0904     struct spu *spu = NULL;
0905 
0906     if (spu_acquire(ctx))
0907         BUG();  /* a kernel thread never has signals pending */
0908 
0909     if (ctx->state != SPU_STATE_RUNNABLE)
0910         goto out;
0911     if (ctx->flags & SPU_CREATE_NOSCHED)
0912         goto out;
0913     if (ctx->policy == SCHED_FIFO)
0914         goto out;
0915 
0916     if (--ctx->time_slice && test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
0917         goto out;
0918 
0919     spu = ctx->spu;
0920 
0921     spu_context_trace(spusched_tick__preempt, ctx, spu);
0922 
0923     new = grab_runnable_context(ctx->prio + 1, spu->node);
0924     if (new) {
0925         spu_unschedule(spu, ctx, 0);
0926         if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
0927             spu_add_to_rq(ctx);
0928     } else {
0929         spu_context_nospu_trace(spusched_tick__newslice, ctx);
0930         if (!ctx->time_slice)
0931             ctx->time_slice++;
0932     }
0933 out:
0934     spu_release(ctx);
0935 
0936     if (new)
0937         spu_schedule(spu, new);
0938 }
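/*
 * Putting the pieces together: while contexts sit on the runqueue the
 * spusched_timer fires every SPUSCHED_TICK (10) jiffies and wakes
 * spusched_thread(), which calls spusched_tick() for each loaded context.
 * A nice-0 context with the default 10-tick slice therefore runs for
 * roughly 10 * 10 jiffies (~100 ms at HZ=1000, as assumed above) before
 * an equal-or-higher-priority waiter can preempt it here; SCHED_FIFO and
 * NOSCHED contexts are never time-sliced.
 */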
0939 
0940 /**
0941  * count_active_contexts - count the number of active contexts
0942  *
0943  * Return the number of contexts currently running or waiting to run.
0944  *
0945  * Note that we don't take runq_lock / list_mutex here.  Reading
0946  * a single 32-bit value is atomic on powerpc, and we don't care
0947  * about memory ordering issues here.
0948  */
0949 static unsigned long count_active_contexts(void)
0950 {
0951     int nr_active = 0, node;
0952 
0953     for (node = 0; node < MAX_NUMNODES; node++)
0954         nr_active += cbe_spu_info[node].nr_active;
0955     nr_active += spu_prio->nr_waiting;
0956 
0957     return nr_active;
0958 }
0959 
0960 /**
0961  * spu_calc_load - update the avenrun load estimates.
0962  *
0963  * No locking against reading these values from userspace, as for
0964  * the CPU loadavg code.
0965  */
0966 static void spu_calc_load(void)
0967 {
0968     unsigned long active_tasks; /* fixed-point */
0969 
0970     active_tasks = count_active_contexts() * FIXED_1;
0971     spu_avenrun[0] = calc_load(spu_avenrun[0], EXP_1, active_tasks);
0972     spu_avenrun[1] = calc_load(spu_avenrun[1], EXP_5, active_tasks);
0973     spu_avenrun[2] = calc_load(spu_avenrun[2], EXP_15, active_tasks);
0974 }
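/*
 * The fixed-point helpers above come from <linux/sched/loadavg.h>
 * (FSHIFT = 11, so FIXED_1 = 2048), and spuloadavg_wake() below reruns
 * this every LOAD_FREQ (roughly five seconds).  Approximately:
 *
 *   avenrun' = (avenrun * EXP_n + active * (FIXED_1 - EXP_n)) / FIXED_1
 *
 * With, say, three contexts permanently active (active = 3 * 2048), the
 * averages converge towards 6144, which show_spu_loadavg() prints as
 * "3.00"; the FIXED_1/200 added there rounds to the nearest hundredth.
 */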
0975 
0976 static void spusched_wake(struct timer_list *unused)
0977 {
0978     mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
0979     wake_up_process(spusched_task);
0980 }
0981 
0982 static void spuloadavg_wake(struct timer_list *unused)
0983 {
0984     mod_timer(&spuloadavg_timer, jiffies + LOAD_FREQ);
0985     spu_calc_load();
0986 }
0987 
0988 static int spusched_thread(void *unused)
0989 {
0990     struct spu *spu;
0991     int node;
0992 
0993     while (!kthread_should_stop()) {
0994         set_current_state(TASK_INTERRUPTIBLE);
0995         schedule();
0996         for (node = 0; node < MAX_NUMNODES; node++) {
0997             struct mutex *mtx = &cbe_spu_info[node].list_mutex;
0998 
0999             mutex_lock(mtx);
1000             list_for_each_entry(spu, &cbe_spu_info[node].spus,
1001                     cbe_list) {
1002                 struct spu_context *ctx = spu->ctx;
1003 
1004                 if (ctx) {
1005                     get_spu_context(ctx);
1006                     mutex_unlock(mtx);
1007                     spusched_tick(ctx);
1008                     mutex_lock(mtx);
1009                     put_spu_context(ctx);
1010                 }
1011             }
1012             mutex_unlock(mtx);
1013         }
1014     }
1015 
1016     return 0;
1017 }
1018 
1019 void spuctx_switch_state(struct spu_context *ctx,
1020         enum spu_utilization_state new_state)
1021 {
1022     unsigned long long curtime;
1023     signed long long delta;
1024     struct spu *spu;
1025     enum spu_utilization_state old_state;
1026     int node;
1027 
1028     curtime = ktime_get_ns();
1029     delta = curtime - ctx->stats.tstamp;
1030 
1031     WARN_ON(!mutex_is_locked(&ctx->state_mutex));
1032     WARN_ON(delta < 0);
1033 
1034     spu = ctx->spu;
1035     old_state = ctx->stats.util_state;
1036     ctx->stats.util_state = new_state;
1037     ctx->stats.tstamp = curtime;
1038 
1039     /*
1040      * Update the physical SPU utilization statistics.
1041      */
1042     if (spu) {
1043         ctx->stats.times[old_state] += delta;
1044         spu->stats.times[old_state] += delta;
1045         spu->stats.util_state = new_state;
1046         spu->stats.tstamp = curtime;
1047         node = spu->node;
1048         if (old_state == SPU_UTIL_USER)
1049             atomic_dec(&cbe_spu_info[node].busy_spus);
1050         if (new_state == SPU_UTIL_USER)
1051             atomic_inc(&cbe_spu_info[node].busy_spus);
1052     }
1053 }
1054 
1055 #ifdef CONFIG_PROC_FS
1056 static int show_spu_loadavg(struct seq_file *s, void *private)
1057 {
1058     int a, b, c;
1059 
1060     a = spu_avenrun[0] + (FIXED_1/200);
1061     b = spu_avenrun[1] + (FIXED_1/200);
1062     c = spu_avenrun[2] + (FIXED_1/200);
1063 
1064     /*
1065      * Note that last_pid doesn't really make much sense for the
1066      * SPU loadavg (it even seems very odd on the CPU side...),
1067      * but we include it here to have a 100% compatible interface.
1068      */
1069     seq_printf(s, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
1070         LOAD_INT(a), LOAD_FRAC(a),
1071         LOAD_INT(b), LOAD_FRAC(b),
1072         LOAD_INT(c), LOAD_FRAC(c),
1073         count_active_contexts(),
1074         atomic_read(&nr_spu_contexts),
1075         idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
1076     return 0;
1077 }
1078 #endif
1079 
1080 int __init spu_sched_init(void)
1081 {
1082     struct proc_dir_entry *entry;
1083     int err = -ENOMEM, i;
1084 
1085     spu_prio = kzalloc(sizeof(struct spu_prio_array), GFP_KERNEL);
1086     if (!spu_prio)
1087         goto out;
1088 
1089     for (i = 0; i < MAX_PRIO; i++) {
1090         INIT_LIST_HEAD(&spu_prio->runq[i]);
1091         __clear_bit(i, spu_prio->bitmap);
1092     }
1093     spin_lock_init(&spu_prio->runq_lock);
1094 
1095     timer_setup(&spusched_timer, spusched_wake, 0);
1096     timer_setup(&spuloadavg_timer, spuloadavg_wake, 0);
1097 
1098     spusched_task = kthread_run(spusched_thread, NULL, "spusched");
1099     if (IS_ERR(spusched_task)) {
1100         err = PTR_ERR(spusched_task);
1101         goto out_free_spu_prio;
1102     }
1103 
1104     mod_timer(&spuloadavg_timer, 0);
1105 
1106     entry = proc_create_single("spu_loadavg", 0, NULL, show_spu_loadavg);
1107     if (!entry)
1108         goto out_stop_kthread;
1109 
1110     pr_debug("spusched: tick: %d, min ticks: %d, default ticks: %d\n",
1111             SPUSCHED_TICK, MIN_SPU_TIMESLICE, DEF_SPU_TIMESLICE);
1112     return 0;
1113 
1114  out_stop_kthread:
1115     kthread_stop(spusched_task);
1116  out_free_spu_prio:
1117     kfree(spu_prio);
1118  out:
1119     return err;
1120 }
1121 
1122 void spu_sched_exit(void)
1123 {
1124     struct spu *spu;
1125     int node;
1126 
1127     remove_proc_entry("spu_loadavg", NULL);
1128 
1129     del_timer_sync(&spusched_timer);
1130     del_timer_sync(&spuloadavg_timer);
1131     kthread_stop(spusched_task);
1132 
1133     for (node = 0; node < MAX_NUMNODES; node++) {
1134         mutex_lock(&cbe_spu_info[node].list_mutex);
1135         list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list)
1136             if (spu->alloc_state != SPU_FREE)
1137                 spu->alloc_state = SPU_FREE;
1138         mutex_unlock(&cbe_spu_info[node].list_mutex);
1139     }
1140     kfree(spu_prio);
1141 }