/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * DOC: Shader validator for VC4.
 *
 * Since the VC4 has no IOMMU between it and system memory, a user
 * with access to execute shaders could escalate privilege by
 * overwriting system memory (using the VPM write address register in
 * the general-purpose DMA mode) or reading system memory it shouldn't
 * (reading it as a texture, uniform data, or direct-addressed TMU
 * lookup).
 *
 * The shader validator walks over a shader's BO, ensuring that its
 * accesses are appropriately bounded, and recording where texture
 * accesses are made so that we can do relocations for them in the
 * uniform stream.
 *
 * Shader BOs are immutable for their lifetimes (enforced by not
 * allowing mmaps, GEM prime export, or rendering to them from a CL), so
 * this validation is only performed at BO creation time.
 */
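
/* Expository note (added; not part of the original source): validation
 * happens in two passes.  vc4_validate_branches() first scans the whole
 * BO to reject unsupported branch encodings and record every branch
 * target, then vc4_validate_shader() walks the instructions in order,
 * validating writes and reads while tracking live immediates and clamps
 * per basic block.
 */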

#include "vc4_drv.h"
#include "vc4_qpu_defines.h"

#define LIVE_REG_COUNT (32 + 32 + 4)
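
/* Index layout of the live-register tracking arrays, spelled out for
 * exposition (derived from waddr_to_live_reg_index() below):
 *
 *   [0, 31]  - register file A (ra0-ra31)
 *   [32, 63] - register file B (rb0-rb31)
 *   [64, 67] - accumulators r0-r3
 */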

struct vc4_shader_validation_state {
    /* Current IP being validated. */
    uint32_t ip;

    /* IP at the end of the BO, do not read shader[max_ip] */
    uint32_t max_ip;

    uint64_t *shader;

    struct vc4_texture_sample_info tmu_setup[2];
    int tmu_write_count[2];

    /* For registers that were last written to by a MIN instruction with
     * one argument being a uniform, the address of the uniform.
     * Otherwise, ~0.
     *
     * This is used for the validation of direct address memory reads.
     */
    uint32_t live_min_clamp_offsets[LIVE_REG_COUNT];
    bool live_max_clamp_regs[LIVE_REG_COUNT];
    uint32_t live_immediates[LIVE_REG_COUNT];

    /* Bitfield of which IPs are used as branch targets.
     *
     * Used for validation that the uniform stream is updated at the right
     * points and clearing the texturing/clamping state.
     */
    unsigned long *branch_targets;

    /* Set when entering a basic block, and cleared when the uniform
     * address update is found.  This is used to make sure that we don't
     * read uniforms when the address is undefined.
     */
    bool needs_uniform_address_update;

    /* Set when we find a backwards branch.  If the branch is backwards,
     * the target is probably doing an address reset to read uniforms,
     * and so we need to be sure that a uniform address is present in the
     * stream, even if the shader didn't need to read uniforms in later
     * basic blocks.
     */
    bool needs_uniform_address_for_loop;

    /* Set when we find an instruction writing the top half of the
     * register files.  If we allowed writing the unusable regs in
     * a threaded shader, then the clamp validation for the other
     * shader running on our QPU would be invalid.
     */
    bool all_registers_used;
};

static uint32_t
waddr_to_live_reg_index(uint32_t waddr, bool is_b)
{
    if (waddr < 32) {
        if (is_b)
            return 32 + waddr;
        else
            return waddr;
    } else if (waddr <= QPU_W_ACC3) {
        return 64 + waddr - QPU_W_ACC0;
    } else {
        return ~0;
    }
}

static uint32_t
raddr_add_a_to_live_reg_index(uint64_t inst)
{
    uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
    uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
    uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
    uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);

    if (add_a == QPU_MUX_A)
        return raddr_a;
    else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM)
        return 32 + raddr_b;
    else if (add_a <= QPU_MUX_R3)
        return 64 + add_a;
    else
        return ~0;
}

static bool
live_reg_is_upper_half(uint32_t lri)
{
    return  (lri >= 16 && lri < 32) ||
        (lri >= 32 + 16 && lri < 32 + 32);
}
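
/* Expository note: a threaded shader shares the QPU with a second shader,
 * and each thread may only use the lower half (0-15) of each register
 * file.  Touching the upper half sets all_registers_used, which
 * vc4_validate_shader() rejects for threaded shaders at the end.
 */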

static bool
is_tmu_submit(uint32_t waddr)
{
    return (waddr == QPU_W_TMU0_S ||
        waddr == QPU_W_TMU1_S);
}

static bool
is_tmu_write(uint32_t waddr)
{
    return (waddr >= QPU_W_TMU0_S &&
        waddr <= QPU_W_TMU1_B);
}

static bool
record_texture_sample(struct vc4_validated_shader_info *validated_shader,
              struct vc4_shader_validation_state *validation_state,
              int tmu)
{
    uint32_t s = validated_shader->num_texture_samples;
    int i;
    struct vc4_texture_sample_info *temp_samples;

    temp_samples = krealloc(validated_shader->texture_samples,
                (s + 1) * sizeof(*temp_samples),
                GFP_KERNEL);
    if (!temp_samples)
        return false;

    memcpy(&temp_samples[s],
           &validation_state->tmu_setup[tmu],
           sizeof(*temp_samples));

    validated_shader->num_texture_samples = s + 1;
    validated_shader->texture_samples = temp_samples;

    for (i = 0; i < 4; i++)
        validation_state->tmu_setup[tmu].p_offset[i] = ~0;

    return true;
}

static bool
check_tmu_write(struct vc4_validated_shader_info *validated_shader,
        struct vc4_shader_validation_state *validation_state,
        bool is_mul)
{
    uint64_t inst = validation_state->shader[validation_state->ip];
    uint32_t waddr = (is_mul ?
              QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
              QPU_GET_FIELD(inst, QPU_WADDR_ADD));
    uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
    uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
    int tmu = waddr > QPU_W_TMU0_B;
    bool submit = is_tmu_submit(waddr);
    bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0;
    uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

    if (is_direct) {
        uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
        uint32_t clamp_reg, clamp_offset;

        if (sig == QPU_SIG_SMALL_IMM) {
            DRM_DEBUG("direct TMU read used small immediate\n");
            return false;
        }

        /* Make sure that this texture load is an add of the base
         * address of the UBO to a clamped offset within the UBO.
         */
        if (is_mul ||
            QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
            DRM_DEBUG("direct TMU load wasn't an add\n");
            return false;
        }

        /* We assert that the clamped address is the first
         * argument, and the UBO base address is the second argument.
         * This is arbitrary, but simpler than supporting flipping the
         * two either way.
         */
        clamp_reg = raddr_add_a_to_live_reg_index(inst);
        if (clamp_reg == ~0) {
            DRM_DEBUG("direct TMU load wasn't clamped\n");
            return false;
        }

        clamp_offset = validation_state->live_min_clamp_offsets[clamp_reg];
        if (clamp_offset == ~0) {
            DRM_DEBUG("direct TMU load wasn't clamped\n");
            return false;
        }

        /* Store the clamp value's offset in p1 (see reloc_tex() in
         * vc4_validate.c).
         */
        validation_state->tmu_setup[tmu].p_offset[1] =
            clamp_offset;

        if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
            !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
            DRM_DEBUG("direct TMU load didn't add to a uniform\n");
            return false;
        }

        validation_state->tmu_setup[tmu].is_direct = true;
    } else {
        if (raddr_a == QPU_R_UNIF || (sig != QPU_SIG_SMALL_IMM &&
                          raddr_b == QPU_R_UNIF)) {
            DRM_DEBUG("uniform read in the same instruction as "
                  "texture setup.\n");
            return false;
        }
    }

    if (validation_state->tmu_write_count[tmu] >= 4) {
        DRM_DEBUG("TMU%d got too many parameters before dispatch\n",
              tmu);
        return false;
    }
    validation_state->tmu_setup[tmu].p_offset[validation_state->tmu_write_count[tmu]] =
        validated_shader->uniforms_size;
    validation_state->tmu_write_count[tmu]++;
    /* Since direct uses a RADDR uniform reference, it will get counted in
     * check_instruction_reads()
     */
    if (!is_direct) {
        if (validation_state->needs_uniform_address_update) {
            DRM_DEBUG("Texturing with undefined uniform address\n");
            return false;
        }

        validated_shader->uniforms_size += 4;
    }

    if (submit) {
        if (!record_texture_sample(validated_shader,
                       validation_state, tmu)) {
            return false;
        }

        validation_state->tmu_write_count[tmu] = 0;
    }

    return true;
}
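
/* Illustration (informal QPU assembly, not taken from the source) of the
 * only direct-addressed TMU pattern that check_tmu_write() accepts, with
 * ra4/ra5 as arbitrary example registers:
 *
 *   max  ra4, x, 0       ; offset clamped to >= 0
 *   min  ra5, ra4, unif  ; offset clamped to <= a uniform (the UBO size)
 *   add  t0s, ra5, unif  ; UBO base uniform + clamped offset, submits TMU0
 *
 * The first two instructions are recognized by track_live_clamps(); the
 * add is what lands here with is_direct set.
 */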

static bool require_uniform_address_uniform(struct vc4_validated_shader_info *validated_shader)
{
    uint32_t o = validated_shader->num_uniform_addr_offsets;
    uint32_t num_uniforms = validated_shader->uniforms_size / 4;

    validated_shader->uniform_addr_offsets =
        krealloc(validated_shader->uniform_addr_offsets,
             (o + 1) *
             sizeof(*validated_shader->uniform_addr_offsets),
             GFP_KERNEL);
    if (!validated_shader->uniform_addr_offsets)
        return false;

    validated_shader->uniform_addr_offsets[o] = num_uniforms;
    validated_shader->num_uniform_addr_offsets++;

    return true;
}
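
/* Expository note: each recorded entry is an index in 4-byte uniform
 * slots (hence uniforms_size / 4) marking where an address uniform sits
 * in the stream, presumably so the uniform-stream relocation code in
 * vc4_validate.c can patch in the real uniforms base address there.
 */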

static bool
validate_uniform_address_write(struct vc4_validated_shader_info *validated_shader,
                   struct vc4_shader_validation_state *validation_state,
                   bool is_mul)
{
    uint64_t inst = validation_state->shader[validation_state->ip];
    u32 add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
    u32 raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
    u32 raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
    u32 add_lri = raddr_add_a_to_live_reg_index(inst);
    /* We want our reset to be pointing at whatever uniform follows the
     * uniforms base address.
     */
    u32 expected_offset = validated_shader->uniforms_size + 4;

    /* We only support absolute uniform address changes, and we
     * require that they be in the current basic block before any
     * of its uniform reads.
     *
     * One could potentially emit more efficient QPU code, by
     * noticing that (say) an if statement does uniform control
     * flow for all threads and that the if reads the same number
     * of uniforms on each side.  However, this scheme is easy to
     * validate so it's all we allow for now.
     */
    switch (QPU_GET_FIELD(inst, QPU_SIG)) {
    case QPU_SIG_NONE:
    case QPU_SIG_SCOREBOARD_UNLOCK:
    case QPU_SIG_COLOR_LOAD:
    case QPU_SIG_LOAD_TMU0:
    case QPU_SIG_LOAD_TMU1:
        break;
    default:
        DRM_DEBUG("uniforms address change must be "
              "normal math\n");
        return false;
    }

    if (is_mul || QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
        DRM_DEBUG("Uniform address reset must be an ADD.\n");
        return false;
    }

    if (QPU_GET_FIELD(inst, QPU_COND_ADD) != QPU_COND_ALWAYS) {
        DRM_DEBUG("Uniform address reset must be unconditional.\n");
        return false;
    }

    if (QPU_GET_FIELD(inst, QPU_PACK) != QPU_PACK_A_NOP &&
        !(inst & QPU_PM)) {
        DRM_DEBUG("No packing allowed on uniforms reset\n");
        return false;
    }

    if (add_lri == -1) {
        DRM_DEBUG("First argument of uniform address write must be "
              "an immediate value.\n");
        return false;
    }

    if (validation_state->live_immediates[add_lri] != expected_offset) {
        DRM_DEBUG("Resetting uniforms with offset %db instead of %db\n",
              validation_state->live_immediates[add_lri],
              expected_offset);
        return false;
    }

    if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
        !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
        DRM_DEBUG("Second argument of uniform address write must be "
              "a uniform.\n");
        return false;
    }

    validation_state->needs_uniform_address_update = false;
    validation_state->needs_uniform_address_for_loop = false;
    return require_uniform_address_uniform(validated_shader);
}
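
/* Illustration (informal QPU assembly, not taken from the source) of an
 * accepted uniforms-address reset when N uniforms (4 bytes each) have
 * been counted so far, so expected_offset == 4 * N + 4:
 *
 *   load_imm ra1, 4*N+4           ; unconditional, no pack
 *   add      unif_addr, ra1, unif ; immediate + uniforms base uniform
 *
 * ra1 is an arbitrary example register; the immediate points at the
 * uniform just after the base-address uniform being consumed here.
 */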

static bool
check_reg_write(struct vc4_validated_shader_info *validated_shader,
        struct vc4_shader_validation_state *validation_state,
        bool is_mul)
{
    uint64_t inst = validation_state->shader[validation_state->ip];
    uint32_t waddr = (is_mul ?
              QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
              QPU_GET_FIELD(inst, QPU_WADDR_ADD));
    uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
    bool ws = inst & QPU_WS;
    bool is_b = is_mul ^ ws;
    u32 lri = waddr_to_live_reg_index(waddr, is_b);

    if (lri != -1) {
        uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
        uint32_t cond_mul = QPU_GET_FIELD(inst, QPU_COND_MUL);

        if (sig == QPU_SIG_LOAD_IMM &&
            QPU_GET_FIELD(inst, QPU_PACK) == QPU_PACK_A_NOP &&
            ((is_mul && cond_mul == QPU_COND_ALWAYS) ||
             (!is_mul && cond_add == QPU_COND_ALWAYS))) {
            validation_state->live_immediates[lri] =
                QPU_GET_FIELD(inst, QPU_LOAD_IMM);
        } else {
            validation_state->live_immediates[lri] = ~0;
        }

        if (live_reg_is_upper_half(lri))
            validation_state->all_registers_used = true;
    }

    switch (waddr) {
    case QPU_W_UNIFORMS_ADDRESS:
        if (is_b) {
            DRM_DEBUG("relative uniforms address change "
                  "unsupported\n");
            return false;
        }

        return validate_uniform_address_write(validated_shader,
                              validation_state,
                              is_mul);

    case QPU_W_TLB_COLOR_MS:
    case QPU_W_TLB_COLOR_ALL:
    case QPU_W_TLB_Z:
        /* These only interact with the tile buffer, not main memory,
         * so they're safe.
         */
        return true;

    case QPU_W_TMU0_S:
    case QPU_W_TMU0_T:
    case QPU_W_TMU0_R:
    case QPU_W_TMU0_B:
    case QPU_W_TMU1_S:
    case QPU_W_TMU1_T:
    case QPU_W_TMU1_R:
    case QPU_W_TMU1_B:
        return check_tmu_write(validated_shader, validation_state,
                       is_mul);

    case QPU_W_HOST_INT:
    case QPU_W_TMU_NOSWAP:
    case QPU_W_TLB_ALPHA_MASK:
    case QPU_W_MUTEX_RELEASE:
        /* XXX: I haven't thought about these, so don't support them
         * for now.
         */
        DRM_DEBUG("Unsupported waddr %d\n", waddr);
        return false;

    case QPU_W_VPM_ADDR:
        DRM_DEBUG("General VPM DMA unsupported\n");
        return false;

    case QPU_W_VPM:
    case QPU_W_VPMVCD_SETUP:
        /* We allow VPM setup in general, even including VPM DMA
         * configuration setup, because the (unsafe) DMA can only be
         * triggered by QPU_W_VPM_ADDR writes.
         */
        return true;

    case QPU_W_TLB_STENCIL_SETUP:
        return true;
    }

    return true;
}

static void
track_live_clamps(struct vc4_validated_shader_info *validated_shader,
          struct vc4_shader_validation_state *validation_state)
{
    uint64_t inst = validation_state->shader[validation_state->ip];
    uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD);
    uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
    uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
    uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
    uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
    uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
    uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
    uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
    uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
    bool ws = inst & QPU_WS;
    uint32_t lri_add_a, lri_add, lri_mul;
    bool add_a_is_min_0;

    /* Check whether OP_ADD's A argument comes from a live MAX(x, 0),
     * before we clear previous live state.
     */
    lri_add_a = raddr_add_a_to_live_reg_index(inst);
    add_a_is_min_0 = (lri_add_a != ~0 &&
              validation_state->live_max_clamp_regs[lri_add_a]);

    /* Clear live state for registers written by our instruction. */
    lri_add = waddr_to_live_reg_index(waddr_add, ws);
    lri_mul = waddr_to_live_reg_index(waddr_mul, !ws);
    if (lri_mul != ~0) {
        validation_state->live_max_clamp_regs[lri_mul] = false;
        validation_state->live_min_clamp_offsets[lri_mul] = ~0;
    }
    if (lri_add != ~0) {
        validation_state->live_max_clamp_regs[lri_add] = false;
        validation_state->live_min_clamp_offsets[lri_add] = ~0;
    } else {
        /* Nothing further to do for live tracking, since only ADDs
         * generate new live clamp registers.
         */
        return;
    }

    /* Now, handle remaining live clamp tracking for the ADD operation. */

    if (cond_add != QPU_COND_ALWAYS)
        return;

    if (op_add == QPU_A_MAX) {
        /* Track live clamps of a value to a minimum of 0 (in either
         * arg).
         */
        if (sig != QPU_SIG_SMALL_IMM || raddr_b != 0 ||
            (add_a != QPU_MUX_B && add_b != QPU_MUX_B)) {
            return;
        }

        validation_state->live_max_clamp_regs[lri_add] = true;
    } else if (op_add == QPU_A_MIN) {
        /* Track live clamps of a value clamped to a minimum of 0 and
         * a maximum of some uniform's offset.
         */
        if (!add_a_is_min_0)
            return;

        if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
            !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
              sig != QPU_SIG_SMALL_IMM)) {
            return;
        }

        validation_state->live_min_clamp_offsets[lri_add] =
            validated_shader->uniforms_size;
    }
}
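
/* Expository summary of the state machine above, using ra4/ra5 as
 * example registers:
 *
 *   max ra4, x, 0       -> live_max_clamp_regs[ra4] = true
 *   min ra5, ra4, unif  -> live_min_clamp_offsets[ra5] = offset of that
 *                          uniform in the stream
 *
 * check_tmu_write() later requires exactly such a min-result register as
 * the offset argument of a direct TMU load.
 */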

static bool
check_instruction_writes(struct vc4_validated_shader_info *validated_shader,
             struct vc4_shader_validation_state *validation_state)
{
    uint64_t inst = validation_state->shader[validation_state->ip];
    uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
    uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
    bool ok;

    if (is_tmu_write(waddr_add) && is_tmu_write(waddr_mul)) {
        DRM_DEBUG("ADD and MUL both set up textures\n");
        return false;
    }

    ok = (check_reg_write(validated_shader, validation_state, false) &&
          check_reg_write(validated_shader, validation_state, true));

    track_live_clamps(validated_shader, validation_state);

    return ok;
}

static bool
check_branch(uint64_t inst,
         struct vc4_validated_shader_info *validated_shader,
         struct vc4_shader_validation_state *validation_state,
         int ip)
{
    int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
    uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
    uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

    if ((int)branch_imm < 0)
        validation_state->needs_uniform_address_for_loop = true;

    /* We don't want to have to worry about validation of this, and
     * there's no need for it.
     */
    if (waddr_add != QPU_W_NOP || waddr_mul != QPU_W_NOP) {
        DRM_DEBUG("branch instruction at %d wrote a register.\n",
              validation_state->ip);
        return false;
    }

    return true;
}

static bool
check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
            struct vc4_shader_validation_state *validation_state)
{
    uint64_t inst = validation_state->shader[validation_state->ip];
    uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
    uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
    uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

    if (raddr_a == QPU_R_UNIF ||
        (raddr_b == QPU_R_UNIF && sig != QPU_SIG_SMALL_IMM)) {
        /* This can't overflow the uint32_t, because we're reading 8
         * bytes of instruction to increment by 4 here, so we'd
         * already be OOM.
         */
        validated_shader->uniforms_size += 4;

        if (validation_state->needs_uniform_address_update) {
            DRM_DEBUG("Uniform read with undefined uniform "
                  "address\n");
            return false;
        }
    }

    if ((raddr_a >= 16 && raddr_a < 32) ||
        (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
        validation_state->all_registers_used = true;
    }

    return true;
}

/* Make sure that all branches are absolute and point within the shader, and
 * note their targets for later.
 */
static bool
vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
{
    uint32_t max_branch_target = 0;
    int ip;
    int last_branch = -2;

    for (ip = 0; ip < validation_state->max_ip; ip++) {
        uint64_t inst = validation_state->shader[ip];
        int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        uint32_t after_delay_ip = ip + 4;
        uint32_t branch_target_ip;

        if (sig == QPU_SIG_PROG_END) {
            /* There are two delay slots after program end is
             * signaled that are still executed, then we're
             * finished.  validation_state->max_ip is the
             * instruction after the last valid instruction in the
             * program.
             */
            validation_state->max_ip = ip + 3;
            continue;
        }

        if (sig != QPU_SIG_BRANCH)
            continue;

        if (ip - last_branch < 4) {
            DRM_DEBUG("Branch at %d during delay slots\n", ip);
            return false;
        }
        last_branch = ip;

        if (inst & QPU_BRANCH_REG) {
            DRM_DEBUG("branching from register relative "
                  "not supported\n");
            return false;
        }

        if (!(inst & QPU_BRANCH_REL)) {
            DRM_DEBUG("relative branching required\n");
            return false;
        }

        /* The actual branch target is the instruction after the delay
         * slots, plus whatever byte offset is in the low 32 bits of
         * the instruction.  Make sure we're not branching beyond the
         * end of the shader object.
         */
        if (branch_imm % sizeof(inst) != 0) {
            DRM_DEBUG("branch target not aligned\n");
            return false;
        }

        branch_target_ip = after_delay_ip + (branch_imm >> 3);
        if (branch_target_ip >= validation_state->max_ip) {
            DRM_DEBUG("Branch at %d outside of shader (ip %d/%d)\n",
                  ip, branch_target_ip,
                  validation_state->max_ip);
            return false;
        }
        set_bit(branch_target_ip, validation_state->branch_targets);

        /* Make sure that the non-branching path is also not outside
         * the shader.
         */
        if (after_delay_ip >= validation_state->max_ip) {
            DRM_DEBUG("Branch at %d continues past shader end "
                  "(%d/%d)\n",
                  ip, after_delay_ip, validation_state->max_ip);
            return false;
        }
        set_bit(after_delay_ip, validation_state->branch_targets);
        max_branch_target = max(max_branch_target, after_delay_ip);
    }

    if (max_branch_target > validation_state->max_ip - 3) {
        DRM_DEBUG("Branch landed after QPU_SIG_PROG_END");
        return false;
    }

    return true;
}
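
/* Worked example (added for exposition): a relative branch at ip 10 with
 * branch_imm == -32 resolves to
 *
 *   branch_target_ip = (10 + 4) + (-32 >> 3) = 10
 *
 * i.e. after the three delay slots execute, control re-enters at the
 * branch itself, so both ip 10 (the target) and ip 14 (the fall-through
 * path) get marked in branch_targets.
 */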

/* Resets any known state for the shader, used when we may be branched to from
 * multiple locations in the program (or at shader start).
 */
static void
reset_validation_state(struct vc4_shader_validation_state *validation_state)
{
    int i;

    for (i = 0; i < 8; i++)
        validation_state->tmu_setup[i / 4].p_offset[i % 4] = ~0;

    for (i = 0; i < LIVE_REG_COUNT; i++) {
        validation_state->live_min_clamp_offsets[i] = ~0;
        validation_state->live_max_clamp_regs[i] = false;
        validation_state->live_immediates[i] = ~0;
    }
}

static bool
texturing_in_progress(struct vc4_shader_validation_state *validation_state)
{
    return (validation_state->tmu_write_count[0] != 0 ||
        validation_state->tmu_write_count[1] != 0);
}

static bool
vc4_handle_branch_target(struct vc4_shader_validation_state *validation_state)
{
    uint32_t ip = validation_state->ip;

    if (!test_bit(ip, validation_state->branch_targets))
        return true;

    if (texturing_in_progress(validation_state)) {
        DRM_DEBUG("Branch target landed during TMU setup\n");
        return false;
    }

    /* Reset our live values tracking, since this instruction may have
     * multiple predecessors.
     *
     * One could potentially do analysis to determine that, for
     * example, all predecessors have a live max clamp in the same
     * register, but we don't bother with that.
     */
    reset_validation_state(validation_state);

    /* Since we've entered a basic block from potentially multiple
     * predecessors, we need the uniforms address to be updated before any
     * uniforms are read.  We require that after any branch point, the next
     * uniform to be loaded is a uniform address offset.  That uniform's
     * offset will be marked by the uniform address register write
     * validation, or by a one-off end-of-program check.
     */
    validation_state->needs_uniform_address_update = true;

    return true;
}

struct vc4_validated_shader_info *
vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
{
    struct vc4_dev *vc4 = to_vc4_dev(shader_obj->base.dev);
    bool found_shader_end = false;
    int shader_end_ip = 0;
    uint32_t last_thread_switch_ip = -3;
    uint32_t ip;
    struct vc4_validated_shader_info *validated_shader = NULL;
    struct vc4_shader_validation_state validation_state;

    if (WARN_ON_ONCE(vc4->is_vc5))
        return NULL;

    memset(&validation_state, 0, sizeof(validation_state));
    validation_state.shader = shader_obj->vaddr;
    validation_state.max_ip = shader_obj->base.size / sizeof(uint64_t);

    reset_validation_state(&validation_state);

    validation_state.branch_targets =
        kcalloc(BITS_TO_LONGS(validation_state.max_ip),
            sizeof(unsigned long), GFP_KERNEL);
    if (!validation_state.branch_targets)
        goto fail;

    validated_shader = kcalloc(1, sizeof(*validated_shader), GFP_KERNEL);
    if (!validated_shader)
        goto fail;

    if (!vc4_validate_branches(&validation_state))
        goto fail;

    for (ip = 0; ip < validation_state.max_ip; ip++) {
        uint64_t inst = validation_state.shader[ip];
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        validation_state.ip = ip;

        if (!vc4_handle_branch_target(&validation_state))
            goto fail;

        if (ip == last_thread_switch_ip + 3) {
            /* Reset r0-r3 live clamp data */
            int i;

            for (i = 64; i < LIVE_REG_COUNT; i++) {
                validation_state.live_min_clamp_offsets[i] = ~0;
                validation_state.live_max_clamp_regs[i] = false;
                validation_state.live_immediates[i] = ~0;
            }
        }

        switch (sig) {
        case QPU_SIG_NONE:
        case QPU_SIG_WAIT_FOR_SCOREBOARD:
        case QPU_SIG_SCOREBOARD_UNLOCK:
        case QPU_SIG_COLOR_LOAD:
        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
        case QPU_SIG_PROG_END:
        case QPU_SIG_SMALL_IMM:
        case QPU_SIG_THREAD_SWITCH:
        case QPU_SIG_LAST_THREAD_SWITCH:
            if (!check_instruction_writes(validated_shader,
                              &validation_state)) {
                DRM_DEBUG("Bad write at ip %d\n", ip);
                goto fail;
            }

            if (!check_instruction_reads(validated_shader,
                             &validation_state))
                goto fail;

            if (sig == QPU_SIG_PROG_END) {
                found_shader_end = true;
                shader_end_ip = ip;
            }

            if (sig == QPU_SIG_THREAD_SWITCH ||
                sig == QPU_SIG_LAST_THREAD_SWITCH) {
                validated_shader->is_threaded = true;

                if (ip < last_thread_switch_ip + 3) {
                    DRM_DEBUG("Thread switch too soon after "
                          "last switch at ip %d\n", ip);
                    goto fail;
                }
                last_thread_switch_ip = ip;
            }

            break;

        case QPU_SIG_LOAD_IMM:
            if (!check_instruction_writes(validated_shader,
                              &validation_state)) {
                DRM_DEBUG("Bad LOAD_IMM write at ip %d\n", ip);
                goto fail;
            }
            break;

        case QPU_SIG_BRANCH:
            if (!check_branch(inst, validated_shader,
                      &validation_state, ip))
                goto fail;

            if (ip < last_thread_switch_ip + 3) {
                DRM_DEBUG("Branch in thread switch at ip %d",
                      ip);
                goto fail;
            }

            break;
        default:
            DRM_DEBUG("Unsupported QPU signal %d at "
                  "instruction %d\n", sig, ip);
            goto fail;
        }

        /* There are two delay slots after program end is signaled
         * that are still executed, then we're finished.
         */
        if (found_shader_end && ip == shader_end_ip + 2)
            break;
    }

    if (ip == validation_state.max_ip) {
        DRM_DEBUG("shader failed to terminate before "
              "shader BO end at %zd\n",
              shader_obj->base.size);
        goto fail;
    }

    /* Might corrupt other thread */
    if (validated_shader->is_threaded &&
        validation_state.all_registers_used) {
        DRM_DEBUG("Shader uses threading, but uses the upper "
              "half of the registers, too\n");
        goto fail;
    }

    /* If we did a backwards branch and we haven't emitted a uniforms
     * reset since then, we still need the uniforms stream to have the
     * uniforms address available so that the backwards branch can do its
     * uniforms reset.
     *
     * We could potentially prove that the backwards branch doesn't
     * contain any uses of uniforms until program exit, but that doesn't
     * seem to be worth the trouble.
     */
    if (validation_state.needs_uniform_address_for_loop) {
        if (!require_uniform_address_uniform(validated_shader))
            goto fail;
        validated_shader->uniforms_size += 4;
    }

    /* Again, no chance of integer overflow here because the worst case
     * scenario is 8 bytes of uniforms plus handles per 8-byte
     * instruction.
     */
    validated_shader->uniforms_src_size =
        (validated_shader->uniforms_size +
         4 * validated_shader->num_texture_samples);

    kfree(validation_state.branch_targets);

    return validated_shader;

fail:
    kfree(validation_state.branch_targets);
    if (validated_shader) {
        kfree(validated_shader->uniform_addr_offsets);
        kfree(validated_shader->texture_samples);
        kfree(validated_shader);
    }
    return NULL;
}
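
/* Usage sketch (added for exposition; the exact caller shape in vc4_bo.c
 * is assumed, not quoted): the shader-create ioctl validates at BO
 * creation time, roughly:
 *
 *   bo->validated_shader = vc4_validate_shader(&bo->base);
 *   if (!bo->validated_shader)
 *       return -EINVAL;
 *
 * Since shader BOs are immutable afterwards, the returned
 * vc4_validated_shader_info can be reused for every job that binds the
 * shader, with no revalidation at submit time.
 */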