0001 // SPDX-License-Identifier: MIT
0002 /*
0003  * Copyright © 2008-2015 Intel Corporation
0004  */
0005 
0006 #include <linux/highmem.h>
0007 
0008 #include "i915_drv.h"
0009 #include "i915_reg.h"
0010 #include "i915_scatterlist.h"
0011 #include "i915_pvinfo.h"
0012 #include "i915_vgpu.h"
0013 #include "intel_gt_regs.h"
0014 #include "intel_mchbar_regs.h"
0015 
0016 /**
0017  * DOC: fence register handling
0018  *
0019  * Important to avoid confusion: "fences" in the i915 driver are not execution
0020  * fences used to track command completion but hardware detiler objects which
0021  * wrap a given range of the global GTT. Each platform has only a fairly limited
0022  * set of these objects.
0023  *
0024  * Fences are used to detile GTT memory mappings. They're also connected to the
0025  * hardware frontbuffer render tracking and hence interact with frontbuffer
0026  * compression. Furthermore on older platforms fences are required for tiled
0027  * objects used by the display engine. They can also be used by the render
0028  * engine - they're required for blitter commands and are optional for render
0029  * commands. But on gen4+ both display (with the exception of fbc) and rendering
0030  * have their own tiling state bits and don't need fences.
0031  *
0032  * Also note that fences only support X and Y tiling and hence can't be used for
0033  * the fancier new tiling formats like W, Ys and Yf.
0034  *
0035  * Finally note that because fences are such a restricted resource they're
0036  * dynamically associated with objects. Furthermore fence state is committed to
0037  * the hardware lazily to avoid unnecessary stalls on gen2/3. Therefore code must
0038  * explicitly call i915_gem_object_get_fence() to synchronize fencing status
0039  * for CPU access. Also note that some code wants an unfenced view, for those
0040  * cases the fence can be removed forcefully with i915_gem_object_put_fence().
0041  *
0042  * Internally these functions will synchronize with userspace access by removing
0043  * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed.
0044  */
0045 
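/*
 * Illustrative sketch (not part of the upstream file): a caller that wants the
 * unfenced (linear) view mentioned above can force-remove the fence with
 * i915_vma_revoke_fence() below. example_force_unfenced_view() is a
 * hypothetical helper; it assumes the fence is unpinned and idle and that the
 * caller is allowed to take the GGTT vm mutex.
 */
static void example_force_unfenced_view(struct i915_vma *vma)
{
    /* i915_vma_revoke_fence() asserts that vm->mutex is held. */
    mutex_lock(&vma->vm->mutex);
    i915_vma_revoke_fence(vma);
    mutex_unlock(&vma->vm->mutex);

    /* Subsequent GGTT access to this vma is linear (no detiling). */
}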
0046 #define pipelined 0
0047 
0048 static struct drm_i915_private *fence_to_i915(struct i915_fence_reg *fence)
0049 {
0050     return fence->ggtt->vm.i915;
0051 }
0052 
0053 static struct intel_uncore *fence_to_uncore(struct i915_fence_reg *fence)
0054 {
0055     return fence->ggtt->vm.gt->uncore;
0056 }
0057 
0058 static void i965_write_fence_reg(struct i915_fence_reg *fence)
0059 {
0060     i915_reg_t fence_reg_lo, fence_reg_hi;
0061     int fence_pitch_shift;
0062     u64 val;
0063 
0064     if (GRAPHICS_VER(fence_to_i915(fence)) >= 6) {
0065         fence_reg_lo = FENCE_REG_GEN6_LO(fence->id);
0066         fence_reg_hi = FENCE_REG_GEN6_HI(fence->id);
0067         fence_pitch_shift = GEN6_FENCE_PITCH_SHIFT;
0068 
0069     } else {
0070         fence_reg_lo = FENCE_REG_965_LO(fence->id);
0071         fence_reg_hi = FENCE_REG_965_HI(fence->id);
0072         fence_pitch_shift = I965_FENCE_PITCH_SHIFT;
0073     }
0074 
0075     val = 0;
0076     if (fence->tiling) {
0077         unsigned int stride = fence->stride;
0078 
0079         GEM_BUG_ON(!IS_ALIGNED(stride, 128));
0080 
0081         val = fence->start + fence->size - I965_FENCE_PAGE;
0082         val <<= 32;
0083         val |= fence->start;
0084         val |= (u64)((stride / 128) - 1) << fence_pitch_shift;
0085         if (fence->tiling == I915_TILING_Y)
0086             val |= BIT(I965_FENCE_TILING_Y_SHIFT);
0087         val |= I965_FENCE_REG_VALID;
0088     }
0089 
0090     if (!pipelined) {
0091         struct intel_uncore *uncore = fence_to_uncore(fence);
0092 
0093         /*
0094          * To w/a incoherency with non-atomic 64-bit register updates,
0095          * we split the 64-bit update into two 32-bit writes. In order
0096          * for a partial fence not to be evaluated between writes, we
0097          * precede the update with write to turn off the fence register,
0098          * and only enable the fence as the last step.
0099          *
0100          * For extra levels of paranoia, we make sure each step lands
0101          * before applying the next step.
0102          */
0103         intel_uncore_write_fw(uncore, fence_reg_lo, 0);
0104         intel_uncore_posting_read_fw(uncore, fence_reg_lo);
0105 
0106         intel_uncore_write_fw(uncore, fence_reg_hi, upper_32_bits(val));
0107         intel_uncore_write_fw(uncore, fence_reg_lo, lower_32_bits(val));
0108         intel_uncore_posting_read_fw(uncore, fence_reg_lo);
0109     }
0110 }
0111 
0112 static void i915_write_fence_reg(struct i915_fence_reg *fence)
0113 {
0114     u32 val;
0115 
0116     val = 0;
0117     if (fence->tiling) {
0118         unsigned int stride = fence->stride;
0119         unsigned int tiling = fence->tiling;
0120         bool is_y_tiled = tiling == I915_TILING_Y;
0121 
0122         if (is_y_tiled && HAS_128_BYTE_Y_TILING(fence_to_i915(fence)))
0123             stride /= 128;
0124         else
0125             stride /= 512;
0126         GEM_BUG_ON(!is_power_of_2(stride));
0127 
0128         val = fence->start;
0129         if (is_y_tiled)
0130             val |= BIT(I830_FENCE_TILING_Y_SHIFT);
0131         val |= I915_FENCE_SIZE_BITS(fence->size);
0132         val |= ilog2(stride) << I830_FENCE_PITCH_SHIFT;
0133 
0134         val |= I830_FENCE_REG_VALID;
0135     }
0136 
0137     if (!pipelined) {
0138         struct intel_uncore *uncore = fence_to_uncore(fence);
0139         i915_reg_t reg = FENCE_REG(fence->id);
0140 
0141         intel_uncore_write_fw(uncore, reg, val);
0142         intel_uncore_posting_read_fw(uncore, reg);
0143     }
0144 }
0145 
0146 static void i830_write_fence_reg(struct i915_fence_reg *fence)
0147 {
0148     u32 val;
0149 
0150     val = 0;
0151     if (fence->tiling) {
0152         unsigned int stride = fence->stride;
0153 
0154         val = fence->start;
0155         if (fence->tiling == I915_TILING_Y)
0156             val |= BIT(I830_FENCE_TILING_Y_SHIFT);
0157         val |= I830_FENCE_SIZE_BITS(fence->size);
0158         val |= ilog2(stride / 128) << I830_FENCE_PITCH_SHIFT;
0159         val |= I830_FENCE_REG_VALID;
0160     }
0161 
0162     if (!pipelined) {
0163         struct intel_uncore *uncore = fence_to_uncore(fence);
0164         i915_reg_t reg = FENCE_REG(fence->id);
0165 
0166         intel_uncore_write_fw(uncore, reg, val);
0167         intel_uncore_posting_read_fw(uncore, reg);
0168     }
0169 }
0170 
0171 static void fence_write(struct i915_fence_reg *fence)
0172 {
0173     struct drm_i915_private *i915 = fence_to_i915(fence);
0174 
0175     /*
0176      * Previous access through the fence register is marshalled by
0177      * the mb() inside the fault handlers (i915_gem_release_mmaps)
0178      * and explicitly managed for internal users.
0179      */
0180 
0181     if (GRAPHICS_VER(i915) == 2)
0182         i830_write_fence_reg(fence);
0183     else if (GRAPHICS_VER(i915) == 3)
0184         i915_write_fence_reg(fence);
0185     else
0186         i965_write_fence_reg(fence);
0187 
0188     /*
0189      * Access through the fenced region afterwards is
0190      * ordered by the posting reads whilst writing the registers.
0191      */
0192 }
0193 
0194 static bool gpu_uses_fence_registers(struct i915_fence_reg *fence)
0195 {
0196     return GRAPHICS_VER(fence_to_i915(fence)) < 4;
0197 }
0198 
0199 static int fence_update(struct i915_fence_reg *fence,
0200             struct i915_vma *vma)
0201 {
0202     struct i915_ggtt *ggtt = fence->ggtt;
0203     struct intel_uncore *uncore = fence_to_uncore(fence);
0204     intel_wakeref_t wakeref;
0205     struct i915_vma *old;
0206     int ret;
0207 
0208     fence->tiling = 0;
0209     if (vma) {
0210         GEM_BUG_ON(!i915_gem_object_get_stride(vma->obj) ||
0211                !i915_gem_object_get_tiling(vma->obj));
0212 
0213         if (!i915_vma_is_map_and_fenceable(vma))
0214             return -EINVAL;
0215 
0216         if (gpu_uses_fence_registers(fence)) {
0217             /* implicit 'unfenced' GPU blits */
0218             ret = i915_vma_sync(vma);
0219             if (ret)
0220                 return ret;
0221         }
0222 
0223         fence->start = vma->node.start;
0224         fence->size = vma->fence_size;
0225         fence->stride = i915_gem_object_get_stride(vma->obj);
0226         fence->tiling = i915_gem_object_get_tiling(vma->obj);
0227     }
0228     WRITE_ONCE(fence->dirty, false);
0229 
0230     old = xchg(&fence->vma, NULL);
0231     if (old) {
0232         /* XXX Ideally we would move the waiting to outside the mutex */
0233         ret = i915_active_wait(&fence->active);
0234         if (ret) {
0235             fence->vma = old;
0236             return ret;
0237         }
0238 
0239         i915_vma_flush_writes(old);
0240 
0241         /*
0242          * Ensure that all userspace CPU access is completed before
0243          * stealing the fence.
0244          */
0245         if (old != vma) {
0246             GEM_BUG_ON(old->fence != fence);
0247             i915_vma_revoke_mmap(old);
0248             old->fence = NULL;
0249         }
0250 
0251         list_move(&fence->link, &ggtt->fence_list);
0252     }
0253 
0254     /*
0255      * We only need to update the register itself if the device is awake.
0256      * If the device is currently powered down, we will defer the write
0257      * to the runtime resume, see intel_ggtt_restore_fences().
0258      *
0259      * This only works for removing the fence register; on acquisition
0260      * the caller must hold the rpm wakeref. The fence register must
0261      * be cleared before we can use any other fences to ensure that
0262      * the new fences do not overlap the elided clears, confusing HW.
0263      */
0264     wakeref = intel_runtime_pm_get_if_in_use(uncore->rpm);
0265     if (!wakeref) {
0266         GEM_BUG_ON(vma);
0267         return 0;
0268     }
0269 
0270     WRITE_ONCE(fence->vma, vma);
0271     fence_write(fence);
0272 
0273     if (vma) {
0274         vma->fence = fence;
0275         list_move_tail(&fence->link, &ggtt->fence_list);
0276     }
0277 
0278     intel_runtime_pm_put(uncore->rpm, wakeref);
0279     return 0;
0280 }
0281 
0282 /**
0283  * i915_vma_revoke_fence - force-remove fence for a VMA
0284  * @vma: vma to map linearly (not through a fence reg)
0285  *
0286  * This function force-removes any fence from the given object, which is useful
0287  * if the kernel wants to do untiled GTT access.
0288  */
0289 void i915_vma_revoke_fence(struct i915_vma *vma)
0290 {
0291     struct i915_fence_reg *fence = vma->fence;
0292     intel_wakeref_t wakeref;
0293 
0294     lockdep_assert_held(&vma->vm->mutex);
0295     if (!fence)
0296         return;
0297 
0298     GEM_BUG_ON(fence->vma != vma);
0299     GEM_BUG_ON(!i915_active_is_idle(&fence->active));
0300     GEM_BUG_ON(atomic_read(&fence->pin_count));
0301 
0302     fence->tiling = 0;
0303     WRITE_ONCE(fence->vma, NULL);
0304     vma->fence = NULL;
0305 
0306     /*
0307      * Skip the write to HW if and only if the device is currently
0308      * suspended.
0309      *
0310      * If the driver does not currently hold a wakeref (if_in_use == 0),
0311      * the device may currently be runtime suspended, or it may be woken
0312      * up before the suspend takes place. If the device is not suspended
0313      * (powered down) and we skip clearing the fence register, the HW is
0314      * left in an undefined state where we may end up with multiple
0315      * registers overlapping.
0316      */
0317     with_intel_runtime_pm_if_active(fence_to_uncore(fence)->rpm, wakeref)
0318         fence_write(fence);
0319 }
0320 
0321 static bool fence_is_active(const struct i915_fence_reg *fence)
0322 {
0323     return fence->vma && i915_vma_is_active(fence->vma);
0324 }
0325 
0326 static struct i915_fence_reg *fence_find(struct i915_ggtt *ggtt)
0327 {
0328     struct i915_fence_reg *active = NULL;
0329     struct i915_fence_reg *fence, *fn;
0330 
0331     list_for_each_entry_safe(fence, fn, &ggtt->fence_list, link) {
0332         GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
0333 
0334         if (fence == active) /* now seen this fence twice */
0335             active = ERR_PTR(-EAGAIN);
0336 
0337         /* Prefer idle fences so we do not have to wait on the GPU */
0338         if (active != ERR_PTR(-EAGAIN) && fence_is_active(fence)) {
0339             if (!active)
0340                 active = fence;
0341 
0342             list_move_tail(&fence->link, &ggtt->fence_list);
0343             continue;
0344         }
0345 
0346         if (atomic_read(&fence->pin_count))
0347             continue;
0348 
0349         return fence;
0350     }
0351 
0352     /* Wait for completion of pending flips which consume fences */
0353     if (intel_has_pending_fb_unpin(ggtt->vm.i915))
0354         return ERR_PTR(-EAGAIN);
0355 
0356     return ERR_PTR(-ENOBUFS);
0357 }
0358 
0359 int __i915_vma_pin_fence(struct i915_vma *vma)
0360 {
0361     struct i915_ggtt *ggtt = i915_vm_to_ggtt(vma->vm);
0362     struct i915_fence_reg *fence;
0363     struct i915_vma *set = i915_gem_object_is_tiled(vma->obj) ? vma : NULL;
0364     int err;
0365 
0366     lockdep_assert_held(&vma->vm->mutex);
0367 
0368     /* Just update our place in the LRU if our fence is getting reused. */
0369     if (vma->fence) {
0370         fence = vma->fence;
0371         GEM_BUG_ON(fence->vma != vma);
0372         atomic_inc(&fence->pin_count);
0373         if (!fence->dirty) {
0374             list_move_tail(&fence->link, &ggtt->fence_list);
0375             return 0;
0376         }
0377     } else if (set) {
0378         fence = fence_find(ggtt);
0379         if (IS_ERR(fence))
0380             return PTR_ERR(fence);
0381 
0382         GEM_BUG_ON(atomic_read(&fence->pin_count));
0383         atomic_inc(&fence->pin_count);
0384     } else {
0385         return 0;
0386     }
0387 
0388     err = fence_update(fence, set);
0389     if (err)
0390         goto out_unpin;
0391 
0392     GEM_BUG_ON(fence->vma != set);
0393     GEM_BUG_ON(vma->fence != (set ? fence : NULL));
0394 
0395     if (set)
0396         return 0;
0397 
0398 out_unpin:
0399     atomic_dec(&fence->pin_count);
0400     return err;
0401 }
0402 
0403 /**
0404  * i915_vma_pin_fence - set up fencing for a vma
0405  * @vma: vma to map through a fence reg
0406  *
0407  * When mapping objects through the GTT, userspace wants to be able to write
0408  * to them without having to worry about swizzling if the object is tiled.
0409  * This function walks the fence regs looking for a free one for @vma,
0410  * stealing one if it can't find any.
0411  *
0412  * It then sets up the reg based on the object's properties: address, pitch
0413  * and tiling format.
0414  *
0415  * For an untiled surface, this removes any existing fence.
0416  *
0417  * Returns:
0418  *
0419  * 0 on success, negative error code on failure.
0420  */
0421 int i915_vma_pin_fence(struct i915_vma *vma)
0422 {
0423     int err;
0424 
0425     if (!vma->fence && !i915_gem_object_is_tiled(vma->obj))
0426         return 0;
0427 
0428     /*
0429      * Note that we revoke fences on runtime suspend. Therefore the user
0430      * must keep the device awake whilst using the fence.
0431      */
0432     assert_rpm_wakelock_held(vma->vm->gt->uncore->rpm);
0433     GEM_BUG_ON(!i915_vma_is_ggtt(vma));
0434 
0435     err = mutex_lock_interruptible(&vma->vm->mutex);
0436     if (err)
0437         return err;
0438 
0439     err = __i915_vma_pin_fence(vma);
0440     mutex_unlock(&vma->vm->mutex);
0441 
0442     return err;
0443 }
0444 
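/*
 * Illustrative sketch (not part of the upstream file): the typical caller
 * pattern for i915_vma_pin_fence() above. example_fenced_cpu_access() is a
 * hypothetical helper; it assumes the vma is already bound in the mappable
 * aperture and takes the runtime-pm wakeref that must be held while the
 * fence is in use.
 */
static int example_fenced_cpu_access(struct i915_vma *vma)
{
    struct intel_runtime_pm *rpm = vma->vm->gt->uncore->rpm;
    intel_wakeref_t wakeref;
    int err;

    /* Fences are revoked on runtime suspend, so keep the device awake. */
    wakeref = intel_runtime_pm_get(rpm);

    err = i915_vma_pin_fence(vma);
    if (err == 0) {
        /*
         * vma->fence may be NULL here for an untiled object; when it is
         * set, CPU access through the GGTT mmap is detiled by the HW.
         */
        i915_vma_unpin_fence(vma);
    }

    intel_runtime_pm_put(rpm, wakeref);
    return err;
}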
0445 /**
0446  * i915_reserve_fence - Reserve a fence for vGPU
0447  * @ggtt: Global GTT
0448  *
0449  * This function walks the fence regs looking for a free one and removes
0450  * it from the fence_list. It is used to reserve a fence for vGPU to use.
0451  */
0452 struct i915_fence_reg *i915_reserve_fence(struct i915_ggtt *ggtt)
0453 {
0454     struct i915_fence_reg *fence;
0455     int count;
0456     int ret;
0457 
0458     lockdep_assert_held(&ggtt->vm.mutex);
0459 
0460     /* Keep at least one fence available for the display engine. */
0461     count = 0;
0462     list_for_each_entry(fence, &ggtt->fence_list, link)
0463         count += !atomic_read(&fence->pin_count);
0464     if (count <= 1)
0465         return ERR_PTR(-ENOSPC);
0466 
0467     fence = fence_find(ggtt);
0468     if (IS_ERR(fence))
0469         return fence;
0470 
0471     if (fence->vma) {
0472         /* Force-remove fence from VMA */
0473         ret = fence_update(fence, NULL);
0474         if (ret)
0475             return ERR_PTR(ret);
0476     }
0477 
0478     list_del(&fence->link);
0479 
0480     return fence;
0481 }
0482 
0483 /**
0484  * i915_unreserve_fence - Reclaim a reserved fence
0485  * @fence: the fence reg
0486  *
0487  * This function adds a reserved fence register from vGPU back to the fence_list.
0488  */
0489 void i915_unreserve_fence(struct i915_fence_reg *fence)
0490 {
0491     struct i915_ggtt *ggtt = fence->ggtt;
0492 
0493     lockdep_assert_held(&ggtt->vm.mutex);
0494 
0495     list_add(&fence->link, &ggtt->fence_list);
0496 }
0497 
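/*
 * Illustrative sketch (not part of the upstream file): how the two helpers
 * above pair up. A fence is reserved under the GGTT vm mutex and handed back
 * the same way. example_with_reserved_fence() is a hypothetical wrapper.
 */
static int example_with_reserved_fence(struct i915_ggtt *ggtt)
{
    struct i915_fence_reg *fence;

    mutex_lock(&ggtt->vm.mutex);
    fence = i915_reserve_fence(ggtt);
    mutex_unlock(&ggtt->vm.mutex);
    if (IS_ERR(fence))
        return PTR_ERR(fence);

    /* ... program the reserved register on behalf of the guest ... */

    mutex_lock(&ggtt->vm.mutex);
    i915_unreserve_fence(fence);
    mutex_unlock(&ggtt->vm.mutex);

    return 0;
}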
0498 /**
0499  * intel_ggtt_restore_fences - restore fence state
0500  * @ggtt: Global GTT
0501  *
0502  * Restore the hw fence state to match the software tracking again, to be called
0503  * after a gpu reset and on resume. Note that on runtime suspend we only cancel
0504  * the fences, to be reacquired by the user later.
0505  */
0506 void intel_ggtt_restore_fences(struct i915_ggtt *ggtt)
0507 {
0508     int i;
0509 
0510     for (i = 0; i < ggtt->num_fences; i++)
0511         fence_write(&ggtt->fence_regs[i]);
0512 }
0513 
0514 /**
0515  * DOC: tiling swizzling details
0516  *
0517  * The idea behind tiling is to increase cache hit rates by rearranging
0518  * pixel data so that a group of pixel accesses are in the same cacheline.
0519  * Performance improvements from doing this on the back/depth buffer are on
0520  * the order of 30%.
0521  *
0522  * Intel architectures make this somewhat more complicated, though, by
0523  * adjustments made to addressing of data when the memory is in interleaved
0524  * mode (matched pairs of DIMMS) to improve memory bandwidth.
0525  * For interleaved memory, the CPU sends every sequential 64 bytes
0526  * to an alternate memory channel so it can get the bandwidth from both.
0527  *
0528  * The GPU also rearranges its accesses for increased bandwidth to interleaved
0529  * memory, and it matches what the CPU does for non-tiled.  However, when tiled
0530  * it does it a little differently, since one walks addresses not just in the
0531  * X direction but also Y.  So, along with alternating channels when bit
0532  * 6 of the address flips, it also alternates when other bits flip --  Bits 9
0533  * (every 512 bytes, an X tile scanline) and 10 (every two X tile scanlines)
0534  * are common to both the 915 and 965-class hardware.
0535  *
0536  * The CPU also sometimes XORs in higher bits as well, to improve
0537  * bandwidth doing strided access like we do so frequently in graphics.  This
0538  * is called "Channel XOR Randomization" in the MCH documentation.  The result
0539  * is that the CPU is XORing in either bit 11 or bit 17 to bit 6 of its address
0540  * decode.
0541  *
0542  * All of this bit 6 XORing has an effect on our memory management,
0543  * as we need to make sure that the 3d driver can correctly address object
0544  * contents.
0545  *
0546  * If we don't have interleaved memory, all tiling is safe and no swizzling is
0547  * required.
0548  *
0549  * When bit 17 is XORed in, we simply refuse to tile at all.  Bit
0550  * 17 is not just a page offset, so as we page an object out and back in,
0551  * individual pages in it will have different bit 17 addresses, resulting in
0552  * each 64 bytes being swapped with its neighbor!
0553  *
0554  * Otherwise, if interleaved, we have to tell the 3d driver what address
0555  * swizzling it needs to do, since it's writing with the CPU to the pages
0556  * (bit 6 and potentially bit 11 XORed in), and the GPU is reading from the
0557  * pages (bit 6, 9, and 10 XORed in), resulting in a cumulative bit swizzling
0558  * required by the CPU of XORing in bit 6, 9, 10, and potentially 11, in order
0559  * to match what the GPU expects.
0560  */
0561 
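/*
 * Illustrative sketch (not part of the upstream file): the cumulative CPU-side
 * swizzle described above, expressed as arithmetic. For a linear byte offset
 * into an object, bit 6 of the address the CPU must use is XORed with the
 * higher bits named by the detected swizzle mode (bit 9, bits 9+10, or bits
 * 9+10+11). example_swizzle_offset() is a hypothetical helper; the bit-17
 * modes are deliberately not handled here, since such objects are refused
 * tiling (or have their pages pinned) as explained above.
 */
static unsigned long example_swizzle_offset(unsigned long offset, u32 swizzle)
{
    switch (swizzle) {
    case I915_BIT_6_SWIZZLE_9:
        return offset ^ ((offset >> 3) & 64);            /* bit6 ^= bit9 */
    case I915_BIT_6_SWIZZLE_9_10:
        return offset ^ (((offset >> 3) ^ (offset >> 4)) & 64);
    case I915_BIT_6_SWIZZLE_9_10_11:
        return offset ^
               (((offset >> 3) ^ (offset >> 4) ^ (offset >> 5)) & 64);
    default:
        return offset;                                   /* no CPU swizzle */
    }
}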
0562 /**
0563  * detect_bit_6_swizzle - detect bit 6 swizzling pattern
0564  * @ggtt: Global GGTT
0565  *
0566  * Detects bit 6 swizzling of address lookup between IGD access and CPU
0567  * access through main memory.
0568  */
0569 static void detect_bit_6_swizzle(struct i915_ggtt *ggtt)
0570 {
0571     struct intel_uncore *uncore = ggtt->vm.gt->uncore;
0572     struct drm_i915_private *i915 = ggtt->vm.i915;
0573     u32 swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
0574     u32 swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
0575 
0576     if (GRAPHICS_VER(i915) >= 8 || IS_VALLEYVIEW(i915)) {
0577         /*
0578          * On BDW+, swizzling is not used. We leave the CPU memory
0579          * controller in charge of optimizing memory accesses without
0580          * the extra address manipulation on the GPU side.
0581          *
0582          * VLV and CHV don't have GPU swizzling.
0583          */
0584         swizzle_x = I915_BIT_6_SWIZZLE_NONE;
0585         swizzle_y = I915_BIT_6_SWIZZLE_NONE;
0586     } else if (GRAPHICS_VER(i915) >= 6) {
0587         if (i915->preserve_bios_swizzle) {
0588             if (intel_uncore_read(uncore, DISP_ARB_CTL) &
0589                 DISP_TILE_SURFACE_SWIZZLING) {
0590                 swizzle_x = I915_BIT_6_SWIZZLE_9_10;
0591                 swizzle_y = I915_BIT_6_SWIZZLE_9;
0592             } else {
0593                 swizzle_x = I915_BIT_6_SWIZZLE_NONE;
0594                 swizzle_y = I915_BIT_6_SWIZZLE_NONE;
0595             }
0596         } else {
0597             u32 dimm_c0, dimm_c1;
0598 
0599             dimm_c0 = intel_uncore_read(uncore, MAD_DIMM_C0);
0600             dimm_c1 = intel_uncore_read(uncore, MAD_DIMM_C1);
0601             dimm_c0 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK;
0602             dimm_c1 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK;
0603             /*
0604              * Enable swizzling when the channels are populated
0605              * with identically sized dimms. We don't need to check
0606              * the 3rd channel because no cpu with gpu attached
0607              * ships in that configuration. Also, swizzling only
0608              * makes sense for 2 channels anyway.
0609              */
0610             if (dimm_c0 == dimm_c1) {
0611                 swizzle_x = I915_BIT_6_SWIZZLE_9_10;
0612                 swizzle_y = I915_BIT_6_SWIZZLE_9;
0613             } else {
0614                 swizzle_x = I915_BIT_6_SWIZZLE_NONE;
0615                 swizzle_y = I915_BIT_6_SWIZZLE_NONE;
0616             }
0617         }
0618     } else if (GRAPHICS_VER(i915) == 5) {
0619         /*
0620          * On Ironlake, whatever the DRAM config, the GPU always
0621          * uses the same swizzling setup.
0622          */
0623         swizzle_x = I915_BIT_6_SWIZZLE_9_10;
0624         swizzle_y = I915_BIT_6_SWIZZLE_9;
0625     } else if (GRAPHICS_VER(i915) == 2) {
0626         /*
0627          * As far as we know, the 865 doesn't have these bit 6
0628          * swizzling issues.
0629          */
0630         swizzle_x = I915_BIT_6_SWIZZLE_NONE;
0631         swizzle_y = I915_BIT_6_SWIZZLE_NONE;
0632     } else if (IS_G45(i915) || IS_I965G(i915) || IS_G33(i915)) {
0633         /*
0634          * The 965, G33, and newer, have a very flexible memory
0635          * configuration.  It will enable dual-channel mode
0636          * (interleaving) on as much memory as it can, and the GPU
0637          * will additionally sometimes enable different bit 6
0638          * swizzling for tiled objects from the CPU.
0639          *
0640          * Here's what I found on the G965:
0641          *    slot fill         memory size  swizzling
0642          * 0A   0B   1A   1B    1-ch   2-ch
0643          * 512  0    0    0     512    0     O
0644          * 512  0    512  0     16     1008  X
0645          * 512  0    0    512   16     1008  X
0646          * 0    512  0    512   16     1008  X
0647          * 1024 1024 1024 0     2048   1024  O
0648          *
0649          * We could probably detect this based on either the DRB
0650          * matching, which was the case for the swizzling required in
0651          * the table above, or from the 1-ch value being less than
0652          * the minimum size of a rank.
0653          *
0654          * Reports indicate that the swizzling actually
0655          * varies depending upon page placement inside the
0656          * channels, i.e. we see swizzled pages where the
0657          * banks of memory are paired and unswizzled on the
0658          * uneven portion, so leave that as unknown.
0659          */
0660         if (intel_uncore_read16(uncore, C0DRB3_BW) ==
0661             intel_uncore_read16(uncore, C1DRB3_BW)) {
0662             swizzle_x = I915_BIT_6_SWIZZLE_9_10;
0663             swizzle_y = I915_BIT_6_SWIZZLE_9;
0664         }
0665     } else {
0666         u32 dcc = intel_uncore_read(uncore, DCC);
0667 
0668         /*
0669          * On 9xx chipsets, channel interleave by the CPU is
0670          * determined by DCC.  For single-channel, neither the CPU
0671          * nor the GPU do swizzling.  For dual channel interleaved,
0672          * the GPU's interleave is bit 9 and 10 for X tiled, and bit
0673          * 9 for Y tiled.  The CPU's interleave is independent, and
0674          * can be based on either bit 11 (haven't seen this yet) or
0675          * bit 17 (common).
0676          */
0677         switch (dcc & DCC_ADDRESSING_MODE_MASK) {
0678         case DCC_ADDRESSING_MODE_SINGLE_CHANNEL:
0679         case DCC_ADDRESSING_MODE_DUAL_CHANNEL_ASYMMETRIC:
0680             swizzle_x = I915_BIT_6_SWIZZLE_NONE;
0681             swizzle_y = I915_BIT_6_SWIZZLE_NONE;
0682             break;
0683         case DCC_ADDRESSING_MODE_DUAL_CHANNEL_INTERLEAVED:
0684             if (dcc & DCC_CHANNEL_XOR_DISABLE) {
0685                 /*
0686                  * This is the base swizzling by the GPU for
0687                  * tiled buffers.
0688                  */
0689                 swizzle_x = I915_BIT_6_SWIZZLE_9_10;
0690                 swizzle_y = I915_BIT_6_SWIZZLE_9;
0691             } else if ((dcc & DCC_CHANNEL_XOR_BIT_17) == 0) {
0692                 /* Bit 11 swizzling by the CPU in addition. */
0693                 swizzle_x = I915_BIT_6_SWIZZLE_9_10_11;
0694                 swizzle_y = I915_BIT_6_SWIZZLE_9_11;
0695             } else {
0696                 /* Bit 17 swizzling by the CPU in addition. */
0697                 swizzle_x = I915_BIT_6_SWIZZLE_9_10_17;
0698                 swizzle_y = I915_BIT_6_SWIZZLE_9_17;
0699             }
0700             break;
0701         }
0702 
0703         /* check for L-shaped memory aka modified enhanced addressing */
0704         if (GRAPHICS_VER(i915) == 4 &&
0705             !(intel_uncore_read(uncore, DCC2) & DCC2_MODIFIED_ENHANCED_DISABLE)) {
0706             swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
0707             swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
0708         }
0709 
0710         if (dcc == 0xffffffff) {
0711             drm_err(&i915->drm, "Couldn't read from MCHBAR.  "
0712                   "Disabling tiling.\n");
0713             swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
0714             swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
0715         }
0716     }
0717 
0718     if (swizzle_x == I915_BIT_6_SWIZZLE_UNKNOWN ||
0719         swizzle_y == I915_BIT_6_SWIZZLE_UNKNOWN) {
0720         /*
0721          * Userspace likes to explode if it sees unknown swizzling,
0722          * so lie. We will finish the lie when reporting through
0723          * the get-tiling-ioctl by reporting the physical swizzle
0724          * mode as unknown instead.
0725          *
0726          * As we don't strictly know what the swizzling is, it may be
0727          * bit17 dependent, and so we need to also prevent the pages
0728          * from being moved.
0729          */
0730         i915->quirks |= QUIRK_PIN_SWIZZLED_PAGES;
0731         swizzle_x = I915_BIT_6_SWIZZLE_NONE;
0732         swizzle_y = I915_BIT_6_SWIZZLE_NONE;
0733     }
0734 
0735     to_gt(i915)->ggtt->bit_6_swizzle_x = swizzle_x;
0736     to_gt(i915)->ggtt->bit_6_swizzle_y = swizzle_y;
0737 }
0738 
0739 /*
0740  * Swap every 64 bytes of this page around, to account for it having a new
0741  * bit 17 of its physical address and therefore being interpreted differently
0742  * by the GPU.
0743  */
0744 static void swizzle_page(struct page *page)
0745 {
0746     char temp[64];
0747     char *vaddr;
0748     int i;
0749 
0750     vaddr = kmap(page);
0751 
0752     for (i = 0; i < PAGE_SIZE; i += 128) {
0753         memcpy(temp, &vaddr[i], 64);
0754         memcpy(&vaddr[i], &vaddr[i + 64], 64);
0755         memcpy(&vaddr[i + 64], temp, 64);
0756     }
0757 
0758     kunmap(page);
0759 }
0760 
0761 /**
0762  * i915_gem_object_do_bit_17_swizzle - fixup bit 17 swizzling
0763  * @obj: i915 GEM buffer object
0764  * @pages: the scattergather list of physical pages
0765  *
0766  * This function fixes up the swizzling in case any page frame number for this
0767  * object has changed in bit 17 since that state has been saved with
0768  * i915_gem_object_save_bit_17_swizzle().
0769  *
0770  * This is called when pinning backing storage again, since the kernel is free
0771  * to move unpinned backing storage around (either by directly moving pages or
0772  * by swapping them out and back in again).
0773  */
0774 void
0775 i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj,
0776                   struct sg_table *pages)
0777 {
0778     struct sgt_iter sgt_iter;
0779     struct page *page;
0780     int i;
0781 
0782     if (obj->bit_17 == NULL)
0783         return;
0784 
0785     i = 0;
0786     for_each_sgt_page(page, sgt_iter, pages) {
0787         char new_bit_17 = page_to_phys(page) >> 17;
0788 
0789         if ((new_bit_17 & 0x1) != (test_bit(i, obj->bit_17) != 0)) {
0790             swizzle_page(page);
0791             set_page_dirty(page);
0792         }
0793 
0794         i++;
0795     }
0796 }
0797 
0798 /**
0799  * i915_gem_object_save_bit_17_swizzle - save bit 17 swizzling
0800  * @obj: i915 GEM buffer object
0801  * @pages: the scattergather list of physical pages
0802  *
0803  * This function saves bit 17 of each page frame number so that swizzling
0804  * can be fixed up later on with i915_gem_object_do_bit_17_swizzle(). This must
0805  * be called before the backing storage can be unpinned.
0806  */
0807 void
0808 i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj,
0809                     struct sg_table *pages)
0810 {
0811     const unsigned int page_count = obj->base.size >> PAGE_SHIFT;
0812     struct sgt_iter sgt_iter;
0813     struct page *page;
0814     int i;
0815 
0816     if (obj->bit_17 == NULL) {
0817         obj->bit_17 = bitmap_zalloc(page_count, GFP_KERNEL);
0818         if (obj->bit_17 == NULL) {
0819             DRM_ERROR("Failed to allocate memory for bit 17 "
0820                   "record\n");
0821             return;
0822         }
0823     }
0824 
0825     i = 0;
0826 
0827     for_each_sgt_page(page, sgt_iter, pages) {
0828         if (page_to_phys(page) & (1 << 17))
0829             __set_bit(i, obj->bit_17);
0830         else
0831             __clear_bit(i, obj->bit_17);
0832         i++;
0833     }
0834 }
0835 
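/*
 * Illustrative sketch (not part of the upstream file): the ordering the two
 * helpers above rely on. Bit 17 of each page's physical address is recorded
 * before the backing storage may be released, and compared/fixed up once the
 * pages have been pinned again (possibly at new physical addresses).
 * example_bit17_cycle() is a hypothetical wrapper around that sequence.
 */
static void example_bit17_cycle(struct drm_i915_gem_object *obj,
                                struct sg_table *pages)
{
    /* Before unpinning: remember bit 17 of every page frame number. */
    i915_gem_object_save_bit_17_swizzle(obj, pages);

    /* ... pages may be swapped out and return at different PFNs ... */

    /* After re-pinning: swap 64-byte halves in pages whose bit 17 flipped. */
    i915_gem_object_do_bit_17_swizzle(obj, pages);
}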
0836 void intel_ggtt_init_fences(struct i915_ggtt *ggtt)
0837 {
0838     struct drm_i915_private *i915 = ggtt->vm.i915;
0839     struct intel_uncore *uncore = ggtt->vm.gt->uncore;
0840     int num_fences;
0841     int i;
0842 
0843     INIT_LIST_HEAD(&ggtt->fence_list);
0844     INIT_LIST_HEAD(&ggtt->userfault_list);
0845     intel_wakeref_auto_init(&ggtt->userfault_wakeref, uncore->rpm);
0846 
0847     detect_bit_6_swizzle(ggtt);
0848 
0849     if (!i915_ggtt_has_aperture(ggtt))
0850         num_fences = 0;
0851     else if (GRAPHICS_VER(i915) >= 7 &&
0852          !(IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)))
0853         num_fences = 32;
0854     else if (GRAPHICS_VER(i915) >= 4 ||
0855          IS_I945G(i915) || IS_I945GM(i915) ||
0856          IS_G33(i915) || IS_PINEVIEW(i915))
0857         num_fences = 16;
0858     else
0859         num_fences = 8;
0860 
0861     if (intel_vgpu_active(i915))
0862         num_fences = intel_uncore_read(uncore,
0863                            vgtif_reg(avail_rs.fence_num));
0864     ggtt->fence_regs = kcalloc(num_fences,
0865                    sizeof(*ggtt->fence_regs),
0866                    GFP_KERNEL);
0867     if (!ggtt->fence_regs)
0868         num_fences = 0;
0869 
0870     /* Initialize fence registers to zero */
0871     for (i = 0; i < num_fences; i++) {
0872         struct i915_fence_reg *fence = &ggtt->fence_regs[i];
0873 
0874         i915_active_init(&fence->active, NULL, NULL, 0);
0875         fence->ggtt = ggtt;
0876         fence->id = i;
0877         list_add_tail(&fence->link, &ggtt->fence_list);
0878     }
0879     ggtt->num_fences = num_fences;
0880 
0881     intel_ggtt_restore_fences(ggtt);
0882 }
0883 
0884 void intel_ggtt_fini_fences(struct i915_ggtt *ggtt)
0885 {
0886     int i;
0887 
0888     for (i = 0; i < ggtt->num_fences; i++) {
0889         struct i915_fence_reg *fence = &ggtt->fence_regs[i];
0890 
0891         i915_active_fini(&fence->active);
0892     }
0893 
0894     kfree(ggtt->fence_regs);
0895 }
0896 
0897 void intel_gt_init_swizzling(struct intel_gt *gt)
0898 {
0899     struct drm_i915_private *i915 = gt->i915;
0900     struct intel_uncore *uncore = gt->uncore;
0901 
0902     if (GRAPHICS_VER(i915) < 5 ||
0903         to_gt(i915)->ggtt->bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
0904         return;
0905 
0906     intel_uncore_rmw(uncore, DISP_ARB_CTL, 0, DISP_TILE_SURFACE_SWIZZLING);
0907 
0908     if (GRAPHICS_VER(i915) == 5)
0909         return;
0910 
0911     intel_uncore_rmw(uncore, TILECTL, 0, TILECTL_SWZCTL);
0912 
0913     if (GRAPHICS_VER(i915) == 6)
0914         intel_uncore_write(uncore,
0915                    ARB_MODE,
0916                    _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
0917     else if (GRAPHICS_VER(i915) == 7)
0918         intel_uncore_write(uncore,
0919                    ARB_MODE,
0920                    _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
0921     else if (GRAPHICS_VER(i915) == 8)
0922         intel_uncore_write(uncore,
0923                    GAMTARBMODE,
0924                    _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
0925     else
0926         MISSING_CASE(GRAPHICS_VER(i915));
0927 }