// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2015-2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <linux/pci.h>
#include <linux/acpi.h>
#include "kfd_crat.h"
#include "kfd_priv.h"
#include "kfd_topology.h"
#include "kfd_iommu.h"
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"

/* GPU Processor ID base for dGPUs for which VCRAT needs to be created.
 * GPU processor IDs are expressed with Bit[31]=1.
 * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs
 * used in the CRAT.
 */
static uint32_t gpu_processor_id_low = 0x80001000;

/* Return the next available gpu_processor_id and increment it for the next
 * GPU
 *  @total_cu_count - Total CUs present in the GPU, including ones that are
 *            masked off
 */
static inline unsigned int get_and_inc_gpu_processor_id(
                unsigned int total_cu_count)
{
    int current_id = gpu_processor_id_low;

    gpu_processor_id_low += total_cu_count;
    return current_id;
}
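
/* Example of the allocation scheme above (hypothetical CU counts): the first
 * dGPU parsed, with 64 total CUs, is assigned processor IDs
 * 0x80001000..0x8000103F; the next GPU's ID base then starts at 0x80001040.
 */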

/* Static table to describe GPU Cache information */
struct kfd_gpu_cache_info {
    uint32_t    cache_size;
    uint32_t    cache_level;
    uint32_t    flags;
    /* Indicates how many Compute Units share this cache
     * within a SA. Value = 1 indicates the cache is not shared
     */
    uint32_t    num_cu_shared;
};

static struct kfd_gpu_cache_info kaveri_cache_info[] = {
    {
        /* TCP L1 Cache per CU */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 1,
    },
    {
        /* Scalar L1 Instruction Cache (in SQC module) per bank */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_INST_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* Scalar L1 Data Cache (in SQC module) per bank */
        .cache_size = 8,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },

    /* TODO: Add L2 Cache information */
};

static struct kfd_gpu_cache_info carrizo_cache_info[] = {
    {
        /* TCP L1 Cache per CU */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 1,
    },
    {
        /* Scalar L1 Instruction Cache (in SQC module) per bank */
        .cache_size = 8,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_INST_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 4,
    },
    {
        /* Scalar L1 Data Cache (in SQC module) per bank */
        .cache_size = 4,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 4,
    },

    /* TODO: Add L2 Cache information */
};

#define hawaii_cache_info kaveri_cache_info
#define tonga_cache_info carrizo_cache_info
#define fiji_cache_info  carrizo_cache_info
#define polaris10_cache_info carrizo_cache_info
#define polaris11_cache_info carrizo_cache_info
#define polaris12_cache_info carrizo_cache_info
#define vegam_cache_info carrizo_cache_info

/* NOTE: L1 cache information has been updated and L2/L3
 * cache information has been added for Vega10 and
 * newer ASICs. The unit for cache_size is KiB.
 * In the future, cache details must be checked and
 * updated for every new ASIC.
 */
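/* For example, vega10's 4 MiB L2 appears below as .cache_size = 4096. */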

static struct kfd_gpu_cache_info vega10_cache_info[] = {
    {
        /* TCP L1 Cache per CU */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 1,
    },
    {
        /* Scalar L1 Instruction Cache per SQC */
        .cache_size = 32,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_INST_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 3,
    },
    {
        /* Scalar L1 Data Cache per SQC */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 3,
    },
    {
        /* L2 Data Cache per GPU (Total Tex Cache) */
        .cache_size = 4096,
        .cache_level = 2,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 16,
    },
};

static struct kfd_gpu_cache_info raven_cache_info[] = {
    {
        /* TCP L1 Cache per CU */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 1,
    },
    {
        /* Scalar L1 Instruction Cache per SQC */
        .cache_size = 32,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_INST_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 3,
    },
    {
        /* Scalar L1 Data Cache per SQC */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 3,
    },
    {
        /* L2 Data Cache per GPU (Total Tex Cache) */
        .cache_size = 1024,
        .cache_level = 2,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 11,
    },
};

static struct kfd_gpu_cache_info renoir_cache_info[] = {
    {
        /* TCP L1 Cache per CU */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 1,
    },
    {
        /* Scalar L1 Instruction Cache per SQC */
        .cache_size = 32,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_INST_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 3,
    },
    {
        /* Scalar L1 Data Cache per SQC */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 3,
    },
    {
        /* L2 Data Cache per GPU (Total Tex Cache) */
        .cache_size = 1024,
        .cache_level = 2,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 8,
    },
};

static struct kfd_gpu_cache_info vega12_cache_info[] = {
    {
        /* TCP L1 Cache per CU */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 1,
    },
    {
        /* Scalar L1 Instruction Cache per SQC */
        .cache_size = 32,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_INST_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 3,
    },
    {
        /* Scalar L1 Data Cache per SQC */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 3,
    },
    {
        /* L2 Data Cache per GPU (Total Tex Cache) */
        .cache_size = 2048,
        .cache_level = 2,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 5,
    },
};

static struct kfd_gpu_cache_info vega20_cache_info[] = {
    {
        /* TCP L1 Cache per CU */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 1,
    },
    {
        /* Scalar L1 Instruction Cache per SQC */
        .cache_size = 32,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_INST_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 3,
    },
    {
        /* Scalar L1 Data Cache per SQC */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 3,
    },
    {
        /* L2 Data Cache per GPU (Total Tex Cache) */
        .cache_size = 8192,
        .cache_level = 2,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 16,
    },
};

static struct kfd_gpu_cache_info aldebaran_cache_info[] = {
    {
        /* TCP L1 Cache per CU */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 1,
    },
    {
        /* Scalar L1 Instruction Cache per SQC */
        .cache_size = 32,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_INST_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* Scalar L1 Data Cache per SQC */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* L2 Data Cache per GPU (Total Tex Cache) */
        .cache_size = 8192,
        .cache_level = 2,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 14,
    },
};

static struct kfd_gpu_cache_info navi10_cache_info[] = {
    {
        /* TCP L1 Cache per CU */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 1,
    },
    {
        /* Scalar L1 Instruction Cache per SQC */
        .cache_size = 32,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_INST_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* Scalar L1 Data Cache per SQC */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* GL1 Data Cache per SA */
        .cache_size = 128,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 10,
    },
    {
        /* L2 Data Cache per GPU (Total Tex Cache) */
        .cache_size = 4096,
        .cache_level = 2,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 10,
    },
};

static struct kfd_gpu_cache_info vangogh_cache_info[] = {
    {
        /* TCP L1 Cache per CU */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 1,
    },
    {
        /* Scalar L1 Instruction Cache per SQC */
        .cache_size = 32,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_INST_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* Scalar L1 Data Cache per SQC */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* GL1 Data Cache per SA */
        .cache_size = 128,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 8,
    },
    {
        /* L2 Data Cache per GPU (Total Tex Cache) */
        .cache_size = 1024,
        .cache_level = 2,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 8,
    },
};

static struct kfd_gpu_cache_info navi14_cache_info[] = {
    {
        /* TCP L1 Cache per CU */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 1,
    },
    {
        /* Scalar L1 Instruction Cache per SQC */
        .cache_size = 32,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_INST_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* Scalar L1 Data Cache per SQC */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* GL1 Data Cache per SA */
        .cache_size = 128,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 12,
    },
    {
        /* L2 Data Cache per GPU (Total Tex Cache) */
        .cache_size = 2048,
        .cache_level = 2,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 12,
    },
};

static struct kfd_gpu_cache_info sienna_cichlid_cache_info[] = {
    {
        /* TCP L1 Cache per CU */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 1,
    },
    {
        /* Scalar L1 Instruction Cache per SQC */
        .cache_size = 32,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_INST_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* Scalar L1 Data Cache per SQC */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* GL1 Data Cache per SA */
        .cache_size = 128,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 10,
    },
    {
        /* L2 Data Cache per GPU (Total Tex Cache) */
        .cache_size = 4096,
        .cache_level = 2,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 10,
    },
    {
        /* L3 Data Cache per GPU */
        .cache_size = 128*1024,
        .cache_level = 3,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 10,
    },
};

static struct kfd_gpu_cache_info navy_flounder_cache_info[] = {
    {
        /* TCP L1 Cache per CU */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 1,
    },
    {
        /* Scalar L1 Instruction Cache per SQC */
        .cache_size = 32,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_INST_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* Scalar L1 Data Cache per SQC */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* GL1 Data Cache per SA */
        .cache_size = 128,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 10,
    },
    {
        /* L2 Data Cache per GPU (Total Tex Cache) */
        .cache_size = 3072,
        .cache_level = 2,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 10,
    },
    {
        /* L3 Data Cache per GPU */
        .cache_size = 96*1024,
        .cache_level = 3,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 10,
    },
};

static struct kfd_gpu_cache_info dimgrey_cavefish_cache_info[] = {
    {
        /* TCP L1 Cache per CU */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 1,
    },
    {
        /* Scalar L1 Instruction Cache per SQC */
        .cache_size = 32,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_INST_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* Scalar L1 Data Cache per SQC */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* GL1 Data Cache per SA */
        .cache_size = 128,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 8,
    },
    {
        /* L2 Data Cache per GPU (Total Tex Cache) */
        .cache_size = 2048,
        .cache_level = 2,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 8,
    },
    {
        /* L3 Data Cache per GPU */
        .cache_size = 32*1024,
        .cache_level = 3,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 8,
    },
};

static struct kfd_gpu_cache_info beige_goby_cache_info[] = {
    {
        /* TCP L1 Cache per CU */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 1,
    },
    {
        /* Scalar L1 Instruction Cache per SQC */
        .cache_size = 32,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_INST_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* Scalar L1 Data Cache per SQC */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* GL1 Data Cache per SA */
        .cache_size = 128,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 8,
    },
    {
        /* L2 Data Cache per GPU (Total Tex Cache) */
        .cache_size = 1024,
        .cache_level = 2,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 8,
    },
    {
        /* L3 Data Cache per GPU */
        .cache_size = 16*1024,
        .cache_level = 3,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 8,
    },
};

static struct kfd_gpu_cache_info yellow_carp_cache_info[] = {
    {
        /* TCP L1 Cache per CU */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 1,
    },
    {
        /* Scalar L1 Instruction Cache per SQC */
        .cache_size = 32,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_INST_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* Scalar L1 Data Cache per SQC */
        .cache_size = 16,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 2,
    },
    {
        /* GL1 Data Cache per SA */
        .cache_size = 128,
        .cache_level = 1,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 6,
    },
    {
        /* L2 Data Cache per GPU (Total Tex Cache) */
        .cache_size = 2048,
        .cache_level = 2,
        .flags = (CRAT_CACHE_FLAGS_ENABLED |
                CRAT_CACHE_FLAGS_DATA_CACHE |
                CRAT_CACHE_FLAGS_SIMD_CACHE),
        .num_cu_shared = 6,
    },
};

static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
        struct crat_subtype_computeunit *cu)
{
    dev->node_props.cpu_cores_count = cu->num_cpu_cores;
    dev->node_props.cpu_core_id_base = cu->processor_id_low;
    if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT)
        dev->node_props.capability |= HSA_CAP_ATS_PRESENT;

    pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores,
            cu->processor_id_low);
}

static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev,
        struct crat_subtype_computeunit *cu)
{
    dev->node_props.simd_id_base = cu->processor_id_low;
    dev->node_props.simd_count = cu->num_simd_cores;
    dev->node_props.lds_size_in_kb = cu->lds_size_in_kb;
    dev->node_props.max_waves_per_simd = cu->max_waves_simd;
    dev->node_props.wave_front_size = cu->wave_front_size;
    dev->node_props.array_count = cu->array_count;
    dev->node_props.cu_per_simd_array = cu->num_cu_per_array;
    dev->node_props.simd_per_cu = cu->num_simd_per_cu;
    dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu;
    if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE)
        dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE;
    pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low);
}

/* kfd_parse_subtype_cu - parse compute unit subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu,
                struct list_head *device_list)
{
    struct kfd_topology_device *dev;

    pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n",
            cu->proximity_domain, cu->hsa_capability);
    list_for_each_entry(dev, device_list, list) {
        if (cu->proximity_domain == dev->proximity_domain) {
            if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT)
                kfd_populated_cu_info_cpu(dev, cu);

            if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT)
                kfd_populated_cu_info_gpu(dev, cu);
            break;
        }
    }

    return 0;
}

static struct kfd_mem_properties *
find_subtype_mem(uint32_t heap_type, uint32_t flags, uint32_t width,
        struct kfd_topology_device *dev)
{
    struct kfd_mem_properties *props;

    list_for_each_entry(props, &dev->mem_props, list) {
        if (props->heap_type == heap_type
                && props->flags == flags
                && props->width == width)
            return props;
    }

    return NULL;
}

/* kfd_parse_subtype_mem - parse memory subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem,
                struct list_head *device_list)
{
    struct kfd_mem_properties *props;
    struct kfd_topology_device *dev;
    uint32_t heap_type;
    uint64_t size_in_bytes;
    uint32_t flags = 0;
    uint32_t width;

    pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n",
            mem->proximity_domain);
    list_for_each_entry(dev, device_list, list) {
        if (mem->proximity_domain == dev->proximity_domain) {
            /* We're on a GPU node */
            if (dev->node_props.cpu_cores_count == 0) {
                /* APU */
                if (mem->visibility_type == 0)
                    heap_type =
                        HSA_MEM_HEAP_TYPE_FB_PRIVATE;
                /* dGPU */
                else
                    heap_type = mem->visibility_type;
            } else
                heap_type = HSA_MEM_HEAP_TYPE_SYSTEM;

            if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE)
                flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE;
            if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE)
                flags |= HSA_MEM_FLAGS_NON_VOLATILE;

            size_in_bytes =
                ((uint64_t)mem->length_high << 32) +
                            mem->length_low;
            width = mem->width;

            /* Multiple banks of the same type are aggregated into
             * one. User mode doesn't care about multiple physical
             * memory segments. It's managed as a single virtual
             * heap for user mode.
             */
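            /* For example (hypothetical bank sizes): two 8 GiB banks
             * with the same heap type, flags and width would be folded
             * into a single 16 GiB heap entry here.
             */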
            props = find_subtype_mem(heap_type, flags, width, dev);
            if (props) {
                props->size_in_bytes += size_in_bytes;
                break;
            }

            props = kfd_alloc_struct(props);
            if (!props)
                return -ENOMEM;

            props->heap_type = heap_type;
            props->flags = flags;
            props->size_in_bytes = size_in_bytes;
            props->width = width;

            dev->node_props.mem_banks_count++;
            list_add_tail(&props->list, &dev->mem_props);

            break;
        }
    }

    return 0;
}

/* kfd_parse_subtype_cache - parse cache subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,
            struct list_head *device_list)
{
    struct kfd_cache_properties *props;
    struct kfd_topology_device *dev;
    uint32_t id;
    uint32_t total_num_of_cu;

    id = cache->processor_id_low;

    pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id);
    list_for_each_entry(dev, device_list, list) {
        total_num_of_cu = (dev->node_props.array_count *
                    dev->node_props.cu_per_simd_array);

        /* Cache information in CRAT doesn't have proximity_domain
         * information as it is associated with a CPU core or GPU
         * Compute Unit. So map the cache using the CPU core ID or
         * SIMD (GPU) ID.
         * TODO: This works because currently we can safely assume that
         *  Compute Units are parsed before caches are parsed. In the
         *  future, remove this dependency.
         */
        if ((id >= dev->node_props.cpu_core_id_base &&
            id <= dev->node_props.cpu_core_id_base +
                dev->node_props.cpu_cores_count) ||
            (id >= dev->node_props.simd_id_base &&
            id < dev->node_props.simd_id_base +
                total_num_of_cu)) {
            props = kfd_alloc_struct(props);
            if (!props)
                return -ENOMEM;

            props->processor_id_low = id;
            props->cache_level = cache->cache_level;
            props->cache_size = cache->cache_size;
            props->cacheline_size = cache->cache_line_size;
            props->cachelines_per_tag = cache->lines_per_tag;
            props->cache_assoc = cache->associativity;
            props->cache_latency = cache->cache_latency;
            memcpy(props->sibling_map, cache->sibling_map,
                    sizeof(props->sibling_map));

            if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
                props->cache_type |= HSA_CACHE_TYPE_DATA;
            if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
                props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
            if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE)
                props->cache_type |= HSA_CACHE_TYPE_CPU;
            if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
                props->cache_type |= HSA_CACHE_TYPE_HSACU;

            dev->cache_count++;
            dev->node_props.caches_count++;
            list_add_tail(&props->list, &dev->cache_props);

            break;
        }
    }

    return 0;
}

/* kfd_parse_subtype_iolink - parse iolink subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
                    struct list_head *device_list)
{
    struct kfd_iolink_properties *props = NULL, *props2;
    struct kfd_topology_device *dev, *to_dev;
    uint32_t id_from;
    uint32_t id_to;

    id_from = iolink->proximity_domain_from;
    id_to = iolink->proximity_domain_to;

    pr_debug("Found IO link entry in CRAT table with id_from=%d, id_to %d\n",
            id_from, id_to);
    list_for_each_entry(dev, device_list, list) {
        if (id_from == dev->proximity_domain) {
            props = kfd_alloc_struct(props);
            if (!props)
                return -ENOMEM;

            props->node_from = id_from;
            props->node_to = id_to;
            props->ver_maj = iolink->version_major;
            props->ver_min = iolink->version_minor;
            props->iolink_type = iolink->io_interface_type;

            if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS)
                props->weight = 20;
            else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI)
                props->weight = 15 * iolink->num_hops_xgmi;
            else
                props->weight = node_distance(id_from, id_to);

            props->min_latency = iolink->minimum_latency;
            props->max_latency = iolink->maximum_latency;
            props->min_bandwidth = iolink->minimum_bandwidth_mbs;
            props->max_bandwidth = iolink->maximum_bandwidth_mbs;
            props->rec_transfer_size =
                    iolink->recommended_transfer_size;

            dev->node_props.io_links_count++;
            list_add_tail(&props->list, &dev->io_link_props);
            break;
        }
    }

    /* CPU topology is created before GPUs are detected, so CPU->GPU
     * links are not built at that time. If a PCIe type is discovered, it
     * means a GPU is detected and we are adding GPU->CPU to the topology.
     * At this time, also add the corresponding CPU->GPU link if the GPU
     * has a large BAR.
     * For xGMI, we only added the link in one direction in the CRAT
     * table; add the corresponding reverse-direction link now.
     */
    if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
        to_dev = kfd_topology_device_by_proximity_domain_no_lock(id_to);
        if (!to_dev)
            return -ENODEV;
        /* same everything but the other direction */
        props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
        if (!props2)
            return -ENOMEM;

        props2->node_from = id_to;
        props2->node_to = id_from;
        props2->kobj = NULL;
        to_dev->node_props.io_links_count++;
        list_add_tail(&props2->list, &to_dev->io_link_props);
    }

    return 0;
}

/* kfd_parse_subtype - parse subtypes and attach them to the correct topology
 * device present in the device_list
 *  @sub_type_hdr - subtype section of crat_image
 *  @device_list - list of topology devices present in this crat_image
 */
static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr,
                struct list_head *device_list)
{
    struct crat_subtype_computeunit *cu;
    struct crat_subtype_memory *mem;
    struct crat_subtype_cache *cache;
    struct crat_subtype_iolink *iolink;
    int ret = 0;

    switch (sub_type_hdr->type) {
    case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY:
        cu = (struct crat_subtype_computeunit *)sub_type_hdr;
        ret = kfd_parse_subtype_cu(cu, device_list);
        break;
    case CRAT_SUBTYPE_MEMORY_AFFINITY:
        mem = (struct crat_subtype_memory *)sub_type_hdr;
        ret = kfd_parse_subtype_mem(mem, device_list);
        break;
    case CRAT_SUBTYPE_CACHE_AFFINITY:
        cache = (struct crat_subtype_cache *)sub_type_hdr;
        ret = kfd_parse_subtype_cache(cache, device_list);
        break;
    case CRAT_SUBTYPE_TLB_AFFINITY:
        /*
         * For now, nothing to do here
         */
        pr_debug("Found TLB entry in CRAT table (not processing)\n");
        break;
    case CRAT_SUBTYPE_CCOMPUTE_AFFINITY:
        /*
         * For now, nothing to do here
         */
        pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n");
        break;
    case CRAT_SUBTYPE_IOLINK_AFFINITY:
        iolink = (struct crat_subtype_iolink *)sub_type_hdr;
        ret = kfd_parse_subtype_iolink(iolink, device_list);
        break;
    default:
        pr_warn("Unknown subtype %d in CRAT\n",
                sub_type_hdr->type);
    }

    return ret;
}

/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT,
 * create a kfd_topology_device and add it to device_list. Also parse
 * CRAT subtypes and attach them to the appropriate kfd_topology_device.
 *  @crat_image - input image containing CRAT
 *  @device_list - [OUT] list of kfd_topology_device generated after
 *             parsing crat_image
 *  @proximity_domain - Proximity domain of the first device in the table
 *
 *  Return - 0 if successful else -ve value
 */
int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,
             uint32_t proximity_domain)
{
    struct kfd_topology_device *top_dev = NULL;
    struct crat_subtype_generic *sub_type_hdr;
    uint16_t node_id;
    int ret = 0;
    struct crat_header *crat_table = (struct crat_header *)crat_image;
    uint16_t num_nodes;
    uint32_t image_len;

    if (!crat_image)
        return -EINVAL;

    if (!list_empty(device_list)) {
        pr_warn("Error: device list should be empty\n");
        return -EINVAL;
    }

    num_nodes = crat_table->num_domains;
    image_len = crat_table->length;

    pr_debug("Parsing CRAT table with %d nodes\n", num_nodes);

    for (node_id = 0; node_id < num_nodes; node_id++) {
        top_dev = kfd_create_topology_device(device_list);
        if (!top_dev)
            break;
        top_dev->proximity_domain = proximity_domain++;
    }

    if (!top_dev) {
        ret = -ENOMEM;
        goto err;
    }

    memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH);
    memcpy(top_dev->oem_table_id, crat_table->oem_table_id,
            CRAT_OEMTABLEID_LENGTH);
    top_dev->oem_revision = crat_table->oem_revision;

    sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
    while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <
            ((char *)crat_image) + image_len) {
        if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {
            ret = kfd_parse_subtype(sub_type_hdr, device_list);
            if (ret)
                break;
        }

        sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
                sub_type_hdr->length);
    }

err:
    if (ret)
        kfd_release_topology_device_list(device_list);

    return ret;
}

/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
static int fill_in_l1_pcache(struct crat_subtype_cache *pcache,
                struct kfd_gpu_cache_info *pcache_info,
                struct kfd_cu_info *cu_info,
                int mem_available,
                int cu_bitmask,
                int cache_type, unsigned int cu_processor_id,
                int cu_block)
{
    unsigned int cu_sibling_map_mask;
    int first_active_cu;

    /* First check if enough memory is available */
    if (sizeof(struct crat_subtype_cache) > mem_available)
        return -ENOMEM;

    cu_sibling_map_mask = cu_bitmask;
    cu_sibling_map_mask >>= cu_block;
    cu_sibling_map_mask &=
        ((1 << pcache_info[cache_type].num_cu_shared) - 1);
    first_active_cu = ffs(cu_sibling_map_mask);

    /* A CU can be inactive. For a shared cache, find the first active
     * CU; for a non-shared cache, check whether the CU is inactive.
     * If it is inactive, skip it.
     */
    if (first_active_cu) {
        memset(pcache, 0, sizeof(struct crat_subtype_cache));
        pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
        pcache->length = sizeof(struct crat_subtype_cache);
        pcache->flags = pcache_info[cache_type].flags;
        pcache->processor_id_low = cu_processor_id
                     + (first_active_cu - 1);
        pcache->cache_level = pcache_info[cache_type].cache_level;
        pcache->cache_size = pcache_info[cache_type].cache_size;

        /* Sibling map is w.r.t processor_id_low, so shift out
         * inactive CU
         */
        cu_sibling_map_mask =
            cu_sibling_map_mask >> (first_active_cu - 1);

        pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
        pcache->sibling_map[1] =
                (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
        pcache->sibling_map[2] =
                (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
        pcache->sibling_map[3] =
                (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
        return 0;
    }
    return 1;
}
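
/* Worked example of the masking above (hypothetical values): with
 * cu_bitmask = 0b1110, cu_block = 1 and num_cu_shared = 2, the mask
 * becomes (0b1110 >> 1) & 0b11 = 0b11, so ffs() returns 1, the entry's
 * processor_id_low is cu_processor_id + 0, and sibling_map[0] = 0x03.
 */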

/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
static int fill_in_l2_l3_pcache(struct crat_subtype_cache *pcache,
                struct kfd_gpu_cache_info *pcache_info,
                struct kfd_cu_info *cu_info,
                int mem_available,
                int cache_type, unsigned int cu_processor_id)
{
    unsigned int cu_sibling_map_mask;
    int first_active_cu;
    int i, j, k;

    /* First check if enough memory is available */
    if (sizeof(struct crat_subtype_cache) > mem_available)
        return -ENOMEM;

    cu_sibling_map_mask = cu_info->cu_bitmap[0][0];
    cu_sibling_map_mask &=
        ((1 << pcache_info[cache_type].num_cu_shared) - 1);
    first_active_cu = ffs(cu_sibling_map_mask);

    /* A CU can be inactive. For a shared cache, find the first active
     * CU; for a non-shared cache, check whether the CU is inactive.
     * If it is inactive, skip it.
     */
    if (first_active_cu) {
        memset(pcache, 0, sizeof(struct crat_subtype_cache));
        pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
        pcache->length = sizeof(struct crat_subtype_cache);
        pcache->flags = pcache_info[cache_type].flags;
        pcache->processor_id_low = cu_processor_id
                     + (first_active_cu - 1);
        pcache->cache_level = pcache_info[cache_type].cache_level;
        pcache->cache_size = pcache_info[cache_type].cache_size;

        /* Sibling map is w.r.t processor_id_low, so shift out
         * inactive CU
         */
        cu_sibling_map_mask =
            cu_sibling_map_mask >> (first_active_cu - 1);
        k = 0;
        for (i = 0; i < cu_info->num_shader_engines; i++) {
            for (j = 0; j < cu_info->num_shader_arrays_per_engine;
                j++) {
                pcache->sibling_map[k] =
                 (uint8_t)(cu_sibling_map_mask & 0xFF);
                pcache->sibling_map[k+1] =
                 (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
                pcache->sibling_map[k+2] =
                 (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
                pcache->sibling_map[k+3] =
                 (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
                k += 4;
                cu_sibling_map_mask =
                    cu_info->cu_bitmap[i % 4][j + i / 4];
                cu_sibling_map_mask &= (
                 (1 << pcache_info[cache_type].num_cu_shared)
                 - 1);
            }
        }
        return 0;
    }
    return 1;
}

#define KFD_MAX_CACHE_TYPES 6

static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev,
                           struct kfd_gpu_cache_info *pcache_info)
{
    struct amdgpu_device *adev = kdev->adev;
    int i = 0;

    /* TCP L1 Cache per CU */
    if (adev->gfx.config.gc_tcp_l1_size) {
        pcache_info[i].cache_size = adev->gfx.config.gc_tcp_l1_size;
        pcache_info[i].cache_level = 1;
        pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
                    CRAT_CACHE_FLAGS_DATA_CACHE |
                    CRAT_CACHE_FLAGS_SIMD_CACHE);
        pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_tcp_per_wpg / 2;
        i++;
    }
    /* Scalar L1 Instruction Cache per SQC */
    if (adev->gfx.config.gc_l1_instruction_cache_size_per_sqc) {
        pcache_info[i].cache_size =
            adev->gfx.config.gc_l1_instruction_cache_size_per_sqc;
        pcache_info[i].cache_level = 1;
        pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
                    CRAT_CACHE_FLAGS_INST_CACHE |
                    CRAT_CACHE_FLAGS_SIMD_CACHE);
        pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2;
        i++;
    }
    /* Scalar L1 Data Cache per SQC */
    if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) {
        pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc;
        pcache_info[i].cache_level = 1;
        pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
                    CRAT_CACHE_FLAGS_DATA_CACHE |
                    CRAT_CACHE_FLAGS_SIMD_CACHE);
        pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2;
        i++;
    }
    /* GL1 Data Cache per SA */
    if (adev->gfx.config.gc_gl1c_per_sa &&
        adev->gfx.config.gc_gl1c_size_per_instance) {
        pcache_info[i].cache_size = adev->gfx.config.gc_gl1c_per_sa *
            adev->gfx.config.gc_gl1c_size_per_instance;
        pcache_info[i].cache_level = 1;
        pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
                    CRAT_CACHE_FLAGS_DATA_CACHE |
                    CRAT_CACHE_FLAGS_SIMD_CACHE);
        pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
        i++;
    }
    /* L2 Data Cache per GPU (Total Tex Cache) */
    if (adev->gfx.config.gc_gl2c_per_gpu) {
        pcache_info[i].cache_size = adev->gfx.config.gc_gl2c_per_gpu;
        pcache_info[i].cache_level = 2;
        pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
                    CRAT_CACHE_FLAGS_DATA_CACHE |
                    CRAT_CACHE_FLAGS_SIMD_CACHE);
        pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
        i++;
    }
    /* L3 Data Cache per GPU */
    if (adev->gmc.mall_size) {
        pcache_info[i].cache_size = adev->gmc.mall_size / 1024;
        pcache_info[i].cache_level = 3;
        pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
                    CRAT_CACHE_FLAGS_DATA_CACHE |
                    CRAT_CACHE_FLAGS_SIMD_CACHE);
        pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
        i++;
    }
    return i;
}
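
/* For example (hypothetical gfx config values): gc_tcp_l1_size = 16 and
 * gc_num_tcp_per_wpg = 2 would yield a level-1, 16 KiB data-cache entry
 * shared by one CU, matching the static per-CU TCP L1 entries above.
 */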

/* kfd_fill_gpu_cache_info - Fill GPU cache info using the kfd_gpu_cache_info
 * tables
 *
 *  @kdev - [IN] GPU device
 *  @gpu_processor_id - [IN] GPU processor ID with which these caches
 *              are associated
 *  @available_size - [IN] Amount of memory available in pcache
 *  @cu_info - [IN] Compute Unit info obtained from KGD
 *  @pcache - [OUT] memory into which cache data is to be filled in.
 *  @size_filled - [OUT] amount of data used up in pcache.
 *  @num_of_entries - [OUT] number of caches added
 */
static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
            int gpu_processor_id,
            int available_size,
            struct kfd_cu_info *cu_info,
            struct crat_subtype_cache *pcache,
            int *size_filled,
            int *num_of_entries)
{
    struct kfd_gpu_cache_info *pcache_info;
    struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES];
    int num_of_cache_types = 0;
    int i, j, k;
    int ct = 0;
    int mem_available = available_size;
    unsigned int cu_processor_id;
    int ret;
    unsigned int num_cu_shared;

    switch (kdev->adev->asic_type) {
    case CHIP_KAVERI:
        pcache_info = kaveri_cache_info;
        num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
        break;
    case CHIP_HAWAII:
        pcache_info = hawaii_cache_info;
        num_of_cache_types = ARRAY_SIZE(hawaii_cache_info);
        break;
    case CHIP_CARRIZO:
        pcache_info = carrizo_cache_info;
        num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
        break;
    case CHIP_TONGA:
        pcache_info = tonga_cache_info;
        num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
        break;
    case CHIP_FIJI:
        pcache_info = fiji_cache_info;
        num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
        break;
    case CHIP_POLARIS10:
        pcache_info = polaris10_cache_info;
        num_of_cache_types = ARRAY_SIZE(polaris10_cache_info);
        break;
    case CHIP_POLARIS11:
        pcache_info = polaris11_cache_info;
        num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
        break;
    case CHIP_POLARIS12:
        pcache_info = polaris12_cache_info;
        num_of_cache_types = ARRAY_SIZE(polaris12_cache_info);
        break;
    case CHIP_VEGAM:
        pcache_info = vegam_cache_info;
        num_of_cache_types = ARRAY_SIZE(vegam_cache_info);
        break;
    default:
        switch (KFD_GC_VERSION(kdev)) {
        case IP_VERSION(9, 0, 1):
            pcache_info = vega10_cache_info;
            num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
            break;
        case IP_VERSION(9, 2, 1):
            pcache_info = vega12_cache_info;
            num_of_cache_types = ARRAY_SIZE(vega12_cache_info);
            break;
        case IP_VERSION(9, 4, 0):
        case IP_VERSION(9, 4, 1):
            pcache_info = vega20_cache_info;
            num_of_cache_types = ARRAY_SIZE(vega20_cache_info);
            break;
        case IP_VERSION(9, 4, 2):
            pcache_info = aldebaran_cache_info;
            num_of_cache_types = ARRAY_SIZE(aldebaran_cache_info);
            break;
        case IP_VERSION(9, 1, 0):
        case IP_VERSION(9, 2, 2):
            pcache_info = raven_cache_info;
            num_of_cache_types = ARRAY_SIZE(raven_cache_info);
            break;
        case IP_VERSION(9, 3, 0):
            pcache_info = renoir_cache_info;
            num_of_cache_types = ARRAY_SIZE(renoir_cache_info);
            break;
        case IP_VERSION(10, 1, 10):
        case IP_VERSION(10, 1, 2):
        case IP_VERSION(10, 1, 3):
        case IP_VERSION(10, 1, 4):
            pcache_info = navi10_cache_info;
            num_of_cache_types = ARRAY_SIZE(navi10_cache_info);
            break;
        case IP_VERSION(10, 1, 1):
            pcache_info = navi14_cache_info;
            num_of_cache_types = ARRAY_SIZE(navi14_cache_info);
            break;
        case IP_VERSION(10, 3, 0):
            pcache_info = sienna_cichlid_cache_info;
            num_of_cache_types = ARRAY_SIZE(sienna_cichlid_cache_info);
            break;
        case IP_VERSION(10, 3, 2):
            pcache_info = navy_flounder_cache_info;
            num_of_cache_types = ARRAY_SIZE(navy_flounder_cache_info);
            break;
        case IP_VERSION(10, 3, 4):
            pcache_info = dimgrey_cavefish_cache_info;
            num_of_cache_types = ARRAY_SIZE(dimgrey_cavefish_cache_info);
            break;
        case IP_VERSION(10, 3, 1):
            pcache_info = vangogh_cache_info;
            num_of_cache_types = ARRAY_SIZE(vangogh_cache_info);
            break;
        case IP_VERSION(10, 3, 5):
            pcache_info = beige_goby_cache_info;
            num_of_cache_types = ARRAY_SIZE(beige_goby_cache_info);
            break;
        case IP_VERSION(10, 3, 3):
        case IP_VERSION(10, 3, 6): /* TODO: Double check these on production silicon */
        case IP_VERSION(10, 3, 7): /* TODO: Double check these on production silicon */
            pcache_info = yellow_carp_cache_info;
            num_of_cache_types = ARRAY_SIZE(yellow_carp_cache_info);
            break;
        case IP_VERSION(11, 0, 0):
        case IP_VERSION(11, 0, 1):
        case IP_VERSION(11, 0, 2):
            pcache_info = cache_info;
            num_of_cache_types =
                kfd_fill_gpu_cache_info_from_gfx_config(kdev, pcache_info);
            break;
        default:
            return -EINVAL;
        }
    }

    *size_filled = 0;
    *num_of_entries = 0;

1537     /* For each cache type listed in the kfd_gpu_cache_info table,
1538      * walk all available Compute Units.
1539      * The [i,j,k] loop behaves as follows:
1540      *      if kfd_gpu_cache_info.num_cu_shared == 1,
1541      *          it visits every available CU individually;
1542      *      if kfd_gpu_cache_info.num_cu_shared != 1,
1543      *          it visits only one CU per group of CUs
1544      *          sharing the cache
1545      */
1546 
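    /* Example (values assumed for illustration): with num_cu_per_sh = 10
     * and num_cu_shared = 4, the k loop below visits CU blocks at
     * k = 0, 4 and 8; the final block covers only two CUs, which the
     * num_cu_shared clamp at the end of the loop body accounts for
     * when advancing cu_processor_id.
     */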
1547     for (ct = 0; ct < num_of_cache_types; ct++) {
1548         cu_processor_id = gpu_processor_id;
1549         if (pcache_info[ct].cache_level == 1) {
1550             for (i = 0; i < cu_info->num_shader_engines; i++) {
1551                 for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
1552                     for (k = 0; k < cu_info->num_cu_per_sh;
1553                          k += pcache_info[ct].num_cu_shared) {
1554                         ret = fill_in_l1_pcache(pcache,
1555                                 pcache_info,
1556                                 cu_info,
1557                                 mem_available,
1558                                 cu_info->cu_bitmap[i % 4][j + i / 4],
1559                                 ct,
1560                                 cu_processor_id,
1561                                 k);
1562 
1563                         if (ret < 0)
1564                             break;
1565 
1566                         if (!ret) {
1567                             pcache++;
1568                             (*num_of_entries)++;
1569                             mem_available -= sizeof(*pcache);
1570                             (*size_filled) += sizeof(*pcache);
1571                         }
1572 
1573                         /* Move to next CU block */
1574                         num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <=
1575                                 cu_info->num_cu_per_sh) ?
1576                                 pcache_info[ct].num_cu_shared :
1577                                 (cu_info->num_cu_per_sh - k);
1578                         cu_processor_id += num_cu_shared;
1579                     }
1580                 }
1581             }
1582         } else {
1583             ret = fill_in_l2_l3_pcache(pcache,
1584                     pcache_info,
1585                     cu_info,
1586                     mem_available,
1587                     ct,
1588                     cu_processor_id);
1589 
1590             if (ret < 0)
1591                 break;
1592 
1593             if (!ret) {
1594                 pcache++;
1595                 (*num_of_entries)++;
1596                 mem_available -= sizeof(*pcache);
1597                 (*size_filled) += sizeof(*pcache);
1598             }
1599         }
1600     }
1601 
1602     pr_debug("Added [%d] GPU cache entries\n", *num_of_entries);
1603 
1604     return 0;
1605 }
1606 
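/* kfd_ignore_crat - Check whether the ACPI CRAT should be ignored.
 *
 * The CRAT is ignored when the ignore_crat module parameter is set or
 * when the driver is built without IOMMUv2 support; in both cases the
 * topology code falls back to a virtual CRAT.
 */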
1607 static bool kfd_ignore_crat(void)
1608 {
1609     bool ret;
1610 
1611     if (ignore_crat)
1612         return true;
1613 
1614 #ifndef KFD_SUPPORT_IOMMU_V2
1615     ret = true;
1616 #else
1617     ret = false;
1618 #endif
1619 
1620     return ret;
1621 }
1622 
1623 /*
1624  * kfd_create_crat_image_acpi - Allocates memory for CRAT image and
1625  * copies CRAT from ACPI (if available).
1626  * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
1627  *
1628  *  @crat_image: CRAT read from ACPI. If no CRAT in ACPI then
1629  *           crat_image will be NULL
1630  *  @size: [OUT] size of crat_image
1631  *
1632  *  Return 0 if successful else return error code
1633  */
1634 int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
1635 {
1636     struct acpi_table_header *crat_table;
1637     acpi_status status;
1638     void *pcrat_image;
1639     int rc = 0;
1640 
1641     if (!crat_image)
1642         return -EINVAL;
1643 
1644     *crat_image = NULL;
1645 
1646     if (kfd_ignore_crat()) {
1647         pr_info("CRAT table disabled by module option\n");
1648         return -ENODATA;
1649     }
1650 
1651     /* Fetch the CRAT table from ACPI */
1652     status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table);
1653     if (status == AE_NOT_FOUND) {
1654         pr_info("CRAT table not found\n");
1655         return -ENODATA;
1656     } else if (ACPI_FAILURE(status)) {
1657         const char *err = acpi_format_exception(status);
1658 
1659         pr_err("CRAT table error: %s\n", err);
1660         return -EINVAL;
1661     }
1662 
1663     pcrat_image = kvmalloc(crat_table->length, GFP_KERNEL);
1664     if (!pcrat_image) {
1665         rc = -ENOMEM;
1666         goto out;
1667     }
1668 
1669     memcpy(pcrat_image, crat_table, crat_table->length);
1670     *crat_image = pcrat_image;
1671     *size = crat_table->length;
1672 out:
1673     acpi_put_table(crat_table);
1674     return rc;
1675 }
1676 
1677 /* Memory required to create Virtual CRAT.
1678  * Since there is no easy way to predict the amount of memory required, the
1679  * following amount is allocated for the GPU Virtual CRAT. This is
1680  * expected to cover all known conditions. To be safe, an additional check
1681  * is put in the code to ensure we don't write beyond the allocated size.
1682  */
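/* For instance, with the common 4 KiB PAGE_SIZE the define below
 * reserves 16 KiB, which must hold the CRAT header plus all
 * compute-unit, memory, cache and io-link subtypes generated for one
 * GPU.
 */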
1683 #define VCRAT_SIZE_FOR_GPU  (4 * PAGE_SIZE)
1684 
1685 /* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node
1686  *  @numa_node_id: CPU NUMA node id
1687  *  @avail_size: Available size in the memory
1688  *  @proximity_domain: Proximity domain to assign to this node
1689  *  @sub_type_hdr: Memory into which compute info will be filled in
1690  *
1691  *  Return 0 if successful else return -ve value
1692  */
1693 static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size,
1694                 int proximity_domain,
1695                 struct crat_subtype_computeunit *sub_type_hdr)
1696 {
1697     const struct cpumask *cpumask;
1698 
1699     *avail_size -= sizeof(struct crat_subtype_computeunit);
1700     if (*avail_size < 0)
1701         return -ENOMEM;
1702 
1703     memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));
1704 
1705     /* Fill in subtype header data */
1706     sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
1707     sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);
1708     sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;
1709 
1710     cpumask = cpumask_of_node(numa_node_id);
1711 
1712     /* Fill in CU data */
1713     sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT;
1714     sub_type_hdr->proximity_domain = proximity_domain;
1715     sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id);
1716     if (sub_type_hdr->processor_id_low == -1)
1717         return -EINVAL;
1718 
1719     sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask);
1720 
1721     return 0;
1722 }
1723 
1724 /* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node
1725  *  @numa_node_id: CPU NUMA node id
1726  *  @avail_size: Available size in the memory
1727  *  @proximity_domain: Proximity domain to assign to this node
1728  *  @sub_type_hdr: Memory into which memory info will be filled in
1729  *
1730  *  Return 0 if successful else return -ve value
1731  */
1732 static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size,
1733             int proximity_domain,
1734             struct crat_subtype_memory *sub_type_hdr)
1735 {
1736     uint64_t mem_in_bytes = 0;
1737     pg_data_t *pgdat;
1738     int zone_type;
1739 
1740     *avail_size -= sizeof(struct crat_subtype_memory);
1741     if (*avail_size < 0)
1742         return -ENOMEM;
1743 
1744     memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
1745 
1746     /* Fill in subtype header data */
1747     sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
1748     sub_type_hdr->length = sizeof(struct crat_subtype_memory);
1749     sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;
1750 
1751     /* Fill in Memory Subunit data */
1752 
1753     /* Unlike si_meminfo, si_meminfo_node is not exported, so the
1754      * following lines duplicate what the si_meminfo_node function
1755      * does: sum the managed pages of every zone on this node.
1756      */
1757     pgdat = NODE_DATA(numa_node_id);
1758     for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
1759         mem_in_bytes += zone_managed_pages(&pgdat->node_zones[zone_type]);
1760     mem_in_bytes <<= PAGE_SHIFT;
1761 
1762     sub_type_hdr->length_low = lower_32_bits(mem_in_bytes);
1763     sub_type_hdr->length_high = upper_32_bits(mem_in_bytes);
1764     sub_type_hdr->proximity_domain = proximity_domain;
1765 
1766     return 0;
1767 }
1768 
1769 #ifdef CONFIG_X86_64
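/* kfd_fill_iolink_info_for_cpu - Fill in IO link info from the given CPU
 *  NUMA node to every other online NUMA node
 *  @numa_node_id: CPU NUMA node id
 *  @avail_size: Available size in the memory
 *  @num_entries: [OUT] number of IO link entries created
 *  @sub_type_hdr: Memory into which IO link info will be filled in
 *
 *  Return 0 if successful else return -ve value
 */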
1770 static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size,
1771                 uint32_t *num_entries,
1772                 struct crat_subtype_iolink *sub_type_hdr)
1773 {
1774     int nid;
1775     struct cpuinfo_x86 *c = &cpu_data(0);
1776     uint8_t link_type;
1777 
1778     if (c->x86_vendor == X86_VENDOR_AMD)
1779         link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT;
1780     else
1781         link_type = CRAT_IOLINK_TYPE_QPI_1_1;
1782 
1783     *num_entries = 0;
1784 
1785     /* Create IO links from this node to other CPU nodes */
1786     for_each_online_node(nid) {
1787         if (nid == numa_node_id) /* node itself */
1788             continue;
1789 
1790         *avail_size -= sizeof(struct crat_subtype_iolink);
1791         if (*avail_size < 0)
1792             return -ENOMEM;
1793 
1794         memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));
1795 
1796         /* Fill in subtype header data */
1797         sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
1798         sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
1799         sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;
1800 
1801         /* Fill in IO link data */
1802         sub_type_hdr->proximity_domain_from = numa_node_id;
1803         sub_type_hdr->proximity_domain_to = nid;
1804         sub_type_hdr->io_interface_type = link_type;
1805 
1806         (*num_entries)++;
1807         sub_type_hdr++;
1808     }
1809 
1810     return 0;
1811 }
1812 #endif
1813 
1814 /* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU
1815  *
1816  *  @pcrat_image: Memory into which the CPU VCRAT is filled in
1817  *  @size:  [IN] allocated size of crat_image.
1818  *      [OUT] actual size of data filled in crat_image
1819  */
1820 static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)
1821 {
1822     struct crat_header *crat_table = (struct crat_header *)pcrat_image;
1823     struct acpi_table_header *acpi_table;
1824     acpi_status status;
1825     struct crat_subtype_generic *sub_type_hdr;
1826     int avail_size = *size;
1827     int numa_node_id;
1828 #ifdef CONFIG_X86_64
1829     uint32_t entries = 0;
1830 #endif
1831     int ret = 0;
1832 
1833     if (!pcrat_image)
1834         return -EINVAL;
1835 
1836     /* Fill in CRAT Header.
1837      * Modify length and total_entries as subunits are added.
1838      */
1839     avail_size -= sizeof(struct crat_header);
1840     if (avail_size < 0)
1841         return -ENOMEM;
1842 
1843     memset(crat_table, 0, sizeof(struct crat_header));
1844     memcpy(&crat_table->signature, CRAT_SIGNATURE,
1845             sizeof(crat_table->signature));
1846     crat_table->length = sizeof(struct crat_header);
1847 
1848     status = acpi_get_table("DSDT", 0, &acpi_table);
1849     if (status != AE_OK) {
1850         pr_warn("DSDT table not found for OEM information\n");
1851     } else {
1852         crat_table->oem_revision = acpi_table->revision;
1853         memcpy(crat_table->oem_id, acpi_table->oem_id,
1854                 CRAT_OEMID_LENGTH);
1855         memcpy(crat_table->oem_table_id, acpi_table->oem_table_id,
1856                 CRAT_OEMTABLEID_LENGTH);
1857         acpi_put_table(acpi_table);
1858     }
1859     crat_table->total_entries = 0;
1860     crat_table->num_domains = 0;
1861 
1862     sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1);
1863 
1864     for_each_online_node(numa_node_id) {
1865         if (kfd_numa_node_to_apic_id(numa_node_id) == -1)
1866             continue;
1867 
1868         /* Fill in Subtype: Compute Unit */
1869         ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size,
1870             crat_table->num_domains,
1871             (struct crat_subtype_computeunit *)sub_type_hdr);
1872         if (ret < 0)
1873             return ret;
1874         crat_table->length += sub_type_hdr->length;
1875         crat_table->total_entries++;
1876 
1877         sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
1878             sub_type_hdr->length);
1879 
1880         /* Fill in Subtype: Memory */
1881         ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size,
1882             crat_table->num_domains,
1883             (struct crat_subtype_memory *)sub_type_hdr);
1884         if (ret < 0)
1885             return ret;
1886         crat_table->length += sub_type_hdr->length;
1887         crat_table->total_entries++;
1888 
1889         sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
1890             sub_type_hdr->length);
1891 
1892         /* Fill in Subtype: IO Link */
1893 #ifdef CONFIG_X86_64
1894         ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size,
1895                 &entries,
1896                 (struct crat_subtype_iolink *)sub_type_hdr);
1897         if (ret < 0)
1898             return ret;
1899 
1900         if (entries) {
1901             crat_table->length += (sub_type_hdr->length * entries);
1902             crat_table->total_entries += entries;
1903 
1904             sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
1905                     sub_type_hdr->length * entries);
1906         }
1907 #else
1908         pr_info("IO link not available for non-x86 platforms\n");
1909 #endif
1910 
1911         crat_table->num_domains++;
1912     }
1913 
1914     /* TODO: Add cache Subtype for CPU.
1915      * Currently, CPU cache information is available in function
1916      * detect_cache_attributes(cpu) defined in the file
1917      * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not
1918      * exported and to get the same information the code needs to be
1919      * duplicated.
1920      */
1921 
1922     *size = crat_table->length;
1923     pr_info("Virtual CRAT table created for CPU\n");
1924 
1925     return 0;
1926 }
1927 
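/* kfd_fill_gpu_memory_affinity - Fill in Memory subtype for GPU local memory
 *  @avail_size: Available size in the memory
 *  @kdev: [IN] GPU device
 *  @type: [IN] heap visibility type (public or private FB)
 *  @size: [IN] size of the reported memory in bytes
 *  @sub_type_hdr: Memory into which memory info will be filled in
 *  @proximity_domain: proximity domain of the GPU node
 *  @local_mem_info: [IN] local memory properties obtained from KGD
 *
 *  Return 0 if successful else return -ve value
 */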
1928 static int kfd_fill_gpu_memory_affinity(int *avail_size,
1929         struct kfd_dev *kdev, uint8_t type, uint64_t size,
1930         struct crat_subtype_memory *sub_type_hdr,
1931         uint32_t proximity_domain,
1932         const struct kfd_local_mem_info *local_mem_info)
1933 {
1934     *avail_size -= sizeof(struct crat_subtype_memory);
1935     if (*avail_size < 0)
1936         return -ENOMEM;
1937 
1938     memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
1939     sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
1940     sub_type_hdr->length = sizeof(struct crat_subtype_memory);
1941     sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
1942 
1943     sub_type_hdr->proximity_domain = proximity_domain;
1944 
1945     pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n",
1946             type, size);
1947 
1948     sub_type_hdr->length_low = lower_32_bits(size);
1949     sub_type_hdr->length_high = upper_32_bits(size);
1950 
1951     sub_type_hdr->width = local_mem_info->vram_width;
1952     sub_type_hdr->visibility_type = type;
1953 
1954     return 0;
1955 }
1956 
1957 #ifdef CONFIG_ACPI_NUMA
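/* kfd_find_numa_node_in_srat - Look up the GPU's NUMA node in the ACPI SRAT
 *  @kdev: [IN] GPU device
 *
 * Walks the SRAT entries for a generic-affinity entry whose PCI
 * segment/BDF matches this GPU and, if one is found, binds the PCI
 * device to the corresponding NUMA node. Called when the firmware
 * left the device without a node assignment (NUMA_NO_NODE).
 */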
1958 static void kfd_find_numa_node_in_srat(struct kfd_dev *kdev)
1959 {
1960     struct acpi_table_header *table_header = NULL;
1961     struct acpi_subtable_header *sub_header = NULL;
1962     unsigned long table_end, subtable_len;
1963     u32 pci_id = pci_domain_nr(kdev->pdev->bus) << 16 |
1964             pci_dev_id(kdev->pdev);
1965     u32 bdf;
1966     acpi_status status;
1967     struct acpi_srat_cpu_affinity *cpu;
1968     struct acpi_srat_generic_affinity *gpu;
1969     int pxm = 0, max_pxm = 0;
1970     int numa_node = NUMA_NO_NODE;
1971     bool found = false;
1972 
1973     /* Fetch the SRAT table from ACPI */
1974     status = acpi_get_table(ACPI_SIG_SRAT, 0, &table_header);
1975     if (status == AE_NOT_FOUND) {
1976         pr_warn("SRAT table not found\n");
1977         return;
1978     } else if (ACPI_FAILURE(status)) {
1979         const char *err = acpi_format_exception(status);
1980         pr_err("SRAT table error: %s\n", err);
1981         return;
1982     }
1983 
1984     table_end = (unsigned long)table_header + table_header->length;
1985 
1986     /* Parse all entries looking for a match. */
1987     sub_header = (struct acpi_subtable_header *)
1988             ((unsigned long)table_header +
1989             sizeof(struct acpi_table_srat));
1990     subtable_len = sub_header->length;
1991 
1992     while (((unsigned long)sub_header) + subtable_len < table_end) {
1993         /*
1994          * If length is 0, break from this loop to avoid
1995          * infinite loop.
1996          */
1997         if (subtable_len == 0) {
1998             pr_err("SRAT invalid zero length\n");
1999             break;
2000         }
2001 
2002         switch (sub_header->type) {
2003         case ACPI_SRAT_TYPE_CPU_AFFINITY:
2004             cpu = (struct acpi_srat_cpu_affinity *)sub_header;
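            /* Reassemble the 32-bit proximity domain from the split
             * proximity_domain_hi/lo fields of the CPU affinity entry.
             */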
2005             pxm = *((u32 *)cpu->proximity_domain_hi) << 8 |
2006                     cpu->proximity_domain_lo;
2007             if (pxm > max_pxm)
2008                 max_pxm = pxm;
2009             break;
2010         case ACPI_SRAT_TYPE_GENERIC_AFFINITY:
2011             gpu = (struct acpi_srat_generic_affinity *)sub_header;
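            /* The PCI segment and BDF of the reported device are packed
             * into the first four bytes of device_handle; compare them
             * against this GPU's pci_id computed above.
             */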
2012             bdf = *((u16 *)(&gpu->device_handle[0])) << 16 |
2013                     *((u16 *)(&gpu->device_handle[2]));
2014             if (bdf == pci_id) {
2015                 found = true;
2016                 numa_node = pxm_to_node(gpu->proximity_domain);
2017             }
2018             break;
2019         default:
2020             break;
2021         }
2022 
2023         if (found)
2024             break;
2025 
2026         sub_header = (struct acpi_subtable_header *)
2027                 ((unsigned long)sub_header + subtable_len);
2028         subtable_len = sub_header->length;
2029     }
2030 
2031     acpi_put_table(table_header);
2032 
2033     /* Workaround bad cpu-gpu binding case */
2034     if (found && (numa_node < 0 ||
2035             numa_node > pxm_to_node(max_pxm)))
2036         numa_node = 0;
2037 
2038     if (numa_node != NUMA_NO_NODE)
2039         set_dev_node(&kdev->pdev->dev, numa_node);
2040 }
2041 #endif
2042 
2043 /* kfd_fill_gpu_direct_io_link_to_cpu - Fill in direct io link from GPU
2044  * to its NUMA node
2045  *  @avail_size: Available size in the memory
2046  *  @kdev: [IN] GPU device
2047  *  @sub_type_hdr: Memory into which io link info will be filled in
2048  *  @proximity_domain: proximity domain of the GPU node
2049  *
2050  *  Return 0 if successful else return -ve value
2051  */
2052 static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
2053             struct kfd_dev *kdev,
2054             struct crat_subtype_iolink *sub_type_hdr,
2055             uint32_t proximity_domain)
2056 {
2057     *avail_size -= sizeof(struct crat_subtype_iolink);
2058     if (*avail_size < 0)
2059         return -ENOMEM;
2060 
2061     memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));
2062 
2063     /* Fill in subtype header data */
2064     sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
2065     sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
2066     sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
2067     if (kfd_dev_is_large_bar(kdev))
2068         sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
2069 
2070     /* Fill in IOLINK subtype.
2071      * TODO: Fill-in other fields of iolink subtype
2072      */
2073     if (kdev->adev->gmc.xgmi.connected_to_cpu) {
2074         /*
2075          * With a host-GPU xGMI link, the host can access GPU memory
2076          * regardless of whether the PCIe BAR is large, so always
2077          * create a bidirectional io link.
2078          */
2079         sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
2080         sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
2081         sub_type_hdr->num_hops_xgmi = 1;
2082         if (KFD_GC_VERSION(kdev) == IP_VERSION(9, 4, 2)) {
2083             sub_type_hdr->minimum_bandwidth_mbs =
2084                     amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(
2085                             kdev->adev, NULL, true);
2086             sub_type_hdr->maximum_bandwidth_mbs =
2087                     sub_type_hdr->minimum_bandwidth_mbs;
2088         }
2089     } else {
2090         sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
2091         sub_type_hdr->minimum_bandwidth_mbs =
2092                 amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, true);
2093         sub_type_hdr->maximum_bandwidth_mbs =
2094                 amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, false);
2095     }
2096 
2097     sub_type_hdr->proximity_domain_from = proximity_domain;
2098 
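    /* Determine which NUMA node the GPU is attached to: consult the
     * SRAT when the firmware left it unset, and fall back to node 0
     * if no usable NUMA information is available.
     */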
2099 #ifdef CONFIG_ACPI_NUMA
2100     if (kdev->pdev->dev.numa_node == NUMA_NO_NODE)
2101         kfd_find_numa_node_in_srat(kdev);
2102 #endif
2103 #ifdef CONFIG_NUMA
2104     if (kdev->pdev->dev.numa_node == NUMA_NO_NODE)
2105         sub_type_hdr->proximity_domain_to = 0;
2106     else
2107         sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node;
2108 #else
2109     sub_type_hdr->proximity_domain_to = 0;
2110 #endif
2111     return 0;
2112 }
2113 
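/* kfd_fill_gpu_xgmi_link_to_gpu - Fill in direct xGMI io link from this
 *  GPU to a peer GPU in the same hive
 *  @avail_size: Available size in the memory
 *  @kdev: [IN] GPU device
 *  @peer_kdev: [IN] peer GPU device
 *  @sub_type_hdr: Memory into which io link info will be filled in
 *  @proximity_domain_from: proximity domain of this GPU
 *  @proximity_domain_to: proximity domain of the peer GPU
 *
 *  Return 0 if successful else return -ve value
 */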
2114 static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
2115             struct kfd_dev *kdev,
2116             struct kfd_dev *peer_kdev,
2117             struct crat_subtype_iolink *sub_type_hdr,
2118             uint32_t proximity_domain_from,
2119             uint32_t proximity_domain_to)
2120 {
2121     *avail_size -= sizeof(struct crat_subtype_iolink);
2122     if (*avail_size < 0)
2123         return -ENOMEM;
2124 
2125     memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));
2126 
2127     sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
2128     sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
2129     sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED |
2130                    CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
2131 
2132     sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
2133     sub_type_hdr->proximity_domain_from = proximity_domain_from;
2134     sub_type_hdr->proximity_domain_to = proximity_domain_to;
2135     sub_type_hdr->num_hops_xgmi =
2136         amdgpu_amdkfd_get_xgmi_hops_count(kdev->adev, peer_kdev->adev);
2137     sub_type_hdr->maximum_bandwidth_mbs =
2138         amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev, peer_kdev->adev, false);
2139     sub_type_hdr->minimum_bandwidth_mbs = sub_type_hdr->maximum_bandwidth_mbs ?
2140         amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev, NULL, true) : 0;
2141 
2142     return 0;
2143 }
2144 
2145 /* kfd_create_vcrat_image_gpu - Create Virtual CRAT for GPU
2146  *  @pcrat_image: Memory into which the GPU VCRAT is filled in
2147  *  @size: [IN] allocated size of crat_image; [OUT] actual size filled in
2148  *  @kdev: [IN] GPU device
2149  *  @proximity_domain: proximity domain of the GPU node
2150  */
2151 static int kfd_create_vcrat_image_gpu(void *pcrat_image,
2152                       size_t *size, struct kfd_dev *kdev,
2153                       uint32_t proximity_domain)
2154 {
2155     struct crat_header *crat_table = (struct crat_header *)pcrat_image;
2156     struct crat_subtype_generic *sub_type_hdr;
2157     struct kfd_local_mem_info local_mem_info;
2158     struct kfd_topology_device *peer_dev;
2159     struct crat_subtype_computeunit *cu;
2160     struct kfd_cu_info cu_info;
2161     int avail_size = *size;
2162     uint32_t total_num_of_cu;
2163     int num_of_cache_entries = 0;
2164     int cache_mem_filled = 0;
2165     uint32_t nid = 0;
2166     int ret = 0;
2167 
2168     if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU)
2169         return -EINVAL;
2170 
2171     /* Fill the CRAT Header.
2172      * Modify length and total_entries as subunits are added.
2173      */
2174     avail_size -= sizeof(struct crat_header);
2175     if (avail_size < 0)
2176         return -ENOMEM;
2177 
2178     memset(crat_table, 0, sizeof(struct crat_header));
2179 
2180     memcpy(&crat_table->signature, CRAT_SIGNATURE,
2181             sizeof(crat_table->signature));
2182     /* Change length as we add more subtypes */
2183     crat_table->length = sizeof(struct crat_header);
2184     crat_table->num_domains = 1;
2185     crat_table->total_entries = 0;
2186 
2187     /* Fill in Subtype: Compute Unit
2188      * First fill in the sub type header and then sub type data
2189      */
2190     avail_size -= sizeof(struct crat_subtype_computeunit);
2191     if (avail_size < 0)
2192         return -ENOMEM;
2193 
2194     sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1);
2195     memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));
2196 
2197     sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
2198     sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);
2199     sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;
2200 
2201     /* Fill CU subtype data */
2202     cu = (struct crat_subtype_computeunit *)sub_type_hdr;
2203     cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT;
2204     cu->proximity_domain = proximity_domain;
2205 
2206     amdgpu_amdkfd_get_cu_info(kdev->adev, &cu_info);
2207     cu->num_simd_per_cu = cu_info.simd_per_cu;
2208     cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number;
2209     cu->max_waves_simd = cu_info.max_waves_per_simd;
2210 
2211     cu->wave_front_size = cu_info.wave_front_size;
2212     cu->array_count = cu_info.num_shader_arrays_per_engine *
2213         cu_info.num_shader_engines;
2214     total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh);
2215     cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);
2216     cu->num_cu_per_array = cu_info.num_cu_per_sh;
2217     cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu;
2218     cu->num_banks = cu_info.num_shader_engines;
2219     cu->lds_size_in_kb = cu_info.lds_size;
2220 
2221     cu->hsa_capability = 0;
2222 
2223     /* Check if this node supports IOMMU. During parsing this flag will
2224      * translate to HSA_CAP_ATS_PRESENT
2225      */
2226     if (!kfd_iommu_check_device(kdev))
2227         cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT;
2228 
2229     crat_table->length += sub_type_hdr->length;
2230     crat_table->total_entries++;
2231 
2232     /* Fill in Subtype: Memory. Only on systems with large BAR (no
2233      * private FB), report memory as public. On other systems
2234      * report the total FB size (public+private) as a single
2235      * private heap.
2236      */
2237     local_mem_info = kdev->local_mem_info;
2238     sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
2239             sub_type_hdr->length);
2240 
2241     if (debug_largebar)
2242         local_mem_info.local_mem_size_private = 0;
2243 
2244     if (local_mem_info.local_mem_size_private == 0)
2245         ret = kfd_fill_gpu_memory_affinity(&avail_size,
2246                 kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC,
2247                 local_mem_info.local_mem_size_public,
2248                 (struct crat_subtype_memory *)sub_type_hdr,
2249                 proximity_domain,
2250                 &local_mem_info);
2251     else
2252         ret = kfd_fill_gpu_memory_affinity(&avail_size,
2253                 kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE,
2254                 local_mem_info.local_mem_size_public +
2255                 local_mem_info.local_mem_size_private,
2256                 (struct crat_subtype_memory *)sub_type_hdr,
2257                 proximity_domain,
2258                 &local_mem_info);
2259     if (ret < 0)
2260         return ret;
2261 
2262     crat_table->length += sizeof(struct crat_subtype_memory);
2263     crat_table->total_entries++;
2264 
2265     /* Fill in Subtype: Cache. This information is not readily available
2266      * in KGD; it is filled from the static kfd_gpu_cache_info tables.
2267      */
2268     sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
2269         sub_type_hdr->length);
2270     ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low,
2271                 avail_size,
2272                 &cu_info,
2273                 (struct crat_subtype_cache *)sub_type_hdr,
2274                 &cache_mem_filled,
2275                 &num_of_cache_entries);
2276 
2277     if (ret < 0)
2278         return ret;
2279 
2280     crat_table->length += cache_mem_filled;
2281     crat_table->total_entries += num_of_cache_entries;
2282     avail_size -= cache_mem_filled;
2283 
2284     /* Fill in Subtype: IO_LINKS
2285      *  Only direct links are added here, i.e. the link from the GPU
2286      *  to its NUMA node. Indirect links are added by userspace.
2287      */
2288     sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
2289         cache_mem_filled);
2290     ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev,
2291         (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);
2292 
2293     if (ret < 0)
2294         return ret;
2295 
2296     crat_table->length += sub_type_hdr->length;
2297     crat_table->total_entries++;
2298 
2299 
2300     /* Fill in Subtype: IO_LINKS
2301      * Direct links from this GPU to other GPUs through xGMI.
2302      * We loop over the GPUs that have already been processed (those
2303      * with a lower proximity_domain value) and add a link from this
2304      * GPU to every GPU sharing the same hive id. The reversed iolink
2305      * (from the other GPU to this GPU) will be added
2306      * in kfd_parse_subtype_iolink.
2307      */
2308     if (kdev->hive_id) {
2309         for (nid = 0; nid < proximity_domain; ++nid) {
2310             peer_dev = kfd_topology_device_by_proximity_domain_no_lock(nid);
2311             if (!peer_dev || !peer_dev->gpu)
2312                 continue;
2313             if (peer_dev->gpu->hive_id != kdev->hive_id)
2314                 continue;
2315             sub_type_hdr = (typeof(sub_type_hdr))(
2316                 (char *)sub_type_hdr +
2317                 sizeof(struct crat_subtype_iolink));
2318             ret = kfd_fill_gpu_xgmi_link_to_gpu(
2319                 &avail_size, kdev, peer_dev->gpu,
2320                 (struct crat_subtype_iolink *)sub_type_hdr,
2321                 proximity_domain, nid);
2322             if (ret < 0)
2323                 return ret;
2324             crat_table->length += sub_type_hdr->length;
2325             crat_table->total_entries++;
2326         }
2327     }
2328     *size = crat_table->length;
2329     pr_info("Virtual CRAT table created for GPU\n");
2330 
2331     return ret;
2332 }
2333 
2334 /* kfd_create_crat_image_virtual - Allocates memory for CRAT image and
2335  *      creates a Virtual CRAT (VCRAT) image
2336  *
2337  * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
2338  *
2339  *  @crat_image: VCRAT image created because ACPI does not have a
2340  *           CRAT for this device
2341  *  @size: [OUT] size of virtual crat_image
2342  *  @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device
2343  *      COMPUTE_UNIT_GPU - Create VCRAT for GPU
2344  *      (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU
2345  *          -- this option is not currently implemented.
2346  *          The assumption is that all AMD APUs will have CRAT
2347  *  @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU
2348  *
2349  *  Return 0 if successful else return -ve value
2350  */
2351 int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
2352                   int flags, struct kfd_dev *kdev,
2353                   uint32_t proximity_domain)
2354 {
2355     void *pcrat_image = NULL;
2356     int ret = 0, num_nodes;
2357     size_t dyn_size;
2358 
2359     if (!crat_image)
2360         return -EINVAL;
2361 
2362     *crat_image = NULL;
2363 
2364     /* Size the CPU Virtual CRAT based on the number of online NUMA
2365      * nodes; allocate VCRAT_SIZE_FOR_GPU for the GPU virtual CRAT image.
2366      * This should cover all current conditions. A check prevents
2367      * writing beyond the allocated size for GPUs.
2368      */
2369     switch (flags) {
2370     case COMPUTE_UNIT_CPU:
2371         num_nodes = num_online_nodes();
2372         dyn_size = sizeof(struct crat_header) +
2373             num_nodes * (sizeof(struct crat_subtype_computeunit) +
2374             sizeof(struct crat_subtype_memory) +
2375             (num_nodes - 1) * sizeof(struct crat_subtype_iolink));
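        /* For example, on a two-node system this is one CRAT header
         * plus, for each node, one compute-unit subtype, one memory
         * subtype and a single io link to the other node.
         */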
2376         pcrat_image = kvmalloc(dyn_size, GFP_KERNEL);
2377         if (!pcrat_image)
2378             return -ENOMEM;
2379         *size = dyn_size;
2380         pr_debug("CRAT size is %zu\n", dyn_size);
2381         ret = kfd_create_vcrat_image_cpu(pcrat_image, size);
2382         break;
2383     case COMPUTE_UNIT_GPU:
2384         if (!kdev)
2385             return -EINVAL;
2386         pcrat_image = kvmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);
2387         if (!pcrat_image)
2388             return -ENOMEM;
2389         *size = VCRAT_SIZE_FOR_GPU;
2390         ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev,
2391                          proximity_domain);
2392         break;
2393     case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU):
2394         /* TODO: */
2395         ret = -EINVAL;
2396         pr_err("VCRAT not implemented for APU\n");
2397         break;
2398     default:
2399         ret = -EINVAL;
2400     }
2401 
2402     if (!ret)
2403         *crat_image = pcrat_image;
2404     else
2405         kvfree(pcrat_image);
2406 
2407     return ret;
2408 }
2409 
2410 
2411 /* kfd_destroy_crat_image
2412  *
2413  *  @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..)
2414  *
2415  */
2416 void kfd_destroy_crat_image(void *crat_image)
2417 {
2418     kvfree(crat_image);
2419 }
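
/* Usage sketch (illustrative only, not taken verbatim from the
 * topology code): a caller such as kfd_topology_init() would
 * typically pair creation and destruction as follows:
 *
 *	void *crat_image = NULL;
 *	size_t image_size = 0;
 *
 *	if (kfd_create_crat_image_acpi(&crat_image, &image_size))
 *		kfd_create_crat_image_virtual(&crat_image, &image_size,
 *					      COMPUTE_UNIT_CPU, NULL, 0);
 *	...
 *	kfd_destroy_crat_image(crat_image);
 */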