// SPDX-License-Identifier: GPL-2.0
/*
 * Basic Node interface support
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/vmstat.h>
#include <linux/notifier.h>
#include <linux/node.h>
#include <linux/hugetlb.h>
#include <linux/compaction.h>
#include <linux/cpumask.h>
#include <linux/topology.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/device.h>
#include <linux/pm_runtime.h>
#include <linux/swap.h>
#include <linux/slab.h>

static struct bus_type node_subsys = {
    .name = "node",
    .dev_name = "node",
};

static inline ssize_t cpumap_read(struct file *file, struct kobject *kobj,
                  struct bin_attribute *attr, char *buf,
                  loff_t off, size_t count)
{
    struct device *dev = kobj_to_dev(kobj);
    struct node *node_dev = to_node(dev);
    cpumask_var_t mask;
    ssize_t n;

    if (!alloc_cpumask_var(&mask, GFP_KERNEL))
        return 0;

    cpumask_and(mask, cpumask_of_node(node_dev->dev.id), cpu_online_mask);
    n = cpumap_print_bitmask_to_buf(buf, mask, off, count);
    free_cpumask_var(mask);

    return n;
}

static BIN_ATTR_RO(cpumap, CPUMAP_FILE_MAX_BYTES);

static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj,
                   struct bin_attribute *attr, char *buf,
                   loff_t off, size_t count)
{
    struct device *dev = kobj_to_dev(kobj);
    struct node *node_dev = to_node(dev);
    cpumask_var_t mask;
    ssize_t n;

    if (!alloc_cpumask_var(&mask, GFP_KERNEL))
        return 0;

    cpumask_and(mask, cpumask_of_node(node_dev->dev.id), cpu_online_mask);
    n = cpumap_print_list_to_buf(buf, mask, off, count);
    free_cpumask_var(mask);

    return n;
}

static BIN_ATTR_RO(cpulist, CPULIST_FILE_MAX_BYTES);
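
/*
 * Illustrative example (not part of the driver): the two bin attributes
 * above back /sys/devices/system/node/nodeN/cpumap and .../cpulist.
 * On a hypothetical node 0 owning online CPUs 0-3, userspace might see
 * something like:
 *
 *   $ cat /sys/devices/system/node/node0/cpumap
 *   f
 *   $ cat /sys/devices/system/node/node0/cpulist
 *   0-3
 */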

/**
 * struct node_access_nodes - Access class device to hold user visible
 *                relationships to other nodes.
 * @dev:    Device for this memory access class
 * @list_node:  List element in the node's access list
 * @access: The access class rank
 * @hmem_attrs: Heterogeneous memory performance attributes
 */
struct node_access_nodes {
    struct device       dev;
    struct list_head    list_node;
    unsigned int        access;
#ifdef CONFIG_HMEM_REPORTING
    struct node_hmem_attrs  hmem_attrs;
#endif
};
#define to_access_nodes(dev) container_of(dev, struct node_access_nodes, dev)

static struct attribute *node_init_access_node_attrs[] = {
    NULL,
};

static struct attribute *node_targ_access_node_attrs[] = {
    NULL,
};

static const struct attribute_group initiators = {
    .name   = "initiators",
    .attrs  = node_init_access_node_attrs,
};

static const struct attribute_group targets = {
    .name   = "targets",
    .attrs  = node_targ_access_node_attrs,
};

static const struct attribute_group *node_access_node_groups[] = {
    &initiators,
    &targets,
    NULL,
};

static void node_remove_accesses(struct node *node)
{
    struct node_access_nodes *c, *cnext;

    list_for_each_entry_safe(c, cnext, &node->access_list, list_node) {
        list_del(&c->list_node);
        device_unregister(&c->dev);
    }
}

static void node_access_release(struct device *dev)
{
    kfree(to_access_nodes(dev));
}

static struct node_access_nodes *node_init_node_access(struct node *node,
                               unsigned int access)
{
    struct node_access_nodes *access_node;
    struct device *dev;

    list_for_each_entry(access_node, &node->access_list, list_node)
        if (access_node->access == access)
            return access_node;

    access_node = kzalloc(sizeof(*access_node), GFP_KERNEL);
    if (!access_node)
        return NULL;

    access_node->access = access;
    dev = &access_node->dev;
    dev->parent = &node->dev;
    dev->release = node_access_release;
    dev->groups = node_access_node_groups;
    if (dev_set_name(dev, "access%u", access))
        goto free;

    if (device_register(dev))
        goto free_name;

    pm_runtime_no_callbacks(dev);
    list_add_tail(&access_node->list_node, &node->access_list);
    return access_node;
free_name:
    kfree_const(dev->kobj.name);
free:
    kfree(access_node);
    return NULL;
}

#ifdef CONFIG_HMEM_REPORTING
#define ACCESS_ATTR(name)                       \
static ssize_t name##_show(struct device *dev,              \
               struct device_attribute *attr,       \
               char *buf)                   \
{                                   \
    return sysfs_emit(buf, "%u\n",                  \
              to_access_nodes(dev)->hmem_attrs.name);   \
}                                   \
static DEVICE_ATTR_RO(name)

ACCESS_ATTR(read_bandwidth);
ACCESS_ATTR(read_latency);
ACCESS_ATTR(write_bandwidth);
ACCESS_ATTR(write_latency);

static struct attribute *access_attrs[] = {
    &dev_attr_read_bandwidth.attr,
    &dev_attr_read_latency.attr,
    &dev_attr_write_bandwidth.attr,
    &dev_attr_write_latency.attr,
    NULL,
};

/**
 * node_set_perf_attrs - Set the performance values for given access class
 * @nid: Node identifier to be set
 * @hmem_attrs: Heterogeneous memory performance attributes
 * @access: The access class for the given attributes
 */
void node_set_perf_attrs(unsigned int nid, struct node_hmem_attrs *hmem_attrs,
             unsigned int access)
{
    struct node_access_nodes *c;
    struct node *node;
    int i;

    if (WARN_ON_ONCE(!node_online(nid)))
        return;

    node = node_devices[nid];
    c = node_init_node_access(node, access);
    if (!c)
        return;

    c->hmem_attrs = *hmem_attrs;
    for (i = 0; access_attrs[i] != NULL; i++) {
        if (sysfs_add_file_to_group(&c->dev.kobj, access_attrs[i],
                        "initiators")) {
            pr_info("failed to add performance attribute to node %d\n",
                nid);
            break;
        }
    }
}
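
/*
 * Usage sketch (illustrative; not a caller in this file).  A firmware
 * parser, such as the ACPI HMAT code, can publish the performance it
 * discovered for an online node under a given access class:
 *
 *   struct node_hmem_attrs attrs = {
 *       .read_bandwidth  = 1000,
 *       .write_bandwidth = 1000,
 *       .read_latency    = 100,
 *       .write_latency   = 100,
 *   };
 *   node_set_perf_attrs(nid, &attrs, 0);
 *
 * (values made up for illustration)  The attributes then appear under
 * /sys/devices/system/node/nodeN/access0/initiators/.
 */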

/**
 * struct node_cache_info - Internal tracking for memory node caches
 * @dev:    Device representing the cache level
 * @node:   List element for tracking in the node
 * @cache_attrs: Attributes for this cache level
 */
struct node_cache_info {
    struct device dev;
    struct list_head node;
    struct node_cache_attrs cache_attrs;
};
#define to_cache_info(device) container_of(device, struct node_cache_info, dev)

#define CACHE_ATTR(name, fmt)                       \
static ssize_t name##_show(struct device *dev,              \
               struct device_attribute *attr,       \
               char *buf)                   \
{                                   \
    return sysfs_emit(buf, fmt "\n",                \
              to_cache_info(dev)->cache_attrs.name);    \
}                                   \
static DEVICE_ATTR_RO(name);

CACHE_ATTR(size, "%llu")
CACHE_ATTR(line_size, "%u")
CACHE_ATTR(indexing, "%u")
CACHE_ATTR(write_policy, "%u")

static struct attribute *cache_attrs[] = {
    &dev_attr_indexing.attr,
    &dev_attr_size.attr,
    &dev_attr_line_size.attr,
    &dev_attr_write_policy.attr,
    NULL,
};
ATTRIBUTE_GROUPS(cache);

static void node_cache_release(struct device *dev)
{
    kfree(dev);
}

static void node_cacheinfo_release(struct device *dev)
{
    struct node_cache_info *info = to_cache_info(dev);
    kfree(info);
}

static void node_init_cache_dev(struct node *node)
{
    struct device *dev;

    dev = kzalloc(sizeof(*dev), GFP_KERNEL);
    if (!dev)
        return;

    device_initialize(dev);
    dev->parent = &node->dev;
    dev->release = node_cache_release;
    if (dev_set_name(dev, "memory_side_cache"))
        goto put_device;

    if (device_add(dev))
        goto put_device;

    pm_runtime_no_callbacks(dev);
    node->cache_dev = dev;
    return;
put_device:
    put_device(dev);
}

/**
 * node_add_cache() - add cache attribute to a memory node
 * @nid: Node identifier that has new cache attributes
 * @cache_attrs: Attributes for the cache being added
 */
void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs)
{
    struct node_cache_info *info;
    struct device *dev;
    struct node *node;

    if (!node_online(nid) || !node_devices[nid])
        return;

    node = node_devices[nid];
    list_for_each_entry(info, &node->cache_attrs, node) {
        if (info->cache_attrs.level == cache_attrs->level) {
            dev_warn(&node->dev,
                "attempt to add duplicate cache level:%d\n",
                cache_attrs->level);
            return;
        }
    }

    if (!node->cache_dev)
        node_init_cache_dev(node);
    if (!node->cache_dev)
        return;

    info = kzalloc(sizeof(*info), GFP_KERNEL);
    if (!info)
        return;

    dev = &info->dev;
    device_initialize(dev);
    dev->parent = node->cache_dev;
    dev->release = node_cacheinfo_release;
    dev->groups = cache_groups;
    if (dev_set_name(dev, "index%d", cache_attrs->level))
        goto put_device;

    info->cache_attrs = *cache_attrs;
    if (device_add(dev)) {
        dev_warn(&node->dev, "failed to add cache level:%d\n",
             cache_attrs->level);
        goto put_device;
    }
    pm_runtime_no_callbacks(dev);
    list_add_tail(&info->node, &node->cache_attrs);
    return;
put_device:
    put_device(dev);
}
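
/*
 * Illustrative resulting sysfs layout (assuming a node 1 that reported
 * a level 1 memory-side cache):
 *
 *   /sys/devices/system/node/node1/memory_side_cache/index1/size
 *   /sys/devices/system/node/node1/memory_side_cache/index1/line_size
 *   /sys/devices/system/node/node1/memory_side_cache/index1/indexing
 *   /sys/devices/system/node/node1/memory_side_cache/index1/write_policy
 */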

static void node_remove_caches(struct node *node)
{
    struct node_cache_info *info, *next;

    if (!node->cache_dev)
        return;

    list_for_each_entry_safe(info, next, &node->cache_attrs, node) {
        list_del(&info->node);
        device_unregister(&info->dev);
    }
    device_unregister(node->cache_dev);
}

static void node_init_caches(unsigned int nid)
{
    INIT_LIST_HEAD(&node_devices[nid]->cache_attrs);
}
#else
static void node_init_caches(unsigned int nid) { }
static void node_remove_caches(struct node *node) { }
#endif

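/*
 * K() converts a count of pages into kilobytes: with 4 KiB pages
 * (PAGE_SHIFT == 12), K(x) == x << 2 == x * 4.
 */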
#define K(x) ((x) << (PAGE_SHIFT - 10))
static ssize_t node_read_meminfo(struct device *dev,
            struct device_attribute *attr, char *buf)
{
    int len = 0;
    int nid = dev->id;
    struct pglist_data *pgdat = NODE_DATA(nid);
    struct sysinfo i;
    unsigned long sreclaimable, sunreclaimable;
    unsigned long swapcached = 0;

    si_meminfo_node(&i, nid);
    sreclaimable = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B);
    sunreclaimable = node_page_state_pages(pgdat, NR_SLAB_UNRECLAIMABLE_B);
#ifdef CONFIG_SWAP
    swapcached = node_page_state_pages(pgdat, NR_SWAPCACHE);
#endif
    len = sysfs_emit_at(buf, len,
                "Node %d MemTotal:       %8lu kB\n"
                "Node %d MemFree:        %8lu kB\n"
                "Node %d MemUsed:        %8lu kB\n"
                "Node %d SwapCached:     %8lu kB\n"
                "Node %d Active:         %8lu kB\n"
                "Node %d Inactive:       %8lu kB\n"
                "Node %d Active(anon):   %8lu kB\n"
                "Node %d Inactive(anon): %8lu kB\n"
                "Node %d Active(file):   %8lu kB\n"
                "Node %d Inactive(file): %8lu kB\n"
                "Node %d Unevictable:    %8lu kB\n"
                "Node %d Mlocked:        %8lu kB\n",
                nid, K(i.totalram),
                nid, K(i.freeram),
                nid, K(i.totalram - i.freeram),
                nid, K(swapcached),
                nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) +
                   node_page_state(pgdat, NR_ACTIVE_FILE)),
                nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) +
                   node_page_state(pgdat, NR_INACTIVE_FILE)),
                nid, K(node_page_state(pgdat, NR_ACTIVE_ANON)),
                nid, K(node_page_state(pgdat, NR_INACTIVE_ANON)),
                nid, K(node_page_state(pgdat, NR_ACTIVE_FILE)),
                nid, K(node_page_state(pgdat, NR_INACTIVE_FILE)),
                nid, K(node_page_state(pgdat, NR_UNEVICTABLE)),
                nid, K(sum_zone_node_page_state(nid, NR_MLOCK)));

#ifdef CONFIG_HIGHMEM
    len += sysfs_emit_at(buf, len,
                 "Node %d HighTotal:      %8lu kB\n"
                 "Node %d HighFree:       %8lu kB\n"
                 "Node %d LowTotal:       %8lu kB\n"
                 "Node %d LowFree:        %8lu kB\n",
                 nid, K(i.totalhigh),
                 nid, K(i.freehigh),
                 nid, K(i.totalram - i.totalhigh),
                 nid, K(i.freeram - i.freehigh));
#endif
    len += sysfs_emit_at(buf, len,
                 "Node %d Dirty:          %8lu kB\n"
                 "Node %d Writeback:      %8lu kB\n"
                 "Node %d FilePages:      %8lu kB\n"
                 "Node %d Mapped:         %8lu kB\n"
                 "Node %d AnonPages:      %8lu kB\n"
                 "Node %d Shmem:          %8lu kB\n"
                 "Node %d KernelStack:    %8lu kB\n"
#ifdef CONFIG_SHADOW_CALL_STACK
                 "Node %d ShadowCallStack:%8lu kB\n"
#endif
                 "Node %d PageTables:     %8lu kB\n"
                 "Node %d NFS_Unstable:   %8lu kB\n"
                 "Node %d Bounce:         %8lu kB\n"
                 "Node %d WritebackTmp:   %8lu kB\n"
                 "Node %d KReclaimable:   %8lu kB\n"
                 "Node %d Slab:           %8lu kB\n"
                 "Node %d SReclaimable:   %8lu kB\n"
                 "Node %d SUnreclaim:     %8lu kB\n"
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
                 "Node %d AnonHugePages:  %8lu kB\n"
                 "Node %d ShmemHugePages: %8lu kB\n"
                 "Node %d ShmemPmdMapped: %8lu kB\n"
                 "Node %d FileHugePages:  %8lu kB\n"
                 "Node %d FilePmdMapped:  %8lu kB\n"
#endif
                 ,
                 nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
                 nid, K(node_page_state(pgdat, NR_WRITEBACK)),
                 nid, K(node_page_state(pgdat, NR_FILE_PAGES)),
                 nid, K(node_page_state(pgdat, NR_FILE_MAPPED)),
                 nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
                 nid, K(i.sharedram),
                 nid, node_page_state(pgdat, NR_KERNEL_STACK_KB),
#ifdef CONFIG_SHADOW_CALL_STACK
                 nid, node_page_state(pgdat, NR_KERNEL_SCS_KB),
#endif
                 nid, K(node_page_state(pgdat, NR_PAGETABLE)),
                 nid, 0UL,
                 nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
                 nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
                 nid, K(sreclaimable +
                    node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)),
                 nid, K(sreclaimable + sunreclaimable),
                 nid, K(sreclaimable),
                 nid, K(sunreclaimable)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
                 ,
                 nid, K(node_page_state(pgdat, NR_ANON_THPS)),
                 nid, K(node_page_state(pgdat, NR_SHMEM_THPS)),
                 nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
                 nid, K(node_page_state(pgdat, NR_FILE_THPS)),
                 nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED))
#endif
                );
    len += hugetlb_report_node_meminfo(buf, len, nid);
    return len;
}

#undef K
static DEVICE_ATTR(meminfo, 0444, node_read_meminfo, NULL);
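
/*
 * Example output (illustrative values only):
 *
 *   $ head -3 /sys/devices/system/node/node0/meminfo
 *   Node 0 MemTotal:       16308204 kB
 *   Node 0 MemFree:         1318204 kB
 *   Node 0 MemUsed:        14990000 kB
 */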

static ssize_t node_read_numastat(struct device *dev,
                  struct device_attribute *attr, char *buf)
{
    fold_vm_numa_events();
    return sysfs_emit(buf,
              "numa_hit %lu\n"
              "numa_miss %lu\n"
              "numa_foreign %lu\n"
              "interleave_hit %lu\n"
              "local_node %lu\n"
              "other_node %lu\n",
              sum_zone_numa_event_state(dev->id, NUMA_HIT),
              sum_zone_numa_event_state(dev->id, NUMA_MISS),
              sum_zone_numa_event_state(dev->id, NUMA_FOREIGN),
              sum_zone_numa_event_state(dev->id, NUMA_INTERLEAVE_HIT),
              sum_zone_numa_event_state(dev->id, NUMA_LOCAL),
              sum_zone_numa_event_state(dev->id, NUMA_OTHER));
}
static DEVICE_ATTR(numastat, 0444, node_read_numastat, NULL);

static ssize_t node_read_vmstat(struct device *dev,
                struct device_attribute *attr, char *buf)
{
    int nid = dev->id;
    struct pglist_data *pgdat = NODE_DATA(nid);
    int i;
    int len = 0;

    for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
        len += sysfs_emit_at(buf, len, "%s %lu\n",
                     zone_stat_name(i),
                     sum_zone_node_page_state(nid, i));

#ifdef CONFIG_NUMA
    fold_vm_numa_events();
    for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
        len += sysfs_emit_at(buf, len, "%s %lu\n",
                     numa_stat_name(i),
                     sum_zone_numa_event_state(nid, i));

#endif
    for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
        unsigned long pages = node_page_state_pages(pgdat, i);

        if (vmstat_item_print_in_thp(i))
            pages /= HPAGE_PMD_NR;
        len += sysfs_emit_at(buf, len, "%s %lu\n", node_stat_name(i),
                     pages);
    }

    return len;
}
static DEVICE_ATTR(vmstat, 0444, node_read_vmstat, NULL);

static ssize_t node_read_distance(struct device *dev,
                  struct device_attribute *attr, char *buf)
{
    int nid = dev->id;
    int len = 0;
    int i;

    /*
     * buf is currently PAGE_SIZE in length and each node needs 4 chars
     * at most (distance + space or newline).
     */
    BUILD_BUG_ON(MAX_NUMNODES * 4 > PAGE_SIZE);

    for_each_online_node(i) {
        len += sysfs_emit_at(buf, len, "%s%d",
                     i ? " " : "", node_distance(nid, i));
    }

    len += sysfs_emit_at(buf, len, "\n");
    return len;
}
static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
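
/*
 * Example (illustrative): on a hypothetical two-node machine whose
 * firmware reports a local distance of 10 and a remote distance of 21,
 * reading node 0's distance file yields one value per online node:
 *
 *   $ cat /sys/devices/system/node/node0/distance
 *   10 21
 */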

static struct attribute *node_dev_attrs[] = {
    &dev_attr_meminfo.attr,
    &dev_attr_numastat.attr,
    &dev_attr_distance.attr,
    &dev_attr_vmstat.attr,
    NULL
};

static struct bin_attribute *node_dev_bin_attrs[] = {
    &bin_attr_cpumap,
    &bin_attr_cpulist,
    NULL
};

static const struct attribute_group node_dev_group = {
    .attrs = node_dev_attrs,
    .bin_attrs = node_dev_bin_attrs
};

static const struct attribute_group *node_dev_groups[] = {
    &node_dev_group,
#ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
    &arch_node_dev_group,
#endif
    NULL
};

#ifdef CONFIG_HUGETLBFS
/*
 * hugetlbfs per node attributes registration interface:
 * When/if hugetlb[fs] subsystem initializes [sometime after this module],
 * it will register its per node attributes for all online nodes with
 * memory.  It will also call register_hugetlbfs_with_node(), below, to
 * register its attribute registration functions with this node driver.
 * Once these hooks have been initialized, the node driver will call into
 * the hugetlb module to [un]register attributes for hot-plugged nodes.
 */
static node_registration_func_t __hugetlb_register_node;
static node_registration_func_t __hugetlb_unregister_node;

static inline bool hugetlb_register_node(struct node *node)
{
    if (__hugetlb_register_node &&
            node_state(node->dev.id, N_MEMORY)) {
        __hugetlb_register_node(node);
        return true;
    }
    return false;
}

static inline void hugetlb_unregister_node(struct node *node)
{
    if (__hugetlb_unregister_node)
        __hugetlb_unregister_node(node);
}

void register_hugetlbfs_with_node(node_registration_func_t doregister,
                  node_registration_func_t unregister)
{
    __hugetlb_register_node   = doregister;
    __hugetlb_unregister_node = unregister;
}
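
/*
 * Illustrative example (hypothetical function names): the hugetlb side
 * hands its per-node attribute handlers to this driver once during its
 * own init, e.g.:
 *
 *   register_hugetlbfs_with_node(my_hugetlb_register_node,
 *                                my_hugetlb_unregister_node);
 *
 * after which node hotplug events call back into hugetlb through the
 * __hugetlb_{register,unregister}_node hooks above.
 */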
#else
static inline void hugetlb_register_node(struct node *node) {}

static inline void hugetlb_unregister_node(struct node *node) {}
#endif

static void node_device_release(struct device *dev)
{
    struct node *node = to_node(dev);

#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS)
    /*
     * We schedule the work only when a memory section is
     * onlined/offlined on this node. When we get here, all the
     * memory on this node has been offlined, so no new work will
     * be enqueued.
     *
     * The work uses node->node_work, so we must flush it before
     * freeing the node.
     */
    flush_work(&node->node_work);
#endif
    kfree(node);
}

/*
 * register_node - Setup a sysfs device for a node.
 * @num - Node number to use when creating the device.
 *
 * Initialize and register the node device.
 */
static int register_node(struct node *node, int num)
{
    int error;

    node->dev.id = num;
    node->dev.bus = &node_subsys;
    node->dev.release = node_device_release;
    node->dev.groups = node_dev_groups;
    error = device_register(&node->dev);

    if (error)
        put_device(&node->dev);
    else {
        hugetlb_register_node(node);

        compaction_register_node(node);
    }
    return error;
}

/**
 * unregister_node - unregister a node device
 * @node: node going away
 *
 * Unregisters a node device @node.  All the devices on the node must be
 * unregistered before calling this function.
 */
void unregister_node(struct node *node)
{
    compaction_unregister_node(node);
    hugetlb_unregister_node(node);      /* no-op, if memoryless node */
    node_remove_accesses(node);
    node_remove_caches(node);
    device_unregister(&node->dev);
}

struct node *node_devices[MAX_NUMNODES];

/*
 * register cpu under node
 */
int register_cpu_under_node(unsigned int cpu, unsigned int nid)
{
    int ret;
    struct device *obj;

    if (!node_online(nid))
        return 0;

    obj = get_cpu_device(cpu);
    if (!obj)
        return 0;

    ret = sysfs_create_link(&node_devices[nid]->dev.kobj,
                &obj->kobj,
                kobject_name(&obj->kobj));
    if (ret)
        return ret;

    return sysfs_create_link(&obj->kobj,
                 &node_devices[nid]->dev.kobj,
                 kobject_name(&node_devices[nid]->dev.kobj));
}
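
/*
 * The two sysfs_create_link() calls above produce a bidirectional pair
 * of symlinks, e.g. for CPU 3 sitting on node 1:
 *
 *   /sys/devices/system/node/node1/cpu3 -> the cpu3 device directory
 *   /sys/devices/system/cpu/cpu3/node1  -> the node1 device directory
 */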

/**
 * register_memory_node_under_compute_node - link memory node to its compute
 *                       node for a given access class.
 * @mem_nid:    Memory node number
 * @cpu_nid:    CPU node number
 * @access: Access class to register
 *
 * Description:
 *  For use with platforms that may have separate memory and compute nodes.
 *  This function will export node relationships linking which memory
 *  initiator nodes can access memory targets at a given ranked access
 *  class.
 */
int register_memory_node_under_compute_node(unsigned int mem_nid,
                        unsigned int cpu_nid,
                        unsigned int access)
{
    struct node *init_node, *targ_node;
    struct node_access_nodes *initiator, *target;
    int ret;

    if (!node_online(cpu_nid) || !node_online(mem_nid))
        return -ENODEV;

    init_node = node_devices[cpu_nid];
    targ_node = node_devices[mem_nid];
    initiator = node_init_node_access(init_node, access);
    target = node_init_node_access(targ_node, access);
    if (!initiator || !target)
        return -ENOMEM;

    ret = sysfs_add_link_to_group(&initiator->dev.kobj, "targets",
                      &targ_node->dev.kobj,
                      dev_name(&targ_node->dev));
    if (ret)
        return ret;

    ret = sysfs_add_link_to_group(&target->dev.kobj, "initiators",
                      &init_node->dev.kobj,
                      dev_name(&init_node->dev));
    if (ret)
        goto err;

    return 0;
 err:
    sysfs_remove_link_from_group(&initiator->dev.kobj, "targets",
                     dev_name(&targ_node->dev));
    return ret;
}
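
/*
 * Illustrative result for mem_nid == 1, cpu_nid == 0, access == 0:
 *
 *   /sys/devices/system/node/node0/access0/targets/node1
 *   /sys/devices/system/node/node1/access0/initiators/node0
 *
 * i.e. each side of the relationship is visible from the other at the
 * given access class.
 */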

int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
{
    struct device *obj;

    if (!node_online(nid))
        return 0;

    obj = get_cpu_device(cpu);
    if (!obj)
        return 0;

    sysfs_remove_link(&node_devices[nid]->dev.kobj,
              kobject_name(&obj->kobj));
    sysfs_remove_link(&obj->kobj,
              kobject_name(&node_devices[nid]->dev.kobj));

    return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
static int __ref get_nid_for_pfn(unsigned long pfn)
{
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    if (system_state < SYSTEM_RUNNING)
        return early_pfn_to_nid(pfn);
#endif
    return pfn_to_nid(pfn);
}

static void do_register_memory_block_under_node(int nid,
                        struct memory_block *mem_blk,
                        enum meminit_context context)
{
    int ret;

    memory_block_add_nid(mem_blk, nid, context);

    ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
                       &mem_blk->dev.kobj,
                       kobject_name(&mem_blk->dev.kobj));
    if (ret && ret != -EEXIST)
        dev_err_ratelimited(&node_devices[nid]->dev,
                    "can't create link to %s in sysfs (%d)\n",
                    kobject_name(&mem_blk->dev.kobj), ret);

    ret = sysfs_create_link_nowarn(&mem_blk->dev.kobj,
                &node_devices[nid]->dev.kobj,
                kobject_name(&node_devices[nid]->dev.kobj));
    if (ret && ret != -EEXIST)
        dev_err_ratelimited(&mem_blk->dev,
                    "can't create link to %s in sysfs (%d)\n",
                    kobject_name(&node_devices[nid]->dev.kobj),
                    ret);
}

/* register memory section under specified node if it spans that node */
static int register_mem_block_under_node_early(struct memory_block *mem_blk,
                           void *arg)
{
    unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE;
    unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
    unsigned long end_pfn = start_pfn + memory_block_pfns - 1;
    int nid = *(int *)arg;
    unsigned long pfn;

    for (pfn = start_pfn; pfn <= end_pfn; pfn++) {
        int page_nid;

        /*
         * The memory block could have several absent sections from
         * the start. Skip the pfn range belonging to an absent
         * section.
         */
        if (!pfn_in_present_section(pfn)) {
            pfn = round_down(pfn + PAGES_PER_SECTION,
                     PAGES_PER_SECTION) - 1;
            continue;
        }

        /*
         * We need to check whether the page belongs to @nid only in
         * the boot case, because nodes' ranges can be interleaved.
         */
        page_nid = get_nid_for_pfn(pfn);
        if (page_nid < 0)
            continue;
        if (page_nid != nid)
            continue;

        do_register_memory_block_under_node(nid, mem_blk, MEMINIT_EARLY);
        return 0;
    }
    /* mem section does not span the specified node */
    return 0;
}

/*
 * During hotplug we know that all pages in the memory block belong to the same
 * node.
 */
static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk,
                         void *arg)
{
    int nid = *(int *)arg;

    do_register_memory_block_under_node(nid, mem_blk, MEMINIT_HOTPLUG);
    return 0;
}

/*
 * Unregister a memory block device under the node it spans. Memory blocks
 * spanning multiple nodes cannot be offlined and can therefore never be
 * removed.
 */
void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
{
    if (mem_blk->nid == NUMA_NO_NODE)
        return;

    sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj,
              kobject_name(&mem_blk->dev.kobj));
    sysfs_remove_link(&mem_blk->dev.kobj,
              kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
}

void register_memory_blocks_under_node(int nid, unsigned long start_pfn,
                       unsigned long end_pfn,
                       enum meminit_context context)
{
    walk_memory_blocks_func_t func;

    if (context == MEMINIT_HOTPLUG)
        func = register_mem_block_under_node_hotplug;
    else
        func = register_mem_block_under_node_early;

    walk_memory_blocks(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn),
               (void *)&nid, func);
}

#ifdef CONFIG_HUGETLBFS
/*
 * Handle per node hstate attribute [un]registration on transitions
 * to/from memoryless state.
 */
static void node_hugetlb_work(struct work_struct *work)
{
    struct node *node = container_of(work, struct node, node_work);

    /*
     * We only get here when a node transitions to/from memoryless state.
     * We can detect which transition occurred by examining whether the
     * node has memory now.  hugetlb_register_node() already checks this
     * so we try to register the attributes.  If that fails, then the
     * node has transitioned to memoryless; try to unregister the
     * attributes.
     */
    if (!hugetlb_register_node(node))
        hugetlb_unregister_node(node);
}

static void init_node_hugetlb_work(int nid)
{
    INIT_WORK(&node_devices[nid]->node_work, node_hugetlb_work);
}

static int node_memory_callback(struct notifier_block *self,
                unsigned long action, void *arg)
{
    struct memory_notify *mnb = arg;
    int nid = mnb->status_change_nid;

    switch (action) {
    case MEM_ONLINE:
    case MEM_OFFLINE:
        /*
         * offload per node hstate [un]registration to a work thread
         * when transitioning to/from memoryless state.
         */
        if (nid != NUMA_NO_NODE)
            schedule_work(&node_devices[nid]->node_work);
        break;

    case MEM_GOING_ONLINE:
    case MEM_GOING_OFFLINE:
    case MEM_CANCEL_ONLINE:
    case MEM_CANCEL_OFFLINE:
    default:
        break;
    }

    return NOTIFY_OK;
}
#endif  /* CONFIG_HUGETLBFS */
#endif /* CONFIG_MEMORY_HOTPLUG */

#if !defined(CONFIG_MEMORY_HOTPLUG) || !defined(CONFIG_HUGETLBFS)
static inline int node_memory_callback(struct notifier_block *self,
                unsigned long action, void *arg)
{
    return NOTIFY_OK;
}

static void init_node_hugetlb_work(int nid) { }

#endif

int __register_one_node(int nid)
{
    int error;
    int cpu;

    node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL);
    if (!node_devices[nid])
        return -ENOMEM;

    error = register_node(node_devices[nid], nid);

    /* link cpu under this node */
    for_each_present_cpu(cpu) {
        if (cpu_to_node(cpu) == nid)
            register_cpu_under_node(cpu, nid);
    }

    INIT_LIST_HEAD(&node_devices[nid]->access_list);
    /* initialize work queue for memory hot plug */
    init_node_hugetlb_work(nid);
    node_init_caches(nid);

    return error;
}

void unregister_one_node(int nid)
{
    if (!node_devices[nid])
        return;

    unregister_node(node_devices[nid]);
    node_devices[nid] = NULL;
}

/*
 * node states attributes
 */

struct node_attr {
    struct device_attribute attr;
    enum node_states state;
};

static ssize_t show_node_state(struct device *dev,
                   struct device_attribute *attr, char *buf)
{
    struct node_attr *na = container_of(attr, struct node_attr, attr);

    return sysfs_emit(buf, "%*pbl\n",
              nodemask_pr_args(&node_states[na->state]));
}
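
/*
 * Example (illustrative): these attributes live at the node subsystem
 * root and each prints a nodemask, e.g. on a hypothetical two-node
 * machine where only node 0 has CPUs:
 *
 *   $ cat /sys/devices/system/node/online
 *   0-1
 *   $ cat /sys/devices/system/node/has_cpu
 *   0
 */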

#define _NODE_ATTR(name, state) \
    { __ATTR(name, 0444, show_node_state, NULL), state }

static struct node_attr node_state_attr[] = {
    [N_POSSIBLE] = _NODE_ATTR(possible, N_POSSIBLE),
    [N_ONLINE] = _NODE_ATTR(online, N_ONLINE),
    [N_NORMAL_MEMORY] = _NODE_ATTR(has_normal_memory, N_NORMAL_MEMORY),
#ifdef CONFIG_HIGHMEM
    [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
#endif
    [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
    [N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
    [N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator,
                       N_GENERIC_INITIATOR),
};

static struct attribute *node_state_attrs[] = {
    &node_state_attr[N_POSSIBLE].attr.attr,
    &node_state_attr[N_ONLINE].attr.attr,
    &node_state_attr[N_NORMAL_MEMORY].attr.attr,
#ifdef CONFIG_HIGHMEM
    &node_state_attr[N_HIGH_MEMORY].attr.attr,
#endif
    &node_state_attr[N_MEMORY].attr.attr,
    &node_state_attr[N_CPU].attr.attr,
    &node_state_attr[N_GENERIC_INITIATOR].attr.attr,
    NULL
};

static const struct attribute_group memory_root_attr_group = {
    .attrs = node_state_attrs,
};

static const struct attribute_group *cpu_root_attr_groups[] = {
    &memory_root_attr_group,
    NULL,
};

#define NODE_CALLBACK_PRI   2   /* lower than SLAB */
void __init node_dev_init(void)
{
    static struct notifier_block node_memory_callback_nb = {
        .notifier_call = node_memory_callback,
        .priority = NODE_CALLBACK_PRI,
    };
    int ret, i;

    BUILD_BUG_ON(ARRAY_SIZE(node_state_attr) != NR_NODE_STATES);
    BUILD_BUG_ON(ARRAY_SIZE(node_state_attrs) - 1 != NR_NODE_STATES);

    ret = subsys_system_register(&node_subsys, cpu_root_attr_groups);
    if (ret)
        panic("%s() failed to register subsystem: %d\n", __func__, ret);

    register_hotmemory_notifier(&node_memory_callback_nb);

    /*
     * Create all node devices, which will properly link the node
     * to applicable memory block devices and already created cpu devices.
     */
    for_each_online_node(i) {
        ret = register_one_node(i);
        if (ret)
            panic("%s() failed to add node: %d\n", __func__, ret);
    }
}