Skip to content

Commit c6018b4

Browse files
kvaneesh authored and torvalds committed
mm/mempolicy: add set_mempolicy_home_node syscall
This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policy. Users should use this syscall after setting up a memory policy for the specified range as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which kernel will fulfill memory allocation requests first. For address range with MPOL_BIND memory policy, if nodemask specifies more than one node, page allocations will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. For MPOL_PREFERRED_MANY if the nodemask specifies more than one node, page allocation will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. If there is not enough memory in all the nodes specified in the nodemask, the allocation will be attempted from the closest numa node to the home node in the system. This helps applications to hint at a memory allocation preference node and fallback to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. This helps applications to have control on memory allocation numa nodes and avoids default fallback to slow memory NUMA nodes. 
For example, on a system with NUMA nodes 1, 2 and 3 with DRAM memory and nodes 10, 11 and 12 with slow memory: new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel will only allocate from nodes 1, 2, and 3. Memory will not be allocated from slow memory nodes 10, 11, and 12. This differs from default MPOL_BIND behavior in that with default MPOL_BIND the allocation will be attempted from the node closer to the local node. One of the reasons to specify a home node is to allow allocations from a CPU-less NUMA node and its nearby NUMA nodes. MPOL_PREFERRED_MANY, on the other hand, will first try to allocate from the closest node to node 2 from the node list 1, 2 and 3. If those nodes don't have enough memory, the kernel will allocate from slow memory nodes 10, 11 and 12, whichever is closer to node 2. Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Aneesh Kumar K.V <[email protected]> Cc: Ben Widawsky <[email protected]> Cc: Dave Hansen <[email protected]> Cc: Feng Tang <[email protected]> Cc: Michal Hocko <[email protected]> Cc: Andrea Arcangeli <[email protected]> Cc: Mel Gorman <[email protected]> Cc: Mike Kravetz <[email protected]> Cc: Randy Dunlap <[email protected]> Cc: Vlastimil Babka <[email protected]> Cc: Andi Kleen <[email protected]> Cc: Dan Williams <[email protected]> Cc: Huang Ying <[email protected]> Cc: <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent c045511 commit c6018b4

3 files changed

Lines changed: 95 additions & 1 deletion

File tree

Documentation/admin-guide/mm/numa_memory_policy.rst

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ follows:
408408
Memory Policy APIs
409409
==================
410410

411-
Linux supports 3 system calls for controlling memory policy. These APIS
411+
Linux supports 4 system calls for controlling memory policy. These APIs
412412
always affect only the calling task, the calling task's address space, or
413413
some shared object mapped into the calling task's address space.
414414

@@ -460,6 +460,20 @@ requested via the 'flags' argument.
460460

461461
See the mbind(2) man page for more details.
462462

463+
Set home node for a Range of Task's Address Space::
464+
465+
long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
466+
unsigned long home_node,
467+
unsigned long flags);
468+
469+
sys_set_mempolicy_home_node sets the home node for a VMA policy present in the
470+
task's address range. The system call updates the home node only for the existing
471+
mempolicy range. Other address ranges are ignored. A home node is the NUMA node
472+
closest to which page allocation will come from. Specifying the home node overrides
473+
the default allocation policy to allocate memory close to the local node for an
474+
executing CPU.
475+
476+
463477
Memory Policy Command Line Interface
464478
====================================
465479

include/linux/mempolicy.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ struct mempolicy {
4646
unsigned short mode; /* See MPOL_* above */
4747
unsigned short flags; /* See set_mempolicy() MPOL_F_* above */
4848
nodemask_t nodes; /* interleave/bind/perfer */
49+
int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
4950

5051
union {
5152
nodemask_t cpuset_mems_allowed; /* relative to these nodes */

mm/mempolicy.c

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
296296
atomic_set(&policy->refcnt, 1);
297297
policy->mode = mode;
298298
policy->flags = flags;
299+
policy->home_node = NUMA_NO_NODE;
299300

300301
return policy;
301302
}
@@ -1478,6 +1479,77 @@ static long kernel_mbind(unsigned long start, unsigned long len,
14781479
return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
14791480
}
14801481

1482+
/*
 * set_mempolicy_home_node - set the home node of existing VMA policies
 * @start:     page-aligned start address of the range
 * @len:       length of the range in bytes (rounded up to a page multiple)
 * @home_node: NUMA node to record as the home node; must be online
 * @flags:     reserved for future extension, must be 0
 *
 * Walks every VMA intersecting [start, start + len) and, for each VMA that
 * already carries a mempolicy, installs a copy of that policy with
 * ->home_node set to @home_node.  VMAs without a policy are skipped.
 *
 * Returns 0 on success or when the range is empty, and a negative errno
 * otherwise:
 *   -EINVAL      @start is not page aligned, @flags is non-zero, @home_node
 *                is out of range or offline, or the range wraps around
 *   -ENOENT      no VMA in the range had a policy to update
 *   -EOPNOTSUPP  a VMA policy in the range is neither MPOL_BIND nor
 *                MPOL_PREFERRED_MANY
 *   other        error propagated from mpol_dup() or mbind_range()
 *
 * On error, VMAs that were already updated keep their new home node.
 */
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct mempolicy *new, *old;
	unsigned long vmstart;
	unsigned long vmend;
	unsigned long end;
	int err = -ENOENT;

	start = untagged_addr(start);
	if (start & ~PAGE_MASK)
		return -EINVAL;
	/*
	 * flags is used for future extension if any.
	 */
	if (flags != 0)
		return -EINVAL;

	/*
	 * Check home_node is online to avoid accessing uninitialized
	 * NODE_DATA.
	 */
	if (home_node >= MAX_NUMNODES || !node_online(home_node))
		return -EINVAL;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	mmap_write_lock(mm);
	vma = find_vma(mm, start);
	for (; vma && vma->vm_start < end; vma = vma->vm_next) {

		vmstart = max(start, vma->vm_start);
		vmend = min(end, vma->vm_end);

		/*
		 * Only update home node if there is an existing vma policy
		 */
		old = vma_policy(vma);
		if (!old)
			continue;

		/*
		 * If any vma in the range got policy other than MPOL_BIND
		 * or MPOL_PREFERRED_MANY we return error. We don't reset
		 * the home node for vmas we already updated before.
		 *
		 * The mode is validated on the existing policy, before it
		 * is duplicated, so that bailing out here cannot leak a
		 * freshly allocated copy.
		 */
		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
			err = -EOPNOTSUPP;
			break;
		}

		new = mpol_dup(old);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		new->home_node = home_node;
		err = mbind_range(mm, vmstart, vmend, new);
		/* Drop our reference on the copy; done on success and failure alike. */
		mpol_put(new);
		if (err)
			break;
	}
	mmap_write_unlock(mm);
	return err;
}
1552+
14811553
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
14821554
unsigned long, mode, const unsigned long __user *, nmask,
14831555
unsigned long, maxnode, unsigned int, flags)
@@ -1802,6 +1874,11 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
18021874
WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
18031875
}
18041876

1877+
if ((policy->mode == MPOL_BIND ||
1878+
policy->mode == MPOL_PREFERRED_MANY) &&
1879+
policy->home_node != NUMA_NO_NODE)
1880+
return policy->home_node;
1881+
18051882
return nd;
18061883
}
18071884

@@ -2344,6 +2421,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
23442421
return false;
23452422
if (a->flags != b->flags)
23462423
return false;
2424+
if (a->home_node != b->home_node)
2425+
return false;
23472426
if (mpol_store_user_nodemask(a))
23482427
if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
23492428
return false;

0 commit comments

Comments
 (0)