Skip to content

Commit b24f53a

Browse files
Lee Schermerhorn and Mel Gorman
authored and committed
mm: mempolicy: Add MPOL_MF_LAZY
NOTE: Once again there is a lot of patch stealing and the end result is sufficiently different that I had to drop the signed-offs. Will re-add if the original authors are ok with that. This patch adds another mbind() flag to request "lazy migration". The flag, MPOL_MF_LAZY, modifies MPOL_MF_MOVE* such that the selected pages are marked PROT_NONE. The pages will be migrated in the fault path on "first touch", if the policy dictates at that time. "Lazy Migration" will allow testing of migrate-on-fault via mbind(). Also allows applications to specify that only subsequently touched pages be migrated to obey new policy, instead of all pages in range. This can be useful for multi-threaded applications working on a large shared data area that is initialized by an initial thread resulting in all pages on one [or a few, if overflowed] nodes. After PROT_NONE, the pages in regions assigned to the worker threads will be automatically migrated local to the threads on 1st touch. Signed-off-by: Mel Gorman <mgorman@suse.de> Reviewed-by: Rik van Riel <riel@redhat.com>
1 parent 4daae3b commit b24f53a

3 files changed

Lines changed: 185 additions & 18 deletions

File tree

include/linux/mm.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1551,6 +1551,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
15511551
}
15521552
#endif
15531553

1554+
#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
1555+
void change_prot_numa(struct vm_area_struct *vma,
1556+
unsigned long start, unsigned long end);
1557+
#endif
1558+
15541559
struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
15551560
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
15561561
unsigned long pfn, unsigned long size, pgprot_t);

include/uapi/linux/mempolicy.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,16 @@ enum mpol_rebind_step {
4949

5050
/* Flags for mbind */
5151
#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
52-
#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */
53-
#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */
54-
#define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */
52+
#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
53+
to policy */
54+
#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */
55+
#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */
56+
#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */
57+
58+
#define MPOL_MF_VALID (MPOL_MF_STRICT | \
59+
MPOL_MF_MOVE | \
60+
MPOL_MF_MOVE_ALL | \
61+
MPOL_MF_LAZY)
5562

5663
/*
5764
* Internal flags that share the struct mempolicy flags word with

mm/mempolicy.c

Lines changed: 170 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@
9090
#include <linux/syscalls.h>
9191
#include <linux/ctype.h>
9292
#include <linux/mm_inline.h>
93+
#include <linux/mmu_notifier.h>
9394

9495
#include <asm/tlbflush.h>
9596
#include <asm/uaccess.h>
@@ -565,6 +566,145 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
565566
return 0;
566567
}
567568

569+
#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
570+
/*
571+
* Here we search for not shared page mappings (mapcount == 1) and we
572+
* set up the pmd/pte_numa on those mappings so the very next access
573+
* will fire a NUMA hinting page fault.
574+
*/
575+
static int
576+
change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma,
577+
unsigned long address)
578+
{
579+
pgd_t *pgd;
580+
pud_t *pud;
581+
pmd_t *pmd;
582+
pte_t *pte, *_pte;
583+
struct page *page;
584+
unsigned long _address, end;
585+
spinlock_t *ptl;
586+
int ret = 0;
587+
588+
VM_BUG_ON(address & ~PAGE_MASK);
589+
590+
pgd = pgd_offset(mm, address);
591+
if (!pgd_present(*pgd))
592+
goto out;
593+
594+
pud = pud_offset(pgd, address);
595+
if (!pud_present(*pud))
596+
goto out;
597+
598+
pmd = pmd_offset(pud, address);
599+
if (pmd_none(*pmd))
600+
goto out;
601+
602+
if (pmd_trans_huge_lock(pmd, vma) == 1) {
603+
int page_nid;
604+
ret = HPAGE_PMD_NR;
605+
606+
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
607+
608+
if (pmd_numa(*pmd)) {
609+
spin_unlock(&mm->page_table_lock);
610+
goto out;
611+
}
612+
613+
page = pmd_page(*pmd);
614+
615+
/* only check non-shared pages */
616+
if (page_mapcount(page) != 1) {
617+
spin_unlock(&mm->page_table_lock);
618+
goto out;
619+
}
620+
621+
page_nid = page_to_nid(page);
622+
623+
if (pmd_numa(*pmd)) {
624+
spin_unlock(&mm->page_table_lock);
625+
goto out;
626+
}
627+
628+
set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
629+
ret += HPAGE_PMD_NR;
630+
/* defer TLB flush to lower the overhead */
631+
spin_unlock(&mm->page_table_lock);
632+
goto out;
633+
}
634+
635+
if (pmd_trans_unstable(pmd))
636+
goto out;
637+
VM_BUG_ON(!pmd_present(*pmd));
638+
639+
end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK);
640+
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
641+
for (_address = address, _pte = pte; _address < end;
642+
_pte++, _address += PAGE_SIZE) {
643+
pte_t pteval = *_pte;
644+
if (!pte_present(pteval))
645+
continue;
646+
if (pte_numa(pteval))
647+
continue;
648+
page = vm_normal_page(vma, _address, pteval);
649+
if (unlikely(!page))
650+
continue;
651+
/* only check non-shared pages */
652+
if (page_mapcount(page) != 1)
653+
continue;
654+
655+
set_pte_at(mm, _address, _pte, pte_mknuma(pteval));
656+
657+
/* defer TLB flush to lower the overhead */
658+
ret++;
659+
}
660+
pte_unmap_unlock(pte, ptl);
661+
662+
if (ret && !pmd_numa(*pmd)) {
663+
spin_lock(&mm->page_table_lock);
664+
set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
665+
spin_unlock(&mm->page_table_lock);
666+
/* defer TLB flush to lower the overhead */
667+
}
668+
669+
out:
670+
return ret;
671+
}
672+
673+
/* Assumes mmap_sem is held */
674+
void
675+
change_prot_numa(struct vm_area_struct *vma,
676+
unsigned long address, unsigned long end)
677+
{
678+
struct mm_struct *mm = vma->vm_mm;
679+
int progress = 0;
680+
681+
while (address < end) {
682+
VM_BUG_ON(address < vma->vm_start ||
683+
address + PAGE_SIZE > vma->vm_end);
684+
685+
progress += change_prot_numa_range(mm, vma, address);
686+
address = (address + PMD_SIZE) & PMD_MASK;
687+
}
688+
689+
/*
690+
* Flush the TLB for the mm to start the NUMA hinting
691+
* page faults after we finish scanning this vma part
692+
* if there were any PTE updates
693+
*/
694+
if (progress) {
695+
mmu_notifier_invalidate_range_start(vma->vm_mm, address, end);
696+
flush_tlb_range(vma, address, end);
697+
mmu_notifier_invalidate_range_end(vma->vm_mm, address, end);
698+
}
699+
}
700+
#else
701+
/*
 * !CONFIG_ARCH_USES_NUMA_PROT_NONE: lazy NUMA hinting is unavailable,
 * so marking a range is a no-op.  Return type is void to match the
 * real implementation above and the declaration in <linux/mm.h>
 * (the previous 'static unsigned long' stub disagreed with both, and
 * no caller consumes a return value).
 */
static void change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
}
706+
#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
707+
568708
/*
569709
* Check if all pages in a range are on a set of nodes.
570710
* If pagelist != NULL then isolate pages from the LRU and
@@ -583,29 +723,40 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
583723
return ERR_PTR(-EFAULT);
584724
prev = NULL;
585725
for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
726+
unsigned long endvma = vma->vm_end;
727+
728+
if (endvma > end)
729+
endvma = end;
730+
if (vma->vm_start > start)
731+
start = vma->vm_start;
732+
586733
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
587734
if (!vma->vm_next && vma->vm_end < end)
588735
return ERR_PTR(-EFAULT);
589736
if (prev && prev->vm_end < vma->vm_start)
590737
return ERR_PTR(-EFAULT);
591738
}
592-
if (!is_vm_hugetlb_page(vma) &&
593-
((flags & MPOL_MF_STRICT) ||
739+
740+
if (is_vm_hugetlb_page(vma))
741+
goto next;
742+
743+
if (flags & MPOL_MF_LAZY) {
744+
change_prot_numa(vma, start, endvma);
745+
goto next;
746+
}
747+
748+
if ((flags & MPOL_MF_STRICT) ||
594749
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
595-
vma_migratable(vma)))) {
596-
unsigned long endvma = vma->vm_end;
750+
vma_migratable(vma))) {
597751

598-
if (endvma > end)
599-
endvma = end;
600-
if (vma->vm_start > start)
601-
start = vma->vm_start;
602752
err = check_pgd_range(vma, start, endvma, nodes,
603753
flags, private);
604754
if (err) {
605755
first = ERR_PTR(err);
606756
break;
607757
}
608758
}
759+
next:
609760
prev = vma;
610761
}
611762
return first;
@@ -1138,8 +1289,7 @@ static long do_mbind(unsigned long start, unsigned long len,
11381289
int err;
11391290
LIST_HEAD(pagelist);
11401291

1141-
if (flags & ~(unsigned long)(MPOL_MF_STRICT |
1142-
MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1292+
if (flags & ~(unsigned long)MPOL_MF_VALID)
11431293
return -EINVAL;
11441294
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
11451295
return -EPERM;
@@ -1162,6 +1312,9 @@ static long do_mbind(unsigned long start, unsigned long len,
11621312
if (IS_ERR(new))
11631313
return PTR_ERR(new);
11641314

1315+
if (flags & MPOL_MF_LAZY)
1316+
new->flags |= MPOL_F_MOF;
1317+
11651318
/*
11661319
* If we are using the default policy then operation
11671320
* on discontinuous address spaces is okay after all
@@ -1198,13 +1351,15 @@ static long do_mbind(unsigned long start, unsigned long len,
11981351
vma = check_range(mm, start, end, nmask,
11991352
flags | MPOL_MF_INVERT, &pagelist);
12001353

1201-
err = PTR_ERR(vma);
1202-
if (!IS_ERR(vma)) {
1203-
int nr_failed = 0;
1204-
1354+
err = PTR_ERR(vma); /* maybe ... */
1355+
if (!IS_ERR(vma) && mode != MPOL_NOOP)
12051356
err = mbind_range(mm, start, end, new);
12061357

1358+
if (!err) {
1359+
int nr_failed = 0;
1360+
12071361
if (!list_empty(&pagelist)) {
1362+
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
12081363
nr_failed = migrate_pages(&pagelist, new_vma_page,
12091364
(unsigned long)vma,
12101365
false, MIGRATE_SYNC,
@@ -1213,7 +1368,7 @@ static long do_mbind(unsigned long start, unsigned long len,
12131368
putback_lru_pages(&pagelist);
12141369
}
12151370

1216-
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1371+
if (nr_failed && (flags & MPOL_MF_STRICT))
12171372
err = -EIO;
12181373
} else
12191374
putback_lru_pages(&pagelist);

0 commit comments

Comments
 (0)