9090#include <linux/syscalls.h>
9191#include <linux/ctype.h>
9292#include <linux/mm_inline.h>
93+ #include <linux/mmu_notifier.h>
9394
9495#include <asm/tlbflush.h>
9596#include <asm/uaccess.h>
@@ -565,6 +566,145 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
565566 return 0 ;
566567}
567568
569+ #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
570+ /*
571+ * Here we search for not shared page mappings (mapcount == 1) and we
572+ * set up the pmd/pte_numa on those mappings so the very next access
573+ * will fire a NUMA hinting page fault.
574+ */
575+ static int
576+ change_prot_numa_range (struct mm_struct * mm , struct vm_area_struct * vma ,
577+ unsigned long address )
578+ {
579+ pgd_t * pgd ;
580+ pud_t * pud ;
581+ pmd_t * pmd ;
582+ pte_t * pte , * _pte ;
583+ struct page * page ;
584+ unsigned long _address , end ;
585+ spinlock_t * ptl ;
586+ int ret = 0 ;
587+
588+ VM_BUG_ON (address & ~PAGE_MASK );
589+
590+ pgd = pgd_offset (mm , address );
591+ if (!pgd_present (* pgd ))
592+ goto out ;
593+
594+ pud = pud_offset (pgd , address );
595+ if (!pud_present (* pud ))
596+ goto out ;
597+
598+ pmd = pmd_offset (pud , address );
599+ if (pmd_none (* pmd ))
600+ goto out ;
601+
602+ if (pmd_trans_huge_lock (pmd , vma ) == 1 ) {
603+ int page_nid ;
604+ ret = HPAGE_PMD_NR ;
605+
606+ VM_BUG_ON (address & ~HPAGE_PMD_MASK );
607+
608+ if (pmd_numa (* pmd )) {
609+ spin_unlock (& mm -> page_table_lock );
610+ goto out ;
611+ }
612+
613+ page = pmd_page (* pmd );
614+
615+ /* only check non-shared pages */
616+ if (page_mapcount (page ) != 1 ) {
617+ spin_unlock (& mm -> page_table_lock );
618+ goto out ;
619+ }
620+
621+ page_nid = page_to_nid (page );
622+
623+ if (pmd_numa (* pmd )) {
624+ spin_unlock (& mm -> page_table_lock );
625+ goto out ;
626+ }
627+
628+ set_pmd_at (mm , address , pmd , pmd_mknuma (* pmd ));
629+ ret += HPAGE_PMD_NR ;
630+ /* defer TLB flush to lower the overhead */
631+ spin_unlock (& mm -> page_table_lock );
632+ goto out ;
633+ }
634+
635+ if (pmd_trans_unstable (pmd ))
636+ goto out ;
637+ VM_BUG_ON (!pmd_present (* pmd ));
638+
639+ end = min (vma -> vm_end , (address + PMD_SIZE ) & PMD_MASK );
640+ pte = pte_offset_map_lock (mm , pmd , address , & ptl );
641+ for (_address = address , _pte = pte ; _address < end ;
642+ _pte ++ , _address += PAGE_SIZE ) {
643+ pte_t pteval = * _pte ;
644+ if (!pte_present (pteval ))
645+ continue ;
646+ if (pte_numa (pteval ))
647+ continue ;
648+ page = vm_normal_page (vma , _address , pteval );
649+ if (unlikely (!page ))
650+ continue ;
651+ /* only check non-shared pages */
652+ if (page_mapcount (page ) != 1 )
653+ continue ;
654+
655+ set_pte_at (mm , _address , _pte , pte_mknuma (pteval ));
656+
657+ /* defer TLB flush to lower the overhead */
658+ ret ++ ;
659+ }
660+ pte_unmap_unlock (pte , ptl );
661+
662+ if (ret && !pmd_numa (* pmd )) {
663+ spin_lock (& mm -> page_table_lock );
664+ set_pmd_at (mm , address , pmd , pmd_mknuma (* pmd ));
665+ spin_unlock (& mm -> page_table_lock );
666+ /* defer TLB flush to lower the overhead */
667+ }
668+
669+ out :
670+ return ret ;
671+ }
672+
673+ /* Assumes mmap_sem is held */
674+ void
675+ change_prot_numa (struct vm_area_struct * vma ,
676+ unsigned long address , unsigned long end )
677+ {
678+ struct mm_struct * mm = vma -> vm_mm ;
679+ int progress = 0 ;
680+
681+ while (address < end ) {
682+ VM_BUG_ON (address < vma -> vm_start ||
683+ address + PAGE_SIZE > vma -> vm_end );
684+
685+ progress += change_prot_numa_range (mm , vma , address );
686+ address = (address + PMD_SIZE ) & PMD_MASK ;
687+ }
688+
689+ /*
690+ * Flush the TLB for the mm to start the NUMA hinting
691+ * page faults after we finish scanning this vma part
692+ * if there were any PTE updates
693+ */
694+ if (progress ) {
695+ mmu_notifier_invalidate_range_start (vma -> vm_mm , address , end );
696+ flush_tlb_range (vma , address , end );
697+ mmu_notifier_invalidate_range_end (vma -> vm_mm , address , end );
698+ }
699+ }
700+ #else
/*
 * Variant of change_prot_numa() built when CONFIG_ARCH_USES_NUMA_PROT_NONE
 * is not set: it touches no mapping and always returns 0.
 */
static unsigned long
change_prot_numa(struct vm_area_struct *vma, unsigned long addr,
		 unsigned long end)
{
	return 0;
}
706+ #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
707+
568708/*
569709 * Check if all pages in a range are on a set of nodes.
570710 * If pagelist != NULL then isolate pages from the LRU and
@@ -583,29 +723,40 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
583723 return ERR_PTR (- EFAULT );
584724 prev = NULL ;
585725 for (vma = first ; vma && vma -> vm_start < end ; vma = vma -> vm_next ) {
726+ unsigned long endvma = vma -> vm_end ;
727+
728+ if (endvma > end )
729+ endvma = end ;
730+ if (vma -> vm_start > start )
731+ start = vma -> vm_start ;
732+
586733 if (!(flags & MPOL_MF_DISCONTIG_OK )) {
587734 if (!vma -> vm_next && vma -> vm_end < end )
588735 return ERR_PTR (- EFAULT );
589736 if (prev && prev -> vm_end < vma -> vm_start )
590737 return ERR_PTR (- EFAULT );
591738 }
592- if (!is_vm_hugetlb_page (vma ) &&
593- ((flags & MPOL_MF_STRICT ) ||
739+
740+ if (is_vm_hugetlb_page (vma ))
741+ goto next ;
742+
743+ if (flags & MPOL_MF_LAZY ) {
744+ change_prot_numa (vma , start , endvma );
745+ goto next ;
746+ }
747+
748+ if ((flags & MPOL_MF_STRICT ) ||
594749 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL )) &&
595- vma_migratable (vma )))) {
596- unsigned long endvma = vma -> vm_end ;
750+ vma_migratable (vma ))) {
597751
598- if (endvma > end )
599- endvma = end ;
600- if (vma -> vm_start > start )
601- start = vma -> vm_start ;
602752 err = check_pgd_range (vma , start , endvma , nodes ,
603753 flags , private );
604754 if (err ) {
605755 first = ERR_PTR (err );
606756 break ;
607757 }
608758 }
759+ next :
609760 prev = vma ;
610761 }
611762 return first ;
@@ -1138,8 +1289,7 @@ static long do_mbind(unsigned long start, unsigned long len,
11381289 int err ;
11391290 LIST_HEAD (pagelist );
11401291
1141- if (flags & ~(unsigned long )(MPOL_MF_STRICT |
1142- MPOL_MF_MOVE | MPOL_MF_MOVE_ALL ))
1292+ if (flags & ~(unsigned long )MPOL_MF_VALID )
11431293 return - EINVAL ;
11441294 if ((flags & MPOL_MF_MOVE_ALL ) && !capable (CAP_SYS_NICE ))
11451295 return - EPERM ;
@@ -1162,6 +1312,9 @@ static long do_mbind(unsigned long start, unsigned long len,
11621312 if (IS_ERR (new ))
11631313 return PTR_ERR (new );
11641314
1315+ if (flags & MPOL_MF_LAZY )
1316+ new -> flags |= MPOL_F_MOF ;
1317+
11651318 /*
11661319 * If we are using the default policy then operation
11671320 * on discontinuous address spaces is okay after all
@@ -1198,13 +1351,15 @@ static long do_mbind(unsigned long start, unsigned long len,
11981351 vma = check_range (mm , start , end , nmask ,
11991352 flags | MPOL_MF_INVERT , & pagelist );
12001353
1201- err = PTR_ERR (vma );
1202- if (!IS_ERR (vma )) {
1203- int nr_failed = 0 ;
1204-
1354+ err = PTR_ERR (vma ); /* maybe ... */
1355+ if (!IS_ERR (vma ) && mode != MPOL_NOOP )
12051356 err = mbind_range (mm , start , end , new );
12061357
1358+ if (!err ) {
1359+ int nr_failed = 0 ;
1360+
12071361 if (!list_empty (& pagelist )) {
1362+ WARN_ON_ONCE (flags & MPOL_MF_LAZY );
12081363 nr_failed = migrate_pages (& pagelist , new_vma_page ,
12091364 (unsigned long )vma ,
12101365 false, MIGRATE_SYNC ,
@@ -1213,7 +1368,7 @@ static long do_mbind(unsigned long start, unsigned long len,
12131368 putback_lru_pages (& pagelist );
12141369 }
12151370
1216- if (! err && nr_failed && (flags & MPOL_MF_STRICT ))
1371+ if (nr_failed && (flags & MPOL_MF_STRICT ))
12171372 err = - EIO ;
12181373 } else
12191374 putback_lru_pages (& pagelist );
0 commit comments