| 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | #include <linux/mm.h> |
| 3 | #include <linux/rmap.h> |
| 4 | #include <linux/hugetlb.h> |
| 5 | #include <linux/swap.h> |
| 6 | #include <linux/leafops.h> |
| 7 | |
| 8 | #include "internal.h" |
| 9 | |
/*
 * Finish the walk through page_vma_mapped_walk_done() and yield false, so
 * that every "no match" exit can be written as "return not_found(pvmw);".
 */
static inline bool not_found(struct page_vma_mapped_walk *pvmw)
{
	page_vma_mapped_walk_done(pvmw);

	return false;
}
| 15 | |
| 16 | static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp, |
| 17 | spinlock_t **ptlp) |
| 18 | { |
| 19 | bool is_migration; |
| 20 | pte_t ptent; |
| 21 | |
| 22 | if (pvmw->flags & PVMW_SYNC) { |
| 23 | /* Use the stricter lookup */ |
| 24 | pvmw->pte = pte_offset_map_lock(mm: pvmw->vma->vm_mm, pmd: pvmw->pmd, |
| 25 | addr: pvmw->address, ptlp: &pvmw->ptl); |
| 26 | *ptlp = pvmw->ptl; |
| 27 | return !!pvmw->pte; |
| 28 | } |
| 29 | |
| 30 | is_migration = pvmw->flags & PVMW_MIGRATION; |
| 31 | again: |
| 32 | /* |
| 33 | * It is important to return the ptl corresponding to pte, |
| 34 | * in case *pvmw->pmd changes underneath us; so we need to |
| 35 | * return it even when choosing not to lock, in case caller |
| 36 | * proceeds to loop over next ptes, and finds a match later. |
| 37 | * Though, in most cases, page lock already protects this. |
| 38 | */ |
| 39 | pvmw->pte = pte_offset_map_rw_nolock(mm: pvmw->vma->vm_mm, pmd: pvmw->pmd, |
| 40 | addr: pvmw->address, pmdvalp, ptlp); |
| 41 | if (!pvmw->pte) |
| 42 | return false; |
| 43 | |
| 44 | ptent = ptep_get(ptep: pvmw->pte); |
| 45 | |
| 46 | if (pte_none(pte: ptent)) { |
| 47 | return false; |
| 48 | } else if (pte_present(a: ptent)) { |
| 49 | if (is_migration) |
| 50 | return false; |
| 51 | } else if (!is_migration) { |
| 52 | softleaf_t entry; |
| 53 | |
| 54 | /* |
| 55 | * Handle un-addressable ZONE_DEVICE memory. |
| 56 | * |
| 57 | * We get here when we are trying to unmap a private |
| 58 | * device page from the process address space. Such |
| 59 | * page is not CPU accessible and thus is mapped as |
| 60 | * a special swap entry, nonetheless it still does |
| 61 | * count as a valid regular mapping for the page |
| 62 | * (and is accounted as such in page maps count). |
| 63 | * |
| 64 | * So handle this special case as if it was a normal |
| 65 | * page mapping ie lock CPU page table and return true. |
| 66 | * |
| 67 | * For more details on device private memory see HMM |
| 68 | * (include/linux/hmm.h or mm/hmm.c). |
| 69 | */ |
| 70 | entry = softleaf_from_pte(pte: ptent); |
| 71 | if (!softleaf_is_device_private(entry) && |
| 72 | !softleaf_is_device_exclusive(entry)) |
| 73 | return false; |
| 74 | } |
| 75 | spin_lock(lock: *ptlp); |
| 76 | if (unlikely(!pmd_same(*pmdvalp, pmdp_get_lockless(pvmw->pmd)))) { |
| 77 | pte_unmap_unlock(pvmw->pte, *ptlp); |
| 78 | goto again; |
| 79 | } |
| 80 | pvmw->ptl = *ptlp; |
| 81 | |
| 82 | return true; |
| 83 | } |
| 84 | |
| 85 | /** |
| 86 | * check_pte - check if [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages) is |
| 87 | * mapped at the @pvmw->pte |
| 88 | * @pvmw: page_vma_mapped_walk struct, includes a pair pte and pfn range |
| 89 | * for checking |
| 90 | * @pte_nr: the number of small pages described by @pvmw->pte. |
| 91 | * |
| 92 | * page_vma_mapped_walk() found a place where pfn range is *potentially* |
| 93 | * mapped. check_pte() has to validate this. |
| 94 | * |
| 95 | * pvmw->pte may point to empty PTE, swap PTE or PTE pointing to |
| 96 | * arbitrary page. |
| 97 | * |
| 98 | * If PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains migration |
| 99 | * entry that points to [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages) |
| 100 | * |
| 101 | * If PVMW_MIGRATION flag is not set, returns true if pvmw->pte points to |
| 102 | * [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages) |
| 103 | * |
| 104 | * Otherwise, return false. |
| 105 | * |
| 106 | */ |
| 107 | static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr) |
| 108 | { |
| 109 | unsigned long pfn; |
| 110 | pte_t ptent = ptep_get(ptep: pvmw->pte); |
| 111 | |
| 112 | if (pvmw->flags & PVMW_MIGRATION) { |
| 113 | const softleaf_t entry = softleaf_from_pte(pte: ptent); |
| 114 | |
| 115 | if (!softleaf_is_migration(entry)) |
| 116 | return false; |
| 117 | |
| 118 | pfn = softleaf_to_pfn(entry); |
| 119 | } else if (pte_present(a: ptent)) { |
| 120 | pfn = pte_pfn(pte: ptent); |
| 121 | } else { |
| 122 | const softleaf_t entry = softleaf_from_pte(pte: ptent); |
| 123 | |
| 124 | /* Handle un-addressable ZONE_DEVICE memory */ |
| 125 | if (!softleaf_is_device_private(entry) && |
| 126 | !softleaf_is_device_exclusive(entry)) |
| 127 | return false; |
| 128 | |
| 129 | pfn = softleaf_to_pfn(entry); |
| 130 | } |
| 131 | |
| 132 | if ((pfn + pte_nr - 1) < pvmw->pfn) |
| 133 | return false; |
| 134 | if (pfn > (pvmw->pfn + pvmw->nr_pages - 1)) |
| 135 | return false; |
| 136 | return true; |
| 137 | } |
| 138 | |
| 139 | /* Returns true if the two ranges overlap. Careful to not overflow. */ |
| 140 | static bool check_pmd(unsigned long pfn, struct page_vma_mapped_walk *pvmw) |
| 141 | { |
| 142 | if ((pfn + HPAGE_PMD_NR - 1) < pvmw->pfn) |
| 143 | return false; |
| 144 | if (pfn > pvmw->pfn + pvmw->nr_pages - 1) |
| 145 | return false; |
| 146 | return true; |
| 147 | } |
| 148 | |
| 149 | static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) |
| 150 | { |
| 151 | pvmw->address = (pvmw->address + size) & ~(size - 1); |
| 152 | if (!pvmw->address) |
| 153 | pvmw->address = ULONG_MAX; |
| 154 | } |
| 155 | |
| 156 | /** |
| 157 | * page_vma_mapped_walk - check if @pvmw->pfn is mapped in @pvmw->vma at |
| 158 | * @pvmw->address |
| 159 | * @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags |
| 160 | * must be set. pmd, pte and ptl must be NULL. |
| 161 | * |
| 162 | * Returns true if the page is mapped in the vma. @pvmw->pmd and @pvmw->pte point |
| 163 | * to relevant page table entries. @pvmw->ptl is locked. @pvmw->address is |
| 164 | * adjusted if needed (for PTE-mapped THPs). |
| 165 | * |
| 166 | * If @pvmw->pmd is set but @pvmw->pte is not, you have found PMD-mapped page |
| 167 | * (usually THP). For PTE-mapped THP, you should run page_vma_mapped_walk() in |
| 168 | * a loop to find all PTEs that map the THP. |
| 169 | * |
| 170 | * For HugeTLB pages, @pvmw->pte is set to the relevant page table entry |
| 171 | * regardless of which page table level the page is mapped at. @pvmw->pmd is |
| 172 | * NULL. |
| 173 | * |
| 174 | * Returns false if there are no more page table entries for the page in |
| 175 | * the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped. |
| 176 | * |
| 177 | * If you need to stop the walk before page_vma_mapped_walk() returned false, |
| 178 | * use page_vma_mapped_walk_done(). It will do the housekeeping. |
| 179 | */ |
| 180 | bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) |
| 181 | { |
| 182 | struct vm_area_struct *vma = pvmw->vma; |
| 183 | struct mm_struct *mm = vma->vm_mm; |
| 184 | unsigned long end; |
| 185 | spinlock_t *ptl; |
| 186 | pgd_t *pgd; |
| 187 | p4d_t *p4d; |
| 188 | pud_t *pud; |
| 189 | pmd_t pmde; |
| 190 | |
| 191 | /* The only possible pmd mapping has been handled on last iteration */ |
| 192 | if (pvmw->pmd && !pvmw->pte) |
| 193 | return not_found(pvmw); |
| 194 | |
| 195 | if (unlikely(is_vm_hugetlb_page(vma))) { |
| 196 | struct hstate *hstate = hstate_vma(vma); |
| 197 | unsigned long size = huge_page_size(h: hstate); |
| 198 | /* The only possible mapping was handled on last iteration */ |
| 199 | if (pvmw->pte) |
| 200 | return not_found(pvmw); |
| 201 | /* |
| 202 | * All callers that get here will already hold the |
| 203 | * i_mmap_rwsem. Therefore, no additional locks need to be |
| 204 | * taken before calling hugetlb_walk(). |
| 205 | */ |
| 206 | pvmw->pte = hugetlb_walk(vma, addr: pvmw->address, sz: size); |
| 207 | if (!pvmw->pte) |
| 208 | return false; |
| 209 | |
| 210 | pvmw->ptl = huge_pte_lock(h: hstate, mm, pte: pvmw->pte); |
| 211 | if (!check_pte(pvmw, pte_nr: pages_per_huge_page(h: hstate))) |
| 212 | return not_found(pvmw); |
| 213 | return true; |
| 214 | } |
| 215 | |
| 216 | end = vma_address_end(pvmw); |
| 217 | if (pvmw->pte) |
| 218 | goto next_pte; |
| 219 | restart: |
| 220 | do { |
| 221 | pgd = pgd_offset(mm, pvmw->address); |
| 222 | if (!pgd_present(pgd: *pgd)) { |
| 223 | step_forward(pvmw, PGDIR_SIZE); |
| 224 | continue; |
| 225 | } |
| 226 | p4d = p4d_offset(pgd, address: pvmw->address); |
| 227 | if (!p4d_present(p4d: *p4d)) { |
| 228 | step_forward(pvmw, P4D_SIZE); |
| 229 | continue; |
| 230 | } |
| 231 | pud = pud_offset(p4d, address: pvmw->address); |
| 232 | if (!pud_present(pud: *pud)) { |
| 233 | step_forward(pvmw, PUD_SIZE); |
| 234 | continue; |
| 235 | } |
| 236 | |
| 237 | pvmw->pmd = pmd_offset(pud, address: pvmw->address); |
| 238 | /* |
| 239 | * Make sure the pmd value isn't cached in a register by the |
| 240 | * compiler and used as a stale value after we've observed a |
| 241 | * subsequent update. |
| 242 | */ |
| 243 | pmde = pmdp_get_lockless(pmdp: pvmw->pmd); |
| 244 | |
| 245 | if (pmd_trans_huge(pmd: pmde) || pmd_is_migration_entry(pmd: pmde)) { |
| 246 | pvmw->ptl = pmd_lock(mm, pmd: pvmw->pmd); |
| 247 | pmde = *pvmw->pmd; |
| 248 | if (!pmd_present(pmd: pmde)) { |
| 249 | softleaf_t entry; |
| 250 | |
| 251 | if (!thp_migration_supported() || |
| 252 | !(pvmw->flags & PVMW_MIGRATION)) |
| 253 | return not_found(pvmw); |
| 254 | entry = softleaf_from_pmd(pmd: pmde); |
| 255 | |
| 256 | if (!softleaf_is_migration(entry) || |
| 257 | !check_pmd(pfn: softleaf_to_pfn(entry), pvmw)) |
| 258 | return not_found(pvmw); |
| 259 | return true; |
| 260 | } |
| 261 | if (likely(pmd_trans_huge(pmde))) { |
| 262 | if (pvmw->flags & PVMW_MIGRATION) |
| 263 | return not_found(pvmw); |
| 264 | if (!check_pmd(pfn: pmd_pfn(pmd: pmde), pvmw)) |
| 265 | return not_found(pvmw); |
| 266 | return true; |
| 267 | } |
| 268 | /* THP pmd was split under us: handle on pte level */ |
| 269 | spin_unlock(lock: pvmw->ptl); |
| 270 | pvmw->ptl = NULL; |
| 271 | } else if (!pmd_present(pmd: pmde)) { |
| 272 | /* |
| 273 | * If PVMW_SYNC, take and drop THP pmd lock so that we |
| 274 | * cannot return prematurely, while zap_huge_pmd() has |
| 275 | * cleared *pmd but not decremented compound_mapcount(). |
| 276 | */ |
| 277 | const softleaf_t entry = softleaf_from_pmd(pmd: pmde); |
| 278 | |
| 279 | if (softleaf_is_device_private(entry)) { |
| 280 | pvmw->ptl = pmd_lock(mm, pmd: pvmw->pmd); |
| 281 | return true; |
| 282 | } |
| 283 | |
| 284 | if ((pvmw->flags & PVMW_SYNC) && |
| 285 | thp_vma_suitable_order(vma, addr: pvmw->address, |
| 286 | PMD_ORDER) && |
| 287 | (pvmw->nr_pages >= HPAGE_PMD_NR)) { |
| 288 | spinlock_t *ptl = pmd_lock(mm, pmd: pvmw->pmd); |
| 289 | |
| 290 | spin_unlock(lock: ptl); |
| 291 | } |
| 292 | step_forward(pvmw, PMD_SIZE); |
| 293 | continue; |
| 294 | } |
| 295 | if (!map_pte(pvmw, pmdvalp: &pmde, ptlp: &ptl)) { |
| 296 | if (!pvmw->pte) |
| 297 | goto restart; |
| 298 | goto next_pte; |
| 299 | } |
| 300 | this_pte: |
| 301 | if (check_pte(pvmw, pte_nr: 1)) |
| 302 | return true; |
| 303 | next_pte: |
| 304 | do { |
| 305 | pvmw->address += PAGE_SIZE; |
| 306 | if (pvmw->address >= end) |
| 307 | return not_found(pvmw); |
| 308 | /* Did we cross page table boundary? */ |
| 309 | if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) { |
| 310 | if (pvmw->ptl) { |
| 311 | spin_unlock(lock: pvmw->ptl); |
| 312 | pvmw->ptl = NULL; |
| 313 | } |
| 314 | pte_unmap(pte: pvmw->pte); |
| 315 | pvmw->pte = NULL; |
| 316 | pvmw->flags |= PVMW_PGTABLE_CROSSED; |
| 317 | goto restart; |
| 318 | } |
| 319 | pvmw->pte++; |
| 320 | } while (pte_none(pte: ptep_get(ptep: pvmw->pte))); |
| 321 | |
| 322 | if (!pvmw->ptl) { |
| 323 | spin_lock(lock: ptl); |
| 324 | if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) { |
| 325 | pte_unmap_unlock(pvmw->pte, ptl); |
| 326 | pvmw->pte = NULL; |
| 327 | goto restart; |
| 328 | } |
| 329 | pvmw->ptl = ptl; |
| 330 | } |
| 331 | goto this_pte; |
| 332 | } while (pvmw->address < end); |
| 333 | |
| 334 | return false; |
| 335 | } |
| 336 | |
| 337 | #ifdef CONFIG_MEMORY_FAILURE |
| 338 | /** |
| 339 | * page_mapped_in_vma - check whether a page is really mapped in a VMA |
| 340 | * @page: the page to test |
| 341 | * @vma: the VMA to test |
| 342 | * |
| 343 | * Return: The address the page is mapped at if the page is in the range |
| 344 | * covered by the VMA and present in the page table. If the page is |
| 345 | * outside the VMA or not present, returns -EFAULT. |
| 346 | * Only valid for normal file or anonymous VMAs. |
| 347 | */ |
| 348 | unsigned long page_mapped_in_vma(const struct page *page, |
| 349 | struct vm_area_struct *vma) |
| 350 | { |
| 351 | const struct folio *folio = page_folio(page); |
| 352 | struct page_vma_mapped_walk pvmw = { |
| 353 | .pfn = page_to_pfn(page), |
| 354 | .nr_pages = 1, |
| 355 | .vma = vma, |
| 356 | .flags = PVMW_SYNC, |
| 357 | }; |
| 358 | |
| 359 | pvmw.address = vma_address(vma, pgoff: page_pgoff(folio, page), nr_pages: 1); |
| 360 | if (pvmw.address == -EFAULT) |
| 361 | goto out; |
| 362 | if (!page_vma_mapped_walk(pvmw: &pvmw)) |
| 363 | return -EFAULT; |
| 364 | page_vma_mapped_walk_done(pvmw: &pvmw); |
| 365 | out: |
| 366 | return pvmw.address; |
| 367 | } |
| 368 | #endif |
| 369 | |