/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGEWALK_H
#define _LINUX_PAGEWALK_H

#include <linux/mm.h>

struct mm_walk;

/* Locking requirement during a page walk. */
enum page_walk_lock {
	/* mmap_lock should be locked for read to stabilize the vma tree */
	PGWALK_RDLOCK = 0,
	/* vma will be write-locked during the walk */
	PGWALK_WRLOCK = 1,
	/* vma is expected to be already write-locked during the walk */
	PGWALK_WRLOCK_VERIFY = 2,
	/* vma is expected to be already read-locked during the walk */
	PGWALK_VMA_RDLOCK_VERIFY = 3,
};

/**
 * struct mm_walk_ops - callbacks for walk_page_range
 * @pgd_entry:		if set, called for each non-empty PGD (top-level) entry
 * @p4d_entry:		if set, called for each non-empty P4D entry
 * @pud_entry:		if set, called for each non-empty PUD entry
 * @pmd_entry:		if set, called for each non-empty PMD entry.
 *			This handler is required to be able to handle
 *			pmd_trans_huge() pmds. It may simply choose to
 *			split_huge_page() instead of handling them explicitly.
 * @pte_entry:		if set, called for each PTE (lowest-level) entry
 *			including empty ones, except if @install_pte is set.
 *			If @install_pte is set, @pte_entry is called only for
 *			existing PTEs.
 * @pte_hole:		if set, called for each hole at all levels,
 *			depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD.
 *			Any folded depths (where PTRS_PER_P?D is equal to 1)
 *			are skipped. If @install_pte is specified, this will
 *			not trigger for any populated ranges.
 * @hugetlb_entry:	if set, called for each hugetlb entry. This hook
 *			function is called with the vma lock held, in order to
 *			protect against a concurrent freeing of the pte_t* or
 *			the ptl. In some cases, the hook function needs to drop
 *			and retake the vma lock in order to avoid deadlocks
 *			while calling other functions. In such cases the hook
 *			function must either refrain from accessing the pte or
 *			ptl after dropping the vma lock, or else revalidate
 *			those items after re-acquiring the vma lock and before
 *			accessing them.
 * @test_walk:		caller-specific callback function to determine whether
 *			we walk over the current vma or not. Returning 0 means
 *			"do page table walk over the current vma", returning
 *			a negative value means "abort current page table walk
 *			right now" and returning 1 means "skip the current vma".
 *			Note that this callback is not called when the caller
 *			passes in a single VMA as for walk_page_vma().
 * @pre_vma:		if set, called before starting walk on a non-null vma.
 * @post_vma:		if set, called after a walk on a non-null vma, provided
 *			that @pre_vma and the vma walk succeeded.
 * @install_pte:	if set, missing page table entries are installed and
 *			thus all levels are always walked in the specified
 *			range. This callback is then invoked at the PTE level
 *			(having split any THP pages prior), providing the PTE to
 *			install. If allocations fail, the walk is aborted. This
 *			operation is only available for userland memory. Not
 *			usable for hugetlb ranges.
 *
 * p?d_entry callbacks are called even if those levels are folded on a
 * particular architecture/configuration.
 */
struct mm_walk_ops {
	int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
			 unsigned long next, struct mm_walk *walk);
	int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
			 unsigned long next, struct mm_walk *walk);
	int (*pud_entry)(pud_t *pud, unsigned long addr,
			 unsigned long next, struct mm_walk *walk);
	int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
			 unsigned long next, struct mm_walk *walk);
	int (*pte_entry)(pte_t *pte, unsigned long addr,
			 unsigned long next, struct mm_walk *walk);
	int (*pte_hole)(unsigned long addr, unsigned long next,
			int depth, struct mm_walk *walk);
	int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
			     unsigned long addr, unsigned long next,
			     struct mm_walk *walk);
	int (*test_walk)(unsigned long addr, unsigned long next,
			 struct mm_walk *walk);
	int (*pre_vma)(unsigned long start, unsigned long end,
		       struct mm_walk *walk);
	void (*post_vma)(struct mm_walk *walk);
	int (*install_pte)(unsigned long addr, unsigned long next,
			   pte_t *ptep, struct mm_walk *walk);
	enum page_walk_lock walk_lock;
};

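/*
 * Example: a minimal sketch (not part of this header) of a walker that
 * counts present PTEs via @pte_entry and @private. The function and
 * variable names here are hypothetical.
 *
 *	static int count_pte(pte_t *pte, unsigned long addr,
 *			     unsigned long next, struct mm_walk *walk)
 *	{
 *		unsigned long *nr_present = walk->private;
 *
 *		if (pte_present(ptep_get(pte)))
 *			(*nr_present)++;
 *		return 0;
 *	}
 *
 *	static const struct mm_walk_ops count_ops = {
 *		.pte_entry = count_pte,
 *		.walk_lock = PGWALK_RDLOCK,
 *	};
 */
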
/*
 * Action for pud_entry / pmd_entry callbacks.
 * ACTION_SUBTREE is the default.
 */
enum page_walk_action {
	/* Descend to next level, splitting huge pages if needed and possible */
	ACTION_SUBTREE = 0,
	/* Continue to next entry at this level (ignoring any subtree) */
	ACTION_CONTINUE = 1,
	/* Call again for this entry */
	ACTION_AGAIN = 2
};

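/*
 * Example: a hypothetical pmd_entry handler that processes huge PMDs in
 * place and skips their subtree via @action, leaving the default
 * ACTION_SUBTREE behaviour (split and descend) for everything else.
 * handle_huge_pmd() is an illustrative placeholder, not a real helper,
 * and any locking it needs is omitted here.
 *
 *	static int my_pmd_entry(pmd_t *pmd, unsigned long addr,
 *				unsigned long next, struct mm_walk *walk)
 *	{
 *		if (pmd_trans_huge(*pmd)) {
 *			handle_huge_pmd(pmd, addr, walk);
 *			walk->action = ACTION_CONTINUE;	// don't descend
 *		}
 *		return 0;
 *	}
 */
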
/**
 * struct mm_walk - walk_page_range data
 * @ops:	operation to call during the walk
 * @mm:		mm_struct representing the target process of page table walk
 * @pgd:	pointer to PGD; only valid with no_vma (otherwise set to NULL)
 * @vma:	vma currently walked (NULL if walking outside vmas)
 * @action:	next action to perform (see enum page_walk_action)
 * @no_vma:	walk ignoring vmas (vma will always be NULL)
 * @private:	private data for callbacks' usage
 *
 * (see the comment on walk_page_range() for more details)
 */
struct mm_walk {
	const struct mm_walk_ops *ops;
	struct mm_struct *mm;
	pgd_t *pgd;
	struct vm_area_struct *vma;
	enum page_walk_action action;
	bool no_vma;
	void *private;
};

int walk_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private);
int walk_kernel_page_table_range(unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		pgd_t *pgd, void *private);
int walk_kernel_page_table_range_lockless(unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		pgd_t *pgd, void *private);
int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private);
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		void *private);
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		pgoff_t nr, const struct mm_walk_ops *ops,
		void *private);

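/*
 * Example: invoking the walker over a userspace range; a sketch that
 * reuses the hypothetical count_ops above. With .walk_lock set to
 * PGWALK_RDLOCK, the caller must hold mmap_lock for read across the
 * walk.
 *
 *	unsigned long nr_present = 0;
 *	int err;
 *
 *	mmap_read_lock(mm);
 *	err = walk_page_range(mm, start, end, &count_ops, &nr_present);
 *	mmap_read_unlock(mm);
 */
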
typedef int __bitwise folio_walk_flags_t;

/*
 * Walk migration entries as well. Careful: a large folio might get split
 * concurrently.
 */
#define FW_MIGRATION		((__force folio_walk_flags_t)BIT(0))

/* Walk shared zeropages (small + huge) as well. */
#define FW_ZEROPAGE		((__force folio_walk_flags_t)BIT(1))

enum folio_walk_level {
	FW_LEVEL_PTE,
	FW_LEVEL_PMD,
	FW_LEVEL_PUD,
};

/**
 * struct folio_walk - folio_walk_start() / folio_walk_end() data
 * @page:	exact folio page referenced (if applicable)
 * @level:	page table level identifying the entry type
 * @ptep:	pointer to the page table entry (FW_LEVEL_PTE).
 * @pmdp:	pointer to the page table entry (FW_LEVEL_PMD).
 * @pudp:	pointer to the page table entry (FW_LEVEL_PUD).
 * @pte:	value of the page table entry (FW_LEVEL_PTE).
 * @pmd:	value of the page table entry (FW_LEVEL_PMD).
 * @pud:	value of the page table entry (FW_LEVEL_PUD).
 * @ptl:	pointer to the page table lock.
 *
 * (see folio_walk_start() documentation for more details)
 */
struct folio_walk {
	/* public */
	struct page *page;
	enum folio_walk_level level;
	union {
		pte_t *ptep;
		pud_t *pudp;
		pmd_t *pmdp;
	};
	union {
		pte_t pte;
		pud_t pud;
		pmd_t pmd;
	};
	/* private */
	struct vm_area_struct *vma;
	spinlock_t *ptl;
};

struct folio *folio_walk_start(struct folio_walk *fw,
		struct vm_area_struct *vma, unsigned long addr,
		folio_walk_flags_t flags);

#define folio_walk_end(__fw, __vma) do { \
	spin_unlock((__fw)->ptl); \
	if (likely((__fw)->level == FW_LEVEL_PTE)) \
		pte_unmap((__fw)->ptep); \
	vma_pgtable_walk_end(__vma); \
} while (0)

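/*
 * Example: a sketch of looking up the folio mapped at @addr in @vma and
 * ending the walk again. The caller is assumed to hold a lock keeping
 * the VMA stable (e.g. mmap_lock in read mode); see folio_walk_start()
 * for the exact requirements.
 *
 *	struct folio_walk fw;
 *	struct folio *folio;
 *
 *	folio = folio_walk_start(&fw, vma, addr, 0);
 *	if (folio) {
 *		// fw.page, fw.level and the entry value (fw.pte, fw.pmd
 *		// or fw.pud, depending on fw.level) are valid here, with
 *		// fw.ptl held.
 *		folio_walk_end(&fw, vma);
 *	}
 */
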
#endif /* _LINUX_PAGEWALK_H */