1// SPDX-License-Identifier: GPL-2.0
2/*
3 * KVM guest address space mapping code
4 *
5 * Copyright IBM Corp. 2007, 2020
6 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
7 * David Hildenbrand <david@redhat.com>
8 * Janosch Frank <frankja@linux.vnet.ibm.com>
9 */
10
11#include <linux/cpufeature.h>
12#include <linux/export.h>
13#include <linux/kernel.h>
14#include <linux/pagewalk.h>
15#include <linux/swap.h>
16#include <linux/smp.h>
17#include <linux/spinlock.h>
18#include <linux/slab.h>
19#include <linux/swapops.h>
20#include <linux/ksm.h>
21#include <linux/mman.h>
22#include <linux/pgtable.h>
23#include <asm/page-states.h>
24#include <asm/pgalloc.h>
25#include <asm/machine.h>
26#include <asm/gmap_helpers.h>
27#include <asm/gmap.h>
28#include <asm/page.h>
29
/*
 * Guest addresses are stored directly as radix tree entries. Address 0 is
 * a valid guest address, but a radix tree lookup also yields NULL (0) when
 * nothing was found, so a stored entry must be distinguishable from "no
 * entry". All users of these macros ignore the low-order bits of the
 * address, which allows bit 0 to serve as a "valid entry" marker.
 *
 * VALID_GADDR_FLAG  - marker bit set in every stored entry
 * IS_GADDR_VALID()  - non-zero if the value carries the marker bit
 * MAKE_VALID_GADDR() - mask the address with HPAGE_MASK and set the marker
 */
#define VALID_GADDR_FLAG 1
#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG)
#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG)

#define GMAP_SHADOW_FAKE_TABLE 1ULL
41
42static struct page *gmap_alloc_crst(void)
43{
44 struct page *page;
45
46 page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
47 if (!page)
48 return NULL;
49 __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER);
50 return page;
51}
52
53/**
54 * gmap_alloc - allocate and initialize a guest address space
55 * @limit: maximum address of the gmap address space
56 *
57 * Returns a guest address space structure.
58 */
59struct gmap *gmap_alloc(unsigned long limit)
60{
61 struct gmap *gmap;
62 struct page *page;
63 unsigned long *table;
64 unsigned long etype, atype;
65
66 if (limit < _REGION3_SIZE) {
67 limit = _REGION3_SIZE - 1;
68 atype = _ASCE_TYPE_SEGMENT;
69 etype = _SEGMENT_ENTRY_EMPTY;
70 } else if (limit < _REGION2_SIZE) {
71 limit = _REGION2_SIZE - 1;
72 atype = _ASCE_TYPE_REGION3;
73 etype = _REGION3_ENTRY_EMPTY;
74 } else if (limit < _REGION1_SIZE) {
75 limit = _REGION1_SIZE - 1;
76 atype = _ASCE_TYPE_REGION2;
77 etype = _REGION2_ENTRY_EMPTY;
78 } else {
79 limit = -1UL;
80 atype = _ASCE_TYPE_REGION1;
81 etype = _REGION1_ENTRY_EMPTY;
82 }
83 gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
84 if (!gmap)
85 goto out;
86 INIT_LIST_HEAD(list: &gmap->children);
87 INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
88 INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
89 INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
90 spin_lock_init(&gmap->guest_table_lock);
91 spin_lock_init(&gmap->shadow_lock);
92 refcount_set(r: &gmap->ref_count, n: 1);
93 page = gmap_alloc_crst();
94 if (!page)
95 goto out_free;
96 table = page_to_virt(page);
97 crst_table_init(table, etype);
98 gmap->table = table;
99 gmap->asce = atype | _ASCE_TABLE_LENGTH |
100 _ASCE_USER_BITS | __pa(table);
101 gmap->asce_end = limit;
102 return gmap;
103
104out_free:
105 kfree(objp: gmap);
106out:
107 return NULL;
108}
109EXPORT_SYMBOL_GPL(gmap_alloc);
110
111/**
112 * gmap_create - create a guest address space
113 * @mm: pointer to the parent mm_struct
114 * @limit: maximum size of the gmap address space
115 *
116 * Returns a guest address space structure.
117 */
118struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
119{
120 struct gmap *gmap;
121 unsigned long gmap_asce;
122
123 gmap = gmap_alloc(limit);
124 if (!gmap)
125 return NULL;
126 gmap->mm = mm;
127 spin_lock(lock: &mm->context.lock);
128 list_add_rcu(new: &gmap->list, head: &mm->context.gmap_list);
129 if (list_is_singular(head: &mm->context.gmap_list))
130 gmap_asce = gmap->asce;
131 else
132 gmap_asce = -1UL;
133 WRITE_ONCE(mm->context.gmap_asce, gmap_asce);
134 spin_unlock(lock: &mm->context.lock);
135 return gmap;
136}
137EXPORT_SYMBOL_GPL(gmap_create);
138
139static void gmap_flush_tlb(struct gmap *gmap)
140{
141 __tlb_flush_idte(gmap->asce);
142}
143
144static void gmap_radix_tree_free(struct radix_tree_root *root)
145{
146 struct radix_tree_iter iter;
147 unsigned long indices[16];
148 unsigned long index;
149 void __rcu **slot;
150 int i, nr;
151
152 /* A radix tree is freed by deleting all of its entries */
153 index = 0;
154 do {
155 nr = 0;
156 radix_tree_for_each_slot(slot, root, &iter, index) {
157 indices[nr] = iter.index;
158 if (++nr == 16)
159 break;
160 }
161 for (i = 0; i < nr; i++) {
162 index = indices[i];
163 radix_tree_delete(root, index);
164 }
165 } while (nr > 0);
166}
167
168static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
169{
170 struct gmap_rmap *rmap, *rnext, *head;
171 struct radix_tree_iter iter;
172 unsigned long indices[16];
173 unsigned long index;
174 void __rcu **slot;
175 int i, nr;
176
177 /* A radix tree is freed by deleting all of its entries */
178 index = 0;
179 do {
180 nr = 0;
181 radix_tree_for_each_slot(slot, root, &iter, index) {
182 indices[nr] = iter.index;
183 if (++nr == 16)
184 break;
185 }
186 for (i = 0; i < nr; i++) {
187 index = indices[i];
188 head = radix_tree_delete(root, index);
189 gmap_for_each_rmap_safe(rmap, rnext, head)
190 kfree(objp: rmap);
191 }
192 } while (nr > 0);
193}
194
195static void gmap_free_crst(unsigned long *table, bool free_ptes)
196{
197 bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0;
198 int i;
199
200 if (is_segment) {
201 if (!free_ptes)
202 goto out;
203 for (i = 0; i < _CRST_ENTRIES; i++)
204 if (!(table[i] & _SEGMENT_ENTRY_INVALID))
205 page_table_free_pgste(page_ptdesc(phys_to_page(table[i])));
206 } else {
207 for (i = 0; i < _CRST_ENTRIES; i++)
208 if (!(table[i] & _REGION_ENTRY_INVALID))
209 gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes);
210 }
211
212out:
213 free_pages(addr: (unsigned long)table, order: CRST_ALLOC_ORDER);
214}
215
216/**
217 * gmap_free - free a guest address space
218 * @gmap: pointer to the guest address space structure
219 *
220 * No locks required. There are no references to this gmap anymore.
221 */
222void gmap_free(struct gmap *gmap)
223{
224 /* Flush tlb of all gmaps (if not already done for shadows) */
225 if (!(gmap_is_shadow(gmap) && gmap->removed))
226 gmap_flush_tlb(gmap);
227 /* Free all segment & region tables. */
228 gmap_free_crst(table: gmap->table, free_ptes: gmap_is_shadow(gmap));
229
230 gmap_radix_tree_free(root: &gmap->guest_to_host);
231 gmap_radix_tree_free(root: &gmap->host_to_guest);
232
233 /* Free additional data for a shadow gmap */
234 if (gmap_is_shadow(gmap)) {
235 gmap_rmap_radix_tree_free(root: &gmap->host_to_rmap);
236 /* Release reference to the parent */
237 gmap_put(gmap->parent);
238 }
239
240 kfree(objp: gmap);
241}
242EXPORT_SYMBOL_GPL(gmap_free);
243
244/**
245 * gmap_get - increase reference counter for guest address space
246 * @gmap: pointer to the guest address space structure
247 *
248 * Returns the gmap pointer
249 */
250struct gmap *gmap_get(struct gmap *gmap)
251{
252 refcount_inc(r: &gmap->ref_count);
253 return gmap;
254}
255EXPORT_SYMBOL_GPL(gmap_get);
256
257/**
258 * gmap_put - decrease reference counter for guest address space
259 * @gmap: pointer to the guest address space structure
260 *
261 * If the reference counter reaches zero the guest address space is freed.
262 */
263void gmap_put(struct gmap *gmap)
264{
265 if (refcount_dec_and_test(r: &gmap->ref_count))
266 gmap_free(gmap);
267}
268EXPORT_SYMBOL_GPL(gmap_put);
269
270/**
271 * gmap_remove - remove a guest address space but do not free it yet
272 * @gmap: pointer to the guest address space structure
273 */
274void gmap_remove(struct gmap *gmap)
275{
276 struct gmap *sg, *next;
277 unsigned long gmap_asce;
278
279 /* Remove all shadow gmaps linked to this gmap */
280 if (!list_empty(head: &gmap->children)) {
281 spin_lock(lock: &gmap->shadow_lock);
282 list_for_each_entry_safe(sg, next, &gmap->children, list) {
283 list_del(entry: &sg->list);
284 gmap_put(sg);
285 }
286 spin_unlock(lock: &gmap->shadow_lock);
287 }
288 /* Remove gmap from the pre-mm list */
289 spin_lock(lock: &gmap->mm->context.lock);
290 list_del_rcu(entry: &gmap->list);
291 if (list_empty(head: &gmap->mm->context.gmap_list))
292 gmap_asce = 0;
293 else if (list_is_singular(head: &gmap->mm->context.gmap_list))
294 gmap_asce = list_first_entry(&gmap->mm->context.gmap_list,
295 struct gmap, list)->asce;
296 else
297 gmap_asce = -1UL;
298 WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce);
299 spin_unlock(lock: &gmap->mm->context.lock);
300 synchronize_rcu();
301 /* Put reference */
302 gmap_put(gmap);
303}
304EXPORT_SYMBOL_GPL(gmap_remove);
305
306/*
307 * gmap_alloc_table is assumed to be called with mmap_lock held
308 */
309static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
310 unsigned long init, unsigned long gaddr)
311{
312 struct page *page;
313 unsigned long *new;
314
315 /* since we dont free the gmap table until gmap_free we can unlock */
316 page = gmap_alloc_crst();
317 if (!page)
318 return -ENOMEM;
319 new = page_to_virt(page);
320 crst_table_init(new, init);
321 spin_lock(lock: &gmap->guest_table_lock);
322 if (*table & _REGION_ENTRY_INVALID) {
323 *table = __pa(new) | _REGION_ENTRY_LENGTH |
324 (*table & _REGION_ENTRY_TYPE_MASK);
325 page = NULL;
326 }
327 spin_unlock(lock: &gmap->guest_table_lock);
328 if (page)
329 __free_pages(page, CRST_ALLOC_ORDER);
330 return 0;
331}
332
333static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr)
334{
335 return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
336}
337
338static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr)
339{
340 return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
341}
342
343static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr,
344 unsigned long *gaddr)
345{
346 *gaddr = host_to_guest_delete(gmap, vmaddr);
347 if (IS_GADDR_VALID(*gaddr))
348 return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1);
349 return NULL;
350}
351
352/**
353 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
354 * @gmap: pointer to the guest address space structure
355 * @vmaddr: address in the host process address space
356 *
357 * Returns 1 if a TLB flush is required
358 */
359static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
360{
361 unsigned long gaddr;
362 int flush = 0;
363 pmd_t *pmdp;
364
365 BUG_ON(gmap_is_shadow(gmap));
366 spin_lock(lock: &gmap->guest_table_lock);
367
368 pmdp = host_to_guest_pmd_delete(gmap, vmaddr, gaddr: &gaddr);
369 if (pmdp) {
370 flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY);
371 *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
372 }
373
374 spin_unlock(lock: &gmap->guest_table_lock);
375 return flush;
376}
377
378/**
379 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
380 * @gmap: pointer to the guest address space structure
381 * @gaddr: address in the guest address space
382 *
383 * Returns 1 if a TLB flush is required
384 */
385static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
386{
387 unsigned long vmaddr;
388
389 vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
390 gaddr >> PMD_SHIFT);
391 return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
392}
393
394/**
395 * gmap_unmap_segment - unmap segment from the guest address space
396 * @gmap: pointer to the guest address space structure
397 * @to: address in the guest address space
398 * @len: length of the memory area to unmap
399 *
400 * Returns 0 if the unmap succeeded, -EINVAL if not.
401 */
402int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
403{
404 unsigned long off;
405 int flush;
406
407 BUG_ON(gmap_is_shadow(gmap));
408 if ((to | len) & (PMD_SIZE - 1))
409 return -EINVAL;
410 if (len == 0 || to + len < to)
411 return -EINVAL;
412
413 flush = 0;
414 mmap_write_lock(mm: gmap->mm);
415 for (off = 0; off < len; off += PMD_SIZE)
416 flush |= __gmap_unmap_by_gaddr(gmap, gaddr: to + off);
417 mmap_write_unlock(mm: gmap->mm);
418 if (flush)
419 gmap_flush_tlb(gmap);
420 return 0;
421}
422EXPORT_SYMBOL_GPL(gmap_unmap_segment);
423
424/**
425 * gmap_map_segment - map a segment to the guest address space
426 * @gmap: pointer to the guest address space structure
427 * @from: source address in the parent address space
428 * @to: target address in the guest address space
429 * @len: length of the memory area to map
430 *
431 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
432 */
433int gmap_map_segment(struct gmap *gmap, unsigned long from,
434 unsigned long to, unsigned long len)
435{
436 unsigned long off;
437 int flush;
438
439 BUG_ON(gmap_is_shadow(gmap));
440 if ((from | to | len) & (PMD_SIZE - 1))
441 return -EINVAL;
442 if (len == 0 || from + len < from || to + len < to ||
443 from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end)
444 return -EINVAL;
445
446 flush = 0;
447 mmap_write_lock(mm: gmap->mm);
448 for (off = 0; off < len; off += PMD_SIZE) {
449 /* Remove old translation */
450 flush |= __gmap_unmap_by_gaddr(gmap, gaddr: to + off);
451 /* Store new translation */
452 if (radix_tree_insert(&gmap->guest_to_host,
453 index: (to + off) >> PMD_SHIFT,
454 (void *) from + off))
455 break;
456 }
457 mmap_write_unlock(mm: gmap->mm);
458 if (flush)
459 gmap_flush_tlb(gmap);
460 if (off >= len)
461 return 0;
462 gmap_unmap_segment(gmap, to, len);
463 return -ENOMEM;
464}
465EXPORT_SYMBOL_GPL(gmap_map_segment);
466
467/**
468 * __gmap_translate - translate a guest address to a user space address
469 * @gmap: pointer to guest mapping meta data structure
470 * @gaddr: guest address
471 *
472 * Returns user space address which corresponds to the guest address or
473 * -EFAULT if no such mapping exists.
474 * This function does not establish potentially missing page table entries.
475 * The mmap_lock of the mm that belongs to the address space must be held
476 * when this function gets called.
477 *
478 * Note: Can also be called for shadow gmaps.
479 */
480unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
481{
482 unsigned long vmaddr;
483
484 vmaddr = (unsigned long)
485 radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
486 /* Note: guest_to_host is empty for a shadow gmap */
487 return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
488}
489EXPORT_SYMBOL_GPL(__gmap_translate);
490
491/**
492 * gmap_unlink - disconnect a page table from the gmap shadow tables
493 * @mm: pointer to the parent mm_struct
494 * @table: pointer to the host page table
495 * @vmaddr: vm address associated with the host page table
496 */
497void gmap_unlink(struct mm_struct *mm, unsigned long *table,
498 unsigned long vmaddr)
499{
500 struct gmap *gmap;
501 int flush;
502
503 rcu_read_lock();
504 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
505 flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
506 if (flush)
507 gmap_flush_tlb(gmap);
508 }
509 rcu_read_unlock();
510}
511
512static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
513 unsigned long gaddr);
514
515/**
516 * __gmap_link - set up shadow page tables to connect a host to a guest address
517 * @gmap: pointer to guest mapping meta data structure
518 * @gaddr: guest address
519 * @vmaddr: vm address
520 *
521 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
522 * if the vm address is already mapped to a different guest segment.
523 * The mmap_lock of the mm that belongs to the address space must be held
524 * when this function gets called.
525 */
526int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
527{
528 struct mm_struct *mm;
529 unsigned long *table;
530 spinlock_t *ptl;
531 pgd_t *pgd;
532 p4d_t *p4d;
533 pud_t *pud;
534 pmd_t *pmd;
535 u64 unprot;
536 int rc;
537
538 BUG_ON(gmap_is_shadow(gmap));
539 /* Create higher level tables in the gmap page table */
540 table = gmap->table;
541 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
542 table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
543 if ((*table & _REGION_ENTRY_INVALID) &&
544 gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
545 gaddr & _REGION1_MASK))
546 return -ENOMEM;
547 table = __va(*table & _REGION_ENTRY_ORIGIN);
548 }
549 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
550 table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
551 if ((*table & _REGION_ENTRY_INVALID) &&
552 gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
553 gaddr & _REGION2_MASK))
554 return -ENOMEM;
555 table = __va(*table & _REGION_ENTRY_ORIGIN);
556 }
557 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
558 table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
559 if ((*table & _REGION_ENTRY_INVALID) &&
560 gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
561 gaddr & _REGION3_MASK))
562 return -ENOMEM;
563 table = __va(*table & _REGION_ENTRY_ORIGIN);
564 }
565 table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
566 /* Walk the parent mm page table */
567 mm = gmap->mm;
568 pgd = pgd_offset(mm, vmaddr);
569 VM_BUG_ON(pgd_none(*pgd));
570 p4d = p4d_offset(pgd, address: vmaddr);
571 VM_BUG_ON(p4d_none(*p4d));
572 pud = pud_offset(p4d, address: vmaddr);
573 VM_BUG_ON(pud_none(*pud));
574 /* large puds cannot yet be handled */
575 if (pud_leaf(pud: *pud))
576 return -EFAULT;
577 pmd = pmd_offset(pud, address: vmaddr);
578 VM_BUG_ON(pmd_none(*pmd));
579 /* Are we allowed to use huge pages? */
580 if (pmd_leaf(pte: *pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
581 return -EFAULT;
582 /* Link gmap segment table entry location to page table. */
583 rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
584 if (rc)
585 return rc;
586 ptl = pmd_lock(mm, pmd);
587 spin_lock(lock: &gmap->guest_table_lock);
588 if (*table == _SEGMENT_ENTRY_EMPTY) {
589 rc = radix_tree_insert(&gmap->host_to_guest,
590 index: vmaddr >> PMD_SHIFT,
591 (void *)MAKE_VALID_GADDR(gaddr));
592 if (!rc) {
593 if (pmd_leaf(pte: *pmd)) {
594 *table = (pmd_val(*pmd) &
595 _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
596 | _SEGMENT_ENTRY_GMAP_UC
597 | _SEGMENT_ENTRY;
598 } else
599 *table = (pmd_val(*pmd) &
600 _SEGMENT_ENTRY_HARDWARE_BITS)
601 | _SEGMENT_ENTRY;
602 }
603 } else if (*table & _SEGMENT_ENTRY_PROTECT &&
604 !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
605 unprot = (u64)*table;
606 unprot &= ~_SEGMENT_ENTRY_PROTECT;
607 unprot |= _SEGMENT_ENTRY_GMAP_UC;
608 gmap_pmdp_xchg(gmap, old: (pmd_t *)table, new: __pmd(val: unprot), gaddr);
609 }
610 spin_unlock(lock: &gmap->guest_table_lock);
611 spin_unlock(lock: ptl);
612 radix_tree_preload_end();
613 return rc;
614}
615EXPORT_SYMBOL(__gmap_link);
616
617/*
618 * this function is assumed to be called with mmap_lock held
619 */
620void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
621{
622 unsigned long vmaddr;
623
624 mmap_assert_locked(mm: gmap->mm);
625
626 /* Find the vm address for the guest address */
627 vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
628 gaddr >> PMD_SHIFT);
629 if (vmaddr) {
630 vmaddr |= gaddr & ~PMD_MASK;
631 gmap_helper_zap_one_page(gmap->mm, vmaddr);
632 }
633}
634EXPORT_SYMBOL_GPL(__gmap_zap);
635
/* Serializes additions to and removals from gmap_notifier_list. */
static DEFINE_SPINLOCK(gmap_notifier_lock);
/* List of all registered struct gmap_notifier blocks. */
static LIST_HEAD(gmap_notifier_list);
638
639/**
640 * gmap_register_pte_notifier - register a pte invalidation callback
641 * @nb: pointer to the gmap notifier block
642 */
643void gmap_register_pte_notifier(struct gmap_notifier *nb)
644{
645 spin_lock(lock: &gmap_notifier_lock);
646 list_add_rcu(new: &nb->list, head: &gmap_notifier_list);
647 spin_unlock(lock: &gmap_notifier_lock);
648}
649EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
650
651/**
652 * gmap_unregister_pte_notifier - remove a pte invalidation callback
653 * @nb: pointer to the gmap notifier block
654 */
655void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
656{
657 spin_lock(lock: &gmap_notifier_lock);
658 list_del_rcu(entry: &nb->list);
659 spin_unlock(lock: &gmap_notifier_lock);
660 synchronize_rcu();
661}
662EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
663
664/**
665 * gmap_call_notifier - call all registered invalidation callbacks
666 * @gmap: pointer to guest mapping meta data structure
667 * @start: start virtual address in the guest address space
668 * @end: end virtual address in the guest address space
669 */
670static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
671 unsigned long end)
672{
673 struct gmap_notifier *nb;
674
675 list_for_each_entry(nb, &gmap_notifier_list, list)
676 nb->notifier_call(gmap, start, end);
677}
678
679/**
680 * gmap_table_walk - walk the gmap page tables
681 * @gmap: pointer to guest mapping meta data structure
682 * @gaddr: virtual address in the guest address space
683 * @level: page table level to stop at
684 *
685 * Returns a table entry pointer for the given guest address and @level
686 * @level=0 : returns a pointer to a page table table entry (or NULL)
687 * @level=1 : returns a pointer to a segment table entry (or NULL)
688 * @level=2 : returns a pointer to a region-3 table entry (or NULL)
689 * @level=3 : returns a pointer to a region-2 table entry (or NULL)
690 * @level=4 : returns a pointer to a region-1 table entry (or NULL)
691 *
692 * Returns NULL if the gmap page tables could not be walked to the
693 * requested level.
694 *
695 * Note: Can also be called for shadow gmaps.
696 */
697unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level)
698{
699 const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
700 unsigned long *table = gmap->table;
701
702 if (gmap_is_shadow(gmap) && gmap->removed)
703 return NULL;
704
705 if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
706 return NULL;
707
708 if (asce_type != _ASCE_TYPE_REGION1 &&
709 gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
710 return NULL;
711
712 switch (asce_type) {
713 case _ASCE_TYPE_REGION1:
714 table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
715 if (level == 4)
716 break;
717 if (*table & _REGION_ENTRY_INVALID)
718 return NULL;
719 table = __va(*table & _REGION_ENTRY_ORIGIN);
720 fallthrough;
721 case _ASCE_TYPE_REGION2:
722 table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
723 if (level == 3)
724 break;
725 if (*table & _REGION_ENTRY_INVALID)
726 return NULL;
727 table = __va(*table & _REGION_ENTRY_ORIGIN);
728 fallthrough;
729 case _ASCE_TYPE_REGION3:
730 table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
731 if (level == 2)
732 break;
733 if (*table & _REGION_ENTRY_INVALID)
734 return NULL;
735 table = __va(*table & _REGION_ENTRY_ORIGIN);
736 fallthrough;
737 case _ASCE_TYPE_SEGMENT:
738 table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
739 if (level == 1)
740 break;
741 if (*table & _REGION_ENTRY_INVALID)
742 return NULL;
743 table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
744 table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT;
745 }
746 return table;
747}
748EXPORT_SYMBOL(gmap_table_walk);
749
750/**
751 * gmap_pte_op_walk - walk the gmap page table, get the page table lock
752 * and return the pte pointer
753 * @gmap: pointer to guest mapping meta data structure
754 * @gaddr: virtual address in the guest address space
755 * @ptl: pointer to the spinlock pointer
756 *
757 * Returns a pointer to the locked pte for a guest address, or NULL
758 */
759static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
760 spinlock_t **ptl)
761{
762 unsigned long *table;
763
764 BUG_ON(gmap_is_shadow(gmap));
765 /* Walk the gmap page table, lock and get pte pointer */
766 table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
767 if (!table || *table & _SEGMENT_ENTRY_INVALID)
768 return NULL;
769 return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
770}
771
772/**
773 * gmap_pte_op_fixup - force a page in and connect the gmap page table
774 * @gmap: pointer to guest mapping meta data structure
775 * @gaddr: virtual address in the guest address space
776 * @vmaddr: address in the host process address space
777 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
778 *
779 * Returns 0 if the caller can retry __gmap_translate (might fail again),
780 * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
781 * up or connecting the gmap page table.
782 */
783static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
784 unsigned long vmaddr, int prot)
785{
786 struct mm_struct *mm = gmap->mm;
787 unsigned int fault_flags;
788 bool unlocked = false;
789
790 BUG_ON(gmap_is_shadow(gmap));
791 fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
792 if (fixup_user_fault(mm, address: vmaddr, fault_flags, unlocked: &unlocked))
793 return -EFAULT;
794 if (unlocked)
795 /* lost mmap_lock, caller has to retry __gmap_translate */
796 return 0;
797 /* Connect the page tables */
798 return __gmap_link(gmap, gaddr, vmaddr);
799}
800
801/**
802 * gmap_pte_op_end - release the page table lock
803 * @ptep: pointer to the locked pte
804 * @ptl: pointer to the page table spinlock
805 */
806static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl)
807{
808 pte_unmap_unlock(ptep, ptl);
809}
810
811/**
812 * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
813 * and return the pmd pointer
814 * @gmap: pointer to guest mapping meta data structure
815 * @gaddr: virtual address in the guest address space
816 *
817 * Returns a pointer to the pmd for a guest address, or NULL
818 */
819static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
820{
821 pmd_t *pmdp;
822
823 BUG_ON(gmap_is_shadow(gmap));
824 pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
825 if (!pmdp)
826 return NULL;
827
828 /* without huge pages, there is no need to take the table lock */
829 if (!gmap->mm->context.allow_gmap_hpage_1m)
830 return pmd_none(pmd: *pmdp) ? NULL : pmdp;
831
832 spin_lock(lock: &gmap->guest_table_lock);
833 if (pmd_none(pmd: *pmdp)) {
834 spin_unlock(lock: &gmap->guest_table_lock);
835 return NULL;
836 }
837
838 /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
839 if (!pmd_leaf(pte: *pmdp))
840 spin_unlock(lock: &gmap->guest_table_lock);
841 return pmdp;
842}
843
844/**
845 * gmap_pmd_op_end - release the guest_table_lock if needed
846 * @gmap: pointer to the guest mapping meta data structure
847 * @pmdp: pointer to the pmd
848 */
849static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
850{
851 if (pmd_leaf(pte: *pmdp))
852 spin_unlock(lock: &gmap->guest_table_lock);
853}
854
855/*
856 * gmap_protect_pmd - remove access rights to memory and set pmd notification bits
857 * @pmdp: pointer to the pmd to be protected
858 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
859 * @bits: notification bits to set
860 *
861 * Returns:
862 * 0 if successfully protected
863 * -EAGAIN if a fixup is needed
864 * -EINVAL if unsupported notifier bits have been specified
865 *
866 * Expected to be called with sg->mm->mmap_lock in read and
867 * guest_table_lock held.
868 */
869static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
870 pmd_t *pmdp, int prot, unsigned long bits)
871{
872 int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
873 int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
874 pmd_t new = *pmdp;
875
876 /* Fixup needed */
877 if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
878 return -EAGAIN;
879
880 if (prot == PROT_NONE && !pmd_i) {
881 new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
882 gmap_pmdp_xchg(gmap, old: pmdp, new, gaddr);
883 }
884
885 if (prot == PROT_READ && !pmd_p) {
886 new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
887 new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT));
888 gmap_pmdp_xchg(gmap, old: pmdp, new, gaddr);
889 }
890
891 if (bits & GMAP_NOTIFY_MPROT)
892 set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
893
894 /* Shadow GMAP protection needs split PMDs */
895 if (bits & GMAP_NOTIFY_SHADOW)
896 return -EINVAL;
897
898 return 0;
899}
900
901/*
902 * gmap_protect_pte - remove access rights to memory and set pgste bits
903 * @gmap: pointer to guest mapping meta data structure
904 * @gaddr: virtual address in the guest address space
905 * @pmdp: pointer to the pmd associated with the pte
906 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
907 * @bits: notification bits to set
908 *
909 * Returns 0 if successfully protected, -ENOMEM if out of memory and
910 * -EAGAIN if a fixup is needed.
911 *
912 * Expected to be called with sg->mm->mmap_lock in read
913 */
914static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
915 pmd_t *pmdp, int prot, unsigned long bits)
916{
917 int rc;
918 pte_t *ptep;
919 spinlock_t *ptl;
920 unsigned long pbits = 0;
921
922 if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
923 return -EAGAIN;
924
925 ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
926 if (!ptep)
927 return -ENOMEM;
928
929 pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0;
930 pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
931 /* Protect and unlock. */
932 rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
933 gmap_pte_op_end(ptep, ptl);
934 return rc;
935}
936
937/*
938 * gmap_protect_range - remove access rights to memory and set pgste bits
939 * @gmap: pointer to guest mapping meta data structure
940 * @gaddr: virtual address in the guest address space
941 * @len: size of area
942 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
943 * @bits: pgste notification bits to set
944 *
945 * Returns:
946 * PAGE_SIZE if a small page was successfully protected;
947 * HPAGE_SIZE if a large page was successfully protected;
948 * -ENOMEM if out of memory;
949 * -EFAULT if gaddr is invalid (or mapping for shadows is missing);
950 * -EAGAIN if the guest mapping is missing and should be fixed by the caller.
951 *
952 * Context: Called with sg->mm->mmap_lock in read.
953 */
954int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits)
955{
956 pmd_t *pmdp;
957 int rc = 0;
958
959 BUG_ON(gmap_is_shadow(gmap));
960
961 pmdp = gmap_pmd_op_walk(gmap, gaddr);
962 if (!pmdp)
963 return -EAGAIN;
964
965 if (!pmd_leaf(pte: *pmdp)) {
966 rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits);
967 if (!rc)
968 rc = PAGE_SIZE;
969 } else {
970 rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits);
971 if (!rc)
972 rc = HPAGE_SIZE;
973 }
974 gmap_pmd_op_end(gmap, pmdp);
975
976 return rc;
977}
978EXPORT_SYMBOL_GPL(gmap_protect_one);
979
980/**
981 * gmap_read_table - get an unsigned long value from a guest page table using
982 * absolute addressing, without marking the page referenced.
983 * @gmap: pointer to guest mapping meta data structure
984 * @gaddr: virtual address in the guest address space
985 * @val: pointer to the unsigned long value to return
986 *
987 * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
988 * if reading using the virtual address failed. -EINVAL if called on a gmap
989 * shadow.
990 *
991 * Called with gmap->mm->mmap_lock in read.
992 */
993int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
994{
995 unsigned long address, vmaddr;
996 spinlock_t *ptl;
997 pte_t *ptep, pte;
998 int rc;
999
1000 if (gmap_is_shadow(gmap))
1001 return -EINVAL;
1002
1003 while (1) {
1004 rc = -EAGAIN;
1005 ptep = gmap_pte_op_walk(gmap, gaddr, ptl: &ptl);
1006 if (ptep) {
1007 pte = *ptep;
1008 if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
1009 address = pte_val(pte) & PAGE_MASK;
1010 address += gaddr & ~PAGE_MASK;
1011 *val = *(unsigned long *)__va(address);
1012 set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
1013 /* Do *NOT* clear the _PAGE_INVALID bit! */
1014 rc = 0;
1015 }
1016 gmap_pte_op_end(ptep, ptl);
1017 }
1018 if (!rc)
1019 break;
1020 vmaddr = __gmap_translate(gmap, gaddr);
1021 if (IS_ERR_VALUE(vmaddr)) {
1022 rc = vmaddr;
1023 break;
1024 }
1025 rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
1026 if (rc)
1027 break;
1028 }
1029 return rc;
1030}
1031EXPORT_SYMBOL_GPL(gmap_read_table);
1032
1033/**
1034 * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
1035 * @sg: pointer to the shadow guest address space structure
1036 * @vmaddr: vm address associated with the rmap
1037 * @rmap: pointer to the rmap structure
1038 *
1039 * Called with the sg->guest_table_lock
1040 */
1041static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
1042 struct gmap_rmap *rmap)
1043{
1044 struct gmap_rmap *temp;
1045 void __rcu **slot;
1046
1047 BUG_ON(!gmap_is_shadow(sg));
1048 slot = radix_tree_lookup_slot(&sg->host_to_rmap, index: vmaddr >> PAGE_SHIFT);
1049 if (slot) {
1050 rmap->next = radix_tree_deref_slot_protected(slot,
1051 treelock: &sg->guest_table_lock);
1052 for (temp = rmap->next; temp; temp = temp->next) {
1053 if (temp->raddr == rmap->raddr) {
1054 kfree(objp: rmap);
1055 return;
1056 }
1057 }
1058 radix_tree_replace_slot(&sg->host_to_rmap, slot, entry: rmap);
1059 } else {
1060 rmap->next = NULL;
1061 radix_tree_insert(&sg->host_to_rmap, index: vmaddr >> PAGE_SHIFT,
1062 rmap);
1063 }
1064}
1065
1066/**
1067 * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
1068 * @sg: pointer to the shadow guest address space structure
1069 * @raddr: rmap address in the shadow gmap
1070 * @paddr: address in the parent guest address space
1071 * @len: length of the memory area to protect
1072 *
1073 * Returns 0 if successfully protected and the rmap was created, -ENOMEM
1074 * if out of memory and -EFAULT if paddr is invalid.
1075 */
1076static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
1077 unsigned long paddr, unsigned long len)
1078{
1079 struct gmap *parent;
1080 struct gmap_rmap *rmap;
1081 unsigned long vmaddr;
1082 spinlock_t *ptl;
1083 pte_t *ptep;
1084 int rc;
1085
1086 BUG_ON(!gmap_is_shadow(sg));
1087 parent = sg->parent;
1088 while (len) {
1089 vmaddr = __gmap_translate(parent, paddr);
1090 if (IS_ERR_VALUE(vmaddr))
1091 return vmaddr;
1092 rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
1093 if (!rmap)
1094 return -ENOMEM;
1095 rmap->raddr = raddr;
1096 rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
1097 if (rc) {
1098 kfree(objp: rmap);
1099 return rc;
1100 }
1101 rc = -EAGAIN;
1102 ptep = gmap_pte_op_walk(gmap: parent, gaddr: paddr, ptl: &ptl);
1103 if (ptep) {
1104 spin_lock(lock: &sg->guest_table_lock);
1105 rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
1106 PGSTE_VSIE_BIT);
1107 if (!rc)
1108 gmap_insert_rmap(sg, vmaddr, rmap);
1109 spin_unlock(lock: &sg->guest_table_lock);
1110 gmap_pte_op_end(ptep, ptl);
1111 }
1112 radix_tree_preload_end();
1113 if (rc) {
1114 kfree(objp: rmap);
1115 rc = gmap_pte_op_fixup(gmap: parent, gaddr: paddr, vmaddr, PROT_READ);
1116 if (rc)
1117 return rc;
1118 continue;
1119 }
1120 paddr += PAGE_SIZE;
1121 len -= PAGE_SIZE;
1122 }
1123 return 0;
1124}
1125
/*
 * Tag stored in the low bits of gmap_rmap->raddr. The shadow functions OR one
 * of these values into the (aligned) shadow address when they create an rmap,
 * and gmap_shadow_notify() masks it out again with _SHADOW_RMAP_MASK to decide
 * which level of the shadow table structure has to be removed.
 */
#define _SHADOW_RMAP_MASK	0x7
#define _SHADOW_RMAP_REGION1	0x5
#define _SHADOW_RMAP_REGION2	0x4
#define _SHADOW_RMAP_REGION3	0x3
#define _SHADOW_RMAP_SEGMENT	0x2
#define _SHADOW_RMAP_PGTABLE	0x1
1132
/**
 * gmap_idte_one - invalidate a single region or segment table entry
 * @asce: region or segment table *origin* + table-type bits
 * @vaddr: virtual address to identify the table entry to flush
 *
 * The invalid bit of a single region or segment table entry is set
 * and the associated TLB entries depending on the entry are flushed.
 * The table-type of the @asce identifies the portion of the @vaddr
 * that is used as the invalidation index.
 */
static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
{
	/*
	 * Both operands are passed in address registers; the "cc" and
	 * "memory" clobbers tell the compiler that the condition code and
	 * memory contents may change.
	 */
	asm volatile(
		" idte %0,0,%1"
		: : "a" (asce), "a" (vaddr) : "cc", "memory");
}
1149
1150/**
1151 * gmap_unshadow_page - remove a page from a shadow page table
1152 * @sg: pointer to the shadow guest address space structure
1153 * @raddr: rmap address in the shadow guest address space
1154 *
1155 * Called with the sg->guest_table_lock
1156 */
1157static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
1158{
1159 unsigned long *table;
1160
1161 BUG_ON(!gmap_is_shadow(sg));
1162 table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
1163 if (!table || *table & _PAGE_INVALID)
1164 return;
1165 gmap_call_notifier(gmap: sg, start: raddr, end: raddr + PAGE_SIZE - 1);
1166 ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
1167}
1168
1169/**
1170 * __gmap_unshadow_pgt - remove all entries from a shadow page table
1171 * @sg: pointer to the shadow guest address space structure
1172 * @raddr: rmap address in the shadow guest address space
1173 * @pgt: pointer to the start of a shadow page table
1174 *
1175 * Called with the sg->guest_table_lock
1176 */
1177static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
1178 unsigned long *pgt)
1179{
1180 int i;
1181
1182 BUG_ON(!gmap_is_shadow(sg));
1183 for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE)
1184 pgt[i] = _PAGE_INVALID;
1185}
1186
1187/**
1188 * gmap_unshadow_pgt - remove a shadow page table from a segment entry
1189 * @sg: pointer to the shadow guest address space structure
1190 * @raddr: address in the shadow guest address space
1191 *
1192 * Called with the sg->guest_table_lock
1193 */
1194static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
1195{
1196 unsigned long *ste;
1197 phys_addr_t sto, pgt;
1198 struct ptdesc *ptdesc;
1199
1200 BUG_ON(!gmap_is_shadow(sg));
1201 ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
1202 if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
1203 return;
1204 gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
1205 sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
1206 gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
1207 pgt = *ste & _SEGMENT_ENTRY_ORIGIN;
1208 *ste = _SEGMENT_ENTRY_EMPTY;
1209 __gmap_unshadow_pgt(sg, raddr, __va(pgt));
1210 /* Free page table */
1211 ptdesc = page_ptdesc(phys_to_page(pgt));
1212 page_table_free_pgste(ptdesc);
1213}
1214
1215/**
1216 * __gmap_unshadow_sgt - remove all entries from a shadow segment table
1217 * @sg: pointer to the shadow guest address space structure
1218 * @raddr: rmap address in the shadow guest address space
1219 * @sgt: pointer to the start of a shadow segment table
1220 *
1221 * Called with the sg->guest_table_lock
1222 */
1223static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
1224 unsigned long *sgt)
1225{
1226 struct ptdesc *ptdesc;
1227 phys_addr_t pgt;
1228 int i;
1229
1230 BUG_ON(!gmap_is_shadow(sg));
1231 for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
1232 if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
1233 continue;
1234 pgt = sgt[i] & _REGION_ENTRY_ORIGIN;
1235 sgt[i] = _SEGMENT_ENTRY_EMPTY;
1236 __gmap_unshadow_pgt(sg, raddr, __va(pgt));
1237 /* Free page table */
1238 ptdesc = page_ptdesc(phys_to_page(pgt));
1239 page_table_free_pgste(ptdesc);
1240 }
1241}
1242
1243/**
1244 * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
1245 * @sg: pointer to the shadow guest address space structure
1246 * @raddr: rmap address in the shadow guest address space
1247 *
1248 * Called with the shadow->guest_table_lock
1249 */
1250static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
1251{
1252 unsigned long r3o, *r3e;
1253 phys_addr_t sgt;
1254 struct page *page;
1255
1256 BUG_ON(!gmap_is_shadow(sg));
1257 r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
1258 if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
1259 return;
1260 gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
1261 r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
1262 gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr);
1263 sgt = *r3e & _REGION_ENTRY_ORIGIN;
1264 *r3e = _REGION3_ENTRY_EMPTY;
1265 __gmap_unshadow_sgt(sg, raddr, __va(sgt));
1266 /* Free segment table */
1267 page = phys_to_page(sgt);
1268 __free_pages(page, CRST_ALLOC_ORDER);
1269}
1270
1271/**
1272 * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
1273 * @sg: pointer to the shadow guest address space structure
1274 * @raddr: address in the shadow guest address space
1275 * @r3t: pointer to the start of a shadow region-3 table
1276 *
1277 * Called with the sg->guest_table_lock
1278 */
1279static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
1280 unsigned long *r3t)
1281{
1282 struct page *page;
1283 phys_addr_t sgt;
1284 int i;
1285
1286 BUG_ON(!gmap_is_shadow(sg));
1287 for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
1288 if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
1289 continue;
1290 sgt = r3t[i] & _REGION_ENTRY_ORIGIN;
1291 r3t[i] = _REGION3_ENTRY_EMPTY;
1292 __gmap_unshadow_sgt(sg, raddr, __va(sgt));
1293 /* Free segment table */
1294 page = phys_to_page(sgt);
1295 __free_pages(page, CRST_ALLOC_ORDER);
1296 }
1297}
1298
1299/**
1300 * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
1301 * @sg: pointer to the shadow guest address space structure
1302 * @raddr: rmap address in the shadow guest address space
1303 *
1304 * Called with the sg->guest_table_lock
1305 */
1306static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
1307{
1308 unsigned long r2o, *r2e;
1309 phys_addr_t r3t;
1310 struct page *page;
1311
1312 BUG_ON(!gmap_is_shadow(sg));
1313 r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
1314 if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
1315 return;
1316 gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
1317 r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
1318 gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr);
1319 r3t = *r2e & _REGION_ENTRY_ORIGIN;
1320 *r2e = _REGION2_ENTRY_EMPTY;
1321 __gmap_unshadow_r3t(sg, raddr, __va(r3t));
1322 /* Free region 3 table */
1323 page = phys_to_page(r3t);
1324 __free_pages(page, CRST_ALLOC_ORDER);
1325}
1326
1327/**
1328 * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
1329 * @sg: pointer to the shadow guest address space structure
1330 * @raddr: rmap address in the shadow guest address space
1331 * @r2t: pointer to the start of a shadow region-2 table
1332 *
1333 * Called with the sg->guest_table_lock
1334 */
1335static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
1336 unsigned long *r2t)
1337{
1338 phys_addr_t r3t;
1339 struct page *page;
1340 int i;
1341
1342 BUG_ON(!gmap_is_shadow(sg));
1343 for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
1344 if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
1345 continue;
1346 r3t = r2t[i] & _REGION_ENTRY_ORIGIN;
1347 r2t[i] = _REGION2_ENTRY_EMPTY;
1348 __gmap_unshadow_r3t(sg, raddr, __va(r3t));
1349 /* Free region 3 table */
1350 page = phys_to_page(r3t);
1351 __free_pages(page, CRST_ALLOC_ORDER);
1352 }
1353}
1354
1355/**
1356 * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
1357 * @sg: pointer to the shadow guest address space structure
1358 * @raddr: rmap address in the shadow guest address space
1359 *
1360 * Called with the sg->guest_table_lock
1361 */
1362static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
1363{
1364 unsigned long r1o, *r1e;
1365 struct page *page;
1366 phys_addr_t r2t;
1367
1368 BUG_ON(!gmap_is_shadow(sg));
1369 r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
1370 if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
1371 return;
1372 gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
1373 r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
1374 gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr);
1375 r2t = *r1e & _REGION_ENTRY_ORIGIN;
1376 *r1e = _REGION1_ENTRY_EMPTY;
1377 __gmap_unshadow_r2t(sg, raddr, __va(r2t));
1378 /* Free region 2 table */
1379 page = phys_to_page(r2t);
1380 __free_pages(page, CRST_ALLOC_ORDER);
1381}
1382
1383/**
1384 * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
1385 * @sg: pointer to the shadow guest address space structure
1386 * @raddr: rmap address in the shadow guest address space
1387 * @r1t: pointer to the start of a shadow region-1 table
1388 *
1389 * Called with the shadow->guest_table_lock
1390 */
1391static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
1392 unsigned long *r1t)
1393{
1394 unsigned long asce;
1395 struct page *page;
1396 phys_addr_t r2t;
1397 int i;
1398
1399 BUG_ON(!gmap_is_shadow(sg));
1400 asce = __pa(r1t) | _ASCE_TYPE_REGION1;
1401 for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
1402 if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
1403 continue;
1404 r2t = r1t[i] & _REGION_ENTRY_ORIGIN;
1405 __gmap_unshadow_r2t(sg, raddr, __va(r2t));
1406 /* Clear entry and flush translation r1t -> r2t */
1407 gmap_idte_one(asce, raddr);
1408 r1t[i] = _REGION1_ENTRY_EMPTY;
1409 /* Free region 2 table */
1410 page = phys_to_page(r2t);
1411 __free_pages(page, CRST_ALLOC_ORDER);
1412 }
1413}
1414
1415/**
1416 * gmap_unshadow - remove a shadow page table completely
1417 * @sg: pointer to the shadow guest address space structure
1418 *
1419 * Called with sg->guest_table_lock
1420 */
1421void gmap_unshadow(struct gmap *sg)
1422{
1423 unsigned long *table;
1424
1425 BUG_ON(!gmap_is_shadow(sg));
1426 if (sg->removed)
1427 return;
1428 sg->removed = 1;
1429 gmap_call_notifier(gmap: sg, start: 0, end: -1UL);
1430 gmap_flush_tlb(gmap: sg);
1431 table = __va(sg->asce & _ASCE_ORIGIN);
1432 switch (sg->asce & _ASCE_TYPE_MASK) {
1433 case _ASCE_TYPE_REGION1:
1434 __gmap_unshadow_r1t(sg, raddr: 0, r1t: table);
1435 break;
1436 case _ASCE_TYPE_REGION2:
1437 __gmap_unshadow_r2t(sg, raddr: 0, r2t: table);
1438 break;
1439 case _ASCE_TYPE_REGION3:
1440 __gmap_unshadow_r3t(sg, raddr: 0, r3t: table);
1441 break;
1442 case _ASCE_TYPE_SEGMENT:
1443 __gmap_unshadow_sgt(sg, raddr: 0, sgt: table);
1444 break;
1445 }
1446}
1447EXPORT_SYMBOL(gmap_unshadow);
1448
1449/**
1450 * gmap_shadow_r2t - create an empty shadow region 2 table
1451 * @sg: pointer to the shadow guest address space structure
1452 * @saddr: faulting address in the shadow gmap
1453 * @r2t: parent gmap address of the region 2 table to get shadowed
1454 * @fake: r2t references contiguous guest memory block, not a r2t
1455 *
1456 * The r2t parameter specifies the address of the source table. The
1457 * four pages of the source table are made read-only in the parent gmap
1458 * address space. A write to the source table area @r2t will automatically
1459 * remove the shadow r2 table and all of its descendants.
1460 *
1461 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1462 * shadow table structure is incomplete, -ENOMEM if out of memory and
1463 * -EFAULT if an address in the parent gmap could not be resolved.
1464 *
1465 * Called with sg->mm->mmap_lock in read.
1466 */
1467int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
1468 int fake)
1469{
1470 unsigned long raddr, origin, offset, len;
1471 unsigned long *table;
1472 phys_addr_t s_r2t;
1473 struct page *page;
1474 int rc;
1475
1476 BUG_ON(!gmap_is_shadow(sg));
1477 /* Allocate a shadow region second table */
1478 page = gmap_alloc_crst();
1479 if (!page)
1480 return -ENOMEM;
1481 s_r2t = page_to_phys(page);
1482 /* Install shadow region second table */
1483 spin_lock(lock: &sg->guest_table_lock);
1484 table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
1485 if (!table) {
1486 rc = -EAGAIN; /* Race with unshadow */
1487 goto out_free;
1488 }
1489 if (!(*table & _REGION_ENTRY_INVALID)) {
1490 rc = 0; /* Already established */
1491 goto out_free;
1492 } else if (*table & _REGION_ENTRY_ORIGIN) {
1493 rc = -EAGAIN; /* Race with shadow */
1494 goto out_free;
1495 }
1496 crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY);
1497 /* mark as invalid as long as the parent table is not protected */
1498 *table = s_r2t | _REGION_ENTRY_LENGTH |
1499 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
1500 if (sg->edat_level >= 1)
1501 *table |= (r2t & _REGION_ENTRY_PROTECT);
1502 if (fake) {
1503 /* nothing to protect for fake tables */
1504 *table &= ~_REGION_ENTRY_INVALID;
1505 spin_unlock(lock: &sg->guest_table_lock);
1506 return 0;
1507 }
1508 spin_unlock(lock: &sg->guest_table_lock);
1509 /* Make r2t read-only in parent gmap page table */
1510 raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1;
1511 origin = r2t & _REGION_ENTRY_ORIGIN;
1512 offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1513 len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1514 rc = gmap_protect_rmap(sg, raddr, paddr: origin + offset, len);
1515 spin_lock(lock: &sg->guest_table_lock);
1516 if (!rc) {
1517 table = gmap_table_walk(sg, saddr, 4);
1518 if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t)
1519 rc = -EAGAIN; /* Race with unshadow */
1520 else
1521 *table &= ~_REGION_ENTRY_INVALID;
1522 } else {
1523 gmap_unshadow_r2t(sg, raddr);
1524 }
1525 spin_unlock(lock: &sg->guest_table_lock);
1526 return rc;
1527out_free:
1528 spin_unlock(lock: &sg->guest_table_lock);
1529 __free_pages(page, CRST_ALLOC_ORDER);
1530 return rc;
1531}
1532EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
1533
1534/**
1535 * gmap_shadow_r3t - create a shadow region 3 table
1536 * @sg: pointer to the shadow guest address space structure
1537 * @saddr: faulting address in the shadow gmap
1538 * @r3t: parent gmap address of the region 3 table to get shadowed
1539 * @fake: r3t references contiguous guest memory block, not a r3t
1540 *
1541 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1542 * shadow table structure is incomplete, -ENOMEM if out of memory and
1543 * -EFAULT if an address in the parent gmap could not be resolved.
1544 *
1545 * Called with sg->mm->mmap_lock in read.
1546 */
1547int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
1548 int fake)
1549{
1550 unsigned long raddr, origin, offset, len;
1551 unsigned long *table;
1552 phys_addr_t s_r3t;
1553 struct page *page;
1554 int rc;
1555
1556 BUG_ON(!gmap_is_shadow(sg));
1557 /* Allocate a shadow region second table */
1558 page = gmap_alloc_crst();
1559 if (!page)
1560 return -ENOMEM;
1561 s_r3t = page_to_phys(page);
1562 /* Install shadow region second table */
1563 spin_lock(lock: &sg->guest_table_lock);
1564 table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
1565 if (!table) {
1566 rc = -EAGAIN; /* Race with unshadow */
1567 goto out_free;
1568 }
1569 if (!(*table & _REGION_ENTRY_INVALID)) {
1570 rc = 0; /* Already established */
1571 goto out_free;
1572 } else if (*table & _REGION_ENTRY_ORIGIN) {
1573 rc = -EAGAIN; /* Race with shadow */
1574 goto out_free;
1575 }
1576 crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY);
1577 /* mark as invalid as long as the parent table is not protected */
1578 *table = s_r3t | _REGION_ENTRY_LENGTH |
1579 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
1580 if (sg->edat_level >= 1)
1581 *table |= (r3t & _REGION_ENTRY_PROTECT);
1582 if (fake) {
1583 /* nothing to protect for fake tables */
1584 *table &= ~_REGION_ENTRY_INVALID;
1585 spin_unlock(lock: &sg->guest_table_lock);
1586 return 0;
1587 }
1588 spin_unlock(lock: &sg->guest_table_lock);
1589 /* Make r3t read-only in parent gmap page table */
1590 raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2;
1591 origin = r3t & _REGION_ENTRY_ORIGIN;
1592 offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1593 len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1594 rc = gmap_protect_rmap(sg, raddr, paddr: origin + offset, len);
1595 spin_lock(lock: &sg->guest_table_lock);
1596 if (!rc) {
1597 table = gmap_table_walk(sg, saddr, 3);
1598 if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t)
1599 rc = -EAGAIN; /* Race with unshadow */
1600 else
1601 *table &= ~_REGION_ENTRY_INVALID;
1602 } else {
1603 gmap_unshadow_r3t(sg, raddr);
1604 }
1605 spin_unlock(lock: &sg->guest_table_lock);
1606 return rc;
1607out_free:
1608 spin_unlock(lock: &sg->guest_table_lock);
1609 __free_pages(page, CRST_ALLOC_ORDER);
1610 return rc;
1611}
1612EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
1613
1614/**
1615 * gmap_shadow_sgt - create a shadow segment table
1616 * @sg: pointer to the shadow guest address space structure
1617 * @saddr: faulting address in the shadow gmap
1618 * @sgt: parent gmap address of the segment table to get shadowed
1619 * @fake: sgt references contiguous guest memory block, not a sgt
1620 *
1621 * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
1622 * shadow table structure is incomplete, -ENOMEM if out of memory and
1623 * -EFAULT if an address in the parent gmap could not be resolved.
1624 *
1625 * Called with sg->mm->mmap_lock in read.
1626 */
1627int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
1628 int fake)
1629{
1630 unsigned long raddr, origin, offset, len;
1631 unsigned long *table;
1632 phys_addr_t s_sgt;
1633 struct page *page;
1634 int rc;
1635
1636 BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
1637 /* Allocate a shadow segment table */
1638 page = gmap_alloc_crst();
1639 if (!page)
1640 return -ENOMEM;
1641 s_sgt = page_to_phys(page);
1642 /* Install shadow region second table */
1643 spin_lock(lock: &sg->guest_table_lock);
1644 table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
1645 if (!table) {
1646 rc = -EAGAIN; /* Race with unshadow */
1647 goto out_free;
1648 }
1649 if (!(*table & _REGION_ENTRY_INVALID)) {
1650 rc = 0; /* Already established */
1651 goto out_free;
1652 } else if (*table & _REGION_ENTRY_ORIGIN) {
1653 rc = -EAGAIN; /* Race with shadow */
1654 goto out_free;
1655 }
1656 crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY);
1657 /* mark as invalid as long as the parent table is not protected */
1658 *table = s_sgt | _REGION_ENTRY_LENGTH |
1659 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
1660 if (sg->edat_level >= 1)
1661 *table |= sgt & _REGION_ENTRY_PROTECT;
1662 if (fake) {
1663 /* nothing to protect for fake tables */
1664 *table &= ~_REGION_ENTRY_INVALID;
1665 spin_unlock(lock: &sg->guest_table_lock);
1666 return 0;
1667 }
1668 spin_unlock(lock: &sg->guest_table_lock);
1669 /* Make sgt read-only in parent gmap page table */
1670 raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
1671 origin = sgt & _REGION_ENTRY_ORIGIN;
1672 offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1673 len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1674 rc = gmap_protect_rmap(sg, raddr, paddr: origin + offset, len);
1675 spin_lock(lock: &sg->guest_table_lock);
1676 if (!rc) {
1677 table = gmap_table_walk(sg, saddr, 2);
1678 if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt)
1679 rc = -EAGAIN; /* Race with unshadow */
1680 else
1681 *table &= ~_REGION_ENTRY_INVALID;
1682 } else {
1683 gmap_unshadow_sgt(sg, raddr);
1684 }
1685 spin_unlock(lock: &sg->guest_table_lock);
1686 return rc;
1687out_free:
1688 spin_unlock(lock: &sg->guest_table_lock);
1689 __free_pages(page, CRST_ALLOC_ORDER);
1690 return rc;
1691}
1692EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
1693
1694static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr)
1695{
1696 unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc));
1697
1698 pgstes += _PAGE_ENTRIES;
1699
1700 pgstes[0] &= ~PGSTE_ST2_MASK;
1701 pgstes[1] &= ~PGSTE_ST2_MASK;
1702 pgstes[2] &= ~PGSTE_ST2_MASK;
1703 pgstes[3] &= ~PGSTE_ST2_MASK;
1704
1705 pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK;
1706 pgstes[1] |= pgt_addr & PGSTE_ST2_MASK;
1707 pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK;
1708 pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK;
1709}
1710
1711/**
1712 * gmap_shadow_pgt - instantiate a shadow page table
1713 * @sg: pointer to the shadow guest address space structure
1714 * @saddr: faulting address in the shadow gmap
1715 * @pgt: parent gmap address of the page table to get shadowed
1716 * @fake: pgt references contiguous guest memory block, not a pgtable
1717 *
1718 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1719 * shadow table structure is incomplete, -ENOMEM if out of memory,
1720 * -EFAULT if an address in the parent gmap could not be resolved and
1721 *
1722 * Called with gmap->mm->mmap_lock in read
1723 */
1724int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
1725 int fake)
1726{
1727 unsigned long raddr, origin;
1728 unsigned long *table;
1729 struct ptdesc *ptdesc;
1730 phys_addr_t s_pgt;
1731 int rc;
1732
1733 BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
1734 /* Allocate a shadow page table */
1735 ptdesc = page_table_alloc_pgste(sg->mm);
1736 if (!ptdesc)
1737 return -ENOMEM;
1738 origin = pgt & _SEGMENT_ENTRY_ORIGIN;
1739 if (fake)
1740 origin |= GMAP_SHADOW_FAKE_TABLE;
1741 gmap_pgste_set_pgt_addr(ptdesc, pgt_addr: origin);
1742 s_pgt = page_to_phys(ptdesc_page(ptdesc));
1743 /* Install shadow page table */
1744 spin_lock(lock: &sg->guest_table_lock);
1745 table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
1746 if (!table) {
1747 rc = -EAGAIN; /* Race with unshadow */
1748 goto out_free;
1749 }
1750 if (!(*table & _SEGMENT_ENTRY_INVALID)) {
1751 rc = 0; /* Already established */
1752 goto out_free;
1753 } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
1754 rc = -EAGAIN; /* Race with shadow */
1755 goto out_free;
1756 }
1757 /* mark as invalid as long as the parent table is not protected */
1758 *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
1759 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
1760 if (fake) {
1761 /* nothing to protect for fake tables */
1762 *table &= ~_SEGMENT_ENTRY_INVALID;
1763 spin_unlock(lock: &sg->guest_table_lock);
1764 return 0;
1765 }
1766 spin_unlock(lock: &sg->guest_table_lock);
1767 /* Make pgt read-only in parent gmap page table (not the pgste) */
1768 raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
1769 origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
1770 rc = gmap_protect_rmap(sg, raddr, paddr: origin, PAGE_SIZE);
1771 spin_lock(lock: &sg->guest_table_lock);
1772 if (!rc) {
1773 table = gmap_table_walk(sg, saddr, 1);
1774 if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt)
1775 rc = -EAGAIN; /* Race with unshadow */
1776 else
1777 *table &= ~_SEGMENT_ENTRY_INVALID;
1778 } else {
1779 gmap_unshadow_pgt(sg, raddr);
1780 }
1781 spin_unlock(lock: &sg->guest_table_lock);
1782 return rc;
1783out_free:
1784 spin_unlock(lock: &sg->guest_table_lock);
1785 page_table_free_pgste(ptdesc);
1786 return rc;
1787
1788}
1789EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
1790
1791/**
1792 * gmap_shadow_page - create a shadow page mapping
1793 * @sg: pointer to the shadow guest address space structure
1794 * @saddr: faulting address in the shadow gmap
1795 * @pte: pte in parent gmap address space to get shadowed
1796 *
1797 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1798 * shadow table structure is incomplete, -ENOMEM if out of memory and
1799 * -EFAULT if an address in the parent gmap could not be resolved.
1800 *
1801 * Called with sg->mm->mmap_lock in read.
1802 */
1803int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
1804{
1805 struct gmap *parent;
1806 struct gmap_rmap *rmap;
1807 unsigned long vmaddr, paddr;
1808 spinlock_t *ptl;
1809 pte_t *sptep, *tptep;
1810 int prot;
1811 int rc;
1812
1813 BUG_ON(!gmap_is_shadow(sg));
1814 parent = sg->parent;
1815 prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
1816
1817 rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
1818 if (!rmap)
1819 return -ENOMEM;
1820 rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
1821
1822 while (1) {
1823 paddr = pte_val(pte) & PAGE_MASK;
1824 vmaddr = __gmap_translate(parent, paddr);
1825 if (IS_ERR_VALUE(vmaddr)) {
1826 rc = vmaddr;
1827 break;
1828 }
1829 rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
1830 if (rc)
1831 break;
1832 rc = -EAGAIN;
1833 sptep = gmap_pte_op_walk(gmap: parent, gaddr: paddr, ptl: &ptl);
1834 if (sptep) {
1835 spin_lock(lock: &sg->guest_table_lock);
1836 /* Get page table pointer */
1837 tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
1838 if (!tptep) {
1839 spin_unlock(lock: &sg->guest_table_lock);
1840 gmap_pte_op_end(ptep: sptep, ptl);
1841 radix_tree_preload_end();
1842 break;
1843 }
1844 rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
1845 if (rc > 0) {
1846 /* Success and a new mapping */
1847 gmap_insert_rmap(sg, vmaddr, rmap);
1848 rmap = NULL;
1849 rc = 0;
1850 }
1851 gmap_pte_op_end(ptep: sptep, ptl);
1852 spin_unlock(lock: &sg->guest_table_lock);
1853 }
1854 radix_tree_preload_end();
1855 if (!rc)
1856 break;
1857 rc = gmap_pte_op_fixup(gmap: parent, gaddr: paddr, vmaddr, prot);
1858 if (rc)
1859 break;
1860 }
1861 kfree(objp: rmap);
1862 return rc;
1863}
1864EXPORT_SYMBOL_GPL(gmap_shadow_page);
1865
1866/*
1867 * gmap_shadow_notify - handle notifications for shadow gmap
1868 *
1869 * Called with sg->parent->shadow_lock.
1870 */
1871static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
1872 unsigned long gaddr)
1873{
1874 struct gmap_rmap *rmap, *rnext, *head;
1875 unsigned long start, end, bits, raddr;
1876
1877 BUG_ON(!gmap_is_shadow(sg));
1878
1879 spin_lock(lock: &sg->guest_table_lock);
1880 if (sg->removed) {
1881 spin_unlock(lock: &sg->guest_table_lock);
1882 return;
1883 }
1884 /* Check for top level table */
1885 start = sg->orig_asce & _ASCE_ORIGIN;
1886 end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE;
1887 if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
1888 gaddr < end) {
1889 /* The complete shadow table has to go */
1890 gmap_unshadow(sg);
1891 spin_unlock(lock: &sg->guest_table_lock);
1892 list_del(entry: &sg->list);
1893 gmap_put(sg);
1894 return;
1895 }
1896 /* Remove the page table tree from on specific entry */
1897 head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
1898 gmap_for_each_rmap_safe(rmap, rnext, head) {
1899 bits = rmap->raddr & _SHADOW_RMAP_MASK;
1900 raddr = rmap->raddr ^ bits;
1901 switch (bits) {
1902 case _SHADOW_RMAP_REGION1:
1903 gmap_unshadow_r2t(sg, raddr);
1904 break;
1905 case _SHADOW_RMAP_REGION2:
1906 gmap_unshadow_r3t(sg, raddr);
1907 break;
1908 case _SHADOW_RMAP_REGION3:
1909 gmap_unshadow_sgt(sg, raddr);
1910 break;
1911 case _SHADOW_RMAP_SEGMENT:
1912 gmap_unshadow_pgt(sg, raddr);
1913 break;
1914 case _SHADOW_RMAP_PGTABLE:
1915 gmap_unshadow_page(sg, raddr);
1916 break;
1917 }
1918 kfree(objp: rmap);
1919 }
1920 spin_unlock(lock: &sg->guest_table_lock);
1921}
1922
1923/**
1924 * ptep_notify - call all invalidation callbacks for a specific pte.
1925 * @mm: pointer to the process mm_struct
1926 * @vmaddr: virtual address in the process address space
1927 * @pte: pointer to the page table entry
1928 * @bits: bits from the pgste that caused the notify call
1929 *
1930 * This function is assumed to be called with the page table lock held
1931 * for the pte to notify.
1932 */
1933void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
1934 pte_t *pte, unsigned long bits)
1935{
1936 unsigned long offset, gaddr = 0;
1937 struct gmap *gmap, *sg, *next;
1938
1939 offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
1940 offset = offset * (PAGE_SIZE / sizeof(pte_t));
1941 rcu_read_lock();
1942 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
1943 spin_lock(lock: &gmap->guest_table_lock);
1944 gaddr = host_to_guest_lookup(gmap, vmaddr) + offset;
1945 spin_unlock(lock: &gmap->guest_table_lock);
1946 if (!IS_GADDR_VALID(gaddr))
1947 continue;
1948
1949 if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
1950 spin_lock(lock: &gmap->shadow_lock);
1951 list_for_each_entry_safe(sg, next,
1952 &gmap->children, list)
1953 gmap_shadow_notify(sg, vmaddr, gaddr);
1954 spin_unlock(lock: &gmap->shadow_lock);
1955 }
1956 if (bits & PGSTE_IN_BIT)
1957 gmap_call_notifier(gmap, start: gaddr, end: gaddr + PAGE_SIZE - 1);
1958 }
1959 rcu_read_unlock();
1960}
1961EXPORT_SYMBOL_GPL(ptep_notify);
1962
1963static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
1964 unsigned long gaddr)
1965{
1966 set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
1967 gmap_call_notifier(gmap, start: gaddr, end: gaddr + HPAGE_SIZE - 1);
1968}
1969
1970/**
1971 * gmap_pmdp_xchg - exchange a gmap pmd with another
1972 * @gmap: pointer to the guest address space structure
1973 * @pmdp: pointer to the pmd entry
1974 * @new: replacement entry
1975 * @gaddr: the affected guest address
1976 *
1977 * This function is assumed to be called with the guest_table_lock
1978 * held.
1979 */
1980static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
1981 unsigned long gaddr)
1982{
1983 gaddr &= HPAGE_MASK;
1984 pmdp_notify_gmap(gmap, pmdp, gaddr);
1985 new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
1986 if (machine_has_tlb_guest())
1987 __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
1988 IDTE_GLOBAL);
1989 else
1990 __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
1991 set_pmd(pmdp, pmd: new);
1992}
1993
1994static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
1995 int purge)
1996{
1997 pmd_t *pmdp;
1998 struct gmap *gmap;
1999 unsigned long gaddr;
2000
2001 rcu_read_lock();
2002 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2003 spin_lock(lock: &gmap->guest_table_lock);
2004 pmdp = host_to_guest_pmd_delete(gmap, vmaddr, gaddr: &gaddr);
2005 if (pmdp) {
2006 pmdp_notify_gmap(gmap, pmdp, gaddr);
2007 WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2008 _SEGMENT_ENTRY_GMAP_UC |
2009 _SEGMENT_ENTRY));
2010 if (purge)
2011 __pmdp_cspg(pmdp);
2012 set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
2013 }
2014 spin_unlock(lock: &gmap->guest_table_lock);
2015 }
2016 rcu_read_unlock();
2017}
2018
/**
 * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
 *			  flushing
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 *
 * Thin wrapper around gmap_pmdp_clear() with purging disabled.
 */
void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
{
	const int purge = 0;

	gmap_pmdp_clear(mm, vmaddr, purge);
}
EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);
2030
2031/**
2032 * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
2033 * @mm: pointer to the process mm_struct
2034 * @vmaddr: virtual address in the process address space
2035 */
2036void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
2037{
2038 unsigned long gaddr;
2039 struct gmap *gmap;
2040 pmd_t *pmdp;
2041
2042 rcu_read_lock();
2043 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2044 spin_lock(lock: &gmap->guest_table_lock);
2045 pmdp = host_to_guest_pmd_delete(gmap, vmaddr, gaddr: &gaddr);
2046 if (pmdp) {
2047 pmdp_notify_gmap(gmap, pmdp, gaddr);
2048 WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2049 _SEGMENT_ENTRY_GMAP_UC |
2050 _SEGMENT_ENTRY));
2051 if (machine_has_tlb_guest())
2052 __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
2053 gmap->asce, IDTE_LOCAL);
2054 else
2055 __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
2056 *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
2057 }
2058 spin_unlock(lock: &gmap->guest_table_lock);
2059 }
2060 rcu_read_unlock();
2061}
2062EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
2063
2064/**
2065 * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry
2066 * @mm: pointer to the process mm_struct
2067 * @vmaddr: virtual address in the process address space
2068 */
2069void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
2070{
2071 unsigned long gaddr;
2072 struct gmap *gmap;
2073 pmd_t *pmdp;
2074
2075 rcu_read_lock();
2076 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2077 spin_lock(lock: &gmap->guest_table_lock);
2078 pmdp = host_to_guest_pmd_delete(gmap, vmaddr, gaddr: &gaddr);
2079 if (pmdp) {
2080 pmdp_notify_gmap(gmap, pmdp, gaddr);
2081 WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2082 _SEGMENT_ENTRY_GMAP_UC |
2083 _SEGMENT_ENTRY));
2084 if (machine_has_tlb_guest())
2085 __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
2086 gmap->asce, IDTE_GLOBAL);
2087 else
2088 __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
2089 *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
2090 }
2091 spin_unlock(lock: &gmap->guest_table_lock);
2092 }
2093 rcu_read_unlock();
2094}
2095EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);
2096
2097/**
2098 * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
2099 * @gmap: pointer to guest address space
2100 * @pmdp: pointer to the pmd to be tested
2101 * @gaddr: virtual address in the guest address space
2102 *
2103 * This function is assumed to be called with the guest_table_lock
2104 * held.
2105 */
2106static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
2107 unsigned long gaddr)
2108{
2109 if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
2110 return false;
2111
2112 /* Already protected memory, which did not change is clean */
2113 if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
2114 !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
2115 return false;
2116
2117 /* Clear UC indication and reset protection */
2118 set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC)));
2119 gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, bits: 0);
2120 return true;
2121}
2122
2123/**
2124 * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
2125 * @gmap: pointer to guest address space
2126 * @bitmap: dirty bitmap for this pmd
2127 * @gaddr: virtual address in the guest address space
2128 * @vmaddr: virtual address in the host address space
2129 *
2130 * This function is assumed to be called with the guest_table_lock
2131 * held.
2132 */
2133void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
2134 unsigned long gaddr, unsigned long vmaddr)
2135{
2136 int i;
2137 pmd_t *pmdp;
2138 pte_t *ptep;
2139 spinlock_t *ptl;
2140
2141 pmdp = gmap_pmd_op_walk(gmap, gaddr);
2142 if (!pmdp)
2143 return;
2144
2145 if (pmd_leaf(pte: *pmdp)) {
2146 if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
2147 bitmap_fill(bitmap, _PAGE_ENTRIES);
2148 } else {
2149 for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
2150 ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
2151 if (!ptep)
2152 continue;
2153 if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
2154 set_bit(i, bitmap);
2155 pte_unmap_unlock(ptep, ptl);
2156 }
2157 }
2158 gmap_pmd_op_end(gmap, pmdp);
2159}
2160EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
2161
2162#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2163static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
2164 unsigned long end, struct mm_walk *walk)
2165{
2166 struct vm_area_struct *vma = walk->vma;
2167
2168 split_huge_pmd(vma, pmd, addr);
2169 return 0;
2170}
2171
2172static const struct mm_walk_ops thp_split_walk_ops = {
2173 .pmd_entry = thp_split_walk_pmd_entry,
2174 .walk_lock = PGWALK_WRLOCK_VERIFY,
2175};
2176
2177static inline void thp_split_mm(struct mm_struct *mm)
2178{
2179 struct vm_area_struct *vma;
2180 VMA_ITERATOR(vmi, mm, 0);
2181
2182 for_each_vma(vmi, vma) {
2183 vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE);
2184 walk_page_vma(vma, ops: &thp_split_walk_ops, NULL);
2185 }
2186 mm->def_flags |= VM_NOHUGEPAGE;
2187}
2188#else
static inline void thp_split_mm(struct mm_struct *mm)
{
	/* CONFIG_TRANSPARENT_HUGEPAGE is not set: nothing to do */
}
2192#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2193
2194/*
2195 * switch on pgstes for its userspace process (for kvm)
2196 */
2197int s390_enable_sie(void)
2198{
2199 struct mm_struct *mm = current->mm;
2200
2201 /* Do we have pgstes? if yes, we are done */
2202 if (mm_has_pgste(mm))
2203 return 0;
2204 mmap_write_lock(mm);
2205 mm->context.has_pgste = 1;
2206 /* split thp mappings and disable thp for future mappings */
2207 thp_split_mm(mm);
2208 mmap_write_unlock(mm);
2209 return 0;
2210}
2211EXPORT_SYMBOL_GPL(s390_enable_sie);
2212
2213/*
2214 * Enable storage key handling from now on and initialize the storage
2215 * keys with the default key.
2216 */
2217static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
2218 unsigned long next, struct mm_walk *walk)
2219{
2220 /* Clear storage key */
2221 ptep_zap_key(walk->mm, addr, pte);
2222 return 0;
2223}
2224
2225/*
2226 * Give a chance to schedule after setting a key to 256 pages.
2227 * We only hold the mm lock, which is a rwsem and the kvm srcu.
2228 * Both can sleep.
2229 */
2230static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr,
2231 unsigned long next, struct mm_walk *walk)
2232{
2233 cond_resched();
2234 return 0;
2235}
2236
2237static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
2238 unsigned long hmask, unsigned long next,
2239 struct mm_walk *walk)
2240{
2241 pmd_t *pmd = (pmd_t *)pte;
2242 unsigned long start, end;
2243 struct folio *folio = page_folio(pmd_page(*pmd));
2244
2245 /*
2246 * The write check makes sure we do not set a key on shared
2247 * memory. This is needed as the walker does not differentiate
2248 * between actual guest memory and the process executable or
2249 * shared libraries.
2250 */
2251 if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID ||
2252 !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE))
2253 return 0;
2254
2255 start = pmd_val(pmd: *pmd) & HPAGE_MASK;
2256 end = start + HPAGE_SIZE;
2257 __storage_key_init_range(start, end);
2258 set_bit(nr: PG_arch_1, addr: &folio->flags.f);
2259 cond_resched();
2260 return 0;
2261}
2262
2263static const struct mm_walk_ops enable_skey_walk_ops = {
2264 .hugetlb_entry = __s390_enable_skey_hugetlb,
2265 .pte_entry = __s390_enable_skey_pte,
2266 .pmd_entry = __s390_enable_skey_pmd,
2267 .walk_lock = PGWALK_WRLOCK,
2268};
2269
2270int s390_enable_skey(void)
2271{
2272 struct mm_struct *mm = current->mm;
2273 int rc = 0;
2274
2275 mmap_write_lock(mm);
2276 if (mm_uses_skeys(mm))
2277 goto out_up;
2278
2279 mm->context.uses_skeys = 1;
2280 rc = gmap_helper_disable_cow_sharing();
2281 if (rc) {
2282 mm->context.uses_skeys = 0;
2283 goto out_up;
2284 }
2285 walk_page_range(mm, start: 0, TASK_SIZE, ops: &enable_skey_walk_ops, NULL);
2286
2287out_up:
2288 mmap_write_unlock(mm);
2289 return rc;
2290}
2291EXPORT_SYMBOL_GPL(s390_enable_skey);
2292
2293/*
2294 * Reset CMMA state, make all pages stable again.
2295 */
2296static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
2297 unsigned long next, struct mm_walk *walk)
2298{
2299 ptep_zap_unused(walk->mm, addr, pte, 1);
2300 return 0;
2301}
2302
2303static const struct mm_walk_ops reset_cmma_walk_ops = {
2304 .pte_entry = __s390_reset_cmma,
2305 .walk_lock = PGWALK_WRLOCK,
2306};
2307
2308void s390_reset_cmma(struct mm_struct *mm)
2309{
2310 mmap_write_lock(mm);
2311 walk_page_range(mm, start: 0, TASK_SIZE, ops: &reset_cmma_walk_ops, NULL);
2312 mmap_write_unlock(mm);
2313}
2314EXPORT_SYMBOL_GPL(s390_reset_cmma);
2315
/* Maximum number of pfns collected by a single gather walk */
#define GATHER_GET_PAGES 32

/*
 * State shared between __s390_uv_destroy_range() and the s390_gather_pages()
 * page walk callback.
 */
struct reset_walk_state {
	/* address at which the next walk has to start */
	unsigned long next;
	/* number of valid entries in pfns[] */
	unsigned long count;
	/* pfns gathered so far; each has an extra page reference */
	unsigned long pfns[GATHER_GET_PAGES];
};
2323
2324static int s390_gather_pages(pte_t *ptep, unsigned long addr,
2325 unsigned long next, struct mm_walk *walk)
2326{
2327 struct reset_walk_state *p = walk->private;
2328 pte_t pte = READ_ONCE(*ptep);
2329
2330 if (pte_present(a: pte)) {
2331 /* we have a reference from the mapping, take an extra one */
2332 get_page(phys_to_page(pte_val(pte)));
2333 p->pfns[p->count] = phys_to_pfn(pte_val(pte));
2334 p->next = next;
2335 p->count++;
2336 }
2337 return p->count >= GATHER_GET_PAGES;
2338}
2339
2340static const struct mm_walk_ops gather_pages_ops = {
2341 .pte_entry = s390_gather_pages,
2342 .walk_lock = PGWALK_RDLOCK,
2343};
2344
/*
 * Call the Destroy secure page UVC on each page in the given array of PFNs.
 * The caller must hold an extra reference on every page; that reference is
 * dropped here once the page has been processed.
 */
void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
{
	unsigned long idx;

	for (idx = 0; idx < count; idx++) {
		struct folio *folio = pfn_folio(pfns[idx]);

		/* the extra reference is still held at this point */
		uv_destroy_folio(folio);
		/* now drop the extra reference */
		folio_put(folio);
		cond_resched();
	}
}
EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns);
2364
2365/**
2366 * __s390_uv_destroy_range - Call the destroy secure page UVC on each page
2367 * in the given range of the given address space.
2368 * @mm: the mm to operate on
2369 * @start: the start of the range
2370 * @end: the end of the range
2371 * @interruptible: if not 0, stop when a fatal signal is received
2372 *
2373 * Walk the given range of the given address space and call the destroy
2374 * secure page UVC on each page. Optionally exit early if a fatal signal is
2375 * pending.
2376 *
2377 * Return: 0 on success, -EINTR if the function stopped before completing
2378 */
2379int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
2380 unsigned long end, bool interruptible)
2381{
2382 struct reset_walk_state state = { .next = start };
2383 int r = 1;
2384
2385 while (r > 0) {
2386 state.count = 0;
2387 mmap_read_lock(mm);
2388 r = walk_page_range(mm, start: state.next, end, ops: &gather_pages_ops, private: &state);
2389 mmap_read_unlock(mm);
2390 cond_resched();
2391 s390_uv_destroy_pfns(state.count, state.pfns);
2392 if (interruptible && fatal_signal_pending(current))
2393 return -EINTR;
2394 }
2395 return 0;
2396}
2397EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
2398
2399/**
2400 * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
2401 * @gmap: the gmap whose ASCE needs to be replaced
2402 *
2403 * If the ASCE is a SEGMENT type then this function will return -EINVAL,
2404 * otherwise the pointers in the host_to_guest radix tree will keep pointing
2405 * to the wrong pages, causing use-after-free and memory corruption.
2406 * If the allocation of the new top level page table fails, the ASCE is not
2407 * replaced.
2408 * In any case, the old ASCE is always removed from the gmap CRST list.
2409 * Therefore the caller has to make sure to save a pointer to it
2410 * beforehand, unless a leak is actually intended.
2411 */
2412int s390_replace_asce(struct gmap *gmap)
2413{
2414 unsigned long asce;
2415 struct page *page;
2416 void *table;
2417
2418 /* Replacing segment type ASCEs would cause serious issues */
2419 if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
2420 return -EINVAL;
2421
2422 page = gmap_alloc_crst();
2423 if (!page)
2424 return -ENOMEM;
2425 table = page_to_virt(page);
2426 memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
2427
2428 /* Set new table origin while preserving existing ASCE control bits */
2429 asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
2430 WRITE_ONCE(gmap->asce, asce);
2431 WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
2432 WRITE_ONCE(gmap->table, table);
2433
2434 return 0;
2435}
2436EXPORT_SYMBOL_GPL(s390_replace_asce);
2437

/* source code of linux/arch/s390/mm/gmap.c */