// SPDX-License-Identifier: GPL-2.0-only
/*
 * kexec_handover.c - kexec handover metadata processing
 * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
 * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
 * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
 * Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com>
 */

#define pr_fmt(fmt) "KHO: " fmt

#include <linux/cleanup.h>
#include <linux/cma.h>
#include <linux/kmemleak.h>
#include <linux/count_zeros.h>
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/page-isolation.h>
#include <linux/unaligned.h>
#include <linux/vmalloc.h>

#include <asm/early_ioremap.h>

/*
 * KHO is tightly coupled with mm init and needs access to some of mm
 * internal APIs.
 */
#include "../../mm/internal.h"
#include "../kexec_internal.h"
#include "kexec_handover_internal.h"

#define KHO_FDT_COMPATIBLE "kho-v1"
#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
#define PROP_SUB_FDT "fdt"

#define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */

/*
 * KHO uses page->private, which is an unsigned long, to store page metadata.
 * Use it to store both the magic and the order.
 */
union kho_page_info {
	unsigned long page_private;
	struct {
		unsigned int order;
		unsigned int magic;
	};
};

static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private));

static bool kho_enable __ro_after_init = IS_ENABLED(CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT);

bool kho_is_enabled(void)
{
	return kho_enable;
}
EXPORT_SYMBOL_GPL(kho_is_enabled);

static int __init kho_parse_enable(char *p)
{
	return kstrtobool(p, &kho_enable);
}
early_param("kho", kho_parse_enable);

/*
 * Keep track of memory that is to be preserved across KHO.
 *
 * The serializing side uses two levels of xarrays to manage chunks of per-order
 * PAGE_SIZE byte bitmaps. For instance, if PAGE_SIZE = 4096, the entire 1G
 * order of an 8TB system would fit inside a single 4096 byte bitmap. For
 * order 0 allocations each bitmap will cover 128M of address space. Thus, for
 * 16G of memory at most 512K of bitmap memory will be needed for order 0.
 *
 * This approach is fully incremental: as the serialization progresses, folios
 * can continue to be aggregated to the tracker. The final step, immediately
 * prior to kexec, serializes the xarray information into a linked list for the
 * successor kernel to parse.
 */
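
/*
 * A quick sanity check of the numbers above (illustrative, assuming 4 KiB
 * pages): PRESERVE_BITS = PAGE_SIZE * 8 = 32768 bits per bitmap page. At
 * order 0 each bit covers one 4 KiB page, so one bitmap spans
 * 32768 * 4 KiB = 128 MiB; 16 GiB of memory therefore needs at most
 * 16 GiB / 128 MiB = 128 bitmap pages = 512 KiB of bitmap memory. At the
 * 1 GiB order each bit covers 1 GiB, so a single bitmap spans 32 TiB,
 * more than enough for an 8 TiB machine.
 */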

#define PRESERVE_BITS (PAGE_SIZE * 8)

struct kho_mem_phys_bits {
	DECLARE_BITMAP(preserve, PRESERVE_BITS);
};

static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE);

struct kho_mem_phys {
	/*
	 * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
	 * to order.
	 */
	struct xarray phys_bits;
};

struct kho_mem_track {
	/* Points to kho_mem_phys, each order gets its own bitmap tree */
	struct xarray orders;
};

struct khoser_mem_chunk;

struct kho_out {
	void *fdt;
	bool finalized;
	struct mutex lock; /* protects KHO FDT finalization */

	struct kho_mem_track track;
	struct kho_debugfs dbg;
};

static struct kho_out kho_out = {
	.lock = __MUTEX_INITIALIZER(kho_out.lock),
	.track = {
		.orders = XARRAY_INIT(kho_out.track.orders, 0),
	},
	.finalized = false,
};

static void *xa_load_or_alloc(struct xarray *xa, unsigned long index)
{
	void *res = xa_load(xa, index);

	if (res)
		return res;

	void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL);

	if (!elm)
		return ERR_PTR(-ENOMEM);

	if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE)))
		return ERR_PTR(-EINVAL);

	res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
	if (xa_is_err(res))
		return ERR_PTR(xa_err(res));
	else if (res)
		return res;

	return no_free_ptr(elm);
}

static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn,
				   unsigned int order)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa;
	const unsigned long pfn_high = pfn >> order;

	physxa = xa_load(&track->orders, order);
	if (WARN_ON_ONCE(!physxa))
		return;

	bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
	if (WARN_ON_ONCE(!bits))
		return;

	clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
}

static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
			     unsigned long end_pfn)
{
	unsigned int order;

	while (pfn < end_pfn) {
		order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));

		__kho_unpreserve_order(track, pfn, order);

		pfn += 1 << order;
	}
}

static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
				unsigned int order)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa, *new_physxa;
	const unsigned long pfn_high = pfn >> order;

	might_sleep();
	physxa = xa_load(&track->orders, order);
	if (!physxa) {
		int err;

		new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
		if (!new_physxa)
			return -ENOMEM;

		xa_init(&new_physxa->phys_bits);
		physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
				    GFP_KERNEL);

		err = xa_err(physxa);
		if (err || physxa) {
			xa_destroy(&new_physxa->phys_bits);
			kfree(new_physxa);

			if (err)
				return err;
		} else {
			physxa = new_physxa;
		}
	}

	bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
	if (IS_ERR(bits))
		return PTR_ERR(bits);

	set_bit(pfn_high % PRESERVE_BITS, bits->preserve);

	return 0;
}

static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
{
	struct page *page = pfn_to_online_page(PHYS_PFN(phys));
	unsigned int nr_pages, ref_cnt;
	union kho_page_info info;

	if (!page)
		return NULL;

	info.page_private = page->private;
	/*
	 * deserialize_bitmap() only sets the magic on the head page. This magic
	 * check also implicitly makes sure phys is order-aligned since for
	 * non-order-aligned phys addresses, magic will never be set.
	 */
	if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC || info.order > MAX_PAGE_ORDER))
		return NULL;
	nr_pages = (1 << info.order);

	/* Clear private to make sure later restores on this page error out. */
	page->private = 0;
	/* Head page gets refcount of 1. */
	set_page_count(page, 1);

	/*
	 * For higher order folios, tail pages get a page count of zero.
	 * For physically contiguous order-0 pages every page gets a page
	 * count of 1.
	 */
	ref_cnt = is_folio ? 0 : 1;
	for (unsigned int i = 1; i < nr_pages; i++)
		set_page_count(page + i, ref_cnt);

	if (is_folio && info.order)
		prep_compound_page(page, info.order);

	/* Always mark the head page's codetag as empty to avoid accounting mismatch */
	clear_page_tag_ref(page);
	if (!is_folio) {
		/* Also do that for the non-compound tail pages */
		for (unsigned int i = 1; i < nr_pages; i++)
			clear_page_tag_ref(page + i);
	}

	adjust_managed_page_count(page, nr_pages);
	return page;
}

/**
 * kho_restore_folio - recreates the folio from the preserved memory.
 * @phys: physical address of the folio.
 *
 * Return: pointer to the struct folio on success, NULL on failure.
 */
struct folio *kho_restore_folio(phys_addr_t phys)
{
	struct page *page = kho_restore_page(phys, true);

	return page ? page_folio(page) : NULL;
}
EXPORT_SYMBOL_GPL(kho_restore_folio);

/**
 * kho_restore_pages - restore a list of contiguous order 0 pages.
 * @phys: physical address of the first page.
 * @nr_pages: number of pages.
 *
 * Restore a contiguous list of order 0 pages that was preserved with
 * kho_preserve_pages().
 *
 * Return: pointer to the first struct page on success, NULL on failure.
 */
struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages)
{
	const unsigned long start_pfn = PHYS_PFN(phys);
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn = start_pfn;

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
		struct page *page = kho_restore_page(PFN_PHYS(pfn), false);

		if (!page)
			return NULL;
		pfn += 1 << order;
	}

	return pfn_to_page(start_pfn);
}
EXPORT_SYMBOL_GPL(kho_restore_pages);

/*
 * Serialize and deserialize struct kho_mem_phys across kexec
 *
 * Record all the bitmaps in a linked list of pages for the next kernel to
 * process. Each chunk holds bitmaps of the same order and each block of
 * bitmaps starts at a given physical address. This allows the bitmaps to be
 * sparse. The xarray is used to store them in a tree while building up the
 * data structure, but the KHO successor kernel only needs to process them
 * once in order.
 *
 * All of this memory is normal kmalloc() memory and is not marked for
 * preservation. The successor kernel will remain isolated to the scratch
 * space until it completes processing this list. Once processed, all the
 * memory storing these ranges will be marked as free.
 */
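
/*
 * A short worked example of the encoding used below (illustrative numbers,
 * assuming 4 KiB pages): for order 0, the bitmap stored at xarray index 3
 * gets phys_start = (3 * PRESERVE_BITS) << PAGE_SHIFT = 384 MiB. If bit 5
 * is set in that bitmap, deserialize_bitmap() reconstructs the page address
 * as phys_start + (5 << PAGE_SHIFT) = 384 MiB + 20 KiB, i.e. exactly the
 * order-0 page that was marked via __kho_preserve_order().
 */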

struct khoser_mem_bitmap_ptr {
	phys_addr_t phys_start;
	DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
};

struct khoser_mem_chunk_hdr {
	DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
	unsigned int order;
	unsigned int num_elms;
};

#define KHOSER_BITMAP_SIZE					\
	((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) /	\
	 sizeof(struct khoser_mem_bitmap_ptr))

struct khoser_mem_chunk {
	struct khoser_mem_chunk_hdr hdr;
	struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
};

static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);

static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
					  unsigned long order)
{
	struct khoser_mem_chunk *chunk __free(free_page) = NULL;

	chunk = (void *)get_zeroed_page(GFP_KERNEL);
	if (!chunk)
		return ERR_PTR(-ENOMEM);

	if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE)))
		return ERR_PTR(-EINVAL);

	chunk->hdr.order = order;
	if (cur_chunk)
		KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
	return no_free_ptr(chunk);
}

static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
{
	struct khoser_mem_chunk *chunk = first_chunk;

	while (chunk) {
		struct khoser_mem_chunk *tmp = chunk;

		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		free_page((unsigned long)tmp);
	}
}

/*
 * Update the memory map property; if an old one is found, discard it via
 * kho_mem_ser_free().
 */
static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk)
{
	void *ptr;
	u64 phys;

	ptr = fdt_getprop_w(kho_out.fdt, 0, PROP_PRESERVED_MEMORY_MAP, NULL);

	/* Check and discard the previous memory map */
	phys = get_unaligned((u64 *)ptr);
	if (phys)
		kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys));

	/* Update with the new value */
	phys = first_chunk ? (u64)virt_to_phys(first_chunk) : 0;
	put_unaligned(phys, (u64 *)ptr);
}

static int kho_mem_serialize(struct kho_out *kho_out)
{
	struct khoser_mem_chunk *first_chunk = NULL;
	struct khoser_mem_chunk *chunk = NULL;
	struct kho_mem_phys *physxa;
	unsigned long order;
	int err = -ENOMEM;

	xa_for_each(&kho_out->track.orders, order, physxa) {
		struct kho_mem_phys_bits *bits;
		unsigned long phys;

		chunk = new_chunk(chunk, order);
		if (IS_ERR(chunk)) {
			err = PTR_ERR(chunk);
			goto err_free;
		}

		if (!first_chunk)
			first_chunk = chunk;

		xa_for_each(&physxa->phys_bits, phys, bits) {
			struct khoser_mem_bitmap_ptr *elm;

			if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
				chunk = new_chunk(chunk, order);
				if (IS_ERR(chunk)) {
					err = PTR_ERR(chunk);
					goto err_free;
				}
			}

			elm = &chunk->bitmaps[chunk->hdr.num_elms];
			chunk->hdr.num_elms++;
			elm->phys_start = (phys * PRESERVE_BITS)
					  << (order + PAGE_SHIFT);
			KHOSER_STORE_PTR(elm->bitmap, bits);
		}
	}

	kho_update_memory_map(first_chunk);

	return 0;

err_free:
	kho_mem_ser_free(first_chunk);
	return err;
}

static void __init deserialize_bitmap(unsigned int order,
				      struct khoser_mem_bitmap_ptr *elm)
{
	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
	unsigned long bit;

	for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
		int sz = 1 << (order + PAGE_SHIFT);
		phys_addr_t phys =
			elm->phys_start + (bit << (order + PAGE_SHIFT));
		struct page *page = phys_to_page(phys);
		union kho_page_info info;

		memblock_reserve(phys, sz);
		memblock_reserved_mark_noinit(phys, sz);
		info.magic = KHO_PAGE_MAGIC;
		info.order = order;
		page->private = info.page_private;
	}
}

/* Returns physical address of the preserved memory map from FDT */
static phys_addr_t __init kho_get_mem_map_phys(const void *fdt)
{
	const void *mem_ptr;
	int len;

	mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);
	if (!mem_ptr || len != sizeof(u64)) {
		pr_err("failed to get preserved memory bitmaps\n");
		return 0;
	}

	return get_unaligned((const u64 *)mem_ptr);
}

static void __init kho_mem_deserialize(struct khoser_mem_chunk *chunk)
{
	while (chunk) {
		unsigned int i;

		for (i = 0; i != chunk->hdr.num_elms; i++)
			deserialize_bitmap(chunk->hdr.order,
					   &chunk->bitmaps[i]);
		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
	}
}

/*
 * With KHO enabled, memory can become fragmented because KHO regions may
 * be anywhere in physical address space. The scratch regions give us safe
 * zones that we will never see KHO allocations from. This is where we can
 * later safely load our new kexec images into and then use the scratch
 * area for early allocations that happen before the page allocator is
 * initialized.
 */
struct kho_scratch *kho_scratch;
unsigned int kho_scratch_cnt;

/*
 * The scratch areas are scaled by default as a percentage of memory allocated
 * from memblock. A user can override the scale with the command line
 * parameter:
 *
 * kho_scratch=N%
 *
 * It is also possible to explicitly define sizes for the lowmem, global and
 * per-node scratch areas:
 *
 * kho_scratch=l[KMG],n[KMG],m[KMG]
 *
 * The explicit size definition takes precedence over the scale definition.
 */
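
/*
 * For example (illustrative values only): "kho_scratch=25%" scales all
 * scratch areas to a quarter of the reserved kernel memory reported by
 * memblock, while "kho_scratch=256M,1G,512M" asks for a 256 MiB lowmem
 * area, a 1 GiB global area and 512 MiB per NUMA node, as parsed by
 * kho_parse_scratch_size() below.
 */
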
static unsigned int scratch_scale __initdata = 200;
static phys_addr_t scratch_size_global __initdata;
static phys_addr_t scratch_size_pernode __initdata;
static phys_addr_t scratch_size_lowmem __initdata;

static int __init kho_parse_scratch_size(char *p)
{
	size_t len;
	unsigned long sizes[3];
	size_t total_size = 0;
	int i;

	if (!p)
		return -EINVAL;

	len = strlen(p);
	if (!len)
		return -EINVAL;

	/* parse nn% */
	if (p[len - 1] == '%') {
		/* unsigned int max is 4,294,967,295, 10 chars */
		char s_scale[11] = {};
		int ret = 0;

		if (len > ARRAY_SIZE(s_scale))
			return -EINVAL;

		memcpy(s_scale, p, len - 1);
		ret = kstrtouint(s_scale, 10, &scratch_scale);
		if (!ret)
			pr_notice("scratch scale is %d%%\n", scratch_scale);
		return ret;
	}

	/* parse ll[KMG],mm[KMG],nn[KMG] */
	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
		char *endp = p;

		if (i > 0) {
			if (*p != ',')
				return -EINVAL;
			p += 1;
		}

		sizes[i] = memparse(p, &endp);
		if (endp == p)
			return -EINVAL;
		p = endp;
		total_size += sizes[i];
	}

	if (!total_size)
		return -EINVAL;

	/* The string should be fully consumed by now. */
	if (*p)
		return -EINVAL;

	scratch_size_lowmem = sizes[0];
	scratch_size_global = sizes[1];
	scratch_size_pernode = sizes[2];
	scratch_scale = 0;

	pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lluMiB\n",
		  (u64)(scratch_size_lowmem >> 20),
		  (u64)(scratch_size_global >> 20),
		  (u64)(scratch_size_pernode >> 20));

	return 0;
}
early_param("kho_scratch", kho_parse_scratch_size);

static void __init scratch_size_update(void)
{
	phys_addr_t size;

	if (!scratch_scale)
		return;

	size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100;
	scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);

	size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100 - scratch_size_lowmem;
	scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}

static phys_addr_t __init scratch_size_node(int nid)
{
	phys_addr_t size;

	if (scratch_scale) {
		size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
						   nid);
		size = size * scratch_scale / 100;
	} else {
		size = scratch_size_pernode;
	}

	return round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}

/**
 * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec
 *
 * With KHO we can preserve arbitrary pages in the system. To ensure we still
 * have a large contiguous region of memory when we search the physical address
 * space for target memory, let's make sure we always have a large CMA region
 * active. This CMA region will only be used for movable pages which are not a
 * problem for us during KHO because we can just move them somewhere else.
 */
static void __init kho_reserve_scratch(void)
{
	phys_addr_t addr, size;
	int nid, i = 0;

	if (!kho_enable)
		return;

	scratch_size_update();

	/* FIXME: deal with node hot-plug/remove */
	kho_scratch_cnt = num_online_nodes() + 2;
	size = kho_scratch_cnt * sizeof(*kho_scratch);
	kho_scratch = memblock_alloc(size, PAGE_SIZE);
	if (!kho_scratch)
		goto err_disable_kho;

	/*
	 * reserve scratch area in low memory for lowmem allocations in the
	 * next kernel
	 */
	size = scratch_size_lowmem;
	addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
					 ARCH_LOW_ADDRESS_LIMIT);
	if (!addr)
		goto err_free_scratch_desc;

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	/* reserve large contiguous area for allocations without nid */
	size = scratch_size_global;
	addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
	if (!addr)
		goto err_free_scratch_areas;

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	for_each_online_node(nid) {
		size = scratch_size_node(nid);
		addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
						0, MEMBLOCK_ALLOC_ACCESSIBLE,
						nid, true);
		if (!addr)
			goto err_free_scratch_areas;

		kho_scratch[i].addr = addr;
		kho_scratch[i].size = size;
		i++;
	}

	return;

err_free_scratch_areas:
	for (i--; i >= 0; i--)
		memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size);
err_free_scratch_desc:
	memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
err_disable_kho:
	pr_warn("Failed to reserve scratch area, disabling kexec handover\n");
	kho_enable = false;
}

/**
 * kho_add_subtree - record the physical address of a sub FDT in the KHO root tree.
 * @name: name of the sub tree.
 * @fdt: the sub tree blob.
 *
 * Creates a new child node named @name in the KHO root FDT and records
 * the physical address of @fdt. The pages of @fdt must also be preserved
 * by KHO for the new kernel to retrieve it after kexec.
 *
 * A debugfs blob entry is also created at
 * ``/sys/kernel/debug/kho/out/sub_fdts/@name`` when the kernel is configured
 * with CONFIG_KEXEC_HANDOVER_DEBUGFS.
 *
 * Return: 0 on success, error code on failure
 */
int kho_add_subtree(const char *name, void *fdt)
{
	phys_addr_t phys = virt_to_phys(fdt);
	void *root_fdt = kho_out.fdt;
	int err = -ENOMEM;
	int off, fdt_err;

	guard(mutex)(&kho_out.lock);

	fdt_err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
	if (fdt_err < 0)
		return err;

	off = fdt_add_subnode(root_fdt, 0, name);
	if (off < 0) {
		if (off == -FDT_ERR_EXISTS)
			err = -EEXIST;
		goto out_pack;
	}

	err = fdt_setprop(root_fdt, off, PROP_SUB_FDT, &phys, sizeof(phys));
	if (err < 0)
		goto out_pack;

	WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false));

out_pack:
	fdt_pack(root_fdt);

	return err;
}
EXPORT_SYMBOL_GPL(kho_add_subtree);
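
/*
 * A minimal usage sketch for kho_add_subtree() (illustrative only; the
 * "example-driver" name, the "state-pa" property and the state_phys value
 * are hypothetical, not part of this file). A caller builds a small sub FDT
 * in KHO-preserved memory and hangs it off the root tree, mirroring what
 * kho_out_fdt_setup() does for the root FDT:
 *
 *	void *fdt = kho_alloc_preserve(PAGE_SIZE);
 *
 *	if (IS_ERR(fdt))
 *		return PTR_ERR(fdt);
 *	err = fdt_create(fdt, PAGE_SIZE);
 *	err |= fdt_finish_reservemap(fdt);
 *	err |= fdt_begin_node(fdt, "");
 *	err |= fdt_property_u64(fdt, "state-pa", state_phys);
 *	err |= fdt_end_node(fdt);
 *	err |= fdt_finish(fdt);
 *	if (!err)
 *		err = kho_add_subtree("example-driver", fdt);
 *
 * After kexec, the successor looks the blob up again with
 * kho_retrieve_subtree("example-driver", &phys).
 */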

void kho_remove_subtree(void *fdt)
{
	phys_addr_t target_phys = virt_to_phys(fdt);
	void *root_fdt = kho_out.fdt;
	int off;
	int err;

	guard(mutex)(&kho_out.lock);

	err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
	if (err < 0)
		return;

	for (off = fdt_first_subnode(root_fdt, 0); off >= 0;
	     off = fdt_next_subnode(root_fdt, off)) {
		const u64 *val;
		int len;

		val = fdt_getprop(root_fdt, off, PROP_SUB_FDT, &len);
		if (!val || len != sizeof(phys_addr_t))
			continue;

		if ((phys_addr_t)*val == target_phys) {
			fdt_del_node(root_fdt, off);
			kho_debugfs_fdt_remove(&kho_out.dbg, fdt);
			break;
		}
	}

	fdt_pack(root_fdt);
}
EXPORT_SYMBOL_GPL(kho_remove_subtree);

/**
 * kho_preserve_folio - preserve a folio across kexec.
 * @folio: folio to preserve.
 *
 * Instructs KHO to preserve the whole folio across kexec. The order
 * will be preserved as well.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_folio(struct folio *folio)
{
	const unsigned long pfn = folio_pfn(folio);
	const unsigned int order = folio_order(folio);
	struct kho_mem_track *track = &kho_out.track;

	if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
		return -EINVAL;

	return __kho_preserve_order(track, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_preserve_folio);
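
/*
 * A minimal round-trip sketch (illustrative only): the outgoing kernel
 * preserves a folio and records its physical address somewhere the successor
 * can find it, typically a sub FDT added with kho_add_subtree(); the
 * incoming kernel turns that address back into a folio:
 *
 *	// outgoing kernel
 *	phys_addr_t phys = PFN_PHYS(folio_pfn(folio));
 *	err = kho_preserve_folio(folio);
 *
 *	// incoming kernel, after kho_populate()/kho_memory_init()
 *	folio = kho_restore_folio(phys);
 *	if (!folio)
 *		// the address was not a preserved folio head
 */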

/**
 * kho_unpreserve_folio - unpreserve a folio.
 * @folio: folio to unpreserve.
 *
 * Instructs KHO to unpreserve a folio that was preserved by
 * kho_preserve_folio() before. The provided @folio (pfn and order)
 * must exactly match a previously preserved folio.
 */
void kho_unpreserve_folio(struct folio *folio)
{
	const unsigned long pfn = folio_pfn(folio);
	const unsigned int order = folio_order(folio);
	struct kho_mem_track *track = &kho_out.track;

	__kho_unpreserve_order(track, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_unpreserve_folio);

/**
 * kho_preserve_pages - preserve contiguous pages across kexec
 * @page: first page in the list.
 * @nr_pages: number of pages.
 *
 * Preserve a contiguous list of order 0 pages. Must be restored using
 * kho_restore_pages() to ensure the pages are restored properly as order 0.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_pages(struct page *page, unsigned int nr_pages)
{
	struct kho_mem_track *track = &kho_out.track;
	const unsigned long start_pfn = page_to_pfn(page);
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn = start_pfn;
	unsigned long failed_pfn = 0;
	int err = 0;

	if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT,
					nr_pages << PAGE_SHIFT))) {
		return -EINVAL;
	}

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));

		err = __kho_preserve_order(track, pfn, order);
		if (err) {
			failed_pfn = pfn;
			break;
		}

		pfn += 1 << order;
	}

	if (err)
		__kho_unpreserve(track, start_pfn, failed_pfn);

	return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_pages);
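
/*
 * A minimal usage sketch (illustrative only; where the physical address is
 * handed over to the successor kernel is up to the caller):
 *
 *	// outgoing kernel: preserve 16 physically contiguous order-0 pages
 *	err = kho_preserve_pages(page, 16);
 *	phys = page_to_phys(page);
 *
 *	// incoming kernel: restore them as 16 order-0 pages again
 *	page = kho_restore_pages(phys, 16);
 *
 * The restore side passes the same physical range and page count that were
 * preserved, matching the kho_preserve_pages() call.
 */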

/**
 * kho_unpreserve_pages - unpreserve contiguous pages.
 * @page: first page in the list.
 * @nr_pages: number of pages.
 *
 * Instructs KHO to unpreserve @nr_pages contiguous pages starting from @page.
 * This must be called with the same @page and @nr_pages as the corresponding
 * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger
 * preserved blocks is not supported.
 */
void kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
{
	struct kho_mem_track *track = &kho_out.track;
	const unsigned long start_pfn = page_to_pfn(page);
	const unsigned long end_pfn = start_pfn + nr_pages;

	__kho_unpreserve(track, start_pfn, end_pfn);
}
EXPORT_SYMBOL_GPL(kho_unpreserve_pages);

struct kho_vmalloc_hdr {
	DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
};

#define KHO_VMALLOC_SIZE				\
	((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) /	\
	 sizeof(phys_addr_t))

struct kho_vmalloc_chunk {
	struct kho_vmalloc_hdr hdr;
	phys_addr_t phys[KHO_VMALLOC_SIZE];
};

static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE);

/* vmalloc flags KHO supports */
#define KHO_VMALLOC_SUPPORTED_FLAGS (VM_ALLOC | VM_ALLOW_HUGE_VMAP)

/* KHO internal flags for vmalloc preservations */
#define KHO_VMALLOC_ALLOC	0x0001
#define KHO_VMALLOC_HUGE_VMAP	0x0002

static unsigned short vmalloc_flags_to_kho(unsigned int vm_flags)
{
	unsigned short kho_flags = 0;

	if (vm_flags & VM_ALLOC)
		kho_flags |= KHO_VMALLOC_ALLOC;
	if (vm_flags & VM_ALLOW_HUGE_VMAP)
		kho_flags |= KHO_VMALLOC_HUGE_VMAP;

	return kho_flags;
}

static unsigned int kho_flags_to_vmalloc(unsigned short kho_flags)
{
	unsigned int vm_flags = 0;

	if (kho_flags & KHO_VMALLOC_ALLOC)
		vm_flags |= VM_ALLOC;
	if (kho_flags & KHO_VMALLOC_HUGE_VMAP)
		vm_flags |= VM_ALLOW_HUGE_VMAP;

	return vm_flags;
}

static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
{
	struct kho_vmalloc_chunk *chunk;
	int err;

	chunk = (struct kho_vmalloc_chunk *)get_zeroed_page(GFP_KERNEL);
	if (!chunk)
		return NULL;

	err = kho_preserve_pages(virt_to_page(chunk), 1);
	if (err)
		goto err_free;
	if (cur)
		KHOSER_STORE_PTR(cur->hdr.next, chunk);
	return chunk;

err_free:
	free_page((unsigned long)chunk);
	return NULL;
}

static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
					 unsigned short order)
{
	struct kho_mem_track *track = &kho_out.track;
	unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));

	__kho_unpreserve(track, pfn, pfn + 1);

	for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
		pfn = PHYS_PFN(chunk->phys[i]);
		__kho_unpreserve(track, pfn, pfn + (1 << order));
	}
}

/**
 * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
 * @ptr: pointer to the area in vmalloc address space
 * @preservation: placeholder for preservation metadata
 *
 * Instructs KHO to preserve the area in vmalloc address space at @ptr. The
 * physical pages mapped at @ptr will be preserved and on successful return
 * @preservation will hold the physical address of a structure that describes
 * the preservation.
 *
 * NOTE: The memory allocated with vmalloc_node() variants cannot be reliably
 * restored on the same node.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
{
	struct kho_vmalloc_chunk *chunk;
	struct vm_struct *vm = find_vm_area(ptr);
	unsigned int order, flags, nr_contig_pages;
	unsigned int idx = 0;
	int err;

	if (!vm)
		return -EINVAL;

	if (vm->flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
		return -EOPNOTSUPP;

	flags = vmalloc_flags_to_kho(vm->flags);
	order = get_vm_area_page_order(vm);

	chunk = new_vmalloc_chunk(NULL);
	if (!chunk)
		return -ENOMEM;
	KHOSER_STORE_PTR(preservation->first, chunk);

	nr_contig_pages = (1 << order);
	for (int i = 0; i < vm->nr_pages; i += nr_contig_pages) {
		phys_addr_t phys = page_to_phys(vm->pages[i]);

		err = kho_preserve_pages(vm->pages[i], nr_contig_pages);
		if (err)
			goto err_free;

		chunk->phys[idx++] = phys;
		if (idx == ARRAY_SIZE(chunk->phys)) {
			chunk = new_vmalloc_chunk(chunk);
			if (!chunk) {
				err = -ENOMEM;
				goto err_free;
			}
			idx = 0;
		}
	}

	preservation->total_pages = vm->nr_pages;
	preservation->flags = flags;
	preservation->order = order;

	return 0;

err_free:
	kho_unpreserve_vmalloc(preservation);
	return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
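
/*
 * A minimal usage sketch (illustrative only; where the struct kho_vmalloc is
 * stored across the kexec, e.g. inside a preserved sub FDT, is up to the
 * caller):
 *
 *	// outgoing kernel
 *	void *buf = vmalloc(SZ_1M);
 *	struct kho_vmalloc *pres = ...;	// lives in KHO-preserved memory
 *
 *	err = kho_preserve_vmalloc(buf, pres);
 *
 *	// incoming kernel
 *	void *buf = kho_restore_vmalloc(pres);
 *	if (!buf)
 *		// preservation metadata was invalid or pages could not be
 *		// restored
 */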

/**
 * kho_unpreserve_vmalloc - unpreserve memory allocated with vmalloc()
 * @preservation: preservation metadata returned by kho_preserve_vmalloc()
 *
 * Instructs KHO to unpreserve the area in vmalloc address space that was
 * previously preserved with kho_preserve_vmalloc().
 */
void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation)
{
	struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);

	while (chunk) {
		struct kho_vmalloc_chunk *tmp = chunk;

		kho_vmalloc_unpreserve_chunk(chunk, preservation->order);

		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		free_page((unsigned long)tmp);
	}
}
EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc);

/**
 * kho_restore_vmalloc - recreates and populates an area in vmalloc address
 * space from the preserved memory.
 * @preservation: preservation metadata.
 *
 * Recreates an area in vmalloc address space and populates it with memory that
 * was preserved using kho_preserve_vmalloc().
 *
 * Return: pointer to the area in the vmalloc address space, NULL on failure.
 */
void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
{
	struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
	unsigned int align, order, shift, vm_flags;
	unsigned long total_pages, contig_pages;
	unsigned long addr, size;
	struct vm_struct *area;
	struct page **pages;
	unsigned int idx = 0;
	int err;

	vm_flags = kho_flags_to_vmalloc(preservation->flags);
	if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
		return NULL;

	total_pages = preservation->total_pages;
	pages = kvmalloc_array(total_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;
	order = preservation->order;
	contig_pages = (1 << order);
	shift = PAGE_SHIFT + order;
	align = 1 << shift;

	while (chunk) {
		struct page *page;

		for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
			phys_addr_t phys = chunk->phys[i];

			if (idx + contig_pages > total_pages)
				goto err_free_pages_array;

			page = kho_restore_pages(phys, contig_pages);
			if (!page)
				goto err_free_pages_array;

			for (int j = 0; j < contig_pages; j++)
				pages[idx++] = page + j;

			phys += contig_pages * PAGE_SIZE;
		}

		page = kho_restore_pages(virt_to_phys(chunk), 1);
		if (!page)
			goto err_free_pages_array;
		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		__free_page(page);
	}

	if (idx != total_pages)
		goto err_free_pages_array;

	area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift,
				  vm_flags, VMALLOC_START, VMALLOC_END,
				  NUMA_NO_NODE, GFP_KERNEL,
				  __builtin_return_address(0));
	if (!area)
		goto err_free_pages_array;

	addr = (unsigned long)area->addr;
	size = get_vm_area_size(area);
	err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
	if (err)
		goto err_free_vm_area;

	area->nr_pages = total_pages;
	area->pages = pages;

	return area->addr;

err_free_vm_area:
	free_vm_area(area);
err_free_pages_array:
	kvfree(pages);
	return NULL;
}
EXPORT_SYMBOL_GPL(kho_restore_vmalloc);

/**
 * kho_alloc_preserve - Allocate, zero, and preserve memory.
 * @size: The number of bytes to allocate.
 *
 * Allocates a physically contiguous block of zeroed pages that is large
 * enough to hold @size bytes. The allocated memory is then registered with
 * KHO for preservation across a kexec.
 *
 * Note: The actual allocated size will be rounded up to the nearest
 * power-of-two page boundary.
 *
 * Return: A virtual pointer to the allocated and preserved memory on success,
 * or an ERR_PTR() encoded error on failure.
 */
void *kho_alloc_preserve(size_t size)
{
	struct folio *folio;
	int order, ret;

	if (!size)
		return ERR_PTR(-EINVAL);

	order = get_order(size);
	if (order > MAX_PAGE_ORDER)
		return ERR_PTR(-E2BIG);

	folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order);
	if (!folio)
		return ERR_PTR(-ENOMEM);

	ret = kho_preserve_folio(folio);
	if (ret) {
		folio_put(folio);
		return ERR_PTR(ret);
	}

	return folio_address(folio);
}
EXPORT_SYMBOL_GPL(kho_alloc_preserve);
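
/*
 * A minimal life-cycle sketch (illustrative only): the outgoing kernel
 * allocates and fills a preserved buffer, and either hands it over or backs
 * out; the incoming kernel reclaims it once it has consumed the contents:
 *
 *	// outgoing kernel
 *	void *buf = kho_alloc_preserve(SZ_4K);
 *	if (IS_ERR(buf))
 *		return PTR_ERR(buf);
 *	// fill buf, record virt_to_phys(buf) in a sub FDT, ...
 *	// on an error/abort path instead:
 *	kho_unpreserve_free(buf);
 *
 *	// incoming kernel, once the data is no longer needed
 *	kho_restore_free(phys_to_virt(phys));
 */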

/**
 * kho_unpreserve_free - Unpreserve and free memory.
 * @mem: Pointer to the memory allocated by kho_alloc_preserve().
 *
 * Unregisters the memory from KHO preservation and frees the underlying
 * pages back to the system. This function should be called to clean up
 * memory allocated with kho_alloc_preserve().
 */
void kho_unpreserve_free(void *mem)
{
	struct folio *folio;

	if (!mem)
		return;

	folio = virt_to_folio(mem);
	kho_unpreserve_folio(folio);
	folio_put(folio);
}
EXPORT_SYMBOL_GPL(kho_unpreserve_free);

/**
 * kho_restore_free - Restore and free memory after kexec.
 * @mem: Pointer to the memory (in the new kernel's address space)
 * that was allocated by the old kernel.
 *
 * This function is intended to be called in the new kernel (post-kexec)
 * to take ownership of and free a memory region that was preserved by the
 * old kernel using kho_alloc_preserve().
 *
 * It first restores the pages from KHO (using their physical address)
 * and then frees the pages back to the new kernel's page allocator.
 */
void kho_restore_free(void *mem)
{
	struct folio *folio;

	if (!mem)
		return;

	folio = kho_restore_folio(__pa(mem));
	if (!WARN_ON(!folio))
		folio_put(folio);
}
EXPORT_SYMBOL_GPL(kho_restore_free);

int kho_finalize(void)
{
	int ret;

	if (!kho_enable)
		return -EOPNOTSUPP;

	guard(mutex)(&kho_out.lock);
	ret = kho_mem_serialize(&kho_out);
	if (ret)
		return ret;

	kho_out.finalized = true;

	return 0;
}

bool kho_finalized(void)
{
	guard(mutex)(&kho_out.lock);
	return kho_out.finalized;
}

struct kho_in {
	phys_addr_t fdt_phys;
	phys_addr_t scratch_phys;
	phys_addr_t mem_map_phys;
	struct kho_debugfs dbg;
};

static struct kho_in kho_in = {};

static const void *kho_get_fdt(void)
{
	return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
}

/**
 * is_kho_boot - check if the current kernel was booted via KHO-enabled
 * kexec
 *
 * This function checks if the current kernel was loaded through a kexec
 * operation with KHO enabled, by verifying that a valid KHO FDT
 * was passed.
 *
 * Note: This function returns reliable results only after
 * kho_populate() has been called during early boot. Before that,
 * it may return false even if KHO data is present.
 *
 * Return: true if booted via KHO-enabled kexec, false otherwise
 */
bool is_kho_boot(void)
{
	return !!kho_get_fdt();
}
EXPORT_SYMBOL_GPL(is_kho_boot);

/**
 * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
 * @name: the name of the sub FDT passed to kho_add_subtree().
 * @phys: if found, the physical address of the sub FDT is stored in @phys.
 *
 * Retrieve a preserved sub FDT named @name and store its physical
 * address in @phys.
 *
 * Return: 0 on success, error code on failure
 */
int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
{
	const void *fdt = kho_get_fdt();
	const u64 *val;
	int offset, len;

	if (!fdt)
		return -ENOENT;

	if (!phys)
		return -EINVAL;

	offset = fdt_subnode_offset(fdt, 0, name);
	if (offset < 0)
		return -ENOENT;

	val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len);
	if (!val || len != sizeof(*val))
		return -EINVAL;

	*phys = (phys_addr_t)*val;

	return 0;
}
EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
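
/*
 * A minimal usage sketch for the restoring side (illustrative only; the
 * "example-driver" name and "state-pa" property match the hypothetical
 * kho_add_subtree() example above):
 *
 *	phys_addr_t phys;
 *	const void *fdt;
 *
 *	if (kho_retrieve_subtree("example-driver", &phys))
 *		return -ENOENT;
 *	fdt = phys_to_virt(phys);
 *	// parse the sub FDT with libfdt, e.g.
 *	// fdt_getprop(fdt, 0, "state-pa", NULL), and kho_restore_*() the
 *	// payload it references
 */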

static __init int kho_out_fdt_setup(void)
{
	void *root = kho_out.fdt;
	u64 empty_mem_map = 0;
	int err;

	err = fdt_create(root, PAGE_SIZE);
	err |= fdt_finish_reservemap(root);
	err |= fdt_begin_node(root, "");
	err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE);
	err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map,
			    sizeof(empty_mem_map));
	err |= fdt_end_node(root);
	err |= fdt_finish(root);

	return err;
}

static __init int kho_init(void)
{
	const void *fdt = kho_get_fdt();
	int err = 0;

	if (!kho_enable)
		return 0;

	kho_out.fdt = kho_alloc_preserve(PAGE_SIZE);
	if (IS_ERR(kho_out.fdt)) {
		err = PTR_ERR(kho_out.fdt);
		goto err_free_scratch;
	}

	err = kho_debugfs_init();
	if (err)
		goto err_free_fdt;

	err = kho_out_debugfs_init(&kho_out.dbg);
	if (err)
		goto err_free_fdt;

	err = kho_out_fdt_setup();
	if (err)
		goto err_free_fdt;

	if (fdt) {
		kho_in_debugfs_init(&kho_in.dbg, fdt);
		return 0;
	}

	for (int i = 0; i < kho_scratch_cnt; i++) {
		unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
		unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
		unsigned long pfn;

		/*
		 * When debug_pagealloc is enabled, __free_pages() clears the
		 * corresponding PRESENT bit in the kernel page table.
		 * Subsequent kmemleak scans of these pages then cause
		 * non-PRESENT page faults.
		 * Mark scratch areas with kmemleak_ignore_phys() to exclude
		 * them from kmemleak scanning.
		 */
		kmemleak_ignore_phys(kho_scratch[i].addr);
		for (pfn = base_pfn; pfn < base_pfn + count;
		     pfn += pageblock_nr_pages)
			init_cma_reserved_pageblock(pfn_to_page(pfn));
	}

	WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt",
					 kho_out.fdt, true));

	return 0;

err_free_fdt:
	kho_unpreserve_free(kho_out.fdt);
err_free_scratch:
	kho_out.fdt = NULL;
	for (int i = 0; i < kho_scratch_cnt; i++) {
		void *start = __va(kho_scratch[i].addr);
		void *end = start + kho_scratch[i].size;

		free_reserved_area(start, end, -1, "");
	}
	kho_enable = false;
	return err;
}
fs_initcall(kho_init);

static void __init kho_release_scratch(void)
{
	phys_addr_t start, end;
	u64 i;

	memmap_init_kho_scratch_pages();

	/*
	 * Mark scratch mem as CMA before we return it. That way we
	 * ensure that no kernel allocations happen on it. That means
	 * we can reuse it as scratch memory again later.
	 */
	__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
			     MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
		ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start));
		ulong end_pfn = pageblock_align(PFN_UP(end));
		ulong pfn;

		for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
			init_pageblock_migratetype(pfn_to_page(pfn),
						   MIGRATE_CMA, false);
	}
}

void __init kho_memory_init(void)
{
	if (kho_in.mem_map_phys) {
		kho_scratch = phys_to_virt(kho_in.scratch_phys);
		kho_release_scratch();
		kho_mem_deserialize(phys_to_virt(kho_in.mem_map_phys));
	} else {
		kho_reserve_scratch();
	}
}

void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
			 phys_addr_t scratch_phys, u64 scratch_len)
{
	struct kho_scratch *scratch = NULL;
	phys_addr_t mem_map_phys;
	void *fdt = NULL;
	int err = 0;
	unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);

	/* Validate the input FDT */
	fdt = early_memremap(fdt_phys, fdt_len);
	if (!fdt) {
		pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
		err = -EFAULT;
		goto out;
	}
	err = fdt_check_header(fdt);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
			fdt_phys, err);
		err = -EINVAL;
		goto out;
	}
	err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
			fdt_phys, KHO_FDT_COMPATIBLE, err);
		err = -EINVAL;
		goto out;
	}

	mem_map_phys = kho_get_mem_map_phys(fdt);
	if (!mem_map_phys) {
		err = -ENOENT;
		goto out;
	}

	scratch = early_memremap(scratch_phys, scratch_len);
	if (!scratch) {
		pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%llu)\n",
			scratch_phys, scratch_len);
		err = -EFAULT;
		goto out;
	}

	/*
	 * We pass safe contiguous blocks of memory to use for early boot
	 * purposes from the previous kernel so that we can resize the
	 * memblock array as needed.
	 */
	for (int i = 0; i < scratch_cnt; i++) {
		struct kho_scratch *area = &scratch[i];
		u64 size = area->size;

		memblock_add(area->addr, size);
		err = memblock_mark_kho_scratch(area->addr, size);
		if (WARN_ON(err)) {
			pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe",
				&area->addr, &size, ERR_PTR(err));
			goto out;
		}
		pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
	}

	memblock_reserve(scratch_phys, scratch_len);

	/*
	 * Now that we have a viable region of scratch memory, let's tell
	 * the memblock allocator to only use that for any allocations.
	 * That way we ensure that nothing scribbles over in-use data while
	 * we initialize the page tables, which we will need to ingest all
	 * memory reservations from the previous kernel.
	 */
	memblock_set_kho_scratch_only();

	kho_in.fdt_phys = fdt_phys;
	kho_in.scratch_phys = scratch_phys;
	kho_in.mem_map_phys = mem_map_phys;
	kho_scratch_cnt = scratch_cnt;
	pr_info("found kexec handover data.\n");

out:
	if (fdt)
		early_memunmap(fdt, fdt_len);
	if (scratch)
		early_memunmap(scratch, scratch_len);
	if (err)
		pr_warn("disabling KHO revival: %d\n", err);
}

/* Helper functions for kexec_file_load */

int kho_fill_kimage(struct kimage *image)
{
	ssize_t scratch_size;
	int err = 0;
	struct kexec_buf scratch;

	if (!kho_enable)
		return 0;

	image->kho.fdt = virt_to_phys(kho_out.fdt);

	scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
	scratch = (struct kexec_buf){
		.image = image,
		.buffer = kho_scratch,
		.bufsz = scratch_size,
		.mem = KEXEC_BUF_MEM_UNKNOWN,
		.memsz = scratch_size,
		.buf_align = SZ_64K, /* Makes it easier to map */
		.buf_max = ULONG_MAX,
		.top_down = true,
	};
	err = kexec_add_buffer(&scratch);
	if (err)
		return err;
	image->kho.scratch = &image->segment[image->nr_segments - 1];

	return 0;
}

static int kho_walk_scratch(struct kexec_buf *kbuf,
			    int (*func)(struct resource *, void *))
{
	int ret = 0;
	int i;

	for (i = 0; i < kho_scratch_cnt; i++) {
		struct resource res = {
			.start = kho_scratch[i].addr,
			.end = kho_scratch[i].addr + kho_scratch[i].size - 1,
		};

		/* Try to fit the kimage into our KHO scratch region */
		ret = func(&res, kbuf);
		if (ret)
			break;
	}

	return ret;
}

int kho_locate_mem_hole(struct kexec_buf *kbuf,
			int (*func)(struct resource *, void *))
{
	int ret;

	if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH)
		return 1;

	ret = kho_walk_scratch(kbuf, func);

	return ret == 1 ? 0 : -EADDRNOTAVAIL;
}