kvm.c source code [linux/arch/x86/kernel/kvm.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* KVM paravirt_ops implementation
4	*
5	* Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6	* Copyright IBM Corporation, 2007
7	* Authors: Anthony Liguori <aliguori@us.ibm.com>
8	*/
9
10	#define pr_fmt(fmt) "kvm-guest: " fmt
11
12	#include <linux/context_tracking.h>
13	#include <linux/init.h>
14	#include <linux/irq.h>
15	#include <linux/kernel.h>
16	#include <linux/kvm_para.h>
17	#include <linux/cpu.h>
18	#include <linux/mm.h>
19	#include <linux/highmem.h>
20	#include <linux/hardirq.h>
21	#include <linux/notifier.h>
22	#include <linux/reboot.h>
23	#include <linux/hash.h>
24	#include <linux/sched.h>
25	#include <linux/slab.h>
26	#include <linux/kprobes.h>
27	#include <linux/nmi.h>
28	#include <linux/swait.h>
29	#include <linux/syscore_ops.h>
30	#include <linux/cc_platform.h>
31	#include <linux/efi.h>
32	#include <linux/kvm_types.h>
33	#include <asm/timer.h>
34	#include <asm/cpu.h>
35	#include <asm/traps.h>
36	#include <asm/desc.h>
37	#include <asm/tlbflush.h>
38	#include <asm/apic.h>
39	#include <asm/apicdef.h>
40	#include <asm/hypervisor.h>
41	#include <asm/mtrr.h>
42	#include <asm/tlb.h>
43	#include <asm/cpuidle_haltpoll.h>
44	#include <asm/msr.h>
45	#include <asm/ptrace.h>
46	#include <asm/reboot.h>
47	#include <asm/svm.h>
48	#include <asm/e820/api.h>
49
50	DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled);
51
52	static int kvmapf = `1`;
53
54	static int __init parse_no_kvmapf(char *arg)
55	{
56	kvmapf = `0`;
57	return `0`;
58	}
59
60	early_param("no-kvmapf", parse_no_kvmapf);
61
62	static int steal_acc = `1`;
63	static int __init parse_no_stealacc(char *arg)
64	{
65	steal_acc = `0`;
66	return `0`;
67	}
68
69	early_param("no-steal-acc", parse_no_stealacc);
70
71	static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);
72	static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(`64`);
73	DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(`64`) __visible;
74	static int has_steal_clock = `0`;
75
76	static int has_guest_poll = `0`;
/*
 * No need for any "IO delay" on KVM: this deliberately empty function is
 * installed as the io_delay hook by paravirt_ops_setup().
 */
static void kvm_io_delay(void)
{
}
83
84	#define KVM_TASK_SLEEP_HASHBITS 8
85	#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
86
87	struct kvm_task_sleep_node {
88	struct hlist_node link;
89	struct swait_queue_head wq;
90	u32 token;
91	int cpu;
92	bool dummy;
93	};
94
95	static struct kvm_task_sleep_head {
96	raw_spinlock_t lock;
97	struct hlist_head list;
98	} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
99
100	static struct kvm_task_sleep_node _find_apf_task(struct* kvm_task_sleep_head *b,
101	u32 token)
102	{
103	struct hlist_node *p;
104
105	hlist_for_each(p, &b->list) {
106	struct kvm_task_sleep_node *n =
107	hlist_entry(p, typeof(*n), link);
108	if (n->token == token)
109	return n;
110	}
111
112	return NULL;
113	}
114
115	static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
116	{
117	u32 key = hash_32(val: token, KVM_TASK_SLEEP_HASHBITS);
118	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
119	struct kvm_task_sleep_node *e;
120
121	raw_spin_lock(&b->lock);
122	e = _find_apf_task(b, token);
123	if (e) {
124	struct kvm_task_sleep_node *dummy = NULL;
125
126	/*
127	* The entry can either be a 'dummy' entry (which is put on the
128	* list when wake-up happens ahead of APF handling completion)
129	* or a token from another task which should not be touched.
130	*/
131	if (e->dummy) {
132	hlist_del(n: &e->link);
133	dummy = e;
134	}
135
136	raw_spin_unlock(&b->lock);
137	kfree(objp: dummy);
138	return false;
139	}
140
141	n->token = token;
142	n->cpu = smp_processor_id();
143	n->dummy = false;
144	init_swait_queue_head(&n->wq);
145	hlist_add_head(n: &n->link, h: &b->list);
146	raw_spin_unlock(&b->lock);
147	return true;
148	}
149
150	/*
151	* kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
152	* @token: Token to identify the sleep node entry
153	*
154	* Invoked from the async pagefault handling code or from the VM exit page
155	* fault handler. In both cases RCU is watching.
156	*/
157	void kvm_async_pf_task_wait_schedule(u32 token)
158	{
159	struct kvm_task_sleep_node n;
160	DECLARE_SWAITQUEUE(wait);
161
162	lockdep_assert_irqs_disabled();
163
164	if (!kvm_async_pf_queue_task(token, n: &n))
165	return;
166
167	for (;;) {
168	prepare_to_swait_exclusive(q: &n.wq, wait: &wait, TASK_UNINTERRUPTIBLE);
169	if (hlist_unhashed(h: &n.link))
170	break;
171
172	local_irq_enable();
173	schedule();
174	local_irq_disable();
175	}
176	finish_swait(q: &n.wq, wait: &wait);
177	}
178	EXPORT_SYMBOL_FOR_KVM(kvm_async_pf_task_wait_schedule);
179
180	static void apf_task_wake_one(struct kvm_task_sleep_node *n)
181	{
182	hlist_del_init(n: &n->link);
183	if (swq_has_sleeper(wq: &n->wq))
184	swake_up_one(q: &n->wq);
185	}
186
187	static void apf_task_wake_all(void)
188	{
189	int i;
190
191	for (i = `0`; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
192	struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
193	struct kvm_task_sleep_node *n;
194	struct hlist_node p, next;
195
196	raw_spin_lock(&b->lock);
197	hlist_for_each_safe(p, next, &b->list) {
198	n = hlist_entry(p, typeof(*n), link);
199	if (n->cpu == smp_processor_id())
200	apf_task_wake_one(n);
201	}
202	raw_spin_unlock(&b->lock);
203	}
204	}
205
206	static void kvm_async_pf_task_wake(u32 token)
207	{
208	u32 key = hash_32(val: token, KVM_TASK_SLEEP_HASHBITS);
209	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
210	struct kvm_task_sleep_node n, dummy = NULL;
211
212	if (token == ~`0`) {
213	apf_task_wake_all();
214	return;
215	}
216
217	again:
218	raw_spin_lock(&b->lock);
219	n = _find_apf_task(b, token);
220	if (!n) {
221	/*
222	* Async #PF not yet handled, add a dummy entry for the token.
223	* Allocating the token must be down outside of the raw lock
224	* as the allocator is preemptible on PREEMPT_RT kernels.
225	*/
226	if (!dummy) {
227	raw_spin_unlock(&b->lock);
228	dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC);
229
230	/*
231	* Continue looping on allocation failure, eventually
232	* the async #PF will be handled and allocating a new
233	* node will be unnecessary.
234	*/
235	if (!dummy)
236	cpu_relax();
237
238	/*
239	* Recheck for async #PF completion before enqueueing
240	* the dummy token to avoid duplicate list entries.
241	*/
242	goto again;
243	}
244	dummy->token = token;
245	dummy->cpu = smp_processor_id();
246	dummy->dummy = true;
247	init_swait_queue_head(&dummy->wq);
248	hlist_add_head(n: &dummy->link, h: &b->list);
249	dummy = NULL;
250	} else {
251	apf_task_wake_one(n);
252	}
253	raw_spin_unlock(&b->lock);
254
255	/ A dummy token might be allocated and ultimately not used. /
256	kfree(objp: dummy);
257	}
258
259	noinstr u32 kvm_read_and_reset_apf_flags(void)
260	{
261	u32 flags = `0`;
262
263	if (__this_cpu_read(async_pf_enabled)) {
264	flags = __this_cpu_read(apf_reason.flags);
265	__this_cpu_write(apf_reason.flags, `0`);
266	}
267
268	return flags;
269	}
270	EXPORT_SYMBOL_FOR_KVM(kvm_read_and_reset_apf_flags);
271
272	noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
273	{
274	u32 flags = kvm_read_and_reset_apf_flags();
275	irqentry_state_t state;
276
277	if (!flags)
278	return false;
279
280	state = irqentry_enter(regs);
281	instrumentation_begin();
282
283	/*
284	* If the host managed to inject an async #PF into an interrupt
285	* disabled region, then die hard as this is not going to end well
286	* and the host side is seriously broken.
287	*/
288	if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
289	panic(fmt: "Host injected async #PF in interrupt disabled region\n");
290
291	if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
292	if (unlikely(!(user_mode(regs))))
293	panic(fmt: "Host injected async #PF in kernel mode\n");
294	/ Page is swapped out by the host. /
295	kvm_async_pf_task_wait_schedule(token);
296	} else {
297	WARN_ONCE(`1`, "Unexpected async PF flags: %x\n", flags);
298	}
299
300	instrumentation_end();
301	irqentry_exit(regs, state);
302	return true;
303	}
304
305	DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
306	{
307	struct pt_regs *old_regs = set_irq_regs(regs);
308	u32 token;
309
310	apic_eoi();
311
312	inc_irq_stat(irq_hv_callback_count);
313
314	if (__this_cpu_read(async_pf_enabled)) {
315	token = __this_cpu_read(apf_reason.token);
316	kvm_async_pf_task_wake(token);
317	__this_cpu_write(apf_reason.token, `0`);
318	wrmsrq(MSR_KVM_ASYNC_PF_ACK, val: `1`);
319	}
320
321	set_irq_regs(old_regs);
322	}
323
324	static void __init paravirt_ops_setup(void)
325	{
326	pv_info.name = "KVM";
327
328	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
329	pv_ops.cpu.io_delay = kvm_io_delay;
330
331	#ifdef CONFIG_X86_IO_APIC
332	no_timer_check = `1`;
333	#endif
334	}
335
336	static void kvm_register_steal_time(void)
337	{
338	int cpu = smp_processor_id();
339	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
340
341	if (!has_steal_clock)
342	return;
343
344	wrmsrq(MSR_KVM_STEAL_TIME, val: (slow_virt_to_phys(address: st) \| KVM_MSR_ENABLED));
345	pr_debug("stealtime: cpu %d, msr %llx\n", cpu,
346	(unsigned long long) slow_virt_to_phys(st));
347	}
348
349	static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
350
351	static notrace __maybe_unused void kvm_guest_apic_eoi_write(void)
352	{
353	/**
354	* This relies on __test_and_clear_bit to modify the memory
355	* in a way that is atomic with respect to the local CPU.
356	* The hypervisor only accesses this memory from the local CPU so
357	* there's no need for lock or memory barriers.
358	* An optimization barrier is implied in apic write.
359	*/
360	if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
361	return;
362	apic_native_eoi();
363	}
364
365	static void kvm_guest_cpu_init(void)
366	{
367	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
368	u64 pa;
369
370	WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
371
372	pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
373	pa \|= KVM_ASYNC_PF_ENABLED \| KVM_ASYNC_PF_DELIVERY_AS_INT;
374
375	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
376	pa \|= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
377
378	wrmsrq(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
379
380	wrmsrq(MSR_KVM_ASYNC_PF_EN, val: pa);
381	__this_cpu_write(async_pf_enabled, true);
382	pr_debug("setup async PF for cpu %d\n", smp_processor_id());
383	}
384
385	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
386	unsigned long pa;
387
388	/ Size alignment is implied but just to make it explicit. /
389	BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < `4`);
390	__this_cpu_write(kvm_apic_eoi, `0`);
391	pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
392	\| KVM_MSR_ENABLED;
393	wrmsrq(MSR_KVM_PV_EOI_EN, val: pa);
394	}
395
396	if (has_steal_clock)
397	kvm_register_steal_time();
398	}
399
400	static void kvm_pv_disable_apf(void)
401	{
402	if (!__this_cpu_read(async_pf_enabled))
403	return;
404
405	wrmsrq(MSR_KVM_ASYNC_PF_EN, val: `0`);
406	__this_cpu_write(async_pf_enabled, false);
407
408	pr_debug("disable async PF for cpu %d\n", smp_processor_id());
409	}
410
411	static void kvm_disable_steal_time(void)
412	{
413	if (!has_steal_clock)
414	return;
415
416	wrmsrq(MSR_KVM_STEAL_TIME, val: `0`);
417	}
418
419	static u64 kvm_steal_clock(int cpu)
420	{
421	u64 steal;
422	struct kvm_steal_time *src;
423	int version;
424
425	src = &per_cpu(steal_time, cpu);
426	do {
427	version = src->version;
428	virt_rmb();
429	steal = src->steal;
430	virt_rmb();
431	} while ((version & `1`) \|\| (version != src->version));
432
433	return steal;
434	}
435
436	static inline __init void __set_percpu_decrypted(void ptr, unsigned* long size)
437	{
438	early_set_memory_decrypted(vaddr: (unsigned long) ptr, size);
439	}
440
441	/*
442	* Iterate through all possible CPUs and map the memory region pointed
443	* by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
444	*
445	* Note: we iterate through all possible CPUs to ensure that CPUs
446	* hotplugged will have their per-cpu variable already mapped as
447	* decrypted.
448	*/
449	static void __init sev_map_percpu_data(void)
450	{
451	int cpu;
452
453	if (cc_vendor != CC_VENDOR_AMD \|\|
454	!cc_platform_has(attr: CC_ATTR_GUEST_MEM_ENCRYPT))
455	return;
456
457	for_each_possible_cpu(cpu) {
458	__set_percpu_decrypted(ptr: &per_cpu(apf_reason, cpu), size: sizeof(apf_reason));
459	__set_percpu_decrypted(ptr: &per_cpu(steal_time, cpu), size: sizeof(steal_time));
460	__set_percpu_decrypted(ptr: &per_cpu(kvm_apic_eoi, cpu), size: sizeof(kvm_apic_eoi));
461	}
462	}
463
464	static void kvm_guest_cpu_offline(bool shutdown)
465	{
466	kvm_disable_steal_time();
467	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
468	wrmsrq(MSR_KVM_PV_EOI_EN, val: `0`);
469	if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
470	wrmsrq(MSR_KVM_MIGRATION_CONTROL, val: `0`);
471	kvm_pv_disable_apf();
472	if (!shutdown)
473	apf_task_wake_all();
474	kvmclock_disable();
475	}
476
/*
 * CPU online callback (also used by kvm_resume()): run kvm_guest_cpu_init()
 * with interrupts disabled. @cpu is unused; always returns 0.
 */
static int kvm_cpu_online(unsigned int cpu)
{
	unsigned long flags;

	local_irq_save(flags);
	kvm_guest_cpu_init();
	local_irq_restore(flags);
	return 0;
}
486
487	#ifdef CONFIG_SMP
488
489	static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
490
491	static bool pv_tlb_flush_supported(void)
492	{
493	return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
494	!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
495	kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
496	!boot_cpu_has(X86_FEATURE_MWAIT) &&
497	(num_possible_cpus() != `1`));
498	}
499
500	static bool pv_ipi_supported(void)
501	{
502	return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) &&
503	(num_possible_cpus() != `1`));
504	}
505
506	static bool pv_sched_yield_supported(void)
507	{
508	return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
509	!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
510	kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
511	!boot_cpu_has(X86_FEATURE_MWAIT) &&
512	(num_possible_cpus() != `1`));
513	}
514
515	#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG)
516
517	static void __send_ipi_mask(const struct cpumask mask, int* vector)
518	{
519	unsigned long flags;
520	int cpu, min = `0`, max = `0`;
521	#ifdef CONFIG_X86_64
522	__uint128_t ipi_bitmap = `0`;
523	#else
524	u64 ipi_bitmap = `0`;
525	#endif
526	u32 apic_id, icr;
527	long ret;
528
529	if (cpumask_empty(srcp: mask))
530	return;
531
532	local_irq_save(flags);
533
534	switch (vector) {
535	default:
536	icr = APIC_DM_FIXED \| vector;
537	break;
538	case NMI_VECTOR:
539	icr = APIC_DM_NMI;
540	break;
541	}
542
543	for_each_cpu(cpu, mask) {
544	apic_id = per_cpu(x86_cpu_to_apicid, cpu);
545	if (!ipi_bitmap) {
546	min = max = apic_id;
547	} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
548	ipi_bitmap <<= min - apic_id;
549	min = apic_id;
550	} else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
551	max = apic_id < max ? max : apic_id;
552	} else {
553	ret = kvm_hypercall4(KVM_HC_SEND_IPI, p1: (unsigned long)ipi_bitmap,
554	p2: (unsigned long)(ipi_bitmap >> BITS_PER_LONG), p3: min, p4: icr);
555	WARN_ONCE(ret < `0`, "kvm-guest: failed to send PV IPI: %ld",
556	ret);
557	min = max = apic_id;
558	ipi_bitmap = `0`;
559	}
560	__set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
561	}
562
563	if (ipi_bitmap) {
564	ret = kvm_hypercall4(KVM_HC_SEND_IPI, p1: (unsigned long)ipi_bitmap,
565	p2: (unsigned long)(ipi_bitmap >> BITS_PER_LONG), p3: min, p4: icr);
566	WARN_ONCE(ret < `0`, "kvm-guest: failed to send PV IPI: %ld",
567	ret);
568	}
569
570	local_irq_restore(flags);
571	}
572
/* send_IPI_mask callback: deliver @vector to all CPUs in @mask via PV IPI. */
static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
{
	__send_ipi_mask(mask, vector);
}
577
578	static void kvm_send_ipi_mask_allbutself(const struct cpumask mask, int* vector)
579	{
580	unsigned int this_cpu = smp_processor_id();
581	struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
582	const struct cpumask *local_mask;
583
584	cpumask_copy(dstp: new_mask, srcp: mask);
585	cpumask_clear_cpu(cpu: this_cpu, dstp: new_mask);
586	local_mask = new_mask;
587	__send_ipi_mask(mask: local_mask, vector);
588	}
589
590	static int __init setup_efi_kvm_sev_migration(void)
591	{
592	efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled";
593	efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID;
594	efi_status_t status;
595	unsigned long size;
596	bool enabled;
597
598	if (!cc_platform_has(attr: CC_ATTR_GUEST_MEM_ENCRYPT) \|\|
599	!kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
600	return `0`;
601
602	if (!efi_enabled(EFI_BOOT))
603	return `0`;
604
605	if (!efi_enabled(EFI_RUNTIME_SERVICES)) {
606	pr_info("%s : EFI runtime services are not enabled\n", __func__);
607	return `0`;
608	}
609
610	size = sizeof(enabled);
611
612	/ Get variable contents into buffer /
613	status = efi.get_variable(efi_sev_live_migration_enabled,
614	&efi_variable_guid, NULL, &size, &enabled);
615
616	if (status == EFI_NOT_FOUND) {
617	pr_info("%s : EFI live migration variable not found\n", __func__);
618	return `0`;
619	}
620
621	if (status != EFI_SUCCESS) {
622	pr_info("%s : EFI variable retrieval failed\n", __func__);
623	return `0`;
624	}
625
626	if (enabled == `0`) {
627	pr_info("%s: live migration disabled in EFI\n", __func__);
628	return `0`;
629	}
630
631	pr_info("%s : live migration enabled in EFI\n", __func__);
632	wrmsrq(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY);
633
634	return `1`;
635	}
636
637	late_initcall(setup_efi_kvm_sev_migration);
638
639	/*
640	* Set the IPI entry points
641	*/
642	static __init void kvm_setup_pv_ipi(void)
643	{
644	apic_update_callback(send_IPI_mask, kvm_send_ipi_mask);
645	apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself);
646	pr_info("setup PV IPIs\n");
647	}
648
649	static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
650	{
651	int cpu;
652
653	native_send_call_func_ipi(mask);
654
655	/ Make sure other vCPUs get a chance to run if they need to. /
656	for_each_cpu(cpu, mask) {
657	if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) {
658	kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
659	break;
660	}
661	}
662	}
663
664	static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
665	const struct flush_tlb_info *info)
666	{
667	u8 state;
668	int cpu;
669	struct kvm_steal_time *src;
670	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
671
672	cpumask_copy(dstp: flushmask, srcp: cpumask);
673	/*
674	* We have to call flush only on online vCPUs. And
675	* queue flush_on_enter for pre-empted vCPUs
676	*/
677	for_each_cpu(cpu, flushmask) {
678	/*
679	* The local vCPU is never preempted, so we do not explicitly
680	* skip check for local vCPU - it will never be cleared from
681	* flushmask.
682	*/
683	src = &per_cpu(steal_time, cpu);
684	state = READ_ONCE(src->preempted);
685	if ((state & KVM_VCPU_PREEMPTED)) {
686	if (try_cmpxchg(&src->preempted, &state,
687	state \| KVM_VCPU_FLUSH_TLB))
688	__cpumask_clear_cpu(cpu, dstp: flushmask);
689	}
690	}
691
692	native_flush_tlb_multi(cpumask: flushmask, info);
693	}
694
695	static __init int kvm_alloc_cpumask(void)
696	{
697	int cpu;
698
699	if (!kvm_para_available() \|\| nopv)
700	return `0`;
701
702	if (pv_tlb_flush_supported() \|\| pv_ipi_supported())
703	for_each_possible_cpu(cpu) {
704	zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
705	GFP_KERNEL, cpu_to_node(cpu));
706	}
707
708	return `0`;
709	}
710	arch_initcall(kvm_alloc_cpumask);
711
712	static void __init kvm_smp_prepare_boot_cpu(void)
713	{
714	/*
715	* Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
716	* shares the guest physical address with the hypervisor.
717	*/
718	sev_map_percpu_data();
719
720	kvm_guest_cpu_init();
721	native_smp_prepare_boot_cpu();
722	kvm_spinlock_init();
723	}
724
725	static int kvm_cpu_down_prepare(unsigned int cpu)
726	{
727	unsigned long flags;
728
729	local_irq_save(flags);
730	kvm_guest_cpu_offline(shutdown: false);
731	local_irq_restore(flags);
732	return `0`;
733	}
734
735	#endif
736
737	static int kvm_suspend(void *data)
738	{
739	u64 val = `0`;
740
741	kvm_guest_cpu_offline(shutdown: false);
742
743	#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
744	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
745	rdmsrq(MSR_KVM_POLL_CONTROL, val);
746	has_guest_poll = !(val & `1`);
747	#endif
748	return `0`;
749	}
750
/*
 * Syscore resume callback: re-register this CPU's paravirt state and, if
 * kvm_suspend() recorded it, write 0 to MSR_KVM_POLL_CONTROL again.
 */
static void kvm_resume(void *data)
{
	kvm_cpu_online(raw_smp_processor_id());

#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
		wrmsrq(MSR_KVM_POLL_CONTROL, 0);
#endif
}
760
761	static const struct syscore_ops kvm_syscore_ops = {
762	.suspend = kvm_suspend,
763	.resume = kvm_resume,
764	};
765
766	static struct syscore kvm_syscore = {
767	.ops = &kvm_syscore_ops,
768	};
769
770	static void kvm_pv_guest_cpu_reboot(void *unused)
771	{
772	kvm_guest_cpu_offline(shutdown: true);
773	}
774
775	static int kvm_pv_reboot_notify(struct notifier_block *nb,
776	unsigned long code, void *unused)
777	{
778	if (code == SYS_RESTART)
779	on_each_cpu(func: kvm_pv_guest_cpu_reboot, NULL, wait: `1`);
780	return NOTIFY_DONE;
781	}
782
783	static struct notifier_block kvm_pv_reboot_nb = {
784	.notifier_call = kvm_pv_reboot_notify,
785	};
786
/*
 * After a PV feature is registered, the host will keep writing to the
 * registered memory location. If the guest happens to shutdown, this memory
 * won't be valid. In cases like kexec, in which you install a new kernel, this
 * means a random memory location will be kept being written.
 */
#ifdef CONFIG_CRASH_DUMP
/* Crash hook: unregister this CPU's PV areas, then do the native crash shutdown. */
static void kvm_crash_shutdown(struct pt_regs *regs)
{
	kvm_guest_cpu_offline(true);
	native_machine_crash_shutdown(regs);
}
#endif
800
801	#if defined(CONFIG_X86_32) \|\| !defined(CONFIG_SMP)
802	bool __kvm_vcpu_is_preempted(long cpu);
803
804	__visible bool __kvm_vcpu_is_preempted(long cpu)
805	{
806	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
807
808	return !!(src->preempted & KVM_VCPU_PREEMPTED);
809	}
810	PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
811
812	#else
813
814	#include <asm/asm-offsets.h>
815
816	extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
817
818	/*
819	* Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
820	* restoring to/from the stack.
821	*/
822	#define PV_VCPU_PREEMPTED_ASM \
823	"movq __per_cpu_offset(,%rdi,8), %rax\n\t" \
824	"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
825	"setne %al\n\t"
826
827	DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted,
828	PV_VCPU_PREEMPTED_ASM, .text);
829	#endif
830
831	static void __init kvm_guest_init(void)
832	{
833	int i;
834
835	paravirt_ops_setup();
836	register_reboot_notifier(&kvm_pv_reboot_nb);
837	for (i = `0`; i < KVM_TASK_SLEEP_HASHSIZE; i++)
838	raw_spin_lock_init(&async_pf_sleepers[i].lock);
839
840	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
841	has_steal_clock = `1`;
842	static_call_update(pv_steal_clock, kvm_steal_clock);
843
844	pv_ops.lock.vcpu_is_preempted =
845	PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
846	}
847
848	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
849	apic_update_callback(eoi, kvm_guest_apic_eoi_write);
850
851	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
852	static_branch_enable(&kvm_async_pf_enabled);
853	sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
854	}
855
856	#ifdef CONFIG_SMP
857	if (pv_tlb_flush_supported()) {
858	pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
859	pr_info("KVM setup pv remote TLB flush\n");
860	}
861
862	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
863	if (pv_sched_yield_supported()) {
864	smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
865	pr_info("setup PV sched yield\n");
866	}
867	if (cpuhp_setup_state_nocalls(state: CPUHP_AP_ONLINE_DYN, name: "x86/kvm:online",
868	startup: kvm_cpu_online, teardown: kvm_cpu_down_prepare) < `0`)
869	pr_err("failed to install cpu hotplug callbacks\n");
870	#else
871	sev_map_percpu_data();
872	kvm_guest_cpu_init();
873	#endif
874
875	#ifdef CONFIG_CRASH_DUMP
876	machine_ops.crash_shutdown = kvm_crash_shutdown;
877	#endif
878
879	register_syscore(syscore: &kvm_syscore);
880
881	/*
882	* Hard lockup detection is enabled by default. Disable it, as guests
883	* can get false positives too easily, for example if the host is
884	* overcommitted.
885	*/
886	hardlockup_detector_disable();
887	}
888
889	static noinline uint32_t __kvm_cpuid_base(void)
890	{
891	if (boot_cpu_data.cpuid_level < `0`)
892	return `0`; / So we don't blow up on old processors /
893
894	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
895	return cpuid_base_hypervisor(KVM_SIGNATURE, leaves: `0`);
896
897	return `0`;
898	}
899
900	static inline uint32_t kvm_cpuid_base(void)
901	{
902	static int kvm_cpuid_base = -`1`;
903
904	if (kvm_cpuid_base == -`1`)
905	kvm_cpuid_base = __kvm_cpuid_base();
906
907	return kvm_cpuid_base;
908	}
909
910	bool kvm_para_available(void)
911	{
912	return kvm_cpuid_base() != `0`;
913	}
914	EXPORT_SYMBOL_GPL(kvm_para_available);
915
916	unsigned int kvm_arch_para_features(void)
917	{
918	return cpuid_eax(op: kvm_cpuid_base() \| KVM_CPUID_FEATURES);
919	}
920
921	unsigned int kvm_arch_para_hints(void)
922	{
923	return cpuid_edx(op: kvm_cpuid_base() \| KVM_CPUID_FEATURES);
924	}
925	EXPORT_SYMBOL_GPL(kvm_arch_para_hints);
926
927	static uint32_t __init kvm_detect(void)
928	{
929	return kvm_cpuid_base();
930	}
931
932	static void __init kvm_apic_init(void)
933	{
934	#ifdef CONFIG_SMP
935	if (pv_ipi_supported())
936	kvm_setup_pv_ipi();
937	#endif
938	}
939
940	static bool __init kvm_msi_ext_dest_id(void)
941	{
942	return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID);
943	}
944
945	static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc)
946	{
947	kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, p1: pfn << PAGE_SHIFT, p2: npages,
948	KVM_MAP_GPA_RANGE_ENC_STAT(enc) \| KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
949	}
950
951	static void __init kvm_init_platform(void)
952	{
953	u64 tolud = PFN_PHYS(e820__end_of_low_ram_pfn());
954	/*
955	* Note, hardware requires variable MTRR ranges to be power-of-2 sized
956	* and naturally aligned. But when forcing guest MTRR state, Linux
957	* doesn't program the forced ranges into hardware. Don't bother doing
958	* the math to generate a technically-legal range.
959	*/
960	struct mtrr_var_range pci_hole = {
961	.base_lo = tolud \| X86_MEMTYPE_UC,
962	.mask_lo = (u32)(~(SZ_4G - tolud - `1`)) \| MTRR_PHYSMASK_V,
963	.mask_hi = (BIT_ULL(boot_cpu_data.x86_phys_bits) - `1`) >> `32`,
964	};
965
966	if (cc_platform_has(attr: CC_ATTR_GUEST_MEM_ENCRYPT) &&
967	kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
968	unsigned long nr_pages;
969	int i;
970
971	pv_ops.mmu.notify_page_enc_status_changed =
972	kvm_sev_hc_page_enc_status;
973
974	/*
975	* Reset the host's shared pages list related to kernel
976	* specific page encryption status settings before we load a
977	* new kernel by kexec. Reset the page encryption status
978	* during early boot instead of just before kexec to avoid SMP
979	* races during kvm_pv_guest_cpu_reboot().
980	* NOTE: We cannot reset the complete shared pages list
981	* here as we need to retain the UEFI/OVMF firmware
982	* specific settings.
983	*/
984
985	for (i = `0`; i < e820_table->nr_entries; i++) {
986	struct e820_entry *entry = &e820_table->entries[i];
987
988	if (entry->type != E820_TYPE_RAM)
989	continue;
990
991	nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);
992
993	kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, p1: entry->addr,
994	p2: nr_pages,
995	KVM_MAP_GPA_RANGE_ENCRYPTED \| KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
996	}
997
998	/*
999	* Ensure that _bss_decrypted section is marked as decrypted in the
1000	* shared pages list.
1001	*/
1002	early_set_mem_enc_dec_hypercall(vaddr: (unsigned long)__start_bss_decrypted,
1003	size: __end_bss_decrypted - __start_bss_decrypted, enc: `0`);
1004
1005	/*
1006	* If not booted using EFI, enable Live migration support.
1007	*/
1008	if (!efi_enabled(EFI_BOOT))
1009	wrmsrq(MSR_KVM_MIGRATION_CONTROL,
1010	KVM_MIGRATION_READY);
1011	}
1012	kvmclock_init();
1013	x86_platform.apic_post_init = kvm_apic_init;
1014
1015	/*
1016	* Set WB as the default cache mode for SEV-SNP and TDX, with a single
1017	* UC range for the legacy PCI hole, e.g. so that devices that expect
1018	* to get UC/WC mappings don't get surprised with WB.
1019	*/
1020	guest_force_mtrr_state(var: &pci_hole, num_var: `1`, MTRR_TYPE_WRBACK);
1021	}
1022
#if defined(CONFIG_AMD_MEM_ENCRYPT)
/* Copy the hypercall argument registers from @regs into the GHCB. */
static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
{
	/* RAX and CPL are already in the GHCB */
	ghcb_set_rbx(ghcb, regs->bx);
	ghcb_set_rcx(ghcb, regs->cx);
	ghcb_set_rdx(ghcb, regs->dx);
	ghcb_set_rsi(ghcb, regs->si);
}

/* Post-hypercall hook; nothing to validate, so it always reports success. */
static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
{
	/* No checking of the return state needed */
	return true;
}
#endif
1039
1040	const __initconst struct hypervisor_x86 x86_hyper_kvm = {
1041	.name = "KVM",
1042	.detect = kvm_detect,
1043	.type = X86_HYPER_KVM,
1044	.init.guest_late_init = kvm_guest_init,
1045	.init.x2apic_available = kvm_para_available,
1046	.init.msi_ext_dest_id = kvm_msi_ext_dest_id,
1047	.init.init_platform = kvm_init_platform,
1048	#if defined(CONFIG_AMD_MEM_ENCRYPT)
1049	.runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare,
1050	.runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish,
1051	#endif
1052	};
1053
1054	static __init int activate_jump_labels(void)
1055	{
1056	if (has_steal_clock) {
1057	static_key_slow_inc(key: &paravirt_steal_enabled);
1058	if (steal_acc)
1059	static_key_slow_inc(key: &paravirt_steal_rq_enabled);
1060	}
1061
1062	return `0`;
1063	}
1064	arch_initcall(activate_jump_labels);
1065
1066	#ifdef CONFIG_PARAVIRT_SPINLOCKS
1067
1068	/ Kick a cpu by its apicid. Used to wake up a halted vcpu /
1069	static void kvm_kick_cpu(int cpu)
1070	{
1071	unsigned long flags = `0`;
1072	u32 apicid;
1073
1074	apicid = per_cpu(x86_cpu_to_apicid, cpu);
1075	kvm_hypercall2(KVM_HC_KICK_CPU, p1: flags, p2: apicid);
1076	}
1077
1078	#include <asm/qspinlock.h>
1079
1080	static void kvm_wait(u8 *ptr, u8 val)
1081	{
1082	if (in_nmi())
1083	return;
1084
1085	/*
1086	* halt until it's our turn and kicked. Note that we do safe halt
1087	* for irq enabled case to avoid hang when lock info is overwritten
1088	* in irq spinlock slowpath and no spurious interrupt occur to save us.
1089	*/
1090	if (irqs_disabled()) {
1091	if (READ_ONCE(*ptr) == val)
1092	halt();
1093	} else {
1094	local_irq_disable();
1095
1096	/ safe_halt() will enable IRQ /
1097	if (READ_ONCE(*ptr) == val)
1098	safe_halt();
1099	else
1100	local_irq_enable();
1101	}
1102	}
1103
1104	/*
1105	* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
1106	*/
1107	void __init kvm_spinlock_init(void)
1108	{
1109	/*
1110	* Disable PV spinlocks and use native qspinlock when dedicated pCPUs
1111	* are available.
1112	*/
1113	if (kvm_para_has_hint(KVM_HINTS_REALTIME)) {
1114	pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
1115	goto out;
1116	}
1117
1118	if (num_possible_cpus() == `1`) {
1119	pr_info("PV spinlocks disabled, single CPU\n");
1120	goto out;
1121	}
1122
1123	if (nopvspin) {
1124	pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
1125	goto out;
1126	}
1127
1128	/*
1129	* In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
1130	* advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
1131	* preferred over native qspinlock when vCPU is preempted.
1132	*/
1133	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
1134	pr_info("PV spinlocks disabled, no host support\n");
1135	return;
1136	}
1137
1138	pr_info("PV spinlocks enabled\n");
1139
1140	__pv_init_lock_hash();
1141	pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
1142	pv_ops.lock.queued_spin_unlock =
1143	PV_CALLEE_SAVE(__pv_queued_spin_unlock);
1144	pv_ops.lock.wait = kvm_wait;
1145	pv_ops.lock.kick = kvm_kick_cpu;
1146
1147	/*
1148	* When PV spinlock is enabled which is preferred over
1149	* virt_spin_lock(), virt_spin_lock_key's value is meaningless.
1150	* Just disable it anyway.
1151	*/
1152	out:
1153	static_branch_disable(&virt_spin_lock_key);
1154	}
1155
1156	#endif /* CONFIG_PARAVIRT_SPINLOCKS */
1157
1158	#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
1159
1160	static void kvm_disable_host_haltpoll(void *i)
1161	{
1162	wrmsrq(MSR_KVM_POLL_CONTROL, val: `0`);
1163	}
1164
1165	static void kvm_enable_host_haltpoll(void *i)
1166	{
1167	wrmsrq(MSR_KVM_POLL_CONTROL, val: `1`);
1168	}
1169
1170	void arch_haltpoll_enable(unsigned int cpu)
1171	{
1172	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
1173	pr_err_once("host does not support poll control\n");
1174	pr_err_once("host upgrade recommended\n");
1175	return;
1176	}
1177
1178	/ Enable guest halt poll disables host halt poll /
1179	smp_call_function_single(cpuid: cpu, func: kvm_disable_host_haltpoll, NULL, wait: `1`);
1180	}
1181	EXPORT_SYMBOL_GPL(arch_haltpoll_enable);
1182
1183	void arch_haltpoll_disable(unsigned int cpu)
1184	{
1185	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
1186	return;
1187
1188	/ Disable guest halt poll enables host halt poll /
1189	smp_call_function_single(cpuid: cpu, func: kvm_enable_host_haltpoll, NULL, wait: `1`);
1190	}
1191	EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
1192	#endif
1193

source code of linux/arch/x86/kernel/kvm.c