| 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | |
| 3 | /* |
| 4 | * Clocksource driver for the synthetic counter and timers |
| 5 | * provided by the Hyper-V hypervisor to guest VMs, as described |
| 6 | * in the Hyper-V Top Level Functional Spec (TLFS). This driver |
| 7 | * is instruction set architecture independent. |
| 8 | * |
| 9 | * Copyright (C) 2019, Microsoft, Inc. |
| 10 | * |
| 11 | * Author: Michael Kelley <mikelley@microsoft.com> |
| 12 | */ |
| 13 | |
| 14 | #include <linux/percpu.h> |
| 15 | #include <linux/cpumask.h> |
| 16 | #include <linux/clockchips.h> |
| 17 | #include <linux/clocksource.h> |
| 18 | #include <linux/sched_clock.h> |
| 19 | #include <linux/mm.h> |
| 20 | #include <linux/cpuhotplug.h> |
| 21 | #include <linux/interrupt.h> |
| 22 | #include <linux/irq.h> |
| 23 | #include <linux/acpi.h> |
| 24 | #include <linux/hyperv.h> |
| 25 | #include <linux/export.h> |
| 26 | #include <clocksource/hyperv_timer.h> |
| 27 | #include <hyperv/hvhdk.h> |
| 28 | #include <asm/mshyperv.h> |
| 29 | |
/* Per-CPU clockevent device backed by the Hyper-V stimer0 timer */
static struct clock_event_device __percpu *hv_clock_event;
/* Note: offset can hold negative values after hibernation. */
static u64 hv_sched_clock_offset __read_mostly;

/*
 * If false, we're using the old mechanism for stimer0 interrupts
 * where it sends a VMbus message when it expires. The old
 * mechanism is used when running on older versions of Hyper-V
 * that don't support Direct Mode. While Hyper-V provides
 * four stimer's per CPU, Linux uses only stimer0.
 *
 * Because Direct Mode does not require processing a VMbus
 * message, stimer interrupts can be enabled earlier in the
 * process of booting a CPU, and consistent with when timer
 * interrupts are enabled for other clocksource drivers.
 * However, for legacy versions of Hyper-V when Direct Mode
 * is not enabled, setting up stimer interrupts must be
 * delayed until VMbus is initialized and can process the
 * interrupt message.
 */
static bool direct_mode_enabled;

/* Linux IRQ for stimer0 in Direct Mode; -1 when not set up (e.g. on x86) */
static int stimer0_irq = -1;
/* SINT on which stimer0 VMbus messages arrive in the legacy (non-direct) case */
static int stimer0_message_sint;
/* Per-cpu cookie passed to request_percpu_irq()/free_percpu_irq() */
static __maybe_unused DEFINE_PER_CPU(long, stimer0_evt);
| 55 | |
| 56 | /* |
| 57 | * Common code for stimer0 interrupts coming via Direct Mode or |
| 58 | * as a VMbus message. |
| 59 | */ |
| 60 | void hv_stimer0_isr(void) |
| 61 | { |
| 62 | struct clock_event_device *ce; |
| 63 | |
| 64 | ce = this_cpu_ptr(hv_clock_event); |
| 65 | ce->event_handler(ce); |
| 66 | } |
| 67 | EXPORT_SYMBOL_GPL(hv_stimer0_isr); |
| 68 | |
| 69 | /* |
| 70 | * stimer0 interrupt handler for architectures that support |
| 71 | * per-cpu interrupts, which also implies Direct Mode. |
| 72 | */ |
| 73 | static irqreturn_t __maybe_unused hv_stimer0_percpu_isr(int irq, void *dev_id) |
| 74 | { |
| 75 | hv_stimer0_isr(); |
| 76 | return IRQ_HANDLED; |
| 77 | } |
| 78 | |
| 79 | static int hv_ce_set_next_event(unsigned long delta, |
| 80 | struct clock_event_device *evt) |
| 81 | { |
| 82 | u64 current_tick; |
| 83 | |
| 84 | current_tick = hv_read_reference_counter(); |
| 85 | current_tick += delta; |
| 86 | hv_set_msr(HV_MSR_STIMER0_COUNT, value: current_tick); |
| 87 | return 0; |
| 88 | } |
| 89 | |
| 90 | static int hv_ce_shutdown(struct clock_event_device *evt) |
| 91 | { |
| 92 | hv_set_msr(HV_MSR_STIMER0_COUNT, value: 0); |
| 93 | hv_set_msr(HV_MSR_STIMER0_CONFIG, value: 0); |
| 94 | if (direct_mode_enabled && stimer0_irq >= 0) |
| 95 | disable_percpu_irq(irq: stimer0_irq); |
| 96 | |
| 97 | return 0; |
| 98 | } |
| 99 | |
| 100 | static int hv_ce_set_oneshot(struct clock_event_device *evt) |
| 101 | { |
| 102 | union hv_stimer_config timer_cfg; |
| 103 | |
| 104 | timer_cfg.as_uint64 = 0; |
| 105 | timer_cfg.enable = 1; |
| 106 | timer_cfg.auto_enable = 1; |
| 107 | if (direct_mode_enabled) { |
| 108 | /* |
| 109 | * When it expires, the timer will directly interrupt |
| 110 | * on the specified hardware vector/IRQ. |
| 111 | */ |
| 112 | timer_cfg.direct_mode = 1; |
| 113 | timer_cfg.apic_vector = HYPERV_STIMER0_VECTOR; |
| 114 | if (stimer0_irq >= 0) |
| 115 | enable_percpu_irq(irq: stimer0_irq, type: IRQ_TYPE_NONE); |
| 116 | } else { |
| 117 | /* |
| 118 | * When it expires, the timer will generate a VMbus message, |
| 119 | * to be handled by the normal VMbus interrupt handler. |
| 120 | */ |
| 121 | timer_cfg.direct_mode = 0; |
| 122 | timer_cfg.sintx = stimer0_message_sint; |
| 123 | } |
| 124 | hv_set_msr(HV_MSR_STIMER0_CONFIG, value: timer_cfg.as_uint64); |
| 125 | return 0; |
| 126 | } |
| 127 | |
| 128 | /* |
| 129 | * hv_stimer_init - Per-cpu initialization of the clockevent |
| 130 | */ |
| 131 | static int hv_stimer_init(unsigned int cpu) |
| 132 | { |
| 133 | struct clock_event_device *ce; |
| 134 | |
| 135 | if (!hv_clock_event) |
| 136 | return 0; |
| 137 | |
| 138 | ce = per_cpu_ptr(hv_clock_event, cpu); |
| 139 | ce->name = "Hyper-V clockevent" ; |
| 140 | ce->features = CLOCK_EVT_FEAT_ONESHOT; |
| 141 | ce->cpumask = cpumask_of(cpu); |
| 142 | |
| 143 | /* |
| 144 | * Lower the rating of the Hyper-V timer in a TDX VM without paravisor, |
| 145 | * so the local APIC timer (lapic_clockevent) is the default timer in |
| 146 | * such a VM. The Hyper-V timer is not preferred in such a VM because |
| 147 | * it depends on the slow VM Reference Counter MSR (the Hyper-V TSC |
| 148 | * page is not enbled in such a VM because the VM uses Invariant TSC |
| 149 | * as a better clocksource and it's challenging to mark the Hyper-V |
| 150 | * TSC page shared in very early boot). |
| 151 | */ |
| 152 | if (!ms_hyperv.paravisor_present && hv_isolation_type_tdx()) |
| 153 | ce->rating = 90; |
| 154 | else |
| 155 | ce->rating = 1000; |
| 156 | |
| 157 | ce->set_state_shutdown = hv_ce_shutdown; |
| 158 | ce->set_state_oneshot = hv_ce_set_oneshot; |
| 159 | ce->set_next_event = hv_ce_set_next_event; |
| 160 | |
| 161 | clockevents_config_and_register(dev: ce, |
| 162 | HV_CLOCK_HZ, |
| 163 | HV_MIN_DELTA_TICKS, |
| 164 | HV_MAX_MAX_DELTA_TICKS); |
| 165 | return 0; |
| 166 | } |
| 167 | |
| 168 | /* |
| 169 | * hv_stimer_cleanup - Per-cpu cleanup of the clockevent |
| 170 | */ |
| 171 | int hv_stimer_cleanup(unsigned int cpu) |
| 172 | { |
| 173 | struct clock_event_device *ce; |
| 174 | |
| 175 | if (!hv_clock_event) |
| 176 | return 0; |
| 177 | |
| 178 | /* |
| 179 | * In the legacy case where Direct Mode is not enabled |
| 180 | * (which can only be on x86/64), stimer cleanup happens |
| 181 | * relatively early in the CPU offlining process. We |
| 182 | * must unbind the stimer-based clockevent device so |
| 183 | * that the LAPIC timer can take over until clockevents |
| 184 | * are no longer needed in the offlining process. Note |
| 185 | * that clockevents_unbind_device() eventually calls |
| 186 | * hv_ce_shutdown(). |
| 187 | * |
| 188 | * The unbind should not be done when Direct Mode is |
| 189 | * enabled because we may be on an architecture where |
| 190 | * there are no other clockevent devices to fallback to. |
| 191 | */ |
| 192 | ce = per_cpu_ptr(hv_clock_event, cpu); |
| 193 | if (direct_mode_enabled) |
| 194 | hv_ce_shutdown(evt: ce); |
| 195 | else |
| 196 | clockevents_unbind_device(ced: ce, cpu); |
| 197 | |
| 198 | return 0; |
| 199 | } |
| 200 | EXPORT_SYMBOL_GPL(hv_stimer_cleanup); |
| 201 | |
| 202 | /* |
| 203 | * These placeholders are overridden by arch specific code on |
| 204 | * architectures that need special setup of the stimer0 IRQ because |
| 205 | * they don't support per-cpu IRQs (such as x86/x64). |
| 206 | */ |
| 207 | void __weak hv_setup_stimer0_handler(void (*handler)(void)) |
| 208 | { |
| 209 | }; |
| 210 | |
| 211 | void __weak hv_remove_stimer0_handler(void) |
| 212 | { |
| 213 | }; |
| 214 | |
| 215 | #ifdef CONFIG_ACPI |
| 216 | /* Called only on architectures with per-cpu IRQs (i.e., not x86/x64) */ |
| 217 | static int hv_setup_stimer0_irq(void) |
| 218 | { |
| 219 | int ret; |
| 220 | |
| 221 | ret = acpi_register_gsi(NULL, HYPERV_STIMER0_VECTOR, |
| 222 | ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_HIGH); |
| 223 | if (ret < 0) { |
| 224 | pr_err("Can't register Hyper-V stimer0 GSI. Error %d" , ret); |
| 225 | return ret; |
| 226 | } |
| 227 | stimer0_irq = ret; |
| 228 | |
| 229 | ret = request_percpu_irq(irq: stimer0_irq, handler: hv_stimer0_percpu_isr, |
| 230 | devname: "Hyper-V stimer0" , percpu_dev_id: &stimer0_evt); |
| 231 | if (ret) { |
| 232 | pr_err("Can't request Hyper-V stimer0 IRQ %d. Error %d" , |
| 233 | stimer0_irq, ret); |
| 234 | acpi_unregister_gsi(gsi: stimer0_irq); |
| 235 | stimer0_irq = -1; |
| 236 | } |
| 237 | return ret; |
| 238 | } |
| 239 | |
| 240 | static void hv_remove_stimer0_irq(void) |
| 241 | { |
| 242 | if (stimer0_irq == -1) { |
| 243 | hv_remove_stimer0_handler(); |
| 244 | } else { |
| 245 | free_percpu_irq(stimer0_irq, &stimer0_evt); |
| 246 | acpi_unregister_gsi(gsi: stimer0_irq); |
| 247 | stimer0_irq = -1; |
| 248 | } |
| 249 | } |
| 250 | #else |
/* Stub: without ACPI there is no per-cpu stimer0 IRQ to set up. */
static int hv_setup_stimer0_irq(void)
{
	return 0;
}
| 255 | |
/* Stub: nothing to tear down in the non-ACPI configuration. */
static void hv_remove_stimer0_irq(void)
{
}
| 259 | #endif |
| 260 | |
| 261 | /* hv_stimer_alloc - Global initialization of the clockevent and stimer0 */ |
| 262 | int hv_stimer_alloc(bool have_percpu_irqs) |
| 263 | { |
| 264 | int ret; |
| 265 | |
| 266 | /* |
| 267 | * Synthetic timers are always available except on old versions of |
| 268 | * Hyper-V on x86. In that case, return as error as Linux will use a |
| 269 | * clockevent based on emulated LAPIC timer hardware. |
| 270 | */ |
| 271 | if (!(ms_hyperv.features & HV_MSR_SYNTIMER_AVAILABLE)) |
| 272 | return -EINVAL; |
| 273 | |
| 274 | hv_clock_event = alloc_percpu(struct clock_event_device); |
| 275 | if (!hv_clock_event) |
| 276 | return -ENOMEM; |
| 277 | |
| 278 | direct_mode_enabled = ms_hyperv.misc_features & |
| 279 | HV_STIMER_DIRECT_MODE_AVAILABLE; |
| 280 | |
| 281 | /* |
| 282 | * If Direct Mode isn't enabled, the remainder of the initialization |
| 283 | * is done later by hv_stimer_legacy_init() |
| 284 | */ |
| 285 | if (!direct_mode_enabled) |
| 286 | return 0; |
| 287 | |
| 288 | if (have_percpu_irqs) { |
| 289 | ret = hv_setup_stimer0_irq(); |
| 290 | if (ret) |
| 291 | goto free_clock_event; |
| 292 | } else { |
| 293 | hv_setup_stimer0_handler(handler: hv_stimer0_isr); |
| 294 | } |
| 295 | |
| 296 | /* |
| 297 | * Since we are in Direct Mode, stimer initialization |
| 298 | * can be done now with a CPUHP value in the same range |
| 299 | * as other clockevent devices. |
| 300 | */ |
| 301 | ret = cpuhp_setup_state(state: CPUHP_AP_HYPERV_TIMER_STARTING, |
| 302 | name: "clockevents/hyperv/stimer:starting" , |
| 303 | startup: hv_stimer_init, teardown: hv_stimer_cleanup); |
| 304 | if (ret < 0) { |
| 305 | hv_remove_stimer0_irq(); |
| 306 | goto free_clock_event; |
| 307 | } |
| 308 | return ret; |
| 309 | |
| 310 | free_clock_event: |
| 311 | free_percpu(pdata: hv_clock_event); |
| 312 | hv_clock_event = NULL; |
| 313 | return ret; |
| 314 | } |
| 315 | EXPORT_SYMBOL_GPL(hv_stimer_alloc); |
| 316 | |
| 317 | /* |
| 318 | * hv_stimer_legacy_init -- Called from the VMbus driver to handle |
| 319 | * the case when Direct Mode is not enabled, and the stimer |
| 320 | * must be initialized late in the CPU onlining process. |
| 321 | * |
| 322 | */ |
| 323 | void hv_stimer_legacy_init(unsigned int cpu, int sint) |
| 324 | { |
| 325 | if (direct_mode_enabled) |
| 326 | return; |
| 327 | |
| 328 | /* |
| 329 | * This function gets called by each vCPU, so setting the |
| 330 | * global stimer_message_sint value each time is conceptually |
| 331 | * not ideal, but the value passed in is always the same and |
| 332 | * it avoids introducing yet another interface into this |
| 333 | * clocksource driver just to set the sint in the legacy case. |
| 334 | */ |
| 335 | stimer0_message_sint = sint; |
| 336 | (void)hv_stimer_init(cpu); |
| 337 | } |
| 338 | EXPORT_SYMBOL_GPL(hv_stimer_legacy_init); |
| 339 | |
| 340 | /* |
| 341 | * hv_stimer_legacy_cleanup -- Called from the VMbus driver to |
| 342 | * handle the case when Direct Mode is not enabled, and the |
| 343 | * stimer must be cleaned up early in the CPU offlining |
| 344 | * process. |
| 345 | */ |
| 346 | void hv_stimer_legacy_cleanup(unsigned int cpu) |
| 347 | { |
| 348 | if (direct_mode_enabled) |
| 349 | return; |
| 350 | (void)hv_stimer_cleanup(cpu); |
| 351 | } |
| 352 | EXPORT_SYMBOL_GPL(hv_stimer_legacy_cleanup); |
| 353 | |
| 354 | /* |
| 355 | * Do a global cleanup of clockevents for the cases of kexec and |
| 356 | * vmbus exit |
| 357 | */ |
| 358 | void hv_stimer_global_cleanup(void) |
| 359 | { |
| 360 | int cpu; |
| 361 | |
| 362 | /* |
| 363 | * hv_stime_legacy_cleanup() will stop the stimer if Direct |
| 364 | * Mode is not enabled, and fallback to the LAPIC timer. |
| 365 | */ |
| 366 | for_each_present_cpu(cpu) { |
| 367 | hv_stimer_legacy_cleanup(cpu); |
| 368 | } |
| 369 | |
| 370 | if (!hv_clock_event) |
| 371 | return; |
| 372 | |
| 373 | if (direct_mode_enabled) { |
| 374 | cpuhp_remove_state(state: CPUHP_AP_HYPERV_TIMER_STARTING); |
| 375 | hv_remove_stimer0_irq(); |
| 376 | stimer0_irq = -1; |
| 377 | } |
| 378 | free_percpu(pdata: hv_clock_event); |
| 379 | hv_clock_event = NULL; |
| 380 | |
| 381 | } |
| 382 | EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); |
| 383 | |
static __always_inline u64 read_hv_clock_msr(void)
{
	/*
	 * Read the partition counter to get the current tick count. This count
	 * is set to 0 when the partition is created and is incremented in 100
	 * nanosecond units.
	 *
	 * Use hv_raw_get_msr() because this function is used from
	 * noinstr. Notably, while HV_MSR_TIME_REF_COUNT is a synthetic
	 * register it doesn't need the GHCB path.
	 */
	return hv_raw_get_msr(HV_MSR_TIME_REF_COUNT);
}
| 397 | |
| 398 | /* |
| 399 | * Code and definitions for the Hyper-V clocksources. Two |
| 400 | * clocksources are defined: one that reads the Hyper-V defined MSR, and |
| 401 | * the other that uses the TSC reference page feature as defined in the |
| 402 | * TLFS. The MSR version is for compatibility with old versions of |
| 403 | * Hyper-V and 32-bit x86. The TSC reference page version is preferred. |
| 404 | */ |
| 405 | |
/*
 * Statically allocated TSC page, padded to a full page and page-aligned.
 * __bss_decrypted keeps it in memory the hypervisor can update in
 * memory-encrypted guests.
 */
static union {
	struct ms_hyperv_tsc_page page;
	u8 reserved[PAGE_SIZE];
} tsc_pg __bss_decrypted __aligned(PAGE_SIZE);

/*
 * Points at tsc_pg in a guest partition; re-pointed at a mapping of the
 * hypervisor-provided page by hv_remap_tsc_clocksource() when running
 * in the root partition.
 */
static struct ms_hyperv_tsc_page *tsc_page = &tsc_pg.page;
/* PFN of the TSC page, as exchanged via HV_MSR_REFERENCE_TSC */
static unsigned long tsc_pfn;
| 413 | |
/* Accessor for the TSC page PFN, for use by other Hyper-V code. */
unsigned long hv_get_tsc_pfn(void)
{
	return tsc_pfn;
}
EXPORT_SYMBOL_GPL(hv_get_tsc_pfn);
| 419 | |
/* Accessor for the current TSC page mapping, for use by other Hyper-V code. */
struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
{
	return tsc_page;
}
EXPORT_SYMBOL_GPL(hv_get_tsc_page);
| 425 | |
| 426 | static __always_inline u64 read_hv_clock_tsc(void) |
| 427 | { |
| 428 | u64 cur_tsc, time; |
| 429 | |
| 430 | /* |
| 431 | * The Hyper-V Top-Level Function Spec (TLFS), section Timers, |
| 432 | * subsection Refererence Counter, guarantees that the TSC and MSR |
| 433 | * times are in sync and monotonic. Therefore we can fall back |
| 434 | * to the MSR in case the TSC page indicates unavailability. |
| 435 | */ |
| 436 | if (!hv_read_tsc_page_tsc(tsc_pg: tsc_page, cur_tsc: &cur_tsc, time: &time)) |
| 437 | time = read_hv_clock_msr(); |
| 438 | |
| 439 | return time; |
| 440 | } |
| 441 | |
/* Clocksource ->read callback wrapping read_hv_clock_tsc(). */
static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg)
{
	return read_hv_clock_tsc();
}
| 446 | |
/*
 * sched_clock read function. Converts the HV_CLOCK_HZ tick count
 * (relative to hv_sched_clock_offset) into nanoseconds.
 */
static u64 noinstr read_hv_sched_clock_tsc(void)
{
	return (read_hv_clock_tsc() - hv_sched_clock_offset) *
		(NSEC_PER_SEC / HV_CLOCK_HZ);
}
| 452 | |
| 453 | static void suspend_hv_clock_tsc(struct clocksource *arg) |
| 454 | { |
| 455 | union hv_reference_tsc_msr tsc_msr; |
| 456 | |
| 457 | /* Disable the TSC page */ |
| 458 | tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); |
| 459 | tsc_msr.enable = 0; |
| 460 | hv_set_msr(HV_MSR_REFERENCE_TSC, value: tsc_msr.as_uint64); |
| 461 | } |
| 462 | |
| 463 | |
| 464 | static void resume_hv_clock_tsc(struct clocksource *arg) |
| 465 | { |
| 466 | union hv_reference_tsc_msr tsc_msr; |
| 467 | |
| 468 | /* Re-enable the TSC page */ |
| 469 | tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); |
| 470 | tsc_msr.enable = 1; |
| 471 | tsc_msr.pfn = tsc_pfn; |
| 472 | hv_set_msr(HV_MSR_REFERENCE_TSC, value: tsc_msr.as_uint64); |
| 473 | } |
| 474 | |
| 475 | /* |
| 476 | * Called during resume from hibernation, from overridden |
| 477 | * x86_platform.restore_sched_clock_state routine. This is to adjust offsets |
| 478 | * used to calculate time for hv tsc page based sched_clock, to account for |
| 479 | * time spent before hibernation. |
| 480 | */ |
| 481 | void hv_adj_sched_clock_offset(u64 offset) |
| 482 | { |
| 483 | hv_sched_clock_offset -= offset; |
| 484 | } |
| 485 | |
#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK
/*
 * Clocksource ->enable callback: mark the Hyper-V clock mode as usable
 * by the vDSO for userspace time reads.
 */
static int hv_cs_enable(struct clocksource *cs)
{
	vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK);
	return 0;
}
#endif
| 493 | |
| 494 | static struct clocksource hyperv_cs_tsc = { |
| 495 | .name = "hyperv_clocksource_tsc_page" , |
| 496 | .rating = 500, |
| 497 | .read = read_hv_clock_tsc_cs, |
| 498 | .mask = CLOCKSOURCE_MASK(64), |
| 499 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, |
| 500 | .suspend= suspend_hv_clock_tsc, |
| 501 | .resume = resume_hv_clock_tsc, |
| 502 | #ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK |
| 503 | .enable = hv_cs_enable, |
| 504 | .vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK, |
| 505 | #else |
| 506 | .vdso_clock_mode = VDSO_CLOCKMODE_NONE, |
| 507 | #endif |
| 508 | }; |
| 509 | |
/* Clocksource ->read callback wrapping read_hv_clock_msr(). */
static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg)
{
	return read_hv_clock_msr();
}
| 514 | |
| 515 | static struct clocksource hyperv_cs_msr = { |
| 516 | .name = "hyperv_clocksource_msr" , |
| 517 | .rating = 495, |
| 518 | .read = read_hv_clock_msr_cs, |
| 519 | .mask = CLOCKSOURCE_MASK(64), |
| 520 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, |
| 521 | }; |
| 522 | |
| 523 | /* |
| 524 | * Reference to pv_ops must be inline so objtool |
| 525 | * detection of noinstr violations can work correctly. |
| 526 | */ |
| 527 | #ifdef CONFIG_GENERIC_SCHED_CLOCK |
| 528 | static __always_inline void hv_setup_sched_clock(void *sched_clock) |
| 529 | { |
| 530 | /* |
| 531 | * We're on an architecture with generic sched clock (not x86/x64). |
| 532 | * The Hyper-V sched clock read function returns nanoseconds, not |
| 533 | * the normal 100ns units of the Hyper-V synthetic clock. |
| 534 | */ |
| 535 | sched_clock_register(sched_clock, 64, NSEC_PER_SEC); |
| 536 | } |
| 537 | #elif defined CONFIG_PARAVIRT |
| 538 | static __always_inline void hv_setup_sched_clock(void *sched_clock) |
| 539 | { |
| 540 | /* We're on x86/x64 *and* using PV ops */ |
| 541 | paravirt_set_sched_clock(func: sched_clock); |
| 542 | } |
| 543 | #else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */ |
| 544 | static __always_inline void hv_setup_sched_clock(void *sched_clock) {} |
| 545 | #endif /* CONFIG_GENERIC_SCHED_CLOCK */ |
| 546 | |
| 547 | static void __init hv_init_tsc_clocksource(void) |
| 548 | { |
| 549 | union hv_reference_tsc_msr tsc_msr; |
| 550 | |
| 551 | /* |
| 552 | * When running as a guest partition: |
| 553 | * |
| 554 | * If Hyper-V offers TSC_INVARIANT, then the virtualized TSC correctly |
| 555 | * handles frequency and offset changes due to live migration, |
| 556 | * pause/resume, and other VM management operations. So lower the |
| 557 | * Hyper-V Reference TSC rating, causing the generic TSC to be used. |
| 558 | * TSC_INVARIANT is not offered on ARM64, so the Hyper-V Reference |
| 559 | * TSC will be preferred over the virtualized ARM64 arch counter. |
| 560 | * |
| 561 | * When running as the root partition: |
| 562 | * |
| 563 | * There is no HV_ACCESS_TSC_INVARIANT feature. Always lower the rating |
| 564 | * of the Hyper-V Reference TSC. |
| 565 | */ |
| 566 | if ((ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) || |
| 567 | hv_root_partition()) { |
| 568 | hyperv_cs_tsc.rating = 250; |
| 569 | hyperv_cs_msr.rating = 245; |
| 570 | } |
| 571 | |
| 572 | if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) |
| 573 | return; |
| 574 | |
| 575 | hv_read_reference_counter = read_hv_clock_tsc; |
| 576 | |
| 577 | /* |
| 578 | * TSC page mapping works differently in root compared to guest. |
| 579 | * - In guest partition the guest PFN has to be passed to the |
| 580 | * hypervisor. |
| 581 | * - In root partition it's other way around: it has to map the PFN |
| 582 | * provided by the hypervisor. |
| 583 | * But it can't be mapped right here as it's too early and MMU isn't |
| 584 | * ready yet. So, we only set the enable bit here and will remap the |
| 585 | * page later in hv_remap_tsc_clocksource(). |
| 586 | * |
| 587 | * It worth mentioning, that TSC clocksource read function |
| 588 | * (read_hv_clock_tsc) has a MSR-based fallback mechanism, used when |
| 589 | * TSC page is zeroed (which is the case until the PFN is remapped) and |
| 590 | * thus TSC clocksource will work even without the real TSC page |
| 591 | * mapped. |
| 592 | */ |
| 593 | tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); |
| 594 | if (hv_root_partition()) |
| 595 | tsc_pfn = tsc_msr.pfn; |
| 596 | else |
| 597 | tsc_pfn = HVPFN_DOWN(virt_to_phys(tsc_page)); |
| 598 | tsc_msr.enable = 1; |
| 599 | tsc_msr.pfn = tsc_pfn; |
| 600 | hv_set_msr(HV_MSR_REFERENCE_TSC, value: tsc_msr.as_uint64); |
| 601 | |
| 602 | clocksource_register_hz(cs: &hyperv_cs_tsc, NSEC_PER_SEC/100); |
| 603 | |
| 604 | /* |
| 605 | * If TSC is invariant, then let it stay as the sched clock since it |
| 606 | * will be faster than reading the TSC page. But if not invariant, use |
| 607 | * the TSC page so that live migrations across hosts with different |
| 608 | * frequencies is handled correctly. |
| 609 | */ |
| 610 | if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) { |
| 611 | hv_sched_clock_offset = hv_read_reference_counter(); |
| 612 | hv_setup_sched_clock(sched_clock: read_hv_sched_clock_tsc); |
| 613 | } |
| 614 | } |
| 615 | |
| 616 | void __init hv_init_clocksource(void) |
| 617 | { |
| 618 | /* |
| 619 | * Try to set up the TSC page clocksource, then the MSR clocksource. |
| 620 | * At least one of these will always be available except on very old |
| 621 | * versions of Hyper-V on x86. In that case we won't have a Hyper-V |
| 622 | * clocksource, but Linux will still run with a clocksource based |
| 623 | * on the emulated PIT or LAPIC timer. |
| 624 | * |
| 625 | * Never use the MSR clocksource as sched clock. It's too slow. |
| 626 | * Better to use the native sched clock as the fallback. |
| 627 | */ |
| 628 | hv_init_tsc_clocksource(); |
| 629 | |
| 630 | if (ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE) |
| 631 | clocksource_register_hz(cs: &hyperv_cs_msr, NSEC_PER_SEC/100); |
| 632 | } |
| 633 | |
| 634 | void __init hv_remap_tsc_clocksource(void) |
| 635 | { |
| 636 | if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) |
| 637 | return; |
| 638 | |
| 639 | if (!hv_root_partition()) { |
| 640 | WARN(1, "%s: attempt to remap TSC page in guest partition\n" , |
| 641 | __func__); |
| 642 | return; |
| 643 | } |
| 644 | |
| 645 | tsc_page = memremap(offset: tsc_pfn << HV_HYP_PAGE_SHIFT, size: sizeof(tsc_pg), |
| 646 | flags: MEMREMAP_WB); |
| 647 | if (!tsc_page) |
| 648 | pr_err("Failed to remap Hyper-V TSC page.\n" ); |
| 649 | } |
| 650 | |