// SPDX-License-Identifier: GPL-2.0
/*
 * This code is used on x86_64 to create page table identity mappings on
 * demand by building up a new set of page tables (or appending to the
 * existing ones), and then switching over to them when ready.
 *
 * Copyright (C) 2015-2016 Yinghai Lu
 * Copyright (C) 2016 Kees Cook
 */

/* No MITIGATION_PAGE_TABLE_ISOLATION support needed either: */
#undef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION

#include "error.h"
#include "misc.h"

/* These actually do the work of building the kernel identity maps. */
#include <linux/pgtable.h>
#include <asm/cmpxchg.h>
#include <asm/trap_pf.h>
#include <asm/trapnr.h>
#include <asm/init.h>
/* Use the static base for this part of the boot process */
#undef __PAGE_OFFSET
#define __PAGE_OFFSET __PAGE_OFFSET_BASE
#include "../../mm/ident_map.c"

#define _SETUP
#include <asm/setup.h>	/* For COMMAND_LINE_SIZE */
#undef _SETUP

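/* Returns the address of the kernel command line (implemented in cmdline.c). */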
extern unsigned long get_cmd_line_ptr(void);

/* Used by PAGE_KERN* macros: */
pteval_t __default_kernel_pte_mask __read_mostly = ~0;

/* Used to track our page table allocation area. */
struct alloc_pgt_data {
	unsigned char *pgt_buf;
	unsigned long pgt_buf_size;
	unsigned long pgt_buf_offset;
};

/*
 * Allocates space for a page table entry, using struct alloc_pgt_data
 * above. Besides the local callers, this is used as the allocation
 * callback in mapping_info below.
 */
static void *alloc_pgt_page(void *context)
{
	struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context;
	unsigned char *entry;

	/* Validate there is space available for a new page. */
	if (pages->pgt_buf_offset >= pages->pgt_buf_size) {
		debug_putstr("out of pgt_buf in " __FILE__ "!?\n");
		debug_putaddr(pages->pgt_buf_offset);
		debug_putaddr(pages->pgt_buf_size);
		return NULL;
	}

	/* Consumed more tables than expected? */
	if (pages->pgt_buf_offset == BOOT_PGT_SIZE_WARN) {
		debug_putstr("pgt_buf running low in " __FILE__ "\n");
		debug_putstr("Need to raise BOOT_PGT_SIZE?\n");
		debug_putaddr(pages->pgt_buf_offset);
		debug_putaddr(pages->pgt_buf_size);
	}

	entry = pages->pgt_buf + pages->pgt_buf_offset;
	pages->pgt_buf_offset += PAGE_SIZE;

	return entry;
}

/* Used to track our allocated page tables. */
static struct alloc_pgt_data pgt_data;

/* The top level page table entry pointer. */
static unsigned long top_level_pgt;

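/*
 * Mask of the valid physical address bits. The memory encryption mask is
 * excluded from it in initialize_identity_maps().
 */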
phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;

/*
 * Mapping information structure passed to kernel_ident_mapping_init().
 * Due to relocation, pointers must be assigned at run time not build time.
 */
static struct x86_mapping_info mapping_info;

/*
 * Adds the specified range to the identity mappings.
 */
void kernel_add_identity_map(unsigned long start, unsigned long end)
{
	int ret;

	/* Align boundary to 2M. */
	start = round_down(start, PMD_SIZE);
	end = round_up(end, PMD_SIZE);
	if (start >= end)
		return;

	/* Build the mapping. */
	ret = kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, start, end);
	if (ret)
		error("Error: kernel_ident_mapping_init() failed\n");
}

/*
 * Locates and clears a region for a new top level page table, builds the
 * initial identity mappings (kernel image, boot_params, command line and
 * setup_data), and switches to the new page table.
 */
void initialize_identity_maps(void *rmode)
{
	unsigned long cmdline;
	struct setup_data *sd;

	/* Exclude the encryption mask from __PHYSICAL_MASK */
	physical_mask &= ~sme_me_mask;

	/* Init mapping_info with run-time function/buffer pointers. */
	mapping_info.alloc_pgt_page = alloc_pgt_page;
	mapping_info.context = &pgt_data;
	mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
	mapping_info.kernpg_flag = _KERNPG_TABLE;

	/*
	 * It should be impossible for this not to already be true,
	 * but since calling this a second time would rewind the other
	 * counters, let's just make sure this is reset too.
	 */
	pgt_data.pgt_buf_offset = 0;

	/*
	 * If we came here via startup_32(), cr3 will be _pgtable already
	 * and we must append to the existing area instead of entirely
	 * overwriting it.
	 *
	 * With 5-level paging, we use '_pgtable' to allocate the p4d page
	 * table; the top-level page table is allocated separately.
	 *
	 * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
	 * cases. On 4-level paging it's equal to 'top_level_pgt'.
	 */
	top_level_pgt = read_cr3_pa();
	if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
		pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
		pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
	} else {
		pgt_data.pgt_buf = _pgtable;
		pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
		top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
	}

	/*
	 * New page-table is set up - map the kernel image, boot_params and the
	 * command line. The uncompressed kernel requires boot_params and the
	 * command line to be mapped in the identity mapping. Map them
	 * explicitly here in case the compressed kernel does not touch them,
	 * or does not touch all the pages covering them.
	 */
	kernel_add_identity_map((unsigned long)_head, (unsigned long)_end);
	boot_params_ptr = rmode;
	kernel_add_identity_map((unsigned long)boot_params_ptr,
				(unsigned long)(boot_params_ptr + 1));
	cmdline = get_cmd_line_ptr();
	kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE);

	/*
	 * Also map the setup_data entries passed via boot_params in case they
	 * need to be accessed by the uncompressed kernel via the identity
	 * mapping.
	 */
	sd = (struct setup_data *)boot_params_ptr->hdr.setup_data;
	while (sd) {
		unsigned long sd_addr = (unsigned long)sd;

		kernel_add_identity_map(sd_addr, sd_addr + sizeof(*sd) + sd->len);
		sd = (struct setup_data *)sd->next;
	}

	sev_prep_identity_maps(top_level_pgt);

	/* Load the new page-table. */
	write_cr3(top_level_pgt);

	/*
	 * Now that the required page table mappings are established and a
	 * GHCB can be used, check for SNP guest/HV feature compatibility.
	 */
	snp_check_features();
}

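/*
 * Split the large (2M) PMD mapping covering @__address into a page table of
 * 4K PTEs. Returns a pointer to the PTE that maps @__address, or NULL if the
 * page table allocation fails.
 */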
static pte_t *split_large_pmd(struct x86_mapping_info *info,
			      pmd_t *pmdp, unsigned long __address)
{
	unsigned long page_flags;
	unsigned long address;
	pte_t *pte;
	pmd_t pmd;
	int i;

	pte = (pte_t *)info->alloc_pgt_page(info->context);
	if (!pte)
		return NULL;

	address = __address & PMD_MASK;
	/* No large page - clear PSE flag */
	page_flags = info->page_flag & ~_PAGE_PSE;

	/* Populate the PTEs */
	for (i = 0; i < PTRS_PER_PMD; i++) {
		set_pte(&pte[i], __pte(address | page_flags));
		address += PAGE_SIZE;
	}

	/*
	 * Ideally we need to clear the large PMD first and do a TLB
	 * flush before we write the new PMD. But the 2M range of the
	 * PMD might contain the code we execute and/or the stack
	 * we are on, so we can't do that. But that should be safe here
	 * because we are going from large to small mappings and we are
	 * also the only user of the page-table, so there is no chance
	 * of a TLB multihit.
	 */
	pmd = __pmd((unsigned long)pte | info->kernpg_flag);
	set_pmd(pmdp, pmd);
	/* Flush TLB to establish the new PMD */
	write_cr3(top_level_pgt);

	return pte + pte_index(__address);
}

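/*
 * Flush the page at @address from the caches. Required before changing the
 * encryption attribute of a page (see set_clr_page_flags()).
 */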
static void clflush_page(unsigned long address)
{
	unsigned int flush_size;
	char *cl, *start, *end;

	/*
	 * Hardcode cl-size to 64 - CPUID can't be used here because that might
	 * cause another #VC exception and the GHCB is not ready to use yet.
	 */
	flush_size = 64;
	start = (char *)(address & PAGE_MASK);
	end = start + PAGE_SIZE;

	/*
	 * First make sure there are no pending writes on the cache-lines to
	 * flush.
	 */
	asm volatile("mfence" : : : "memory");

	for (cl = start; cl != end; cl += flush_size)
		clflush(cl);
}

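/*
 * Set and/or clear the given flags in the PTE that maps @address, splitting
 * a large PMD mapping into individual PTEs when necessary. Takes care of the
 * cache flush and RMP page-state changes needed when the encryption bit is
 * modified.
 */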
static int set_clr_page_flags(struct x86_mapping_info *info,
			      unsigned long address,
			      pteval_t set, pteval_t clr)
{
	pgd_t *pgdp = (pgd_t *)top_level_pgt;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep, pte;

	/*
	 * First make sure there is a PMD mapping for 'address'.
	 * It should already exist, but keep things generic.
	 *
	 * To map the page just read from it and fault it in if there is no
	 * mapping yet. kernel_add_identity_map() can't be called here because
	 * that would unconditionally map the address on PMD level, destroying
	 * any PTE-level mappings that might already exist. Use assembly here
	 * so the access won't be optimized away.
	 */
	asm volatile("mov %[address], %%r9"
		     :: [address] "g" (*(unsigned long *)address)
		     : "r9", "memory");

	/*
	 * The page is mapped at least with PMD size - so skip checks and walk
	 * directly to the PMD.
	 */
	p4dp = p4d_offset(pgdp, address);
	pudp = pud_offset(p4dp, address);
	pmdp = pmd_offset(pudp, address);

	if (pmd_leaf(*pmdp))
		ptep = split_large_pmd(info, pmdp, address);
	else
		ptep = pte_offset_kernel(pmdp, address);

	if (!ptep)
		return -ENOMEM;

	/*
	 * Changing encryption attributes of a page requires flushing it from
	 * the caches.
	 */
	if ((set | clr) & _PAGE_ENC) {
		clflush_page(address);

		/*
		 * If the encryption attribute is being cleared, change the
		 * page state to shared in the RMP table.
		 */
		if (clr)
			snp_set_page_shared(__pa(address & PAGE_MASK));
	}

	/* Update PTE */
	pte = *ptep;
	pte = pte_set_flags(pte, set);
	pte = pte_clear_flags(pte, clr);
	set_pte(ptep, pte);

	/*
	 * If the encryption attribute is being set, then change the page
	 * state to private in the RMP entry. The page state change must be
	 * done after the PTE is updated.
	 */
	if (set & _PAGE_ENC)
		snp_set_page_private(__pa(address & PAGE_MASK));

	/* Flush TLB after changing encryption attribute */
	write_cr3(top_level_pgt);

	return 0;
}

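/*
 * Helpers to change the encryption or presence attribute of a single 4K
 * page in the identity mapping.
 */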
int set_page_decrypted(unsigned long address)
{
	return set_clr_page_flags(&mapping_info, address, 0, _PAGE_ENC);
}

int set_page_encrypted(unsigned long address)
{
	return set_clr_page_flags(&mapping_info, address, _PAGE_ENC, 0);
}

int set_page_non_present(unsigned long address)
{
	return set_clr_page_flags(&mapping_info, address, 0, _PAGE_PRESENT);
}

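/*
 * Print the error code, faulting address and RIP of an unexpected page
 * fault, then stop the boot process.
 */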
static void do_pf_error(const char *msg, unsigned long error_code,
			unsigned long address, unsigned long ip)
{
	error_putstr(msg);

	error_putstr("\nError Code: ");
	error_puthex(error_code);
	error_putstr("\nCR2: 0x");
	error_puthex(address);
	error_putstr("\nRIP relative to _head: 0x");
	error_puthex(ip - (unsigned long)_head);
	error_putstr("\n");

	error("Stopping.\n");
}

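/*
 * Page fault handler for the decompression stage: identity map the 2M region
 * around any not-yet-mapped address on demand. Faults with unexpected error
 * codes or on the GHCB page are fatal.
 */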
void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	unsigned long address = native_read_cr2();
	unsigned long end;
	bool ghcb_fault;

	ghcb_fault = sev_es_check_ghcb_fault(address);

	address &= PMD_MASK;
	end = address + PMD_SIZE;

	/*
	 * Check for unexpected error codes. Unexpected are:
	 *	- Faults on present pages
	 *	- User faults
	 *	- Reserved bits set
	 */
	if (error_code & (X86_PF_PROT | X86_PF_USER | X86_PF_RSVD))
		do_pf_error("Unexpected page-fault:", error_code, address, regs->ip);
	else if (ghcb_fault)
		do_pf_error("Page-fault on GHCB page:", error_code, address, regs->ip);

	/*
	 * Error code is sane - now identity map the 2M region around
	 * the faulting address.
	 */
	kernel_add_identity_map(address, end);
}

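/*
 * NMI handler for the decompression stage. NMIs are not expected here, so
 * just count the spurious ones.
 */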
void do_boot_nmi_trap(struct pt_regs *regs, unsigned long error_code)
{
	spurious_nmi_count++;
}