| 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | /* |
| 3 | * Copyright IBM Corp. 1999, 2023 |
| 4 | */ |
| 5 | |
| 6 | #include <linux/cpuhotplug.h> |
| 7 | #include <linux/sched/task.h> |
| 8 | #include <linux/errno.h> |
| 9 | #include <linux/init.h> |
| 10 | #include <linux/irq.h> |
| 11 | #include <asm/asm-extable.h> |
| 12 | #include <asm/asm-offsets.h> |
| 13 | #include <asm/pfault.h> |
| 14 | #include <asm/diag.h> |
| 15 | |
/*
 * Expected value of the upper byte (mask 0xff00) of the external interrupt
 * subcode; pfault_interrupt() ignores interrupts that do not match it.
 */
#define __SUBCODE_MASK	0x0600
/* Initial value of the 'reserved' field of the pfault init request block. */
#define __PF_RES_FIELD	0x8000000000000000UL
| 18 | |
/*
 * 'pfault' pseudo page fault routines.
 */
| 22 | static int pfault_disable; |
| 23 | |
| 24 | static int __init nopfault(char *str) |
| 25 | { |
| 26 | pfault_disable = 1; |
| 27 | return 1; |
| 28 | } |
| 29 | early_param("nopfault" , nopfault); |
| 30 | |
| 31 | struct pfault_refbk { |
| 32 | u16 refdiagc; |
| 33 | u16 reffcode; |
| 34 | u16 refdwlen; |
| 35 | u16 refversn; |
| 36 | u64 refgaddr; |
| 37 | u64 refselmk; |
| 38 | u64 refcmpmk; |
| 39 | u64 reserved; |
| 40 | }; |
| 41 | |
| 42 | static struct pfault_refbk pfault_init_refbk = { |
| 43 | .refdiagc = 0x258, |
| 44 | .reffcode = 0, |
| 45 | .refdwlen = 5, |
| 46 | .refversn = 2, |
| 47 | .refgaddr = __LC_LPP, |
| 48 | .refselmk = 1UL << 48, |
| 49 | .refcmpmk = 1UL << 48, |
| 50 | .reserved = __PF_RES_FIELD |
| 51 | }; |
| 52 | |
| 53 | int __pfault_init(void) |
| 54 | { |
| 55 | int rc = -EOPNOTSUPP; |
| 56 | |
| 57 | if (pfault_disable) |
| 58 | return rc; |
| 59 | diag_stat_inc(DIAG_STAT_X258); |
| 60 | asm_inline volatile( |
| 61 | " diag %[refbk],%[rc],0x258\n" |
| 62 | "0: nopr %%r7\n" |
| 63 | EX_TABLE(0b, 0b) |
| 64 | : [rc] "+d" (rc) |
| 65 | : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk) |
| 66 | : "cc" ); |
| 67 | return rc; |
| 68 | } |
| 69 | |
| 70 | static struct pfault_refbk pfault_fini_refbk = { |
| 71 | .refdiagc = 0x258, |
| 72 | .reffcode = 1, |
| 73 | .refdwlen = 5, |
| 74 | .refversn = 2, |
| 75 | }; |
| 76 | |
| 77 | void __pfault_fini(void) |
| 78 | { |
| 79 | if (pfault_disable) |
| 80 | return; |
| 81 | diag_stat_inc(DIAG_STAT_X258); |
| 82 | asm_inline volatile( |
| 83 | " diag %[refbk],0,0x258\n" |
| 84 | "0: nopr %%r7\n" |
| 85 | EX_TABLE(0b, 0b) |
| 86 | : |
| 87 | : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) |
| 88 | : "cc" ); |
| 89 | } |
| 90 | |
/* Held around every access to pfault_list and thread.pfault_wait in this file. */
static DEFINE_SPINLOCK(pfault_lock);
/* Tasks queued by an initial pfault interrupt, linked through thread.list. */
static LIST_HEAD(pfault_list);

/* Subcode bit: set for a completion interrupt, clear for an initial one. */
#define PF_COMPLETE	0x0080
| 95 | |
| 96 | /* |
| 97 | * The mechanism of our pfault code: if Linux is running as guest, runs a user |
| 98 | * space process and the user space process accesses a page that the host has |
| 99 | * paged out we get a pfault interrupt. |
| 100 | * |
| 101 | * This allows us, within the guest, to schedule a different process. Without |
| 102 | * this mechanism the host would have to suspend the whole virtual cpu until |
| 103 | * the page has been paged in. |
| 104 | * |
| 105 | * So when we get such an interrupt then we set the state of the current task |
| 106 | * to uninterruptible and also set the need_resched flag. Both happens within |
| 107 | * interrupt context(!). If we later on want to return to user space we |
| 108 | * recognize the need_resched flag and then call schedule(). It's not very |
| 109 | * obvious how this works... |
| 110 | * |
| 111 | * Of course we have a lot of additional fun with the completion interrupt (-> |
| 112 | * host signals that a page of a process has been paged in and the process can |
| 113 | * continue to run). This interrupt can arrive on any cpu and, since we have |
| 114 | * virtual cpus, actually appear before the interrupt that signals that a page |
| 115 | * is missing. |
| 116 | */ |
| 117 | static void pfault_interrupt(struct ext_code ext_code, |
| 118 | unsigned int param32, unsigned long param64) |
| 119 | { |
| 120 | struct task_struct *tsk; |
| 121 | __u16 subcode; |
| 122 | pid_t pid; |
| 123 | |
| 124 | /* |
| 125 | * Get the external interruption subcode & pfault initial/completion |
| 126 | * signal bit. VM stores this in the 'cpu address' field associated |
| 127 | * with the external interrupt. |
| 128 | */ |
| 129 | subcode = ext_code.subcode; |
| 130 | if ((subcode & 0xff00) != __SUBCODE_MASK) |
| 131 | return; |
| 132 | inc_irq_stat(IRQEXT_PFL); |
| 133 | /* Get the token (= pid of the affected task). */ |
| 134 | pid = param64 & LPP_PID_MASK; |
| 135 | rcu_read_lock(); |
| 136 | tsk = find_task_by_pid_ns(nr: pid, ns: &init_pid_ns); |
| 137 | if (tsk) |
| 138 | get_task_struct(t: tsk); |
| 139 | rcu_read_unlock(); |
| 140 | if (!tsk) |
| 141 | return; |
| 142 | spin_lock(lock: &pfault_lock); |
| 143 | if (subcode & PF_COMPLETE) { |
| 144 | /* signal bit is set -> a page has been swapped in by VM */ |
| 145 | if (tsk->thread.pfault_wait == 1) { |
| 146 | /* |
| 147 | * Initial interrupt was faster than the completion |
| 148 | * interrupt. pfault_wait is valid. Set pfault_wait |
| 149 | * back to zero and wake up the process. This can |
| 150 | * safely be done because the task is still sleeping |
| 151 | * and can't produce new pfaults. |
| 152 | */ |
| 153 | tsk->thread.pfault_wait = 0; |
| 154 | list_del(entry: &tsk->thread.list); |
| 155 | wake_up_process(tsk); |
| 156 | put_task_struct(t: tsk); |
| 157 | } else { |
| 158 | /* |
| 159 | * Completion interrupt was faster than initial |
| 160 | * interrupt. Set pfault_wait to -1 so the initial |
| 161 | * interrupt doesn't put the task to sleep. |
| 162 | * If the task is not running, ignore the completion |
| 163 | * interrupt since it must be a leftover of a PFAULT |
| 164 | * CANCEL operation which didn't remove all pending |
| 165 | * completion interrupts. |
| 166 | */ |
| 167 | if (task_is_running(tsk)) |
| 168 | tsk->thread.pfault_wait = -1; |
| 169 | } |
| 170 | } else { |
| 171 | /* signal bit not set -> a real page is missing. */ |
| 172 | if (WARN_ON_ONCE(tsk != current)) |
| 173 | goto out; |
| 174 | if (tsk->thread.pfault_wait == 1) { |
| 175 | /* Already on the list with a reference: put to sleep */ |
| 176 | goto block; |
| 177 | } else if (tsk->thread.pfault_wait == -1) { |
| 178 | /* |
| 179 | * Completion interrupt was faster than the initial |
| 180 | * interrupt (pfault_wait == -1). Set pfault_wait |
| 181 | * back to zero and exit. |
| 182 | */ |
| 183 | tsk->thread.pfault_wait = 0; |
| 184 | } else { |
| 185 | /* |
| 186 | * Initial interrupt arrived before completion |
| 187 | * interrupt. Let the task sleep. |
| 188 | * An extra task reference is needed since a different |
| 189 | * cpu may set the task state to TASK_RUNNING again |
| 190 | * before the scheduler is reached. |
| 191 | */ |
| 192 | get_task_struct(t: tsk); |
| 193 | tsk->thread.pfault_wait = 1; |
| 194 | list_add(new: &tsk->thread.list, head: &pfault_list); |
| 195 | block: |
| 196 | /* |
| 197 | * Since this must be a userspace fault, there |
| 198 | * is no kernel task state to trample. Rely on the |
| 199 | * return to userspace schedule() to block. |
| 200 | */ |
| 201 | __set_current_state(TASK_UNINTERRUPTIBLE); |
| 202 | set_need_resched_current(); |
| 203 | } |
| 204 | } |
| 205 | out: |
| 206 | spin_unlock(lock: &pfault_lock); |
| 207 | put_task_struct(t: tsk); |
| 208 | } |
| 209 | |
| 210 | static int pfault_cpu_dead(unsigned int cpu) |
| 211 | { |
| 212 | struct thread_struct *thread, *next; |
| 213 | struct task_struct *tsk; |
| 214 | |
| 215 | spin_lock_irq(lock: &pfault_lock); |
| 216 | list_for_each_entry_safe(thread, next, &pfault_list, list) { |
| 217 | thread->pfault_wait = 0; |
| 218 | list_del(entry: &thread->list); |
| 219 | tsk = container_of(thread, struct task_struct, thread); |
| 220 | wake_up_process(tsk); |
| 221 | put_task_struct(t: tsk); |
| 222 | } |
| 223 | spin_unlock_irq(lock: &pfault_lock); |
| 224 | return 0; |
| 225 | } |
| 226 | |
| 227 | static int __init pfault_irq_init(void) |
| 228 | { |
| 229 | int rc; |
| 230 | |
| 231 | rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); |
| 232 | if (rc) |
| 233 | goto out_extint; |
| 234 | rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; |
| 235 | if (rc) |
| 236 | goto out_pfault; |
| 237 | irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); |
| 238 | cpuhp_setup_state_nocalls(state: CPUHP_S390_PFAULT_DEAD, name: "s390/pfault:dead" , |
| 239 | NULL, teardown: pfault_cpu_dead); |
| 240 | return 0; |
| 241 | |
| 242 | out_pfault: |
| 243 | unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); |
| 244 | out_extint: |
| 245 | pfault_disable = 1; |
| 246 | return rc; |
| 247 | } |
| 248 | early_initcall(pfault_irq_init); |
| 249 | |