// SPDX-License-Identifier: GPL-2.0-only
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
**  Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
**
**
*******************************************************************************
******************************************************************************/

#include "dlm_internal.h"
#include "lockspace.h"
#include "member.h"
#include "dir.h"
#include "ast.h"
#include "recover.h"
#include "lowcomms.h"
#include "lock.h"
#include "requestqueue.h"
#include "recoverd.h"

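/* Build ls_masters_list: a snapshot of the active rsbs that this node
 * masters (res_nodeid == 0), each held with an extra reference so the
 * list stays valid while other nodes read it during recovery.
 */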
static int dlm_create_masters_list(struct dlm_ls *ls)
{
	struct dlm_rsb *r;
	int error = 0;

	write_lock_bh(&ls->ls_masters_lock);
	if (!list_empty(&ls->ls_masters_list)) {
		log_error(ls, "root list not empty");
		error = -EINVAL;
		goto out;
	}

	read_lock_bh(&ls->ls_rsbtbl_lock);
	list_for_each_entry(r, &ls->ls_slow_active, res_slow_list) {
		if (r->res_nodeid)
			continue;

		list_add(&r->res_masters_list, &ls->ls_masters_list);
		dlm_hold_rsb(r);
	}
	read_unlock_bh(&ls->ls_rsbtbl_lock);
 out:
	write_unlock_bh(&ls->ls_masters_lock);
	return error;
}

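/* Undo dlm_create_masters_list(): empty ls_masters_list and drop the
 * rsb references taken when the list was built.
 */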
static void dlm_release_masters_list(struct dlm_ls *ls)
{
	struct dlm_rsb *r, *safe;

	write_lock_bh(&ls->ls_masters_lock);
	list_for_each_entry_safe(r, safe, &ls->ls_masters_list, res_masters_list) {
		list_del_init(&r->res_masters_list);
		dlm_put_rsb(r);
	}
	write_unlock_bh(&ls->ls_masters_lock);
}

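/* Collect every active rsb onto root_list, holding a reference on each.
 * This list is the working set for most of the recovery routines below.
 */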
static void dlm_create_root_list(struct dlm_ls *ls, struct list_head *root_list)
{
	struct dlm_rsb *r;

	read_lock_bh(&ls->ls_rsbtbl_lock);
	list_for_each_entry(r, &ls->ls_slow_active, res_slow_list) {
		list_add(&r->res_root_list, root_list);
		dlm_hold_rsb(r);
	}

	WARN_ON_ONCE(!list_empty(&ls->ls_slow_inactive));
	read_unlock_bh(&ls->ls_rsbtbl_lock);
}

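/* Drop the root_list entries and the rsb references taken by
 * dlm_create_root_list().
 */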
static void dlm_release_root_list(struct list_head *root_list)
{
	struct dlm_rsb *r, *safe;

	list_for_each_entry_safe(r, safe, root_list, res_root_list) {
		list_del_init(&r->res_root_list);
		dlm_put_rsb(r);
	}
}

/* If the start for which we're re-enabling locking (seq) has been superseded
   by a newer stop (ls_recover_seq), we need to leave locking disabled.

   We suspend dlm_recv threads here to avoid the race where dlm_recv a) sees
   locking stopped and b) adds a message to the requestqueue, but dlm_recoverd
   enables locking and clears the requestqueue between a and b. */

static int enable_locking(struct dlm_ls *ls, uint64_t seq)
{
	int error = -EINTR;

	write_lock_bh(&ls->ls_recv_active);

	spin_lock_bh(&ls->ls_recover_lock);
	if (ls->ls_recover_seq == seq) {
		set_bit(LSFL_RUNNING, &ls->ls_flags);
		/* Schedule the next timer if recovery put something on the
		 * inactive list.
		 *
		 * The scan of rsbs queued on the inactive list during
		 * recovery hasn't started yet because LSFL_RUNNING was only
		 * set just now, and everything else in recovery hasn't
		 * started either because ls_in_recovery is still held.  So
		 * we should not run into the case where resume_scan_timer()
		 * queues a timer that ends up being a no-op.
		 */
		resume_scan_timer(ls);
		/* unblocks processes waiting to enter the dlm */
		up_write(&ls->ls_in_recovery);
		clear_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
		error = 0;
	}
	spin_unlock_bh(&ls->ls_recover_lock);

	write_unlock_bh(&ls->ls_recv_active);
	return error;
}

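/* Run one recovery cycle for the lockspace: update membership, rebuild the
 * resource directory, remaster and resend locks as needed, then re-enable
 * locking and replay queued requests.  Returns 0 on success, -EINTR if the
 * cycle was interrupted by a newer recovery event, or an error from one of
 * the recovery steps.
 */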
static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
{
	LIST_HEAD(root_list);
	unsigned long start;
	int error, neg = 0;

	log_rinfo(ls, "dlm_recover %llu", (unsigned long long)rv->seq);

	mutex_lock(&ls->ls_recoverd_active);

	dlm_callback_suspend(ls);

	dlm_clear_inactive(ls);

	/*
	 * This list of root rsb's will be the basis of most of the recovery
	 * routines.
	 */

	dlm_create_root_list(ls, &root_list);

	/*
	 * Add or remove nodes from the lockspace's ls_nodes list.
	 *
	 * Because we must report all membership changes to the lsops and
	 * midcomms layers, ls_recover() must not be aborted until this is
	 * done.
	 */

	error = dlm_recover_members(ls, rv, &neg);
	if (error) {
		log_rinfo(ls, "dlm_recover_members error %d", error);
		goto fail_root_list;
	}

	dlm_recover_dir_nodeid(ls, &root_list);

	/* Create a snapshot of all active rsbs of which we are the master.
	 * During the barrier between dlm_recover_members_wait() and
	 * dlm_recover_directory(), other nodes can collect the rsb names
	 * they need for their directory (r->res_dir_nodeid == nodeid)
	 * through the rcom communication handled by dlm_copy_master_names().
	 *
	 * TODO: keep a per-lockspace list of the rsbs that we master.
	 * Instead of building this list during recovery, track those rsbs
	 * during normal lock handling so recovery can use the list when
	 * necessary.
	 */
	error = dlm_create_masters_list(ls);
	if (error) {
		log_rinfo(ls, "dlm_create_masters_list error %d", error);
		goto fail_root_list;
	}

	ls->ls_recover_locks_in = 0;

	dlm_set_recover_status(ls, DLM_RS_NODES);

	error = dlm_recover_members_wait(ls, rv->seq);
	if (error) {
		log_rinfo(ls, "dlm_recover_members_wait error %d", error);
		dlm_release_masters_list(ls);
		goto fail_root_list;
	}

	start = jiffies;

	/*
	 * Rebuild our own share of the directory by collecting from all other
	 * nodes their master rsb names that hash to us.
	 */

	error = dlm_recover_directory(ls, rv->seq);
	if (error) {
		log_rinfo(ls, "dlm_recover_directory error %d", error);
		dlm_release_masters_list(ls);
		goto fail_root_list;
	}

	dlm_set_recover_status(ls, DLM_RS_DIR);

	error = dlm_recover_directory_wait(ls, rv->seq);
	if (error) {
		log_rinfo(ls, "dlm_recover_directory_wait error %d", error);
		dlm_release_masters_list(ls);
		goto fail_root_list;
	}

	dlm_release_masters_list(ls);

	/*
	 * We may have outstanding operations that are waiting for a reply from
	 * a failed node.  Mark these to be resent after recovery.  Unlock and
	 * cancel ops can just be completed.
	 */

	dlm_recover_waiters_pre(ls);

	if (dlm_recovery_stopped(ls)) {
		error = -EINTR;
		goto fail_root_list;
	}

	if (neg || dlm_no_directory(ls)) {
		/*
		 * Clear lkb's for departed nodes.
		 */

		dlm_recover_purge(ls, &root_list);

		/*
		 * Get new master nodeid's for rsb's that were mastered on
		 * departed nodes.
		 */

		error = dlm_recover_masters(ls, rv->seq, &root_list);
		if (error) {
			log_rinfo(ls, "dlm_recover_masters error %d", error);
			goto fail_root_list;
		}

		/*
		 * Send our locks on remastered rsb's to the new masters.
		 */

		error = dlm_recover_locks(ls, rv->seq, &root_list);
		if (error) {
			log_rinfo(ls, "dlm_recover_locks error %d", error);
			goto fail_root_list;
		}

		dlm_set_recover_status(ls, DLM_RS_LOCKS);

		error = dlm_recover_locks_wait(ls, rv->seq);
		if (error) {
			log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
			goto fail_root_list;
		}

		log_rinfo(ls, "dlm_recover_locks %u in",
			  ls->ls_recover_locks_in);

		/*
		 * Finalize state in master rsb's now that all locks can be
		 * checked.  This includes conversion resolution and lvb
		 * settings.
		 */

		dlm_recover_rsbs(ls, &root_list);
	} else {
		/*
		 * Other lockspace members may be going through the "neg" steps
		 * while also adding us to the lockspace, in which case they'll
		 * be doing the recover_locks (RS_LOCKS) barrier.
		 */
		dlm_set_recover_status(ls, DLM_RS_LOCKS);

		error = dlm_recover_locks_wait(ls, rv->seq);
		if (error) {
			log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
			goto fail_root_list;
		}
	}

	dlm_release_root_list(&root_list);

	/*
	 * Purge directory-related requests that are saved in requestqueue.
	 * All dir requests from before recovery are invalid now due to the dir
	 * rebuild and will be resent by the requesting nodes.
	 */

	dlm_purge_requestqueue(ls);

	dlm_set_recover_status(ls, DLM_RS_DONE);

	error = dlm_recover_done_wait(ls, rv->seq);
	if (error) {
		log_rinfo(ls, "dlm_recover_done_wait error %d", error);
		goto fail;
	}

	dlm_clear_members_gone(ls);

	dlm_callback_resume(ls);

	error = enable_locking(ls, rv->seq);
	if (error) {
		log_rinfo(ls, "enable_locking error %d", error);
		goto fail;
	}

	error = dlm_process_requestqueue(ls);
	if (error) {
		log_rinfo(ls, "dlm_process_requestqueue error %d", error);
		goto fail;
	}

	error = dlm_recover_waiters_post(ls);
	if (error) {
		log_rinfo(ls, "dlm_recover_waiters_post error %d", error);
		goto fail;
	}

	dlm_recover_grant(ls);

	log_rinfo(ls, "dlm_recover %llu generation %u done: %u ms",
		  (unsigned long long)rv->seq, ls->ls_generation,
		  jiffies_to_msecs(jiffies - start));
	mutex_unlock(&ls->ls_recoverd_active);

	return 0;

 fail_root_list:
	dlm_release_root_list(&root_list);
 fail:
	mutex_unlock(&ls->ls_recoverd_active);

	return error;
}

/* The dlm_ls_start() that created the rv we take here may already have been
   stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP
   flag set. */

static void do_ls_recovery(struct dlm_ls *ls)
{
	struct dlm_recover *rv = NULL;
	int error;

	spin_lock_bh(&ls->ls_recover_lock);
	rv = ls->ls_recover_args;
	ls->ls_recover_args = NULL;
	if (rv && ls->ls_recover_seq == rv->seq)
		clear_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
	spin_unlock_bh(&ls->ls_recover_lock);

	if (rv) {
		error = ls_recover(ls, rv);
		switch (error) {
		case 0:
			ls->ls_recovery_result = 0;
			complete(&ls->ls_recovery_done);

			dlm_lsop_recover_done(ls);
			break;
		case -EINTR:
			/* If recovery was interrupted with -EINTR, wait for
			 * the next ls_recover() iteration, which will
			 * hopefully succeed.
			 */
			log_rinfo(ls, "%s %llu interrupted and should be queued to run again",
				  __func__, (unsigned long long)rv->seq);
			break;
		default:
			log_rinfo(ls, "%s %llu error %d", __func__,
				  (unsigned long long)rv->seq, error);

			/* let new_lockspace() know about the critical error */
			ls->ls_recovery_result = error;
			complete(&ls->ls_recovery_done);
			break;
		}

		kfree(rv->nodes);
		kfree(rv);
	}
}

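/* Recovery daemon: one kernel thread per lockspace.  It sleeps until
 * LSFL_RECOVER_WORK or LSFL_RECOVER_DOWN is set, takes ls_in_recovery to
 * block normal locking when the lockspace goes down, and runs
 * do_ls_recovery() for each queued recovery event.
 */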
static int dlm_recoverd(void *arg)
{
	struct dlm_ls *ls;

	ls = dlm_find_lockspace_local(arg);
	if (!ls) {
		log_print("dlm_recoverd: no lockspace %p", arg);
		return -1;
	}

	down_write(&ls->ls_in_recovery);
	set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
	wake_up(&ls->ls_recover_lock_wait);

	while (1) {
		/*
		 * Call kthread_should_stop() after set_current_state() so
		 * that a kthread_stop() issued just before
		 * set_current_state() is not missed.
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		if (kthread_should_stop()) {
			set_current_state(TASK_RUNNING);
			break;
		}
		if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) &&
		    !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) {
			if (kthread_should_stop())
				break;
			schedule();
		}
		set_current_state(TASK_RUNNING);

		if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) {
			down_write(&ls->ls_in_recovery);
			set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
			wake_up(&ls->ls_recover_lock_wait);
		}

		if (test_and_clear_bit(LSFL_RECOVER_WORK, &ls->ls_flags))
			do_ls_recovery(ls);
	}

	if (test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags))
		up_write(&ls->ls_in_recovery);

	dlm_put_lockspace(ls);
	return 0;
}

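/* Start the per-lockspace recovery thread.  The task is saved in
 * ls_recoverd_task so dlm_recoverd_stop() can stop it later.
 */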
int dlm_recoverd_start(struct dlm_ls *ls)
{
	struct task_struct *p;
	int error = 0;

	p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
	if (IS_ERR(p))
		error = PTR_ERR(p);
	else
		ls->ls_recoverd_task = p;
	return error;
}

void dlm_recoverd_stop(struct dlm_ls *ls)
{
	kthread_stop(ls->ls_recoverd_task);
}

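/* Suspend and resume bracket sections that must not race with an active
 * recovery cycle; the wake_up on ls_wait_general kicks any recovery step
 * waiting there so taking ls_recoverd_active does not block behind it.
 */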
void dlm_recoverd_suspend(struct dlm_ls *ls)
{
	wake_up(&ls->ls_wait_general);
	mutex_lock(&ls->ls_recoverd_active);
}

void dlm_recoverd_resume(struct dlm_ls *ls)
{
	mutex_unlock(&ls->ls_recoverd_active);
}