// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2005-2007 Red Hat GmbH
 *
 * A target that delays reads and/or writes and can send
 * them to different devices.
 *
 * This file is released under the GPL.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/delay.h>

#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "delay"

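/*
 * In kthread mode the worker sleeps for roughly the minimum configured
 * delay divided by 2^SLEEP_SHIFT (one eighth), capped at 1 ms; see the
 * worker_sleep_us computation in delay_ctr().
 */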
#define SLEEP_SHIFT 3

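/*
 * Per-class (read/write/flush) settings: the underlying device, the sector
 * offset into it, the delay in milliseconds, and the number of delayed bios
 * currently queued for this class.
 */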
struct delay_class {
	struct dm_dev *dev;
	sector_t start;
	unsigned int delay;
	unsigned int ops;
};

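/*
 * Target context.  Delayed bios sit on the delayed_bios list and are
 * released either by a timer plus workqueue (the default) or, when all
 * configured delays are small, by a dedicated kthread that polls the list
 * (see delay_is_fast() and the worker setup in delay_ctr()).
 */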
struct delay_c {
	struct timer_list delay_timer;
	struct mutex process_bios_lock; /* hold while removing bios to be processed from list */
	spinlock_t delayed_bios_lock; /* hold on all accesses to delayed_bios list */
	struct workqueue_struct *kdelayd_wq;
	struct work_struct flush_expired_bios;
	struct list_head delayed_bios;
	struct task_struct *worker;
	unsigned int worker_sleep_us;
	bool may_delay;

	struct delay_class read;
	struct delay_class write;
	struct delay_class flush;

	int argc;
};

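/*
 * Per-bio state, kept in the per-bio data area (see per_io_data_size in
 * delay_ctr()): the owning target and class, the list linkage, and the
 * jiffies value at which the bio expires and may be submitted.
 */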
struct dm_delay_info {
	struct delay_c *context;
	struct delay_class *class;
	struct list_head list;
	unsigned long expires;
};

static void handle_delayed_timer(struct timer_list *t)
{
	struct delay_c *dc = timer_container_of(dc, t, delay_timer);

	queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
}

static void queue_timeout(struct delay_c *dc, unsigned long expires)
{
	timer_reduce(&dc->delay_timer, expires);
}

static inline bool delay_is_fast(struct delay_c *dc)
{
	return !!dc->worker;
}

static void flush_bios(struct bio *bio)
{
	struct bio *n;

	while (bio) {
		n = bio->bi_next;
		bio->bi_next = NULL;
		dm_submit_bio_remap(bio, NULL);
		bio = n;
	}
}

static void flush_delayed_bios(struct delay_c *dc, bool flush_all)
{
	struct dm_delay_info *delayed, *next;
	struct bio_list flush_bio_list;
	LIST_HEAD(local_list);
	unsigned long next_expires = 0;
	bool start_timer = false;
	bio_list_init(&flush_bio_list);

	mutex_lock(&dc->process_bios_lock);
	spin_lock(&dc->delayed_bios_lock);
	list_replace_init(&dc->delayed_bios, &local_list);
	spin_unlock(&dc->delayed_bios_lock);
	list_for_each_entry_safe(delayed, next, &local_list, list) {
		cond_resched();
		if (flush_all || time_after_eq(jiffies, delayed->expires)) {
			struct bio *bio = dm_bio_from_per_bio_data(delayed,
						sizeof(struct dm_delay_info));
			list_del(&delayed->list);
			bio_list_add(&flush_bio_list, bio);
			delayed->class->ops--;
			continue;
		}

		if (!delay_is_fast(dc)) {
			if (!start_timer) {
				start_timer = true;
				next_expires = delayed->expires;
			} else {
				next_expires = min(next_expires, delayed->expires);
			}
		}
	}
	spin_lock(&dc->delayed_bios_lock);
	list_splice(&local_list, &dc->delayed_bios);
	spin_unlock(&dc->delayed_bios_lock);
	mutex_unlock(&dc->process_bios_lock);

	if (start_timer)
		queue_timeout(dc, next_expires);

	flush_bios(bio_list_get(&flush_bio_list));
}

static int flush_worker_fn(void *data)
{
	struct delay_c *dc = data;

	while (!kthread_should_stop()) {
		flush_delayed_bios(dc, false);
		spin_lock(&dc->delayed_bios_lock);
		if (unlikely(list_empty(&dc->delayed_bios))) {
			set_current_state(TASK_INTERRUPTIBLE);
			spin_unlock(&dc->delayed_bios_lock);
			schedule();
		} else {
			spin_unlock(&dc->delayed_bios_lock);
			fsleep(dc->worker_sleep_us);
			cond_resched();
		}
	}

	return 0;
}

static void flush_expired_bios(struct work_struct *work)
{
	struct delay_c *dc;

	dc = container_of(work, struct delay_c, flush_expired_bios);
	flush_delayed_bios(dc, false);
}

static void delay_dtr(struct dm_target *ti)
{
	struct delay_c *dc = ti->private;

	if (dc->kdelayd_wq) {
		timer_shutdown_sync(&dc->delay_timer);
		destroy_workqueue(dc->kdelayd_wq);
	}

	if (dc->read.dev)
		dm_put_device(ti, dc->read.dev);
	if (dc->write.dev)
		dm_put_device(ti, dc->write.dev);
	if (dc->flush.dev)
		dm_put_device(ti, dc->flush.dev);
	if (dc->worker)
		kthread_stop(dc->worker);

	mutex_destroy(&dc->process_bios_lock);

	kfree(dc);
}

static int delay_class_ctr(struct dm_target *ti, struct delay_class *c, char **argv)
{
	int ret;
	unsigned long long tmpll;
	char dummy;

	if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) {
		ti->error = "Invalid device sector";
		return -EINVAL;
	}
	c->start = tmpll;

	if (sscanf(argv[2], "%u%c", &c->delay, &dummy) != 1) {
		ti->error = "Invalid delay";
		return -EINVAL;
	}

	ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &c->dev);
	if (ret) {
		ti->error = "Device lookup failed";
		return ret;
	}

	return 0;
}

/*
 * Mapping parameters:
 *    <device> <offset> <delay>
 *    [<write_device> <write_offset> <write_delay>
 *     [<flush_device> <flush_offset> <flush_delay>]]
 *
 * With separate write parameters, the first set is only used for reads.
 * If the flush parameters are omitted, flushes use the write (or the
 * single) set.
 * Offsets are specified in sectors.
 * Delays are specified in milliseconds.
 */
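
/*
 * Example table lines (device path, length and delay values are
 * illustrative):
 *
 *   # delay all I/O by 100 ms
 *   0 2097152 delay /dev/sdb1 0 100
 *
 *   # reads pass straight through, writes and flushes delayed by 200 ms
 *   0 2097152 delay /dev/sdb1 0 0 /dev/sdb1 0 200
 */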
static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct delay_c *dc;
	int ret;
	unsigned int max_delay, min_delay;

	if (argc != 3 && argc != 6 && argc != 9) {
		ti->error = "Requires exactly 3, 6 or 9 arguments";
		return -EINVAL;
	}

	dc = kzalloc(sizeof(*dc), GFP_KERNEL);
	if (!dc) {
		ti->error = "Cannot allocate context";
		return -ENOMEM;
	}

	ti->private = dc;
	INIT_LIST_HEAD(&dc->delayed_bios);
	mutex_init(&dc->process_bios_lock);
	spin_lock_init(&dc->delayed_bios_lock);
	dc->may_delay = true;
	dc->argc = argc;

	ret = delay_class_ctr(ti, &dc->read, argv);
	if (ret)
		goto bad;
	min_delay = max_delay = dc->read.delay;

	if (argc == 3) {
		ret = delay_class_ctr(ti, &dc->write, argv);
		if (ret)
			goto bad;
		ret = delay_class_ctr(ti, &dc->flush, argv);
		if (ret)
			goto bad;
		goto out;
	}

	ret = delay_class_ctr(ti, &dc->write, argv + 3);
	if (ret)
		goto bad;
	max_delay = max(max_delay, dc->write.delay);
	min_delay = min_not_zero(min_delay, dc->write.delay);

	if (argc == 6) {
		ret = delay_class_ctr(ti, &dc->flush, argv + 3);
		if (ret)
			goto bad;
		goto out;
	}

	ret = delay_class_ctr(ti, &dc->flush, argv + 6);
	if (ret)
		goto bad;
	max_delay = max(max_delay, dc->flush.delay);
	min_delay = min_not_zero(min_delay, dc->flush.delay);

out:
	if (max_delay < 50) {
		if (min_delay >> SLEEP_SHIFT)
			dc->worker_sleep_us = 1000;
		else
			dc->worker_sleep_us = (min_delay * 1000) >> SLEEP_SHIFT;
		/*
		 * In case of small requested delays, use kthread instead of
		 * timers and workqueue to achieve better latency.
		 */
		dc->worker = kthread_run(&flush_worker_fn, dc, "dm-delay-flush-worker");
		if (IS_ERR(dc->worker)) {
			ret = PTR_ERR(dc->worker);
			dc->worker = NULL;
			goto bad;
		}
	} else {
		timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
		INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
		dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
		if (!dc->kdelayd_wq) {
			ret = -EINVAL;
			DMERR("Couldn't start kdelayd");
			goto bad;
		}
	}

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->accounts_remapped_io = true;
	ti->per_io_data_size = sizeof(struct dm_delay_info);
	return 0;

bad:
	delay_dtr(ti);
	return ret;
}

static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
{
	struct dm_delay_info *delayed;
	unsigned long expires = 0;

	if (!c->delay)
		return DM_MAPIO_REMAPPED;

	delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));

	delayed->context = dc;
	delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay);

	spin_lock(&dc->delayed_bios_lock);
	if (unlikely(!dc->may_delay)) {
		spin_unlock(&dc->delayed_bios_lock);
		return DM_MAPIO_REMAPPED;
	}
	c->ops++;
	list_add_tail(&delayed->list, &dc->delayed_bios);
	spin_unlock(&dc->delayed_bios_lock);

	if (delay_is_fast(dc))
		wake_up_process(dc->worker);
	else
		queue_timeout(dc, expires);

	return DM_MAPIO_SUBMITTED;
}

static void delay_presuspend(struct dm_target *ti)
{
	struct delay_c *dc = ti->private;

	spin_lock(&dc->delayed_bios_lock);
	dc->may_delay = false;
	spin_unlock(&dc->delayed_bios_lock);

	if (!delay_is_fast(dc))
		timer_delete(&dc->delay_timer);
	flush_delayed_bios(dc, true);
}

static void delay_resume(struct dm_target *ti)
{
	struct delay_c *dc = ti->private;

	dc->may_delay = true;
}

static int delay_map(struct dm_target *ti, struct bio *bio)
{
	struct delay_c *dc = ti->private;
	struct delay_class *c;
	struct dm_delay_info *delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));

	if (bio_data_dir(bio) == WRITE) {
		if (unlikely(bio->bi_opf & REQ_PREFLUSH))
			c = &dc->flush;
		else
			c = &dc->write;
	} else {
		c = &dc->read;
	}
	delayed->class = c;
	bio_set_dev(bio, c->dev->bdev);
	bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector);

	return delay_bio(dc, c, bio);
}

#ifdef CONFIG_BLK_DEV_ZONED
static int delay_report_zones(struct dm_target *ti,
		struct dm_report_zones_args *args, unsigned int nr_zones)
{
	struct delay_c *dc = ti->private;
	struct delay_class *c = &dc->read;

	return dm_report_zones(c->dev->bdev, c->start,
			       c->start + dm_target_offset(ti, args->next_sector),
			       args, nr_zones);
}
#else
#define delay_report_zones	NULL
#endif

#define DMEMIT_DELAY_CLASS(c) \
	DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay)
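
/*
 * Example STATUSTYPE_TABLE output for a 9-argument table (device names and
 * values illustrative):
 *   8:17 0 100 8:17 0 0 8:17 0 200
 * STATUSTYPE_INFO reports the number of delayed bios currently queued per
 * class, as "<read ops> <write ops> <flush ops>".
 */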

static void delay_status(struct dm_target *ti, status_type_t type,
			 unsigned int status_flags, char *result, unsigned int maxlen)
{
	struct delay_c *dc = ti->private;
	int sz = 0;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%u %u %u", dc->read.ops, dc->write.ops, dc->flush.ops);
		break;

	case STATUSTYPE_TABLE:
		DMEMIT_DELAY_CLASS(&dc->read);
		if (dc->argc >= 6) {
			DMEMIT(" ");
			DMEMIT_DELAY_CLASS(&dc->write);
		}
		if (dc->argc >= 9) {
			DMEMIT(" ");
			DMEMIT_DELAY_CLASS(&dc->flush);
		}
		break;

	case STATUSTYPE_IMA:
		*result = '\0';
		break;
	}
}

static int delay_iterate_devices(struct dm_target *ti,
				 iterate_devices_callout_fn fn, void *data)
{
	struct delay_c *dc = ti->private;
	int ret = 0;

	ret = fn(ti, dc->read.dev, dc->read.start, ti->len, data);
	if (ret)
		goto out;
	ret = fn(ti, dc->write.dev, dc->write.start, ti->len, data);
	if (ret)
		goto out;
	ret = fn(ti, dc->flush.dev, dc->flush.start, ti->len, data);
	if (ret)
		goto out;

out:
	return ret;
}

static struct target_type delay_target = {
	.name		 = "delay",
	.version	 = {1, 5, 0},
	.features	 = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
	.module		 = THIS_MODULE,
	.ctr		 = delay_ctr,
	.dtr		 = delay_dtr,
	.map		 = delay_map,
	.report_zones	 = delay_report_zones,
	.presuspend	 = delay_presuspend,
	.resume		 = delay_resume,
	.status		 = delay_status,
	.iterate_devices = delay_iterate_devices,
};
module_dm(delay);

MODULE_DESCRIPTION(DM_NAME " delay target");
MODULE_AUTHOR("Heinz Mauelshagen <mauelshagen@redhat.com>");
MODULE_LICENSE("GPL");