| 1 | // SPDX-License-Identifier: GPL-2.0-only |
| 2 | /* |
| 3 | * Copyright (c) 2025, Christoph Hellwig. |
| 4 | * Copyright (c) 2025, Western Digital Corporation or its affiliates. |
| 5 | * |
| 6 | * Zoned Loop Device driver - exports a zoned block device using one file per |
| 7 | * zone as backing storage. |
| 8 | */ |
| 9 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| 10 | |
| 11 | #include <linux/module.h> |
| 12 | #include <linux/blk-mq.h> |
| 13 | #include <linux/blkzoned.h> |
| 14 | #include <linux/pagemap.h> |
| 15 | #include <linux/miscdevice.h> |
| 16 | #include <linux/falloc.h> |
| 17 | #include <linux/mutex.h> |
| 18 | #include <linux/parser.h> |
| 19 | #include <linux/seq_file.h> |
| 20 | |
| 21 | /* |
| 22 | * Options for adding (and removing) a device. |
| 23 | */ |
/* One flag bit per option; ZLOOP_OPT_ERR (0) is the "no match" token. */
enum {
	ZLOOP_OPT_ERR			= 0x000,
	ZLOOP_OPT_ID			= 0x001,
	ZLOOP_OPT_CAPACITY		= 0x002,
	ZLOOP_OPT_ZONE_SIZE		= 0x004,
	ZLOOP_OPT_ZONE_CAPACITY		= 0x008,
	ZLOOP_OPT_NR_CONV_ZONES		= 0x010,
	ZLOOP_OPT_BASE_DIR		= 0x020,
	ZLOOP_OPT_NR_QUEUES		= 0x040,
	ZLOOP_OPT_QUEUE_DEPTH		= 0x080,
	ZLOOP_OPT_BUFFERED_IO		= 0x100,
	ZLOOP_OPT_ZONE_APPEND		= 0x200,
	ZLOOP_OPT_ORDERED_ZONE_APPEND	= 0x400,
};
| 38 | |
| 39 | static const match_table_t zloop_opt_tokens = { |
| 40 | { ZLOOP_OPT_ID, "id=%d" }, |
| 41 | { ZLOOP_OPT_CAPACITY, "capacity_mb=%u" }, |
| 42 | { ZLOOP_OPT_ZONE_SIZE, "zone_size_mb=%u" }, |
| 43 | { ZLOOP_OPT_ZONE_CAPACITY, "zone_capacity_mb=%u" }, |
| 44 | { ZLOOP_OPT_NR_CONV_ZONES, "conv_zones=%u" }, |
| 45 | { ZLOOP_OPT_BASE_DIR, "base_dir=%s" }, |
| 46 | { ZLOOP_OPT_NR_QUEUES, "nr_queues=%u" }, |
| 47 | { ZLOOP_OPT_QUEUE_DEPTH, "queue_depth=%u" }, |
| 48 | { ZLOOP_OPT_BUFFERED_IO, "buffered_io" }, |
| 49 | { ZLOOP_OPT_ZONE_APPEND, "zone_append=%u" }, |
| 50 | { ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append" }, |
| 51 | { ZLOOP_OPT_ERR, NULL } |
| 52 | }; |
| 53 | |
/*
 * Default values for the "add" operation.
 *
 * ZLOOP_DEF_ID is negative; zloop_ctl_add() only requests a specific device
 * id when the id option is >= 0. The expansion is parenthesized so that it
 * stays a single operand wherever the macro is used.
 * ZLOOP_DEF_ZONE_SIZE is expressed in 512-byte sectors (256 MiB).
 */
#define ZLOOP_DEF_ID			(-1)
#define ZLOOP_DEF_ZONE_SIZE		((256ULL * SZ_1M) >> SECTOR_SHIFT)
#define ZLOOP_DEF_NR_ZONES		64
#define ZLOOP_DEF_NR_CONV_ZONES		8
#define ZLOOP_DEF_BASE_DIR		"/var/local/zloop"
#define ZLOOP_DEF_NR_QUEUES		1
#define ZLOOP_DEF_QUEUE_DEPTH		128
#define ZLOOP_DEF_BUFFERED_IO		false
#define ZLOOP_DEF_ZONE_APPEND		true
#define ZLOOP_DEF_ORDERED_ZONE_APPEND	false

/* Arbitrary limit on the zone size (16GB). */
#define ZLOOP_MAX_ZONE_SIZE_MB		16384
| 68 | |
| 69 | struct zloop_options { |
| 70 | unsigned int mask; |
| 71 | int id; |
| 72 | sector_t capacity; |
| 73 | sector_t zone_size; |
| 74 | sector_t zone_capacity; |
| 75 | unsigned int nr_conv_zones; |
| 76 | char *base_dir; |
| 77 | unsigned int nr_queues; |
| 78 | unsigned int queue_depth; |
| 79 | bool buffered_io; |
| 80 | bool zone_append; |
| 81 | bool ordered_zone_append; |
| 82 | }; |
| 83 | |
/*
 * Device states.
 *
 * A device is marked Zlo_creating right after allocation. Opening the disk
 * only succeeds in the Zlo_live state, and requests queued while the device
 * is Zlo_deleting are failed with an I/O error.
 */
enum {
	Zlo_creating	= 0,
	Zlo_live	= 1,
	Zlo_deleting	= 2,
};
| 92 | |
/* Bit numbers used with the bitops helpers on struct zloop_zone::flags. */
enum zloop_zone_flags {
	/* The zone is a conventional zone. */
	ZLOOP_ZONE_CONV		= 0,
	/* The zone state must be re-read from its backing file size. */
	ZLOOP_ZONE_SEQ_ERROR	= 1,
};
| 97 | |
| 98 | struct zloop_zone { |
| 99 | struct file *file; |
| 100 | |
| 101 | unsigned long flags; |
| 102 | struct mutex lock; |
| 103 | spinlock_t wp_lock; |
| 104 | enum blk_zone_cond cond; |
| 105 | sector_t start; |
| 106 | sector_t wp; |
| 107 | |
| 108 | gfp_t old_gfp_mask; |
| 109 | }; |
| 110 | |
| 111 | struct zloop_device { |
| 112 | unsigned int id; |
| 113 | unsigned int state; |
| 114 | |
| 115 | struct blk_mq_tag_set tag_set; |
| 116 | struct gendisk *disk; |
| 117 | |
| 118 | struct workqueue_struct *workqueue; |
| 119 | bool buffered_io; |
| 120 | bool zone_append; |
| 121 | bool ordered_zone_append; |
| 122 | |
| 123 | const char *base_dir; |
| 124 | struct file *data_dir; |
| 125 | |
| 126 | unsigned int zone_shift; |
| 127 | sector_t zone_size; |
| 128 | sector_t zone_capacity; |
| 129 | unsigned int nr_zones; |
| 130 | unsigned int nr_conv_zones; |
| 131 | unsigned int block_size; |
| 132 | |
| 133 | struct zloop_zone zones[] __counted_by(nr_zones); |
| 134 | }; |
| 135 | |
| 136 | struct zloop_cmd { |
| 137 | struct work_struct work; |
| 138 | atomic_t ref; |
| 139 | sector_t sector; |
| 140 | sector_t nr_sectors; |
| 141 | long ret; |
| 142 | struct kiocb iocb; |
| 143 | struct bio_vec *bvec; |
| 144 | }; |
| 145 | |
/* Taken by zloop_open() and by the device control operations. */
static DEFINE_MUTEX(zloop_ctl_mutex);
/* Maps device ids to their struct zloop_device. */
static DEFINE_IDR(zloop_index_idr);
| 148 | |
| 149 | static unsigned int rq_zone_no(struct request *rq) |
| 150 | { |
| 151 | struct zloop_device *zlo = rq->q->queuedata; |
| 152 | |
| 153 | return blk_rq_pos(rq) >> zlo->zone_shift; |
| 154 | } |
| 155 | |
| 156 | static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no) |
| 157 | { |
| 158 | struct zloop_zone *zone = &zlo->zones[zone_no]; |
| 159 | struct kstat stat; |
| 160 | sector_t file_sectors; |
| 161 | unsigned long flags; |
| 162 | int ret; |
| 163 | |
| 164 | lockdep_assert_held(&zone->lock); |
| 165 | |
| 166 | ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0); |
| 167 | if (ret < 0) { |
| 168 | pr_err("Failed to get zone %u file stat (err=%d)\n" , |
| 169 | zone_no, ret); |
| 170 | set_bit(nr: ZLOOP_ZONE_SEQ_ERROR, addr: &zone->flags); |
| 171 | return ret; |
| 172 | } |
| 173 | |
| 174 | file_sectors = stat.size >> SECTOR_SHIFT; |
| 175 | if (file_sectors > zlo->zone_capacity) { |
| 176 | pr_err("Zone %u file too large (%llu sectors > %llu)\n" , |
| 177 | zone_no, file_sectors, zlo->zone_capacity); |
| 178 | return -EINVAL; |
| 179 | } |
| 180 | |
| 181 | if (file_sectors & ((zlo->block_size >> SECTOR_SHIFT) - 1)) { |
| 182 | pr_err("Zone %u file size not aligned to block size %u\n" , |
| 183 | zone_no, zlo->block_size); |
| 184 | return -EINVAL; |
| 185 | } |
| 186 | |
| 187 | spin_lock_irqsave(&zone->wp_lock, flags); |
| 188 | if (!file_sectors) { |
| 189 | zone->cond = BLK_ZONE_COND_EMPTY; |
| 190 | zone->wp = zone->start; |
| 191 | } else if (file_sectors == zlo->zone_capacity) { |
| 192 | zone->cond = BLK_ZONE_COND_FULL; |
| 193 | zone->wp = ULLONG_MAX; |
| 194 | } else { |
| 195 | zone->cond = BLK_ZONE_COND_CLOSED; |
| 196 | zone->wp = zone->start + file_sectors; |
| 197 | } |
| 198 | spin_unlock_irqrestore(lock: &zone->wp_lock, flags); |
| 199 | |
| 200 | return 0; |
| 201 | } |
| 202 | |
| 203 | static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no) |
| 204 | { |
| 205 | struct zloop_zone *zone = &zlo->zones[zone_no]; |
| 206 | int ret = 0; |
| 207 | |
| 208 | if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) |
| 209 | return -EIO; |
| 210 | |
| 211 | mutex_lock(&zone->lock); |
| 212 | |
| 213 | if (test_and_clear_bit(nr: ZLOOP_ZONE_SEQ_ERROR, addr: &zone->flags)) { |
| 214 | ret = zloop_update_seq_zone(zlo, zone_no); |
| 215 | if (ret) |
| 216 | goto unlock; |
| 217 | } |
| 218 | |
| 219 | switch (zone->cond) { |
| 220 | case BLK_ZONE_COND_EXP_OPEN: |
| 221 | break; |
| 222 | case BLK_ZONE_COND_EMPTY: |
| 223 | case BLK_ZONE_COND_CLOSED: |
| 224 | case BLK_ZONE_COND_IMP_OPEN: |
| 225 | zone->cond = BLK_ZONE_COND_EXP_OPEN; |
| 226 | break; |
| 227 | case BLK_ZONE_COND_FULL: |
| 228 | default: |
| 229 | ret = -EIO; |
| 230 | break; |
| 231 | } |
| 232 | |
| 233 | unlock: |
| 234 | mutex_unlock(lock: &zone->lock); |
| 235 | |
| 236 | return ret; |
| 237 | } |
| 238 | |
| 239 | static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no) |
| 240 | { |
| 241 | struct zloop_zone *zone = &zlo->zones[zone_no]; |
| 242 | unsigned long flags; |
| 243 | int ret = 0; |
| 244 | |
| 245 | if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) |
| 246 | return -EIO; |
| 247 | |
| 248 | mutex_lock(&zone->lock); |
| 249 | |
| 250 | if (test_and_clear_bit(nr: ZLOOP_ZONE_SEQ_ERROR, addr: &zone->flags)) { |
| 251 | ret = zloop_update_seq_zone(zlo, zone_no); |
| 252 | if (ret) |
| 253 | goto unlock; |
| 254 | } |
| 255 | |
| 256 | switch (zone->cond) { |
| 257 | case BLK_ZONE_COND_CLOSED: |
| 258 | break; |
| 259 | case BLK_ZONE_COND_IMP_OPEN: |
| 260 | case BLK_ZONE_COND_EXP_OPEN: |
| 261 | spin_lock_irqsave(&zone->wp_lock, flags); |
| 262 | if (zone->wp == zone->start) |
| 263 | zone->cond = BLK_ZONE_COND_EMPTY; |
| 264 | else |
| 265 | zone->cond = BLK_ZONE_COND_CLOSED; |
| 266 | spin_unlock_irqrestore(lock: &zone->wp_lock, flags); |
| 267 | break; |
| 268 | case BLK_ZONE_COND_EMPTY: |
| 269 | case BLK_ZONE_COND_FULL: |
| 270 | default: |
| 271 | ret = -EIO; |
| 272 | break; |
| 273 | } |
| 274 | |
| 275 | unlock: |
| 276 | mutex_unlock(lock: &zone->lock); |
| 277 | |
| 278 | return ret; |
| 279 | } |
| 280 | |
| 281 | static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no) |
| 282 | { |
| 283 | struct zloop_zone *zone = &zlo->zones[zone_no]; |
| 284 | unsigned long flags; |
| 285 | int ret = 0; |
| 286 | |
| 287 | if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) |
| 288 | return -EIO; |
| 289 | |
| 290 | mutex_lock(&zone->lock); |
| 291 | |
| 292 | if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) && |
| 293 | zone->cond == BLK_ZONE_COND_EMPTY) |
| 294 | goto unlock; |
| 295 | |
| 296 | if (vfs_truncate(&zone->file->f_path, 0)) { |
| 297 | set_bit(nr: ZLOOP_ZONE_SEQ_ERROR, addr: &zone->flags); |
| 298 | ret = -EIO; |
| 299 | goto unlock; |
| 300 | } |
| 301 | |
| 302 | spin_lock_irqsave(&zone->wp_lock, flags); |
| 303 | zone->cond = BLK_ZONE_COND_EMPTY; |
| 304 | zone->wp = zone->start; |
| 305 | clear_bit(nr: ZLOOP_ZONE_SEQ_ERROR, addr: &zone->flags); |
| 306 | spin_unlock_irqrestore(lock: &zone->wp_lock, flags); |
| 307 | |
| 308 | unlock: |
| 309 | mutex_unlock(lock: &zone->lock); |
| 310 | |
| 311 | return ret; |
| 312 | } |
| 313 | |
| 314 | static int zloop_reset_all_zones(struct zloop_device *zlo) |
| 315 | { |
| 316 | unsigned int i; |
| 317 | int ret; |
| 318 | |
| 319 | for (i = zlo->nr_conv_zones; i < zlo->nr_zones; i++) { |
| 320 | ret = zloop_reset_zone(zlo, zone_no: i); |
| 321 | if (ret) |
| 322 | return ret; |
| 323 | } |
| 324 | |
| 325 | return 0; |
| 326 | } |
| 327 | |
| 328 | static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no) |
| 329 | { |
| 330 | struct zloop_zone *zone = &zlo->zones[zone_no]; |
| 331 | unsigned long flags; |
| 332 | int ret = 0; |
| 333 | |
| 334 | if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) |
| 335 | return -EIO; |
| 336 | |
| 337 | mutex_lock(&zone->lock); |
| 338 | |
| 339 | if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) && |
| 340 | zone->cond == BLK_ZONE_COND_FULL) |
| 341 | goto unlock; |
| 342 | |
| 343 | if (vfs_truncate(&zone->file->f_path, zlo->zone_size << SECTOR_SHIFT)) { |
| 344 | set_bit(nr: ZLOOP_ZONE_SEQ_ERROR, addr: &zone->flags); |
| 345 | ret = -EIO; |
| 346 | goto unlock; |
| 347 | } |
| 348 | |
| 349 | spin_lock_irqsave(&zone->wp_lock, flags); |
| 350 | zone->cond = BLK_ZONE_COND_FULL; |
| 351 | zone->wp = ULLONG_MAX; |
| 352 | clear_bit(nr: ZLOOP_ZONE_SEQ_ERROR, addr: &zone->flags); |
| 353 | spin_unlock_irqrestore(lock: &zone->wp_lock, flags); |
| 354 | |
| 355 | unlock: |
| 356 | mutex_unlock(lock: &zone->lock); |
| 357 | |
| 358 | return ret; |
| 359 | } |
| 360 | |
| 361 | static void zloop_put_cmd(struct zloop_cmd *cmd) |
| 362 | { |
| 363 | struct request *rq = blk_mq_rq_from_pdu(pdu: cmd); |
| 364 | |
| 365 | if (!atomic_dec_and_test(v: &cmd->ref)) |
| 366 | return; |
| 367 | kfree(objp: cmd->bvec); |
| 368 | cmd->bvec = NULL; |
| 369 | if (likely(!blk_should_fake_timeout(rq->q))) |
| 370 | blk_mq_complete_request(rq); |
| 371 | } |
| 372 | |
| 373 | static void zloop_rw_complete(struct kiocb *iocb, long ret) |
| 374 | { |
| 375 | struct zloop_cmd *cmd = container_of(iocb, struct zloop_cmd, iocb); |
| 376 | |
| 377 | cmd->ret = ret; |
| 378 | zloop_put_cmd(cmd); |
| 379 | } |
| 380 | |
| 381 | static void zloop_rw(struct zloop_cmd *cmd) |
| 382 | { |
| 383 | struct request *rq = blk_mq_rq_from_pdu(pdu: cmd); |
| 384 | struct zloop_device *zlo = rq->q->queuedata; |
| 385 | unsigned int zone_no = rq_zone_no(rq); |
| 386 | sector_t sector = blk_rq_pos(rq); |
| 387 | sector_t nr_sectors = blk_rq_sectors(rq); |
| 388 | bool is_append = req_op(req: rq) == REQ_OP_ZONE_APPEND; |
| 389 | bool is_write = req_op(req: rq) == REQ_OP_WRITE || is_append; |
| 390 | int rw = is_write ? ITER_SOURCE : ITER_DEST; |
| 391 | struct req_iterator rq_iter; |
| 392 | struct zloop_zone *zone; |
| 393 | struct iov_iter iter; |
| 394 | struct bio_vec tmp; |
| 395 | unsigned long flags; |
| 396 | sector_t zone_end; |
| 397 | unsigned int nr_bvec; |
| 398 | int ret; |
| 399 | |
| 400 | atomic_set(v: &cmd->ref, i: 2); |
| 401 | cmd->sector = sector; |
| 402 | cmd->nr_sectors = nr_sectors; |
| 403 | cmd->ret = 0; |
| 404 | |
| 405 | if (WARN_ON_ONCE(is_append && !zlo->zone_append)) { |
| 406 | ret = -EIO; |
| 407 | goto out; |
| 408 | } |
| 409 | |
| 410 | /* We should never get an I/O beyond the device capacity. */ |
| 411 | if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) { |
| 412 | ret = -EIO; |
| 413 | goto out; |
| 414 | } |
| 415 | zone = &zlo->zones[zone_no]; |
| 416 | zone_end = zone->start + zlo->zone_capacity; |
| 417 | |
| 418 | /* |
| 419 | * The block layer should never send requests that are not fully |
| 420 | * contained within the zone. |
| 421 | */ |
| 422 | if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) { |
| 423 | ret = -EIO; |
| 424 | goto out; |
| 425 | } |
| 426 | |
| 427 | if (test_and_clear_bit(nr: ZLOOP_ZONE_SEQ_ERROR, addr: &zone->flags)) { |
| 428 | mutex_lock(&zone->lock); |
| 429 | ret = zloop_update_seq_zone(zlo, zone_no); |
| 430 | mutex_unlock(lock: &zone->lock); |
| 431 | if (ret) |
| 432 | goto out; |
| 433 | } |
| 434 | |
| 435 | if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) { |
| 436 | mutex_lock(&zone->lock); |
| 437 | |
| 438 | spin_lock_irqsave(&zone->wp_lock, flags); |
| 439 | |
| 440 | /* |
| 441 | * Zone append operations always go at the current write |
| 442 | * pointer, but regular write operations must already be |
| 443 | * aligned to the write pointer when submitted. |
| 444 | */ |
| 445 | if (is_append) { |
| 446 | /* |
| 447 | * If ordered zone append is in use, we already checked |
| 448 | * and set the target sector in zloop_queue_rq(). |
| 449 | */ |
| 450 | if (!zlo->ordered_zone_append) { |
| 451 | if (zone->cond == BLK_ZONE_COND_FULL || |
| 452 | zone->wp + nr_sectors > zone_end) { |
| 453 | spin_unlock_irqrestore(lock: &zone->wp_lock, |
| 454 | flags); |
| 455 | ret = -EIO; |
| 456 | goto unlock; |
| 457 | } |
| 458 | sector = zone->wp; |
| 459 | } |
| 460 | cmd->sector = sector; |
| 461 | } else if (sector != zone->wp) { |
| 462 | spin_unlock_irqrestore(lock: &zone->wp_lock, flags); |
| 463 | pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n" , |
| 464 | zone_no, sector, zone->wp); |
| 465 | ret = -EIO; |
| 466 | goto unlock; |
| 467 | } |
| 468 | |
| 469 | /* Implicitly open the target zone. */ |
| 470 | if (zone->cond == BLK_ZONE_COND_CLOSED || |
| 471 | zone->cond == BLK_ZONE_COND_EMPTY) |
| 472 | zone->cond = BLK_ZONE_COND_IMP_OPEN; |
| 473 | |
| 474 | /* |
| 475 | * Advance the write pointer, unless ordered zone append is in |
| 476 | * use. If the write fails, the write pointer position will be |
| 477 | * corrected when the next I/O starts execution. |
| 478 | */ |
| 479 | if (!is_append || !zlo->ordered_zone_append) { |
| 480 | zone->wp += nr_sectors; |
| 481 | if (zone->wp == zone_end) { |
| 482 | zone->cond = BLK_ZONE_COND_FULL; |
| 483 | zone->wp = ULLONG_MAX; |
| 484 | } |
| 485 | } |
| 486 | |
| 487 | spin_unlock_irqrestore(lock: &zone->wp_lock, flags); |
| 488 | } |
| 489 | |
| 490 | nr_bvec = blk_rq_nr_bvec(rq); |
| 491 | |
| 492 | if (rq->bio != rq->biotail) { |
| 493 | struct bio_vec *bvec; |
| 494 | |
| 495 | cmd->bvec = kmalloc_array(nr_bvec, sizeof(*cmd->bvec), GFP_NOIO); |
| 496 | if (!cmd->bvec) { |
| 497 | ret = -EIO; |
| 498 | goto unlock; |
| 499 | } |
| 500 | |
| 501 | /* |
| 502 | * The bios of the request may be started from the middle of |
| 503 | * the 'bvec' because of bio splitting, so we can't directly |
| 504 | * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec |
| 505 | * API will take care of all details for us. |
| 506 | */ |
| 507 | bvec = cmd->bvec; |
| 508 | rq_for_each_bvec(tmp, rq, rq_iter) { |
| 509 | *bvec = tmp; |
| 510 | bvec++; |
| 511 | } |
| 512 | iov_iter_bvec(i: &iter, direction: rw, bvec: cmd->bvec, nr_segs: nr_bvec, count: blk_rq_bytes(rq)); |
| 513 | } else { |
| 514 | /* |
| 515 | * Same here, this bio may be started from the middle of the |
| 516 | * 'bvec' because of bio splitting, so offset from the bvec |
| 517 | * must be passed to iov iterator |
| 518 | */ |
| 519 | iov_iter_bvec(i: &iter, direction: rw, |
| 520 | __bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter), |
| 521 | nr_segs: nr_bvec, count: blk_rq_bytes(rq)); |
| 522 | iter.iov_offset = rq->bio->bi_iter.bi_bvec_done; |
| 523 | } |
| 524 | |
| 525 | cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT; |
| 526 | cmd->iocb.ki_filp = zone->file; |
| 527 | cmd->iocb.ki_complete = zloop_rw_complete; |
| 528 | if (!zlo->buffered_io) |
| 529 | cmd->iocb.ki_flags = IOCB_DIRECT; |
| 530 | cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); |
| 531 | |
| 532 | if (rw == ITER_SOURCE) |
| 533 | ret = zone->file->f_op->write_iter(&cmd->iocb, &iter); |
| 534 | else |
| 535 | ret = zone->file->f_op->read_iter(&cmd->iocb, &iter); |
| 536 | unlock: |
| 537 | if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) |
| 538 | mutex_unlock(lock: &zone->lock); |
| 539 | out: |
| 540 | if (ret != -EIOCBQUEUED) |
| 541 | zloop_rw_complete(iocb: &cmd->iocb, ret); |
| 542 | zloop_put_cmd(cmd); |
| 543 | } |
| 544 | |
| 545 | static void zloop_handle_cmd(struct zloop_cmd *cmd) |
| 546 | { |
| 547 | struct request *rq = blk_mq_rq_from_pdu(pdu: cmd); |
| 548 | struct zloop_device *zlo = rq->q->queuedata; |
| 549 | |
| 550 | /* We can block in this context, so ignore REQ_NOWAIT. */ |
| 551 | if (rq->cmd_flags & REQ_NOWAIT) |
| 552 | rq->cmd_flags &= ~REQ_NOWAIT; |
| 553 | |
| 554 | switch (req_op(req: rq)) { |
| 555 | case REQ_OP_READ: |
| 556 | case REQ_OP_WRITE: |
| 557 | case REQ_OP_ZONE_APPEND: |
| 558 | /* |
| 559 | * zloop_rw() always executes asynchronously or completes |
| 560 | * directly. |
| 561 | */ |
| 562 | zloop_rw(cmd); |
| 563 | return; |
| 564 | case REQ_OP_FLUSH: |
| 565 | /* |
| 566 | * Sync the entire FS containing the zone files instead of |
| 567 | * walking all files |
| 568 | */ |
| 569 | cmd->ret = sync_filesystem(file_inode(f: zlo->data_dir)->i_sb); |
| 570 | break; |
| 571 | case REQ_OP_ZONE_RESET: |
| 572 | cmd->ret = zloop_reset_zone(zlo, zone_no: rq_zone_no(rq)); |
| 573 | break; |
| 574 | case REQ_OP_ZONE_RESET_ALL: |
| 575 | cmd->ret = zloop_reset_all_zones(zlo); |
| 576 | break; |
| 577 | case REQ_OP_ZONE_FINISH: |
| 578 | cmd->ret = zloop_finish_zone(zlo, zone_no: rq_zone_no(rq)); |
| 579 | break; |
| 580 | case REQ_OP_ZONE_OPEN: |
| 581 | cmd->ret = zloop_open_zone(zlo, zone_no: rq_zone_no(rq)); |
| 582 | break; |
| 583 | case REQ_OP_ZONE_CLOSE: |
| 584 | cmd->ret = zloop_close_zone(zlo, zone_no: rq_zone_no(rq)); |
| 585 | break; |
| 586 | default: |
| 587 | WARN_ON_ONCE(1); |
| 588 | pr_err("Unsupported operation %d\n" , req_op(rq)); |
| 589 | cmd->ret = -EOPNOTSUPP; |
| 590 | break; |
| 591 | } |
| 592 | |
| 593 | blk_mq_complete_request(rq); |
| 594 | } |
| 595 | |
| 596 | static void zloop_cmd_workfn(struct work_struct *work) |
| 597 | { |
| 598 | struct zloop_cmd *cmd = container_of(work, struct zloop_cmd, work); |
| 599 | int orig_flags = current->flags; |
| 600 | |
| 601 | current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; |
| 602 | zloop_handle_cmd(cmd); |
| 603 | current->flags = orig_flags; |
| 604 | } |
| 605 | |
| 606 | static void zloop_complete_rq(struct request *rq) |
| 607 | { |
| 608 | struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq); |
| 609 | struct zloop_device *zlo = rq->q->queuedata; |
| 610 | unsigned int zone_no = cmd->sector >> zlo->zone_shift; |
| 611 | struct zloop_zone *zone = &zlo->zones[zone_no]; |
| 612 | blk_status_t sts = BLK_STS_OK; |
| 613 | |
| 614 | switch (req_op(req: rq)) { |
| 615 | case REQ_OP_READ: |
| 616 | if (cmd->ret < 0) |
| 617 | pr_err("Zone %u: failed read sector %llu, %llu sectors\n" , |
| 618 | zone_no, cmd->sector, cmd->nr_sectors); |
| 619 | |
| 620 | if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) { |
| 621 | /* short read */ |
| 622 | struct bio *bio; |
| 623 | |
| 624 | __rq_for_each_bio(bio, rq) |
| 625 | zero_fill_bio(bio); |
| 626 | } |
| 627 | break; |
| 628 | case REQ_OP_WRITE: |
| 629 | case REQ_OP_ZONE_APPEND: |
| 630 | if (cmd->ret < 0) |
| 631 | pr_err("Zone %u: failed %swrite sector %llu, %llu sectors\n" , |
| 632 | zone_no, |
| 633 | req_op(rq) == REQ_OP_WRITE ? "" : "append " , |
| 634 | cmd->sector, cmd->nr_sectors); |
| 635 | |
| 636 | if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) { |
| 637 | pr_err("Zone %u: partial write %ld/%u B\n" , |
| 638 | zone_no, cmd->ret, blk_rq_bytes(rq)); |
| 639 | cmd->ret = -EIO; |
| 640 | } |
| 641 | |
| 642 | if (cmd->ret < 0 && !test_bit(ZLOOP_ZONE_CONV, &zone->flags)) { |
| 643 | /* |
| 644 | * A write to a sequential zone file failed: mark the |
| 645 | * zone as having an error. This will be corrected and |
| 646 | * cleared when the next IO is submitted. |
| 647 | */ |
| 648 | set_bit(nr: ZLOOP_ZONE_SEQ_ERROR, addr: &zone->flags); |
| 649 | break; |
| 650 | } |
| 651 | if (req_op(req: rq) == REQ_OP_ZONE_APPEND) |
| 652 | rq->__sector = cmd->sector; |
| 653 | |
| 654 | break; |
| 655 | default: |
| 656 | break; |
| 657 | } |
| 658 | |
| 659 | if (cmd->ret < 0) |
| 660 | sts = errno_to_blk_status(errno: cmd->ret); |
| 661 | blk_mq_end_request(rq, error: sts); |
| 662 | } |
| 663 | |
| 664 | static bool zloop_set_zone_append_sector(struct request *rq) |
| 665 | { |
| 666 | struct zloop_device *zlo = rq->q->queuedata; |
| 667 | unsigned int zone_no = rq_zone_no(rq); |
| 668 | struct zloop_zone *zone = &zlo->zones[zone_no]; |
| 669 | sector_t zone_end = zone->start + zlo->zone_capacity; |
| 670 | sector_t nr_sectors = blk_rq_sectors(rq); |
| 671 | unsigned long flags; |
| 672 | |
| 673 | spin_lock_irqsave(&zone->wp_lock, flags); |
| 674 | |
| 675 | if (zone->cond == BLK_ZONE_COND_FULL || |
| 676 | zone->wp + nr_sectors > zone_end) { |
| 677 | spin_unlock_irqrestore(lock: &zone->wp_lock, flags); |
| 678 | return false; |
| 679 | } |
| 680 | |
| 681 | rq->__sector = zone->wp; |
| 682 | zone->wp += blk_rq_sectors(rq); |
| 683 | if (zone->wp >= zone_end) { |
| 684 | zone->cond = BLK_ZONE_COND_FULL; |
| 685 | zone->wp = ULLONG_MAX; |
| 686 | } |
| 687 | |
| 688 | spin_unlock_irqrestore(lock: &zone->wp_lock, flags); |
| 689 | |
| 690 | return true; |
| 691 | } |
| 692 | |
| 693 | static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx, |
| 694 | const struct blk_mq_queue_data *bd) |
| 695 | { |
| 696 | struct request *rq = bd->rq; |
| 697 | struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq); |
| 698 | struct zloop_device *zlo = rq->q->queuedata; |
| 699 | |
| 700 | if (data_race(READ_ONCE(zlo->state)) == Zlo_deleting) |
| 701 | return BLK_STS_IOERR; |
| 702 | |
| 703 | /* |
| 704 | * If we need to strongly order zone append operations, set the request |
| 705 | * sector to the zone write pointer location now instead of when the |
| 706 | * command work runs. |
| 707 | */ |
| 708 | if (zlo->ordered_zone_append && req_op(req: rq) == REQ_OP_ZONE_APPEND) { |
| 709 | if (!zloop_set_zone_append_sector(rq)) |
| 710 | return BLK_STS_IOERR; |
| 711 | } |
| 712 | |
| 713 | blk_mq_start_request(rq); |
| 714 | |
| 715 | INIT_WORK(&cmd->work, zloop_cmd_workfn); |
| 716 | queue_work(wq: zlo->workqueue, work: &cmd->work); |
| 717 | |
| 718 | return BLK_STS_OK; |
| 719 | } |
| 720 | |
| 721 | static const struct blk_mq_ops zloop_mq_ops = { |
| 722 | .queue_rq = zloop_queue_rq, |
| 723 | .complete = zloop_complete_rq, |
| 724 | }; |
| 725 | |
| 726 | static int zloop_open(struct gendisk *disk, blk_mode_t mode) |
| 727 | { |
| 728 | struct zloop_device *zlo = disk->private_data; |
| 729 | int ret; |
| 730 | |
| 731 | ret = mutex_lock_killable(&zloop_ctl_mutex); |
| 732 | if (ret) |
| 733 | return ret; |
| 734 | |
| 735 | if (zlo->state != Zlo_live) |
| 736 | ret = -ENXIO; |
| 737 | mutex_unlock(lock: &zloop_ctl_mutex); |
| 738 | return ret; |
| 739 | } |
| 740 | |
| 741 | static int zloop_report_zones(struct gendisk *disk, sector_t sector, |
| 742 | unsigned int nr_zones, struct blk_report_zones_args *args) |
| 743 | { |
| 744 | struct zloop_device *zlo = disk->private_data; |
| 745 | struct blk_zone blkz = {}; |
| 746 | unsigned int first, i; |
| 747 | unsigned long flags; |
| 748 | int ret; |
| 749 | |
| 750 | first = disk_zone_no(disk, sector); |
| 751 | if (first >= zlo->nr_zones) |
| 752 | return 0; |
| 753 | nr_zones = min(nr_zones, zlo->nr_zones - first); |
| 754 | |
| 755 | for (i = 0; i < nr_zones; i++) { |
| 756 | unsigned int zone_no = first + i; |
| 757 | struct zloop_zone *zone = &zlo->zones[zone_no]; |
| 758 | |
| 759 | mutex_lock(&zone->lock); |
| 760 | |
| 761 | if (test_and_clear_bit(nr: ZLOOP_ZONE_SEQ_ERROR, addr: &zone->flags)) { |
| 762 | ret = zloop_update_seq_zone(zlo, zone_no); |
| 763 | if (ret) { |
| 764 | mutex_unlock(lock: &zone->lock); |
| 765 | return ret; |
| 766 | } |
| 767 | } |
| 768 | |
| 769 | blkz.start = zone->start; |
| 770 | blkz.len = zlo->zone_size; |
| 771 | spin_lock_irqsave(&zone->wp_lock, flags); |
| 772 | blkz.wp = zone->wp; |
| 773 | spin_unlock_irqrestore(lock: &zone->wp_lock, flags); |
| 774 | blkz.cond = zone->cond; |
| 775 | if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) { |
| 776 | blkz.type = BLK_ZONE_TYPE_CONVENTIONAL; |
| 777 | blkz.capacity = zlo->zone_size; |
| 778 | } else { |
| 779 | blkz.type = BLK_ZONE_TYPE_SEQWRITE_REQ; |
| 780 | blkz.capacity = zlo->zone_capacity; |
| 781 | } |
| 782 | |
| 783 | mutex_unlock(lock: &zone->lock); |
| 784 | |
| 785 | ret = disk_report_zone(disk, zone: &blkz, idx: i, args); |
| 786 | if (ret) |
| 787 | return ret; |
| 788 | } |
| 789 | |
| 790 | return nr_zones; |
| 791 | } |
| 792 | |
| 793 | static void zloop_free_disk(struct gendisk *disk) |
| 794 | { |
| 795 | struct zloop_device *zlo = disk->private_data; |
| 796 | unsigned int i; |
| 797 | |
| 798 | blk_mq_free_tag_set(set: &zlo->tag_set); |
| 799 | |
| 800 | for (i = 0; i < zlo->nr_zones; i++) { |
| 801 | struct zloop_zone *zone = &zlo->zones[i]; |
| 802 | |
| 803 | mapping_set_gfp_mask(m: zone->file->f_mapping, |
| 804 | mask: zone->old_gfp_mask); |
| 805 | fput(zone->file); |
| 806 | } |
| 807 | |
| 808 | fput(zlo->data_dir); |
| 809 | destroy_workqueue(wq: zlo->workqueue); |
| 810 | kfree(objp: zlo->base_dir); |
| 811 | kvfree(addr: zlo); |
| 812 | } |
| 813 | |
| 814 | static const struct block_device_operations zloop_fops = { |
| 815 | .owner = THIS_MODULE, |
| 816 | .open = zloop_open, |
| 817 | .report_zones = zloop_report_zones, |
| 818 | .free_disk = zloop_free_disk, |
| 819 | }; |
| 820 | |
| 821 | __printf(3, 4) |
| 822 | static struct file *zloop_filp_open_fmt(int oflags, umode_t mode, |
| 823 | const char *fmt, ...) |
| 824 | { |
| 825 | struct file *file; |
| 826 | va_list ap; |
| 827 | char *p; |
| 828 | |
| 829 | va_start(ap, fmt); |
| 830 | p = kvasprintf(GFP_KERNEL, fmt, args: ap); |
| 831 | va_end(ap); |
| 832 | |
| 833 | if (!p) |
| 834 | return ERR_PTR(error: -ENOMEM); |
| 835 | file = filp_open(p, oflags, mode); |
| 836 | kfree(objp: p); |
| 837 | return file; |
| 838 | } |
| 839 | |
| 840 | static int zloop_get_block_size(struct zloop_device *zlo, |
| 841 | struct zloop_zone *zone) |
| 842 | { |
| 843 | struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev; |
| 844 | struct kstat st; |
| 845 | |
| 846 | /* |
| 847 | * If the FS block size is lower than or equal to 4K, use that as the |
| 848 | * device block size. Otherwise, fallback to the FS direct IO alignment |
| 849 | * constraint if that is provided, and to the FS underlying device |
| 850 | * physical block size if the direct IO alignment is unknown. |
| 851 | */ |
| 852 | if (file_inode(f: zone->file)->i_sb->s_blocksize <= SZ_4K) |
| 853 | zlo->block_size = file_inode(f: zone->file)->i_sb->s_blocksize; |
| 854 | else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) && |
| 855 | (st.result_mask & STATX_DIOALIGN)) |
| 856 | zlo->block_size = st.dio_offset_align; |
| 857 | else if (sb_bdev) |
| 858 | zlo->block_size = bdev_physical_block_size(bdev: sb_bdev); |
| 859 | else |
| 860 | zlo->block_size = SECTOR_SIZE; |
| 861 | |
| 862 | if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) { |
| 863 | pr_err("Zone capacity is not aligned to block size %u\n" , |
| 864 | zlo->block_size); |
| 865 | return -EINVAL; |
| 866 | } |
| 867 | |
| 868 | return 0; |
| 869 | } |
| 870 | |
| 871 | static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts, |
| 872 | unsigned int zone_no, bool restore) |
| 873 | { |
| 874 | struct zloop_zone *zone = &zlo->zones[zone_no]; |
| 875 | int oflags = O_RDWR; |
| 876 | struct kstat stat; |
| 877 | sector_t file_sectors; |
| 878 | int ret; |
| 879 | |
| 880 | mutex_init(&zone->lock); |
| 881 | spin_lock_init(&zone->wp_lock); |
| 882 | zone->start = (sector_t)zone_no << zlo->zone_shift; |
| 883 | |
| 884 | if (!restore) |
| 885 | oflags |= O_CREAT; |
| 886 | |
| 887 | if (!opts->buffered_io) |
| 888 | oflags |= O_DIRECT; |
| 889 | |
| 890 | if (zone_no < zlo->nr_conv_zones) { |
| 891 | /* Conventional zone file. */ |
| 892 | set_bit(nr: ZLOOP_ZONE_CONV, addr: &zone->flags); |
| 893 | zone->cond = BLK_ZONE_COND_NOT_WP; |
| 894 | zone->wp = U64_MAX; |
| 895 | |
| 896 | zone->file = zloop_filp_open_fmt(oflags, mode: 0600, fmt: "%s/%u/cnv-%06u" , |
| 897 | zlo->base_dir, zlo->id, zone_no); |
| 898 | if (IS_ERR(ptr: zone->file)) { |
| 899 | pr_err("Failed to open zone %u file %s/%u/cnv-%06u (err=%ld)" , |
| 900 | zone_no, zlo->base_dir, zlo->id, zone_no, |
| 901 | PTR_ERR(zone->file)); |
| 902 | return PTR_ERR(ptr: zone->file); |
| 903 | } |
| 904 | |
| 905 | if (!zlo->block_size) { |
| 906 | ret = zloop_get_block_size(zlo, zone); |
| 907 | if (ret) |
| 908 | return ret; |
| 909 | } |
| 910 | |
| 911 | ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0); |
| 912 | if (ret < 0) { |
| 913 | pr_err("Failed to get zone %u file stat\n" , zone_no); |
| 914 | return ret; |
| 915 | } |
| 916 | file_sectors = stat.size >> SECTOR_SHIFT; |
| 917 | |
| 918 | if (restore && file_sectors != zlo->zone_size) { |
| 919 | pr_err("Invalid conventional zone %u file size (%llu sectors != %llu)\n" , |
| 920 | zone_no, file_sectors, zlo->zone_capacity); |
| 921 | return ret; |
| 922 | } |
| 923 | |
| 924 | ret = vfs_truncate(&zone->file->f_path, |
| 925 | zlo->zone_size << SECTOR_SHIFT); |
| 926 | if (ret < 0) { |
| 927 | pr_err("Failed to truncate zone %u file (err=%d)\n" , |
| 928 | zone_no, ret); |
| 929 | return ret; |
| 930 | } |
| 931 | |
| 932 | return 0; |
| 933 | } |
| 934 | |
| 935 | /* Sequential zone file. */ |
| 936 | zone->file = zloop_filp_open_fmt(oflags, mode: 0600, fmt: "%s/%u/seq-%06u" , |
| 937 | zlo->base_dir, zlo->id, zone_no); |
| 938 | if (IS_ERR(ptr: zone->file)) { |
| 939 | pr_err("Failed to open zone %u file %s/%u/seq-%06u (err=%ld)" , |
| 940 | zone_no, zlo->base_dir, zlo->id, zone_no, |
| 941 | PTR_ERR(zone->file)); |
| 942 | return PTR_ERR(ptr: zone->file); |
| 943 | } |
| 944 | |
| 945 | if (!zlo->block_size) { |
| 946 | ret = zloop_get_block_size(zlo, zone); |
| 947 | if (ret) |
| 948 | return ret; |
| 949 | } |
| 950 | |
| 951 | zloop_get_block_size(zlo, zone); |
| 952 | |
| 953 | mutex_lock(&zone->lock); |
| 954 | ret = zloop_update_seq_zone(zlo, zone_no); |
| 955 | mutex_unlock(lock: &zone->lock); |
| 956 | |
| 957 | return ret; |
| 958 | } |
| 959 | |
| 960 | static bool zloop_dev_exists(struct zloop_device *zlo) |
| 961 | { |
| 962 | struct file *cnv, *seq; |
| 963 | bool exists; |
| 964 | |
| 965 | cnv = zloop_filp_open_fmt(O_RDONLY, mode: 0600, fmt: "%s/%u/cnv-%06u" , |
| 966 | zlo->base_dir, zlo->id, 0); |
| 967 | seq = zloop_filp_open_fmt(O_RDONLY, mode: 0600, fmt: "%s/%u/seq-%06u" , |
| 968 | zlo->base_dir, zlo->id, 0); |
| 969 | exists = !IS_ERR(ptr: cnv) || !IS_ERR(ptr: seq); |
| 970 | |
| 971 | if (!IS_ERR(ptr: cnv)) |
| 972 | fput(cnv); |
| 973 | if (!IS_ERR(ptr: seq)) |
| 974 | fput(seq); |
| 975 | |
| 976 | return exists; |
| 977 | } |
| 978 | |
| 979 | static int zloop_ctl_add(struct zloop_options *opts) |
| 980 | { |
| 981 | struct queue_limits lim = { |
| 982 | .max_hw_sectors = SZ_1M >> SECTOR_SHIFT, |
| 983 | .chunk_sectors = opts->zone_size, |
| 984 | .features = BLK_FEAT_ZONED, |
| 985 | }; |
| 986 | unsigned int nr_zones, i, j; |
| 987 | struct zloop_device *zlo; |
| 988 | int ret = -EINVAL; |
| 989 | bool restore; |
| 990 | |
| 991 | __module_get(THIS_MODULE); |
| 992 | |
| 993 | nr_zones = opts->capacity >> ilog2(opts->zone_size); |
| 994 | if (opts->nr_conv_zones >= nr_zones) { |
| 995 | pr_err("Invalid number of conventional zones %u\n" , |
| 996 | opts->nr_conv_zones); |
| 997 | goto out; |
| 998 | } |
| 999 | |
| 1000 | zlo = kvzalloc(struct_size(zlo, zones, nr_zones), GFP_KERNEL); |
| 1001 | if (!zlo) { |
| 1002 | ret = -ENOMEM; |
| 1003 | goto out; |
| 1004 | } |
| 1005 | WRITE_ONCE(zlo->state, Zlo_creating); |
| 1006 | |
| 1007 | ret = mutex_lock_killable(&zloop_ctl_mutex); |
| 1008 | if (ret) |
| 1009 | goto out_free_dev; |
| 1010 | |
| 1011 | /* Allocate id, if @opts->id >= 0, we're requesting that specific id */ |
| 1012 | if (opts->id >= 0) { |
| 1013 | ret = idr_alloc(&zloop_index_idr, ptr: zlo, |
| 1014 | start: opts->id, end: opts->id + 1, GFP_KERNEL); |
| 1015 | if (ret == -ENOSPC) |
| 1016 | ret = -EEXIST; |
| 1017 | } else { |
| 1018 | ret = idr_alloc(&zloop_index_idr, ptr: zlo, start: 0, end: 0, GFP_KERNEL); |
| 1019 | } |
| 1020 | mutex_unlock(lock: &zloop_ctl_mutex); |
| 1021 | if (ret < 0) |
| 1022 | goto out_free_dev; |
| 1023 | |
| 1024 | zlo->id = ret; |
| 1025 | zlo->zone_shift = ilog2(opts->zone_size); |
| 1026 | zlo->zone_size = opts->zone_size; |
| 1027 | if (opts->zone_capacity) |
| 1028 | zlo->zone_capacity = opts->zone_capacity; |
| 1029 | else |
| 1030 | zlo->zone_capacity = zlo->zone_size; |
| 1031 | zlo->nr_zones = nr_zones; |
| 1032 | zlo->nr_conv_zones = opts->nr_conv_zones; |
| 1033 | zlo->buffered_io = opts->buffered_io; |
| 1034 | zlo->zone_append = opts->zone_append; |
| 1035 | if (zlo->zone_append) |
| 1036 | zlo->ordered_zone_append = opts->ordered_zone_append; |
| 1037 | |
| 1038 | zlo->workqueue = alloc_workqueue("zloop%d" , WQ_UNBOUND | WQ_FREEZABLE, |
| 1039 | opts->nr_queues * opts->queue_depth, zlo->id); |
| 1040 | if (!zlo->workqueue) { |
| 1041 | ret = -ENOMEM; |
| 1042 | goto out_free_idr; |
| 1043 | } |
| 1044 | |
| 1045 | if (opts->base_dir) |
| 1046 | zlo->base_dir = kstrdup(s: opts->base_dir, GFP_KERNEL); |
| 1047 | else |
| 1048 | zlo->base_dir = kstrdup(ZLOOP_DEF_BASE_DIR, GFP_KERNEL); |
| 1049 | if (!zlo->base_dir) { |
| 1050 | ret = -ENOMEM; |
| 1051 | goto out_destroy_workqueue; |
| 1052 | } |
| 1053 | |
| 1054 | zlo->data_dir = zloop_filp_open_fmt(O_RDONLY | O_DIRECTORY, mode: 0, fmt: "%s/%u" , |
| 1055 | zlo->base_dir, zlo->id); |
| 1056 | if (IS_ERR(ptr: zlo->data_dir)) { |
| 1057 | ret = PTR_ERR(ptr: zlo->data_dir); |
| 1058 | pr_warn("Failed to open directory %s/%u (err=%d)\n" , |
| 1059 | zlo->base_dir, zlo->id, ret); |
| 1060 | goto out_free_base_dir; |
| 1061 | } |
| 1062 | |
| 1063 | /* |
| 1064 | * If we already have zone files, we are restoring a device created by a |
| 1065 | * previous add operation. In this case, zloop_init_zone() will check |
| 1066 | * that the zone files are consistent with the zone configuration given. |
| 1067 | */ |
| 1068 | restore = zloop_dev_exists(zlo); |
| 1069 | for (i = 0; i < nr_zones; i++) { |
| 1070 | ret = zloop_init_zone(zlo, opts, zone_no: i, restore); |
| 1071 | if (ret) |
| 1072 | goto out_close_files; |
| 1073 | } |
| 1074 | |
| 1075 | lim.physical_block_size = zlo->block_size; |
| 1076 | lim.logical_block_size = zlo->block_size; |
| 1077 | if (zlo->zone_append) |
| 1078 | lim.max_hw_zone_append_sectors = lim.max_hw_sectors; |
| 1079 | |
| 1080 | zlo->tag_set.ops = &zloop_mq_ops; |
| 1081 | zlo->tag_set.nr_hw_queues = opts->nr_queues; |
| 1082 | zlo->tag_set.queue_depth = opts->queue_depth; |
| 1083 | zlo->tag_set.numa_node = NUMA_NO_NODE; |
| 1084 | zlo->tag_set.cmd_size = sizeof(struct zloop_cmd); |
| 1085 | zlo->tag_set.driver_data = zlo; |
| 1086 | |
| 1087 | ret = blk_mq_alloc_tag_set(set: &zlo->tag_set); |
| 1088 | if (ret) { |
| 1089 | pr_err("blk_mq_alloc_tag_set failed (err=%d)\n" , ret); |
| 1090 | goto out_close_files; |
| 1091 | } |
| 1092 | |
| 1093 | zlo->disk = blk_mq_alloc_disk(&zlo->tag_set, &lim, zlo); |
| 1094 | if (IS_ERR(ptr: zlo->disk)) { |
| 1095 | pr_err("blk_mq_alloc_disk failed (err=%d)\n" , ret); |
| 1096 | ret = PTR_ERR(ptr: zlo->disk); |
| 1097 | goto out_cleanup_tags; |
| 1098 | } |
| 1099 | zlo->disk->flags = GENHD_FL_NO_PART; |
| 1100 | zlo->disk->fops = &zloop_fops; |
| 1101 | zlo->disk->private_data = zlo; |
| 1102 | sprintf(buf: zlo->disk->disk_name, fmt: "zloop%d" , zlo->id); |
| 1103 | set_capacity(disk: zlo->disk, size: (u64)lim.chunk_sectors * zlo->nr_zones); |
| 1104 | |
| 1105 | ret = blk_revalidate_disk_zones(disk: zlo->disk); |
| 1106 | if (ret) |
| 1107 | goto out_cleanup_disk; |
| 1108 | |
| 1109 | ret = add_disk(disk: zlo->disk); |
| 1110 | if (ret) { |
| 1111 | pr_err("add_disk failed (err=%d)\n" , ret); |
| 1112 | goto out_cleanup_disk; |
| 1113 | } |
| 1114 | |
| 1115 | mutex_lock(&zloop_ctl_mutex); |
| 1116 | WRITE_ONCE(zlo->state, Zlo_live); |
| 1117 | mutex_unlock(lock: &zloop_ctl_mutex); |
| 1118 | |
| 1119 | pr_info("zloop: device %d, %u zones of %llu MiB, %u B block size\n" , |
| 1120 | zlo->id, zlo->nr_zones, |
| 1121 | ((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20, |
| 1122 | zlo->block_size); |
| 1123 | pr_info("zloop%d: using %s%s zone append\n" , |
| 1124 | zlo->id, |
| 1125 | zlo->ordered_zone_append ? "ordered " : "" , |
| 1126 | zlo->zone_append ? "native" : "emulated" ); |
| 1127 | |
| 1128 | return 0; |
| 1129 | |
| 1130 | out_cleanup_disk: |
| 1131 | put_disk(disk: zlo->disk); |
| 1132 | out_cleanup_tags: |
| 1133 | blk_mq_free_tag_set(set: &zlo->tag_set); |
| 1134 | out_close_files: |
| 1135 | for (j = 0; j < i; j++) { |
| 1136 | struct zloop_zone *zone = &zlo->zones[j]; |
| 1137 | |
| 1138 | if (!IS_ERR_OR_NULL(ptr: zone->file)) |
| 1139 | fput(zone->file); |
| 1140 | } |
| 1141 | fput(zlo->data_dir); |
| 1142 | out_free_base_dir: |
| 1143 | kfree(objp: zlo->base_dir); |
| 1144 | out_destroy_workqueue: |
| 1145 | destroy_workqueue(wq: zlo->workqueue); |
| 1146 | out_free_idr: |
| 1147 | mutex_lock(&zloop_ctl_mutex); |
| 1148 | idr_remove(&zloop_index_idr, id: zlo->id); |
| 1149 | mutex_unlock(lock: &zloop_ctl_mutex); |
| 1150 | out_free_dev: |
| 1151 | kvfree(addr: zlo); |
| 1152 | out: |
| 1153 | module_put(THIS_MODULE); |
| 1154 | if (ret == -ENOENT) |
| 1155 | ret = -EINVAL; |
| 1156 | return ret; |
| 1157 | } |
| 1158 | |
| 1159 | static int zloop_ctl_remove(struct zloop_options *opts) |
| 1160 | { |
| 1161 | struct zloop_device *zlo; |
| 1162 | int ret; |
| 1163 | |
| 1164 | if (!(opts->mask & ZLOOP_OPT_ID)) { |
| 1165 | pr_err("No ID specified\n" ); |
| 1166 | return -EINVAL; |
| 1167 | } |
| 1168 | |
| 1169 | ret = mutex_lock_killable(&zloop_ctl_mutex); |
| 1170 | if (ret) |
| 1171 | return ret; |
| 1172 | |
| 1173 | zlo = idr_find(&zloop_index_idr, id: opts->id); |
| 1174 | if (!zlo || zlo->state == Zlo_creating) { |
| 1175 | ret = -ENODEV; |
| 1176 | } else if (zlo->state == Zlo_deleting) { |
| 1177 | ret = -EINVAL; |
| 1178 | } else { |
| 1179 | idr_remove(&zloop_index_idr, id: zlo->id); |
| 1180 | WRITE_ONCE(zlo->state, Zlo_deleting); |
| 1181 | } |
| 1182 | |
| 1183 | mutex_unlock(lock: &zloop_ctl_mutex); |
| 1184 | if (ret) |
| 1185 | return ret; |
| 1186 | |
| 1187 | del_gendisk(gp: zlo->disk); |
| 1188 | put_disk(disk: zlo->disk); |
| 1189 | |
| 1190 | pr_info("Removed device %d\n" , opts->id); |
| 1191 | |
| 1192 | module_put(THIS_MODULE); |
| 1193 | |
| 1194 | return 0; |
| 1195 | } |
| 1196 | |
| 1197 | static int zloop_parse_options(struct zloop_options *opts, const char *buf) |
| 1198 | { |
| 1199 | substring_t args[MAX_OPT_ARGS]; |
| 1200 | char *options, *o, *p; |
| 1201 | unsigned int token; |
| 1202 | int ret = 0; |
| 1203 | |
| 1204 | /* Set defaults. */ |
| 1205 | opts->mask = 0; |
| 1206 | opts->id = ZLOOP_DEF_ID; |
| 1207 | opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES; |
| 1208 | opts->zone_size = ZLOOP_DEF_ZONE_SIZE; |
| 1209 | opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES; |
| 1210 | opts->nr_queues = ZLOOP_DEF_NR_QUEUES; |
| 1211 | opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH; |
| 1212 | opts->buffered_io = ZLOOP_DEF_BUFFERED_IO; |
| 1213 | opts->zone_append = ZLOOP_DEF_ZONE_APPEND; |
| 1214 | opts->ordered_zone_append = ZLOOP_DEF_ORDERED_ZONE_APPEND; |
| 1215 | |
| 1216 | if (!buf) |
| 1217 | return 0; |
| 1218 | |
| 1219 | /* Skip leading spaces before the options. */ |
| 1220 | while (isspace(*buf)) |
| 1221 | buf++; |
| 1222 | |
| 1223 | options = o = kstrdup(s: buf, GFP_KERNEL); |
| 1224 | if (!options) |
| 1225 | return -ENOMEM; |
| 1226 | |
| 1227 | /* Parse the options, doing only some light invalid value checks. */ |
| 1228 | while ((p = strsep(&o, ",\n" )) != NULL) { |
| 1229 | if (!*p) |
| 1230 | continue; |
| 1231 | |
| 1232 | token = match_token(p, table: zloop_opt_tokens, args); |
| 1233 | opts->mask |= token; |
| 1234 | switch (token) { |
| 1235 | case ZLOOP_OPT_ID: |
| 1236 | if (match_int(args, result: &opts->id)) { |
| 1237 | ret = -EINVAL; |
| 1238 | goto out; |
| 1239 | } |
| 1240 | break; |
| 1241 | case ZLOOP_OPT_CAPACITY: |
| 1242 | if (match_uint(s: args, result: &token)) { |
| 1243 | ret = -EINVAL; |
| 1244 | goto out; |
| 1245 | } |
| 1246 | if (!token) { |
| 1247 | pr_err("Invalid capacity\n" ); |
| 1248 | ret = -EINVAL; |
| 1249 | goto out; |
| 1250 | } |
| 1251 | opts->capacity = |
| 1252 | ((sector_t)token * SZ_1M) >> SECTOR_SHIFT; |
| 1253 | break; |
| 1254 | case ZLOOP_OPT_ZONE_SIZE: |
| 1255 | if (match_uint(s: args, result: &token)) { |
| 1256 | ret = -EINVAL; |
| 1257 | goto out; |
| 1258 | } |
| 1259 | if (!token || token > ZLOOP_MAX_ZONE_SIZE_MB || |
| 1260 | !is_power_of_2(n: token)) { |
| 1261 | pr_err("Invalid zone size %u\n" , token); |
| 1262 | ret = -EINVAL; |
| 1263 | goto out; |
| 1264 | } |
| 1265 | opts->zone_size = |
| 1266 | ((sector_t)token * SZ_1M) >> SECTOR_SHIFT; |
| 1267 | break; |
| 1268 | case ZLOOP_OPT_ZONE_CAPACITY: |
| 1269 | if (match_uint(s: args, result: &token)) { |
| 1270 | ret = -EINVAL; |
| 1271 | goto out; |
| 1272 | } |
| 1273 | if (!token) { |
| 1274 | pr_err("Invalid zone capacity\n" ); |
| 1275 | ret = -EINVAL; |
| 1276 | goto out; |
| 1277 | } |
| 1278 | opts->zone_capacity = |
| 1279 | ((sector_t)token * SZ_1M) >> SECTOR_SHIFT; |
| 1280 | break; |
| 1281 | case ZLOOP_OPT_NR_CONV_ZONES: |
| 1282 | if (match_uint(s: args, result: &token)) { |
| 1283 | ret = -EINVAL; |
| 1284 | goto out; |
| 1285 | } |
| 1286 | opts->nr_conv_zones = token; |
| 1287 | break; |
| 1288 | case ZLOOP_OPT_BASE_DIR: |
| 1289 | p = match_strdup(args); |
| 1290 | if (!p) { |
| 1291 | ret = -ENOMEM; |
| 1292 | goto out; |
| 1293 | } |
| 1294 | kfree(objp: opts->base_dir); |
| 1295 | opts->base_dir = p; |
| 1296 | break; |
| 1297 | case ZLOOP_OPT_NR_QUEUES: |
| 1298 | if (match_uint(s: args, result: &token)) { |
| 1299 | ret = -EINVAL; |
| 1300 | goto out; |
| 1301 | } |
| 1302 | if (!token) { |
| 1303 | pr_err("Invalid number of queues\n" ); |
| 1304 | ret = -EINVAL; |
| 1305 | goto out; |
| 1306 | } |
| 1307 | opts->nr_queues = min(token, num_online_cpus()); |
| 1308 | break; |
| 1309 | case ZLOOP_OPT_QUEUE_DEPTH: |
| 1310 | if (match_uint(s: args, result: &token)) { |
| 1311 | ret = -EINVAL; |
| 1312 | goto out; |
| 1313 | } |
| 1314 | if (!token) { |
| 1315 | pr_err("Invalid queue depth\n" ); |
| 1316 | ret = -EINVAL; |
| 1317 | goto out; |
| 1318 | } |
| 1319 | opts->queue_depth = token; |
| 1320 | break; |
| 1321 | case ZLOOP_OPT_BUFFERED_IO: |
| 1322 | opts->buffered_io = true; |
| 1323 | break; |
| 1324 | case ZLOOP_OPT_ZONE_APPEND: |
| 1325 | if (match_uint(s: args, result: &token)) { |
| 1326 | ret = -EINVAL; |
| 1327 | goto out; |
| 1328 | } |
| 1329 | if (token != 0 && token != 1) { |
| 1330 | pr_err("Invalid zone_append value\n" ); |
| 1331 | ret = -EINVAL; |
| 1332 | goto out; |
| 1333 | } |
| 1334 | opts->zone_append = token; |
| 1335 | break; |
| 1336 | case ZLOOP_OPT_ORDERED_ZONE_APPEND: |
| 1337 | opts->ordered_zone_append = true; |
| 1338 | break; |
| 1339 | case ZLOOP_OPT_ERR: |
| 1340 | default: |
| 1341 | pr_warn("unknown parameter or missing value '%s'\n" , p); |
| 1342 | ret = -EINVAL; |
| 1343 | goto out; |
| 1344 | } |
| 1345 | } |
| 1346 | |
| 1347 | ret = -EINVAL; |
| 1348 | if (opts->capacity <= opts->zone_size) { |
| 1349 | pr_err("Invalid capacity\n" ); |
| 1350 | goto out; |
| 1351 | } |
| 1352 | |
| 1353 | if (opts->zone_capacity > opts->zone_size) { |
| 1354 | pr_err("Invalid zone capacity\n" ); |
| 1355 | goto out; |
| 1356 | } |
| 1357 | |
| 1358 | ret = 0; |
| 1359 | out: |
| 1360 | kfree(objp: options); |
| 1361 | return ret; |
| 1362 | } |
| 1363 | |
/* Codes of the operations accepted by the control device. */
enum {
	ZLOOP_CTL_ADD		= 0,
	ZLOOP_CTL_REMOVE	= 1,
};
| 1368 | |
| 1369 | static struct zloop_ctl_op { |
| 1370 | int code; |
| 1371 | const char *name; |
| 1372 | } zloop_ctl_ops[] = { |
| 1373 | { ZLOOP_CTL_ADD, "add" }, |
| 1374 | { ZLOOP_CTL_REMOVE, "remove" }, |
| 1375 | { -1, NULL }, |
| 1376 | }; |
| 1377 | |
| 1378 | static ssize_t zloop_ctl_write(struct file *file, const char __user *ubuf, |
| 1379 | size_t count, loff_t *pos) |
| 1380 | { |
| 1381 | struct zloop_options opts = { }; |
| 1382 | struct zloop_ctl_op *op; |
| 1383 | const char *buf, *opts_buf; |
| 1384 | int i, ret; |
| 1385 | |
| 1386 | if (count > PAGE_SIZE) |
| 1387 | return -ENOMEM; |
| 1388 | |
| 1389 | buf = memdup_user_nul(ubuf, count); |
| 1390 | if (IS_ERR(ptr: buf)) |
| 1391 | return PTR_ERR(ptr: buf); |
| 1392 | |
| 1393 | for (i = 0; i < ARRAY_SIZE(zloop_ctl_ops); i++) { |
| 1394 | op = &zloop_ctl_ops[i]; |
| 1395 | if (!op->name) { |
| 1396 | pr_err("Invalid operation\n" ); |
| 1397 | ret = -EINVAL; |
| 1398 | goto out; |
| 1399 | } |
| 1400 | if (!strncmp(buf, op->name, strlen(op->name))) |
| 1401 | break; |
| 1402 | } |
| 1403 | |
| 1404 | if (count <= strlen(op->name)) |
| 1405 | opts_buf = NULL; |
| 1406 | else |
| 1407 | opts_buf = buf + strlen(op->name); |
| 1408 | |
| 1409 | ret = zloop_parse_options(opts: &opts, buf: opts_buf); |
| 1410 | if (ret) { |
| 1411 | pr_err("Failed to parse options\n" ); |
| 1412 | goto out; |
| 1413 | } |
| 1414 | |
| 1415 | switch (op->code) { |
| 1416 | case ZLOOP_CTL_ADD: |
| 1417 | ret = zloop_ctl_add(opts: &opts); |
| 1418 | break; |
| 1419 | case ZLOOP_CTL_REMOVE: |
| 1420 | ret = zloop_ctl_remove(opts: &opts); |
| 1421 | break; |
| 1422 | default: |
| 1423 | pr_err("Invalid operation\n" ); |
| 1424 | ret = -EINVAL; |
| 1425 | goto out; |
| 1426 | } |
| 1427 | |
| 1428 | out: |
| 1429 | kfree(objp: opts.base_dir); |
| 1430 | kfree(objp: buf); |
| 1431 | return ret ? ret : count; |
| 1432 | } |
| 1433 | |
| 1434 | static int zloop_ctl_show(struct seq_file *seq_file, void *private) |
| 1435 | { |
| 1436 | const struct match_token *tok; |
| 1437 | int i; |
| 1438 | |
| 1439 | /* Add operation */ |
| 1440 | seq_printf(m: seq_file, fmt: "%s " , zloop_ctl_ops[0].name); |
| 1441 | for (i = 0; i < ARRAY_SIZE(zloop_opt_tokens); i++) { |
| 1442 | tok = &zloop_opt_tokens[i]; |
| 1443 | if (!tok->pattern) |
| 1444 | break; |
| 1445 | if (i) |
| 1446 | seq_putc(m: seq_file, c: ','); |
| 1447 | seq_puts(m: seq_file, s: tok->pattern); |
| 1448 | } |
| 1449 | seq_putc(m: seq_file, c: '\n'); |
| 1450 | |
| 1451 | /* Remove operation */ |
| 1452 | seq_puts(m: seq_file, s: zloop_ctl_ops[1].name); |
| 1453 | seq_puts(m: seq_file, s: " id=%d\n" ); |
| 1454 | |
| 1455 | return 0; |
| 1456 | } |
| 1457 | |
| 1458 | static int zloop_ctl_open(struct inode *inode, struct file *file) |
| 1459 | { |
| 1460 | file->private_data = NULL; |
| 1461 | return single_open(file, zloop_ctl_show, NULL); |
| 1462 | } |
| 1463 | |
/*
 * Release handler of the control device: counterpart of the single_open()
 * call done in zloop_ctl_open().
 */
static int zloop_ctl_release(struct inode *inode, struct file *file)
{
	int err = single_release(inode, file);

	return err;
}
| 1468 | |
| 1469 | static const struct file_operations zloop_ctl_fops = { |
| 1470 | .owner = THIS_MODULE, |
| 1471 | .open = zloop_ctl_open, |
| 1472 | .release = zloop_ctl_release, |
| 1473 | .write = zloop_ctl_write, |
| 1474 | .read = seq_read, |
| 1475 | }; |
| 1476 | |
| 1477 | static struct miscdevice zloop_misc = { |
| 1478 | .minor = MISC_DYNAMIC_MINOR, |
| 1479 | .name = "zloop-control" , |
| 1480 | .fops = &zloop_ctl_fops, |
| 1481 | }; |
| 1482 | |
| 1483 | static int __init zloop_init(void) |
| 1484 | { |
| 1485 | int ret; |
| 1486 | |
| 1487 | ret = misc_register(misc: &zloop_misc); |
| 1488 | if (ret) { |
| 1489 | pr_err("Failed to register misc device: %d\n" , ret); |
| 1490 | return ret; |
| 1491 | } |
| 1492 | pr_info("Module loaded\n" ); |
| 1493 | |
| 1494 | return 0; |
| 1495 | } |
| 1496 | |
| 1497 | static void __exit zloop_exit(void) |
| 1498 | { |
| 1499 | misc_deregister(misc: &zloop_misc); |
| 1500 | idr_destroy(&zloop_index_idr); |
| 1501 | } |
| 1502 | |
| 1503 | module_init(zloop_init); |
| 1504 | module_exit(zloop_exit); |
| 1505 | |
| 1506 | MODULE_DESCRIPTION("Zoned loopback device" ); |
| 1507 | MODULE_LICENSE("GPL" ); |
| 1508 | |