// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe I/O command implementation.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/memremap.h>
#include <linux/module.h>
#include "nvmet.h"

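/*
 * Advertise the backing device's queue limits through the Identify Namespace
 * data structure.  to0based() converts a count into the 0's based encoding
 * the NVMe specification uses: for example, a device with a 4096-byte
 * physical and a 512-byte logical block size has 8 logical blocks per
 * physical block, reported as lpp0b = 7.
 */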
void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
{
	/* Logical blocks per physical block, 0's based. */
	const __le16 lpp0b = to0based(bdev_physical_block_size(bdev) /
				      bdev_logical_block_size(bdev));

	/*
	 * For NVMe 1.2 and later, bit 1 indicates that the fields NAWUN,
	 * NAWUPF, and NACWU are defined for this namespace and should be
	 * used by the host for this namespace instead of the AWUN, AWUPF,
	 * and ACWU fields in the Identify Controller data structure. If
	 * any of these fields are zero that means that the corresponding
	 * field from the identify controller data structure should be used.
	 */
	id->nsfeat |= 1 << 1;
	id->nawun = lpp0b;
	id->nawupf = lpp0b;
	id->nacwu = lpp0b;

	/*
	 * Bit 4 indicates that the fields NPWG, NPWA, NPDG, NPDA, and
	 * NOWS are defined for this namespace and should be used by
	 * the host for I/O optimization.
	 */
	id->nsfeat |= 1 << 4;
	/* NPWG = Namespace Preferred Write Granularity. 0's based */
	id->npwg = to0based(bdev_io_min(bdev) / bdev_logical_block_size(bdev));
	/* NPWA = Namespace Preferred Write Alignment. 0's based */
	id->npwa = id->npwg;
	/* NPDG = Namespace Preferred Deallocate Granularity. 0's based */
	id->npdg = to0based(bdev_discard_granularity(bdev) /
			    bdev_logical_block_size(bdev));
	/* NPDA = Namespace Preferred Deallocate Alignment */
	id->npda = id->npdg;
	/* NOWS = Namespace Optimal Write Size */
	id->nows = to0based(bdev_io_opt(bdev) / bdev_logical_block_size(bdev));

	/* Set WZDS and DRB if device supports unmapped write zeroes */
	if (bdev_write_zeroes_unmap_sectors(bdev))
		id->dlfeat = (1 << 3) | 0x1;
}

void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
{
	if (ns->bdev_file) {
		fput(ns->bdev_file);
		ns->bdev = NULL;
		ns->bdev_file = NULL;
	}
}

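/*
 * Map the block layer integrity profile onto NVMe protection information:
 * a CRC checksum with a reference tag is exposed as PI Type 1, a CRC
 * checksum without one as PI Type 3.  Any other profile leaves metadata
 * disabled for this namespace.
 */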
static void nvmet_bdev_ns_enable_integrity(struct nvmet_ns *ns)
{
	struct blk_integrity *bi = bdev_get_integrity(ns->bdev);

	if (!bi)
		return;

	if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC) {
		ns->metadata_size = bi->metadata_size;
		if (bi->flags & BLK_INTEGRITY_REF_TAG)
			ns->pi_type = NVME_NS_DPS_PI_TYPE1;
		else
			ns->pi_type = NVME_NS_DPS_PI_TYPE3;
	} else {
		ns->metadata_size = 0;
	}
}

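/*
 * Open the backing block device for a namespace and derive its size, block
 * size, protection information, and command set (ZNS for zoned devices).
 * Returning -ENOTBLK signals the caller that this namespace should be
 * served by the file-backed I/O path instead.
 */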
int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
{
	int ret;

	/*
	 * When the buffered_io namespace attribute is enabled, the user wants
	 * this block device to be accessed as a file, so that it can take
	 * advantage of the page cache.
	 */
	if (ns->buffered_io)
		return -ENOTBLK;

	ns->bdev_file = bdev_file_open_by_path(ns->device_path,
				BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, NULL);
	if (IS_ERR(ns->bdev_file)) {
		ret = PTR_ERR(ns->bdev_file);
		if (ret != -ENOTBLK) {
			pr_err("failed to open block device %s: (%d)\n",
			       ns->device_path, ret);
		}
		ns->bdev_file = NULL;
		return ret;
	}
	ns->bdev = file_bdev(ns->bdev_file);
	ns->size = bdev_nr_bytes(ns->bdev);
	ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev));

	ns->pi_type = 0;
	ns->metadata_size = 0;
	if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY))
		nvmet_bdev_ns_enable_integrity(ns);

	if (bdev_is_zoned(ns->bdev)) {
		if (!nvmet_bdev_zns_enable(ns)) {
			nvmet_bdev_ns_disable(ns);
			return -EINVAL;
		}
		ns->csi = NVME_CSI_ZNS;
	}

	return 0;
}

void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns)
{
	ns->size = bdev_nr_bytes(ns->bdev);
}

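/*
 * Translate a block layer completion status into an NVMe status code, and
 * record the command-specific error location and LBA so the Error
 * Information log page can be populated for the host.
 */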
u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
{
	u16 status = NVME_SC_SUCCESS;

	if (likely(blk_sts == BLK_STS_OK))
		return status;
	/*
	 * Right now there exists an M:1 mapping from block layer errors to
	 * NVMe status codes (see nvme_error_status()). For consistency, when
	 * we reverse map we use the most appropriate NVMe status code from
	 * the group of NVMe status codes used in nvme_error_status().
	 */
	switch (blk_sts) {
	case BLK_STS_NOSPC:
		status = NVME_SC_CAP_EXCEEDED | NVME_STATUS_DNR;
		req->error_loc = offsetof(struct nvme_rw_command, length);
		break;
	case BLK_STS_TARGET:
		status = NVME_SC_LBA_RANGE | NVME_STATUS_DNR;
		req->error_loc = offsetof(struct nvme_rw_command, slba);
		break;
	case BLK_STS_NOTSUPP:
		status = NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
		req->error_loc = offsetof(struct nvme_common_command, opcode);
		break;
	case BLK_STS_MEDIUM:
		status = NVME_SC_ACCESS_DENIED;
		req->error_loc = offsetof(struct nvme_rw_command, nsid);
		break;
	case BLK_STS_IOERR:
	default:
		status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
		req->error_loc = offsetof(struct nvme_common_command, opcode);
	}

	switch (req->cmd->common.opcode) {
	case nvme_cmd_read:
	case nvme_cmd_write:
		req->error_slba = le64_to_cpu(req->cmd->rw.slba);
		break;
	case nvme_cmd_write_zeroes:
		req->error_slba =
			le64_to_cpu(req->cmd->write_zeroes.slba);
		break;
	default:
		req->error_slba = 0;
	}
	return status;
}

static void nvmet_bio_done(struct bio *bio)
{
	struct nvmet_req *req = bio->bi_private;
	blk_status_t blk_status = bio->bi_status;

	nvmet_req_bio_put(req, bio);
	nvmet_req_complete(req, blk_to_nvme_status(req, blk_status));
}

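/*
 * Attach a bio integrity payload so that protection information received
 * from (or destined for) the host travels with the data to the backing
 * device.  The miter walks the request's metadata scatterlist;
 * bio_integrity_bytes() bounds how much metadata this particular bio
 * covers, so one scatterlist can be split across chained bios.
 */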
#ifdef CONFIG_BLK_DEV_INTEGRITY
static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio,
				struct sg_mapping_iter *miter)
{
	struct blk_integrity *bi;
	struct bio_integrity_payload *bip;
	int rc;
	size_t resid, len;

	bi = bdev_get_integrity(req->ns->bdev);
	if (unlikely(!bi)) {
		pr_err("Unable to locate bio_integrity\n");
		return -ENODEV;
	}

	bip = bio_integrity_alloc(bio, GFP_NOIO,
				  bio_max_segs(req->metadata_sg_cnt));
	if (IS_ERR(bip)) {
		pr_err("Unable to allocate bio_integrity_payload\n");
		return PTR_ERR(bip);
	}

	/* virtual start sector must be in integrity interval units */
	bip_set_seed(bip, bio->bi_iter.bi_sector >>
		     (bi->interval_exp - SECTOR_SHIFT));

	resid = bio_integrity_bytes(bi, bio_sectors(bio));
	while (resid > 0 && sg_miter_next(miter)) {
		len = min_t(size_t, miter->length, resid);
		rc = bio_integrity_add_page(bio, miter->page, len,
					    offset_in_page(miter->addr));
		if (unlikely(rc != len)) {
			pr_err("bio_integrity_add_page() failed; %d\n", rc);
			sg_miter_stop(miter);
			return -ENOMEM;
		}

		resid -= len;
		if (len < miter->length)
			miter->consumed -= miter->length - len;
	}
	sg_miter_stop(miter);

	return 0;
}
#else
static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio,
				struct sg_mapping_iter *miter)
{
	return -EINVAL;
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */

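/*
 * Build and submit the bio(s) for a read or write command.  The data
 * scatterlist is mapped page by page; when bio_add_page() can no longer
 * take a full element, the current bio is chained to a fresh one and
 * submitted, so arbitrarily large transfers never need a single oversized
 * bio.  Small requests reuse the inline bio and bvec array embedded in the
 * request to avoid an allocation in the fast path.
 */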
static void nvmet_bdev_execute_rw(struct nvmet_req *req)
{
	unsigned int sg_cnt = req->sg_cnt;
	struct bio *bio;
	struct scatterlist *sg;
	struct blk_plug plug;
	sector_t sector;
	blk_opf_t opf;
	int i, rc;
	struct sg_mapping_iter prot_miter;
	unsigned int iter_flags;
	unsigned int total_len = nvmet_rw_data_len(req) + req->metadata_len;

	if (!nvmet_check_transfer_len(req, total_len))
		return;

	if (!req->sg_cnt) {
		nvmet_req_complete(req, 0);
		return;
	}

	if (req->cmd->rw.opcode == nvme_cmd_write) {
		opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
		if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
			opf |= REQ_FUA;
		iter_flags = SG_MITER_TO_SG;
	} else {
		opf = REQ_OP_READ;
		iter_flags = SG_MITER_FROM_SG;
	}

	if (req->cmd->rw.control & cpu_to_le16(NVME_RW_LR))
		opf |= REQ_FAILFAST_DEV;

	if (is_pci_p2pdma_page(sg_page(req->sg)))
		opf |= REQ_NOMERGE;

	sector = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba);

	if (nvmet_use_inline_bvec(req)) {
		bio = &req->b.inline_bio;
		bio_init(bio, req->ns->bdev, req->inline_bvec,
			 ARRAY_SIZE(req->inline_bvec), opf);
	} else {
		bio = bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt), opf,
				GFP_KERNEL);
	}
	bio->bi_iter.bi_sector = sector;
	bio->bi_private = req;
	bio->bi_end_io = nvmet_bio_done;

	blk_start_plug(&plug);
	if (req->metadata_len)
		sg_miter_start(&prot_miter, req->metadata_sg,
			       req->metadata_sg_cnt, iter_flags);

	for_each_sg(req->sg, sg, req->sg_cnt, i) {
		while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset)
				!= sg->length) {
			struct bio *prev = bio;

			if (req->metadata_len) {
				rc = nvmet_bdev_alloc_bip(req, bio,
							  &prot_miter);
				if (unlikely(rc)) {
					bio_io_error(bio);
					return;
				}
			}

			bio = bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt),
					opf, GFP_KERNEL);
			bio->bi_iter.bi_sector = sector;

			bio_chain(bio, prev);
			submit_bio(prev);
		}

		sector += sg->length >> 9;
		sg_cnt--;
	}

	if (req->metadata_len) {
		rc = nvmet_bdev_alloc_bip(req, bio, &prot_miter);
		if (unlikely(rc)) {
			bio_io_error(bio);
			return;
		}
	}

	submit_bio(bio);
	blk_finish_plug(&plug);
}

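/*
 * A Flush command only needs to reach the media when the backing device
 * has a volatile write cache; otherwise it can be completed immediately.
 */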
static void nvmet_bdev_execute_flush(struct nvmet_req *req)
{
	struct bio *bio = &req->b.inline_bio;

	if (!bdev_write_cache(req->ns->bdev)) {
		nvmet_req_complete(req, NVME_SC_SUCCESS);
		return;
	}

	if (!nvmet_check_transfer_len(req, 0))
		return;

	bio_init(bio, req->ns->bdev, req->inline_bvec,
		 ARRAY_SIZE(req->inline_bvec), REQ_OP_WRITE | REQ_PREFLUSH);
	bio->bi_private = req;
	bio->bi_end_io = nvmet_bio_done;

	submit_bio(bio);
}

u16 nvmet_bdev_flush(struct nvmet_req *req)
{
	if (!bdev_write_cache(req->ns->bdev))
		return 0;

	if (blkdev_issue_flush(req->ns->bdev))
		return NVME_SC_INTERNAL | NVME_STATUS_DNR;
	return 0;
}

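/*
 * Queue a discard for one DSM range.  Note that -EOPNOTSUPP is deliberately
 * not treated as an error: deallocate is advisory, so a backend that cannot
 * discard still completes the command successfully.
 */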
static u16 nvmet_bdev_discard_range(struct nvmet_req *req,
		struct nvme_dsm_range *range, struct bio **bio)
{
	struct nvmet_ns *ns = req->ns;
	int ret;

	ret = __blkdev_issue_discard(ns->bdev,
			nvmet_lba_to_sect(ns, range->slba),
			le32_to_cpu(range->nlb) << (ns->blksize_shift - 9),
			GFP_KERNEL, bio);
	if (ret && ret != -EOPNOTSUPP) {
		req->error_slba = le64_to_cpu(range->slba);
		return errno_to_nvme_status(req, ret);
	}
	return NVME_SC_SUCCESS;
}

static void nvmet_bdev_execute_discard(struct nvmet_req *req)
{
	struct nvme_dsm_range range;
	struct bio *bio = NULL;
	int i;
	u16 status;

	for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) {
		status = nvmet_copy_from_sgl(req, i * sizeof(range), &range,
				sizeof(range));
		if (status)
			break;

		status = nvmet_bdev_discard_range(req, &range, &bio);
		if (status)
			break;
	}

	if (bio) {
		bio->bi_private = req;
		bio->bi_end_io = nvmet_bio_done;
		if (status)
			bio_io_error(bio);
		else
			submit_bio(bio);
	} else {
		nvmet_req_complete(req, status);
	}
}

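/*
 * Dataset Management: the NR field is 0's based, hence the inclusive loop
 * bound in nvmet_bdev_execute_discard() above.  Only the Deallocate (AD)
 * attribute is acted upon; the integral read/write attributes are advisory
 * and are completed without action.
 */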
static void nvmet_bdev_execute_dsm(struct nvmet_req *req)
{
	if (!nvmet_check_data_len_lte(req, nvmet_dsm_len(req)))
		return;

	switch (le32_to_cpu(req->cmd->dsm.attributes)) {
	case NVME_DSMGMT_AD:
		nvmet_bdev_execute_discard(req);
		return;
	case NVME_DSMGMT_IDR:
	case NVME_DSMGMT_IDW:
	default:
		/* Not supported yet */
		nvmet_req_complete(req, 0);
		return;
	}
}

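/*
 * Write Zeroes: the length field is a 0's based 16-bit count of logical
 * blocks, so one more than its value is converted to 512-byte sectors.
 * For example, length == 0 zeroes a single logical block.
 */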
static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req)
{
	struct nvme_write_zeroes_cmd *write_zeroes = &req->cmd->write_zeroes;
	struct bio *bio = NULL;
	sector_t sector;
	sector_t nr_sector;
	int ret;

	if (!nvmet_check_transfer_len(req, 0))
		return;

	sector = nvmet_lba_to_sect(req->ns, write_zeroes->slba);
	nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) <<
			(req->ns->blksize_shift - 9));

	ret = __blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector,
			GFP_KERNEL, &bio, 0);
	if (bio) {
		bio->bi_private = req;
		bio->bi_end_io = nvmet_bio_done;
		submit_bio(bio);
	} else {
		nvmet_req_complete(req, errno_to_nvme_status(req, ret));
	}
}

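/*
 * Dispatch an I/O command to its execute handler.  For reads and writes,
 * the expected metadata length is recorded up front when both the
 * controller and the namespace support protection information.
 */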
u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req)
{
	switch (req->cmd->common.opcode) {
	case nvme_cmd_read:
	case nvme_cmd_write:
		req->execute = nvmet_bdev_execute_rw;
		if (req->sq->ctrl->pi_support && nvmet_ns_has_pi(req->ns))
			req->metadata_len = nvmet_rw_metadata_len(req);
		return 0;
	case nvme_cmd_flush:
		req->execute = nvmet_bdev_execute_flush;
		return 0;
	case nvme_cmd_dsm:
		req->execute = nvmet_bdev_execute_dsm;
		return 0;
	case nvme_cmd_write_zeroes:
		req->execute = nvmet_bdev_execute_write_zeroes;
		return 0;
	default:
		return nvmet_report_invalid_opcode(req);
	}
}