| 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
| 2 | /* |
| 3 | * Copyright (C) 2008 Oracle. All rights reserved. |
| 4 | */ |
| 5 | |
| 6 | #ifndef BTRFS_DELAYED_REF_H |
| 7 | #define BTRFS_DELAYED_REF_H |
| 8 | |
| 9 | #include <linux/types.h> |
| 10 | #include <linux/refcount.h> |
| 11 | #include <linux/list.h> |
| 12 | #include <linux/rbtree.h> |
| 13 | #include <linux/mutex.h> |
| 14 | #include <linux/spinlock.h> |
| 15 | #include <linux/slab.h> |
| 16 | #include <uapi/linux/btrfs_tree.h> |
| 17 | #include "fs.h" |
| 18 | #include "messages.h" |
| 19 | |
| 20 | struct btrfs_trans_handle; |
| 21 | struct btrfs_fs_info; |
| 22 | |
| 23 | /* these are the possible values of struct btrfs_delayed_ref_node->action */ |
| 24 | enum btrfs_delayed_ref_action { |
| 25 | /* Add one backref to the tree */ |
| 26 | BTRFS_ADD_DELAYED_REF = 1, |
| 27 | /* Delete one backref from the tree */ |
| 28 | BTRFS_DROP_DELAYED_REF, |
| 29 | /* Record a full extent allocation */ |
| 30 | BTRFS_ADD_DELAYED_EXTENT, |
| 31 | /* Not changing ref count on head ref */ |
| 32 | BTRFS_UPDATE_DELAYED_HEAD, |
| 33 | } __packed; |
| 34 | |
| 35 | struct btrfs_data_ref { |
| 36 | /* For EXTENT_DATA_REF */ |
| 37 | |
| 38 | /* Inode which refers to this data extent */ |
| 39 | u64 objectid; |
| 40 | |
| 41 | /* |
| 42 | * file_offset - extent_offset |
| 43 | * |
| 44 | * file_offset is the key.offset of the EXTENT_DATA key. |
| 45 | * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data. |
| 46 | */ |
| 47 | u64 offset; |
| 48 | }; |
| 49 | |
| 50 | struct btrfs_tree_ref { |
| 51 | /* |
| 52 | * Level of this tree block. |
| 53 | * |
| 54 | * Shared for skinny (TREE_BLOCK_REF) and normal tree ref. |
| 55 | */ |
| 56 | int level; |
| 57 | |
| 58 | /* For non-skinny metadata, no special member needed */ |
| 59 | }; |
| 60 | |
| 61 | struct btrfs_delayed_ref_node { |
| 62 | struct rb_node ref_node; |
| 63 | /* |
| 64 | * If action is BTRFS_ADD_DELAYED_REF, also link this node to |
| 65 | * ref_head->ref_add_list, then we do not need to iterate the |
| 66 | * refs rbtree in the corresponding delayed ref head |
| 67 | * (struct btrfs_delayed_ref_head::ref_tree). |
| 68 | */ |
| 69 | struct list_head add_list; |
| 70 | |
| 71 | /* the starting bytenr of the extent */ |
| 72 | u64 bytenr; |
| 73 | |
| 74 | /* the size of the extent */ |
| 75 | u64 num_bytes; |
| 76 | |
| 77 | /* seq number to keep track of insertion order */ |
| 78 | u64 seq; |
| 79 | |
| 80 | /* The ref_root for this ref */ |
| 81 | u64 ref_root; |
| 82 | |
| 83 | /* |
| 84 | * The parent for this ref, if this isn't set the ref_root is the |
| 85 | * reference owner. |
| 86 | */ |
| 87 | u64 parent; |
| 88 | |
| 89 | /* ref count on this data structure */ |
| 90 | refcount_t refs; |
| 91 | |
| 92 | /* |
| 93 | * how many refs is this entry adding or deleting. For |
| 94 | * head refs, this may be a negative number because it is keeping |
| 95 | * track of the total mods done to the reference count. |
| 96 | * For individual refs, this will always be a positive number |
| 97 | * |
| 98 | * It may be more than one, since it is possible for a single |
| 99 | * parent to have more than one ref on an extent |
| 100 | */ |
| 101 | int ref_mod; |
| 102 | |
| 103 | unsigned int action:8; |
| 104 | unsigned int type:8; |
| 105 | |
| 106 | union { |
| 107 | struct btrfs_tree_ref tree_ref; |
| 108 | struct btrfs_data_ref data_ref; |
| 109 | }; |
| 110 | }; |
| 111 | |
| 112 | struct btrfs_delayed_extent_op { |
| 113 | struct btrfs_disk_key key; |
| 114 | bool update_key; |
| 115 | bool update_flags; |
| 116 | u64 flags_to_set; |
| 117 | }; |
| 118 | |
| 119 | /* |
| 120 | * the head refs are used to hold a lock on a given extent, which allows us |
| 121 | * to make sure that only one process is running the delayed refs |
| 122 | * at a time for a single extent. They also store the sum of all the |
| 123 | * reference count modifications we've queued up. |
| 124 | */ |
| 125 | struct btrfs_delayed_ref_head { |
| 126 | u64 bytenr; |
| 127 | u64 num_bytes; |
| 128 | /* |
| 129 | * the mutex is held while running the refs, and it is also |
| 130 | * held when checking the sum of reference modifications. |
| 131 | */ |
| 132 | struct mutex mutex; |
| 133 | |
| 134 | refcount_t refs; |
| 135 | |
| 136 | /* Protects 'ref_tree' and 'ref_add_list'. */ |
| 137 | spinlock_t lock; |
| 138 | struct rb_root_cached ref_tree; |
| 139 | /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */ |
| 140 | struct list_head ref_add_list; |
| 141 | |
| 142 | struct btrfs_delayed_extent_op *extent_op; |
| 143 | |
| 144 | /* |
| 145 | * This is used to track the final ref_mod from all the refs associated |
| 146 | * with this head ref, this is not adjusted as delayed refs are run, |
| 147 | * this is meant to track if we need to do the csum accounting or not. |
| 148 | */ |
| 149 | int total_ref_mod; |
| 150 | |
| 151 | /* |
| 152 | * This is the current outstanding mod references for this bytenr. This |
| 153 | * is used with lookup_extent_info to get an accurate reference count |
| 154 | * for a bytenr, so it is adjusted as delayed refs are run so that any |
| 155 | * on disk reference count + ref_mod is accurate. |
| 156 | */ |
| 157 | int ref_mod; |
| 158 | |
| 159 | /* |
| 160 | * The root that triggered the allocation when must_insert_reserved is |
| 161 | * set to true. |
| 162 | */ |
| 163 | u64 owning_root; |
| 164 | |
| 165 | /* |
| 166 | * Track reserved bytes when setting must_insert_reserved. On success |
| 167 | * or cleanup, we will need to free the reservation. |
| 168 | */ |
| 169 | u64 reserved_bytes; |
| 170 | |
| 171 | /* Tree block level, for metadata only. */ |
| 172 | u8 level; |
| 173 | |
| 174 | /* |
| 175 | * when a new extent is allocated, it is just reserved in memory |
| 176 | * The actual extent isn't inserted into the extent allocation tree |
| 177 | * until the delayed ref is processed. must_insert_reserved is |
| 178 | * used to flag a delayed ref so the accounting can be updated |
| 179 | * when a full insert is done. |
| 180 | * |
| 181 | * It is possible the extent will be freed before it is ever |
| 182 | * inserted into the extent allocation tree. In this case |
| 183 | * we need to update the in ram accounting to properly reflect |
| 184 | * the free has happened. |
| 185 | */ |
| 186 | bool must_insert_reserved; |
| 187 | |
| 188 | bool is_data; |
| 189 | bool is_system; |
| 190 | bool processing; |
| 191 | /* |
| 192 | * Indicate if it's currently in the data structure that tracks head |
| 193 | * refs (struct btrfs_delayed_ref_root::head_refs). |
| 194 | */ |
| 195 | bool tracked; |
| 196 | }; |
| 197 | |
| 198 | enum btrfs_delayed_ref_flags { |
| 199 | /* Indicate that we are flushing delayed refs for the commit */ |
| 200 | BTRFS_DELAYED_REFS_FLUSHING, |
| 201 | }; |
| 202 | |
| 203 | struct btrfs_delayed_ref_root { |
| 204 | /* |
| 205 | * Track head references. |
| 206 | * The keys correspond to the logical address of the extent ("bytenr") |
| 207 | * right shifted by fs_info->sectorsize_bits. This is both to get a more |
| 208 | * dense index space (optimizes xarray structure) and because indexes in |
| 209 | * xarrays are of "unsigned long" type, meaning they are 32 bits wide on |
| 210 | * 32 bits platforms, limiting the extent range to 4G which is too low |
| 211 | * and makes it unusable (truncated index values) on 32 bits platforms. |
| 212 | * Protected by the spinlock 'lock' defined below. |
| 213 | */ |
| 214 | struct xarray head_refs; |
| 215 | |
| 216 | /* |
| 217 | * Track dirty extent records. |
| 218 | * The keys correspond to the logical address of the extent ("bytenr") |
| 219 | * right shifted by fs_info->sectorsize_bits, for same reasons as above. |
| 220 | */ |
| 221 | struct xarray dirty_extents; |
| 222 | |
| 223 | /* |
| 224 | * Protects the xarray head_refs, its entries and the following fields: |
| 225 | * num_heads, num_heads_ready, pending_csums and run_delayed_start. |
| 226 | */ |
| 227 | spinlock_t lock; |
| 228 | |
| 229 | /* Total number of head refs, protected by the spinlock 'lock'. */ |
| 230 | unsigned long num_heads; |
| 231 | |
| 232 | /* |
| 233 | * Total number of head refs ready for processing, protected by the |
| 234 | * spinlock 'lock'. |
| 235 | */ |
| 236 | unsigned long num_heads_ready; |
| 237 | |
| 238 | /* |
| 239 | * Track space reserved for deleting csums of data extents. |
| 240 | * Protected by the spinlock 'lock'. |
| 241 | */ |
| 242 | u64 pending_csums; |
| 243 | |
| 244 | unsigned long flags; |
| 245 | |
| 246 | /* |
| 247 | * Track from which bytenr to start searching ref heads. |
| 248 | * Protected by the spinlock 'lock'. |
| 249 | */ |
| 250 | u64 run_delayed_start; |
| 251 | |
| 252 | /* |
| 253 | * To make qgroup to skip given root. |
| 254 | * This is for snapshot, as btrfs_qgroup_inherit() will manually |
| 255 | * modify counters for snapshot and its source, so we should skip |
| 256 | * the snapshot in new_root/old_roots or it will get calculated twice |
| 257 | */ |
| 258 | u64 qgroup_to_skip; |
| 259 | }; |
| 260 | |
| 261 | enum btrfs_ref_type { |
| 262 | BTRFS_REF_NOT_SET, |
| 263 | BTRFS_REF_DATA, |
| 264 | BTRFS_REF_METADATA, |
| 265 | } __packed; |
| 266 | |
| 267 | struct btrfs_ref { |
| 268 | enum btrfs_ref_type type; |
| 269 | enum btrfs_delayed_ref_action action; |
| 270 | |
| 271 | /* |
| 272 | * Whether this extent should go through qgroup record. |
| 273 | * |
| 274 | * Normally false, but for certain cases like delayed subtree scan, |
| 275 | * setting this flag can hugely reduce qgroup overhead. |
| 276 | */ |
| 277 | bool skip_qgroup; |
| 278 | |
| 279 | u64 bytenr; |
| 280 | u64 num_bytes; |
| 281 | u64 owning_root; |
| 282 | |
| 283 | /* |
| 284 | * The root that owns the reference for this reference, this will be set |
| 285 | * or ->parent will be set, depending on what type of reference this is. |
| 286 | */ |
| 287 | u64 ref_root; |
| 288 | |
| 289 | /* Bytenr of the parent tree block */ |
| 290 | u64 parent; |
| 291 | union { |
| 292 | struct btrfs_data_ref data_ref; |
| 293 | struct btrfs_tree_ref tree_ref; |
| 294 | }; |
| 295 | |
| 296 | #ifdef CONFIG_BTRFS_DEBUG |
| 297 | /* Through which root is this modification. */ |
| 298 | u64 real_root; |
| 299 | #endif |
| 300 | }; |
| 301 | |
| 302 | extern struct kmem_cache *btrfs_delayed_ref_head_cachep; |
| 303 | extern struct kmem_cache *btrfs_delayed_ref_node_cachep; |
| 304 | extern struct kmem_cache *btrfs_delayed_extent_op_cachep; |
| 305 | |
| 306 | int __init btrfs_delayed_ref_init(void); |
| 307 | void __cold btrfs_delayed_ref_exit(void); |
| 308 | |
| 309 | static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_info, |
| 310 | int num_delayed_refs) |
| 311 | { |
| 312 | u64 num_bytes; |
| 313 | |
| 314 | num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items: num_delayed_refs); |
| 315 | |
| 316 | /* |
| 317 | * We have to check the mount option here because we could be enabling |
| 318 | * the free space tree for the first time and don't have the compat_ro |
| 319 | * option set yet. |
| 320 | * |
| 321 | * We need extra reservations if we have the free space tree because |
| 322 | * we'll have to modify that tree as well. |
| 323 | */ |
| 324 | if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) |
| 325 | num_bytes *= 2; |
| 326 | |
| 327 | return num_bytes; |
| 328 | } |
| 329 | |
| 330 | static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info *fs_info, |
| 331 | int num_csum_items) |
| 332 | { |
| 333 | /* |
| 334 | * Deleting csum items does not result in new nodes/leaves and does not |
| 335 | * require changing the free space tree, only the csum tree, so this is |
| 336 | * all we need. |
| 337 | */ |
| 338 | return btrfs_calc_metadata_size(fs_info, num_items: num_csum_items); |
| 339 | } |
| 340 | |
| 341 | void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, |
| 342 | bool skip_qgroup); |
| 343 | void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, |
| 344 | u64 mod_root, bool skip_qgroup); |
| 345 | |
| 346 | static inline struct btrfs_delayed_extent_op * |
| 347 | btrfs_alloc_delayed_extent_op(void) |
| 348 | { |
| 349 | return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS); |
| 350 | } |
| 351 | |
| 352 | static inline void |
| 353 | btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) |
| 354 | { |
| 355 | if (op) |
| 356 | kmem_cache_free(s: btrfs_delayed_extent_op_cachep, objp: op); |
| 357 | } |
| 358 | |
| 359 | void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref); |
| 360 | |
| 361 | static inline u64 btrfs_ref_head_to_space_flags( |
| 362 | struct btrfs_delayed_ref_head *head_ref) |
| 363 | { |
| 364 | if (head_ref->is_data) |
| 365 | return BTRFS_BLOCK_GROUP_DATA; |
| 366 | else if (head_ref->is_system) |
| 367 | return BTRFS_BLOCK_GROUP_SYSTEM; |
| 368 | return BTRFS_BLOCK_GROUP_METADATA; |
| 369 | } |
| 370 | |
| 371 | static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head) |
| 372 | { |
| 373 | if (refcount_dec_and_test(r: &head->refs)) |
| 374 | kmem_cache_free(s: btrfs_delayed_ref_head_cachep, objp: head); |
| 375 | } |
| 376 | |
| 377 | int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, |
| 378 | struct btrfs_ref *generic_ref, |
| 379 | struct btrfs_delayed_extent_op *extent_op); |
| 380 | int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, |
| 381 | struct btrfs_ref *generic_ref, |
| 382 | u64 reserved); |
| 383 | int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, |
| 384 | u64 bytenr, u64 num_bytes, u8 level, |
| 385 | struct btrfs_delayed_extent_op *extent_op); |
| 386 | void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, |
| 387 | struct btrfs_delayed_ref_root *delayed_refs, |
| 388 | struct btrfs_delayed_ref_head *head); |
| 389 | |
| 390 | struct btrfs_delayed_ref_head * |
| 391 | btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info, |
| 392 | struct btrfs_delayed_ref_root *delayed_refs, |
| 393 | u64 bytenr); |
| 394 | static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) |
| 395 | { |
| 396 | mutex_unlock(lock: &head->mutex); |
| 397 | } |
| 398 | void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, |
| 399 | struct btrfs_delayed_ref_root *delayed_refs, |
| 400 | struct btrfs_delayed_ref_head *head); |
| 401 | |
| 402 | struct btrfs_delayed_ref_head *btrfs_select_ref_head( |
| 403 | const struct btrfs_fs_info *fs_info, |
| 404 | struct btrfs_delayed_ref_root *delayed_refs); |
| 405 | void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, |
| 406 | struct btrfs_delayed_ref_head *head); |
| 407 | struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head); |
| 408 | |
| 409 | int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); |
| 410 | |
| 411 | void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums); |
| 412 | void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); |
| 413 | void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info); |
| 414 | void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info); |
| 415 | void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); |
| 416 | void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); |
| 417 | int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, |
| 418 | enum btrfs_reserve_flush_enum flush); |
| 419 | bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); |
| 420 | bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, |
| 421 | u64 root, u64 parent); |
| 422 | void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans); |
| 423 | |
| 424 | static inline u64 btrfs_delayed_ref_owner(const struct btrfs_delayed_ref_node *node) |
| 425 | { |
| 426 | if (node->type == BTRFS_EXTENT_DATA_REF_KEY || |
| 427 | node->type == BTRFS_SHARED_DATA_REF_KEY) |
| 428 | return node->data_ref.objectid; |
| 429 | return node->tree_ref.level; |
| 430 | } |
| 431 | |
| 432 | static inline u64 btrfs_delayed_ref_offset(const struct btrfs_delayed_ref_node *node) |
| 433 | { |
| 434 | if (node->type == BTRFS_EXTENT_DATA_REF_KEY || |
| 435 | node->type == BTRFS_SHARED_DATA_REF_KEY) |
| 436 | return node->data_ref.offset; |
| 437 | return 0; |
| 438 | } |
| 439 | |
| 440 | static inline u8 btrfs_ref_type(const struct btrfs_ref *ref) |
| 441 | { |
| 442 | ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA); |
| 443 | |
| 444 | if (ref->type == BTRFS_REF_DATA) { |
| 445 | if (ref->parent) |
| 446 | return BTRFS_SHARED_DATA_REF_KEY; |
| 447 | else |
| 448 | return BTRFS_EXTENT_DATA_REF_KEY; |
| 449 | } else { |
| 450 | if (ref->parent) |
| 451 | return BTRFS_SHARED_BLOCK_REF_KEY; |
| 452 | else |
| 453 | return BTRFS_TREE_BLOCK_REF_KEY; |
| 454 | } |
| 455 | |
| 456 | return 0; |
| 457 | } |
| 458 | |
| 459 | #endif |
| 460 | |