From 94f54924b96d3565c6b559294b3401b5496c21ac Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 12 Sep 2025 15:43:21 +0900 Subject: btrfs: zoned: fix conventional zone capacity calculation When a block group contains both conventional zone and sequential zone, the capacity of the block group is wrongly set to the block group's full length. The capacity should be calculated in btrfs_load_block_group_* using the last allocation offset. Fixes: 568220fa9657 ("btrfs: zoned: support RAID0/1/10 on top of raid stripe tree") CC: stable@vger.kernel.org # v6.12+ Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 838149fa60ce..8f006dff8893 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1317,6 +1317,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, if (!btrfs_dev_is_sequential(device, info->physical)) { up_read(&dev_replace->rwsem); info->alloc_offset = WP_CONVENTIONAL; + info->capacity = device->zone_info->zone_size; return 0; } @@ -1683,8 +1684,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); if (num_conventional > 0) { - /* Zone capacity is always zone size in emulation */ - cache->zone_capacity = cache->length; ret = calculate_alloc_pointer(cache, &last_alloc, new); if (ret) { btrfs_err(fs_info, @@ -1693,6 +1692,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } else if (map->num_stripes == num_conventional) { cache->alloc_offset = last_alloc; + cache->zone_capacity = cache->length; set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); goto out; } -- cgit v1.3-3-g829e From 6a1ab50135ce829b834b448ce49867b5210a1641 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 16 Sep 2025 11:46:11 +0900 Subject: btrfs: zoned: fix stripe width calculation The stripe offset calculation in the zoned code for raid0 and raid10 wrongly uses map->stripe_size to calculate it. In fact, map->stripe_size is the size of the device extent composing the block group, which always is the zone_size on the zoned setup. Fix it by using BTRFS_STRIPE_LEN and BTRFS_STRIPE_LEN_SHIFT. Also, optimize the calculation a bit by doing the common calculation only once. Fixes: c0d90a79e8e6 ("btrfs: zoned: fix alloc_offset calculation for partly conventional block groups") CC: stable@vger.kernel.org # 6.17+ Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 56 ++++++++++++++++++++++++++------------------------------ 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 8f006dff8893..b622c73fe30f 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1523,6 +1523,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; + u64 stripe_nr = 0, stripe_offset = 0; + u32 stripe_index = 0; if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", @@ -1530,28 +1532,26 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, return -EINVAL; } + if (last_alloc) { + u32 factor = map->num_stripes; + + stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT; + stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK; + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + } + for (int i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { - u64 stripe_nr, full_stripe_nr; - u64 stripe_offset; - int stripe_index; - stripe_nr = div64_u64(last_alloc, map->stripe_size); - stripe_offset = stripe_nr * map->stripe_size; - full_stripe_nr = div_u64(stripe_nr, map->num_stripes); - div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - - zone_info[i].alloc_offset = - full_stripe_nr * map->stripe_size; + zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr); if (stripe_index > i) - zone_info[i].alloc_offset += map->stripe_size; + zone_info[i].alloc_offset += BTRFS_STRIPE_LEN; else if (stripe_index == i) - zone_info[i].alloc_offset += - (last_alloc - stripe_offset); + zone_info[i].alloc_offset += stripe_offset; } if (test_bit(0, active) != test_bit(i, active)) { @@ -1575,6 +1575,8 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; + u64 stripe_nr = 0, stripe_offset = 0; + u32 stripe_index = 0; if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", @@ -1582,6 +1584,14 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, return -EINVAL; } + if (last_alloc) { + u32 factor = map->num_stripes / map->sub_stripes; + + stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT; + stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK; + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + } + for (int i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; @@ -1595,26 +1605,12 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, } if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { - u64 stripe_nr, full_stripe_nr; - u64 stripe_offset; - int stripe_index; - - stripe_nr = div64_u64(last_alloc, map->stripe_size); - stripe_offset = stripe_nr * map->stripe_size; - full_stripe_nr = div_u64(stripe_nr, - map->num_stripes / map->sub_stripes); - div_u64_rem(stripe_nr, - (map->num_stripes / map->sub_stripes), - &stripe_index); - - zone_info[i].alloc_offset = - full_stripe_nr * map->stripe_size; + zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr); if (stripe_index > (i / map->sub_stripes)) - zone_info[i].alloc_offset += map->stripe_size; + zone_info[i].alloc_offset += BTRFS_STRIPE_LEN; else if (stripe_index == (i / map->sub_stripes)) - zone_info[i].alloc_offset += - (last_alloc - stripe_offset); + zone_info[i].alloc_offset += stripe_offset; } if ((i % map->sub_stripes) == 0) { -- cgit v1.3-3-g829e From bfe3d755ef7cec71aac6ecda34a107624735aac7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 29 Oct 2025 13:05:32 +0000 Subject: btrfs: do not update last_log_commit when logging inode due to a new name When logging that a new name exists, we skip updating the inode's last_log_commit field to prevent a later explicit fsync against the inode from doing nothing (as updating last_log_commit makes btrfs_inode_in_log() return true). We are detecting, at btrfs_log_inode(), that logging a new name is happening by checking the logging mode is not LOG_INODE_EXISTS, but that is not enough because we may log parent directories when logging a new name of a file in LOG_INODE_ALL mode - we need to check that the logging_new_name field of the log context too. An example scenario where this results in an explicit fsync against a directory not persisting changes to the directory is the following: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt $ touch /mnt/foo $ sync $ mkdir /mnt/dir # Write some data to our file and fsync it. $ xfs_io -c "pwrite -S 0xab 0 64K" -c "fsync" /mnt/foo # Add a new link to our file. Since the file was logged before, we # update it in the log tree by calling btrfs_log_new_name(). $ ln /mnt/foo /mnt/dir/bar # fsync the root directory - we expect it to persist the dentry for # the new directory "dir". $ xfs_io -c "fsync" /mnt After mounting the fs the entry for directory "dir" does not exists, despite the explicit fsync on the root directory. Here's why this happens: 1) When we fsync the file we log the inode, so that it's present in the log tree; 2) When adding the new link we enter btrfs_log_new_name(), and since the inode is in the log tree we proceed to updating the inode in the log tree; 3) We first set the inode's last_unlink_trans to the current transaction (early in btrfs_log_new_name()); 4) We then eventually enter btrfs_log_inode_parent(), and after logging the file's inode, we call btrfs_log_all_parents() because the inode's last_unlink_trans matches the current transaction's ID (updated in the previous step); 5) So btrfs_log_all_parents() logs the root directory by calling btrfs_log_inode() for the root's inode with a log mode of LOG_INODE_ALL so that new dentries are logged; 6) At btrfs_log_inode(), because the log mode is LOG_INODE_ALL, we update root inode's last_log_commit to the last transaction that changed the inode (->last_sub_trans field of the inode), which corresponds to the current transaction's ID; 7) Then later when user space explicitly calls fsync against the root directory, we enter btrfs_sync_file(), which calls skip_inode_logging() and that returns true, since its call to btrfs_inode_in_log() returns true and there are no ordered extents (it's a directory, never has ordered extents). This results in btrfs_sync_file() returning without syncing the log or committing the current transaction, so all the updates we did when logging the new name, including logging the root directory, are not persisted. So fix this by but updating the inode's last_log_commit if we are sure we are not logging a new name (if ctx->logging_new_name is false). A test case for fstests will follow soon. Reported-by: Vyacheslav Kovalevsky Link: https://lore.kernel.org/linux-btrfs/03c5d7ec-5b3d-49d1-95bc-8970a7f82d87@gmail.com/ Fixes: 130341be7ffa ("btrfs: always update the logged transaction when logging new names") CC: stable@vger.kernel.org # 6.1+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 00a59fb79167..98599644986f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7122,7 +7122,7 @@ log_extents: * a power failure unless the log was synced as part of an fsync * against any other unrelated inode. */ - if (inode_only != LOG_INODE_EXISTS) + if (!ctx->logging_new_name && inode_only != LOG_INODE_EXISTS) inode->last_log_commit = inode->last_sub_trans; spin_unlock(&inode->lock); -- cgit v1.3-3-g829e From 5fea61aa1ca70c4b3738eebad9ce2d7e7938ebbd Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Wed, 5 Nov 2025 03:53:21 +0000 Subject: btrfs: scrub: put bio after errors in scrub_raid56_parity_stripe() scrub_raid56_parity_stripe() allocates a bio with bio_alloc(), but fails to release it on some error paths, leading to a potential memory leak. Add the missing bio_put() calls to properly drop the bio reference in those error cases. Fixes: 1009254bf22a3 ("btrfs: scrub: use scrub_stripe to implement RAID56 P/Q scrub") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Qu Wenruo Signed-off-by: Zilin Guan Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 651b11884f82..ba20d9286a34 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2203,6 +2203,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, &length, &bioc, NULL, NULL); if (ret < 0) { + bio_put(bio); btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); goto out; @@ -2212,6 +2213,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, btrfs_put_bioc(bioc); if (!rbio) { ret = -ENOMEM; + bio_put(bio); btrfs_bio_counter_dec(fs_info); goto out; } -- cgit v1.3-3-g829e From c367af440e03eba7beb0c9f3fe540f9bcb69134a Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Wed, 5 Nov 2025 02:37:22 +0000 Subject: btrfs: release root after error in data_reloc_print_warning_inode() data_reloc_print_warning_inode() calls btrfs_get_fs_root() to obtain local_root, but fails to release its reference when paths_from_inode() returns an error. This causes a potential memory leak. Add a missing btrfs_put_root() call in the error path to properly decrease the reference count of local_root. Fixes: b9a9a85059cde ("btrfs: output affected files when relocation fails") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Qu Wenruo Signed-off-by: Zilin Guan Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b95175116ea3..d097532fd85b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -177,8 +177,10 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, return ret; } ret = paths_from_inode(inum, ipath); - if (ret < 0) + if (ret < 0) { + btrfs_put_root(local_root); goto err; + } /* * We deliberately ignore the bit ipath might have been too small to -- cgit v1.3-3-g829e