Skip to content

Commit c53e965

Browse files
adam900710kdave
authored andcommitted
btrfs: qgroup: try to flush qgroup space when we get -EDQUOT
[PROBLEM] There is a known problem related to how btrfs handles qgroup reserved space. One of the most obvious cases is the test case btrfs/153, which does fallocate, then writes into the preallocated range. btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad) --- tests/btrfs/153.out 2019-10-22 15:18:14.068965341 +0800 +++ xfstests-dev/results//btrfs/153.out.bad 2020-07-01 20:24:40.730000089 +0800 @@ -1,2 +1,5 @@ QA output created by 153 +pwrite: Disk quota exceeded +/mnt/scratch/testfile2: Disk quota exceeded +/mnt/scratch/testfile2: Disk quota exceeded Silence is golden ... (Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad' to see the entire diff) [CAUSE] Since commit c6887cd ("Btrfs: don't do nocow check unless we have to"), we always reserve space no matter if it's COW or not. Such behavior change is mostly for performance, and reverting it is not a good idea anyway. For a preallocated extent, we reserve qgroup data space for it already, and since we also reserve data space for qgroup at buffered write time, it needs twice the space for us to write into preallocated space. This leads to the -EDQUOT in the buffered write routine. And we can't follow the same solution: unlike the data/meta space check, qgroup reserved space is shared between data/metadata. The EDQUOT can happen at the metadata reservation, so doing a NODATACOW check after qgroup reservation failure is not a solution. [FIX] To solve the problem, we don't return -EDQUOT directly, but every time we get a -EDQUOT, we try to flush qgroup space: - Flush all inodes of the root NODATACOW writes will free the qgroup reserved space at run_delalloc_range(). However we don't have the infrastructure to only flush NODATACOW inodes, so here we flush all inodes anyway. - Wait for ordered extents This would convert the preallocated metadata space into per-trans metadata, which can be freed in a later transaction commit. 
- Commit transaction This will free all per-trans metadata space. Also we don't want to trigger flush multiple times, so here we introduce a per-root wait list and a new root status, to ensure only one thread starts the flushing. Fixes: c6887cd ("Btrfs: don't do nocow check unless we have to") Reviewed-by: Josef Bacik <[email protected]> Signed-off-by: Qu Wenruo <[email protected]> Reviewed-by: David Sterba <[email protected]> Signed-off-by: David Sterba <[email protected]>
1 parent 263da81 commit c53e965

File tree

3 files changed

+96
-8
lines changed

3 files changed

+96
-8
lines changed

fs/btrfs/ctree.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1012,6 +1012,8 @@ enum {
10121012
BTRFS_ROOT_DEAD_TREE,
10131013
/* The root has a log tree. Used only for subvolume roots. */
10141014
BTRFS_ROOT_HAS_LOG_TREE,
1015+
/* Qgroup flushing is in progress */
1016+
BTRFS_ROOT_QGROUP_FLUSHING,
10151017
};
10161018

10171019
/*
@@ -1164,6 +1166,7 @@ struct btrfs_root {
11641166
spinlock_t qgroup_meta_rsv_lock;
11651167
u64 qgroup_meta_rsv_pertrans;
11661168
u64 qgroup_meta_rsv_prealloc;
1169+
wait_queue_head_t qgroup_flush_wait;
11671170

11681171
/* Number of active swapfiles */
11691172
atomic_t nr_swapfiles;

fs/btrfs/disk-io.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1116,6 +1116,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
11161116
mutex_init(&root->log_mutex);
11171117
mutex_init(&root->ordered_extent_mutex);
11181118
mutex_init(&root->delalloc_mutex);
1119+
init_waitqueue_head(&root->qgroup_flush_wait);
11191120
init_waitqueue_head(&root->log_writer_wait);
11201121
init_waitqueue_head(&root->log_commit_wait[0]);
11211122
init_waitqueue_head(&root->log_commit_wait[1]);

fs/btrfs/qgroup.c

Lines changed: 92 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3515,17 +3515,58 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode,
35153515
}
35163516

35173517
/*
3518-
* Reserve qgroup space for range [start, start + len).
3518+
* Try to free some space for qgroup.
35193519
*
3520-
* This function will either reserve space from related qgroups or doing
3521-
* nothing if the range is already reserved.
3520+
* For qgroup, there are only 3 ways to free qgroup space:
3521+
* - Flush nodatacow write
3522+
* Any nodatacow write will free its reserved data space at run_delalloc_range().
3523+
* In theory, we should only flush nodatacow inodes, but it's not yet
3524+
* possible, so we need to flush the whole root.
35223525
*
3523-
* Return 0 for successful reserve
3524-
* Return <0 for error (including -EQUOT)
3526+
* - Wait for ordered extents
3527+
* When ordered extents are finished, their reserved metadata is finally
3528+
* converted to per_trans status, which can be freed by later commit
3529+
* transaction.
35253530
*
3526-
* NOTE: this function may sleep for memory allocation.
3531+
* - Commit transaction
3532+
* This would free the meta_per_trans space.
3533+
 * In theory this shouldn't provide much space, but every bit of qgroup space
3534+
* is needed.
35273535
*/
3528-
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
3536+
static int try_flush_qgroup(struct btrfs_root *root)
3537+
{
3538+
struct btrfs_trans_handle *trans;
3539+
int ret;
3540+
3541+
/*
3542+
* We don't want to run flush again and again, so if there is a running
3543+
* one, we won't try to start a new flush, but exit directly.
3544+
*/
3545+
if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
3546+
wait_event(root->qgroup_flush_wait,
3547+
!test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
3548+
return 0;
3549+
}
3550+
3551+
ret = btrfs_start_delalloc_snapshot(root);
3552+
if (ret < 0)
3553+
goto out;
3554+
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
3555+
3556+
trans = btrfs_join_transaction(root);
3557+
if (IS_ERR(trans)) {
3558+
ret = PTR_ERR(trans);
3559+
goto out;
3560+
}
3561+
3562+
ret = btrfs_commit_transaction(trans);
3563+
out:
3564+
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
3565+
wake_up(&root->qgroup_flush_wait);
3566+
return ret;
3567+
}
3568+
3569+
static int qgroup_reserve_data(struct btrfs_inode *inode,
35293570
struct extent_changeset **reserved_ret, u64 start,
35303571
u64 len)
35313572
{
@@ -3578,6 +3619,34 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
35783619
return ret;
35793620
}
35803621

3622+
/*
3623+
* Reserve qgroup space for range [start, start + len).
3624+
*
3625+
* This function will either reserve space from related qgroups or do nothing
3626+
* if the range is already reserved.
3627+
*
3628+
* Return 0 for successful reservation
3629+
* Return <0 for error (including -EQUOT)
3630+
*
3631+
* NOTE: This function may sleep for memory allocation, dirty page flushing and
3632+
* commit transaction. So caller should not hold any dirty page locked.
3633+
*/
3634+
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
3635+
struct extent_changeset **reserved_ret, u64 start,
3636+
u64 len)
3637+
{
3638+
int ret;
3639+
3640+
ret = qgroup_reserve_data(inode, reserved_ret, start, len);
3641+
if (ret <= 0 && ret != -EDQUOT)
3642+
return ret;
3643+
3644+
ret = try_flush_qgroup(inode->root);
3645+
if (ret < 0)
3646+
return ret;
3647+
return qgroup_reserve_data(inode, reserved_ret, start, len);
3648+
}
3649+
35813650
/* Free ranges specified by @reserved, normally in error path */
35823651
static int qgroup_free_reserved_data(struct btrfs_inode *inode,
35833652
struct extent_changeset *reserved, u64 start, u64 len)
@@ -3746,7 +3815,7 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
37463815
return num_bytes;
37473816
}
37483817

3749-
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3818+
static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
37503819
enum btrfs_qgroup_rsv_type type, bool enforce)
37513820
{
37523821
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -3773,6 +3842,21 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
37733842
return ret;
37743843
}
37753844

3845+
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3846+
enum btrfs_qgroup_rsv_type type, bool enforce)
3847+
{
3848+
int ret;
3849+
3850+
ret = qgroup_reserve_meta(root, num_bytes, type, enforce);
3851+
if (ret <= 0 && ret != -EDQUOT)
3852+
return ret;
3853+
3854+
ret = try_flush_qgroup(root);
3855+
if (ret < 0)
3856+
return ret;
3857+
return qgroup_reserve_meta(root, num_bytes, type, enforce);
3858+
}
3859+
37763860
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
37773861
{
37783862
struct btrfs_fs_info *fs_info = root->fs_info;

0 commit comments

Comments
 (0)