Linux btrfs checksum tree与csum查找校验匹配
Linux btrfs checksum tree与csum查找校验匹配
btrfs使用独立的checksum tree(csum tree)来存储文件数据块的校验和。csum tree是btrfs中一棵特殊的B-tree,其root存储在fs_info->csum_root中。每个csum tree的key类型为BTRFS_EXTENT_CSUM_KEY,key的objectid为文件extent在逻辑地址空间中的起始偏移量,offset为csum条目的序号。
checksum tree的叶子节点存储struct btrfs_csum_item,每个csum_item对应一个sector(通常为4KB)的校验值:
struct btrfs_csum_item {
u8 csum;
} __attribute__ ((__packed__));
csum字段的长度取决于文件系统创建时选择的校验算法。默认使用CRC32C时csum长度为4字节,使用xxhash时长度为8字节,使用sha256时长度为32字节,使用blake2b时长度为64字节。一个16KB的leaf节点中,CRC32C模式可存放约4096个csum_item,覆盖约16MB的数据范围。
写入数据时,btrfs在btrfs_finish_ordered_io()中计算校验和并插入csum tree:
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
{
struct btrfs_fs_info *fs_info = ordered->fs_info;
struct btrfs_root *csum_root = fs_info->csum_root;
struct btrfs_ordered_sum *sums;
u64 logical;
int ret;
logical = ordered->disk_bytenr;
list_for_each_entry(sums, &ordered->list, list) {
ret = btrfs_csum_one_bio(csum_root, ordered->inode,
sums, sums->bytenr);
if (ret)
return ret;
}
ret = insert_ordered_extent_file_extent(csum_root, ordered);
return ret;
}
btrfs_csum_one_bio()接收ordered_io完成时生成的struct btrfs_ordered_sum,其中包含了bio中每个sector的校验值。该函数将这些校验值批量插入csum tree:
int btrfs_csum_one_bio(struct btrfs_root *csum_root,
struct inode *inode,
struct btrfs_ordered_sum *sums,
u64 disk_bytenr)
{
struct btrfs_csum_item *item;
struct btrfs_path *path;
u64 offset;
u64 csum_offset;
int ret;
path = btrfs_alloc_path();
for (offset = 0; offset < sums->len; offset += fs_info->sectorsize) {
csum_offset = offset >> fs_info->sectorsize_bits;
item = btrfs_lookup_csum(csum_root, path,
disk_bytenr + offset, 1);
if (IS_ERR(item)) {
ret = btrfs_insert_csum(csum_root, path,
disk_bytenr + offset,
sums->sums + csum_offset *
csum_size);
} else {
write_extent_buffer(path->nodes[0],
sums->sums + csum_offset * csum_size,
(unsigned long)item, csum_size);
}
btrfs_release_path(path);
}
btrfs_free_path(path);
return 0;
}
btrfs_lookup_csum()在csum tree中搜索指定逻辑地址对应的csum item。如果找到,原地覆盖写入新校验值;如果未找到,调用btrfs_insert_csum()插入新的csum条目。
读取数据时,校验验证由btrfs_check_read_bio()完成。该函数在bio读取完成后被调用,将读取到的数据重新计算校验值并与csum tree中的记录比较:
blk_status_t btrfs_check_read_bio(struct btrfs_fs_info *fs_info,
struct bio *bio, int mirror_num)
{
struct bvec_iter iter;
struct bio_vec *bv;
u32 sectorsize = fs_info->sectorsize;
u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
u8 *csum_expected;
u8 *csum_found;
int ret;
csum_expected = kzalloc(fs_info->csum_size, GFP_NOFS);
csum_found = kzalloc(fs_info->csum_size, GFP_NOFS);
bio_for_each_segment(bv, bio, iter) {
u64 offset = logical + iter.bi_bvec_done;
u64 cur_offset;
for (cur_offset = offset;
cur_offset < offset + bv->bv_len;
cur_offset += sectorsize) {
ret = btrfs_lookup_csum(fs_info->csum_root,
NULL, cur_offset, csum_expected);
btrfs_csum_data(fs_info, bv->bv_page,
cur_offset - logical, csum_found,
sectorsize);
if (memcmp(csum_expected, csum_found,
fs_info->csum_size)) {
if (mirror_num > fs_info->mirror_num_max)
return BLK_STS_IOERR;
return btrfs_check_read_bio(fs_info, bio,
mirror_num + 1);
}
}
}
kfree(csum_expected);
kfree(csum_found);
return 0;
}
csum查找失败时触发btrfs的镜像切换(mirror switching)机制。对于RAID1/10/5/6配置,btrfs自动尝试从下一个镜像读取相同数据,如果所有镜像校验均失败则返回IO错误。btrfs的读路径中,校验失败计数记录在设备的BTRFS_DEV_STAT_CSUM_ERRS统计项中。
csum tree的查询性能直接影响读写时延。btrfs在csum tree中采用批量查找优化,通过btrfs_lookup_csums_range()一次性查询一个连续逻辑地址范围内的所有校验值,减少B-tree遍历次数:
int btrfs_lookup_csums_range(struct btrfs_root *csum_root,
u64 start, u64 end,
struct list_head *list, int search_commit)
{
struct btrfs_key key;
struct btrfs_path *path;
struct btrfs_csum_item *item;
struct btrfs_ordered_sum *sums;
unsigned long offset;
int ret;
key.objectid = start;
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = 0;
path = btrfs_alloc_path();
ret = btrfs_search_slot(NULL, csum_root, &key, path, 0, 0);
while (1) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.objectid > end)
break;
offset = (unsigned long)btrfs_item_ptr(path->nodes[0],
path->slots[0],
struct btrfs_csum_item);
sums = kzalloc(sizeof(*sums) + fs_info->csum_size *
(key.offset - key.objectid) / sectorsize);
read_extent_buffer(path->nodes[0], sums->sums,
offset, item_size);
list_add_tail(&sums->list, list);
ret = btrfs_next_item(csum_root, path);
if (ret)
break;
}
btrfs_free_path(path);
return 0;
}
csum tree的更新采用copy-on-write机制,与btrfs的其他B-tree一致。每次插入新的csum条目时,csum tree的extent buffer被CoW到新位置,更新后的根节点地址在事务提交时写入superblock。
