linux文件系统写过程简析

时间:2023-03-10 03:23:45
linux文件系统写过程简析

linux写入磁盘过程经历VFS ->  页缓存(page cache) -> 具体的文件系统(ext2/3/4、XFS、ReiserFS等) -> Block IO ->设备驱动 -> SCSI指令(或者其他指令),总体来说linux文件写入磁盘过程比较复杂

1、VFS(虚拟文件系统)

Linux中采用了VFS的方式屏蔽了多个文件系统的差别, 当需要不同的设备或者其他文件系统时,采用挂载mount的方式访问其他设备或者其他文件系统(这里可以把文件系统理解为具体的设备)。正是因为使用了VFS,所以所有的文件系统设备使用统一的文件目录树视图访问,整个存储空间采用一个文件系统目录树来管理,屏蔽了底层多个文件系统之间的差别。当然,如果你需要把你自己编写的文件系统集成到Linux内核,采用VFS的方式进行访问,你需要采用模块加载的方式进行处理,相应的文件系统模块文件需要编入到系统目录/lib/modules/your-system-name/kernel/fs当中。当然VFS的作用远不止这些,通过VFS也进行访问设备,在Linux下所有的对象都是文件,简化了系统的访问。

1.1 正常情况下,所有的文件操作通过系统调用进入到VFS中,特殊的处理,直接操作原始设备。文件系统写入的系统调用为:

  #include <unistd.h>

  ssize_t  write(int fd,  const void * buffer, size_t  count);

1.2 当采用系统调用进入VFS时,接下来的处理交给VFS层。处理过程比较中要的是vfs_write、generic_file_aio_write

 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret; if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
return -EINVAL;
if (unlikely(!access_ok(VERIFY_READ, buf, count)))
return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count);
if (ret >= ) {
count = ret;
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos);
else
ret = do_sync_write(file, buf, count, pos);
if (ret > ) {
fsnotify_modify(file->f_path.dentry);
add_wchar(current, ret);
}
inc_syscw(current);
} return ret;
}
 /**
* generic_file_aio_write - write data to a file
* @iocb: IO state structure
* @iov: vector with data to write
* @nr_segs: number of segments in the vector
* @pos: position in file where to write
*
* This is a wrapper around __generic_file_aio_write() to be used by most
* filesystems. It takes care of syncing the file in case of O_SYNC file
* and acquires i_mutex as needed.
*/
ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
ssize_t ret; BUG_ON(iocb->ki_pos != pos); mutex_lock(&inode->i_mutex);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex); if (ret > || ret == -EIOCBQUEUED) {
ssize_t err; err = generic_write_sync(file, pos, ret);
if (err < && ret > )
ret = err;
}
return ret;
}

2、 对于VFS层也有采用page cache和非page cache两种,下面重要介绍采用page cache的处理。

在VFS中, 每个打开操作的文件对应内核都有一个address_space 数据结构, 该数据结构是用来表示系统中打开的文件,并且一个打开的文件只有一个address_space数据结构。

如下:

 struct address_space {
struct inode *host; /* owner: inode, block_device */
struct radix_tree_root page_tree; /* radix tree of all pages */
spinlock_t tree_lock; /* and lock protecting it */
unsigned int i_mmap_writable;/* count VM_SHARED mappings */
struct prio_tree_root i_mmap; /* tree of private and shared mappings */
struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
spinlock_t i_mmap_lock; /* protect tree, count, list */
unsigned int truncate_count; /* Cover race condition with truncate */
unsigned long nrpages; /* number of total pages */
pgoff_t writeback_index;/* writeback starts here */
const struct address_space_operations *a_ops; /* methods */
unsigned long flags; /* error bits/gfp mask */
struct backing_dev_info *backing_dev_info; /* device readahead, etc */
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
struct address_space *assoc_mapping; /* ditto */
struct mutex unmap_mutex; /* to protect unmapping */
} __attribute__((aligned(sizeof(long))));

对于文件中的文件内容缓存采用的是基数树的方式来保存的,在成员变量page_tree中,关于基数树的介绍参考[1]和[2]。 下面是关于page cache写处理的几个重要的函数

 ssize_t
generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos, loff_t *ppos,
size_t count, ssize_t written)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
ssize_t status;
struct iov_iter i; iov_iter_init(&i, iov, nr_segs, count, written);
status = generic_perform_write(file, &i, pos); if (likely(status >= )) {
written += status;
*ppos = pos + status;
} /*
* If we get here for O_DIRECT writes then we must have fallen through
* to buffered writes (block instantiation inside i_size). So we sync
* the file data here, to try to honour O_DIRECT expectations.
*/
if (unlikely(file->f_flags & O_DIRECT) && written)
status = filemap_write_and_wait_range(mapping,
pos, pos + written - ); return written ? written : status;
}

调用page cache中的write_begin 和write_end

Note: 在进行VFS系统调用写入文件过程中,可以允许在文件中的任何位置写入,这其中就包括当写入的过程中写入的起始位置不是一个block的开始位置,这时需要特殊的处理,上述的过程都在write_begin这个函数调用过程中处理完毕。

3、ext2/3/4中文件的处理。

当在page cache中进行到write_begin时,需要ext4中的ext4_write_begin处理, 如下:

 static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
int ret, needed_blocks;
handle_t *handle;
int retries = ;
struct page *page;
pgoff_t index;
unsigned from, to;
......... index = pos >> PAGE_CACHE_SHIFT;
from = pos & (PAGE_CACHE_SIZE - );
to = from + len; retry:
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
} /* We cannot recurse into the filesystem as the transaction is already
* started */
flags |= AOP_FLAG_NOFS; page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
ext4_journal_stop(handle);
ret = -ENOMEM;
goto out;
}
*pagep = page; ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
ext4_get_block); if (!ret && ext4_should_journal_data(inode)) {
ret = walk_page_buffers(handle, page_buffers(page),
from, to, NULL, do_journal_get_write_access);
} if (ret) {
unlock_page(page);
page_cache_release(page);
/*
* block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
* i_size_read because we hold i_mutex.
*
* Add inode to orphan list in case we crash before
* truncate finishes
*/
if (pos + len > inode->i_size && ext4_can_truncate(inode))
ext4_orphan_add(handle, inode); ext4_journal_stop(handle);
if (pos + len > inode->i_size) {
ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might
* still be on the orphan list; we need to
* make sure the inode is removed from the
* orphan list in that case.
*/
if (inode->i_nlink)
ext4_orphan_del(NULL, inode);
}
} if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
out:
return ret;
}

其中在ext4_write_begin中包含了很多的处理功能,包括文件物理块的分配(假设ext4中的delay allocation特性没有开启)、文件块的部分写过程的处理等。下面是在ext_write_begin函数调用过程中比较重要的几个函数。

 /*
* block_write_begin takes care of the basic task of block allocation and
* bringing partial write blocks uptodate first.
*
* If *pagep is not NULL, then block_write_begin uses the locked page
* at *pagep rather than allocating its own. In this case, the page will
* not be unlocked or deallocated on failure.
*/
int block_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
get_block_t *get_block)
{
struct inode *inode = mapping->host;
int status = ;
struct page *page;
pgoff_t index;
unsigned start, end;
int ownpage = ; index = pos >> PAGE_CACHE_SHIFT;
start = pos & (PAGE_CACHE_SIZE - );
end = start + len; page = *pagep;
if (page == NULL) {
ownpage = ;
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
status = -ENOMEM;
goto out;
}
*pagep = page;
} else
BUG_ON(!PageLocked(page)); status = __block_prepare_write(inode, page, start, end, get_block);
if (unlikely(status)) {
ClearPageUptodate(page); if (ownpage) {
unlock_page(page);
page_cache_release(page);
*pagep = NULL; /*
* prepare_write() may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
* i_size_read because we hold i_mutex.
*/
if (pos + len > inode->i_size)
vmtruncate(inode, inode->i_size);
}
} out:
return status;
}
 static int __block_prepare_write(struct inode *inode, struct page *page,
unsigned from, unsigned to, get_block_t *get_block)
{
unsigned block_start, block_end;
sector_t block;
int err = ;
unsigned blocksize, bbits;
struct buffer_head *bh, *head, *wait[], **wait_bh=wait; BUG_ON(!PageLocked(page));
BUG_ON(from > PAGE_CACHE_SIZE);
BUG_ON(to > PAGE_CACHE_SIZE);
BUG_ON(from > to); blocksize = << inode->i_blkbits;
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, );
head = page_buffers(page); bbits = inode->i_blkbits;
block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); for(bh = head, block_start = ; bh != head || !block_start;
block++, block_start=block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
if (PageUptodate(page)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
}
continue;
}
if (buffer_new(bh))
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, block, bh, );
if (err)
break;
if (buffer_new(bh)) {
unmap_underlying_metadata(bh->b_bdev,
bh->b_blocknr);
if (PageUptodate(page)) {
clear_buffer_new(bh);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
continue;
}
if (block_end > to || block_start < from)
zero_user_segments(page,
to, block_end,
block_start, from);
continue;
}
}
if (PageUptodate(page)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
continue;
}
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_unwritten(bh) &&
(block_start < from || block_end > to)) {
ll_rw_block(READ, , &bh);
*wait_bh++=bh;
}
}
/*
* If we issued read requests - let them complete.
*/
while(wait_bh > wait) {
wait_on_buffer(*--wait_bh);
if (!buffer_uptodate(*wait_bh))
err = -EIO;
}
if (unlikely(err))
page_zero_new_buffers(page, from, to);
return err;
}
 /**
* ll_rw_block: low-level access to block devices (DEPRECATED)
* @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
* @nr: number of &struct buffer_heads in the array
* @bhs: array of pointers to &struct buffer_head
*
* ll_rw_block() takes an array of pointers to &struct buffer_heads, and
* requests an I/O operation on them, either a %READ or a %WRITE. The third
* %SWRITE is like %WRITE only we make sure that the *current* data in buffers
* are sent to disk. The fourth %READA option is described in the documentation
* for generic_make_request() which ll_rw_block() calls.
*
* This function drops any buffer that it cannot get a lock on (with the
* BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
* clean when doing a write request, and any buffer that appears to be
* up-to-date when doing read request. Further it marks as clean buffers that
* are processed for writing (the buffer cache won't assume that they are
* actually clean until the buffer gets unlocked).
*
* ll_rw_block sets b_end_io to simple completion handler that marks
* the buffer up-to-date (if approriate), unlocks the buffer and wakes
* any waiters.
*
* All of the buffers must be for the same device, and must also be a
* multiple of the current approved size for the device.
*/
void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
{
int i; for (i = ; i < nr; i++) {
struct buffer_head *bh = bhs[i]; if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
lock_buffer(bh);
else if (!trylock_buffer(bh))
continue; if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
rw == SWRITE_SYNC_PLUG) {
if (test_clear_buffer_dirty(bh)) {
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
if (rw == SWRITE_SYNC)
submit_bh(WRITE_SYNC, bh);
else
submit_bh(WRITE, bh);
continue;
}
} else {
if (!buffer_uptodate(bh)) {
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
submit_bh(rw, bh);
continue;
}
}
unlock_buffer(bh);
}
}

其中在ext4中块的分配过程中,管理块分配处理的函数实现在fs/ext4/balloc.c  fs/ext4/mballoc.c

4、当page cache中的数据需要刷新到disk上的时候,这时处理的过程由Block IO接管。

在进行文件page cache刷新到disk上的过程中比较重要的数据结构有如下两个buffer_head 和 bio     

 struct buffer_head {
unsigned long b_state; /* buffer state bitmap (see above) */
struct buffer_head *b_this_page;/* circular list of page's buffers */
struct page *b_page; /* the page this bh is mapped to */ sector_t b_blocknr; /* start block number */
size_t b_size; /* size of mapping */
char *b_data; /* pointer to data within the page */ struct block_device *b_bdev;
bh_end_io_t *b_end_io; /* I/O completion */
void *b_private; /* reserved for b_end_io */
struct list_head b_assoc_buffers; /* associated with another mapping */
struct address_space *b_assoc_map; /* mapping this buffer is
associated with */
atomic_t b_count; /* users using this buffer_head */
};
 /*
* main unit of I/O for the block layer and lower layers (ie drivers and
* stacking drivers)
*/
struct bio {
sector_t bi_sector; /* device address in 512 byte
sectors */
struct bio *bi_next; /* request queue link */
struct block_device *bi_bdev;
unsigned long bi_flags; /* status, command, etc */
unsigned long bi_rw; /* bottom bits READ/WRITE,
* top bits priority
*/ unsigned short bi_vcnt; /* how many bio_vec's */
unsigned short bi_idx; /* current index into bvl_vec */
............... /*
* We can inline a number of vecs at the end of the bio, to avoid
* double allocations for a small number of bio_vecs. This member
* MUST obviously be kept at the very end of the bio.
*/
struct bio_vec bi_inline_vecs[];
};

在Block IO层进行基本的IO request的合并和处理调度, 基本的层由elevator管理, 具体的调度算法有noop、deadline和anticipate等多种调度算法,现在默认的调度算法是deadline,当然调度算法可调,根据系统可以调成系统最有的处理。

[1] 基数树(radix tree). http://blog.****.net/joker0910/article/details/8250085

[2] Radix Tree. http://en.wikipedia.org/wiki/Radix_tree