linux文件系统写过程简析

linux写入磁盘过程经历VFS -> 页缓存（page cache） -> 具体的文件系统（ext2/3/4、XFS、ReiserFS等） -> Block IO ->设备驱动 -> SCSI指令（或者其他指令），总体来说linux文件写入磁盘过程比较复杂

1、VFS（虚拟文件系统）

Linux采用VFS屏蔽了多个文件系统之间的差别：当需要访问不同的设备或者其他文件系统时，通过挂载（mount）的方式接入（这里可以把文件系统理解为具体的设备）。正是因为使用了VFS，所有的文件系统和设备都通过统一的文件目录树视图访问，整个存储空间由一棵文件系统目录树来管理，屏蔽了底层多个文件系统之间的差别。当然，如果你需要把自己编写的文件系统集成到Linux内核并通过VFS访问，可以采用模块加载的方式，相应的文件系统模块文件需要放到系统目录/lib/modules/your-system-name/kernel/fs当中。VFS的作用远不止这些：通过VFS也可以访问设备，在Linux下所有的对象都是文件，这简化了系统的访问。

1.1 正常情况下，所有的文件操作都通过系统调用进入VFS，只有特殊的处理才会直接操作原始设备。文件系统写入的系统调用为：

　 #include <unistd.h>

　 ssize_t write(int fd, const void * buffer, size_t count);

1.2 通过系统调用进入VFS之后，接下来的处理交给VFS层。处理过程中比较重要的函数是vfs_write和generic_file_aio_write：

 /*
  * vfs_write - VFS entry point for writing to an open file.
  * @file:  open file to write to
  * @buf:   user-space buffer holding the data to write
  * @count: number of bytes requested
  * @pos:   file offset; handed to rw_verify_area() and to the write method
  *
  * Returns whatever the file's write method returns (a positive value is
  * treated as bytes written), or a negative errno:
  *   -EBADF  the file was not opened for writing,
  *   -EINVAL the file has neither a ->write nor an ->aio_write method,
  *   -EFAULT the user buffer fails access_ok(),
  *   or the negative value returned by rw_verify_area().
  */
 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 {
     ssize_t ret;

     if (!(file->f_mode & FMODE_WRITE))
         return -EBADF;
     if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
         return -EINVAL;
     /* The kernel only reads from @buf during a write, hence VERIFY_READ. */
     if (unlikely(!access_ok(VERIFY_READ, buf, count)))
         return -EFAULT;

     ret = rw_verify_area(WRITE, file, pos, count);
     if (ret >= 0) {
         /* A non-negative result is the byte count to actually write. */
         count = ret;
         if (file->f_op->write)
             ret = file->f_op->write(file, buf, count, pos);
         else
             /* No synchronous method: go through the aio_write path. */
             ret = do_sync_write(file, buf, count, pos);
         if (ret > 0) {
             /* Data was written: notify watchers and account the bytes. */
             fsnotify_modify(file->f_path.dentry);
             add_wchar(current, ret);
         }
         /* Counted once per attempted write, successful or not. */
         inc_syscw(current);
     }

     return ret;
 }

 /**
  * generic_file_aio_write - write data to a file
  * @iocb:    IO state structure
  * @iov:    vector with data to write
  * @nr_segs:    number of segments in the vector
  * @pos:    position in file where to write
  *
  * This is a wrapper around __generic_file_aio_write() to be used by most
  * filesystems. It takes care of syncing the file in case of O_SYNC file
  * and acquires i_mutex as needed.
  *
  * Returns the result of __generic_file_aio_write(), or the error from
  * generic_write_sync() when data was written but the sync failed.
  */
 ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
         unsigned long nr_segs, loff_t pos)
 {
     struct file *file = iocb->ki_filp;
     struct inode *inode = file->f_mapping->host;
     ssize_t ret;

     /* The caller must pass the same position that is stored in the iocb. */
     BUG_ON(iocb->ki_pos != pos);

     /* Only the write itself runs under i_mutex; the sync below does not. */
     mutex_lock(&inode->i_mutex);
     ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
     mutex_unlock(&inode->i_mutex);

     /* Sync when bytes were written or the request was queued. */
     if (ret > 0 || ret == -EIOCBQUEUED) {
         ssize_t err;

         err = generic_write_sync(file, pos, ret);
         /* A sync failure replaces a positive byte count, never -EIOCBQUEUED. */
         if (err < 0 && ret > 0)
             ret = err;
     }

     return ret;
 }

2、VFS层的写入分为采用page cache和不采用page cache两种方式，下面主要介绍采用page cache的处理。

在VFS中，每个打开操作的文件对应内核都有一个address_space 数据结构，该数据结构是用来表示系统中打开的文件，并且一个打开的文件只有一个address_space数据结构。

如下：

 /*
  * Page-cache mapping for one owner object (an inode or a block_device,
  * per the @host comment): the radix tree of its cached pages plus the
  * bookkeeping for memory mappings, the writeback position and the
  * table of operations used to read and write those pages.
  */
 struct address_space {

     struct inode        *host;        /* owner: inode, block_device */

     struct radix_tree_root    page_tree;    /* radix tree of all pages */

     spinlock_t        tree_lock;    /* and lock protecting it */

     unsigned int        i_mmap_writable;/* count VM_SHARED mappings */

     struct prio_tree_root    i_mmap;        /* tree of private and shared mappings */

     struct list_head    i_mmap_nonlinear;/* list of VM_NONLINEAR mappings */

     spinlock_t        i_mmap_lock;    /* protect tree, count, list */

     unsigned int        truncate_count;    /* Cover race condition with truncate */

     unsigned long        nrpages;    /* number of total pages */

     pgoff_t            writeback_index;/* writeback starts here */

     const struct address_space_operations *a_ops;    /* methods */

     unsigned long        flags;        /* error bits/gfp mask */

     struct backing_dev_info *backing_dev_info; /* device readahead, etc */

     spinlock_t        private_lock;    /* for use by the address_space */

     struct list_head    private_list;    /* also for use by the address_space */

     struct address_space    *assoc_mapping;    /* also for use by the address_space */

     struct mutex        unmap_mutex;    /* to protect unmapping */

 } __attribute__((aligned(sizeof(long))));

对于文件中的文件内容缓存采用的是基数树的方式来保存的，在成员变量page_tree中，关于基数树的介绍参考[1]和[2]。下面是关于page cache写处理的几个重要的函数

 /*
  * generic_file_buffered_write - write an iovec through the page cache.
  * @iocb:    IO state structure; supplies the target file
  * @iov:     vector with data to write
  * @nr_segs: number of segments in @iov
  * @pos:     file position at which this buffered write starts
  * @ppos:    updated to @pos plus the bytes written by this call
  * @count:   total number of bytes described by @iov
  * @written: bytes already written before this call; also the offset at
  *           which the iov iterator starts
  *
  * Returns the cumulative number of bytes written when that is non-zero,
  * otherwise the status of generic_perform_write() (or of the O_DIRECT
  * flush below).
  */
 ssize_t
 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
         unsigned long nr_segs, loff_t pos, loff_t *ppos,
         size_t count, ssize_t written)
 {
     struct file *file = iocb->ki_filp;
     struct address_space *mapping = file->f_mapping;
     ssize_t status;
     struct iov_iter i;

     iov_iter_init(&i, iov, nr_segs, count, written);
     status = generic_perform_write(file, &i, pos);

     if (likely(status >= 0)) {
         written += status;
         *ppos = pos + status;
     }

     /*
      * If we get here for O_DIRECT writes then we must have fallen through
      * to buffered writes (block instantiation inside i_size).  So we sync
      * the file data here, to try to honour O_DIRECT expectations.
      */
     if (unlikely(file->f_flags & O_DIRECT) && written)
         /* The range end is inclusive, hence the "- 1". */
         status = filemap_write_and_wait_range(mapping,
                     pos, pos + written - 1);

     return written ? written : status;
 }

generic_perform_write会依次调用page cache（address_space的a_ops）中的write_begin和write_end。

Note：在进行VFS系统调用写入文件过程中，可以允许在文件中的任何位置写入，这其中就包括当写入的过程中写入的起始位置不是一个block的开始位置，这时需要特殊的处理，上述的过程都在write_begin这个函数调用过程中处理完毕。

3、ext2/3/4中文件的处理。

当在page cache中进行到write_begin时，需要ext4中的ext4_write_begin处理，如下：

 /*
  * ext4_write_begin - ext4's write_begin step for a buffered write.
  * @file:    file being written
  * @mapping: page-cache mapping of that file
  * @pos:     byte offset in the file where the write starts
  * @len:     number of bytes to be written
  * @flags:   write_begin flags; AOP_FLAG_NOFS is OR-ed in below
  * @pagep:   receives the page-cache page covering @pos
  * @fsdata:  passed through to block_write_begin()
  *
  * Starts a journal handle, grabs the page and lets block_write_begin()
  * map the blocks through ext4_get_block().  Returns 0 on success or a
  * negative errno.  On failure the page is released, the handle is
  * stopped and anything instantiated beyond i_size is trimmed; -ENOSPC
  * is retried for as long as ext4_should_retry_alloc() allows.
  */
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
                 loff_t pos, unsigned len, unsigned flags,
                 struct page **pagep, void **fsdata)
 {
     struct inode *inode = mapping->host;
     int ret, needed_blocks;
     handle_t *handle;
     int retries = 0;
     struct page *page;
     pgoff_t index;
     unsigned from, to;

     /*
      * ......... (code elided in this excerpt; needed_blocks, which is
      * used by ext4_journal_start() below, must be assigned here)
      */

     /* Page index, and the byte range [from, to) inside that page. */
     index = pos >> PAGE_CACHE_SHIFT;
     from = pos & (PAGE_CACHE_SIZE - 1);
     to = from + len;

 retry:
     handle = ext4_journal_start(inode, needed_blocks);
     if (IS_ERR(handle)) {
         ret = PTR_ERR(handle);
         goto out;
     }

     /* We cannot recurse into the filesystem as the transaction is already
      * started */
     flags |= AOP_FLAG_NOFS;

     page = grab_cache_page_write_begin(mapping, index, flags);
     if (!page) {
         ext4_journal_stop(handle);
         ret = -ENOMEM;
         goto out;
     }
     /* block_write_begin() reuses the page already stored in *pagep. */
     *pagep = page;

     ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                 ext4_get_block);

     if (!ret && ext4_should_journal_data(inode)) {
         ret = walk_page_buffers(handle, page_buffers(page),
                 from, to, NULL, do_journal_get_write_access);
     }

     if (ret) {
         unlock_page(page);
         page_cache_release(page);
         /*
          * block_write_begin may have instantiated a few blocks
          * outside i_size.  Trim these off again. Don't need
          * i_size_read because we hold i_mutex.
          *
          * Add inode to orphan list in case we crash before
          * truncate finishes
          */
         if (pos + len > inode->i_size && ext4_can_truncate(inode))
             ext4_orphan_add(handle, inode);

         ext4_journal_stop(handle);
         if (pos + len > inode->i_size) {
             ext4_truncate_failed_write(inode);
             /*
              * If truncate failed early the inode might
              * still be on the orphan list; we need to
              * make sure the inode is removed from the
              * orphan list in that case.
              */
             if (inode->i_nlink)
                 ext4_orphan_del(NULL, inode);
         }
     }

     if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
         goto retry;
 out:
     return ret;
 }

ext4_write_begin中包含了很多处理功能，包括文件物理块的分配（假设ext4中的延迟分配（delayed allocation）特性没有开启）、文件块的部分写过程的处理等。下面是在ext4_write_begin函数调用过程中比较重要的几个函数。

 /*
  * block_write_begin takes care of the basic task of block allocation and
  * bringing partial write blocks uptodate first.
  *
  * If *pagep is not NULL, then block_write_begin uses the locked page
  * at *pagep rather than allocating its own. In this case, the page will
  * not be unlocked or deallocated on failure.
  *
  * @file:      file being written (not used in this function body)
  * @mapping:   page-cache mapping; its host inode is the one prepared
  * @pos, @len: byte offset in the file and length of the write
  * @flags:     passed to grab_cache_page_write_begin()
  * @pagep:     in: optional locked page; out: the page that was prepared
  * @fsdata:    not used in this function body
  * @get_block: filesystem callback that maps file blocks to disk blocks
  *
  * Returns 0 on success, -ENOMEM when no page could be grabbed, or the
  * error from __block_prepare_write().
  */
 int block_write_begin(struct file *file, struct address_space *mapping,
             loff_t pos, unsigned len, unsigned flags,
             struct page **pagep, void **fsdata,
             get_block_t *get_block)
 {
     struct inode *inode = mapping->host;
     int status = 0;
     struct page *page;
     pgoff_t index;
     unsigned start, end;
     int ownpage = 0;    /* set when the page was grabbed here */

     /* Page index, and the byte range [start, end) inside that page. */
     index = pos >> PAGE_CACHE_SHIFT;
     start = pos & (PAGE_CACHE_SIZE - 1);
     end = start + len;

     page = *pagep;
     if (page == NULL) {
         ownpage = 1;
         page = grab_cache_page_write_begin(mapping, index, flags);
         if (!page) {
             status = -ENOMEM;
             goto out;
         }
         *pagep = page;
     } else
         BUG_ON(!PageLocked(page));

     status = __block_prepare_write(inode, page, start, end, get_block);
     if (unlikely(status)) {
         ClearPageUptodate(page);

         /* Only undo what this function set up itself. */
         if (ownpage) {
             unlock_page(page);
             page_cache_release(page);
             *pagep = NULL;

             /*
              * prepare_write() may have instantiated a few blocks
              * outside i_size.  Trim these off again. Don't need
              * i_size_read because we hold i_mutex.
              */
             if (pos + len > inode->i_size)
                 vmtruncate(inode, inode->i_size);
         }
     }

 out:
     return status;
 }

 /*
  * __block_prepare_write - get a locked page's buffers ready for a write
  * to the byte range [@from, @to) of that page.
  * @inode:     inode owning the page; supplies the block size
  * @page:      locked page-cache page
  * @from, @to: byte range within the page, 0 <= from <= to <= PAGE_CACHE_SIZE
  * @get_block: filesystem callback used to map buffers that are unmapped
  *
  * Buffers outside the range are only marked uptodate when the page
  * already is.  Buffers inside the range are mapped if necessary; newly
  * allocated ones get their unwritten parts zeroed, and partially
  * overwritten existing ones are read from disk first.
  *
  * Returns 0 on success, the error from @get_block, or -EIO when one of
  * the reads did not bring its buffer uptodate.
  */
 static int __block_prepare_write(struct inode *inode, struct page *page,
         unsigned from, unsigned to, get_block_t *get_block)
 {
     unsigned block_start, block_end;
     sector_t block;
     int err = 0;
     unsigned blocksize, bbits;
     /*
      * Only the first and the last block of the range can be partially
      * covered, so at most two reads are ever queued in wait[].
      */
     struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

     BUG_ON(!PageLocked(page));
     BUG_ON(from > PAGE_CACHE_SIZE);
     BUG_ON(to > PAGE_CACHE_SIZE);
     BUG_ON(from > to);

     blocksize = 1 << inode->i_blkbits;
     if (!page_has_buffers(page))
         create_empty_buffers(page, blocksize, 0);
     head = page_buffers(page);

     bbits = inode->i_blkbits;
     /* File block number of the first buffer in this page. */
     block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);

     /*
      * The buffers form a circular list; "!block_start" lets the loop
      * enter on the first pass, when bh == head.
      */
     for(bh = head, block_start = 0; bh != head || !block_start;
         block++, block_start=block_end, bh = bh->b_this_page) {
         block_end = block_start + blocksize;
         /* Buffer lies entirely outside the range being written. */
         if (block_end <= from || block_start >= to) {
             if (PageUptodate(page)) {
                 if (!buffer_uptodate(bh))
                     set_buffer_uptodate(bh);
             }
             continue;
         }
         if (buffer_new(bh))
             clear_buffer_new(bh);
         if (!buffer_mapped(bh)) {
             WARN_ON(bh->b_size != blocksize);
             /* Last argument 1: ask the filesystem to create the block. */
             err = get_block(inode, block, bh, 1);
             if (err)
                 break;
             if (buffer_new(bh)) {
                 unmap_underlying_metadata(bh->b_bdev,
                             bh->b_blocknr);
                 if (PageUptodate(page)) {
                     clear_buffer_new(bh);
                     set_buffer_uptodate(bh);
                     mark_buffer_dirty(bh);
                     continue;
                 }
                 /* Zero the parts of a new block the write won't cover. */
                 if (block_end > to || block_start < from)
                     zero_user_segments(page,
                         to, block_end,
                         block_start, from);
                 continue;
             }
         }
         if (PageUptodate(page)) {
             if (!buffer_uptodate(bh))
                 set_buffer_uptodate(bh);
             continue;
         }
         /* Partially overwritten block with stale contents: read it in. */
         if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
             !buffer_unwritten(bh) &&
              (block_start < from || block_end > to)) {
             ll_rw_block(READ, 1, &bh);
             *wait_bh++=bh;
         }
     }

     /*
      * If we issued read requests - let them complete.
      */
     while(wait_bh > wait) {
         wait_on_buffer(*--wait_bh);
         if (!buffer_uptodate(*wait_bh))
             err = -EIO;
     }

     if (unlikely(err))
         page_zero_new_buffers(page, from, to);
     return err;
 }

 /**
  * ll_rw_block: low-level access to block devices (DEPRECATED)
  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
  * @nr: number of &struct buffer_heads in the array
  * @bhs: array of pointers to &struct buffer_head
  *
  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
  * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
  * are sent to disk. The fourth %READA option is described in the documentation
  * for generic_make_request() which ll_rw_block() calls.
  *
  * This function drops any buffer that it cannot get a lock on (with the
  * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
  * clean when doing a write request, and any buffer that appears to be
  * up-to-date when doing read request.  Further it marks as clean buffers that
  * are processed for writing (the buffer cache won't assume that they are
  * actually clean until the buffer gets unlocked).
  *
  * ll_rw_block sets b_end_io to simple completion handler that marks
  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
  * any waiters.
  *
  * All of the buffers must be for the same device, and must also be a
  * multiple of the current approved size for the device.
  */
 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 {
     int i;

     for (i = 0; i < nr; i++) {
         struct buffer_head *bh = bhs[i];

         /* SWRITE variants wait for the lock; everything else skips busy buffers. */
         if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
             lock_buffer(bh);
         else if (!trylock_buffer(bh))
             continue;

         if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
             rw == SWRITE_SYNC_PLUG) {
             /* Only buffers that were dirty are submitted for writing. */
             if (test_clear_buffer_dirty(bh)) {
                 bh->b_end_io = end_buffer_write_sync;
                 get_bh(bh);
                 if (rw == SWRITE_SYNC)
                     submit_bh(WRITE_SYNC, bh);
                 else
                     submit_bh(WRITE, bh);
                 continue;
             }
         } else {
             /* Only buffers that are not yet uptodate are read. */
             if (!buffer_uptodate(bh)) {
                 bh->b_end_io = end_buffer_read_sync;
                 get_bh(bh);
                 submit_bh(rw, bh);
                 continue;
             }
         }
         /* Nothing was submitted for this buffer: release the lock taken above. */
         unlock_buffer(bh);
     }
 }

其中在ext4中块的分配过程中，管理块分配处理的函数实现在fs/ext4/balloc.c fs/ext4/mballoc.c

4、当page cache中的数据需要刷新到disk上的时候，这时处理的过程由Block IO接管。

在将文件的page cache刷新到disk的过程中，比较重要的数据结构有如下两个：buffer_head 和 bio。

 /*
  * Describes one buffer inside a page: where its data lives in the page
  * (b_page, b_data, b_size), which block of which device it corresponds
  * to (b_bdev, b_blocknr), and how its I/O completion is reported
  * (b_end_io).  All buffers of a page are chained through b_this_page.
  */
 struct buffer_head {

     unsigned long b_state;        /* buffer state bitmap (BH_* flag bits) */

     struct buffer_head *b_this_page;/* circular list of page's buffers */

     struct page *b_page;        /* the page this bh is mapped to */

     sector_t b_blocknr;        /* start block number */

     size_t b_size;            /* size of mapping */

     char *b_data;            /* pointer to data within the page */

     struct block_device *b_bdev;    /* device this buffer belongs to */

     bh_end_io_t *b_end_io;        /* I/O completion */

      void *b_private;        /* reserved for b_end_io */

     struct list_head b_assoc_buffers; /* associated with another mapping */

     struct address_space *b_assoc_map;    /* mapping this buffer is
                            associated with */

     atomic_t b_count;        /* users using this buffer_head */

 };

 /*
  * main unit of I/O for the block layer and lower layers (ie drivers and
  * stacking drivers)
  *
  * Only the leading fields and the trailing inline vector are shown in
  * this excerpt; the dotted line below stands for fields left out.
  */
 struct bio {

     sector_t        bi_sector;    /* device address in 512 byte
                            sectors */

     struct bio        *bi_next;    /* request queue link */

     struct block_device    *bi_bdev;    /* target block device */

     unsigned long        bi_flags;    /* status, command, etc */

     unsigned long        bi_rw;        /* bottom bits READ/WRITE,
                          * top bits priority
                          */

     unsigned short        bi_vcnt;    /* how many bio_vec's */

     unsigned short        bi_idx;        /* current index into bvl_vec */

     ...............

     /*
      * We can inline a number of vecs at the end of the bio, to avoid
      * double allocations for a small number of bio_vecs. This member
      * MUST obviously be kept at the very end of the bio.
      */

     struct bio_vec        bi_inline_vecs[];    /* flexible array member */

 };

Block IO层负责基本的IO request的合并和调度处理，这一层由elevator管理，具体的调度算法有noop、deadline和anticipatory等多种，现在默认的调度算法是deadline。调度算法是可调的，可以根据系统的实际情况调整为最优的方式。

[1] 基数树(radix tree). http://blog.****.net/joker0910/article/details/8250085

[2] Radix Tree. http://en.wikipedia.org/wiki/Radix_tree

秒客网

linux文件系统写过程简析

相关文章