
Linux filemap_write_and_wait_range是内核中最核心的页面写回同步接口，它将指定文件区间内的脏页提交给块设备层，并在WB_SYNC_ALL模式下阻塞等待I/O完成。该函数位于mm/filemap.c，是VFS层与页缓存之间的关键同步屏障。

// mm/filemap.c
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0, write_err;

	if (mapping_needs_writeback(mapping)) {
		write_err = __filemap_fdatawrite_range(mapping, lstart, lend,
						       WB_SYNC_ALL);
		err = filemap_check_errors(mapping);
		if (!err)
			err = write_err;
	} else {
		err = filemap_check_errors(mapping);
	}
	return err;
}

函数入口先通过mapping_needs_writeback检查mapping的nrpages是否非零，且tag-0脏标记是否存在。如果没有任何脏页，直接检查mapping的错误标志并返回，避免了不必要的写回开销。

static inline bool mapping_needs_writeback(struct address_space *mapping)
{
	return mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY);
}

当判断存在需要写回的页面时，__filemap_fdatawrite_range以WB_SYNC_ALL模式调用核心写回引擎。WB_SYNC_ALL的关键语义是：同步地启动并等待所有提交到块层的I/O请求完成。

// mm/page-writeback.c
int __filemap_fdatawrite_range(struct address_space *mapping,
			       loff_t start, loff_t end, int sync_mode)
{
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.range_start = start,
		.range_end = end,
		.nr_to_write = LONG_MAX,
	};

	if (!mapping_can_writeback(mapping) ||
	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	return do_writepages(mapping, &wbc);
}

注意sync_mode的传递路径：filemap_write_and_wait_range固定传入WB_SYNC_ALL，区别于WB_SYNC_NONE的“尽力而为”模式。两种模式在do_writepages以下的行为有本质差别。

// fs/fs-writeback.c
void writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
			 struct wb_writeback_work *work)
{
	// ...
	while (!list_empty(&wb->b_io)) {
		struct inode *inode = wb_inode(wb->b_io.prev);
		struct address_space *mapping = inode->i_mapping;

		if (work->sync_mode == WB_SYNC_ALL)
			inode->i_state |= I_SYNC;
		pages_skipped = wbc->pages_skipped;
		write_cache_pages(mapping, wbc, __writepage, mapping);
		// ...
		if (work->sync_mode == WB_SYNC_ALL) {
			struct writeback_control wbc_sync = { .sync_mode = WB_SYNC_ALL };

			filemap_fdatawait(mapping);
			inode->i_state &= ~I_SYNC;
			wake_up_bit(&inode->i_state,
__I_SYNC);
		}
	}
}

WB_SYNC_ALL模式下，每个inode在写回开始前设置I_SYNC标志，阻止并发写回；页面写出后立即调用filemap_fdatawait，遍历mapping的所有页面，对每个页面调用wait_on_page_writeback，阻塞当前进程直到该页面PG_writeback标志清除。这个等待过程是filemap_write_and_wait_range名字中“wait”的语义来源。

// mm/filemap.c
int filemap_fdatawait(struct address_space *mapping)
{
	loff_t i_size = i_size_read(mapping->host);

	if (i_size == 0)
		return 0;

	return filemap_fdatawait_range(mapping, 0, i_size - 1);
}

int filemap_fdatawait_range(struct address_space *mapping,
			    loff_t start_byte, loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_SHIFT;
	pgoff_t end = end_byte >> PAGE_SHIFT;
	struct pagevec pvec;
	int nr_pages, ret = 0;

	pagevec_init(&pvec);
	while (index <= end) {
		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
						    end, PAGECACHE_TAG_WRITEBACK);
		if (!nr_pages)
			break;
		for (int i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			wait_on_page_writeback(page);
			if (PageError(page))
				ret = -EIO;
		}
		pagevec_release(&pvec);
	}
	return ret;
}

filemap_fdatawait_range通过PAGECACHE_TAG_WRITEBACK标记在radix tree（或xarray）中查找仍在写回中的页面，逐个调用wait_on_page_writeback。wait_on_page_writeback本质上是将当前进程置于该page的等待队列上，直到I/O完成时end_page_writeback唤醒它。

// include/linux/pagemap.h
static inline void wait_on_page_writeback(struct page *page)
{
	if (PageWriteback(page))
		wait_on_page_bit(page, PG_writeback);
}

filemap_write_and_wait_range的返回值设计：如果写回阶段出错（write_err非零），或者mapping上累积了之前I/O的错误（filemap_check_errors返回-EIO或-ENOSPC），都会向上传播。典型的调用模式如下：

// fs/fs-writeback.c 中 sync_inode_one 示例
int sync_inode_one(struct inode *inode, struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	int ret;

	ret = filemap_write_and_wait_range(mapping, 0, LLONG_MAX);
	if (ret < 0)
		return ret;
	if (mapping_needs_writeback(mapping))
		ret = __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, WB_SYNC_ALL);
	return ret;
}

这种重试写回的代码模式在某些文件系统（如XFS）的data integrity路径中常见，原因是第一次写回后可能有新脏页产生（例如元数据日志写入途中又修改了页面），需要再次调用写回。

总结filemap_write_and_wait_range的调用链：调用者 -> filemap_write_and_wait_range -> __filemap_fdatawrite_range(WB_SYNC_ALL) -> do_writepages -> write_cache_pages -> a_ops->writepage -> 块层提交 -> filemap_fdatawait ->
wait_on_page_writeback。整个路径是同步阻塞的，保证函数返回时指定区间的数据已经在磁盘上，或在块设备的I/O完成队列中。