接上一篇:https://my.oschina.net/fileoptions/blog/3061997 ,linux的io路径比较复杂,通常我们在阅读内核源码的时候,有时候也很难完整的跟踪整个路径。其实,我们可以使用工具跟踪代码的io路径,看一下一次完整的open、write、read都调用了哪些内核函数,这里我使用ftrace工具。假设要跟踪的代码编译成iotrace ,则可以使用trace-cmd 对iotrace的io路径进行跟踪(命令如下)。
sudo trace-cmd record -p function -F ./iotrace
trace-cmd 会启动iotrace程序,然后开始跟踪它的内核函数调用(并不是所有内核函数都会被跟踪到,只有ftrace支持的才可以), 由于代码很少,因此这里trace会很快结束,然后使用trace-cmd report命令将产生的trace文件格式化为人类可读的文本,就可以清晰的看到io的调用路径和层次关系。
由于我不想在虚拟机Linux上做这些测试(总感觉虚拟机会带来很多没必要的虚拟机相关的代码调用,增加复杂性),而我又没有运行Linux的物理机,怎么办呢?幸好在我玩硬件的时候入手了一块树莓派3B+,上面运行着完整的linux系统,而且我还把内核升级到5.x版本,更重要的是,在树莓派上可以尽情的编写、加载内核模块和修改内核,而不用担心把内核搞core dump,大不了快速重启或者重新烧写内核。
iotrace的源码如下,测试了7种io场景,主要为了测试普通模式、带O_DIRECT、O_SYNC标志、mmap模式、fsync和msync的调用路径等。
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#define BUF_SIZE 1024
/* Report a failed system call through perror() and terminate with status 1. */
static void die(const char *what)
{
    perror(what);
    exit(1);
}

/*
 * Allocate a zero-filled, 512-byte-aligned buffer of BUF_SIZE bytes for use
 * with O_DIRECT transfers.
 *
 * posix_memalign() returns the error number instead of setting errno, so the
 * failure is reported with strerror() rather than perror(). The buffer is
 * zeroed so that a full BUF_SIZE write never pushes uninitialized heap bytes
 * into the file.
 * NOTE(review): 512 assumes the device's logical block size is <= 512 bytes;
 * confirm for the target storage.
 */
static char *alloc_direct_buf(void)
{
    void *buf = NULL;
    int err = posix_memalign(&buf, 512, BUF_SIZE);

    if (err != 0) {
        fprintf(stderr, "posix_memalign: %s\n", strerror(err));
        exit(1);
    }
    memset(buf, 0, BUF_SIZE);
    return buf;
}

/*
 * Exit with status 1 when the first n bytes of got and want differ.
 * strncmp() does not set errno, so a plain message is printed instead of
 * using perror().
 */
static void expect_same(const char *got, const char *want, size_t n)
{
    if (strncmp(got, want, n) != 0) {
        fprintf(stderr, "strncmp: data read back differs from data written\n");
        exit(1);
    }
}

/* Create/truncate "hello.c" read-write with the given extra open(2) flags. */
static int open_test_file(int extra_flags)
{
    int fd = open("hello.c", O_CREAT | O_TRUNC | O_RDWR | extra_flags, 0666);

    if (fd < 0) {
        die("open:");
    }
    return fd;
}

/* Map the first BUF_SIZE bytes of fd as a shared read/write mapping. */
static char *map_test_file(int fd)
{
    void *addr = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

    if (addr == MAP_FAILED) {
        die("mmap:");
    }
    return addr;
}

/*
 * Exercise seven I/O scenarios against "hello.c" so their kernel call paths
 * can be traced: buffered I/O, O_DIRECT, O_SYNC, O_DIRECT|O_SYNC, mmap+msync,
 * mmap+fsync and mmap on an O_DIRECT descriptor.
 *
 * Each scenario starts with a getpid() call that serves as a "mark start"
 * marker. Any failing call prints a diagnostic and exits with status 1;
 * returns 0 when every scenario completes.
 */
int main(void)
{
    int fd;
    size_t len;
    char *file_map;

    /* 1. Plain buffered I/O */
    const char *normal_write_buf = "normal io";
    char normal_read_buf[100] = {0};

    len = strlen(normal_write_buf);
    (void)getpid(); /* mark start */
    fd = open_test_file(0);
    /* First write: the file is expected not to be in the page cache yet. */
    if (write(fd, normal_write_buf, len) < 0) {
        die("write:");
    }
    /* Second write: the file should now be cached, so the path should be shorter. */
    if (write(fd, normal_write_buf, len) < 0) {
        die("write:");
    }
    if (lseek(fd, 0, SEEK_SET) < 0) {
        die("lseek:");
    }
    if (read(fd, normal_read_buf, len) < 0) {
        die("read:");
    }
    expect_same(normal_read_buf, normal_write_buf, len);
    /* Force the data to disk. */
    if (fsync(fd) < 0) {
        die("fsync:");
    }
    if (close(fd) < 0) {
        die("close:");
    }

    /* 2. O_DIRECT I/O */
    char *direct_write_buf = alloc_direct_buf();
    char *direct_read_buf = alloc_direct_buf();

    strcpy(direct_write_buf, "direct mode");
    len = strlen("direct mode");
    (void)getpid(); /* mark start */
    fd = open_test_file(O_DIRECT);
    if (write(fd, direct_write_buf, BUF_SIZE) < 0) {
        die("write:");
    }
    /* O_DIRECT only bypasses the page cache; durability still needs fsync. */
    if (fsync(fd) < 0) {
        die("fsync:");
    }
    if (lseek(fd, 0, SEEK_SET) < 0) {
        die("lseek:");
    }
    if (read(fd, direct_read_buf, BUF_SIZE) < 0) {
        die("read:");
    }
    expect_same(direct_read_buf, direct_write_buf, len);
    if (close(fd) < 0) {
        die("close:");
    }

    /* 3. O_SYNC I/O */
    const char *o_sync_write_buf = "o_sync io";
    char o_sync_read_buf[100] = {0};

    len = strlen(o_sync_write_buf);
    (void)getpid(); /* mark start */
    fd = open_test_file(O_SYNC);
    if (write(fd, o_sync_write_buf, len) < 0) {
        die("write:");
    }
    if (write(fd, o_sync_write_buf, len) < 0) {
        die("write:");
    }
    /* Open question for the trace: what does fsync do on an O_SYNC descriptor? */
    if (fsync(fd) < 0) {
        die("fsync:");
    }
    if (lseek(fd, 0, SEEK_SET) < 0) {
        die("lseek:");
    }
    if (read(fd, o_sync_read_buf, len) < 0) {
        die("read:");
    }
    expect_same(o_sync_read_buf, o_sync_write_buf, len);
    if (close(fd) < 0) {
        die("close:");
    }

    /* 4. O_DIRECT combined with O_SYNC */
    char *o_sync_direct_write_buf = alloc_direct_buf();
    char *o_sync_direct_read_buf = alloc_direct_buf();

    /* The payload must land in this scenario's own write buffer. */
    strcpy(o_sync_direct_write_buf, "o_sync o_direct mode");
    len = strlen("o_sync o_direct mode");
    (void)getpid(); /* mark start */
    fd = open_test_file(O_DIRECT | O_SYNC);
    if (write(fd, o_sync_direct_write_buf, BUF_SIZE) < 0) {
        die("write:");
    }
    if (fsync(fd) < 0) {
        die("fsync:");
    }
    if (lseek(fd, 0, SEEK_SET) < 0) {
        die("lseek:");
    }
    if (read(fd, o_sync_direct_read_buf, BUF_SIZE) < 0) {
        die("read:");
    }
    expect_same(o_sync_direct_read_buf, o_sync_direct_write_buf, len);
    if (close(fd) < 0) {
        die("close:");
    }

    /* 5. mmap I/O flushed with msync */
    const char *mmap_write_buf = "mmap io";
    char mmap_read_buf[100] = {0};

    len = strlen(mmap_write_buf);
    (void)getpid(); /* mark start */
    fd = open_test_file(0);
    file_map = map_test_file(fd);
    /* The file was truncated to 0 bytes; give it backing space before touching the mapping. */
    if (fallocate(fd, 0, 0, BUF_SIZE) < 0) {
        die("fallocate:");
    }
    memcpy(file_map, mmap_write_buf, len);
    /* read() works without msync here: the data is expected to be in the page cache. */
    if (read(fd, mmap_read_buf, len) < 0) {
        die("read:");
    }
    expect_same(mmap_read_buf, mmap_write_buf, len);
    /* Traced to compare msync with fsync. */
    if (msync(file_map, len, MS_SYNC) < 0) {
        die("msync:");
    }
    if (close(fd) < 0) {
        die("close:");
    }
    if (munmap(file_map, BUF_SIZE) < 0) {
        die("munmap:");
    }

    /* 6. mmap I/O flushed with fsync */
    const char *mmap_write_buf2 = "mmap io 2";
    char mmap_read_buf2[100] = {0};

    len = strlen(mmap_write_buf2);
    (void)getpid(); /* mark start */
    fd = open_test_file(0);
    file_map = map_test_file(fd);
    if (fallocate(fd, 0, 0, BUF_SIZE) < 0) {
        die("fallocate:");
    }
    memcpy(file_map, mmap_write_buf2, len);
    /* read() works without msync here: the data is expected to be in the page cache. */
    if (read(fd, mmap_read_buf2, len) < 0) {
        die("read:");
    }
    /* Compare the full length of this scenario's payload. */
    expect_same(mmap_read_buf2, mmap_write_buf2, len);
    /* Traced to compare fsync with msync. */
    if (fsync(fd) < 0) {
        die("fsync:");
    }
    if (close(fd) < 0) {
        die("close:");
    }
    if (munmap(file_map, BUF_SIZE) < 0) {
        die("munmap:");
    }

    /* 7. mmap I/O on a descriptor opened with O_DIRECT */
    const char *mmap_direct_write_buf = "mmap direct mode";
    char *mmap_direct_read_buf = alloc_direct_buf();

    len = strlen(mmap_direct_write_buf);
    (void)getpid(); /* mark start */
    fd = open_test_file(O_DIRECT);
    file_map = map_test_file(fd);
    if (fallocate(fd, 0, 0, BUF_SIZE) < 0) {
        die("fallocate:");
    }
    memcpy(file_map, mmap_direct_write_buf, len);
    /*
     * Open question for the trace: with O_DIRECT and no msync beforehand,
     * read() bypasses the page cache -- does it still see the mapped data?
     */
    if (read(fd, mmap_direct_read_buf, BUF_SIZE) < 0) {
        die("read:");
    }
    expect_same(mmap_direct_read_buf, mmap_direct_write_buf, len);
    /* Sync exactly the bytes written through the mapping in this scenario. */
    if (msync(file_map, len, MS_SYNC) < 0) {
        die("msync:");
    }
    if (close(fd) < 0) {
        die("close:");
    }
    if (munmap(file_map, BUF_SIZE) < 0) {
        die("munmap:");
    }

    free(direct_write_buf);
    free(direct_read_buf);
    free(o_sync_direct_write_buf);
    free(o_sync_direct_read_buf);
    free(mmap_direct_read_buf);
    return 0;
}
1.普通io模式
-
open
https://github.com/chenyang8094/iotrace/blob/master/1.open
-
第一次write
https://github.com/chenyang8094/iotrace/blob/master/1.write1
-
第二次write
https://github.com/chenyang8094/iotrace/blob/master/1.write2
可以通过http://tool.chinaz.com/tools/diff/ 这个比较第一次和第二次write路径的区别,如下图所示,左边是第一次write,右边是第二次write。由于第一次write的时候,文件没有在page cache中,因此调用路径会比第二次长,会有读磁盘数据并添加page cache的动作。
-
read
https://github.com/chenyang8094/iotrace/blob/master/1.read
-
fsync
https://github.com/chenyang8094/iotrace/blob/master/1.fsync
2.O_DIRECT模式
-
open
https://github.com/chenyang8094/iotrace/blob/master/2.open
使用O_DIRECT打开之后,和上一种open的差异可以看下图:
-
write
https://github.com/chenyang8094/iotrace/blob/master/2.write
-
read
https://github.com/chenyang8094/iotrace/blob/master/2.read
-
fsync
https://github.com/chenyang8094/iotrace/blob/master/2.fsync
3.O_SYNC模式
-
open
https://github.com/chenyang8094/iotrace/blob/master/3.open
-
第一次write
https://github.com/chenyang8094/iotrace/blob/master/3.write1
-
第二次write
https://github.com/chenyang8094/iotrace/blob/master/3.write2
-
read
https://github.com/chenyang8094/iotrace/blob/master/3.read
-
fsync
https://github.com/chenyang8094/iotrace/blob/master/3.fsync
4.O_DIRECT+O_SYNC模式
-
open
https://github.com/chenyang8094/iotrace/blob/master/4.open
-
write
https://github.com/chenyang8094/iotrace/blob/master/4.write
-
fsync
https://github.com/chenyang8094/iotrace/blob/master/4.sync
-
read
https://github.com/chenyang8094/iotrace/blob/master/4.read
5.mmap使用msync刷新
-
open
https://github.com/chenyang8094/iotrace/blob/master/5.open
-
mmap
https://github.com/chenyang8094/iotrace/blob/master/5.mmap
-
read
https://github.com/chenyang8094/iotrace/blob/master/5.read
-
msync
https://github.com/chenyang8094/iotrace/blob/master/5.msync
6.mmap使用fsync刷新
-
open
https://github.com/chenyang8094/iotrace/blob/master/6.open
-
read
https://github.com/chenyang8094/iotrace/blob/master/6.read
-
fsync
https://github.com/chenyang8094/iotrace/blob/master/6.fsync
7.mmap使用O_DIRECT方式open
-
open
https://github.com/chenyang8094/iotrace/blob/master/7.open
-
read
https://github.com/chenyang8094/iotrace/blob/master/7.read
-
msync
https://github.com/chenyang8094/iotrace/blob/master/7.msync