在块设备上运行 FIO/Vdbench 时,是否可以使用 ptrace 来模拟特定 lba 范围内的 IO 错误

is it possible to use the ptrace to simulate the IO error on the specific lba range when running the FIO/Vdbench on the block device

提问人:wang larry 提问时间:6/30/2023 更新时间:6/30/2023 访问量:23

问:

作为标题,我正在寻找一种方法来模拟使用其他 IO 工具运行工作负载(通过 FIO/Vdbench)时的 IO 错误或慢磁盘问题,我尝试使用 ptrace 通过以下代码来做到这一点

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/blkpg.h>
#include <sys/mman.h>
#include <signal.h>
#include <sys/wait.h>
#include <dirent.h>
#include <sys/user.h>
#include <sys/ptrace.h>

#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif

#define IO_ERROR_RANGE_START 0x1000000
#define IO_ERROR_RANGE_END 0x2000000

int pid_for_device(char *device)
{
    char cmd[256];
    snprintf(cmd, sizeof(cmd), "lsof %s", device);

    FILE *fp = popen(cmd, "r");
    if (fp == NULL)
    {
        perror("popen");
        return -1;
    }

    char buf[256];
    int pid = -1;
    while (fgets(buf, sizeof(buf), fp) != NULL)
    {
        if (strstr(buf, "r") != NULL)
        {
            char *ptr = strtok(buf, " ");
            ptr = strtok(NULL, " ");
            pid = atoi(ptr);
            break;
        }
    }

    pclose(fp);

    return pid;
}

void simulate_io_error(pid_t pid, unsigned long addr, unsigned long count)
{
    struct user_regs_struct regs;
    
    if (ptrace(PTRACE_GETREGS, pid, NULL, &regs) == -1)
    {
        perror("ptrace(PTRACE_GETREGS) failed");
        exit(1);
    }

    if (addr >= IO_ERROR_RANGE_START && addr + count <= IO_ERROR_RANGE_END)
    {
        printf("Simulating I/O error for range %lu-%lu\n", addr, addr + count);
        regs.rax = -EIO;
    }

    if (ptrace(PTRACE_SETREGS, pid, NULL, &regs) == -1)
    {
        perror("ptrace(PTRACE_SETREGS) failed");
        exit(1);
    }
}

int main(int argc, char *argv[])
{
    if (argc != 5)
    {
        printf("Usage: %s <device file> <start range in GB> <end range in GB> <duration in seconds>\n", argv[0]);
        return 1;
    }

    char *device_file = argv[1];
    long start_range_gb = strtol(argv[2], NULL, 10);
    long end_range_gb = strtol(argv[3], NULL, 10);
    int duration = atoi(argv[4]);

    int fd = open(device_file, O_RDWR);
    if (fd == -1)
    {
        perror("open failed");
        return 1;
    }

    unsigned long start_range_lba = start_range_gb * (1024 * 1024 * 1024 / 512);
    unsigned long end_range_lba = end_range_gb * (1024 * 1024 * 1024 / 512);

    printf("Simulating I/O errors in range %lu-%lu for %d seconds\n", start_range_lba, end_range_lba, duration);

    unsigned long range_size = (end_range_lba - start_range_lba) * 512;
    unsigned long range_start = start_range_lba * 512;

    void *mapped = mmap(NULL, range_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, range_start);
    if (mapped == MAP_FAILED)
    {
        perror("mmap failed");
        return 1;
    }

    pid_t pid = pid_for_device(device_file);
    if (pid == -1)
    {
        perror("get_pid_from_lsof failed");
        return 1;
    }

    int i = 0;
    for (; i < duration; i++)
    {
        unsigned long addr = (unsigned long)mapped + (i % (range_size / PAGE_SIZE)) * PAGE_SIZE;
        unsigned long count = PAGE_SIZE;

        simulate_io_error(pid, addr, count);

        usleep(1000);
    }

    munmap(mapped, range_size);
    close(fd);

    return 0;
}

程序将首先使用命令获取进程的 PID

lsofof /dev/sda
COMMAND   PID USER   FD   TYPE  DEVICE   SIZE/OFF       NODE NAME
fio     76954 root    3r   BLK 134,144 0x3b8ee000 1035083385 /dev/sda

然后它尝试使用 ptrace 连接 PID 76954,但是,我的程序总是以错误 ptrace(PTRACE_GETREGS) 失败告终:即使进程已准备就绪并正在运行,也没有这样的进程,我的代码中有什么问题吗?

巴西 拉里

C IO PTRACE

评论


答: 暂无答案