
The content below follows the course outline. Most of it is taken from the READMEs of the pages listed below, with extra beginner-oriented comments added to the code.

https://github.com/riscv2os/riscv2os/wiki/xv6CatRead

https://github.com/riscv2os/riscv2os/wiki/xv6Trap

https://github.com/riscv2os/riscv2os/wiki/xv6Lock

https://github.com/riscv2os/riscv2os/wiki/xv6Disk

https://github.com/riscv2os/riscv2os/wiki/xv6Mmu

https://github.com/riscv2os/riscv2os/wiki/xv6

xv6 (continued)

xv6: Understanding the file system structure -- tracing from the open() call in cat.c

In the previous article, we traced xv6's output path through the write(1, buf, n) call in cat.c, in particular how file.c:filewrite is mapped by the file table ftable to the console device and finally written out through the UART.

In this article we continue by tracing the open() and read() calls in cat.c, in order to understand xv6's inode-based file system structure.

Overall, the xv6 file system has seven layers, as shown below:

7. File Descriptor (file.c/FILE)       // ftable.file is an array of struct file;
                                       // an fd identifies one open struct file

6. Pathname (fs.c/namei+dirlookup)     // namei() looks up the inode for a given path

5. Directory (fs.h/struct dirent)      // a directory is an inode holding dirent records

4. Inode (fs.c/struct inode/dinode)    // one file corresponds to one inode

3. Logging (log.c)                     // journaling, so a crash mid-write cannot corrupt the disk

2. Buffer cache (bio.c)                // block read/write via bread(dev, i)/bwrite(dev, i),
                                       // with caching

1. Disk (virtio_disk.c)                // issues disk reads/writes over the virtio protocol

cat.c

First, let's review the source code of cat.c!

user/cat.c

#include "kernel/types.h"
#include "kernel/stat.h"
#include "user/user.h"

char buf[512];

void
cat(int fd)
{
  int n;

  while((n = read(fd, buf, sizeof(buf))) > 0) {
    if (write(1, buf, n) != n) {
      fprintf(2, "cat: write error\n");
      exit(1);
    }
  }
  if(n < 0){
    fprintf(2, "cat: read error\n");
    exit(1);
  }
}

int
main(int argc, char *argv[])
{
  int fd, i;

  if(argc <= 1){
    cat(0);
    exit(0);
  }

  for(i = 1; i < argc; i++){
    if((fd = open(argv[i], 0)) < 0){
      fprintf(2, "cat: cannot open %s\n", argv[i]);
      exit(1);
    }
    cat(fd);
    close(fd);
  }
  exit(0);
}

open()

Now let's focus on the call fd = open(argv[i], 0).

For a command like cat README, argv[i] is the string "README", so the call becomes fd = open("README", 0).

As shown in the previous article, system calls such as open() and read() are defined in user/usys.S:

user/usys.S

.global open
open:
 li a7, SYS_open
 ecall
 ret
.global read
read:
 li a7, SYS_read
 ecall
 ret
.global write
write:
 li a7, SYS_write
 ecall
 ret
.global close
close:
 li a7, SYS_close
 ecall
 ret

The ecall instruction traps into kernel (supervisor) mode; the kernel's syscall() dispatcher then looks up SYS_open in the syscalls table and calls sys_open().
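As a rough sketch (following the RISC-V calling convention; not a literal register dump), the call fd = open("README", 0) travels through the registers like this:

// Sketch of the register flow for fd = open("README", 0):
//
//   user:   a0 = &"README"          // 1st argument of open()
//           a1 = 0                  // 2nd argument (omode)
//           a7 = SYS_open           // set by the open: stub in usys.S below
//           ecall                   // trap into the kernel
//   kernel: syscall() reads a7 from the trapframe and calls sys_open(),
//           which fetches a0/a1 via argstr(0, ...) / argint(1, ...)
//   return: trapframe a0 = fd (or -1), seen by the user as open()'s result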

static uint64 (*syscalls[])(void) = {
[SYS_fork]    sys_fork,
[SYS_exit]    sys_exit,
[SYS_wait]    sys_wait,
[SYS_pipe]    sys_pipe,
[SYS_read]    sys_read,
[SYS_kill]    sys_kill,
[SYS_exec]    sys_exec,
[SYS_fstat]   sys_fstat,
[SYS_chdir]   sys_chdir,
[SYS_dup]     sys_dup,
[SYS_getpid]  sys_getpid,
[SYS_sbrk]    sys_sbrk,
[SYS_sleep]   sys_sleep,
[SYS_uptime]  sys_uptime,
[SYS_open]    sys_open,
[SYS_write]   sys_write,
[SYS_mknod]   sys_mknod,
[SYS_unlink]  sys_unlink,
[SYS_link]    sys_link,
[SYS_mkdir]   sys_mkdir,
[SYS_close]   sys_close,
};

void
syscall(void)  // note: syscall() (singular) is the dispatcher; syscalls[] (plural) is the table above
{
  int num;
  struct proc *p = myproc();

  num = p->trapframe->a7;
  if(num > 0 && num < NELEM(syscalls) && syscalls[num]) {
    p->trapframe->a0 = syscalls[num]();
  } else {
    printf("%d %s: unknown sys call %d\n",
            p->pid, p->name, num);
    p->trapframe->a0 = -1;
  }
}

sys_open() is defined in sysfile.c. It obtains the corresponding inode, creates a file descriptor (a struct file) in the file table ftable, and returns its number fd.

kernel/sysfile.c

uint64
sys_open(void)
{
  char path[MAXPATH];
  int fd, omode;
  struct file *f;
  struct inode *ip;
  int n;

  if((n = argstr(0, path, MAXPATH)) < 0 || argint(1, &omode) < 0)
    return -1;

  begin_op();

  if(omode & O_CREATE){ // caller asked to create a new file
    ip = create(path, T_FILE, 0, 0); // create a new inode
    if(ip == 0){
      end_op();
      return -1;
    }
  } else {  // open an existing file
    if((ip = namei(path)) == 0){ // look up the inode for this path
      end_op();
      return -1;
    }
    ilock(ip); // lock the inode
    if(ip->type == T_DIR && omode != O_RDONLY){
      iunlockput(ip); // a directory may only be opened read-only: unlock, release, and fail
      end_op();
      return -1;
    }
  }

  if(ip->type == T_DEVICE && (ip->major < 0 || ip->major >= NDEV)){
    iunlockput(ip); // bad device major number: unlock, release, and fail
    end_op();
    return -1;
  }

  if((f = filealloc()) == 0 || (fd = fdalloc(f)) < 0){ // allocate a struct file and an fd (usually starting at 3, since 0/1/2 are taken)
    if(f) // if the struct file was allocated but fdalloc failed, close it
      fileclose(f);
    iunlockput(ip); // allocation failed: unlock and release the inode so others can use it
    end_op();
    return -1;
  }
  // fill in the struct file that the fd will refer to
  if(ip->type == T_DEVICE){  // device
    f->type = FD_DEVICE; 
    f->major = ip->major;
  } else {  // regular file
    f->type = FD_INODE;
    f->off = 0;
  }
  f->ip = ip;
  f->readable = !(omode & O_WRONLY);
  f->writable = (omode & O_WRONLY) || (omode & O_RDWR);

  if((omode & O_TRUNC) && ip->type == T_FILE){
    itrunc(ip); // O_TRUNC on a regular file: discard the inode's contents
  }

  iunlock(ip);
  end_op();

  return fd; // return the file descriptor (the index into this process's open-file table)
}

read()

Similarly, read() is handed off to sys_read():

kernel/sysfile.c

uint64
sys_read(void)
{
  struct file *f;
  int n;
  uint64 p;

  if(argfd(0, 0, &f) < 0 || argint(2, &n) < 0 || argaddr(1, &p) < 0)
    return -1;
  return fileread(f, p, n);
}

sys_read() in turn calls fileread() in file.c:

// Read from file f.
// addr is a user virtual address.
int
fileread(struct file *f, uint64 addr, int n)
{
  int r = 0;

  if(f->readable == 0)
    return -1;

  if(f->type == FD_PIPE){  // pipe
    r = piperead(f->pipe, addr, n);  // read from the pipe
  } else if(f->type == FD_DEVICE){  // device
    if(f->major < 0 || f->major >= NDEV || !devsw[f->major].read)
      return -1;
    r = devsw[f->major].read(1, addr, n);
  } else if(f->type == FD_INODE){  // regular file
    ilock(f->ip);
    if((r = readi(f->ip, 1, addr, f->off, n)) > 0)
      f->off += r;
    iunlock(f->ip);
  } else {
    panic("fileread");
  }

  return r;
}

If the target of read() is a regular file, its inode was already obtained during open(), so the readi(f->ip, 1, addr, f->off, n) call in the code above is executed.

readi() reads the file's data in through the inode (f->ip).

inode, superblock, and the file system

The question is: what does an inode actually look like, and how is it organized? See the figure below:

(figure: how an inode's addrs[] array points to data blocks)

In xv6 and UNIX-like systems, an inode is the structure that points to the file's data blocks.

Seen from a higher level, a disk (or a disk partition, as in ext2/ext3, ...) is organized in blocks, all of the same size.

In xv6 the whole disk is divided into the regions below, where 1 means a single block and * means the region spans multiple blocks.

                               | inode info |  | which blocks are free |
[ boot block | sb block | log | inode blocks | free bit map | data blocks ]
      1            1        *         *             1              *

The free bit map uses a single block to record which blocks are in use: 1024 bytes = 8192 (1024 * 8) bits, one bit per block, so this layout can manage at most 8192 * 1024 bytes = 8 MB of disk.

Each block is 1 KB (1024 bytes), as defined in kernel/fs.h:

#define BSIZE 1024  // block size
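As a quick sanity check on that 8 MB figure, here is a small stand-alone calculation (ordinary user-space C, not xv6 kernel code):

#include <stdio.h>

#define BSIZE 1024                      /* block size, as in kernel/fs.h */

int main(void) {
  long bits_per_bitmap_block = BSIZE * 8;   /* 8192 bits in one bitmap block */
  long max_blocks = bits_per_bitmap_block;  /* one bit tracks one block      */
  long max_bytes  = max_blocks * BSIZE;     /* 8192 * 1024                   */
  printf("one bitmap block tracks %ld blocks = %ld bytes (%ld MB)\n",
         max_blocks, max_bytes, max_bytes / (1024 * 1024));
  return 0;
}

It prints 8192 blocks = 8388608 bytes (8 MB).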

xv6 reserves the first block for the boot loader; the second block is the superblock, which records the overall structure of the file system.

// Disk layout:
// [ boot block | super block | log | inode blocks |
//                                          free bit map | data blocks]
//
// mkfs computes the super block and builds an initial file system. The
// super block describes the disk layout:
struct superblock { // the superblock
  // a special magic number marks the disk as having been formatted
  uint magic;        // Must be FSMAGIC (the magic number 0x10203040)
  uint size;         // Size of file system image (blocks)
  uint nblocks;      // Number of data blocks
  uint ninodes;      // Number of inodes
  uint nlog;         // Number of log blocks
  uint logstart;     // Block number of first log block
  uint inodestart;   // Block number of first inode block
  uint bmapstart;    // Block number of first free map block
};

#define FSMAGIC 0x10203040

file.h defines the in-memory inode structure:

// in-memory copy of an inode
struct inode {  // in-memory inode
  // extra fields that exist only in memory
  uint dev;           // Device number (there may be more than one disk)
  uint inum;          // Inode number
  int ref;            // Reference count
  struct sleeplock lock; // protects everything below here
  int valid;          // inode has been read from disk?

  // copy of the on-disk inode (struct dinode)
  short type;         // copy of disk inode
  short major;
  short minor;
  short nlink;        // number of directory links to this inode (hard links, e.g. created with ln)
  uint size;
  uint addrs[NDIRECT+1];
};

Not all of this information needs to be stored on disk, so fs.h defines a separate dinode structure:

#define NDIRECT 12  // 12 direct data blocks
#define NINDIRECT (BSIZE / sizeof(uint))
#define MAXFILE (NDIRECT + NINDIRECT)

// On-disk inode structure
struct dinode {
  short type;           // File type
  short major;          // Major device number (T_DEVICE only)
  short minor;          // Minor device number (T_DEVICE only)
  short nlink;          // Number of links to inode in file system
  uint size;            // Size of file (bytes)
  uint addrs[NDIRECT+1];   // Data block addresses   // the direct blocks plus one indirect block
};
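From these constants we can also read off the largest possible file: 12 direct blocks plus 256 block numbers in the single indirect block (assuming a 4-byte uint, as in xv6). A small stand-alone check:

#include <stdio.h>

#define BSIZE     1024
#define NDIRECT   12
#define NINDIRECT (BSIZE / sizeof(unsigned int))   /* 256 block numbers */
#define MAXFILE   (NDIRECT + NINDIRECT)            /* 268 blocks        */

int main(void) {
  printf("max file size: %lu blocks = %lu bytes (%lu KB)\n",
         (unsigned long)MAXFILE,
         (unsigned long)(MAXFILE * BSIZE),
         (unsigned long)(MAXFILE * BSIZE) / 1024);
  return 0;
}

This prints 268 blocks = 274432 bytes, i.e. 268 KB per file.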

For more detail on inodes and the superblock, see vbird's article (Chinese):

http://linux.vbird.org/linux_basic/0230filesystem.php#harddisk-inode

ext2 file system: http://linux.vbird.org/linux_basic/0230filesystem/ext2_filesystem.jpg

inode: http://linux.vbird.org/linux_basic/0230filesystem/inode.jpg

Let's go back to the readi(f->ip, 1, addr, f->off, n) call and keep tracing.

kernel/fs.c

// Read data from inode.
// Caller must hold ip->lock.
// If user_dst==1, then dst is a user virtual address;
// otherwise, dst is a kernel address.
int                                  // dst is a uint64 because xv6 runs on 64-bit RISC-V
readi(struct inode *ip, int user_dst, uint64 dst, uint off, uint n) 
{
  uint tot, m;
  struct buf *bp;

  if(off > ip->size || off + n < off)
    return 0;
  if(off + n > ip->size)
    n = ip->size - off;

  // the requested data may span several 1024-byte blocks, so read them one at a time
  for(tot=0; tot<n; tot+=m, off+=m, dst+=m){
    bp = bread(ip->dev, bmap(ip, off/BSIZE));  // read one block (bio.c); bmap maps a file offset to a disk block
    m = min(n - tot, BSIZE - off%BSIZE);
    // copy into the destination buffer
    if(either_copyout(user_dst, dst, bp->data + (off % BSIZE), m) == -1) {
      brelse(bp);
      tot = -1;
      break;
    }
    brelse(bp);
  }
  return tot;
}

The buffer cache module bio.c

readi() calls bread() in bio.c to read the needed block into a buffer.

The bio.c module adds a caching layer on top of the low-level disk read/write functions in virtio_disk.c; struct bcache below is the cache itself.

kernel/bio.c

struct {
  struct spinlock lock;
  struct buf buf[NBUF];

  // Linked list of all buffers, through prev/next.
  // Sorted by how recently the buffer was used.
  // head.next is most recent, head.prev is least.
  struct buf head;
} bcache;

In bread(), if the block is already in the cache, bget() returns it directly; otherwise the virtual-disk access function virtio_disk_rw(b, 0) is called to read the corresponding block (block number blockno) from the disk.

Without a cache, repeatedly reading the same block would hit the disk every time and slow things down; with a cache, repeated reads are served from memory. Caching is essential for an operating system.

// Return a locked buf with the contents of the indicated block.
struct buf*
bread(uint dev, uint blockno)
{
  struct buf *b;

  b = bget(dev, blockno);  // look the block up in the buffer cache
  if(!b->valid) {
    virtio_disk_rw(b, 0);  // not cached yet: ask the disk to read it
    b->valid = 1;
  }
  return b;
}
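bget() itself is not listed above. As a rough illustration of what a buffer-cache lookup does (a simplified sketch, not xv6's actual bget(), which additionally takes a sleep-lock on the buffer before returning it):

// Simplified sketch of a buffer-cache lookup.
struct buf*
cache_lookup(uint dev, uint blockno)
{
  struct buf *b;

  acquire(&bcache.lock);
  // 1. already cached? reuse it and bump the reference count.
  for(b = bcache.head.next; b != &bcache.head; b = b->next){
    if(b->dev == dev && b->blockno == blockno){
      b->refcnt++;
      release(&bcache.lock);
      return b;             // caller still checks b->valid, as bread() does
    }
  }
  // 2. not cached: recycle an unused buffer for this block.
  for(b = bcache.head.prev; b != &bcache.head; b = b->prev){
    if(b->refcnt == 0){
      b->dev = dev;
      b->blockno = blockno;
      b->valid = 0;         // forces bread() to fetch it from disk
      b->refcnt = 1;
      release(&bcache.lock);
      return b;
    }
  }
  panic("cache_lookup: no buffers");
}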

Virtual disk access: virtio.h / virtio_disk.c

virtio_disk_rw() is the function xv6 uses to access qemu's virtual disk. The virtio protocol lets qemu run xv6 efficiently, without having to emulate every step a real CPU would take when driving a DMA device.

// Reference: [Virtual I/O Device (VIRTIO) Version 1.1 -- 4.2.2 MMIO Device Register Layout](https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-1460002)
void
virtio_disk_init(void) // initialize this module
{
  uint32 status = 0;

  initlock(&disk.vdisk_lock, "virtio_disk");

  // check that an MMIO virtio disk device is present
  if(*R(VIRTIO_MMIO_MAGIC_VALUE) != 0x74726976 ||
     *R(VIRTIO_MMIO_VERSION) != 1 ||
     *R(VIRTIO_MMIO_DEVICE_ID) != 2 ||
     *R(VIRTIO_MMIO_VENDOR_ID) != 0x554d4551){
    panic("could not find virtio disk");
  }
  
  // set the VIRTIO_MMIO status bits
  status |= VIRTIO_CONFIG_S_ACKNOWLEDGE;
  *R(VIRTIO_MMIO_STATUS) = status;

  status |= VIRTIO_CONFIG_S_DRIVER;
  *R(VIRTIO_MMIO_STATUS) = status;

  // negotiate features: clear the feature bits we do not support
  uint64 features = *R(VIRTIO_MMIO_DEVICE_FEATURES);
  features &= ~(1 << VIRTIO_BLK_F_RO);
  features &= ~(1 << VIRTIO_BLK_F_SCSI);
  features &= ~(1 << VIRTIO_BLK_F_CONFIG_WCE);
  features &= ~(1 << VIRTIO_BLK_F_MQ);
  features &= ~(1 << VIRTIO_F_ANY_LAYOUT);
  features &= ~(1 << VIRTIO_RING_F_EVENT_IDX);
  features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC);
  *R(VIRTIO_MMIO_DRIVER_FEATURES) = features;

  // tell device that feature negotiation is complete.
  status |= VIRTIO_CONFIG_S_FEATURES_OK; // features are settled
  *R(VIRTIO_MMIO_STATUS) = status;

  // tell device we're completely ready.
  status |= VIRTIO_CONFIG_S_DRIVER_OK;
  *R(VIRTIO_MMIO_STATUS) = status;

  *R(VIRTIO_MMIO_GUEST_PAGE_SIZE) = PGSIZE;

  // initialize queue 0.
  *R(VIRTIO_MMIO_QUEUE_SEL) = 0;
  uint32 max = *R(VIRTIO_MMIO_QUEUE_NUM_MAX);
  if(max == 0)
    panic("virtio disk has no queue 0");
  if(max < NUM)
    panic("virtio disk max queue too short");
  *R(VIRTIO_MMIO_QUEUE_NUM) = NUM;
  memset(disk.pages, 0, sizeof(disk.pages));
  *R(VIRTIO_MMIO_QUEUE_PFN) = ((uint64)disk.pages) >> PGSHIFT; // PFN: Guest physical page number of the virtual queue

  // desc = pages -- num * virtq_desc
  // avail = pages + 0x40 -- 2 * uint16, then num * uint16
  // used = pages + 4096 -- 2 * uint16, then num * vRingUsedElem

  disk.desc = (struct virtq_desc *) disk.pages;
  disk.avail = (struct virtq_avail *)(disk.pages + NUM*sizeof(struct virtq_desc));
  disk.used = (struct virtq_used *) (disk.pages + PGSIZE);

  // all NUM descriptors start out unused (free).
  for(int i = 0; i < NUM; i++)
    disk.free[i] = 1;

  // plic.c and trap.c arrange for interrupts from VIRTIO0_IRQ.
}

// find a free descriptor, mark it non-free, return its index.
static int
alloc_desc() // find a free descriptor and allocate it
{
  for(int i = 0; i < NUM; i++){
    if(disk.free[i]){
      disk.free[i] = 0;
      return i;
    }
  }
  return -1;
}

// mark a descriptor as free.
static void
free_desc(int i) // return a descriptor to the free state
{
  if(i >= NUM)
    panic("free_desc 1");
  if(disk.free[i])
    panic("free_desc 2");
  disk.desc[i].addr = 0;
  disk.desc[i].len = 0;
  disk.desc[i].flags = 0;
  disk.desc[i].next = 0;
  disk.free[i] = 1;
  wakeup(&disk.free[0]);
}

// free a chain of descriptors.
static void
free_chain(int i) // free an entire descriptor chain
{
  while(1){
    int flag = disk.desc[i].flags;
    int nxt = disk.desc[i].next;
    free_desc(i);
    if(flag & VRING_DESC_F_NEXT)
      i = nxt;
    else
      break;
  }
}

// allocate three descriptors (they need not be contiguous).
// disk transfers always use three descriptors.
static int
alloc3_desc(int *idx) // allocate 3 descriptors (they need not be contiguous)
{
  for(int i = 0; i < 3; i++){
    idx[i] = alloc_desc();
    if(idx[i] < 0){
      for(int j = 0; j < i; j++)
        free_desc(idx[j]);
      return -1;
    }
  }
  return 0;
}

void
virtio_disk_rw(struct buf *b, int write) // issue a virtio disk read or write request
{
  uint64 sector = b->blockno * (BSIZE / 512);

  acquire(&disk.vdisk_lock);

  // the spec's Section 5.2 says that legacy block operations use
  // three descriptors: one for type/reserved/sector, one for the
  // data, one for a 1-byte status result.

  // allocate the three descriptors, retrying until it succeeds.
  int idx[3];
  while(1){
    if(alloc3_desc(idx) == 0) {
      break;
    }
    sleep(&disk.free[0], &disk.vdisk_lock);
  }

  // format the three descriptors.
  // qemu's virtio-blk.c reads them.
  // see -- https://github.com/qemu/qemu/blob/master/hw/block/virtio-blk.c

  struct virtio_blk_req *buf0 = &disk.ops[idx[0]]; // the request header for this operation

  if(write) // set the request type according to the write flag
    buf0->type = VIRTIO_BLK_T_OUT; // write the disk
  else
    buf0->type = VIRTIO_BLK_T_IN; // read the disk
  buf0->reserved = 0;
  buf0->sector = sector; // sector number to read or write

  // descriptor 0: the request header
  disk.desc[idx[0]].addr = (uint64) buf0;
  disk.desc[idx[0]].len = sizeof(struct virtio_blk_req);
  disk.desc[idx[0]].flags = VRING_DESC_F_NEXT;
  disk.desc[idx[0]].next = idx[1];

  // descriptor 1: the data buffer
  disk.desc[idx[1]].addr = (uint64) b->data;
  disk.desc[idx[1]].len = BSIZE;
  if(write)
    disk.desc[idx[1]].flags = 0; // device reads b->data
  else
    disk.desc[idx[1]].flags = VRING_DESC_F_WRITE; // device writes b->data
  disk.desc[idx[1]].flags |= VRING_DESC_F_NEXT;
  disk.desc[idx[1]].next = idx[2];

  // descriptor 2: the one-byte status result
  disk.info[idx[0]].status = 0xff; // device writes 0 on success
  disk.desc[idx[2]].addr = (uint64) &disk.info[idx[0]].status;
  disk.desc[idx[2]].len = 1;
  disk.desc[idx[2]].flags = VRING_DESC_F_WRITE; // device writes the status
  disk.desc[idx[2]].next = 0;

  // record struct buf for virtio_disk_intr().
  b->disk = 1;
  disk.info[idx[0]].b = b;

  // tell the device the first index in our chain of descriptors.
  disk.avail->ring[disk.avail->idx % NUM] = idx[0];

  __sync_synchronize();

  // tell the device another avail ring entry is available.
  disk.avail->idx += 1; // not % NUM ...

  __sync_synchronize();

  *R(VIRTIO_MMIO_QUEUE_NOTIFY) = 0; // value is queue number

  // Wait for virtio_disk_intr() to say request has finished.
  while(b->disk == 1) { // b->disk == 1 means the device is still working on buf; block until it finishes
    sleep(b, &disk.vdisk_lock);
  }

  // done: release the descriptor chain that starts at idx[0]
  disk.info[idx[0]].b = 0;
  free_chain(idx[0]);

  release(&disk.vdisk_lock);
}

void
virtio_disk_intr() // after virtio_disk_rw() issues a request, qemu raises this interrupt when it completes
{
  acquire(&disk.vdisk_lock);

  // the device won't raise another interrupt until we tell it
  // we've seen this interrupt, which the following line does.
  // this may race with the device writing new entries to
  // the "used" ring, in which case we may process the new
  // completion entries in this interrupt, and have nothing to do
  // in the next interrupt, which is harmless.
  *R(VIRTIO_MMIO_INTERRUPT_ACK) = *R(VIRTIO_MMIO_INTERRUPT_STATUS) & 0x3;

  __sync_synchronize();

  // the device increments disk.used->idx when it
  // adds an entry to the used ring.

  while(disk.used_idx != disk.used->idx){
    __sync_synchronize();
    int id = disk.used->ring[disk.used_idx % NUM].id;

    if(disk.info[id].status != 0)
      panic("virtio_disk_intr status");

    struct buf *b = disk.info[id].b;
    b->disk = 0;   // disk is done with buf
    wakeup(b); // I/O done: wake the process waiting on this buffer (make it runnable again)

    disk.used_idx += 1;
  }

  release(&disk.vdisk_lock);
}

file

file.c

struct file {
  enum { FD_NONE, FD_PIPE, FD_INODE, FD_DEVICE } type;
  int ref; // reference count
  char readable;
  char writable;
  struct pipe *pipe; // FD_PIPE
  struct inode *ip;  // FD_INODE and FD_DEVICE
  uint off;          // FD_INODE
  short major;       // FD_DEVICE
};

fs.c and fs.h define the main structure of the file system, including layers 5 and 6 (directories and pathnames).

dirent

// Inodes per block.
#define IPB           (BSIZE / sizeof(struct dinode))

// Block containing inode i
#define IBLOCK(i, sb)     ((i) / IPB + sb.inodestart)

// Bitmap bits per block
#define BPB           (BSIZE*8)  // one bit per block records whether that block is in use

// Block of free map containing bit for block b
#define BBLOCK(b, sb) ((b)/BPB + sb.bmapstart)

// Directory is a file containing a sequence of dirent structures.
#define DIRSIZ 14

struct dirent {
  ushort inum;
  char name[DIRSIZ];
};
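fs.c's dirlookup() scans these dirent records. As a simplified sketch of the idea (not xv6's actual dirlookup(), which also returns the byte offset and uses namecmp()), looking up a name inside a locked directory inode dp could be written like this:

// Simplified sketch of a directory lookup using readi().
static uint
lookup_inum(struct inode *dp, const char *name)
{
  struct dirent de;
  uint off;

  for(off = 0; off < dp->size; off += sizeof(de)){
    if(readi(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de))
      break;                          // read error or short read
    if(de.inum == 0)
      continue;                       // unused directory slot
    if(strncmp(name, de.name, DIRSIZ) == 0)
      return de.inum;                 // found: return the inode number
  }
  return 0;                           // not found
}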

The overall file system structure

Overall, the xv6 file system has seven layers, as shown below:

7. File Descriptor (file.c/FILE)       // ftable.file is an array of struct file;
                                       // an fd identifies one open struct file

6. Pathname (fs.c/namei+dirlookup)     // namei() looks up the inode for a given path

5. Directory (fs.h/struct dirent)      // a directory is an inode holding dirent records

4. Inode (fs.c/struct inode/dinode)    // one file corresponds to one inode

3. Logging (log.c)                     // journaling, so a crash mid-write cannot corrupt the disk

2. Buffer cache (bio.c)                // block read/write via bread(dev, i)/bwrite(dev, i),
                                       // with caching

1. Disk (virtio_disk.c)                // issues disk reads/writes over the virtio protocol

xv6: Interrupts and traps

RISC-V typically has two interrupt controllers: the CLINT (Core Local Interruptor) for local interrupts and the PLIC (Platform-Level Interrupt Controller) for external interrupts.

The CLINT covers software interrupts and timer interrupts, which are taken immediately.

The PLIC covers external device interrupts, such as those from the disk, keyboard, and network; it arbitrates which pending interrupt is handled first.

Key ideas

A user process hands control to the operating system in two situations:

  1. It has held the CPU too long and is forcibly preempted by a timer interrupt.
  2. It makes a system call, handing control to the kernel.
    • In this case the kernel wakes the user process after completing the system call (sets its state to ready and puts it back on the scheduler's queue).

When a trap occurs while running in the kernel, the stvec register points at kernelvec, so execution jumps to kernelvec.

kernel/trap.c

// set up to take exceptions and traps while in the kernel.
void
trapinithart(void) // called from main() during boot to install the trap handler
{
  w_stvec((uint64)kernelvec); // set the kernel's trap handler to kernelvec
}

kernelvec saves the registers and then calls kerneltrap:

.globl kerneltrap
.globl kernelvec
.align 4
kernelvec: // the kernel trap vector
        // make room to save registers on the stack.
        addi sp, sp, -256

        // save the registers.
        sd ra, 0(sp)
        sd sp, 8(sp)
        sd gp, 16(sp)
        sd tp, 24(sp)
        sd t0, 32(sp)
        sd t1, 40(sp)
        sd t2, 48(sp)
        sd s0, 56(sp)
        sd s1, 64(sp)
        sd a0, 72(sp)
        sd a1, 80(sp)
        sd a2, 88(sp)
        sd a3, 96(sp)
        sd a4, 104(sp)
        sd a5, 112(sp)
        sd a6, 120(sp)
        sd a7, 128(sp)
        sd s2, 136(sp)
        sd s3, 144(sp)
        sd s4, 152(sp)
        sd s5, 160(sp)
        sd s6, 168(sp)
        sd s7, 176(sp)
        sd s8, 184(sp)
        sd s9, 192(sp)
        sd s10, 200(sp)
        sd s11, 208(sp)
        sd t3, 216(sp)
        sd t4, 224(sp)
        sd t5, 232(sp)
        sd t6, 240(sp)

        // call the C trap handler kerneltrap() in trap.c
        call kerneltrap

        // restore registers.
        ld ra, 0(sp)
        ld sp, 8(sp)
        ld gp, 16(sp)
        // not this, in case we moved CPUs: ld tp, 24(sp)
        ld t0, 32(sp)
        ld t1, 40(sp)
        ld t2, 48(sp)
        ld s0, 56(sp)
        ld s1, 64(sp)
        ld a0, 72(sp)
        ld a1, 80(sp)
        ld a2, 88(sp)
        ld a3, 96(sp)
        ld a4, 104(sp)
        ld a5, 112(sp)
        ld a6, 120(sp)
        ld a7, 128(sp)
        ld s2, 136(sp)
        ld s3, 144(sp)
        ld s4, 152(sp)
        ld s5, 160(sp)
        ld s6, 168(sp)
        ld s7, 176(sp)
        ld s8, 184(sp)
        ld s9, 192(sp)
        ld s10, 200(sp)
        ld s11, 208(sp)
        ld t3, 216(sp)
        ld t4, 224(sp)
        ld t5, 232(sp)
        ld t6, 240(sp)

        addi sp, sp, 256 # restore the stack pointer

        // return to whatever we were doing in the kernel.
        sret

kerneltrap then calls devintr() to handle the interrupt:

kernel/trap.c

// interrupts and exceptions from kernel code go here via kernelvec,
// on whatever the current kernel stack is.
void 
kerneltrap()  // work out what kind of trap this is and react accordingly
{
  int which_dev = 0;
  uint64 sepc = r_sepc();
  uint64 sstatus = r_sstatus();
  uint64 scause = r_scause();
  
  if((sstatus & SSTATUS_SPP) == 0)
    panic("kerneltrap: not from supervisor mode");
  if(intr_get() != 0)
    panic("kerneltrap: interrupts enabled");

  if((which_dev = devintr()) == 0){ // 1. device/timer interrupts go to devintr(); 0 means unrecognized
    printf("scause %p\n", scause);
    printf("sepc=%p stval=%p\n", r_sepc(), r_stval());
    panic("kerneltrap");
  }

  // give up the CPU if this is a timer interrupt.
  if(which_dev == 2 && myproc() != 0 && myproc()->state == RUNNING)  // 2. timer interrupt: give up the CPU
    yield(); // note: timer interrupts preempt both user mode and kernel mode; in both cases we yield.

  // the yield() may have caused some traps to occur,
  // so restore trap registers for use by kernelvec.S's sepc instruction.
  w_sepc(sepc);
  w_sstatus(sstatus);
}

devintr() is where interrupts are actually dispatched:

kernel/trap.c

// check if it's an external interrupt or software interrupt,
// and handle it.
// returns 2 if timer interrupt,
// 1 if other device,
// 0 if not recognized.
int
devintr() // dispatch a device or timer interrupt
{
  uint64 scause = r_scause();

  if((scause & 0x8000000000000000L) &&
     (scause & 0xff) == 9){ // 1. external (hardware) interrupt via the PLIC
    // this is a supervisor external interrupt, via PLIC.

    // irq indicates which device interrupted.
    int irq = plic_claim();

    if(irq == UART0_IRQ){ // UART interrupt
      uartintr();
    } else if(irq == VIRTIO0_IRQ){ // disk interrupt
      virtio_disk_intr();
    } else if(irq){
      printf("unexpected interrupt irq=%d\n", irq);
    }

    // the PLIC allows each device to raise at most one
    // interrupt at a time; tell the PLIC the device is
    // now allowed to interrupt again.
    if(irq)
      plic_complete(irq); // the device may now interrupt again.

    return 1;
  } else if(scause == 0x8000000000000001L){ // 2. timer interrupt
    // software interrupt from a machine-mode timer interrupt,
    // forwarded by timervec in kernelvec.S.

    if(cpuid() == 0){
      clockintr();
    }
    
    // acknowledge the software interrupt by clearing
    // the SSIP bit in sip.
    w_sip(r_sip() & ~2); // note: sip is the supervisor interrupt-pending register; bit 1 is the software-interrupt bit

    return 2;
  } else {
    return 0;
  }
}
  1. Device interrupt: call the matching device handler (uartintr / virtio_disk_intr).
  2. Timer interrupt: call clockintr(); then, back in kerneltrap(), yield() hands the CPU to the next process.

These are the key points of xv6's interrupt mechanism. Let's now take a guided tour, starting from the kernel's main program, main.c!

main.c

In a multitasking (multi-process) system like xv6, interrupts are what repeatedly interrupt processes and stitch their execution back together.

After booting, xv6 enters main(), which sets up everything the operating system needs.

kernel/main.c

#include "types.h"
#include "param.h"
#include "memlayout.h"
#include "riscv.h"
#include "defs.h"

volatile static int started = 0;

// start() jumps here in supervisor mode on all CPUs.
void
main()
{
  if(cpuid() == 0){  // hart 0 performs the kernel initialization
    consoleinit();   // set up the console
    printfinit();    // set up printf
    printf("\n");
    printf("xv6 kernel is booting\n");
    printf("\n");
    kinit();         // physical page allocator
    kvminit();       // create kernel page table
    kvminithart();   // turn on paging
    procinit();      // process table
    trapinit();      // trap vectors
    trapinithart();  // install kernel trap vector
    plicinit();      // set up interrupt controller
    plicinithart();  // ask PLIC for device interrupts
    binit();         // file system: buffer cache
    iinit();         // file system: inode cache
    fileinit();      // file system: file table
    virtio_disk_init(); // file system: emulated (virtio) hard disk
    userinit();      // first user process (init)
    __sync_synchronize();
    started = 1;     // boot is complete
  } else {  // the other harts wait for hart 0, then set themselves up
    while(started == 0)
      ;
    __sync_synchronize();
    printf("hart %d starting\n", cpuid());
    kvminithart();    // turn on paging
    trapinithart();   // install kernel trap vector
    plicinithart();   // ask PLIC for device interrupts
  }

  scheduler(); // enter the scheduler (an infinite loop)
}

The following lines in main() are the ones that set up interrupt handling:

    trapinit();      // trap vectors
    trapinithart();  // install kernel trap vector
    plicinit();      // set up interrupt controller
    plicinithart();  // ask PLIC for device interrupts

Let's trace into them one by one!

struct spinlock tickslock;
uint ticks;
//...
void
trapinit(void)
{
  initlock(&tickslock, "time");
}

// ...
void
clockintr() // timer interrupt handler
{
  acquire(&tickslock);  // enter the critical section
  ticks++;
  wakeup(&ticks);
  release(&tickslock);
}

trapinit() only initializes the timer interrupt's lock tickslock; clockintr() can then acquire() and release() that lock so that updates to the ticks variable cannot race.

wakeup(&ticks) wakes the processes sleeping on the &ticks channel, putting them back into a runnable state.

(Note: these are the processes sleeping on the &ticks channel -- for example, ones blocked in sys_sleep() waiting for time to pass -- so they get a chance to run again.)

kernel/proc.c

// Wake up all processes sleeping on chan.
// Must be called without any p->lock.
void
wakeup(void *chan)
{
  struct proc *p;

  for(p = proc; p < &proc[NPROC]; p++) {
    if(p != myproc()){
      acquire(&p->lock);
      if(p->state == SLEEPING && p->chan == chan) {  
        p->state = RUNNABLE;  // SLEEPING -> RUNNABLE
      }
      release(&p->lock);
    }
  }
}

// Atomically release lock and sleep on chan.
// Reacquires lock when awakened.
void
sleep(void *chan, struct spinlock *lk)
{
  struct proc *p = myproc();
  
  // Must acquire p->lock in order to
  // change p->state and then call sched.
  // Once we hold p->lock, we can be
  // guaranteed that we won't miss any wakeup
  // (wakeup locks p->lock),
  // so it's okay to release lk.

  acquire(&p->lock);  //DOC: sleeplock1
  release(lk);

  // Go to sleep.
  p->chan = chan;
  p->state = SLEEPING;

  sched();  // switch to the scheduler

  // Tidy up.
  p->chan = 0;

  // Reacquire original lock.
  release(&p->lock);
  acquire(lk);
}
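To make the channel idea concrete, here is a hypothetical producer/consumer pair built on sleep()/wakeup() (the names q, q.lock, and q.count are made up for illustration; this is not xv6 code):

// Hypothetical use of a channel: consumers sleep on &q, producers wake it.
struct {
  struct spinlock lock;
  int count;              // number of available items
} q;

void
consume(void)
{
  acquire(&q.lock);
  while(q.count == 0)
    sleep(&q, &q.lock);   // releases q.lock while asleep, reacquires on wakeup
  q.count--;              // take one item
  release(&q.lock);
}

void
produce(void)
{
  acquire(&q.lock);
  q.count++;              // add one item
  wakeup(&q);             // make any process sleeping on &q RUNNABLE
  release(&q.lock);
}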

After trapinit(), trapinithart() installs the kernel trap vector kernelvec; from this point on the trap mechanism is really active.

// set up to take exceptions and traps while in the kernel.
void
trapinithart(void)
{
  w_stvec((uint64)kernelvec); // set the kernel's trap handler to kernelvec
}

kernelvec first saves the registers on the kernel stack and then calls the C function kerneltrap().

kernel/kernelvec.S

	#
        # interrupts and exceptions while in supervisor
        # mode come here.
        #
        # push all registers, call kerneltrap(), restore, return.
        #
.globl kerneltrap
.globl kernelvec
.align 4
kernelvec:
        // make room to save registers on the stack.
        addi sp, sp, -256

        // save the registers.
        sd ra, 0(sp)
        sd sp, 8(sp)
        sd gp, 16(sp)
        sd tp, 24(sp)
        sd t0, 32(sp)
        sd t1, 40(sp)
        sd t2, 48(sp)
        sd s0, 56(sp)
        sd s1, 64(sp)
        sd a0, 72(sp)
        sd a1, 80(sp)
        sd a2, 88(sp)
        sd a3, 96(sp)
        sd a4, 104(sp)
        sd a5, 112(sp)
        sd a6, 120(sp)
        sd a7, 128(sp)
        sd s2, 136(sp)
        sd s3, 144(sp)
        sd s4, 152(sp)
        sd s5, 160(sp)
        sd s6, 168(sp)
        sd s7, 176(sp)
        sd s8, 184(sp)
        sd s9, 192(sp)
        sd s10, 200(sp)
        sd s11, 208(sp)
        sd t3, 216(sp)
        sd t4, 224(sp)
        sd t5, 232(sp)
        sd t6, 240(sp)

        // call the C trap handler kerneltrap() in trap.c
        call kerneltrap

        // restore registers.
        ld ra, 0(sp)
        ld sp, 8(sp)
        ld gp, 16(sp)
        // not this, in case we moved CPUs: ld tp, 24(sp)
        ld t0, 32(sp)
        ld t1, 40(sp)
        ld t2, 48(sp)
        ld s0, 56(sp)
        ld s1, 64(sp)
        ld a0, 72(sp)
        ld a1, 80(sp)
        ld a2, 88(sp)
        ld a3, 96(sp)
        ld a4, 104(sp)
        ld a5, 112(sp)
        ld a6, 120(sp)
        ld a7, 128(sp)
        ld s2, 136(sp)
        ld s3, 144(sp)
        ld s4, 152(sp)
        ld s5, 160(sp)
        ld s6, 168(sp)
        ld s7, 176(sp)
        ld s8, 184(sp)
        ld s9, 192(sp)
        ld s10, 200(sp)
        ld s11, 208(sp)
        ld t3, 216(sp)
        ld t4, 224(sp)
        ld t5, 232(sp)
        ld t6, 240(sp)

        addi sp, sp, 256 # restore the stack pointer

        // return to whatever we were doing in the kernel.
        sret

So the focus returns to kerneltrap() in trap.c.

// interrupts and exceptions from kernel code go here via kernelvec,
// on whatever the current kernel stack is.
void 
kerneltrap()
{
  int which_dev = 0;
  uint64 sepc = r_sepc();
  uint64 sstatus = r_sstatus();
  uint64 scause = r_scause();
  
  if((sstatus & SSTATUS_SPP) == 0)
    panic("kerneltrap: not from supervisor mode");
  if(intr_get() != 0)
    panic("kerneltrap: interrupts enabled");

  if((which_dev = devintr()) == 0){ // 1. device/timer interrupts go to devintr(); 0 means unrecognized
    printf("scause %p\n", scause);
    printf("sepc=%p stval=%p\n", r_sepc(), r_stval());
    panic("kerneltrap");
  }

  // give up the CPU if this is a timer interrupt.
  if(which_dev == 2 && myproc() != 0 && myproc()->state == RUNNING)  // 2. timer interrupt: give up the CPU
    yield(); // note: timer interrupts preempt both user mode and kernel mode; in both cases we yield.

  // the yield() may have caused some traps to occur,
  // so restore trap registers for use by kernelvec.S's sepc instruction.
  w_sepc(sepc);
  w_sstatus(sstatus);
}

kerneltrap() determines the kind of trap and reacts as follows:

  1. Device interrupt: handled by devintr().
  2. Timer interrupt: yield() hands the processor to the next process.

For device interrupts, devintr() determines which device (UART keyboard / virtio disk / timer) raised the interrupt and calls the matching handler:

  1. UART: uartintr()
  2. VIRTIO: virtio_disk_intr()
  3. TIMER: clockintr()

kernel/trap.c

// ...
// check if it's an external interrupt or software interrupt,
// and handle it.
// returns 2 if timer interrupt,
// 1 if other device,
// 0 if not recognized.
int
devintr() // dispatch a device or timer interrupt
{
  uint64 scause = r_scause();

  if((scause & 0x8000000000000000L) &&
     (scause & 0xff) == 9){ // 1. external (hardware) interrupt via the PLIC
    // this is a supervisor external interrupt, via PLIC.

    // irq indicates which device interrupted.
    int irq = plic_claim();

    if(irq == UART0_IRQ){ // UART interrupt
      uartintr();
    } else if(irq == VIRTIO0_IRQ){ // disk interrupt
      virtio_disk_intr();
    } else if(irq){
      printf("unexpected interrupt irq=%d\n", irq);
    }

    // the PLIC allows each device to raise at most one
    // interrupt at a time; tell the PLIC the device is
    // now allowed to interrupt again.
    if(irq)
      plic_complete(irq); // the device may now interrupt again.

    return 1;
  } else if(scause == 0x8000000000000001L){ // 2. timer interrupt
    // software interrupt from a machine-mode timer interrupt,
    // forwarded by timervec in kernelvec.S.

    if(cpuid() == 0){
      clockintr();
    }
    
    // acknowledge the software interrupt by clearing
    // the SSIP bit in sip.
    w_sip(r_sip() & ~2); // note: sip is the supervisor interrupt-pending register; bit 1 is the software-interrupt bit

    return 2;
  } else {
    return 0;
  }
}

Each of these device interrupt handlers reads the data the device has made available and then triggers the corresponding processing:

  1. UART: uartintr()
  2. VIRTIO: virtio_disk_intr()
  3. TIMER: clockintr()
// handle a uart interrupt, raised because input has
// arrived, or the uart is ready for more output, or
// both. called from trap.c.
void
uartintr(void)
{
  // read and process incoming characters.
  while(1){
    int c = uartgetc();
    if(c == -1)
      break;
    consoleintr(c);
  }

  // send buffered characters.  
  acquire(&uart_tx_lock); 
  uartstart();
  release(&uart_tx_lock);
}

For example, uartintr() calls consoleintr(c), which interprets the character (handling things such as \b, backspace) and stores the processed character into the input line buffer with cons.buf[cons.e++ % INPUT_BUF] = c.

kernel/console.c

#define BACKSPACE 0x100
#define C(x)  ((x)-'@')  // Control-x

consoleintr(int c)
{
  acquire(&cons.lock);

  switch(c){
  case C('P'):  // Print process list.  // ctrl + P
    procdump();
    break;
  case C('U'):  // Kill line.
    while(cons.e != cons.w &&
          cons.buf[(cons.e-1) % INPUT_BUF] != '\n'){
      cons.e--;
      consputc(BACKSPACE);
    }
    break;
  case C('H'): // Backspace
  case '\x7f':
    if(cons.e != cons.w){
      cons.e--;
      consputc(BACKSPACE);
    }
    break;
  default:
    if(c != 0 && cons.e-cons.r < INPUT_BUF){
      c = (c == '\r') ? '\n' : c;

      // echo back to the user.
      consputc(c);

      // store for consumption by consoleread().
      cons.buf[cons.e++ % INPUT_BUF] = c;

      if(c == '\n' || c == C('D') || cons.e == cons.r+INPUT_BUF){
        // wake up consoleread() if a whole line (or end-of-file)
        // has arrived.
        cons.w = cons.e;
        wakeup(&cons.r);
      }
    }
    break;
  }
  
  release(&cons.lock);
}

// ...
//
// send one character to the uart.
// called by printf, and to echo input characters,
// but not from write().
//
void
consputc(int c)  // for BACKSPACE: move back one position, erase it with a space, then move back again
{
  if(c == BACKSPACE){
    // if the user typed backspace, overwrite with a space.
    uartputc_sync('\b'); uartputc_sync(' '); uartputc_sync('\b');
  } else {
    uartputc_sync(c);
  }
}

kernel/uart.c

// alternate version of uartputc() that doesn't 
// use interrupts, for use by kernel printf() and
// to echo characters. it spins waiting for the uart's
// output register to be empty.
void
uartputc_sync(int c)
{
  push_off();

  if(panicked){
    for(;;)
      ;
  }

  // wait for Transmit Holding Empty to be set in LSR.
  while((ReadReg(LSR) & LSR_TX_IDLE) == 0)
    ;
  WriteReg(THR, c);

  pop_off();
}

The disk interrupt handler virtio_disk_intr() is defined in virtio_disk.c; its main job is to wake up the processes whose disk requests have completed.

kernel/virtio_disk.c

void
virtio_disk_intr()
{
  acquire(&disk.vdisk_lock);

  // the device won't raise another interrupt until we tell it
  // we've seen this interrupt, which the following line does.
  // this may race with the device writing new entries to
  // the "used" ring, in which case we may process the new
  // completion entries in this interrupt, and have nothing to do
  // in the next interrupt, which is harmless.
  *R(VIRTIO_MMIO_INTERRUPT_ACK) = *R(VIRTIO_MMIO_INTERRUPT_STATUS) & 0x3;

  __sync_synchronize();

  // the device increments disk.used->idx when it
  // adds an entry to the used ring.

  while(disk.used_idx != disk.used->idx){
    __sync_synchronize();
    int id = disk.used->ring[disk.used_idx % NUM].id;

    if(disk.info[id].status != 0)
      panic("virtio_disk_intr status");

    struct buf *b = disk.info[id].b;
    b->disk = 0;   // disk is done with buf
    wakeup(b);

    disk.used_idx += 1;
  }

  release(&disk.vdisk_lock);
}

As for software interrupts and timer interrupts, we have already covered them in earlier articles, so we will not repeat that here!

xv6: Parallelism and locks -- managing multiple cores

In a multitasking system, locks are used to prevent concurrent access to shared variables, i.e. to prevent race conditions.

xv6 mainly uses spinlocks; the structure is defined in spinlock.h.

A spinlock keeps more than one process (or CPU) from being inside a critical section at the same time.

kernel/spinlock.h

// Mutual exclusion lock.
struct spinlock {
  uint locked;       // Is the lock held?

  // For debugging:
  char *name;        // Name of lock.
  struct cpu *cpu;   // The cpu holding the lock.
};

For example, filedup() in file.c locks with acquire() and unlocks with release():

kernel/file.c

// Increment ref count for file f.
struct file*
filedup(struct file *f)
{
  acquire(&ftable.lock);
  if(f->ref < 1)
    panic("filedup");
  f->ref++;
  release(&ftable.lock);
  return f;
}

This prevents several processes from updating the file table at the same time and leaving f->ref inconsistent because of a race.
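Without the lock, the read-modify-write of f->ref could interleave across two CPUs; a hypothetical schedule shows the lost update:

// Hypothetical interleaving of f->ref++ on two CPUs without the lock:
//   CPU0: reads  f->ref (== 1)
//   CPU1: reads  f->ref (== 1)
//   CPU0: writes f->ref = 2
//   CPU1: writes f->ref = 2   <-- lost update: the count should be 3
// With acquire(&ftable.lock)/release(&ftable.lock) around the increment,
// the read-modify-write is serialized and the count stays correct.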

In xv6, acquire() and release() are implemented as follows:

kernel/spinlock.c

// Acquire the lock.
// Loops (spins) until the lock is acquired.
void
acquire(struct spinlock *lk)
{
  push_off(); // disable interrupts to avoid deadlock.
  if(holding(lk)) // re-acquiring a lock this CPU already holds is a fatal error
    panic("acquire");

  // On RISC-V, sync_lock_test_and_set turns into an atomic swap:
  //   a5 = 1
  //   s1 = &lk->locked
  //   amoswap.w.aq a5, a5, (s1)
  while(__sync_lock_test_and_set(&lk->locked, 1) != 0) // spin until the lock is acquired
    ;

  // Tell the C compiler and the processor to not move loads or stores
  // past this point, to ensure that the critical section's memory
  // references happen strictly after the lock is acquired.
  // On RISC-V, this emits a fence instruction.
  // keep loads and stores from being reordered across this point
  __sync_synchronize(); // tell the compiler and CPU not to move memory accesses across this barrier

  // Record info about lock acquisition for holding() and debugging.
  lk->cpu = mycpu(); // record which CPU holds the lock
}

// Release the lock.
void
release(struct spinlock *lk)
{
  if(!holding(lk)) // calling release() on a lock we do not hold is a fatal error
    panic("release");

  lk->cpu = 0;

  // Tell the C compiler and the CPU to not move loads or stores
  // past this point, to ensure that all the stores in the critical
  // section are visible to other CPUs before the lock is released,
  // and that loads in the critical section occur strictly before
  // the lock is released.
  // On RISC-V, this emits a fence instruction.
  __sync_synchronize(); // tell the compiler and CPU not to move memory accesses across this barrier

  // Release the lock, equivalent to lk->locked = 0.
  // This code doesn't use a C assignment, since the C standard
  // implies that an assignment might be implemented with
  // multiple store instructions.
  // On RISC-V, sync_lock_release turns into an atomic swap:
  //   s1 = &lk->locked
  //   amoswap.w zero, zero, (s1)
  __sync_lock_release(&lk->locked); // release the lock

  pop_off(); // re-enable interrupts (if they were enabled before)
}

To avoid races, interrupts on the current CPU must also be disabled, so acquire() calls push_off() to turn interrupts off, and release() calls pop_off() to lift that restriction.

kernel/spinlock.c

// push_off/pop_off are like intr_off()/intr_on() except that they are matched:
// it takes two pop_off()s to undo two push_off()s.  Also, if interrupts
// are initially off, then push_off, pop_off leaves them off.

void
push_off(void)
{
  int old = intr_get();

  intr_off();
  if(mycpu()->noff == 0)
    mycpu()->intena = old;
  mycpu()->noff += 1;
}

void
pop_off(void)
{
  struct cpu *c = mycpu();
  if(intr_get())
    panic("pop_off - interruptible");
  if(c->noff < 1)
    panic("pop_off");
  c->noff -= 1;
  if(c->noff == 0 && c->intena)
    intr_on();
}
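The noff counter is what makes the pair nest correctly; a hypothetical sketch (not xv6 code) of how nested calls behave:

// Hypothetical nesting of push_off()/pop_off():
void
nested_example(void)
{
  push_off();            // noff: 0 -> 1; remembers whether interrupts were on
  push_off();            // noff: 1 -> 2; interrupts stay off
  // work that must not be interrupted goes here
  pop_off();             // noff: 2 -> 1; interrupts remain off
  pop_off();             // noff: 1 -> 0; interrupts are re-enabled only if they
                         // were on before the first push_off()
}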

acquire() uses some compiler-specific builtins; for example, __sync_lock_test_and_set(&lk->locked, 1) expands to the following RISC-V assembly:

a5 = 1
s1 = &lk->locked
amoswap.w.aq a5, a5, (s1)

amoswap is an atomic instruction: in a single instruction it swaps register a5 with the memory at (s1), i.e. it sets lk->locked to 1, and that is how the lock is acquired.

__sync_synchronize() is likewise a compiler/CPU directive: it keeps load and store instructions from being moved across that line, preventing races caused by reordering optimizations.

The __sync_lock_release(&lk->locked) in release() expands to the following assembly:

s1 = &lk->locked
amoswap.w zero, zero, (s1)

The atomic amoswap instruction sets lk->locked back to 0, completing the unlock.

You may notice that acquire() uses holding() to check that this CPU does not already hold the lock, and release() checks that it does; violating either check causes a panic, which stops the operating system. (panic is similar to assert in ordinary programs: it prints an error message and halts.)

// Check whether this cpu is holding the lock.
// Interrupts must be off.
int
holding(struct spinlock *lk)
{
  int r;
  r = (lk->locked && lk->cpu == mycpu());
  return r;
}

mycpu(), used above, is defined in proc.c:

kernel/proc.c

// Must be called with interrupts disabled,
// to prevent race with process being moved
// to a different CPU.
int
cpuid()
{
  int id = r_tp();
  return id;
}

// Return this CPU's cpu struct.
// Interrupts must be disabled.
struct cpu*
mycpu(void) {
  int id = cpuid();
  struct cpu *c = &cpus[id];
  return c;
}

r_tp() is defined in riscv.h; it uses the inline assembly instruction mv %0, tp to read the tp (thread pointer) register, which holds the current hartid (core number).

kernel/riscv.h

// read and write tp, the thread pointer, which holds
// this core's hartid (core number), the index into cpus[].
static inline uint64
r_tp()
{
  uint64 x;
  asm volatile("mv %0, tp" : "=r" (x) );
  return x;
}

The RISC-V atomic instruction amoswap plays the key role in the whole locking mechanism; see the reference material for a detailed description of atomic instructions.

Its Figure 8.2 shows sample assembly for locking and unlocking with amoswap:

li t0, 1 # Initialize swap value.
again:
lw t1, (a0) # Check if lock is held.
bnez t1, again # Retry if held.
amoswap.w.aq t1, t0, (a0) # Attempt to acquire lock.
bnez t1, again # Retry if held.
# ...
# Critical section.
# ...
amoswap.w.rl x0, x0, (a0) # Release lock by storing 0.
Figure 8.2: Sample code for mutual exclusion. a0 contains the address of the lock.

Once acquire()/release() are in place, the xv6 kernel can use this locking mechanism to avoid race conditions and keep access to critical sections safe.
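As a usage sketch, a typical critical section in xv6 style looks like this (the stats structure and count_read() are made-up names for illustration, not xv6 code):

// Protect a shared counter with a spinlock.
struct {
  struct spinlock lock;
  int nread;
} stats;

void
count_read(void)
{
  acquire(&stats.lock);   // enter the critical section (interrupts pushed off)
  stats.nread += 1;       // safe: no other CPU can hold the lock here
  release(&stats.lock);   // leave the critical section
}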

xv6: The disk driver and the virtio protocol

This is how the operating system talks to (virtual) hardware at the lowest level, and it is closely tied to xv6's overall design.

References

  • virtio: a detailed introduction and the new features in 1.1 (Chinese article)
  • virtio: Towards a De-Facto Standard For Virtual I/O Devices (PDF), Rusty Russell, IBM OzLabs
  • What are QEMU, KVM, and QEMU-KVM?
    • KVM is the virtualization infrastructure built into the Linux kernel; it lets the kernel itself act as the hypervisor. KVM does no device emulation of its own -- it only exposes the /dev/kvm interface, which the host uses to create vCPUs, set up guest address spaces, read and write vCPU registers, and run vCPUs. In QEMU-KVM, KVM runs in kernel space while QEMU runs in user space, creating and managing the emulated hardware; together they form a complete server virtualization stack. QEMU-KVM thus does two things: 1. virtualize the CPU and memory (KVM) and the I/O devices (QEMU); 2. create and manage the various virtual devices (QEMU).

Overview

The operating system (here xv6) runs inside a hypervisor (here qemu). xv6's driver (virtio_disk.c) uses the virtio protocol to talk to the disk device that qemu emulates.

Driver and device form a producer/consumer pair: the driver produces requests by writing them into memory, and the device consumes them by reading that memory.

  1. Doorbell: the xv6 driver writes to a specific memory-mapped register to ask the qemu device to process a block request.
  2. Interrupt: when the qemu device finishes, it raises an interrupt so xv6 can pick up (consume) the result.

The producer/consumer pattern needs a circular queue; in virtio this is called the virtqueue, and its data structures are:

kernel/virtio.h

// a single descriptor, from the spec.
struct virtq_desc {
  uint64 addr;
  uint32 len;
  uint16 flags;
  uint16 next;
};
#define VRING_DESC_F_NEXT  1 // chained with another descriptor
#define VRING_DESC_F_WRITE 2 // device writes (vs read)

// the (entire) avail ring, from the spec.
struct virtq_avail {
  uint16 flags; // always zero
  uint16 idx;   // driver will write ring[idx] next
  uint16 ring[NUM]; // descriptor numbers of chain heads
  uint16 unused;
};

// one entry in the "used" ring, with which the
// device tells the driver about completed requests.
struct virtq_used_elem {
  uint32 id;   // index of start of completed descriptor chain
  uint32 len;
};

struct virtq_used {
  uint16 flags; // always zero
  uint16 idx;   // device increments when it adds a ring[] entry
  struct virtq_used_elem ring[NUM];
};

kernel/virtio_disk.c

static struct disk {
  // the virtio driver and device mostly communicate through a set of
  // structures in RAM. pages[] allocates that memory. pages[] is a
  // global (instead of calls to kalloc()) because it must consist of
  // two contiguous pages of page-aligned physical memory.
  char pages[2*PGSIZE];

  // pages[] is divided into three regions (descriptors, avail, and
  // used), as explained in Section 2.6 of the virtio specification
  // for the legacy interface.
  // https://docs.oasis-open.org/virtio/virtio/v1.1/virtio-v1.1.pdf
  
  // the first region of pages[] is a set (not a ring) of DMA
  // descriptors, with which the driver tells the device where to read
  // and write individual disk operations. there are NUM descriptors.
  // most commands consist of a "chain" (a linked list) of a couple of
  // these descriptors.
  // points into pages[].
  struct virtq_desc *desc;

  // next is a ring in which the driver writes descriptor numbers
  // that the driver would like the device to process.  it only
  // includes the head descriptor of each chain. the ring has
  // NUM elements.
  // points into pages[].
  struct virtq_avail *avail;

  // finally a ring in which the device writes descriptor numbers that
  // the device has finished processing (just the head of each chain).
  // there are NUM used ring entries.
  // points into pages[].
  struct virtq_used *used;

  // our own book-keeping.
  char free[NUM];  // is a descriptor free?
  uint16 used_idx; // we've looked this far in used[2..NUM].

  // track info about in-flight operations,
  // for use when completion interrupt arrives.
  // indexed by first descriptor index of chain.
  struct {
    struct buf *b;
    char status;
  } info[NUM];

  // disk command headers.
  // one-for-one with descriptors, for convenience.
  struct virtio_blk_req ops[NUM];
  
  struct spinlock vdisk_lock;
  
} __attribute__ ((aligned (PGSIZE))) disk;

How it works

  1. Initialization: xv6 allocates the virtqueue and tells qemu where it is.
  2. Interaction: virtio defines three transports -- PCI (general-purpose machines), MMIO (embedded systems), and Channel I/O (rare). The xv6 virtio code below uses the MMIO transport.

For the initialization steps, see Virtual I/O Device (VIRTIO) Version 1.1 -- Section 3.1 Device Initialization:


3.1 Device Initialization 3.1.1 Driver Requirements: Device Initialization

The driver MUST follow this sequence to initialize a device:

  1. Reset the device.
  2. Set the ACKNOWLEDGE status bit: the guest OS has noticed the device.
  3. Set the DRIVER status bit: the guest OS knows how to drive the device.
  4. Read device feature bits, and write the subset of feature bits understood by the OS and driver to the device. During this step the driver MAY read (but MUST NOT write) the device-specific configuration fields to check that it can support the device before accepting it.
  5. Set the FEATURES_OK status bit. The driver MUST NOT accept new feature bits after this step.
  6. Re-read device status to ensure the FEATURES_OK bit is still set: otherwise, the device does not support our subset of features and the device is unusable.
  7. Perform device-specific setup, including discovery of virtqueues for the device, optional per-bus setup, reading and possibly writing the device’s virtio configuration space, and population of virtqueues.
  8. Set the DRIVER_OK status bit. At this point the device is “live”.

If any of these steps go irrecoverably wrong, the driver SHOULD set the FAILED status bit to indicate that it has given up on the device (it can reset the device later to restart if desired). The driver MUST NOT continue initialization in that case.

The driver MUST NOT send any buffer available notifications to the device before setting DRIVER_OK.


xv6 maps the virtio MMIO region at address 0x10001000:

kernel/memlayout.h

// 00001000 -- boot ROM, provided by qemu
// 02000000 -- CLINT (Core Local Interruptor: local software/timer interrupts)
// 0C000000 -- PLIC (Platform-Level Interrupt Controller: external interrupts)
// 10000000 -- uart0 (shows up as the console/terminal on the host)
// 10001000 -- virtio disk (the virtio MMIO region)
// 80000000 -- boot ROM jumps here in machine mode
//             -kernel loads the kernel here
// unused RAM after 80000000.

kernel/virtio_disk.c

// the address of virtio mmio register r.
#define R(r) ((volatile uint32 *)(VIRTIO0 + (r)))
// Reference: [Virtual I/O Device (VIRTIO) Version 1.1 -- 4.2.2 MMIO Device Register Layout](https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-1460002)
void
virtio_disk_init(void) // initialize this module
{
  uint32 status = 0;

  initlock(&disk.vdisk_lock, "virtio_disk");

  // check that an MMIO virtio disk device is present
  if(*R(VIRTIO_MMIO_MAGIC_VALUE) != 0x74726976 ||
     *R(VIRTIO_MMIO_VERSION) != 1 ||
     *R(VIRTIO_MMIO_DEVICE_ID) != 2 ||
     *R(VIRTIO_MMIO_VENDOR_ID) != 0x554d4551){
    panic("could not find virtio disk");
  }
  
  // set the VIRTIO_MMIO status bits
  status |= VIRTIO_CONFIG_S_ACKNOWLEDGE;
  *R(VIRTIO_MMIO_STATUS) = status;

  status |= VIRTIO_CONFIG_S_DRIVER;
  *R(VIRTIO_MMIO_STATUS) = status;

  // negotiate features: clear the feature bits we do not support
  uint64 features = *R(VIRTIO_MMIO_DEVICE_FEATURES);
  features &= ~(1 << VIRTIO_BLK_F_RO);
  features &= ~(1 << VIRTIO_BLK_F_SCSI);
  features &= ~(1 << VIRTIO_BLK_F_CONFIG_WCE);
  features &= ~(1 << VIRTIO_BLK_F_MQ);
  features &= ~(1 << VIRTIO_F_ANY_LAYOUT);
  features &= ~(1 << VIRTIO_RING_F_EVENT_IDX);
  features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC);
  *R(VIRTIO_MMIO_DRIVER_FEATURES) = features;

  // tell device that feature negotiation is complete.
  status |= VIRTIO_CONFIG_S_FEATURES_OK; // features are settled
  *R(VIRTIO_MMIO_STATUS) = status;

  // tell device we're completely ready.
  status |= VIRTIO_CONFIG_S_DRIVER_OK;
  *R(VIRTIO_MMIO_STATUS) = status;

  *R(VIRTIO_MMIO_GUEST_PAGE_SIZE) = PGSIZE;

  // initialize queue 0.
  *R(VIRTIO_MMIO_QUEUE_SEL) = 0;
  uint32 max = *R(VIRTIO_MMIO_QUEUE_NUM_MAX);
  if(max == 0)
    panic("virtio disk has no queue 0");
  if(max < NUM)
    panic("virtio disk max queue too short");
  *R(VIRTIO_MMIO_QUEUE_NUM) = NUM;
  memset(disk.pages, 0, sizeof(disk.pages));
  *R(VIRTIO_MMIO_QUEUE_PFN) = ((uint64)disk.pages) >> PGSHIFT; // PFN: Guest physical page number of the virtual queue

  // desc = pages -- num * virtq_desc
  // avail = pages + 0x40 -- 2 * uint16, then num * uint16
  // used = pages + 4096 -- 2 * uint16, then num * vRingUsedElem

  disk.desc = (struct virtq_desc *) disk.pages;
  disk.avail = (struct virtq_avail *)(disk.pages + NUM*sizeof(struct virtq_desc));
  disk.used = (struct virtq_used *) (disk.pages + PGSIZE);

  // all NUM descriptors start out unused (free).
  for(int i = 0; i < NUM; i++)
    disk.free[i] = 1;

  // plic.c and trap.c arrange for interrupts from VIRTIO0_IRQ.
}

The required alignment of disk.desc, disk.avail, and disk.used is shown in the figure below:

(figure: virtqueue descriptor / avail / used ring layout and alignment)

Before a read or write, the driver allocates three descriptors (they need not be contiguous) and then calls virtio_disk_rw():

void
virtio_disk_rw(struct buf *b, int write) // issue a virtio disk read or write request
{
  uint64 sector = b->blockno * (BSIZE / 512);

  acquire(&disk.vdisk_lock);

  // the spec's Section 5.2 says that legacy block operations use
  // three descriptors: one for type/reserved/sector, one for the
  // data, one for a 1-byte status result.

  // allocate the three descriptors, retrying until it succeeds.
  int idx[3];
  while(1){
    if(alloc3_desc(idx) == 0) {
      break;
    }
    sleep(&disk.free[0], &disk.vdisk_lock);
  }

  // format the three descriptors.
  // qemu's virtio-blk.c reads them.
  // see -- https://github.com/qemu/qemu/blob/master/hw/block/virtio-blk.c

  struct virtio_blk_req *buf0 = &disk.ops[idx[0]]; // the request header for this operation

  if(write) // set the request type according to the write flag
    buf0->type = VIRTIO_BLK_T_OUT; // write the disk
  else
    buf0->type = VIRTIO_BLK_T_IN; // read the disk
  buf0->reserved = 0;
  buf0->sector = sector; // sector number to read or write

  // descriptor 0: the request header
  disk.desc[idx[0]].addr = (uint64) buf0;
  disk.desc[idx[0]].len = sizeof(struct virtio_blk_req);
  disk.desc[idx[0]].flags = VRING_DESC_F_NEXT;
  disk.desc[idx[0]].next = idx[1];

  // descriptor 1: the data buffer
  disk.desc[idx[1]].addr = (uint64) b->data;
  disk.desc[idx[1]].len = BSIZE;
  if(write)
    disk.desc[idx[1]].flags = 0; // device reads b->data
  else
    disk.desc[idx[1]].flags = VRING_DESC_F_WRITE; // device writes b->data
  disk.desc[idx[1]].flags |= VRING_DESC_F_NEXT;
  disk.desc[idx[1]].next = idx[2];

  // descriptor 2: the one-byte status result
  disk.info[idx[0]].status = 0xff; // device writes 0 on success
  disk.desc[idx[2]].addr = (uint64) &disk.info[idx[0]].status;
  disk.desc[idx[2]].len = 1;
  disk.desc[idx[2]].flags = VRING_DESC_F_WRITE; // device writes the status
  disk.desc[idx[2]].next = 0;

  // record struct buf for virtio_disk_intr().
  b->disk = 1;
  disk.info[idx[0]].b = b;

  // tell the device the first index in our chain of descriptors.
  disk.avail->ring[disk.avail->idx % NUM] = idx[0];

  __sync_synchronize();

  // tell the device another avail ring entry is available.
  disk.avail->idx += 1; // not % NUM ...

  __sync_synchronize();

  *R(VIRTIO_MMIO_QUEUE_NOTIFY) = 0; // value is queue number

  // Wait for virtio_disk_intr() to say request has finished.
  while(b->disk == 1) { // b->disk == 1 means the device is still working on buf
    sleep(b, &disk.vdisk_lock);
  }

  // done: release the descriptor chain that starts at idx[0]
  disk.info[idx[0]].b = 0;
  free_chain(idx[0]);

  release(&disk.vdisk_lock);
}

qemu then performs the actual read or write; when it finishes, it sends an interrupt to xv6, and xv6 runs virtio_disk_intr():

void
virtio_disk_intr() // after virtio_disk_rw() issues a request, qemu raises this interrupt when it completes
{
  acquire(&disk.vdisk_lock);

  // the device won't raise another interrupt until we tell it
  // we've seen this interrupt, which the following line does.
  // this may race with the device writing new entries to
  // the "used" ring, in which case we may process the new
  // completion entries in this interrupt, and have nothing to do
  // in the next interrupt, which is harmless.
  *R(VIRTIO_MMIO_INTERRUPT_ACK) = *R(VIRTIO_MMIO_INTERRUPT_STATUS) & 0x3;

  __sync_synchronize();

  // the device increments disk.used->idx when it
  // adds an entry to the used ring.

  while(disk.used_idx != disk.used->idx){
    __sync_synchronize();
    int id = disk.used->ring[disk.used_idx % NUM].id;

    if(disk.info[id].status != 0)
      panic("virtio_disk_intr status");

    struct buf *b = disk.info[id].b;
    b->disk = 0;   // disk is done with buf
    wakeup(b); // I/O done: wake the process waiting on this buffer (make it runnable again)

    disk.used_idx += 1;
  }

  release(&disk.vdisk_lock);
}

In this way, xv6 and QEMU cooperate through the virtio protocol to complete disk reads and writes.

Of course, this is only xv6's lowest-level disk driver; the full xv6 file system consists of the seven layers described in the file system article above.

xv6: Page tables and the MMU

Each process has its own page table, so an executable can be loaded into a complete, independent (virtual) address space; this removes most of the relocation work that would otherwise be needed. That is the first benefit of page tables.

Page tables also make loading more efficient: the executable does not have to be loaded entirely up front, but can be brought in page by page as it runs. That is the second benefit.

Finally, when a program is duplicated with fork(), the page table is copied but the underlying physical pages can in principle be shared (as in copy-on-write), which improves memory efficiency; that is the third benefit. (Stock xv6's fork() actually copies the pages, but the paging hardware is what makes such sharing possible.)

riscv.h

xv6's page table data structure is defined in riscv.h and is called pagetable_t.

The RISC-V page table structure is shown below; the top 25 bits (EXT) of the virtual address are reserved for future extensions and are not used yet.

(figure: Sv39 virtual address and page table entry layout)

kernel/riscv.h

Note that the page-table related macros and types below all carry a P prefix (PGSIZE, PTE_*, PA2PTE, PX, ...).

#define PGSIZE 4096 // bytes per page
#define PGSHIFT 12  // bits of offset within a page

#define PGROUNDUP(sz)  (((sz)+PGSIZE-1) & ~(PGSIZE-1))
#define PGROUNDDOWN(a) (((a)) & ~(PGSIZE-1)) // round down to a page boundary (clear the low 12 offset bits)

#define PTE_V (1L << 0) // valid
#define PTE_R (1L << 1) // readable
#define PTE_W (1L << 2) // writable
#define PTE_X (1L << 3) // executable
#define PTE_U (1L << 4) // 1 -> user can access

// shift a physical address to the right place for a PTE: since each page is 4096 bytes,
// the low 12 offset bits are dropped and the physical page number goes into bits 10 and up.
#define PA2PTE(pa) ((((uint64)pa) >> 12) << 10)

#define PTE2PA(pte) (((pte) >> 10) << 12)

#define PTE_FLAGS(pte) ((pte) & 0x3FF)

// extract the three 9-bit page table indices from a virtual address.
#define PXMASK          0x1FF // 9 bits
#define PXSHIFT(level)  (PGSHIFT+(9*(level)))
#define PX(level, va) ((((uint64) (va)) >> PXSHIFT(level)) & PXMASK)

// one beyond the highest possible virtual address.
// MAXVA is actually one bit less than the max allowed by
// Sv39, to avoid having to sign-extend virtual addresses
// that have the high bit set.
#define MAXVA (1L << (9 + 9 + 9 + 12 - 1))

typedef uint64 pte_t;
typedef uint64 *pagetable_t; // 512 PTEs

pagetable_t is a pointer to (an array of) 64-bit unsigned integers (uint64). Each entry is called a PTE (Page Table Entry), the pte_t above.

Each page is PGSIZE = 4096 bytes, so when a physical address (PA) is packed into a page table entry (PTE), the low 12 offset bits are shifted away and the physical page number is shifted left into the PTE's field (bits 10 and up):

#define PA2PTE(pa) ((((uint64)pa) >> 12) << 10)
#define PTE2PA(pte) (((pte) >> 10) << 12)

Every page can be marked with the attributes V (valid), R (readable), W (writable), X (executable), and U (user-accessible); these flag bits live in the low 10 bits of the PTE.

#define PTE_FLAGS(pte) ((pte) & 0x3FF)
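A small worked example with a made-up physical address (not taken from xv6) shows how these macros fit together:

// Worked example: pa = 0x80001000 -> physical page number 0x80001.
uint64 pa    = 0x80001000L;
pte_t  pte   = PA2PTE(pa) | PTE_R | PTE_W | PTE_V;  // 0x20000400 | 0x7
uint64 back  = PTE2PA(pte);                         // recovers 0x80001000
uint64 flags = PTE_FLAGS(pte);                      // 0x7 = PTE_V|PTE_R|PTE_W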

The xv6 kernel's memory layout is recorded in memlayout.h:

kernel/memlayout.h

// Physical memory layout

// qemu -machine virt is set up like this,
// based on qemu's hw/riscv/virt.c:
//
// 00001000 -- boot ROM, provided by qemu
// 02000000 -- CLINT
// 0C000000 -- PLIC
// 10000000 -- uart0 
// 10001000 -- virtio disk 
// 80000000 -- boot ROM jumps here in machine mode
//             -kernel loads the kernel here
// unused RAM after 80000000.

// the kernel uses physical memory thus:
// 80000000 -- entry.S, then kernel text and data
// end -- start of kernel page allocation area
// PHYSTOP -- end RAM used by the kernel

// qemu puts UART registers here in physical memory.
#define UART0 0x10000000L
#define UART0_IRQ 10

// virtio mmio interface
#define VIRTIO0 0x10001000
#define VIRTIO0_IRQ 1

// core local interruptor (CLINT), which contains the timer.
#define CLINT 0x2000000L
#define CLINT_MTIMECMP(hartid) (CLINT + 0x4000 + 8*(hartid))
#define CLINT_MTIME (CLINT + 0xBFF8) // cycles since boot.

// qemu puts platform-level interrupt controller (PLIC) here.
#define PLIC 0x0c000000L
#define PLIC_PRIORITY (PLIC + 0x0)
#define PLIC_PENDING (PLIC + 0x1000)
#define PLIC_MENABLE(hart) (PLIC + 0x2000 + (hart)*0x100)
#define PLIC_SENABLE(hart) (PLIC + 0x2080 + (hart)*0x100)
#define PLIC_MPRIORITY(hart) (PLIC + 0x200000 + (hart)*0x2000)
#define PLIC_SPRIORITY(hart) (PLIC + 0x201000 + (hart)*0x2000)
#define PLIC_MCLAIM(hart) (PLIC + 0x200004 + (hart)*0x2000)
#define PLIC_SCLAIM(hart) (PLIC + 0x201004 + (hart)*0x2000)

// the kernel expects there to be RAM
// for use by the kernel and user pages
// from physical address 0x80000000 to PHYSTOP.
#define KERNBASE 0x80000000L
#define PHYSTOP (KERNBASE + 128*1024*1024)

// map the trampoline page to the highest address,
// in both user and kernel space.
#define TRAMPOLINE (MAXVA - PGSIZE)

// map kernel stacks beneath the trampoline,
// each surrounded by invalid guard pages.
#define KSTACK(p) (TRAMPOLINE - ((p)+1)* 2*PGSIZE)

// User memory layout.
// Address zero first:
//   text
//   original data and bss
//   fixed-size stack
//   expandable heap
//   ...
//   TRAPFRAME (p->trapframe, used by the trampoline)
//   TRAMPOLINE (the same page as in the kernel)
#define TRAPFRAME (TRAMPOLINE - PGSIZE)
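
為了更具體地感受這份配置,下面這段小程式(筆者補充、可在宿主機編譯的假設性範例)把幾個關鍵位址實際算出來:

// layout_demo.c -- 在宿主機上計算 xv6 記憶體配置的幾個關鍵位址(筆者補充的假設性範例)
#include <stdio.h>

#define PGSIZE     4096L
#define MAXVA      (1L << (9 + 9 + 9 + 12 - 1))       // 0x4000000000
#define KERNBASE   0x80000000L
#define PHYSTOP    (KERNBASE + 128*1024*1024)         // 核心可用 RAM 的上限
#define TRAMPOLINE (MAXVA - PGSIZE)                   // 虛擬位址空間最高的一頁
#define TRAPFRAME  (TRAMPOLINE - PGSIZE)              // 使用者空間中 trapframe 的位置
#define KSTACK(p)  (TRAMPOLINE - ((p)+1)* 2*PGSIZE)   // 各行程核心堆疊(含保護頁)

int main(void) {
  printf("MAXVA      = %lx\n", MAXVA);        // 0x4000000000
  printf("TRAMPOLINE = %lx\n", TRAMPOLINE);   // 0x3ffffff000
  printf("TRAPFRAME  = %lx\n", TRAPFRAME);    // 0x3ffffffe000 的前一頁,即 0x3fffffe000
  printf("PHYSTOP    = %lx\n", PHYSTOP);      // 0x88000000
  printf("KSTACK(0)  = %lx\n", KSTACK(0));    // 0x3fffffd000
  printf("KSTACK(1)  = %lx\n", KSTACK(1));    // 0x3fffffb000
  return 0;
}

可以看到 TRAMPOLINE 佔據虛擬位址空間的最高一頁,TRAPFRAME 與各行程的核心堆疊 KSTACK(p) 則依序排在它下面。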

因此我們可以用 C 語言的記憶體映射方式去存取這些內容,例如 uart.c 裏的程式碼,其中的 WriteReg(reg, v) 定義為 (*(Reg(reg)) = (v)),所以 WriteReg(THR, c) 展開後就是 *(volatile unsigned char *)(UART0+THR) = c。

這個指令把字元 c 寫到 UART0+THR (也就是 0x10000000L+0) 這個記憶體映射位址,因而透過 UART 傳送一個字元給宿主機顯示。

下面是 uart.c 中相關的巨集定義與 uartputc_sync() 函數。

#define Reg(reg) ((volatile unsigned char *)(UART0 + reg))
//...
#define RHR 0                 // receive holding register (for input bytes)
#define THR 0                 // transmit holding register (for output bytes)
//...
#define ReadReg(reg) (*(Reg(reg)))
#define WriteReg(reg, v) (*(Reg(reg)) = (v))
//...
void
uartputc_sync(int c)
{
  push_off();

  if(panicked){
    for(;;)
      ;
  }

  // wait for Transmit Holding Empty to be set in LSR.
  while((ReadReg(LSR) & LSR_TX_IDLE) == 0)
    ;
  WriteReg(THR, c);

  pop_off();
}

上述的記憶體配置,在 vm.c 的 kvmmake() 函數落實為程式。

kernel/vm.c

/*
 * the kernel's page table.
 */
pagetable_t kernel_pagetable;

extern char etext[];  // kernel.ld sets this to end of kernel code.

extern char trampoline[]; // trampoline.S

// Make a direct-map page table for the kernel.
pagetable_t
kvmmake(void)
{
  pagetable_t kpgtbl;

  kpgtbl = (pagetable_t) kalloc();
  memset(kpgtbl, 0, PGSIZE);

  // uart registers
  kvmmap(kpgtbl, UART0, UART0, PGSIZE, PTE_R | PTE_W);

  // virtio mmio disk interface
  kvmmap(kpgtbl, VIRTIO0, VIRTIO0, PGSIZE, PTE_R | PTE_W);

  // PLIC
  kvmmap(kpgtbl, PLIC, PLIC, 0x400000, PTE_R | PTE_W);

  // map kernel text executable and read-only.
  kvmmap(kpgtbl, KERNBASE, KERNBASE, (uint64)etext-KERNBASE, PTE_R | PTE_X);

  // map kernel data and the physical RAM we'll make use of.
  kvmmap(kpgtbl, (uint64)etext, (uint64)etext, PHYSTOP-(uint64)etext, PTE_R | PTE_W);

  // map the trampoline for trap entry/exit to
  // the highest virtual address in the kernel.
  kvmmap(kpgtbl, TRAMPOLINE, (uint64)trampoline, PGSIZE, PTE_R | PTE_X);

  // map kernel stacks
  proc_mapstacks(kpgtbl);
  
  return kpgtbl;
}

// Initialize the one kernel_pagetable
void
kvminit(void)
{
  kernel_pagetable = kvmmake();
}
// ...
// add a mapping to the kernel page table.
// only used when booting.
// does not flush TLB or enable paging.
void
kvmmap(pagetable_t kpgtbl, uint64 va, uint64 pa, uint64 sz, int perm)
{
  if(mappages(kpgtbl, va, sz, pa, perm) != 0)
    panic("kvmmap");
}
// Create PTEs for virtual addresses starting at va that refer to
// physical addresses starting at pa. va and size might not
// be page-aligned. Returns 0 on success, -1 if walk() couldn't
// allocate a needed page-table page.
int
mappages(pagetable_t pagetable, uint64 va, uint64 size, uint64 pa, int perm)
{
  uint64 a, last;
  pte_t *pte;
  // 映射範圍:從 va 到 va+size
  a = PGROUNDDOWN(va); // 第一頁的起始位址 (向下對齊到頁界)
  last = PGROUNDDOWN(va + size - 1); // 最後一頁的起始位址
  for(;;){
    if((pte = walk(pagetable, a, 1)) == 0) // 找出頁 a 對應的 pte,若不存在則創造一個可用空頁 (因 alloc=1)
      return -1;
    if(*pte & PTE_V)
      panic("remap");
    *pte = PA2PTE(pa) | perm | PTE_V;
    if(a == last) // 如果已經到了最後一頁,則完成並離開
      break;
    a += PGSIZE;
    pa += PGSIZE;
  }
  return 0;
}
// ...
// Return the address of the PTE in page table pagetable
// that corresponds to virtual address va.  If alloc!=0,
// create any required page-table pages.
//
// The risc-v Sv39 scheme has three levels of page-table
// pages. A page-table page contains 512 64-bit PTEs.
// A 64-bit virtual address is split into five fields:
//   39..63 -- must be zero.
//   30..38 -- 9 bits of level-2 index.
//   21..29 -- 9 bits of level-1 index.
//   12..20 -- 9 bits of level-0 index.
//    0..11 -- 12 bits of byte offset within the page.
pte_t *
walk(pagetable_t pagetable, uint64 va, int alloc)
{
  if(va >= MAXVA)
    panic("walk");

  for(int level = 2; level > 0; level--) { // 逐級下降頁表 (共三級)
    pte_t *pte = &pagetable[PX(level, va)]; // 取得 va 在本級頁表中對應的頁表項
    if(*pte & PTE_V) { // 若該項有效,則沿著它下降到下一級頁表。
      pagetable = (pagetable_t)PTE2PA(*pte);
    } else { // 否則,分配一個新的頁表頁
      if(!alloc || (pagetable = (pde_t*)kalloc()) == 0) // 不分配或分配失敗
        return 0;
      memset(pagetable, 0, PGSIZE); // 將頁表清為 0
      *pte = PA2PTE(pagetable) | PTE_V; // 把新頁表頁的位址填入本級的 PTE 並設為有效
    }
  }
  return &pagetable[PX(0, va)]; // 傳回第 0 級頁表中對應於 va 的那個 PTE 的位址
}

從以上程式中,您可以發現頁表為三級結構如下圖所示:

(圖:Sv39 三級頁表逐級查找示意圖)

其中每一級都用 9 個位元去索引一個含有 512 個 PTE 的分頁表頁(每個 PTE 為 64 位元,512×8 bytes 剛好是一頁 4096 bytes)。
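
下面用一段小程式(筆者補充的假設性範例)手動把幾個虛擬位址拆成三級索引,對照 walk() 每一輪用到的 PX(level, va):

// walk_index_demo.c -- 手動把虛擬位址拆成三級索引(筆者補充的假設性範例)
#include <stdio.h>

#define PGSHIFT 12
#define PXMASK  0x1FF
#define PXSHIFT(level) (PGSHIFT + 9*(level))
#define PX(level, va)  ((((unsigned long)(va)) >> PXSHIFT(level)) & PXMASK)

static void show(const char *name, unsigned long va) {
  printf("%-12s va=%010lx -> L2=%3lu L1=%3lu L0=%3lu offset=%03lx\n",
         name, va, PX(2, va), PX(1, va), PX(0, va), va & 0xFFF);
}

int main(void) {
  show("va 0",       0x0UL);          // initcode 被映射到的位址:索引 (0, 0, 0)
  show("UART0",      0x10000000UL);   // 核心直接映射的裝置:索引 (0, 128, 0)
  show("TRAMPOLINE", 0x3FFFFFF000UL); // 最高的一頁:索引 (255, 511, 511)
  return 0;
}

walk() 每下降一級就用其中一個索引去查下一級頁表,最後傳回第 0 級那一格 PTE 的位址。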

然而,上述的 kvmmake() 只是配置核心 kernel 的分頁表,使用者行程的分頁表,則是在 vm.c 裏的那些 uvm...() 的函數所負責的。

第一個被載入的使用者行程是 init ,在前面的文章中我們看過,有個很短的 initcode 機器碼被用來載入 init 行程,以下重複列出這些碼。

kernel/proc.c

// Set up first user process.
void
userinit(void) 
{
  struct proc *p;

  p = allocproc();
  initproc = p;
  
  // allocate one user page and copy init's instructions
  // and data into it.
  uvminit(p->pagetable, initcode, sizeof(initcode));
  p->sz = PGSIZE;

  // prepare for the very first "return" from kernel to user.
  p->trapframe->epc = 0;      // user program counter
  p->trapframe->sp = PGSIZE;  // user stack pointer

  safestrcpy(p->name, "initcode", sizeof(p->name));
  p->cwd = namei("/");

  p->state = RUNNABLE;

  release(&p->lock);
}

而其中的 initcode 這個變數,則是一段奇特的十六進位碼如下:

// a user program that calls exec("/init")
// od -t xC initcode
uchar initcode[] = {
  0x17, 0x05, 0x00, 0x00, 0x13, 0x05, 0x45, 0x02,
  0x97, 0x05, 0x00, 0x00, 0x93, 0x85, 0x35, 0x02,
  0x93, 0x08, 0x70, 0x00, 0x73, 0x00, 0x00, 0x00,
  0x93, 0x08, 0x20, 0x00, 0x73, 0x00, 0x00, 0x00,
  0xef, 0xf0, 0x9f, 0xff, 0x2f, 0x69, 0x6e, 0x69,
  0x74, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00
};

上面那堆 uchar initcode[] = { 0x17, 0x05, 0x00 ... 到底是甚麼呢?

其實就是下列組合語言的機器碼,被直接寫在 initcode[] 陣列裡面了。

user/initcode.S

# Initial process that execs /init.
# This code runs in user space.

#include "syscall.h"

# exec(init, argv)
.globl start
start:
        la a0, init
        la a1, argv
        li a7, SYS_exec
        ecall

# for(;;) exit();
exit:
        li a7, SYS_exit
        ecall
        jal exit

# char init[] = "/init\0";
init:
  .string "/init\0"

# char *argv[] = { init, 0 };
.p2align 2
argv:
  .long init
  .long 0
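
如果把 initcode[] 開頭幾個 32 位元字逐一反組譯(以下是筆者手動對照的結果,僅供參考),可以看出它們正是上面這段組合語言:

0x00000517   auipc a0, 0x0      # la a0, init(上半)
0x02450513   addi  a0, a0, 36   # la a0, init(下半):init 位於第 36 byte 處
0x00000597   auipc a1, 0x0      # la a1, argv(上半)
0x02358593   addi  a1, a1, 35   # la a1, argv(下半)
0x00700893   li    a7, 7        # SYS_exec = 7
0x00000073   ecall
0x00200893   li    a7, 2        # SYS_exit = 2
0x00000073   ecall
0xff9ff0ef   jal   exit         # 跳回 exit,形成無窮迴圈

而陣列第 36 byte 起的 0x2f 0x69 0x6e 0x69 0x74 0x00 正是字串 "/init",後面出現的 0x24 (= 36) 則是 argv 中存放的 init 位址。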

然後回到 vm.c 裏,你可以看到 uvminit() 分配了一頁實體記憶體,把 initcode 複製進去,並映射到使用者分頁表的虛擬位址 0;這頁程式執行起來之後,就會呼叫 exec("/init") 把 init 程式載入記憶體。

// Load the user initcode into address 0 of pagetable,
// for the very first process.
// sz must be less than a page.
void
uvminit(pagetable_t pagetable, uchar *src, uint sz)  // 先把 initcode 複製進映射好的分頁,行程才有程式可以執行
{
  char *mem;

  if(sz >= PGSIZE)
    panic("inituvm: more than a page");
  mem = kalloc();
  memset(mem, 0, PGSIZE);
  mappages(pagetable, 0, PGSIZE, (uint64)mem, PTE_W|PTE_R|PTE_X|PTE_U);
  memmove(mem, src, sz);
}
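
若想確認這個映射真的建立起來了,概念上可以借用 vm.c 裡現成的 walkaddr() 來查詢(以下只是筆者示意的假設性片段,xv6 原始碼中並沒有這段檢查):

// 示意:uvminit() 之後,使用者虛擬位址 0 應該已映射到 kalloc() 配到的那一頁
// (此片段僅為說明,並非 xv6 原始碼)
uint64 pa = walkaddr(p->pagetable, 0); // 查 va=0 對應的實體位址,查不到會傳回 0
if(pa == 0)
  panic("initcode not mapped");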

由於執行檔為 elf 格式,而且是在 fork() 之後用 exec() 函數去載入置換掉的,因此讓我們重複看一下 exec() 這個函數。

您會發現 exec() 先執行了 proc_pagetable(p) 這個函數來分配頁表,然後就對每個分段都呼叫 uvmalloc(pagetable, sz, ph.vaddr + ph.memsz)) 去分配該段的頁空間。

int
exec(char *path, char **argv)
{
  char *s, *last;
  int i, off;
  uint64 argc, sz = 0, sp, ustack[MAXARG+1], stackbase;
  struct elfhdr elf;
  struct inode *ip;
  struct proghdr ph;
  pagetable_t pagetable = 0, oldpagetable;
  struct proc *p = myproc();

  begin_op();

  if((ip = namei(path)) == 0){ // 取得 path ELF 檔對應的 inode ptr (ip)
    end_op();
    return -1;
  }
  ilock(ip);

  // Check ELF header
  if(readi(ip, 0, (uint64)&elf, 0, sizeof(elf)) != sizeof(elf)) // 讀取該 inode
    goto bad;
  if(elf.magic != ELF_MAGIC) // 若不是 ELF 則失敗
    goto bad;

  if((pagetable = proc_pagetable(p)) == 0) // 分配頁表
    goto bad;

  // Load program into memory.
  for(i=0, off=elf.phoff; i<elf.phnum; i++, off+=sizeof(ph)){
    if(readi(ip, 0, (uint64)&ph, off, sizeof(ph)) != sizeof(ph))
      goto bad;
    if(ph.type != ELF_PROG_LOAD)
      continue;
    if(ph.memsz < ph.filesz)
      goto bad;
    if(ph.vaddr + ph.memsz < ph.vaddr)
      goto bad;
    uint64 sz1;
    if((sz1 = uvmalloc(pagetable, sz, ph.vaddr + ph.memsz)) == 0) // 為每個ELF段分配記憶體
      goto bad;
    sz = sz1;
    if(ph.vaddr % PGSIZE != 0)
      goto bad;
    if(loadseg(pagetable, ph.vaddr, ip, ph.off, ph.filesz) < 0) // 把每個段加載到記憶體中 (loadseg用walkaddr找到分配記憶體的物理地址,在這個地址上寫入ELF段的每一頁,用readi從文件中讀取)
      goto bad; 
  }
  iunlockput(ip);
  end_op();
  ip = 0;

  p = myproc();
  uint64 oldsz = p->sz;

  // Allocate two pages at the next page boundary. 為何分配兩頁?第二個是堆疊,那第一個幹嘛用?
  // Use the second as the user stack.  答:第一個是不可訪問頁,當堆疊溢位時會觸發錯誤中斷。
  sz = PGROUNDUP(sz);
  uint64 sz1;
  if((sz1 = uvmalloc(pagetable, sz, sz + 2*PGSIZE)) == 0)
    goto bad;
  sz = sz1;
  uvmclear(pagetable, sz-2*PGSIZE);
  sp = sz;
  stackbase = sp - PGSIZE;

  // Push argument strings, prepare rest of stack in ustack. 在堆疊中推入 argv 字串
  for(argc = 0; argv[argc]; argc++) {
    if(argc >= MAXARG)
      goto bad;
    sp -= strlen(argv[argc]) + 1;
    sp -= sp % 16; // riscv sp must be 16-byte aligned
    if(sp < stackbase)
      goto bad;
    if(copyout(pagetable, sp, argv[argc], strlen(argv[argc]) + 1) < 0) // 複製失敗就離開
      goto bad;
    ustack[argc] = sp;
  }
  ustack[argc] = 0;

  // push the array of argv[] pointers. 推入 argv 的指標
  sp -= (argc+1) * sizeof(uint64);
  sp -= sp % 16;
  if(sp < stackbase)
    goto bad;
  if(copyout(pagetable, sp, (char *)ustack, (argc+1)*sizeof(uint64)) < 0)
    goto bad;

  // arguments to user main(argc, argv)
  // argc is returned via the system call return
  // value, which goes in a0.
  p->trapframe->a1 = sp; // 設定 a1=argv

  // Save program name for debugging.
  for(last=s=path; *s; s++)
    if(*s == '/')
      last = s+1;
  safestrcpy(p->name, last, sizeof(p->name));
    
  // Commit to the user image.
  oldpagetable = p->pagetable; // 註:oldpagetable 指向舊的使用者記憶體影像(例如 fork 時複製的那份),換上新頁表後就可以釋放它。
  p->pagetable = pagetable;
  p->sz = sz;
  p->trapframe->epc = elf.entry;  // initial program counter = main (進入點為 main)
  p->trapframe->sp = sp; // initial stack pointer
  proc_freepagetable(oldpagetable, oldsz);

  return argc; // this ends up in a0, the first argument to main(argc, argv)

 bad:
  if(pagetable)
    proc_freepagetable(pagetable, sz);
  if(ip){
    iunlockput(ip);
    end_op();
  }
  return -1;
}

最後,每個行程都需要的堆疊段,則是在 uvmalloc(pagetable, sz, sz + 2*PGSIZE) 這個指令分配的。
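
以 exec("/echo", {"echo", "hi", 0}) 為例(筆者自行推演的示意,位址省略,僅供理解),exec() 結束前使用者堆疊頁頂端附近的內容大致如下:

sz(堆疊頁頂端,sp 的起點)
  "echo\0"             ← ustack[0] 指向這裡(每次推入字串後 sp 都向下對齊 16)
  "hi\0"               ← ustack[1] 指向這裡
  ustack[2] = 0         (指標陣列的 NULL 結尾)
  ustack[1] = &"hi"
  ustack[0] = &"echo"  ← sp 最後停在這裡;a1 = sp,而 a0 = argc = 2
  ...(尚未使用的堆疊空間)...
stackbase = sz - PGSIZE
  (再往下的那一頁被 uvmclear() 設成不可存取的保護頁)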

上面程式中的 proc_pagetable() 呼叫,定義在 proc.c 裏

kernel/proc.c

// Create a user page table for a given process,
// with no user memory, but with trampoline pages.
pagetable_t
proc_pagetable(struct proc *p)
{
  pagetable_t pagetable;

  // An empty page table.
  pagetable = uvmcreate();
  if(pagetable == 0)
    return 0;

  // map the trampoline code (for system call return)
  // at the highest user virtual address.
  // only the supervisor uses it, on the way
  // to/from user space, so not PTE_U.
  if(mappages(pagetable, TRAMPOLINE, PGSIZE,
              (uint64)trampoline, PTE_R | PTE_X) < 0){
    uvmfree(pagetable, 0);
    return 0;
  }

  // map the trapframe just below TRAMPOLINE, for trampoline.S.
  if(mappages(pagetable, TRAPFRAME, PGSIZE,
              (uint64)(p->trapframe), PTE_R | PTE_W) < 0){
    uvmunmap(pagetable, TRAMPOLINE, 1, 0);
    uvmfree(pagetable, 0);
    return 0;
  }

  return pagetable;
}

其中透過 uvmcreate() 創建使用者行程的分頁表

// create an empty user page table.
// returns 0 if out of memory.
pagetable_t
uvmcreate()
{
  pagetable_t pagetable;
  pagetable = (pagetable_t) kalloc();
  if(pagetable == 0)
    return 0;
  memset(pagetable, 0, PGSIZE);
  return pagetable;
}

proc_pagetable() 程式後半部用 mappages(pagetable, TRAMPOLINE,... 去映射一段由 kernel 與 user process 共用的 TRAMPOLINE 區域,裡面放的是 trampoline.S 的程式碼,負責系統呼叫與中斷在使用者模式與核心模式之間的進出切換。

核心和使用者行程都把 TRAMPOLINE 映射到相同的最高虛擬位址區,這個 TRAMPOLINE 位址也是定義在 memlayout.h 當中

kernel/memlayout.h

// the kernel expects there to be RAM
// for use by the kernel and user pages
// from physical address 0x80000000 to PHYSTOP.
#define KERNBASE 0x80000000L
#define PHYSTOP (KERNBASE + 128*1024*1024)

// map the trampoline page to the highest address,
// in both user and kernel space.
#define TRAMPOLINE (MAXVA - PGSIZE)

其中的 MAXVA 則是定義在 riscv.h 中

// one beyond the highest possible virtual address.
// MAXVA is actually one bit less than the max allowed by
// Sv39, to avoid having to sign-extend virtual addresses
// that have the high bit set.
#define MAXVA (1L << (9 + 9 + 9 + 12 - 1))

Q