xv6 文件系统

最新推荐文章于 2025-10-20 20:58:03 发布

原创最新推荐文章于 2025-10-20 20:58:03 发布 · 870 阅读

23 ·

CC 4.0 BY-SA版权

文章标签：

#linux #risc-v

xv6 专栏收录该内容

6 篇文章

订阅专栏

Buffer Cache

buffer Cache 结构体

bcache 存放了 NBUF 个 buf 框，每个框对应 disk 上某一个 block。从初始化函数 binit中可以看出，bcache 是一个循环双向链表。通过双链表组织这些 buf，以近似 LRU 的策略管理，大概如下图。

struct buf {
  int valid;   // has data been read from disk?
  int disk;    // does disk "own" buf?
  uint dev;
  uint blockno;
  struct sleeplock lock;
  uint refcnt;
  struct buf *prev; // LRU cache list
  struct buf *next;
  uchar data[BSIZE];
};

struct {
  struct spinlock lock;
  struct buf buf[NBUF];

  // Linked list of all buffers, through prev/next.
  // Sorted by how recently the buffer was used.
  // head.next is most recent, head.prev is least.
  struct buf head;
} bcache;

void
binit(void)
{
  struct buf *b;

  initlock(&bcache.lock, "bcache");

  // Create linked list of buffers
  bcache.head.prev = &bcache.head;
  bcache.head.next = &bcache.head;
  for(b = bcache.buf; b < bcache.buf+NBUF; b++){
    b->next = bcache.head.next;
    b->prev = &bcache.head;
    initsleeplock(&b->lock, "buffer");
    bcache.head.next->prev = b;
    bcache.head.next = b;
  }
}

Buffer Cache 基本操作

只需知道 bcache 是结构是循环双向链表，那么这些操作就很容易明白，直接看代码即可。

// Look through buffer cache for block on device dev.
// If not found, allocate a buffer.
// In either case, return locked buffer.
static struct buf*
bget(uint dev, uint blockno)
{
  struct buf *b;

  acquire(&bcache.lock);

  // Is the block already cached?
  for(b = bcache.head.next; b != &bcache.head; b = b->next){
    if(b->dev == dev && b->blockno == blockno){
      b->refcnt++;
      release(&bcache.lock);
      acquiresleep(&b->lock);
      return b;
    }
  }

  // Not cached.
  // Recycle the least recently used (LRU) unused buffer.
  for(b = bcache.head.prev; b != &bcache.head; b = b->prev){
    if(b->refcnt == 0) {
      b->dev = dev;
      b->blockno = blockno;
      b->valid = 0;
      b->refcnt = 1;
      release(&bcache.lock);
      acquiresleep(&b->lock);
      return b;
    }
  }
  panic("bget: no buffers");
}

// Return a locked buf with the contents of the indicated block.
struct buf*
bread(uint dev, uint blockno)
{
  struct buf *b;

  b = bget(dev, blockno);
  if(!b->valid) {
    virtio_disk_rw(b, 0);
    b->valid = 1;
  }
  return b;
}

// Write b's contents to disk.  Must be locked.
void
bwrite(struct buf *b)
{
  if(!holdingsleep(&b->lock))
    panic("bwrite");
  virtio_disk_rw(b, 1);
}

// Release a locked buffer.
// Move to the head of the most-recently-used list.
void
brelse(struct buf *b)
{
  if(!holdingsleep(&b->lock))
    panic("brelse");

  releasesleep(&b->lock);

  acquire(&bcache.lock);
  b->refcnt--;
  if (b->refcnt == 0) {
    // no one is waiting for it.
    b->next->prev = b->prev;
    b->prev->next = b->next;
    b->next = bcache.head.next;
    b->prev = &bcache.head;
    bcache.head.next->prev = b;
    bcache.head.next = b;
  }
  
  release(&bcache.lock);
}

void
bpin(struct buf *b) {
  acquire(&bcache.lock);
  b->refcnt++;
  release(&bcache.lock);
}

void
bunpin(struct buf *b) {
  acquire(&bcache.lock);
  b->refcnt--;
  release(&bcache.lock);
}

日志

xv6 操作系统中的日志机制，目的是为了解决系统崩溃时的崩溃一致性问题，通过物理重做日志（redo log），在崩溃后恢复文件系统状态。

该日志系统遵守 Wirte Ahead Log 原则，即系统首先将所有写操作以及写的内容记录在 log 中，然后写入到文件的实际位置。

图片来自xv6——文件系统：磁盘的LOG日志机制 - 殷大侠 - 博客园

日志存储布局

xv6 的日志模块如上图DISK中的排布，主要分为两个部分：

日志头块：日志系统的第一块是一个特殊的块，称为日志头块。它包含了一个 logheader 结构，该结构存储了本次日志事务中所有块的块号列表。
日志数据块：日志头块后面的连续块用于存储实际的日志数据。这些块紧跟在日志头块之后，形成了一个物理的日志记录。日志数据块是一个被更新的块的拷贝。

// 日志头的结构，包含了一个事务中涉及的块的数量和一个块号数组
struct logheader {
  int n;      //  n 字段表示当前日志事务中所包含的块的数量。
  int block[LOGSIZE]; // 记录修改块的最终磁盘块号，如上图中的55、56、57
};

// 日志结构，包含了自旋锁、日志开始块号、日志大小、正在进行的文件系统系统调用的数量、是否正在提交的标志、设备号和日志头
struct log {
  struct spinlock lock;
  int start;       // 记录位于磁盘中的日志层起始块号
  int size;        // size 字段表示整个日志层（log layer）的大小
  int outstanding; // how many FS sys calls are executing.记录当前并发系统调用数目
  int committing;  // in commit(), please wait.记录是否当前处于提交状态
  int dev;
  struct logheader lh;
};
struct log log;

在上述结构体中，最不好理解的就是n、size 和 LOGSIZE 之间的关系。

在 xv6 操作系统的日志系统中，n、size 和 LOGSIZE 是三个关键的参数，它们之间的关系对于理解日志系统的工作原理至关重要。下面是这三个参数的定义和它们之间的关系：

log.lh.n
log.lh.n 是 struct logheader 中的一个字段，表示当前日志事务中已经记录的块的数量。每次有新的数据块被添加到日志中时，n 的值会增加。当事务提交时，这些块会被写入磁盘。
log.size
log.size 是 struct log 中的一个字段，它表示日志系统总共可以存储的块的数量。这个值通常在日志系统初始化时设置，并在整个日志系统生命周期中保持不变。它基于文件系统的超级块中的 nlog 字段确定。
LOGSIZE
LOGSIZE 是一个常量，定义了 struct logheader 中 block[] 数组的大小。这个数组用于存储当前日志事务中所有涉及块的块号。LOGSIZE 必须足够大，以容纳任何单个日志事务中可能涉及的所有块号。

它们之间的关系

log.lh.n 必须小于或等于 LOGSIZE。LOGSIZE 是 logheader 结构中 block[] 数组的大小，它限制了单个日志事务可以涉及的块的最大数量。
log.size 是日志系统的总容量，它应该大于或等于 LOGSIZE。log.size 定义了整个日志层可以存储的块的数量，而 LOGSIZE 是单个事务可以涉及的块的数量。在实际应用中，log.size 可能会比 LOGSIZE 大得多，因为日志层需要能够处理多个并发事务。
事务大小限制：一个日志事务的大小不能超过 LOGSIZE，因为这是 logheader 中 block[] 数组可以存储的块号的最大数量。如果一个事务需要记录超过 LOGSIZE 个块的修改，那么这个事务将无法被完整地记录在日志中。
日志系统容量：log.size 定义了整个日志系统的容量，它决定了日志系统可以处理的并发事务的数量。如果 log.size 过小，可能会导致日志系统在高并发情况下的性能瓶颈。

在设计日志系统时，合理地设置这些参数对于确保系统的性能、可靠性和可扩展性至关重要。LOGSIZE 直接影响到单个事务可以涉及的数据量，而 log.size 决定了整个日志系统的容量和并发处理能力。

崩溃恢复

日志的目的就是要解决崩溃导致的不一致性问题，下面的几个函数可以在发生崩溃后，恢复崩溃前已经提交过的事务的数据，保证系统的一致性。其中关键的函数就是 recover_from_log 函数以及其调用的 install_trans 函数。

recover_from_log 函数首先读取已经保存的日志头，然后调用 install_trans 恢复数据，最后将日志头的 n 设为 0，表示数据已恢复，当前该没有写入新数据。最后将新的空日志头写入文件系统，防止连续崩溃导致恢复的数据不一致。
install_trans 函数首先会读取日志头中记录的 block 数据块（这些块里记录了已提交事务修改的内容），然后读取真实需要修改的 block 数据块，根据日志头的记录恢复真实 block 中的数据，最后将其写回到磁盘。

void
initlog(int dev, struct superblock *sb)
{
  if (sizeof(struct logheader) >= BSIZE)
    panic("initlog: too big logheader");

  initlock(&log.lock, "log");
  log.start = sb->logstart;
  log.size = sb->nlog;
  log.dev = dev;
  // 在 initlog 函数中调用 recover_from_log 的目的是为了在文件系统启动时从日志中恢复数据，确保文件系统能够从上次的运行状态中恢复过来，特别是在系统崩溃后重启的情况下。
  recover_from_log();
}

static void
recover_from_log(void)
{
  read_head();
  install_trans(1); // if committed, copy from log to disk
  log.lh.n = 0;
  write_head(); // clear the log
}

// Copy committed blocks from log to their home location
// 将已提交的事务中的块从日志复制到它们在磁盘上的最终位置。
static void
install_trans(int recovering)
{
  int tail;

  for (tail = 0; tail < log.lh.n; tail++) {
    struct buf *lbuf = bread(log.dev, log.start+tail+1); // read log block
    struct buf *dbuf = bread(log.dev, log.lh.block[tail]); // read dst
    memmove(dbuf->data, lbuf->data, BSIZE);  // copy block to dst
    bwrite(dbuf);  // write dst to disk
    if(recovering == 0)
      bunpin(dbuf);
    brelse(lbuf);
    brelse(dbuf);
  }
}

// Read the log header from disk into the in-memory log header
static void
read_head(void)
{
  struct buf *buf = bread(log.dev, log.start);
  struct logheader *lh = (struct logheader *) (buf->data);
  int i;
  log.lh.n = lh->n;
  for (i = 0; i < log.lh.n; i++) {
    log.lh.block[i] = lh->block[i];
  }
  brelse(buf);
}

// Write in-memory log header to disk.
// This is the true point at which the
// current transaction commits.
static void
write_head(void)
{
  struct buf *buf = bread(log.dev, log.start);
  struct logheader *hb = (struct logheader *) (buf->data);
  int i;
  hb->n = log.lh.n;
  for (i = 0; i < log.lh.n; i++) {
    hb->block[i] = log.lh.block[i];
  }
  bwrite(buf);
  brelse(buf);
}

log 基本操作

文件系统操作：
+--------------+
| bcache       | 修改块 100
| buf          |
| blockno=100  |
| data=new data|
+--------------+
     |
log_write:
     v
log.lh: n=1, block[0]=100

commit:
     |
write_log:
     v
磁盘日志:
+--------------+---------------+
| log.start    | log.start+1   |
|              | data block 0  |
| n=1          | block 100 data|
| block[0]=100 |               |
+--------------+---------------+

上述流程图展示了一个典型的文件系统操作的流程：首先，文件系统通过 bread 从缓冲区缓存（bcache）获取块 100 的缓冲区（struct buf），修改其 data 为新内容。接着，log_write 被调用，将块号 100 记录到内存中的日志头（log.lh），设置 n=1 和 block[0]=100，并通过 bpin 固定缓冲区。随后，当所有操作完成，end_op 触发 commit，其中 write_log 将块 100 的修改内容从 bcache 写入磁盘日志的数据块（log.start+1）。最终，日志头（log.start）更新为 n=1 和 block[0]=100，记录事务状态，为后续提交或恢复做准备。

下面着重分析一下该流程图涉及到的几个重要函数。

`log_write`

在 log 基本操作中，最值得关注的是 log_write 函数。log_write 替代了 bwrite（如果不使用日志时，修改了块后，会直接调用 bwrite 直接写入 disk）。但是为了防止崩溃一致性，先把修改后的数据块的块号记录到 log 的日志头，等进行 log commit 时后再写入 disk。

log_write 函数的整体流程如下：

检查日志空间和事务状态。
遍历 log.lh.block，若块号已存在（日志吸收），跳出。
记录 b->blockno 到 log.lh.block[i]。
若为新块（i == log.lh.n）：
- bpin(b) 固定缓冲区。
- 增加 log.lh.n。

最终结果：更新内存中的 log.lh，但不立即写入磁盘。

// log_write() replaces bwrite(); a typical use is:
//   bp = bread(...)
//   modify bp->data[]
//   log_write(bp)
//   brelse(bp)
// log_write替代了bwrite，是指之前修改了块后，直接写入disk。
// 但是为了防止崩溃一致性，先把修改的数据块记录到log的日志头，等log commit时后再写入disk
void
log_write(struct buf *b)
{
  int i;

  acquire(&log.lock);
  if (log.lh.n >= LOGSIZE || log.lh.n >= log.size - 1)
    panic("too big a transaction");
  if (log.outstanding < 1)
    panic("log_write outside of trans");

  for (i = 0; i < log.lh.n; i++) {
    if (log.lh.block[i] == b->blockno)   // log absorption
      break;
  }
  log.lh.block[i] = b->blockno;
  if (i == log.lh.n) {  // Add new block to log?
    bpin(b);
    log.lh.n++;
  }
  release(&log.lock);
}

`write_log`

和 log_write 容易混淆的是 write_log 函数。write_log 的作用是将内存中所有记录的修改块写入磁盘上的日志数据块。该函数会在事务提交（commit）时调用，作为提交的第一步。

// Copy modified blocks from cache to log.
static void
write_log(void)
{
  int tail;

  for (tail = 0; tail < log.lh.n; tail++) {
    struct buf *to = bread(log.dev, log.start+tail+1); // log block
    struct buf *from = bread(log.dev, log.lh.block[tail]); // cache block
    memmove(to->data, from->data, BSIZE);
    bwrite(to);  // write the log
    brelse(from);
    brelse(to);
  }
}

步骤：

遍历 log.lh.n 个记录的块。
获取日志数据块（log.start+tail+1）的缓冲区 to。
获取原始修改块（log.lh.block[tail]）的缓冲区 from。
复制 from->data 到 to->data。
写入日志数据块，释放缓冲区。

输出：将修改内容写入磁盘上的日志数据块。

`log_write` 和 `write_log` 的区别

特性	`log_write`	`write_log`
作用	记录修改块的块号到日志头	将修改内容写入日志数据块
调用时机	文件系统调用修改缓冲区后	事务提交（`commit`）时
操作对象	内存中的 `log.lh` 和缓冲区	磁盘上的日志数据块
磁盘操作	无，直接更新内存	有，写入日志数据块
频率	每个修改块调用一次	每个事务提交时调用一次
锁	使用 `log.lock` 保护 `log.lh`	无显式锁（由 `commit` 控制）
缓冲区管理	`bpin` 固定缓冲区	不修改 `refcnt`，仅读写

`begin_op` 和 `end_op` — 协调并发文件系统调用

begin_op 和 end_op 共同协调并发文件系统调用，确保日志空间充足，并在适当时候提交事务。

// called at the start of each FS system call.
void
begin_op(void)
{
  acquire(&log.lock);
  while(1){
    if(log.committing){
      sleep(&log, &log.lock);
    } else if(log.lh.n + (log.outstanding+1)*MAXOPBLOCKS > LOGSIZE){
      // this op might exhaust log space; wait for commit.
      sleep(&log, &log.lock);
    } else {
      log.outstanding += 1;
      release(&log.lock);
      break;
    }
  }
}

获取锁：acquire(&log.lock) 保护日志结构。
无限循环检查：
- if(log.committing)：
  - 若正在提交（committing == 1），调用 sleep 等待。
- else if(log.lh.n + (log.outstanding+1)*MAXOPBLOCKS > LOGSIZE)：
  - 检查日志空间是否足够：
    - log.lh.n：当前事务已记录的块数。
    - log.outstanding+1：包括新操作的并发调用数。
    - MAXOPBLOCKS：每个操作可能修改的最大块数（通常为 10）。
    - LOGSIZE：日志最大块数（通常为 30）。
  - 若空间不足，sleep 等待提交。
- else：
  - 增加 outstanding，释放锁，退出。

// called at the end of each FS system call.
// commits if this was the last outstanding operation.
void
end_op(void)
{
  int do_commit = 0;

  acquire(&log.lock);
  log.outstanding -= 1;
  if(log.committing)
    panic("log.committing");
  // 本次事务所有进程操作都完成，可以提交，设置状态
  if(log.outstanding == 0){
    do_commit = 1;
    log.committing = 1;
  } else {
    // begin_op() may be waiting for log space,
    // and decrementing log.outstanding has decreased
    // the amount of reserved space.
    // outstanding不为0，此时还可能有别的进程加入事务的可能，唤醒可能在等待的进程
    wakeup(&log);
  }
  release(&log.lock);

  if(do_commit){
    // call commit w/o holding locks, since not allowed
    // to sleep with locks.
    commit();
    acquire(&log.lock);
    log.committing = 0;
    wakeup(&log);
    release(&log.lock);
  }
}

获取锁并减少计数：log.outstanding -= 1：标记一个操作结束。
检查状态：
- if(log.committing)：若正在提交，panic，因为不应在提交时调用。
- if(log.outstanding == 0)：若所有操作完成，设置 do_commit = 1 和 committing = 1。
- else：唤醒等待的 begin_op，因为空间需求减少。
释放锁：release(&log.lock)。
提交（若需要）：
- 若 do_commit，调用 commit()。
- 完成后清除 committing，唤醒等待者。

static void
commit()
{
  if (log.lh.n > 0) {
    write_log();     // Write modified blocks from cache to log
    write_head();    // Write header to disk -- the real commit
    install_trans(0); // Now install writes to home locations
    log.lh.n = 0;
    write_head();    // Erase the transaction from the log
  }
}

Inode

Inode 结构

磁盘中的 inode

#define NDIRECT 12
// On-disk inode structure
struct dinode {
  short type;           // 文件类型--文件或目录
  short major;          // 主设备号 (T_DEVICE only)
  short minor;          // 从设备号 (T_DEVICE only)
  short nlink;          // 文件系统中inode的link数目
  uint size;            // 文件的大小 (bytes)
  uint addrs[NDIRECT+1];   // 数据块号组成的数组。Data block addresses
};

type 表示该 𝑖𝑛𝑜𝑑𝑒 指向的文件的类型，在 𝑥𝑣6 里面就只有三种类型：普通文件、目录文件以及设备文件。type 为 0，则此 Inode 未被使用。
nlink 记录多少个文件指向这个 Inode 节点。nlink 表示该文件的链接数，链接分为硬链接和软连接，这里与链接数目直接相关的是硬链接，后面实现文件系统调用的时候我们会看到，sys_link() 系统调用会创建一个新目录项并且增加一个链接数。sys_unlink() 系统调用将链接数减 1，如果该文件在内存中的引用数和链接数都为 0 的话，则会删除该文件。
addrs 是一个数组，用于存储该 Inode 所表示的文件的数据存储在哪些块中。在文件系统中，数据被分成固定大小的块进行存储，各个文件的数据存储于位于 bitmap 块之后的数据块区域。这些块通过数据块号进行索引。xv6 的 Inode 中 addrs 前 NDIRECT(12) 个块号为直接索引，指向本文件的内容所在的数据块号。addrs 数组多出的一个最后一个元素为一个一级间接索引，即它指向的数据块中存储的才是直接索引，一个 uint 大小为 4B，一块大小为 1024B，故而这个一级间接索引可以为文件索引共 1024/4=256 个数据块。总的来说，xv6 文件系统最大支持文件大小为 (12 + 256) * BLOCKSIZE = 268KB。

内存中的 inode

由于程序操作的所有数据都必须位于内存，xv6 对 dinode 在内存中的抽象为结构体 inode。文件系统中所有对 Inode 的操作都是操作内存中 Inode，它是硬盘中的 dinode 的内存镜像/副本。当然，一个硬盘 dinode 在内存中有且仅有一个副本。

// kernel/file.h
// in-memory copy of an inode
struct inode {
  uint dev;           // Device number
  uint inum;          // Inode number
  int ref;            // Reference count
  struct sleeplock lock; // protects everything below here
  int valid;          // inode has been read from disk?

  short type;         // copy of disk inode
  short major;
  short minor;
  short nlink;
  uint size;
  uint addrs[NDIRECT+2];  // NDIRECT+1 -> NDIRECT+2
};

额外字段：

dev：文件系统所在设备号。
inum：i节点号，标识磁盘上的 dinode。
ref：引用计数，表示打开该 i节点的进程数。
lock：睡眠锁，保护以下字段的并发访问。
valid：标记是否已从磁盘读取（0 表示未初始化）。

特性	`ref` (struct inode)	`nlink` (struct dinode)
位置	内存（`icache`）	磁盘（i节点块）
含义	内存中 i节点的引用数	磁盘上 i节点的硬链接数
作用	管理 `inode` 的缓存生命周期	管理文件在文件系统中的存在性
范围	运行时，动态变化	文件系统生命周期，持久存储
操作	`iget` 增加，`iput` 减少	`link` 增加，`unlink` 减少
归零后果	`inode` 可被重用	i节点和数据块被释放
关联	进程/文件描述符	目录条目（`struct dirent`）

Inode 相关

inode 用于描述单个未命名的文件，并在磁盘上存储文件的元数据，如文件类型、大小、链接数和块列表。

inode 磁盘结构：包含文件的类型、大小、链接数和存储文件内容的块列表。如上小节磁盘中的 inode 结构。
inode 表：内核在内存中维护一个使用中的 inode 表，以同步多个进程对 inode 的访问。内存中的 inode 包括不在磁盘上存储的额外信息，如 ip->ref（引用计数）和 ip->valid（有效标志）。

struct {
  struct spinlock lock;
  struct inode inode[NINODE];
} itable;

inode 状态序列：
- 分配：当 inode 的类型非零时，表示它已被分配。ialloc() 用于分配，iput() 用于释放。
- 引用表中的条目：如果 ip->ref 为零，则表条目是空闲的。iget() 用于查找或创建表条目，并增加引用计数。iput() 用于减少引用计数。
- 有效：当 ip->valid 为 1 时，inode 表条目中的信息（类型、大小等）才是正确的。ilock() 从磁盘读取 inode 并设置 ip->valid，而 iput() 在引用计数为零时清除 ip->valid。
- 锁定：文件系统代码必须先锁定 inode，然后才能检查和修改 inode 及其内容的信息。
典型序列：
- 使用 iget() 获取 inode 条目。
- 使用 ilock() 锁定 inode。
- 检查和修改 ip->xxx。
- 使用 iunlock() 解锁 inode。
- 使用 iput() 释放 inode。
ilock() 和 iget() 的分离：允许系统调用对 inode 持有长期引用（如打开的文件），并且只在短时间内（如读取操作）锁定它。分离还有助于在路径名查找期间避免死锁和竞争。
itable.lock 保护：itable.lock 用于保护 itable 条目的分配。在使用条目的 ref、dev 和 inum 字段时，必须持有 itable.lock。
ip->lock 睡眠锁：保护除了 ref、dev 和 inum 之外的所有 ip-> 字段。要读取或写入 inode 的 ip->valid、ip->size、ip->type 等字段，必须持有 ip->lock。

inode 分配和释放

`iget` 和 `ialloc`

ialloc 函数负责在指定的设备上分配一个新的 inode。它首先遍历 inode 表，寻找一个未被分配的 inode（即 type 字段为 0）。一旦找到，该函数会将该 inode 的所有字段清零，设置其 type 字段为传入的 type 参数，并通过日志系统记录这次分配，以确保分配操作的持久性。然后，它释放读取的缓冲区，并调用 iget 函数来获取指向新分配 inode 的指针，最后返回这个指针。如果在 inode 表中找不到空闲的 inode，则函数会调用 panic 函数，打印错误信息并终止程序。

static struct inode*
iget(uint dev, uint inum)
{
  struct inode *ip, *empty;

  acquire(&itable.lock);

  // Is the inode already in the table?
  empty = 0;
  for(ip = &itable.inode[0]; ip < &itable.inode[NINODE]; ip++){
    if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){
      ip->ref++;
      release(&itable.lock);
      return ip;
    }
    if(empty == 0 && ip->ref == 0)    // Remember empty slot.
      empty = ip;
  }

  // Recycle an inode entry.
  if(empty == 0)
    panic("iget: no inodes");

  ip = empty;
  ip->dev = dev;
  ip->inum = inum;
  ip->ref = 1;
  ip->valid = 0;
  release(&itable.lock);

  return ip;
}

// Allocate an inode on device dev.
// Mark it as allocated by  giving it type type.
// Returns an unlocked but allocated and referenced inode.
struct inode*
ialloc(uint dev, short type)
{
  int inum;
  struct buf *bp;
  struct dinode *dip;

  for(inum = 1; inum < sb.ninodes; inum++){
    bp = bread(dev, IBLOCK(inum, sb));
    dip = (struct dinode*)bp->data + inum%IPB;
    if(dip->type == 0){  // a free inode
      memset(dip, 0, sizeof(*dip));
      dip->type = type;
      log_write(bp);   // mark it allocated on the disk
      brelse(bp);
      return iget(dev, inum);
    }
    brelse(bp);
  }
  panic("ialloc: no inodes");
}

`iput`

iput 函数用于减少一个已在内存中的 inode 的引用计数，并在适当的时候释放它。首先，它获取 inode 表的锁，检查传入的 inode 是否是最后一个引用（即 ref 字段为 1）且没有链接指向它（即 nlink 字段为 0）。如果是，它进一步锁定该 inode，截断与该 inode 关联的文件内容，清除其类型，更新 inode 信息到磁盘，并标记该 inode 为无效。完成这些操作后，它释放 inode 锁，再次获取 inode 表的锁，减少 inode 的引用计数，然后释放 inode 表的锁。如果引用计数不为 1 或者 inode 有链接指向它，则直接减少引用计数并释放锁。这个过程确保了当一个 inode 不再被使用时，能够安全地回收其资源。

// Drop a reference to an in-memory inode.
// If that was the last reference, the inode table entry can
// be recycled.
// If that was the last reference and the inode has no links
// to it, free the inode (and its content) on disk.
// All calls to iput() must be inside a transaction in
// case it has to free the inode.
void
iput(struct inode *ip)
{
  acquire(&itable.lock);

  if(ip->ref == 1 && ip->valid && ip->nlink == 0){
    // inode has no links and no other references: truncate and free.

    // ip->ref == 1 means no other process can have ip locked,
    // so this acquiresleep() won't block (or deadlock).
    acquiresleep(&ip->lock);

    release(&itable.lock);

    itrunc(ip);
    ip->type = 0;
    iupdate(ip);
    ip->valid = 0;

    releasesleep(&ip->lock);

    acquire(&itable.lock);
  }

  ip->ref--;
  release(&itable.lock);
}

inode 加锁解锁

// Lock the given inode.
// Reads the inode from disk if necessary.
void
ilock(struct inode *ip)
{
  struct buf *bp;
  struct dinode *dip;

  if(ip == 0 || ip->ref < 1)
    panic("ilock");

  acquiresleep(&ip->lock);

  if(ip->valid == 0){
    bp = bread(ip->dev, IBLOCK(ip->inum, sb));
    dip = (struct dinode*)bp->data + ip->inum%IPB;
    ip->type = dip->type;
    ip->major = dip->major;
    ip->minor = dip->minor;
    ip->nlink = dip->nlink;
    ip->size = dip->size;
    memmove(ip->addrs, dip->addrs, sizeof(ip->addrs));
    brelse(bp);
    ip->valid = 1;
    if(ip->type == 0)
      panic("ilock: no type");
  }
}

步骤：
1. 参数检查：
  - if(ip == 0 || ip->ref < 1)：
    - 确保 ip 非空且被引用（ref >= 1），否则 panic。
2. 加锁：
  - acquiresleep(&ip->lock)：
    - 使用睡眠锁锁定 inode，若已被锁定，当前进程睡眠。
3. 加载数据：
  - if(ip->valid == 0)：
    - bread 读取 i节点所在块（IBLOCK(ip->inum, sb)）。
    - 计算 dinode 偏移（inum % IPB）。
    - 从 dinode 复制字段到 inode（type, major, minor, nlink, size, addrs）。
    - 释放缓冲区，置 valid = 1。
  - if(ip->type == 0)：
    - 检查加载的类型，若为 0（未分配），panic。
结果：
- inode 被锁定，数据已加载。

// Unlock the given inode.
void
iunlock(struct inode *ip)
{
  if(ip == 0 || !holdingsleep(&ip->lock) || ip->ref < 1)
    panic("iunlock");

  releasesleep(&ip->lock);
}

步骤：
1. 参数检查：
  - if(ip == 0 || !holdingsleep(&ip->lock) || ip->ref < 1)：
    - 确保 ip 非空、当前持有锁、被引用，否则 panic。
2. 解锁：
  - releasesleep(&ip->lock)：
    - 释放睡眠锁，唤醒等待者。
结果：
- inode 解锁，可被其他进程访问。

通过inode获取数据块

// Inode content
//
// The content (data) associated with each inode is stored
// in blocks on the disk. The first NDIRECT block numbers
// are listed in ip->addrs[].  The next NINDIRECT blocks are
// listed in block ip->addrs[NDIRECT].

// Return the disk block address of the nth block in inode ip.
// If there is no such block, bmap allocates one.
// bmap 函数是文件系统的核心部分，它负责管理文件数据块的分配和查找
// bmap 函数的目的是找到与 inode（磁盘索引节点）关联的特定数据块的磁盘地址。
// 如果请求的数据块尚不存在，bmap 会分配一个新的数据块并将地址返回给调用者。
static uint
bmap(struct inode *ip, uint bn)
{
  uint addr, *a;
  struct buf *bp;

  // 如果请求的块索引bn小于NDIRECT（直接块的数量），函数检查ip->addrs[bn]是否已经有一个块地址。
  // 如果没有，它使用 balloc 函数为该块分配一个新的磁盘地址，并将该地址存入ip->addrs[bn]。
  if(bn < NDIRECT){
    if((addr = ip->addrs[bn]) == 0)
      ip->addrs[bn] = addr = balloc(ip->dev);
    return addr;
  }
  // 如果bn大于或等于NDIRECT，函数处理间接块。它首先计算间接块的索引（bn - NDIRECT）。
  bn -= NDIRECT;

  if(bn < NINDIRECT){
    // Load indirect block, allocating if necessary.
    // 函数检查 ip->addrs[NDIRECT] 是否有一个块地址。如果没有，它为间接块分配一个新的磁盘地址。
    if((addr = ip->addrs[NDIRECT]) == 0)
      ip->addrs[NDIRECT] = addr = balloc(ip->dev);
    // 使用 bread 函数读取间接块到缓冲区 bp
    bp = bread(ip->dev, addr);
    // 函数获取指向间接块数据的指针 a，并检查 a[bn] 是否有一个块地址。
    // 如果没有，它使用 balloc 函数为该间接块分配一个新的磁盘地址，并将该地址存入a[bn]
    a = (uint*)bp->data;
    if((addr = a[bn]) == 0){
      a[bn] = addr = balloc(ip->dev);
      // 如果间接块的内容被修改（即分配了新的块），函数使用 log_write 函数将缓冲区 bp 写入日志。
      log_write(bp);
    }
    // 使用 brelse 函数释放间接块的缓冲区 bp
    brelse(bp);
    return addr;
  }

  bn -= NINDIRECT;

  if(bn < NINDIRECT * NINDIRECT){
    // Load indirect block, allocating if necessary.
    if((addr = ip->addrs[NDIRECT + 1]) == 0)
      ip->addrs[NDIRECT + 1] = addr = balloc(ip->dev);
    bp = bread(ip->dev, addr);
    a = (uint*)bp->data;
    if((addr = a[bn/NINDIRECT]) == 0){
      a[bn/NINDIRECT] = addr = balloc(ip->dev);
      log_write(bp);
    }
    brelse(bp);

    bn %= NINDIRECT;
    bp = bread(ip->dev, addr);
    a = (uint*)bp->data;
    if((addr = a[bn]) == 0){
      a[bn] = addr = balloc(ip->dev);
      log_write(bp);
    }
    brelse(bp);
    return addr;
  }

  panic("bmap: out of range");
}

步骤分解

直接块（bn < NDIRECT）：
- 检查 ip->addrs[bn]：
  - 若为 0，调用 balloc 分配新块，记录地址。
  - 返回已有或新分配的地址。
- 范围：bn = 0 到 10。
一级间接块（bn < NINDIRECT）：
- 计算偏移：bn -= NDIRECT（bn = 0 到 255）。
- 检查一级间接块地址 ip->addrs[NDIRECT]：
  - 若为 0，分配新块。
- 读取间接块（bread），获取地址数组 a。
- 检查 a[bn]：
  - 若为 0，分配新块，记录并写日志（log_write）。
- 释放缓冲区，返回地址。
二级间接块（bn < NINDIRECT * NINDIRECT）：
- 计算偏移：bn -= NINDIRECT（bn = 0 到 65535）。
- 检查二级间接块地址 ip->addrs[NDIRECT+1]：
  - 若为 0，分配新块。
- 第一级查找：
  - 读取二级间接块，获取 a。
  - 计算一级间接块索引：bn / NINDIRECT（0 到 255）。
  - 检查 a[bn/NINDIRECT]，若为 0，分配并更新。
- 第二级查找：
  - 计算剩余偏移：bn %= NINDIRECT（0 到 255）。
  - 读取一级间接块，检查 a[bn]，若为 0，分配并更新。
- 返回地址。
超出范围：
- 若 bn >= 65803，panic。

通过 inode 读写数据

`readi`

从 inode 的指定偏移（off）读取 n 字节数据到目标地址（dst）。

// Read data from inode.
// Caller must hold ip->lock.
// If user_dst==1, then dst is a user virtual address;
// otherwise, dst is a kernel address.
int
readi(struct inode *ip, int user_dst, uint64 dst, uint off, uint n)
{
  uint tot, m;
  struct buf *bp;

  if(off > ip->size || off + n < off)
    return 0;
  if(off + n > ip->size)
    n = ip->size - off;

  for(tot=0; tot<n; tot+=m, off+=m, dst+=m){
    bp = bread(ip->dev, bmap(ip, off/BSIZE));
    //  m为请求剩余长度 n - tot 和当前块剩余空间 BSIZE - off % BSIZE 中的较小值。
    m = min(n - tot, BSIZE - off%BSIZE);
    if(either_copyout(user_dst, dst, bp->data + (off % BSIZE), m) == -1) {
      brelse(bp);
      tot = -1;
      break;
    }
    brelse(bp);
  }
  return tot;
}

参数：
- user_dst：1 表示用户态地址，0 表示内核态地址。
- dst：目标地址。
- off：文件偏移。
- n：读取字节数。
返回：
- 成功读取的字节数，错误返回 -1。
步骤：
1. 参数检查：
  - off > ip->size：偏移超出文件大小，返回 0。
  - off + n < off：溢出检查，返回 0。
  - off + n > ip->size：调整 n 为剩余字节数。
2. 循环读取：
  - tot：已读取字节数。
  - bp = bread(ip->dev, bmap(ip, off/BSIZE))：
    - bmap 获取块号，bread 读取块。
  - m = min(n - tot, BSIZE - off%BSIZE)：
    - 计算本次读取字节数（剩余请求 vs 块内剩余）。
  - either_copyout：
    - 从块内偏移（off % BSIZE）复制 m 字节到 dst。
    - 若失败（-1），设置 tot = -1，退出。
  - brelse(bp)：释放缓冲区。
3. 返回：
  - 返回 tot（成功字节数或 -1）。

`writei`

将 n 字节数据从源地址（src）写入 inode 的指定偏移（off）。

int writei(struct inode *ip, int user_src, uint64 src, uint off, uint n)
{
  uint tot, m;
  struct buf *bp;

  if(off > ip->size || off + n < off)
    return -1;
  if(off + n > MAXFILE*BSIZE)
    return -1;

  for(tot=0; tot<n; tot+=m, off+=m, src+=m){
    bp = bread(ip->dev, bmap(ip, off/BSIZE));
    m = min(n - tot, BSIZE - off%BSIZE);
    if(either_copyin(bp->data + (off % BSIZE), user_src, src, m) == -1) {
      brelse(bp);
      break;
    }
    log_write(bp);
    brelse(bp);
  }

  if(off > ip->size)
    ip->size = off;

  iupdate(ip);

  return tot;
}

参数：
- user_src：1 表示用户态地址，0 表示内核态地址。
- src：源地址。
- off：文件偏移。
- n：写入字节数。
返回：
- 成功写入的字节数，错误返回小于 n 的值。
步骤：
1. 参数检查：
  - off > ip->size：偏移超出当前大小，返回 -1。
  - off + n < off：溢出检查，返回 -1。
  - off + n > MAXFILE*BSIZE：超出最大文件大小（67.4 MB），返回 -1。
2. 循环写入：
  - tot：已写入字节数。
  - bp = bread(ip->dev, bmap(ip, off/BSIZE))：
    - bmap 获取或分配块。
  - m = min(n - tot, BSIZE - off%BSIZE)：
    - 计算本次写入字节数。
  - either_copyin：
    - 从 src 复制 m 字节到块内偏移。
    - 若失败（-1），退出循环。
  - log_write(bp)：记录修改。
  - brelse(bp)：释放缓冲区。
3. 更新大小：
  - 若 off > ip->size，更新 ip->size。
4. 同步：
  - iupdate(ip)：将 inode 写回磁盘。
5. 返回：
  - 返回 tot（成功字节数）。

// Look for a directory entry in a directory.
// If found, set *poff to byte offset of entry.
struct inode*
dirlookup(struct inode *dp, char *name, uint *poff)
{
  uint off, inum;
  struct dirent de;

  if(dp->type != T_DIR)
    panic("dirlookup not DIR");

  for(off = 0; off < dp->size; off += sizeof(de)){
    if(readi(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de))
      panic("dirlookup read");
    if(de.inum == 0)
      continue;
    if(namecmp(name, de.name) == 0){
      // entry matches path element
      if(poff)
        *poff = off;
      inum = de.inum;
      return iget(dp->dev, inum);
    }
  }

  return 0;
}

添加目录项

此函数用来在 inode dp 指向的目录文件中添加一个目录项，通常是创建了一个新文件，需要在该目录下添加这个新文件的信息。

首先查找该目录项是否存在，如果不存在则找一个空闲目录项位置，将新文件的 inode 和文件名写进去。

// Write a new directory entry (name, inum) into the directory dp.
int
dirlink(struct inode *dp, char *name, uint inum)
{
  int off;
  struct dirent de;
  struct inode *ip;

  // Check that name is not present.
  // 检查目录 dp 中是否已存在文件名 name。如果存在，说明不能创建新的目录项，调用 iput 释放查找到的 inode 并返回 -1
  if((ip = dirlookup(dp, name, 0)) != 0){
    iput(ip);
    return -1;
  }

  // Look for an empty dirent.
  for(off = 0; off < dp->size; off += sizeof(de)){
    if(readi(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de))
      panic("dirlink read");
    if(de.inum == 0)
      break;
  }

  strncpy(de.name, name, DIRSIZ);
  de.inum = inum;
  // 调用 writei 函数将新目录项 de 写入到目录 inode dp 中的偏移量 off 处。
  if(writei(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de))
    panic("dirlink");

  return 0;
}

路径

skipelem() 调用一次解析一个头部的文件名放在 𝑛𝑎𝑚𝑒 中，返回剩下的路径：

skipelem("a/bb/c", name) = "bb/c", setting name = "a"
skipelem("///a//bb", name) = "bb", setting name = "a"
skipelem("a", name) = "", setting name = "a"
skipelem("", name) = skipelem("////", name) = 0

static char*
skipelem(char *path, char *name)
{
  char *s;
  int len;

  while(*path == '/')
    path++;
  if(*path == 0)
    return 0;
  s = path;
  while(*path != '/' && *path != 0)
    path++;
  len = path - s;
  if(len >= DIRSIZ)
    memmove(name, s, DIRSIZ);
  else {
    memmove(name, s, len);
    name[len] = 0;
  }
  while(*path == '/')
    path++;
  return path;
}

// Look up and return the inode for a path name.
// If parent != 0, return the inode for the parent and copy the final
// path element into name, which must have room for DIRSIZ bytes.
// Must be called inside a transaction since it calls iput().
static struct inode*
namex(char *path, int nameiparent, char *name)
{
  struct inode *ip, *next;

  if(*path == '/')
    ip = iget(ROOTDEV, ROOTINO);
  else
    ip = idup(myproc()->cwd);

  while((path = skipelem(path, name)) != 0){
    ilock(ip);
    if(ip->type != T_DIR){
      iunlockput(ip);
      return 0;
    }
    if(nameiparent && *path == '\0'){
      // Stop one level early.
      iunlock(ip);
      return ip;
    }
    if((next = dirlookup(ip, name, 0)) == 0){
      iunlockput(ip);
      return 0;
    }
    iunlockput(ip);
    ip = next;
  }
  if(nameiparent){
    iput(ip);
    return 0;
  }
  return ip;
}

若 nameiparent 为 1，返回父目录 inode 节点并填充最后一个元素到 name。

struct inode*
namei(char *path)
{
  char name[DIRSIZ];
  return namex(path, 0, name);
}

struct inode*
nameiparent(char *path, char *name)
{
  return namex(path, 1, name);
}

逻辑流程示例

`namei("/usr/file")`

输入：path = "/usr/file"。
执行：
- ip = iget(ROOTDEV, 1)（根目录）。
- skipelem("/usr/file", name) → name = "usr", path = "file"。
  - ilock(ip)，dirlookup(ip, "usr") → next = inum=2。
  - iunlockput(ip)，ip = next。
- skipelem("file", name) → name = "file", path = ""。
  - ilock(ip)，dirlookup(ip, "file") → next = inum=3。
  - iunlockput(ip)，ip = next。
- path = 0，返回 ip（inum=3）。
输出：文件 /usr/file 的 i节点。

`nameiparent("/usr/file", name)`

输入：path = "/usr/file"。
执行：
- ip = iget(ROOTDEV, 1)。
- skipelem("/usr/file", name) → name = "usr", path = "file"。
  - ilock(ip)，dirlookup(ip, "usr") → next = inum=2。
  - iunlockput(ip)，ip = next。
- skipelem("file", name) → name = "file", path = ""。
  - nameiparent && *path == '\0'，iunlock(ip)，返回 ip（inum=2）。
输出：目录 /usr 的 i节点，name = "file"。

文件描述符

struct file {
  enum { FD_NONE, FD_PIPE, FD_INODE, FD_DEVICE } type;
  int ref; // reference count
  char readable;
  char writable;
  struct pipe *pipe; // FD_PIPE
  struct inode *ip;  // FD_INODE and FD_DEVICE
  uint off;          // FD_INODE
  short major;       // FD_DEVICE
};

字段解析

type：
- 类型：枚举类型，表示文件描述符的种类。
- 值：
  - FD_NONE：未使用。
  - FD_PIPE：管道。
  - FD_INODE：普通文件或目录。
  - FD_DEVICE：设备文件。
- 作用：区分底层资源类型，决定如何操作。
ref：
- 类型：整数。
- 作用：引用计数，记录有多少文件描述符引用此结构。
- 用途：支持文件描述符的共享（如 dup）。
readable：
- 类型：字符（布尔值，0 或 1）。
- 作用：标记文件是否可读。
writable：
- 类型：字符（布尔值，0 或 1）。
- 作用：标记文件是否可写。
pipe：
- 类型：指向 struct pipe 的指针。
- 条件：type == FD_PIPE 时有效。
- 作用：关联管道结构，用于管道读写。
ip：
- 类型：指向 struct inode 的指针。
- 条件：type == FD_INODE 或 FD_DEVICE 时有效。
- 作用：关联文件或设备的 i节点。
off：
- 类型：无符号整数。
- 条件：type == FD_INODE 时有效。
- 作用：记录文件的当前读写偏移量（字节）。
major：
- 类型：短整数。
- 条件：type == FD_DEVICE 时有效。
- 作用：设备的主设备号，用于区分设备类型。

struct file 是文件系统的抽象层：
- 统一管理不同类型的文件对象（普通文件、管道、设备）。
- 提供标准接口（如 read、write），隐藏底层实现。
存储位置：
- 全局文件表 ftable（kernel/file.c）中分配。
生命周期：
- 通过 filealloc 创建，fileclose 释放。

使用场景

普通文件（`FD_INODE`）

字段：type = FD_INODE, ip, off, readable, writable。
例子：
- 打开文件 /usr/file：
  - ip 指向 inum=3 的 i节点。
  - off = 0（初始偏移）。
  - readable = 1, writable = 0（只读）。
操作：
- read：从 ip 的 off 读取，更新 off。
- write：写入 ip 的 off，更新 off 和 ip->size。

管道（`FD_PIPE`）

字段：type = FD_PIPE, pipe, readable, writable。
例子：
- 创建管道 pipe(fd)：
  - pipe 指向管道结构。
  - 读端：readable = 1, writable = 0。
  - 写端：readable = 0, writable = 1。
操作：
- read：从 pipe 读取。
- write：写入 pipe。

设备（`FD_DEVICE`）

字段：type = FD_DEVICE, ip, major。
例子：
- 打开 /dev/console：
  - ip 指向设备 i节点（type = T_DEV）。
  - major 指定设备类型（如 1 表示控制台）。
操作：
- read/write：调用设备驱动程序。