MIT 6.S081 | 0x0a Mmap

2024-07-06

You should implement enough mmap and munmap functionality to make the mmaptest test program work. If mmaptest doesn't use a mmap feature, you don't need to implement that feature.

本节实验严重参考了 Millgon's blog 的做法

分析

之前都没怎么接触过这俩函数,于是问了下通义千问。

函数解释

mmap() 和 munmap() 是在Unix和类Unix操作系统(如Linux)中用于内存映射的系统调用。这些函数允许用户空间程序直接访问内核管理的内存或文件,提供了一种高效的数据访问机制。

mmap()

mmap() 函数的主要作用是将一个文件或其它对象映射到进程的地址空间中,这样文件的内容就可以像访问普通内存一样进行读写。对于共享映射(MAP_SHARED),对映射区域的修改会反映到原始文件上;对于私有映射(MAP_PRIVATE),修改只对当前进程可见,不会写回文件。

mmap() 的基本原型如下:

#include <sys/mman.h>
void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset);

参数解释:

  • addr:建议的映射起始地址,通常设为NULL让内核选择合适的地址。
  • length:要映射的字节数。
  • prot:页面保护标志,指定映射区域的权限,例如读、写等。
  • flags:映射类型和特殊标志,例如MAP_SHARED(共享映射)或MAP_PRIVATE(私有拷贝映射)。
  • fd:文件描述符,通常是通过open()打开的文件。
  • offset:文件的偏移量,映射的起点位置。

munmap()

munmap() 函数的作用是取消之前通过mmap()创建的内存映射。一旦调用munmap(),对原来映射区域的访问将不再有效,且对该区域的任何引用都会导致段错误(Segmentation fault)。

munmap() 的基本原型如下:

#include <sys/mman.h>
int munmap(void *addr, size_t length);

参数解释:

  • addr:要取消映射的区域的起始地址,必须按页对齐,通常是mmap()返回的地址,也可以是映射区域内部的某个页边界。
  • length:要取消映射的字节数。可以等于mmap()时的length(取消整个映射),也可以更小(只取消其中一部分)。

这两个函数提供了非常强大的功能,可以用于实现高效的文件I/O、进程间的共享内存通信以及动态链接库的加载等场景。然而,由于它们直接操作内核资源,因此在使用时需要特别小心,确保正确的地址和长度被传递给函数,避免内存泄漏或其他潜在的错误。

实现思路

proc space 图源

mmap() 是将一个文件映射到进程的地址空间,看一下进程在内存中的布局,这个映射的起始地址可以选为 TRAPFRAME 之前的空闲区域。内存的映射和懒加载有点像 COW 那节实验做的事,mmap() 时只需要记录一下映射的信息,然后在缺页异常时再去加载文件内容。进程数据也需要记录一下这个映射信息,方便后续的各种操作。总结一下,需要做以下几个事情:

  • mmap():在proc结构体中增加一个vma数组,用于存储虚拟内存区域的信息。在sysfile.c中实现sys_mmap()函数,用于处理mmap()系统调用。在trap.c中处理缺页异常,实现懒加载。
  • munmap():在sysfile.c中实现sys_munmap()函数,用于处理munmap()系统调用。
  • 在 allocproc() 和 freeproc() 中初始化和释放 vma,在 fork() 中复制 vma。

代码

diff --git a/Makefile b/Makefile
index 9b83591..5f6af9c 100644
--- a/Makefile
+++ b/Makefile
@@ -188,6 +188,7 @@ UPROGS=\
   $U/_grind\
   $U/_wc\
   $U/_zombie\
+  $U/_mmaptest\
 
 
 
diff --git a/kernel/defs.h b/kernel/defs.h
index a3c962b..14605aa 100644
--- a/kernel/defs.h
+++ b/kernel/defs.h
@@ -8,6 +8,7 @@ struct spinlock;
 struct sleeplock;
 struct stat;
 struct superblock;
+struct vma;
 
 // bio.c
 void            binit(void);
@@ -141,6 +142,10 @@ int             fetchstr(uint64, char*, int);
 int             fetchaddr(uint64, uint64*);
 void            syscall();
 
+// sysfile.c
+int             vmatryalloc(uint64 va);
+void            vmaunmap(pagetable_t pagetable, uint64 va, uint64 sz, struct vma *v);
+
 // trap.c
 extern uint     ticks;
 void            trapinit(void);
diff --git a/kernel/memlayout.h b/kernel/memlayout.h
index 776f98c..61c030c 100644
--- a/kernel/memlayout.h
+++ b/kernel/memlayout.h
@@ -65,3 +65,6 @@
 //   TRAPFRAME (p->trapframe, used by the trampoline)
 //   TRAMPOLINE (the same page as in the kernel)
 #define TRAPFRAME (TRAMPOLINE - PGSIZE)
+
+// Used for virtual memory areas
+#define VMAEND TRAPFRAME
diff --git a/kernel/param.h b/kernel/param.h
index 6624bff..3b99e82 100644
--- a/kernel/param.h
+++ b/kernel/param.h
@@ -1,5 +1,6 @@
 #define NPROC        64  // maximum number of processes
 #define NCPU          8  // maximum number of CPUs
+#define NVMA         16  // maximum number of virtual memory areas
 #define NOFILE       16  // open files per process
 #define NFILE       100  // open files per system
 #define NINODE       50  // maximum number of active i-nodes
diff --git a/kernel/proc.c b/kernel/proc.c
index 959b778..efd03c7 100644
--- a/kernel/proc.c
+++ b/kernel/proc.c
@@ -146,6 +146,10 @@ found:
   p->context.ra = (uint64)forkret;
   p->context.sp = p->kstack + PGSIZE;
 
+  for (int i = 0; i < NVMA; i++) {
+    p->vma[i].valid = 0;
+  }
+
   return p;
 }
 
@@ -158,6 +162,10 @@ freeproc(struct proc *p)
   if(p->trapframe)
     kfree((void*)p->trapframe);
   p->trapframe = 0;
+  for (int i = 0; i < NVMA; i++) {
+    struct vma *v = &p->vma[i];
+    vmaunmap(p->pagetable, v->addr, v->sz, v);
+  }
   if(p->pagetable)
     proc_freepagetable(p->pagetable, p->sz);
   p->pagetable = 0;
@@ -308,6 +316,14 @@ fork(void)
       np->ofile[i] = filedup(p->ofile[i]);
   np->cwd = idup(p->cwd);
 
+  for (i = 0; i < NVMA; i++) {
+    struct vma *v = &p->vma[i];
+    if (v->valid) {
+      np->vma[i] = *v;
+      filedup(v->f);
+    }
+  }
+
   safestrcpy(np->name, p->name, sizeof(p->name));
 
   pid = np->pid;
diff --git a/kernel/proc.h b/kernel/proc.h
index d021857..e1ff755 100644
--- a/kernel/proc.h
+++ b/kernel/proc.h
@@ -79,6 +79,18 @@ struct trapframe {
   /* 280 */ uint64 t6;
 };
 
+// Virtual memory areas
+// One entry describes one mmap()ed region of a process. sys_mmap() fills in
+// a free entry; the pages themselves are loaded later by vmatryalloc().
+struct vma {
+  int valid;        // non-zero while this entry describes a live mapping
+
+  uint64 addr;      // first virtual address of the region
+  int sz;           // length of the region in bytes
+  struct file *f;   // backing file; sys_mmap() takes a reference via filedup()
+  int prot;         // PROT_* bits passed to mmap()
+  int flags;        // MAP_* flags passed to mmap()
+  int offset;       // file offset that corresponds to addr
+};
+
 enum procstate { UNUSED, USED, SLEEPING, RUNNABLE, RUNNING, ZOMBIE };
 
 // Per-process state
@@ -104,4 +116,5 @@ struct proc {
   struct file *ofile[NOFILE];  // Open files
   struct inode *cwd;           // Current directory
   char name[16];               // Process name (debugging)
+  struct vma vma[NVMA];        // Virtual memory areas
 };
diff --git a/kernel/riscv.h b/kernel/riscv.h
index 20a01db..3b14dca 100644
--- a/kernel/riscv.h
+++ b/kernel/riscv.h
@@ -343,6 +343,9 @@ typedef uint64 *pagetable_t; // 512 PTEs
 #define PTE_W (1L << 2)
 #define PTE_X (1L << 3)
 #define PTE_U (1L << 4) // user can access
+#define PTE_G (1L << 5) // global mapping
+#define PTE_A (1L << 6) // accessed
+#define PTE_D (1L << 7) // dirty
 
 // shift a physical address to the right place for a PTE.
 #define PA2PTE(pa) ((((uint64)pa) >> 12) << 10)
diff --git a/kernel/syscall.c b/kernel/syscall.c
index ed65409..4fb9baa 100644
--- a/kernel/syscall.c
+++ b/kernel/syscall.c
@@ -101,6 +101,8 @@ extern uint64 sys_unlink(void);
 extern uint64 sys_link(void);
 extern uint64 sys_mkdir(void);
 extern uint64 sys_close(void);
+extern uint64 sys_mmap(void);
+extern uint64 sys_munmap(void);
 
 // An array mapping syscall numbers from syscall.h
 // to the function that handles the system call.
@@ -126,6 +128,8 @@ static uint64 (*syscalls[])(void) = {
 [SYS_link]    sys_link,
 [SYS_mkdir]   sys_mkdir,
 [SYS_close]   sys_close,
+[SYS_mmap]    sys_mmap,
+[SYS_munmap]  sys_munmap,
 };
 
 void
diff --git a/kernel/syscall.h b/kernel/syscall.h
index bc5f356..e7b18d6 100644
--- a/kernel/syscall.h
+++ b/kernel/syscall.h
@@ -20,3 +20,5 @@
 #define SYS_link   19
 #define SYS_mkdir  20
 #define SYS_close  21
+#define SYS_mmap   22
+#define SYS_munmap 23
diff --git a/kernel/sysfile.c b/kernel/sysfile.c
index 16b668c..849bdd2 100644
--- a/kernel/sysfile.c
+++ b/kernel/sysfile.c
@@ -15,6 +15,7 @@
 #include "sleeplock.h"
 #include "file.h"
 #include "fcntl.h"
+#include "memlayout.h"
 
 // Fetch the nth word-sized system call argument as a file descriptor
 // and return both the descriptor and the corresponding struct file.
@@ -503,3 +504,180 @@ sys_pipe(void)
   }
   return 0;
 }
+
+// mmap(addr, length, prot, flags, fd, offset) system call.
+//
+// Records a lazy mapping of the open file fd in a free slot of p->vma. No
+// page is allocated here; vmatryalloc() loads pages on the first fault.
+// The addr hint is ignored: the new region is placed directly below the
+// lowest region currently in use (or below VMAEND if there is none).
+//
+// Returns the start address of the new region. Returns -1 if fd is not an
+// open file, the length is not positive, the file's open mode does not
+// permit the requested protection, or all NVMA slots are in use.
+uint64
+sys_mmap(void)
+{
+  uint64 addr;
+  int sz, prot, flags, fd, offset;
+  struct file *f;
+  struct proc *p = myproc();
+
+  argaddr(0, &addr);   // fetched but unused: the kernel picks the address
+  argint(1, &sz);
+  argint(2, &prot);
+  argint(3, &flags);
+  if (argfd(4, &fd, &f) < 0)
+    return -1;
+  argint(5, &offset);
+
+  if (sz <= 0)
+    return -1;
+
+  // The file's open mode must allow the requested access. Writes to a
+  // MAP_PRIVATE mapping are never written back (vmaunmap() only writes back
+  // MAP_SHARED pages), so PROT_WRITE is fine on a read-only file then.
+  if ((!f->readable && (prot & PROT_READ)) || (!f->writable && (prot & PROT_WRITE) && !(flags & MAP_PRIVATE)))
+    return -1;
+
+  sz = PGROUNDUP(sz);
+
+  struct vma *v = 0;
+  uint64 vmaend = VMAEND;
+
+  // Visit every slot. Stopping at the first free slot would ignore live
+  // regions stored in later slots and could hand out an overlapping range.
+  for (int i = 0; i < NVMA; i++) {
+    if (!p->vma[i].valid) {
+      if (v == 0)
+        v = &p->vma[i];   // remember the first free slot
+      continue;
+    }
+    if (p->vma[i].addr < vmaend) {
+      vmaend = PGROUNDDOWN(p->vma[i].addr);
+    }
+  }
+
+  // Out of slots: fail the call rather than bringing the kernel down.
+  if (v == 0)
+    return -1;
+
+  v->valid = 1;
+  v->addr = vmaend - sz;
+  v->sz = sz;
+  v->prot = prot;
+  v->flags = flags;
+  v->f = f;
+  v->offset = offset;
+
+  // The mapping keeps the file alive even if the caller closes fd.
+  filedup(f);
+
+  return v->addr;
+}
+
+// munmap(addr, length) system call.
+//
+// Removes [addr, addr + length) from the region that contains addr. The
+// range must start at the beginning of the region or reach its end;
+// punching a hole in the middle is not supported. Dirty pages of a
+// MAP_SHARED region are written back to the file by vmaunmap().
+//
+// addr must be page-aligned and length positive; length is rounded up to
+// whole pages and clamped to the end of the region so that neighbouring
+// regions are never touched. When nothing is left of the region, the file
+// reference is dropped and the slot is released.
+//
+// Returns 0 on success, -1 on invalid arguments or if no region contains
+// addr.
+uint64
+sys_munmap(void)
+{
+  uint64 addr;
+  int sz;
+
+  argaddr(0, &addr);
+  argint(1, &sz);
+
+  // vmaunmap() panics on an unaligned address, so reject it here. A
+  // non-positive length would otherwise make v->sz grow below.
+  if (sz <= 0 || (addr % PGSIZE) != 0)
+    return -1;
+  sz = PGROUNDUP(sz);
+
+  struct proc *p = myproc();
+  struct vma *v = 0;
+  for (int i = 0; i < NVMA; i++) {
+    if (p->vma[i].valid && p->vma[i].addr <= addr && addr < p->vma[i].addr + p->vma[i].sz) {
+      v = &p->vma[i];
+      break;
+    }
+  }
+
+  if (v == 0) {
+    return -1;
+  }
+
+  // Never unmap past the end of this region.
+  uint64 end = v->addr + v->sz;
+  if (addr + sz > end) {
+    sz = end - addr;
+  }
+
+  // A range strictly inside the region would split it in two.
+  if (addr > v->addr && addr + sz < end) {
+    return -1;
+  }
+
+  // Must run before v->addr / v->offset change: vmaunmap() uses them to
+  // compute the file offset of each page it writes back.
+  vmaunmap(p->pagetable, addr, sz, v);
+
+  if (addr == v->addr) {
+    // Unmapped from the front: the region now starts later in the file.
+    v->offset += sz;
+    v->addr += sz;
+  }
+  v->sz -= sz;
+
+  if (v->sz <= 0) {
+    fileclose(v->f);
+    v->valid = 0;
+  }
+
+  return 0;
+}
+
+// Page-fault handler for mmap()ed regions (lazy allocation).
+//
+// va is the faulting virtual address. If it lies inside one of the current
+// process's regions, allocate a zeroed page, fill it from the backing file
+// and map it at the page containing va with permissions derived from the
+// region's prot.
+//
+// Returns 0 on success. Returns -1 if no region contains va, the region
+// allows neither reads nor writes, the page is already mapped (the fault is
+// then a protection violation, not a missing page), or memory runs out.
+int
+vmatryalloc(uint64 va) {
+  struct proc *p = myproc();
+  struct vma *v = 0;
+
+  for (int i = 0; i < NVMA; i++) {
+    if (p->vma[i].valid && p->vma[i].addr <= va && va < p->vma[i].addr + p->vma[i].sz) {
+      v = &p->vma[i];
+      break;
+    }
+  }
+
+  if (v == 0) {
+    printf("vmatryalloc(): cannot find vma contain specific addr %p\n", va);
+    return -1;
+  }
+
+  // Translate the mmap() protection into PTE bits. RISC-V reserves the
+  // "writable but not readable" encoding, so PROT_WRITE implies PTE_R.
+  int perm = PTE_U;
+  if (v->prot & PROT_READ)
+    perm |= PTE_R;
+  if (v->prot & PROT_WRITE)
+    perm |= PTE_R | PTE_W;
+  if ((perm & (PTE_R | PTE_W)) == 0)
+    return -1;
+
+  // The fault can hit anywhere inside a page; map and read whole pages.
+  uint64 pgva = PGROUNDDOWN(va);
+
+  // A fault on a page that is already present is an access the mapping does
+  // not permit; mapping it again would be a remap.
+  pte_t *pte = walk(p->pagetable, pgva, 0);
+  if (pte != 0 && (*pte & PTE_V))
+    return -1;
+
+  void *pa = kalloc();
+  if (pa == 0)
+    return -1;
+  // Bytes past the end of the file stay zero.
+  memset(pa, 0, PGSIZE);
+
+  begin_op();
+  ilock(v->f->ip);
+  readi(v->f->ip, 0, (uint64)pa, v->offset + (pgva - v->addr), PGSIZE);
+  iunlock(v->f->ip);
+  end_op();
+
+  if (mappages(p->pagetable, pgva, PGSIZE, (uint64)pa, perm) < 0) {
+    kfree(pa);
+    return -1;
+  }
+
+  return 0;
+}
+
+// Remove the pages of region v that lie in [va, va + nbytes) from pagetable
+// and free their physical memory. va must be page-aligned.
+//
+// Pages that were never faulted in are skipped. For a MAP_SHARED region,
+// pages with the dirty bit set are written back to the file first, at the
+// file offset that corresponds to their position in the region.
+void
+vmaunmap(pagetable_t pagetable, uint64 va, uint64 nbytes, struct vma *v)
+{
+  uint64 a;
+  pte_t *pte;
+
+  if((va % PGSIZE) != 0)
+    panic("vmaunmap: not aligned");
+
+  // freeproc() calls this before it checks whether the process has a page
+  // table at all; with none there is nothing to unmap.
+  if(pagetable == 0)
+    return;
+
+  for(a = va; a < va + nbytes; a += PGSIZE){
+    if((pte = walk(pagetable, a, 0)) == 0)
+      continue;
+    if((*pte & PTE_V) == 0)
+      continue;
+    if(PTE_FLAGS(*pte) == PTE_V)
+      panic("vmaunmap: not a leaf");
+
+    uint64 pa = PTE2PA(*pte);
+    if ((*pte & PTE_D) && (v->flags & MAP_SHARED)) {
+      // Position of this page inside the region. a is never below v->addr
+      // for the callers in this file, and off is unsigned anyway.
+      uint64 off = a - v->addr;
+      uint64 n = PGSIZE;
+      if (off + PGSIZE > (uint64)v->sz) {
+        // the last page is not full
+        n = v->sz - off;
+      }
+      begin_op();
+      ilock(v->f->ip);
+      writei(v->f->ip, 0, pa, v->offset + off, n);
+      iunlock(v->f->ip);
+      end_op();
+    }
+    kfree((void*)pa);
+    *pte = 0;
+  }
+}
\ No newline at end of file
diff --git a/kernel/trap.c b/kernel/trap.c
index 512c850..263726e 100644
--- a/kernel/trap.c
+++ b/kernel/trap.c
@@ -67,6 +67,11 @@ usertrap(void)
     syscall();
   } else if((which_dev = devintr()) != 0){
     // ok
+  } else if(r_scause() == 13 || r_scause() == 15) {
+    if (vmatryalloc(r_stval()) != 0) {
+      printf("usertrap(): vma lazy allocation error!\n");
+      setkilled(p);
+    }
   } else {
     printf("usertrap(): unexpected scause %p pid=%d\n", r_scause(), p->pid);
     printf("            sepc=%p stval=%p\n", r_sepc(), r_stval());
diff --git a/user/user.h b/user/user.h
index 4d398d5..eaf7e4e 100644
--- a/user/user.h
+++ b/user/user.h
@@ -22,6 +22,8 @@ int getpid(void);
 char* sbrk(int);
 int sleep(int);
 int uptime(void);
+void* mmap(const void*, int, int, int, int, int);
+int munmap(const void*, int);
 
 // ulib.c
 int stat(const char*, struct stat*);
diff --git a/user/usys.pl b/user/usys.pl
index 01e426e..d23b9cc 100755
--- a/user/usys.pl
+++ b/user/usys.pl
@@ -36,3 +36,5 @@ entry("getpid");
 entry("sbrk");
 entry("sleep");
 entry("uptime");
+entry("mmap");
+entry("munmap");