系统调用的实现

以write()函数为例，write()函数就是一个系统调用函数，其定义在linux-0.11/lib/write.c中，这个文件的内容非常简单

// linux-0.11/lib/write.c

/* Defined before <unistd.h> is included; presumably this is what makes the
 * _syscallN macros visible -- confirm against the guard in unistd.h. */
#define __LIBRARY__
#include <unistd.h>
/* Expands to the complete definition of
 * int write(int fd, const char *buf, off_t count); no semicolon is needed
 * because the expansion ends with the function body's closing brace. */
_syscall3(int,write,int,fd,const char *,buf,off_t,count)

就三行，开启了write()函数的定义

函数的定义并不像我们平常写C语言那样，它是用宏去定义的，所以会发现这里完全不符合C语言的规定，甚至没有分号

这段宏展开后是什么样的，需要先去查看一下宏定义，_syscall3()的宏定义在linux-0.11/include/unistd.h下，将其截取出来

// linux-0.11/include/unistd.h

/* System-call number of write(); _syscall3 picks it up through __NR_##name. */
#define __NR_write	4

/*
 * _syscall3 - generate the user-space stub for a three-argument system call.
 *
 * type            return type of the generated function
 * name            function name; also selects the call number __NR_<name>
 * atype a, ...    type and name of each of the three parameters
 *
 * The stub executes "int $0x80" with the call number in eax (constraint "0"
 * ties it to the same register as output operand 0, which is "=a") and the
 * three arguments, cast to long, in ebx, ecx and edx. The value left in eax
 * is read back into __res.
 *
 * Returns (type)__res when __res is non-negative; otherwise stores -__res in
 * errno and returns -1.
 */
#define _syscall3(type,name,atype,a,btype,b,ctype,c) \
type name(atype a,btype b,ctype c) \
{ \
long __res; \
__asm__ volatile ("int $0x80" \
	: "=a" (__res) \
	: "0" (__NR_##name),"b" ((long)(a)),"c" ((long)(b)),"d" ((long)(c))); \
if (__res>=0) \
	return (type) __res; \
errno=-__res; \
return -1; \
}

有点复杂，简单来说它相当于一个函数模板，定义了一个系统调用函数的大致模样，而这个_syscall3就描述了需要接收3个参数的系统调用函数的样子。

所以write.c的宏展开后是这样的

// write.c

/*
 * Result of expanding _syscall3(int,write,int,fd,const char *,buf,off_t,count).
 *
 * Raises software interrupt 0x80 with __NR_write in eax and fd, buf, count
 * in ebx, ecx, edx. Returns the value left in eax when it is non-negative;
 * otherwise stores its negation in errno and returns -1.
 */
int write(int fd, const char* buf, off_t count)
{
    long __res;
    __asm__ volatile (
        "int $0x80"
        /* output: eax is read back into __res after the interrupt returns */
        : "=a" (__res)
        /* inputs: "0" reuses operand 0's register (eax) for the call number;
         * "b", "c", "d" place the arguments in ebx, ecx, edx */
        : "0" (__NR_write), "b" ((long)(fd)), "c" ((long)(buf)), "d" ((long)(count))
    );
    if (__res >= 0)
        return (int) __res;
    /* negative result: expose the positive error code through errno */
    errno = -__res;
    return -1;
}

__NR_write的宏定义已经在/include/unistd.h中给出，就是 4

这就是write()作为系统调用函数的完整定义了，主要还是用汇编实现的，重点是在eax中填入功能号然后调用0x80中断

这个中断，使得CPL=3的write()函数访问了idt中编号为0x80的中断描述符，这个中断描述符存储了DPL、段选择子和偏移地址，其中DPL=3使得write()能够访问这个中断描述符，而段选择子（0x0008）的低两位为0，它被加载进CS之后CPL就变成了0，从用户态转为了内核态，由此可以访问内核中的东西。

一句话总结int 0x80的伟大之处就是，令CPL为3的指令访问了DPL为3的中断描述符，并强制修改了接下来的CPL为0

以上就是系统调用从用户态转换到内核态的过程。接下来谈谈int 0x80的实现

在idt中0x80处的中断描述符是如何定义的，这要从linux-0.11/init/main.c开始，简单截取一下

// linux-0.11/init/main.c

void main(void) 
{
    ...
    /* Among the code elided here, sched_init() is the call that installs
     * the int 0x80 system-call gate (see the sched.c excerpt). */
    sched_init();
    ...
}

sched这个单词是schedule的缩写，这里指的是调度，也就是说sched_init()这个函数就是调度程序的初始化

这个函数定义在linux-0.11/kernel/sched.c下，省略了一些东西，重点关注一下最后的set_system_gate()函数

// sched.c

void sched_init(void)
{
	...
	/* Register system_call as the handler for interrupt vector 0x80. */
	set_system_gate(0x80,&system_call);
}

set_system_gate()的实现也是个宏，其定义在linux-0.11/include/asm/system.h中

// system.h

/*
 * set_system_gate - fill IDT entry n so that it dispatches to addr.
 *
 * Forwards to _set_gate() with type 15 (0xF, a 32-bit trap gate in the
 * Intel gate-descriptor encoding) and dpl 3, the privilege level that
 * lets user-mode code raise the interrupt with an int instruction.
 */
#define set_system_gate(n,addr) \
	_set_gate(&idt[n],15,3,addr)

_set_gate()的实现也在同一文件中

// system.h

/*
 * _set_gate - write an 8-byte gate descriptor at gate_addr.
 *
 * gate_addr  address of the descriptor slot to fill
 * type       gate type, placed in bits 8..11 of the attribute word
 * dpl        descriptor privilege level, placed in bits 13..14
 * addr       handler entry point
 *
 * On entry edx holds addr and eax holds 0x00080000, i.e. 0x0008 in the
 * upper 16 bits. The instructions then:
 *   movw %dx,%ax   copy the low 16 bits of addr into ax,
 *                  so eax = 0x0008 : addr[15:0]
 *   movw %0,%dx    overwrite the low 16 bits of edx with the attribute word
 *                  0x8000 + (dpl<<13) + (type<<8),
 *                  so edx = addr[31:16] : attributes
 *   movl %eax,%1   store eax into the first 4 bytes of the descriptor
 *   movl %edx,%2   store edx into the following 4 bytes
 */
#define _set_gate(gate_addr,type,dpl,addr) \
__asm__ ("movw %%dx,%%ax\n\t" \
	"movw %0,%%dx\n\t" \
	"movl %%eax,%1\n\t" \
	"movl %%edx,%2" \
	: \
	: "i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
	"o" (*((char *) (gate_addr))), \
	"o" (*(4+(char *) (gate_addr))), \
	"d" ((char *) (addr)),"a" (0x00080000))

整理一下，set_system_gate(0x80,&system_call)最终被替换为

// dpl = 3, type = 15, gate_addr = &idt[0x80], addr = &system_call
// Expansion of _set_gate(): eax starts as 0x00080000 and edx as addr; the
// four instructions assemble the descriptor in eax/edx and store it.
__asm__ (
    "movw %%dx,%%ax\n\t"    /* eax = 0x0008 : addr[15:0] */
    "movw %0,%%dx\n\t"      /* edx = addr[31:16] : attribute word (operand %0) */
    "movl %%eax,%1\n\t"     /* first 4 bytes of the descriptor */
    "movl %%edx,%2"         /* following 4 bytes of the descriptor */
    :
    : "i" ((short) (0x8000+(dpl<<13)+(type<<8))),
    "o" (*((char *) (gate_addr))),
    "o" (*(4+(char *) (gate_addr))),
    "d" ((char *) (addr)),"a" (0x00080000)
)

虽然看起来挺麻烦，但实际上很简单，就是填写 IDT（中断描述符表），将 system_call 函数地址写到 0x80 对应的中断描述符中，也就是在中断 0x80 发生后，自动调用system_call()函数

最终这个idt中的中断描述符长这样

63                               48 47 46  44 43     40 39 37 36        32
+----------------------------------+--+----+--+--------+-+-+-+----------+
|                                  |  |    |  |        |     |          |
|       &system_call[31:16]        |P |DPL |S |  TYPE  |0 0 0| Reserved |
|                                  |1 | 11 |0 | 1|1|1|1|     |          |
+-------------+--+--+--+--+--------+--+----+--+--------+-+-+-+----------+
31                               16 15                                  0
+----------------------------------+------------------------------------+
|                                  |                                    |
|         Segment Selector         |           &system_call[15:0]       |
|              0x0008              |                                    |
+----------------------------------+------------------------------------+

可见，系统调用的 CS 段选择符指向了内核代码段描述符，段内偏移是 system_call 的地址。

然后来说说system_call()函数

这个system_call()函数是用纯汇编实现，定义在linux-0.11/kernel/system_call.s中：

# Excerpt of the int 0x80 entry point. On entry eax holds the system-call
# number and ebx, ecx, edx hold up to three arguments.
nr_system_calls = 72
.globl _system_call # exported so that code in other files can reference it
_system_call:
	cmpl $nr_system_calls-1,%eax # reject call numbers above the last table index
	ja bad_sys_call
	push %ds
	push %es
	push %fs
	pushl %edx
	pushl %ecx		# push %ebx,%ecx,%edx as parameters
	pushl %ebx		# to the system call
	movl $0x10,%edx		# set up ds,es to kernel space
	mov %dx,%ds
	mov %dx,%es		# ds=es=0x10 (kernel data segment in Linux 0.11's GDT, not the code selector 0x08)
	movl $0x17,%edx		# 0x17 = index 2, TI=1 (LDT), RPL=3: fs reaches the caller's user data
	mov %dx,%fs
	# _sys_call_table(, %eax, 4) = _sys_call_table + 4*eax
	# scaled by 4 because each table entry is a 4-byte function pointer
	call _sys_call_table(,%eax,4)	# dispatch through the table, e.g. eax=__NR_write selects entry 4
	pushl %eax		# keep the handler's return value on the stack
	movl _current,%eax
	cmpl $0,state(%eax)		# state
	jne reschedule		# state field is non-zero: go to reschedule
	cmpl $0,counter(%eax)		# counter
	je reschedule		# counter field is zero: go to reschedule

其中_sys_call_table是一个全局函数指针数组，定义在linux-0.11/include/linux/sys.h

// linux-0.11/include/linux/sys.h
...
fn_ptr sys_call_table[] = { ..., sys_write, ... }; // sys_write sits at index 4, matching __NR_write

这个数组里还有其他很多函数的地址，比如sys_close、sys_fork。
fn_ptr的定义在linux-0.11/include/linux/sched.h中

// linux-0.11/include/linux/sched.h
...
typedef int (*fn_ptr)(); // pointer to a function returning int (parameter list unspecified)

很明显，fn_ptr也就是function_pointer是函数指针类型