How do we give each task its own independent virtual address space? The intuitive approach is to build a separate page table for every task and switch between them on context switch; the most natural place to keep this information is in each task's data structure.
```c
#define MAX_USER_PAGES 16
#define MAX_KERNEL_PAGES 16

struct mm_struct {
    uint64_t pgd;    // physical address of this task's PGD
};

struct task_t {
    ...
    struct mm_struct mm;
};

...

void update_pgd(uint64_t pgd);
```
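For this lazy allocation scheme to work, the task-creation path has to start every task with an empty `mm`. A minimal sketch (`mm_init` is a hypothetical helper, not from the source; assume it is called from `privilege_task_create`):

```c
// Hypothetical helper called during task creation: a pgd of 0 means
// "no page table yet", so the task's PGD is allocated lazily on its
// first map_page() call (see create_pgd below).
static void mm_init(struct task_t* task) {
    task->mm.pgd = 0;
}
```

`update_pgd` itself is implemented in assembly: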
```asm
.global update_pgd
update_pgd:
    dsb ish             // ensure preceding page-table writes have completed
    msr ttbr0_el1, x0   // switch the translation base address
    tlbi vmalle1is      // invalidate all TLB entries
    dsb ish             // ensure completion of the TLB invalidation
    isb                 // flush the pipeline
    ret
```
```c
#define USTACK_ADDR (0x0000ffffffffe000 - 8)
```
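The `- 8` matters: 0x0000ffffffffe000 is a page boundary and the stack grows downward, so an sp placed exactly on the boundary would push into the unmapped page below it. Subtracting 8 puts USTACK_ADDR inside the page 0x0000ffffffffd000–0x0000ffffffffdfff, which is exactly the single page do_exec maps for the stack.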
The user_program process copies the code in the user library to 0x1000 (a user-space address), which requires mapping a page at 0x1000 first. It then points the EL0 stack at the top of user space, swaps in the task's PGD, and finally returns to EL0.
```c
void user_program() {
    extern uint64_t _binary_user_img_start;
    extern uint64_t _binary_user_img_end;
    uint64_t begin = (uint64_t)&_binary_user_img_start;
    uint64_t end = (uint64_t)&_binary_user_img_end;
    do_exec(begin, end - begin, 0x1000);
}

void schedule_init() {
    ...
    privilege_task_create(user_program, 10);
    ...
}

int do_exec(uint64_t start, uint64_t size, uint64_t pc) {
    void* code_page = map_page(current_task, pc);
    void* stack_page = map_page(current_task, USTACK_ADDR);
    if (!code_page || !stack_page) return -1;

    // copy the user code into the page backing pc
    uint8_t* pc_ptr = (uint8_t*)code_page;
    uint8_t* code_ptr = (uint8_t*)start;
    for (uint64_t i = 0; i < size; i++) {
        *(pc_ptr + i) = *(code_ptr + i);
    }

    asm volatile("msr sp_el0, %0"   : : "r"(USTACK_ADDR));    // user stack pointer
    asm volatile("msr elr_el1, %0"  : : "r"(pc));             // return address for eret
    asm volatile("msr spsr_el1, %0" : : "r"(SPSR_EL1_VALUE)); // target PSTATE (EL0)
    update_pgd(current_task->mm.pgd);                         // switch to the task's PGD
    asm volatile("eret");
    return 0;
}
```
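`SPSR_EL1_VALUE` is not shown here; a plausible definition (an assumption, not taken from the source) that makes the `eret` drop to EL0 with interrupts enabled:

```c
// Assumed value: M[3:0] = 0b0000 selects EL0t, and leaving the DAIF
// bits (9:6) clear unmasks debug, SError, IRQ, and FIQ in user mode.
#define SPSR_EL1_VALUE 0x0
```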
map_page maps a page of physical memory (belonging to the given task) to the requested user_addr. While allocating that page, it also creates the corresponding PGD, PUD, PMD, and PTE levels on demand.
```c
// allocate a new page in the user's address space, return the page's kernel virtual address
void* map_page(struct task_t* task, uint64_t user_addr) {
    uint64_t pgd_idx = (user_addr & (PD_MASK << PGD_SHIFT)) >> PGD_SHIFT;
    uint64_t pud_idx = (user_addr & (PD_MASK << PUD_SHIFT)) >> PUD_SHIFT;
    uint64_t pmd_idx = (user_addr & (PD_MASK << PMD_SHIFT)) >> PMD_SHIFT;
    uint64_t pte_idx = (user_addr & (PD_MASK << PTE_SHIFT)) >> PTE_SHIFT;
    uint64_t* pgd = create_pgd(task);
    uint64_t* pud = create_page_table(pgd, pgd_idx);
    uint64_t* pmd = create_page_table(pud, pud_idx);
    uint64_t* pte = create_page_table(pmd, pmd_idx);
    return create_page(pte, pte_idx);
}

// create the PGD if it does not exist yet, return its kernel virtual address
void* create_pgd(struct task_t* task) {
    if (!task->mm.pgd) {
        void* page = page_alloc();
        if (page == NULL) return NULL;
        task->mm.pgd = virtual_to_physical((uint64_t)page);
    }
    return (void*)(task->mm.pgd + KERNEL_VIRT_BASE);
}

// create a table entry pointing to the next-level table, return that table
void* create_page_table(uint64_t* table, uint64_t idx) {
    if (table == NULL) return NULL;
    if (!table[idx]) {
        void* page = page_alloc();
        if (page == NULL) return NULL;
        table[idx] = virtual_to_physical((uint64_t)page) | PD_TABLE;
    }
    return (void*)((table[idx] & PAGE_MASK) + KERNEL_VIRT_BASE);
}

// create the final page, return its kernel virtual address
void* create_page(uint64_t* pte, uint64_t idx) {
    if (pte == NULL) return NULL;
    if (!pte[idx]) {
        void* page = page_alloc();
        if (page == NULL) return NULL;
        pte[idx] = virtual_to_physical((uint64_t)page) | PTE_NORMAL_ATTR | PD_ACCESS_PERM_RW;
    }
    return (void*)((pte[idx] & PAGE_MASK) + KERNEL_VIRT_BASE);
}
```
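The shift and mask macros are not shown in this snippet. With the usual 4 KB granule, 48-bit VA, four-level configuration they would look like the following (an assumption about the surrounding headers, not code from the source):

```c
#define PAGE_SIZE        0x1000              // 4 KB pages
#define PAGE_MASK        0xfffffffff000      // bits [47:12]: a descriptor's output address
#define PD_MASK          0x1ff               // 9 index bits per level (512 entries per table)
#define PGD_SHIFT        39                  // VA bits [47:39]
#define PUD_SHIFT        30                  // VA bits [38:30]
#define PMD_SHIFT        21                  // VA bits [29:21]
#define PTE_SHIFT        12                  // VA bits [20:12]
#define KERNEL_VIRT_BASE 0xffff000000000000  // kernel's linear alias of physical memory
```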
page_alloc searches for an available page and returns the page's virtual address.
```c
// get one page, return its kernel-space virtual address
void* page_alloc() {
    uint64_t page_phy_addr = get_free_page();
    if (page_phy_addr == 0) {
        return NULL;
    }
    uint64_t page_virt_addr = page_phy_addr | KERNEL_VIRT_BASE;
    remain_page--;
    return (void*)page_virt_addr;
}
```
get_free_page searches the page bookkeeping list for an available page and zeroes it.
```c
void memzero(uint8_t* addr, int size) {
    for (int i = 0; i < size; i++) {
        *(addr + i) = 0;
    }
}

uint64_t get_free_page() { // return the page's physical address
    for (int i = first_aval_page; i < last_aval_page; i++) {
        if (page[i].used == AVAL) {
            page[i].used = USED;
            uint64_t page_virt_addr = i * PAGE_SIZE + KERNEL_VIRT_BASE;
            uint64_t page_phy_addr = i * PAGE_SIZE;
            memzero((uint8_t*)page_virt_addr, PAGE_SIZE);
            return page_phy_addr;
        }
    }
    return 0;
}
```
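The `page` array itself is not shown. A minimal sketch of the bookkeeping it implies (`PAGE_FRAME_COUNT` and the exact field layout are assumptions; only the names used above come from the source):

```c
#define AVAL 0   // page frame is available
#define USED 1   // page frame has been handed out

struct page_struct {
    int used;
};

// one entry per physical page frame; first_aval_page and last_aval_page
// bound the frames the allocator may hand out, remain_page counts what's left
struct page_struct page[PAGE_FRAME_COUNT];
int first_aval_page, last_aval_page;
int remain_page;
```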
```c
void context_switch(struct task_t* next) {
    ...
    update_pgd(next->mm.pgd);   // switch TTBR0_EL1 before restoring the next task's registers
    switch_to(&prev->cpu_context, &next->cpu_context);
}
```
Copying the kernel stack works the same way as before; for the user stack, we instead copy every page the parent task has mapped in its page table over to the child.
```c
void sys_fork(struct trapframe* trapframe) {
    struct task_t* parent_task = get_current_task();
    int child_id = privilege_task_create(return_from_fork, parent_task->priority);
    struct task_t* child_task = &task_pool[child_id];

    // copy kernel stack
    char* child_kstack = &kstack_pool[child_task->id][KSTACK_TOP_IDX];
    char* parent_kstack = &kstack_pool[parent_task->id][KSTACK_TOP_IDX];
    uint64_t kstack_offset = parent_kstack - (char*)trapframe;
    for (uint64_t i = 0; i < kstack_offset; i++) {
        *(child_kstack - i) = *(parent_kstack - i);
    }
    // place child's kernel stack at the right place
    child_task->cpu_context.sp = (uint64_t)child_kstack - kstack_offset;

    // copy all user pages
    fork_pgd(parent_task, child_task);

    // fix up the child's trapframe: same user sp, fork() returns 0 in the child
    struct trapframe* child_trapframe = (struct trapframe*)child_task->cpu_context.sp;
    child_trapframe->sp_el0 = trapframe->sp_el0;
    child_trapframe->x[0] = 0;
    trapframe->x[0] = child_task->id;   // the parent gets the child's id
}
```
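The trapframe layout is whatever the exception-entry code pushes onto the kernel stack; a sketch consistent with the fields used above (the exact order is an assumption):

```c
// Assumed layout matching the register save/restore order of the
// exception entry code: 31 general-purpose registers plus the EL0
// stack pointer and the EL1 return state.
struct trapframe {
    uint64_t x[31];     // x0-x30; x[0] doubles as the syscall return value
    uint64_t sp_el0;    // user stack pointer at the time of the trap
    uint64_t elr_el1;   // user pc to return to on eret
    uint64_t spsr_el1;  // user PSTATE to restore on eret
};
```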
Copy level by level, from the PGD all the way down to the PTEs.
```c
void fork_pte(uint64_t* target_pte, uint64_t* dest_pte) {
    for (int i = 0; i < 512; i++) {
        if (target_pte[i]) {
            uint64_t* target_page = (uint64_t*)((target_pte[i] & PAGE_MASK) | KERNEL_VIRT_BASE);
            uint64_t* dest_page = create_page(dest_pte, i);
            memcpy((void*)dest_page, (void*)target_page, PAGE_SIZE);
        }
    }
}

void fork_pmd(uint64_t* target_pmd, uint64_t* dest_pmd) {
    for (int i = 0; i < 512; i++) {
        if (target_pmd[i]) {
            uint64_t* target_pte = (uint64_t*)((target_pmd[i] & PAGE_MASK) | KERNEL_VIRT_BASE);
            uint64_t* dest_pte = create_page_table(dest_pmd, i);
            fork_pte(target_pte, dest_pte);
        }
    }
}

void fork_pud(uint64_t* target_pud, uint64_t* dest_pud) {
    for (int i = 0; i < 512; i++) {
        if (target_pud[i]) {
            uint64_t* target_pmd = (uint64_t*)((target_pud[i] & PAGE_MASK) | KERNEL_VIRT_BASE);
            uint64_t* dest_pmd = create_page_table(dest_pud, i);
            fork_pmd(target_pmd, dest_pmd);
        }
    }
}

void fork_pgd(struct task_t* target, struct task_t* dest) {
    uint64_t* target_pgd = (uint64_t*)((target->mm.pgd & PAGE_MASK) | KERNEL_VIRT_BASE);
    uint64_t* dest_pgd = create_pgd(dest);
    for (int i = 0; i < 512; i++) {
        if (target_pgd[i]) {
            uint64_t* target_pud = (uint64_t*)((target_pgd[i] & PAGE_MASK) | KERNEL_VIRT_BASE);
            uint64_t* dest_pud = create_page_table(dest_pgd, i);
            fork_pud(target_pud, dest_pud);
        }
    }
}
```
user_addr_to_page_addr returns the real memory address (as a kernel virtual address) that corresponds to a given user address.
```c
void memcpy(void* dest, void* src, uint64_t size) {
    uint8_t* csrc = (uint8_t*)src;
    uint8_t* cdest = (uint8_t*)dest;
    for (uint64_t i = 0; i < size; i++)
        cdest[i] = csrc[i];
}

uint64_t user_addr_to_page_addr(uint64_t user_addr, uint64_t pgd_phy) {
    uint64_t pgd_idx = (user_addr & (PD_MASK << PGD_SHIFT)) >> PGD_SHIFT;
    uint64_t pud_idx = (user_addr & (PD_MASK << PUD_SHIFT)) >> PUD_SHIFT;
    uint64_t pmd_idx = (user_addr & (PD_MASK << PMD_SHIFT)) >> PMD_SHIFT;
    uint64_t pte_idx = (user_addr & (PD_MASK << PTE_SHIFT)) >> PTE_SHIFT;
    uint64_t* pgd = (uint64_t*)(pgd_phy | KERNEL_VIRT_BASE);
    uint64_t* pud = (uint64_t*)((pgd[pgd_idx] & ~0xFFF) | KERNEL_VIRT_BASE);
    uint64_t* pmd = (uint64_t*)((pud[pud_idx] & ~0xFFF) | KERNEL_VIRT_BASE);
    uint64_t* pte = (uint64_t*)((pmd[pmd_idx] & ~0xFFF) | KERNEL_VIRT_BASE);
    return (pte[pte_idx] & ~0xFFF) | KERNEL_VIRT_BASE;
}
```
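A typical use is in a syscall handler that receives a user-space pointer: translate it through the caller's page table, then access the memory via the kernel alias. A sketch (sys_uart_write and uart_send are assumed names, and the buffer is assumed not to cross a page boundary):

```c
// Hypothetical syscall: write a user buffer to the UART.
void sys_uart_write(struct trapframe* trapframe) {
    uint64_t user_buf = trapframe->x[0];
    uint64_t size = trapframe->x[1];
    struct task_t* task = get_current_task();
    // page base in kernel space + offset within the page
    uint64_t kernel_addr = user_addr_to_page_addr(user_buf, task->mm.pgd)
                         + (user_buf & 0xFFF);
    for (uint64_t i = 0; i < size; i++) {
        uart_send(*(char*)(kernel_addr + i));
    }
    trapframe->x[0] = size;  // return the number of bytes written
}
```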