Multitasking Revisiting

PGD for All tasks

要怎麼做到每個 task 的 virtual address 獨立呢?很直覺的方式就是為每個 task 都建立各自的 page table,並且在 context switch 的時候進行轉換,而這些資訊最適合保存在每個 task 的資料結構中

include/schedule.h

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
// NOTE(review): neither limit is referenced in the snippets shown here;
// presumably per-task caps on user/kernel pages — confirm against the users.
#define MAX_USER_PAGES      16
#define MAX_KERNEL_PAGES    16

// Per-task address-space state.
struct mm_struct {
    // Physical address of the task's PGD; create_pgd() treats 0 as
    // "not allocated yet" and update_pgd() is called with this value.
    uint64_t pgd;
};

// Task descriptor (other members elided in this excerpt).
struct task_t {
    ...
    // Per-task address-space state; holds the task's PGD.
    struct mm_struct mm;
};

...

// Implemented in src/schedule.S: writes `pgd` to TTBR0_EL1 and invalidates the TLB.
void update_pgd(uint64_t pgd);

src/schedule.S

1
2
3
4
5
6
7
8
// void update_pgd(uint64_t pgd)
// Writes x0 (the `pgd` argument of the C prototype) to TTBR0_EL1, then
// invalidates the TLB and synchronizes before returning.
.global update_pgd
update_pgd:
    dsb ish  // ensure earlier writes have completed
    msr ttbr0_el1, x0  // switch the translation table base address
    tlbi vmalle1is  // invalidate all TLB entries
    dsb ish  // ensure completion of the TLB invalidation
    isb  // clear pipeline
    ret

User Program Loader

include/schedule.h

1
// Initial EL0 stack pointer: 8 bytes below 0x0000ffffffffe000.
// do_exec() maps the page containing this address and loads it into sp_el0.
#define USTACK_ADDR (0x0000ffffffffe000 - 8)

src/schedule.c

user_program 這個 process 將 user library 中的 code 複製到 0x1000 (一個 User Space 的位址) (同時要一塊 page 指向 0x1000),並且將 EL0 的 stack 指到 User Space 的最上面,將 PGD 換掉,最後跳回 EL0

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
// Task entry: hand the embedded user image to do_exec(), placing it at 0x1000.
void user_program(){
    // Symbols marking the first byte and one-past-the-last byte of the image.
    extern uint64_t _binary_user_img_start;
    extern uint64_t _binary_user_img_end;

    const uint64_t image_base = (uint64_t)&_binary_user_img_start;
    const uint64_t image_size = (uint64_t)&_binary_user_img_end - image_base;

    do_exec(image_base, image_size, 0x1000);
}

// Scheduler setup (other steps elided in this excerpt).
void schedule_init() {
    ...
    // Create the task that runs user_program(); 10 is its second argument
    // (sys_fork passes a task priority in that position).
    privilege_task_create(user_program, 10);
    ...
}

int do_exec(uint64_t start, uint64_t size, uint64_t pc) {
    void* code_page = map_page(current_task, pc);
    void* stack_page = map_page(current_task, USTACK_ADDR);
    if (!code_page || !stack_page) return -1;

    // copy code to pc
    uint8_t* pc_ptr = (uint8_t*)code_page;
    uint8_t* code_ptr = (uint8_t*)start;
    for (uint64_t i = 0; i < size; i++) {
        *(pc_ptr + i) = *(code_ptr + i);
    }

    asm volatile("msr sp_el0, %0" : : "r"(USTACK_ADDR));
    asm volatile("msr elr_el1, %0": : "r"(pc));
    asm volatile("msr spsr_el1, %0" : : "r"(SPSR_EL1_VALUE));

    update_pgd(current_task->mm.pgd);

    asm volatile("eret");

    return 0;
}

User Space Page Allocation

將一個 page 的 physical address (屬於指定的 task) map 到指定的 user_addr

src/mm.c

map_page 會在要一個 page 的同時,建立相對應的 PGD, PUD, PMD, PTE

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
// Back `user_addr` in `task`'s page table with a page, creating any missing
// PGD/PUD/PMD/PTE tables on the way. Returns the value produced by
// create_page() (the page's kernel virtual address), or NULL on failure.
void* map_page(struct task_t* task, uint64_t user_addr) {
    // Table index of user_addr at each translation level.
    const uint64_t l0 = (user_addr >> PGD_SHIFT) & PD_MASK;
    const uint64_t l1 = (user_addr >> PUD_SHIFT) & PD_MASK;
    const uint64_t l2 = (user_addr >> PMD_SHIFT) & PD_MASK;
    const uint64_t l3 = (user_addr >> PTE_SHIFT) & PD_MASK;

    // Each helper returns NULL when handed a NULL table, so a failure at any
    // level falls through to the final return.
    uint64_t* table = create_pgd(task);
    table = create_page_table(table, l0);
    table = create_page_table(table, l1);
    table = create_page_table(table, l2);
    return create_page(table, l3);
}

// Return the kernel virtual address of `task`'s PGD, allocating the PGD first
// when task->mm.pgd is still 0. Returns NULL if that allocation fails.
void* create_pgd(struct task_t* task) {
    uint64_t root = task->mm.pgd;
    if (root == 0) {
        void* fresh = page_alloc();
        if (fresh == NULL) {
            return NULL;
        }
        // mm.pgd keeps the physical address of the table.
        root = virtual_to_physical((uint64_t)fresh);
        task->mm.pgd = root;
    }
    return (void*)(root + KERNEL_VIRT_BASE);
}

// Return the next-level table referenced by table[idx], allocating it and
// installing a PD_TABLE entry when the slot is empty. Returns NULL when
// `table` is NULL or the allocation fails.
void* create_page_table(uint64_t* table, uint64_t idx) {
    if (!table) {
        return NULL;
    }

    uint64_t* slot = &table[idx];
    if (*slot == 0) {
        void* fresh = page_alloc();
        if (!fresh) {
            return NULL;
        }
        *slot = virtual_to_physical((uint64_t)fresh) | PD_TABLE;
    }

    const uint64_t next_phys = *slot & PAGE_MASK;
    return (void*)(next_phys + KERNEL_VIRT_BASE);
}

// Return the kernel virtual address of the page referenced by pte[idx],
// allocating the page and installing its entry when the slot is empty.
// Returns NULL when `pte` is NULL or the allocation fails.
void* create_page(uint64_t* pte, uint64_t idx) {
    if (!pte) {
        return NULL;
    }

    uint64_t* slot = &pte[idx];
    if (*slot == 0) {
        void* fresh = page_alloc();
        if (!fresh) {
            return NULL;
        }
        *slot = virtual_to_physical((uint64_t)fresh) | PTE_NORAL_ATTR | PD_ACCESS_PERM_RW;
    }

    const uint64_t page_phys = *slot & PAGE_MASK;
    return (void*)(page_phys + KERNEL_VIRT_BASE);
}

搜尋可用的 page,回傳 page 的 virtual address

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
// Take one free page and return its kernel virtual address, or NULL when
// get_free_page() reports none (returns 0). Decrements remain_page on success.
void* page_alloc() {
    const uint64_t phys = get_free_page();
    if (!phys) {
        return NULL;
    }
    remain_page--;
    return (void*)(phys | KERNEL_VIRT_BASE);
}

在 page booking list 中搜尋可用的 page,並清零

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
// Set `size` bytes starting at `addr` to zero; a size of 0 or less writes nothing.
void memzero(uint8_t* addr, int size) {
    uint8_t* cursor = addr;
    int remaining = size;
    while (remaining > 0) {
        *cursor++ = 0;
        remaining--;
    }
}

// Scan page[] from first_aval_page up to (not including) last_aval_page for
// an entry marked AVAL, mark it USED, zero the page, and return its physical
// address. Returns 0 when no entry is available.
uint64_t get_free_page() {
    int idx = first_aval_page;
    while (idx < last_aval_page) {
        if (page[idx].used != AVAL) {
            idx++;
            continue;
        }
        page[idx].used = USED;
        uint64_t phys = idx * PAGE_SIZE;
        // The page is cleared through its kernel virtual alias.
        memzero((uint8_t*)(phys + KERNEL_VIRT_BASE), PAGE_SIZE);
        return phys;
    }
    return 0;
}

Multitasking

src/schedule.c

1
2
3
4
5
// Switch to `next` (earlier steps elided in this excerpt).
void context_switch(struct task_t* next) {
    ...
    // Install next's page table before switching the register context.
    update_pgd(next->mm.pgd);
    switch_to(&prev->cpu_context, &next->cpu_context);
}

src/exception.c

kernel stack 的複製與上次一致,user stack 改為複製 parent task 在 page table 上的 page 到 child 上

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/*
 * fork system call: create a child task that resumes in return_from_fork with
 * a copy of the parent's kernel stack contents and user pages.
 *
 * trapframe: the caller's saved registers on the parent's kernel stack.
 * On return the parent's x0 holds the child's id and the child's x0 holds 0.
 */
void sys_fork(struct trapframe* trapframe) {
    struct task_t* parent_task = get_current_task();

    int child_id = privilege_task_create(return_from_fork, parent_task->priority);
    // NOTE(review): assumes privilege_task_create reports failure with a
    // negative id — confirm. Without this check a failed create would index
    // task_pool out of bounds.
    if (child_id < 0) {
        trapframe->x[0] = -1;
        return;
    }
    struct task_t* child_task = &task_pool[child_id];

    // copy kernel stack
    char* child_kstack = &kstack_pool[child_task->id][KSTACK_TOP_IDX];
    char* parent_kstack = &kstack_pool[parent_task->id][KSTACK_TOP_IDX];
    uint64_t kstack_offset = parent_kstack - (char*)trapframe;
    // Inclusive bound: the child's trapframe starts at
    // child_kstack - kstack_offset, so that byte has to be copied as well.
    for (uint64_t i = 0; i <= kstack_offset; i++) {
        *(child_kstack - i) = *(parent_kstack - i);
    }
    // place child's kernel stack to right place
    child_task->cpu_context.sp = (uint64_t)child_kstack - kstack_offset;

    // copy all user pages
    fork_pgd(parent_task, child_task);

    // place child's user stack to right place
    struct trapframe* child_trapframe = (struct trapframe*) child_task->cpu_context.sp;
    child_trapframe->sp_el0 = trapframe->sp_el0;

    // Return values: 0 in the child, the child's id in the parent.
    child_trapframe->x[0] = 0;
    trapframe->x[0] = child_task->id;
}

src/mm.c

依序從 PGD 一路複製到 PTE

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/*
 * Duplicate every page referenced by `target_pte` into `dest_pte`.
 *
 * For each non-empty source entry a page is obtained in the same slot of
 * `dest_pte` via create_page() and the source page's contents are copied.
 * Copying stops at the first page that cannot be obtained (create_page()
 * returns NULL for a NULL table or a failed allocation), so memcpy is never
 * called with a NULL destination.
 */
void fork_pte(uint64_t* target_pte, uint64_t* dest_pte) {
    for (int i = 0; i < 512; i++) {
        if (!target_pte[i]) {
            continue;
        }
        uint64_t* target_page = (uint64_t*)((target_pte[i] & PAGE_MASK) | KERNEL_VIRT_BASE);
        uint64_t* dest_page = create_page(dest_pte, i);
        if (dest_page == NULL) {
            return;
        }
        memcpy((void*)dest_page, (void*)target_page, PAGE_SIZE);
    }
}

/*
 * Duplicate every PTE table referenced by `target_pmd` into `dest_pmd`.
 *
 * Stops at the first slot whose destination table cannot be obtained
 * (create_page_table() returns NULL for a NULL table or a failed allocation)
 * instead of passing a NULL table further down.
 */
void fork_pmd(uint64_t* target_pmd, uint64_t* dest_pmd) {
    for (int i = 0; i < 512; i++) {
        if (!target_pmd[i]) {
            continue;
        }
        uint64_t* target_pte = (uint64_t*)((target_pmd[i] & PAGE_MASK) | KERNEL_VIRT_BASE);
        uint64_t* dest_pte = create_page_table(dest_pmd, i);
        if (dest_pte == NULL) {
            return;
        }
        fork_pte(target_pte, dest_pte);
    }
}

/*
 * Duplicate every PMD table referenced by `target_pud` into `dest_pud`.
 *
 * Stops at the first slot whose destination table cannot be obtained
 * (create_page_table() returns NULL for a NULL table or a failed allocation)
 * instead of passing a NULL table further down.
 */
void fork_pud(uint64_t* target_pud, uint64_t* dest_pud) {
    for (int i = 0; i < 512; i++) {
        if (!target_pud[i]) {
            continue;
        }
        uint64_t* target_pmd = (uint64_t*)((target_pud[i] & PAGE_MASK) | KERNEL_VIRT_BASE);
        uint64_t* dest_pmd = create_page_table(dest_pud, i);
        if (dest_pmd == NULL) {
            return;
        }
        fork_pmd(target_pmd, dest_pmd);
    }
}

/*
 * Copy the whole user address space of `target` into `dest`, walking from the
 * PGD down to the individual pages.
 *
 * Does nothing when `target` has no PGD yet (mm.pgd == 0, the same
 * "not allocated" convention create_pgd() uses); otherwise the walk would
 * read physical address 0 as if it were a page table. Stops early when a
 * destination table cannot be obtained.
 */
void fork_pgd(struct task_t* target, struct task_t* dest) {
    if (!target->mm.pgd) {
        return;
    }
    uint64_t* target_pgd = (uint64_t*)((target->mm.pgd & PAGE_MASK) | KERNEL_VIRT_BASE);
    uint64_t* dest_pgd = create_pgd(dest);
    if (dest_pgd == NULL) {
        return;
    }
    for (int i = 0; i < 512; i++) {
        if (!target_pgd[i]) {
            continue;
        }
        uint64_t* target_pud = (uint64_t*)((target_pgd[i] & PAGE_MASK) | KERNEL_VIRT_BASE);
        uint64_t* dest_pud = create_page_table(dest_pgd, i);
        if (dest_pud == NULL) {
            return;
        }
        fork_pud(target_pud, dest_pud);
    }
}

user_addr_to_page_addr 會拿到 user address 對應的真實 memory address (在 kernel 的 virtual address)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
// Copy `size` bytes from `src` to `dest`, one byte at a time in ascending
// address order. Returns nothing (unlike the standard library memcpy).
void memcpy(void *dest, void *src, uint64_t size) {
    uint8_t *out = (uint8_t*)dest;
    uint8_t *in = (uint8_t*)src;
    uint64_t remaining = size;

    while (remaining != 0) {
        *out = *in;
        out++;
        in++;
        remaining--;
    }
}

/*
 * Translate a user virtual address into the kernel virtual address of the
 * page that backs it, by walking the page table rooted at `pgd_phy`.
 *
 * user_addr: user-space virtual address to look up
 * pgd_phy:   physical address of the PGD to walk
 *
 * Returns the page's kernel virtual address (page base, without the offset
 * inside the page), or 0 when `pgd_phy` is 0 or any level of the walk hits an
 * empty entry. Following an empty entry would read an unrelated page as if it
 * were a table.
 */
uint64_t user_addr_to_page_addr(uint64_t user_addr, uint64_t pgd_phy) {
    // Index of user_addr at each level, in walk order: PGD, PUD, PMD, PTE.
    const uint64_t index[4] = {
        (user_addr >> PGD_SHIFT) & PD_MASK,
        (user_addr >> PUD_SHIFT) & PD_MASK,
        (user_addr >> PMD_SHIFT) & PD_MASK,
        (user_addr >> PTE_SHIFT) & PD_MASK,
    };
    // Clears the low 12 bits of an entry, as the original `~0xFFF` did.
    const uint64_t addr_mask = ~(uint64_t)0xFFF;

    if (!pgd_phy) {
        return 0;
    }

    uint64_t* table = (uint64_t*)(pgd_phy | KERNEL_VIRT_BASE);
    for (int level = 0; level < 3; level++) {
        uint64_t entry = table[index[level]];
        if (!entry) {
            return 0;
        }
        table = (uint64_t*)((entry & addr_mask) | KERNEL_VIRT_BASE);
    }

    uint64_t leaf = table[index[3]];
    if (!leaf) {
        return 0;
    }
    return (leaf & addr_mask) | KERNEL_VIRT_BASE;
}
comments powered by Disqus