__attribute__((aligned(4096))) struct pml4e kernel_pml4[512];

struct pml4e *kernel_pml4ptr;
uint64_t kernel_space;

extern uintptr_t _kernel_end;
extern uintptr_t _kernel_phys_end;
extern uintptr_t _asm_physmap;
static void pstackdump() {
	printk("phys stack, %llu total pages, %llu top pages, stack = %llx, first free = %llx\n",
		phys.pages,		/* assumed field: running total of free pages */
		phys.stack->pages,
		(uintptr_t)phys.stack,
		phys.stack->page_addr[phys.stack->pages-1]);
}
static void dump_pml4(paddr_t space) {
	int i;
	uint64_t *pa;
	uint64_t *e = PHY2VIRTP(space);

	pa = (uint64_t *)(0x21000 + 384*8);
	printk("*(%llx) should be 0x2A003, is %llx\n", pa, *pa);
	pa = PHY2VIRTP(0x21000 + 384*8); /* same entry, read via the physmap */
	printk("*(%llx) should be 0x2A003, is %llx\n", pa, *pa);

	printk("base = %llx, virt = %llx", space, (uintptr_t)e);
	for (i = 0; i < 512; i++) {
		if (i % 16 == 0) {
			printk("\n%2x:", i);
		}
		printk(" %llx", e[i]);
	}
	printk("\n");
}
void test_address(void *a) {
	uint64_t *high = (uint64_t *)a;

	*high = 0x1badd00dULL;
	printk("testing 1badd00d (0x%16llx) = 0x%llx\n", high, *high);
	*high = 0xcafebabeULL;
	printk("testing cafebabe (0x%16llx) = 0x%llx\n", high, *high);
}
void pfree(paddr_t pa) {
	if (phys.stack == NULL) {
		/* no stack? use freed page */
		phys.stack = PHY2VIRTP(pa);
		printk("initializing phys.stack at %llx -> %llx\n", pa, (uintptr_t)phys.stack);
		phys.stack->pages = 0;
		return;
	}

	/* if the stack page is full, use the new free
	 * page for the next stack page
	 */
	if (phys.stack->pages == 510) {
		void *old = phys.stack;
		phys.stack = PHY2VIRTP(pa);
		phys.stack->pages = 0;
		phys.stack->next = old;
		return;
	}
	phys.stack->page_addr[phys.stack->pages++] = pa;
}
/* we never return the zero page, so we return 0 if there's a
 * problem */
paddr_t palloc(void) {
	paddr_t pa;

	if (phys.stack == NULL) {
		panic("palloc() failed\n");
	}

	pa = phys.stack->page_addr[--phys.stack->pages];
	if (phys.stack->pages == 0) {
		/* we're about to hand out the stack page itself, so
		 * move on to the next stack page */
		phys.stack = phys.stack->next;
	}

	/* clear the page here so callers don't have to */
	memset(PHY2VIRTP(pa), 0, 4096);
	return pa;
}
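
/* A sketch of the stack-page layout pfree()/palloc() assume. The real
 * definition lives elsewhere; the field names below come from the code
 * above, but the struct name and field order are assumptions. One
 * 4096-byte page holds a small header plus 510 free-frame addresses:
 *
 *	struct pstack {
 *		uint64_t pages;		// entries used in page_addr[]
 *		struct pstack *next;	// previous stack page, or NULL
 *		paddr_t page_addr[510];	// free page frame addresses
 *	};
 *
 * 8 + 8 + 510*8 = 4096 bytes, which is why pfree() chains a new stack
 * page exactly when pages reaches 510. */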
#define MEM_ALIGNP(addr, align) mem_align((uintptr_t)(addr), (align))

uintptr_t mem_align(uintptr_t a, size_t alignment) {
	if (a % alignment) {
		a += alignment - a % alignment;
	}
	return a;
}
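
/* Example: mem_align(0x1234, 0x1000) == 0x2000, while an already-aligned
 * address comes back unchanged: mem_align(0x2000, 0x1000) == 0x2000.
 * The modulo form works for any alignment, not just powers of two. */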
void *kvalloc(size_t bytes) {
	void *base;
	size_t pages;

	pages = bytes / 4096 + (bytes % 4096 != 0);
	/* TODO check if not enough room */
	base = (void *)memory.kend;
	memory.kend += pages * 4096;
	return base;
}
void *kpalloc(size_t bytes) {
	uintptr_t base;
	size_t pages;

	pages = bytes / 4096 + (bytes % 4096 != 0);
	/* TODO check if not enough room */
	base = mem_align(memory.kphysend, 4096);
	memory.kphysend = base + pages * 4096;
	return (void *)base;
}
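
/* Worked example of the page rounding above: 4096 bytes is exactly one
 * page (4096/4096 + 0), 4097 bytes is two (4096/4096 + 1). The
 * "+ (bytes % 4096 != 0)" term is just division by 4096, rounded up. */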
#define CHECK_FLAG(flags,bit) ((flags) & (1 << (bit)))
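/* e.g. CHECK_FLAG(mbi->flags, 6) tests Multiboot flags bit 6, which the
 * spec defines as "mmap_addr and mmap_length are valid". */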
void phys_init(struct multiboot_info *mbi) {
	struct multiboot_mmap_entry *mmap;
	struct multiboot_mmap_entry *base;
	uint64_t cr3;

	struct phys p = { 0 };
	struct memory m = { 0 };

	memory.kend = mem_align(_kernel_end, 4096);
	memory.kphysend = _kernel_phys_end;

	/* TODO don't know if we need this now */
	/* this is where we will map in a page of memory to adjust */
	phys.page_map = kvalloc(4096 * 2);

	/* phys.kmap needs to be the virtual address of the page table entry
	 * that maps the page stack;
	 * so, phys.kmap -> vadecode(&phys.stack)
	 */

	if (!CHECK_FLAG(mbi->flags, 6)) {
		panic("no memory map available\n");
	}

	cr3 = getcr3();
	printk("cr3 = %llx\n", cr3);
	printk("new cr3 = %llx\n", (uintptr_t)&kernel_pml4 - _kernel_vma);
	printk("cr3 = %llx\n", cr3);
	//memset((void *)0x21000, 0, 4096);

	/* loop over the multiboot info, free pages */

	/* TODO refactor this into a function to get max physical address */
	base = (struct multiboot_mmap_entry *) (uint64_t) mbi->mmap_addr;

	memory.physmap = _asm_physmap; /* where the early boot initialized the physmap */

	for (mmap = base; (unsigned long) mmap < mbi->mmap_addr + mbi->mmap_length;
	     mmap = (struct multiboot_mmap_entry *) ((unsigned long) mmap + mmap->size + sizeof (mmap->size))) {
		if (mmap->type == 1 && mmap->addr + mmap->len > memory.phys_max) {
			memory.phys_max = mmap->addr + mmap->len;
		}
	}

	printk("phys_max = %llx, physmap = %llx, %llx 1GB pages\n", memory.phys_max,
	       memory.physmap,
	       memory.phys_max / GB(1) + ((memory.phys_max % GB(1)) != 0));
	/* set up kernel physical memory map */
	/* need one for each 512 GB */
	/* for now, assume that these are available after the kernel physical
	 * memory, and below 4 MB, which is still identity mapped.
	 * TODO hardcode the first one into the kernel, map the lowest
	 * 512 GB there, then use that to get more if needed.
	 * also mark all of these as "global" so they don't get flushed when
	 * an address space change reloads cr3
	 */

	/*
	 * get free memory and add it to the free page stack.
	 * free memory is generally anything above memory.kphysend.
	 * we won't need to call kpalloc after this point; we can call palloc
	 * for new physical pages.
	 * we may need to figure out how to get mapping for DMA pages.
	 * I would really prefer to use 2 MB pages for everything, but the
	 * system really doesn't like that.
	 * And in any case, I saw some math on usenet that implies that the
	 * average wasted space would be greater with 2 MB pages than with
	 * 4 KB pages plus the extra page tables.
	 */
	for (mmap = base; (unsigned long) mmap < mbi->mmap_addr + mbi->mmap_length;
	     mmap = (struct multiboot_mmap_entry *) ((unsigned long) mmap + mmap->size + sizeof (mmap->size))) {
		printk(" size = 0x%x, base_addr = 0x%llx," " length = 0x%llx, type = 0x%x\n",
		       mmap->size, mmap->addr, mmap->len, mmap->type);
		printk(" addr = 0x%18llx," " length = 0x%llx, type = 0x%x\n", mmap->addr, mmap->len, mmap->type);

		if (mmap->type == 1) {
			uintptr_t start, end;

			start = mmap->addr;
			end = start + mmap->len;

			/* TODO we don't map in anything below 16 MB so we can
			 * use it for PCI and DMA */
			if (start < MB(16)) {
				start = MB(16);
			}
			start = mem_align(start, 4096);

			if (start < end - 4096) {
				printk(" freeing %llu pages, %llu KB, starting from 0x%llx\n",
				       (end - start)/4096, (end - start)/1024, start);
			}

			while (start <= end - 4096) {
				/* TODO don't free pages used by modules/initrd */
				pfree(start);
				start += 4096;
			}
		}
	}
	/* copy the PML4 somewhere in kernel memory */
	/* need to know the physical address */
	MEM_KERNEL = (getcr3() & ~(uintptr_t)0xfff);

	MEM_KERNEL = create_addrspace();
	switch_address_space(MEM_KERNEL);
	/* TODO we can free the top level pml4. We really should copy all of the
	 * initial blocks into kernel memory; as it is, this didn't really get us
	 * much */
}
paddr_t create_addrspace(void) {
	int i;
	paddr_t pa;
	struct pml4 *space;

	pa = palloc(); /* will zero the memory */
	space = PHY2VIRTP(pa);
	/* higher half copied into every address space */
	for (i = 128; i < 512; i++) {
		space->pml4e[i] = KERNEL_PML4->pml4e[i];
	}
	printk("created address space %llx\n", pa);
	return pa;
}
#define PML4i(vaddr) ( (((vaddr_t) (vaddr)) >> 39) & 0x1ff )
#define PDPi(vaddr)  ( (((vaddr_t) (vaddr)) >> 30) & 0x1ff )
#define PDi(vaddr)   ( (((vaddr_t) (vaddr)) >> 21) & 0x1ff )
#define PTi(vaddr)   ( (((vaddr_t) (vaddr)) >> 12) & 0x1ff )
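
/* Worked example: for vaddr = 0xffff800000100000,
 *	PML4i = 256 (bit 47 is set, so the upper half of the PML4),
 *	PDPi = 0, PDi = 0,
 *	PTi = 256 (0x100000 >> 12 == 0x100),
 * and the byte offset within the page is vaddr & 0xfff == 0. */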
paddr_t getphysaddr(uint64_t space, vaddr_t vaddr) {
	struct vaddr_decode ds;
	decode_vaddr(&ds, space, vaddr);
	return ds.present ? ds.page + ds.offset : 0;
}
/* TODO this can probably be done inline, but this makes it a bit easier to
 * debug for now.
 * should this return an integer or something to indicate if it was a complete
 * decode or not?
 */
#define VADDRMASK 0xfffffffffffff000
void decode_vaddr(struct vaddr_decode *ds, uint64_t space, vaddr_t vaddr) {
	ds->pml4e = (struct pml4e){ 0 };
	ds->pdpe = (struct pdpe){ 0 };
	ds->pde = (struct pde){ 0 };
	ds->pte = (struct pte){ 0 };

	/* broken down virtual address */
	ds->pml4offset = (vaddr >> 39) & 0x1ff;	/* index into page map level 4 table */
	ds->pdpoffset = (vaddr >> 30) & 0x1ff;	/* index into page directory pointer table */
	ds->pdoffset = (vaddr >> 21) & 0x1ff;	/* index into page directory table */
	ds->ptoffset = (vaddr >> 12) & 0x1ff;	/* index into page table */
	ds->offset = vaddr & 0xfff;		/* offset within the page, i.e. the low 12 bits */

	ds->present = 0; /* assume not present */

	/* pml4 physical and virtual base addresses */
	ds->pml4_phys = space & VADDRMASK;
	ds->pml4t = PHY2VIRTP(ds->pml4_phys);

	printk("space = %llx ds->pml4t = %llx ", space, (uintptr_t)ds->pml4t);
	printk(" pml4offset = %x\n", ds->pml4offset);

	/* entry in page map level 4, at pml4t + 8 * pml4offset */
	ds->pml4e = ds->pml4t->pml4e[ds->pml4offset];
	/* entry in page directory pointer table,
	 * at virt(pml4e.addr) + 8 * pdpoffset;
	 * does not exist unless pml4e.present.
	 * pdpe.ps == 1 means 1 GB pages, and the next level doesn't exist:
	 * offset = vaddr & 0x3fffffff, physical page = entry address with
	 * the offset bits cleared */
	if (!ds->pml4e.present) {
		return;
	}
	ds->pdpt_phys = ds->pml4e.addr << 12;
	ds->pdpe = ((struct pdpt *)PHY2VIRTP(ds->pdpt_phys))->pdpe[ds->pdpoffset];

	if (ds->pdpe.ps == 1) {
		/* 1 GB page */
		paddr_t mask = GB(1)-1;
		ds->pagesize = GB(1);
		ds->offset = vaddr & mask;
		ds->present = ds->pdpe.present;
		ds->page = ds->pdpe.addr << 12;
		return;
	}
	/* entry in page directory table,
	 * at virt(pdpe.addr) + 8 * pdoffset;
	 * does not exist unless pdpe.present.
	 * pde.ps == 0 means 4 KB pages and the next level exists if present;
	 * pde.ps == 1 means 2 MB pages, the page table doesn't exist, and
	 * offset = vaddr & 0x1fffff (the low 21 bits),
	 * physical page address = the entry address with the offset bits cleared
	 */
	if (!ds->pdpe.present) {
		return;
	}
	ds->pdt_phys = ds->pdpe.addr << 12;
	ds->pde = ((struct pdt *)PHY2VIRTP(ds->pdt_phys))->pde[ds->pdoffset];

	if (ds->pde.ps == 1) {
		/* 2 MB page */
		paddr_t mask = MB(2)-1;
		ds->pagesize = MB(2);
		ds->offset = vaddr & mask;
		ds->page = ds->pde.addr << 12;
		ds->present = ds->pde.present;
		return;
	}
	/* entry in page table,
	 * at virt(pde.addr) + 8 * ptoffset;
	 * does not exist unless pde.present.
	 * offset = vaddr & 0xfff (the low 12 bits),
	 * physical page address = pte.addr << 12, i.e. vaddr & ~0xfff mapped
	 * through the table */
	if (!ds->pde.present) {
		return;
	}
	ds->pt_phys = ds->pde.addr << 12;
	ds->pte = ((struct pt *)PHY2VIRTP(ds->pt_phys))->pte[ds->ptoffset];

	ds->pagesize = 4096;
	ds->page = ds->pte.addr << 12;
	/* offset is good already */
	ds->present = ds->pte.present;
}
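
/* Usage sketch (illustrative): finding the physical address behind a
 * kernel pointer by walking the current address space, following the
 * getcr3()-masking pattern used in phys_init() above:
 *
 *	struct vaddr_decode ds;
 *	decode_vaddr(&ds, getcr3() & ~(uintptr_t)0xfff, (vaddr_t)ptr);
 *	if (ds.present)
 *		printk("phys = %llx\n", ds.page + ds.offset);
 */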
/* (struct vaddr_decode result fields)
 *	int present;	physical address actually mapped
 *	paddr_t paddr;	decoded physical address
 */
void print_decode(struct vaddr_decode *ds, uint64_t space, vaddr_t vaddr) {
	decode_vaddr(ds, space, vaddr);
	printk("%llx %llx %u offsets %x %x %x %x,\n tables %llx %llx %llx",
	       space, vaddr, ds->level,
	       ds->pml4offset, ds->pdpoffset, ds->pdoffset, ds->ptoffset,
	       ds->pdpt_phys, ds->pdt_phys, ds->pt_phys);
	if (ds->present) {
		printk(" page %llx", ds->page);
	}
	printk("\n");
}
#define PT_PAMASK 0x000FFFFFFFFFF000

/* map a physical page to a virtual address */
/* alternatively, we could just let it page fault and map it in then */
void *map_page(uint64_t space, vaddr_t addr, paddr_t pa, unsigned int flags) {
	struct vaddr_decode ds;
	paddr_t page_table;
	int trace;

	trace = flags & MAP_TRACE;

	//if (space == 0) space = kernel_space;
	if (space == 0) {
		panic("attempted to map into a null space\n");
	}

	/* if addr == 0 then caller doesn't care where it's mapped */
	if (addr == 0) {
		addr = (vaddr_t)kvalloc(4096);
	}
	if (trace) {
		printk("map_page space = %llx vaddr %llx paddr %llx\n", space, addr, pa);
	}

	/* break up the vaddr and get pointers to tables */
	decode_vaddr(&ds, space, addr);

	//printk("map page\n");
	flags &= 0xC1C; /* only accept the user, rw, PWT, PCD, and available bits */
#define PML4E ( *(uint64_t *)&(ds.pml4t->pml4e[ds.pml4offset]) )
	/* do we need to make a pdp? */
	if (!ds.pml4e.present) {
		/* need a new pdpt for this address */
		page_table = palloc();
		ds.pdpt_phys = page_table;
		if (trace) {
			printk(" at paddr %llx", page_table);
		}
		//ds.pdp = PHY2VIRTP(page_table);
		//*(uint64_t *)&ds.pml4t->pml4e[ds.pml4offset] = (page_table | flags);
		PML4E = (page_table | flags);
		if (trace) {
			printk(" ds.pml4t->pml4e[%x] = %llx\n", ds.pml4offset, PML4E);
		}
	}
#define PDPE ( *(uint64_t *)&((struct pdpt *)PHY2VIRTP(ds.pdpt_phys))->pdpe[ds.pdpoffset] )
	if (!ds.pdpe.present) {
		/* need a new pdt for this address */
		page_table = palloc();
		ds.pdt_phys = page_table;
		if (trace) {
			printk(" at paddr %llx", page_table);
		}
		//ds.pd = PHY2VIRTP(page_table);
		PDPE = (page_table | flags);
		if (trace) {
			printk(" ds.pdpt->pdpe[%x] = %llx\n", ds.pdpoffset, PDPE);
		}
	}
#define PDE ( *(uint64_t *)&((struct pdt *)PHY2VIRTP(ds.pdt_phys))->pde[ds.pdoffset] )
	if (!ds.pde.present) {
		/* need a new pt for this address */
		page_table = palloc();
		ds.pt_phys = page_table;
		if (trace) {
			printk(" at paddr %llx", page_table);
		}
		//ds.pt = PHY2VIRTP(page_table);
		PDE = (page_table | flags);
		if (trace) {
			printk(" ds.pd->pde[%x] = %llx\n", ds.pdoffset, PDE);
		}
	}
	/* TODO check if the page is present in the table already; if it is, we
	 * may be losing a physical page that should be freed */
#define PTE ( *(uint64_t *)&((struct pt *)PHY2VIRTP(ds.pt_phys))->pte[ds.ptoffset] )
	//printk("mapping addr %llx to %llx\n", (uintptr_t)addr, (uintptr_t)(pa | flags));
	if (trace) {
		printk(" ds.pt->pte[%x] = %llx\n", ds.ptoffset, pa | flags);
	}
	if (pa) {
		PTE = (pa | flags);
		//*(uint64_t *)&((struct pt *)PHY2VIRTP(ds.pt_phys))->pte[ds.ptoffset] = (pa | flags);
		flush_tlb((void *)addr);
	} else {
		/* a not-present page, will need to be faulted in */
		printk("warning: mapping a non present page\n");
		PTE = (pa | (flags & 0xffe));
		flush_tlb((void *)addr);
	}

	if (trace) {
		print_decode(&ds, space, addr);
	}
	return (void *)addr;
}
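
/* For reference, the low bits of an x86-64 page-table entry that the masks
 * above select from:
 *	bit 0  P    present		bit 4  PCD  cache disable
 *	bit 1  R/W  writable		bit 5  A    accessed
 *	bit 2  U/S  user		bit 6  D    dirty
 *	bit 3  PWT  write-through	bits 9-11   available to the OS
 * so "flags & 0xffe" above clears only the present bit, leaving the rest
 * of the entry for the page fault handler to inspect. */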
paddr_t unmap_page(uint64_t space, vaddr_t addr) {
	struct vaddr_decode ds;

	/* break up the vaddr and get pointers to tables */
	decode_vaddr(&ds, space, addr);
	if (!ds.present) {
		return 0;
	}
	/* TODO pteptr not set by decode */
	ds.pteptr->present = 0;
	flush_tlb((void *)addr);
	return ds.page;
}
void vunmap(uint64_t space, vaddr_t addr, paddr_t pa, size_t length) {
	if (space == 0) {
		panic("attempted to unmap in a null space\n");
	}

	while (length >= 4096) {
		/* freeing these pages is the responsibility of the caller;
		 * they might be shared, so we don't/can't do that here
		 */
		unmap_page(space, addr);
		addr += 4096;
		length -= 4096;
	}
}
/* what about non-aligned addresses? */
/* assume that the source is already mapped in */
/* destination space, dest start address (in the destination space),
 * from start address in the current space, number of bytes to copy.
 * this function really only works in kernel space, because it relies
 * on the physical memory being mapped in.
 * we could, in principle, have a 4 KB virtual address space and
 * map the pages in as needed
 */
void copy_to_space(uint64_t dspace, void *to, void *from, size_t length) {
	vaddr_t dst, src; /* to and from */
	//paddr_t dpage, spage; /* to and from physical */
	size_t dfrag = 0, sfrag = 0; /* fragment size of non-aligned copies */

	dst = (vaddr_t) to;
	src = (vaddr_t) from;

	while (length > 0) {
		size_t n; /* number of bytes to copy */
		/* get locally mapped virtual addresses of physical pages */
		//to = PHY2VIRTP(getphysaddr(dspace, dst));
		sfrag = 4096 - (src & 0xFFF); /* bytes to bring src up to a page boundary */
		dfrag = 4096 - (dst & 0xFFF); /* bytes to bring dst up to a page boundary */

		/* copy up to the nearer page boundary, but no more than length */
		n = sfrag < length ? sfrag : length;
		n = dfrag < n ? dfrag : n;

		/* copy up to length/fragment from source bytes */
		memmove((void *)dst, (void *)src, n);

		dst += n;
		src += n;
		length -= n;
	}
}
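
/* Usage sketch (illustrative; child_space and the addresses are made up):
 *
 *	copy_to_space(child_space, (void *)0x7fffffffe000, &argv0, sizeof argv0);
 *
 * Per the comment above, "from" is read through the current mappings and
 * "to" is meant to be an address in dspace, copied one page fragment at a
 * time so neither pointer needs to be page aligned. Note the physical
 * translation of dst is still commented out in the loop, so today this
 * only works where the two spaces share the kernel mappings. */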
#undef PML4E /* the function-local version used by map_page() above */
#define PML4E(space, addr) ((struct pdp *)(((struct pml4 *)space)->pml4e[PML4i(addr)]))
void vmap(uint64_t space, vaddr_t addr, paddr_t pa, size_t length, unsigned int flags) {
	if (space == 0) {
		panic("attempted to map into a null space\n");
	}

	while (length >= 4096) {
		map_page(space, addr, pa, flags);
		addr += 4096;
		pa += 4096;
		length -= 4096;
	}
}
void vmapz(uint64_t space, vaddr_t addr, size_t length, unsigned int flags) {
	paddr_t pa;

	if (space == 0) {
		panic("attempted to map into a null space\n");
	}

	//printk("mapping %zx bytes at %llx\n", length, addr);
	while (length >= 4096) {
		pa = palloc();
		map_page(space, addr, pa, flags);
		addr += 4096;
		length -= 4096;
	}
	//printk("mapped\n");
}
/* TODO should probably specify pages rather than bytes */
void *kalloc(size_t bytes) {
	void *va = kvalloc(bytes);
	vmapz(MEM_KERNEL, (vaddr_t)va, bytes, MEM_RW);
	return va;
}
/* TODO we can free the space, but we really can't free the virtual address,
 * so we could keep a list of free virtual addresses in the kernel space
 * for kalloc to reuse...
 */
void kfree(void *va) {
	unmap_page(MEM_KERNEL, (vaddr_t)va);
}
	/* test an allocation */
	printk("starting vmem test\n");
	kvtest = kvalloc(4096);
	printk("allocated virtual block at %llx\n", (uintptr_t)kvtest);
	pa = palloc();
	printk("allocated physical block at %llx\n", pa);
	vmap(MEM_KERNEL, (vaddr_t) kvtest, pa, 4096, MAP_TRACE);
	test_address(kvtest);
	vunmap(MEM_KERNEL, (vaddr_t) kvtest, pa, 4096);

	vmap(MEM_KERNEL, (vaddr_t) GB(32), pa, 4096, MAP_TRACE);
	test_address((void *)GB(32));
	vunmap(MEM_KERNEL, (vaddr_t) GB(32), pa, 4096);
	printk("ending vmem test\n");

	/* should page fault */
	test_address((void *)GB(32));
#define NUMOBJSTACKS 10
/* kernel object stack:
 * carefully constructed to take 16 bytes, I hope.
 * This isn't really a requirement, but since
 * they'll be allocated out of kernel objects
 * anyway, they will be rounded up to a power of two
 */
struct kobjstack {
	void *top;		/* pointer to first free object */
	uint32_t freeobjs;	/* number of free objects in the stack */
	uint32_t size;		/* for allocating objects of a given size */
	struct spinlock_t spinlock;
};

struct kobjstack kobjstacks[NUMOBJSTACKS];
749 printk("initializing virtual memory\n");
750 /* rebuild the boot page tables */
752 /* build the kernel object allocator */
753 for (i=0; i < NUMOBJSTACKS; i++) {
754 kobjstacks[i].top = 0;
755 kobjstacks[i].freeobjs = 0;
756 kobjstacks[i].size = 1 << (i+3);
757 spinlock_init(&kobjstacks[i].spinlock);
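
	/* with size = 1 << (i+3) and NUMOBJSTACKS == 10 the size classes are
	 * 8, 16, 32, 64, 128, 256, 512, 1024, 2048, and 4096 bytes: the
	 * smallest is one pointer, the largest one page, matching the
	 * comment above koalloc() below */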
void dumpostacks(int f, int t) {
	int i;

	if (f < 0) f = 0;
	if (t >= NUMOBJSTACKS) t = NUMOBJSTACKS-1;
	for (i = f; i <= t; i++) {
		/* TODO spinlock the stack for the print */
		printk("stack %4u: %u free, top %llx\n", kobjstacks[i].size,
		       kobjstacks[i].freeobjs, (uintptr_t)kobjstacks[i].top);
	}
}
void mem_init(struct multiboot_info *mbi) {
	phys_init(mbi);
	/* physical page allocator is working now */
}

/*
 * kernel objects, grouped by size in powers of two:
 * minimum size is a pointer (i.e. 8 bytes),
 * maximum size is a page, i.e. 4096 bytes.
 * when free, each object points to the next free object.
 * each stack size allocator is itself a kernel object
 */
static size_t nextpowerof2(size_t x) {
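	/* the original body isn't shown here; a minimal sketch matching
	 * what the callers need (round up to the next power of two,
	 * leaving exact powers of two unchanged): */
	size_t p = 1;
	while (p < x)
		p <<= 1;
	return p;
}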
void *koalloc(size_t size) {
	void *obj;
	struct kobjstack *ostack;
	int stack_index;

	/* Assembly: bsr rax, rdi; bsr rbx, rdi; cmp rax, rbx; je .done; shl rax, 1; .done ret; */
	//printk("allocating %u", size);
	size = nextpowerof2(size);
	//printk(" using size %u", size);

	/* sizes start at 8 bytes, so stack_index = log2(size) - 3;
	 * the original may compute this differently (see the bsr note) */
	stack_index = __builtin_ctzl(size) - 3;
	//printk(" from stack %u\n", stack_index);
	//dumpostacks(stack_index,stack_index);

	ostack = &kobjstacks[stack_index];
	if (ostack->size != size) {
		panic("no kernel object stack for size %u\n", size);
	}

	//printk("locking stack\n");
	spinlock_acquire(&ostack->spinlock);
	//printk("got lock\n");
	if (!ostack->freeobjs) {
		/* stack is empty: carve a fresh page into free objects */
		void *newpage;
		uintptr_t free;

		newpage = kalloc(4096);
		/* TODO should probably map this in to high memory */
		for (free = (uintptr_t)newpage; free < (uintptr_t)newpage + 4096; free += ostack->size) {
			*(uintptr_t *)free = free + ostack->size;
		}
		ostack->top = newpage;
		ostack->freeobjs = 4096/ostack->size;
		//printk("ostack init stack %4u, %u free\n", size, ostack->freeobjs);
	}

	obj = ostack->top;
	ostack->top = (void *)*(uintptr_t *)obj;
	ostack->freeobjs--;

	spinlock_release(&ostack->spinlock);
	//printk("ostack alloc stack %4u, %u free, top %llx, obj %llx\n", ostack->size, ostack->freeobjs, ostack->top, obj);
	//dumpostacks(stack_index,stack_index);
	return obj;
}
/* it might be worth an idle thread doing a qsort to see
 * if we can release any memory if there's at least
 * a page worth of free objects
 */
void kofree(void *obj, size_t size) {
	struct kobjstack *ostack;
	int stack_index;

	/* Assembly: bsr rax, rdi; bsr rbx, rdi; cmp rax, rbx; je .done; shl rax, 1; .done ret; */
	//printk("freeing %u", size);
	size = nextpowerof2(size);
	//printk(" using size %u", size);

	/* sizes start at 8 bytes, so stack_index = log2(size) - 3;
	 * the original may compute this differently (see the bsr note) */
	stack_index = __builtin_ctzl(size) - 3;
	//printk(" from stack %u\n", stack_index);
	//dumpostacks(stack_index,stack_index);

	ostack = &kobjstacks[stack_index];

	spinlock_acquire(&ostack->spinlock);

	/* push the object onto the free list */
	*(vaddr_t *)obj = (vaddr_t)ostack->top;
	ostack->top = obj;
	ostack->freeobjs++;

	spinlock_release(&ostack->spinlock);
	//dumpostacks(stack_index,stack_index);
	printk("ostack free stack %4u, %u free, top %llx, obj %llx\n", ostack->size,
	       ostack->freeobjs, (uintptr_t)ostack->top, (uintptr_t)obj);
}