--- /dev/null
+%macro zeropage32 1
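+ ; zero one 4 KB page starting at the address in %1
+ ; (leaves eax = 0, ecx = 0 and edi = %1 + 0x1000)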
+ mov edi, %1
+ xor eax, eax
+ mov ecx, 1024 ; 1024 dwords x 4 bytes = one 4 KB page
+ rep stosd
+%endmacro
+
+USE32
+
+section .multiboot
+
+global multiboot_magic:data
+global multiboot_infoptr:data
+global _multiboot_entry:function
+
+_multiboot_entry:
+ jmp start32
+
+ ; multiboot header needs to be 4 byte aligned.
+ ; it would be anyway I think, but make it explicit
+ align 4
+
+multiboot_magic:
+ dd 0x1BADB002 ; multiboot 1 magic
+ dd 0x00000007 ; flags: 4 KB aligned modules, memory and video information
+ dd -(0x1BADB002 + 0x00000007) ; checksum
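+ ; the checksum is chosen so magic + flags + checksum == 0 (mod 2^32),
+ ; i.e. -(0x1BADB002 + 0x00000007) = 0xE4524FF7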
+multiboot_infoptr:
+ dd 0 ; a place to store the multiboot info pointer until the kernel gets it
+
+section .text
+
+extern kernel_bootstrap
+
+PML4ADDR equ 0x10000
+
+align 4
+start32:
+ ; boot loader passes multiboot info in ebx and eax
+ mov [multiboot_infoptr], ebx
+ mov [multiboot_magic], eax
+
+ ; set up initial page tables
+ ; We could also just use 1GB pages for now and let the
+ ; actual memory manager handle it later, we just need
+ ; these tables set up so that we can run the kernel
+ ; in the higher half.
+
+ ; free memory at this point is pretty much everything normally free
+ ; below 1 MB
+ ; 0x1000 - 0x7ffff is all free, plus 0x80000 - bottom of EBDA
+
+ ; map the first four megabytes of physical memory twice
+ ; once to identity map it, once to map it into the high kernel
+ ; virtual memory area for a higher half kernel
+
+ ; 0x10000 PML4 (512 GB/entry)
+ ; 0x11000 PDPT identity map (1 GB/entry)
+ ; 0x12000 PDT identity map (2 MB/entry)
+ ; 0x13000 PT identity map 0-2 MB (4 KB/entry)
+ ; 0x14000 PT identity map 2-4 MB
+ ; 0x15000 PDPT kernel map for higher half (1 GB/entry)
+ ; 0x16000 PDT kernel map for higher half (2 MB/entry)
+ ; 0x17000 PT kernel map 0-2 MB
+ ; 0x18000 PT kernel map 2-4 MB
+ ; 0x19000 1GB PDPT physical low 512 GB map
+ ; 0x20000-0x2ffff 64KB initial stack
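+
+ ; worked example: with the kernel linked at 0xFFFFFFFF80000000 + 1 MB
+ ; (see the higher half PDPT below), virtual 0xFFFFFFFF80100000 decodes to
+ ; PML4 index 511, PDPT index 510, PD index 0, PT index 256, i.e. the chain
+ ; 0x10000 -> 0x15000 -> 0x16000 -> 0x17000 built below, ending at physical 0x100000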
+
+ ; zero the initial stack
+
+ mov edi, 0x20000
+ xor eax, eax
+ mov ecx, 0x4000 ; 16384 dwords = 64 KB (0x20000-0x2ffff)
+ rep stosd
+
+ mov esp, 0x30000 ; set to top of stack
+
+ ; The 3 at the end of these marks the page as present and read/write
+
+ ; PML4
+ zeropage32 PML4ADDR
+ mov edi, PML4ADDR ; address of initial PML4
+ mov [edi], dword 0x11003 ; first entry, covers virtual 0-512 GB
+ ; higher half map
+ mov [edi + 511 * 8], dword 0x15003 ; last entry, covers upper virtual 512 GB
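+ ; (PML4 slot 511 covers 0xFFFFFF8000000000 and up, which
+ ; includes the -2 GB window the kernel is linked into)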
+
+ ; identity map PDPT
+ zeropage32 0x11000
+ mov edi, 0x11000
+ mov [edi], dword 0x12003 ; virtual 0-1 GB
+
+ ; higher half PDPT
+ zeropage32 0x15000
+ mov edi, 0x15000
+ ; the kernel is linked at 0xFFFFFFFF80000000 + 1 MB
+ mov [edi + 510 * 8], dword 0x16003 ; virtual -2 GB to -1 GB
+
+ ; page directory
+ zeropage32 0x12000
+ mov edi, 0x12000
+ mov [edi], dword 0x13003 ; first 2 MB
+ mov [edi+8], dword 0x14003 ; second 2 MB
+
+ ; higher half PDT
+ zeropage32 0x16000
+ mov edi, 0x16000
+ mov [edi], dword 0x017003
+ mov [edi+8], dword 0x018003
+
+ ; page table
+ ; map the first 4 MB
+
+ ; Don't need to zero-page these, because we're going
+ ; to fill them anyway
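+ ; the two identity PTs (0x13000/0x14000) and the two kernel PTs
+ ; (0x17000/0x18000) are contiguous, so one 1024-entry loop fills both pairs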
+ mov edi, 0x13000
+ mov eax, 0x17000
+ mov ebx, 0x3
+ mov ecx, 1024 ; i.e. two tables worth
+.donext:
+ mov [edi], ebx
+ mov [eax], ebx
+ add ebx, 0x1000
+ add eax, 8
+ add edi, 8
+ loop .donext
+
+ mov eax, PML4ADDR
+ mov cr3, eax ; paging is off, so this is safe
+
+ ; enable PAE (CR4 bit 5)
+ mov eax, cr4
+ or eax, 0x20
+ mov cr4, eax
+
+ ; enable long mode: set LME (bit 8) in the EFER MSR (0xC0000080)
+ mov ecx, 0xC0000080
+ rdmsr
+ or eax, 0x100
+ wrmsr
+
+ ; enable paging (CR0.PG, bit 31); with EFER.LME set this activates long
+ ; mode, but we stay in 32-bit compatibility code until the far jump below
+ mov eax, cr0
+ or eax, 0x80000000
+ mov cr0, eax
+
+ ; Load the long mode GDT.
+ lgdt [GDT_load]
+
+ ; set the SS register. Probably not needed
+ mov ax, 0x18 ; 0x18 is 64 bit kernel data segment
+ mov ss, ax
+
+ ; jump to the 64 bit code segment
+ jmp 0x10:bootstrap64
+
+USE64
+align 8
+bootstrap64:
+ ; TODO set up the higher half map here. easier to do in 64 bit code
+
+ ; from the AMD manual...
+ ; System software must create at least one 64-bit TSS for use after
+ ; activating long mode, and it must execute the LTR instruction, in
+ ; 64-bit mode, to load the TR register with a pointer to the 64-bit TSS
+ ; that serves both 64-bit-mode programs and compatibility-mode
+ ; programs.
+
+ ; fill in the TSS descriptor (at TSS_selector) with the address of the TSS
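+ ; 64-bit system segment descriptor layout: base[15:0] at byte offset 2,
+ ; base[23:16] at byte 4, base[31:24] at byte 7, base[63:32] at bytes 8-11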
+ mov rax, qword TSS
+ mov rbx, qword TSS_selector
+ ; scatter the base address into the descriptor's base fields
+ mov [rbx + 2], ax ; base[15:0]
+ shr rax, 16
+ mov [rbx + 4], al ; base[23:16]
+ shr rax, 8
+ mov [rbx + 7], al ; base[31:24]
+ shr rax, 8
+ mov [rbx + 8], eax ; base[63:32]
+
+ ; calculate the TSS segment offset
+ mov rax, GDT64
+ sub rbx, rax ; rbx still holds the TSS_selector address
+ ; and load it
+ LTR bx
+
+ ; and jump to the start code
+ mov rax, qword kernel_bootstrap
+ jmp rax
+
+USE32
+
+section .data
+
+global GDT64:data
+global GDT_load:data
+global GDT_tss:data
+
+%define MAX_CPUS 2
+
+; 00 00 00 00
+; II GI A? II
+; G = -01- ---- so G = 2 always, unless you want to use the avl bit and make it 3
+; or G = 4 for a 32 bit segment
+; A = 1pl1 1CRA I don't think C matters, and RA should be ignored, so 8?
+; but in principle C == 1 for code?
+; R == 1 (i.e. +2 for readable). Supposedly ignored, but shouldn't hurt
+; so, 8 + 0 + 2 + 0 == non-conforming readable code
+; Thus 00 20 9A 00 for code
+; 00 20 F2 00 for user code
+
+; 00 L0 AC 00
+; L = 2 for 64 bit, 4 for 32 bit
+; A = 9 for kernel, F for user
+; C = A for code, 2 for data
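+; e.g. 64 bit kernel code: L=2, A=9, C=A gives 00 20 9A 00, i.e. 0x00209A00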
+
+;summary
+; 0x00209A00 ; 64 bit kernel code
+; 0x00209200 ; 64 bit kernel data
+; 0x0020FA00 ; 64 bit user code
+; 0x0000F200 ; 64 bit user data (the L bit only matters for code)
+
+GDT64:
+ dd 0,0
+ dd 0,0
+ dd 0x00000000, 0x00209A00 ; 0x10: 64-bit Kernel Code
+ dd 0x00000000, 0x00209200 ; 0x18: 64-bit Kernel Data
+ dd 0x00000000, 0x0040FA00 ; 0x20: 32-bit User Code, should be unused
+ dd 0x00000000, 0x0040F200 ; 0x28: 32-bit User Data
+ dd 0x00000000, 0x0020FA00 ; 0x30: 64-bit User Code
+ dd 0x00000000, 0x0000F200 ; 0x38: 64-bit User Data
+TSS_selector:
+ times MAX_CPUS dd 0x00000067, 0x00008900, 0, 0 ; 0x40+16*n: TSS for CPU n, limit 0x67
+align 16
+GDT_load:
+ dw GDT_load - GDT64 - 1 ; limit
+ dq GDT64 ; base
+ dd 0 ; pad
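+; lgdt reads a word limit plus dword base in 32-bit code and a word limit plus
+; qword base in 64-bit code, so the dq base above serves both the initial load
+; in start32 and the reload in beginkernel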
+
+align 16
+TSS:
+ dd 0 ; ignored
+ dq 0 ; rsp for cpl 0
+ dq 0 ; rsp for cpl 1
+ dq 0 ; rsp for cpl 2
+ times 2 dd 0 ; ignored
+ times 7 dq 0 ; rsp for ist 1-7
+ times 2 dd 0 ; ignored
+ dw 0 ; ignored
+ dw 0 ; I/O permission bitmap base offset (TSS is 0x68 = 104 bytes total)
+
+GDT_tss:
+ dq TSS_selector - GDT64
+
--- /dev/null
+USE64
+
+section .text
+
+; TODO if this were aligned to a page, the kernel
+; could reclaim the memory below it from 0x100000 to the start of this section
+
+; import some symbols from boot32.s
+extern GDT64
+extern GDT_load
+extern multiboot_infoptr
+extern multiboot_magic
+
+; export so that boot32.s can find it
+global kernel_bootstrap:function (kernel_bootstrap.end - kernel_bootstrap)
+
+; export for the kernel
+global _asm_physmap:data
+
+; I think from here the code expects to be using the high memory addresses
+kernel_bootstrap:
+ ; zero the other segment registers, we aren't going to use them
+ xor rax, rax
+ mov fs, ax
+ mov gs, ax
+
+ ; enable the floating point unit
+ mov rax, cr0
+ and ax, 0xFFFD ; clear CR0.MP (bit 1)
+ or ax, 0x10 ; set CR0.ET (bit 4)
+ mov cr0, rax
+ fninit
+
+ ; Enable Streaming SIMD Extensions
+ mov rax, cr0
+ and ax, 0xFFFB ; clear CR0.EM (bit 2)
+ or ax, 0x2 ; set CR0.MP (bit 1)
+ mov cr0, rax
+ mov rax, cr4
+ or rax, 0x600 ; set CR4.OSFXSR (bit 9) and CR4.OSXMMEXCPT (bit 10)
+ mov cr4, rax
+
+ ; TODO might as well just enable syscall/sysret here
+
+ jmp beginkernel
+.end:
+
+section .text
+
+extern stack
+extern kernel_main
+
+global beginkernel:function (beginkernel.end - beginkernel)
+beginkernel:
+ ; initialize the stack pointer
+ ; hard coded kernel stack. could be freed later with
+ ; a dynamic stack, but 64 KB should be enough, and I'd
+ ; like to not page fault the kernel, especially for
+ ; a stack issue
+ ; TODO this stack pretty much leaks once tasking is set
+ ; up. the initial task should probably be freed, and
+ ; the 64 KB handed over to the pmm, or used for DMA or
+ ; something
+ mov rsp, qword stack
+ add rsp, 65536 ; 64 KB, see kernel.c
+
+ ; sets up the 1 GB page maps for the first 512 GB
+ ; doing this early makes the pmm's job easier
+ call mapphys
+
+ ; reload GDT to high memory
+ mov rax, [ qword _asm_physmap ]
+ add rax, GDT64
+ mov qword [GDT_load + 2], rax
+ lgdt [GDT_load]
+
+ ; SysV ABI args for kernel_main, which is written in C
+ mov edi, [multiboot_infoptr]
+ mov esi, [multiboot_magic]
+ call kernel_main
+.end:
+
+mapphys:
+ ; rdi, rsi, rdx, rcx, r8, r9
+ ; scratch rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+
+ ; now that we're in 64 bit mode, map in the physical memory to
+ ; the high addresses
+ ; set up 1GB PDPT for physical map at 0xFFFFC00000000000
+ ; 1 GB = 0x 4000 0000
+ ; 512 GB = 0x 80 0000 0000
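+ ; 0xFFFFC00000000000 sits in PML4 slot 384: its low 48 bits are
+ ; 0xC00000000000 = 3 * 2^46, and (3 * 2^46) >> 39 = 3 * 2^7 = 384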
+
+ ; rax = physical address and bits
+ mov eax, 0x183 ; Global, 1 GB PS, RW, and Present bits
+ ; rdi = pdpt slot address, i.e. pdpe address
+ mov rdi, 0x19000
+ ; rsi = (1<<30) i.e. 1 GB
+ mov rsi, 0x40000000 ; 1 GB
+ ; ecx = GB count
+ xor ecx, ecx
+
+.nextgb:
+ stosq
+ add rax, rsi ; next physical address
+ inc ecx
+ cmp ecx, 512 ; could skip ecx and just cmp rdi against 0x1A000 instead
+ jl .nextgb ; or preload ecx with 512 and use loop, skipping the inc and cmp
+
+ ; calculate the pml4t offset
+ mov rax, [qword _asm_physmap]
+ mov rdx, rax
+ shr rdx, 39
+ and edx, 0x1ff ; edx is now 384 if the physmap offset is still 0xFFFFC00000000000
+ mov rax, 0x19003 ; physical address of pdpt + ReadWrite and Present
+ mov rdi, cr3
+ ; now, set the pml4e
+ mov [rdi + rdx*8], rax
+
+ ret
+
+global allstop:function (allstop.end - allstop)
+allstop:
+ mov rax, [rsp]
+ mov dr0, rax ; park the return address in dr0 where a debugger can find it
+ cli
+ hlt
+ jmp allstop
+.end:
+
+global halt:function
+halt:
+ sti ; ensure that we can receive interrupts
+ hlt
+ ret
+
+section .data
+
+align 8
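+; virtual base of the direct map of physical memory that mapphys builds;
+; exported so the kernel can translate between physical and virtual addresses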
+_asm_physmap:
+dq 0xFFFFC00000000000