+USE64
+
+section .text
+
+; TODO if this were aligned to a page, the kernel
+; could reclaim the memory below it from 0x100000 to the start of this section
+
+; import some symbols from boot32.s
+extern GDT64
+extern GDT_load
+extern multiboot_infoptr
+extern multiboot_magic
+
+; export so that boot32.s can find it
+global kernel_bootstrap:function
+
+; export for the kernel
+global _asm_physmap:data
+
+; ---------------------------------------------------------------------
+; kernel_bootstrap - first 64-bit code; boot32.s transfers control here.
+;
+; Clears FS/GS, enables the x87 FPU and SSE, then jumps to beginkernel.
+; Does not return and does not touch the stack.
+;
+; Clobbers: rax, fs, gs, CR0 (EM cleared; MP and ET set), CR4 (OSFXSR
+;           and OSXMMEXCPT set), x87 state (reset by fninit).
+;
+; NOTE(review): the original author believed the code from here on expects
+; to be running at the high-memory addresses - confirm against boot32.s.
+; ---------------------------------------------------------------------
+CR0_MP          equ 1 << 1      ; monitor coprocessor
+CR0_EM          equ 1 << 2      ; x87 emulation (must be 0 to use FPU/SSE)
+CR0_ET          equ 1 << 4      ; extension type
+CR4_OSFXSR      equ 1 << 9      ; OS supports fxsave/fxrstor, enables SSE
+CR4_OSXMMEXCPT  equ 1 << 10     ; OS handles unmasked SIMD FP exceptions
+kernel_bootstrap:
+    ; zero the other segment registers, we aren't going to use them
+    xor eax, eax
+    mov fs, ax
+    mov gs, ax
+
+    ; Enable the FPU with a single CR0 update. EM has to be clear *before*
+    ; fninit executes, otherwise fninit raises #NM and nothing is installed
+    ; yet to handle it. The final CR0 value is the same one the previous
+    ; two-step sequence produced (EM=0, MP=1, ET=1, other bits untouched).
+    mov rax, cr0
+    and rax, ~CR0_EM
+    or rax, CR0_MP | CR0_ET
+    mov cr0, rax
+    fninit
+
+    ; Enable Streaming SIMD Extensions
+    mov rax, cr4
+    or rax, CR4_OSFXSR | CR4_OSXMMEXCPT
+    mov cr4, rax
+
+    ; TODO might as well just enable syscall/sysret here
+
+    jmp beginkernel
+.end:
+size kernel_bootstrap (kernel_bootstrap.end - kernel_bootstrap)
+
+section .text
+
+extern stack
+extern kernel_main
+
+global beginkernel
+type beginkernel function
+; ---------------------------------------------------------------------
+; beginkernel - set up the boot stack, the physical-memory map and the
+; GDT, then call the C entry point kernel_main(infoptr, magic).
+;
+; Reached by a jmp from kernel_bootstrap; never returns. If kernel_main
+; returns, the CPU is parked in a cli/hlt loop rather than being allowed
+; to run into whatever code follows this function.
+; ---------------------------------------------------------------------
+KERNEL_STACK_SIZE equ 65536    ; 64 KB, see kernel.c
+beginkernel:
+    ; Initialize the stack pointer to the top of the hard coded kernel
+    ; stack. It could be replaced by a dynamic stack later, but 64 KB
+    ; should be enough, and the kernel should never page fault on its
+    ; own stack.
+    ; TODO this stack pretty much leaks once tasking is set up. The
+    ; initial task should probably be freed, and the 64 KB handed over
+    ; to the pmm, or used for DMA or something.
+    mov rsp, qword stack
+    add rsp, KERNEL_STACK_SIZE
+
+    ; The direction flag is not guaranteed by the boot protocol; string
+    ; instructions and the SysV ABI (C code) both need it clear.
+    cld
+
+    ; sets up the 1 GB page maps for the first 512 GB
+    ; doing this early makes the pmm's job easier
+    call mapphys
+
+    ; reload GDT to high memory: patch the base field (offset 2 of the
+    ; GDT_load descriptor) with physmap base + GDT64, then load it
+    mov rax, [ qword _asm_physmap ]
+    add rax, GDT64
+    mov qword [GDT_load + 2], rax
+    lgdt [GDT_load]
+
+    ; SysV ABI args for kernel_main, which is written in C
+    ; (32-bit loads zero-extend into rdi/rsi)
+    mov edi, [multiboot_infoptr]
+    mov esi, [multiboot_magic]
+    call kernel_main
+
+    ; kernel_main is not expected to return; stop here if it does
+.hang:
+    cli
+    hlt
+    jmp .hang
+.end:
+size beginkernel beginkernel.end - beginkernel
+
+; ---------------------------------------------------------------------
+; mapphys - map the first 512 GB of physical memory at the virtual base
+; stored in _asm_physmap, using 512 one-gigabyte pages.
+;
+; In:       nothing
+; Out:      nothing
+; Clobbers: rax, rcx, rdx, rsi, rdi, flags (DF is left clear)
+;
+; Fills the PDPT at physical 0x19000 (512 entries, ending at 0x1A000) and
+; installs it in the PML4 slot selected by bits 47:39 of _asm_physmap.
+;
+; NOTE(review): assumes the page at 0x19000 is reserved for this table,
+; that low physical memory is still identity mapped so both tables can be
+; written through their physical addresses, and that the CPU supports
+; 1 GB pages - none of that is checked here; confirm against boot32.s.
+; ---------------------------------------------------------------------
+PHYSMAP_PDPT_PHYS  equ 0x19000      ; physical address of the PDPT
+PHYSMAP_PDPE_FLAGS equ 0x183        ; Global, 1 GB PS, RW, and Present bits
+PHYSMAP_PML4E_FLAGS equ 0x3         ; ReadWrite and Present
+PHYSMAP_GB         equ 0x40000000   ; 1 GB = 1 << 30
+PHYSMAP_GB_COUNT   equ 512          ; 512 GB = 0x80 0000 0000 in total
+mapphys:
+    cld                             ; stosq must walk upwards
+
+    ; rax = entry value: physical address of the page plus flag bits
+    mov eax, PHYSMAP_PDPE_FLAGS
+    ; rdi = address of the PDPT slot being written
+    mov edi, PHYSMAP_PDPT_PHYS
+    ; rsi = stride between consecutive pages
+    mov esi, PHYSMAP_GB
+    ; ecx = entries left to write
+    mov ecx, PHYSMAP_GB_COUNT
+
+.nextgb:
+    stosq                           ; [rdi] = rax, rdi += 8
+    add rax, rsi                    ; next physical address
+    dec ecx
+    jnz .nextgb
+
+    ; calculate the PML4 index from bits 47:39 of the physmap base
+    ; (the 64-bit absolute load form only exists for rax, hence the copy)
+    mov rax, [qword _asm_physmap]
+    mov rdx, rax
+    shr rdx, 39
+    and edx, 0x1ff                  ; 384 while the base is 0xFFFFC00000000000
+
+    mov eax, PHYSMAP_PDPT_PHYS | PHYSMAP_PML4E_FLAGS
+    mov rdi, cr3
+    and rdi, ~0xFFF                 ; drop CR3 flag bits, keep the PML4 base
+    ; now, set the pml4e
+    mov [rdi + rdx*8], rax
+
+    ret
+
+global allstop:function
+; ---------------------------------------------------------------------
+; allstop - stop the CPU for good.
+;
+; Copies the quadword at the top of the stack (the caller's return
+; address when entered with `call`) into DR0 so it can be inspected
+; afterwards, then disables interrupts and halts. If the CPU wakes up
+; anyway, the whole sequence is repeated. Never returns.
+;
+; Clobbers: rax, dr0, IF
+; ---------------------------------------------------------------------
+allstop:
+    mov rax, [rsp]
+    mov dr0, rax
+    cli
+    hlt
+    jmp allstop
+.end:
+size allstop allstop.end - allstop
+
+global halt:function ; exported for the kernel; no size directive is emitted for this symbol
+halt: ; idle the CPU until an interrupt arrives, then return to the caller with interrupts enabled
+ sti ; ensure that we can receive interrupts; keep this directly before hlt so none is taken in between
+ hlt ; sleep until an interrupt wakes the CPU
+ ret ; execution resumes here after the interrupt handler returns
+
+section .data
+
+align 8 ; keep the quadword below naturally aligned
+_asm_physmap: ; exported as data for the kernel; also read by beginkernel and mapphys in this file
+dq 0xFFFFC00000000000 ; virtual base of the physical-memory map; mapphys takes bits 47:39 of it as the PML4 index (384)