From 6b143324786cc25e00dc4cafcfe9ad9ef1ccae06 Mon Sep 17 00:00:00 2001 From: Irene Knapp Date: Sun, 19 Oct 2025 01:26:38 -0700 Subject: add extensive documentation on the execution model implement some of the earliest Forth loading stuff, following Jonesforth closely the Forth memory space isn't totally set up yet - we are going to use the data segment after all, though it won't be heap-ish really. everything up through _start is ready for Forthy-y use though, and it does `cld` besides. Force-Push: yeah Change-Id: Ibd12223f304aff71f4e78744f7a9da09b9072a45 --- quine.asm | 413 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 379 insertions(+), 34 deletions(-) (limited to 'quine.asm') diff --git a/quine.asm b/quine.asm index a468578..7e15a51 100644 --- a/quine.asm +++ b/quine.asm @@ -1,4 +1,11 @@ -;;; Workflow tips: +;;; QUINE +;;; +;;; This file is formatted to be read at 80-columns or wider. + + +;;;;;;;;;;;;;;;;;;;;; +;;; Workflow tips ;;; +;;;;;;;;;;;;;;;;;;;;; ;;; ;;; Currently, this is not yet fully self-hosting; it is based on ;;; flatassembler[1]. A minimal command to build and run it is: @@ -127,46 +134,139 @@ macro mov.qreg.qreg target, source end macro -; TODO what register size does this use? -macro add.b target, source - match =rax, target - rex.w - db 0x83 - modrm 3, 0, 0 - db source +; Take a 64-bit source register, treat it as an address and look up the 64-bit +; value it points to, store that into a 64-bit target register. The only modes +; available also have displacement; we use an 8-bit one and set it to zero. +; +; In understanding this, pay close attention to the Op/En column in the opcode +; table. The "RM" variant means the ModRM byte's R/M field (the third one) +; is the source, while its reg field (the middle one) is the target. This is +; what we want, because the R/M field is the one that gets indirection applied +; to it. Opcode 0x8B with an REX.W prefix is the all-64-bit RM variant. 
+; [Intel] volume 2B, chapter 3, section 3-4.3, "MOV".
+;
+; For the indirection modes, don't be confused by the many similar tables.
+; 64-bit mode is encoded the same as 32-bit mode except for adding a REX.W
+; prefix, as per 2.2.1.1, so you want table 2-2 to understand the ModRM byte.
+; The presence or absence of an SIB byte is determined by where in that table
+; we fall, and we aren't using a mode that has one. [Intel] volume 2A,
+; chapter 2, section 2-1.5, table 2-2.
+;
+; We disallow rsp as a source because that's the mode that would want an SIB.
+macro mov.qreg.indirect.qreg target, source
+  match =rsp, source
+    assert 0
   else
+    qwordreg sreg, source
+    qwordreg treg, target
+    rex.w
+    db 0x8B                 ; opcode byte: db emits it (rb only reserves space)
+    modrm 1, treg, sreg     ; mode 1 = [sreg + disp8]; reg field is the target
+    db 0                    ; the disp8, always zero
+  end match
+end macro
+
+
+; Take a 64-bit source register, store its value into the address pointed to
+; by a 64-bit target register. The only modes available also have
+; displacement; we use an 8-bit one and set it to zero.
+;
+; In understanding this, pay close attention to the Op/En column in the opcode
+; table. The "MR" variant means the ModRM byte's reg field (the middle one)
+; is the source, while its R/M field (the third one) is the target. This is
+; what we want, because the R/M field is the one that gets indirection applied
+; to it. Opcode 0x89 with an REX.W prefix is the all-64-bit MR variant.
+; [Intel] volume 2B, chapter 3, section 3-4.3, "MOV".
+;
+; For the indirection modes, don't be confused by the many similar tables.
+; 64-bit mode is encoded the same as 32-bit mode except for adding a REX.W
+; prefix, as per 2.2.1.1, so you want table 2-2 to understand the ModRM byte.
+; The presence or absence of an SIB byte is determined by where in that table
+; we fall, and we aren't using a mode that has one. [Intel] volume 2A,
+; chapter 2, section 2-1.5, table 2-2.
+;
+; We disallow rsp as a target because that's the mode that would want an SIB.
+macro mov.indirect.qreg.qreg target, source
+  match =rsp, target
     assert 0
+  else
+    qwordreg sreg, source
+    qwordreg treg, target
+    rex.w
+    db 0x89                 ; opcode byte: db emits it (rb reserves 0x89 bytes)
+    modrm 1, sreg, treg     ; mode 1 = [treg + disp8]; reg field is the source
+    db 0                    ; the disp8, always zero
   end match
 end macro


-macro add.q target, source
-  db 0x01
+macro add.qreg.qreg target, source
   qwordreg treg, target
   qwordreg sreg, source
+  rex.w
+  db 0x01
   modrm 3, sreg, treg
 end macro


-; TODO what register size does this use?
-macro sub.b target, source
-  match =rsp, target
-    rex.w
-    db 0x83
-    modrm 3, 5, 4
-    db source
-  else
-    assert 0
-  end match
+; This adds a signed 8-bit immediate value to a 64-bit register, in place.
+;
+; Notice the use of 3 as the addressing mode. This says to use the register
+; itself. The 0 in the reg field is part of the opcode.
+macro add.qreg.bimm target, source
+  qwordreg treg, target
+  rex.w
+  db 0x83
+  modrm 3, 0, treg
+  db source                 ; imm8, sign-extended: 0x80..0xFF mean -128..-1
+end macro
+
+; This adds a signed 32-bit immediate value to a 64-bit register, in place.
+;
+; Notice the use of 3 as the addressing mode. This says to use the register
+; itself. The 0 in the reg field is part of the opcode.
+macro add.qreg.dimm target, source
+  qwordreg treg, target
+  rex.w
+  db 0x81
+  modrm 3, 0, treg
+  dd source
+end macro
+
+; This subtracts a signed 8-bit immediate value from a 64-bit register, in
+; place.
+;
+; Notice the use of 3 as the addressing mode. This says to use the register
+; itself. The 5 in the reg field is part of the opcode.
+macro sub.qreg.bimm target, source
+  qwordreg treg, target
+  rex.w
+  db 0x83
+  modrm 3, 5, treg
+  db source                 ; imm8, sign-extended: 0x80..0xFF mean -128..-1
 end macro

+; This subtracts a signed 32-bit immediate value from a 64-bit register, in
+; place.
+;
+; Notice the use of 3 as the addressing mode. This says to use the register
+; itself. The 5 in the reg field is part of the opcode.
+macro sub.qreg.dimm target, source
+  qwordreg treg, target
+  rex.w
+  db 0x81
+  modrm 3, 5, treg
+  dd source
+end macro


 ; Move from an 8-bit immediate value, to a location relative to a 64-bit
 ; register, with an 8-bit displacement and no indexing.
 ;
 ; This uses opcode 0xC6, which has w = 0. Since we run in 64-bit mode, that
 ; makes the operand size 8 bits, regardless of the current operand-size
-; attribute. [Intel] volume 2D, section B.1.43, table B-6.
+; attribute. [Intel] volume 2D, appendix B, section B-1.4.3, table B-6.
+;
+; TODO this should be mov.(something).disp8
 macro mov.rel.b target, offset, source
   match =rsp, target
     db 0xC6
@@ -256,20 +356,57 @@ macro mov.rel.q.d target, offset, source
   end match
 end macro

+; "Load effective address". Compute source plus a signed 8-bit displacement,
+; as for memory addressing without indexing, but instead of touching memory
+; just store the address itself into the target register. The source must
+; not be rsp: with this ModRM mode, R/M = 4 would call for an SIB byte.
+macro lea.qreg.qreg.disp8 target, offset, source
+  rex.w
+  db 0x8D
+  qwordreg treg, target
+  qwordreg sreg, source
+  modrm 1, treg, sreg       ; mode 1 = [sreg + disp8]; reg field is the target
+  db offset                 ; the disp8, sign-extended by the CPU
+end macro
+
+; Clear the DF flag. This makes string instructions increment RSI.
+macro cld
+  db 0xFC
+end macro
+
+; Load 64 bits from the address in RSI into RAX. Then, increment or decrement
+; RSI by 8 bytes, depending on the value of the DF flag.
+macro lodsq
+  rex.w
+  db 0xAD
+end macro
+
+; Do an absolute indirect jump with a 64-bit register operand. That is: given
+; a register which holds a pointer, read another address from the pointed-to
+; memory and jump to it. The register must not be rsp or rbp, because with
+; mode 0 those R/M values select an SIB byte and RIP-relative addressing.
+;
+; Technically this is a "near" jump in x86 terms, but we just pretend far
+; jumps and segments don't exist; they still exist in 64-bit mode, unused.
+macro jmp.abs.indirect.qreg location
+  db 0xFF
+  qwordreg lreg, location
+  modrm 0, 4, lreg          ; reg field 4 selects JMP; R/M = pointer register
+end macro
+
+; Invoke a system call provided by the kernel.
On Linux, the System V ABI +; describes the semantics of such calls (at least, on x86). macro syscall db 0x0F, 0x05 -; 0f two-byte escape -; 05 syscall ^ o64 end macro - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Executable file format ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; -;;; Before we get into the meat of the program, we do a lot of ELF-specific +;;; Before we get into the body of the program, we do a lot of ELF-specific ;;; stuff to ensure that our output is in a format Linux knows how to run. ;;; ;;; First, we set the origin to load at. This is arbitrary, but it can't be @@ -337,6 +474,214 @@ program_header: program_header_entry_size = $ - program_header +;;;;;;;;;;;;;;;;;;;;;;; +;;; Execution model ;;; +;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; We use Forth-style dual stacks, one for values and one for control. We +;;; use rsp for values, just like C does. We use rbp for the control stack, +;;; which is a special Forth-y stack: These are pointers into the bodies of +;;; Forth words, not return addresses. +;;; +;;; The choice of rsp and rbp for the stack pointers imitates Jonesforth; +;;; I'm hopeful that it gives us convenient addressing modes, and will report +;;; back about that when I feel that I understand the implications. +;;; +;;; In Forth, everything is a "word", including mutable variables. +;;; Conceptually, a word is a unit of execution, which may be implemented +;;; either in machine code or as an array of pointers to other words. +;;; +;;; This polymorphism is implemented by having each word's contents begin +;;; with a "codeword", which is a pointer to machine code that "interprets" +;;; the rest of the contents. In the case of words implemented in machine +;;; code, the codeword points directly to that code, which is normally right +;;; next to it. +;;; +;;; Variables, to Forth, are simply one more thing that can be executed; the +;;; effect of executing a variable is to push its address onto the value +;;; stack.
+;;; +;;; We adopt this model of words, codewords, and variables-as-words. It's +;;; nice for us because it works without a heap. +;;; +;;; One way in which we differ from Forth is that we don't have a +;;; dictionary, and our words don't have names. Nothing would prevent this, +;;; it just isn't useful to this single-purpose program. The Forth dictionary +;;; is usually a linked list of every word that has ever been defined, with +;;; the newest at the head; the names of words are stored in string fields as +;;; part of every word's internal header. Our header has neither the pointer +;;; field for the dictionary, nor the string; the only header we have is +;;; the codeword. +;;; +;;; We specifically implement a version of calling and returning that Forth +;;; calls indirect threaded code: The control stack is a stack of pointers +;;; into the middle of interpreted words. The interpreter snippet, called +;;; DOCOL, implements calling. Each word is responsible for making sure +;;; returning works properly. Interpreted words accomplish this by ending with +;;; the word EXIT, while machine-code words accomplish it by ending with a +;;; verbatim snippet called NEXT. +;;; +;;; Conceptually, NEXT returns, but more specifically it accomplishes this +;;; by doing the caller's next dispatch for it; thus control never actually +;;; goes back to the caller's interpreter after initial setup. For performance +;;; reasons, NEXT is always inlined, so we define it as a macro. +;;; +;;; DOCOL is just ordinary code, not a macro. It's defined later in this +;;; file, as a label. +;;; +;;; +;;; +;;; -------------------------------------------------------------------------- +;;; Quick Reference +;;; -------------------------------------------------------------------------- +;;; +;;; The layout of an interpreted word: +;;; +;;; 0x00 - 0x08 Codeword (address of DOCOL snippet) +;;; 0x08 - ???? (8-byte chunks) Addresses of other words +;;; ...
(end) Address of EXIT word +;;; +;;; The layout of a machine-code word: +;;; +;;; 0x00 - 0x08 Address of immediately following byte +;;; 0x08 - ???? Arbitrary machine code +;;; ... (end) Inlined implementation of NEXT +;;; +;;; +;;; REGISTER usage conventions: +;;; +;;; * rsi is the "instruction pointer" for the "interpreter". +;;; That is, it points to some word-pointer inside an array of +;;; word-pointers inside the content of the word they're part of. It always +;;; points to the next word that should be executed, whose execution hasn't +;;; begun yet. +;;; +;;; * rbp points to the top of the control stack +;;; These are former values of rsi, to eventually be returned to, from +;;; successively older callers as you look further up the stack. The stack +;;; grows downwards in memory. Since values are kept separately, the only +;;; thing on the control stack is return addresses, one per layer of call. +;;; +;;; * rsp points to the top of the value stack +;;; The value stack has no specific format, but it grows downwards in +;;; memory. In particular there's no concept of stack frames, because items +;;; on the stack don't belong to any particular word; the value stack in +;;; Forth is in part a mechanism for passing values between words. +;;; +;;; Additionally, immediately after beginning execution of a word: +;;; +;;; * rax points to the address being executed +;;; The value of rax is purely for the callee's benefit, and does not need +;;; to be preserved. +;;; +;;; Other registers are purely discretionary, and are not preserved across +;;; calls. +;;; +;;; +;;; FLAG usage: +;;; +;;; * DF should be 0 +;;; We use lodsq extensively and that makes it increment rsi after using it. +;;; +;;; -------------------------------------------------------------------------- + +;;; +;;; Macro NEXT +;;; ---------- +;;; +;;; Include this inline at the end of a word implemented in machine-code. +;;; Conceptually, it returns.
What it actually does is do the next thing the +;;; caller would do, which is call the next word from the caller's array of +;;; word pointers. +;;; +;;; Registers in: +;;; +;;; * rsi points to the address of the word to execute +;;; +;;; Registers out: +;;; +;;; * rax points to the codeword in the contents of the word that was executed +;;; * rsi points to the next word-address after this one +;;; +;;; Flags +;;; * DF = 0 is required +;;; +macro NEXT + ; Copy the next word's address from *rsi into rax. Advance rsi, the + ; interpreter's instruction pointer, by 8 (as per the DF flag). + lodsq + + ; Load the codeword from the word's contents, and jump to the interpreter it + ; points to. + jmp.abs.indirect.qreg rax +end macro + +;;; +;;; Macros PUSHCONTROL +;;; POPCONTROL +;;; ------------------ +;;; +;;; Include these inline to push an address onto the control stack, or pop +;;; one off of it. You will recall the control stack is kept in rbp. The +;;; parameter is given in a user-specified register. +;;; +;;; Jonesforth's analogous macros are called PUSHRSP and POPRSP but I think +;;; that's super confusing, since rsp is also the name of a register, but a +;;; different one. I guess it was less confusing in 32-bit, since esp doesn't +;;; start with an "r". Anyway, this has to be named something that +;;; distinguishes it from Intel's PUSH and POP opcodes, so... +;;; +;;; "Load effective address" is just a cute way to do arithmetic on a +;;; register, here. To push or pop we decrement or increment rbp by 8. To +;;; actually interact with the space in the stack, we indirect through rbp. +;;; +;;; Registers in and out: +;;; +;;; * rbp points to the top of the control stack.
+;;;
+macro PUSHCONTROL source
+  lea.qreg.qreg.disp8 rbp, -8, rbp      ; make room: rbp -= 8
+  mov.indirect.qreg.qreg rbp, source    ; store source at the new top
+end macro
+
+macro POPCONTROL target
+  mov.qreg.indirect.qreg target, rbp    ; read the top entry into target
+  lea.qreg.qreg.disp8 rbp, 8, rbp       ; drop it: rbp += 8
+end macro
+
+;;;
+;;;   Routine DOCOL
+;;;   -------------
+;;;
+;;;   Reference this via its label as the codeword of a word to make it an
+;;; "interpreted" word. Concretely, it saves rsi (the "instruction pointer")
+;;; to the control stack, takes the address of the codeword from rax and
+;;; increments it in-place to form the new instruction pointer, and copies
+;;; that to rsi.
+;;;
+;;;   Having then done this, we're now in the state that normal execution
+;;; expects, so DOCOL ends by using NEXT to begin the callee's execution,
+;;; kicking off a nested call.
+;;;
+;;;   The name is said to be short for "do colon", because Forth high-level
+;;; code begins word definitions with a colon.
+;;;
+;;; Registers in:
+;;;
+;;; * rsi is the caller's instruction pointer
+;;; * rbp is the control stack pointer
+;;; * rax is the address of the callee's codeword
+;;;
+;;; Registers out:
+;;;
+;;; * rsi is the callee's instruction pointer
+;;; * rbp is the control stack pointer
+DOCOL:
+  PUSHCONTROL rsi                       ; save the caller's instruction pointer
+  add.qreg.bimm rax, 8                  ; step past the 8-byte codeword
+  mov.qreg.qreg rsi, rax                ; rsi = callee's first word-pointer
+  NEXT                                  ; dispatch that first word
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; Implementation strategy ;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -363,8 +708,9 @@ _start:
   ;;;
   ;;; Initialize registers
   ;;;
+  cld                                   ; clear the DF flag
   mov.dreg.dimm rdx, 0                  ; store running file size here
-  sub.b rsp, 0xFF                       ; reserve stack space
+  sub.qreg.dimm rsp, 0xFF               ; reserve stack space (imm8 0xFF is -1)

   ;;;
   ;;; ELF header
@@ -381,8 +727,9 @@ _start:
   mov.rel.d rsp, 0x14, 1                ; ELF format version

   ; Compute the entry pointer.
-  mov.qreg.qimm rax, $$
-  add.b rax, 120
+  mov.qreg.qimm rax, $$                 ; the memory origin
+  add.qreg.bimm rax, 0x78               ; the size of the headers
+  add.qreg.dimm rax, _start - $$ - 0x78 ; offset of _start past the headers
   mov.rel.q rsp, 0x18, rax              ; entry point
   mov.rel.q.d rsp, 0x20, 64             ; program header offset

@@ -399,7 +746,7 @@ _start:

   ; Add the size of the ELF header to the running total
   mov.dreg.dimm rax, 0x40
-  add.q rdx, rax
+  add.qreg.qreg rdx, rax

   ;;;
   ;;; Program header
@@ -420,16 +767,14 @@ _start:

   ; Add the size of the program header to the running total
   mov.dreg.dimm rax, 0x38
-  add.q rdx, rax
-
-  ; Add the guessed, wrong size of the program
+  add.qreg.qreg rdx, rax

   ;;; Hardcode the size of the actual code chunk, since we don't yet have a
   ;;; way to generate it.
   ;;;
   ;;; TODO of course, really we want to for-real track this
-  mov.qreg.qimm rax, 0x15a
-  add.q rdx, rax
+  mov.qreg.qimm rax, 0x200
+  add.qreg.qreg rdx, rax

   ;;;
   ;;; Go back and fill in the file size now that we know it.
-- 
cgit 1.4.1