From 6b143324786cc25e00dc4cafcfe9ad9ef1ccae06 Mon Sep 17 00:00:00 2001
From: Irene Knapp <ireneista@irenes.space>
Date: Sun, 19 Oct 2025 01:26:38 -0700
Subject: add extensive documentation on the execution model

implement some of the earliest Forth loading stuff, following Jonesforth closely

the Forth memory space isn't totally set up yet - we are going to use the data segment after all, though it won't be heap-ish really. everything up through _start is ready for Forth-y use though, and it does `cld` besides.

Force-Push: yeah
Change-Id: Ibd12223f304aff71f4e78744f7a9da09b9072a45
---
 quine.asm | 413 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 379 insertions(+), 34 deletions(-)

(limited to 'quine.asm')

diff --git a/quine.asm b/quine.asm
index a468578..7e15a51 100644
--- a/quine.asm
+++ b/quine.asm
@@ -1,4 +1,11 @@
-;;; Workflow tips:
+;;; QUINE
+;;;
+;;; This file is formatted to be read at 80-columns or wider.
+
+
+;;;;;;;;;;;;;;;;;;;;;
+;;; Workflow tips ;;;
+;;;;;;;;;;;;;;;;;;;;;
 ;;;
 ;;; Currently, this is not yet fully self-hosting; it is based on
 ;;; flatassembler[1]. A minimal command to build and run it is:
@@ -127,46 +134,139 @@ macro mov.qreg.qreg target, source
 end macro
 
 
-; TODO what register size does this use?
-macro add.b target, source
-  match =rax, target
-    rex.w
-    db 0x83
-    modrm 3, 0, 0
-    db source
+; Take a 64-bit source register, treat it as an address and look up the 64-bit
+; value it points to, store that into a 64-bit target register. The only modes
+; available also have displacement; we use an 8-bit one and set it to zero.
+;
+; In understanding this, pay close attention to the Op/En column in the opcode
+; table. The "RM" variant means the ModRM byte's R/M field (the third one)
+; is the source, while its reg field (the middle one) is the target. This is
+; what we want, because the R/M field is the one that gets indirection applied
+; to it. Opcode 0x8B with an REX.W prefix is the all-64-bit RM variant.
+; [Intel] volume 2B, chapter 3, section 3-4.3, "MOV".
+;
+; For the indirection modes, don't be confused by the many similar tables.
+; 64-bit mode is encoded the same as 32-bit mode except for adding a REX.W
+; prefix, as per 2.2.1.1, so you want table 2-2 to understand the ModRM byte.
+; The presence or absence of an SIB byte is determined by where in that table
+; we fall, and we aren't using a mode that has one. [Intel] volume 2A,
+; chapter 2, section 2-1.5, table 2-2.
+;
+; We disallow rsp as a source because that's the mode that would want an SIB.
+macro mov.qreg.indirect.qreg target, source
+  match =rsp, source
+    assert 0                       ; [rsp] would need an SIB byte
   else
+    qwordreg sreg, source
+    qwordreg treg, target
+    rex.w                          ; 64-bit operand size
+    db 0x8B                        ; opcode byte: MOV r64, r/m64 (RM form)
+    modrm 1, treg, sreg            ; mod 1 = [sreg + disp8]; reg is the target
+    db 0                           ; the disp8, fixed at zero
+  end match
+end macro
+
+
+; Take a 64-bit source register, store its value into the address pointed to
+; by a 64-bit target register. The only modes available also have
+; displacement; we use an 8-bit one and set it to zero.
+;
+; In understanding this, pay close attention to the Op/En column in the opcode
+; table. The "MR" variant means the ModRM byte's reg field (the middle one)
+; is the source, while its R/M field (the third one) is the target, which is
+; what we want: R/M is the field that gets indirection applied to it. Opcode
+; 0x89 with an REX.W prefix is the all-64-bit MR variant. [Intel] volume 2B,
+; chapter 3, section 3-4.3, "MOV".
+;
+; For the indirection modes, 64-bit mode is encoded the same as 32-bit mode
+; except for the REX.W prefix (2.2.1.1), so table 2-2 is the one that explains
+; the ModRM byte; the mode we use has no SIB byte. [Intel] volume 2A,
+; chapter 2, section 2-1.5, table 2-2. rsp is disallowed as a target because
+; that is the encoding that would want an SIB.
+;
+; Each expansion emits exactly 4 bytes (REX.W, opcode, ModRM, disp8). Any
+; hand-measured offset or size that spans an expansion must use that length.
+macro mov.indirect.qreg.qreg target, source
+  match =rsp, target
     assert 0
+  else
+    qwordreg sreg, source
+    qwordreg treg, target
+    rex.w                          ; 64-bit operand size
+    db 0x89                        ; opcode byte: MOV r/m64, r64 (MR form)
+    modrm 1, sreg, treg            ; mod 1 = [treg + disp8]; reg is the source
+    db 0                           ; the disp8, fixed at zero
   end match
 end macro
 
 
-macro add.q target, source
-  db 0x01
+macro add.qreg.qreg target, source
   qwordreg treg, target
   qwordreg sreg, source
+  rex.w                            ; 64-bit operand size
+  db 0x01                          ; ADD r/m64, r64: reg is the source
   modrm 3, sreg, treg
 end macro
 
 
-; TODO what register size does this use?
-macro sub.b target, source
-  match =rsp, target
-    rex.w
-    db 0x83
-    modrm 3, 5, 4
-    db source
-  else
-    assert 0
-  end match
+; This adds a signed 8-bit immediate value to a 64-bit register, in place.
+; The CPU sign-extends the byte, so 0x80 through 0xFF act as -128 through -1.
+; Notice the use of 3 as the addressing mode. This says to use the register
+; itself. The 0 in the reg field is part of the opcode.
+macro add.qreg.bimm target, source
+  qwordreg treg, target
+  rex.w                            ; 64-bit operand size
+  db 0x83                          ; arithmetic group, imm8 form
+  modrm 3, 0, treg                 ; /0 selects ADD
+  db source                        ; the imm8
+end macro
+
+; This adds a signed 32-bit immediate value to a 64-bit register, in place.
+; The CPU sign-extends the 32-bit value to 64 bits before adding it.
+; Notice the use of 3 as the addressing mode. This says to use the register
+; itself. The 0 in the reg field is part of the opcode.
+macro add.qreg.dimm target, source
+  qwordreg treg, target
+  rex.w                            ; 64-bit operand size
+  db 0x81                          ; arithmetic group, imm32 form
+  modrm 3, 0, treg                 ; /0 selects ADD
+  dd source                        ; the imm32
+end macro
+
+; This subtracts a signed 8-bit immediate value from a 64-bit register, in
+; place. The CPU sign-extends the byte: passing 0xFF subtracts -1 (adds 1).
+;
+; Notice the use of 3 as the addressing mode. This says to use the register
+; itself. The 5 in the reg field is part of the opcode.
+macro sub.qreg.bimm target, source
+  qwordreg treg, target
+  rex.w                            ; 64-bit operand size
+  db 0x83                          ; arithmetic group, imm8 form
+  modrm 3, 5, treg                 ; /5 selects SUB
+  db source                        ; the imm8; keep it within -128..127
 end macro
 
+; This subtracts a signed 32-bit immediate value from a 64-bit register, in
+; place. The CPU sign-extends the 32-bit value to 64 bits first.
+;
+; Notice the use of 3 as the addressing mode. This says to use the register
+; itself. The 5 in the reg field is part of the opcode.
+macro sub.qreg.dimm target, source
+  qwordreg treg, target
+  rex.w                            ; 64-bit operand size
+  db 0x81                          ; arithmetic group, imm32 form
+  modrm 3, 5, treg                 ; /5 selects SUB
+  dd source                        ; the imm32
+end macro
 
 ; Move from an 8-bit immediate value, to a location relative to a 64-bit
 ; register, with an 8-bit displacement and no indexing.
 ;
 ; This uses opcode 0xC6, which has w = 0. Since we run in 64-bit mode, that
 ; makes the operand size 8 bits, regardless of the current operand-size
-; attribute. [Intel] volume 2D, section B.1.43, table B-6.
+; attribute. [Intel] volume 2D, appendix B, section B-1.4.3, table B-6.
+;
+; TODO this should be mov.(something).disp8
 macro mov.rel.b target, offset, source
   match =rsp, target
     db 0xC6
@@ -256,20 +356,57 @@ macro mov.rel.q.d target, offset, source
   end match
 end macro
 
+; "Load effective address". Form a 64-bit address the way a memory operand
+; with a base register and an 8-bit displacement (no index) would, but rather
+; than touching memory at that address, store the address itself into the
+; target register.
+macro lea.qreg.qreg.disp8 target, offset, source
+  qwordreg treg, target
+  qwordreg sreg, source
+  rex.w                            ; 64-bit operand size
+  db 0x8D                          ; LEA
+  modrm 1, treg, sreg              ; mod 1 = [sreg + disp8]; reg is the target
+  db offset                        ; the disp8
+end macro
+
+; Clear the DF (direction) flag, so string instructions step RSI upward.
+macro cld
+  db 0xFC                          ; single-byte opcode, no operands
+end macro
+
+; Load 64 bits from the address in RSI into RAX. Then, increment or decrement
+; RSI by 8 bytes, depending on the value of the DF flag (DF = 0 increments).
+macro lodsq
+  rex.w                            ; selects the 64-bit (quadword) form
+  db 0xAD                          ; LODS
+end macro
+
+; Do an absolute indirect jump with a 64-bit register operand. That is: given
+; a register which holds a pointer, read another address from the pointed-to
+; memory and jump to it. The 4 in the reg field is part of the opcode; the
+; register goes in R/M, where mode 0 can't express rsp (SIB) or rbp (RIP+d32).
+; Technically this is a "near" jump in x86 terms; far jumps and segments
+; still exist in 64-bit mode, we just don't use them.
+macro jmp.abs.indirect.qreg location
+  qwordreg lreg, location
+  assert lreg <> 4 & lreg <> 5     ; rsp / rbp have no plain [reg] encoding
+  db 0xFF                          ; group 5 opcode
+  modrm 0, 4, lreg                 ; mod 0 = [lreg]; /4 selects near JMP
+end macro
+
+; Invoke a system call provided by the kernel. On Linux, the System V ABI
+; describes such calls: number in rax, arguments in rdi, rsi, rdx, r10, r8, r9.
 macro syscall
   db 0x0F, 0x05
-;        0f                      two-byte escape
-;           05                   syscall ^ o64
 end macro
 
 
 
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; Executable file format ;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;
-;;; Before we get into the meat of the program, we do a lot of ELF-specific
+;;; Before we get into the body of the program, we do a lot of ELF-specific
 ;;; stuff to ensure that our output is in a format Linux knows how to run.
 ;;;
 ;;; First, we set the origin to load at. This is arbitrary, but it can't be
@@ -337,6 +474,214 @@ program_header:
 program_header_entry_size = $ - program_header
 
 
+;;;;;;;;;;;;;;;;;;;;;;;
+;;; Execution model ;;;
+;;;;;;;;;;;;;;;;;;;;;;;
+;;;
+;;;   We use Forth-style dual stacks, one for values and one for control. We
+;;; use rsp for values, just like C does. We use rbp for the control stack,
+;;; which is a special Forth-y stack: These are pointers into the bodies of
+;;; Forth words, not return addresses.
+;;;
+;;;   The choice of rsp and rbp for the stack pointers imitates Jonesforth;
+;;; I'm hopeful that it gives us convenient addressing modes, and will report
+;;; back about that when I feel that I understand the implications.
+;;;
+;;;   In Forth, everything is a "word", including mutable variables.
+;;; Conceptually, a word is a unit of execution, which may be implemented
+;;; either in machine code or as an array of pointers to other words.
+;;;
+;;;   This polymorphism is implemented by having each word's contents begin
+;;; with a "codeword", which is a pointer to machine code that "interprets"
+;;; the rest of the contents. In the case of words implemented in machine
+;;; code, the codeword points directly to that code, which is normally right
+;;; next to it.
+;;;
+;;;   Variables, to Forth, are simply one more thing that can be executed; the
+;;; effect of executing a variable is to push its address onto the value
+;;; stack.
+;;;
+;;;   We adopt this model of words, codewords, and variables-as-words. It's
+;;; nice for us because it works without a heap.
+;;;
+;;;   One way in which we differ from Forth is that we don't have a
+;;; dictionary, and our words don't have names. Nothing would prevent this,
+;;; it just isn't useful to this single-purpose program. The Forth dictionary
+;;; is usually a linked list of every word that has ever been defined, with
+;;; the newest at the head; the names of words are stored in string fields as
+;;; part of every word's internal header. Our header has neither the pointer
+;;; field for the dictionary, nor the string; the only header we have is the
+;;; codeword.
+;;;
+;;;   We specifically implement a version of calling and returning that Forth
+;;; calls indirect threaded code: The control stack is a stack of pointers
+;;; into the middle of interpreted words. The interpreter snippet, called
+;;; DOCOL, implements calling.  Each word is responsible for making sure
+;;; returning works properly. Interpreted words accomplish this by ending with
+;;; the word EXIT, while machine-code words accomplish it by ending with a
+;;; verbatim snippet called NEXT.
+;;;
+;;;   Conceptually, NEXT returns, but more specifically it accomplishes this
+;;; by doing the caller's next dispatch for it; thus control never actually
+;;; goes back to the caller's interpreter after initial setup. For performance
+;;; reasons, NEXT is always inlined, so we define it as a macro.
+;;;
+;;;   DOCOL is just ordinary code, not a macro. It's defined later in this
+;;; file, as a label.
+;;;
+;;;
+;;;
+;;; --------------------------------------------------------------------------
+;;;  Quick Reference
+;;; --------------------------------------------------------------------------
+;;;
+;;; The layout of an interpreted word:
+;;;
+;;;     0x00 - 0x08                     Codeword (address of DOCOL snippet)
+;;;     0x08 - ???? (8-byte chunks)     Addresses of other words
+;;;       ... (end)                     Address of EXIT word
+;;;
+;;; The layout of a machine-code word:
+;;;
+;;;     0x00 - 0x08                     Address of immediately following byte
+;;;     0x08 - ????                     Arbitrary machine code
+;;;       ... (end)                     Inlined implementation of NEXT
+;;;
+;;;
+;;; REGISTER usage conventions:
+;;;
+;;; * rsi is the "instruction pointer" for the "interpreter".
+;;;     That is, it points to some word-pointer inside an array of
+;;;   word-pointers inside the content of the word they're part of. It always
+;;;   points to the next word that should be executed, whose execution hasn't
+;;;   begun yet.
+;;;
+;;; * rbp points to the top of the control stack
+;;;     These are former values of rsi, to eventually be returned to, from
+;;;   successively older callers as you look further up the stack. The stack
+;;;   grows downwards in memory. Since values are kept separately, the only
+;;;   thing on the control stack is return addresses, one per layer of call.
+;;;
+;;; * rsp points to the top of the value stack
+;;;     The value stack has no specific format, but it grows downwards in
+;;;   memory. In particular there's no concept of stack frames, because items
+;;;   on the stack don't belong to any particular word; the value stack in
+;;;   Forth is in part a mechanism for passing values between words.
+;;;
+;;; Additionally, immediately after beginning execution of a word:
+;;;
+;;; * rax points to the address being executed
+;;;     The value of rax is purely for the callee's benefit, and does not need
+;;;   to be preserved.
+;;;
+;;;   Other registers are purely discretionary, and are not preserved across
+;;; calls.
+;;;
+;;;
+;;; FLAG usage:
+;;;
+;;; * DF should be 0
+;;;   We use lodsq extensively and that makes it increment rsi after using it.
+;;;
+;;; --------------------------------------------------------------------------
+
+;;;
+;;; Macro NEXT
+;;; ----------
+;;;
+;;;   Include this inline at the end of a word implemented in machine-code.
+;;; Conceptually, it returns. What it actually does is do the next thing the
+;;; caller would do, which is call the next word from the caller's array of
+;;; word pointers.
+;;;
+;;; Registers in:
+;;;
+;;; * rsi points to the address of the word to execute
+;;;
+;;; Registers out:
+;;;
+;;; * rax points to the codeword in the contents of the word that was executed
+;;; * rsi points to the next word-address after this one
+;;;
+;;; Flags
+;;; * DF = 0 is required
+;;;
+macro NEXT
+  ; Copy the next word's address from *rsi into rax, then advance rsi (the
+  ; instruction pointer) to the following slot, as per the DF flag.
+  lodsq
+
+  ; Load the codeword from the word's contents, and jump to the interpreter it
+  ; points to.
+  jmp.abs.indirect.qreg rax
+end macro
+
+;;;
+;;; Macros PUSHCONTROL
+;;;        POPCONTROL
+;;; ------------------
+;;;
+;;;   Include these inline to push an address onto the control stack, or pop
+;;; one off of it. You will recall the control stack is kept in rbp. The
+;;; parameter is given in a user-specified register.
+;;;
+;;;   Jonesforth's analogous macros are called PUSHRSP and POPRSP but I think
+;;; that's super confusing, since rsp is also the name of a register, but a
+;;; different one. I guess it was less confusing in 32-bit, since esp doesn't
+;;; start with an "r". Anyway, this has to be named something that
+;;; distinguishes it from Intel's PUSH and POP opcodes, so...
+;;;
+;;;   "Load effective address" is just a cute way to do arithmetic on a
+;;; register, here. To push or pop we decrement or increment rbp by 8. To
+;;; actually interact with the space in the stack, we indirect through rbp.
+;;;
+;;; Registers in and out:
+;;;
+;;; * rbp points to the top of the control stack.
+;;;
+macro PUSHCONTROL source
+  lea.qreg.qreg.disp8 rbp, -8, rbp         ; rbp -= 8; lea leaves flags alone
+  mov.indirect.qreg.qreg rbp, source       ; *rbp = source
+end macro
+
+macro POPCONTROL target
+  mov.qreg.indirect.qreg target, rbp       ; target = *rbp
+  lea.qreg.qreg.disp8 rbp, 8, rbp          ; rbp += 8; lea leaves flags alone
+end macro
+
+;;;
+;;; Routine DOCOL
+;;; -------------
+;;;
+;;;   Reference this via its label as the codeword of a word to make it an
+;;; "interpreted" word. Concretely, it saves rsi (the "instruction pointer")
+;;; to the control stack, takes the address of the codeword from rax and
+;;; increments it in-place to form the new instruction pointer, and copies
+;;; that to rsi.
+;;;
+;;;   Having then done this, we're now in the state that normal execution
+;;; expects, so DOCOL ends by using NEXT to begin the callee's execution,
+;;; kicking off a nested call.
+;;;
+;;;   The name is said to be short for "do colon", because Forth high-level
+;;; code begins word definitions with a colon.
+;;;
+;;; Registers in:
+;;;
+;;; * rsi is the caller's instruction pointer
+;;; * rbp is the control stack pointer
+;;; * rax is the address of the callee's codeword
+;;;
+;;; Registers out:
+;;;
+;;; * rsi is the callee's instruction pointer
+;;; * rbp is the control stack pointer
+DOCOL:
+  PUSHCONTROL rsi                          ; save caller's instruction pointer
+  add.qreg.bimm rax, 8                     ; step rax past the 8-byte codeword
+  mov.qreg.qreg rsi, rax                   ; rsi = first word-pointer in body
+  NEXT                                     ; dispatch that word
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; Implementation strategy ;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -363,8 +708,9 @@ _start:
   ;;;
   ;;; Initialize registers
   ;;;
+  cld                                      ; clear the DF flag
   mov.dreg.dimm rdx, 0                     ; store running file size here
-  sub.b rsp, 0xFF                          ; reserve stack space
+  sub.qreg.bimm rsp, 0xFF                  ; reserve stack (FIXME: imm8 0xFF = -1)
 
   ;;;
   ;;; ELF header
@@ -381,8 +727,9 @@ _start:
   mov.rel.d rsp, 0x14, 1                   ; ELF format version
 
   ; Compute the entry pointer.
-  mov.qreg.qimm rax, $$
-  add.b rax, 120
+  mov.qreg.qimm rax, $$                    ; the memory origin
+  add.qreg.bimm rax, 0x78                  ; the size of the headers
+  add.qreg.dimm rax, _start - $$ - 0x78    ; the offset of _start past them
   mov.rel.q rsp, 0x18, rax                 ; entry point
 
   mov.rel.q.d rsp, 0x20, 64                ; program header offset
@@ -399,7 +746,7 @@ _start:
 
   ; Add the size of the ELF header to the running total
   mov.dreg.dimm rax, 0x40
-  add.q rdx, rax
+  add.qreg.qreg rdx, rax
 
   ;;;
   ;;; Program header
@@ -420,16 +767,14 @@ _start:
 
   ; Add the size of the program header to the running total
   mov.dreg.dimm rax, 0x38
-  add.q rdx, rax
-
-  ; Add the guessed, wrong size of the program
+  add.qreg.qreg rdx, rax
 
   ;;; Hardcode the size of the actual code chunk, since we don't yet have a
   ;;; way to generate it.
   ;;;
   ;;; TODO of course, really we want to for-real track this
-  mov.qreg.qimm rax, 0x15a
-  add.q rdx, rax
+  mov.qreg.qimm rax, 0x200
+  add.qreg.qreg rdx, rax
 
   ;;;
   ;;; Go back and fill in the file size now that we know it.
-- 
cgit 1.4.1