1 files changed, 439 insertions, 78 deletions
diff --git a/quine.asm b/quine.asm
index a468578..3b6df78 100644
--- a/quine.asm
+++ b/quine.asm
@@ -1,4 +1,11 @@
-;;; Workflow tips:
+;;; QUINE
+;;;
+;;; This file is formatted to be read at 80-columns or wider.
+
+
+;;;;;;;;;;;;;;;;;;;;;
+;;; Workflow tips ;;;
+;;;;;;;;;;;;;;;;;;;;;
 ;;;
 ;;; Currently, this is not yet fully self-hosting; it is based on
 ;;; flatassembler[1]. A minimal command to build and run it is:
@@ -7,7 +14,7 @@
 ;;;
 ;;; A workflow you may wish to use for debugging is:
 ;;;
-;;; rm quine2; fasmg quine.asm quine && ./quine > quine2; echo "exit code:" $?; echo; hexdump -C quine; echo; hexdump -C quine2; echo; cmp quine quine2 ; echo cmp: $?
+;;; rm quine2; fasmg quine.asm quine && ./quine > quine2; echo "exit code:" $?; echo; hexdump -C quine; echo; hexdump -C quine2; echo; cmp -l quine quine2 ; echo cmp: $?
 ;;;
 ;;; The reason this removes the old one first is that otherwise, there's a
 ;;; risk the error message will be scrolled off the top of the screen and
@@ -66,7 +73,7 @@ end macro
 macro qwordreg result, register
   match =rax?, register
     result = 0
-  else match =rcx?, regiser
+  else match =rcx?, register
     result = 1
   else match =rdx?, register
     result = 2
@@ -127,47 +134,138 @@ macro mov.qreg.qreg target, source
 end macro
 
 
-; TODO what register size does this use?
-macro add.b target, source
-  match =rax, target
-    rex.w
-    db 0x83
-    modrm 3, 0, 0
-    db source
+; Take a 64-bit source register, treat it as an address and look up the 64-bit
+; value it points to, store that into a 64-bit target register. The only modes
+; available also have displacement; we use an 8-bit one and set it to zero.
+;
+; In understanding this, pay close attention to the Op/En column in the opcode
+; table. The "RM" variant means the ModRM byte's R/M field (the third one)
+; is the source, while its reg field (the middle one) is the target. This is
+; what we want, because the R/M field is the one that gets indirection applied
+; to it. Opcode 0x8B with an REX.W prefix is the all-64-bit RM variant.
+; [Intel] volume 2B, chapter 3, section 3-4.3, "MOV".
+;
+; For the indirection modes, don't be confused by the many similar tables.
+; 64-bit mode is encoded the same as 32-bit mode except for adding a REX.W
+; prefix, as per 2.2.1.1, so you want table 2-2 to understand the ModRM byte.
+; The presence or absence of an SIB byte is determined by where in that table
+; we fall, and we aren't using a mode that has one. [Intel] volume 2A,
+; chapter 2, section 2-1.5, table 2-2.
+;
+; We disallow rsp as a source because that's the mode that would want an SIB.
+macro mov.qreg.indirect.qreg target, source
+  match =rsp, source
+    assert 0                               ; rsp as a base would need an SIB
   else
+    qwordreg sreg, source
+    qwordreg treg, target
+    rex.w                                  ; 64-bit operand size
+    db 0x8B                                ; opcode: MOV r64, r/m64 (RM form)
+    modrm 1, treg, sreg                    ; mode 1 = [sreg + disp8]
+    db 0                                   ; the disp8, always zero
+  end match
+end macro
+
+
+; Take a 64-bit source register, store its value into the address pointed to
+; by a 64-bit target register. The only modes available also have
+; displacement; we use an 8-bit one and set it to zero.
+;
+; In understanding this, pay close attention to the Op/En column in the opcode
+; table. The "MR" variant means the ModRM byte's reg field (the middle one)
+; is the source, while its R/M field (the third one) is the target. This is
+; what we want, because the R/M field is the one that gets indirection applied
+; to it. Opcode 0x89 with an REX.W prefix is the all-64-bit MR variant.
+; [Intel] volume 2B, chapter 3, section 3-4.3, "MOV".
+;
+; For the indirection modes, don't be confused by the many similar tables.
+; 64-bit mode is encoded the same as 32-bit mode except for adding a REX.W
+; prefix, as per 2.2.1.1, so you want table 2-2 to understand the ModRM byte.
+; The presence or absence of an SIB byte is determined by where in that table
+; we fall, and we aren't using a mode that has one. [Intel] volume 2A,
+; chapter 2, section 2-1.5, table 2-2.
+;
+; We disallow rsp as a target because that's the mode that would want an SIB.
+macro mov.indirect.qreg.qreg target, source
+  match =rsp, target
     assert 0
+  else
+    qwordreg sreg, source
+    qwordreg treg, target
+    rex.w                                  ; 64-bit operand size
+    db 0x89                                ; opcode: MOV r/m64, r64 (MR form)
+    modrm 1, sreg, treg                    ; mode 1 = [treg + disp8]
+    db 0                                   ; the disp8, always zero
   end match
 end macro
 
 
-macro add.q target, source
-  db 0x01
+macro add.qreg.qreg target, source
   qwordreg treg, target
   qwordreg sreg, source
+  rex.w                                    ; 64-bit operand size
+  db 0x01                                  ; opcode: ADD r/m64, r64 (MR form)
   modrm 3, sreg, treg
 end macro
 
 
-; TODO what register size does this use?
-macro sub.b target, source
-  match =rsp, target
-    rex.w
-    db 0x83
-    modrm 3, 5, 4
-    db source
-  else
-    assert 0
-  end match
+; This adds a signed 8-bit immediate value to a 64-bit register, in place.
+;
+; Notice the use of 3 as the addressing mode. This says to use the register
+; itself. The 0 in the reg field is part of the opcode.
+macro add.qreg.bimm target, source
+  rex.w                                    ; 64-bit operand size
+  db 0x83                                  ; opcode group; /0 selects ADD
+  qwordreg treg, target                    ; look up the register number
+  modrm 3, 0, treg                         ; mode 3 = the register itself
+  db source                                ; imm8, sign-extended by the CPU
+end macro
 
+; This adds a signed 32-bit immediate value to a 64-bit register, in place.
+;
+; Notice the use of 3 as the addressing mode. This says to use the register
+; itself. The 0 in the reg field is part of the opcode.
+macro add.qreg.dimm target, source
+  rex.w                                    ; 64-bit operand size
+  db 0x81                                  ; opcode group; /0 selects ADD
+  qwordreg treg, target                    ; look up the register number
+  modrm 3, 0, treg                         ; mode 3 = the register itself
+  dd source                                ; imm32, sign-extended by the CPU
+end macro
+
+; This subtracts a signed 8-bit immediate value from a 64-bit register, in
+; place.
+;
+; Notice the use of 3 as the addressing mode. This says to use the register
+; itself. The 5 in the reg field is part of the opcode.
+macro sub.qreg.bimm target, source
+  rex.w                                    ; 64-bit operand size
+  db 0x83                                  ; opcode group; /5 selects SUB
+  qwordreg treg, target                    ; look up the register number
+  modrm 3, 5, treg                         ; mode 3 = the register itself
+  db source                                ; imm8, sign-extended by the CPU
+end macro
+
+; This subtracts a signed 32-bit immediate value from a 64-bit register, in
+; place.
+;
+; Notice the use of 3 as the addressing mode. This says to use the register
+; itself. The 5 in the reg field is part of the opcode.
+macro sub.qreg.dimm target, source
+  rex.w                                    ; 64-bit operand size
+  db 0x81                                  ; opcode group; /5 selects SUB
+  qwordreg treg, target                    ; look up the register number
+  modrm 3, 5, treg                         ; mode 3 = the register itself
+  dd source                                ; imm32, sign-extended by the CPU
+end macro
 
 ; Move from an 8-bit immediate value, to a location relative to a 64-bit
 ; register, with an 8-bit displacement and no indexing.
 ;
 ; This uses opcode 0xC6, which has w = 0. Since we run in 64-bit mode, that
 ; makes the operand size 8 bits, regardless of the current operand-size
-; attribute. [Intel] volume 2D, section B.1.43, table B-6.
-macro mov.rel.b target, offset, source
+; attribute. [Intel] volume 2D, appendix B, section B-1.4.3, table B-6.
+macro mov.qreg.disp8.bimm target, offset, source
   match =rsp, target
     db 0xC6
     modrm 1, 0, 4
@@ -186,7 +284,7 @@ end macro
 ; us an operand size of 32 bits by default. [Intel] volume 1, section 3.6.1,
 ; table 3-4. We want a 16-bit operand, so we use the operand-size prefix,
 ; 0x66, and we leave REX.W unset.
-macro mov.rel.w target, offset, source
+macro mov.qreg.disp8.wimm target, offset, source
   match =rsp, target
     db 0x66
     db 0xC7
@@ -205,7 +303,7 @@ end macro
 ; This uses opcode 0x67, which has w = 1. We run in 64-bit mode, so that gives
 ; us an operand size of 32 by default. [Intel] volume 2D, section B.1.43,
 ; table B-6. This is what we want, so we leave it.
-macro mov.rel.d target, offset, source
+macro mov.qreg.disp8.dimm target, offset, source
   match =rsp, target
     db 0xC7
     modrm 1, 0, 4
@@ -221,7 +319,7 @@ end macro
 ; register, with an 8-bit displacement and no indexing.
 ;
 ; This uses opcode 0x89.
-macro mov.rel.q target, offset, source
+macro mov.qreg.disp8.qreg target, offset, source
   match =rsp, target
     qwordreg sreg, source
     rex.w
@@ -243,7 +341,7 @@ end macro
 ; gives us an operand size of 32 by default. [Intel] volume 2D,
 ; section B.1.43, table B-6. We want a 64-bit operand, so we use the REX.W
 ; prefix, 0x48.
-macro mov.rel.q.d target, offset, source
+macro mov.qreg.disp8.dimm target, offset, source
   match =rsp, target
     rex.w
     db 0xC7
@@ -256,20 +354,57 @@ macro mov.rel.q.d target, offset, source
   end match
 end macro
 
+; "Load effective address". Compute a 64-bit address as you would for
+; register-indirect addressing, with an 8-bit displacement and no indexing,
+; but instead of doing anything with the memory, just store the address
+; itself into a register.
+macro lea.qreg.qreg.disp8 target, offset, source
+  qwordreg sreg, source                    ; base register number
+  qwordreg treg, target                    ; destination register number
+  rex.w                                    ; 64-bit operand size
+  db 0x8D                                  ; opcode: LEA r64, m
+  modrm 1, treg, sreg                      ; mode 1 = [sreg + disp8]
+  db offset                                ; the disp8
+end macro
+
+; Clear the DF flag. This makes string instructions increment RSI.
+macro cld
+  db 0xFC                                  ; the one-byte CLD opcode
+end macro
+
+; Load 64 bits from the address in RSI into RAX. Then, increment or decrement
+; RSI by 8 bytes, depending on the value of the DF flag.
+macro lodsq
+  rex.w                                    ; widen the load to 64 bits
+  db 0xAD                                  ; opcode: LODS (dword by default)
+end macro
+
+; Do an absolute indirect jump with a 64-bit register operand. That is: given
+; a register which holds a pointer, read another address from the pointed-to
+; memory and jump to it. The encoding is opcode 0xFF with 4 in the ModRM reg
+; field (part of the opcode) and the pointer register in the R/M field, mode
+; 0. Don't pass rsp or rbp; in mode 0 those mean SIB and RIP-relative.
+; Technically this is a "near" jump; far jumps and segments still exist in
+; 64-bit mode, we just don't use them.
+macro jmp.abs.indirect.qreg location
+  db 0xFF
+  qwordreg lreg, location
+  modrm 0, 4, lreg                         ; reg field 4 = JMP; R/M = pointer
+end macro
+
+; Invoke a system call provided by the kernel. On Linux, the System V ABI
+; describes the semantics of such calls (at least, on x86).
 macro syscall
   db 0x0F, 0x05
-;        0f                      two-byte escape
-;           05                   syscall ^ o64
 end macro
 
 
 
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; Executable file format ;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;
-;;; Before we get into the meat of the program, we do a lot of ELF-specific
+;;; Before we get into the body of the program, we do a lot of ELF-specific
 ;;; stuff to ensure that our output is in a format Linux knows how to run.
 ;;;
 ;;; First, we set the origin to load at. This is arbitrary, but it can't be
@@ -337,9 +472,240 @@ program_header:
 program_header_entry_size = $ - program_header
 
 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;; Implementation strategy ;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;
+;;; Execution model ;;;
+;;;;;;;;;;;;;;;;;;;;;;;
+;;;
+;;;   We use Forth-style dual stacks, one for values and one for control. We
+;;; use rsp for values, just like C does. We use rbp for the control stack,
+;;; which is a special Forth-y stack: These are pointers into the bodies of
+;;; Forth words, not return addresses.
+;;;
+;;;   The choice of rsp and rbp for the stack pointers imitates Jonesforth;
+;;; I'm hopeful that it gives us convenient addressing modes, and will report
+;;; back about that when I feel that I understand the implications.
+;;;
+;;;   In Forth, everything is a "word", including mutable variables.
+;;; Conceptually, a word is a unit of execution, which may be implemented
+;;; either in machine code or as an array of pointers to other words.
+;;;
+;;;   This polymorphism is implemented by having each word's contents begin
+;;; with a "codeword", which is a pointer to machine code that "interprets"
+;;; the rest of the contents. In the case of words implemented in machine
+;;; code, the codeword points directly to that code, which is normally right
+;;; next to it.
+;;;
+;;;   Variables, to Forth, are simply one more thing that can be executed; the
+;;; effect of executing a variable is to push its address onto the value
+;;; stack.
+;;;
+;;;   We adopt this model of words, codewords, and variables-as-words. It's
+;;; nice for us because it works without a heap.
+;;;
+;;;   One way in which we differ from Forth is that we don't have a
+;;; dictionary, and our words don't have names. Nothing would prevent this,
+;;; it just isn't useful to this single-purpose program. The Forth dictionary
+;;; is usually a linked list of every word that has ever been defined, with
+;;; the newest at the head; the names of words are stored in string fields as
+;;; part of every word's internal header. Our header has neither the pointer
+;;; field for the dictionary, nor the string; the only header we have is the
+;;; codeword.
+;;;
+;;;   We specifically implement a version of calling and returning that Forth
+;;; calls indirect threaded code: The control stack is a stack of pointers
+;;; into the middle of interpreted words. The interpreter snippet, called
+;;; DOCOL, implements calling.  Each word is responsible for making sure
+;;; returning works properly. Interpreted words accomplish this by ending with
+;;; the word EXIT, while machine-code words accomplish it by ending with a
+;;; verbatim snippet called NEXT.
+;;;
+;;;   Conceptually, NEXT returns, but more specifically it accomplishes this
+;;; by doing the caller's next dispatch for it; thus control never actually
+;;; goes back to the caller's interpreter after initial setup. For performance
+;;; reasons, NEXT is always inlined, so we define it as a macro.
+;;;
+;;;   DOCOL is just ordinary code, not a macro. It's defined later in this
+;;; file, as a label.
+;;;
+;;;
+;;;
+;;; --------------------------------------------------------------------------
+;;;  Quick Reference
+;;; --------------------------------------------------------------------------
+;;;
+;;; The layout of an interpreted word:
+;;;
+;;;     0x00 - 0x08                     Codeword (address of DOCOL snippet)
+;;;     0x08 - ???? (8-byte chunks)     Addresses of other words
+;;;       ... (end)                     Address of EXIT word
+;;;
+;;; The layout of a machine-code word:
+;;;
+;;;     0x00 - 0x08                     Address of immediately following byte
+;;;     0x08 - ????                     Arbitrary machine code
+;;;       ... (end)                     Inlined implementation of NEXT
+;;;
+;;;
+;;; REGISTER usage conventions:
+;;;
+;;; * rsi is the "instruction pointer" for the "interpreter".
+;;;     That is, it points to some word-pointer inside an array of
+;;;   word-pointers inside the content of the word they're part of. It always
+;;;   points to the next word that should be executed, whose execution hasn't
+;;;   begun yet.
+;;;
+;;; * rbp points to the top of the control stack
+;;;     These are former values of rsi, to eventually be returned to, from
+;;;   successively older callers as you look further up the stack. The stack
+;;;   grows downwards in memory. Since values are kept separately, the only
+;;;   thing on the control stack is return addresses, one per layer of call.
+;;;
+;;; * rsp points to the top of the value stack
+;;;     The value stack has no specific format, but it grows downwards in
+;;;   memory. In particular there's no concept of stack frames, because items
+;;;   on the stack don't belong to any particular word; the value stack in
+;;;   Forth is in part a mechanism for passing values between words.
+;;;
+;;; Additionally, immediately after beginning execution of a word:
+;;;
+;;; * rax points to the address being executed
+;;;     The value of rax is purely for the callee's benefit, and does not need
+;;;   to be preserved.
+;;;
+;;;   Other registers are purely discretionary, and are not preserved across
+;;; calls.
+;;;
+;;;
+;;; FLAG usage:
+;;;
+;;; * DF should be 0
+;;;   We use lodsq extensively and that makes it increment rsi after using it.
+;;;
+;;; --------------------------------------------------------------------------
+
+;;;
+;;; Macro NEXT
+;;; ----------
+;;;
+;;;   Include this inline at the end of a word implemented in machine-code.
+;;; Conceptually, it returns. What it actually does is do the next thing the
+;;; caller would do, which is call the next word from the caller's array of
+;;; word pointers.
+;;;
+;;; Registers in:
+;;;
+;;; * rsi points to the address of the word to execute
+;;;
+;;; Registers out:
+;;;
+;;; * rax points to the codeword in the contents of the word that was executed
+;;; * rsi points to the next word-address after this one
+;;;
+;;; Flags
+;;; * DF = 0 is required
+;;;
+macro NEXT
+  ; Copy the next word's address from *rsi into rax. Advance rsi, our
+  ; "instruction pointer", to the following word-pointer (as per the DF flag).
+  lodsq
+
+  ; Load the codeword from the word's contents, and jump to the interpreter it
+  ; points to.
+  jmp.abs.indirect.qreg rax
+end macro
+
+;;;
+;;; Macros PUSHCONTROL
+;;;        POPCONTROL
+;;; ------------------
+;;;
+;;;   Include these inline to push an address onto the control stack, or pop
+;;; one off of it. You will recall the control stack is kept in rbp. The
+;;; parameter is given in a user-specified register.
+;;;
+;;;   Jonesforth's analogous macros are called PUSHRSP and POPRSP but I think
+;;; that's super confusing, since rsp is also the name of a register, but a
+;;; different one. I guess it was less confusing in 32-bit, since esp doesn't
+;;; start with an "r". Anyway, this has to be named something that
+;;; distinguishes it from Intel's PUSH and POP opcodes, so...
+;;;
+;;;   "Load effective address" is just a cute way to do arithmetic on a
+;;; register, here. To push or pop we decrement or increment rbp by 8. To
+;;; actually interact with the space in the stack, we indirect through rbp.
+;;;
+;;; Registers in and out:
+;;;
+;;; * rbp points to the top of the control stack.
+;;;
+macro PUSHCONTROL source
+  lea.qreg.qreg.disp8 rbp, -8, rbp         ; rbp -= 8: room for one entry
+  mov.indirect.qreg.qreg rbp, source       ; [rbp] = source
+end macro
+
+macro POPCONTROL target
+  mov.qreg.indirect.qreg target, rbp       ; target = [rbp]
+  lea.qreg.qreg.disp8 rbp, 8, rbp          ; rbp += 8: release the entry
+end macro
+
+;;;
+;;; Routine DOCOL
+;;; -------------
+;;;
+;;;   Reference this via its label as the codeword of a word to make it an
+;;; "interpreted" word. Concretely, it saves rsi (the "instruction pointer")
+;;; to the control stack, takes the address of the codeword from rax and
+;;; increments it in-place to form the new instruction pointer, and copies
+;;; that to rsi.
+;;;
+;;;   Having then done this, we're now in the state that normal execution
+;;; expects, so DOCOL ends by using NEXT to begin the callee's execution,
+;;; kicking off a nested call.
+;;;
+;;;   The name is said to be short for "do colon", because Forth high-level
+;;; code begins word definitions with a colon.
+;;;
+;;; Registers in:
+;;;
+;;; * rsi is the caller's instruction pointer
+;;; * rbp is the control stack pointer
+;;; * rax is the address of the callee's codeword
+;;;
+;;; Registers out:
+;;;
+;;; * rsi is the callee's instruction pointer
+;;; * rbp is the control stack pointer
+DOCOL:
+  PUSHCONTROL rsi                          ; save the caller's position
+  add.qreg.bimm rax, 8                     ; step past the 8-byte codeword
+  mov.qreg.qreg rsi, rax                   ; rsi = callee's first word-pointer
+  NEXT                                     ; dispatch that first word
+
+;;;
+;;; Routine _start
+;;; --------------
+;;;
+;;;   This is the entry point of the whole program, the very first code we
+;;; actually execute. We go with the name linkers generally use for it, though
+;;; it's not very Forth-y. The ELF header points to it.
+;;;
+;;;   The kernel gives us most registers zeroed, and rsp pointing to the
+;;; command-line stuff (argc, argv, envp), which is at an ASLR'd address with
+;;; some stack space allocated for us, despite the fact we didn't request any.
+;;; It also gives us all the flags clear except IF, but we don't rely on that.
+;;; Lastly, of course, it loads our code segment and sets the instruction
+;;; pointer where we asked; we don't need to check what those addresses are,
+;;; because they're not randomized.
+;;;
+;;;   This routine is really only responsible for one-time initialization.
+;;;
+_start:
+  cld                                      ; clear the DF flag
+  ; If we wanted to save the initial stack pointer, we'd do that here.
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; (old) Implementation strategy ;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;
 ;;;   We assemble the entire file contents in a stack-allocated buffer.
 ;;; We avoid using the stack for any other purpose. When the file is fully
@@ -359,83 +725,78 @@ program_header_entry_size = $ - program_header
 ;;;
 ;;; * rsp points to the bottom of the buffer.
 ;;;
-_start:
-  ;;;
-  ;;; Initialize registers
-  ;;;
   mov.dreg.dimm rdx, 0                     ; store running file size here
-  sub.b rsp, 0xFF                          ; reserve stack space
+  ;sub.qreg.bimm rsp, 0xFF                  ; reserve stack space
 
   ;;;
   ;;; ELF header
   ;;;
-  mov.rel.d rsp, 0x00, 0x7F bappend "ELF"  ; magic number
-  mov.rel.b rsp, 0x04, 2                   ; 64-bit
-  mov.rel.b rsp, 0x05, 1                   ; little-endian
-  mov.rel.b rsp, 0x06, 1                   ; ELF header format version 1
-  mov.rel.b rsp, 0x07, 0                   ; System-V ABI
-  mov.rel.q.d rsp, 0x08, 0                 ; (padding)
+  mov.qreg.disp8.dimm rsp, 0x00, 0x7F bappend "ELF"  ; magic number
+  mov.qreg.disp8.bimm rsp, 0x04, 2         ; 64-bit
+  mov.qreg.disp8.bimm rsp, 0x05, 1         ; little-endian
+  mov.qreg.disp8.bimm rsp, 0x06, 1         ; ELF header format version 1
+  mov.qreg.disp8.bimm rsp, 0x07, 0         ; System-V ABI
+  mov.qreg.disp8.dimm rsp, 0x08, 0         ; (padding)
 
-  mov.rel.w rsp, 0x10, 2                   ; executable
-  mov.rel.w rsp, 0x12, 0x3E                ; Intel x86-64
-  mov.rel.d rsp, 0x14, 1                   ; ELF format version
+  mov.qreg.disp8.wimm rsp, 0x10, 2         ; executable
+  mov.qreg.disp8.wimm rsp, 0x12, 0x3E      ; Intel x86-64
+  mov.qreg.disp8.dimm rsp, 0x14, 1         ; ELF format version
 
   ; Compute the entry pointer.
-  mov.qreg.qimm rax, $$
-  add.b rax, 120
-  mov.rel.q rsp, 0x18, rax                 ; entry point
+  mov.qreg.qimm rax, $$                    ; the memory origin
+  add.qreg.bimm rax, 0x78                  ; the size of the headers
+  add.qreg.dimm rax, 155                   ; the offset of _start
+  mov.qreg.disp8.qreg rsp, 0x18, rax       ; entry point
 
-  mov.rel.q.d rsp, 0x20, 64                ; program header offset
+  mov.qreg.disp8.dimm rsp, 0x20, 64        ; program header offset
     ; We place the program header immediately after the ELF header. This
     ; offset is from the start of the file.
-  mov.rel.q.d rsp, 0x28, 0                 ; section header offset
-  mov.rel.d rsp, 0x30, 0                   ; processor flags
-  mov.rel.w rsp, 0x34, 64                  ; ELF header size
-  mov.rel.w rsp, 0x36, 56                  ; program header entry size
-  mov.rel.w rsp, 0x38, 1                   ; number of program header entries
-  mov.rel.w rsp, 0x3a, 0                   ; section header entry size
-  mov.rel.w rsp, 0x3c, 0                   ; number of section header entries
-  mov.rel.w rsp, 0x3e, 0                   ; section name string table index
+  mov.qreg.disp8.dimm rsp, 0x28, 0         ; section header offset
+  mov.qreg.disp8.dimm rsp, 0x30, 0         ; processor flags
+  mov.qreg.disp8.wimm rsp, 0x34, 64        ; ELF header size
+  mov.qreg.disp8.wimm rsp, 0x36, 56        ; program header entry size
+  mov.qreg.disp8.wimm rsp, 0x38, 1         ; number of program header entries
+  mov.qreg.disp8.wimm rsp, 0x3a, 0         ; section header entry size
+  mov.qreg.disp8.wimm rsp, 0x3c, 0         ; number of section header entries
+  mov.qreg.disp8.wimm rsp, 0x3e, 0         ; section name string table index
 
   ; Add the size of the ELF header to the running total
   mov.dreg.dimm rax, 0x40
-  add.q rdx, rax
+  add.qreg.qreg rdx, rax
 
   ;;;
   ;;; Program header
   ;;;
-  mov.rel.d rsp, 0x40, 1                   ; "loadable" segment type
-  mov.rel.d rsp, 0x44, 0x05                ; read+execute permission
-  mov.rel.q.d rsp, 0x48, 0                 ; offset in file
-  mov.rel.q.d rsp, 0x50, $$                ; virtual address
+  mov.qreg.disp8.dimm rsp, 0x40, 1         ; "loadable" segment type
+  mov.qreg.disp8.dimm rsp, 0x44, 0x05      ; read+execute permission
+  mov.qreg.disp8.dimm rsp, 0x48, 0         ; offset in file
+  mov.qreg.disp8.dimm rsp, 0x50, $$        ; virtual address
     ; required, but can be anything, subject to alignment
-  mov.rel.q.d rsp, 0x58, 0                 ; physical address (ignored)
+  mov.qreg.disp8.dimm rsp, 0x58, 0         ; physical address (ignored)
 
   ; Fill in 0 as the file size for now, to avoid unitialized memory.
-  mov.rel.q.d rsp, 0x60, 0                 ; size in file
-  mov.rel.q.d rsp, 0x68, 0                 ; size in memory
+  mov.qreg.disp8.dimm rsp, 0x60, 0         ; size in file
+  mov.qreg.disp8.dimm rsp, 0x68, 0         ; size in memory
 
-  mov.rel.q.d rsp, 0x70, 0                 ; segment alignment
-    ;   for relocation - will we be ASLR'd?
+  mov.qreg.disp8.dimm rsp, 0x70, 0         ; segment alignment
+    ; for relocation, but this doesn't apply to us
 
   ; Add the size of the program header to the running total
   mov.dreg.dimm rax, 0x38
-  add.q rdx, rax
-
-  ; Add the guessed, wrong size of the program
+  add.qreg.qreg rdx, rax
 
   ;;; Hardcode the size of the actual code chunk, since we don't yet have a
   ;;; way to generate it.
   ;;;
   ;;; TODO of course, really we want to for-real track this
-  mov.qreg.qimm rax, 0x15a
-  add.q rdx, rax
+  mov.qreg.qimm rax, 0x201
+  add.qreg.qreg rdx, rax
 
   ;;;
   ;;; Go back and fill in the file size now that we know it.
   ;;;
-  mov.rel.q rsp, 0x60, rdx                 ; size in file
-  mov.rel.q rsp, 0x68, rdx                 ; size in memory
+  mov.qreg.disp8.qreg rsp, 0x60, rdx       ; size in file
+  mov.qreg.disp8.qreg rsp, 0x68, rdx       ; size in memory
 
   ;;;
   ;;; The buffer is ready; output the file.