1 files changed, 290 insertions, 27 deletions
diff --git a/quine.asm b/quine.asm
index 3b6df78..1139cc8 100644
--- a/quine.asm
+++ b/quine.asm
@@ -34,21 +34,61 @@
 ;;; Assembly language ;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;
-;;; Before doing any actual code, we define macros for writing x86-64 assembly
-;;; language. This is built from scratch, relying only on flatassembler's
-;;; built-in semantics. No include files of any kind are used for it.
+;;;   Before doing any actual code, we define macros for writing x86-64
+;;; assembly language. This is built from scratch, relying only on
+;;; flatassembler's built-in semantics. No include files of any kind are used
+;;; for it.
 
+; Each REX prefix is 0x40 | W<<3 | R<<2 | X<<1 | B. Spelling out all sixteen
+; combinations like this is slightly ridiculous; there must be a better way.
 macro rex.0
   db 0x40
 end macro
-
 macro rex.w
   db 0x48
 end macro
-
+macro rex.r
+  db 0x44
+end macro
+macro rex.x
+  db 0x42
+end macro
+macro rex.b
+  db 0x41
+end macro
+macro rex.wr
+  db 0x4C
+end macro
+macro rex.wx
+  db 0x4A
+end macro
+macro rex.wb
+  db 0x49
+end macro
+macro rex.rx
+  db 0x46
+end macro
+macro rex.rb
+  db 0x45
+end macro
 macro rex.xb
   db 0x43
 end macro
+macro rex.wrx
+  db 0x4E
+end macro
+macro rex.wrb
+  db 0x4D
+end macro
+macro rex.wxb
+  db 0x4B
+end macro
+macro rex.rxb
+  db 0x47
+end macro
+macro rex.wrxb
+  db 0x4F
+end macro
 
 macro modrm mod, reg, rm
   assert mod >= 0 & mod < 4
@@ -92,6 +132,28 @@ macro qwordreg result, register
   end match
 end macro
 
+macro owordreg result, register            ; r8-r15 -> 0-7 (low 3 bits only;
+  match =r8?, register                     ;  the caller adds a REX bit)
+    result = 0
+  else match =r9?, register
+    result = 1
+  else match =r10?, register
+    result = 2
+  else match =r11?, register
+    result = 3
+  else match =r12?, register
+    result = 4
+  else match =r13?, register
+    result = 5
+  else match =r14?, register
+    result = 6
+  else match =r15?, register
+    result = 7
+  else
+    assert 0                               ; not one of r8-r15
+  end match
+end macro
+
 
 ; TODO what register size does this use?
 macro mov.b target, source
@@ -118,8 +180,18 @@ end macro
 
 
 macro mov.qreg.qimm target, source
-  rex.w
   qwordreg treg, target
+  rex.w
+  opcodereg 0xB8, treg
+  dq source
+end macro
+
+
+; Notice the use of REX.B here; this instruction puts the register number in
+; the opcode field, so it uses Table 3-1.
+macro mov.oreg.qimm target, source
+  owordreg treg, target
+  rex.wb
   opcodereg 0xB8, treg
   dq source
 end macro
@@ -135,8 +207,12 @@ end macro
 
 
 ; Take a 64-bit source register, treat it as an address and look up the 64-bit
-; value it points to, store that into a 64-bit target register. The only modes
-; available also have displacement; we use an 8-bit one and set it to zero.
+; value it points to, store that into a 64-bit target register.
+;
+; For rsp and rbp, the only modes available also have displacement; we use an
+; 8-bit one and set it to zero. The other registers could be encoded without
+; the displacement, but for simplicity's sake we do the same thing for all of
+; them.
 ;
 ; In understanding this, pay close attention to the Op/En column in the opcode
 ; table. The "RM" variant means the ModRM byte's R/M field (the third one)
@@ -200,6 +276,55 @@ macro mov.indirect.qreg.qreg target, source
 end macro
 
 
+; Take a 64-bit source register, store its value into a high 64-bit target
+; register (r8-r15).
+;
+; Notice that there are two ways to add another bit to the register encoding.
+; Table 3-1 is about REX.B, but does not apply here, it's for instructions
+; that use opcode bits to specify a register, and none of the
+; register-to-register MOV variants do that (it's for immediate mode).
+;
+; Instead, we want the mechanism that uses REX.R as the extra bit, and it
+; combines with the reg field of ModRM, as per 2.2.1.2.
+;
+; Therefore, we want the variant of MOV which puts the target in the reg
+; field. That's Op/En "RM", opcode 0x8B with REX.WR.
+;
+; Mode 3 is direct addressing.
+macro mov.oreg.qreg target, source
+  owordreg treg, target                    ; target: r8-r15 -> 0-7
+  qwordreg sreg, source                    ; source: a low 64-bit register
+  rex.wr                                   ; W: 64-bit operand; R: extends reg
+  db 0x8B                                  ; opcode byte: MOV r64, r/m64 (RM)
+  modrm 3, treg, sreg                      ; mod 3: reg = target, R/M = source
+end macro
+
+
+; Take a high 64-bit source register (r8-r15), store its value into a 64-bit
+; target register.
+;
+; Notice that there are two ways to add another bit to the register encoding.
+; Table 3-1 is about REX.B, but does not apply here, it's for instructions
+; that use opcode bits to specify a register, and none of the
+; register-to-register MOV variants do that (it's for immediate mode).
+;
+; Instead, we want the mechanism that uses REX.R as the extra bit, and it
+; combines with the reg field of ModRM, as per 2.2.1.2.
+;
+; Therefore, we want the variant of MOV which puts the source in the reg
+; field. That's Op/En "MR", opcode 0x89 with REX.WR.
+;
+; Mode 3 is direct addressing.
+macro mov.qreg.oreg target, source
+  qwordreg treg, target                    ; target: a low 64-bit register
+  owordreg sreg, source                    ; source: r8-r15 -> 0-7
+  rex.wr                                   ; W: 64-bit operand; R: extends reg
+  db 0x89                                  ; opcode byte: MOV r/m64, r64 (MR)
+  modrm 3, sreg, treg                      ; mod 3: reg = source, R/M = target
+end macro
+
+
+; This adds a 64-bit register to another 64-bit register, in place.
 macro add.qreg.qreg target, source
   qwordreg treg, target
   qwordreg sreg, source
@@ -221,6 +346,7 @@ macro add.qreg.bimm target, source
   db source
 end macro
 
+
 ; This adds a signed 32-bit immediate value to a 64-bit register, in place.
 ;
 ; Notice the use of 3 as the addressing mode. This says to use the register
@@ -269,7 +395,9 @@ macro mov.qreg.disp8.bimm target, offset, source
   match =rsp, target
     db 0xC6
     modrm 1, 0, 4
-    sib 0, 0, 4
+      ; 4 is rsp, but it's a special case
+    sib 0, 4, 4
+      ; no scaling, no indexing, rsp as base
     db offset
     db source
   else
@@ -284,12 +412,16 @@ end macro
 ; us an operand size of 32 bits by default. [Intel] volume 1, section 3.6.1,
 ; table 3-4. We want a 16-bit operand, so we use the operand-size prefix,
 ; 0x66, and we leave REX.W unset.
+;
+; We need to treat rsp specially because it's the SIB case, per table 2-2.
 macro mov.qreg.disp8.wimm target, offset, source
   match =rsp, target
     db 0x66
     db 0xC7
     modrm 1, 0, 4
+      ; 4 is rsp, but it's a special case
     sib 0, 4, 4
+      ; no scaling, no indexing, rsp as base
     db offset
     dw source
   else
@@ -307,7 +439,9 @@ macro mov.qreg.disp8.dimm target, offset, source
   match =rsp, target
     db 0xC7
     modrm 1, 0, 4
+      ; 4 is rsp, but it's a special case
     sib 0, 4, 4
+      ; no scaling, no indexing, rsp as base
     db offset
     dd source
   else
@@ -318,17 +452,27 @@ end macro
 ; Move from a 64-bit register, to a 64-bit location relative to a 64-bit
 ; register, with an 8-bit displacement and no indexing.
 ;
-; This uses opcode 0x89.
+; This uses opcode 0x89 with REX.W, so that gives us the reg field as the
+; 64-bit source and the R/M field as the 64-bit destination.
+;
+; We need to treat a target of rsp specially because it's the SIB case per
+; table 2-2.
 macro mov.qreg.disp8.qreg target, offset, source
+  qwordreg sreg, source
+  qwordreg treg, target
   match =rsp, target
-    qwordreg sreg, source
     rex.w
     db 0x89
-    modrm 1, sreg, 4
+    modrm 1, sreg, treg
+      ; treg is rsp by assumption, and R/M = rsp is the SIB case
     sib 0, 4, 4
+      ; no scaling, no indexing, rsp as base
     db offset
   else
-    assert 0
+    rex.w
+    db 0x89
+    modrm 1, sreg, treg
+    db offset
   end match
 end macro
 
@@ -346,7 +490,9 @@ macro mov.qreg.disp8.dimm target, offset, source
     rex.w
     db 0xC7
     modrm 1, 0, 4
+      ; 4 is rsp, but it's a special case
     sib 0, 4, 4
+      ; no scaling, no indexing, rsp as base
     db offset
     dd source
   else
@@ -404,16 +550,17 @@ end macro
 ;;; Executable file format ;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;
-;;; Before we get into the body of the program, we do a lot of ELF-specific
+;;;   Before we get into the body of the program, we do a lot of ELF-specific
 ;;; stuff to ensure that our output is in a format Linux knows how to run.
 ;;;
-;;; First, we set the origin to load at. This is arbitrary, but it can't be
+;;;   First, we set the origin to load at. This is arbitrary, but it can't be
 ;;; zero. We tell flatassembler about it because it's used in label
 ;;; calculations; we can reference it as $$ any time we need it in future.
 org 0x08000000
 
 ;;;
-;;; Second, we output ELF's top-level file header.
+;;;   Second, we output ELF's top-level file header. The only interesting
+;;; thing here is the entry pointer.
 ;;;
 elf_header:
   ; * denotes mandatory fields according to breadbox
@@ -443,18 +590,20 @@ elf_header:
 elf_header_size = $ - elf_header
 
 ;;;
-;;; Third, immediately after the ELF file header, we output ELF's program
+;;;   Third, immediately after the ELF file header, we output ELF's program
 ;;; header, which lists the memory regions ("segments") we want to have and
 ;;; where we want them to come from. We list just a single region, which is
 ;;; the entire contents of the ELF file from disk.
 ;;;
-;;; It would be more typical to have separate code and data segments, and
-;;; perhaps a stack or heap, but this keeps things simple. We do have a little
-;;; stack space available, though we don't explicitily request any; the kernel
-;;; allocates it for us as part of exec() so that it can pass us argc and argv
-;;; (which we ignore). That stack space will be at a random address, different
-;;; every time, because of ASLR; that's a neat security feature, so we leave
-;;; it as-is.
+;;;   It would be more typical to use this header to ask the loader to give us
+;;; separate code and data segments, and perhaps a stack or heap, but this
+;;; keeps things simple, and we can create those things for ourselves later.
+;;;
+;;;   We do have a little stack space available, though we don't explicitly
+;;; request any; the kernel allocates it for us as part of exec() so that it
+;;; can pass us argc and argv (which we ignore). That stack space will be at a
+;;; random address, different every time, because of ASLR; that's a neat
+;;; security feature, so we leave it as-is.
 ;;;
 program_header:
   dd 1                           ; *"loadable" segment type
@@ -568,7 +717,7 @@ program_header_entry_size = $ - program_header
 ;;;
 ;;; Additionally, immediately after beginning execution of a word:
 ;;;
-;;; * rax points to the address being executed
+;;; * rax points to the address of the codeword being executed
 ;;;     The value of rax is purely for the callee's benefit, and does not need
 ;;;   to be preserved.
 ;;;
@@ -698,9 +847,123 @@ DOCOL:
 ;;;
 ;;;   This routine is really only responsible for one-time initialization.
 ;;;
+;;; Registers in:
+;;;
+;;; * rsp points to the top (low end) of the value stack
+;;;     The kernel sets this up for us, and we need to save it somewhere so
+;;;   Forth can use it.
+;;;
+;;; Registers out:
+;;;
+;;; * rsp points to the top of the control stack
+;;; * rsi points within QUIT
+;;;     QUIT is the word that's Forth's closest equivalent to main().
+;;;
+;;; Registers within:
+;;;
+;;; * rdi points to the base address of the heap, once it has been allocated
+;;;     This is the same value that S0 will hold, once we reach a point
+;;;   where we can rely on Forth variable-words.
+;;;
 _start:
   cld                                      ; clear the DF flag
-  ; If we wanted to save the initial stack pointer, we'd do that here.
+
+  ;;;
+  ;;; Prepare the heap.
+  ;;;
+  ;;;   We could ask for a data segment in the program header, but where's the
+  ;;; fun in that? Instead, we call mmap().
+  ;;;
+  ;;;   If we wanted the kernel to do ASLR for us, passing address zero would
+  ;;; cause it to pick somewhere at random, but instead we choose our own
+  ;;; location. It's still not guaranteed to be where we ask for, so we still
+  ;;; do the work to record where it wound up. We could pass the "fixed" flag
+  ;;; and the kernel would trust us, but this gives us more options for
+  ;;; interoperating with other runtimes.
+  ;;;
+  mov.b rax, 9                             ; mmap()
+  mov.qreg.qimm rdi, 0x0000001000000000    ; address (very arbitrary)
+  mov.qreg.qimm rsi, 0x0000000001000000    ; size (16 MiB)
+  mov.qreg.qimm rdx, 0x03                  ; protection (read+write)
+  mov.oreg.qimm r10, 0x22                  ; flags (private+anonymous)
+  mov.oreg.qimm r8, 0                      ; file descriptor (ignored)
+  mov.oreg.qimm r9, 0                      ; offset (ignored)
+  syscall
+
+  ;;;
+  ;;;   The return value of the system call is in rax, we'll use it in a sec.
+  ;;; We need to save this somewhere in case we ever want to munmap() it;
+  ;;; there's no widely-used name for it so we have to make one up. S0 and R0
+  ;;; are widely-used names for the logical tops of the value and control
+  ;;; stacks, respectively, and we will eventually set those up as well, so we
+  ;;; should keep those names in mind. The control stack lives within the
+  ;;; heap, while the value stack is its own segment. This value, though, is
+  ;;; the physical bottom of the segment, meaning that it stays the same even
+  ;;; as we allocate and deallocate things within it. This is unlike the two
+  ;;; stack pointers, so we give it a name that doesn't suggest similarity:
+  ;;; HEAP.
+  ;;;
+  ;;;   Once Forth is fully set up, its internal variables will be accessed
+  ;;; through variable-words like any other Forth data, including HEAP. To get
+  ;;; to that point, though, we need to be able to hold onto variable data
+  ;;; between now and then. In fact, if we don't have at least one of HEAP and
+  ;;; HERE (its counterpart which points to the logical top end), all our
+  ;;; efforts to hold onto anything seem a bit doomed.
+  ;;;
+  ;;;   So, we temporarily dedicate rdi to HEAP - only within this routine -
+  ;;; and store everything else in ways that let us find things by reference
+  ;;; to it. We choose rdi because it works with the indexing modes we care
+  ;;; about, and its name suggests its function.
+  ;;;
+  ;;;   The strategy Jonesforth uses is not applicable to us; Jonesforth
+  ;;; takes advantage of the linker to let its code segment refer to specific,
+  ;;; pre-allocated objects in the data segment. We are our own linker.
+  ;;; Hence, this approach.
+  ;;;
+  ;;;   Keying things off HEAP is the fundamental decision, but to make sure
+  ;;; our variables are accessible both during early bootstrapping, and later,
+  ;;; we also have to be thoughtful about data structures. More on that in a
+  ;;; moment.
+  ;;;
+  mov.qreg.qreg rdi, rax
+
+  ;;;
+  ;;;   Now we save some stuff onto the heap. These are the locations that
+  ;;; will eventually be the backing stores of the Forth variables, but we
+  ;;; don't create the word headers yet, since there's no requirement that
+  ;;; they be next to the backing stores. We'll do that later, once we have
+  ;;; word-writing infrastructure in place. For now, we just use their offsets
+  ;;; relative to the physical bottom of the heap, which are fixed.
+  ;;;
+  ;;;   These will be the permanent homes of these values, though we have
+  ;;; copies of them elsewhere while we're still in this routine.
+  ;;;
+  mov.qreg.disp8.qreg rdi, 0x00, rdi       ; HEAP
+  mov.qreg.disp8.qreg rdi, 0x08, rsp       ; S0
+  ; TODO this isn't done yet, need to reserve space and explain it more
+  ;;;
+  ;;; * HEAP is the physical bottom of the heap
+  ;;;     The heap grows upwards in memory, so this is also the logical
+  ;;;   bottom. This comes from the address mmap() just returned to us.
+  ;;; * S0 is the logical bottom of the value stack
+  ;;;     The value stack grows downwards in memory, so this is the physical
+  ;;;   top of it. This comes from the stack pointer the kernel initialized us
+  ;;;   with.
+  ;;; * R0 is the logical bottom of the control stack
+  ;;;     The control stack also grows downwards, so this is its physical top
+  ;;;   as well. We allocate this dedicated space within the heap right here,
+  ;;;   in this routine, through our choice of where to put things.
+  ;;;
+  ;;;     S0 and R0 are mostly used when we want to initialize or reinitialize
+  ;;;   their respective stacks - that is, discard all their contents at once.
+
+  ;;; TODO we don't do this yet
+  ;;;   Now we're going to create a word in the heap, to hold the value of
+  ;;; HERE long-term.
+
+  ;;;
+  ;;;   We would like very much to get out of the bootstrap code and into a
+  ;;; proper threaded-execution setup.
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -789,7 +1052,7 @@ _start:
   ;;; way to generate it.
   ;;;
   ;;; TODO of course, really we want to for-real track this
-  mov.qreg.qimm rax, 0x201
+  mov.qreg.qimm rax, 0x24F
   add.qreg.qreg rdx, rax
 
   ;;;