there was an important bug in mov.qreg.disp8.bimm

it was using rax as an index register, when it should have been using no index register. this was working by coincidence, because rax was zero, but now it's not. yay fixed! there's also lots of decisions made and documented about the execution model, but they're only part-implemented and I wouldn't be checking them in right now if that were all that's new Force-Push: yes Change-Id: Ie8f64914484cd405272d640feeb1cf586ad915d7
author: Irene Knapp <ireneista@irenes.space> 2025-10-19 17:49:06 -0700
committer: Irene Knapp <ireneista@irenes.space> 2025-10-19 17:49:06 -0700
commit: c5c02fa49385bd965e3466ecdd8b8c791b67d26a (patch)
tree: a54534064b235232145434d7643af0583636ec57 /quine.asm
parent: 8c9157ba5a54f3ab08c2082864da9395f99dad94 (diff)
1 files changed, 290 insertions, 27 deletions
diff --git a/quine.asm b/quine.asm
index 3b6df78..1139cc8 100644
--- a/quine.asm
+++ b/quine.asm
@@ -34,21 +34,61 @@
 ;;; Assembly language ;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;
-;;; Before doing any actual code, we define macros for writing x86-64 assembly
-;;; language. This is built from scratch, relying only on flatassembler's
-;;; built-in semantics. No include files of any kind are used for it.
+;;;   Before doing any actual code, we define macros for writing x86-64
+;;; assembly language. This is built from scratch, relying only on
+;;; flatassembler's built-in semantics. No include files of any kind are used
+;;; for it.
 
+; The way these are all spelled out like this is slightly ridiculous, there
+; must be a better way.
 macro rex.0
   db 0x40
 end macro
-
 macro rex.w
   db 0x48
 end macro
-
+macro rex.r              ; REX byte = 0100WRXB; each macro sets the named bits
+  db 0x44                ; 0100 0100b: R
+end macro
+macro rex.x
+  db 0x42                ; 0100 0010b: X
+end macro
+macro rex.b
+  db 0x41                ; 0100 0001b: B
+end macro
+macro rex.wr
+  db 0x4C                ; 0100 1100b: W, R
+end macro
+macro rex.wx
+  db 0x4A                ; 0100 1010b: W, X
+end macro
+macro rex.wb
+  db 0x49                ; 0100 1001b: W, B
+end macro
+macro rex.rx
+  db 0x46                ; 0100 0110b: R, X
+end macro
+macro rex.rb
+  db 0x45                ; 0100 0101b: R, B
+end macro
 macro rex.xb
   db 0x43
 end macro
+macro rex.wrx            ; REX byte = 0100WRXB; each macro sets the named bits
+  db 0x4E                ; 0100 1110b: W, R, X
+end macro
+macro rex.wrb
+  db 0x4D                ; 0100 1101b: W, R, B
+end macro
+macro rex.wxb
+  db 0x4B                ; 0100 1011b: W, X, B
+end macro
+macro rex.rxb
+  db 0x47                ; 0100 0111b: R, X, B
+end macro
+macro rex.wrxb
+  db 0x4F                ; 0100 1111b: W, R, X, B
+end macro
 
 macro modrm mod, reg, rm
   assert mod >= 0 & mod < 4
@@ -92,6 +132,28 @@ macro qwordreg result, register
   end match
 end macro
 
+macro owordreg result, register  ; map r8..r15 to 0..7 (register number - 8)
+  match =r8?, register           ; the ? should make the name case-insensitive
+    result = 0                   ; only the low three bits of the number; the
+  else match =r9?, register      ; caller supplies the fourth via a REX bit
+    result = 1
+  else match =r10?, register
+    result = 2
+  else match =r11?, register
+    result = 3
+  else match =r12?, register
+    result = 4
+  else match =r13?, register
+    result = 5
+  else match =r14?, register
+    result = 6
+  else match =r15?, register
+    result = 7
+  else
+    assert 0                     ; anything other than r8..r15 fails the build
+  end match
+end macro
+
 
 ; TODO what register size does this use?
 macro mov.b target, source
@@ -118,8 +180,18 @@ end macro
 
 
 macro mov.qreg.qimm target, source
-  rex.w
   qwordreg treg, target
+  rex.w                                    ; REX.W: 64-bit operand size
+  opcodereg 0xB8, treg                     ; MOV with the register in the opcode
+  dq source                                ; the 64-bit immediate value
+end macro
+
+
+; Notice the use of REX.B here; this instruction puts the register number in
+; the opcode field, so it uses Table 3-1.
+macro mov.oreg.qimm target, source
+  owordreg treg, target                    ; r8-r15 only; treg is the low 3 bits
+  rex.wb                                   ; W: 64-bit operand; B: the 4th bit
   opcodereg 0xB8, treg
   dq source
 end macro
@@ -135,8 +207,12 @@ end macro
 
 
 ; Take a 64-bit source register, treat it as an address and look up the 64-bit
-; value it points to, store that into a 64-bit target register. The only modes
-; available also have displacement; we use an 8-bit one and set it to zero.
+; value it points to, store that into a 64-bit target register.
+;
+; For rsp and rbp, the only modes available also have displacement; we use an
+; 8-bit one and set it to zero. The other registers could be encoded without
+; the displacement, but for simplicity's sake we do the same thing for all of
+; them.
 ;
 ; In understanding this, pay close attention to the Op/En column in the opcode
 ; table. The "RM" variant means the ModRM byte's R/M field (the third one)
@@ -200,6 +276,55 @@ macro mov.indirect.qreg.qreg target, source
 end macro
 
 
+; Take a 64-bit source register, store its value into a high 64-bit target
+; register (r8-r15).
+;
+; Notice that there are two ways to add another bit to the register encoding.
+; Table 3-1 is about REX.B, but does not apply here, it's for instructions
+; that use opcode bits to specify a register, and none of the
+; register-to-register MOV variants do that (it's for immediate mode).
+;
+; Instead, we want the mechanism that uses REX.R as the extra bit, and it
+; combines with the reg field of ModRM, as per 2.2.1.2.
+;
+; Therefore, we want the variant of MOV which puts the target in the reg
+; field. That's Op/En "RM", opcode 0x8B with REX.WR.
+;
+; Mode 3 is direct addressing.
+macro mov.oreg.qreg target, source
+  owordreg treg, target                    ; r8-r15; low 3 bits go in ModRM.reg
+  qwordreg sreg, source                    ; source number goes in ModRM.r/m
+  rex.wr                                   ; W: 64-bit; R: 4th bit of reg field
+  db 0x8B                                  ; opcode byte; rb would only reserve
+  modrm 3, treg, sreg                      ; mode 3: register-direct
+end macro
+
+
+; Take a high 64-bit source register (r8-r15), store its value into a 64-bit
+; target register.
+;
+; Notice that there are two ways to add another bit to the register encoding.
+; Table 3-1 is about REX.B, but does not apply here, it's for instructions
+; that use opcode bits to specify a register, and none of the
+; register-to-register MOV variants do that (it's for immediate mode).
+;
+; Instead, we want the mechanism that uses REX.R as the extra bit, and it
+; combines with the reg field of ModRM, as per 2.2.1.2.
+;
+; Therefore, we want the variant of MOV which puts the source in the reg
+; field. That's Op/En "MR", opcode 0x89 with REX.WR.
+;
+; Mode 3 is direct addressing.
+macro mov.qreg.oreg target, source
+  qwordreg treg, target                    ; target number goes in ModRM.r/m
+  owordreg sreg, source                    ; r8-r15; low 3 bits go in ModRM.reg
+  rex.wr                                   ; W: 64-bit; R: 4th bit of reg field
+  db 0x89                                  ; opcode byte; rb would only reserve
+  modrm 3, sreg, treg                      ; mode 3: register-direct
+end macro
+
+
+; This adds a 64-bit register to another 64-bit register, in place.
 macro add.qreg.qreg target, source
   qwordreg treg, target
   qwordreg sreg, source
@@ -221,6 +346,7 @@ macro add.qreg.bimm target, source
   db source
 end macro
 
+
 ; This adds a signed 32-bit immediate value to a 64-bit register, in place.
 ;
 ; Notice the use of 3 as the addressing mode. This says to use the register
@@ -269,7 +395,9 @@ macro mov.qreg.disp8.bimm target, offset, source
   match =rsp, target
     db 0xC6
     modrm 1, 0, 4
-    sib 0, 0, 4
+      ; 4 is rsp, but it's a special case
+    sib 0, 4, 4
+      ; no scaling, no indexing, rsp as base
     db offset
     db source
   else
@@ -284,12 +412,16 @@ end macro
 ; us an operand size of 32 bits by default. [Intel] volume 1, section 3.6.1,
 ; table 3-4. We want a 16-bit operand, so we use the operand-size prefix,
 ; 0x66, and we leave REX.W unset.
+;
+; We need to treat rsp specially because it's the SIB case, per table 2-2.
 macro mov.qreg.disp8.wimm target, offset, source
   match =rsp, target
     db 0x66
     db 0xC7
     modrm 1, 0, 4
+      ; 4 is rsp, but it's a special case
     sib 0, 4, 4
+      ; no scaling, no indexing, rsp as base
     db offset
     dw source
   else
@@ -307,7 +439,9 @@ macro mov.qreg.disp8.dimm target, offset, source
   match =rsp, target
     db 0xC7
     modrm 1, 0, 4
+      ; 4 is rsp, but it's a special case
     sib 0, 4, 4
+      ; no scaling, no indexing, rsp as base
     db offset
     dd source
   else
@@ -318,17 +452,27 @@ end macro
 ; Move from a 64-bit register, to a 64-bit location relative to a 64-bit
 ; register, with an 8-bit displacement and no indexing.
 ;
-; This uses opcode 0x89.
+; This uses opcode 0x89 with REX.W, so that gives us the reg field as the
+; 64-bit source and the R/M field as the 64-bit destination.
+;
+; We need to treat a target of rsp specially because it's the SIB case per
+; table 2-2.
 macro mov.qreg.disp8.qreg target, offset, source
+  qwordreg sreg, source                    ; value to store; goes in ModRM.reg
+  qwordreg treg, target                    ; base address; goes in ModRM.r/m
   match =rsp, target
-    qwordreg sreg, source
     rex.w
     db 0x89
-    modrm 1, sreg, 4
+    modrm 1, sreg, treg                    ; mode 1: an 8-bit offset follows
+      ; treg is rsp by assumption, and R/M = rsp is the SIB case
     sib 0, 4, 4
+      ; no scaling, no indexing, rsp as base
     db offset
   else
-    assert 0
+    rex.w                                  ; REX.W: 64-bit operand size
+    db 0x89                                ; same opcode as the rsp case
+    modrm 1, sreg, treg                    ; no SIB byte, so treg is never rsp
+    db offset                              ; the 8-bit displacement
   end match
 end macro
 
@@ -346,7 +490,9 @@ macro mov.qreg.disp8.dimm target, offset, source
     rex.w
     db 0xC7
     modrm 1, 0, 4
+      ; 4 is rsp, but it's a special case
     sib 0, 4, 4
+      ; no scaling, no indexing, rsp as base
     db offset
     dd source
   else
@@ -404,16 +550,17 @@ end macro
 ;;; Executable file format ;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;
-;;; Before we get into the body of the program, we do a lot of ELF-specific
+;;;   Before we get into the body of the program, we do a lot of ELF-specific
 ;;; stuff to ensure that our output is in a format Linux knows how to run.
 ;;;
-;;; First, we set the origin to load at. This is arbitrary, but it can't be
+;;;   First, we set the origin to load at. This is arbitrary, but it can't be
 ;;; zero. We tell flatassembler about it because it's used in label
 ;;; calculations; we can reference it as $$ any time we need it in future.
 org 0x08000000
 
 ;;;
-;;; Second, we output ELF's top-level file header.
+;;;   Second, we output ELF's top-level file header. The only interesting
+;;; thing here is the entry pointer.
 ;;;
 elf_header:
   ; * denotes mandatory fields according to breadbox
@@ -443,18 +590,20 @@ elf_header:
 elf_header_size = $ - elf_header
 
 ;;;
-;;; Third, immediately after the ELF file header, we output ELF's program
+;;;   Third, immediately after the ELF file header, we output ELF's program
 ;;; header, which lists the memory regions ("segments") we want to have and
 ;;; where we want them to come from. We list just a single region, which is
 ;;; the entire contents of the ELF file from disk.
 ;;;
-;;; It would be more typical to have separate code and data segments, and
-;;; perhaps a stack or heap, but this keeps things simple. We do have a little
-;;; stack space available, though we don't explicitily request any; the kernel
-;;; allocates it for us as part of exec() so that it can pass us argc and argv
-;;; (which we ignore). That stack space will be at a random address, different
-;;; every time, because of ASLR; that's a neat security feature, so we leave
-;;; it as-is.
+;;;   It would be more typical to use this header to ask the loader to give us
+;;; separate code and data segments, and perhaps a stack or heap, but this
+;;; keeps things simple, and we can create those things for ourselves later.
+;;;
+;;;   We do have a little stack space available, though we don't explicitly
+;;; request any; the kernel allocates it for us as part of exec() so that it
+;;; can pass us argc and argv (which we ignore). That stack space will be at a
+;;; random address, different every time, because of ASLR; that's a neat
+;;; security feature, so we leave it as-is.
 ;;;
 program_header:
   dd 1                           ; *"loadable" segment type
@@ -568,7 +717,7 @@ program_header_entry_size = $ - program_header
 ;;;
 ;;; Additionally, immediately after beginning execution of a word:
 ;;;
-;;; * rax points to the address being executed
+;;; * rax points to the address of the codeword being executed
 ;;;     The value of rax is purely for the callee's benefit, and does not need
 ;;;   to be preserved.
 ;;;
@@ -698,9 +847,123 @@ DOCOL:
 ;;;
 ;;;   This routine is really only responsible for one-time initialization.
 ;;;
+;;; Registers in:
+;;;
+;;; * rsp points to the top (low end) of the value stack
+;;;     The kernel sets this up for us, and we need to save it somewhere so
+;;;   Forth can use it.
+;;;
+;;; Registers out:
+;;;
+;;; * rsp points to the top of the control stack
+;;; * rsi points within QUIT
+;;;     QUIT is the word that's Forth's closest equivalent to main().
+;;;
+;;; Registers within:
+;;;
+;;; * rdi points to the base of the heap, once it has been allocated
+;;;     This is the same value that HEAP will hold, once we reach a point
+;;;   where we can rely on Forth variable-words.
+;;;
 _start:
   cld                                      ; clear the DF flag
-  ; If we wanted to save the initial stack pointer, we'd do that here.
+
+  ;;;
+  ;;; Prepare the heap.
+  ;;;
+  ;;;   We could ask for a data segment in the program header, but where's the
+  ;;; fun in that? Instead, we call mmap().
+  ;;;
+  ;;;   If we wanted the kernel to do ASLR for us, passing address zero would
+  ;;; cause it to pick somewhere at random, but instead we choose our own
+  ;;; location. It's still not guaranteed to be where we ask for, so we still
+  ;;; do the work to record where it wound up. We could pass the "fixed" flag
+  ;;; and the kernel would trust us, but this gives us more options for
+  ;;; interoperating with other runtimes.
+  ;;;
+  mov.b rax, 9                             ; mmap()
+  mov.qreg.qimm rdi, 0x0000001000000000    ; address (very arbitrary)
+  mov.qreg.qimm rsi, 0x0000000001000000    ; size (0x1000000 bytes = 16 MiB)
+  mov.qreg.qimm rdx, 0x03                  ; protection (read+write)
+  mov.oreg.qimm r10, 0x22                  ; flags (private+anonymous)
+  mov.oreg.qimm r8, 0                      ; file descriptor (ignored)
+  mov.oreg.qimm r9, 0                      ; offset (ignored)
+  syscall
+
+  ;;;
+  ;;;   The return value of the system call is in rax, we'll use it in a sec.
+  ;;; We need to save this somewhere in case we ever want to munmap() it;
+  ;;; there's no widely-used name for it so we have to make one up. S0 and R0
+  ;;; are widely-used names for the logical tops of the value and control
+  ;;; stacks, respectively, and we will eventually set those up as well, so we
+  ;;; should keep those names in mind. The control stack lives within the
+  ;;; heap, while the value stack is its own segment. This value, though, is
+  ;;; the physical bottom of the segment, meaning that it stays the same even
+  ;;; as we allocate and deallocate things within it. This is unlike the two
+  ;;; stack pointers, so we give it a name that doesn't suggest similarity:
+  ;;; HEAP.
+  ;;;
+  ;;;   Once Forth is fully set up, its internal variables will be accessed
+  ;;; through variable-words like any other Forth data, including HEAP. To get
+  ;;; to that point, though, we need to be able to hold onto variable data
+  ;;; between now and then. In fact, if we don't have at least one of HEAP and
+  ;;; HERE (its counterpart which points to the logical top end), all our
+  ;;; efforts to hold onto anything seem a bit doomed.
+  ;;;
+  ;;;   So, we temporarily dedicate rdi to HEAP - only within this routine -
+  ;;; and store everything else in ways that let us find things by reference
+  ;;; to it. We choose rdi because it works with the indexing modes we care
+  ;;; about, and its name suggests its function.
+  ;;;
+  ;;;   The strategy Jonesforth uses is not applicable to us; Jonesforth
+  ;;; takes advantage of the linker to let its code segment refer to specific,
+  ;;; pre-allocated objects in the data segment. We are our own linker.
+  ;;; Hence, this approach.
+  ;;;
+  ;;;   Keying things off HEAP is the fundamental decision, but to make sure
+  ;;; our variables are accessible both during early bootstrapping, and later,
+  ;;; we also have to be thoughtful about data structures. More on that in a
+  ;;; moment.
+  ;;;
+  mov.qreg.qreg rdi, rax
+
+  ;;;
+  ;;;   Now we save some stuff onto the heap. These are the locations that
+  ;;; will eventually be the backing stores of the Forth variables, but we
+  ;;; don't create the word headers yet, since there's no requirement that
+  ;;; they be next to the backing stores. We'll do that later, once we have
+  ;;; word-writing infrastructure in place. For now, we just use their offsets
+  ;;; relative to the physical bottom of the heap, which are fixed.
+  ;;;
+  ;;;   These will be the permanent homes of these values, though we have
+  ;;; copies of them elsewhere while we're still in this routine.
+  ;;;
+  mov.qreg.disp8.qreg rdi, 0x00, rdi       ; HEAP
+  mov.qreg.disp8.qreg rdi, 0x08, rsp       ; S0
+  ; TODO this isn't done yet, need to reserve space and explain it more
+  ;;;
+  ;;; * HEAP is the physical bottom of the heap
+  ;;;     The heap grows upwards in memory, so this is also the logical
+  ;;;   bottom. This comes from the address mmap() just returned to us.
+  ;;; * S0 is the logical bottom of the value stack
+  ;;;     The value stack grows downwards in memory, so this is the physical
+  ;;;   top of it. This comes from the stack pointer the kernel initialized us
+  ;;;   with.
+  ;;; * R0 is the logical bottom of the control stack
+  ;;;     The control stack also grows downwards, so this is its physical top
+  ;;;   as well. We allocate this dedicated space within the heap right here,
+  ;;;   in this routine, through our choice of where to put things.
+  ;;;
+  ;;;     S0 and R0 are mostly used when we want to initialize or reinitialize
+  ;;;   their respective stacks - that is, discard all their contents at once.
+
+  ;;; TODO we don't do this yet
+  ;;;   Now we're going to create a word in the heap, to hold the value of
+  ;;; HERE long-term.
+
+  ;;;
+  ;;;   We would like very much to get out of the bootstrap code and into a
+  ;;; proper threaded-execution setup.
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -789,7 +1052,7 @@ _start:
   ;;; way to generate it.
   ;;;
   ;;; TODO of course, really we want to for-real track this
-  mov.qreg.qimm rax, 0x201
+  mov.qreg.qimm rax, 0x24F
   add.qreg.qreg rdx, rax
 
   ;;;
author	Irene Knapp <ireneista@irenes.space>	2025-10-19 17:49:06 -0700
committer	Irene Knapp <ireneista@irenes.space>	2025-10-19 17:49:06 -0700
commit	c5c02fa49385bd965e3466ecdd8b8c791b67d26a (patch)
tree	a54534064b235232145434d7643af0583636ec57 /quine.asm
parent	8c9157ba5a54f3ab08c2082864da9395f99dad94 (diff)