From a2dbddffcfc70e2ab84173b7eb7e24ac09f5dfe2 Mon Sep 17 00:00:00 2001
From: Irene Knapp <ireneista@irenes.space>
Date: Mon, 27 Oct 2025 01:45:20 -0700
Subject: ROLL, UNROLL, and a whole bunch of addressing modes

also a couple more gdb tips

Force-Push: yes
Change-Id: I31038334449ea45238c811b4e97e2d87833d8ea6
---
 quine.asm | 450 +++++++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 373 insertions(+), 77 deletions(-)

(limited to 'quine.asm')

diff --git a/quine.asm b/quine.asm
index 22ebb1e..6ab7da4 100644
--- a/quine.asm
+++ b/quine.asm
@@ -46,13 +46,14 @@
 ;;;
-;;; (gdb) disassemble/r 0x0x80007c0,+32
+;;; (gdb) disassemble/r 0x80007c0,+32
 ;;;
-;;; If you get the value of rsp out of info registers, and for example it's an
-;;; address along the lines of 0x7fffffff8650, you can do
+;;; If you want to see the value stack, you can do
 ;;;
-;;; (gdb) x/16xg 0x7fffffff8650
+;;; (gdb) x/16xg $rsp
 ;;;
-;;; The same will work with rbp for the control stack, and don't forget that
-;;; the "instruction pointer" is rsi.
+;;; The same will work with $rbp for the control stack, and don't forget that
+;;; the "instruction pointer" is rsi. To see all the registers, do
+;;;
+;;; (gdb) info registers
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -151,6 +152,21 @@ macro opcodereg opcode, reg
   db opcode or reg
 end macro
 
+; Turn an index scale into the value of the SIB byte's two-bit scale field,
+; and store it in the variable named by sfield.
+;
+;   scale  1  2  4  8
+;   field  0  1  2  3
+;
+; Any other scale can't be encoded, so it fails the assembly.
+macro scalefield sfield, scale
+  assert (scale) = 1 | (scale) = 2 | (scale) = 4 | (scale) = 8
+  sfield = bsf (scale)
+    ; bsf is the position of the lowest set bit, which for a power of two is
+    ; its base-2 logarithm; that's exactly the mapping in the table.
+end macro
+
+
 ;;; On registers
 ;;; ------------
 ;;;
@@ -326,9 +342,9 @@ end macro
 ; Take a 64-bit source register, treat it as an address and look up the 64-bit
 ; value it points to, store that into a 64-bit target register.
 ;
-; For rsp and rbp, the only modes available also have displacement; we use an
-; 8-bit one and set it to zero. The other registers could be encoded without
-; the displacement, but for simplicity's sake we do the same thing for all of
+; For rbp, the only modes available also have displacement; we use an 8-bit
+; one and set it to zero. The other registers could be encoded without the
+; displacement, but for simplicity's sake we do the same thing for all of
 ; them.
 ;
 ; In understanding this, pay close attention to the Op/En column in the opcode
@@ -347,22 +363,27 @@ end macro
 ;
-; We disallow rsp as a source because that's the mode that would want an SIB.
+; When rsp is the source, R/M = 4 means an SIB byte follows, so we emit one.
 macro mov.qreg.indirect.qreg target, source
+  qwordreg sreg, source
+  qwordreg treg, target
+  rex.w
+  db 0x8B
+  modrm 1, treg, sreg
   match =rsp, source
-    assert 0
-  else
-    qwordreg sreg, source
-    qwordreg treg, target
-    rex.w
-    db 0x8B
-    modrm 1, treg, sreg
-    db 0
+    ; R/M = rsp is the SIB case
+    sib 0, 4, sreg
+      ; no scaling, no indexing, source as base
   end match
+  db 0
 end macro
 
 
 ; Take a 64-bit source register, store its value into the address pointed to
-; by a 64-bit target register. The only modes available also have
-; displacement; we use an 8-bit one and set it to zero.
+; by a 64-bit target register.
+;
+; For rbp, the only modes available also have displacement; we use an 8-bit
+; one and set it to zero. The other registers could be encoded without the
+; displacement, but for simplicity's sake we do the same thing for all of
+; them.
 ;
 ; In understanding this, pay close attention to the Op/En column in the opcode
 ; table. The "MR" variant means the ModRM byte's reg field (the middle one)
@@ -379,17 +400,21 @@ end macro
 ; chapter 2, section 2-1.5, table 2-2.
 ;
-; We disallow rsp as a target because that's the mode that would want an SIB.
+; When rsp is the target, R/M = 4 means an SIB byte follows, so we emit one.
+; When you look at other addressing modes, be aware that the special treatment
+; is for whichever register is specified in the R/M field. Sometimes that's
+; the source, and sometimes it's the target, depending on the opcode.
 macro mov.indirect.qreg.qreg target, source
+  qwordreg sreg, source
+  qwordreg treg, target
+  rex.w
+  db 0x89
+  modrm 1, sreg, treg
   match =rsp, target
-    assert 0
-  else
-    qwordreg sreg, source
-    qwordreg treg, target
-    rex.w
-    db 0x89
-    modrm 1, sreg, treg
-    db 0
+    ; R/M = rsp is the SIB case
+    sib 0, 4, treg
+      ; no scaling, no indexing, target as base
   end match
+  db 0
 end macro
 
 
@@ -410,10 +435,10 @@ macro mov.indirect.qreg.breg target, source
 end macro
 
 macro mov.breg.indirect.qreg target, source
-  match =rsp, target
+  match =rsp, source
     assert 0
     ; The SIB case.
-  else match =rbp, target
+  else match =rbp, source
     assert 0
     ; An unrelated addressing mode.
   else
@@ -443,10 +468,10 @@ end macro
 
 ; We use the operand-size prefix to specify 16-bit. No REX.W. Table 3-4.
 macro mov.wreg.indirect.qreg target, source
-  match =rsp, target
+  match =rsp, source
     assert 0
     ; The SIB case.
-  else match =rbp, target
+  else match =rbp, source
     assert 0
     ; An unrelated addressing mode.
   else
@@ -476,10 +501,10 @@ end macro
 
 ; It defaults to 32-bit, no prefix needed, also no REX.W. Table 3-4.
 macro mov.dreg.indirect.qreg target, source
-  match =rsp, target
+  match =rsp, source
     assert 0
     ; The SIB case.
-  else match =rbp, target
+  else match =rbp, source
     assert 0
     ; An unrelated addressing mode.
   else
@@ -491,6 +516,46 @@ macro mov.dreg.indirect.qreg target, source
 end macro
 
 
+; Load the 64-bit value at [source + index*scale] into target (opcode 0x8B).
+; Mode 0 with R/M = 4 asks for an SIB byte and no displacement.
+macro mov.qreg.indexed.qreg target, source, index, scale
+  match =rsp, index
+    assert 0  ; rsp's number in the index field means "no index" instead.
+  else match =rbp, source
+    assert 0  ; rbp as base has subcases we don't wish to deal with yet.
+  else
+    scalefield sfield, scale
+    qwordreg ireg, index
+    qwordreg sreg, source
+    qwordreg treg, target
+    rex.w
+    db 0x8B
+    modrm 0, treg, 4
+    sib sfield, ireg, sreg
+  end match
+end macro
+
+
+; Store the 64-bit source at [target + index*scale] (opcode 0x89). The SIB
+; base is target, so target, not source, is the register that can't be rbp.
+macro mov.indexed.qreg.qreg target, index, scale, source
+  match =rbp, target
+    assert 0  ; Mode 0 with base rbp means "disp32, no base" instead.
+  else match =rsp, index
+    assert 0  ; rsp's number in the index field means "no index" instead.
+  else
+    qwordreg treg, target
+    qwordreg sreg, source
+    qwordreg ireg, index
+    scalefield sfield, scale
+    rex.w
+    db 0x89
+    modrm 0, sreg, 4
+    sib sfield, ireg, treg
+  end match
+end macro
+
+
 ; Take a 64-bit source register, store its value into a high 64-bit target
 ; register (r8-r15).
 ;
@@ -539,6 +604,24 @@ macro mov.qreg.oreg target, source
 end macro
 
 
+; This increments a 64-bit register by 1, in place. It's INC r/m64: FF /0.
+macro inc.qreg target
+  qwordreg treg, target
+  rex.w
+  db 0xFF
+  modrm 3, 0, treg
+    ; The 0 in the reg field is part of the opcode ("/0"), not a register.
+end macro
+
+; This decrements a 64-bit register by 1, in place. It's DEC r/m64: FF /1.
+macro dec.qreg target
+  qwordreg treg, target
+  rex.w
+  db 0xFF
+  modrm 3, 1, treg
+    ; The 1 in the reg field is part of the opcode ("/1"), not a register.
+end macro
+
 ; This adds a 64-bit register to another 64-bit register, in place.
 macro add.qreg.qreg target, source
   qwordreg treg, target
@@ -980,11 +1063,59 @@ macro lea.qreg.qreg.disp32 target, source, offset
   end match
 end macro
 
+; Compute the address source + index*scale into target, without touching
+; memory (opcode 0x8D). Mode 0 with R/M = 4 asks for an SIB, no displacement.
+macro lea.qreg.indexed.qreg target, source, index, scale
+  match =rsp, index
+    assert 0  ; rsp's number in the index field means "no index" instead.
+  else match =rbp, source
+    assert 0  ; rbp as base has subcases we don't wish to deal with yet.
+  else
+    scalefield sfield, scale
+    qwordreg ireg, index
+    qwordreg sreg, source
+    qwordreg treg, target
+    rex.w
+    db 0x8D
+    modrm 0, treg, 4
+    sib sfield, ireg, sreg
+  end match
+end macro
+
+; Compute the address source + index*scale + offset into target, where offset
+; is a signed 8-bit displacement. This uses every optional part of the
+; addressing encoding: ModRM, SIB, and displacement. See [Intel] volume 2A,
+; chapter 2, section 2-1, with particular attention to figure 2-1.
+macro lea.qreg.disp8.indexed.qreg target, offset, source, index, scale
+  match =rsp, index
+    assert 0  ; rsp's number in the index field means "no index" instead.
+  else match =rbp, source
+    assert 0  ; rbp as base is excluded, as in the displacement-free forms.
+  else
+    scalefield sfield, scale
+    qwordreg ireg, index
+    qwordreg sreg, source
+    qwordreg treg, target
+    rex.w
+    db 0x8D
+    modrm 1, treg, 4
+      ; Mode 1 asks for a disp8; R/M = 4 asks for an SIB byte.
+    sib sfield, ireg, sreg
+    db offset
+      ; The displacement comes last, after the SIB.
+  end match
+end macro
+
 ; Clear the DF flag. This makes string instructions increment RSI.
 macro cld
   db 0xFC
 end macro
 
+; Set the DF flag. This makes string instructions decrement RSI and RDI.
+macro std
+  db 0xFD
+end macro
+
 ; Load 64 bits from the address in RSI into RAX. Then, increment or decrement
 ; RSI by 8 bytes, depending on the value of the DF flag.
 macro lodsq
@@ -992,6 +1123,34 @@ macro lodsq
   db 0xAD
 end macro
 
+;   [Intel] describes two different styles of mnemonic for the repeated string
+; operations. See, their parameters are always rsi and rdi, or the smaller
+; versions of those same specific registers. Intel thinks we might want to
+; write out "rsi" explicitly, even though the only information it conveys is
+; the size. The position we take is that it's better to let that be conveyed
+; by the instruction name; otherwise it'd be a point of confusion for new
+; readers, who might mistakenly think it's possible to pass it different
+; registers.
+;
+;   With the string instructions, the reader SHOULD be thinking, "Wait...
+; where does this get its parameters from?" Writing them in a way that makes
+; them appear simpler than they are would be confusing.
+macro rep operation
+  match =movsq, operation
+    ; The "rep" instruction can also be thought of as a prefix to other
+    ; instructions, though only a few specific ones are allowed. Anyway, it
+    ; comes before the REX byte.
+    db 0xF3
+    ; The rest of this is the same as the encoding of normal, non-repeated
+    ; movsq.
+    rex.w
+    db 0xA5
+    ; No explicit parameters: count in rcx, source at rsi, destination at rdi.
+  else
+    assert 0  ; Only "rep movsq" is supported so far.
+  end match
+end macro
+
 ; Push a 64-bit value from a register onto the stack (the one pointed to by
 ; rsp). Decrement rsp, then write the value at the new location.
 ;
@@ -1780,6 +1939,136 @@ defword DROP2, 0
   pop.qreg rax
   NEXT
 
+; Rotates "up" (pops its parameter, n; nth item then becomes current item).
+;
+; We implement this the high-performance way, with rep movsq, aka the
+; instruction that exists to optimize C's memcpy(). The details of setting
+; that up are complex; see below.
+defword ROLL, 0
+  dq $ + 0x8                     ; codeword
+
+  ;   Pop our parameter. The rep instruction takes rcx as its count, so we
+  ; reduce copying by using it to hold our count, as well.
+  pop.qreg rcx
+
+  ;   We have n - 1 items to slide, so decrement rcx. After that, rcx is
+  ; exactly the number of repetitions that will happen: the rep instruction
+  ; checks rcx first and stops if it's zero; otherwise it performs a single
+  ; movsq, decrements rcx, and checks again.
+  dec.qreg rcx
+
+  ;   Retrieve the nth item, for later. For this purpose we're thinking in
+  ; zero-based terms, so we do this after already having decremented rcx.
+  mov.qreg.indexed.qreg rbx, rsp, rcx, 8
+
+  ;   The source address for movsq is rsi and the destination is rdi; we can
+  ; use rdi as we wish, but rsi is our Forth "instruction pointer", so we must
+  ; save and restore it. Doing so alters rsp, so we have to adjust the address
+  ; calculations by eight bytes as compared to the expressions above, but
+  ; happily we can use the disp8 field to do that. We'd be using disp8 anyway
+  ; because it's helpful.
+  push.qreg rsi
+
+  ;   Now we set up parameters for the memory-sliding operation. We have
+  ; n - 1 items to copy, moving the range rsp through rsp + (n-2)*8 onto the
+  ; range rsp + 8 through rsp + (n-1)*8. That's with the value of rsp as it
+  ; existed before the push above; the lea offsets below account for it.
+  ;
+  ;   We're sliding them upwards in memory, so we start at the high end so
+  ; that we're always moving into a location that doesn't have anything
+  ; precious. We use lea as a convenient way to do the stack math.
+  ;
+  ;   When rcx is 1, we want rsp + 8.
+  lea.qreg.indexed.qreg rsi, rsp, rcx, 8
+  ;   When rcx is 1, we want rsp + 16.
+  lea.qreg.disp8.indexed.qreg rdi, 8, rsp, rcx, 8
+  ;
+  ;   Using rcx = 1 is the most convenient example to use for figuring out the
+  ; arithmetic. It's a linear relationship, so as long as we get the 8-byte
+  ; stride correct, we just need to pick a single point and verify that our
+  ; math is right for that point, and it'll be right for any value of rcx.
+
+  ;   Another of our Forth conventions is that the DF flag should be kept at
+  ; zero, which directs string instructions to increment rsi. Here, however,
+  ; because our source and destination ranges overlap, we need to start at the
+  ; high end, which means we need it to decrement. So we set DF to one, and
+  ; we'll clear it after.
+  std
+  rep movsq
+
+  ; Set everything back.
+  cld
+  pop.qreg rsi
+
+  ; There is now an extra item at the low end of the stack (the top) that
+  ; needs to go away, and coincidentally we have a value in rbx that needs to
+  ; be in that spot. Rather than doing a drop and push, we overwrite it, to
+  ; save a little work.
+  mov.indirect.qreg.qreg rsp, rbx
+
+  ; All done, wow! What a mouthful.
+  NEXT
+
+; Rotates "down" (pops its parameter, n; current item then becomes nth item).
+;
+; We implement this the high-performance way, with rep movsq, aka the
+; instruction that exists to optimize C's memcpy(). The details of setting
+; that up are complex; see below.
+defword UNROLL, 0
+  dq $ + 0x8                     ; codeword
+
+  ;   Pop our parameter. The rep instruction takes rcx as its count, so we
+  ; reduce copying by using it to hold our count, as well.
+  pop.qreg rcx
+
+  ; We have n - 1 items to slide, so decrement rcx. Also, save a copy of it in
+  ; rdx after doing that, for later.
+  dec.qreg rcx
+  mov.qreg.qreg rdx, rcx
+
+  ;   Now we set up parameters for the memory-sliding operation. We have
+  ; n - 1 items to copy, moving the range rsp + 8 through rsp + (n-1)*8 onto
+  ; the range rsp through rsp + (n-2)*8.
+  ;
+  ;   We're sliding them downwards in memory, so we start at the low end so
+  ; that we're always moving into a location that doesn't have anything
+  ; precious. That means the starting addresses are rsp + 8 and rsp, no
+  ; matter what n is; unlike with ROLL, the count doesn't enter into them.
+  ;
+  ;   Pop the 0th item into rbx, for later. As a side effect, rsp now points
+  ; at the first item to copy.
+  pop.qreg rbx
+
+  ;   The source address for movsq is rsi, our Forth "instruction pointer",
+  ; so we must save and restore it. We keep it in rax (a scratch register)
+  ; rather than on the stack, so that rsp stays put while we take addresses.
+  mov.qreg.qreg rax, rsi
+  mov.qreg.qreg rsi, rsp
+
+  ;   Reclaim the slot we popped from; that's the destination. The value this
+  ; push writes doesn't matter, since the slot gets overwritten below.
+  push.qreg rbx
+  mov.qreg.qreg rdi, rsp
+
+  ;   With ROLL, we were starting at the high end. Here, we start at the low
+  ; end, which means we need rsi to increment after each repetition. That's
+  ; what it does when the DF flag is clear, and another of our Forth
+  ; conventions is to keep it clear normally. So, we don't have to touch DF!
+  ; Yay!
+  rep movsq
+
+  ; Restore our original rsi.
+  mov.qreg.qreg rsi, rax
+
+  ; There is now an extra item in the middle of the stack, at the high end of
+  ; the sliding we did, that needs to be overwritten with our value in rbx.
+  ; Since rep destructively updated our count in rcx, we saved a copy of it
+  ; in rdx, and we use that to find the right address, rsp + (n-1)*8.
+  mov.indexed.qreg.qreg rsp, rdx, 8, rbx
+
+  ; All done, wow! What a mouthful.
+  NEXT
+
 ; Rotates "up" (third item becomes current item)
 defword ROLL3, 0
   dq $ + 0x8                     ; codeword
@@ -1792,7 +2081,7 @@ defword ROLL3, 0
   NEXT
 
 ; Rotates "down" (current item becomes third item)
-defword ROLLD3, 0
+defword UNROLL3, 0
   dq $ + 0x8                     ; codeword
   pop.qreg rax
   pop.qreg rbx
@@ -2130,10 +2419,61 @@ defword SYS_WRITE, 0
   NEXT
 
 
+;;;;;;;;;;;;;;;;;;;;;;
+;;; Output helpers ;;;
+;;;;;;;;;;;;;;;;;;;;;;
+
+; In: base address, value. This applies to all four PACK* words below.
+; Out: new base address; that is, the old one plus 8, 4, 2, or 1.
+defword PACK64, 0
+  dq DOCOL                       ; codeword
+  dq SWAP, DUP, UNROLL3, STORE, LIT, 8, ADD
+  dq EXIT
+defword PACK32, 0
+  dq DOCOL                       ; codeword
+  dq SWAP, DUP, UNROLL3, STORE32, LIT, 4, ADD
+  dq EXIT
+defword PACK16, 0
+  dq DOCOL                       ; codeword
+  dq SWAP, DUP, UNROLL3, STORE16, LIT, 2, ADD
+  dq EXIT
+defword PACK8, 0
+  dq DOCOL                       ; codeword
+  dq SWAP, DUP, UNROLL3, STORE8, LIT, 1, ADD
+  dq EXIT
+
+; In the interests of reducing our executable's size, since a lot of it goes
+; to PACK* invocations, we define words that combine LIT with PACK*. This
+; shaves roughly 700 bytes as of when it was added.
+defword LITPACK64, 0
+  dq $ + 0x8                     ; codeword
+  lodsq                          ; rax = [rsi], then rsi moves past it
+  push.qreg rax                  ; push it as the value for PACK64
+  BEFORENEXT PACK64
+defword LITPACK32, 0
+  dq $ + 0x8                     ; codeword
+  lodsq                          ; rax = [rsi], then rsi moves past it
+  push.qreg rax                  ; push it as the value for PACK32
+  BEFORENEXT PACK32
+defword LITPACK16, 0
+  dq $ + 0x8                     ; codeword
+  lodsq                          ; rax = [rsi], then rsi moves past it
+  push.qreg rax                  ; push it as the value for PACK16
+  BEFORENEXT PACK16
+defword LITPACK8, 0
+  dq $ + 0x8                     ; codeword
+  lodsq                          ; rax = [rsi], then rsi moves past it
+  push.qreg rax                  ; push it as the value for PACK8
+  BEFORENEXT PACK8
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; (new) Implementation strategy ;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;
+;;;   We assemble the entire file contents in a heap-allocated buffer. When
+;;; the file is fully assembled, we output it.
+;;;
 defword QUINE, 0
   dq DOCOL                       ; codeword
 
@@ -2176,50 +2516,6 @@ defword WRITE_SELF_RAW_H, 0
   NEXT
 
 
-; In: base address, value
-; Out: new base address
-defword PACK64, 0
-  dq DOCOL                       ; codeword
-  dq SWAP, DUP, ROLLD3, STORE, LIT, 8, ADD
-  dq EXIT
-defword PACK32, 0
-  dq DOCOL                       ; codeword
-  dq SWAP, DUP, ROLLD3, STORE32, LIT, 4, ADD
-  dq EXIT
-defword PACK16, 0
-  dq DOCOL                       ; codeword
-  dq SWAP, DUP, ROLLD3, STORE16, LIT, 2, ADD
-  dq EXIT
-defword PACK8, 0
-  dq DOCOL                       ; codeword
-  dq SWAP, DUP, ROLLD3, STORE8, LIT, 1, ADD
-  dq EXIT
-
-; In the interests of reducing our executable's size, since a lot of it goes
-; to PACK* invocations, we define words that combine LIT with PACK*. This
-; shaves roughly 700 bytes as of when it was added.
-defword LITPACK64, 0
-  dq $ + 0x8                     ; codeword
-  lodsq
-  push.qreg rax
-  BEFORENEXT PACK64
-defword LITPACK32, 0
-  dq $ + 0x8                     ; codeword
-  lodsq
-  push.qreg rax
-  BEFORENEXT PACK32
-defword LITPACK16, 0
-  dq $ + 0x8                     ; codeword
-  lodsq
-  push.qreg rax
-  BEFORENEXT PACK16
-defword LITPACK8, 0
-  dq $ + 0x8                     ; codeword
-  lodsq
-  push.qreg rax
-  BEFORENEXT PACK8
-
-
 ;;;
 ;;; ELF header
 ;;;
-- 
cgit 1.4.1