1 files changed, 146 insertions, 24 deletions
diff --git a/quine.asm b/quine.asm
index f44f608..a25e1c5 100644
--- a/quine.asm
+++ b/quine.asm
@@ -13,11 +13,11 @@
 ;;; Currently, this is not yet fully self-hosting; it is based on
 ;;; flatassembler[1]. A minimal command to build and run it is:
 ;;;
-;;; fasmg quine.asm quine && chmod 755 quine && ./quine; echo $?
+;;; $ fasmg quine.asm quine && chmod 755 quine && ./quine; echo $?
 ;;;
 ;;; A workflow you may wish to use for debugging is:
 ;;;
-;;; rm quine2; fasmg quine.asm quine && chmod 755 quine && ./quine > quine2; echo "exit code:" $?; echo; hexdump -C quine; echo; hexdump -C quine2; echo; cmp -l quine quine2 ; echo cmp: $?
+;;; $ rm quine2; fasmg quine.asm quine && chmod 755 quine && ./quine > quine2; echo "exit code:" $?; echo; hexdump -C quine; echo; hexdump -C quine2; echo; cmp -l quine quine2 ; echo cmp: $?
 ;;;
 ;;; The reason this removes the old one first is that otherwise, there's a
 ;;; risk the error message will be scrolled off the top of the screen and
@@ -25,12 +25,34 @@
 ;;;
 ;;; You may also wish to do:
 ;;;
-;;; objdump --disassemble quine
-;;; ZydisDisasm -64 quine
+;;; $ objdump --disassemble quine
+;;; $ ZydisDisasm -64 quine
 ;;;
 ;;; This relies on GNU binutils, and on zydis, respectively.
 ;;;
 ;;; [1] https://flatassembler.net/
+;;;
+;;;
+;;; gdb
+;;; ---
+;;;
+;;; You can run gdb on it if you want; there are no symbols, but if you are
+;;; familiar with the hex it should be readable. Keep a hexdump of the program
+;;; handy to look up what addresses are.
+;;;
+;;; If you want to see a routine implemented in assembly, look at the hexdump
+;;; of the overall file, find it by looking at the ASCII names, skip past the
+;;; codeword, and do, e.g.,
+;;;
+;;; (gdb) disassemble/r 0x80007c0,+32
+;;;
+;;; If you get the value of rsp out of info registers, and for example it's an
+;;; address along the lines of 0x7fffffff8650, you can do
+;;;
+;;; (gdb) x/16xg 0x7fffffff8650
+;;;
+;;; The same will work with rbp for the control stack, and don't forget that
+;;; the "instruction pointer" is rsi.
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -527,6 +549,23 @@ macro add.qreg.qreg target, source
 end macro
 
 
+macro add.indirect.qreg.qreg target, source
+  match =rsp, target
+    assert 0
+    ; The SIB case: with mod = 0, r/m = 100b means "a SIB byte follows".
+  else match =rbp, target
+    assert 0
+    ; An unrelated addressing mode: with mod = 0, r/m = 101b means disp32.
+  else
+    qwordreg treg, target
+    qwordreg sreg, source
+    rex.w
+    db 0x01                      ; ADD r/m64, r64: [target] += source
+    modrm 0, sreg, treg          ; mod = 0: plain [target], no displacement
+  end match
+end macro
+
+
 ; This adds a signed 8-bit immediate value to a 64-bit register, in place.
 ;
 ; Notice the use of 3 as the addressing mode. This says to use the register
@@ -561,6 +600,21 @@ macro sub.qreg.qreg target, source
   modrm 3, treg, sreg
 end macro
 
+
+macro sub.indirect.qreg.qreg target, source
+  match =rsp, target
+    assert 0                     ; The SIB case.
+  else match =rbp, target
+    assert 0                     ; An unrelated addressing mode (disp32).
+  else
+    qwordreg treg, target
+    qwordreg sreg, source
+    rex.w
+    db 0x29                      ; SUB r/m64, r64 (0x2B is the r64, r/m64 form)
+    modrm 0, sreg, treg          ; mod = 0: [target] -= source, in memory
+  end match
+end macro
+
 ; This subtracts a signed 8-bit immediate value from a 64-bit register, in
 ; place.
 ;
@@ -774,9 +828,9 @@ end macro
 ; register, with an 8-bit displacement and no indexing.
 ;
 ; This uses opcode 0xC7, which has w = 1. We run in 64-bit mode, so that gives
-; us an operand size of 32 bits by default. [Intel] volume 1, section 3.6.1,
-; table 3-4. We want a 16-bit operand, so we use the operand-size prefix,
-; 0x66, and we leave REX.W unset.
+; us an operand size of 32 bits by default. [Intel] volume 1, chapter 3,
+; section 3-6.1, table 3-4. We want a 16-bit operand, so we use the
+; operand-size prefix, 0x66, and we leave REX.W unset.
 ;
 ; We need to treat rsp specially because it's the SIB case, per table 2-2.
 macro mov.qreg.disp8.wimm target, offset, source
@@ -912,7 +966,7 @@ macro lea.qreg.qreg.disp8 target, offset, source
   end match
 end macro
 
-macro lea.qreg.qreg.disp32 target, offset, source
+macro lea.qreg.qreg.disp32 target, source, offset
   match =rsp, target
     ; This is the SIB case
     assert 0
@@ -957,14 +1011,42 @@ macro push.bimm source
 end macro
 
 ; Operand-size prefix makes it 16-bit.
+;
+; To fake a larger push with several 16-bit pushes, push the high word first
+; and the low word last: the stack grows down and memory is little-endian.
+; [Intel] volume 1, chapter 9, section 9-2.4, "Memory Data Formats".
 macro push.wimm source
   db 0x66
   db 0x68
   dw source
 end macro
 
-; There is no 64-bit immediate push. To fake it, push the low half, then the
-; high half. [Intel] volume 1, chapter 9, section 9-2.4, "Memory Data Formats".
+; There is no 64-bit immediate push. So, can we have a push instruction that
+; pushes a 32-bit immediate value? Sort-of, but it's sign-extended to 64 bits,
+; so rsp is decremented by 8, not by 4. This is that instruction.
+;
+; You need to do a really close read of a number of things to understand why.
+; The opcode tables in [Intel] in volume 2D, appendix A, section A-3 give it
+; the d64 annotation, which per table A-1 in section A-2.5 indicates that the
+; operand size is always 64 bits and that there is no corresponding 32-bit
+; version. Yet, the actual immediate value is still only 32 bits! Direct your
+; attention to the instruction's details page, volume 2B, chapter 4, section
+; 4-3, "PUSH". The description section clearly details that the immediate may
+; be less than the operand size, which makes sense once you know it, but it
+; doesn't explicitly call out that the operand size is still 64 bits here.
+;
+; In general, the size of an immediate doesn't determine operand size, as you
+; can read about in detail in [Intel] volume 1, chapter 3, section 3-6.1, with
+; particular attention to table 3-4.
+;
+; Why is this surprising, given that it's consistent with the behavior of
+; other instructions? Well, most instructions don't have such obvious
+; side-effects. It's easy to not notice the operand size disagreeing with the
+; immediate size when you're only writing to a register, but changing the stack
+; in an unexpected way breaks things much more obviously.
+;
+; Anyway, if you really want to decrement the stack pointer by 32 bits after
+; a push, consider pushing a register.
 macro push.dimm source
   db 0x68
   dd source
@@ -1444,7 +1526,7 @@ _start:
   ;;; your home. See below for a little more thought about why here in
   ;;; particular.
   ;;;
-  lea.qreg.qreg.disp32 rbp, control_stack_size, rdi
+  lea.qreg.qreg.disp32 rbp, rdi, control_stack_size
 
   ;;;
   ;;;   Now we save some stuff onto the heap. These are the locations that
@@ -1460,7 +1542,9 @@ _start:
   mov.qreg.disp32.qreg rdi, control_stack_size + 0x00, rdi    ; HEAP
   mov.qreg.disp32.qreg rdi, control_stack_size + 0x08, rsp    ; S0
   mov.qreg.disp32.qreg rdi, control_stack_size + 0x10, rbp    ; R0
-  ; TODO also consider HERE, LATEST, and STATE
+  lea.qreg.qreg.disp32 rax, rdi, control_stack_size + 0x20
+  mov.qreg.disp32.qreg rdi, control_stack_size + 0x18, rax    ; HERE
+  ; TODO also consider LATEST and STATE
   ; strictly speaking, R0 could be a constant... but it isn't known until
   ; runtime, so we might as well make it a variable
   ;;;
@@ -1475,6 +1559,11 @@ _start:
   ;;;     The control stack also grows downwards, so this is its pysical top
   ;;;   as well. We allocate this dedicated space within the heap right here,
   ;;;   in this routine, through our choice of where to put things.
+  ;;; * HERE is the physical start of the unallocated space in the heap
+  ;;;     We allocate heap space from bottom to top, by incrementing this
+  ;;;   value. So, it would also be accurate to say that it points immediately
+  ;;;   after the physical top of the allocated space. At any rate, the
+  ;;;   address it points to is the first one that hasn't been used yet.
   ;;;
   ;;;   S0 and R0 are mostly used when we want to initialize or reinitialize
   ;;; their respective stacks - that is, discard all their contents at once.
@@ -1484,6 +1573,11 @@ _start:
   ;;; convince yourself that it only ever writes things just below the rbp
   ;;; address it receives, never right on top of it.
   ;;;
+  ;;;   Notice that HERE initially points immediately after its own storage.
+  ;;; This is just a convenience: HERE is the last of these variables, so the
+  ;;; start of free space is computed in a single place and is easy to keep
+  ;;; up-to-date with code changes.
+  ;;;
   ;;;   A little more detail about why we offset everything by
   ;;; control_stack_size: We're carving out some space at the bottom of the
   ;;; heap - which grows low-to-high - to be the control stack - which grows
@@ -1867,6 +1961,23 @@ defword FETCH, 0
   push.qreg rax
   NEXT
 
+; Adds a value to the 64-bit cell at an address, in place, consuming both.
+; Stack in: address on top, value second. I might have done it the other
+; way, but this is what Jonesforth does and it seems reasonable enough.
+defword ADDSTORE, 0
+  dq $ + 0x8                     ; codeword: the code right after this qword
+  pop.qreg rbx                   ; address (top of stack)
+  pop.qreg rax                   ; value to add (second on stack)
+  add.indirect.qreg.qreg rbx, rax ; [rbx] += rax
+  NEXT
+
+defword SUBSTORE, 0
+  dq $ + 0x8                     ; codeword: the code right after this qword
+  pop.qreg rbx                   ; address (top of stack)
+  pop.qreg rax                   ; value to subtract (second on stack)
+  sub.indirect.qreg.qreg rbx, rax ; intended: [rbx] -= rax, in place
+  NEXT
+
 defword STORE8, 0
   dq $ + 0x8                     ; codeword
   pop.qreg rbx
@@ -2000,10 +2111,17 @@ defword SYS_WRITE, 0
 defword QUINE, 0
   dq DOCOL                       ; codeword
 
-  ; This stack-allocates a buffer, then finishes by pushing its length and
-  ; address on the value stack. Thus we don't need to care about how it
+  ; We still have HEAP on the stack. Use it to find HERE...
+  dq DUP, LIT, control_stack_size + 0x18, ADD      ; ( heap -- heap &HERE )
+  ; ... fetch the old HERE as our block, then bump HERE by a constant ...
+  dq DUP, FETCH, SWAP, LIT, 0x78, SWAP, ADDSTORE   ; ( &HERE -- block )
+  ; ... and now we have allocated a block of memory, with its address on the
+  ; stack. We also still have HEAP at the bottom of the stack, for future use.
+
+  ; This takes a buffer's address on the stack, populates it, then finishes by
+  ; pushing its length and address. Thus we don't need to care about how it
   ; internally uses registers.
-  dq OLD_CODE
+  dq DUP, OLD_CODE, SWAP                           ; ( block -- length block )
 
   ; write() from stack-allocated buffer
   dq SYS_WRITE
@@ -2014,6 +2132,9 @@ defword QUINE, 0
   dq SYS_WRITE
 
   dq EXIT
+defword HLT, 0
+  dq $ + 0x8                     ; codeword: the code right after this qword
+  hlt                            ; no NEXT follows; presumably a debugging trap
 
 defword WRITE_SELF_RAW_H, 0
   dq $ + 0x8                     ; codeword
@@ -2038,6 +2159,12 @@ defword WRITE_SELF_RAW_H, 0
 ;;; which refers to how many bytes have actually been populated, not to the
 ;;; size of the buffer.
 ;;;
+;;; Stack in:
+;;; * Top: The address of a block of memory to use.
+;;;
+;;; Stack out:
+;;; * Top: The length of the data that was written to the provided memory.
+;;;
 ;;; Registers within:
 ;;;
 ;;; * rdx holds the total used file size so far. During hand-off between
@@ -2047,11 +2174,9 @@ defword WRITE_SELF_RAW_H, 0
 ;;; * rcx points to the bottom of the buffer.
 ;;;
 defword OLD_CODE, 0
-  dq $ + 0x8                     ; codeword
-
-  mov.qreg.qreg rcx, rdi
-  add.qreg.dimm rcx, control_stack_size + 0x18
+  dq $ + 0x8                               ; codeword
 
+  pop.qreg rcx                             ; our parameter - a block of memory
   mov.dreg.dimm rdx, 0                     ; store running file size here
 
   ;;;
@@ -2130,13 +2255,10 @@ defword OLD_CODE, 0
   mov.qreg.disp8.qreg rcx, 0x68, rdx       ; size in memory
 
   ;;;
-  ;;; The buffer is ready; push its length and address on the value stack, so
-  ;;; our caller can handle write()ing it out.
+  ;;; The buffer is ready; push its length on the value stack, so our caller
+  ;;; can handle write()ing it out.
   ;;;
-
-  push.dimm 0
   push.dimm 0x78
-  push.qreg rcx
 
   NEXT