1 files changed, 228 insertions, 9 deletions
diff --git a/quine.asm b/quine.asm
index 7bc78b9..8b21819 100644
--- a/quine.asm
+++ b/quine.asm
@@ -417,17 +417,16 @@ end macro
 ; Take a 64-bit source register, treat it as an address and look up the 64-bit
 ; value it points to, store that into a 64-bit target register.
 ;
-; For rbp, the only modes available also have displacement; we use an 8-bit
-; one and set it to zero. The other registers could be encoded without the
-; displacement, but for simplicity's sake we do the same thing for all of
-; them.
+; For a source of rbp, the only modes available also have displacement; we
+; disallow that. Use mov.qreg.disp8.qreg for that case, and set a displacement
+; of 0.
 ;
 ; In understanding this, pay close attention to the Op/En column in the opcode
 ; table. The "RM" variant means the ModRM byte's R/M field (the third one)
 ; is the source, while its reg field (the middle one) is the target. This is
 ; what we want, because the R/M field is the one that gets indirection applied
 ; to it. Opcode 0x8B with an REX.W prefix is the all-64-bit RM variant.
-; [Intel] volume 2B, chapter 3, section 3-4.3, "MOV".
+; [Intel] volume 2B, chapter 4, section 4-3, "MOV".
 ;
 ; For the indirection modes, don't be confused by the many similar tables.
 ; 64-bit mode is encoded the same as 32-bit mode except for adding a REX.W
@@ -438,6 +437,23 @@ end macro
 ;
 ; We disallow rsp as a source because that's the mode that would want an SIB.
 macro mov.qreg.indirect.qreg target, source
+  match =rbp, source
+    assert 0
+  end match
+  qwordreg sreg, source
+  qwordreg treg, target
+  rex.w
+  db 0x8B
+  modrm 0, treg, sreg
+  match =rsp, source
+    ; R/M = rsp is the SIB case
+    sib 0, 4, sreg
+      ; no scaling, no indexing, source as base
+  end match
+end macro
+
+
+macro mov.qreg.disp8.qreg target, offset, source
   qwordreg sreg, source
   qwordreg treg, target
   rex.w
@@ -448,7 +464,7 @@ macro mov.qreg.indirect.qreg target, source
     sib 0, 4, sreg
       ; no scaling, no indexing, source as base
   end match
-  db 0
+  db offset
 end macro
 
 
@@ -465,7 +481,7 @@ end macro
 ; is the source, while its R/M field (the third one) is the target. This is
 ; what we want, because the R/M field is the one that gets indirection applied
 ; to it. Opcode 0x89 with an REX.W prefix is the all-64-bit MR variant.
-; [Intel] volume 2B, chapter 3, section 3-4.3, "MOV".
+; [Intel] volume 2B, chapter 4, section 4-3, "MOV".
 ;
 ; For the indirection modes, don't be confused by the many similar tables.
 ; 64-bit mode is encoded the same as 32-bit mode except for adding a REX.W
@@ -816,6 +832,24 @@ macro sub.qreg.dimm target, source
   dd source
 end macro
 
+; Add with carry.
+macro adc.qreg.bimm target, source
+  qwordreg treg, target
+  rex.w
+  db 0x83
+  modrm 3, 2, treg
+  db source
+end macro
+
+; Subtract with borrow.
+macro sbb.qreg.bimm target, source
+  qwordreg treg, target
+  rex.w
+  db 0x83
+  modrm 3, 3, treg
+  db source
+end macro
+
 ; This multiplies rax, as 64-bits, with another 64-bit register, in place.
 ;
 ; The 4 in the reg field is part of the opcode.
@@ -1004,7 +1038,7 @@ end macro
 ;
 ; We need to treat a target of rsp specially because it's the SIB case per
 ; table 2-2.
-macro mov.qreg.disp8.qreg target, offset, source
+macro mov.disp8.qreg.qreg target, offset, source
   qwordreg sreg, source
   qwordreg treg, target
   rex.w
@@ -1151,6 +1185,13 @@ macro lea.qreg.disp8.indexed.qreg target, offset, source, index, scale
   end match
 end macro
 
+macro mov.breg.breg target, source
+  bytereg treg, target
+  bytereg sreg, source
+  db 0x88
+  modrm 3, sreg, treg
+end macro
+
 macro mov.breg.bimm target, source
   bytereg treg, target
   db 0xC6
@@ -1176,6 +1217,10 @@ macro lodsq
   db 0xAD
 end macro
 
+macro cmps8
+  db 0xA6
+end macro
+
 ;   [Intel] describes two different styles of mnemonic for the repeated string
 ; operations. See, their parameters are always rsi and rdi, or the smaller
 ; versions of those same specific registers. Intel thinks we might want to
@@ -1311,6 +1356,13 @@ macro jmp.abs.indirect.qreg location
   modrm 0, 4, lreg
 end macro
 
+; The location is relative to the start of the instruction immediately
+; following the jmp.
+macro jmp.rel.bimm location
+  db 0xEB
+  db location
+end macro
+
 ; There in no 64-bit immediate "near" jump, so we use 32-bit. It's relatve,
 ; so that's honestly plenty.
 ;
@@ -1665,7 +1717,7 @@ macro pushcontrol source
 end macro
 
 macro popcontrol target
-  mov.qreg.indirect.qreg target, rbp
+  mov.qreg.disp8.qreg target, 0, rbp
   lea.qreg.disp8.qreg rbp, 8, rbp
 end macro
 
@@ -3167,6 +3219,160 @@ defword stringlen, 0
   push.qreg rdi
   next
 
+;   We make this work using exactly two jump instructions, which is likely the
+; minimum possible. To avoid relying on labels, we hand-compute the byte
+; offsets, so every instruction within their ranges is annotated with its
+; length in bytes. Good fun.
+;
+; Stack in:
+;   left string address
+;   right string address
+; Stack out:
+;   comparison value (0 for equal, 1 for left > right, -1 for left < right)
+defword stringcmp, 0
+  dq $ + 8
+  ;   Save the rsi register.
+  ;
+  ;   Happily, we don't need a lot of registers for this code, so we can
+  ; dedicate rdx to this and not have to deal with stack juggling..
+  mov.qreg.qreg rdx, rsi
+
+  ;   Get the parameters off the stack.
+  ;
+  ;   For reasons that will be explained below, the left and right strings
+  ; from our caller's perspective are swapped from the perspective of the
+  ; comparisons we'll be doing. The rdi register points to the right-hand
+  ; operand of the "cmps" instruction, while rsi points to the left-hand
+  ; operand; [Intel] volume 2A, chapter 3, section 4-3.3, "CMPS"; note that
+  ; the PDF mis-numbers the sections in the second half of chapter 3.
+  pop.qreg rsi
+  pop.qreg rdi
+
+  ;   For the comparison-unequal loop-exit code, we'll need rbx set to zero.
+  ; The easy way to do that is by xor'ing it with itself, but doing it at the
+  ; end of the loop would overwrite the comparison flags, which we need. So we
+  ; do it in advance, and carefully don't touch it anywhere else.
+  ;
+  ;   See the list of which instructions affect the flags in [Intel] volume 1,
+  ; appendix A, section A-1, table A-2.
+  xor.qreg.qreg rbx, rbx
+
+  ;   We also clear rax; we'll be using al later, and it will be convenient to
+  ; know the upper bits are always zero.
+  xor.qreg.qreg rax, rax
+
+  ; Now we've initialized everything we need; everything after this point is
+  ; part of the loop.
+
+  ;   At the start of each iteration, save a copy of the byte we're at now. We
+  ; need to do this before cmps because the pointers will increment; we'll
+  ; eventually use it to test whether we've reached the end delimiter.
+  ;
+  ;   We do a 64-bit load into rcx, which is otherwise unused, then copy the
+  ; low byte from cl to al. This avoids having to think about addressing modes
+  ; that combine 64-bit addresses with smaller data sizes; those are very
+  ; subtle.
+  mov.qreg.indirect.qreg rcx, rsi   ; 3 bytes
+  mov.breg.breg al, cl              ; 2 bytes
+
+  ;   Now do the cmps, which is the heart of the loop.
+  ;
+  ;   Note that, in addition to relying on this comparison to test content
+  ; bytes against each other, in the event that one string is a prefix of the
+  ; other, this will also test content bytes against null delimiters. The
+  ; longer string will compare as "greater", because zero is less than all
+  ; other possible byte values. This is what we want.
+  ;
+  ;   Since strings of different lengths are necessarily unequal, letting this
+  ; test do the work of detecting that also means we don't have to deal with
+  ; scenarios where we're past the end of one string but not the other.
+  cmps8                             ; 1 byte
+
+  ;   The flags are now set based on the simulated subtraction of the next
+  ; bytes from the two strings. If they were unequal, the loop will end. If
+  ; they were equal, we have another test to do before we're finished with
+  ; this iteration. So we put the loop-end code next, and conditionally jump
+  ; forward past it.
+  ;
+  ;   Recall that relative offsets are from the start of the instruction after
+  ; the jmp.
+  jmp.cc.rel.bimm equal, 15         ; 2 bytes
+
+  ;   If we got here, the strings are unequal, so we need to turn the flags
+  ; into a comparison value. We cleared rbx earlier; now we set its low bit to
+  ; the "above" flag, then use sbb to subtract the "carry" flag (which it
+  ; thinks of as the "borrow" flag).
+  ;
+  ;   Recall that "carry" is based on whether the comparison would cause a
+  ; change to the next bit "outside" the bits being compared; [Intel]
+  ; volume 1, chapter 3, section 3-4.3.1. Since the comparison is a
+  ; subtraction, "carry" is true when the right-hand byte is strictly greater
+  ; than the left-hand byte.
+  ;
+  ;   Also recall that "zero", also called "equal", is based on whether the
+  ; comparison's result produces output that's all zero bits. Since the
+  ; comparison is a subtraction, "zero" is true when the two bytes are
+  ; identical.
+  ;
+  ;   The "above" condition is not a flag, but a composite of flags. It's true
+  ; when both "carry" and "zero" are false; [Intel] volume 1, chapter 7,
+  ; section 7-3.1.1, table 7-2. For us, this is equivalent to saying the
+  ; left-hand byte is strictly greater than the right-hand.
+  ;
+  ;   So, after the "set" instruction, rbx is 1 if left > right, and 0 if
+  ; left = right or left < right. The "sbb" instruction with an immediate
+  ; value of zero subtracts the value of the carry flag from rbx; the flag is
+  ; true in the case where left < right, and false otherwise. So, after the
+  ; "sbb", rbx is 1 if left > right, 0 if left = right, and -1 if
+  ; left < right.
+  ;
+  ;   This sounds like the opposite of what we want, but recall that we
+  ; exchanged the operands, so these are the values our caller is expecting.
+  ; While it might be tempting to look for a way to not need that
+  ; transposition, and indeed there exists an approach using "not above", it's
+  ; fiddly in a way that's even harder to explain.
+  ;
+  ;   Note that it's important that the "jmp" and "set" instructions don't
+  ; change the flags (see table A-2 again), so the "cmps" is the most recent
+  ; thing that did. The "sbb" instruction does change them, but we don't need
+  ; them again after that point so it doesn't matter.
+  ;
+  ;   Finally, we need to restore rsi before we return to Forth.
+  ;
+  ;   The two-instruction set-sbb sequence is one of those classic assembly
+  ; programming tricks that often goes unexplained, because most of its appeal
+  ; is its brevity and a proper explanation is quite lengthy. Please enjoy
+  ; this gift of knowledge, and thanks to our friends who showed it to us.
+  set.breg.cc bl, above             ; 3 bytes
+  sbb.qreg.bimm rbx, 0              ; 4 bytes
+  push.qreg rbx                     ; 1 byte
+  mov.qreg.qreg rsi, rdx            ; 3 bytes
+  next                              ; 4 bytes
+
+  ;   If we got here, the current bytes are equal to each other. We still need
+  ; to test if they're null terminators; if so, we exit, and if not, we loop.
+  ;
+  ;   At the beginning of this iteration, we saved a copy of the byte being
+  ; inspected in rax. The "test" instruction simulates a xor and sets the
+  ; flags accordingly. We check the "not equal" condition, also known as "not
+  ; zero", and jump backwards to the start of the loop when that's the case.
+  ;
+  ;   Recall that relative offsets are from the start of the instruction after
+  ; the jmp.
+  test.qreg.qreg rax, rax           ; 3 bytes
+  jmp.cc.rel.bimm not_equal, -28    ; 2 bytes
+
+  ;   If we got here, we got all the way to the end of both strings and found
+  ; a null byte, so we return 0 to indicate they're equal.
+  ;
+  ;   We can use push.dimm even though our stack holds 64-bit values, because
+  ; it gets sign-extended.
+  ;
+  ;   We need to restore rsi before we return to Forth.
+  push.dimm 0
+  mov.qreg.qreg rsi, rdx
+  next
+
 
 ;;;;;;;;;;;;;;;;;
 ;;; Branching ;;;
@@ -4048,6 +4254,10 @@ defword pop_reg64, 0
 ;;; as different instructions is up to you. At any rate the machine code
 ;;; representation of the repeatable variants is the same as for the regular
 ;;; variants with an extra prefix, so we define them together.
+;;;
+;;;   This is a proper superset of the flatassembler implementations of string
+;;; instructions. The wisdom of that is questionable, but at least it's noted
+;;; here...
 
 ; Stack:
 ;   output point
@@ -4399,6 +4609,15 @@ defword jmp_abs_indirect_reg64, 0
 ; Stack:
 ;   output point
 ;   address offset value
+defword jmp_rel_imm8, 0
+  dq docol
+  dq swap, lit, 0xEB, pack8
+  dq swap, pack8
+  dq exit
+
+; Stack:
+;   output point
+;   address offset value
 defword jmp_rel_imm32, 0
   dq docol
   dq swap, lit, 0xE9, pack8