diff options
| author | Irene Knapp <ireneista@irenes.space> | 2025-11-10 17:39:47 -0800 |
|---|---|---|
| committer | Irene Knapp <ireneista@irenes.space> | 2025-11-10 21:25:56 -0800 |
| commit | e85b9d90978705054991cfb67c975d9856813a21 (patch) | |
| tree | cd78a64bb303a16186f5b73dd64f5828f0261da1 | |
| parent | 9368baad9d59f4d00aa11fa1f277be27cf9b412b (diff) | |
implement stringcmp, with extensive commenting
and a lot of other stuff necessary for that, as follows implement the "adc" and "sbb" instructions in flatassembler (there's no Forth implementation of them yet) implement "cmps8" in flatassembler; Forth already has a much more thorough suite of string instructions add Forth jmp_rel_imm8 and flatassembler jmp.rel.bimm unconditional jumps. they're nver used, it just seems like they should exist. split the disp8 case from the indirect case for the flatassembler implementation of register-to-register mov, to match how the Forth implementation works fix the name of mov.disp8.qreg.qreg, which had been incorrectly named mov.qreg.disp8.qreg. it's never called anyway. fix a citation for the MOV instruction; Intel's section numbers are confusing. Force-Push: yeah Change-Id: I53761a3487041c142ade1a82a8ab9a3f1129b853
| -rw-r--r-- | quine.asm | 237 |
1 files changed, 228 insertions, 9 deletions
diff --git a/quine.asm b/quine.asm index 7bc78b9..8b21819 100644 --- a/quine.asm +++ b/quine.asm @@ -417,17 +417,16 @@ end macro ; Take a 64-bit source register, treat it as an address and look up the 64-bit ; value it points to, store that into a 64-bit target register. ; -; For rbp, the only modes available also have displacement; we use an 8-bit -; one and set it to zero. The other registers could be encoded without the -; displacement, but for simplicity's sake we do the same thing for all of -; them. +; For a source of rbp, the only modes available also have displacement; we +; disallow that. Use mov.qreg.disp8.qreg for that case, and set a displacement +; of 0. ; ; In understanding this, pay close attention to the Op/En column in the opcode ; table. The "RM" variant means the ModRM byte's R/M field (the third one) ; is the source, while its reg field (the middle one) is the target. This is ; what we want, because the R/M field is the one that gets indirection applied ; to it. Opcode 0x8B with an REX.W prefix is the all-64-bit RM variant. -; [Intel] volume 2B, chapter 3, section 3-4.3, "MOV". +; [Intel] volume 2B, chapter 4, section 4-3, "MOV". ; ; For the indirection modes, don't be confused by the many similar tables. ; 64-bit mode is encoded the same as 32-bit mode except for adding a REX.W @@ -438,6 +437,23 @@ end macro ; ; We disallow rsp as a source because that's the mode that would want an SIB. macro mov.qreg.indirect.qreg target, source + match =rbp, source + assert 0 + end match + qwordreg sreg, source + qwordreg treg, target + rex.w + db 0x8B + modrm 0, treg, sreg + match =rsp, source + ; R/M = rsp is the SIB case + sib 0, 4, sreg + ; no scaling, no indexing, source as base + end match +end macro + + +macro mov.qreg.disp8.qreg target, offset, source qwordreg sreg, source qwordreg treg, target rex.w @@ -448,7 +464,7 @@ macro mov.qreg.indirect.qreg target, source sib 0, 4, sreg ; no scaling, no indexing, source as base end match - db 0 + db offset end macro @@ -465,7 +481,7 @@ end macro ; is the source, while its R/M field (the third one) is the target. This is ; what we want, because the R/M field is the one that gets indirection applied ; to it. Opcode 0x89 with an REX.W prefix is the all-64-bit MR variant. -; [Intel] volume 2B, chapter 3, section 3-4.3, "MOV". +; [Intel] volume 2B, chapter 4, section 4-3, "MOV". ; ; For the indirection modes, don't be confused by the many similar tables. ; 64-bit mode is encoded the same as 32-bit mode except for adding a REX.W @@ -816,6 +832,24 @@ macro sub.qreg.dimm target, source dd source end macro +; Add with carry. +macro adc.qreg.bimm target, source + qwordreg treg, target + rex.w + db 0x83 + modrm 3, 2, treg + db source +end macro + +; Subtract with borrow. +macro sbb.qreg.bimm target, source + qwordreg treg, target + rex.w + db 0x83 + modrm 3, 3, treg + db source +end macro + ; This multiplies rax, as 64-bits, with another 64-bit register, in place. ; ; The 4 in the reg field is part of the opcode. @@ -1004,7 +1038,7 @@ end macro ; ; We need to treat a target of rsp specially because it's the SIB case per ; table 2-2. -macro mov.qreg.disp8.qreg target, offset, source +macro mov.disp8.qreg.qreg target, offset, source qwordreg sreg, source qwordreg treg, target rex.w @@ -1151,6 +1185,13 @@ macro lea.qreg.disp8.indexed.qreg target, offset, source, index, scale end match end macro +macro mov.breg.breg target, source + bytereg treg, target + bytereg sreg, source + db 0x88 + modrm 3, sreg, treg +end macro + macro mov.breg.bimm target, source bytereg treg, target db 0xC6 @@ -1176,6 +1217,10 @@ macro lodsq db 0xAD end macro +macro cmps8 + db 0xA6 +end macro + ; [Intel] describes two different styles of mnemonic for the repeated string ; operations. See, their parameters are always rsi and rdi, or the smaller ; versions of those same specific registers. Intel thinks we might want to @@ -1311,6 +1356,13 @@ macro jmp.abs.indirect.qreg location modrm 0, 4, lreg end macro +; The location is relative to the start of the instruction immediately +; following the jmp. +macro jmp.rel.bimm location + db 0xEB + db location +end macro + ; There in no 64-bit immediate "near" jump, so we use 32-bit. It's relatve, ; so that's honestly plenty. ; @@ -1665,7 +1717,7 @@ macro pushcontrol source end macro macro popcontrol target - mov.qreg.indirect.qreg target, rbp + mov.qreg.disp8.qreg target, 0, rbp lea.qreg.disp8.qreg rbp, 8, rbp end macro @@ -3167,6 +3219,160 @@ defword stringlen, 0 push.qreg rdi next +; We make this work using exactly two jump instructions, which is likely the +; minimum possible. To avoid relying on labels, we hand-compute the byte +; offsets, so every instruction within their ranges is annotated with its +; length in bytes. Good fun. +; +; Stack in: +; left string address +; right string address +; Stack out: +; comparison value (0 for equal, 1 for left > right, -1 for left < right) +defword stringcmp, 0 + dq $ + 8 + ; Save the rsi register. + ; + ; Happily, we don't need a lot of registers for this code, so we can + ; dedicate rdx to this and not have to deal with stack juggling.. + mov.qreg.qreg rdx, rsi + + ; Get the parameters off the stack. + ; + ; For reasons that will be explained below, the left and right strings + ; from our caller's perspective are swapped from the perspective of the + ; comparisons we'll be doing. The rdi register points to the right-hand + ; operand of the "cmps" instruction, while rsi points to the left-hand + ; operand; [Intel] volume 2A, chapter 3, section 4-3.3, "CMPS"; note that + ; the PDF mis-numbers the sections in the second half of chapter 3. + pop.qreg rsi + pop.qreg rdi + + ; For the comparison-unequal loop-exit code, we'll need rbx set to zero. + ; The easy way to do that is by xor'ing it with itself, but doing it at the + ; end of the loop would overwrite the comparison flags, which we need. So we + ; do it in advance, and carefully don't touch it anywhere else. + ; + ; See the list of which instructions affect the flags in [Intel] volume 1, + ; appendix A, section A-1, table A-2. + xor.qreg.qreg rbx, rbx + + ; We also clear rax; we'll be using al later, and it will be convenient to + ; know the upper bits are always zero. + xor.qreg.qreg rax, rax + + ; Now we've initialized everything we need; everything after this point is + ; part of the loop. + + ; At the start of each iteration, save a copy of the byte we're at now. We + ; need to do this before cmps because the pointers will increment; we'll + ; eventually use it to test whether we've reached the end delimiter. + ; + ; We do a 64-bit load into rcx, which is otherwise unused, then copy the + ; low byte from cl to al. This avoids having to think about addressing modes + ; that combine 64-bit addresses with smaller data sizes; those are very + ; subtle. + mov.qreg.indirect.qreg rcx, rsi ; 3 bytes + mov.breg.breg al, cl ; 2 bytes + + ; Now do the cmps, which is the heart of the loop. + ; + ; Note that, in addition to relying on this comparison to test content + ; bytes against each other, in the event that one string is a prefix of the + ; other, this will also test content bytes against null delimiters. The + ; longer string will compare as "greater", because zero is less than all + ; other possible byte values. This is what we want. + ; + ; Since strings of different lengths are necessarily unequal, letting this + ; test do the work of detecting that also means we don't have to deal with + ; scenarios where we're past the end of one string but not the other. + cmps8 ; 1 byte + + ; The flags are now set based on the simulated subtraction of the next + ; bytes from the two strings. If they were unequal, the loop will end. If + ; they were equal, we have another test to do before we're finished with + ; this iteration. So we put the loop-end code next, and conditionally jump + ; forward past it. + ; + ; Recall that relative offsets are from the start of the instruction after + ; the jmp. + jmp.cc.rel.bimm equal, 15 ; 2 bytes + + ; If we got here, the strings are unequal, so we need to turn the flags + ; into a comparison value. We cleared rbx earlier; now we set its low bit to + ; the "above" flag, then use sbb to subtract the "carry" flag (which it + ; thinks of as the "borrow" flag). + ; + ; Recall that "carry" is based on whether the comparison would cause a + ; change to the next bit "outside" the bits being compared; [Intel] + ; volume 1, chapter 3, section 3-4.3.1. Since the comparison is a + ; subtraction, "carry" is true when the right-hand byte is strictly greater + ; than the left-hand byte. + ; + ; Also recall that "zero", also called "equal", is based on whether the + ; comparison's result produces output that's all zero bits. Since the + ; comparison is a subtraction, "zero" is true when the two bytes are + ; identical. + ; + ; The "above" condition is not a flag, but a composite of flags. It's true + ; when both "carry" and "zero" are false; [Intel] volume 1, chapter 7, + ; section 7-3.1.1, table 7-2. For us, this is equivalent to saying the + ; left-hand byte is strictly greater than the right-hand. + ; + ; So, after the "set" instruction, rbx is 1 if left > right, and 0 if + ; left = right or left < right. The "sbb" instruction with an immediate + ; value of zero subtracts the value of the carry flag from rbx; the flag is + ; true in the case where left < right, and false otherwise. So, after the + ; "sbb", rbx is 1 if left > right, 0 if left = right, and -1 if + ; left < right. + ; + ; This sounds like the opposite of what we want, but recall that we + ; exchanged the operands, so these are the values our caller is expecting. + ; While it might be tempting to look for a way to not need that + ; transposition, and indeed there exists an approach using "not above", it's + ; fiddly in a way that's even harder to explain. + ; + ; Note that it's important that the "jmp" and "set" instructions don't + ; change the flags (see table A-2 again), so the "cmps" is the most recent + ; thing that did. The "sbb" instruction does change them, but we don't need + ; them again after that point so it doesn't matter. + ; + ; Finally, we need to restore rsi before we return to Forth. + ; + ; The two-instruction set-sbb sequence is one of those classic assembly + ; programming tricks that often goes unexplained, because most of its appeal + ; is its brevity and a proper explanation is quite lengthy. Please enjoy + ; this gift of knowledge, and thanks to our friends who showed it to us. + set.breg.cc bl, above ; 3 bytes + sbb.qreg.bimm rbx, 0 ; 4 bytes + push.qreg rbx ; 1 byte + mov.qreg.qreg rsi, rdx ; 3 bytes + next ; 4 bytes + + ; If we got here, the current bytes are equal to each other. We still need + ; to test if they're null terminators; if so, we exit, and if not, we loop. + ; + ; At the beginning of this iteration, we saved a copy of the byte being + ; inspected in rax. The "test" instruction simulates a xor and sets the + ; flags accordingly. We check the "not equal" condition, also known as "not + ; zero", and jump backwards to the start of the loop when that's the case. + ; + ; Recall that relative offsets are from the start of the instruction after + ; the jmp. + test.qreg.qreg rax, rax ; 3 bytes + jmp.cc.rel.bimm not_equal, -28 ; 2 bytes + + ; If we got here, we got all the way to the end of both strings and found + ; a null byte, so we return 0 to indicate they're equal. + ; + ; We can use push.dimm even though our stack holds 64-bit values, because + ; it gets sign-extended. + ; + ; We need to restore rsi before we return to Forth. + push.dimm 0 + mov.qreg.qreg rsi, rdx + next + ;;;;;;;;;;;;;;;;; ;;; Branching ;;; @@ -4048,6 +4254,10 @@ defword pop_reg64, 0 ;;; as different instructions is up to you. At any rate the machine code ;;; representation of the repeatable variants is the same as for the regular ;;; variants with an extra prefix, so we define them together. +;;; +;;; This is a proper superset of the flatassembler implementations of string +;;; instructions. The wisdom of that is questionable, but at least it's noted +;;; here... ; Stack: ; output point @@ -4399,6 +4609,15 @@ defword jmp_abs_indirect_reg64, 0 ; Stack: ; output point ; address offset value +defword jmp_rel_imm8, 0 + dq docol + dq swap, lit, 0xEB, pack8 + dq swap, pack8 + dq exit + +; Stack: +; output point +; address offset value defword jmp_rel_imm32, 0 dq docol dq swap, lit, 0xE9, pack8 |