From a2dbddffcfc70e2ab84173b7eb7e24ac09f5dfe2 Mon Sep 17 00:00:00 2001 From: Irene Knapp Date: Mon, 27 Oct 2025 01:45:20 -0700 Subject: ROLL, UNROLL, and a whole bunch of addressing modes also a couple more gdb tips Force-Push: yes Change-Id: I31038334449ea45238c811b4e97e2d87833d8ea6 --- quine.asm | 450 +++++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 373 insertions(+), 77 deletions(-) (limited to 'quine.asm') diff --git a/quine.asm b/quine.asm index 22ebb1e..6ab7da4 100644 --- a/quine.asm +++ b/quine.asm @@ -46,13 +46,14 @@ ;;; ;;; (gdb) disassemble/r 0x0x80007c0,+32 ;;; -;;; If you get the value of rsp out of info registers, and for example it's an -;;; address along the lines of 0x7fffffff8650, you can do +;;; If you want to see the value stack, you can do ;;; -;;; (gdb) x/16xg 0x7fffffff8650 +;;; (gdb) x/16xg $rsp ;;; -;;; The same will work with rbp for the control stack, and don't forget that -;;; the "instruction pointer" is rsi. +;;; The same will work with $rbp for the control stack, and don't forget that +;;; the "instruction pointer" is rsi. To see all the registers, do +;;; +;;; (gdb) info registers ;;;;;;;;;;;;;;;;;;;;;;;;; @@ -151,6 +152,21 @@ macro opcodereg opcode, reg db opcode or reg end macro +macro scalefield sfield, scale + if 1 = scale + sfield = 0 + else if 2 = scale + sfield = 1 + else if 4 = scale + sfield = 2 + else if 8 = scale + sfield = 3 + else + assert 0 + end if +end macro + + ;;; On registers ;;; ------------ ;;; @@ -326,9 +342,9 @@ end macro ; Take a 64-bit source register, treat it as an address and look up the 64-bit ; value it points to, store that into a 64-bit target register. ; -; For rsp and rbp, the only modes available also have displacement; we use an -; 8-bit one and set it to zero. 
The other registers could be encoded without -; the displacement, but for simplicity's sake we do the same thing for all of +; For rbp, the only modes available also have displacement; we use an 8-bit +; one and set it to zero. The other registers could be encoded without the +; displacement, but for simplicity's sake we do the same thing for all of ; them. ; ; In understanding this, pay close attention to the Op/En column in the opcode @@ -347,22 +363,27 @@ end macro ; -; We disallow rsp as a source because that's the mode that would want an SIB. +; When the source is rsp, R/M can't name it directly, so we emit an SIB byte. macro mov.qreg.indirect.qreg target, source + qwordreg sreg, source + qwordreg treg, target + rex.w + db 0x8B + modrm 1, treg, sreg match =rsp, source - assert 0 - else - qwordreg sreg, source - qwordreg treg, target - rex.w - db 0x8B - modrm 1, treg, sreg - db 0 + ; R/M = rsp is the SIB case + sib 0, 4, sreg + ; no scaling, no indexing, source as base end match + db 0 end macro ; Take a 64-bit source register, store its value into the address pointed to -; by a 64-bit target register. The only modes available also have -; displacement; we use an 8-bit one and set it to zero. +; by a 64-bit target register. +; +; For rbp, the only modes available also have displacement; we use an 8-bit +; one and set it to zero. The other registers could be encoded without the +; displacement, but for simplicity's sake we do the same thing for all of +; them. ; ; In understanding this, pay close attention to the Op/En column in the opcode ; table. The "MR" variant means the ModRM byte's reg field (the middle one) @@ -379,17 +400,21 @@ end macro ; chapter 2, section 2-1.5, table 2-2. ; -; We disallow rsp as a target because that's the mode that would want an SIB. +; When the target is rsp, R/M can't name it directly, so we emit an SIB byte. +; When you look at other addressing modes, be aware that the special treatment +; is for whichever register is specified in the R/M field. Sometimes that's +; the source, and sometimes it's the target, depending on the opcode.
macro mov.indirect.qreg.qreg target, source + qwordreg sreg, source + qwordreg treg, target + rex.w + db 0x89 + modrm 1, sreg, treg match =rsp, target - assert 0 - else - qwordreg sreg, source - qwordreg treg, target - rex.w - db 0x89 - modrm 1, sreg, treg - db 0 + ; R/M = rsp is the SIB case + sib 0, 4, treg + ; no scaling, no indexing, target as base end match + db 0 end macro @@ -410,10 +435,10 @@ macro mov.indirect.qreg.breg target, source end macro macro mov.breg.indirect.qreg target, source - match =rsp, target + match =rsp, source assert 0 ; The SIB case. - else match =rbp, target + else match =rbp, source assert 0 ; An unrelated addressing mode. else @@ -443,10 +468,10 @@ end macro ; We use the operand-size prefix to specify 16-bit. No REX.W. Table 3-4. macro mov.wreg.indirect.qreg target, source - match =rsp, target + match =rsp, source assert 0 ; The SIB case. - else match =rbp, target + else match =rbp, source assert 0 ; An unrelated addressing mode. else @@ -476,10 +501,10 @@ end macro ; It defaults to 32-bit, no prefix needed, also no REX.W. Table 3-4. macro mov.dreg.indirect.qreg target, source - match =rsp, target + match =rsp, source assert 0 ; The SIB case. - else match =rbp, target + else match =rbp, source assert 0 ; An unrelated addressing mode. else @@ -491,6 +516,46 @@ macro mov.dreg.indirect.qreg target, source end macro +macro mov.qreg.indexed.qreg target, source, index, scale + match =rbp, source + assert 0 + ; This is divided into some subcases we don't wish to deal with yet. + else match =rsp, index + assert 0 + ; This is the case where it's not actually indexed after all. + else + qwordreg treg, target + qwordreg sreg, source + qwordreg ireg, index + scalefield sfield, scale + rex.w + db 0x8B + modrm 0, treg, 4 + sib sfield, ireg, sreg + end match +end macro + + +macro mov.indexed.qreg.qreg target, index, scale, source + match =rbp, target + assert 0 + ; rbp as the SIB base has special subcases we don't wish to deal with yet.
+ else match =rsp, index + assert 0 + ; This is the case where it's not actually indexed after all. + else + qwordreg treg, target + qwordreg sreg, source + qwordreg ireg, index + scalefield sfield, scale + rex.w + db 0x89 + modrm 0, sreg, 4 + sib sfield, ireg, treg + end match +end macro + + ; Take a 64-bit source register, store its value into a high 64-bit target ; register (r8-r15). ; @@ -539,6 +604,24 @@ macro mov.qreg.oreg target, source end macro +; This increments a 64-bit register by 1, in place; +macro inc.qreg target + qwordreg treg, target + rex.w + db 0xFF + modrm 3, 0, treg + ; The 0 is part of the opcode. +end macro + +; This decrements a 64-bit register by 1, in place; +macro dec.qreg target + qwordreg treg, target + rex.w + db 0xFF + modrm 3, 1, treg + ; The 1 is part of the opcode. +end macro + ; This adds a 64-bit register to another 64-bit register, in place. macro add.qreg.qreg target, source qwordreg treg, target @@ -980,11 +1063,59 @@ macro lea.qreg.qreg.disp32 target, source, offset end match end macro +macro lea.qreg.indexed.qreg target, source, index, scale + match =rbp, source + assert 0 + ; This is divided into some subcases we don't wish to deal with yet. + else match =rsp, index + assert 0 + ; This is the case where it's not actually indexed after all. + else + qwordreg treg, target + qwordreg sreg, source + qwordreg ireg, index + scalefield sfield, scale + rex.w + db 0x8D + modrm 0, treg, 4 + sib sfield, ireg, sreg + end match +end macro + +; Wow, we use ALL the instruction suffixes for this, huh. See [Intel] volume +; 2A, chapter 2, section 2-1, with particular attention to figure 2-1. +macro lea.qreg.disp8.indexed.qreg target, offset, source, index, scale + match =rbp, source + assert 0 + ; This is divided into some subcases we don't wish to deal with yet. + else match =rsp, index + assert 0 + ; This is the case where it's not actually indexed after all. 
+ else + qwordreg treg, target + qwordreg sreg, source + qwordreg ireg, index + scalefield sfield, scale + rex.w + db 0x8D + modrm 1, treg, 4 + ; 1 in the mode field says we want a disp8. + ; 4 in the R/M field says we want an SIB byte. + sib sfield, ireg, sreg + db offset + end match +end macro + ; Clear the DF flag. This makes string instructions increment RSI. macro cld db 0xFC end macro +; Set the DF flag. This makes string instructions decrement RSI. +macro std + db 0xFD +end macro + ; Load 64 bits from the address in RSI into RAX. Then, increment or decrement ; RSI by 8 bytes, depending on the value of the DF flag. macro lodsq @@ -992,6 +1123,34 @@ macro lodsq db 0xAD end macro +; [Intel] describes two different styles of mnemonic for the repeated string +; operations. See, their parameters are always rsi and rdi, or the smaller +; versions of those same specific registers. Intel thinks we might want to +; write out "rsi" explicitly, even though the only information it conveys is +; the size. The position we take is that it's better to let that be conveyed +; by the instruction name; otherwise it'd be a point of confusion for new +; readers, who might mistakenly think it's possible to pass it different +; registers. +; +; With the string instructions, the reader SHOULD be thinking, "Wait... +; where does this get its parameters from?" Writing them in a way that makes +; them appear simpler than they are would be confusing. +macro rep operation + match =movsq, operation + ; The "rep" instruction can also be thought of as a prefix to other + ; instructions, though only a few specific ones are allowed. Anyway, it + ; comes before the REX byte. + db 0xF3 + ; The rest of this is the same as the encoding of normal, non-repeated + ; movsq. + rex.w + db 0xA5 + ; There's no explicit parameters. String operations are magic. + else + assert 0 + end match +end macro + ; Push a 64-bit value from a register onto the stack (the one pointed to by ; rsp). 
Decrement rsp, then write the value at the new location. ; @@ -1780,6 +1939,136 @@ defword DROP2, 0 pop.qreg rax NEXT +; Rotates "up" (pops its parameter, n; nth item then becomes current item). +; +; We implement this the high-performance way, with rep movsq, aka the +; instruction that exists to optimize C's memcpy(). The details of setting +; that up are complex; see below. +defword ROLL, 0 + dq $ + 0x8 ; codeword + + ; Pop our parameter. The rep instruction takes rcx as its count, so we + ; reduce copying by using it to hold our count, as well. + pop.qreg rcx + + ; We have n - 1 items to slide, so decrement rcx. For the purpose of + ; counting how many repetitions will happen, it's one-based. This is because + ; the rep instruction stops if rcx is zero, and otherwise performs a single + ; movsq, decrements rcx, and checks again. + dec.qreg rcx + + ; Retrieve the nth item, for later. For this purpose we're thinking in + ; zero-based terms, so we do this after already having decremented rcx. + mov.qreg.indexed.qreg rbx, rsp, rcx, 8 + + ; The source address for movsq is rsi and the destination is rdi; we can + ; use rdi as we wish, but rsi is our Forth "instruction pointer", so we must + ; save and restore it. Doing so alters rsp, so we have to adjust the address + ; calculations by eight bytes as compared to the expressions above, but + ; happily we can use the disp8 field to do that. We'd be using disp8 anyway + ; because it's helpful. + push.qreg rsi + + ; Now we set up parameters for the memory-sliding operation. We have + ; n - 1 items to copy, moving the range rsp through rsp + (n-2)*8 onto the + ; range rsp + 8 through rsp + (n-1)*8. That's with the value of rsp as it + ; exists at this moment (it's going to change soon). + ; + ; We're sliding them upwards in memory, so we start at the high end so + ; that we're always moving into a location that doesn't have anything + ; precious. We use lea as a convenient way to do the stack math. + ; + ; When rcx is 1, we want rsp + 8.
+ lea.qreg.indexed.qreg rsi, rsp, rcx, 8 + ; When rcx is 1, we want rsp + 16. + lea.qreg.disp8.indexed.qreg rdi, 8, rsp, rcx, 8 + ; + ; Using rcx = 1 is the most convenient example to use for figuring out the + ; arithmetic. It's a linear relationship, so as long as we get the 8-byte + ; stride correct, we just need to pick a single point and verify that our + ; math is right for that point, and it'll be right for any value of rcx. + + ; Another of our Forth conventions is that the DF flag should be kept at + ; zero, which directs string instruction to increment rsi. Here, however, + ; because our source and destination ranges overlap, we need to start at the + ; high end, which means we need it to decrement. So we set DF to one, and + ; we'll clear it after. + std + rep movsq + + ; Set everything back. + cld + pop.qreg rsi + + ; There is now an extra item at the low end of the stack (the top) that + ; needs to go away, and coincidentally we have a value in rbx that needs to + ; be in that spot. Rather than doing a drop and push, we overwrite it, to + ; save a little work. + mov.indirect.qreg.qreg rsp, rbx + + ; All done, wow! What a mouthful. + NEXT + +; Rotates "down" (pops its parameter, n; current item then becomes nth item). +; +; We implement this the high-performance way, with rep movsq, aka the +; instruction that exists to optimize C's memcpy(). The details of setting +; that up are complex; see below. +defword UNROLL, 0 + dq $ + 0x8 ; codeword + + ; Pop our parameter. The rep instruction takes rcx as its count, so we + ; reduce copying by using it to hold our count, as well. + pop.qreg rcx + + ; We have n - 1 items to slide, so decrement rcx. Also, save a copy of it in + ; rdx after doing that, for later. + dec.qreg rcx + mov.qreg.qreg rdx, rcx + + ; Retrieve the 0th item, for later. + mov.qreg.indirect.qreg rbx, rsp + + ; Now we set up parameters for the memory-sliding operation. 
We have
+  ; n - 1 items to copy, moving the range rsp + 8 through rsp + (n-1)*8 onto
+  ; the range rsp through rsp + (n-2)*8.
+  ;
+  ; We're sliding them downwards in memory, so we start at the low end so
+  ; that we're always moving into a location that doesn't have anything
+  ; precious. Both starting addresses are therefore fixed: rsp + 8 for the
+  ; source and rsp for the destination, no matter what the count is. Only
+  ; the length of the copy depends on rcx.
+  ;
+  ; rsi is our Forth "instruction pointer", so stash it in rax, which rep
+  ; movsq leaves alone. That way rsp stays put during the address math.
+  mov.qreg.qreg rax, rsi
+
+  ; Stepping rsp past item 0 and back gives us both addresses using only
+  ; register moves (assumes mov.qreg.qreg takes rsp as a source). Item 0 is
+  ; safe in rbx, so the pop is a discard and the push rewrites the same value.
+  pop.qreg rdi
+  mov.qreg.qreg rsi, rsp
+  push.qreg rbx
+  mov.qreg.qreg rdi, rsp
+
+  ; With ROLL, we were starting at the high end. Here, we start at the low
+  ; end, which means we need rsi to increment after each repetition. That's
+  ; what it does when the DF flag is clear, and another of our Forth
+  ; conventions is to keep it clear normally. So, we don't have to touch DF!
+  rep movsq
+
+  ; Restore our original rsi.
+  mov.qreg.qreg rsi, rax
+
+  ; There is now an extra item in the middle of the stack, at the high end of
+  ; the sliding we did, that needs to be overwritten with our value in rbx.
+  ; rep movsq counted rcx down to zero, so we use the copy of the count that
+  ; we saved in rdx: we want rsp + (n-1)*8.
+  mov.indexed.qreg.qreg rsp, rdx, 8, rbx
+
+  ; All done, wow! What a mouthful.
+ NEXT + ; Rotates "up" (third item becomes current item) defword ROLL3, 0 dq $ + 0x8 ; codeword @@ -1792,7 +2081,7 @@ defword ROLL3, 0 NEXT ; Rotates "down" (current item becomes third item) -defword ROLLD3, 0 +defword UNROLL3, 0 dq $ + 0x8 ; codeword pop.qreg rax pop.qreg rbx @@ -2130,10 +2419,61 @@ defword SYS_WRITE, 0 NEXT +;;;;;;;;;;;;;;;;;;;;;; +;;; Output helpers ;;; +;;;;;;;;;;;;;;;;;;;;;; + +; In: base address, value +; Out: new base address +defword PACK64, 0 + dq DOCOL ; codeword + dq SWAP, DUP, UNROLL3, STORE, LIT, 8, ADD + dq EXIT +defword PACK32, 0 + dq DOCOL ; codeword + dq SWAP, DUP, UNROLL3, STORE32, LIT, 4, ADD + dq EXIT +defword PACK16, 0 + dq DOCOL ; codeword + dq SWAP, DUP, UNROLL3, STORE16, LIT, 2, ADD + dq EXIT +defword PACK8, 0 + dq DOCOL ; codeword + dq SWAP, DUP, UNROLL3, STORE8, LIT, 1, ADD + dq EXIT + +; In the interests of reducing our executable's size, since a lot of it goes +; to PACK* invocations, we define words that combine LIT with PACK*. This +; shaves roughly 700 bytes as of when it was added. +defword LITPACK64, 0 + dq $ + 0x8 ; codeword + lodsq + push.qreg rax + BEFORENEXT PACK64 +defword LITPACK32, 0 + dq $ + 0x8 ; codeword + lodsq + push.qreg rax + BEFORENEXT PACK32 +defword LITPACK16, 0 + dq $ + 0x8 ; codeword + lodsq + push.qreg rax + BEFORENEXT PACK16 +defword LITPACK8, 0 + dq $ + 0x8 ; codeword + lodsq + push.qreg rax + BEFORENEXT PACK8 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; (new) Implementation strategy ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; +;;; We assemble the entire file contents in a heap-allocated buffer. When +;;; the file is fully assembled, we output it.
+;;; defword QUINE, 0 dq DOCOL ; codeword @@ -2176,50 +2516,6 @@ defword WRITE_SELF_RAW_H, 0 NEXT -; In: base address, value -; Out: new base address -defword PACK64, 0 - dq DOCOL ; codeword - dq SWAP, DUP, ROLLD3, STORE, LIT, 8, ADD - dq EXIT -defword PACK32, 0 - dq DOCOL ; codeword - dq SWAP, DUP, ROLLD3, STORE32, LIT, 4, ADD - dq EXIT -defword PACK16, 0 - dq DOCOL ; codeword - dq SWAP, DUP, ROLLD3, STORE16, LIT, 2, ADD - dq EXIT -defword PACK8, 0 - dq DOCOL ; codeword - dq SWAP, DUP, ROLLD3, STORE8, LIT, 1, ADD - dq EXIT - -; In the interests of reducing our executable's size, since a lot of it goes -; to PACK* invocations, we define words that combine LIT with PACK*. This -; shaves roughly 700 bytes as of when it was added. -defword LITPACK64, 0 - dq $ + 0x8 ; codeword - lodsq - push.qreg rax - BEFORENEXT PACK64 -defword LITPACK32, 0 - dq $ + 0x8 ; codeword - lodsq - push.qreg rax - BEFORENEXT PACK32 -defword LITPACK16, 0 - dq $ + 0x8 ; codeword - lodsq - push.qreg rax - BEFORENEXT PACK16 -defword LITPACK8, 0 - dq $ + 0x8 ; codeword - lodsq - push.qreg rax - BEFORENEXT PACK8 - - ;;; ;;; ELF header ;;; -- cgit 1.4.1