;;; QUINE ;;; ;;; This file is formatted to be read at 80-columns or wider. ;;; ;;; There's some tabular information, but diagrams have been avoided, in an ;;; attempt to make this manageable in screen readers. Feedback welcome. ;;;;;;;;;;;;;;;;;;;;; ;;; Workflow tips ;;; ;;;;;;;;;;;;;;;;;;;;; ;;; ;;; Currently, this is not yet fully self-hosting; it is based on ;;; flatassembler[1]. A minimal command to build and run it is: ;;; ;;; $ fasmg quine.asm quine && chmod 755 quine && ./quine; echo $? ;;; ;;; A workflow you may wish to use for debugging is: ;;; ;;; $ rm quine2; fasmg quine.asm quine && chmod 755 quine && ./quine > quine2; echo "exit code:" $?; echo; hexdump -C quine; echo; hexdump -C quine2; echo; cmp -l quine quine2 ; echo cmp: $? ;;; ;;; The reason this removes the old one first is that otherwise, there's a ;;; risk the error message will be scrolled off the top of the screen and ;;; you'll see stale output and not realize. ;;; ;;; You may also wish to do: ;;; ;;; $ objdump --disassemble quine ;;; $ ZydisDisasm -64 quine ;;; ;;; This relies on GNU binutils, and on zydis, respectively. ;;; ;;; [1] https://flatassembler.net/ ;;; ;;; ;;; gdb ;;; --- ;;; ;;; You can run gdb on it if you want; there's no symbols, but if you are ;;; familiar with the hex it should be readable. Keep a hexdump of the program ;;; handy to look up what addresses are. ;;; ;;; If you want to see a routine implemented in assembly, look at the hexdump ;;; of the overall file, find it by looking at the ASCII names, skip past the ;;; codeword, and do ie ;;; ;;; (gdb) disassemble/r 0x0x80007c0,+32 ;;; ;;; If you want to see the value stack, you can do ;;; ;;; (gdb) x/16xg $rsp ;;; ;;; The same will work with $rbp for the control stack, and don't forget that ;;; the "instruction pointer" is rsi. To see all the registers, do ;;; ;;; (gdb) info registers ;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Assembly language ;;; ;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; Before doing any actual code, we define macros for writing x86-64 ;;; assembly language. This is built from scratch, relying only on ;;; flatassembler's built-in semantics. No include files of any kind are used ;;; for it. macro pad: bytes if bytes > 0 db 0x00 pad (bytes - 1) end if end macro macro align: bytes if bytes > 0 if $ mod bytes <> 0 db 0x00 align bytes end if end if end macro ; The way these are all spelled out like this is slightly ridiculous, there ; must be a better way. macro rex.0 db 0x40 end macro macro rex.w db 0x48 end macro macro rex.r db 0x44 end macro macro rex.x db 0x42 end macro macro rex.b db 0x41 end macro macro rex.wr db 0x4C end macro macro rex.wx db 0x4A end macro macro rex.wb db 0x49 end macro macro rex.rx db 0x46 end macro macro rex.rb db 0x45 end macro macro rex.xb db 0x43 end macro macro rex.wrx db 0x4E end macro macro rex.wrb db 0x4D end macro macro rex.wxb db 0x4B end macro macro rex.rxb db 0x47 end macro macro rex.wrxb db 0x4F end macro macro modrm mod, reg, rm assert mod >= 0 & mod < 4 assert reg >= 0 & reg < 8 assert rm >= 0 & rm < 8 db (mod shl 6) OR (reg shl 3) OR rm end macro macro sib scale, index, base assert scale >= 0 & scale < 4 assert index >= 0 & index < 8 assert base >= 0 & index < 8 db (scale shl 6) OR (index shl 3) OR base end macro macro opcodereg opcode, reg assert opcode >= 0 & opcode < 256 & opcode AND 7 = 0 assert reg >= 0 & reg < 8 db opcode OR reg end macro macro opcodecc opcode, cc assert opcode >= 0 & opcode < 256 & opcode AND 15 = 0 assert cc >= 0 & cc < 16 db opcode OR cc end macro macro scalefield sfield, scale if 1 = scale sfield = 0 else if 2 = scale sfield = 1 else if 4 = scale sfield = 2 else if 8 = scale sfield = 3 else assert 0 end if end macro ; Yep, there sure is a lot of duplication in these. This is based on Intel's ; documented mnemonics... ; ; "Above" and "below" are for unsigned comparisons. "Greater" and "less" are ; for signed comparisons. ; ; This is documented on the individual opcode pages, and also in B.1.4.7. macro conditioncode cc, condition match =above, condition cc = 0x07 else match =above_equal, condition cc = 0x03 else match =below, condition cc = 0x02 else match =below_equal, condition cc = 0x06 else match =carry, condition cc = 0x02 else match =equal, condition cc = 0x04 else match =greater, condition cc = 0x0F else match =greater_equal, condition cc = 0x0D else match =less, condition cc = 0x0C else match =less_equal, condition cc = 0x0E else match =not_above, condition cc = 0x06 else match =not_above_equal, condition cc = 0x02 else match =not_below, condition cc = 0x03 else match =not_below_equal, condition cc = 0x07 else match =not_carry, condition cc = 0x03 else match =not_equal, condition cc = 0x05 else match =not_greater, condition cc = 0x0E else match =not_greater_equal, condition cc = 0x0C else match =not_less, condition cc = 0x0D else match =not_less_equal, condition cc = 0x0F else match =not_overflow, condition cc = 0x01 else match =not_parity, condition cc = 0x0B else match =not_sign, condition cc = 0x09 else match =not_zero, condition cc = 0x05 else match =overflow, condition cc = 0x00 else match =parity, condition cc = 0x0A else match =parity_even, condition cc = 0x0A else match =parity_odd, condition cc = 0x0B else match =sign, condition cc = 0x08 else match =zero, condition cc = 0x04 else assert 0 end match end macro ;;; On registers ;;; ------------ ;;; ;;; The x86 architecture has been around a while, it has been through ;;; several transitions from smaller word sizes to larger ones. Therefore it ;;; has different names for the "same" registers, depending on how much of ;;; them you're using. ;;; ;;; TODO there's more to write here macro bytereg result, register match =al?, register result = 0 else match =cl?, register result = 1 else match =dl?, register result = 2 else match =bl?, register result = 3 else match =ah?, register result = 4 else match =ch?, register result = 5 else match =dh?, register result = 6 else match =bh?, register result = 7 else assert 0 end match end macro macro wordreg result, register match =ax?, register result = 0 else match =cx?, register result = 1 else match =dx?, register result = 2 else match =bx?, register result = 3 else match =sp?, register result = 4 else match =bp?, register result = 5 else match =si?, register result = 6 else match =di?, register result = 7 else assert 0 end match end macro macro dwordreg result, register match =eax?, register result = 0 else match =ecx?, register result = 1 else match =edx?, register result = 2 else match =ebx?, register result = 3 else match =esp?, register result = 4 else match =ebp?, register result = 5 else match =esi?, register result = 6 else match =edi?, register result = 7 else assert 0 end match end macro macro qwordreg result, register match =rax?, register result = 0 else match =rcx?, register result = 1 else match =rdx?, register result = 2 else match =rbx?, register result = 3 else match =rsp?, register result = 4 else match =rbp?, register result = 5 else match =rsi?, register result = 6 else match =rdi?, register result = 7 else assert 0 end match end macro macro owordreg result, register match =r8?, register result = 0 else match =r9?, register result = 1 else match =r10?, register result = 2 else match =r11?, register result = 3 else match =r12?, register result = 4 else match =r13?, register result = 5 else match =r14?, register result = 6 else match =r15?, register result = 7 else assert 0 end match end macro ;;; Instructions ;;; ------------ macro mov.dreg.dimm target, source dwordreg treg, target opcodereg 0xB8, treg dd source end macro macro mov.qreg.dimm target, source qwordreg treg, target rex.w db 0xC7 modrm 3, 0, treg dd source end macro macro mov.qreg.qimm target, source qwordreg treg, target rex.w opcodereg 0xB8, treg dq source end macro ; Notice the use of REX.B here; this instruction puts the register number in ; the opcode field, so it uses Table 3-1. macro mov.oreg.qimm target, source owordreg treg, target rex.wb opcodereg 0xB8, treg dq source end macro macro mov.qreg.qreg target, source qwordreg treg, target qwordreg sreg, source rex.w db 0x89 modrm 3, sreg, treg end macro ; Take a 64-bit source register, treat it as an address and look up the 64-bit ; value it points to, store that into a 64-bit target register. ; ; For rbp, the only modes available also have displacement; we use an 8-bit ; one and set it to zero. The other registers could be encoded without the ; displacement, but for simplicity's sake we do the same thing for all of ; them. ; ; In understanding this, pay close attention to the Op/En column in the opcode ; table. The "RM" variant means the ModRM byte's R/M field (the third one) ; is the source, while its reg field (the middle one) is the target. This is ; what we want, because the R/M field is the one that gets indirection applied ; to it. Opcode 0x8B with an REX.W prefix is the all-64-bit RM variant. ; [Intel] volume 2B, chapter 3, section 3-4.3, "MOV". ; ; For the indirection modes, don't be confused by the many similar tables. ; 64-bit mode is encoded the same as 32-bit mode except for adding a REX.W ; prefix, as per 2.2.1.1, so you want table 2-2 to understand the ModRM byte. ; The presence or absence of an SIB byte is determined by where in that table ; we fall, and we aren't using a mode that has one. [Intel] volume 2A, ; chapter 2, section 2-1.5, table 2-2. ; ; We disallow rsp as a source because that's the mode that would want an SIB. macro mov.qreg.indirect.qreg target, source qwordreg sreg, source qwordreg treg, target rex.w db 0x8B modrm 1, treg, sreg match =rsp, source ; R/M = rsp is the SIB case sib 0, 4, sreg ; no scaling, no indexing, source as base end match db 0 end macro ; Take a 64-bit source register, store its value into the address pointed to ; by a 64-bit target register. ; ; For rbp, the only modes available also have displacement; we use an 8-bit ; one and set it to zero. The other registers could be encoded without the ; displacement, but for simplicity's sake we do the same thing for all of ; them. ; ; In understanding this, pay close attention to the Op/En column in the opcode ; table. The "MR" variant means the ModRM byte's reg field (the middle one) ; is the source, while its R/M field (the third one) is the target. This is ; what we want, because the R/M field is the one that gets indirection applied ; to it. Opcode 0x89 with an REX.W prefix is the all-64-bit MR variant. ; [Intel] volume 2B, chapter 3, section 3-4.3, "MOV". ; ; For the indirection modes, don't be confused by the many similar tables. ; 64-bit mode is encoded the same as 32-bit mode except for adding a REX.W ; prefix, as per 2.2.1.1, so you want table 2-2 to understand the ModRM byte. ; The presence or absence of an SIB byte is determined by where in that table ; we fall, and we aren't using a mode that has one. [Intel] volume 2A, ; chapter 2, section 2-1.5, table 2-2. ; ; We disallow rsp as a target because that's the mode that would want an SIB. ; When you look at other addressing modes, be aware that the special treatment ; is for whichever register is specified in the R/M field. Sometimes that's ; the source, and sometimes it's the target, depending on the opcode. macro mov.indirect.qreg.qreg target, source qwordreg sreg, source qwordreg treg, target rex.w db 0x89 modrm 1, sreg, treg match =rsp, target ; R/M = rsp is the SIB case sib 0, 4, treg ; no scaling, no indexing, target as base end match db 0 end macro ; 8-bit source register macro mov.indirect.qreg.breg target, source match =rsp, target assert 0 ; The SIB case. else match =rbp, target assert 0 ; An unrelated addressing mode. else bytereg sreg, source qwordreg treg, target db 0x88 modrm 0, sreg, treg end match end macro macro mov.breg.indirect.qreg target, source match =rsp, source assert 0 ; The SIB case. else match =rbp, source assert 0 ; An unrelated addressing mode. else qwordreg sreg, source bytereg treg, target db 0x8A modrm 0, treg, sreg end match end macro ; We use the operand-size prefix to specify 16-bit. No REX.W. Table 3-4. macro mov.indirect.qreg.wreg target, source match =rsp, target assert 0 ; The SIB case. else match =rbp, target assert 0 ; An unrelated addressing mode. else wordreg sreg, source qwordreg treg, target db 0x66 db 0x89 modrm 0, sreg, treg end match end macro ; We use the operand-size prefix to specify 16-bit. No REX.W. Table 3-4. macro mov.wreg.indirect.qreg target, source match =rsp, source assert 0 ; The SIB case. else match =rbp, source assert 0 ; An unrelated addressing mode. else qwordreg sreg, source wordreg treg, target db 0x66 db 0x8B modrm 0, treg, sreg end match end macro ; It defaults to 32-bit, no prefix needed, also no REX.W. Table 3-4. macro mov.indirect.qreg.dreg target, source match =rsp, target assert 0 ; The SIB case. else match =rbp, target assert 0 ; An unrelated addressing mode. else dwordreg sreg, source qwordreg treg, target db 0x89 modrm 0, sreg, treg end match end macro ; It defaults to 32-bit, no prefix needed, also no REX.W. Table 3-4. macro mov.dreg.indirect.qreg target, source match =rsp, source assert 0 ; The SIB case. else match =rbp, source assert 0 ; An unrelated addressing mode. else qwordreg sreg, source dwordreg treg, target db 0x8B modrm 0, treg, sreg end match end macro macro mov.qreg.indexed.qreg target, source, index, scale match =rbp, source assert 0 ; This is divided into some subcases we don't wish to deal with yet. else match =rsp, index assert 0 ; This is the case where it's not actually indexed after all. else qwordreg treg, target qwordreg sreg, source qwordreg ireg, index scalefield sfield, scale rex.w db 0x8B modrm 0, treg, 4 sib sfield, ireg, sreg end match end macro macro mov.indexed.qreg.qreg target, index, scale, source match =rbp, source assert 0 ; This is divided into some subcases we don't wish to deal with yet. else match =rsp, index assert 0 ; This is the case where it's not actually indexed after all. else qwordreg treg, target qwordreg sreg, source qwordreg ireg, index scalefield sfield, scale rex.w db 0x89 modrm 0, sreg, 4 sib sfield, ireg, treg end match end macro ; Take a 64-bit source register, store its value into a high 64-bit target ; register (r8-r15). ; ; Notice that there are two ways to add another bit to the register encoding. ; Table 3-1 is about REX.B, but does not apply here, it's for instructions ; that use opcode bits to specify a register, and none of the ; register-to-register MOV variants do that (it's for immediate mode). ; ; Instead, we want the mechanism that uses REX.R as the extra bit, and it ; combines with the reg field of ModRM, as per 2.2.1.2. ; ; Therefore, we want the variant of MOV which puts the target in the reg ; field. That's Op/En "RM", opcode 0x8B with REX.WR. ; ; Mode 3 is direct addressing. macro mov.oreg.qreg target, source owordreg treg, target qwordreg sreg, source rex.wr db 0x8B modrm 3, treg, sreg end macro ; Take a high 64-bit source register (r8-r15), store its value into a 64-bit ; target register. ; ; Notice that there are two ways to add another bit to the register encoding. ; Table 3-1 is about REX.B, but does not apply here, it's for instructions ; that use opcode bits to specify a register, and none of the ; register-to-register MOV variants do that (it's for immediate mode). ; ; Instead, we want the mechanism that uses REX.R as the extra bit, and it ; combines with the reg field of ModRM, as per 2.2.1.2. ; ; Therefore, we want the variant of MOV which puts the source in the reg ; field. That's Op/En "MR", opcode 0x89 with REX.WR. ; ; Mode 3 is direct addressing. macro mov.qreg.oreg target, source qwordreg treg, target owordreg sreg, source rex.wr db 0x89 modrm 3, sreg, treg end macro ; This increments a 64-bit register by 1, in place; macro inc.qreg target qwordreg treg, target rex.w db 0xFF modrm 3, 0, treg ; The 0 is part of the opcode. end macro ; This decrements a 64-bit register by 1, in place; macro dec.qreg target qwordreg treg, target rex.w db 0xFF modrm 3, 1, treg ; The 1 is part of the opcode. end macro ; This adds a 64-bit register to another 64-bit register, in place. macro add.qreg.qreg target, source qwordreg treg, target qwordreg sreg, source rex.w db 0x01 modrm 3, sreg, treg end macro macro add.indirect.qreg.qreg target, source match =rsp, target assert 0 ; The SIB case. else match =rbp, target assert 0 ; An unrelated addressing mode. else qwordreg treg, target qwordreg sreg, source rex.w db 0x01 modrm 0, sreg, treg end match end macro macro add.qreg.indirect.qreg target, source match =rsp, source assert 0 ; The SIB case. else match =rbp, source assert 0 ; An unrelated addressing mode else qwordreg treg, target qwordreg sreg, source rex.w db 0x03 modrm 0, treg, sreg end match end macro ; This adds a signed 8-bit immediate value to a 64-bit register, in place. ; ; Notice the use of 3 as the addressing mode. This says to use the register ; itself. The 0 in the reg field is part of the opcode. macro add.qreg.bimm target, source qwordreg treg, target rex.w db 0x83 modrm 3, 0, treg db source end macro ; This adds a signed 32-bit immediate value to a 64-bit register, in place. ; ; Notice the use of 3 as the addressing mode. This says to use the register ; itself. The 0 in the reg field is part of the opcode. macro add.qreg.dimm target, source qwordreg treg, target rex.w db 0x81 modrm 3, 0, treg dd source end macro ; This subtracts a 64-bit register from another 64-bit register, in place. macro sub.qreg.qreg target, source qwordreg treg, target qwordreg sreg, source rex.w db 0x2B modrm 3, treg, sreg end macro macro sub.indirect.qreg.qreg target, source match =rsp, target ; The SIB case. assert 0 else qwordreg treg, target qwordreg sreg, source rex.w db 0x2B modrm 0, sreg, treg end match end macro ; This subtracts a signed 8-bit immediate value from a 64-bit register, in ; place. ; ; Notice the use of 3 as the addressing mode. This says to use the register ; itself. The 5 in the reg field is part of the opcode. macro sub.qreg.bimm target, source qwordreg treg, target rex.w db 0x83 modrm 3, 5, treg db source end macro ; This subtracts a signed 32-bit immediate value from a 64-bit register, in ; place. ; ; Notice the use of 3 as the addressing mode. This says to use the register ; itself. The 5 in the reg field is part of the opcode. macro sub.qreg.dimm target, source qwordreg treg, target rex.w db 0x81 modrm 3, 5, treg dd source end macro ; This multiplies rax, as 64-bits, with another 64-bit register, in place. ; ; The 4 in the reg field is part of the opcode. macro mul.rax.qreg source qwordreg sreg, source rex.w db 0xF7 modrm 3, 4, sreg end macro ; The official mnemonic for this is "div", but it's divmod: It takes a 128-bit ; dividend formed from concatenating rdx as the high half with rax as the low ; half, and divides it by a 64-bit divisor from a specified register. It ; stores the quotient, truncated towards zero, in rax, and it stores the ; remainder in rdx. This entire process is unsigned. ; ; The 6 in the reg field is part of the opcode. macro div.rdxrax.qreg source qwordreg sreg, source rex.w db 0xF7 modrm 3, 6, sreg end macro ; Same as div, but signed. ; ; The 7 in the reg field is part of the opcode. macro idiv.rdxrax.qreg source qwordreg sreg, source rex.w db 0xF7 modrm 3, 7, sreg end macro macro and.qreg.qreg target, source qwordreg treg, target qwordreg sreg, source rex.w db 0x23 modrm 3, treg, sreg end macro macro and.qreg.bimm target, source qwordreg treg, target rex.w db 0x83 modrm 3, 4, treg ; The 4 is part of the opcode. db source end macro macro or.qreg.qreg target, source qwordreg treg, target qwordreg sreg, source rex.w db 0x0B modrm 3, treg, sreg end macro macro or.qreg.bimm target, source qwordreg treg, target rex.w db 0x83 modrm 3, 1, treg ; The 4 is part of the opcode. db source end macro macro xor.qreg.qreg target, source qwordreg treg, target qwordreg sreg, source rex.w db 0x33 modrm 3, treg, sreg end macro macro not.qreg target qwordreg treg, target rex.w db 0xF7 modrm 3, 2, treg ; The 2 is part of the opcode. end macro ; This sets the flags to the same things they'd be set to if subtracting ; right from left. macro cmp.qreg.qreg left, right qwordreg lreg, left qwordreg rreg, right rex.w db 0x3B modrm 3, lreg, rreg end macro ; This sets the flags to the same things they'd be set to if and'ing right ; with left. macro test.qreg.qreg left, right qwordreg lreg, left qwordreg rreg, right rex.w db 0x85 modrm 3, rreg, lreg end macro macro set.breg.cc target, condition bytereg treg, target conditioncode cc, condition db 0x0F opcodecc 0x90, cc modrm 3, 0, treg end macro ; Move from an 8-bit immediate value, to a location relative to a 64-bit ; register, with an 8-bit displacement and no indexing. ; ; This uses opcode 0xC6, which has w = 0. Since we run in 64-bit mode, that ; makes the operand size 8 bits, regardless of the current operand-size ; attribute. [Intel] volume 2D, appendix B, section B-1.4.3, table B-6. macro mov.qreg.disp8.bimm target, offset, source qwordreg treg, target db 0xC6 modrm 1, 0, treg ; the 0 is part of the opcode ; 4 is rsp, but it's a special case match =rsp, target ; R/M = rsp is the SIB case sib 0, 4, treg ; no scaling, no indexing, target as base end match db offset db source end macro ; Move from a 16-bit immediate value, to a location relative to a 64-bit ; register, with an 8-bit displacement and no indexing. ; ; This uses opcode 0xC7, which has w = 1. We run in 64-bit mode, so that gives ; us an operand size of 32 bits by default. [Intel] volume 1, chapter 3, ; section 3-6.1, table 3-4. We want a 16-bit operand, so we use the ; operand-size prefix, 0x66, and we leave REX.W unset. ; ; We need to treat rsp specially because it's the SIB case, per table 2-2. macro mov.qreg.disp8.wimm target, offset, source qwordreg treg, target db 0x66 db 0xC7 modrm 1, 0, treg ; the 0 is part of the opcode match =rsp, target ; R/M = rsp is the SIB case sib 0, 4, treg ; no scaling, no indexing, target as base end match db offset dw source end macro ; Move from a 32-bit immediate value, to a location relative to a 64-bit ; register, with an 8-bit displacement and no indexing. ; ; This uses opcode 0x67, which has w = 1. We run in 64-bit mode, so that gives ; us an operand size of 32 by default. [Intel] volume 2D, section B.1.43, ; table B-6. This is what we want, so we leave it. macro mov.qreg.disp8.dimm target, offset, source qwordreg treg, target db 0xC7 modrm 1, 0, treg ; the 0 is part of the opcode match =rsp, target ; R/M = rsp is the SIB case sib 0, 4, treg ; no scaling, no indexing, target as base end match db offset dd source end macro ; Move from a 64-bit register, to a 64-bit location relative to a 64-bit ; register, with an 8-bit displacement and no indexing. ; ; This uses opcode 0x89 with REX.W, so that gives us the reg field as the ; 64-bit source and the R/M field as the 64-bit destination. ; ; We need to treat a target of rsp specially because it's the SIB case per ; table 2-2. macro mov.qreg.disp8.qreg target, offset, source qwordreg sreg, source qwordreg treg, target rex.w db 0x89 modrm 1, sreg, treg match =rsp, source ; R/M = rsp is the SIB case sib 0, 4, 4 ; no scaling, no indexing, rsp as base end match db offset end macro ; Move from a 64-bit register, to a 64-bit location relative to a 64-bit ; register, with a 32-bit displacement and no indexing. ; ; This uses opcode 0x89 with REX.W, so that gives us the reg field as the ; 64-bit source and the R/M field as the 64-bit destination. ; ; We need to treat a target of rsp specially because it's the SIB case per ; table 2-2. macro mov.qreg.disp32.qreg target, offset, source qwordreg sreg, source qwordreg treg, target match =rsp, target rex.w db 0x89 modrm 2, sreg, treg ; treg is rsp by assumption, and R/M = rsp is the SIB case sib 0, 4, 4 ; no scaling, no indexing, rsp as base dd offset else rex.w db 0x89 modrm 2, sreg, treg dd offset end match end macro ; Move from a 32-bit immediate value, to a 64-bit location relative to a ; 64-bit register, with an 8-bit displacement and no indexing. ; ; Note that there is no instruction to move a 64-bit immediate to memory. ; ; This uses opcode 0xC7, which has w = 1. We run in 64-bit mode, so that ; gives us an operand size of 32 by default. [Intel] volume 2D, ; section B.1.43, table B-6. We want a 64-bit operand, so we use the REX.W ; prefix, 0x48. macro mov.qreg.disp8.dimm target, offset, source qwordreg treg, target match =rsp, target rex.w db 0xC7 modrm 1, 0, treg ; the 0 is part of the opcode ; 4 is rsp, but it's a special case sib 0, 4, treg ; no scaling, no indexing, rsp as base db offset dd source else rex.w db 0xC7 modrm 1, 0, treg ; the 0 is part of the opcode db offset dd source end match end macro ; "Load effective address". Compute a 64-bit address as you would for ; indexed addressing, with an 8-bit displacement and no indexing, but instead ; of doing anything with the memory, just store the address itself into a ; register. macro lea.qreg.disp8.qreg target, offset, source qwordreg treg, target qwordreg sreg, source rex.w db 0x8D modrm 1, treg, sreg match =rsp, source ; R/M = rsp is the SIB case sib 0, 4, sreg ; no scaling, no indexing, rsp as base end match db offset end macro macro lea.qreg.disp32.qreg target, offset, source qwordreg treg, target qwordreg sreg, source rex.w db 0x8D modrm 2, treg, sreg match =rsp, source ; R/M = rsp is the SIB case sib 0, 4, sreg ; no scaling, no indexing, rsp as base end match dd offset end macro macro lea.qreg.indexed.qreg target, source, index, scale match =rbp, source assert 0 ; This is divided into some subcases we don't wish to deal with yet. else match =rsp, index assert 0 ; This is the case where it's not actually indexed after all. else qwordreg treg, target qwordreg sreg, source qwordreg ireg, index scalefield sfield, scale rex.w db 0x8D modrm 0, treg, 4 sib sfield, ireg, sreg end match end macro ; Wow, we use ALL the instruction suffixes for this, huh. See [Intel] volume ; 2A, chapter 2, section 2-1, with particular attention to figure 2-1. macro lea.qreg.disp8.indexed.qreg target, offset, source, index, scale match =rbp, source assert 0 ; This is divided into some subcases we don't wish to deal with yet. else match =rsp, index assert 0 ; This is the case where it's not actually indexed after all. else qwordreg treg, target qwordreg sreg, source qwordreg ireg, index scalefield sfield, scale rex.w db 0x8D modrm 1, treg, 4 ; 1 in the mode field says we want a disp8. ; 4 in the R/M field says we want an SIB byte. sib sfield, ireg, sreg db offset end match end macro macro mov.breg.bimm target, source bytereg treg, target db 0xC6 modrm 3, 0, treg ; the 0 is part of the opcode db source end macro ; Clear the DF flag. This makes string instructions increment RSI. macro cld db 0xFC end macro ; Set the DF flag. This makes string instructions decrement RSI. macro std db 0xFD end macro ; Load 64 bits from the address in RSI into RAX. Then, increment or decrement ; RSI by 8 bytes, depending on the value of the DF flag. macro lodsq rex.w db 0xAD end macro ; [Intel] describes two different styles of mnemonic for the repeated string ; operations. See, their parameters are always rsi and rdi, or the smaller ; versions of those same specific registers. Intel thinks we might want to ; write out "rsi" explicitly, even though the only information it conveys is ; the size. The position we take is that it's better to let that be conveyed ; by the instruction name; otherwise it'd be a point of confusion for new ; readers, who might mistakenly think it's possible to pass it different ; registers. ; ; With the string instructions, the reader SHOULD be thinking, "Wait... ; where does this get its parameters from?" Writing them in a way that makes ; them appear simpler than they are would be confusing. macro rep operation match =movsb, operation db 0xF3 ; rep prefix db 0xA4 ; opcode else match =movsw, operation db 0xF3 ; rep prefix db 0x66 ; operand-size prefix db 0xA5 ; opcode else match =movsd, operation db 0xF3 ; rep prefix db 0xA5 ; opcode else match =movsq, operation ; The "rep" instruction can also be thought of as a prefix to other ; instructions, though only a few specific ones are allowed. Anyway, it ; comes before the REX byte. db 0xF3 ; The rest of this is the same as the encoding of normal, non-repeated ; movsq. rex.w db 0xA5 ; There's no explicit parameters. String operations are magic. else assert 0 end match end macro macro repnz operation match =scasb, operation db 0xF2 ; rep prefix db 0xAE ; opcode else assert 0 end match end macro ; Push a 64-bit value from a register onto the stack (the one pointed to by ; rsp). Decrement rsp, then write the value at the new location. ; ; In the corner case where rsp is also the value being pushed, the old value ; is the one used. ; ; There's an alternate encoding of this that uses a ModRM byte, but doing it ; without is more compact, so we do without. macro push.qreg source qwordreg sreg, source opcodereg 0x50, sreg end macro macro push.bimm source db 0x6A db source end macro ; Operand-size prefix makes it 16-bit. ; ; If you're trying to fake pushing a larger size by doing several 16-bit ; pushes, remember to start by pushing the low end and proceed upwards. ; [Intel] volume 1, chapter 9, section 9-2.4, "Memory Data Formats". macro push.wimm source db 0x66 db 0x68 dw source end macro ; There is no 64-bit immediate push. So, can we have a push instruction that ; pushes a 32-bit immediate value? Sort-of, but it's sign-extended to 64 bits, ; so rsp is decremented by 8, not by 4. This is that instruction. ; ; You need to do a really close read of a number of things to understand why. ; The opcode tables in [Intel] in volume 2D, appendix A, section A-3 give it ; the d64 annotation, which per table A-1 in section A-2.5 indicates that the ; operand size is always 64 bits and that there is no corresponding 32-bit ; version. Yet, the actual immediate value is still only 32 bits! Direct your ; attention to the instruction's details page, volume 2B, chapter 4, section ; 4-3, "PUSH". The description section clearly details that the immediate may ; be less than the operand size, which makes sense once you know it, but it ; doesn't explictly call out that the operand size is still 64 bits here. ; ; In general, the size of an immediate doesn't determine operand size, as you ; can read about in detail in [Intel] volume 1, chapter 3, section 3-6.1, with ; particular attention to table 3-4. ; ; Why is this surprising, given that it's consistent with the behavior of ; other instructions? Well, most instructions don't have such obvious ; side-effects. It's easy to not notice the operand size disagreeing with the ; immediate size when you'e only writing to a register, but changing the stack ; in an unexpected way breaks things much more obviously. ; ; Anyway, if you really want to decrement the stack pointer by 32 bits after ; a push, consider pushing a register. macro push.dimm source db 0x68 dd source end macro ; Pop a 64-bit value into a register from the stack (the one pointed to by ; rsp). Read the value from the old location, then increment rsp. ; ; In the corner case where rsp is also the destination being written to, the ; read happens from the old location, then the write causes the increment to ; be irrelevant. ; ; There's an alternate encoding of this that uses a ModRM byte, but doing it ; without is more compact, so we do without. macro pop.qreg target qwordreg treg, target opcodereg 0x58, treg end macro ; Do an absolute indirect jump with a 64-bit register operand. That is: given ; a register which holds a pointer, read another address from the pointed-to ; memory and jump to it. ; ; Technically this is a "near" jump in x86 terms, but we just pretend far ; jumps and segments don't exist. They are still a thing in 64-bit mode, we ; just don't use them. macro jmp.abs.indirect.qreg location qwordreg lreg, location db 0xFF modrm 0, 4, lreg end macro ; There in no 64-bit immediate "near" jump, so we use 32-bit. It's relatve, ; so that's honestly plenty. ; ; The location is relative to the start of the instruction immediately ; following the jmp. macro jmp.rel.dimm location db 0xE9 dd location end macro ; The location is relative to the start of the instruction immediately ; following the jmp. macro jmp.cc.rel.bimm condition, location conditioncode cc, condition opcodecc 0x70, cc db location end macro ; The location is relative to the start of the instruction immediately ; following the jmp. macro jmp.cc.rel.dimm condition, location conditioncode cc, condition db 0x0F opcodecc 0x70, cc dd location end macro ; Invoke a system call provided by the kernel. On Linux, the System V ABI ; describes the semantics of such calls (at least, on x86). macro syscall db 0x0F, 0x05 end macro ; Halts the CPU. We can't actually run this in userspace, but the kernel will ; kill our process or the debugger will break, and those are the outcomes we ; actually want. macro hlt db 0xf4 end macro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Executable file format ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; Before we get into the body of the program, we do a lot of ELF-specific ;;; stuff to ensure that our output is in a format Linux knows how to run. ;;; ;;; First, we set the origin to load at. This is arbitrary, but it can't be ;;; zero. We tell flatassembler about it because it's used in label ;;; calculations; we can reference it as $$ any time we need it in future. org 0x08000000 ;;; ;;; Second, we output ELF's top-level file header. The only interesting ;;; thing here is the entry pointer. ;;; elf_header: ; * denotes mandatory fields according to breadbox db 0x7F, "ELF" ; *magic number db 2 ; 64-bit db 1 ; little-endian db 1 ; ELF header format version 1 db 0 ; System-V ABI db 8 DUP 0 ; (padding) dw 2 ; *executable dw 0x3E ; *Intel x86-64 dd 1 ; ELF format version dq _start ; *entry point dq program_header - $$ ; *program header offset dq 0 ; section header offset dd 0 ; processor flags dw elf_header_size dw program_header_entry_size ; * dw 1 ; *number of program header entries dw 0 ; section header entry size dw 0 ; number of section header entries dw 0 ; section name string table index ; Save a copy of the size of this chunk for our future reference, by comparing ; the current posiion to the label above. elf_header_size = $ - elf_header ;;; ;;; Third, immediately after the ELF file header, we output ELF's program ;;; header, which lists the memory regions ("segments") we want to have and ;;; where we want them to come from. We list just a single region, which is ;;; the entire contents of the ELF file from disk. ;;; ;;; It would be more typical to use this header to ask the loader to give us ;;; separate code and data segments, and perhaps a stack or heap, but this ;;; keeps things simple, and we can create those things for ourselves later. ;;; ;;; We do have a little stack space available, though we don't explicitily ;;; request any; the kernel allocates it for us as part of exec() so that it ;;; can pass us argc and argv (which we ignore). That stack space will be at a ;;; random address, different every time, because of ASLR; that's a neat ;;; security feature, so we leave it as-is. ;;; program_header: dd 1 ; *"loadable" segment type dd 0x05 ; *read+execute permission dq 0 ; *offset in file dq $$ ; *virtual address ; required, but can be anything, subject to alignment dq 0 ; physical address (ignored) dq file_size ; *size in file dq file_size ; *size in memory dq 0 ; segment alignment ; for relocation - will we be ASLR'd? ; Save the size of this chunk, as well. program_header_entry_size = $ - program_header ; Everything after this point is code or data, not headers, so save the start ; of it for use in size calculations later. code_start: ;;;;;;;;;;;;;;;;;;;;;;; ;;; Execution model ;;; ;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; We use Forth-style dual stacks, one for values and one for control. We ;;; use rsp for values, just like C does. We use rbp for the control stack, ;;; which is a special Forth-y stack: These are pointers into the bodies of ;;; Forth words, not return addresses. ;;; ;;; The choice of rsp and rbp for the stack pointers imitates Jonesforth; ;;; I'm hopeful that it gives us convenient addressing modes, and will report ;;; back about that when I feel that I understand the implications. ;;; ;;; In Forth, everything is a "word", including mutable variables. ;;; Conceptually, a word is a unit of execution, which may be implemented ;;; either in machine code or as an array of pointer to other words. ;;; ;;; This polymorphism is implemented by having each word's contents begin ;;; with a "codeword", which is a pointer to machine code that "interprets" ;;; the rest of the contents. In the case of words implemented in machine ;;; code, the codeword points directly to that code, which is normally right ;;; next to it. ;;; ;;; Variables, to Forth, are simply one more thing that can be executed; the ;;; effect of executing a variable is to push its address onto the value ;;; stack. ;;; ;;; We adopt this model of words, codewords, and variables-as-words. It's ;;; really nice how it doesn't force anything else on us, not even a heap, ;;; though we do end up using a heap. ;;; ;;; We specifically implement a version of calling and returning that Forth ;;; calls indirect threaded code: The control stack is a stack of pointers ;;; into the middle of interpreted words. The interpreter snippet, called ;;; docol, implements calling. Each word is responsible for making sure ;;; returning works properly. Interpreted words accomplish this by ending with ;;; the word "exit", while machine-code words accomplish it by ending with a ;;; verbatim snippet called "next". ;;; ;;; Conceptually, "next" returns, but more specifically it accomplishes this ;;; by doing the caller's next dispatch for it; thus control never actually ;;; goes back to the caller's interpreter after initial setup. For performance ;;; reasons, "next" is always inlined, so we define it as a macro. ;;; ;;; The docol routine is just ordinary code, not a macro. It's defined later ;;; in this file, as a label. ;;; ;;; Notionally, we could consider not having a dictionary, and not giving ;;; our words names. However, it feels silly to stop when we're so close to ;;; being a full Forth, and using names for things solves a bootstrapping ;;; problem related to heap management - see the write-up of _start about how ;;; the heap is created, below. So, we add an additional header before the ;;; codeword for this purpose. ;;; ;;; The Forth dictionary is usually a linked list of every word that has ;;; ever been defined, with the newest at the head; the names of words are ;;; stored in string fields, often right next to the link pointer. We adopt ;;; this model, with the field sizes and order shown in the quick reference ;;; below. We break with Forth tradition in one way: Rather than having a ;;; length field, we use a null-terminated string. Thus, there's no length ;;; limit on names. This necessitates breaking out the flags (to be explained ;;; later) into their own byte, rather than taking bits from the length field ;;; for them. ;;; ;;; There's an important performance consideration: Executable words ;;; reference each other by pointers to their respective codewords. However, ;;; dictionary entries reference each other by pointers to their respective ;;; link fields. Traversing from the link field to the codeword is easy, ;;; though it's a non-constant-time operation: Just walk the string. In order ;;; to make Forth words easy to "decompile", it would be nice to also have a ;;; way to traverse backwards. We solve this by making the name field be ;;; null-terminated at both ends. Fun, yeah? ;;; ;;; ;;; ;;; ;;; -------------------------------------------------------------------------- ;;; Quick Reference ;;; -------------------------------------------------------------------------- ;;; ;;; The layout of an interpreted word: ;;; ;;; (overall start) ;;; 0x00 - 0x08 Link (to next-oldest word) ;;; 0x09 - 0x09 I0H00000 Flags ;;; I - immediate ;;; H - hidden ;;; all other bits reserved ;;; (name start) ;;; 0x0a - 0x0a Null byte (terminates name) ;;; 0x0b - name-end - 1 Name, as UTF-8 ;;; name-end - name-end Null byte (terminates name) ;;; (padding start) ;;; name-end + 1 - codeword-start - 1 Zero-pad to 8-byte boundary ;;; (it's possible this will be zero bytes long) ;;; (codeword start) ;;; ... + 0x00 - ... + 0x08 Codeword (ie. address of docol) ;;; (8-byte chunks) Addresses of other words ;;; - ... (end) Address of "exit" word ;;; ;;; The layout of a machine-code word is different only from the codeword on: ;;; ;;; ... + 0x00 - ... + 0x08 Addresss of next byte ;;; ... + 0x08 - ???? Arbitrary machine code ;;; - ... (end) Inlined implementation of next ;;; ;;; Also, words always start at 8-byte boundaries. ;;; ;;; ;;; REGISTER usage conventions: ;;; ;;; * rsi is the "instruction pointer" for the "interpreter". ;;; That is, it points to some word-pointer inside an array of ;;; word-pointers inside the content of the word they're part of. It always ;;; points to the next word that should be executed, whose execution hasn't ;;; begun yet. ;;; ;;; * rbp points to the top of the control stack ;;; These are former values of rsi, to eventually be returned to, from ;;; successively older callers as you look further up the stack. The stack ;;; grows downwards in memory. Since values are kept separately, the only ;;; thing on the control stack is return addresses, one per layer of call. ;;; ;;; * rsp points to the top of the value stack ;;; The value stack has no specific format, but it grows downwards in ;;; memory. In particular there's no concept of stack frames, because items ;;; on the stack don't belong to any particular word; the value stack in ;;; Forth is in part a mechanism for passing values between words. ;;; ;;; Additionally, immediately after beginning execution of a word: ;;; ;;; * rax points to the address of the codeword being executed ;;; The value of rax is purely for the callee's benefit, and does not need ;;; to be preserved. ;;; ;;; Other registers are purely discretionary, and are not preserved across ;;; calls. ;;; ;;; ;;; FLAG usage: ;;; ;;; * DF should be 0 ;;; We use lodsq extensively and that makes it increment rsi after using it. ;;; ;;; -------------------------------------------------------------------------- ;;; ;;; Macro next ;;; ---------- ;;; ;;; Include this inline at the end of a word implemented in machine-code. ;;; Conceptually, it returns. What it actually does is do the next thing the ;;; caller would do, which is call the next word from the caller's array of ;;; word pointers. ;;; ;;; Registers in: ;;; ;;; * rsi points to the address of the word to execute ;;; ;;; Registers out: ;;; ;;; * rax points to the codeword in the contents of the word that was executed ;;; * rsi points to the next word-address after this one ;;; ;;; Flags ;;; * DF = 0 is required ;;; macro next ; Copy the next word's address from *rsi into rax. Increment rsi (as per the ; DF flag). lodsq ; Load the codeword from the word's contents, and jump to the interpreter it ; points to. jmp.abs.indirect.qreg rax end macro ;;; ;;; Macro beforenext ;;; ---------------- ;;; ;;; Sometimes we want to transfer control from a word implemented in ;;; machine-code to another word, without coming back after, as if we were ;;; simply jumping to it. This is an innovation of ours; Jonesforth doesn't do ;;; it. ;;; ;;; This implementation will work regardless of how the receiving word is ;;; implemented. It impersonates the "next" snippet, setting up rax to point ;;; to the codeword then jumping to the interpreter. Since it doesn't change ;;; the control stack or rsi, when the receiving word eventually invokes ;;; "next"; it will pick up in the same place as if this sending word had done ;;; it. ;;; ;;; Thus, notionally we are doing just this one transfer of control before ;;; eventually getting around to inlining "next". Hence the name. ;;; macro beforenext target ; Do a permanent transfer of control by setting rax and invoking the ; codeword. Of course, we could jump to docol ourselves but this will work ; regardless of what the receiving codeword is. mov.qreg.qimm rax, target jmp.abs.indirect.qreg rax end macro ;;; ;;; Macros pushcontrol ;;; popcontrol ;;; ------------------ ;;; ;;; Include these inline to push an address onto the control stack, or pop ;;; one off of it. You will recall the control stack is kept in rbp. The ;;; parameter is given in a user-specified register. ;;; ;;; Jonesforth's analogous macros are called PUSHRSP and POPRSP but I think ;;; that's super confusing, since rsp is also the name of a register, but a ;;; different one. I guess it was less confusing in 32-bit, since esp doesn't ;;; start with an "r". Anyway, this has to be named something that ;;; distinguishes it from Intel's PUSH and POP opcodes, so... ;;; ;;; "Load effective address" is just a cute way to do arithmetic on a ;;; register, here. To push or pop we decrement or increment rbp by 8. To ;;; actually interact with the space in the stack, we indirect through rbp. ;;; ;;; Registers in and out: ;;; ;;; * rbp points to the top of the control stack. ;;; macro pushcontrol source lea.qreg.disp8.qreg rbp, -8, rbp mov.indirect.qreg.qreg rbp, source end macro macro popcontrol target mov.qreg.indirect.qreg target, rbp lea.qreg.disp8.qreg rbp, 8, rbp end macro ;;; ;;; Routine _start ;;; -------------- ;;; ;;; This is the entry point of the whole program, the very first code we ;;; actually execute. Linkers traditionally call this _start, and on balance ;;; I think it's probably best to keep that name, though I've honestly never ;;; liked it... Anyway, the ELF header points to it and exec() jumps to it. ;;; Also, though it could be anywhere in the code part of the output, in order ;;; to make the hexdump pretty we put it at the start. ;;; ;;; The kernel gives us most registers zeroed, and rsp pointing to the ;;; command-line stuff (argc, argv, envp), which is at an ASLR'd address with ;;; some stack space allocated for us, despite the fact we didn't request any. ;;; It also gives us all the flags clear except IF, but we don't rely on that. ;;; Lastly, of course, it loads our code segment and sets the instruction ;;; pointer where we asked; we don't need to check what those addresses are, ;;; because they're not randomized. ;;; ;;; This routine is really only responsible for one-time initialization. ;;; ;;; Registers in: ;;; ;;; * rsp points to the logical top of the value stack ;;; The kernel sets this up for us, and we need to save it somewhere so ;;; Forth can use it. ;;; ;;; Registers out: ;;; ;;; * rsi points within "quit" ;;; Quit is the word that's Forth's closest equivalent to main(). ;;; * rsp points to the top of the value stack ;;; ;;; Notably, rbp is still uninitialialized after _start. ;;; ;;; Stack in: ;;; ;;; * argc, argv, envp in the usual Unix way ;;; We ignore them, though. ;;; ;;; Stack out: ;;; ;;; * The value of "heap", as a pointer ;;; The meaning of this will be explained below. ;;; ;;; Registers within: ;;; ;;; * rdi points to the base the heap was allocated at, once it exists ;;; This is the same value that "heap" will hold, once we reach a point ;;; where we have variables. Of course, variables are stored on the heap, ;;; hence this temporary measure. ;;; ;;; We also take this opportunity to define soeme memory layout parameters ;;; that this routine will be responsible for doing something with: ;;; heap_requested_address = 0x0000001000000000 ; (very arbitrary) heap_size = 0x0000000001000000 ; 16 MiB control_stack_size = 0x10000 ; 64 KiB _start: cld ; clear the DF flag ;;; ;;; Prepare the heap. ;;; ;;; We could ask for a data segment in the program header, but where's the ;;; fun in that? Instead, we call mmap(). ;;; ;;; If we wanted the kernel to do ASLR for us, passing address zero would ;;; cause it to pick somewhere at random, but instead we choose our own ;;; location. It's still not guaranteed to be where we ask for, so we still ;;; do the work to record where it wound up. We could pass the "fixed" flag ;;; and the kernel would trust us, but this gives us more options for ;;; interoperating with other runtimes. ;;; mov.qreg.qimm rax, 9 ; mmap() mov.qreg.qimm rdi, heap_requested_address ; address (very arbitrary) mov.qreg.qimm rsi, heap_size ; size (one meg) mov.qreg.qimm rdx, 0x03 ; protection (read+write) mov.oreg.qimm r10, 0x22 ; flags (private+anonymous) mov.oreg.qimm r8, 0 ; file descriptor (ignored) mov.oreg.qimm r9, 0 ; offset (ignored) syscall ;;; ;;; The return value of the system call is in rax, we'll use it in a sec. ;;; We need to save this somewhere in case we ever want to munmap() it; ;;; there's no widely-used name for it so we have to make one up. S0 and r0 ;;; are widely-used names for the physical tops (logical bottoms) of the ;;; value and control stacks, respectively, and we will eventually set those ;;; up as well, so we should keep those names in mind. The control stack ;;; lives within the heap, while the value stack is its own segment. This ;;; value, though, is the physical bottom of the segment, meaning that it ;;; stays the same even as we allocate and deallocate things within it. This ;;; is unlike the two stack pointers, so we give it a name that doesn't ;;; suggest similarity: "heap". ;;; ;;; Once Forth is fully set up, its internal variables will be accessed ;;; through variable-words like any other Forth data, including "heap". To ;;; get to that point, though, we need to be able to hold onto variable data ;;; between now and then. In fact, if we don't have at least one of "heap" ;;; and "here" (its counterpart which points to the logical top end), all ;;; our efforts to hold onto anything seem a bit doomed. ;;; ;;; So, we temporarily dedicate rdi to "heap" - only within this routine - ;;; and store everything else in ways that let us find things by reference ;;; to it. We choose rdi because it works with the indexing modes we care ;;; about, and its name suggests its function. ;;; ;;; The strategy Jonesforth uses is not applicable to us; Jonesforth ;;; takes advantage of the linker to let its code segment refer to specific, ;;; pre-allocated objects in the data segment. We are our own linker. ;;; Hence, this approach. ;;; ;;; Keying things off "heap" is the fundamental decision, but to make sure ;;; our variables are accessible both during early bootstrapping, and later, ;;; we also have to be thoughtful about data structures. More on that in a ;;; moment. ;;; mov.qreg.qreg rdi, rax ;;; ;;; We also initialize rbp. We could hold off and let "quit" do this, but ;;; doing it now is the easiest way to initialize the r0 variable, since ;;; there's no instruction that moves a 64-bit immediate to memory. ;;; ;;; This is the moment at which we decide where the control stack starts! ;;; Fun, right? "Allocation" is just a fancy word for picking where we want ;;; something, then being consistent about it - like placing furniture in ;;; your home. See below for a little more thought about why here in ;;; particular. ;;; lea.qreg.disp32.qreg rbp, control_stack_size, rdi ;;; ;;; Now we save some stuff onto the heap. These are the locations that ;;; will eventually be the backing stores of the Forth variables, but we ;;; don't create the word headers yet, since there's no requirement that ;;; they be next to the backing stores. We'll do that later, once we have ;;; word-writing infrastructure in place. For now, we just use their offsets ;;; relative to the physical bottom of the heap, which are fixed. ;;; ;;; These will be the permanent homes of these values, though we have ;;; copies of them elsewhere while we're still in this routine. ;;; mov.qreg.disp32.qreg rdi, control_stack_size + 0x00, rdi ; heap mov.qreg.disp32.qreg rdi, control_stack_size + 0x08, rsp ; s0 mov.qreg.disp32.qreg rdi, control_stack_size + 0x10, rbp ; r0 mov.qreg.qimm rax, final_word_name mov.qreg.disp32.qreg rdi, control_stack_size + 0x18, rax ; latest lea.qreg.disp32.qreg rax, control_stack_size + 0x28, rdi mov.qreg.disp32.qreg rdi, control_stack_size + 0x20, rax ; here ; TODO also consider STATE ; strictly speaking, r0 could be a constant... but it isn't known until ; runtime, so we might as well make it a variable ;;; ;;; * "heap" is the physical bottom of the heap ;;; The heap grows upwards in memory, so this is also the logical ;;; bottom. This comes from the address mmap() just returned to us. ;;; * "s0" is the logical bottom of the value stack ;;; The value stack grows downwards in memory, so this is the physical ;;; top of it. This comes from the stack pointer the kernel initialized us ;;; with. ;;; * "r0" is the logical bottom of the control stack ;;; The control stack also grows downwards, so this is its pysical top ;;; as well. We allocate this dedicated space within the heap right here, ;;; in this routine, through our choice of where to put things. ;;; * "here" is the physical start of the unallocated space in the heap ;;; We allocate heap space from bottom to top, by incrementing this ;;; value. So, it would also be accurate to say that it points immediately ;;; after the physical top of the allocated space. At any rate, the ;;; address it points to is the first one that hasn't been used yet. ;;; * "latest" is the address of the most-recently-defined word's header ;;; Defining new words changes this value. ;;; ;;; s0 and r0 are mostly used when we want to initialize or reinitialize ;;; their respective stacks - that is, discard all their contents at once. ;;; ;;; The value of r0 is the same address these variables start at, so ;;; you'll want to do a close read of the implementation of pushcontrol ;;; and convince yourself that it only ever writes things just below the rbp ;;; address it receives, never right on top of it. ;;; ;;; Notice that "here" points immediately after itself. This is just a ;;; convenience, making it the last one like that so that the concern is ;;; dealt with in a single place and is easy to keep up-to-date with code ;;; changes. ;;; ;;; A little more detail about why we offset everything by ;;; control_stack_size: We're carving out some space at the bottom of the ;;; heap - which grows low-to-high - to be the control stack - which grows ;;; high-to-low. So the control stack is allocated out of the heap as a ;;; fixed-size, one-time thing, and then the variables come immediately ;;; after that. We do need to use 32-bit displacement indexing to access ;;; them this way, but that's no big deal. ;;; ;;; This is perhaps questionable, they should maybe be separate segments ;;; created with separate calls to mmap(), but for now we're not worried ;;; about overflow so we use the same allocation for both. ;;; ;;; We'll come back to these variables a bit later and generate the word ;;; headers that point at them, but now we're almost ready to switch to ;;; proper threaded-execution, so we finish that setup first... ;;; ;;; ;;; Push the value of "heap" onto the value stack so that it can be the ;;; breadcrumb the threaded code needs to find... the backing store of ;;; "heap". Yes, self-reference can be weird like that sometimes. There's ;;; nothing stopping "quit" from reading rdi, it just violates the ;;; abstraction... ;;; push.qreg rdi ;;; ;;; We are about to set up rsi, we did rbp already, and rsp came to us ;;; already set up. That's all that "next" needs, so take it away! ;;; mov.qreg.qimm rsi, cold_start next ;;; ;;; This isn't really a routine so much as it's an array of words (exactly ;;; one of them), which is what "next" wants rsi to point to. It's only ever ;;; used this one time, so we just put it right here. ;;; align 8 cold_start: ;;; TODO this is probably where we should deal with that "heap" that we passed ;;; on the stack ;;; Before handing off to us, _start pushed a single value onto the stack, ;;; a pointer to the beginning of the heap. Now, we load our entire Forth ;;; implementation onto that heap, beginning with the minimal set of words ;;; needed to define more words. We do this because we need variables as ;;; infrastructure so we can eventually have dynamic definitions. ;;; ;;; There's something non-obvious here: words implemented statically as ;;; part of the executable image can't contain things that vary at runtime. ;;; That means that even if these words tried to implement some sort of ;;; dynamic lookup, they would have no way to find the root of whatever ;;; dynamic data structure they use. Dynamism needs to be bootstrapped. ;;; ;;; In a more traditional C-style program, static code could look up ;;; variables based on fixed addresses that are the same on every run. ;;; Failing that, we could dedicate a register to it, though that's a ;;; considerable expense. We chose not to do either of those things, because ;;; we want the versatility that comes with not being picky about our ;;; address space: It allows us to contemplate future improvements such as ;;; ASLR, or embedding into other processes that impose their own addressing ;;; constraints, or even coexisting with multiple versions of ourselves. ;;; That choice does mean we have the hard version of this bootstrapping ;;; problem, and copying ourselves to the heap is how we solve it. ;;; ;;; We do have the heap address right now, though that won't last. In case ;;; it's unclear why not: keeping it on the stack would require all future ;;; references to walk the stack, and somehow know when they've reached the ;;; bottom. The stack is a good place to keep things with clearly delimited ;;; lifetimes and visibility, but when we want something to live for our ;;; entire program and be easy to find from any code within it, we need to ;;; do something else. Anyway, since we have the address, we can use it for ;;; the next little bit of setup. ;;; ;;; The first few words we define are our variables, which hardcode the ;;; addresses they will return - but since we're doing this at runtime, ;;; "hardcoding" can reflect where our heap is. This is the fundamental ;;; trick that makes the heap usable. ;;; ;;; One more thing to notice: We already allocated the backing stores of ;;; these variables, and populated their initial values, in _start. The ;;; words we're defining return those same addresses for the same backing ;;; stores. So, we have continuity: Stuff defined in terms of the ;;; variable-words we're defining now will interoperate with the stuff that ;;; we define in the "early" way, which includes those very words. Both the ;;; early code and the later code are dealing with the same data structures, ;;; they're just using a different technique to find them. ;;; ;;; This is the only hardcoding we need to do; by building on top of it, ;;; we will soon reach a point where the rest of the system can be defined ;;; within itself. dq early_heap, litstring, "heap", early_variable dq early_s0, litstring, "s0", early_variable dq early_r0, litstring, "r0", early_variable dq early_latest, litstring, "latest", early_variable dq early_here, litstring, "here", early_variable ;;; Now we define a heap version of docol. Strictly speaking it doesn't ;;; need to be among the first words, it only needs to come before the first ;;; words implemented in Forth. However, it's conceptually tidy to have it ;;; that way, so that's what we do. ;;; ;;; Docol also presents a unique challenge, in that it's two snippets of ;;; code and one of them needs to refer to the other. When we use docol as ;;; the codeword of a word we're defining, we point to a snippet which acts ;;; as an interpreter for the word's body. However, when we look up "docol" ;;; in the dictionary, what we get is a word that returns the address of the ;;; interpreter snippet, effectively acting as a constant. ;;; ;;; One way to make this work would be to use a forward-referencing ;;; address using the labels system. However, it turns out that only docol ;;; and zbranch would benefit from this, and we drastically simplify our ;;; code by reworking things so that no forward reference is needed. ;;; ;;; What we do is define the interpreter snippet first, allocating space ;;; for the machine code directly out of "here", with no word header nor any ;;; dictionary entry pointing to it. We keep track of the address we put ;;; that at, then we define the constant to point to it. ;;; ;;; While it may seem weird to use space that's "outside" of any word, ;;; keep in mind that using the heap in creative ways is part of the spirit ;;; of Forth. Jonesforth doesn't have this bootstapping problem, but its ;;; variables use this same technique of putting the value before the word ;;; header to avoid a forward reference. Also, words don't have end ;;; delimiters, so who's to say what's inside or outside them? dq early_here, fetch, dup dq rsi, pack_pushcontrol dq lit, 8, rax, add_reg64_imm8 dq rax, rsi, mov_reg64_reg64 dq pack_next dq lit, 8, packalign dq roll3, swap, early_here_store, swap ; Now the interpreter snippet is in-place and "here" points after it, so ; that future allocation won't step on it. We also still have a copy of its ; start address, which we will now pass to early_variable. dq litstring, "docol", early_variable ; (While it might be tidy to have a separate "early_constant", it would do ; the same thing. Late variables and constants will be different because the ; real "variable" word will also be responsible for allocating the backing ; store, but the only thing early_variable is doing is returning an ; address.) dq litstring, "exit", early_create, early_self_codeword, early_here, fetch dq rsi, pack_popcontrol, pack_next dq lit, 8, packalign, early_here_store ;;; For triage's sake, here's an inventory of everything else in the file. ;;; ;;; Macros: ;;; next, beforenext, pushcontrol, popcontrol ;;; Assembly: ;;; docol (the constant, and the actual codeword implementation) ;;; needs a label ;;; pushcontrol, next ;;; exit ;;; popcontrol, next ;;; swap, drop, drop2, roll, unroll, roll3, unroll3, dup, dup2 ;;; add, sub, mul, divmod ;;; eq, ne, gt, lt, ge, le, and, or, xor, invert ;;; lit, litstring ;;; store, fetch ;;; addstore, substore, store8, fetch8, store16, fetch16, store32, fetch32 ;;; ccopy, stringlen ;;; no dependencies except next for any of these ;;; branch, zbranch ;;; sorta needs a label but might be avoidable ;;; needs next ;;; Forth: ;;; pack64, pack32, pack16, pack8, packstring, packalign ;;; only the basics above ;;; Assembly: ;;; litpack64, litpack32, litpack16, litpack8 ;;; beforenext, pack* ;;; Forth: ;;; rax, rcx, dx, rbx, rsp, rbp, rsi, rdi ;;; r8, r9, r10, r11, r12, r13, r14, r15 ;;; only the basics above ;;; reg64, extrareg64 ;;; only the basics plus optional emitstring and sys_exit ;;; (notice that these are forward references!) ;;; rex_w, rex_wb, opcodereg, modrm ;;; only the basics above ;;; cld, mov_reg64_imm64, mov_reg64_reg64, push_reg64, lodsq ;;; jmp_abs_indirect_reg64, syscall ;;; basics plus assembly helpers ;;; Forth, not needed on heap: ;;; early_heap, early_s0, early_r0, early_latest, early_here ;;; early_create, early_comma, early_self_codeword, early_here_store, ;;; pack_next, early_variable ;;; ;;; It's likely that nothing past this point is required for the heap copy, ;;; but it's here for completeness. ;;; ;;; Forth, subject to reconsideration: ;;; quit ;;; quine, sys_exit (these are forward references) ;;; Assembly: ;;; sys_exit, sys_write ;;; nothing special ;;; Forth: ;;; emitstring ;;; stringlen, sys_write, basics ;;; quine ;;; early_here (removable), all_contents (forward) ;;; sys_write, basics ;;; use_label, set_label ;;; only basics ;;; Assembly: ;;; hlt ;;; nothing ;;; Forth: ;;; all_contents, elf_file_header, elf_program_header ;;; output_start_routine ;;; nothing special in any of these (they do want use/set_label) ;;; self_raw ;;; self-reference ;;; dq quit ;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Now we are in Forth ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; Everything we define from here on out is an actual Forth word, with a ;;; proper header and everything. So, you'll see some more preamble before the ;;; definitions. ;;; ;;; Keep in mind, though, that, although we have threaded execution, we ;;; don't yet have Forth-style variables. That's because the heap is at a ;;; dynamically-chosen location, so none of this read-only code that we're ;;; defining now can reference it. Before invoking cold_start, we thoughtfully ;;; put the value of "heap" on the stack for ourselves; our first task will be ;;; to dynamically allocate some words on the heap that know how to find the ;;; heap. We'll do that by defining bootstrapping versions of the ;;; word-defining words, which will eventually be replaced. ; Token pasting is possible in flatassembler, but kind of a pain. calminstruction defword_label_helper name local full_name, address arrange full_name, name#=_name ; do the token-pasting compute address, $ ; compute the current address publish full_name:, address ; bind the label end calminstruction ; The only time we actually use this variant is docol. macro defword_unlabeled name, flags align 8 defword_label_helper name dq latest_word latest_word = $ - 8 db flags, 0x00, `name, 0x00 align 8 end macro ; This is the variant we use to define ordinary words. macro defword name, flags defword_unlabeled name, flags label name end macro latest_word = 0 ;;; ;;; Routine docol ;;; ------------- ;;; ;;; Reference this via its label as the codeword of a word to make it an ;;; "interpreted" word. Concretely, it saves rsi (the "instruction pointer") ;;; to the control stack, takes the address of the codeword from rax and ;;; increments it in-place to form the new instruction pointer, and copies ;;; that to rsi. ;;; ;;; Having then done this, we're now in the state that normal execution ;;; expects, so docol ends by it using "next" to begin the callee's execution, ;;; kicking off a nested call. ;;; ;;; The name is said to be short for "do colon", because Forth high-level ;;; code begins word definitions with a colon. ;;; ;;; Registers in: ;;; ;;; * rsi is the caller's instruction pointer ;;; * rbp is the control stack pointer ;;; * rax is the address of the callee's codeword ;;; ;;; Registers out: ;;; ;;; * rsi is the callee's instruction pointer ;;; * rbp is the control stack pointer defword_unlabeled docol, 0 docol_constant: ; Evaluated as a word, docol is a constant which returns a pointer. dq $ + 8 mov.qreg.qimm rax, docol push.qreg rax next align 8 docol: ; Since docol is not a normal word, the label points to the value we care ; about from the assembly side of things, which is the address we use as the ; codeword. pushcontrol rsi add.qreg.bimm rax, 8 mov.qreg.qreg rsi, rax next latest_word = docol_name ;;; ;;; This is the mechanism to "return" from a word interpreted by docol. ;;; We pop the control stack, and then, since this is threaded execution, we ;;; do the next thing the caller wants to do, by inlining "next". ;;; defword exit, 0 dq $ + 8 popcontrol rsi next ;;; ;;; Stack manipulation routines ;;; --------------------------- ;;; ;;; We start with the three traditional stack operations, swap drop and ;;; roll. Sorry to fans of the name "ROT"; we were an HP calculator kid. It'll ;;; always be roll to us. Anyway, we do a couple other operations too. Since ;;; our goal right now is just to bootstrap the heap, we keep this short and ;;; sweet. ;;; ;;; There is definitely plenty of optimization that could be done. ;;; defword swap, 0 dq $ + 8 pop.qreg rax pop.qreg rbx push.qreg rax push.qreg rbx next defword drop, 0 dq $ + 8 pop.qreg rax next defword drop2, 0 dq $ + 8 pop.qreg rax pop.qreg rax next ; Rotates "up" (pops its parameter, n; nth item then becomes current item). ; ; We implement this the high-performance way, with rep movsq, aka the ; instruction that exists to optimize C's memcpy(). The details of setting ; that up are complex; see below. defword roll, 0 dq $ + 8 ; Pop our parameter. The rep instruction takes rcx as its count, so we ; reduce copying by using it to hold our count, as well. pop.qreg rcx ; We have n - 1 items to slide, so decrement rcx. For the purpose of ; counting how many repetitions will happen, it's one-based. This is because ; the rep instruction performs a single movsq, then decrements rcx, then ; stops if rcx is zero. dec.qreg rcx ; Retrieve the nth item, for later. For this purpose we're thinking in ; zero-based terms, so we do this after already having decremented rcx. mov.qreg.indexed.qreg rbx, rsp, rcx, 8 ; The source address for movsq is rsi and the destination is rdi; we can ; use rdi as we wish, but rsi is our Forth "instruction pointer", so we must ; save and restore it. Doing so alters rsp, so we have to adjust the address ; calculations by eight bytes as compared to the expressions above, but ; happily we can use the disp8 field to do that. We'd be using disp8 anyway ; because it's helpful. push.qreg rsi ; Now we set up parameters for the memory-sliding operation. We have ; n - 1 items to copy, moving the range rsp through rsp + (n-2)*8 onto the ; range rsp + 8 through rsp + (n-1)*8. That's with the value of rsp as it ; exists at this moment (it's going to change soon). ; ; We're sliding them upwards in memory, so we start at the high end so ; that we're always moving into a location that doesn't have anything ; precious. We use lea as a convenient way to do the stack math. ; ; When rcx is 1, we want rsp + 8. lea.qreg.indexed.qreg rsi, rsp, rcx, 8 ; When rcx is 1, we want rsp + 16. lea.qreg.disp8.indexed.qreg rdi, 8, rsp, rcx, 8 ; ; Using rcx = 1 is the most convenient example to use for figuring out the ; arithmetic. It's a linear relationship, so as long as we get the 8-byte ; stride correct, we just need to pick a single point and verify that our ; math is right for that point, and it'll be right for any value of rcx. ; Another of our Forth conventions is that the DF flag should be kept at ; zero, which directs string instruction to increment rsi. Here, however, ; because our source and destination ranges overlap, we need to start at the ; high end, which means we need it to decrement. So we set DF to one, and ; we'll clear it after. std rep movsq ; Set everything back. cld pop.qreg rsi ; There is now an extra item at the low end of the stack (the top) that ; needs to go away, and coincidentally we have a value in rbx that needs to ; be in that spot. Rather than doing a drop and push, we overwrite it, to ; save a little work. mov.indirect.qreg.qreg rsp, rbx ; All done, wow! What a mouthful. next ; Rotates "down" (pops its parameter, n; current item then becomes nth item). ; ; We implement this the high-performance way, with rep movsq, aka the ; instruction that exists to optimize C's memcpy(). The details of setting ; that up are complex; see below. defword unroll, 0 dq $ + 8 ; Pop our parameter. The rep instruction takes rcx as its count, so we ; reduce copying by using it to hold our count, as well. pop.qreg rcx ; We have n - 1 items to slide, so decrement rcx. Also, save a copy of it in ; rdx after doing that, for later. dec.qreg rcx mov.qreg.qreg rdx, rcx ; Retrieve the 0th item, for later. mov.qreg.indirect.qreg rbx, rsp ; Now we set up parameters for the memory-sliding operation. We have ; n - 1 items to copy, moving the range rsp + 8 through rsp + (n-1)*8 onto ; the range rsp through rsp + (n-2)*8. That's with the value of rsp as it ; exists at this moment (it's going to change soon). ; ; We're sliding them downwards in memory, so we start at the low end so ; that we're always moving into a location that doesn't have anything ; precious. We use lea as a convenient way to do the stack math. ; ; As with roll, we need to save rsi and adjust those rsp calculations ; accordingly. push.qreg rsi ; Regardless of rcx, we want rsp + 16. lea.qreg.disp8.qreg rsi, 16, rsp ; Regardless of rcx, we want rsp + 8. lea.qreg.disp8.qreg rdi, 8, rsp ; With roll, we were starting at the high end. Here, we start at the low ; end, which means we need rsi to increment after each repetition. That's ; what it does when the DF flag is clear, and another of our Forth ; conventions is to keep it clear normally. So, we don't have to touch DF! ; Yay! rep movsq ; Restore our original rsi. pop.qreg rsi ; There is now an extra item in the middle of the stack, at the high end of ; the sliding we did, that needs to be overwritten with our value in rbx. ; Since we destructively updated our count in rcx, we saved a copy of the ; count in rdx, and we use that to find the right address. ; ; When the original count was n, we want rsp + (n-1)*8, so we saved rdx ; after decrementing rcx, above. mov.indexed.qreg.qreg rsp, rdx, 8, rbx ; All done, wow! What a mouthful. next ; Rotates "up" (third item becomes current item) defword roll3, 0 dq $ + 8 pop.qreg rax pop.qreg rbx pop.qreg rcx push.qreg rbx push.qreg rax push.qreg rcx next ; Rotates "down" (current item becomes third item) defword unroll3, 0 dq $ + 8 pop.qreg rax pop.qreg rbx pop.qreg rcx push.qreg rax push.qreg rcx push.qreg rbx next defword dup, 0 dq $ + 8 pop.qreg rax push.qreg rax push.qreg rax next defword dup2, 0 dq $ + 8 pop.qreg rax pop.qreg rbx push.qreg rbx push.qreg rax push.qreg rbx push.qreg rax next ;;; ;;; Arithmetic routines ;;; ------------------- ;;; ;;; No surprises here. Again, since our goal is to bootstrap the heap, we ;;; keep it short. Also again, this is nowhere near optimal. ;;; defword add, 0 dq $ + 8 pop.qreg rbx pop.qreg rax add.qreg.qreg rax, rbx push.qreg rax next defword sub, 0 dq $ + 8 pop.qreg rbx pop.qreg rax sub.qreg.qreg rax, rbx push.qreg rax next defword mul, 0 dq $ + 8 pop.qreg rax pop.qreg rbx mul.rax.qreg rbx push.qreg rax next defword divmod, 0 dq $ + 8 xor.qreg.qreg rdx, rdx ; rdx is the high bits of the input; zero it pop.qreg rbx pop.qreg rax div.rdxrax.qreg rbx push.qreg rdx ; remainder push.qreg rax ; quotient next ;;; ;;; Comparison routines ;;; ------------------- ;;; defword eq, 0 dq $ + 8 pop.qreg rbx pop.qreg rax cmp.qreg.qreg rax, rbx set.breg.cc al, equal and.qreg.bimm rax, 0x01 push.qreg rax next defword ne, 0 dq $ + 8 pop.qreg rbx pop.qreg rax cmp.qreg.qreg rax, rbx set.breg.cc al, not_equal and.qreg.bimm rax, 0x01 push.qreg rax next defword gt, 0 dq $ + 8 pop.qreg rbx pop.qreg rax cmp.qreg.qreg rax, rbx set.breg.cc al, greater and.qreg.bimm rax, 0x01 push.qreg rax next ; Is the top of the stack less than the second item in the stack? defword lt, 0 dq $ + 8 pop.qreg rbx pop.qreg rax cmp.qreg.qreg rax, rbx set.breg.cc al, less and.qreg.bimm rax, 0x01 push.qreg rax next defword ge, 0 dq $ + 8 pop.qreg rbx pop.qreg rax set.breg.cc al, greater_equal cmp.qreg.qreg rax, rbx push.qreg rax next defword le, 0 dq $ + 8 pop.qreg rbx pop.qreg rax set.breg.cc al, less_equal cmp.qreg.qreg rax, rbx push.qreg rax next ;;; ;;; Bitwise routines ;;; ---------------- ;;; defword and, 0 dq $ + 8 pop.qreg rbx pop.qreg rax and.qreg.qreg rax, rbx push.qreg rax next defword or, 0 dq $ + 8 pop.qreg rbx pop.qreg rax or.qreg.qreg rax, rbx push.qreg rax next defword xor, 0 dq $ + 8 pop.qreg rbx pop.qreg rax xor.qreg.qreg rax, rbx push.qreg rax next ; The HP overloads the name "not", so we follow the Forth convention. defword invert, 0 dq $ + 8 pop.qreg rax not.qreg rax push.qreg rax next ;;; ;;; Routine lit ;;; ------------ ;;; defword lit, 0 dq $ + 8 lodsq push.qreg rax next defword litstring, 0 dq $ + 8 ; The string immediately follows the codeword in memory, so rsi is already ; pointing to it. That address will be our returned result, so we push it to ; the stack. push.qreg rsi ; Now we need to skip over the string, so that rsi will be valid for the ; next Forth word. To do that, we're going to do a string operation with ; scasb, which takes rdi as the address to look at. This means scasb is ; treating its operand as analogous to the destination operand of movs*. mov.qreg.qreg rdi, rsi ; We want to compare for equality with zero; scasb looks in al for the other ; half of the comparison, so we clear rax. xor.qreg.qreg rax, rax ; Counterintuitively, we do need to pass a count. We pass -1, which will ; always work. xor.qreg.qreg rcx, rcx not.qreg rcx ; The DF flag is zero per our Forth execution-model convention, which means ; scasb increments rdi after each iteration. This is what we want, so ; there's no need to mess with it. ; ; We also need to worry about ZF, but fortunately that "not" will make sure ; it's clear. ; ; So, we're ready; do it! repnz scasb ; The scasb completes, incrementing rdi, before the repnz checks its ; condition. So, rdi is now pointing immediately after the terminating null ; byte. Of course, we want this in rsi for Forth's purposes. mov.qreg.qreg rsi, rdi ; Finally, we need to align rsi to the next word boundary. add.qreg.bimm rsi, 7 and.qreg.bimm rsi, NOT 7 next ;;; ;;; Memory access routines ;;; ---------------------- ;;; ;;; We go with Forth names for this stuff. The HP's names for memory and ;;; storage operations heavily leverage the fact they have an object system ;;; with type tags and so on; we want to stay close to the bytes. ;;; ; Address on the top of the stack, value in the second position defword store, 0 dq $ + 8 pop.qreg rbx pop.qreg rax mov.indirect.qreg.qreg rbx, rax next defword fetch, 0 dq $ + 8 pop.qreg rax mov.qreg.indirect.qreg rax, rax push.qreg rax next ; Address on top, value second ; I might have done it the other way, but this is what Jonesforth does and it ; seems reasonable enough. defword addstore, 0 dq $ + 8 pop.qreg rbx pop.qreg rax add.indirect.qreg.qreg rbx, rax next defword substore, 0 dq $ + 8 pop.qreg rbx pop.qreg rax sub.indirect.qreg.qreg rbx, rax next defword store8, 0 dq $ + 8 pop.qreg rbx pop.qreg rax mov.indirect.qreg.breg rbx, al next defword fetch8, 0 dq $ + 8 pop.qreg rbx xor.qreg.qreg rax, rax mov.breg.indirect.qreg al, rbx next defword store16, 0 dq $ + 8 pop.qreg rbx pop.qreg rax mov.indirect.qreg.wreg rbx, ax next defword fetch16, 0 dq $ + 8 pop.qreg rbx xor.qreg.qreg rax, rax mov.wreg.indirect.qreg ax, rbx next defword store32, 0 dq $ + 8 pop.qreg rbx pop.qreg rax mov.indirect.qreg.dreg rbx, eax next defword fetch32, 0 dq $ + 8 pop.qreg rbx xor.qreg.qreg rax, rax mov.dreg.indirect.qreg eax, rbx next ; Copy one non-overlapping block of memory over another. For the overlapping ; logic, see roll and unroll. This always copies at byte granularity, for ease ; of implementation. ; ; Jonesforth calls this CMOVE and has a ccopy which is a single byte. POSIX, ; however, has memcpy() for non-overlapping blocks and memmove() for ; overlapping blocks. We follow the latter convention, because it feels like ; the more important distinction. ; ; Stack in: ; destination ; source ; length ; (top) defword ccopy, 0 dq $ + 8 ; We need to save and restore rsi; the other registers we can trample. mov.qreg.qreg rdx, rsi pop.qreg rcx pop.qreg rsi pop.qreg rdi ; We start from the low end, since that's easier arithmetic. So, we get to ; leave the DF flag alone. rep movsb mov.qreg.qreg rsi, rdx next ; Stack in: ; string address ; Stack out: ; string length including null byte defword stringlen, 0 dq $ + 8 pop.qreg rdi mov.qreg.qreg rbx, rdi xor.qreg.qreg rax, rax xor.qreg.qreg rcx, rcx not.qreg rcx repnz scasb sub.qreg.qreg rdi, rbx push.qreg rdi next ;;;;;;;;;;;;;;;;; ;;; Branching ;;; ;;;;;;;;;;;;;;;;; ; This takes a number of bytes, not machine words. That allows it to be used ; for putting weird things embedded in the code. ; ; The offset is relative to the start of the word the number of bytes is in, ; so, make sure to have it skip itself. defword branch, 0 dq $ + 8 add.qreg.indirect.qreg rsi, rsi next ; This should probably be 0branch, but right now the auto-label code is picky. defword zbranch, 0 dq $ + 8 pop.qreg rax test.qreg.qreg rax, rax ; Please notice the 8-bit branch to the nearby word. jmp.cc.rel.bimm zero, branch + 8 - zbranch_after_jmp zbranch_after_jmp: lodsq ; just a convenient way to skip rsi forward next ;;;;;;;;;;;;;;;;;;;;;; ;;; Output helpers ;;; ;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; These routines are for building up data structures in-memory. Sometimes ;;; they're used for structures that are meant to stay in memory; other times ;;; it's a buffer that will become output. ;;; ;;; The general pattern is that each routine takes an output address and ;;; some specific datum, and returns the output address adjusted to point ;;; after the new datum. That makes them easy to chain together. ; In: base address, value ; Out: new base address defword pack64, 0 dq docol dq swap, dup, unroll3, store, lit, 8, add dq exit defword pack32, 0 dq docol dq swap, dup, unroll3, store32, lit, 4, add dq exit defword pack16, 0 dq docol dq swap, dup, unroll3, store16, lit, 2, add dq exit defword pack8, 0 dq docol dq swap, dup, unroll3, store8, lit, 1, add dq exit ; This works on C-style strings, which are characters followed by a null ; terminator. The packed data includes the null terminator. ; ; Stack in: ; base address, string pointer ; Stack out: ; new base address defword packstring, 0 dq docol dq dup, stringlen, dup ; base/destination, source, length, length dq lit, 4, roll, dup, lit, 5, unroll ; destination, source, length, length, base/destination dq add, lit, 4, unroll ; new base, destination, source, length dq ccopy dq exit ; Stack in: ; base address ; byte size ; Stack out: ; new base address defword packalign, 0 dq docol dq dup2, divmod, drop, zbranch, 8*8 dq swap, lit, 0, pack8, swap dq branch, -11*8 dq drop, exit ; In the interests of reducing our executable's size, since a lot of it goes ; to pack* invocations, we define words that combine lit with pack*. This ; shaves roughly 700 bytes as of when it was added. defword litpack64, 0 dq $ + 8 lodsq push.qreg rax beforenext pack64 defword litpack32, 0 dq $ + 8 lodsq push.qreg rax beforenext pack32 defword litpack16, 0 dq $ + 8 lodsq push.qreg rax beforenext pack16 defword litpack8, 0 dq $ + 8 lodsq push.qreg rax beforenext pack8 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Assembly language, but in Forth ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; It's all backwards and stuff. ;;; ;;; Okay, but seriously, the convention is: target on the top of the stack, ;;; source behind it. This is similar to how the Forth "store" and "fetch" ;;; words work. ;;; ;;; These routines use the output helpers, defined above. They're called in ;;; the same way: an output address, followed by data items specific to what's ;;; being output. ; We define a bunch of keywords, which evaluate to their own codeword ; addresses. defword rax, 0 dq docol, lit, rax, exit defword rcx, 0 dq docol, lit, rcx, exit defword rdx, 0 dq docol, lit, rdx, exit defword rbx, 0 dq docol, lit, rbx, exit defword rsp, 0 dq docol, lit, rsp, exit defword rbp, 0 dq docol, lit, rbp, exit defword rsi, 0 dq docol, lit, rsi, exit defword rdi, 0 dq docol, lit, rdi, exit defword r8, 0 dq docol, lit, r8, exit defword r9, 0 dq docol, lit, r9, exit defword r10, 0 dq docol, lit, r10, exit defword r11, 0 dq docol, lit, r11, exit defword r12, 0 dq docol, lit, r12, exit defword r13, 0 dq docol, lit, r13, exit defword r14, 0 dq docol, lit, r14, exit defword r15, 0 dq docol, lit, r15, exit defword eax, 0 dq docol, lit, rax, exit defword ecx, 0 dq docol, lit, rcx, exit defword edx, 0 dq docol, lit, rdx, exit defword ebx, 0 dq docol, lit, rbx, exit defword esp, 0 dq docol, lit, rsp, exit defword ebp, 0 dq docol, lit, rbp, exit defword esi, 0 dq docol, lit, rsi, exit defword edi, 0 dq docol, lit, rdi, exit defword ax, 0 dq docol, lit, rax, exit defword cx, 0 dq docol, lit, rcx, exit defword dx, 0 dq docol, lit, rdx, exit defword bx, 0 dq docol, lit, rbx, exit defword sp, 0 dq docol, lit, rsp, exit defword bp, 0 dq docol, lit, rbp, exit defword si, 0 dq docol, lit, rsi, exit defword di, 0 dq docol, lit, rdi, exit defword al, 0 dq docol, lit, rax, exit defword cl, 0 dq docol, lit, rcx, exit defword dl, 0 dq docol, lit, rdx, exit defword bl, 0 dq docol, lit, rbx, exit defword ah, 0 dq docol, lit, rsp, exit defword ch, 0 dq docol, lit, rbp, exit defword dh, 0 dq docol, lit, rsi, exit defword bh, 0 dq docol, lit, rdi, exit ; Stack in: ; register name ; Stack out: ; 3-bit encoded value for register defword reg64, 0 dq docol dq dup, rax, eq, zbranch, 5*8, drop, lit, 0, exit dq dup, rcx, eq, zbranch, 5*8, drop, lit, 1, exit dq dup, rdx, eq, zbranch, 5*8, drop, lit, 2, exit dq dup, rbx, eq, zbranch, 5*8, drop, lit, 3, exit dq dup, rsp, eq, zbranch, 5*8, drop, lit, 4, exit dq dup, rbp, eq, zbranch, 5*8, drop, lit, 5, exit dq dup, rsi, eq, zbranch, 5*8, drop, lit, 6, exit dq dup, rdi, eq, zbranch, 5*8, drop, lit, 7, exit dq litstring, "Parameter to reg64 is not a reg64.", emitstring dq lit, 1, sys_exit ; Stack in: ; register name ; Stack out: ; 3-bit encoded value for register defword extrareg64, 0 dq docol dq dup, r8, eq, zbranch, 5*8, drop, lit, 0, exit dq dup, r9, eq, zbranch, 5*8, drop, lit, 1, exit dq dup, r10, eq, zbranch, 5*8, drop, lit, 2, exit dq dup, r11, eq, zbranch, 5*8, drop, lit, 3, exit dq dup, r12, eq, zbranch, 5*8, drop, lit, 4, exit dq dup, r13, eq, zbranch, 5*8, drop, lit, 5, exit dq dup, r14, eq, zbranch, 5*8, drop, lit, 6, exit dq dup, r15, eq, zbranch, 5*8, drop, lit, 7, exit dq litstring, "Parameter to extrareg64 is not an extrareg64.", emitstring dq lit, 1, sys_exit ; Stack in: ; register name ; Stack out: ; 3-bit encoded value for register defword reg32, 0 dq docol dq dup, eax, eq, zbranch, 5*8, drop, lit, 0, exit dq dup, ecx, eq, zbranch, 5*8, drop, lit, 1, exit dq dup, edx, eq, zbranch, 5*8, drop, lit, 2, exit dq dup, ebx, eq, zbranch, 5*8, drop, lit, 3, exit dq dup, esp, eq, zbranch, 5*8, drop, lit, 4, exit dq dup, ebp, eq, zbranch, 5*8, drop, lit, 5, exit dq dup, esi, eq, zbranch, 5*8, drop, lit, 6, exit dq dup, edi, eq, zbranch, 5*8, drop, lit, 7, exit dq litstring, "Parameter to reg32 is not a reg32.", emitstring dq lit, 1, sys_exit ; Stack in: ; register name ; Stack out: ; 3-bit encoded value for register defword reg16, 0 dq docol dq dup, ax, eq, zbranch, 5*8, drop, lit, 0, exit dq dup, cx, eq, zbranch, 5*8, drop, lit, 1, exit dq dup, dx, eq, zbranch, 5*8, drop, lit, 2, exit dq dup, bx, eq, zbranch, 5*8, drop, lit, 3, exit dq dup, sp, eq, zbranch, 5*8, drop, lit, 4, exit dq dup, bp, eq, zbranch, 5*8, drop, lit, 5, exit dq dup, si, eq, zbranch, 5*8, drop, lit, 6, exit dq dup, di, eq, zbranch, 5*8, drop, lit, 7, exit dq litstring, "Parameter to reg16 is not a reg16.", emitstring dq lit, 1, sys_exit ; Stack in: ; register name ; Stack out: ; 3-bit encoded value for register defword reg8, 0 dq dup, al, eq, zbranch, 5*8, drop, lit, 0, exit dq dup, cl, eq, zbranch, 5*8, drop, lit, 1, exit dq dup, dl, eq, zbranch, 5*8, drop, lit, 2, exit dq dup, bl, eq, zbranch, 5*8, drop, lit, 3, exit dq dup, ah, eq, zbranch, 5*8, drop, lit, 4, exit dq dup, ch, eq, zbranch, 5*8, drop, lit, 5, exit dq dup, dh, eq, zbranch, 5*8, drop, lit, 6, exit dq dup, bh, eq, zbranch, 5*8, drop, lit, 7, exit dq litstring, "Parameter to reg8 is not a reg8.", emitstring dq lit, 1, sys_exit ; Stack in: ; scale factor, as a count of bytes ; Stack out: ; 2-bit encoded value for scale field in SIB byte defword scalefield, 0 dq docol dq dup, 1, eq, zbranch, 5*8, drop, lit, 0, exit dq dup, 2, eq, zbranch, 5*8, drop, lit, 1, exit dq dup, 4, eq, zbranch, 5*8, drop, lit, 2, exit dq dup, 8, eq, zbranch, 5*8, drop, lit, 3, exit dq litstring, "Parameter to scalefield is not 1, 2, 4, or 8.", emitstring dq lit, 1, sys_exit ; Stack: ; output point defword rex_w, 0 dq docol, lit, 0x48, pack8, exit defword rex_wb, 0 dq docol, lit, 0x49, pack8, exit ; Stack: ; output point ; 3-bit encoded value for register ; opcode byte defword opcodereg, 0 dq docol, or, pack8, exit ; The low-level word that outputs a modrm byte given fully-processed, ; numeric values for its fields. Most code will want to call one of the ; higher-level modrm_* words, instead. ; ; Stack ; output point ; mode ("mod") field ; register/opcode field ; register/memory ("RM") field defword modrm, 0 dq docol, swap, lit, 8, mul, or, swap, lit, 64, mul, or, pack8, exit ; Stack ; output point ; scale field ; index field ; base field defword sib, 0 dq docol, swap, lit, 8, mul, or, swap, lit, 64, mul, or, pack8, exit ; The simplest of the modrm modes: Direct register addressing. There are no ; special cases to check. ; ; It's important to notice that the R/M field may describe either a source, ; or a target, depending on what the instruction is. So, this helper doesn't ; get to know that. It also doesn't get to know whether the value in the ; reg/op field describes a register, or if instead it's an extension of the ; opcode. The caller is responsible for figuring that all out. ; ; Stack: ; output point ; reg/op field value (raw number) ; reg/mem field register name defword addressing_reg64, 0 dq docol, reg64, lit, 3, unroll3, modrm, exit ; This is a helper for assembly instructions that want to do a form of ; addressing that requires a value of 1 in the modrm byte's mode field, and ; do not want to do any indexing. ; ; Its main responsibility is to deal with the scenario that requires an SIB ; byte, which happens when the R/M field has a value of 4, which would ; otherwise refer to the register rsp. In that situation, it also generates ; an SIB byte which indicates a scale of 1, no indexing, and rsp as the base ; register. ; ; Stack: ; output point ; reg/op field value (raw number) ; reg/mem field register name defword addressing_indirect_reg64, 0 dq docol ; Exit with an error if the R/M register is rbp. dq dup, rbp, ne, zbranch, 23*8 ; Check whether the R/M register is rsp; save the test result for later. dq dup, rsp, eq, lit, 4, unroll dq reg64, lit, 0, unroll3, modrm ; If the R/M register was rsp, we need an SIB byte; otherwise, skip it. dq swap, zbranch, 8*8, lit, 0, lit, 4, rsp, reg64, sib dq exit dq litstring, "R/M parameter to addressing_indirect_reg64 is rbp." dq emitstring dq lit, 1, sys_exit ; Stack: ; output point ; reg/op field value (raw number) ; reg/mem field register name ; displacement value defword addressing_disp8_reg64, 0 dq docol ; This mode can do rbp fine, so no need to check for that. ; Check whether the R/M register is rsp; save the test result for later. dq swap, dup, rsp, eq, lit, 5, unroll, swap ; Stash the displacement value out of the way, too. dq lit, 4, unroll dq reg64, lit, 1, unroll3, modrm ; If the R/M register was rsp, we need an SIB byte; otherwise, skip it. dq roll3, zbranch, 8*8, lit, 0, lit, 4, rsp, reg64, sib ; The displacement byte. dq swap, pack8 dq exit ; Stack: ; output point ; reg/op field value (raw number) ; scale factor, as a count of bytes ; index register name ; base field register name defword addressing_indexed_reg64, 0 dq docol ; Exit with an error if the base register is rbp. dq dup, rbp, ne, zbranch, 17*8 ; Reg/mem value 4 means to use an SIB byte (at least, with this mode). dq lit, 5, roll, lit, 0, lit, 6, roll, lit, 4, modrm, lit, 4, unroll dq reg64, unroll3, reg64, unroll3, scalefield, unroll3, sib dq exit dq litstring, "Base parameter to addressing_indexed_reg64 is rbp." dq emitstring dq lit, 1, sys_exit ; Stack: ; output point ; reg/op field value (raw number) ; scale factor, as a count of bytes ; index register name ; base field register name ; displacement value defword addressing_disp8_indexed_reg64, 0 dq docol ; This mode can do rbp fine, so no need to check for that. ; Reg/mem value 4 means to use an SIB byte (at least, with this mode). dq lit, 6, roll, lit, 1, lit, 7, roll, lit, 4, modrm, lit, 5, unroll dq lit, 5, unroll, reg64, unroll3, reg64, unroll3, scalefield, unroll3, sib dq swap, pack8 dq exit ; Stack: ; output point defword cld, 0 dq docol, lit, 0xFC, pack8, exit ; Stack: ; output point defword std, 0 dq docol, lit, 0xFD, pack8, exit ; Stack: ; output point ; immediate value ; register name defword mov_reg64_imm64, 0 dq docol dq roll3, rex_w, swap, reg64, lit, 0xB8, opcodereg, swap, pack64 dq exit defword mov_extrareg64_imm64, 0 dq docol dq roll3, rex_wb, swap, extrareg64, lit, 0xB8, opcodereg, swap, pack64 dq exit ; Stack: ; output point ; source register name ; target register name defword mov_reg64_reg64, 0 dq docol dq roll3, rex_w, lit, 0x89, pack8, unroll3 dq swap, reg64, swap, addressing_reg64 dq exit ; Stack: ; output point ; source register name ; target register name defword mov_indirect_reg64_reg64, 0 dq docol dq roll3, rex_w, lit, 0x89, pack8, unroll3 dq swap, reg64, swap, addressing_indirect_reg64 dq exit ; Stack: ; output point ; source register name ; target register name ; target displacement value defword mov_disp8_reg64_reg64, 0 dq docol dq lit, 4, roll, rex_w, lit, 0x89, pack8, lit, 4, unroll dq roll3, reg64, unroll3, addressing_disp8_reg64 dq exit ; Stack: ; output point ; source register name ; target register name defword mov_reg64_indirect_reg64, 0 dq docol dq roll3, rex_w, lit, 0x8B, pack8, unroll3 dq reg64, swap, addressing_indirect_reg64 dq exit ; Stack: ; output point ; source register name ; source displacement value ; target register name defword mov_reg64_disp8_reg64, 0 dq docol dq lit, 4, roll, rex_w, lit, 0x8B, pack8, lit, 4, unroll dq reg64, unroll3, addressing_disp8_reg64 dq exit ; Stack: ; output point ; source register name ; source index register name ; source index scale factor, as a count of bytes ; target register name defword mov_reg64_indexed_reg64, 0 dq docol dq lit, 5, roll, rex_w, lit, 0x8B, pack8, lit, 5, unroll dq reg64, lit, 4, unroll, unroll3, swap, addressing_indexed_reg64 dq exit ; Stack: ; output point ; source register name ; target register name defword mov_indirect_reg64_reg32, 0 dq docol dq roll3, lit, 0x89, pack8, unroll3 dq swap, reg32, swap, addressing_indirect_reg64 dq exit ; Stack: ; output point ; source register name ; target register name ; target displacement value defword mov_disp8_reg64_reg32, 0 dq docol dq lit, 4, roll, lit, 0x89, pack8, lit, 4, unroll dq roll3, reg32, unroll3, addressing_disp8_reg64 dq exit ; Stack: ; output point ; source register name ; target register name defword mov_reg32_indirect_reg64, 0 dq docol dq roll3, lit, 0x8B, pack8, unroll3 dq reg32, swap, addressing_indirect_reg64 dq exit ; Stack: ; output point ; source register name ; source displacement value ; target register name defword mov_reg32_disp8_reg64, 0 dq docol dq lit, 4, roll, lit, 0x8B, pack8, lit, 4, unroll dq reg32, unroll3, addressing_disp8_reg64 dq exit ; Stack: ; output point ; source register name ; target register name defword mov_indirect_reg64_reg16, 0 dq docol dq roll3, lit, 0x66, pack8, lit, 0x89, pack8, unroll3 dq swap, reg16, swap, addressing_indirect_reg64 dq exit ; Stack: ; output point ; source register name ; target register name ; target displacement value defword mov_disp8_reg64_reg16, 0 dq docol dq lit, 4, roll, lit, 0x66, pack8, lit, 0x89, pack8, lit, 4, unroll dq roll3, reg16, unroll3, addressing_disp8_reg64 dq exit ; Stack: ; output point ; source register name ; target register name defword mov_reg16_indirect_reg64, 0 dq docol dq roll3, lit, 0x66, pack8, lit, 0x8B, pack8, unroll3 dq reg16, swap, addressing_indirect_reg64 dq exit ; Stack: ; output point ; source register name ; source displacement value ; target register name defword mov_reg16_disp8_reg64, 0 dq docol dq lit, 4, roll, lit, 0x66, pack8, lit, 0x8B, pack8, lit, 4, unroll dq reg16, unroll3, addressing_disp8_reg64 dq exit ; Stack: ; output point ; source register name ; target register name defword mov_indirect_reg64_reg8, 0 dq docol dq roll3, lit, 0x88, pack8, unroll3 dq swap, reg8, swap, addressing_indirect_reg64 dq exit ; Stack: ; output point ; source register name ; target register name ; target displacement value defword mov_disp8_reg64_reg8, 0 dq docol dq lit, 4, roll, lit, 0x88, pack8, lit, 4, unroll dq roll3, reg8, unroll3, addressing_disp8_reg64 dq exit ; Stack: ; output point ; source register name ; target register name defword mov_reg8_indirect_reg64, 0 dq docol dq roll3, pack8, lit, 0x8A, pack8, unroll3 dq reg8, swap, addressing_indirect_reg64 dq exit ; Stack: ; output point ; source register name ; source displacement value ; target register name defword mov_reg8_disp8_reg64, 0 dq docol dq lit, 4, roll, pack8, lit, 0x8A, pack8, lit, 4, unroll dq reg8, unroll3, addressing_disp8_reg64 dq exit ; Stack: ; output point ; source register name ; source displacement value ; target register name defword lea_reg64_disp8_reg64, 0 dq docol dq lit, 4, roll, rex_w, lit, 0x8D, pack8, lit, 4, unroll dq reg64, unroll3, addressing_disp8_reg64 dq exit ; Stack: ; output point ; source register name ; source index register name ; source index scale factor, as a count of bytes ; target register name defword lea_reg64_indexed_reg64, 0 dq docol dq lit, 5, roll, rex_w, lit, 0x8D, pack8, lit, 5, unroll dq reg64, lit, 4, unroll, unroll3, swap, addressing_indexed_reg64 dq exit ; Stack: ; output point ; source register name ; source index register name ; source index scale factor, as a count of bytes ; source displacement value ; target register name defword lea_reg64_disp8_indexed_reg64, 0 dq docol dq lit, 6, roll, rex_w, lit, 0x8D, pack8, lit, 6, unroll dq reg64, lit, 5, unroll, lit, 3, roll, lit, 4, roll, lit, 3, roll dq addressing_disp8_indexed_reg64 dq exit ; Stack: ; output point ; source register name defword push_reg64, 0 dq docol, reg64, lit, 0x50, opcodereg, exit ; Stack: ; output point ; target register name defword pop_reg64, 0 dq docol, reg64, lit, 0x58, opcodereg, exit ; Stack: ; output point defword lodsq, 0 dq docol, rex_w, lit, 0xAD, pack8, exit ; Stack: ; output point ; source register name ; target register name defword add_reg64_reg64, 0 dq docol dq roll3, rex_w, lit, 0x01, pack8, unroll3 dq swap, reg64, swap, addressing_reg64 dq exit ; Stack: ; output point ; source register name ; target register name defword add_indirect_reg64_reg64, 0 dq docol dq roll3, rex_w, lit, 0x01, pack8, unroll3 dq swap, reg64, swap, addressing_indirect_reg64 dq exit ; Stack: ; output point ; source register name ; target register name defword add_reg64_indirect_reg64, 0 dq docol dq roll3, rex_w, lit, 0x03, pack8, unroll3 dq reg64, swap, addressing_indirect_reg64 dq exit ; Stack: ; output point ; source value ; target register name defword add_reg64_imm8, 0 dq docol dq roll3, rex_w, lit, 0x83, pack8, swap, lit, 0, swap, addressing_reg64 dq swap, pack8 dq exit ; Stack: ; output point ; source register name ; target register name defword sub_reg64_reg64, 0 dq docol dq roll3, rex_w, lit, 0x2B, pack8, unroll3 dq swap, reg64, swap, addressing_reg64 ; Stack: ; output point ; source register name ; target register name defword sub_indirect_reg64_reg64, 0 dq docol dq roll3, rex_w, lit, 0x2B, pack8, unroll3 dq swap, reg64, swap, addressing_indirect_reg64 ; Stack: ; output point ; source register name ; target register name defword mul_reg64_reg64, 0 dq docol dq roll3, rex_w, lit, 0xF7, pack8, unroll3 dq swap, reg64, swap, addressing_reg64 dq exit ; The dividend is 128 bits, and is formed from rdx as the high half and rax ; as the low half. The divisor is a specified register. The quotient is ; returned in rax, truncated towards zero. The remainder is in rdx. This ; entire process is unsigned. ; ; The official mnemonic for this is "div", but divmod is what it does. ; ; Stack: ; output point ; divisor register name defword divmod_reg64, 0 dq docol dq swap, rex_w, lit, 0xF7, pack8, swap dq lit, 6, swap, addressing_reg64 ; Same as divmod, but signed. ; ; Stack: ; output point ; divisor register name defword idivmod_reg64, 0 dq docol dq swap, rex_w, lit, 0xF7, pack8, swap dq lit, 7, swap, addressing_reg64 ; Stack: ; output point ; target register name defword inc_reg64, 0 dq docol dq swap, rex_w, lit, 0xFF, pack8, swap, lit, 0, swap, addressing_reg64 dq exit ; Stack: ; output point ; target register name defword dec_reg64, 0 dq docol dq swap, rex_w, lit, 0xFF, pack8, swap, lit, 1, swap, addressing_reg64 dq exit ; Stack: ; output point ; source register name ; target register name defword and_reg64_reg64, 0 dq docol dq roll3, rex_w, lit, 0x23, pack8, unroll3 dq reg64, swap, addressing_reg64 dq exit ; Stack: ; output point ; source value ; target register name defword and_reg64_imm8, 0 dq docol dq roll3, rex_w, lit, 0x83, pack8, swap dq lit, 4, swap, addressing_reg64 dq swap, pack8 dq exit ; Stack: ; output point ; source register name ; target register name defword or_reg64_reg64, 0 dq docol dq roll3, rex_w, lit, 0x0B, pack8, unroll3 dq reg64, swap, addressing_reg64 dq exit ; Stack: ; output point ; source value ; target register name defword or_reg64_imm8, 0 dq docol dq roll3, rex_w, lit, 0x83, pack8, swap dq lit, 1, swap, addressing_reg64 dq swap, pack8 dq exit ; Stack: ; output point ; source register name ; target register name defword xor_reg64_reg64, 0 dq docol dq roll3, rex_w, lit, 0x33, pack8, unroll3 dq reg64, swap, addressing_reg64 dq exit ; Stack: ; output point ; target register name defword not_reg64, 0 dq docol dq swap, rex_w, lit, 0xF7, pack8 dq swap, lit, 2, swap, addressing_reg64 dq exit ; Stack: ; output point ; left register name ; right register name defword cmp_reg64_reg64, 0 dq docol dq roll3, rex_w, 0x3B, pack8, unroll3 dq swap, reg64, swap, addressing_reg64 dq exit ; Stack: ; output point ; left register name ; right register name defword test_reg64_reg64, 0 dq docol dq roll3, rex_w, 0x85, pack8, unroll3 dq swap, reg64, swap, addressing_reg64 dq exit ; Stack: ; output point ; register name defword jmp_abs_indirect_reg64, 0 dq docol dq swap, lit, 0xFF, pack8, swap dq lit, 4, swap, addressing_indirect_reg64 dq exit ; Stack: ; output point defword syscall, 0 dq docol, lit, 0x0F, pack8, lit, 0x05, pack8, exit ; TODO rep movsb ; TODO rep movsq ; TODO repnz scasb ; TODO set_reg8_cc ; (and condition codes) ; TODO jmp_cc_rel_imm8 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Runtime word definition ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; We need to be able to use Forth code that defines other Forth code; it ;;; will be the easiest way to create our Forth-flavored assembly languge. To ;;; do that, there's a critical bootstrapping problem: The variables that we ;;; allocate on the heap in _start must be used by the Forth words that define ;;; other Forth words (and, in general, they are needed for anything that ;;; does memory management). Since the heap's address isn't known until ;;; runtime, we can't use build-time labels to locate the heap or the ;;; variables on it, in the way we've done for everything up to now. ;;; ;;; In fact, it's worse than that: nothing we define at build time can ;;; statically reference anything on the heap; it must always be given the ;;; heap's address as a parameter. That also means statically defined words ;;; can't reference runtime words, at all, except via special code that goes ;;; outside the normal Forth execution model. ;;; ;;; So, what we'll want to do to deal with that is copy ourselves onto the ;;; heap, and run the rest of the program from there. It would be okay in ;;; principle to keep a few things such as docol in the static code segment, ;;; as long as they don't need to reference the heap, though it might in some ;;; sense be more elegant to entirely abandon the code segment and run ;;; heap-only. At any rate, to keep the set of stuff we have to copy small, ;;; we'll want to make the change-over as soon as possible. ;;; ;;; Following a common Forth practice, we implement variables as words that ;;; push the corresponding addresses onto the stack. Since the variables that ;;; define the heap are ON the heap, we will definitely need two distinct ;;; versions of these; having the two versions be almost-compatible allows us ;;; to minimize code duplication. Since they do have slightly different ;;; interfaces, we prefix the names of the early ones with "early_". ; Stack in: ; heap address ; Stack out: ; heap address ; requested variable address defword early_heap, 0 dq docol, dup, lit, control_stack_size, add, exit defword early_s0, 0 dq docol, early_heap, lit, 8, add, exit defword early_r0, 0 dq docol, early_heap, lit, 16, add, exit defword early_latest, 0 dq docol, early_heap, lit, 24, add, exit defword early_here, 0 dq docol, early_heap, lit, 32, add, exit ; Allocate space by incrementing "here", and output a word header in it. ; Also add it to the "latest" linked list. Use zero as the flag values; ; callers that want something else can do that themselves. ; ; We'll use the pack* words to do this, the way they update the pointer is ; convenient. ; ; See "Execution model", above, for reminders of the details of this format. ; Create is responsible for everything up to the codeword, not including it. ; ; This is surprisingly painful to implement, in terms of stack juggling, but ; then there's a sense in which it's the core trick to bootstrapping the heap. ; ; Stack in: ; heap address ; name string ; Stack out: ; heap address defword early_create, 0 dq docol ; Retrieve the values of "here" and "latest". dq swap, early_here, fetch, swap, early_latest, fetch, swap, lit, 4, unroll ; Pack the old value of "latest" as the first field of the header, linking ; from the newly-defined word to the next-newest word. dq pack64 ; Pack a null byte as the flags. dq lit, 0, pack8 ; Pack another null byte as the start terminator of the name (which is ; terminated at both ends). dq lit, 0, pack8 ; Pack the name and its end terminator. dq swap, packstring ; Alignment before the codeword. dq lit, 8, packalign ; Retrieve the value of "here" (which still doesn't reflect our additions), ; and store it at the address of "latest". It's the start of our ; newly-defined word, which makes it the latest word. dq swap, early_here, fetch, swap, early_latest, swap, unroll3, store, swap ; Retrieve the address of "here" again and store our updated value there. dq swap, early_here, swap, unroll3, store dq exit ; Stack in: ; heap address ; value to append to current word-in-progress ; Stack out: ; heap address defword early_comma, 0 dq docol ; Retrieve the value of "here" and pack the value there. dq swap, early_here, fetch, swap, unroll3, swap, pack64 ; Store the updated value of "here". dq swap, early_here, swap, unroll3, store dq exit ; Stack in: ; heap address ; Stack out: ; heap address defword early_self_codeword, 0 dq docol, early_here, fetch, lit, 8, add, early_comma, exit ; Stack in: ; heap address ; new value to overwrite "here" ; Stack out: ; heap address defword early_here_store, 0 dq docol, swap, early_here, swap, unroll3, store, exit ; Notice that we've switched over to stuff that follows the pack* idioms. ; ; This is a helper "macro" that we'll use in defining assembly words from ; Forth. This is in a sense a redefinition of it; the flatassembler version of ; it is far, far above, and has more documentation. ; ; Stack in: ; base address ; Stack out: ; new base address defword pack_next, 0 dq docol, lodsq, rax, jmp_abs_indirect_reg64, exit ; This is another helper "macro" that we'll use in defining assembly words ; from Forth. In particular, this one is used in docol. As before, see the ; flatassembler version for more explanation. ; ; Stack in: ; base address ; source register keyword ; Stack out: ; new base address defword pack_pushcontrol, 0 dq docol dq swap, rbp, lit, -8, rbp, lea_reg64_disp8_reg64, swap dq rbp, lit, 0, mov_disp8_reg64_reg64 dq exit ; This is another helper "macro" that we'll use in defining assembly words ; from Forth. In particular, this one is used in "exit". See the flatassembler ; version for more explanation. ; ; Stack in: ; base address ; target register keyword ; Stack out: ; new base address defword pack_popcontrol, 0 dq docol dq rbp, lit, 0, roll3, mov_reg64_disp8_reg64 dq rbp, lit, 8, rbp, lea_reg64_disp8_reg64 dq exit ; Now we're back to heap idioms again. ; ; Stack in: ; heap address ; address for new variable word to return ; name string ; Stack out: ; heap address defword early_variable, 0 dq docol dq swap, unroll3, early_create, early_self_codeword ; (address to return, heap address) dq early_here, fetch, swap, unroll3, swap ; (heap address, modified "here" value, address to return) dq rax, mov_reg64_imm64, rax, push_reg64 ; (heap address, modified "here" value) dq pack_next, lit, 8, packalign, early_here_store dq exit ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Now.... what was our original goal, again? ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; Finally, having fully bootstrapped our runtime environment, we move on ;;; to the core stuff we actually want to accomplish. For this quine, that's ;;; outputting itself. ;;; ;;; One of the most charming naming traditions in Forth is that the ;;; top-level word that stays running forever, is called "quit". ;;; defword quit, 0 dq docol ;;; ;;; Although we initialized rbp already, we do so again because we'll want ;;; that on subsequent visits to this word - it's the main thing it's for. ;;; Keep in mind that rsi is the actual "instruction pointer", and we're ;;; leaving it unchanged, we just get rid of everything above it. ;;; ;dq r0, control! ; overwrite rbp to reset the control stack ; TODO though the implementation of r0 is trivial, it depends on where we ; put the heap, so it can't be hardcoded, we'll have to build it in RAM. ; the same therefore goes for anything that needs to call it. so we can't ; call it here, now, yet - we have prep work to do first ;;; ;;; Do the read-eval-print-loop, which is the main body of the Forth ;;; interpreter. ;;; ;dq interpret ; run the repl dq quine ;;; ;;; If the repl ever exits, do it all again. ;;; ;dq branch, quit - $ dq lit, 0, sys_exit ;;;;;;;;;;;;;;;;;;;; ;;; System calls ;;; ;;;;;;;;;;;;;;;;;;;; ;;; ;;; The kernel preserves every register except rax, rcx, and r11. The system ;;; call number goes in rax, as does the return value. Parameters go in rdi, ;;; rsi, rdx, r10, r8, and r9, in that order. [SysV] A.2.1. ;;; ;;; Notice that rsi is our control stack, so we have to save it (for ;;; syscalls with at least two parameters). We can use the value stack to do ;;; that, since rsp is preserved. We don't save other registers because our ;;; caller should do that, if it cares. ;;; ;;; ;;; This does the Linux exit() system call, passing it an exit code taken ;;; from the stack. ;;; defword sys_exit, 0 dq $ + 8 mov.qreg.qimm rax, 60 ; syscall number pop.qreg rdi ; exit code syscall ; In the event we're still here, let's minimize confusion. hlt ;;; ;;; This does the Linux write() system call, passing it an address from the ;;; top of the stack and a length from the second position on the stack. It ;;; writes to file descriptor 1, which is stdout. ;;; ;;; For our length parameter, we can pop directly from the stack into rdx, ;;; which directly becomes the syscall parameter. For our address parameter, ;;; the syscall wants it in rsi, which we also care about, so we have to do a ;;; little juggling. ;;; defword sys_write, 0 dq $ + 8 pop.qreg rcx ; address from stack pop.qreg rdx ; length from stack, passed directly push.qreg rsi ; save rsi mov.qreg.qimm rax, 1 ; syscall number mov.qreg.qimm rdi, 1 ; file descriptor mov.qreg.qreg rsi, rcx ; pass address syscall pop.qreg rsi ; restore rsi next ;;;;;;;;;;;;;;;;;;;;;;;; ;;; I/O conveniences ;;; ;;;;;;;;;;;;;;;;;;;;;;;; defword emitstring, 0 dq docol, dup, stringlen, swap, sys_write, exit ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; (new) Implementation strategy ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; We assemble the entire file contents in a heap-allocated buffer. When ;;; the file is fully assembled, we output it. ;;; defword quine, 0 dq docol ; We still have "heap" on the stack, so we can call early_here. Add a ; constant to "here" in-place, keeping a copy of the pointer ... dq early_here, lit, 0x78, swap, addstore ; ... and now we have allocated a block of memory, with its address on the ; stack. We also still have "heap" at the bottom of the stack, for future ; use. ; We have one label, and two pieces of information about it: value and ; status. We keep them on the stack. We also have a mutable copy of the ; buffer's address. ; ; This stack layout is the interface used throughout the "assembler"; this ; quine routine uses it when calling the routines that build the various ; pieces of the output, and those pieces in turn use it when calling the ; label-handling routines. ; ; Stack: ; "heap" ; start address of allocated output ; label's value ; label's status ; address of current output point within allocated block ; (top) ; ; Status is a bit field: ; bit zero is whether it was used before being defined ; bit one is whether it's been defined ; bit two is whether the guessed value wound up equaling the actual value dq dup, lit, 0, lit, 0, roll3 ; This takes an address to write to on the stack, writes stuff there, and ; returns the next address after where it stopped Writing. dq all_contents ; The two-pass magick: ; Reset the output offset to the beginning of the ; block. dq drop, roll3, dup, lit, 4, unroll dq all_contents ; Drop the copy of the buffer's address. dq drop ; Drop the label data. dq drop, drop ; This takes a buffer's address on the stack, skips an ELF file header and ; program header based on hardcoded size, computes an offset (secretly ; hardcoded), and writes that offset into an appopriate place in the middle ; of those headers. It then returns the length of the used portion of the ; buffer. ;dq lit, 0x78, swap dq lit, file_size, swap ; write() from stack-allocated buffer dq sys_write dq exit ; Stack in: ; output memory start ; label value (guessed or actual) ; label status ; output memory current point ; Stack out: ; output memory start ; label value (guessed or actual) ; label status (potentially modified) ; output memory current point ; label value for caller to use defword use_label, 0 dq docol ; Fetch the status dq swap ; Check the bit that indicates it's been set. dq dup, lit, 2, and, zbranch, 10*8 ; If we're here, it has been set already, so just put the status back... dq swap ; Fetch the actual value... dq lit, 3, roll, dup, lit, 4, unroll ; ... and exit dq exit ; If we're here, it hasn't been set yet, so mark it used-before-set. dq lit, 1, or ; Put the status back... dq swap ; Fetch the guessed value... dq lit, 3, roll, dup, lit, 4, unroll ; ... and exit dq exit ; Stack in: ; output memory start ; label value (guessed or actual) ; label status ; output memory current point ; Stack out: ; output memory start ; label value (now set to actual) ; label status (modified) ; output memory current point defword set_label, 0 dq docol ; Compute the current offset, to use as the actual value dq dup, lit, 5, roll, dup, lit, 6, unroll, sub ; old value, status, point, new value ; Overwrite the old actual value; keep a copy dq dup, lit, 5, roll ; status, point, new value, new value, old value dq eq, swap ; status, point, equality, new value dq lit, 4, unroll ; new value, status, point, equality ; We don't need to branch. Now we mark the status as having been defined, ; and we also set bit 2 if appropriate. dq lit, 4, mul dq lit, 3, roll, or, lit, 2, or, swap dq exit defword hlt, 0 dq $ + 8 hlt ; This takes an address to write to on the stack, writes stuff there, and ; returns the next address after where it stopped Writing. ; ; It also makes use of label stuff, further back on the stack. defword all_contents, 0 dq docol dq elf_file_header, elf_program_header dq output_start_routine dq self_raw dq set_label dq exit ;;; ;;; ELF header ;;; ;;; This is the top-level ELF header, for the entire file. An ELF always has ;;; exactly one of this header, which is always at the start of the file. ;;; defword elf_file_header, 0 dq docol dq litpack32, 0x7f bappend "ELF" ; magic number dq litpack8, 2 ; 64-bit dq litpack8, 1 ; little-endian dq litpack8, 1 ; ELF header format v1 dq litpack8, 0 ; System-V ABI dq litpack64, 0 ; (padding) dq litpack16, 2 ; executable dq litpack16, 0x3e ; Intel x86-64 dq litpack32, 1 ; ELF format version ; Compute the entry pointer. dq litpack64, _start ; entry point ; The offset of _start. This includes the origin, intentionally. dq litpack64, 64 ; program header offset ; We place the program header immediately after the ELF header. This ; offset is from the start of the file. dq litpack64, 0 ; section header offset dq litpack32, 0 ; processor flags dq litpack16, 64 ; ELF header size dq litpack16, 56 ; program header entry size dq litpack16, 1 ; number of program header entries dq litpack16, 0 ; section header entry size dq litpack16, 0 ; number of section header entries dq litpack16, 0 ; section name string table index dq exit ;;; ;;; Program header ;;; ;;; An ELF program header consists of any number of these entries; they are ;;; always consecutive, but may be anywhere in the file. We always have ;;; exactly one, and it's always right after the ELF file header. ;;; defword elf_program_header, 0 dq docol dq litpack32, 1 ; "loadable" segment type dq litpack32, 0x05 ; read+execute permission dq litpack64, 0 ; offset in file dq litpack64, $$ ; virtual address ; required, but can be anything, subject to alignment dq litpack64, 0 ; physical address (ignored) ; Fill in 0 as the file size for now, to avoid unitialized memory. dq use_label, pack64 ; size in file dq use_label, pack64 ; size in memory dq litpack64, 0 ; segment alignment ; for relocation, but this doesn't apply to us dq exit defword output_start_routine, 0 dq docol dq cld dq lit, 9, rax, mov_reg64_imm64 dq lit, heap_requested_address, rdi, mov_reg64_imm64 dq lit, heap_size, rsi, mov_reg64_imm64 dq lit, 0x03, rdx, mov_reg64_imm64 dq lit, 0x22, r10, mov_extrareg64_imm64 dq lit, 0, r8, mov_extrareg64_imm64 dq lit, 0, r9, mov_extrareg64_imm64 dq syscall dq rax, rdi, mov_reg64_reg64 dq exit ; write() the machine code by using self-reference ; TODO do this in a "real" quine way defword self_raw, 0 dq docol dq dup dq lit, elf_header + 0xc4 ; source dq lit, file_size - 0xc4 ; length ; destination destination source length dq dup, lit, 4, roll, add, lit, 4, unroll ; result destination source length dq ccopy dq exit final_word_name = self_raw code_size = $ - code_start file_size = $ - $$