;;; QUINE
;;;
;;; This file is formatted to be read at 80-columns or wider.
;;;
;;; There's some tabular information, but diagrams have been avoided, in an
;;; attempt to make this manageable in screen readers. Feedback welcome.


;;;;;;;;;;;;;;;;;;;;;
;;; Workflow tips ;;;
;;;;;;;;;;;;;;;;;;;;;
;;;
;;; Currently, this is not yet fully self-hosting; it is based on
;;; flatassembler[1]. A minimal command to build and run it is:
;;;
;;; $ fasmg quine.asm quine && chmod 755 quine && ./quine; echo $?
;;;
;;; A workflow you may wish to use for debugging is:
;;;
;;; $ rm quine2; fasmg quine.asm quine && chmod 755 quine && ./quine > quine2; echo "exit code:" $?; echo; hexdump -C quine; echo; hexdump -C quine2; echo; cmp -l quine quine2 ; echo cmp: $?
;;;
;;; The reason this removes the old one first is that otherwise, there's a
;;; risk the error message will be scrolled off the top of the screen and
;;; you'll see stale output and not realize.
;;;
;;; You may also wish to do:
;;;
;;; $ objdump --disassemble quine
;;; $ ZydisDisasm -64 quine
;;;
;;; This relies on GNU binutils, and on zydis, respectively.
;;;
;;; [1] https://flatassembler.net/
;;;
;;;
;;; gdb
;;; ---
;;;
;;; You can run gdb on it if you want; there's no symbols, but if you are
;;; familiar with the hex it should be readable. Keep a hexdump of the program
;;; handy to look up what addresses are.
;;;
;;; If you want to see a routine implemented in assembly, look at the hexdump
;;; of the overall file, find it by looking at the ASCII names, skip past the
;;; codeword, and do ie
;;;
;;; (gdb) disassemble/r 0x0x80007c0,+32
;;;
;;; If you want to see the value stack, you can do
;;;
;;; (gdb) x/16xg $rsp
;;;
;;; The same will work with $rbp for the control stack, and don't forget that
;;; the "instruction pointer" is rsi. To see all the registers, do
;;;
;;; (gdb) info registers


;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Assembly language ;;;
;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;;   Before doing any actual code, we define macros for writing x86-64
;;; assembly language. This is built from scratch, relying only on
;;; flatassembler's built-in semantics. No include files of any kind are used
;;; for it.

macro pad: bytes
  if bytes > 0
    db 0x00
    pad (bytes - 1)
  end if
end macro

macro align: bytes
  if bytes > 0
    if $ mod bytes <> 0
      db 0x00
      align bytes
    end if
  end if
end macro

; The way these are all spelled out like this is slightly ridiculous, there
; must be a better way.
macro rex.0
  db 0x40
end macro
macro rex.w
  db 0x48
end macro
macro rex.r
  db 0x44
end macro
macro rex.x
  db 0x42
end macro
macro rex.b
  db 0x41
end macro
macro rex.wr
  db 0x4C
end macro
macro rex.wx
  db 0x4A
end macro
macro rex.wb
  db 0x49
end macro
macro rex.rx
  db 0x46
end macro
macro rex.rb
  db 0x45
end macro
macro rex.xb
  db 0x43
end macro
macro rex.wrx
  db 0x4E
end macro
macro rex.wrb
  db 0x4D
end macro
macro rex.wxb
  db 0x4B
end macro
macro rex.rxb
  db 0x47
end macro
macro rex.wrxb
  db 0x4F
end macro

macro modrm mod, reg, rm
  assert mod >= 0 & mod < 4
  assert reg >= 0 & reg < 8
  assert rm >= 0 & rm < 8
  db (mod shl 6) OR (reg shl 3) OR rm
end macro

macro sib scale, index, base
  assert scale >= 0 & scale < 4
  assert index >= 0 & index < 8
  assert base >= 0 & index < 8
  db (scale shl 6) OR (index shl 3) OR base
end macro

macro opcodereg opcode, reg
  assert opcode >= 0 & opcode < 256 & opcode AND 7 = 0
  assert reg >= 0 & reg < 8
  db opcode OR reg
end macro

macro opcodecc opcode, cc
  assert opcode >= 0 & opcode < 256 & opcode AND 15 = 0
  assert cc >= 0 & cc < 16
  db opcode OR cc
end macro

macro scalefield sfield, scale
  if 1 = scale
    sfield = 0
  else if 2 = scale
    sfield = 1
  else if 4 = scale
    sfield = 2
  else if 8 = scale
    sfield = 3
  else
    assert 0
  end if
end macro

; Yep, there sure is a lot of duplication in these. This is based on Intel's
; documented mnemonics...
;
; "Above" and "below" are for unsigned comparisons. "Greater" and "less" are
; for signed comparisons.
;
; This is documented on the individual opcode pages, and also in B.1.4.7.
macro conditioncode cc, condition
  match =above, condition
    cc = 0x07
  else match =above_equal, condition
    cc = 0x03
  else match =below, condition
    cc = 0x02
  else match =below_equal, condition
    cc = 0x06
  else match =carry, condition
    cc = 0x02
  else match =equal, condition
    cc = 0x04
  else match =greater, condition
    cc = 0x0F
  else match =greater_equal, condition
    cc = 0x0D
  else match =less, condition
    cc = 0x0C
  else match =less_equal, condition
    cc = 0x0E
  else match =not_above, condition
    cc = 0x06
  else match =not_above_equal, condition
    cc = 0x02
  else match =not_below, condition
    cc = 0x03
  else match =not_below_equal, condition
    cc = 0x07
  else match =not_carry, condition
    cc = 0x03
  else match =not_equal, condition
    cc = 0x05
  else match =not_greater, condition
    cc = 0x0E
  else match =not_greater_equal, condition
    cc = 0x0C
  else match =not_less, condition
    cc = 0x0D
  else match =not_less_equal, condition
    cc = 0x0F
  else match =not_overflow, condition
    cc = 0x01
  else match =not_parity, condition
    cc = 0x0B
  else match =not_sign, condition
    cc = 0x09
  else match =not_zero, condition
    cc = 0x05
  else match =overflow, condition
    cc = 0x00
  else match =parity, condition
    cc = 0x0A
  else match =parity_even, condition
    cc = 0x0A
  else match =parity_odd, condition
    cc = 0x0B
  else match =sign, condition
    cc = 0x08
  else match =zero, condition
    cc = 0x04
  else
    assert 0
  end match
end macro


;;; On registers
;;; ------------
;;;
;;;   The x86 architecture has been around a while, it has been through
;;; several transitions from smaller word sizes to larger ones. Therefore it
;;; has different names for the "same" registers, depending on how much of
;;; them you're using.
;;;
;;; TODO there's more to write here

macro bytereg result, register
  match =al?, register
    result = 0
  else match =cl?, register
    result = 1
  else match =dl?, register
    result = 2
  else match =bl?, register
    result = 3
  else match =ah?, register
    result = 4
  else match =ch?, register
    result = 5
  else match =dh?, register
    result = 6
  else match =bh?, register
    result = 7
  else
    assert 0
  end match
end macro

macro wordreg result, register
  match =ax?, register
    result = 0
  else match =cx?, register
    result = 1
  else match =dx?, register
    result = 2
  else match =bx?, register
    result = 3
  else match =sp?, register
    result = 4
  else match =bp?, register
    result = 5
  else match =si?, register
    result = 6
  else match =di?, register
    result = 7
  else
    assert 0
  end match
end macro

macro dwordreg result, register
  match =eax?, register
    result = 0
  else match =ecx?, register
    result = 1
  else match =edx?, register
    result = 2
  else match =ebx?, register
    result = 3
  else match =esp?, register
    result = 4
  else match =ebp?, register
    result = 5
  else match =esi?, register
    result = 6
  else match =edi?, register
    result = 7
  else
    assert 0
  end match
end macro

macro qwordreg result, register
  match =rax?, register
    result = 0
  else match =rcx?, register
    result = 1
  else match =rdx?, register
    result = 2
  else match =rbx?, register
    result = 3
  else match =rsp?, register
    result = 4
  else match =rbp?, register
    result = 5
  else match =rsi?, register
    result = 6
  else match =rdi?, register
    result = 7
  else
    assert 0
  end match
end macro

macro owordreg result, register
  match =r8?, register
    result = 0
  else match =r9?, register
    result = 1
  else match =r10?, register
    result = 2
  else match =r11?, register
    result = 3
  else match =r12?, register
    result = 4
  else match =r13?, register
    result = 5
  else match =r14?, register
    result = 6
  else match =r15?, register
    result = 7
  else
    assert 0
  end match
end macro


;;; Instructions
;;; ------------


macro mov.dreg.dimm target, source
  dwordreg treg, target
  opcodereg 0xB8, treg
  dd source
end macro


macro mov.qreg.dimm target, source
  qwordreg treg, target
  rex.w
  db 0xC7
  modrm 3, 0, treg
  dd source
end macro


macro mov.qreg.qimm target, source
  qwordreg treg, target
  rex.w
  opcodereg 0xB8, treg
  dq source
end macro


; Notice the use of REX.B here; this instruction puts the register number in
; the opcode field, so it uses Table 3-1.
macro mov.oreg.qimm target, source
  owordreg treg, target
  rex.wb
  opcodereg 0xB8, treg
  dq source
end macro


macro mov.qreg.qreg target, source
  qwordreg treg, target
  qwordreg sreg, source
  rex.w
  db 0x89
  modrm 3, sreg, treg
end macro


; Take a 64-bit source register, treat it as an address and look up the 64-bit
; value it points to, store that into a 64-bit target register.
;
; For rbp, the only modes available also have displacement; we use an 8-bit
; one and set it to zero. The other registers could be encoded without the
; displacement, but for simplicity's sake we do the same thing for all of
; them.
;
; In understanding this, pay close attention to the Op/En column in the opcode
; table. The "RM" variant means the ModRM byte's R/M field (the third one)
; is the source, while its reg field (the middle one) is the target. This is
; what we want, because the R/M field is the one that gets indirection applied
; to it. Opcode 0x8B with an REX.W prefix is the all-64-bit RM variant.
; [Intel] volume 2B, chapter 3, section 3-4.3, "MOV".
;
; For the indirection modes, don't be confused by the many similar tables.
; 64-bit mode is encoded the same as 32-bit mode except for adding a REX.W
; prefix, as per 2.2.1.1, so you want table 2-2 to understand the ModRM byte.
; The presence or absence of an SIB byte is determined by where in that table
; we fall, and we aren't using a mode that has one. [Intel] volume 2A,
; chapter 2, section 2-1.5, table 2-2.
;
; We disallow rsp as a source because that's the mode that would want an SIB.
macro mov.qreg.indirect.qreg target, source
  qwordreg sreg, source
  qwordreg treg, target
  rex.w
  db 0x8B
  modrm 1, treg, sreg
  match =rsp, source
    ; R/M = rsp is the SIB case
    sib 0, 4, sreg
      ; no scaling, no indexing, source as base
  end match
  db 0
end macro


; Take a 64-bit source register, store its value into the address pointed to
; by a 64-bit target register.
;
; For rbp, the only modes available also have displacement; we use an 8-bit
; one and set it to zero. The other registers could be encoded without the
; displacement, but for simplicity's sake we do the same thing for all of
; them.
;
; In understanding this, pay close attention to the Op/En column in the opcode
; table. The "MR" variant means the ModRM byte's reg field (the middle one)
; is the source, while its R/M field (the third one) is the target. This is
; what we want, because the R/M field is the one that gets indirection applied
; to it. Opcode 0x89 with an REX.W prefix is the all-64-bit MR variant.
; [Intel] volume 2B, chapter 3, section 3-4.3, "MOV".
;
; For the indirection modes, don't be confused by the many similar tables.
; 64-bit mode is encoded the same as 32-bit mode except for adding a REX.W
; prefix, as per 2.2.1.1, so you want table 2-2 to understand the ModRM byte.
; The presence or absence of an SIB byte is determined by where in that table
; we fall, and we aren't using a mode that has one. [Intel] volume 2A,
; chapter 2, section 2-1.5, table 2-2.
;
; We disallow rsp as a target because that's the mode that would want an SIB.
; When you look at other addressing modes, be aware that the special treatment
; is for whichever register is specified in the R/M field. Sometimes that's
; the source, and sometimes it's the target, depending on the opcode.
macro mov.indirect.qreg.qreg target, source
  qwordreg sreg, source
  qwordreg treg, target
  rex.w
  db 0x89
  modrm 1, sreg, treg
  match =rsp, target
    ; R/M = rsp is the SIB case
    sib 0, 4, treg
      ; no scaling, no indexing, target as base
  end match
  db 0
end macro


; 8-bit source register
macro mov.indirect.qreg.breg target, source
  match =rsp, target
    assert 0
    ; The SIB case.
  else match =rbp, target
    assert 0
    ; An unrelated addressing mode.
  else
    bytereg sreg, source
    qwordreg treg, target
    db 0x88
    modrm 0, sreg, treg
  end match
end macro

macro mov.breg.indirect.qreg target, source
  match =rsp, source
    assert 0
    ; The SIB case.
  else match =rbp, source
    assert 0
    ; An unrelated addressing mode.
  else
    qwordreg sreg, source
    bytereg treg, target
    db 0x8A
    modrm 0, treg, sreg
  end match
end macro

; We use the operand-size prefix to specify 16-bit. No REX.W. Table 3-4.
macro mov.indirect.qreg.wreg target, source
  match =rsp, target
    assert 0
    ; The SIB case.
  else match =rbp, target
    assert 0
    ; An unrelated addressing mode.
  else
    wordreg sreg, source
    qwordreg treg, target
    db 0x66
    db 0x89
    modrm 0, sreg, treg
  end match
end macro

; We use the operand-size prefix to specify 16-bit. No REX.W. Table 3-4.
macro mov.wreg.indirect.qreg target, source
  match =rsp, source
    assert 0
    ; The SIB case.
  else match =rbp, source
    assert 0
    ; An unrelated addressing mode.
  else
    qwordreg sreg, source
    wordreg treg, target
    db 0x66
    db 0x8B
    modrm 0, treg, sreg
  end match
end macro

; It defaults to 32-bit, no prefix needed, also no REX.W. Table 3-4.
macro mov.indirect.qreg.dreg target, source
  match =rsp, target
    assert 0
    ; The SIB case.
  else match =rbp, target
    assert 0
    ; An unrelated addressing mode.
  else
    dwordreg sreg, source
    qwordreg treg, target
    db 0x89
    modrm 0, sreg, treg
  end match
end macro

; It defaults to 32-bit, no prefix needed, also no REX.W. Table 3-4.
macro mov.dreg.indirect.qreg target, source
  match =rsp, source
    assert 0
    ; The SIB case.
  else match =rbp, source
    assert 0
    ; An unrelated addressing mode.
  else
    qwordreg sreg, source
    dwordreg treg, target
    db 0x8B
    modrm 0, treg, sreg
  end match
end macro


macro mov.qreg.indexed.qreg target, source, index, scale
  match =rbp, source
    assert 0
    ; This is divided into some subcases we don't wish to deal with yet.
  else match =rsp, index
    assert 0
    ; This is the case where it's not actually indexed after all.
  else
    qwordreg treg, target
    qwordreg sreg, source
    qwordreg ireg, index
    scalefield sfield, scale
    rex.w
    db 0x8B
    modrm 0, treg, 4
    sib sfield, ireg, sreg
  end match
end macro


macro mov.indexed.qreg.qreg target, index, scale, source
  match =rbp, source
    assert 0
    ; This is divided into some subcases we don't wish to deal with yet.
  else match =rsp, index
    assert 0
    ; This is the case where it's not actually indexed after all.
  else
    qwordreg treg, target
    qwordreg sreg, source
    qwordreg ireg, index
    scalefield sfield, scale
    rex.w
    db 0x89
    modrm 0, sreg, 4
    sib sfield, ireg, treg
  end match
end macro


; Take a 64-bit source register, store its value into a high 64-bit target
; register (r8-r15).
;
; Notice that there are two ways to add another bit to the register encoding.
; Table 3-1 is about REX.B, but does not apply here, it's for instructions
; that use opcode bits to specify a register, and none of the
; register-to-register MOV variants do that (it's for immediate mode).
;
; Instead, we want the mechanism that uses REX.R as the extra bit, and it
; combines with the reg field of ModRM, as per 2.2.1.2.
;
; Therefore, we want the variant of MOV which puts the target in the reg
; field. That's Op/En "RM", opcode 0x8B with REX.WR.
;
; Mode 3 is direct addressing.
macro mov.oreg.qreg target, source
  owordreg treg, target
  qwordreg sreg, source
  rex.wr
  db 0x8B
  modrm 3, treg, sreg
end macro


; Take a high 64-bit source register (r8-r15), store its value into a 64-bit
; target register.
;
; Notice that there are two ways to add another bit to the register encoding.
; Table 3-1 is about REX.B, but does not apply here, it's for instructions
; that use opcode bits to specify a register, and none of the
; register-to-register MOV variants do that (it's for immediate mode).
;
; Instead, we want the mechanism that uses REX.R as the extra bit, and it
; combines with the reg field of ModRM, as per 2.2.1.2.
;
; Therefore, we want the variant of MOV which puts the source in the reg
; field. That's Op/En "MR", opcode 0x89 with REX.WR.
;
; Mode 3 is direct addressing.
macro mov.qreg.oreg target, source
  qwordreg treg, target
  owordreg sreg, source
  rex.wr
  db 0x89
  modrm 3, sreg, treg
end macro


; This increments a 64-bit register by 1, in place;
macro inc.qreg target
  qwordreg treg, target
  rex.w
  db 0xFF
  modrm 3, 0, treg
    ; The 0 is part of the opcode.
end macro

; This decrements a 64-bit register by 1, in place;
macro dec.qreg target
  qwordreg treg, target
  rex.w
  db 0xFF
  modrm 3, 1, treg
    ; The 1 is part of the opcode.
end macro

; This adds a 64-bit register to another 64-bit register, in place.
macro add.qreg.qreg target, source
  qwordreg treg, target
  qwordreg sreg, source
  rex.w
  db 0x01
  modrm 3, sreg, treg
end macro


macro add.indirect.qreg.qreg target, source
  match =rsp, target
    assert 0
    ; The SIB case.
  else match =rbp, target
    assert 0
    ; An unrelated addressing mode.
  else
    qwordreg treg, target
    qwordreg sreg, source
    rex.w
    db 0x01
    modrm 0, sreg, treg
  end match
end macro


macro add.qreg.indirect.qreg target, source
  match =rsp, source
    assert 0
    ; The SIB case.
  else match =rbp, source
    assert 0
    ; An unrelated addressing mode
  else
    qwordreg treg, target
    qwordreg sreg, source
    rex.w
    db 0x03
    modrm 0, treg, sreg
  end match
end macro


; This adds a signed 8-bit immediate value to a 64-bit register, in place.
;
; Notice the use of 3 as the addressing mode. This says to use the register
; itself. The 0 in the reg field is part of the opcode.
macro add.qreg.bimm target, source
  qwordreg treg, target
  rex.w
  db 0x83
  modrm 3, 0, treg
  db source
end macro


; This adds a signed 32-bit immediate value to a 64-bit register, in place.
;
; Notice the use of 3 as the addressing mode. This says to use the register
; itself. The 0 in the reg field is part of the opcode.
macro add.qreg.dimm target, source
  qwordreg treg, target
  rex.w
  db 0x81
  modrm 3, 0, treg
  dd source
end macro

; This subtracts a 64-bit register from another 64-bit register, in place.
macro sub.qreg.qreg target, source
  qwordreg treg, target
  qwordreg sreg, source
  rex.w
  db 0x2B
  modrm 3, treg, sreg
end macro


macro sub.indirect.qreg.qreg target, source
  match =rsp, target
    ; The SIB case.
    assert 0
  else
    qwordreg treg, target
    qwordreg sreg, source
    rex.w
    db 0x2B
    modrm 0, sreg, treg
  end match
end macro


; This subtracts a signed 8-bit immediate value from a 64-bit register, in
; place.
;
; Notice the use of 3 as the addressing mode. This says to use the register
; itself. The 5 in the reg field is part of the opcode.
macro sub.qreg.bimm target, source
  qwordreg treg, target
  rex.w
  db 0x83
  modrm 3, 5, treg
  db source
end macro

; This subtracts a signed 32-bit immediate value from a 64-bit register, in
; place.
;
; Notice the use of 3 as the addressing mode. This says to use the register
; itself. The 5 in the reg field is part of the opcode.
macro sub.qreg.dimm target, source
  qwordreg treg, target
  rex.w
  db 0x81
  modrm 3, 5, treg
  dd source
end macro

; This multiplies rax, as 64-bits, with another 64-bit register, in place.
;
; The 4 in the reg field is part of the opcode.
macro mul.rax.qreg source
  qwordreg sreg, source
  rex.w
  db 0xF7
  modrm 3, 4, sreg
end macro

; The official mnemonic for this is "div", but it's divmod: It takes a 128-bit
; dividend formed from concatenating rdx as the high half with rax as the low
; half, and divides it by a 64-bit divisor from a specified register. It
; stores the quotient, truncated towards zero, in rax, and it stores the
; remainder in rdx. This entire process is unsigned.
;
; The 6 in the reg field is part of the opcode.
macro div.rdxrax.qreg source
  qwordreg sreg, source
  rex.w
  db 0xF7
  modrm 3, 6, sreg
end macro

; Same as div, but signed.
;
; The 7 in the reg field is part of the opcode.
macro idiv.rdxrax.qreg source
  qwordreg sreg, source
  rex.w
  db 0xF7
  modrm 3, 7, sreg
end macro


macro and.qreg.qreg target, source
  qwordreg treg, target
  qwordreg sreg, source
  rex.w
  db 0x23
  modrm 3, treg, sreg
end macro

macro and.qreg.bimm target, source
  qwordreg treg, target
  rex.w
  db 0x83
  modrm 3, 4, treg
    ; The 4 is part of the opcode.
  db source
end macro

macro or.qreg.qreg target, source
  qwordreg treg, target
  qwordreg sreg, source
  rex.w
  db 0x0B
  modrm 3, treg, sreg
end macro

macro or.qreg.bimm target, source
  qwordreg treg, target
  rex.w
  db 0x83
  modrm 3, 1, treg
    ; The 4 is part of the opcode.
  db source
end macro

macro xor.qreg.qreg target, source
  qwordreg treg, target
  qwordreg sreg, source
  rex.w
  db 0x33
  modrm 3, treg, sreg
end macro

macro not.qreg target
  qwordreg treg, target
  rex.w
  db 0xF7
  modrm 3, 2, treg
    ; The 2 is part of the opcode.
end macro


; This sets the flags to the same things they'd be set to if subtracting
; right from left.
macro cmp.qreg.qreg left, right
  qwordreg lreg, left
  qwordreg rreg, right
  rex.w
  db 0x3B
  modrm 3, lreg, rreg
end macro

; This sets the flags to the same things they'd be set to if and'ing right
; with left.
macro test.qreg.qreg left, right
  qwordreg lreg, left
  qwordreg rreg, right
  rex.w
  db 0x85
  modrm 3, rreg, lreg
end macro

macro set.breg.cc target, condition
  bytereg treg, target
  conditioncode cc, condition
  db 0x0F
  opcodecc 0x90, cc
  modrm 3, 0, treg
end macro


; Move from an 8-bit immediate value, to a location relative to a 64-bit
; register, with an 8-bit displacement and no indexing.
;
; This uses opcode 0xC6, which has w = 0. Since we run in 64-bit mode, that
; makes the operand size 8 bits, regardless of the current operand-size
; attribute. [Intel] volume 2D, appendix B, section B-1.4.3, table B-6.
macro mov.qreg.disp8.bimm target, offset, source
  qwordreg treg, target
  db 0xC6
  modrm 1, 0, treg
    ; the 0 is part of the opcode
    ; 4 is rsp, but it's a special case
  match =rsp, target
    ; R/M = rsp is the SIB case
    sib 0, 4, treg
      ; no scaling, no indexing, target as base
  end match
  db offset
  db source
end macro

; Move from a 16-bit immediate value, to a location relative to a 64-bit
; register, with an 8-bit displacement and no indexing.
;
; This uses opcode 0xC7, which has w = 1. We run in 64-bit mode, so that gives
; us an operand size of 32 bits by default. [Intel] volume 1, chapter 3,
; section 3-6.1, table 3-4. We want a 16-bit operand, so we use the
; operand-size prefix, 0x66, and we leave REX.W unset.
;
; We need to treat rsp specially because it's the SIB case, per table 2-2.
macro mov.qreg.disp8.wimm target, offset, source
  qwordreg treg, target
  db 0x66
  db 0xC7
  modrm 1, 0, treg
    ; the 0 is part of the opcode
  match =rsp, target
    ; R/M = rsp is the SIB case
    sib 0, 4, treg
      ; no scaling, no indexing, target as base
  end match
  db offset
  dw source
end macro

; Move from a 32-bit immediate value, to a location relative to a 64-bit
; register, with an 8-bit displacement and no indexing.
;
; This uses opcode 0x67, which has w = 1. We run in 64-bit mode, so that gives
; us an operand size of 32 by default. [Intel] volume 2D, section B.1.43,
; table B-6. This is what we want, so we leave it.
macro mov.qreg.disp8.dimm target, offset, source
  qwordreg treg, target
  db 0xC7
  modrm 1, 0, treg
    ; the 0 is part of the opcode
  match =rsp, target
    ; R/M = rsp is the SIB case
    sib 0, 4, treg
      ; no scaling, no indexing, target as base
  end match
  db offset
  dd source
end macro

; Move from a 64-bit register, to a 64-bit location relative to a 64-bit
; register, with an 8-bit displacement and no indexing.
;
; This uses opcode 0x89 with REX.W, so that gives us the reg field as the
; 64-bit source and the R/M field as the 64-bit destination.
;
; We need to treat a target of rsp specially because it's the SIB case per
; table 2-2.
macro mov.qreg.disp8.qreg target, offset, source
  qwordreg sreg, source
  qwordreg treg, target
  rex.w
  db 0x89
  modrm 1, sreg, treg
  match =rsp, source
    ; R/M = rsp is the SIB case
    sib 0, 4, 4
      ; no scaling, no indexing, rsp as base
  end match
  db offset
end macro

; Move from a 64-bit register, to a 64-bit location relative to a 64-bit
; register, with a 32-bit displacement and no indexing.
;
; This uses opcode 0x89 with REX.W, so that gives us the reg field as the
; 64-bit source and the R/M field as the 64-bit destination.
;
; We need to treat a target of rsp specially because it's the SIB case per
; table 2-2.
macro mov.qreg.disp32.qreg target, offset, source
  qwordreg sreg, source
  qwordreg treg, target
  match =rsp, target
    rex.w
    db 0x89
    modrm 2, sreg, treg
      ; treg is rsp by assumption, and R/M = rsp is the SIB case
    sib 0, 4, 4
      ; no scaling, no indexing, rsp as base
    dd offset
  else
    rex.w
    db 0x89
    modrm 2, sreg, treg
    dd offset
  end match
end macro

; Move from a 32-bit immediate value, to a 64-bit location relative to a
; 64-bit register, with an 8-bit displacement and no indexing.
;
; Note that there is no instruction to move a 64-bit immediate to memory.
;
; This uses opcode 0xC7, which has w = 1. We run in 64-bit mode, so that
; gives us an operand size of 32 by default. [Intel] volume 2D,
; section B.1.43, table B-6. We want a 64-bit operand, so we use the REX.W
; prefix, 0x48.
macro mov.qreg.disp8.dimm target, offset, source
  qwordreg treg, target
  match =rsp, target
    rex.w
    db 0xC7
    modrm 1, 0, treg
      ; the 0 is part of the opcode
      ; 4 is rsp, but it's a special case
    sib 0, 4, treg
      ; no scaling, no indexing, rsp as base
    db offset
    dd source
  else
    rex.w
    db 0xC7
    modrm 1, 0, treg
      ; the 0 is part of the opcode
    db offset
    dd source
  end match
end macro

; "Load effective address". Compute a 64-bit address as you would for
;  indexed addressing, with an 8-bit displacement and no indexing, but instead
; of doing anything with the memory, just store the address itself into a
; register.
macro lea.qreg.disp8.qreg target, offset, source
  qwordreg treg, target
  qwordreg sreg, source
  rex.w
  db 0x8D
  modrm 1, treg, sreg
  match =rsp, source
    ; R/M = rsp is the SIB case
    sib 0, 4, sreg
      ; no scaling, no indexing, rsp as base
  end match
  db offset
end macro

macro lea.qreg.disp32.qreg target, offset, source
  qwordreg treg, target
  qwordreg sreg, source
  rex.w
  db 0x8D
  modrm 2, treg, sreg
  match =rsp, source
    ; R/M = rsp is the SIB case
    sib 0, 4, sreg
      ; no scaling, no indexing, rsp as base
  end match
  dd offset
end macro

macro lea.qreg.indexed.qreg target, source, index, scale
  match =rbp, source
    assert 0
    ; This is divided into some subcases we don't wish to deal with yet.
  else match =rsp, index
    assert 0
    ; This is the case where it's not actually indexed after all.
  else
    qwordreg treg, target
    qwordreg sreg, source
    qwordreg ireg, index
    scalefield sfield, scale
    rex.w
    db 0x8D
    modrm 0, treg, 4
    sib sfield, ireg, sreg
  end match
end macro

; Wow, we use ALL the instruction suffixes for this, huh. See [Intel] volume
; 2A, chapter 2, section 2-1, with particular attention to figure 2-1.
macro lea.qreg.disp8.indexed.qreg target, offset, source, index, scale
  match =rbp, source
    assert 0
    ; This is divided into some subcases we don't wish to deal with yet.
  else match =rsp, index
    assert 0
    ; This is the case where it's not actually indexed after all.
  else
    qwordreg treg, target
    qwordreg sreg, source
    qwordreg ireg, index
    scalefield sfield, scale
    rex.w
    db 0x8D
    modrm 1, treg, 4
      ; 1 in the mode field says we want a disp8.
      ; 4 in the R/M field says we want an SIB byte.
    sib sfield, ireg, sreg
    db offset
  end match
end macro

macro mov.breg.bimm target, source
  bytereg treg, target
  db 0xC6
  modrm 3, 0, treg
    ; the 0 is part of the opcode
  db source
end macro

; Clear the DF flag. This makes string instructions increment RSI.
macro cld
  db 0xFC
end macro

; Set the DF flag. This makes string instructions decrement RSI.
macro std
  db 0xFD
end macro

; Load 64 bits from the address in RSI into RAX. Then, increment or decrement
; RSI by 8 bytes, depending on the value of the DF flag.
macro lodsq
  rex.w
  db 0xAD
end macro

;   [Intel] describes two different styles of mnemonic for the repeated string
; operations. See, their parameters are always rsi and rdi, or the smaller
; versions of those same specific registers. Intel thinks we might want to
; write out "rsi" explicitly, even though the only information it conveys is
; the size. The position we take is that it's better to let that be conveyed
; by the instruction name; otherwise it'd be a point of confusion for new
; readers, who might mistakenly think it's possible to pass it different
; registers.
;
;   With the string instructions, the reader SHOULD be thinking, "Wait...
; where does this get its parameters from?" Writing them in a way that makes
; them appear simpler than they are would be confusing.
macro rep operation
  match =movsb, operation
    db 0xF3 ; rep prefix
    db 0xA4 ; opcode
  else match =movsw, operation
    db 0xF3 ; rep prefix
    db 0x66 ; operand-size prefix
    db 0xA5 ; opcode
  else match =movsd, operation
    db 0xF3 ; rep prefix
    db 0xA5 ; opcode
  else match =movsq, operation
    ; The "rep" instruction can also be thought of as a prefix to other
    ; instructions, though only a few specific ones are allowed. Anyway, it
    ; comes before the REX byte.
    db 0xF3
    ; The rest of this is the same as the encoding of normal, non-repeated
    ; movsq.
    rex.w
    db 0xA5
    ; There's no explicit parameters. String operations are magic.
  else
    assert 0
  end match
end macro

macro repnz operation
  match =scasb, operation
    db 0xF2 ; rep prefix
    db 0xAE ; opcode
  else
    assert 0
  end match
end macro

; Push a 64-bit value from a register onto the stack (the one pointed to by
; rsp). Decrement rsp, then write the value at the new location.
;
; In the corner case where rsp is also the value being pushed, the old value
; is the one used.
;
; There's an alternate encoding of this that uses a ModRM byte, but doing it
; without is more compact, so we do without.
macro push.qreg source
  qwordreg sreg, source
  opcodereg 0x50, sreg
end macro

macro push.bimm source
  db 0x6A
  db source
end macro

; Operand-size prefix makes it 16-bit.
;
; If you're trying to fake pushing a larger size by doing several 16-bit
; pushes, remember to start by pushing the low end and proceed upwards.
; [Intel] volume 1, chapter 9, section 9-2.4, "Memory Data Formats".
macro push.wimm source
  db 0x66
  db 0x68
  dw source
end macro

; There is no 64-bit immediate push. So, can we have a push instruction that
; pushes a 32-bit immediate value? Sort-of, but it's sign-extended to 64 bits,
; so rsp is decremented by 8, not by 4. This is that instruction.
;
; You need to do a really close read of a number of things to understand why.
; The opcode tables in [Intel] in volume 2D, appendix A, section A-3 give it
; the d64 annotation, which per table A-1 in section A-2.5 indicates that the
; operand size is always 64 bits and that there is no corresponding 32-bit
; version. Yet, the actual immediate value is still only 32 bits! Direct your
; attention to the instruction's details page, volume 2B, chapter 4, section
; 4-3, "PUSH". The description section clearly details that the immediate may
; be less than the operand size, which makes sense once you know it, but it
; doesn't explictly call out that the operand size is still 64 bits here.
;
; In general, the size of an immediate doesn't determine operand size, as you
; can read about in detail in [Intel] volume 1, chapter 3, section 3-6.1, with
; particular attention to table 3-4.
;
; Why is this surprising, given that it's consistent with the behavior of
; other instructions? Well, most instructions don't have such obvious
; side-effects. It's easy to not notice the operand size disagreeing with the
; immediate size when you'e only writing to a register, but changing the stack
; in an unexpected way breaks things much more obviously.
;
; Anyway, if you really want to decrement the stack pointer by 32 bits after
; a push, consider pushing a register.
macro push.dimm source
  db 0x68
  dd source
end macro

; Pop a 64-bit value into a register from the stack (the one pointed to by
; rsp). Read the value from the old location, then increment rsp.
;
; In the corner case where rsp is also the destination being written to, the
; read happens from the old location, then the write causes the increment to
; be irrelevant.
;
; There's an alternate encoding of this that uses a ModRM byte, but doing it
; without is more compact, so we do without.
macro pop.qreg target
  qwordreg treg, target
  opcodereg 0x58, treg
end macro


; Do an absolute indirect jump with a 64-bit register operand. That is: given
; a register which holds a pointer, read another address from the pointed-to
; memory and jump to it.
;
; Technically this is a "near" jump in x86 terms, but we just pretend far
; jumps and segments don't exist. They are still a thing in 64-bit mode, we
; just don't use them.
macro jmp.abs.indirect.qreg location
  qwordreg lreg, location
  db 0xFF
  modrm 0, 4, lreg
end macro

; There in no 64-bit immediate "near" jump, so we use 32-bit. It's relatve,
; so that's honestly plenty.
;
; The location is relative to the start of the instruction immediately
; following the jmp.
macro jmp.rel.dimm location
  db 0xE9
  dd location
end macro

; The location is relative to the start of the instruction immediately
; following the jmp.
macro jmp.cc.rel.bimm condition, location
  conditioncode cc, condition
  opcodecc 0x70, cc
  db location
end macro

; The location is relative to the start of the instruction immediately
; following the jmp.
macro jmp.cc.rel.dimm condition, location
  conditioncode cc, condition
  db 0x0F
  opcodecc 0x70, cc
  dd location
end macro

; Invoke a system call provided by the kernel. On Linux, the System V ABI
; describes the semantics of such calls (at least, on x86).
macro syscall
  db 0x0F, 0x05
end macro


; Halts the CPU. We can't actually run this in userspace, but the kernel will
; kill our process or the debugger will break, and those are the outcomes we
; actually want.
macro hlt
  db 0xf4
end macro


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Executable file format ;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;;   Before we get into the body of the program, we do a lot of ELF-specific
;;; stuff to ensure that our output is in a format Linux knows how to run.
;;;
;;;   First, we set the origin to load at. This is arbitrary, but it can't be
;;; zero. We tell flatassembler about it because it's used in label
;;; calculations; we can reference it as $$ any time we need it in future.
org 0x08000000

;;;
;;;   Second, we output ELF's top-level file header. The only interesting
;;; thing here is the entry pointer.
;;;
elf_header:
  ; * denotes mandatory fields according to breadbox
  db 0x7F, "ELF"                 ; *magic number
  db 2                           ; 64-bit
  db 1                           ; little-endian
  db 1                           ; ELF header format version 1
  db 0                           ; System-V ABI
  db 8 DUP 0                     ; (padding)

  dw 2                           ; *executable
  dw 0x3E                        ; *Intel x86-64
  dd 1                           ; ELF format version

  dq _start                      ; *entry point
  dq program_header - $$         ; *program header offset
  dq 0                           ; section header offset
  dd 0                           ; processor flags
  dw elf_header_size
  dw program_header_entry_size   ; *
  dw 1                           ; *number of program header entries
  dw 0                           ; section header entry size
  dw 0                           ; number of section header entries
  dw 0                           ; section name string table index
; Save a copy of the size of this chunk for our future reference, by comparing
; the current posiion to the label above.
elf_header_size = $ - elf_header

;;;
;;;   Third, immediately after the ELF file header, we output ELF's program
;;; header, which lists the memory regions ("segments") we want to have and
;;; where we want them to come from. We list just a single region, which is
;;; the entire contents of the ELF file from disk.
;;;
;;;   It would be more typical to use this header to ask the loader to give us
;;; separate code and data segments, and perhaps a stack or heap, but this
;;; keeps things simple, and we can create those things for ourselves later.
;;;
;;;    We do have a little stack space available, though we don't explicitily
;;; request any; the kernel allocates it for us as part of exec() so that it
;;; can pass us argc and argv (which we ignore). That stack space will be at a
;;; random address, different every time, because of ASLR; that's a neat
;;; security feature, so we leave it as-is.
;;;
program_header:
  dd 1                           ; *"loadable" segment type
  dd 0x05                        ; *read+execute permission
  dq 0                           ; *offset in file
  dq $$                          ; *virtual address
    ; required, but can be anything, subject to alignment
  dq 0                           ; physical address (ignored)
  dq file_size                   ; *size in file
  dq file_size                   ; *size in memory
  dq 0                           ; segment alignment
                                 ;   for relocation - will we be ASLR'd?
; Save the size of this chunk, as well.
program_header_entry_size = $ - program_header

; Everything after this point is code or data, not headers, so save the start
; of it for use in size calculations later.
code_start:


;;;;;;;;;;;;;;;;;;;;;;;
;;; Execution model ;;;
;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;;   We use Forth-style dual stacks, one for values and one for control. We
;;; use rsp for values, just like C does. We use rbp for the control stack,
;;; which is a special Forth-y stack: These are pointers into the bodies of
;;; Forth words, not return addresses.
;;;
;;;   The choice of rsp and rbp for the stack pointers imitates Jonesforth;
;;; I'm hopeful that it gives us convenient addressing modes, and will report
;;; back about that when I feel that I understand the implications.
;;;
;;;   In Forth, everything is a "word", including mutable variables.
;;; Conceptually, a word is a unit of execution, which may be implemented
;;; either in machine code or as an array of pointer to other words.
;;;
;;;   This polymorphism is implemented by having each word's contents begin
;;; with a "codeword", which is a pointer to machine code that "interprets"
;;; the rest of the contents. In the case of words implemented in machine
;;; code, the codeword points directly to that code, which is normally right
;;; next to it.
;;;
;;;   Variables, to Forth, are simply one more thing that can be executed; the
;;; effect of executing a variable is to push its address onto the value
;;; stack.
;;;
;;;   We adopt this model of words, codewords, and variables-as-words. It's
;;; really nice how it doesn't force anything else on us, not even a heap,
;;; though we do end up using a heap.
;;;
;;;   We specifically implement a version of calling and returning that Forth
;;; calls indirect threaded code: The control stack is a stack of pointers
;;; into the middle of interpreted words. The interpreter snippet, called
;;; docol, implements calling.  Each word is responsible for making sure
;;; returning works properly. Interpreted words accomplish this by ending with
;;; the word "exit", while machine-code words accomplish it by ending with a
;;; verbatim snippet called "next".
;;;
;;;   Conceptually, "next" returns, but more specifically it accomplishes this
;;; by doing the caller's next dispatch for it; thus control never actually
;;; goes back to the caller's interpreter after initial setup. For performance
;;; reasons, "next" is always inlined, so we define it as a macro.
;;;
;;;   The docol routine is just ordinary code, not a macro. It's defined later
;;; in this file, as a label.
;;;
;;;   Notionally, we could consider not having a dictionary, and not giving
;;; our words names. However, it feels silly to stop when we're so close to
;;; being a full Forth, and using names for things solves a bootstrapping
;;; problem related to heap management - see the write-up of _start about how
;;; the heap is created, below. So, we add an additional header before the
;;; codeword for this purpose.
;;;
;;;   The Forth dictionary is usually a linked list of every word that has
;;; ever been defined, with the newest at the head; the names of words are
;;; stored in string fields, often right next to the link pointer. We adopt
;;; this model, with the field sizes and order shown in the quick reference
;;; below. We break with Forth tradition in one way: Rather than having a
;;; length field, we use a null-terminated string. Thus, there's no length
;;; limit on names. This necessitates breaking out the flags (to be explained
;;; later) into their own byte, rather than taking bits from the length field
;;; for them.
;;;
;;;   There's an important performance consideration: Executable words
;;; reference each other by pointers to their respective codewords. However,
;;; dictionary entries reference each other by pointers to their respective
;;; link fields. Traversing from the link field to the codeword is easy,
;;; though it's a non-constant-time operation: Just walk the string. In order
;;; to make Forth words easy to "decompile", it would be nice to also have a
;;; way to traverse backwards. We solve this by making the name field be
;;; null-terminated at both ends. Fun, yeah?
;;;
;;;
;;;
;;;
;;; --------------------------------------------------------------------------
;;;  Quick Reference
;;; --------------------------------------------------------------------------
;;;
;;; The layout of an interpreted word:
;;;
;;; (overall start)
;;;            0x00 - 0x08                     Link (to next-oldest word)
;;;            0x09 - 0x09  I0H00000           Flags
;;;                                                I - immediate
;;;                                                H - hidden
;;;                                                all other bits reserved
;;; (name start)
;;;            0x0a - 0x0a                     Null byte (terminates name)
;;;            0x0b - name-end - 1             Name, as UTF-8
;;;        name-end - name-end                 Null byte (terminates name)
;;; (padding start)
;;;    name-end + 1 - codeword-start - 1       Zero-pad to 8-byte boundary
;;;    (it's possible this will be zero bytes long)
;;; (codeword start)
;;;      ... + 0x00 - ... + 0x08               Codeword (ie. address of docol)
;;;          (8-byte chunks)                   Addresses of other words
;;;                 - ... (end)                Address of "exit" word
;;;
;;; The layout of a machine-code word is different only from the codeword on:
;;;
;;;      ... + 0x00 - ... + 0x08               Addresss of next byte
;;;      ... + 0x08 - ????                     Arbitrary machine code
;;;                 - ... (end)                Inlined implementation of next
;;;
;;; Also, words always start at 8-byte boundaries.
;;;
;;;
;;; REGISTER usage conventions:
;;;
;;; * rsi is the "instruction pointer" for the "interpreter".
;;;     That is, it points to some word-pointer inside an array of
;;;   word-pointers inside the content of the word they're part of. It always
;;;   points to the next word that should be executed, whose execution hasn't
;;;   begun yet.
;;;
;;; * rbp points to the top of the control stack
;;;     These are former values of rsi, to eventually be returned to, from
;;;   successively older callers as you look further up the stack. The stack
;;;   grows downwards in memory. Since values are kept separately, the only
;;;   thing on the control stack is return addresses, one per layer of call.
;;;
;;; * rsp points to the top of the value stack
;;;     The value stack has no specific format, but it grows downwards in
;;;   memory. In particular there's no concept of stack frames, because items
;;;   on the stack don't belong to any particular word; the value stack in
;;;   Forth is in part a mechanism for passing values between words.
;;;
;;; Additionally, immediately after beginning execution of a word:
;;;
;;; * rax points to the address of the codeword being executed
;;;     The value of rax is purely for the callee's benefit, and does not need
;;;   to be preserved.
;;;
;;;   Other registers are purely discretionary, and are not preserved across
;;; calls.
;;;
;;;
;;; FLAG usage:
;;;
;;; * DF should be 0
;;;   We use lodsq extensively and that makes it increment rsi after using it.
;;;
;;; --------------------------------------------------------------------------

;;;
;;; Macro next
;;; ----------
;;;
;;;   Include this inline at the end of a word implemented in machine-code.
;;; Conceptually, it returns. What it actually does is do the next thing the
;;; caller would do, which is call the next word from the caller's array of
;;; word pointers.
;;;
;;; Registers in:
;;;
;;; * rsi points to the address of the word to execute
;;;
;;; Registers out:
;;;
;;; * rax points to the codeword in the contents of the word that was executed
;;; * rsi points to the next word-address after this one
;;;
;;; Flags
;;; * DF = 0 is required
;;;
macro next
  ; Copy the next word's address from *rsi into rax. Increment rsi (as per the
  ; DF flag).
  lodsq

  ; Load the codeword from the word's contents, and jump to the interpreter it
  ; points to.
  jmp.abs.indirect.qreg rax
end macro

;;;
;;; Macro beforenext
;;; ----------------
;;;
;;;   Sometimes we want to transfer control from a word implemented in
;;; machine-code to another word, without coming back after, as if we were
;;; simply jumping to it. This is an innovation of ours; Jonesforth doesn't do
;;; it.
;;;
;;;   This implementation will work regardless of how the receiving word is
;;; implemented. It impersonates the "next" snippet, setting up rax to point
;;; to the codeword then jumping to the interpreter. Since it doesn't change
;;; the control stack or rsi, when the receiving word eventually invokes
;;; "next"; it will pick up in the same place as if this sending word had done
;;; it.
;;;
;;;   Thus, notionally we are doing just this one transfer of control before
;;; eventually getting around to inlining "next". Hence the name.
;;;
macro beforenext target
  ; Do a permanent transfer of control by setting rax and invoking the
  ; codeword. Of course, we could jump to docol ourselves but this will work
  ; regardless of what the receiving codeword is.
  mov.qreg.qimm rax, target
  jmp.abs.indirect.qreg rax
end macro

;;;
;;; Macros pushcontrol
;;;        popcontrol
;;; ------------------
;;;
;;;   Include these inline to push an address onto the control stack, or pop
;;; one off of it. You will recall the control stack is kept in rbp. The
;;; parameter is given in a user-specified register.
;;;
;;;   Jonesforth's analogous macros are called PUSHRSP and POPRSP but I think
;;; that's super confusing, since rsp is also the name of a register, but a
;;; different one. I guess it was less confusing in 32-bit, since esp doesn't
;;; start with an "r". Anyway, this has to be named something that
;;; distinguishes it from Intel's PUSH and POP opcodes, so...
;;;
;;;   "Load effective address" is just a cute way to do arithmetic on a
;;; register, here. To push or pop we decrement or increment rbp by 8. To
;;; actually interact with the space in the stack, we indirect through rbp.
;;;
;;; Registers in and out:
;;;
;;; * rbp points to the top of the control stack.
;;;
macro pushcontrol source
  lea.qreg.disp8.qreg rbp, -8, rbp
  mov.indirect.qreg.qreg rbp, source
end macro

macro popcontrol target
  mov.qreg.indirect.qreg target, rbp
  lea.qreg.disp8.qreg rbp, 8, rbp
end macro

;;;
;;; Routine _start
;;; --------------
;;;
;;;   This is the entry point of the whole program, the very first code we
;;; actually execute. Linkers traditionally call this _start, and on balance
;;; I think it's probably best to keep that name, though I've honestly never
;;; liked it... Anyway, the ELF header points to it and exec() jumps to it.
;;; Also, though it could be anywhere in the code part of the output, in order
;;; to make the hexdump pretty we put it at the start.
;;;
;;;   The kernel gives us most registers zeroed, and rsp pointing to the
;;; command-line stuff (argc, argv, envp), which is at an ASLR'd address with
;;; some stack space allocated for us, despite the fact we didn't request any.
;;; It also gives us all the flags clear except IF, but we don't rely on that.
;;; Lastly, of course, it loads our code segment and sets the instruction
;;; pointer where we asked; we don't need to check what those addresses are,
;;; because they're not randomized.
;;;
;;;   This routine is really only responsible for one-time initialization.
;;;
;;; Registers in:
;;;
;;; * rsp points to the logical top  of the value stack
;;;     The kernel sets this up for us, and we need to save it somewhere so
;;;   Forth can use it.
;;;
;;; Registers out:
;;;
;;; * rsi points within "quit"
;;;     Quit is the word that's Forth's closest equivalent to main().
;;; * rsp points to the top of the value stack
;;;
;;;   Notably, rbp is still uninitialialized after _start.
;;;
;;; Stack in:
;;;
;;; * argc, argv, envp in the usual Unix way
;;;     We ignore them, though.
;;;
;;; Stack out:
;;;
;;; * The value of "heap", as a pointer
;;;     The meaning of this will be explained below.
;;;
;;; Registers within:
;;;
;;; * rdi points to the base the heap was allocated at, once it exists
;;;     This is the same value that "heap" will hold, once we reach a point
;;;   where we have variables. Of course, variables are stored on the heap,
;;;   hence this temporary measure.
;;;
;;;   We also take this opportunity to define soeme memory layout parameters
;;; that this routine will be responsible for doing something with:
;;;
heap_requested_address = 0x0000001000000000 ; (very arbitrary)
heap_size =              0x0000000001000000 ; 16 MiB
control_stack_size =                0x10000 ; 64 KiB

_start:
  cld                                      ; clear the DF flag

  ;;;
  ;;; Prepare the heap.
  ;;;
  ;;;   We could ask for a data segment in the program header, but where's the
  ;;; fun in that? Instead, we call mmap().
  ;;;
  ;;;   If we wanted the kernel to do ASLR for us, passing address zero would
  ;;; cause it to pick somewhere at random, but instead we choose our own
  ;;; location. It's still not guaranteed to be where we ask for, so we still
  ;;; do the work to record where it wound up. We could pass the "fixed" flag
  ;;; and the kernel would trust us, but this gives us more options for
  ;;; interoperating with other runtimes.
  ;;;
  mov.qreg.qimm rax, 9                       ; mmap()
  mov.qreg.qimm rdi, heap_requested_address  ; address (very arbitrary)
  mov.qreg.qimm rsi, heap_size               ; size (one meg)
  mov.qreg.qimm rdx, 0x03                    ; protection (read+write)
  mov.oreg.qimm r10, 0x22                    ; flags (private+anonymous)
  mov.oreg.qimm r8, 0                        ; file descriptor (ignored)
  mov.oreg.qimm r9, 0                        ; offset (ignored)
  syscall

  ;;;
  ;;;   The return value of the system call is in rax, we'll use it in a sec.
  ;;; We need to save this somewhere in case we ever want to munmap() it;
  ;;; there's no widely-used name for it so we have to make one up. S0 and r0
  ;;; are widely-used names for the physical tops (logical bottoms) of the
  ;;; value and control stacks, respectively, and we will eventually set those
  ;;; up as well, so we should keep those names in mind. The control stack
  ;;; lives within the heap, while the value stack is its own segment. This
  ;;; value, though, is the physical bottom of the segment, meaning that it
  ;;; stays the same even as we allocate and deallocate things within it. This
  ;;; is unlike the two stack pointers, so we give it a name that doesn't
  ;;; suggest similarity: "heap".
  ;;;
  ;;;   Once Forth is fully set up, its internal variables will be accessed
  ;;; through variable-words like any other Forth data, including "heap". To
  ;;; get to that point, though, we need to be able to hold onto variable data
  ;;; between now and then. In fact, if we don't have at least one of "heap"
  ;;; and "here" (its counterpart which points to the logical top end), all
  ;;; our efforts to hold onto anything seem a bit doomed.
  ;;;
  ;;;   So, we temporarily dedicate rdi to "heap" - only within this routine -
  ;;; and store everything else in ways that let us find things by reference
  ;;; to it. We choose rdi because it works with the indexing modes we care
  ;;; about, and its name suggests its function.
  ;;;
  ;;;   The strategy Jonesforth uses is not applicable to us; Jonesforth
  ;;; takes advantage of the linker to let its code segment refer to specific,
  ;;; pre-allocated objects in the data segment. We are our own linker.
  ;;; Hence, this approach.
  ;;;
  ;;;   Keying things off "heap" is the fundamental decision, but to make sure
  ;;; our variables are accessible both during early bootstrapping, and later,
  ;;; we also have to be thoughtful about data structures. More on that in a
  ;;; moment.
  ;;;
  mov.qreg.qreg rdi, rax

  ;;;
  ;;;   We also initialize rbp. We could hold off and let "quit" do this, but
  ;;; doing it now is the easiest way to initialize the r0 variable, since
  ;;; there's no instruction that moves a 64-bit immediate to memory.
  ;;;
  ;;;   This is the moment at which we decide where the control stack starts!
  ;;; Fun, right? "Allocation" is just a fancy word for picking where we want
  ;;; something, then being consistent about it - like placing furniture in
  ;;; your home. See below for a little more thought about why here in
  ;;; particular.
  ;;;
  lea.qreg.disp32.qreg rbp, control_stack_size, rdi

  ;;;
  ;;;   Now we save some stuff onto the heap. These are the locations that
  ;;; will eventually be the backing stores of the Forth variables, but we
  ;;; don't create the word headers yet, since there's no requirement that
  ;;; they be next to the backing stores. We'll do that later, once we have
  ;;; word-writing infrastructure in place. For now, we just use their offsets
  ;;; relative to the physical bottom of the heap, which are fixed.
  ;;;
  ;;;   These will be the permanent homes of these values, though we have
  ;;; copies of them elsewhere while we're still in this routine.
  ;;;
  mov.qreg.disp32.qreg rdi, control_stack_size + 0x00, rdi    ; heap
  mov.qreg.disp32.qreg rdi, control_stack_size + 0x08, rsp    ; s0
  mov.qreg.disp32.qreg rdi, control_stack_size + 0x10, rbp    ; r0
  mov.qreg.qimm rax, final_word_name
  mov.qreg.disp32.qreg rdi, control_stack_size + 0x18, rax    ; latest
  lea.qreg.disp32.qreg rax, control_stack_size + 0x28, rdi
  mov.qreg.disp32.qreg rdi, control_stack_size + 0x20, rax    ; here
  ; TODO also consider STATE
  ; strictly speaking, r0 could be a constant... but it isn't known until
  ; runtime, so we might as well make it a variable
  ;;;
  ;;; * "heap" is the physical bottom of the heap
  ;;;     The heap grows upwards in memory, so this is also the logical
  ;;;   bottom. This comes from the address mmap() just returned to us.
  ;;; * "s0" is the logical bottom of the value stack
  ;;;     The value stack grows downwards in memory, so this is the physical
  ;;;   top of it. This comes from the stack pointer the kernel initialized us
  ;;;   with.
  ;;; * "r0" is the logical bottom of the control stack
  ;;;     The control stack also grows downwards, so this is its pysical top
  ;;;   as well. We allocate this dedicated space within the heap right here,
  ;;;   in this routine, through our choice of where to put things.
  ;;; * "here" is the physical start of the unallocated space in the heap
  ;;;     We allocate heap space from bottom to top, by incrementing this
  ;;;   value. So, it would also be accurate to say that it points immediately
  ;;;   after the physical top of the allocated space. At any rate, the
  ;;;   address it points to is the first one that hasn't been used yet.
  ;;; * "latest" is the address of the most-recently-defined word's header
  ;;;     Defining new words changes this value.
  ;;;
  ;;;   s0 and r0 are mostly used when we want to initialize or reinitialize
  ;;; their respective stacks - that is, discard all their contents at once.
  ;;;
  ;;;   The value of r0 is the same address these variables start at, so
  ;;; you'll want to do a close read of the implementation of pushcontrol
  ;;; and convince yourself that it only ever writes things just below the rbp
  ;;; address it receives, never right on top of it.
  ;;;
  ;;;   Notice that "here" points immediately after itself. This is just a
  ;;; convenience, making it the last one like that so that the concern is
  ;;; dealt with in a single place and is easy to keep up-to-date with code
  ;;; changes.
  ;;;
  ;;;   A little more detail about why we offset everything by
  ;;; control_stack_size: We're carving out some space at the bottom of the
  ;;; heap - which grows low-to-high - to be the control stack - which grows
  ;;; high-to-low. So the control stack is allocated out of the heap as a
  ;;; fixed-size, one-time thing, and then the variables come immediately
  ;;; after that. We do need to use 32-bit displacement indexing to access
  ;;; them this way, but that's no big deal.
  ;;;
  ;;;   This is perhaps questionable, they should maybe be separate segments
  ;;; created with separate calls to mmap(), but for now we're not worried
  ;;; about overflow so we use the same allocation for both.
  ;;;
  ;;;   We'll come back to these variables a bit later and generate the word
  ;;; headers that point at them, but now we're almost ready to switch to
  ;;; proper threaded-execution, so we finish that setup first...
  ;;;

  ;;;
  ;;;   Push the value of "heap" onto the value stack so that it can be the
  ;;; breadcrumb the threaded code needs to find... the backing store of
  ;;; "heap". Yes, self-reference can be weird like that sometimes. There's
  ;;; nothing stopping "quit" from reading rdi, it just violates the
  ;;; abstraction...
  ;;;
  push.qreg rdi

  ;;;
  ;;;   We are about to set up rsi, we did rbp already, and rsp came to us
  ;;; already set up. That's all that "next" needs, so take it away!
  ;;;
  mov.qreg.qimm rsi, cold_start
  next

;;;
;;;   This isn't really a routine so much as it's an array of words (exactly
;;; one of them), which is what "next" wants rsi to point to. It's only ever
;;; used this one time, so we just put it right here.
;;;

  align 8
cold_start:
  ;;; TODO this is probably where we should deal with that "heap" that we passed
  ;;; on the stack
  ;;;   Before handing off to us, _start pushed a single value onto the stack,
  ;;; a pointer to the beginning of the heap. Now, we load our entire Forth
  ;;; implementation onto that heap, beginning with the minimal set of words
  ;;; needed to define more words. We do this because we need variables as
  ;;; infrastructure so we can eventually have dynamic definitions.
  ;;;
  ;;;   There's something non-obvious here: words implemented statically as
  ;;; part of the executable image can't contain things that vary at runtime.
  ;;; That means that even if these words tried to implement some sort of
  ;;; dynamic lookup, they would have no way to find the root of whatever
  ;;; dynamic data structure they use. Dynamism needs to be bootstrapped.
  ;;;
  ;;;   In a more traditional C-style program, static code could look up
  ;;; variables based on fixed addresses that are the same on every run.
  ;;; Failing that, we could dedicate a register to it, though that's a
  ;;; considerable expense. We chose not to do either of those things, because
  ;;; we want the versatility that comes with not being picky about our
  ;;; address space: It allows us to contemplate future improvements such as
  ;;; ASLR, or embedding into other processes that impose their own addressing
  ;;; constraints, or even coexisting with multiple versions of ourselves.
  ;;; That choice does mean we have the hard version of this bootstrapping
  ;;; problem, and copying ourselves to the heap is how we solve it.
  ;;;
  ;;;   We do have the heap address right now, though that won't last. In case
  ;;; it's unclear why not: keeping it on the stack would require all future
  ;;; references to walk the stack, and somehow know when they've reached the
  ;;; bottom. The stack is a good place to keep things with clearly delimited
  ;;; lifetimes and visibility, but when we want something to live for our
  ;;; entire program and be easy to find from any code within it, we need to
  ;;; do something else. Anyway, since we have the address, we can use it for
  ;;; the next little bit of setup.
  ;;;
  ;;;   The first few words we define are our variables, which hardcode the
  ;;; addresses they will return - but since we're doing this at runtime,
  ;;; "hardcoding" can reflect where our heap is. This is the fundamental
  ;;; trick that makes the heap usable.
  ;;;
  ;;;   One more thing to notice: We already allocated the backing stores of
  ;;; these variables, and populated their initial values, in _start. The
  ;;; words we're defining return those same addresses for the same backing
  ;;; stores. So, we have continuity: Stuff defined in terms of the
  ;;; variable-words we're defining now will interoperate with the stuff that
  ;;; we define in the "early" way, which includes those very words. Both the
  ;;; early code and the later code are dealing with the same data structures,
  ;;; they're just using a different technique to find them.
  ;;;
  ;;;   This is the only hardcoding we need to do; by building on top of it,
  ;;; we will soon reach a point where the rest of the system can be defined
  ;;; within itself.
  dq early_heap, litstring, "heap", early_variable
  dq early_s0, litstring, "s0", early_variable
  dq early_r0, litstring, "r0", early_variable
  dq early_latest, litstring, "latest", early_variable
  dq early_here, litstring, "here", early_variable

  ;;;   Now we define a heap version of docol. Strictly speaking it doesn't
  ;;; need to be among the first words, it only needs to come before the first
  ;;; words implemented in Forth. However, it's conceptually tidy to have it
  ;;; that way, so that's what we do.
  ;;;
  ;;;   Docol also presents a unique challenge, in that it's two snippets of
  ;;; code and one of them needs to refer to the other. When we use docol as
  ;;; the codeword of a word we're defining, we point to a snippet which acts
  ;;; as an interpreter for the word's body. However, when we look up "docol"
  ;;; in the dictionary, what we get is a word that returns the address of the
  ;;; interpreter snippet, effectively acting as a constant.
  ;;;
  ;;;   One way to make this work would be to use a forward-referencing
  ;;; address using the labels system. However, it turns out that only docol
  ;;; and zbranch would benefit from this, and we drastically simplify our
  ;;; code by reworking things so that no forward reference is needed.
  ;;;
  ;;;   What we do is define the interpreter snippet first, allocating space
  ;;; for the machine code directly out of "here", with no word header nor any
  ;;; dictionary entry pointing to it. We keep track of the address we put
  ;;; that at, then we define the constant to point to it.
  ;;;
  ;;;   While it may seem weird to use space that's "outside" of any word,
  ;;; keep in mind that using the heap in creative ways is part of the spirit
  ;;; of Forth. Jonesforth doesn't have this bootstapping problem, but its
  ;;; variables use this same technique of putting the value before the word
  ;;; header to avoid a forward reference. Also, words don't have end
  ;;; delimiters, so who's to say what's inside or outside them?
  dq early_here, fetch, dup
  dq rsi, pack_pushcontrol
  dq lit, 8, rax, add_reg64_imm8
  dq rax, rsi, mov_reg64_reg64
  dq pack_next
  dq lit, 8, packalign
  dq roll3, swap, early_here_store, swap
  ; Now the interpreter snippet is in-place and "here" points after it, so
  ; that future allocation won't step on it. We also still have a copy of its
  ; start address, which we will now pass to early_variable.
  dq litstring, "docol", early_variable
  ; (While it might be tidy to have a separate "early_constant", it would do
  ; the same thing. Late variables and constants will be different because the
  ; real "variable" word will also be responsible for allocating the backing
  ; store, but the only thing early_variable is doing is returning an
  ; address.)

  dq litstring, "exit", early_create, early_self_codeword, early_here, fetch
  dq rsi, pack_popcontrol, pack_next
  dq lit, 8, packalign, early_here_store

  ;;; For triage's sake, here's an inventory of everything else in the file.
  ;;;
  ;;; Macros:
  ;;;   next, beforenext, pushcontrol, popcontrol
  ;;; Assembly:
  ;;;   docol (the constant, and the actual codeword implementation)
  ;;;     needs a label
  ;;;     pushcontrol, next
  ;;;   exit
  ;;;     popcontrol, next
  ;;;   swap, drop, drop2, roll, unroll, roll3, unroll3, dup, dup2
  ;;;   add, sub, mul, divmod
  ;;;   eq, ne, gt, lt, ge, le, and, or, xor, invert
  ;;;   lit, litstring
  ;;;   store, fetch
  ;;;   addstore, substore, store8, fetch8, store16, fetch16, store32, fetch32
  ;;;   ccopy, stringlen
  ;;;     no dependencies except next for any of these
  ;;;   branch, zbranch
  ;;;     sorta needs a label but might be avoidable
  ;;;     needs next
  ;;; Forth:
  ;;;   pack64, pack32, pack16, pack8, packstring, packalign
  ;;;     only the basics above
  ;;; Assembly:
  ;;;   litpack64, litpack32, litpack16, litpack8
  ;;;     beforenext, pack*
  ;;; Forth:
  ;;;   rax, rcx, dx, rbx, rsp, rbp, rsi, rdi
  ;;;   r8, r9, r10, r11, r12, r13, r14, r15
  ;;;     only the basics above
  ;;;   reg64, extrareg64
  ;;;     only the basics plus optional emitstring and sys_exit
  ;;;     (notice that these are forward references!)
  ;;;   rex_w, rex_wb, opcodereg, modrm
  ;;;     only the basics above
  ;;;   cld, mov_reg64_imm64, mov_reg64_reg64, push_reg64, lodsq
  ;;;   jmp_abs_indirect_reg64, syscall
  ;;;     basics plus assembly helpers
  ;;; Forth, not needed on heap:
  ;;;   early_heap, early_s0, early_r0, early_latest, early_here
  ;;;   early_create, early_comma, early_self_codeword, early_here_store,
  ;;;   pack_next, early_variable
  ;;;
  ;;; It's likely that nothing past this point is required for the heap copy,
  ;;; but it's here for completeness.
  ;;;
  ;;; Forth, subject to reconsideration:
  ;;;   quit
  ;;;     quine, sys_exit (these are forward references)
  ;;; Assembly:
  ;;;   sys_exit, sys_write
  ;;;     nothing special
  ;;; Forth:
  ;;;   emitstring
  ;;;     stringlen, sys_write, basics
  ;;;   quine
  ;;;     early_here (removable), all_contents (forward)
  ;;;     sys_write, basics
  ;;;   use_label, set_label
  ;;;     only basics
  ;;; Assembly:
  ;;;   hlt
  ;;;     nothing
  ;;; Forth:
  ;;;   all_contents, elf_file_header, elf_program_header
  ;;;   output_start_routine
  ;;;     nothing special in any of these (they do want use/set_label)
  ;;;   self_raw
  ;;;     self-reference
  ;;;
  dq quit

;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Now we are in Forth ;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;;   Everything we define from here on out is an actual Forth word, with a
;;; proper header and everything. So, you'll see some more preamble before the
;;; definitions.
;;;
;;;   Keep in mind, though, that, although we have threaded execution, we
;;; don't yet have Forth-style variables. That's because the heap is at a
;;; dynamically-chosen location, so none of this read-only code that we're
;;; defining now can reference it. Before invoking cold_start, we thoughtfully
;;; put the value of "heap" on the stack for ourselves; our first task will be
;;; to dynamically allocate some words on the heap that know how to find the
;;; heap. We'll do that by defining bootstrapping versions of the
;;; word-defining words, which will eventually be replaced.

; Token pasting is possible in flatassembler, but kind of a pain.
calminstruction defword_label_helper name
  local full_name, address
  arrange full_name, name#=_name        ; do the token-pasting
  compute address, $                    ; compute the current address
  publish full_name:, address           ; bind the label
end calminstruction

; The only time we actually use this variant is docol.
macro defword_unlabeled name, flags
  align 8
  defword_label_helper name
  dq latest_word
  latest_word = $ - 8
  db flags, 0x00, `name, 0x00
  align 8
end macro

; This is the variant we use to define ordinary words.
macro defword name, flags
  defword_unlabeled name, flags
  label name
end macro

latest_word = 0

;;;
;;; Routine docol
;;; -------------
;;;
;;;   Reference this via its label as the codeword of a word to make it an
;;; "interpreted" word. Concretely, it saves rsi (the "instruction pointer")
;;; to the control stack, takes the address of the codeword from rax and
;;; increments it in-place to form the new instruction pointer, and copies
;;; that to rsi.
;;;
;;;   Having then done this, we're now in the state that normal execution
;;; expects, so docol ends by it using "next" to begin the callee's execution,
;;; kicking off a nested call.
;;;
;;;   The name is said to be short for "do colon", because Forth high-level
;;; code begins word definitions with a colon.
;;;
;;; Registers in:
;;;
;;; * rsi is the caller's instruction pointer
;;; * rbp is the control stack pointer
;;; * rax is the address of the callee's codeword
;;;
;;; Registers out:
;;;
;;; * rsi is the callee's instruction pointer
;;; * rbp is the control stack pointer
defword_unlabeled docol, 0
docol_constant:
  ; Evaluated as a word, docol is a constant which returns a pointer.
  dq $ + 8
  mov.qreg.qimm rax, docol
  push.qreg rax
  next
  align 8
docol:
  ; Since docol is not a normal word, the label points to the value we care
  ; about from the assembly side of things, which is the address we use as the
  ; codeword.
  pushcontrol rsi
  add.qreg.bimm rax, 8
  mov.qreg.qreg rsi, rax
  next

latest_word = docol_name

;;;
;;;   This is the mechanism to "return" from a word interpreted by docol.
;;; We pop the control stack, and then, since this is threaded execution, we
;;; do the next thing the caller wants to do, by inlining "next".
;;;
defword exit, 0
  dq $ + 8
  popcontrol rsi
  next


;;;
;;; Stack manipulation routines
;;; ---------------------------
;;;
;;;   We start with the three traditional stack operations, swap drop and
;;; roll. Sorry to fans of the name "ROT"; we were an HP calculator kid. It'll
;;; always be roll to us. Anyway, we do a couple other operations too. Since
;;; our goal right now is just to bootstrap the heap, we keep this short and
;;; sweet.
;;;
;;;   There is definitely plenty of optimization that could be done.
;;;

defword swap, 0
  dq $ + 8
  pop.qreg rax
  pop.qreg rbx
  push.qreg rax
  push.qreg rbx
  next

defword drop, 0
  dq $ + 8
  pop.qreg rax
  next

defword drop2, 0
  dq $ + 8
  pop.qreg rax
  pop.qreg rax
  next

; Rotates "up" (pops its parameter, n; nth item then becomes current item).
;
; We implement this the high-performance way, with rep movsq, aka the
; instruction that exists to optimize C's memcpy(). The details of setting
; that up are complex; see below.
defword roll, 0
  dq $ + 8

  ;   Pop our parameter. The rep instruction takes rcx as its count, so we
  ; reduce copying by using it to hold our count, as well.
  pop.qreg rcx

  ;   We have n - 1 items to slide, so decrement rcx. For the purpose of
  ; counting how many repetitions will happen, it's one-based. This is because
  ; the rep instruction performs a single movsq, then decrements rcx, then
  ; stops if rcx is zero.
  dec.qreg rcx

  ;   Retrieve the nth item, for later. For this purpose we're thinking in
  ;  zero-based terms, so we do this after already having decremented rcx.
  mov.qreg.indexed.qreg rbx, rsp, rcx, 8

  ;   The source address for movsq is rsi and the destination is rdi; we can
  ; use rdi as we wish, but rsi is our Forth "instruction pointer", so we must
  ; save and restore it. Doing so alters rsp, so we have to adjust the address
  ; calculations by eight bytes as compared to the expressions above, but
  ; happily we can use the disp8 field to do that. We'd be using disp8 anyway
  ; because it's helpful.
  push.qreg rsi

  ;   Now we set up parameters for the memory-sliding operation. We have
  ; n - 1 items to copy, moving the range rsp through rsp + (n-2)*8 onto the
  ; range rsp + 8 through rsp + (n-1)*8. That's with the value of rsp as it
  ; exists at this moment (it's going to change soon).
  ;
  ;   We're sliding them upwards in memory, so we start at the high end so
  ; that we're always moving into a location that doesn't have anything
  ; precious. We use lea as a convenient way to do the stack math.
  ;
  ;   When rcx is 1, we want rsp + 8.
  lea.qreg.indexed.qreg rsi, rsp, rcx, 8
  ;   When rcx is 1, we want rsp + 16.
  lea.qreg.disp8.indexed.qreg rdi, 8, rsp, rcx, 8
  ;
  ;   Using rcx = 1 is the most convenient example to use for figuring out the
  ; arithmetic. It's a linear relationship, so as long as we get the 8-byte
  ; stride correct, we just need to pick a single point and verify that our
  ; math is right for that point, and it'll be right for any value of rcx.

  ;   Another of our Forth conventions is that the DF flag should be kept at
  ; zero, which directs string instruction to increment rsi. Here, however,
  ; because our source and destination ranges overlap, we need to start at the
  ; high end, which means we need it to decrement. So we set DF to one, and
  ; we'll clear it after.
  std
  rep movsq

  ; Set everything back.
  cld
  pop.qreg rsi

  ; There is now an extra item at the low end of the stack (the top) that
  ; needs to go away, and coincidentally we have a value in rbx that needs to
  ; be in that spot. Rather than doing a drop and push, we overwrite it, to
  ; save a little work.
  mov.indirect.qreg.qreg rsp, rbx

  ; All done, wow! What a mouthful.
  next

; Rotates "down" (pops its parameter, n; current item then becomes nth item).
;
; We implement this the high-performance way, with rep movsq, aka the
; instruction that exists to optimize C's memcpy(). The details of setting
; that up are complex; see below.
defword unroll, 0
  dq $ + 8

  ;   Pop our parameter. The rep instruction takes rcx as its count, so we
  ; reduce copying by using it to hold our count, as well.
  pop.qreg rcx

  ; We have n - 1 items to slide, so decrement rcx. Also, save a copy of it in
  ; rdx after doing that, for later.
  dec.qreg rcx
  mov.qreg.qreg rdx, rcx

  ;   Retrieve the 0th item, for later.
  mov.qreg.indirect.qreg rbx, rsp

  ;   Now we set up parameters for the memory-sliding operation. We have
  ; n - 1 items to copy, moving the range rsp + 8 through rsp + (n-1)*8 onto
  ; the range rsp through rsp + (n-2)*8. That's with the value of rsp as it
  ; exists at this moment (it's going to change soon).
  ;
  ;   We're sliding them downwards in memory, so we start at the low end so
  ; that we're always moving into a location that doesn't have anything
  ; precious. We use lea as a convenient way to do the stack math.
  ;
  ;   As with roll, we need to save rsi and adjust those rsp calculations
  ; accordingly.
  push.qreg rsi

  ;   Regardless of rcx, we want rsp + 16.
  lea.qreg.disp8.qreg rsi, 16, rsp
  ;   Regardless of rcx, we want rsp + 8.
  lea.qreg.disp8.qreg rdi, 8, rsp

  ;   With roll, we were starting at the high end. Here, we start at the low
  ; end, which means we need rsi to increment after each repetition. That's
  ; what it does when the DF flag is clear, and another of our Forth
  ; conventions is to keep it clear normally. So, we don't have to touch DF!
  ; Yay!
  rep movsq

  ; Restore our original rsi.
  pop.qreg rsi

  ; There is now an extra item in the middle of the stack, at the high end of
  ; the sliding we did, that needs to be overwritten with our value in rbx.
  ; Since we destructively updated our count in rcx, we saved a copy of the
  ; count in rdx, and we use that to find the right address.
  ;
  ; When the original count was n, we want rsp + (n-1)*8, so we saved rdx
  ; after decrementing rcx, above.
  mov.indexed.qreg.qreg rsp, rdx, 8, rbx

  ; All done, wow! What a mouthful.
  next

; Rotates "up" (third item becomes current item)
defword roll3, 0
  dq $ + 8
  pop.qreg rax
  pop.qreg rbx
  pop.qreg rcx
  push.qreg rbx
  push.qreg rax
  push.qreg rcx
  next

; Rotates "down" (current item becomes third item)
defword unroll3, 0
  dq $ + 8
  pop.qreg rax
  pop.qreg rbx
  pop.qreg rcx
  push.qreg rax
  push.qreg rcx
  push.qreg rbx
  next

defword dup, 0
  dq $ + 8
  pop.qreg rax
  push.qreg rax
  push.qreg rax
  next

defword dup2, 0
  dq $ + 8
  pop.qreg rax
  pop.qreg rbx
  push.qreg rbx
  push.qreg rax
  push.qreg rbx
  push.qreg rax
  next

;;;
;;; Arithmetic routines
;;; -------------------
;;;
;;;   No surprises here. Again, since our goal is to bootstrap the heap, we
;;; keep it short. Also again, this is nowhere near optimal.
;;;

defword add, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  add.qreg.qreg rax, rbx
  push.qreg rax
  next

defword sub, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  sub.qreg.qreg rax, rbx
  push.qreg rax
  next

defword mul, 0
  dq $ + 8
  pop.qreg rax
  pop.qreg rbx
  mul.rax.qreg rbx
  push.qreg rax
  next

defword divmod, 0
  dq $ + 8
  xor.qreg.qreg rdx, rdx         ; rdx is the high bits of the input; zero it
  pop.qreg rbx
  pop.qreg rax
  div.rdxrax.qreg rbx
  push.qreg rdx                  ; remainder
  push.qreg rax                  ; quotient
  next

;;;
;;; Comparison routines
;;; -------------------
;;;

defword eq, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  cmp.qreg.qreg rax, rbx
  set.breg.cc al, equal
  and.qreg.bimm rax, 0x01
  push.qreg rax
  next

defword ne, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  cmp.qreg.qreg rax, rbx
  set.breg.cc al, not_equal
  and.qreg.bimm rax, 0x01
  push.qreg rax
  next

defword gt, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  cmp.qreg.qreg rax, rbx
  set.breg.cc al, greater
  and.qreg.bimm rax, 0x01
  push.qreg rax
  next

; Is the top of the stack less than the second item in the stack?
defword lt, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  cmp.qreg.qreg rax, rbx
  set.breg.cc al, less
  and.qreg.bimm rax, 0x01
  push.qreg rax
  next

defword ge, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  set.breg.cc al, greater_equal
  cmp.qreg.qreg rax, rbx
  push.qreg rax
  next

defword le, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  set.breg.cc al, less_equal
  cmp.qreg.qreg rax, rbx
  push.qreg rax
  next

;;;
;;; Bitwise routines
;;; ----------------
;;;

defword and, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  and.qreg.qreg rax, rbx
  push.qreg rax
  next

defword or, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  or.qreg.qreg rax, rbx
  push.qreg rax
  next

defword xor, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  xor.qreg.qreg rax, rbx
  push.qreg rax
  next

; The HP overloads the name "not", so we follow the Forth convention.
defword invert, 0
  dq $ + 8
  pop.qreg rax
  not.qreg rax
  push.qreg rax
  next

;;;
;;; Routine lit
;;; ------------
;;;

defword lit, 0
  dq $ + 8
  lodsq
  push.qreg rax
  next

defword litstring, 0
  dq $ + 8
  ; The string immediately follows the codeword in memory, so rsi is already
  ; pointing to it. That address will be our returned result, so we push it to
  ; the stack.
  push.qreg rsi

  ; Now we need to skip over the string, so that rsi will be valid for the
  ; next Forth word. To do that, we're going to do a string operation with
  ; scasb, which takes rdi as the address to look at. This means scasb is
  ; treating its operand as analogous to the destination operand of movs*.
  mov.qreg.qreg rdi, rsi
  ; We want to compare for equality with zero; scasb looks in al for the other
  ; half of the comparison, so we clear rax.
  xor.qreg.qreg rax, rax
  ; Counterintuitively, we do need to pass a count. We pass -1, which will
  ; always work.
  xor.qreg.qreg rcx, rcx
  not.qreg rcx
  ; The DF flag is zero per our Forth execution-model convention, which means
  ; scasb increments rdi after each iteration. This is what we want, so
  ; there's no need to mess with it.
  ;
  ; We also need to worry about ZF, but fortunately that "not" will make sure
  ; it's clear.
  ;
  ; So, we're ready; do it!
  repnz scasb
  ; The scasb completes, incrementing rdi, before the repnz checks its
  ; condition. So, rdi is now pointing immediately after the terminating null
  ; byte. Of course, we want this in rsi for Forth's purposes.
  mov.qreg.qreg rsi, rdi

  ; Finally, we need to align rsi to the next word boundary.
  add.qreg.bimm rsi, 7
  and.qreg.bimm rsi, NOT 7

  next

;;;
;;; Memory access routines
;;; ----------------------
;;;
;;;   We go with Forth names for this stuff. The HP's names for memory and
;;; storage operations heavily leverage the fact they have an object system
;;; with type tags and so on; we want to stay close to the bytes.
;;;

; Address on the top of the stack, value in the second position
defword store, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  mov.indirect.qreg.qreg rbx, rax
  next

defword fetch, 0
  dq $ + 8
  pop.qreg rax
  mov.qreg.indirect.qreg rax, rax
  push.qreg rax
  next

; Address on top, value second
; I might have done it the other way, but this is what Jonesforth does and it
; seems reasonable enough.
defword addstore, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  add.indirect.qreg.qreg rbx, rax
  next

defword substore, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  sub.indirect.qreg.qreg rbx, rax
  next

defword store8, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  mov.indirect.qreg.breg rbx, al
  next

defword fetch8, 0
  dq $ + 8
  pop.qreg rbx
  xor.qreg.qreg rax, rax
  mov.breg.indirect.qreg al, rbx
  next

defword store16, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  mov.indirect.qreg.wreg rbx, ax
  next

defword fetch16, 0
  dq $ + 8
  pop.qreg rbx
  xor.qreg.qreg rax, rax
  mov.wreg.indirect.qreg ax, rbx
  next

defword store32, 0
  dq $ + 8
  pop.qreg rbx
  pop.qreg rax
  mov.indirect.qreg.dreg rbx, eax
  next

defword fetch32, 0
  dq $ + 8
  pop.qreg rbx
  xor.qreg.qreg rax, rax
  mov.dreg.indirect.qreg eax, rbx
  next

;   Copy one non-overlapping block of memory over another. For the overlapping
; logic, see roll and unroll. This always copies at byte granularity, for ease
; of implementation.
;
;   Jonesforth calls this CMOVE and has a ccopy which is a single byte. POSIX,
; however, has memcpy() for non-overlapping blocks and memmove() for
; overlapping blocks. We follow the latter convention, because it feels like
; the more important distinction.
;
; Stack in:
;   destination
;   source
;   length
; (top)
defword ccopy, 0
  dq $ + 8
  ; We need to save and restore rsi; the other registers we can trample.
  mov.qreg.qreg rdx, rsi
  pop.qreg rcx
  pop.qreg rsi
  pop.qreg rdi
  ; We start from the low end, since that's easier arithmetic. So, we get to
  ; leave the DF flag alone.
  rep movsb
  mov.qreg.qreg rsi, rdx
  next

; Stack in:
;   string address
; Stack out:
;   string length including null byte
defword stringlen, 0
  dq $ + 8
  pop.qreg rdi
  mov.qreg.qreg rbx, rdi
  xor.qreg.qreg rax, rax
  xor.qreg.qreg rcx, rcx
  not.qreg rcx
  repnz scasb
  sub.qreg.qreg rdi, rbx
  push.qreg rdi
  next


;;;;;;;;;;;;;;;;;
;;; Branching ;;;
;;;;;;;;;;;;;;;;;

;   This takes a number of bytes, not machine words. That allows it to be used
; for putting weird things embedded in the code.
;
;   The offset is relative to the start of the word the number of bytes is in,
; so, make sure to have it skip itself.
defword branch, 0
  dq $ + 8
  add.qreg.indirect.qreg rsi, rsi
  next

; This should probably be 0branch, but right now the auto-label code is picky.
defword zbranch, 0
  dq $ + 8
  pop.qreg rax
  test.qreg.qreg rax, rax
  ; Please notice the 8-bit branch to the nearby word.
  jmp.cc.rel.bimm zero, branch + 8 - zbranch_after_jmp
zbranch_after_jmp:
  lodsq                          ; just a convenient way to skip rsi forward
  next


;;;;;;;;;;;;;;;;;;;;;;
;;; Output helpers ;;;
;;;;;;;;;;;;;;;;;;;;;;
;;;
;;;   These routines are for building up data structures in-memory. Sometimes
;;; they're used for structures that are meant to stay in memory; other times
;;; it's a buffer that will become output.
;;;
;;;   The general pattern is that each routine takes an output address and
;;; some specific datum, and returns the output address adjusted to point
;;; after the new datum. That makes them easy to chain together.

; In: base address, value
; Out: new base address
defword pack64, 0
  dq docol
  dq swap, dup, unroll3, store, lit, 8, add
  dq exit
defword pack32, 0
  dq docol
  dq swap, dup, unroll3, store32, lit, 4, add
  dq exit
defword pack16, 0
  dq docol
  dq swap, dup, unroll3, store16, lit, 2, add
  dq exit
defword pack8, 0
  dq docol
  dq swap, dup, unroll3, store8, lit, 1, add
  dq exit

;   This works on C-style strings, which are characters followed by a null
; terminator. The packed data includes the null terminator.
;
; Stack in:
;   base address, string pointer
; Stack out:
;   new base address
defword packstring, 0
  dq docol
  dq dup, stringlen, dup
  ; base/destination, source, length, length
  dq lit, 4, roll, dup, lit, 5, unroll
  ; destination, source, length, length, base/destination
  dq add, lit, 4, unroll
  ; new base, destination, source, length
  dq ccopy
  dq exit

; Stack in:
;   base address
;   byte size
; Stack out:
;   new base address
defword packalign, 0
  dq docol
  dq dup2, divmod, drop, zbranch, 8*8
  dq swap, lit, 0, pack8, swap
  dq branch, -11*8
  dq drop, exit

; In the interests of reducing our executable's size, since a lot of it goes
; to pack* invocations, we define words that combine lit with pack*. This
; shaves roughly 700 bytes as of when it was added.
defword litpack64, 0
  dq $ + 8
  lodsq
  push.qreg rax
  beforenext pack64
defword litpack32, 0
  dq $ + 8
  lodsq
  push.qreg rax
  beforenext pack32
defword litpack16, 0
  dq $ + 8
  lodsq
  push.qreg rax
  beforenext pack16
defword litpack8, 0
  dq $ + 8
  lodsq
  push.qreg rax
  beforenext pack8


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Assembly language, but in Forth ;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;;   It's all backwards and stuff.
;;;
;;;   Okay, but seriously, the convention is: target on the top of the stack,
;;; source behind it. This is similar to how the Forth "store" and "fetch"
;;; words work.
;;;
;;;   These routines use the output helpers, defined above.  They're called in
;;; the same way: an output address, followed by data items specific to what's
;;; being output.

; We define a bunch of keywords, which evaluate to their own codeword
; addresses.
defword rax, 0
  dq docol, lit, rax, exit
defword rcx, 0
  dq docol, lit, rcx, exit
defword rdx, 0
  dq docol, lit, rdx, exit
defword rbx, 0
  dq docol, lit, rbx, exit
defword rsp, 0
  dq docol, lit, rsp, exit
defword rbp, 0
  dq docol, lit, rbp, exit
defword rsi, 0
  dq docol, lit, rsi, exit
defword rdi, 0
  dq docol, lit, rdi, exit
defword r8, 0
  dq docol, lit, r8, exit
defword r9, 0
  dq docol, lit, r9, exit
defword r10, 0
  dq docol, lit, r10, exit
defword r11, 0
  dq docol, lit, r11, exit
defword r12, 0
  dq docol, lit, r12, exit
defword r13, 0
  dq docol, lit, r13, exit
defword r14, 0
  dq docol, lit, r14, exit
defword r15, 0
  dq docol, lit, r15, exit
defword eax, 0
  dq docol, lit, rax, exit
defword ecx, 0
  dq docol, lit, rcx, exit
defword edx, 0
  dq docol, lit, rdx, exit
defword ebx, 0
  dq docol, lit, rbx, exit
defword esp, 0
  dq docol, lit, rsp, exit
defword ebp, 0
  dq docol, lit, rbp, exit
defword esi, 0
  dq docol, lit, rsi, exit
defword edi, 0
  dq docol, lit, rdi, exit
defword ax, 0
  dq docol, lit, rax, exit
defword cx, 0
  dq docol, lit, rcx, exit
defword dx, 0
  dq docol, lit, rdx, exit
defword bx, 0
  dq docol, lit, rbx, exit
defword sp, 0
  dq docol, lit, rsp, exit
defword bp, 0
  dq docol, lit, rbp, exit
defword si, 0
  dq docol, lit, rsi, exit
defword di, 0
  dq docol, lit, rdi, exit
defword al, 0
  dq docol, lit, rax, exit
defword cl, 0
  dq docol, lit, rcx, exit
defword dl, 0
  dq docol, lit, rdx, exit
defword bl, 0
  dq docol, lit, rbx, exit
defword ah, 0
  dq docol, lit, rsp, exit
defword ch, 0
  dq docol, lit, rbp, exit
defword dh, 0
  dq docol, lit, rsi, exit
defword bh, 0
  dq docol, lit, rdi, exit

; Stack in:
;   register name
; Stack out:
;   3-bit encoded value for register
defword reg64, 0
  dq docol
  dq dup, rax, eq, zbranch, 5*8, drop, lit, 0, exit
  dq dup, rcx, eq, zbranch, 5*8, drop, lit, 1, exit
  dq dup, rdx, eq, zbranch, 5*8, drop, lit, 2, exit
  dq dup, rbx, eq, zbranch, 5*8, drop, lit, 3, exit
  dq dup, rsp, eq, zbranch, 5*8, drop, lit, 4, exit
  dq dup, rbp, eq, zbranch, 5*8, drop, lit, 5, exit
  dq dup, rsi, eq, zbranch, 5*8, drop, lit, 6, exit
  dq dup, rdi, eq, zbranch, 5*8, drop, lit, 7, exit
  dq litstring, "Parameter to reg64 is not a reg64.", emitstring
  dq lit, 1, sys_exit

; Stack in:
;   register name
; Stack out:
;   3-bit encoded value for register
defword extrareg64, 0
  dq docol
  dq dup, r8, eq, zbranch, 5*8, drop, lit, 0, exit
  dq dup, r9, eq, zbranch, 5*8, drop, lit, 1, exit
  dq dup, r10, eq, zbranch, 5*8, drop, lit, 2, exit
  dq dup, r11, eq, zbranch, 5*8, drop, lit, 3, exit
  dq dup, r12, eq, zbranch, 5*8, drop, lit, 4, exit
  dq dup, r13, eq, zbranch, 5*8, drop, lit, 5, exit
  dq dup, r14, eq, zbranch, 5*8, drop, lit, 6, exit
  dq dup, r15, eq, zbranch, 5*8, drop, lit, 7, exit
  dq litstring, "Parameter to extrareg64 is not an extrareg64.", emitstring
  dq lit, 1, sys_exit

; Stack in:
;   register name
; Stack out:
;   3-bit encoded value for register
defword reg32, 0
  dq docol
  dq dup, eax, eq, zbranch, 5*8, drop, lit, 0, exit
  dq dup, ecx, eq, zbranch, 5*8, drop, lit, 1, exit
  dq dup, edx, eq, zbranch, 5*8, drop, lit, 2, exit
  dq dup, ebx, eq, zbranch, 5*8, drop, lit, 3, exit
  dq dup, esp, eq, zbranch, 5*8, drop, lit, 4, exit
  dq dup, ebp, eq, zbranch, 5*8, drop, lit, 5, exit
  dq dup, esi, eq, zbranch, 5*8, drop, lit, 6, exit
  dq dup, edi, eq, zbranch, 5*8, drop, lit, 7, exit
  dq litstring, "Parameter to reg32 is not a reg32.", emitstring
  dq lit, 1, sys_exit

; Stack in:
;   register name
; Stack out:
;   3-bit encoded value for register
defword reg16, 0
  dq docol
  dq dup, ax, eq, zbranch, 5*8, drop, lit, 0, exit
  dq dup, cx, eq, zbranch, 5*8, drop, lit, 1, exit
  dq dup, dx, eq, zbranch, 5*8, drop, lit, 2, exit
  dq dup, bx, eq, zbranch, 5*8, drop, lit, 3, exit
  dq dup, sp, eq, zbranch, 5*8, drop, lit, 4, exit
  dq dup, bp, eq, zbranch, 5*8, drop, lit, 5, exit
  dq dup, si, eq, zbranch, 5*8, drop, lit, 6, exit
  dq dup, di, eq, zbranch, 5*8, drop, lit, 7, exit
  dq litstring, "Parameter to reg16 is not a reg16.", emitstring
  dq lit, 1, sys_exit

; Stack in:
;   register name
; Stack out:
;   3-bit encoded value for register
defword reg8, 0
  dq dup, al, eq, zbranch, 5*8, drop, lit, 0, exit
  dq dup, cl, eq, zbranch, 5*8, drop, lit, 1, exit
  dq dup, dl, eq, zbranch, 5*8, drop, lit, 2, exit
  dq dup, bl, eq, zbranch, 5*8, drop, lit, 3, exit
  dq dup, ah, eq, zbranch, 5*8, drop, lit, 4, exit
  dq dup, ch, eq, zbranch, 5*8, drop, lit, 5, exit
  dq dup, dh, eq, zbranch, 5*8, drop, lit, 6, exit
  dq dup, bh, eq, zbranch, 5*8, drop, lit, 7, exit
  dq litstring, "Parameter to reg8 is not a reg8.", emitstring
  dq lit, 1, sys_exit

; Stack in:
;   scale factor, as a count of bytes
; Stack out:
;   2-bit encoded value for scale field in SIB byte
defword scalefield, 0
  dq docol
  dq dup, 1, eq, zbranch, 5*8, drop, lit, 0, exit
  dq dup, 2, eq, zbranch, 5*8, drop, lit, 1, exit
  dq dup, 4, eq, zbranch, 5*8, drop, lit, 2, exit
  dq dup, 8, eq, zbranch, 5*8, drop, lit, 3, exit
  dq litstring, "Parameter to scalefield is not 1, 2, 4, or 8.", emitstring
  dq lit, 1, sys_exit

; Stack:
;   output point
defword rex_w, 0
  dq docol, lit, 0x48, pack8, exit
defword rex_wb, 0
  dq docol, lit, 0x49, pack8, exit

; Stack:
;   output point
;   3-bit encoded value for register
;   opcode byte
defword opcodereg, 0
  dq docol, or, pack8, exit

;   The low-level word that outputs a modrm byte given fully-processed,
; numeric values for its fields. Most code will want to call one of the
; higher-level modrm_* words, instead.
;
; Stack
;   output point
;   mode ("mod") field
;   register/opcode field
;   register/memory ("RM") field
defword modrm, 0
  dq docol, swap, lit, 8, mul, or, swap, lit, 64, mul, or, pack8, exit

; Stack
;   output point
;   scale field
;   index field
;   base field
defword sib, 0
  dq docol, swap, lit, 8, mul, or, swap, lit, 64, mul, or, pack8, exit

;   The simplest of the modrm modes: Direct register addressing. There are no
; special cases to check.
;
;   It's important to notice that the R/M field may describe either a source,
; or a target, depending on what the instruction is. So, this helper doesn't
; get to know that. It also doesn't get to know whether the value in the
; reg/op field describes a register, or if instead it's an extension of the
; opcode. The caller is responsible for figuring that all out.
;
; Stack:
;   output point
;   reg/op field value (raw number)
;   reg/mem field register name
defword addressing_reg64, 0
  dq docol, reg64, lit, 3, unroll3, modrm, exit

;   This is a helper for assembly instructions that want to do a form of
; addressing that requires a value of 1 in the modrm byte's mode field, and
; do not want to do any indexing.
;
;   Its main responsibility is to deal with the scenario that requires an SIB
; byte, which happens when the R/M field has a value of 4, which would
; otherwise refer to the register rsp. In that situation, it also generates
; an SIB byte which indicates a scale of 1, no indexing, and rsp as the base
; register.
;
; Stack:
;   output point
;   reg/op field value (raw number)
;   reg/mem field register name
defword addressing_indirect_reg64, 0
  dq docol
  ; Exit with an error if the R/M register is rbp.
  dq dup, rbp, ne, zbranch, 23*8
  ; Check whether the R/M register is rsp; save the test result for later.
  dq dup, rsp, eq, lit, 4, unroll
  dq reg64, lit, 0, unroll3, modrm
  ; If the R/M register was rsp, we need an SIB byte; otherwise, skip it.
  dq swap, zbranch, 8*8, lit, 0, lit, 4, rsp, reg64, sib
  dq exit
  dq litstring, "R/M parameter to addressing_indirect_reg64 is rbp."
  dq emitstring
  dq lit, 1, sys_exit

; Stack:
;   output point
;   reg/op field value (raw number)
;   reg/mem field register name
;   displacement value
defword addressing_disp8_reg64, 0
  dq docol
  ; This mode can do rbp fine, so no need to check for that.
  ; Check whether the R/M register is rsp; save the test result for later.
  dq swap, dup, rsp, eq, lit, 5, unroll, swap
  ; Stash the displacement value out of the way, too.
  dq lit, 4, unroll
  dq reg64, lit, 1, unroll3, modrm
  ; If the R/M register was rsp, we need an SIB byte; otherwise, skip it.
  dq roll3, zbranch, 8*8, lit, 0, lit, 4, rsp, reg64, sib
  ; The displacement byte.
  dq swap, pack8
  dq exit

; Stack:
;   output point
;   reg/op field value (raw number)
;   scale factor, as a count of bytes
;   index register name
;   base field register name
defword addressing_indexed_reg64, 0
  dq docol
  ; Exit with an error if the base register is rbp.
  dq dup, rbp, ne, zbranch, 17*8
  ; Reg/mem value 4 means to use an SIB byte (at least, with this mode).
  dq lit, 5, roll, lit, 0, lit, 6, roll, lit, 4, modrm, lit, 4, unroll
  dq reg64, unroll3, reg64, unroll3, scalefield, unroll3, sib
  dq exit
  dq litstring, "Base parameter to addressing_indexed_reg64 is rbp."
  dq emitstring
  dq lit, 1, sys_exit

; Stack:
;   output point
;   reg/op field value (raw number)
;   scale factor, as a count of bytes
;   index register name
;   base field register name
;   displacement value
defword addressing_disp8_indexed_reg64, 0
  dq docol
  ; This mode can do rbp fine, so no need to check for that.
  ; Reg/mem value 4 means to use an SIB byte (at least, with this mode).
  dq lit, 6, roll, lit, 1, lit, 7, roll, lit, 4, modrm, lit, 5, unroll
  dq lit, 5, unroll, reg64, unroll3, reg64, unroll3, scalefield, unroll3, sib
  dq swap, pack8
  dq exit

; Stack:
;   output point
defword cld, 0
  dq docol, lit, 0xFC, pack8, exit

; Stack:
;   output point
defword std, 0
  dq docol, lit, 0xFD, pack8, exit

; Stack:
;   output point
;   immediate value
;   register name
defword mov_reg64_imm64, 0
  dq docol
  dq roll3, rex_w, swap, reg64, lit, 0xB8, opcodereg, swap, pack64
  dq exit
defword mov_extrareg64_imm64, 0
  dq docol
  dq roll3, rex_wb, swap, extrareg64, lit, 0xB8, opcodereg, swap, pack64
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
defword mov_reg64_reg64, 0
  dq docol
  dq roll3, rex_w, lit, 0x89, pack8, unroll3
  dq swap, reg64, swap, addressing_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
defword mov_indirect_reg64_reg64, 0
  dq docol
  dq roll3, rex_w, lit, 0x89, pack8, unroll3
  dq swap, reg64, swap, addressing_indirect_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
;   target displacement value
defword mov_disp8_reg64_reg64, 0
  dq docol
  dq lit, 4, roll, rex_w, lit, 0x89, pack8, lit, 4, unroll
  dq roll3, reg64, unroll3, addressing_disp8_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
defword mov_reg64_indirect_reg64, 0
  dq docol
  dq roll3, rex_w, lit, 0x8B, pack8, unroll3
  dq reg64, swap, addressing_indirect_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   source displacement value
;   target register name
defword mov_reg64_disp8_reg64, 0
  dq docol
  dq lit, 4, roll, rex_w, lit, 0x8B, pack8, lit, 4, unroll
  dq reg64, unroll3, addressing_disp8_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   source index register name
;   source index scale factor, as a count of bytes
;   target register name
defword mov_reg64_indexed_reg64, 0
  dq docol
  dq lit, 5, roll, rex_w, lit, 0x8B, pack8, lit, 5, unroll
  dq reg64, lit, 4, unroll, unroll3, swap, addressing_indexed_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
defword mov_indirect_reg64_reg32, 0
  dq docol
  dq roll3, lit, 0x89, pack8, unroll3
  dq swap, reg32, swap, addressing_indirect_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
;   target displacement value
defword mov_disp8_reg64_reg32, 0
  dq docol
  dq lit, 4, roll, lit, 0x89, pack8, lit, 4, unroll
  dq roll3, reg32, unroll3, addressing_disp8_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
defword mov_reg32_indirect_reg64, 0
  dq docol
  dq roll3, lit, 0x8B, pack8, unroll3
  dq reg32, swap, addressing_indirect_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   source displacement value
;   target register name
defword mov_reg32_disp8_reg64, 0
  dq docol
  dq lit, 4, roll, lit, 0x8B, pack8, lit, 4, unroll
  dq reg32, unroll3, addressing_disp8_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
defword mov_indirect_reg64_reg16, 0
  dq docol
  dq roll3, lit, 0x66, pack8, lit, 0x89, pack8, unroll3
  dq swap, reg16, swap, addressing_indirect_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
;   target displacement value
defword mov_disp8_reg64_reg16, 0
  dq docol
  dq lit, 4, roll, lit, 0x66, pack8, lit, 0x89, pack8, lit, 4, unroll
  dq roll3, reg16, unroll3, addressing_disp8_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
defword mov_reg16_indirect_reg64, 0
  dq docol
  dq roll3, lit, 0x66, pack8, lit, 0x8B, pack8, unroll3
  dq reg16, swap, addressing_indirect_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   source displacement value
;   target register name
defword mov_reg16_disp8_reg64, 0
  dq docol
  dq lit, 4, roll, lit, 0x66, pack8, lit, 0x8B, pack8, lit, 4, unroll
  dq reg16, unroll3, addressing_disp8_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
defword mov_indirect_reg64_reg8, 0
  dq docol
  dq roll3, lit, 0x88, pack8, unroll3
  dq swap, reg8, swap, addressing_indirect_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
;   target displacement value
defword mov_disp8_reg64_reg8, 0
  dq docol
  dq lit, 4, roll, lit, 0x88, pack8, lit, 4, unroll
  dq roll3, reg8, unroll3, addressing_disp8_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
defword mov_reg8_indirect_reg64, 0
  dq docol
  dq roll3, pack8, lit, 0x8A, pack8, unroll3
  dq reg8, swap, addressing_indirect_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   source displacement value
;   target register name
defword mov_reg8_disp8_reg64, 0
  dq docol
  dq lit, 4, roll, pack8, lit, 0x8A, pack8, lit, 4, unroll
  dq reg8, unroll3, addressing_disp8_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   source displacement value
;   target register name
defword lea_reg64_disp8_reg64, 0
  dq docol
  dq lit, 4, roll, rex_w, lit, 0x8D, pack8, lit, 4, unroll
  dq reg64, unroll3, addressing_disp8_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   source index register name
;   source index scale factor, as a count of bytes
;   target register name
defword lea_reg64_indexed_reg64, 0
  dq docol
  dq lit, 5, roll, rex_w, lit, 0x8D, pack8, lit, 5, unroll
  dq reg64, lit, 4, unroll, unroll3, swap, addressing_indexed_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   source index register name
;   source index scale factor, as a count of bytes
;   source displacement value
;   target register name
defword lea_reg64_disp8_indexed_reg64, 0
  dq docol
  dq lit, 6, roll, rex_w, lit, 0x8D, pack8, lit, 6, unroll
  dq reg64, lit, 5, unroll, lit, 3, roll, lit, 4, roll, lit, 3, roll
  dq addressing_disp8_indexed_reg64
  dq exit

; Stack:
;   output point
;   source register name
defword push_reg64, 0
  dq docol, reg64, lit, 0x50, opcodereg, exit

; Stack:
;   output point
;   target register name
defword pop_reg64, 0
  dq docol, reg64, lit, 0x58, opcodereg, exit

; Stack:
;   output point
defword lodsq, 0
  dq docol, rex_w, lit, 0xAD, pack8, exit

; Stack:
;   output point
;   source register name
;   target register name
defword add_reg64_reg64, 0
  dq docol
  dq roll3, rex_w, lit, 0x01, pack8, unroll3
  dq swap, reg64, swap, addressing_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
defword add_indirect_reg64_reg64, 0
  dq docol
  dq roll3, rex_w, lit, 0x01, pack8, unroll3
  dq swap, reg64, swap, addressing_indirect_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
defword add_reg64_indirect_reg64, 0
  dq docol
  dq roll3, rex_w, lit, 0x03, pack8, unroll3
  dq reg64, swap, addressing_indirect_reg64
  dq exit

; Stack:
;   output point
;   source value
;   target register name
defword add_reg64_imm8, 0
  dq docol
  dq roll3, rex_w, lit, 0x83, pack8, swap, lit, 0, swap, addressing_reg64
  dq swap, pack8
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
defword sub_reg64_reg64, 0
  dq docol
  dq roll3, rex_w, lit, 0x2B, pack8, unroll3
  dq swap, reg64, swap, addressing_reg64

; Stack:
;   output point
;   source register name
;   target register name
defword sub_indirect_reg64_reg64, 0
  dq docol
  dq roll3, rex_w, lit, 0x2B, pack8, unroll3
  dq swap, reg64, swap, addressing_indirect_reg64

; Stack:
;   output point
;   source register name
;   target register name
defword mul_reg64_reg64, 0
  dq docol
  dq roll3, rex_w, lit, 0xF7, pack8, unroll3
  dq swap, reg64, swap, addressing_reg64
  dq exit

;   The dividend is 128 bits, and is formed from rdx as the high half and rax
; as the low half. The divisor is a specified register. The quotient is
; returned in rax, truncated towards zero. The remainder is in rdx. This
; entire process is unsigned.
;
;   The official mnemonic for this is "div", but divmod is what it does.
;
; Stack:
;   output point
;   divisor register name
defword divmod_reg64, 0
  dq docol
  dq swap, rex_w, lit, 0xF7, pack8, swap
  dq lit, 6, swap, addressing_reg64

; Same as divmod, but signed.
;
; Stack:
;   output point
;   divisor register name
defword idivmod_reg64, 0
  dq docol
  dq swap, rex_w, lit, 0xF7, pack8, swap
  dq lit, 7, swap, addressing_reg64

; Stack:
;   output point
;   target register name
defword inc_reg64, 0
  dq docol
  dq swap, rex_w, lit, 0xFF, pack8, swap, lit, 0, swap, addressing_reg64
  dq exit

; Stack:
;   output point
;   target register name
defword dec_reg64, 0
  dq docol
  dq swap, rex_w, lit, 0xFF, pack8, swap, lit, 1, swap, addressing_reg64
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
defword and_reg64_reg64, 0
  dq docol
  dq roll3, rex_w, lit, 0x23, pack8, unroll3
  dq reg64, swap, addressing_reg64
  dq exit

; Stack:
;   output point
;   source value
;   target register name
defword and_reg64_imm8, 0
  dq docol
  dq roll3, rex_w, lit, 0x83, pack8, swap
  dq lit, 4, swap, addressing_reg64
  dq swap, pack8
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
defword or_reg64_reg64, 0
  dq docol
  dq roll3, rex_w, lit, 0x0B, pack8, unroll3
  dq reg64, swap, addressing_reg64
  dq exit

; Stack:
;   output point
;   source value
;   target register name
defword or_reg64_imm8, 0
  dq docol
  dq roll3, rex_w, lit, 0x83, pack8, swap
  dq lit, 1, swap, addressing_reg64
  dq swap, pack8
  dq exit

; Stack:
;   output point
;   source register name
;   target register name
defword xor_reg64_reg64, 0
  dq docol
  dq roll3, rex_w, lit, 0x33, pack8, unroll3
  dq reg64, swap, addressing_reg64
  dq exit

; Stack:
;   output point
;   target register name
defword not_reg64, 0
  dq docol
  dq swap, rex_w, lit, 0xF7, pack8
  dq swap, lit, 2, swap, addressing_reg64
  dq exit

; Stack:
;   output point
;   left register name
;   right register name
defword cmp_reg64_reg64, 0
  dq docol
  dq roll3, rex_w, 0x3B, pack8, unroll3
  dq swap, reg64, swap, addressing_reg64
  dq exit

; Stack:
;   output point
;   left register name
;   right register name
defword test_reg64_reg64, 0
  dq docol
  dq roll3, rex_w, 0x85, pack8, unroll3
  dq swap, reg64, swap, addressing_reg64
  dq exit

; Stack:
;   output point
;   register name
defword jmp_abs_indirect_reg64, 0
  dq docol
  dq swap, lit, 0xFF, pack8, swap
  dq lit, 4, swap, addressing_indirect_reg64
  dq exit

; Stack:
;   output point
defword syscall, 0
  dq docol, lit, 0x0F, pack8, lit, 0x05, pack8, exit

; TODO rep movsb
; TODO rep movsq
; TODO repnz scasb
; TODO set_reg8_cc
;   (and condition codes)
; TODO jmp_cc_rel_imm8

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Runtime word definition ;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;;   We need to be able to use Forth code that defines other Forth code; it
;;; will be the easiest way to create our Forth-flavored assembly languge. To
;;; do that, there's a critical bootstrapping problem: The variables that we
;;; allocate on the heap in _start must be used by the Forth words that define
;;; other Forth words (and, in general, they are needed for anything that
;;; does memory management). Since the heap's address isn't known until
;;; runtime, we can't use build-time labels to locate the heap or the
;;; variables on it, in the way we've done for everything up to now.
;;;
;;;   In fact, it's worse than that: nothing we define at build time can
;;; statically reference anything on the heap; it must always be given the
;;; heap's address as a parameter. That also means statically defined words
;;; can't reference runtime words, at all, except via special code that goes
;;; outside the normal Forth execution model.
;;;
;;;   So, what we'll want to do to deal with that is copy ourselves onto the
;;; heap, and run the rest of the program from there. It would be okay in
;;; principle to keep a few things such as docol in the static code segment,
;;; as long as they don't need to reference the heap, though it might in some
;;; sense be more elegant to entirely abandon the code segment and run
;;; heap-only. At any rate, to keep the set of stuff we have to copy small,
;;; we'll want to make the change-over as soon as possible.
;;;
;;;   Following a common Forth practice, we implement variables as words that
;;; push the corresponding addresses onto the stack. Since the variables that
;;; define the heap are ON the heap, we will definitely need two distinct
;;; versions of these; having the two versions be almost-compatible allows us
;;; to minimize code duplication. Since they do have slightly different
;;; interfaces, we prefix the names of the early ones with "early_".

; Stack in:
;   heap address
; Stack out:
;   heap address
;   requested variable address
defword early_heap, 0
  dq docol, dup, lit, control_stack_size, add, exit
defword early_s0, 0
  dq docol, early_heap, lit, 8, add, exit
defword early_r0, 0
  dq docol, early_heap, lit, 16, add, exit
defword early_latest, 0
  dq docol, early_heap, lit, 24, add, exit
defword early_here, 0
  dq docol, early_heap, lit, 32, add, exit

;   Allocate space by incrementing "here", and output a word header in it.
; Also add it to the "latest" linked list. Use zero as the flag values;
; callers that want something else can do that themselves.
;
;   We'll use the pack* words to do this, the way they update the pointer is
; convenient.
;
;   See "Execution model", above, for reminders of the details of this format.
; Create is responsible for everything up to the codeword, not including it.
;
;   This is surprisingly painful to implement, in terms of stack juggling, but
; then there's a sense in which it's the core trick to bootstrapping the heap.
;
; Stack in:
;   heap address
;   name string
; Stack out:
;   heap address
defword early_create, 0
  dq docol
  ; Retrieve the values of "here" and "latest".
  dq swap, early_here, fetch, swap, early_latest, fetch, swap, lit, 4, unroll
  ; Pack the old value of "latest" as the first field of the header, linking
  ; from the newly-defined word to the next-newest word.
  dq pack64
  ; Pack a null byte as the flags.
  dq lit, 0, pack8
  ; Pack another null byte as the start terminator of the name (which is
  ; terminated at both ends).
  dq lit, 0, pack8
  ; Pack the name and its end terminator.
  dq swap, packstring
  ; Alignment before the codeword.
  dq lit, 8, packalign
  ; Retrieve the value of "here" (which still doesn't reflect our additions),
  ; and store it at the address of "latest". It's the start of our
  ; newly-defined word, which makes it the latest word.
  dq swap, early_here, fetch, swap, early_latest, swap, unroll3, store, swap
  ; Retrieve the address of "here" again and store our updated value there.
  dq swap, early_here, swap, unroll3, store
  dq exit

; Stack in:
;   heap address
;   value to append to current word-in-progress
; Stack out:
;   heap address
defword early_comma, 0
  dq docol
  ; Retrieve the value of "here" and pack the value there.
  dq swap, early_here, fetch, swap, unroll3, swap, pack64
  ; Store the updated value of "here".
  dq swap, early_here, swap, unroll3, store
  dq exit

; Stack in:
;   heap address
; Stack out:
;   heap address
defword early_self_codeword, 0
  dq docol, early_here, fetch, lit, 8, add, early_comma, exit

; Stack in:
;   heap address
;   new value to overwrite "here"
; Stack out:
;   heap address
defword early_here_store, 0
  dq docol, swap, early_here, swap, unroll3, store, exit

;   Notice that we've switched over to stuff that follows the pack* idioms.
;
;   This is a helper "macro" that we'll use in defining assembly words from
; Forth. This is in a sense a redefinition of it; the flatassembler version of
; it is far, far above, and has more documentation.
;
; Stack in:
;   base address
; Stack out:
;   new base address
defword pack_next, 0
  dq docol, lodsq, rax, jmp_abs_indirect_reg64, exit

;   This is another helper "macro" that we'll use in defining assembly words
; from Forth. In particular, this one is used in docol. As before, see the
; flatassembler version for more explanation.
;
; Stack in:
;   base address
;   source register keyword
; Stack out:
;   new base address
defword pack_pushcontrol, 0
  dq docol
  dq swap, rbp, lit, -8, rbp, lea_reg64_disp8_reg64, swap
  dq rbp, lit, 0, mov_disp8_reg64_reg64
  dq exit

;   This is another helper "macro" that we'll use in defining assembly words
; from Forth. In particular, this one is used in "exit". See the flatassembler
; version for more explanation.
;
; Stack in:
;   base address
;   target register keyword
; Stack out:
;   new base address
defword pack_popcontrol, 0
  dq docol
  dq rbp, lit, 0, roll3, mov_reg64_disp8_reg64
  dq rbp, lit, 8, rbp, lea_reg64_disp8_reg64
  dq exit

;   Now we're back to heap idioms again.
;
; Stack in:
;   heap address
;   address for new variable word to return
;   name string
; Stack out:
;   heap address
defword early_variable, 0
  dq docol
  dq swap, unroll3, early_create, early_self_codeword
  ; (address to return, heap address)
  dq early_here, fetch, swap, unroll3, swap
  ; (heap address, modified "here" value, address to return)
  dq rax, mov_reg64_imm64, rax, push_reg64
  ; (heap address, modified "here" value)
  dq pack_next, lit, 8, packalign, early_here_store
  dq exit


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Now.... what was our original goal, again? ;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;;   Finally, having fully bootstrapped our runtime environment, we move on
;;; to the core stuff we actually want to accomplish. For this quine, that's
;;; outputting itself.

;;;
;;;   One of the most charming naming traditions in Forth is that the
;;; top-level word that stays running forever, is called "quit".
;;;
defword quit, 0
  dq docol

  ;;;
  ;;; Although we initialized rbp already, we do so again because we'll want
  ;;; that on subsequent visits to this word - it's the main thing it's for.
  ;;; Keep in mind that rsi is the actual "instruction pointer", and we're
  ;;; leaving it unchanged, we just get rid of everything above it.
  ;;;
  ;dq r0, control!                ; overwrite rbp to reset the control stack
    ; TODO though the implementation of r0 is trivial, it depends on where we
    ; put the heap, so it can't be hardcoded, we'll have to build it in RAM.
    ; the same therefore goes for anything that needs to call it. so we can't
    ; call it here, now, yet - we have prep work to do first

  ;;;
  ;;; Do the read-eval-print-loop, which is the main body of the Forth
  ;;; interpreter.
  ;;;
  ;dq interpret                   ; run the repl
  dq quine

  ;;;
  ;;; If the repl ever exits, do it all again.
  ;;;
  ;dq branch, quit - $
  dq lit, 0, sys_exit


;;;;;;;;;;;;;;;;;;;;
;;; System calls ;;;
;;;;;;;;;;;;;;;;;;;;
;;;
;;;   The kernel preserves every register except rax, rcx, and r11. The system
;;; call number goes in rax, as does the return value. Parameters go in rdi,
;;; rsi, rdx, r10, r8, and r9, in that order. [SysV] A.2.1.
;;;
;;;   Notice that rsi is our control stack, so we have to save it (for
;;; syscalls with at least two parameters). We can use the value stack to do
;;; that, since rsp is preserved. We don't save other registers because our
;;; caller should do that, if it cares.
;;;

;;;
;;;   This does the Linux exit() system call, passing it an exit code taken
;;; from the stack.
;;;
defword sys_exit, 0
  dq $ + 8

  mov.qreg.qimm rax, 60          ; syscall number
  pop.qreg rdi                   ; exit code
  syscall

  ; In the event we're still here, let's minimize confusion.
  hlt


;;;
;;;   This does the Linux write() system call, passing it an address from the
;;; top of the stack and a length from the second position on the stack. It
;;; writes to file descriptor 1, which is stdout.
;;;
;;;   For our length parameter, we can pop directly from the stack into rdx,
;;; which directly becomes the syscall parameter. For our address parameter,
;;; the syscall wants it in rsi, which we also care about, so we have to do a
;;; little juggling.
;;;
defword sys_write, 0
  dq $ + 8
  pop.qreg rcx                   ; address from stack
  pop.qreg rdx                   ; length from stack, passed directly
  push.qreg rsi                  ; save rsi
  mov.qreg.qimm rax, 1           ; syscall number
  mov.qreg.qimm rdi, 1           ; file descriptor
  mov.qreg.qreg rsi, rcx         ; pass address
  syscall
  pop.qreg rsi                   ; restore rsi
  next


;;;;;;;;;;;;;;;;;;;;;;;;
;;; I/O conveniences ;;;
;;;;;;;;;;;;;;;;;;;;;;;;


defword emitstring, 0
  dq docol, dup, stringlen, swap, sys_write, exit

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; (new) Implementation strategy ;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;;   We assemble the entire file contents in a heap-allocated buffer. When
;;; the file is fully assembled, we output it.
;;;
defword quine, 0
  dq docol

  ; We still have "heap" on the stack, so we can call early_here. Add a
  ; constant to "here" in-place, keeping a copy of the pointer ...
  dq early_here, lit, 0x78, swap, addstore
  ; ... and now we have allocated a block of memory, with its address on the
  ; stack. We also still have "heap" at the bottom of the stack, for future
  ; use.

  ; We have one label, and two pieces of information about it: value and
  ; status. We keep them on the stack. We also have a mutable copy of the
  ; buffer's address.
  ;
  ; This stack layout is the interface used throughout the "assembler"; this
  ; quine routine uses it when calling the routines that build the various
  ; pieces of the output, and those pieces in turn use it when calling the
  ; label-handling routines.
  ;
  ; Stack:
  ;   "heap"
  ;   start address of allocated output
  ;   label's value
  ;   label's status
  ;   address of current output point within allocated block
  ; (top)
  ;
  ; Status is a bit field:
  ;    bit zero is whether it was used before being defined
  ;    bit one is whether it's been defined
  ;    bit two is whether the guessed value wound up equaling the actual value
  dq dup, lit, 0, lit, 0, roll3

  ; This takes an address to write to on the stack, writes stuff there, and
  ; returns the next address after where it stopped Writing.
  dq all_contents

  ; The two-pass magick: ; Reset the output offset to the beginning of the
  ; block.
  dq drop, roll3, dup, lit, 4, unroll
  dq all_contents

  ; Drop the copy of the buffer's address.
  dq drop

  ; Drop the label data.
  dq drop, drop

  ; This takes a buffer's address on the stack, skips an ELF file header and
  ; program header based on hardcoded size, computes an offset (secretly
  ; hardcoded), and writes that offset into an appopriate place in the middle
  ; of those headers. It then returns the length of the used portion of the
  ; buffer.
  ;dq lit, 0x78, swap
  dq lit, file_size, swap

  ; write() from stack-allocated buffer
  dq sys_write

  dq exit

; Stack in:
;   output memory start
;   label value (guessed or actual)
;   label status
;   output memory current point
; Stack out:
;   output memory start
;   label value (guessed or actual)
;   label status (potentially modified)
;   output memory current point
;   label value for caller to use
defword use_label, 0
  dq docol

  ; Fetch the status
  dq swap
  ; Check the bit that indicates it's been set.
  dq dup, lit, 2, and, zbranch, 10*8

  ; If we're here, it has been set already, so just put the status back...
  dq swap
  ; Fetch the actual value...
  dq lit, 3, roll, dup, lit, 4, unroll
  ; ... and exit
  dq exit

  ; If we're here, it hasn't been set yet, so mark it used-before-set.
  dq lit, 1, or
  ; Put the status back...
  dq swap
  ; Fetch the guessed value...
  dq lit, 3, roll, dup, lit, 4, unroll
  ; ... and exit
  dq exit

; Stack in:
;   output memory start
;   label value (guessed or actual)
;   label status
;   output memory current point
; Stack out:
;   output memory start
;   label value (now set to actual)
;   label status (modified)
;   output memory current point
defword set_label, 0
  dq docol

  ; Compute the current offset, to use as the actual value
  dq dup, lit, 5, roll, dup, lit, 6, unroll, sub
      ; old value, status, point, new value

  ; Overwrite the old actual value; keep a copy
  dq dup, lit, 5, roll
      ; status, point, new value, new value, old value
  dq eq, swap
      ; status, point, equality, new value
  dq lit, 4, unroll
      ; new value, status, point, equality

  ; We don't need to branch. Now we mark the status as having been defined,
  ; and we also set bit 2 if appropriate.
  dq lit, 4, mul
  dq lit, 3, roll, or, lit, 2, or, swap

  dq exit


defword hlt, 0
  dq $ + 8
  hlt


; This takes an address to write to on the stack, writes stuff there, and
; returns the next address after where it stopped Writing.
;
; It also makes use of label stuff, further back on the stack.
defword all_contents, 0
  dq docol
  dq elf_file_header, elf_program_header
  dq output_start_routine
  dq self_raw
  dq set_label
  dq exit

;;;
;;; ELF header
;;;
;;;   This is the top-level ELF header, for the entire file. An ELF always has
;;; exactly one of this header, which is always at the start of the file.
;;;
defword elf_file_header, 0
  dq docol

  dq litpack32, 0x7f bappend "ELF"          ; magic number
  dq litpack8, 2                            ; 64-bit
  dq litpack8, 1                            ; little-endian
  dq litpack8, 1                            ; ELF header format v1
  dq litpack8, 0                            ; System-V ABI
  dq litpack64, 0                           ; (padding)

  dq litpack16, 2                           ; executable
  dq litpack16, 0x3e                        ; Intel x86-64
  dq litpack32, 1                           ; ELF format version

  ; Compute the entry pointer.
  dq litpack64, _start                      ; entry point
    ; The offset of _start. This includes the origin, intentionally.

  dq litpack64, 64                          ; program header offset
    ; We place the program header immediately after the ELF header. This
    ; offset is from the start of the file.
  dq litpack64, 0                           ; section header offset
  dq litpack32, 0                           ; processor flags
  dq litpack16, 64                          ; ELF header size
  dq litpack16, 56                          ; program header entry size
  dq litpack16, 1                           ; number of program header entries
  dq litpack16, 0                           ; section header entry size
  dq litpack16, 0                           ; number of section header entries
  dq litpack16, 0                           ; section name string table index

  dq exit


;;;
;;; Program header
;;;
;;;   An ELF program header consists of any number of these entries; they are
;;; always consecutive, but may be anywhere in the file. We always have
;;; exactly one, and it's always right after the ELF file header.
;;;
defword elf_program_header, 0
  dq docol

  dq litpack32, 1                          ; "loadable" segment type
  dq litpack32, 0x05                       ; read+execute permission
  dq litpack64, 0                          ; offset in file
  dq litpack64, $$                         ; virtual address
    ; required, but can be anything, subject to alignment
  dq litpack64, 0                          ; physical address (ignored)

  ; Fill in 0 as the file size for now, to avoid unitialized memory.
  dq use_label, pack64                     ; size in file
  dq use_label, pack64                     ; size in memory

  dq litpack64, 0                          ; segment alignment
    ; for relocation, but this doesn't apply to us

  dq exit


defword output_start_routine, 0
  dq docol
  dq cld
  dq lit, 9, rax, mov_reg64_imm64
  dq lit, heap_requested_address, rdi, mov_reg64_imm64
  dq lit, heap_size, rsi, mov_reg64_imm64
  dq lit, 0x03, rdx, mov_reg64_imm64
  dq lit, 0x22, r10, mov_extrareg64_imm64
  dq lit, 0, r8, mov_extrareg64_imm64
  dq lit, 0, r9, mov_extrareg64_imm64
  dq syscall
  dq rax, rdi, mov_reg64_reg64
  dq exit


; write() the machine code by using self-reference
; TODO do this in a "real" quine way
defword self_raw, 0
  dq docol
  dq dup
  dq lit, elf_header + 0xc4      ; source
  dq lit, file_size - 0xc4       ; length
  ; destination destination source length
  dq dup, lit, 4, roll, add, lit, 4, unroll
  ; result destination source length
  dq ccopy
  dq exit


final_word_name = self_raw
code_size = $ - code_start
file_size = $ - $$