;;; QUINE ;;; ;;; This file is formatted to be read at 80-columns or wider. ;;;;;;;;;;;;;;;;;;;;; ;;; Workflow tips ;;; ;;;;;;;;;;;;;;;;;;;;; ;;; ;;; Currently, this is not yet fully self-hosting; it is based on ;;; flatassembler[1]. A minimal command to build and run it is: ;;; ;;; fasmg quine.asm quine && chmod 755 quine && ./quine; echo $? ;;; ;;; A workflow you may wish to use for debugging is: ;;; ;;; rm quine2; fasmg quine.asm quine && ./quine > quine2; echo "exit code:" $?; echo; hexdump -C quine; echo; hexdump -C quine2; echo; cmp -l quine quine2 ; echo cmp: $? ;;; ;;; The reason this removes the old one first is that otherwise, there's a ;;; risk the error message will be scrolled off the top of the screen and ;;; you'll see stale output and not realize. ;;; ;;; You may also wish to do: ;;; ;;; objdump --disassemble quine ;;; ZydisDisasm -64 quine ;;; ;;; This relies on GNU binutils, and on zydis, respectively. ;;; ;;; [1] https://flatassembler.net/ ;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Assembly language ;;; ;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; Before doing any actual code, we define macros for writing x86-64 ;;; assembly language. This is built from scratch, relying only on ;;; flatassembler's built-in semantics. No include files of any kind are used ;;; for it. ; The way these are all spelled out like this is slightly ridiculous, there ; must be a better way. macro rex.0 db 0x40 end macro macro rex.w db 0x48 end macro macro rex.r db 0x44 end macro macro rex.x db 0x42 end macro macro rex.b db 0x41 end macro macro rex.wr db 0x4C end macro macro rex.wx db 0x4A end macro macro rex.wb db 0x49 end macro macro rex.rx db 0x46 end macro macro rex.rb db 0x45 end macro macro rex.xb db 0x43 end macro macro rex.wrx db 0x4E end macro macro rex.wrb db 0x4D end macro macro rex.wxb db 0x4B end macro macro rex.rxb db 0x47 end macro macro rex.wrxb db 0x4F end macro macro modrm mod, reg, rm assert mod >= 0 & mod < 4 assert reg >= 0 & reg < 8 assert rm >= 0 & rm < 8 db (mod shl 6) or (reg shl 3) or rm end macro macro sib scale, index, base assert scale >= 0 & scale < 4 assert index >= 0 & index < 8 assert base >= 0 & index < 8 db (scale shl 6) or (index shl 3) or base end macro macro opcodereg opcode, reg assert opcode >= 0 & opcode < 256 & opcode and 7 = 0 assert reg >= 0 & reg < 8 db opcode or reg end macro macro qwordreg result, register match =rax?, register result = 0 else match =rcx?, register result = 1 else match =rdx?, register result = 2 else match =rbx?, register result = 3 else match =rsp?, register result = 4 else match =rbp?, register result = 5 else match =rsi?, register result = 6 else match =rdi?, register result = 7 else assert 0 end match end macro macro owordreg result, register match =r8?, register result = 0 else match =r9?, register result = 1 else match =r10?, register result = 2 else match =r11?, register result = 3 else match =r12?, register result = 4 else match =r13?, register result = 5 else match =r14?, register result = 6 else match =r15?, register result = 7 else assert 0 end match end macro ; TODO what register size does this use? macro mov.b target, source match =rax?, target db 0xB8 dd source else match =rdi?, target db 0xBF dd source else assert 0 end match end macro ; TODO what register size does this use? macro mov.dreg.dimm target, source rex.w db 0xC7 qwordreg reg, target modrm 3, 0, reg dd source end macro macro mov.qreg.qimm target, source qwordreg treg, target rex.w opcodereg 0xB8, treg dq source end macro ; Notice the use of REX.B here; this instruction puts the register number in ; the opcode field, so it uses Table 3-1. macro mov.oreg.qimm target, source owordreg treg, target rex.wb opcodereg 0xB8, treg dq source end macro macro mov.qreg.qreg target, source qwordreg treg, target qwordreg sreg, source rex.w db 0x89 modrm 3, sreg, treg end macro ; Take a 64-bit source register, treat it as an address and look up the 64-bit ; value it points to, store that into a 64-bit target register. ; ; For rsp and rbp, the only modes available also have displacement; we use an ; 8-bit one and set it to zero. The other registers could be encoded without ; the displacement, but for simplicity's sake we do the same thing for all of ; them. ; ; In understanding this, pay close attention to the Op/En column in the opcode ; table. The "RM" variant means the ModRM byte's R/M field (the third one) ; is the source, while its reg field (the middle one) is the target. This is ; what we want, because the R/M field is the one that gets indirection applied ; to it. Opcode 0x8B with an REX.W prefix is the all-64-bit RM variant. ; [Intel] volume 2B, chapter 3, section 3-4.3, "MOV". ; ; For the indirection modes, don't be confused by the many similar tables. ; 64-bit mode is encoded the same as 32-bit mode except for adding a REX.W ; prefix, as per 2.2.1.1, so you want table 2-2 to understand the ModRM byte. ; The presence or absence of an SIB byte is determined by where in that table ; we fall, and we aren't using a mode that has one. [Intel] volume 2A, ; chapter 2, section 2-1.5, table 2-2. ; ; We disallow rsp as a source because that's the mode that would want an SIB. macro mov.qreg.indirect.qreg target, source match =rsp, source assert 0 else qwordreg sreg, source qwordreg treg, target rex.w rb 0x8B modrm 1, treg, sreg db 0 end match end macro ; Take a 64-bit source register, store its value into the address pointed to ; by a 64-bit target register. The only modes available also have ; displacement; we use an 8-bit one and set it to zero. ; ; In understanding this, pay close attention to the Op/En column in the opcode ; table. The "MR" variant means the ModRM byte's reg field (the middle one) ; is the source, while its R/M field (the third one) is the target. This is ; what we want, because the R/M field is the one that gets indirection applied ; to it. Opcode 0x89 with an REX.W prefix is the all-64-bit MR variant. ; [Intel] volume 2B, chapter 3, section 3-4.3, "MOV". ; ; For the indirection modes, don't be confused by the many similar tables. ; 64-bit mode is encoded the same as 32-bit mode except for adding a REX.W ; prefix, as per 2.2.1.1, so you want table 2-2 to understand the ModRM byte. ; The presence or absence of an SIB byte is determined by where in that table ; we fall, and we aren't using a mode that has one. [Intel] volume 2A, ; chapter 2, section 2-1.5, table 2-2. ; ; We disallow rsp as a target because that's the mode that would want an SIB. macro mov.indirect.qreg.qreg target, source match =rsp, target assert 0 else qwordreg sreg, source qwordreg treg, target rex.w db 0x89 modrm 1, sreg, treg db 0 end match end macro ; Take a 64-bit source register, store its value into a high 64-bit target ; register (r8-r15). ; ; Notice that there are two ways to add another bit to the register encoding. ; Table 3-1 is about REX.B, but does not apply here, it's for instructions ; that use opcode bits to specify a register, and none of the ; register-to-register MOV variants do that (it's for immediate mode). ; ; Instead, we want the mechanism that uses REX.R as the extra bit, and it ; combines with the reg field of ModRM, as per 2.2.1.2. ; ; Therefore, we want the variant of MOV which puts the target in the reg ; field. That's Op/En "RM", opcode 0x8B with REX.WR. ; ; Mode 3 is direct addressing. macro mov.oreg.qreg target, source owordreg treg, target qwordreg sreg, source rex.wr rb 0x8B modrm 3, treg, sreg end macro ; Take a high 64-bit source register (r8-r15), store its value into a 64-bit ; target register. ; ; Notice that there are two ways to add another bit to the register encoding. ; Table 3-1 is about REX.B, but does not apply here, it's for instructions ; that use opcode bits to specify a register, and none of the ; register-to-register MOV variants do that (it's for immediate mode). ; ; Instead, we want the mechanism that uses REX.R as the extra bit, and it ; combines with the reg field of ModRM, as per 2.2.1.2. ; ; Therefore, we want the variant of MOV which puts the source in the reg ; field. That's Op/En "MR", opcode 0x89 with REX.WR. ; ; Mode 3 is direct addressing. macro mov.qreg.oreg target, source qwordreg treg, target owordreg sreg, source rex.wr rb 0x89 modrm 3, sreg, treg end macro ; This adds a 64-bit register to another 64-bit register, in place. macro add.qreg.qreg target, source qwordreg treg, target qwordreg sreg, source rex.w db 0x01 modrm 3, sreg, treg end macro ; This adds a signed 8-bit immediate value to a 64-bit register, in place. ; ; Notice the use of 3 as the addressing mode. This says to use the register ; itself. The 0 in the reg field is part of the opcode. macro add.qreg.bimm target, source qwordreg treg, target rex.w db 0x83 modrm 3, 0, treg db source end macro ; This adds a signed 32-bit immediate value to a 64-bit register, in place. ; ; Notice the use of 3 as the addressing mode. This says to use the register ; itself. The 0 in the reg field is part of the opcode. macro add.qreg.dimm target, source qwordreg treg, target rex.w db 0x81 modrm 3, 0, treg dd source end macro ; This subtracts a signed 8-bit immediate value from a 64-bit register, in ; place. ; ; Notice the use of 3 as the addressing mode. This says to use the register ; itself. The 5 in the reg field is part of the opcode. macro sub.qreg.bimm target, source qwordreg treg, target rex.w db 0x83 modrm 3, 5, treg db source end macro ; This subtracts a signed 32-bit immediate value from a 64-bit register, in ; place. ; ; Notice the use of 3 as the addressing mode. This says to use the register ; itself. The 5 in th reg field is part of the opcode. macro sub.qreg.dimm target, source qwordreg treg, target rex.w db 0x81 modrm 3, 5, treg dd source end macro ; Move from an 8-bit immediate value, to a location relative to a 64-bit ; register, with an 8-bit displacement and no indexing. ; ; This uses opcode 0xC6, which has w = 0. Since we run in 64-bit mode, that ; makes the operand size 8 bits, regardless of the current operand-size ; attribute. [Intel] volume 2D, appendix B, section B-1.4.3, table B-6. macro mov.qreg.disp8.bimm target, offset, source match =rsp, target db 0xC6 modrm 1, 0, 4 ; 4 is rsp, but it's a special case sib 0, 4, 4 ; no scaling, no indexing, rsp as base db offset db source else assert 0 end match end macro ; Move from a 16-bit immediate value, to a location relative to a 64-bit ; register, with an 8-bit displacement and no indexing. ; ; This uses opcode 0xC7, which has w = 1. We run in 64-bit mode, so that gives ; us an operand size of 32 bits by default. [Intel] volume 1, section 3.6.1, ; table 3-4. We want a 16-bit operand, so we use the operand-size prefix, ; 0x66, and we leave REX.W unset. ; ; We need to treat rsp specially because it's the SIB case, per table 2-2. macro mov.qreg.disp8.wimm target, offset, source match =rsp, target db 0x66 db 0xC7 modrm 1, 0, 4 ; 4 is rsp, but it's a special case sib 0, 4, 4 ; no scaling, no indexing, rsp as base db offset dw source else assert 0 end match end macro ; Move from a 32-bit immediate value, to a location relative to a 64-bit ; register, with an 8-bit displacement and no indexing. ; ; This uses opcode 0x67, which has w = 1. We run in 64-bit mode, so that gives ; us an operand size of 32 by default. [Intel] volume 2D, section B.1.43, ; table B-6. This is what we want, so we leave it. macro mov.qreg.disp8.dimm target, offset, source match =rsp, target db 0xC7 modrm 1, 0, 4 ; 4 is rsp, but it's a special case sib 0, 4, 4 ; no scaling, no indexing, rsp as base db offset dd source else assert 0 end match end macro ; Move from a 64-bit register, to a 64-bit location relative to a 64-bit ; register, with an 8-bit displacement and no indexing. ; ; This uses opcode 0x89 with REX.W, so that gives us the reg field as the ; 64-bit source and the R/M field as the 64-bit destination. ; ; We need to treat a target of rsp specially because it's the SIB case per ; table 2-2. macro mov.qreg.disp8.qreg target, offset, source qwordreg sreg, source qwordreg treg, target match =rsp, target rex.w db 0x89 modrm 1, sreg, treg ; treg is rsp by assumption, and R/M = rsp is the SIB case sib 0, 4, 4 ; no scaling, no indexing, rsp as base db offset else rex.w db 0x89 modrm 1, sreg, treg db offset end match end macro ; Move from a 64-bit register, to a 64-bit location relative to a 64-bit ; register, with a 32-bit displacement and no indexing. ; ; This uses opcode 0x89 with REX.W, so that gives us the reg field as the ; 64-bit source and the R/M field as the 64-bit destination. ; ; We need to treat a target of rsp specially because it's the SIB case per ; table 2-2. macro mov.qreg.disp32.qreg target, offset, source qwordreg sreg, source qwordreg treg, target match =rsp, target rex.w db 0x89 modrm 2, sreg, treg ; treg is rsp by assumption, and R/M = rsp is the SIB case sib 0, 4, 4 ; no scaling, no indexing, rsp as base dd offset else rex.w db 0x89 modrm 2, sreg, treg dd offset end match end macro ; Move from a 32-bit immediate value, to a 64-bit location relative to a ; 64-bit register, with an 8-bit displacement and no indexing. ; ; Note that there is no instruction to move a 64-bit immediate to memory. ; ; This uses opcode 0xC7, which has w = 1. We run in 64-bit mode, so that ; gives us an operand size of 32 by default. [Intel] volume 2D, ; section B.1.43, table B-6. We want a 64-bit operand, so we use the REX.W ; prefix, 0x48. macro mov.qreg.disp8.dimm target, offset, source match =rsp, target rex.w db 0xC7 modrm 1, 0, 4 ; 4 is rsp, but it's a special case sib 0, 4, 4 ; no scaling, no indexing, rsp as base db offset dd source else assert 0 end match end macro ; "Load effective address". Compute a 64-bit address as you would for ; indexed addressing, with an 8-bit displacement and no indexing, but instead ; of doing anything with the memory, just store the address itself into a ; register. macro lea.qreg.qreg.disp8 target, offset, source match =rsp, target ; This is the SIB case assert 0 else qwordreg treg, target qwordreg sreg, source rex.w db 0x8D modrm 1, treg, sreg db offset end match end macro macro lea.qreg.qreg.disp32 target, offset, source match =rsp, target ; This is the SIB case assert 0 else qwordreg treg, target qwordreg sreg, source rex.w db 0x8D modrm 2, treg, sreg dd offset end match end macro ; Clear the DF flag. This makes string instructions increment RSI. macro cld db 0xFC end macro ; Load 64 bits from the address in RSI into RAX. Then, increment or decrement ; RSI by 8 bytes, depending on the value of the DF flag. macro lodsq rex.w db 0xAD end macro ; Push a 64-bit value from a register onto the stack (the one pointed to by ; rsp). Decrement rsp, then write the value at the new location. ; ; In the corner case where rsp is also the value being pushed, the old value ; is the one used. ; ; There's an alternate encoding of this that uses a ModRM byte, but doing it ; without is more compact, so we do without. macro push.qreg source qwordreg sreg, source opcodereg 0x50, sreg end macro ; Pop a 64-bit value into a register from the stack (the one pointed to by ; rsp). Read the value from the old location, then increment rsp. ; ; In the corner case where rsp is also the destination being written to, the ; read happens from the old location, then the write causes the increment to ; be irrelevant. ; ; There's an alternate encoding of this that uses a ModRM byte, but doing it ; without is more compact, so we do without. macro pop.qreg target qwordreg treg, target opcodereg 0x58, target end macro ; Do an absolute indirect jump with a 64-bit register operand. That is: given ; a register which holds a pointer, read another address from the pointed-to ; memory and jump to it. ; ; Technically this is a "near" jump in x86 terms, but we just pretend far ; jumps and segments don't exist. They are still a thing in 64-bit mode, we ; just don't use them. macro jmp.abs.indirect.qreg location qwordreg lreg, location db 0xFF modrm 0, 4, lreg end macro ; There in no 64-bit immediate "near" jump, so we use 32-bit. It's relatve, ; so that's honestly plenty. ; ; The location is relative to the start of the instruction immediately ; following the jmp. macro jmp.rel.dimm location db 0xE9 dd location end macro ; Invoke a system call provided by the kernel. On Linux, the System V ABI ; describes the semantics of such calls (at least, on x86). macro syscall db 0x0F, 0x05 end macro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Executable file format ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; Before we get into the body of the program, we do a lot of ELF-specific ;;; stuff to ensure that our output is in a format Linux knows how to run. ;;; ;;; First, we set the origin to load at. This is arbitrary, but it can't be ;;; zero. We tell flatassembler about it because it's used in label ;;; calculations; we can reference it as $$ any time we need it in future. org 0x08000000 ;;; ;;; Second, we output ELF's top-level file header. The only interesting ;;; thing here is the entry pointer. ;;; elf_header: ; * denotes mandatory fields according to breadbox db 0x7F, "ELF" ; *magic number db 2 ; 64-bit db 1 ; little-endian db 1 ; ELF header format version 1 db 0 ; System-V ABI db 8 dup 0 ; (padding) dw 2 ; *executable dw 0x3E ; *Intel x86-64 dd 1 ; ELF format version dq _start ; *entry point dq program_header - $$ ; *program header offset dq 0 ; section header offset dd 0 ; processor flags dw elf_header_size dw program_header_entry_size ; * dw 1 ; *number of program header entries dw 0 ; section header entry size dw 0 ; number of section header entries dw 0 ; section name string table index ; Save a copy of the size of this chunk for our future reference, by comparing ; the current posiion to the label above. elf_header_size = $ - elf_header ;;; ;;; Third, immediately after the ELF file header, we output ELF's program ;;; header, which lists the memory regions ("segments") we want to have and ;;; where we want them to come from. We list just a single region, which is ;;; the entire contents of the ELF file from disk. ;;; ;;; It would be more typical to use this header to ask the loader to give us ;;; separate code and data segments, and perhaps a stack or heap, but this ;;; keeps things simple, and we can create those things for ourselves later. ;;; ;;; We do have a little stack space available, though we don't explicitily ;;; request any; the kernel allocates it for us as part of exec() so that it ;;; can pass us argc and argv (which we ignore). That stack space will be at a ;;; random address, different every time, because of ASLR; that's a neat ;;; security feature, so we leave it as-is. ;;; program_header: dd 1 ; *"loadable" segment type dd 0x05 ; *read+execute permission dq 0 ; *offset in file dq $$ ; *virtual address ; required, but can be anything, subject to alignment dq 0 ; physical address (ignored) dq file_size ; *size in file dq file_size ; *size in memory dq 0 ; segment alignment ; for relocation - will we be ASLR'd? ; Save the size of this chunk, as well. program_header_entry_size = $ - program_header ; Everything after this point is code or data, not headers, so save the start ; of it for use in size calculations later. code_start: ;;;;;;;;;;;;;;;;;;;;;;; ;;; Execution model ;;; ;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; We use Forth-style dual stacks, one for values and one for control. We ;;; use rsp for values, just like C does. We use rbp for the control stack, ;;; which is a special Forth-y stack: These are pointers into the bodies of ;;; Forth words, not return addresses. ;;; ;;; The choice of rsp and rbp for the stack pointers imitates Jonesforth; ;;; I'm hopeful that it gives us convenient addressing modes, and will report ;;; back about that when I feel that I understand the implications. ;;; ;;; In Forth, everything is a "word", including mutable variables. ;;; Conceptually, a word is a unit of execution, which may be implemented ;;; either in machine code or as an array of pointer to other words. ;;; ;;; This polymorphism is implemented by having each word's contents begin ;;; with a "codeword", which is a pointer to machine code that "interprets" ;;; the rest of the contents. In the case of words implemented in machine ;;; code, the codeword points directly to that code, which is normally right ;;; next to it. ;;; ;;; Variables, to Forth, are simply one more thing that can be executed; the ;;; effect of executing a variable is to push its address onto the value ;;; stack. ;;; ;;; We adopt this model of words, codewords, and variables-as-words. It's ;;; nice for us because it works without a heap. ;;; ;;; One way in which we differ from Forth is that we don't have a ;;; dictionary, and our words don't have names. Nothing would prevent this, ;;; it just isn't useful to this single-purpose program. The Forth dictionary ;;; is usually a linked list of every word that has ever been defined, with ;;; the newest at the head; the names of words are stored in string fields as ;;; part of every word's internal header. Our header has neither the pointer ;;; field for the dictionary, nor the string; the only header we have is the ;;; the codeword. ;;; ;;; We specifically implement a version of calling and returning that Forth ;;; calls indirect threaded code: The control stack is a stack of pointers ;;; into the middle of interpreted words. The interpreter snippet, called ;;; DOCOL, implements calling. Each word is responsible for making sure ;;; returning works properly. Interpreted words accomplish this by ending with ;;; the word EXIT, while machine-code words accomplish it by ending with a ;;; verbatim snippet called NEXT. ;;; ;;; Conceptually, NEXT returns, but more specifically it accomplishes this ;;; by doing the caller's next dispatch for it; thus control never actually ;;; goes back to the caller's interpreter after initial setup. For performance ;;; reasons, NEXT is always inlined, so we define it as a macro. ;;; ;;; DOCOL is just ordinary code, not a macro. It's defined later in this ;;; file, as a label. ;;; ;;; ;;; ;;; -------------------------------------------------------------------------- ;;; Quick Reference ;;; -------------------------------------------------------------------------- ;;; ;;; The layout of an interpreted word: ;;; ;;; 0x00 - 0x08 Codeword (address of DOCOL snippet) ;;; 0x08 - ???? (8-byte chunks) Addresses of other words ;;; ... (end) Address of EXIT word ;;; ;;; The layout of a machine-code word: ;;; ;;; 0x00 - 0x08 Addresss of immediately following byte ;;; 0x08 - ???? Arbitrary machine code ;;; ... (end) Inlined implementation of NEXT ;;; ;;; ;;; REGISTER usage conventions: ;;; ;;; * rsi is the "instruction pointer" for the "interpreter". ;;; That is, it points to some word-pointer inside an array of ;;; word-pointers inside the content of the word they're part of. It always ;;; points to the next word that should be executed, whose execution hasn't ;;; begun yet. ;;; ;;; * rbp points to the top of the control stack ;;; These are former values of rsi, to eventually be returned to, from ;;; successively older callers as you look further up the stack. The stack ;;; grows downwards in memory. Since values are kept separately, the only ;;; thing on the control stack is return addresses, one per layer of call. ;;; ;;; * esp points to the top of the value stack ;;; The value stack has no specific format, but it grows downwards in ;;; memory. In particular there's no concept of stack frames, because items ;;; on the stack don't belong to any particular word; the value stack in ;;; Forth is in part a mechanism for passing values between words. ;;; ;;; Additionally, immediately after beginning execution of a word: ;;; ;;; * rax points to the address of the codeword being executed ;;; The value of rax is purely for the callee's benefit, and does not need ;;; to be preserved. ;;; ;;; Other registers are purely discretionary, and are not preserved across ;;; calls. ;;; ;;; ;;; FLAG usage: ;;; ;;; * DF should be 0 ;;; We use lodsq extensively and that makes it increment rsi after using it. ;;; ;;; -------------------------------------------------------------------------- ;;; ;;; Macro NEXT ;;; ---------- ;;; ;;; Include this inline at the end of a word implemented in machine-code. ;;; Conceptually, it returns. What it actually does is do the next thing the ;;; caller would do, which is call the next word from the caller's array of ;;; word pointers. ;;; ;;; Registers in: ;;; ;;; * rsi points to the address of the word to execute ;;; ;;; Registers out: ;;; ;;; * rax points to the codeword in the contents of the word that was executed ;;; * rsi points to the next word-address after this one ;;; ;;; Flags ;;; * DF = 0 is required ;;; macro NEXT ; Copy the next word's address from *rsi into rax. Increment the stack ; pointer (as per the DF flag). lodsq ; Load the codeword from the word's contents, and jump to the interpreter it ; points to. jmp.abs.indirect.qreg rax end macro ;;; ;;; Macros PUSHCONTROL ;;; POPCONTROL ;;; ------------------ ;;; ;;; Include these inline to push an address onto the control stack, or pop ;;; one off of it. You will recall the control stack is kept in rbp. The ;;; parameter is given in a user-specified register. ;;; ;;; Jonesforth's analogous macros are called PUSHRSP and POPRSP but I think ;;; that's super confusing, since rsp is also the name of a register, but a ;;; different one. I guess it was less confusing in 32-bit, since esp doesn't ;;; start with an "r". Anyway, this has to be named something that ;;; distinguishes it from Intel's PUSH and POP opcodes, so... ;;; ;;; "Load effective address" is just a cute way to do arithmetic on a ;;; register, here. To push or pop we decrement or increment rbp by 8. To ;;; actually interact with the space in the stack, we indirect through rbp. ;;; ;;; Registers in and out: ;;; ;;; * rbp points to the top of the control stack. ;;; macro PUSHCONTROL source lea.qreg.qreg.disp8 rbp, -8, rbp mov.indirect.qreg.qreg rbp, source end macro macro POPCONTROL target mov.qreg.indirect.qreg target, rbp lea.qreg.qreg.disp8 rbp, 8, rbp end macro ;;; ;;; Routine DOCOL ;;; ------------- ;;; ;;; Reference this via its label as the codeword of a word to make it an ;;; "interpreted" word. Concretely, it saves rsi (the "instruction pointer") ;;; to the control stack, takes the address of the codeword from rax and ;;; increments it in-place to form the new instruction pointer, and copies ;;; that to rsi. ;;; ;;; Having then done this, we're now in the state that normal execution ;;; expects, so DOCOL ends by it using NEXT to begin the callee's execution, ;;; kicking off a nested call. ;;; ;;; The name is said to be short for "do colon", because Forth high-level ;;; code begins word definitions with a colon. ;;; ;;; Registers in: ;;; ;;; * rsi is the caller's instruction pointer ;;; * rbp is the control stack pointer ;;; * rax is the address of the callee's codeword ;;; ;;; Registers out: ;;; ;;; * rsi is the callee's instruction pointer ;;; * rbp is the control stack pointer DOCOL: PUSHCONTROL rsi add.qreg.bimm rax, 8 mov.qreg.qreg rsi, rax NEXT ;;; ;;; Routine _start ;;; -------------- ;;; ;;; This is the entry point of the whole program, the very first code we ;;; actually execute. Linkers traditionally call this _start, and on balance ;;; I think it's probably best to keep that name, though I've honestly never ;;; liked it... Anyway, the ELF header points to it and exec() jumps to it. ;;; ;;; The kernel gives us most registers zeroed, and rsp pointing to the ;;; command-line stuff (argc, argv, envp), which is at an ASLR'd address with ;;; some stack space allocated for us, despite the fact we didn't request any. ;;; It also gives us all the flags clear except IF, but we don't rely on that. ;;; Lastly, of course, it loads our code segment and sets the instruction ;;; pointer where we asked; we don't need to check what those addresses are, ;;; because they're not randomized. ;;; ;;; This routine is really only responsible for one-time initialization. ;;; ;;; Registers in: ;;; ;;; * rsp points to the logical top of the value stack ;;; The kernel sets this up for us, and we need to save it somewhere so ;;; Forth can use it. ;;; ;;; Registers out: ;;; ;;; * rsi points within QUIT ;;; QUIT is the word that's Forth's closest equivalent to main(). ;;; * rsp points to the top of the value stack ;;; ;;; Notably, rbp is still uninitialialized after _start. ;;; ;;; Stack in: ;;; ;;; * argc, argv, envp in the usual Unix way ;;; We ignore them, though. ;;; ;;; Stack out: ;;; ;;; * The value of HEAP, as a pointer ;;; The meaning of this will be explained below. ;;; ;;; Registers within: ;;; ;;; * rdi points to the base the heap was allocated at, once it exists ;;; This is the same value that HEAP will hold, once we reach a point ;;; where we have variables. Of course, variables are stored on the heap, ;;; hence this temporary measure. ;;; ;;; We also take this opportunity to define soeme memory layout parameters ;;; that this routine will be responsible for doing something with: ;;; heap_requested_address = 0x0000001000000000 ; (very arbitrary) heap_size = 0x0000000001000000 ; 16 MiB control_stack_size = 0x10000 ; 64 KiB _start: cld ; clear the DF flag ;;; ;;; Prepare the heap. ;;; ;;; We could ask for a data segment in the program header, but where's the ;;; fun in that? Instead, we call mmap(). ;;; ;;; If we wanted the kernel to do ASLR for us, passing address zero would ;;; cause it to pick somewhere at random, but instead we choose our own ;;; location. It's still not guaranteed to be where we ask for, so we still ;;; do the work to record where it wound up. We could pass the "fixed" flag ;;; and the kernel would trust us, but this gives us more options for ;;; interoperating with other runtimes. ;;; mov.b rax, 9 ; mmap() mov.qreg.qimm rdi, heap_requested_address ; address (very arbitrary) mov.qreg.qimm rsi, heap_size ; size (one meg) mov.qreg.qimm rdx, 0x03 ; protection (read+write) mov.oreg.qimm r10, 0x22 ; flags (private+anonymous) mov.oreg.qimm r8, 0 ; file descriptor (ignored) mov.oreg.qimm r9, 0 ; offset (ignored) syscall ;;; ;;; The return value of the system call is in rax, we'll use it in a sec. ;;; We need to save this somewhere in case we ever want to munmap() it; ;;; there's no widely-used name for it so we have to make one up. S0 and R0 ;;; are widely-used names for the logical tops of the value and control ;;; stacks, respectively, and we will eventually set those up as well, so we ;;; should keep those names in mind. The control stack lives within the ;;; heap, while the value stack is its own segment. This value, though, is ;;; the physical bottom of the segment, meaning that it stays the same even ;;; as we allocate and deallocate things within it. This is unlike the two ;;; stack pointers, so we give it a name that doesn't suggest similarity: ;;; HEAP. ;;; ;;; Once Forth is fully set up, its internal variables will be accessed ;;; through variable-words like any other Forth data, including HEAP. To get ;;; to that point, though, we need to be able to hold onto variable data ;;; between now and then. In fact, if we don't have at least one of HEAP and ;;; HERE (its counterpart which points to the logical top end), all our ;;; efforts to hold onto anything seem a bit doomed. ;;; ;;; So, we temporarily dedicate rdi to HEAP - only within this routine - ;;; and store everything else in ways that let us find things by reference ;;; to it. We choose rdi because it works with the indexing modes we care ;;; about, and its name suggests its function. ;;; ;;; The strategy Jonesforth uses is not applicable to us; Jonesforth ;;; takes advantage of the linker to let its code segment refer to specific, ;;; pre-allocated objects in the data segment. We are our own linker. ;;; Hence, this approach. ;;; ;;; Keying things off HEAP is the fundamental decision, but to make sure ;;; our variables are accessible both during early bootstrapping, and later, ;;; we also have to be thoughtful about data structures. More on that in a ;;; moment. ;;; mov.qreg.qreg rdi, rax ;;; ;;; We also initialize rbp. We could hold off and let QUIT do this, but ;;; doing it now is the easiest way to initialize the R0 variable, since ;;; there's no instruction that moves a 64-bit immediate to memory. ;;; ;;; This is the moment at which we decide where the control stack starts! ;;; Fun, right? "Allocation" is just a fancy word for picking where we want ;;; something, then being consistent about it - like placing furniture in ;;; your home. See below for a little more thought about why here in ;;; particular. ;;; lea.qreg.qreg.disp32 rbp, control_stack_size, rdi ;;; ;;; Now we save some stuff onto the heap. These are the locations that ;;; will eventually be the backing stores of the Forth variables, but we ;;; don't create the word headers yet, since there's no requirement that ;;; they be next to the backing stores. We'll do that later, once we have ;;; word-writing infrastructure in place. For now, we just use their offsets ;;; relative to the physical bottom of the heap, which are fixed. ;;; ;;; These will be the permanent homes of these values, though we have ;;; copies of them elsewhere while we're still in this routine. ;;; mov.qreg.disp32.qreg rdi, control_stack_size + 0x00, rdi ; HEAP mov.qreg.disp32.qreg rdi, control_stack_size + 0x08, rsp ; S0 mov.qreg.disp32.qreg rdi, control_stack_size + 0x10, rbp ; R0 ;;; ;;; * HEAP is the physical bottom of the heap ;;; The heap grows upwards in memory, so this is also the logical ;;; bottom. This comes from the address mmap() just returned to us. ;;; * S0 is the logical bottom of the value stack ;;; The value stack grows downwards in memory, so this is the physical ;;; top of it. This comes from the stack pointer the kernel initialized us ;;; with. ;;; * R0 is the logical bottom of the control stack ;;; The control stack also grows downwards, so this is its pysical top ;;; as well. We allocate this dedicated space within the heap right here, ;;; in this routine, through our choice of where to put things. ;;; ;;; S0 and R0 are mostly used when we want to initialize or reinitialize ;;; their respective stacks - that is, discard all their contents at once. ;;; ;;; The value of R0 is the same address these variables start at, so ;;; you'll want to do a close read of the implementation of PUSHCONTROL and ;;; convince yourself that it only ever writes things just below the rbp ;;; address it receives, never right on top of it. ;;; ;;; A little more detail about why we offset everything by ;;; control_stack_size: We're carving out some space at the bottom of the ;;; heap - which grows low-to-high - to be the control stack - which grows ;;; high-to-low. So the control stack is allocated out of the heap as a ;;; fixed-size, one-time thing, and then the variables come immediately ;;; after that. We do need to use 32-bit displacement indexing to access ;;; them this way, but that's no big deal. ;;; ;;; This is perhaps questionable, they should maybe be separate segments ;;; created with separate calls to mmap(), but for now we're not worried ;;; about overflow so we use the same allocation for both. ;;; ;;; We'll come back to these variables a bit later and generate the word ;;; headers that point at them, but now we're almost ready to switch to ;;; proper threaded-execution, so we finish that setup first... ;;; ;;; ;;; Push the value of HEAP onto the value stack so that it can be the ;;; breadcrumb the threaded code needs to find... the backing store of HEAP. ;;; Yes, self-reference can be weird like that sometimes. There's nothing ;;; stopping QUIT from reading rdi, it just violates the abstraction... ;;; push.qreg rdi ;;; ;;; We are about to set up rsi, we did rbp already, and rsp came to us ;;; already set up. That's all that NEXT needs, so take it away! ;;; mov.qreg.qimm rsi, cold_start NEXT ;;; ;;; This isn't really a routine so much as it's an array of words (exactly ;;; one of them), which is what NEXT wants rsi to point to. It's only ever ;;; used this one time, so we just put it right here. ;;; cold_start: dq QUIT ;;; ;;; One of the most charming naming traditions in Forth is that the ;;; top-level word that stays running forever, is called "quit". ;;; QUIT: dq DOCOL ; codeword dq OLD_CODE ; TODO this is more like what it should be ; Although we initialized rbp already, we do so again because we'll want ; that on subsequent visits to this word - it's the main thing it's for. ;dq R0, CONTROL! ; overwrite rbp to reset the control stack ; Keep in mind that rsi is the actual "instruction pointer", and we're ; leaving it unchanged, we just get rid of everything above it. ;dq INTERPRET ; run the repl ;dq BRANCH, QUIT - $ ; if the repl ever exits, start again ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; (old) Implementation strategy ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; We assemble the entire file contents in a stack-allocated buffer. ;;; We avoid using the stack for any other purpose. When the file is fully ;;; assembled, we output it. ;;; ;;; The assembly proceeds in several chunks - ELF header, program header, ;;; etc. Each chunk extends the buffer as per its own needs, by adjusting ;;; the stack pointer. All chunks also update a running total file size, ;;; which refers to how many bytes have actually been populated, not to the ;;; size of the buffer. ;;; ;;; Register usage: ;;; ;;; * rdx holds the total used file size so far. During hand-off between ;;; chunks, this size must be equal to the buffer size; within a chunk it ;;; may be less. ;;; ;;; * ~~~~rsp points to the bottom of the buffer.~~~~ ;;; TODO WRONG this is all just scribbling where it shouldn't ;;; OLD_CODE: dq $ + 0x8 ; The codeword mov.dreg.dimm rdx, 0 ; store running file size here ;sub.qreg.bimm rsp, 0xFF ; reserve stack space ;;; ;;; ELF header ;;; mov.qreg.disp8.dimm rsp, 0x00, 0x7F bappend "ELF" ; magic number mov.qreg.disp8.bimm rsp, 0x04, 2 ; 64-bit mov.qreg.disp8.bimm rsp, 0x05, 1 ; little-endian mov.qreg.disp8.bimm rsp, 0x06, 1 ; ELF header format version 1 mov.qreg.disp8.bimm rsp, 0x07, 0 ; System-V ABI mov.qreg.disp8.dimm rsp, 0x08, 0 ; (padding) mov.qreg.disp8.wimm rsp, 0x10, 2 ; executable mov.qreg.disp8.wimm rsp, 0x12, 0x3E ; Intel x86-64 mov.qreg.disp8.dimm rsp, 0x14, 1 ; ELF format version ; Compute the entry pointer. mov.qreg.qimm rax, _start ; the offset of _start ; This includes the origin, intentionally. mov.qreg.disp8.qreg rsp, 0x18, rax ; entry point mov.qreg.disp8.dimm rsp, 0x20, 64 ; program header offset ; We place the program header immediately after the ELF header. This ; offset is from the start of the file. mov.qreg.disp8.dimm rsp, 0x28, 0 ; section header offset mov.qreg.disp8.dimm rsp, 0x30, 0 ; processor flags mov.qreg.disp8.wimm rsp, 0x34, 64 ; ELF header size mov.qreg.disp8.wimm rsp, 0x36, 56 ; program header entry size mov.qreg.disp8.wimm rsp, 0x38, 1 ; number of program header entries mov.qreg.disp8.wimm rsp, 0x3a, 0 ; section header entry size mov.qreg.disp8.wimm rsp, 0x3c, 0 ; number of section header entries mov.qreg.disp8.wimm rsp, 0x3e, 0 ; section name string table index ; Add the size of the ELF header to the running total mov.dreg.dimm rax, 0x40 add.qreg.qreg rdx, rax ;;; ;;; Program header ;;; mov.qreg.disp8.dimm rsp, 0x40, 1 ; "loadable" segment type mov.qreg.disp8.dimm rsp, 0x44, 0x05 ; read+execute permission mov.qreg.disp8.dimm rsp, 0x48, 0 ; offset in file mov.qreg.disp8.dimm rsp, 0x50, $$ ; virtual address ; required, but can be anything, subject to alignment mov.qreg.disp8.dimm rsp, 0x58, 0 ; physical address (ignored) ; Fill in 0 as the file size for now, to avoid unitialized memory. mov.qreg.disp8.dimm rsp, 0x60, 0 ; size in file mov.qreg.disp8.dimm rsp, 0x68, 0 ; size in memory mov.qreg.disp8.dimm rsp, 0x70, 0 ; segment alignment ; for relocation, but this doesn't apply to us ; Add the size of the program header to the running total mov.dreg.dimm rax, 0x38 add.qreg.qreg rdx, rax ;;; Hardcode the size of the actual code chunk based on flatassembler's ;;; label calculations, since we don't yet have a way to generate it from ;;; within our code. ;;; ;;; Originally this was a constant number, to discourage reliance on label ;;; math, but the direction things are growing in now is to implement ;;; general label math ourselves, so that's okay. ;;; ;;; TODO of course, really we want to for-real compute this at runtime mov.qreg.qimm rax, code_size add.qreg.qreg rdx, rax ;;; ;;; Go back and fill in the file size now that we know it (ill-gotten ;;; knowledge though it is). ;;; mov.qreg.disp8.qreg rsp, 0x60, rdx ; size in file mov.qreg.disp8.qreg rsp, 0x68, rdx ; size in memory ;;; ;;; The buffer is ready; output the file. ;;; ; write() from stack-allocated buffer mov.b rax,1 mov.qreg.qimm rdi, 1 mov.qreg.qreg rsi, rsp mov.qreg.qimm rdx, 0x78 syscall ; write() the machine code by using self-reference ; ; TODO do this in a "real" quine way mov.b rax, 1 mov.qreg.qimm rdi, 1 mov.qreg.qimm rsi, elf_header + 0x78 mov.qreg.qimm rdx, file_size - 0x78 syscall ;;; ;;; Clean up. ;;; ; exit() mov.b rax, 60 mov.b rdi, 0 syscall code_size = $ - code_start file_size = $ - $$