diff options
Diffstat (limited to 'quine.asm')
| -rw-r--r-- | quine.asm | 227 |
1 files changed, 173 insertions, 54 deletions
diff --git a/quine.asm b/quine.asm index e1f91c6..2f4015d 100644 --- a/quine.asm +++ b/quine.asm @@ -1,6 +1,9 @@ ;;; QUINE ;;; ;;; This file is formatted to be read at 80-columns or wider. +;;; +;;; There's some tabular information, but diagrams have been avoided, in an +;;; attempt to make this manageable in screen readers. Feedback welcome. ;;;;;;;;;;;;;;;;;;;;; @@ -39,6 +42,22 @@ ;;; flatassembler's built-in semantics. No include files of any kind are used ;;; for it. +macro pad: bytes + if bytes > 0 + db 0x00 + pad (bytes - 1) + end if +end macro + +macro align: bytes + if bytes > 0 + if $ mod bytes <> 0 + db 0x00 + align bytes + end if + end if +end macro + ; The way these are all spelled out like this is slightly ridiculous, there ; must be a better way. macro rex.0 @@ -747,16 +766,8 @@ code_start: ;;; stack. ;;; ;;; We adopt this model of words, codewords, and variables-as-words. It's -;;; nice for us because it works without a heap. -;;; -;;; One way in which we differ from Forth is that we don't have a -;;; dictionary, and our words don't have names. Nothing would prevent this, -;;; it just isn't useful to this single-purpose program. The Forth dictionary -;;; is usually a linked list of every word that has ever been defined, with -;;; the newest at the head; the names of words are stored in string fields as -;;; part of every word's internal header. Our header has neither the pointer -;;; field for the dictionary, nor the string; the only header we have is the -;;; the codeword. +;;; really nice how it doesn't force anything else on us, not even a heap, +;;; though we do end up using a heap. ;;; ;;; We specifically implement a version of calling and returning that Forth ;;; calls indirect threaded code: The control stack is a stack of pointers @@ -774,6 +785,33 @@ code_start: ;;; DOCOL is just ordinary code, not a macro. It's defined later in this ;;; file, as a label. 
;;; +;;; Notionally, we could consider not having a dictionary, and not giving +;;; our words names. However, it feels silly to stop when we're so close to +;;; being a full Forth, and using names for things solves a bootstrapping +;;; problem related to heap management - see the write-up of _start about how +;;; the heap is created, below. So, we add an additional header before the +;;; codeword for this purpose. +;;; +;;; The Forth dictionary is usually a linked list of every word that has +;;; ever been defined, with the newest at the head; the names of words are +;;; stored in string fields, often right next to the link pointer. We adopt +;;; this model, with the field sizes and order shown in the quick reference +;;; below. We break with Forth tradition in one way: Rather than having a +;;; length field, we use a null-terminated string. Thus, there's no length +;;; limit on names. This necessitates breaking out the flags (to be explained +;;; later) into their own byte, rather than taking bits from the length field +;;; for them. +;;; +;;; There's an important performance consideration: Executable words +;;; reference each other by pointers to their respective codewords. However, +;;; dictionary entries reference each other by pointers to their respective +;;; link fields. Traversing from the link field to the codeword is easy, +;;; though it's a non-constant-time operation: Just walk the string. In order +;;; to make Forth words easy to "decompile", it would be nice to also have a +;;; way to traverse backwards. We solve this by making the name field be +;;; null-terminated at both ends. Fun, yeah? +;;; +;;; ;;; ;;; ;;; -------------------------------------------------------------------------- @@ -782,15 +820,31 @@ code_start: ;;; ;;; The layout of an interpreted word: ;;; -;;; 0x00 - 0x08 Codeword (address of DOCOL snippet) -;;; 0x08 - ???? (8-byte chunks) Addresses of other words -;;; ... 
(end) Address of EXIT word -;;; -;;; The layout of a machine-code word: -;;; -;;; 0x00 - 0x08 Addresss of immediately following byte -;;; 0x08 - ???? Arbitrary machine code -;;; ... (end) Inlined implementation of NEXT +;;; (overall start) +;;; 0x00 - 0x07 Link (to next-oldest word) +;;; 0x08 - 0x08 I0H00000 Flags +;;; I - immediate +;;; H - hidden +;;; all other bits reserved +;;; (name start) +;;; 0x09 - 0x09 Null byte (terminates name) +;;; 0x0a - name-end - 1 Name, as UTF-8 +;;; name-end - name-end Null byte (terminates name) +;;; (padding start) +;;; name-end + 1 - codeword-start - 1 Zero-pad to 8-byte boundary +;;; (it's possible this will be zero bytes long) +;;; (codeword start) +;;; ... + 0x00 - ... + 0x08 Codeword (ie. address of DOCOL) +;;; (8-byte chunks) Addresses of other words +;;; - ... (end) Address of EXIT word +;;; +;;; The layout of a machine-code word is different only from the codeword on: +;;; +;;; ... + 0x00 - ... + 0x08 Address of next byte +;;; ... + 0x08 - ???? Arbitrary machine code +;;; - ... (end) Inlined implementation of NEXT +;;; +;;; Also, words always start at 8-byte boundaries. ;;; ;;; ;;; REGISTER usage conventions: @@ -895,39 +949,6 @@ macro POPCONTROL target end macro ;;; -;;; Routine DOCOL -;;; ------------- -;;; -;;; Reference this via its label as the codeword of a word to make it an -;;; "interpreted" word. Concretely, it saves rsi (the "instruction pointer") -;;; to the control stack, takes the address of the codeword from rax and -;;; increments it in-place to form the new instruction pointer, and copies -;;; that to rsi. -;;; -;;; Having then done this, we're now in the state that normal execution -;;; expects, so DOCOL ends by it using NEXT to begin the callee's execution, -;;; kicking off a nested call. -;;; -;;; The name is said to be short for "do colon", because Forth high-level -;;; code begins word definitions with a colon. 
-;;; -;;; Registers in: -;;; -;;; * rsi is the caller's instruction pointer -;;; * rbp is the control stack pointer -;;; * rax is the address of the callee's codeword -;;; -;;; Registers out: -;;; -;;; * rsi is the callee's instruction pointer -;;; * rbp is the control stack pointer -DOCOL: - PUSHCONTROL rsi - add.qreg.bimm rax, 8 - mov.qreg.qreg rsi, rax - NEXT - -;;; ;;; Routine _start ;;; -------------- ;;; @@ -935,6 +956,8 @@ DOCOL: ;;; actually execute. Linkers traditionally call this _start, and on balance ;;; I think it's probably best to keep that name, though I've honestly never ;;; liked it... Anyway, the ELF header points to it and exec() jumps to it. +;;; Also, though it could be anywhere in the code part of the output, in order +;;; to make the hexdump pretty we put it at the start. ;;; ;;; The kernel gives us most registers zeroed, and rsp pointing to the ;;; command-line stuff (argc, argv, envp), which is at an ASLR'd address with @@ -1073,6 +1096,9 @@ _start: mov.qreg.disp32.qreg rdi, control_stack_size + 0x00, rdi ; HEAP mov.qreg.disp32.qreg rdi, control_stack_size + 0x08, rsp ; S0 mov.qreg.disp32.qreg rdi, control_stack_size + 0x10, rbp ; R0 + ; TODO also consider HERE, LATEST, and STATE + ; strictly speaking, R0 could be a constant... but it isn't known until + ; runtime, so we might as well make it a variable ;;; ;;; * HEAP is the physical bottom of the heap ;;; The heap grows upwards in memory, so this is also the logical @@ -1131,16 +1157,89 @@ _start: ;;; one of them), which is what NEXT wants rsi to point to. It's only ever ;;; used this one time, so we just put it right here. ;;; + + align 8 cold_start: ;;; TODO this is probably where we should deal with that HEAP that we passed ;;; on the stack dq QUIT +;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Now we are in Forth ;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Everything we define from here on out is an actual Forth word, with a +;;; proper header and everything. 
So, you'll see some more preamble before the +;;; definitions. +;;; +;;; Keep in mind, though, that, although we have threaded execution, we +;;; don't yet have Forth-style variables. That's because the heap is at a +;;; dynamically-chosen location, so none of this read-only code that we're +;;; defining now can reference it. Before invoking cold_start, we thoughtfully +;;; put the value of HEAP on the stack for ourselves; our first task will be +;;; to dynamically allocate some words on the heap that know how to find the +;;; heap. We'll do that by defining bootstrapping versions of the +;;; word-defining words, which will eventually be replaced. + +;;; +;;; Routine DOCOL +;;; ------------- +;;; +;;; Reference this via its label as the codeword of a word to make it an +;;; "interpreted" word. Concretely, it saves rsi (the "instruction pointer") +;;; to the control stack, takes the address of the codeword from rax and +;;; increments it in-place to form the new instruction pointer, and copies +;;; that to rsi. +;;; +;;; Having then done this, we're now in the state that normal execution +;;; expects, so DOCOL ends by using NEXT to begin the callee's execution, +;;; kicking off a nested call. +;;; +;;; The name is said to be short for "do colon", because Forth high-level +;;; code begins word definitions with a colon. +;;; +;;; Registers in: +;;; +;;; * rsi is the caller's instruction pointer +;;; * rbp is the control stack pointer +;;; * rax is the address of the callee's codeword +;;; +;;; Registers out: +;;; +;;; * rsi is the callee's instruction pointer +;;; * rbp is the control stack pointer + align 8 +DOCOL_name: + dq 0 ; This is the very first word, so its link is null. + db 0x00, "DOCOL", 0x00 + align 8 +DOCOL_constant: + ; Evaluated as a word, DOCOL is a constant which returns a pointer. 
+ dq $ + 0x8 ; codeword + mov.qreg.qimm rax, DOCOL + push.qreg rax + NEXT + align 8 +DOCOL: + ; Since DOCOL is not a normal word, the label points to the value we care + ; about from the assembly side of things, which is the address we use as the + ; codeword. + PUSHCONTROL rsi + add.qreg.bimm rax, 8 + mov.qreg.qreg rsi, rax + NEXT + + ;;; ;;; This is the mechanism to "return" from a word interpreted by DOCOL. ;;; We pop the control stack, and then, since this is threaded execution, we ;;; do the next thing the caller wants to do, by inlining NEXT. ;;; + align 8 +EXIT_name: + dq DOCOL_name + db 0x00, "EXIT", 0x00 + align 8 EXIT: dq $ + 0x8 ; codeword POPCONTROL rsi @@ -1150,6 +1249,11 @@ EXIT: ;;; One of the most charming naming traditions in Forth is that the ;;; top-level word that stays running forever, is called "quit". ;;; + align 8 +QUIT_name: + dq EXIT_name + db 0x00, "QUIT", 0x00 + align 8 QUIT: dq DOCOL ; codeword @@ -1182,6 +1286,11 @@ QUIT: ;;; ;;; This does the Linux exit() system call, passing it exit code zero. ;;; + align 8 +SYS_EXIT_name: + dq QUIT_name + db 0x00, "SYS_EXIT", 0x00 + align 8 SYS_EXIT: dq $ + 0x8 ; codeword @@ -1189,7 +1298,7 @@ SYS_EXIT: mov.b rdi, 0 ; exit code syscall - ; In the event we're still here, let's minmize confusion. + ; In the event we're still here, let's minimize confusion. hlt @@ -1197,6 +1306,11 @@ SYS_EXIT: ;;; (new) Implementation strategy ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; + align 8 +QUINE_name: + dq SYS_EXIT_name + db 0x00, "QUINE", 0x00 + align 8 QUINE: dq DOCOL ; codeword dq OLD_CODE @@ -1225,8 +1339,13 @@ QUINE: ;;; ;;; * rcx points to the bottom of the buffer. ;;; + align 8 +OLD_CODE_name: + dq QUINE_name + db 0x00, "OLD_CODE", 0x00 + align 8 OLD_CODE: - dq $ + 0x8 ; The codeword + dq $ + 0x8 ; codeword mov.qreg.qreg rcx, rdi add.qreg.dimm rcx, control_stack_size + 0x18 |