From 95c85fffa1d24abf751582b02dd62cf390fd8c95 Mon Sep 17 00:00:00 2001 From: Irene Knapp Date: Thu, 23 Oct 2025 02:51:30 -0700 Subject: pick a header format, document it also all the existing stuff has word headers now. it doesn't need them to run, it just has them anyway. and EXIT was moved after DOCOL because it feels poetic that DOCOL is the first Forth word in the file Force-Push: yes Change-Id: I592445310712bfde42fde8cffd7bc672f16c4e6c --- quine.asm | 227 +++++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 173 insertions(+), 54 deletions(-) (limited to 'quine.asm') diff --git a/quine.asm b/quine.asm index e1f91c6..2f4015d 100644 --- a/quine.asm +++ b/quine.asm @@ -1,6 +1,9 @@ ;;; QUINE ;;; ;;; This file is formatted to be read at 80-columns or wider. +;;; +;;; There's some tabular information, but diagrams have been avoided, in an +;;; attempt to make this manageable in screen readers. Feedback welcome. ;;;;;;;;;;;;;;;;;;;;; @@ -39,6 +42,22 @@ ;;; flatassembler's built-in semantics. No include files of any kind are used ;;; for it. +macro pad: bytes + if bytes > 0 + db 0x00 + pad (bytes - 1) + end if +end macro + +macro align: bytes + if bytes > 0 + if $ mod bytes <> 0 + db 0x00 + align bytes + end if + end if +end macro + ; The way these are all spelled out like this is slightly ridiculous, there ; must be a better way. macro rex.0 @@ -747,16 +766,8 @@ code_start: ;;; stack. ;;; ;;; We adopt this model of words, codewords, and variables-as-words. It's -;;; nice for us because it works without a heap. -;;; -;;; One way in which we differ from Forth is that we don't have a -;;; dictionary, and our words don't have names. Nothing would prevent this, -;;; it just isn't useful to this single-purpose program. The Forth dictionary -;;; is usually a linked list of every word that has ever been defined, with -;;; the newest at the head; the names of words are stored in string fields as -;;; part of every word's internal header. 
Our header has neither the pointer -;;; field for the dictionary, nor the string; the only header we have is the -;;; the codeword. +;;; really nice how it doesn't force anything else on us, not even a heap, +;;; though we do end up using a heap. ;;; ;;; We specifically implement a version of calling and returning that Forth ;;; calls indirect threaded code: The control stack is a stack of pointers @@ -774,6 +785,33 @@ code_start: ;;; DOCOL is just ordinary code, not a macro. It's defined later in this ;;; file, as a label. ;;; +;;; Notionally, we could consider not having a dictionary, and not giving +;;; our words names. However, it feels silly to stop when we're so close to +;;; being a full Forth, and using names for things solves a bootstrapping +;;; problem related to heap management - see the write-up of _start about how +;;; the heap is created, below. So, we add an additional header before the +;;; codeword for this purpose. +;;; +;;; The Forth dictionary is usually a linked list of every word that has +;;; ever been defined, with the newest at the head; the names of words are +;;; stored in string fields, often right next to the link pointer. We adopt +;;; this model, with the field sizes and order shown in the quick reference +;;; below. We break with Forth tradition in one way: Rather than having a +;;; length field, we use a null-terminated string. Thus, there's no length +;;; limit on names. This necessitates breaking out the flags (to be explained +;;; later) into their own byte, rather than taking bits from the length field +;;; for them. +;;; +;;; There's an important performance consideration: Executable words +;;; reference each other by pointers to their respective codewords. However, +;;; dictionary entries reference each other by pointers to their respective +;;; link fields. Traversing from the link field to the codeword is easy, +;;; though it's a non-constant-time operation: Just walk the string. 
In order +;;; to make Forth words easy to "decompile", it would be nice to also have a +;;; way to traverse backwards. We solve this by making the name field be +;;; null-terminated at both ends. Fun, yeah? +;;; +;;; ;;; ;;; ;;; -------------------------------------------------------------------------- @@ -782,15 +820,31 @@ code_start: ;;; ;;; The layout of an interpreted word: ;;; -;;; 0x00 - 0x08 Codeword (address of DOCOL snippet) -;;; 0x08 - ???? (8-byte chunks) Addresses of other words -;;; ... (end) Address of EXIT word -;;; -;;; The layout of a machine-code word: -;;; -;;; 0x00 - 0x08 Addresss of immediately following byte -;;; 0x08 - ???? Arbitrary machine code -;;; ... (end) Inlined implementation of NEXT +;;; (overall start) +;;; 0x00 - 0x08 Link (to next-oldest word) +;;; 0x09 - 0x09 I0H00000 Flags +;;; I - immediate +;;; H - hidden +;;; all other bits reserved +;;; (name start) +;;; 0x0a - 0x0a Null byte (terminates name) +;;; 0x0b - name-end - 1 Name, as UTF-8 +;;; name-end - name-end Null byte (terminates name) +;;; (padding start) +;;; name-end + 1 - codeword-start - 1 Zero-pad to 8-byte boundary +;;; (it's possible this will be zero bytes long) +;;; (codeword start) +;;; ... + 0x00 - ... + 0x08 Codeword (ie. address of DOCOL) +;;; (8-byte chunks) Addresses of other words +;;; - ... (end) Address of EXIT word +;;; +;;; The layout of a machine-code word is different only from the codeword on: +;;; +;;; ... + 0x00 - ... + 0x08 Address of next byte +;;; ... + 0x08 - ???? Arbitrary machine code +;;; - ... (end) Inlined implementation of NEXT +;;; +;;; Also, words always start at 8-byte boundaries. ;;; ;;; ;;; REGISTER usage conventions: @@ -894,39 +948,6 @@ macro POPCONTROL target lea.qreg.qreg.disp8 rbp, 8, rbp end macro -;;; -;;; Routine DOCOL -;;; ------------- -;;; -;;; Reference this via its label as the codeword of a word to make it an -;;; "interpreted" word.
Concretely, it saves rsi (the "instruction pointer") -;;; to the control stack, takes the address of the codeword from rax and -;;; increments it in-place to form the new instruction pointer, and copies -;;; that to rsi. -;;; -;;; Having then done this, we're now in the state that normal execution -;;; expects, so DOCOL ends by it using NEXT to begin the callee's execution, -;;; kicking off a nested call. -;;; -;;; The name is said to be short for "do colon", because Forth high-level -;;; code begins word definitions with a colon. -;;; -;;; Registers in: -;;; -;;; * rsi is the caller's instruction pointer -;;; * rbp is the control stack pointer -;;; * rax is the address of the callee's codeword -;;; -;;; Registers out: -;;; -;;; * rsi is the callee's instruction pointer -;;; * rbp is the control stack pointer -DOCOL: - PUSHCONTROL rsi - add.qreg.bimm rax, 8 - mov.qreg.qreg rsi, rax - NEXT - ;;; ;;; Routine _start ;;; -------------- @@ -935,6 +956,8 @@ DOCOL: ;;; actually execute. Linkers traditionally call this _start, and on balance ;;; I think it's probably best to keep that name, though I've honestly never ;;; liked it... Anyway, the ELF header points to it and exec() jumps to it. +;;; Also, though it could be anywhere in the code part of the output, in order +;;; to make the hexdump pretty we put it at the start. ;;; ;;; The kernel gives us most registers zeroed, and rsp pointing to the ;;; command-line stuff (argc, argv, envp), which is at an ASLR'd address with @@ -1073,6 +1096,9 @@ _start: mov.qreg.disp32.qreg rdi, control_stack_size + 0x00, rdi ; HEAP mov.qreg.disp32.qreg rdi, control_stack_size + 0x08, rsp ; S0 mov.qreg.disp32.qreg rdi, control_stack_size + 0x10, rbp ; R0 + ; TODO also consider HERE, LATEST, and STATE + ; strictly speaking, R0 could be a constant... 
but it isn't known until + ; runtime, so we might as well make it a variable ;;; ;;; * HEAP is the physical bottom of the heap ;;; The heap grows upwards in memory, so this is also the logical @@ -1131,16 +1157,89 @@ _start: ;;; one of them), which is what NEXT wants rsi to point to. It's only ever ;;; used this one time, so we just put it right here. ;;; + + align 8 cold_start: ;;; TODO this is probably where we should deal with that HEAP that we passed ;;; on the stack dq QUIT +;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Now we are in Forth ;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; Everything we define from here on out is an actual Forth word, with a +;;; proper header and everything. So, you'll see some more preamble before the +;;; definitions. +;;; +;;; Keep in mind, though, that, although we have threaded execution, we +;;; don't yet have Forth-style variables. That's because the heap is at a +;;; dynamically-chosen location, so none of this read-only code that we're +;;; defining now can reference it. Before invoking cold_start, we thoughtfully +;;; put the value of HEAP on the stack for ourselves; our first task will be +;;; to dynamically allocate some words on the heap that know how to find the +;;; heap. We'll do that by defining bootstrapping versions of the +;;; word-defining words, which will eventually be replaced. + +;;; +;;; Routine DOCOL +;;; ------------- +;;; +;;; Reference this via its label as the codeword of a word to make it an +;;; "interpreted" word. Concretely, it saves rsi (the "instruction pointer") +;;; to the control stack, takes the address of the codeword from rax and +;;; increments it in-place to form the new instruction pointer, and copies +;;; that to rsi. +;;; +;;; Having then done this, we're now in the state that normal execution +;;; expects, so DOCOL ends by using NEXT to begin the callee's execution, +;;; kicking off a nested call.
+;;; +;;; The name is said to be short for "do colon", because Forth high-level +;;; code begins word definitions with a colon. +;;; +;;; Registers in: +;;; +;;; * rsi is the caller's instruction pointer +;;; * rbp is the control stack pointer +;;; * rax is the address of the callee's codeword +;;; +;;; Registers out: +;;; +;;; * rsi is the callee's instruction pointer +;;; * rbp is the control stack pointer + align 8 +DOCOL_name: + dq 0 ; This is the very first word, so its link is null. + db 0x00, "DOCOL", 0x00 + align 8 +DOCOL_constant: + ; Evaluated as a word, DOCOL is a constant which returns a pointer. + dq $ + 0x8 ; codeword + mov.qreg.qimm rax, DOCOL + push.qreg rax + NEXT + align 8 +DOCOL: + ; Since DOCOL is not a normal word, the label points to the value we care + ; about from the assembly side of things, which is the address we use as the + ; codeword. + PUSHCONTROL rsi + add.qreg.bimm rax, 8 + mov.qreg.qreg rsi, rax + NEXT + + ;;; ;;; This is the mechanism to "return" from a word interpreted by DOCOL. ;;; We pop the control stack, and then, since this is threaded execution, we ;;; do the next thing the caller wants to do, by inlining NEXT. ;;; + align 8 +EXIT_name: + dq DOCOL_name + db 0x00, "EXIT", 0x00 + align 8 EXIT: dq $ + 0x8 ; codeword POPCONTROL rsi @@ -1150,6 +1249,11 @@ EXIT: ;;; One of the most charming naming traditions in Forth is that the ;;; top-level word that stays running forever, is called "quit". ;;; + align 8 +QUIT_name: + dq EXIT_name + db 0x00, "QUIT", 0x00 + align 8 QUIT: dq DOCOL ; codeword @@ -1182,6 +1286,11 @@ QUIT: ;;; ;;; This does the Linux exit() system call, passing it exit code zero. ;;; + align 8 +SYS_EXIT_name: + dq QUIT_name + db 0x00, "SYS_EXIT", 0x00 + align 8 SYS_EXIT: dq $ + 0x8 ; codeword @@ -1189,7 +1298,7 @@ SYS_EXIT: mov.b rdi, 0 ; exit code syscall - ; In the event we're still here, let's minmize confusion. + ; In the event we're still here, let's minimize confusion.
hlt @@ -1197,6 +1306,11 @@ SYS_EXIT: ;;; (new) Implementation strategy ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; + align 8 +QUINE_name: + dq SYS_EXIT_name + db 0x00, "QUINE", 0x00 + align 8 QUINE: dq DOCOL ; codeword dq OLD_CODE @@ -1225,8 +1339,13 @@ QUINE: ;;; ;;; * rcx points to the bottom of the buffer. ;;; + align 8 +OLD_CODE_name: + dq QUINE_name + db 0x00, "OLD_CODE", 0x00 + align 8 OLD_CODE: - dq $ + 0x8 ; The codeword + dq $ + 0x8 ; codeword mov.qreg.qreg rcx, rdi add.qreg.dimm rcx, control_stack_size + 0x18 -- cgit 1.4.1