summary refs log tree commit diff
path: root/quine.asm
diff options
context:
space:
mode:
Diffstat (limited to 'quine.asm')
-rw-r--r--quine.asm227
1 files changed, 173 insertions, 54 deletions
diff --git a/quine.asm b/quine.asm
index e1f91c6..2f4015d 100644
--- a/quine.asm
+++ b/quine.asm
@@ -1,6 +1,9 @@
 ;;; QUINE
 ;;;
 ;;; This file is formatted to be read at 80-columns or wider.
+;;;
+;;; There's some tabular information, but diagrams have been avoided, in an
+;;; attempt to make this manageable in screen readers. Feedback welcome.
 
 
 ;;;;;;;;;;;;;;;;;;;;;
@@ -39,6 +42,22 @@
 ;;; flatassembler's built-in semantics. No include files of any kind are used
 ;;; for it.
 
+macro pad: bytes                ; emit the given number of zero bytes
+  if bytes > 0                  ; nothing is emitted once the count is zero
+    db 0x00
+    pad (bytes - 1)             ; recurse for the remaining bytes
+  end if
+end macro
+
+macro align: bytes              ; zero-pad until $ is a multiple of bytes
+  if bytes > 0                  ; a non-positive argument emits nothing
+    if $ mod bytes <> 0         ; not on a boundary yet
+      db 0x00
+      align bytes               ; re-check after emitting one byte
+    end if
+  end if
+end macro
+
 ; The way these are all spelled out like this is slightly ridiculous, there
 ; must be a better way.
 macro rex.0
@@ -747,16 +766,8 @@ code_start:
 ;;; stack.
 ;;;
 ;;;   We adopt this model of words, codewords, and variables-as-words. It's
-;;; nice for us because it works without a heap.
-;;;
-;;;   One way in which we differ from Forth is that we don't have a
-;;; dictionary, and our words don't have names. Nothing would prevent this,
-;;; it just isn't useful to this single-purpose program. The Forth dictionary
-;;; is usually a linked list of every word that has ever been defined, with
-;;; the newest at the head; the names of words are stored in string fields as
-;;; part of every word's internal header. Our header has neither the pointer
-;;; field for the dictionary, nor the string; the only header we have is the
-;;; the codeword.
+;;; really nice how it doesn't force anything else on us, not even a heap,
+;;; though we do end up using a heap.
 ;;;
 ;;;   We specifically implement a version of calling and returning that Forth
 ;;; calls indirect threaded code: The control stack is a stack of pointers
@@ -774,6 +785,33 @@ code_start:
 ;;;   DOCOL is just ordinary code, not a macro. It's defined later in this
 ;;; file, as a label.
 ;;;
+;;;   Notionally, we could consider not having a dictionary, and not giving
+;;; our words names. However, it feels silly to stop when we're so close to
+;;; being a full Forth, and using names for things solves a bootstrapping
+;;; problem related to heap management - see the write-up of _start about how
+;;; the heap is created, below. So, we add an additional header before the
+;;; codeword for this purpose.
+;;;
+;;;   The Forth dictionary is usually a linked list of every word that has
+;;; ever been defined, with the newest at the head; the names of words are
+;;; stored in string fields, often right next to the link pointer. We adopt
+;;; this model, with the field sizes and order shown in the quick reference
+;;; below. We break with Forth tradition in one way: Rather than having a
+;;; length field, we use a null-terminated string. Thus, there's no length
+;;; limit on names. This necessitates breaking out the flags (to be explained
+;;; later) into their own byte, rather than taking bits from the length field
+;;; for them.
+;;;
+;;;   There's an important performance consideration: Executable words
+;;; reference each other by pointers to their respective codewords. However,
+;;; dictionary entries reference each other by pointers to their respective
+;;; link fields. Traversing from the link field to the codeword is easy,
+;;; though it's a non-constant-time operation: Just walk the string. In order
+;;; to make Forth words easy to "decompile", it would be nice to also have a
+;;; way to traverse backwards. We solve this by making the name field be
+;;; null-terminated at both ends. Fun, yeah?
+;;;
+;;;
 ;;;
 ;;;
 ;;; --------------------------------------------------------------------------
@@ -782,15 +820,31 @@ code_start:
 ;;;
 ;;; The layout of an interpreted word:
 ;;;
-;;;     0x00 - 0x08                     Codeword (address of DOCOL snippet)
-;;;     0x08 - ???? (8-byte chunks)     Addresses of other words
-;;;       ... (end)                     Address of EXIT word
-;;;
-;;; The layout of a machine-code word:
-;;;
-;;;     0x00 - 0x08                     Addresss of immediately following byte
-;;;     0x08 - ????                     Arbitrary machine code
-;;;       ... (end)                     Inlined implementation of NEXT
+;;; (overall start)
+;;;            0x00 - 0x08                     Link (to next-oldest word)
+;;;            0x09 - 0x09  I0H00000           Flags
+;;;                                                I - immediate
+;;;                                                H - hidden
+;;;                                                all other bits reserved
+;;; (name start)
+;;;            0x0a - 0x0a                     Null byte (terminates name)
+;;;            0x0b - name-end - 1             Name, as UTF-8
+;;;        name-end - name-end                 Null byte (terminates name)
+;;; (padding start)
+;;;    name-end + 1 - codeword-start - 1       Zero-pad to 8-byte boundary
+;;;    (it's possible this will be zero bytes long)
+;;; (codeword start)
+;;;      ... + 0x00 - ... + 0x08               Codeword (ie. address of DOCOL)
+;;;          (8-byte chunks)                   Addresses of other words
+;;;                 - ... (end)                Address of EXIT word
+;;;
+;;; The layout of a machine-code word differs only from the codeword onward:
+;;;
+;;;      ... + 0x00 - ... + 0x08               Address of next byte
+;;;      ... + 0x08 - ????                     Arbitrary machine code
+;;;                 - ... (end)                Inlined implementation of NEXT
+;;;
+;;; Also, words always start at 8-byte boundaries.
 ;;;
 ;;;
 ;;; REGISTER usage conventions:
@@ -895,39 +949,6 @@ macro POPCONTROL target
 end macro
 
 ;;;
-;;; Routine DOCOL
-;;; -------------
-;;;
-;;;   Reference this via its label as the codeword of a word to make it an
-;;; "interpreted" word. Concretely, it saves rsi (the "instruction pointer")
-;;; to the control stack, takes the address of the codeword from rax and
-;;; increments it in-place to form the new instruction pointer, and copies
-;;; that to rsi.
-;;;
-;;;   Having then done this, we're now in the state that normal execution
-;;; expects, so DOCOL ends by it using NEXT to begin the callee's execution,
-;;; kicking off a nested call.
-;;;
-;;;   The name is said to be short for "do colon", because Forth high-level
-;;; code begins word definitions with a colon.
-;;;
-;;; Registers in:
-;;;
-;;; * rsi is the caller's instruction pointer
-;;; * rbp is the control stack pointer
-;;; * rax is the address of the callee's codeword
-;;;
-;;; Registers out:
-;;;
-;;; * rsi is the callee's instruction pointer
-;;; * rbp is the control stack pointer
-DOCOL:
-  PUSHCONTROL rsi
-  add.qreg.bimm rax, 8
-  mov.qreg.qreg rsi, rax
-  NEXT
-
-;;;
 ;;; Routine _start
 ;;; --------------
 ;;;
@@ -935,6 +956,8 @@ DOCOL:
 ;;; actually execute. Linkers traditionally call this _start, and on balance
 ;;; I think it's probably best to keep that name, though I've honestly never
 ;;; liked it... Anyway, the ELF header points to it and exec() jumps to it.
+;;; Also, though it could be anywhere in the code part of the output, in order
+;;; to make the hexdump pretty we put it at the start.
 ;;;
 ;;;   The kernel gives us most registers zeroed, and rsp pointing to the
 ;;; command-line stuff (argc, argv, envp), which is at an ASLR'd address with
@@ -1073,6 +1096,9 @@ _start:
   mov.qreg.disp32.qreg rdi, control_stack_size + 0x00, rdi    ; HEAP
   mov.qreg.disp32.qreg rdi, control_stack_size + 0x08, rsp    ; S0
   mov.qreg.disp32.qreg rdi, control_stack_size + 0x10, rbp    ; R0
+  ; TODO also consider HERE, LATEST, and STATE
+  ; strictly speaking, R0 could be a constant... but it isn't known until
+  ; runtime, so we might as well make it a variable
   ;;;
   ;;; * HEAP is the physical bottom of the heap
   ;;;     The heap grows upwards in memory, so this is also the logical
@@ -1131,16 +1157,89 @@ _start:
 ;;; one of them), which is what NEXT wants rsi to point to. It's only ever
 ;;; used this one time, so we just put it right here.
 ;;;
+
+  align 8                       ; zero-pad so cold_start is 8-byte aligned
 cold_start:
 ;;; TODO this is probably where we should deal with that HEAP that we passed
 ;;; on the stack
   dq QUIT                       ; address of the QUIT word's codeword
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Now we are in Forth ;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;
+;;;   Everything we define from here on out is an actual Forth word, with a
+;;; proper header and everything. So, you'll see some more preamble before the
+;;; definitions.
+;;;
+;;;   Keep in mind, though, that although we have threaded execution, we
+;;; don't yet have Forth-style variables. That's because the heap is at a
+;;; dynamically-chosen location, so none of this read-only code that we're
+;;; defining now can reference it. Before invoking cold_start, we thoughtfully
+;;; put the value of HEAP on the stack for ourselves; our first task will be
+;;; to dynamically allocate some words on the heap that know how to find the
+;;; heap. We'll do that by defining bootstrapping versions of the
+;;; word-defining words, which will eventually be replaced.
+
+;;;
+;;; Routine DOCOL
+;;; -------------
+;;;
+;;;   Reference this via its label as the codeword of a word to make it an
+;;; "interpreted" word. Concretely, it saves rsi (the "instruction pointer")
+;;; to the control stack, takes the address of the codeword from rax and
+;;; increments it in-place to form the new instruction pointer, and copies
+;;; that to rsi.
+;;;
+;;;   Having then done this, we're now in the state that normal execution
+;;; expects, so DOCOL ends by using NEXT to begin the callee's execution,
+;;; kicking off a nested call.
+;;;
+;;;   The name is said to be short for "do colon", because Forth high-level
+;;; code begins word definitions with a colon.
+;;;
+;;; Registers in:
+;;;
+;;; * rsi is the caller's instruction pointer
+;;; * rbp is the control stack pointer
+;;; * rax is the address of the callee's codeword
+;;;
+;;; Registers out:
+;;;
+;;; * rsi is the callee's instruction pointer
+;;; * rbp is the control stack pointer
+  align 8                        ; words start at 8-byte boundaries
+DOCOL_name:
+  dq 0                  ; This is the very first word, so its link is null.
+  db 0x00, "DOCOL", 0x00         ; name, null-terminated at both ends
+  align 8                        ; zero-pad so the codeword is 8-byte aligned
+DOCOL_constant:
+  ; Evaluated as a word, DOCOL is a constant which returns a pointer.
+  dq $ + 0x8                     ; codeword: address of the code that follows
+  mov.qreg.qimm rax, DOCOL       ; rax = address of the DOCOL routine below
+  push.qreg rax                  ; push that address as the word's result
+  NEXT
+  align 8
+DOCOL:
+  ; Since DOCOL is not a normal word, the label points to the value we care
+  ; about from the assembly side of things, which is the address we use as the
+  ; codeword.
+  PUSHCONTROL rsi                ; save the caller's instruction pointer
+  add.qreg.bimm rax, 8           ; step past the 8-byte codeword
+  mov.qreg.qreg rsi, rax         ; rsi = callee's instruction pointer
+  NEXT
+
+
 ;;;
 ;;;   This is the mechanism to "return" from a word interpreted by DOCOL.
 ;;; We pop the control stack, and then, since this is threaded execution, we
 ;;; do the next thing the caller wants to do, by inlining NEXT.
 ;;;
+  align 8
+EXIT_name:
+  dq DOCOL_name
+  db 0x00, "EXIT", 0x00
+  align 8
 EXIT:
   dq $ + 0x8                     ; codeword
   POPCONTROL rsi
@@ -1150,6 +1249,11 @@ EXIT:
 ;;;   One of the most charming naming traditions in Forth is that the
 ;;; top-level word that stays running forever, is called "quit".
 ;;;
+  align 8
+QUIT_name:
+  dq EXIT_name
+  db 0x00, "QUIT", 0x00
+  align 8
 QUIT:
   dq DOCOL                       ; codeword
 
@@ -1182,6 +1286,11 @@ QUIT:
 ;;;
 ;;; This does the Linux exit() system call, passing it exit code zero.
 ;;;
+  align 8
+SYS_EXIT_name:
+  dq QUIT_name
+  db 0x00, "SYS_EXIT", 0x00
+  align 8
 SYS_EXIT:
   dq $ + 0x8                     ; codeword
 
@@ -1189,7 +1298,7 @@ SYS_EXIT:
   mov.b rdi, 0                   ; exit code
   syscall
 
-  ; In the event we're still here, let's minmize confusion.
+  ; In the event we're still here, let's minimize confusion.
   hlt
 
 
@@ -1197,6 +1306,11 @@ SYS_EXIT:
 ;;; (new) Implementation strategy ;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;
+  align 8
+QUINE_name:
+  dq SYS_EXIT_name
+  db 0x00, "QUINE", 0x00
+  align 8
 QUINE:
   dq DOCOL                       ; codeword
   dq OLD_CODE
@@ -1225,8 +1339,13 @@ QUINE:
 ;;;
 ;;; * rcx points to the bottom of the buffer.
 ;;;
+  align 8
+OLD_CODE_name:
+  dq QUINE_name
+  db 0x00, "OLD_CODE", 0x00
+  align 8
 OLD_CODE:
-  dq $ + 0x8                               ; The codeword
+  dq $ + 0x8                     ; codeword
 
   mov.qreg.qreg rcx, rdi
   add.qreg.dimm rcx, control_stack_size + 0x18