5 files changed, 1072 insertions, 4 deletions
diff --git a/core.e b/core.e
new file mode 100644
index 0000000..cb3921a
--- /dev/null
+++ b/core.e
@@ -0,0 +1,952 @@
+~ Stack manipulation routines
+~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~
+~   We start with the three traditional stack operations, swap drop and
+~ roll. Sorry to fans of the name "ROT"; we were an HP calculator kid. It'll
+~ always be roll to us. Anyway, we do a couple other operations too. Since
+~ our goal right now is just to bootstrap the heap, we keep this short and
+~ sweet.
+~
+~   There is definitely plenty of optimization that could be done.
+
+: swap
+  [ here @
+    :rax pop-reg64
+    :rbx pop-reg64
+    :rax push-reg64
+    :rbx push-reg64
+    here ! ] ;asm
+
+: drop
+  [ here @
+    : rax pop-reg64
+    here ! ] ;asm
+
+: 2drop
+  [ here @
+    :rax pop-reg64
+    :rax pop-reg64
+    here ! ] ;asm
+
+~   Rotates "up" (pops its parameter, n; nth item then becomes current item).
+~
+~   We implement this the high-performance way, with rep movsq, aka the
+~ instruction that exists to optimize C's memcpy(). The details of setting
+~ that up are complex; see below.
+: roll
+  [ here @
+
+    ~   Pop our parameter. The rep instruction takes rcx as its count, so we
+    ~ reduce copying by using it to hold our count, as well.
+    :rcx pop-reg64
+
+    ~   We have n - 1 items to slide, so decrement rcx. For the purpose of
+    ~ counting how many repetitions will happen, it's one-based. This is because
+    ~ the rep instruction performs a single movsq, then decrements rcx, then
+    ~ stops if rcx is zero.
+    :rcx dec-reg64
+
+    ~   Retrieve the nth item, for later. For this purpose we're thinking in
+    ~  zero-based terms, so we do this after already having decremented rcx.
+    :rsp :rcx 8 :rbx mov-reg64-indexed-reg64
+
+    ~   The source address for movsq is rsi and the destination is rdi; we can
+    ~ use rdi as we wish, but rsi is our Forth "instruction pointer", so we must
+    ~ save and restore it. Doing so alters rsp, so we have to adjust the address
+    ~ calculations by eight bytes as compared to the expressions above, but
+    ~ happily we can use the disp8 field to do that. We'd be using disp8 anyway
+    ~ because it's helpful.
+    :rsi push-reg64
+
+    ~   Now we set up parameters for the memory-sliding operation. We have
+    ~ n - 1 items to copy, moving the range rsp through rsp + (n-2)*8 onto the
+    ~ range rsp + 8 through rsp + (n-1)*8. That's with the value of rsp as it
+    ~ exists at this moment (it's going to change soon).
+    ~
+    ~   We're sliding them upwards in memory, so we start at the high end so
+    ~ that we're always moving into a location that doesn't have anything
+    ~ precious. We use lea as a convenient way to do the stack math.
+    ~
+    ~   When rcx is 1, we want rsp + 8.
+    :rsp :rcx 8 :rsi lea-reg64-indexed-reg64
+    ~   When rcx is 1, we want rsp + 16.
+    :rsp :rcx 8 8 :rdi lea-reg64-disp8-indexed-reg64
+    ~
+    ~   Using rcx = 1 is the most convenient example to use for figuring out the
+    ~ arithmetic. It's a linear relationship, so as long as we get the 8-byte
+    ~ stride correct, we just need to pick a single point and verify that our
+    ~ math is right for that point, and it'll be right for any value of rcx.
+
+    ~   Another of our Forth conventions is that the DF flag should be kept at
+    ~ zero, which directs string instruction to increment rsi. Here, however,
+    ~ because our source and destination ranges overlap, we need to start at the
+    ~ high end, which means we need it to decrement. So we set DF to one, and
+    ~ we'll clear it after.
+    std
+    rep-movs64
+
+    ~ Set everything back.
+    cld
+    :rsi pop-reg64
+
+    ~ There is now an extra item at the low end of the stack (the top) that
+    ~ needs to go away, and coincidentally we have a value in rbx that needs to
+    ~ be in that spot. Rather than doing a drop and push, we overwrite it, to
+    ~ save a little work.
+    :rbx :rsp mov-indirect-reg64-reg64
+
+    ~ All done, wow! What a mouthful.
+    here ! ] ;asm
+
+~   Rotates "down" (pops its parameter, n; current item then becomes nth
+~ item).
+~
+~   Jonesforth calls this "-roll" and we could do that, but honestly the name
+~ unroll sounds nicer and it's only a single character longer. You might say
+~ it rolls off the tongue better.
+~
+~   We implement this the high-performance way, with rep movsq, aka the
+~ instruction that exists to optimize C's memcpy(). The details of setting
+~ that up are complex; see below.
+: unroll
+  [ here @
+
+    ~   Pop our parameter. The rep instruction takes rcx as its count, so we
+    ~ reduce copying by using it to hold our count, as well.
+    :rcx pop-reg64
+
+    ~ We have n - 1 items to slide, so decrement rcx. Also, save a copy of it in
+    ~ rdx after doing that, for later.
+    :rcx dec-reg64
+    :rcx :rdx mov-reg64-reg64
+
+    ~   Retrieve the 0th item, for later.
+    :rsp :rbx mov-reg64-indirect-reg64
+
+    ~   Now we set up parameters for the memory-sliding operation. We have
+    ~ n - 1 items to copy, moving the range rsp + 8 through rsp + (n-1)*8 onto
+    ~ the range rsp through rsp + (n-2)*8. That's with the value of rsp as it
+    ~ exists at this moment (it's going to change soon).
+    ~
+    ~   We're sliding them downwards in memory, so we start at the low end so
+    ~ that we're always moving into a location that doesn't have anything
+    ~ precious. We use lea as a convenient way to do the stack math.
+    ~
+    ~   As with roll, we need to save rsi and adjust those rsp calculations
+    ~ accordingly.
+    :rsi push-reg64
+
+    ~   Regardless of rcx, we want rsp + 16.
+    :rsp 16 :rsi lea-reg64-disp8-reg64
+    ~   Regardless of rcx, we want rsp + 8.
+    :rsp 8 :rdi lea-reg64-disp8-reg64
+
+    ~   With roll, we were starting at the high end. Here, we start at the low
+    ~ end, which means we need rsi to increment after each repetition. That's
+    ~ what it does when the DF flag is clear, and another of our Forth
+    ~ conventions is to keep it clear normally. So, we don't have to touch DF!
+    ~ Yay!
+    rep-movs64
+
+    ~ Restore our original rsi.
+    :rsi pop-reg64
+
+    ~ There is now an extra item in the middle of the stack, at the high end of
+    ~ the sliding we did, that needs to be overwritten with our value in rbx.
+    ~ Since we destructively updated our count in rcx, we saved a copy of the
+    ~ count in rdx, and we use that to find the right address.
+    ~
+    ~ When the original count was n, we want rsp + (n-1)*8, so we saved rdx
+    ~ after decrementing rcx, above.
+    :rbx :rsp :rdx 8 mov-indexed-reg64-reg64
+
+    ~ All done, wow! What a mouthful.
+    here ! ] ;asm
+
+~ Rotates "up" (third item becomes current item)
+: 3roll
+  [ here @
+    :rax pop-reg64
+    :rbx pop-reg64
+    :rcx pop-reg64
+    :rbx push-reg64
+    :rax push-reg64
+    :rcx push-reg64
+    here ! ] ;asm
+
+~ Rotates "down" (current item becomes third item)
+: 3unroll
+  [ here @
+    :rax pop-reg64
+    :rbx pop-reg64
+    :rcx pop-reg64
+    :rax push-reg64
+    :rcx push-reg64
+    :rbx push-reg64
+    here ! ] ;asm
+
+: dup
+  [ here @
+    :rax pop-reg64
+    :rax push-reg64
+    :rax push-reg64
+    here ! ] ;asm
+
+: 2dup
+  [ here @
+    :rax pop-reg64
+    :rbx pop-reg64
+    :rbx push-reg64
+    :rax push-reg64
+    :rbx push-reg64
+    :rax push-reg64
+    here ! ] ;asm
+
+
+~ Arithmetic routines
+~ ~~~~~~~~~~~~~~~~~~~
+~
+~   No surprises here. Again, since our goal is to bootstrap the heap, we
+~ keep it short. Also again, this is nowhere near optimal.
+
+: +
+  [ here @
+    :rbx pop-reg64
+    :rax pop-reg64
+    :rbx :rax add-reg64-reg64
+    :rax push-reg64
+    here ! ] ;asm
+
+: -
+  [ here @
+    :rbx pop-reg64
+    :rax pop-reg64
+    :rbx :rax sub-reg64-reg64
+    :rax push-reg64
+    here ! ] ;asm
+
+: *
+  [ here @
+    :rax pop-reg64
+    :rbx pop-reg64
+    :rbx mul-reg64
+    :rax push-reg64
+    here ! ] ;asm
+
+~   Jonesforth calls this "/mod" but % is widely recognized and has no special
+~ Forth significance.
+: /%
+  [ here @
+    :rdx :rdx xor-reg64-reg64    ~ rdx is the high bits of the input; zero it
+    :rbx pop-reg64
+    :rax pop-reg64
+    :rbx divmod-reg64
+    :rdx push-reg64              ~ remainder
+    :rax push-reg64              ~ quotient
+    here ! ] ;asm
+
+
+~ Comparison routines
+~ ~~~~~~~~~~~~~~~~~~~
+~
+~   So. This is subtle. These comparison routines all have the same structure.
+~ Notice that = and != are commutative, whereas the others are not, so
+~ consider > as the archetypical one when reasoning about correctness. To
+~ test if A > B, we do cmp A, B, which sets the flags the same way as
+~ subtracting B from A does. The mnemonic names of the condition codes are
+~ based on the assumption you do it in this order.
+~
+~   We want to treat the top of the stack as the first operand, so we
+~ carefully pop in the appropriate order.
+~
+~   There are both signed and unsigned variants of the condition codes. We
+~ provide both; the unmarked comparisons such as > and <= are signed, while
+~ the marked ones such as >unsigned and <=unsigned are of course unsigned.
+
+~   Jonesforth calls this "="; most languages would call it "==".
+: =
+  [ here @
+    :rax pop-reg64
+    :rbx pop-reg64
+    :rbx :rax cmp-reg64-reg64
+    :cc-equal :al set-reg8-cc
+    0x01 :rax and-reg64-imm8
+    :rax push-reg64
+    here ! ] ;asm
+
+~   Jonesforth calls this "<>", but even most modern SQL dialects recognize
+~ C's legacy and allow "!=" these days. As someone who learned C in childhood,
+~ this is not actually a hard call for us.
+: !=
+  [ here @
+    :rax pop-reg64
+    :rbx pop-reg64
+    :rbx :rax cmp-reg64-reg64
+    :cc-not-equal :al set-reg8-cc
+    0x01 :rax and-reg64-imm8
+    :rax push-reg64
+    here ! ] ;asm
+
+: >
+  [ here @
+    :rax pop-reg64
+    :rbx pop-reg64
+    :rbx :rax cmp-reg64-reg64
+    :cc-greater :al set-reg8-cc
+    0x01 :rax and-reg64-imm8
+    :rax push-reg64
+    here ! ] ;asm
+
+~ Is the top of the stack less than the second item in the stack?
+: <
+  [ here @
+    :rax pop-reg64
+    :rbx pop-reg64
+    :rbx :rax cmp-reg64-reg64
+    :cc-less :al set-reg8-cc
+    0x01 :rax and-reg64-imm8
+    :rax push-reg64
+    here ! ] ;asm
+
+: >=
+  [ here @
+    :rax pop-reg64
+    :rbx pop-reg64
+    :rbx :rax cmp-reg64-reg64
+    :cc-greater-equal :al set-reg8-cc
+    0x01 :rax and-reg64-imm8
+    :rax push-reg64
+    here ! ] ;asm
+
+: <=
+  [ here @
+    :rax pop-reg64
+    :rbx pop-reg64
+    :rbx :rax cmp-reg64-reg64
+    :cc-less-equal :al set-reg8-cc
+    0x01 :rax and-reg64-imm8
+    :rax push-reg64
+    here ! ] ;asm
+
+: >unsigned
+  [ here @
+    :rax pop-reg64
+    :rbx pop-reg64
+    :rbx :rax cmp-reg64-reg64
+    :cc-above :al set-reg8-cc
+    0x01 :rax and-reg64-imm8
+    :rax push-reg64
+    here ! ] ;asm
+
+: <unsigned
+  [ here @
+    :rax pop-reg64
+    :rbx pop-reg64
+    :rbx :rax cmp-reg64-reg64
+    :cc-below :al set-reg8-cc
+    0x01 :rax and-reg64-imm8
+    :rax push-reg64
+    here ! ] ;asm
+
+: >=unsigned
+  [ here @
+    :rax pop-reg64
+    :rbx pop-reg64
+    :rbx :rax cmp-reg64-reg64
+    :cc-above-equal :al set-reg8-cc
+    0x01 :rax and-reg64-imm8
+    :rax push-reg64
+    here ! ] ;asm
+
+: <=unsigned
+  [ here @
+    :rax pop-reg64
+    :rbx pop-reg64
+    :rbx :rax cmp-reg64-reg64
+    :cc-below-equal :al set-reg8-cc
+    0x01 :rax and-reg64-imm8
+    :rax push-reg64
+    here ! ] ;asm
+
+~ Bitwise routines
+~ ~~~~~~~~~~~~~~~~
+~
+
+: &
+  [ here @
+    :rbx pop-reg64
+    :rax pop-reg64
+    :rbx :rax and-reg64-reg64
+    :rax push-reg64
+    here ! ] ;asm
+
+: |
+  [ here @
+    :rbx pop-reg64
+    :rax pop-reg64
+    :rbx :rax or-reg64-reg64
+    :rax push-reg64
+    here ! ] ;asm
+
+: xor
+  [ here @
+    :rbx pop-reg64
+    :rax pop-reg64
+    :rbx :rax xor-reg64-reg64
+    :rax push-reg64
+    here ! ] ;asm
+
+~   The HP overloads the name "not", so we follow the Forth convention.
+: invert
+  [ here @
+    :rax pop-reg64
+    :rax not-reg64
+    :rax push-reg64
+    here ! ] ;asm
+
+
+~ Literal routines
+~ ~~~~~~~~~~~~~~~~
+~
+~   These words are rarely used directly from user code, but are emitted by
+~ the interpreter, in compile mode. They allow literal values of various kinds
+~ to exist inline as part of compiled code, which would otherwise be only an
+~ array of codeword pointers. They do this, in their various ways, by pushing
+~ any appropriate value on the stack and making sure execution skips over the
+~ bytes used to represent it.
+
+: lit
+  [ here @
+    lods64
+    :rax push-reg64
+    here ! ] ;asm
+
+: litstring
+  [ here @
+    ~   The string immediately follows the codeword in memory, so rsi is
+    ~ already pointing to it. That address will be our returned result, so we
+    ~ push it to the stack.
+    :rsi push-reg64
+
+    ~   Now we need to skip over the string, so that rsi will be valid for the
+    ~ next Forth word. To do that, we're going to do a string operation with
+    ~ scas8, which takes rdi as the address to look at. This means scas8 is
+    ~ treating its operand as analogous to the destination operand of movs*.
+    :rsi :rdi mov-reg64-reg64
+    ~   We want to compare for equality with zero; scas8 looks in al for the
+    ~ other half of the comparison, so we clear rax.
+    :rax :rax xor-reg64-reg64
+    ~   Counterintuitively, we do need to pass a count. We pass -1, which will
+    ~ always work.
+    :rcx :rcx xor-reg64-reg64
+    :rcx not-reg64
+    ~   The DF flag is zero per our Forth execution-model convention, which
+    ~ means scas8 increments rdi after each iteration. This is what we want,
+    ~ so there's no need to mess with it.
+    ~
+    ~   We also need to worry about ZF, but fortunately that "not" will make
+    ~ sure it's clear.
+    ~
+    ~   So, we're ready; do it!
+    repnz-scas8
+    ~   The scas8 completes, incrementing rdi, before the repnz checks its
+    ~ condition. So, rdi is now pointing immediately after the terminating
+    ~ null byte. Of course, we want this in rsi for Forth's purposes.
+    :rdi :rsi mov-reg64-reg64
+
+    ~   Finally, we need to align rsi to the next word boundary.
+    7 :rsi add-reg64-imm8
+    7 invert :rsi and-reg64-imm8
+
+    here ! ] ;asm
+
+
+~ Memory access routines
+~ ~~~~~~~~~~~~~~~~~~~~~~
+~
+~   We go with Forth names for this stuff. The HP's names for memory and
+~ storage operations heavily leverage the fact they have an object system
+~ with type tags and so on; we want to stay close to the bytes.
+
+~ Address on the top of the stack, value in the second position
+: !
+  [ here @
+    :rbx pop-reg64
+    :rax pop-reg64
+    :rax :rbx mov-indirect-reg64-reg64
+    here ! ] ;asm
+
+: @
+  [ here @
+    :rax pop-reg64
+    :rax :rax mov-reg64-indirect-reg64
+    :rax push-reg64
+    here ! ] ;asm
+
+~   I might have put the parameters the other way round, but this is what
+~ Jonesforth does and it seems reasonable enough.
+~
+~ (value, address --)
+: +!
+  [ here @
+    :rbx pop-reg64
+    :rax pop-reg64
+    :rax :rbx add-indirect-reg64-reg64
+    here ! ] ;asm
+
+~ (value, address --)
+: -!
+  [ here @
+    :rbx pop-reg64
+    :rax pop-reg64
+    :rax :rbx sub-indirect-reg64-reg64
+    here ! ] ;asm
+
+~   Jonesforth calls this "c!" but we categorically reject the use of letters
+~ that are meant to indicate byte sizes. It's unfriendly to newcomers. There's
+~ some risk that this name will be confused for meaning "store a value of 8",
+~ but that would not be a useful task, so hopefully it'll be okay.
+: 8!
+  [ here @
+    :rbx pop-reg64
+    :rax pop-reg64
+    :al :rbx mov-indirect-reg64-reg8
+    here ! ] ;asm
+
+: 8@
+  [ here @
+    :rbx pop-reg64
+    :rax :rax xor-reg64-reg64
+    :rbx :al mov-reg8-indirect-reg64
+    :rax push-reg64
+    here ! ] ;asm
+
+: 16!
+  [ here @
+    :rbx pop-reg64
+    :rax pop-reg64
+    :ax :rbx mov-indirect-reg64-reg16
+    here ! ] ;asm
+
+: 16@
+  [ here @
+    :rbx pop-reg64
+    :rax :rax xor-reg64-reg64
+    :rbx :ax mov-reg16-indirect-reg64
+    :rax push-reg64
+    here ! ] ;asm
+
+: 32!
+  [ here @
+    :rbx pop-reg64
+    :rax pop-reg64
+    :eax :rbx mov-indirect-reg64-reg32
+    here ! ] ;asm
+
+: 32@
+  [ here @
+    :rbx pop-reg64
+    :rax :rax xor-reg64-reg64
+    :rbx :eax mov-reg32-indirect-reg64
+    :rax push-reg64
+    here ! ] ;asm
+
+~   Before we get too deep into it, we also define a few reflection routines
+~ that retrieve, or set, the address of either of the two stacks.
+~
+~   The result, or the new value, is on the top of the data stack. That's the
+~ same as how it works for any other word, but given the metacircular nature
+~ of these ones it's easy to get confused about that...
+~
+~   Jonesforth calls this RSP!, which looks as if it's meant to be an Intel
+~ register name but is actually short for return stack pointer. There is no
+~ register by that name, it's a Forth-provided abstraction. That's super
+~ confusing, plus as discussed above we call it the control stack not the
+~ return stack, so we call the words...
+: control!
+  [ here @
+    :rbp pop-reg64
+    here ! ] ;asm
+: control@
+  [ here @
+    :rbp push-reg64
+    here ! ] ;asm
+
+~   Jonesforth calls this DSP!, for data stack pointer. Again, there's no
+~ Intel register by that name, and we call it the value stack, so...
+: value!
+  [ here @
+    ~   Per Intel's description of POP this reads from the old location, and
+    ~ there is no increment applied to the resulting value. See, the
+    ~ description says it increments the register then overwrites it, in that
+    ~ order.
+    :rsp pop-reg64
+    here ! ] ;asm
+: value@
+  [ here @
+    ~ Per Intel's description of PUSH this pushes the old value.
+    :rsp push-reg64
+    here ! ] ;asm
+
+~   Copy one non-overlapping block of memory over another. For the overlapping
+~ logic, see roll and unroll. This always copies at byte granularity, for ease
+~ of implementation.
+~
+~   Jonesforth calls this CMOVE and has a CCOPY which is a single byte. POSIX,
+~ however, has memcpy() for non-overlapping blocks and memmove() for
+~ overlapping blocks. We follow the latter convention, because it feels like
+~ the more important distinction.
+~
+~   Also, the "c" was meant to indicate that it works at one-byte granularity,
+~ but that isn't, uh... actually an important property here, and as a blanket
+~ call we're not using letters to denote data sizes. So we call it "memcopy".
+~ Apologies to the C programming tradition but vowels are good, actually.
+~
+~   Jonesforth also offers C@C! as another name for its CCOPY, but neither
+~ "@!" nor "mem@mem!" seems particulaly nice.
+~
+~ (destination, source, length --)
+: memcopy
+  [ here @
+    ~   We need to save and restore rsi; the other registers we can trample.
+    :rsi :rdx mov-reg64-reg64
+    :rcx pop-reg64
+    :rsi pop-reg64
+    :rdi pop-reg64
+    ~   We start from the low end, since that's easier arithmetic. So, we get
+    ~ to leave the DF flag alone.
+    rep-movs8
+    :rdx :rsi mov-reg64-reg64
+    here ! ] ;asm
+
+~   This does the same thing as memcopy, but correctly handles situations
+~ where the destination overlaps with the source. It achieves this by being
+~ careful about which end the transfer starts from. This "move" vs. "copy"
+~ distinction mirrors C terminology.
+~
+~ (destination, source, length --)
+: memmove
+  [ here @
+    ~ We need to save and restore rsi; the other registers we can trample.
+    :rsi :rdx mov-reg64-reg64
+    :rcx pop-reg64
+    :rsi pop-reg64
+    :rdi pop-reg64
+
+    ~ We need to check source < destination to decide which end to start from.
+    :rsi :rax mov-reg64-reg64
+    :rdi :rax cmp-reg64-reg64
+    ~ Relative offsets are from the start of the instruction after the jmp.
+    4 :cc-below jmp-cc-rel-imm8
+
+    ~ If source is greater, we are sliding downwards so we start from the low
+    ~ end. So, we get to leave the DF flag alone.
+    rep-movs8                         ~ 2 bytes
+    16 jmp-rel-imm8                   ~ 2 bytes
+
+    ~ If destination is greater, we are sliding upwards so we start from the
+    ~ high end. So, we have to save and restore DF. Also, we have to adjust the
+    ~ pointers.
+    :rcx :rsi add-reg64-reg64         ~ 3 bytes
+    :rsi dec-reg64                    ~ 3 bytes
+    :rcx :rdi add-reg64-reg64         ~ 3 bytes
+    :rdi dec-reg64                    ~ 3 bytes
+    std                               ~ 1 byte
+    rep-movs8                         ~ 2 bytes
+    cld                               ~ 1 byte
+
+    :rdx :rsi mov-reg64-reg64
+    here ! ] ;asm
+
+
+~ (string pointer -- string length not including null byte)
+: stringlen
+  [ here @
+    :rdi pop-reg64
+    :rdi :rbx mov-reg64-reg64
+    :rax :rax xor-reg64-reg64
+    :rcx :rcx xor-reg64-reg64
+    :rcx not-reg64
+    repnz-scas8
+    :rbx :rdi sub-reg64-reg64
+    1 :rdi sub-reg64-imm8
+    :rdi push-reg64
+    here ! ] ;asm
+
+~   This word is for working with strings that are null-terminated at both
+~ ends. It counts backwards from the end to find the string length. It accepts
+~ an input pointer to the final non-null byte at the end of the string, and
+~ returns the length not including either null byte.
+~
+~ (string end pointer -- string length not including null)
+: reverse-stringlen
+  [ here @
+    :rdi pop-reg64
+
+    :rdi :rbx mov-reg64-reg64
+    :rax :rax xor-reg64-reg64
+    :rcx :rcx xor-reg64-reg64
+    :rcx not-reg64
+
+    std
+    repnz-scas8
+    cld
+
+    :rdi :rbx sub-reg64-reg64
+    1 :rbx sub-reg64-imm8
+
+    :rbx push-reg64
+    here ! ] ;asm
+
+
+~   If you have a variable-length string followed by alignment padding, and
+~ you want to traverse it in reverse, you also need to skip the alignment
+~ padding...
+~
+~   The alignment padding can have a total length of no more than 8 bytes. The
+~ returned value counts from the end, backwards, to the nearest non-null byte,
+~ or at most a distance of eight.
+~
+~ (address of final null byte in padding -- count of null bytes in padding)
+: reverse-padding-len
+  [ here @
+    :rdi pop-reg64
+
+    :rdi :rbx mov-reg64-reg64
+    :rax :rax xor-reg64-reg64
+    9 :rcx mov-reg64-imm32
+
+    std
+    repz-scas8
+    cld
+
+    :rdi :rbx sub-reg64-reg64
+    1 :rbx sub-reg64-imm8
+
+    :rbx push-reg64
+    here ! ] ;asm
+
+
+~   We make this work using exactly two jump instructions, which is likely the
+~ minimum possible. To avoid relying on labels, we hand-compute the byte
+~ offsets, so every instruction within their ranges is annotated with its
+~ length in bytes. Good fun.
+~
+~   The returned comparsion value is 0 for equal, 1 to indicate the left
+~ string (lower on the stack) is greater, and -1 to indicate the right string
+~ (top of the stack) is greater.
+~
+~ (left string pointer, right string pointer -- comparison value)
+: stringcmp
+  [ here @
+    ~   Save the rsi register.
+    ~
+    ~   Happily, we don't need a lot of registers for this code, so we can
+    ~ dedicate rdx to this and not have to deal with stack juggling..
+    :rsi :rdx mov-reg64-reg64
+
+    ~   Get the parameters off the stack.
+    ~
+    ~   For reasons that will be explained below, the left and right strings
+    ~ from our caller's perspective are swapped from the perspective of the
+    ~ comparisons we'll be doing. The rdi register points to the right-hand
+    ~ operand of the "cmps" instruction, while rsi points to the left-hand
+    ~ operand; [Intel] volume 2A, chapter 3, section 4-3.3, "CMPS"; note that
+    ~ the PDF mis-numbers the sections in the second half of chapter 3.
+    :rsi pop-reg64
+    :rdi pop-reg64
+
+    ~   For the comparison-unequal loop-exit code, we'll need rbx set to zero.
+    ~ The easy way to do that is by xor'ing it with itself, but doing it at
+    ~ the end of the loop would overwrite the comparison flags, which we need.
+    ~ So we do it in advance, and carefully don't touch it anywhere else.
+    ~
+    ~   See the list of which instructions affect the flags in [Intel]
+    ~ volume 1, appendix A, section A-1, table A-2.
+    :rbx :rbx xor-reg64-reg64
+
+    ~   We also clear rax; we'll be using al later, and it will be convenient
+    ~ to know the upper bits are always zero.
+    :rax :rax xor-reg64-reg64
+
+    ~ Now we've initialized everything we need; everything after this point is
+    ~ part of the loop.
+
+    ~   At the start of each iteration, save a copy of the byte we're at now.
+    ~ We need to do this before cmps because the pointers will increment;
+    ~ we'll eventually use it to test whether we've reached the end delimiter.
+    ~
+    ~   We do a 64-bit load into rcx, which is otherwise unused, then copy the
+    ~ low byte from cl to al. This avoids having to think about addressing
+    ~ modes that combine 64-bit addresses with smaller data sizes; those are
+    ~ very subtle.
+    :rsi :rcx mov-reg64-indirect-reg64  ~ 3 bytes
+    :cl :al mov-reg8-reg8               ~ 2 bytes
+
+    ~   Now do the cmps, which is the heart of the loop.
+    ~
+    ~   Note that, in addition to relying on this comparison to test content
+    ~ bytes against each other, in the event that one string is a prefix of
+    ~ the other, this will also test content bytes against null delimiters.
+    ~ The longer string will compare as "greater", because zero is less than
+    ~ all other possible byte values. This is what we want.
+    ~
+    ~   Since strings of different lengths are necessarily unequal, letting
+    ~ this test do the work of detecting that also means we don't have to deal
+    ~ with scenarios where we're past the end of one string but not the other.
+    cmps8                               ~ 1 byte
+
+    ~   The flags are now set based on the simulated subtraction of the next
+    ~ bytes from the two strings. If they were unequal, the loop will end. If
+    ~ they were equal, we have another test to do before we're finished with
+    ~ this iteration. So we put the loop-end code next, and conditionally jump
+    ~ forward past it.
+    ~
+    ~   Recall that relative offsets are from the start of the instruction after
+    ~ the jmp.
+    15 :cc-equal jmp-cc-rel-imm8        ~ 2 bytes
+
+    ~   If we got here, the strings are unequal, so we need to turn the flags
+    ~ into a comparison value. We cleared rbx earlier; now we set its low bit
+    ~ to the "above" flag, then use sbb to subtract the "carry" flag (which it
+    ~ thinks of as the "borrow" flag).
+    ~
+    ~   Recall that "carry" is based on whether the comparison would cause a
+    ~ change to the next bit "outside" the bits being compared; [Intel]
+    ~ volume 1, chapter 3, section 3-4.3.1. Since the comparison is a
+    ~ subtraction, "carry" is true when the right-hand byte is strictly
+    ~ greater than the left-hand byte.
+    ~
+    ~   Also recall that "zero", also called "equal", is based on whether the
+    ~ comparison's result produces output that's all zero bits. Since the
+    ~ comparison is a subtraction, "zero" is true when the two bytes are
+    ~ identical.
+    ~
+    ~   The "above" condition is not a flag, but a composite of flags. It's
+    ~ true when both "carry" and "zero" are false; [Intel] volume 1, chapter
+    ~ 7, section 7-3.1.1, table 7-2. For us, this is equivalent to saying the
+    ~ left-hand byte is strictly greater than the right-hand.
+    ~
+    ~   So, after the "set" instruction, rbx is 1 if left > right, and 0 if
+    ~ left = right or left < right. The "sbb" instruction with an immediate
+    ~ value of zero subtracts the value of the carry flag from rbx; the flag
+    ~ is true in the case where left < right, and false otherwise. So, after
+    ~ the "sbb", rbx is 1 if left > right, 0 if left = right, and -1 if
+    ~ left < right.
+    ~
+    ~   This sounds like the opposite of what we want, but recall that we
+    ~ exchanged the operands, so these are the values our caller is expecting.
+    ~ While it might be tempting to look for a way to not need that
+    ~ transposition, and indeed there exists an approach using "not above",
+    ~ it's fiddly in a way that's even harder to explain.
+    ~
+    ~   Note that it's important that the "jmp" and "set" instructions don't
+    ~ change the flags (see table A-2 again), so the "cmps" is the most recent
+    ~ thing that did. The "sbb" instruction does change them, but we don't
+    ~ need them again after that point so it doesn't matter.
+    ~
+    ~   Finally, we need to restore rsi before we return to Forth.
+    ~
+    ~   The two-instruction set-sbb sequence is one of those classic assembly
+    ~ programming tricks that often goes unexplained, because most of its
+    ~ appeal is its brevity and a proper explanation is quite lengthy. Please
+    ~ enjoy this gift of knowledge, and thanks to our friends who showed it to
+    ~ us.
+    :cc-above :bl set-reg8-cc           ~ 3 bytes
+    0 :rbx sbb-reg64-imm8               ~ 4 bytes
+    :rbx push-reg64                     ~ 1 byte
+    :rdx :rsi mov-reg64-reg64           ~ 3 bytes
+    pack-next                           ~ 4 bytes
+
+    ~   If we got here, the current bytes are equal to each other. We still
+    ~ need to test if they're null terminators; if so, we exit, and if not, we
+    ~ loop.
+    ~
+    ~   At the beginning of this iteration, we saved a copy of the byte being
+    ~ inspected in rax. The "test" instruction simulates a xor and sets the
+    ~ flags accordingly. We check the "not equal" condition, also known as
+    ~ "not zero", and jump backwards to the start of the loop when that's the
+    ~ case.
+    ~
+    ~   Recall that relative offsets are from the start of the instruction
+    ~ after the jmp.
+    :rax :rax test-reg64-reg64          ~ 3 bytes
+    -28 :cc-not-equal jmp-cc-rel-imm8   ~ 2 bytes
+
+    ~   If we got here, we got all the way to the end of both strings and
+    ~ found a null byte, so we return 0 to indicate they're equal.
+    ~
+    ~   We can use push-imm32-extended64 even though our stack holds 64-bit
+    ~ values, because it gets sign-extended.
+    ~
+    ~   We need to restore rsi before we return to Forth.
+    0 push-imm32-extended64
+    :rdx :rsi mov-reg64-reg64
+    here ! ] ;asm
+
+
+~ Branching
+~ ~~~~~~~~~
+
+~   Okay, so there's a weird thing about how branch and 0branch are defined.
+~ 0branch jumps into branch, so they need to be next to each other in the log.
+~ To compute the offset to jump by, we pass an address on the stack. Notice
+~ that assembly words are always defined in immediate mode; it's just they
+~ usually only do trivial logic.
+
+~   This takes a number of bytes, not machine words. That allows it to be used
+~ for putting weird things embedded in the code.
+~
+~   The offset is relative to the start of the word the number of bytes is in,
+~ so, make sure to have it skip itself.
+: branch
+  [ here @
+    dup swap ~ Save a copy of this address for later.
+    :rsi :rsi add-reg64-indirect-reg64
+    here ! ] ;asm
+~ (jump destination)
+
+: 0branch
+  [ here @
+    ~ (jump destination, output point)
+
+    :rax pop-reg64
+    :rax :rax test-reg64-reg64
+
+    ~   Please notice the 8-bit branch to the nearby word. This is the offset
+    ~ from the end of the jmp instruction, to the start of the target
+    ~ instruction. To avoid needing the label system for this, we compute it
+    ~ based on the address before the jmp; the jmp itself is two bytes.
+    ~
+    ~   Remember, with jump arithmetic, subtract the start from the
+    ~ destination.
+    ~
+    ~   What we're testing with :cc-equal is that the input value is zero.
+    ~ The name is slightly counterintuitive; it's a result of the condition
+    ~ names favoring cmp over test. While cmp simulates subtraction, test
+    ~ simulates bitwise AND.
+    dup 2 + 3roll swap - :cc-equal jmp-cc-rel-imm8
+    ~ (output point)
+
+    ~   In the event we didn't jump, we still need to skip over the literal
+    ~ value.  Using lods here is just a convenient way to skip rsi forward.
+    lods64
+
+    here ! ] ;asm
+~ the stack is empty once more
+
+
+~   This is like next, but instead of using rsi as the "instruction pointer",
+~ it takes a codeword address from the value stack.
+~
+~   In the event that the codeword is docol, docol will handle any
+~ manipulation of the control stack that needs to happen. Yes, it really is
+~ that simple.
+~
+~ (execution token --)
+: execute
+  [ here @
+    :rax pop-reg64
+    :rax jmp-abs-indirect-reg64
+    here ! ] ;asm
+
diff --git a/evoke.e b/evoke.e
index 529a9a4..14ac458 100644
--- a/evoke.e
+++ b/evoke.e
@@ -10,6 +10,7 @@
   elf-program-header
   cold-start
   warm-start
+  output-docol
   0 L!' final-word-name
   current-offset L!' total-size
   ;
diff --git a/execution.e b/execution.e
index f6e5978..fb83688 100644
--- a/execution.e
+++ b/execution.e
@@ -174,7 +174,7 @@
 ~ * DF = 0 is required
 ~
 ~ (base address -- new base address)
-: next
+: pack-next
   ~ Copy the next word's address from *rsi into rax. Increment rsi (as per the
   ~ DF flag).
   lods64
@@ -204,7 +204,7 @@
 ~ eventually getting around to inlining "next". Hence the name.
 ~
 ~ (target address, base address -- new base address)
-: beforenext
+: pack-beforenext
   ~ Do a permanent transfer of control by setting rax and invoking the
   ~ codeword. Of course, we could jump to docol ourselves but this will work
   ~ regardless of what the receiving codeword is.
@@ -464,7 +464,7 @@
   ~   We are about to set up rsi, we did rbp already, and rsp came to us
   ~ already set up. That's all that "next" needs, so take it away!
   L@' warm-start :rsi mov-reg64-imm64
-  next ;
+  pack-next ;
 
 ~ Routine warm-start
 ~ ~~~~~~~~~~~~~~~~~~
@@ -537,3 +537,78 @@
   ~ dq early_latest, litstring, "latest", early_variable
   ~ dq early_here, litstring, "here", early_variable
   ;
+
+
+~ (previous entry address, output point, name string pointer
+~  -- new entry address, output point)
+: output-create
+  3roll dup 4 roll swap pack64
+  ~ (string pointer, new entry address, output point)
+  0 pack8
+  0 pack8
+  roll3 packstring
+  ~ (new entry address, output point)
+  8 packalign
+  ;
+
+
+~ Routine docol
+~ ~~~~~~~~~~~~~
+~
+~   Reference this via its label as the codeword of a word to make it an
+~ "interpreted" word. Concretely, it saves rsi (the "instruction pointer")
+~ to the control stack, takes the address of the codeword from rax and
+~ increments it in-place to form the new instruction pointer, and copies
+~ that to rsi.
+~
+~   Having then done this, we're now in the state that normal execution
+~ expects, so docol ends by it using "next" to begin the callee's execution,
+~ kicking off a nested call.
+~
+~   The name is said to be short for "do colon", because Forth high-level
+~ code begins word definitions with a colon.
+~
+~ Registers in:
+~
+~ * rsi is the caller's instruction pointer
+~ * rbp is the control stack pointer
+~ * rax is the address of the callee's codeword
+~
+~ Registers out:
+~
+~ * rsi is the callee's instruction pointer
+~ * rbp is the control stack pointer
+~
+~ (previous entry address, output point)
+: output-docol
+  s" docol" output-create
+
+  ~ Evaluated as a word, docol is a constant which returns a pointer.
+  L@' docol :rax mov-reg64-imm64
+  :rax push-reg64
+  pack-next
+  8 packalign
+
+  ~ Since docol is not a normal word, the label points to the value we care
+  ~ about from the assembly side of things, which is the address we use as the
+  ~ codeword.
+  L!' docol
+  :rsi pack-pushcontrol
+  8 :rax add-reg64-imm8
+  :rax :rsi mov-reg64-reg64
+  pack-next
+  ;
+
+~   This is the mechanism to "return" from a word interpreted by docol.
+~ We pop the control stack, and then, since this is threaded execution, we
+~ do the next thing the caller wants to do, by inlining "next".
+~
+~ (previous entry address, output point
+~  -- new entry address, output point)
+: output-exit
+  s" exit" output-create
+  L!' exit
+  :rsi pack-popcontrol
+  pack-next
+  ;
+
diff --git a/interpret.e b/interpret.e
index 7884feb..eabbb09 100644
--- a/interpret.e
+++ b/interpret.e
@@ -139,7 +139,7 @@
   here @
   swap :rax mov-reg64-imm64
   :rax push-reg64
-  next
+  pack-next
   8 packalign
   here ! ;
 
@@ -410,6 +410,16 @@ latest @ dup hide-entry
 ~ (pointer to [ entry, pointer to ] entry, pointer to ; entry)
 
 
+: ;asm
+  here @ pack-next 8 packalign here !
+  latest @ dup unhide-entry entry-to-execution-token dup 8 + swap !
+  ~ See above.
+  [ 2 pick entry-to-execution-token , ]
+  ; make-immediate
+latest @ dup hide-entry
+~ (pointer to [ entry, pointer to ], pointer to ;, pointer to ;asm)
+
+
 ~   Although we will eventually define the word "'" to give us the symbol of
 ~ a word, it will rely on being able to compile a literal. Rather than do lots
 ~ of string processing later, we choose to define this word now to avoid
diff --git a/quine.asm b/quine.asm
index 12a45f8..44ff249 100644
--- a/quine.asm
+++ b/quine.asm
@@ -7737,6 +7737,36 @@ cold_start:
   dq swap, litstring, ";", early_find, roll3, execute
 
 
+  dq litstring, ";asm", early_create, early_docol_codeword
+  dq litstring, "here", early_find, entry_to_execution_token, early_comma
+  dq litstring, "@", early_find, entry_to_execution_token, early_comma
+  dq litstring, "pack-next", early_find, entry_to_execution_token, early_comma
+  dq litstring, "lit", early_find, entry_to_execution_token, early_comma
+  dq lit, 8, early_comma
+  dq litstring, "packalign", early_find, entry_to_execution_token, early_comma
+  dq litstring, "here", early_find, entry_to_execution_token, early_comma
+  dq litstring, "!", early_find, entry_to_execution_token, early_comma
+  dq litstring, "latest", early_find, entry_to_execution_token, early_comma
+  dq litstring, "@", early_find, entry_to_execution_token, early_comma
+  dq litstring, "dup", early_find, entry_to_execution_token, early_comma
+  dq litstring, "unhide-entry", early_find, entry_to_execution_token
+  dq early_comma
+  dq litstring, "entry-to-execution-token", 0, early_find
+  dq entry_to_execution_token, early_comma
+  dq litstring, "dup", early_find, entry_to_execution_token, early_comma
+  dq litstring, "lit", early_find, entry_to_execution_token, early_comma
+  dq lit, 8, early_comma
+  dq litstring, "+", early_find, entry_to_execution_token, early_comma
+  dq litstring, "swap", early_find, entry_to_execution_token, early_comma
+  dq litstring, "!", early_find, entry_to_execution_token, early_comma
+  dq litstring, "[", early_find, entry_to_execution_token, early_comma
+  dq litstring, "exit", early_find, entry_to_execution_token, early_comma
+  dq early_here, fetch, lit, 8, packalign, early_here_store
+
+  dq litstring, "set-word-immediate", early_find, entry_to_execution_token
+  dq swap, litstring, ";asm", early_find, roll3, execute
+
+
   ; Although we will eventually define the word "'" to give us the symbol of a
   ; word, it will rely on being able to compile a literal. Rather than do
   ; lots of string processing later, we choose to define this word now to