1 files changed, 220 insertions, 102 deletions
diff --git a/interpret.e b/interpret.e
index d2bf10f..ee0a52b 100644
--- a/interpret.e
+++ b/interpret.e
@@ -38,37 +38,132 @@
 ~   It is primarily concerned with managing the contents of an area of memory
 ~ we call the "log"; see log-load.e for more detail on terminology.
 
-: hide-entry dup entry-flags@ 0x80 | entry-flags! ;
-
-: unhide-entry dup entry-flags@ 0x80 invert & entry-flags! ;
-
-
-~ TODO
-~ unroll-past-string                                    0000001000018db8
-~ swap-past-string                                      0000001000018ea0
-~ dropstring                                            0000001000018ee8
-~ dropstring-with-result                                0000001000018f80
-~ accumulate-string                                     0000001000018fc8
-~ is-space                                              0000001000018b00
-~ word                                                  00000010000194a0
-
-~   The word "'", often pronounced "tick", quotes the following word, looking
-~ it up and treating it as a constant. In immediate mode, the constant winds
-~ up on the stack; in compile mode it gets compiled.
-~
-~   There are a few possible implementation strategies here. Running as an
-~ immediate word means there's a clear and unambiguous concept of "the
-~ following word", so that's what we do; otherwise we'd have to get clever
-~ about somehow finding out where we were called from. That means we take on
-~ what would otherwise be the interpreter's responsibility, of checking what
-~ mode we're in. Happily, that's easy to do.
+~   As a convenience for "word", we have some facilities for working with
+~ stack-allocated strings. Yeah, trippy concept. Also, it would be a
+~ buffer overrun hazard if we were worried about that, which is why this
+~ is no longer common practice in C.
 ~
-~   There's a cyclic dependency where "if" relies on "'", and "'" relies on
-~ "if". Fortunately both of them are treated as alternates by the log-load
-~ transform, so we don't have to worry about it.
-: ' word value@ find dropstring-with-result
-  interpreter-flags @ 1 & { literal } if
-  ; make-immediate
+~   The most important of these is accumulate-string, but we need some
+~ smaller pieces first.
+
+~ (stack string..., items to be left alone..., item to be unrolled,
+~  count of items above the string, including the one being unrolled
+~  -- item that was unrolled, stack string..., items left alone)
+: unroll-past-string
+  8 *  ~ Item count to byte count; each stack word is 8 bytes.
+  ~ (string, other items, top item, byte offset to string start)
+  dup value@ +  ~ value@ points at the copy that dup just made.
+  ~ We have two copies of the offset present, in addition to the stuff we want
+  ~ to rotate. So, the actual string starts two words on... We could have
+  ~ adjusted the offset instead, but we'll want the unmodified offset again
+  ~ later.
+  16 +
+  ~ (string, other items, top item, offset to start, string pointer)
+  stringlen
+  ~ Add 1 for the null byte, then pad to whole words (see accumulate-string).
+  1 + 8 align-size
+  ~ (string, other items, top item, offset to start, string length w/ padding)
+  + 8 /% swap drop  ~ Bytes back to words, keeping only the quotient.
+  ~ (string, other items, top item, number of words to unroll)
+  unroll ;
+
+
+
+~ (stack string..., item to be swapped
+~  -- item that was swapped, stack string...)
+: swap-past-string 1 unroll-past-string ;  ~ Only the one item sits above the string.
+
+
+~ (stack string... --)  Removes the whole string, terminator and padding too.
+: dropstring
+  value@ stringlen  ~ The string begins at the top of the stack.
+  ~ Add 1 for the null byte, then pad to whole words (see accumulate-string).
+  1 + 8 align-size value@ +
+  ~   At the time we fetched the stack pointer, there was an extra value atop
+  ~ it, so we have to add one more word.
+  8 + value! ;  ~ Presumably value! installs this as the new stack pointer.
+
+
+~ (stack string..., item to be kept
+~  -- item that was kept)
+: dropstring-with-result swap-past-string dropstring ;  ~ Tuck the item under the string, then drop the string.
+
+
+~ (stack string-so-far..., new character byte
+~  -- updated string-so-far)
+: accumulate-string
+  ~ Compute the address of the final word of the string.
+  ~
+  ~   It's a little bit difficult to get the start pointer right, since all
+  ~ our intermediate products affect what we get from value@, so we compute
+  ~ that just once, here at the beginning.
+  value@ 8 +  ~ Skip the new byte, which is the one word above the string.
+  ~ (string so far, new character byte, pointer to start of string)
+  dup stringlen
+  ~ There are two concerns here that overlap: First, we always want at least
+  ~ one word. Recall that a length of zero bytes won't receive any alignment
+  ~ padding because it's already divisible by 8. Second, the result of
+  ~ stringlen doesn't include the null byte, which might be in a word by
+  ~ itself that needs to be counted. We can address both of them by
+  ~ unconditionally adding 1 to the length before applying alignment.
+  1 +
+  ~ Pad the length for alignment.
+  8 align-size
+  ~ We want an offset from the first word of the string to the last word of
+  ~ the string, so we subtract one word from the length.
+  8 - +
+  ~ (string so far, new character byte, address of final word)
+
+  ~ Examine the final word of the string, leaving other stuff undisturbed.
+  ~ Work low-to-high to figure out where to store the new byte, taking the
+  ~ first one that's available (that is, still zero).
+  ~ (string so far, new character byte, address of final word)
+  dup @ dup 0x00000000000000FF & { 3roll | swap ! exit } unless
+  ~ (string so far, new character byte, address of final word, old value)
+
+  ~ This next part is repeated several times, changing only the mask and the
+  ~ multiplier, for bytes 1 through 6; bytes 0 (above) and 7 (way below) differ.
+  ~ (string so far, new character byte, address of final word, old value)
+  dup 0x000000000000FF00 & { 3roll 0x0000000000000100 * | swap ! exit } unless
+  dup 0x0000000000FF0000 & { 3roll 0x0000000000010000 * | swap ! exit } unless
+  dup 0x00000000FF000000 & { 3roll 0x0000000001000000 * | swap ! exit } unless
+  dup 0x000000FF00000000 & { 3roll 0x0000000100000000 * | swap ! exit } unless
+  dup 0x0000FF0000000000 & { 3roll 0x0000010000000000 * | swap ! exit } unless
+  dup 0x00FF000000000000 & { 3roll 0x0001000000000000 * | swap ! exit } unless
+
+  ~   The top byte of the final word is always zero (or else stringlen
+  ~ wouldn't have called it the final word), so we don't need to check it, we
+  ~ can just use it.
+  ~
+  ~   We need to put the new value in the top byte, which will mean we have no
+  ~ null terminator, so we also need to start a new word.
+  ~
+  ~   There is a fiddly order-dependency here: unroll-past-string relies on
+  ~ being able to find the null terminator, which won't work if we've gotten
+  ~ rid of it. Also, calling it will move all the earlier words, including
+  ~ the one we intend to write to, which will invalidate any pointer we're
+  ~ keeping at that point. There are a few ways to resolve this; what we do is
+  ~ put the new terminator in place first, manually nudge the pointer, and
+  ~ then write the new value.
+  ~ (string so far, new character byte, address of final word, old value)
+  3roll 0x0100000000000000 * | swap
+  ~ (string so far, new value, address of final word)
+  0 3 unroll-past-string
+  ~ (new null terminator, string so far, new value, invalid address)
+  8 -
+  ~ (new null terminator, string so far, new value, updated address)
+  ! ;
+
+
+~ (byte -- 1 if it is an ASCII whitespace character, 0 otherwise)
+: is-space
+  dup 0x20 = { drop 1 exit } if  ~ Space.
+  dup 0x09 = { drop 1 exit } if  ~ Horizontal tab.
+  dup 0x0a = { drop 1 exit } if  ~ Line feed.
+  dup 0x0b = { drop 1 exit } if  ~ Vertical tab.
+  dup 0x0c = { drop 1 exit } if  ~ Form feed.
+  dup 0x0d = { drop 1 exit } if  ~ Carriage return.
+  drop 0 ;  ~ Anything else is not whitespace.
 
 
 ~ (character -- 1 for true or 0 for false)
@@ -84,6 +179,30 @@
   drop 0 ; ~ Greater than "z".
 
 
+~ Okay, this is the big one, the lexer! Wow.
+~
+~ (-- stack string occupying one or more words)
+: word
+  ~ We allocate an empty string first, so that the result of "key" will
+  ~ conveniently be on the easy-to-find end of it.
+  0
+
+  ~ Skip whitespace.
+  { key dup is-space } { drop } while
+
+  ~ Early exit if it's a zero byte; otherwise it starts the word. This runs
+  ~ once, before the loop, whose body begins with only the string on the stack.
+  dup 0 = { drop dropstring 0 exit } if
+  accumulate-string
+
+  {
+    peek dup is-space { drop exit } if  ~ Whitespace ends the word, unconsumed.
+    dup { drop exit } unless  ~ So does a zero byte.
+    consume accumulate-string  ~ NOTE(review): assumes consume leaves the stack alone.
+  } forever ;
+
+
+
 ~ (character -- value)
 : generalized-digit-value
   ~ We don't have a character-literal syntax; the hex constants here are
@@ -252,91 +371,87 @@
 ~ harder for words that attempt to work with the contents of other words. So
 ~ we give it a name.
 
-s" interpreter-flags-storage" create make-immediate make-visible
+s" interpreter-flags-storage" create make-hidden  ~ Named, but hidden from the interpreter.
 here @ 0 , s" interpreter-flags" variable
 
+
 ~   There's an important bootstrapping concern: If you're loading this
 ~ interpreter into a running Evocation, it's important to not use the wrong
-~ interpreter state value. TODO longer explanation
-
-~ TODO the definition of set-word-immediate would come here; is it needed?
-
-: [ interpreter-flags @ 0x01 invert & interpreter-flags ! ; make-immediate
-latest @ dup hide-entry
-
-: ] interpreter-flags @ 0x01 | interpreter-flags ! ;
-latest @ dup hide-entry
+~ interpreter state value. The way we actually load this is under the log-load
+~ transform, which will be running its alternate versions of these words and
+~ therefore won't have trouble, but for the sake of making it easy to
+~ experiment with modified versions of this file, we do the whole hide-show
+~ dance.
+
+: [ interpreter-flags @ 0x01 invert & interpreter-flags !  ~ Clear bit 0x01, the compile-mode bit.
+  ; make-immediate make-hidden
+latest @  ~ Keep the entry pointer; ; and ;asm below fetch it off the stack.
+~ (pointer to [ entry)
+
+: ] interpreter-flags @ 0x01 | interpreter-flags !  ~ Set bit 0x01, the compile-mode bit.
+  ; make-hidden
+latest @  ~ Keep this entry pointer on the stack as well.
 ~ (pointer to [ entry, pointer to ] entry)
 
-~   It may seem nonsensical to use : to define :, but the bootstrapping stuff
-~ overrides what it does, so it works. The same, of course, goes for all these
-~ other word-defining words.
+~   It may seem nonsensical to use : to define :, but the log-load transform
+~ provides an alternate for it, so it works. The same, of course, goes for all
+~ these other word-defining words. The syntax-centric ones such as : are here
+~ in interpret.e because they need to interact with interpreter-flags, which
+~ isn't defined before this point.
 ~
 ~   If the ] at the end feels backwards, imagine to yourself that everything
 ~ that ISN'T defining a word body is part of an implicit [ ... ] sequence.
 ~ Doing so doesn't really change anything, but may make you happier.
-: : word value@ create dropstring docol , latest @ hide-entry ] ;
+: : word value@ create dropstring docol , make-hidden ] ;  ~ Stays hidden until ; or ;asm runs make-visible.
 
 ~   The counterpart of : is ;.
 : ;
-  ~ See commentary on "literal", below, regarding "lit exit".
+  ~ See commentary on "literal", in dynamic.e, regarding "lit exit".
   lit exit ,
-  latest @ unhide-entry
-  ~ See above regarding [. Since it's an immediate word, we have to go to
+  make-visible
+  ~   See above regarding [. Since it's an immediate word, we have to go to
   ~ extra trouble to compile it as part of ;. Since it's also hidden, we have
   ~ to go behind the interpreter's back to even get its entry pointer.
+  ~
+  ~   Adding insult to injury, the log transform breaks , for unclear reasons,
+  ~ so we do that by hand as well.
   [ over entry-to-execution-token , ]
-  ; make-immediate
-latest @ dup hide-entry
+  ; make-immediate make-hidden latest @  ~ Keep this entry pointer too: ;asm reaches [ with "2 pick".
 ~ (pointer to [ entry, pointer to ] entry, pointer to ; entry)
 
 
 : ;asm
   here @ pack-next 8 packalign here !
-  latest @ dup unhide-entry entry-to-execution-token dup 8 + swap !
+  make-visible  ~ Un-hide the word being finished (formerly "latest @ unhide-entry").
+  latest @ entry-to-execution-token dup 8 + swap !  ~ Store (token address + 8) at the token address.
   ~ See above.
   [ 2 pick entry-to-execution-token , ]
-  ; make-immediate
-latest @ dup hide-entry
+  ; make-immediate make-hidden
+latest @  ~ Leave the ;asm entry pointer on the stack as well.
 ~ (pointer to [ entry, pointer to ], pointer to ;, pointer to ;asm)
 
 
-~   Although we will eventually define the word "'" to give us the symbol of
-~ a word, it will rely on being able to compile a literal. Rather than do lots
-~ of string processing later, we choose to define this word now to avoid
-~ having to look up the word "lit" as part of that.
-~
-~   It may be slightly surprising that the construction "lit lit" works as
-~ expected, given that ie. "lit 5" will break, as will "lit [", so it's worth
-~ explaining why it does.
-~
-~   In most respects "lit" is just an ordinary word, which compilation turns
-~ into a pointer to its codeword. That's what happens to most words, if
-~ they're not a special syntax nor flagged as immediate. It just happens to be
-~ a word that it rarely makes sense to use directly, since its purpose is to
-~ be generated as part of the output when compiling number literals. The
-~ special behavior around number literals is that when "interpret" sees ie.
-~ "5", it first compiles "lit", then appends the numeric value 5 as the
-~ following item in the compiled word body.
-~
-~   The job of "lit" when it's later executed is to push the appropriate value
-~ onto the stack and ensure that it doesn't get executed as code. So, whatever
-~ you put immediately after it gets treated as a value, even if it's a
-~ pointer.
+~   The word "'", often pronounced "tick", quotes the following word, looking
+~ it up and treating it as a constant. In immediate mode, the constant winds
+~ up on the stack; in compile mode it gets compiled.
 ~
-~   The reason that writing "lit 5" in Evocation syntax crashes is that it
-~ gets turned into "lit lit 5" when compiled, which treats the second "lit" as
-~ a value then tries to use "5" as a codeword pointer. So you can use "lit"
-~ to quote whatever you want, it's just if it's already a special syntax you
-~ might need to go behind "interpret"'s back to get it into the compiled
-~ output. In practice, this is likely the only place that needs to happen, but
-~ the mechanism is documented for the sake of whatever comes up in the future.
+~   There are a few possible implementation strategies here. Running as an
+~ immediate word means there's a clear and unambiguous concept of "the
+~ following word", so that's what we do; otherwise we'd have to get clever
+~ about somehow finding out where we were called from. That means we take on
+~ what would otherwise be the interpreter's responsibility, of checking what
+~ mode we're in. Happily, that's easy to do.
 ~
-~ (value -- )
-: literal lit lit , , ;
+~   There's a cyclic dependency where "if" relies on "'", and "'" relies on
+~ "if". Fortunately flow-control is done with alternates by the log-load
+~ transform, so we don't have to worry about it.
+: ' word value@ find dropstring-with-result  ~ (-- whatever find returned for the next word)
+  interpreter-flags @ 1 & { literal } if  ~ Bit 0x01 set means compile mode: compile it as a literal.
+  ; make-immediate
+
 
 
-~ Now the single most important word...
+~ Now the single most important word...
 : interpret
   word
 
@@ -392,21 +507,24 @@ latest @ dup hide-entry
   ~ If it's neither in the dictionary nor a number, just print an error.
   s" No such word: " emitstring value@ emitstring dropstring ;
 
-~ TODO for ease of debugging, this isn't the full implementation, which lets
-~ us exit it to the outer "quit"
-: quit { interpret } forever ;
-
-~   Now we switch into the new interpreter, enabling the three words we'd been
-~ keeping hidden and then calling "quit".
-unhide-entry unhide-entry unhide-entry quit
-
--0x10 newline . newline
-4 5 + . : za 13 12 - . ; za
-~ : ' word value@ find dropstring-with-result
-~  interpreter-flags @ 1 & { literal } if ; make-immediate
-' za . newline
-: piz ' za . newline ; piz
-~ ' interpret forget quit 2 3 * .
-' ' describe ' za describe ' piz describe
-bye
-
+~ ~ TODO for ease of debugging, this isn't the full implementation, which lets
+~ ~ us exit it to the outer "quit"
+~ : quit { interpret } forever ;
+~ 
+~ ~   Now we switch into the new interpreter, enabling the three words we'd been
+~ ~ keeping hidden and then calling "quit".
+~ dup entry-flags@ 0x80 invert & entry-flags!
+~ dup entry-flags@ 0x80 invert & entry-flags!
+~ dup entry-flags@ 0x80 invert & entry-flags!
+~ quit
+~ 
+~ ~ -0x10 newline . newline
+~ compile ~ 4 5 + . : za 13 12 - . ; za
+~ ~ ~ : ' word value@ find dropstring-with-result
+~ ~ ~  interpreter-flags @ 1 & { literal } if ; make-immediate
+~ ~ ' za . newline
+~ ~ : piz ' za . newline ; piz
+~ ~ ~ ' interpret forget quit 2 3 * .
+~ ~ ' ' describe ' za describe ' piz describe
+~ ~ bye
+~ ~