From ff9b791acf12688caec27e8ca883f3eefac8891d Mon Sep 17 00:00:00 2001 From: Irene Knapp Date: Thu, 21 May 2026 22:39:21 -0700 Subject: whew, okay, lots of interpret stuff implemented now not fully tried though Force-Push: yes Change-Id: I19b39f2b982fde66863c710b6d458c3bd12bdf4a --- dynamic.e | 35 +++++++ evoke.e | 2 +- interpret.e | 322 +++++++++++++++++++++++++++++++++++++++++------------------- transform.e | 32 ++++-- 4 files changed, 280 insertions(+), 111 deletions(-) diff --git a/dynamic.e b/dynamic.e index 0adc5c6..2edcc13 100644 --- a/dynamic.e +++ b/dynamic.e @@ -399,3 +399,38 @@ here ! ; + +~ Although we will eventually define the word "'" to give us the symbol of +~ a word, it will rely on being able to compile a literal. Rather than do lots +~ of string processing later, we choose to define this word now to avoid +~ having to look up the word "lit" as part of that. +~ +~ It may be slightly surprising that the construction "lit lit" works as +~ expected, given that ie. "lit 5" will break, as will "lit [", so it's worth +~ explaining why it does. +~ +~ In most respects "lit" is just an ordinary word, which compilation turns +~ into a pointer to its codeword. That's what happens to most words, if +~ they're not a special syntax nor flagged as immediate. It just happens to be +~ a word that it rarely makes sense to use directly, since its purpose is to +~ be generated as part of the output when compiling number literals. The +~ special behavior around number literals is that when "interpret" sees ie. +~ "5", it first compiles "lit", then appends the numeric value 5 as the +~ following item in the compiled word body. +~ +~ The job of "lit" when it's later executed is to push the appropriate value +~ onto the stack and ensure that it doesn't get executed as code. So, whatever +~ you put immediately after it gets treated as a value, even if it's a +~ pointer. 
+~ +~ The reason that writing "lit 5" in Evocation syntax crashes is that it +~ gets turned into "lit lit 5" when compiled, which treats the second "lit" as +~ a value then tries to use "5" as a codeword pointer. So you can use "lit" +~ to quote whatever you want; it's just that if it's already a special syntax, you +~ might need to go behind "interpret"'s back to get it into the compiled +~ output. In practice, this is likely the only place that needs to happen, but +~ the mechanism is documented for the sake of whatever comes up in the future. +~ +~ (value -- ) Compiles "lit" followed by the value, so that the compiled code pushes the value when it later runs. +: literal lit lit , , ; + diff --git a/evoke.e b/evoke.e index 54fe3fb..fed48b1 100644 --- a/evoke.e +++ b/evoke.e @@ -3,7 +3,7 @@ ~ cat core.e linux.e output.e amd64.e execution-support.e log-load.e; \ ~ echo pyrzqxgl 262144 read-to-buffer; \ ~ cat core.e linux.e output.e amd64.e execution-support.e log-load.e \ -~ dynamic.e input.e ; \ +~ dynamic.e input.e interpret.e ; \ ~ echo 0 sys-exit pyrzqxgl; \ ~ cat evoke.e) \ ~ | ./quine > evoke && chmod 755 evoke && ./evoke diff --git a/interpret.e b/interpret.e index d2bf10f..ee0a52b 100644 --- a/interpret.e +++ b/interpret.e @@ -38,37 +38,132 @@ ~ It is primarily concerned with managing the contents of an area of memory ~ we call the "log"; see log-load.e for more detail on terminology. -: hide-entry dup entry-flags@ 0x80 | entry-flags! ; - -: unhide-entry dup entry-flags@ 0x80 invert & entry-flags! ; - - -~ TODO -~ unroll-past-string 0000001000018db8 -~ swap-past-string 0000001000018ea0 -~ dropstring 0000001000018ee8 -~ dropstring-with-result 0000001000018f80 -~ accumulate-string 0000001000018fc8 -~ is-space 0000001000018b00 -~ word 00000010000194a0 - -~ The word "'", often pronounced "tick", quotes the following word, looking -~ it up and treating it as a constant. In immediate mode, the constant winds -~ up on the stack; in compile mode it gets compiled. -~ -~ There are a few possible implementation strategies here.
Running as an -~ immediate word means there's a clear and unambiguous concept of "the -~ following word", so that's what we do; otherwise we'd have to get clever -~ about somehow finding out where we were called from. That means we take on -~ what would otherwise be the interpreter's responsibility, of checking what -~ mode we're in. Happily, that's easy to do. +~ As a convenience for "word", we have some facilities for working with +~ stack-allocated strings. Yeah, trippy concept. Also, it would be a +~ buffer overrun hazard if we were worried about that, which is why this +~ is no longer common practice in C. ~ -~ There's a cyclic dependency where "if" relies on "'", and "'" relies on -~ "if". Fortunately both of them are treated as alternates by the log-load -~ transform, so we don't have to worry about it. -: ' word value@ find dropstring-with-result - interpreter-flags @ 1 & { literal } if - ; make-immediate +~ The most important of these is accumulate-string, but we need some +~ smaller pieces first. + +~ (stack string..., items to be left alone..., item to be unrolled, +~ number of items above string that participate in the unroll +~ -- item that was unrolled, stack string..., items left alone) Rotates the top item down to just beneath the stack string. +: unroll-past-string + 8 * + ~ (string, other items, top item, byte offset to string start) + dup value@ + + ~ We have two copies of the offset present, in addition to the stuff we want + ~ to rotate. So, the actual string starts two words on... We could have + ~ adjusted the offset instead, but we'll want the unmodified offset again + ~ later. + 16 + + ~ (string, other items, top item, offset to start, string pointer) + stringlen + ~ Same reasoning as in accumulate-string (see below). + 1 + 8 align-size + ~ (string, other items, top item, offset to start, string length w/ padding) + + 8 /% swap drop + ~ (string, other items, top item, number of words to unroll) NOTE(review): the count handed to "unroll" has to cover the items above the string as well as the string's own words; confirm that the saved byte offset is added into the padded length before this division. + unroll ; + + + +~ (stack string..., item to be swapped +~ -- item that was swapped, stack string...)
+: swap-past-string 1 unroll-past-string ; + + +~ (stack string... --) Discards the string by setting the stack pointer just past the string's padded length. +: dropstring + value@ stringlen + ~ Same reasoning as in accumulate-string (see below). + 1 + 8 align-size value@ + + ~ At the time we fetched the stack pointer, there was an extra value atop + ~ it, so we have to add one more word. + 8 + value! ; + + +~ (stack string..., item to be kept +~ -- item that was kept) Moves the item beneath the string with swap-past-string, then drops the string. +: dropstring-with-result swap-past-string dropstring ; + + +~ (stack string-so-far..., new character byte +~ -- updated string-so-far) +: accumulate-string + ~ Compute the address of the final word of the string. + ~ + ~ It's a little bit difficult to get the start pointer right, since all + ~ our intermediate products affect what we get from value@, so we compute + ~ that just once, here at the beginning. + value@ 8 + + ~ (string so far, new character byte, pointer to start of string) + dup stringlen + ~ There are two concerns here that overlap: First, we always want at least + ~ one word. Recall that a length of zero bytes won't receive any alignment + ~ padding because it's already divisible by 8. Second, the result of + ~ stringlen doesn't include the null byte, which might be in a word by + ~ itself that needs to be counted. We can address both of them by + ~ unconditionally adding 1 to the length before applying alignment. + 1 + + ~ Pad the length for alignment. + 8 align-size + ~ We want an offset from the first word of the string to the last word of + ~ the string, so we subtract one word from the length. + 8 - + + ~ (string so far, new character byte, address of final word) + + ~ Examine the final word of the string, leaving other stuff undisturbed. + ~ Work low-to-high to figure out where to store the new byte, taking the + ~ first one that's available. + ~ (string so far, new character byte, address of final word) + dup @ dup 0x00000000000000FF & { 3roll | swap !
exit } unless + ~ (string so far, new character byte, address of final word, old value) + + ~ This next part is repeated several times, changing only the offsets, for + ~ bytes 1 through 6; bytes 0 (above) and 7 (way below) are different. + ~ (string so far, new character byte, address of final word, old value) + dup 0x000000000000FF00 & { 3roll 0x0000000000000100 * | swap ! exit } unless + dup 0x0000000000FF0000 & { 3roll 0x0000000000010000 * | swap ! exit } unless + dup 0x00000000FF000000 & { 3roll 0x0000000001000000 * | swap ! exit } unless + dup 0x000000FF00000000 & { 3roll 0x0000000100000000 * | swap ! exit } unless + dup 0x0000FF0000000000 & { 3roll 0x0000010000000000 * | swap ! exit } unless + dup 0x00FF000000000000 & { 3roll 0x0001000000000000 * | swap ! exit } unless + + ~ The top byte of the final word is always zero (or else stringlen + ~ wouldn't have called it the final word), so we don't need to check it, we + ~ can just use it. + ~ + ~ We need to put the new value in the top byte, which will mean we have no + ~ null terminator, so we also need to start a new word. + ~ + ~ There is a fiddly order-dependency here: unroll-past-string relies on + ~ being able to find the null terminator, which won't work if we've gotten + ~ rid of it. Also, calling it will move all the earlier words, including + ~ the one we intend to write to, which will invalidate any pointer we're + ~ keeping at that point. There are a few ways to resolve this; what we do is + ~ put the new terminator in place first, manually nudge the pointer, and + ~ then write the new value. + ~ (string so far, new character byte, address of final word, old value) + 3roll 0x0100000000000000 * | swap + ~ (string so far, new value, address of final word) + 0 3 unroll-past-string + ~ (new null terminator, string so far, new value, invalid address) + 8 - + ~ (new null terminator, string so far, new value, updated address) + !
; + + +~ (byte -- boolean) True for space, tab, line feed, vertical tab, form feed, and carriage return. +: is-space + dup 0x20 = { drop 1 exit } if + dup 0x09 = { drop 1 exit } if + dup 0x0a = { drop 1 exit } if + dup 0x0b = { drop 1 exit } if + dup 0x0c = { drop 1 exit } if + dup 0x0d = { drop 1 exit } if + drop 0 ; ~ (character -- 1 for true or 0 for false) @@ -84,6 +179,30 @@ drop 0 ; ~ Greater than "z". +~ Okay, this is the big one, the lexer! Wow. +~ +~ (-- stack string occupying multiple words) +: word + ~ We allocate an empty string first, so that the result of "key" will + ~ conveniently be on the easy-to-find end of it. + 0 + + ~ Skip whitespace. + { key dup is-space } { drop } while + + ~ Early exit if it's a zero byte. NOTE(review): the loop body below calls accumulate-string twice, once for the character already on the stack and once after "consume", yet the top of the loop expects a character to be on the stack again. Whether that holds depends on the stack effects of "peek" and "consume", which are defined in input.e; confirm. + { + dup 0 = { drop dropstring 0 exit } if + + accumulate-string + + peek dup is-space { drop exit } if + dup { drop exit } unless + consume accumulate-string + } forever ; + + + ~ (character -- value) : generalized-digit-value ~ We don't have a character-literal syntax; the hex constants here are @@ -252,91 +371,87 @@ ~ harder for words that attempt to work with the contents of other words. So ~ we give it a name. -s" interpreter-flags-storage" create make-immediate make-visible +s" interpreter-flags-storage" create make-hidden here @ 0 , s" interpreter-flags" variable + ~ There's an important bootstrapping concern: If you're loading this ~ interpreter into a running Evocation, it's important to not use the wrong -~ interpreter state value. TODO longer explanation - -~ TODO the definition of set-word-immediate would come here; is it needed? - -: [ interpreter-flags @ 0x01 invert & interpreter-flags ! ; make-immediate -latest @ dup hide-entry - -: ] interpreter-flags @ 0x01 | interpreter-flags ! ; -latest @ dup hide-entry +~ interpreter state value.
The way we actually load this is under the log-load +~ transform, which will be running its alternate versions of these words and +~ therefore won't have trouble, but for the sake of making it easy to +~ experiment with modified versions of this file, we do the whole hide-show +~ dance. "[" clears the 0x01 bit of interpreter-flags and "]" sets it; "'" further down tests that same bit to decide whether to compile its result. + +: [ interpreter-flags @ 0x01 invert & interpreter-flags ! + ; make-immediate make-hidden +latest @ +~ (pointer to [ entry) + +: ] interpreter-flags @ 0x01 | interpreter-flags ! + ; make-hidden +latest @ ~ (pointer to [ entry, pointer to ] entry) -~ It may seem nonsensical to use : to define :, but the bootstrapping stuff -~ overrides what it does, so it works. The same, of course, goes for all these -~ other word-defining words. +~ It may seem nonsensical to use : to define :, but the log-load transform +~ provides an alternate for it, so it works. The same, of course, goes for all +~ these other word-defining words. The syntax-centric ones such as : are here +~ in interpret.e because they need to interact with interpreter-flags, which +~ isn't defined before this point. ~ ~ If the ] at the end feels backwards, imagine to yourself that everything ~ that ISN'T defining a word body is part of an implicit [ ... ] sequence. ~ Doing so doesn't really change anything, but may make you happier. -: : word value@ create dropstring docol , latest @ hide-entry ] ; +: : word value@ create dropstring docol , make-hidden ] ; ~ The counterpart of : is ;. : ; - ~ See commentary on "literal", below, regarding "lit exit". + ~ See commentary on "literal", in dynamic.e, regarding "lit exit". lit exit , - latest @ unhide-entry - ~ See above regarding [. Since it's an immediate word, we have to go to + make-visible + ~ See above regarding [. Since it's an immediate word, we have to go to ~ extra trouble to compile it as part of ;. Since it's also hidden, we have ~ to go behind the interpreter's back to even get its entry pointer.
+ ~ + ~ Adding insult to injury, the log transform breaks , for unclear reasons, + ~ so we do that by hand as well. The ;asm word below reaches the same [ entry with "2 pick", which only works if the pointer to this word's own entry is pushed right after the definition, as is done for the other three words. [ over entry-to-execution-token , ] - ; make-immediate -latest @ dup hide-entry + ; make-immediate make-hidden latest @ ~ (pointer to [ entry, pointer to ] entry, pointer to ; entry) : ;asm here @ pack-next 8 packalign here ! - latest @ dup unhide-entry entry-to-execution-token dup 8 + swap ! + make-visible + latest @ entry-to-execution-token dup 8 + swap ! ~ See above. [ 2 pick entry-to-execution-token , ] - ; make-immediate -latest @ dup hide-entry + ; make-immediate make-hidden +latest @ ~ (pointer to [ entry, pointer to ], pointer to ;, pointer to ;asm) -~ Although we will eventually define the word "'" to give us the symbol of -~ a word, it will rely on being able to compile a literal. Rather than do lots -~ of string processing later, we choose to define this word now to avoid -~ having to look up the word "lit" as part of that. -~ -~ It may be slightly surprising that the construction "lit lit" works as -~ expected, given that ie. "lit 5" will break, as will "lit [", so it's worth -~ explaining why it does. -~ -~ In most respects "lit" is just an ordinary word, which compilation turns -~ into a pointer to its codeword. That's what happens to most words, if -~ they're not a special syntax nor flagged as immediate. It just happens to be -~ a word that it rarely makes sense to use directly, since its purpose is to -~ be generated as part of the output when compiling number literals. The -~ special behavior around number literals is that when "interpret" sees ie. -~ "5", it first compiles "lit", then appends the numeric value 5 as the -~ following item in the compiled word body. -~ -~ The job of "lit" when it's later executed is to push the appropriate value -~ onto the stack and ensure that it doesn't get executed as code. So, whatever -~ you put immediately after it gets treated as a value, even if it's a -~ pointer.
+~ The word "'", often pronounced "tick", quotes the following word, looking +~ it up and treating it as a constant. In immediate mode, the constant winds +~ up on the stack; in compile mode it gets compiled. ~ -~ The reason that writing "lit 5" in Evocation syntax crashes is that it -~ gets turned into "lit lit 5" when compiled, which treats the second "lit" as -~ a value then tries to use "5" as a codeword pointer. So you can use "lit" -~ to quote whatever you want, it's just if it's already a special syntax you -~ might need to go behind "interpret"'s back to get it into the compiled -~ output. In practice, this is likely the only place that needs to happen, but -~ the mechanism is documented for the sake of whatever comes up in the future. +~ There are a few possible implementation strategies here. Running as an +~ immediate word means there's a clear and unambiguous concept of "the +~ following word", so that's what we do; otherwise we'd have to get clever +~ about somehow finding out where we were called from. That means we take on +~ what would otherwise be the interpreter's responsibility, of checking what +~ mode we're in. Happily, that's easy to do. ~ -~ (value -- ) -: literal lit lit , , ; +~ There's a cyclic dependency where "if" relies on "'", and "'" relies on +~ "if". Fortunately flow-control is done with alternates by the log-load +~ transform, so we don't have to worry about it. The name is read with "word" and looked up with "find"; the string is then dropped while keeping the lookup result, which is handed to "literal" only when the low bit of interpreter-flags is set. +: ' word value@ find dropstring-with-result + interpreter-flags @ 1 & { literal } if + ; make-immediate + -~ Now the single most important word... +~ ~ Now the single most important word... : interpret word @@ -392,21 +507,24 @@ latest @ dup hide-entry ~ If it's neither in the dictionary nor a number, just print an error.
s" No such word: " emitstring value@ emitstring dropstring ; -~ TODO for ease of debugging, this isn't the full implementation, which lets -~ us exit it to the outer "quit" -: quit { interpret } forever ; - -~ Now we switch into the new interpreter, enabling the three words we'd been -~ keeping hidden and then calling "quit". -unhide-entry unhide-entry unhide-entry quit - --0x10 newline . newline -4 5 + . : za 13 12 - . ; za -~ : ' word value@ find dropstring-with-result -~ interpreter-flags @ 1 & { literal } if ; make-immediate -' za . newline -: piz ' za . newline ; piz -~ ' interpret forget quit 2 3 * . -' ' describe ' za describe ' piz describe -bye - +~ ~ TODO for ease of debugging, this isn't the full implementation, which lets +~ ~ us exit it to the outer "quit" +~ : quit { interpret } forever ; +~ +~ ~ Now we switch into the new interpreter, enabling the three words we'd been +~ ~ keeping hidden and then calling "quit". +~ dup entry-flags@ 0x80 invert & entry-flags! +~ dup entry-flags@ 0x80 invert & entry-flags! +~ dup entry-flags@ 0x80 invert & entry-flags! +~ quit +~ +~ ~ -0x10 newline . newline +~ compile ~ 4 5 + . : za 13 12 - . ; za +~ ~ ~ : ' word value@ find dropstring-with-result +~ ~ ~ interpreter-flags @ 1 & { literal } if ; make-immediate +~ ~ ' za . newline +~ ~ : piz ' za . newline ; piz +~ ~ ~ ' interpret forget quit 2 3 * . +~ ~ ' ' describe ' za describe ' piz describe +~ ~ bye +~ ~ diff --git a/transform.e b/transform.e index 6b2b64b..1502ee4 100644 --- a/transform.e +++ b/transform.e @@ -726,6 +726,7 @@ allocate-transform-state s" transform-state" variable dup s" self-codeword" stringcmp 0 = { drop 0 exit } if dup s" variable" stringcmp 0 = { drop -2 exit } if dup s" keyword" stringcmp 0 = { drop -1 exit } if + dup s" literal" stringcmp 0 = { drop -1 exit } if ~ From input.e. 
dup s" buffer-physical-start" stringcmp 0 = { drop 0 exit } if @@ -740,6 +741,27 @@ allocate-transform-state s" transform-state" variable dup s" allocate-input-buffer" stringcmp 0 = { drop 0 exit } if dup s" attach-string-to-input-buffer" stringcmp 0 = { drop -2 exit } if + ~ From interpret.e. + dup s" unroll-past-string" stringcmp 0 = { drop 0 exit } if + dup s" swap-past-string" stringcmp 0 = { drop 0 exit } if + ~ The following are deliberate omissions: dropstring, + ~ dropstring-with-result, accumulate-string. + dup s" is-space" stringcmp 0 = { drop 0 exit } if + dup s" is-alphanumeric" stringcmp 0 = { drop 0 exit } if + ~ The following is a deliberate omission: word. + dup s" generalized-digit-value" stringcmp 0 = { drop 0 exit } if + ~ The following are deliberate omissions: decode-generalized-digit, + ~ read-base-unsigned, read-integer-unsigned, read-integer, read-decimal. + dup s" interpreter-flags" stringcmp 0 = { drop 1 exit } if + dup s" [" stringcmp 0 = { drop 0 exit } if + dup s" ]" stringcmp 0 = { drop 0 exit } if + dup s" :" stringcmp 0 = { drop 0 exit } if + dup s" ;" stringcmp 0 = { drop 0 exit } if + dup s" ;asm" stringcmp 0 = { drop 0 exit } if + dup s" '" stringcmp 0 = { drop 1 exit } if + ~ The following is a deliberate omission: interpret. + dup s" quit" stringcmp 0 = { drop 0 exit } if + ~ Created by warm-start in execution.e. dup s" log" stringcmp 0 = { drop 1 exit } if dup s" s0" stringcmp 0 = { drop 1 exit } if @@ -748,11 +770,6 @@ allocate-transform-state s" transform-state" variable dup s" here" stringcmp 0 = { drop 1 exit } if ~ Word not provided statically, but used during the log-load routine anyway. 
- dup s" [" stringcmp 0 = { drop 0 exit } if - dup s" ]" stringcmp 0 = { drop 0 exit } if - dup s" :" stringcmp 0 = { drop 0 exit } if - dup s" ;" stringcmp 0 = { drop 0 exit } if - dup s" ;asm" stringcmp 0 = { drop 0 exit } if dup s" L@'" stringcmp 0 = { drop 1 exit } if dup s" L!'" stringcmp 0 = { drop -1 exit } if @@ -1621,7 +1638,7 @@ allocate-transform-state s" transform-state" variable ~ which is the same thing that would happen if we didn't have an alternate ~ at all. interpreter-flags @ 0x01 & { - s" ," log-load-compile-dynamic-word + s" create" log-load-compile-dynamic-word } { log-load-roll-log-address @@ -1774,7 +1791,7 @@ allocate-transform-state s" transform-state" variable ~ We consumed the value, so we apply a delta. -1 transform-apply-stack-delta - log-load-roll-log-address + log-load-unroll-log-address } if-else ; make-immediate @@ -2159,7 +2176,6 @@ allocate-transform-state s" transform-state" variable swap drop ' log-load-left-square-brace-alternate swap } if dup s" ]" stringcmp 0 = { swap drop ' log-load-right-square-brace-alternate swap } if - dup s" '" stringcmp 0 = { swap drop ' log-load-tick-alternate swap } if dup s" ," stringcmp 0 = { swap drop ' log-load-comma-alternate swap } if dup s" variable" stringcmp 0 = { swap drop ' log-load-variable-alternate swap } if -- cgit 1.4.1