summary refs log tree commit diff
path: root/transform.e
diff options
context:
space:
mode:
author Irene Knapp <ireneista@irenes.space> 2026-05-15 20:51:13 -0700
committer Irene Knapp <ireneista@irenes.space> 2026-05-15 21:58:20 -0700
commit 447921598269bae3e1406470015c37f23943cc74 (patch)
tree 464583448ae74f83af637cefded39d8b3d431aff /transform.e
parent 1af849abc637c4890285c4d3cc08d99faae2ea41 (diff)
make the label transformation work all the way, no crashing
the code doesn't quite run yet, that'll be a future CL

Force-Push: yes
Change-Id: I71e6a45127c1fc37906d902e36142c17afef2a21
Diffstat (limited to 'transform.e')
-rw-r--r-- transform.e 346
1 files changed, 245 insertions, 101 deletions
diff --git a/transform.e b/transform.e
index 1c325df..154f477 100644
--- a/transform.e
+++ b/transform.e
@@ -1,3 +1,73 @@
+~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~ ~~ Code transformation facility ~~
+~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~
+~ TODO explain what problem this is solving and why
+~
+~   The label transform operates on code that compiles itself, and ensures
+~ that the result of the compilation is suitable to be included in an
+~ executable binary. To achieve this, it makes several changes to the
+~ semantics of that code. The transform relies on the label facility, and
+~ expects to run from within label-loop.
+~
+~   The most fundamental change is that the label transform separates words
+~ that run in compile mode from words that run immediately.  There is no
+~ distinction made between words running in immediate mode, and words declared
+~ as immediate. Immediate words are looked up and executed based on their
+~ "real", currently-executing definitions. Compiled words, including
+~ literals, are looked up via the label facility.
+~
+~   Since the label facility is able to resolve forward references, there is
+~ no hard requirement that words be topologically sorted, but forward
+~ references should still be kept to a minimum, since that's a significant
+~ difference from un-transformed code that could easily become confusing.
+~
+~   Compilation words do make extensive reference to the global variables
+~ "here" and "latest". In particular, flow-control words such as if-else
+~ expect the log to have recent compilation outputs on it, and to be able to
+~ mutate them in-place. In order to make this work, we provide temporary
+~ values of these two variables which point to the location of the output
+~ buffer. This allows pointer resolution to work correctly without additional
+~ effort, but notice that the buffer's address will differ from the address
+~ the resulting program loads itself at. There's no simple way to avoid this
+~ concern, since the variables must point to one of those addresses or the
+~ other, not both.
+~
+~   We resolve the issue by running our own, alternate versions of the words
+~ "create", ":", ";", and ";asm" which use the label facility to compute the
+~ addresses that will be needed at runtime. These alternates run instead of
+~ the normal versions of these words. The code being compiled is responsible
+~ for not doing anything else that would rely on "here" and "latest" matching
+~ their runtime addresses, though it is otherwise allowed to modify and rely
+~ on them in all the usual ways. The alternate versions are defined in this
+~ file as their own words, "Lcreate", "L:", "L;", and "L;asm". The alternates
+~ rely on various labels, all of which must be defined elsewhere:
+~
+~   * origin
+~   * docol
+~   * exit
+~   * :
+~   * ;
+~   * ;asm
+~
+~   Note that these alternates are applied via a purely lexical
+~ transformation: when a word would be looked up in the dictionary to
+~ interpret, first check if it's one of these. That means the transformation
+~ won't apply to indirect callers of these words, nor to tick-quotes of them.
+~ The code being compiled is responsible for not doing either of those things.
+~
+~   Notably, the transformation uses the same "interpreter-flags" variable as
+~ the rest of Evocation. There's no need to keep it separate like there is
+~ with the other variables. This makes it easy to change modes.
+~
+~   All of these limitations result in the compiled code being, in effect,
+~ written in a dialect which is like Evocation, but more restricted. This is
+~ acceptable, because the label transform is intended for compiling code that
+~ is an early part of Evocation itself, and the necessary code has all been
+~ written to follow these restrictions.
+
+
+~ TODO all this buffer stuff should be in its own file
 ~ (buffer size -- buffer address)
 : read-to-buffer
   dup allocate dup dup
@@ -39,6 +109,7 @@
           { ~ If not, leave the word start alone.
             3roll pack8 } if-else } if-else } forever ;
 
+
 ~   In logical terms, this modifies an input buffer metadata structure
 ~ in-place to push a new, zeroed one into the start of the linked list formed
 ~ through the next-source field.
@@ -56,8 +127,8 @@
   2dup swap 6 8 * memcopy
   ~ (original metadata pointer, new metadata pointer)
   swap dup zero-input-buffer-metadata
-  input-buffer-next-source !
-  ;
+  input-buffer-next-source ! ;
+
 
 ~   This does the inverse of push-input-buffer. In the event that the
 ~ next-source field is null, it zeroes the buffer.
@@ -73,58 +144,109 @@
   dup { 6 8 * memcopy }
       { drop zero-input-buffer-metadata } if-else ;
 
-: L:
-  ' L' entry-to-execution-token execute
-  { ' set-label entry-to-execution-token , }
-  { set-label } if-else
-  ; make-immediate
-  ~ TODO probably needs to do more
 
 : transform-state-saved-here ;
 : transform-state-saved-latest 8 + ;
+: transform-state-output-buffer-start 2 8 * + ;
 : allocate-transform-state
-  2 8 * allocate
+  3 8 * allocate
   dup transform-state-saved-here 0 swap !
-  dup transform-state-saved-latest 0 swap ! ;
+  dup transform-state-saved-latest 0 swap !
+  dup transform-state-output-buffer-start 0 swap ! ;
 allocate-transform-state s" transform-state" variable
 
-~   The label transform operates on code that compiles itself, and ensures
-~ that the result of the compilation is suitable to be included in an
-~ executable binary. To achieve this, it makes several changes to the
-~ semantics of that code. The transform relies on the label facility, and
-~ expects to run from within label-loop.
-~
-~   The most fundamental change is that the label transform separates words
-~ that run in compile mode from words that run immediately.  There is no
-~ distinction made between words running in immediate mode, and words declared
-~ as immediate. Immediate words are looked up and executed based on their
-~ "real", currently-executing definitions. Compiled words, including
-~ literals, are looked up via the label facility.
-~
-~   Since the label facility is able to resolve forward references, there is
-~ no hard requirement that words be topologically sorted, but forward
-~ references should still be kept to a minimum, since that's a significant
-~ difference from un-transformed code that could easily become confusing.
-~
-~   Compilation words do make extensive reference to the global variables
-~ "here" and "latest". In particular, flow-control words such as if-else
-~ expect the log to have recent compilation outputs on it, and to be able to
-~ mutate them in-place. In order to make this work, we provide temporary
-~ values of these two variables which point to the location of the output
-~ buffer. This allows pointer resolution to work correctly without additional
-~ effort, but notice that the buffer's address will differ from the address
-~ the resulting program loads itself at. There's no simple way to avoid this
-~ concern, since the variables must point to one of those addresses or the
-~ other, not both.
+
+~   Exchanges "here" and "latest" with the values saved in transform-state.
+~ The label facility needs the real, non-wrapped ones; call again to restore.
+: swap-transform-variables
+  here @ transform-state transform-state-saved-here @
+  here ! transform-state transform-state-saved-here !
+  latest @ transform-state transform-state-saved-latest @
+  latest ! transform-state transform-state-saved-latest ! ;
+
+~ (address within the output buffer -- address at generated binary's runtime)
+: transform-offset
+  ~ Null stays null; otherwise rebase from the buffer start onto "origin".
+  dup { transform-state transform-state-output-buffer-start @ -
+        swap-transform-variables L@' origin swap-transform-variables
+        + } if ;
+
+
+~   This is the alternate version of "create" for use with the label
+~ transform. Per the note below, it differs from the regular "create" only
+~ in passing "latest" through transform-offset before packing it. Reading
+~ "create" in interpret.e first is strongly recommended.
+: Lcreate
+  dup stringlen 1 + dup 3unroll
+  here @ 10 + 3unroll memmove
+  here @
+  ~   This value of "latest" is going into the generated output, so call
+  ~ transform-offset on it first.
+  latest @ transform-offset pack64
+  0 pack8
+  0 pack8
+  +
+  8 packalign
+  here @ latest !
+  here ! ;
+
+
+~   This is the alternate version of ":" for use with the label transform. Its
+~ code is the same as the regular ":" except as noted below. It is likely
+~ to be extremely useful to read and understand ":" in interpret.e before
+~ attempting to understand "L:".
+: L:
+  ~ This calls "Lcreate" instead of "create".
+  word value@ Lcreate dropstring
+
+  ~ This looks up "docol" by label and adds the "origin" label to it.
+  swap-transform-variables
+  L@' docol
+  L@' origin
+  swap-transform-variables
+  + ,
+
+  latest @ hide-entry ] ;
+
+
+~   This is the alternate version of ";" for use with the label transform. Its
+~ code is the same as the regular ";" except as noted below. It is likely
+~ to be extremely useful to read and understand ";" in interpret.e before
+~ attempting to understand "L;".
+: L;
+  ~ This looks up "exit" by label and adds the "origin" label to it.
+  swap-transform-variables
+  L@' exit L@' origin
+  swap-transform-variables
+  + ,
+
+  latest @ unhide-entry
+
+  ~   Since [ is an immediate word, we have to go to extra trouble to compile
+  ~ it as part of ;.
+  [ ' [ entry-to-execution-token , ]
+  ; make-immediate
+
+
+~   This is the alternate version of ";asm" for use with the label transform.
+~ Its code is the same as the regular ";asm" except as noted below. It is
+~ likely to be extremely useful to read and understand ";asm" in interpret.e
+~ before attempting to understand "L;asm".
+: L;asm
+  here @ pack-next 8 packalign here !
+  latest @ dup unhide-entry entry-to-execution-token dup 8 + swap !
+
+  ~   Since [ is an immediate word, we have to go to extra trouble to compile
+  ~ it as part of ;asm.
+  [ ' [ entry-to-execution-token , ]
+  ; make-immediate
+
+
+~   This implements the label transform for a single word. It is directly
+~ analogous to "interpret", and reading interpret.e may help in understanding
+~ it, though it's meant to still make sense on its own.
 ~
-~   We resolve the issue by running our own, alternate versions of the words
-~ "create", ":", ";", and ";asm" which use the label facility to compute the
-~ addresses that will be needed at runtime. These alternates run instead of
-~ the normal versions of these words. The code being compiled is responsible
-~ for not doing anything else that would rely on "here" and "latest" matching
-~ their runtime addresses, though it is otherwise allowed to modify and rely
-~ on them in all the usual ways. The alternate versions are defined in this
-~ file as their own words, "Lcreate", "L:", "L;", and "L;asm".
+~ It expects to be called from "transform", below, which loops.
 ~
 ~ (-- done)
 : transform-one
@@ -141,31 +263,80 @@ allocate-transform-state s" transform-state" variable
   ~ If it's the magic word, end the transformation.
   dup s" pyrzqxgl" stringcmp 0 = { drop dropstring 1 exit } if
 
-  transform-state transform-state-saved-latest @ swap find-in
+  ~   Check whether it's one of the words we have alternates for, and look up
+  ~ the alternate if so.
+  dup 0 swap
+  ~ (name as stack string, name pointer, placeholder, name pointer)
+  dup s" create" stringcmp 0 = { swap drop ' Lcreate swap } if
+  dup s" :" stringcmp 0 = { swap drop ' L: swap } if
+  dup s" ;" stringcmp 0 = { swap drop ' L; swap } if
+  dup s" ;asm" stringcmp 0 = { swap drop ' L;asm swap } if
+  drop swap
+  ~ (name as stack string, 0 or alternate entry pointer, name pointer)
 
-  ~ Check whether the word was found in the dictionary.
-  dup 0 != {
-    ~ If the word is in the dictionary, check what mode we're in, then...
-    dropstring-with-result
-    ~ (entry pointer)
-    interpreter-flags @ 0x01 & {
-      ~ ... if we're in compile mode, there's still a chance it's an immediate
-      ~ word, in which case we fall through to interpret mode...
-      dup entry-flags@ 1 & 0 =
+  ~   If an alternate was found, the alternate will be used in immediate mode.
+  ~ If not, we look up the word in the regular, non-transformed dictionary
+  ~ and use that for immediate mode.
+  over { dup
+         transform-state transform-state-saved-latest @ swap find-in
+         3roll drop swap } unless
+  ~ (name as stack string, immediate entry pointer, name pointer)
 
-      ~ ... but it's a regular word, so append it to the heap.
-      ~ TODO why is ; being treated as a regular word
-      { entry-to-execution-token , 0 exit } if
-    } if
+  ~   For compile mode, we need to look the word up in the output buffer. We
+  ~ can't easily traverse the next-entry pointers in the output buffer's
+  ~ dictionary, so we use the label.
+  ~
+  ~   Labels point to codewords (because that's what "Lcreate" does), so we
+  ~ have to convert it to get the entry pointer. Since we don't know the
+  ~ word's name statically, this is a rare scenario where we can't use the
+  ~ abbreviated label syntax, but that's easy enough.
+  ~
+  ~   We do have to be careful of one thing: On the first run, the label may
+  ~ be zero!
+  swap-transform-variables
+  intern-label use-label
+  swap-transform-variables
+  dup { execution-token-to-entry } if
+  ~ (name as stack string, immediate entry pointer, compiled entry pointer)
+
+  ~   In regular "interpret", we would check whether we found the word before
+  ~ checking the mode. However, we have three different places words could
+  ~ come from, so that's not a simple notion. So, we check the mode first.
+  interpreter-flags @ 0x01 & {
+    ~   If we're in compile mode, there's still a chance it's an immediate
+    ~ word. First check whether we have an immediate entry, then if so, check
+    ~ that entry's flags. Notice that this means the generated code can't
+    ~ override an immediate word with a non-immediate word of the same name.
+    over dup { entry-flags@ 0x01 & not } if
 
-    ~ ... if we're in interpret mode, or the word is immediate, run it.
-    entry-to-execution-token execute 0 exit
+    ~   Either there was no immediate entry, or the immediate entry wasn't
+    ~ flagged as an immediate word. So we treat this as a compilation, which
+    ~ means we append a word to the heap. Specifically, of course, we use the
+    ~ compiled entry to do that.
+    { swap drop dropstring-with-result
+      entry-to-execution-token ,
+      0 exit } if
   } if
 
-  ~ If it's not in the dictionary, check whether it's a decimal number.
+  ~   If we got here, one of three things is true: We're in interpret mode;
+  ~ the word is immediate; or no word was found.  Regardless, we don't need
+  ~ the compiled entry pointer anymore, so drop it.
   drop
-  ~ As before, we get the stack address and use it as a string pointer.
-  ~ (string)
+  ~ (name as stack string, immediate entry pointer)
+
+  ~   If the immediate entry pointer is non-zero, run it.
+  dup {
+    dropstring-with-result entry-to-execution-token execute
+    0 exit
+  } if
+
+  ~   If we're still here, it wasn't in the dictionary. Also, we don't need
+  ~ the immediate entry pointer, either.
+  drop
+  ~ (name as stack string)
+
+  ~   If it's not in the dictionary, check whether it's an integer literal. As
+  ~ before, we get the stack address and use it as a string pointer.
   value@ read-integer 0 = {
     ~ It's a number.
     interpreter-flags @ 0x01 & {
@@ -175,7 +346,10 @@ allocate-transform-state s" transform-state" variable
       ~ require dealing with what happens if it's not found.
       ~ TODO this is wrong
       dropstring-with-result
-      [ ' lit entry-to-execution-token literal ]
+
+      ~ We look up "lit" as a label.
+      swap-transform-variables L@' lit swap-transform-variables
+      transform-offset
       , ,
       0 exit
     } if
@@ -191,28 +365,10 @@ allocate-transform-state s" transform-state" variable
   s" No such word: " emitstring value@ emitstring dropstring 0 ;
 
 
-~ ." input " main-input-buffer dup .hex64 newline dup hexdump @ dup .hex64 newline bye
-~ 1024 read-to-buffer
-~ foo bar baz biff
-~ pyrzqxgl
-~ stackhex dup hexdump emitstring bye
+~   This implements the label transform for all words in a region given as an
+~ input string. It is directly analogous to "quit", in interpret.e, but is far
+~ more complex.
 ~
-~ : breakza
-~   ." original" newline
-~   main-input-buffer dup 6 8 * hexdump-from
-~   dup push-input-buffer
-~   ." updated original" newline
-~   dup 6 8 * hexdump-from
-~   ." copy" newline
-~   dup input-buffer-next-source @ 6 8 * hexdump-from
-~   newline
-~   stackhex
-~   dup pop-input-buffer
-~   ." copied back" newline
-~   6 8 * hexdump-from
-~   stackhex
-~   bye ;
-
 ~ (output point, input string pointer -- output point)
 : transform
   main-input-buffer dup push-input-buffer
@@ -226,6 +382,7 @@ allocate-transform-state s" transform-state" variable
   ~ loop, and set it back when the loop ends.
   here @ transform-state transform-state-saved-here !
   latest @ transform-state transform-state-saved-latest !
+  dup transform-state transform-state-output-buffer-start !
   here !
   0 latest !
   ~ Now the stack has nothing of ours on it, so client code can do its thing.
@@ -248,23 +405,10 @@ allocate-transform-state s" transform-state" variable
       ~ invocation, for tidiness we zero it out.
       0 transform-state transform-state-saved-here !
       0 transform-state transform-state-saved-latest !
+      0 transform-state transform-state-output-buffer-start !
 
       ~  Also put the input source back how it was.
       main-input-buffer pop-input-buffer
 
       exit } if } forever ;
 
-~ 1024 allocate dup
-~ ." compilation output buffer" newline dup hexdump
-~ transform
-~ : za ." ZA" 12 13 - . ;
-~ : ' word value@ find dropstring-with-result
-~   interpreter-flags @ 1 & { literal } if ; make-immediate
-~ ~ ' za . newline
-~ pyrzqxgl
-~ ." back back back " here @ .hex64 newline
-~ ~ ." stack after " stackhex
-~ ~ 2dup swap hexdump-between
-~ ~ : piz ." PIZ" ' za . newline ; piz
-~ bye
-