diff options
| author | Irene Knapp <ireneista@irenes.space> | 2026-06-06 11:17:16 -0700 |
|---|---|---|
| committer | Irene Knapp <ireneista@irenes.space> | 2026-06-12 03:07:53 -0700 |
| commit | e7390b9f2f8da8ea0b89705ba8c25cca977a071c (patch) | |
| tree | 9f7111032173d3e0f453feb698cb2516d0d78637 | |
| parent | d1cccb7b2096f9b4418487f1d2a605c0b157441a (diff) | |
first steps towards the hex transform
Quite a lot is implemented, but quite a lot still remains to be done. At any rate, it compiles cleanly, and this seemed like a good point to check things in. Change-Id: I634bb3b2f1b10702cf63cb73c0a995983b5041f2 Force-Push: yes
| -rw-r--r-- | quine.asm | 15 | ||||
| -rw-r--r-- | transform.e | 514 |
2 files changed, 522 insertions, 7 deletions
diff --git a/quine.asm b/quine.asm
index 2df2d3a..f11b435 100644
--- a/quine.asm
+++ b/quine.asm
@@ -1883,7 +1883,7 @@ _start:
   ;;; These will be the permanent homes of these values, though we have
   ;;; copies of them elsewhere while we're still in this routine.
   ;;;
-  mov.qreg.disp32.qreg rdi, control_stack_size + 0x00, rdi ; heap
+  mov.qreg.disp32.qreg rdi, control_stack_size + 0x00, rdi ; log
   mov.qreg.disp32.qreg rdi, control_stack_size + 0x08, rsp ; s0
   mov.qreg.disp32.qreg rdi, control_stack_size + 0x10, rbp ; r0
   mov.qreg.qimm rax, final_word_name
@@ -1891,9 +1891,14 @@ _start:
   lea.qreg.disp32.qreg rax, control_stack_size + 0x28, rdi
   mov.qreg.disp32.qreg rdi, control_stack_size + 0x20, rax ; here
   ;;;
-  ;;; * "heap" is the physical bottom of the heap
-  ;;;   The heap grows upwards in memory, so this is also the logical
+  ;;; * "log" is the physical bottom of the log
+  ;;;   The log grows upwards in memory, so this is also the logical
   ;;;   bottom. This comes from the address mmap() just returned to us.
+  ;;;   The rest of quine.asm refers to the log as the heap. It's not a
+  ;;;   heap, but it used to be called that. The self-hosted version of
+  ;;;   Evocation has the fully revised and reconciled copy of all these
+  ;;;   comments, it just felt like unnecessary tedium to do that here as
+  ;;;   well.
   ;;; * "s0" is the logical bottom of the value stack
   ;;;   The value stack grows downwards in memory, so this is the physical
   ;;;   top of it. This comes from the stack pointer the kernel initialized us
@@ -2018,7 +2023,7 @@ cold_start:
   ;;; This is the only hardcoding we need to do; by building on top of it,
   ;;; we will soon reach a point where the rest of the system can be defined
   ;;; within itself.
-  dq early_heap, litstring, "heap", early_variable
+  dq early_heap, litstring, "log", early_variable
   dq early_s0, litstring, "s0", early_variable
   dq early_r0, litstring, "r0", early_variable
   dq early_latest, litstring, "latest", early_variable
@@ -11966,7 +11971,7 @@ defword boot_source, 0x40
 
   ; This use of bitwise and is okay because they're both either 0 or 1.
   ; We'll have logical and real soon now, be patient... :)
-  dq ": is-in-heap dup heap @ <= swap here @ > & ; "
+  dq ": is-in-heap dup log @ <= swap here @ > & ; "
   ; dq ": unlink-pre-heap-words "
   ; dq " latest @ "
 
diff --git a/transform.e b/transform.e
index 76a192e..474ba13 100644
--- a/transform.e
+++ b/transform.e
@@ -245,12 +245,14 @@
 : transformation-state-saved-latest 8 + ;
 : transformation-state-output-buffer-start 2 8 * + ;
 : transformation-state-user-stack-depth 3 8 * + ;
+: transformation-state-label-scratch 4 8 * + ;
 
 : allocate-transformation-state
-  4 8 * allocate
+  5 8 * allocate
   dup transformation-state-saved-here 0 swap !
   dup transformation-state-saved-latest 0 swap !
   dup transformation-state-output-buffer-start 0 swap !
-  dup transformation-state-user-stack-depth 0 swap ! ;
+  dup transformation-state-user-stack-depth 0 swap !
+  dup transformation-state-label-scratch 0 swap ! ;
 
 allocate-transformation-state s" transformation-state" variable
@@ -2547,3 +2549,511 @@ allocate-transformation-state s" transformation-state" variable
 
       exit } if } forever ;
 
+
+~ Hex transform implementation
+~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~
+~ The following code is all part of implementing the hex transform. For a
+~ conceptual overview, see the top of this file.
+~
+~ The hex transform DOES NOT WORK yet. It's still in development.
+~ TODO update this note when it does work
+
+: hex-tilde-alternate [ ' ~ entry-to-execution-token , ]
+  ; make-immediate
+
+: hex-self-codeword-alternate self-codeword ;
+
+: hex-string-alternate
+  ~ See label-string-alternate for detailed notes on how we wrap s".
+  ~ Essentially, we call the immediate version of it, in the outer context,
+  ~ which uses scratch space in the outer, "real" log.
+  interpreter-flags @
+  ' s" entry-to-execution-token
+  swap-transform-variables
+  [ ' [ entry-to-execution-token , ]
+  execute
+  swap-transform-variables
+  swap interpreter-flags !
+
+  ~ Now we have a string pointer on the stack at transform time. If we're in
+  ~ immediate mode, that's sufficient. If we're in compile mode, output a
+  ~ litstring invocation. Notice also that these are essentially the same
+  ~ responsibilities as we'd have in the label transform.
+  interpreter-flags @ 0x01 & {
+    ~ We look up the inner version of litstring to reference here. This is
+    ~ similar to what the label transform does, except we don't use a label
+    ~ for it.
+    s" litstring" find entry-to-execution-token ,
+    here @ swap packstring 8 packalign here !
+  } if
+  ; make-immediate
+
+: hex-dot-string-alternate
+  ' hex-string-alternate entry-to-execution-token execute
+
+  interpreter-flags @ 0x01 & {
+    ~ We look up the inner version of emitstring, too.
+    s" emitstring" find entry-to-execution-token ,
+  } { emitstring } if-else
+  ; make-immediate
+
+: hex-create-alternate create ;
+
+~ This is a helper called from the patched version of "variable", described
+~ in more detail in hex-colon-alternate, below. It expects to be called after
+~ outputting the entry header for "variable", during the body of the
+~ definition, so that it can output compiled code which will run as part of
+~ the transformed "variable".
+~
+~ The helper accepts a string pointer giving a variable name. The code it
+~ produces checks the name of the variable being defined and, if the two names
+~ match, alters the resulting inner variable to point to the same backing
+~ store as the outer variable of the same name.
+~
+~ In many ways the hex transform is the trippiest one. To wit, there are two
+~ layers of compilation happening here... so don't get confused. When the
+~ helper is called, we're compiling the inner "variable", but "variable"
+~ itself is a word-defining word which also has the task of compilation...
+~ which we're modifying.
+~
+~ (name pointer --)
+: hex-variable-use-outer
+  ~ The actual payload here is that we check whether we're defining the
+  ~ word named by our argument (e.g. "interpreter-flags") and, if so, we make
+  ~ it reference the outer backing store instead of the inner one.
+  ~
+  ~ We want to do this all as references to inner words, which fortunately
+  ~ have been defined by now, but we have to do that a bit indirectly...
+  s" dup" find entry-to-execution-token ,
+  s" litstring" find entry-to-execution-token ,
+  here @ over packstring 8 packalign here !
+  s" stringcmp" find entry-to-execution-token ,
+  s" lit" find entry-to-execution-token , 0 ,
+  s" =" find entry-to-execution-token ,
+
+  ~ Also, we don't have high-level flow control yet, and even if we did,
+  ~ it would be awkward to use it here. So we count the branch by hand. Ah
+  ~ well. As always, remember that with forward branches, the offset to
+  ~ branch by is the first word to be skipped, and is included in the count.
+  s" 0branch" find entry-to-execution-token , 6 8 * ,
+
+  ~ If control reaches here in the generated code, the string matched.
+  s" swap" find entry-to-execution-token ,
+  s" drop" find entry-to-execution-token ,
+
+  ~ To get the value of the outer variable, we just call it. Of course,
+  ~ looking up an outer entry is a pain, but at least it's a pain in a way
+  ~ that should be familiar by now.
+  s" lit" find entry-to-execution-token ,
+  swap-transform-variables
+  find
+  swap-transform-variables
+  entry-to-execution-token execute ,
+
+  s" swap" find entry-to-execution-token , ;
+
+~ By overriding colon, we can special-case the definitions of particular
+~ words. It's very metacircular.
+: hex-colon-alternate
+  word value@
+
+  ~ The word "variable" is itself a word-defining word, and we will
+  ~ special-case its definition to special-case the definitions of particular
+  ~ variables. It's very very metacircular.
+  dup s" variable" stringcmp 0 = {
+    ~ Don't lose track of the layering happening here. The word "variable"
+    ~ is a regular docol word; it's defining a word that's implemented in
+    ~ assembly, but it can use whatever Forthy logic it wants to do so. In
+    ~ this case we're going to have it run a little extra logic, then continue
+    ~ with the rest of its usual definition.
+
+    ~ Before we get to the extra logic, we do want an entry header for
+    ~ "variable" itself, so we do that... This takes care of all of colon's
+    ~ responsibilities except switching to compile mode; we'll do that part
+    ~ after we've output our payload.
+    create dropstring
+    s" docol" find entry-to-execution-token execute ,
+    make-hidden
+
+    ~ There are two variables that we want to point to the outer backing
+    ~ stores, rather than the inner ones. The code for that is a bit
+    ~ repetitive, so we have a helper for it; see above.
+    ~
+    ~ It is worth stopping to contemplate the meaning of sharing these two
+    ~ variables in particular between the inner and outer contexts.
+    ~ Essentially this says that they're both reading the same input stream,
+    ~ and the two copies of the interpreter both share the same state. Thus,
+    ~ trading off responsibility for lexing between inner and outer contexts
+    ~ works just like trading off responsibility between two interpreters
+    ~ when there's no transformation involved, or between a regular
+    ~ interpreter and a transform.
+    ~
+    ~ If we didn't do this, we'd still have to invent some way to control
+    ~ what input the inner context sees, and the concept of "the next
+    ~ character" would become more complex during the transform and require
+    ~ care and attention. Sharing this stuff keeps it simple.
+    s" interpreter-flags" hex-variable-use-outer
+    s" main-input-buffer" hex-variable-use-outer
+    ~ After this, we can return control to the regularly-scheduled
+    ~ "variable", which will do the "create" and all that. That stuff isn't
+    ~ colon's responsibility, so it isn't our responsibility, it'll happen
+    ~ regardless.
+
+    ~ Now we close out colon's responsibilities by switching to compile
+    ~ mode. We return from colon after this. The hex transform will continue
+    ~ by processing the source words that form the regular body of "variable",
+    ~ eventually hitting the matching semicolon. Our friendly tampering is
+    ~ now complete!
+    ' ] entry-to-execution-token execute
+    exit
+  } if
+
+  ~ Now we want to override s". As usual, that's the single most annoying
+  ~ string to quote, so we cheat.
+  dup ' s" entry-to-name stringcmp 0 = {
+    ~ Create the word header. It's a normal docol word, so that much is
+    ~ simple.
+    create dropstring
+    s" docol" find entry-to-execution-token execute ,
+    make-hidden
+
+    ~ This time around we would really rather just always use the alternate,
+    ~ which already untangles the layered nonsense. So we have the payload
+    ~ call the alternate directly, then exit. We could come up with a way to
+    ~ then skip forward in the code under transformation, but that would be
+    ~ complex, and it's unnecessary: We let it keep running, outputting the
+    ~ usual body of s", which we know will never be reached.
+    ' hex-string-alternate entry-to-execution-token ,
+    s" exit" find entry-to-execution-token ,
+
+    ~ As before, finish up colon's responsibilities, then return control to
+    ~ the code under transformation.
+    ' ] entry-to-execution-token execute
+    exit
+  } if
+
+  ~ Same deal for .". Hey, we're getting good at this!
+  dup ' ." entry-to-name stringcmp 0 = {
+    create dropstring
+    s" docol" find entry-to-execution-token execute ,
+    make-hidden
+
+    ' hex-dot-string-alternate entry-to-execution-token ,
+    s" exit" find entry-to-execution-token ,
+
+    ' ] entry-to-execution-token execute
+    exit
+  } if
+
+  ~ We want to suppress the behavior of relink-main-input-buffer-to-stdin
+  ~ entirely. Happily, that's easy. We need to do this because otherwise the
+  ~ transformed code will mess with our outer interpreter!
+  dup s" relink-main-input-buffer-to-stdin" stringcmp 0 = {
+    create dropstring
+    s" docol" find entry-to-execution-token execute ,
+    make-hidden
+
+    ~ Return before doing anything.
+    s" exit" find entry-to-execution-token ,
+
+    ' ] entry-to-execution-token execute
+    exit
+  } if
+
+  ~ If no special case matches, we fall back to just being a regular colon.
+  ~ We already read the word name above, so we have to do the rest of the
+  ~ steps ourselves as well.
+  create dropstring
+  s" docol" find entry-to-execution-token execute ,
+  make-hidden
+  ' ] entry-to-execution-token execute
+  ;
+
+: hex-semicolon-alternate [ ' ; entry-to-execution-token , ]
+  ; make-immediate
+
+: hex-semicolon-assembly-alternate [ ' ;asm entry-to-execution-token , ]
+  ; make-immediate
+
+~ Because docol requires it, we provide a special mini-version of the label
+~ system. We only do L@' and L!', because that's all we need. Unlike with the
+~ label transform, these are NOT real labels; they're restricted similarly to
+~ how they are for the log-load transform.
+~
+~ The intent is that this version only applies for immediate execution, with
+~ compilation using the inner label system instead. NOTE(review): both words
+~ are immediate, so hex-transform-one also runs them in compile mode - confirm.
+: hex-L@'-alternate
+  word dropstring
+  transformation-state transformation-state-label-scratch @
+  ; make-immediate
+
+: hex-L!'-alternate
+  word dropstring
+  transformation-state transformation-state-label-scratch !
+  ; make-immediate
+
+~ We have to provide alternates for the globals that are bootstrapped by
+~ warm-start in normal execution, because code under the hex transform never
+~ gets its own copy of warm-start. They don't have to do anything special,
+~ though, so we can just use the outer ones. Most of these work unmodified;
+~ for "latest" and "here" we invoke the outer copy of the word, but the actual
+~ value it sees is the wrapped, inner one, since we swapped that out
+~ elsewhere.
+: hex-log-alternate log ;
+: hex-s0-alternate s0 ;
+: hex-r0-alternate r0 ;
+: hex-latest-alternate latest ;
+: hex-here-alternate here ;
+
+~ This implements the hex transform for a single word. It is directly
+~ analogous to "interpret", and reading interpret.e may help in understanding
+~ it, though it's meant to still make sense on its own.
+~
+~ The hex transform runs code immediately. Whereas most transforms alter
+~ what the transformed code compiles into, the hex transform alters what it
+~ outputs. It's assumed that the output is a binary file of some sort; the
+~ binary is output as hexadecimal, interspersed with comments describing the
+~ code that produced it, including descriptions of what was executed, along
+~ with any comments from the original code.
+~
+~ The hex transform's alternates take priority over words defined under
+~ the transform when running immediately, but for compilation, words defined
+~ under the transform take precedence. In the event that neither an alternate
+~ nor an entry in the inner dictionary is found, the outer dictionary is
+~ checked; otherwise it's irrelevant.
+~
+~ It expects to be called from "hex-transform", below, which loops.
+~
+~ (-- done)
+: hex-transform-one
+  word
+
+  ~ If no word was returned, exit.
+  dup 0 = { drop 0 exit } if
+
+  ~ The string is on the top of the stack, so to get a pointer to it we get
+  ~ the stack address.
+  ~ (string)
+  value@
+
+  ~ If it's the magic word, end the transformation.
+  dup s" pyrzqxgl" stringcmp 0 = { drop dropstring 1 exit } if
+
+  ~ Check whether it's one of the words we have alternates for, and look up
+  ~ the alternate if so.
+  0 swap
+  ~ (name as stack string, placeholder, name pointer)
+  dup s" ~" stringcmp 0 = { swap drop ' hex-tilde-alternate swap } if
+  dup s" self-codeword" stringcmp 0 = {
+    swap drop ' hex-self-codeword-alternate swap } if
+  ~ It is nontrivial to construct a string with a double-quote in it.
+  dup ' s" entry-to-name stringcmp 0 = {
+    swap drop ' hex-string-alternate swap } if
+  dup ' ." entry-to-name stringcmp 0 = {
+    swap drop ' hex-dot-string-alternate swap } if
+  dup s" create" stringcmp 0 = { swap drop ' hex-create-alternate swap } if
+  dup s" :" stringcmp 0 = { swap drop ' hex-colon-alternate swap } if
+  dup s" ;" stringcmp 0 = { swap drop ' hex-semicolon-alternate swap } if
+  dup s" ;asm" stringcmp 0 = {
+    swap drop ' hex-semicolon-assembly-alternate swap } if
+  dup s" L@'" stringcmp 0 = { swap drop ' hex-L@'-alternate swap } if
+  dup s" L!'" stringcmp 0 = { swap drop ' hex-L!'-alternate swap } if
+  dup s" log" stringcmp 0 = { swap drop ' hex-log-alternate swap } if
+  dup s" s0" stringcmp 0 = { swap drop ' hex-s0-alternate swap } if
+  dup s" r0" stringcmp 0 = { swap drop ' hex-r0-alternate swap } if
+  dup s" latest" stringcmp 0 = { swap drop ' hex-latest-alternate swap } if
+  dup s" here" stringcmp 0 = { swap drop ' hex-here-alternate swap } if
+  ~ (name as stack string, 0 or alternate entry pointer, name pointer)
+
+  find
+  ~ (stack string, 0 or alternate entry pointer, 0 or inner entry pointer)
+
+  interpreter-flags @ 0x01 & {
+    ~ We're in compile mode. An alternate immediate entry has the highest
+    ~ precedence; an inner entry of any kind has second-highest.
+    over {
+      ~ An alternate entry exists; check its flags.
+      over entry-flags@ 0x01 & {
+        ~ It's an immediate entry, so it has precedence, regardless of
+        ~ what's up with the inner entry. Execute it.
+        drop dropstring-with-result
+        entry-to-execution-token execute
+        0 exit
+      } {
+        ~ The alternate is not immediate, so check if there's an inner entry.
+        dup {
+          ~ There is also an inner entry. Check its flags.
+          dup entry-flags@ 0x01 & {
+            ~ The inner entry is immediate, so it has precedence. Execute it.
+            swap drop dropstring-with-result
+            entry-to-execution-token execute
+            0 exit
+          } {
+            ~ The inner entry is not immediate, so we compile the alternate.
+            ~ NOTE(review): the rule stated above ranks an inner entry of any
+            ~ kind over a non-immediate alternate - confirm which is intended.
+            ~ Watch this space closely for correctness issues, it's a rare
+            ~ codepath.
+            drop dropstring-with-result
+            entry-to-execution-token ,
+            0 exit
+          } if-else
+        } {
+          ~ There's no inner entry. Compile the alternate.
+          ~
+          ~ This path, too, is rare and should get close scrutiny for
+          ~ correctness.
+          drop dropstring-with-result
+          entry-to-execution-token ,
+          0 exit
+        } if-else
+      } if-else
+    } {
+      ~ There is no alternate entry; check for an inner entry.
+      dup {
+        ~ An inner entry exists; check its flags.
+        dup entry-flags@ 0x01 & {
+          ~ It's an immediate entry. Execute it.
+          swap drop dropstring-with-result
+          entry-to-execution-token execute
+          0 exit
+        } {
+          ~ It's not an immediate entry. Compile it.
+          swap drop dropstring-with-result
+          entry-to-execution-token ,
+          0 exit
+        } if-else
+      } if
+      ~ If we got here, there's no inner or alternate entry; fall through.
+    } if-else
+  } {
+    ~ We're in immediate mode. An alternate entry of any kind has precedence.
+    over {
+      ~ There's an alternate entry. Execute it.
+      drop dropstring-with-result
+      entry-to-execution-token execute
+      0 exit
+    } {
+      ~ There's no alternate entry. Check for an inner entry.
+      dup {
+        ~ An inner entry exists. Execute it.
+        swap drop dropstring-with-result
+        entry-to-execution-token execute
+        0 exit
+      } if
+      ~ If we got here, there's no inner or alternate; fall through.
+    } if-else
+  } if-else
+  drop drop
+  ~ (stack string)
+
+  ~ As a final fallback, we also check the outer dictionary, for immediate
+  ~ use only. This will allow things like assembly words to work.
+  value@
+  swap-transform-variables
+  find
+  swap-transform-variables
+  dup {
+    ~ There's an outer entry; check the mode.
+    interpreter-flags @ 0x01 & {
+      ~ We're in compile mode; check the outer entry's flags.
+      dup entry-flags@ 0x01 & {
+        ~ It's an immediate word; execute it.
+        dropstring-with-result
+        entry-to-execution-token
+        execute
+        0 exit
+      } {
+        ~ It's not an immediate word. Pretend it doesn't exist, and fall
+        ~ through.
+        drop
+      } if-else
+    } {
+      ~ We're in immediate mode. Execute the outer entry.
+      dropstring-with-result
+      entry-to-execution-token
+      execute
+      0 exit
+    } if-else
+  } {
+    ~ There's no outer entry. Fall through.
+    drop
+  } if-else
+  ~ (stack string)
+
+  ~ Check whether it's a number literal.
+  value@ read-integer 0 = {
+    ~ It's a number.
+    ~
+    ~ (name as stack string, integer value)
+    dropstring-with-result
+    ~ (integer value)
+
+    interpreter-flags @ 0x01 & {
+      ~ TODO this probably needs a lit alternate (!)
+      ' lit entry-to-execution-token , ,
+      0 exit
+    } if
+
+    0 exit
+  } if
+  ~ (stack string)
+
+  ." No such word: " value@ emitstring newline dropstring 0 ;
+
+
+~ This implements the hex transform for all words in a region given as
+~ an input string. It is directly analogous to "quit", in interpret.e, but is
+~ more complex.
+~
+~ (output buffer start, output point, input string pointer
+~  -- output buffer start, output point)
+: hex-transform
+  main-input-buffer dup push-input-buffer
+  ~ TODO the arguments for this seem to be backwards from the documentation
+  swap attach-string-to-input-buffer
+
+  ~ Save the old values of "here" and "latest", and set the initial values
+  ~ of the internal ones. These values need to persist across iterations,
+  ~ since client code will make its own updates to them and then rely on those
+  ~ updates having taken effect. So we do the swap just once, here outside the
+  ~ loop, and set it back when the loop ends.
+  ~
+  ~ We also take this opportunity to initialize the output-buffer-start and
+  ~ user-stack-depth fields of transformation-state.
+  here @ transformation-state transformation-state-saved-here !
+  latest @ transformation-state transformation-state-saved-latest !
+  over transformation-state transformation-state-output-buffer-start !
+  0 transformation-state transformation-state-user-stack-depth !
+  here !
+  0 latest !
+  ~ Now the stack has nothing of ours on it, so client code can do its thing.
+
+  ~ It's important that the stack has nothing of ours on it that persists
+  ~ across iterations, so that client code can add and remove stuff there as
+  ~ it sees fit.
+  { hex-transform-one
+    ~ (done)
+
+    ~ When the loop is done, get the real values of "here" and "latest"
+    ~ back. The internal "here" is also the output point, and will become our
+    ~ return value. The internal "latest" is discarded.
+    { here @
+      transformation-state transformation-state-saved-here @ here !
+      transformation-state transformation-state-saved-latest @ latest !
+      ~ (output point)
+
+      ~ We don't use transformation-state outside of this invocation, so for
+      ~ tidiness we zero it. NOTE(review): label-scratch is not zeroed here.
+      0 transformation-state transformation-state-saved-here !
+      0 transformation-state transformation-state-saved-latest !
+      0 transformation-state transformation-state-output-buffer-start !
+      0 transformation-state transformation-state-user-stack-depth !
+
+      ~ Also put the input source back how it was.
+      main-input-buffer pop-input-buffer
+
+      exit } if } forever ;
+