core.e


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215

~ ~~~~~~~~~~~~~~~~~~~~~~~~~
~ ~~ Core Forth features ~~
~ ~~~~~~~~~~~~~~~~~~~~~~~~~
~
~   This file provides extremely fundamental functionality which is a
~ necessary component of any Forth dialect, including Evocation. It is
~ included statically as part of any generated executable, and a second copy
~ of it is later copied into the log when that executable runs. Therefore, it
~ is written to obey the constraints of both the label transform, and the
~ log-load transform; see transform.e for more details on that.


~ Execution support
~ ~~~~~~~~~~~~~~~~~
~
~   There's two words, docol and exit, which are essential parts of the
~ indirect-threaded execution model. It would be tempting to put them in
~ execution.e, so they'd be closer to the explanation of what they do, but
~ we need two copies of them (just like we do of every other word in this
~ file), one statically compiled and one in the log. So, they're here, because
~ that's significantly simpler, even though it creates a little bit of extra
~ work for the label transform.


~   Docol is the "interpreter" that is responsible for the semantics of words
~ written as Forth high-level code. The name is said to be short for
~ "do colon", because word definitions begin with a colon.
~
~   Concretely, when interpreting, it saves rsi (the "instruction pointer") to
~ the control stack, takes the address of the codeword from rax and increments
~ it in-place to form the new instruction pointer, and copies that to rsi.
~
~   Having then done this, we're now in the state that normal execution
~ expects, so docol ends by it using "next" to begin the callee's execution,
~ kicking off a nested call.
~
~   At runtime, invoke docol directly and it will return the value that should
~ be used as a codeword. When compiling to a binary executable, the
~ transformation facility needs to reference it directly, and for that
~ purpose, the label "docol-codeword-value" points to the correct place.
~
~ Registers in:
~
~ * rsi is the caller's instruction pointer
~ * rbp is the control stack pointer
~ * rax is the address of the callee's codeword
~
~ Registers out:
~
~ * rsi is the callee's instruction pointer
~ * rbp is the control stack pointer

~   We need this one snippet of assembly code that's just on the log raw,
~ without a word header, because of its role in making words work. We define
~ a label pointing to it.
~
~   This is the only use of labels in core.e. There's no easy way to
~ remove it. We can't use immediate computation based on "here" because
~ the label transform uses the host address space, not the target address
~ space. The transformation facility's support for labels in transformed
~ code was added just for this.
~
~   It may seem as if we could have used some extra space in the middle of
~ the proper docol word, which is defined just below. That would work fine
~ with the label transform, which can do forward references, but the log-load
~ transform's label support is special-cased to ONLY do this, and it will only
~ work with a backward reference.
here @
dup L!' docol-codeword-value
:rsi pack-pushcontrol
8 :rax add-reg64-imm8
:rax :rsi mov-reg64-reg64
pack-next
8 packalign
here !

: docol
  [ here @
    ~ Evaluated as a word, docol is a constant which returns a pointer.
    L@' docol-codeword-value :rax mov-reg64-imm64
    :rax push-reg64
    here ! ] ;asm

~   This is the mechanism to "return" from a word interpreted by docol.
~ We pop the control stack, and then, since this is threaded execution, we
~ do the next thing the caller wants to do, by inlining "next".
: exit
  [ here @
    :rsi pack-popcontrol
    here ! ] ;asm


~ Stack manipulation routines
~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
~
~   We start with the three traditional stack operations, swap drop and
~ roll. Sorry to fans of the name "ROT"; we were an HP calculator kid. It'll
~ always be roll to us. Anyway, we do a couple other operations too. Since
~ our goal right now is just to bootstrap the heap, we keep this short and
~ sweet.
~
~   This code has been written to satisfy the assumptions needed by the label
~ transform, described in-depth in transform.e. This means that it is, in
~ effect, written in a restricted dialect of Evocation rather than in the full
~ language.
~
~   There is definitely plenty of optimization that could be done.

: swap
  [ here @
    :rax pop-reg64
    :rbx pop-reg64
    :rax push-reg64
    :rbx push-reg64
    here ! ] ;asm

: drop
  [ here @
    :rax pop-reg64
    here ! ] ;asm

: 2drop
  [ here @
    :rax pop-reg64
    :rax pop-reg64
    here ! ] ;asm

~   Rotates "up" (pops its parameter, n; nth item then becomes current item).
~
~   We implement this the high-performance way, with rep movsq, aka the
~ instruction that exists to optimize C's memcpy(). The details of setting
~ that up are complex; see below.
: roll
  [ here @

    ~   Pop our parameter. The rep instruction takes rcx as its count, so we
    ~ reduce copying by using it to hold our count, as well.
    :rcx pop-reg64

    ~   We have n - 1 items to slide, so decrement rcx. For the purpose of
    ~ counting how many repetitions will happen, it's one-based. This is because
    ~ the rep instruction performs a single movsq, then decrements rcx, then
    ~ stops if rcx is zero.
    :rcx dec-reg64

    ~   Retrieve the nth item, for later. For this purpose we're thinking in
    ~  zero-based terms, so we do this after already having decremented rcx.
    :rsp :rcx 8 :rbx mov-reg64-indexed-reg64

    ~   The source address for movsq is rsi and the destination is rdi; we can
    ~ use rdi as we wish, but rsi is our Forth "instruction pointer", so we must
    ~ save and restore it. Doing so alters rsp, so we have to adjust the address
    ~ calculations by eight bytes as compared to the expressions above, but
    ~ happily we can use the disp8 field to do that. We'd be using disp8 anyway
    ~ because it's helpful.
    :rsi push-reg64

    ~   Now we set up parameters for the memory-sliding operation. We have
    ~ n - 1 items to copy, moving the range rsp through rsp + (n-2)*8 onto the
    ~ range rsp + 8 through rsp + (n-1)*8. That's with the value of rsp as it
    ~ exists at this moment (it's going to change soon).
    ~
    ~   We're sliding them upwards in memory, so we start at the high end so
    ~ that we're always moving into a location that doesn't have anything
    ~ precious. We use lea as a convenient way to do the stack math.
    ~
    ~   When rcx is 1, we want rsp + 8.
    :rsp :rcx 8 :rsi lea-reg64-indexed-reg64
    ~   When rcx is 1, we want rsp + 16.
    :rsp :rcx 8 8 :rdi lea-reg64-disp8-indexed-reg64
    ~
    ~   Using rcx = 1 is the most convenient example to use for figuring out the
    ~ arithmetic. It's a linear relationship, so as long as we get the 8-byte
    ~ stride correct, we just need to pick a single point and verify that our
    ~ math is right for that point, and it'll be right for any value of rcx.

    ~   Another of our Forth conventions is that the DF flag should be kept at
    ~ zero, which directs string instruction to increment rsi. Here, however,
    ~ because our source and destination ranges overlap, we need to start at the
    ~ high end, which means we need it to decrement. So we set DF to one, and
    ~ we'll clear it after.
    std
    rep-movs64

    ~ Set everything back.
    cld
    :rsi pop-reg64

    ~ There is now an extra item at the low end of the stack (the top) that
    ~ needs to go away, and coincidentally we have a value in rbx that needs to
    ~ be in that spot. Rather than doing a drop and push, we overwrite it, to
    ~ save a little work.
    :rbx :rsp mov-indirect-reg64-reg64

    ~ All done, wow! What a mouthful.
    here ! ] ;asm

~   Rotates "down" (pops its parameter, n; current item then becomes nth
~ item).
~
~   Jonesforth calls this "-roll" and we could do that, but honestly the name
~ unroll sounds nicer and it's only a single character longer. You might say
~ it rolls off the tongue better.
~
~   We implement this the high-performance way, with rep movsq, aka the
~ instruction that exists to optimize C's memcpy(). The details of setting
~ that up are complex; see below.
: unroll
  [ here @

    ~   Pop our parameter. The rep instruction takes rcx as its count, so we
    ~ reduce copying by using it to hold our count, as well.
    :rcx pop-reg64

    ~ We have n - 1 items to slide, so decrement rcx. Also, save a copy of it in
    ~ rdx after doing that, for later.
    :rcx dec-reg64
    :rcx :rdx mov-reg64-reg64

    ~   Retrieve the 0th item, for later.
    :rsp :rbx mov-reg64-indirect-reg64

    ~   Now we set up parameters for the memory-sliding operation. We have
    ~ n - 1 items to copy, moving the range rsp + 8 through rsp + (n-1)*8 onto
    ~ the range rsp through rsp + (n-2)*8. That's with the value of rsp as it
    ~ exists at this moment (it's going to change soon).
    ~
    ~   We're sliding them downwards in memory, so we start at the low end so
    ~ that we're always moving into a location that doesn't have anything
    ~ precious. We use lea as a convenient way to do the stack math.
    ~
    ~   As with roll, we need to save rsi and adjust those rsp calculations
    ~ accordingly.
    :rsi push-reg64

    ~   Regardless of rcx, we want rsp + 16.
    :rsp 16 :rsi lea-reg64-disp8-reg64
    ~   Regardless of rcx, we want rsp + 8.
    :rsp 8 :rdi lea-reg64-disp8-reg64

    ~   With roll, we were starting at the high end. Here, we start at the low
    ~ end, which means we need rsi to increment after each repetition. That's
    ~ what it does when the DF flag is clear, and another of our Forth
    ~ conventions is to keep it clear normally. So, we don't have to touch DF!
    ~ Yay!
    rep-movs64

    ~ Restore our original rsi.
    :rsi pop-reg64

    ~ There is now an extra item in the middle of the stack, at the high end of
    ~ the sliding we did, that needs to be overwritten with our value in rbx.
    ~ Since we destructively updated our count in rcx, we saved a copy of the
    ~ count in rdx, and we use that to find the right address.
    ~
    ~ When the original count was n, we want rsp + (n-1)*8, so we saved rdx
    ~ after decrementing rcx, above.
    :rbx :rsp :rdx 8 mov-indexed-reg64-reg64

    ~ All done, wow! What a mouthful.
    here ! ] ;asm

~ Rotates "up" (third item becomes current item)
: 3roll
  [ here @
    :rax pop-reg64
    :rbx pop-reg64
    :rcx pop-reg64
    :rbx push-reg64
    :rax push-reg64
    :rcx push-reg64
    here ! ] ;asm

~ Rotates "down" (current item becomes third item)
: 3unroll
  [ here @
    :rax pop-reg64
    :rbx pop-reg64
    :rcx pop-reg64
    :rax push-reg64
    :rcx push-reg64
    :rbx push-reg64
    here ! ] ;asm

: dup
  [ here @
    :rax pop-reg64
    :rax push-reg64
    :rax push-reg64
    here ! ] ;asm

: 2dup
  [ here @
    :rax pop-reg64
    :rbx pop-reg64
    :rbx push-reg64
    :rax push-reg64
    :rbx push-reg64
    :rax push-reg64
    here ! ] ;asm


~ Arithmetic routines
~ ~~~~~~~~~~~~~~~~~~~
~
~   No surprises here. Again, since our goal is to bootstrap the heap, we
~ keep it short. Also again, this is nowhere near optimal.

: +
  [ here @
    :rbx pop-reg64
    :rax pop-reg64
    :rbx :rax add-reg64-reg64
    :rax push-reg64
    here ! ] ;asm

: -
  [ here @
    :rbx pop-reg64
    :rax pop-reg64
    :rbx :rax sub-reg64-reg64
    :rax push-reg64
    here ! ] ;asm

: *
  [ here @
    :rax pop-reg64
    :rbx pop-reg64
    :rbx mul-reg64
    :rax push-reg64
    here ! ] ;asm

~   Jonesforth calls this "/mod" but % is widely recognized and has no special
~ Forth significance.
: /%
  [ here @
    :rdx :rdx xor-reg64-reg64    ~ rdx is the high bits of the input; zero it
    :rbx pop-reg64
    :rax pop-reg64
    :rbx divmod-reg64
    :rdx push-reg64              ~ remainder
    :rax push-reg64              ~ quotient
    here ! ] ;asm


~ Comparison routines
~ ~~~~~~~~~~~~~~~~~~~
~
~   So. This is subtle. These comparison routines all have the same structure.
~ Notice that = and != are commutative, whereas the others are not, so
~ consider > as the archetypical one when reasoning about correctness. To
~ test if A > B, we do cmp A, B, which sets the flags the same way as
~ subtracting B from A does. The mnemonic names of the condition codes are
~ based on the assumption you do it in this order.
~
~   We want to treat the top of the stack as the first operand, so we
~ carefully pop in the appropriate order.
~
~   There are both signed and unsigned variants of the condition codes. We
~ provide both; the unmarked comparisons such as > and <= are signed, while
~ the marked ones such as >unsigned and <=unsigned are of course unsigned.

~   Jonesforth calls this "="; most languages would call it "==".
: =
  [ here @
    :rax pop-reg64
    :rbx pop-reg64
    :rbx :rax cmp-reg64-reg64
    :cc-equal :al set-reg8-cc
    0x01 :rax and-reg64-imm8
    :rax push-reg64
    here ! ] ;asm

~   Jonesforth calls this "<>", but even most modern SQL dialects recognize
~ C's legacy and allow "!=" these days. As someone who learned C in childhood,
~ this is not actually a hard call for us.
: !=
  [ here @
    :rax pop-reg64
    :rbx pop-reg64
    :rbx :rax cmp-reg64-reg64
    :cc-not-equal :al set-reg8-cc
    0x01 :rax and-reg64-imm8
    :rax push-reg64
    here ! ] ;asm

~   Is the top of the stack greater than the second item in the stack, when
~ both are treated as signed?
: >
  [ here @
    :rax pop-reg64
    :rbx pop-reg64
    :rbx :rax cmp-reg64-reg64
    :cc-greater :al set-reg8-cc
    0x01 :rax and-reg64-imm8
    :rax push-reg64
    here ! ] ;asm

~   Is the top of the stack less than the second item in the stack, when both
~ are treated as signed?
: <
  [ here @
    :rax pop-reg64
    :rbx pop-reg64
    :rbx :rax cmp-reg64-reg64
    :cc-less :al set-reg8-cc
    0x01 :rax and-reg64-imm8
    :rax push-reg64
    here ! ] ;asm

~   Is the top of the stack greater than or equal to the second item in the
~ stack, when both are treated as signed?
: >=
  [ here @
    :rax pop-reg64
    :rbx pop-reg64
    :rbx :rax cmp-reg64-reg64
    :cc-greater-equal :al set-reg8-cc
    0x01 :rax and-reg64-imm8
    :rax push-reg64
    here ! ] ;asm

~   Is the top of the stack less than or equal to the second item in the
~ stack, when both are treated as signed?
: <=
  [ here @
    :rax pop-reg64
    :rbx pop-reg64
    :rbx :rax cmp-reg64-reg64
    :cc-less-equal :al set-reg8-cc
    0x01 :rax and-reg64-imm8
    :rax push-reg64
    here ! ] ;asm

~   Is the top of the stack greater than the second item in the stack, when
~ both are treated as unsigned?
: >unsigned
  [ here @
    :rax pop-reg64
    :rbx pop-reg64
    :rbx :rax cmp-reg64-reg64
    :cc-above :al set-reg8-cc
    0x01 :rax and-reg64-imm8
    :rax push-reg64
    here ! ] ;asm

~   Is the top of the stack less than the second item in the stack, when
~ both are treated as unsigned?
: <unsigned
  [ here @
    :rax pop-reg64
    :rbx pop-reg64
    :rbx :rax cmp-reg64-reg64
    :cc-below :al set-reg8-cc
    0x01 :rax and-reg64-imm8
    :rax push-reg64
    here ! ] ;asm

~   Is the top of the stack greater than or equal to the second item in the
~ stack, when both are treated as unsigned?
: >=unsigned
  [ here @
    :rax pop-reg64
    :rbx pop-reg64
    :rbx :rax cmp-reg64-reg64
    :cc-above-equal :al set-reg8-cc
    0x01 :rax and-reg64-imm8
    :rax push-reg64
    here ! ] ;asm

~   Is the top of the stack less than or equal to the second item in the
~ stack, when both are treated as unsigned?
: <=unsigned
  [ here @
    :rax pop-reg64
    :rbx pop-reg64
    :rbx :rax cmp-reg64-reg64
    :cc-below-equal :al set-reg8-cc
    0x01 :rax and-reg64-imm8
    :rax push-reg64
    here ! ] ;asm


~ Bitwise routines
~ ~~~~~~~~~~~~~~~~
~

: &
  [ here @
    :rbx pop-reg64
    :rax pop-reg64
    :rbx :rax and-reg64-reg64
    :rax push-reg64
    here ! ] ;asm

: |
  [ here @
    :rbx pop-reg64
    :rax pop-reg64
    :rbx :rax or-reg64-reg64
    :rax push-reg64
    here ! ] ;asm

: xor
  [ here @
    :rbx pop-reg64
    :rax pop-reg64
    :rbx :rax xor-reg64-reg64
    :rax push-reg64
    here ! ] ;asm

~   The HP overloads the name "not", so we follow the Forth convention.
: invert
  [ here @
    :rax pop-reg64
    :rax not-reg64
    :rax push-reg64
    here ! ] ;asm


~ Literal routines
~ ~~~~~~~~~~~~~~~~
~
~   These words are rarely used directly from user code, but are emitted by
~ the interpreter, in compile mode. They allow literal values of various kinds
~ to exist inline as part of compiled code, which would otherwise be only an
~ array of codeword pointers. They do this, in their various ways, by pushing
~ any appropriate value on the stack and making sure execution skips over the
~ bytes used to represent it.

: lit
  [ here @
    lods64
    :rax push-reg64
    here ! ] ;asm

: litstring
  [ here @
    ~   The string immediately follows the codeword in memory, so rsi is
    ~ already pointing to it. That address will be our returned result, so we
    ~ push it to the stack.
    :rsi push-reg64

    ~   Now we need to skip over the string, so that rsi will be valid for the
    ~ next Forth word. To do that, we're going to do a string operation with
    ~ scas8, which takes rdi as the address to look at. This means scas8 is
    ~ treating its operand as analogous to the destination operand of movs*.
    :rsi :rdi mov-reg64-reg64
    ~   We want to compare for equality with zero; scas8 looks in al for the
    ~ other half of the comparison, so we clear rax.
    :rax :rax xor-reg64-reg64
    ~   Counterintuitively, we do need to pass a count. We pass -1, which will
    ~ always work.
    :rcx :rcx xor-reg64-reg64
    :rcx not-reg64
    ~   The DF flag is zero per our Forth execution-model convention, which
    ~ means scas8 increments rdi after each iteration. This is what we want,
    ~ so there's no need to mess with it.
    ~
    ~   We also need to worry about ZF, but fortunately that "not" will make
    ~ sure it's clear.
    ~
    ~   So, we're ready; do it!
    repnz-scas8
    ~   The scas8 completes, incrementing rdi, before the repnz checks its
    ~ condition. So, rdi is now pointing immediately after the terminating
    ~ null byte. Of course, we want this in rsi for Forth's purposes.
    :rdi :rsi mov-reg64-reg64

    ~   Finally, we need to align rsi to the next word boundary.
    7 :rsi add-reg64-imm8
    7 invert :rsi and-reg64-imm8

    here ! ] ;asm


~ Memory access routines
~ ~~~~~~~~~~~~~~~~~~~~~~
~
~   We go with Forth names for this stuff. The HP's names for memory and
~ storage operations heavily leverage the fact they have an object system
~ with type tags and so on; we want to stay close to the bytes.

~ Address on the top of the stack, value in the second position
: !
  [ here @
    :rbx pop-reg64
    :rax pop-reg64
    :rax :rbx mov-indirect-reg64-reg64
    here ! ] ;asm

: @
  [ here @
    :rax pop-reg64
    :rax :rax mov-reg64-indirect-reg64
    :rax push-reg64
    here ! ] ;asm

~   I might have put the parameters the other way round, but this is what
~ Jonesforth does and it seems reasonable enough.
~
~ (value, address --)
: +!
  [ here @
    :rbx pop-reg64
    :rax pop-reg64
    :rax :rbx add-indirect-reg64-reg64
    here ! ] ;asm

~ (value, address --)
: -!
  [ here @
    :rbx pop-reg64
    :rax pop-reg64
    :rax :rbx sub-indirect-reg64-reg64
    here ! ] ;asm

~   Jonesforth calls this "c!" but we categorically reject the use of letters
~ that are meant to indicate byte sizes. It's unfriendly to newcomers. There's
~ some risk that this name will be confused for meaning "store a value of 8",
~ but that would not be a useful task, so hopefully it'll be okay.
: 8!
  [ here @
    :rbx pop-reg64
    :rax pop-reg64
    :al :rbx mov-indirect-reg64-reg8
    here ! ] ;asm

: 8@
  [ here @
    :rbx pop-reg64
    :rax :rax xor-reg64-reg64
    :rbx :al mov-reg8-indirect-reg64
    :rax push-reg64
    here ! ] ;asm

: 16!
  [ here @
    :rbx pop-reg64
    :rax pop-reg64
    :ax :rbx mov-indirect-reg64-reg16
    here ! ] ;asm

: 16@
  [ here @
    :rbx pop-reg64
    :rax :rax xor-reg64-reg64
    :rbx :ax mov-reg16-indirect-reg64
    :rax push-reg64
    here ! ] ;asm

: 32!
  [ here @
    :rbx pop-reg64
    :rax pop-reg64
    :eax :rbx mov-indirect-reg64-reg32
    here ! ] ;asm

: 32@
  [ here @
    :rbx pop-reg64
    :rax :rax xor-reg64-reg64
    :rbx :eax mov-reg32-indirect-reg64
    :rax push-reg64
    here ! ] ;asm

~   Before we get too deep into it, we also define a few reflection routines
~ that retrieve, or set, the address of either of the two stacks.
~
~   The result, or the new value, is on the top of the data stack. That's the
~ same as how it works for any other word, but given the metacircular nature
~ of these ones it's easy to get confused about that...
~
~   Jonesforth calls this RSP!, which looks as if it's meant to be an Intel
~ register name but is actually short for return stack pointer. There is no
~ register by that name, it's a Forth-provided abstraction. That's super
~ confusing, plus as discussed above we call it the control stack not the
~ return stack, so we call the words...
: control!
  [ here @
    :rbp pop-reg64
    here ! ] ;asm
: control@
  [ here @
    :rbp push-reg64
    here ! ] ;asm

~   Jonesforth calls this DSP!, for data stack pointer. Again, there's no
~ Intel register by that name, and we call it the value stack, so...
: value!
  [ here @
    ~   Per Intel's description of POP this reads from the old location, and
    ~ there is no increment applied to the resulting value. See, the
    ~ description says it increments the register then overwrites it, in that
    ~ order.
    :rsp pop-reg64
    here ! ] ;asm
: value@
  [ here @
    ~ Per Intel's description of PUSH this pushes the old value.
    :rsp push-reg64
    here ! ] ;asm

~   Copy one non-overlapping block of memory over another. For the overlapping
~ logic, see roll and unroll. This always copies at byte granularity, for ease
~ of implementation.
~
~   Jonesforth calls this CMOVE and has a CCOPY which is a single byte. POSIX,
~ however, has memcpy() for non-overlapping blocks and memmove() for
~ overlapping blocks. We follow the latter convention, because it feels like
~ the more important distinction.
~
~   Also, the "c" was meant to indicate that it works at one-byte granularity,
~ but that isn't, uh... actually an important property here, and as a blanket
~ call we're not using letters to denote data sizes. So we call it "memcopy".
~ Apologies to the C programming tradition but vowels are good, actually.
~
~   Jonesforth also offers C@C! as another name for its CCOPY, but neither
~ "@!" nor "mem@mem!" seems particulaly nice.
~
~ (destination, source, length --)
: memcopy
  [ here @
    ~   We need to save and restore rsi; the other registers we can trample.
    :rsi :rdx mov-reg64-reg64
    :rcx pop-reg64
    :rsi pop-reg64
    :rdi pop-reg64
    ~   We start from the low end, since that's easier arithmetic. So, we get
    ~ to leave the DF flag alone.
    rep-movs8
    :rdx :rsi mov-reg64-reg64
    here ! ] ;asm

~   This does the same thing as memcopy, but correctly handles situations
~ where the destination overlaps with the source. It achieves this by being
~ careful about which end the transfer starts from. This "move" vs. "copy"
~ distinction mirrors C terminology.
~
~ (destination, source, length --)
: memmove
  [ here @
    ~ We need to save and restore rsi; the other registers we can trample.
    :rsi :rdx mov-reg64-reg64
    :rcx pop-reg64
    :rsi pop-reg64
    :rdi pop-reg64

    ~ We need to check source < destination to decide which end to start from.
    :rsi :rax mov-reg64-reg64
    :rdi :rax cmp-reg64-reg64
    ~ Relative offsets are from the start of the instruction after the jmp.
    4 :cc-below jmp-cc-rel-imm8

    ~ If source is greater, we are sliding downwards so we start from the low
    ~ end. So, we get to leave the DF flag alone.
    rep-movs8                         ~ 2 bytes
    16 jmp-rel-imm8                   ~ 2 bytes

    ~ If destination is greater, we are sliding upwards so we start from the
    ~ high end. So, we have to save and restore DF. Also, we have to adjust the
    ~ pointers.
    :rcx :rsi add-reg64-reg64         ~ 3 bytes
    :rsi dec-reg64                    ~ 3 bytes
    :rcx :rdi add-reg64-reg64         ~ 3 bytes
    :rdi dec-reg64                    ~ 3 bytes
    std                               ~ 1 byte
    rep-movs8                         ~ 2 bytes
    cld                               ~ 1 byte

    :rdx :rsi mov-reg64-reg64
    here ! ] ;asm


~ (string pointer -- string length not including null byte)
: stringlen
  [ here @
    :rdi pop-reg64
    :rdi :rbx mov-reg64-reg64
    :rax :rax xor-reg64-reg64
    :rcx :rcx xor-reg64-reg64
    :rcx not-reg64
    repnz-scas8
    :rbx :rdi sub-reg64-reg64
    1 :rdi sub-reg64-imm8
    :rdi push-reg64
    here ! ] ;asm

~   This word is for working with strings that are null-terminated at both
~ ends. It counts backwards from the end to find the string length. It accepts
~ an input pointer to the final non-null byte at the end of the string, and
~ returns the length not including either null byte.
~
~ (string end pointer -- string length not including null)
: reverse-stringlen
  [ here @
    :rdi pop-reg64

    :rdi :rbx mov-reg64-reg64
    :rax :rax xor-reg64-reg64
    :rcx :rcx xor-reg64-reg64
    :rcx not-reg64

    std
    repnz-scas8
    cld

    :rdi :rbx sub-reg64-reg64
    1 :rbx sub-reg64-imm8

    :rbx push-reg64
    here ! ] ;asm


~   If you have a variable-length string followed by alignment padding, and
~ you want to traverse it in reverse, you also need to skip the alignment
~ padding...
~
~   The alignment padding can have a total length of no more than 8 bytes. The
~ returned value counts from the end, backwards, to the nearest non-null byte,
~ or at most a distance of eight.
~
~ (address of final null byte in padding -- count of null bytes in padding)
: reverse-padding-len
  [ here @
    :rdi pop-reg64

    :rdi :rbx mov-reg64-reg64
    :rax :rax xor-reg64-reg64
    9 :rcx mov-reg64-imm32

    std
    repz-scas8
    cld

    :rdi :rbx sub-reg64-reg64
    1 :rbx sub-reg64-imm8

    :rbx push-reg64
    here ! ] ;asm


~   We make this work using exactly two jump instructions, which is likely the
~ minimum possible. To avoid relying on labels, we hand-compute the byte
~ offsets, so every instruction within their ranges is annotated with its
~ length in bytes. Good fun.
~
~   The returned comparsion value is 0 for equal, 1 to indicate the left
~ string (lower on the stack) is greater, and -1 to indicate the right string
~ (top of the stack) is greater.
~
~ (left string pointer, right string pointer -- comparison value)
: stringcmp
  [ here @
    ~   Save the rsi register.
    ~
    ~   Happily, we don't need a lot of registers for this code, so we can
    ~ dedicate rdx to this and not have to deal with stack juggling..
    :rsi :rdx mov-reg64-reg64

    ~   Get the parameters off the stack.
    ~
    ~   For reasons that will be explained below, the left and right strings
    ~ from our caller's perspective are swapped from the perspective of the
    ~ comparisons we'll be doing. The rdi register points to the right-hand
    ~ operand of the "cmps" instruction, while rsi points to the left-hand
    ~ operand; [Intel] volume 2A, chapter 3, section 4-3.3, "CMPS"; note that
    ~ the PDF mis-numbers the sections in the second half of chapter 3.
    :rsi pop-reg64
    :rdi pop-reg64

    ~   For the comparison-unequal loop-exit code, we'll need rbx set to zero.
    ~ The easy way to do that is by xor'ing it with itself, but doing it at
    ~ the end of the loop would overwrite the comparison flags, which we need.
    ~ So we do it in advance, and carefully don't touch it anywhere else.
    ~
    ~   See the list of which instructions affect the flags in [Intel]
    ~ volume 1, appendix A, section A-1, table A-2.
    :rbx :rbx xor-reg64-reg64

    ~   We also clear rax; we'll be using al later, and it will be convenient
    ~ to know the upper bits are always zero.
    :rax :rax xor-reg64-reg64

    ~ Now we've initialized everything we need; everything after this point is
    ~ part of the loop.

    ~   At the start of each iteration, save a copy of the byte we're at now.
    ~ We need to do this before cmps because the pointers will increment;
    ~ we'll eventually use it to test whether we've reached the end delimiter.
    ~
    ~   We do a 64-bit load into rcx, which is otherwise unused, then copy the
    ~ low byte from cl to al. This avoids having to think about addressing
    ~ modes that combine 64-bit addresses with smaller data sizes; those are
    ~ very subtle.
    :rsi :rcx mov-reg64-indirect-reg64  ~ 3 bytes
    :cl :al mov-reg8-reg8               ~ 2 bytes

    ~   Now do the cmps, which is the heart of the loop.
    ~
    ~   Note that, in addition to relying on this comparison to test content
    ~ bytes against each other, in the event that one string is a prefix of
    ~ the other, this will also test content bytes against null delimiters.
    ~ The longer string will compare as "greater", because zero is less than
    ~ all other possible byte values. This is what we want.
    ~
    ~   Since strings of different lengths are necessarily unequal, letting
    ~ this test do the work of detecting that also means we don't have to deal
    ~ with scenarios where we're past the end of one string but not the other.
    cmps8                               ~ 1 byte

    ~   The flags are now set based on the simulated subtraction of the next
    ~ bytes from the two strings. If they were unequal, the loop will end. If
    ~ they were equal, we have another test to do before we're finished with
    ~ this iteration. So we put the loop-end code next, and conditionally jump
    ~ forward past it.
    ~
    ~   Recall that relative offsets are from the start of the instruction after
    ~ the jmp.
    15 :cc-equal jmp-cc-rel-imm8        ~ 2 bytes

    ~   If we got here, the strings are unequal, so we need to turn the flags
    ~ into a comparison value. We cleared rbx earlier; now we set its low bit
    ~ to the "above" flag, then use sbb to subtract the "carry" flag (which it
    ~ thinks of as the "borrow" flag).
    ~
    ~   Recall that "carry" is based on whether the comparison would cause a
    ~ change to the next bit "outside" the bits being compared; [Intel]
    ~ volume 1, chapter 3, section 3-4.3.1. Since the comparison is a
    ~ subtraction, "carry" is true when the right-hand byte is strictly
    ~ greater than the left-hand byte.
    ~
    ~   Also recall that "zero", also called "equal", is based on whether the
    ~ comparison's result produces output that's all zero bits. Since the
    ~ comparison is a subtraction, "zero" is true when the two bytes are
    ~ identical.
    ~
    ~   The "above" condition is not a flag, but a composite of flags. It's
    ~ true when both "carry" and "zero" are false; [Intel] volume 1, chapter
    ~ 7, section 7-3.1.1, table 7-2. For us, this is equivalent to saying the
    ~ left-hand byte is strictly greater than the right-hand.
    ~
    ~   So, after the "set" instruction, rbx is 1 if left > right, and 0 if
    ~ left = right or left < right. The "sbb" instruction with an immediate
    ~ value of zero subtracts the value of the carry flag from rbx; the flag
    ~ is true in the case where left < right, and false otherwise. So, after
    ~ the "sbb", rbx is 1 if left > right, 0 if left = right, and -1 if
    ~ left < right.
    ~
    ~   This sounds like the opposite of what we want, but recall that we
    ~ exchanged the operands, so these are the values our caller is expecting.
    ~ While it might be tempting to look for a way to not need that
    ~ transposition, and indeed there exists an approach using "not above",
    ~ it's fiddly in a way that's even harder to explain.
    ~
    ~   Note that it's important that the "jmp" and "set" instructions don't
    ~ change the flags (see table A-2 again), so the "cmps" is the most recent
    ~ thing that did. The "sbb" instruction does change them, but we don't
    ~ need them again after that point so it doesn't matter.
    ~
    ~   Finally, we need to restore rsi before we return to Forth.
    ~
    ~   The two-instruction set-sbb sequence is one of those classic assembly
    ~ programming tricks that often goes unexplained, because most of its
    ~ appeal is its brevity and a proper explanation is quite lengthy. Please
    ~ enjoy this gift of knowledge, and thanks to our friends who showed it to
    ~ us.
    :cc-above :bl set-reg8-cc           ~ 3 bytes
    0 :rbx sbb-reg64-imm8               ~ 4 bytes
    :rbx push-reg64                     ~ 1 byte
    :rdx :rsi mov-reg64-reg64           ~ 3 bytes
    pack-next                           ~ 4 bytes

    ~   If we got here, the current bytes are equal to each other. We still
    ~ need to test if they're null terminators; if so, we exit, and if not, we
    ~ loop.
    ~
    ~   At the beginning of this iteration, we saved a copy of the byte being
    ~ inspected in rax. The "test" instruction simulates a xor and sets the
    ~ flags accordingly. We check the "not equal" condition, also known as
    ~ "not zero", and jump backwards to the start of the loop when that's the
    ~ case.
    ~
    ~   Recall that relative offsets are from the start of the instruction
    ~ after the jmp.
    :rax :rax test-reg64-reg64          ~ 3 bytes
    -28 :cc-not-equal jmp-cc-rel-imm8   ~ 2 bytes

    ~   If we got here, we got all the way to the end of both strings and
    ~ found a null byte, so we return 0 to indicate they're equal.
    ~
    ~   We can use push-imm32-extended64 even though our stack holds 64-bit
    ~ values, because it gets sign-extended.
    ~
    ~   We need to restore rsi before we return to Forth.
    0 push-imm32-extended64
    :rdx :rsi mov-reg64-reg64
    here ! ] ;asm


~ Branching
~ ~~~~~~~~~

~   Okay, so there's a weird thing about how branch and 0branch are defined.
~ 0branch jumps into branch, so they need to be next to each other in the log.
~ To compute the offset to jump by, we pass an address on the stack. Notice
~ that assembly words are always defined in immediate mode; it's just they
~ usually only do trivial logic.

~   This takes a number of bytes, not machine words. That allows it to be used
~ for putting weird things embedded in the code.
~
~   The offset is relative to the start of the word the number of bytes is in,
~ so, make sure to have it skip itself.
: branch
  [ here @
    dup swap ~ Save a copy of this address for later.
    :rsi :rsi add-reg64-indirect-reg64
    here ! ] ;asm
~ (jump destination)

: 0branch
  [ here @
    ~ (jump destination, output point)

    :rax pop-reg64
    :rax :rax test-reg64-reg64

    ~   Please notice the 8-bit branch to the nearby word. This is the offset
    ~ from the end of the jmp instruction, to the start of the target
    ~ instruction. To avoid needing the label system for this, we compute it
    ~ based on the address before the jmp; the jmp itself is two bytes.
    ~
    ~   Remember, with jump arithmetic, subtract the start from the
    ~ destination.
    ~
    ~   What we're testing with :cc-equal is that the input value is zero.
    ~ The name is slightly counterintuitive; it's a result of the condition
    ~ names favoring cmp over test. While cmp simulates subtraction, test
    ~ simulates bitwise AND.
    dup 2 + 3roll swap - :cc-equal jmp-cc-rel-imm8
    ~ (output point)

    ~   In the event we didn't jump, we still need to skip over the literal
    ~ value.  Using lods here is just a convenient way to skip rsi forward.
    lods64

    here ! ] ;asm
~ the stack is empty once more


~   This is like next, but instead of using rsi as the "instruction pointer",
~ it takes a codeword address from the value stack.
~
~   In the event that the codeword is docol, docol will handle any
~ manipulation of the control stack that needs to happen. Yes, it really is
~ that simple.
~
~ (execution token --)
: execute
  [ here @
    :rax pop-reg64
    :rax jmp-abs-indirect-reg64
    here ! ] ;asm


~ Dictionary entries
~ ~~~~~~~~~~~~~~~~~~
~
~   Now, we have a bunch of words that are used for traversing the Forth
~ core data structures that describe words. First, we have a couple that
~ relate to individual words and their pieces...
~
~   The log-load transform produces code that requires
~ entry-to-execution-token, which means it's needed statically. So this stuff
~ to deal with word entry headers might as well go in core, since it has no
~ dependencies to speak of.
~
~   These are the first words in core that are implemented in Forth rather
~ than assembler. That's not as big a deal as it may seem; the Forth execution
~ model has been ready-to-go ever since we implemented docol and exit, and
~ at this point we have enough basics to do useful things with it.

~   Jonesforth calls this "TFCA" and ">CFA"; its author speculates that the
~ original meaning is "code field address".
~
~ (entry pointer -- execution token)
: entry-to-execution-token
  ~ Skip next-entry pointer, flag byte, and start terminator.
  10 +
  ~ Skip string contents.
  dup stringlen +
  ~ Skip one for the null terminator, seven more for alignment.
  8 +
  ~ Zero the low bits and now it's aligned.
  7 invert & ;

~   Jonesforth calls this "CFA>". Jonesforth's implementation searches the
~ entire dictionary, since its word header format isn't designed to be
~ traversed in reverse, but ours is, so it should be fast.
~
~ (execution token -- entry pointer)
: execution-token-to-entry
  1 -
  dup reverse-padding-len -
  dup reverse-stringlen -
  9 - ;

~ (entry pointer -- flags byte)
: entry-flags@
  8 + @ 0xFF & ;

~ TODO these parameters are in a counterintuitive order, swap them
~ (entry pointer, new flags byte --)
: entry-flags!
  swap
  8 +
  dup @ 3roll
  0xFF &
  swap 0xFFFFFFFFFFFFFF00 & |
  swap !
  ;

~ (entry pointer -- name string pointer)
: entry-to-name 10 + ;


~ Binary packing
~ ~~~~~~~~~~~~~~
~
~   These routines are for building up data structures in-memory. Sometimes
~ they're used for structures that are meant to stay in memory; other times
~ it's a buffer that will become output.
~
~   The general pattern is that each routine takes an output address and
~ some specific datum, and returns the output address adjusted to point
~ after the new datum. That makes them easy to chain together. We call this
~ address the "output point", to capture the idea that it's a running total
~ which gets updated by each new datum as it's packed.

~ (output point, value -- output point)
: pack64 swap dup 3unroll ! 8 + ;
: pack32 swap dup 3unroll 32! 4 + ;
: pack16 swap dup 3unroll 16! 2 + ;
: pack8 swap dup 3unroll 8! 1 + ;

~   This works on C-style strings, which are characters followed by a null
~ terminator. The packed data includes the null terminator.
~
~ (output point, string pointer -- output point)
: packstring
  dup stringlen 1 + dup
  ~ (output point, source, length, length)
  4 roll dup 5 unroll
  ~ (destination, source, length, length, output point)
  + 4 unroll
  ~ (output point, destination, source, length)
  memcopy ;

~ (output point, alignment byte count -- output point)
: packalign
  2dup /% drop 0branch [ 8 8 * , ]
  swap 0 pack8 swap branch [ -11 8 * , ]
  drop ;
  ~ TODO this is the implementation we could use if we had more flow-control
  ~ { 2dup /% drop { drop exit } unless
  ~  swap 0 pack8 swap } forever ;


~ Binary unpacking
~ ~~~~~~~~~~~~~~~~
~
~   These routines are for examining data structures in-memory.
~
~   Similarly to the output routines, each routine takes an input address,
~ which it updates to point after the data item being read. We call this the
~ "input point". Since this is input, the routines return data items rather
~ than accepting them.

~ (input point -- input point, value)
: unpack64 dup @ swap 8 + swap ;
: unpack32 dup 32@ swap 4 + swap ;
: unpack16 dup 16@ swap 2 + swap ;
: unpack8 dup 8@ swap 1 + swap ;

~ TODO does this need to have a separate name?
~ (proposed size, alignment byte count -- adjusted size)
: align-size
  dup 3unroll dup 3unroll
  ~ (alignment, alignment, proposed size, alignment)
  1 - + swap /% swap drop * ;

~   You might think this would be identical to packalign, but packalign has
~ side effects.
~
~ (input point, alignment byte count -- input point)
: unpackalign align-size ;


~ Development utilities
~ ~~~~~~~~~~~~~~~~~~~~~

~   This peforms the "hlt" instruction (Intel's mnemomic, short for "halt"),
~ which will cause the program to exit with a segmentation fault. If you're
~ running under a debugger, this is a convenient way to get execution to stop
~ at a certain point.
~
~   It's called "crash" rather than "hlt" to distinguish it from the word
~ which outputs the instruction as machine code.

: crash
  [ here @
    hlt
    here ! ] ;asm