1 /* A minimal FORTH interpreter for Linux / i386 systems. -*- asm -*-
2 * By Richard W.M. Jones <rich@annexia.org>
4 * gcc -m32 -nostdlib -static -Wl,-Ttext,0 -o jonesforth jonesforth.S
7 #include <asm-i386/unistd.h>
9 /* NOTES-------------------------------------------------------------------------------------------------------------------
11 Need to say something about $ before constants.
13 And about je/jne/ja/jb/jbe/etc
26 /* Macros to deal with the return stack. */
28 lea -4(%ebp),%ebp // push reg on to return stack
33 mov (%ebp),\reg // pop top of return stack to reg
37 /* ELF entry point. */
42 mov %esp,var_S0 // Store the initial data stack pointer.
43 mov $return_stack,%ebp // Initialise the return stack.
45 mov $cold_start,%esi // Initialise interpreter.
46 NEXT // Run interpreter!
49 cold_start: // High-level code without a codeword.
52 /* DOCOL - the interpreter! */
56 PUSHRSP %esi // push %esi on to the return stack
57 addl $4,%eax // %eax points to codeword, so make
58 movl %eax,%esi // %esi point to first data word
61 /*----------------------------------------------------------------------
62 * Fixed sized buffers for everything.
66 /* FORTH return stack. */
67 #define RETURN_STACK_SIZE 8192
69 .space RETURN_STACK_SIZE
72 /* Space for user-defined words. */
73 #define USER_DEFS_SIZE 16384
83 /*----------------------------------------------------------------------
84 * Built-in words defined the long way.
89 // Store the chain of links.
92 .macro defcode name, namelen, flags=0, label
99 .byte \flags+\namelen // flags + length byte
100 .ascii "\name" // the name
104 .int code_\label // codeword
108 code_\label : // assembler code follows
111 .macro defword name, namelen, flags=0, label
117 .set link,name_\label
118 .byte \flags+\namelen // flags + length byte
119 .ascii "\name" // the name
123 .int DOCOL // codeword - the interpreter
124 // list of word pointers follow
127 .macro defvar name, namelen, flags=0, label, initial=0
128 defcode \name,\namelen,\flags,\label
137 // Some easy ones, written in assembly for speed
138 defcode "DROP",4,,DROP
139 pop %eax // drop top of stack
143 pop %eax // duplicate top of stack
148 defcode "SWAP",4,,SWAP
149 pop %eax // swap top of stack
155 defcode "OVER",4,,OVER
156 mov 4(%esp),%eax // get the second element of stack
157 push %eax // and push it on top
169 defcode "-ROT",4,,NROT
179 incl (%esp) // increment top of stack
183 decl (%esp) // decrement top of stack
186 defcode "4+",2,,INCR4
187 addl $4,(%esp) // increment top of stack
190 defcode "4-",2,,DECR4
191 subl $4,(%esp) // decrement top of stack
208 push %eax // ignore overflow
216 push %eax // push quotient
224 push %edx // push remainder
227 defcode "=",1,,EQU // top two words are equal?
237 defcode "<>",2,,NEQU // top two words are not equal?
247 defcode "0=",2,,ZEQU // top of stack equals 0?
266 defcode "INVERT",6,,INVERT
270 // COLD must not return (ie. must not call EXIT).
271 defword "COLD",4,,COLD
272 // XXX reinitialisation of the interpreter
273 .int INTERPRETER // call the interpreter loop (never returns)
274 .int LIT,1,SYSEXIT // hmmm, but in case it does, exit(1).
276 defcode "EXIT",4,,EXIT
277 POPRSP %esi // pop return stack into %esi
281 // %esi points to the next command, but in this case it points to the next
282 // literal 32 bit integer. Get that literal into %eax and increment %esi.
283 // On x86, it's a convenient single byte instruction! (cf. NEXT macro)
285 push %eax // push the literal number on to stack
288 defcode "LITSTRING",9,,LITSTRING
289 lodsl // get the length of the string
290 push %eax // push it on the stack
291 push %esi // push the address of the start of the string
292 addl %eax,%esi // skip past the string
293 addl $3,%esi // but round up to next 4 byte boundary
297 defcode "BRANCH",6,,BRANCH
298 add (%esi),%esi // add the offset to the instruction pointer
301 defcode "0BRANCH",7,,ZBRANCH
303 test %eax,%eax // top of stack is zero?
304 jz code_BRANCH // if so, jump back to the branch function above
305 lodsl // otherwise we need to skip the offset
309 pop %ebx // address to store at
310 pop %eax // data to store there
311 mov %eax,(%ebx) // store it
315 pop %ebx // address to fetch
316 mov (%ebx),%eax // fetch it
317 push %eax // push value onto stack
320 /* ! and @ (STORE and FETCH) store 32-bit words. It's also useful to be able to read and write bytes.
321 * I don't know whether FORTH has these words, so I invented my own, called !b and @b.
322 * Byte-oriented operations only work on architectures which permit them (i386 is one of those).
324 defcode "!b",2,,STOREBYTE
325 pop %ebx // address to store at
326 pop %eax // data to store there
327 movb %al,(%ebx) // store it
330 defcode "@b",2,,FETCHBYTE
331 pop %ebx // address to fetch
333 movb (%ebx),%al // fetch it
334 push %eax // push value onto stack
337 // The STATE variable is 0 for execute mode, != 0 for compile mode
338 defvar "STATE",5,,STATE
340 // This points to where compiled words go.
341 defvar "HERE",4,,HERE,user_defs_start
343 // This is the last definition in the dictionary.
344 defvar "LATEST",6,,LATEST,name_SYSEXIT // SYSEXIT must be last in built-in dictionary
346 // _X, _Y and _Z are scratch variables used by standard words.
351 // This stores the top of the data stack.
354 // This stores the top of the return stack.
355 defvar "R0",2,,RZ,return_stack
357 defcode "DSP@",4,,DSPFETCH
362 defcode "DSP!",4,,DSPSTORE
367 pop %eax // pop parameter stack into %eax
368 PUSHRSP %eax // push it on to the return stack
371 defcode "R>",2,,FROMR
372 POPRSP %eax // pop return stack on to %eax
373 push %eax // and push on to parameter stack
376 defcode "RSP@",4,,RSPFETCH
380 defcode "RSP!",4,,RSPSTORE
384 defcode "RDROP",5,,RDROP
385 lea 4(%ebp),%ebp // pop return stack and throw away
390 push %eax // push return value on stack
402 1: // out of input; use read(2) to fetch more input from stdin
403 xor %ebx,%ebx // 1st param: stdin
404 mov $buffer,%ecx // 2nd param: buffer
406 mov $buffend-buffer,%edx // 3rd param: max length
407 mov $__NR_read,%eax // syscall: read
409 test %eax,%eax // If %eax <= 0, then exit.
411 addl %eax,%ecx // buffer+%eax = bufftop
415 2: // error or out of input: exit
417 mov $__NR_exit,%eax // syscall: exit
420 defcode "EMIT",4,,EMIT
425 mov $1,%ebx // 1st param: stdout
427 // write needs the address of the byte to write
429 mov $2f,%ecx // 2nd param: address
431 mov $1,%edx // 3rd param: nbytes = 1
433 mov $__NR_write,%eax // write syscall
438 2: .space 1 // scratch used by EMIT
440 defcode "WORD",4,,WORD
442 push %ecx // push length
443 push %edi // push base address
447 /* Search for first non-blank character. Also skip \ comments. */
449 call _KEY // get next key, returned in %eax
450 cmpb $'\\',%al // start of a comment?
451 je 3f // if so, skip the comment
453 jbe 1b // if so, keep looking
455 /* Search for the end of the word, storing chars as we go. */
456 mov $5f,%edi // pointer to return buffer
458 stosb // add character to return buffer
459 call _KEY // get next key, returned in %al
460 cmpb $' ',%al // is blank?
461 ja 2b // if not, keep looping
463 /* Return the word (well, the static buffer) and length. */
465 mov %edi,%ecx // return length of the word
466 mov $5f,%edi // return address of the word
469 /* Code to skip \ comments to end of the current line. */
472 cmpb $'\n',%al // end of line yet?
477 // A static buffer where WORD returns. Subsequent calls
478 // overwrite this buffer. Maximum word length is 32 chars.
481 defcode "EMITSTRING",10,,EMITSTRING
482 mov $1,%ebx // 1st param: stdout
483 pop %ecx // 2nd param: address of string
484 pop %edx // 3rd param: length of string
486 mov $__NR_write,%eax // write syscall
492 pop %eax // Get the number to print into %eax
493 call _DOT // Easier to do this recursively ...
496 mov $10,%ecx // Base 10
500 xor %edx,%edx // %edx:%eax / %ecx -> quotient %eax, remainder %edx
514 // Parse a number from a string on the stack -- almost the opposite of . (DOT)
515 // Note that there is absolutely no error checking. In particular the length of the
516 // string must be >= 1 bytes.
517 defcode "SNUMBER",7,,SNUMBER
527 imull $10,%eax // %eax *= 10
530 subb $'0',%bl // ASCII -> digit
536 defcode "FIND",4,,FIND
537 pop %edi // %edi = address
538 pop %ecx // %ecx = length
544 push %esi // Save %esi so we can use it in string comparison.
546 // Now we start searching backwards through the dictionary for this word.
547 mov var_LATEST,%edx // LATEST points to name header of the latest word in the dictionary
549 test %edx,%edx // NULL pointer? (end of the linked list)
552 // Compare the length expected and the length of the word.
553 // Note that if the F_HIDDEN flag is set on the word, then by a bit of trickery
554 // this won't pick the word (the length will appear to be wrong).
556 movb 4(%edx),%al // %al = flags+length field
557 andb $(F_HIDDEN|0x1f),%al // %al = name length
558 cmpb %cl,%al // Length is the same?
561 // Compare the strings in detail.
562 push %ecx // Save the length
563 push %edi // Save the address (repe cmpsb will move this pointer)
564 lea 5(%edx),%esi // Dictionary string we are checking against.
565 repe cmpsb // Compare the strings.
568 jne 2f // Not the same.
570 // The strings are the same - return the header pointer in %eax
576 mov (%edx),%edx // Move back through the link field to the previous word
577 jmp 1b // .. and loop.
581 xor %eax,%eax // Return zero to indicate not found.
584 defcode ">CFA",4,,TCFA // DEA -> Codeword address
591 add $4,%edi // Skip link pointer.
592 movb (%edi),%al // Load flags+len into %al.
593 inc %edi // Skip flags+len byte.
594 andb $0x1f,%al // Just the length, not the flags.
595 add %eax,%edi // Skip the name.
596 addl $3,%edi // The codeword is 4-byte aligned.
600 defcode "CHAR",4,,CHAR
601 call _WORD // Returns %ecx = length, %edi = pointer to word.
603 movb (%edi),%al // Get the first character of the word.
604 push %eax // Push it onto the stack.
609 // Get the word and create a dictionary entry header for it.
610 call _WORD // Returns %ecx = length, %edi = pointer to word.
611 mov %edi,%ebx // %ebx = address of the word
613 movl var_HERE,%edi // %edi is the address of the header
614 movl var_LATEST,%eax // Get link pointer
615 stosl // and store it in the header.
617 mov %cl,%al // Get the length.
618 orb $F_HIDDEN,%al // Set the HIDDEN flag on this entry.
619 stosb // Store the length/flags byte.
621 mov %ebx,%esi // %esi = word
622 rep movsb // Copy the word
624 addl $3,%edi // Align to next 4 byte boundary.
627 movl $DOCOL,%eax // The codeword for user-created words is always DOCOL (the interpreter)
630 // Header built, so now update LATEST and HERE.
631 // We'll be compiling words and putting them HERE.
636 // And go into compile mode by setting STATE to 1.
641 pop %eax // Code pointer to store.
645 movl var_HERE,%edi // HERE
647 movl %edi,var_HERE // Update HERE (incremented)
650 defcode "HIDDEN",6,,HIDDEN
654 movl var_LATEST,%edi // LATEST word.
655 addl $4,%edi // Point to name/flags byte.
656 xorb $F_HIDDEN,(%edi) // Toggle the HIDDEN bit.
659 defcode "IMMEDIATE",9,F_IMMED,IMMEDIATE
663 movl var_LATEST,%edi // LATEST word.
664 addl $4,%edi // Point to name/flags byte.
665 xorb $F_IMMED,(%edi) // Toggle the IMMED bit.
668 defcode ";",1,F_IMMED,SEMICOLON
669 movl $EXIT,%eax // EXIT is the final codeword in compiled words.
670 call _COMMA // Store it.
671 call _HIDDEN // Toggle the HIDDEN flag (unhides the new word).
672 xor %eax,%eax // Set STATE to 0 (back to execute mode).
676 /* This definiton of ' (TICK) is strictly cheating - it also only works in compiled code. */
678 lodsl // Get the address of the next word and skip it.
679 pushl %eax // Push it on the stack.
682 /* This interpreter is pretty simple, but remember that in FORTH you can always override
683 * it later with a more powerful one!
685 defword "INTERPRETER",11,,INTERPRETER
686 .int INTERPRET,RDROP,INTERPRETER
688 defcode "INTERPRET",9,,INTERPRET
689 call _WORD // Returns %ecx = length, %edi = pointer to word.
691 // Is it in the dictionary?
693 movl %eax,interpret_is_lit // Not a literal number (not yet anyway ...)
694 call _FIND // Returns %eax = pointer to header or 0 if not found.
695 test %eax,%eax // Found?
698 // In the dictionary. Is it an IMMEDIATE codeword?
699 mov %eax,%edi // %edi = dictionary entry
700 movb 4(%edi),%al // Get name+flags.
701 push %ax // Just save it for now.
702 call _TCFA // Convert dictionary entry (in %edi) to codeword pointer.
704 andb $F_IMMED,%al // Is IMMED flag set?
706 jnz 4f // If IMMED, jump straight to executing.
710 1: // Not in the dictionary (not a word) so assume it's a literal number.
711 incl interpret_is_lit
712 call _SNUMBER // Returns the parsed number in %eax
714 mov $LIT,%eax // The word is LIT
716 2: // Are we compiling or executing?
719 jz 4f // Jump if executing.
721 // Compiling - just append the word to the current dictionary definition.
723 mov interpret_is_lit,%ecx // Was it a literal?
726 mov %ebx,%eax // Yes, so LIT is followed by a number.
730 4: // Executing - run it!
731 mov interpret_is_lit,%ecx // Literal?
732 test %ecx,%ecx // Literal?
735 // Not a literal, execute it now. This never returns, but the codeword will
736 // eventually call NEXT which will reenter the loop in INTERPRETER.
739 5: // Executing a literal, which means push it on the stack.
746 .int 0 // Flag used to record if reading a literal
748 // NB: SYSEXIT must be the last entry in the built-in dictionary.
749 defcode SYSEXIT,7,,SYSEXIT
754 /*----------------------------------------------------------------------
755 * Input buffer & initial input.
760 // XXX gives 'Warning: unterminated string; newline inserted' messages which you can ignore
762 \\ Define some character constants
768 \\ CR prints a carriage return
771 \\ SPACE prints a space
772 : SPACE 'SPACE' EMIT ;
774 \\ Primitive . (DOT) function doesn't follow with a blank, so redefine it to behave like FORTH.
775 \\ Notice how we can trivially redefine existing functions.
778 \\ DUP, DROP are defined in assembly for speed, but this is how you might define them
779 \\ in FORTH. Notice use of the scratch variables _X and _Y.
780 \\ : DUP _X ! _X @ _X @ ;
783 \\ The 2... versions of the standard operators work on pairs of stack entries. They're not used
784 \\ very commonly so not really worth writing in assembler. Here is how they are defined in FORTH.
788 \\ More standard FORTH words.
792 \\ [ and ] allow you to break into immediate mode while compiling a word.
793 : [ IMMEDIATE \\ define [ as an immediate word
794 0 STATE ! \\ go into immediate mode
798 1 STATE ! \\ go back to compile mode
801 \\ LITERAL takes whatever is on the stack and compiles LIT <foo>
803 ' LIT , \\ compile LIT
804 , \\ compile the literal itself (from the stack)
807 \\ condition IF true-part THEN rest
809 \\ condition 0BRANCH OFFSET true-part rest
810 \\ where OFFSET is the offset of 'rest'
811 \\ condition IF true-part ELSE false-part THEN
813 \\ condition 0BRANCH OFFSET true-part BRANCH OFFSET2 false-part rest
814 \\ where OFFSET if the offset of false-part and OFFSET2 is the offset of rest
816 \\ IF is an IMMEDIATE word which compiles 0BRANCH followed by a dummy offset, and places
817 \\ the address of the 0BRANCH on the stack. Later when we see THEN, we pop that address
818 \\ off the stack, calculate the offset, and back-fill the offset.
820 ' 0BRANCH , \\ compile 0BRANCH
821 HERE @ \\ save location of the offset on the stack
822 0 , \\ compile a dummy offset
827 HERE @ SWAP - \\ calculate the offset from the address saved on the stack
828 SWAP ! \\ store the offset in the back-filled location
832 ' BRANCH , \\ definite branch to just over the false-part
833 HERE @ \\ save location of the offset on the stack
834 0 , \\ compile a dummy offset
835 SWAP \\ now back-fill the original (IF) offset
836 DUP \\ same as for THEN word above
841 \\ BEGIN loop-part condition UNTIL
843 \\ loop-part condition 0BRANCH OFFSET
844 \\ where OFFSET points back to the loop-part
845 \\ This is like do { loop-part } while (condition) in the C language
847 HERE @ \\ save location on the stack
851 ' 0BRANCH , \\ compile 0BRANCH
852 HERE @ - \\ calculate the offset from the address saved on the stack
853 , \\ compile the offset here
856 \\ BEGIN loop-part AGAIN
858 \\ loop-part BRANCH OFFSET
859 \\ where OFFSET points back to the loop-part
860 \\ In other words, an infinite loop which can only be returned from with EXIT
862 ' BRANCH , \\ compile BRANCH
863 HERE @ - \\ calculate the offset back
864 , \\ compile the offset here
867 \\ BEGIN condition WHILE loop-part REPEAT
869 \\ condition 0BRANCH OFFSET2 loop-part BRANCH OFFSET
870 \\ where OFFSET points back to condition (the beginning) and OFFSET2 points to after the whole piece of code
871 \\ So this is like a while (condition) { loop-part } loop in the C language
873 ' 0BRANCH , \\ compile 0BRANCH
874 HERE @ \\ save location of the offset2 on the stack
875 0 , \\ compile a dummy offset2
879 ' BRANCH , \\ compile BRANCH
880 SWAP \\ get the original offset (from BEGIN)
881 HERE @ - , \\ and compile it after BRANCH
883 HERE @ SWAP - \\ calculate the offset2
884 SWAP ! \\ and back-fill it in the original location
887 \\ With the looping constructs, we can now write SPACES, which writes n spaces to stdout.
890 SPACE \\ print a space
891 1- \\ until we count down to 0
896 \\ .S prints the contents of the stack. Very useful for debugging.
898 DSP@ \\ get current stack pointer
900 DUP @ . \\ print the stack element
902 DUP S0 @ 4- = \\ stop when we get to the top
907 \\ DEPTH returns the depth of the stack.
908 : DEPTH S0 @ DSP@ - ;
910 \\ .\" is the print string operator in FORTH. Example: .\" Something to print\"
911 \\ The space after the operator is the ordinary space required between words.
912 \\ This is tricky to define because it has to do different things depending on whether
913 \\ we are compiling or in immediate mode. (Thus the word is marked IMMEDIATE so it can
914 \\ detect this and do different things).
915 \\ In immediate mode we just keep reading characters and printing them until we get to
916 \\ the next double quote.
917 \\ In compile mode we have the problem of where we're going to store the string (remember
918 \\ that the input buffer where the string comes from may be overwritten by the time we
919 \\ come round to running the function). We store the string in the compiled function
921 \\ LITSTRING, string length, string rounded up to 4 bytes, EMITSTRING, ...
923 STATE @ \\ compiling?
925 ' LITSTRING , \\ compile LITSTRING
926 HERE @ \\ save the address of the length word on the stack
927 0 , \\ dummy length - we don't know what it is yet
929 KEY \\ get next character of the string
932 HERE @ !b \\ store the character in the compiled image
933 HERE @ 1+ HERE ! \\ increment HERE pointer by 1 byte
935 DROP \\ drop the double quote character at the end
936 DUP \\ get the saved address of the length word
937 HERE @ SWAP - \\ calculate the length
938 4- \\ subtract 4 (because we measured from the start of the length word)
939 SWAP ! \\ and back-fill the length location
940 HERE @ \\ round up to next multiple of 4 bytes for the remaining code
944 ' EMITSTRING , \\ compile the final EMITSTRING
946 \\ In immediate mode, just read characters and print them until we get
947 \\ to the ending double quote. Much simpler than the above code!
950 DUP '\"' = IF EXIT THEN
956 \\ While compiling, [COMPILE] WORD compiles WORD if it would otherwise be IMMEDIATE.
957 : [COMPILE] IMMEDIATE
958 WORD \\ get the next word
959 FIND \\ find it in the dictionary
960 >CFA \\ get its codeword
961 , \\ and compile that
964 \\ RECURSE makes a recursive call to the current word that is being compiled.
965 \\ Normally while a word is being compiled, it is marked HIDDEN so that references to the
966 \\ same word within are calls to the previous definition of the word.
968 LATEST @ >CFA \\ LATEST points to the word being compiled at the moment
973 \\ Finally print the welcome prompt.