X-Git-Url: http://git.annexia.org/?a=blobdiff_plain;f=jonesforth.S;h=624eba46460c14f1e560a7eaeaa5072afc854a9c;hb=7bd36863d6d71c4e4f94ba745d37b21f47e2ea9c;hp=44e837be3e56c47d6e9010934f9ebe4cc50b100b;hpb=c5eb13093afa5e66283806b8eb95cd3c9648a585;p=jonesforth.git diff --git a/jonesforth.S b/jonesforth.S index 44e837b..624eba4 100644 --- a/jonesforth.S +++ b/jonesforth.S @@ -1,11 +1,11 @@ /* A sometimes minimal FORTH compiler and tutorial for Linux / i386 systems. -*- asm -*- By Richard W.M. Jones http://annexia.org/forth This is PUBLIC DOMAIN (see public domain release statement below). - $Id: jonesforth.S,v 1.29 2007-09-24 00:37:01 rich Exp $ + $Id: jonesforth.S,v 1.40 2007-09-29 22:12:07 rich Exp $ gcc -m32 -nostdlib -static -Wl,-Ttext,0 -o jonesforth jonesforth.S */ - .set JONES_VERSION,29 + .set JONES_VERSION,39 /* INTRODUCTION ---------------------------------------------------------------------- @@ -45,7 +45,8 @@ over every other element in a list of numbers? You can add it to the language. What about an operator which pulls in variables directly from a configuration file and makes them available as FORTH variables? Or how about adding Makefile-like dependencies to - the language? No problem in FORTH. This concept isn't common in programming languages, + the language? No problem in FORTH. How about modifying the FORTH compiler to allow + complex inlining strategies -- simple. This concept isn't common in programming languages, but it has a name (in fact two names): "macros" (by which I mean LISP-style macros, not the lame C preprocessor) and "domain specific languages" (DSLs). @@ -74,8 +75,14 @@ This code draws heavily on the design of LINA FORTH (http://home.hccnet.nl/a.w.m.van.der.horst/lina.html) by Albert van der Horst. Any similarities in the code are probably not accidental. - Also I used this document (http://ftp.funet.fi/pub/doc/IOCCC/1992/buzzard.2.design) which really - defies easy explanation. + Some parts of this FORTH are also based on this IOCCC entry from 1992: + http://ftp.funet.fi/pub/doc/IOCCC/1992/buzzard.2.design. + I was very proud when Sean Barrett, the original author of the IOCCC entry, commented in the LtU thread + http://lambda-the-ultimate.org/node/2452#comment-36818 about this FORTH. + + And finally I'd like to acknowledge the (possibly forgotten?) authors of ARTIC FORTH because their + original program which I still have on original cassette tape kept nagging away at me all these years. + http://en.wikipedia.org/wiki/Artic_Software PUBLIC DOMAIN ---------------------------------------------------------------------- @@ -600,7 +607,7 @@ bufftop: /* BUILT-IN WORDS ---------------------------------------------------------------------- - Remember our dictionary entries (headers). Let's bring those together with the codeword + Remember our dictionary entries (headers)? Let's bring those together with the codeword and data words to see how : DOUBLE DUP + ; really looks in memory. pointer to previous word @@ -750,6 +757,14 @@ code_\label : // assembler code follows push %ecx NEXT + defcode "?DUP",4,,QDUP // duplicate top of stack if non-zero + pop %eax + test %eax,%eax + jz 1f + push %eax +1: push %eax + NEXT + defcode "1+",2,,INCR incl (%esp) // increment top of stack NEXT @@ -783,20 +798,19 @@ code_\label : // assembler code follows push %eax // ignore overflow NEXT - defcode "/",1,,DIV - xor %edx,%edx - pop %ebx - pop %eax - idivl %ebx - push %eax // push quotient - NEXT +/* + In this FORTH, only /MOD is primitive. Later we will define the / and MOD words in + terms of the primitive /MOD. The design of the i386 assembly instruction idiv which + leaves both quotient and remainder makes this obvious choice. +*/ - defcode "MOD",3,,MOD + defcode "/MOD",4,,DIVMOD xor %edx,%edx pop %ebx pop %eax idivl %ebx push %edx // push remainder + push %eax // push quotient NEXT defcode "=",1,,EQU // top two words are equal? @@ -877,7 +891,7 @@ code_\label : // assembler code follows 1: pushl $1 NEXT - defcode "0<",2,,ZLT + defcode "0<",2,,ZLT // comparisons with 0 pop %eax test %eax,%eax jl 1f @@ -913,22 +927,22 @@ code_\label : // assembler code follows 1: pushl $1 NEXT - defcode "AND",3,,AND + defcode "AND",3,,AND // bitwise AND pop %eax andl %eax,(%esp) NEXT - defcode "OR",2,,OR + defcode "OR",2,,OR // bitwise OR pop %eax orl %eax,(%esp) NEXT - defcode "XOR",3,,XOR + defcode "XOR",3,,XOR // bitwise XOR pop %eax xorl %eax,(%esp) NEXT - defcode "INVERT",6,,INVERT // this is the FORTH bitwise "NOT" function + defcode "INVERT",6,,INVERT // this is the FORTH bitwise "NOT" function (cf. NEGATE) notl (%esp) NEXT @@ -977,7 +991,7 @@ code_\label : // assembler code follows | addr of EXIT | +------------------+ - And NEXT just completes the job by, well in this case just by calling DOUBLE again :-) + And NEXT just completes the job by, well, in this case just by calling DOUBLE again :-) LITERALS ---------------------------------------------------------------------- @@ -1045,18 +1059,20 @@ code_\label : // assembler code follows subl %eax,(%ebx) // add it NEXT -/* ! and @ (STORE and FETCH) store 32-bit words. It's also useful to be able to read and write bytes. - * I don't know whether FORTH has these words, so I invented my own, called !b and @b. - * Byte-oriented operations only work on architectures which permit them (i386 is one of those). - * UPDATE: writing a byte to the dictionary pointer is called C, in FORTH. +/* + ! and @ (STORE and FETCH) store 32-bit words. It's also useful to be able to read and write bytes + so we also define standard words C@ and C!. + + Byte-oriented operations only work on architectures which permit them (i386 is one of those). */ - defcode "!b",2,,STOREBYTE + + defcode "C!",2,,STOREBYTE pop %ebx // address to store at pop %eax // data to store there movb %al,(%ebx) // store it NEXT - defcode "@b",2,,FETCHBYTE + defcode "C@",2,,FETCHBYTE pop %ebx // address to fetch xor %eax,%eax movb (%ebx),%al // fetch it @@ -1103,7 +1119,7 @@ var_\name : */ defvar "STATE",5,,STATE defvar "HERE",4,,HERE,user_defs_start - defvar "LATEST",6,,LATEST,name_SYSEXIT // SYSEXIT must be last in built-in dictionary + defvar "LATEST",6,,LATEST,name_SYSCALL3 // SYSCALL3 must be last in built-in dictionary defvar "_X",2,,TX defvar "_Y",2,,TY defvar "_Z",2,,TZ @@ -1123,9 +1139,14 @@ var_\name : DOCOL Pointer to DOCOL. F_IMMED The IMMEDIATE flag's actual value. F_HIDDEN The HIDDEN flag's actual value. - F_LENMASK The length mask. + F_LENMASK The length mask in the flags/len byte. + + SYS_* and the numeric codes of various Linux syscalls (from ) */ +//#include // you might need this instead +#include + .macro defconst name, namelen, flags=0, label, value defcode \name,\namelen,\flags,\label push $\value @@ -1139,6 +1160,13 @@ var_\name : defconst "F_HIDDEN",8,,__F_HIDDEN,F_HIDDEN defconst "F_LENMASK",9,,__F_LENMASK,F_LENMASK + defconst "SYS_EXIT",8,,SYS_EXIT,__NR_exit + defconst "SYS_OPEN",8,,SYS_OPEN,__NR_open + defconst "SYS_CLOSE",9,,SYS_CLOSE,__NR_close + defconst "SYS_READ",8,,SYS_READ,__NR_read + defconst "SYS_WRITE",9,,SYS_WRITE,__NR_write + defconst "SYS_CREAT",9,,SYS_CREAT,__NR_creat + /* RETURN STACK ---------------------------------------------------------------------- @@ -1210,8 +1238,6 @@ var_\name : exits the program, which is why when you hit ^D the FORTH system cleanly exits. */ -#include - defcode "KEY",3,,KEY call _KEY push %eax // push return value on stack @@ -1276,7 +1302,7 @@ _EMIT: What it does in detail is that it first skips any blanks (spaces, tabs, newlines and so on). Then it calls KEY to read characters into an internal buffer until it hits a blank. Then it calculates the length of the word it read and returns the address and the length as - two words on the stack (with address at the top). + two words on the stack (with the length at the top of stack). Notice that WORD has a single internal buffer which it overwrites each time (rather like a static C string). Also notice that WORD's internal buffer is just 32 bytes long and @@ -1299,8 +1325,8 @@ _EMIT: defcode "WORD",4,,WORD call _WORD - push %ecx // push length push %edi // push base address + push %ecx // push length NEXT _WORD: @@ -1339,36 +1365,9 @@ _WORD: 5: .space 32 /* - . (also called DOT) prints the top of the stack as an integer in the current BASE. -*/ - - defcode ".",1,,DOT - pop %eax // Get the number to print into %eax - call _DOT // Easier to do this recursively ... - NEXT -_DOT: - mov var_BASE,%ecx // Get current BASE -1: - cmp %ecx,%eax // %eax < BASE? If so jump to print immediately. - jb 2f - xor %edx,%edx // %edx:%eax / %ecx -> quotient %eax, remainder %edx - idivl %ecx - pushl %edx // Print quotient (top half) first ... - call _DOT - popl %eax // ... then loop to print remainder - jmp 1b -2: // %eax < BASE so print immediately. - movl $digits,%edx - addl %eax,%edx - movb (%edx),%al // Note top bits are already zero. - call _EMIT - ret - .section .rodata -digits: .ascii "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" - -/* - Almost the opposite of DOT (but not quite), SNUMBER parses a numeric string such as one returned - by WORD and pushes the number on the parameter stack. + As well as reading in words we'll need to read in numbers and for that we are using a function + called SNUMBER. This parses a numeric string such as one returned by WORD and pushes the + number on the parameter stack. This function does absolutely no error checking, and in particular the length of the string must be >= 1 bytes, and should contain only digits 0-9. If it doesn't you'll get random results. @@ -1420,10 +1419,10 @@ _SNUMBER: */ defcode "FIND",4,,FIND - pop %edi // %edi = address pop %ecx // %ecx = length + pop %edi // %edi = address call _FIND - push %eax + push %eax // %eax = address of dictionary entry (or NULL) NEXT _FIND: @@ -1943,25 +1942,28 @@ _COMMA: NEXT /* - PRINTING STRINGS ---------------------------------------------------------------------- + LITERAL STRINGS ---------------------------------------------------------------------- + + LITSTRING is a primitive used to implement the ." and S" operators (which are written in + FORTH). See the definition of those operators later. - LITSTRING and EMITSTRING are primitives used to implement the ." and S" operators - (which are written in FORTH). See the definition of those operators below. + TELL just prints a string. It's more efficient to define this in assembly because we + can make it a single Linux syscall. */ defcode "LITSTRING",9,,LITSTRING lodsl // get the length of the string - push %eax // push it on the stack push %esi // push the address of the start of the string + push %eax // push it on the stack addl %eax,%esi // skip past the string addl $3,%esi // but round up to next 4 byte boundary andl $~3,%esi NEXT - defcode "EMITSTRING",10,,EMITSTRING + defcode "TELL",4,,TELL mov $1,%ebx // 1st param: stdout - pop %ecx // 2nd param: address of string pop %edx // 3rd param: length of string + pop %ecx // 2nd param: address of string mov $__NR_write,%eax // write syscall int $0x80 NEXT @@ -1978,7 +1980,6 @@ _COMMA: // COLD must not return (ie. must not call EXIT). defword "COLD",4,,COLD .int INTERPRETER // call the interpreter loop (never returns) - .int LIT,1,SYSEXIT // hmmm, but in case it does, exit(1). /* This interpreter is pretty simple, but remember that in FORTH you can always override * it later with a more powerful one! @@ -2052,11 +2053,12 @@ interpret_is_lit: CHAR puts the ASCII code of the first character of the following word on the stack. For example CHAR A puts 65 on the stack. - SYSEXIT exits the process using Linux exit syscall. + SYSCALL3 makes a standard Linux system call. (See for a list of system call + numbers). This is the form to use when the function takes up to three parameters. - In this FORTH, SYSEXIT must be the last word in the built-in (assembler) dictionary because we + In this FORTH, SYSCALL3 must be the last word in the built-in (assembler) dictionary because we initialise the LATEST variable to point to it. This means that if you want to extend the assembler - part, you must put new words before SYSEXIT, or else change how LATEST is initialised. + part, you must put new words before SYSCALL3, or else change how LATEST is initialised. */ defcode "CHAR",4,,CHAR @@ -2066,11 +2068,14 @@ interpret_is_lit: push %eax // Push it onto the stack. NEXT - // NB: SYSEXIT must be the last entry in the built-in dictionary. - defcode SYSEXIT,7,,SYSEXIT - pop %ebx - mov $__NR_exit,%eax + defcode "SYSCALL3",8,,SYSCALL3 + pop %eax // System call number (see ) + pop %ebx // First parameter. + pop %ecx // Second parameter + pop %edx // Third parameter int $0x80 + push %eax // Result (negative for -errno) + NEXT /* START OF FORTH CODE ----------------------------------------------------------------------