Mercurial > hg > index.cgi

--- a/Makefile	Sun Dec 31 17:42:39 2023 -0700
+++ b/Makefile	Sun Dec 31 17:44:39 2023 -0700
@@ -1,7 +1,9 @@
 .PHONY: all
+CFLAGS ?= -Wall
+
 all: bin/lwbasic.rom bin/lwbasic-coco2b.rom bin/lwbasic-coco3.rom bin/coco.zip bin/coco2.zip bin/coco2b.zip bin/coco3.zip

-lwb_srcs := consscr.s defs.s error.s expr.s fps.s genio.s init.s int.s interp.s irq.s keyb.s keywords.s miscdata.s number.s print.s progctrl.s stack.s token.s vars.s
+lwb_srcs := bytecode.s consscr.s defs.s error.s expr.s fps.s genio.s init.s int.s interp.s irq.s keyb.s keywords.s keywordtab.s miscdata.s number.s parse.s print.s progctrl.s stack.s token.s vars.s
 lwb_srcs := $(addprefix src/,$(lwb_srcs))

 bin/lwbasic.rom: src/lwbasic.s $(lwb_srcs)
@@ -11,11 +13,18 @@
 bin/lwbasic-coco3.rom: src/lwbasic.s $(lwb_srcs)
 	lwasm --6809 --tabs=16 --raw --list=src/lwbasic-coco3.list --symbols --output=bin/lwbasic-coco3.rom -DCOCO3=1 src/lwbasic.s

+# Generate the keyword parse table source from the keyword list using the host-side builder.
+src/keywordtab.s: src/keywordlist.txt src/buildkeywordtab
+	./src/buildkeywordtab $< $@
+
+# No recipe here: make's built-in "%: %.c" rule compiles and links the builder with $(CC) and $(CFLAGS).
+src/buildkeywordtab: src/buildkeywordtab.c
+
+# Remove generated files: ROM images, zip bundles, assembler listings, files ending in "~", and the
+# keyword table builder together with the table source it generates.
 .PHONY: clean
 clean:
 	rm -f bin/*.rom bin/*.zip
 	rm -f src/*.list
 	rm -f */*~ *~
+	rm -f src/buildkeywordtab
+	rm -f src/keywordtab.s

 bin/coco2.zip: bin/lwbasic.rom
 	mkdir -p coco2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/buildkeywordtab.c	Sun Dec 31 17:44:39 2023 -0700
@@ -0,0 +1,122 @@
+/*
+Build the keyword parse table for lwbasic
+*/
+
+#define _POSIX_C_SOURCE 200809L // for getline()
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* One node of the keyword trie: a single character position shared by one or more keywords. */
+struct treenode
+{
+    int ccode;                      // the character matched at this node
+    char *toksym;                   // token symbol if a keyword ends at this node, otherwise NULL
+    struct treenode *nextsibling;   // next alternative character at the same position
+    struct treenode *firstchild;    // first alternative for the following character position
+};
+
+/* Running count of tables emitted so far; gives every emitted table a unique label number. */
+int treedepth = 0;
+
+/*
+Write the lookup table for the children of tn to fp, emitting sub tables inline.
+
+A table is a 16 bit size word (the size excludes the size word itself) followed by
+one "character,token" byte pair per child. A child without a token symbol is written
+with token_eot, and a child that has children of its own is followed immediately by
+its own table.
+*/
+void print_tree(FILE *fp, struct treenode *tn)
+{
+    const int tableid = ++treedepth;
+    struct treenode *child = tn -> firstchild;
+
+    fprintf(fp, "parse_wt%d fdb parse_wt%de-parse_wt%d-2\n", tableid, tableid, tableid);
+
+    while (child)
+    {
+        const char *symbol = child -> toksym;
+
+        if (!symbol)
+            symbol = "token_eot";
+        fprintf(fp, " fcb 0x%02x,%s\n", child -> ccode, symbol);
+        if (child -> firstchild)
+            print_tree(fp, child);
+        child = child -> nextsibling;
+    }
+
+    fprintf(fp, "parse_wt%de\n", tableid);
+}
+
+/*
+Read "KEYWORD,token_symbol" lines from the file named by argv[1], build a trie of the
+keywords, and write the matching assembly lookup tables to the file named by argv[2].
+
+Exits with status 0 on success and 1 on a usage, I/O, or memory allocation failure.
+Lines without a comma, and keywords that cannot be added to the trie, are reported on
+stderr and skipped.
+*/
+int main(int argc, char **argv)
+{
+    FILE *infile, *outfile;
+    struct treenode *treeroot;
+    struct treenode *tnp, *tn, *tnprev;
+    char *linebuf = NULL;
+    size_t bufsize = 0;
+    ssize_t rval;
+    char *ptr, *ptr2;
+    int lastchar, writefailed;
+
+    if (argc != 3)
+    {
+        fprintf(stderr, "Usage: %s <source> <output>\n", argv[0]);
+        exit(1);
+    }
+
+    infile = fopen(argv[1], "rb");
+    if (!infile)
+    {
+        perror("Opening input file");
+        exit(1);
+    }
+
+    treeroot = calloc(1, sizeof(struct treenode));
+    if (!treeroot)
+    {
+        perror("Allocating keyword tree");
+        fclose(infile);
+        exit(1);
+    }
+    while (1)
+    {
+        rval = getline(&linebuf, &bufsize, infile);
+        if (rval == -1)
+        {
+            if (feof(infile))
+                break;
+            perror("Reading keyword list line");
+            fclose(infile);
+            free(linebuf);
+            exit(1);
+        }
+        // lose any line terminators; the length check keeps a line consisting only of
+        // terminators (a blank line) from walking off the front of the buffer
+        while (rval > 0 && (linebuf[rval - 1] == '\r' || linebuf[rval - 1] == '\n'))
+            linebuf[--rval] = '\0';
+        ptr = strchr(linebuf, ',');
+        if (!ptr)
+        {
+            fprintf(stderr, "WARNING: malformed input line\n");
+            continue;
+        }
+        *ptr++ = '\0'; // put a NUL break in: linebuf is now the keyword, ptr the token symbol
+        tnp = treeroot;
+        for (ptr2 = linebuf; *ptr2; ptr2++)
+        {
+            lastchar = !*(ptr2 + 1);
+            // look for an existing child of tnp for this character, remembering the last
+            // sibling visited so that a new node can be appended to the end of the list
+            for (tn = tnp -> firstchild, tnprev = NULL; tn; tn = tn -> nextsibling)
+            {
+                if (tn -> ccode == *ptr2)
+                    break;
+                tnprev = tn;
+            }
+            if (!tn)
+            {
+                tn = calloc(1, sizeof(struct treenode));
+                if (!tn)
+                {
+                    perror("Allocating keyword tree node");
+                    exit(1);
+                }
+                tn -> ccode = *ptr2;
+                if (lastchar)
+                {
+                    tn -> toksym = strdup(ptr);
+                    if (!tn -> toksym)
+                    {
+                        perror("Allocating token symbol");
+                        exit(1);
+                    }
+                }
+                if (tnprev)
+                    tnprev -> nextsibling = tn;
+                else
+                    tnp -> firstchild = tn;
+                // interior nodes have no token symbol; never hand NULL to %s
+                fprintf(stderr, "Create entry: %c, %s\n", tn -> ccode, tn -> toksym ? tn -> toksym : "(null)");
+            }
+            else if (lastchar)
+            {
+                // the node already exists, so this keyword either repeats an earlier one or
+                // is a prefix of one; its token symbol is not recorded, so say so
+                fprintf(stderr, "WARNING: keyword %s ignored: it duplicates or is a prefix of an earlier keyword\n", linebuf);
+            }
+            tnp = tn;
+        }
+    }
+    fclose(infile);
+    free(linebuf);
+
+    outfile = fopen(argv[2], "wb");
+    if (!outfile)
+    {
+        perror("Opening output file");
+        exit(1);
+    }
+    fprintf(outfile, "; This file is automatically generated. Edit %s and rebuild to make changes.\n", argv[1]);
+    fprintf(outfile, "parse_wordtab\n");
+    print_tree(outfile, treeroot);
+
+    // buffered output can fail as late as the final flush, so check the stream state and fclose
+    writefailed = ferror(outfile);
+    if (fclose(outfile) != 0)
+        writefailed = 1;
+    if (writefailed)
+    {
+        fprintf(stderr, "Error writing output file %s\n", argv[2]);
+        exit(1);
+    }
+    exit(0);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/bytecode.s	Sun Dec 31 17:44:39 2023 -0700
@@ -0,0 +1,12 @@
+                *pragmapush list
+                *pragma list
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Bytecode definitions - define them using the macro bytecode_opdef
+;
+; bytecode_opdef NAME,VALUE defines the symbol NAME (first macro argument) as the constant VALUE (second argument).
+; bc_eol below is the opcode parse writes to end an encoded line.
+                *pragmapush list
+                *pragma nolist
+bytecode_opdef  macro
+\1              equ \2
+                endm
+                *pragmapop list
+                bytecode_opdef bc_eol,0x00
+                *pragmapop list
--- a/src/defs.s	Sun Dec 31 17:42:39 2023 -0700
+++ b/src/defs.s	Sun Dec 31 17:44:39 2023 -0700
@@ -12,6 +12,7 @@
 keyb_shift      equ 0x01                        ; shift pressed
 linebuffsize    equ 0x100                       ; the line input buffer (256 bytes)
 stringstacknum  equ 20                          ; number of entries on the anonymous string descriptor stack
+stackheadroom   equ 50                          ; required headroom for the stack on OM checks
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Data structure used for calculations. Calculations are handled via structures called value accumulators. A value
 ; accumulator consists of a data type flag (at the end of the structure) and a data area whose layout varies based
--- a/src/error.s	Sun Dec 31 17:42:39 2023 -0700
+++ b/src/error.s	Sun Dec 31 17:44:39 2023 -0700
@@ -63,4 +63,6 @@
                 fcn 'Type mismatch'
                 deferr div0
                 fcn 'Division by zero'
+                deferr om
+                fcn 'Out of memory'
                 *pragmapop list
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/keywordlist.txt	Sun Dec 31 17:44:39 2023 -0700
@@ -0,0 +1,18 @@
+AND,token_and
+DATA,token_data
+ELSE,token_else
+END,token_end
+GO,token_go
+LET,token_let
+LIST,token_list
+NEW,token_new
+NOT,token_not
+OR,token_or
+POP,token_pop
+PRINT,token_print
+REM,token_rem
+RETURN,token_return
+RUN,token_run
+STOP,token_stop
+SUB,token_sub
+TO,token_to
--- a/src/lwbasic.s	Sun Dec 31 17:42:39 2023 -0700
+++ b/src/lwbasic.s	Sun Dec 31 17:44:39 2023 -0700
@@ -52,6 +52,9 @@
                 include int.s
                 include fps.s
                 include token.s
+                include bytecode.s
+                include parse.s
+                include keywordtab.s
                 include miscdata.s
                 include keywords.s
                 *pragmapop list
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/parse.s	Sun Dec 31 17:44:39 2023 -0700
@@ -0,0 +1,290 @@
+                *pragmapush list
+                *pragma list
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; This is the overall parsing package. This is responsible for converting program text into the internal byte code and
+; reporting any syntax errors and anything else reasonably detectable at parse time without having overly complicated
+; code analysis.
+;
+; This is a recursive descent parser.
+;
+; Entry:
+; X             Points to the text to encode
+; B             Nonzero to prevent generating any output (error check/length calculation only)
+;
+; Exit:
+; U             Points to the encoded line
+; D             Length of the encoded line
+; CC.C          clear
+
+; Error Exit:
+; B             Error code
+; U             Pointer to the start of the input token being parsed when the error was detected
+; CC.C          set
+parse           stb parse_noout                 ; save no-output flag
+                leay ,x                         ; save input pointer in a less useful register
+                ldu freestart                   ; point to start of free memory where we will build the output
+                pshs u                          ; save original free memory location
+parse_nextstmt  jsr parse_nexttok               ; fetch the next token, return type in B
+                bcc parse0                      ; brif we succeeded in parsing a token
+; Common error exit. Entered with C set and the code to report in B; the saved free memory pointer must be the only
+; thing on the stack above the return address. None of the instructions below change C.
+parse_error     puls u                          ; restore original free memory location - deallocate any encoding
+                stu freestart                   ; give back anything written to the output so far
+                ldu parse_tokenst               ; get start location we started parsing the token at
+                rts                             ; return error condition
+parse0          ldx #parse_stmtjump             ; point to jump table for token type handler
+                abx                             ; offset to handler address
+                abx                             ; (added twice because each table entry is a two byte address)
+                jsr [,x]                        ; call handler
+                bcs parse_error                 ; brif handler flagged error
+                jsr parse_curtoken              ; get the token we terminated on
+                cmpb #token_eot                 ; end of input?
+                bne parse1                      ; brif not
+                ldb #bc_eol                     ; stash an end of line op
+                bsr parse_write
+                bcs parse_error                 ; brif we errored out writing to the result (OM?)
+                tfr u,d                         ; calculate the length of the result
+                subd ,s                         ; subtract the saved start of the output
+                puls u,pc                       ; get pointer to start of encoded result and return (C is already clear)
+parse1          cmpb #token_stmtsep             ; statement separator?
+                beq parse_nextstmt              ; brif so - do another statement
+                cmpb #token_apos                ; ' token?
+                beq parse0                      ; brif so - parse it as a new statement
+; NOTE(review): the branch above re-dispatches on the same token without fetching a new one, so it relies on the
+; token_apos handler (parse_rem) changing parse_curtok. While parse_rem is a bare rts this path repeats forever.
+                comb                            ; set C for error
+                ldb #err_sn                     ; raise syntax error
+                bra parse_error
+; Append the byte in B to the encoded output at U, or only count it when parse_noout is nonzero.
+; When writing: returns C clear on success, or C set and B = err_om if the output would come within stackheadroom
+; bytes of the stack pointer. Clobbers A, and X on the writing path.
+parse_write     lda parse_noout                 ; are we doing output?
+                beq parse_write0                ; brif so
+                leau 1,u                        ; just count up the output and don't do anything
+                rts                             ; NOTE(review): C is left as the caller had it on this path
+parse_write0    leax -stackheadroom,s           ; calculate bottom of stack with headroom
+                cmpx freestart                  ; did the stack run into the end of the output?
+                bhs parse_write1                ; brif not - we're good
+                ldb #err_om                     ; raise out of memory error, C already set from comparison
+                rts
+parse_write1    stb ,u+                         ; save output byte
+                stu freestart                   ; save new top of used memory
+; parse_noop doubles as the do-nothing handler in parse_stmtjump; entered that way it leaves all flags untouched.
+parse_noop      rts                             ; return all clear - C clear from comparison above
+; Return the most recently recorded token type (parse_curtok) in B.
+parse_curtoken  ldb parse_curtok                ; fetch token code of current token
+                rts
+; Report an unexpected token: returns with C set and B = err_sn. Also the parse_stmtjump handler for token_error.
+parse_tokerr    comb                            ; flag error - unexpected token
+                ldb #err_sn                     ; raise syntax error
+                rts
+; parse_nextchar: move Y to the next input character, unless Y is already at the NUL that ends the input.
+; parse_curchar: fetch the character at Y.
+; Both return the character in A, with Z set if it is the end of input NUL.
+parse_nextchar  lda ,y                          ; at end of input already?
+                beq parse_curchar               ; brif so
+                leay 1,y                        ; move to next input character
+parse_curchar   lda ,y                          ; fetch input character
+                rts
+; Fetch the next token from the input at Y, skipping leading spaces.
+;
+; Exit (token found): C clear, B = token type, which is also stored in parse_curtok. Y is left just past the token,
+; except at end of input (token_eot) where Y stays on the terminating NUL so the input is never read beyond it.
+; Exit (unrecognized character): C set, B = token_error.
+; Numbers are handed to parse_number and alphabetic words to parse_nexttok12, which produce their own results.
+; parse_tokenst is set to the start of the token once leading spaces have been skipped (not for token_eot).
+parse_nexttok   bsr parse_curchar               ; fetch current input
+                beq parse_nexttok1              ; brif end of input
+parse_nexttok0  cmpa #0x20                      ; space?
+                bne parse_nexttok2              ; brif not
+                bsr parse_nextchar              ; eat the space
+                bne parse_nexttok0              ; brif not end of input
+parse_nexttok1  ldb #token_eot                  ; flag end of input
+                bra parse_nexttok6a             ; go return it without advancing - Y must stay on the NUL
+parse_nexttok2  sty parse_tokenst               ; save start of current token after skipping spaces
+                cmpa #'.                        ; leading decimal?
+                beq parse_nexttok3              ; brif so - parse number
+                cmpa #'0                        ; is it a digit
+                blo parse_nexttok4              ; brif not
+                cmpa #'9                        ; is it still a digit?
+                bhi parse_nexttok4              ; brif not
+parse_nexttok3  jmp parse_number                ; go parse a number
+parse_nexttok4  ldx #parse_chartab              ; point to list of single character tokens to recognize
+parse_nexttok5  ldb 1,x                         ; get token value
+                cmpa ,x++                       ; character match (and move to next entry)
+                bne parse_nexttok7              ; brif not
+parse_nexttok6  leay 1,y                        ; eat the input character
+parse_nexttok6a stb parse_curtok                ; save token type
+                clra                            ; clear C to indicate no error (this also sets Z)
+                rts
+parse_nexttok7  cmpb #token_eot                 ; end of table?
+                bne parse_nexttok5              ; brif not
+                clrb                            ; initialize relational flags
+                pshs d                          ; save input character (at ,s) and relational flags (at 1,s) for later
+parse_nexttok8  cmpa #'<                        ; less than?
+                blo parse_nexttok9              ; brif not <, =, or >
+                cmpa #'>                        ; still <, =, or >?
+                bhi parse_nexttok9              ; brif not
+                suba #'<                        ; adjust < to 0
+                cmpa #1                         ; set C if <, clear if = or >
+                rola                            ; now 4 if >, 2 if =, or 1 if <
+                ora 1,s                         ; merge with previous relational characters
+                cmpa 1,s                        ; if the flags did not change, we have a dupe
+                beq parse_nexttok9              ; brif it's a dupe - we won't recognize more in the token
+                sta 1,s                         ; save new relational flags
+                bsr parse_nextchar              ; fetch next input
+                sta ,s                          ; save input character
+                bne parse_nexttok8              ; brif there was one - go handle it
+parse_nexttok9  puls d                          ; get back input character and relational flag
+                tstb                            ; was it a relational operator?
+                beq parse_nexttok10             ; brif not
+                ldx #parse_reltab               ; point to relational operator token table
+                ldb b,x                         ; get the token code
+                bra parse_nexttok6a             ; save it and return - but don't advance - we already did looking for multiples
+parse_nexttok10 bsr parse_toupper               ; convert to upper case
+                cmpa #'A                        ; is it alpha?
+                blo parse_nexttok11             ; brif not
+                cmpa #'Z                        ; is it still alpha?
+                bls parse_nexttok12             ; brif so
+; NOTE(review): B is a token type here rather than an err_ code, yet parse hands B back to its caller as the error
+; code when this routine returns with C set - confirm which is intended.
+parse_nexttok11 comb                            ; flag error - unrecognized token
+                ldb #token_error
+                rts
+; parse_nextcharu: advance to the next input character with parse_nextchar and force it to upper case.
+; parse_toupper: convert the character in A to upper case if it is a lower case letter; anything else is unchanged.
+; Both return the character in A.
+parse_nextcharu bsr parse_nextchar              ; fetch next input character
+                beq parse_toupper0              ; brif end of input
+parse_toupper   cmpa #'a                        ; is it lower case alpha?
+                blo parse_toupper0              ; brif not
+                cmpa #'z                        ; is it still lower case alpha?
+                bhi parse_toupper0              ; brif not
+                suba #0x20                      ; adjust to upper case alpha
+parse_toupper0  rts                             ; Z only set here if input was zero entering from parse_nextcharu
+; We parse alpha keywords and identifiers here, of the form [a-zA-Z][a-zA-Z0-9]* with possible nonalpha characters
+; in actual keywords. We use a table to parse keywords. As soon as we find a character that doesn't match a keyword
+; table entry, we fall back to looking for the end of an identifier and then returning that.
+;
+; Entry: A = first character of the word, already forced to upper case; Y points to it.
+; Exit for an identifier: B = token_ident, also stored in parse_curtok so that parse_curtoken reports the identifier
+; rather than the previous token; Y points just past the identifier; its length is in val0+val.strlen; C is clear
+; from the length subtraction. Keyword matches leave through parse_nexttok6 instead.
+parse_nexttok12 ldx #parse_wordtab              ; point to keyword table
+                bsr parse_nexttok16             ; process this table entry
+                cmpb #token_ident               ; did we match a token?
+                bne parse_nexttok6              ; brif so - go return it
+parse_nexttok13 cmpa #'0                        ; was it alphanumeric?
+                blo parse_nexttok15             ; brif not
+                cmpa #'9                        ; was it numeric?
+                bls parse_nexttok14             ; brif so
+                cmpa #'A                        ; was it alpha?
+                blo parse_nexttok15             ; brif not
+                cmpa #'Z                        ; is it still alpha?
+                bhi parse_nexttok15             ; brif not
+parse_nexttok14 bsr parse_nextcharu             ; fetch next character and force upper case
+                bne parse_nexttok13             ; if not end of input, see if we have alphanumeric
+parse_nexttok15 tfr y,d                         ; fetch input location
+                subd parse_tokenst              ; calculate length of token
+                std val0+val.strlen             ; save the length of the identifier
+                ldb #token_ident                ; set token type to identifier (variable name, probably)
+                stb parse_curtok                ; save token type
+                rts                             ; return token type, do not advance since we already did above
+; Parsing a potential keyword here. This works using a recursive lookup table. Each lookup table starts with a 16 bit
+; size entry for the table, counting the bytes that follow the size entry. Each entry is then 2 bytes. The first is
+; the character to match for this entry. The second is either token_eot to indicate a sub table needs to be consulted,
+; token_ident to indicate that the token should be parsed as an identifier, or a token type code which indicates the
+; value should be accepted. If a sub table is to be consulted, the table will appear inline with the same format.
+; Should matching fall off the end of a table, or the input end where a sub table would be consulted, token_ident is
+; returned with the input pointer left where matching stopped so the caller can carry on scanning an identifier.
+;
+; If the match character is negative, the match character represents the number of characters to "unget" and then
+; return the specified token. This is for handling look-aheads.
+;
+; Entry: A = current input character (upper case), X = start of the table (its size entry), Y = input pointer.
+; Exit: B = matched token type, or token_ident if no keyword matched; A = current input character (zero at end of
+; input). X is not preserved.
+;
+; While a table is being searched the stack holds the input character at ,s and the end of table pointer at 1,s.
+parse_nexttok16 pshs a,x                        ; save input character and start of table
+                ldd ,x++                        ; get size of the table body and move to the first entry
+                addd 1,s                        ; add the start of the table
+                addd #2                         ; and the size entry itself, which the stored size does not count
+                std 1,s                         ; set pointer to end of table
+parse_nexttok17 lda ,s                          ; get input character back - fetching table sizes clobbers A
+                cmpa ,x++                       ; does this entry match?
+                beq parse_nexttok21             ; brif so
+                ldb -2,x                        ; was this a look-ahead non-match?
+                bpl parse_nexttok19             ; brif not
+                leay b,y                        ; back up the input pointer
+                ldb -1,x                        ; get match token
+parse_nexttok18 puls a,x,pc                     ; clean up stack and return the matched token
+parse_nexttok19 ldb -1,x                        ; is there a sub table?
+                cmpb #token_eot
+                bne parse_nexttok20             ; brif not
+                ldd ,x++                        ; move past the sub table
+                leax d,x
+parse_nexttok20 cmpx 1,s                        ; did we reach the end of this table?
+                blo parse_nexttok17             ; brif not
+                ldb #token_ident                ; flag identifier required
+                puls a,x,pc                     ; restore input character, clean up stack, and return
+parse_nexttok21 ldb -1,x                        ; what token did we match?
+                cmpb #token_eot                 ; sub table?
+                bne parse_nexttok18             ; brif not - ding! ding! ding! we have a match
+                leas 3,s                        ; clean up stack
+                bsr parse_nextcharu             ; fetch next input character
+                bne parse_nexttok16             ; process sub table entries if we have input
+                ldb #token_ident                ; indicate we have an ident
+                rts                             ; Y stays on the end of input so the identifier length comes out right
+parse_number    jmp parse_tokerr                ; placeholder: every number is reported as a syntax error for now
+; Relational token table, bits are > = <
+; Indexed by the flags parse_nexttok builds: 4 for >, 2 for =, 1 for <. Index 0 is never looked up because
+; parse_nexttok only consults this table when the flags are nonzero.
+parse_reltab    fcb token_error                 ; 000 - no relational characters
+                fcb token_lt                    ; 001 - <
+                fcb token_eq                    ; 010 - =
+                fcb token_le                    ; 011 - < and =
+                fcb token_gt                    ; 100 - >
+                fcb token_ne                    ; 101 - > and <
+                fcb token_ge                    ; 110 - > and =
+                fcb token_reltrue               ; 111 - all three
+; Single character token lookup table
+; Each entry is the character followed by its token type. parse_nexttok stops scanning at the first non-matching
+; entry whose token type is token_eot, so no real entry may use token_eot.
+parse_chartab   fcb 0x21,token_bang             ; !
+                fcb 0x23,token_hash             ; #
+                fcb 0x24,token_dollar           ; $
+                fcb 0x25,token_percent          ; %
+                fcb 0x26,token_amp              ; &
+                fcb 0x27,token_apos             ; '
+                fcb 0x28,token_oparen           ; (
+                fcb 0x29,token_cparen           ; )
+                fcb 0x2a,token_star             ; *
+                fcb 0x2b,token_plus             ; +
+                fcb 0x2c,token_comma            ; ,
+                fcb 0x2d,token_minus            ; -
+                fcb 0x2f,token_slash            ; /
+                fcb 0x3a,token_stmtsep          ; :
+                fcb 0x3b,token_semi             ; ;
+                fcb 0x3f,token_print            ; ? - print shortcut
+                fcb 0x40,token_at               ; @
+                fcb 0x5e,token_exp              ; ^ - exponentiation
+                fcb 0x00,token_eot              ; end of table flag
+; Parse tokens - define them in order using the macro parse_tokdef
+;
+; parse_tokdef NAME,HANDLER defines NAME as the next sequential token number (counting from 0) and emits the
+; address of HANDLER as a two byte table entry, so a token's number is also the index of its handler in the table
+; built from consecutive invocations.
+                *pragmapush list
+                *pragma nolist
+parse_toknum    set 0
+parse_tokdef    macro noexpand
+\1              equ parse_toknum
+parse_toknum    set parse_toknum+1
+                fdb \2
+                endm
+                *pragmapop list
+; Statement handler jump table, indexed by token number (parse0 adds the token number to the table address twice
+; because each entry is two bytes). The order of these lines fixes the token numbers.
+parse_stmtjump  parse_tokdef token_error,parse_tokerr
+                parse_tokdef token_eot,parse_noop
+                parse_tokdef token_lt,parse_noop
+                parse_tokdef token_le,parse_noop
+                parse_tokdef token_gt,parse_noop
+                parse_tokdef token_ge,parse_noop
+                parse_tokdef token_eq,parse_noop
+                parse_tokdef token_ne,parse_noop
+                parse_tokdef token_reltrue,parse_noop // always true relational operator
+                parse_tokdef token_stmtsep,parse_noop
+                parse_tokdef token_apos,parse_rem
+                parse_tokdef token_special,parse_noop
+                parse_tokdef token_bang,parse_noop
+                parse_tokdef token_hash,parse_noop
+                parse_tokdef token_dollar,parse_noop
+                parse_tokdef token_percent,parse_noop
+                parse_tokdef token_amp,parse_noop
+                parse_tokdef token_oparen,parse_noop
+                parse_tokdef token_cparen,parse_noop
+                parse_tokdef token_star,parse_noop
+                parse_tokdef token_plus,parse_noop
+                parse_tokdef token_comma,parse_noop
+                parse_tokdef token_minus,parse_noop
+                parse_tokdef token_slash,parse_noop
+                parse_tokdef token_semi,parse_noop
+                parse_tokdef token_at,parse_noop
+                parse_tokdef token_exp,parse_noop
+                parse_tokdef token_ident,parse_noop
+                parse_tokdef token_rem,parse_noop
+                parse_tokdef token_return,parse_noop
+                parse_tokdef token_run,parse_noop
+                parse_tokdef token_data,parse_noop
+                parse_tokdef token_else,parse_noop
+                parse_tokdef token_end,parse_noop
+                parse_tokdef token_stop,parse_noop
+                parse_tokdef token_sub,parse_noop
+                parse_tokdef token_let,parse_noop
+                parse_tokdef token_list,parse_noop
+                parse_tokdef token_new,parse_noop
+                parse_tokdef token_not,parse_noop
+                parse_tokdef token_print,parse_noop
+                parse_tokdef token_pop,parse_noop
+                parse_tokdef token_to,parse_noop
+                parse_tokdef token_and,parse_noop
+                parse_tokdef token_or,parse_noop
+                parse_tokdef token_go,parse_noop
+; Handler for token_apos; it currently returns immediately without consuming any input.
+parse_rem       rts
+
+                *pragmapop list
--- a/src/vars.s	Sun Dec 31 17:42:39 2023 -0700
+++ b/src/vars.s	Sun Dec 31 17:44:39 2023 -0700
@@ -39,6 +39,9 @@
 tok_kwnum       rmb 1                           ; the actual token number
 tok_kwmatchl    rmb 1                           ; the length of the best match during lookup
 tok_kwmatch     rmb 2                           ; the current best matched token number
+parse_noout     rmb 1                           ; flag for whether we're outputting encoded lines when parsing
+parse_tokenst   rmb 2                           ; pointer into input buffer of start of currently parsed token
+parse_curtok    rmb 1                           ; current token type code
 ; General value accumulators used during expression evaluation. These are in the same format used for storing
 ; values in variables with the exception of having a type flag.
 val0            rmb val.size                    ; value accumulator 0 - current expression value