changeset 124:8770e6f977c3

Rework parser to use parse_wordtab for symbols too. There's no reason not to use the parse_wordtab table mechanism to match the symbols with their token codes. It takes less space than the combined code and tables needed to do it separately.
author William Astle <lost@l-w.ca>
date Mon, 01 Jan 2024 15:57:59 -0700
parents 5681cdada362
children 0607e4e20702
files src/buildkeywordtab.c src/keywordlist.txt src/parse.s
diffstat 3 files changed, 72 insertions(+), 107 deletions(-) [+]
line wrap: on
line diff
--- a/src/buildkeywordtab.c	Mon Jan 01 15:15:45 2024 -0700
+++ b/src/buildkeywordtab.c	Mon Jan 01 15:57:59 2024 -0700
@@ -106,7 +106,7 @@
         // lose any line terminators
         while (*ptr == '\r' || *ptr == '\n')
             *ptr-- = '\0';
-        ptr = strchr(linebuf, ',');
+        ptr = strchr(linebuf, '\t');
         if (!ptr)
         {
             fprintf(stderr, "WARNING: malformed input line\n");
@@ -132,7 +132,6 @@
                     tnprev -> nextsibling = tn;
                 else
                     tnp -> firstchild = tn;
-                fprintf(stderr, "Create entry: %c, %s\n", tn -> ccode, tn -> toksym);
             }
             tnp = tn;
         }
@@ -147,7 +146,7 @@
     }
     fprintf(outfile, "; This file is automatically generated. Edit %s and rebuild to make changes.\n", argv[1]);
     fprintf(outfile, " *pragmapush list\n *pragma list\n");
-    fprintf(outfile, "parse_wordtab\n");    
+    fprintf(outfile, "parse_wt\n");    
     print_tree(outfile, treeroot, NULL, 0);
     fprintf(outfile, " *pragmapop list\n");
     fclose(outfile);
--- a/src/keywordlist.txt	Mon Jan 01 15:15:45 2024 -0700
+++ b/src/keywordlist.txt	Mon Jan 01 15:57:59 2024 -0700
@@ -1,20 +1,47 @@
-AND,token_and
-AS,token_as
-ASC,token_asc
-DATA,token_data
-ELSE,token_else
-END,token_end
-GO,token_go
-LET,token_let
-LIST,token_list
-NEW,token_new
-NOT,token_not
-OR,token_or
-POP,token_pop
-PRINT,token_print
-REM,token_rem
-RETURN,token_return
-RUN,token_run
-STOP,token_stop
-SUB,token_sub
-TO,token_to
+!	token_bang
+#	token_hash
+$	token_dollar
+%	token_percent
+&	token_amp
+'	token_apos
+(	token_oparen
+)	token_cparen
+*	token_star
++	token_plus
+,	token_comma
+-	token_minus
+/	token_slash
+:	token_stmtsep
+;	token_semi
+?	token_print
+@	token_at
+^	token_exp
+<	token_lt
+<=	token_le
+=<	token_le
+>	token_gt
+>=	token_ge
+=>	token_ge
+<>	token_ne
+><	token_ne
+=	token_eq
+AND	token_and
+AS	token_as
+ASC	token_asc
+DATA	token_data
+ELSE	token_else
+END	token_end
+GO	token_go
+LET	token_let
+LIST	token_list
+NEW	token_new
+NOT	token_not
+OR	token_or
+POP	token_pop
+PRINT	token_print
+REM	token_rem
+RETURN	token_return
+RUN	token_run
+STOP	token_stop
+SUB	token_sub
+TO	token_to
--- a/src/parse.s	Mon Jan 01 15:15:45 2024 -0700
+++ b/src/parse.s	Mon Jan 01 15:57:59 2024 -0700
@@ -82,54 +82,44 @@
 parse_nexttok1  ldb #token_eot                  ; flag end of input
                 bra parse_nexttok6              ; go return it
 parse_nexttok2  sty parse_tokenst               ; save start of current token after skipping spaces
+                bsr parse_toupper               ; make sure we have upper case letters for matching
+                ldx #parse_wt                   ; point to keyword parsing table
+                bsr parse_wordtab               ; go see if we have a match in the keyword table
+                bcc parse_nexttok6              ; brif we do - return it
+                ldy parse_tokenst               ; return to the start of the token - pointer probably clobbered
+                bsr parse_curchar               ; get back input character (may have been clobbered)
                 cmpa #'.                        ; leading decimal?
                 beq parse_nexttok3              ; brif so - parse number
                 cmpa #'0                        ; is it a digit
-                blo parse_nexttok4              ; brif not
+                blo parse_nexttok10             ; brif not
                 cmpa #'9                        ; is it still a digit?
-                bhi parse_nexttok4              ; brif not
+                bhi parse_nexttok10             ; brif not
 parse_nexttok3  jmp parse_number                ; go parse a number
-parse_nexttok4  ldx #parse_chartab              ; point to list of single character tokens to recognize
-parse_nexttok5  ldb 1,x                         ; get token value
-                cmpa ,x++                       ; character match (and move to next entry)
-                bne parse_nexttok7              ; brif not
 parse_nexttok6  stb parse_curtok                ; save token type
                 leay 1,y                        ; eat the input character
                 clra                            ; clear C to indicate no error (and clear Z also)
                 rts
-parse_nexttok7  cmpb #token_eot                 ; end of table?
-                bne parse_nexttok5              ; brif not
-                clrb                            ; initialize relational flags
-                pshs d                          ; save input character and relational flags for later
-parse_nexttok8  cmpa #'<                        ; less than?
-                blo parse_nexttok9              ; brif not <, =, or >
-                cmpa #'>                        ; still <, =, or >?
-                bhi parse_nexttok9              ; brif not
-                suba #'<                        ; adjust < to 0
-                cmpa #1                         ; set C if <, clear if = or >
-                rola                            ; now 4 if >, 2 if =, or 1 if <
-                eora 1,s                        ; merge with previous relational characters
-                cmpa 1,s                        ; if it doesn't match, we have a dupe
-                bne parse_nexttok9              ; brif it's not valid - we won't recognize more in the token
-                sta 1,s                         ; save new relational flags
-                bsr parse_nextchar              ; fetch next input
-                sta ,s                          ; save input character
-                bne parse_nexttok8              ; brif there was one - go handle it
-parse_nexttok9  puls d                          ; get back input character and relational flag
-                tstb                            ; was it a relational operator?
-                beq parse_nexttok10             ; brif not
-                ldx #parse_reltab               ; point to relational operator token table
-                ldb b,x                         ; get the token code
-                clra                            ; flag no error
-                rts                             ; return - but don't advance - we already did looking for multiples
-parse_nexttok10 bsr parse_toupper               ; convert to upper case
-                cmpa #'A                        ; is it alpha?
+parse_nexttok10 cmpa #'A                        ; is it alpha?
                 blo parse_nexttok11             ; brif not
                 cmpa #'Z                        ; is it still alpha?
                 bls parse_nexttok12             ; brif so
 parse_nexttok11 comb                            ; flag error - unrecognized token
                 ldb #token_error
                 rts
+parse_nexttok12 bsr parse_nextcharu             ; fetch next input character
+                cmpa #'0                        ; is it alphanumeric?
+                blo parse_nexttok13             ; brif not
+                cmpa #'9                        ; is it numeric?
+                bls parse_nexttok12             ; brif so - keep skipping it
+                cmpa #'A                        ; is it alpha?
+                blo parse_nexttok13             ; brif not
+                cmpa #'Z                        ; is it still alpha?
+                bls parse_nexttok12             ; brif so - keep skipping it
+parse_nexttok13 tfr y,d                         ; calculate length of identifier
+                subd parse_tokenst
+                std val0+val.strlen             ; save it for reference
+                ldb #token_ident                ; indicate an identifier (variable name, etc.)
+                rts                             ; return result (C will be clear from SUBD above)
 parse_nextcharu bsr parse_nextchar              ; fetch next input character
                 beq parse_toupper0              ; brif end of input
 parse_toupper   cmpa #'a                        ; is it lower case alpha?
@@ -138,28 +128,6 @@
                 bhi parse_toupper0              ; brif not
                 suba #0x20                      ; adjust to upper case alpha
 parse_toupper0  rts                             ; Z only set here if input was zero entering from parse_nextcharu
-; We parse alpha keywords and identifiers here, of the form [a-zA-Z][a-zA-Z0-9]* with a possible nonalpha characters
-; in actual keywords. We use a table to parse keywords. As soon as we find a character that doesn't match a keyword
-; table entry, we fall back to looking for the end of an identifier and then returning that.
-parse_nexttok12 ldx #parse_wordtab              ; point to keyword table
-                bsr parse_nexttok16             ; process this table entry
-                cmpb #token_ident               ; did we match a token?
-                bne parse_nexttok6              ; brif so - go return it
-parse_nexttok13 cmpa #'0                        ; was it alphanumeric?
-                blo parse_nexttok15             ; brif not
-                cmpa #'9                        ; was it numeric?
-                bls parse_nexttok14             ; brif so
-                cmpa #'A                        ; was it alpha?
-                blo parse_nexttok15             ; brif not
-                cmpa #'Z                        ; is it still alpha?
-                bhi parse_nexttok15             ; brif not
-parse_nexttok14 bsr parse_nextcharu             ; fetch next character and force upper case
-                bne parse_nexttok13             ; if not end of input, see if we have alphanumeric
-parse_nexttok15 tfr y,d                         ; fetch input location
-                subd parse_tokenst              ; calculate length of token
-                std val0+val.strlen             ; save the length of the identifier
-                ldb #token_ident                ; set token type to identifier (variable name, probably)
-                rts                             ; return token type, do not advance since we already did above
 ; This routine parses tokens using the table at parse_wordtab. The table is structured as follows:
 ;
 ; * two bytes which contain the length of the table less the two bytes for this length value
@@ -203,35 +171,6 @@
 parse_wordtab6  clra                            ; clear C to indicate a match
                 puls a,x,pc                     ; clean up stack and return
 parse_number    jmp parse_tokerr
-; Relational token table, bits are > = <
-parse_reltab    fcb token_error
-                fcb token_lt
-                fcb token_eq
-                fcb token_le
-                fcb token_gt
-                fcb token_ne
-                fcb token_ge
-                fcb token_reltrue
-; Single character token lookup table
-parse_chartab   fcb 0x21,token_bang             ; !
-                fcb 0x23,token_hash             ; #
-                fcb 0x24,token_dollar           ; $
-                fcb 0x25,token_percent          ; %
-                fcb 0x26,token_amp              ; &
-                fcb 0x27,token_apos             ; '
-                fcb 0x28,token_oparen           ; (
-                fcb 0x29,token_cparen           ; )
-                fcb 0x2a,token_star             ; *
-                fcb 0x2b,token_plus             ; +
-                fcb 0x2c,token_comma            ; ,
-                fcb 0x2d,token_minus            ; -
-                fcb 0x2f,token_slash            ; /
-                fcb 0x3a,token_stmtsep          ; :
-                fcb 0x3b,token_semi             ; ;
-                fcb 0x3f,token_print            ; ? - print shortcut
-                fcb 0x40,token_at               ; @
-                fcb 0x5e,token_exp              ; ^ - exponentiation
-                fcb 0x00,token_eot              ; end of table flag
 ; Parse tokens - define them in order using the macro parse_tokdef
                 *pragmapush list
                 *pragma nolist