changeset 123:5681cdada362

Redo keyword table handling to handle keywords differing in length. Some keywords differ only due to length. That is, the shorter keyword matches the leading characters of the longer one. Make the keyword table builder and processor handle these cases. Also re-implement the handler based on evolved understanding of its requirements.
author William Astle <lost@l-w.ca>
date Mon, 01 Jan 2024 15:15:45 -0700
parents 5660ce96a9b7
children 8770e6f977c3
files src/buildkeywordtab.c src/keywordlist.txt src/parse.s
diffstat 3 files changed, 83 insertions(+), 43 deletions(-) [+]
line wrap: on
line diff
--- a/src/buildkeywordtab.c	Mon Jan 01 02:53:44 2024 -0700
+++ b/src/buildkeywordtab.c	Mon Jan 01 15:15:45 2024 -0700
@@ -16,8 +16,13 @@
     struct treenode *firstchild;
 };
 
+/*
+lookaheaddepth will start at 255 and count down which gives an appropriate
+two's complement negative number.
+*/
+
 int treedepth = 0;
-void print_tree(FILE *fp, struct treenode *tn)
+void print_tree(FILE *fp, struct treenode *tn, char *lookahead, int lookaheaddepth)
 {
     struct treenode *tn1;
     int depth = ++treedepth;
@@ -26,9 +31,36 @@
 
     for (tn1 = tn -> firstchild; tn1; tn1 = tn1 -> nextsibling)
     {
-        fprintf(fp, " fcb 0x%02x,%s\n", tn1 -> ccode, tn1 -> toksym ? tn1 -> toksym : "token_eot");
+        // if there are child nodes, insert the sub tree
         if (tn1 -> firstchild)
-            print_tree(fp, tn1);
+        {
+            fprintf(fp, " fcb 0x%02x,token_eot\n", tn1 -> ccode);
+            if (tn1 -> toksym)
+            {
+                print_tree(fp, tn1, tn1 -> toksym, 255);
+            }
+            else
+            {
+                if (lookahead)
+                {
+                    print_tree(fp, tn1, lookahead, lookaheaddepth - 1);
+                }
+                else
+                {
+                    print_tree(fp, tn1, NULL, 0);
+                }
+            }
+        }
+        // if there is also a terminal symbol here
+        if (tn1 -> toksym)
+        {
+            fprintf(fp, " fcb 0x%02x,%s\n", tn1 -> ccode, tn1 -> toksym);
+        }
+    }
+    // handle lookahead failure
+    if (lookahead)
+    {
+        fprintf(fp, " fcb 0x%02x,%s\n", lookaheaddepth, lookahead);
     }
     
     fprintf(fp, "parse_wt%de\n", depth);
@@ -114,9 +146,10 @@
         exit(1);
     }
     fprintf(outfile, "; This file is automatically generated. Edit %s and rebuild to make changes.\n", argv[1]);
+    fprintf(outfile, " *pragmapush list\n *pragma list\n");
     fprintf(outfile, "parse_wordtab\n");    
-    print_tree(outfile, treeroot);
-
+    print_tree(outfile, treeroot, NULL, 0);
+    fprintf(outfile, " *pragmapop list\n");
     fclose(outfile);
     exit(0);
 }
--- a/src/keywordlist.txt	Mon Jan 01 02:53:44 2024 -0700
+++ b/src/keywordlist.txt	Mon Jan 01 15:15:45 2024 -0700
@@ -1,4 +1,6 @@
 AND,token_and
+AS,token_as
+ASC,token_asc
 DATA,token_data
 ELSE,token_else
 END,token_end
--- a/src/parse.s	Mon Jan 01 02:53:44 2024 -0700
+++ b/src/parse.s	Mon Jan 01 15:15:45 2024 -0700
@@ -160,45 +160,48 @@
                 std val0+val.strlen             ; save the length of the identifier
                 ldb #token_ident                ; set token type to identifier (variable name, probably)
                 rts                             ; return token type, do not advance since we already did above
-; Parsing a potential keyword here. This works using a recursive lookup table. Each lookup table starts with a 18 bit
-; size entry for the table. Each entry is then 2 bytes. The first is the character to
-; match for this entry. The second is either token_eot to indicate a sub table needs to be consulted, token_ident to
-; indicate that the token should be parsed as an identifier, or a token type code which indicates the value should
-; be accepted. If a sub table is to be consulted, the table will appear inline with the same format. Should matching
-; fall off the end of a table, the character being considered will be "ungot" and processing will return back up the
-; call chain, ungetting characters, until the top level at which point token_ident will be returned.
+; This routine parses tokens using the table at parse_wordtab. The table is structured as follows:
+;
+; * two bytes which contain the length of the table less the two bytes for this length value
+; * a sequence of entries consisting of a single byte matching character and a token code followed
+;   by an optional sub table, structured exactly the same way.
+;
+; The optional subtable will be present if the token code is token_eot
+;
+; If the character match is negative, it means a lookahead failed. The negative value is the number
+; of characters to unget and the token code is the token value to return. No other entries after this
+;   in a table will be considered since this negative match is a global match.
 ;
-; If the match character is negative, the match character represents the number of characters to "unget" and then
-; return the specified token. This is for handling look-aheads.
-parse_nexttok16 pshs a,x                        ; save input character
-                ldd ,x++                        ; get number of entries in the table
-                addd 1,s                        ; set pointer to end of table
-                std 1,s
-parse_nexttok17 cmpa ,x++                       ; does this entry match?
-                beq parse_nexttok21             ; brif so
-                ldb -2,x                        ; was this a look-ahead non-match?
-                bpl parse_nexttok19             ; brif not
-                leay b,y                        ; back up the input pointer
-                ldb -1,x                        ; get match token
-parse_nexttok18 puls a,x,pc                     ; clean up stack and return the matched token
-parse_nexttok19 ldb -1,x                        ; is there a sub table?
-                cmpb #token_eot
-                bne parse_nexttok20             ; brif not
-                ldd ,x++                        ; move past the sub table
-                leax d,x
-parse_nexttok20 cmpx 1,s                        ; did we reach the end of this table?
-                blo parse_nexttok17             ; brif not
-                ldb #token_ident                ; flag identifier required
-                puls a,x,pc                     ; restore input character, clean up stack, and return
-parse_nexttok21 ldb -1,x                        ; what token did we match?
-                cmpb #token_eot                 ; sub table?
-                bne parse_nexttok18             ; brif not - ding! ding! ding! we have a match
-                leas 3,s                        ; clean up stack
-                bsr parse_nextcharu             ; fetch next input character
-                bne parse_nexttok16             ; process sub table entries if we have input
-                ldb #token_ident                ; indicate we have an ident
-                leay -1,y                       ; unget the end of input
-                rts
+; When a token_eot match is found, if there are no further characters in the input, the match is
+; determined to be invalid and processing continues with the next entry.
+parse_wordtab0  leas 3,s                        ; drop the parent table's saved A/end address before descending
+parse_wordtab   pshs a,x                        ; save input character (0,s) and start of table (1,s)
+                ldd ,x++                        ; get length of this table; X now points at the first entry
+                addd 1,s                        ; table start + length (NOTE(review): entries begin at start+2 - confirm
+                std 1,s                         ; the length word allows for that); saved as the end address at 1,s
+                lda ,s                          ; get back input character (the ldd above overwrote A)
+parse_wordtab1  ldb 1,x                         ; fetch token code for this entry (X is still on the match byte)
+                cmpa ,x++                       ; does this entry match? X moves past the two byte entry
+                bne parse_wordtab4              ; brif not
+                cmpb #token_eot                 ; is it indicating a sub table?
+                bne parse_wordtab6              ; brif not - B holds the matched token code
+                bsr parse_nextcharu             ; fetch next input character (for sub table match)
+                bne parse_wordtab0              ; brif we are going to check the sub table (X points at it)
+parse_wordtab2  ldd ,x++                        ; fetch length of sub table
+                leax d,x                        ; move past sub table
+parse_wordtab3  lda ,s                          ; get back input character
+                cmpx 1,s                        ; are we at the end of the table?
+                blo parse_wordtab1              ; brif not - check another entry
+                comb                            ; set C to indicate no match
+                puls a,x,pc                     ; restore A, clean up stack and return
+parse_wordtab4  lda -2,x                        ; get the match character of the entry just stepped over
+                bmi parse_wordtab5              ; brif negative - lookahead fail
+                cmpb #token_eot                 ; is there a sub table to skip?
+                beq parse_wordtab2              ; brif so - skip sub table
+                bra parse_wordtab3              ; otherwise just move to the next entry
+parse_wordtab5  leay a,y                        ; A is negative: move Y back that many characters
+parse_wordtab6  clra                            ; clear C to indicate a match (A is reloaded by the puls)
+                puls a,x,pc                     ; clean up stack and return with the token code in B
 parse_number    jmp parse_tokerr
 ; Relational token table, bits are > = <
 parse_reltab    fcb token_error
@@ -285,6 +288,8 @@
                 parse_tokdef token_and,parse_noop
                 parse_tokdef token_or,parse_noop
                 parse_tokdef token_go,parse_noop
+                parse_tokdef token_as,parse_noop
+                parse_tokdef token_asc,parse_noop
 parse_rem       rts
 
                 *pragmapop list