changeset 53:bdd4b9f30916

Convert tokenize routine to do a first longest match. Because it may be useful to have a lower numbered token (or one from a table processed earlier) be a prefix match on a token later in the table (or in a subsequent table), doing a longest match and using the first match as a tie breaker seems like a good idea. Notably, this will allow the AS keyword to be a single byte token without conflicting with the ASC keyword.
author William Astle <lost@l-w.ca>
date Wed, 21 Dec 2022 23:02:23 -0700
parents 05c754390b1c
children cc7e60e51c07
files src/lwbasic.s
diffstat 1 files changed, 46 insertions(+), 26 deletions(-) [+]
line wrap: on
line diff
--- a/src/lwbasic.s	Sun Dec 11 21:33:25 2022 -0700
+++ b/src/lwbasic.s	Wed Dec 21 23:02:23 2022 -0700
@@ -223,6 +223,12 @@
 curstmt         rmb 2                           ; start of statement currently being interpreted
 endflag         rmb 1                           ; 00 = END, FF = STOP
 stringstackptr  rmb 2                           ; anonymous string descriptor stack pointer
+tok_skipkw      rmb 1                           ; flag for when skipping an unrecognized keyword
+tok_skipdt      rmb 1                           ; flag for when processing DATA
+tok_kwtype      rmb 1                           ; primary/secondary type flag for tokens
+tok_kwnum       rmb 1                           ; the actual token number
+tok_kwmatchl    rmb 1                           ; the length of the best match during lookup
+tok_kwmatch     rmb 2                           ; the current best matched token number
                 rmb 0x71-*                      ; align RSTFLG/RSTVEC for stock ROM compatibility
 RSTFLG          rmb 1                           ; 0x55 if RSTVEC is valid
 RSTVEC          rmb 2                           ; points to warm start routine (must start with NOP)
@@ -1487,18 +1493,17 @@
 ;
 ; Enter with X pointing to the text to tokenize.
 ; Exit with X pointing to the start of the tokenized line and D holding the length of the tokenized line.
-tokenize        clra                            ; clear "not token" flag
-                clrb                            ; clear the "in data" flag
+tokenize        clr tok_skipkw                  ; clear "not token" flag
+                clr tok_skipdt                  ; clear the "in data" flag
                 ldy #tokebuff                   ; point to destination buffer
-                pshs d,y                        ; set return value, the "not token" flag, and the "in data" flag
+                pshs y                          ; set return value
 tokenize0       lda ,x+                         ; get input character
                 bne tokenize1                   ; brif not end of input
 tokenize0a      sta ,y+                         ; blank out final byte in result
-tokenize0b      leas 2,s                        ; clean up temporaries on stack
-                tfr y,d                         ; get end address to accumulator
+tokenize0b      tfr y,d                         ; get end address to accumulator
                 subd #tokebuff                  ; subtract out start; gives length of result
                 puls x,pc                       ; set return pointer and return
-tokenize1       tst ,s                          ; are we in the middle of a "not token"?
+tokenize1       tst tok_skipkw                  ; are we in the middle of a "not token"?
                 beq tokenize3a                  ; brif not
                 jsr setcifalpha                 ; is it alpha
                 bcs tokenize2                   ; brif so - store it and continue
@@ -1506,7 +1511,7 @@
                 bcc tokenize3                   ; brif not
 tokenize2       sta ,y+                         ; save output character
                 bra tokenize0                   ; check for another
-tokenize3       clr ,s                          ; clear the "not token" flag
+tokenize3       clr tok_skipkw                  ; clear the "not token" flag
 tokenize3a      cmpa #'"                        ; is it a string?
                 bne tokenize5                   ; brif not
                 sta ,y+                         ; save string delimiter
@@ -1518,11 +1523,11 @@
                 bra tokenize0                   ; brif 
 tokenize5       cmpa #':                        ; end of statement?
                 bne tokenize6                   ; brif not
-                clr 1,s                         ; reset "in data" flag
+                clr tok_skipdt                  ; reset "in data" flag
                 bra tokenize2                   ; stash it and continue
 tokenize6       cmpa #0x20                      ; is it a space?
                 beq tokenize2                   ; brif so - stash it unmodified
-                tst 1,s                         ; are we "in data"?
+                tst tok_skipdt                  ; are we "in data"?
                 bne tokenize2                   ; brif so - don't tokenize it
                 cmpa #'?                        ; PRINT shortcut?
                 bne tokenize6a                  ; brif not
@@ -1540,23 +1545,36 @@
                 bcs tokenize2                   ; brif so - pass it through
                 tsta                            ; is the high bit set?
                 bmi tokenize0                   ; ignore it if so
+
+; do longest match dictionary lookup here
                 ldu #primarydict                ; point to keyword table
                 leax -1,x                       ; back up input to start of potential token
-                clrb                            ; initialize the token number
-                clra                            ; initialize secondary table flag
-                pshs d,x                        ; save start of input token and the token counter
+                clr tok_kwtype                  ; set secondary table flag to primary table
+                clr tok_kwmatch                 ; clear the matched token
+                clr tok_kwmatch+1
+                clr tok_kwmatchl                ; set length matched
+                pshs x                          ; save start of input token
+tokenize10x     clr tok_kwnum                   ; clear keyword number
 tokenize10      ldb ,u                          ; are we at the end of the table?
                 bne tokenize11                  ; brif not
                 ldu #secondarydict              ; point to secondary token dictionary
-                clr ,s                          ; reset token counter
-                com 1,s                         ; flip to secondary token flag
-                bne tokenize10                  ; brif we haven't already done the secondaries
-                puls d,x                        ; get back input pointer and clear stack temporaries
-                com ,s                          ; set "not token flag"
+                com tok_kwtype                  ; flip to secondary token flag
+                bne tokenize10x                 ; brif we haven't already done the secondaries
+                puls x                          ; get back input pointer
+                ldb tok_kwmatchl                ; get length of best match
+                beq tokenize10y                 ; brif we don't have a match
+                abx                             ; move input pointer past matched token
+                ldd tok_kwmatch                 ; get matched token number
+                tsta                            ; is it a primary?
+                beq tokenize17                  ; brif so
+                bra tokenize16                  ; go stash two byte token
+tokenize10y     com tok_skipkw                  ; set "not token flag"
                 lda ,x+                         ; get character
                 bra tokenize2                   ; stash it and continue
-tokenize11      ldx 2,s                         ; get back start of input token
-tokenize12      ldb ,x+                         ; get input character
+tokenize11      ldx ,s                          ; restart at the saved start of the candidate token for each dictionary entry
+                clra                            ; match length starts at 0 so A = characters read when a match completes (stored to tok_kwmatchl)
+tokenize12      inca                            ; count the character about to be read
+                ldb ,x+                         ; get input character
                 cmpb #'z                        ; is it above lower case Z?
                 bhi tokenize13                  ; brif so
                 cmpb #'a                        ; is it below lower case A?
@@ -1569,13 +1587,15 @@
                 leau -1,u                       ; back up to current test character
 tokenize14      ldb ,u+                         ; end of token?
                 bpl tokenize14                  ; brif not
-                inc ,s                          ; bump token counter
+tokenize14a     inc tok_kwnum                   ; bump token counter
                 bra tokenize10                  ; go check another one
-tokenize15      orb ,s+                         ; merge token number with the high bit (bit 7 set from above)
-                lda ,s+                         ; get back secondary flag and set flags on it
-                leas 2,s                        ; clean up saved input pointer from stack
-                bpl tokenize17                  ; brif primary token
-                skip2
+tokenize15      cmpa tok_kwmatchl               ; is it a longer match?
+                bls tokenize14a                 ; brif not, ignore it
+                sta tok_kwmatchl                ; save new match length
+                ldd tok_kwtype                  ; get the matched token count
+                orb #0x80                       ; set token flag
+                std tok_kwmatch                 ; save matched token
+                bra tokenize14a                 ; keep looking through the tables
 tokenize18      lda #':                         ; for putting implied colons in
 tokenize16      std ,y++                        ; put output into buffer
                 jmp tokenize0                   ; go handle more input
@@ -1583,7 +1603,7 @@
                 beq tokenize18                  ; brif so - stash it with colon
                 cmpb #tok_data                  ; is it DATA?
                 bne tokenize18a                 ; brif not
-                stb 1,s                         ; set "in data" flag
+                stb tok_skipdt                  ; set "in data" flag
 tokenize20      stb ,y+                         ; stash token
                 jmp tokenize0                   ; go handle more
 tokenize18a     cmpb #tok_rem                   ; is it REM?