view src/parse.s @ 126:ac183a519439

Update parsing scheme with a keyword lookup by token value and other framework. Add ability to turn a token code into a keyword string. Also correct some details related to token table generation with some additional adjustments for token symbols. Also rework token symbol definitions and creation of some parsing tables as well as the main statement parsing loop.
author William Astle <lost@l-w.ca>
date Mon, 08 Jan 2024 22:58:08 -0700
parents 0607e4e20702
children 527212870064
line wrap: on
line source

                *pragmapush list
                *pragma list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This is the overall parsing package. This is responsible for converting program text into the internal byte code and
; reporting any syntax errors and anything else reasonably detectable at parse time without having overly complicated
; code analysis.
;
; This is a recursive descent parser.
;
; Entry:
; X             Points to the text to encode
; B             Nonzero to prevent generating any output (error check/length calculation only)
;
; Exit:
; U             Points to the encoded line
; D             Length of the encoded line
; CC.C          clear
;
; Error Exit:
; B             Error code (err_sn for a syntax error raised here, otherwise whatever the failing routine left in B)
; U             Address at which the offending token started (the value of parse_tokenst)
; CC.C          set
;
; Y is used as the input pointer for the whole parse and X is used as scratch. On the error exit freestart is put
; back to the value it had on entry; on the success exit it is left wherever parse_write last set it.
parse           stb parse_noout                 ; save no-output flag
                leay ,x                         ; save input pointer in a less useful register
                ldu freestart                   ; point to start of free memory where we will build the output
                pshs u                          ; save original free memory location
parse_nextstmt  jsr parse_nexttok               ; fetch the next token, token code returned in B
                bcc parse0                      ; brif we succeeded in parsing a token
parse_error     puls u                          ; restore original free memory location - deallocate any encoding
                stu freestart                   ; (PULS, STU and LDU all leave C alone so it is still set at the RTS)
                ldu parse_tokenst               ; get start location we started parsing the token at
                rts                             ; return error condition
parse0          ldx #parsetab_cmd               ; point to jump table for token type handler
parse1          cmpb ,x                         ; did we match a valid command token?
                beq parse3                      ; brif so
                leax 3,x                        ; move to next entry (one token byte plus a two byte handler address)
                cmpx #parsetab_cmde             ; end of table?
                blo parse1                      ; brif not
parse2          comb                            ; set C: it is clear after the table scan above and indeterminate
                                                ; after the compare in parse4, so it must be raised explicitly
                ldb #err_sn                     ; flag syntax error (LDB does not disturb C)
                bra parse_error                 ; and return the error
parse3          jsr [1,x]                       ; call the handler
                bcs parse_error                 ; brif the handler indicated error
                jsr parse_curtoken              ; get the token we terminated on
                cmpb #token_eot                 ; end of input?
                bne parse4                      ; brif not
                ldb #bc_eol                     ; stash an end of line op
                bsr parse_write
                bcs parse_error                 ; brif we errored out writing to the result (OM?)
                tfr u,d                         ; calculate the length of the result
                subd ,s                         ; (end of output minus start of output - no borrow, so C is clear)
                puls u,pc                       ; get pointer to start of encoded result and return (C is already clear)
parse4          cmpb #token_stmtsep             ; statement separator?
                beq parse_nextstmt              ; brif so - do another statement
                cmpb #token_remabbr             ; ' token?
                beq parse0                      ; brif so - parse it as a new statement
                bra parse2                      ; raise a syntax error
; Append the byte in B to the encoded output at U.
;
; When parse_noout is nonzero nothing is stored; U is simply advanced so the caller can still measure the length.
; Otherwise the byte is stored only if the stack, less stackheadroom bytes, has not dropped below freestart.
;
; Exit (success): U advanced by one, freestart updated when a byte was actually stored, C clear.
; Exit (failure): B = err_om, C set, nothing stored.
; Clobbers A; X is clobbered when output is enabled.
;
; parse_noop and list_noop share the final instructions: they do nothing and return with C clear.
parse_write     lda parse_noout                 ; are we doing output?
                beq parse_write0                ; brif so
                leau 1,u                        ; just count up the output and don't do anything
                bra parse_noop                  ; return success - C is cleared explicitly on the way out
parse_write0    leax -stackheadroom,s           ; calculate bottom of stack with headroom
                cmpx freestart                  ; did the stack run into the end of the output?
                bhs parse_write1                ; brif not - we're good
                ldb #err_om                     ; raise out of memory error, C already set from comparison
                rts
parse_write1    stb ,u+                         ; save output byte
                stu freestart                   ; save new top of used memory
list_noop
parse_noop      andcc #0xfe                     ; force C clear so success never depends on the caller's flags
                rts                             ; return all clear
; Return in B the token code most recently stored in parse_curtok. No other register is modified.
parse_curtoken  ldb parse_curtok                ; fetch token code of current token
                rts
; Unexpected token: report a syntax error to the caller.
; Exit: B = err_sn, C set.
parse_tokerr    ldb #err_sn                     ; syntax error code goes back in B
                orcc #0x01                      ; raise C to mark the error return
                rts
; parse_nextchar: step the input pointer (Y) forward one character unless it already rests on the terminating NUL,
; then fall into parse_curchar.
; parse_curchar: return the character under the input pointer in A, with Z set when that character is NUL.
parse_nextchar  tst ,y                          ; already resting on the NUL terminator?
                beq parse_curchar               ; yes - never step beyond it
                leay 1,y                        ; advance to the following character
parse_curchar   lda ,y                          ; A = character under the input pointer
                rts
; Fetch the next token from the input at Y.
;
; Leading spaces are skipped, then the keyword table at parse_wt is tried, then numbers (via parse_number), then
; identifiers (a letter followed by letters and digits). Letters are folded to upper case for matching; the input
; text itself is not modified.
;
; Exit (success): C clear, B = token code, parse_curtok = B, parse_tokenst = address of the first character of the
;                 token, Y = first character after the token. For token_eot Y stays on the NUL terminator so that
;                 further calls keep returning token_eot. For token_ident the length is stored at val0+val.strlen.
; Exit (failure): C set, B = token_error (or whatever parse_number returns).
; Clobbers A and X.
parse_nexttok   bsr parse_curchar               ; fetch current input
                beq parse_nexttok1              ; brif end of input
parse_nexttok0  cmpa #0x20                      ; space?
                bne parse_nexttok2              ; brif not
                bsr parse_nextchar              ; eat the space
                bne parse_nexttok0              ; brif not end of input
parse_nexttok1  sty parse_tokenst               ; an error at end of input gets reported at the terminator
                ldb #token_eot                  ; flag end of input
                stb parse_curtok                ; save token type
                clra                            ; clear C to indicate no error (CLRA also sets Z)
                rts                             ; return without consuming the NUL
parse_nexttok2  sty parse_tokenst               ; save start of current token after skipping spaces
                bsr parse_toupper               ; make sure we have upper case letters for matching
                ldx #parse_wt                   ; point to keyword parsing table
                bsr parse_wordtab               ; go see if we have a match in the keyword table
                bcc parse_nexttok6              ; brif we do - return it
                ldy parse_tokenst               ; return to the start of the token - pointer probably clobbered
                bsr parse_curchar               ; get back input character (may have been clobbered)
                bsr parse_toupper               ; fold case again so lower case identifiers are accepted like keywords
                cmpa #'.                        ; leading decimal?
                beq parse_nexttok3              ; brif so - parse number
                cmpa #'0                        ; is it a digit
                blo parse_nexttok10             ; brif not
                cmpa #'9                        ; is it still a digit?
                bhi parse_nexttok10             ; brif not
parse_nexttok3  jmp parse_number                ; go parse a number
parse_nexttok6  stb parse_curtok                ; save token type
                leay 1,y                        ; eat the input character (Y is on the last character of the keyword)
                clra                            ; clear C to indicate no error (CLRA also sets Z)
                rts
parse_nexttok10 cmpa #'A                        ; is it alpha?
                blo parse_nexttok11             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bls parse_nexttok12             ; brif so
parse_nexttok11 comb                            ; flag error - unrecognized token
                ldb #token_error
                rts
parse_nexttok12 bsr parse_nextcharu             ; fetch next input character
                cmpa #'0                        ; is it alphanumeric?
                blo parse_nexttok13             ; brif not
                cmpa #'9                        ; is it numeric?
                bls parse_nexttok12             ; brif so - keep skipping it
                cmpa #'A                        ; is it alpha?
                blo parse_nexttok13             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bls parse_nexttok12             ; brif so - keep skipping it
parse_nexttok13 tfr y,d                         ; calculate length of identifier
                subd parse_tokenst
                std val0+val.strlen             ; save it for reference
                ldb #token_ident                ; indicate an identifier (variable name, etc.)
                stb parse_curtok                ; record it so parse_curtoken does not report a stale token
                rts                             ; return result (C is clear from SUBD above; STD/LDB/STB leave it alone)
; parse_nextcharu: advance the input pointer via parse_nextchar and return the new character in A, folded to upper
; case. Z is set on return only when that character is the terminating NUL.
; parse_toupper: fold the character in A to upper case if it lies in 'a'..'z'; any other value is returned untouched.
parse_nextcharu bsr parse_nextchar              ; step the input and pick up the new character
                beq parse_toupper0              ; NUL - hand it back with Z still set
parse_toupper   cmpa #'z                        ; beyond the lower case range?
                bhi parse_toupper0              ; yes - leave it alone
                cmpa #'a                        ; short of the lower case range?
                blo parse_toupper0              ; yes - leave it alone
                suba #0x20                      ; lower case letter: shift down into the upper case range
parse_toupper0  rts                             ; Z can only be set here on the NUL path from parse_nextcharu
; Placeholder for numeric literal parsing: no digits are examined here. Control passes straight to parse_tokerr, so
; any token that starts with a digit or a decimal point currently comes back as a syntax error (C set, B = err_sn).
parse_number    jmp parse_tokerr
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This routine parses tokens using the table pointed to by X. The table is structured as follows:
;
; * two bytes which contain the length of the table less the two bytes for this length value
; * a sequence of entries consisting of a single byte matching character and a token code followed
;   by an optional sub table, structured exactly the same way.
;
; The optional subtable will be present if the token code is token_eot
;
; If the character match is negative, it means a lookahead failed. The negative value is the number
; of characters to unget and the token code is the token value to return. No other entries after this
; in a table will be considered since this negative match is a global match.
;
; When a token_eot match is found, if there are no further characters in the input, the match is
; determined to be invalid and processing continues with the next entry.
;
; NOTE(review): the code below computes the end of a table as (address of its length word) + length, and skips a
; sub table the same way, so the stored length appears to count its own two bytes. That disagrees with the first
; bullet above - confirm against the table generator.
;
; Entry:
; A             Current input character (already folded to upper case by the caller)
; X             Points to the table (at its length word)
; Y             Input pointer, pointing at the character in A
;
; Exit (match):
; B             Token code from the matching entry
; Y             Points at the last character of the matched text (after any unget from a lookahead entry)
; CC.C          clear
;
; Exit (no match):
; CC.C          set; B is not meaningful (it is complemented by the COMB that sets C); Y may have been advanced
;
; In both cases A is reloaded from the stack with the input character of the last table examined, and X comes
; back holding the end address computed for that table rather than its start.
parse_wordtab0  leas 3,s                        ; drop the A/X frame of the parent table; A = next input character and
                                                ; X = start of the sub table, so fall through and search it
parse_wordtab   pshs a,x                        ; save input character and start of table
                ldd ,x++                        ; get length of this table
                addd 1,s                        ; calculate the address of the end of the table
                std 1,s                         ; save end address for comparison later
                lda ,s                          ; get back input character
parse_wordtab1  ldb 1,x                         ; fetch token code for this entry
                cmpa ,x++                       ; does this entry match? (X now points past the two byte entry)
                bne parse_wordtab4              ; brif not
                cmpb #token_eot                 ; is it indicating a sub table?
                bne parse_wordtab6              ; brif not - B is the token for this keyword
                bsr parse_nextcharu             ; fetch next input character (for sub table match)
                bne parse_wordtab0              ; brif we are going to check the sub table
parse_wordtab2  ldd ,x                          ; fetch length of sub table
                leax d,x                        ; move past sub table
parse_wordtab3  lda ,s                          ; get back input character
                cmpx 1,s                        ; are we at the end of the table?
                blo parse_wordtab1              ; brif not - check another entry
                comb                            ; indicate no match (sets C)
                puls a,x,pc                     ; clean up stack and return
parse_wordtab4  lda -2,x                        ; get the match character of the entry that just failed to match
                bmi parse_wordtab5              ; brif negative - lookahead fail
                cmpb #token_eot                 ; is there a sub table to skip?
                beq parse_wordtab2              ; brif so - skip sub table
                bra parse_wordtab3              ; otherwise just move to the next entry
parse_wordtab5  leay a,y                        ; move back the specified number of characters (A is negative)
parse_wordtab6  clra                            ; clear C to indicate a match
                puls a,x,pc                     ; clean up stack and return (PULS does not disturb C)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Convert a token number back to its keyword. This will use the same table used by parse_wordtab. Enter with a character
; output routine pointer in U which takes the character in A. The routine can assume that Y is preserved. Will return
; with C set if the token does not exist in the word table and clear otherwise.
;
; Entry:
; A             Token code to look up (A is not loaded here before it is saved as the value to match)
; U             Address of the character output routine; it is called once per keyword character with the
;               character in A and it must preserve U, which is the read pointer into the temporary buffer
;
; Exit:
; CC.C          clear if the keyword was found and output, set if the token is not in the table
; U             Restored to the routine pointer supplied on entry
;
; The keyword is assembled backwards in strbuff, working down from strbuff+19 (a NUL is stored at strbuff+19 first),
; and is then sent to the output routine front to back. A, B and X are clobbered.
;
; parse_wtdc2 is the recursive table walker: A = token wanted, X = table, U = buffer write pointer. It returns with
; C clear and the characters from that table level downward stored when the token was found. It returns with C set
; and X at the end of the table when it was not found, which is how the caller steps over a sub table.
;
; NOTE(review): lookahead (negative match character) entries are skipped, so a token that is only reachable through
; such an entry will be reported as not found - confirm whether those tokens need to be decodable.
parse_wtdc      pshs u                          ; save routine pointer
                ldu #strbuff+20                 ; point to temporary string buffer
                clr ,-u                         ; put a NUL at the end of the string
                ldx #parse_wt                   ; point to keyword parse table
                bsr parse_wtdc2                 ; call the tree walker function
                bcc parse_wtdc1                 ; brif we do have a match
                puls u,pc                       ; clean stack and return (C still set)
parse_wtdc0     jsr [,s]                        ; output the character
parse_wtdc1     lda ,u+                         ; get output byte
                bne parse_wtdc0                 ; brif we're not at the end yet
                clra                            ; make sure C is clear
                puls u,pc                       ; clean stack and return
parse_wtdc2     pshs a,x                        ; save the token match value and the table pointer
                ldd ,x++                        ; get table length
                addd 1,s                        ; calculate end address
                std 1,s                         ; save it
parse_wtdc3     ldd ,x++                        ; get this table entry: A = match character, B = token code
                bmi parse_wtdc6                 ; brif it's a backtracking entry - skip it
                cmpb ,s                         ; does the token match here? (the token code is in B, not A)
                bne parse_wtdc5                 ; brif not
parse_wtdc4     sta ,-u                         ; add the character to the output buffer (U is the buffer pointer;
                                                ; Y belongs to the caller and must not be touched)
                puls a,x,pc                     ; return up the call stack - C is clear on both paths into here
parse_wtdc5     cmpb #token_eot                 ; does this entry have a sub table?
                bne parse_wtdc6                 ; brif not
                pshs a                          ; save the matched character
                lda 1,s                         ; get back the token we need
                bsr parse_wtdc2                 ; go handle the sub table (X comes back at the end of it on failure)
                puls a                          ; get back the matched character (PULS does not disturb C)
                bcc parse_wtdc4                 ; brif it did match - record it and return
parse_wtdc6     cmpx 1,s                        ; are we at the end of this table?
                bne parse_wtdc3                 ; brif not - handle another table entry
                coma                            ; make sure C is set for no match
                puls a,x,pc                     ; clean up stack and return
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This table defines the various handler routines for the various bytecode tokens. Each token is defined as follows:
;               parse_tokendefT <sym>,<parse>,<list>,<exec>
; where:
; T: c for command, f for function, p for particle
; <sym>: the symbol name without the "token_" prefix
; <parse>: parse handler for the type, ignored for particles
; <list>: list handler for the type, ignored for particles
; <exec>: execution handler for the type, ignored for particles
;
; Token numbers are handed out in definition order: particles count up from 0, commands from 0x40 and functions
; from 0xc0.
;
; The definition macros do not emit anything themselves. They accumulate assembler source text in string variables
; and the token_* macros further down include that text wherever the finished tables are wanted:
;
; * parse and list tables receive "fcb token_<sym>" followed by "fdb <handler>" for each token that names a handler
;   of that kind (three bytes per entry; tokens without one are left out)
; * exec tables receive exactly one "fdb" per token, in token order: the <exec> handler, or SNERROR when no <exec>
;   handler was given
                *pragmapush list
                *pragma nolist
__toknump       set 0
__toknumc       set 0x40
__toknumf       set 0xc0
                setstr __cmdparset=""
                setstr __cmdlistt=""
                setstr __cmdexect=""
                setstr __fnparset=""
                setstr __fnlistt=""
                setstr __fnexect=""
; Define a particle token: only the token_<sym> symbol is created.
parse_tokendefp macro noexpand
token_\1        equ __toknump
__toknump       set __toknump+1
                endm
; Define a command token and add its handlers to the command table text. The exec table entry uses argument 4;
; it previously repeated argument 3 (the list handler) by mistake.
parse_tokendefc macro noexpand
token_\1        equ __toknumc
__toknumc       set __toknumc+1
                ifstr ne,"{2}",""
                setstr __cmdparset="%(__cmdparset)\tfcb\ttoken_\1\n\tfdb {2}\n"
                endc
                ifstr ne,"{3}",""
                setstr __cmdlistt="%(__cmdlistt)\tfcb\ttoken_\1\n\tfdb {3}\n"
                endc
                ifstr ne,"{4}",""
                setstr __cmdexect="%(__cmdexect)\tfdb {4}\n"
                else
                setstr __cmdexect="%(__cmdexect)\tfdb SNERROR\n"
                endc
                endm
; Define a function token and add its handlers to the function table text. The exec table entry uses argument 4;
; it previously repeated argument 3 (the list handler) by mistake.
parse_tokendeff macro noexpand
token_\1        equ __toknumf
__toknumf       set __toknumf+1
                ifstr ne,"{2}",""
                setstr __fnparset="%(__fnparset)\tfcb\ttoken_\1\n\tfdb {2}\n"
                endc
                ifstr ne,"{3}",""
                setstr __fnlistt="%(__fnlistt)\tfcb\ttoken_\1\n\tfdb {3}\n"
                endc
                ifstr ne,"{4}",""
                setstr __fnexect="%(__fnexect)\tfdb {4}\n"
                else
                setstr __fnexect="%(__fnexect)\tfdb SNERROR\n"
                endc
                endm
; Emit the command parse handler table accumulated so far.
token_cmdparse  macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__cmdparset)"
                *pragmapop nolist
                endm
; Emit the command list handler table accumulated so far.
token_cmdlist   macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__cmdlistt)"
                *pragmapop nolist
                endm
; Emit the command exec handler table accumulated so far and define token__maxcmd as the highest command token.
token_cmdexec   macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__cmdexect)"
token__maxcmd   equ __toknumc-1
                *pragmapop nolist
                endm
; Emit the function parse handler table accumulated so far.
token_fnparse   macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__fnparset)"
                *pragmapop nolist
                endm
; Emit the function list handler table accumulated so far.
token_fnlist    macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__fnlistt)"
                *pragmapop nolist
                endm
; Emit the function exec handler table accumulated so far and define token__maxfn as the highest function token.
token_fnexec    macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__fnexect)"
token__maxfn    equ __toknumf-1
                *pragmapop nolist
                endm
                *pragmapop list
                parse_tokendefp error           ; Used to mark errors; should always be first so it's token #0
                ; Particle tokens follow in numbering order (they count up from 0); keep new entries after "error".
                parse_tokendefp eot             ; End of input marker or special handling in word tables
                parse_tokendefp stmtsep         ; statement separator
                parse_tokendefp times           ; times (multiplication) operator (*)
                parse_tokendefp plus            ; addition operator
                parse_tokendefp divide          ; division operator (/)
                parse_tokendefp minus           ; subtraction operator
                parse_tokendefp exp             ; exponentiation operator (^)
                parse_tokendefp lt              ; less than operator
                parse_tokendefp le              ; less than or equal operator
                parse_tokendefp gt              ; greater than operator
                parse_tokendefp ge              ; greater than or equal operator
                parse_tokendefp eq              ; equality operator
                parse_tokendefp ne              ; inequality operator
                parse_tokendefp not             ; boolean NOT operator
                parse_tokendefp and             ; boolean AND operator
                parse_tokendefp or              ; boolean OR operator
                parse_tokendefp bang            ; exclamation mark
                parse_tokendefp hash            ; number sign
                parse_tokendefp dollar          ; dollar sign (string sigil)
                parse_tokendefp percent         ; percent sign (integer sigil)
                parse_tokendefp amp             ; ampersand
                parse_tokendefp oparen          ; opening paren
                parse_tokendefp cparen          ; closing paren
                parse_tokendefp sep             ; comma (separator)
                parse_tokendefp semi            ; semicolon
                parse_tokendefp at              ; @ symbol
                parse_tokendefp ident           ; identifier (has special parsing)
                parse_tokendefp else            ; ELSE
                parse_tokendefp then            ; THEN
                parse_tokendefp to              ; TO
                parse_tokendefp sub             ; SUB
                parse_tokendefp as              ; AS

                ; Command tokens (numbered from 0x40). Arguments are: symbol, parse handler, list handler, exec handler.
                parse_tokendefc remabbr,parse_noop,list_noop,exec_noop          ; abbreviated REM (')
                parse_tokendefc rem,parse_noop,list_noop,exec_noop              ; REM
                parse_tokendefc return,parse_noop,parse_noop,parse_noop         ; RETURN
                parse_tokendefc run,parse_noop,parse_noop,parse_noop            ; RUN
                parse_tokendefc data,parse_noop,parse_noop,parse_noop           ; DATA
                parse_tokendefc end,parse_noop,parse_noop,parse_noop            ; END
                parse_tokendefc stop,parse_noop,parse_noop,parse_noop           ; STOP
                parse_tokendefc let,parse_noop,parse_noop,parse_noop            ; LET
                parse_tokendefc list,parse_noop,parse_noop,parse_noop           ; LIST
                parse_tokendefc new,parse_noop,parse_noop,parse_noop            ; NEW
                parse_tokendefc print,parse_noop,parse_noop,parse_noop          ; PRINT
                parse_tokendefc pop,parse_noop,parse_noop,parse_noop            ; POP
                parse_tokendefc goto,parse_noop,parse_noop,parse_noop           ; GOTO
                parse_tokendefc gosub,parse_noop,parse_noop,parse_noop          ; GOSUB
                parse_tokendefc go,parse_noop,parse_noop,parse_noop             ; GO

                ; Function tokens (numbered from 0xc0). Arguments are the same as for commands.
                parse_tokendeff asc,parse_noop,parse_noop,parse_noop            ; ASC()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Parse handling tables
;
; Each entry is a one byte token code followed by a two byte handler address. The statement loop in parse searches
; parsetab_cmd in three byte steps until it reaches parsetab_cmde. The function table immediately follows the
; command table and is bracketed the same way by parsetab_fn and parsetab_fne.
parsetab_cmd    token_cmdparse
parsetab_cmde
parsetab_fn     token_fnparse
parsetab_fne
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; List handling tables
;
; Same entry layout as the parse handling tables: a one byte token code followed by a two byte handler address.
listtab_cmd     token_cmdlist
listtab_cmde
listtab_fn      token_fnlist
listtab_fne
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Execution handling tables
;
; These hold one two byte address per token with no token byte, in token definition order, and have no end labels.
; Expanding the macros here also defines token__maxcmd and token__maxfn.
exectab_cmd     token_cmdexec
exectab_fn      token_fnexec
                *pragmapop list