view src/parse.s @ 129:d5886daa4f65

Fix a less than brilliant branch target in token to keyword routine
author William Astle <lost@l-w.ca>
date Sat, 04 May 2024 15:18:51 -0600
parents 527212870064
children 9f23ddc5165f
line wrap: on
line source

                *pragmapush list
                *pragma list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This is the overall parsing package. This is responsible for converting program text into the internal byte code and
; reporting any syntax errors and anything else reasonably detectable at parse time without having overly complicated
; code analysis.
;
; This is a recursive descent parser.
;
; Entry:
; X             Points to the text to encode
; B             Nonzero to prevent generating any output (error check/length calculation only)
;
; Exit:
; U             Points to the encoded line
; D             Length of the encoded line
; CC.C          clear
;
; Error Exit:
; B             Error code
; U             Start of the token that was being parsed when the error was detected (value of parse_tokenst)
; CC.C          set
;
; Y is used as the input pointer for the duration of the parse and X is used as scratch. The original value of
; freestart is kept on the stack so it can be put back if an error forces the partial encoding to be discarded.
;
; NOTE(review): when parse_nexttok fails on an unrecognized character it returns token_error in B, not an error code,
; and that value is passed straight through to the caller here - confirm that is what callers expect.
parse           stb parse_noout                 ; save no-output flag
                leay ,x                         ; save input pointer in a less useful register
                ldu freestart                   ; point to start of free memory where we will build the output
                pshs u                          ; save original free memory location
parse_nextstmt  jsr parse_nexttok               ; fetch the next token, return type in D
                bcc parse0                      ; brif we succeeded in parsing a token
parse_error     puls u                          ; restore original free memory location - deallocate any encoding
                stu freestart                   ; (PULS, STU and LDU all leave C alone so the error flag survives)
                ldu parse_tokenst               ; get start location we started parsing the token at
                rts                             ; return error condition
parse0          ldx #parsetab_cmd               ; point to jump table for token type handler
                cmpb #token_stmtsep             ; is it a statement separator?
                beq parse_nextstmt              ; brif so - we can just skip it
parse1          cmpb ,x                         ; did we match a valid command token?
                beq parse3                      ; brif so
                leax 3,x                        ; move to next entry (one token byte plus a two byte handler address)
                cmpx #parsetab_cmde             ; end of table?
                blo parse1                      ; brif not
parse2          comb                            ; force C set: it is clear when we fall out of the table search above
                                                ; and undefined when we arrive from parse4
                ldb #err_sn                     ; flag syntax error (LDB does not disturb C)
                bra parse_error                 ; and return the error
parse3          jsr [1,x]                       ; call the handler
                bcs parse_error                 ; brif the handler indicated error
                bsr parse_curtoken              ; fetch the token we left off on
                cmpb #token_eot                 ; end of input?
                bne parse4                      ; brif not
                ldb #bc_eol                     ; stash an end of line op
                bsr parse_write
                bcs parse_error                 ; brif we errored out writing to the result (OM?)
                tfr u,d                         ; calculate the length of the result
                subd ,s                         ; current output pointer less the saved start of the output
                puls u,pc                       ; get pointer to start of encoded result and return (C is already clear)
parse4          cmpb #token_stmtsep             ; statement separator?
                beq parse_nextstmt              ; brif so - do another statement
                cmpb #token_remabbr             ; ' token?
                beq parse0                      ; brif so - parse it as a new statement
                bra parse2                      ; raise a syntax error
; Append the byte in B to the encoded output at U.
;
; When parse_noout is nonzero nothing is stored; U is simply advanced so the caller can still work out the length.
; In that case C is left exactly as it was on entry. Otherwise the byte is stored only if the stack pointer, less
; stackheadroom, has not dropped below freestart; freestart is then updated to the new output position.
;
; Exit: C clear on a successful store; C set and B = err_om if there is no room. A and X are clobbered.
;
; list_noop and parse_noop share the final RTS and therefore return with every register and flag untouched.
parse_write     lda parse_noout                 ; is output being suppressed?
                bne parse_write_cnt             ; brif so - only count the byte
                leax -stackheadroom,s           ; X = lowest address the stack is allowed to approach
                cmpx freestart                  ; is there still a gap between the output and the stack?
                blo parse_write_om              ; brif not - out of memory (C is set by the comparison)
                stb ,u+                         ; store the output byte and step the output pointer
                stu freestart                   ; record the new top of used memory
list_noop
parse_noop      rts                             ; when reached from above C is clear from the comparison
parse_write_cnt leau 1,u                        ; count the byte without storing anything
                rts
parse_write_om  ldb #err_om                     ; report out of memory; C is still set from the comparison
                rts
; Return the most recently recorded token code (parse_curtok) in B. No other register is changed; N and Z reflect
; the value loaded and C is left alone.
parse_curtoken  ldb parse_curtok                ; fetch token code of current token
                rts
; Common "unexpected token" exit: returns B = err_sn with C set.
parse_tokerr    ldb #err_sn                     ; error code for a syntax error
                orcc #1                         ; set C to flag the error to the caller
                rts
; parse_nextchar: advance the input pointer (Y) by one character unless it is already sitting on the NUL terminator,
;                 then fall into parse_curchar.
; parse_curchar:  return the character at Y in A. Z is set if it is the NUL terminator.
parse_nextchar  tst ,y                          ; already sitting on the end of the input?
                beq parse_curchar               ; brif so - never step past the terminator
                leay 1,y                        ; advance to the following character
parse_curchar   lda ,y                          ; A = character at the input pointer
                rts
; Fetch the next token from the input at Y, skipping any leading spaces.
;
; Keywords and symbols are looked up in parse_wt via parse_wordtab. Anything not found there is treated as a number
; if it starts with a digit or a decimal point, or as an identifier if it starts with a letter (either case).
; Identifiers consist of a letter followed by any number of letters and digits.
;
; Exit (success):
; B             Token code; A is zero so D holds the same value. The code is also stored in parse_curtok.
; Y             Points to the first input character after the token. For token_eot Y is left on the NUL terminator
;               so that a further call simply returns token_eot again.
; CC.C          clear
; For token_ident, val0+val.strlen holds the identifier length and parse_tokenst points to its first character.
;
; Exit (error):
; B             token_error if the input character cannot start any token
; CC.C          set
; Numbers are handed to parse_number with a JMP, so its result is returned directly to our caller.
;
; parse_tokenst is always left pointing to where the returned (or failed) token starts.
parse_nexttok   bsr parse_curchar               ; fetch current input
                beq parse_nexttok1              ; brif end of input
parse_nexttok0  cmpa #0x20                      ; space?
                bne parse_nexttok2              ; brif not
                bsr parse_nextchar              ; eat the space
                bne parse_nexttok0              ; brif not end of input
parse_nexttok1  sty parse_tokenst               ; errors reported against end of input should point at the terminator
                ldb #token_eot                  ; flag end of input
                stb parse_curtok                ; save token type
                clra                            ; clear C to indicate no error; D is now the token type
                rts                             ; do NOT eat the terminator - Y must never move past the NUL
parse_nexttok2  sty parse_tokenst               ; save start of current token after skipping spaces
                bsr parse_toupper               ; make sure we have upper case letters for matching
                ldx #parse_wt                   ; point to keyword parsing table
                jsr parse_wordtab               ; go see if we have a match in the keyword table (JSR so the call
                                                ; cannot fall out of short branch range as this routine grows)
                bcc parse_nexttok6              ; brif we do - return it
                ldy parse_tokenst               ; return to the start of the token - pointer probably clobbered
                bsr parse_curchar               ; get back input character (may have been clobbered)
                cmpa #'.                        ; leading decimal?
                beq parse_nexttok3              ; brif so - parse number
                cmpa #'0                        ; is it a digit
                blo parse_nexttok10             ; brif not
                cmpa #'9                        ; is it still a digit?
                bhi parse_nexttok10             ; brif not
parse_nexttok3  jmp parse_number                ; go parse a number
parse_nexttok6  stb parse_curtok                ; save token type
                leay 1,y                        ; eat the last character of the token
                clra                            ; clear C to indicate no error (CLRA also sets Z and zeroes A)
                rts
parse_nexttok10 bsr parse_toupper               ; fold case so identifiers may start with a lower case letter, the
                                                ; same as keywords and the rest of the identifier
                cmpa #'A                        ; is it alpha?
                blo parse_nexttok11             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bls parse_nexttok12             ; brif so
parse_nexttok11 comb                            ; flag error - unrecognized token
                ldb #token_error
                rts
parse_nexttok12 bsr parse_nextcharu             ; fetch next input character
                cmpa #'0                        ; is it alphanumeric?
                blo parse_nexttok13             ; brif not
                cmpa #'9                        ; is it numeric?
                bls parse_nexttok12             ; brif so - keep skipping it
                cmpa #'A                        ; is it alpha?
                blo parse_nexttok13             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bls parse_nexttok12             ; brif so - keep skipping it
parse_nexttok13 tfr y,d                         ; calculate length of identifier
                subd parse_tokenst
                std val0+val.strlen             ; save it for reference
                ldb #token_ident                ; indicate an identifier (variable name, etc.)
                stb parse_curtok                ; record it so parse_curtoken reports the identifier too
                clra                            ; D = token type as for every other token; also guarantees C clear
                rts                             ; Y already points past the identifier so nothing more to eat
; parse_nextcharu: advance to the next input character (see parse_nextchar) and return it in A converted to upper
;                  case. Z is set on return only if the character is the NUL terminator.
; parse_toupper:   convert the character in A to upper case if it is in the range 'a' to 'z'; anything else is
;                  returned unchanged. When entered here Z is never set on return for a nonzero character because
;                  every path ends on a compare that did not match or a subtraction with a nonzero result.
parse_nextcharu bsr parse_nextchar              ; fetch next input character
                beq parse_toupper0              ; brif end of input
parse_toupper   cmpa #'a                        ; is it lower case alpha?
                blo parse_toupper0              ; brif not
                cmpa #'z                        ; is it still lower case alpha?
                bhi parse_toupper0              ; brif not
                suba #0x20                      ; adjust to upper case alpha
parse_toupper0  rts                             ; Z only set here if input was zero entering from parse_nextcharu
; Number parsing is not implemented in this section of the file: any input starting with a digit or a decimal point
; is simply reported through parse_tokerr (B = err_sn, C set).
parse_number    jmp parse_tokerr
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Parse a statement that consists of just the command token. This is an alias for parse_write, so the handler is
; entered with the token in B and returns whatever parse_write returns.
;
; NOTE(review): parse_write does not fetch another token, so parse_curtok still holds the command's own token when
; the main loop in parse inspects it after the handler returns - confirm that is intended.
parse_cmdsingle equ parse_write                 ; just write the token out and bail
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Parse a REM or ' statement. We just copy the comment out after the REM or ' token.
;
; Entry:
; B             The REM or ' token code
; Y             Points to the first input character after the token
;
; Exit:
; CC.C          clear on success; parse_curtok is set to token_eot because the comment runs to the end of the input
;               and Y has been stepped past the NUL terminator by the copy loop
; CC.C          set, with the error code from parse_write in B, if a byte could not be written. The copy stops at
;               that point so the error code is not overwritten by the rest of the comment text.
parse_rem       jsr parse_write                 ; write the token/character out
                bcs parse_rem0                  ; brif the write failed - bail now while B still has the error code
                ldb ,y+                         ; get next input character
                bne parse_rem                   ; brif not at the end of the input
                ldb #token_eot                  ; flag end of input for mainline parser
                stb parse_curtok
parse_rem0      rts                             ; C is as parse_write left it - nothing since then modifies it
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This routine parses tokens using the table at parse_wordtab. The table is structured as follows:
;
; * two bytes which contain the length of the table less the two bytes for this length value
; * a sequence of entries consisting of a single byte matching character and a token code followed
;   by an optional sub table, structured exactly the same way.
;
; The optional subtable will be present if the token code is token_eot
;
; If the character match is negative, it means a lookahead failed. The negative value is the number
; of characters to unget and the token code is the token value to return. No other entries after this
; in a table will be considered since this negative match is a global match.
;
; When a token_eot match is found, if there are no further characters in the input, the match is
; determined to be invalid and processing continues with the next entry.
;
; NOTE(review): the code below takes the end of a table to be the address of its length word plus the length value,
; and skips a sub table the same way. That only lands on the true end if the stored length counts its own two
; bytes, which is not what the description above says - confirm against whatever generates the table.
;
; Entry:
; A             Current input character (callers in this file convert it to upper case first)
; X             Points to the table (its length word)
; Y             Input pointer, pointing to the character in A
;
; Exit:
; CC.C          clear if a token matched: B is the token code and Y points to the last input character that
;               belongs to the token (after any unget requested by a lookahead entry)
; CC.C          set if nothing matched: B is destroyed and Y may have been advanced
; A and X are loaded from the working stack frame on the way out, so X is not preserved.
;
; Descending into a sub table discards the current frame (parse_wordtab0), so a failure inside a sub table is a
; failure of the whole lookup; there is no backtracking to the parent table.
parse_wordtab0  leas 3,s                        ; clean up stack for sub table handling
parse_wordtab   pshs a,x                        ; save input character and start of table
                ldd ,x++                        ; get length of this table
                addd 1,s                        ; calculate the address of the end of the table
                std 1,s                         ; save end address for comparison later
                lda ,s                          ; get back input character
parse_wordtab1  ldb 1,x                         ; fetch token code for this entry
                cmpa ,x++                       ; does this entry match?
                bne parse_wordtab4              ; brif not
                cmpb #token_eot                 ; is it indicating a sub table?
                bne parse_wordtab6              ; brif not
                bsr parse_nextcharu             ; fetch next input character (for sub table match)
                bne parse_wordtab0              ; brif we are going to check the sub table
parse_wordtab2  ldd ,x                          ; fetch length of sub table
                leax d,x                        ; move past sub table
parse_wordtab3  lda ,s                          ; get back input character
                cmpx 1,s                        ; are we at the end of the table?
                blo parse_wordtab1              ; brif not - check another entry
                comb                            ; indicate no match
                puls a,x,pc                     ; clean up stack and return
parse_wordtab4  lda -2,x                        ; get the match character
                bmi parse_wordtab5              ; brif negative - lookahead fail
                cmpb #token_eot                 ; is there a sub table to skip?
                beq parse_wordtab2              ; brif so - skip sub table
                bra parse_wordtab3              ; otherwise just move to the next entry
parse_wordtab5  leay a,y                        ; move back the specified number of characters
parse_wordtab6  clra                            ; clear C to indicate a match
                puls a,x,pc                     ; clean up stack and return
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Convert a token number back to its keyword. This will use the same table used by parse_wordtab.
;
; Entry:
; A             Token number to convert
; U             Pointer to a character output routine which takes the character in A. The routine can assume that Y
;               is preserved. It must itself preserve U, which holds the read position in the keyword buffer
;               while the keyword is being output.
;
; Exit:
; CC.C          set if the token does not exist in the word table and clear otherwise
; U             restored to the routine pointer passed in
;
; The keyword is assembled backwards, last character first, below a NUL stored at strbuff+19, so U ends up pointing
; to the first character once the tree walk returns.
;
; parse_wtdc2 is the recursive tree walker. It is entered with the wanted token in A and the table (or sub table)
; pointer in X, and returns C clear if the token was found in that table or below it. X returns as the end address
; of the table that was walked, which lets the caller carry on with the entry following a sub table.
;
; NOTE(review): lookahead (negative match character) entries are skipped by the walker, so a token that is only
; ever produced by a lookahead entry will be reported as not found - confirm whether parse_wt has any.
parse_wtdc      pshs u                          ; save routine pointer
                ldu #strbuff+20                 ; point to temporary string buffer
                clr ,-u                         ; put a NUL at the end of the string
                ldx #parse_wt                   ; point to keyword parse table
                bsr parse_wtdc2                 ; call the tree walker function
                bcc parse_wtdc1                 ; brif we do have a match
                puls u,pc                       ; clean stack and return
parse_wtdc0     jsr [,s]                        ; output the character
parse_wtdc1     lda ,u+                         ; get output byte
                bne parse_wtdc0                 ; brif we're not at the end yet
                clra                            ; make sure C is clear
                puls u,pc                       ; clean stack and return
parse_wtdc2     pshs a,x                        ; save the token match value and the table pointer
                ldd ,x++                        ; get table length
                addd 1,s                        ; calculate end address
                std 1,s                         ; save it
parse_wtdc3     ldd ,x++                        ; get this table entry: A = match character, B = token code
                bmi parse_wtdc6                 ; brif it's a backtracking entry - skip it
                cmpb ,s                         ; is this entry's token code the token we are looking for?
                bne parse_wtdc5                 ; brif not
parse_wtdc4     sta ,-u                         ; add the character to the keyword buffer (U is the buffer pointer;
                                                ; Y must not be touched)
                puls a,x,pc                     ; return up the call stack - C is clear from CMPB above
parse_wtdc5     cmpb #token_eot                 ; does this entry have a sub table?
                bne parse_wtdc6                 ; brif not
                pshs a                          ; save the matched character
                lda 1,s                         ; get back the token we need
                bsr parse_wtdc2                 ; go handle the sub table
                puls a                          ; get back the matched character
                bcc parse_wtdc4                 ; brif it did match - record it and return
parse_wtdc6     cmpx 1,s                        ; are we at the end of this table?
                bne parse_wtdc3                 ; brif not - handle another table entry
                coma                            ; make sure C is set for no match
                puls a,x,pc                     ; clean up stack and return
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This table defines the various handler routines for the various bytecode tokens. Each token is defined as follows:
;               parse_tokendefT <sym>,<parse>,<list>,<exec>
; where:
; T: c for command, f for function, p for particle
; <sym>: the symbol name without the "token_" prefix
; <parse>: parse handler for the type, ignored for particles
; <list>: list handler for the type, ignored for particles
; <exec>: execution handler for the type, ignored for particles
                *pragmapush list
                *pragma nolist
; Running token numbers for each token class. Particles are numbered from 0, commands from 0x40 and functions
; from 0xc0; each definition macro assigns the current value and then bumps the counter.
__toknump       set 0
__toknumc       set 0x40
__toknumf       set 0xc0
; Assembler string variables that accumulate the source text of the parse, list and exec tables for commands and
; functions as tokens are defined. They are emitted later by the token_* macros.
                setstr __cmdparset=""
                setstr __cmdlistt=""
                setstr __cmdexect=""
                setstr __fnparset=""
                setstr __fnlistt=""
                setstr __fnexect=""
; Define a particle token: only assigns the next particle number to token_<sym>. No table entries are generated.
parse_tokendefp macro noexpand
token_\1        equ __toknump
__toknump       set __toknump+1
                endm
; Define a command token: assigns the next command number to the token symbol and queues up its table entries.
; The parse and list tables get a (token, handler) pair only when the corresponding handler argument is given.
; The exec table always gets exactly one handler address per command so that it stays in step with the command
; numbering: the fourth (exec) argument if given, otherwise SNERROR.
parse_tokendefc macro noexpand
token_\1        equ __toknumc
__toknumc       set __toknumc+1
                ifstr ne,"{2}",""
                setstr __cmdparset="%(__cmdparset)\tfcb\ttoken_\1\n\tfdb {2}\n"
                endc
                ifstr ne,"{3}",""
                setstr __cmdlistt="%(__cmdlistt)\tfcb\ttoken_\1\n\tfdb {3}\n"
                endc
                ifstr ne,"{4}",""
                setstr __cmdexect="%(__cmdexect)\tfdb {4}\n"
                else
                setstr __cmdexect="%(__cmdexect)\tfdb SNERROR\n"
                endc
                endm
; Define a function token: assigns the next function number to the token symbol and queues up its table entries.
; The parse and list tables get a (token, handler) pair only when the corresponding handler argument is given.
; The exec table always gets exactly one handler address per function so that it stays in step with the function
; numbering: the fourth (exec) argument if given, otherwise SNERROR.
parse_tokendeff macro noexpand
token_\1        equ __toknumf
__toknumf       set __toknumf+1
                ifstr ne,"{2}",""
                setstr __fnparset="%(__fnparset)\tfcb\ttoken_\1\n\tfdb {2}\n"
                endc
                ifstr ne,"{3}",""
                setstr __fnlistt="%(__fnlistt)\tfcb\ttoken_\1\n\tfdb {3}\n"
                endc
                ifstr ne,"{4}",""
                setstr __fnexect="%(__fnexect)\tfdb {4}\n"
                else
                setstr __fnexect="%(__fnexect)\tfdb SNERROR\n"
                endc
                endm
; Table emitters. Each macro assembles the text accumulated in one of the string variables at the point where the
; macro is used, with listing turned off around the generated lines. The two exec emitters also define the highest
; token number assigned in their class (token__maxcmd / token__maxfn).
;
; Emit the command parse table: (token, parse handler) pairs.
token_cmdparse  macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__cmdparset)"
                *pragmapop nolist
                endm
; Emit the command list table: (token, list handler) pairs.
token_cmdlist   macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__cmdlistt)"
                *pragmapop nolist
                endm
; Emit the command exec table: one handler address per command, in token number order.
token_cmdexec   macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__cmdexect)"
token__maxcmd   equ __toknumc-1
                *pragmapop nolist
                endm
; Emit the function parse table: (token, parse handler) pairs.
token_fnparse   macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__fnparset)"
                *pragmapop nolist
                endm
; Emit the function list table: (token, list handler) pairs.
token_fnlist    macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__fnlistt)"
                *pragmapop nolist
                endm
; Emit the function exec table: one handler address per function, in token number order.
token_fnexec    macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__fnexect)"
token__maxfn    equ __toknumf-1
                *pragmapop nolist
                endm
                *pragmapop list
                parse_tokendefp error           ; Used to mark errors; should always be first so it's token #0
                parse_tokendefp eot             ; End of input marker or special handling in word tables
                parse_tokendefp stmtsep         ; statement separator
                parse_tokendefp times           ; times (multiplication) operator (*)
                parse_tokendefp plus            ; addition operator
                parse_tokendefp divide          ; division operator (/)
                parse_tokendefp minus           ; subtraction operator
                parse_tokendefp exp             ; exponentiation operator (^)
                parse_tokendefp lt              ; less than operator
                parse_tokendefp le              ; less than or equal operator
                parse_tokendefp gt              ; greater than operator
                parse_tokendefp ge              ; greater than or equal operator
                parse_tokendefp eq              ; equality operator
                parse_tokendefp ne              ; inequality operator
                parse_tokendefp not             ; boolean NOT operator
                parse_tokendefp and             ; boolean AND operator
                parse_tokendefp or              ; boolean OR operator
                parse_tokendefp bang            ; exclamation mark
                parse_tokendefp hash            ; number sign
                parse_tokendefp dollar          ; dollar sign (string sigil)
                parse_tokendefp percent         ; percent sign (integer sigil)
                parse_tokendefp amp             ; ampersand
                parse_tokendefp oparen          ; opening paren
                parse_tokendefp cparen          ; closing paren
                parse_tokendefp sep             ; comma (separator)
                parse_tokendefp semi            ; semicolon
                parse_tokendefp at              ; @ symbol
                parse_tokendefp ident           ; identifier (has special parsing)
                parse_tokendefp else            ; ELSE
                parse_tokendefp then            ; THEN
                parse_tokendefp to              ; TO
                parse_tokendefp sub             ; SUB
                parse_tokendefp as              ; AS

                ; Command tokens, numbered from 0x40 in the order given: <sym>,<parse>,<list>,<exec>
                parse_tokendefc remabbr,parse_rem,list_noop,exec_noop           ; abbreviated REM (')
                parse_tokendefc rem,parse_rem,list_noop,exec_noop               ; REM
                parse_tokendefc return,parse_cmdsingle,parse_noop,parse_noop    ; RETURN
                parse_tokendefc run,parse_noop,parse_noop,parse_noop            ; RUN
                parse_tokendefc data,parse_noop,parse_noop,parse_noop           ; DATA
                parse_tokendefc end,parse_cmdsingle,parse_noop,parse_noop       ; END
                parse_tokendefc stop,parse_cmdsingle,parse_noop,parse_noop      ; STOP
                parse_tokendefc let,parse_noop,parse_noop,parse_noop            ; LET
                parse_tokendefc list,parse_noop,parse_noop,parse_noop           ; LIST
                parse_tokendefc new,parse_cmdsingle,parse_noop,parse_noop       ; NEW
                parse_tokendefc print,parse_noop,parse_noop,parse_noop          ; PRINT
                parse_tokendefc pop,parse_cmdsingle,parse_noop,parse_noop       ; POP
                parse_tokendefc goto,parse_noop,parse_noop,parse_noop           ; GOTO
                parse_tokendefc gosub,parse_noop,parse_noop,parse_noop          ; GOSUB
                parse_tokendefc go,parse_noop,parse_noop,parse_noop             ; GO

                ; Function tokens, numbered from 0xc0 in the order given: <sym>,<parse>,<list>,<exec>
                parse_tokendeff asc,parse_noop,parse_noop,parse_noop            ; ASC()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Parse handling tables
; Each entry is three bytes: the token code followed by the address of its parse handler. parsetab_cmde and
; parsetab_fne mark the end of the command and function tables respectively; the main parse loop searches
; parsetab_cmd up to parsetab_cmde.
parsetab_cmd    token_cmdparse
parsetab_cmde
parsetab_fn     token_fnparse
parsetab_fne
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; List handling tables
; Same three byte (token, handler address) layout as the parse tables, with end markers listtab_cmde/listtab_fne.
listtab_cmd     token_cmdlist
listtab_cmde
listtab_fn      token_fnlist
listtab_fne
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Execution handling tables
; These hold one two byte handler address for every command/function defined, in definition order, with no token
; byte and no end marker - presumably indexed by token number less the first token of the class (TODO confirm
; against the interpreter loop, which is not in this section).
exectab_cmd     token_cmdexec
exectab_fn      token_fnexec
                *pragmapop list