view src/parse.s @ 124:8770e6f977c3

Rework parser to use parse_wordtab for symbols too. There's no reason not to use the parse_wordtab table mechanism to match the symbols with their token codes. It takes less space than the combined code and tables needed to do it separately.
author William Astle <lost@l-w.ca>
date Mon, 01 Jan 2024 15:57:59 -0700
parents 5681cdada362
children 0607e4e20702
line wrap: on
line source

                *pragmapush list
                *pragma list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This is the overall parsing package. This is responsible for converting program text into the internal byte code and
; reporting any syntax errors and anything else reasonably detectable at parse time without having overly complicated
; code analysis.
;
; This is a recursive descent parser.
;
; Entry:
; X             Points to the text to encode (the scanner treats a zero byte as end of input)
; B             Nonzero to prevent generating any output (error check/length calculation only)
;
; Exit:
; U             Points to the encoded line (the value freestart had on entry)
; D             Length of the encoded line
; CC.C          clear
;
; Error Exit:
; B             Error code, exactly as returned by the tokenizer, statement handler or parse_write that failed
; U             Pointer taken from parse_tokenst (start of the most recently recorded token)
; CC.C          set
;
; On error freestart is restored to its entry value, discarding anything encoded so far.
; Y is used as the input pointer for the whole parse.
parse           stb parse_noout                 ; save no-output flag
                leay ,x                         ; save input pointer in a less useful register
                ldu freestart                   ; point to start of free memory where we will build the output
                pshs u                          ; save original free memory location
parse_nextstmt  jsr parse_nexttok               ; fetch the next token, return type in D
                bcc parse0                      ; brif we succeeded in parsing a token
parse_error     puls u                          ; restore original free memory location - deallocate any encoding
                stu freestart                   ; (PULS, STU and LDU all leave C alone, so C stays set for the caller)
                ldu parse_tokenst               ; get start location we started parsing the token at
                rts                             ; return error condition
parse0          ldx #parse_stmtjump             ; point to jump table for token type handler
                abx                             ; offset to handler address
                abx                             ; (added twice because each table entry is a two byte address)
                jsr [,x]                        ; call handler
                bcs parse_error                 ; brif handler flagged error
                jsr parse_curtoken              ; get the token we terminated on
                cmpb #token_eot                 ; end of input?
                bne parse1                      ; brif not
                ldb #bc_eol                     ; stash an end of line op
                bsr parse_write
                bcs parse_error                 ; brif we errored out writing to the result (OM?)
                tfr u,d                         ; calculate the length of the result
                subd ,s                         ; end of output minus saved start of output
                puls u,pc                       ; get pointer to start of encoded result and return (C is already clear)
parse1          cmpb #token_stmtsep             ; statement separator?
                beq parse_nextstmt              ; brif so - do another statement
                cmpb #token_apos                ; ' token?
                beq parse0                      ; brif so - parse it as a new statement (B still holds token_apos; no new
                                                ; token is fetched, so the handler must move parse_curtok on or this repeats)
                comb                            ; set C for error
                ldb #err_sn                     ; raise syntax error
                bra parse_error
; Append one byte to the encoded output, or just count it when output is suppressed.
;
; Entry:
; B             Byte to write
; U             Current output position
;
; Exit:
; U             Advanced by one
; freestart     Updated to match U (only when output is enabled)
; CC.C          clear
;
; Error Exit:
; B             err_om
; CC.C          set
;
; A is always clobbered; X is clobbered when output is enabled.
;
; parse_noop shares the final RTS of this routine. Entered directly (it is the do-nothing handler in
; parse_stmtjump) it changes nothing, so the flags are whatever the caller had at the time of the call.
parse_write     lda parse_noout                 ; are we doing output?
                beq parse_write0                ; brif so
                leau 1,u                        ; just count up the output and don't do anything
                andcc #0xfe                     ; LDA and LEAU leave C alone - force it clear so success never depends on
                                                ; whatever carry the caller happened to arrive with
                rts
parse_write0    leax -stackheadroom,s           ; calculate bottom of stack with headroom
                cmpx freestart                  ; did the stack run into the end of the output?
                bhs parse_write1                ; brif not - we're good
                ldb #err_om                     ; raise out of memory error, C already set from comparison
                rts
parse_write1    stb ,u+                         ; save output byte
                stu freestart                   ; save new top of used memory
parse_noop      rts                             ; return all clear - C clear from comparison above
; Return the code of the current token (the value last stored in parse_curtok) in B.
parse_curtoken  ldb parse_curtok                ; fetch token code of current token
                rts
; Common syntax error return: comes back with C set and B = err_sn. Also used as a statement handler in
; parse_stmtjump and as the target of the parse_number stub.
parse_tokerr    comb                            ; flag error - unexpected token
                ldb #err_sn                     ; raise syntax error
                rts
; parse_nextchar: step the input pointer Y forward one character, unless it already rests on the zero byte
; that ends the input, then return the character now at Y.
; parse_curchar: return the character at Y without moving.
;
; Exit (both entry points):
; A             Character at Y
; CC.Z          set if that character is zero (end of input) - it is left by the final LDA
parse_nextchar  lda ,y                          ; at end of input already?
                beq parse_curchar               ; brif so
                leay 1,y                        ; move to next input character
parse_curchar   lda ,y                          ; fetch input character
                rts
; Fetch the next token from the input at Y.
;
; Leading spaces are skipped. The keyword/symbol table at parse_wt is tried first (via parse_wordtab); if
; nothing there matches, a leading digit or '.' is handed to parse_number and a leading letter starts an
; identifier made of letters and digits. Letters are folded to upper case for all matching.
;
; Exit:
; B             Token code, also stored in parse_curtok
; Y             Just past the token; at end of input Y is left on the terminating zero byte
; parse_tokenst Start of the token (left unchanged when the result is token_eot)
; val0+val.strlen  Length of the identifier when B is token_ident
; CC.C          clear
;
; Error Exit:
; B             err_sn
; CC.C          set
parse_nexttok   bsr parse_curchar               ; fetch current input
                beq parse_nexttok1              ; brif end of input
parse_nexttok0  cmpa #0x20                      ; space?
                bne parse_nexttok2              ; brif not
                bsr parse_nextchar              ; eat the space
                bne parse_nexttok0              ; brif not end of input
parse_nexttok1  ldb #token_eot                  ; flag end of input
                bra parse_nexttok7              ; go return it - without consuming, so Y never passes the terminator
parse_nexttok2  sty parse_tokenst               ; save start of current token after skipping spaces
                bsr parse_toupper               ; make sure we have upper case letters for matching
                ldx #parse_wt                   ; point to keyword parsing table
                bsr parse_wordtab               ; go see if we have a match in the keyword table
                bcc parse_nexttok6              ; brif we do - return it
                ldy parse_tokenst               ; return to the start of the token - pointer probably clobbered
                bsr parse_curchar               ; get back input character (may have been clobbered)
                bsr parse_toupper               ; fold case again - the reloaded character is raw and the alpha test
                                                ; below only recognizes upper case
                cmpa #'.                        ; leading decimal?
                beq parse_nexttok3              ; brif so - parse number
                cmpa #'0                        ; is it a digit
                blo parse_nexttok10             ; brif not
                cmpa #'9                        ; is it still a digit?
                bhi parse_nexttok10             ; brif not
parse_nexttok3  jmp parse_number                ; go parse a number
parse_nexttok6  leay 1,y                        ; eat the input character (last character of the matched token)
parse_nexttok7  stb parse_curtok                ; save token type
                clra                            ; clear C to indicate no error (CLRA also sets Z)
                rts
parse_nexttok10 cmpa #'A                        ; is it alpha?
                blo parse_nexttok11             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bls parse_nexttok12             ; brif so
parse_nexttok11 comb                            ; flag error - unrecognized token
                ldb #err_sn                     ; hand back a real error code: callers pass B on as the error
                rts
parse_nexttok12 bsr parse_nextcharu             ; fetch next input character
                cmpa #'0                        ; is it alphanumeric?
                blo parse_nexttok13             ; brif not
                cmpa #'9                        ; is it numeric?
                bls parse_nexttok12             ; brif so - keep skipping it
                cmpa #'A                        ; is it alpha?
                blo parse_nexttok13             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bls parse_nexttok12             ; brif so - keep skipping it
parse_nexttok13 tfr y,d                         ; calculate length of identifier
                subd parse_tokenst
                std val0+val.strlen             ; save it for reference
                ldb #token_ident                ; indicate an identifier (variable name, etc.)
                stb parse_curtok                ; record it so parse_curtoken reports the identifier, not a stale token
                rts                             ; return result (C will be clear from SUBD above; STD/LDB/STB leave C alone)
; parse_nextcharu: advance to the next input character (parse_nextchar) and return it in A folded to upper case.
; parse_toupper: fold the character already in A to upper case; anything outside 'a'..'z' is returned unchanged.
;
; On return from parse_nextcharu, Z is set only when the end of input (zero byte) was reached: the compares
; and the subtraction below can only leave Z set on a path that then goes on to produce a nonzero result.
parse_nextcharu bsr parse_nextchar              ; fetch next input character
                beq parse_toupper0              ; brif end of input
parse_toupper   cmpa #'a                        ; is it lower case alpha?
                blo parse_toupper0              ; brif not
                cmpa #'z                        ; is it still lower case alpha?
                bhi parse_toupper0              ; brif not
                suba #0x20                      ; adjust to upper case alpha
parse_toupper0  rts                             ; Z only set here if input was zero entering from parse_nextcharu
; This routine parses tokens using the table pointed to by X. The table is structured as follows:
;
; * two bytes which contain the length of the table less the two bytes for this length value
; * a sequence of entries consisting of a single byte matching character and a token code followed
;   by an optional sub table, structured exactly the same way.
;
; The optional subtable will be present if the token code is token_eot
;
; If the character match is negative, it means a lookahead failed. The negative value is the number
; of characters to unget and the token code is the token value to return. No other entries after this
; in a table will be considered since this negative match is a global match.
;
; When a token_eot match is found, if there are no further characters in the input, the match is
; determined to be invalid and processing continues with the next entry.
;
; Entry:
; A             Current input character (already folded to upper case by the caller)
; X             Points to the table (its length word)
; Y             Input pointer, resting on the character in A
;
; Exit:
; B             Token code of the matching entry
; Y             On the last character of the match (after any unget)
; CC.C          clear
;
; No match:
; CC.C          set; B is destroyed and Y may have been advanced, so the caller must reload its input pointer
;
; X is overwritten in both cases. A sub table is handled by discarding the current stack frame and
; restarting on the sub table, so a failure inside a sub table is a failure of the whole lookup.
parse_wordtab0  leas 3,s                        ; clean up stack for sub table handling
parse_wordtab   pshs a,x                        ; save input character and start of table
                ldd ,x++                        ; get length of this table (X now points at the first entry)
                stx 1,s                         ; the length excludes its own two bytes, so measure from the first entry
                addd 1,s                        ; calculate the address of the end of the table
                std 1,s                         ; save end address for comparison later
                lda ,s                          ; get back input character
parse_wordtab1  ldb 1,x                         ; fetch token code for this entry (second byte; X has not moved yet)
                cmpa ,x++                       ; does this entry match? (X moves past the entry either way)
                bne parse_wordtab4              ; brif not
                cmpb #token_eot                 ; is it indicating a sub table?
                bne parse_wordtab6              ; brif not
                bsr parse_nextcharu             ; fetch next input character (for sub table match)
                bne parse_wordtab0              ; brif we are going to check the sub table
parse_wordtab2  ldd ,x++                        ; fetch length of sub table
                leax d,x                        ; move past sub table
parse_wordtab3  lda ,s                          ; get back input character
                cmpx 1,s                        ; are we at the end of the table?
                blo parse_wordtab1              ; brif not - check another entry
                comb                            ; indicate no match
                puls a,x,pc                     ; clean up stack and return
parse_wordtab4  lda -2,x                        ; get the match character
                bmi parse_wordtab5              ; brif negative - lookahead fail
                cmpb #token_eot                 ; is there a sub table to skip?
                beq parse_wordtab2              ; brif so - skip sub table
                bra parse_wordtab3              ; otherwise just move to the next entry
parse_wordtab5  leay a,y                        ; move back the specified number of characters
parse_wordtab6  clra                            ; clear C to indicate a match
                puls a,x,pc                     ; clean up stack and return
; Number parsing entry point, reached from parse_nexttok when a token starts with a digit or '.'.
; Currently a stub: every number is reported as a syntax error via parse_tokerr.
parse_number    jmp parse_tokerr
; Parse tokens - define them in order using the macro parse_tokdef
;
; "parse_tokdef name,handler" gives name the next sequential token number (starting from zero) and emits the
; handler address as a two byte table entry. The token number is therefore also the index of the token's
; statement handler in parse_stmtjump, which is how parse dispatches. Because the numbers come from the
; order of the lines below, entries must not be reordered without checking everything that depends on them.
; Tokens that cannot start a statement yet simply point at parse_noop.
                *pragmapush list
                *pragma nolist
parse_toknum    set 0
parse_tokdef    macro noexpand
\1              equ parse_toknum
parse_toknum    set parse_toknum+1
                fdb \2
                endm
                *pragmapop list
parse_stmtjump  parse_tokdef token_error,parse_tokerr
                parse_tokdef token_eot,parse_noop
                parse_tokdef token_lt,parse_noop
                parse_tokdef token_le,parse_noop
                parse_tokdef token_gt,parse_noop
                parse_tokdef token_ge,parse_noop
                parse_tokdef token_eq,parse_noop
                parse_tokdef token_ne,parse_noop
                parse_tokdef token_reltrue,parse_noop // always true relational operator
                parse_tokdef token_stmtsep,parse_noop
                parse_tokdef token_apos,parse_rem
                parse_tokdef token_special,parse_noop
                parse_tokdef token_bang,parse_noop
                parse_tokdef token_hash,parse_noop
                parse_tokdef token_dollar,parse_noop
                parse_tokdef token_percent,parse_noop
                parse_tokdef token_amp,parse_noop
                parse_tokdef token_oparen,parse_noop
                parse_tokdef token_cparen,parse_noop
                parse_tokdef token_star,parse_noop
                parse_tokdef token_plus,parse_noop
                parse_tokdef token_comma,parse_noop
                parse_tokdef token_minus,parse_noop
                parse_tokdef token_slash,parse_noop
                parse_tokdef token_semi,parse_noop
                parse_tokdef token_at,parse_noop
                parse_tokdef token_exp,parse_noop
                parse_tokdef token_ident,parse_noop
                parse_tokdef token_rem,parse_noop
                parse_tokdef token_return,parse_noop
                parse_tokdef token_run,parse_noop
                parse_tokdef token_data,parse_noop
                parse_tokdef token_else,parse_noop
                parse_tokdef token_end,parse_noop
                parse_tokdef token_stop,parse_noop
                parse_tokdef token_sub,parse_noop
                parse_tokdef token_let,parse_noop
                parse_tokdef token_list,parse_noop
                parse_tokdef token_new,parse_noop
                parse_tokdef token_not,parse_noop
                parse_tokdef token_print,parse_noop
                parse_tokdef token_pop,parse_noop
                parse_tokdef token_to,parse_noop
                parse_tokdef token_and,parse_noop
                parse_tokdef token_or,parse_noop
                parse_tokdef token_go,parse_noop
                parse_tokdef token_as,parse_noop
                parse_tokdef token_asc,parse_noop
; Statement handler for the ' (token_apos) remark token. Currently a placeholder that returns immediately.
; NOTE(review): it neither consumes input nor changes parse_curtok, and parse re-dispatches token_apos for as
; long as parse_curtok still holds it, so as written a remark makes parse loop through here indefinitely.
; A real implementation presumably has to consume the rest of the line - confirm the intended encoding.
parse_rem       rts

                *pragmapop list