view src/parse.s @ 121:5d5472b11ccd

Initial skeleton of a separate parsing scheme. This is the first commit in a long series related to separating the parsing of the input code from the execution of the code. It should allow for more efficient, and probably simpler, execution while giving quicker feedback when someone types in syntactically invalid code.
author William Astle <lost@l-w.ca>
date Sun, 31 Dec 2023 17:44:39 -0700
parents
children 5681cdada362
line wrap: on
line source

                *pragmapush list
                *pragma list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This is the overall parsing package. This is responsible for converting program text into the internal byte code and
; reporting any syntax errors and anything else reasonably detectable at parse time without having overly complicated
; code analysis.
;
; This is a recursive descent parser.
;
; Entry:
; X             Points to the text to encode (a zero byte marks the end of the input)
; B             Nonzero to prevent generating any output (error check/length calculation only)
;
; Exit:
; U             Points to the encoded line (the value freestart had on entry)
; D             Length of the encoded line
; CC.C          clear
;
; On a successful exit, freestart is left wherever parse_write last set it; it is only put back on the error exit.

; Error Exit:
; B             Error code, as left in B by whichever routine flagged the error
; U             Pointer to the start of the token being processed when the error occurred (parse_tokenst)
; CC.C          set
;
; X and Y are used as working registers; Y is the input pointer for the tokenizer.
parse           stb parse_noout                 ; save no-output flag
                leay ,x                         ; save input pointer in a less useful register
                ldu freestart                   ; point to start of free memory where we will build the output
                pshs u                          ; save original free memory location
parse_nextstmt  jsr parse_nexttok               ; fetch the next token, return type in B
                bcc parse0                      ; brif we succeeded in parsing a token
parse_error     puls u                          ; restore original free memory location - deallocate any encoding
                stu freestart
                ldu parse_tokenst               ; get start location we started parsing the token at
                rts                             ; return error condition (C still set - nothing above changes it)
parse0          ldx #parse_stmtjump             ; point to jump table for token type handler
                abx                             ; offset to handler address (two bytes per entry so add B twice)
                abx
                jsr [,x]                        ; call handler
                bcs parse_error                 ; brif handler flagged error
                jsr parse_curtoken              ; get the token we terminated on
                cmpb #token_eot                 ; end of input?
                bne parse1                      ; brif not
                ldb #bc_eol                     ; stash an end of line op
                bsr parse_write
                bcs parse_error                 ; brif we errored out writing to the result (OM?)
                tfr u,d                         ; calculate the length of the result
                subd ,s                         ; (end of output less the saved start; leaves C clear)
                puls u,pc                       ; get pointer to start of encoded result and return (C is already clear)
parse1          cmpb #token_stmtsep             ; statement separator?
                beq parse_nextstmt              ; brif so - do another statement
                cmpb #token_apos                ; ' token?
                beq parse0                      ; brif so - parse it as a new statement
                                                ; NOTE(review): no new token is fetched here, so this relies on the
                                                ; ' handler leaving a different current token - confirm
                comb                            ; set C for error
                ldb #err_sn                     ; raise syntax error
                bra parse_error
; Append one byte to the encoded output.
;
; Entry:
; B             Byte to write
; U             Current output pointer
;
; Exit:
; U             Advanced by one
; CC.C          clear
;
; Error Exit:
; B             err_om
; CC.C          set
;
; When parse_noout is nonzero nothing is stored and freestart is untouched; U is still advanced so the caller can
; work out the encoded length. A and X are used as scratch.
;
; parse_noop is the shared RTS at the end; it doubles as the "do nothing" statement handler.
parse_write     lda parse_noout                 ; are we doing output?
                beq parse_write0                ; brif so
                leau 1,u                        ; just count up the output and don't do anything
                andcc #0xfe                     ; counting cannot fail - force C clear since nothing above touches it
                rts
parse_write0    leax -stackheadroom,s           ; calculate bottom of stack with headroom
                cmpx freestart                  ; did the stack run into the end of the output?
                bhs parse_write1                ; brif not - we're good
                ldb #err_om                     ; raise out of memory error, C already set from comparison
                rts
parse_write1    stb ,u+                         ; save output byte
                stu freestart                   ; save new top of used memory
parse_noop      rts                             ; return all clear - C clear from comparison above
; Return in B the token code most recently recorded in parse_curtok. No other register is changed.
parse_curtoken  ldb parse_curtok                ; fetch token code of current token
                rts
; Raise a syntax error: return with C set and err_sn in B. This is also the statement handler for token_error.
parse_tokerr    comb                            ; flag error - unexpected token (COMB always sets C)
                ldb #err_sn                     ; raise syntax error (loading B leaves C alone)
                rts
; parse_nextchar: move Y to the next input character, unless Y is already on the zero byte that ends the input, then
; fall into parse_curchar.
; parse_curchar: fetch the character at Y without moving.
;
; Exit (both):
; A             Character at Y
; CC.Z          set if that character is the end of input marker (zero)
parse_nextchar  lda ,y                          ; at end of input already?
                beq parse_curchar               ; brif so - never step past the end marker
                leay 1,y                        ; move to next input character
parse_curchar   lda ,y                          ; fetch input character
                rts
; Fetch the next token from the input.
;
; Entry:
; Y             Input pointer
;
; Exit:
; B             Token type; also recorded in parse_curtok
; Y             Points past the token (at end of input it stays on the zero byte so later fetches see it again)
; CC.C          clear
;
; Error Exit:
; B             Error code
; CC.C          set
;
; Leading spaces are skipped. parse_tokenst is set to the first character of every token other than end of input.
; Numbers are handed off to parse_number, single character tokens come from parse_chartab, runs of <, =, and > are
; folded into one relational token using parse_reltab, and anything starting with a letter goes to the keyword and
; identifier code at parse_nexttok12. A and X are used as scratch.
parse_nexttok   bsr parse_curchar               ; fetch current input
                beq parse_nexttok1              ; brif end of input
parse_nexttok0  cmpa #0x20                      ; space?
                bne parse_nexttok2              ; brif not
                bsr parse_nextchar              ; eat the space
                bne parse_nexttok0              ; brif not end of input
parse_nexttok1  ldb #token_eot                  ; flag end of input
                stb parse_curtok                ; save token type
                clra                            ; clear C to indicate no error
                rts                             ; return without advancing - there is nothing to eat past the end
parse_nexttok2  sty parse_tokenst               ; save start of current token after skipping spaces
                cmpa #'.                        ; leading decimal?
                beq parse_nexttok3              ; brif so - parse number
                cmpa #'0                        ; is it a digit
                blo parse_nexttok4              ; brif not
                cmpa #'9                        ; is it still a digit?
                bhi parse_nexttok4              ; brif not
parse_nexttok3  jmp parse_number                ; go parse a number
parse_nexttok4  ldx #parse_chartab              ; point to list of single character tokens to recognize
parse_nexttok5  ldb 1,x                         ; get token value
                cmpa ,x++                       ; character match (and move to next entry)
                bne parse_nexttok7              ; brif not
parse_nexttok6  stb parse_curtok                ; save token type
                leay 1,y                        ; eat the input character
                clra                            ; clear C to indicate no error (this also sets Z)
                rts
parse_nexttok7  cmpb #token_eot                 ; end of table?
                bne parse_nexttok5              ; brif not
                clrb                            ; initialize relational flags
                pshs d                          ; save input character (at ,s) and relational flags (at 1,s) for later
parse_nexttok8  cmpa #'<                        ; less than?
                blo parse_nexttok9              ; brif not <, =, or >
                cmpa #'>                        ; still <, =, or >?
                bhi parse_nexttok9              ; brif not
                suba #'<                        ; adjust < to 0
                cmpa #1                         ; set C if <, clear if = or >
                rola                            ; now 4 if >, 2 if =, or 1 if <
                eora 1,s                        ; toggle this operator's bit in the flags collected so far
                cmpa 1,s                        ; a repeated operator clears its bit, which makes the result smaller
                blo parse_nexttok9              ; brif it's a dupe - we won't recognize more in the token
                sta 1,s                         ; save new relational flags
                bsr parse_nextchar              ; fetch next input
                sta ,s                          ; save input character
                bne parse_nexttok8              ; brif there was one - go handle it
parse_nexttok9  puls d                          ; get back input character and relational flag
                tstb                            ; was it a relational operator?
                beq parse_nexttok10             ; brif not
                ldx #parse_reltab               ; point to relational operator token table
                ldb b,x                         ; get the token code
                stb parse_curtok                ; save token type so parse_curtoken reports this token
                clra                            ; flag no error
                rts                             ; return - but don't advance - we already did looking for multiples
parse_nexttok10 bsr parse_toupper               ; convert to upper case
                cmpa #'A                        ; is it alpha?
                blo parse_nexttok11             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bls parse_nexttok12             ; brif so
parse_nexttok11 comb                            ; flag error - unrecognized token
                ldb #err_sn                     ; return an error code, the same one parse_number's failure path gives
                rts
; parse_nextcharu: fetch the next input character via parse_nextchar and fold it to upper case.
; parse_toupper: fold the character in A to upper case if it is in the range a-z; anything else is left alone.
;
; Exit:
; A             Character, upper cased if it was a lower case letter
; CC.Z          From parse_nextcharu: set only when the end of input was reached
parse_nextcharu bsr parse_nextchar              ; fetch next input character
                beq parse_toupper0              ; brif end of input
parse_toupper   cmpa #'a                        ; is it lower case alpha?
                blo parse_toupper0              ; brif not
                cmpa #'z                        ; is it still lower case alpha?
                bhi parse_toupper0              ; brif not
                suba #0x20                      ; adjust to upper case alpha
parse_toupper0  rts                             ; Z only set here if input was zero entering from parse_nextcharu
; We parse alpha keywords and identifiers here, of the form [a-zA-Z][a-zA-Z0-9]* with possible nonalpha characters
; in actual keywords. We use a table to parse keywords. As soon as we find a character that doesn't match a keyword
; table entry, we fall back to looking for the end of an identifier and then returning that.
;
; Entered from parse_nexttok with the upper cased first letter in A and Y pointing at it. A keyword match returns
; through parse_nexttok6, which records the token and eats its final character. Otherwise the rest of the
; identifier is scanned, its length is stored at val0+val.strlen, and token_ident is returned with Y pointing at the
; first character after the identifier.
parse_nexttok12 ldx #parse_wordtab              ; point to keyword table
                bsr parse_nexttok16             ; process this table entry
                cmpb #token_ident               ; did we match a token?
                bne parse_nexttok6              ; brif so - go return it
parse_nexttok13 cmpa #'0                        ; was it alphanumeric?
                blo parse_nexttok15             ; brif not
                cmpa #'9                        ; was it numeric?
                bls parse_nexttok14             ; brif so
                cmpa #'A                        ; was it alpha?
                blo parse_nexttok15             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bhi parse_nexttok15             ; brif not
parse_nexttok14 bsr parse_nextcharu             ; fetch next character and force upper case
                bne parse_nexttok13             ; if not end of input, see if we have alphanumeric
parse_nexttok15 tfr y,d                         ; fetch input location
                subd parse_tokenst              ; calculate length of token
                std val0+val.strlen             ; save the length of the identifier
                ldb #token_ident                ; set token type to identifier (variable name, probably)
                stb parse_curtok                ; record it as the current token so parse_curtoken reports it
                rts                             ; return token type, do not advance since we already did above
                                                ; (C is clear from the subtraction; the loads and stores leave it be)
; Parsing a potential keyword here. This works using a recursive lookup table. Each lookup table starts with a 16 bit
; size entry for the table. Each entry is then 2 bytes. The first is the character to
; match for this entry. The second is either token_eot to indicate a sub table needs to be consulted, token_ident to
; indicate that the token should be parsed as an identifier, or a token type code which indicates the value should
; be accepted. If a sub table is to be consulted, the table will appear inline with the same format. Should matching
; fall off the end of a table, token_ident is returned with the input pointer left on the character that failed to
; match so the caller can carry on scanning an identifier from there.
;
; If the match character is negative, the match character represents the number of characters to "unget" and then
; return the specified token. This is for handling look-aheads.
;
; Entry:
; A             Current input character (upper cased)
; X             Points to the table to search
; Y             Input pointer
;
; Exit:
; B             Matched token type, or token_ident if there was no keyword match
; A             The input character last looked at (zero if the end of input was hit)
;
; X is not preserved. The three bytes pushed on entry hold the input character at ,s and the end of table pointer
; at 1,s.
;
; NOTE(review): the end of table pointer is the address of the size word plus the size, while the sub table skip at
; parse_nexttok19 adds the size to the address just past the size word. Confirm which one matches the way
; parse_wordtab is built.
; NOTE(review): when the input ends part way through a keyword, no look-ahead entry gets consulted, so the text is
; returned as an identifier - confirm that is wanted.
parse_nexttok16 pshs a,x                        ; save input character and reserve space for the end of table pointer
                ldd ,x++                        ; get size of the table
                addd 1,s                        ; set pointer to end of table
                std 1,s
parse_nexttok17 lda ,s                          ; get back input character - D is used as scratch for table sizes
                cmpa ,x++                       ; does this entry match?
                beq parse_nexttok21             ; brif so
                ldb -2,x                        ; was this a look-ahead non-match?
                bpl parse_nexttok19             ; brif not
                leay b,y                        ; back up the input pointer
                ldb -1,x                        ; get match token
parse_nexttok18 puls a,x,pc                     ; clean up stack and return the matched token
parse_nexttok19 ldb -1,x                        ; is there a sub table?
                cmpb #token_eot
                bne parse_nexttok20             ; brif not
                ldd ,x++                        ; move past the sub table
                leax d,x
parse_nexttok20 cmpx 1,s                        ; did we reach the end of this table?
                blo parse_nexttok17             ; brif not
                ldb #token_ident                ; flag identifier required
                puls a,x,pc                     ; restore input character, clean up stack, and return
parse_nexttok21 ldb -1,x                        ; what token did we match?
                cmpb #token_eot                 ; sub table?
                bne parse_nexttok18             ; brif not - ding! ding! ding! we have a match
                leas 3,s                        ; clean up stack
                bsr parse_nextcharu             ; fetch next input character
                bne parse_nexttok16             ; process sub table entries if we have input
                ldb #token_ident                ; indicate we have an ident
                rts                             ; leave Y on the end of input so the identifier length comes out right
; Placeholder for numeric literal parsing: for now anything that starts like a number is reported through
; parse_tokerr (C set, B = err_sn).
parse_number    jmp parse_tokerr
; Relational token table, bits are > = <
; The index is the set of flags built by parse_nexttok: 4 if > was seen, 2 if = was seen, 1 if < was seen.
; Index 0 (no operator seen) is never looked up because parse_nexttok tests for zero flags first.
parse_reltab    fcb token_error
                fcb token_lt
                fcb token_eq
                fcb token_le
                fcb token_gt
                fcb token_ne
                fcb token_ge
                fcb token_reltrue
; Single character token lookup table
; Each entry is the character code followed by the token type to return for it. parse_nexttok scans the entries in
; order and stops at the one whose token type is token_eot; its character code is zero, which never matches since
; the end of input is handled before the table is consulted.
parse_chartab   fcb 0x21,token_bang             ; !
                fcb 0x23,token_hash             ; #
                fcb 0x24,token_dollar           ; $
                fcb 0x25,token_percent          ; %
                fcb 0x26,token_amp              ; &
                fcb 0x27,token_apos             ; '
                fcb 0x28,token_oparen           ; (
                fcb 0x29,token_cparen           ; )
                fcb 0x2a,token_star             ; *
                fcb 0x2b,token_plus             ; +
                fcb 0x2c,token_comma            ; ,
                fcb 0x2d,token_minus            ; -
                fcb 0x2f,token_slash            ; /
                fcb 0x3a,token_stmtsep          ; :
                fcb 0x3b,token_semi             ; ;
                fcb 0x3f,token_print            ; ? - print shortcut
                fcb 0x40,token_at               ; @
                fcb 0x5e,token_exp              ; ^ - exponentiation
                fcb 0x00,token_eot              ; end of table flag
; Parse tokens - define them in order using the macro parse_tokdef
; Each use of parse_tokdef gives the symbol named by its first argument the next token number (counting up from
; zero) and emits the second argument as a 16 bit word. The result is that the words emitted line up with the token
; numbers, which is what lets the table be indexed by token number times two.
                *pragmapush list
                *pragma nolist
parse_toknum    set 0
parse_tokdef    macro noexpand
\1              equ parse_toknum
parse_toknum    set parse_toknum+1
                fdb \2
                endm
                *pragmapop list
; Statement dispatch table. parse indexes this by token type (two bytes per entry) and calls the handler for the
; token that starts a statement. The order of the lines below is also what assigns the token numbers, so
; token_error is 0, token_eot is 1, and so on. Most handlers are still parse_noop in this skeleton.
parse_stmtjump  parse_tokdef token_error,parse_tokerr
                parse_tokdef token_eot,parse_noop
                parse_tokdef token_lt,parse_noop
                parse_tokdef token_le,parse_noop
                parse_tokdef token_gt,parse_noop
                parse_tokdef token_ge,parse_noop
                parse_tokdef token_eq,parse_noop
                parse_tokdef token_ne,parse_noop
                parse_tokdef token_reltrue,parse_noop ; always true relational operator
                parse_tokdef token_stmtsep,parse_noop
                parse_tokdef token_apos,parse_rem
                parse_tokdef token_special,parse_noop
                parse_tokdef token_bang,parse_noop
                parse_tokdef token_hash,parse_noop
                parse_tokdef token_dollar,parse_noop
                parse_tokdef token_percent,parse_noop
                parse_tokdef token_amp,parse_noop
                parse_tokdef token_oparen,parse_noop
                parse_tokdef token_cparen,parse_noop
                parse_tokdef token_star,parse_noop
                parse_tokdef token_plus,parse_noop
                parse_tokdef token_comma,parse_noop
                parse_tokdef token_minus,parse_noop
                parse_tokdef token_slash,parse_noop
                parse_tokdef token_semi,parse_noop
                parse_tokdef token_at,parse_noop
                parse_tokdef token_exp,parse_noop
                parse_tokdef token_ident,parse_noop
                parse_tokdef token_rem,parse_noop
                parse_tokdef token_return,parse_noop
                parse_tokdef token_run,parse_noop
                parse_tokdef token_data,parse_noop
                parse_tokdef token_else,parse_noop
                parse_tokdef token_end,parse_noop
                parse_tokdef token_stop,parse_noop
                parse_tokdef token_sub,parse_noop
                parse_tokdef token_let,parse_noop
                parse_tokdef token_list,parse_noop
                parse_tokdef token_new,parse_noop
                parse_tokdef token_not,parse_noop
                parse_tokdef token_print,parse_noop
                parse_tokdef token_pop,parse_noop
                parse_tokdef token_to,parse_noop
                parse_tokdef token_and,parse_noop
                parse_tokdef token_or,parse_noop
                parse_tokdef token_go,parse_noop
; Handler for the ' (remark) token. Everything up to the end of the input belongs to the remark, so skip it and
; report end of input as the token we stopped on. Without this the main loop in parse sees the ' token again after
; the handler returns and dispatches straight back here without ever moving forward.
;
; Entry:
; Y             Input pointer, just past the ' character
;
; Exit:
; Y             Points at the zero byte that ends the input
; parse_curtok  token_eot
; CC.C          clear
;
; TODO: nothing is written to the encoded output for the remark text yet.
parse_rem       lda ,y+                         ; fetch the next character of the remark and step past it
                bne parse_rem                   ; brif that was not the end of the input
                leay -1,y                       ; step back so Y rests on the end of input marker
                ldb #token_eot                  ; the remark swallowed the rest of the line
                stb parse_curtok                ; so end of input is the token we terminated on
                clra                            ; clear C - no error
                rts

                *pragmapop list