view src/parse.s @ 123:5681cdada362

Redo keyword table handling to handle keywords differing in length. Some keywords differ only due to length; that is, the shorter keyword matches the leading characters of the longer one. Make the keyword table builder and processor handle these cases. Also re-implement the handler based on evolved understanding of its requirements.
author William Astle <lost@l-w.ca>
date Mon, 01 Jan 2024 15:15:45 -0700
parents 5d5472b11ccd
children 8770e6f977c3
line wrap: on
line source

                *pragmapush list
                *pragma list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This is the overall parsing package. This is responsible for converting program text into the internal byte code and
; reporting any syntax errors and anything else reasonably detectable at parse time without having overly complicated
; code analysis.
;
; This is a recursive descent parser.
;
; Entry:
; X             Points to the text to encode
; B             Nonzero to prevent generating any output (error check/length calculation only)
;
; Exit:
; U             Points to the encoded line
; D             Length of the encoded line
; CC.C          clear

; Error Exit:
; B             Error code
; U             Pointer into the input text at the start of the most recently started token (parse_tokenst)
; CC.C          set
; Register use inside parse: Y walks the input text, U is the output write pointer, and the original value of
; freestart is kept on the stack so the partial encoding can be discarded if an error is raised.
parse           stb parse_noout                 ; save no-output flag
                leay ,x                         ; save input pointer in a less useful register
                ldu freestart                   ; point to start of free memory where we will build the output
                pshs u                          ; save original free memory location
parse_nextstmt  jsr parse_nexttok               ; fetch the next token, token type comes back in B
                bcc parse0                      ; brif we succeeded in parsing a token
parse_error     puls u                          ; restore original free memory location - deallocate any encoding
                stu freestart                   ; (none of puls/stu/ldu touch C so the error flag survives)
                ldu parse_tokenst               ; get start location we started parsing the token at
                rts                             ; return error condition, B as left by whoever raised the error
parse0          ldx #parse_stmtjump             ; point to jump table for token type handler
                abx                             ; offset to handler address - entries are two bytes so add B twice
                abx
                jsr [,x]                        ; call handler
                bcs parse_error                 ; brif handler flagged error
                jsr parse_curtoken              ; get the token we terminated on
                cmpb #token_eot                 ; end of input?
                bne parse1                      ; brif not
                ldb #bc_eol                     ; stash an end of line op
                bsr parse_write
                bcs parse_error                 ; brif we errored out writing to the result (OM?)
                tfr u,d                         ; calculate the length of the result
                subd ,s                         ; current output pointer less the saved start of the output
                puls u,pc                       ; get pointer to start of encoded result and return (C is already clear)
parse1          cmpb #token_stmtsep             ; statement separator?
                beq parse_nextstmt              ; brif so - do another statement
                cmpb #token_apos                ; ' token?
; NOTE(review): the handler for token_apos (parse_rem) is currently a bare rts that consumes no input, so the
; branch below appears to re-dispatch the same token indefinitely until parse_rem is implemented - confirm.
                beq parse0                      ; brif so - parse it as a new statement
                comb                            ; set C for error
                ldb #err_sn                     ; raise syntax error
                bra parse_error
; Write one byte of encoded output.
;
; Entry:
; B             Byte to write
; U             Output pointer
;
; Exit:
; U             Advanced by one
; A             Clobbered (X is also clobbered when output is enabled)
; CC.C          clear
;
; Error Exit:
; B             err_om
; CC.C          set
;
; When parse_noout is nonzero nothing is stored and no memory check is made; U is still advanced so the caller
; can work out the encoded length.
;
; parse_noop is the final rts of this routine. It is also used directly as a do-nothing handler in the
; parse_stmtjump table, in which case the flags are simply whatever the caller had on entry.
parse_write     lda parse_noout                 ; are we doing output?
                beq parse_write0                ; brif so
                leau 1,u                        ; just count up the output and don't do anything
                andcc #0xfe                     ; explicitly clear C - nothing above sets it and callers test it
                rts
parse_write0    leax -stackheadroom,s           ; calculate bottom of stack with headroom
                cmpx freestart                  ; did the stack run into the end of the output?
                bhs parse_write1                ; brif not - we're good
                ldb #err_om                     ; raise out of memory error, C already set from comparison
                rts
parse_write1    stb ,u+                         ; save output byte
                stu freestart                   ; save new top of used memory
parse_noop      rts                             ; return all clear - C clear from comparison above
; Fetch the most recently recorded token code.
;
; Exit:
; B             Contents of parse_curtok (N and Z reflect the loaded value)
parse_curtoken  ldb parse_curtok                ; fetch token code of current token
                rts
; Raise a syntax error for an unexpected token.
;
; Exit:
; B             err_sn
; CC.C          set
parse_tokerr    ldb #err_sn                     ; error code to hand back to the caller
                orcc #1                         ; set C to flag the error
                rts
; parse_nextchar: move Y on to the next input character, unless Y is already on the NUL that terminates the
; input, then fall into parse_curchar.
; parse_curchar: fetch the character Y points to.
;
; Exit:
; A             Character at Y
; CC.Z          set if that character is the terminating NUL
parse_nextchar  tst ,y                          ; already sitting on the terminator?
                beq parse_curchar               ; brif so - never step past the end of the input
                leay 1,y                        ; otherwise advance by one character
parse_curchar   lda ,y                          ; pick up the character at the input pointer
                rts
; Fetch the next token from the input text.
;
; Leading spaces are skipped. Tokens starting with a digit or '.' are handed to parse_number. Single character
; tokens are looked up in parse_chartab. Runs of <, = and > (each used at most once) are combined into a single
; relational token via parse_reltab. Anything starting with a letter continues at parse_nexttok12 (keywords and
; identifiers). Any other character is a syntax error.
;
; Entry:
; Y             Input pointer
;
; Exit:
; B             Token code, also stored in parse_curtok
; Y             Points to the first character after the token (left on the NUL terminator for token_eot)
; CC.C          clear
;
; Error Exit:
; B             err_sn
; CC.C          set
;
; parse_tokenst is set to the start of the token once leading spaces have been skipped. It is not updated when
; the end of the input is reached.
parse_nexttok   bsr parse_curchar               ; fetch current input
                beq parse_nexttok1              ; brif end of input
parse_nexttok0  cmpa #0x20                      ; space?
                bne parse_nexttok2              ; brif not
                bsr parse_nextchar              ; eat the space
                bne parse_nexttok0              ; brif not end of input
parse_nexttok1  ldb #token_eot                  ; flag end of input
                stb parse_curtok                ; save token type
                clra                            ; clear C to indicate no error
                rts                             ; return without advancing - Y must stay on the NUL terminator
parse_nexttok2  sty parse_tokenst               ; save start of current token after skipping spaces
                cmpa #'.                        ; leading decimal?
                beq parse_nexttok3              ; brif so - parse number
                cmpa #'0                        ; is it a digit
                blo parse_nexttok4              ; brif not
                cmpa #'9                        ; is it still a digit?
                bhi parse_nexttok4              ; brif not
parse_nexttok3  jmp parse_number                ; go parse a number
parse_nexttok4  ldx #parse_chartab              ; point to list of single character tokens to recognize
parse_nexttok5  ldb 1,x                         ; get token value
                cmpa ,x++                       ; character match (and move to next entry)
                bne parse_nexttok7              ; brif not
parse_nexttok6  stb parse_curtok                ; save token type
                leay 1,y                        ; eat the input character
                clra                            ; clear C to indicate no error (note that this also sets Z)
                rts
parse_nexttok7  cmpb #token_eot                 ; end of table?
                bne parse_nexttok5              ; brif not
                clrb                            ; initialize relational flags
                pshs d                          ; save input character (at ,s) and relational flags (at 1,s) for later
parse_nexttok8  cmpa #'<                        ; less than?
                blo parse_nexttok9              ; brif not <, =, or >
                cmpa #'>                        ; still <, =, or >?
                bhi parse_nexttok9              ; brif not
                suba #'<                        ; adjust < to 0
                cmpa #1                         ; set C if <, clear if = or >
                rola                            ; now 4 if >, 2 if =, or 1 if <
                ora 1,s                         ; merge with previous relational characters
                cmpa 1,s                        ; if nothing changed, this character was already seen - a dupe
                beq parse_nexttok9              ; brif it's not valid - we won't recognize more in the token
                sta 1,s                         ; save new relational flags
                bsr parse_nextchar              ; fetch next input
                sta ,s                          ; save input character
                bne parse_nexttok8              ; brif there was one - go handle it
parse_nexttok9  puls d                          ; get back input character and relational flag
                tstb                            ; was it a relational operator?
                beq parse_nexttok10             ; brif not
                ldx #parse_reltab               ; point to relational operator token table
                ldb b,x                         ; get the token code
                stb parse_curtok                ; save token type so parse_curtoken sees this token
                clra                            ; flag no error
                rts                             ; return - but don't advance - we already did looking for multiples
parse_nexttok10 bsr parse_toupper               ; convert to upper case
                cmpa #'A                        ; is it alpha?
                blo parse_nexttok11             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bls parse_nexttok12             ; brif so
parse_nexttok11 comb                            ; flag error - unrecognized token
                ldb #err_sn                     ; callers pass B on as an error code so return one, not a token code
                rts
; parse_nextcharu: advance to the next input character (via parse_nextchar) and force it to upper case.
; parse_toupper: force the character in A to upper case if it is in the range a-z; any other value is returned
; unchanged.
;
; Exit:
; A             The (possibly converted) character
; CC.Z          set only when parse_nextcharu found the end of the input
parse_nextcharu bsr parse_nextchar              ; fetch next input character
                beq parse_toupper0              ; brif end of input
parse_toupper   cmpa #'a                        ; is it lower case alpha?
                blo parse_toupper0              ; brif not
                cmpa #'z                        ; is it still lower case alpha?
                bhi parse_toupper0              ; brif not
                suba #0x20                      ; adjust to upper case alpha
parse_toupper0  rts                             ; Z only set here if input was zero entering from parse_nextcharu
; We parse alpha keywords and identifiers here, of the form [a-zA-Z][a-zA-Z0-9]* with possible nonalpha characters
; in actual keywords. We use a table to parse keywords. As soon as we find a character that doesn't match a keyword
; table entry, we fall back to looking for the end of an identifier and then returning that.
;
; Entry (from parse_nexttok):
; A             First character of the token, forced to upper case
; Y             Points to that character
;
; Exit for a keyword is via parse_nexttok6 with the token code in B.
;
; Exit for an identifier:
; B             token_ident, also stored in parse_curtok
; A             zero
; Y             Points to the first character after the identifier
; CC.C          clear
; val0+val.strlen holds the length of the identifier.
;
; NOTE(review): parse_wordtab is the label of the table walking routine in this file, so the ldx below loads the
; address of that code rather than the address of a keyword table. Confirm which label the keyword table itself
; is meant to carry and point X at that instead.
parse_nexttok12 ldx #parse_wordtab              ; point to keyword table
                bsr parse_wordtab               ; walk the table - C clear and B = token code on a match
                bcc parse_nexttok6              ; brif we matched a keyword - go return it (eats its last character)
parse_nexttok13 cmpa #'0                        ; was it alphanumeric?
                blo parse_nexttok15             ; brif not
                cmpa #'9                        ; was it numeric?
                bls parse_nexttok14             ; brif so
                cmpa #'A                        ; was it alpha?
                blo parse_nexttok15             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bhi parse_nexttok15             ; brif not
parse_nexttok14 bsr parse_nextcharu             ; fetch next character and force upper case
                bne parse_nexttok13             ; if not end of input, see if we have alphanumeric
parse_nexttok15 tfr y,d                         ; fetch input location
                subd parse_tokenst              ; calculate length of token
                std val0+val.strlen             ; save the length of the identifier
                ldb #token_ident                ; set token type to identifier (variable name, probably)
                stb parse_curtok                ; save token type so parse_curtoken sees this token
                clra                            ; clear C to indicate no error
                rts                             ; return token type, do not advance since we already did above
; This routine matches the input against a keyword table. The table is structured as follows:
;
; * two bytes which contain the length of the table less the two bytes for this length value
; * a sequence of entries consisting of a single byte matching character and a token code followed
;   by an optional sub table, structured exactly the same way.
;
; The optional subtable will be present if the token code is token_eot
;
; If the character match is negative, it means a lookahead failed. The negative value is the number
; of characters to unget and the token code is the token value to return. No other entries after this
; in a table will be considered since the negative match is a global match.
;
; When a token_eot match is found, if there are no further characters in the input, the match is
; determined to be invalid and processing continues with the next entry.
;
; Entry:
; A             Current input character, forced to upper case
; X             Points to the table (at its length word)
; Y             Input pointer, pointing at the character in A
;
; Exit (match):
; B             Token code
; Y             Left on the last character matched, or moved back by the table supplied count after a
;               lookahead failure
; CC.C          clear
;
; Exit (no match):
; A             Input character saved for the table level at which matching stopped
; Y             May have been advanced while descending into sub tables
; B             Undefined
; CC.C          set
;
; X is not preserved in either case.
parse_wordtab0  leas 3,s                        ; clean up stack for sub table handling
parse_wordtab   pshs a,x                        ; save input character and make room for the end of table address
                ldd ,x++                        ; get length of this table, X now points to the first entry
                stx 1,s                         ; the length excludes its own two bytes so measure from first entry
                addd 1,s                        ; calculate the address of the end of the table
                std 1,s                         ; save end address for comparison later
                lda ,s                          ; get back input character
parse_wordtab1  ldb 1,x                         ; fetch token code for this entry (second byte of the entry)
                cmpa ,x++                       ; does this entry match?
                bne parse_wordtab4              ; brif not
                cmpb #token_eot                 ; is it indicating a sub table?
                bne parse_wordtab6              ; brif not
                bsr parse_nextcharu             ; fetch next input character (for sub table match)
                bne parse_wordtab0              ; brif we are going to check the sub table
parse_wordtab2  ldd ,x++                        ; fetch length of sub table
                leax d,x                        ; move past sub table
parse_wordtab3  lda ,s                          ; get back input character
                cmpx 1,s                        ; are we at the end of the table?
                blo parse_wordtab1              ; brif not - check another entry
                comb                            ; indicate no match
                puls a,x,pc                     ; clean up stack and return
parse_wordtab4  lda -2,x                        ; get the match character
                bmi parse_wordtab5              ; brif negative - lookahead fail
                cmpb #token_eot                 ; is there a sub table to skip?
                beq parse_wordtab2              ; brif so - skip sub table
                bra parse_wordtab3              ; otherwise just move to the next entry
parse_wordtab5  leay a,y                        ; move back the specified number of characters
parse_wordtab6  clra                            ; clear C to indicate a match
                puls a,x,pc                     ; clean up stack and return
; Entered from parse_nexttok when a token starts with a digit or a decimal point. At present this only raises the
; unexpected token error by jumping to parse_tokerr.
parse_number    jmp parse_tokerr
; Relational token table, bits are > = <
; The table is indexed by the flag value built in parse_nexttok: 4 for >, 2 for =, 1 for <, combined for
; multi character operators.
parse_reltab    fcb token_error                 ; 0 - no relational characters (never looked up by parse_nexttok)
                fcb token_lt                    ; 1 - <
                fcb token_eq                    ; 2 - =
                fcb token_le                    ; 3 - < and =
                fcb token_gt                    ; 4 - >
                fcb token_ne                    ; 5 - < and >
                fcb token_ge                    ; 6 - > and =
                fcb token_reltrue               ; 7 - all of <, = and >
; Single character token lookup table
; Each entry is the character code followed by the token code to return for it. parse_nexttok scans the entries
; in order and stops when it reads an entry whose token code is token_eot.
parse_chartab   fcb 0x21,token_bang             ; !
                fcb 0x23,token_hash             ; #
                fcb 0x24,token_dollar           ; $
                fcb 0x25,token_percent          ; %
                fcb 0x26,token_amp              ; &
                fcb 0x27,token_apos             ; '
                fcb 0x28,token_oparen           ; (
                fcb 0x29,token_cparen           ; )
                fcb 0x2a,token_star             ; *
                fcb 0x2b,token_plus             ; +
                fcb 0x2c,token_comma            ; ,
                fcb 0x2d,token_minus            ; -
                fcb 0x2f,token_slash            ; /
                fcb 0x3a,token_stmtsep          ; :
                fcb 0x3b,token_semi             ; ;
                fcb 0x3f,token_print            ; ? - print shortcut
                fcb 0x40,token_at               ; @
                fcb 0x5e,token_exp              ; ^ - exponentiation
                fcb 0x00,token_eot              ; end of table flag
; Parse tokens - define them in order using the macro parse_tokdef
;
; parse_tokdef takes a token name and a handler address. It defines the name as the next sequential token number
; (starting from zero) and emits the handler address as a two byte entry, so the invocations below build the
; parse_stmtjump table that parse indexes by token number. The order of the invocations therefore fixes both the
; token numbering and the layout of the jump table.
                *pragmapush list
                *pragma nolist
parse_toknum    set 0
parse_tokdef    macro noexpand
\1              equ parse_toknum
parse_toknum    set parse_toknum+1
                fdb \2
                endm
                *pragmapop list
parse_stmtjump  parse_tokdef token_error,parse_tokerr
                parse_tokdef token_eot,parse_noop
                parse_tokdef token_lt,parse_noop
                parse_tokdef token_le,parse_noop
                parse_tokdef token_gt,parse_noop
                parse_tokdef token_ge,parse_noop
                parse_tokdef token_eq,parse_noop
                parse_tokdef token_ne,parse_noop
                parse_tokdef token_reltrue,parse_noop // always true relational operator
                parse_tokdef token_stmtsep,parse_noop
                parse_tokdef token_apos,parse_rem
                parse_tokdef token_special,parse_noop
                parse_tokdef token_bang,parse_noop
                parse_tokdef token_hash,parse_noop
                parse_tokdef token_dollar,parse_noop
                parse_tokdef token_percent,parse_noop
                parse_tokdef token_amp,parse_noop
                parse_tokdef token_oparen,parse_noop
                parse_tokdef token_cparen,parse_noop
                parse_tokdef token_star,parse_noop
                parse_tokdef token_plus,parse_noop
                parse_tokdef token_comma,parse_noop
                parse_tokdef token_minus,parse_noop
                parse_tokdef token_slash,parse_noop
                parse_tokdef token_semi,parse_noop
                parse_tokdef token_at,parse_noop
                parse_tokdef token_exp,parse_noop
                parse_tokdef token_ident,parse_noop
                parse_tokdef token_rem,parse_noop
                parse_tokdef token_return,parse_noop
                parse_tokdef token_run,parse_noop
                parse_tokdef token_data,parse_noop
                parse_tokdef token_else,parse_noop
                parse_tokdef token_end,parse_noop
                parse_tokdef token_stop,parse_noop
                parse_tokdef token_sub,parse_noop
                parse_tokdef token_let,parse_noop
                parse_tokdef token_list,parse_noop
                parse_tokdef token_new,parse_noop
                parse_tokdef token_not,parse_noop
                parse_tokdef token_print,parse_noop
                parse_tokdef token_pop,parse_noop
                parse_tokdef token_to,parse_noop
                parse_tokdef token_and,parse_noop
                parse_tokdef token_or,parse_noop
                parse_tokdef token_go,parse_noop
                parse_tokdef token_as,parse_noop
                parse_tokdef token_asc,parse_noop
; Handler for the ' token (see the token_apos entry in parse_stmtjump). At present it returns immediately without
; consuming any input or writing any output.
parse_rem       rts

                *pragmapop list