diff src/parse.s @ 121:5d5472b11ccd

Initital skeleton of separation of separate parsing scheme This is the first commit in a long series related to separating the parsing of the input code from the execution of the code. It should allow for more efficient, and probably simpler, execution while giving quicker feedback when someone types in syntactically invalid code.
author William Astle <lost@l-w.ca>
date Sun, 31 Dec 2023 17:44:39 -0700
parents
children 5681cdada362
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/parse.s	Sun Dec 31 17:44:39 2023 -0700
@@ -0,0 +1,290 @@
+                *pragmapush list
+                *pragma list
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; This is the overall parsing package. This is responsible for converting program text into the internal byte code and
+; reporting any syntax errors and anything else reasonably detectable at parse time without having overly complicated
+; code analysis.
+;
+; This is a recursive descent parser.
+;
+; Entry:
+; X             Points to the text to encode
+; B             Nonzero to prevent generating any output (error check/length calculation only)
+;
+; Exit:
+; U             Points to the encoded line
+; D             Length of the encoded line
+; CC.C          clear
+
+; Error Exit:
+; B             Error code
+; U             Offset to error input
+; CC.C          set
+parse           stb parse_noout                 ; save no-output flag
+                leay ,x                         ; save input pointer in a less useful register
+                ldu freestart                   ; point to start of free memory where we will build the output
+                pshs u                          ; save original free memory location
+parse_nextstmt  jsr parse_nexttok               ; fetch the next token, return type in D
+                bcc parse0                      ; brif we succeeded in parsing a token
+parse_error     puls u                          ; restore original free memory location - deallocate any encoding
+                stu freestart
+                ldu parse_tokenst               ; get start location we started parsing the token at
+                rts                             ; return error condition
+parse0          ldx #parse_stmtjump             ; point to jump table for token type handler
+                abx                             ; offset to handler address
+                abx
+                jsr [,x]                        ; call handler
+                bcs parse_error                 ; brif handler flagged error
+                jsr parse_curtoken              ; get the token we terminated on
+                cmpb #token_eot                 ; end of input?
+                bne parse1                      ; brif not
+                ldb #bc_eol                     ; stash an end of line op
+                bsr parse_write
+                bcs parse_error                 ; brif we errored out writing to the result (OM?)
+                tfr u,d                         ; calculate the length of the result
+                subd ,s
+                puls u,pc                       ; get pointer to start of encoded result and return (C is already clear)
+parse1          cmpb #token_stmtsep             ; statement separator?
+                beq parse_nextstmt              ; brif so - do another statement
+                cmpb #token_apos                ; ' token?
+                beq parse0                      ; brif so - parse it as a new statement
+                comb                            ; set C for error
+                ldb #err_sn                     ; raise syntax error
+                bra parse_error
+parse_write     lda parse_noout                 ; are we doing output?
+                beq parse_write0                ; brif so
+                leau 1,u                        ; just count up the output and don't do anything
+                rts
+parse_write0    leax -stackheadroom,s           ; calculate bottom of stack with headroom
+                cmpx freestart                  ; did the stack run into the end of the output?
+                bhs parse_write1                ; brif not - we're good
+                ldb #err_om                     ; raise out of memory error, C already set from comparison
+                rts
+parse_write1    stb ,u+                         ; save output byte
+                stu freestart                   ; save new to of used memory
+parse_noop      rts                             ; return all clear - C clear from comparison above
+parse_curtoken  ldb parse_curtok                ; fetch token code of current token
+                rts
+parse_tokerr    comb                            ; flag error - unexpected token
+                ldb #err_sn                     ; raise syntax error
+                rts
+parse_nextchar  lda ,y                          ; at end of input already?
+                beq parse_curchar               ; brif so
+                leay 1,y                        ; move to next input character
+parse_curchar   lda ,y                          ; fetch input character
+                rts
+parse_nexttok   bsr parse_curchar               ; fetch current input
+                beq parse_nexttok1              ; brif end of input
+parse_nexttok0  cmpa #0x20                      ; space?
+                bne parse_nexttok2              ; brif not
+                bsr parse_nextchar              ; eat the space
+                bne parse_nexttok0              ; brif not end of input
+parse_nexttok1  ldb #token_eot                  ; flag end of input
+                bra parse_nexttok6              ; go return it
+parse_nexttok2  sty parse_tokenst               ; save start of current token after skipping spaces
+                cmpa #'.                        ; leading decimal?
+                beq parse_nexttok3              ; brif so - parse number
+                cmpa #'0                        ; is it a digit
+                blo parse_nexttok4              ; brif not
+                cmpa #'9                        ; is it still a digit?
+                bhi parse_nexttok4              ; brif not
+parse_nexttok3  jmp parse_number                ; go parse a number
+parse_nexttok4  ldx #parse_chartab              ; point to list of single character tokens to recognize
+parse_nexttok5  ldb 1,x                         ; get token value
+                cmpa ,x++                       ; character match (and move to next entry)
+                bne parse_nexttok7              ; brif not
+parse_nexttok6  stb parse_curtok                ; save token type
+                leay 1,y                        ; eat the input character
+                clra                            ; clear C to indicate no error (and clear Z also)
+                rts
+parse_nexttok7  cmpb #token_eot                 ; end of table?
+                bne parse_nexttok5              ; brif not
+                clrb                            ; initialize relational flags
+                pshs d                          ; save input character and relational flags for later
+parse_nexttok8  cmpa #'<                        ; less than?
+                blo parse_nexttok9              ; brif not <, =, or >
+                cmpa #'>                        ; still <, =, or >?
+                bhi parse_nexttok9              ; brif not
+                suba #'<                        ; adjust < to 0
+                cmpa #1                         ; set C if <, clear if = or >
+                rola                            ; now 4 if >, 2 if =, or 1 if <
+                eora 1,s                        ; merge with previous relational characters
+                cmpa 1,s                        ; if it doesn't match, we have a dupe
+                bne parse_nexttok9              ; brif it's not valid - we won't recognize more in the token
+                sta 1,s                         ; save new relational flags
+                bsr parse_nextchar              ; fetch next input
+                sta ,s                          ; save input character
+                bne parse_nexttok8              ; brif there was one - go handle it
+parse_nexttok9  puls d                          ; get back input character and relational flag
+                tstb                            ; was it a relational operator?
+                beq parse_nexttok10             ; brif not
+                ldx #parse_reltab               ; point to relational operator token table
+                ldb b,x                         ; get the token code
+                clra                            ; flag no error
+                rts                             ; return - but don't advance - we already did looking for multiples
+parse_nexttok10 bsr parse_toupper               ; convert to upper case
+                cmpa #'A                        ; is it alpha?
+                blo parse_nexttok11             ; brif not
+                cmpa #'Z                        ; is it still alpha?
+                bls parse_nexttok12             ; brif so
+parse_nexttok11 comb                            ; flag error - unrecognized token
+                ldb #token_error
+                rts
+parse_nextcharu bsr parse_nextchar              ; fetch next input character
+                beq parse_toupper0              ; brif end of input
+parse_toupper   cmpa #'a                        ; is it lower case alpha?
+                blo parse_toupper0              ; brif not
+                cmpa #'z                        ; is it still lower case alpha?
+                bhi parse_toupper0              ; brif not
+                suba #0x20                      ; adjust to upper case alpha
+parse_toupper0  rts                             ; Z only set here if input was zero entering from parse_nextcharu
+; We parse alpha keywords and identifiers here, of the form [a-zA-Z][a-zA-Z0-9]* with a possible nonalpha characters
+; in actual keywords. We use a table to parse keywords. As soon as we find a character that doesn't match a keyword
+; table entry, we fall back to looking for the end of an identifier and then returning that.
+parse_nexttok12 ldx #parse_wordtab              ; point to keyword table
+                bsr parse_nexttok16             ; process this table entry
+                cmpb #token_ident               ; did we match a token?
+                bne parse_nexttok6              ; brif so - go return it
+parse_nexttok13 cmpa #'0                        ; was it alphanumeric?
+                blo parse_nexttok15             ; brif not
+                cmpa #'9                        ; was it numeric?
+                bls parse_nexttok14             ; brif so
+                cmpa #'A                        ; was it alpha?
+                blo parse_nexttok15             ; brif not
+                cmpa #'Z                        ; is it still alpha?
+                bhi parse_nexttok15             ; brif not
+parse_nexttok14 bsr parse_nextcharu             ; fetch next character and force upper case
+                bne parse_nexttok13             ; if not end of input, see if we have alphanumeric
+parse_nexttok15 tfr y,d                         ; fetch input location
+                subd parse_tokenst              ; calculate length of token
+                std val0+val.strlen             ; save the length of the identifier
+                ldb #token_ident                ; set token type to identifier (variable name, probably)
+                rts                             ; return token type, do not advance since we already did above
+; Parsing a potential keyword here. This works using a recursive lookup table. Each lookup table starts with a 18 bit
+; size entry for the table. Each entry is then 2 bytes. The first is the character to
+; match for this entry. The second is either token_eot to indicate a sub table needs to be consulted, token_ident to
+; indicate that the token should be parsed as an identifier, or a token type code which indicates the value should
+; be accepted. If a sub table is to be consulted, the table will appear inline with the same format. Should matching
+; fall off the end of a table, the character being considered will be "ungot" and processing will return back up the
+; call chain, ungetting characters, until the top level at which point token_ident will be returned.
+;
+; If the match character is negative, the match character represents the number of characters to "unget" and then
+; return the specified token. This is for handling look-aheads.
+parse_nexttok16 pshs a,x                        ; save input character
+                ldd ,x++                        ; get number of entries in the table
+                addd 1,s                        ; set pointer to end of table
+                std 1,s
+parse_nexttok17 cmpa ,x++                       ; does this entry match?
+                beq parse_nexttok21             ; brif so
+                ldb -2,x                        ; was this a look-ahead non-match?
+                bpl parse_nexttok19             ; brif not
+                leay b,y                        ; back up the input pointer
+                ldb -1,x                        ; get match token
+parse_nexttok18 puls a,x,pc                     ; clean up stack and return the matched token
+parse_nexttok19 ldb -1,x                        ; is there a sub table?
+                cmpb #token_eot
+                bne parse_nexttok20             ; brif not
+                ldd ,x++                        ; move past the sub table
+                leax d,x
+parse_nexttok20 cmpx 1,s                        ; did we reach the end of this table?
+                blo parse_nexttok17             ; brif not
+                ldb #token_ident                ; flag identifier required
+                puls a,x,pc                     ; restore input character, clean up stack, and return
+parse_nexttok21 ldb -1,x                        ; what token did we match?
+                cmpb #token_eot                 ; sub table?
+                bne parse_nexttok18             ; brif not - ding! ding! ding! we have a match
+                leas 3,s                        ; clean up stack
+                bsr parse_nextcharu             ; fetch next input character
+                bne parse_nexttok16             ; process sub table entries if we have input
+                ldb #token_ident                ; indicate we have an ident
+                leay -1,y                       ; unget the end of input
+                rts
+parse_number    jmp parse_tokerr
+; Relational token table, bits are > = <
+parse_reltab    fcb token_error
+                fcb token_lt
+                fcb token_eq
+                fcb token_le
+                fcb token_gt
+                fcb token_ne
+                fcb token_ge
+                fcb token_reltrue
+; Single character token lookup table
+parse_chartab   fcb 0x21,token_bang             ; !
+                fcb 0x23,token_hash             ; #
+                fcb 0x24,token_dollar           ; $
+                fcb 0x25,token_percent          ; %
+                fcb 0x26,token_amp              ; &
+                fcb 0x27,token_apos             ; '
+                fcb 0x28,token_oparen           ; (
+                fcb 0x29,token_cparen           ; )
+                fcb 0x2a,token_star             ; *
+                fcb 0x2b,token_plus             ; +
+                fcb 0x2c,token_comma            ; ,
+                fcb 0x2d,token_minus            ; -
+                fcb 0x2f,token_slash            ; /
+                fcb 0x3a,token_stmtsep          ; :
+                fcb 0x3b,token_semi             ; ;
+                fcb 0x3f,token_print            ; ? - print shortcut
+                fcb 0x40,token_at               ; @
+                fcb 0x5e,token_exp              ; ^ - exponentiation
+                fcb 0x00,token_eot              ; end of table flag
+; Parse tokens - define them in order using the macro parse_tokdef
+                *pragmapush list
+                *pragma nolist
+parse_toknum    set 0
+parse_tokdef    macro noexpand
+\1              equ parse_toknum
+parse_toknum    set parse_toknum+1
+                fdb \2
+                endm
+                *pragmapop list
+parse_stmtjump  parse_tokdef token_error,parse_tokerr
+                parse_tokdef token_eot,parse_noop
+                parse_tokdef token_lt,parse_noop
+                parse_tokdef token_le,parse_noop
+                parse_tokdef token_gt,parse_noop
+                parse_tokdef token_ge,parse_noop
+                parse_tokdef token_eq,parse_noop
+                parse_tokdef token_ne,parse_noop
+                parse_tokdef token_reltrue,parse_noop // always true relational operator
+                parse_tokdef token_stmtsep,parse_noop
+                parse_tokdef token_apos,parse_rem
+                parse_tokdef token_special,parse_noop
+                parse_tokdef token_bang,parse_noop
+                parse_tokdef token_hash,parse_noop
+                parse_tokdef token_dollar,parse_noop
+                parse_tokdef token_percent,parse_noop
+                parse_tokdef token_amp,parse_noop
+                parse_tokdef token_oparen,parse_noop
+                parse_tokdef token_cparen,parse_noop
+                parse_tokdef token_star,parse_noop
+                parse_tokdef token_plus,parse_noop
+                parse_tokdef token_comma,parse_noop
+                parse_tokdef token_minus,parse_noop
+                parse_tokdef token_slash,parse_noop
+                parse_tokdef token_semi,parse_noop
+                parse_tokdef token_at,parse_noop
+                parse_tokdef token_exp,parse_noop
+                parse_tokdef token_ident,parse_noop
+                parse_tokdef token_rem,parse_noop
+                parse_tokdef token_return,parse_noop
+                parse_tokdef token_run,parse_noop
+                parse_tokdef token_data,parse_noop
+                parse_tokdef token_else,parse_noop
+                parse_tokdef token_end,parse_noop
+                parse_tokdef token_stop,parse_noop
+                parse_tokdef token_sub,parse_noop
+                parse_tokdef token_let,parse_noop
+                parse_tokdef token_list,parse_noop
+                parse_tokdef token_new,parse_noop
+                parse_tokdef token_not,parse_noop
+                parse_tokdef token_print,parse_noop
+                parse_tokdef token_pop,parse_noop
+                parse_tokdef token_to,parse_noop
+                parse_tokdef token_and,parse_noop
+                parse_tokdef token_or,parse_noop
+                parse_tokdef token_go,parse_noop
+parse_rem       rts
+
+                *pragmapop list