Mercurial > hg > index.cgi

                *pragmapush list
                *pragma list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This is the overall parsing package. This is responsible for converting program text into the internal byte code and
; reporting any syntax errors and anything else reasonably detectable at parse time without having overly complicated
; code analysis.
;
; This is a recursive descent parser.
;
; Entry:
; X             Points to the text to encode (a zero byte marks the end of the input)
; B             Nonzero to prevent generating any output (error check/length calculation only)
;
; Exit:
; U             Points to the encoded line
; D             Length of the encoded line
; CC.C          clear
;
; Error Exit:
; B             Error code
; U             Pointer to where the offending token started (loaded from parse_tokenst)
; CC.C          set
;
; Y is used as the input pointer throughout and the original free memory pointer is kept on the stack until exit.
; On the error exit freestart is put back to its entry value, which discards anything encoded so far.
parse           stb parse_noout                 ; save no-output flag
                leay ,x                         ; save input pointer in a less useful register
                ldu freestart                   ; point to start of free memory where we will build the output
                pshs u                          ; save original free memory location
parse_nextstmt  jsr parse_nexttok               ; fetch the next token, type comes back in B, C set on failure
                bcc parse0                      ; brif we succeeded in parsing a token
parse_error     puls u                          ; restore original free memory location - deallocate any encoding
                stu freestart                   ; (puls, stu, and ldu leave C alone so it is still set on return)
                ldu parse_tokenst               ; get start location we started parsing the token at
                rts                             ; return error condition
parse0          ldx #parse_stmtjump             ; point to jump table for token type handler
                abx                             ; offset to handler address (two bytes per entry so add B twice)
                abx
                jsr [,x]                        ; call handler
                bcs parse_error                 ; brif handler flagged error
                jsr parse_curtoken              ; get the token we terminated on
                cmpb #token_eot                 ; end of input?
                bne parse1                      ; brif not
                ldb #bc_eol                     ; stash an end of line op
                bsr parse_write
                bcs parse_error                 ; brif we errored out writing to the result (OM?)
                tfr u,d                         ; calculate the length of the result
                subd ,s                         ; current output pointer minus the saved start of the output
                puls u,pc                       ; get pointer to start of encoded result and return (C is already clear)
parse1          cmpb #token_stmtsep             ; statement separator?
                beq parse_nextstmt              ; brif so - do another statement
                cmpb #token_apos                ; ' token?
                beq parse0                      ; brif so - parse it as a new statement (B still holds token_apos)
                comb                            ; set C for error
                ldb #err_sn                     ; raise syntax error
                bra parse_error
; Append one byte to the encoded output.
;
; Entry:
; B             Byte to write
; U             Current output pointer
;
; Exit:
; U             Advanced by one
; CC.C          clear when the byte was stored
;
; Error Exit:
; B             err_om
; CC.C          set
;
; When parse_noout is nonzero nothing is stored and U is only advanced; that path does not modify C, so the caller
; sees whatever C held on entry. A is clobbered, and X is clobbered when output is enabled.
;
; parse_noop is the shared RTS at the end; it is also used as a do-nothing entry in the parse_stmtjump table, in which
; case it returns with the flags exactly as they were on entry.
parse_write     lda parse_noout                 ; are we doing output?
                beq parse_write0                ; brif so
                leau 1,u                        ; just count up the output and don't do anything
                rts
parse_write0    leax -stackheadroom,s           ; calculate bottom of stack with headroom
                cmpx freestart                  ; did the stack run into the end of the output?
                bhs parse_write1                ; brif not - we're good
                ldb #err_om                     ; raise out of memory error, C already set from comparison
                rts
parse_write1    stb ,u+                         ; save output byte
                stu freestart                   ; save new top of used memory
parse_noop      rts                             ; return all clear - C clear from comparison above
; Return the most recently recorded token type (parse_curtok) in B. N and Z reflect the value loaded; C is not changed.
parse_curtoken  ldb parse_curtok                ; fetch token code of current token
                rts
; Report an unexpected token: returns B = err_sn with C set. Also used as a handler in the parse_stmtjump table.
parse_tokerr    ldb #err_sn                     ; syntax error code (sets N and Z from the code, clears V)
                orcc #1                         ; set C to flag the error
                rts
; parse_nextchar steps Y to the next input character unless Y is already on the terminating zero byte, then returns
; the character now under Y in A. parse_curchar returns the character under Y without moving. In both cases Z is set
; exactly when the returned character is the terminating zero.
parse_nextchar  lda ,y+                         ; look at the current character and step past it
                bne parse_curchar               ; brif it was real input - Y is now on the next character
                leay -1,y                       ; we were already at the end - undo the step
parse_curchar   lda ,y                          ; fetch input character
                rts
; Fetch the next token from the input at Y, skipping any leading spaces.
;
; Exit:
; B             Token type (also recorded in parse_curtok)
; A             Zero for end of input, single character tokens, keywords, and relational operators
; Y             Advanced past the token
; CC.C          clear
;
; Error Exit:
; B             token_error for a character that cannot start any token
; CC.C          set
;
; parse_tokenst is set to the first non-space character of the token (it is not updated for end of input). Numbers
; are handed to parse_number; keywords and identifiers continue at parse_nexttok12.
;
; NOTE(review): the end of input path goes through parse_nexttok6, which steps Y past the terminating zero byte, so
; a further call after token_eot would read beyond the input - confirm callers never do that.
parse_nexttok   bsr parse_curchar               ; fetch current input
                beq parse_nexttok1              ; brif end of input
parse_nexttok0  cmpa #0x20                      ; space?
                bne parse_nexttok2              ; brif not
                bsr parse_nextchar              ; eat the space
                bne parse_nexttok0              ; brif not end of input
parse_nexttok1  ldb #token_eot                  ; flag end of input
                bra parse_nexttok6              ; go return it
parse_nexttok2  sty parse_tokenst               ; save start of current token after skipping spaces
                cmpa #'.                        ; leading decimal?
                beq parse_nexttok3              ; brif so - parse number
                cmpa #'0                        ; is it a digit
                blo parse_nexttok4              ; brif not
                cmpa #'9                        ; is it still a digit?
                bhi parse_nexttok4              ; brif not
parse_nexttok3  jmp parse_number                ; go parse a number
parse_nexttok4  ldx #parse_chartab              ; point to list of single character tokens to recognize
parse_nexttok5  ldb 1,x                         ; get token value
                cmpa ,x++                       ; character match (and move to next entry)
                bne parse_nexttok7              ; brif not
parse_nexttok6  stb parse_curtok                ; save token type
                leay 1,y                        ; eat the input character
                clra                            ; clear C to indicate no error (and clear Z also)
                rts
parse_nexttok7  cmpb #token_eot                 ; end of table?
                bne parse_nexttok5              ; brif not
                clrb                            ; initialize relational flags
                pshs d                          ; save input character (at ,s) and relational flags (at 1,s) for later
parse_nexttok8  cmpa #'<                        ; less than?
                blo parse_nexttok9              ; brif not <, =, or >
                cmpa #'>                        ; still <, =, or >?
                bhi parse_nexttok9              ; brif not
                suba #'<                        ; adjust < to 0
                cmpa #1                         ; set C if <, clear if = or >
                rola                            ; now 4 if >, 2 if =, or 1 if <
                eora 1,s                        ; toggle this character's bit in the flags collected so far
                cmpa 1,s                        ; a new bit makes the value larger, a repeated bit makes it smaller
                blo parse_nexttok9              ; brif repeated character - it does not belong to this operator
                sta 1,s                         ; save new relational flags
                bsr parse_nextchar              ; fetch next input
                sta ,s                          ; save input character
                bne parse_nexttok8              ; brif there was one - go handle it
parse_nexttok9  puls d                          ; get back input character and relational flag
                tstb                            ; was it a relational operator?
                beq parse_nexttok10             ; brif not
                ldx #parse_reltab               ; point to relational operator token table
                ldb b,x                         ; get the token code
                stb parse_curtok                ; record it as the current token like the other token types
                clra                            ; flag no error
                rts                             ; return - but don't advance - we already did looking for multiples
parse_nexttok10 bsr parse_toupper               ; convert to upper case
                cmpa #'A                        ; is it alpha?
                blo parse_nexttok11             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bls parse_nexttok12             ; brif so
parse_nexttok11 comb                            ; flag error - unrecognized token
                ldb #token_error
                rts
; parse_nextcharu fetches the next input character via parse_nextchar and folds it to upper case; parse_toupper folds
; the character already in A. Only 'a' through 'z' are changed, every other value is returned untouched.
; On return Z is set only when parse_nextcharu hit the end of the input; the other flags depend on which of the
; comparisons below was the last one executed, so callers should only rely on Z.
parse_nextcharu bsr parse_nextchar              ; fetch next input character
                beq parse_toupper0              ; brif end of input
parse_toupper   cmpa #'a                        ; is it lower case alpha?
                blo parse_toupper0              ; brif not
                cmpa #'z                        ; is it still lower case alpha?
                bhi parse_toupper0              ; brif not
                suba #0x20                      ; adjust to upper case alpha
parse_toupper0  rts                             ; Z only set here if input was zero entering from parse_nextcharu
; We parse alpha keywords and identifiers here, of the form [a-zA-Z][a-zA-Z0-9]*, with possible nonalpha characters
; in actual keywords. We use a table to parse keywords. As soon as we find a character that doesn't match a keyword
; table entry, we fall back to looking for the end of an identifier and then returning that.
;
; Entry is from parse_nexttok10 with the first character, forced to upper case, in A and Y pointing at it.
; A keyword is returned through parse_nexttok6, which records the token and eats its last character. An identifier is
; returned with B = token_ident (also recorded in parse_curtok), its length in val0+val.strlen, Y on the first
; character after it, and C clear.
parse_nexttok12 ldx #parse_wordtab              ; point to keyword table
                bsr parse_nexttok16             ; process this table entry
                cmpb #token_ident               ; did we match a token?
                bne parse_nexttok6              ; brif so - go return it
parse_nexttok13 cmpa #'0                        ; was it alphanumeric?
                blo parse_nexttok15             ; brif not
                cmpa #'9                        ; was it numeric?
                bls parse_nexttok14             ; brif so
                cmpa #'A                        ; was it alpha?
                blo parse_nexttok15             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bhi parse_nexttok15             ; brif not
parse_nexttok14 bsr parse_nextcharu             ; fetch next character and force upper case
                bne parse_nexttok13             ; if not end of input, see if we have alphanumeric
parse_nexttok15 tfr y,d                         ; fetch input location
                subd parse_tokenst              ; calculate length of token (never borrows, so C ends up clear)
                std val0+val.strlen             ; save the length of the identifier
                ldb #token_ident                ; set token type to identifier (variable name, probably)
                stb parse_curtok                ; record it as the current token so parse_curtoken is not stale
                rts                             ; return token type, do not advance since we already did above
; Parsing a potential keyword here. This works using a recursive lookup table. Each lookup table starts with a 16 bit
; size entry for the table. Each entry is then 2 bytes. The first is the character to
; match for this entry. The second is either token_eot to indicate a sub table needs to be consulted, token_ident to
; indicate that the token should be parsed as an identifier, or a token type code which indicates the value should
; be accepted. If a sub table is to be consulted, the table will appear inline with the same format. Should matching
; fall off the end of a table, token_ident is returned with the character being considered back in A and Y still
; pointing at it, so the caller can carry on scanning the rest of an identifier from there.
;
; If the match character is negative, the match character represents the number of characters to "unget" and then
; return the specified token. This is for handling look-aheads.
;
; Entry:
; A             Character to look up (upper case)
; X             Points to the size word of the table to search
; Y             Points to that character in the input
;
; Exit:
; B             Matched token type, or token_ident if no keyword matched
; A             The character that was being considered when the search stopped
;
; NOTE(review): the end of table pointer below is (address of the size word) + size, while skipping a sub table
; uses (address after the size word) + size. Those differ by two bytes, so one of them does not agree with the
; table layout - check against how parse_wordtab is generated.
; NOTE(review): when the input ends in the middle of a keyword the sub table is not consulted at all, so a
; look-ahead entry in it cannot fire and the text is returned as an identifier - confirm that is intended.
parse_nexttok16 pshs a,x                        ; save input character (at ,s) and table pointer (at 1,s)
                ldd ,x++                        ; get the size word of the table (this overwrites A)
                addd 1,s                        ; set pointer to end of table
                std 1,s
parse_nexttok17 lda ,s                          ; get back the input character - D is used as scratch around here
                cmpa ,x++                       ; does this entry match?
                beq parse_nexttok21             ; brif so
                ldb -2,x                        ; was this a look-ahead non-match?
                bpl parse_nexttok19             ; brif not
                leay b,y                        ; back up the input pointer
                ldb -1,x                        ; get match token
parse_nexttok18 puls a,x,pc                     ; clean up stack and return the matched token
parse_nexttok19 ldb -1,x                        ; is there a sub table?
                cmpb #token_eot
                bne parse_nexttok20             ; brif not
                ldd ,x++                        ; move past the sub table
                leax d,x
parse_nexttok20 cmpx 1,s                        ; did we reach the end of this table?
                blo parse_nexttok17             ; brif not
                ldb #token_ident                ; flag identifier required
                puls a,x,pc                     ; restore input character, clean up stack, and return
parse_nexttok21 ldb -1,x                        ; what token did we match?
                cmpb #token_eot                 ; sub table?
                bne parse_nexttok18             ; brif not - ding! ding! ding! we have a match
                leas 3,s                        ; clean up stack
                bsr parse_nextcharu             ; fetch next input character
                bne parse_nexttok16             ; process sub table entries if we have input
                ldb #token_ident                ; input ended part way through a keyword - it is an identifier
                rts                             ; leave Y on the terminator so the identifier length comes out right
; Number parsing is not implemented here: any token starting with a digit or a decimal point is reported through
; parse_tokerr (B = err_sn, C set).
parse_number    jmp parse_tokerr
; Relational operator tokens, indexed by the flags collected at parse_nexttok8: bit 2 is ">", bit 1 is "=", and
; bit 0 is "<". The characters may appear in any order in the input; only the set of them matters.
parse_reltab    fcb token_error,token_lt        ; 000 nothing           001 <
                fcb token_eq,token_le           ; 010 =                 011 < and =
                fcb token_gt,token_ne           ; 100 >                 101 < and >
                fcb token_ge,token_reltrue      ; 110 = and >           111 all three
; Single character token lookup table. Each entry is the character code followed by the token type to return for it.
; parse_nexttok5 walks the entries in order and stops when the token type it just read is token_eot, so the final
; entry doubles as the end marker.
parse_chartab   fcb 0x21,token_bang             ; !
                fcb 0x23,token_hash             ; #
                fcb 0x24,token_dollar           ; $
                fcb 0x25,token_percent          ; %
                fcb 0x26,token_amp              ; &
                fcb 0x27,token_apos             ; ' - handled as a statement by parse_rem
                fcb 0x28,token_oparen           ; (
                fcb 0x29,token_cparen           ; )
                fcb 0x2a,token_star             ; *
                fcb 0x2b,token_plus             ; +
                fcb 0x2c,token_comma            ; ,
                fcb 0x2d,token_minus            ; -
                fcb 0x2f,token_slash            ; /
                fcb 0x3a,token_stmtsep          ; : - statement separator
                fcb 0x3b,token_semi             ; ;
                fcb 0x3f,token_print            ; ? - print shortcut
                fcb 0x40,token_at               ; @
                fcb 0x5e,token_exp              ; ^ - exponentiation
                fcb 0x00,token_eot              ; end of table flag
; Parse tokens - define them in order using the macro parse_tokdef
;
; Usage: parse_tokdef NAME,HANDLER
; Each use defines NAME as the current value of parse_toknum, adds one to parse_toknum, and emits the two byte
; address HANDLER. Token numbers therefore start at zero and follow the order of the invocations, and the emitted
; addresses form a table that can be indexed by token number times two.
                *pragmapush list
                *pragma nolist
parse_toknum    set 0
parse_tokdef    macro noexpand
\1              equ parse_toknum
parse_toknum    set parse_toknum+1
                fdb \2
                endm
                *pragmapop list
; Token definitions and statement handler jump table. Every line below both assigns the next token number to the
; named symbol and emits the address of the routine parse0 calls when a statement starts with that token, so entry
; N of parse_stmtjump is the handler for token number N. The order here fixes the token numbering; token_error
; must stay first so that it is zero.
parse_stmtjump  parse_tokdef token_error,parse_tokerr
                parse_tokdef token_eot,parse_noop
                parse_tokdef token_lt,parse_noop
                parse_tokdef token_le,parse_noop
                parse_tokdef token_gt,parse_noop
                parse_tokdef token_ge,parse_noop
                parse_tokdef token_eq,parse_noop
                parse_tokdef token_ne,parse_noop
                parse_tokdef token_reltrue,parse_noop ; always true relational operator
                parse_tokdef token_stmtsep,parse_noop
                parse_tokdef token_apos,parse_rem
                parse_tokdef token_special,parse_noop
                parse_tokdef token_bang,parse_noop
                parse_tokdef token_hash,parse_noop
                parse_tokdef token_dollar,parse_noop
                parse_tokdef token_percent,parse_noop
                parse_tokdef token_amp,parse_noop
                parse_tokdef token_oparen,parse_noop
                parse_tokdef token_cparen,parse_noop
                parse_tokdef token_star,parse_noop
                parse_tokdef token_plus,parse_noop
                parse_tokdef token_comma,parse_noop
                parse_tokdef token_minus,parse_noop
                parse_tokdef token_slash,parse_noop
                parse_tokdef token_semi,parse_noop
                parse_tokdef token_at,parse_noop
                parse_tokdef token_exp,parse_noop
                parse_tokdef token_ident,parse_noop
                parse_tokdef token_rem,parse_noop
                parse_tokdef token_return,parse_noop
                parse_tokdef token_run,parse_noop
                parse_tokdef token_data,parse_noop
                parse_tokdef token_else,parse_noop
                parse_tokdef token_end,parse_noop
                parse_tokdef token_stop,parse_noop
                parse_tokdef token_sub,parse_noop
                parse_tokdef token_let,parse_noop
                parse_tokdef token_list,parse_noop
                parse_tokdef token_new,parse_noop
                parse_tokdef token_not,parse_noop
                parse_tokdef token_print,parse_noop
                parse_tokdef token_pop,parse_noop
                parse_tokdef token_to,parse_noop
                parse_tokdef token_and,parse_noop
                parse_tokdef token_or,parse_noop
                parse_tokdef token_go,parse_noop
; Handler for the ' (remark) token. Currently a stub: it returns immediately without consuming any input or writing
; any output, leaving the flags as they were on entry.
; NOTE(review): because parse_curtok is still token_apos afterwards, parse1 sends control straight back to parse0
; and this handler is called again, so as written a ' in the input appears to loop forever - the handler needs to
; consume the remark and advance the current token.
parse_rem       rts

                *pragmapop list
author	William Astle <lost@l-w.ca>
date	Sun, 31 Dec 2023 17:44:39 -0700
parents
children	5681cdada362