Mercurial > hg > index.cgi
view src/parse.s @ 121:5d5472b11ccd
Initial skeleton of a separate parsing scheme
This is the first commit in a long series related to separating the parsing
of the input code from the execution of the code. It should allow for more
efficient, and probably simpler, execution while giving quicker feedback
when someone types in syntactically invalid code.
author | William Astle <lost@l-w.ca> |
---|---|
date | Sun, 31 Dec 2023 17:44:39 -0700 |
parents | |
children | 5681cdada362 |
line wrap: on
line source
; Listing control: save the current listing state, then enable listing for this file.
                *pragmapush list
                *pragma list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This is the overall parsing package. This is responsible for converting program text into the internal byte code and
; reporting any syntax errors and anything else reasonably detectable at parse time without having overly complicated
; code analysis.
;
; This is a recursive descent parser.
;
; Entry:
; X     Points to the text to encode (a zero byte marks the end of the input)
; B     Nonzero to prevent generating any output (error check/length calculation only)
;
; Exit:
; U     Points to the encoded line
; D     Length of the encoded line
; CC.C  clear
;
; Error Exit:
; B     Error code
; U     Address in the input text where the token being parsed started (parse_tokenst)
; CC.C  set
;
; Register use inside the package: Y is the input pointer, U is the output pointer, and the original value of
; freestart is kept on the stack for the duration of the parse.
parse           stb parse_noout                 ; save no-output flag
                leay ,x                         ; keep the input pointer in Y; X is needed for table lookups
                ldu freestart                   ; point to start of free memory where we will build the output
                pshs u                          ; save original free memory location
parse_nextstmt  jsr parse_nexttok               ; fetch the next token, return type in D
                bcc parse0                      ; brif we succeeded in parsing a token
parse_error     puls u                          ; restore original free memory location - deallocate any encoding
                stu freestart
                ldu parse_tokenst               ; get start location we started parsing the token at
                rts                             ; return error condition (PULS/STU/LDU leave C set)
parse0          ldx #parse_stmtjump             ; point to jump table for token type handler
                abx                             ; offset to handler address (two bytes per token type)
                abx
                jsr [,x]                        ; call handler
                bcs parse_error                 ; brif handler flagged error
                jsr parse_curtoken              ; get the token we terminated on
                cmpb #token_eot                 ; end of input?
                bne parse1                      ; brif not
                ldb #bc_eol                     ; stash an end of line op
                bsr parse_write
                bcs parse_error                 ; brif we errored out writing to the result (OM?)
                tfr u,d                         ; calculate the length of the result
                subd ,s
                puls u,pc                       ; get pointer to start of encoded result and return (C is already clear)
parse1          cmpb #token_stmtsep             ; statement separator?
                beq parse_nextstmt              ; brif so - do another statement
                cmpb #token_apos                ; ' token?
                beq parse0                      ; brif so - parse it as a new statement
                comb                            ; set C for error
                ldb #err_sn                     ; raise syntax error
                bra parse_error

; Append the byte in B to the output at U. When parse_noout is nonzero nothing is stored and U is only advanced so
; that the caller can still calculate the encoded length.
;
; Exit:       C clear, U advanced by one (freestart updated only when output is enabled)
; Error exit: C set, B = err_om when the output would come within stackheadroom bytes of the stack
; Clobbers:   A, X
parse_write     lda parse_noout                 ; are we doing output?
                beq parse_write0                ; brif so
                leau 1,u                        ; just count up the output and don't do anything
                andcc #0xfe                     ; flag success explicitly - nothing above touches C
                rts
parse_write0    leax -stackheadroom,s           ; calculate bottom of stack with headroom
                cmpx freestart                  ; did the stack run into the end of the output?
                bhs parse_write1                ; brif not - we're good
                ldb #err_om                     ; raise out of memory error, C already set from comparison
                rts
parse_write1    stb ,u+                         ; save output byte
                stu freestart                   ; save new top of used memory
parse_noop      rts                             ; return all clear - C clear from comparison above

; Fetch the code of the most recently parsed token into B.
parse_curtoken  ldb parse_curtok                ; fetch token code of current token
                rts

; Handler for tokens that cannot start a statement: return a syntax error (C set, B = err_sn).
parse_tokerr    comb                            ; flag error - unexpected token
                ldb #err_sn                     ; raise syntax error
                rts

; parse_nextchar: advance Y to the next input character unless already at the end of input, then fetch it.
; parse_curchar:  fetch the character at Y without advancing.
; Both return the character in A with Z set if it is the zero terminator.
parse_nextchar  lda ,y                          ; at end of input already?
                beq parse_curchar               ; brif so
                leay 1,y                        ; move to next input character
parse_curchar   lda ,y                          ; fetch input character
                rts

; Fetch the next token from the input at Y, skipping leading spaces.
;
; Exit:       C clear, B = token type (also saved in parse_curtok), A = 0, Y points past the token
; Error exit: C set, B = err_sn
; parse_tokenst is set to the address where the token starts.
parse_nexttok   bsr parse_curchar               ; fetch current input
                beq parse_nexttok1              ; brif end of input
parse_nexttok0  cmpa #0x20                      ; space?
                bne parse_nexttok2              ; brif not
                bsr parse_nextchar              ; eat the space
                bne parse_nexttok0              ; brif not end of input
parse_nexttok1  sty parse_tokenst               ; an error reported at end of input points at the terminator
                ldb #token_eot                  ; flag end of input
                stb parse_curtok                ; save token type
                clra                            ; clear C to indicate no error
                rts                             ; do not advance: Y must never move past the zero terminator
parse_nexttok2  sty parse_tokenst               ; save start of current token after skipping spaces
                cmpa #'.                        ; leading decimal?
                beq parse_nexttok3              ; brif so - parse number
                cmpa #'0                        ; is it a digit
                blo parse_nexttok4              ; brif not
                cmpa #'9                        ; is it still a digit?
                bhi parse_nexttok4              ; brif not
parse_nexttok3  jmp parse_number                ; go parse a number
parse_nexttok4  ldx #parse_chartab              ; point to list of single character tokens to recognize
parse_nexttok5  ldb 1,x                         ; get token value
                cmpa ,x++                       ; character match (and move to next entry)
                bne parse_nexttok7              ; brif not
parse_nexttok6  stb parse_curtok                ; save token type
                leay 1,y                        ; eat the input character
                clra                            ; clear C to indicate no error (CLRA also sets Z)
                rts
parse_nexttok7  cmpb #token_eot                 ; end of table?
                bne parse_nexttok5              ; brif not
; Not a single character token. Collect a run of distinct relational characters (<, =, >) into a bit mask:
; bit 0 is <, bit 1 is =, bit 2 is >. The mask is used as an index into parse_reltab.
                clrb                            ; initialize relational flags
                pshs d                          ; save input character and relational flags for later
parse_nexttok8  cmpa #'<                        ; less than?
                blo parse_nexttok9              ; brif not <, =, or >
                cmpa #'>                        ; still <, =, or >?
                bhi parse_nexttok9              ; brif not
                suba #'<                        ; adjust < to 0
                cmpa #1                         ; set C if <, clear if = or >
                rola                            ; now 4 if >, 2 if =, or 1 if <
                ora 1,s                         ; merge with previous relational characters
                cmpa 1,s                        ; if merging changed nothing, this character is a dupe
                beq parse_nexttok9              ; brif dupe - we won't recognize more in the token
                sta 1,s                         ; save new relational flags
                bsr parse_nextchar              ; fetch next input
                sta ,s                          ; save input character
                bne parse_nexttok8              ; brif there was one - go handle it
parse_nexttok9  puls d                          ; get back input character and relational flag
                tstb                            ; was it a relational operator?
                beq parse_nexttok10             ; brif not
                ldx #parse_reltab               ; point to relational operator token table
                ldb b,x                         ; get the token code
                stb parse_curtok                ; save token type so parse_curtoken reports it
                clra                            ; flag no error
                rts                             ; return - but don't advance - we already did looking for multiples
parse_nexttok10 bsr parse_toupper               ; convert to upper case
                cmpa #'A                        ; is it alpha?
                blo parse_nexttok11             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bls parse_nexttok12             ; brif so
parse_nexttok11 ldb #token_error                ; record that the current token is not valid
                stb parse_curtok
                comb                            ; flag error - unrecognized token
                ldb #err_sn                     ; B must hold an error code on error exit, not a token number
                rts

; parse_nextcharu: fetch the next input character (see parse_nextchar) and force it to upper case.
; parse_toupper:   force the character in A to upper case if it is a lower case letter.
parse_nextcharu bsr parse_nextchar              ; fetch next input character
                beq parse_toupper0              ; brif end of input
parse_toupper   cmpa #'a                        ; is it lower case alpha?
                blo parse_toupper0              ; brif not
                cmpa #'z                        ; is it still lower case alpha?
                bhi parse_toupper0              ; brif not
                suba #0x20                      ; adjust to upper case alpha
parse_toupper0  rts                             ; Z only set here if input was zero entering from parse_nextcharu

; We parse alpha keywords and identifiers here, of the form [a-zA-Z][a-zA-Z0-9]* with a possible nonalpha characters
; in actual keywords. We use a table to parse keywords. As soon as we find a character that doesn't match a keyword
; table entry, we fall back to looking for the end of an identifier and then returning that.
parse_nexttok12 ldx #parse_wordtab              ; point to keyword table
                bsr parse_nexttok16             ; process this table entry
                cmpb #token_ident               ; did we match a token?
                bne parse_nexttok6              ; brif so - go return it
parse_nexttok13 cmpa #'0                        ; was it alphanumeric?
                blo parse_nexttok15             ; brif not
                cmpa #'9                        ; was it numeric?
                bls parse_nexttok14             ; brif so
                cmpa #'A                        ; was it alpha?
                blo parse_nexttok15             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bhi parse_nexttok15             ; brif not
parse_nexttok14 bsr parse_nextcharu             ; fetch next character and force upper case
                bne parse_nexttok13             ; if not end of input, see if we have alphanumeric
parse_nexttok15 tfr y,d                         ; fetch input location
                subd parse_tokenst              ; calculate length of token
                std val0+val.strlen             ; save the length of the identifier
                ldb #token_ident                ; set token type to identifier (variable name, probably)
                stb parse_curtok                ; save token type so parse_curtoken reports it
                clra                            ; flag no error explicitly
                rts                             ; return token type, do not advance since we already did above

; Parsing a potential keyword here. This works using a lookup table with nested sub tables. Each lookup table starts
; with a 16 bit size entry for the table; the end of the table is calculated as the address of the size entry plus
; that value. Each entry is then 2 bytes. The first is the character to match for this entry. The second is either
; token_eot to indicate a sub table needs to be consulted, token_ident to indicate that the token should be parsed as
; an identifier, or a token type code which indicates the value should be accepted. If a sub table is to be
; consulted, the table will appear inline with the same format.
;
; Should matching fall off the end of a table, token_ident is returned with the input pointer left on the character
; that failed to match so the identifier scan at parse_nexttok13 can continue from there.
;
; If the match character is negative, the match character represents the number of characters to "unget" and then
; return the specified token. This is for handling look-aheads.
;
; Entry: A = current input character (upper case), X = table to search, Y = input pointer
; Exit:  B = matched token type or token_ident, A = current input character
parse_nexttok16 pshs a,x                        ; save input character; the X slot becomes the end of table pointer
                ldd ,x++                        ; get size of the table
                addd 1,s                        ; set pointer to end of table
                std 1,s
parse_nexttok17 lda ,s                          ; reload input character - A is clobbered by the 16 bit loads
                cmpa ,x++                       ; does this entry match?
                beq parse_nexttok21             ; brif so
                ldb -2,x                        ; was this a look-ahead non-match?
                bpl parse_nexttok19             ; brif not
                leay b,y                        ; back up the input pointer
                ldb -1,x                        ; get match token
parse_nexttok18 puls a,x,pc                     ; clean up stack and return the matched token
parse_nexttok19 ldb -1,x                        ; is there a sub table?
                cmpb #token_eot
                bne parse_nexttok20             ; brif not
                ldd ,x++                        ; move past the sub table
                leax d,x
parse_nexttok20 cmpx 1,s                        ; did we reach the end of this table?
                blo parse_nexttok17             ; brif not
                ldb #token_ident                ; flag identifier required
                puls a,x,pc                     ; restore input character, clean up stack, and return
parse_nexttok21 ldb -1,x                        ; what token did we match?
                cmpb #token_eot                 ; sub table?
                bne parse_nexttok18             ; brif not - ding! ding! ding! we have a match
                leas 3,s                        ; clean up stack
                bsr parse_nextcharu             ; fetch next input character
                bne parse_nexttok16             ; process sub table entries if we have input
                ldb #token_ident                ; indicate we have an ident
                rts                             ; Y stays on the terminator: parse_nextchar never moves past it, so
                                                ; there is nothing to unget and the identifier length stays correct
                                                ; NOTE(review): look-ahead entries in the sub table are not
                                                ; consulted when the input ends here - confirm that is intended

; Number parsing is not implemented yet; anything that looks like a number is reported as a syntax error.
parse_number    jmp parse_tokerr

; Relational token table, bits are > = <
parse_reltab    fcb token_error
                fcb token_lt
                fcb token_eq
                fcb token_le
                fcb token_gt
                fcb token_ne
                fcb token_ge
                fcb token_reltrue

; Single character token lookup table
parse_chartab   fcb 0x21,token_bang             ; !
                fcb 0x23,token_hash             ; #
                fcb 0x24,token_dollar           ; $
                fcb 0x25,token_percent          ; %
                fcb 0x26,token_amp              ; &
                fcb 0x27,token_apos             ; '
                fcb 0x28,token_oparen           ; (
                fcb 0x29,token_cparen           ; )
                fcb 0x2a,token_star             ; *
                fcb 0x2b,token_plus             ; +
                fcb 0x2c,token_comma            ; ,
                fcb 0x2d,token_minus            ; -
                fcb 0x2f,token_slash            ; /
                fcb 0x3a,token_stmtsep          ; :
                fcb 0x3b,token_semi             ; ;
                fcb 0x3f,token_print            ; ? - print shortcut
                fcb 0x40,token_at               ; @
                fcb 0x5e,token_exp              ; ^ - exponentiation
                fcb 0x00,token_eot              ; end of table flag

; Parse tokens - define them in order using the macro parse_tokdef. Each use assigns the next token number to the
; named symbol and emits the address of the statement handler for that token, so the token number doubles as an
; index into the parse_stmtjump table of two byte handler addresses.
                *pragmapush list
                *pragma nolist
parse_toknum    set 0
parse_tokdef    macro noexpand
\1              equ parse_toknum
parse_toknum    set parse_toknum+1
                fdb \2
                endm
                *pragmapop list
parse_stmtjump  parse_tokdef token_error,parse_tokerr
                parse_tokdef token_eot,parse_noop
                parse_tokdef token_lt,parse_noop
                parse_tokdef token_le,parse_noop
                parse_tokdef token_gt,parse_noop
                parse_tokdef token_ge,parse_noop
                parse_tokdef token_eq,parse_noop
                parse_tokdef token_ne,parse_noop
                parse_tokdef token_reltrue,parse_noop ; always true relational operator
                parse_tokdef token_stmtsep,parse_noop
                parse_tokdef token_apos,parse_rem
                parse_tokdef token_special,parse_noop
                parse_tokdef token_bang,parse_noop
                parse_tokdef token_hash,parse_noop
                parse_tokdef token_dollar,parse_noop
                parse_tokdef token_percent,parse_noop
                parse_tokdef token_amp,parse_noop
                parse_tokdef token_oparen,parse_noop
                parse_tokdef token_cparen,parse_noop
                parse_tokdef token_star,parse_noop
                parse_tokdef token_plus,parse_noop
                parse_tokdef token_comma,parse_noop
                parse_tokdef token_minus,parse_noop
                parse_tokdef token_slash,parse_noop
                parse_tokdef token_semi,parse_noop
                parse_tokdef token_at,parse_noop
                parse_tokdef token_exp,parse_noop
                parse_tokdef token_ident,parse_noop
                parse_tokdef token_rem,parse_noop
                parse_tokdef token_return,parse_noop
                parse_tokdef token_run,parse_noop
                parse_tokdef token_data,parse_noop
                parse_tokdef token_else,parse_noop
                parse_tokdef token_end,parse_noop
                parse_tokdef token_stop,parse_noop
                parse_tokdef token_sub,parse_noop
                parse_tokdef token_let,parse_noop
                parse_tokdef token_list,parse_noop
                parse_tokdef token_new,parse_noop
                parse_tokdef token_not,parse_noop
                parse_tokdef token_print,parse_noop
                parse_tokdef token_pop,parse_noop
                parse_tokdef token_to,parse_noop
                parse_tokdef token_and,parse_noop
                parse_tokdef token_or,parse_noop
                parse_tokdef token_go,parse_noop

; Handler for the ' (comment) token. The comment runs to the end of the input, so skip everything up to the zero
; terminator and make end of input the current token. Without this the main loop in parse keeps seeing token_apos
; as the current token and dispatches back here forever.
; NOTE(review): no byte code is emitted for the comment text yet - confirm whether comments must be preserved.
parse_rem       lda ,y+                         ; scan forward to the end of the input
                bne parse_rem
                leay -1,y                       ; leave the input pointer on the terminator
                ldb #token_eot                  ; the statement ended at end of input
                stb parse_curtok
                clra                            ; flag no error
                rts
                *pragmapop list