Mercurial > hg > index.cgi
view src/parse.s @ 123:5681cdada362
Redo keyword table handling to handle keywords differing in length
Some keywords differ only due to length. That is, the shorter keyword
matches the leading characters of the longer one. Make the keyword table
builder and processor handle these cases. Also re-implement the handler
based on evolved understanding of its requirements.
author | William Astle <lost@l-w.ca> |
---|---|
date | Mon, 01 Jan 2024 15:15:45 -0700 |
parents | 5d5472b11ccd |
children | 8770e6f977c3 |
line wrap: on
line source
*pragmapush list
*pragma list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This is the overall parsing package. This is responsible for converting program text into the internal byte code and
; reporting any syntax errors and anything else reasonably detectable at parse time without having overly complicated
; code analysis.
;
; This is a recursive descent parser.
;
; Entry:
; X     Points to the text to encode (NUL terminated)
; B     Nonzero to prevent generating any output (error check/length calculation only)
;
; Exit:
; U     Points to the encoded line
; D     Length of the encoded line
; CC.C  clear
;
; Error Exit:
; B     Error code
; U     Pointer to the start of the token being parsed when the error was detected (parse_tokenst)
; CC.C  set
parse               stb parse_noout                 ; save no-output flag
                    leay ,x                         ; save input pointer in a less useful register
                    ldu freestart                   ; point to start of free memory where we will build the output
                    pshs u                          ; save original free memory location
parse_nextstmt      jsr parse_nexttok               ; fetch the next token, return type in D
                    bcc parse0                      ; brif we succeeded in parsing a token
parse_error         puls u                          ; restore original free memory location - deallocate any encoding
                    stu freestart                   ; (PULS/STU/LDU leave C alone so the error flag survives)
                    ldu parse_tokenst               ; get start location we started parsing the token at
                    rts                             ; return error condition
parse0              ldx #parse_stmtjump             ; point to jump table for token type handler
                    abx                             ; offset to handler address (two bytes per entry)
                    abx
                    jsr [,x]                        ; call handler
                    bcs parse_error                 ; brif handler flagged error
                    jsr parse_curtoken              ; get the token we terminated on
                    cmpb #token_eot                 ; end of input?
                    bne parse1                      ; brif not
                    ldb #bc_eol                     ; stash an end of line op
                    bsr parse_write
                    bcs parse_error                 ; brif we errored out writing to the result (OM?)
                    tfr u,d                         ; calculate the length of the result
                    subd ,s
                    puls u,pc                       ; get pointer to start of encoded result and return (C is already clear)
parse1              cmpb #token_stmtsep             ; statement separator?
                    beq parse_nextstmt              ; brif so - do another statement
                    cmpb #token_apos                ; ' token?
                    beq parse0                      ; brif so - parse it as a new statement
                    comb                            ; set C for error
                    ldb #err_sn                     ; raise syntax error
                    bra parse_error

; Append the byte in B to the output being built at U.
;
; When parse_noout is nonzero nothing is stored; U is simply advanced so the caller can still compute the length.
;
; Exit:  C clear on success (A is clobbered)
; Error: C set, B = err_om if the output would come within stackheadroom bytes of the stack
parse_write         lda parse_noout                 ; are we doing output?
                    beq parse_write0                ; brif so
                    leau 1,u                        ; just count up the output and don't do anything
                    clra                            ; explicitly flag success - LDA/LEAU leave C as the caller had it
                    rts
parse_write0        leax -stackheadroom,s           ; calculate bottom of stack with headroom
                    cmpx freestart                  ; did the stack run into the end of the output?
                    bhs parse_write1                ; brif not - we're good
                    ldb #err_om                     ; raise out of memory error, C already set from comparison
                    rts
parse_write1        stb ,u+                         ; save output byte
                    stu freestart                   ; save new top of used memory
parse_noop          rts                             ; return all clear - C clear from comparison above

; Fetch the token code of the current token into B.
parse_curtoken      ldb parse_curtok                ; fetch token code of current token
                    rts

; Handler for a token that cannot start a statement: returns C set and B = err_sn.
parse_tokerr        comb                            ; flag error - unexpected token
                    ldb #err_sn                     ; raise syntax error
                    rts

; parse_nextchar advances Y by one unless it already points at the NUL terminator, then falls into
; parse_curchar which returns the character at Y in A with Z set if that character is NUL.
parse_nextchar      lda ,y                          ; at end of input already?
                    beq parse_curchar               ; brif so - never step past the terminator
                    leay 1,y                        ; move to next input character
parse_curchar       lda ,y                          ; fetch input character
                    rts

; Fetch the next token from the input at Y.
;
; Exit:  C clear, B = token code (also saved in parse_curtok), Y advanced past the token
;        parse_tokenst = address of the first character of the token (after leading spaces)
; Error: C set
parse_nexttok       bsr parse_curchar               ; fetch current input
                    beq parse_nexttok1              ; brif end of input
parse_nexttok0      cmpa #0x20                      ; space?
                    bne parse_nexttok2              ; brif not
                    bsr parse_nextchar              ; eat the space
                    bne parse_nexttok0              ; brif not end of input
parse_nexttok1      sty parse_tokenst               ; the "token" starts at the terminator
                    ldb #token_eot                  ; flag end of input
                    bra parse_nexttokret            ; go return it - do NOT advance past the NUL
parse_nexttok2      sty parse_tokenst               ; save start of current token after skipping spaces
                    cmpa #'.                        ; leading decimal?
                    beq parse_nexttok3              ; brif so - parse number
                    cmpa #'0                        ; is it a digit
                    blo parse_nexttok4              ; brif not
                    cmpa #'9                        ; is it still a digit?
                    bhi parse_nexttok4              ; brif not
parse_nexttok3      jmp parse_number                ; go parse a number
parse_nexttok4      ldx #parse_chartab              ; point to list of single character tokens to recognize
parse_nexttok5      ldb 1,x                         ; get token value
                    cmpa ,x++                       ; character match (and move to next entry)
                    bne parse_nexttok7              ; brif not
parse_nexttok6      leay 1,y                        ; eat the input character
parse_nexttokret    stb parse_curtok                ; save token type
                    clra                            ; clear C to indicate no error (D is now the token type)
                    rts
parse_nexttok7      cmpb #token_eot                 ; end of table?
                    bne parse_nexttok5              ; brif not
                    clrb                            ; initialize relational flags
                    pshs d                          ; save input character (at ,s) and relational flags (at 1,s)
parse_nexttok8      cmpa #'<                        ; less than?
                    blo parse_nexttok9              ; brif not <, =, or >
                    cmpa #'>                        ; still <, =, or >?
                    bhi parse_nexttok9              ; brif not
                    suba #'<                        ; adjust < to 0, = to 1, > to 2
                    cmpa #1                         ; set C if <, clear if = or >
                    rola                            ; now 4 if >, 2 if =, or 1 if <
                    ora 1,s                         ; merge with previous relational characters
                    cmpa 1,s                        ; if nothing changed, this character was already seen
                    beq parse_nexttok9              ; brif dupe - we won't recognize more in the token
                    sta 1,s                         ; save new relational flags
                    bsr parse_nextchar              ; fetch next input
                    sta ,s                          ; save input character
                    bne parse_nexttok8              ; brif there was one - go handle it
parse_nexttok9      puls d                          ; get back input character and relational flag
                    tstb                            ; was it a relational operator?
                    beq parse_nexttok10             ; brif not
                    ldx #parse_reltab               ; point to relational operator token table
                    ldb b,x                         ; get the token code
                    bra parse_nexttokret            ; save and return it - but don't advance - we already did
parse_nexttok10     bsr parse_toupper               ; convert to upper case
                    cmpa #'A                        ; is it alpha?
                    blo parse_nexttok11             ; brif not
                    cmpa #'Z                        ; is it still alpha?
                    bls parse_nexttok12             ; brif so
; NOTE(review): parse passes B straight through as the error code when this path is taken, so the caller
; receives token_error rather than an err_* value - confirm that is what is wanted.
parse_nexttok11     comb                            ; flag error - unrecognized token
                    ldb #token_error
                    rts

; parse_nextcharu fetches the next input character and forces it to upper case; parse_toupper forces the
; character already in A to upper case.
parse_nextcharu     bsr parse_nextchar              ; fetch next input character
                    beq parse_toupper0              ; brif end of input
parse_toupper       cmpa #'a                        ; is it lower case alpha?
                    blo parse_toupper0              ; brif not
                    cmpa #'z                        ; is it still lower case alpha?
                    bhi parse_toupper0              ; brif not
                    suba #0x20                      ; adjust to upper case alpha
parse_toupper0      rts                             ; Z only set here if input was zero entering from parse_nextcharu

; We parse alpha keywords and identifiers here, of the form [a-zA-Z][a-zA-Z0-9]* with a possible nonalpha characters
; in actual keywords. We use a table to parse keywords. As soon as we find a character that doesn't match a keyword
; table entry, we fall back to looking for the end of an identifier and then returning that.
;
; NOTE(review): this assumes the keyword table itself is assembled elsewhere under the label parse_wordtab; it is
; not defined in this file - confirm against the keyword table builder output.
parse_nexttok12     ldx #parse_wordtab              ; point to keyword table
                    bsr parse_nexttok16             ; walk the keyword table
                    bcc parse_nexttok6              ; brif we matched a keyword - go return it
parse_nexttok13     cmpa #'0                        ; was it alphanumeric?
                    blo parse_nexttok15             ; brif not
                    cmpa #'9                        ; was it numeric?
                    bls parse_nexttok14             ; brif so
                    cmpa #'A                        ; was it alpha?
                    blo parse_nexttok15             ; brif not
                    cmpa #'Z                        ; is it still alpha?
                    bhi parse_nexttok15             ; brif not
parse_nexttok14     bsr parse_nextcharu             ; fetch next character and force upper case
                    bne parse_nexttok13             ; if not end of input, see if we have alphanumeric
parse_nexttok15     tfr y,d                         ; fetch input location
                    subd parse_tokenst              ; calculate length of token
                    std val0+val.strlen             ; save the length of the identifier
                    ldb #token_ident                ; set token type to identifier (variable name, probably)
                    stb parse_curtok                ; save token type like every other successful return
                    clra                            ; flag no error
                    rts                             ; return token type, do not advance since we already did above

; This routine parses tokens using the table at parse_wordtab. The table is structured as follows:
;
; * two bytes which contain the length of the table less the two bytes for this length value
; * a sequence of entries consisting of a single byte matching character and a token code followed
;   by an optional sub table, structured exactly the same way.
;
; The optional subtable will be present if the token code is token_eot
;
; If the character match is negative, it means a lookahead failed. The negative value is the number
; of characters to unget and the token code is the token value to return. No other entries after this
; in a table will be considered since this negative match is a global match.
;
; When a token_eot match is found, if there are no further characters in the input, the match is
; determined to be invalid and processing continues with the next entry.
;
; Entry: A = current input character (upper case), X = address of the table, Y = input pointer
; Exit:  C clear on a match with B = token code and Y on the last character of the keyword;
;        C set if nothing matched (B is destroyed). A is the input character at the deepest level reached.
parse_wordtab0      leas 3,s                        ; clean up stack for sub table handling
parse_nexttok16     pshs a,x                        ; save input character and make room for the end address
                    ldd ,x++                        ; get length of this table; X now points to the first entry
                    stx 1,s                         ; the length counts from the first entry, not the length word
                    addd 1,s                        ; calculate the address of the end of the table
                    std 1,s                         ; save end address for comparison later
                    lda ,s                          ; get back input character
parse_wordtab1      ldb 1,x                         ; fetch token code for this entry
                    cmpa ,x++                       ; does this entry match? (and move past the entry)
                    bne parse_wordtab4              ; brif not
                    cmpb #token_eot                 ; is it indicating a sub table?
                    bne parse_wordtab6              ; brif not - we have our token
                    bsr parse_nextcharu             ; fetch next input character (for sub table match)
                    bne parse_wordtab0              ; brif we are going to check the sub table
parse_wordtab2      ldd ,x++                        ; fetch length of sub table
                    leax d,x                        ; move past sub table
parse_wordtab3      lda ,s                          ; get back input character
                    cmpx 1,s                        ; are we at the end of the table?
                    blo parse_wordtab1              ; brif not - check another entry
                    comb                            ; indicate no match
                    puls a,x,pc                     ; clean up stack and return
parse_wordtab4      lda -2,x                        ; get the match character
                    bmi parse_wordtab5              ; brif negative - lookahead fail
                    cmpb #token_eot                 ; is there a sub table to skip?
                    beq parse_wordtab2              ; brif so - skip sub table
                    bra parse_wordtab3              ; otherwise just move to the next entry
parse_wordtab5      leay a,y                        ; move back the specified number of characters
parse_wordtab6      clra                            ; clear C to indicate a match
                    puls a,x,pc                     ; clean up stack and return

; Number parsing is not implemented here; any numeric token is reported as a syntax error.
parse_number        jmp parse_tokerr

; Relational token table, indexed by the flags built in parse_nexttok; bits are > = <
parse_reltab        fcb token_error                 ; 000 (unused - no relational character seen)
                    fcb token_lt                    ; 001 <
                    fcb token_eq                    ; 010 =
                    fcb token_le                    ; 011 <=
                    fcb token_gt                    ; 100 >
                    fcb token_ne                    ; 101 <>
                    fcb token_ge                    ; 110 >=
                    fcb token_reltrue               ; 111 <=>

; Single character token lookup table
parse_chartab       fcb 0x21,token_bang             ; !
                    fcb 0x23,token_hash             ; #
                    fcb 0x24,token_dollar           ; $
                    fcb 0x25,token_percent          ; %
                    fcb 0x26,token_amp              ; &
                    fcb 0x27,token_apos             ; '
                    fcb 0x28,token_oparen           ; (
                    fcb 0x29,token_cparen           ; )
                    fcb 0x2a,token_star             ; *
                    fcb 0x2b,token_plus             ; +
                    fcb 0x2c,token_comma            ; ,
                    fcb 0x2d,token_minus            ; -
                    fcb 0x2f,token_slash            ; /
                    fcb 0x3a,token_stmtsep          ; :
                    fcb 0x3b,token_semi             ; ;
                    fcb 0x3f,token_print            ; ? - print shortcut
                    fcb 0x40,token_at               ; @
                    fcb 0x5e,token_exp              ; ^ - exponentiation
                    fcb 0x00,token_eot              ; end of table flag

; Parse tokens - define them in order using the macro parse_tokdef. Each use assigns the next token number
; to the first argument and emits the address of its statement handler (second argument), so the token
; number doubles as the index into parse_stmtjump.
*pragmapush list
*pragma nolist
parse_toknum        set 0
parse_tokdef        macro noexpand
\1                  equ parse_toknum
parse_toknum        set parse_toknum+1
                    fdb \2
                    endm
*pragmapop list
parse_stmtjump      parse_tokdef token_error,parse_tokerr
                    parse_tokdef token_eot,parse_noop
                    parse_tokdef token_lt,parse_noop
                    parse_tokdef token_le,parse_noop
                    parse_tokdef token_gt,parse_noop
                    parse_tokdef token_ge,parse_noop
                    parse_tokdef token_eq,parse_noop
                    parse_tokdef token_ne,parse_noop
                    parse_tokdef token_reltrue,parse_noop ; always true relational operator
                    parse_tokdef token_stmtsep,parse_noop
                    parse_tokdef token_apos,parse_rem
                    parse_tokdef token_special,parse_noop
                    parse_tokdef token_bang,parse_noop
                    parse_tokdef token_hash,parse_noop
                    parse_tokdef token_dollar,parse_noop
                    parse_tokdef token_percent,parse_noop
                    parse_tokdef token_amp,parse_noop
                    parse_tokdef token_oparen,parse_noop
                    parse_tokdef token_cparen,parse_noop
                    parse_tokdef token_star,parse_noop
                    parse_tokdef token_plus,parse_noop
                    parse_tokdef token_comma,parse_noop
                    parse_tokdef token_minus,parse_noop
                    parse_tokdef token_slash,parse_noop
                    parse_tokdef token_semi,parse_noop
                    parse_tokdef token_at,parse_noop
                    parse_tokdef token_exp,parse_noop
                    parse_tokdef token_ident,parse_noop
                    parse_tokdef token_rem,parse_noop
                    parse_tokdef token_return,parse_noop
                    parse_tokdef token_run,parse_noop
                    parse_tokdef token_data,parse_noop
                    parse_tokdef token_else,parse_noop
                    parse_tokdef token_end,parse_noop
                    parse_tokdef token_stop,parse_noop
                    parse_tokdef token_sub,parse_noop
                    parse_tokdef token_let,parse_noop
                    parse_tokdef token_list,parse_noop
                    parse_tokdef token_new,parse_noop
                    parse_tokdef token_not,parse_noop
                    parse_tokdef token_print,parse_noop
                    parse_tokdef token_pop,parse_noop
                    parse_tokdef token_to,parse_noop
                    parse_tokdef token_and,parse_noop
                    parse_tokdef token_or,parse_noop
                    parse_tokdef token_go,parse_noop
                    parse_tokdef token_as,parse_noop
                    parse_tokdef token_asc,parse_noop

; Handler for the ' (remark) token - currently does nothing.
parse_rem           rts
*pragmapop list