Mercurial > hg > index.cgi
view src/parse.s @ 132:917b4893bb3d
Checkpoint before redoing a bunch of code for clarity
author | William Astle <lost@l-w.ca> |
---|---|
date | Mon, 24 Jun 2024 23:44:39 -0600 |
parents | 95f174bf459b |
children | 5d4801c0566d |
line wrap: on
line source
; Listing control: remember the current listing pragma state and enable listing for this file.
                *pragmapush list
                *pragma list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This is the overall parsing package. This is responsible for converting program text into the internal byte code and
; reporting any syntax errors and anything else reasonably detectable at parse time without having overly complicated
; code analysis. In almost all cases, the returned error will be a syntax error. The internal byte code shares the same
; token number allocations as the parser. Some allocated tokens cannot be identified by the lexer (parse_nexttok) but
; are used at runtime and when "decompiling" to text.
;
; In the event of a parse error, everything up to the next end of statement is retained as is using a special token
; that preserves the unparsable text and parsing resumes. Only the first error is referenced by the return error
; pointer.
;
; This is a recursive descent parser.
;
; Entry:
; X     Points to the text to encode
; B     Nonzero to prevent generating any output (error check/length calculation only)
;
; Exit:
; X     Points to the encoded line
; D     Length of the encoded line
; CC.C  clear
; Error Exit:
; X     Points to the encoded line
; D     Length of the encoded line
; Y     Pointer to the first error location in the input
; U     Error code
; CC.C  set
;
; NOTE(review): the code below does not yet implement the contract above exactly. On success it returns the start
; of the encoded line in U (not X) and the length in D. On error it returns the error code in B and the start of the
; offending token in U, and A is clobbered by the COMA used to set C. Confirm which side is authoritative.
;
; This is the error handler. It is responsible for resetting the stack to bail out to the top level
; parsing loop. It must also store the input pointer if this is the first error. Finally, it has to
; output all the text up to either the end of the line *or* the next valid statement separator.
;
; NOTE(review): as written, the handler only unwinds the stack, releases the partial result and returns to the
; caller of parse; the "first error" bookkeeping and the copy-to-separator behaviour are not implemented here.
parse_errorsn   ldb #err_sn                     ; raise a syntax error
parse_error     lds parse_stackptr              ; restore the original stack pointer so we can call from down stack
                puls u                          ; get back original free pointer
                stu freestart                   ; deallocate any allocated result
                ldu parse_tokenst               ; get start location of the token where the error was raised
                coma                            ; make sure C is set for error
                rts                             ; return to the caller of parse
parse           stb parse_noout                 ; save no-output flag
                leay ,x                         ; save input pointer in a less useful register
                ldu freestart                   ; point to start of free memory where we will build the output
                pshs u                          ; save original free memory location
                sts parse_stackptr              ; save the stack pointer for bailing out on errors
parse_nextstmt  jsr parse_nexttok               ; fetch the next token, return type in B
                bcs parse_error                 ; brif we failed at parsing a token
parse0          ldx #parsetab_cmd               ; point to jump table for token type handler
                cmpb #token_stmtsep             ; is it a statement separator?
                beq parse_nextstmt              ; brif so - we can just skip it
parse1          cmpb ,x                         ; did we match a valid command token?
                beq parse3                      ; brif so
                leax 3,x                        ; move to next entry (token byte plus handler address)
                cmpx #parsetab_cmde             ; end of table?
                blo parse1                      ; brif not
                bra parse_errorsn               ; fell off the end
parse3          jsr [1,x]                       ; call the handler
                bcs parse_error                 ; brif the handler indicated error
                bsr parse_curtoken              ; fetch the token we left off on
                cmpb #token_eot                 ; end of input?
                bne parse4                      ; brif not
                ldb #bc_eol                     ; stash an end of line op
                bsr parse_write
                bcs parse_error                 ; brif we errored out writing to the result (OM?)
                tfr u,d                         ; calculate the length of the result
                subd ,s
                puls u,pc                       ; get pointer to start of encoded result and return (C is already clear)
parse4          cmpb #token_stmtsep             ; statement separator?
                beq parse_nextstmt              ; brif so - do another statement
                cmpb #token_remabbr             ; ' token?
                beq parse0                      ; brif so - parse it as a new statement
                bra parse_errorsn               ; raise a syntax error
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Write the byte in B to the output at U. If parse_noout is nonzero, nothing is stored and U is simply advanced so
; the caller can still compute the encoded length. Returns C clear on success, or C set with B = err_om if the
; output would come within stackheadroom bytes of the stack. Clobbers A, and X on the real output path.
parse_write     lda parse_noout                 ; are we doing output?
                beq parse_write0                ; brif so
                leau 1,u                        ; just count up the output and don't do anything
                andcc #0xfe                     ; callers test C after every write - never hand back a stale carry
                rts
parse_write0    leax -stackheadroom,s           ; calculate bottom of stack with headroom
                cmpx freestart                  ; did the stack run into the end of the output?
                bhs parse_write1                ; brif not - we're good
                ldb #err_om                     ; raise out of memory error, C already set from comparison
                rts
parse_write1    stb ,u+                         ; save output byte
                stu freestart                   ; save new top of used memory
list_noop
parse_noop      rts                             ; return all clear - C clear from comparison above
; Return the most recently lexed token code in B.
parse_curtoken  ldb parse_curtok                ; fetch token code of current token
                rts
; Return a syntax error: C set, B = err_sn.
parse_tokerr    comb                            ; flag error - unexpected token
                ldb #err_sn                     ; raise syntax error
                rts
; Advance Y to the next input character unless already at the NUL terminator; return the character in A (Z set if NUL).
parse_nextchar  lda ,y                          ; at end of input already?
                beq parse_curchar               ; brif so
                leay 1,y                        ; move to next input character
parse_curchar   lda ,y                          ; fetch input character
                rts
; parse_nexttokc fetches the next token and then falls into parse_iseos, which sets Z if the token in B is
; either end of text or a statement separator.
parse_nexttokc  bsr parse_nexttok               ; fetch next token
parse_iseos     cmpb #token_eot                 ; end of text?
                beq parse_iseos0                ; brif so
                cmpb #token_stmtsep             ; is it a statement separator
parse_iseos0    rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Lexer: skip spaces, then identify the token starting at Y. Returns the token type in B with C clear, or C set on
; failure. Keyword and identifier tokens are also recorded in parse_curtok, and parse_tokenst is set to the first
; character of the token. Keywords and particles are matched through the parse_wt word table; anything else
; starting with a digit or "." is handed to parse_number, and anything starting with a letter is an identifier
; whose length is left in val0+val.strlen.
;
; NOTE(review): the unrecognized token path returns B = token_error with C set, and parse passes B on as the error
; code; confirm that is the intended error number.
parse_nexttok   bsr parse_curchar               ; fetch current input
                beq parse_nexttok1              ; brif end of input
parse_nexttok0  cmpa #0x20                      ; space?
                bne parse_nexttok2              ; brif not
                bsr parse_nextchar              ; eat the space
                bne parse_nexttok0              ; brif not end of input
parse_nexttok1  ldb #token_eot                  ; flag end of input
                stb parse_curtok                ; save token type
                clra                            ; clear C to indicate no error
                rts                             ; leave Y on the NUL so a repeated call sees end of input again
parse_nexttok2  sty parse_tokenst               ; save start of current token after skipping spaces
                bsr parse_toupper               ; make sure we have upper case letters for matching
                ldx #parse_wt                   ; point to keyword parsing table
                jsr parse_wordtab               ; go see if we have a match in the keyword table
                bcc parse_nexttok6              ; brif we do - return it
                ldy parse_tokenst               ; return to the start of the token - pointer probably clobbered
                bsr parse_curchar               ; get back input character (may have been clobbered)
                bsr parse_toupper               ; fold case again so a lower case first letter is still an identifier
                cmpa #'.                        ; leading decimal?
                beq parse_nexttok3              ; brif so - parse number
                cmpa #'0                        ; is it a digit
                blo parse_nexttok10             ; brif not
                cmpa #'9                        ; is it still a digit?
                bhi parse_nexttok10             ; brif not
parse_nexttok3  jmp parse_number                ; go parse a number
parse_nexttok6  stb parse_curtok                ; save token type
                leay 1,y                        ; eat the input character
                clra                            ; clear C to indicate no error (note: CLRA also sets Z)
                rts
parse_nexttok10 cmpa #'A                        ; is it alpha?
                blo parse_nexttok11             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bls parse_nexttok12             ; brif so
parse_nexttok11 comb                            ; flag error - unrecognized token
                ldb #token_error
                rts
parse_nexttok12 bsr parse_nextcharu             ; fetch next input character
                cmpa #'0                        ; is it alphanumeric?
                blo parse_nexttok13             ; brif not
                cmpa #'9                        ; is it numeric?
                bls parse_nexttok12             ; brif so - keep skipping it
                cmpa #'A                        ; is it alpha?
                blo parse_nexttok13             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bls parse_nexttok12             ; brif so - keep skipping it
parse_nexttok13 tfr y,d                         ; calculate length of identifier
                subd parse_tokenst
                std val0+val.strlen             ; save it for reference
                ldb #token_ident                ; indicate an identifier (variable name, etc.)
                stb parse_curtok                ; record it so parse_curtoken sees the identifier (C is not affected)
                rts                             ; return result (C will be clear from SUBD above)
; parse_nextcharu fetches the next input character and folds it to upper case; parse_toupper folds the
; character already in A.
parse_nextcharu bsr parse_nextchar              ; fetch next input character
                beq parse_toupper0              ; brif end of input
parse_toupper   cmpa #'a                        ; is it lower case alpha?
                blo parse_toupper0              ; brif not
                cmpa #'z                        ; is it still lower case alpha?
                bhi parse_toupper0              ; brif not
                suba #0x20                      ; adjust to upper case alpha
parse_toupper0  rts                             ; Z only set here if input was zero entering from parse_nextcharu
; Number parsing is not implemented yet: any numeric token is reported as a syntax error.
parse_number    jmp parse_tokerr
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Parse a statement that consists of just the command token
parse_cmdsingle equ parse_write                 ; just write the token out and bail
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Parse a REM or ' statement. We just copy the comment out after the REM or ' token.
parse_rem       jsr parse_write                 ; write the token/character out
                bcs parse_rem0                  ; brif the write failed - keep the error code in B and C set
                ldb ,y+                         ; get next input character
                bne parse_rem                   ; brif not at the end of the input
                ldb #token_eot                  ; flag end of input for mainline parser
                stb parse_curtok                ; (LDB/STB leave C alone, so C is still clear from parse_write)
parse_rem0      rts                             ; return, pass back the C result from parse_write
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This routine parses tokens using the word table pointed to by X. The table is structured as follows:
;
; * two bytes which contain the length of the table less the two bytes for this length value
; * a sequence of entries consisting of a single byte matching character and a token code followed
;   by an optional sub table, structured exactly the same way.
;
; The optional subtable will be present if the token code is token_eot
;
; If the character match is negative, it means a lookahead failed. The negative value is the number
; of characters to unget and the token code is the token value to return. No other entries after this
; in a table will be considered since the negative match is a global match.
;
; When a token_eot match is found, if there are no further characters in the input, the match is
; determined to be invalid and processing continues with the next entry.
;
; Enter with the (upper case) input character in A, the table pointer in X and the input pointer in Y.
; On a match, returns C clear with the token code in B; Y is left on the last character of the match (or moved
; back by the unget count for a lookahead entry). On no match, returns C set and B is not meaningful.
;
; NOTE(review): the end of a table is computed as (address of the length word) + (length), both here and when
; skipping a sub table. That only lands on the true end if the stored length counts its own two bytes, which is
; not what the description above says - confirm against the table generator.
parse_wordtab0  leas 3,s                        ; clean up stack for sub table handling
parse_wordtab   pshs a,x                        ; save input character and start of table
                ldd ,x++                        ; get length of this table
                addd 1,s                        ; calculate the address of the end of the table
                std 1,s                         ; save end address for comparison later
                lda ,s                          ; get back input character
parse_wordtab1  ldb 1,x                         ; fetch token code for this entry
                cmpa ,x++                       ; does this entry match?
                bne parse_wordtab4              ; brif not
                cmpb #token_eot                 ; is it indicating a sub table?
                bne parse_wordtab6              ; brif not
                jsr parse_nextcharu             ; fetch next input character (for sub table match)
                bne parse_wordtab0              ; brif we are going to check the sub table
parse_wordtab2  ldd ,x                          ; fetch length of sub table
                leax d,x                        ; move past sub table
parse_wordtab3  lda ,s                          ; get back input character
                cmpx 1,s                        ; are we at the end of the table?
                blo parse_wordtab1              ; brif not - check another entry
                comb                            ; indicate no match
                puls a,x,pc                     ; clean up stack and return
parse_wordtab4  lda -2,x                        ; get the match character
                bmi parse_wordtab5              ; brif negative - lookahead fail
                cmpb #token_eot                 ; is there a sub table to skip?
                beq parse_wordtab2              ; brif so - skip sub table
                bra parse_wordtab3              ; otherwise just move to the next entry
parse_wordtab5  leay a,y                        ; move back the specified number of characters
parse_wordtab6  clra                            ; clear C to indicate a match
                puls a,x,pc                     ; clean up stack and return
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Convert a token number back to its keyword. This will use the same table used by parse_wordtab. Enter with a character
; output routine pointer in U which takes the character in A. The routine can assume that Y is preserved. Will return
; with C set if the token does not exist in the word table and clear otherwise.
parse_wtdc      pshs u                          ; save routine pointer
                ldu #strbuff+20                 ; point to temporary string buffer
                clr ,-u                         ; put a NUL at the end of the string
                ldx #parse_wt                   ; point to keyword parse table
                bsr parse_wtdc2                 ; call the tree walker function
                bcc parse_wtdc1                 ; brif we do have a match
                puls u,pc                       ; clean stack and return
parse_wtdc0     jsr [,s]                        ; output the character (routine is presumed to preserve U - confirm)
parse_wtdc1     lda ,u+                         ; get output byte
                bne parse_wtdc0                 ; brif we're not at the end yet
                clra                            ; make sure C is clear
                puls u,pc                       ; clean stack and return
; Recursive table walker. Enter with the wanted token in A, the table pointer in X and the string build pointer
; in U. The keyword is built backwards: each level that lies on the path to the token pushes its match character
; below U as the recursion unwinds. Returns C clear on a match and C set otherwise; on return X holds the end
; address of the table that was walked.
parse_wtdc2     pshs a,x                        ; save the token match value and the table pointer
                ldd ,x++                        ; get table length
                addd 1,s                        ; calculate end address
                std 1,s                         ; save it
parse_wtdc3     ldd ,x++                        ; get this table entry (A = match character, B = token code)
                bmi parse_wtdc6                 ; brif it's a backtracking entry - skip it
                cmpb ,s                         ; does the token match here? (the token code is in B, not A)
                bne parse_wtdc5                 ; brif not
parse_wtdc4     sta ,-u                         ; add the character to the output buffer (U is the buffer; Y is preserved)
                puls a,x,pc                     ; return up the call stack - C is clear from CMPB/BCC above
parse_wtdc5     cmpb #token_eot                 ; does this entry have a sub table?
                bne parse_wtdc6                 ; brif not
                pshs a                          ; save the matched character
                lda 1,s                         ; get back the token we need
                bsr parse_wtdc2                 ; go handle the sub table
                puls a                          ; get back the matched character
                bcc parse_wtdc4                 ; brif it did match - record it and return
parse_wtdc6     cmpx 1,s                        ; are we at the end of this table?
                bne parse_wtdc3                 ; brif not - handle another table entry
                coma                            ; make sure C is set for no match
                puls a,x,pc                     ; clean up stack and return
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Validate a line number. Must enter with the token type in B. Will return the line number in X. It will return a
; syntax error if the line number is invalid or out of range. It will also consume a valid line number token, which
; means that on a successful return B already holds the token that follows the line number.
parse_linenum   cmpb #token_int32               ; is it an integer?
                beq parse_linenum1              ; brif so
parse_linenum0  ldb #err_sn                     ; flag syntax error
                coma                            ; flag error
                rts
parse_linenum1  ldx val0+val.int                ; get high word of integer
                bne parse_linenum0              ; brif not a valid line number
                ldx val0+val.int+2              ; get actual line number
                pshs x                          ; save it
                jsr parse_nexttok               ; consume line number
                puls x,pc                       ; get back line number and return it
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Parse a line number range which is one of the following forms:
; <linenum1>
; <linenum1>-
; <linenum1>-<linenum2>
; -<linenum2>
; The result will store two line numbers. If no - token appears, then both line numbers will be the same. Otherwise,
; if <linenum1> is omitted, it will be assumed to be 0. If <linenum2> is omitted, it will be assumed to be 65535. Those
; are the minimum and maximum line numbers.
;
; Parsing works by first looking for an integer token that is in range. If it finds one, it looks for an optional -
; followed by an optional integer token that is in range. If the first token is not an integer, it must be a - which may
; be optionally followed by another integer in range.
;
; It is technically valid to have a single - with no line numbers.
;
; Enter with the current token in B.
;
; The resulting line numbers are returned in X (range start) and U (range end) with C clear. On error, C is set and
; B holds the error code.
parse_linerange ldx zero                        ; default start line number
                leau -1,x                       ; default end line number
                pshs x,u                        ; save the return range (start at ,s and end at 2,s)
                cmpb #token_minus               ; range with no start?
                beq parse_linerang1             ; brif so
                bsr parse_linenum               ; verify line number, return in X
                bcs parse_linerang4             ; bail out on error
                stx ,s                          ; save new start line number
                jsr parse_iseos                 ; parse_linenum already fetched the next token; set Z if end of statement
                bne parse_linerang0             ; brif not end of line
                ldx ,s                          ; get start line to use as end line
                bra parse_linerang2             ; go set range end and return
parse_linerang0 cmpb #token_minus               ; do we have a range character?
                bne parse_linerang3             ; brif not - we have an error
parse_linerang1 jsr parse_nexttokc              ; parse what comes after the range mark
                beq parse_linerang5             ; brif end of statement - keep the default range end
                bsr parse_linenum               ; make sure it's a valid line number
                bcs parse_linerang4             ; bail out on error
parse_linerang2 stx 2,s                         ; set range end
parse_linerang5 clra                            ; make sure C is clear
                puls x,u,pc                     ; fetch return values and return
parse_linerang3 ldb #err_sn                     ; flag a syntax error
                coma                            ; make sure C is set
parse_linerang4 puls x,u,pc                     ; clean up stack and return error condition
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This table defines the various handler routines for the various bytecode tokens. Each token is defined as follows:
; parse_tokendefT <sym>,<parse>,<list>,<exec>
; where:
; T: c for command, f for function, p for particle
; <sym>: the symbol name without the "token_" prefix
; <parse>: parse handler for the type, ignored for particles
; <list>: list handler for the type, ignored for particles
; <exec>: execution handler for the type, ignored for particles
;
; Particles are numbered from 0, commands from 0x40 and functions from 0xc0. The command and function macros
; accumulate the text of three tables in assembler string variables, which are emitted later by the token_*parse,
; token_*list and token_*exec macros. Parse and list table entries are a token byte followed by a handler address
; and are only generated when a handler is given. The exec tables hold one handler address for every token, in
; definition order, using <exec> when given and SNERROR otherwise.
                *pragmapush list
                *pragma nolist
__toknump       set 0
__toknumc       set 0x40
__toknumf       set 0xc0
                setstr __cmdparset=""
                setstr __cmdlistt=""
                setstr __cmdexect=""
                setstr __fnparset=""
                setstr __fnlistt=""
                setstr __fnexect=""
parse_tokendefp macro noexpand
token_\1        equ __toknump
__toknump       set __toknump+1
                endm
parse_tokendefc macro noexpand
token_\1        equ __toknumc
__toknumc       set __toknumc+1
                ifstr ne,"{2}",""
                setstr __cmdparset="%(__cmdparset)\tfcb\ttoken_\1\n\tfdb {2}\n"
                endc
                ifstr ne,"{3}",""
                setstr __cmdlistt="%(__cmdlistt)\tfcb\ttoken_\1\n\tfdb {3}\n"
                endc
                ifstr ne,"{4}",""
                setstr __cmdexect="%(__cmdexect)\tfdb {4}\n"
                else
                setstr __cmdexect="%(__cmdexect)\tfdb SNERROR\n"
                endc
                endm
parse_tokendeff macro noexpand
token_\1        equ __toknumf
__toknumf       set __toknumf+1
                ifstr ne,"{2}",""
                setstr __fnparset="%(__fnparset)\tfcb\ttoken_\1\n\tfdb {2}\n"
                endc
                ifstr ne,"{3}",""
                setstr __fnlistt="%(__fnlistt)\tfcb\ttoken_\1\n\tfdb {3}\n"
                endc
                ifstr ne,"{4}",""
                setstr __fnexect="%(__fnexect)\tfdb {4}\n"
                else
                setstr __fnexect="%(__fnexect)\tfdb SNERROR\n"
                endc
                endm
token_cmdparse  macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__cmdparset)"
                *pragmapop nolist
                endm
token_cmdlist   macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__cmdlistt)"
                *pragmapop nolist
                endm
token_cmdexec   macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__cmdexect)"
token__maxcmd   equ __toknumc-1
                *pragmapop nolist
                endm
token_fnparse   macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__fnparset)"
                *pragmapop nolist
                endm
token_fnlist    macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__fnlistt)"
                *pragmapop nolist
                endm
token_fnexec    macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__fnexect)"
token__maxfn    equ __toknumf-1
                *pragmapop nolist
                endm
                *pragmapop list
; the tokens defined in this section all have special parsing or meaning
                parse_tokendefp error           ; Used to mark errors; should always be first so it's token #0
                parse_tokendefp eot             ; End of input marker or special handling in word tables
                parse_tokendefp int32           ; 32 bit integer (has special parsing)
                parse_tokendefp float           ; floating point value (has special parsing)
                parse_tokendefp ident           ; identifier (has special parsing)
                parse_tokendefp linenum         ; a 16 bit unsigned integer treated as a line number
                parse_tokendefp linerange       ; a pair of 16 bit unsigned integers treated as line numbers
; everything below here references keywords or particle characters
                parse_tokendefp stmtsep         ; statement separator
                parse_tokendefp times           ; times (multiplication) operator (*)
                parse_tokendefp plus            ; addition operator
                parse_tokendefp divide          ; division operator (/)
                parse_tokendefp minus           ; subtraction operator
                parse_tokendefp exp             ; exponentiation operator (^)
                parse_tokendefp lt              ; less than operator
                parse_tokendefp le              ; less than or equal operator
                parse_tokendefp gt              ; greater than operator
                parse_tokendefp ge              ; greater than or equal operator
                parse_tokendefp eq              ; equality operator
                parse_tokendefp ne              ; inequality operator
                parse_tokendefp not             ; boolean NOT operator
                parse_tokendefp and             ; boolean AND operator
                parse_tokendefp or              ; boolean OR operator
                parse_tokendefp bang            ; exclamation mark
                parse_tokendefp hash            ; number sign
                parse_tokendefp dollar          ; dollar sign (string sigil)
                parse_tokendefp percent         ; percent sign (integer sigil)
                parse_tokendefp amp             ; ampersand
                parse_tokendefp oparen          ; opening paren
                parse_tokendefp cparen          ; closing paren
                parse_tokendefp sep             ; comma (separator)
                parse_tokendefp semi            ; semicolon
                parse_tokendefp at              ; @ symbol
                parse_tokendefp else            ; ELSE
                parse_tokendefp then            ; THEN
                parse_tokendefp to              ; TO
                parse_tokendefp sub             ; SUB
                parse_tokendefp as              ; AS
                parse_tokendefc remabbr,parse_rem,list_noop,exec_noop           ; abbreviated REM (')
                parse_tokendefc rem,parse_rem,list_noop,exec_noop               ; REM
                parse_tokendefc return,parse_cmdsingle,parse_noop,parse_noop    ; RETURN
                parse_tokendefc run,parse_noop,parse_noop,parse_noop            ; RUN
                parse_tokendefc data,parse_noop,parse_noop,parse_noop           ; DATA
                parse_tokendefc end,parse_cmdsingle,parse_noop,parse_noop       ; END
                parse_tokendefc stop,parse_cmdsingle,parse_noop,parse_noop      ; STOP
                parse_tokendefc let,parse_noop,parse_noop,parse_noop            ; LET
                parse_tokendefc list,parse_noop,parse_noop,parse_noop           ; LIST
                parse_tokendefc new,parse_cmdsingle,parse_noop,parse_noop       ; NEW
                parse_tokendefc print,parse_noop,parse_noop,parse_noop          ; PRINT
                parse_tokendefc pop,parse_cmdsingle,parse_noop,parse_noop       ; POP
                parse_tokendefc goto,parse_noop,parse_noop,parse_noop           ; GOTO
                parse_tokendefc gosub,parse_noop,parse_noop,parse_noop          ; GOSUB
                parse_tokendefc go,parse_noop,parse_noop,parse_noop             ; GO
                parse_tokendeff asc,parse_noop,parse_noop,parse_noop            ; ASC()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Parse handling tables
parsetab_cmd    token_cmdparse
parsetab_cmde
parsetab_fn     token_fnparse
parsetab_fne
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; List handling tables
;
; These are emitted from the strings accumulated by the token definition macros: one entry per token that named
; a list handler, each a token byte followed by the handler address. The listtab_*e labels mark the end of each
; table.
listtab_cmd     token_cmdlist
listtab_cmde
listtab_fn      token_fnlist
listtab_fne
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Execution handling tables
;
; These hold one handler address per defined token, in definition order, with no token byte. Expanding the macros
; also defines token__maxcmd and token__maxfn as the highest command and function token numbers allocated.
exectab_cmd     token_cmdexec
exectab_fn      token_fnexec
                *pragmapop list