Mercurial > hg > index.cgi
view src/parse.s @ 126:ac183a519439
Update parsing scheme with a keyword lookup by token value and other framework
Add ability to turn a token code into a keyword string. Also correct some
details related to token table generation with some additional adjustments
for token symbols.
Also rework token symbol definitions and creation of some parsing tables as
well as the main statement parsing loop.
author | William Astle <lost@l-w.ca> |
---|---|
date | Mon, 08 Jan 2024 22:58:08 -0700 |
parents | 0607e4e20702 |
children | 527212870064 |
line wrap: on
line source
; Listing control: save the current listing state and turn listing on for this file.
                *pragmapush list
                *pragma list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This is the overall parsing package. This is responsible for converting program text into the internal byte code and
; reporting any syntax errors and anything else reasonably detectable at parse time without having overly complicated
; code analysis.
;
; This is a recursive descent parser.
;
; Entry:
; X     Points to the text to encode (a zero byte marks the end of the input)
; B     Nonzero to prevent generating any output (error check/length calculation only)
;
; Exit:
; U     Points to the encoded line
; D     Length of the encoded line
; CC.C  clear
;
; Error Exit:
; B     Error code
; U     Points to the start of the input token that was being scanned when the error was raised
; CC.C  set
;
; NOTE(review): when parse_nexttok itself fails on an unrecognized character, B holds token_error rather than one of
; the err_* codes and is passed back unchanged as the error code - confirm that is what callers expect.
parse           stb parse_noout                 ; save no-output flag
                leay ,x                         ; the scanner routines keep the input pointer in Y
                ldu freestart                   ; point to start of free memory where we will build the output
                pshs u                          ; save original free memory location
parse_nextstmt  jsr parse_nexttok               ; fetch the next token, return type in D
                bcc parse0                      ; brif we succeeded in parsing a token
parse_error     puls u                          ; restore original free memory location - deallocate any encoding
                stu freestart
                ldu parse_tokenst               ; get start location we started parsing the token at
                rts                             ; return error condition; C is still set (PULS/STU/LDU leave it alone)
parse0          ldx #parsetab_cmd               ; point to jump table for token type handler
parse1          cmpb ,x                         ; did we match a valid command token?
                beq parse3                      ; brif so
                leax 3,x                        ; move to next entry (one token byte plus a two byte handler address)
                cmpx #parsetab_cmde             ; end of table?
                blo parse1                      ; brif not
parse2          comb                            ; set C - we get here with C clear when the table search runs out
                ldb #err_sn                     ; flag syntax error (LDB does not disturb C)
                bra parse_error                 ; and return the error
parse3          jsr [1,x]                       ; call the handler
                bcs parse_error                 ; brif the handler indicated error
                jsr parse_curtoken              ; get the token we terminated on
                cmpb #token_eot                 ; end of input?
                bne parse4                      ; brif not
                ldb #bc_eol                     ; stash an end of line op
                bsr parse_write
                bcs parse_error                 ; brif we errored out writing to the result (OM?)
                tfr u,d                         ; calculate the length of the result
                subd ,s
                puls u,pc                       ; get pointer to start of encoded result and return (C is already clear)
parse4          cmpb #token_stmtsep             ; statement separator?
                beq parse_nextstmt              ; brif so - do another statement
; NOTE(review): the branch below re-dispatches on the current token without fetching a new one, so the handler for '
; has to move the current token along itself; a handler that does nothing would come straight back here forever.
                cmpb #token_remabbr             ; ' token?
                beq parse0                      ; brif so - parse it as a new statement
                bra parse2                      ; raise a syntax error
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Append one byte to the encoded output.
;
; Entry:
; B     Byte to write
; U     Current output pointer
;
; Exit:
; U     Advanced by one
; CC.C  clear
;
; Error Exit:
; B     err_om
; CC.C  set
;
; A is clobbered in all cases; X is clobbered when output is enabled. When parse_noout is nonzero nothing is stored
; and freestart is left alone; U is only counted up so the caller can still work out the encoded length.
parse_write     lda parse_noout                 ; are we doing output?
                beq parse_write0                ; brif so
                leau 1,u                        ; just count up the output and don't do anything
                andcc #0xfe                     ; report success explicitly - nothing above changes C
                rts
parse_write0    leax -stackheadroom,s           ; calculate bottom of stack with headroom
                cmpx freestart                  ; did the stack run into the end of the output?
                bhs parse_write1                ; brif not - we're good
                ldb #err_om                     ; raise out of memory error, C already set from comparison
                rts
parse_write1    stb ,u+                         ; save output byte
                stu freestart                   ; save new top of used memory
; Do-nothing handler shared by the parse and list tables. It does not touch C itself, so the caller sees whatever C
; was on entry (clear when falling in from parse_write1 above).
list_noop
parse_noop      rts                             ; return all clear - C clear from comparison above
; Fetch the code of the most recently scanned token into B.
parse_curtoken  ldb parse_curtok                ; fetch token code of current token
                rts
; Common "unexpected token" exit: returns C set with B = err_sn.
parse_tokerr    comb                            ; flag error - unexpected token
                ldb #err_sn                     ; raise syntax error
                rts
; parse_nextchar steps Y to the next input character unless it is already sitting on the terminating zero byte;
; parse_curchar just re-reads the character at Y. Both return the character in A with Z set at end of input.
parse_nextchar  lda ,y                          ; at end of input already?
                beq parse_curchar               ; brif so
                leay 1,y                        ; move to next input character
parse_curchar   lda ,y                          ; fetch input character
                rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Scan the next token from the input.
;
; Entry:
; Y     Input pointer
;
; Exit:
; D     Token code (A is zero), also saved in parse_curtok
; Y     Points past the token, except at end of input where it stays on the terminating zero byte
; CC.C  clear
;
; Error Exit:
; B     token_error for an unrecognized character, or whatever parse_number returned
; CC.C  set
;
; Leading spaces are skipped. parse_tokenst records where the token starts (it is not updated for token_eot). For
; identifiers, the length is stored at val0+val.strlen.
parse_nexttok   bsr parse_curchar               ; fetch current input
                beq parse_nexttok1              ; brif end of input
parse_nexttok0  cmpa #0x20                      ; space?
                bne parse_nexttok2              ; brif not
                bsr parse_nextchar              ; eat the space
                bne parse_nexttok0              ; brif not end of input
parse_nexttok1  ldb #token_eot                  ; flag end of input
                bra parse_nexttok7              ; return it but stay on the zero byte so we can never run past it
parse_nexttok2  sty parse_tokenst               ; save start of current token after skipping spaces
                bsr parse_toupper               ; make sure we have upper case letters for matching
                ldx #parse_wt                   ; point to keyword parsing table
                bsr parse_wordtab               ; go see if we have a match in the keyword table
                bcc parse_nexttok6              ; brif we do - return it
                ldy parse_tokenst               ; return to the start of the token - pointer probably clobbered
                bsr parse_curchar               ; get back input character (may have been clobbered)
                cmpa #'.                        ; leading decimal?
                beq parse_nexttok3              ; brif so - parse number
                cmpa #'0                        ; is it a digit
                blo parse_nexttok10             ; brif not
                cmpa #'9                        ; is it still a digit?
                bhi parse_nexttok10             ; brif not
parse_nexttok3  jmp parse_number                ; go parse a number
parse_nexttok6  leay 1,y                        ; eat the last character of the matched token
parse_nexttok7  stb parse_curtok                ; save token type
                clra                            ; make D the token code and clear C for no error (this leaves Z set)
                rts
parse_nexttok10 bsr parse_toupper               ; fold case so an identifier may start with a lower case letter too
                cmpa #'A                        ; is it alpha?
                blo parse_nexttok11             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bls parse_nexttok12             ; brif so
parse_nexttok11 comb                            ; flag error - unrecognized token
                ldb #token_error
                rts
parse_nexttok12 bsr parse_nextcharu             ; fetch next input character
                cmpa #'0                        ; is it alphanumeric?
                blo parse_nexttok13             ; brif not
                cmpa #'9                        ; is it numeric?
                bls parse_nexttok12             ; brif so - keep skipping it
                cmpa #'A                        ; is it alpha?
                blo parse_nexttok13             ; brif not
                cmpa #'Z                        ; is it still alpha?
                bls parse_nexttok12             ; brif so - keep skipping it
parse_nexttok13 tfr y,d                         ; calculate length of identifier
                subd parse_tokenst
                std val0+val.strlen             ; save it for reference
                ldb #token_ident                ; indicate an identifier (variable name, etc.)
                bra parse_nexttok7              ; record it as the current token; Y is already past the identifier
; parse_nextcharu fetches the next input character and folds it to upper case; parse_toupper folds the character
; already in A. Only a-z are changed.
parse_nextcharu bsr parse_nextchar              ; fetch next input character
                beq parse_toupper0              ; brif end of input
parse_toupper   cmpa #'a                        ; is it lower case alpha?
                blo parse_toupper0              ; brif not
                cmpa #'z                        ; is it still lower case alpha?
                bhi parse_toupper0              ; brif not
                suba #0x20                      ; adjust to upper case alpha
parse_toupper0  rts                             ; Z only set here if input was zero entering from parse_nextcharu
; Number parsing is not implemented yet; any number is reported as a syntax error.
parse_number    jmp parse_tokerr
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This routine parses tokens using the table pointed to by X (parse_wt when called from parse_nexttok). The table is
; structured as follows:
;
; * two bytes which contain the length of the table less the two bytes for this length value
; * a sequence of entries consisting of a single byte matching character and a token code followed
;   by an optional sub table, structured exactly the same way.
;
; NOTE(review): the code below computes the end of a table as (address of the length word) + (length value), both
; here and in parse_wtdc2, which only works if the stored length counts its own two bytes. Confirm which convention
; the table generator actually uses and correct either this comment or the code.
;
; The optional subtable will be present if the token code is token_eot
;
; If the character match is negative, it means a lookahead failed. The negative value is the number
; of characters to unget and the token code is the token value to return. No other entries after this
; in a table will be considered since this negative match is a global match.
;
; When a token_eot match is found, if there are no further characters in the input, the match is
; determined to be invalid and processing continues with the next entry.
;
; Entry:
; A     Current input character, already folded to upper case
; X     Points to the table
; Y     Points to the current input character
;
; Exit:
; B     Token code
; Y     Points to the last character belonging to the token
; CC.C  clear
;
; No Match Exit:
; CC.C  set (B and Y are not meaningful)
parse_wordtab0  leas 3,s                        ; clean up stack for sub table handling
parse_wordtab   pshs a,x                        ; save input character and start of table
                ldd ,x++                        ; get length of this table
                addd 1,s                        ; calculate the address of the end of the table
                std 1,s                         ; save end address for comparison later
                lda ,s                          ; get back input character
parse_wordtab1  ldb 1,x                         ; fetch token code for this entry
                cmpa ,x++                       ; does this entry match?
                bne parse_wordtab4              ; brif not
                cmpb #token_eot                 ; is it indicating a sub table?
                bne parse_wordtab6              ; brif not
                bsr parse_nextcharu             ; fetch next input character (for sub table match)
                bne parse_wordtab0              ; brif we are going to check the sub table
                leay -1,y                       ; end of input - step back to the character we matched so the
                                                ; remaining entries are tried against the right input position
parse_wordtab2  ldd ,x                          ; fetch length of sub table
                leax d,x                        ; move past sub table
parse_wordtab3  lda ,s                          ; get back input character
                cmpx 1,s                        ; are we at the end of the table?
                blo parse_wordtab1              ; brif not - check another entry
                comb                            ; indicate no match
                puls a,x,pc                     ; clean up stack and return
parse_wordtab4  lda -2,x                        ; get the match character
                bmi parse_wordtab5              ; brif negative - lookahead fail
                cmpb #token_eot                 ; is there a sub table to skip?
                beq parse_wordtab2              ; brif so - skip sub table
                bra parse_wordtab3              ; otherwise just move to the next entry
parse_wordtab5  leay a,y                        ; move back the specified number of characters
parse_wordtab6  clra                            ; clear C to indicate a match
                puls a,x,pc                     ; clean up stack and return
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Convert a token number back to its keyword. This will use the same table used by parse_wordtab.
;
; Entry:
; A     Token code to convert
; U     Points to a character output routine which takes the character in A
;
; Exit:
; CC.C  clear if the keyword was found and sent to the output routine, set if the token does not exist in the
;       word table
;
; Y is not touched by the code here. The output routine has to preserve U because it is used as the read pointer
; into the temporary string while the characters are being output.
;
; The keyword is assembled back to front: the tree walker (parse_wtdc2) finds the entry carrying the token and
; every level of the walk prepends its match character on the way back up. The string is built downward from
; strbuff+20, so this presumably relies on strbuff being at least 20 bytes and no keyword exceeding 19 characters.
;
; NOTE(review): entries with a negative match character (lookahead failures) are skipped by the walker, so a token
; that only appears in such an entry is reported as not found - confirm that this is intended.
parse_wtdc      pshs u                          ; save routine pointer
                ldu #strbuff+20                 ; point to temporary string buffer
                clr ,-u                         ; put a NUL at the end of the string
                ldx #parse_wt                   ; point to keyword parse table
                bsr parse_wtdc2                 ; call the tree walker function
                bcc parse_wtdc1                 ; brif we do have a match
                puls u,pc                       ; clean stack and return
parse_wtdc0     jsr [,s]                        ; output the character
parse_wtdc1     lda ,u+                         ; get output byte
                bne parse_wtdc0                 ; brif we're not at the end yet
                clra                            ; make sure C is clear
                puls u,pc                       ; clean stack and return
; Tree walker: A is the wanted token, X points to a (sub) table and U is the string build pointer. Returns C clear
; with the matched characters prepended at U, or C set if the token is not in this table. X ends up at the end of
; the table either way.
parse_wtdc2     pshs a,x                        ; save the token match value and the table pointer
                ldd ,x++                        ; get table length
                addd 1,s                        ; calculate end address
                std 1,s                         ; save it
parse_wtdc3     ldd ,x++                        ; get this table entry (A = match character, B = token code)
                bmi parse_wtdc6                 ; brif it's a backtracking entry - skip it
                cmpb ,s                         ; does the token match here? (the token code is in B, not A)
                bne parse_wtdc5                 ; brif not
parse_wtdc4     sta ,-u                         ; add the character to the front of the string being built
                puls a,x,pc                     ; return up the call stack - C is clear on both ways in here
parse_wtdc5     cmpb #token_eot                 ; does this entry have a sub table?
                bne parse_wtdc6                 ; brif not
                pshs a                          ; save the matched character
                lda 1,s                         ; get back the token we need
                bsr parse_wtdc2                 ; go handle the sub table
                puls a                          ; get back the matched character
                bcc parse_wtdc4                 ; brif it did match - record it and return
parse_wtdc6     cmpx 1,s                        ; are we at the end of this table?
                bne parse_wtdc3                 ; brif not - handle another table entry
                coma                            ; make sure C is set for no match
                puls a,x,pc                     ; clean up stack and return
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This table defines the various handler routines for the various bytecode tokens.
; Each token is defined as follows:
; parse_tokendefT <sym>,<parse>,<list>,<exec>
; where:
; T: c for command, f for function, p for particle
; <sym>: the symbol name without the "token_" prefix
; <parse>: parse handler for the type, ignored for particles
; <list>: list handler for the type, ignored for particles
; <exec>: execution handler for the type, ignored for particles
;
; Each definition assigns the next free token number of its class to token_<sym>: particles count up from 0,
; commands from 0x40 and functions from 0xc0. The command and function macros also accumulate the text of three
; tables in string variables, which is emitted later by the token_* macros below:
;
; * parse table: one "fcb token, fdb handler" entry for every token that names a parse handler
; * list table: one "fcb token, fdb handler" entry for every token that names a list handler
; * exec table: exactly one "fdb handler" per token in definition order, using SNERROR when no exec handler is
;   given (presumably so the table can be indexed directly by token number - confirm against the executor)
;
; The exec table entry is built from the fourth argument (<exec>). Every handler symbol named in a definition must
; exist somewhere in the program, including exec handlers such as exec_noop which are not defined in this file.
                *pragmapush list
                *pragma nolist
__toknump       set 0
__toknumc       set 0x40
__toknumf       set 0xc0
                setstr __cmdparset=""
                setstr __cmdlistt=""
                setstr __cmdexect=""
                setstr __fnparset=""
                setstr __fnlistt=""
                setstr __fnexect=""
parse_tokendefp macro noexpand
token_\1        equ __toknump
__toknump       set __toknump+1
                endm
parse_tokendefc macro noexpand
token_\1        equ __toknumc
__toknumc       set __toknumc+1
                ifstr ne,"{2}",""
                setstr __cmdparset="%(__cmdparset)\tfcb\ttoken_\1\n\tfdb {2}\n"
                endc
                ifstr ne,"{3}",""
                setstr __cmdlistt="%(__cmdlistt)\tfcb\ttoken_\1\n\tfdb {3}\n"
                endc
                ifstr ne,"{4}",""
                setstr __cmdexect="%(__cmdexect)\tfdb {4}\n"
                else
                setstr __cmdexect="%(__cmdexect)\tfdb SNERROR\n"
                endc
                endm
parse_tokendeff macro noexpand
token_\1        equ __toknumf
__toknumf       set __toknumf+1
                ifstr ne,"{2}",""
                setstr __fnparset="%(__fnparset)\tfcb\ttoken_\1\n\tfdb {2}\n"
                endc
                ifstr ne,"{3}",""
                setstr __fnlistt="%(__fnlistt)\tfcb\ttoken_\1\n\tfdb {3}\n"
                endc
                ifstr ne,"{4}",""
                setstr __fnexect="%(__fnexect)\tfdb {4}\n"
                else
                setstr __fnexect="%(__fnexect)\tfdb SNERROR\n"
                endc
                endm
token_cmdparse  macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__cmdparset)"
                *pragmapop nolist
                endm
token_cmdlist   macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__cmdlistt)"
                *pragmapop nolist
                endm
token_cmdexec   macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__cmdexect)"
token__maxcmd   equ __toknumc-1
                *pragmapop nolist
                endm
token_fnparse   macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__fnparset)"
                *pragmapop nolist
                endm
token_fnlist    macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__fnlistt)"
                *pragmapop nolist
                endm
token_fnexec    macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__fnexect)"
token__maxfn    equ __toknumf-1
                *pragmapop nolist
                endm
                *pragmapop list
                parse_tokendefp error           ; Used to mark errors; should always be first so it's token #0
                parse_tokendefp eot             ; End of input marker or special handling in word tables
                parse_tokendefp stmtsep         ; statement separator
                parse_tokendefp times           ; times (multiplication) operator (*)
                parse_tokendefp plus            ; addition operator
                parse_tokendefp divide          ; division operator (/)
                parse_tokendefp minus           ; subtraction operator
                parse_tokendefp exp             ; exponentiation operator (^)
                parse_tokendefp lt              ; less than operator
                parse_tokendefp le              ; less than or equal operator
                parse_tokendefp gt              ; greater than operator
                parse_tokendefp ge              ; greater than or equal operator
                parse_tokendefp eq              ; equality operator
                parse_tokendefp ne              ; inequality operator
                parse_tokendefp not             ; boolean NOT operator
                parse_tokendefp and             ; boolean AND operator
                parse_tokendefp or              ; boolean OR operator
                parse_tokendefp bang            ; exclamation mark
                parse_tokendefp hash            ; number sign
                parse_tokendefp dollar          ; dollar sign (string sigil)
                parse_tokendefp percent         ; percent sign (integer sigil)
                parse_tokendefp amp             ; ampersand
                parse_tokendefp oparen          ; opening paren
                parse_tokendefp cparen          ; closing paren
                parse_tokendefp sep             ; comma (separator)
                parse_tokendefp semi            ; semicolon
                parse_tokendefp at              ; @ symbol
                parse_tokendefp ident           ; identifier (has special parsing)
                parse_tokendefp else            ; ELSE
                parse_tokendefp then            ; THEN
                parse_tokendefp to              ; TO
                parse_tokendefp sub             ; SUB
                parse_tokendefp as              ; AS
                parse_tokendefc remabbr,parse_noop,list_noop,exec_noop ; abbreviated REM (')
                parse_tokendefc rem,parse_noop,list_noop,exec_noop ; REM
                parse_tokendefc return,parse_noop,parse_noop,parse_noop ; RETURN
                parse_tokendefc run,parse_noop,parse_noop,parse_noop ; RUN
                parse_tokendefc data,parse_noop,parse_noop,parse_noop ; DATA
                parse_tokendefc end,parse_noop,parse_noop,parse_noop ; END
                parse_tokendefc stop,parse_noop,parse_noop,parse_noop ; STOP
                parse_tokendefc let,parse_noop,parse_noop,parse_noop ; LET
                parse_tokendefc list,parse_noop,parse_noop,parse_noop ; LIST
                parse_tokendefc new,parse_noop,parse_noop,parse_noop ; NEW
                parse_tokendefc print,parse_noop,parse_noop,parse_noop ; PRINT
                parse_tokendefc pop,parse_noop,parse_noop,parse_noop ; POP
                parse_tokendefc goto,parse_noop,parse_noop,parse_noop ; GOTO
                parse_tokendefc gosub,parse_noop,parse_noop,parse_noop ; GOSUB
                parse_tokendefc go,parse_noop,parse_noop,parse_noop ; GO
                parse_tokendeff asc,parse_noop,parse_noop,parse_noop ; ASC()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Parse handling tables
parsetab_cmd    token_cmdparse
parsetab_cmde
parsetab_fn     token_fnparse
parsetab_fne
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; List handling tables
listtab_cmd     token_cmdlist
listtab_cmde
listtab_fn      token_fnlist
listtab_fne
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Execution handling tables
exectab_cmd     token_cmdexec
exectab_fn      token_fnexec
                *pragmapop list