# HG changeset patch # User William Astle # Date 1720913581 21600 # Node ID 4983ba49f936a12048126ff7cbfbd503c4bc3e5d # Parent 18940aa42dcf814c3069270193b699434bd8ed79 Make backup of parse.s in preparation for a complete refactor diff -r 18940aa42dcf -r 4983ba49f936 src/parse.s-saved --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/parse.s-saved Sat Jul 13 17:33:01 2024 -0600 @@ -0,0 +1,472 @@ + *pragmapush list + *pragma list +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; This is the overall parsing package. This is responsible for converting program text into the internal byte code and +; reporting any syntax errors and anything else reasonably detectable at parse time without having overly complicated +; code analysis. In almost all cases, the returned error will be a syntax error. The internal byte code shares the same +; token number allocations as the parser. Some allocated tokens cannot be identified by the lexer (parse_nexttok) but +; are used at runtime and when "decompiling" to text. +; +; In the event of a parse error, everything up to the next end of statement is retained as is using a special token +; that preserves the unparsable text and parsing resumes. Only the first error is referenced by the return error +; pointer. +; +; This is a recursive descent parser. +; +; Entry: +; X Points to the text to encode +; B Nonzero to prevent generating any output (error check/length calculation only) +; +; Exit: +; X Points to the encoded line +; D Length of the encoded line +; CC.C clear + +; Error Exit: +; X Points to the encoded line +; D Length of the encoded line +; Y Pointer to the first error location in the input +; U Error code +; CC.C set +; +; This is the error handler. It is responsible for resetting the stack to bail out to the top level +; parsing loop. It must also store the input pointer if this is the first error. 
+; Finally, it has to output all the text up to either the end of the line *or* the next valid
+; statement separator.
+;
+; NOTE(review): the code below only unwinds the stack, releases the partial result and returns with
+; C set. B is passed through untouched as the error code and U is loaded with parse_tokenst (the
+; start of the token being scanned). Copying the unparsable text to the output is not done here.
+parse_errorsn   ldb     #err_sn                 ; default complaint is a syntax error
+parse_error     lds     parse_stackptr          ; restore the original stack pointer so we can call from down stack
+                puls    u                       ; get back original free pointer
+                stu     freestart               ; deallocate any allocated result
+                ldu     parse_tokenst           ; get start location of the token where the error was raised
+                coma                            ; make sure C is set for error
+                rts                             ; this returns to whoever called parse
+; Main entry point. Each statement is dispatched through parsetab_cmd (3 byte entries: token code
+; followed by the handler address). A handler must leave the token that follows its statement as
+; the current token (parse_curtok); anything other than end of input, a statement separator or the
+; ' token at that point is a syntax error.
+; NOTE(review): the success path hands the start of the encoded line back in U (PULS U,PC) - confirm
+; that this is what callers expect.
+parse           stb     parse_noout             ; save no-output flag
+                leay    ,x                      ; save input pointer in a less useful register
+                ldu     freestart               ; point to start of free memory where we will build the output
+                pshs    u                       ; save original free memory location
+                sts     parse_stackptr          ; save the stack pointer for bailing out on errors
+parse_nextstmt  jsr     parse_nexttok           ; fetch the next token, return type in B
+                bcs     parse_error             ; brif we failed at parsing a token
+parse0          ldx     #parsetab_cmd           ; point to jump table for token type handler
+                cmpb    #token_stmtsep          ; is it a statement separator?
+                beq     parse_nextstmt          ; brif so - we can just skip it
+parse1          cmpb    ,x                      ; did we match a valid command token?
+                beq     parse3                  ; brif so
+                leax    3,x                     ; move to next entry
+                cmpx    #parsetab_cmde          ; end of table?
+                blo     parse1                  ; brif not
+                bra     parse_errorsn           ; fell off the end
+parse3          jsr     [1,x]                   ; call the handler (token code still in B, C clear from the match)
+                bcs     parse_error             ; brif the handler indicated error
+                bsr     parse_curtoken          ; fetch the token we left off on
+                cmpb    #token_eot              ; end of input?
+                bne     parse4                  ; brif not
+                ldb     #bc_eol                 ; stash an end of line op
+                bsr     parse_write
+                bcs     parse_error             ; brif we errored out writing to the result (OM?)
+                tfr     u,d                     ; calculate the length of the result
+                subd    ,s
+                puls    u,pc                    ; get pointer to start of encoded result and return (C is already clear)
+parse4          cmpb    #token_stmtsep          ; statement separator?
+                beq     parse_nextstmt          ; brif so - do another statement
+                cmpb    #token_remabbr          ; ' token?
+                beq     parse0                  ; brif so - parse it as a new statement
+                bra     parse_errorsn           ; raise a syntax error
+; Append the byte in B to the output at U. When parse_noout is nonzero nothing is stored and U is
+; simply advanced so a length can still be calculated. Returns C clear on success. Returns C set
+; with B = err_om if the stack (less stackheadroom) has run into the output. Clobbers A, and X when
+; output is enabled.
+parse_write     lda     parse_noout             ; are we doing output?
+                beq     parse_write0            ; brif so
+                leau    1,u                     ; just count up the output and don't do anything
+                andcc   #0xfe                   ; force C clear - nothing above sets it and callers test it
+                rts
+parse_write0    leax    -stackheadroom,s        ; calculate bottom of stack with headroom
+                cmpx    freestart               ; did the stack run into the end of the output?
+                bhs     parse_write1            ; brif not - we're good
+                ldb     #err_om                 ; raise out of memory error, C already set from comparison
+                rts
+parse_write1    stb     ,u+                     ; save output byte
+                stu     freestart               ; save new top of used memory
+list_noop
+parse_noop      rts                             ; return all clear - C clear from comparison above
+parse_curtoken  ldb     parse_curtok            ; fetch token code of current token
+                rts
+parse_tokerr    comb                            ; flag error - unexpected token
+                ldb     #err_sn                 ; raise syntax error
+                rts
+; Input access helpers. Y is the input pointer and is never advanced beyond the NUL terminator by
+; these two; both return the character in A with Z set at end of input.
+parse_nextchar  lda     ,y                      ; at end of input already?
+                beq     parse_curchar           ; brif so
+                leay    1,y                     ; move to next input character
+parse_curchar   lda     ,y                      ; fetch input character
+                rts
+; parse_nexttokc fetches the next token then falls into parse_iseos, which sets Z if the token in B
+; is either end of text or a statement separator.
+parse_nexttokc  bsr     parse_nexttok           ; fetch next token
+parse_iseos     cmpb    #token_eot              ; end of text?
+                beq     parse_iseos0            ; brif so
+                cmpb    #token_stmtsep          ; is it a statement separator
+parse_iseos0    rts
+; Fetch the next token from the input at Y. Leading spaces are skipped, the start of the token is
+; recorded in parse_tokenst and the keyword table parse_wt is tried first. Returns the token code in
+; B (also stored in parse_curtok) with C clear, or C set with B = token_error for an unrecognized
+; character. For an identifier, its length is left in val0+val.strlen. Numbers are handed off to
+; parse_number.
+parse_nexttok   bsr     parse_curchar           ; fetch current input
+                beq     parse_nexttok1          ; brif end of input
+parse_nexttok0  cmpa    #0x20                   ; space?
+                bne     parse_nexttok2          ; brif not
+                bsr     parse_nextchar          ; eat the space
+                bne     parse_nexttok0          ; brif not end of input
+parse_nexttok1  ldb     #token_eot              ; flag end of input
+                bra     parse_nexttok7          ; go return it - do NOT step Y past the NUL terminator
+parse_nexttok2  sty     parse_tokenst           ; save start of current token after skipping spaces
+                bsr     parse_toupper           ; make sure we have upper case letters for matching
+                ldx     #parse_wt               ; point to keyword parsing table
+                jsr     parse_wordtab           ; go see if we have a match in the keyword table
+                bcc     parse_nexttok6          ; brif we do - return it
+                ldy     parse_tokenst           ; return to the start of the token - pointer probably clobbered
+                bsr     parse_curchar           ; get back input character (may have been clobbered)
+                bsr     parse_toupper           ; identifiers are case insensitive from the first character on
+                cmpa    #'.                     ; leading decimal?
+                beq     parse_nexttok3          ; brif so - parse number
+                cmpa    #'0                     ; is it a digit
+                blo     parse_nexttok10         ; brif not
+                cmpa    #'9                     ; is it still a digit?
+                bhi     parse_nexttok10         ; brif not
+parse_nexttok3  jmp     parse_number            ; go parse a number
+parse_nexttok6  leay    1,y                     ; step past the last character the keyword match left us on
+parse_nexttok7  stb     parse_curtok            ; save token type
+                clra                            ; clear C to indicate no error (note that this also sets Z)
+                rts
+parse_nexttok10 cmpa    #'A                     ; is it alpha?
+                blo     parse_nexttok11         ; brif not
+                cmpa    #'Z                     ; is it still alpha?
+                bls     parse_nexttok12         ; brif so
+parse_nexttok11 comb                            ; flag error - unrecognized token
+                ldb     #token_error
+                rts
+parse_nexttok12 bsr     parse_nextcharu         ; fetch next input character
+                cmpa    #'0                     ; is it alphanumeric?
+                blo     parse_nexttok13         ; brif not
+                cmpa    #'9                     ; is it numeric?
+                bls     parse_nexttok12         ; brif so - keep skipping it
+                cmpa    #'A                     ; is it alpha?
+                blo     parse_nexttok13         ; brif not
+                cmpa    #'Z                     ; is it still alpha?
+                bls     parse_nexttok12         ; brif so - keep skipping it
+parse_nexttok13 tfr     y,d                     ; calculate length of identifier
+                subd    parse_tokenst
+                std     val0+val.strlen         ; save it for reference
+                ldb     #token_ident            ; indicate an identifier (variable name, etc.)
+                bra     parse_nexttok7          ; record it as the current token and return with C clear
+; parse_nextcharu fetches the next input character and falls into parse_toupper, which folds a
+; lower case letter in A to upper case.
+parse_nextcharu bsr     parse_nextchar          ; fetch next input character
+                beq     parse_toupper0          ; brif end of input
+parse_toupper   cmpa    #'a                     ; is it lower case alpha?
+                blo     parse_toupper0          ; brif not
+                cmpa    #'z                     ; is it still lower case alpha?
+                bhi     parse_toupper0          ; brif not
+                suba    #0x20                   ; adjust to upper case alpha
+parse_toupper0  rts                             ; Z only set here if input was zero entering from parse_nextcharu
+parse_number    jmp     parse_tokerr            ; no number parsing here yet - always a syntax error
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Parse a statement that consists of just the command token. The token is written out and then the
+; following token is fetched so the main loop finds the end of statement (or not) as the current
+; token. Returns C set if either the write or the token fetch failed.
+parse_cmdsingle jsr     parse_write             ; write the command token out
+                bcs     parse_cmdsingle0        ; brif the write failed - hand the error back
+                jmp     parse_nexttok           ; move on to whatever follows the command
+parse_cmdsingle0
+                rts
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Parse a REM or ' statement. We just copy the comment out after the REM or ' token.
+parse_rem       jsr     parse_write             ; write the token/character out
+                bcs     parse_rem0              ; brif the write failed - keep the error code in B intact
+                ldb     ,y+                     ; get next input character
+                bne     parse_rem               ; brif not at the end of the input
+                ldb     #token_eot              ; flag end of input for mainline parser
+                stb     parse_curtok
+parse_rem0      rts                             ; return, pass back the C result from parse_write
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; This routine parses tokens using the table passed to parse_wordtab in X. The table is structured
+; as follows:
+;
+; * two bytes which contain the length of the table less the two bytes for this length value
+; * a sequence of entries consisting of a single byte matching character and a token code followed
+;   by an optional sub table, structured exactly the same way.
+;
+; The optional subtable will be present if the token code is token_eot
+;
+; If the character match is negative, it means a lookahead failed. The negative value is the number
+; of characters to unget and the token code is the token value to return.
+; No other entries after this in a table will be considered since this negative match is a global
+; match.
+;
+; When a token_eot match is found, if there are no further characters in the input, the match is
+; determined to be invalid and processing continues with the next entry.
+;
+; Entry: A = (upper case) input character, X = table, Y = input pointer.
+; Exit:  C clear and B = token code on a match; C set if nothing matched. A is restored from the
+;        value saved for the table level that finished the search and X is left at the end address
+;        of that table. Y is left on the last character examined.
+;
+; NOTE(review): the code computes a table's end as (address of its length word) + length, and skips
+; a sub table the same way, which only works if the stored length counts from the length word
+; itself - verify against how parse_wt is generated.
+parse_wordtab0  leas    3,s                     ; clean up stack for sub table handling
+parse_wordtab   pshs    a,x                     ; save input character and start of table
+                ldd     ,x++                    ; get length of this table
+                addd    1,s                     ; calculate the address of the end of the table
+                std     1,s                     ; save end address for comparison later
+                lda     ,s                      ; get back input character
+parse_wordtab1  ldb     1,x                     ; fetch token code for this entry
+                cmpa    ,x++                    ; does this entry match?
+                bne     parse_wordtab4          ; brif not
+                cmpb    #token_eot              ; is it indicating a sub table?
+                bne     parse_wordtab6          ; brif not
+                jsr     parse_nextcharu         ; fetch next input character (for sub table match)
+                bne     parse_wordtab0          ; brif we are going to check the sub table
+parse_wordtab2  ldd     ,x                      ; fetch length of sub table
+                leax    d,x                     ; move past sub table
+parse_wordtab3  lda     ,s                      ; get back input character
+                cmpx    1,s                     ; are we at the end of the table?
+                blo     parse_wordtab1          ; brif not - check another entry
+                comb                            ; indicate no match
+                puls    a,x,pc                  ; clean up stack and return
+parse_wordtab4  lda     -2,x                    ; get the match character
+                bmi     parse_wordtab5          ; brif negative - lookahead fail
+                cmpb    #token_eot              ; is there a sub table to skip?
+                beq     parse_wordtab2          ; brif so - skip sub table
+                bra     parse_wordtab3          ; otherwise just move to the next entry
+parse_wordtab5  leay    a,y                     ; move back the specified number of characters
+parse_wordtab6  clra                            ; clear C to indicate a match
+                puls    a,x,pc                  ; clean up stack and return
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Convert a token number back to its keyword. This will use the same table used by parse_wordtab.
+; Enter with the token number in A and a character output routine pointer in U which takes the
+; character in A. The routine can assume that Y is preserved. The output routine itself must
+; preserve U because U is used to walk the keyword text between calls. Will return with C set if the
+; token does not exist in the word table and clear otherwise.
+;
+; The keyword is assembled backwards, last character first, below the NUL placed at strbuff+19 as
+; the table walk unwinds, so U ends up pointing at the first character of the keyword.
+;
+; NOTE(review): lookahead-fail (negative) entries are skipped by the walker, so a token that only
+; appears in such an entry will be reported as not found.
+parse_wtdc      pshs    u                       ; save routine pointer
+                ldu     #strbuff+20             ; point to temporary string buffer
+                clr     ,-u                     ; put a NUL at the end of the string
+                ldx     #parse_wt               ; point to keyword parse table
+                bsr     parse_wtdc2             ; call the tree walker function
+                bcc     parse_wtdc1             ; brif we do have a match
+                puls    u,pc                    ; clean stack and return
+parse_wtdc0     jsr     [,s]                    ; output the character
+parse_wtdc1     lda     ,u+                     ; get output byte
+                bne     parse_wtdc0             ; brif we're not at the end yet
+                clra                            ; make sure C is clear
+                puls    u,pc                    ; clean stack and return
+; Recursive table walker: A = token wanted, X = (sub) table, U = output buffer pointer.
+parse_wtdc2     pshs    a,x                     ; save the token match value and the table pointer
+                ldd     ,x++                    ; get table length
+                addd    1,s                     ; calculate end address
+                std     1,s                     ; save it
+parse_wtdc3     ldd     ,x++                    ; get this table entry: A = match character, B = token code
+                bmi     parse_wtdc6             ; brif it's a backtracking entry - skip it
+                cmpb    ,s                      ; does the token match here?
+                bne     parse_wtdc5             ; brif not
+parse_wtdc4     sta     ,-u                     ; add the character to the output buffer
+                puls    a,x,pc                  ; return up the call stack - C is clear from CMPB above (or the BCC below)
+parse_wtdc5     cmpb    #token_eot              ; does this entry have a sub table?
+                bne     parse_wtdc6             ; brif not
+                pshs    a                       ; save the matched character
+                lda     1,s                     ; get back the token we need
+                bsr     parse_wtdc2             ; go handle the sub table (returns with X past the sub table)
+                puls    a                       ; get back the matched character
+                bcc     parse_wtdc4             ; brif it did match - record it and return
+parse_wtdc6     cmpx    1,s                     ; are we at the end of this table?
+                bne     parse_wtdc3             ; brif not - handle another table entry
+                coma                            ; make sure C is set for no match
+                puls    a,x,pc                  ; clean up stack and return
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Validate a line number. Must enter with the token type in B. Will return the line number in X. It will return a
+; syntax error if the line number is invalid or out of range. It will also consume a valid line number token.
+parse_linenum   cmpb    #token_int32            ; is it an integer?
+                beq     parse_linenum1          ; brif so
+parse_linenum0  ldb     #err_sn                 ; flag syntax error
+                coma                            ; flag error
+                rts
+parse_linenum1  ldx     val0+val.int            ; get high word of integer
+                bne     parse_linenum0          ; brif not a valid line number
+                ldx     val0+val.int+2          ; get actual line number
+                pshs    x                       ; save it
+                jsr     parse_nexttok           ; consume line number (B = following token, C from parse_nexttok)
+                puls    x,pc                    ; get back line number and return it
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Parse a line number range which is one of the following forms:
+;
+; <start>
+; <start>-
+; <start>-<end>
+; -<end>
+; The result will store two line numbers. If no - token appears, then both line numbers will be the same. Otherwise,
+; if <start> is omitted, it will be assumed to be 0. If <end> is omitted, it will be assumed to be 65535. Those
+; are the minimum and maximum line numbers.
+;
+; Parsing works by first looking for an integer token that is in range. If it finds one, it looks for an optional -
+; followed by an optional integer token that is in range. If the first token is not an integer, it must be a - which may
+; be optionally followed by another integer in range.
+;
+; It is technically valid to have a single - with no line numbers.
+;
+; Enter with the current token in B.
+;
+; On success C is clear, X holds the start line and U holds the end line. On failure C is set and B
+; holds whatever the failing routine left there (err_sn for a malformed range).
+; NOTE(review): earlier documentation said the result is returned in parse_buff; the code returns it
+; in X and U.
+parse_linerange ldx     zero                    ; default start line number (assumes "zero" is a word holding 0)
+                leau    -1,x                    ; default end line number
+                pshs    x,u                     ; save the return range
+                cmpb    #token_minus            ; range with no start?
+                beq     parse_linerang1         ; brif so
+                bsr     parse_linenum           ; verify line number, return in X
+                bcs     parse_linerang4         ; bail out on error
+                stx     ,s                      ; save new start line number
+                jsr     parse_iseos             ; parse_linenum already fetched the next token; set Z if end of statement
+                bne     parse_linerang0         ; brif not end of line
+                ldx     ,s                      ; get start line to use as end line
+                bra     parse_linerang2         ; go set range end and return
+parse_linerang0 cmpb    #token_minus            ; do we have a range character?
+                bne     parse_linerang3         ; brif not - we have an error
+parse_linerang1 jsr     parse_nexttokc          ; parse what comes after the range mark
+                beq     parse_linerang5         ; brif end of statement - keep the default range end already saved
+                bsr     parse_linenum           ; make sure it's a valid line number
+                bcs     parse_linerang4         ; bail out on error
+parse_linerang2 stx     2,s                     ; set range end
+parse_linerang5 clra                            ; make sure C is clear
+                puls    x,u,pc                  ; fetch return values and return
+parse_linerang3 ldb     #err_sn                 ; flag a syntax error
+                coma                            ; make sure C is set
+parse_linerang4 puls    x,u,pc                  ; clean up stack and return error condition
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; This table defines the various handler routines for the various bytecode tokens. Each token is defined as follows:
+; parse_tokendefT <sym>,<parse>,<list>,<exec>
+; where:
+; T: c for command, f for function, p for particle
+; <sym>: the symbol name without the "token_" prefix
+; <parse>: parse handler for the type, ignored for particles
+; <list>: list handler for the type, ignored for particles
+; <exec>: execution handler for the type, ignored for particles
+;
+; Particles are numbered from 0, commands from 0x40 and functions from 0xc0, in definition order.
+; The parse and list tables only get an entry (token code byte, handler address) when a handler is
+; given. The execution tables get one address per token, with SNERROR standing in when no execution
+; handler is given.
+                *pragmapush list
+                *pragma nolist
+__toknump       set     0
+__toknumc       set     0x40
+__toknumf       set     0xc0
+                setstr  __cmdparset=""
+                setstr  __cmdlistt=""
+                setstr  __cmdexect=""
+                setstr  __fnparset=""
+                setstr  __fnlistt=""
+                setstr  __fnexect=""
+parse_tokendefp macro   noexpand
+token_\1        equ     __toknump
+__toknump       set     __toknump+1
+                endm
+parse_tokendefc macro   noexpand
+token_\1        equ     __toknumc
+__toknumc       set     __toknumc+1
+                ifstr   ne,"{2}",""
+                setstr  __cmdparset="%(__cmdparset)\tfcb\ttoken_\1\n\tfdb {2}\n"
+                endc
+                ifstr   ne,"{3}",""
+                setstr  __cmdlistt="%(__cmdlistt)\tfcb\ttoken_\1\n\tfdb {3}\n"
+                endc
+                ifstr   ne,"{4}",""
+                setstr  __cmdexect="%(__cmdexect)\tfdb {4}\n"
+                else
+                setstr  __cmdexect="%(__cmdexect)\tfdb SNERROR\n"
+                endc
+                endm
+parse_tokendeff macro   noexpand
+token_\1        equ     __toknumf
+__toknumf       set     __toknumf+1
+                ifstr   ne,"{2}",""
+                setstr  __fnparset="%(__fnparset)\tfcb\ttoken_\1\n\tfdb {2}\n"
+                endc
+                ifstr   ne,"{3}",""
+                setstr  __fnlistt="%(__fnlistt)\tfcb\ttoken_\1\n\tfdb {3}\n"
+                endc
+                ifstr   ne,"{4}",""
+                setstr  __fnexect="%(__fnexect)\tfdb {4}\n"
+                else
+                setstr  __fnexect="%(__fnexect)\tfdb SNERROR\n"
+                endc
+                endm
+token_cmdparse  macro
+                *pragmapush nolist
+                *pragma nolist
+                includestr "%(__cmdparset)"
+                *pragmapop nolist
+                endm
+token_cmdlist   macro
+                *pragmapush nolist
+                *pragma nolist
+                includestr "%(__cmdlistt)"
+                *pragmapop nolist
+                endm
+token_cmdexec   macro
+                *pragmapush nolist
+                *pragma nolist
+                includestr "%(__cmdexect)"
+token__maxcmd   equ     __toknumc-1
+                *pragmapop nolist
+                endm
+token_fnparse   macro
+                *pragmapush nolist
+                *pragma nolist
+                includestr "%(__fnparset)"
+                *pragmapop nolist
+                endm
+token_fnlist    macro
+                *pragmapush nolist
+                *pragma nolist
+                includestr "%(__fnlistt)"
+                *pragmapop nolist
+                endm
+token_fnexec    macro
+                *pragmapush nolist
+                *pragma nolist
+                includestr "%(__fnexect)"
+token__maxfn    equ     __toknumf-1
+                *pragmapop nolist
+                endm
+                *pragmapop list
+                ; the tokens defined in this section all have special parsing or meaning
+                parse_tokendefp error           ; Used to mark errors; should always be first so it's token #0
+                parse_tokendefp eot             ; End of input marker or special handling in word tables
+                parse_tokendefp int32           ; 32 bit integer (has special parsing)
+                parse_tokendefp float           ; floating point value (has special parsing)
+                parse_tokendefp ident           ; identifier (has special parsing)
+                parse_tokendefp linenum         ; a 16 bit unsigned integer treated as a line number
+                parse_tokendefp linerange       ; a pair of 16 bit unsigned integers treated as line numbers
+                ; everything below here references keywords or particle characters
+                parse_tokendefp stmtsep         ; statement separator
+                parse_tokendefp times           ; times (multiplication) operator (*)
+                parse_tokendefp plus            ; addition operator
+                parse_tokendefp divide          ; division operator (/)
+                parse_tokendefp minus           ; subtraction operator
+                parse_tokendefp exp             ; exponentiation operator (^)
+                parse_tokendefp lt              ; less than operator
+                parse_tokendefp le              ; less than or equal operator
+                parse_tokendefp gt              ; greater than operator
+                parse_tokendefp ge              ; greater than or equal operator
+                parse_tokendefp eq              ; equality operator
+                parse_tokendefp ne              ; inequality operator
+                parse_tokendefp not             ; boolean NOT operator
+                parse_tokendefp and             ; boolean AND operator
+                parse_tokendefp or              ; boolean OR operator
+                parse_tokendefp bang            ; exclamation mark
+                parse_tokendefp hash            ; number sign
+                parse_tokendefp dollar          ; dollar sign (string sigil)
+                parse_tokendefp percent         ; percent sign (integer sigil)
+                parse_tokendefp amp             ; ampersand
+                parse_tokendefp oparen          ; opening paren
+                parse_tokendefp cparen          ; closing paren
+                parse_tokendefp sep             ; comma (separator)
+                parse_tokendefp semi            ; semicolon
+                parse_tokendefp at              ; @ symbol
+                parse_tokendefp else            ; ELSE
+                parse_tokendefp then            ; THEN
+                parse_tokendefp to              ; TO
+                parse_tokendefp sub             ; SUB
+                parse_tokendefp as              ; AS
+
+                parse_tokendefc remabbr,parse_rem,list_noop,exec_noop           ; abbreviated REM (')
+                parse_tokendefc rem,parse_rem,list_noop,exec_noop               ; REM
+                parse_tokendefc return,parse_cmdsingle,parse_noop,parse_noop    ; RETURN
+                parse_tokendefc run,parse_noop,parse_noop,parse_noop            ; RUN
+                parse_tokendefc data,parse_noop,parse_noop,parse_noop           ; DATA
+                parse_tokendefc end,parse_cmdsingle,parse_noop,parse_noop       ; END
+                parse_tokendefc stop,parse_cmdsingle,parse_noop,parse_noop      ; STOP
+                parse_tokendefc let,parse_noop,parse_noop,parse_noop            ; LET
+                parse_tokendefc list,parse_noop,parse_noop,parse_noop           ; LIST
+                parse_tokendefc new,parse_cmdsingle,parse_noop,parse_noop       ; NEW
+                parse_tokendefc print,parse_noop,parse_noop,parse_noop          ; PRINT
+                parse_tokendefc pop,parse_cmdsingle,parse_noop,parse_noop       ; POP
+                parse_tokendefc goto,parse_noop,parse_noop,parse_noop           ; GOTO
+                parse_tokendefc gosub,parse_noop,parse_noop,parse_noop          ; GOSUB
+                parse_tokendefc go,parse_noop,parse_noop,parse_noop             ; GO
+
+                parse_tokendeff asc,parse_noop,parse_noop,parse_noop            ; ASC()
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Parse handling tables
+; Each entry is a token code byte followed by the address of its parse handler. Only tokens that
+; were defined with a parse handler appear. The "e" labels mark the end of each table.
+parsetab_cmd token_cmdparse
+parsetab_cmde
+parsetab_fn token_fnparse
+parsetab_fne
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; List handling tables
+; Same layout as the parse tables: token code byte followed by the address of the list handler.
+listtab_cmd token_cmdlist
+listtab_cmde
+listtab_fn token_fnlist
+listtab_fne
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Execution handling tables
+; One handler address per defined token, in definition order, with SNERROR where no execution
+; handler was given. Expanding these macros also defines token__maxcmd and token__maxfn.
+exectab_cmd token_cmdexec
+exectab_fn token_fnexec
+ *pragmapop list