Mercurial > hg > index.cgi
view src/parse.s @ 140:86f6f3a71e60 default tip
Fix some bugs in tokenization/parsing routine
author | William Astle <lost@l-w.ca> |
---|---|
date | Tue, 16 Jul 2024 22:30:07 -0600 |
parents | 5d4801c0566d |
children |
line wrap: on
line source
; Listing control: save the current listing state, then enable listing for this file.
                *pragmapush list
                *pragma list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This is the overall parsing package. It is responsible for converting the input source code into the internal byte
; code.
;
; This version only converts keywords to token codes. Additional conversions will be done in future versions.
;
; Enter with X pointing to the (NUL terminated) text to parse. The encoded result will be placed at freestart. On
; return, X will point to the encoded result and D will contain the length in bytes of the result, and C will be clear.
;
; In the event that there is insufficient memory between freestart and the bottom of the stack, C will be set. This
; routine does not immediately throw an "out of memory" error to allow the caller to clear up some memory and try
; again. The other registers are not meaningful when C is set.
;
; Enter at parseto with U set to the encoding destination and Y set to one byte past the end of the destination buffer
; to specify the destination. Defaults to encoding to the buffer between freestart and the bottom of the stack (with
; headroom accounted for).
;
; Stack frame while parsing (relative to S inside parse itself):
;       0,s     limit check flag (nonzero means parseout must check the output pointer against the limit)
;       1,s     address one past the end of the output buffer
;       3,s     address of the start of the output buffer
; parseout and parseoutw address this frame relative to their own return address, so they MUST be called directly
; from parse with nothing else pushed on the stack.
;
; Throughout the routine, Y points at the most recently consumed input character; every fetch of a new character
; advances Y first and then reads.
;
; The stuff below that has hard coded colon checks will eventually be replaced by more complete parsing.
parse           ldu freestart                   ; default to the start of free memory for encoding
                leay -stackheadroom,s           ; set the top of free memory
parseto         lda #1                          ; flag to enable memory limit detection
                pshs a,u,y                      ; save start and end addresses and OM error detection flag
                leay -1,x                       ; input pointer lives in Y; back up one so the first fetch sees X
parsea          jsr parse_nextchar              ; fetch an input character
                bne parseb                      ; brif not end of input
parsez          tfr u,d                         ; get current output pointer
                subd 3,s                        ; now D is the length (C clear since U >= start)
                ldx 3,s                         ; return pointer to encoded result (LDX leaves C alone)
                leas 5,s                        ; clean up the stack
                rts                             ; return - C will be clear from subd above
parseb          jsr parse_wordtab               ; look up a keyword and see if we have a match
                bcs parsec                      ; brif no match - handle unknown stuff
                tsta                            ; do we have a two byte token?
                beq parseb0                     ; brif not - single byte tokens may need special handling
                jsr parseoutw                   ; stash both bytes of the token
                bra parsea                      ; LSB of a two byte token must not be mistaken for REM/DATA/ELSE
parseb0         cmpb #token_else                ; ELSE?
                beq parsed                      ; brif so - gets a hidden statement separator
                cmpb #token_remabbr             ; REM abbreviation?
                bne parsee                      ; brif not
parsed          lda #':                         ; add a statement separator before it
                jsr parseoutw                   ; output separator then token
                bra parsef
parsee          jsr parseout                    ; output the token code
parsef          cmpb #token_remabbr             ; REM abbreviation?
                beq parseg                      ; brif so
                cmpb #token_rem                 ; Actual REM?
                bne parseh                      ; brif not
parseg          leay 1,y                        ; move to next input character (Y was on the last keyword character)
                ldb ,y                          ; get it
                beq parsez                      ; brif end of input
                jsr parseout                    ; add unmodified characters to output
                bra parseg                      ; keep going until end of input
parseh          cmpb #token_data                ; DATA command?
                bne parsea                      ; brif not - continue normal handling
                clra                            ; flag for not skipping quoted string
parsei          leay 1,y                        ; move to next input character
                ldb ,y                          ; get it
                beq parsez                      ; brif end of input
                cmpb #'"                        ; string delimiter?
                bne parsej                      ; brif not
                coma                            ; flip the quoted statement handler
parsej          cmpb #':                        ; end of statement?
                bne parsek                      ; brif not
                tsta                            ; are we inside a quoted string?
                bne parsek                      ; brif so - the colon is just data
                leay -1,y                       ; unconsume it so the main loop sees the colon
                bra parsea                      ; we're done with DATA
parsek          jsr parseout                    ; put the data value into the output
                bra parsei                      ; go handle another character
parsec          ldb ,y                          ; get back the current input in the right register
                cmpb #'"                        ; did we encounter a quoted string?
                bne parsel                      ; brif not
                jsr parseout                    ; output delimiter
parsem          leay 1,y                        ; move to next input character
                ldb ,y                          ; get string character
                beq parsez                      ; brif end of input
                jsr parseout                    ; output it
                cmpb #'"                        ; end delimiter?
                bne parsem                      ; brif not - keep looking
                jmp parsea                      ; go handle more stuff
parsel          jsr parse_isletter              ; does an identifier start here?
                bcs parsen                      ; brif not - just pass the character through
parseo          jsr parseout                    ; stash the identifier character
                leay 1,y                        ; move to next character
                ldb ,y                          ; fetch next input
                bne parsep                      ; brif not end of input
                jmp parsez                      ; go handle end of input
parsep          cmpb #'0                        ; is it a digit?
                blo parseo0                     ; brif not - identifier is finished
                cmpb #'9                        ; is it still a digit?
                bls parseo                      ; brif so - digits are allowed after the first character
                jsr parse_isletter              ; is it a letter?
                bcc parseo                      ; brif so - still in the identifier
parseo0         leay -1,y                       ; unconsume the terminator so it gets normal handling
                jmp parsea                      ; (keyword lookup, string detection, etc.)
parsen          jsr parseout                    ; output unknown character (number, unknown token)
                jmp parsea                      ; go handle more
; Output helpers for parse. parseoutw writes A then B to the output; parseout writes B only. Both advance U. If limit
; checking is enabled and the output pointer has reached the end of the buffer, they set C, discard the parse stack
; frame and return directly to the caller of parse. A is not preserved on that error path.
;
; The limit check for the first byte of a word is done inline rather than by calling parseout because an extra return
; address on the stack would shift the frame offsets used below.
parseoutw       exg a,b                         ; do MSB
                tst 2,s                         ; need to test for OM?
                beq parseoutw0                  ; brif not
                cmpu 3,s                        ; did we run into the end of the buffer?
                bhs parseout1                   ; brif so - bail out
parseoutw0      stb ,u+                         ; stash MSB in buffer
                exg a,b                         ; and then LSB (fall through)
parseout        tst 2,s                         ; need to test for OM?
                beq parseout0                   ; brif not
                cmpu 3,s                        ; did we run into the end of the buffer?
                blo parseout0                   ; brif not
parseout1       coma                            ; set C for error
                leas 7,s                        ; drop our return address and the parse stack frame
                rts                             ; return to original caller
parseout0       stb ,u+                         ; stash in buffer
                rts
; Input fetch helpers. Y is the input pointer.
;       parse_nextchar:  advance Y, then return the character at Y in A (Z set if it is NUL)
;       parse_curchar:   return the character at Y in A (Z set if it is NUL)
;       parse_nextcharu: as parse_nextchar but lower case letters are converted to upper case
;       parse_toupper:   convert A to upper case if it is a lower case letter
parse_nextchar  leay 1,y                        ; move to next input character
parse_curchar   lda ,y                          ; fetch input character
                rts
parse_nextcharu bsr parse_nextchar              ; fetch next input character
                beq parse_toupper0              ; brif end of input
parse_toupper   cmpa #'a                        ; is it lower case alpha?
                blo parse_toupper0              ; brif not
                cmpa #'z                        ; is it still lower case alpha?
                bhi parse_toupper0              ; brif not
                suba #0x20                      ; adjust to upper case alpha
parse_toupper0  rts                             ; Z only set here if input was zero entering from parse_nextcharu
; Test whether the character in B is an ASCII letter of either case. Returns C clear if it is and C set if it is not.
; Only the condition codes are modified.
parse_isletter  cmpb #'A                        ; below the upper case letters?
                blo parse_isletter1             ; brif so - C is already set by the compare
                cmpb #'Z                        ; within the upper case letters?
                bls parse_isletter0             ; brif so
                cmpb #'a                        ; between the two cases?
                blo parse_isletter1             ; brif so - C is already set by the compare
                cmpb #'z                        ; above the lower case letters?
                bhi parse_isletter2             ; brif so
parse_isletter0 andcc #0xfe                     ; clear C - it is a letter
parse_isletter1 rts
parse_isletter2 orcc #1                         ; set C - not a letter
                rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This routine parses tokens using the table at parse_wt. The table is structured as follows:
;
; * two bytes which contain the length of the table less the two bytes for this length value
; * a sequence of entries consisting of a single byte matching character and a token code followed
;   by an optional sub table, structured exactly the same way. The token code is 2 bytes.
;
; The optional subtable will be present if the token code is token_eot
;
; If the character match is negative, it means a lookahead failed. The negative value is the number
; of characters to unget and the token code is the token value to return. No other entries after this
; in a table will be considered since this negative match is a global match.
;
; When a token_eot match is found, if there are no further characters in the input, the match is
; determined to be invalid and processing continues with the next entry.
;
; Enter with the current input character in A and Y pointing at it. Characters after the first are fetched with
; parse_nextcharu, so only those are folded to upper case. On a match, C is clear, D holds the token code and Y
; points at the last character of the keyword. If nothing matches, C is set; Y is not moved back from wherever
; the scan stopped. X and B are not preserved.
;
; NOTE(review): the code computes the end of a table as (table address + length word) and skips a sub table with the
; same arithmetic, which is exact only if the stored length counts its own two bytes. That disagrees with the
; description above; confirm against the definition of parse_wt.
parse_wordtab   ldx #parse_wt                   ; point to main lookup table
                skip2                           ; move on into the main routine
parse_wordtab0  leas 3,s                        ; clean up stack for sub table handling
                pshs a,x                        ; save input character and start of table
                ldd ,x++                        ; get length of this table
                addd 1,s                        ; calculate the address of the end of the table
                std 1,s                         ; save end address for comparison later
                lda ,s                          ; get back input character
parse_wordtab1  leax 3,x                        ; move past this entry - this order to avoid Z effects from leax
                cmpa -3,x                       ; does this entry match?
                bne parse_wordtab4              ; brif not
                ldd -2,x                        ; get the matched token code
                cmpd #tokenf_eot                ; is it indicating a sub table?
                bne parse_wordtab6              ; brif not
                jsr parse_nextcharu             ; fetch next input character (for sub table match)
                bne parse_wordtab0              ; brif we are going to check the sub table
parse_wordtab2  ldd ,x                          ; fetch length of sub table
                leax d,x                        ; move past sub table
parse_wordtab3  lda ,s                          ; get back input character
                cmpx 1,s                        ; are we at the end of the table?
                blo parse_wordtab1              ; brif not - check another entry
                comb                            ; indicate no match
                puls a,x,pc                     ; clean up stack and return
parse_wordtab4  lda -3,x                        ; get the match character
                bmi parse_wordtab5              ; brif negative - lookahead fail
                ldd -2,x                        ; get the token match
                cmpd #tokenf_eot                ; is there a sub table to skip?
                beq parse_wordtab2              ; brif so - skip sub table
                bra parse_wordtab3              ; otherwise just move to the next entry
parse_wordtab5  leay a,y                        ; move back the specified number of characters
                ldd -2,x                        ; get the matched token
parse_wordtab6  sta ,s                          ; save MSB of match
                clra                            ; clear carry to indicate match
                puls a,x,pc                     ; clean up stack, restore return value and return
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Convert a token number back to its keyword. This will use the same table used by parse_wordtab. Enter with a character
; output routine pointer in U which takes the character in A. The routine can assume that Y is preserved. Will return
; with C set if the token does not exist in the word table and clear otherwise.
;
; The token to look up is taken from A. The keyword is assembled backwards, ending at strbuff+19, using U as the build
; pointer and is then passed to the output routine one character at a time. The output routine is called while U is
; walking the assembled string, so it presumably has to preserve U - TODO confirm with the callers.
;
; NOTE(review): the table walker below still steps through entries two bytes at a time and compares the match
; character against the requested token, which does not agree with the three byte entries (character plus two byte
; token) that parse_wordtab walks. It needs to be reworked along with a decision on how two byte tokens are passed
; in before this routine can be relied upon.
parse_wtdc      pshs u                          ; save routine pointer
                ldu #strbuff+20                 ; point to temporary string buffer
                clr ,-u                         ; put a NUL at the end of the string
                ldx #parse_wt                   ; point to keyword parse table
                bsr parse_wtdc2                 ; call the tree walker function
                bcc parse_wtdc1                 ; brif we do have a match
                puls u,pc                       ; clean stack and return
parse_wtdc0     jsr [,s]                        ; output the character
parse_wtdc1     lda ,u+                         ; get output byte
                bne parse_wtdc0                 ; brif we're not at the end yet
                clra                            ; make sure C is clear
                puls u,pc                       ; clean stack and return
parse_wtdc2     pshs a,x                        ; save the token match value and the table pointer
                ldd ,x++                        ; get table length
                addd 1,s                        ; calculate end address
                std 1,s                         ; save it
parse_wtdc3     ldd ,x++                        ; get this table entry
                bmi parse_wtdc6                 ; brif it's a backtracking entry - skip it
                cmpa ,s                         ; does the token match here?
                bne parse_wtdc5                 ; brif not
parse_wtdc4     sta ,-u                         ; prepend the character to the string being built (Y is left alone)
                puls a,x,pc                     ; return up the call stack - C is clear from CMPA above
parse_wtdc5     cmpb #token_eot                 ; does this entry have a sub table?
                bne parse_wtdc6                 ; brif not
                pshs a                          ; save the matched character
                lda 1,s                         ; get back the token we need
                bsr parse_wtdc2                 ; go handle the sub table
                puls a                          ; get back the matched character
                bcc parse_wtdc4                 ; brif it did match - record it and return
parse_wtdc6     cmpx 1,s                        ; are we at the end of this table?
                bne parse_wtdc3                 ; brif not - handle another table entry
                coma                            ; make sure C is set for no match
                puls a,x,pc                     ; clean up stack and return
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Definition of tokens used in the interpreter.
;
; Each token is defined as follows:
;       parse_tokendefT <sym>[,<handler>]
; where T is one of:
;       p: particle - utility tokens and definitions, starting at 0x00
;       c: command - a command keyword, starting at 0x80
;       f: function - a function keyword, start at 0x80 with a 0xFF prefix
;       v: token width specific number/code, but otherwise a particle; in this case, the code replaces <handler>
;
; <sym> is the base symbol name (such as "then" or "eot")
; <handler> is the address of the execution handler routine of the natural token type (command or function)
;
; <handler> is optional for particles. If it is omitted for command or function tokens, it defaults to SNERROR.
;
; Every definition creates both token_<sym> (the single byte code) and tokenf_<sym> (the full width code, which only
; differs for functions where it carries the 0xFF prefix). The command and function macros also append one FDB line
; per token to the string variables __cmdexect and __fnexect; token_cmdexec and token_fnexec emit those accumulated
; lines as the handler tables and define token__maxcmd and token__maxfn.
                *pragmapush list
                *pragma nolist
__toknump       set 0
__toknumc       set 0x80
__toknumf       set 0x80
parse_tokendefp macro noexpand
token_\1        equ __toknump
tokenf_\1       equ __toknump
__toknump       set __toknump+1
                endm
parse_tokendefv macro noexpand
token_\1        equ \2
tokenf_\1       equ \2
                endm
                setstr __cmdexect=""
                setstr __fnexect=""
parse_tokendefc macro noexpand
token_\1        equ __toknumc
tokenf_\1       equ __toknumc
__toknumc       set __toknumc+1
                ifstr ne,"{2}",""
                setstr __cmdexect="%(__cmdexect)\tfdb {2}\n"
                else
                setstr __cmdexect="%(__cmdexect)\tfdb SNERROR\n"
                endc
                endm
parse_tokendeff macro noexpand
token_\1        equ __toknumf
tokenf_\1       equ 0xff00|__toknumf
__toknumf       set __toknumf+1
                ifstr ne,"{2}",""
                setstr __fnexect="%(__fnexect)\tfdb {2}\n"
                else
                setstr __fnexect="%(__fnexect)\tfdb SNERROR\n"
                endc
                endm
token_cmdexec   macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__cmdexect)"
token__maxcmd   equ __toknumc-1
                *pragmapop nolist
                endm
token_fnexec    macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__fnexect)"
token__maxfn    equ __toknumf-1
                *pragmapop nolist
                endm
                *pragmapop list
; special tokens
                parse_tokendefp error           ; Used to mark errors; should always be first so it's token #0
                parse_tokendefp eot             ; End of input marker or special handling in word tables
; command (and simple non-command keywords)
                parse_tokendefc remabbr         ; abbreviated REM (')
                parse_tokendefc rem             ; REM
                parse_tokendefc return          ; RETURN
                parse_tokendefc run             ; RUN
                parse_tokendefc data            ; DATA
                parse_tokendefc end             ; END
                parse_tokendefc stop            ; STOP
                parse_tokendefc let             ; LET
                parse_tokendefc list            ; LIST
                parse_tokendefc new             ; NEW
                parse_tokendefc print           ; PRINT
                parse_tokendefc pop             ; POP
                parse_tokendefc goto            ; GOTO
                parse_tokendefc gosub           ; GOSUB
                parse_tokendefc go              ; GO
                parse_tokendefc times           ; times (multiplication) operator (*)
                parse_tokendefc plus            ; addition operator
                parse_tokendefc divide          ; division operator (/)
                parse_tokendefc minus           ; subtraction operator
                parse_tokendefc exp             ; exponentiation operator (^)
                parse_tokendefc lt              ; less than operator
                parse_tokendefc le              ; less than or equal operator
                parse_tokendefc gt              ; greater than operator
                parse_tokendefc ge              ; greater than or equal operator
                parse_tokendefc eq              ; equality operator
                parse_tokendefc ne              ; inequality operator
                parse_tokendefc not             ; boolean NOT operator
                parse_tokendefc and             ; boolean AND operator
                parse_tokendefc or              ; boolean OR operator
                parse_tokendefc else            ; ELSE
                parse_tokendefc then            ; THEN
                parse_tokendefc to              ; TO
                parse_tokendefc sub             ; SUB
                parse_tokendefc as              ; AS
; secondary tokens (functions)
                parse_tokendeff asc             ; ASC()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Execution handling tables
exectab_cmd     token_cmdexec
exectab_fn      token_fnexec
                *pragmapop list