view src/parse.s @ 139:5d4801c0566d

Get things building again with the updated tokenization scheme
author William Astle <lost@l-w.ca>
date Mon, 15 Jul 2024 23:26:15 -0600
parents 917b4893bb3d
children 86f6f3a71e60
line wrap: on
line source

                *pragmapush list
                *pragma list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This is the overall parsing package. It is responsible for converting the input source code into the internal byte
; code.
;
; This version only converts keywords to token codes. Additional conversions will be done in future versions.
;
; Enter with X pointing to the text to parse. The encoded result will be placed at freestart. On return, X will point to
; the encoded result and D will contain the length in bytes of the result, and C will be clear.
;
; In the event that there is insufficient memory between freestart and the bottom of the stack, C will be set. This
; routine does not immediately throw an "out of memory" error to allow the caller to clear up some memory and try
; again.
;
; Enter at parseto with U set to the encoding destination and Y set to one byte past the end of the destination buffer
; to specify the destination. Defaults to encoding to the buffer between freestart and the bottom of the stack (with
; headroom accounted for).
;
; The stuff below that has hard coded colon checks will eventually be replaced by more complete parsing.
; Stack frame while parsing (established by the PSHS at parseto):
;   0,s   memory limit detection flag (non-zero means check for buffer overflow)
;   1,s   one byte past the end of the destination buffer (entry value of Y)
;   3,s   start of the destination buffer (entry value of U)
; During parsing Y is the input pointer and U is the output pointer. parseout/parseoutw address this frame
; relative to S, so they must only be called directly from this routine's body.
parse           ldu freestart                   ; default to the start of free memory for encoding
                leay -stackheadroom,s           ; set the top of free memory
parseto         lda #1                          ; flag to enable memory limit detection
                pshs a,u,y                      ; save start and end addresses and OM error detection flag
                leay ,x                         ; keep the input pointer in Y; parse_wordtab overwrites X
parsea          jsr parse_curchar               ; fetch an input character
                bne parseb                      ; brif not end of input
parsez          ldx 3,s                         ; return the start of the encoded result in X (done before SUBD so flags are unchanged)
                tfr u,d                         ; get current output pointer
                subd 3,s                        ; now D is the length
                leas 5,s                        ; clean up the stack
                rts                             ; return - C will be clear from subd above
parseb          jsr parse_wordtab               ; look up a keyword and see if we have a match
                bcs parsec                      ; brif no match - handle unknown stuff
                tsta                            ; do we have a two byte token?
                bne parseq                      ; brif so - stash it; its low byte must not be compared with command tokens
                cmpb #token_else                ; ELSE?
                beq parsed                      ; brif so - gets a hidden statement separator
                cmpb #token_remabbr             ; REM abbreviation?
                bne parsee                      ; brif not
parsed          lda #':                         ; add a statement separator before it
                bsr parseoutw                   ; output separator followed by the token code
                bra parsef
parsee          bsr parseout                    ; output the token code
parsef          cmpb #token_remabbr             ; REM abbreviation?
                beq parseg                      ; brif so
                cmpb #token_rem                 ; Actual REM?
                bne parseh                      ; brif not
parseg          ldb ,y+                         ; get current input character
                beq parsez                      ; brif end of input
                bsr parseout                    ; add unmodified characters to output
                bra parseg                      ; keep going until end of input
parseh          cmpb #token_data                ; DATA command?
                bne parsea                      ; brif not - continue normal handling
                clra                            ; flag for not skipping quoted string
parsei          ldb ,y+                         ; get input character
                beq parsez                      ; brif end of input
                cmpb #'"                        ; string delimiter?
                bne parsej                      ; brif not
                coma                            ; flip the quoted statement handler
parsej          cmpb #':                        ; end of statement?
                bne parsek                      ; brif not
                tsta                            ; are we skipping them?
                bne parsek                      ; brif so
                leay -1,y                       ; unconsume it
                bra parsea                      ; we're done with DATA
parsek          bsr parseout                    ; put the data value into the output
                bra parsei                      ; go handle another character
; NOTE(review): the no-match exit of parse_wordtab returns the input character in A and leaves B complemented
; by COMB, yet the code from here on tests B and fetches with "ldb ,y+" - confirm which register and input
; pointer convention is intended under the updated tokenization scheme.
parsec          cmpb #'"                        ; did we encounter a quoted string?
                bne parsel                      ; brif not
                bsr parseout                    ; output delimiter
parsem          ldb ,y+                         ; get string character
                beq parsez                      ; brif end of input
                bsr parseout                    ; output it
                cmpb #'"                        ; end delimiter?
                bne parsem                      ; brif not - keep looking
                bra parsea                      ; go handle more stuff
parsep          cmpb #'0                        ; is it a digit?
                blo parsen                      ; brif not
                cmpb #'9                        ; is it still a digit?
                bls parseo                      ; brif so
parsel          cmpb #'A                        ; is it a letter?
                blo parsen                      ; brif not
                cmpb #'Z                        ; is it still a letter (UC)?
                bls parseo                      ; brif so
                cmpb #'a                        ; is it a lower case letter?
                blo parsen                      ; brif not
                cmpb #'z                        ; is it still a lower case letter?
                bhi parsen                      ; brif not
parseo          bsr parseout                    ; stash the character
                ldb ,y+                         ; fetch next input
                beq parsez                      ; brif end of input
                bra parsep                      ; go see if we're still in an identifier
parsen          bsr parseout                    ; output unknown character (number, unknown token)
                jmp parsea                      ; go handle more
parseq          bsr parseoutw                   ; output a two byte token (prefix in A, code in B)
                jmp parsea                      ; skip the REM/DATA checks - those only apply to one byte tokens
; Output helpers for parse. parseoutw writes A then B to the output buffer at U; parseout writes B only.
;
; Both address parse's saved frame relative to S and therefore require exactly one return address on the
; stack when entered:
;   2,s   memory limit detection flag
;   3,s   one byte past the end of the destination buffer
;   5,s   start of the destination buffer
; If the flag is non-zero and U has reached the end of the buffer, the return address and the five byte
; frame are discarded and control returns to parse's caller with C set.
;
; The MSB is checked and stored inline rather than through a nested BSR to parseout: a nested call would
; place a second return address on the stack and shift every offset above by two.
parseoutw       exg a,b                         ; do MSB
                tst 2,s                         ; need to test for OM?
                beq parseoutw_st                ; brif not
                cmpu 3,s                        ; did we run into the end of the buffer?
                bhs parseout_om                 ; brif so - bail out to original caller
parseoutw_st    stb ,u+                         ; stash MSB in buffer
                exg a,b                         ; and then LSB (fall through)
parseout        tst 2,s                         ; need to test for OM?
                beq parseout0                   ; brif not
                cmpu 3,s                        ; did we run into the end of the buffer?
                blo parseout0                   ; brif not
parseout_om     coma                            ; set C for error
                leas 7,s                        ; clean up stack
                rts                             ; return to original caller
parseout0       stb ,u+                         ; stash in buffer
                rts
; Input cursor helpers. Y is the input pointer and the fetched character is returned in A.
;
; parse_nextchar:  advance Y by one unless it already points at a NUL, then fall into parse_curchar
; parse_curchar:   load the character at Y into A; the load sets Z when that character is NUL
; parse_nextcharu: parse_nextchar, then convert 'a'..'z' to upper case
; parse_toupper:   convert the character in A to upper case if it is in 'a'..'z'; other values are unchanged
parse_nextchar  lda ,y                          ; at end of input already?
                beq parse_curchar               ; brif so - do not step past the NUL
                leay 1,y                        ; move to next input character
parse_curchar   lda ,y                          ; fetch input character (sets Z at end of input)
                rts
parse_nextcharu bsr parse_nextchar              ; fetch next input character
                beq parse_toupper0              ; brif end of input - return with Z still set
parse_toupper   cmpa #'a                        ; is it lower case alpha?
                blo parse_toupper0              ; brif not
                cmpa #'z                        ; is it still lower case alpha?
                bhi parse_toupper0              ; brif not
                suba #0x20                      ; adjust to upper case alpha
parse_toupper0  rts                             ; Z only set here if input was zero entering from parse_nextcharu
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This routine parses tokens using the table at parse_wt. The table is structured as follows:
;
; * two bytes which contain the length of the table less the two bytes for this length value
; * a sequence of entries consisting of a single byte matching character and a token code followed
;   by an optional sub table, structured exactly the same way. The token code is 2 bytes.
;
; The optional subtable will be present if the token code is token_eot
;
; If the character match is negative, it means a lookahead failed. The negative value is the number
; of characters to unget and the token code is the token value to return. No other entries after this
; in a table will be considered since this negative match is a global match.
;
; When a token_eot match is found, if there are no further characters in the input, the match is
; determined to be invalid and processing continues with the next entry.
; Entry:  A = character to match, Y = input pointer (advanced by parse_nextcharu as sub tables are entered)
; Exit:   match:    C clear, A = first byte of the token code, B = second byte
;         no match: C set, A = the character that was being matched, B is not meaningful (complemented by COMB)
;         X is overwritten in both cases (the value pulled is the saved table end address)
;
; Working frame: 0,s = character being matched, 1,s = end address of the table being scanned. Entering a
; sub table replaces this frame in place (parse_wordtab0), so the enclosing table's end address is discarded.
;
; NOTE(review): the end address is computed as (address of the length word + length) and sub tables are
; skipped by adding the length to the address of the length word. Both are exact only if the stored length
; counts its own two bytes, which differs from the table description above - confirm against how parse_wt
; is generated.
parse_wordtab   ldx #parse_wt                   ; point to main lookup table
                skip2                           ; move on into the main routine (skip2 is defined elsewhere; presumably it skips the two byte LEAS below)
parse_wordtab0  leas 3,s                        ; clean up stack for sub table handling
                pshs a,x                        ; save input character and start of table
                ldd ,x++                        ; get length of this table
                addd 1,s                        ; calculate the address of the end of the table
                std 1,s                         ; save end address for comparison later
                lda ,s                          ; get back input character
parse_wordtab1  leax 3,x                        ; move past this entry - this order to avoid Z effects from leax
                cmpa -3,x                       ; does this entry match?
                bne parse_wordtab4              ; brif not
                ldd -2,x                        ; get the matched token code
                cmpd #tokenf_eot                ; is it indicating a sub table?
                bne parse_wordtab6              ; brif not
                jsr parse_nextcharu             ; fetch next input character (for sub table match)
                bne parse_wordtab0              ; brif we are going to check the sub table
parse_wordtab2  ldd ,x                          ; fetch length of sub table
                leax d,x                        ; move past sub table
parse_wordtab3  lda ,s                          ; get back input character
                cmpx 1,s                        ; are we at the end of the table?
                blo parse_wordtab1              ; brif not - check another entry
                comb                            ; indicate no match
                puls a,x,pc                     ; clean up stack and return
parse_wordtab4  lda -3,x                        ; get the match character
                bmi parse_wordtab5              ; brif negative - lookahead fail
                ldd -2,x                        ; get the token match
                cmpd #tokenf_eot                ; is there a sub table to skip?
                beq parse_wordtab2              ; brif so - skip sub table
                bra parse_wordtab3              ; otherwise just move to the next entry
parse_wordtab5  leay a,y                        ; move back the specified number of characters (A is negative here)
                ldd -2,x                        ; get the matched token
parse_wordtab6  sta ,s                          ; save MSB of match where PULS will reload it into A
                clra                            ; clear carry to indicate match
                puls a,x,pc                     ; clean up stack, restore return value and return
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Convert a token number back to its keyword. This will use the same table used by parse_wordtab. Enter with a character
; output routine pointer in U which takes the character in A. The routine can assume that Y is preserved. Will return
; with C set if the token does not exist in the word table and clear otherwise.
; parse_wtdc2 is a recursive walker: it saves the wanted token (A) and table pointer (X), scans the table at X
; and calls itself for each entry flagged as having a sub table. It returns C clear on a match and C set
; otherwise. On return from a nested call, X holds that sub table's end address (pulled from the frame),
; which leaves the caller positioned after the sub table.
;
; NOTE(review): this routine steps through entries two bytes at a time (ldd ,x++) and compares the first
; byte against the wanted token, whereas parse_wordtab steps three bytes per entry (character plus a two
; byte token code) - confirm it has been brought in line with the current table layout.
; NOTE(review): matched characters are stored with "sta ,-y" while the string is later read back through U
; starting at strbuff+19, and the header states that Y is preserved - confirm which register is intended.
parse_wtdc      pshs u                          ; save routine pointer
                ldu #strbuff+20                 ; point to temporary string buffer
                clr ,-u                         ; put a NUL at the end of the string
                ldx #parse_wt                   ; point to keyword parse table
                bsr parse_wtdc2                 ; call the tree walker function
                bcc parse_wtdc1                 ; brif we do have a match
                puls u,pc                       ; clean stack and return (C still set from the walker)
parse_wtdc0     jsr [,s]                        ; output the character via the saved routine pointer
parse_wtdc1     lda ,u+                         ; get output byte
                bne parse_wtdc0                 ; brif we're not at the end yet
                clra                            ; make sure C is clear
                puls u,pc                       ; clean stack and return
parse_wtdc2     pshs a,x                        ; save the token match value and the table pointer
                ldd ,x++                        ; get table length
                addd 1,s                        ; calculate end address
                std 1,s                         ; save it
parse_wtdc3     ldd ,x++                        ; get this table entry
                bmi parse_wtdc6                 ; brif it's a backtracking entry - skip it
                cmpa ,s                         ; does the token match here?
                bne parse_wtdc5                 ; brif not
parse_wtdc4     sta ,-y                         ; add the character to the output buffer
                puls a,x,pc                     ; return up the call stack - C is clear from CMPA above
parse_wtdc5     cmpb #token_eot                 ; does this entry have a sub table?
                bne parse_wtdc6                 ; brif not
                pshs a                          ; save the matched character
                lda 1,s                         ; get back the token we need
                bsr parse_wtdc2                 ; go handle the sub table
                puls a                          ; get back the matched character
                bcc parse_wtdc4                 ; brif it did match - record it and return
parse_wtdc6     cmpx 1,s                        ; are we at the end of this table?
                bne parse_wtdc3                 ; brif not - handle another table entry
                coma                            ; make sure C is set for no match
                puls a,x,pc                     ; clean up stack and return
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Definition of tokens used in the interpreter.
;
; Each token is defined as follows:
;               parse_tokendefT <sym>[,<handler>]
; where T is one of:
; p: particle - utility tokens and definitions, starting at 0x00
; c: command - a command keyword, starting at 0x80
; f: function - a function keyword, start at 0x80 with a 0xFF prefix
; v: token width specific number/code, but otherwise a particle; in this case, the code replaces <handler>
;
; <sym> is the base symbol name (such as "then" or "eot")
; <handler> is the address of the execution handler routine of the natural token type (command or function)
;
; <handler> is optional for particles. If it is omitted for command or function tokens, it defaults to SNERROR.
                *pragmapush list
                *pragma nolist
; Running token counters: particles count up from 0, commands and functions each count up from 0x80.
__toknump       set 0
__toknumc       set 0x80
__toknumf       set 0x80
; parse_tokendefp <sym>: define token_<sym> and tokenf_<sym> as the next particle number.
parse_tokendefp macro noexpand
token_\1        equ __toknump
tokenf_\1       equ __toknump
__toknump       set __toknump+1
                endm
; parse_tokendefv <sym>,<value>: define token_<sym> and tokenf_<sym> as an explicit value.
parse_tokendefv macro noexpand
token_\1        equ \2
tokenf_\1       equ \2
                endm
; Handler table text accumulated by the command and function definitions below. The names initialised
; here must be the ones the macros append to and expand: __cmdexect and __fnexect.
                setstr __cmdexect=""
                setstr __fnexect=""
; parse_tokendefc <sym>[,<handler>]: define the next command token and append its handler (SNERROR if
; omitted) to __cmdexect.
parse_tokendefc macro noexpand
token_\1        equ __toknumc
tokenf_\1       equ __toknumc
__toknumc       set __toknumc+1
                ifstr ne,"{2}",""
                setstr __cmdexect="%(__cmdexect)\tfdb {2}\n"
                else
                setstr __cmdexect="%(__cmdexect)\tfdb SNERROR\n"
                endc
                endm
; parse_tokendeff <sym>[,<handler>]: define the next function token (tokenf_<sym> carries the 0xFF prefix)
; and append its handler (SNERROR if omitted) to __fnexect.
parse_tokendeff macro noexpand
token_\1        equ __toknumf
tokenf_\1       equ 0xff00|__toknumf
__toknumf       set __toknumf+1
                ifstr ne,"{2}",""
                setstr __fnexect="%(__fnexect)\tfdb {2}\n"
                else
                setstr __fnexect="%(__fnexect)\tfdb SNERROR\n"
                endc
                endm
; token_cmdexec: emit the accumulated command handler table and define token__maxcmd as the highest
; command token number.
token_cmdexec   macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__cmdexect)"
token__maxcmd   equ __toknumc-1
                *pragmapop nolist
                endm
; token_fnexec: emit the accumulated function handler table and define token__maxfn as the highest
; function token number.
token_fnexec    macro
                *pragmapush nolist
                *pragma nolist
                includestr "%(__fnexect)"
token__maxfn    equ __toknumf-1
                *pragmapop nolist
                endm
                *pragmapop list
                ; special tokens
                ; NOTE: the order of the definitions below fixes both the token numbers and the order of the
                ; generated handler tables; each definition takes the next number in its class.
                parse_tokendefp error           ; Used to mark errors; should always be first so it's token #0
                parse_tokendefp eot             ; End of input marker or special handling in word tables
                ; command (and simple non-command keywords)
                parse_tokendefc remabbr         ; abbreviated REM (')
                parse_tokendefc rem             ; REM
                parse_tokendefc return          ; RETURN
                parse_tokendefc run             ; RUN
                parse_tokendefc data            ; DATA
                parse_tokendefc end             ; END
                parse_tokendefc stop            ; STOP
                parse_tokendefc let             ; LET
                parse_tokendefc list            ; LIST
                parse_tokendefc new             ; NEW
                parse_tokendefc print           ; PRINT
                parse_tokendefc pop             ; POP
                parse_tokendefc goto            ; GOTO
                parse_tokendefc gosub           ; GOSUB
                parse_tokendefc go              ; GO
                parse_tokendefc times           ; times (multiplication) operator (*)
                parse_tokendefc plus            ; addition operator
                parse_tokendefc divide          ; division operator (/)
                parse_tokendefc minus           ; subtraction operator
                parse_tokendefc exp             ; exponentiation operator (^)
                parse_tokendefc lt              ; less than operator
                parse_tokendefc le              ; less than or equal operator
                parse_tokendefc gt              ; greater than operator
                parse_tokendefc ge              ; greater than or equal operator
                parse_tokendefc eq              ; equality operator
                parse_tokendefc ne              ; inequality operator
                parse_tokendefc not             ; boolean NOT operator
                parse_tokendefc and             ; boolean AND operator
                parse_tokendefc or              ; boolean OR operator
                parse_tokendefc else            ; ELSE
                parse_tokendefc then            ; THEN
                parse_tokendefc to              ; TO
                parse_tokendefc sub             ; SUB
                parse_tokendefc as              ; AS
                ; secondary tokens (functions)
                parse_tokendeff asc             ; ASC()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Execution handling tables
; exectab_cmd and exectab_fn label the handler address lists accumulated by the parse_tokendefc and
; parse_tokendeff invocations, in definition order. Expanding these macros also defines token__maxcmd and
; token__maxfn.
exectab_cmd     token_cmdexec
exectab_fn      token_fnexec
                
                *pragmapop list