diff src/parse.s @ 132:917b4893bb3d

Checkpoint before redoing a bunch of code for clarity
author William Astle <lost@l-w.ca>
date Mon, 24 Jun 2024 23:44:39 -0600
parents 95f174bf459b
children 5d4801c0566d
line wrap: on
line diff
--- a/src/parse.s	Sat May 18 00:41:46 2024 -0600
+++ b/src/parse.s	Mon Jun 24 23:44:39 2024 -0600
@@ -3,7 +3,13 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; This is the overall parsing package. This is responsible for converting program text into the internal byte code and
 ; reporting any syntax errors and anything else reasonably detectable at parse time without having overly complicated
-; code analysis. In almost all cases, the returned error will be a syntax error.
+; code analysis. In almost all cases, the returned error will be a syntax error. The internal byte code shares the same
+; token number allocations as the parser. Some allocated tokens cannot be identified by the lexer (parse_nexttok) but
+; are used at runtime and when "decompiling" to text.
+;
+; In the event of a parse error, everything up to the next end of statement is retained as is using a special token
+; that preserves the unparsable text and parsing resumes. Only the first error is referenced by the return error
+; pointer.
 ;
 ; This is a recursive descent parser.
 ;
@@ -12,14 +18,20 @@
 ; B             Nonzero to prevent generating any output (error check/length calculation only)
 ;
 ; Exit:
-; U             Points to the encoded line
+; X             Points to the encoded line
 ; D             Length of the encoded line
 ; CC.C          clear
 
 ; Error Exit:
-; B             Error code
-; U             Offset to error input
+; X             Points to the encoded line
+; D             Length of the encoded line
+; Y             Pointer to the first error location in the input
+; U             Error code
 ; CC.C          set
+;
+; This is the error handler. It is responsible for resetting the stack to bail out to the top level
+; parsing loop. It must also store the input pointer if this is the first error. Finally, it has to
+; output all the text up to either the end of the line *or* the next valid statement separator.
 parse_errorsn   ldb #err_sn
 parse_error     lds parse_stackptr              ; restore the original stack pointer so we can call from down stack
                 puls u                          ; get back original free pointer
@@ -82,6 +94,11 @@
                 leay 1,y                        ; move to next input character
 parse_curchar   lda ,y                          ; fetch input character
                 rts
+parse_nexttokc  bsr parse_nexttok               ; fetch next token (assumed returned in B), fall through to test it
+parse_iseos     cmpb #token_eot                 ; is the token in B end of text?
+                beq parse_iseos0                ; brif so - return with Z set
+                cmpb #token_stmtsep             ; is it a statement separator? (Z set if so, clear if not)
+parse_iseos0    rts                             ; Z set means B is end of text or a statement separator
 parse_nexttok   bsr parse_curchar               ; fetch current input
                 beq parse_nexttok1              ; brif end of input
 parse_nexttok0  cmpa #0x20                      ; space?
@@ -137,10 +154,6 @@
                 bhi parse_toupper0              ; brif not
                 suba #0x20                      ; adjust to upper case alpha
 parse_toupper0  rts                             ; Z only set here if input was zero entering from parse_nextcharu
-parse_iseos     cmpa #token_stmtsep             ; end of statement?
-                beq parse_iseos0                ; brif so
-                cmpa #token_eot                 ; end of text?
-parse_iseos0    rts
 parse_number    jmp parse_tokerr
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Parse a statement that consists of just the command token
@@ -154,49 +167,6 @@
                 stb parse_curtok
                 rts                             ; return, pass back the C result from parse_write
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Parse an optional line number range which may be [lineno][-[lineno]]
-parse_range     jsr parse_write                 ; output the token
-                jsr parse_nexttok               ; fetch input token
-                ldx zero                        ; set default start and end line numbers - whole program
-                leau -1,x
-                pshs x,u
-                bsr parse_iseos                 ; are there arguments?
-                beq parse_range3                ; brif so
-                cmpa #token_int32               ; is it an integer (line number)?
-                bne parse_range0                ; brif not
-                ldd val0+val.int                ; is the upper 16 bits set?
-                beq parse_rangee                ; brif yes - we have an error
-                ldd val0+val.int+2              ; set the start line number
-                std ,s
-                jsr parse_nexttok               ; see what's after the line number
-parse_range0    cmpa #token_minus               ; do we have a range?
-                beq parse_range1                ; brif so
-                bsr parse_iseos                 ; end of statement?
-                bne parse_rangee                ; brif not - error
-                ldd ,s                          ; set end line to start line
-                std 2,s
-                bra parse_range3                ; go output things
-parse_range1    jsr parse_nexttok               ; skip the -
-                bsr parse_iseos                 ; end of statement?
-                beq parse_range3                ; brif so
-                cmpa #token_int32               ; is it an integer?
-                bne parse_rangee                ; brif not
-                ldx val0+val.int                ; upper 16 bits set?
-                bne parse_rangee                ; brif so - invalid number
-                ldx val0+val.int+2              ; get end line number
-                stx 2,s                         ; save end line number
-                cmpx ,s                         ; is end line lower than start line?
-                blo parse_rangee                ; brif so - error
-parse_range3    puls a                          ; write out the range
-                jsr parse_write
-                puls a
-                jsr parse_write
-                puls a
-                jsr parse_write
-                puls a
-                jmp parse_write
-parse_rangee    jmp parse_errorsn               ; go raise the parse error
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; This routine parses tokens using the table at parse_wordtab. The table is structured as follows:
 ;
 ; * two bytes which contain the length of the table less the two bytes for this length value
@@ -277,6 +247,63 @@
                 coma                            ; make sure C is set for no match
                 puls a,x,pc                     ; clean up stack and return
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Validate a line number. Must enter with the token type in B. Will return the line number in X. It will return a
+; syntax error if the line number is invalid or out of range. It will also consume a valid line number token.
+;
+; Entry:
+; B             Current token type; for token_int32 the value is read from val0+val.int
+;
+; Exit:
+; X             The line number (low 16 bits of the integer)
+; CC.C          clear
+; The token following the line number has been fetched with parse_nexttok (callers here expect it in B).
+;
+; Error Exit:
+; B             err_sn
+; CC.C          set
+; Nothing is consumed on error. A is modified by the error path.
+parse_linenum   cmpb #token_int32               ; is it an integer?
+                beq parse_linenum1              ; brif so
+parse_linenum0  ldb #err_sn                     ; flag syntax error
+                coma                            ; flag error (COM always sets C)
+                rts
+parse_linenum1  ldx val0+val.int                ; get high word of integer
+                bne parse_linenum0              ; brif not a valid line number (does not fit in 16 bits)
+                ldx val0+val.int+2              ; get actual line number
+                pshs x                          ; save it
+                jsr parse_nexttok               ; consume line number
+                andcc #0xfe                     ; force C clear: callers test C and parse_nexttok may leave it set
+                puls x,pc                       ; get back line number and return it
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Parse a line number range which is one of the following forms:
+; <linenum1>
+; <linenum1>-
+; <linenum1>-<linenum2>
+; -<linenum2>
+; The result will store two line numbers. If no - token appears, then both line numbers will be the same. Otherwise,
+; if <linenum1> is omitted, it will be assumed to be 0. If <linenum2> is omitted, it will be assumed to be 65535. Those
+; are the minimum and maximum line numbers.
+;
+; Parsing works by first looking for an integer token that is in range. If it finds one, it looks for an optional -
+; followed by an optional integer token that is in range. If the first token is not an integer, it must be a - which may
+; be optionally followed by another integer in range.
+;
+; It is technically valid to have a single - with no line numbers.
+;
+; No check is made that <linenum2> is greater than or equal to <linenum1>.
+;
+; Enter with the current token in B.
+;
+; Exit:
+; X             Start line number
+; U             End line number
+; CC.C          clear
+;
+; Error Exit:
+; B             Error code (err_sn)
+; CC.C          set
+;
+; A is modified on both exits. The range is built in a four byte stack frame: start at ,s and end at 2,s.
+parse_linerange ldx zero                        ; default start line number
+                leau -1,x                       ; default end line number
+                pshs x,u                        ; save the return range
+                cmpb #token_minus               ; range with no start?
+                beq parse_linerang1             ; brif so
+                bsr parse_linenum               ; verify line number, return in X
+                bcs parse_linerang4             ; bail out on error
+                stx ,s                          ; save new start line number
+                jsr parse_iseos                 ; parse_linenum already fetched the following token; just test it
+                bne parse_linerang0             ; brif not end of statement
+                ldx ,s                          ; get start line to use as end line
+                bra parse_linerang2             ; go set range end and return
+parse_linerang0 cmpb #token_minus               ; do we have a range character?
+                bne parse_linerang3             ; brif not - we have an error
+parse_linerang1 jsr parse_nexttokc              ; parse what comes after the range mark
+                beq parse_linerang5             ; brif end of statement - keep the default range end
+                bsr parse_linenum               ; make sure it's a valid line number
+                bcs parse_linerang4             ; bail out on error
+parse_linerang2 stx 2,s                         ; set range end
+parse_linerang5 clra                            ; make sure C is clear
+                puls x,u,pc                     ; fetch return values and return
+parse_linerang3 ldb #err_sn                     ; flag a syntax error
+                coma                            ; make sure C is set
+parse_linerang4 puls x,u,pc                     ; clean up stack and return error condition
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; This table defines the various handler routines for the various bytecode tokens. Each token is defined as follows:
 ;               parse_tokdefT <sym>,<parse>,<list>,<exec>
 ; where:
@@ -369,11 +396,15 @@
                 *pragmapop nolist
                 endm
                 *pragmapop list
+                ; the tokens defined in this section all have special parsing or meaning
                 parse_tokendefp error           ; Used to mark errors; should always be first so it's token #0 
                 parse_tokendefp eot             ; End of input marker or special handling in word tables
                 parse_tokendefp int32           ; 32 bit integer (has special parsing)
                 parse_tokendefp float           ; floating point value (has special parsing)
                 parse_tokendefp ident           ; identifier (has special parsing)
+                parse_tokendefp linenum         ; a 16 bit unsigned integer treated as a line number
+                parse_tokendefp linerange       ; a pair of 16 bit unsigned integers treated as line numbers
+                ; everything below here references keywords or particle characters
                 parse_tokendefp stmtsep         ; statement separator
                 parse_tokendefp times           ; times (multiplication) operator (*)
                 parse_tokendefp plus            ; addition operator