Mercurial > hg > index.cgi
changeset 124:8770e6f977c3
Rework parser to use parse_wordtab for symbols too
There's no reason not to use the parse_wordtab table mechanism to match the
symbols with their token codes. A single table takes less space than the
separate code and lookup tables that previously handled symbols.
author | William Astle <lost@l-w.ca> |
---|---|
date | Mon, 01 Jan 2024 15:57:59 -0700 |
parents | 5681cdada362 |
children | 0607e4e20702 |
files | src/buildkeywordtab.c src/keywordlist.txt src/parse.s |
diffstat | 3 files changed, 72 insertions(+), 107 deletions(-) [+] |
line wrap: on
line diff
--- a/src/buildkeywordtab.c Mon Jan 01 15:15:45 2024 -0700 +++ b/src/buildkeywordtab.c Mon Jan 01 15:57:59 2024 -0700 @@ -106,7 +106,7 @@ // lose any line terminators while (*ptr == '\r' || *ptr == '\n') *ptr-- = '\0'; - ptr = strchr(linebuf, ','); + ptr = strchr(linebuf, '\t'); if (!ptr) { fprintf(stderr, "WARNING: malformed input line\n"); @@ -132,7 +132,6 @@ tnprev -> nextsibling = tn; else tnp -> firstchild = tn; - fprintf(stderr, "Create entry: %c, %s\n", tn -> ccode, tn -> toksym); } tnp = tn; } @@ -147,7 +146,7 @@ } fprintf(outfile, "; This file is automatically generated. Edit %s and rebuild to make changes.\n", argv[1]); fprintf(outfile, " *pragmapush list\n *pragma list\n"); - fprintf(outfile, "parse_wordtab\n"); + fprintf(outfile, "parse_wt\n"); print_tree(outfile, treeroot, NULL, 0); fprintf(outfile, " *pragmapop list\n"); fclose(outfile);
--- a/src/keywordlist.txt Mon Jan 01 15:15:45 2024 -0700 +++ b/src/keywordlist.txt Mon Jan 01 15:57:59 2024 -0700 @@ -1,20 +1,47 @@ -AND,token_and -AS,token_as -ASC,token_asc -DATA,token_data -ELSE,token_else -END,token_end -GO,token_go -LET,token_let -LIST,token_list -NEW,token_new -NOT,token_not -OR,token_or -POP,token_pop -PRINT,token_print -REM,token_rem -RETURN,token_return -RUN,token_run -STOP,token_stop -SUB,token_sub -TO,token_to +! token_bang +# token_hash +$ token_dollar +% token_percent +& token_amp +' token_apos +( token_oparen +) token_cparen +* token_star ++ token_plus +, token_comma +- token_minus +/ token_slash +: token_stmtsep +; token_semi +? token_print +@ token_at +^ token_exp +< token_lt +<= token_le +=< token_le +> token_gt +>= token_ge +=> token_ge +<> token_ne +>< token_ne += token_eq +AND token_and +AS token_as +ASC token_asc +DATA token_data +ELSE token_else +END token_end +GO token_go +LET token_let +LIST token_list +NEW token_new +NOT token_not +OR token_or +POP token_pop +PRINT token_print +REM token_rem +RETURN token_return +RUN token_run +STOP token_stop +SUB token_sub +TO token_to
--- a/src/parse.s Mon Jan 01 15:15:45 2024 -0700 +++ b/src/parse.s Mon Jan 01 15:57:59 2024 -0700 @@ -82,54 +82,44 @@ parse_nexttok1 ldb #token_eot ; flag end of input bra parse_nexttok6 ; go return it parse_nexttok2 sty parse_tokenst ; save start of current token after skipping spaces + bsr parse_toupper ; make sure we have upper case letters for matching + ldx #parse_wt ; point to keyword parsing table + bsr parse_wordtab ; go see if we have a match in the keyword table + bcc parse_nexttok6 ; brif we do - return it + ldy parse_tokenst ; return to the start of the token - pointer probably clobbered + bsr parse_curchar ; get back input character (may have been clobbered) cmpa #'. ; leading decimal? beq parse_nexttok3 ; brif so - parse number cmpa #'0 ; is it a digit - blo parse_nexttok4 ; brif not + blo parse_nexttok10 ; brif not cmpa #'9 ; is it still a digit? - bhi parse_nexttok4 ; brif not + bhi parse_nexttok10 ; brif not parse_nexttok3 jmp parse_number ; go parse a number -parse_nexttok4 ldx #parse_chartab ; point to list of single character tokens to recognize -parse_nexttok5 ldb 1,x ; get token value - cmpa ,x++ ; character match (and move to next entry) - bne parse_nexttok7 ; brif not parse_nexttok6 stb parse_curtok ; save token type leay 1,y ; eat the input character clra ; clear C to indicate no error (and clear Z also) rts -parse_nexttok7 cmpb #token_eot ; end of table? - bne parse_nexttok5 ; brif not - clrb ; initialize relational flags - pshs d ; save input character and relational flags for later -parse_nexttok8 cmpa #'< ; less than? - blo parse_nexttok9 ; brif not <, =, or > - cmpa #'> ; still <, =, or >? 
- bhi parse_nexttok9 ; brif not - suba #'< ; adjust < to 0 - cmpa #1 ; set C if <, clear if = or > - rola ; now 4 if >, 2 if =, or 1 if < - eora 1,s ; merge with previous relational characters - cmpa 1,s ; if it doesn't match, we have a dupe - bne parse_nexttok9 ; brif it's not valid - we won't recognize more in the token - sta 1,s ; save new relational flags - bsr parse_nextchar ; fetch next input - sta ,s ; save input character - bne parse_nexttok8 ; brif there was one - go handle it -parse_nexttok9 puls d ; get back input character and relational flag - tstb ; was it a relational operator? - beq parse_nexttok10 ; brif not - ldx #parse_reltab ; point to relational operator token table - ldb b,x ; get the token code - clra ; flag no error - rts ; return - but don't advance - we already did looking for multiples -parse_nexttok10 bsr parse_toupper ; convert to upper case - cmpa #'A ; is it alpha? +parse_nexttok10 cmpa #'A ; is it alpha? blo parse_nexttok11 ; brif not cmpa #'Z ; is it still alpha? bls parse_nexttok12 ; brif so parse_nexttok11 comb ; flag error - unrecognized token ldb #token_error rts +parse_nexttok12 bsr parse_nextcharu ; fetch next input character + cmpa #'0 ; is it alphanumeric? + blo parse_nexttok13 ; brif not + cmpa #'9 ; is it numeric? + bls parse_nexttok12 ; brif so - keep skipping it + cmpa #'A ; is it alpha? + blo parse_nexttok13 ; brif not + cmpa #'Z ; is it still alpha? + bls parse_nexttok12 ; brif so - keep skipping it +parse_nexttok13 tfr y,d ; calculate length of identifier + subd parse_tokenst + std val0+val.strlen ; save it for reference + ldb #token_ident ; indicate an identifier (variable name, etc.) + rts ; return result (C will be clear from SUBD above) parse_nextcharu bsr parse_nextchar ; fetch next input character beq parse_toupper0 ; brif end of input parse_toupper cmpa #'a ; is it lower case alpha? 
@@ -138,28 +128,6 @@ bhi parse_toupper0 ; brif not suba #0x20 ; adjust to upper case alpha parse_toupper0 rts ; Z only set here if input was zero entering from parse_nextcharu -; We parse alpha keywords and identifiers here, of the form [a-zA-Z][a-zA-Z0-9]* with a possible nonalpha characters -; in actual keywords. We use a table to parse keywords. As soon as we find a character that doesn't match a keyword -; table entry, we fall back to looking for the end of an identifier and then returning that. -parse_nexttok12 ldx #parse_wordtab ; point to keyword table - bsr parse_nexttok16 ; process this table entry - cmpb #token_ident ; did we match a token? - bne parse_nexttok6 ; brif so - go return it -parse_nexttok13 cmpa #'0 ; was it alphanumeric? - blo parse_nexttok15 ; brif not - cmpa #'9 ; was it numeric? - bls parse_nexttok14 ; brif so - cmpa #'A ; was it alpha? - blo parse_nexttok15 ; brif not - cmpa #'Z ; is it still alpha? - bhi parse_nexttok15 ; brif not -parse_nexttok14 bsr parse_nextcharu ; fetch next character and force upper case - bne parse_nexttok13 ; if not end of input, see if we have alphanumeric -parse_nexttok15 tfr y,d ; fetch input location - subd parse_tokenst ; calculate length of token - std val0+val.strlen ; save the length of the identifier - ldb #token_ident ; set token type to identifier (variable name, probably) - rts ; return token type, do not advance since we already did above ; This routine parses tokens using the table at parse_wordtab. 
The table is structured as follows: ; ; * two bytes which contain the length of the table less the two bytes for this length value @@ -203,35 +171,6 @@ parse_wordtab6 clra ; clear C to indicate a match puls a,x,pc ; clean up stack and return parse_number jmp parse_tokerr -; Relational token table, bits are > = < -parse_reltab fcb token_error - fcb token_lt - fcb token_eq - fcb token_le - fcb token_gt - fcb token_ne - fcb token_ge - fcb token_reltrue -; Single character token lookup table -parse_chartab fcb 0x21,token_bang ; ! - fcb 0x23,token_hash ; # - fcb 0x24,token_dollar ; $ - fcb 0x25,token_percent ; % - fcb 0x26,token_amp ; & - fcb 0x27,token_apos ; ' - fcb 0x28,token_oparen ; ( - fcb 0x29,token_cparen ; ) - fcb 0x2a,token_star ; * - fcb 0x2b,token_plus ; + - fcb 0x2c,token_comma ; , - fcb 0x2d,token_minus ; - - fcb 0x2f,token_slash ; / - fcb 0x3a,token_stmtsep ; : - fcb 0x3b,token_semi ; ; - fcb 0x3f,token_print ; ? - print shortcut - fcb 0x40,token_at ; @ - fcb 0x5e,token_exp ; ^ - exponentiation - fcb 0x00,token_eot ; end of table flag ; Parse tokens - define them in order using the macro parse_tokdef *pragmapush list *pragma nolist