Mercurial > hg > index.cgi

--- a/src/fps.s	Sat Oct 07 13:39:25 2023 -0600
+++ b/src/fps.s	Sat Oct 07 15:17:44 2023 -0600
@@ -71,6 +71,34 @@
                 stb fpa0extra
                 jmp fps_add10                   ; go normalize the result and return
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; fps_mul10: multiply the single precision value at (X) by 10, in place; meant to be faster than a general multiply.
+;
+; 1. stash a copy of the original value in a stack temporary
+; 2. quadruple the value at (X) by adding 2 to its exponent
+; 3. add the stashed original back in, giving 5 times the original
+; 4. double that by adding 1 to the exponent, giving 10 times the original
+;
+; An exponent wrap in step 2 or step 4 jumps to OVERROR. D, Y and U are overwritten.
+fps_mul10       leas -fps.size,s                ; reserve a stack temporary for the original value
+                ldu ,x                          ; stash bytes 0-1 (U is scratch here; it is reloaded before the add)
+                stu ,s
+                ldu 2,x                         ; stash bytes 2-3
+                stu 2,s
+                ldd 4,x                         ; stash bytes 4-5
+                std 4,s
+                lda fps.exp,x                   ; exponent + 2 makes the value 4 times larger
+                adda #2
+                bcs fps_mul10a                  ; brif the exponent wrapped - overflow
+                sta fps.exp,x
+                leay ,x                         ; result replaces the operand
+                leau ,s                         ; other addend is the stashed original
+                bsr fps_add                     ; add the original back in (times 5)
+                leas fps.size,s                 ; release the temporary
+                inc fps.exp,y                   ; exponent + 1 (times 10); assumes fps_add kept Y - confirm
+                bne fps_mul10b                  ; brif the exponent did not wrap - finished
+fps_mul10a      jmp OVERROR                     ; raise overflow
+fps_mul10b      rts
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Unary negation - negate (X) to (Y)
 fps_neg         ldd 2,x                         ; copy to output and keep exponent in A
                 std 2,y
@@ -335,6 +363,12 @@
                 clra                            ; clear carry - so shift above will terminate
                 bra fps_mul6                    ; go do another bit
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Divide (X) by 10 in place: aim U at the constant 10 and Y at the operand, then drop into the divide routine below.
+fps_const10     fcb 0x83,0xa0,0x00,0x00,0x00,0x00 ; single precision unpacked constant 10
+fps_div10       ldu #fps_const10                ; point to constant 10 (the divisor)
+                leay ,x                         ; put output in input (the result overwrites the dividend)
+                ; fall through to regular division - no rts here, the divide routine returns to our caller
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Single precision division (X) ÷ (U) -> (Y)
 ;
 ; This is basically the same algorithm used in the Color Basic ROM
--- a/src/int.s	Sat Oct 07 13:39:25 2023 -0600
+++ b/src/int.s	Sat Oct 07 15:17:44 2023 -0600
@@ -35,6 +35,38 @@
                 bvs OVERROR2                    ; raise overflow if needed
 int32_add0      rts
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Fast multiply 32 bit at (X) by 10, in place, as ((n * 4) + n) * 2. Clobbers D, U; raises overflow via OVERROR2.
+;
+; This works for signed values: V is correct after each left shift and the added original has the same sign, so the
+; magnitude only grows. V must be tested before the sum is stored because ST instructions always clear V.
+uint32_mul10    ldd val.int,x                   ; make copy of original
+                ldu val.int+2,x
+                pshs d,u                        ; save original (high word at ,s and low word at 2,s)
+                lsl val.int+3,x                 ; shift left (times 2)
+                rol val.int+2,x
+                rol val.int+1,x
+                rol val.int,x
+                bvs OVERROR2                    ; brif overflow (copy is still stacked; assumes handler resets S)
+                lsl val.int+3,x                 ; shift left (times 4)
+                rol val.int+2,x
+                rol val.int+1,x
+                rol val.int,x
+                bvs OVERROR2                    ; brif overflow
+                ldd val.int+2,x                 ; add original (times 5), low word first
+                addd 2,s
+                std val.int+2,x                 ; (the store leaves C alone so the carry survives)
+                puls d,u                        ; (get upper word and clean stack - flags are untouched)
+                adcb val.int+1,x                ; add upper word plus the carry from the low word
+                adca val.int,x
+                bvs OVERROR2                    ; brif overflow - checked before the store below clears V
+                std val.int,x
+                lsl val.int+3,x                 ; shift left again (times 10)
+                rol val.int+2,x
+                rol val.int+1,x
+                rol val.int,x
+                bvs OVERROR2                    ; brif overflow
+                rts
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Signed 32 bit integer multiply (X) * (U) -> (Y), overflow if exceeds signed 32 bit range
 int32_mul       ldd val.int+2,x                 ; copy left operand to temporary
                 std fpa0+fps.sig+2
@@ -227,6 +259,12 @@
                 stb fpa0extra
                 rts
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Integer divide (X) by 10 *in place*: aim U at the constant 10 and Y at the operand, then drop into int32_div.
+int32_const10   fqb 10                          ; integer constant 10 (NOTE(review): assumes val.int is 0 - confirm)
+int32_div10     ldu #int32_const10              ; point to integer constant 10 (the divisor)
+                leay ,x                         ; point to output location (same place as the input)
+                ; fall through to integer division - no rts here, the divide routine returns to our caller
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; 32 bit division, integer only, truncate fraction without rounding. Note that there is exactly one case where integer
 ; division can overflow: dividing -0x80000000 by -1 which yields 0x80000000. All other cases reduce the magnitude.
 int32_div       ldd val.int+2,x                 ; copy left operand to temporary