view src/int.s @ 83:a492441bfc56

Add utility multiply and divide by 10 routines Add a fast multiply by 10 routine for both integer and floating point (shift left twice, add original, shift left). Also add a simple call to divide by 10 for both though there is no fast shortcut for that.
author William Astle <lost@l-w.ca>
date Sat, 07 Oct 2023 15:17:44 -0600
parents 9a4e2364a966
children 663d8e77b579
line wrap: on
line source

                *pragmapush list
                *pragma list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit integer handling package.
;
; Negate a 32 bit integer in (X); done by subtracting it from zero
int32_neg       ldd zero                        ; subtract low word
                subd val.int+2,x
                std val.int+2,x
                ldd zero                        ; and now the high word
                sbcb val.int+1,x
                sbca val.int,x
                std val.int,x
                rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit integer addition (X) + (U) -> (Y)
int32_add       ldd val.int+2,x                 ; do low word
                addd val.int+2,u
                std val.int+2,y
                ldd val.int,x                   ; and the high word
                adcb val.int+1,u
                adca val.int,u
                std val.int,y
                bvc int32_add0                  ; raise overflow if needed
OVERROR2        jmp OVERROR
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit integer subtraction (X) - (U) -> (Y)
int32_sub       ldd val.int+2,x                 ; do low word
                subd val.int+2,u
                std val.int+2,y
                ldd val.int,x                   ; and the high word
                sbcb val.int+1,u
                sbca val.int,u
                std val.int,y
                bvs OVERROR2                    ; raise overflow if needed
int32_add0      rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Fast multiply 32 bit at (X) by 10
;
; This will work for signed because the left shift will double it even if it is negative and V is set correctly after
; left shifts. The add will have the same sign so the magnitude will still increase, not decrease.
uint32_mul10    ldd val.int,x                   ; make copy of original
                ldu val.int+2,x
                pshs d,u                        ; save original
                lsl val.int+3,x                 ; shift left (times 2)
                rol val.int+2,x
                rol val.int+1,x
                rol val.int,x
                bvs OVERROR2                    ; brif overflow
                lsl val.int+3,x                 ; shift left (times 4)
                rol val.int+2,x
                rol val.int+1,x
                rol val.int,x
                bvs OVERROR2                    ; brif overflow
                ldd val.int+2,x                 ; add original (times 5)
                addd 2,s
                std val.int+2,x
                puls d,u                        ; (get upper word and clean stack)
                adcb val.int+1,x
                adca val.int,x
                std val.int,x
                bvs OVERROR2                    ; brif overflow
                lsl val.int+3,x                 ; shift left again (times 10)
                rol val.int+2,x
                rol val.int+1,x
                rol val.int,x
                bvs OVERROR2                    ; brif overflow
                rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Signed 32 bit integer multiply (X) * (U) -> (Y), overflow if exceeds signed 32 bit range                
int32_mul       ldd val.int+2,x                 ; copy left operand to temporary
                std fpa0+fps.sig+2
                ldd val.int,x
                std fpa0+fps.sig
                eora val.int,u                  ; set sign bit in A if signs differ
                pshs a                          ; save result sign
                ldd val.int+2,u                 ; copy right operand to temporary
                std fpa1+fps.sig+2
                ldd val.int,u
                std fpa1+fps.sig
                bpl int32_mul0                  ; brif right operand is positive
                ldd zero                        ; negate right operand
                subd fpa1+fps.sig+2
                std fpa1+fps.sig+2
                ldd zero
                sbcb fpa1+fps.sig+1
                sbca fpa1+fps.sig
                std fpa1+fps.sig
int32_mul0      lda fpa0+fps.sig                ; is left operand negative?
                bpl int32_mul1                  ; brif not
                ldd zero                        ; negate left operand
                subd fpa0+fps.sig+2
                std fpa0+fps.sig+2
                ldd zero
                sbcb fpa0+fps.sig+1
                sbca fpa0+fps.sig
                std fpa0+fps.sig
int32_mul1      bsr util_mul32                  ; do the actual multiplication
                ldb fpa0extra                   ; are upper bits all zero?
                orb fpa0extra1
                orb fpa0extra2
                orb fpa0extra3
                bne int32_mul4                  ; brif not - overflow to floating point
                ldb fpa0extra4                  ; is bit 31 set?
                bpl int32_mul2                  ; brif not - no overflow
                lda ,s                          ; negative result wanted?
                bpl int32_mul4                  ; brif not - overflow to floating point
                andb #0x7f                      ; lose extra sign bit
                orb fpa0extra2                  ; "or" in other bytes to see if all but bit 31 are zero
                orb fpa0extra6
                orb fpa0extra7
                bne int32_mul4                  ; brif any nonzero bits - we overflowed maximum negative number
int32_mul2      ldb ,s+                         ; do we want a negative result?
                bpl int32_mul3                  ; brif not - don't negate result
                ldd zero                        ; negate result
                subd fpa0extra6
                std fpa0extra6
                ldd zero
                sbcb fpa0extra5
                sbca fpa0extra4
                std fpa0extra4
int32_mul3      ldd fpa0extra4                  ; copy result to destination
                std val.int,y
                ldd fpa0extra6
                std val.int+2,y
                rts
int32_mul4      puls b                          ; get back desired sign
                sex                             ; set proper sign for floating point result
                ldx #fpa0extra                  ; point to 64 bit unsigned result
                jmp fps_fromuint64s             ; go convert to floating point using the sign in A
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit multiply.
;
; Significands of fpa0 and fpa1, treated as unsigned, are multiplied with the product being stored in the fpa0extra
; memory locations.
;
; The agorithm is simply this: zero out the result, then multiply fpa0 by each byte of fpa1 and then add the result
; to the result location. This yields a 64 bit product which is somewhat wasteful.
util_mul32      ldd zero                        ;* zero out result bits; low 16 bits don't need to be cleared and
                stb fpa0extra3                  ;* upper 24 bits also don't
                std fpa0extra4
                ldb fpa1+fps.sig+3              ; multiply by low byte of fpa1 - no carries possible for this iteration
                lda fpa0+fps.sig+3
                mul
                std fpa0extra6
                ldb fpa1+fps.sig+3
                lda fpa0+fps.sig+2
                mul
                addd fpa0extra5
                std fpa0extra5
                ldb fpa1+fps.sig+3
                lda fpa0+fps.sig+1
                mul
                addd fpa0extra4
                std fpa0extra4
                ldb fpa1+fps.sig+3
                lda fpa0+fps.sig
                mul
                addd fpa0extra3
                std fpa0extra3
; Now we potentially have cascading carries at every stage; it makes more sense to handle those in a separate
; addition pass after each partial calculation. The partial calculations are identical to above. This is completely
; unrolled for speed.
                ldd zero                        ; zero out extra work bytes
                std fpa0extra8
                stb fpa0extra10
                ldb fpa1+fps.sig+2              ; multiply by second low byte of fpa1
                lda fpa0+fps.sig+3
                mul
                std fpa0extra11
                ldb fpa1+fps.sig+2
                lda fpa0+fps.sig+2
                mul
                addd fpa0extra10
                std fpa0extra10
                ldb fpa1+fps.sig+2
                lda fpa0+fps.sig+1
                mul
                addd fpa0extra9
                std fpa0extra9
                ldb fpa1+fps.sig+2
                lda fpa0+fps.sig
                mul
                addd fpa0extra8
                std fpa0extra8
                ldd fpa0extra11                 ; add to partial product (shifted left 8 bits)
                addd fpa0extra5
                std fpa0extra5
                ldd fpa0extra9
                adcb fpa0extra4
                adca fpa0extra3
                std fpa0extra3
                ldb #0
                adcb fpa0extra8
                stb fpa0extra2
                ldd zero                        ; and do it all again for next byte of fpa1
                std fpa0extra8
                stb fpa0extra10
                ldb fpa1+fps.sig+1
                lda fpa0+fps.sig+3
                mul
                std fpa0extra11
                ldb fpa1+fps.sig+1
                lda fpa0+fps.sig+2
                mul
                addd fpa0extra10
                std fpa0extra10
                ldb fpa1+fps.sig+1
                lda fpa0+fps.sig+1
                mul
                addd fpa0extra9
                std fpa0extra9
                ldb fpa1+fps.sig+1
                lda fpa0+fps.sig
                mul
                addd fpa0extra8
                std fpa0extra8
                ldd fpa0extra11
                addd fpa0extra4
                std fpa0extra4
                ldd fpa0extra9
                adcb fpa0extra3
                adca fpa0extra2
                std fpa0extra2
                ldb #0
                adcb fpa0extra8
                stb fpa0extra1
                ldd zero                        ; and the final sequence with the fpa1 high byte
                std fpa0extra8
                stb fpa0extra10
                ldb fpa1+fps.sig
                lda fpa0+fps.sig+3
                mul
                std fpa0extra11
                ldb fpa1+fps.sig
                lda fpa0+fps.sig+2
                mul
                addd fpa0extra10
                std fpa0extra10
                ldb fpa1+fps.sig
                lda fpa0+fps.sig+1
                mul
                addd fpa0extra9
                std fpa0extra9
                ldb fpa1+fps.sig
                lda fpa0+fps.sig
                mul
                addd fpa0extra8
                std fpa0extra8
                ldd fpa0extra11
                addd fpa0extra3
                std fpa0extra3
                ldd fpa0extra9
                adcb fpa0extra2
                adca fpa0extra1
                std fpa0extra1
                ldb #0
                adcb fpa0extra
                stb fpa0extra
                rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Integer divide (X) by 10 *in place*
int32_const10   fqb 10                          ; integer constant 10
int32_div10     ldu #int32_const10              ; point to integer constant 10
                leay ,x                         ; point to output location
                ; fall through to integer division
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit division, integer only, truncate fraction without rounding. Note that there is exactly one case where integer
; division can overflow: dividing -0x80000000 by -1 which yields 0x80000000. All other cases reduce the magnitude.
int32_div       ldd val.int+2,x                 ; copy left operand to temporary
                std fpa0+fps.sig+2
                ldd val.int,x
                std fpa0+fps.sig
                eora val.int,u                  ; set sign bit in A if signs differ
                pshs a                          ; save result sign
                ldd val.int+2,u                 ; copy right operand to temporary
                std fpa1+fps.sig+2
                ldd val.int,u
                std fpa1+fps.sig
                bpl int32_div0                  ; brif right operand is positive
                ldd zero                        ; negate right operand
                subd fpa1+fps.sig+2
                std fpa1+fps.sig+2
                ldd zero
                sbcb fpa1+fps.sig+1
                sbca fpa1+fps.sig
                std fpa1+fps.sig
int32_div0      lda fpa0+fps.sig                ; is left operand negative?
                bpl int32_div1                  ; brif not
                ldd zero                        ; negate left operand
                subd fpa0+fps.sig+2
                std fpa0+fps.sig+2
                ldd zero
                sbcb fpa0+fps.sig+1
                sbca fpa0+fps.sig
                std fpa0+fps.sig
int32_div1      ldb fpa1+fps.sig                ; check for division by zero
                orb fpa1+fps.sig+1
                orb fpa1+fps.sig+2
                orb fpa1+fps.sig+3
                lbne DIV0ERROR                  ; brif division by zero
                bsr util_div32                  ; do the actual division
                lda ,s+                         ; get desired sign
                bmi int32_div2                  ; brif want negative - we can't overflow in that case
                ldb fpa0extra                   ; get high byte of result
                lbmi OVERROR2                   ; brif we ended up with 0x80000000 positive
                bra int32_div3                  ; go return result
int32_div2      ldd zero                        ; negate result to correct sign
                subd fpa0extra+2
                std fpa0extra+2
                ldd zero
                sbcb fpa0extra+1
                sbca fpa0extra
                std fpa0extra
int32_div3      ldd fpa0extra                   ; copy result to destination
                std val.int,y
                ldd fpa0extra2
                std val.int+2,y
                rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Divide 32 bit integer in fpa0 significand by 32 bit integer in fpa1 significand, both treated as unsigned. Leave
; quotient at fpa0extra...fpa0extra3 and remainder at fpa0extra4...fpa0extra7; does not check for division by zero
; which will result in a quotient of 0xffffffff and a remainder will be the dividend. It will not get suck in a loop.
;
; Algorithm is basically pencil and paper long division. We check to see if the divisor "goes" at each step by doing
; a trial subtraction without saving the result. If it doesn't go, we just loop around again. If it does go, we stash
; a 1 bit in the quotient and actually do the subtraction. Then go loop around again. Doing it this way rather than
; with an actual subtraction and then undoing it with addition saves two store instructions on the comparison saves
; having to do a restore in the no-go case which is going to be quite common with values whose upper bits are
; mostly zeroes, thus it makes the operations faster in that case, for integers. (Floating point is a different
; problem.) 
util_div32      ldd fpa0+fps.sig+2              ; copy dividend to result location
                std fpa0extra6
                ldd fpa0+fps.sig
                std fpa0extra4
                ldb #32                         ; do 32 bits
                stb fpa0+fps.exp                ; save counter somewhere because we don't have enough registers
                ldd zero                        ; zero out remainder
                std fpa0extra4
                std fpa0extra6
util_div32a     lsl fpa0extra3                  ; shift dividend residue into remainder
                rol fpa0extra2
                rol fpa0extra1
                rol fpa0extra
                rol fpa0extra7
                rol fpa0extra6
                rol fpa0extra5
                rol fpa0extra4
                ldd fpa0extra6                  ; now subtract divisor from remainder
                subd fpa1+fps.sig+2
                ldd fpa0extra4
                sbcb fpa1+fps.sig+1
                sbca fpa1+fps.sig
                bcs util_div32b                 ; brif it doesn't go - don't subtract or set bit
                inc fpa0extra3                  ; set quotient bit
                ldd fpa0extra6                  ; actually do the subtraction
                subd fpa1+fps.sig+2
                std fpa0extra6
                ldd fpa0extra4
                sbcb fpa1+fps.sig+1
                sbca fpa1+fps.sig
                std fpa0extra4
util_div32b     dec fpa0+fps.exp                ; done all 32 bits?
                bne util_div32a                 ; do another
                *pragmapop list