view src/int.s @ 108:b1958992a66a

Properly initialize result for integer multiplication
author William Astle <lost@l-w.ca>
date Sun, 03 Dec 2023 14:57:19 -0700
parents b375a38b2b1a
children b10f6c8979a5
line wrap: on
line source

                *pragmapush list
                *pragma list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Convert a signed integer in val0 to a string in strbuff
;
; The maximum size of a string generated here is 12 bytes representing 10 significant digits, a leading sign, and the
; trailing NUL.
int_toascii     ldu #strbuff+1                  ; point to start of digits
                lda #0x20                       ; default sign to space
                sta -1,u
                ldx #int_toascii5               ; point to digit constant table
                ldd #0xa00                      ; do 10 digits, no nonzero seen yet
                std fpaextra                    ; save digit counts
                ldd val0+val.int+2              ; copy to temporary accumulator
                std fpaextra+4
                ldd val0+val.int
                std fpaextra+2                  ; (will set N if the number is negative)
                bpl int_toascii0                ; brif so
                lda #'-                         ; negative sign
                sta -1,u                        ; set sign
                ldd zero                        ; negate the value
                subd fpaextra+4
                std fpaextra+4
                ldd zero
                sbcb fpaextra+3
                sbca fpaextra+2
                std fpaextra+2
int_toascii0    lda #0x2f                       ; initialize digit (account for inc below)
                sta ,u
int_toascii1    inc ,u                          ; bump digit
                ldd fpaextra+4                  ; subtract digit constnat
                subd 2,x
                std fpaextra+4
                ldd fpaextra+2
                sbcb 1,x
                sbca ,x
                std fpaextra+2
                bcc int_toascii1                ; brif we aren't at the right digit
                ldd fpaextra+4                  ; undo extra subtraction
                addd 2,x
                std fpaextra+4
                ldd fpaextra+2
                adcb 1,x
                adca ,x
                std fpaextra+2
                leax 4,x                        ; move to next constant
                lda ,u                          ; get digit count
                cmpa #'0                        ; is it zero?
                bne int_toascii2                ; brif not
                lda fpaextra+1                  ; get nonzero digit count
                beq int_toascii3                ; brif we haven't already got a nonzero - don't count the zero
int_toascii2    inc fpaextra+1                  ; bump nonzero digit count
                leau 1,u                        ; move to next digit position                
int_toascii3    dec fpaextra                    ; done all digits?
                bne int_toascii0                ; brif not
                ldb fpaextra+1                  ; did we have any nonzero digits?
                bne int_toascii4                ; brif so
                leau 1,u                        ; move past the only zero in the output
int_toascii4    clr ,u                          ; NUL terminate the string
                rts
int_toascii5    fqb 1000000000                  ; digit place values
                fqb 100000000
                fqb 10000000
                fqb 1000000
                fqb 100000
                fqb 10000
                fqb 1000
                fqb 100
                fqb 10
                fqb 1
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit integer handling package.
;
; Negate a 32 bit integer in (X); done by subtracting it from zero
int32_neg       ldd zero                        ; subtract low word
                subd val.int+2,x
                std val.int+2,x
                ldd zero                        ; and now the high word
                sbcb val.int+1,x
                sbca val.int,x
                std val.int,x
                rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit integer addition (X) + (U) -> (Y)
int32_add       ldd val.int+2,x                 ; do low word
                addd val.int+2,u
                std val.int+2,y
                ldd val.int,x                   ; and the high word
                adcb val.int+1,u
                adca val.int,u
                std val.int,y
                bvc int32_add0                  ; raise overflow if needed
OVERROR2        jmp OVERROR
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit integer subtraction (X) - (U) -> (Y)
int32_sub       ldd val.int+2,x                 ; do low word
                subd val.int+2,u
                std val.int+2,y
                ldd val.int,x                   ; and the high word
                sbcb val.int+1,u
                sbca val.int,u
                std val.int,y
                bvs OVERROR2                    ; raise overflow if needed
int32_add0      rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Fast multiply 32 bit at (X) by 10
;
; This will work for signed because the left shift will double it even if it is negative and V is set correctly after
; left shifts. The add will have the same sign so the magnitude will still increase, not decrease.
uint32_mul10    ldd val.int,x                   ; make copy of original
                ldu val.int+2,x
                pshs d,u                        ; save original
                lsl val.int+3,x                 ; shift left (times 2)
                rol val.int+2,x
                rol val.int+1,x
                rol val.int,x
                bvs OVERROR2                    ; brif overflow
                lsl val.int+3,x                 ; shift left (times 4)
                rol val.int+2,x
                rol val.int+1,x
                rol val.int,x
                bvs OVERROR2                    ; brif overflow
                ldd val.int+2,x                 ; add original (times 5)
                addd 2,s
                std val.int+2,x
                puls d,u                        ; (get upper word and clean stack)
                adcb val.int+1,x
                adca val.int,x
                std val.int,x
                bvs OVERROR2                    ; brif overflow
                lsl val.int+3,x                 ; shift left again (times 10)
                rol val.int+2,x
                rol val.int+1,x
                rol val.int,x
                bvs OVERROR2                    ; brif overflow
                rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Signed 32 bit integer multiply (X) * (U) -> (Y), overflow if exceeds signed 32 bit range                
int32_mul       ldd val.int+2,x                 ; copy left operand to temporary
                std fpa0+fpa.sig+2
                ldd val.int,x
                std fpa0+fpa.sig
                eora val.int,u                  ; set sign bit in A if signs differ
                pshs a                          ; save result sign
                ldd val.int+2,u                 ; copy right operand to temporary
                std fpa1+fpa.sig+2
                ldd val.int,u
                std fpa1+fpa.sig
                bpl int32_mul0                  ; brif right operand is positive
                ldd zero                        ; negate right operand
                subd fpa1+fpa.sig+2
                std fpa1+fpa.sig+2
                ldd zero
                sbcb fpa1+fpa.sig+1
                sbca fpa1+fpa.sig
                std fpa1+fpa.sig
int32_mul0      lda fpa0+fpa.sig                ; is left operand negative?
                bpl int32_mul1                  ; brif not
                ldd zero                        ; negate left operand
                subd fpa0+fpa.sig+2
                std fpa0+fpa.sig+2
                ldd zero
                sbcb fpa0+fpa.sig+1
                sbca fpa0+fpa.sig
                std fpa0+fpa.sig
int32_mul1      bsr util_mul32                  ; do the actual multiplication
                ldb fpaextra                   ; are upper bits all zero?
                orb fpaextra+1
                orb fpaextra+2
                orb fpaextra+3
                bne int32_mul4                  ; brif not - overflow to floating point
                ldb fpaextra+4                  ; is bit 31 set?
                bpl int32_mul2                  ; brif not - no overflow
                lda ,s                          ; negative result wanted?
                bpl int32_mul4                  ; brif not - overflow to floating point
                andb #0x7f                      ; lose extra sign bit
                orb fpaextra+2                  ; "or" in other bytes to see if all but bit 31 are zero
                orb fpaextra+6
                orb fpaextra+7
                bne int32_mul4                  ; brif any nonzero bits - we overflowed maximum negative number
int32_mul2      ldb ,s+                         ; do we want a negative result?
                bpl int32_mul3                  ; brif not - don't negate result
                ldd zero                        ; negate result
                subd fpaextra+6
                std fpaextra+6
                ldd zero
                sbcb fpaextra+5
                sbca fpaextra+4
                std fpaextra+4
int32_mul3      ldd fpaextra+4                  ; copy result to destination
                std val.int,y
                ldd fpaextra+6
                std val.int+2,y
                rts
int32_mul4      puls b                          ; get back desired sign
                sex                             ; set proper sign for floating point result
                ldx #fpaextra                  ; point to 64 bit unsigned result
;                jmp fps_fromuint64s             ; go convert to floating point using the sign in A
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit multiply.
;
; Significands of fpa0 and fpa1, treated as unsigned, are multiplied with the product being stored in the fpaextra
; memory locations.
;
; The agorithm is simply this: zero out the result, then multiply fpa0 by each byte of fpa1 and then add the result
; to the result location. This yields a 64 bit product which is somewhat wasteful.
util_mul32      ldd zero                        ;* zero out result bits; low 16 bits don't need to be cleared
                std fpaextra
                std fpaextra+2
                std fpaextra+4
                ldb fpa1+fpa.sig+3              ; multiply by low byte of fpa1 - no carries possible for this iteration
                lda fpa0+fpa.sig+3
                mul
                std fpaextra+6
                ldb fpa1+fpa.sig+3
                lda fpa0+fpa.sig+2
                mul
                addd fpaextra+5
                std fpaextra+5
                ldb fpa1+fpa.sig+3
                lda fpa0+fpa.sig+1
                mul
                addd fpaextra+4
                std fpaextra+4
                ldb fpa1+fpa.sig+3
                lda fpa0+fpa.sig
                mul
                addd fpaextra+3
                std fpaextra+3
; Now we potentially have cascading carries at every stage; it makes more sense to handle those in a separate
; addition pass after each partial calculation. The partial calculations are identical to above. This is completely
; unrolled for speed.
                ldd zero                        ; zero out extra work bytes
                std fpaextra+8
                stb fpaextra+10
                ldb fpa1+fpa.sig+2              ; multiply by second low byte of fpa1
                lda fpa0+fpa.sig+3
                mul
                std fpaextra+11
                ldb fpa1+fpa.sig+2
                lda fpa0+fpa.sig+2
                mul
                addd fpaextra+10
                std fpaextra+10
                ldb fpa1+fpa.sig+2
                lda fpa0+fpa.sig+1
                mul
                addd fpaextra+9
                std fpaextra+9
                ldb fpa1+fpa.sig+2
                lda fpa0+fpa.sig
                mul
                addd fpaextra+8
                std fpaextra+8
                ldd fpaextra+11                 ; add to partial product (shifted left 8 bits)
                addd fpaextra+5
                std fpaextra+5
                ldd fpaextra+9
                adcb fpaextra+4
                adca fpaextra+3
                std fpaextra+3
                ldb #0
                adcb fpaextra+8
                stb fpaextra+2
                ldd zero                        ; and do it all again for next byte of fpa1
                std fpaextra+8
                stb fpaextra+10
                ldb fpa1+fpa.sig+1
                lda fpa0+fpa.sig+3
                mul
                std fpaextra+11
                ldb fpa1+fpa.sig+1
                lda fpa0+fpa.sig+2
                mul
                addd fpaextra+10
                std fpaextra+10
                ldb fpa1+fpa.sig+1
                lda fpa0+fpa.sig+1
                mul
                addd fpaextra+9
                std fpaextra+9
                ldb fpa1+fpa.sig+1
                lda fpa0+fpa.sig
                mul
                addd fpaextra+8
                std fpaextra+8
                ldd fpaextra+11
                addd fpaextra+4
                std fpaextra+4
                ldd fpaextra+9
                adcb fpaextra+3
                adca fpaextra+2
                std fpaextra+2
                ldb #0
                adcb fpaextra+8
                stb fpaextra+1
                ldd zero                        ; and the final sequence with the fpa1 high byte
                std fpaextra+8
                stb fpaextra+10
                ldb fpa1+fpa.sig
                lda fpa0+fpa.sig+3
                mul
                std fpaextra+11
                ldb fpa1+fpa.sig
                lda fpa0+fpa.sig+2
                mul
                addd fpaextra+10
                std fpaextra+10
                ldb fpa1+fpa.sig
                lda fpa0+fpa.sig+1
                mul
                addd fpaextra+9
                std fpaextra+9
                ldb fpa1+fpa.sig
                lda fpa0+fpa.sig
                mul
                addd fpaextra+8
                std fpaextra+8
                ldd fpaextra+11
                addd fpaextra+3
                std fpaextra+3
                ldd fpaextra+9
                adcb fpaextra+2
                adca fpaextra+1
                std fpaextra+1
                ldb #0
                adcb fpaextra
                stb fpaextra
                rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Integer divide (X) by 10 *in place*
int32_const10   fqb 10                          ; integer constant 10
int32_div10     ldu #int32_const10              ; point to integer constant 10
                leay ,x                         ; point to output location
                ; fall through to integer division
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit division, integer only, truncate fraction without rounding. Note that there is exactly one case where integer
; division can overflow: dividing -0x80000000 by -1 which yields 0x80000000. All other cases reduce the magnitude.
int32_div       ldd val.int+2,x                 ; copy left operand to temporary
                std fpa0+fpa.sig+2
                ldd val.int,x
                std fpa0+fpa.sig
                eora val.int,u                  ; set sign bit in A if signs differ
                pshs a                          ; save result sign
                ldd val.int+2,u                 ; copy right operand to temporary
                std fpa1+fpa.sig+2
                ldd val.int,u
                std fpa1+fpa.sig
                bpl int32_div0                  ; brif right operand is positive
                ldd zero                        ; negate right operand
                subd fpa1+fpa.sig+2
                std fpa1+fpa.sig+2
                ldd zero
                sbcb fpa1+fpa.sig+1
                sbca fpa1+fpa.sig
                std fpa1+fpa.sig
int32_div0      lda fpa0+fpa.sig                ; is left operand negative?
                bpl int32_div1                  ; brif not
                ldd zero                        ; negate left operand
                subd fpa0+fpa.sig+2
                std fpa0+fpa.sig+2
                ldd zero
                sbcb fpa0+fpa.sig+1
                sbca fpa0+fpa.sig
                std fpa0+fpa.sig
int32_div1      ldb fpa1+fpa.sig                ; check for division by zero
                orb fpa1+fpa.sig+1
                orb fpa1+fpa.sig+2
                orb fpa1+fpa.sig+3
                lbne DIV0ERROR                  ; brif division by zero
                bsr util_div32                  ; do the actual division
                lda ,s+                         ; get desired sign
                bmi int32_div2                  ; brif want negative - we can't overflow in that case
                ldb fpaextra                   ; get high byte of result
                lbmi OVERROR2                   ; brif we ended up with 0x80000000 positive
                bra int32_div3                  ; go return result
int32_div2      ldd zero                        ; negate result to correct sign
                subd fpaextra+2
                std fpaextra+2
                ldd zero
                sbcb fpaextra+1
                sbca fpaextra
                std fpaextra
int32_div3      ldd fpaextra                   ; copy result to destination
                std val.int,y
                ldd fpaextra+2
                std val.int+2,y
                rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Divide 32 bit integer in fpa0 significand by 32 bit integer in fpa1 significand, both treated as unsigned. Leave
; quotient at fpaextra...fpaextra+3 and remainder at fpaextra+4...fpaextra+7; does not check for division by zero
; which will result in a quotient of 0xffffffff and a remainder will be the dividend. It will not get suck in a loop.
;
; Algorithm is basically pencil and paper long division. We check to see if the divisor "goes" at each step by doing
; a trial subtraction without saving the result. If it doesn't go, we just loop around again. If it does go, we stash
; a 1 bit in the quotient and actually do the subtraction. Then go loop around again. Doing it this way rather than
; with an actual subtraction and then undoing it with addition saves two store instructions on the comparison saves
; having to do a restore in the no-go case which is going to be quite common with values whose upper bits are
; mostly zeroes, thus it makes the operations faster in that case, for integers. (Floating point is a different
; problem.) 
util_div32      ldd fpa0+fpa.sig+2              ; copy dividend to result location
                std fpaextra+6
                ldd fpa0+fpa.sig
                std fpaextra+4
                ldb #32                         ; do 32 bits
                stb fpa0+fpa.exp                ; save counter somewhere because we don't have enough registers
                ldd zero                        ; zero out remainder
                std fpaextra+4
                std fpaextra+6
util_div32a     lsl fpaextra+3                  ; shift dividend residue into remainder
                rol fpaextra+2
                rol fpaextra+1
                rol fpaextra
                rol fpaextra+7
                rol fpaextra+6
                rol fpaextra+5
                rol fpaextra+4
                ldd fpaextra+6                  ; now subtract divisor from remainder
                subd fpa1+fpa.sig+2
                ldd fpaextra+4
                sbcb fpa1+fpa.sig+1
                sbca fpa1+fpa.sig
                bcs util_div32b                 ; brif it doesn't go - don't subtract or set bit
                inc fpaextra+3                  ; set quotient bit
                ldd fpaextra+6                  ; actually do the subtraction
                subd fpa1+fpa.sig+2
                std fpaextra+6
                ldd fpaextra+4
                sbcb fpa1+fpa.sig+1
                sbca fpa1+fpa.sig
                std fpaextra+4
util_div32b     dec fpa0+fpa.exp                ; done all 32 bits?
                bne util_div32a                 ; do another
                *pragmapop list