view src/int.s @ 80:bb50ac9fdf37

Checkpoint with very basic integer and floating point arithmetic, untested This commit has implementations for floating point add, subtract, multiply, and divide, along with 32 bit signed integer equivalents. These can probably be optimized and they are untested.
author William Astle <lost@l-w.ca>
date Sat, 07 Oct 2023 02:56:59 -0600
parents
children fbc14509955a
line wrap: on
line source

                *pragmapush list
                *pragma list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit integer handling package.
;
; Negate a 32 bit integer in (X); done by subtracting it from zero
int32_neg       ldd zero                        ; subtract low word
                subd val.int+2,x
                std val.int+2,x
                ldd zero                        ; and now the high word
                sbcb val.int+1,x
                sbca val.int,x
                std val.int,x
                rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit integer addition (X) + (U) -> (Y)
int32_add       ldd val.int+2,x                 ; do low word
                addd val.int+2,u
                std val.int+2,y
                ldd val.int,x                   ; and the high word
                adcb val.int+1,u
                adca val.int,u
                std val.int,y
                bvc int32_add0                  ; raise overflow if needed
OVERROR2        jmp OVERROR
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit integer subtraction (X) - (U) -> (Y)
int32_sub       ldd val.int+2,x                 ; do low word
                subd val.int+2,u
                std val.int+2,y
                ldd val.int,x                   ; and the high word
                sbcb val.int+1,u
                sbca val.int,u
                std val.int,y
                bvs OVERROR2                    ; raise overflow if needed
int32_add0      rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Signed 32 bit integer multiply (X) * (U) -> (Y), overflow if exceeds signed 32 bit range                
int32_mul       ldd val.int+2,x                 ; copy left operand to temporary
                std fpa0+fps.sig+2
                ldd val.int,x
                std fpa0+fps.sig
                eora val.int,u                  ; set sign bit in A if signs differ
                pshs a                          ; save result sign
                ldd val.int+2,u                 ; copy right operand to temporary
                std fpa1+fps.sig+2
                ldd val.int,u
                std fpa1+fps.sig
                bpl int32_mul0                  ; brif right operand is positive
                ldd zero                        ; negate right operand
                subd fpa1+fps.sig+2
                std fpa1+fps.sig+2
                ldd zero
                sbcb fpa1+fps.sig+1
                sbca fpa1+fps.sig
                std fpa1+fps.sig
int32_mul0      lda fpa0+fps.sig                ; is left operand negative?
                bpl int32_mul1                  ; brif not
                ldd zero                        ; negate left operand
                subd fpa0+fps.sig+2
                std fpa0+fps.sig+2
                ldd zero
                sbcb fpa0+fps.sig+1
                sbca fpa0+fps.sig
                std fpa0+fps.sig
int32_mul1      bsr util_mul32                  ; do the actual multiplication
                ldb fpa0extra                   ; are upper bits all zero?
                orb fpa0extra1
                orb fpa0extra2
                orb fpa0extra3
                bne OVERROR2                    ; brif not - overflowed
                ldb fpa0extra4                  ; is bit 31 set?
                bpl int32_mul2                  ; brif not - no overflow
                lda ,s                          ; negative result wanted?
                bpl OVERROR2                    ; brif not - we overflowed
                andb #0x7f                      ; lose extra sign bit
                orb fpa0extra5                  ; "or" in other bytes to see if all but bit 31 are zero
                orb fpa0extra6
                orb fpa0extra7
                bne OVERROR2                    ; brif any nonzero bits - we overflowed maximum negative number
                ldb ,s+                         ; do we want a negative result?
                bpl int32_mul2                  ; brif not
                ldd zero                        ; negate result
                subd fpa0extra6
                std fpa0extra6
                ldd zero
                sbcb fpa0extra5
                sbca fpa0extra4
                std fpa0extra4
int32_mul2      ldd fpa0extra4                  ; copy result to destination
                std val.int,y
                ldd fpa0extra6
                std val.int+2,y
                rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit multiply.
;
; Significands of fpa0 and fpa1, treated as unsigned, are multiplied with the product being stored in the fpa0extra
; memory locations.
;
; The agorithm is simply this: zero out the result, then multiply fpa0 by each byte of fpa1 and then add the result
; to the result location. This yields a 64 bit product which is somewhat wasteful.
util_mul32      ldd zero                        ;* zero out result bits; low 16 bits don't need to be cleared and
                stb fpa0extra3                  ;* upper 24 bits also don't
                std fpa0extra4
                ldb fpa1+fps.sig+3              ; multiply by low byte of fpa1 - no carries possible for this iteration
                lda fpa0+fps.sig+3
                mul
                std fpa0extra6
                ldb fpa1+fps.sig+3
                lda fpa0+fps.sig+2
                mul
                addd fpa0extra5
                std fpa0extra5
                ldb fpa1+fps.sig+3
                lda fpa0+fps.sig+1
                mul
                addd fpa0extra4
                std fpa0extra4
                ldb fpa1+fps.sig+3
                lda fpa0+fps.sig
                mul
                addd fpa0extra3
                std fpa0extra3
; Now we potentially have cascading carries at every stage; it makes more sense to handle those in a separate
; addition pass after each partial calculation. The partial calculations are identical to above. This is completely
; unrolled for speed.
                ldd zero                        ; zero out extra work bytes
                std fpa0extra8
                stb fpa0extra10
                ldb fpa1+fps.sig+2              ; multiply by second low byte of fpa1
                lda fpa0+fps.sig+3
                mul
                std fpa0extra11
                ldb fpa1+fps.sig+2
                lda fpa0+fps.sig+2
                mul
                addd fpa0extra10
                std fpa0extra10
                ldb fpa1+fps.sig+2
                lda fpa0+fps.sig+1
                mul
                addd fpa0extra9
                std fpa0extra9
                ldb fpa1+fps.sig+2
                lda fpa0+fps.sig
                mul
                addd fpa0extra8
                std fpa0extra8
                ldd fpa0extra11                 ; add to partial product (shifted left 8 bits)
                addd fpa0extra5
                std fpa0extra5
                ldd fpa0extra9
                adcb fpa0extra4
                adca fpa0extra3
                std fpa0extra3
                ldb #0
                adcb fpa0extra8
                stb fpa0extra2
                ldd zero                        ; and do it all again for next byte of fpa1
                std fpa0extra8
                stb fpa0extra10
                ldb fpa1+fps.sig+1
                lda fpa0+fps.sig+3
                mul
                std fpa0extra11
                ldb fpa1+fps.sig+1
                lda fpa0+fps.sig+2
                mul
                addd fpa0extra10
                std fpa0extra10
                ldb fpa1+fps.sig+1
                lda fpa0+fps.sig+1
                mul
                addd fpa0extra9
                std fpa0extra9
                ldb fpa1+fps.sig+1
                lda fpa0+fps.sig
                mul
                addd fpa0extra8
                std fpa0extra8
                ldd fpa0extra11
                addd fpa0extra4
                std fpa0extra4
                ldd fpa0extra9
                adcb fpa0extra3
                adca fpa0extra2
                std fpa0extra2
                ldb #0
                adcb fpa0extra8
                stb fpa0extra1
                ldd zero                        ; and the final sequence with the fpa1 high byte
                std fpa0extra8
                stb fpa0extra10
                ldb fpa1+fps.sig
                lda fpa0+fps.sig+3
                mul
                std fpa0extra11
                ldb fpa1+fps.sig
                lda fpa0+fps.sig+2
                mul
                addd fpa0extra10
                std fpa0extra10
                ldb fpa1+fps.sig
                lda fpa0+fps.sig+1
                mul
                addd fpa0extra9
                std fpa0extra9
                ldb fpa1+fps.sig
                lda fpa0+fps.sig
                mul
                addd fpa0extra8
                std fpa0extra8
                ldd fpa0extra11
                addd fpa0extra3
                std fpa0extra3
                ldd fpa0extra9
                adcb fpa0extra2
                adca fpa0extra1
                std fpa0extra1
                ldb #0
                adcb fpa0extra
                stb fpa0extra
                rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit division, integer only, truncate fraction without rounding. Note that there is exactly one case where integer
; division can overflow: dividing -0x80000000 by -1 which yields 0x80000000. All other cases reduce the magnitude.
int32_div       ldd val.int+2,x                 ; copy left operand to temporary
                std fpa0+fps.sig+2
                ldd val.int,x
                std fpa0+fps.sig
                eora val.int,u                  ; set sign bit in A if signs differ
                pshs a                          ; save result sign
                ldd val.int+2,u                 ; copy right operand to temporary
                std fpa1+fps.sig+2
                ldd val.int,u
                std fpa1+fps.sig
                bpl int32_div0                  ; brif right operand is positive
                ldd zero                        ; negate right operand
                subd fpa1+fps.sig+2
                std fpa1+fps.sig+2
                ldd zero
                sbcb fpa1+fps.sig+1
                sbca fpa1+fps.sig
                std fpa1+fps.sig
int32_div0      lda fpa0+fps.sig                ; is left operand negative?
                bpl int32_div1                  ; brif not
                ldd zero                        ; negate left operand
                subd fpa0+fps.sig+2
                std fpa0+fps.sig+2
                ldd zero
                sbcb fpa0+fps.sig+1
                sbca fpa0+fps.sig
                std fpa0+fps.sig
int32_div1      ldb fpa1+fps.sig                ; check for division by zero
                orb fpa1+fps.sig+1
                orb fpa1+fps.sig+2
                orb fpa1+fps.sig+3
                lbne DIV0ERROR                  ; brif division by zero
                bsr util_div32                  ; do the actual division
                lda ,s+                         ; get desired sign
                bmi int32_div2                  ; brif want negative - we can't overflow in that case
                ldb fpa0extra                   ; get high byte of result
                lbmi OVERROR2                   ; brif we ended up with 0x80000000 positive
                bra int32_div3                  ; go return result
int32_div2      ldd zero                        ; negate result to correct sign
                subd fpa0extra+2
                std fpa0extra+2
                ldd zero
                sbcb fpa0extra+1
                sbca fpa0extra
                std fpa0extra
int32_div3      ldd fpa0extra                   ; copy result to destination
                std val.int,y
                ldd fpa0extra2
                std val.int+2,y
                rts
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Divide 32 bit integer in fpa0 significand by 32 bit integer in fpa1 significand, both treated as unsigned. Leave
; quotient at fpa0extra...fpa0extra3 and remainder at fpa0extra4...fpa0extra7; does not check for division by zero
; which will result in a quotient of 0xffffffff and a remainder will be the dividend. It will not get suck in a loop.
;
; Algorithm is basically pencil and paper long division. We check to see if the divisor "goes" at each step by doing
; a trial subtraction without saving the result. If it doesn't go, we just loop around again. If it does go, we stash
; a 1 bit in the quotient and actually do the subtraction. Then go loop around again. Doing it this way rather than
; with an actual subtraction and then undoing it with addition saves two store instructions on the comparison saves
; having to do a restore in the no-go case which is going to be quite common with values whose upper bits are
; mostly zeroes, thus it makes the operations faster in that case, for integers. (Floating point is a different
; problem.) 
util_div32      ldd fpa0+fps.sig+2              ; copy dividend to result location
                std fpa0extra6
                ldd fpa0+fps.sig
                std fpa0extra4
                ldb #32                         ; do 32 bits
                stb fpa0+fps.exp                ; save counter somewhere because we don't have enough registers
                ldd zero                        ; zero out remainder
                std fpa0extra4
                std fpa0extra6
util_div32a     lsl fpa0extra3                  ; shift dividend residue into remainder
                rol fpa0extra2
                rol fpa0extra1
                rol fpa0extra
                rol fpa0extra7
                rol fpa0extra6
                rol fpa0extra5
                rol fpa0extra4
                ldd fpa0extra6                  ; now subtract divisor from remainder
                subd fpa1+fps.sig+2
                ldd fpa0extra4
                sbcb fpa1+fps.sig+1
                sbca fpa1+fps.sig
                bcs util_div32b                 ; brif it doesn't go - need to restore
                inc fpa0extra3                  ; set quotient bit
                ldd fpa0extra6                  ; actuall do the subtraction
                subd fpa1+fps.sig+2
                std fpa0extra6
                ldd fpa0extra4
                sbcb fpa1+fps.sig+1
                sbca fpa1+fps.sig
                std fpa0extra4
util_div32b     dec fpa0+fps.exp                ; done all 32 bits?
                bne util_div32a                 ; do another
                *pragmapop list