Mercurial > hg > index.cgi
view src/fps.s @ 83:a492441bfc56
Add utility multiply and divide by 10 routines
Add a fast multiply by 10 routine for both integer and floating point (shift
left twice, add original, shift left). Also add a simple call to divide by
10 for both though there is no fast shortcut for that.
author | William Astle <lost@l-w.ca> |
---|---|
date | Sat, 07 Oct 2023 15:17:44 -0600 |
parents | 9a4e2364a966 |
children | 663d8e77b579 |
line wrap: on
line source
*pragmapush list *pragma list ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Single precision floating point arithmetic package ; ; Floating point values are stored in 6 byte packets (single precision) organized as follows: ; Offset Length Contents ; 0 1 8 bit binary exponent with a bias of 128; 0 means the number is 0 ; 1 4 32 bit significand ; 5 1 sign flag; zero for positive, 0xff for negative ; ; Binary operateions take pointers to their arguments in X and U. Y contains the result location. In all cases, it is ; safe to specify either one of the arguments as the result location. Unary operations take their operand pointer in ; X and their result pointer in Y. In all cases, there is an in place version of the unary operation that operates on ; its input and only needs the single pointer in X. ; ; On the topic of optimization: these routines copy their operands to accumulators in the direct page. This saves one ; cycle per instruction which has a nonzero offset. In addition, this means that the input operands can remain ; unmodified by the actual operations. For instance, addition requires denormalizing one operand which would otherwise ; have to be done in place. ; ; NOTE: the input pointers X and U may be clobbered. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Convert 32 bit unsigned integer at (X) to single precision floating point at (U) fps_fromuint32 clr fpa0+fps.sign ; set result sign to positive bra fps_fromint32a ; go do conversion ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Convert 32 bit signed integer at (X) to single precision floating point at value accumulator in (Y) fps_fromint32 ldb ,x ; set sign based on signed integer sex sta fpa0+fps.sign ; set result sign to the two's complement sign bpl fps_fromint32a ; brif positive - no need to mess with bits jsr int32_neg ; make the bits positive fps_fromint32a ldb valtype_float ; set output value type to floating point stb val.type,y ldd ,x ; copy integer bits to fpa0 std fpa0+fps.sig ldd 2,x std fpa0+fps.sig+2 ldb #0xa0 ; put binary point to the right of the significand stb fpa0+fps.exp clr fpa0extra ; clear extra precision jmp fps_add10 ; go normalize the result and return ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Convert 64 bit unsigned value at (X) to single precision floating point in value accumulator at (Y) ; ; Cases: ; * byte 0 is first nonzero - exponent at 64 bits right, use upper 0 to 4 ; * byte 1 is first nonzero - exponent at 56 bits right, use bytes 1 to 5 ; * byte 2 is first nonzero - exponent at 48 bits right, use bytes 2 to 6 ; * otherwise - exponent at 40 bits right, use bytes 3 to 7 fps_fromuint64 clra ; set sign to positive fps_fromuint64s sta fpa0+fps.sign ; save sign of result ldb #0xc0 ; exponent if binary point is 64 bits to the right lda ,x+ ; is the first byte zero? bne fps_fromuint64a ; brif not subb #8 ; lose a byte off exponent lda ,x+ ; is the second byte zero? bne fps_fromuint64a ; brif not subb #8 ; lose another byte off exponent lda ,x+ ; is third byte zero? bne fps_fromuint64a ; brif not subb #8 ; lose another byte lda ,x+ ; get first byte to copy fps_fromuint64a stb fpa0+fps.exp ; save exponent sta fpa0+fps.sig ; save high byte of significand ldd ,x ; copy next two bytes to significand std fpa0+fps.sig+1 ldd 2,x ; and the final byte and extra precision sta fpa0+fps.sig+3 stb fpa0extra jmp fps_add10 ; go normalize the result and return ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Fast multiply (X) by 10, in place. ; ; * first, save original value ; * then, shift left by 2 bits (add 2 to exponent) ; * then, add original value ; * then, shift left one more (add 1 to exponent) ; ; This should be faster than multiplying by 10. fps_mul10 leas -fps.size,s ; make a temporary to hold original value ldd ,x ; copy original value std ,s ldd 2,x std 2,s ldd 4,x std 4,s lda fps.exp,x ; bump original exponent by 2 (times 4) adda #2 bcc fps_mul10b ; brif it overflowed fps_mul10a jmp OVERROR ; raise overflow fps_mul10b sta fps.exp,x leay ,x leau ,s bsr fps_add ; add original value (times 5) leas fps.size,s ; clean up temporary inc fps.exp,y ; bump exponent (times 10) in result beq fps_mul10a ; brif it overflowed rts ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Unary negation - negate (X) to (Y) fps_neg ldd 2,x ; copy to output and keep exponent in A std 2,y ldd 4,x std 4,y ldd ,x std ,y tsta ; is the number zero? beq fps_neg0 ; brif so - do nothing com fps.sign,y ; flip the sign fps_neg0 rts fps_negx lda fps.exp,x ; is the number zero? beq fps_negx0 ; brif so - do nothing com fps.sign,x ; flip the sign fps_negx0 rts ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copy (X) to fpa0 and (U) to fpa1 fps_copyinputs ldd ,x ; copy (X) to fpa0 std fpa0 ldd 2,x std fpa0+2 ldd 4,x std fpa0+4 ldd ,u ; copy (U) to fpa0 std fpa1 ldd 2,u std fpa1+2 ldd 4,u std fpa1+4 rts ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Subtraction (X) - (U) to (Y) fps_sub bsr fps_copyinputs ; copy input operands com fpa1+fps.sign ; negate the subtrahend (don't need to handle zero here) bra fps_add0 ; go handle it as addition ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Addition (X) + (U) to (Y) fps_add bsr fps_copyinputs ; copy input operands fps_add0 lda fpa1+fps.exp ; is the second operand 0? bne fps_add1 ; brif not ldd fpa0 ; copy first operand to output std ,y ldd fpa0+2 std 2,y ldd fpa0+4 std 4,y rts fps_add1 lda fpa0+fps.exp ; get exponent of first operand bne fps_add2 ; brif not zero ldd fpa1 ; copy second operand to output std ,y ldd fpa1+2 std 2,y ldd fpa1+4 std 4,y rts fps_add2 clr fpa0extra ; clear extra precision bits lda fpa0+fps.exp ; get first operand exponent suba fpa1+fps.exp ; get difference in exponents beq fps_add8 ; brif we don't need to denormalize - they're the same blo fps_add3 ; brif we need to denormalize the first operand ldx #fpa1 ; point to second operand to denormalize bra fps_add4 ; go denormalize fps_add3 ldb fpa1+fps.exp ; get exponent of second operand (result exponent) stb fpa0+fps.exp ; set result exponent ldx #fpa0 ; point to first operand to denormalize nega ; get number of bits to shift as positive number fps_add4 suba #8 ; is there 8 bits left? blo fps_add5 ; brif not ldb fps.sig+3,x ; shift significand 8 bits right stb fpa0extra ; save extra precision bits ldb fps.sig+2,x stb fps.sig+3,x ldb fps.sig+1,x stb fps.sig+2,x ldb fps.sig,x stb fps.sig+1,x clr fps.sig,x bra fps_add4 ; see if we have another byte to shift fps_add5 adda #8 ; adjust for extra subtract above bra fps_add7 ; do the bit shifting fps_add6 lsr fps.sig,x ; shift one bit right ror fps.sig+1,x ror fps.sig+2,x ror fps.sig+3,x ror fpa0extra deca ; done all of the bits? fps_add7 bne fps_add6 ; brif not fps_add8 lda fpa0+fps.sign ; compare the signs of the operands eora fpa1+fps.sign ; set A if signs differ bne fps_add9 ; brif they differ - do subtraction ldd fpa0+fps.sig+2 ; add low word of significands addd fpa1+fps.sig+2 std fpa0+fps.sig+2 ldd fpa0+fps.sig ; and the high word adcb fpa1+fps.sig+1 adca fpa1+fps.sig std fpa0+fps.sig bcc fps_add9 ; brif no carry ror fpa0+fps.sig ; shift carry into significand ror fpa0+fps.sig+1 ror fpa0+fps.sig+2 ror fpa0+fps.sig+3 ror fpa0extra ; and the extra bits (for rounding) inc fpa0+fps.exp ; bump exponent to account for bit shift bne fps_add14 ; go check for round-off if not overflow OVERROR ldb #err_ov ; raise overflow jmp ERROR fps_add9 ldd fpa0+fps.sig+2 ; subtract low word subd fpa1+fps.sig+2 std fpa0+fps.sig+2 ldd fpa0+fps.sig ; and now the high word sbcb fpa1+fps.sig+1 sbca fpa1+fps.sig std fpa0+fps.sig bcc fps_add10 ; brif we didn't carry com fpa0+fps.sign ; flip result sign - other number was bigger com fpa0+fps.sig+3 ; negate two's complement result to be just the magnitude com fpa0+fps.sig+2 com fpa0+fps.sig+1 com fpa0+fps.sig ldx fpa0+fps.sig+2 ; add 1 to complete negation leax 1,x stx fpa0+fps.sig+2 bne fps_add10 ; brif carry doesn't propagate ldx fpa0+fps.sig ; propagate carry leax 1,x stx fpa0+fps.sig ; NOTE: this cannot carry because magnitude got smaller fps_add10 clra ; initialize exponent offset fps_add11 ldb fpa0+fps.sig ; do we have nonzero bits in high byte of significand? bne fps_add13 ; brif so ldb fpa0+fps.sig+1 ; shift left 8 bits stb fpa0+fps.sig ldb fpa0+fps.sig+2 stb fpa0+fps.sig+1 ldb fpa0+fps.sig+3 stb fpa0+fps.sig+2 ldb fpa0extra ; and extra precision bits stb fpa0+fps.sig+3 clr fpa0extra addb #8 ; account for number of bits shifted cmpb #40 ; done 40 bits? blo fps_add11 ; brif not - see if we have more bits to shift clr fpa0+fps.exp ; number underflowed to zero - set it so clr fpa0+fps.sign clr fpa0+fps.sig clr fpa0+fps.sig+1 clr fpa0+fps.sig+2 clr fpa0+fps.sig+3 bra fps_add16 ; go return result fps_add12 inca ; account for a bit shift lsl fpa0extra ; shift significand and extra bits left rol fpa0+fps.sig+3 rol fpa0+fps.sig+2 rol fpa0+fps.sig+1 rol fpa0+fps.sig fps_add13 bpl fps_add12 ; brif we haven't normalized yet fps_add14 ldb fpa0extra ; do we need to round? bpl fps_add16 ; brif not ldx fpa0+fps.sig+2 ; add one to significand leax 1,x stx fpa0+fps.sig+2 bne fps_add16 ; brif no carry ldx fpa0+fps.sig ; bump the upper word of significand leax 1,x stx fpa0+fps.sig bne fps_add16 fps_add15 inc fpa0+fps.exp ; bump exponent beq OVERROR ; brif it overflowed lda #0x80 ; set high bit of significand (rest is zero) sta fpa0+fps.sig fps_add16 ldd fpa0 ; copy result to destination std ,y ldd fpa0+2 std 2,y ldd fpa0+4 std 4,y rts ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Single precision multiplication (X) × (U) to (Y) fps_mul lda fps.exp,x ; is first operand zero? beq fps_mul0 ; brif so - return zero lda fps.exp,u ; is second operand zero? bne fps_mul1 ; brif not - have to do the multiply fps_mul0 ldd zero ; return zero result std ,y std 2,y std 4,y rts fps_mul1 jsr fps_copyinputs ; copy input operands lda fpa0+fps.sign ; calculate sign of result - xor of the two signs eora fpa1+fps.sign sta fpa0+fps.sign ; save result sign lda fpa0+fps.exp ; get exponent of first value adda fpa1+fps.exp ; calculate new exponent; also cancels the bias rora ; set V if C and N differ rola bvc fps_mul3 ; brif maybe an overflow adda #0x80 ; add back the bias sta fpa0+fps.sign ; save new sign beq fps_mul0 ; brif we underflowed - zero out sign ; This does a shift-and-add multiplication algorithm. This is slower than an equivalent using MUL but is smaller. ; The high order bytes will be left in fpa0 with the low order bytes in fpa0extra and fpa0extra[1-3]. The low order ; bytes are kept for the odd case where extra precision is useful. Uses fpa0extra[4-7] as temporaries. fps_mul2 ldd zero ; zero out temporary bytes std fpa0extra4 std fpa0extra6 ldb fpa0+fps.sig+3 ; multiply by low byte of fpa0 bsr fps_mul4 ldb fpa0extra ; move low bites stb fpa0extra3 ldb fpa0+fps.sig+2 ; multiply by next higher byte bsr fps_mul4 ldb fpa0extra ; move low bits stb fpa0extra2 ldb fpa0+fps.sig+1 ; and again for the next higher byte bsr fps_mul4 ldb fpa0extra stb fpa0extra1 ldb fpa0+fps.sig ; and the high order byte bsr fps_mul4 ldd fpa0extra4 ; copy high order product bits to result std fpa0+fps.sig ldd fpa0extra6 std fpa0+fps.sig+2 jmp fps_add10 ; go normalize the result and return fps_mul3 bpl fps_mul0 ; brif we underflowed - return 0 jmp OVERROR ; raise overflow fps_mul4 bne fps_mul5 ; brif not multiply by zero lda fpa0+fps.sig+3 ; shift 8 bits right sta fpa0extra lda fpa0+fps.sig+2 sta fpa0+fps.sig+3 lda fpa0+fps.sig+1 sta fpa0+fps.sig+2 lda fpa0+fps.sig sta fpa0+fps.sig+1 clr fpa0+fps.sig fps_mul8 rts fps_mul5 coma ; set C fps_mul6 lda fpa0extra4 ; get high byte of result bytes rorb ; is multiplier bit set? beq fps_mul8 ; brif 8 shifts done (C set above makes sure of that) bcc fps_mul7 ; brif bit not set - don't do addition lda fpa0extra7 ; add multiplier (fpa1) to result adda fpa1+fps.sig+3 sta fpa0extra7 lda fpa0extra6 adca fpa1+fps.sig+2 sta fpa0extra6 lda fpa0extra5 adca fpa1+fps.sig+1 sta fpa0extra5 lda fpa0extra4 adca fpa1+fps.sig fps_mul7 rora ; rotate carry in (from add or 0) sta fpa0extra4 ror fpa0extra5 ror fpa0extra6 ror fpa0extra7 ror fpa0extra ; and into the extra precision bits clra ; clear carry - so shift above will terminate bra fps_mul6 ; go do another bit ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Divide (X) by 10 in place fps_const10 fcb 0x83,0xa0,0x00,0x00,0x00,0x00 ; single precision unpacked constant 10 fps_div10 ldu #fps_const10 ; point to constant 10 leay ,x ; put output in input ; fall through to regular division ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Single precision division (X) ÷ (U) -> (Y) ; ; This is basically the same algorithm used in the Color Basic ROM fps_div lda fps.exp,u ; is divisor 0? bne fps_div0 DIV0ERROR ldb #err_div0 ; raise division by zero jmp ERROR fps_div0 lda fps.exp,x ; is dividend 0? lbeq fps_mul0 ; brif so - return 0 jsr fps_copyinputs ; copy input values lda fpa0+fps.sign ; calculate result sign - xor of operand signs eora fpa1+fps.sign sta fpa0+fps.sign ; save result sign lda fpa1+fps.exp ; get divisor exponent nega ; negate for subtraction adda fpa0+fps.exp ; subtract it from dividend exponent rora ; set V if C and N differ rola bvc fps_mul3 ; brif overflow or underflow adda #0x80 ; add back the bias sta fpa0+fps.sign ; save new sign lbeq fps_mul0 ; brif we underflowed - zero out sign inca ; bump exponent - why?? related to the bias stuff above? lbeq OVERROR ; brif it overflows sta fpa0+fps.exp ; save result exponent ldx #fpa0extra4 ; point to temporary storage bytes for quotient ldb #4 ; counter for 4 significand bytes and one extra partial byte stb fpa0extra1 ; save counter since we need both accumulators ldb #1 ; shift counter flag and quotient byte fps_div1 lda fpa1+fps.sig ; set C if fpa0 significand <= fpa1 significand cmpa fpa0+fps.sig bne fps_div2 lda fpa1+fps.sig+1 cmpa fpa0+fps.sig+1 bne fps_div2 lda fpa1+fps.sig+2 cmpa fpa0+fps.sig+2 bne fps_div2 lda fpa1+fps.sig+3 cmpa fpa0+fps.sig+3 bne fps_div2 coma ; set C if values are the same fps_div2 tfr cc,a ; save C for later, C clear if fpa1 > fpa0 rolb ; shift carry into quotient bcc fps_div3 ; brif carry clear - we haven't done 8 bits yet stb ,x+ ; save quotient byte after a full set of bits dec fpa0extra1 ; have we done all the bytes? bmi fps_div7 ; brif all bytes plus extra precision - done all beq fps_div6 ; brif all main sigificand bytes - do a couple extra bits ldb #1 ; reset the bit counter flag fps_div3 tfr cc,a ; get back original carry from compare bcs fps_div5 ; brif it "went" fps_div4 lsl fpa0+fps.sig+3 ; shift dividend left rol fpa0+fps.sig+2 rol fpa0+fps.sig+1 rol fpa0+fps.sig bcs fps_div2 ; brif it carries - next bit "goes" bmi fps_div1 ; check magnitudes of next bit bra fps_div2 ; carry clear - check another bit fps_div5 lda fpa0+fps.sig+3 ; subtract divisor from dividend bits suba fpa1+fps.sig+3 sta fpa0+fps.sig+3 lda fpa0+fps.sig+2 sbca fpa1+fps.sig+2 sta fpa0+fps.sig+2 lda fpa0+fps.sig+1 sbca fpa1+fps.sig+1 sta fpa0+fps.sig+1 lda fpa0+fps.sig sbca fpa1+fps.sig sta fpa0+fps.sig bra fps_div4 ; now do the bit shift to line things up fps_div6 ldb #0x40 ; only do two bits of extra precision byte bra fps_div2 ; go handle these bitsd fps_div7 rorb ; get extra quotient bits to bit 7,6 and bit 5 set rorb rorb stb fpa0extra ; save extra precision bits ldd fpa0extra4 ; copy result bits to fpa0 std fpa0+fps.sig ldd fpa0extra6 std fpa0+fps.sig+2 jmp fps_add10 ; go normalize the result ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Pack single precision number at (X) to (U) fps_pack lda fps.sign,x ; get sign of number (will be 0 for 0) ora #0x7f ; make sure low bits are set for merging anda fps.sig,x ; merge with high bits of significand ldb fps.sig+1,x ; get upper mid bits of significand std fps.sig,u ldd fps.sig+2,x std fps.sig+2,u lda fps.exp,x sta fps.exp,u rts ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Unpack single precision number at (X) to (U) fps_unpack lda fps.exp,x ; get exponent of value beq fps_unpack0 ; brif value is zero sta fps.exp,u ldb fps.sig,x ; get high byte of significand sex ; make sign value in A sta fps.sign,u ; set sign in result ldd fps.sig,x ; get high word of sifnificand ora #0x80 ; make sure high bit is set std fps.sig,u ; save high word in result ldd fps.sig+2,x ; copy middle bytes of significand std fps.sig+2,u rts fps_unpack0 sta fps.exp,u ; zero out destination sta fps.sig,u sta fps.sig+1,u sta fps.sig+2,u sta fps.sig+3,u sta fps.sign,u rts *pragmapop list