comparison src/int.s @ 80:bb50ac9fdf37

Checkpoint with very basic integer and floating point arithmetic, untested. This commit has implementations for floating point add, subtract, multiply, and divide, along with 32 bit signed integer equivalents. These can probably be optimized and they are untested.
author William Astle <lost@l-w.ca>
date Sat, 07 Oct 2023 02:56:59 -0600
parents
children fbc14509955a
comparison
equal deleted inserted replaced
79:df86e6d64ce2 80:bb50ac9fdf37
1 *pragmapush list
2 *pragma list
3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4 ; 32 bit integer handling package.
5 ;
; Negate the 32 bit integer at (X) in place. The value is subtracted from zero, low word first, so that the
; borrow from the low word can be carried into the high word.
;
; Entry: X points to the value; the integer occupies val.int,x through val.int+3,x (most significant byte first)
; Exit:  the negated value has replaced the original; D holds its high word
;
; No overflow is raised by this routine: 0x80000000 negates to itself.
int32_neg       ldd zero                        ; low word: 0 - low word, C is set if a borrow is required
                subd val.int+2,x
                std val.int+2,x                 ; neither STD nor the following LDD disturbs C
                ldd zero                        ; high word: 0 - high word - borrow, one byte at a time
                sbcb val.int+1,x
                sbca val.int,x
                std val.int,x
                rts
15 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit integer addition (X) + (U) -> (Y)
;
; Entry: X points to the left operand, U to the right operand, Y to the destination
; Exit:  the sum is stored at val.int,y through val.int+3,y
; Error: jumps to OVERROR if the sum does not fit in a signed 32 bit integer. In that case the low word of the
;        destination has already been written but the high word has not.
;
; The overflow test has to come before the final store because STD always clears V; testing after the store
; would never detect an overflow.
int32_add       ldd val.int+2,x                 ; do low word; C holds the carry into the high word
                addd val.int+2,u
                std val.int+2,y                 ; STD/LDD leave C alone so the carry survives
                ldd val.int,x                   ; and the high word, one byte at a time with carry
                adcb val.int+1,u
                adca val.int,u                  ; V now reflects signed overflow of the whole 32 bit sum
                bvs OVERROR2                    ; raise overflow if needed (must test before STD clears V)
                std val.int,y
                rts
OVERROR2        jmp OVERROR                     ; short branch target shared by the integer routines
26 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit integer subtraction (X) - (U) -> (Y)
;
; Entry: X points to the left operand, U to the right operand, Y to the destination
; Exit:  the difference is stored at val.int,y through val.int+3,y
; Error: branches to OVERROR2 if the difference does not fit in a signed 32 bit integer. In that case the low
;        word of the destination has already been written but the high word has not.
;
; The overflow test has to come before the final store because STD always clears V; testing after the store
; would never detect an overflow.
int32_sub       ldd val.int+2,x                 ; do low word; C holds the borrow from the high word
                subd val.int+2,u
                std val.int+2,y                 ; STD/LDD leave C alone so the borrow survives
                ldd val.int,x                   ; and the high word, one byte at a time with borrow
                sbcb val.int+1,u
                sbca val.int,u                  ; V now reflects signed overflow of the whole 32 bit difference
                bvs OVERROR2                    ; raise overflow if needed (must test before STD clears V)
                std val.int,y
int32_add0      rts
37 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Signed 32 bit integer multiply (X) * (U) -> (Y), overflow if exceeds signed 32 bit range
;
; Both operands are copied to the fpa0 and fpa1 significands and made positive. util_mul32 then forms the
; unsigned 64 bit product in fpa0extra...fpa0extra7 and the sign is reapplied to the low 32 bits afterward. The
; desired result sign (bit 7 set means negative) is kept on the stack across the multiplication and must be
; removed again on every path that returns.
;
; Entry: X points to the left operand, U to the right operand, Y to the destination
; Exit:  the product is stored at val.int,y through val.int+3,y
; Error: branches to OVERROR2 if the product does not fit in a signed 32 bit integer
;
; NOTE(review): the branches to OVERROR2 are taken with the saved sign byte still on the stack; this presumably
; relies on the error handler resetting the stack pointer - confirm.
int32_mul       ldd val.int+2,x                 ; copy left operand to temporary
                std fpa0+fps.sig+2
                ldd val.int,x
                std fpa0+fps.sig
                eora val.int,u                  ; set sign bit in A if signs differ
                pshs a                          ; save result sign
                ldd val.int+2,u                 ; copy right operand to temporary
                std fpa1+fps.sig+2
                ldd val.int,u
                std fpa1+fps.sig                ; STD sets N from the high byte just stored
                bpl int32_mul0                  ; brif right operand is positive
                ldd zero                        ; negate right operand
                subd fpa1+fps.sig+2
                std fpa1+fps.sig+2
                ldd zero
                sbcb fpa1+fps.sig+1
                sbca fpa1+fps.sig
                std fpa1+fps.sig
int32_mul0      lda fpa0+fps.sig                ; is left operand negative?
                bpl int32_mul1                  ; brif not
                ldd zero                        ; negate left operand
                subd fpa0+fps.sig+2
                std fpa0+fps.sig+2
                ldd zero
                sbcb fpa0+fps.sig+1
                sbca fpa0+fps.sig
                std fpa0+fps.sig
int32_mul1      bsr util_mul32                  ; do the actual multiplication
                ldb fpa0extra                   ; are upper bits all zero?
                orb fpa0extra1
                orb fpa0extra2
                orb fpa0extra3
                bne OVERROR2                    ; brif not - overflowed
                ldb fpa0extra4                  ; is bit 31 set?
                bpl int32_mul3                  ; brif not - magnitude fits; go apply the sign
                lda ,s                          ; negative result wanted?
                bpl OVERROR2                    ; brif not - we overflowed
                andb #0x7f                      ; lose extra sign bit
                orb fpa0extra5                  ; "or" in other bytes to see if all but bit 31 are zero
                orb fpa0extra6
                orb fpa0extra7
                bne OVERROR2                    ; brif any nonzero bits - we overflowed maximum negative number
int32_mul3      ldb ,s+                         ; remove saved sign from stack; do we want a negative result?
                bpl int32_mul2                  ; brif not
                ldd zero                        ; negate result
                subd fpa0extra6
                std fpa0extra6
                ldd zero
                sbcb fpa0extra5
                sbca fpa0extra4
                std fpa0extra4
int32_mul2      ldd fpa0extra4                  ; copy result to destination
                std val.int,y
                ldd fpa0extra6
                std val.int+2,y
                rts
95 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit multiply.
;
; Significands of fpa0 and fpa1, treated as unsigned, are multiplied with the product being stored in the fpa0extra
; memory locations (fpa0extra is the most significant byte, fpa0extra7 the least significant).
;
; The algorithm is simply this: multiply fpa0 by each byte of fpa1 in turn, starting with the low byte, and add
; each five byte partial product into the result at the matching byte offset. This yields a 64 bit product which
; is somewhat wasteful.
;
; Entry: operands in fpa0+fps.sig...fpa0+fps.sig+3 and fpa1+fps.sig...fpa1+fps.sig+3; neither is modified
; Exit:  64 bit product in fpa0extra...fpa0extra7
; Uses:  D, and fpa0extra8 onward as scratch for each partial product (five bytes are written, which assumes the
;        fpa0extraN symbols are consecutive bytes and that the area is at least 13 bytes long)
util_mul32      ldd zero                        ;* zero out result bits; low 16 bits don't need to be cleared and
                stb fpa0extra3                  ;* upper 24 bits also don't
                std fpa0extra4
                ldb fpa1+fps.sig+3              ; multiply by low byte of fpa1 - no carries possible for this iteration
                lda fpa0+fps.sig+3
                mul
                std fpa0extra6
                ldb fpa1+fps.sig+3
                lda fpa0+fps.sig+2
                mul
                addd fpa0extra5                 ; add in the high byte of the previous byte product
                std fpa0extra5
                ldb fpa1+fps.sig+3
                lda fpa0+fps.sig+1
                mul
                addd fpa0extra4
                std fpa0extra4
                ldb fpa1+fps.sig+3
                lda fpa0+fps.sig
                mul
                addd fpa0extra3
                std fpa0extra3                  ; first partial product now occupies fpa0extra3...fpa0extra7
; Now we potentially have cascading carries at every stage; it makes more sense to handle those in a separate
; addition pass after each partial calculation. The partial calculations are identical to above except that they
; are built in the scratch bytes instead of directly in the result. This is completely unrolled for speed.
                ldd zero                        ; zero out extra work bytes
                std fpa0extra8
                stb fpa0extra10
                ldb fpa1+fps.sig+2              ; multiply by second low byte of fpa1
                lda fpa0+fps.sig+3
                mul
                std fpa0extra11
                ldb fpa1+fps.sig+2
                lda fpa0+fps.sig+2
                mul
                addd fpa0extra10
                std fpa0extra10
                ldb fpa1+fps.sig+2
                lda fpa0+fps.sig+1
                mul
                addd fpa0extra9
                std fpa0extra9
                ldb fpa1+fps.sig+2
                lda fpa0+fps.sig
                mul
                addd fpa0extra8
                std fpa0extra8
                ldd fpa0extra11                 ; add to partial product (shifted left 8 bits)
                addd fpa0extra5
                std fpa0extra5                  ; LDD/STD leave C alone so the carry chain is unbroken
                ldd fpa0extra9
                adcb fpa0extra4
                adca fpa0extra3
                std fpa0extra3
                ldb #0                          ; top byte: nothing in the result yet, so it is just the
                adcb fpa0extra8                 ; partial product's top byte plus the carry
                stb fpa0extra2
                ldd zero                        ; and do it all again for next byte of fpa1
                std fpa0extra8
                stb fpa0extra10
                ldb fpa1+fps.sig+1
                lda fpa0+fps.sig+3
                mul
                std fpa0extra11
                ldb fpa1+fps.sig+1
                lda fpa0+fps.sig+2
                mul
                addd fpa0extra10
                std fpa0extra10
                ldb fpa1+fps.sig+1
                lda fpa0+fps.sig+1
                mul
                addd fpa0extra9
                std fpa0extra9
                ldb fpa1+fps.sig+1
                lda fpa0+fps.sig
                mul
                addd fpa0extra8
                std fpa0extra8
                ldd fpa0extra11                 ; add to partial product (shifted left 16 bits)
                addd fpa0extra4
                std fpa0extra4
                ldd fpa0extra9
                adcb fpa0extra3
                adca fpa0extra2
                std fpa0extra2
                ldb #0
                adcb fpa0extra8
                stb fpa0extra1
                ldd zero                        ; and the final sequence with the fpa1 high byte
                std fpa0extra8
                stb fpa0extra10
                ldb fpa1+fps.sig
                lda fpa0+fps.sig+3
                mul
                std fpa0extra11
                ldb fpa1+fps.sig
                lda fpa0+fps.sig+2
                mul
                addd fpa0extra10
                std fpa0extra10
                ldb fpa1+fps.sig
                lda fpa0+fps.sig+1
                mul
                addd fpa0extra9
                std fpa0extra9
                ldb fpa1+fps.sig
                lda fpa0+fps.sig
                mul
                addd fpa0extra8
                std fpa0extra8
                ldd fpa0extra11                 ; add to partial product (shifted left 24 bits)
                addd fpa0extra3
                std fpa0extra3
                ldd fpa0extra9
                adcb fpa0extra2
                adca fpa0extra1
                std fpa0extra1
                ldb #0                          ; top byte of the product comes from the partial product's top
                adcb fpa0extra8                 ; byte plus carry; fpa0extra itself has not been written before this
                stb fpa0extra
                rts
225 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32 bit division, integer only, truncate fraction without rounding. Note that there is exactly one case where integer
; division can overflow: dividing -0x80000000 by -1 which yields 0x80000000. All other cases reduce the magnitude.
;
; Computes (X) / (U) -> (Y). Both operands are copied to the fpa0 and fpa1 significands and made positive,
; util_div32 does an unsigned division, and the sign is reapplied to the quotient afterward. The remainder left
; by util_div32 is not used.
;
; Entry: X points to the dividend, U to the divisor, Y to the destination
; Exit:  the quotient is stored at val.int,y through val.int+3,y
; Error: branches to DIV0ERROR if the divisor is zero and to OVERROR2 if the quotient would be +0x80000000
;
; NOTE(review): the branch to DIV0ERROR is taken with the saved sign byte still on the stack; this presumably
; relies on the error handler resetting the stack pointer - confirm.
int32_div       ldd val.int+2,x                 ; copy left operand to temporary
                std fpa0+fps.sig+2
                ldd val.int,x
                std fpa0+fps.sig
                eora val.int,u                  ; set sign bit in A if signs differ
                pshs a                          ; save result sign
                ldd val.int+2,u                 ; copy right operand to temporary
                std fpa1+fps.sig+2
                ldd val.int,u
                std fpa1+fps.sig                ; STD sets N from the high byte just stored
                bpl int32_div0                  ; brif right operand is positive
                ldd zero                        ; negate right operand
                subd fpa1+fps.sig+2
                std fpa1+fps.sig+2
                ldd zero
                sbcb fpa1+fps.sig+1
                sbca fpa1+fps.sig
                std fpa1+fps.sig
int32_div0      lda fpa0+fps.sig                ; is left operand negative?
                bpl int32_div1                  ; brif not
                ldd zero                        ; negate left operand
                subd fpa0+fps.sig+2
                std fpa0+fps.sig+2
                ldd zero
                sbcb fpa0+fps.sig+1
                sbca fpa0+fps.sig
                std fpa0+fps.sig
int32_div1      ldb fpa1+fps.sig                ; check for division by zero
                orb fpa1+fps.sig+1
                orb fpa1+fps.sig+2
                orb fpa1+fps.sig+3              ; Z is set only if all four divisor bytes are zero
                lbeq DIV0ERROR                  ; brif division by zero
                bsr util_div32                  ; do the actual division
                lda ,s+                         ; get desired sign
                bmi int32_div2                  ; brif want negative - we can't overflow in that case
                ldb fpa0extra                   ; get high byte of result
                lbmi OVERROR2                   ; brif we ended up with 0x80000000 positive
                bra int32_div3                  ; go return result
int32_div2      ldd zero                        ; negate result to correct sign
                subd fpa0extra+2
                std fpa0extra+2
                ldd zero
                sbcb fpa0extra+1
                sbca fpa0extra
                std fpa0extra
int32_div3      ldd fpa0extra                   ; copy result to destination
                std val.int,y
                ldd fpa0extra2
                std val.int+2,y
                rts
278 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Divide 32 bit integer in fpa0 significand by 32 bit integer in fpa1 significand, both treated as unsigned. Leave
; quotient at fpa0extra...fpa0extra3 and remainder at fpa0extra4...fpa0extra7; does not check for division by zero
; which will result in a quotient of 0xffffffff and a remainder equal to the dividend. It will not get stuck in a
; loop.
;
; Algorithm is basically pencil and paper long division. The dividend starts out in the quotient location and is
; shifted out of the top, one bit at a time, into the remainder while quotient bits are shifted in at the bottom.
; We check to see if the divisor "goes" at each step by doing a trial subtraction without saving the result. If it
; doesn't go, we just loop around again. If it does go, we stash a 1 bit in the quotient and actually do the
; subtraction. Then go loop around again. Doing it this way, rather than with an actual subtraction which is then
; undone with an addition, saves two store instructions on the comparison and saves having to do a restore in the
; no-go case, which is going to be quite common with values whose upper bits are mostly zeroes. Thus it makes the
; operation faster in that case, for integers. (Floating point is a different problem.)
;
; Entry: dividend in fpa0+fps.sig...fpa0+fps.sig+3, divisor in fpa1+fps.sig...fpa1+fps.sig+3
; Exit:  quotient in fpa0extra...fpa0extra3, remainder in fpa0extra4...fpa0extra7
; Uses:  D; fpa0+fps.exp is overwritten with the loop counter (zero on exit)
;
; The bit rotated out of the top of the remainder is discarded, so the result is only correct while the shifted
; remainder fits in 32 bits; that is always true for divisors up to 0x80000000.
util_div32      ldd fpa0+fps.sig+2              ; copy dividend to the quotient location; it is consumed from
                std fpa0extra2                  ; the top as the quotient bits are shifted in at the bottom
                ldd fpa0+fps.sig
                std fpa0extra
                ldb #32                         ; do 32 bits
                stb fpa0+fps.exp                ; save counter somewhere because we don't have enough registers
                ldd zero                        ; zero out remainder
                std fpa0extra4
                std fpa0extra6
util_div32a     lsl fpa0extra3                  ; shift dividend residue into remainder
                rol fpa0extra2
                rol fpa0extra1
                rol fpa0extra
                rol fpa0extra7
                rol fpa0extra6
                rol fpa0extra5
                rol fpa0extra4
                ldd fpa0extra6                  ; trial: subtract divisor from remainder, keeping only the borrow
                subd fpa1+fps.sig+2
                ldd fpa0extra4                  ; LDD leaves C alone so the borrow carries into the high word
                sbcb fpa1+fps.sig+1
                sbca fpa1+fps.sig
                bcs util_div32b                 ; brif it doesn't go - remainder was not modified
                inc fpa0extra3                  ; set quotient bit (bit 0 is known clear after the shift)
                ldd fpa0extra6                  ; actually do the subtraction
                subd fpa1+fps.sig+2
                std fpa0extra6
                ldd fpa0extra4
                sbcb fpa1+fps.sig+1
                sbca fpa1+fps.sig
                std fpa0extra4
util_div32b     dec fpa0+fps.exp                ; done all 32 bits?
                bne util_div32a                 ; do another
                rts
323 *pragmapop list