Mercurial > hg > index.cgi
comparison src/int.s @ 80:bb50ac9fdf37
Checkpoint with very basic integer and floating point arithmetic, untested
This commit has implementations for floating point add, subtract, multiply,
and divide, along with 32 bit signed integer equivalents. These can probably
be optimized and they are untested.
author | William Astle <lost@l-w.ca> |
---|---|
date | Sat, 07 Oct 2023 02:56:59 -0600 |
parents | |
children | fbc14509955a |
comparison
equal
deleted
inserted
replaced
79:df86e6d64ce2 | 80:bb50ac9fdf37 |
---|---|
1 *pragmapush list | |
2 *pragma list | |
3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
4 ; 32 bit integer handling package. | |
5 ; | |
6 ; Negate a 32 bit integer in (X); done by subtracting it from zero. Works in place on val.int of the value X points to and uses D. No overflow check is made, so 0x80000000 negates to itself. | |
7 int32_neg ldd zero ; subtract low word (zero is presumably a 16 bit zero constant - confirm) | |
8 subd val.int+2,x ; D = 0 - low word; leaves the borrow in carry | |
9 std val.int+2,x ; store low word of result | |
10 ldd zero ; and now the high word; the borrow from the low word is still in carry | |
11 sbcb val.int+1,x ; propagate the borrow through byte 1 | |
12 sbca val.int,x ; and through the most significant byte | |
13 std val.int,x ; store high word of result | |
14 rts | |
15 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
16 ; 32 bit integer addition (X) + (U) -> (Y); uses D | |
17 ; Signed overflow jumps to OVERROR; in that case only the low word of (Y) has been written. | |
18 int32_add ldd val.int+2,x ; do low word | |
19 addd val.int+2,u | |
20 std val.int+2,y ; store low word; carry is left intact for the high word | |
21 ldd val.int,x ; and the high word | |
22 adcb val.int+1,u | |
23 adca val.int,u ; V now reflects signed overflow of the whole 32 bit sum | |
24 bvc int32_add0 ; no overflow - go store high word; V must be tested before std, which always clears it | |
25 OVERROR2 jmp OVERROR ; raise overflow; also the short branch target used by the other integer routines | |
26 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
27 ; 32 bit integer subtraction (X) - (U) -> (Y); uses D; signed overflow jumps to OVERROR with only the low word of (Y) written | |
28 int32_sub ldd val.int+2,x ; do low word | |
29 subd val.int+2,u | |
30 std val.int+2,y ; store low word; borrow is left intact for the high word | |
31 ldd val.int,x ; and the high word | |
32 sbcb val.int+1,u | |
33 sbca val.int,u ; V now reflects signed overflow of the whole 32 bit difference | |
34 bvs OVERROR2 ; raise overflow if needed; V must be tested before std, which always clears it | |
35 int32_add0 std val.int,y ; store high word of result (shared tail of int32_add and int32_sub) | |
36 rts | |
37 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
38 ; Signed 32 bit integer multiply (X) * (U) -> (Y), overflow if exceeds signed 32 bit range. Both operands are copied to fpa0/fpa1 and made positive, multiplied unsigned by util_mul32, and the sign is then reapplied. Uses D. | |
39 int32_mul ldd val.int+2,x ; copy left operand to temporary | |
40 std fpa0+fps.sig+2 | |
41 ldd val.int,x | |
42 std fpa0+fps.sig | |
43 eora val.int,u ; set sign bit in A if signs differ | |
44 pshs a ; save result sign | |
45 ldd val.int+2,u ; copy right operand to temporary | |
46 std fpa1+fps.sig+2 | |
47 ldd val.int,u | |
48 std fpa1+fps.sig ; the store sets N from the operand's sign bit | |
49 bpl int32_mul0 ; brif right operand is positive | |
50 ldd zero ; negate right operand | |
51 subd fpa1+fps.sig+2 | |
52 std fpa1+fps.sig+2 | |
53 ldd zero | |
54 sbcb fpa1+fps.sig+1 | |
55 sbca fpa1+fps.sig | |
56 std fpa1+fps.sig | |
57 int32_mul0 lda fpa0+fps.sig ; is left operand negative? | |
58 bpl int32_mul1 ; brif not | |
59 ldd zero ; negate left operand | |
60 subd fpa0+fps.sig+2 | |
61 std fpa0+fps.sig+2 | |
62 ldd zero | |
63 sbcb fpa0+fps.sig+1 | |
64 sbca fpa0+fps.sig | |
65 std fpa0+fps.sig | |
66 int32_mul1 bsr util_mul32 ; do the actual multiplication; 64 bit product is at fpa0extra...fpa0extra7 | |
67 ldb fpa0extra ; are upper bits all zero? | |
68 orb fpa0extra1 | |
69 orb fpa0extra2 | |
70 orb fpa0extra3 | |
71 bne OVERROR2 ; brif not - overflowed (NOTE: saved sign byte is still stacked; presumably OVERROR resets the stack - confirm) | |
72 ldb fpa0extra4 ; is bit 31 set? | |
73 bpl int32_mul3 ; brif not - magnitude fits, but the saved sign must still be popped and applied | |
74 lda ,s ; negative result wanted? | |
75 bpl OVERROR2 ; brif not - we overflowed | |
76 andb #0x7f ; lose extra sign bit | |
77 orb fpa0extra5 ; "or" in other bytes to see if all but bit 31 are zero | |
78 orb fpa0extra6 | |
79 orb fpa0extra7 | |
80 bne OVERROR2 ; brif any nonzero bits - we overflowed maximum negative number | |
81 int32_mul3 ldb ,s+ ; pop the saved sign - do we want a negative result? | |
82 bpl int32_mul2 ; brif not | |
83 ldd zero ; negate result | |
84 subd fpa0extra6 | |
85 std fpa0extra6 | |
86 ldd zero | |
87 sbcb fpa0extra5 | |
88 sbca fpa0extra4 | |
89 std fpa0extra4 | |
90 int32_mul2 ldd fpa0extra4 ; copy result to destination | |
91 std val.int,y | |
92 ldd fpa0extra6 | |
93 std val.int+2,y | |
94 rts | |
95 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
96 ; 32 bit multiply. | |
97 ; | |
98 ; Significands of fpa0 and fpa1, treated as unsigned, are multiplied with the product being stored in the fpa0extra | |
99 ; memory locations: fpa0extra (most significant byte) through fpa0extra7. fpa0extra8 through fpa0extra12 and D are scratch. | |
100 ; | |
101 ; The algorithm is simply this: zero out the result, then multiply fpa0 by each byte of fpa1 and then add the result | |
102 ; to the result location. This yields a 64 bit product which is somewhat wasteful. | |
103 util_mul32 ldd zero ;* zero out result bits; low 16 bits don't need to be cleared and | |
104 stb fpa0extra3 ;* upper 24 bits also don't | |
105 std fpa0extra4 | |
106 ldb fpa1+fps.sig+3 ; multiply by low byte of fpa1 - no carries possible for this iteration | |
107 lda fpa0+fps.sig+3 | |
108 mul | |
109 std fpa0extra6 ; lowest 16 bits of the product | |
110 ldb fpa1+fps.sig+3 | |
111 lda fpa0+fps.sig+2 | |
112 mul | |
113 addd fpa0extra5 ; each step adds in the high byte of the previous byte product | |
114 std fpa0extra5 | |
115 ldb fpa1+fps.sig+3 | |
116 lda fpa0+fps.sig+1 | |
117 mul | |
118 addd fpa0extra4 | |
119 std fpa0extra4 | |
120 ldb fpa1+fps.sig+3 | |
121 lda fpa0+fps.sig | |
122 mul | |
123 addd fpa0extra3 | |
124 std fpa0extra3 ; fpa0extra3...fpa0extra7 now hold the 40 bit product fpa0 * (low byte of fpa1) | |
125 ; Now we potentially have cascading carries at every stage; it makes more sense to handle those in a separate | |
126 ; addition pass after each partial calculation. The partial calculations are identical to above. This is completely | |
127 ; unrolled for speed. | |
128 ldd zero ; zero out extra work bytes | |
129 std fpa0extra8 | |
130 stb fpa0extra10 | |
131 ldb fpa1+fps.sig+2 ; multiply by second low byte of fpa1 | |
132 lda fpa0+fps.sig+3 | |
133 mul | |
134 std fpa0extra11 | |
135 ldb fpa1+fps.sig+2 | |
136 lda fpa0+fps.sig+2 | |
137 mul | |
138 addd fpa0extra10 | |
139 std fpa0extra10 | |
140 ldb fpa1+fps.sig+2 | |
141 lda fpa0+fps.sig+1 | |
142 mul | |
143 addd fpa0extra9 | |
144 std fpa0extra9 | |
145 ldb fpa1+fps.sig+2 | |
146 lda fpa0+fps.sig | |
147 mul | |
148 addd fpa0extra8 | |
149 std fpa0extra8 ; fpa0extra8...fpa0extra12 now hold this 40 bit partial product | |
150 ldd fpa0extra11 ; add to partial product (shifted left 8 bits) | |
151 addd fpa0extra5 | |
152 std fpa0extra5 | |
153 ldd fpa0extra9 ; loads and stores leave the carry alone | |
154 adcb fpa0extra4 | |
155 adca fpa0extra3 | |
156 std fpa0extra3 | |
157 ldb #0 | |
158 adcb fpa0extra8 ; new top byte is the partial product's top byte plus carry | |
159 stb fpa0extra2 | |
160 ldd zero ; and do it all again for next byte of fpa1 | |
161 std fpa0extra8 | |
162 stb fpa0extra10 | |
163 ldb fpa1+fps.sig+1 | |
164 lda fpa0+fps.sig+3 | |
165 mul | |
166 std fpa0extra11 | |
167 ldb fpa1+fps.sig+1 | |
168 lda fpa0+fps.sig+2 | |
169 mul | |
170 addd fpa0extra10 | |
171 std fpa0extra10 | |
172 ldb fpa1+fps.sig+1 | |
173 lda fpa0+fps.sig+1 | |
174 mul | |
175 addd fpa0extra9 | |
176 std fpa0extra9 | |
177 ldb fpa1+fps.sig+1 | |
178 lda fpa0+fps.sig | |
179 mul | |
180 addd fpa0extra8 | |
181 std fpa0extra8 | |
182 ldd fpa0extra11 ; add to partial product (shifted left 16 bits) | |
183 addd fpa0extra4 | |
184 std fpa0extra4 | |
185 ldd fpa0extra9 | |
186 adcb fpa0extra3 | |
187 adca fpa0extra2 | |
188 std fpa0extra2 | |
189 ldb #0 | |
190 adcb fpa0extra8 | |
191 stb fpa0extra1 | |
192 ldd zero ; and the final sequence with the fpa1 high byte | |
193 std fpa0extra8 | |
194 stb fpa0extra10 | |
195 ldb fpa1+fps.sig | |
196 lda fpa0+fps.sig+3 | |
197 mul | |
198 std fpa0extra11 | |
199 ldb fpa1+fps.sig | |
200 lda fpa0+fps.sig+2 | |
201 mul | |
202 addd fpa0extra10 | |
203 std fpa0extra10 | |
204 ldb fpa1+fps.sig | |
205 lda fpa0+fps.sig+1 | |
206 mul | |
207 addd fpa0extra9 | |
208 std fpa0extra9 | |
209 ldb fpa1+fps.sig | |
210 lda fpa0+fps.sig | |
211 mul | |
212 addd fpa0extra8 | |
213 std fpa0extra8 | |
214 ldd fpa0extra11 ; add to partial product (shifted left 24 bits) | |
215 addd fpa0extra3 | |
216 std fpa0extra3 | |
217 ldd fpa0extra9 | |
218 adcb fpa0extra2 | |
219 adca fpa0extra1 | |
220 std fpa0extra1 | |
221 ldb #0 | |
222 adcb fpa0extra8 ; top byte comes from the partial product plus carry; fpa0extra itself has not been written yet | |
223 stb fpa0extra | |
224 rts | |
225 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
226 ; 32 bit division (X) / (U) -> (Y), integer only, truncate fraction without rounding. Note that there is exactly one case where integer | |
227 ; division can overflow: dividing -0x80000000 by -1 which yields 0x80000000. All other cases reduce the magnitude. Jumps to DIV0ERROR if (U) is zero. Uses D. | |
228 int32_div ldd val.int+2,x ; copy left operand to temporary | |
229 std fpa0+fps.sig+2 | |
230 ldd val.int,x | |
231 std fpa0+fps.sig | |
232 eora val.int,u ; set sign bit in A if signs differ | |
233 pshs a ; save result sign | |
234 ldd val.int+2,u ; copy right operand to temporary | |
235 std fpa1+fps.sig+2 | |
236 ldd val.int,u | |
237 std fpa1+fps.sig ; the store sets N from the operand's sign bit | |
238 bpl int32_div0 ; brif right operand is positive | |
239 ldd zero ; negate right operand | |
240 subd fpa1+fps.sig+2 | |
241 std fpa1+fps.sig+2 | |
242 ldd zero | |
243 sbcb fpa1+fps.sig+1 | |
244 sbca fpa1+fps.sig | |
245 std fpa1+fps.sig | |
246 int32_div0 lda fpa0+fps.sig ; is left operand negative? | |
247 bpl int32_div1 ; brif not | |
248 ldd zero ; negate left operand | |
249 subd fpa0+fps.sig+2 | |
250 std fpa0+fps.sig+2 | |
251 ldd zero | |
252 sbcb fpa0+fps.sig+1 | |
253 sbca fpa0+fps.sig | |
254 std fpa0+fps.sig | |
255 int32_div1 ldb fpa1+fps.sig ; check for division by zero | |
256 orb fpa1+fps.sig+1 | |
257 orb fpa1+fps.sig+2 | |
258 orb fpa1+fps.sig+3 | |
259 lbeq DIV0ERROR ; brif division by zero - all four divisor bytes are zero (NOTE: saved sign byte is still stacked; presumably DIV0ERROR resets the stack - confirm) | |
260 bsr util_div32 ; do the actual division; quotient magnitude is at fpa0extra...fpa0extra3 | |
261 lda ,s+ ; get desired sign | |
262 bmi int32_div2 ; brif want negative - we can't overflow in that case | |
263 ldb fpa0extra ; get high byte of result | |
264 lbmi OVERROR2 ; brif we ended up with 0x80000000 positive | |
265 bra int32_div3 ; go return result | |
266 int32_div2 ldd zero ; negate result to correct sign | |
267 subd fpa0extra+2 | |
268 std fpa0extra+2 | |
269 ldd zero | |
270 sbcb fpa0extra+1 | |
271 sbca fpa0extra | |
272 std fpa0extra | |
273 int32_div3 ldd fpa0extra ; copy result to destination | |
274 std val.int,y | |
275 ldd fpa0extra2 | |
276 std val.int+2,y | |
277 rts | |
278 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
279 ; Divide 32 bit integer in fpa0 significand by 32 bit integer in fpa1 significand, both treated as unsigned. Leave | |
280 ; quotient at fpa0extra...fpa0extra3 and remainder at fpa0extra4...fpa0extra7; does not check for division by zero | |
281 ; which will result in a quotient of 0xffffffff and a remainder equal to the dividend. It will not get stuck in a loop. | |
282 ; Uses D and fpa0+fps.exp (as the bit counter). NOTE: the bit shifted out of the top of the remainder is dropped, so the result is only guaranteed for divisors up to 0x80000000 (which covers int32_div). | |
283 ; Algorithm is basically pencil and paper long division. We check to see if the divisor "goes" at each step by doing | |
284 ; a trial subtraction without saving the result. If it doesn't go, we just loop around again. If it does go, we stash | |
285 ; a 1 bit in the quotient and actually do the subtraction. Then go loop around again. Doing it this way rather than | |
286 ; with an actual subtraction and then undoing it with addition saves two store instructions on the comparison and saves | |
287 ; having to do a restore in the no-go case which is going to be quite common with values whose upper bits are | |
288 ; mostly zeroes, thus it makes the operations faster in that case, for integers. (Floating point is a different | |
289 ; problem.) | |
290 util_div32 ldd fpa0+fps.sig+2 ; copy dividend to result location | |
291 std fpa0extra2 ; low word of dividend; quotient bits replace it as it is shifted out | |
292 ldd fpa0+fps.sig | |
293 std fpa0extra ; high word of dividend | |
294 ldb #32 ; do 32 bits | |
295 stb fpa0+fps.exp ; save counter somewhere because we don't have enough registers | |
296 ldd zero ; zero out remainder | |
297 std fpa0extra4 | |
298 std fpa0extra6 | |
299 util_div32a lsl fpa0extra3 ; shift dividend residue into remainder | |
300 rol fpa0extra2 | |
301 rol fpa0extra1 | |
302 rol fpa0extra | |
303 rol fpa0extra7 | |
304 rol fpa0extra6 | |
305 rol fpa0extra5 | |
306 rol fpa0extra4 | |
307 ldd fpa0extra6 ; now subtract divisor from remainder (trial only - result is not stored) | |
308 subd fpa1+fps.sig+2 | |
309 ldd fpa0extra4 | |
310 sbcb fpa1+fps.sig+1 | |
311 sbca fpa1+fps.sig | |
312 bcs util_div32b ; brif it doesn't go - borrow means remainder < divisor; nothing was stored so nothing to restore | |
313 inc fpa0extra3 ; set quotient bit (bit 0 was cleared by the shift) | |
314 ldd fpa0extra6 ; actually do the subtraction | |
315 subd fpa1+fps.sig+2 | |
316 std fpa0extra6 | |
317 ldd fpa0extra4 | |
318 sbcb fpa1+fps.sig+1 | |
319 sbca fpa1+fps.sig | |
320 std fpa0extra4 | |
321 util_div32b dec fpa0+fps.exp ; done all 32 bits? | |
322 bne util_div32a ; do another | |
323 rts ; return to caller instead of running off the end of the routine | |
324 *pragmapop list | |