1 // ariarm.d (c) Copyright 1994, 1997 P.J.Burwood
2 // little-endian modifications (c) Copyright 1996 B. Haible
3 // external routines for arilev1.d
4 // Processor: ARM in APCS mode
5 // Assembler-Syntax: ObjAsm under RISC OS, GAS otherwise
6 // Assumptions: intCsize=32, intDsize=32.
7 // Parameter passing conventions: APCS means that registers a1-a4 and ip
8 // do not have to be preserved across function calls.
9 // Note: A sequence of up to 4 conditional instructions is used in preference
// Assembler-dialect helper macros.  This first group presumably belongs to the
// ObjAsm (RISC OS) branch of a conditional that is not visible here -- the AREA
// directive below is ObjAsm syntax.  TODO confirm against the enclosing #if.
//   EXPORT(x)           - export x to the linker
//   DECLARE_FUNCTION(x) - expands to nothing in this variant
//   GLABEL(x)           - spelling of the global entry label for x
// NOTE(review): EXPORT names x while GLABEL names _x in this group -- confirm
// the intended symbol prefix.
44 #define EXPORT(x) EXPORT x
45 #define DECLARE_FUNCTION(x)
46 #define GLABEL(x) _##x
49 AREA |C$$code|,CODE,READONLY
// Second group: the same three macros spelled with GAS directives (.global,
// .type) and a label terminated by ':'.  DECLARE_FUNCTION only emits a .type
// directive when __NetBSD__ is defined; the empty definition that follows is
// presumably its #else arm (the #else/#endif lines are not visible here).
74 #define EXPORT(x) .global _##x
75 #if defined(__NetBSD__)
76 #define DECLARE_FUNCTION(x) .type _##x,%function
78 #define DECLARE_FUNCTION(x)
80 #define GLABEL(x) _##x##:
88 #if defined(__arm7m__) || defined(__arm8__) || defined(__arm9__) || defined(__strongarm__)
89 // ARM7M and later have 32x32 -> 64 multiplies which execute in 2-4 clocks.
94 #if defined(__GNUC__) && 0
95 // With GNU C, we would like to pass the second return value in a2, don't
96 // need a global variable. Unfortunately, the current Acorn gcc crashes if
97 // we declare an appropriate local register variable with __asm__.
98 // It would be possible to declare the functions as returning a 64-bit
99 // result, but given the quality of gcc code dealing with 64-bit entities
100 // and the subtleties of 64-bit returns values (passed in register or in
101 // memory?) we now let it be.
103 // Use three global variables.
// mulu32_: unsigned 32x32 -> 64 bit multiply assembled from four 16x16 partial
// products.  Both operands are split into 16-bit halves, the two middle
// products are summed (their carry is worth 2^16 in the high word), and the
// sum is folded into the low and high result words.
148 // extern uint32 mulu32_ (uint32 x, uint32 y);
155 // mulu32_high = high32(x*y)
156 // a3,a4,ip destroyed
158 DECLARE_FUNCTION(mulu32_)
164 MOV ip,a1,LSR #16 // temp := top half of x
165 MOV a3,a2,LSR #16 // hi := top half of y
166 BIC a1,a1,ip,LSL #16 // x := bottom half of x
167 BIC a2,a2,a3,LSL #16 // y := bottom half of y
168 MUL a4,a1,a2 // low section of result
169 MUL a2,ip,a2 // ) middle sections
170 MUL a1,a3,a1 // ) of result
171 MUL a3,ip,a3 // high section of result
172 ADDS a2,a2,a1 // add middle sections
173 // (can't use mla as we need carry)
174 ADDCS a3,a3,#0x10000 // carry from above add
175 ADDS a1,a4,a2,LSL #16 // x is now bottom 32 bits of result
176 ADC a2,a3,a2,LSR #16 // hi is top 32 bits
// pc-relative load from the literal at ptr_mulu32_high; the "-.-8" turns the
// label into an offset from pc, which reads 8 bytes ahead of this instruction.
// Presumably a3 then receives the address of mulu32_high so a2 can be stored
// there -- the store is not visible in this listing.
179 LDR a3,[pc,#ptr_mulu32_high-.-8]
// divu_3216_1616_: unsigned 32-bit / 16-bit division by shift-and-subtract.
// The divisor is pre-shifted by 15 and negated, so each trial subtraction is an
// add whose carry-out says whether it fitted.  When the carry is clear the
// subtraction is undone (SUBCC); the following ADCS doubles the running value,
// applies the next trial subtraction and shifts the previous carry in as a
// quotient bit, so quotient and remainder share a1 until the end.
184 // extern uint16 divu_3216_1616_ (uint32 x, uint16 y);
189 // a1 = q = floor(x/y)
191 // divu_16_rest = r = x-q*y
193 EXPORT(divu_3216_1616_)
194 DECLARE_FUNCTION(divu_3216_1616_)
195 GLABEL(divu_3216_1616_)
196 // see cl_low_div.cc for algorithm
197 // in that notation: a1 = r, a2 = -s.
198 MOV a2,a2,LSL#15 // multiply divisor by 2^15
199 RSB a2,a2,#0 // negate divisor
200 ADDS a1,a2,a1 // dividend = dividend + -divisor/2
201 SUBCC a1,a1,a2 // dividend = dividend - -divisor/2
202 ADCS a1,a2,a1,LSL#1 // dividend = dividend*2 + -divisor
203 // and shift quotient
204 SUBCC a1,a1,a2 // do this another 14 times
232 SUBCC a1,a1,a2 // do the last conditional subtraction
233 MOV a2,a1,LSR#15 // move remainder into a2 and shift
234 ADC a1,a1,a1 // move last bit of quotient in
235 MOV a1,a1,LSL#16 // AND out top 16 bits by shifting up
236 MOV a1,a1,LSR#16 // and back down again
// The remainder is left in a2 and also written to the divu_16_rest variable,
// whose address is fetched pc-relative from the literal at ptr_divu_16_rest.
238 LDR a3,[pc,#ptr_divu_16_rest-.-8] // save rest so can be picked up later
239 STR a2,[a3,#0] // the result is 16 bits
// divu_6432_3232_: unsigned 64-bit / 32-bit division built on top of
// divu_3216_1616_.  Divisors below 2^16 take a short path of two 32/16
// divisions; larger divisors are normalised (shifted left by s), divided in
// two 16-bit quotient steps with up to two corrections each, and the remainder
// is shifted back down by s.  The remainder is returned in a2 and also stored
// to divu_32_rest.
243 // extern uint32 divu_6432_3232_ (uint32 xhi, uint32 xlo, uint32 y); // -> Quotient q
244 // extern uint32 divu_32_rest; // -> Rest r
245 // see cl_low_div.cc for algorithm
247 // a1 = xhi (dividend)
248 // a2 = xlo (dividend)
251 // a1 = 32 bit quotient
252 // a2 = 32 bit remainder
254 EXPORT(divu_6432_3232_)
255 DECLARE_FUNCTION(divu_6432_3232_)
256 GLABEL(divu_6432_3232_)
257 STMFD sp!, {v1,v2,v3,v4,v5,v6,lr}
260 CMP a3,#0x10000 // y <= (uint32)(bit(16)-1)
// carry set means y >= 2^16: take the general path at divu_6432_3232_l1
261 BCS divu_6432_3232_l1
263 ORR a1, a2, a1, ASL #16 // = highlow32(low16(xhi),high16(xlo))
265 BL C(divu_3216_1616_)
269 ORR a1, a1, a2, ASL #16 // = highlow32(r1,low16(xlo))
271 BL C(divu_3216_1616_)
272 ORR a1, a1, v3, ASL #16 // = highlow32(q1,q0)
274 LDR a4,[pc,#ptr_divu_32_rest-.-8]
275 STR a2,[a4,#0] // divu_32_rest = remainder
// pop the saved registers straight into pc to return; the ^ suffix with pc in
// the list is the flag-restoring return form (presumably 26-bit APCS -- confirm
// for the target mode)
277 LDMFD sp!, {v1,v2,v3,v4,v5,v6,pc}^
279 LABEL(divu_6432_3232_l1)
281 MOVS a4, v1, LSR #16 // while ((sint32)y >= 0)
282 ADDEQ v3, v3, #16 // { y = y<<1; s++; }
283 MOVEQ v1, v1, ASL #16
298 MOVNE a2, a1, ASL v3 // if (!(s==0))
299 RSBNE a1, v3, #32 // { xhi = (xhi << s)
300 ORRNE a1, a2, v2, LSR a1 // | (xlo >> (32-s));
301 MOVNE v2, v2, ASL v3 // xlo = xlo << s; }
302 ADD a2, v1, #0x10000 // y1_1 = high16(y)+1
303 MOVS v5, a2, LSR #16 // if (y1_1 == 0)
304 MOVEQ v4, a1, ASL #16 // r16 = low16(xhi) * 2^16
305 MOVEQ a1, a1, LSR #16 // q1 = high16(xhi)
// The MOVNE after the conditional call reuses the NE state set by the MOVS
// above, so it relies on the callee returning with the caller's flags intact
// (presumably via a flag-restoring return -- confirm).
307 BLNE C(divu_3216_1616_) // divu_3216_1616(xhi,y1_1, q1=,r16=)
308 MOVNE v4, a2, ASL #16 // r16 = r16 * 2^16
309 ORR v4, v4, v2, LSR #16 // r = highlow32(r16,high16(xlo))
310 MOV a4, v1, ASL #16 // tmp = mulu16(low16(y),q1)
313 RSB a3, a3, a1, ASL #16 // r2 = highlow32_0(q1) - tmp
315 ADDS a1, v4, a3 // r += r2
316 ADDCS v6, v6, #1 // if ( r < r2 ) { q1 += 1
317 SUBCS a1, a1, v1 // r -= y }
318 CMP a1, v1 // if (r >= y)
319 ADDCS v6, v6, #1 // { q1 += 1
320 SUBCS a1, a1, v1 // r -= y }
321 CMP v5, #0 // if (y1_1 == 0)
322 MOVEQ v4, a1, ASL #16 // { r16 = low16(r) * 2^16
323 MOVEQ a1, a1, LSR #16 // q0 = high16(r) }
325 BLNE C(divu_3216_1616_) // divu_3216_1616(r,y1_1, q0=,r16=)
326 MOVNE v4, a2, ASL #16 // r16 = r16 * 2^16
328 ORR v4, v4, v2, LSR #16 // r = highlow32(r16,low16(xlo))
329 MOV a4, v1, ASL #16 // tmp = mulu16(low16(y),q0)
332 RSB a3, a3, a1, ASL #16 // r2 = highlow32_0(q0) - tmp
333 ADDS v4, v4, a3 // r += r2
334 ADDCS a1, a1, #1 // if ( r < r2 ) { q0 += 1
335 SUBCS v4, v4, v1 // r -= y }
336 CMP v4, v1 // if (r >= y)
337 ADDCS a1, a1, #1 // { q0 += 1
338 SUBCS v4, v4, v1 // r -= y }
// undo the normalisation shift on the remainder, then combine the two 16-bit
// quotient halves (q1 in v6, q0 in a1)
339 MOV a2, v4, LSR v3 // remainder = r >> s
340 ORR a1, a1, v6, ASL #16 // return highlow32(q1,q0)
342 LDR a3,[pc,#ptr_divu_32_rest-.-8]
343 STR a2,[a3,#0] // divu_32_rest = remainder
345 LDMFD sp!, {v1,v2,v3,v4,v5,v6,pc}^
// copy_loop_up: copy count words from sourceptr to destptr, walking upwards.
// The count modulo 4 is handled first with single-word post-incremented
// load/store pairs; the rest is moved four words at a time with LDM/STM.
// The block loop is unrolled twice, which is why the counter drops by 8 per
// pass and the second group of four is conditional on the counter staying >= 0.
347 // extern uintD* copy_loop_up (uintD* sourceptr, uintD* destptr, uintC count);
349 // a1 = source pointer
350 // a2 = destination pointer
351 // a3 = count of words to store
353 // a1 = address of last word stored + 1
354 // a2 - a4, ip destroyed
355 EXPORT(copy_loop_up) // word aligned copy loop up
356 DECLARE_FUNCTION(copy_loop_up)
358 ANDS a4,a3,#3 // multiple of 4 words ?
359 BEQ copy_loop_up_l1 // yup, so branch
360 CMP a4,#2 // copy the first 1-3 words
361 LDR a4,[a1],#4 // to align the total to a multiple
362 STR a4,[a2],#4 // of 4 words
367 LABEL(copy_loop_up_l1)
368 BICS a4,a3,#3 // set counter to multiple of 4
369 MOVEQ a1,a2 // return destination pointer (one past the last word stored)
370 MOVEQS pc,lr // if zero then we're done
// lr doubles as a data register inside the loop, so it is saved with v1
371 STMFD sp!,{v1,lr} // save work regs
372 LABEL(copy_loop_up_l2)
373 LDMIA a1!,{a3,v1,ip,lr} // copy 4 words in one go
374 STMIA a2!,{a3,v1,ip,lr}
375 SUBS a4,a4,#8 // decrement counter by 8
376 LDMGEIA a1!,{a3,v1,ip,lr} // if count still positive then copy
377 STMGEIA a2!,{a3,v1,ip,lr} // 4 more words
378 BGT copy_loop_up_l2 // and loop
379 MOV a1,a2 // return destination pointer (one past the last word stored)
380 LDMFD sp!,{v1,pc}^ // restore work regs and return
// copy_loop_down: copy count words from sourceptr to destptr, walking
// downwards.  Both pointers are pre-decremented, so they point one past the
// highest word on entry and at the lowest word copied on exit.  Leading
// count mod 4 words go one at a time, the rest in LDMDB/STMDB groups of four
// with the loop unrolled twice (counter decremented by 8 per pass).
382 // extern uintD* copy_loop_down (uintD* sourceptr, uintD* destptr, uintC count);
384 // a1 = source pointer
385 // a2 = destination pointer
386 // a3 = count of words to store
388 // a1 = address of last word stored
389 // a2 - a4, ip destroyed
390 EXPORT(copy_loop_down) // word aligned copy loop down
391 DECLARE_FUNCTION(copy_loop_down)
392 GLABEL(copy_loop_down)
393 ANDS a4,a3,#3 // multiple of 4 words ?
394 BEQ copy_loop_down_l1 // yup, so branch
395 CMP a4,#2 // copy the first 1-3 words
396 LDR a4,[a1,#-4]! // to align the total to a multiple
397 STR a4,[a2,#-4]! // of 4 words
402 LABEL(copy_loop_down_l1)
403 BICS a4,a3,#3 // set counter to multiple of 4
404 MOVEQ a1,a2 // return addr of last word stored
405 MOVEQS pc,lr // if zero then we're done
// lr doubles as a data register inside the loop, so it is saved with v1
406 STMFD sp!,{v1,lr} // save work regs
407 LABEL(copy_loop_down_l2)
408 LDMDB a1!,{a3,v1,ip,lr} // copy 4 words in one go
409 STMDB a2!,{a3,v1,ip,lr}
410 SUBS a4,a4,#8 // decrement counter by 8
411 LDMGEDB a1!,{a3,v1,ip,lr} // if count still positive then copy
412 STMGEDB a2!,{a3,v1,ip,lr} // 4 more words
413 BGT copy_loop_down_l2 // and loop
414 MOV a1,a2 // return addr of last word stored
415 LDMFD sp!,{v1,pc}^ // restore work regs and return
// clear_loop_up: zero count words upwards.  It only loads the filler register
// with 0 and then runs straight on into the code that follows, so it must stay
// immediately in front of fill_loop_up.
417 // extern uintD* clear_loop_up (uintD* destptr, uintC count);
419 // a1 = destination pointer
420 // a2 = count of words to store
422 // a1 = address of last word stored + 1
423 // a2 - a4, ip destroyed
424 EXPORT(clear_loop_up) // word aligned clear loop up
425 DECLARE_FUNCTION(clear_loop_up)
426 GLABEL(clear_loop_up)
427 MOV a3,#0 // set filler to 0
428 // and drop into fill_loop_up
// fill_loop_up: store the filler word count times, walking upwards.  The
// count modulo 4 is written with single post-incremented stores; the rest is
// written four words at a time by STMIA from registers holding the filler.
// The block loop is unrolled twice (counter decremented by 8 per pass).
// a1 is advanced by the stores themselves, so it already holds the result.
430 // extern uintD* fill_loop_up (uintD* destptr, uintC count, uintD filler);
432 // a1 = destination pointer
433 // a2 = count of words to store
434 // a3 = word to store
436 // a1 = address of last word stored + 1
437 // a2 - a4, ip destroyed
438 EXPORT(fill_loop_up) // word aligned fill loop up
439 DECLARE_FUNCTION(fill_loop_up)
441 ANDS a4,a2,#3 // multiple of 4 words ?
442 BEQ fill_loop_up_l1 // yup, so branch
443 CMP a4,#2 // store the first 1-3 words
444 STR a3,[a1],#4 // to align the total to a multiple
445 STRGE a3,[a1],#4 // of 4 words
447 LABEL(fill_loop_up_l1)
448 BICS a4,a2,#3 // set counter to multiple of 4
449 MOVEQS pc,lr // if zero then we're done
450 STMFD sp!,{v1,lr} // save work regs
451 MOV v1,a3 // copy filler to three other
452 MOV ip,a3 // registers
// NOTE(review): lr is stored below as the fourth filler word, but no visible
// instruction copies a3 into it -- confirm it is loaded before the loop.
454 LABEL(fill_loop_up_l2)
455 STMIA a1!,{a3,v1,ip,lr} // store 4 fillers in one go
456 SUBS a4,a4,#8 // decrement counter by 8
457 STMGEIA a1!,{a3,v1,ip,lr} // if count still positive then store 4
458 BGT fill_loop_up_l2 // more and loop
459 LDMFD sp!,{v1,pc}^ // restore work regs and return
// clear_loop_down: zero count words downwards.  It only loads the filler
// register with 0 and then runs straight on into the code that follows, so it
// must stay immediately in front of fill_loop_down.  The result is therefore
// the one fill_loop_down produces: the address of the last (lowest) word
// stored, not that address plus one.
462 // extern uintD* clear_loop_down (uintD* destptr, uintC count);
464 // a1 = destination pointer
465 // a2 = count of words to store
467 // a1 = address of last word stored
468 // a2 - a4, ip destroyed
469 EXPORT(clear_loop_down) // word aligned clear loop down
470 DECLARE_FUNCTION(clear_loop_down)
471 GLABEL(clear_loop_down)
472 MOV a3,#0 // set filler to 0
473 // and drop into fill_loop_down
// fill_loop_down: store the filler word count times, walking downwards with
// pre-decremented addressing.  The count modulo 4 is written with single
// stores; the rest is written four words at a time by STMDB, with the block
// loop unrolled twice (counter decremented by 8 per pass).  a1 is moved by the
// stores themselves and ends at the lowest word written.
475 // extern uintD* fill_loop_down (uintD* destptr, uintC count, uintD filler);
477 // a1 = destination pointer
478 // a2 = count of words to store
479 // a3 = word to store
481 // a1 = address of last word stored
482 // a2 - a4, ip destroyed
483 EXPORT(fill_loop_down) // word aligned fill loop down
484 DECLARE_FUNCTION(fill_loop_down)
485 GLABEL(fill_loop_down)
486 ANDS a4,a2,#3 // multiple of 4 words ?
487 BEQ fill_loop_down_l1 // yup, so branch
488 CMP a4,#2 // store the first 1-3 words
489 STR a3,[a1,#-4]! // to align the total to a multiple
490 STRGE a3,[a1,#-4]! // of 4 words
492 LABEL(fill_loop_down_l1)
493 BICS a4,a2,#3 // set counter to multiple of 4
494 MOVEQS pc,lr // if zero then we're done
495 STMFD sp!,{v1,lr} // save work regs
496 MOV v1,a3 // copy filler to three other
497 MOV ip,a3 // registers
// NOTE(review): lr is stored below as the fourth filler word, but no visible
// instruction copies a3 into it -- confirm it is loaded before the loop.
499 LABEL(fill_loop_down_l2)
500 STMDB a1!,{a3,v1,ip,lr} // store 4 fillers in one go
501 SUBS a4,a4,#8 // decrement counter by 8
502 STMGEDB a1!,{a3,v1,ip,lr} // if count still positive then store 4
503 BGT fill_loop_down_l2 // more and loop
504 LDMFD sp!,{v1,pc}^ // restore work regs and return
// test_loop_up: scan count words upwards and report whether any is non-zero.
// The pointer is moved to ip so a1 can hold the result, which is preset to
// TRUE; the routine returns as soon as a non-zero word is seen.  The count
// modulo 4 is checked word by word, the rest four words per LDMIA.
// NOTE(review): the prototype comment says void although a result is
// documented in a1 -- confirm the C declaration.
506 // extern void test_loop_up (uintD* xptr, uintC count);
509 // a2 = count of words to be TESTed
511 // a1 = TRUE if any words are non-zero else FALSE
512 // a2 - a4, ip destroyed
513 EXPORT(test_loop_up) // word aligned test loop up
514 DECLARE_FUNCTION(test_loop_up)
516 MOV ip,a1 // move xptr to ip
517 MOV a1,#1 // set result to TRUE
518 ANDS a3,a2,#3 // multiple of 4 words ?
519 BEQ test_loop_up_l1 // yup, so branch
520 LDR a4,[ip],#4 // TEST the first 1-3 words
521 TEQ a4,#0 // align the total to a multiple of 4
522 MOVNES pc,lr // return TRUE if the word is non-zero
524 BLT test_loop_up_l1 // need to branch 'cos PSR set
525 LDRGE a4,[ip],#4 // when checking against zero
529 BLE test_loop_up_l1 // need to branch 'cos PSR set
530 LDRGT a4,[ip],#4 // when checking against zero
533 LABEL(test_loop_up_l1)
534 BICS a4,a2,#3 // set counter to multiple of 4
535 MOVEQ a1,#0 // return FALSE
536 MOVEQS pc,lr // if zero then we're done
537 STMFD sp!,{v1,lr} // save work regs
538 LABEL(test_loop_up_l2)
539 LDMIA ip!,{a2,a3,v1,lr} // load 4 words in one go
540 TEQ a2,#0 // TEST the four words
545 SUBS a4,a4,#4 // decrement counter by 4
546 BGT test_loop_up_l2 // if count still positive then loop
548 LDMFD sp!,{v1,pc}^ // restore work regs and return
// test_loop_down: scan count words downwards (pre-decremented addressing) and
// report whether any is non-zero.  The pointer is moved to ip so a1 can hold
// the result, which is preset to TRUE; the routine returns as soon as a
// non-zero word is seen.  The count modulo 4 is checked word by word, the rest
// four words per LDMDB.
// NOTE(review): the prototype comment says void although a result is
// documented in a1 -- confirm the C declaration.
550 // extern void test_loop_down (uintD* xptr, uintC count);
553 // a2 = count of words to be TESTed
555 // a1 = TRUE if any words are non-zero else FALSE
556 // a2 - a4, ip destroyed
557 EXPORT(test_loop_down) // word aligned test loop down
558 DECLARE_FUNCTION(test_loop_down)
559 GLABEL(test_loop_down)
560 MOV ip,a1 // move xptr to ip
561 MOV a1,#1 // set result to TRUE
562 ANDS a3,a2,#3 // multiple of 4 words ?
563 BEQ test_loop_down_l1 // yup, so branch
564 LDR a4,[ip,#-4]! // TEST the first 1-3 words
565 TEQ a4,#0 // align the total to a multiple of 4
566 MOVNES pc,lr // return TRUE if the word is non-zero
568 BLT test_loop_down_l1 // need to branch 'cos PSR set
569 LDRGE a4,[ip,#-4]! // when checking against zero
573 BLE test_loop_down_l1 // need to branch 'cos PSR set
574 LDRGT a4,[ip,#-4]! // when checking against zero
577 LABEL(test_loop_down_l1)
578 BICS a4,a2,#3 // set counter to multiple of 4
579 MOVEQ a1,#0 // return FALSE
580 MOVEQS pc,lr // if zero then we're done
581 STMFD sp!,{v1,lr} // save work regs
582 LABEL(test_loop_down_l2)
583 LDMDB ip!,{a2,a3,v1,lr} // load 4 words in one go
584 TEQ a2,#0 // TEST the four words
589 SUBS a4,a4,#4 // decrement counter by 4
590 BGT test_loop_down_l2 // if count still positive then loop
592 LDMFD sp!,{v1,pc}^ // restore work regs and return
594 #if CL_DS_BIG_ENDIAN_P
// or_loop_up: xptr[i] |= yptr[i] for count words, walking upwards.
// The count modulo 4 is handled one word at a time; the remainder is processed
// four words per pass: yptr words are loaded with writeback, xptr words without
// writeback so the same address can be used by the writeback store of the
// results.
596 // extern void or_loop_up (uintD* xptr, uintD* yptr, uintC count);
600 // a3 = count of words to be ORed
602 // xptr |= yptr for count words
603 // a1 - a4, ip destroyed
604 EXPORT(or_loop_up) // word aligned or loop up
605 DECLARE_FUNCTION(or_loop_up)
607 ANDS a4,a3,#3 // multiple of 4 words ?
608 BEQ or_loop_up_l1 // yup, so branch
609 CMP a4,#2 // OR the first 1-3 words
610 LDR a4,[a2],#4 // to align the total to a multiple
611 LDR ip,[a1] // of 4 words
614 BLT or_loop_up_l1 // better to branch than skip instrs.
624 BICS a4,a3,#3 // set counter to multiple of 4
625 MOVEQS pc,lr // if zero then we're done
// v1-v5 are callee-saved and lr is used as a data register, so all are saved
626 STMFD sp!,{v1-v5,lr} // save work regs
628 LDMIA a2!,{a3,v1,v2,ip} // load 4 words in one go
629 LDMIA a1,{v3,v4,v5,lr} // load target words
630 ORR v3,v3,a3 // OR the four words
634 STMIA a1!,{v3,v4,v5,lr} // store 4 results
635 SUBS a4,a4,#4 // decrement counter by 4
636 BGT or_loop_up_l2 // if count still positive then loop
637 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
// xor_loop_up: xptr[i] ^= yptr[i] for count words, walking upwards.
// The count modulo 4 is handled one word at a time; the remainder is processed
// four words per pass: yptr words are loaded with writeback, xptr words without
// writeback so the same address can be used by the writeback store of the
// results.
641 // extern void xor_loop_up (uintD* xptr, uintD* yptr, uintC count);
645 // a3 = count of words to be XORed
647 // xptr ^= yptr for count words
648 // a1 - a4, ip destroyed
649 EXPORT(xor_loop_up) // word aligned xor loop up
650 DECLARE_FUNCTION(xor_loop_up)
652 ANDS a4,a3,#3 // multiple of 4 words ?
653 BEQ xor_loop_up_l1 // yup, so branch
654 CMP a4,#2 // XOR the first 1-3 words
655 LDR a4,[a2],#4 // to align the total to a multiple
656 LDR ip,[a1] // of 4 words
659 BLT xor_loop_up_l1 // better to branch than skip instrs.
668 LABEL(xor_loop_up_l1)
669 BICS a4,a3,#3 // set counter to multiple of 4
670 MOVEQS pc,lr // if zero then we're done
// v1-v5 are callee-saved and lr is used as a data register, so all are saved
671 STMFD sp!,{v1-v5,lr} // save work regs
672 LABEL(xor_loop_up_l2)
673 LDMIA a2!,{a3,v1,v2,ip} // load 4 words in one go
674 LDMIA a1,{v3,v4,v5,lr} // load target words
675 EOR v3,v3,a3 // XOR the four words
679 STMIA a1!,{v3,v4,v5,lr} // store 4 results
680 SUBS a4,a4,#4 // decrement counter by 4
681 BGT xor_loop_up_l2 // if count still positive then loop
682 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
684 #if CL_DS_BIG_ENDIAN_P
// and_loop_up: xptr[i] &= yptr[i] for count words, walking upwards.
// The count modulo 4 is handled one word at a time; the remainder is processed
// four words per pass: yptr words are loaded with writeback, xptr words without
// writeback so the same address can be used by the writeback store of the
// results.
686 // extern void and_loop_up (uintD* xptr, uintD* yptr, uintC count);
690 // a3 = count of words to be ANDed
692 // xptr &= yptr for count words
693 // a1 - a4, ip destroyed
694 EXPORT(and_loop_up) // word aligned and loop up
695 DECLARE_FUNCTION(and_loop_up)
697 ANDS a4,a3,#3 // multiple of 4 words ?
698 BEQ and_loop_up_l1 // yup, so branch
699 CMP a4,#2 // AND the first 1-3 words
700 LDR a4,[a2],#4 // to align the total to a multiple
701 LDR ip,[a1] // of 4 words
704 BLT and_loop_up_l1 // better to branch than skip instrs.
713 LABEL(and_loop_up_l1)
714 BICS a4,a3,#3 // set counter to multiple of 4
715 MOVEQS pc,lr // if zero then we're done
// v1-v5 are callee-saved and lr is used as a data register, so all are saved
716 STMFD sp!,{v1-v5,lr} // save work regs
717 LABEL(and_loop_up_l2)
718 LDMIA a2!,{a3,v1,v2,ip} // load 4 words in one go
719 LDMIA a1,{v3,v4,v5,lr} // load target words
720 AND v3,v3,a3 // AND the four words
724 STMIA a1!,{v3,v4,v5,lr} // store 4 results
725 SUBS a4,a4,#4 // decrement counter by 4
726 BGT and_loop_up_l2 // if count still positive then loop
727 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
// eqv_loop_up: xptr[i] = ~(xptr[i] ^ yptr[i]) for count words, walking
// upwards.  The count modulo 4 is handled one word at a time; the remainder is
// processed four words per pass: yptr words are loaded with writeback, xptr
// words without writeback so the same address can be used by the writeback
// store of the results.  Only the EOR half of the operation is visible in the
// block loop here; the complement step is presumably among the lines elided
// from this listing.
729 // extern void eqv_loop_up (uintD* xptr, uintD* yptr, uintC count);
733 // a3 = count of words to be EQVed
735 // xptr = ~(xptr ^ yptr) for count words
736 // a1 - a4, ip destroyed
737 EXPORT(eqv_loop_up) // word aligned eqv loop up
738 DECLARE_FUNCTION(eqv_loop_up)
740 ANDS a4,a3,#3 // multiple of 4 words ?
741 BEQ eqv_loop_up_l1 // yup, so branch
742 CMP a4,#2 // EQV the first 1-3 words
743 LDR a4,[a2],#4 // to align the total to a multiple
744 LDR ip,[a1] // of 4 words
748 BLT eqv_loop_up_l1 // better to branch than skip instrs.
754 BLE eqv_loop_up_l1 // better to branch than skip instrs.
760 LABEL(eqv_loop_up_l1)
761 BICS a4,a3,#3 // set counter to multiple of 4
762 MOVEQS pc,lr // if zero then we're done
// v1-v5 are callee-saved and lr is used as a data register, so all are saved
763 STMFD sp!,{v1-v5,lr} // save work regs
764 LABEL(eqv_loop_up_l2)
765 LDMIA a2!,{a3,v1,v2,ip} // load 4 words in one go
766 LDMIA a1,{v3,v4,v5,lr} // load target words
767 EOR v3,v3,a3 // EQV the four words
775 STMIA a1!,{v3,v4,v5,lr} // store 4 results
776 SUBS a4,a4,#4 // decrement counter by 4
777 BGT eqv_loop_up_l2 // if count still positive then loop
778 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
// nand_loop_up: xptr[i] = ~(xptr[i] & yptr[i]) for count words, walking
// upwards.  The count modulo 4 is handled one word at a time; the remainder is
// processed four words per pass: yptr words are loaded with writeback, xptr
// words without writeback so the same address can be used by the writeback
// store of the results.  Only the AND half of the operation is visible in the
// block loop here; the complement step is presumably among the lines elided
// from this listing.
780 // extern void nand_loop_up (uintD* xptr, uintD* yptr, uintC count);
784 // a3 = count of words to be NANDed
786 // xptr = ~(xptr & yptr) for count words
787 // a1 - a4, ip destroyed
788 EXPORT(nand_loop_up) // word aligned nand loop up
789 DECLARE_FUNCTION(nand_loop_up)
791 ANDS a4,a3,#3 // multiple of 4 words ?
792 BEQ nand_loop_up_l1 // yup, so branch
793 CMP a4,#2 // NAND the first 1-3 words
794 LDR a4,[a2],#4 // to align the total to a multiple
795 LDR ip,[a1] // of 4 words
799 BLT nand_loop_up_l1 // better to branch than skip instrs.
805 BLE nand_loop_up_l1 // better to branch than skip instrs.
811 LABEL(nand_loop_up_l1)
812 BICS a4,a3,#3 // set counter to multiple of 4
813 MOVEQS pc,lr // if zero then we're done
// v1-v5 are callee-saved and lr is used as a data register, so all are saved
814 STMFD sp!,{v1-v5,lr} // save work regs
815 LABEL(nand_loop_up_l2)
816 LDMIA a2!,{a3,v1,v2,ip} // load 4 words in one go
817 LDMIA a1,{v3,v4,v5,lr} // load target words
818 AND v3,v3,a3 // NAND the four words
826 STMIA a1!,{v3,v4,v5,lr} // store 4 results
827 SUBS a4,a4,#4 // decrement counter by 4
828 BGT nand_loop_up_l2 // if count still positive then loop
829 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
// nor_loop_up: xptr[i] = ~(xptr[i] | yptr[i]) for count words, walking
// upwards.  The count modulo 4 is handled one word at a time; the remainder is
// processed four words per pass: yptr words are loaded with writeback, xptr
// words without writeback so the same address can be used by the writeback
// store of the results.  Only the ORR half of the operation is visible in the
// block loop here; the complement step is presumably among the lines elided
// from this listing.
831 // extern void nor_loop_up (uintD* xptr, uintD* yptr, uintC count);
835 // a3 = count of words to be NORed
837 // xptr = ~(xptr | yptr) for count words
838 // a1 - a4, ip destroyed
839 EXPORT(nor_loop_up) // word aligned nor loop up
840 DECLARE_FUNCTION(nor_loop_up)
842 ANDS a4,a3,#3 // multiple of 4 words ?
843 BEQ nor_loop_up_l1 // yup, so branch
844 CMP a4,#2 // NOR the first 1-3 words
845 LDR a4,[a2],#4 // to align the total to a multiple
846 LDR ip,[a1] // of 4 words
850 BLT nor_loop_up_l1 // better to branch than skip instrs.
856 BLE nor_loop_up_l1 // better to branch than skip instrs.
862 LABEL(nor_loop_up_l1)
863 BICS a4,a3,#3 // set counter to multiple of 4
864 MOVEQS pc,lr // if zero then we're done
// v1-v5 are callee-saved and lr is used as a data register, so all are saved
865 STMFD sp!,{v1-v5,lr} // save work regs
866 LABEL(nor_loop_up_l2)
867 LDMIA a2!,{a3,v1,v2,ip} // load 4 words in one go
868 LDMIA a1,{v3,v4,v5,lr} // load target words
869 ORR v3,v3,a3 // NOR the four words
877 STMIA a1!,{v3,v4,v5,lr} // store 4 results
878 SUBS a4,a4,#4 // decrement counter by 4
879 BGT nor_loop_up_l2 // if count still positive then loop
880 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
// andc2_loop_up: xptr[i] &= ~yptr[i] for count words, walking upwards.  BIC
// performs the and-with-complement in a single instruction.  The count modulo
// 4 is handled one word at a time; the remainder is processed four words per
// pass: yptr words are loaded with writeback, xptr words without writeback so
// the same address can be used by the writeback store of the results.
882 // extern void andc2_loop_up (uintD* xptr, uintD* yptr, uintC count);
886 // a3 = count of words to be ANDC2ed
888 // xptr = xptr & ~yptr for count words
889 // a1 - a4, ip destroyed
890 EXPORT(andc2_loop_up) // word aligned andc2 loop up
891 DECLARE_FUNCTION(andc2_loop_up)
892 GLABEL(andc2_loop_up)
893 ANDS a4,a3,#3 // multiple of 4 words ?
894 BEQ andc2_loop_up_l1 // yup, so branch
895 CMP a4,#2 // ANDC2 the first 1-3 words
896 LDR a4,[a2],#4 // to align the total to a multiple
897 LDR ip,[a1] // of 4 words
900 BLT andc2_loop_up_l1 // better to branch than skip instrs.
909 LABEL(andc2_loop_up_l1)
910 BICS a4,a3,#3 // set counter to multiple of 4
911 MOVEQS pc,lr // if zero then we're done
// v1-v5 are callee-saved and lr is used as a data register, so all are saved
912 STMFD sp!,{v1-v5,lr} // save work regs
913 LABEL(andc2_loop_up_l2)
914 LDMIA a2!,{a3,v1,v2,ip} // load 4 words in one go
915 LDMIA a1,{v3,v4,v5,lr} // load target words
916 BIC v3,v3,a3 // ANDC2 the four words
920 STMIA a1!,{v3,v4,v5,lr} // store 4 results
921 SUBS a4,a4,#4 // decrement counter by 4
922 BGT andc2_loop_up_l2 // if count still positive then loop
923 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
// orc2_loop_up: xptr[i] |= ~yptr[i] for count words, walking upwards.
// The count modulo 4 is handled one word at a time; the remainder is processed
// four words per pass: yptr words are loaded with writeback, xptr words without
// writeback so the same address can be used by the writeback store of the
// results.  In the block loop the yptr word is complemented in place (MVN)
// first; the OR that combines it with the target is presumably among the lines
// elided from this listing.
925 // extern void orc2_loop_up (uintD* xptr, uintD* yptr, uintC count);
929 // a3 = count of words to be ORC2ed
931 // xptr = xptr | ~yptr for count words
932 // a1 - a4, ip destroyed
933 EXPORT(orc2_loop_up) // word aligned orc2 loop up
934 DECLARE_FUNCTION(orc2_loop_up)
936 ANDS a4,a3,#3 // multiple of 4 words ?
937 BEQ orc2_loop_up_l1 // yup, so branch
938 CMP a4,#2 // ORC2 the first 1-3 words
939 LDR a4,[a2],#4 // to align the total to a multiple
940 LDR ip,[a1] // of 4 words
944 BLT orc2_loop_up_l1 // better to branch than skip instrs.
950 BLE orc2_loop_up_l1 // better to branch than skip instrs.
956 LABEL(orc2_loop_up_l1)
957 BICS a4,a3,#3 // set counter to multiple of 4
958 MOVEQS pc,lr // if zero then we're done
// v1-v5 are callee-saved and lr is used as a data register, so all are saved
959 STMFD sp!,{v1-v5,lr} // save work regs
960 LABEL(orc2_loop_up_l2)
961 LDMIA a2!,{a3,v1,v2,ip} // load 4 words in one go
962 LDMIA a1,{v3,v4,v5,lr} // load target words
963 MVN a3,a3 // ORC2 the four words
971 STMIA a1!,{v3,v4,v5,lr} // store 4 results
972 SUBS a4,a4,#4 // decrement counter by 4
973 BGT orc2_loop_up_l2 // if count still positive then loop
974 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
// not_loop_up: complement count words in place, walking upwards.
// The count modulo 4 is handled one word at a time; the remainder is processed
// four words per pass.  The block load deliberately has no writeback so that
// the writeback store of the results hits the same four words and then
// advances the pointer.  Only lr needs saving because the loop uses just the
// scratch registers plus lr.
976 // extern void not_loop_up (uintD* xptr, uintC count);
979 // a2 = count of words to be NOTed
981 // xptr = ~xptr for count words
982 // a1 - a4, ip destroyed
983 EXPORT(not_loop_up) // word aligned not loop up
984 DECLARE_FUNCTION(not_loop_up)
986 ANDS a3,a2,#3 // multiple of 4 words ?
987 BEQ not_loop_up_l1 // yup, so branch
988 CMP a3,#2 // NOT the first 1-3 words
989 LDR a3,[a1] // to align the total to a multiple
990 MVN a3,a3 // of 4 words
992 BLT not_loop_up_l1 // better to branch than skip instrs.
999 LABEL(not_loop_up_l1)
1000 BICS a4,a2,#3 // set counter to multiple of 4
1001 MOVEQS pc,lr // if zero then we're done
1002 STMFD sp!,{lr} // save work regs
1003 LABEL(not_loop_up_l2)
1004 LDMIA a1,{a2,a3,ip,lr} // load 4 words in one go,NO writeback
1005 MVN a2,a2 // NOT the four words
1009 STMIA a1!,{a2,a3,ip,lr} // store 4 results
1010 SUBS a4,a4,#4 // decrement counter by 4
1011 BGT not_loop_up_l2 // if count still positive then loop
1012 LDMFD sp!,{pc}^ // restore work regs and return
// and_test_loop_up: report whether xptr[i] & yptr[i] is non-zero for any of
// count words, walking upwards; nothing is written back to memory.  The count
// modulo 4 is tested word by word.  For the block loop xptr is moved to v6 so
// that a1 can hold the result, preset to TRUE, and the loop returns early as
// soon as a TST leaves the Z flag clear.
// NOTE(review): the prototype comment says void although a result is
// documented in a1 -- confirm the C declaration.
1014 // extern void and_test_loop_up (uintD* xptr, uintD* yptr, uintC count);
1018 // a3 = count of words to be AND_TESTed
1020 // a1 = TRUE if any words ANDed together are non-zero else FALSE
1021 // a2 - a4, ip destroyed
1022 EXPORT(and_test_loop_up) // word aligned and_test loop up
1023 DECLARE_FUNCTION(and_test_loop_up)
1024 GLABEL(and_test_loop_up)
1025 ANDS a4,a3,#3 // multiple of 4 words ?
1026 BEQ and_test_loop_up_l1 // yup, so branch
1028 LDR a4,[a2],#4 // AND_TEST the first 1-3 words
1029 LDR ip,[a1],#4 // to align the total to a multiple
1030 TST ip,a4 // of 4 words
1031 MOVNE a1,#1 // return TRUE if AND_TEST ok
1033 BCC and_test_loop_up_l1 // better to branch than skip instrs.
1041 BLE and_test_loop_up_l1 // better to branch than skip instrs.
1047 LABEL(and_test_loop_up_l1)
1048 BICS a4,a3,#3 // set counter to multiple of 4
1049 MOVEQ a1,#0 // return FALSE
1050 MOVEQS pc,lr // if zero then we're done
1051 STMFD sp!,{v1-v6,lr} // save work regs
1052 MOV v6,a1 // move xptr to v6
1053 MOV a1,#1 // set result to TRUE
1054 LABEL(and_test_loop_up_l2)
1055 LDMIA a2!,{a3,v1,v2,ip} // load 4 words in one go
1056 LDMIA v6!,{v3,v4,v5,lr} // load target words
1057 TST v3,a3 // AND_TEST the four words
// early exit with a1 still TRUE when any tested pair overlapped
1061 LDMNEFD sp!,{v1-v6,pc}^
1062 SUBS a4,a4,#4 // decrement counter by 4
1063 BGT and_test_loop_up_l2 // if count still positive then loop
1065 LDMFD sp!,{v1-v6,pc}^ // restore work regs and return
// compare_loop_up: compare xptr[] with yptr[] word by word, walking upwards,
// as unsigned values (LO/HI conditions).  It returns at the first differing
// pair: -1 when the xptr word is lower, +1 when it is higher.  Equal arrays
// take the zero-count exit, which returns 0.  For the block loop xptr is moved
// to v6 so that a1 can hold the result, preset to +1.
// NOTE(review): the prototype comment says void although a result is
// documented in a1 -- confirm the C declaration.
1069 // extern void compare_loop_up (uintD* xptr, uintD* yptr, uintC count);
1073 // a3 = count of words to be COMPAREd
1075 // a1 = +1 if first non-equal word in xptr[] and yptr[]
1076 // xptr[i] > yptr[i]
1077 // -1 if xptr[i] < yptr[i]
1079 // a2 - a4, ip destroyed
1080 EXPORT(compare_loop_up) // word aligned compare loop up
1081 DECLARE_FUNCTION(compare_loop_up)
1082 GLABEL(compare_loop_up)
1083 ANDS a4,a3,#3 // multiple of 4 words ?
1084 BEQ compare_loop_up_l1 // yup, so branch
1085 LDR a4,[a2],#4 // COMPARE the first 1-3 words
1086 LDR ip,[a1],#4 // to align the total to a multiple
1087 CMP ip,a4 // of 4 words
1088 MVNLO a1,#0 // x < y -> -1
1089 MOVHI a1,#1 // x > y -> +1
1090 MOVNES pc,lr // and return result if not equal
1093 BLT compare_loop_up_l1 // need to branch 'cos PSR used
1102 BLE compare_loop_up_l1 // need to branch 'cos PSR used
1109 LABEL(compare_loop_up_l1)
1110 BICS a4,a3,#3 // set counter to multiple of 4
1111 MOVEQ a1,#0 // xptr[] == yptr[] -> 0
1112 MOVEQS pc,lr // if zero then we're done
1113 STMFD sp!,{v1-v6,lr} // save work regs
1114 MOV v6,a1 // move xptr to v6
1115 MOV a1,#1 // set result to +1
1116 LABEL(compare_loop_up_l2)
1117 LDMIA a2!,{a3,v1,v2,ip} // load 4 words in one go
1118 LDMIA v6!,{v3,v4,v5,lr} // load test words
1119 CMP v3,a3 // COMPARE the four words
1123 MVNLO a1,#0 // x < y -> -1 (a1 already holds +1)
// leave as soon as a pair differed; a1 is already -1 or +1 at this point
1124 LDMNEFD sp!,{v1-v6,pc}^
1125 SUBS a4,a4,#4 // decrement counter by 4
1126 BGT compare_loop_up_l2 // if count still positive then loop
1128 LDMFD sp!,{v1-v6,pc}^ // restore work regs and return
1130 #if CL_DS_BIG_ENDIAN_P
// addto_loop_down: destptr[] += sourceptr[] for count words.  It is a thin
// entry in front of add_loop_down: the count is moved into a4 and destptr is
// duplicated into a3, so that destptr serves as both second source (a2) and
// destination (a3).  Execution then runs straight on into the code that
// follows, so this entry must stay immediately in front of add_loop_down.
1132 // extern uintD addto_loop_down (uintD* sourceptr, uintD* destptr, uintC count);
1136 // a3 = count of words to be added
1138 // destptr[] = sourceptr[] + destptr[]
1140 // a2 - a4, ip destroyed
1141 EXPORT(addto_loop_down) // word aligned addto loop down
1142 DECLARE_FUNCTION(addto_loop_down)
1143 GLABEL(addto_loop_down)
1144 MOV a4,a3 // set regs for a call
1145 MOV a3,a2 // to add_loop_down
1146 // and drop into add_loop_down
// add_loop_down: destptr[] = sourceptr1[] + sourceptr2[] for count words,
// walking downwards and propagating the carry from word to word.  The final
// carry is returned in a1 (ADC of zero with zero yields exactly the C flag).
// The count modulo 4 is added word by word; the rest is added in groups of
// four.  Because the carry must survive from one ADCS to the next, the block
// loop counts down with a plain SUB and tests for zero with TEQ, neither of
// which disturbs C.
1148 // extern uintD add_loop_down (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count);
1153 // a4 = count of words to be added
1155 // destptr[] = sourceptr1[] + sourceptr2[]
1157 // a2 - a4, ip destroyed
1158 EXPORT(add_loop_down) // word aligned add loop down
1159 DECLARE_FUNCTION(add_loop_down)
1160 GLABEL(add_loop_down)
1161 ANDS ip,a4,#3 // multiple of 4 words ?
1162 BEQ add_loop_down_l1 // yup, so branch
1164 LDR v6,[a2,#-4]! // add the first 1-3 words
1165 LDR lr,[a1,#-4]! // to align the total to a multiple
1166 ADDS lr,lr,v6 // of 4 words
1169 BEQ add_loop_down_l0 // need to branch 'cos PSR used
1175 BEQ add_loop_down_l0 // need to branch 'cos PSR used
1180 LABEL(add_loop_down_l0) // at least one add has happened
1181 BICS a4,a4,#3 // set counter to multiple of 4
1182 BNE add_loop_down_l3 // branch if more adds to do
1183 ADCEQ a1,a4,a4 // set result to Carry (a4 is 0)
1184 LDMEQFD sp!,{v6,pc}^ // and return
1185 LABEL(add_loop_down_l1)
1186 BICS a4,a4,#3 // set counter to multiple of 4
1187 MOVEQ a1,#0 // no adds, so C = 0
1188 MOVEQS pc,lr // if zero then we're done
1189 CMN a4,#0 // clear carry bit
// Only v1-v5 are pushed here although the exit pops v1-v6 and pc: v6 and lr
// are presumably already on the stack on every path reaching this label (the
// single-word path returns with a {v6,pc} pop) -- confirm against the full
// source.
1191 LABEL(add_loop_down_l3)
1192 STMFD sp!,{v1-v5} // save work regs
1193 LABEL(add_loop_down_l2)
1194 LDMDB a2!,{v1,v2,v3,ip} // load 4 words in one go
1195 LDMDB a1!,{v4,v5,v6,lr} // and from source2
1196 ADCS lr,lr,ip // add the four words with carry
1200 STMDB a3!,{v4,v5,v6,lr} // store 4 results
1201 SUB a4,a4,#4 // decrement counter by 4, preserve C
1202 TEQ a4,#0 // are we done ?
1203 BNE add_loop_down_l2 // if count non-zero then loop
1204 ADC a1,a4,a4 // set result to Carry (a4 is 0)
1205 LDMFD sp!,{v1-v6,pc}^ // restore work regs and return
// inc_loop_down: add 1 to a multi-word number, walking downwards from ptr.
// Incrementing stops at the first word that does not wrap to zero, in which
// case 0 is returned; if every one of the count words wraps, 1 is returned.
// An odd leading word is handled first, then (when needed) a pair of words,
// and the rest four words per pass.  The conditional ADDEQS chain only touches
// the next word while the previous increment produced zero.
1207 // extern uintD inc_loop_down (uintD* ptr, uintC count);
1210 // a2 = count of words to be INCed
1212 // a1 = 0 if any words are non-zero after increment else 1
1213 // stop incrementing when first word becomes non-zero
1214 // a2 - a4, ip destroyed
1215 EXPORT(inc_loop_down) // word aligned inc loop down
1216 DECLARE_FUNCTION(inc_loop_down)
1217 GLABEL(inc_loop_down)
1218 ANDS a3,a2,#1 // multiple of 2 words ?
1219 BEQ inc_loop_down_l1 // yup, so branch
1220 LDR a4,[a1,#-4]! // INC the first word
1221 ADDS a4,a4,#1 // align the total to a multiple of 2
1223 MOVNE a1,#0 // set result to 0
1224 MOVNES pc,lr // return 0 if non-zero result
1225 LABEL(inc_loop_down_l1)
1226 BICS a4,a2,#1 // set counter to multiple of 2
1227 MOVEQ a1,#1 // return 1
1228 MOVEQS pc,lr // if zero then we're done
1229 MOV ip,a1 // move ptr to ip
1230 MOV a1,#0 // set result to 0
1232 BEQ inc_loop_down_l3
// the pair is loaded without writeback; the store below writes the same two
// words back and moves ip down
1233 LDMDB ip,{a2,a3} // load 2 words in one go
1234 ADDS a3,a3,#1 // INC the two words
1235 ADDEQS a2,a2,#1 // stopping when first word non-zero
1236 STMDB ip!,{a2,a3} // store 2 results
1237 MOVNES pc,lr // return 0 if any result non-zero
1238 SUBS a4,a4,#2 // decrement counter by 2
1239 MOVEQ a1,#1 // if finished loop then
1240 MOVEQS pc,lr // return 1
1241 LABEL(inc_loop_down_l3) // now a multiple of 4 words
1242 STMFD sp!,{v1,lr} // save work regs
1243 LABEL(inc_loop_down_l2)
1244 LDMDB ip,{a2,a3,v1,lr} // load 4 words in one go
1245 ADDS lr,lr,#1 // INC the four words
1246 ADDEQS v1,v1,#1 // stopping when first word non-zero
1249 STMDB ip!,{a2,a3,v1,lr} // store 4 results
1250 LDMNEFD sp!,{v1,pc}^ // return 0 if any result non-zero
1251 SUBS a4,a4,#4 // decrement counter by 4
1252 BGT inc_loop_down_l2 // if count still positive then loop
1254 LDMFD sp!,{v1,pc}^ // restore work regs and return 1
// sub_loop_down: destptr[] = sourceptr1[] - sourceptr2[] for count words,
// walking downwards and propagating the borrow from word to word.  On ARM the
// C flag is the inverse of borrow, so the chain starts with C set and the
// result is formed with SBC of zero from zero: a1 = 0 when no borrow is left,
// all ones (-1) when the subtraction borrowed out of the top word.
// The count modulo 4 is subtracted word by word; the rest in groups of four.
// Because C must survive from one SBCS to the next, the block loop counts down
// with a plain SUB and tests for zero with TEQ, neither of which disturbs C.
1256 // extern uintD sub_loop_down (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count);
1261 // a4 = count of words to be subtracted
1263 // destptr[] = sourceptr1[] - sourceptr2[]
1265 // a2 - a4, ip destroyed
// The entry point uses DECLARE_FUNCTION and GLABEL like every other exported
// routine in this file, so that the label defined here is the same symbol that
// EXPORT announces to the linker.
1266 EXPORT(sub_loop_down) // word aligned sub loop down
DECLARE_FUNCTION(sub_loop_down)
1267 GLABEL(sub_loop_down)
1268 ANDS ip,a4,#3 // multiple of 4 words ?
1269 BEQ sub_loop_down_l1 // yup, so branch
1271 LDR v6,[a2,#-4]! // subtract the first 1-3 words
1272 LDR lr,[a1,#-4]! // to align the total to a multiple
1273 SUBS lr,lr,v6 // of 4 words
1276 BNE sub_loop_down_l0 // branch if more than one subtract
1277 LABEL(sub_loop_down_l4) // drop through for better instr. timings
1278 BICS a4,a4,#3 // set counter to multiple of 4
1279 SBCEQ a1,a4,a4 // set result to Carry (a4 is 0)
1280 LDMEQFD sp!,{v6,pc}^ // and return
// Only v1-v5 are pushed here although the exit pops v1-v6 and pc: v6 and lr
// are presumably already on the stack on this path (the return just above pops
// {v6,pc}) -- confirm against the full source.
1281 STMFD sp!,{v1-v5} // save work regs
1282 B sub_loop_down_l2 // branch if more subtracts to do
1283 LABEL(sub_loop_down_l0)
1289 BEQ sub_loop_down_l4 // need to branch 'cos PSR used
1295 LABEL(sub_loop_down_l1)
1296 BICS a4,a4,#3 // set counter to multiple of 4
1297 MOVEQ a1,#0 // no subtracts, so C = 0
1298 MOVEQS pc,lr // if zero then we're done
1299 CMP a4,#0 // set carry bit, since a4 > 0
1300 STMFD sp!,{v1-v6,lr} // save work regs
1301 LABEL(sub_loop_down_l2)
1302 LDMDB a2!,{v1,v2,v3,ip} // load 4 words in one go
1303 LDMDB a1!,{v4,v5,v6,lr} // and from source2
1304 SBCS lr,lr,ip // subtract the four words with carry
1308 STMDB a3!,{v4,v5,v6,lr} // store 4 results
1309 SUB a4,a4,#4 // decrement counter by 4, preserve C
1310 TEQ a4,#0 // are we done ?
1311 BNE sub_loop_down_l2 // if count non-zero then loop
1312 SBC a1,a4,a4 // set result to Carry (a4 is 0)
1313 LDMFD sp!,{v1-v6,pc}^ // restore work regs and return
1315 // extern uintD subx_loop_down (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count, uintD carry);
// Multi-word subtraction with an incoming borrow, working downwards in
// memory. Same scheme as sub_loop_down, but the carry flag is seeded from
// the fifth argument, which is read from the stack at [sp].
// Registers as used by the code below:
// a1 = sourceptr1 (minuend), a2 = sourceptr2 (subtrahend), a3 = destptr
1320 // a4 = count of words to be subtracted
1323 // destptr[] = sourceptr1[] - sourceptr2[]
// Result in a1: SBC a1,a4,a4 with a4 = 0, i.e. 0 when the final carry flag
// is set (no borrow) and -1 when it is clear (borrow out).
1325 // a2 - a4, ip destroyed
// NOTE(review): this entry uses LABEL, whereas most other exported routines
// in this file use DECLARE_FUNCTION + GLABEL - confirm that the symbol named
// by EXPORT is really defined by this label.
1326 EXPORT(subx_loop_down) // word aligned xsub loop down
1327 LABEL(subx_loop_down)
1328 LDR ip,[sp] // get starting value of carry
1329 LABEL(subx_loop_down_lsub)
1330 RSBS ip,ip,#0 // set carry in PSR: C set only if ip was 0 (no borrow in)
1331 ANDS ip,a4,#3 // multiple of 4 words ? (C is left unchanged)
1332 BEQ subx_loop_down_l1 // yup, so branch
1334 LDR v6,[a2,#-4]! // subtract the first 1-3 words
1335 LDR lr,[a1,#-4]! // to align the total to a multiple
1336 SBCS lr,lr,v6 // of 4 words (lr = *--a1 - *--a2 - borrow)
1339 BNE subx_loop_down_l0 // branch if more than one subtract
1340 LABEL(subx_loop_down_l4) // drop through for better instr. timings
1341 BICS a4,a4,#3 // set counter to multiple of 4
1342 SBCEQ a1,a4,a4 // set result to Carry (a4 is 0): 0 or -1
1343 LDMEQFD sp!,{v6,pc}^ // and return (pops v6 and the return address)
// More words to do: push the remaining work registers so that the stack
// matches the {v1-v6,pc} pop at the end of the main loop.
// NOTE(review): presumably {v6,lr} were pushed earlier on this path - verify.
1344 STMFD sp!,{v1-v5} // save work regs
1345 B subx_loop_down_l2 // branch if more subtracts to do
1346 LABEL(subx_loop_down_l0)
1352 BEQ subx_loop_down_l4 // need to branch 'cos PSR used
1358 LABEL(subx_loop_down_l1)
1359 BICS a4,a4,#3 // set counter to multiple of 4
1360 SBCEQ a1,a4,a4 // set result to Carry (a4 is 0): incoming borrow is returned
1361 MOVEQS pc,lr // if zero then we're done
1362 STMFD sp!,{v1-v6,lr} // save work regs
// Main loop: four words per iteration, borrow carried in the C flag.
1363 LABEL(subx_loop_down_l2)
1364 LDMDB a2!,{v1,v2,v3,ip} // load 4 words of the subtrahend (via a2)
1365 LDMDB a1!,{v4,v5,v6,lr} // and 4 words of the minuend (via a1)
1366 SBCS lr,lr,ip // subtract the four words with carry
1370 STMDB a3!,{v4,v5,v6,lr} // store 4 results
1371 SUB a4,a4,#4 // decrement counter by 4, preserve C
1372 TEQ a4,#0 // are we done ? (does not disturb C)
1373 BNE subx_loop_down_l2 // if count non-zero then loop
1374 SBC a1,a4,a4 // set result to Carry (a4 is 0): 0 or -1
1375 LDMFD sp!,{v1-v6,pc}^ // restore work regs and return
1377 // extern uintD subfrom_loop_down (uintD* sourceptr, uintD* destptr, uintC count);
// In-place multi-word subtraction, working downwards in memory.
// Registers as used by the code below:
// a1 = sourceptr (subtrahend, pre-decremented)
// a2 = destptr (minuend and destination, pre-decremented)
1381 // a3 = count of words to be subtracted
1383 // destptr[] = destptr[] - sourceptr[]
// Result in a1: 0 if count is 0; otherwise SBC a1,a4,a4 with a4 = 0, i.e.
// 0 when the final carry flag is set (no borrow) and -1 when it is clear.
1385 // a2 - a4, ip destroyed
1386 EXPORT(subfrom_loop_down) // word aligned subfrom loop down
1387 DECLARE_FUNCTION(subfrom_loop_down)
1388 GLABEL(subfrom_loop_down)
1389 ANDS ip,a3,#3 // multiple of 4 words ?
1390 BEQ subfrom_loop_down_l1 // yup, so branch
1392 LDR a4,[a1,#-4]! // subtract the first 1-3 words
1393 LDR lr,[a2,#-4]! // to align the total to a multiple
1394 SUBS lr,lr,a4 // of 4 words (lr = *--a2 - *--a1, sets C)
1397 BNE subfrom_loop_down_l0 // branch if more than one subtract
1398 LABEL(subfrom_loop_down_l4) // drop through for better instr. timings
1399 BICS a4,a3,#3 // set counter to multiple of 4
1400 SBCEQ a1,a4,a4 // set result to Carry (a4 is 0): 0 or -1
1401 LDMEQFD sp!,{pc}^ // and return (pops the return address)
// More words to do: push the work registers so that the stack matches the
// {v1-v5,pc} pop at the end of the main loop.
// NOTE(review): presumably lr was pushed earlier on this path - verify.
1402 STMFD sp!,{v1-v5} // save work regs
1403 B subfrom_loop_down_l2 // branch if more subtracts to do
1404 LABEL(subfrom_loop_down_l0)
1410 BEQ subfrom_loop_down_l4 // need to branch 'cos PSR used
1415 B subfrom_loop_down_l4
1416 LABEL(subfrom_loop_down_l1)
1417 BICS a4,a3,#3 // set counter to multiple of 4
1418 MOVEQ a1,#0 // no subtracts, so C = 0
1419 MOVEQS pc,lr // if zero then we're done
1420 CMP a4,#0 // set carry bit, since a4 > 0 (C set = no borrow in)
1421 STMFD sp!,{v1-v5,lr} // save work regs
// Main loop: four words per iteration, borrow carried in the C flag.
// a3 is free here because the counter lives in a4.
1422 LABEL(subfrom_loop_down_l2)
1423 LDMDB a1!,{a3,v1,v2,ip} // load 4 source words in one go
1424 LDMDB a2,{v3,v4,v5,lr} // and from destptr (no writeback yet)
1425 SBCS lr,lr,ip // subtract the four words with carry
1429 STMDB a2!,{v3,v4,v5,lr} // store 4 results (now with writeback)
1430 SUB a4,a4,#4 // decrement counter by 4, preserve C
1431 TEQ a4,#0 // are we done ? (does not disturb C)
1432 BNE subfrom_loop_down_l2 // if count non-zero then loop
1433 SBC a1,a4,a4 // set result to Carry (a4 is 0): 0 or -1
1434 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
1436 // extern uintD dec_loop_down (uintD* ptr, uintC count);
// Decrement a multi-word number by one, working downwards in memory.
// a1 = ptr (pre-decremented); later copied to ip so a1 can hold the result
1439 // a2 = count of words to be DECed
1441 // a1 = 0 if any words are non-zero before decrement else -1
1442 // stop decrementing when first word is non-zero
1443 // a2 - a4, ip destroyed
// Flag convention used throughout: SUBS x,x,#1 sets C when x was non-zero
// (no borrow), so "CS" means the borrow has been absorbed and we can stop.
1444 EXPORT(dec_loop_down) // word aligned dec loop down
1445 DECLARE_FUNCTION(dec_loop_down)
1446 GLABEL(dec_loop_down)
1447 ANDS a3,a2,#1 // multiple of 2 words ?
1448 BEQ dec_loop_down_l1 // yup, so branch
1449 LDR a4,[a1,#-4]! // DEC the first word
1450 SUBS a4,a4,#1 // align the total to a multiple of 2
1452 MOVCS a1,#0 // set result to 0
1453 MOVCSS pc,lr // return 0 if non-zero result
1454 LABEL(dec_loop_down_l1)
1455 BICS a4,a2,#1 // set counter to multiple of 2
1456 MVNEQ a1,#0 // return -1
1457 MOVEQS pc,lr // if zero then we're done
1458 MOV ip,a1 // move ptr to ip
1459 MOV a1,#0 // set result to 0
1461 BEQ dec_loop_down_l3
1462 LDMDB ip,{a2,a3} // load 2 words in one go (a3 is the higher address)
1463 SUBS a3,a3,#1 // DEC the two words
1464 SUBCCS a2,a2,#1 // stopping when first word non-zero
1465 STMDB ip!,{a2,a3} // store 2 results
1466 MOVCSS pc,lr // return 0 if any result non-zero
1467 SUBS a4,a4,#2 // decrement counter by 2
1468 MVNEQ a1,#0 // if finished loop then
1469 MOVEQS pc,lr // return -1
1470 LABEL(dec_loop_down_l3) // now a multiple of 4 words
1471 STMFD sp!,{v1,lr} // save work regs
1472 LABEL(dec_loop_down_l2)
1473 LDMDB ip,{a2,a3,v1,lr} // load 4 words in one go (lr is the highest address)
1474 SUBS lr,lr,#1 // DEC the four words
1475 SUBCCS v1,v1,#1 // stopping when first word non-zero
1478 STMDB ip!,{a2,a3,v1,lr} // store 4 results
1479 LDMCSFD sp!,{v1,pc}^ // return 0 if any carry
1480 SUBS a4,a4,#4 // decrement counter by 4
1481 BGT dec_loop_down_l2 // if count still positive then loop
1483 LDMFD sp!,{v1,pc}^ // restore work regs and return -1
1485 // extern void neg_loop_down (uintD* ptr, uintC count);
// Two's-complement negation of a multi-word number in place, working
// downwards in memory: zero words are skipped, the first non-zero word is
// negated (RSB from 0) and every remaining word is bitwise inverted (MVN).
// a1 = ptr (pre-decremented)
1488 // a2 = count of words. The long integer is to be NEGated
1490 // ptr[] = -ptr[] for count words
// NOTE: although the prototype above says void, the code leaves a value in
// a1: 0 when count is 0 or all words are zero, -1 otherwise.
1492 // a2 - a4, ip destroyed
1493 EXPORT(neg_loop_down) // word aligned neg loop down
1494 DECLARE_FUNCTION(neg_loop_down)
1495 GLABEL(neg_loop_down)
1496 CMPS a2,#0 // count = 0 ?
1497 MOVEQ a1,#0 // yup, so return 0
1499 LABEL(neg_loop_down_l1) // skip all the zero words first
1500 LDR a3,[a1,#-4]! // compare words against zero
1501 CMPS a3,#0 // downwards in memory
1502 BNE neg_loop_down_l2 // non-zero, so negate rest of words
1503 SUBS a2,a2,#1 // reduce count of words
1504 BNE neg_loop_down_l1 // more ?, so loop
1505 MOV a1,#0 // return 0
1507 LABEL(neg_loop_down_l2)
1508 RSB a3,a3,#0 // first non-zero word = -word
1511 MVNEQ a1,#0 // done ? -> return -1
1513 // now NOT rest of the words
1514 ANDS a3,a2,#3 // multiple of 4 words ?
1515 BEQ neg_loop_down_l3 // yup, so branch
1516 CMP a3,#2 // NOT the first 1-3 words (flags survive the LDR/MVN below)
1517 LDR a3,[a1,#-4]! // to align the total to a multiple
1518 MVN a3,a3 // of 4 words
1520 BLT neg_loop_down_l3 // better to branch than skip instrs.
1527 LABEL(neg_loop_down_l3)
1528 BICS a4,a2,#3 // set counter to multiple of 4
1529 MVNEQ a1,#0 // set result to -1
1530 MOVEQS pc,lr // if zero then we're done
1531 STMFD sp!,{lr} // save work regs
1532 LABEL(neg_loop_down_l4)
1533 LDMDB a1,{a2,a3,ip,lr} // load 4 words in one go,NO writeback
1534 MVN a2,a2 // NOT the four words
1538 STMDB a1!,{a2,a3,ip,lr} // store 4 results (writeback moves a1 down)
1539 SUBS a4,a4,#4 // decrement counter by 4
1540 BGT neg_loop_down_l4 // if count still positive then loop
1541 MVN a1,#0 // set result to -1
1542 LDMFD sp!,{pc}^ // restore work regs and return -1
1544 // extern uintD shift1left_loop_down (uintD* ptr, uintC count);
// Shift a multi-word number left by one bit in place, working downwards in
// memory. The bit shifted out of each word travels in the C flag:
// ADCS x,x,x computes 2*x + C and leaves the old top bit in C.
// a1 = ptr
1547 // a2 = count of words to be shifted left
1549 // a1 = carry out from last shift left
// (the result is formed as ADC a1,a4,a4 with a4 = 0, i.e. 0 or 1)
1550 // a2 - a4, ip destroyed
1551 EXPORT(shift1left_loop_down) // word aligned shift1left loop down
1552 DECLARE_FUNCTION(shift1left_loop_down)
1553 GLABEL(shift1left_loop_down)
1554 CMN a1,#0 // clear carry bit, since a1 > 0 (adding 0 never carries)
1555 ANDS a3,a2,#1 // multiple of 2 words ?
1556 BEQ shift1left_loop_down_l1 // yup, so branch
1557 LDR a4,[a1,#-4]! // shift left the first word
1560 LABEL(shift1left_loop_down_l1)
1561 BICS a4,a2,#1 // set counter to multiple of 2
1562 ADCEQ a1,a4,a4 // if zero set result to C (a4 is 0)
1563 MOVEQS pc,lr // and return
1564 ANDS a3,a4,#3 // multiple of 4 words ?
1565 BEQ shift1left_loop_down_l3 // yup, so branch
1566 LDMDB a1,{a2,a3} // load 2 words in one go (a3 is the higher address)
1567 ADCS a3,a3,a3 // shift left the two words
1569 STMDB a1!,{a2,a3} // store 2 results
1570 BICS a4,a4,#2 // decrement counter by 2
1571 ADCEQ a1,a4,a4 // set result to Carry (a4 is 0)
1572 MOVEQS pc,lr // and return
1573 LABEL(shift1left_loop_down_l3) // now a multiple of 4 words
1574 STMFD sp!,{lr} // save work regs
1575 LABEL(shift1left_loop_down_l2)
1576 LDMDB a1,{a2,a3,ip,lr} // load 4 words in one go (lr is the highest address)
1577 ADCS lr,lr,lr // shift left the four words
1581 STMDB a1!,{a2,a3,ip,lr} // store 4 results
1582 SUB a4,a4,#4 // decrement counter by 4 (no S: keeps C)
1583 TEQ a4,#0 // are we done ? (does not disturb C)
1584 BNE shift1left_loop_down_l2 // if count non-zero then loop
1585 ADC a1,a4,a4 // set result to Carry (a4 is 0)
1586 LDMFD sp!,{pc}^ // restore work regs and return the carry
1588 // extern uintD shiftleft_loop_down (uintD* ptr, uintC count, uintC i, uintD carry);
// Shift a multi-word number left by a3 bits in place, working downwards in
// memory. a4 carries the bits shifted out of one word into the next.
// a1 = ptr
1591 // a2 = count of words to be shifted left
1592 // a3 = size of left shift
1593 // a4 = value to ORR in for first shift
1595 // a1 = shift out from last shift left
1596 // a2 - a4, ip destroyed
// v6 holds 32 - a3, the right-shift amount that extracts the outgoing bits.
// NOTE(review): the exits pop {v6,pc}; presumably {v6,lr} are pushed at
// entry - verify.
1597 EXPORT(shiftleft_loop_down) // word aligned shiftleft loop down
1598 DECLARE_FUNCTION(shiftleft_loop_down)
1599 GLABEL(shiftleft_loop_down)
1601 RSB v6,a3,#32 // size of complementary right shift
1602 ANDS ip,a2,#3 // multiple of 4 words ?
1603 BEQ shiftleft_loop_down_l1 // yup, so branch
1604 LDR lr,[a1,#-4]! // shiftleft the first 1-3 words
1605 ORR a4,a4,lr,ASL a3 // to align the total to a multiple
1606 STR a4,[a1,#0] // of 4 words
1609 BLT shiftleft_loop_down_l1 // better to branch than skip instrs.
1611 ORRGE a4,a4,lr,ASL a3
1615 ORRGT a4,a4,lr,ASL a3
1618 LABEL(shiftleft_loop_down_l1)
1619 BICS ip,a2,#3 // set counter to multiple of 4
1620 MOVEQ a1,a4 // if zero then we're done
1621 LDMEQFD sp!,{v6,pc}^ // so return last shift out
1622 STMFD sp!,{v1-v3} // save work regs
// Main loop: words are loaded into {a2,v1,v2,v3} and the shifted results
// are built one register higher, in {v1,v2,v3,lr}, which is what is stored.
// a2 is reused as a data register here; the counter lives in ip.
1623 LABEL(shiftleft_loop_down_l2)
1624 LDMDB a1,{a2,v1,v2,v3} // load 4 words in one go
1625 ORR lr,a4,v3,ASL a3 // shiftleft the four words
1626 MOV a4,v3,LSR v6 // keep carry in a4
1627 ORR v3,a4,v2,ASL a3 // and store results up a register
1628 MOV a4,v2,LSR v6 // to regs v1-v3,lr
1633 STMDB a1!,{v1,v2,v3,lr} // store 4 results
1634 SUBS ip,ip,#4 // decrement counter by 4
1635 BGT shiftleft_loop_down_l2 // if count still positive then loop
1636 MOV a1,a4 // result = last shift out
1637 LDMFD sp!,{v1-v3,v6,pc}^ // restore work regs and return
1639 // extern uintD shiftleftcopy_loop_down (uintD* sourceptr, uintD* destptr, uintC count, uintC i);
// Copy a multi-word number from sourceptr to destptr while shifting it left
// by a4 bits, working downwards in memory. The shift carry starts at 0.
// a1 = sourceptr (pre-decremented), a2 = destptr (pre-decremented)
1643 // a3 = count of words to be shifted left
1644 // a4 = size of left shift
1646 // a1 = shift out from last shift left
1647 // a2 - a4, ip destroyed
// v5 carries the bits shifted out of one word into the next;
// v6 holds 32 - a4, the right-shift amount that extracts those bits.
1648 EXPORT(shiftleftcopy_loop_down) // word aligned shiftleftcopy loop down
1649 DECLARE_FUNCTION(shiftleftcopy_loop_down)
1650 GLABEL(shiftleftcopy_loop_down)
1651 STMFD sp!,{v5,v6,lr}
1652 MOV v5,#0 // initial shift carry
1653 RSB v6,a4,#32 // size of complementary right shift
1654 ANDS ip,a3,#3 // multiple of 4 words ?
1655 BEQ shiftleftcopy_loop_down_l1 // yup, so branch
1656 LDR lr,[a1,#-4]! // shiftleft the first 1-3 words
1657 ORR v5,v5,lr,ASL a4 // to align the total to a multiple
1658 STR v5,[a2,#-4]! // of 4 words
1661 BLT shiftleftcopy_loop_down_l1 // better to branch than skip instrs.
1663 ORRGE v5,v5,lr,ASL a4
1667 ORRGT v5,v5,lr,ASL a4
1670 LABEL(shiftleftcopy_loop_down_l1)
1671 BICS ip,a3,#3 // set counter to multiple of 4
1672 MOVEQ a1,v5 // if zero then we're done
1673 LDMEQFD sp!,{v5,v6,pc}^ // so return last shift out
1674 STMFD sp!,{v1-v3} // save work regs
// Main loop: words are loaded into {a3,v1,v2,v3} and the shifted results
// are built one register higher, in {v1,v2,v3,lr}, which is what is stored.
// a3 is reused as a data register here; the counter lives in ip.
1675 LABEL(shiftleftcopy_loop_down_l2)
1676 LDMDB a1!,{a3,v1,v2,v3} // load 4 words in one go
1677 ORR lr,v5,v3,ASL a4 // shiftleft the four words
1678 MOV v5,v3,LSR v6 // keep carry in v5
1679 ORR v3,v5,v2,ASL a4 // and store results up a register
1680 MOV v5,v2,LSR v6 // to regs v1-v3,lr
1685 STMDB a2!,{v1,v2,v3,lr} // store 4 results
1686 SUBS ip,ip,#4 // decrement counter by 4
1687 BGT shiftleftcopy_loop_down_l2 // if count still positive then loop
1688 MOV a1,v5 // result = last shift out
1689 LDMFD sp!,{v1-v3,v5,v6,pc}^ // restore work regs and return
1691 // extern uintD shift1right_loop_up (uintD* ptr, uintC count, uintD carry);
// Shift a multi-word number right by one bit in place, working upwards in
// memory. The bit shifted out of each word travels in the C flag and is
// rotated into the top of the next word with RRX.
// a1 = ptr
1694 // a2 = count of words to be shifted right
// a3 = incoming carry; only its bit 0 is used (moved into C by LSR #1)
1697 // a1 = carry out from last shift right
1698 // a2 - a4, ip destroyed
// NOTE(review): two exits return the carry as MOV a1,a4,RRX (carry in bit
// 31) but the exit after the 2-word step uses ADCEQ a1,a4,a4 (carry in bit
// 0). Callers presumably only test the result for zero / non-zero - verify.
1699 EXPORT(shift1right_loop_up) // word aligned shift1right loop up
1700 DECLARE_FUNCTION(shift1right_loop_up)
1701 GLABEL(shift1right_loop_up)
1702 MOVS a3,a3,LSR #1 // set carry
1703 ANDS a3,a2,#1 // multiple of 2 words ?
1704 BEQ shift1right_loop_up_l1 // yup, so branch
1705 LDR a4,[a1] // shift right the first word
1708 LABEL(shift1right_loop_up_l1)
1709 BICS a4,a2,#1 // set counter to multiple of 2
1710 MOVEQ a1,a4,RRX // if zero set result to C (a4 is 0)
1711 MOVEQS pc,lr // and return
1712 ANDS a3,a4,#3 // multiple of 4 words ?
1713 BEQ shift1right_loop_up_l3 // yup, so branch
1714 LDMIA a1,{a2,a3} // load 2 words in one go (a2 is the lower address)
1715 MOVS a2,a2,RRX // shift right the two words
1717 STMIA a1!,{a2,a3} // store 2 results
1718 BICS a4,a4,#2 // decrement counter by 2
1719 ADCEQ a1,a4,a4 // set result to Carry (a4 is 0); here in bit 0
1720 MOVEQS pc,lr // and return
1721 LABEL(shift1right_loop_up_l3) // now a multiple of 4 words
1722 STMFD sp!,{lr} // save work regs
1723 LABEL(shift1right_loop_up_l2)
1724 LDMIA a1,{a2,a3,ip,lr} // load 4 words in one go (a2 is the lowest address)
1725 MOVS a2,a2,RRX // shift right the four words
1729 STMIA a1!,{a2,a3,ip,lr} // store 4 results
1730 SUB a4,a4,#4 // decrement counter by 4 (no S: keeps C)
1731 TEQ a4,#0 // are we done ? (does not disturb C)
1732 BNE shift1right_loop_up_l2 // if count non-zero then loop
1733 MOV a1,a4,RRX // set result to Carry (a4 is 0); carry ends up in bit 31
1734 LDMFD sp!,{pc}^ // restore work regs and return the carry
1736 // extern uintD shiftright_loop_up (uintD* ptr, uintC count, uintC i);
// Shift a multi-word number right by a3 bits in place, working upwards in
// memory. The shift carry starts at 0.
// a1 = ptr (post-incremented)
1739 // a2 = count of words to be shifted right
1740 // a3 = size of right shift
1742 // a1 = shift out from last shift right
1743 // a2 - a4, ip destroyed
// a4 carries the bits shifted out of one word into the next;
// v6 holds 32 - a3, the left-shift amount that extracts those bits.
// The label shiftright_loop_up_l0 is also entered from
// shiftrightsigned_loop_up with a4 and v6 already set up.
// NOTE(review): the exits pop {v6,pc}; presumably {v6,lr} are pushed at
// entry - verify.
1744 EXPORT(shiftright_loop_up) // word aligned shiftright loop up
1745 DECLARE_FUNCTION(shiftright_loop_up)
1746 GLABEL(shiftright_loop_up)
1748 MOV a4,#0 // initial shift carry
1749 RSB v6,a3,#32 // size of complementary left shift
1750 LABEL(shiftright_loop_up_l0)
1751 ANDS ip,a2,#3 // multiple of 4 words ?
1752 BEQ shiftright_loop_up_l1 // yup, so branch
1753 LDR lr,[a1] // shiftright the first 1-3 words
1754 ORR a4,a4,lr,LSR a3 // to align the total to a multiple
1755 STR a4,[a1],#4 // of 4 words
1758 BLT shiftright_loop_up_l1 // better to branch than skip instrs.
1760 ORRGE a4,a4,lr,LSR a3
1764 ORRGT a4,a4,lr,LSR a3
1767 LABEL(shiftright_loop_up_l1)
1768 BICS ip,a2,#3 // set counter to multiple of 4
1769 MOVEQ a1,a4 // if zero then we're done
1770 LDMEQFD sp!,{v6,pc}^ // so return last shift out
1771 STMFD sp!,{v1-v3} // save work regs
// Main loop: words are loaded into {v1,v2,v3,lr} and the shifted results
// are built one register lower, in {a2,v1,v2,v3}, which is what is stored.
// a2 is reused as a data register here; the counter lives in ip.
1772 LABEL(shiftright_loop_up_l2)
1773 LDMIA a1,{v1,v2,v3,lr} // load 4 words in one go
1774 ORR a2,a4,v1,LSR a3 // shiftright the four words
1775 MOV a4,v1,ASL v6 // keep carry in a4
1776 ORR v1,a4,v2,LSR a3 // and store results down a register
1777 MOV a4,v2,ASL v6 // to regs a2,v1-v3
1782 STMIA a1!,{a2,v1,v2,v3} // store 4 results
1783 SUBS ip,ip,#4 // decrement counter by 4
1784 BGT shiftright_loop_up_l2 // if count still positive then loop
1785 MOV a1,a4 // result = last shift out
1786 LDMFD sp!,{v1-v3,v6,pc}^ // restore work regs and return
1788 // extern uintD shiftrightsigned_loop_up (uintD* ptr, uintC count, uintC i);
// Arithmetic (sign-extending) right shift of a multi-word number in place.
// Only the initial carry differs from shiftright_loop_up: it is preloaded
// with copies of the sign bit of the first word, after which control joins
// the unsigned shift code at shiftright_loop_up_l0.
// a1 = ptr
1791 // a2 = count of words to be shifted right signed
1792 // a3 = size of right shift
1794 // a1 = shift out from last shift right
1795 // a2 - a4, ip destroyed
// NOTE(review): the shared code returns by popping {v6,pc}; presumably
// {v6,lr} are pushed before the first instruction below - verify.
1796 EXPORT(shiftrightsigned_loop_up)// word aligned shiftrightsigned loop up
1797 DECLARE_FUNCTION(shiftrightsigned_loop_up)
1798 GLABEL(shiftrightsigned_loop_up)
1800 RSB v6,a3,#32 // size of complementary left shift
1801 LDR lr,[a1] // setup carry for first shift.
1802 MOV a4,lr,ASR #31 // this is the sign extended bits (all 0s or all 1s)
1803 AND a4,a4,a4,LSL v6 // 31->(32-i) of the first word (keep only the top i bits)
1804 B shiftright_loop_up_l0 // use right shift code now
1806 // extern uintD shiftrightcopy_loop_up (uintD* sourceptr, uintD* destptr, uintC count, uintC i, uintD carry);
// Copy a multi-word number from sourceptr to destptr while shifting it
// right by a4 bits, working upwards in memory.
// a1 = sourceptr (post-incremented), a2 = destptr (post-incremented)
1810 // a3 = count of words to be shifted right
1811 // a4 = size of right shift
1812 // [sp] = carry for first shift
1814 // a1 = shift out from last shift right
1815 // a2 - a4, ip destroyed
// v5 carries the bits shifted out of one word into the next;
// v6 holds 32 - a4, the left-shift amount that extracts those bits.
1816 EXPORT(shiftrightcopy_loop_up) // word aligned shiftrightcopy loop up
1817 DECLARE_FUNCTION(shiftrightcopy_loop_up)
1818 GLABEL(shiftrightcopy_loop_up)
1819 STMFD sp!,{v5,v6,lr}
1820 LDR v5,[sp,#12] // initial shift carry (fifth argument, now above the 3 pushed words)
1821 RSB v6,a4,#32 // size of complementary left shift
1823 LABEL(shiftrightcopy_loop_up_l0)
1824 ANDS ip,a3,#3 // multiple of 4 words ?
1825 BEQ shiftrightcopy_loop_up_l1 // yup, so branch
1826 LDR lr,[a1],#4 // shiftright the first 1-3 words
1827 ORR v5,v5,lr,LSR a4 // to align the total to a multiple
1828 STR v5,[a2],#4 // of 4 words
1831 BLT shiftrightcopy_loop_up_l1 // better to branch than skip instrs.
1833 ORRGE v5,v5,lr,LSR a4
1837 ORRGT v5,v5,lr,LSR a4
1840 LABEL(shiftrightcopy_loop_up_l1)
1841 BICS ip,a3,#3 // set counter to multiple of 4
1842 MOVEQ a1,v5 // if zero then we're done
1843 LDMEQFD sp!,{v5,v6,pc}^ // so return last shift out
1844 STMFD sp!,{v1-v3} // save work regs
// Main loop: words are loaded into {v1,v2,v3,lr} and the shifted results
// are built one register lower, in {a3,v1,v2,v3}, which is what is stored.
// a3 is reused as a data register here; the counter lives in ip.
1845 LABEL(shiftrightcopy_loop_up_l2)
1846 LDMIA a1!,{v1,v2,v3,lr} // load 4 words in one go
1847 ORR a3,v5,v1,LSR a4 // shiftright the four words
1848 MOV v5,v1,ASL v6 // keep carry in v5
1849 ORR v1,v5,v2,LSR a4 // and store results down a register
1850 MOV v5,v2,ASL v6 // to regs a3,v1-v3
1855 STMIA a2!,{a3,v1,v2,v3} // store 4 results
1856 SUBS ip,ip,#4 // decrement counter by 4
1857 BGT shiftrightcopy_loop_up_l2 // if count still positive then loop
1858 MOV a1,v5 // result = last shift out
1859 LDMFD sp!,{v1-v3,v5,v6,pc}^ // restore work regs and return
// Internal helper: unsigned 32x32 -> 64 bit multiply built from four
// 16x16 -> 32 bit MULs.
// In:  a1 = x (left unchanged), ip = y
// Out: v1 = low 32 bits of x*y, ip = high 32 bits of x*y
1869 // v2,v3,v4 destroyed
1870 LABEL(mulu32_64_vregs)
1871 MOV v1,a1,LSR #16 // temp := top half of x
1872 MOV v2,ip,LSR #16 // hi := top half of y
1873 BIC v3,a1,v1,LSL #16 // x := bottom half of x
1874 BIC ip,ip,v2,LSL #16 // y := bottom half of y
1875 MUL v4,v3,ip // low section of result
1876 MUL ip,v1,ip // ) middle sections
1877 MUL v3,v2,v3 // ) of result
1878 MUL v2,v1,v2 // high section of result
1879 ADDS ip,ip,v3 // add middle sections
1880 // (can't use mla as we need carry)
1881 ADDCS v2,v2,#0x10000 // carry from above add (worth 2^16 in the high word)
1882 ADDS v1,v4,ip,LSL #16 // v1 is now bottom 32 bits of result
1883 ADC ip,v2,ip,LSR #16 // ip is top 32 bits
1887 // extern uintD mulusmall_loop_down (uintD digit, uintD* ptr, uintC len, uintD newdigit);
// Multiply a multi-word number in place by a small digit, working downwards
// in memory, adding newdigit as the initial carry.
// a1 = digit, a2 = ptr (the word is stored back through [a2,#0])
1891 // a3 = count of words to be multiplied down
1892 // a4 = new digit = carry
1894 // a1 = final carry of multiply
1895 // a2 - a4, ip destroyed
// NOTE(review): two alternative bodies follow, both defining the label
// mulusmall_loop_down_l1 - first a UMULL-based one, then one built from
// 16-bit MULs. They must be selected by a conditional - verify.
1896 EXPORT(mulusmall_loop_down)
1897 DECLARE_FUNCTION(mulusmall_loop_down)
1898 GLABEL(mulusmall_loop_down)
// --- body using UMULL (64-bit product: v1 = low, ip = high) ---
1904 LABEL(mulusmall_loop_down_l1)
1906 UMULL v1,ip,a1,ip // muluD(digit,*--ptr,hi=,lo=)
1907 ADDS v1,v1,a4 // lo += carry
1908 ADC a4,ip,#0 // if (lo<carry) { hi += 1 }; carry=hi
1909 STR v1,[a2,#0] // *ptr = lo
1910 SUBS a3,a3,#1 // len--
1911 BNE mulusmall_loop_down_l1 // until len==0
1912 MOV a1,a4 // return carry
// --- body using 16-bit partial products ---
1915 STMFD sp!,{v1-v2,lr}
1916 LABEL(mulusmall_loop_down_l1)
1919 // BL mulu32_64_vregs // muluD(digit,*--ptr,hi=,lo=)
1920 // replaced by multiplication of a small x = a1 and a big y = ip :
// NOTE(review): with only two MULs this presumably relies on digit being
// small enough that digit * (16-bit half of y) fits in 32 bits - verify.
1921 MOV v1,ip,LSR #16 // top half of y
1922 BIC ip,ip,v1,LSL #16 // bottom half of y
1923 MUL v2,a1,v1 // middle section of result
1924 MUL v1,a1,ip // low section of result
1925 MOV ip,#0 // high section of result
1926 ADDS v1,v1,v2,LSL #16 // bottom 32 bits of result
1927 ADC ip,ip,v2,LSR #16 // top 32 bits of result
1929 ADDS v1,v1,a4 // lo += carry
1930 ADC a4,ip,#0 // if (lo<carry) { hi += 1 }; carry=hi
1931 STR v1,[a2,#0] // *ptr = lo
1932 SUBS a3,a3,#1 // len--
1933 BNE mulusmall_loop_down_l1 // until len==0
1934 MOV a1,a4 // return carry
1935 LDMFD sp!,{v1-v2,pc}^
1938 // extern void mulu_loop_down (uintD digit, uintD* sourceptr, uintD* destptr, uintC len);
// Multiply a multi-word number by one digit, writing the product through
// destptr, working downwards in memory. After the loop the final carry is
// stored as one additional word below the last product word.
// a1 = digit, a3 = destptr (pre-decremented on every store)
// a2 = sourceptr, per the prototype; v5 = running carry (high word of the
// previous product)
1943 // a4 = count of words to be multiplied down
1945 // a1 - a4, ip destroyed
// NOTE(review): two alternative bodies follow, both defining the label
// mulu_loop_down_l1 - first a UMULL-based one, then one calling
// mulu32_64_vregs. They must be selected by a conditional - verify.
1946 EXPORT(mulu_loop_down)
1947 DECLARE_FUNCTION(mulu_loop_down)
1948 GLABEL(mulu_loop_down)
// --- body using UMULL (64-bit product: v1 = low, ip = high) ---
1950 STMFD sp!,{v1,v5,lr}
1952 LABEL(mulu_loop_down_l1)
1954 UMULL v1,ip,a1,ip // muluD(digit,*--sourceptr,hi=,lo=)
1955 ADDS v1,v1,v5 // lo += carry
1956 ADC v5,ip,#0 // if (lo<carry) { hi += 1 }; carry=hi
1957 STR v1,[a3,#-4]! // *--destptr = lo
1958 SUBS a4,a4,#1 // len--
1959 BNE mulu_loop_down_l1 // until len==0
1960 STR v5,[a3,#-4]! // *--destptr = carry
1961 LDMFD sp!,{v1,v5,pc}^
// --- body using the mulu32_64_vregs helper (v1 = low, ip = high;
//     the helper destroys v2-v4, hence the larger register save) ---
1963 STMFD sp!,{v1-v5,lr}
1965 LABEL(mulu_loop_down_l1)
1967 BL mulu32_64_vregs // muluD(digit,*--sourceptr,hi=,lo=)
1968 ADDS v1,v1,v5 // lo += carry
1969 ADC v5,ip,#0 // if (lo<carry) { hi += 1 }; carry=hi
1970 STR v1,[a3,#-4]! // *--destptr = lo
1971 SUBS a4,a4,#1 // len--
1972 BNE mulu_loop_down_l1 // until len==0
1973 STR v5,[a3,#-4]! // *--destptr = carry
1974 LDMFD sp!,{v1-v5,pc}^
1977 // extern void muluadd_loop_down (uintD digit, uintD* sourceptr, uintD* destptr, uintC len);
// Multiply-accumulate: for each word, destptr[] += digit * sourceptr[]
// with carry propagation, working downwards in memory.
// a1 = digit, a3 = destptr (pre-decremented on load, stored back in place)
// a2 = sourceptr, per the prototype; v5 = running carry
1982 // a4 = count of words to be multiplied added down
// NOTE: although the prototype above says void, the code returns the final
// carry in a1.
1984 // a1 - a4, ip destroyed
// NOTE(review): two alternative bodies follow, both defining the label
// muluadd_loop_down_l1 - first a UMULL-based one, then one calling
// mulu32_64_vregs. They must be selected by a conditional - verify.
1985 EXPORT(muluadd_loop_down)
1986 DECLARE_FUNCTION(muluadd_loop_down)
1987 GLABEL(muluadd_loop_down)
// --- body using UMULL (64-bit product: v1 = low, ip = high) ---
1989 STMFD sp!,{v1,v5,lr}
1991 LABEL(muluadd_loop_down_l1)
1993 UMULL v1,ip,a1,ip // muluD(digit,*--sourceptr,hi=,lo=)
1994 ADDS v1,v1,v5 // lo += carry
1995 ADCCS ip,ip,#0 // if (lo<carry) { hi += 1 };
1996 LDR v5,[a3,#-4]! // carry = *--destptr (v5 reused for the old dest word)
1997 ADDS v1,v1,v5 // lo += carry
1998 ADC v5,ip,#0 // if (lo<carry) { hi += 1 }; carry=hi
1999 STR v1,[a3,#0] // *destptr = lo
2000 SUBS a4,a4,#1 // len--
2001 BNE muluadd_loop_down_l1 // until len==0
2002 MOV a1,v5 // return carry
2003 LDMFD sp!,{v1,v5,pc}^
// --- body using the mulu32_64_vregs helper (v1 = low, ip = high;
//     the helper destroys v2-v4, hence the larger register save) ---
2005 STMFD sp!,{v1-v5,lr}
2007 LABEL(muluadd_loop_down_l1)
2009 BL mulu32_64_vregs // muluD(digit,*--sourceptr,hi=,lo=)
2010 ADDS v1,v1,v5 // lo += carry
2011 ADCCS ip,ip,#0 // if (lo<carry) { hi += 1 };
2012 LDR v5,[a3,#-4]! // carry = *--destptr (v5 reused for the old dest word)
2013 ADDS v1,v1,v5 // lo += carry
2014 ADC v5,ip,#0 // if (lo<carry) { hi += 1 }; carry=hi
2015 STR v1,[a3,#0] // *destptr = lo
2016 SUBS a4,a4,#1 // len--
2017 BNE muluadd_loop_down_l1 // until len==0
2018 MOV a1,v5 // return carry
2019 LDMFD sp!,{v1-v5,pc}^
2022 // extern void mulusub_loop_down (uintD digit, uintD* sourceptr, uintD* destptr, uintC len);
// Multiply-subtract: for each word, destptr[] -= digit * sourceptr[]
// with carry/borrow propagation, working downwards in memory.
// a1 = digit, a3 = destptr (pre-decremented on load, stored back in place)
// a2 = sourceptr, per the prototype; v5 = running carry
2027 // a4 = count of words to be multiplied subtracted down
// NOTE: although the prototype above says void, the code returns the final
// carry in a1.
2029 // a1 - a4, ip destroyed
// NOTE(review): two alternative bodies follow, both defining the label
// mulusub_loop_down_l1 - first a UMULL-based one, then one calling
// mulu32_64_vregs. They must be selected by a conditional - verify.
2030 EXPORT(mulusub_loop_down)
2031 DECLARE_FUNCTION(mulusub_loop_down)
2032 GLABEL(mulusub_loop_down)
// --- body using UMULL (64-bit product: v1 = low, ip = high) ---
2034 STMFD sp!,{v1,v5,lr}
2036 LABEL(mulusub_loop_down_l1)
2038 UMULL v1,ip,a1,ip // muluD(digit,*--sourceptr,hi=,lo=)
2039 ADDS v1,v1,v5 // lo += carry
2040 ADC v5,ip,#0 // if (lo<carry) { hi += 1 };
2041 LDR ip,[a3,#-4]! // carry = *--destptr (ip reused for the old dest word)
2043 STR ip,[a3,#0] // *destptr = carry - lo
2044 ADDCC v5,v5,#1 // if (carry<lo) { hi += 1 }; carry=hi
2045 SUBS a4,a4,#1 // len--
2046 BNE mulusub_loop_down_l1 // until len==0
2047 MOV a1,v5 // return carry
2048 LDMFD sp!,{v1,v5,pc}^
// --- body using the mulu32_64_vregs helper (v1 = low, ip = high;
//     the helper destroys v2-v4, hence the larger register save) ---
2050 STMFD sp!,{v1-v5,lr}
2052 LABEL(mulusub_loop_down_l1)
2054 BL mulu32_64_vregs // muluD(digit,*--sourceptr,hi=,lo=)
2055 ADDS v1,v1,v5 // lo += carry
2056 ADC v5,ip,#0 // if (lo<carry) { hi += 1 };
2057 LDR ip,[a3,#-4]! // carry = *--destptr (ip reused for the old dest word)
2059 STR ip,[a3,#0] // *destptr = carry - lo
2060 ADDCC v5,v5,#1 // if (carry<lo) { hi += 1 }; carry=hi
2061 SUBS a4,a4,#1 // len--
2062 BNE mulusub_loop_down_l1 // until len==0
2063 MOV a1,v5 // return carry
2064 LDMFD sp!,{v1-v5,pc}^
2069 #if !CL_DS_BIG_ENDIAN_P
2071 // extern void or_loop_down (uintD* xptr, uintD* yptr, uintC count);
// Bitwise OR of two word arrays, working downwards in memory.
// a1 = xptr (target: read, then written back), a2 = yptr (read only)
2075 // a3 = count of words to be ORed
2077 // xptr |= yptr for count words
2078 // a1 - a4, ip destroyed
2079 EXPORT(or_loop_down) // word aligned or loop down
2080 DECLARE_FUNCTION(or_loop_down)
2081 GLABEL(or_loop_down)
2082 ANDS a4,a3,#3 // multiple of 4 words ?
2083 BEQ or_loop_down_l1 // yup, so branch
// The flags from this CMP (count mod 4 against 2) survive the loads below
// and decide how many of the 1-3 leading words are processed.
2084 CMP a4,#2 // OR the first 1-3 words
2085 LDR a4,[a2,#-4]! // to align the total to a multiple
2086 LDR ip,[a1,#-4]! // of 4 words
2089 BLT or_loop_down_l1 // better to branch than skip instrs.
2098 LABEL(or_loop_down_l1)
2099 BICS a4,a3,#3 // set counter to multiple of 4
2100 MOVEQS pc,lr // if zero then we're done
2101 STMFD sp!,{v1-v5,lr} // save work regs
// Main loop: four words per iteration. a3 is free for data because the
// counter now lives in a4; a3 (from yptr) pairs with v3 (from xptr).
2102 LABEL(or_loop_down_l2)
2103 LDMDB a2!,{a3,v1,v2,ip} // load 4 words in one go
2104 LDMDB a1,{v3,v4,v5,lr} // load target words (no writeback yet)
2105 ORR v3,v3,a3 // OR the four words
2109 STMDB a1!,{v3,v4,v5,lr} // store 4 results
2110 SUBS a4,a4,#4 // decrement counter by 4
2111 BGT or_loop_down_l2 // if count still positive then loop
2112 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
2114 // extern void xor_loop_down (uintD* xptr, uintD* yptr, uintC count);
// Bitwise XOR of two word arrays, working downwards in memory.
// a1 = xptr (target: read, then written back), a2 = yptr (read only)
2118 // a3 = count of words to be XORed
2120 // xptr ^= yptr for count words
2121 // a1 - a4, ip destroyed
2122 EXPORT(xor_loop_down) // word aligned xor loop down
2123 DECLARE_FUNCTION(xor_loop_down)
2124 GLABEL(xor_loop_down)
2125 ANDS a4,a3,#3 // multiple of 4 words ?
2126 BEQ xor_loop_down_l1 // yup, so branch
// The flags from this CMP (count mod 4 against 2) survive the loads below
// and decide how many of the 1-3 leading words are processed.
2127 CMP a4,#2 // XOR the first 1-3 words
2128 LDR a4,[a2,#-4]! // to align the total to a multiple
2129 LDR ip,[a1,#-4]! // of 4 words
2132 BLT xor_loop_down_l1 // better to branch than skip instrs.
2141 LABEL(xor_loop_down_l1)
2142 BICS a4,a3,#3 // set counter to multiple of 4
2143 MOVEQS pc,lr // if zero then we're done
2144 STMFD sp!,{v1-v5,lr} // save work regs
// Main loop: four words per iteration. a3 is free for data because the
// counter now lives in a4; a3 (from yptr) pairs with v3 (from xptr).
2145 LABEL(xor_loop_down_l2)
2146 LDMDB a2!,{a3,v1,v2,ip} // load 4 words in one go
2147 LDMDB a1,{v3,v4,v5,lr} // load target words (no writeback yet)
2148 EOR v3,v3,a3 // XOR the four words
2152 STMDB a1!,{v3,v4,v5,lr} // store 4 results
2153 SUBS a4,a4,#4 // decrement counter by 4
2154 BGT xor_loop_down_l2 // if count still positive then loop
2155 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
2157 // extern void and_loop_down (uintD* xptr, uintD* yptr, uintC count);
// Bitwise AND of two word arrays, working downwards in memory.
// a1 = xptr (target: read, then written back), a2 = yptr (read only)
2161 // a3 = count of words to be ANDed
2163 // xptr &= yptr for count words
2164 // a1 - a4, ip destroyed
2165 EXPORT(and_loop_down) // word aligned and loop down
2166 DECLARE_FUNCTION(and_loop_down)
2167 GLABEL(and_loop_down)
2168 ANDS a4,a3,#3 // multiple of 4 words ?
2169 BEQ and_loop_down_l1 // yup, so branch
// The flags from this CMP (count mod 4 against 2) survive the loads below
// and decide how many of the 1-3 leading words are processed.
2170 CMP a4,#2 // AND the first 1-3 words
2171 LDR a4,[a2,#-4]! // to align the total to a multiple
2172 LDR ip,[a1,#-4]! // of 4 words
2175 BLT and_loop_down_l1 // better to branch than skip instrs.
2184 LABEL(and_loop_down_l1)
2185 BICS a4,a3,#3 // set counter to multiple of 4
2186 MOVEQS pc,lr // if zero then we're done
2187 STMFD sp!,{v1-v5,lr} // save work regs
// Main loop: four words per iteration. a3 is free for data because the
// counter now lives in a4; a3 (from yptr) pairs with v3 (from xptr).
2188 LABEL(and_loop_down_l2)
2189 LDMDB a2!,{a3,v1,v2,ip} // load 4 words in one go
2190 LDMDB a1,{v3,v4,v5,lr} // load target words (no writeback yet)
2191 AND v3,v3,a3 // AND the four words
2195 STMDB a1!,{v3,v4,v5,lr} // store 4 results
2196 SUBS a4,a4,#4 // decrement counter by 4
2197 BGT and_loop_down_l2 // if count still positive then loop
2198 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
2200 // extern void eqv_loop_down (uintD* xptr, uintD* yptr, uintC count);
// Bitwise equivalence (inverted XOR) of two word arrays, working downwards
// in memory.
// a1 = xptr (target: read, then written back), a2 = yptr (read only)
2204 // a3 = count of words to be EQVed
2206 // xptr = ~(xptr ^ yptr) for count words
2207 // a1 - a4, ip destroyed
2208 EXPORT(eqv_loop_down) // word aligned eqv loop down
2209 DECLARE_FUNCTION(eqv_loop_down)
2210 GLABEL(eqv_loop_down)
2211 ANDS a4,a3,#3 // multiple of 4 words ?
2212 BEQ eqv_loop_down_l1 // yup, so branch
// The flags from this CMP (count mod 4 against 2) survive the loads below
// and decide how many of the 1-3 leading words are processed.
2213 CMP a4,#2 // EQV the first 1-3 words
2214 LDR a4,[a2,#-4]! // to align the total to a multiple
2215 LDR ip,[a1,#-4]! // of 4 words
2219 BLT eqv_loop_down_l1 // better to branch than skip instrs.
2225 BLE eqv_loop_down_l1 // better to branch than skip instrs.
2231 LABEL(eqv_loop_down_l1)
2232 BICS a4,a3,#3 // set counter to multiple of 4
2233 MOVEQS pc,lr // if zero then we're done
2234 STMFD sp!,{v1-v5,lr} // save work regs
// Main loop: four words per iteration. a3 is free for data because the
// counter now lives in a4; a3 (from yptr) pairs with v3 (from xptr).
2235 LABEL(eqv_loop_down_l2)
2236 LDMDB a2!,{a3,v1,v2,ip} // load 4 words in one go
2237 LDMDB a1,{v3,v4,v5,lr} // load target words (no writeback yet)
2238 EOR v3,v3,a3 // EQV the four words (XOR step)
2246 STMDB a1!,{v3,v4,v5,lr} // store 4 results
2247 SUBS a4,a4,#4 // decrement counter by 4
2248 BGT eqv_loop_down_l2 // if count still positive then loop
2249 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
2251 // extern void nand_loop_down (uintD* xptr, uintD* yptr, uintC count);
// Bitwise NAND of two word arrays, working downwards in memory.
// a1 = xptr (target: read, then written back), a2 = yptr (read only)
2255 // a3 = count of words to be NANDed
2257 // xptr = ~(xptr & yptr) for count words
2258 // a1 - a4, ip destroyed
2259 EXPORT(nand_loop_down) // word aligned nand loop down
2260 DECLARE_FUNCTION(nand_loop_down)
2261 GLABEL(nand_loop_down)
2262 ANDS a4,a3,#3 // multiple of 4 words ?
2263 BEQ nand_loop_down_l1 // yup, so branch
// The flags from this CMP (count mod 4 against 2) survive the loads below
// and decide how many of the 1-3 leading words are processed.
2264 CMP a4,#2 // NAND the first 1-3 words
2265 LDR a4,[a2,#-4]! // to align the total to a multiple
2266 LDR ip,[a1,#-4]! // of 4 words
2270 BLT nand_loop_down_l1 // better to branch than skip instrs.
2276 BLE nand_loop_down_l1 // better to branch than skip instrs.
2282 LABEL(nand_loop_down_l1)
2283 BICS a4,a3,#3 // set counter to multiple of 4
2284 MOVEQS pc,lr // if zero then we're done
2285 STMFD sp!,{v1-v5,lr} // save work regs
// Main loop: four words per iteration. a3 is free for data because the
// counter now lives in a4; a3 (from yptr) pairs with v3 (from xptr).
2286 LABEL(nand_loop_down_l2)
2287 LDMDB a2!,{a3,v1,v2,ip} // load 4 words in one go
2288 LDMDB a1,{v3,v4,v5,lr} // load target words (no writeback yet)
2289 AND v3,v3,a3 // NAND the four words (AND step)
2297 STMDB a1!,{v3,v4,v5,lr} // store 4 results
2298 SUBS a4,a4,#4 // decrement counter by 4
2299 BGT nand_loop_down_l2 // if count still positive then loop
2300 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
2302 // extern void nor_loop_down (uintD* xptr, uintD* yptr, uintC count);
// Bitwise NOR of two word arrays, working downwards in memory.
// a1 = xptr (target: read, then written back), a2 = yptr (read only)
2306 // a3 = count of words to be NORed
2308 // xptr = ~(xptr | yptr) for count words
2309 // a1 - a4, ip destroyed
2310 EXPORT(nor_loop_down) // word aligned nor loop down
2311 DECLARE_FUNCTION(nor_loop_down)
2312 GLABEL(nor_loop_down)
2313 ANDS a4,a3,#3 // multiple of 4 words ?
2314 BEQ nor_loop_down_l1 // yup, so branch
// The flags from this CMP (count mod 4 against 2) survive the loads below
// and decide how many of the 1-3 leading words are processed.
2315 CMP a4,#2 // NOR the first 1-3 words
2316 LDR a4,[a2,#-4]! // to align the total to a multiple
2317 LDR ip,[a1,#-4]! // of 4 words
2321 BLT nor_loop_down_l1 // better to branch than skip instrs.
2327 BLE nor_loop_down_l1 // better to branch than skip instrs.
2333 LABEL(nor_loop_down_l1)
2334 BICS a4,a3,#3 // set counter to multiple of 4
2335 MOVEQS pc,lr // if zero then we're done
2336 STMFD sp!,{v1-v5,lr} // save work regs
// Main loop: four words per iteration. a3 is free for data because the
// counter now lives in a4; a3 (from yptr) pairs with v3 (from xptr).
2337 LABEL(nor_loop_down_l2)
2338 LDMDB a2!,{a3,v1,v2,ip} // load 4 words in one go
2339 LDMDB a1,{v3,v4,v5,lr} // load target words (no writeback yet)
2340 ORR v3,v3,a3 // NOR the four words (OR step)
2348 STMDB a1!,{v3,v4,v5,lr} // store 4 results
2349 SUBS a4,a4,#4 // decrement counter by 4
2350 BGT nor_loop_down_l2 // if count still positive then loop
2351 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
2353 // extern void andc2_loop_down (uintD* xptr, uintD* yptr, uintC count);
// Bitwise AND-with-complement (BIC) of two word arrays, working downwards
// in memory.
// a1 = xptr (target: read, then written back), a2 = yptr (read only)
2357 // a3 = count of words to be ANDC2ed
2359 // xptr = xptr & ~yptr for count words
2360 // a1 - a4, ip destroyed
2361 EXPORT(andc2_loop_down) // word aligned andc2 loop down
2362 DECLARE_FUNCTION(andc2_loop_down)
2363 GLABEL(andc2_loop_down)
2364 ANDS a4,a3,#3 // multiple of 4 words ?
2365 BEQ andc2_loop_down_l1 // yup, so branch
// The flags from this CMP (count mod 4 against 2) survive the loads below
// and decide how many of the 1-3 leading words are processed.
2366 CMP a4,#2 // ANDC2 the first 1-3 words
2367 LDR a4,[a2,#-4]! // to align the total to a multiple
2368 LDR ip,[a1,#-4]! // of 4 words
2371 BLT andc2_loop_down_l1 // better to branch than skip instrs.
2380 LABEL(andc2_loop_down_l1)
2381 BICS a4,a3,#3 // set counter to multiple of 4
2382 MOVEQS pc,lr // if zero then we're done
2383 STMFD sp!,{v1-v5,lr} // save work regs
// Main loop: four words per iteration. a3 is free for data because the
// counter now lives in a4; a3 (from yptr) pairs with v3 (from xptr).
2384 LABEL(andc2_loop_down_l2)
2385 LDMDB a2!,{a3,v1,v2,ip} // load 4 words in one go
2386 LDMDB a1,{v3,v4,v5,lr} // load target words (no writeback yet)
2387 BIC v3,v3,a3 // ANDC2 the four words
2391 STMDB a1!,{v3,v4,v5,lr} // store 4 results
2392 SUBS a4,a4,#4 // decrement counter by 4
2393 BGT andc2_loop_down_l2 // if count still positive then loop
2394 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
2396 // extern void orc2_loop_down (uintD* xptr, uintD* yptr, uintC count);
// Word-wise OR-with-complement of yptr[] into xptr[], working downwards
// in memory. Both pointers are pre-decremented before every access, so on
// entry they point just above the words to be processed.
// entry (APCS argument registers)
//      a1 = xptr (source and destination)
//      a2 = yptr
2400 // a3 = count of words to be ORC2ed
// exit
2402 // xptr = xptr | ~yptr for count words
2403 // a1 - a4, ip destroyed
2404 EXPORT(orc2_loop_down) // word aligned orc2 loop down
2405 DECLARE_FUNCTION(orc2_loop_down)
2406 GLABEL(orc2_loop_down)
2407 ANDS a4,a3,#3 // a4 = count mod 4; multiple of 4 words ?
2408 BEQ orc2_loop_down_l1 // yup, so branch
2409 CMP a4,#2 // ORC2 the first 1-3 words (LT: 1 odd word, EQ: 2, GT: 3)
2410 LDR a4,[a2,#-4]! // to align the total to a multiple
2411 LDR ip,[a1,#-4]! // of 4 words
2415 BLT orc2_loop_down_l1 // 1 odd word only; better to branch than skip instrs.
2421 BLE orc2_loop_down_l1 // 2 odd words only; better to branch than skip instrs.
2427 LABEL(orc2_loop_down_l1)
2428 BICS a4,a3,#3 // set counter to multiple of 4
2429 MOVEQS pc,lr // if zero then we're done
2430 STMFD sp!,{v1-v5,lr} // save work regs
2431 LABEL(orc2_loop_down_l2)
2432 LDMDB a2!,{a3,v1,v2,ip} // load 4 words in one go (a3 is free, counter is in a4)
2433 LDMDB a1,{v3,v4,v5,lr} // load target words, no writeback yet
2434 MVN a3,a3 // ORC2 the four words: complement the yptr word first
2442 STMDB a1!,{v3,v4,v5,lr} // store 4 results, stepping xptr down
2443 SUBS a4,a4,#4 // decrement counter by 4
2444 BGT orc2_loop_down_l2 // if count still positive then loop
2445 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
2447 // extern void not_loop_down (uintD* xptr, uintC count);
// In-place bitwise complement of a word array, working downwards in
// memory. xptr is pre-decremented before every access, so on entry it
// points just above the words to be processed.
// entry (APCS argument registers)
//      a1 = xptr
2450 // a2 = count of words to be NOTed
// exit
2452 // xptr = ~xptr for count words
2453 // a1 - a4, ip destroyed
2454 EXPORT(not_loop_down) // word aligned not loop down
2455 DECLARE_FUNCTION(not_loop_down)
2456 GLABEL(not_loop_down)
2457 ANDS a3,a2,#3 // a3 = count mod 4; multiple of 4 words ?
2458 BEQ not_loop_down_l1 // yup, so branch
2459 CMP a3,#2 // NOT the first 1-3 words (LT: 1 odd word, EQ: 2, GT: 3)
2460 LDR a3,[a1,#-4]! // to align the total to a multiple
2461 MVN a3,a3 // of 4 words
2463 BLT not_loop_down_l1 // 1 odd word only; better to branch than skip instrs.
2470 LABEL(not_loop_down_l1)
2471 BICS a4,a2,#3 // set counter to multiple of 4
2472 MOVEQS pc,lr // if zero then we're done
2473 STMFD sp!,{lr} // save work regs (only lr is needed as an extra register)
2474 LABEL(not_loop_down_l2)
2475 LDMDB a1,{a2,a3,ip,lr} // load 4 words in one go,NO writeback (a2 is free, counter is in a4)
2476 MVN a2,a2 // NOT the four words
2480 STMDB a1!,{a2,a3,ip,lr} // store 4 results, stepping xptr down
2481 SUBS a4,a4,#4 // decrement counter by 4
2482 BGT not_loop_down_l2 // if count still positive then loop
2483 LDMFD sp!,{pc}^ // restore work regs and return
2485 // extern void and_test_loop_down (uintD* xptr, uintD* yptr, uintC count);
// Tests whether any pair of corresponding words of xptr[] and yptr[] has
// a bit in common, working downwards in memory. Nothing is stored.
// NOTE(review): the prototype above says void, but a result is delivered
// in a1 (see exit) - confirm the declared return type.
// entry (APCS argument registers)
//      a1 = xptr
//      a2 = yptr
2489 // a3 = count of words to be AND_TESTed
// exit
2491 // a1 = TRUE if any words ANDed together are non-zero else FALSE
2492 // a2 - a4, ip destroyed
2493 EXPORT(and_test_loop_down) // word aligned and_test loop down
2494 DECLARE_FUNCTION(and_test_loop_down)
2495 GLABEL(and_test_loop_down)
2496 ANDS a4,a3,#3 // a4 = count mod 4; multiple of 4 words ?
2497 BEQ and_test_loop_down_l1 // yup, so branch
2499 LDR a4,[a2,#-4]! // AND_TEST the first 1-3 words
2500 LDR ip,[a1,#-4]! // to align the total to a multiple
2501 TST ip,a4 // of 4 words
2502 MOVNE a1,#1 // return TRUE if AND_TEST ok
2504 BCC and_test_loop_down_l1 // better to branch than skip instrs.
2512 BLE and_test_loop_down_l1 // better to branch than skip instrs.
2518 LABEL(and_test_loop_down_l1)
2519 BICS a4,a3,#3 // set counter to multiple of 4
2520 MOVEQ a1,#0 // return FALSE
2521 MOVEQS pc,lr // if zero then we're done
2522 STMFD sp!,{v1-v6,lr} // save work regs
2523 MOV v6,a1 // move xptr to v6, freeing a1 for the result
2524 MOV a1,#1 // set result to TRUE (used by the early exit in the loop)
2525 LABEL(and_test_loop_down_l2)
2526 LDMDB a2!,{a3,v1,v2,ip} // load 4 words in one go
2527 LDMDB v6!,{v3,v4,v5,lr} // load target words
2528 TST v3,a3 // AND_TEST the four words
2532 LDMNEFD sp!,{v1-v6,pc}^ // common bit found: restore regs and return a1
2533 SUBS a4,a4,#4 // decrement counter by 4
2534 BGT and_test_loop_down_l2 // if count still positive then loop
2536 LDMFD sp!,{v1-v6,pc}^ // restore work regs and return
2538 // extern void compare_loop_down (uintD* xptr, uintD* yptr, uintC count);
// Unsigned comparison of two word arrays, working downwards in memory and
// stopping at the first pair of words that differ. Nothing is stored.
// NOTE(review): the prototype above says void, but a result is delivered
// in a1 (see exit) - confirm the declared return type.
// entry (APCS argument registers)
//      a1 = xptr
//      a2 = yptr
2542 // a3 = count of words to be COMPAREd
// exit
2544 // a1 = +1 if first non-equal word in xptr[] and yptr[]
2545 // xptr[i] > yptr[i]
2546 // -1 if xptr[i] < yptr[i]
//       0 if xptr[] == yptr[]
2548 // a2 - a4, ip destroyed
2549 EXPORT(compare_loop_down) // word aligned compare loop down
2550 DECLARE_FUNCTION(compare_loop_down)
2551 GLABEL(compare_loop_down)
2552 ANDS a4,a3,#3 // a4 = count mod 4; multiple of 4 words ?
2553 BEQ compare_loop_down_l1 // yup, so branch
2554 LDR a4,[a2,#-4]! // COMPARE the first 1-3 words
2555 LDR ip,[a1,#-4]! // to align the total to a multiple
2556 CMP ip,a4 // of 4 words (ip = x word, a4 = y word)
2557 MVNLO a1,#0 // x < y -> -1
2558 MOVHI a1,#1 // x > y -> +1
2559 MOVNES pc,lr // and return result if not equal
2562 BLT compare_loop_down_l1 // need to branch 'cos PSR used
2571 BLE compare_loop_down_l1 // need to branch 'cos PSR used
2578 LABEL(compare_loop_down_l1)
2579 BICS a4,a3,#3 // set counter to multiple of 4
2580 MOVEQ a1,#0 // xptr[] == yptr[] -> 0
2581 MOVEQS pc,lr // if zero then we're done
2582 STMFD sp!,{v1-v6,lr} // save work regs
2583 MOV v6,a1 // move xptr to v6, freeing a1 for the result
2584 MOV a1,#1 // set result to +1
2585 LABEL(compare_loop_down_l2)
2586 LDMDB a2!,{a3,v1,v2,ip} // load 4 words in one go
2587 LDMDB v6!,{v3,v4,v5,lr} // load test words
2588 CMP lr,ip // COMPARE the four words, highest-addressed pair (lr, ip) first
2592 MVNLO a1,#0 // x < y -> -1 (a1 already holds +1)
2593 LDMNEFD sp!,{v1-v6,pc}^ // words differ: restore regs and return a1
2594 SUBS a4,a4,#4 // decrement counter by 4
2595 BGT compare_loop_down_l2 // if count still positive then loop
2597 LDMFD sp!,{v1-v6,pc}^ // restore work regs and return
2599 // extern uintD addto_loop_up (uintD* sourceptr, uintD* destptr, uintC count);
// Adds sourceptr[] into destptr[], working upwards in memory. Implemented
// by rearranging the arguments into the four-argument form and falling
// through into add_loop_up with destptr serving as both the second source
// and the destination.
// entry (APCS argument registers)
//      a1 = sourceptr
//      a2 = destptr
2603 // a3 = count of words to be added
// exit
2605 // destptr[] = sourceptr[] + destptr[]
2607 // a2 - a4, ip destroyed
2608 EXPORT(addto_loop_up) // word aligned addto loop up
2609 DECLARE_FUNCTION(addto_loop_up)
2610 GLABEL(addto_loop_up)
2611 MOV a4,a3 // set regs for a call: a4 = count
2612 MOV a3,a2 // to add_loop_up: a3 = destptr (a2 still holds it as source 2)
2613 // and drop into add_loop_up
2615 // extern uintD add_loop_up (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count);
// Multi-word addition with carry propagation, working upwards in memory.
// The carry is kept in the PSR C flag across the whole loop, so only
// instructions that leave C untouched are used for loop control.
// entry (APCS argument registers)
//      a1 = sourceptr1
//      a2 = sourceptr2
//      a3 = destptr
2620 // a4 = count of words to be added
// exit
2622 // destptr[] = sourceptr1[] + sourceptr2[]
//      a1 = carry out of the last addition (0 or 1)
2624 // a2 - a4, ip destroyed
2625 EXPORT(add_loop_up) // word aligned add loop up
2626 DECLARE_FUNCTION(add_loop_up)
2628 ANDS ip,a4,#3 // ip = count mod 4; multiple of 4 words ?
2629 BEQ add_loop_up_l1 // yup, so branch
2631 LDR v6,[a2],#4 // add the first 1-3 words
2632 LDR lr,[a1],#4 // to align the total to a multiple
2633 ADDS lr,lr,v6 // of 4 words (first add has no carry in)
2636 BEQ add_loop_up_l0 // need to branch 'cos PSR used
2642 BEQ add_loop_up_l0 // need to branch 'cos PSR used
2647 LABEL(add_loop_up_l0) // at least one add has happened
2648 BICS a4,a4,#3 // set counter to multiple of 4 (C is preserved)
2649 BNE add_loop_up_l3 // branch if more adds to do
2650 ADCEQ a1,a4,a4 // set result to Carry (a4 is 0, so a1 = C)
2651 LDMEQFD sp!,{v6,pc}^ // and return
2652 LABEL(add_loop_up_l1)
2653 BICS a4,a4,#3 // set counter to multiple of 4
2654 MOVEQ a1,#0 // no adds, so C = 0
2655 MOVEQS pc,lr // if zero then we're done
2656 CMN a4,#0 // clear carry bit (a4 + 0 cannot carry)
2658 LABEL(add_loop_up_l3)
2659 STMFD sp!,{v1-v5} // save work regs
2660 LABEL(add_loop_up_l2)
2661 LDMIA a2!,{v1,v2,v3,ip} // load 4 words in one go from sourceptr2
2662 LDMIA a1!,{v4,v5,v6,lr} // and 4 words from sourceptr1
2663 ADCS v4,v4,v1 // add the four words with carry
2667 STMIA a3!,{v4,v5,v6,lr} // store 4 results
2668 SUB a4,a4,#4 // decrement counter by 4, preserve C
2669 TEQ a4,#0 // are we done ? (TEQ with an immediate leaves C alone)
2670 BNE add_loop_up_l2 // if count non-zero then loop
2671 ADC a1,a4,a4 // set result to Carry (a4 is 0, so a1 = C)
2672 LDMFD sp!,{v1-v6,pc}^ // restore work regs and return
2674 // extern uintD inc_loop_up (uintD* ptr, uintC count);
// Increments a multi-word integer by one, working upwards in memory and
// stopping as soon as a word does not wrap around to zero.
// entry (APCS argument registers)
//      a1 = ptr
2677 // a2 = count of words to be INCed
// exit
2679 // a1 = 0 if any words are non-zero after increment else 1
2680 // stop incrementing when first word becomes non-zero
2681 // a2 - a4, ip destroyed
2682 EXPORT(inc_loop_up) // word aligned inc loop up
2683 DECLARE_FUNCTION(inc_loop_up)
2685 ANDS a3,a2,#1 // multiple of 2 words ?
2686 BEQ inc_loop_up_l1 // yup, so branch
2687 LDR a4,[a1] // INC the first word
2688 ADDS a4,a4,#1 // align the total to a multiple of 2
2690 MOVNE a1,#0 // set result to 0
2691 MOVNES pc,lr // return 0 if non-zero result
2692 LABEL(inc_loop_up_l1)
2693 BICS a4,a2,#1 // set counter to multiple of 2
2694 MOVEQ a1,#1 // return 1
2695 MOVEQS pc,lr // if zero then we're done
2696 MOV ip,a1 // move ptr to ip, freeing a1 for the result
2697 MOV a1,#0 // set result to 0
2700 LDMIA ip,{a2,a3} // load 2 words in one go
2701 ADDS a2,a2,#1 // INC the two words
2702 ADDEQS a3,a3,#1 // stopping when first word non-zero
2703 STMIA ip!,{a2,a3} // store 2 results
2704 MOVNES pc,lr // return 0 if any result non-zero
2705 SUBS a4,a4,#2 // decrement counter by 2
2706 MOVEQ a1,#1 // if finished loop then
2707 MOVEQS pc,lr // return 1
2708 LABEL(inc_loop_up_l3) // now a multiple of 4 words
2709 STMFD sp!,{v1,lr} // save work regs
2710 LABEL(inc_loop_up_l2)
2711 LDMIA ip,{a2,a3,v1,lr} // load 4 words in one go
2712 ADDS a2,a2,#1 // INC the four words
2713 ADDEQS a3,a3,#1 // stopping when first word non-zero
2716 STMIA ip!,{a2,a3,v1,lr} // store 4 results
2717 LDMNEFD sp!,{v1,pc}^ // return 0 if any result non-zero
2718 SUBS a4,a4,#4 // decrement counter by 4
2719 BGT inc_loop_up_l2 // if count still positive then loop
2721 LDMFD sp!,{v1,pc}^ // restore work regs and return 1
2723 // extern uintD sub_loop_up (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count);
// Multi-word subtraction with borrow propagation, working upwards in
// memory. The borrow is kept in the PSR C flag (C set = no borrow) across
// the whole loop, so only instructions that leave C untouched are used
// for loop control.
// entry (APCS argument registers)
//      a1 = sourceptr1
//      a2 = sourceptr2
//      a3 = destptr
2728 // a4 = count of words to be subtracted
// exit
2730 // destptr[] = sourceptr1[] - sourceptr2[]
//      a1 = 0 if there was no final borrow, -1 (all ones) otherwise
2732 // a2 - a4, ip destroyed
2733 EXPORT(sub_loop_up) // word aligned sub loop up
2734 DECLARE_FUNCTION(sub_loop_up)
2736 ANDS ip,a4,#3 // ip = count mod 4; multiple of 4 words ?
2737 BEQ sub_loop_up_l1 // yup, so branch
2739 LDR v6,[a2],#4 // subtract the first 1-3 words
2740 LDR lr,[a1],#4 // to align the total to a multiple
2741 SUBS lr,lr,v6 // of 4 words (sourceptr1 word - sourceptr2 word)
2744 BNE sub_loop_up_l0 // branch if more than one subtract
2745 LABEL(sub_loop_up_l4) // drop through for better instr. timings
2746 BICS a4,a4,#3 // set counter to multiple of 4 (C is preserved)
2747 SBCEQ a1,a4,a4 // set result to Carry (a4 is 0): 0 or -1
2748 LDMEQFD sp!,{v6,pc}^ // and return
2749 STMFD sp!,{v1-v5} // save work regs
2750 B sub_loop_up_l2 // branch if more subtracts to do
2751 LABEL(sub_loop_up_l0)
2757 BEQ sub_loop_up_l4 // need to branch 'cos PSR used
2763 LABEL(sub_loop_up_l1)
2764 BICS a4,a4,#3 // set counter to multiple of 4
2765 MOVEQ a1,#0 // no subtracts, so C = 0
2766 MOVEQS pc,lr // if zero then we're done
2767 CMP a4,#0 // set carry bit (= no borrow), since a4 > 0
2768 STMFD sp!,{v1-v6,lr} // save work regs
2769 LABEL(sub_loop_up_l2)
2770 LDMIA a2!,{v1,v2,v3,ip} // load 4 words in one go from sourceptr2
2771 LDMIA a1!,{v4,v5,v6,lr} // and 4 words from sourceptr1
2772 SBCS v4,v4,v1 // subtract the four words with carry
2776 STMIA a3!,{v4,v5,v6,lr} // store 4 results
2777 SUB a4,a4,#4 // decrement counter by 4, preserve C
2778 TEQ a4,#0 // are we done ? (TEQ with an immediate leaves C alone)
2779 BNE sub_loop_up_l2 // if count non-zero then loop
2780 SBC a1,a4,a4 // set result to Carry (a4 is 0): 0 or -1
2781 LDMFD sp!,{v1-v6,pc}^ // restore work regs and return
2783 // extern uintD subx_loop_up (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count, uintD carry);
// Multi-word subtraction with an incoming borrow, working upwards in
// memory. The borrow is kept in the PSR C flag (C set = no borrow) across
// the whole loop, so only instructions that leave C untouched are used
// for loop control.
// entry (APCS argument registers, fifth argument on the stack)
//      a1 = sourceptr1
//      a2 = sourceptr2
//      a3 = destptr
2788 // a4 = count of words to be subtracted
//      [sp] = incoming carry (zero = no borrow, non-zero = borrow)
// exit
2791 // destptr[] = sourceptr1[] - sourceptr2[]
//      a1 = 0 if there was no final borrow, -1 (all ones) otherwise
2793 // a2 - a4, ip destroyed
2794 EXPORT(subx_loop_up) // word aligned xsub loop up
2795 DECLARE_FUNCTION(subx_loop_up)
2796 GLABEL(subx_loop_up)
2797 LDR ip,[sp] // get starting value of carry
2798 LABEL(subx_loop_up_lsub)
2799 RSBS ip,ip,#0 // set carry in PSR: C set only when ip was 0
2800 ANDS ip,a4,#3 // ip = count mod 4; multiple of 4 words ? (C is preserved)
2801 BEQ subx_loop_up_l1 // yup, so branch
2803 LDR v6,[a2],#4 // subtract the first 1-3 words
2804 LDR lr,[a1],#4 // to align the total to a multiple
2805 SBCS lr,lr,v6 // of 4 words (sourceptr1 word - sourceptr2 word - borrow)
2808 BNE subx_loop_up_l0 // branch if more than one subtract
2809 LABEL(subx_loop_up_l4) // drop through for better instr. timings
2810 BICS a4,a4,#3 // set counter to multiple of 4 (C is preserved)
2811 SBCEQ a1,a4,a4 // set result to Carry (a4 is 0): 0 or -1
2812 LDMEQFD sp!,{v6,pc}^ // and return
2813 STMFD sp!,{v1-v5} // save work regs
2814 B subx_loop_up_l2 // branch if more subtracts to do
2815 LABEL(subx_loop_up_l0)
2821 BEQ subx_loop_up_l4 // need to branch 'cos PSR used
2827 LABEL(subx_loop_up_l1)
2828 BICS a4,a4,#3 // set counter to multiple of 4 (C is preserved)
2829 SBCEQ a1,a4,a4 // set result to Carry (a4 is 0): 0 or -1
2830 MOVEQS pc,lr // if zero then we're done
2831 STMFD sp!,{v1-v6,lr} // save work regs
2832 LABEL(subx_loop_up_l2)
2833 LDMIA a2!,{v1,v2,v3,ip} // load 4 words in one go from sourceptr2
2834 LDMIA a1!,{v4,v5,v6,lr} // and 4 words from sourceptr1
2835 SBCS v4,v4,v1 // subtract the four words with carry
2839 STMIA a3!,{v4,v5,v6,lr} // store 4 results
2840 SUB a4,a4,#4 // decrement counter by 4, preserve C
2841 TEQ a4,#0 // are we done ? (TEQ with an immediate leaves C alone)
2842 BNE subx_loop_up_l2 // if count non-zero then loop
2843 SBC a1,a4,a4 // set result to Carry (a4 is 0): 0 or -1
2844 LDMFD sp!,{v1-v6,pc}^ // restore work regs and return
2846 // extern uintD subfrom_loop_up (uintD* sourceptr, uintD* destptr, uintC count);
// Subtracts sourceptr[] from destptr[] in place, working upwards in
// memory. The borrow is kept in the PSR C flag (C set = no borrow) across
// the whole loop, so only instructions that leave C untouched are used
// for loop control.
// entry (APCS argument registers)
//      a1 = sourceptr
//      a2 = destptr
2850 // a3 = count of words to be subtracted
// exit
2852 // destptr[] = destptr[] - sourceptr[]
//      a1 = 0 if there was no final borrow, -1 (all ones) otherwise
2854 // a2 - a4, ip destroyed
2855 EXPORT(subfrom_loop_up) // word aligned subfrom loop up
2856 DECLARE_FUNCTION(subfrom_loop_up)
2857 GLABEL(subfrom_loop_up)
2858 ANDS ip,a3,#3 // ip = count mod 4; multiple of 4 words ?
2859 BEQ subfrom_loop_up_l1 // yup, so branch
2861 LDR a4,[a1],#4 // subtract the first 1-3 words
2862 LDR lr,[a2] // to align the total to a multiple
2863 SUBS lr,lr,a4 // of 4 words (destptr word - sourceptr word)
2866 BNE subfrom_loop_up_l0 // branch if more than one subtract
2867 LABEL(subfrom_loop_up_l4) // drop through for better instr. timings
2868 BICS a4,a3,#3 // set counter to multiple of 4 (C is preserved)
2869 SBCEQ a1,a4,a4 // set result to Carry (a4 is 0): 0 or -1
2870 LDMEQFD sp!,{pc}^ // and return
2871 STMFD sp!,{v1-v5} // save work regs
2872 B subfrom_loop_up_l2 // branch if more subtracts to do
2873 LABEL(subfrom_loop_up_l0)
2879 BEQ subfrom_loop_up_l4 // need to branch 'cos PSR used
2884 B subfrom_loop_up_l4
2885 LABEL(subfrom_loop_up_l1)
2886 BICS a4,a3,#3 // set counter to multiple of 4
2887 MOVEQ a1,#0 // no subtracts, so C = 0
2888 MOVEQS pc,lr // if zero then we're done
2889 CMP a4,#0 // set carry bit (= no borrow), since a4 > 0
2890 STMFD sp!,{v1-v5,lr} // save work regs
2891 LABEL(subfrom_loop_up_l2)
2892 LDMIA a1!,{a3,v1,v2,ip} // load 4 words in one go from sourceptr (a3 is free, counter is in a4)
2893 LDMIA a2,{v3,v4,v5,lr} // and from destptr, no writeback yet
2894 SBCS v3,v3,a3 // subtract the four words with carry
2898 STMIA a2!,{v3,v4,v5,lr} // store 4 results, stepping destptr up
2899 SUB a4,a4,#4 // decrement counter by 4, preserve C
2900 TEQ a4,#0 // are we done ? (TEQ with an immediate leaves C alone)
2901 BNE subfrom_loop_up_l2 // if count non-zero then loop
2902 SBC a1,a4,a4 // set result to Carry (a4 is 0): 0 or -1
2903 LDMFD sp!,{v1-v5,pc}^ // restore work regs and return
2905 // extern uintD dec_loop_up (uintD* ptr, uintC count);
// Decrements a multi-word integer by one, working upwards in memory and
// stopping as soon as a word does not borrow (i.e. was non-zero).
// After SUBS x,x,#1 the C flag is set when there was no borrow.
// entry (APCS argument registers)
//      a1 = ptr
2908 // a2 = count of words to be DECed
// exit
2910 // a1 = 0 if any words are non-zero before decrement else -1
2911 // stop decrementing when first word is non-zero
2912 // a2 - a4, ip destroyed
2913 EXPORT(dec_loop_up) // word aligned dec loop up
2914 DECLARE_FUNCTION(dec_loop_up)
2916 ANDS a3,a2,#1 // multiple of 2 words ?
2917 BEQ dec_loop_up_l1 // yup, so branch
2918 LDR a4,[a1] // DEC the first word
2919 SUBS a4,a4,#1 // align the total to a multiple of 2
2921 MOVCS a1,#0 // set result to 0
2922 MOVCSS pc,lr // return 0 if no borrow (word was non-zero)
2923 LABEL(dec_loop_up_l1)
2924 BICS a4,a2,#1 // set counter to multiple of 2
2925 MVNEQ a1,#0 // return -1
2926 MOVEQS pc,lr // if zero then we're done
2927 MOV ip,a1 // move ptr to ip, freeing a1 for the result
2928 MOV a1,#0 // set result to 0
2931 LDMIA ip,{a2,a3} // load 2 words in one go
2932 SUBS a2,a2,#1 // DEC the two words
2933 SUBCCS a3,a3,#1 // stopping when first word non-zero
2934 STMIA ip!,{a2,a3} // store 2 results
2935 MOVCSS pc,lr // return 0 if no borrow remains (a word was non-zero)
2936 SUBS a4,a4,#2 // decrement counter by 2
2937 MVNEQ a1,#0 // if finished loop then
2938 MOVEQS pc,lr // return -1
2939 LABEL(dec_loop_up_l3) // now a multiple of 4 words
2940 STMFD sp!,{v1,lr} // save work regs
2941 LABEL(dec_loop_up_l2)
2942 LDMIA ip,{a2,a3,v1,lr} // load 4 words in one go
2943 SUBS a2,a2,#1 // DEC the four words
2944 SUBCCS a3,a3,#1 // stopping when first word non-zero
2947 STMIA ip!,{a2,a3,v1,lr} // store 4 results
2948 LDMCSFD sp!,{v1,pc}^ // return 0 if any carry (= no borrow left)
2949 SUBS a4,a4,#4 // decrement counter by 4
2950 BGT dec_loop_up_l2 // if count still positive then loop
2952 LDMFD sp!,{v1,pc}^ // restore work regs and return -1
2954 // extern void neg_loop_up (uintD* ptr, uintC count);
// Two's-complement negation of a multi-word integer, working upwards in
// memory: zero words are skipped, the first non-zero word is negated and
// every word above it is complemented.
// NOTE(review): the prototype above says void, but the code sets a1 to 0
// or -1 before returning - confirm the declared return type.
// entry (APCS argument registers)
//      a1 = ptr
2957 // a2 = count of words. The long integer is to be NEGated
// exit
2959 // ptr[] = -ptr[] for count words
//      a1 = 0 if every word was zero (or count was 0), -1 otherwise
2961 // a2 - a4, ip destroyed
2962 EXPORT(neg_loop_up) // word aligned neg loop up
2963 DECLARE_FUNCTION(neg_loop_up)
2965 CMPS a2,#0 // count = 0 ?
2966 MOVEQ a1,#0 // yup, so return 0
2968 LABEL(neg_loop_up_l1) // skip all the zero words first
2969 LDR a3,[a1],#4 // compare words against zero
2970 CMPS a3,#0 // upwards in memory
2971 BNE neg_loop_up_l2 // non-zero, so negate rest of words
2972 SUBS a2,a2,#1 // reduce count of words
2973 BNE neg_loop_up_l1 // more ?, so loop
2974 MOV a1,#0 // return 0
2976 LABEL(neg_loop_up_l2)
2977 RSB a3,a3,#0 // first non-zero word = -word
2980 MVNEQ a1,#0 // done ? -> return -1
2982 // now NOT rest of the words
2983 ANDS a3,a2,#3 // a3 = remaining count mod 4; multiple of 4 words ?
2984 BEQ neg_loop_up_l3 // yup, so branch
2985 CMP a3,#2 // NOT the first 1-3 words (LT: 1 odd word, EQ: 2, GT: 3)
2986 LDR a3,[a1] // to align the total to a multiple
2987 MVN a3,a3 // of 4 words
2989 BLT neg_loop_up_l3 // 1 odd word only; better to branch than skip instrs.
2996 LABEL(neg_loop_up_l3)
2997 BICS a4,a2,#3 // set counter to multiple of 4
2998 MVNEQ a1,#0 // set result to -1
2999 MOVEQS pc,lr // if zero then we're done
3000 STMFD sp!,{lr} // save work regs (only lr is needed as an extra register)
3001 LABEL(neg_loop_up_l4)
3002 LDMIA a1,{a2,a3,ip,lr} // load 4 words in one go,NO writeback (a2 is free, counter is in a4)
3003 MVN a2,a2 // NOT the four words
3007 STMIA a1!,{a2,a3,ip,lr} // store 4 results, stepping ptr up
3008 SUBS a4,a4,#4 // decrement counter by 4
3009 BGT neg_loop_up_l4 // if count still positive then loop
3010 MVN a1,#0 // set result to -1
3011 LDMFD sp!,{pc}^ // restore work regs and return -1
3013 // extern uintD shift1left_loop_up (uintD* ptr, uintC count);
// Shifts a multi-word integer left by one bit in place, working upwards
// in memory. Each word is doubled with ADCS x,x,x, which shifts the
// previous carry into bit 0 and the old bit 31 out into C.
// entry (APCS argument registers)
//      a1 = ptr
3016 // a2 = count of words to be shifted left
// exit
3018 // a1 = carry out from last shift left
3019 // a2 - a4, ip destroyed
3020 EXPORT(shift1left_loop_up) // word aligned shift1left loop up
3021 DECLARE_FUNCTION(shift1left_loop_up)
3022 GLABEL(shift1left_loop_up)
3023 CMN a1,#0 // clear carry bit (a1 + 0 cannot carry)
3024 ANDS a3,a2,#1 // multiple of 2 words ? (C is preserved)
3025 BEQ shift1left_loop_up_l1 // yup, so branch
3026 LDR a4,[a1] // shift left the first word
3029 LABEL(shift1left_loop_up_l1)
3030 BICS a4,a2,#1 // set counter to multiple of 2 (C is preserved)
3031 ADCEQ a1,a4,a4 // if zero set result to C (a4 is 0)
3032 MOVEQS pc,lr // and return
3033 ANDS a3,a4,#3 // multiple of 4 words ?
3034 BEQ shift1left_loop_up_l3 // yup, so branch
3035 LDMIA a1,{a2,a3} // load 2 words in one go
3036 ADCS a2,a2,a2 // shift left the two words
3038 STMIA a1!,{a2,a3} // store 2 results
3039 BICS a4,a4,#2 // decrement counter by 2 (C is preserved)
3040 ADCEQ a1,a4,a4 // set result to Carry (a4 is 0)
3041 MOVEQS pc,lr // and return
3042 LABEL(shift1left_loop_up_l3) // now a multiple of 4 words
3043 STMFD sp!,{lr} // save work regs
3044 LABEL(shift1left_loop_up_l2)
3045 LDMIA a1,{a2,a3,ip,lr} // load 4 words in one go
3046 ADCS a2,a2,a2 // shift left the four words
3050 STMIA a1!,{a2,a3,ip,lr} // store 4 results
3051 SUB a4,a4,#4 // decrement counter by 4, preserve C
3052 TEQ a4,#0 // are we done ? (TEQ with an immediate leaves C alone)
3053 BNE shift1left_loop_up_l2 // if count non-zero then loop
3054 ADC a1,a4,a4 // set result to Carry (a4 is 0)
3055 LDMFD sp!,{pc}^ // restore work regs and return the carry
3057 // extern uintD shiftleft_loop_up (uintD* ptr, uintC count, uintC i, uintD carry);
// Shifts a multi-word integer left by i bits in place, working upwards in
// memory. For every word the low part (word << i) is ORed with the bits
// carried over from the previous word, and the bits shifted out of the
// top (word >> (32-i)) become the carry for the next word.
// entry (APCS argument registers)
//      a1 = ptr
3060 // a2 = count of words to be shifted left
3061 // a3 = size of left shift
3062 // a4 = value to ORR in for first shift
// exit
3064 // a1 = shift out from last shift left
3065 // a2 - a4, ip destroyed
3066 EXPORT(shiftleft_loop_up) // word aligned shiftleft loop up
3067 DECLARE_FUNCTION(shiftleft_loop_up)
3068 GLABEL(shiftleft_loop_up)
3070 RSB v6,a3,#32 // size of complementary right shift (32 - i)
3071 ANDS ip,a2,#3 // ip = count mod 4; multiple of 4 words ?
3072 BEQ shiftleft_loop_up_l1 // yup, so branch
3073 LDR lr,[a1] // shiftleft the first 1-3 words
3074 ORR a4,a4,lr,ASL a3 // to align the total to a multiple
3075 STR a4,[a1],#4 // of 4 words
3078 BLT shiftleft_loop_up_l1 // better to branch than skip instrs.
3080 ORRGE a4,a4,lr,ASL a3
3084 ORRGT a4,a4,lr,ASL a3
3087 LABEL(shiftleft_loop_up_l1)
3088 BICS ip,a2,#3 // set counter to multiple of 4
3089 MOVEQ a1,a4 // if zero then we're done
3090 LDMEQFD sp!,{v6,pc}^ // so return last shift out
3091 STMFD sp!,{v1-v3} // save work regs
3092 LABEL(shiftleft_loop_up_l2)
3093 LDMIA a1,{v1,v2,v3,lr} // load 4 words in one go, no writeback yet
3094 ORR a2,a4,v1,ASL a3 // shiftleft the four words
3095 MOV a4,v1,LSR v6 // keep carry in a4
3096 ORR v1,a4,v2,ASL a3 // and store results down a register
3097 MOV a4,v2,LSR v6 // to regs a2,v1-v3
3102 STMIA a1!,{a2,v1,v2,v3} // store 4 results, stepping ptr up
3103 SUBS ip,ip,#4 // decrement counter by 4
3104 BGT shiftleft_loop_up_l2 // if count still positive then loop
3105 MOV a1,a4 // result = last shift out
3106 LDMFD sp!,{v1-v3,v6,pc}^ // restore work regs and return
3110 // extern uintD shiftleftcopy_loop_up (uintD* sourceptr, uintD* destptr, uintC count, uintC i);
// Copies a multi-word integer from sourceptr to destptr while shifting it
// left by i bits, working upwards in memory. The carry into the first
// word is zero; v5 holds the bits carried from one word to the next.
// entry (APCS argument registers)
//      a1 = sourceptr
//      a2 = destptr
3114 // a3 = count of words to be shifted left
3115 // a4 = size of left shift
// exit
3117 // a1 = shift out from last shift left
3118 // a2 - a4, ip destroyed
3119 EXPORT(shiftleftcopy_loop_up) // word aligned shiftleftcopy loop up
3120 DECLARE_FUNCTION(shiftleftcopy_loop_up)
3121 GLABEL(shiftleftcopy_loop_up)
3122 STMFD sp!,{v5,v6,lr} // save work regs used by the whole routine
3123 MOV v5,#0 // initial shift carry
3124 RSB v6,a4,#32 // size of complementary right shift (32 - i)
3125 ANDS ip,a3,#3 // ip = count mod 4; multiple of 4 words ?
3126 BEQ shiftleftcopy_loop_up_l1 // yup, so branch
3127 LDR lr,[a1],#4 // shiftleft the first 1-3 words
3128 ORR v5,v5,lr,ASL a4 // to align the total to a multiple
3129 STR v5,[a2],#4 // of 4 words
3132 BLT shiftleftcopy_loop_up_l1 // better to branch than skip instrs.
3134 ORRGE v5,v5,lr,ASL a4
3138 ORRGT v5,v5,lr,ASL a4
3141 LABEL(shiftleftcopy_loop_up_l1)
3142 BICS ip,a3,#3 // set counter to multiple of 4
3143 MOVEQ a1,v5 // if zero then we're done
3144 LDMEQFD sp!,{v5,v6,pc}^ // so return last shift out
3145 STMFD sp!,{v1-v3} // save work regs
3146 LABEL(shiftleftcopy_loop_up_l2)
3147 LDMIA a1!,{v1,v2,v3,lr} // load 4 words in one go (a3 is free, counter is in ip)
3148 ORR a3,v5,v1,ASL a4 // shiftleft the four words
3149 MOV v5,v1,LSR v6 // keep carry in v5
3150 ORR v1,v5,v2,ASL a4 // and store results down a register
3151 MOV v5,v2,LSR v6 // to regs a3,v1-v3
3156 STMIA a2!,{a3,v1,v2,v3} // store 4 results
3157 SUBS ip,ip,#4 // decrement counter by 4
3158 BGT shiftleftcopy_loop_up_l2 // if count still positive then loop
3159 MOV a1,v5 // result = last shift out
3160 LDMFD sp!,{v1-v3,v5,v6,pc}^ // restore work regs and return
3162 #if !CL_DS_BIG_ENDIAN_P
3164 // extern uintD shift1right_loop_down (uintD* ptr, uintC count, uintD carry);
// Shifts a multi-word integer right by one bit in place, working
// downwards in memory. Each word is rotated right through the C flag
// (RRX), which shifts the previous carry into bit 31 and the old bit 0
// out into C.
// entry (APCS argument registers)
//      a1 = ptr (pre-decremented before each access)
3167 // a2 = count of words to be shifted right
//      a3 = carry to shift in; only bit 0 is examined
// exit
3170 // a1 = carry out from last shift right
//           (delivered in bit 31 on every return path, all other bits 0)
3171 // a2 - a4, ip destroyed
3172 EXPORT(shift1right_loop_down) // word aligned shift1right loop down
3173 DECLARE_FUNCTION(shift1right_loop_down)
3174 GLABEL(shift1right_loop_down)
3175 MOVS a3,a3,LSR #1 // set carry from bit 0 of the carry argument
3176 ANDS a3,a2,#1 // multiple of 2 words ? (C is preserved)
3177 BEQ shift1right_loop_down_l1 // yup, so branch
3178 LDR a4,[a1,#-4]! // shift right the first word
3181 LABEL(shift1right_loop_down_l1)
3182 BICS a4,a2,#1 // set counter to multiple of 2 (C is preserved)
3183 MOVEQ a1,a4,RRX // if zero set result to C (a4 is 0)
3184 MOVEQS pc,lr // and return
3185 ANDS a3,a4,#3 // multiple of 4 words ?
3186 BEQ shift1right_loop_down_l3 // yup, so branch
3187 LDMDB a1,{a2,a3} // load 2 words in one go
3188 MOVS a3,a3,RRX // shift right the two words
3190 STMDB a1!,{a2,a3} // store 2 results
3191 BICS a4,a4,#2 // decrement counter by 2 (C is preserved)
3192 MOVEQ a1,a4,RRX // set result to Carry in bit 31 (a4 is 0), same form as the other two exits
3193 MOVEQS pc,lr // and return
3194 LABEL(shift1right_loop_down_l3) // now a multiple of 4 words
3195 STMFD sp!,{lr} // save work regs
3196 LABEL(shift1right_loop_down_l2)
3197 LDMDB a1,{a2,a3,ip,lr} // load 4 words in one go
3198 MOVS lr,lr,RRX // shift right the four words, highest address first
3202 STMDB a1!,{a2,a3,ip,lr} // store 4 results
3203 SUB a4,a4,#4 // decrement counter by 4, preserve C
3204 TEQ a4,#0 // are we done ? (TEQ with an immediate leaves C alone)
3205 BNE shift1right_loop_down_l2 // if count non-zero then loop
3206 MOV a1,a4,RRX // set result to Carry (a4 is 0)
3207 LDMFD sp!,{pc}^ // restore work regs and return the carry
3209 // extern uintD shiftright_loop_down (uintD* ptr, uintC count, uintC i);
// Shifts a multi-word integer right by i bits in place, working downwards
// in memory. For every word the high part (word >> i) is ORed with the
// bits carried over from the previous word, and the bits shifted out of
// the bottom (word << (32-i)) become the carry for the next word.
// The label shiftright_loop_down_l0 is also the entry used by
// shiftrightsigned_loop_down, which arrives with a4 and v6 already set.
// entry (APCS argument registers)
//      a1 = ptr (pre-decremented before each access)
3212 // a2 = count of words to be shifted right
3213 // a3 = size of right shift
// exit
3215 // a1 = shift out from last shift right
3216 // a2 - a4, ip destroyed
3217 EXPORT(shiftright_loop_down) // word aligned shiftright loop down
3218 DECLARE_FUNCTION(shiftright_loop_down)
3219 GLABEL(shiftright_loop_down)
3221 MOV a4,#0 // initial shift carry
3222 RSB v6,a3,#32 // size of complementary left shift (32 - i)
3223 LABEL(shiftright_loop_down_l0)
3224 ANDS ip,a2,#3 // ip = count mod 4; multiple of 4 words ?
3225 BEQ shiftright_loop_down_l1 // yup, so branch
3226 LDR lr,[a1,#-4]! // shiftright the first 1-3 words
3227 ORR a4,a4,lr,LSR a3 // to align the total to a multiple
3228 STR a4,[a1] // of 4 words
3231 BLT shiftright_loop_down_l1 // better to branch than skip instrs.
3233 ORRGE a4,a4,lr,LSR a3
3237 ORRGT a4,a4,lr,LSR a3
3240 LABEL(shiftright_loop_down_l1)
3241 BICS ip,a2,#3 // set counter to multiple of 4
3242 MOVEQ a1,a4 // if zero then we're done
3243 LDMEQFD sp!,{v6,pc}^ // so return last shift out
3244 STMFD sp!,{v1-v3} // save work regs
3245 LABEL(shiftright_loop_down_l2)
3246 LDMDB a1,{a2,v1,v2,v3} // load 4 words in one go (a2 is free, counter is in ip)
3247 ORR lr,a4,v3,LSR a3 // shiftright the four words
3248 MOV a4,v3,ASL v6 // keep carry in a4
3249 ORR v3,a4,v2,LSR a3 // and store results up a register
3250 MOV a4,v2,ASL v6 // to regs v1-v3,lr
3255 STMDB a1!,{v1,v2,v3,lr} // store 4 results, stepping ptr down
3256 SUBS ip,ip,#4 // decrement counter by 4
3257 BGT shiftright_loop_down_l2 // if count still positive then loop
3258 MOV a1,a4 // result = last shift out
3259 LDMFD sp!,{v1-v3,v6,pc}^ // restore work regs and return
3261 // extern uintD shiftrightsigned_loop_down (uintD* ptr, uintC count, uintC i);
// Arithmetic (sign-preserving) right shift of a multi-word integer by
// i bits. Only the initial carry differs from shiftright_loop_down: it is
// built from the sign of the word at ptr[-1], after which control passes
// to the shared code at shiftright_loop_down_l0.
// entry (APCS argument registers)
//      a1 = ptr
3264 // a2 = count of words to be shifted right signed
3265 // a3 = size of right shift
// exit
3267 // a1 = shift out from last shift right
3268 // a2 - a4, ip destroyed
3269 EXPORT(shiftrightsigned_loop_down)// word aligned shiftrightsigned loop down
3270 DECLARE_FUNCTION(shiftrightsigned_loop_down)
3271 GLABEL(shiftrightsigned_loop_down)
3273 RSB v6,a3,#32 // size of complementary left shift (32 - i)
3274 LDR lr,[a1,#-4] // setup carry for first shift (no writeback: word is reloaded later)
3275 MOV a4,lr,ASR #31 // this is the sign extended bits (0 or all ones)
3276 AND a4,a4,a4,LSL v6 // 31->(32-i) of the first word
3277 B shiftright_loop_down_l0 // use right shift code now
3279 // extern uintD shiftrightcopy_loop_down (uintD* sourceptr, uintD* destptr, uintC count, uintC i, uintD carry);
// Copies a multi-word integer from sourceptr to destptr while shifting it
// right by i bits, working downwards in memory. v5 holds the bits carried
// from one word to the next.
// entry (APCS argument registers, fifth argument on the stack)
//      a1 = sourceptr (pre-decremented before each access)
//      a2 = destptr (pre-decremented before each access)
3283 // a3 = count of words to be shifted right
3284 // a4 = size of right shift
3285 // [sp] = carry for first shift
// exit
3287 // a1 = shift out from last shift right
3288 // a2 - a4, ip destroyed
3289 EXPORT(shiftrightcopy_loop_down)// word aligned shiftrightcopy loop down
3290 DECLARE_FUNCTION(shiftrightcopy_loop_down)
3291 GLABEL(shiftrightcopy_loop_down)
3292 STMFD sp!,{v5,v6,lr} // save work regs used by the whole routine
3293 LDR v5,[sp,#12] // initial shift carry (3 words were just pushed, so [sp] moved to [sp,#12])
3294 RSB v6,a4,#32 // size of complementary left shift (32 - i)
3296 LABEL(shiftrightcopy_loop_down_l0)
3297 ANDS ip,a3,#3 // ip = count mod 4; multiple of 4 words ?
3298 BEQ shiftrightcopy_loop_down_l1 // yup, so branch
3299 LDR lr,[a1,#-4]! // shiftright the first 1-3 words
3300 ORR v5,v5,lr,LSR a4 // to align the total to a multiple
3301 STR v5,[a2,#-4]! // of 4 words
3304 BLT shiftrightcopy_loop_down_l1 // better to branch than skip instrs.
3306 ORRGE v5,v5,lr,LSR a4
3310 ORRGT v5,v5,lr,LSR a4
3313 LABEL(shiftrightcopy_loop_down_l1)
3314 BICS ip,a3,#3 // set counter to multiple of 4
3315 MOVEQ a1,v5 // if zero then we're done
3316 LDMEQFD sp!,{v5,v6,pc}^ // so return last shift out
3317 STMFD sp!,{v1-v3} // save work regs
3318 LABEL(shiftrightcopy_loop_down_l2)
3319 LDMDB a1!,{a3,v1,v2,v3} // load 4 words in one go (a3 is free, counter is in ip)
3320 ORR lr,v5,v3,LSR a4 // shiftright the four words
3321 MOV v5,v3,ASL v6 // keep carry in v5
3322 ORR v3,v5,v2,LSR a4 // and store results up a register
3323 MOV v5,v2,ASL v6 // to regs v1-v3,lr
3328 STMDB a2!,{v1,v2,v3,lr} // store 4 results
3329 SUBS ip,ip,#4 // decrement counter by 4
3330 BGT shiftrightcopy_loop_down_l2 // if count still positive then loop
3331 MOV a1,v5 // result = last shift out
3332 LDMFD sp!,{v1-v3,v5,v6,pc}^ // restore work regs and return
// mulu32_64_vregs: unsigned 32x32 -> 64 bit multiply assembled from four
// 16x16 -> 32 bit MULs, using only ip and v1-v4 as work registers.
//      reads:  a1 = x (left unchanged), ip = y
//      writes: v1 = low 32 bits of x*y, ip = high 32 bits of x*y
3342 // v2,v3,v4 destroyed
3343 LABEL(mulu32_64_vregs)
3344 MOV v1,a1,LSR #16 // v1 := top half of x
3345 MOV v2,ip,LSR #16 // v2 := top half of y
3346 BIC v3,a1,v1,LSL #16 // v3 := bottom half of x
3347 BIC ip,ip,v2,LSL #16 // ip := bottom half of y
3348 MUL v4,v3,ip // low section of result (xlo * ylo)
3349 MUL ip,v1,ip // ) middle sections (xhi * ylo)
3350 MUL v3,v2,v3 // ) of result (yhi * xlo)
3351 MUL v2,v1,v2 // high section of result (xhi * yhi)
3352 ADDS ip,ip,v3 // add middle sections
3353 // (can't use mla as we need carry)
3354 ADDCS v2,v2,#0x10000 // carry from above add, worth 2^16 in the high word
3355 ADDS v1,v4,ip,LSL #16 // v1 is now bottom 32 bits of result
3356 ADC ip,v2,ip,LSR #16 // ip is top 32 bits
3360 // extern uintD mulusmall_loop_up (uintD digit, uintD* ptr, uintC len, uintD newdigit);
// Multiplies a multi-word integer in place by a small digit, working
// upwards in memory, adding newdigit into the lowest word and carrying
// the high part of each product into the next word.
// NOTE(review): the loop label mulusmall_loop_up_l1 is defined twice
// below; presumably the two bodies are alternatives chosen by a
// preprocessor conditional (UMULL on CPUs with a 32x32->64 multiply,
// 16-bit partial products otherwise) - confirm against the full file.
// entry (APCS argument registers)
//      a1 = digit
//      a2 = ptr
3364 // a3 = count of words to be multiplied up
3365 // a4 = new digit = carry
// exit
3367 // a1 = final carry of multiply
3368 // a2 - a4, ip destroyed
3369 EXPORT(mulusmall_loop_up)
3370 DECLARE_FUNCTION(mulusmall_loop_up)
3371 GLABEL(mulusmall_loop_up)
3377 LABEL(mulusmall_loop_up_l1)
3379 UMULL v1,ip,a1,ip // muluD(digit,*ptr,hi=,lo=): lo in v1, hi in ip
3380 ADDS v1,v1,a4 // lo += carry
3381 ADC a4,ip,#0 // if (lo<carry) { hi += 1 }; carry=hi
3382 STR v1,[a2],#4 // *ptr++ = lo
3383 SUBS a3,a3,#1 // len--
3384 BNE mulusmall_loop_up_l1 // until len==0
3385 MOV a1,a4 // return carry
3388 STMFD sp!,{v1-v2,lr}
3389 LABEL(mulusmall_loop_up_l1)
3392 // BL mulu32_64_vregs // muluD(digit,*ptr,hi=,lo=)
3393 // replaced by multiplication of a small x = a1 and a big y = ip :
3394 MOV v1,ip,LSR #16 // top half of y
3395 BIC ip,ip,v1,LSL #16 // bottom half of y
3396 MUL v2,a1,v1 // middle section of result
3397 MUL v1,a1,ip // low section of result
3398 MOV ip,#0 // high section of result
3399 ADDS v1,v1,v2,LSL #16 // bottom 32 bits of result
3400 ADC ip,ip,v2,LSR #16 // top 32 bits of result
3402 ADDS v1,v1,a4 // lo += carry
3403 ADC a4,ip,#0 // if (lo<carry) { hi += 1 }; carry=hi
3404 STR v1,[a2],#4 // *ptr++ = lo
3405 SUBS a3,a3,#1 // len--
3406 BNE mulusmall_loop_up_l1 // until len==0
3407 MOV a1,a4 // return carry
3408 LDMFD sp!,{v1-v2,pc}^
3411 // extern void mulu_loop_up (uintD digit, uintD* sourceptr, uintD* destptr, uintC len);
// Multiplies a multi-word integer by a single digit, working upwards in
// memory, writing len product words followed by one final carry word to
// destptr. v5 carries the high part of each product into the next word.
// NOTE(review): the loop label mulu_loop_up_l1 is defined twice below;
// presumably the two bodies are alternatives chosen by a preprocessor
// conditional (UMULL versus a call to mulu32_64_vregs) - confirm against
// the full file.
// entry (APCS argument registers)
//      a1 = digit
//      a2 = sourceptr
//      a3 = destptr
3416 // a4 = count of words to be multiplied up
// exit
3418 // a1 - a4, ip destroyed
3419 EXPORT(mulu_loop_up)
3420 DECLARE_FUNCTION(mulu_loop_up)
3421 GLABEL(mulu_loop_up)
3423 STMFD sp!,{v1,v5,lr}
3425 LABEL(mulu_loop_up_l1)
3427 UMULL v1,ip,a1,ip // muluD(digit,*sourceptr++,hi=,lo=): lo in v1, hi in ip
3428 ADDS v1,v1,v5 // lo += carry
3429 ADC v5,ip,#0 // if (lo<carry) { hi += 1 }; carry=hi
3430 STR v1,[a3],#4 // *destptr++ = lo
3431 SUBS a4,a4,#1 // len--
3432 BNE mulu_loop_up_l1 // until len==0
3433 STR v5,[a3],#4 // *destptr++ = carry
3434 LDMFD sp!,{v1,v5,pc}^
3436 STMFD sp!,{v1-v5,lr}
3438 LABEL(mulu_loop_up_l1)
3440 BL mulu32_64_vregs // muluD(digit,*sourceptr++,hi=,lo=): lo in v1, hi in ip
3441 ADDS v1,v1,v5 // lo += carry
3442 ADC v5,ip,#0 // if (lo<carry) { hi += 1 }; carry=hi
3443 STR v1,[a3],#4 // *destptr++ = lo
3444 SUBS a4,a4,#1 // len--
3445 BNE mulu_loop_up_l1 // until len==0
3446 STR v5,[a3],#4 // *destptr++ = carry
3447 LDMFD sp!,{v1-v5,pc}^
3450 // extern void muluadd_loop_up (uintD digit, uintD* sourceptr, uintD* destptr, uintC len);
3455 // a4 = count of words to be multiplied added up
3457 // a1 - a4, ip destroyed
// Summary: multiply-accumulate. For each of the a4 words the product of
// digit (a1) and a source word is added to the running carry (v5) and to
// the word already at destptr (a3); the low word is written back through a3
// with post-increment and the high word becomes the next carry.
// NOTE(review): the code places the final carry in a1 before returning, so
// the C prototype presumably returns uintD rather than void -- confirm
// against the declaring header.
// NOTE(review): two bodies defining the same label muluadd_loop_up_l1
// follow; presumably a preprocessor conditional not visible in this listing
// picks one of them -- confirm against the full file.
3458 EXPORT(muluadd_loop_up)
3459 DECLARE_FUNCTION(muluadd_loop_up)
3460 GLABEL(muluadd_loop_up)
// --- Variant 1: single UMULL per word ---
3462 STMFD sp!,{v1,v5,lr} // save work registers v1,v5 and the return address
// NOTE(review): as listed here, v5 (carry) is not initialised and the source
// word is not loaded into ip before the multiply -- verify in the full source.
3464 LABEL(muluadd_loop_up_l1)
3466 UMULL v1,ip,a1,ip // muluD(digit,*sourceptr++,hi=,lo=): v1 = lo, ip = hi
3467 ADDS v1,v1,v5 // lo += carry (sets C on overflow)
3468 ADCCS ip,ip,#0 // if (lo<carry) { hi += 1 }; runs only when C is set
3469 LDR v5,[a3] // v5 = *destptr (the carry register is reused as a temporary)
3470 ADDS v1,v1,v5 // lo += *destptr (sets C on overflow)
3471 ADC v5,ip,#0 // if (lo<*destptr) { hi += 1 }; carry=hi
3472 STR v1,[a3],#4 // *destptr++ = lo
3473 SUBS a4,a4,#1 // len--
3474 BNE muluadd_loop_up_l1 // until len==0
3475 MOV a1,v5 // return carry
3476 LDMFD sp!,{v1,v5,pc}^ // restore v1,v5 and return (pc := saved lr)
// --- Variant 2: product obtained from the helper mulu32_64_vregs ---
// The code after the call reads the low word from v1 and the high word
// from ip. v1-v5 are saved on entry and restored on exit.
3478 STMFD sp!,{v1-v5,lr}
3480 LABEL(muluadd_loop_up_l1)
3482 BL mulu32_64_vregs // muluD(digit,*sourceptr++,hi=,lo=)
3483 ADDS v1,v1,v5 // lo += carry (sets C on overflow)
3484 ADCCS ip,ip,#0 // if (lo<carry) { hi += 1 }; runs only when C is set
3485 LDR v5,[a3] // v5 = *destptr (the carry register is reused as a temporary)
3486 ADDS v1,v1,v5 // lo += *destptr (sets C on overflow)
3487 ADC v5,ip,#0 // if (lo<*destptr) { hi += 1 }; carry=hi
3488 STR v1,[a3],#4 // *destptr++ = lo
3489 SUBS a4,a4,#1 // len--
3490 BNE muluadd_loop_up_l1 // until len==0
3491 MOV a1,v5 // return carry
3492 LDMFD sp!,{v1-v5,pc}^ // restore v1-v5 and return (pc := saved lr)
3495 // extern void mulusub_loop_up (uintD digit, uintD* sourceptr, uintD* destptr, uintC len);
3500 // a4 = count of words to be multiplied subtracted up
3502 // a1 - a4, ip destroyed
// Summary: multiply-subtract. Per the in-line comments, for each of the a4
// words the product of digit (a1) and a source word, plus the running carry
// (v5), is subtracted from the word at destptr (a3); the result is written
// back through a3 with post-increment and a borrow bumps the next carry.
// NOTE(review): the code places the final carry in a1 before returning, so
// the C prototype presumably returns uintD rather than void -- confirm
// against the declaring header.
// NOTE(review): two bodies defining the same label mulusub_loop_up_l1
// follow; presumably a preprocessor conditional not visible in this listing
// picks one of them -- confirm against the full file.
3503 EXPORT(mulusub_loop_up)
3504 DECLARE_FUNCTION(mulusub_loop_up)
3505 GLABEL(mulusub_loop_up)
// --- Variant 1: single UMULL per word ---
3507 STMFD sp!,{v1,v5,lr} // save work registers v1,v5 and the return address
// NOTE(review): as listed here, v5 (carry) is not initialised and the source
// word is not loaded into ip before the multiply -- verify in the full source.
3509 LABEL(mulusub_loop_up_l1)
3511 UMULL v1,ip,a1,ip // muluD(digit,*sourceptr++,hi=,lo=): v1 = lo, ip = hi
3512 ADDS v1,v1,v5 // lo += carry (sets C on overflow)
3513 ADC v5,ip,#0 // if (lo<carry) { hi += 1 }; v5 = hi
3514 LDR ip,[a3] // ip = *destptr
// NOTE(review): as listed here, ip is stored back unchanged and the ADDCC
// below would test the C flag left by the ADDS above (ADC without S, LDR and
// STR do not alter flags). The comments describe a subtraction, so verify
// that a flag-setting subtract of v1 from ip precedes the store in the full
// source.
3516 STR ip,[a3],#4 // *destptr++ = carry - lo
3517 ADDCC v5,v5,#1 // if (carry<lo) { hi += 1 }; carry=hi
3518 SUBS a4,a4,#1 // len--
3519 BNE mulusub_loop_up_l1 // until len==0
3520 MOV a1,v5 // return carry
3521 LDMFD sp!,{v1,v5,pc}^ // restore v1,v5 and return (pc := saved lr)
// --- Variant 2: product obtained from the helper mulu32_64_vregs ---
// The code after the call reads the low word from v1 and the high word
// from ip. v1-v5 are saved on entry and restored on exit.
3523 STMFD sp!,{v1-v5,lr}
3525 LABEL(mulusub_loop_up_l1)
3527 BL mulu32_64_vregs // muluD(digit,*sourceptr++,hi=,lo=)
3528 ADDS v1,v1,v5 // lo += carry (sets C on overflow)
3529 ADC v5,ip,#0 // if (lo<carry) { hi += 1 }; v5 = hi
3530 LDR ip,[a3] // ip = *destptr
// NOTE(review): same observation as in variant 1 -- no subtract is visible
// between this load and the store; verify against the full source.
3532 STR ip,[a3],#4 // *destptr++ = carry - lo
3533 ADDCC v5,v5,#1 // if (carry<lo) { hi += 1 }; carry=hi
3534 SUBS a4,a4,#1 // len--
3535 BNE mulusub_loop_up_l1 // until len==0
3536 MOV a1,v5 // return carry
3537 LDMFD sp!,{v1-v5,pc}^ // restore v1-v5 and return (pc := saved lr)
3542 // extern void shiftxor_loop_up (uintD* xptr, const uintD* yptr, uintC count, uintC i);
3546 // a3 = count of words to be shifted left
3547 // a4 = size of left shift
3549 // a1 - a4, ip destroyed
// Summary: XORs the words at yptr (a2), shifted left by a4 bits, into the
// words at xptr (a1). Each y word contributes (y << a4) to the current x
// word and (y >> (32 - a4)) to the following x word. The leftover
// count mod 4 words are handled one at a time first, then the remainder is
// processed four words per iteration with multi-register loads and stores.
// Register roles visible in the code: ip = x word currently being built,
// lr = 32 - a4 (complementary right shift), v6 = count mod 4 in the
// prologue, v5 = current y word in the prologue.
// NOTE(review): this listing appears to omit several instructions (a
// compare before BLT/..GT, the loads of v2-v4 and the reload of ip inside
// the unrolled loop, and the stores of the pending ip before each return).
// The notes below mark the spots; verify them against the full source.
3550 EXPORT(shiftxor_loop_up) // word aligned shiftxor loop up
3551 DECLARE_FUNCTION(shiftxor_loop_up)
3552 GLABEL(shiftxor_loop_up)
3553 STMFD sp!,{v5,v6,lr} // save v5,v6 and the return address (lr is reused below)
3554 RSB lr,a4,#32 // size of complementary right shift
3555 LDR ip,[a1] // get first *xptr
3556 ANDS v6,a3,#3 // multiple of 4 words ?
3557 BEQ shiftxor_loop_up_l1 // yup, so branch
// First leftover word.
3558 LDR v5,[a2],#4 // get *yptr
3559 EOR ip,ip,v5,ASL a4 // combine with modified *xptr
3560 STR ip,[a1],#4 // save new *xptr
3561 LDR ip,[a1] // get next *xptr
3562 EOR ip,ip,v5,LSR lr // combine with *xptr
// NOTE(review): no flag-setting compare of v6 is visible before this BLT
// (the last flag update listed is the ANDS above) -- verify.
3564 BLT shiftxor_loop_up_l1 // better to branch than skip instrs.
// Second leftover word.
3565 LDR v5,[a2],#4 // get *yptr
3566 EOR ip,ip,v5,ASL a4 // combine with modified *xptr
3567 STR ip,[a1],#4 // save new *xptr
3568 LDR ip,[a1] // get next *xptr
3569 EOR ip,ip,v5,LSR lr // combine with *xptr
// Third leftover word, executed conditionally (GT) instead of branching.
3570 LDRGT v5,[a2],#4 // get *yptr
3571 EORGT ip,ip,v5,ASL a4 // combine with modified *xptr
3572 STRGT ip,[a1],#4 // save new *xptr
3573 LDRGT ip,[a1] // get next *xptr
3574 EORGT ip,ip,v5,LSR lr // combine with *xptr
3575 LABEL(shiftxor_loop_up_l1)
3576 BICS a3,a3,#3 // set counter to multiple of 4
// NOTE(review): as listed, the pending word in ip is not stored back to
// [a1] before this early return -- verify.
3578 LDMEQFD sp!,{v5,v6,pc}^ // return if done
3579 STMFD sp!,{v1-v4} // save work regs
// Unrolled loop: four y words in, four x words out per iteration.
3580 LABEL(shiftxor_loop_up_l2)
3581 LDMIA a2!,{v3,v4,v5,v6} // load 4 words yptr[0..3] in one go
3582 EOR v1,ip,v3,ASL a4 // combine with modified *xptr
// NOTE(review): as listed, v2, v3 and v4 are not loaded from xptr[1..3]
// (nor combined with the right-shifted previous y word) before the EORs
// below -- verify.
3585 EOR v2,v2,v4,ASL a4 // combine with modified *xptr
3588 EOR v3,v3,v5,ASL a4 // combine with modified *xptr
3591 EOR v4,v4,v6,ASL a4 // combine with modified *xptr
3592 STMIA a1!,{v1,v2,v3,v4} // store 4 words xptr[0..3] in one go
// NOTE(review): as listed, ip is not refreshed from the next x word and
// yptr[3] >> lr before the next iteration -- verify.
3595 SUBS a3,a3,#4 // decrement counter by 4
3596 BGT shiftxor_loop_up_l2
// One pop undoes both pushes: v1-v4 saved before the loop, then v5,v6 and
// the entry lr (loaded into pc) saved at function entry.
3598 LDMFD sp!,{v1-v6,pc}^ // restore work regs and return