// ariarm.d (c) Copyright 1994, 1997 P.J.Burwood
// little-endian modifications (c) Copyright 1996 B. Haible
// external routines for arilev1.d
// Processor: ARM in APCS mode
// Assembler-Syntax: ObjAsm under RISC OS, GAS otherwise
// Assumptions: intCsize=32, intDsize=32.
// Parameter passing conventions: APCS means that registers a1-a4 and ip
//   do not have to be preserved across function calls.
// Note: A sequence of up to 4 conditional instructions is used in preference
//   to a branch.


#ifdef __riscos

// ObjAsm syntax: APCS register names

a1      RN      0
a2      RN      1
a3      RN      2
a4      RN      3
v1      RN      4
v2      RN      5
v3      RN      6
v4      RN      7
v5      RN      8
v6      RN      9
sl      RN      10
fp      RN      11
ip      RN      12
sp      RN      13
lr      RN      14
pc      RN      15

f0      FN      0
f1      FN      1
f2      FN      2
f3      FN      3
f4      FN      4
f5      FN      5
f6      FN      6
f7      FN      7

// Symbol helpers: every label gets a leading underscore under ObjAsm.
#define C(x) _##x
#define EXPORT(x) EXPORT x
#define DECLARE_FUNCTION(x)
#define GLABEL(x) _##x
#define LABEL(x) _##x

        AREA    |C$$code|,CODE,READONLY

#else

// GAS syntax: APCS register names

a1      .req    r0
a2      .req    r1
a3      .req    r2
a4      .req    r3
v1      .req    r4
v2      .req    r5
v3      .req    r6
v4      .req    r7
v5      .req    r8
v6      .req    r9
rfp     .req    r9
sl      .req    r10
fp      .req    r11
ip      .req    r12
sp      .req    r13
lr      .req    r14
pc      .req    r15

// Symbol helpers: global labels get a leading underscore, local ones do not.
#define C(x) _##x
#define EXPORT(x) .global _##x
#if defined(__NetBSD__)
#define DECLARE_FUNCTION(x) .type _##x,%function
#else
#define DECLARE_FUNCTION(x)
#endif
#define GLABEL(x) _##x##:
#define LABEL(x) x##:
#define RRX rrx
#define END

#endif


#if defined(__arm7m__) || defined(__arm8__) || defined(__arm9__) || defined(__strongarm__)
  // ARM7M and later have 32x32 -> 64 multiplies which execute in 2-4 clocks.
  #define HAVE_umull
#endif


#if defined(__GNUC__) && 0
  // With GNU C, we would like to pass the second return value in a2, don't
  // need a global variable. Unfortunately, the current Acorn gcc crashes if
  // we declare an appropriate local register variable with __asm__.
  // It would be possible to declare the functions as returning a 64-bit
  // result, but given the quality of gcc code dealing with 64-bit entities
  // and the subtleties of 64-bit returns values (passed in register or in
  // memory?) we now let it be.
#else
  // Use three global variables.
  #define MULU32_HIGH
  #define DIVU_16_REST
  #define DIVU_32_REST
#endif


// Literal words holding the addresses of the global result variables.
// They are read PC-relative by the routines below.
#ifdef __riscos

#ifdef MULU32_HIGH
ptr_mulu32_high
        IMPORT  mulu32_high
        DCD     mulu32_high
#endif
#ifdef DIVU_16_REST
ptr_divu_16_rest
        IMPORT  divu_16_rest
        DCD     divu_16_rest
#endif
#ifdef DIVU_32_REST
ptr_divu_32_rest
        IMPORT  divu_32_rest
        DCD     divu_32_rest
#endif

#else

#ifdef MULU32_HIGH
ptr_mulu32_high:
        .word   _mulu32_high
        .align  0
#endif
#ifdef DIVU_16_REST
ptr_divu_16_rest:
        .word   _divu_16_rest
        .align  0
#endif
#ifdef DIVU_32_REST
ptr_divu_32_rest:
        .word   _divu_32_rest
        .align  0
#endif

#endif


// extern uint32 mulu32_ (uint32 x, uint32 y);
//       entry
//               a1 = x
//               a2 = y
//       exit
//               a1 = low32(x*y)
//               a2 = high32(x*y)
//               mulu32_high = high32(x*y)
//               a3,a4,ip destroyed
        EXPORT(mulu32_)
        DECLARE_FUNCTION(mulu32_)
GLABEL(mulu32_)
#ifdef HAVE_umull
        MOV     a3,a2                   // free a2 so it can receive the high word
        UMULL   a1,a2,a3,a1             // a2:a1 = x * y
#else
        // Schoolbook multiply on 16-bit halves.
        MOV     ip,a1,LSR #16           // ip = high16(x)
        MOV     a3,a2,LSR #16           // a3 = high16(y)
        BIC     a1,a1,ip,LSL #16        // a1 = low16(x)
        BIC     a2,a2,a3,LSL #16        // a2 = low16(y)
        MUL     a4,a1,a2                // low  * low
        MUL     a2,ip,a2                // high(x) * low(y)   ) middle
        MUL     a1,a3,a1                // high(y) * low(x)   ) products
        MUL     a3,ip,a3                // high * high
        ADDS    a2,a2,a1                // sum of the middle products
                                        // (MLA is no use here, the carry is needed)
        ADDCS   a3,a3,#0x10000          // propagate that carry into the high part
        ADDS    a1,a4,a2,LSL #16        // a1 = low 32 bits of the product
        ADC     a2,a3,a2,LSR #16        // a2 = high 32 bits of the product
#endif
#ifdef MULU32_HIGH
        LDR     a3,[pc,#ptr_mulu32_high-.-8]
        STR     a2,[a3]                 // mulu32_high = high word
#endif
        MOVS    pc,lr

// extern uint16 divu_3216_1616_ (uint32 x, uint16 y);
//       entry
//               a1 = x
//               a2 = y
//       exit
//               a1 = q = floor(x/y)
//               a2 = r = x-q*y
//               divu_16_rest = r = x-q*y
//               a3 destroyed
        EXPORT(divu_3216_1616_)
        DECLARE_FUNCTION(divu_3216_1616_)
GLABEL(divu_3216_1616_)
        // see cl_low_div.cc for algorithm
        // in that notation: a1 = r, a2 = -s.
        // Fully unrolled: one initial trial subtraction followed by
        // 15 shift-and-subtract steps; each ADCS shifts the previous
        // quotient bit (the carry) into the bottom of a1.
        MOV     a2,a2,LSL#15            // divisor * 2^15
        RSB     a2,a2,#0                // a2 = -(divisor * 2^15)
        ADDS    a1,a2,a1                // trial subtraction
        SUBCC   a1,a1,a2                // undo it when it did not fit
        ADCS    a1,a2,a1,LSL#1          // step 1: shift in bit, subtract again
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1          // step 2
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1          // step 3
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1          // step 4
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1          // step 5
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1          // step 6
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1          // step 7
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1          // step 8
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1          // step 9
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1          // step 10
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1          // step 11
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1          // step 12
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1          // step 13
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1          // step 14
        SUBCC   a1,a1,a2
        ADCS    a1,a2,a1,LSL#1          // step 15
        SUBCC   a1,a1,a2                // last conditional subtraction
        MOV     a2,a1,LSR#15            // a2 = remainder (MOV keeps the carry)
        ADC     a1,a1,a1                // shift the last quotient bit in
        MOV     a1,a1,LSL#16            // keep only the low 16 bits:
        MOV     a1,a1,LSR#16            //   shift up and back down
#ifdef DIVU_16_REST
        LDR     a3,[pc,#ptr_divu_16_rest-.-8]
        STR     a2,[a3]                 // divu_16_rest = remainder
#endif
        MOVS    pc, lr

// extern uint32 divu_6432_3232_ (uint32 xhi, uint32 xlo, uint32 y); // -> Quotient q
// extern uint32 divu_32_rest;                                       // -> Rest r
//       see cl_low_div.cc for algorithm
//       entry
//               a1 = xhi (dividend)
//               a2 = xlo (dividend)
//               a3 = y (divisor)
//       exit
//               a1 = 32 bit quotient
//               a2 = 32 bit remainder
//               a3, a4 destroyed
        EXPORT(divu_6432_3232_)
        DECLARE_FUNCTION(divu_6432_3232_)
GLABEL(divu_6432_3232_)
        STMFD   sp!, {v1-v6,lr}
        MOV     v2, a2                  // v2 = xlo
        MOV     v1, a3                  // v1 = y
        CMP     a3,#0x10000             // y <= (uint32)(bit(16)-1) ?
        BCS     divu_6432_3232_l1       // no: take the general path
        // Divisor fits in 16 bits: two 32/16 divisions are enough.
        MOV     a2, v2, LSR #16
        ORR     a1, a2, a1, ASL #16     // = highlow32(low16(xhi),high16(xlo))
        MOV     a2, v1
        BL      C(divu_3216_1616_)
        MOV     v3, a1                  // v3 = q1
        MOV     a1, v2, ASL #16
        MOV     a1, a1, LSR #16
        ORR     a1, a1, a2, ASL #16     // = highlow32(r1,low16(xlo))
        MOV     a2, v1
        BL      C(divu_3216_1616_)
        ORR     a1, a1, v3, ASL #16     // = highlow32(q1,q0)
#ifdef DIVU_32_REST
        LDR     a4,[pc,#ptr_divu_32_rest-.-8]
        STR     a2,[a4]                 // divu_32_rest = remainder
#endif
        LDMFD   sp!, {v1-v6,pc}^

LABEL(divu_6432_3232_l1)
        // Normalise: shift y left until its top bit is set, s = shift count.
        MOV     v3, #0                  // s = 0
        MOVS    a4, v1, LSR #16         // while ((sint32)y >= 0)
        ADDEQ   v3, v3, #16             //   { y = y<<1; s++; }
        MOVEQ   v1, v1, ASL #16         // (done as a 16/8/4/2/1 binary search)
        MOVS    a4, v1, LSR #24
        ADDEQ   v3, v3, #8
        MOVEQ   v1, v1, ASL #8
        MOVS    a4, v1, LSR #28
        ADDEQ   v3, v3, #4
        MOVEQ   v1, v1, ASL #4
        MOVS    a4, v1, LSR #30
        ADDEQ   v3, v3, #2
        MOVEQ   v1, v1, ASL #2
        MOVS    a4, v1, LSR #31
        ADDEQ   v3, v3, #1
        MOVEQ   v1, v1, ASL #1

        CMP     v3, #0
        MOVNE   a2, a1, ASL v3          // if (!(s==0))
        RSBNE   a1, v3, #32             //   { xhi = (xhi << s)
        ORRNE   a1, a2, v2, LSR a1      //         | (xlo >> (32-s));
        MOVNE   v2, v2, ASL v3          //     xlo = xlo << s; }
        ADD     a2, v1, #0x10000        // y1_1 = high16(y)+1
        MOVS    v5, a2, LSR #16         // if (y1_1 = 0)
        MOVEQ   v4, a1, ASL #16         //   r16 = low16(xhi) * 2^16
        MOVEQ   a1, a1, LSR #16         //   q1 = high16(xhi)
        MOVNE   a2, v5
        BLNE    C(divu_3216_1616_)      // divu_3216_1616(xhi,y1_1, q1=,r16=)
        MOVNE   v4, a2, ASL #16         // r16 = r16 * 2^16
        ORR     v4, v4, v2, LSR #16     // r = highlow32(r16,high16(xlo))
        MOV     a4, v1, ASL #16         // tmp = mulu16(low16(y),q1)
        MOV     a4, a4, LSR #16
        MUL     a3, a4, a1
        RSB     a3, a3, a1, ASL #16     // r2 = highlow32_0(q1) - tmp
        MOV     v6, a1                  // v6 = q1
        ADDS    a1, v4, a3              // r += r2
        ADDCS   v6, v6, #1              // if ( r < r2 ) { q1 += 1
        SUBCS   a1, a1, v1              //                 r -= y }
        CMP     a1, v1                  // if (r >= y)
        ADDCS   v6, v6, #1              //     { q1 += 1
        SUBCS   a1, a1, v1              //       r -= y }
        CMP     v5, #0                  // if (y1_1 = 0)
        MOVEQ   v4, a1, ASL #16         //     { r16 = low16(r) * 2^16
        MOVEQ   a1, a1, LSR #16         //       q0  = high16(r) }
        MOVNE   a2, v5
        BLNE    C(divu_3216_1616_)      // divu_3216_1616(r,y1_1, q0=,r16=)
        MOVNE   v4, a2, ASL #16         // r16 = r16 * 2^16
        MOV     v2, v2, ASL #16
        ORR     v4, v4, v2, LSR #16     // r = highlow32(r16,low16(xlo))
        MOV     a4, v1, ASL #16         // tmp = mulu16(low16(y),q0)
        MOV     a4, a4, LSR #16
        MUL     a3, a4, a1
        RSB     a3, a3, a1, ASL #16     // r2 = highlow32_0(q0) - tmp
        ADDS    v4, v4, a3              // r += r2
        ADDCS   a1, a1, #1              // if ( r < r2 ) { q0 += 1
        SUBCS   v4, v4, v1              //                 r -= y }
        CMP     v4, v1                  // if (r >= y)
        ADDCS   a1, a1, #1              //     { q0 += 1
        SUBCS   v4, v4, v1              //       r -= y }
        MOV     a2, v4, LSR v3          // remainder = r >> s
        ORR     a1, a1, v6, ASL #16     // return highlow32(q1,q0)
#ifdef DIVU_32_REST
        LDR     a3,[pc,#ptr_divu_32_rest-.-8]
        STR     a2,[a3]                 // divu_32_rest = remainder
#endif
        LDMFD   sp!, {v1-v6,pc}^

// extern uintD* copy_loop_up (uintD* sourceptr, uintD* destptr, uintC count);
//       entry
//               a1 = source pointer
//               a2 = destination pointer
//               a3 = count of words to store
//       exit
//               a1 = address of last word stored + 1
//               a2 - a4, ip destroyed
        EXPORT(copy_loop_up)            // word aligned copy loop up
        DECLARE_FUNCTION(copy_loop_up)
GLABEL(copy_loop_up)
        ANDS    a4,a3,#3                // a4 = count mod 4
        BEQ     copy_loop_up_l1         // nothing odd to copy first
        CMP     a4,#2                   // flags select 1, 2 (GE) or 3 (GT) words
        LDR     a4,[a1],#4              // word 1 (always)
        STR     a4,[a2],#4
        LDRGE   a4,[a1],#4              // word 2
        STRGE   a4,[a2],#4
        LDRGT   a4,[a1],#4              // word 3
        STRGT   a4,[a2],#4
LABEL(copy_loop_up_l1)
        BICS    a4,a3,#3                // a4 = count rounded down to a multiple of 4
        MOVEQ   a1,a2                   // result = destination pointer reached
        MOVEQS  pc,lr                   // no full groups left: return
        STMFD   sp!,{v1,lr}             // need two more scratch registers
LABEL(copy_loop_up_l2)
        LDMIA   a1!,{a3,v1,ip,lr}       // move 4 words
        STMIA   a2!,{a3,v1,ip,lr}
        SUBS    a4,a4,#8                // two groups accounted for per pass
        LDMGEIA a1!,{a3,v1,ip,lr}       // second group only if it exists
        STMGEIA a2!,{a3,v1,ip,lr}
        BGT     copy_loop_up_l2         // more groups remain
        MOV     a1,a2                   // result = destination pointer reached
        LDMFD   sp!,{v1,pc}^            // restore and return

// extern uintD* copy_loop_down (uintD* sourceptr, uintD* destptr, uintC count);
//       entry
//               a1 = source pointer
//               a2 = destination pointer
//               a3 = count of words to store
//       exit
//               a1 = address of last word stored
//               a2 - a4, ip destroyed
        EXPORT(copy_loop_down)          // word aligned copy loop down
        DECLARE_FUNCTION(copy_loop_down)
GLABEL(copy_loop_down)
        ANDS    a4,a3,#3                // a4 = count mod 4
        BEQ     copy_loop_down_l1       // nothing odd to copy first
        CMP     a4,#2                   // flags select 1, 2 (GE) or 3 (GT) words
        LDR     a4,[a1,#-4]!            // word 1 (always)
        STR     a4,[a2,#-4]!
        LDRGE   a4,[a1,#-4]!            // word 2
        STRGE   a4,[a2,#-4]!
        LDRGT   a4,[a1,#-4]!            // word 3
        STRGT   a4,[a2,#-4]!
LABEL(copy_loop_down_l1)
        BICS    a4,a3,#3                // a4 = count rounded down to a multiple of 4
        MOVEQ   a1,a2                   // result = destination pointer reached
        MOVEQS  pc,lr                   // no full groups left: return
        STMFD   sp!,{v1,lr}             // need two more scratch registers
LABEL(copy_loop_down_l2)
        LDMDB   a1!,{a3,v1,ip,lr}       // move 4 words
        STMDB   a2!,{a3,v1,ip,lr}
        SUBS    a4,a4,#8                // two groups accounted for per pass
        LDMGEDB a1!,{a3,v1,ip,lr}       // second group only if it exists
        STMGEDB a2!,{a3,v1,ip,lr}
        BGT     copy_loop_down_l2       // more groups remain
        MOV     a1,a2                   // result = destination pointer reached
        LDMFD   sp!,{v1,pc}^            // restore and return

// extern uintD* clear_loop_up (uintD* destptr, uintC count);
//       entry
//               a1 = destination pointer
//               a2 = count of words to store
//       exit
//               a1 = address of last word stored + 1
//               a2 - a4, ip destroyed
        EXPORT(clear_loop_up)           // word aligned clear loop up
        DECLARE_FUNCTION(clear_loop_up)
GLABEL(clear_loop_up)
        MOV     a3,#0                   // filler = 0
                                        // fall through into fill_loop_up

// extern uintD* fill_loop_up (uintD* destptr, uintC count, uintD filler);
//       entry
//               a1 = destination pointer
//               a2 = count of words to store
//               a3 = word to store
//       exit
//               a1 = address of last word stored + 1
//               a2 - a4, ip destroyed
        EXPORT(fill_loop_up)            // word aligned fill loop up
        DECLARE_FUNCTION(fill_loop_up)
GLABEL(fill_loop_up)
        ANDS    a4,a2,#3                // a4 = count mod 4
        BEQ     fill_loop_up_l1         // nothing odd to store first
        CMP     a4,#2                   // flags select 1, 2 (GE) or 3 (GT) words
        STR     a3,[a1],#4              // word 1 (always)
        STRGE   a3,[a1],#4              // word 2
        STRGT   a3,[a1],#4              // word 3
LABEL(fill_loop_up_l1)
        BICS    a4,a2,#3                // a4 = count rounded down to a multiple of 4
        MOVEQS  pc,lr                   // no full groups left: return
        STMFD   sp!,{v1,lr}             // need two more scratch registers
        MOV     v1,a3                   // replicate the filler so that one
        MOV     ip,a3                   // STM writes four copies
        MOV     lr,a3
LABEL(fill_loop_up_l2)
        STMIA   a1!,{a3,v1,ip,lr}       // store 4 fillers
        SUBS    a4,a4,#8                // two groups accounted for per pass
        STMGEIA a1!,{a3,v1,ip,lr}       // second group only if it exists
        BGT     fill_loop_up_l2         // more groups remain
        LDMFD   sp!,{v1,pc}^            // restore and return

// extern uintD* clear_loop_down (uintD* destptr, uintC count);
//       entry
//               a1 = destination pointer
//               a2 = count of words to store
//       exit
//               a1 = address of last word stored
//               a2 - a4, ip destroyed
        EXPORT(clear_loop_down)         // word aligned clear loop down
        DECLARE_FUNCTION(clear_loop_down)
GLABEL(clear_loop_down)
        MOV     a3,#0                   // filler = 0
                                        // fall through into fill_loop_down

// extern uintD* fill_loop_down (uintD* destptr, uintC count, uintD filler);
//       entry
//               a1 = destination pointer
//               a2 = count of words to store
//               a3 = word to store
//       exit
//               a1 = address of last word stored
//               a2 - a4, ip destroyed
        EXPORT(fill_loop_down)          // word aligned fill loop down
        DECLARE_FUNCTION(fill_loop_down)
GLABEL(fill_loop_down)
        ANDS    a4,a2,#3                // a4 = count mod 4
        BEQ     fill_loop_down_l1       // nothing odd to store first
        CMP     a4,#2                   // flags select 1, 2 (GE) or 3 (GT) words
        STR     a3,[a1,#-4]!            // word 1 (always)
        STRGE   a3,[a1,#-4]!            // word 2
        STRGT   a3,[a1,#-4]!            // word 3
LABEL(fill_loop_down_l1)
        BICS    a4,a2,#3                // a4 = count rounded down to a multiple of 4
        MOVEQS  pc,lr                   // no full groups left: return
        STMFD   sp!,{v1,lr}             // need two more scratch registers
        MOV     v1,a3                   // replicate the filler so that one
        MOV     ip,a3                   // STM writes four copies
        MOV     lr,a3
LABEL(fill_loop_down_l2)
        STMDB   a1!,{a3,v1,ip,lr}       // store 4 fillers
        SUBS    a4,a4,#8                // two groups accounted for per pass
        STMGEDB a1!,{a3,v1,ip,lr}       // second group only if it exists
        BGT     fill_loop_down_l2       // more groups remain
        LDMFD   sp!,{v1,pc}^            // restore and return

// extern void test_loop_up (uintD* xptr, uintC count);
//       entry
//               a1 = xptr
//               a2 = count of words to be TESTed
//       exit
//               a1 = TRUE if any words are non-zero else FALSE
//               a2 - a4, ip destroyed
        EXPORT(test_loop_up)            // word aligned test loop up
        DECLARE_FUNCTION(test_loop_up)
GLABEL(test_loop_up)
        MOV     ip,a1                   // ip walks the array
        MOV     a1,#1                   // result preset to TRUE
        ANDS    a3,a2,#3                // a3 = count mod 4
        BEQ     test_loop_up_l1         // nothing odd to test first
        LDR     a4,[ip],#4              // word 1
        TEQ     a4,#0
        MOVNES  pc,lr                   // non-zero word found: return TRUE
        CMP     a3,#2                   // TEQ overwrote the flags, so compare
        BLT     test_loop_up_l1         // again and branch explicitly
        LDRGE   a4,[ip],#4              // word 2
        TEQGE   a4,#0
        MOVNES  pc,lr                   // non-zero word found: return TRUE
        CMP     a3,#2
        BLE     test_loop_up_l1         // only two odd words
        LDRGT   a4,[ip],#4              // word 3
        TEQGT   a4,#0
        MOVNES  pc,lr                   // non-zero word found: return TRUE
LABEL(test_loop_up_l1)
        BICS    a4,a2,#3                // a4 = count rounded down to a multiple of 4
        MOVEQ   a1,#0                   // everything tested was zero: FALSE
        MOVEQS  pc,lr
        STMFD   sp!,{v1,lr}             // need two more scratch registers
LABEL(test_loop_up_l2)
        LDMIA   ip!,{a2,a3,v1,lr}       // fetch 4 words
        TEQ     a2,#0                   // the chain stops at the first
        TEQEQ   a3,#0                   // non-zero word and leaves NE set
        TEQEQ   v1,#0
        TEQEQ   lr,#0
        LDMNEFD sp!,{v1,pc}^            // non-zero word found: return TRUE
        SUBS    a4,a4,#4                // one group done
        BGT     test_loop_up_l2         // more groups remain
        MOV     a1,#0                   // all words were zero: FALSE
        LDMFD   sp!,{v1,pc}^            // restore and return

// extern void test_loop_down (uintD* xptr, uintC count);
//       entry
//               a1 = xptr
//               a2 = count of words to be TESTed
//       exit
//               a1 = TRUE if any words are non-zero else FALSE
//               a2 - a4, ip destroyed
        EXPORT(test_loop_down)          // word aligned test loop down
        DECLARE_FUNCTION(test_loop_down)
GLABEL(test_loop_down)
        MOV     ip,a1                   // ip walks the array
        MOV     a1,#1                   // result preset to TRUE
        ANDS    a3,a2,#3                // a3 = count mod 4
        BEQ     test_loop_down_l1       // nothing odd to test first
        LDR     a4,[ip,#-4]!            // word 1
        TEQ     a4,#0
        MOVNES  pc,lr                   // non-zero word found: return TRUE
        CMP     a3,#2                   // TEQ overwrote the flags, so compare
        BLT     test_loop_down_l1       // again and branch explicitly
        LDRGE   a4,[ip,#-4]!            // word 2
        TEQGE   a4,#0
        MOVNES  pc,lr                   // non-zero word found: return TRUE
        CMP     a3,#2
        BLE     test_loop_down_l1       // only two odd words
        LDRGT   a4,[ip,#-4]!            // word 3
        TEQGT   a4,#0
        MOVNES  pc,lr                   // non-zero word found: return TRUE
LABEL(test_loop_down_l1)
        BICS    a4,a2,#3                // a4 = count rounded down to a multiple of 4
        MOVEQ   a1,#0                   // everything tested was zero: FALSE
        MOVEQS  pc,lr
        STMFD   sp!,{v1,lr}             // need two more scratch registers
LABEL(test_loop_down_l2)
        LDMDB   ip!,{a2,a3,v1,lr}       // fetch 4 words
        TEQ     a2,#0                   // the chain stops at the first
        TEQEQ   a3,#0                   // non-zero word and leaves NE set
        TEQEQ   v1,#0
        TEQEQ   lr,#0
        LDMNEFD sp!,{v1,pc}^            // non-zero word found: return TRUE
        SUBS    a4,a4,#4                // one group done
        BGT     test_loop_down_l2       // more groups remain
        MOV     a1,#0                   // all words were zero: FALSE
        LDMFD   sp!,{v1,pc}^            // restore and return

#if CL_DS_BIG_ENDIAN_P

// extern void or_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be ORed
//       exit
//               xptr |= yptr for count words
//               a1 - a4, ip destroyed
        EXPORT(or_loop_up)              // word aligned or loop up
        DECLARE_FUNCTION(or_loop_up)
GLABEL(or_loop_up)
        ANDS    a4,a3,#3                // a4 = count mod 4
        BEQ     or_loop_up_l1           // nothing odd to do first
        CMP     a4,#2                   // flags select 1, 2 (GE) or 3 (GT) words
        LDR     a4,[a2],#4              // word 1 (always)
        LDR     ip,[a1]
        ORR     ip,ip,a4
        STR     ip,[a1],#4
        BLT     or_loop_up_l1           // one word only: skip the rest by branch
        LDRGE   a4,[a2],#4              // word 2
        LDRGE   ip,[a1]
        ORRGE   ip,ip,a4
        STRGE   ip,[a1],#4
        LDRGT   a4,[a2],#4              // word 3
        LDRGT   ip,[a1]
        ORRGT   ip,ip,a4
        STRGT   ip,[a1],#4
LABEL(or_loop_up_l1)
        BICS    a4,a3,#3                // a4 = count rounded down to a multiple of 4
        MOVEQS  pc,lr                   // no full groups left: return
        STMFD   sp!,{v1-v5,lr}          // free six more registers
LABEL(or_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // 4 words from y
        LDMIA   a1,{v3,v4,v5,lr}        // 4 words from x (no writeback)
        ORR     v3,v3,a3                // x |= y
        ORR     v4,v4,v1
        ORR     v5,v5,v2
        ORR     lr,lr,ip
        STMIA   a1!,{v3,v4,v5,lr}       // write the 4 results back
        SUBS    a4,a4,#4                // one group done
        BGT     or_loop_up_l2           // more groups remain
        LDMFD   sp!,{v1-v5,pc}^         // restore and return

#endif

// extern void xor_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be XORed
//       exit
//               xptr ^= yptr for count words
//               a1 - a4, ip destroyed
        EXPORT(xor_loop_up)             // word aligned xor loop up
        DECLARE_FUNCTION(xor_loop_up)
GLABEL(xor_loop_up)
        ANDS    a4,a3,#3                // a4 = count mod 4
        BEQ     xor_loop_up_l1          // nothing odd to do first
        CMP     a4,#2                   // flags select 1, 2 (GE) or 3 (GT) words
        LDR     a4,[a2],#4              // word 1 (always)
        LDR     ip,[a1]
        EOR     ip,ip,a4
        STR     ip,[a1],#4
        BLT     xor_loop_up_l1          // one word only: skip the rest by branch
        LDRGE   a4,[a2],#4              // word 2
        LDRGE   ip,[a1]
        EORGE   ip,ip,a4
        STRGE   ip,[a1],#4
        LDRGT   a4,[a2],#4              // word 3
        LDRGT   ip,[a1]
        EORGT   ip,ip,a4
        STRGT   ip,[a1],#4
LABEL(xor_loop_up_l1)
        BICS    a4,a3,#3                // a4 = count rounded down to a multiple of 4
        MOVEQS  pc,lr                   // no full groups left: return
        STMFD   sp!,{v1-v5,lr}          // free six more registers
LABEL(xor_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // 4 words from y
        LDMIA   a1,{v3,v4,v5,lr}        // 4 words from x (no writeback)
        EOR     v3,v3,a3                // x ^= y
        EOR     v4,v4,v1
        EOR     v5,v5,v2
        EOR     lr,lr,ip
        STMIA   a1!,{v3,v4,v5,lr}       // write the 4 results back
        SUBS    a4,a4,#4                // one group done
        BGT     xor_loop_up_l2          // more groups remain
        LDMFD   sp!,{v1-v5,pc}^         // restore and return

#if CL_DS_BIG_ENDIAN_P

// extern void and_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be ANDed
//       exit
//               xptr &= yptr for count words
//               a1 - a4, ip destroyed
        EXPORT(and_loop_up)             // word aligned and loop up
        DECLARE_FUNCTION(and_loop_up)
GLABEL(and_loop_up)
        ANDS    a4,a3,#3                // a4 = count mod 4
        BEQ     and_loop_up_l1          // nothing odd to do first
        CMP     a4,#2                   // flags select 1, 2 (GE) or 3 (GT) words
        LDR     a4,[a2],#4              // word 1 (always)
        LDR     ip,[a1]
        AND     ip,ip,a4
        STR     ip,[a1],#4
        BLT     and_loop_up_l1          // one word only: skip the rest by branch
        LDRGE   a4,[a2],#4              // word 2
        LDRGE   ip,[a1]
        ANDGE   ip,ip,a4
        STRGE   ip,[a1],#4
        LDRGT   a4,[a2],#4              // word 3
        LDRGT   ip,[a1]
        ANDGT   ip,ip,a4
        STRGT   ip,[a1],#4
LABEL(and_loop_up_l1)
        BICS    a4,a3,#3                // a4 = count rounded down to a multiple of 4
        MOVEQS  pc,lr                   // no full groups left: return
        STMFD   sp!,{v1-v5,lr}          // free six more registers
LABEL(and_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // 4 words from y
        LDMIA   a1,{v3,v4,v5,lr}        // 4 words from x (no writeback)
        AND     v3,v3,a3                // x &= y
        AND     v4,v4,v1
        AND     v5,v5,v2
        AND     lr,lr,ip
        STMIA   a1!,{v3,v4,v5,lr}       // write the 4 results back
        SUBS    a4,a4,#4                // one group done
        BGT     and_loop_up_l2          // more groups remain
        LDMFD   sp!,{v1-v5,pc}^         // restore and return

// extern void eqv_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be EQVed
//       exit
//               xptr = ~(xptr ^ yptr) for count words
//               a1 - a4, ip destroyed
        EXPORT(eqv_loop_up)             // word aligned eqv loop up
        DECLARE_FUNCTION(eqv_loop_up)
GLABEL(eqv_loop_up)
        ANDS    a4,a3,#3                // a4 = count mod 4
        BEQ     eqv_loop_up_l1          // nothing odd to do first
        CMP     a4,#2                   // flags select 1, 2 (GE) or 3 (GT) words
        LDR     a4,[a2],#4              // word 1 (always)
        LDR     ip,[a1]
        EOR     ip,ip,a4
        MVN     ip,ip
        STR     ip,[a1],#4
        BLT     eqv_loop_up_l1          // one word only: skip the rest by branch
        LDRGE   a4,[a2],#4              // word 2
        LDRGE   ip,[a1]
        EORGE   ip,ip,a4
        MVNGE   ip,ip
        STRGE   ip,[a1],#4
        BLE     eqv_loop_up_l1          // two words only: skip the rest by branch
        LDRGT   a4,[a2],#4              // word 3
        LDRGT   ip,[a1]
        EORGT   ip,ip,a4
        MVNGT   ip,ip
        STRGT   ip,[a1],#4
LABEL(eqv_loop_up_l1)
        BICS    a4,a3,#3                // a4 = count rounded down to a multiple of 4
        MOVEQS  pc,lr                   // no full groups left: return
        STMFD   sp!,{v1-v5,lr}          // free six more registers
LABEL(eqv_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // 4 words from y
        LDMIA   a1,{v3,v4,v5,lr}        // 4 words from x (no writeback)
        EOR     v3,v3,a3                // x = ~(x ^ y)
        MVN     v3,v3
        EOR     v4,v4,v1
        MVN     v4,v4
        EOR     v5,v5,v2
        MVN     v5,v5
        EOR     lr,lr,ip
        MVN     lr,lr
        STMIA   a1!,{v3,v4,v5,lr}       // write the 4 results back
        SUBS    a4,a4,#4                // one group done
        BGT     eqv_loop_up_l2          // more groups remain
        LDMFD   sp!,{v1-v5,pc}^         // restore and return

// extern void nand_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be NANDed
//       exit
//               xptr = ~(xptr & yptr) for count words
//               a1 - a4, ip destroyed
        EXPORT(nand_loop_up)            // word aligned nand loop up
        DECLARE_FUNCTION(nand_loop_up)
GLABEL(nand_loop_up)
        ANDS    a4,a3,#3                // a4 = count mod 4
        BEQ     nand_loop_up_l1         // nothing odd to do first
        CMP     a4,#2                   // flags select 1, 2 (GE) or 3 (GT) words
        LDR     a4,[a2],#4              // word 1 (always)
        LDR     ip,[a1]
        AND     ip,ip,a4
        MVN     ip,ip
        STR     ip,[a1],#4
        BLT     nand_loop_up_l1         // one word only: skip the rest by branch
        LDRGE   a4,[a2],#4              // word 2
        LDRGE   ip,[a1]
        ANDGE   ip,ip,a4
        MVNGE   ip,ip
        STRGE   ip,[a1],#4
        BLE     nand_loop_up_l1         // two words only: skip the rest by branch
        LDRGT   a4,[a2],#4              // word 3
        LDRGT   ip,[a1]
        ANDGT   ip,ip,a4
        MVNGT   ip,ip
        STRGT   ip,[a1],#4
LABEL(nand_loop_up_l1)
        BICS    a4,a3,#3                // a4 = count rounded down to a multiple of 4
        MOVEQS  pc,lr                   // no full groups left: return
        STMFD   sp!,{v1-v5,lr}          // free six more registers
LABEL(nand_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // 4 words from y
        LDMIA   a1,{v3,v4,v5,lr}        // 4 words from x (no writeback)
        AND     v3,v3,a3                // x = ~(x & y)
        MVN     v3,v3
        AND     v4,v4,v1
        MVN     v4,v4
        AND     v5,v5,v2
        MVN     v5,v5
        AND     lr,lr,ip
        MVN     lr,lr
        STMIA   a1!,{v3,v4,v5,lr}       // write the 4 results back
        SUBS    a4,a4,#4                // one group done
        BGT     nand_loop_up_l2         // more groups remain
        LDMFD   sp!,{v1-v5,pc}^         // restore and return

// extern void nor_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be NORed
//       exit
//               xptr = ~(xptr | yptr) for count words
//               a1 - a4, ip destroyed
        EXPORT(nor_loop_up)             // word aligned nor loop up
        DECLARE_FUNCTION(nor_loop_up)
GLABEL(nor_loop_up)
        ANDS    a4,a3,#3                // a4 = count mod 4
        BEQ     nor_loop_up_l1          // nothing odd to do first
        CMP     a4,#2                   // flags select 1, 2 (GE) or 3 (GT) words
        LDR     a4,[a2],#4              // word 1 (always)
        LDR     ip,[a1]
        ORR     ip,ip,a4
        MVN     ip,ip
        STR     ip,[a1],#4
        BLT     nor_loop_up_l1          // one word only: skip the rest by branch
        LDRGE   a4,[a2],#4              // word 2
        LDRGE   ip,[a1]
        ORRGE   ip,ip,a4
        MVNGE   ip,ip
        STRGE   ip,[a1],#4
        BLE     nor_loop_up_l1          // two words only: skip the rest by branch
        LDRGT   a4,[a2],#4              // word 3
        LDRGT   ip,[a1]
        ORRGT   ip,ip,a4
        MVNGT   ip,ip
        STRGT   ip,[a1],#4
LABEL(nor_loop_up_l1)
        BICS    a4,a3,#3                // a4 = count rounded down to a multiple of 4
        MOVEQS  pc,lr                   // no full groups left: return
        STMFD   sp!,{v1-v5,lr}          // free six more registers
LABEL(nor_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // 4 words from y
        LDMIA   a1,{v3,v4,v5,lr}        // 4 words from x (no writeback)
        ORR     v3,v3,a3                // x = ~(x | y)
        MVN     v3,v3
        ORR     v4,v4,v1
        MVN     v4,v4
        ORR     v5,v5,v2
        MVN     v5,v5
        ORR     lr,lr,ip
        MVN     lr,lr
        STMIA   a1!,{v3,v4,v5,lr}       // write the 4 results back
        SUBS    a4,a4,#4                // one group done
        BGT     nor_loop_up_l2          // more groups remain
        LDMFD   sp!,{v1-v5,pc}^         // restore and return

// extern void andc2_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be ANDC2ed
//       exit
//               xptr = xptr & ~yptr for count words
//               a1 - a4, ip destroyed
        EXPORT(andc2_loop_up)           // word aligned andc2 loop up
        DECLARE_FUNCTION(andc2_loop_up)
GLABEL(andc2_loop_up)
        ANDS    a4,a3,#3                // a4 = count mod 4
        BEQ     andc2_loop_up_l1        // nothing odd to do first
        CMP     a4,#2                   // flags select 1, 2 (GE) or 3 (GT) words
        LDR     a4,[a2],#4              // word 1 (always)
        LDR     ip,[a1]
        BIC     ip,ip,a4
        STR     ip,[a1],#4
        BLT     andc2_loop_up_l1        // one word only: skip the rest by branch
        LDRGE   a4,[a2],#4              // word 2
        LDRGE   ip,[a1]
        BICGE   ip,ip,a4
        STRGE   ip,[a1],#4
        LDRGT   a4,[a2],#4              // word 3
        LDRGT   ip,[a1]
        BICGT   ip,ip,a4
        STRGT   ip,[a1],#4
LABEL(andc2_loop_up_l1)
        BICS    a4,a3,#3                // a4 = count rounded down to a multiple of 4
        MOVEQS  pc,lr                   // no full groups left: return
        STMFD   sp!,{v1-v5,lr}          // free six more registers
LABEL(andc2_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // 4 words from y
        LDMIA   a1,{v3,v4,v5,lr}        // 4 words from x (no writeback)
        BIC     v3,v3,a3                // x &= ~y
        BIC     v4,v4,v1
        BIC     v5,v5,v2
        BIC     lr,lr,ip
        STMIA   a1!,{v3,v4,v5,lr}       // write the 4 results back
        SUBS    a4,a4,#4                // one group done
        BGT     andc2_loop_up_l2        // more groups remain
        LDMFD   sp!,{v1-v5,pc}^         // restore and return

// extern void orc2_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be ORC2ed
//       exit
//               xptr = xptr | ~yptr for count words
//               a1 - a4, ip destroyed
        EXPORT(orc2_loop_up)            // word aligned orc2 loop up
        DECLARE_FUNCTION(orc2_loop_up)
GLABEL(orc2_loop_up)
        ANDS    a4,a3,#3                // a4 = count mod 4
        BEQ     orc2_loop_up_l1         // nothing odd to do first
        CMP     a4,#2                   // flags select 1, 2 (GE) or 3 (GT) words
        LDR     a4,[a2],#4              // word 1 (always)
        LDR     ip,[a1]
        MVN     a4,a4
        ORR     ip,ip,a4
        STR     ip,[a1],#4
        BLT     orc2_loop_up_l1         // one word only: skip the rest by branch
        LDRGE   a4,[a2],#4              // word 2
        LDRGE   ip,[a1]
        MVNGE   a4,a4
        ORRGE   ip,ip,a4
        STRGE   ip,[a1],#4
        BLE     orc2_loop_up_l1         // two words only: skip the rest by branch
        LDRGT   a4,[a2],#4              // word 3
        LDRGT   ip,[a1]
        MVNGT   a4,a4
        ORRGT   ip,ip,a4
        STRGT   ip,[a1],#4
LABEL(orc2_loop_up_l1)
        BICS    a4,a3,#3                // a4 = count rounded down to a multiple of 4
        MOVEQS  pc,lr                   // no full groups left: return
        STMFD   sp!,{v1-v5,lr}          // free six more registers
LABEL(orc2_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // 4 words from y
        LDMIA   a1,{v3,v4,v5,lr}        // 4 words from x (no writeback)
        MVN     a3,a3                   // x |= ~y
        ORR     v3,v3,a3
        MVN     v1,v1
        ORR     v4,v4,v1
        MVN     v2,v2
        ORR     v5,v5,v2
        MVN     ip,ip
        ORR     lr,lr,ip
        STMIA   a1!,{v3,v4,v5,lr}       // write the 4 results back
        SUBS    a4,a4,#4                // one group done
        BGT     orc2_loop_up_l2         // more groups remain
        LDMFD   sp!,{v1-v5,pc}^         // restore and return

// extern void not_loop_up (uintD* xptr, uintC count);
//       entry
//               a1 = xptr
//               a2 = count of words to be NOTed
//       exit
//               xptr = ~xptr for count words
//               a1 - a4, ip destroyed
        EXPORT(not_loop_up)             // word aligned not loop up
        DECLARE_FUNCTION(not_loop_up)
GLABEL(not_loop_up)
        ANDS    a3,a2,#3                // a3 = count mod 4
        BEQ     not_loop_up_l1          // nothing odd to do first
        CMP     a3,#2                   // flags select 1, 2 (GE) or 3 (GT) words
        LDR     a3,[a1]                 // word 1 (always); a3 is free now that
        MVN     a3,a3                   // the flags hold the comparison
        STR     a3,[a1],#4
        BLT     not_loop_up_l1          // one word only: skip the rest by branch
        LDRGE   a3,[a1]                 // word 2
        MVNGE   a3,a3
        STRGE   a3,[a1],#4
        LDRGT   a3,[a1]                 // word 3
        MVNGT   a3,a3
        STRGT   a3,[a1],#4
LABEL(not_loop_up_l1)
        BICS    a4,a2,#3                // a4 = count rounded down to a multiple of 4
        MOVEQS  pc,lr                   // no full groups left: return
        STMFD   sp!,{lr}                // lr is used as a data register below
LABEL(not_loop_up_l2)
        LDMIA   a1,{a2,a3,ip,lr}        // fetch 4 words, NO writeback
        MVN     a2,a2                   // complement them
        MVN     a3,a3
        MVN     ip,ip
        MVN     lr,lr
        STMIA   a1!,{a2,a3,ip,lr}       // write the 4 results back
        SUBS    a4,a4,#4                // one group done
        BGT     not_loop_up_l2          // more groups remain
        LDMFD   sp!,{pc}^               // restore and return

// extern void and_test_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be AND_TESTed
//       exit
//               a1 = TRUE if any words ANDed together are non-zero else FALSE
//               a2 - a4, ip destroyed
        EXPORT(and_test_loop_up)        // word aligned and_test loop up
        DECLARE_FUNCTION(and_test_loop_up)
GLABEL(and_test_loop_up)
        ANDS    a4,a3,#3                // a4 = count mod 4
        BEQ     and_test_loop_up_l1     // nothing odd to test first
        CMP     a4,#2                   // carry clear <=> only one odd word
        LDR     a4,[a2],#4              // word 1
        LDR     ip,[a1],#4
        TST     ip,a4                   // x & y ?
        MOVNE   a1,#1                   // common bit found: return TRUE
        MOVNES  pc,lr
        BCC     and_test_loop_up_l1     // branch on the carry left by the CMP
        LDRGE   a4,[a2],#4              // word 2
        LDRGE   ip,[a1],#4
        TSTGE   ip,a4
        MOVNE   a1,#1                   // common bit found: return TRUE
        MOVNES  pc,lr
        ANDS    a4,a3,#3                // recompute count mod 4 (a4 was reused)
        CMP     a4,#2
        BLE     and_test_loop_up_l1     // two words only: skip the rest by branch
        LDRGT   a4,[a2],#4              // word 3
        LDRGT   ip,[a1],#4
        TSTGT   ip,a4
        MOVNE   a1,#1                   // common bit found: return TRUE
        MOVNES  pc,lr
LABEL(and_test_loop_up_l1)
        BICS    a4,a3,#3                // a4 = count rounded down to a multiple of 4
        MOVEQ   a1,#0                   // nothing left and no hit so far: FALSE
        MOVEQS  pc,lr
        STMFD   sp!,{v1-v6,lr}          // free seven more registers
        MOV     v6,a1                   // v6 walks x, a1 becomes the result
        MOV     a1,#1                   // result preset to TRUE
LABEL(and_test_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // 4 words from y
        LDMIA   v6!,{v3,v4,v5,lr}       // 4 words from x
        TST     v3,a3                   // the chain stops at the first pair
        TSTEQ   v4,v1                   // with a common bit and leaves NE set
        TSTEQ   v5,v2
        TSTEQ   lr,ip
        LDMNEFD sp!,{v1-v6,pc}^         // common bit found: return TRUE
        SUBS    a4,a4,#4                // one group done
        BGT     and_test_loop_up_l2     // more groups remain
        MOV     a1,#0                   // no pair had a common bit: FALSE
        LDMFD   sp!,{v1-v6,pc}^         // restore and return

#endif

// extern void compare_loop_up (uintD* xptr, uintD* yptr, uintC count);
//       entry
//               a1 = xptr
//               a2 = yptr
//               a3 = count of words to be COMPAREd
//       exit
//               a1 = +1 if first non-equal word in xptr[] and yptr[]
//                       xptr[i] > yptr[i]
//                    -1 if xptr[i] < yptr[i]
//                     0 otherwise
//               a2 - a4, ip destroyed
        EXPORT(compare_loop_up)         // word aligned compare loop up
        DECLARE_FUNCTION(compare_loop_up)
GLABEL(compare_loop_up)
        ANDS    a4,a3,#3                // a4 = count mod 4
        BEQ     compare_loop_up_l1      // nothing odd to compare first
        LDR     a4,[a2],#4              // word 1
        LDR     ip,[a1],#4
        CMP     ip,a4                   // unsigned compare x : y
        MVNLO   a1,#0                   // x < y -> -1
        MOVHI   a1,#1                   // x > y -> +1
        MOVNES  pc,lr                   // words differ: return the result
        ANDS    a4,a3,#3                // recompute count mod 4 (a4 was reused)
        CMP     a4,#2
        BLT     compare_loop_up_l1      // only one odd word
        LDR     a4,[a2],#4              // word 2
        LDR     ip,[a1],#4
        CMP     ip,a4
        MVNLO   a1,#0                   // x < y -> -1
        MOVHI   a1,#1                   // x > y -> +1
        MOVNES  pc,lr                   // words differ: return the result
        ANDS    a4,a3,#3                // recompute count mod 4 again
        CMP     a4,#2
        BLE     compare_loop_up_l1      // only two odd words
        LDR     a4,[a2],#4              // word 3
        LDR     ip,[a1],#4
        CMP     ip,a4
        MVNLO   a1,#0                   // x < y -> -1
        MOVHI   a1,#1                   // x > y -> +1
        MOVNES  pc,lr                   // words differ: return the result
LABEL(compare_loop_up_l1)
        BICS    a4,a3,#3                // a4 = count rounded down to a multiple of 4
        MOVEQ   a1,#0                   // xptr[] == yptr[] -> 0
        MOVEQS  pc,lr
        STMFD   sp!,{v1-v6,lr}          // free seven more registers
        MOV     v6,a1                   // v6 walks x, a1 becomes the result
        MOV     a1,#1                   // result preset to +1
LABEL(compare_loop_up_l2)
        LDMIA   a2!,{a3,v1,v2,ip}       // 4 words from y
        LDMIA   v6!,{v3,v4,v5,lr}       // 4 words from x
        CMP     v3,a3                   // the chain stops at the first
        CMPEQ   v4,v1                   // differing pair and keeps its flags
        CMPEQ   v5,v2
        CMPEQ   lr,ip
        MVNLO   a1,#0                   // x < y -> -1 (a1 already holds +1)
        LDMNEFD sp!,{v1-v6,pc}^         // words differ: return the result
        SUBS    a4,a4,#4                // one group done
        BGT     compare_loop_up_l2      // more groups remain
        MOV     a1,#0                   // all words equal -> 0
        LDMFD   sp!,{v1-v6,pc}^         // restore and return

#if CL_DS_BIG_ENDIAN_P

// extern uintD addto_loop_down (uintD* sourceptr, uintD* destptr, uintC count);
//       entry
//               a1 = sourceptr
//               a2 = destptr
//               a3 = count of words to be added
//       exit
//               destptr[] = sourceptr[] + destptr[]
//               a1 = last carry
//               a2 - a4, ip destroyed
        EXPORT(addto_loop_down)         // word aligned addto loop down
        DECLARE_FUNCTION(addto_loop_down)
GLABEL(addto_loop_down)
        MOV     a4,a3                   // rearrange the arguments for add_loop_down:
        MOV     a3,a2                   // count -> a4, destptr doubles as sourceptr2
                                        // fall through into add_loop_down

// extern uintD add_loop_down (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count);
//       entry
//               a1 = sourceptr1
//               a2 = sourceptr2
//               a3 = destptr
//               a4 = count of words to be added
//       exit
//               destptr[] = sourceptr1[] + sourceptr2[]
//               a1 = last carry
// a2 - a4, ip destroyed EXPORT(add_loop_down) // word aligned add loop down DECLARE_FUNCTION(add_loop_down) GLABEL(add_loop_down) ANDS ip,a4,#3 // multiple of 4 words ? BEQ add_loop_down_l1 // yup, so branch STMFD sp!,{v6,lr} LDR v6,[a2,#-4]! // add the first 1-3 words LDR lr,[a1,#-4]! // to align the total to a multiple ADDS lr,lr,v6 // of 4 words STR lr,[a3,#-4]! TEQ ip,#1 BEQ add_loop_down_l0 // need to branch 'cos PSR used LDR v6,[a2,#-4]! LDR lr,[a1,#-4]! ADCS lr,lr,v6 STR lr,[a3,#-4]! TEQ ip,#2 BEQ add_loop_down_l0 // need to branch 'cos PSR used LDR v6,[a2,#-4]! LDR lr,[a1,#-4]! ADCS lr,lr,v6 STR lr,[a3,#-4]! LABEL(add_loop_down_l0) // at least one add has happened BICS a4,a4,#3 // set counter to multiple of 4 BNE add_loop_down_l3 // branch if more adds to do ADCEQ a1,a4,a4 // set result to Carry (a4 is 0) LDMEQFD sp!,{v6,pc}^ // and return LABEL(add_loop_down_l1) BICS a4,a4,#3 // set counter to multiple of 4 MOVEQ a1,#0 // no adds, so C = 0 MOVEQS pc,lr // if zero then we're done CMN a4,#0 // clear carry bit STMFD sp!,{v6,lr} LABEL(add_loop_down_l3) STMFD sp!,{v1-v5} // save work regs LABEL(add_loop_down_l2) LDMDB a2!,{v1,v2,v3,ip} // load 4 words in one go LDMDB a1!,{v4,v5,v6,lr} // and from source2 ADCS lr,lr,ip // add the four words with carry ADCS v6,v6,v3 ADCS v5,v5,v2 ADCS v4,v4,v1 STMDB a3!,{v4,v5,v6,lr} // store 4 results SUB a4,a4,#4 // decrement counter by 4, preserve C TEQ a4,#0 // are we done ? BNE add_loop_down_l2 // if count non-zero then loop ADC a1,a4,a4 // set result to Carry (a4 is 0) LDMFD sp!,{v1-v6,pc}^ // restore work regs and return // extern uintD inc_loop_down (uintD* ptr, uintC count); // entry // a1 = ptr // a2 = count of words to be INCed // exit // a1 = 0 if any words are non-zero after increment else 1 // stop incrementing when first word becomes non-zero // a2 - a4, ip destroyed EXPORT(inc_loop_down) // word aligned inc loop down DECLARE_FUNCTION(inc_loop_down) GLABEL(inc_loop_down) ANDS a3,a2,#1 // multiple of 2 words ? 
// inc_loop_down, continued.  Z is set here iff the word count in a2 is even.
        BEQ     inc_loop_down_l1        // yup, so branch
        LDR     a4,[a1,#-4]!            // INC the first word
        ADDS    a4,a4,#1                // align the total to a multiple of 2
        STR     a4,[a1]
        MOVNE   a1,#0                   // set result to 0
        MOVNES  pc,lr                   // return 0 if non-zero result
LABEL(inc_loop_down_l1)
        BICS    a4,a2,#1                // set counter to multiple of 2
        MOVEQ   a1,#1                   // return 1
        MOVEQS  pc,lr                   // if zero then we're done
        MOV     ip,a1                   // move ptr to ip
        MOV     a1,#0                   // set result to 0
        ANDS    a3,a4,#3
        BEQ     inc_loop_down_l3
        LDMDB   ip,{a2,a3}              // load 2 words in one go
        ADDS    a3,a3,#1                // INC the two words
        ADDEQS  a2,a2,#1                // stopping when first word non-zero
        STMDB   ip!,{a2,a3}             // store 2 results
        MOVNES  pc,lr                   // return 0 if any result non-zero
        SUBS    a4,a4,#2                // decrement counter by 2
        MOVEQ   a1,#1                   // if finished loop then
        MOVEQS  pc,lr                   // return 1
LABEL(inc_loop_down_l3)                 // now a multiple of 4 words
        STMFD   sp!,{v1,lr}             // save work regs
LABEL(inc_loop_down_l2)
        LDMDB   ip,{a2,a3,v1,lr}        // load 4 words in one go
        ADDS    lr,lr,#1                // INC the four words
        ADDEQS  v1,v1,#1                // stopping when first word non-zero
        ADDEQS  a3,a3,#1
        ADDEQS  a2,a2,#1
        STMDB   ip!,{a2,a3,v1,lr}       // store 4 results
        LDMNEFD sp!,{v1,pc}^            // return 0 if any result non-zero
        SUBS    a4,a4,#4                // decrement counter by 4
        BGT     inc_loop_down_l2        // if count still positive then loop
        MOV     a1,#1
        LDMFD   sp!,{v1,pc}^            // restore work regs and return 1

// extern uintD sub_loop_down (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count);
//       Multi-word subtraction, walking all three pointers downwards
//       (pre-decrement addressing).
//       entry
//               a1 = sourceptr1
//               a2 = sourceptr2
//               a3 = destptr
//               a4 = count of words to be subtracted
//       exit
//               destptr[] = sourceptr1[] - sourceptr2[]
//               a1 = last carry: 0 if no borrow out of the last word,
//                    all ones (-1) if there was a borrow; 0 for count = 0
//               a2 - a4, ip destroyed
//       The first (count mod 4) words are handled one at a time, the
//       remainder four words per iteration.  The ARM C flag holds
//       "no borrow" between the steps, so only instructions that leave C
//       untouched (TEQ/BICS with an unrotated immediate, SUB without S)
//       are used between the SBCS instructions.
        EXPORT(sub_loop_down)           // word aligned sub loop down
        DECLARE_FUNCTION(sub_loop_down)
// GLABEL, not LABEL: with GAS, EXPORT() declares the prefixed symbol global
// and only GLABEL() defines that prefixed symbol.
GLABEL(sub_loop_down)
        ANDS    ip,a4,#3                // multiple of 4 words ?
        BEQ     sub_loop_down_l1        // yup, so branch
        STMFD   sp!,{v6,lr}
        LDR     v6,[a2,#-4]!            // subtract the first 1-3 words
        LDR     lr,[a1,#-4]!            // to align the total to a multiple
        SUBS    lr,lr,v6                // of 4 words
        STR     lr,[a3,#-4]!
        TEQ     ip,#1
        BNE     sub_loop_down_l0        // branch if more than one subtract
LABEL(sub_loop_down_l4)                 // drop through for better instr. timings
        BICS    a4,a4,#3                // set counter to multiple of 4
        SBCEQ   a1,a4,a4                // set result to Carry (a4 is 0)
        LDMEQFD sp!,{v6,pc}^            // and return
        STMFD   sp!,{v1-v5}             // save work regs (v6,lr already saved)
        B       sub_loop_down_l2        // branch if more subtracts to do
LABEL(sub_loop_down_l0)
        LDR     v6,[a2,#-4]!
        LDR     lr,[a1,#-4]!
        SBCS    lr,lr,v6
        STR     lr,[a3,#-4]!
        TEQ     ip,#2
        BEQ     sub_loop_down_l4        // need to branch 'cos PSR used
        LDR     v6,[a2,#-4]!
        LDR     lr,[a1,#-4]!
        SBCS    lr,lr,v6
        STR     lr,[a3,#-4]!
        B       sub_loop_down_l4
LABEL(sub_loop_down_l1)
        BICS    a4,a4,#3                // set counter to multiple of 4
        MOVEQ   a1,#0                   // no subtracts, so C = 0
        MOVEQS  pc,lr                   // if zero then we're done
        CMP     a4,#0                   // set carry bit, since a4 > 0
        STMFD   sp!,{v1-v6,lr}          // save work regs
LABEL(sub_loop_down_l2)
        LDMDB   a2!,{v1,v2,v3,ip}       // load 4 words in one go
        LDMDB   a1!,{v4,v5,v6,lr}       // and from source2
        SBCS    lr,lr,ip                // subtract the four words with carry
        SBCS    v6,v6,v3
        SBCS    v5,v5,v2
        SBCS    v4,v4,v1
        STMDB   a3!,{v4,v5,v6,lr}       // store 4 results
        SUB     a4,a4,#4                // decrement counter by 4, preserve C
        TEQ     a4,#0                   // are we done ?
        BNE     sub_loop_down_l2        // if count non-zero then loop
        SBC     a1,a4,a4                // set result to Carry (a4 is 0)
        LDMFD   sp!,{v1-v6,pc}^         // restore work regs and return

// extern uintD subx_loop_down (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count, uintD carry);
//       Same as sub_loop_down, but starts with an incoming borrow.
//       entry
//               a1 = sourceptr1
//               a2 = sourceptr2
//               a3 = destptr
//               a4 = count of words to be subtracted
//               [sp] = carry (zero = no borrow, non-zero = borrow)
//       exit
//               destptr[] = sourceptr1[] - sourceptr2[]
//               a1 = last carry: 0 if no borrow, all ones (-1) if borrow;
//                    for count = 0 this reflects the incoming carry
//               a2 - a4, ip destroyed
        EXPORT(subx_loop_down)          // word aligned xsub loop down
        DECLARE_FUNCTION(subx_loop_down)
// GLABEL, not LABEL: see sub_loop_down.
GLABEL(subx_loop_down)
        LDR     ip,[sp]                 // get starting value of carry
LABEL(subx_loop_down_lsub)
        RSBS    ip,ip,#0                // set carry in PSR (C set iff ip was 0)
        ANDS    ip,a4,#3                // multiple of 4 words ?
        BEQ     subx_loop_down_l1       // yup, so branch
        STMFD   sp!,{v6,lr}
        LDR     v6,[a2,#-4]!            // subtract the first 1-3 words
        LDR     lr,[a1,#-4]!            // to align the total to a multiple
        SBCS    lr,lr,v6                // of 4 words
        STR     lr,[a3,#-4]!
        TEQ     ip,#1
        BNE     subx_loop_down_l0       // branch if more than one subtract
LABEL(subx_loop_down_l4)                // drop through for better instr. timings
        BICS    a4,a4,#3                // set counter to multiple of 4
        SBCEQ   a1,a4,a4                // set result to Carry (a4 is 0)
        LDMEQFD sp!,{v6,pc}^            // and return
        STMFD   sp!,{v1-v5}             // save work regs (v6,lr already saved)
        B       subx_loop_down_l2       // branch if more subtracts to do
LABEL(subx_loop_down_l0)
        LDR     v6,[a2,#-4]!
        LDR     lr,[a1,#-4]!
        SBCS    lr,lr,v6
        STR     lr,[a3,#-4]!
        TEQ     ip,#2
        BEQ     subx_loop_down_l4       // need to branch 'cos PSR used
        LDR     v6,[a2,#-4]!
        LDR     lr,[a1,#-4]!
        SBCS    lr,lr,v6
        STR     lr,[a3,#-4]!
        B       subx_loop_down_l4
LABEL(subx_loop_down_l1)
        BICS    a4,a4,#3                // set counter to multiple of 4
        SBCEQ   a1,a4,a4                // set result to Carry (a4 is 0)
        MOVEQS  pc,lr                   // if zero then we're done
        STMFD   sp!,{v1-v6,lr}          // save work regs
LABEL(subx_loop_down_l2)
        LDMDB   a2!,{v1,v2,v3,ip}       // load 4 words in one go
        LDMDB   a1!,{v4,v5,v6,lr}       // and from source2
        SBCS    lr,lr,ip                // subtract the four words with carry
        SBCS    v6,v6,v3
        SBCS    v5,v5,v2
        SBCS    v4,v4,v1
        STMDB   a3!,{v4,v5,v6,lr}       // store 4 results
        SUB     a4,a4,#4                // decrement counter by 4, preserve C
        TEQ     a4,#0                   // are we done ?
        BNE     subx_loop_down_l2       // if count non-zero then loop
        SBC     a1,a4,a4                // set result to Carry (a4 is 0)
        LDMFD   sp!,{v1-v6,pc}^         // restore work regs and return

// extern uintD subfrom_loop_down (uintD* sourceptr, uintD* destptr, uintC count);
//       entry
//               a1 = sourceptr
//               a2 = destptr
//               a3 = count of words to be subtracted
//       exit
//               destptr[] = destptr[] - sourceptr[]
//               a1 = last carry
//               a2 - a4, ip destroyed
        EXPORT(subfrom_loop_down)       // word aligned subfrom loop down
        DECLARE_FUNCTION(subfrom_loop_down)
GLABEL(subfrom_loop_down)
        ANDS    ip,a3,#3                // multiple of 4 words ?
        BEQ     subfrom_loop_down_l1    // yup, so branch
        STMFD   sp!,{lr}
        LDR     a4,[a1,#-4]!            // subtract the first 1-3 words
        LDR     lr,[a2,#-4]!
// to align the total to a multiple SUBS lr,lr,a4 // of 4 words STR lr,[a2] TEQ ip,#1 BNE subfrom_loop_down_l0 // branch if more than one subtract LABEL(subfrom_loop_down_l4) // drop through for better instr. timings BICS a4,a3,#3 // set counter to multiple of 4 SBCEQ a1,a4,a4 // set result to Carry (a4 is 0) LDMEQFD sp!,{pc}^ // and return STMFD sp!,{v1-v5} // save work regs B subfrom_loop_down_l2 // branch if more subtracts to do LABEL(subfrom_loop_down_l0) LDR a4,[a1,#-4]! LDR lr,[a2,#-4]! SBCS lr,lr,a4 STR lr,[a2] TEQ ip,#2 BEQ subfrom_loop_down_l4 // need to branch 'cos PSR used LDR a4,[a1,#-4]! LDR lr,[a2,#-4]! SBCS lr,lr,a4 STR lr,[a2] B subfrom_loop_down_l4 LABEL(subfrom_loop_down_l1) BICS a4,a3,#3 // set counter to multiple of 4 MOVEQ a1,#0 // no subtracts, so C = 0 MOVEQS pc,lr // if zero then we're done CMP a4,#0 // set carry bit, since a4 > 0 STMFD sp!,{v1-v5,lr} // save work regs LABEL(subfrom_loop_down_l2) LDMDB a1!,{a3,v1,v2,ip} // load 4 words in one go LDMDB a2,{v3,v4,v5,lr} // and from destptr SBCS lr,lr,ip // subtract the four words with carry SBCS v5,v5,v2 SBCS v4,v4,v1 SBCS v3,v3,a3 STMDB a2!,{v3,v4,v5,lr} // store 4 results SUB a4,a4,#4 // decrement counter by 4, preserve C TEQ a4,#0 // are we done ? BNE subfrom_loop_down_l2 // if count non-zero then loop SBC a1,a4,a4 // set result to Carry (a4 is 0) LDMFD sp!,{v1-v5,pc}^ // restore work regs and return // extern uintD dec_loop_down (uintD* ptr, uintC count); // entry // a1 = ptr // a2 = count of words to be DECed // exit // a1 = 0 if any words are non-zero before decrement else -1 // stop decrementing when first word is non-zero // a2 - a4, ip destroyed EXPORT(dec_loop_down) // word aligned dec loop down DECLARE_FUNCTION(dec_loop_down) GLABEL(dec_loop_down) ANDS a3,a2,#1 // multiple of 2 words ? BEQ dec_loop_down_l1 // yup, so branch LDR a4,[a1,#-4]! 
// DEC the first word SUBS a4,a4,#1 // align the total to a multiple of 2 STR a4,[a1] MOVCS a1,#0 // set result to 0 MOVCSS pc,lr // return 0 if non-zero result LABEL(dec_loop_down_l1) BICS a4,a2,#1 // set counter to multiple of 2 MVNEQ a1,#0 // return -1 MOVEQS pc,lr // if zero then we're done MOV ip,a1 // move ptr to ip MOV a1,#0 // set result to 0 ANDS a3,a4,#3 BEQ dec_loop_down_l3 LDMDB ip,{a2,a3} // load 2 words in one go SUBS a3,a3,#1 // DEC the two words SUBCCS a2,a2,#1 // stopping when first word non-zero STMDB ip!,{a2,a3} // store 2 results MOVCSS pc,lr // return 0 if any result non-zero SUBS a4,a4,#2 // decrement counter by 2 MVNEQ a1,#0 // if finished loop then MOVEQS pc,lr // return -1 LABEL(dec_loop_down_l3) // now a multiple of 4 words STMFD sp!,{v1,lr} // save work regs LABEL(dec_loop_down_l2) LDMDB ip,{a2,a3,v1,lr} // load 4 words in one go SUBS lr,lr,#1 // DEC the four words SUBCCS v1,v1,#1 // stopping when first word non-zero SUBCCS a3,a3,#1 SUBCCS a2,a2,#1 STMDB ip!,{a2,a3,v1,lr} // store 4 results LDMCSFD sp!,{v1,pc}^ // return 0 if any carry SUBS a4,a4,#4 // decrement counter by 4 BGT dec_loop_down_l2 // if count still positive then loop MVN a1,#0 LDMFD sp!,{v1,pc}^ // restore work regs and return -1 // extern void neg_loop_down (uintD* ptr, uintC count); // entry // a1 = ptr // a2 = count of words. The long integer is to be NEGated // exit // ptr[] = -ptr[] for count words // a1 = last carry // a2 - a4, ip destroyed EXPORT(neg_loop_down) // word aligned neg loop down DECLARE_FUNCTION(neg_loop_down) GLABEL(neg_loop_down) CMPS a2,#0 // count = 0 ? MOVEQ a1,#0 // yup, so return 0 MOVEQS pc,lr LABEL(neg_loop_down_l1) // skip all the zero words first LDR a3,[a1,#-4]! 
// compare words against zero CMPS a3,#0 // downwards in memory BNE neg_loop_down_l2 // non-zero, so negate rest of words SUBS a2,a2,#1 // reduce count of words BNE neg_loop_down_l1 // more ?, so loop MOV a1,#0 // return 0 MOVS pc,lr LABEL(neg_loop_down_l2) RSB a3,a3,#0 // first non-zero word = -word STR a3,[a1] SUBS a2,a2,#1 MVNEQ a1,#0 // done ? -> return -1 MOVEQS pc,lr // now NOT rest of the words ANDS a3,a2,#3 // multiple of 4 words ? BEQ neg_loop_down_l3 // yup, so branch CMP a3,#2 // NOT the first 1-3 words LDR a3,[a1,#-4]! // to align the total to a multiple MVN a3,a3 // of 4 words STR a3,[a1] BLT neg_loop_down_l3 // better to branch than skip instrs. LDRGE a3,[a1,#-4]! MVNGE a3,a3 STRGE a3,[a1] LDRGT a3,[a1,#-4]! MVNGT a3,a3 STRGT a3,[a1] LABEL(neg_loop_down_l3) BICS a4,a2,#3 // set counter to multiple of 4 MVNEQ a1,#0 // set result to -1 MOVEQS pc,lr // if zero then we're done STMFD sp!,{lr} // save work regs LABEL(neg_loop_down_l4) LDMDB a1,{a2,a3,ip,lr} // load 4 words in one go,NO writeback MVN a2,a2 // NOT the four words MVN a3,a3 MVN ip,ip MVN lr,lr STMDB a1!,{a2,a3,ip,lr} // store 4 results SUBS a4,a4,#4 // decrement counter by 4 BGT neg_loop_down_l4 // if count still positive then loop MVN a1,#0 // set result to -1 LDMFD sp!,{pc}^ // restore work regs and return -1 // extern uintD shift1left_loop_down (uintD* ptr, uintC count); // entry // a1 = ptr // a2 = count of words to be shifted left // exit // a1 = carry out from last shift left // a2 - a4, ip destroyed EXPORT(shift1left_loop_down) // word aligned shift1left loop down DECLARE_FUNCTION(shift1left_loop_down) GLABEL(shift1left_loop_down) CMN a1,#0 // clear carry bit, since a1 > 0 ANDS a3,a2,#1 // multiple of 2 words ? BEQ shift1left_loop_down_l1 // yup, so branch LDR a4,[a1,#-4]! 
// shift left the first word ADDS a4,a4,a4 STR a4,[a1] LABEL(shift1left_loop_down_l1) BICS a4,a2,#1 // set counter to multiple of 2 ADCEQ a1,a4,a4 // if zero set result to C (a4 is 0) MOVEQS pc,lr // and return ANDS a3,a4,#3 // multiple of 4 words ? BEQ shift1left_loop_down_l3 // yup, so branch LDMDB a1,{a2,a3} // load 2 words in one go ADCS a3,a3,a3 // shift left the two words ADCS a2,a2,a2 STMDB a1!,{a2,a3} // store 2 results BICS a4,a4,#2 // decrement counter by 2 ADCEQ a1,a4,a4 // set result to Carry (a4 is 0) MOVEQS pc,lr // and return LABEL(shift1left_loop_down_l3) // now a multiple of 4 words STMFD sp!,{lr} // save work regs LABEL(shift1left_loop_down_l2) LDMDB a1,{a2,a3,ip,lr} // load 4 words in one go ADCS lr,lr,lr // shift left the four words ADCS ip,ip,ip ADCS a3,a3,a3 ADCS a2,a2,a2 STMDB a1!,{a2,a3,ip,lr} // store 4 results SUB a4,a4,#4 // decrement counter by 4 TEQ a4,#0 // are we done ? BNE shift1left_loop_down_l2 // if count non-zero then loop ADC a1,a4,a4 // set result to Carry (a4 is 0) LDMFD sp!,{pc}^ // restore work regs and return 1 // extern uintD shiftleft_loop_down (uintD* ptr, uintC count, uintC i, uintD carry); // entry // a1 = ptr // a2 = count of words to be shifted left // a3 = size of left shift // a4 = value to ORR in for first shift // exit // a1 = shift out from last shift left // a2 - a4, ip destroyed EXPORT(shiftleft_loop_down) // word aligned shiftleft loop down DECLARE_FUNCTION(shiftleft_loop_down) GLABEL(shiftleft_loop_down) STMFD sp!,{v6,lr} RSB v6,a3,#32 // size of complementary right shift ANDS ip,a2,#3 // multiple of 4 words ? BEQ shiftleft_loop_down_l1 // yup, so branch LDR lr,[a1,#-4]! // shiftleft the first 1-3 words ORR a4,a4,lr,ASL a3 // to align the total to a multiple STR a4,[a1,#0] // of 4 words MOV a4,lr,LSR v6 CMP ip,#2 BLT shiftleft_loop_down_l1 // better to branch than skip instrs. LDRGE lr,[a1,#-4]! ORRGE a4,a4,lr,ASL a3 STRGE a4,[a1,#0] MOVGE a4,lr,LSR v6 LDRGT lr,[a1,#-4]! 
ORRGT a4,a4,lr,ASL a3 STRGT a4,[a1,#0] MOVGT a4,lr,LSR v6 LABEL(shiftleft_loop_down_l1) BICS ip,a2,#3 // set counter to multiple of 4 MOVEQ a1,a4 // if zero then we're done LDMEQFD sp!,{v6,pc}^ // so return last shift out STMFD sp!,{v1-v3} // save work regs LABEL(shiftleft_loop_down_l2) LDMDB a1,{a2,v1,v2,v3} // load 4 words in one go ORR lr,a4,v3,ASL a3 // shiftleft the four words MOV a4,v3,LSR v6 // keep carry in a4 ORR v3,a4,v2,ASL a3 // and store results up a register MOV a4,v2,LSR v6 // to regs v1-v3,lr ORR v2,a4,v1,ASL a3 MOV a4,v1,LSR v6 ORR v1,a4,a2,ASL a3 MOV a4,a2,LSR v6 STMDB a1!,{v1,v2,v3,lr} // store 4 results SUBS ip,ip,#4 // decrement counter by 4 BGT shiftleft_loop_down_l2 // if count still positive then loop MOV a1,a4 // result = last shift out LDMFD sp!,{v1-v3,v6,pc}^ // restore work regs and return // extern uintD shiftleftcopy_loop_down (uintD* sourceptr, uintD* destptr, uintC count, uintC i); // entry // a1 = sourceptr // a2 = destptr // a3 = count of words to be shifted left // a4 = size of left shift // exit // a1 = shift out from last shift left // a2 - a4, ip destroyed EXPORT(shiftleftcopy_loop_down) // word aligned shiftleftcopy loop down DECLARE_FUNCTION(shiftleftcopy_loop_down) GLABEL(shiftleftcopy_loop_down) STMFD sp!,{v5,v6,lr} MOV v5,#0 // initial shift carry RSB v6,a4,#32 // size of complementary right shift ANDS ip,a3,#3 // multiple of 4 words ? BEQ shiftleftcopy_loop_down_l1 // yup, so branch LDR lr,[a1,#-4]! // shiftleft the first 1-3 words ORR v5,v5,lr,ASL a4 // to align the total to a multiple STR v5,[a2,#-4]! // of 4 words MOV v5,lr,LSR v6 CMP ip,#2 BLT shiftleftcopy_loop_down_l1 // better to branch than skip instrs. LDRGE lr,[a1,#-4]! ORRGE v5,v5,lr,ASL a4 STRGE v5,[a2,#-4]! MOVGE v5,lr,LSR v6 LDRGT lr,[a1,#-4]! ORRGT v5,v5,lr,ASL a4 STRGT v5,[a2,#-4]! 
// shiftleftcopy_loop_down, continued: third leading word (only if count mod 4 = 3).
        MOVGT   v5,lr,LSR v6
LABEL(shiftleftcopy_loop_down_l1)
        BICS    ip,a3,#3                // set counter to multiple of 4
        MOVEQ   a1,v5                   // if zero then we're done
        LDMEQFD sp!,{v5,v6,pc}^         // so return last shift out
        STMFD   sp!,{v1-v3}             // save work regs
LABEL(shiftleftcopy_loop_down_l2)
        LDMDB   a1!,{a3,v1,v2,v3}       // load 4 words in one go
        ORR     lr,v5,v3,ASL a4         // shiftleft the four words
        MOV     v5,v3,LSR v6            // keep carry in v5
        ORR     v3,v5,v2,ASL a4         // and store results up a register
        MOV     v5,v2,LSR v6            // to regs v1-v3,lr
        ORR     v2,v5,v1,ASL a4
        MOV     v5,v1,LSR v6
        ORR     v1,v5,a3,ASL a4
        MOV     v5,a3,LSR v6
        STMDB   a2!,{v1,v2,v3,lr}       // store 4 results
        SUBS    ip,ip,#4                // decrement counter by 4
        BGT     shiftleftcopy_loop_down_l2 // if count still positive then loop
        MOV     a1,v5                   // result = last shift out
        LDMFD   sp!,{v1-v3,v5,v6,pc}^   // restore work regs and return

// extern uintD shift1right_loop_up (uintD* ptr, uintC count, uintD carry);
//       Shift a multi-word number right by one bit, walking upwards in
//       memory; each word's bit 0 becomes the next word's bit 31.
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted right
//               a3 = carry (bit 0 is shifted into bit 31 of the first word)
//       exit
//               a1 = carry out from last shift right, delivered in bit 31
//                    (0 or 0x80000000) on every exit path
//               a2 - a4, ip destroyed
//       The bit in transit lives in the ARM C flag throughout, so between
//       the RRX shifts only instructions that leave C untouched are used
//       (ANDS/BICS/TEQ with an unrotated immediate, SUB without S).
        EXPORT(shift1right_loop_up)     // word aligned shift1right loop up
        DECLARE_FUNCTION(shift1right_loop_up)
GLABEL(shift1right_loop_up)
        MOVS    a3,a3,LSR #1            // set carry (C := bit 0 of a3)
        ANDS    a3,a2,#1                // multiple of 2 words ?
        BEQ     shift1right_loop_up_l1  // yup, so branch
        LDR     a4,[a1]                 // shift right the first word
        MOVS    a4,a4,RRX
        STR     a4,[a1],#4
LABEL(shift1right_loop_up_l1)
        BICS    a4,a2,#1                // set counter to multiple of 2
        MOVEQ   a1,a4,RRX               // if zero set result to C (a4 is 0)
        MOVEQS  pc,lr                   // and return
        ANDS    a3,a4,#3                // multiple of 4 words ?
        BEQ     shift1right_loop_up_l3  // yup, so branch
        LDMIA   a1,{a2,a3}              // load 2 words in one go
        MOVS    a2,a2,RRX               // shift right the two words
        MOVS    a3,a3,RRX
        STMIA   a1!,{a2,a3}             // store 2 results
        BICS    a4,a4,#2                // decrement counter by 2
        // Deliver the carry in bit 31 like the other two exits of this
        // routine (ADC would have put it in bit 0 instead).
        MOVEQ   a1,a4,RRX               // set result to Carry (a4 is 0)
        MOVEQS  pc,lr                   // and return
LABEL(shift1right_loop_up_l3)           // now a multiple of 4 words
        STMFD   sp!,{lr}                // save work regs
LABEL(shift1right_loop_up_l2)
        LDMIA   a1,{a2,a3,ip,lr}        // load 4 words in one go
        MOVS    a2,a2,RRX               // shift right the four words
        MOVS    a3,a3,RRX
        MOVS    ip,ip,RRX
        MOVS    lr,lr,RRX
        STMIA   a1!,{a2,a3,ip,lr}       // store 4 results
        SUB     a4,a4,#4                // decrement counter by 4
        TEQ     a4,#0                   // are we done ?
        BNE     shift1right_loop_up_l2  // if count non-zero then loop
        MOV     a1,a4,RRX               // set result to Carry (a4 is 0)
        LDMFD   sp!,{pc}^               // restore work regs and return carry

// extern uintD shiftright_loop_up (uintD* ptr, uintC count, uintC i);
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted right
//               a3 = size of right shift
//       exit
//               a1 = shift out from last shift right
//               a2 - a4, ip destroyed
        EXPORT(shiftright_loop_up)      // word aligned shiftright loop up
        DECLARE_FUNCTION(shiftright_loop_up)
GLABEL(shiftright_loop_up)
        STMFD   sp!,{v6,lr}
        MOV     a4,#0                   // initial shift carry
        RSB     v6,a3,#32               // size of complementary left shift
LABEL(shiftright_loop_up_l0)
        ANDS    ip,a2,#3                // multiple of 4 words ?
        BEQ     shiftright_loop_up_l1   // yup, so branch
        LDR     lr,[a1]                 // shiftright the first 1-3 words
        ORR     a4,a4,lr,LSR a3         // to align the total to a multiple
        STR     a4,[a1],#4              // of 4 words
        MOV     a4,lr,ASL v6
        CMP     ip,#2
        BLT     shiftright_loop_up_l1   // better to branch than skip instrs.
LDRGE lr,[a1] ORRGE a4,a4,lr,LSR a3 STRGE a4,[a1],#4 MOVGE a4,lr,ASL v6 LDRGT lr,[a1] ORRGT a4,a4,lr,LSR a3 STRGT a4,[a1],#4 MOVGT a4,lr,ASL v6 LABEL(shiftright_loop_up_l1) BICS ip,a2,#3 // set counter to multiple of 4 MOVEQ a1,a4 // if zero then we're done LDMEQFD sp!,{v6,pc}^ // so return last shift out STMFD sp!,{v1-v3} // save work regs LABEL(shiftright_loop_up_l2) LDMIA a1,{v1,v2,v3,lr} // load 4 words in one go ORR a2,a4,v1,LSR a3 // shiftright the four words MOV a4,v1,ASL v6 // keep carry in a4 ORR v1,a4,v2,LSR a3 // and store results down a register MOV a4,v2,ASL v6 // to regs a2,v1-v3 ORR v2,a4,v3,LSR a3 MOV a4,v3,ASL v6 ORR v3,a4,lr,LSR a3 MOV a4,lr,ASL v6 STMIA a1!,{a2,v1,v2,v3} // store 4 results SUBS ip,ip,#4 // decrement counter by 4 BGT shiftright_loop_up_l2 // if count still positive then loop MOV a1,a4 // result = last shift out LDMFD sp!,{v1-v3,v6,pc}^ // restore work regs and return // extern uintD shiftrightsigned_loop_up (uintD* ptr, uintC count, uintC i); // entry // a1 = ptr // a2 = count of words to be shifted right signed // a3 = size of right shift // exit // a1 = shift out from last shift right // a2 - a4, ip destroyed EXPORT(shiftrightsigned_loop_up)// word aligned shiftrightsigned loop up DECLARE_FUNCTION(shiftrightsigned_loop_up) GLABEL(shiftrightsigned_loop_up) STMFD sp!,{v6,lr} RSB v6,a3,#32 // size of complementary left shift LDR lr,[a1] // setup carry for first shift. 
MOV a4,lr,ASR #31 // this is the sign extended bits AND a4,a4,a4,LSL v6 // 31->(32-i) of the first word B shiftright_loop_up_l0 // use right shift code now // extern uintD shiftrightcopy_loop_up (uintD* sourceptr, uintD* destptr, uintC count, uintC i, uintD carry); // entry // a1 = sourceptr // a2 = destptr // a3 = count of words to be shifted right // a4 = size of right shift // [sp] = carry for first shift // exit // a1 = shift out from last shift right // a2 - a4, ip destroyed EXPORT(shiftrightcopy_loop_up) // word aligned shiftrightcopy loop up DECLARE_FUNCTION(shiftrightcopy_loop_up) GLABEL(shiftrightcopy_loop_up) STMFD sp!,{v5,v6,lr} LDR v5,[sp,#12] // initial shift carry RSB v6,a4,#32 // size of complementary left shift MOV v5,v5,ASL v6 LABEL(shiftrightcopy_loop_up_l0) ANDS ip,a3,#3 // multiple of 4 words ? BEQ shiftrightcopy_loop_up_l1 // yup, so branch LDR lr,[a1],#4 // shiftright the first 1-3 words ORR v5,v5,lr,LSR a4 // to align the total to a multiple STR v5,[a2],#4 // of 4 words MOV v5,lr,ASL v6 CMP ip,#2 BLT shiftrightcopy_loop_up_l1 // better to branch than skip instrs. 
LDRGE lr,[a1],#4 ORRGE v5,v5,lr,LSR a4 STRGE v5,[a2],#4 MOVGE v5,lr,ASL v6 LDRGT lr,[a1],#4 ORRGT v5,v5,lr,LSR a4 STRGT v5,[a2],#4 MOVGT v5,lr,ASL v6 LABEL(shiftrightcopy_loop_up_l1) BICS ip,a3,#3 // set counter to multiple of 4 MOVEQ a1,v5 // if zero then we're done LDMEQFD sp!,{v5,v6,pc}^ // so return last shift out STMFD sp!,{v1-v3} // save work regs LABEL(shiftrightcopy_loop_up_l2) LDMIA a1!,{v1,v2,v3,lr} // load 4 words in one go ORR a3,v5,v1,LSR a4 // shiftright the four words MOV v5,v1,ASL v6 // keep carry in v5 ORR v1,v5,v2,LSR a4 // and store results down a register MOV v5,v2,ASL v6 // to regs a2,v1-v3 ORR v2,v5,v3,LSR a4 MOV v5,v3,ASL v6 ORR v3,v5,lr,LSR a4 MOV v5,lr,ASL v6 STMIA a2!,{a3,v1,v2,v3} // store 4 results SUBS ip,ip,#4 // decrement counter by 4 BGT shiftrightcopy_loop_up_l2 // if count still positive then loop MOV a1,v5 // result = last shift out LDMFD sp!,{v1-v3,v5,v6,pc}^ // restore work regs and return #ifndef HAVE_umull // mulu32_64_vregs // entry // a1 = x // ip = y // exit // v1 = low32(x*y) // ip = high32(x*y) // v2,v3,v4 destroyed LABEL(mulu32_64_vregs) MOV v1,a1,LSR #16 // temp := top half of x MOV v2,ip,LSR #16 // hi := top half of y BIC v3,a1,v1,LSL #16 // x := bottom half of x BIC ip,ip,v2,LSL #16 // y := bottom half of y MUL v4,v3,ip // low section of result MUL ip,v1,ip // ) middle sections MUL v3,v2,v3 // ) of result MUL v2,v1,v2 // high section of result ADDS ip,ip,v3 // add middle sections // (can't use mla as we need carry) ADDCS v2,v2,#0x10000 // carry from above add ADDS v1,v4,ip,LSL #16 // x is now bottom 32 bits of result ADC ip,v2,ip,LSR #16 // hi is top 32 bits MOVS pc,lr #endif // extern uintD mulusmall_loop_down (uintD digit, uintD* ptr, uintC len, uintD newdigit); // entry // a1 = digit // a2 = ptr // a3 = count of words to be multiplied down // a4 = new digit = carry // exit // a1 = final carry of multiply // a2 - a4, ip destroyed EXPORT(mulusmall_loop_down) DECLARE_FUNCTION(mulusmall_loop_down) 
GLABEL(mulusmall_loop_down) CMP a3,#0 MOVEQ a1,a4 MOVEQS pc,lr #ifdef HAVE_umull STMFD sp!,{v1,lr} LABEL(mulusmall_loop_down_l1) LDR ip,[a2,#-4]! UMULL v1,ip,a1,ip // muluD(digit,*--ptr,hi=,lo=) ADDS v1,v1,a4 // lo += carry ADC a4,ip,#0 // if (lo yptr[i] // -1 if xptr[i] < yptr[i] // 0 otherwise // a2 - a4, ip destroyed EXPORT(compare_loop_down) // word aligned compare loop down DECLARE_FUNCTION(compare_loop_down) GLABEL(compare_loop_down) ANDS a4,a3,#3 // multiple of 4 words ? BEQ compare_loop_down_l1 // yup, so branch LDR a4,[a2,#-4]! // COMPARE the first 1-3 words LDR ip,[a1,#-4]! // to align the total to a multiple CMP ip,a4 // of 4 words MVNLO a1,#0 // x < y -> -1 MOVHI a1,#1 // x > y -> +1 MOVNES pc,lr // and return result if not equal ANDS a4,a3,#3 CMP a4,#2 BLT compare_loop_down_l1 // need to branch 'cos PSR used LDR a4,[a2,#-4]! LDR ip,[a1,#-4]! CMP ip,a4 MVNLO a1,#0 MOVHI a1,#1 MOVNES pc,lr ANDS a4,a3,#3 CMP a4,#2 BLE compare_loop_down_l1 // need to branch 'cos PSR used LDR a4,[a2,#-4]! LDR ip,[a1,#-4]! 
CMP ip,a4 MVNLO a1,#0 MOVHI a1,#1 MOVNES pc,lr LABEL(compare_loop_down_l1) BICS a4,a3,#3 // set counter to multiple of 4 MOVEQ a1,#0 // xptr[] == yptr[] -> 0 MOVEQS pc,lr // if zero then we're done STMFD sp!,{v1-v6,lr} // save work regs MOV v6,a1 // move xptr to v6 MOV a1,#1 // set result to +1 LABEL(compare_loop_down_l2) LDMDB a2!,{a3,v1,v2,ip} // load 4 words in one go LDMDB v6!,{v3,v4,v5,lr} // load test words CMP lr,ip // COMPARE the four words CMPEQ v5,v2 CMPEQ v4,v1 CMPEQ v3,a3 MVNLO a1,#0 // x < y -> -1 (a1 already holds +1) LDMNEFD sp!,{v1-v6,pc}^ SUBS a4,a4,#4 // decrement counter by 4 BGT compare_loop_down_l2 // if count still positive then loop MOV a1,#0 LDMFD sp!,{v1-v6,pc}^ // restore work regs and return // extern uintD addto_loop_up (uintD* sourceptr, uintD* destptr, uintC count); // entry // a1 = sourceptr // a2 = destptr // a3 = count of words to be added // exit // destptr[] = sourceptr[] + destptr[] // a1 = last carry // a2 - a4, ip destroyed EXPORT(addto_loop_up) // word aligned addto loop up DECLARE_FUNCTION(addto_loop_up) GLABEL(addto_loop_up) MOV a4,a3 // set regs for a call MOV a3,a2 // to add_loop_up // and drop into add_loop_up // extern uintD add_loop_up (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count); // entry // a1 = sourceptr1 // a2 = sourceptr2 // a3 = destptr // a4 = count of words to be added // exit // destptr[] = sourceptr1[] + sourceptr2[] // a1 = last carry // a2 - a4, ip destroyed EXPORT(add_loop_up) // word aligned add loop up DECLARE_FUNCTION(add_loop_up) GLABEL(add_loop_up) ANDS ip,a4,#3 // multiple of 4 words ? 
BEQ add_loop_up_l1 // yup, so branch STMFD sp!,{v6,lr} LDR v6,[a2],#4 // add the first 1-3 words LDR lr,[a1],#4 // to align the total to a multiple ADDS lr,lr,v6 // of 4 words STR lr,[a3],#4 TEQ ip,#1 BEQ add_loop_up_l0 // need to branch 'cos PSR used LDR v6,[a2],#4 LDR lr,[a1],#4 ADCS lr,lr,v6 STR lr,[a3],#4 TEQ ip,#2 BEQ add_loop_up_l0 // need to branch 'cos PSR used LDR v6,[a2],#4 LDR lr,[a1],#4 ADCS lr,lr,v6 STR lr,[a3],#4 LABEL(add_loop_up_l0) // at least one add has happened BICS a4,a4,#3 // set counter to multiple of 4 BNE add_loop_up_l3 // branch if more adds to do ADCEQ a1,a4,a4 // set result to Carry (a4 is 0) LDMEQFD sp!,{v6,pc}^ // and return LABEL(add_loop_up_l1) BICS a4,a4,#3 // set counter to multiple of 4 MOVEQ a1,#0 // no adds, so C = 0 MOVEQS pc,lr // if zero then we're done CMN a4,#0 // clear carry bit STMFD sp!,{v6,lr} LABEL(add_loop_up_l3) STMFD sp!,{v1-v5} // save work regs LABEL(add_loop_up_l2) LDMIA a2!,{v1,v2,v3,ip} // load 4 words in one go LDMIA a1!,{v4,v5,v6,lr} // and from source2 ADCS v4,v4,v1 // add the four words with carry ADCS v5,v5,v2 ADCS v6,v6,v3 ADCS lr,lr,ip STMIA a3!,{v4,v5,v6,lr} // store 4 results SUB a4,a4,#4 // decrement counter by 4, preserve C TEQ a4,#0 // are we done ? BNE add_loop_up_l2 // if count non-zero then loop ADC a1,a4,a4 // set result to Carry (a4 is 0) LDMFD sp!,{v1-v6,pc}^ // restore work regs and return // extern uintD inc_loop_up (uintD* ptr, uintC count); // entry // a1 = ptr // a2 = count of words to be INCed // exit // a1 = 0 if any words are non-zero after increment else 1 // stop incrementing when first word becomes non-zero // a2 - a4, ip destroyed EXPORT(inc_loop_up) // word aligned inc loop up DECLARE_FUNCTION(inc_loop_up) GLABEL(inc_loop_up) ANDS a3,a2,#1 // multiple of 2 words ? 
// ---- tail of inc_loop_up (its entry point lies above this point) ----
        BEQ     inc_loop_up_l1          // yup, so branch
        LDR     a4,[a1]                 // INC the first word
        ADDS    a4,a4,#1                // align the total to a multiple of 2
        STR     a4,[a1],#4
        MOVNE   a1,#0                   // set result to 0
        MOVNES  pc,lr                   // return 0 if non-zero result
LABEL(inc_loop_up_l1)
        BICS    a4,a2,#1                // set counter to multiple of 2
        MOVEQ   a1,#1                   // return 1
        MOVEQS  pc,lr                   // if zero then we're done
        MOV     ip,a1                   // move ptr to ip
        MOV     a1,#0                   // set result to 0
        ANDS    a3,a4,#3
        BEQ     inc_loop_up_l3
        LDMIA   ip,{a2,a3}              // load 2 words in one go
        ADDS    a2,a2,#1                // INC the two words
        ADDEQS  a3,a3,#1                // stopping when first word non-zero
        STMIA   ip!,{a2,a3}             // store 2 results
        MOVNES  pc,lr                   // return 0 if any result non-zero
        SUBS    a4,a4,#2                // decrement counter by 2
        MOVEQ   a1,#1                   // if finished loop then
        MOVEQS  pc,lr                   // return 1
LABEL(inc_loop_up_l3)                   // now a multiple of 4 words
        STMFD   sp!,{v1,lr}             // save work regs
LABEL(inc_loop_up_l2)
        LDMIA   ip,{a2,a3,v1,lr}        // load 4 words in one go
        ADDS    a2,a2,#1                // INC the four words
        ADDEQS  a3,a3,#1                // stopping when first word non-zero
        ADDEQS  v1,v1,#1
        ADDEQS  lr,lr,#1
        STMIA   ip!,{a2,a3,v1,lr}       // store 4 results
        LDMNEFD sp!,{v1,pc}^            // return 0 if any result non-zero
        SUBS    a4,a4,#4                // decrement counter by 4
        BGT     inc_loop_up_l2          // if count still positive then loop
        MOV     a1,#1
        LDMFD   sp!,{v1,pc}^            // restore work regs and return 1

// extern uintD sub_loop_up (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count);
// Multi-word subtraction, walking upwards through memory.
//       entry
//               a1 = sourceptr1 (minuend words)
//               a2 = sourceptr2 (subtrahend words)
//               a3 = destptr
//               a4 = count of words to be subtracted
//       exit
//               destptr[] = sourceptr1[] - sourceptr2[]
//               a1 = 0 when the last word produced no borrow (also when
//                    count is 0), -1 when it did
//               a2 - a4, ip destroyed
//       method
//               the (count MOD 4) leading words are done one at a time, the
//               remainder four per iteration.  The ARM C flag carries the
//               chain: C set = no borrow, C clear = borrow.
        EXPORT(sub_loop_up)             // word aligned sub loop up
        DECLARE_FUNCTION(sub_loop_up)
GLABEL(sub_loop_up)
        ANDS    ip,a4,#3                // ip = count MOD 4
        BEQ     sub_loop_up_l1          // zero: only whole groups of four
        STMFD   sp!,{v6,lr}             // two scratch registers for odd words
        LDR     v6,[a2],#4              // 1st odd word: plain SUBS, because
        LDR     lr,[a1],#4              // there is no borrow coming in
        SUBS    lr,lr,v6
        STR     lr,[a3],#4
        TEQ     ip,#1                   // exactly one odd word ? (C is kept)
        BNE     sub_loop_up_l0          // no, two or three of them
LABEL(sub_loop_up_l4)                   // all odd words done; fall-through entry
        BICS    a4,a4,#3                // a4 = count rounded down to 4n
        SBCEQ   a1,a4,a4                // nothing left: a1 = C - 1 (a4 is 0)
        LDMEQFD sp!,{v6,pc}^            //   and leave
        STMFD   sp!,{v1-v5}             // stack now holds v1-v6,lr
        B       sub_loop_up_l2          // join the 4-word loop, C still live
LABEL(sub_loop_up_l0)
        LDR     v6,[a2],#4              // 2nd odd word, borrow chained in
        LDR     lr,[a1],#4
        SBCS    lr,lr,v6
        STR     lr,[a3],#4
        TEQ     ip,#2                   // were there just two ?
        BEQ     sub_loop_up_l4          // flags are live, so branch rather
                                        // than use conditional execution
        LDR     v6,[a2],#4              // 3rd odd word
        LDR     lr,[a1],#4
        SBCS    lr,lr,v6
        STR     lr,[a3],#4
        B       sub_loop_up_l4
LABEL(sub_loop_up_l1)                   // count is a multiple of four
        BICS    a4,a4,#3                // (count MOD 4 was 0: tests count = 0)
        MOVEQ   a1,#0                   // count = 0: report no borrow
        MOVEQS  pc,lr
        CMP     a4,#0                   // a4 > 0, so this sets C (no borrow)
        STMFD   sp!,{v1-v6,lr}          // same stack layout as the l4 path
LABEL(sub_loop_up_l2)
        LDMIA   a2!,{v1,v2,v3,ip}       // four subtrahend words (sourceptr2)
        LDMIA   a1!,{v4,v5,v6,lr}       // four minuend words (sourceptr1)
        SBCS    v4,v4,v1                // subtract, rippling the borrow
        SBCS    v5,v5,v2
        SBCS    v6,v6,v3
        SBCS    lr,lr,ip
        STMIA   a3!,{v4,v5,v6,lr}       // four result words to destptr
        SUB     a4,a4,#4                // no S suffix: C must be preserved
        TEQ     a4,#0                   // finished ? (TEQ leaves C alone)
        BNE     sub_loop_up_l2          // more groups to go
        SBC     a1,a4,a4                // a1 = C - 1 (a4 is 0)
        LDMFD   sp!,{v1-v6,pc}^         // restore work regs and return

// extern uintD subx_loop_up (uintD* sourceptr1, uintD* sourceptr2, uintD* destptr, uintC count, uintD carry);
// Same job as sub_loop_up, but the borrow chain is seeded from a fifth
// argument instead of starting clean.
//       entry
//               a1 = sourceptr1
//               a2 = sourceptr2
//               a3 = destptr
//               a4 = count of words to be subtracted
//               [sp] = carry (0 = no borrow in, non-zero = borrow in)
//       exit
//               destptr[] = sourceptr1[] - sourceptr2[]
//               a1 = last carry (0 = no borrow out, -1 = borrow out)
//               a2 - a4, ip destroyed
        EXPORT(subx_loop_up)            // word aligned xsub loop up
        DECLARE_FUNCTION(subx_loop_up)
GLABEL(subx_loop_up)
        LDR     ip,[sp]                 // fifth argument: incoming carry
LABEL(subx_loop_up_lsub)
        RSBS    ip,ip,#0                // 0 - carry: C ends up set only
                                        // when carry was 0
        ANDS    ip,a4,#3                // ip = count MOD 4 (C survives)
        BEQ     subx_loop_up_l1         // no odd words
        STMFD   sp!,{v6,lr}             // two scratch registers
        LDR     v6,[a2],#4              // 1st odd word, borrow taken from C
        LDR     lr,[a1],#4
        SBCS    lr,lr,v6
        STR     lr,[a3],#4
        TEQ     ip,#1                   // was that the only odd word ?
        BNE     subx_loop_up_l0         // no, one or two more
LABEL(subx_loop_up_l4)                  // odd words done; fall-through entry
// subx_loop_up_l4 continues here: every odd leading word has been handled
        BICS    a4,a4,#3                // a4 = count rounded down to 4n
        SBCEQ   a1,a4,a4                // nothing left: a1 = C - 1 (a4 is 0)
        LDMEQFD sp!,{v6,pc}^            //   and leave
        STMFD   sp!,{v1-v5}             // stack now holds v1-v6,lr
        B       subx_loop_up_l2         // join the 4-word loop, C still live
LABEL(subx_loop_up_l0)
        LDR     v6,[a2],#4              // 2nd odd word
        LDR     lr,[a1],#4
        SBCS    lr,lr,v6
        STR     lr,[a3],#4
        TEQ     ip,#2                   // were there just two ?
        BEQ     subx_loop_up_l4         // flags are live, so branch
        LDR     v6,[a2],#4              // 3rd odd word
        LDR     lr,[a1],#4
        SBCS    lr,lr,v6
        STR     lr,[a3],#4
        B       subx_loop_up_l4
LABEL(subx_loop_up_l1)                  // count is a multiple of four
        BICS    a4,a4,#3                // tests count = 0, C untouched
        SBCEQ   a1,a4,a4                // count = 0: hand back the incoming
        MOVEQS  pc,lr                   // carry as 0 / -1
        STMFD   sp!,{v1-v6,lr}          // same stack layout as the l4 path
LABEL(subx_loop_up_l2)
        LDMIA   a2!,{v1,v2,v3,ip}       // four subtrahend words (sourceptr2)
        LDMIA   a1!,{v4,v5,v6,lr}       // four minuend words (sourceptr1)
        SBCS    v4,v4,v1                // subtract, rippling the borrow
        SBCS    v5,v5,v2
        SBCS    v6,v6,v3
        SBCS    lr,lr,ip
        STMIA   a3!,{v4,v5,v6,lr}       // four result words to destptr
        SUB     a4,a4,#4                // no S suffix: C must be preserved
        TEQ     a4,#0                   // finished ?
        BNE     subx_loop_up_l2         // more groups to go
        SBC     a1,a4,a4                // a1 = C - 1 (a4 is 0)
        LDMFD   sp!,{v1-v6,pc}^         // restore work regs and return

// extern uintD subfrom_loop_up (uintD* sourceptr, uintD* destptr, uintC count);
// In-place multi-word subtraction: the destination is also the minuend.
//       entry
//               a1 = sourceptr
//               a2 = destptr
//               a3 = count of words to be subtracted
//       exit
//               destptr[] = destptr[] - sourceptr[]
//               a1 = 0 when the last word produced no borrow (also when
//                    count is 0), -1 when it did
//               a2 - a4, ip destroyed
        EXPORT(subfrom_loop_up)         // word aligned subfrom loop up
        DECLARE_FUNCTION(subfrom_loop_up)
GLABEL(subfrom_loop_up)
        ANDS    ip,a3,#3                // ip = count MOD 4
        BEQ     subfrom_loop_up_l1      // zero: only whole groups of four
        STMFD   sp!,{lr}                // lr doubles as a scratch register
        LDR     a4,[a1],#4              // 1st odd word: plain SUBS, there is
        LDR     lr,[a2]                 // no borrow coming in
        SUBS    lr,lr,a4
        STR     lr,[a2],#4              // write back, then step destptr
        TEQ     ip,#1                   // exactly one odd word ? (C is kept)
        BNE     subfrom_loop_up_l0      // no, two or three of them
LABEL(subfrom_loop_up_l4)               // odd words done; fall-through entry
        BICS    a4,a3,#3                // a4 = count rounded down to 4n
        SBCEQ   a1,a4,a4                // nothing left: a1 = C - 1 (a4 is 0)
        LDMEQFD sp!,{pc}^               //   and leave
        STMFD   sp!,{v1-v5}             // stack now holds v1-v5,lr
        B       subfrom_loop_up_l2      // join the 4-word loop, C still live
LABEL(subfrom_loop_up_l0)
        LDR     a4,[a1],#4              // 2nd odd word, borrow chained in
        LDR     lr,[a2]
        SBCS    lr,lr,a4
        STR     lr,[a2],#4
        TEQ     ip,#2                   // were there just two ?
        BEQ     subfrom_loop_up_l4      // flags are live, so branch
        LDR     a4,[a1],#4              // 3rd odd word
        LDR     lr,[a2]
        SBCS    lr,lr,a4
        STR     lr,[a2],#4
        B       subfrom_loop_up_l4
LABEL(subfrom_loop_up_l1)               // count is a multiple of four
        BICS    a4,a3,#3                // (count MOD 4 was 0: tests count = 0)
        MOVEQ   a1,#0                   // count = 0: report no borrow
        MOVEQS  pc,lr
        CMP     a4,#0                   // a4 > 0, so this sets C (no borrow)
        STMFD   sp!,{v1-v5,lr}          // same stack layout as the l4 path
LABEL(subfrom_loop_up_l2)
        LDMIA   a1!,{a3,v1,v2,ip}       // four source words (a3 is free now)
        LDMIA   a2,{v3,v4,v5,lr}        // four dest words, no writeback yet
        SBCS    v3,v3,a3                // dest - source, rippling the borrow
        SBCS    v4,v4,v1
        SBCS    v5,v5,v2
        SBCS    lr,lr,ip
        STMIA   a2!,{v3,v4,v5,lr}       // store results and advance destptr
        SUB     a4,a4,#4                // no S suffix: C must be preserved
        TEQ     a4,#0                   // finished ?
        BNE     subfrom_loop_up_l2      // more groups to go
        SBC     a1,a4,a4                // a1 = C - 1 (a4 is 0)
        LDMFD   sp!,{v1-v5,pc}^         // restore work regs and return

// extern uintD dec_loop_up (uintD* ptr, uintC count);
// Decrement a multi-word number by one, walking upwards and stopping at the
// first word that absorbs the borrow.
//       entry
//               a1 = ptr
//               a2 = count of words to be DECed
//       exit
//               a1 = 0 if some word was non-zero before its decrement
//                    (the borrow stopped there), else -1
//               a2 - a4, ip destroyed
        EXPORT(dec_loop_up)             // word aligned dec loop up
        DECLARE_FUNCTION(dec_loop_up)
GLABEL(dec_loop_up)
        ANDS    a3,a2,#1                // is count odd ?
// dec_loop_up continues: the flags hold the result of (count AND 1)
        BEQ     dec_loop_up_l1          // even count: no single word first
        LDR     a4,[a1]                 // odd count: decrement one word so
        SUBS    a4,a4,#1                // the rest is a multiple of 2
        STR     a4,[a1],#4
        MOVCS   a1,#0                   // C set = no borrow = word was
        MOVCSS  pc,lr                   // non-zero: finished, return 0
LABEL(dec_loop_up_l1)
        BICS    a4,a2,#1                // a4 = count rounded down to 2n
        MVNEQ   a1,#0                   // nothing left and still borrowing:
        MOVEQS  pc,lr                   // return -1
        MOV     ip,a1                   // ip takes over as the pointer
        MOV     a1,#0                   // preload the "borrow stopped" result
        ANDS    a3,a4,#3                // already a multiple of 4 ?
        BEQ     dec_loop_up_l3
        LDMIA   ip,{a2,a3}              // one pair of words
        SUBS    a2,a2,#1                // the second is touched only while
        SUBCCS  a3,a3,#1                // the borrow is still running
        STMIA   ip!,{a2,a3}
        MOVCSS  pc,lr                   // borrow absorbed: return 0
        SUBS    a4,a4,#2                // pair accounted for
        MVNEQ   a1,#0                   // that was everything: return -1
        MOVEQS  pc,lr
LABEL(dec_loop_up_l3)                   // remaining count is 4n, n > 0
        STMFD   sp!,{v1,lr}             // two more scratch registers
LABEL(dec_loop_up_l2)
        LDMIA   ip,{a2,a3,v1,lr}        // four words, no writeback yet
        SUBS    a2,a2,#1                // each later word is decremented
        SUBCCS  a3,a3,#1                // only if the one before borrowed
        SUBCCS  v1,v1,#1
        SUBCCS  lr,lr,#1
        STMIA   ip!,{a2,a3,v1,lr}       // store all four and advance
        LDMCSFD sp!,{v1,pc}^            // borrow absorbed: return 0
        SUBS    a4,a4,#4
        BGT     dec_loop_up_l2          // groups remaining
        MVN     a1,#0                   // ran off the end still borrowing
        LDMFD   sp!,{v1,pc}^            // restore work regs and return -1

// extern void neg_loop_up (uintD* ptr, uintC count);
// (the prototype above says void, yet every return path below loads a1)
// Two's-complement negate a multi-word number in place, walking upwards:
// leading zero words stay zero, the first non-zero word is negated and all
// words after it are bitwise inverted.
//       entry
//               a1 = ptr
//               a2 = count of words. The long integer is to be NEGated
//       exit
//               ptr[] = -ptr[] for count words
//               a1 = 0 if all words were zero (or count = 0), else -1
//               a2 - a4, ip destroyed
        EXPORT(neg_loop_up)             // word aligned neg loop up
        DECLARE_FUNCTION(neg_loop_up)
GLABEL(neg_loop_up)
        CMPS    a2,#0                   // empty number ?
        MOVEQ   a1,#0                   // then there is nothing to negate
        MOVEQS  pc,lr
LABEL(neg_loop_up_l1)                   // phase 1: step over zero words
        LDR     a3,[a1],#4
        CMPS    a3,#0
        BNE     neg_loop_up_l2          // found the first non-zero word
        SUBS    a2,a2,#1
        BNE     neg_loop_up_l1          // keep scanning
        MOV     a1,#0                   // every word was zero: return 0
        MOVS    pc,lr
LABEL(neg_loop_up_l2)                   // phase 2: negate that one word
        RSB     a3,a3,#0
        STR     a3,[a1,#-4]             // a1 has already moved past it
        SUBS    a2,a2,#1
        MVNEQ   a1,#0                   // it was the last word: return -1
        MOVEQS  pc,lr
                                        // phase 3: invert everything above
        ANDS    a3,a2,#3                // a3 = remaining count MOD 4
        BEQ     neg_loop_up_l3          // zero: straight to the 4-word loop
        CMP     a3,#2                   // flags select 1, 2 or 3 odd words
        LDR     a3,[a1]                 // 1st odd word (LDR/MVN/STR leave
        MVN     a3,a3                   // the flags from CMP intact)
        STR     a3,[a1],#4
        BLT     neg_loop_up_l3          // only one: skip the rest
        LDRGE   a3,[a1]                 // 2nd odd word
        MVNGE   a3,a3
        STRGE   a3,[a1],#4
        LDRGT   a3,[a1]                 // 3rd odd word
        MVNGT   a3,a3
        STRGT   a3,[a1],#4
LABEL(neg_loop_up_l3)
        BICS    a4,a2,#3                // a4 = remaining count, 4n
        MVNEQ   a1,#0                   // nothing left: return -1
        MOVEQS  pc,lr
        STMFD   sp!,{lr}                // lr doubles as a scratch register
LABEL(neg_loop_up_l4)
        LDMIA   a1,{a2,a3,ip,lr}        // four words, NO writeback
        MVN     a2,a2                   // invert them
        MVN     a3,a3
        MVN     ip,ip
        MVN     lr,lr
        STMIA   a1!,{a2,a3,ip,lr}       // store and advance
        SUBS    a4,a4,#4
        BGT     neg_loop_up_l4          // groups remaining
        MVN     a1,#0                   // result is -1
        LDMFD   sp!,{pc}^               // return

// extern uintD shift1left_loop_up (uintD* ptr, uintC count);
// Shift a multi-word number left by one bit in place, walking upwards.
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted left
//       exit
//               a1 = carry out from last shift left (0 or 1)
//               a2 - a4, ip destroyed
        EXPORT(shift1left_loop_up)      // word aligned shift1left loop up
        DECLARE_FUNCTION(shift1left_loop_up)
GLABEL(shift1left_loop_up)
        CMN     a1,#0                   // a1 + 0 cannot carry: clears C
        ANDS    a3,a2,#1                // is count odd ?
// shift1left_loop_up continues: C is clear, flags hold (count AND 1)
        BEQ     shift1left_loop_up_l1   // even count: no single word first
        LDR     a4,[a1]                 // odd count: double one word so the
        ADDS    a4,a4,a4                // rest is a multiple of 2; C = bit
        STR     a4,[a1],#4              // shifted out
LABEL(shift1left_loop_up_l1)
        BICS    a4,a2,#1                // a4 = count rounded down to 2n
        ADCEQ   a1,a4,a4                // nothing left: a1 = C (a4 is 0)
        MOVEQS  pc,lr
        ANDS    a3,a4,#3                // already a multiple of 4 ?
        BEQ     shift1left_loop_up_l3
        LDMIA   a1,{a2,a3}              // one pair of words
        ADCS    a2,a2,a2                // x + x + C = shift left, pulling in
        ADCS    a3,a3,a3                // the previous word's top bit
        STMIA   a1!,{a2,a3}
        BICS    a4,a4,#2                // pair accounted for (C untouched)
        ADCEQ   a1,a4,a4                // that was everything: a1 = C
        MOVEQS  pc,lr
LABEL(shift1left_loop_up_l3)            // remaining count is 4n, n > 0
        STMFD   sp!,{lr}                // lr doubles as a scratch register
LABEL(shift1left_loop_up_l2)
        LDMIA   a1,{a2,a3,ip,lr}        // four words, no writeback yet
        ADCS    a2,a2,a2                // shift each, chaining through C
        ADCS    a3,a3,a3
        ADCS    ip,ip,ip
        ADCS    lr,lr,lr
        STMIA   a1!,{a2,a3,ip,lr}       // store and advance
        SUB     a4,a4,#4                // no S suffix: C must be preserved
        TEQ     a4,#0                   // finished ?
        BNE     shift1left_loop_up_l2   // groups remaining
        ADC     a1,a4,a4                // a1 = C (a4 is 0)
        LDMFD   sp!,{pc}^               // return the carry

// extern uintD shiftleft_loop_up (uintD* ptr, uintC count, uintC i, uintD carry);
// Shift a multi-word number left by i bits in place, walking upwards.  The
// bits that leave the top of one word are ORed into the bottom of the next.
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted left
//               a3 = size of left shift
//               a4 = value to ORR in for first shift
//       exit
//               a1 = shift out from last shift left
//               a2 - a4, ip destroyed
        EXPORT(shiftleft_loop_up)       // word aligned shiftleft loop up
        DECLARE_FUNCTION(shiftleft_loop_up)
GLABEL(shiftleft_loop_up)
        STMFD   sp!,{v6,lr}             // v6 and lr serve as scratch
        RSB     v6,a3,#32               // v6 = 32 - i, the matching right shift
        ANDS    ip,a2,#3                // ip = count MOD 4
        BEQ     shiftleft_loop_up_l1    // zero: only whole groups of four
        LDR     lr,[a1]                 // 1st odd word
        ORR     a4,a4,lr,ASL a3         // shifted word plus bits carried in
        STR     a4,[a1],#4
        MOV     a4,lr,LSR v6            // bits pushed out become next carry
        CMP     ip,#2                   // flags select 1, 2 or 3 odd words
        BLT     shiftleft_loop_up_l1    // only one: skip the rest
        LDRGE   lr,[a1]                 // 2nd odd word
        ORRGE   a4,a4,lr,ASL a3
        STRGE   a4,[a1],#4
        MOVGE   a4,lr,LSR v6
        LDRGT   lr,[a1]                 // 3rd odd word
        ORRGT   a4,a4,lr,ASL a3
        STRGT   a4,[a1],#4
        MOVGT   a4,lr,LSR v6
LABEL(shiftleft_loop_up_l1)
        BICS    ip,a2,#3                // ip = count rounded down to 4n
        MOVEQ   a1,a4                   // nothing left: result is the carry
        LDMEQFD sp!,{v6,pc}^
        STMFD   sp!,{v1-v3}             // stack now holds v1-v3,v6,lr
LABEL(shiftleft_loop_up_l2)
        LDMIA   a1,{v1,v2,v3,lr}        // four words, no writeback yet
        ORR     a2,a4,v1,ASL a3         // results land one register lower
        MOV     a4,v1,LSR v6            // than their inputs (a2,v1,v2,v3)
        ORR     v1,a4,v2,ASL a3         // so each input is still available
        MOV     a4,v2,LSR v6            // for computing the next carry,
        ORR     v2,a4,v3,ASL a3         // which is kept in a4
        MOV     a4,v3,LSR v6
        ORR     v3,a4,lr,ASL a3
        MOV     a4,lr,LSR v6
        STMIA   a1!,{a2,v1,v2,v3}       // store four results and advance
        SUBS    ip,ip,#4
        BGT     shiftleft_loop_up_l2    // groups remaining
        MOV     a1,a4                   // result = bits shifted out last
        LDMFD   sp!,{v1-v3,v6,pc}^      // restore work regs and return

#endif

// extern uintD shiftleftcopy_loop_up (uintD* sourceptr, uintD* destptr, uintC count, uintC i);
// Like shiftleft_loop_up, but reads from one buffer and writes to another,
// and always starts with nothing carried in.
//       entry
//               a1 = sourceptr
//               a2 = destptr
//               a3 = count of words to be shifted left
//               a4 = size of left shift
//       exit
//               a1 = shift out from last shift left
//               a2 - a4, ip destroyed
        EXPORT(shiftleftcopy_loop_up)   // word aligned shiftleftcopy loop up
        DECLARE_FUNCTION(shiftleftcopy_loop_up)
GLABEL(shiftleftcopy_loop_up)
        STMFD   sp!,{v5,v6,lr}
        MOV     v5,#0                   // v5 = carry between words, none yet
        RSB     v6,a4,#32               // v6 = 32 - i, the matching right shift
        ANDS    ip,a3,#3                // ip = count MOD 4
        BEQ     shiftleftcopy_loop_up_l1 // zero: only whole groups of four
        LDR     lr,[a1],#4              // 1st odd word
        ORR     v5,v5,lr,ASL a4         // shifted word plus bits carried in
        STR     v5,[a2],#4
        MOV     v5,lr,LSR v6            // bits pushed out become next carry
        CMP     ip,#2                   // flags select 1, 2 or 3 odd words
        BLT     shiftleftcopy_loop_up_l1 // only one: skip the rest
// shiftleftcopy_loop_up continues: flags come from CMP ip,#2 (GE = 2 or 3
// odd words, GT = 3)
        LDRGE   lr,[a1],#4              // 2nd odd word
        ORRGE   v5,v5,lr,ASL a4
        STRGE   v5,[a2],#4
        MOVGE   v5,lr,LSR v6
        LDRGT   lr,[a1],#4              // 3rd odd word
        ORRGT   v5,v5,lr,ASL a4
        STRGT   v5,[a2],#4
        MOVGT   v5,lr,LSR v6
LABEL(shiftleftcopy_loop_up_l1)
        BICS    ip,a3,#3                // ip = count rounded down to 4n
        MOVEQ   a1,v5                   // nothing left: result is the carry
        LDMEQFD sp!,{v5,v6,pc}^
        STMFD   sp!,{v1-v3}             // stack now holds v1-v3,v5,v6,lr
LABEL(shiftleftcopy_loop_up_l2)
        LDMIA   a1!,{v1,v2,v3,lr}       // four source words
        ORR     a3,v5,v1,ASL a4         // results land one register lower
        MOV     v5,v1,LSR v6            // than their inputs (a3,v1,v2,v3)
        ORR     v1,v5,v2,ASL a4         // so each input is still available
        MOV     v5,v2,LSR v6            // for computing the next carry,
        ORR     v2,v5,v3,ASL a4         // which is kept in v5
        MOV     v5,v3,LSR v6
        ORR     v3,v5,lr,ASL a4
        MOV     v5,lr,LSR v6
        STMIA   a2!,{a3,v1,v2,v3}       // four results to destptr
        SUBS    ip,ip,#4
        BGT     shiftleftcopy_loop_up_l2 // groups remaining
        MOV     a1,v5                   // result = bits shifted out last
        LDMFD   sp!,{v1-v3,v5,v6,pc}^   // restore work regs and return

#if !CL_DS_BIG_ENDIAN_P

// extern uintD shift1right_loop_down (uintD* ptr, uintC count, uintD carry);
// Shift a multi-word number right by one bit in place, walking downwards
// from ptr (the word at ptr[-1] is processed first).
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted right
//               a3 = carry (only bit 0 is examined; it is shifted into the
//                    top bit of the first word)
//       exit
//               a1 = carry out from last shift right, delivered in bit 31
//                    (0 or 0x80000000) on every return path
//               a2 - a4, ip destroyed
        EXPORT(shift1right_loop_down)   // word aligned shift1right loop down
        DECLARE_FUNCTION(shift1right_loop_down)
GLABEL(shift1right_loop_down)
        MOVS    a3,a3,LSR #1            // C = bit 0 of the carry argument
        ANDS    a3,a2,#1                // is count odd ? (C survives)
        BEQ     shift1right_loop_down_l1 // even count: no single word first
        LDR     a4,[a1,#-4]!            // odd count: shift one word so the
        MOVS    a4,a4,RRX               // rest is a multiple of 2; RRX puts
        STR     a4,[a1]                 // C in at the top, bit 0 out into C
LABEL(shift1right_loop_down_l1)
        BICS    a4,a2,#1                // a4 = count rounded down to 2n
        MOVEQ   a1,a4,RRX               // nothing left: a1 = C in bit 31
        MOVEQS  pc,lr                   // (a4 is 0)
        ANDS    a3,a4,#3                // already a multiple of 4 ?
        BEQ     shift1right_loop_down_l3
        LDMDB   a1,{a2,a3}              // one pair; a3 is the higher address
        MOVS    a3,a3,RRX               // so it is shifted first
        MOVS    a2,a2,RRX
        STMDB   a1!,{a2,a3}             // store and step the pointer down
        BICS    a4,a4,#2                // pair accounted for (C untouched)
        MOVEQ   a1,a4,RRX               // that was everything: C goes into
                                        // bit 31, matching the other two
                                        // exits (a4 is 0)
        MOVEQS  pc,lr
LABEL(shift1right_loop_down_l3)         // remaining count is 4n, n > 0
        STMFD   sp!,{lr}                // lr doubles as a scratch register
LABEL(shift1right_loop_down_l2)
        LDMDB   a1,{a2,a3,ip,lr}        // four words, no writeback yet
        MOVS    lr,lr,RRX               // highest address first, chaining
        MOVS    ip,ip,RRX               // through C
        MOVS    a3,a3,RRX
        MOVS    a2,a2,RRX
        STMDB   a1!,{a2,a3,ip,lr}       // store and step the pointer down
        SUB     a4,a4,#4                // no S suffix: C must be preserved
        TEQ     a4,#0                   // finished ?
        BNE     shift1right_loop_down_l2 // groups remaining
        MOV     a1,a4,RRX               // a1 = C in bit 31 (a4 is 0)
        LDMFD   sp!,{pc}^               // return the carry

// extern uintD shiftright_loop_down (uintD* ptr, uintC count, uintC i);
// Shift a multi-word number right by i bits in place, walking downwards
// from ptr.  Zero bits enter at the top of the first word.
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted right
//               a3 = size of right shift
//       exit
//               a1 = shift out from last shift right (left-justified)
//               a2 - a4, ip destroyed
        EXPORT(shiftright_loop_down)    // word aligned shiftright loop down
        DECLARE_FUNCTION(shiftright_loop_down)
GLABEL(shiftright_loop_down)
        STMFD   sp!,{v6,lr}             // v6 and lr serve as scratch
        MOV     a4,#0                   // a4 = carry between words, none yet
        RSB     v6,a3,#32               // v6 = 32 - i, the matching left shift
LABEL(shiftright_loop_down_l0)          // shiftrightsigned_loop_down joins
                                        // here with a4, v6 and stack set up
        ANDS    ip,a2,#3                // ip = count MOD 4
        BEQ     shiftright_loop_down_l1 // zero: only whole groups of four
        LDR     lr,[a1,#-4]!            // 1st odd word (pre-decrement)
        ORR     a4,a4,lr,LSR a3         // shifted word plus bits carried in
        STR     a4,[a1]
        MOV     a4,lr,ASL v6            // bits pushed out become next carry
        CMP     ip,#2                   // flags select 1, 2 or 3 odd words
        BLT     shiftright_loop_down_l1 // only one: skip the rest
        LDRGE   lr,[a1,#-4]!            // 2nd odd word
        ORRGE   a4,a4,lr,LSR a3
        STRGE   a4,[a1]
        MOVGE   a4,lr,ASL v6
        LDRGT   lr,[a1,#-4]!            // 3rd odd word (continues below)
// shiftright_loop_down continues: finishing the 3rd odd word (GT case)
        ORRGT   a4,a4,lr,LSR a3
        STRGT   a4,[a1]
        MOVGT   a4,lr,ASL v6
LABEL(shiftright_loop_down_l1)
        BICS    ip,a2,#3                // ip = count rounded down to 4n
        MOVEQ   a1,a4                   // nothing left: result is the carry
        LDMEQFD sp!,{v6,pc}^
        STMFD   sp!,{v1-v3}             // stack now holds v1-v3,v6,lr
LABEL(shiftright_loop_down_l2)
        LDMDB   a1,{a2,v1,v2,v3}        // four words below a1, no writeback
        ORR     lr,a4,v3,LSR a3         // highest address first; results
        MOV     a4,v3,ASL v6            // land one register higher than
        ORR     v3,a4,v2,LSR a3         // their inputs (v1,v2,v3,lr) so each
        MOV     a4,v2,ASL v6            // input is still available for
        ORR     v2,a4,v1,LSR a3         // computing the next carry, which
        MOV     a4,v1,ASL v6            // is kept in a4
        ORR     v1,a4,a2,LSR a3
        MOV     a4,a2,ASL v6
        STMDB   a1!,{v1,v2,v3,lr}       // store and step the pointer down
        SUBS    ip,ip,#4
        BGT     shiftright_loop_down_l2 // groups remaining
        MOV     a1,a4                   // result = bits shifted out last
        LDMFD   sp!,{v1-v3,v6,pc}^      // restore work regs and return

// extern uintD shiftrightsigned_loop_down (uintD* ptr, uintC count, uintC i);
// Arithmetic variant of shiftright_loop_down: the bits entering the top of
// the first word are copies of that word's sign bit.  Only the initial carry
// differs, so this stub prepares it and jumps into shiftright_loop_down.
// Note that the word at ptr[-1] is read before count is looked at.
//       entry
//               a1 = ptr
//               a2 = count of words to be shifted right signed
//               a3 = size of right shift
//       exit
//               a1 = shift out from last shift right
//               a2 - a4, ip destroyed
        EXPORT(shiftrightsigned_loop_down)// word aligned shiftrightsigned loop down
        DECLARE_FUNCTION(shiftrightsigned_loop_down)
GLABEL(shiftrightsigned_loop_down)
        STMFD   sp!,{v6,lr}             // same frame as shiftright_loop_down
        RSB     v6,a3,#32               // v6 = 32 - i, the matching left shift
        LDR     lr,[a1,#-4]             // peek at the first word to be shifted
        MOV     a4,lr,ASR #31           // a4 = 0 or all ones, per its sign
        AND     a4,a4,a4,LSL v6         // keep only the top i bits as carry
        B       shiftright_loop_down_l0 // the common code does the rest

// extern uintD shiftrightcopy_loop_down (uintD* sourceptr, uintD* destptr, uintC count, uintC i, uintD carry);
// Like shiftright_loop_down, but reads from one buffer and writes to
// another, and takes the bits to shift into the first word as an argument.
//       entry
//               a1 = sourceptr
//               a2 = destptr
//               a3 = count of words to be shifted right
//               a4 = size of right shift
//               [sp] = carry for first shift
//       exit
//               a1 = shift out from last shift right
//               a2 - a4, ip destroyed
        EXPORT(shiftrightcopy_loop_down)// word aligned shiftrightcopy loop down
        DECLARE_FUNCTION(shiftrightcopy_loop_down)
GLABEL(shiftrightcopy_loop_down)
        STMFD   sp!,{v5,v6,lr}
        LDR     v5,[sp,#12]             // fifth argument, now 12 bytes up
                                        // because of the three pushes
        RSB     v6,a4,#32               // v6 = 32 - i, the matching left shift
        MOV     v5,v5,ASL v6            // left-justify the incoming carry
LABEL(shiftrightcopy_loop_down_l0)
        ANDS    ip,a3,#3                // ip = count MOD 4
        BEQ     shiftrightcopy_loop_down_l1 // zero: only whole groups of four
        LDR     lr,[a1,#-4]!            // 1st odd word (pre-decrement)
        ORR     v5,v5,lr,LSR a4         // shifted word plus bits carried in
        STR     v5,[a2,#-4]!
        MOV     v5,lr,ASL v6            // bits pushed out become next carry
        CMP     ip,#2                   // flags select 1, 2 or 3 odd words
        BLT     shiftrightcopy_loop_down_l1 // only one: skip the rest
        LDRGE   lr,[a1,#-4]!            // 2nd odd word
        ORRGE   v5,v5,lr,LSR a4
        STRGE   v5,[a2,#-4]!
        MOVGE   v5,lr,ASL v6
        LDRGT   lr,[a1,#-4]!            // 3rd odd word (continues below)
        ORRGT   v5,v5,lr,LSR a4
        STRGT   v5,[a2,#-4]!
// shiftrightcopy_loop_down continues: finishing the 3rd odd word (GT case)
        MOVGT   v5,lr,ASL v6
LABEL(shiftrightcopy_loop_down_l1)
        BICS    ip,a3,#3                // ip = count rounded down to 4n
        MOVEQ   a1,v5                   // nothing left: result is the carry
        LDMEQFD sp!,{v5,v6,pc}^
        STMFD   sp!,{v1-v3}             // stack now holds v1-v3,v5,v6,lr
LABEL(shiftrightcopy_loop_down_l2)
        LDMDB   a1!,{a3,v1,v2,v3}       // four source words, stepping down
        ORR     lr,v5,v3,LSR a4         // highest address first; results
        MOV     v5,v3,ASL v6            // land one register higher than
        ORR     v3,v5,v2,LSR a4         // their inputs (v1,v2,v3,lr) so each
        MOV     v5,v2,ASL v6            // input is still available for
        ORR     v2,v5,v1,LSR a4         // computing the next carry, which
        MOV     v5,v1,ASL v6            // is kept in v5
        ORR     v1,v5,a3,LSR a4
        MOV     v5,a3,ASL v6
        STMDB   a2!,{v1,v2,v3,lr}       // four results below destptr
        SUBS    ip,ip,#4
        BGT     shiftrightcopy_loop_down_l2 // groups remaining
        MOV     a1,v5                   // result = bits shifted out last
        LDMFD   sp!,{v1-v3,v5,v6,pc}^   // restore work regs and return

#ifndef HAVE_umull
// mulu32_64_vregs
// File-local helper with its own register convention: forms the full 64-bit
// product of two 32-bit values from four 16x16-bit partial products.
//       entry
//               a1 = x
//               ip = y
//       exit
//               v1 = low32(x*y)
//               ip = high32(x*y)
//               v2,v3,v4 destroyed
LABEL(mulu32_64_vregs)
        MOV     v1,a1,LSR #16           // v1 = xh, upper 16 bits of x
        MOV     v2,ip,LSR #16           // v2 = yh, upper 16 bits of y
        BIC     v3,a1,v1,LSL #16        // v3 = xl, lower 16 bits of x
        BIC     ip,ip,v2,LSL #16        // ip = yl, lower 16 bits of y
        MUL     v4,v3,ip                // v4 = xl*yl
        MUL     ip,v1,ip                // ip = xh*yl  \ the two cross
        MUL     v3,v2,v3                // v3 = yh*xl  / products
        MUL     v2,v1,v2                // v2 = xh*yh
        ADDS    ip,ip,v3                // sum the cross products; MLA is no
                                        // use here because the carry out of
                                        // this sum is needed
        ADDCS   v2,v2,#0x10000          // that carry is worth 2^16 in the
                                        // high word
        ADDS    v1,v4,ip,LSL #16        // low word = xl*yl + (cross << 16)
        ADC     ip,v2,ip,LSR #16        // high word = xh*yh + (cross >> 16) + C
        MOVS    pc,lr
#endif

//       extern uintD mulusmall_loop_up (uintD digit, uintD* ptr, uintC len, uintD newdigit);
//       entry
//               a1 = digit
//               a2 = ptr
//               a3 = count of words to be multiplied up
//               a4 = new digit = carry
//       exit
//               a1 = final carry of multiply
//               a2 - a4, ip destroyed
        EXPORT(mulusmall_loop_up)
        DECLARE_FUNCTION(mulusmall_loop_up)
GLABEL(mulusmall_loop_up)
        CMP     a3,#0
        MOVEQ   a1,a4
        MOVEQS  pc,lr
#ifdef HAVE_umull
        STMFD   sp!,{v1,lr}
LABEL(mulusmall_loop_up_l1)
        LDR     ip,[a2]
        UMULL   v1,ip,a1,ip
// muluD(digit,*--ptr,hi=,lo=) ADDS v1,v1,a4 // lo += carry ADC a4,ip,#0 // if (lo