Using integer multiplication to avoid integer division
David G. Hough
dgh
Wed Jun 20 17:03:48 PDT 1990
In the context of base conversion, I've mentioned that multiplying
a short unsigned by 10000 to produce a long unsigned, and producing
a quotient and remainder from a short unsigned and 10000, are core
operations in correctly-rounded base conversion whenever you can't
get by with floating-point arithmetic.
This is not the best possible situation with current SPARC implementations.
The compiler can handle *10000 fairly efficiently with shifts and adds,
but /10000 and %10000 don't fare so well. Ironically the underlying libc
function .udiv computes both at once, but the compiler doesn't recognize
the opportunity to avoid calling .udiv twice. So K-C Ng coded that in
assembly language for the base conversion of SPARCompilers 1.0 (C 1.1).
But Mike Powell pointed out in another context that integer division
by a fixed constant can often be replaced by an integer multiplication
and a shift. When I understood what he had in mind I decided to apply
it to see what kind of performance improvement was possible. I was
running on validgh, a 4/110 with SunOS 4.0, with a version of the C 1.1
compiler, compiling -O4 -Bstatic. Results:
obvious code in C 6.2 secs
assembler call to .udiv 3.7 secs
Powell improvement in C 2.2 secs
Here's the obvious code:
unsigned short
quorem10000(x, pr)
unsigned long x;
unsigned short *pr;
/* quorem gets x/10000 ; *pr gets x%10000 */
{
*pr = (short unsigned) (x % 10000);
return (short unsigned) (x / 10000);
}
for which the compiler generates two function calls
save %sp,-96,%sp
sethi %hi(0x2710),%l0 ! [internal]
add %l0,%lo(0x2710),%o1
call .urem,2
mov %i0,%o0
sth %o0,[%i1]
mov %i0,%o0
call .udiv,2
add %l0,%lo(0x2710),%o1
sll %o0,16,%o0
srl %o0,16,%i0
ret
restore
Here's K-C Ng's assembler code from SC 1.0 which only makes one call:
save %sp,-96,%sp
set 10000,%o1
call .udiv
mov %i0,%o0
tst %o3 ! adjustment on remainder
bl,a 1f
add %o3,%o1,%o3
1:
sth %o3,[%i1]
sll %o0,16,%o0
srl %o0,16,%i0
ret
restore
And here's the code which avoids division entirely:
unsigned short
quorem10000(x, pr)
unsigned long x;
unsigned short *pr;
/* quorem gets x/10000 ; *pr gets x%10000 */
{
long unsigned t;
t = ((x >> 4) * 839) >> 19;
switch (t) {
case 0:
*pr = x;
break;
case 1:
*pr = x - 10000;
break;
case 2:
*pr = x - 20000;
break;
case 3:
*pr = x - 30000;
break;
case 4:
*pr = x - 40000;
break;
case 5:
*pr = x - 50000;
break;
case 6:
*pr = x - 60000;
break;
}
return t;
}
which compiles to
srl %o0,4,%o3
sll %o3,3,%o4
sub %o4,%o3,%o3
sll %o4,3,%o4
add %o3,%o4,%o3
sll %o4,2,%o4
add %o3,%o4,%o3
sll %o4,1,%o4
add %o3,%o4,%o3
srl %o3,19,%o2
mov %o2,%o5
cmp %o5,6
bgu L77023
sll %o5,2,%o3
sethi %hi(L2000000),%o4
or %o4,%lo(L2000000),%o4 ! [internal]
ld [%o3+%o4],%o3
jmp %o3
nop
.align 4
L2000000:
.word L77014
.word L77015
.word L77016
.word L77017
.word L77018
.word L77019
.word L77020
L77014:
b L77023
sth %o0,[%o1]
L77015:
sethi %hi(0x2710),%o3
b LY5
or %o3,%lo(0x2710),%o3 ! [internal]
L77016:
sethi %hi(0x4e20),%o3
b LY5
or %o3,%lo(0x4e20),%o3 ! [internal]
L77017:
sethi %hi(0x7530),%o3
b LY5
or %o3,%lo(0x7530),%o3 ! [internal]
L77018:
sethi %hi(0x9c40),%o3
b LY5
or %o3,%lo(0x9c40),%o3 ! [internal]
L77019:
sethi %hi(0xc350),%o3
b LY5
or %o3,%lo(0xc350),%o3 ! [internal]
L77020:
sethi %hi(0xea60),%o3
or %o3,%lo(0xea60),%o3 ! [internal]
LY5: ! [internal]
sub %o0,%o3,%o0
sth %o0,[%o1]
L77023:
sll %o5,16,%o5
retl
srl %o5,16,%o0
which could probably be further tuned in hand-coded assembler.
More information about the Numeric-interest
mailing list