Using integer multiplication to avoid integer division

David G. Hough dgh
Wed Jun 20 17:03:48 PDT 1990


In the context of base conversion, I've mentioned that multiplying
a short unsigned by 10000 to produce a long unsigned, and producing
a quotient and remainder from a short unsigned and 10000, are core
operations in correctly-rounded base conversion whenever you can't
get by with floating-point arithmetic.

This is not the best possible situation with current SPARC implementations.
The compiler can handle *10000 fairly efficiently with shifts and adds,
but /10000 and %10000 don't fare so well.  Ironically the underlying libc
function .udiv computes both at once, but the compiler doesn't recognize
the opportunity to avoid calling .udiv twice.  So K-C Ng coded that in
assembly language for the base conversion of SPARCompilers 1.0 (C 1.1).

But Mike Powell pointed out in another context that integer division
by a fixed constant can often be replaced by an integer multiplication
and a shift.   When I understood what he had in mind I decided to apply
it to see what kind of performance improvement was possible.  I was
running on validgh, a 4/110 with SunOS 4.0, with a version of the C 1.1
compiler, compiling -O4 -Bstatic.  Results:

	obvious code in C		6.2 secs
	assembler call to .udiv		3.7 secs
	Powell improvement in C		2.2 secs

Here's the obvious code:

unsigned short
quorem10000(x, pr)
	unsigned long   x;
	unsigned short *pr;

/* quorem gets x/10000 ; *pr gets x%10000              */

{
	*pr = (short unsigned) (x % 10000);
	return (short unsigned) (x / 10000);
}

for which the compiler generates two function calls

        save    %sp,-96,%sp
        sethi   %hi(0x2710),%l0         ! [internal]
        add     %l0,%lo(0x2710),%o1
        call    .urem,2
        mov     %i0,%o0
        sth     %o0,[%i1]
        mov     %i0,%o0
        call    .udiv,2
        add     %l0,%lo(0x2710),%o1
        sll     %o0,16,%o0
        srl     %o0,16,%i0
        ret
        restore

Here's K-C Ng's assembler code from SC 1.0 which only makes one call:

        save    %sp,-96,%sp
        set     10000,%o1
        call    .udiv
        mov     %i0,%o0
        tst     %o3                     ! adjustment on remainder
        bl,a    1f
        add     %o3,%o1,%o3
1:
        sth     %o3,[%i1]
        sll     %o0,16,%o0
        srl     %o0,16,%i0
        ret
        restore

And here's the code which avoids division entirely:


unsigned short
quorem10000(x, pr)
	unsigned long   x;
	unsigned short *pr;

/* quorem gets x/10000 ; *pr gets x%10000              */

{
	long unsigned   t;

	t = ((x >> 4) * 839) >> 19;
	switch (t) {
	case 0:
		*pr = x;
		break;
	case 1:
		*pr = x - 10000;
		break;
	case 2:
		*pr = x - 20000;
		break;
	case 3:
		*pr = x - 30000;
		break;
	case 4:
		*pr = x - 40000;
		break;
	case 5:
		*pr = x - 50000;
		break;
	case 6:
		*pr = x - 60000;
		break;
	}
	return t;
}

which compiles to 

	srl	%o0,4,%o3
	sll	%o3,3,%o4
	sub	%o4,%o3,%o3
	sll	%o4,3,%o4
	add	%o3,%o4,%o3
	sll	%o4,2,%o4
	add	%o3,%o4,%o3
	sll	%o4,1,%o4
	add	%o3,%o4,%o3
	srl	%o3,19,%o2
	mov	%o2,%o5
	cmp	%o5,6
	bgu	L77023
	sll	%o5,2,%o3
	sethi	%hi(L2000000),%o4
	or	%o4,%lo(L2000000),%o4	! [internal]
	ld	[%o3+%o4],%o3
	jmp	%o3
	nop
	.align	4
L2000000:
	.word	L77014
	.word	L77015
	.word	L77016
	.word	L77017
	.word	L77018
	.word	L77019
	.word	L77020
L77014:
	b	L77023
	sth	%o0,[%o1]
L77015:
	sethi	%hi(0x2710),%o3
	b	LY5
	or	%o3,%lo(0x2710),%o3	! [internal]
L77016:
	sethi	%hi(0x4e20),%o3
	b	LY5
	or	%o3,%lo(0x4e20),%o3	! [internal]
L77017:
	sethi	%hi(0x7530),%o3
	b	LY5
	or	%o3,%lo(0x7530),%o3	! [internal]
L77018:
	sethi	%hi(0x9c40),%o3
	b	LY5
	or	%o3,%lo(0x9c40),%o3	! [internal]
L77019:
	sethi	%hi(0xc350),%o3
	b	LY5
	or	%o3,%lo(0xc350),%o3	! [internal]
L77020:
	sethi	%hi(0xea60),%o3
	or	%o3,%lo(0xea60),%o3	! [internal]
LY5:					! [internal]
	sub	%o0,%o3,%o0
	sth	%o0,[%o1]
L77023:
	sll	%o5,16,%o5
	retl
	srl	%o5,16,%o0

which could probably be further tuned in hand-coded assembler.



More information about the Numeric-interest mailing list