/*
 *  linux/arch/arm/lib/div64.S
 *
 *  Optimized computation of 64-bit dividend / 32-bit divisor
 *
 *  Author:	Nicolas Pitre
 *  Created:	Oct 5, 2003
 *  Copyright:	Monta Vista Software, Inc.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 */

#define xl r0
#define xh r1
#define yl r2
#define yh r3

#define UNWIND(x...)
#define ARM(x...)	x
#define THUMB(x...)

#define ENTRY(__f)			\
	.align		3		;\
	.globl		__f		;\
	.type		__f,%function	;\
__f:
#define ENDPROC(__f)			;\
	.size		__f, . - __f


/*
UINT64
DivU64x32 (
    IN UINT64   Dividend,
    IN UINTN    Divisor,
    OUT UINTN   *Remainder OPTIONAL
    )
// divide 64bit by 32bit and get a 64bit result
// N.B. only works for 31bit divisors!!
{
}
*/

ENTRY(__DivU64x32)
	stmfd	sp!, {r4-r6, lr}

	mov	r5, r4			@ preserve Remainder
	mov	r4, r2			@ divisor in r4
	bl	__do_div64

	teq	r5, #0
	strne	xh, [r5]
	mov	r0, yl
	mov	r1, yh
	ldmfd	sp!, {r4-r6, pc}
ENDPROC(__DivU64x32)

/*
 * __do_div64: perform a division with 64-bit dividend and 32-bit divisor.
 *
 * Note: Calling convention is totally non standard for optimal code.
 *       This is meant to be used by do_div() from include/asm/div64.h only.
 *
 * Input parameters:
 * 	xh-xl	= dividend (clobbered)
 * 	r4	= divisor (preserved)
 *
 * Output values:
 * 	yh-yl	= result
 * 	xh	= remainder
 *
 * Clobbered regs: xl, ip
 */

ENTRY(__do_div64)
UNWIND(.fnstart)

	@ Test for easy paths first.
	subs	ip, r4, #1
	bls	9f			@ divisor is 0 or 1
	tst	ip, r4
	beq	8f			@ divisor is power of 2

	@ See if we need to handle upper 32-bit result.
	cmp	xh, r4
	mov	yh, #0
	blo	3f

	@ Align divisor with upper part of dividend.
	@ The aligned divisor is stored in yl preserving the original.
	@ The bit position is stored in ip.

	clz	yl, r4
	clz	ip, xh
	sub	yl, yl, ip
	mov	ip, #1
	mov	ip, ip, lsl yl
	mov	yl, r4, lsl yl

	@ The division loop for needed upper bit positions.
 	@ Break out early if dividend reaches 0.
2:	cmp	xh, yl
	orrcs	yh, yh, ip
	subcss	xh, xh, yl
	movnes	ip, ip, lsr #1
	mov	yl, yl, lsr #1
	bne	2b

	@ See if we need to handle lower 32-bit result.
3:	cmp	xh, #0
	mov	yl, #0
	cmpeq	xl, r4
	movlo	xh, xl
	movlo	pc, lr

	@ The division loop for lower bit positions.
	@ Here we shift remainer bits leftwards rather than moving the
	@ divisor for comparisons, considering the carry-out bit as well.
	mov	ip, #0x80000000
4:	movs	xl, xl, lsl #1
	adcs	xh, xh, xh
	beq	6f
	cmpcc	xh, r4
5:	orrcs	yl, yl, ip
	subcs	xh, xh, r4
	movs	ip, ip, lsr #1
	bne	4b
	mov	pc, lr

	@ The top part of remainder became zero.  If carry is set
	@ (the 33th bit) this is a false positive so resume the loop.
	@ Otherwise, if lower part is also null then we are done.
6:	bcs	5b
	cmp	xl, #0
	moveq	pc, lr

	@ We still have remainer bits in the low part.  Bring them up.

	clz	xh, xl			@ we know xh is zero here so...
	add	xh, xh, #1
	mov	xl, xl, lsl xh
	mov	ip, ip, lsr xh

	@ Current remainder is now 1.  It is worthless to compare with
	@ divisor at this point since divisor can not be smaller than 3 here.
	@ If possible, branch for another shift in the division loop.
	@ If no bit position left then we are done.
	movs	ip, ip, lsr #1
	mov	xh, #1
	bne	4b
	mov	pc, lr

8:	@ Division by a power of 2: determine what that divisor order is
	@ then simply shift values around

	clz	ip, r4
	rsb	ip, ip, #31

	mov	yh, xh, lsr ip
	mov	yl, xl, lsr ip
	rsb	ip, ip, #32
 ARM(	orr	yl, yl, xh, lsl ip	)
 THUMB(	lsl	xh, xh, ip		)
 THUMB(	orr	yl, yl, xh		)
	mov	xh, xl, lsl ip
	mov	xh, xh, lsr ip
	mov	pc, lr

	@ eq -> division by 1: obvious enough...
9:	moveq	yl, xl
	moveq	yh, xh
	moveq	xh, #0
	moveq	pc, lr
UNWIND(.fnend)

UNWIND(.fnstart)
UNWIND(.pad #4)
UNWIND(.save {lr})
Ldiv0_64:
	@ Division by 0:
	str	lr, [sp, #-8]!
	bl	__div0

	@ as wrong as it could be...
	mov	yl, #0
	mov	yh, #0
	mov	xh, #0
	ldr	pc, [sp], #8

UNWIND(.fnend)
ENDPROC(__do_div64)