head	1.2;
access;
symbols
	perseant-exfatfs-base-20250801:1.2
	perseant-exfatfs-base-20240630:1.2
	perseant-exfatfs:1.2.0.8
	perseant-exfatfs-base:1.2
	netbsd-8-3-RELEASE:1.1.1.2
	cjep_sun2x:1.2.0.6
	cjep_sun2x-base:1.2
	cjep_staticlib_x-base1:1.2
	cjep_staticlib_x:1.2.0.4
	cjep_staticlib_x-base:1.2
	phil-wifi-20200421:1.2
	phil-wifi-20200411:1.2
	phil-wifi-20200406:1.2
	netbsd-8-2-RELEASE:1.1.1.2
	netbsd-8-1-RELEASE:1.1.1.2
	netbsd-8-1-RC1:1.1.1.2
	pgoyette-compat-merge-20190127:1.2
	pgoyette-compat-20190127:1.2
	pgoyette-compat-20190118:1.2
	pgoyette-compat-1226:1.2
	pgoyette-compat-1126:1.2
	pgoyette-compat-1020:1.2
	pgoyette-compat-0930:1.2
	pgoyette-compat-0906:1.2
	netbsd-7-2-RELEASE:1.1.1.2
	pgoyette-compat-0728:1.2
	netbsd-8-0-RELEASE:1.1.1.2
	pgoyette-compat-0625:1.2
	netbsd-8-0-RC2:1.1.1.2
	pgoyette-compat-0521:1.2
	pgoyette-compat-0502:1.2
	pgoyette-compat-0422:1.2
	netbsd-8-0-RC1:1.1.1.2
	pgoyette-compat-0415:1.2
	pgoyette-compat-0407:1.2
	pgoyette-compat-0330:1.2
	pgoyette-compat-0322:1.2
	pgoyette-compat-0315:1.2
	netbsd-7-1-2-RELEASE:1.1.1.2
	pgoyette-compat:1.2.0.2
	pgoyette-compat-base:1.2
	netbsd-7-1-1-RELEASE:1.1.1.2
	matt-nb8-mediatek:1.1.1.2.0.22
	matt-nb8-mediatek-base:1.1.1.2
	perseant-stdc-iso10646:1.1.1.2.0.20
	perseant-stdc-iso10646-base:1.1.1.2
	netbsd-8:1.1.1.2.0.18
	netbsd-8-base:1.1.1.2
	prg-localcount2-base3:1.1.1.2
	prg-localcount2-base2:1.1.1.2
	prg-localcount2-base1:1.1.1.2
	prg-localcount2:1.1.1.2.0.16
	prg-localcount2-base:1.1.1.2
	pgoyette-localcount-20170426:1.1.1.2
	bouyer-socketcan-base1:1.1.1.2
	pgoyette-localcount-20170320:1.1.1.2
	netbsd-7-1:1.1.1.2.0.14
	netbsd-7-1-RELEASE:1.1.1.2
	netbsd-7-1-RC2:1.1.1.2
	netbsd-7-nhusb-base-20170116:1.1.1.2
	bouyer-socketcan:1.1.1.2.0.12
	bouyer-socketcan-base:1.1.1.2
	pgoyette-localcount-20170107:1.1.1.2
	netbsd-7-1-RC1:1.1.1.2
	pgoyette-localcount-20161104:1.1.1.2
	netbsd-7-0-2-RELEASE:1.1.1.2
	localcount-20160914:1.1.1.2
	netbsd-7-nhusb:1.1.1.2.0.10
	netbsd-7-nhusb-base:1.1.1.2
	pgoyette-localcount-20160806:1.1.1.2
	pgoyette-localcount-20160726:1.1.1.2
	pgoyette-localcount:1.1.1.2.0.8
	pgoyette-localcount-base:1.1.1.2
	netbsd-7-0-1-RELEASE:1.1.1.2
	netbsd-7-0:1.1.1.2.0.6
	netbsd-7-0-RELEASE:1.1.1.2
	netbsd-7-0-RC3:1.1.1.2
	netbsd-7-0-RC2:1.1.1.2
	netbsd-7-0-RC1:1.1.1.2
	netbsd-6-0-6-RELEASE:1.1.1.1
	netbsd-6-1-5-RELEASE:1.1.1.1
	netbsd-7:1.1.1.2.0.4
	netbsd-7-base:1.1.1.2
	yamt-pagecache-base9:1.1.1.2
	yamt-pagecache-tag8:1.1.1.1
	netbsd-6-1-4-RELEASE:1.1.1.1
	netbsd-6-0-5-RELEASE:1.1.1.1
	tls-earlyentropy:1.1.1.2.0.2
	tls-earlyentropy-base:1.1.1.2
	riastradh-xf86-video-intel-2-7-1-pre-2-21-15:1.1.1.2
	riastradh-drm2-base3:1.1.1.2
	netbsd-6-1-3-RELEASE:1.1.1.1
	netbsd-6-0-4-RELEASE:1.1.1.1
	gmp-5-1-3:1.1.1.2
	netbsd-6-1-2-RELEASE:1.1.1.1
	netbsd-6-0-3-RELEASE:1.1.1.1
	netbsd-6-1-1-RELEASE:1.1.1.1
	riastradh-drm2-base2:1.1.1.1
	riastradh-drm2-base1:1.1.1.1
	riastradh-drm2:1.1.1.1.0.12
	riastradh-drm2-base:1.1.1.1
	netbsd-6-1:1.1.1.1.0.16
	netbsd-6-0-2-RELEASE:1.1.1.1
	netbsd-6-1-RELEASE:1.1.1.1
	netbsd-6-1-RC4:1.1.1.1
	netbsd-6-1-RC3:1.1.1.1
	agc-symver:1.1.1.1.0.14
	agc-symver-base:1.1.1.1
	netbsd-6-1-RC2:1.1.1.1
	netbsd-6-1-RC1:1.1.1.1
	yamt-pagecache-base8:1.1.1.1
	netbsd-6-0-1-RELEASE:1.1.1.1
	yamt-pagecache-base7:1.1.1.1
	matt-nb6-plus-nbase:1.1.1.1
	yamt-pagecache-base6:1.1.1.1
	netbsd-6-0:1.1.1.1.0.10
	netbsd-6-0-RELEASE:1.1.1.1
	netbsd-6-0-RC2:1.1.1.1
	tls-maxphys:1.1.1.1.0.8
	tls-maxphys-base:1.1.1.2
	matt-nb6-plus:1.1.1.1.0.6
	matt-nb6-plus-base:1.1.1.1
	netbsd-6-0-RC1:1.1.1.1
	yamt-pagecache-base5:1.1.1.1
	yamt-pagecache-base4:1.1.1.1
	netbsd-6:1.1.1.1.0.4
	netbsd-6-base:1.1.1.1
	yamt-pagecache-base3:1.1.1.1
	yamt-pagecache-base2:1.1.1.1
	yamt-pagecache:1.1.1.1.0.2
	yamt-pagecache-base:1.1.1.1
	gmp-5-0-2:1.1.1.1
	gmp:1.1.1;
locks; strict;
comment	@;; @;


1.2
date	2017.08.22.09.55.46;	author mrg;	state dead;
branches;
next	1.1;
commitid	DVNRZh45aSIRZb4A;

1.1
date	2011.06.20.05.54.39;	author mrg;	state Exp;
branches
	1.1.1.1;
next	;

1.1.1.1
date	2011.06.20.05.54.39;	author mrg;	state Exp;
branches
	1.1.1.1.2.1
	1.1.1.1.8.1;
next	1.1.1.2;

1.1.1.2
date	2013.11.29.07.49.48;	author mrg;	state Exp;
branches;
next	;
commitid	L2Av4PuGmdoL39fx;

1.1.1.1.2.1
date	2014.05.22.14.09.06;	author yamt;	state Exp;
branches;
next	;
commitid	nx2BSsHy0NPeAxBx;

1.1.1.1.8.1
date	2014.08.19.23.59.55;	author tls;	state Exp;
branches;
next	;
commitid	jTnpym9Qu0o4R1Nx;


desc
@@


1.2
log
@merge GMP 6.1.2.
@
text
@dnl  AMD64 mpn_sqr_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2008, 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C The inner loops of this code are the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C NOTES
C   * This code only handles operands up to SQR_TOOM2_THRESHOLD_MAX.  That
C     means we can safely use 32-bit operations for all sizes, unlike in e.g.,
C     mpn_addmul_1.
C   * The jump table could probably be optimized, at least for non-pic.
C   * The special code for n=1,2,3 was quickly written.  It is probably too
C     large and unnecessarily slow.
C   * Consider combining small cases code so that the n=k-1 code jumps into
C     the middle of the n=k code.
C   * Avoid saving registers for small cases code.
C   * Needed variables:
C    n   r11  input size
C    i   r8   work left, initially n
C    j   r9   inner loop count
C        r15  unused
C    v0  r13
C    v1  r14
C    rp  rdi
C    up  rsi
C    w0  rbx
C    w1  rcx
C    w2  rbp
C    w3  r10
C    tp  r12
C    lo  rax
C    hi  rdx
C        rsp

C INPUT PARAMETERS
define(`rp',	  `%rdi')
define(`up',	  `%rsi')
define(`n_param', `%rdx')

C We should really trim this, for better spatial locality.  Alternatively,
C we could grab the upper part of the stack area, leaving the lower part
C instead of the upper part unused.
deflit(SQR_TOOM2_THRESHOLD_MAX, 80)
define(`STACK_ALLOC', eval(8*2*SQR_TOOM2_THRESHOLD_MAX))

define(`n',	`%r11')
define(`tp',	`%r12')
define(`i',	`%r8')
define(`j',	`%r9')
define(`v0',	`%r13')
define(`v1',	`%r14')
define(`w0',	`%rbx')
define(`w1',	`%rcx')
define(`w2',	`%rbp')
define(`w3',	`%r10')


ASM_START()
	TEXT
	ALIGN(16)

PROLOGUE(mpn_sqr_basecase)
	add	$-48, %rsp
	mov	%rbx, 40(%rsp)
	mov	%rbp, 32(%rsp)
	mov	%r12, 24(%rsp)
	mov	%r13, 16(%rsp)
	mov	%r14, 8(%rsp)

	mov	R32(n_param), R32(n)		C free original n register (rdx)
	mov	R32(n_param), R32(%rcx)
	and	$3, R32(%rcx)
	lea	4(%rcx), %rbx
	cmp	$4, R32(n_param)
	cmovg	%rbx, %rcx
	lea	L(jmptab)(%rip), %rax
	jmp	*(%rax,%rcx,8)
	JUMPTABSECT
	ALIGN(8)
L(jmptab):
	.quad	L(4)
	.quad	L(1)
	.quad	L(2)
	.quad	L(3)
	.quad	L(0m4)
	.quad	L(1m4)
	.quad	L(2m4)
	.quad	L(3m4)
	TEXT

L(1):	mov	(up), %rax
	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	add	$40, %rsp
	pop	%rbx
	ret

L(2):	mov	(up), %rax
	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, %r9
	mov	8(up), %rax
	mul	%rax
	mov	%rax, %r10
	mov	%rdx, %r11
	mov	8(up), %rax
	mov	(up), %rbx
	mul	%rbx
	add	%rax, %r9
	adc	%rdx, %r10
	adc	$0, %r11
	add	%rax, %r9
	mov	%r9, 8(rp)
	adc	%rdx, %r10
	mov	%r10, 16(rp)
	adc	$0, %r11
	mov	%r11, 24(rp)
	add	$40, %rsp
	pop	%rbx
	ret

L(3):	mov	(up), %rax
	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	mov	8(up), %rax
	mul	%rax
	mov	%rax, 16(rp)
	mov	%rdx, 24(rp)
	mov	16(up), %rax
	mul	%rax
	mov	%rax, 32(rp)
	mov	%rdx, 40(rp)

	mov	(up), %rbx
	mov	8(up), %rax
	mul	%rbx
	mov	%rax, %r8
	mov	%rdx, %r9
	mov	16(up), %rax
	mul	%rbx
	xor	R32(%r10), R32(%r10)
	add	%rax, %r9
	adc	%rdx, %r10

	mov	8(up), %rbx
	mov	16(up), %rax
	mul	%rbx
	xor	R32(%r11), R32(%r11)
	add	%rax, %r10
	adc	%rdx, %r11
	add	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	adc	%r11, %r11
	mov	$0, R32(%rbx)
	adc	%rbx, %rbx
	add	%r8, 8(rp)
	adc	%r9, 16(rp)
	adc	%r10, 24(rp)
	adc	%r11, 32(rp)
	adc	%rbx, 40(rp)
	add	$40, %rsp
	pop	%rbx
	ret

L(4):	mov	(up), %rax
	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	mov	8(up), %rax
	mul	%rax
	mov	%rax, 16(rp)
	mov	%rdx, 24(rp)
	mov	16(up), %rax
	mul	%rax
	mov	%rax, 32(rp)
	mov	%rdx, 40(rp)
	mov	24(up), %rax
	mul	%rax
	mov	%rax, 48(rp)
	mov	%rdx, 56(rp)

	mov	(up), %rbx
	mov	8(up), %rax
	mul	%rbx
	mov	%rax, %r8
	mov	%rdx, %r9
	mov	16(up), %rax
	mul	%rbx
	xor	R32(%r10), R32(%r10)
	add	%rax, %r9
	adc	%rdx, %r10
	mov	24(up), %rax
	mul	%rbx
	xor	R32(%r11), R32(%r11)
	add	%rax, %r10
	adc	%rdx, %r11
	mov	8(up), %rbx
	mov	16(up), %rax
	mul	%rbx
	xor	R32(%r12), R32(%r12)
	add	%rax, %r10
	adc	%rdx, %r11
	adc	$0, %r12
	mov	24(up), %rax
	mul	%rbx
	add	%rax, %r11
	adc	%rdx, %r12
	mov	16(up), %rbx
	mov	24(up), %rax
	mul	%rbx
	xor	R32(%rbp), R32(%rbp)
	add	%rax, %r12
	adc	%rdx, %rbp

	add	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	adc	%r11, %r11
	adc	%r12, %r12
	mov	$0, R32(%rbx)
	adc	%rbp, %rbp

	adc	%rbx, %rbx
	add	%r8, 8(rp)
	adc	%r9, 16(rp)
	adc	%r10, 24(rp)
	adc	%r11, 32(rp)
	adc	%r12, 40(rp)
	adc	%rbp, 48(rp)
	adc	%rbx, 56(rp)
	add	$24, %rsp
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret


L(0m4):	add	$-STACK_ALLOC, %rsp
	lea	-24(%rsp,n,8), tp		C point tp in middle of result operand
	mov	(up), v0
	mov	8(up), %rax
	lea	(up,n,8), up		C point up at end of input operand

	lea	-4(n), i
C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
	xor	R32(j), R32(j)
	sub	n, j

	mul	v0
	xor	R32(w2), R32(w2)
	mov	%rax, w0
	mov	16(up,j,8), %rax
	mov	%rdx, w3
	jmp	L(L3)

	ALIGN(16)
L(mul_1_m3_top):
	add	%rax, w2
	mov	w3, (tp,j,8)
	mov	(up,j,8), %rax
	adc	%rdx, w1
	xor	R32(w0), R32(w0)
	mul	v0
	xor	R32(w3), R32(w3)
	mov	w2, 8(tp,j,8)
	add	%rax, w1
	adc	%rdx, w0
	mov	8(up,j,8), %rax
	mov	w1, 16(tp,j,8)
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	16(up,j,8), %rax
	adc	%rdx, w3
L(L3):	xor	R32(w1), R32(w1)
	mul	v0
	add	%rax, w3
	mov	24(up,j,8), %rax
	adc	%rdx, w2
	mov	w0, 24(tp,j,8)
	mul	v0
	add	$4, j
	js	L(mul_1_m3_top)

	add	%rax, w2
	mov	w3, (tp)
	adc	%rdx, w1
	mov	w2, 8(tp)
	mov	w1, 16(tp)

	lea	eval(2*8)(tp), tp	C tp += 2
	lea	-8(up), up
	jmp	L(dowhile)


L(1m4):	add	$-STACK_ALLOC, %rsp
	lea	(%rsp,n,8), tp		C point tp in middle of result operand
	mov	(up), v0		C u0
	mov	8(up), %rax		C u1
	lea	8(up,n,8), up		C point up at end of input operand

	lea	-3(n), i
C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
	lea	-3(n), j
	neg	j

	mov	%rax, v1		C u1
	mul	v0			C u0 * u1
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	mov	%rax, (%rsp)
	jmp	L(m0)

	ALIGN(16)
L(mul_2_m0_top):
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,j,8), %rax
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1			C v1 * u0
	add	%rax, w1
	mov	w0, -24(tp,j,8)
	adc	%rdx, w2
L(m0):	mov	-16(up,j,8), %rax	C u2, u6 ...
	mul	v0			C u0 * u2
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	adc	$0, R32(w3)
	mov	$0, R32(w0)
	mov	w1, -16(tp,j,8)
	mul	v1
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	add	%rax, w3
	mov	w2, -8(tp,j,8)
	adc	%rdx, w0
L(m2x):	mov	(up,j,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	add	$4, j
	mov	-32(up,j,8), %rax
	mov	w3, -32(tp,j,8)
	js	L(mul_2_m0_top)

	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, -8(tp)
	mov	w1, (tp)

	lea	-16(up), up
	lea	eval(3*8-24)(tp), tp	C tp += 3
	jmp	L(dowhile_end)


L(2m4):	add	$-STACK_ALLOC, %rsp
	lea	-24(%rsp,n,8), tp	C point tp in middle of result operand
	mov	(up), v0
	mov	8(up), %rax
	lea	(up,n,8), up		C point up at end of input operand

	lea	-4(n), i
C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
	lea	-2(n), j
	neg	j

	mul	v0
	mov	%rax, w2
	mov	(up,j,8), %rax
	mov	%rdx, w1
	jmp	L(L1)

	ALIGN(16)
L(mul_1_m1_top):
	add	%rax, w2
	mov	w3, (tp,j,8)
	mov	(up,j,8), %rax
	adc	%rdx, w1
L(L1):	xor	R32(w0), R32(w0)
	mul	v0
	xor	R32(w3), R32(w3)
	mov	w2, 8(tp,j,8)
	add	%rax, w1
	adc	%rdx, w0
	mov	8(up,j,8), %rax
	mov	w1, 16(tp,j,8)
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	16(up,j,8), %rax
	adc	%rdx, w3
	xor	R32(w1), R32(w1)
	mul	v0
	add	%rax, w3
	mov	24(up,j,8), %rax
	adc	%rdx, w2
	mov	w0, 24(tp,j,8)
	mul	v0
	add	$4, j
	js	L(mul_1_m1_top)

	add	%rax, w2
	mov	w3, (tp)
	adc	%rdx, w1
	mov	w2, 8(tp)
	mov	w1, 16(tp)

	lea	eval(2*8)(tp), tp	C tp += 2
	lea	-8(up), up
	jmp	L(dowhile_mid)


L(3m4):	add	$-STACK_ALLOC, %rsp
	lea	(%rsp,n,8), tp		C point tp in middle of result operand
	mov	(up), v0		C u0
	mov	8(up), %rax		C u1
	lea	8(up,n,8), up		C point up at end of input operand

	lea	-5(n), i
C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
	lea	-1(n), j
	neg	j

	mov	%rax, v1		C u1
	mul	v0			C u0 * u1
	mov	%rdx, w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	%rax, (%rsp)
	jmp	L(m2)

	ALIGN(16)
L(mul_2_m2_top):
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,j,8), %rax
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1			C v1 * u0
	add	%rax, w1
	mov	w0, -24(tp,j,8)
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	adc	$0, R32(w3)
	mov	$0, R32(w0)
	mov	w1, -16(tp,j,8)
	mul	v1
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	add	%rax, w3
	mov	w2, -8(tp,j,8)
	adc	%rdx, w0
L(m2):	mov	(up,j,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	add	$4, j
	mov	-32(up,j,8), %rax
	mov	w3, -32(tp,j,8)
	js	L(mul_2_m2_top)

	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, -8(tp)
	mov	w1, (tp)

	lea	-16(up), up
	jmp	L(dowhile_mid)

L(dowhile):
C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
	lea	4(i), j
	neg	j

	mov	16(up,j,8), v0
	mov	24(up,j,8), v1
	mov	24(up,j,8), %rax
	mul	v0
	xor	R32(w3), R32(w3)
	add	%rax, 24(tp,j,8)
	adc	%rdx, w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	jmp	L(am2)

	ALIGN(16)
L(addmul_2_m2_top):
	add	w3, (tp,j,8)
	adc	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1				C v1 * u0
	add	w0, 8(tp,j,8)
	adc	%rax, w1
	adc	%rdx, w2
	mov	16(up,j,8), %rax
	mov	$0, R32(w3)
	mul	v0				C v0 * u1
	add	%rax, w1
	mov	16(up,j,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1				C v1 * u1
	add	w1, 16(tp,j,8)
	adc	%rax, w2
	mov	24(up,j,8), %rax
	adc	%rdx, w3
	mul	v0
	mov	$0, R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, R32(w1)
	mov	24(up,j,8), %rax
	adc	$0, R32(w0)
	mul	v1
	add	w2, 24(tp,j,8)
	adc	%rax, w3
	adc	%rdx, w0
L(am2):	mov	32(up,j,8), %rax
	mul	v0
	add	%rax, w3
	mov	32(up,j,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	$4, j
	js	L(addmul_2_m2_top)

	add	w3, (tp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(tp)
	mov	w1, 16(tp)

	lea	eval(2*8)(tp), tp	C tp += 2

	add	$-2, R32(i)		C i -= 2

L(dowhile_mid):
C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
	lea	2(i), j
	neg	j

	mov	(up,j,8), v0
	mov	8(up,j,8), v1
	mov	8(up,j,8), %rax
	mul	v0
	xor	R32(w1), R32(w1)
	add	%rax, 8(tp,j,8)
	adc	%rdx, w1
	xor	R32(w2), R32(w2)
	jmp	L(20)

	ALIGN(16)
L(addmul_2_m0_top):
	add	w3, (tp,j,8)
	adc	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1				C v1 * u0
	add	w0, 8(tp,j,8)
	adc	%rax, w1
	adc	%rdx, w2
L(20):	mov	16(up,j,8), %rax
	mov	$0, R32(w3)
	mul	v0				C v0 * u1
	add	%rax, w1
	mov	16(up,j,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1				C v1 * u1
	add	w1, 16(tp,j,8)
	adc	%rax, w2
	mov	24(up,j,8), %rax
	adc	%rdx, w3
	mul	v0
	mov	$0, R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, R32(w1)
	mov	24(up,j,8), %rax
	adc	$0, R32(w0)
	mul	v1
	add	w2, 24(tp,j,8)
	adc	%rax, w3
	adc	%rdx, w0
	mov	32(up,j,8), %rax
	mul	v0
	add	%rax, w3
	mov	32(up,j,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	$4, j
	js	L(addmul_2_m0_top)

	add	w3, (tp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(tp)
	mov	w1, 16(tp)

	lea	eval(2*8)(tp), tp	C tp += 2
L(dowhile_end):

	add	$-2, R32(i)		C i -= 2
	jne	L(dowhile)

C Function mpn_addmul_2s_2
	mov	-16(up), v0
	mov	-8(up), v1
	mov	-8(up), %rax
	mul	v0
	xor	R32(w3), R32(w3)
	add	%rax, -8(tp)
	adc	%rdx, w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	(up), %rax
	mul	v0
	add	%rax, w3
	mov	(up), %rax
	adc	%rdx, w0
	mul	v1
	add	w3, (tp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(tp)
	mov	w1, 16(tp)

C Function mpn_sqr_diag_addlsh1
	lea	-4(n,n), j

	mov	(%rsp), %r11

	lea	(rp,j,8), rp
	lea	-8(up), up
	lea	8(%rsp,j,8), tp
	neg	j
	mov	(up,j,4), %rax
	mul	%rax
	test	$2, R8(j)
	jnz	L(odd)

L(evn):	add	%r11, %r11
	sbb	R32(%rbx), R32(%rbx)		C save CF
	add	%rdx, %r11
	mov	%rax, (rp,j,8)
	jmp	L(d0)

L(odd):	add	%r11, %r11
	sbb	R32(%rbp), R32(%rbp)		C save CF
	add	%rdx, %r11
	mov	%rax, (rp,j,8)
	lea	-2(j), j
	jmp	L(d1)

	ALIGN(16)
L(top):	mov	(up,j,4), %rax
	mul	%rax
	add	R32(%rbp), R32(%rbp)		C restore carry
	adc	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, (rp,j,8)
L(d0):	mov	%r11, 8(rp,j,8)
	mov	(tp,j,8), %r10
	adc	%r10, %r10
	mov	8(tp,j,8), %r11
	adc	%r11, %r11
	nop
	sbb	R32(%rbp), R32(%rbp)		C save CF
	mov	8(up,j,4), %rax
	mul	%rax
	add	R32(%rbx), R32(%rbx)		C restore carry
	adc	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, 16(rp,j,8)
L(d1):	mov	%r11, 24(rp,j,8)
	mov	16(tp,j,8), %r10
	adc	%r10, %r10
	mov	24(tp,j,8), %r11
	adc	%r11, %r11
	sbb	R32(%rbx), R32(%rbx)		C save CF
	add	$4, j
	js	L(top)

	mov	(up), %rax
	mul	%rax
	add	R32(%rbp), R32(%rbp)		C restore carry
	adc	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, (rp)
	mov	%r11, 8(rp)
	mov	(tp), %r10
	adc	%r10, %r10
	sbb	R32(%rbp), R32(%rbp)		C save CF
	neg	R32(%rbp)
	mov	8(up), %rax
	mul	%rax
	add	R32(%rbx), R32(%rbx)		C restore carry
	adc	%rax, %r10
	adc	%rbp, %rdx
	mov	%r10, 16(rp)
	mov	%rdx, 24(rp)

	add	$eval(8+STACK_ALLOC), %rsp
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
EPILOGUE()
@


1.1
log
@Initial revision
@
text
@@


1.1.1.1
log
@initial import of GMP 5.0.2.

GNU MP is a library for arbitrary precision arithmetic, operating on signed
integers, rational numbers, and floating point numbers.  It has a rich set
of functions, and the functions have a regular interface.

GMP is necessary for GCC >= 4.2.
@
text
@@


1.1.1.1.8.1
log
@Rebase to HEAD as of a few days ago.
@
text
@d5 1
a5 1
dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
d28 3
a30 10
C   * There is a major stupidity in that we call mpn_mul_1 initially, for a
C     large trip count.  Instead, we should follow the generic/sqr_basecase.c
C     code which uses addmul_2s from the start, conditionally leaving a 1x1
C     multiply to the end.  (In assembly code, one would stop invoking
C     addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.)
C   * Another stupidity is in the sqr_diag_addlsh1 code.  It does not need to
C     save/restore carry, instead it can propagate into the high product word.
C   * Align more labels, should shave off a few cycles.
C   * We can safely use 32-bit size operations, since operands with (2^32)
C     limbs will lead to non-termination in practice.
d32 1
a32 1
C   * The special code for n <= 4 was quickly written.  It is probably too
d34 2
a35 2
C   * Consider combining small cases code so that the n=k-1 code jumps into the
C     middle of the n=k code.
d60 6
a76 2
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
d81 1
d83 8
a90 1
	FUNC_ENTRY(3)
a91 4
	mov	R32(n_param), R32(n)		C free original n register (rdx)

	add	$-40, %rsp

d93 1
d95 2
a96 16
	lea	4(%rcx), %r8

	mov	%rbx, 32(%rsp)
	mov	%rbp, 24(%rsp)
	mov	%r12, 16(%rsp)
	mov	%r13, 8(%rsp)
	mov	%r14, (%rsp)

	cmovg	%r8, %rcx

	lea	L(tab)(%rip), %rax
ifdef(`PIC',
`	movslq	(%rax,%rcx,4), %r10
	add	%r10, %rax
	jmp	*%rax
',`
a97 1
')
d100 9
a108 8
L(tab):	JMPENT(	L(4), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))
	JMPENT(	L(0m4), L(tab))
	JMPENT(	L(1m4), L(tab))
	JMPENT(	L(2m4), L(tab))
	JMPENT(	L(3m4), L(tab))
a112 1
	add	$40, %rsp
d115 2
a116 1
	FUNC_EXIT()
a119 1
	mov	%rax, %r8
a120 1
	mov	8(up), %r11
a121 1
	mov	%r11, %rax
d123 1
a124 1
	add	$40, %rsp
a125 1
	mov	%r11, %rax
d127 3
a129 2
	mul	%r8
	xor	%r8, %r8
d132 1
a132 1
	adc	%r8, %r11
d137 1
a137 1
	adc	%r8, %r11
d139 2
a140 1
	FUNC_EXIT()
a143 1
	mov	%rax, %r10
a144 1
	mov	8(up), %r11
a145 1
	mov	%r11, %rax
d147 1
a148 1
	mov	16(up), %rcx
a149 1
	mov	%rcx, %rax
d151 1
d156 3
a158 2
	mov	%r11, %rax
	mul	%r10
a159 1
	mov	%rcx, %rax
d161 3
a163 2
	mul	%r10
	xor	%r10, %r10
a164 2
	mov	%r11, %rax
	mov	%r10, %r11
d167 4
a170 2
	mul	%rcx
	add	$40, %rsp
d172 1
a172 1
	adc	%r11, %rdx
a175 1
	adc	%rdx, %rdx
d177 2
d182 4
a185 3
	adc	%rdx, 32(rp)
	adc	%r11, 40(rp)
	FUNC_EXIT()
a188 1
	mov	%rax, %r11
a189 1
	mov	8(up), %rbx
a190 1
	mov	%rbx, %rax
d192 1
a202 1
	mov	%rbx, %rax
d205 3
a207 2
	mul	%r11
	add	$32, %rsp
d211 2
a212 2
	mul	%r11
	xor	%r10, %r10
d216 2
a217 2
	mul	%r11
	xor	%r11, %r11
d220 1
d223 1
a223 1
	xor	%rcx, %rcx
d226 1
a226 1
	adc	$0, %rcx
a228 1
	pop	%rbx
d230 2
a231 2
	adc	%rdx, %rcx
	mov	16(up), %rdx
d233 4
a236 3
	mul	%rdx
	add	%rax, %rcx
	adc	$0, %rdx
d242 3
a244 3
	adc	%rcx, %rcx
	mov	$0, R32(%rax)
	adc	%rdx, %rdx
d246 1
a246 1
	adc	%rax, %rax
d251 7
a257 4
	adc	%rcx, 40(rp)
	adc	%rdx, 48(rp)
	adc	%rax, 56(rp)
	FUNC_EXIT()
d261 2
a262 2
L(0m4):
	lea	-16(rp,n,8), tp		C point tp in middle of result operand
d319 2
a320 2
L(1m4):
	lea	8(rp,n,8), tp		C point tp in middle of result operand
d334 1
a334 1
	mov	%rax, 8(rp)
d397 2
a398 2
L(2m4):
	lea	-16(rp,n,8), tp		C point tp in middle of result operand
d454 2
a455 2
L(3m4):
	lea	8(rp,n,8), tp		C point tp in middle of result operand
d470 1
a470 1
	mov	%rax, 8(rp)
d707 3
a709 1
	mov	8(rp), %r11
d711 1
a711 1
	lea	(rp,j,8), rp
d739 1
a739 1
	mov	16(rp,j,8), %r10
d741 1
a741 1
	mov	24(rp,j,8), %r11
d752 1
a752 1
	mov	32(rp,j,8), %r10
d754 1
a754 1
	mov	40(rp,j,8), %r11
d767 1
a767 1
	mov	16(rp), %r10
d779 1
a784 1
	FUNC_EXIT()
@


1.1.1.1.2.1
log
@sync with head.

for a reference, the tree before this commit was tagged
as yamt-pagecache-tag8.

this commit was splitted into small chunks to avoid
a limitation of cvs.  ("Protocol error: too many arguments")
@
text
@d5 1
a5 1
dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
d28 3
a30 10
C   * There is a major stupidity in that we call mpn_mul_1 initially, for a
C     large trip count.  Instead, we should follow the generic/sqr_basecase.c
C     code which uses addmul_2s from the start, conditionally leaving a 1x1
C     multiply to the end.  (In assembly code, one would stop invoking
C     addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.)
C   * Another stupidity is in the sqr_diag_addlsh1 code.  It does not need to
C     save/restore carry, instead it can propagate into the high product word.
C   * Align more labels, should shave off a few cycles.
C   * We can safely use 32-bit size operations, since operands with (2^32)
C     limbs will lead to non-termination in practice.
d32 1
a32 1
C   * The special code for n <= 4 was quickly written.  It is probably too
d34 2
a35 2
C   * Consider combining small cases code so that the n=k-1 code jumps into the
C     middle of the n=k code.
d60 6
a76 2
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
d81 1
d83 8
a90 1
	FUNC_ENTRY(3)
a91 4
	mov	R32(n_param), R32(n)		C free original n register (rdx)

	add	$-40, %rsp

d93 1
d95 2
a96 16
	lea	4(%rcx), %r8

	mov	%rbx, 32(%rsp)
	mov	%rbp, 24(%rsp)
	mov	%r12, 16(%rsp)
	mov	%r13, 8(%rsp)
	mov	%r14, (%rsp)

	cmovg	%r8, %rcx

	lea	L(tab)(%rip), %rax
ifdef(`PIC',
`	movslq	(%rax,%rcx,4), %r10
	add	%r10, %rax
	jmp	*%rax
',`
a97 1
')
d100 9
a108 8
L(tab):	JMPENT(	L(4), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))
	JMPENT(	L(0m4), L(tab))
	JMPENT(	L(1m4), L(tab))
	JMPENT(	L(2m4), L(tab))
	JMPENT(	L(3m4), L(tab))
a112 1
	add	$40, %rsp
d115 2
a116 1
	FUNC_EXIT()
a119 1
	mov	%rax, %r8
a120 1
	mov	8(up), %r11
a121 1
	mov	%r11, %rax
d123 1
a124 1
	add	$40, %rsp
a125 1
	mov	%r11, %rax
d127 3
a129 2
	mul	%r8
	xor	%r8, %r8
d132 1
a132 1
	adc	%r8, %r11
d137 1
a137 1
	adc	%r8, %r11
d139 2
a140 1
	FUNC_EXIT()
a143 1
	mov	%rax, %r10
a144 1
	mov	8(up), %r11
a145 1
	mov	%r11, %rax
d147 1
a148 1
	mov	16(up), %rcx
a149 1
	mov	%rcx, %rax
d151 1
d156 3
a158 2
	mov	%r11, %rax
	mul	%r10
a159 1
	mov	%rcx, %rax
d161 3
a163 2
	mul	%r10
	xor	%r10, %r10
a164 2
	mov	%r11, %rax
	mov	%r10, %r11
d167 4
a170 2
	mul	%rcx
	add	$40, %rsp
d172 1
a172 1
	adc	%r11, %rdx
a175 1
	adc	%rdx, %rdx
d177 2
d182 4
a185 3
	adc	%rdx, 32(rp)
	adc	%r11, 40(rp)
	FUNC_EXIT()
a188 1
	mov	%rax, %r11
a189 1
	mov	8(up), %rbx
a190 1
	mov	%rbx, %rax
d192 1
a202 1
	mov	%rbx, %rax
d205 3
a207 2
	mul	%r11
	add	$32, %rsp
d211 2
a212 2
	mul	%r11
	xor	%r10, %r10
d216 2
a217 2
	mul	%r11
	xor	%r11, %r11
d220 1
d223 1
a223 1
	xor	%rcx, %rcx
d226 1
a226 1
	adc	$0, %rcx
a228 1
	pop	%rbx
d230 2
a231 2
	adc	%rdx, %rcx
	mov	16(up), %rdx
d233 4
a236 3
	mul	%rdx
	add	%rax, %rcx
	adc	$0, %rdx
d242 3
a244 3
	adc	%rcx, %rcx
	mov	$0, R32(%rax)
	adc	%rdx, %rdx
d246 1
a246 1
	adc	%rax, %rax
d251 7
a257 4
	adc	%rcx, 40(rp)
	adc	%rdx, 48(rp)
	adc	%rax, 56(rp)
	FUNC_EXIT()
d261 2
a262 2
L(0m4):
	lea	-16(rp,n,8), tp		C point tp in middle of result operand
d319 2
a320 2
L(1m4):
	lea	8(rp,n,8), tp		C point tp in middle of result operand
d334 1
a334 1
	mov	%rax, 8(rp)
d397 2
a398 2
L(2m4):
	lea	-16(rp,n,8), tp		C point tp in middle of result operand
d454 2
a455 2
L(3m4):
	lea	8(rp,n,8), tp		C point tp in middle of result operand
d470 1
a470 1
	mov	%rax, 8(rp)
d707 3
a709 1
	mov	8(rp), %r11
d711 1
a711 1
	lea	(rp,j,8), rp
d739 1
a739 1
	mov	16(rp,j,8), %r10
d741 1
a741 1
	mov	24(rp,j,8), %r11
d752 1
a752 1
	mov	32(rp,j,8), %r10
d754 1
a754 1
	mov	40(rp,j,8), %r11
d767 1
a767 1
	mov	16(rp), %r10
d779 1
a784 1
	FUNC_EXIT()
@


1.1.1.2
log
@initial import GMP 5.1.3 sources.  changes include:

fixes for:
- mpn_sbpi1_div_qr_sec and mpn_sbpi1_div_r_sec
- mpz_powm_ui
- AMD family 11h
- mpz_powm_sec and mpn_powm_sec
- ASSERT() fixes
- gcd, gcdext, and invert function fixes
- some PPC division operations
@
text
@d5 1
a5 1
dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
d28 3
a30 10
C   * There is a major stupidity in that we call mpn_mul_1 initially, for a
C     large trip count.  Instead, we should follow the generic/sqr_basecase.c
C     code which uses addmul_2s from the start, conditionally leaving a 1x1
C     multiply to the end.  (In assembly code, one would stop invoking
C     addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.)
C   * Another stupidity is in the sqr_diag_addlsh1 code.  It does not need to
C     save/restore carry, instead it can propagate into the high product word.
C   * Align more labels, should shave off a few cycles.
C   * We can safely use 32-bit size operations, since operands with (2^32)
C     limbs will lead to non-termination in practice.
d32 1
a32 1
C   * The special code for n <= 4 was quickly written.  It is probably too
d34 2
a35 2
C   * Consider combining small cases code so that the n=k-1 code jumps into the
C     middle of the n=k code.
d60 6
a76 2
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
d81 1
d83 8
a90 1
	FUNC_ENTRY(3)
a91 4
	mov	R32(n_param), R32(n)		C free original n register (rdx)

	add	$-40, %rsp

d93 1
d95 2
a96 16
	lea	4(%rcx), %r8

	mov	%rbx, 32(%rsp)
	mov	%rbp, 24(%rsp)
	mov	%r12, 16(%rsp)
	mov	%r13, 8(%rsp)
	mov	%r14, (%rsp)

	cmovg	%r8, %rcx

	lea	L(tab)(%rip), %rax
ifdef(`PIC',
`	movslq	(%rax,%rcx,4), %r10
	add	%r10, %rax
	jmp	*%rax
',`
a97 1
')
d100 9
a108 8
L(tab):	JMPENT(	L(4), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))
	JMPENT(	L(0m4), L(tab))
	JMPENT(	L(1m4), L(tab))
	JMPENT(	L(2m4), L(tab))
	JMPENT(	L(3m4), L(tab))
a112 1
	add	$40, %rsp
d115 2
a116 1
	FUNC_EXIT()
a119 1
	mov	%rax, %r8
a120 1
	mov	8(up), %r11
a121 1
	mov	%r11, %rax
d123 1
a124 1
	add	$40, %rsp
a125 1
	mov	%r11, %rax
d127 3
a129 2
	mul	%r8
	xor	%r8, %r8
d132 1
a132 1
	adc	%r8, %r11
d137 1
a137 1
	adc	%r8, %r11
d139 2
a140 1
	FUNC_EXIT()
a143 1
	mov	%rax, %r10
a144 1
	mov	8(up), %r11
a145 1
	mov	%r11, %rax
d147 1
a148 1
	mov	16(up), %rcx
a149 1
	mov	%rcx, %rax
d151 1
d156 3
a158 2
	mov	%r11, %rax
	mul	%r10
a159 1
	mov	%rcx, %rax
d161 3
a163 2
	mul	%r10
	xor	%r10, %r10
a164 2
	mov	%r11, %rax
	mov	%r10, %r11
d167 4
a170 2
	mul	%rcx
	add	$40, %rsp
d172 1
a172 1
	adc	%r11, %rdx
a175 1
	adc	%rdx, %rdx
d177 2
d182 4
a185 3
	adc	%rdx, 32(rp)
	adc	%r11, 40(rp)
	FUNC_EXIT()
a188 1
	mov	%rax, %r11
a189 1
	mov	8(up), %rbx
a190 1
	mov	%rbx, %rax
d192 1
a202 1
	mov	%rbx, %rax
d205 3
a207 2
	mul	%r11
	add	$32, %rsp
d211 2
a212 2
	mul	%r11
	xor	%r10, %r10
d216 2
a217 2
	mul	%r11
	xor	%r11, %r11
d220 1
d223 1
a223 1
	xor	%rcx, %rcx
d226 1
a226 1
	adc	$0, %rcx
a228 1
	pop	%rbx
d230 2
a231 2
	adc	%rdx, %rcx
	mov	16(up), %rdx
d233 4
a236 3
	mul	%rdx
	add	%rax, %rcx
	adc	$0, %rdx
d242 3
a244 3
	adc	%rcx, %rcx
	mov	$0, R32(%rax)
	adc	%rdx, %rdx
d246 1
a246 1
	adc	%rax, %rax
d251 7
a257 4
	adc	%rcx, 40(rp)
	adc	%rdx, 48(rp)
	adc	%rax, 56(rp)
	FUNC_EXIT()
d261 2
a262 2
L(0m4):
	lea	-16(rp,n,8), tp		C point tp in middle of result operand
d319 2
a320 2
L(1m4):
	lea	8(rp,n,8), tp		C point tp in middle of result operand
d334 1
a334 1
	mov	%rax, 8(rp)
d397 2
a398 2
L(2m4):
	lea	-16(rp,n,8), tp		C point tp in middle of result operand
d454 2
a455 2
L(3m4):
	lea	8(rp,n,8), tp		C point tp in middle of result operand
d470 1
a470 1
	mov	%rax, 8(rp)
d707 3
a709 1
	mov	8(rp), %r11
d711 1
a711 1
	lea	(rp,j,8), rp
d739 1
a739 1
	mov	16(rp,j,8), %r10
d741 1
a741 1
	mov	24(rp,j,8), %r11
d752 1
a752 1
	mov	32(rp,j,8), %r10
d754 1
a754 1
	mov	40(rp,j,8), %r11
d767 1
a767 1
	mov	16(rp), %r10
d779 1
a784 1
	FUNC_EXIT()
@