head 1.1; branch 1.1.1; access; symbols netbsd-11-0-RC5:1.1.1.3 netbsd-11-0-RC4:1.1.1.3 netbsd-11-0-RC3:1.1.1.3 netbsd-11-0-RC2:1.1.1.3 netbsd-11-0-RC1:1.1.1.3 perseant-exfatfs-base-20250801:1.1.1.3 netbsd-11:1.1.1.3.0.18 netbsd-11-base:1.1.1.3 netbsd-10-1-RELEASE:1.1.1.3 perseant-exfatfs-base-20240630:1.1.1.3 perseant-exfatfs:1.1.1.3.0.16 perseant-exfatfs-base:1.1.1.3 netbsd-8-3-RELEASE:1.1.1.2 netbsd-9-4-RELEASE:1.1.1.3 netbsd-10-0-RELEASE:1.1.1.3 netbsd-10-0-RC6:1.1.1.3 netbsd-10-0-RC5:1.1.1.3 netbsd-10-0-RC4:1.1.1.3 netbsd-10-0-RC3:1.1.1.3 netbsd-10-0-RC2:1.1.1.3 netbsd-10-0-RC1:1.1.1.3 netbsd-10:1.1.1.3.0.14 netbsd-10-base:1.1.1.3 netbsd-9-3-RELEASE:1.1.1.3 gmp-6-2-1:1.1.1.3 cjep_sun2x-base1:1.1.1.3 cjep_sun2x:1.1.1.3.0.12 cjep_sun2x-base:1.1.1.3 cjep_staticlib_x-base1:1.1.1.3 netbsd-9-2-RELEASE:1.1.1.3 cjep_staticlib_x:1.1.1.3.0.10 cjep_staticlib_x-base:1.1.1.3 netbsd-9-1-RELEASE:1.1.1.3 gmp-6-2-0:1.1.1.3 phil-wifi-20200421:1.1.1.3 phil-wifi-20200411:1.1.1.3 is-mlppp:1.1.1.3.0.8 is-mlppp-base:1.1.1.3 phil-wifi-20200406:1.1.1.3 netbsd-8-2-RELEASE:1.1.1.2 netbsd-9-0-RELEASE:1.1.1.3 netbsd-9-0-RC2:1.1.1.3 netbsd-9-0-RC1:1.1.1.3 phil-wifi-20191119:1.1.1.3 netbsd-9:1.1.1.3.0.6 netbsd-9-base:1.1.1.3 phil-wifi-20190609:1.1.1.3 netbsd-8-1-RELEASE:1.1.1.2 netbsd-8-1-RC1:1.1.1.2 pgoyette-compat-merge-20190127:1.1.1.3 pgoyette-compat-20190127:1.1.1.3 pgoyette-compat-20190118:1.1.1.3 pgoyette-compat-1226:1.1.1.3 pgoyette-compat-1126:1.1.1.3 pgoyette-compat-1020:1.1.1.3 pgoyette-compat-0930:1.1.1.3 pgoyette-compat-0906:1.1.1.3 netbsd-7-2-RELEASE:1.1.1.2 pgoyette-compat-0728:1.1.1.3 netbsd-8-0-RELEASE:1.1.1.2 phil-wifi:1.1.1.3.0.4 phil-wifi-base:1.1.1.3 pgoyette-compat-0625:1.1.1.3 netbsd-8-0-RC2:1.1.1.2 pgoyette-compat-0521:1.1.1.3 pgoyette-compat-0502:1.1.1.3 pgoyette-compat-0422:1.1.1.3 netbsd-8-0-RC1:1.1.1.2 pgoyette-compat-0415:1.1.1.3 pgoyette-compat-0407:1.1.1.3 pgoyette-compat-0330:1.1.1.3 pgoyette-compat-0322:1.1.1.3 pgoyette-compat-0315:1.1.1.3 netbsd-7-1-2-RELEASE:1.1.1.2 pgoyette-compat:1.1.1.3.0.2 pgoyette-compat-base:1.1.1.3 netbsd-7-1-1-RELEASE:1.1.1.2 matt-nb8-mediatek:1.1.1.2.0.22 matt-nb8-mediatek-base:1.1.1.2 gmp-6-1-2:1.1.1.3 perseant-stdc-iso10646:1.1.1.2.0.20 perseant-stdc-iso10646-base:1.1.1.2 netbsd-8:1.1.1.2.0.18 netbsd-8-base:1.1.1.2 prg-localcount2-base3:1.1.1.2 prg-localcount2-base2:1.1.1.2 prg-localcount2-base1:1.1.1.2 prg-localcount2:1.1.1.2.0.16 prg-localcount2-base:1.1.1.2 pgoyette-localcount-20170426:1.1.1.2 bouyer-socketcan-base1:1.1.1.2 pgoyette-localcount-20170320:1.1.1.2 netbsd-7-1:1.1.1.2.0.14 netbsd-7-1-RELEASE:1.1.1.2 netbsd-7-1-RC2:1.1.1.2 netbsd-7-nhusb-base-20170116:1.1.1.2 bouyer-socketcan:1.1.1.2.0.12 bouyer-socketcan-base:1.1.1.2 pgoyette-localcount-20170107:1.1.1.2 netbsd-7-1-RC1:1.1.1.2 pgoyette-localcount-20161104:1.1.1.2 netbsd-7-0-2-RELEASE:1.1.1.2 localcount-20160914:1.1.1.2 netbsd-7-nhusb:1.1.1.2.0.10 netbsd-7-nhusb-base:1.1.1.2 pgoyette-localcount-20160806:1.1.1.2 pgoyette-localcount-20160726:1.1.1.2 pgoyette-localcount:1.1.1.2.0.8 pgoyette-localcount-base:1.1.1.2 netbsd-7-0-1-RELEASE:1.1.1.2 netbsd-7-0:1.1.1.2.0.6 netbsd-7-0-RELEASE:1.1.1.2 netbsd-7-0-RC3:1.1.1.2 netbsd-7-0-RC2:1.1.1.2 netbsd-7-0-RC1:1.1.1.2 netbsd-6-0-6-RELEASE:1.1.1.1 netbsd-6-1-5-RELEASE:1.1.1.1 netbsd-7:1.1.1.2.0.4 netbsd-7-base:1.1.1.2 yamt-pagecache-base9:1.1.1.2 yamt-pagecache-tag8:1.1.1.1 netbsd-6-1-4-RELEASE:1.1.1.1 netbsd-6-0-5-RELEASE:1.1.1.1 tls-earlyentropy:1.1.1.2.0.2 tls-earlyentropy-base:1.1.1.2 riastradh-xf86-video-intel-2-7-1-pre-2-21-15:1.1.1.2 riastradh-drm2-base3:1.1.1.2 netbsd-6-1-3-RELEASE:1.1.1.1 netbsd-6-0-4-RELEASE:1.1.1.1 gmp-5-1-3:1.1.1.2 netbsd-6-1-2-RELEASE:1.1.1.1 netbsd-6-0-3-RELEASE:1.1.1.1 netbsd-6-1-1-RELEASE:1.1.1.1 riastradh-drm2-base2:1.1.1.1 riastradh-drm2-base1:1.1.1.1 riastradh-drm2:1.1.1.1.0.12 riastradh-drm2-base:1.1.1.1 netbsd-6-1:1.1.1.1.0.16 netbsd-6-0-2-RELEASE:1.1.1.1 netbsd-6-1-RELEASE:1.1.1.1 netbsd-6-1-RC4:1.1.1.1 netbsd-6-1-RC3:1.1.1.1 agc-symver:1.1.1.1.0.14 agc-symver-base:1.1.1.1 netbsd-6-1-RC2:1.1.1.1 netbsd-6-1-RC1:1.1.1.1 yamt-pagecache-base8:1.1.1.1 netbsd-6-0-1-RELEASE:1.1.1.1 yamt-pagecache-base7:1.1.1.1 matt-nb6-plus-nbase:1.1.1.1 yamt-pagecache-base6:1.1.1.1 netbsd-6-0:1.1.1.1.0.10 netbsd-6-0-RELEASE:1.1.1.1 netbsd-6-0-RC2:1.1.1.1 tls-maxphys:1.1.1.1.0.8 tls-maxphys-base:1.1.1.2 matt-nb6-plus:1.1.1.1.0.6 matt-nb6-plus-base:1.1.1.1 netbsd-6-0-RC1:1.1.1.1 yamt-pagecache-base5:1.1.1.1 yamt-pagecache-base4:1.1.1.1 netbsd-6:1.1.1.1.0.4 netbsd-6-base:1.1.1.1 yamt-pagecache-base3:1.1.1.1 yamt-pagecache-base2:1.1.1.1 yamt-pagecache:1.1.1.1.0.2 yamt-pagecache-base:1.1.1.1 gmp-5-0-2:1.1.1.1 gmp:1.1.1; locks; strict; comment @;; @; 1.1 date 2011.06.20.05.54.39; author mrg; state Exp; branches 1.1.1.1; next ; 1.1.1.1 date 2011.06.20.05.54.39; author mrg; state Exp; branches 1.1.1.1.2.1 1.1.1.1.8.1; next 1.1.1.2; 1.1.1.2 date 2013.11.29.07.49.48; author mrg; state Exp; branches; next 1.1.1.3; commitid L2Av4PuGmdoL39fx; 1.1.1.3 date 2017.08.22.09.40.49; author mrg; state Exp; branches; next ; commitid W5kmAIk8hwVpSb4A; 1.1.1.1.2.1 date 2014.05.22.14.09.01; author yamt; state Exp; branches; next ; commitid nx2BSsHy0NPeAxBx; 1.1.1.1.8.1 date 2014.08.19.23.59.50; author tls; state Exp; branches; next ; commitid jTnpym9Qu0o4R1Nx; desc @@ 1.1 log @Initial revision @ text @dnl IA-64 mpn_addmul_2 -- Multiply a n-limb number with a 2-limb number and dnl add the result to a (n+1)-limb number. dnl Copyright 2004, 2005 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. dnl The GNU MP Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published dnl by the Free Software Foundation; either version 3 of the License, or (at dnl your option) any later version. dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl You should have received a copy of the GNU Lesser General Public License dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb C Itanium: 3.65 C Itanium 2: 1.625 C Note that this is very similar to mul_2.asm. If you change this file, C please change that file too. C TODO C * Clean up variable names, and try to decrease the number of distinct C registers used. C * Cleanup feed-in code to not require zeroing several registers. C * Make sure we don't depend on uninitialized predicate registers. C * We currently cross-jump very aggressively, at the expense of a few cycles C per operation. Consider changing that. C * Could perhaps save a few cycles by using 1 c/l carry propagation in C wind-down code. C * Ultimately rewrite. The problem with this code is that it first uses a C loaded u value in one xma pair, then leaves it live over several unrelated C xma pairs, before it uses it again. It should actually be quite possible C to just swap some aligned xma pairs around. But we should then schedule C u loads further from the first use. C INPUT PARAMETERS define(`rp',`r32') define(`up',`r33') define(`n',`r34') define(`vp',`r35') define(`srp',`r3') define(`v0',`f6') define(`v1',`f7') define(`s0',`r14') define(`acc0',`r15') define(`pr0_0',`r16') define(`pr0_1',`r17') define(`pr0_2',`r18') define(`pr0_3',`r19') define(`pr1_0',`r20') define(`pr1_1',`r21') define(`pr1_2',`r22') define(`pr1_3',`r23') define(`acc1_0',`r24') define(`acc1_1',`r25') define(`acc1_2',`r26') define(`acc1_3',`r27') dnl define(`',`r28') dnl define(`',`r29') dnl define(`',`r30') dnl define(`',`r31') define(`fp0b_0',`f8') define(`fp0b_1',`f9') define(`fp0b_2',`f10') define(`fp0b_3',`f11') define(`fp1a_0',`f12') define(`fp1a_1',`f13') define(`fp1a_2',`f14') define(`fp1a_3',`f15') define(`fp1b_0',`f32') define(`fp1b_1',`f33') define(`fp1b_2',`f34') define(`fp1b_3',`f35') define(`fp2a_0',`f36') define(`fp2a_1',`f37') define(`fp2a_2',`f38') define(`fp2a_3',`f39') define(`r_0',`f40') define(`r_1',`f41') define(`r_2',`f42') define(`r_3',`f43') define(`u_0',`f44') define(`u_1',`f45') define(`u_2',`f46') define(`u_3',`f47') define(`rx',`f48') define(`ux',`f49') define(`ry',`f50') define(`uy',`f51') ASM_START() PROLOGUE(mpn_addmul_2) .prologue .save ar.lc, r2 .body ifdef(`HAVE_ABI_32', ` addp4 rp = 0, rp C M I addp4 up = 0, up C M I addp4 vp = 0, vp C M I zxt4 n = n C I ;;') {.mmi C 00 ldf8 ux = [up], 8 C M ldf8 v0 = [vp], 8 C M mov.i r2 = ar.lc C I0 }{.mmi ldf8 rx = [rp], 8 C M and r14 = 3, n C M I add n = -2, n C M I ;; }{.mmi C 01 ldf8 uy = [up], 8 C M ldf8 v1 = [vp] C M shr.u n = n, 2 C I0 }{.mmi ldf8 ry = [rp], -8 C M cmp.eq p10, p0 = 1, r14 C M I cmp.eq p11, p0 = 2, r14 C M I ;; }{.mmi C 02 add srp = 16, rp C M I cmp.eq p12, p0 = 3, r14 C M I mov.i ar.lc = n C I0 }{.bbb (p10) br.dptk .Lb01 C B (p11) br.dptk .Lb10 C B (p12) br.dptk .Lb11 C B ;; } ALIGN(32) .Lb00: ldf8 r_1 = [srp], 8 ldf8 u_1 = [up], 8 mov acc1_2 = 0 mov pr1_2 = 0 mov pr0_3 = 0 cmp.ne p8, p9 = r0, r0 ;; ldf8 r_2 = [srp], 8 xma.l fp0b_3 = ux, v0, rx cmp.ne p12, p13 = r0, r0 ldf8 u_2 = [up], 8 xma.hu fp1a_3 = ux, v0, rx br.cloop.dptk .grt4 xma.l fp0b_0 = uy, v0, ry xma.hu fp1a_0 = uy, v0, ry ;; getf.sig acc0 = fp0b_3 xma.l fp1b_3 = ux, v1, fp1a_3 xma.hu fp2a_3 = ux, v1, fp1a_3 ;; xma.l fp0b_1 = u_1, v0, r_1 xma.hu fp1a_1 = u_1, v0, r_1 ;; getf.sig pr0_0 = fp0b_0 xma.l fp1b_0 = uy, v1, fp1a_0 xma.hu fp2a_0 = uy, v1, fp1a_0 ;; getf.sig pr1_3 = fp1b_3 getf.sig acc1_3 = fp2a_3 xma.l fp0b_2 = u_2, v0, r_2 xma.hu fp1a_2 = u_2, v0, r_2 br .Lcj4 .grt4: xma.l fp0b_0 = uy, v0, ry xma.hu fp1a_0 = uy, v0, ry ;; ldf8 r_3 = [srp], 8 getf.sig acc0 = fp0b_3 xma.l fp1b_3 = ux, v1, fp1a_3 ldf8 u_3 = [up], 8 xma.hu fp2a_3 = ux, v1, fp1a_3 ;; xma.l fp0b_1 = u_1, v0, r_1 xma.hu fp1a_1 = u_1, v0, r_1 ;; ldf8 r_0 = [srp], 8 getf.sig pr0_0 = fp0b_0 xma.l fp1b_0 = uy, v1, fp1a_0 xma.hu fp2a_0 = uy, v1, fp1a_0 ;; ldf8 u_0 = [up], 8 getf.sig pr1_3 = fp1b_3 ;; getf.sig acc1_3 = fp2a_3 xma.l fp0b_2 = u_2, v0, r_2 xma.hu fp1a_2 = u_2, v0, r_2 br .LL00 ALIGN(32) .Lb01: ldf8 r_0 = [srp], 8 C M ldf8 u_0 = [up], 8 C M mov acc1_1 = 0 C M I mov pr1_1 = 0 C M I mov pr0_2 = 0 C M I cmp.ne p6, p7 = r0, r0 C M I ;; ldf8 r_1 = [srp], 8 C M xma.l fp0b_2 = ux, v0, rx C F cmp.ne p10, p11 = r0, r0 C M I ldf8 u_1 = [up], 8 C M xma.hu fp1a_2 = ux, v0, rx C F ;; xma.l fp0b_3 = uy, v0, ry C F xma.hu fp1a_3 = uy, v0, ry C F ;; getf.sig acc0 = fp0b_2 C M ldf8 r_2 = [srp], 8 C M xma.l fp1b_2 = ux, v1,fp1a_2 C F xma.hu fp2a_2 = ux, v1,fp1a_2 C F ldf8 u_2 = [up], 8 C M br.cloop.dptk .grt5 xma.l fp0b_0 = u_0, v0, r_0 C F xma.hu fp1a_0 = u_0, v0, r_0 C F ;; getf.sig pr0_3 = fp0b_3 C M xma.l fp1b_3 = uy, v1,fp1a_3 C F xma.hu fp2a_3 = uy, v1,fp1a_3 C F ;; getf.sig pr1_2 = fp1b_2 C M getf.sig acc1_2 = fp2a_2 C M xma.l fp0b_1 = u_1, v0, r_1 C F xma.hu fp1a_1 = u_1, v0, r_1 C F br .Lcj5 .grt5: xma.l fp0b_0 = u_0, v0, r_0 xma.hu fp1a_0 = u_0, v0, r_0 ;; getf.sig pr0_3 = fp0b_3 ldf8 r_3 = [srp], 8 xma.l fp1b_3 = uy, v1, fp1a_3 xma.hu fp2a_3 = uy, v1, fp1a_3 ;; ldf8 u_3 = [up], 8 getf.sig pr1_2 = fp1b_2 ;; getf.sig acc1_2 = fp2a_2 xma.l fp0b_1 = u_1, v0, r_1 xma.hu fp1a_1 = u_1, v0, r_1 br .LL01 ALIGN(32) .Lb10: C 03 br.cloop.dptk .grt2 C 04 C 05 C 06 xma.l fp0b_1 = ux, v0, rx xma.hu fp1a_1 = ux, v0, rx ;; C 07 xma.l fp0b_2 = uy, v0, ry xma.hu fp1a_2 = uy, v0, ry ;; C 08 C 09 C 10 stf8 [rp] = fp0b_1, 8 xma.l fp1b_1 = ux, v1, fp1a_1 xma.hu fp2a_1 = ux, v1, fp1a_1 ;; C 11 getf.sig acc0 = fp0b_2 xma.l fp1b_2 = uy, v1, fp1a_2 xma.hu fp2a_2 = uy, v1, fp1a_2 ;; C 12 C 13 C 14 getf.sig pr1_1 = fp1b_1 C 15 getf.sig acc1_1 = fp2a_1 C 16 getf.sig pr1_2 = fp1b_2 C 17 getf.sig r8 = fp2a_2 ;; C 18 C 19 add s0 = pr1_1, acc0 ;; C 20 st8 [rp] = s0, 8 cmp.ltu p8, p9 = s0, pr1_1 sub r31 = -1, acc1_1 ;; C 21 .pred.rel "mutex", p8, p9 (p8) add acc0 = pr1_2, acc1_1, 1 (p9) add acc0 = pr1_2, acc1_1 (p8) cmp.leu p10, p0 = r31, pr1_2 (p9) cmp.ltu p10, p0 = r31, pr1_2 ;; C 22 st8 [rp] = acc0, 8 mov.i ar.lc = r2 (p10) add r8 = 1, r8 br.ret.sptk.many b0 .grt2: ldf8 r_3 = [srp], 8 ldf8 u_3 = [up], 8 mov acc1_0 = 0 ;; ldf8 r_0 = [srp], 8 xma.l fp0b_1 = ux, v0, rx mov pr1_0 = 0 ldf8 u_0 = [up], 8 xma.hu fp1a_1 = ux, v0, rx mov pr0_1 = 0 ;; xma.l fp0b_2 = uy, v0, ry xma.hu fp1a_2 = uy, v0, ry ;; getf.sig acc0 = fp0b_1 ldf8 r_1 = [srp], 8 xma.l fp1b_1 = ux, v1, fp1a_1 xma.hu fp2a_1 = ux, v1, fp1a_1 ;; ldf8 u_1 = [up], 8 xma.l fp0b_3 = u_3, v0, r_3 xma.hu fp1a_3 = u_3, v0, r_3 ;; getf.sig pr0_2 = fp0b_2 ldf8 r_2 = [srp], 8 xma.l fp1b_2 = uy, v1, fp1a_2 xma.hu fp2a_2 = uy, v1, fp1a_2 ;; ldf8 u_2 = [up], 8 getf.sig pr1_1 = fp1b_1 ;; getf.sig acc1_1 = fp2a_1 xma.l fp0b_0 = u_0, v0, r_0 cmp.ne p8, p9 = r0, r0 cmp.ne p12, p13 = r0, r0 xma.hu fp1a_0 = u_0, v0, r_0 br .LL10 ALIGN(32) .Lb11: mov acc1_3 = 0 mov pr1_3 = 0 mov pr0_0 = 0 cmp.ne p6, p7 = r0, r0 ;; ldf8 r_2 = [srp], 8 ldf8 u_2 = [up], 8 br.cloop.dptk .grt3 ;; xma.l fp0b_0 = ux, v0, rx xma.hu fp1a_0 = ux, v0, rx ;; cmp.ne p10, p11 = r0, r0 xma.l fp0b_1 = uy, v0, ry xma.hu fp1a_1 = uy, v0, ry ;; getf.sig acc0 = fp0b_0 xma.l fp1b_0 = ux, v1, fp1a_0 xma.hu fp2a_0 = ux, v1, fp1a_0 ;; xma.l fp0b_2 = u_2, v0, r_2 xma.hu fp1a_2 = u_2, v0, r_2 ;; getf.sig pr0_1 = fp0b_1 xma.l fp1b_1 = uy, v1, fp1a_1 xma.hu fp2a_1 = uy, v1, fp1a_1 ;; getf.sig pr1_0 = fp1b_0 getf.sig acc1_0 = fp2a_0 br .Lcj3 .grt3: ldf8 r_3 = [srp], 8 xma.l fp0b_0 = ux, v0, rx cmp.ne p10, p11 = r0, r0 ldf8 u_3 = [up], 8 xma.hu fp1a_0 = ux, v0, rx ;; xma.l fp0b_1 = uy, v0, ry xma.hu fp1a_1 = uy, v0, ry ;; getf.sig acc0 = fp0b_0 ldf8 r_0 = [srp], 8 xma.l fp1b_0 = ux, v1, fp1a_0 ldf8 u_0 = [up], 8 xma.hu fp2a_0 = ux, v1, fp1a_0 ;; xma.l fp0b_2 = u_2, v0, r_2 xma.hu fp1a_2 = u_2, v0, r_2 ;; getf.sig pr0_1 = fp0b_1 ldf8 r_1 = [srp], 8 xma.l fp1b_1 = uy, v1, fp1a_1 xma.hu fp2a_1 = uy, v1, fp1a_1 ;; ldf8 u_1 = [up], 8 getf.sig pr1_0 = fp1b_0 ;; getf.sig acc1_0 = fp2a_0 xma.l fp0b_3 = u_3, v0, r_3 xma.hu fp1a_3 = u_3, v0, r_3 br .LL11 C *** MAIN LOOP START *** ALIGN(32) .Loop: C 00 .pred.rel "mutex", p12, p13 getf.sig pr0_3 = fp0b_3 ldf8 r_3 = [srp], 8 xma.l fp1b_3 = u_3, v1, fp1a_3 (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 xma.hu fp2a_3 = u_3, v1, fp1a_3 ;; C 01 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 ldf8 u_3 = [up], 8 getf.sig pr1_2 = fp1b_2 (p8) cmp.leu p6, p7 = acc0, pr0_1 (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; C 02 .pred.rel "mutex", p6, p7 getf.sig acc1_2 = fp2a_2 st8 [rp] = s0, 8 xma.l fp0b_1 = u_1, v0, r_1 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 xma.hu fp1a_1 = u_1, v0, r_1 ;; C 03 .LL01: .pred.rel "mutex", p10, p11 getf.sig pr0_0 = fp0b_0 ldf8 r_0 = [srp], 8 xma.l fp1b_0 = u_0, v1, fp1a_0 (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 xma.hu fp2a_0 = u_0, v1, fp1a_0 ;; C 04 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 ldf8 u_0 = [up], 8 getf.sig pr1_3 = fp1b_3 (p6) cmp.leu p8, p9 = acc0, pr0_2 (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; C 05 .pred.rel "mutex", p8, p9 getf.sig acc1_3 = fp2a_3 st8 [rp] = s0, 8 xma.l fp0b_2 = u_2, v0, r_2 (p8) add acc0 = pr0_3, acc1_1, 1 (p9) add acc0 = pr0_3, acc1_1 xma.hu fp1a_2 = u_2, v0, r_2 ;; C 06 .LL00: .pred.rel "mutex", p12, p13 getf.sig pr0_1 = fp0b_1 ldf8 r_1 = [srp], 8 xma.l fp1b_1 = u_1, v1, fp1a_1 (p12) add s0 = pr1_2, acc0, 1 (p13) add s0 = pr1_2, acc0 xma.hu fp2a_1 = u_1, v1, fp1a_1 ;; C 07 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 ldf8 u_1 = [up], 8 getf.sig pr1_0 = fp1b_0 (p8) cmp.leu p6, p7 = acc0, pr0_3 (p9) cmp.ltu p6, p7 = acc0, pr0_3 (p12) cmp.leu p10, p11 = s0, pr1_2 (p13) cmp.ltu p10, p11 = s0, pr1_2 ;; C 08 .pred.rel "mutex", p6, p7 getf.sig acc1_0 = fp2a_0 st8 [rp] = s0, 8 xma.l fp0b_3 = u_3, v0, r_3 (p6) add acc0 = pr0_0, acc1_2, 1 (p7) add acc0 = pr0_0, acc1_2 xma.hu fp1a_3 = u_3, v0, r_3 ;; C 09 .LL11: .pred.rel "mutex", p10, p11 getf.sig pr0_2 = fp0b_2 ldf8 r_2 = [srp], 8 xma.l fp1b_2 = u_2, v1, fp1a_2 (p10) add s0 = pr1_3, acc0, 1 (p11) add s0 = pr1_3, acc0 xma.hu fp2a_2 = u_2, v1, fp1a_2 ;; C 10 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 ldf8 u_2 = [up], 8 getf.sig pr1_1 = fp1b_1 (p6) cmp.leu p8, p9 = acc0, pr0_0 (p7) cmp.ltu p8, p9 = acc0, pr0_0 (p10) cmp.leu p12, p13 = s0, pr1_3 (p11) cmp.ltu p12, p13 = s0, pr1_3 ;; C 11 .pred.rel "mutex", p8, p9 getf.sig acc1_1 = fp2a_1 st8 [rp] = s0, 8 xma.l fp0b_0 = u_0, v0, r_0 (p8) add acc0 = pr0_1, acc1_3, 1 (p9) add acc0 = pr0_1, acc1_3 xma.hu fp1a_0 = u_0, v0, r_0 .LL10: br.cloop.dptk .Loop C 12 ;; C *** MAIN LOOP END *** .Lcj6: .pred.rel "mutex", p12, p13 getf.sig pr0_3 = fp0b_3 xma.l fp1b_3 = u_3, v1, fp1a_3 (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 xma.hu fp2a_3 = u_3, v1, fp1a_3 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 getf.sig pr1_2 = fp1b_2 (p8) cmp.leu p6, p7 = acc0, pr0_1 (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; .pred.rel "mutex", p6, p7 getf.sig acc1_2 = fp2a_2 st8 [rp] = s0, 8 xma.l fp0b_1 = u_1, v0, r_1 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 xma.hu fp1a_1 = u_1, v0, r_1 ;; .Lcj5: .pred.rel "mutex", p10, p11 getf.sig pr0_0 = fp0b_0 xma.l fp1b_0 = u_0, v1, fp1a_0 (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 xma.hu fp2a_0 = u_0, v1, fp1a_0 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 getf.sig pr1_3 = fp1b_3 (p6) cmp.leu p8, p9 = acc0, pr0_2 (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; .pred.rel "mutex", p8, p9 getf.sig acc1_3 = fp2a_3 st8 [rp] = s0, 8 xma.l fp0b_2 = u_2, v0, r_2 (p8) add acc0 = pr0_3, acc1_1, 1 (p9) add acc0 = pr0_3, acc1_1 xma.hu fp1a_2 = u_2, v0, r_2 ;; .Lcj4: .pred.rel "mutex", p12, p13 getf.sig pr0_1 = fp0b_1 xma.l fp1b_1 = u_1, v1, fp1a_1 (p12) add s0 = pr1_2, acc0, 1 (p13) add s0 = pr1_2, acc0 xma.hu fp2a_1 = u_1, v1, fp1a_1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 getf.sig pr1_0 = fp1b_0 (p8) cmp.leu p6, p7 = acc0, pr0_3 (p9) cmp.ltu p6, p7 = acc0, pr0_3 (p12) cmp.leu p10, p11 = s0, pr1_2 (p13) cmp.ltu p10, p11 = s0, pr1_2 ;; .pred.rel "mutex", p6, p7 getf.sig acc1_0 = fp2a_0 st8 [rp] = s0, 8 (p6) add acc0 = pr0_0, acc1_2, 1 (p7) add acc0 = pr0_0, acc1_2 ;; .Lcj3: .pred.rel "mutex", p10, p11 getf.sig pr0_2 = fp0b_2 xma.l fp1b_2 = u_2, v1, fp1a_2 (p10) add s0 = pr1_3, acc0, 1 (p11) add s0 = pr1_3, acc0 xma.hu fp2a_2 = u_2, v1, fp1a_2 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 getf.sig pr1_1 = fp1b_1 (p6) cmp.leu p8, p9 = acc0, pr0_0 (p7) cmp.ltu p8, p9 = acc0, pr0_0 (p10) cmp.leu p12, p13 = s0, pr1_3 (p11) cmp.ltu p12, p13 = s0, pr1_3 ;; .pred.rel "mutex", p8, p9 getf.sig acc1_1 = fp2a_1 st8 [rp] = s0, 8 (p8) add acc0 = pr0_1, acc1_3, 1 (p9) add acc0 = pr0_1, acc1_3 ;; .Lcj2: .pred.rel "mutex", p12, p13 (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 getf.sig pr1_2 = fp1b_2 (p8) cmp.leu p6, p7 = acc0, pr0_1 (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; .pred.rel "mutex", p6, p7 getf.sig acc1_2 = fp2a_2 st8 [rp] = s0, 8 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 ;; .pred.rel "mutex", p10, p11 (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 (p6) cmp.leu p8, p9 = acc0, pr0_2 (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; .pred.rel "mutex", p8, p9 st8 [rp] = s0, 8 (p8) add acc0 = pr1_2, acc1_1, 1 (p9) add acc0 = pr1_2, acc1_1 ;; .pred.rel "mutex", p8, p9 (p8) cmp.leu p10, p11 = acc0, pr1_2 (p9) cmp.ltu p10, p11 = acc0, pr1_2 (p12) add acc0 = 1, acc0 ;; st8 [rp] = acc0, 8 (p12) cmp.eq.or p10, p0 = 0, acc0 mov r8 = acc1_2 ;; .pred.rel "mutex", p10, p11 (p10) add r8 = 1, r8 mov.i ar.lc = r2 br.ret.sptk.many b0 EPILOGUE() ASM_END() @ 1.1.1.1 log @initial import of GMP 5.0.2. GNU MP is a library for arbitrary precision arithmetic, operating on signed integers, rational numbers, and floating point numbers. It has a rich set of functions, and the functions have a regular interface. GMP is necessary for GCC >= 4.2. @ text @@ 1.1.1.1.8.1 log @Rebase to HEAD as of a few days ago. @ text @d4 1 a4 3 dnl Contributed to the GNU project by Torbjorn Granlund. dnl Copyright 2004, 2005, 2011 Free Software Foundation, Inc. d27 3 d33 4 a36 2 C * Clean up feed-in code to not require zeroing several registers. C * Make sure we don't depend on uninitialised predicate registers. a96 51 PROLOGUE(mpn_addmul_2s) .prologue .save ar.lc, r2 .body ifdef(`HAVE_ABI_32',` .mmi; addp4 rp = 0, rp C M I addp4 up = 0, up C M I addp4 vp = 0, vp C M I .mmi; nop 1 nop 1 zxt4 n = n C I ;;') .mmi; ldf8 ux = [up], 8 C M ldf8 v0 = [vp], 8 C M mov r2 = ar.lc C I0 .mmi; ldf8 rx = [rp], 8 C M and r14 = 3, n C M I add n = -2, n C M I ;; .mmi; ldf8 uy = [up], 8 C M ldf8 v1 = [vp] C M shr.u n = n, 2 C I0 .mmi; ldf8 ry = [rp], -8 C M cmp.eq p14, p0 = 1, r14 C M I cmp.eq p11, p0 = 2, r14 C M I ;; .mmi; add srp = 16, rp C M I cmp.eq p15, p0 = 3, r14 C M I mov ar.lc = n C I0 .bbb; (p14) br.dptk L(x01) C B (p11) br.dptk L(x10) C B (p15) br.dptk L(x11) C B ;; L(x00): cmp.ne p6, p0 = r0, r0 C suppress initial xma pair mov fp2a_3 = f0 br L(b00) L(x01): cmp.ne p14, p0 = r0, r0 C suppress initial xma pair mov fp2a_2 = f0 br L(b01) L(x10): cmp.ne p11, p0 = r0, r0 C suppress initial xma pair mov fp2a_1 = f0 br L(b10) L(x11): cmp.ne p15, p0 = r0, r0 C suppress initial xma pair mov fp2a_0 = f0 br L(b11) EPILOGUE() d102 5 a106 7 ifdef(`HAVE_ABI_32',` .mmi; addp4 rp = 0, rp C M I addp4 up = 0, up C M I addp4 vp = 0, vp C M I .mmi; nop 1 nop 1 zxt4 n = n C I d109 26 a134 20 .mmi; ldf8 ux = [up], 8 C M ldf8 v0 = [vp], 8 C M mov r2 = ar.lc C I0 .mmi; ldf8 rx = [rp], 8 C M and r14 = 3, n C M I add n = -2, n C M I ;; .mmi; ldf8 uy = [up], 8 C M ldf8 v1 = [vp] C M shr.u n = n, 2 C I0 .mmi; ldf8 ry = [rp], -8 C M cmp.eq p14, p0 = 1, r14 C M I cmp.eq p11, p0 = 2, r14 C M I ;; .mmi; add srp = 16, rp C M I cmp.eq p15, p6 = 3, r14 C M I mov ar.lc = n C I0 .bbb; (p14) br.dptk L(b01) C B (p11) br.dptk L(b10) C B (p15) br.dptk L(b11) C B d136 1 d139 58 a196 59 L(b00): .mmi; ldf8 r_1 = [srp], 8 ldf8 u_1 = [up], 8 mov acc1_2 = 0 .mmi; mov pr1_2 = 0 mov pr0_3 = 0 cmp.ne p8, p9 = r0, r0 ;; .mfi; ldf8 r_2 = [srp], 8 xma.l fp0b_3 = ux, v0, rx cmp.ne p12, p13 = r0, r0 .mfb; ldf8 u_2 = [up], 8 xma.hu fp1b_3 = ux, v0, rx br.cloop.dptk L(gt4) xma.l fp0b_0 = uy, v0, ry xma.hu fp1a_0 = uy, v0, ry ;; getfsig acc0 = fp0b_3 (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s ;; xma.l fp0b_1 = u_1, v0, r_1 xma.hu fp1a_1 = u_1, v0, r_1 ;; getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = uy, v1, fp1a_0 xma.hu fp2a_0 = uy, v1, fp1a_0 ;; getfsig pr1_3 = fp1b_3 getfsig acc1_3 = fp2a_3 xma.l fp0b_2 = u_2, v0, r_2 xma.hu fp1a_2 = u_2, v0, r_2 br L(cj4) L(gt4): xma.l fp0b_0 = uy, v0, ry xma.hu fp1a_0 = uy, v0, ry ;; ldf8 r_3 = [srp], 8 getfsig acc0 = fp0b_3 (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s ldf8 u_3 = [up], 8 (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s ;; xma.l fp0b_1 = u_1, v0, r_1 xma.hu fp1a_1 = u_1, v0, r_1 ;; ldf8 r_0 = [srp], 8 getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = uy, v1, fp1a_0 xma.hu fp2a_0 = uy, v1, fp1a_0 ;; ldf8 u_0 = [up], 8 getfsig pr1_3 = fp1b_3 xma.l fp0b_2 = u_2, v0, r_2 ;; getfsig acc1_3 = fp2a_3 xma.hu fp1a_2 = u_2, v0, r_2 br L(00) d200 51 a250 53 L(b01): .mmi; ldf8 r_0 = [srp], 8 C M ldf8 u_0 = [up], 8 C M mov acc1_1 = 0 C M I .mmi; mov pr1_1 = 0 C M I mov pr0_2 = 0 C M I cmp.ne p6, p7 = r0, r0 C M I ;; .mfi; ldf8 r_1 = [srp], 8 C M xma.l fp0b_2 = ux, v0, rx C F cmp.ne p10, p11 = r0, r0 C M I .mfi; ldf8 u_1 = [up], 8 C M xma.hu fp1b_2 = ux, v0, rx C F nop 1 ;; xma.l fp0b_3 = uy, v0, ry C F xma.hu fp1a_3 = uy, v0, ry C F ;; .mmf; getfsig acc0 = fp0b_2 C M ldf8 r_2 = [srp], 8 C M (p14) xma.hu fp2a_2 = ux, v1,fp1b_2 C F suppressed for addmul_2s .mfb; ldf8 u_2 = [up], 8 C M (p14) xma.l fp1b_2 = ux, v1,fp1b_2 C F suppressed for addmul_2s br.cloop.dptk L(gt5) xma.l fp0b_0 = u_0, v0, r_0 C F xma.hu fp1a_0 = u_0, v0, r_0 C F ;; getfsig pr0_3 = fp0b_3 C M xma.l fp1b_3 = uy, v1,fp1a_3 C F xma.hu fp2a_3 = uy, v1,fp1a_3 C F ;; getfsig pr1_2 = fp1b_2 C M getfsig acc1_2 = fp2a_2 C M xma.l fp0b_1 = u_1, v0, r_1 C F xma.hu fp1a_1 = u_1, v0, r_1 C F br L(cj5) L(gt5): xma.l fp0b_0 = u_0, v0, r_0 xma.hu fp1a_0 = u_0, v0, r_0 ;; getfsig pr0_3 = fp0b_3 ldf8 r_3 = [srp], 8 xma.l fp1b_3 = uy, v1, fp1a_3 xma.hu fp2a_3 = uy, v1, fp1a_3 ;; ldf8 u_3 = [up], 8 getfsig pr1_2 = fp1b_2 xma.l fp0b_1 = u_1, v0, r_1 ;; getfsig acc1_2 = fp2a_2 xma.hu fp1a_1 = u_1, v0, r_1 br L(01) d254 87 a340 77 L(b10): br.cloop.dptk L(gt2) xma.l fp0b_1 = ux, v0, rx xma.hu fp1b_1 = ux, v0, rx ;; xma.l fp0b_2 = uy, v0, ry xma.hu fp1a_2 = uy, v0, ry ;; stf8 [rp] = fp0b_1, 8 (p11) xma.hu fp2a_1 = ux, v1, fp1b_1 C suppressed for addmul_2s (p11) xma.l fp1b_1 = ux, v1, fp1b_1 C suppressed for addmul_2s ;; getfsig acc0 = fp0b_2 xma.l fp1b_2 = uy, v1, fp1a_2 xma.hu fp2a_2 = uy, v1, fp1a_2 ;; getfsig pr1_1 = fp1b_1 getfsig acc1_1 = fp2a_1 mov ar.lc = r2 getfsig pr1_2 = fp1b_2 getfsig r8 = fp2a_2 ;; add s0 = pr1_1, acc0 ;; st8 [rp] = s0, 8 cmp.ltu p8, p9 = s0, pr1_1 sub r31 = -1, acc1_1 ;; .pred.rel "mutex", p8, p9 (p8) add acc0 = pr1_2, acc1_1, 1 (p9) add acc0 = pr1_2, acc1_1 (p8) cmp.leu p10, p0 = r31, pr1_2 (p9) cmp.ltu p10, p0 = r31, pr1_2 ;; st8 [rp] = acc0, 8 (p10) add r8 = 1, r8 br.ret.sptk.many b0 L(gt2): .mmi; ldf8 r_3 = [srp], 8 ldf8 u_3 = [up], 8 mov acc1_0 = 0 ;; .mfi; ldf8 r_0 = [srp], 8 xma.l fp0b_1 = ux, v0, rx mov pr1_0 = 0 .mfi; ldf8 u_0 = [up], 8 xma.hu fp1b_1 = ux, v0, rx mov pr0_1 = 0 ;; xma.l fp0b_2 = uy, v0, ry xma.hu fp1a_2 = uy, v0, ry ;; getfsig acc0 = fp0b_1 ldf8 r_1 = [srp], 8 (p11) xma.hu fp2a_1 = ux, v1, fp1b_1 C suppressed for addmul_2s (p11) xma.l fp1b_1 = ux, v1, fp1b_1 C suppressed for addmul_2s ;; ldf8 u_1 = [up], 8 xma.l fp0b_3 = u_3, v0, r_3 xma.hu fp1a_3 = u_3, v0, r_3 ;; getfsig pr0_2 = fp0b_2 ldf8 r_2 = [srp], 8 xma.l fp1b_2 = uy, v1, fp1a_2 xma.hu fp2a_2 = uy, v1, fp1a_2 ;; ldf8 u_2 = [up], 8 getfsig pr1_1 = fp1b_1 ;; .mfi; getfsig acc1_1 = fp2a_1 xma.l fp0b_0 = u_0, v0, r_0 cmp.ne p8, p9 = r0, r0 .mfb; cmp.ne p12, p13 = r0, r0 xma.hu fp1a_0 = u_0, v0, r_0 br.cloop.sptk.clr L(top) br.many L(end) d344 61 a404 62 L(b11): ldf8 r_2 = [srp], 8 mov pr1_3 = 0 mov pr0_0 = 0 ;; ldf8 u_2 = [up], 8 mov acc1_3 = 0 br.cloop.dptk L(gt3) ;; cmp.ne p6, p7 = r0, r0 xma.l fp0b_0 = ux, v0, rx xma.hu fp1b_0 = ux, v0, rx ;; cmp.ne p10, p11 = r0, r0 xma.l fp0b_1 = uy, v0, ry xma.hu fp1a_1 = uy, v0, ry ;; getfsig acc0 = fp0b_0 (p15) xma.hu fp2a_0 = ux, v1, fp1b_0 C suppressed for addmul_2s (p15) xma.l fp1b_0 = ux, v1, fp1b_0 C suppressed for addmul_2s ;; xma.l fp0b_2 = uy, v1, r_2 xma.hu fp1a_2 = uy, v1, r_2 ;; getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = u_2, v0, fp1a_1 xma.hu fp2a_1 = u_2, v0, fp1a_1 ;; getfsig pr1_0 = fp1b_0 getfsig acc1_0 = fp2a_0 br L(cj3) L(gt3): ldf8 r_3 = [srp], 8 xma.l fp0b_0 = ux, v0, rx cmp.ne p10, p11 = r0, r0 ldf8 u_3 = [up], 8 xma.hu fp1b_0 = ux, v0, rx cmp.ne p6, p7 = r0, r0 ;; xma.l fp0b_1 = uy, v0, ry xma.hu fp1a_1 = uy, v0, ry ;; getfsig acc0 = fp0b_0 ldf8 r_0 = [srp], 8 (p15) xma.hu fp2a_0 = ux, v1, fp1b_0 C suppressed for addmul_2s ldf8 u_0 = [up], 8 (p15) xma.l fp1b_0 = ux, v1, fp1b_0 C suppressed for addmul_2s ;; xma.l fp0b_2 = u_2, v0, r_2 xma.hu fp1a_2 = u_2, v0, r_2 ;; getfsig pr0_1 = fp0b_1 ldf8 r_1 = [srp], 8 xma.l fp1b_1 = uy, v1, fp1a_1 xma.hu fp2a_1 = uy, v1, fp1a_1 ;; ldf8 u_1 = [up], 8 getfsig pr1_0 = fp1b_0 ;; getfsig acc1_0 = fp2a_0 xma.l fp0b_3 = u_3, v0, r_3 xma.hu fp1a_3 = u_3, v0, r_3 br L(11) d409 8 a416 8 L(top): C 00 .pred.rel "mutex", p12, p13 getfsig pr0_3 = fp0b_3 ldf8 r_3 = [srp], 8 xma.l fp1b_3 = u_3, v1, fp1a_3 (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 xma.hu fp2a_3 = u_3, v1, fp1a_3 d418 8 a425 8 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 ldf8 u_3 = [up], 8 getfsig pr1_2 = fp1b_2 (p8) cmp.leu p6, p7 = acc0, pr0_1 (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 d427 7 a433 7 .pred.rel "mutex", p6, p7 getfsig acc1_2 = fp2a_2 st8 [rp] = s0, 8 xma.l fp0b_1 = u_1, v0, r_1 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 xma.hu fp1a_1 = u_1, v0, r_1 d435 8 a442 8 L(01): .pred.rel "mutex", p10, p11 getfsig pr0_0 = fp0b_0 ldf8 r_0 = [srp], 8 xma.l fp1b_0 = u_0, v1, fp1a_0 (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 xma.hu fp2a_0 = u_0, v1, fp1a_0 d444 8 a451 8 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 ldf8 u_0 = [up], 8 getfsig pr1_3 = fp1b_3 (p6) cmp.leu p8, p9 = acc0, pr0_2 (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 d453 7 a459 7 .pred.rel "mutex", p8, p9 getfsig acc1_3 = fp2a_3 st8 [rp] = s0, 8 xma.l fp0b_2 = u_2, v0, r_2 (p8) add acc0 = pr0_3, acc1_1, 1 (p9) add acc0 = pr0_3, acc1_1 xma.hu fp1a_2 = u_2, v0, r_2 d461 8 a468 8 L(00): .pred.rel "mutex", p12, p13 getfsig pr0_1 = fp0b_1 ldf8 r_1 = [srp], 8 xma.l fp1b_1 = u_1, v1, fp1a_1 (p12) add s0 = pr1_2, acc0, 1 (p13) add s0 = pr1_2, acc0 xma.hu fp2a_1 = u_1, v1, fp1a_1 d470 8 a477 8 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 ldf8 u_1 = [up], 8 getfsig pr1_0 = fp1b_0 (p8) cmp.leu p6, p7 = acc0, pr0_3 (p9) cmp.ltu p6, p7 = acc0, pr0_3 (p12) cmp.leu p10, p11 = s0, pr1_2 (p13) cmp.ltu p10, p11 = s0, pr1_2 d479 7 a485 7 .pred.rel "mutex", p6, p7 getfsig acc1_0 = fp2a_0 st8 [rp] = s0, 8 xma.l fp0b_3 = u_3, v0, r_3 (p6) add acc0 = pr0_0, acc1_2, 1 (p7) add acc0 = pr0_0, acc1_2 xma.hu fp1a_3 = u_3, v0, r_3 d487 8 a494 8 L(11): .pred.rel "mutex", p10, p11 getfsig pr0_2 = fp0b_2 ldf8 r_2 = [srp], 8 xma.l fp1b_2 = u_2, v1, fp1a_2 (p10) add s0 = pr1_3, acc0, 1 (p11) add s0 = pr1_3, acc0 xma.hu fp2a_2 = u_2, v1, fp1a_2 d496 8 a503 8 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 ldf8 u_2 = [up], 8 getfsig pr1_1 = fp1b_1 (p6) cmp.leu p8, p9 = acc0, pr0_0 (p7) cmp.ltu p8, p9 = acc0, pr0_0 (p10) cmp.leu p12, p13 = s0, pr1_3 (p11) cmp.ltu p12, p13 = s0, pr1_3 d505 8 a512 8 .pred.rel "mutex", p8, p9 getfsig acc1_1 = fp2a_1 st8 [rp] = s0, 8 xma.l fp0b_0 = u_0, v0, r_0 (p8) add acc0 = pr0_1, acc1_3, 1 (p9) add acc0 = pr0_1, acc1_3 xma.hu fp1a_0 = u_0, v0, r_0 L(10): br.cloop.sptk.clr L(top) C 12 d515 141 a655 144 L(end): .pred.rel "mutex", p12, p13 .mfi; getfsig pr0_3 = fp0b_3 xma.l fp1b_3 = u_3, v1, fp1a_3 (p12) add s0 = pr1_0, acc0, 1 .mfi; (p13) add s0 = pr1_0, acc0 xma.hu fp2a_3 = u_3, v1, fp1a_3 nop 1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_2 = fp1b_2 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_1 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; .pred.rel "mutex", p6, p7 .mfi; getfsig acc1_2 = fp2a_2 xma.l fp0b_1 = u_1, v0, r_1 nop 1 .mmf; (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 xma.hu fp1a_1 = u_1, v0, r_1 ;; L(cj5): .pred.rel "mutex", p10, p11 .mfi; getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = u_0, v1, fp1a_0 (p10) add s0 = pr1_1, acc0, 1 .mfi; (p11) add s0 = pr1_1, acc0 xma.hu fp2a_0 = u_0, v1, fp1a_0 nop 1 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 .mmi; getfsig pr1_3 = fp1b_3 st8 [rp] = s0, 8 (p6) cmp.leu p8, p9 = acc0, pr0_2 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; .pred.rel "mutex", p8, p9 .mfi; getfsig acc1_3 = fp2a_3 xma.l fp0b_2 = u_2, v0, r_2 nop 1 .mmf; (p8) add acc0 = pr0_3, acc1_1, 1 (p9) add acc0 = pr0_3, acc1_1 xma.hu fp1a_2 = u_2, v0, r_2 ;; L(cj4): .pred.rel "mutex", p12, p13 .mfi; getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = u_1, v1, fp1a_1 (p12) add s0 = pr1_2, acc0, 1 .mfi; (p13) add s0 = pr1_2, acc0 xma.hu fp2a_1 = u_1, v1, fp1a_1 nop 1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_0 = fp1b_0 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_3 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3 (p12) cmp.leu p10, p11 = s0, pr1_2 (p13) cmp.ltu p10, p11 = s0, pr1_2 ;; .pred.rel "mutex", p6, p7 .mmi; getfsig acc1_0 = fp2a_0 (p6) add acc0 = pr0_0, acc1_2, 1 (p7) add acc0 = pr0_0, acc1_2 ;; L(cj3): .pred.rel "mutex", p10, p11 .mfi; getfsig pr0_2 = fp0b_2 xma.l fp1b_2 = u_2, v1, fp1a_2 (p10) add s0 = pr1_3, acc0, 1 .mfi; (p11) add s0 = pr1_3, acc0 xma.hu fp2a_2 = u_2, v1, fp1a_2 nop 1 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 .mmi; getfsig pr1_1 = fp1b_1 st8 [rp] = s0, 8 (p6) cmp.leu p8, p9 = acc0, pr0_0 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0 (p10) cmp.leu p12, p13 = s0, pr1_3 (p11) cmp.ltu p12, p13 = s0, pr1_3 ;; .pred.rel "mutex", p8, p9 .mmi; getfsig acc1_1 = fp2a_1 (p8) add acc0 = pr0_1, acc1_3, 1 (p9) add acc0 = pr0_1, acc1_3 ;; .pred.rel "mutex", p12, p13 .mmi; (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 nop 1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_2 = fp1b_2 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_1 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; .pred.rel "mutex", p6, p7 .mmi; getfsig r8 = fp2a_2 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 ;; .pred.rel "mutex", p10, p11 .mmi; (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 (p6) cmp.leu p8, p9 = acc0, pr0_2 ;; .pred.rel "mutex", p10, p11 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; .pred.rel "mutex", p8, p9 .mmi; st8 [rp] = s0, 8 (p8) add acc0 = pr1_2, acc1_1, 1 (p9) add acc0 = pr1_2, acc1_1 ;; .pred.rel "mutex", p8, p9 .mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2 (p9) cmp.ltu p10, p11 = acc0, pr1_2 (p12) add acc0 = 1, acc0 ;; .mmi; st8 [rp] = acc0, 8 (p12) cmpeqor p10, p0 = 0, acc0 nop 1 ;; .mib; (p10) add r8 = 1, r8 mov ar.lc = r2 br.ret.sptk.many b0 @ 1.1.1.1.2.1 log @sync with head. for a reference, the tree before this commit was tagged as yamt-pagecache-tag8. this commit was splitted into small chunks to avoid a limitation of cvs. ("Protocol error: too many arguments") @ text @d4 1 a4 3 dnl Contributed to the GNU project by Torbjorn Granlund. dnl Copyright 2004, 2005, 2011 Free Software Foundation, Inc. d27 3 d33 4 a36 2 C * Clean up feed-in code to not require zeroing several registers. C * Make sure we don't depend on uninitialised predicate registers. a96 51 PROLOGUE(mpn_addmul_2s) .prologue .save ar.lc, r2 .body ifdef(`HAVE_ABI_32',` .mmi; addp4 rp = 0, rp C M I addp4 up = 0, up C M I addp4 vp = 0, vp C M I .mmi; nop 1 nop 1 zxt4 n = n C I ;;') .mmi; ldf8 ux = [up], 8 C M ldf8 v0 = [vp], 8 C M mov r2 = ar.lc C I0 .mmi; ldf8 rx = [rp], 8 C M and r14 = 3, n C M I add n = -2, n C M I ;; .mmi; ldf8 uy = [up], 8 C M ldf8 v1 = [vp] C M shr.u n = n, 2 C I0 .mmi; ldf8 ry = [rp], -8 C M cmp.eq p14, p0 = 1, r14 C M I cmp.eq p11, p0 = 2, r14 C M I ;; .mmi; add srp = 16, rp C M I cmp.eq p15, p0 = 3, r14 C M I mov ar.lc = n C I0 .bbb; (p14) br.dptk L(x01) C B (p11) br.dptk L(x10) C B (p15) br.dptk L(x11) C B ;; L(x00): cmp.ne p6, p0 = r0, r0 C suppress initial xma pair mov fp2a_3 = f0 br L(b00) L(x01): cmp.ne p14, p0 = r0, r0 C suppress initial xma pair mov fp2a_2 = f0 br L(b01) L(x10): cmp.ne p11, p0 = r0, r0 C suppress initial xma pair mov fp2a_1 = f0 br L(b10) L(x11): cmp.ne p15, p0 = r0, r0 C suppress initial xma pair mov fp2a_0 = f0 br L(b11) EPILOGUE() d102 5 a106 7 ifdef(`HAVE_ABI_32',` .mmi; addp4 rp = 0, rp C M I addp4 up = 0, up C M I addp4 vp = 0, vp C M I .mmi; nop 1 nop 1 zxt4 n = n C I d109 26 a134 20 .mmi; ldf8 ux = [up], 8 C M ldf8 v0 = [vp], 8 C M mov r2 = ar.lc C I0 .mmi; ldf8 rx = [rp], 8 C M and r14 = 3, n C M I add n = -2, n C M I ;; .mmi; ldf8 uy = [up], 8 C M ldf8 v1 = [vp] C M shr.u n = n, 2 C I0 .mmi; ldf8 ry = [rp], -8 C M cmp.eq p14, p0 = 1, r14 C M I cmp.eq p11, p0 = 2, r14 C M I ;; .mmi; add srp = 16, rp C M I cmp.eq p15, p6 = 3, r14 C M I mov ar.lc = n C I0 .bbb; (p14) br.dptk L(b01) C B (p11) br.dptk L(b10) C B (p15) br.dptk L(b11) C B d136 1 d139 58 a196 59 L(b00): .mmi; ldf8 r_1 = [srp], 8 ldf8 u_1 = [up], 8 mov acc1_2 = 0 .mmi; mov pr1_2 = 0 mov pr0_3 = 0 cmp.ne p8, p9 = r0, r0 ;; .mfi; ldf8 r_2 = [srp], 8 xma.l fp0b_3 = ux, v0, rx cmp.ne p12, p13 = r0, r0 .mfb; ldf8 u_2 = [up], 8 xma.hu fp1b_3 = ux, v0, rx br.cloop.dptk L(gt4) xma.l fp0b_0 = uy, v0, ry xma.hu fp1a_0 = uy, v0, ry ;; getfsig acc0 = fp0b_3 (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s ;; xma.l fp0b_1 = u_1, v0, r_1 xma.hu fp1a_1 = u_1, v0, r_1 ;; getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = uy, v1, fp1a_0 xma.hu fp2a_0 = uy, v1, fp1a_0 ;; getfsig pr1_3 = fp1b_3 getfsig acc1_3 = fp2a_3 xma.l fp0b_2 = u_2, v0, r_2 xma.hu fp1a_2 = u_2, v0, r_2 br L(cj4) L(gt4): xma.l fp0b_0 = uy, v0, ry xma.hu fp1a_0 = uy, v0, ry ;; ldf8 r_3 = [srp], 8 getfsig acc0 = fp0b_3 (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s ldf8 u_3 = [up], 8 (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s ;; xma.l fp0b_1 = u_1, v0, r_1 xma.hu fp1a_1 = u_1, v0, r_1 ;; ldf8 r_0 = [srp], 8 getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = uy, v1, fp1a_0 xma.hu fp2a_0 = uy, v1, fp1a_0 ;; ldf8 u_0 = [up], 8 getfsig pr1_3 = fp1b_3 xma.l fp0b_2 = u_2, v0, r_2 ;; getfsig acc1_3 = fp2a_3 xma.hu fp1a_2 = u_2, v0, r_2 br L(00) d200 51 a250 53 L(b01): .mmi; ldf8 r_0 = [srp], 8 C M ldf8 u_0 = [up], 8 C M mov acc1_1 = 0 C M I .mmi; mov pr1_1 = 0 C M I mov pr0_2 = 0 C M I cmp.ne p6, p7 = r0, r0 C M I ;; .mfi; ldf8 r_1 = [srp], 8 C M xma.l fp0b_2 = ux, v0, rx C F cmp.ne p10, p11 = r0, r0 C M I .mfi; ldf8 u_1 = [up], 8 C M xma.hu fp1b_2 = ux, v0, rx C F nop 1 ;; xma.l fp0b_3 = uy, v0, ry C F xma.hu fp1a_3 = uy, v0, ry C F ;; .mmf; getfsig acc0 = fp0b_2 C M ldf8 r_2 = [srp], 8 C M (p14) xma.hu fp2a_2 = ux, v1,fp1b_2 C F suppressed for addmul_2s .mfb; ldf8 u_2 = [up], 8 C M (p14) xma.l fp1b_2 = ux, v1,fp1b_2 C F suppressed for addmul_2s br.cloop.dptk L(gt5) xma.l fp0b_0 = u_0, v0, r_0 C F xma.hu fp1a_0 = u_0, v0, r_0 C F ;; getfsig pr0_3 = fp0b_3 C M xma.l fp1b_3 = uy, v1,fp1a_3 C F xma.hu fp2a_3 = uy, v1,fp1a_3 C F ;; getfsig pr1_2 = fp1b_2 C M getfsig acc1_2 = fp2a_2 C M xma.l fp0b_1 = u_1, v0, r_1 C F xma.hu fp1a_1 = u_1, v0, r_1 C F br L(cj5) L(gt5): xma.l fp0b_0 = u_0, v0, r_0 xma.hu fp1a_0 = u_0, v0, r_0 ;; getfsig pr0_3 = fp0b_3 ldf8 r_3 = [srp], 8 xma.l fp1b_3 = uy, v1, fp1a_3 xma.hu fp2a_3 = uy, v1, fp1a_3 ;; ldf8 u_3 = [up], 8 getfsig pr1_2 = fp1b_2 xma.l fp0b_1 = u_1, v0, r_1 ;; getfsig acc1_2 = fp2a_2 xma.hu fp1a_1 = u_1, v0, r_1 br L(01) d254 87 a340 77 L(b10): br.cloop.dptk L(gt2) xma.l fp0b_1 = ux, v0, rx xma.hu fp1b_1 = ux, v0, rx ;; xma.l fp0b_2 = uy, v0, ry xma.hu fp1a_2 = uy, v0, ry ;; stf8 [rp] = fp0b_1, 8 (p11) xma.hu fp2a_1 = ux, v1, fp1b_1 C suppressed for addmul_2s (p11) xma.l fp1b_1 = ux, v1, fp1b_1 C suppressed for addmul_2s ;; getfsig acc0 = fp0b_2 xma.l fp1b_2 = uy, v1, fp1a_2 xma.hu fp2a_2 = uy, v1, fp1a_2 ;; getfsig pr1_1 = fp1b_1 getfsig acc1_1 = fp2a_1 mov ar.lc = r2 getfsig pr1_2 = fp1b_2 getfsig r8 = fp2a_2 ;; add s0 = pr1_1, acc0 ;; st8 [rp] = s0, 8 cmp.ltu p8, p9 = s0, pr1_1 sub r31 = -1, acc1_1 ;; .pred.rel "mutex", p8, p9 (p8) add acc0 = pr1_2, acc1_1, 1 (p9) add acc0 = pr1_2, acc1_1 (p8) cmp.leu p10, p0 = r31, pr1_2 (p9) cmp.ltu p10, p0 = r31, pr1_2 ;; st8 [rp] = acc0, 8 (p10) add r8 = 1, r8 br.ret.sptk.many b0 L(gt2): .mmi; ldf8 r_3 = [srp], 8 ldf8 u_3 = [up], 8 mov acc1_0 = 0 ;; .mfi; ldf8 r_0 = [srp], 8 xma.l fp0b_1 = ux, v0, rx mov pr1_0 = 0 .mfi; ldf8 u_0 = [up], 8 xma.hu fp1b_1 = ux, v0, rx mov pr0_1 = 0 ;; xma.l fp0b_2 = uy, v0, ry xma.hu fp1a_2 = uy, v0, ry ;; getfsig acc0 = fp0b_1 ldf8 r_1 = [srp], 8 (p11) xma.hu fp2a_1 = ux, v1, fp1b_1 C suppressed for addmul_2s (p11) xma.l fp1b_1 = ux, v1, fp1b_1 C suppressed for addmul_2s ;; ldf8 u_1 = [up], 8 xma.l fp0b_3 = u_3, v0, r_3 xma.hu fp1a_3 = u_3, v0, r_3 ;; getfsig pr0_2 = fp0b_2 ldf8 r_2 = [srp], 8 xma.l fp1b_2 = uy, v1, fp1a_2 xma.hu fp2a_2 = uy, v1, fp1a_2 ;; ldf8 u_2 = [up], 8 getfsig pr1_1 = fp1b_1 ;; .mfi; getfsig acc1_1 = fp2a_1 xma.l fp0b_0 = u_0, v0, r_0 cmp.ne p8, p9 = r0, r0 .mfb; cmp.ne p12, p13 = r0, r0 xma.hu fp1a_0 = u_0, v0, r_0 br.cloop.sptk.clr L(top) br.many L(end) d344 61 a404 62 L(b11): ldf8 r_2 = [srp], 8 mov pr1_3 = 0 mov pr0_0 = 0 ;; ldf8 u_2 = [up], 8 mov acc1_3 = 0 br.cloop.dptk L(gt3) ;; cmp.ne p6, p7 = r0, r0 xma.l fp0b_0 = ux, v0, rx xma.hu fp1b_0 = ux, v0, rx ;; cmp.ne p10, p11 = r0, r0 xma.l fp0b_1 = uy, v0, ry xma.hu fp1a_1 = uy, v0, ry ;; getfsig acc0 = fp0b_0 (p15) xma.hu fp2a_0 = ux, v1, fp1b_0 C suppressed for addmul_2s (p15) xma.l fp1b_0 = ux, v1, fp1b_0 C suppressed for addmul_2s ;; xma.l fp0b_2 = uy, v1, r_2 xma.hu fp1a_2 = uy, v1, r_2 ;; getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = u_2, v0, fp1a_1 xma.hu fp2a_1 = u_2, v0, fp1a_1 ;; getfsig pr1_0 = fp1b_0 getfsig acc1_0 = fp2a_0 br L(cj3) L(gt3): ldf8 r_3 = [srp], 8 xma.l fp0b_0 = ux, v0, rx cmp.ne p10, p11 = r0, r0 ldf8 u_3 = [up], 8 xma.hu fp1b_0 = ux, v0, rx cmp.ne p6, p7 = r0, r0 ;; xma.l fp0b_1 = uy, v0, ry xma.hu fp1a_1 = uy, v0, ry ;; getfsig acc0 = fp0b_0 ldf8 r_0 = [srp], 8 (p15) xma.hu fp2a_0 = ux, v1, fp1b_0 C suppressed for addmul_2s ldf8 u_0 = [up], 8 (p15) xma.l fp1b_0 = ux, v1, fp1b_0 C suppressed for addmul_2s ;; xma.l fp0b_2 = u_2, v0, r_2 xma.hu fp1a_2 = u_2, v0, r_2 ;; getfsig pr0_1 = fp0b_1 ldf8 r_1 = [srp], 8 xma.l fp1b_1 = uy, v1, fp1a_1 xma.hu fp2a_1 = uy, v1, fp1a_1 ;; ldf8 u_1 = [up], 8 getfsig pr1_0 = fp1b_0 ;; getfsig acc1_0 = fp2a_0 xma.l fp0b_3 = u_3, v0, r_3 xma.hu fp1a_3 = u_3, v0, r_3 br L(11) d409 8 a416 8 L(top): C 00 .pred.rel "mutex", p12, p13 getfsig pr0_3 = fp0b_3 ldf8 r_3 = [srp], 8 xma.l fp1b_3 = u_3, v1, fp1a_3 (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 xma.hu fp2a_3 = u_3, v1, fp1a_3 d418 8 a425 8 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 ldf8 u_3 = [up], 8 getfsig pr1_2 = fp1b_2 (p8) cmp.leu p6, p7 = acc0, pr0_1 (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 d427 7 a433 7 .pred.rel "mutex", p6, p7 getfsig acc1_2 = fp2a_2 st8 [rp] = s0, 8 xma.l fp0b_1 = u_1, v0, r_1 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 xma.hu fp1a_1 = u_1, v0, r_1 d435 8 a442 8 L(01): .pred.rel "mutex", p10, p11 getfsig pr0_0 = fp0b_0 ldf8 r_0 = [srp], 8 xma.l fp1b_0 = u_0, v1, fp1a_0 (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 xma.hu fp2a_0 = u_0, v1, fp1a_0 d444 8 a451 8 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 ldf8 u_0 = [up], 8 getfsig pr1_3 = fp1b_3 (p6) cmp.leu p8, p9 = acc0, pr0_2 (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 d453 7 a459 7 .pred.rel "mutex", p8, p9 getfsig acc1_3 = fp2a_3 st8 [rp] = s0, 8 xma.l fp0b_2 = u_2, v0, r_2 (p8) add acc0 = pr0_3, acc1_1, 1 (p9) add acc0 = pr0_3, acc1_1 xma.hu fp1a_2 = u_2, v0, r_2 d461 8 a468 8 L(00): .pred.rel "mutex", p12, p13 getfsig pr0_1 = fp0b_1 ldf8 r_1 = [srp], 8 xma.l fp1b_1 = u_1, v1, fp1a_1 (p12) add s0 = pr1_2, acc0, 1 (p13) add s0 = pr1_2, acc0 xma.hu fp2a_1 = u_1, v1, fp1a_1 d470 8 a477 8 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 ldf8 u_1 = [up], 8 getfsig pr1_0 = fp1b_0 (p8) cmp.leu p6, p7 = acc0, pr0_3 (p9) cmp.ltu p6, p7 = acc0, pr0_3 (p12) cmp.leu p10, p11 = s0, pr1_2 (p13) cmp.ltu p10, p11 = s0, pr1_2 d479 7 a485 7 .pred.rel "mutex", p6, p7 getfsig acc1_0 = fp2a_0 st8 [rp] = s0, 8 xma.l fp0b_3 = u_3, v0, r_3 (p6) add acc0 = pr0_0, acc1_2, 1 (p7) add acc0 = pr0_0, acc1_2 xma.hu fp1a_3 = u_3, v0, r_3 d487 8 a494 8 L(11): .pred.rel "mutex", p10, p11 getfsig pr0_2 = fp0b_2 ldf8 r_2 = [srp], 8 xma.l fp1b_2 = u_2, v1, fp1a_2 (p10) add s0 = pr1_3, acc0, 1 (p11) add s0 = pr1_3, acc0 xma.hu fp2a_2 = u_2, v1, fp1a_2 d496 8 a503 8 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 ldf8 u_2 = [up], 8 getfsig pr1_1 = fp1b_1 (p6) cmp.leu p8, p9 = acc0, pr0_0 (p7) cmp.ltu p8, p9 = acc0, pr0_0 (p10) cmp.leu p12, p13 = s0, pr1_3 (p11) cmp.ltu p12, p13 = s0, pr1_3 d505 8 a512 8 .pred.rel "mutex", p8, p9 getfsig acc1_1 = fp2a_1 st8 [rp] = s0, 8 xma.l fp0b_0 = u_0, v0, r_0 (p8) add acc0 = pr0_1, acc1_3, 1 (p9) add acc0 = pr0_1, acc1_3 xma.hu fp1a_0 = u_0, v0, r_0 L(10): br.cloop.sptk.clr L(top) C 12 d515 141 a655 144 L(end): .pred.rel "mutex", p12, p13 .mfi; getfsig pr0_3 = fp0b_3 xma.l fp1b_3 = u_3, v1, fp1a_3 (p12) add s0 = pr1_0, acc0, 1 .mfi; (p13) add s0 = pr1_0, acc0 xma.hu fp2a_3 = u_3, v1, fp1a_3 nop 1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_2 = fp1b_2 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_1 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; .pred.rel "mutex", p6, p7 .mfi; getfsig acc1_2 = fp2a_2 xma.l fp0b_1 = u_1, v0, r_1 nop 1 .mmf; (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 xma.hu fp1a_1 = u_1, v0, r_1 ;; L(cj5): .pred.rel "mutex", p10, p11 .mfi; getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = u_0, v1, fp1a_0 (p10) add s0 = pr1_1, acc0, 1 .mfi; (p11) add s0 = pr1_1, acc0 xma.hu fp2a_0 = u_0, v1, fp1a_0 nop 1 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 .mmi; getfsig pr1_3 = fp1b_3 st8 [rp] = s0, 8 (p6) cmp.leu p8, p9 = acc0, pr0_2 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; .pred.rel "mutex", p8, p9 .mfi; getfsig acc1_3 = fp2a_3 xma.l fp0b_2 = u_2, v0, r_2 nop 1 .mmf; (p8) add acc0 = pr0_3, acc1_1, 1 (p9) add acc0 = pr0_3, acc1_1 xma.hu fp1a_2 = u_2, v0, r_2 ;; L(cj4): .pred.rel "mutex", p12, p13 .mfi; getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = u_1, v1, fp1a_1 (p12) add s0 = pr1_2, acc0, 1 .mfi; (p13) add s0 = pr1_2, acc0 xma.hu fp2a_1 = u_1, v1, fp1a_1 nop 1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_0 = fp1b_0 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_3 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3 (p12) cmp.leu p10, p11 = s0, pr1_2 (p13) cmp.ltu p10, p11 = s0, pr1_2 ;; .pred.rel "mutex", p6, p7 .mmi; getfsig acc1_0 = fp2a_0 (p6) add acc0 = pr0_0, acc1_2, 1 (p7) add acc0 = pr0_0, acc1_2 ;; L(cj3): .pred.rel "mutex", p10, p11 .mfi; getfsig pr0_2 = fp0b_2 xma.l fp1b_2 = u_2, v1, fp1a_2 (p10) add s0 = pr1_3, acc0, 1 .mfi; (p11) add s0 = pr1_3, acc0 xma.hu fp2a_2 = u_2, v1, fp1a_2 nop 1 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 .mmi; getfsig pr1_1 = fp1b_1 st8 [rp] = s0, 8 (p6) cmp.leu p8, p9 = acc0, pr0_0 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0 (p10) cmp.leu p12, p13 = s0, pr1_3 (p11) cmp.ltu p12, p13 = s0, pr1_3 ;; .pred.rel "mutex", p8, p9 .mmi; getfsig acc1_1 = fp2a_1 (p8) add acc0 = pr0_1, acc1_3, 1 (p9) add acc0 = pr0_1, acc1_3 ;; .pred.rel "mutex", p12, p13 .mmi; (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 nop 1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_2 = fp1b_2 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_1 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; .pred.rel "mutex", p6, p7 .mmi; getfsig r8 = fp2a_2 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 ;; .pred.rel "mutex", p10, p11 .mmi; (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 (p6) cmp.leu p8, p9 = acc0, pr0_2 ;; .pred.rel "mutex", p10, p11 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; .pred.rel "mutex", p8, p9 .mmi; st8 [rp] = s0, 8 (p8) add acc0 = pr1_2, acc1_1, 1 (p9) add acc0 = pr1_2, acc1_1 ;; .pred.rel "mutex", p8, p9 .mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2 (p9) cmp.ltu p10, p11 = acc0, pr1_2 (p12) add acc0 = 1, acc0 ;; .mmi; st8 [rp] = acc0, 8 (p12) cmpeqor p10, p0 = 0, acc0 nop 1 ;; .mib; (p10) add r8 = 1, r8 mov ar.lc = r2 br.ret.sptk.many b0 @ 1.1.1.2 log @initial import GMP 5.1.3 sources. changes include: fixes for: - mpn_sbpi1_div_qr_sec and mpn_sbpi1_div_r_sec - mpz_powm_ui - AMD family 11h - mpz_powm_sec and mpn_powm_sec - ASSERT() fixes - gcd, gcdext, and invert function fixes - some PPC division operations @ text @d4 1 a4 3 dnl Contributed to the GNU project by Torbjorn Granlund. dnl Copyright 2004, 2005, 2011 Free Software Foundation, Inc. d27 3 d33 4 a36 2 C * Clean up feed-in code to not require zeroing several registers. C * Make sure we don't depend on uninitialised predicate registers. a96 51 PROLOGUE(mpn_addmul_2s) .prologue .save ar.lc, r2 .body ifdef(`HAVE_ABI_32',` .mmi; addp4 rp = 0, rp C M I addp4 up = 0, up C M I addp4 vp = 0, vp C M I .mmi; nop 1 nop 1 zxt4 n = n C I ;;') .mmi; ldf8 ux = [up], 8 C M ldf8 v0 = [vp], 8 C M mov r2 = ar.lc C I0 .mmi; ldf8 rx = [rp], 8 C M and r14 = 3, n C M I add n = -2, n C M I ;; .mmi; ldf8 uy = [up], 8 C M ldf8 v1 = [vp] C M shr.u n = n, 2 C I0 .mmi; ldf8 ry = [rp], -8 C M cmp.eq p14, p0 = 1, r14 C M I cmp.eq p11, p0 = 2, r14 C M I ;; .mmi; add srp = 16, rp C M I cmp.eq p15, p0 = 3, r14 C M I mov ar.lc = n C I0 .bbb; (p14) br.dptk L(x01) C B (p11) br.dptk L(x10) C B (p15) br.dptk L(x11) C B ;; L(x00): cmp.ne p6, p0 = r0, r0 C suppress initial xma pair mov fp2a_3 = f0 br L(b00) L(x01): cmp.ne p14, p0 = r0, r0 C suppress initial xma pair mov fp2a_2 = f0 br L(b01) L(x10): cmp.ne p11, p0 = r0, r0 C suppress initial xma pair mov fp2a_1 = f0 br L(b10) L(x11): cmp.ne p15, p0 = r0, r0 C suppress initial xma pair mov fp2a_0 = f0 br L(b11) EPILOGUE() d102 5 a106 7 ifdef(`HAVE_ABI_32',` .mmi; addp4 rp = 0, rp C M I addp4 up = 0, up C M I addp4 vp = 0, vp C M I .mmi; nop 1 nop 1 zxt4 n = n C I d109 26 a134 20 .mmi; ldf8 ux = [up], 8 C M ldf8 v0 = [vp], 8 C M mov r2 = ar.lc C I0 .mmi; ldf8 rx = [rp], 8 C M and r14 = 3, n C M I add n = -2, n C M I ;; .mmi; ldf8 uy = [up], 8 C M ldf8 v1 = [vp] C M shr.u n = n, 2 C I0 .mmi; ldf8 ry = [rp], -8 C M cmp.eq p14, p0 = 1, r14 C M I cmp.eq p11, p0 = 2, r14 C M I ;; .mmi; add srp = 16, rp C M I cmp.eq p15, p6 = 3, r14 C M I mov ar.lc = n C I0 .bbb; (p14) br.dptk L(b01) C B (p11) br.dptk L(b10) C B (p15) br.dptk L(b11) C B d136 1 d139 58 a196 59 L(b00): .mmi; ldf8 r_1 = [srp], 8 ldf8 u_1 = [up], 8 mov acc1_2 = 0 .mmi; mov pr1_2 = 0 mov pr0_3 = 0 cmp.ne p8, p9 = r0, r0 ;; .mfi; ldf8 r_2 = [srp], 8 xma.l fp0b_3 = ux, v0, rx cmp.ne p12, p13 = r0, r0 .mfb; ldf8 u_2 = [up], 8 xma.hu fp1b_3 = ux, v0, rx br.cloop.dptk L(gt4) xma.l fp0b_0 = uy, v0, ry xma.hu fp1a_0 = uy, v0, ry ;; getfsig acc0 = fp0b_3 (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s ;; xma.l fp0b_1 = u_1, v0, r_1 xma.hu fp1a_1 = u_1, v0, r_1 ;; getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = uy, v1, fp1a_0 xma.hu fp2a_0 = uy, v1, fp1a_0 ;; getfsig pr1_3 = fp1b_3 getfsig acc1_3 = fp2a_3 xma.l fp0b_2 = u_2, v0, r_2 xma.hu fp1a_2 = u_2, v0, r_2 br L(cj4) L(gt4): xma.l fp0b_0 = uy, v0, ry xma.hu fp1a_0 = uy, v0, ry ;; ldf8 r_3 = [srp], 8 getfsig acc0 = fp0b_3 (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s ldf8 u_3 = [up], 8 (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s ;; xma.l fp0b_1 = u_1, v0, r_1 xma.hu fp1a_1 = u_1, v0, r_1 ;; ldf8 r_0 = [srp], 8 getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = uy, v1, fp1a_0 xma.hu fp2a_0 = uy, v1, fp1a_0 ;; ldf8 u_0 = [up], 8 getfsig pr1_3 = fp1b_3 xma.l fp0b_2 = u_2, v0, r_2 ;; getfsig acc1_3 = fp2a_3 xma.hu fp1a_2 = u_2, v0, r_2 br L(00) d200 51 a250 53 L(b01): .mmi; ldf8 r_0 = [srp], 8 C M ldf8 u_0 = [up], 8 C M mov acc1_1 = 0 C M I .mmi; mov pr1_1 = 0 C M I mov pr0_2 = 0 C M I cmp.ne p6, p7 = r0, r0 C M I ;; .mfi; ldf8 r_1 = [srp], 8 C M xma.l fp0b_2 = ux, v0, rx C F cmp.ne p10, p11 = r0, r0 C M I .mfi; ldf8 u_1 = [up], 8 C M xma.hu fp1b_2 = ux, v0, rx C F nop 1 ;; xma.l fp0b_3 = uy, v0, ry C F xma.hu fp1a_3 = uy, v0, ry C F ;; .mmf; getfsig acc0 = fp0b_2 C M ldf8 r_2 = [srp], 8 C M (p14) xma.hu fp2a_2 = ux, v1,fp1b_2 C F suppressed for addmul_2s .mfb; ldf8 u_2 = [up], 8 C M (p14) xma.l fp1b_2 = ux, v1,fp1b_2 C F suppressed for addmul_2s br.cloop.dptk L(gt5) xma.l fp0b_0 = u_0, v0, r_0 C F xma.hu fp1a_0 = u_0, v0, r_0 C F ;; getfsig pr0_3 = fp0b_3 C M xma.l fp1b_3 = uy, v1,fp1a_3 C F xma.hu fp2a_3 = uy, v1,fp1a_3 C F ;; getfsig pr1_2 = fp1b_2 C M getfsig acc1_2 = fp2a_2 C M xma.l fp0b_1 = u_1, v0, r_1 C F xma.hu fp1a_1 = u_1, v0, r_1 C F br L(cj5) L(gt5): xma.l fp0b_0 = u_0, v0, r_0 xma.hu fp1a_0 = u_0, v0, r_0 ;; getfsig pr0_3 = fp0b_3 ldf8 r_3 = [srp], 8 xma.l fp1b_3 = uy, v1, fp1a_3 xma.hu fp2a_3 = uy, v1, fp1a_3 ;; ldf8 u_3 = [up], 8 getfsig pr1_2 = fp1b_2 xma.l fp0b_1 = u_1, v0, r_1 ;; getfsig acc1_2 = fp2a_2 xma.hu fp1a_1 = u_1, v0, r_1 br L(01) d254 87 a340 77 L(b10): br.cloop.dptk L(gt2) xma.l fp0b_1 = ux, v0, rx xma.hu fp1b_1 = ux, v0, rx ;; xma.l fp0b_2 = uy, v0, ry xma.hu fp1a_2 = uy, v0, ry ;; stf8 [rp] = fp0b_1, 8 (p11) xma.hu fp2a_1 = ux, v1, fp1b_1 C suppressed for addmul_2s (p11) xma.l fp1b_1 = ux, v1, fp1b_1 C suppressed for addmul_2s ;; getfsig acc0 = fp0b_2 xma.l fp1b_2 = uy, v1, fp1a_2 xma.hu fp2a_2 = uy, v1, fp1a_2 ;; getfsig pr1_1 = fp1b_1 getfsig acc1_1 = fp2a_1 mov ar.lc = r2 getfsig pr1_2 = fp1b_2 getfsig r8 = fp2a_2 ;; add s0 = pr1_1, acc0 ;; st8 [rp] = s0, 8 cmp.ltu p8, p9 = s0, pr1_1 sub r31 = -1, acc1_1 ;; .pred.rel "mutex", p8, p9 (p8) add acc0 = pr1_2, acc1_1, 1 (p9) add acc0 = pr1_2, acc1_1 (p8) cmp.leu p10, p0 = r31, pr1_2 (p9) cmp.ltu p10, p0 = r31, pr1_2 ;; st8 [rp] = acc0, 8 (p10) add r8 = 1, r8 br.ret.sptk.many b0 L(gt2): .mmi; ldf8 r_3 = [srp], 8 ldf8 u_3 = [up], 8 mov acc1_0 = 0 ;; .mfi; ldf8 r_0 = [srp], 8 xma.l fp0b_1 = ux, v0, rx mov pr1_0 = 0 .mfi; ldf8 u_0 = [up], 8 xma.hu fp1b_1 = ux, v0, rx mov pr0_1 = 0 ;; xma.l fp0b_2 = uy, v0, ry xma.hu fp1a_2 = uy, v0, ry ;; getfsig acc0 = fp0b_1 ldf8 r_1 = [srp], 8 (p11) xma.hu fp2a_1 = ux, v1, fp1b_1 C suppressed for addmul_2s (p11) xma.l fp1b_1 = ux, v1, fp1b_1 C suppressed for addmul_2s ;; ldf8 u_1 = [up], 8 xma.l fp0b_3 = u_3, v0, r_3 xma.hu fp1a_3 = u_3, v0, r_3 ;; getfsig pr0_2 = fp0b_2 ldf8 r_2 = [srp], 8 xma.l fp1b_2 = uy, v1, fp1a_2 xma.hu fp2a_2 = uy, v1, fp1a_2 ;; ldf8 u_2 = [up], 8 getfsig pr1_1 = fp1b_1 ;; .mfi; getfsig acc1_1 = fp2a_1 xma.l fp0b_0 = u_0, v0, r_0 cmp.ne p8, p9 = r0, r0 .mfb; cmp.ne p12, p13 = r0, r0 xma.hu fp1a_0 = u_0, v0, r_0 br.cloop.sptk.clr L(top) br.many L(end) d344 61 a404 62 L(b11): ldf8 r_2 = [srp], 8 mov pr1_3 = 0 mov pr0_0 = 0 ;; ldf8 u_2 = [up], 8 mov acc1_3 = 0 br.cloop.dptk L(gt3) ;; cmp.ne p6, p7 = r0, r0 xma.l fp0b_0 = ux, v0, rx xma.hu fp1b_0 = ux, v0, rx ;; cmp.ne p10, p11 = r0, r0 xma.l fp0b_1 = uy, v0, ry xma.hu fp1a_1 = uy, v0, ry ;; getfsig acc0 = fp0b_0 (p15) xma.hu fp2a_0 = ux, v1, fp1b_0 C suppressed for addmul_2s (p15) xma.l fp1b_0 = ux, v1, fp1b_0 C suppressed for addmul_2s ;; xma.l fp0b_2 = uy, v1, r_2 xma.hu fp1a_2 = uy, v1, r_2 ;; getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = u_2, v0, fp1a_1 xma.hu fp2a_1 = u_2, v0, fp1a_1 ;; getfsig pr1_0 = fp1b_0 getfsig acc1_0 = fp2a_0 br L(cj3) L(gt3): ldf8 r_3 = [srp], 8 xma.l fp0b_0 = ux, v0, rx cmp.ne p10, p11 = r0, r0 ldf8 u_3 = [up], 8 xma.hu fp1b_0 = ux, v0, rx cmp.ne p6, p7 = r0, r0 ;; xma.l fp0b_1 = uy, v0, ry xma.hu fp1a_1 = uy, v0, ry ;; getfsig acc0 = fp0b_0 ldf8 r_0 = [srp], 8 (p15) xma.hu fp2a_0 = ux, v1, fp1b_0 C suppressed for addmul_2s ldf8 u_0 = [up], 8 (p15) xma.l fp1b_0 = ux, v1, fp1b_0 C suppressed for addmul_2s ;; xma.l fp0b_2 = u_2, v0, r_2 xma.hu fp1a_2 = u_2, v0, r_2 ;; getfsig pr0_1 = fp0b_1 ldf8 r_1 = [srp], 8 xma.l fp1b_1 = uy, v1, fp1a_1 xma.hu fp2a_1 = uy, v1, fp1a_1 ;; ldf8 u_1 = [up], 8 getfsig pr1_0 = fp1b_0 ;; getfsig acc1_0 = fp2a_0 xma.l fp0b_3 = u_3, v0, r_3 xma.hu fp1a_3 = u_3, v0, r_3 br L(11) d409 8 a416 8 L(top): C 00 .pred.rel "mutex", p12, p13 getfsig pr0_3 = fp0b_3 ldf8 r_3 = [srp], 8 xma.l fp1b_3 = u_3, v1, fp1a_3 (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 xma.hu fp2a_3 = u_3, v1, fp1a_3 d418 8 a425 8 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 ldf8 u_3 = [up], 8 getfsig pr1_2 = fp1b_2 (p8) cmp.leu p6, p7 = acc0, pr0_1 (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 d427 7 a433 7 .pred.rel "mutex", p6, p7 getfsig acc1_2 = fp2a_2 st8 [rp] = s0, 8 xma.l fp0b_1 = u_1, v0, r_1 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 xma.hu fp1a_1 = u_1, v0, r_1 d435 8 a442 8 L(01): .pred.rel "mutex", p10, p11 getfsig pr0_0 = fp0b_0 ldf8 r_0 = [srp], 8 xma.l fp1b_0 = u_0, v1, fp1a_0 (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 xma.hu fp2a_0 = u_0, v1, fp1a_0 d444 8 a451 8 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 ldf8 u_0 = [up], 8 getfsig pr1_3 = fp1b_3 (p6) cmp.leu p8, p9 = acc0, pr0_2 (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 d453 7 a459 7 .pred.rel "mutex", p8, p9 getfsig acc1_3 = fp2a_3 st8 [rp] = s0, 8 xma.l fp0b_2 = u_2, v0, r_2 (p8) add acc0 = pr0_3, acc1_1, 1 (p9) add acc0 = pr0_3, acc1_1 xma.hu fp1a_2 = u_2, v0, r_2 d461 8 a468 8 L(00): .pred.rel "mutex", p12, p13 getfsig pr0_1 = fp0b_1 ldf8 r_1 = [srp], 8 xma.l fp1b_1 = u_1, v1, fp1a_1 (p12) add s0 = pr1_2, acc0, 1 (p13) add s0 = pr1_2, acc0 xma.hu fp2a_1 = u_1, v1, fp1a_1 d470 8 a477 8 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 ldf8 u_1 = [up], 8 getfsig pr1_0 = fp1b_0 (p8) cmp.leu p6, p7 = acc0, pr0_3 (p9) cmp.ltu p6, p7 = acc0, pr0_3 (p12) cmp.leu p10, p11 = s0, pr1_2 (p13) cmp.ltu p10, p11 = s0, pr1_2 d479 7 a485 7 .pred.rel "mutex", p6, p7 getfsig acc1_0 = fp2a_0 st8 [rp] = s0, 8 xma.l fp0b_3 = u_3, v0, r_3 (p6) add acc0 = pr0_0, acc1_2, 1 (p7) add acc0 = pr0_0, acc1_2 xma.hu fp1a_3 = u_3, v0, r_3 d487 8 a494 8 L(11): .pred.rel "mutex", p10, p11 getfsig pr0_2 = fp0b_2 ldf8 r_2 = [srp], 8 xma.l fp1b_2 = u_2, v1, fp1a_2 (p10) add s0 = pr1_3, acc0, 1 (p11) add s0 = pr1_3, acc0 xma.hu fp2a_2 = u_2, v1, fp1a_2 d496 8 a503 8 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 ldf8 u_2 = [up], 8 getfsig pr1_1 = fp1b_1 (p6) cmp.leu p8, p9 = acc0, pr0_0 (p7) cmp.ltu p8, p9 = acc0, pr0_0 (p10) cmp.leu p12, p13 = s0, pr1_3 (p11) cmp.ltu p12, p13 = s0, pr1_3 d505 8 a512 8 .pred.rel "mutex", p8, p9 getfsig acc1_1 = fp2a_1 st8 [rp] = s0, 8 xma.l fp0b_0 = u_0, v0, r_0 (p8) add acc0 = pr0_1, acc1_3, 1 (p9) add acc0 = pr0_1, acc1_3 xma.hu fp1a_0 = u_0, v0, r_0 L(10): br.cloop.sptk.clr L(top) C 12 d515 141 a655 144 L(end): .pred.rel "mutex", p12, p13 .mfi; getfsig pr0_3 = fp0b_3 xma.l fp1b_3 = u_3, v1, fp1a_3 (p12) add s0 = pr1_0, acc0, 1 .mfi; (p13) add s0 = pr1_0, acc0 xma.hu fp2a_3 = u_3, v1, fp1a_3 nop 1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_2 = fp1b_2 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_1 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; .pred.rel "mutex", p6, p7 .mfi; getfsig acc1_2 = fp2a_2 xma.l fp0b_1 = u_1, v0, r_1 nop 1 .mmf; (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 xma.hu fp1a_1 = u_1, v0, r_1 ;; L(cj5): .pred.rel "mutex", p10, p11 .mfi; getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = u_0, v1, fp1a_0 (p10) add s0 = pr1_1, acc0, 1 .mfi; (p11) add s0 = pr1_1, acc0 xma.hu fp2a_0 = u_0, v1, fp1a_0 nop 1 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 .mmi; getfsig pr1_3 = fp1b_3 st8 [rp] = s0, 8 (p6) cmp.leu p8, p9 = acc0, pr0_2 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; .pred.rel "mutex", p8, p9 .mfi; getfsig acc1_3 = fp2a_3 xma.l fp0b_2 = u_2, v0, r_2 nop 1 .mmf; (p8) add acc0 = pr0_3, acc1_1, 1 (p9) add acc0 = pr0_3, acc1_1 xma.hu fp1a_2 = u_2, v0, r_2 ;; L(cj4): .pred.rel "mutex", p12, p13 .mfi; getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = u_1, v1, fp1a_1 (p12) add s0 = pr1_2, acc0, 1 .mfi; (p13) add s0 = pr1_2, acc0 xma.hu fp2a_1 = u_1, v1, fp1a_1 nop 1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_0 = fp1b_0 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_3 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3 (p12) cmp.leu p10, p11 = s0, pr1_2 (p13) cmp.ltu p10, p11 = s0, pr1_2 ;; .pred.rel "mutex", p6, p7 .mmi; getfsig acc1_0 = fp2a_0 (p6) add acc0 = pr0_0, acc1_2, 1 (p7) add acc0 = pr0_0, acc1_2 ;; L(cj3): .pred.rel "mutex", p10, p11 .mfi; getfsig pr0_2 = fp0b_2 xma.l fp1b_2 = u_2, v1, fp1a_2 (p10) add s0 = pr1_3, acc0, 1 .mfi; (p11) add s0 = pr1_3, acc0 xma.hu fp2a_2 = u_2, v1, fp1a_2 nop 1 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 .mmi; getfsig pr1_1 = fp1b_1 st8 [rp] = s0, 8 (p6) cmp.leu p8, p9 = acc0, pr0_0 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0 (p10) cmp.leu p12, p13 = s0, pr1_3 (p11) cmp.ltu p12, p13 = s0, pr1_3 ;; .pred.rel "mutex", p8, p9 .mmi; getfsig acc1_1 = fp2a_1 (p8) add acc0 = pr0_1, acc1_3, 1 (p9) add acc0 = pr0_1, acc1_3 ;; .pred.rel "mutex", p12, p13 .mmi; (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 nop 1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_2 = fp1b_2 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_1 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; .pred.rel "mutex", p6, p7 .mmi; getfsig r8 = fp2a_2 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 ;; .pred.rel "mutex", p10, p11 .mmi; (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 (p6) cmp.leu p8, p9 = acc0, pr0_2 ;; .pred.rel "mutex", p10, p11 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; .pred.rel "mutex", p8, p9 .mmi; st8 [rp] = s0, 8 (p8) add acc0 = pr1_2, acc1_1, 1 (p9) add acc0 = pr1_2, acc1_1 ;; .pred.rel "mutex", p8, p9 .mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2 (p9) cmp.ltu p10, p11 = acc0, pr1_2 (p12) add acc0 = 1, acc0 ;; .mmi; st8 [rp] = acc0, 8 (p12) cmpeqor p10, p0 = 0, acc0 nop 1 ;; .mib; (p10) add r8 = 1, r8 mov ar.lc = r2 br.ret.sptk.many b0 @ 1.1.1.3 log @initial import of GMP 6.1.2. main changes from 5.1.3 below. notes: - support for thumb-less ARM chips was in our port of 5.1.3, but a similar method has been provided upstream now - someone should look at the AVX failure reports, and fix them Changes between GMP version 6.1.0 and 6.1.1 FEATURES * Work around faulty cpuid on some recent Intel chips (this allows GMP to run on Skylake Pentiums). * Support thumb-less ARM chips. Changes between GMP version 6.0.* and 6.1.0 BUGS FIXED * The public function mpn_com is now correctly declared in gmp.h. * Healed possible failures of mpn_sec_sqr for non-cryptographic sizes for some obsolete CPUs. * Various problems related to precision for mpf have been fixed. * Fixed ABI incompatible stack alignment in calls from assembly code. * Fixed PIC bug in popcount affecting Intel processors using the 32-bit ABI. SPEEDUPS * Speedup for Intel Broadwell and Skylake through assembly code making use of new ADX instructions. * Square root is now faster when the remainder is not needed. Also the speed to compute the k-th root improved, for small sizes. FEATURES * New C++ functions gcd and lcm for mpz_class. * New public mpn functions mpn_divexact_1, mpn_zero_p, and mpn_cnd_swap. * New public mpq_cmp_z function, to efficiently compare rationals with integers. * Support for more 32-bit arm processors. * Support for AVX-less modern x86 CPUs. (Such support might be missing either because the CPU vendor chose to disable AVX, or because the running kernel lacks AVX context switch support.) * Support for NetBSD under Xen; we switch off AVX unconditionally under NetBSD since a bug in NetBSD makes AVX fail under Xen. MISC * Tuned values for FFT multiplications are provided for larger number on many platforms. Changes between GMP version 5.1.* and 6.0.0 BUGS FIXED * The function mpz_invert now considers any number invertible in Z/1Z. * The mpn multiply code now handles operands of more than 2^31 limbs correctly. (Note however that the mpz code is limited to 2^32 bits on 32-bit hosts and 2^37 bits on 64-bit hosts.) SPEEDUPS * Plain division of large operands is faster and more monotonous in operand size. * Major speedup for ARM, in particular ARM Cortex-A15, thanks to improved assembly. * Speedup for Intel Sandy Bridge, Ivy Bridge, Haswell, thanks to rewritten and vastly expanded assembly support. Speedup also for the older Core 2 and Nehalem. * Faster mixed arithmetic between mpq_class and double. FEATURES * Support for new Intel and AMD CPUs. * New public functions mpn_sec_mul and mpn_sec_sqr, implementing side-channel silent multiplication and squaring. * New public functions mpn_sec_div_qr and mpn_sec_div_r, implementing side-channel silent division. * New public functions mpn_cnd_add_n and mpn_cnd_sub_n. Side-channel silent conditional addition and subtraction. * New public function mpn_sec_powm, implementing side-channel silent modexp. * New public function mpn_sec_invert, implementing side-channel silent modular inversion. * Better support for applications which use the mpz_t type, but nevertheless need to call some of the lower-level mpn functions. See the documentation for mpz_limbs_read and related functions. @ text @d9 1 a9 1 dnl d11 4 a14 14 dnl it under the terms of either: dnl dnl * the GNU Lesser General Public License as published by the Free dnl Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl or dnl dnl * the GNU General Public License as published by the Free Software dnl Foundation; either version 2 of the License, or (at your option) any dnl later version. dnl dnl or both in parallel, as here. dnl d17 5 a21 6 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License dnl for more details. dnl dnl You should have received copies of the GNU General Public License and the dnl GNU Lesser General Public License along with the GNU MP Library. If not, dnl see https://www.gnu.org/licenses/. d100 1 a100 1 {.mmi; addp4 rp = 0, rp C M I d103 1 a103 1 }{.mmi; nop 1 d106 1 a106 2 ;; }') d108 1 a108 1 {.mmi; ldf8 ux = [up], 8 C M d111 1 a111 1 }{.mmi; ldf8 rx = [rp], 8 C M d115 1 a115 1 }{.mmi; ldf8 uy = [up], 8 C M d118 1 a118 1 }{.mmi; ldf8 ry = [rp], -8 C M d122 1 a122 1 }{.mmi; add srp = 16, rp C M I d125 1 a125 1 }{.bbb; (p14) br.dptk L(x01) C B d129 1 a129 1 } d151 1 a151 1 {.mmi; addp4 rp = 0, rp C M I d154 1 a154 1 }{.mmi; nop 1 d157 1 a157 2 ;; }') d159 1 a159 1 {.mmi; ldf8 ux = [up], 8 C M d162 1 a162 1 }{.mmi; ldf8 rx = [rp], 8 C M d166 1 a166 1 }{.mmi; ldf8 uy = [up], 8 C M d169 1 a169 1 }{.mmi; ldf8 ry = [rp], -8 C M d173 1 a173 1 }{.mmi; add srp = 16, rp C M I d176 1 a176 1 }{.bbb; (p14) br.dptk L(b01) C B d180 1 a180 1 } d183 1 a183 1 {.mmi; ldf8 r_1 = [srp], 8 d186 1 a186 1 }{.mmi; mov pr1_2 = 0 d190 1 a190 1 }{.mfi; ldf8 r_2 = [srp], 8 d193 1 a193 1 }{.mfb; ldf8 u_2 = [up], 8 d196 1 a196 1 } d245 1 a245 1 {.mmi; ldf8 r_0 = [srp], 8 C M d248 1 a248 1 }{.mmi; mov pr1_1 = 0 C M I d252 1 a252 1 }{.mfi; ldf8 r_1 = [srp], 8 C M d255 1 a255 1 }{.mfi; ldf8 u_1 = [up], 8 C M d259 1 a259 1 } xma.l fp0b_3 = uy, v0, ry C F d262 1 a262 1 {.mmf; getfsig acc0 = fp0b_2 C M d265 1 a265 1 }{.mfb; ldf8 u_2 = [up], 8 C M d268 1 a268 1 } d327 1 a327 1 .pred.rel "mutex", p8, p9 d339 1 a339 1 {.mmi; ldf8 r_3 = [srp], 8 d343 1 a343 1 }{.mfi; ldf8 r_0 = [srp], 8 d346 1 a346 1 }{.mfi; ldf8 u_0 = [up], 8 d350 1 a350 1 } xma.l fp0b_2 = uy, v0, ry d370 1 a370 1 {.mfi; getfsig acc1_1 = fp2a_1 d373 1 a373 1 }{.mfb; cmp.ne p12, p13 = r0, r0 a375 1 } d447 1 a447 1 .pred.rel "mutex", p12, p13 d455 2 a456 2 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 d464 1 a464 1 .pred.rel "mutex", p6, p7 d473 1 a473 1 .pred.rel "mutex", p10, p11 d481 2 a482 2 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 d490 1 a490 1 .pred.rel "mutex", p8, p9 d499 1 a499 1 .pred.rel "mutex", p12, p13 d507 2 a508 2 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 d516 1 a516 1 .pred.rel "mutex", p6, p7 d525 1 a525 1 .pred.rel "mutex", p10, p11 d533 2 a534 2 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 d542 1 a542 1 .pred.rel "mutex", p8, p9 d549 1 a549 1 L(10): br.cloop.sptk.clr L(top) C 12 d553 2 a554 2 .pred.rel "mutex", p12, p13 {.mfi; getfsig pr0_3 = fp0b_3 d557 1 a557 1 }{.mfi; (p13) add s0 = pr1_0, acc0 d561 3 a563 3 } .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 {.mmi; getfsig pr1_2 = fp1b_2 d566 1 a566 1 }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 d570 2 a571 2 } .pred.rel "mutex", p6, p7 {.mfi; getfsig acc1_2 = fp2a_2 d574 1 a574 1 }{.mmf; (p6) add acc0 = pr0_2, acc1_0, 1 a577 1 } d579 2 a580 2 .pred.rel "mutex", p10, p11 {.mfi; getfsig pr0_0 = fp0b_0 d583 1 a583 1 }{.mfi; (p11) add s0 = pr1_1, acc0 d587 4 a590 4 } .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 {.mmi; getfsig pr1_3 = fp1b_3 st8 [rp] = s0, 8 d592 1 a592 1 }{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 d596 2 a597 2 } .pred.rel "mutex", p8, p9 {.mfi; getfsig acc1_3 = fp2a_3 d600 1 a600 1 }{.mmf; (p8) add acc0 = pr0_3, acc1_1, 1 a603 1 } d605 2 a606 2 .pred.rel "mutex", p12, p13 {.mfi; getfsig pr0_1 = fp0b_1 d609 1 a609 1 }{.mfi; (p13) add s0 = pr1_2, acc0 d613 3 a615 3 } .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 {.mmi; getfsig pr1_0 = fp1b_0 d618 1 a618 1 }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3 d622 2 a623 2 } .pred.rel "mutex", p6, p7 {.mmi; getfsig acc1_0 = fp2a_0 a626 1 } d628 2 a629 2 .pred.rel "mutex", p10, p11 {.mfi; getfsig pr0_2 = fp0b_2 d632 1 a632 1 }{.mfi; (p11) add s0 = pr1_3, acc0 d636 3 a638 3 } .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 {.mmi; getfsig pr1_1 = fp1b_1 d641 1 a641 1 }{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0 d645 2 a646 2 } .pred.rel "mutex", p8, p9 {.mmi; getfsig acc1_1 = fp2a_1 d650 2 a651 2 } .pred.rel "mutex", p12, p13 {.mmi; (p12) add s0 = pr1_0, acc0, 1 d655 3 a657 3 } .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 {.mmi; getfsig pr1_2 = fp1b_2 d660 1 a660 1 }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 d664 2 a665 2 } .pred.rel "mutex", p6, p7 {.mmi; getfsig r8 = fp2a_2 d669 2 a670 2 } .pred.rel "mutex", p10, p11 {.mmi; (p10) add s0 = pr1_1, acc0, 1 d674 2 a675 2 } .pred.rel "mutex", p10, p11 {.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 d679 2 a680 2 } .pred.rel "mutex", p8, p9 {.mmi; st8 [rp] = s0, 8 d684 2 a685 2 } .pred.rel "mutex", p8, p9 {.mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2 d689 1 a689 1 }{.mmi; st8 [rp] = acc0, 8 d693 1 a693 1 }{.mib; (p10) add r8 = 1, r8 a695 1 } @