head 1.1; branch 1.1.1; access; symbols netbsd-11-0-RC5:1.1.1.3 netbsd-11-0-RC4:1.1.1.3 netbsd-11-0-RC3:1.1.1.3 netbsd-11-0-RC2:1.1.1.3 netbsd-11-0-RC1:1.1.1.3 perseant-exfatfs-base-20250801:1.1.1.3 netbsd-11:1.1.1.3.0.18 netbsd-11-base:1.1.1.3 netbsd-10-1-RELEASE:1.1.1.3 perseant-exfatfs-base-20240630:1.1.1.3 perseant-exfatfs:1.1.1.3.0.16 perseant-exfatfs-base:1.1.1.3 netbsd-8-3-RELEASE:1.1.1.2 netbsd-9-4-RELEASE:1.1.1.3 netbsd-10-0-RELEASE:1.1.1.3 netbsd-10-0-RC6:1.1.1.3 netbsd-10-0-RC5:1.1.1.3 netbsd-10-0-RC4:1.1.1.3 netbsd-10-0-RC3:1.1.1.3 netbsd-10-0-RC2:1.1.1.3 netbsd-10-0-RC1:1.1.1.3 netbsd-10:1.1.1.3.0.14 netbsd-10-base:1.1.1.3 netbsd-9-3-RELEASE:1.1.1.3 gmp-6-2-1:1.1.1.3 cjep_sun2x-base1:1.1.1.3 cjep_sun2x:1.1.1.3.0.12 cjep_sun2x-base:1.1.1.3 cjep_staticlib_x-base1:1.1.1.3 netbsd-9-2-RELEASE:1.1.1.3 cjep_staticlib_x:1.1.1.3.0.10 cjep_staticlib_x-base:1.1.1.3 netbsd-9-1-RELEASE:1.1.1.3 gmp-6-2-0:1.1.1.3 phil-wifi-20200421:1.1.1.3 phil-wifi-20200411:1.1.1.3 is-mlppp:1.1.1.3.0.8 is-mlppp-base:1.1.1.3 phil-wifi-20200406:1.1.1.3 netbsd-8-2-RELEASE:1.1.1.2 netbsd-9-0-RELEASE:1.1.1.3 netbsd-9-0-RC2:1.1.1.3 netbsd-9-0-RC1:1.1.1.3 phil-wifi-20191119:1.1.1.3 netbsd-9:1.1.1.3.0.6 netbsd-9-base:1.1.1.3 phil-wifi-20190609:1.1.1.3 netbsd-8-1-RELEASE:1.1.1.2 netbsd-8-1-RC1:1.1.1.2 pgoyette-compat-merge-20190127:1.1.1.3 pgoyette-compat-20190127:1.1.1.3 pgoyette-compat-20190118:1.1.1.3 pgoyette-compat-1226:1.1.1.3 pgoyette-compat-1126:1.1.1.3 pgoyette-compat-1020:1.1.1.3 pgoyette-compat-0930:1.1.1.3 pgoyette-compat-0906:1.1.1.3 netbsd-7-2-RELEASE:1.1.1.2 pgoyette-compat-0728:1.1.1.3 netbsd-8-0-RELEASE:1.1.1.2 phil-wifi:1.1.1.3.0.4 phil-wifi-base:1.1.1.3 pgoyette-compat-0625:1.1.1.3 netbsd-8-0-RC2:1.1.1.2 pgoyette-compat-0521:1.1.1.3 pgoyette-compat-0502:1.1.1.3 pgoyette-compat-0422:1.1.1.3 netbsd-8-0-RC1:1.1.1.2 pgoyette-compat-0415:1.1.1.3 pgoyette-compat-0407:1.1.1.3 pgoyette-compat-0330:1.1.1.3 pgoyette-compat-0322:1.1.1.3 pgoyette-compat-0315:1.1.1.3 netbsd-7-1-2-RELEASE:1.1.1.2 pgoyette-compat:1.1.1.3.0.2 pgoyette-compat-base:1.1.1.3 netbsd-7-1-1-RELEASE:1.1.1.2 matt-nb8-mediatek:1.1.1.2.0.22 matt-nb8-mediatek-base:1.1.1.2 gmp-6-1-2:1.1.1.3 perseant-stdc-iso10646:1.1.1.2.0.20 perseant-stdc-iso10646-base:1.1.1.2 netbsd-8:1.1.1.2.0.18 netbsd-8-base:1.1.1.2 prg-localcount2-base3:1.1.1.2 prg-localcount2-base2:1.1.1.2 prg-localcount2-base1:1.1.1.2 prg-localcount2:1.1.1.2.0.16 prg-localcount2-base:1.1.1.2 pgoyette-localcount-20170426:1.1.1.2 bouyer-socketcan-base1:1.1.1.2 pgoyette-localcount-20170320:1.1.1.2 netbsd-7-1:1.1.1.2.0.14 netbsd-7-1-RELEASE:1.1.1.2 netbsd-7-1-RC2:1.1.1.2 netbsd-7-nhusb-base-20170116:1.1.1.2 bouyer-socketcan:1.1.1.2.0.12 bouyer-socketcan-base:1.1.1.2 pgoyette-localcount-20170107:1.1.1.2 netbsd-7-1-RC1:1.1.1.2 pgoyette-localcount-20161104:1.1.1.2 netbsd-7-0-2-RELEASE:1.1.1.2 localcount-20160914:1.1.1.2 netbsd-7-nhusb:1.1.1.2.0.10 netbsd-7-nhusb-base:1.1.1.2 pgoyette-localcount-20160806:1.1.1.2 pgoyette-localcount-20160726:1.1.1.2 pgoyette-localcount:1.1.1.2.0.8 pgoyette-localcount-base:1.1.1.2 netbsd-7-0-1-RELEASE:1.1.1.2 netbsd-7-0:1.1.1.2.0.6 netbsd-7-0-RELEASE:1.1.1.2 netbsd-7-0-RC3:1.1.1.2 netbsd-7-0-RC2:1.1.1.2 netbsd-7-0-RC1:1.1.1.2 netbsd-6-0-6-RELEASE:1.1.1.1 netbsd-6-1-5-RELEASE:1.1.1.1 netbsd-7:1.1.1.2.0.4 netbsd-7-base:1.1.1.2 yamt-pagecache-base9:1.1.1.2 yamt-pagecache-tag8:1.1.1.1 netbsd-6-1-4-RELEASE:1.1.1.1 netbsd-6-0-5-RELEASE:1.1.1.1 tls-earlyentropy:1.1.1.2.0.2 tls-earlyentropy-base:1.1.1.2 riastradh-xf86-video-intel-2-7-1-pre-2-21-15:1.1.1.2 riastradh-drm2-base3:1.1.1.2 netbsd-6-1-3-RELEASE:1.1.1.1 netbsd-6-0-4-RELEASE:1.1.1.1 gmp-5-1-3:1.1.1.2 netbsd-6-1-2-RELEASE:1.1.1.1 netbsd-6-0-3-RELEASE:1.1.1.1 netbsd-6-1-1-RELEASE:1.1.1.1 riastradh-drm2-base2:1.1.1.1 riastradh-drm2-base1:1.1.1.1 riastradh-drm2:1.1.1.1.0.12 riastradh-drm2-base:1.1.1.1 netbsd-6-1:1.1.1.1.0.16 netbsd-6-0-2-RELEASE:1.1.1.1 netbsd-6-1-RELEASE:1.1.1.1 netbsd-6-1-RC4:1.1.1.1 netbsd-6-1-RC3:1.1.1.1 agc-symver:1.1.1.1.0.14 agc-symver-base:1.1.1.1 netbsd-6-1-RC2:1.1.1.1 netbsd-6-1-RC1:1.1.1.1 yamt-pagecache-base8:1.1.1.1 netbsd-6-0-1-RELEASE:1.1.1.1 yamt-pagecache-base7:1.1.1.1 matt-nb6-plus-nbase:1.1.1.1 yamt-pagecache-base6:1.1.1.1 netbsd-6-0:1.1.1.1.0.10 netbsd-6-0-RELEASE:1.1.1.1 netbsd-6-0-RC2:1.1.1.1 tls-maxphys:1.1.1.1.0.8 tls-maxphys-base:1.1.1.2 matt-nb6-plus:1.1.1.1.0.6 matt-nb6-plus-base:1.1.1.1 netbsd-6-0-RC1:1.1.1.1 yamt-pagecache-base5:1.1.1.1 yamt-pagecache-base4:1.1.1.1 netbsd-6:1.1.1.1.0.4 netbsd-6-base:1.1.1.1 yamt-pagecache-base3:1.1.1.1 yamt-pagecache-base2:1.1.1.1 yamt-pagecache:1.1.1.1.0.2 yamt-pagecache-base:1.1.1.1 gmp-5-0-2:1.1.1.1 gmp:1.1.1; locks; strict; comment @;; @; 1.1 date 2011.06.20.05.54.39; author mrg; state Exp; branches 1.1.1.1; next ; 1.1.1.1 date 2011.06.20.05.54.39; author mrg; state Exp; branches 1.1.1.1.2.1 1.1.1.1.8.1; next 1.1.1.2; 1.1.1.2 date 2013.11.29.07.49.48; author mrg; state Exp; branches; next 1.1.1.3; commitid L2Av4PuGmdoL39fx; 1.1.1.3 date 2017.08.22.09.40.49; author mrg; state Exp; branches; next ; commitid W5kmAIk8hwVpSb4A; 1.1.1.1.2.1 date 2014.05.22.14.09.01; author yamt; state Exp; branches; next ; commitid nx2BSsHy0NPeAxBx; 1.1.1.1.8.1 date 2014.08.19.23.59.50; author tls; state Exp; branches; next ; commitid jTnpym9Qu0o4R1Nx; desc @@ 1.1 log @Initial revision @ text @dnl IA-64 mpn_mul_2 -- Multiply a n-limb number with a 2-limb number and store dnl store the result to a (n+1)-limb number. dnl Copyright 2004 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. dnl The GNU MP Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published dnl by the Free Software Foundation; either version 3 of the License, or (at dnl your option) any later version. dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl You should have received a copy of the GNU Lesser General Public License dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb C Itanium: 3.15 C Itanium 2: 1.625 C Note that this is very similar to addmul_2.asm. If you change this file, C please change that file too. C TODO C * Clean up variable names, and try to decrease the number of distinct C registers used. C * Cleanup feed-in code to not require zeroing several registers. C * Make sure we don't depend on uninitialized predicate registers. C * We currently cross-jump very aggressively, at the expense of a few cycles C per operation. Consider changing that. C * Could perhaps save a few cycles by using 1 c/l carry propagation in C wind-down code. C * Ultimately rewrite. The problem with this code is that it first uses a C loaded u value in one xma pair, then leaves it live over several unrelated C xma pairs, before it uses it again. It should actually be quite possible C to just swap some aligned xma pairs around. But we should then schedule C u loads further from the first use. C INPUT PARAMETERS define(`rp',`r32') define(`up',`r33') define(`n',`r34') define(`vp',`r35') define(`srp',`r3') define(`v0',`f6') define(`v1',`f7') define(`s0',`r14') define(`acc0',`r15') define(`pr0_0',`r16') define(`pr0_1',`r17') define(`pr0_2',`r18') define(`pr0_3',`r19') define(`pr1_0',`r20') define(`pr1_1',`r21') define(`pr1_2',`r22') define(`pr1_3',`r23') define(`acc1_0',`r24') define(`acc1_1',`r25') define(`acc1_2',`r26') define(`acc1_3',`r27') dnl define(`',`r28') dnl define(`',`r29') dnl define(`',`r30') dnl define(`',`r31') define(`fp0b_0',`f8') define(`fp0b_1',`f9') define(`fp0b_2',`f10') define(`fp0b_3',`f11') define(`fp1a_0',`f12') define(`fp1a_1',`f13') define(`fp1a_2',`f14') define(`fp1a_3',`f15') define(`fp1b_0',`f32') define(`fp1b_1',`f33') define(`fp1b_2',`f34') define(`fp1b_3',`f35') define(`fp2a_0',`f36') define(`fp2a_1',`f37') define(`fp2a_2',`f38') define(`fp2a_3',`f39') define(`u_0',`f44') define(`u_1',`f45') define(`u_2',`f46') define(`u_3',`f47') define(`ux',`f49') define(`uy',`f51') ASM_START() PROLOGUE(mpn_mul_2) .prologue .save ar.lc, r2 .body ifdef(`HAVE_ABI_32', ` addp4 rp = 0, rp C M I addp4 up = 0, up C M I addp4 vp = 0, vp C M I zxt4 n = n C I ;;') {.mmi C 00 ldf8 ux = [up], 8 C M ldf8 v0 = [vp], 8 C M mov.i r2 = ar.lc C I0 }{.mmi nop 0 C M and r14 = 3, n C M I add n = -2, n C M I ;; }{.mmi C 01 ldf8 uy = [up], 8 C M ldf8 v1 = [vp] C M shr.u n = n, 2 C I }{.mmi nop 0 C M cmp.eq p10, p0 = 1, r14 C M I cmp.eq p11, p0 = 2, r14 C M I ;; }{.mmi C 02 nop 0 C M cmp.eq p12, p0 = 3, r14 C M I mov.i ar.lc = n C I0 }{.bbb (p10) br.dptk .Lb01 C B (p11) br.dptk .Lb10 C B (p12) br.dptk .Lb11 C B ;; } ALIGN(32) .Lb00: ldf8 u_1 = [up], 8 mov acc1_2 = 0 mov pr1_2 = 0 mov pr0_3 = 0 cmp.ne p8, p9 = r0, r0 ;; xma.l fp0b_3 = ux, v0, f0 cmp.ne p12, p13 = r0, r0 ldf8 u_2 = [up], 8 xma.hu fp1a_3 = ux, v0, f0 br.cloop.dptk .grt4 xma.l fp0b_0 = uy, v0, f0 xma.hu fp1a_0 = uy, v0, f0 ;; getf.sig acc0 = fp0b_3 xma.l fp1b_3 = ux, v1, fp1a_3 xma.hu fp2a_3 = ux, v1, fp1a_3 ;; xma.l fp0b_1 = u_1, v0, f0 xma.hu fp1a_1 = u_1, v0, f0 ;; getf.sig pr0_0 = fp0b_0 xma.l fp1b_0 = uy, v1, fp1a_0 xma.hu fp2a_0 = uy, v1, fp1a_0 ;; getf.sig pr1_3 = fp1b_3 getf.sig acc1_3 = fp2a_3 xma.l fp0b_2 = u_2, v0, f0 xma.hu fp1a_2 = u_2, v0, f0 br .Lcj4 .grt4: xma.l fp0b_0 = uy, v0, f0 xma.hu fp1a_0 = uy, v0, f0 ;; getf.sig acc0 = fp0b_3 xma.l fp1b_3 = ux, v1, fp1a_3 ldf8 u_3 = [up], 8 xma.hu fp2a_3 = ux, v1, fp1a_3 ;; xma.l fp0b_1 = u_1, v0, f0 xma.hu fp1a_1 = u_1, v0, f0 ;; getf.sig pr0_0 = fp0b_0 xma.l fp1b_0 = uy, v1, fp1a_0 xma.hu fp2a_0 = uy, v1, fp1a_0 ;; ldf8 u_0 = [up], 8 getf.sig pr1_3 = fp1b_3 ;; getf.sig acc1_3 = fp2a_3 xma.l fp0b_2 = u_2, v0, f0 xma.hu fp1a_2 = u_2, v0, f0 br .LL00 ALIGN(32) .Lb01: ldf8 u_0 = [up], 8 C M mov acc1_1 = 0 C M I mov pr1_1 = 0 C M I mov pr0_2 = 0 C M I cmp.ne p6, p7 = r0, r0 C M I ;; xma.l fp0b_2 = ux, v0, f0 C F cmp.ne p10, p11 = r0, r0 C M I ldf8 u_1 = [up], 8 C M xma.hu fp1a_2 = ux, v0, f0 C F ;; xma.l fp0b_3 = uy, v0, f0 C F xma.hu fp1a_3 = uy, v0, f0 C F ;; getf.sig acc0 = fp0b_2 C M xma.l fp1b_2 = ux, v1,fp1a_2 C F xma.hu fp2a_2 = ux, v1,fp1a_2 C F ldf8 u_2 = [up], 8 C M br.cloop.dptk .grt5 xma.l fp0b_0 = u_0, v0, f0 C F xma.hu fp1a_0 = u_0, v0, f0 C F ;; getf.sig pr0_3 = fp0b_3 C M xma.l fp1b_3 = uy, v1,fp1a_3 C F xma.hu fp2a_3 = uy, v1,fp1a_3 C F ;; getf.sig pr1_2 = fp1b_2 C M getf.sig acc1_2 = fp2a_2 C M xma.l fp0b_1 = u_1, v0, f0 C F xma.hu fp1a_1 = u_1, v0, f0 C F br .Lcj5 .grt5: xma.l fp0b_0 = u_0, v0, f0 xma.hu fp1a_0 = u_0, v0, f0 ;; getf.sig pr0_3 = fp0b_3 xma.l fp1b_3 = uy, v1, fp1a_3 xma.hu fp2a_3 = uy, v1, fp1a_3 ;; ldf8 u_3 = [up], 8 getf.sig pr1_2 = fp1b_2 ;; getf.sig acc1_2 = fp2a_2 xma.l fp0b_1 = u_1, v0, f0 xma.hu fp1a_1 = u_1, v0, f0 br .LL01 C We have two variants for n = 2. They turn out to run at exactly the same C speed. But the first, odd variant might allow one cycle to be trimmed. ALIGN(32) ifdef(`',` .Lb10: C 03 br.cloop.dptk .grt2 C 04 C 05 C 06 xma.l fp0b_1 = ux, v0, f0 C 0 xma.hu fp1a_1 = ux, v0, f0 C 1 ;; C 07 xma.l fp0b_2 = uy, v0, f0 C 1 xma.l fp1b_1 = ux, v1, f0 C 1 ;; C 08 xma.hu fp1a_2 = uy, v0, f0 C 2 xma.hu fp2a_1 = ux, v1, f0 C 2 ;; C 09 xma.l fp1b_2 = uy, v1, f0 C 2 xma.hu fp2a_2 = uy, v1, f0 C 3 ;; C 10 getf.sig r16 = fp1a_1 stf8 [rp] = fp0b_1, 8 ;; C 11 getf.sig r17 = fp0b_2 C 12 getf.sig r18 = fp1b_1 C 13 getf.sig r19 = fp1a_2 C 14 getf.sig r20 = fp2a_1 C 15 getf.sig r21 = fp1b_2 ;; C 16 getf.sig r8 = fp2a_2 add r24 = r16, r17 ;; C 17 cmp.ltu p6, p7 = r24, r16 add r26 = r24, r18 ;; C 18 cmp.ltu p8, p9 = r26, r24 ;; C 19 st8 [rp] = r26, 8 (p6) add r25 = r19, r20, 1 (p7) add r25 = r19, r20 ;; C 20 (p8) add r27 = r25, r21, 1 (p9) add r27 = r25, r21 (p6) cmp.leu p10, p0 = r25, r19 (p7) cmp.ltu p10, p0 = r25, r19 ;; C 21 (p10) add r8 = 1, r8 (p8) cmp.leu p12, p0 = r27, r25 (p9) cmp.ltu p12, p0 = r27, r25 ;; C 22 st8 [rp] = r27, 8 mov.i ar.lc = r2 (p12) add r8 = 1, r8 br.ret.sptk.many b0 ') .Lb10: C 03 br.cloop.dptk .grt2 C 04 C 05 C 06 xma.l fp0b_1 = ux, v0, f0 xma.hu fp1a_1 = ux, v0, f0 ;; C 07 xma.l fp0b_2 = uy, v0, f0 xma.hu fp1a_2 = uy, v0, f0 ;; C 08 C 09 C 10 stf8 [rp] = fp0b_1, 8 xma.l fp1b_1 = ux, v1, fp1a_1 xma.hu fp2a_1 = ux, v1, fp1a_1 ;; C 11 getf.sig acc0 = fp0b_2 xma.l fp1b_2 = uy, v1, fp1a_2 xma.hu fp2a_2 = uy, v1, fp1a_2 ;; C 12 C 13 C 14 getf.sig pr1_1 = fp1b_1 C 15 getf.sig acc1_1 = fp2a_1 C 16 getf.sig pr1_2 = fp1b_2 C 17 getf.sig r8 = fp2a_2 ;; C 18 C 19 add s0 = pr1_1, acc0 ;; C 20 st8 [rp] = s0, 8 cmp.ltu p8, p9 = s0, pr1_1 sub r31 = -1, acc1_1 ;; C 21 .pred.rel "mutex", p8, p9 (p8) add acc0 = pr1_2, acc1_1, 1 (p9) add acc0 = pr1_2, acc1_1 (p8) cmp.leu p10, p0 = r31, pr1_2 (p9) cmp.ltu p10, p0 = r31, pr1_2 ;; C 22 st8 [rp] = acc0, 8 mov.i ar.lc = r2 (p10) add r8 = 1, r8 br.ret.sptk.many b0 .grt2: ldf8 u_3 = [up], 8 mov acc1_0 = 0 mov pr1_0 = 0 ;; mov pr0_1 = 0 xma.l fp0b_1 = ux, v0, f0 ldf8 u_0 = [up], 8 xma.hu fp1a_1 = ux, v0, f0 ;; xma.l fp0b_2 = uy, v0, f0 xma.hu fp1a_2 = uy, v0, f0 ;; getf.sig acc0 = fp0b_1 xma.l fp1b_1 = ux, v1, fp1a_1 xma.hu fp2a_1 = ux, v1, fp1a_1 ;; ldf8 u_1 = [up], 8 xma.l fp0b_3 = u_3, v0, f0 xma.hu fp1a_3 = u_3, v0, f0 ;; getf.sig pr0_2 = fp0b_2 xma.l fp1b_2 = uy, v1, fp1a_2 xma.hu fp2a_2 = uy, v1, fp1a_2 ;; ldf8 u_2 = [up], 8 getf.sig pr1_1 = fp1b_1 ;; getf.sig acc1_1 = fp2a_1 xma.l fp0b_0 = u_0, v0, f0 cmp.ne p8, p9 = r0, r0 cmp.ne p12, p13 = r0, r0 xma.hu fp1a_0 = u_0, v0, f0 br .LL10 ALIGN(32) .Lb11: mov acc1_3 = 0 mov pr1_3 = 0 mov pr0_0 = 0 cmp.ne p6, p7 = r0, r0 ;; ldf8 u_2 = [up], 8 br.cloop.dptk .grt3 ;; xma.l fp0b_0 = ux, v0, f0 xma.hu fp1a_0 = ux, v0, f0 ;; cmp.ne p10, p11 = r0, r0 xma.l fp0b_1 = uy, v0, f0 xma.hu fp1a_1 = uy, v0, f0 ;; getf.sig acc0 = fp0b_0 xma.l fp1b_0 = ux, v1, fp1a_0 xma.hu fp2a_0 = ux, v1, fp1a_0 ;; xma.l fp0b_2 = u_2, v0, f0 xma.hu fp1a_2 = u_2, v0, f0 ;; getf.sig pr0_1 = fp0b_1 xma.l fp1b_1 = uy, v1, fp1a_1 xma.hu fp2a_1 = uy, v1, fp1a_1 ;; getf.sig pr1_0 = fp1b_0 getf.sig acc1_0 = fp2a_0 br .Lcj3 .grt3: xma.l fp0b_0 = ux, v0, f0 cmp.ne p10, p11 = r0, r0 ldf8 u_3 = [up], 8 xma.hu fp1a_0 = ux, v0, f0 ;; xma.l fp0b_1 = uy, v0, f0 xma.hu fp1a_1 = uy, v0, f0 ;; getf.sig acc0 = fp0b_0 xma.l fp1b_0 = ux, v1, fp1a_0 ldf8 u_0 = [up], 8 xma.hu fp2a_0 = ux, v1, fp1a_0 ;; xma.l fp0b_2 = u_2, v0, f0 xma.hu fp1a_2 = u_2, v0, f0 ;; getf.sig pr0_1 = fp0b_1 xma.l fp1b_1 = uy, v1, fp1a_1 xma.hu fp2a_1 = uy, v1, fp1a_1 ;; ldf8 u_1 = [up], 8 getf.sig pr1_0 = fp1b_0 ;; getf.sig acc1_0 = fp2a_0 xma.l fp0b_3 = u_3, v0, f0 xma.hu fp1a_3 = u_3, v0, f0 br .LL11 C *** MAIN LOOP START *** ALIGN(32) .Loop: C 00 .pred.rel "mutex", p12, p13 getf.sig pr0_3 = fp0b_3 xma.l fp1b_3 = u_3, v1, fp1a_3 (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 xma.hu fp2a_3 = u_3, v1, fp1a_3 ;; C 01 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 ldf8 u_3 = [up], 8 getf.sig pr1_2 = fp1b_2 (p8) cmp.leu p6, p7 = acc0, pr0_1 (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; C 02 .pred.rel "mutex", p6, p7 getf.sig acc1_2 = fp2a_2 st8 [rp] = s0, 8 xma.l fp0b_1 = u_1, v0, f0 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 xma.hu fp1a_1 = u_1, v0, f0 ;; C 03 .LL01: .pred.rel "mutex", p10, p11 getf.sig pr0_0 = fp0b_0 xma.l fp1b_0 = u_0, v1, fp1a_0 (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 xma.hu fp2a_0 = u_0, v1, fp1a_0 ;; C 04 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 ldf8 u_0 = [up], 8 getf.sig pr1_3 = fp1b_3 (p6) cmp.leu p8, p9 = acc0, pr0_2 (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; C 05 .pred.rel "mutex", p8, p9 getf.sig acc1_3 = fp2a_3 st8 [rp] = s0, 8 xma.l fp0b_2 = u_2, v0, f0 (p8) add acc0 = pr0_3, acc1_1, 1 (p9) add acc0 = pr0_3, acc1_1 xma.hu fp1a_2 = u_2, v0, f0 ;; C 06 .LL00: .pred.rel "mutex", p12, p13 getf.sig pr0_1 = fp0b_1 xma.l fp1b_1 = u_1, v1, fp1a_1 (p12) add s0 = pr1_2, acc0, 1 (p13) add s0 = pr1_2, acc0 xma.hu fp2a_1 = u_1, v1, fp1a_1 ;; C 07 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 ldf8 u_1 = [up], 8 getf.sig pr1_0 = fp1b_0 (p8) cmp.leu p6, p7 = acc0, pr0_3 (p9) cmp.ltu p6, p7 = acc0, pr0_3 (p12) cmp.leu p10, p11 = s0, pr1_2 (p13) cmp.ltu p10, p11 = s0, pr1_2 ;; C 08 .pred.rel "mutex", p6, p7 getf.sig acc1_0 = fp2a_0 st8 [rp] = s0, 8 xma.l fp0b_3 = u_3, v0, f0 (p6) add acc0 = pr0_0, acc1_2, 1 (p7) add acc0 = pr0_0, acc1_2 xma.hu fp1a_3 = u_3, v0, f0 ;; C 09 .LL11: .pred.rel "mutex", p10, p11 getf.sig pr0_2 = fp0b_2 xma.l fp1b_2 = u_2, v1, fp1a_2 (p10) add s0 = pr1_3, acc0, 1 (p11) add s0 = pr1_3, acc0 xma.hu fp2a_2 = u_2, v1, fp1a_2 ;; C 10 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 ldf8 u_2 = [up], 8 getf.sig pr1_1 = fp1b_1 (p6) cmp.leu p8, p9 = acc0, pr0_0 (p7) cmp.ltu p8, p9 = acc0, pr0_0 (p10) cmp.leu p12, p13 = s0, pr1_3 (p11) cmp.ltu p12, p13 = s0, pr1_3 ;; C 11 .pred.rel "mutex", p8, p9 getf.sig acc1_1 = fp2a_1 st8 [rp] = s0, 8 xma.l fp0b_0 = u_0, v0, f0 (p8) add acc0 = pr0_1, acc1_3, 1 (p9) add acc0 = pr0_1, acc1_3 xma.hu fp1a_0 = u_0, v0, f0 .LL10: br.cloop.dptk .Loop C 12 ;; C *** MAIN LOOP END *** .Lcj6: .pred.rel "mutex", p12, p13 getf.sig pr0_3 = fp0b_3 xma.l fp1b_3 = u_3, v1, fp1a_3 (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 xma.hu fp2a_3 = u_3, v1, fp1a_3 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 getf.sig pr1_2 = fp1b_2 (p8) cmp.leu p6, p7 = acc0, pr0_1 (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; .pred.rel "mutex", p6, p7 getf.sig acc1_2 = fp2a_2 st8 [rp] = s0, 8 xma.l fp0b_1 = u_1, v0, f0 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 xma.hu fp1a_1 = u_1, v0, f0 ;; .Lcj5: .pred.rel "mutex", p10, p11 getf.sig pr0_0 = fp0b_0 xma.l fp1b_0 = u_0, v1, fp1a_0 (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 xma.hu fp2a_0 = u_0, v1, fp1a_0 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 getf.sig pr1_3 = fp1b_3 (p6) cmp.leu p8, p9 = acc0, pr0_2 (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; .pred.rel "mutex", p8, p9 getf.sig acc1_3 = fp2a_3 st8 [rp] = s0, 8 xma.l fp0b_2 = u_2, v0, f0 (p8) add acc0 = pr0_3, acc1_1, 1 (p9) add acc0 = pr0_3, acc1_1 xma.hu fp1a_2 = u_2, v0, f0 ;; .Lcj4: .pred.rel "mutex", p12, p13 getf.sig pr0_1 = fp0b_1 xma.l fp1b_1 = u_1, v1, fp1a_1 (p12) add s0 = pr1_2, acc0, 1 (p13) add s0 = pr1_2, acc0 xma.hu fp2a_1 = u_1, v1, fp1a_1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 getf.sig pr1_0 = fp1b_0 (p8) cmp.leu p6, p7 = acc0, pr0_3 (p9) cmp.ltu p6, p7 = acc0, pr0_3 (p12) cmp.leu p10, p11 = s0, pr1_2 (p13) cmp.ltu p10, p11 = s0, pr1_2 ;; .pred.rel "mutex", p6, p7 getf.sig acc1_0 = fp2a_0 st8 [rp] = s0, 8 (p6) add acc0 = pr0_0, acc1_2, 1 (p7) add acc0 = pr0_0, acc1_2 ;; .Lcj3: .pred.rel "mutex", p10, p11 getf.sig pr0_2 = fp0b_2 xma.l fp1b_2 = u_2, v1, fp1a_2 (p10) add s0 = pr1_3, acc0, 1 (p11) add s0 = pr1_3, acc0 xma.hu fp2a_2 = u_2, v1, fp1a_2 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 getf.sig pr1_1 = fp1b_1 (p6) cmp.leu p8, p9 = acc0, pr0_0 (p7) cmp.ltu p8, p9 = acc0, pr0_0 (p10) cmp.leu p12, p13 = s0, pr1_3 (p11) cmp.ltu p12, p13 = s0, pr1_3 ;; .pred.rel "mutex", p8, p9 getf.sig acc1_1 = fp2a_1 st8 [rp] = s0, 8 (p8) add acc0 = pr0_1, acc1_3, 1 (p9) add acc0 = pr0_1, acc1_3 ;; .pred.rel "mutex", p12, p13 (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 getf.sig pr1_2 = fp1b_2 (p8) cmp.leu p6, p7 = acc0, pr0_1 (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; .pred.rel "mutex", p6, p7 getf.sig acc1_2 = fp2a_2 st8 [rp] = s0, 8 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 ;; .pred.rel "mutex", p10, p11 (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 (p6) cmp.leu p8, p9 = acc0, pr0_2 (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; .pred.rel "mutex", p8, p9 st8 [rp] = s0, 8 (p8) add acc0 = pr1_2, acc1_1, 1 (p9) add acc0 = pr1_2, acc1_1 ;; .pred.rel "mutex", p8, p9 (p8) cmp.leu p10, p11 = acc0, pr1_2 (p9) cmp.ltu p10, p11 = acc0, pr1_2 (p12) add acc0 = 1, acc0 ;; st8 [rp] = acc0, 8 (p12) cmp.eq.or p10, p0 = 0, acc0 mov r8 = acc1_2 ;; .pred.rel "mutex", p10, p11 (p10) add r8 = 1, r8 mov.i ar.lc = r2 br.ret.sptk.many b0 EPILOGUE() ASM_END() @ 1.1.1.1 log @initial import of GMP 5.0.2. GNU MP is a library for arbitrary precision arithmetic, operating on signed integers, rational numbers, and floating point numbers. It has a rich set of functions, and the functions have a regular interface. GMP is necessary for GCC >= 4.2. @ text @@ 1.1.1.1.8.1 log @Rebase to HEAD as of a few days ago. @ text @d4 1 a4 3 dnl Contributed to the GNU project by Torbjorn Granlund. dnl Copyright 2004, 2011 Free Software Foundation, Inc. d24 5 a28 2 C Itanium: ? C Itanium 2: 1.5 d33 1 a33 1 C * Clean up feed-in code to not require zeroing several registers. d35 2 d97 5 a101 7 ifdef(`HAVE_ABI_32',` .mmi; addp4 rp = 0, rp C M I addp4 up = 0, up C M I addp4 vp = 0, vp C M I .mmi; nop 1 nop 1 zxt4 n = n C I d104 26 a129 20 .mmi; ldf8 ux = [up], 8 C M ldf8 v0 = [vp], 8 C M mov r2 = ar.lc C I0 .mmi; nop 1 C M and r14 = 3, n C M I add n = -2, n C M I ;; .mmi; ldf8 uy = [up], 8 C M ldf8 v1 = [vp] C M shr.u n = n, 2 C I .mmi; nop 1 C M cmp.eq p10, p0 = 1, r14 C M I cmp.eq p11, p0 = 2, r14 C M I ;; .mmi; nop 1 C M cmp.eq p12, p0 = 3, r14 C M I mov ar.lc = n C I0 .bbb; (p10) br.dptk L(b01) C B (p11) br.dptk L(b10) C B (p12) br.dptk L(b11) C B d131 1 d134 54 a187 54 L(b00): ldf8 u_1 = [up], 8 mov acc1_2 = 0 mov pr1_2 = 0 mov pr0_3 = 0 cmp.ne p8, p9 = r0, r0 ;; xma.l fp0b_3 = ux, v0, f0 cmp.ne p12, p13 = r0, r0 ldf8 u_2 = [up], 8 xma.hu fp1a_3 = ux, v0, f0 br.cloop.dptk L(gt4) xma.l fp0b_0 = uy, v0, f0 xma.hu fp1a_0 = uy, v0, f0 ;; getfsig acc0 = fp0b_3 xma.l fp1b_3 = ux, v1, fp1a_3 xma.hu fp2a_3 = ux, v1, fp1a_3 ;; xma.l fp0b_1 = u_1, v0, f0 xma.hu fp1a_1 = u_1, v0, f0 ;; getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = uy, v1, fp1a_0 xma.hu fp2a_0 = uy, v1, fp1a_0 ;; getfsig pr1_3 = fp1b_3 getfsig acc1_3 = fp2a_3 xma.l fp0b_2 = u_2, v0, f0 xma.hu fp1a_2 = u_2, v0, f0 br L(cj4) L(gt4): xma.l fp0b_0 = uy, v0, f0 xma.hu fp1a_0 = uy, v0, f0 ;; getfsig acc0 = fp0b_3 xma.l fp1b_3 = ux, v1, fp1a_3 ldf8 u_3 = [up], 8 xma.hu fp2a_3 = ux, v1, fp1a_3 ;; xma.l fp0b_1 = u_1, v0, f0 xma.hu fp1a_1 = u_1, v0, f0 ;; getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = uy, v1, fp1a_0 xma.hu fp2a_0 = uy, v1, fp1a_0 ;; ldf8 u_0 = [up], 8 getfsig pr1_3 = fp1b_3 xma.l fp0b_2 = u_2, v0, f0 ;; getfsig acc1_3 = fp2a_3 xma.hu fp1a_2 = u_2, v0, f0 br L(00) d191 47 a237 47 L(b01): ldf8 u_0 = [up], 8 C M mov acc1_1 = 0 C M I mov pr1_1 = 0 C M I mov pr0_2 = 0 C M I cmp.ne p6, p7 = r0, r0 C M I ;; xma.l fp0b_2 = ux, v0, f0 C F cmp.ne p10, p11 = r0, r0 C M I ldf8 u_1 = [up], 8 C M xma.hu fp1a_2 = ux, v0, f0 C F ;; xma.l fp0b_3 = uy, v0, f0 C F xma.hu fp1a_3 = uy, v0, f0 C F ;; getfsig acc0 = fp0b_2 C M xma.l fp1b_2 = ux, v1,fp1a_2 C F ldf8 u_2 = [up], 8 C M xma.hu fp2a_2 = ux, v1,fp1a_2 C F br.cloop.dptk L(gt5) xma.l fp0b_0 = u_0, v0, f0 C F xma.hu fp1a_0 = u_0, v0, f0 C F ;; getfsig pr0_3 = fp0b_3 C M xma.l fp1b_3 = uy, v1,fp1a_3 C F xma.hu fp2a_3 = uy, v1,fp1a_3 C F ;; getfsig pr1_2 = fp1b_2 C M getfsig acc1_2 = fp2a_2 C M xma.l fp0b_1 = u_1, v0, f0 C F xma.hu fp1a_1 = u_1, v0, f0 C F br L(cj5) L(gt5): xma.l fp0b_0 = u_0, v0, f0 xma.hu fp1a_0 = u_0, v0, f0 ;; getfsig pr0_3 = fp0b_3 xma.l fp1b_3 = uy, v1, fp1a_3 xma.hu fp2a_3 = uy, v1, fp1a_3 ;; ldf8 u_3 = [up], 8 getfsig pr1_2 = fp1b_2 xma.l fp0b_1 = u_1, v0, f0 ;; getfsig acc1_2 = fp2a_2 xma.hu fp1a_1 = u_1, v0, f0 br L(01) d240 2 d243 141 a383 70 L(b10): br.cloop.dptk L(gt2) xma.l fp0b_1 = ux, v0, f0 xma.hu fp1a_1 = ux, v0, f0 ;; xma.l fp0b_2 = uy, v0, f0 xma.hu fp1a_2 = uy, v0, f0 ;; stf8 [rp] = fp0b_1, 8 xma.l fp1b_1 = ux, v1, fp1a_1 xma.hu fp2a_1 = ux, v1, fp1a_1 ;; getfsig acc0 = fp0b_2 xma.l fp1b_2 = uy, v1, fp1a_2 xma.hu fp2a_2 = uy, v1, fp1a_2 ;; getfsig pr1_1 = fp1b_1 getfsig acc1_1 = fp2a_1 mov ar.lc = r2 getfsig pr1_2 = fp1b_2 getfsig r8 = fp2a_2 ;; add s0 = pr1_1, acc0 ;; st8 [rp] = s0, 8 cmp.ltu p8, p9 = s0, pr1_1 sub r31 = -1, acc1_1 ;; .pred.rel "mutex", p8, p9 (p8) add acc0 = pr1_2, acc1_1, 1 (p9) add acc0 = pr1_2, acc1_1 (p8) cmp.leu p10, p0 = r31, pr1_2 (p9) cmp.ltu p10, p0 = r31, pr1_2 ;; st8 [rp] = acc0, 8 (p10) add r8 = 1, r8 br.ret.sptk.many b0 L(gt2): ldf8 u_3 = [up], 8 mov acc1_0 = 0 mov pr1_0 = 0 ;; mov pr0_1 = 0 xma.l fp0b_1 = ux, v0, f0 ldf8 u_0 = [up], 8 xma.hu fp1a_1 = ux, v0, f0 ;; xma.l fp0b_2 = uy, v0, f0 xma.hu fp1a_2 = uy, v0, f0 ;; getfsig acc0 = fp0b_1 xma.l fp1b_1 = ux, v1, fp1a_1 xma.hu fp2a_1 = ux, v1, fp1a_1 ;; ldf8 u_1 = [up], 8 xma.l fp0b_3 = u_3, v0, f0 xma.hu fp1a_3 = u_3, v0, f0 ;; getfsig pr0_2 = fp0b_2 xma.l fp1b_2 = uy, v1, fp1a_2 xma.hu fp2a_2 = uy, v1, fp1a_2 ;; ldf8 u_2 = [up], 8 getfsig pr1_1 = fp1b_1 ;; .mfi; getfsig acc1_1 = fp2a_1 xma.l fp0b_0 = u_0, v0, f0 cmp.ne p8, p9 = r0, r0 .mfb; cmp.ne p12, p13 = r0, r0 xma.hu fp1a_0 = u_0, v0, f0 br L(10) d387 57 a443 56 L(b11): mov acc1_3 = 0 mov pr1_3 = 0 mov pr0_0 = 0 ldf8 u_2 = [up], 8 cmp.ne p6, p7 = r0, r0 br.cloop.dptk L(gt3) ;; xma.l fp0b_0 = ux, v0, f0 xma.hu fp1a_0 = ux, v0, f0 ;; cmp.ne p10, p11 = r0, r0 xma.l fp0b_1 = uy, v0, f0 xma.hu fp1a_1 = uy, v0, f0 ;; getfsig acc0 = fp0b_0 xma.l fp1b_0 = ux, v1, fp1a_0 xma.hu fp2a_0 = ux, v1, fp1a_0 ;; xma.l fp0b_2 = u_2, v0, f0 xma.hu fp1a_2 = u_2, v0, f0 ;; getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = uy, v1, fp1a_1 xma.hu fp2a_1 = uy, v1, fp1a_1 ;; getfsig pr1_0 = fp1b_0 getfsig acc1_0 = fp2a_0 br L(cj3) L(gt3): xma.l fp0b_0 = ux, v0, f0 cmp.ne p10, p11 = r0, r0 ldf8 u_3 = [up], 8 xma.hu fp1a_0 = ux, v0, f0 ;; xma.l fp0b_1 = uy, v0, f0 xma.hu fp1a_1 = uy, v0, f0 ;; getfsig acc0 = fp0b_0 xma.l fp1b_0 = ux, v1, fp1a_0 ldf8 u_0 = [up], 8 xma.hu fp2a_0 = ux, v1, fp1a_0 ;; xma.l fp0b_2 = u_2, v0, f0 xma.hu fp1a_2 = u_2, v0, f0 ;; getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = uy, v1, fp1a_1 xma.hu fp2a_1 = uy, v1, fp1a_1 ;; ldf8 u_1 = [up], 8 getfsig pr1_0 = fp1b_0 ;; getfsig acc1_0 = fp2a_0 xma.l fp0b_3 = u_3, v0, f0 xma.hu fp1a_3 = u_3, v0, f0 br L(11) d448 7 a454 9 L(top): C 00 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 ldf8 u_3 = [up], 8 getfsig pr1_2 = fp1b_2 (p8) cmp.leu p6, p7 = acc0, pr0_1 (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 d456 8 a463 7 .pred.rel "mutex", p6, p7 getfsig acc1_2 = fp2a_2 st8 [rp] = s0, 8 xma.l fp0b_1 = u_1, v0, f0 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 xma.hu fp1a_1 = u_1, v0, f0 d465 7 a471 8 L(01): .pred.rel "mutex", p10, p11 getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = u_0, v1, fp1a_0 (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 xma.hu fp2a_0 = u_0, v1, fp1a_0 nop 1 d473 7 a479 8 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 ldf8 u_0 = [up], 8 getfsig pr1_3 = fp1b_3 (p6) cmp.leu p8, p9 = acc0, pr0_2 (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 d481 8 a488 7 .pred.rel "mutex", p8, p9 getfsig acc1_3 = fp2a_3 st8 [rp] = s0, 8 xma.l fp0b_2 = u_2, v0, f0 (p8) add acc0 = pr0_3, acc1_1, 1 (p9) add acc0 = pr0_3, acc1_1 xma.hu fp1a_2 = u_2, v0, f0 d490 7 a496 8 L(00): .pred.rel "mutex", p12, p13 getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = u_1, v1, fp1a_1 (p12) add s0 = pr1_2, acc0, 1 (p13) add s0 = pr1_2, acc0 xma.hu fp2a_1 = u_1, v1, fp1a_1 nop 1 d498 7 a504 8 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 ldf8 u_1 = [up], 8 getfsig pr1_0 = fp1b_0 (p8) cmp.leu p6, p7 = acc0, pr0_3 (p9) cmp.ltu p6, p7 = acc0, pr0_3 (p12) cmp.leu p10, p11 = s0, pr1_2 (p13) cmp.ltu p10, p11 = s0, pr1_2 d506 8 a513 7 .pred.rel "mutex", p6, p7 getfsig acc1_0 = fp2a_0 st8 [rp] = s0, 8 xma.l fp0b_3 = u_3, v0, f0 (p6) add acc0 = pr0_0, acc1_2, 1 (p7) add acc0 = pr0_0, acc1_2 xma.hu fp1a_3 = u_3, v0, f0 d515 7 a521 8 L(11): .pred.rel "mutex", p10, p11 getfsig pr0_2 = fp0b_2 xma.l fp1b_2 = u_2, v1, fp1a_2 (p10) add s0 = pr1_3, acc0, 1 (p11) add s0 = pr1_3, acc0 xma.hu fp2a_2 = u_2, v1, fp1a_2 nop 1 d523 7 a529 8 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 ldf8 u_2 = [up], 8 getfsig pr1_1 = fp1b_1 (p6) cmp.leu p8, p9 = acc0, pr0_0 (p7) cmp.ltu p8, p9 = acc0, pr0_0 (p10) cmp.leu p12, p13 = s0, pr1_3 (p11) cmp.ltu p12, p13 = s0, pr1_3 d531 8 a538 7 .pred.rel "mutex", p8, p9 getfsig acc1_1 = fp2a_1 st8 [rp] = s0, 8 xma.l fp0b_0 = u_0, v0, f0 (p8) add acc0 = pr0_1, acc1_3, 1 (p9) add acc0 = pr0_1, acc1_3 xma.hu fp1a_0 = u_0, v0, f0 d540 8 a547 8 L(10): .pred.rel "mutex", p12, p13 getfsig pr0_3 = fp0b_3 xma.l fp1b_3 = u_3, v1, fp1a_3 (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 xma.hu fp2a_3 = u_3, v1, fp1a_3 br.cloop.dptk L(top) d551 139 a689 135 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_2 = fp1b_2 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_1 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; .pred.rel "mutex", p6, p7 .mfi; getfsig acc1_2 = fp2a_2 xma.l fp0b_1 = u_1, v0, f0 nop 1 .mmf; (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 xma.hu fp1a_1 = u_1, v0, f0 ;; L(cj5): .pred.rel "mutex", p10, p11 .mfi; getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = u_0, v1, fp1a_0 (p10) add s0 = pr1_1, acc0, 1 .mfi; (p11) add s0 = pr1_1, acc0 xma.hu fp2a_0 = u_0, v1, fp1a_0 nop 1 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 .mmi; getfsig pr1_3 = fp1b_3 st8 [rp] = s0, 8 (p6) cmp.leu p8, p9 = acc0, pr0_2 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; .pred.rel "mutex", p8, p9 .mfi; getfsig acc1_3 = fp2a_3 xma.l fp0b_2 = u_2, v0, f0 nop 1 .mmf; (p8) add acc0 = pr0_3, acc1_1, 1 (p9) add acc0 = pr0_3, acc1_1 xma.hu fp1a_2 = u_2, v0, f0 ;; L(cj4): .pred.rel "mutex", p12, p13 .mfi; getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = u_1, v1, fp1a_1 (p12) add s0 = pr1_2, acc0, 1 .mfi; (p13) add s0 = pr1_2, acc0 xma.hu fp2a_1 = u_1, v1, fp1a_1 nop 1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_0 = fp1b_0 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_3 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3 (p12) cmp.leu p10, p11 = s0, pr1_2 (p13) cmp.ltu p10, p11 = s0, pr1_2 ;; .pred.rel "mutex", p6, p7 .mmi; getfsig acc1_0 = fp2a_0 (p6) add acc0 = pr0_0, acc1_2, 1 (p7) add acc0 = pr0_0, acc1_2 ;; L(cj3): .pred.rel "mutex", p10, p11 .mfi; getfsig pr0_2 = fp0b_2 xma.l fp1b_2 = u_2, v1, fp1a_2 (p10) add s0 = pr1_3, acc0, 1 .mfi; (p11) add s0 = pr1_3, acc0 xma.hu fp2a_2 = u_2, v1, fp1a_2 nop 1 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 .mmi; getfsig pr1_1 = fp1b_1 st8 [rp] = s0, 8 (p6) cmp.leu p8, p9 = acc0, pr0_0 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0 (p10) cmp.leu p12, p13 = s0, pr1_3 (p11) cmp.ltu p12, p13 = s0, pr1_3 ;; .pred.rel "mutex", p8, p9 .mmi; getfsig acc1_1 = fp2a_1 (p8) add acc0 = pr0_1, acc1_3, 1 (p9) add acc0 = pr0_1, acc1_3 ;; .pred.rel "mutex", p12, p13 .mmi; (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 nop 1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_2 = fp1b_2 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_1 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; .pred.rel "mutex", p6, p7 .mmi; getfsig r8 = fp2a_2 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 ;; .pred.rel "mutex", p10, p11 .mmi; (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 (p6) cmp.leu p8, p9 = acc0, pr0_2 ;; .pred.rel "mutex", p10, p11 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; .pred.rel "mutex", p8, p9 .mmi; st8 [rp] = s0, 8 (p8) add acc0 = pr1_2, acc1_1, 1 (p9) add acc0 = pr1_2, acc1_1 ;; .pred.rel "mutex", p8, p9 .mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2 (p9) cmp.ltu p10, p11 = acc0, pr1_2 (p12) add acc0 = 1, acc0 ;; .mmi; st8 [rp] = acc0, 8 (p12) cmpeqor p10, p0 = 0, acc0 nop 1 ;; .mib; (p10) add r8 = 1, r8 mov ar.lc = r2 br.ret.sptk.many b0 @ 1.1.1.1.2.1 log @sync with head. for a reference, the tree before this commit was tagged as yamt-pagecache-tag8. this commit was splitted into small chunks to avoid a limitation of cvs. ("Protocol error: too many arguments") @ text @d4 1 a4 3 dnl Contributed to the GNU project by Torbjorn Granlund. dnl Copyright 2004, 2011 Free Software Foundation, Inc. d24 5 a28 2 C Itanium: ? C Itanium 2: 1.5 d33 1 a33 1 C * Clean up feed-in code to not require zeroing several registers. d35 2 d97 5 a101 7 ifdef(`HAVE_ABI_32',` .mmi; addp4 rp = 0, rp C M I addp4 up = 0, up C M I addp4 vp = 0, vp C M I .mmi; nop 1 nop 1 zxt4 n = n C I d104 26 a129 20 .mmi; ldf8 ux = [up], 8 C M ldf8 v0 = [vp], 8 C M mov r2 = ar.lc C I0 .mmi; nop 1 C M and r14 = 3, n C M I add n = -2, n C M I ;; .mmi; ldf8 uy = [up], 8 C M ldf8 v1 = [vp] C M shr.u n = n, 2 C I .mmi; nop 1 C M cmp.eq p10, p0 = 1, r14 C M I cmp.eq p11, p0 = 2, r14 C M I ;; .mmi; nop 1 C M cmp.eq p12, p0 = 3, r14 C M I mov ar.lc = n C I0 .bbb; (p10) br.dptk L(b01) C B (p11) br.dptk L(b10) C B (p12) br.dptk L(b11) C B d131 1 d134 54 a187 54 L(b00): ldf8 u_1 = [up], 8 mov acc1_2 = 0 mov pr1_2 = 0 mov pr0_3 = 0 cmp.ne p8, p9 = r0, r0 ;; xma.l fp0b_3 = ux, v0, f0 cmp.ne p12, p13 = r0, r0 ldf8 u_2 = [up], 8 xma.hu fp1a_3 = ux, v0, f0 br.cloop.dptk L(gt4) xma.l fp0b_0 = uy, v0, f0 xma.hu fp1a_0 = uy, v0, f0 ;; getfsig acc0 = fp0b_3 xma.l fp1b_3 = ux, v1, fp1a_3 xma.hu fp2a_3 = ux, v1, fp1a_3 ;; xma.l fp0b_1 = u_1, v0, f0 xma.hu fp1a_1 = u_1, v0, f0 ;; getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = uy, v1, fp1a_0 xma.hu fp2a_0 = uy, v1, fp1a_0 ;; getfsig pr1_3 = fp1b_3 getfsig acc1_3 = fp2a_3 xma.l fp0b_2 = u_2, v0, f0 xma.hu fp1a_2 = u_2, v0, f0 br L(cj4) L(gt4): xma.l fp0b_0 = uy, v0, f0 xma.hu fp1a_0 = uy, v0, f0 ;; getfsig acc0 = fp0b_3 xma.l fp1b_3 = ux, v1, fp1a_3 ldf8 u_3 = [up], 8 xma.hu fp2a_3 = ux, v1, fp1a_3 ;; xma.l fp0b_1 = u_1, v0, f0 xma.hu fp1a_1 = u_1, v0, f0 ;; getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = uy, v1, fp1a_0 xma.hu fp2a_0 = uy, v1, fp1a_0 ;; ldf8 u_0 = [up], 8 getfsig pr1_3 = fp1b_3 xma.l fp0b_2 = u_2, v0, f0 ;; getfsig acc1_3 = fp2a_3 xma.hu fp1a_2 = u_2, v0, f0 br L(00) d191 47 a237 47 L(b01): ldf8 u_0 = [up], 8 C M mov acc1_1 = 0 C M I mov pr1_1 = 0 C M I mov pr0_2 = 0 C M I cmp.ne p6, p7 = r0, r0 C M I ;; xma.l fp0b_2 = ux, v0, f0 C F cmp.ne p10, p11 = r0, r0 C M I ldf8 u_1 = [up], 8 C M xma.hu fp1a_2 = ux, v0, f0 C F ;; xma.l fp0b_3 = uy, v0, f0 C F xma.hu fp1a_3 = uy, v0, f0 C F ;; getfsig acc0 = fp0b_2 C M xma.l fp1b_2 = ux, v1,fp1a_2 C F ldf8 u_2 = [up], 8 C M xma.hu fp2a_2 = ux, v1,fp1a_2 C F br.cloop.dptk L(gt5) xma.l fp0b_0 = u_0, v0, f0 C F xma.hu fp1a_0 = u_0, v0, f0 C F ;; getfsig pr0_3 = fp0b_3 C M xma.l fp1b_3 = uy, v1,fp1a_3 C F xma.hu fp2a_3 = uy, v1,fp1a_3 C F ;; getfsig pr1_2 = fp1b_2 C M getfsig acc1_2 = fp2a_2 C M xma.l fp0b_1 = u_1, v0, f0 C F xma.hu fp1a_1 = u_1, v0, f0 C F br L(cj5) L(gt5): xma.l fp0b_0 = u_0, v0, f0 xma.hu fp1a_0 = u_0, v0, f0 ;; getfsig pr0_3 = fp0b_3 xma.l fp1b_3 = uy, v1, fp1a_3 xma.hu fp2a_3 = uy, v1, fp1a_3 ;; ldf8 u_3 = [up], 8 getfsig pr1_2 = fp1b_2 xma.l fp0b_1 = u_1, v0, f0 ;; getfsig acc1_2 = fp2a_2 xma.hu fp1a_1 = u_1, v0, f0 br L(01) d240 2 d243 141 a383 70 L(b10): br.cloop.dptk L(gt2) xma.l fp0b_1 = ux, v0, f0 xma.hu fp1a_1 = ux, v0, f0 ;; xma.l fp0b_2 = uy, v0, f0 xma.hu fp1a_2 = uy, v0, f0 ;; stf8 [rp] = fp0b_1, 8 xma.l fp1b_1 = ux, v1, fp1a_1 xma.hu fp2a_1 = ux, v1, fp1a_1 ;; getfsig acc0 = fp0b_2 xma.l fp1b_2 = uy, v1, fp1a_2 xma.hu fp2a_2 = uy, v1, fp1a_2 ;; getfsig pr1_1 = fp1b_1 getfsig acc1_1 = fp2a_1 mov ar.lc = r2 getfsig pr1_2 = fp1b_2 getfsig r8 = fp2a_2 ;; add s0 = pr1_1, acc0 ;; st8 [rp] = s0, 8 cmp.ltu p8, p9 = s0, pr1_1 sub r31 = -1, acc1_1 ;; .pred.rel "mutex", p8, p9 (p8) add acc0 = pr1_2, acc1_1, 1 (p9) add acc0 = pr1_2, acc1_1 (p8) cmp.leu p10, p0 = r31, pr1_2 (p9) cmp.ltu p10, p0 = r31, pr1_2 ;; st8 [rp] = acc0, 8 (p10) add r8 = 1, r8 br.ret.sptk.many b0 L(gt2): ldf8 u_3 = [up], 8 mov acc1_0 = 0 mov pr1_0 = 0 ;; mov pr0_1 = 0 xma.l fp0b_1 = ux, v0, f0 ldf8 u_0 = [up], 8 xma.hu fp1a_1 = ux, v0, f0 ;; xma.l fp0b_2 = uy, v0, f0 xma.hu fp1a_2 = uy, v0, f0 ;; getfsig acc0 = fp0b_1 xma.l fp1b_1 = ux, v1, fp1a_1 xma.hu fp2a_1 = ux, v1, fp1a_1 ;; ldf8 u_1 = [up], 8 xma.l fp0b_3 = u_3, v0, f0 xma.hu fp1a_3 = u_3, v0, f0 ;; getfsig pr0_2 = fp0b_2 xma.l fp1b_2 = uy, v1, fp1a_2 xma.hu fp2a_2 = uy, v1, fp1a_2 ;; ldf8 u_2 = [up], 8 getfsig pr1_1 = fp1b_1 ;; .mfi; getfsig acc1_1 = fp2a_1 xma.l fp0b_0 = u_0, v0, f0 cmp.ne p8, p9 = r0, r0 .mfb; cmp.ne p12, p13 = r0, r0 xma.hu fp1a_0 = u_0, v0, f0 br L(10) d387 57 a443 56 L(b11): mov acc1_3 = 0 mov pr1_3 = 0 mov pr0_0 = 0 ldf8 u_2 = [up], 8 cmp.ne p6, p7 = r0, r0 br.cloop.dptk L(gt3) ;; xma.l fp0b_0 = ux, v0, f0 xma.hu fp1a_0 = ux, v0, f0 ;; cmp.ne p10, p11 = r0, r0 xma.l fp0b_1 = uy, v0, f0 xma.hu fp1a_1 = uy, v0, f0 ;; getfsig acc0 = fp0b_0 xma.l fp1b_0 = ux, v1, fp1a_0 xma.hu fp2a_0 = ux, v1, fp1a_0 ;; xma.l fp0b_2 = u_2, v0, f0 xma.hu fp1a_2 = u_2, v0, f0 ;; getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = uy, v1, fp1a_1 xma.hu fp2a_1 = uy, v1, fp1a_1 ;; getfsig pr1_0 = fp1b_0 getfsig acc1_0 = fp2a_0 br L(cj3) L(gt3): xma.l fp0b_0 = ux, v0, f0 cmp.ne p10, p11 = r0, r0 ldf8 u_3 = [up], 8 xma.hu fp1a_0 = ux, v0, f0 ;; xma.l fp0b_1 = uy, v0, f0 xma.hu fp1a_1 = uy, v0, f0 ;; getfsig acc0 = fp0b_0 xma.l fp1b_0 = ux, v1, fp1a_0 ldf8 u_0 = [up], 8 xma.hu fp2a_0 = ux, v1, fp1a_0 ;; xma.l fp0b_2 = u_2, v0, f0 xma.hu fp1a_2 = u_2, v0, f0 ;; getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = uy, v1, fp1a_1 xma.hu fp2a_1 = uy, v1, fp1a_1 ;; ldf8 u_1 = [up], 8 getfsig pr1_0 = fp1b_0 ;; getfsig acc1_0 = fp2a_0 xma.l fp0b_3 = u_3, v0, f0 xma.hu fp1a_3 = u_3, v0, f0 br L(11) d448 7 a454 9 L(top): C 00 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 ldf8 u_3 = [up], 8 getfsig pr1_2 = fp1b_2 (p8) cmp.leu p6, p7 = acc0, pr0_1 (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 d456 8 a463 7 .pred.rel "mutex", p6, p7 getfsig acc1_2 = fp2a_2 st8 [rp] = s0, 8 xma.l fp0b_1 = u_1, v0, f0 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 xma.hu fp1a_1 = u_1, v0, f0 d465 7 a471 8 L(01): .pred.rel "mutex", p10, p11 getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = u_0, v1, fp1a_0 (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 xma.hu fp2a_0 = u_0, v1, fp1a_0 nop 1 d473 7 a479 8 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 ldf8 u_0 = [up], 8 getfsig pr1_3 = fp1b_3 (p6) cmp.leu p8, p9 = acc0, pr0_2 (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 d481 8 a488 7 .pred.rel "mutex", p8, p9 getfsig acc1_3 = fp2a_3 st8 [rp] = s0, 8 xma.l fp0b_2 = u_2, v0, f0 (p8) add acc0 = pr0_3, acc1_1, 1 (p9) add acc0 = pr0_3, acc1_1 xma.hu fp1a_2 = u_2, v0, f0 d490 7 a496 8 L(00): .pred.rel "mutex", p12, p13 getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = u_1, v1, fp1a_1 (p12) add s0 = pr1_2, acc0, 1 (p13) add s0 = pr1_2, acc0 xma.hu fp2a_1 = u_1, v1, fp1a_1 nop 1 d498 7 a504 8 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 ldf8 u_1 = [up], 8 getfsig pr1_0 = fp1b_0 (p8) cmp.leu p6, p7 = acc0, pr0_3 (p9) cmp.ltu p6, p7 = acc0, pr0_3 (p12) cmp.leu p10, p11 = s0, pr1_2 (p13) cmp.ltu p10, p11 = s0, pr1_2 d506 8 a513 7 .pred.rel "mutex", p6, p7 getfsig acc1_0 = fp2a_0 st8 [rp] = s0, 8 xma.l fp0b_3 = u_3, v0, f0 (p6) add acc0 = pr0_0, acc1_2, 1 (p7) add acc0 = pr0_0, acc1_2 xma.hu fp1a_3 = u_3, v0, f0 d515 7 a521 8 L(11): .pred.rel "mutex", p10, p11 getfsig pr0_2 = fp0b_2 xma.l fp1b_2 = u_2, v1, fp1a_2 (p10) add s0 = pr1_3, acc0, 1 (p11) add s0 = pr1_3, acc0 xma.hu fp2a_2 = u_2, v1, fp1a_2 nop 1 d523 7 a529 8 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 ldf8 u_2 = [up], 8 getfsig pr1_1 = fp1b_1 (p6) cmp.leu p8, p9 = acc0, pr0_0 (p7) cmp.ltu p8, p9 = acc0, pr0_0 (p10) cmp.leu p12, p13 = s0, pr1_3 (p11) cmp.ltu p12, p13 = s0, pr1_3 d531 8 a538 7 .pred.rel "mutex", p8, p9 getfsig acc1_1 = fp2a_1 st8 [rp] = s0, 8 xma.l fp0b_0 = u_0, v0, f0 (p8) add acc0 = pr0_1, acc1_3, 1 (p9) add acc0 = pr0_1, acc1_3 xma.hu fp1a_0 = u_0, v0, f0 d540 8 a547 8 L(10): .pred.rel "mutex", p12, p13 getfsig pr0_3 = fp0b_3 xma.l fp1b_3 = u_3, v1, fp1a_3 (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 xma.hu fp2a_3 = u_3, v1, fp1a_3 br.cloop.dptk L(top) d551 139 a689 135 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_2 = fp1b_2 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_1 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; .pred.rel "mutex", p6, p7 .mfi; getfsig acc1_2 = fp2a_2 xma.l fp0b_1 = u_1, v0, f0 nop 1 .mmf; (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 xma.hu fp1a_1 = u_1, v0, f0 ;; L(cj5): .pred.rel "mutex", p10, p11 .mfi; getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = u_0, v1, fp1a_0 (p10) add s0 = pr1_1, acc0, 1 .mfi; (p11) add s0 = pr1_1, acc0 xma.hu fp2a_0 = u_0, v1, fp1a_0 nop 1 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 .mmi; getfsig pr1_3 = fp1b_3 st8 [rp] = s0, 8 (p6) cmp.leu p8, p9 = acc0, pr0_2 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; .pred.rel "mutex", p8, p9 .mfi; getfsig acc1_3 = fp2a_3 xma.l fp0b_2 = u_2, v0, f0 nop 1 .mmf; (p8) add acc0 = pr0_3, acc1_1, 1 (p9) add acc0 = pr0_3, acc1_1 xma.hu fp1a_2 = u_2, v0, f0 ;; L(cj4): .pred.rel "mutex", p12, p13 .mfi; getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = u_1, v1, fp1a_1 (p12) add s0 = pr1_2, acc0, 1 .mfi; (p13) add s0 = pr1_2, acc0 xma.hu fp2a_1 = u_1, v1, fp1a_1 nop 1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_0 = fp1b_0 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_3 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3 (p12) cmp.leu p10, p11 = s0, pr1_2 (p13) cmp.ltu p10, p11 = s0, pr1_2 ;; .pred.rel "mutex", p6, p7 .mmi; getfsig acc1_0 = fp2a_0 (p6) add acc0 = pr0_0, acc1_2, 1 (p7) add acc0 = pr0_0, acc1_2 ;; L(cj3): .pred.rel "mutex", p10, p11 .mfi; getfsig pr0_2 = fp0b_2 xma.l fp1b_2 = u_2, v1, fp1a_2 (p10) add s0 = pr1_3, acc0, 1 .mfi; (p11) add s0 = pr1_3, acc0 xma.hu fp2a_2 = u_2, v1, fp1a_2 nop 1 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 .mmi; getfsig pr1_1 = fp1b_1 st8 [rp] = s0, 8 (p6) cmp.leu p8, p9 = acc0, pr0_0 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0 (p10) cmp.leu p12, p13 = s0, pr1_3 (p11) cmp.ltu p12, p13 = s0, pr1_3 ;; .pred.rel "mutex", p8, p9 .mmi; getfsig acc1_1 = fp2a_1 (p8) add acc0 = pr0_1, acc1_3, 1 (p9) add acc0 = pr0_1, acc1_3 ;; .pred.rel "mutex", p12, p13 .mmi; (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 nop 1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_2 = fp1b_2 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_1 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; .pred.rel "mutex", p6, p7 .mmi; getfsig r8 = fp2a_2 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 ;; .pred.rel "mutex", p10, p11 .mmi; (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 (p6) cmp.leu p8, p9 = acc0, pr0_2 ;; .pred.rel "mutex", p10, p11 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; .pred.rel "mutex", p8, p9 .mmi; st8 [rp] = s0, 8 (p8) add acc0 = pr1_2, acc1_1, 1 (p9) add acc0 = pr1_2, acc1_1 ;; .pred.rel "mutex", p8, p9 .mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2 (p9) cmp.ltu p10, p11 = acc0, pr1_2 (p12) add acc0 = 1, acc0 ;; .mmi; st8 [rp] = acc0, 8 (p12) cmpeqor p10, p0 = 0, acc0 nop 1 ;; .mib; (p10) add r8 = 1, r8 mov ar.lc = r2 br.ret.sptk.many b0 @ 1.1.1.2 log @initial import GMP 5.1.3 sources. changes include: fixes for: - mpn_sbpi1_div_qr_sec and mpn_sbpi1_div_r_sec - mpz_powm_ui - AMD family 11h - mpz_powm_sec and mpn_powm_sec - ASSERT() fixes - gcd, gcdext, and invert function fixes - some PPC division operations @ text @d4 1 a4 3 dnl Contributed to the GNU project by Torbjorn Granlund. dnl Copyright 2004, 2011 Free Software Foundation, Inc. d24 5 a28 2 C Itanium: ? C Itanium 2: 1.5 d33 1 a33 1 C * Clean up feed-in code to not require zeroing several registers. d35 2 d97 5 a101 7 ifdef(`HAVE_ABI_32',` .mmi; addp4 rp = 0, rp C M I addp4 up = 0, up C M I addp4 vp = 0, vp C M I .mmi; nop 1 nop 1 zxt4 n = n C I d104 26 a129 20 .mmi; ldf8 ux = [up], 8 C M ldf8 v0 = [vp], 8 C M mov r2 = ar.lc C I0 .mmi; nop 1 C M and r14 = 3, n C M I add n = -2, n C M I ;; .mmi; ldf8 uy = [up], 8 C M ldf8 v1 = [vp] C M shr.u n = n, 2 C I .mmi; nop 1 C M cmp.eq p10, p0 = 1, r14 C M I cmp.eq p11, p0 = 2, r14 C M I ;; .mmi; nop 1 C M cmp.eq p12, p0 = 3, r14 C M I mov ar.lc = n C I0 .bbb; (p10) br.dptk L(b01) C B (p11) br.dptk L(b10) C B (p12) br.dptk L(b11) C B d131 1 d134 54 a187 54 L(b00): ldf8 u_1 = [up], 8 mov acc1_2 = 0 mov pr1_2 = 0 mov pr0_3 = 0 cmp.ne p8, p9 = r0, r0 ;; xma.l fp0b_3 = ux, v0, f0 cmp.ne p12, p13 = r0, r0 ldf8 u_2 = [up], 8 xma.hu fp1a_3 = ux, v0, f0 br.cloop.dptk L(gt4) xma.l fp0b_0 = uy, v0, f0 xma.hu fp1a_0 = uy, v0, f0 ;; getfsig acc0 = fp0b_3 xma.l fp1b_3 = ux, v1, fp1a_3 xma.hu fp2a_3 = ux, v1, fp1a_3 ;; xma.l fp0b_1 = u_1, v0, f0 xma.hu fp1a_1 = u_1, v0, f0 ;; getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = uy, v1, fp1a_0 xma.hu fp2a_0 = uy, v1, fp1a_0 ;; getfsig pr1_3 = fp1b_3 getfsig acc1_3 = fp2a_3 xma.l fp0b_2 = u_2, v0, f0 xma.hu fp1a_2 = u_2, v0, f0 br L(cj4) L(gt4): xma.l fp0b_0 = uy, v0, f0 xma.hu fp1a_0 = uy, v0, f0 ;; getfsig acc0 = fp0b_3 xma.l fp1b_3 = ux, v1, fp1a_3 ldf8 u_3 = [up], 8 xma.hu fp2a_3 = ux, v1, fp1a_3 ;; xma.l fp0b_1 = u_1, v0, f0 xma.hu fp1a_1 = u_1, v0, f0 ;; getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = uy, v1, fp1a_0 xma.hu fp2a_0 = uy, v1, fp1a_0 ;; ldf8 u_0 = [up], 8 getfsig pr1_3 = fp1b_3 xma.l fp0b_2 = u_2, v0, f0 ;; getfsig acc1_3 = fp2a_3 xma.hu fp1a_2 = u_2, v0, f0 br L(00) d191 47 a237 47 L(b01): ldf8 u_0 = [up], 8 C M mov acc1_1 = 0 C M I mov pr1_1 = 0 C M I mov pr0_2 = 0 C M I cmp.ne p6, p7 = r0, r0 C M I ;; xma.l fp0b_2 = ux, v0, f0 C F cmp.ne p10, p11 = r0, r0 C M I ldf8 u_1 = [up], 8 C M xma.hu fp1a_2 = ux, v0, f0 C F ;; xma.l fp0b_3 = uy, v0, f0 C F xma.hu fp1a_3 = uy, v0, f0 C F ;; getfsig acc0 = fp0b_2 C M xma.l fp1b_2 = ux, v1,fp1a_2 C F ldf8 u_2 = [up], 8 C M xma.hu fp2a_2 = ux, v1,fp1a_2 C F br.cloop.dptk L(gt5) xma.l fp0b_0 = u_0, v0, f0 C F xma.hu fp1a_0 = u_0, v0, f0 C F ;; getfsig pr0_3 = fp0b_3 C M xma.l fp1b_3 = uy, v1,fp1a_3 C F xma.hu fp2a_3 = uy, v1,fp1a_3 C F ;; getfsig pr1_2 = fp1b_2 C M getfsig acc1_2 = fp2a_2 C M xma.l fp0b_1 = u_1, v0, f0 C F xma.hu fp1a_1 = u_1, v0, f0 C F br L(cj5) L(gt5): xma.l fp0b_0 = u_0, v0, f0 xma.hu fp1a_0 = u_0, v0, f0 ;; getfsig pr0_3 = fp0b_3 xma.l fp1b_3 = uy, v1, fp1a_3 xma.hu fp2a_3 = uy, v1, fp1a_3 ;; ldf8 u_3 = [up], 8 getfsig pr1_2 = fp1b_2 xma.l fp0b_1 = u_1, v0, f0 ;; getfsig acc1_2 = fp2a_2 xma.hu fp1a_1 = u_1, v0, f0 br L(01) d240 2 d243 141 a383 70 L(b10): br.cloop.dptk L(gt2) xma.l fp0b_1 = ux, v0, f0 xma.hu fp1a_1 = ux, v0, f0 ;; xma.l fp0b_2 = uy, v0, f0 xma.hu fp1a_2 = uy, v0, f0 ;; stf8 [rp] = fp0b_1, 8 xma.l fp1b_1 = ux, v1, fp1a_1 xma.hu fp2a_1 = ux, v1, fp1a_1 ;; getfsig acc0 = fp0b_2 xma.l fp1b_2 = uy, v1, fp1a_2 xma.hu fp2a_2 = uy, v1, fp1a_2 ;; getfsig pr1_1 = fp1b_1 getfsig acc1_1 = fp2a_1 mov ar.lc = r2 getfsig pr1_2 = fp1b_2 getfsig r8 = fp2a_2 ;; add s0 = pr1_1, acc0 ;; st8 [rp] = s0, 8 cmp.ltu p8, p9 = s0, pr1_1 sub r31 = -1, acc1_1 ;; .pred.rel "mutex", p8, p9 (p8) add acc0 = pr1_2, acc1_1, 1 (p9) add acc0 = pr1_2, acc1_1 (p8) cmp.leu p10, p0 = r31, pr1_2 (p9) cmp.ltu p10, p0 = r31, pr1_2 ;; st8 [rp] = acc0, 8 (p10) add r8 = 1, r8 br.ret.sptk.many b0 L(gt2): ldf8 u_3 = [up], 8 mov acc1_0 = 0 mov pr1_0 = 0 ;; mov pr0_1 = 0 xma.l fp0b_1 = ux, v0, f0 ldf8 u_0 = [up], 8 xma.hu fp1a_1 = ux, v0, f0 ;; xma.l fp0b_2 = uy, v0, f0 xma.hu fp1a_2 = uy, v0, f0 ;; getfsig acc0 = fp0b_1 xma.l fp1b_1 = ux, v1, fp1a_1 xma.hu fp2a_1 = ux, v1, fp1a_1 ;; ldf8 u_1 = [up], 8 xma.l fp0b_3 = u_3, v0, f0 xma.hu fp1a_3 = u_3, v0, f0 ;; getfsig pr0_2 = fp0b_2 xma.l fp1b_2 = uy, v1, fp1a_2 xma.hu fp2a_2 = uy, v1, fp1a_2 ;; ldf8 u_2 = [up], 8 getfsig pr1_1 = fp1b_1 ;; .mfi; getfsig acc1_1 = fp2a_1 xma.l fp0b_0 = u_0, v0, f0 cmp.ne p8, p9 = r0, r0 .mfb; cmp.ne p12, p13 = r0, r0 xma.hu fp1a_0 = u_0, v0, f0 br L(10) d387 57 a443 56 L(b11): mov acc1_3 = 0 mov pr1_3 = 0 mov pr0_0 = 0 ldf8 u_2 = [up], 8 cmp.ne p6, p7 = r0, r0 br.cloop.dptk L(gt3) ;; xma.l fp0b_0 = ux, v0, f0 xma.hu fp1a_0 = ux, v0, f0 ;; cmp.ne p10, p11 = r0, r0 xma.l fp0b_1 = uy, v0, f0 xma.hu fp1a_1 = uy, v0, f0 ;; getfsig acc0 = fp0b_0 xma.l fp1b_0 = ux, v1, fp1a_0 xma.hu fp2a_0 = ux, v1, fp1a_0 ;; xma.l fp0b_2 = u_2, v0, f0 xma.hu fp1a_2 = u_2, v0, f0 ;; getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = uy, v1, fp1a_1 xma.hu fp2a_1 = uy, v1, fp1a_1 ;; getfsig pr1_0 = fp1b_0 getfsig acc1_0 = fp2a_0 br L(cj3) L(gt3): xma.l fp0b_0 = ux, v0, f0 cmp.ne p10, p11 = r0, r0 ldf8 u_3 = [up], 8 xma.hu fp1a_0 = ux, v0, f0 ;; xma.l fp0b_1 = uy, v0, f0 xma.hu fp1a_1 = uy, v0, f0 ;; getfsig acc0 = fp0b_0 xma.l fp1b_0 = ux, v1, fp1a_0 ldf8 u_0 = [up], 8 xma.hu fp2a_0 = ux, v1, fp1a_0 ;; xma.l fp0b_2 = u_2, v0, f0 xma.hu fp1a_2 = u_2, v0, f0 ;; getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = uy, v1, fp1a_1 xma.hu fp2a_1 = uy, v1, fp1a_1 ;; ldf8 u_1 = [up], 8 getfsig pr1_0 = fp1b_0 ;; getfsig acc1_0 = fp2a_0 xma.l fp0b_3 = u_3, v0, f0 xma.hu fp1a_3 = u_3, v0, f0 br L(11) d448 7 a454 9 L(top): C 00 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 ldf8 u_3 = [up], 8 getfsig pr1_2 = fp1b_2 (p8) cmp.leu p6, p7 = acc0, pr0_1 (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 d456 8 a463 7 .pred.rel "mutex", p6, p7 getfsig acc1_2 = fp2a_2 st8 [rp] = s0, 8 xma.l fp0b_1 = u_1, v0, f0 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 xma.hu fp1a_1 = u_1, v0, f0 d465 7 a471 8 L(01): .pred.rel "mutex", p10, p11 getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = u_0, v1, fp1a_0 (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 xma.hu fp2a_0 = u_0, v1, fp1a_0 nop 1 d473 7 a479 8 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 ldf8 u_0 = [up], 8 getfsig pr1_3 = fp1b_3 (p6) cmp.leu p8, p9 = acc0, pr0_2 (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 d481 8 a488 7 .pred.rel "mutex", p8, p9 getfsig acc1_3 = fp2a_3 st8 [rp] = s0, 8 xma.l fp0b_2 = u_2, v0, f0 (p8) add acc0 = pr0_3, acc1_1, 1 (p9) add acc0 = pr0_3, acc1_1 xma.hu fp1a_2 = u_2, v0, f0 d490 7 a496 8 L(00): .pred.rel "mutex", p12, p13 getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = u_1, v1, fp1a_1 (p12) add s0 = pr1_2, acc0, 1 (p13) add s0 = pr1_2, acc0 xma.hu fp2a_1 = u_1, v1, fp1a_1 nop 1 d498 7 a504 8 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 ldf8 u_1 = [up], 8 getfsig pr1_0 = fp1b_0 (p8) cmp.leu p6, p7 = acc0, pr0_3 (p9) cmp.ltu p6, p7 = acc0, pr0_3 (p12) cmp.leu p10, p11 = s0, pr1_2 (p13) cmp.ltu p10, p11 = s0, pr1_2 d506 8 a513 7 .pred.rel "mutex", p6, p7 getfsig acc1_0 = fp2a_0 st8 [rp] = s0, 8 xma.l fp0b_3 = u_3, v0, f0 (p6) add acc0 = pr0_0, acc1_2, 1 (p7) add acc0 = pr0_0, acc1_2 xma.hu fp1a_3 = u_3, v0, f0 d515 7 a521 8 L(11): .pred.rel "mutex", p10, p11 getfsig pr0_2 = fp0b_2 xma.l fp1b_2 = u_2, v1, fp1a_2 (p10) add s0 = pr1_3, acc0, 1 (p11) add s0 = pr1_3, acc0 xma.hu fp2a_2 = u_2, v1, fp1a_2 nop 1 d523 7 a529 8 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 ldf8 u_2 = [up], 8 getfsig pr1_1 = fp1b_1 (p6) cmp.leu p8, p9 = acc0, pr0_0 (p7) cmp.ltu p8, p9 = acc0, pr0_0 (p10) cmp.leu p12, p13 = s0, pr1_3 (p11) cmp.ltu p12, p13 = s0, pr1_3 d531 8 a538 7 .pred.rel "mutex", p8, p9 getfsig acc1_1 = fp2a_1 st8 [rp] = s0, 8 xma.l fp0b_0 = u_0, v0, f0 (p8) add acc0 = pr0_1, acc1_3, 1 (p9) add acc0 = pr0_1, acc1_3 xma.hu fp1a_0 = u_0, v0, f0 d540 8 a547 8 L(10): .pred.rel "mutex", p12, p13 getfsig pr0_3 = fp0b_3 xma.l fp1b_3 = u_3, v1, fp1a_3 (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 xma.hu fp2a_3 = u_3, v1, fp1a_3 br.cloop.dptk L(top) d551 139 a689 135 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_2 = fp1b_2 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_1 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; .pred.rel "mutex", p6, p7 .mfi; getfsig acc1_2 = fp2a_2 xma.l fp0b_1 = u_1, v0, f0 nop 1 .mmf; (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 xma.hu fp1a_1 = u_1, v0, f0 ;; L(cj5): .pred.rel "mutex", p10, p11 .mfi; getfsig pr0_0 = fp0b_0 xma.l fp1b_0 = u_0, v1, fp1a_0 (p10) add s0 = pr1_1, acc0, 1 .mfi; (p11) add s0 = pr1_1, acc0 xma.hu fp2a_0 = u_0, v1, fp1a_0 nop 1 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 .mmi; getfsig pr1_3 = fp1b_3 st8 [rp] = s0, 8 (p6) cmp.leu p8, p9 = acc0, pr0_2 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; .pred.rel "mutex", p8, p9 .mfi; getfsig acc1_3 = fp2a_3 xma.l fp0b_2 = u_2, v0, f0 nop 1 .mmf; (p8) add acc0 = pr0_3, acc1_1, 1 (p9) add acc0 = pr0_3, acc1_1 xma.hu fp1a_2 = u_2, v0, f0 ;; L(cj4): .pred.rel "mutex", p12, p13 .mfi; getfsig pr0_1 = fp0b_1 xma.l fp1b_1 = u_1, v1, fp1a_1 (p12) add s0 = pr1_2, acc0, 1 .mfi; (p13) add s0 = pr1_2, acc0 xma.hu fp2a_1 = u_1, v1, fp1a_1 nop 1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_0 = fp1b_0 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_3 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3 (p12) cmp.leu p10, p11 = s0, pr1_2 (p13) cmp.ltu p10, p11 = s0, pr1_2 ;; .pred.rel "mutex", p6, p7 .mmi; getfsig acc1_0 = fp2a_0 (p6) add acc0 = pr0_0, acc1_2, 1 (p7) add acc0 = pr0_0, acc1_2 ;; L(cj3): .pred.rel "mutex", p10, p11 .mfi; getfsig pr0_2 = fp0b_2 xma.l fp1b_2 = u_2, v1, fp1a_2 (p10) add s0 = pr1_3, acc0, 1 .mfi; (p11) add s0 = pr1_3, acc0 xma.hu fp2a_2 = u_2, v1, fp1a_2 nop 1 ;; .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 .mmi; getfsig pr1_1 = fp1b_1 st8 [rp] = s0, 8 (p6) cmp.leu p8, p9 = acc0, pr0_0 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0 (p10) cmp.leu p12, p13 = s0, pr1_3 (p11) cmp.ltu p12, p13 = s0, pr1_3 ;; .pred.rel "mutex", p8, p9 .mmi; getfsig acc1_1 = fp2a_1 (p8) add acc0 = pr0_1, acc1_3, 1 (p9) add acc0 = pr0_1, acc1_3 ;; .pred.rel "mutex", p12, p13 .mmi; (p12) add s0 = pr1_0, acc0, 1 (p13) add s0 = pr1_0, acc0 nop 1 ;; .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 .mmi; getfsig pr1_2 = fp1b_2 st8 [rp] = s0, 8 (p8) cmp.leu p6, p7 = acc0, pr0_1 .mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 (p12) cmp.leu p10, p11 = s0, pr1_0 (p13) cmp.ltu p10, p11 = s0, pr1_0 ;; .pred.rel "mutex", p6, p7 .mmi; getfsig r8 = fp2a_2 (p6) add acc0 = pr0_2, acc1_0, 1 (p7) add acc0 = pr0_2, acc1_0 ;; .pred.rel "mutex", p10, p11 .mmi; (p10) add s0 = pr1_1, acc0, 1 (p11) add s0 = pr1_1, acc0 (p6) cmp.leu p8, p9 = acc0, pr0_2 ;; .pred.rel "mutex", p10, p11 .mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 (p10) cmp.leu p12, p13 = s0, pr1_1 (p11) cmp.ltu p12, p13 = s0, pr1_1 ;; .pred.rel "mutex", p8, p9 .mmi; st8 [rp] = s0, 8 (p8) add acc0 = pr1_2, acc1_1, 1 (p9) add acc0 = pr1_2, acc1_1 ;; .pred.rel "mutex", p8, p9 .mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2 (p9) cmp.ltu p10, p11 = acc0, pr1_2 (p12) add acc0 = 1, acc0 ;; .mmi; st8 [rp] = acc0, 8 (p12) cmpeqor p10, p0 = 0, acc0 nop 1 ;; .mib; (p10) add r8 = 1, r8 mov ar.lc = r2 br.ret.sptk.many b0 @ 1.1.1.3 log @initial import of GMP 6.1.2. main changes from 5.1.3 below. notes: - support for thumb-less ARM chips was in our port of 5.1.3, but a similar method has been provided upstream now - someone should look at the AVX failure reports, and fix them Changes between GMP version 6.1.0 and 6.1.1 FEATURES * Work around faulty cpuid on some recent Intel chips (this allows GMP to run on Skylake Pentiums). * Support thumb-less ARM chips. Changes between GMP version 6.0.* and 6.1.0 BUGS FIXED * The public function mpn_com is now correctly declared in gmp.h. * Healed possible failures of mpn_sec_sqr for non-cryptographic sizes for some obsolete CPUs. * Various problems related to precision for mpf have been fixed. * Fixed ABI incompatible stack alignment in calls from assembly code. * Fixed PIC bug in popcount affecting Intel processors using the 32-bit ABI. SPEEDUPS * Speedup for Intel Broadwell and Skylake through assembly code making use of new ADX instructions. * Square root is now faster when the remainder is not needed. Also the speed to compute the k-th root improved, for small sizes. FEATURES * New C++ functions gcd and lcm for mpz_class. * New public mpn functions mpn_divexact_1, mpn_zero_p, and mpn_cnd_swap. * New public mpq_cmp_z function, to efficiently compare rationals with integers. * Support for more 32-bit arm processors. * Support for AVX-less modern x86 CPUs. (Such support might be missing either because the CPU vendor chose to disable AVX, or because the running kernel lacks AVX context switch support.) * Support for NetBSD under Xen; we switch off AVX unconditionally under NetBSD since a bug in NetBSD makes AVX fail under Xen. MISC * Tuned values for FFT multiplications are provided for larger number on many platforms. Changes between GMP version 5.1.* and 6.0.0 BUGS FIXED * The function mpz_invert now considers any number invertible in Z/1Z. * The mpn multiply code now handles operands of more than 2^31 limbs correctly. (Note however that the mpz code is limited to 2^32 bits on 32-bit hosts and 2^37 bits on 64-bit hosts.) SPEEDUPS * Plain division of large operands is faster and more monotonous in operand size. * Major speedup for ARM, in particular ARM Cortex-A15, thanks to improved assembly. * Speedup for Intel Sandy Bridge, Ivy Bridge, Haswell, thanks to rewritten and vastly expanded assembly support. Speedup also for the older Core 2 and Nehalem. * Faster mixed arithmetic between mpq_class and double. FEATURES * Support for new Intel and AMD CPUs. * New public functions mpn_sec_mul and mpn_sec_sqr, implementing side-channel silent multiplication and squaring. * New public functions mpn_sec_div_qr and mpn_sec_div_r, implementing side-channel silent division. * New public functions mpn_cnd_add_n and mpn_cnd_sub_n. Side-channel silent conditional addition and subtraction. * New public function mpn_sec_powm, implementing side-channel silent modexp. * New public function mpn_sec_invert, implementing side-channel silent modular inversion. * Better support for applications which use the mpz_t type, but nevertheless need to call some of the lower-level mpn functions. See the documentation for mpz_limbs_read and related functions. @ text @d9 1 a9 1 dnl d11 4 a14 14 dnl it under the terms of either: dnl dnl * the GNU Lesser General Public License as published by the Free dnl Software Foundation; either version 3 of the License, or (at your dnl option) any later version. dnl dnl or dnl dnl * the GNU General Public License as published by the Free Software dnl Foundation; either version 2 of the License, or (at your option) any dnl later version. dnl dnl or both in parallel, as here. dnl d17 5 a21 6 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License dnl for more details. dnl dnl You should have received copies of the GNU General Public License and the dnl GNU Lesser General Public License along with the GNU MP Library. If not, dnl see https://www.gnu.org/licenses/. d95 1 a95 1 {.mmi; addp4 rp = 0, rp C M I d98 1 a98 1 }{.mmi; nop 1 d101 1 a101 2 ;; }') d103 1 a103 1 {.mmi; ldf8 ux = [up], 8 C M d106 1 a106 1 }{.mmi; nop 1 C M d110 1 a110 1 }{.mmi; ldf8 uy = [up], 8 C M d112 2 a113 2 shr.u n = n, 2 C I0 }{.mmi; nop 1 C M d117 1 a117 1 }{.mmi; nop 1 C M d120 1 a120 1 }{.bbb; (p10) br.dptk L(b01) C B d124 1 a124 1 } d260 1 a260 1 .pred.rel "mutex", p8, p9 d297 1 a297 1 {.mfi; getfsig acc1_1 = fp2a_1 d300 1 a300 1 }{.mfb; cmp.ne p12, p13 = r0, r0 d303 1 a303 1 } d367 2 a368 2 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 d376 1 a376 1 .pred.rel "mutex", p6, p7 d385 1 a385 1 .pred.rel "mutex", p10, p11 d393 2 a394 2 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 d402 1 a402 1 .pred.rel "mutex", p8, p9 d411 1 a411 1 .pred.rel "mutex", p12, p13 d419 2 a420 2 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 d428 1 a428 1 .pred.rel "mutex", p6, p7 d437 1 a437 1 .pred.rel "mutex", p10, p11 d445 2 a446 2 .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 d454 1 a454 1 .pred.rel "mutex", p8, p9 d463 1 a463 1 .pred.rel "mutex", p12, p13 d473 3 a475 3 .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 {.mmi; getfsig pr1_2 = fp1b_2 d478 1 a478 1 }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 d482 2 a483 2 } .pred.rel "mutex", p6, p7 {.mfi; getfsig acc1_2 = fp2a_2 d486 1 a486 1 }{.mmf; (p6) add acc0 = pr0_2, acc1_0, 1 a489 1 } d491 2 a492 2 .pred.rel "mutex", p10, p11 {.mfi; getfsig pr0_0 = fp0b_0 d495 1 a495 1 }{.mfi; (p11) add s0 = pr1_1, acc0 d499 3 a501 3 } .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 {.mmi; getfsig pr1_3 = fp1b_3 d504 1 a504 1 }{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 d508 2 a509 2 } .pred.rel "mutex", p8, p9 {.mfi; getfsig acc1_3 = fp2a_3 d512 1 a512 1 }{.mmf; (p8) add acc0 = pr0_3, acc1_1, 1 a515 1 } d517 2 a518 2 .pred.rel "mutex", p12, p13 {.mfi; getfsig pr0_1 = fp0b_1 d521 1 a521 1 }{.mfi; (p13) add s0 = pr1_2, acc0 d525 3 a527 3 } .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 {.mmi; getfsig pr1_0 = fp1b_0 d530 1 a530 1 }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3 d534 2 a535 2 } .pred.rel "mutex", p6, p7 {.mmi; getfsig acc1_0 = fp2a_0 a538 1 } d540 2 a541 2 .pred.rel "mutex", p10, p11 {.mfi; getfsig pr0_2 = fp0b_2 d544 1 a544 1 }{.mfi; (p11) add s0 = pr1_3, acc0 d548 3 a550 3 } .pred.rel "mutex", p6, p7 .pred.rel "mutex", p10, p11 {.mmi; getfsig pr1_1 = fp1b_1 d553 1 a553 1 }{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0 d557 2 a558 2 } .pred.rel "mutex", p8, p9 {.mmi; getfsig acc1_1 = fp2a_1 d562 2 a563 2 } .pred.rel "mutex", p12, p13 {.mmi; (p12) add s0 = pr1_0, acc0, 1 d567 3 a569 3 } .pred.rel "mutex", p8, p9 .pred.rel "mutex", p12, p13 {.mmi; getfsig pr1_2 = fp1b_2 d572 1 a572 1 }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 d576 2 a577 2 } .pred.rel "mutex", p6, p7 {.mmi; getfsig r8 = fp2a_2 d581 2 a582 2 } .pred.rel "mutex", p10, p11 {.mmi; (p10) add s0 = pr1_1, acc0, 1 d586 2 a587 2 } .pred.rel "mutex", p10, p11 {.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 d591 2 a592 2 } .pred.rel "mutex", p8, p9 {.mmi; st8 [rp] = s0, 8 d596 2 a597 2 } .pred.rel "mutex", p8, p9 {.mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2 d601 1 a601 1 }{.mmi; st8 [rp] = acc0, 8 d605 1 a605 1 }{.mib; (p10) add r8 = 1, r8 a607 1 } @