diff options
author | 2023-10-10 14:33:42 +0000 | |
---|---|---|
committer | 2023-10-10 14:33:42 +0000 | |
commit | af1a266670d040d2f4083ff309d732d648afba2a (patch) | |
tree | 2fc46203448ddcc6f81546d379abfaeb323575e9 /roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm | |
parent | e02cda008591317b1625707ff8e115a4841aa889 (diff) |
Change-Id: Iaf8d18082d3991dec7c0ebbea540f092188eb4ec
Diffstat (limited to 'roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm')
9 files changed, 19804 insertions, 0 deletions
diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-armv4.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-armv4.pl new file mode 100755 index 000000000..ea538c069 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-armv4.pl @@ -0,0 +1,1852 @@ +#! /usr/bin/env perl +# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# ECP_NISTZ256 module for ARMv4. +# +# October 2014. +# +# Original ECP_NISTZ256 submission targeting x86_64 is detailed in +# http://eprint.iacr.org/2013/816. In the process of adaptation +# original .c module was made 32-bit savvy in order to make this +# implementation possible. +# +# with/without -DECP_NISTZ256_ASM +# Cortex-A8 +53-170% +# Cortex-A9 +76-205% +# Cortex-A15 +100-316% +# Snapdragon S4 +66-187% +# +# Ranges denote minimum and maximum improvement coefficients depending +# on benchmark. Lower coefficients are for ECDSA sign, server-side +# operation. Keep in mind that +200% means 3x improvement. + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +$code.=<<___; +#include "arm_arch.h" + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif +___ +######################################################################## +# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 +# +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +open TABLE,"<ecp_nistz256_table.c" or +open TABLE,"<${dir}../ecp_nistz256_table.c" or +die "failed to open ecp_nistz256_table.c:",$!; + +use integer; + +foreach(<TABLE>) { + s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; +} +close TABLE; + +# See ecp_nistz256_table.c for explanation for why it's 64*16*37. +# 64*16*37-1 is because $#arr returns last valid index or @arr, not +# amount of elements. +die "insane number of elements" if ($#arr != 64*16*37-1); + +$code.=<<___; +.globl ecp_nistz256_precomputed +.type ecp_nistz256_precomputed,%object +.align 12 +ecp_nistz256_precomputed: +___ +######################################################################## +# this conversion smashes P256_POINT_AFFINE by individual bytes with +# 64 byte interval, similar to +# 1111222233334444 +# 1234123412341234 +for(1..37) { + @tbl = splice(@arr,0,64*16); + for($i=0;$i<64;$i++) { + undef @line; + for($j=0;$j<64;$j++) { + push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; + } + $code.=".byte\t"; + $code.=join(',',map { sprintf "0x%02x",$_} @line); + $code.="\n"; + } +} +$code.=<<___; +.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed +.align 5 +.LRR: @ 2^512 mod P precomputed for NIST P256 polynomial +.long 0x00000003, 0x00000000, 0xffffffff, 0xfffffffb +.long 0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004 +.Lone: +.long 1,0,0,0,0,0,0,0 +.asciz "ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" +.align 6 +___ + +######################################################################## +# common register layout, note that $t2 is link register, so that if +# internal subroutine uses $t2, then it has to offload lr... + +($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)= + map("r$_",(0..12,14)); +($t0,$t3)=($ff,$a_ptr); + +$code.=<<___; +@ void ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]); +.globl ecp_nistz256_to_mont +.type ecp_nistz256_to_mont,%function +ecp_nistz256_to_mont: + adr $b_ptr,.LRR + b .Lecp_nistz256_mul_mont +.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont + +@ void ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]); +.globl ecp_nistz256_from_mont +.type ecp_nistz256_from_mont,%function +ecp_nistz256_from_mont: + adr $b_ptr,.Lone + b .Lecp_nistz256_mul_mont +.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont + +@ void ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]); +.globl ecp_nistz256_mul_by_2 +.type ecp_nistz256_mul_by_2,%function +.align 4 +ecp_nistz256_mul_by_2: + stmdb sp!,{r4-r12,lr} + bl __ecp_nistz256_mul_by_2 +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 + +.type __ecp_nistz256_mul_by_2,%function +.align 4 +__ecp_nistz256_mul_by_2: + ldr $a0,[$a_ptr,#0] + ldr $a1,[$a_ptr,#4] + ldr $a2,[$a_ptr,#8] + adds $a0,$a0,$a0 @ a[0:7]+=a[0:7], i.e. add with itself + ldr $a3,[$a_ptr,#12] + adcs $a1,$a1,$a1 + ldr $a4,[$a_ptr,#16] + adcs $a2,$a2,$a2 + ldr $a5,[$a_ptr,#20] + adcs $a3,$a3,$a3 + ldr $a6,[$a_ptr,#24] + adcs $a4,$a4,$a4 + ldr $a7,[$a_ptr,#28] + adcs $a5,$a5,$a5 + adcs $a6,$a6,$a6 + mov $ff,#0 + adcs $a7,$a7,$a7 + adc $ff,$ff,#0 + + b .Lreduce_by_sub +.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2 + +@ void ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8], +@ const BN_ULONG r2[8]); +.globl ecp_nistz256_add +.type ecp_nistz256_add,%function +.align 4 +ecp_nistz256_add: + stmdb sp!,{r4-r12,lr} + bl __ecp_nistz256_add +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_add,.-ecp_nistz256_add + +.type __ecp_nistz256_add,%function +.align 4 +__ecp_nistz256_add: + str lr,[sp,#-4]! @ push lr + + ldr $a0,[$a_ptr,#0] + ldr $a1,[$a_ptr,#4] + ldr $a2,[$a_ptr,#8] + ldr $a3,[$a_ptr,#12] + ldr $a4,[$a_ptr,#16] + ldr $t0,[$b_ptr,#0] + ldr $a5,[$a_ptr,#20] + ldr $t1,[$b_ptr,#4] + ldr $a6,[$a_ptr,#24] + ldr $t2,[$b_ptr,#8] + ldr $a7,[$a_ptr,#28] + ldr $t3,[$b_ptr,#12] + adds $a0,$a0,$t0 + ldr $t0,[$b_ptr,#16] + adcs $a1,$a1,$t1 + ldr $t1,[$b_ptr,#20] + adcs $a2,$a2,$t2 + ldr $t2,[$b_ptr,#24] + adcs $a3,$a3,$t3 + ldr $t3,[$b_ptr,#28] + adcs $a4,$a4,$t0 + adcs $a5,$a5,$t1 + adcs $a6,$a6,$t2 + mov $ff,#0 + adcs $a7,$a7,$t3 + adc $ff,$ff,#0 + ldr lr,[sp],#4 @ pop lr + +.Lreduce_by_sub: + + @ if a+b >= modulus, subtract modulus. + @ + @ But since comparison implies subtraction, we subtract + @ modulus and then add it back if subtraction borrowed. + + subs $a0,$a0,#-1 + sbcs $a1,$a1,#-1 + sbcs $a2,$a2,#-1 + sbcs $a3,$a3,#0 + sbcs $a4,$a4,#0 + sbcs $a5,$a5,#0 + sbcs $a6,$a6,#1 + sbcs $a7,$a7,#-1 + sbc $ff,$ff,#0 + + @ Note that because mod has special form, i.e. consists of + @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by + @ using value of borrow as a whole or extracting single bit. + @ Follow $ff register... + + adds $a0,$a0,$ff @ add synthesized modulus + adcs $a1,$a1,$ff + str $a0,[$r_ptr,#0] + adcs $a2,$a2,$ff + str $a1,[$r_ptr,#4] + adcs $a3,$a3,#0 + str $a2,[$r_ptr,#8] + adcs $a4,$a4,#0 + str $a3,[$r_ptr,#12] + adcs $a5,$a5,#0 + str $a4,[$r_ptr,#16] + adcs $a6,$a6,$ff,lsr#31 + str $a5,[$r_ptr,#20] + adcs $a7,$a7,$ff + str $a6,[$r_ptr,#24] + str $a7,[$r_ptr,#28] + + mov pc,lr +.size __ecp_nistz256_add,.-__ecp_nistz256_add + +@ void ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]); +.globl ecp_nistz256_mul_by_3 +.type ecp_nistz256_mul_by_3,%function +.align 4 +ecp_nistz256_mul_by_3: + stmdb sp!,{r4-r12,lr} + bl __ecp_nistz256_mul_by_3 +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 + +.type __ecp_nistz256_mul_by_3,%function +.align 4 +__ecp_nistz256_mul_by_3: + str lr,[sp,#-4]! @ push lr + + @ As multiplication by 3 is performed as 2*n+n, below are inline + @ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see + @ corresponding subroutines for details. + + ldr $a0,[$a_ptr,#0] + ldr $a1,[$a_ptr,#4] + ldr $a2,[$a_ptr,#8] + adds $a0,$a0,$a0 @ a[0:7]+=a[0:7] + ldr $a3,[$a_ptr,#12] + adcs $a1,$a1,$a1 + ldr $a4,[$a_ptr,#16] + adcs $a2,$a2,$a2 + ldr $a5,[$a_ptr,#20] + adcs $a3,$a3,$a3 + ldr $a6,[$a_ptr,#24] + adcs $a4,$a4,$a4 + ldr $a7,[$a_ptr,#28] + adcs $a5,$a5,$a5 + adcs $a6,$a6,$a6 + mov $ff,#0 + adcs $a7,$a7,$a7 + adc $ff,$ff,#0 + + subs $a0,$a0,#-1 @ .Lreduce_by_sub but without stores + sbcs $a1,$a1,#-1 + sbcs $a2,$a2,#-1 + sbcs $a3,$a3,#0 + sbcs $a4,$a4,#0 + sbcs $a5,$a5,#0 + sbcs $a6,$a6,#1 + sbcs $a7,$a7,#-1 + sbc $ff,$ff,#0 + + adds $a0,$a0,$ff @ add synthesized modulus + adcs $a1,$a1,$ff + adcs $a2,$a2,$ff + adcs $a3,$a3,#0 + adcs $a4,$a4,#0 + ldr $b_ptr,[$a_ptr,#0] + adcs $a5,$a5,#0 + ldr $t1,[$a_ptr,#4] + adcs $a6,$a6,$ff,lsr#31 + ldr $t2,[$a_ptr,#8] + adc $a7,$a7,$ff + + ldr $t0,[$a_ptr,#12] + adds $a0,$a0,$b_ptr @ 2*a[0:7]+=a[0:7] + ldr $b_ptr,[$a_ptr,#16] + adcs $a1,$a1,$t1 + ldr $t1,[$a_ptr,#20] + adcs $a2,$a2,$t2 + ldr $t2,[$a_ptr,#24] + adcs $a3,$a3,$t0 + ldr $t3,[$a_ptr,#28] + adcs $a4,$a4,$b_ptr + adcs $a5,$a5,$t1 + adcs $a6,$a6,$t2 + mov $ff,#0 + adcs $a7,$a7,$t3 + adc $ff,$ff,#0 + ldr lr,[sp],#4 @ pop lr + + b .Lreduce_by_sub +.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 + +@ void ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]); +.globl ecp_nistz256_div_by_2 +.type ecp_nistz256_div_by_2,%function +.align 4 +ecp_nistz256_div_by_2: + stmdb sp!,{r4-r12,lr} + bl __ecp_nistz256_div_by_2 +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 + +.type __ecp_nistz256_div_by_2,%function +.align 4 +__ecp_nistz256_div_by_2: + @ ret = (a is odd ? a+mod : a) >> 1 + + ldr $a0,[$a_ptr,#0] + ldr $a1,[$a_ptr,#4] + ldr $a2,[$a_ptr,#8] + mov $ff,$a0,lsl#31 @ place least significant bit to most + @ significant position, now arithmetic + @ right shift by 31 will produce -1 or + @ 0, while logical right shift 1 or 0, + @ this is how modulus is conditionally + @ synthesized in this case... + ldr $a3,[$a_ptr,#12] + adds $a0,$a0,$ff,asr#31 + ldr $a4,[$a_ptr,#16] + adcs $a1,$a1,$ff,asr#31 + ldr $a5,[$a_ptr,#20] + adcs $a2,$a2,$ff,asr#31 + ldr $a6,[$a_ptr,#24] + adcs $a3,$a3,#0 + ldr $a7,[$a_ptr,#28] + adcs $a4,$a4,#0 + mov $a0,$a0,lsr#1 @ a[0:7]>>=1, we can start early + @ because it doesn't affect flags + adcs $a5,$a5,#0 + orr $a0,$a0,$a1,lsl#31 + adcs $a6,$a6,$ff,lsr#31 + mov $b_ptr,#0 + adcs $a7,$a7,$ff,asr#31 + mov $a1,$a1,lsr#1 + adc $b_ptr,$b_ptr,#0 @ top-most carry bit from addition + + orr $a1,$a1,$a2,lsl#31 + mov $a2,$a2,lsr#1 + str $a0,[$r_ptr,#0] + orr $a2,$a2,$a3,lsl#31 + mov $a3,$a3,lsr#1 + str $a1,[$r_ptr,#4] + orr $a3,$a3,$a4,lsl#31 + mov $a4,$a4,lsr#1 + str $a2,[$r_ptr,#8] + orr $a4,$a4,$a5,lsl#31 + mov $a5,$a5,lsr#1 + str $a3,[$r_ptr,#12] + orr $a5,$a5,$a6,lsl#31 + mov $a6,$a6,lsr#1 + str $a4,[$r_ptr,#16] + orr $a6,$a6,$a7,lsl#31 + mov $a7,$a7,lsr#1 + str $a5,[$r_ptr,#20] + orr $a7,$a7,$b_ptr,lsl#31 @ don't forget the top-most carry bit + str $a6,[$r_ptr,#24] + str $a7,[$r_ptr,#28] + + mov pc,lr +.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 + +@ void ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8], +@ const BN_ULONG r2[8]); +.globl ecp_nistz256_sub +.type ecp_nistz256_sub,%function +.align 4 +ecp_nistz256_sub: + stmdb sp!,{r4-r12,lr} + bl __ecp_nistz256_sub +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_sub,.-ecp_nistz256_sub + +.type __ecp_nistz256_sub,%function +.align 4 +__ecp_nistz256_sub: + str lr,[sp,#-4]! @ push lr + + ldr $a0,[$a_ptr,#0] + ldr $a1,[$a_ptr,#4] + ldr $a2,[$a_ptr,#8] + ldr $a3,[$a_ptr,#12] + ldr $a4,[$a_ptr,#16] + ldr $t0,[$b_ptr,#0] + ldr $a5,[$a_ptr,#20] + ldr $t1,[$b_ptr,#4] + ldr $a6,[$a_ptr,#24] + ldr $t2,[$b_ptr,#8] + ldr $a7,[$a_ptr,#28] + ldr $t3,[$b_ptr,#12] + subs $a0,$a0,$t0 + ldr $t0,[$b_ptr,#16] + sbcs $a1,$a1,$t1 + ldr $t1,[$b_ptr,#20] + sbcs $a2,$a2,$t2 + ldr $t2,[$b_ptr,#24] + sbcs $a3,$a3,$t3 + ldr $t3,[$b_ptr,#28] + sbcs $a4,$a4,$t0 + sbcs $a5,$a5,$t1 + sbcs $a6,$a6,$t2 + sbcs $a7,$a7,$t3 + sbc $ff,$ff,$ff @ broadcast borrow bit + ldr lr,[sp],#4 @ pop lr + +.Lreduce_by_add: + + @ if a-b borrows, add modulus. + @ + @ Note that because mod has special form, i.e. consists of + @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by + @ broadcasting borrow bit to a register, $ff, and using it as + @ a whole or extracting single bit. + + adds $a0,$a0,$ff @ add synthesized modulus + adcs $a1,$a1,$ff + str $a0,[$r_ptr,#0] + adcs $a2,$a2,$ff + str $a1,[$r_ptr,#4] + adcs $a3,$a3,#0 + str $a2,[$r_ptr,#8] + adcs $a4,$a4,#0 + str $a3,[$r_ptr,#12] + adcs $a5,$a5,#0 + str $a4,[$r_ptr,#16] + adcs $a6,$a6,$ff,lsr#31 + str $a5,[$r_ptr,#20] + adcs $a7,$a7,$ff + str $a6,[$r_ptr,#24] + str $a7,[$r_ptr,#28] + + mov pc,lr +.size __ecp_nistz256_sub,.-__ecp_nistz256_sub + +@ void ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]); +.globl ecp_nistz256_neg +.type ecp_nistz256_neg,%function +.align 4 +ecp_nistz256_neg: + stmdb sp!,{r4-r12,lr} + bl __ecp_nistz256_neg +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_neg,.-ecp_nistz256_neg + +.type __ecp_nistz256_neg,%function +.align 4 +__ecp_nistz256_neg: + ldr $a0,[$a_ptr,#0] + eor $ff,$ff,$ff + ldr $a1,[$a_ptr,#4] + ldr $a2,[$a_ptr,#8] + subs $a0,$ff,$a0 + ldr $a3,[$a_ptr,#12] + sbcs $a1,$ff,$a1 + ldr $a4,[$a_ptr,#16] + sbcs $a2,$ff,$a2 + ldr $a5,[$a_ptr,#20] + sbcs $a3,$ff,$a3 + ldr $a6,[$a_ptr,#24] + sbcs $a4,$ff,$a4 + ldr $a7,[$a_ptr,#28] + sbcs $a5,$ff,$a5 + sbcs $a6,$ff,$a6 + sbcs $a7,$ff,$a7 + sbc $ff,$ff,$ff + + b .Lreduce_by_add +.size __ecp_nistz256_neg,.-__ecp_nistz256_neg +___ +{ +my @acc=map("r$_",(3..11)); +my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14)); + +$code.=<<___; +@ void ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]); +.globl ecp_nistz256_sqr_mont +.type ecp_nistz256_sqr_mont,%function +.align 4 +ecp_nistz256_sqr_mont: + mov $b_ptr,$a_ptr + b .Lecp_nistz256_mul_mont +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +@ void ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8], +@ const BN_ULONG r2[8]); +.globl ecp_nistz256_mul_mont +.type ecp_nistz256_mul_mont,%function +.align 4 +ecp_nistz256_mul_mont: +.Lecp_nistz256_mul_mont: + stmdb sp!,{r4-r12,lr} + bl __ecp_nistz256_mul_mont +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +.type __ecp_nistz256_mul_mont,%function +.align 4 +__ecp_nistz256_mul_mont: + stmdb sp!,{r0-r2,lr} @ make a copy of arguments too + + ldr $bj,[$b_ptr,#0] @ b[0] + ldmia $a_ptr,{@acc[1]-@acc[8]} + + umull @acc[0],$t3,@acc[1],$bj @ r[0]=a[0]*b[0] + stmdb sp!,{$acc[1]-@acc[8]} @ copy a[0-7] to stack, so + @ that it can be addressed + @ without spending register + @ on address + umull @acc[1],$t0,@acc[2],$bj @ r[1]=a[1]*b[0] + umull @acc[2],$t1,@acc[3],$bj + adds @acc[1],@acc[1],$t3 @ accumulate high part of mult + umull @acc[3],$t2,@acc[4],$bj + adcs @acc[2],@acc[2],$t0 + umull @acc[4],$t3,@acc[5],$bj + adcs @acc[3],@acc[3],$t1 + umull @acc[5],$t0,@acc[6],$bj + adcs @acc[4],@acc[4],$t2 + umull @acc[6],$t1,@acc[7],$bj + adcs @acc[5],@acc[5],$t3 + umull @acc[7],$t2,@acc[8],$bj + adcs @acc[6],@acc[6],$t0 + adcs @acc[7],@acc[7],$t1 + eor $t3,$t3,$t3 @ first overflow bit is zero + adc @acc[8],$t2,#0 +___ +for(my $i=1;$i<8;$i++) { +my $t4=@acc[0]; + + # Reduction iteration is normally performed by accumulating + # result of multiplication of modulus by "magic" digit [and + # omitting least significant word, which is guaranteed to + # be 0], but thanks to special form of modulus and "magic" + # digit being equal to least significant word, it can be + # performed with additions and subtractions alone. Indeed: + # + # ffff.0001.0000.0000.0000.ffff.ffff.ffff + # * abcd + # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd + # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 + # - abcd.0000.0000.0000.0000.0000.0000.abcd + # + # or marking redundant operations: + # + # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- + # + abcd.0000.abcd.0000.0000.abcd.----.----.---- + # - abcd.----.----.----.----.----.----.---- + +$code.=<<___; + @ multiplication-less reduction $i + adds @acc[3],@acc[3],@acc[0] @ r[3]+=r[0] + ldr $bj,[sp,#40] @ restore b_ptr + adcs @acc[4],@acc[4],#0 @ r[4]+=0 + adcs @acc[5],@acc[5],#0 @ r[5]+=0 + adcs @acc[6],@acc[6],@acc[0] @ r[6]+=r[0] + ldr $t1,[sp,#0] @ load a[0] + adcs @acc[7],@acc[7],#0 @ r[7]+=0 + ldr $bj,[$bj,#4*$i] @ load b[i] + adcs @acc[8],@acc[8],@acc[0] @ r[8]+=r[0] + eor $t0,$t0,$t0 + adc $t3,$t3,#0 @ overflow bit + subs @acc[7],@acc[7],@acc[0] @ r[7]-=r[0] + ldr $t2,[sp,#4] @ a[1] + sbcs @acc[8],@acc[8],#0 @ r[8]-=0 + umlal @acc[1],$t0,$t1,$bj @ "r[0]"+=a[0]*b[i] + eor $t1,$t1,$t1 + sbc @acc[0],$t3,#0 @ overflow bit, keep in mind + @ that netto result is + @ addition of a value which + @ makes underflow impossible + + ldr $t3,[sp,#8] @ a[2] + umlal @acc[2],$t1,$t2,$bj @ "r[1]"+=a[1]*b[i] + str @acc[0],[sp,#36] @ temporarily offload overflow + eor $t2,$t2,$t2 + ldr $t4,[sp,#12] @ a[3], $t4 is alias @acc[0] + umlal @acc[3],$t2,$t3,$bj @ "r[2]"+=a[2]*b[i] + eor $t3,$t3,$t3 + adds @acc[2],@acc[2],$t0 @ accumulate high part of mult + ldr $t0,[sp,#16] @ a[4] + umlal @acc[4],$t3,$t4,$bj @ "r[3]"+=a[3]*b[i] + eor $t4,$t4,$t4 + adcs @acc[3],@acc[3],$t1 + ldr $t1,[sp,#20] @ a[5] + umlal @acc[5],$t4,$t0,$bj @ "r[4]"+=a[4]*b[i] + eor $t0,$t0,$t0 + adcs @acc[4],@acc[4],$t2 + ldr $t2,[sp,#24] @ a[6] + umlal @acc[6],$t0,$t1,$bj @ "r[5]"+=a[5]*b[i] + eor $t1,$t1,$t1 + adcs @acc[5],@acc[5],$t3 + ldr $t3,[sp,#28] @ a[7] + umlal @acc[7],$t1,$t2,$bj @ "r[6]"+=a[6]*b[i] + eor $t2,$t2,$t2 + adcs @acc[6],@acc[6],$t4 + ldr @acc[0],[sp,#36] @ restore overflow bit + umlal @acc[8],$t2,$t3,$bj @ "r[7]"+=a[7]*b[i] + eor $t3,$t3,$t3 + adcs @acc[7],@acc[7],$t0 + adcs @acc[8],@acc[8],$t1 + adcs @acc[0],$acc[0],$t2 + adc $t3,$t3,#0 @ new overflow bit +___ + push(@acc,shift(@acc)); # rotate registers, so that + # "r[i]" becomes r[i] +} +$code.=<<___; + @ last multiplication-less reduction + adds @acc[3],@acc[3],@acc[0] + ldr $r_ptr,[sp,#32] @ restore r_ptr + adcs @acc[4],@acc[4],#0 + adcs @acc[5],@acc[5],#0 + adcs @acc[6],@acc[6],@acc[0] + adcs @acc[7],@acc[7],#0 + adcs @acc[8],@acc[8],@acc[0] + adc $t3,$t3,#0 + subs @acc[7],@acc[7],@acc[0] + sbcs @acc[8],@acc[8],#0 + sbc @acc[0],$t3,#0 @ overflow bit + + @ Final step is "if result > mod, subtract mod", but we do it + @ "other way around", namely subtract modulus from result + @ and if it borrowed, add modulus back. + + adds @acc[1],@acc[1],#1 @ subs @acc[1],@acc[1],#-1 + adcs @acc[2],@acc[2],#0 @ sbcs @acc[2],@acc[2],#-1 + adcs @acc[3],@acc[3],#0 @ sbcs @acc[3],@acc[3],#-1 + sbcs @acc[4],@acc[4],#0 + sbcs @acc[5],@acc[5],#0 + sbcs @acc[6],@acc[6],#0 + sbcs @acc[7],@acc[7],#1 + adcs @acc[8],@acc[8],#0 @ sbcs @acc[8],@acc[8],#-1 + ldr lr,[sp,#44] @ restore lr + sbc @acc[0],@acc[0],#0 @ broadcast borrow bit + add sp,sp,#48 + + @ Note that because mod has special form, i.e. consists of + @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by + @ broadcasting borrow bit to a register, @acc[0], and using it as + @ a whole or extracting single bit. + + adds @acc[1],@acc[1],@acc[0] @ add modulus or zero + adcs @acc[2],@acc[2],@acc[0] + str @acc[1],[$r_ptr,#0] + adcs @acc[3],@acc[3],@acc[0] + str @acc[2],[$r_ptr,#4] + adcs @acc[4],@acc[4],#0 + str @acc[3],[$r_ptr,#8] + adcs @acc[5],@acc[5],#0 + str @acc[4],[$r_ptr,#12] + adcs @acc[6],@acc[6],#0 + str @acc[5],[$r_ptr,#16] + adcs @acc[7],@acc[7],@acc[0],lsr#31 + str @acc[6],[$r_ptr,#20] + adc @acc[8],@acc[8],@acc[0] + str @acc[7],[$r_ptr,#24] + str @acc[8],[$r_ptr,#28] + + mov pc,lr +.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont +___ +} + +{ +my ($out,$inp,$index,$mask)=map("r$_",(0..3)); +$code.=<<___; +@ void ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1, +@ int r2); +.globl ecp_nistz256_scatter_w5 +.type ecp_nistz256_scatter_w5,%function +.align 5 +ecp_nistz256_scatter_w5: + stmdb sp!,{r4-r11} + + add $out,$out,$index,lsl#2 + + ldmia $inp!,{r4-r11} @ X + str r4,[$out,#64*0-4] + str r5,[$out,#64*1-4] + str r6,[$out,#64*2-4] + str r7,[$out,#64*3-4] + str r8,[$out,#64*4-4] + str r9,[$out,#64*5-4] + str r10,[$out,#64*6-4] + str r11,[$out,#64*7-4] + add $out,$out,#64*8 + + ldmia $inp!,{r4-r11} @ Y + str r4,[$out,#64*0-4] + str r5,[$out,#64*1-4] + str r6,[$out,#64*2-4] + str r7,[$out,#64*3-4] + str r8,[$out,#64*4-4] + str r9,[$out,#64*5-4] + str r10,[$out,#64*6-4] + str r11,[$out,#64*7-4] + add $out,$out,#64*8 + + ldmia $inp,{r4-r11} @ Z + str r4,[$out,#64*0-4] + str r5,[$out,#64*1-4] + str r6,[$out,#64*2-4] + str r7,[$out,#64*3-4] + str r8,[$out,#64*4-4] + str r9,[$out,#64*5-4] + str r10,[$out,#64*6-4] + str r11,[$out,#64*7-4] + + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 || defined(__thumb__) + bx lr +#else + mov pc,lr +#endif +.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 + +@ void ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1, +@ int r2); +.globl ecp_nistz256_gather_w5 +.type ecp_nistz256_gather_w5,%function +.align 5 +ecp_nistz256_gather_w5: + stmdb sp!,{r4-r11} + + cmp $index,#0 + mov $mask,#0 +#ifdef __thumb2__ + itt ne +#endif + subne $index,$index,#1 + movne $mask,#-1 + add $inp,$inp,$index,lsl#2 + + ldr r4,[$inp,#64*0] + ldr r5,[$inp,#64*1] + ldr r6,[$inp,#64*2] + and r4,r4,$mask + ldr r7,[$inp,#64*3] + and r5,r5,$mask + ldr r8,[$inp,#64*4] + and r6,r6,$mask + ldr r9,[$inp,#64*5] + and r7,r7,$mask + ldr r10,[$inp,#64*6] + and r8,r8,$mask + ldr r11,[$inp,#64*7] + add $inp,$inp,#64*8 + and r9,r9,$mask + and r10,r10,$mask + and r11,r11,$mask + stmia $out!,{r4-r11} @ X + + ldr r4,[$inp,#64*0] + ldr r5,[$inp,#64*1] + ldr r6,[$inp,#64*2] + and r4,r4,$mask + ldr r7,[$inp,#64*3] + and r5,r5,$mask + ldr r8,[$inp,#64*4] + and r6,r6,$mask + ldr r9,[$inp,#64*5] + and r7,r7,$mask + ldr r10,[$inp,#64*6] + and r8,r8,$mask + ldr r11,[$inp,#64*7] + add $inp,$inp,#64*8 + and r9,r9,$mask + and r10,r10,$mask + and r11,r11,$mask + stmia $out!,{r4-r11} @ Y + + ldr r4,[$inp,#64*0] + ldr r5,[$inp,#64*1] + ldr r6,[$inp,#64*2] + and r4,r4,$mask + ldr r7,[$inp,#64*3] + and r5,r5,$mask + ldr r8,[$inp,#64*4] + and r6,r6,$mask + ldr r9,[$inp,#64*5] + and r7,r7,$mask + ldr r10,[$inp,#64*6] + and r8,r8,$mask + ldr r11,[$inp,#64*7] + and r9,r9,$mask + and r10,r10,$mask + and r11,r11,$mask + stmia $out,{r4-r11} @ Z + + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 || defined(__thumb__) + bx lr +#else + mov pc,lr +#endif +.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 + +@ void ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1, +@ int r2); +.globl ecp_nistz256_scatter_w7 +.type ecp_nistz256_scatter_w7,%function +.align 5 +ecp_nistz256_scatter_w7: + add $out,$out,$index + mov $index,#64/4 +.Loop_scatter_w7: + ldr $mask,[$inp],#4 + subs $index,$index,#1 + strb $mask,[$out,#64*0] + mov $mask,$mask,lsr#8 + strb $mask,[$out,#64*1] + mov $mask,$mask,lsr#8 + strb $mask,[$out,#64*2] + mov $mask,$mask,lsr#8 + strb $mask,[$out,#64*3] + add $out,$out,#64*4 + bne .Loop_scatter_w7 + +#if __ARM_ARCH__>=5 || defined(__thumb__) + bx lr +#else + mov pc,lr +#endif +.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 + +@ void ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1, +@ int r2); +.globl ecp_nistz256_gather_w7 +.type ecp_nistz256_gather_w7,%function +.align 5 +ecp_nistz256_gather_w7: + stmdb sp!,{r4-r7} + + cmp $index,#0 + mov $mask,#0 +#ifdef __thumb2__ + itt ne +#endif + subne $index,$index,#1 + movne $mask,#-1 + add $inp,$inp,$index + mov $index,#64/4 + nop +.Loop_gather_w7: + ldrb r4,[$inp,#64*0] + subs $index,$index,#1 + ldrb r5,[$inp,#64*1] + ldrb r6,[$inp,#64*2] + ldrb r7,[$inp,#64*3] + add $inp,$inp,#64*4 + orr r4,r4,r5,lsl#8 + orr r4,r4,r6,lsl#16 + orr r4,r4,r7,lsl#24 + and r4,r4,$mask + str r4,[$out],#4 + bne .Loop_gather_w7 + + ldmia sp!,{r4-r7} +#if __ARM_ARCH__>=5 || defined(__thumb__) + bx lr +#else + mov pc,lr +#endif +.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 +___ +} +if (0) { +# In comparison to integer-only equivalent of below subroutine: +# +# Cortex-A8 +10% +# Cortex-A9 -10% +# Snapdragon S4 +5% +# +# As not all time is spent in multiplication, overall impact is deemed +# too low to care about. + +my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7)); +my $mask="q4"; +my $mult="q5"; +my @AxB=map("q$_",(8..15)); + +my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3)); + +$code.=<<___; +#if __ARM_ARCH__>=7 +.fpu neon + +.globl ecp_nistz256_mul_mont_neon +.type ecp_nistz256_mul_mont_neon,%function +.align 5 +ecp_nistz256_mul_mont_neon: + mov ip,sp + stmdb sp!,{r4-r9} + vstmdb sp!,{q4-q5} @ ABI specification says so + + sub $toutptr,sp,#40 + vld1.32 {${Bi}[0]},[$bptr,:32]! + veor $zero,$zero,$zero + vld1.32 {$A0-$A3}, [$aptr] @ can't specify :32 :-( + vzip.16 $Bi,$zero + mov sp,$toutptr @ alloca + vmov.i64 $mask,#0xffff + + vmull.u32 @AxB[0],$Bi,${A0}[0] + vmull.u32 @AxB[1],$Bi,${A0}[1] + vmull.u32 @AxB[2],$Bi,${A1}[0] + vmull.u32 @AxB[3],$Bi,${A1}[1] + vshr.u64 $temp,@AxB[0]#lo,#16 + vmull.u32 @AxB[4],$Bi,${A2}[0] + vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp + vmull.u32 @AxB[5],$Bi,${A2}[1] + vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 32 bits of a[0]*b[0] + vmull.u32 @AxB[6],$Bi,${A3}[0] + vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0] + vmull.u32 @AxB[7],$Bi,${A3}[1] +___ +for($i=1;$i<8;$i++) { +$code.=<<___; + vld1.32 {${Bi}[0]},[$bptr,:32]! + veor $zero,$zero,$zero + vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ reduction + vshl.u64 $mult,@AxB[0],#32 + vadd.u64 @AxB[3],@AxB[3],@AxB[0] + vsub.u64 $mult,$mult,@AxB[0] + vzip.16 $Bi,$zero + vadd.u64 @AxB[6],@AxB[6],@AxB[0] + vadd.u64 @AxB[7],@AxB[7],$mult +___ + push(@AxB,shift(@AxB)); +$code.=<<___; + vmlal.u32 @AxB[0],$Bi,${A0}[0] + vmlal.u32 @AxB[1],$Bi,${A0}[1] + vmlal.u32 @AxB[2],$Bi,${A1}[0] + vmlal.u32 @AxB[3],$Bi,${A1}[1] + vshr.u64 $temp,@AxB[0]#lo,#16 + vmlal.u32 @AxB[4],$Bi,${A2}[0] + vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp + vmlal.u32 @AxB[5],$Bi,${A2}[1] + vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 33 bits of a[0]*b[i]+t[0] + vmlal.u32 @AxB[6],$Bi,${A3}[0] + vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0] + vmull.u32 @AxB[7],$Bi,${A3}[1] +___ +} +$code.=<<___; + vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ last reduction + vshl.u64 $mult,@AxB[0],#32 + vadd.u64 @AxB[3],@AxB[3],@AxB[0] + vsub.u64 $mult,$mult,@AxB[0] + vadd.u64 @AxB[6],@AxB[6],@AxB[0] + vadd.u64 @AxB[7],@AxB[7],$mult + + vshr.u64 $temp,@AxB[1]#lo,#16 @ convert + vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp + vshr.u64 $temp,@AxB[1]#hi,#16 + vzip.16 @AxB[1]#lo,@AxB[1]#hi +___ +foreach (2..7) { +$code.=<<___; + vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp + vst1.32 {@AxB[$_-1]#lo[0]},[$toutptr,:32]! + vshr.u64 $temp,@AxB[$_]#lo,#16 + vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp + vshr.u64 $temp,@AxB[$_]#hi,#16 + vzip.16 @AxB[$_]#lo,@AxB[$_]#hi +___ +} +$code.=<<___; + vst1.32 {@AxB[7]#lo[0]},[$toutptr,:32]! + vst1.32 {$temp},[$toutptr] @ upper 33 bits + + ldr r1,[sp,#0] + ldr r2,[sp,#4] + ldr r3,[sp,#8] + subs r1,r1,#-1 + ldr r4,[sp,#12] + sbcs r2,r2,#-1 + ldr r5,[sp,#16] + sbcs r3,r3,#-1 + ldr r6,[sp,#20] + sbcs r4,r4,#0 + ldr r7,[sp,#24] + sbcs r5,r5,#0 + ldr r8,[sp,#28] + sbcs r6,r6,#0 + ldr r9,[sp,#32] @ top-most bit + sbcs r7,r7,#1 + sub sp,ip,#40+16 + sbcs r8,r8,#-1 + sbc r9,r9,#0 + vldmia sp!,{q4-q5} + + adds r1,r1,r9 + adcs r2,r2,r9 + str r1,[$rptr,#0] + adcs r3,r3,r9 + str r2,[$rptr,#4] + adcs r4,r4,#0 + str r3,[$rptr,#8] + adcs r5,r5,#0 + str r4,[$rptr,#12] + adcs r6,r6,#0 + str r5,[$rptr,#16] + adcs r7,r7,r9,lsr#31 + str r6,[$rptr,#20] + adcs r8,r8,r9 + str r7,[$rptr,#24] + str r8,[$rptr,#28] + + ldmia sp!,{r4-r9} + bx lr +.size ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon +#endif +___ +} + +{{{ +######################################################################## +# Below $aN assignment matches order in which 256-bit result appears in +# register bank at return from __ecp_nistz256_mul_mont, so that we can +# skip over reloading it from memory. This means that below functions +# use custom calling sequence accepting 256-bit input in registers, +# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr. +# +# See their "normal" counterparts for insights on calculations. + +my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7, + $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1)); +my $ff=$b_ptr; + +$code.=<<___; +.type __ecp_nistz256_sub_from,%function +.align 5 +__ecp_nistz256_sub_from: + str lr,[sp,#-4]! @ push lr + + ldr $t0,[$b_ptr,#0] + ldr $t1,[$b_ptr,#4] + ldr $t2,[$b_ptr,#8] + ldr $t3,[$b_ptr,#12] + subs $a0,$a0,$t0 + ldr $t0,[$b_ptr,#16] + sbcs $a1,$a1,$t1 + ldr $t1,[$b_ptr,#20] + sbcs $a2,$a2,$t2 + ldr $t2,[$b_ptr,#24] + sbcs $a3,$a3,$t3 + ldr $t3,[$b_ptr,#28] + sbcs $a4,$a4,$t0 + sbcs $a5,$a5,$t1 + sbcs $a6,$a6,$t2 + sbcs $a7,$a7,$t3 + sbc $ff,$ff,$ff @ broadcast borrow bit + ldr lr,[sp],#4 @ pop lr + + adds $a0,$a0,$ff @ add synthesized modulus + adcs $a1,$a1,$ff + str $a0,[$r_ptr,#0] + adcs $a2,$a2,$ff + str $a1,[$r_ptr,#4] + adcs $a3,$a3,#0 + str $a2,[$r_ptr,#8] + adcs $a4,$a4,#0 + str $a3,[$r_ptr,#12] + adcs $a5,$a5,#0 + str $a4,[$r_ptr,#16] + adcs $a6,$a6,$ff,lsr#31 + str $a5,[$r_ptr,#20] + adcs $a7,$a7,$ff + str $a6,[$r_ptr,#24] + str $a7,[$r_ptr,#28] + + mov pc,lr +.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from + +.type __ecp_nistz256_sub_morf,%function +.align 5 +__ecp_nistz256_sub_morf: + str lr,[sp,#-4]! @ push lr + + ldr $t0,[$b_ptr,#0] + ldr $t1,[$b_ptr,#4] + ldr $t2,[$b_ptr,#8] + ldr $t3,[$b_ptr,#12] + subs $a0,$t0,$a0 + ldr $t0,[$b_ptr,#16] + sbcs $a1,$t1,$a1 + ldr $t1,[$b_ptr,#20] + sbcs $a2,$t2,$a2 + ldr $t2,[$b_ptr,#24] + sbcs $a3,$t3,$a3 + ldr $t3,[$b_ptr,#28] + sbcs $a4,$t0,$a4 + sbcs $a5,$t1,$a5 + sbcs $a6,$t2,$a6 + sbcs $a7,$t3,$a7 + sbc $ff,$ff,$ff @ broadcast borrow bit + ldr lr,[sp],#4 @ pop lr + + adds $a0,$a0,$ff @ add synthesized modulus + adcs $a1,$a1,$ff + str $a0,[$r_ptr,#0] + adcs $a2,$a2,$ff + str $a1,[$r_ptr,#4] + adcs $a3,$a3,#0 + str $a2,[$r_ptr,#8] + adcs $a4,$a4,#0 + str $a3,[$r_ptr,#12] + adcs $a5,$a5,#0 + str $a4,[$r_ptr,#16] + adcs $a6,$a6,$ff,lsr#31 + str $a5,[$r_ptr,#20] + adcs $a7,$a7,$ff + str $a6,[$r_ptr,#24] + str $a7,[$r_ptr,#28] + + mov pc,lr +.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf + +.type __ecp_nistz256_add_self,%function +.align 4 +__ecp_nistz256_add_self: + adds $a0,$a0,$a0 @ a[0:7]+=a[0:7] + adcs $a1,$a1,$a1 + adcs $a2,$a2,$a2 + adcs $a3,$a3,$a3 + adcs $a4,$a4,$a4 + adcs $a5,$a5,$a5 + adcs $a6,$a6,$a6 + mov $ff,#0 + adcs $a7,$a7,$a7 + adc $ff,$ff,#0 + + @ if a+b >= modulus, subtract modulus. + @ + @ But since comparison implies subtraction, we subtract + @ modulus and then add it back if subtraction borrowed. + + subs $a0,$a0,#-1 + sbcs $a1,$a1,#-1 + sbcs $a2,$a2,#-1 + sbcs $a3,$a3,#0 + sbcs $a4,$a4,#0 + sbcs $a5,$a5,#0 + sbcs $a6,$a6,#1 + sbcs $a7,$a7,#-1 + sbc $ff,$ff,#0 + + @ Note that because mod has special form, i.e. consists of + @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by + @ using value of borrow as a whole or extracting single bit. + @ Follow $ff register... + + adds $a0,$a0,$ff @ add synthesized modulus + adcs $a1,$a1,$ff + str $a0,[$r_ptr,#0] + adcs $a2,$a2,$ff + str $a1,[$r_ptr,#4] + adcs $a3,$a3,#0 + str $a2,[$r_ptr,#8] + adcs $a4,$a4,#0 + str $a3,[$r_ptr,#12] + adcs $a5,$a5,#0 + str $a4,[$r_ptr,#16] + adcs $a6,$a6,$ff,lsr#31 + str $a5,[$r_ptr,#20] + adcs $a7,$a7,$ff + str $a6,[$r_ptr,#24] + str $a7,[$r_ptr,#28] + + mov pc,lr +.size __ecp_nistz256_add_self,.-__ecp_nistz256_add_self + +___ + +######################################################################## +# following subroutines are "literal" implementation of those found in +# ecp_nistz256.c +# +######################################################################## +# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); +# +{ +my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); +# above map() describes stack layout with 5 temporary +# 256-bit vectors on top. Then note that we push +# starting from r0, which means that we have copy of +# input arguments just below these temporary vectors. + +$code.=<<___; +.globl ecp_nistz256_point_double +.type ecp_nistz256_point_double,%function +.align 5 +ecp_nistz256_point_double: + stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional + sub sp,sp,#32*5 + +.Lpoint_double_shortcut: + add r3,sp,#$in_x + ldmia $a_ptr!,{r4-r11} @ copy in_x + stmia r3,{r4-r11} + + add $r_ptr,sp,#$S + bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y); + + add $b_ptr,$a_ptr,#32 + add $a_ptr,$a_ptr,#32 + add $r_ptr,sp,#$Zsqr + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z); + + add $a_ptr,sp,#$S + add $b_ptr,sp,#$S + add $r_ptr,sp,#$S + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S); + + ldr $b_ptr,[sp,#32*5+4] + add $a_ptr,$b_ptr,#32 + add $b_ptr,$b_ptr,#64 + add $r_ptr,sp,#$tmp0 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y); + + ldr $r_ptr,[sp,#32*5] + add $r_ptr,$r_ptr,#64 + bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0); + + add $a_ptr,sp,#$in_x + add $b_ptr,sp,#$Zsqr + add $r_ptr,sp,#$M + bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr); + + add $a_ptr,sp,#$in_x + add $b_ptr,sp,#$Zsqr + add $r_ptr,sp,#$Zsqr + bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr); + + add $a_ptr,sp,#$S + add $b_ptr,sp,#$S + add $r_ptr,sp,#$tmp0 + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S); + + add $a_ptr,sp,#$Zsqr + add $b_ptr,sp,#$M + add $r_ptr,sp,#$M + bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr); + + ldr $r_ptr,[sp,#32*5] + add $a_ptr,sp,#$tmp0 + add $r_ptr,$r_ptr,#32 + bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0); + + add $a_ptr,sp,#$M + add $r_ptr,sp,#$M + bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M); + + add $a_ptr,sp,#$in_x + add $b_ptr,sp,#$S + add $r_ptr,sp,#$S + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x); + + add $r_ptr,sp,#$tmp0 + bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S); + + ldr $r_ptr,[sp,#32*5] + add $a_ptr,sp,#$M + add $b_ptr,sp,#$M + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M); + + add $b_ptr,sp,#$tmp0 + bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0); + + add $b_ptr,sp,#$S + add $r_ptr,sp,#$S + bl __ecp_nistz256_sub_morf @ p256_sub(S, S, res_x); + + add $a_ptr,sp,#$M + add $b_ptr,sp,#$S + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M); + + ldr $r_ptr,[sp,#32*5] + add $b_ptr,$r_ptr,#32 + add $r_ptr,$r_ptr,#32 + bl __ecp_nistz256_sub_from @ p256_sub(res_y, S, res_y); + + add sp,sp,#32*5+16 @ +16 means "skip even over saved r0-r3" +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +___ +} + +######################################################################## +# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT *in2); +{ +my ($res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y,$in2_z, + $H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); +my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); +# above map() describes stack layout with 18 temporary +# 256-bit vectors on top. Then note that we push +# starting from r0, which means that we have copy of +# input arguments just below these temporary vectors. +# We use three of them for ~in1infty, ~in2infty and +# result of check for zero. + +$code.=<<___; +.globl ecp_nistz256_point_add +.type ecp_nistz256_point_add,%function +.align 5 +ecp_nistz256_point_add: + stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional + sub sp,sp,#32*18+16 + + ldmia $b_ptr!,{r4-r11} @ copy in2_x + add r3,sp,#$in2_x + stmia r3!,{r4-r11} + ldmia $b_ptr!,{r4-r11} @ copy in2_y + stmia r3!,{r4-r11} + ldmia $b_ptr,{r4-r11} @ copy in2_z + orr r12,r4,r5 + orr r12,r12,r6 + orr r12,r12,r7 + orr r12,r12,r8 + orr r12,r12,r9 + orr r12,r12,r10 + orr r12,r12,r11 + cmp r12,#0 +#ifdef __thumb2__ + it ne +#endif + movne r12,#-1 + stmia r3,{r4-r11} + str r12,[sp,#32*18+8] @ ~in2infty + + ldmia $a_ptr!,{r4-r11} @ copy in1_x + add r3,sp,#$in1_x + stmia r3!,{r4-r11} + ldmia $a_ptr!,{r4-r11} @ copy in1_y + stmia r3!,{r4-r11} + ldmia $a_ptr,{r4-r11} @ copy in1_z + orr r12,r4,r5 + orr r12,r12,r6 + orr r12,r12,r7 + orr r12,r12,r8 + orr r12,r12,r9 + orr r12,r12,r10 + orr r12,r12,r11 + cmp r12,#0 +#ifdef __thumb2__ + it ne +#endif + movne r12,#-1 + stmia r3,{r4-r11} + str r12,[sp,#32*18+4] @ ~in1infty + + add $a_ptr,sp,#$in2_z + add $b_ptr,sp,#$in2_z + add $r_ptr,sp,#$Z2sqr + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z2sqr, in2_z); + + add $a_ptr,sp,#$in1_z + add $b_ptr,sp,#$in1_z + add $r_ptr,sp,#$Z1sqr + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z); + + add $a_ptr,sp,#$in2_z + add $b_ptr,sp,#$Z2sqr + add $r_ptr,sp,#$S1 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, Z2sqr, in2_z); + + add $a_ptr,sp,#$in1_z + add $b_ptr,sp,#$Z1sqr + add $r_ptr,sp,#$S2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); + + add $a_ptr,sp,#$in1_y + add $b_ptr,sp,#$S1 + add $r_ptr,sp,#$S1 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, S1, in1_y); + + add $a_ptr,sp,#$in2_y + add $b_ptr,sp,#$S2 + add $r_ptr,sp,#$S2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); + + add $b_ptr,sp,#$S1 + add $r_ptr,sp,#$R + bl __ecp_nistz256_sub_from @ p256_sub(R, S2, S1); + + orr $a0,$a0,$a1 @ see if result is zero + orr $a2,$a2,$a3 + orr $a4,$a4,$a5 + orr $a0,$a0,$a2 + orr $a4,$a4,$a6 + orr $a0,$a0,$a7 + add $a_ptr,sp,#$in1_x + orr $a0,$a0,$a4 + add $b_ptr,sp,#$Z2sqr + str $a0,[sp,#32*18+12] + + add $r_ptr,sp,#$U1 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(U1, in1_x, Z2sqr); + + add $a_ptr,sp,#$in2_x + add $b_ptr,sp,#$Z1sqr + add $r_ptr,sp,#$U2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in2_x, Z1sqr); + + add $b_ptr,sp,#$U1 + add $r_ptr,sp,#$H + bl __ecp_nistz256_sub_from @ p256_sub(H, U2, U1); + + orr $a0,$a0,$a1 @ see if result is zero + orr $a2,$a2,$a3 + orr $a4,$a4,$a5 + orr $a0,$a0,$a2 + orr $a4,$a4,$a6 + orr $a0,$a0,$a7 + orr $a0,$a0,$a4 @ ~is_equal(U1,U2) + + ldr $t0,[sp,#32*18+4] @ ~in1infty + ldr $t1,[sp,#32*18+8] @ ~in2infty + ldr $t2,[sp,#32*18+12] @ ~is_equal(S1,S2) + mvn $t0,$t0 @ -1/0 -> 0/-1 + mvn $t1,$t1 @ -1/0 -> 0/-1 + orr $a0,$t0 + orr $a0,$t1 + orrs $a0,$t2 @ set flags + + @ if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + bne .Ladd_proceed + +.Ladd_double: + ldr $a_ptr,[sp,#32*18+20] + add sp,sp,#32*(18-5)+16 @ difference in frame sizes + b .Lpoint_double_shortcut + +.align 4 +.Ladd_proceed: + add $a_ptr,sp,#$R + add $b_ptr,sp,#$R + add $r_ptr,sp,#$Rsqr + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); + + add $a_ptr,sp,#$H + add $b_ptr,sp,#$in1_z + add $r_ptr,sp,#$res_z + bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); + + add $a_ptr,sp,#$H + add $b_ptr,sp,#$H + add $r_ptr,sp,#$Hsqr + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); + + add $a_ptr,sp,#$in2_z + add $b_ptr,sp,#$res_z + add $r_ptr,sp,#$res_z + bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, res_z, in2_z); + + add $a_ptr,sp,#$H + add $b_ptr,sp,#$Hsqr + add $r_ptr,sp,#$Hcub + bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); + + add $a_ptr,sp,#$Hsqr + add $b_ptr,sp,#$U1 + add $r_ptr,sp,#$U2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, U1, Hsqr); + + add $r_ptr,sp,#$Hsqr + bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2); + + add $b_ptr,sp,#$Rsqr + add $r_ptr,sp,#$res_x + bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr); + + add $b_ptr,sp,#$Hcub + bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub); + + add $b_ptr,sp,#$U2 + add $r_ptr,sp,#$res_y + bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x); + + add $a_ptr,sp,#$Hcub + add $b_ptr,sp,#$S1 + add $r_ptr,sp,#$S2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S1, Hcub); + + add $a_ptr,sp,#$R + add $b_ptr,sp,#$res_y + add $r_ptr,sp,#$res_y + bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); + + add $b_ptr,sp,#$S2 + bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2); + + ldr r11,[sp,#32*18+4] @ ~in1infty + ldr r12,[sp,#32*18+8] @ ~in2infty + add r1,sp,#$res_x + add r2,sp,#$in2_x + and r10,r11,r12 @ ~in1infty & ~in2infty + mvn r11,r11 + add r3,sp,#$in1_x + and r11,r11,r12 @ in1infty & ~in2infty + mvn r12,r12 @ in2infty + ldr $r_ptr,[sp,#32*18+16] +___ +for($i=0;$i<96;$i+=8) { # conditional moves +$code.=<<___; + ldmia r1!,{r4-r5} @ res_x + ldmia r2!,{r6-r7} @ in2_x + ldmia r3!,{r8-r9} @ in1_x + and r4,r4,r10 @ ~in1infty & ~in2infty + and r5,r5,r10 + and r6,r6,r11 @ in1infty & ~in2infty + and r7,r7,r11 + and r8,r8,r12 @ in2infty + and r9,r9,r12 + orr r4,r4,r6 + orr r5,r5,r7 + orr r4,r4,r8 + orr r5,r5,r9 + stmia $r_ptr!,{r4-r5} +___ +} +$code.=<<___; +.Ladd_done: + add sp,sp,#32*18+16+16 @ +16 means "skip even over saved r0-r3" +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +___ +} + +######################################################################## +# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT_AFFINE *in2); +{ +my ($res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y, + $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); +my $Z1sqr = $S2; +# above map() describes stack layout with 18 temporary +# 256-bit vectors on top. Then note that we push +# starting from r0, which means that we have copy of +# input arguments just below these temporary vectors. +# We use two of them for ~in1infty, ~in2infty. + +my @ONE_mont=(1,0,0,-1,-1,-1,-2,0); + +$code.=<<___; +.globl ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,%function +.align 5 +ecp_nistz256_point_add_affine: + stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional + sub sp,sp,#32*15 + + ldmia $a_ptr!,{r4-r11} @ copy in1_x + add r3,sp,#$in1_x + stmia r3!,{r4-r11} + ldmia $a_ptr!,{r4-r11} @ copy in1_y + stmia r3!,{r4-r11} + ldmia $a_ptr,{r4-r11} @ copy in1_z + orr r12,r4,r5 + orr r12,r12,r6 + orr r12,r12,r7 + orr r12,r12,r8 + orr r12,r12,r9 + orr r12,r12,r10 + orr r12,r12,r11 + cmp r12,#0 +#ifdef __thumb2__ + it ne +#endif + movne r12,#-1 + stmia r3,{r4-r11} + str r12,[sp,#32*15+4] @ ~in1infty + + ldmia $b_ptr!,{r4-r11} @ copy in2_x + add r3,sp,#$in2_x + orr r12,r4,r5 + orr r12,r12,r6 + orr r12,r12,r7 + orr r12,r12,r8 + orr r12,r12,r9 + orr r12,r12,r10 + orr r12,r12,r11 + stmia r3!,{r4-r11} + ldmia $b_ptr!,{r4-r11} @ copy in2_y + orr r12,r12,r4 + orr r12,r12,r5 + orr r12,r12,r6 + orr r12,r12,r7 + orr r12,r12,r8 + orr r12,r12,r9 + orr r12,r12,r10 + orr r12,r12,r11 + stmia r3!,{r4-r11} + cmp r12,#0 +#ifdef __thumb2__ + it ne +#endif + movne r12,#-1 + str r12,[sp,#32*15+8] @ ~in2infty + + add $a_ptr,sp,#$in1_z + add $b_ptr,sp,#$in1_z + add $r_ptr,sp,#$Z1sqr + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z); + + add $a_ptr,sp,#$Z1sqr + add $b_ptr,sp,#$in2_x + add $r_ptr,sp,#$U2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, Z1sqr, in2_x); + + add $b_ptr,sp,#$in1_x + add $r_ptr,sp,#$H + bl __ecp_nistz256_sub_from @ p256_sub(H, U2, in1_x); + + add $a_ptr,sp,#$Z1sqr + add $b_ptr,sp,#$in1_z + add $r_ptr,sp,#$S2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); + + add $a_ptr,sp,#$H + add $b_ptr,sp,#$in1_z + add $r_ptr,sp,#$res_z + bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); + + add $a_ptr,sp,#$in2_y + add $b_ptr,sp,#$S2 + add $r_ptr,sp,#$S2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); + + add $b_ptr,sp,#$in1_y + add $r_ptr,sp,#$R + bl __ecp_nistz256_sub_from @ p256_sub(R, S2, in1_y); + + add $a_ptr,sp,#$H + add $b_ptr,sp,#$H + add $r_ptr,sp,#$Hsqr + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); + + add $a_ptr,sp,#$R + add $b_ptr,sp,#$R + add $r_ptr,sp,#$Rsqr + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); + + add $a_ptr,sp,#$H + add $b_ptr,sp,#$Hsqr + add $r_ptr,sp,#$Hcub + bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); + + add $a_ptr,sp,#$Hsqr + add $b_ptr,sp,#$in1_x + add $r_ptr,sp,#$U2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in1_x, Hsqr); + + add $r_ptr,sp,#$Hsqr + bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2); + + add $b_ptr,sp,#$Rsqr + add $r_ptr,sp,#$res_x + bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr); + + add $b_ptr,sp,#$Hcub + bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub); + + add $b_ptr,sp,#$U2 + add $r_ptr,sp,#$res_y + bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x); + + add $a_ptr,sp,#$Hcub + add $b_ptr,sp,#$in1_y + add $r_ptr,sp,#$S2 + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, in1_y, Hcub); + + add $a_ptr,sp,#$R + add $b_ptr,sp,#$res_y + add $r_ptr,sp,#$res_y + bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); + + add $b_ptr,sp,#$S2 + bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2); + + ldr r11,[sp,#32*15+4] @ ~in1infty + ldr r12,[sp,#32*15+8] @ ~in2infty + add r1,sp,#$res_x + add r2,sp,#$in2_x + and r10,r11,r12 @ ~in1infty & ~in2infty + mvn r11,r11 + add r3,sp,#$in1_x + and r11,r11,r12 @ in1infty & ~in2infty + mvn r12,r12 @ in2infty + ldr $r_ptr,[sp,#32*15] +___ +for($i=0;$i<64;$i+=8) { # conditional moves +$code.=<<___; + ldmia r1!,{r4-r5} @ res_x + ldmia r2!,{r6-r7} @ in2_x + ldmia r3!,{r8-r9} @ in1_x + and r4,r4,r10 @ ~in1infty & ~in2infty + and r5,r5,r10 + and r6,r6,r11 @ in1infty & ~in2infty + and r7,r7,r11 + and r8,r8,r12 @ in2infty + and r9,r9,r12 + orr r4,r4,r6 + orr r5,r5,r7 + orr r4,r4,r8 + orr r5,r5,r9 + stmia $r_ptr!,{r4-r5} +___ +} +for(;$i<96;$i+=8) { +my $j=($i-64)/4; +$code.=<<___; + ldmia r1!,{r4-r5} @ res_z + ldmia r3!,{r8-r9} @ in1_z + and r4,r4,r10 + and r5,r5,r10 + and r6,r11,#@ONE_mont[$j] + and r7,r11,#@ONE_mont[$j+1] + and r8,r8,r12 + and r9,r9,r12 + orr r4,r4,r6 + orr r5,r5,r7 + orr r4,r4,r8 + orr r5,r5,r9 + stmia $r_ptr!,{r4-r5} +___ +} +$code.=<<___; + add sp,sp,#32*15+16 @ +16 means "skip even over saved r0-r3" +#if __ARM_ARCH__>=5 || !defined(__thumb__) + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + bx lr @ interoperable with Thumb ISA:-) +#endif +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +___ +} }}} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-armv8.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-armv8.pl new file mode 100644 index 000000000..e93e18f29 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-armv8.pl @@ -0,0 +1,1874 @@ +#! /usr/bin/env perl +# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# ECP_NISTZ256 module for ARMv8. +# +# February 2015. +# +# Original ECP_NISTZ256 submission targeting x86_64 is detailed in +# http://eprint.iacr.org/2013/816. +# +# with/without -DECP_NISTZ256_ASM +# Apple A7 +190-360% +# Cortex-A53 +190-400% +# Cortex-A57 +190-350% +# Denver +230-400% +# +# Ranges denote minimum and maximum improvement coefficients depending +# on benchmark. Lower coefficients are for ECDSA sign, server-side +# operation. Keep in mind that +400% means 5x improvement. + +$flavour = shift; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +{ +my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3, + $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) = + map("x$_",(0..17,19,20)); + +my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont + +$code.=<<___; +#include "arm_arch.h" + +.text +___ +######################################################################## +# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 +# +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +open TABLE,"<ecp_nistz256_table.c" or +open TABLE,"<${dir}../ecp_nistz256_table.c" or +die "failed to open ecp_nistz256_table.c:",$!; + +use integer; + +foreach(<TABLE>) { + s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; +} +close TABLE; + +# See ecp_nistz256_table.c for explanation for why it's 64*16*37. +# 64*16*37-1 is because $#arr returns last valid index or @arr, not +# amount of elements. +die "insane number of elements" if ($#arr != 64*16*37-1); + +$code.=<<___; +.globl ecp_nistz256_precomputed +.type ecp_nistz256_precomputed,%object +.align 12 +ecp_nistz256_precomputed: +___ +######################################################################## +# this conversion smashes P256_POINT_AFFINE by individual bytes with +# 64 byte interval, similar to +# 1111222233334444 +# 1234123412341234 +for(1..37) { + @tbl = splice(@arr,0,64*16); + for($i=0;$i<64;$i++) { + undef @line; + for($j=0;$j<64;$j++) { + push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; + } + $code.=".byte\t"; + $code.=join(',',map { sprintf "0x%02x",$_} @line); + $code.="\n"; + } +} +$code.=<<___; +.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed +.align 5 +.Lpoly: +.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 +.LRR: // 2^512 mod P precomputed for NIST P256 polynomial +.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd +.Lone_mont: +.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +.Lone: +.quad 1,0,0,0 +.Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f +.asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" + +// void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_to_mont +.type ecp_nistz256_to_mont,%function +.align 6 +ecp_nistz256_to_mont: + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr $bi,.LRR // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + adr $bp,.LRR // &bp[0] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + .inst 0xd50323bf // autiasp + ret +.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont + +// void ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_from_mont +.type ecp_nistz256_from_mont,%function +.align 4 +ecp_nistz256_from_mont: + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + mov $bi,#1 // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + adr $bp,.Lone // &bp[0] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + .inst 0xd50323bf // autiasp + ret +.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont + +// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_mul_mont +.type ecp_nistz256_mul_mont,%function +.align 4 +ecp_nistz256_mul_mont: + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr $bi,[$bp] // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + .inst 0xd50323bf // autiasp + ret +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_sqr_mont +.type ecp_nistz256_sqr_mont,%function +.align 4 +ecp_nistz256_sqr_mont: + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_sqr_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + .inst 0xd50323bf // autiasp + ret +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +// void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_add +.type ecp_nistz256_add,%function +.align 4 +ecp_nistz256_add: + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp $acc0,$acc1,[$ap] + ldp $t0,$t1,[$bp] + ldp $acc2,$acc3,[$ap,#16] + ldp $t2,$t3,[$bp,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_add + + ldp x29,x30,[sp],#16 + .inst 0xd50323bf // autiasp + ret +.size ecp_nistz256_add,.-ecp_nistz256_add + +// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_div_by_2 +.type ecp_nistz256_div_by_2,%function +.align 4 +ecp_nistz256_div_by_2: + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp $acc0,$acc1,[$ap] + ldp $acc2,$acc3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_div_by_2 + + ldp x29,x30,[sp],#16 + .inst 0xd50323bf // autiasp + ret +.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 + +// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_2 +.type ecp_nistz256_mul_by_2,%function +.align 4 +ecp_nistz256_mul_by_2: + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp $acc0,$acc1,[$ap] + ldp $acc2,$acc3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + mov $t0,$acc0 + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + + bl __ecp_nistz256_add // ret = a+a // 2*a + + ldp x29,x30,[sp],#16 + .inst 0xd50323bf // autiasp + ret +.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 + +// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_3 +.type ecp_nistz256_mul_by_3,%function +.align 4 +ecp_nistz256_mul_by_3: + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp $acc0,$acc1,[$ap] + ldp $acc2,$acc3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + mov $t0,$acc0 + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + mov $a0,$acc0 + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + + bl __ecp_nistz256_add // ret = a+a // 2*a + + mov $t0,$a0 + mov $t1,$a1 + mov $t2,$a2 + mov $t3,$a3 + + bl __ecp_nistz256_add // ret += a // 2*a+a=3*a + + ldp x29,x30,[sp],#16 + .inst 0xd50323bf // autiasp + ret +.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 + +// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_sub +.type ecp_nistz256_sub,%function +.align 4 +ecp_nistz256_sub: + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp $acc0,$acc1,[$ap] + ldp $acc2,$acc3,[$ap,#16] + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + .inst 0xd50323bf // autiasp + ret +.size ecp_nistz256_sub,.-ecp_nistz256_sub + +// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_neg +.type ecp_nistz256_neg,%function +.align 4 +ecp_nistz256_neg: + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov $bp,$ap + mov $acc0,xzr // a = 0 + mov $acc1,xzr + mov $acc2,xzr + mov $acc3,xzr + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + .inst 0xd50323bf // autiasp + ret +.size ecp_nistz256_neg,.-ecp_nistz256_neg + +// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +// to $a0-$a3 and b[0] - to $bi +.type __ecp_nistz256_mul_mont,%function +.align 4 +__ecp_nistz256_mul_mont: + mul $acc0,$a0,$bi // a[0]*b[0] + umulh $t0,$a0,$bi + + mul $acc1,$a1,$bi // a[1]*b[0] + umulh $t1,$a1,$bi + + mul $acc2,$a2,$bi // a[2]*b[0] + umulh $t2,$a2,$bi + + mul $acc3,$a3,$bi // a[3]*b[0] + umulh $t3,$a3,$bi + ldr $bi,[$bp,#8] // b[1] + + adds $acc1,$acc1,$t0 // accumulate high parts of multiplication + lsl $t0,$acc0,#32 + adcs $acc2,$acc2,$t1 + lsr $t1,$acc0,#32 + adcs $acc3,$acc3,$t2 + adc $acc4,xzr,$t3 + mov $acc5,xzr +___ +for($i=1;$i<4;$i++) { + # Reduction iteration is normally performed by accumulating + # result of multiplication of modulus by "magic" digit [and + # omitting least significant word, which is guaranteed to + # be 0], but thanks to special form of modulus and "magic" + # digit being equal to least significant word, it can be + # performed with additions and subtractions alone. Indeed: + # + # ffff0001.00000000.0000ffff.ffffffff + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 + # - 0000abcd.efgh0000.00000000.00000000.abcdefgh + # + # or marking redundant operations: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- + # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- + # - 0000abcd.efgh0000.--------.--------.-------- + +$code.=<<___; + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + mul $t0,$a0,$bi // lo(a[0]*b[i]) + adcs $acc1,$acc2,$t1 + mul $t1,$a1,$bi // lo(a[1]*b[i]) + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + mul $t2,$a2,$bi // lo(a[2]*b[i]) + adcs $acc3,$acc4,$t3 + mul $t3,$a3,$bi // lo(a[3]*b[i]) + adc $acc4,$acc5,xzr + + adds $acc0,$acc0,$t0 // accumulate low parts of multiplication + umulh $t0,$a0,$bi // hi(a[0]*b[i]) + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi // hi(a[1]*b[i]) + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi // hi(a[2]*b[i]) + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi // hi(a[3]*b[i]) + adc $acc4,$acc4,xzr +___ +$code.=<<___ if ($i<3); + ldr $bi,[$bp,#8*($i+1)] // b[$i+1] +___ +$code.=<<___; + adds $acc1,$acc1,$t0 // accumulate high parts of multiplication + lsl $t0,$acc0,#32 + adcs $acc2,$acc2,$t1 + lsr $t1,$acc0,#32 + adcs $acc3,$acc3,$t2 + adcs $acc4,$acc4,$t3 + adc $acc5,xzr,xzr +___ +} +$code.=<<___; + // last reduction + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + adcs $acc3,$acc4,$t3 + adc $acc4,$acc5,xzr + + adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus + sbcs $t1,$acc1,$poly1 + sbcs $t2,$acc2,xzr + sbcs $t3,$acc3,$poly3 + sbcs xzr,$acc4,xzr // did it borrow? + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont + +// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +// to $a0-$a3 +.type __ecp_nistz256_sqr_mont,%function +.align 4 +__ecp_nistz256_sqr_mont: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul $acc1,$a1,$a0 // a[1]*a[0] + umulh $t1,$a1,$a0 + mul $acc2,$a2,$a0 // a[2]*a[0] + umulh $t2,$a2,$a0 + mul $acc3,$a3,$a0 // a[3]*a[0] + umulh $acc4,$a3,$a0 + + adds $acc2,$acc2,$t1 // accumulate high parts of multiplication + mul $t0,$a2,$a1 // a[2]*a[1] + umulh $t1,$a2,$a1 + adcs $acc3,$acc3,$t2 + mul $t2,$a3,$a1 // a[3]*a[1] + umulh $t3,$a3,$a1 + adc $acc4,$acc4,xzr // can't overflow + + mul $acc5,$a3,$a2 // a[3]*a[2] + umulh $acc6,$a3,$a2 + + adds $t1,$t1,$t2 // accumulate high parts of multiplication + mul $acc0,$a0,$a0 // a[0]*a[0] + adc $t2,$t3,xzr // can't overflow + + adds $acc3,$acc3,$t0 // accumulate low parts of multiplication + umulh $a0,$a0,$a0 + adcs $acc4,$acc4,$t1 + mul $t1,$a1,$a1 // a[1]*a[1] + adcs $acc5,$acc5,$t2 + umulh $a1,$a1,$a1 + adc $acc6,$acc6,xzr // can't overflow + + adds $acc1,$acc1,$acc1 // acc[1-6]*=2 + mul $t2,$a2,$a2 // a[2]*a[2] + adcs $acc2,$acc2,$acc2 + umulh $a2,$a2,$a2 + adcs $acc3,$acc3,$acc3 + mul $t3,$a3,$a3 // a[3]*a[3] + adcs $acc4,$acc4,$acc4 + umulh $a3,$a3,$a3 + adcs $acc5,$acc5,$acc5 + adcs $acc6,$acc6,$acc6 + adc $acc7,xzr,xzr + + adds $acc1,$acc1,$a0 // +a[i]*a[i] + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$a1 + adcs $acc4,$acc4,$t2 + adcs $acc5,$acc5,$a2 + lsl $t0,$acc0,#32 + adcs $acc6,$acc6,$t3 + lsr $t1,$acc0,#32 + adc $acc7,$acc7,$a3 +___ +for($i=0;$i<3;$i++) { # reductions, see commentary in + # multiplication for details +$code.=<<___; + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + adcs $acc1,$acc2,$t1 + lsl $t0,$acc0,#32 + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + lsr $t1,$acc0,#32 + adc $acc3,$t3,xzr // can't overflow +___ +} +$code.=<<___; + subs $t2,$acc0,$t0 // "*0xffff0001" + sbc $t3,$acc0,$t1 + adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 + adc $acc3,$t3,xzr // can't overflow + + adds $acc0,$acc0,$acc4 // accumulate upper half + adcs $acc1,$acc1,$acc5 + adcs $acc2,$acc2,$acc6 + adcs $acc3,$acc3,$acc7 + adc $acc4,xzr,xzr + + adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus + sbcs $t1,$acc1,$poly1 + sbcs $t2,$acc2,xzr + sbcs $t3,$acc3,$poly3 + sbcs xzr,$acc4,xzr // did it borrow? + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont + +// Note that __ecp_nistz256_add expects both input vectors pre-loaded to +// $a0-$a3 and $t0-$t3. This is done because it's used in multiple +// contexts, e.g. in multiplication by 2 and 3... +.type __ecp_nistz256_add,%function +.align 4 +__ecp_nistz256_add: + adds $acc0,$acc0,$t0 // ret = a+b + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + adc $ap,xzr,xzr // zap $ap + + adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus + sbcs $t1,$acc1,$poly1 + sbcs $t2,$acc2,xzr + sbcs $t3,$acc3,$poly3 + sbcs xzr,$ap,xzr // did subtraction borrow? + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_add,.-__ecp_nistz256_add + +.type __ecp_nistz256_sub_from,%function +.align 4 +__ecp_nistz256_sub_from: + ldp $t0,$t1,[$bp] + ldp $t2,$t3,[$bp,#16] + subs $acc0,$acc0,$t0 // ret = a-b + sbcs $acc1,$acc1,$t1 + sbcs $acc2,$acc2,$t2 + sbcs $acc3,$acc3,$t3 + sbc $ap,xzr,xzr // zap $ap + + subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus + adcs $t1,$acc1,$poly1 + adcs $t2,$acc2,xzr + adc $t3,$acc3,$poly3 + cmp $ap,xzr // did subtraction borrow? + + csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret + csel $acc1,$acc1,$t1,eq + csel $acc2,$acc2,$t2,eq + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,eq + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from + +.type __ecp_nistz256_sub_morf,%function +.align 4 +__ecp_nistz256_sub_morf: + ldp $t0,$t1,[$bp] + ldp $t2,$t3,[$bp,#16] + subs $acc0,$t0,$acc0 // ret = b-a + sbcs $acc1,$t1,$acc1 + sbcs $acc2,$t2,$acc2 + sbcs $acc3,$t3,$acc3 + sbc $ap,xzr,xzr // zap $ap + + subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus + adcs $t1,$acc1,$poly1 + adcs $t2,$acc2,xzr + adc $t3,$acc3,$poly3 + cmp $ap,xzr // did subtraction borrow? + + csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret + csel $acc1,$acc1,$t1,eq + csel $acc2,$acc2,$t2,eq + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,eq + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf + +.type __ecp_nistz256_div_by_2,%function +.align 4 +__ecp_nistz256_div_by_2: + subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus + adcs $t1,$acc1,$poly1 + adcs $t2,$acc2,xzr + adcs $t3,$acc3,$poly3 + adc $ap,xzr,xzr // zap $ap + tst $acc0,#1 // is a even? + + csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus + csel $acc1,$acc1,$t1,eq + csel $acc2,$acc2,$t2,eq + csel $acc3,$acc3,$t3,eq + csel $ap,xzr,$ap,eq + + lsr $acc0,$acc0,#1 // ret >>= 1 + orr $acc0,$acc0,$acc1,lsl#63 + lsr $acc1,$acc1,#1 + orr $acc1,$acc1,$acc2,lsl#63 + lsr $acc2,$acc2,#1 + orr $acc2,$acc2,$acc3,lsl#63 + lsr $acc3,$acc3,#1 + stp $acc0,$acc1,[$rp] + orr $acc3,$acc3,$ap,lsl#63 + stp $acc2,$acc3,[$rp,#16] + + ret +.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 +___ +######################################################################## +# following subroutines are "literal" implementation of those found in +# ecp_nistz256.c +# +######################################################################## +# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); +# +{ +my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3)); +# above map() describes stack layout with 4 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real) = map("x$_",(21,22)); + +$code.=<<___; +.globl ecp_nistz256_point_double +.type ecp_nistz256_point_double,%function +.align 5 +ecp_nistz256_point_double: + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 + +.Ldouble_shortcut: + ldp $acc0,$acc1,[$ap,#32] + mov $rp_real,$rp + ldp $acc2,$acc3,[$ap,#48] + mov $ap_real,$ap + ldr $poly1,.Lpoly+8 + mov $t0,$acc0 + ldr $poly3,.Lpoly+24 + mov $t1,$acc1 + ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont + mov $t2,$acc2 + mov $t3,$acc3 + ldp $a2,$a3,[$ap_real,#64+16] + add $rp,sp,#$S + bl __ecp_nistz256_add // p256_mul_by_2(S, in_y); + + add $rp,sp,#$Zsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); + + ldp $t0,$t1,[$ap_real] + ldp $t2,$t3,[$ap_real,#16] + mov $a0,$acc0 // put Zsqr aside for p256_sub + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + add $rp,sp,#$M + bl __ecp_nistz256_add // p256_add(M, Zsqr, in_x); + + add $bp,$ap_real,#0 + mov $acc0,$a0 // restore Zsqr + mov $acc1,$a1 + ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont + mov $acc2,$a2 + mov $acc3,$a3 + ldp $a2,$a3,[sp,#$S+16] + add $rp,sp,#$Zsqr + bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); + + add $rp,sp,#$S + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); + + ldr $bi,[$ap_real,#32] + ldp $a0,$a1,[$ap_real,#64] + ldp $a2,$a3,[$ap_real,#64+16] + add $bp,$ap_real,#32 + add $rp,sp,#$tmp0 + bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); + + mov $t0,$acc0 + mov $t1,$acc1 + ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont + mov $t2,$acc2 + mov $t3,$acc3 + ldp $a2,$a3,[sp,#$S+16] + add $rp,$rp_real,#64 + bl __ecp_nistz256_add // p256_mul_by_2(res_z, tmp0); + + add $rp,sp,#$tmp0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); + + ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$M] + ldp $a2,$a3,[sp,#$M+16] + add $rp,$rp_real,#32 + bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); + + add $bp,sp,#$Zsqr + add $rp,sp,#$M + bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); + + mov $t0,$acc0 // duplicate M + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + mov $a0,$acc0 // put M aside + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + add $rp,sp,#$M + bl __ecp_nistz256_add + mov $t0,$a0 // restore M + mov $t1,$a1 + ldr $bi,[$ap_real] // forward load for p256_mul_mont + mov $t2,$a2 + ldp $a0,$a1,[sp,#$S] + mov $t3,$a3 + ldp $a2,$a3,[sp,#$S+16] + bl __ecp_nistz256_add // p256_mul_by_3(M, M); + + add $bp,$ap_real,#0 + add $rp,sp,#$S + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); + + mov $t0,$acc0 + mov $t1,$acc1 + ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont + mov $t2,$acc2 + mov $t3,$acc3 + ldp $a2,$a3,[sp,#$M+16] + add $rp,sp,#$tmp0 + bl __ecp_nistz256_add // p256_mul_by_2(tmp0, S); + + add $rp,$rp_real,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); + + add $bp,sp,#$tmp0 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); + + add $bp,sp,#$S + add $rp,sp,#$S + bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); + + ldr $bi,[sp,#$M] + mov $a0,$acc0 // copy S + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + add $bp,sp,#$M + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); + + add $bp,$rp_real,#32 + add $rp,$rp_real,#32 + bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 + .inst 0xd50323bf // autiasp + ret +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +___ +} + +######################################################################## +# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT *in2); +{ +my ($res_x,$res_y,$res_z, + $H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2)=map(32*$_,(0..11)); +my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); +# above map() describes stack layout with 12 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28)); + +$code.=<<___; +.globl ecp_nistz256_point_add +.type ecp_nistz256_point_add,%function +.align 5 +ecp_nistz256_point_add: + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#32*12 + + ldp $a0,$a1,[$bp,#64] // in2_z + ldp $a2,$a3,[$bp,#64+16] + mov $rp_real,$rp + mov $ap_real,$ap + mov $bp_real,$bp + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in2infty,$t0,$t2 + cmp $in2infty,#0 + csetm $in2infty,ne // ~in2infty + add $rp,sp,#$Z2sqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); + + ldp $a0,$a1,[$ap_real,#64] // in1_z + ldp $a2,$a3,[$ap_real,#64+16] + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in1infty,$t0,$t2 + cmp $in1infty,#0 + csetm $in1infty,ne // ~in1infty + add $rp,sp,#$Z1sqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + ldr $bi,[$bp_real,#64] + ldp $a0,$a1,[sp,#$Z2sqr] + ldp $a2,$a3,[sp,#$Z2sqr+16] + add $bp,$bp_real,#64 + add $rp,sp,#$S1 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr $bi,[$ap_real,#64] + ldp $a0,$a1,[sp,#$Z1sqr] + ldp $a2,$a3,[sp,#$Z1sqr+16] + add $bp,$ap_real,#64 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr $bi,[$ap_real,#32] + ldp $a0,$a1,[sp,#$S1] + ldp $a2,$a3,[sp,#$S1+16] + add $bp,$ap_real,#32 + add $rp,sp,#$S1 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); + + ldr $bi,[$bp_real,#32] + ldp $a0,$a1,[sp,#$S2] + ldp $a2,$a3,[sp,#$S2+16] + add $bp,$bp_real,#32 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add $bp,sp,#$S1 + ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont + ldp $a0,$a1,[$ap_real] + ldp $a2,$a3,[$ap_real,#16] + add $rp,sp,#$R + bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); + + orr $acc0,$acc0,$acc1 // see if result is zero + orr $acc2,$acc2,$acc3 + orr $temp0,$acc0,$acc2 // ~is_equal(S1,S2) + + add $bp,sp,#$Z2sqr + add $rp,sp,#$U1 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr $bi,[sp,#$Z1sqr] + ldp $a0,$a1,[$bp_real] + ldp $a2,$a3,[$bp_real,#16] + add $bp,sp,#$Z1sqr + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); + + add $bp,sp,#$U1 + ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont + ldp $a2,$a3,[sp,#$R+16] + add $rp,sp,#$H + bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); + + orr $acc0,$acc0,$acc1 // see if result is zero + orr $acc2,$acc2,$acc3 + orr $acc0,$acc0,$acc2 // ~is_equal(U1,U2) + + mvn $temp1,$in1infty // -1/0 -> 0/-1 + mvn $temp2,$in2infty // -1/0 -> 0/-1 + orr $acc0,$acc0,$temp1 + orr $acc0,$acc0,$temp2 + orr $acc0,$acc0,$temp0 + cbnz $acc0,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + +.Ladd_double: + mov $ap,$ap_real + mov $rp,$rp_real + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + add sp,sp,#32*(12-4) // difference in stack frames + b .Ldouble_shortcut + +.align 4 +.Ladd_proceed: + add $rp,sp,#$Rsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr $bi,[$ap_real,#64] + ldp $a0,$a1,[sp,#$H] + ldp $a2,$a3,[sp,#$H+16] + add $bp,$ap_real,#64 + add $rp,sp,#$res_z + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldp $a0,$a1,[sp,#$H] + ldp $a2,$a3,[sp,#$H+16] + add $rp,sp,#$Hsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldr $bi,[$bp_real,#64] + ldp $a0,$a1,[sp,#$res_z] + ldp $a2,$a3,[sp,#$res_z+16] + add $bp,$bp_real,#64 + add $rp,sp,#$res_z + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); + + ldr $bi,[sp,#$H] + ldp $a0,$a1,[sp,#$Hsqr] + ldp $a2,$a3,[sp,#$Hsqr+16] + add $bp,sp,#$H + add $rp,sp,#$Hcub + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr $bi,[sp,#$Hsqr] + ldp $a0,$a1,[sp,#$U1] + ldp $a2,$a3,[sp,#$U1+16] + add $bp,sp,#$Hsqr + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); + + mov $t0,$acc0 + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + add $rp,sp,#$Hsqr + bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2); + + add $bp,sp,#$Rsqr + add $rp,sp,#$res_x + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add $bp,sp,#$Hcub + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add $bp,sp,#$U2 + ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$S1] + ldp $a2,$a3,[sp,#$S1+16] + add $rp,sp,#$res_y + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add $bp,sp,#$Hcub + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); + + ldr $bi,[sp,#$R] + ldp $a0,$a1,[sp,#$res_y] + ldp $a2,$a3,[sp,#$res_y+16] + add $bp,sp,#$R + add $rp,sp,#$res_y + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add $bp,sp,#$S2 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp $a0,$a1,[sp,#$res_x] // res + ldp $a2,$a3,[sp,#$res_x+16] + ldp $t0,$t1,[$bp_real] // in2 + ldp $t2,$t3,[$bp_real,#16] +___ +for($i=0;$i<64;$i+=32) { # conditional moves +$code.=<<___; + ldp $acc0,$acc1,[$ap_real,#$i] // in1 + cmp $in1infty,#0 // ~$in1intfy, remember? + ldp $acc2,$acc3,[$ap_real,#$i+16] + csel $t0,$a0,$t0,ne + csel $t1,$a1,$t1,ne + ldp $a0,$a1,[sp,#$res_x+$i+32] // res + csel $t2,$a2,$t2,ne + csel $t3,$a3,$t3,ne + cmp $in2infty,#0 // ~$in2intfy, remember? + ldp $a2,$a3,[sp,#$res_x+$i+48] + csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + ldp $t0,$t1,[$bp_real,#$i+32] // in2 + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + ldp $t2,$t3,[$bp_real,#$i+48] + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] +___ +} +$code.=<<___; + ldp $acc0,$acc1,[$ap_real,#$i] // in1 + cmp $in1infty,#0 // ~$in1intfy, remember? + ldp $acc2,$acc3,[$ap_real,#$i+16] + csel $t0,$a0,$t0,ne + csel $t1,$a1,$t1,ne + csel $t2,$a2,$t2,ne + csel $t3,$a3,$t3,ne + cmp $in2infty,#0 // ~$in2intfy, remember? + csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] + +.Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + .inst 0xd50323bf // autiasp + ret +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +___ +} + +######################################################################## +# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT_AFFINE *in2); +{ +my ($res_x,$res_y,$res_z, + $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9)); +my $Z1sqr = $S2; +# above map() describes stack layout with 10 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26)); + +$code.=<<___; +.globl ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,%function +.align 5 +ecp_nistz256_point_add_affine: + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov $rp_real,$rp + mov $ap_real,$ap + mov $bp_real,$bp + ldr $poly1,.Lpoly+8 + ldr $poly3,.Lpoly+24 + + ldp $a0,$a1,[$ap,#64] // in1_z + ldp $a2,$a3,[$ap,#64+16] + orr $t0,$a0,$a1 + orr $t2,$a2,$a3 + orr $in1infty,$t0,$t2 + cmp $in1infty,#0 + csetm $in1infty,ne // ~in1infty + + ldp $acc0,$acc1,[$bp] // in2_x + ldp $acc2,$acc3,[$bp,#16] + ldp $t0,$t1,[$bp,#32] // in2_y + ldp $t2,$t3,[$bp,#48] + orr $acc0,$acc0,$acc1 + orr $acc2,$acc2,$acc3 + orr $t0,$t0,$t1 + orr $t2,$t2,$t3 + orr $acc0,$acc0,$acc2 + orr $t0,$t0,$t2 + orr $in2infty,$acc0,$t0 + cmp $in2infty,#0 + csetm $in2infty,ne // ~in2infty + + add $rp,sp,#$Z1sqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + mov $a0,$acc0 + mov $a1,$acc1 + mov $a2,$acc2 + mov $a3,$acc3 + ldr $bi,[$bp_real] + add $bp,$bp_real,#0 + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); + + add $bp,$ap_real,#0 + ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$Z1sqr] + ldp $a2,$a3,[sp,#$Z1sqr+16] + add $rp,sp,#$H + bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); + + add $bp,$ap_real,#64 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr $bi,[$ap_real,#64] + ldp $a0,$a1,[sp,#$H] + ldp $a2,$a3,[sp,#$H+16] + add $bp,$ap_real,#64 + add $rp,sp,#$res_z + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldr $bi,[$bp_real,#32] + ldp $a0,$a1,[sp,#$S2] + ldp $a2,$a3,[sp,#$S2+16] + add $bp,$bp_real,#32 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add $bp,$ap_real,#32 + ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont + ldp $a2,$a3,[sp,#$H+16] + add $rp,sp,#$R + bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); + + add $rp,sp,#$Hsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldp $a0,$a1,[sp,#$R] + ldp $a2,$a3,[sp,#$R+16] + add $rp,sp,#$Rsqr + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr $bi,[sp,#$H] + ldp $a0,$a1,[sp,#$Hsqr] + ldp $a2,$a3,[sp,#$Hsqr+16] + add $bp,sp,#$H + add $rp,sp,#$Hcub + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr $bi,[$ap_real] + ldp $a0,$a1,[sp,#$Hsqr] + ldp $a2,$a3,[sp,#$Hsqr+16] + add $bp,$ap_real,#0 + add $rp,sp,#$U2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); + + mov $t0,$acc0 + mov $t1,$acc1 + mov $t2,$acc2 + mov $t3,$acc3 + add $rp,sp,#$Hsqr + bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2); + + add $bp,sp,#$Rsqr + add $rp,sp,#$res_x + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add $bp,sp,#$Hcub + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add $bp,sp,#$U2 + ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont + ldp $a0,$a1,[sp,#$Hcub] + ldp $a2,$a3,[sp,#$Hcub+16] + add $rp,sp,#$res_y + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add $bp,$ap_real,#32 + add $rp,sp,#$S2 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); + + ldr $bi,[sp,#$R] + ldp $a0,$a1,[sp,#$res_y] + ldp $a2,$a3,[sp,#$res_y+16] + add $bp,sp,#$R + add $rp,sp,#$res_y + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add $bp,sp,#$S2 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp $a0,$a1,[sp,#$res_x] // res + ldp $a2,$a3,[sp,#$res_x+16] + ldp $t0,$t1,[$bp_real] // in2 + ldp $t2,$t3,[$bp_real,#16] +___ +for($i=0;$i<64;$i+=32) { # conditional moves +$code.=<<___; + ldp $acc0,$acc1,[$ap_real,#$i] // in1 + cmp $in1infty,#0 // ~$in1intfy, remember? + ldp $acc2,$acc3,[$ap_real,#$i+16] + csel $t0,$a0,$t0,ne + csel $t1,$a1,$t1,ne + ldp $a0,$a1,[sp,#$res_x+$i+32] // res + csel $t2,$a2,$t2,ne + csel $t3,$a3,$t3,ne + cmp $in2infty,#0 // ~$in2intfy, remember? + ldp $a2,$a3,[sp,#$res_x+$i+48] + csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + ldp $t0,$t1,[$bp_real,#$i+32] // in2 + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + ldp $t2,$t3,[$bp_real,#$i+48] + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] +___ +$code.=<<___ if ($i == 0); + adr $bp_real,.Lone_mont-64 +___ +} +$code.=<<___; + ldp $acc0,$acc1,[$ap_real,#$i] // in1 + cmp $in1infty,#0 // ~$in1intfy, remember? + ldp $acc2,$acc3,[$ap_real,#$i+16] + csel $t0,$a0,$t0,ne + csel $t1,$a1,$t1,ne + csel $t2,$a2,$t2,ne + csel $t3,$a3,$t3,ne + cmp $in2infty,#0 // ~$in2intfy, remember? + csel $acc0,$t0,$acc0,ne + csel $acc1,$t1,$acc1,ne + csel $acc2,$t2,$acc2,ne + csel $acc3,$t3,$acc3,ne + stp $acc0,$acc1,[$rp_real,#$i] + stp $acc2,$acc3,[$rp_real,#$i+16] + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + .inst 0xd50323bf // autiasp + ret +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +___ +} +if (1) { +my ($ord0,$ord1) = ($poly1,$poly3); +my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24)); +my $acc7 = $bi; + +$code.=<<___; +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +// uint64_t b[4]); +.globl ecp_nistz256_ord_mul_mont +.type ecp_nistz256_ord_mul_mont,%function +.align 4 +ecp_nistz256_ord_mul_mont: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adr $ordk,.Lord + ldr $bi,[$bp] // bp[0] + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + + ldp $ord0,$ord1,[$ordk,#0] + ldp $ord2,$ord3,[$ordk,#16] + ldr $ordk,[$ordk,#32] + + mul $acc0,$a0,$bi // a[0]*b[0] + umulh $t0,$a0,$bi + + mul $acc1,$a1,$bi // a[1]*b[0] + umulh $t1,$a1,$bi + + mul $acc2,$a2,$bi // a[2]*b[0] + umulh $t2,$a2,$bi + + mul $acc3,$a3,$bi // a[3]*b[0] + umulh $acc4,$a3,$bi + + mul $t4,$acc0,$ordk + + adds $acc1,$acc1,$t0 // accumulate high parts of multiplication + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$t2 + adc $acc4,$acc4,xzr + mov $acc5,xzr +___ +for ($i=1;$i<4;$i++) { + ################################################################ + # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx + # - 0000abcd.efgh0000.abcdefgh.00000000.00000000 + # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh +$code.=<<___; + ldr $bi,[$bp,#8*$i] // b[i] + + lsl $t0,$t4,#32 + subs $acc2,$acc2,$t4 + lsr $t1,$t4,#32 + sbcs $acc3,$acc3,$t0 + sbcs $acc4,$acc4,$t1 + sbc $acc5,$acc5,xzr + + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + mul $t0,$a0,$bi + adc $t3,$t3,xzr + mul $t1,$a1,$bi + + adds $acc0,$acc1,$t2 + mul $t2,$a2,$bi + adcs $acc1,$acc2,$t3 + mul $t3,$a3,$bi + adcs $acc2,$acc3,$t4 + adcs $acc3,$acc4,$t4 + adc $acc4,$acc5,xzr + + adds $acc0,$acc0,$t0 // accumulate low parts + umulh $t0,$a0,$bi + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi + adc $acc4,$acc4,xzr + mul $t4,$acc0,$ordk + adds $acc1,$acc1,$t0 // accumulate high parts + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$t2 + adcs $acc4,$acc4,$t3 + adc $acc5,xzr,xzr +___ +} +$code.=<<___; + lsl $t0,$t4,#32 // last reduction + subs $acc2,$acc2,$t4 + lsr $t1,$t4,#32 + sbcs $acc3,$acc3,$t0 + sbcs $acc4,$acc4,$t1 + sbc $acc5,$acc5,xzr + + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + adc $t3,$t3,xzr + + adds $acc0,$acc1,$t2 + adcs $acc1,$acc2,$t3 + adcs $acc2,$acc3,$t4 + adcs $acc3,$acc4,$t4 + adc $acc4,$acc5,xzr + + subs $t0,$acc0,$ord0 // ret -= modulus + sbcs $t1,$acc1,$ord1 + sbcs $t2,$acc2,$ord2 + sbcs $t3,$acc3,$ord3 + sbcs xzr,$acc4,xzr + + csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $acc1,$acc1,$t1,lo + csel $acc2,$acc2,$t2,lo + stp $acc0,$acc1,[$rp] + csel $acc3,$acc3,$t3,lo + stp $acc2,$acc3,[$rp,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// int rep); +.globl ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_sqr_mont,%function +.align 4 +ecp_nistz256_ord_sqr_mont: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adr $ordk,.Lord + ldp $a0,$a1,[$ap] + ldp $a2,$a3,[$ap,#16] + + ldp $ord0,$ord1,[$ordk,#0] + ldp $ord2,$ord3,[$ordk,#16] + ldr $ordk,[$ordk,#32] + b .Loop_ord_sqr + +.align 4 +.Loop_ord_sqr: + sub $bp,$bp,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul $acc1,$a1,$a0 // a[1]*a[0] + umulh $t1,$a1,$a0 + mul $acc2,$a2,$a0 // a[2]*a[0] + umulh $t2,$a2,$a0 + mul $acc3,$a3,$a0 // a[3]*a[0] + umulh $acc4,$a3,$a0 + + adds $acc2,$acc2,$t1 // accumulate high parts of multiplication + mul $t0,$a2,$a1 // a[2]*a[1] + umulh $t1,$a2,$a1 + adcs $acc3,$acc3,$t2 + mul $t2,$a3,$a1 // a[3]*a[1] + umulh $t3,$a3,$a1 + adc $acc4,$acc4,xzr // can't overflow + + mul $acc5,$a3,$a2 // a[3]*a[2] + umulh $acc6,$a3,$a2 + + adds $t1,$t1,$t2 // accumulate high parts of multiplication + mul $acc0,$a0,$a0 // a[0]*a[0] + adc $t2,$t3,xzr // can't overflow + + adds $acc3,$acc3,$t0 // accumulate low parts of multiplication + umulh $a0,$a0,$a0 + adcs $acc4,$acc4,$t1 + mul $t1,$a1,$a1 // a[1]*a[1] + adcs $acc5,$acc5,$t2 + umulh $a1,$a1,$a1 + adc $acc6,$acc6,xzr // can't overflow + + adds $acc1,$acc1,$acc1 // acc[1-6]*=2 + mul $t2,$a2,$a2 // a[2]*a[2] + adcs $acc2,$acc2,$acc2 + umulh $a2,$a2,$a2 + adcs $acc3,$acc3,$acc3 + mul $t3,$a3,$a3 // a[3]*a[3] + adcs $acc4,$acc4,$acc4 + umulh $a3,$a3,$a3 + adcs $acc5,$acc5,$acc5 + adcs $acc6,$acc6,$acc6 + adc $acc7,xzr,xzr + + adds $acc1,$acc1,$a0 // +a[i]*a[i] + mul $t4,$acc0,$ordk + adcs $acc2,$acc2,$t1 + adcs $acc3,$acc3,$a1 + adcs $acc4,$acc4,$t2 + adcs $acc5,$acc5,$a2 + adcs $acc6,$acc6,$t3 + adc $acc7,$acc7,$a3 +___ +for($i=0; $i<4; $i++) { # reductions +$code.=<<___; + subs xzr,$acc0,#1 + umulh $t1,$ord0,$t4 + mul $t2,$ord1,$t4 + umulh $t3,$ord1,$t4 + + adcs $t2,$t2,$t1 + adc $t3,$t3,xzr + + adds $acc0,$acc1,$t2 + adcs $acc1,$acc2,$t3 + adcs $acc2,$acc3,$t4 + adc $acc3,xzr,$t4 // can't overflow +___ +$code.=<<___ if ($i<3); + mul $t3,$acc0,$ordk +___ +$code.=<<___; + lsl $t0,$t4,#32 + subs $acc1,$acc1,$t4 + lsr $t1,$t4,#32 + sbcs $acc2,$acc2,$t0 + sbc $acc3,$acc3,$t1 // can't borrow +___ + ($t3,$t4) = ($t4,$t3); +} +$code.=<<___; + adds $acc0,$acc0,$acc4 // accumulate upper half + adcs $acc1,$acc1,$acc5 + adcs $acc2,$acc2,$acc6 + adcs $acc3,$acc3,$acc7 + adc $acc4,xzr,xzr + + subs $t0,$acc0,$ord0 // ret -= modulus + sbcs $t1,$acc1,$ord1 + sbcs $t2,$acc2,$ord2 + sbcs $t3,$acc3,$ord3 + sbcs xzr,$acc4,xzr + + csel $a0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus + csel $a1,$acc1,$t1,lo + csel $a2,$acc2,$t2,lo + csel $a3,$acc3,$t3,lo + + cbnz $bp,.Loop_ord_sqr + + stp $a0,$a1,[$rp] + stp $a2,$a3,[$rp,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +___ +} } + +######################################################################## +# scatter-gather subroutines +{ +my ($out,$inp,$index,$mask)=map("x$_",(0..3)); +$code.=<<___; +// void ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1, +// int x2); +.globl ecp_nistz256_scatter_w5 +.type ecp_nistz256_scatter_w5,%function +.align 4 +ecp_nistz256_scatter_w5: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + add $out,$out,$index,lsl#2 + + ldp x4,x5,[$inp] // X + ldp x6,x7,[$inp,#16] + str w4,[$out,#64*0-4] + lsr x4,x4,#32 + str w5,[$out,#64*1-4] + lsr x5,x5,#32 + str w6,[$out,#64*2-4] + lsr x6,x6,#32 + str w7,[$out,#64*3-4] + lsr x7,x7,#32 + str w4,[$out,#64*4-4] + str w5,[$out,#64*5-4] + str w6,[$out,#64*6-4] + str w7,[$out,#64*7-4] + add $out,$out,#64*8 + + ldp x4,x5,[$inp,#32] // Y + ldp x6,x7,[$inp,#48] + str w4,[$out,#64*0-4] + lsr x4,x4,#32 + str w5,[$out,#64*1-4] + lsr x5,x5,#32 + str w6,[$out,#64*2-4] + lsr x6,x6,#32 + str w7,[$out,#64*3-4] + lsr x7,x7,#32 + str w4,[$out,#64*4-4] + str w5,[$out,#64*5-4] + str w6,[$out,#64*6-4] + str w7,[$out,#64*7-4] + add $out,$out,#64*8 + + ldp x4,x5,[$inp,#64] // Z + ldp x6,x7,[$inp,#80] + str w4,[$out,#64*0-4] + lsr x4,x4,#32 + str w5,[$out,#64*1-4] + lsr x5,x5,#32 + str w6,[$out,#64*2-4] + lsr x6,x6,#32 + str w7,[$out,#64*3-4] + lsr x7,x7,#32 + str w4,[$out,#64*4-4] + str w5,[$out,#64*5-4] + str w6,[$out,#64*6-4] + str w7,[$out,#64*7-4] + + ldr x29,[sp],#16 + ret +.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 + +// void ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1, +// int x2); +.globl ecp_nistz256_gather_w5 +.type ecp_nistz256_gather_w5,%function +.align 4 +ecp_nistz256_gather_w5: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + cmp $index,xzr + csetm x3,ne + add $index,$index,x3 + add $inp,$inp,$index,lsl#2 + + ldr w4,[$inp,#64*0] + ldr w5,[$inp,#64*1] + ldr w6,[$inp,#64*2] + ldr w7,[$inp,#64*3] + ldr w8,[$inp,#64*4] + ldr w9,[$inp,#64*5] + ldr w10,[$inp,#64*6] + ldr w11,[$inp,#64*7] + add $inp,$inp,#64*8 + orr x4,x4,x8,lsl#32 + orr x5,x5,x9,lsl#32 + orr x6,x6,x10,lsl#32 + orr x7,x7,x11,lsl#32 + csel x4,x4,xzr,ne + csel x5,x5,xzr,ne + csel x6,x6,xzr,ne + csel x7,x7,xzr,ne + stp x4,x5,[$out] // X + stp x6,x7,[$out,#16] + + ldr w4,[$inp,#64*0] + ldr w5,[$inp,#64*1] + ldr w6,[$inp,#64*2] + ldr w7,[$inp,#64*3] + ldr w8,[$inp,#64*4] + ldr w9,[$inp,#64*5] + ldr w10,[$inp,#64*6] + ldr w11,[$inp,#64*7] + add $inp,$inp,#64*8 + orr x4,x4,x8,lsl#32 + orr x5,x5,x9,lsl#32 + orr x6,x6,x10,lsl#32 + orr x7,x7,x11,lsl#32 + csel x4,x4,xzr,ne + csel x5,x5,xzr,ne + csel x6,x6,xzr,ne + csel x7,x7,xzr,ne + stp x4,x5,[$out,#32] // Y + stp x6,x7,[$out,#48] + + ldr w4,[$inp,#64*0] + ldr w5,[$inp,#64*1] + ldr w6,[$inp,#64*2] + ldr w7,[$inp,#64*3] + ldr w8,[$inp,#64*4] + ldr w9,[$inp,#64*5] + ldr w10,[$inp,#64*6] + ldr w11,[$inp,#64*7] + orr x4,x4,x8,lsl#32 + orr x5,x5,x9,lsl#32 + orr x6,x6,x10,lsl#32 + orr x7,x7,x11,lsl#32 + csel x4,x4,xzr,ne + csel x5,x5,xzr,ne + csel x6,x6,xzr,ne + csel x7,x7,xzr,ne + stp x4,x5,[$out,#64] // Z + stp x6,x7,[$out,#80] + + ldr x29,[sp],#16 + ret +.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 + +// void ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1, +// int x2); +.globl ecp_nistz256_scatter_w7 +.type ecp_nistz256_scatter_w7,%function +.align 4 +ecp_nistz256_scatter_w7: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + add $out,$out,$index + mov $index,#64/8 +.Loop_scatter_w7: + ldr x3,[$inp],#8 + subs $index,$index,#1 + prfm pstl1strm,[$out,#4096+64*0] + prfm pstl1strm,[$out,#4096+64*1] + prfm pstl1strm,[$out,#4096+64*2] + prfm pstl1strm,[$out,#4096+64*3] + prfm pstl1strm,[$out,#4096+64*4] + prfm pstl1strm,[$out,#4096+64*5] + prfm pstl1strm,[$out,#4096+64*6] + prfm pstl1strm,[$out,#4096+64*7] + strb w3,[$out,#64*0] + lsr x3,x3,#8 + strb w3,[$out,#64*1] + lsr x3,x3,#8 + strb w3,[$out,#64*2] + lsr x3,x3,#8 + strb w3,[$out,#64*3] + lsr x3,x3,#8 + strb w3,[$out,#64*4] + lsr x3,x3,#8 + strb w3,[$out,#64*5] + lsr x3,x3,#8 + strb w3,[$out,#64*6] + lsr x3,x3,#8 + strb w3,[$out,#64*7] + add $out,$out,#64*8 + b.ne .Loop_scatter_w7 + + ldr x29,[sp],#16 + ret +.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 + +// void ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1, +// int x2); +.globl ecp_nistz256_gather_w7 +.type ecp_nistz256_gather_w7,%function +.align 4 +ecp_nistz256_gather_w7: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + cmp $index,xzr + csetm x3,ne + add $index,$index,x3 + add $inp,$inp,$index + mov $index,#64/8 + nop +.Loop_gather_w7: + ldrb w4,[$inp,#64*0] + prfm pldl1strm,[$inp,#4096+64*0] + subs $index,$index,#1 + ldrb w5,[$inp,#64*1] + prfm pldl1strm,[$inp,#4096+64*1] + ldrb w6,[$inp,#64*2] + prfm pldl1strm,[$inp,#4096+64*2] + ldrb w7,[$inp,#64*3] + prfm pldl1strm,[$inp,#4096+64*3] + ldrb w8,[$inp,#64*4] + prfm pldl1strm,[$inp,#4096+64*4] + ldrb w9,[$inp,#64*5] + prfm pldl1strm,[$inp,#4096+64*5] + ldrb w10,[$inp,#64*6] + prfm pldl1strm,[$inp,#4096+64*6] + ldrb w11,[$inp,#64*7] + prfm pldl1strm,[$inp,#4096+64*7] + add $inp,$inp,#64*8 + orr x4,x4,x5,lsl#8 + orr x6,x6,x7,lsl#8 + orr x8,x8,x9,lsl#8 + orr x4,x4,x6,lsl#16 + orr x10,x10,x11,lsl#8 + orr x4,x4,x8,lsl#32 + orr x4,x4,x10,lsl#48 + and x4,x4,x3 + str x4,[$out],#8 + b.ne .Loop_gather_w7 + + ldr x29,[sp],#16 + ret +.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-avx2.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-avx2.pl new file mode 100755 index 000000000..5071d09ac --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-avx2.pl @@ -0,0 +1,2080 @@ +#! /usr/bin/env perl +# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +# Copyright (c) 2014, Intel Corporation. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) +# (1) Intel Corporation, Israel Development Center, Haifa, Israel +# (2) University of Haifa, Israel +# +# Reference: +# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with +# 256 Bit Primes" + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); + $addx = ($1>=2.23); +} + +if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); + $addx = ($1>=2.10); +} + +if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); + $addx = ($1>=12); +} + +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) { + my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 + $avx = ($ver>=3.0) + ($ver>=3.01); + $addx = ($ver>=3.03); +} + +if ($avx>=2) {{ +$digit_size = "\$29"; +$n_digits = "\$9"; + +$code.=<<___; +.text + +.align 64 +.LAVX2_AND_MASK: +.LAVX2_POLY: +.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff +.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff +.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff +.quad 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff +.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 +.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 +.quad 0x00040000, 0x00040000, 0x00040000, 0x00040000 +.quad 0x1fe00000, 0x1fe00000, 0x1fe00000, 0x1fe00000 +.quad 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff + +.LAVX2_POLY_x2: +.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC +.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC +.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC +.quad 0x400007FC, 0x400007FC, 0x400007FC, 0x400007FC +.quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE +.quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE +.quad 0x400FFFFE, 0x400FFFFE, 0x400FFFFE, 0x400FFFFE +.quad 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE +.quad 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC + +.LAVX2_POLY_x8: +.quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8 +.quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8 +.quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8 +.quad 0x80000FF8, 0x80000FF8, 0x80000FF8, 0x80000FF8 +.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC +.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC +.quad 0x801FFFFC, 0x801FFFFC, 0x801FFFFC, 0x801FFFFC +.quad 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC +.quad 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8 + +.LONE: +.quad 0x00000020, 0x00000020, 0x00000020, 0x00000020 +.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 +.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 +.quad 0x1fffc000, 0x1fffc000, 0x1fffc000, 0x1fffc000 +.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff +.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff +.quad 0x1f7fffff, 0x1f7fffff, 0x1f7fffff, 0x1f7fffff +.quad 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff +.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 + +# RR = 2^266 mod p in AVX2 format, to transform from the native OpenSSL +# Montgomery form (*2^256) to our format (*2^261) + +.LTO_MONT_AVX2: +.quad 0x00000400, 0x00000400, 0x00000400, 0x00000400 +.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 +.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 +.quad 0x1ff80000, 0x1ff80000, 0x1ff80000, 0x1ff80000 +.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff +.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff +.quad 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff +.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff +.quad 0x00000003, 0x00000003, 0x00000003, 0x00000003 + +.LFROM_MONT_AVX2: +.quad 0x00000001, 0x00000001, 0x00000001, 0x00000001 +.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 +.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 +.quad 0x1ffffe00, 0x1ffffe00, 0x1ffffe00, 0x1ffffe00 +.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff +.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff +.quad 0x1ffbffff, 0x1ffbffff, 0x1ffbffff, 0x1ffbffff +.quad 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff +.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 + +.LIntOne: +.long 1,1,1,1,1,1,1,1 +___ + +{ +# This function receives a pointer to an array of four affine points +# (X, Y, <1>) and rearranges the data for AVX2 execution, while +# converting it to 2^29 radix redundant form + +my ($X0,$X1,$X2,$X3, $Y0,$Y1,$Y2,$Y3, + $T0,$T1,$T2,$T3, $T4,$T5,$T6,$T7)=map("%ymm$_",(0..15)); + +$code.=<<___; +.globl ecp_nistz256_avx2_transpose_convert +.type ecp_nistz256_avx2_transpose_convert,\@function,2 +.align 64 +ecp_nistz256_avx2_transpose_convert: + vzeroupper +___ +$code.=<<___ if ($win64); + lea -8-16*10(%rsp), %rsp + vmovaps %xmm6, -8-16*10(%rax) + vmovaps %xmm7, -8-16*9(%rax) + vmovaps %xmm8, -8-16*8(%rax) + vmovaps %xmm9, -8-16*7(%rax) + vmovaps %xmm10, -8-16*6(%rax) + vmovaps %xmm11, -8-16*5(%rax) + vmovaps %xmm12, -8-16*4(%rax) + vmovaps %xmm13, -8-16*3(%rax) + vmovaps %xmm14, -8-16*2(%rax) + vmovaps %xmm15, -8-16*1(%rax) +___ +$code.=<<___; + # Load the data + vmovdqa 32*0(%rsi), $X0 + lea 112(%rsi), %rax # size optimization + vmovdqa 32*1(%rsi), $Y0 + lea .LAVX2_AND_MASK(%rip), %rdx + vmovdqa 32*2(%rsi), $X1 + vmovdqa 32*3(%rsi), $Y1 + vmovdqa 32*4-112(%rax), $X2 + vmovdqa 32*5-112(%rax), $Y2 + vmovdqa 32*6-112(%rax), $X3 + vmovdqa 32*7-112(%rax), $Y3 + + # Transpose X and Y independently + vpunpcklqdq $X1, $X0, $T0 # T0 = [B2 A2 B0 A0] + vpunpcklqdq $X3, $X2, $T1 # T1 = [D2 C2 D0 C0] + vpunpckhqdq $X1, $X0, $T2 # T2 = [B3 A3 B1 A1] + vpunpckhqdq $X3, $X2, $T3 # T3 = [D3 C3 D1 C1] + + vpunpcklqdq $Y1, $Y0, $T4 + vpunpcklqdq $Y3, $Y2, $T5 + vpunpckhqdq $Y1, $Y0, $T6 + vpunpckhqdq $Y3, $Y2, $T7 + + vperm2i128 \$0x20, $T1, $T0, $X0 # X0 = [D0 C0 B0 A0] + vperm2i128 \$0x20, $T3, $T2, $X1 # X1 = [D1 C1 B1 A1] + vperm2i128 \$0x31, $T1, $T0, $X2 # X2 = [D2 C2 B2 A2] + vperm2i128 \$0x31, $T3, $T2, $X3 # X3 = [D3 C3 B3 A3] + + vperm2i128 \$0x20, $T5, $T4, $Y0 + vperm2i128 \$0x20, $T7, $T6, $Y1 + vperm2i128 \$0x31, $T5, $T4, $Y2 + vperm2i128 \$0x31, $T7, $T6, $Y3 + vmovdqa (%rdx), $T7 + + vpand (%rdx), $X0, $T0 # out[0] = in[0] & mask; + vpsrlq \$29, $X0, $X0 + vpand $T7, $X0, $T1 # out[1] = (in[0] >> shift) & mask; + vpsrlq \$29, $X0, $X0 + vpsllq \$6, $X1, $T2 + vpxor $X0, $T2, $T2 + vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask; + vpsrlq \$23, $X1, $X1 + vpand $T7, $X1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask; + vpsrlq \$29, $X1, $X1 + vpsllq \$12, $X2, $T4 + vpxor $X1, $T4, $T4 + vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask; + vpsrlq \$17, $X2, $X2 + vpand $T7, $X2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask; + vpsrlq \$29, $X2, $X2 + vpsllq \$18, $X3, $T6 + vpxor $X2, $T6, $T6 + vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask; + vpsrlq \$11, $X3, $X3 + vmovdqa $T0, 32*0(%rdi) + lea 112(%rdi), %rax # size optimization + vpand $T7, $X3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask; + vpsrlq \$29, $X3, $X3 # out[8] = (in[3] >> ((shift*8)%64)) & mask; + + vmovdqa $T1, 32*1(%rdi) + vmovdqa $T2, 32*2(%rdi) + vmovdqa $T3, 32*3(%rdi) + vmovdqa $T4, 32*4-112(%rax) + vmovdqa $T5, 32*5-112(%rax) + vmovdqa $T6, 32*6-112(%rax) + vmovdqa $T0, 32*7-112(%rax) + vmovdqa $X3, 32*8-112(%rax) + lea 448(%rdi), %rax # size optimization + + vpand $T7, $Y0, $T0 # out[0] = in[0] & mask; + vpsrlq \$29, $Y0, $Y0 + vpand $T7, $Y0, $T1 # out[1] = (in[0] >> shift) & mask; + vpsrlq \$29, $Y0, $Y0 + vpsllq \$6, $Y1, $T2 + vpxor $Y0, $T2, $T2 + vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask; + vpsrlq \$23, $Y1, $Y1 + vpand $T7, $Y1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask; + vpsrlq \$29, $Y1, $Y1 + vpsllq \$12, $Y2, $T4 + vpxor $Y1, $T4, $T4 + vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask; + vpsrlq \$17, $Y2, $Y2 + vpand $T7, $Y2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask; + vpsrlq \$29, $Y2, $Y2 + vpsllq \$18, $Y3, $T6 + vpxor $Y2, $T6, $T6 + vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask; + vpsrlq \$11, $Y3, $Y3 + vmovdqa $T0, 32*9-448(%rax) + vpand $T7, $Y3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask; + vpsrlq \$29, $Y3, $Y3 # out[8] = (in[3] >> ((shift*8)%64)) & mask; + + vmovdqa $T1, 32*10-448(%rax) + vmovdqa $T2, 32*11-448(%rax) + vmovdqa $T3, 32*12-448(%rax) + vmovdqa $T4, 32*13-448(%rax) + vmovdqa $T5, 32*14-448(%rax) + vmovdqa $T6, 32*15-448(%rax) + vmovdqa $T0, 32*16-448(%rax) + vmovdqa $Y3, 32*17-448(%rax) + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps 16*0(%rsp), %xmm6 + movaps 16*1(%rsp), %xmm7 + movaps 16*2(%rsp), %xmm8 + movaps 16*3(%rsp), %xmm9 + movaps 16*4(%rsp), %xmm10 + movaps 16*5(%rsp), %xmm11 + movaps 16*6(%rsp), %xmm12 + movaps 16*7(%rsp), %xmm13 + movaps 16*8(%rsp), %xmm14 + movaps 16*9(%rsp), %xmm15 + lea 8+16*10(%rsp), %rsp +___ +$code.=<<___; + ret +.size ecp_nistz256_avx2_transpose_convert,.-ecp_nistz256_avx2_transpose_convert +___ +} +{ +################################################################################ +# This function receives a pointer to an array of four AVX2 formatted points +# (X, Y, Z) convert the data to normal representation, and rearranges the data + +my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8)); +my ($T0,$T1,$T2,$T3, $T4,$T5,$T6)=map("%ymm$_",(9..15)); + +$code.=<<___; + +.globl ecp_nistz256_avx2_convert_transpose_back +.type ecp_nistz256_avx2_convert_transpose_back,\@function,2 +.align 32 +ecp_nistz256_avx2_convert_transpose_back: + vzeroupper +___ +$code.=<<___ if ($win64); + lea -8-16*10(%rsp), %rsp + vmovaps %xmm6, -8-16*10(%rax) + vmovaps %xmm7, -8-16*9(%rax) + vmovaps %xmm8, -8-16*8(%rax) + vmovaps %xmm9, -8-16*7(%rax) + vmovaps %xmm10, -8-16*6(%rax) + vmovaps %xmm11, -8-16*5(%rax) + vmovaps %xmm12, -8-16*4(%rax) + vmovaps %xmm13, -8-16*3(%rax) + vmovaps %xmm14, -8-16*2(%rax) + vmovaps %xmm15, -8-16*1(%rax) +___ +$code.=<<___; + mov \$3, %ecx + +.Lconv_loop: + vmovdqa 32*0(%rsi), $D0 + lea 160(%rsi), %rax # size optimization + vmovdqa 32*1(%rsi), $D1 + vmovdqa 32*2(%rsi), $D2 + vmovdqa 32*3(%rsi), $D3 + vmovdqa 32*4-160(%rax), $D4 + vmovdqa 32*5-160(%rax), $D5 + vmovdqa 32*6-160(%rax), $D6 + vmovdqa 32*7-160(%rax), $D7 + vmovdqa 32*8-160(%rax), $D8 + + vpsllq \$29, $D1, $D1 + vpsllq \$58, $D2, $T0 + vpaddq $D1, $D0, $D0 + vpaddq $T0, $D0, $D0 # out[0] = (in[0]) ^ (in[1] << shift*1) ^ (in[2] << shift*2); + + vpsrlq \$6, $D2, $D2 + vpsllq \$23, $D3, $D3 + vpsllq \$52, $D4, $T1 + vpaddq $D2, $D3, $D3 + vpaddq $D3, $T1, $D1 # out[1] = (in[2] >> (64*1-shift*2)) ^ (in[3] << shift*3%64) ^ (in[4] << shift*4%64); + + vpsrlq \$12, $D4, $D4 + vpsllq \$17, $D5, $D5 + vpsllq \$46, $D6, $T2 + vpaddq $D4, $D5, $D5 + vpaddq $D5, $T2, $D2 # out[2] = (in[4] >> (64*2-shift*4)) ^ (in[5] << shift*5%64) ^ (in[6] << shift*6%64); + + vpsrlq \$18, $D6, $D6 + vpsllq \$11, $D7, $D7 + vpsllq \$40, $D8, $T3 + vpaddq $D6, $D7, $D7 + vpaddq $D7, $T3, $D3 # out[3] = (in[6] >> (64*3-shift*6)) ^ (in[7] << shift*7%64) ^ (in[8] << shift*8%64); + + vpunpcklqdq $D1, $D0, $T0 # T0 = [B2 A2 B0 A0] + vpunpcklqdq $D3, $D2, $T1 # T1 = [D2 C2 D0 C0] + vpunpckhqdq $D1, $D0, $T2 # T2 = [B3 A3 B1 A1] + vpunpckhqdq $D3, $D2, $T3 # T3 = [D3 C3 D1 C1] + + vperm2i128 \$0x20, $T1, $T0, $D0 # X0 = [D0 C0 B0 A0] + vperm2i128 \$0x20, $T3, $T2, $D1 # X1 = [D1 C1 B1 A1] + vperm2i128 \$0x31, $T1, $T0, $D2 # X2 = [D2 C2 B2 A2] + vperm2i128 \$0x31, $T3, $T2, $D3 # X3 = [D3 C3 B3 A3] + + vmovdqa $D0, 32*0(%rdi) + vmovdqa $D1, 32*3(%rdi) + vmovdqa $D2, 32*6(%rdi) + vmovdqa $D3, 32*9(%rdi) + + lea 32*9(%rsi), %rsi + lea 32*1(%rdi), %rdi + + dec %ecx + jnz .Lconv_loop + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps 16*0(%rsp), %xmm6 + movaps 16*1(%rsp), %xmm7 + movaps 16*2(%rsp), %xmm8 + movaps 16*3(%rsp), %xmm9 + movaps 16*4(%rsp), %xmm10 + movaps 16*5(%rsp), %xmm11 + movaps 16*6(%rsp), %xmm12 + movaps 16*7(%rsp), %xmm13 + movaps 16*8(%rsp), %xmm14 + movaps 16*9(%rsp), %xmm15 + lea 8+16*10(%rsp), %rsp +___ +$code.=<<___; + ret +.size ecp_nistz256_avx2_convert_transpose_back,.-ecp_nistz256_avx2_convert_transpose_back +___ +} +{ +my ($r_ptr,$a_ptr,$b_ptr,$itr)=("%rdi","%rsi","%rdx","%ecx"); +my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4,$ACC5,$ACC6,$ACC7,$ACC8)=map("%ymm$_",(0..8)); +my ($B,$Y,$T0,$AND_MASK,$OVERFLOW)=map("%ymm$_",(9..13)); + +sub NORMALIZE { +my $ret=<<___; + vpsrlq $digit_size, $ACC0, $T0 + vpand $AND_MASK, $ACC0, $ACC0 + vpaddq $T0, $ACC1, $ACC1 + + vpsrlq $digit_size, $ACC1, $T0 + vpand $AND_MASK, $ACC1, $ACC1 + vpaddq $T0, $ACC2, $ACC2 + + vpsrlq $digit_size, $ACC2, $T0 + vpand $AND_MASK, $ACC2, $ACC2 + vpaddq $T0, $ACC3, $ACC3 + + vpsrlq $digit_size, $ACC3, $T0 + vpand $AND_MASK, $ACC3, $ACC3 + vpaddq $T0, $ACC4, $ACC4 + + vpsrlq $digit_size, $ACC4, $T0 + vpand $AND_MASK, $ACC4, $ACC4 + vpaddq $T0, $ACC5, $ACC5 + + vpsrlq $digit_size, $ACC5, $T0 + vpand $AND_MASK, $ACC5, $ACC5 + vpaddq $T0, $ACC6, $ACC6 + + vpsrlq $digit_size, $ACC6, $T0 + vpand $AND_MASK, $ACC6, $ACC6 + vpaddq $T0, $ACC7, $ACC7 + + vpsrlq $digit_size, $ACC7, $T0 + vpand $AND_MASK, $ACC7, $ACC7 + vpaddq $T0, $ACC8, $ACC8 + #vpand $AND_MASK, $ACC8, $ACC8 +___ + $ret; +} + +sub STORE { +my $ret=<<___; + vmovdqa $ACC0, 32*0(%rdi) + lea 160(%rdi), %rax # size optimization + vmovdqa $ACC1, 32*1(%rdi) + vmovdqa $ACC2, 32*2(%rdi) + vmovdqa $ACC3, 32*3(%rdi) + vmovdqa $ACC4, 32*4-160(%rax) + vmovdqa $ACC5, 32*5-160(%rax) + vmovdqa $ACC6, 32*6-160(%rax) + vmovdqa $ACC7, 32*7-160(%rax) + vmovdqa $ACC8, 32*8-160(%rax) +___ + $ret; +} + +$code.=<<___; +.type avx2_normalize,\@abi-omnipotent +.align 32 +avx2_normalize: + vpsrlq $digit_size, $ACC0, $T0 + vpand $AND_MASK, $ACC0, $ACC0 + vpaddq $T0, $ACC1, $ACC1 + + vpsrlq $digit_size, $ACC1, $T0 + vpand $AND_MASK, $ACC1, $ACC1 + vpaddq $T0, $ACC2, $ACC2 + + vpsrlq $digit_size, $ACC2, $T0 + vpand $AND_MASK, $ACC2, $ACC2 + vpaddq $T0, $ACC3, $ACC3 + + vpsrlq $digit_size, $ACC3, $T0 + vpand $AND_MASK, $ACC3, $ACC3 + vpaddq $T0, $ACC4, $ACC4 + + vpsrlq $digit_size, $ACC4, $T0 + vpand $AND_MASK, $ACC4, $ACC4 + vpaddq $T0, $ACC5, $ACC5 + + vpsrlq $digit_size, $ACC5, $T0 + vpand $AND_MASK, $ACC5, $ACC5 + vpaddq $T0, $ACC6, $ACC6 + + vpsrlq $digit_size, $ACC6, $T0 + vpand $AND_MASK, $ACC6, $ACC6 + vpaddq $T0, $ACC7, $ACC7 + + vpsrlq $digit_size, $ACC7, $T0 + vpand $AND_MASK, $ACC7, $ACC7 + vpaddq $T0, $ACC8, $ACC8 + #vpand $AND_MASK, $ACC8, $ACC8 + + ret +.size avx2_normalize,.-avx2_normalize + +.type avx2_normalize_n_store,\@abi-omnipotent +.align 32 +avx2_normalize_n_store: + vpsrlq $digit_size, $ACC0, $T0 + vpand $AND_MASK, $ACC0, $ACC0 + vpaddq $T0, $ACC1, $ACC1 + + vpsrlq $digit_size, $ACC1, $T0 + vpand $AND_MASK, $ACC1, $ACC1 + vmovdqa $ACC0, 32*0(%rdi) + lea 160(%rdi), %rax # size optimization + vpaddq $T0, $ACC2, $ACC2 + + vpsrlq $digit_size, $ACC2, $T0 + vpand $AND_MASK, $ACC2, $ACC2 + vmovdqa $ACC1, 32*1(%rdi) + vpaddq $T0, $ACC3, $ACC3 + + vpsrlq $digit_size, $ACC3, $T0 + vpand $AND_MASK, $ACC3, $ACC3 + vmovdqa $ACC2, 32*2(%rdi) + vpaddq $T0, $ACC4, $ACC4 + + vpsrlq $digit_size, $ACC4, $T0 + vpand $AND_MASK, $ACC4, $ACC4 + vmovdqa $ACC3, 32*3(%rdi) + vpaddq $T0, $ACC5, $ACC5 + + vpsrlq $digit_size, $ACC5, $T0 + vpand $AND_MASK, $ACC5, $ACC5 + vmovdqa $ACC4, 32*4-160(%rax) + vpaddq $T0, $ACC6, $ACC6 + + vpsrlq $digit_size, $ACC6, $T0 + vpand $AND_MASK, $ACC6, $ACC6 + vmovdqa $ACC5, 32*5-160(%rax) + vpaddq $T0, $ACC7, $ACC7 + + vpsrlq $digit_size, $ACC7, $T0 + vpand $AND_MASK, $ACC7, $ACC7 + vmovdqa $ACC6, 32*6-160(%rax) + vpaddq $T0, $ACC8, $ACC8 + #vpand $AND_MASK, $ACC8, $ACC8 + vmovdqa $ACC7, 32*7-160(%rax) + vmovdqa $ACC8, 32*8-160(%rax) + + ret +.size avx2_normalize_n_store,.-avx2_normalize_n_store + +################################################################################ +# void avx2_mul_x4(void* RESULTx4, void *Ax4, void *Bx4); +.type avx2_mul_x4,\@abi-omnipotent +.align 32 +avx2_mul_x4: + lea .LAVX2_POLY(%rip), %rax + + vpxor $ACC0, $ACC0, $ACC0 + vpxor $ACC1, $ACC1, $ACC1 + vpxor $ACC2, $ACC2, $ACC2 + vpxor $ACC3, $ACC3, $ACC3 + vpxor $ACC4, $ACC4, $ACC4 + vpxor $ACC5, $ACC5, $ACC5 + vpxor $ACC6, $ACC6, $ACC6 + vpxor $ACC7, $ACC7, $ACC7 + + vmovdqa 32*7(%rax), %ymm14 + vmovdqa 32*8(%rax), %ymm15 + + mov $n_digits, $itr + lea -512($a_ptr), $a_ptr # strategic bias to control u-op density + jmp .Lavx2_mul_x4_loop + +.align 32 +.Lavx2_mul_x4_loop: + vmovdqa 32*0($b_ptr), $B + lea 32*1($b_ptr), $b_ptr + + vpmuludq 32*0+512($a_ptr), $B, $T0 + vpmuludq 32*1+512($a_ptr), $B, $OVERFLOW # borrow $OVERFLOW + vpaddq $T0, $ACC0, $ACC0 + vpmuludq 32*2+512($a_ptr), $B, $T0 + vpaddq $OVERFLOW, $ACC1, $ACC1 + vpand $AND_MASK, $ACC0, $Y + vpmuludq 32*3+512($a_ptr), $B, $OVERFLOW + vpaddq $T0, $ACC2, $ACC2 + vpmuludq 32*4+512($a_ptr), $B, $T0 + vpaddq $OVERFLOW, $ACC3, $ACC3 + vpmuludq 32*5+512($a_ptr), $B, $OVERFLOW + vpaddq $T0, $ACC4, $ACC4 + vpmuludq 32*6+512($a_ptr), $B, $T0 + vpaddq $OVERFLOW, $ACC5, $ACC5 + vpmuludq 32*7+512($a_ptr), $B, $OVERFLOW + vpaddq $T0, $ACC6, $ACC6 + + # Skip some multiplications, optimizing for the constant poly + vpmuludq $AND_MASK, $Y, $T0 + vpaddq $OVERFLOW, $ACC7, $ACC7 + vpmuludq 32*8+512($a_ptr), $B, $ACC8 + vpaddq $T0, $ACC0, $OVERFLOW + vpaddq $T0, $ACC1, $ACC0 + vpsrlq $digit_size, $OVERFLOW, $OVERFLOW + vpaddq $T0, $ACC2, $ACC1 + vpmuludq 32*3(%rax), $Y, $T0 + vpaddq $OVERFLOW, $ACC0, $ACC0 + vpaddq $T0, $ACC3, $ACC2 + .byte 0x67 + vmovdqa $ACC4, $ACC3 + vpsllq \$18, $Y, $OVERFLOW + .byte 0x67 + vmovdqa $ACC5, $ACC4 + vpmuludq %ymm14, $Y, $T0 + vpaddq $OVERFLOW, $ACC6, $ACC5 + vpmuludq %ymm15, $Y, $OVERFLOW + vpaddq $T0, $ACC7, $ACC6 + vpaddq $OVERFLOW, $ACC8, $ACC7 + + dec $itr + jnz .Lavx2_mul_x4_loop + + vpxor $ACC8, $ACC8, $ACC8 + + ret +.size avx2_mul_x4,.-avx2_mul_x4 + +# Function optimized for the constant 1 +################################################################################ +# void avx2_mul_by1_x4(void* RESULTx4, void *Ax4); +.type avx2_mul_by1_x4,\@abi-omnipotent +.align 32 +avx2_mul_by1_x4: + lea .LAVX2_POLY(%rip), %rax + + vpxor $ACC0, $ACC0, $ACC0 + vpxor $ACC1, $ACC1, $ACC1 + vpxor $ACC2, $ACC2, $ACC2 + vpxor $ACC3, $ACC3, $ACC3 + vpxor $ACC4, $ACC4, $ACC4 + vpxor $ACC5, $ACC5, $ACC5 + vpxor $ACC6, $ACC6, $ACC6 + vpxor $ACC7, $ACC7, $ACC7 + vpxor $ACC8, $ACC8, $ACC8 + + vmovdqa 32*3+.LONE(%rip), %ymm14 + vmovdqa 32*7+.LONE(%rip), %ymm15 + + mov $n_digits, $itr + jmp .Lavx2_mul_by1_x4_loop + +.align 32 +.Lavx2_mul_by1_x4_loop: + vmovdqa 32*0($a_ptr), $B + .byte 0x48,0x8d,0xb6,0x20,0,0,0 # lea 32*1($a_ptr), $a_ptr + + vpsllq \$5, $B, $OVERFLOW + vpmuludq %ymm14, $B, $T0 + vpaddq $OVERFLOW, $ACC0, $ACC0 + vpaddq $T0, $ACC3, $ACC3 + .byte 0x67 + vpmuludq $AND_MASK, $B, $T0 + vpand $AND_MASK, $ACC0, $Y + vpaddq $T0, $ACC4, $ACC4 + vpaddq $T0, $ACC5, $ACC5 + vpaddq $T0, $ACC6, $ACC6 + vpsllq \$23, $B, $T0 + + .byte 0x67,0x67 + vpmuludq %ymm15, $B, $OVERFLOW + vpsubq $T0, $ACC6, $ACC6 + + vpmuludq $AND_MASK, $Y, $T0 + vpaddq $OVERFLOW, $ACC7, $ACC7 + vpaddq $T0, $ACC0, $OVERFLOW + vpaddq $T0, $ACC1, $ACC0 + .byte 0x67,0x67 + vpsrlq $digit_size, $OVERFLOW, $OVERFLOW + vpaddq $T0, $ACC2, $ACC1 + vpmuludq 32*3(%rax), $Y, $T0 + vpaddq $OVERFLOW, $ACC0, $ACC0 + vpaddq $T0, $ACC3, $ACC2 + vmovdqa $ACC4, $ACC3 + vpsllq \$18, $Y, $OVERFLOW + vmovdqa $ACC5, $ACC4 + vpmuludq 32*7(%rax), $Y, $T0 + vpaddq $OVERFLOW, $ACC6, $ACC5 + vpaddq $T0, $ACC7, $ACC6 + vpmuludq 32*8(%rax), $Y, $ACC7 + + dec $itr + jnz .Lavx2_mul_by1_x4_loop + + ret +.size avx2_mul_by1_x4,.-avx2_mul_by1_x4 + +################################################################################ +# void avx2_sqr_x4(void* RESULTx4, void *Ax4, void *Bx4); +.type avx2_sqr_x4,\@abi-omnipotent +.align 32 +avx2_sqr_x4: + lea .LAVX2_POLY(%rip), %rax + + vmovdqa 32*7(%rax), %ymm14 + vmovdqa 32*8(%rax), %ymm15 + + vmovdqa 32*0($a_ptr), $B + vmovdqa 32*1($a_ptr), $ACC1 + vmovdqa 32*2($a_ptr), $ACC2 + vmovdqa 32*3($a_ptr), $ACC3 + vmovdqa 32*4($a_ptr), $ACC4 + vmovdqa 32*5($a_ptr), $ACC5 + vmovdqa 32*6($a_ptr), $ACC6 + vmovdqa 32*7($a_ptr), $ACC7 + vpaddq $ACC1, $ACC1, $ACC1 # 2*$ACC0..7 + vmovdqa 32*8($a_ptr), $ACC8 + vpaddq $ACC2, $ACC2, $ACC2 + vmovdqa $ACC1, 32*0(%rcx) + vpaddq $ACC3, $ACC3, $ACC3 + vmovdqa $ACC2, 32*1(%rcx) + vpaddq $ACC4, $ACC4, $ACC4 + vmovdqa $ACC3, 32*2(%rcx) + vpaddq $ACC5, $ACC5, $ACC5 + vmovdqa $ACC4, 32*3(%rcx) + vpaddq $ACC6, $ACC6, $ACC6 + vmovdqa $ACC5, 32*4(%rcx) + vpaddq $ACC7, $ACC7, $ACC7 + vmovdqa $ACC6, 32*5(%rcx) + vpaddq $ACC8, $ACC8, $ACC8 + vmovdqa $ACC7, 32*6(%rcx) + vmovdqa $ACC8, 32*7(%rcx) + + #itr 1 + vpmuludq $B, $B, $ACC0 + vpmuludq $B, $ACC1, $ACC1 + vpand $AND_MASK, $ACC0, $Y + vpmuludq $B, $ACC2, $ACC2 + vpmuludq $B, $ACC3, $ACC3 + vpmuludq $B, $ACC4, $ACC4 + vpmuludq $B, $ACC5, $ACC5 + vpmuludq $B, $ACC6, $ACC6 + vpmuludq $AND_MASK, $Y, $T0 + vpmuludq $B, $ACC7, $ACC7 + vpmuludq $B, $ACC8, $ACC8 + vmovdqa 32*1($a_ptr), $B + + vpaddq $T0, $ACC0, $OVERFLOW + vpaddq $T0, $ACC1, $ACC0 + vpsrlq $digit_size, $OVERFLOW, $OVERFLOW + vpaddq $T0, $ACC2, $ACC1 + vpmuludq 32*3(%rax), $Y, $T0 + vpaddq $OVERFLOW, $ACC0, $ACC0 + vpaddq $T0, $ACC3, $ACC2 + vmovdqa $ACC4, $ACC3 + vpsllq \$18, $Y, $T0 + vmovdqa $ACC5, $ACC4 + vpmuludq %ymm14, $Y, $OVERFLOW + vpaddq $T0, $ACC6, $ACC5 + vpmuludq %ymm15, $Y, $T0 + vpaddq $OVERFLOW, $ACC7, $ACC6 + vpaddq $T0, $ACC8, $ACC7 + + #itr 2 + vpmuludq $B, $B, $OVERFLOW + vpand $AND_MASK, $ACC0, $Y + vpmuludq 32*1(%rcx), $B, $T0 + vpaddq $OVERFLOW, $ACC1, $ACC1 + vpmuludq 32*2(%rcx), $B, $OVERFLOW + vpaddq $T0, $ACC2, $ACC2 + vpmuludq 32*3(%rcx), $B, $T0 + vpaddq $OVERFLOW, $ACC3, $ACC3 + vpmuludq 32*4(%rcx), $B, $OVERFLOW + vpaddq $T0, $ACC4, $ACC4 + vpmuludq 32*5(%rcx), $B, $T0 + vpaddq $OVERFLOW, $ACC5, $ACC5 + vpmuludq 32*6(%rcx), $B, $OVERFLOW + vpaddq $T0, $ACC6, $ACC6 + + vpmuludq $AND_MASK, $Y, $T0 + vpaddq $OVERFLOW, $ACC7, $ACC7 + vpmuludq 32*7(%rcx), $B, $ACC8 + vmovdqa 32*2($a_ptr), $B + vpaddq $T0, $ACC0, $OVERFLOW + vpaddq $T0, $ACC1, $ACC0 + vpsrlq $digit_size, $OVERFLOW, $OVERFLOW + vpaddq $T0, $ACC2, $ACC1 + vpmuludq 32*3(%rax), $Y, $T0 + vpaddq $OVERFLOW, $ACC0, $ACC0 + vpaddq $T0, $ACC3, $ACC2 + vmovdqa $ACC4, $ACC3 + vpsllq \$18, $Y, $T0 + vmovdqa $ACC5, $ACC4 + vpmuludq %ymm14, $Y, $OVERFLOW + vpaddq $T0, $ACC6, $ACC5 + vpmuludq %ymm15, $Y, $T0 + vpaddq $OVERFLOW, $ACC7, $ACC6 + vpaddq $T0, $ACC8, $ACC7 + + #itr 3 + vpmuludq $B, $B, $T0 + vpand $AND_MASK, $ACC0, $Y + vpmuludq 32*2(%rcx), $B, $OVERFLOW + vpaddq $T0, $ACC2, $ACC2 + vpmuludq 32*3(%rcx), $B, $T0 + vpaddq $OVERFLOW, $ACC3, $ACC3 + vpmuludq 32*4(%rcx), $B, $OVERFLOW + vpaddq $T0, $ACC4, $ACC4 + vpmuludq 32*5(%rcx), $B, $T0 + vpaddq $OVERFLOW, $ACC5, $ACC5 + vpmuludq 32*6(%rcx), $B, $OVERFLOW + vpaddq $T0, $ACC6, $ACC6 + + vpmuludq $AND_MASK, $Y, $T0 + vpaddq $OVERFLOW, $ACC7, $ACC7 + vpmuludq 32*7(%rcx), $B, $ACC8 + vmovdqa 32*3($a_ptr), $B + vpaddq $T0, $ACC0, $OVERFLOW + vpaddq $T0, $ACC1, $ACC0 + vpsrlq $digit_size, $OVERFLOW, $OVERFLOW + vpaddq $T0, $ACC2, $ACC1 + vpmuludq 32*3(%rax), $Y, $T0 + vpaddq $OVERFLOW, $ACC0, $ACC0 + vpaddq $T0, $ACC3, $ACC2 + vmovdqa $ACC4, $ACC3 + vpsllq \$18, $Y, $T0 + vmovdqa $ACC5, $ACC4 + vpmuludq %ymm14, $Y, $OVERFLOW + vpaddq $T0, $ACC6, $ACC5 + vpmuludq %ymm15, $Y, $T0 + vpand $AND_MASK, $ACC0, $Y + vpaddq $OVERFLOW, $ACC7, $ACC6 + vpaddq $T0, $ACC8, $ACC7 + + #itr 4 + vpmuludq $B, $B, $OVERFLOW + vpmuludq 32*3(%rcx), $B, $T0 + vpaddq $OVERFLOW, $ACC3, $ACC3 + vpmuludq 32*4(%rcx), $B, $OVERFLOW + vpaddq $T0, $ACC4, $ACC4 + vpmuludq 32*5(%rcx), $B, $T0 + vpaddq $OVERFLOW, $ACC5, $ACC5 + vpmuludq 32*6(%rcx), $B, $OVERFLOW + vpaddq $T0, $ACC6, $ACC6 + + vpmuludq $AND_MASK, $Y, $T0 + vpaddq $OVERFLOW, $ACC7, $ACC7 + vpmuludq 32*7(%rcx), $B, $ACC8 + vmovdqa 32*4($a_ptr), $B + vpaddq $T0, $ACC0, $OVERFLOW + vpaddq $T0, $ACC1, $ACC0 + vpsrlq $digit_size, $OVERFLOW, $OVERFLOW + vpaddq $T0, $ACC2, $ACC1 + vpmuludq 32*3(%rax), $Y, $T0 + vpaddq $OVERFLOW, $ACC0, $ACC0 + vpaddq $T0, $ACC3, $ACC2 + vmovdqa $ACC4, $ACC3 + vpsllq \$18, $Y, $T0 + vmovdqa $ACC5, $ACC4 + vpmuludq %ymm14, $Y, $OVERFLOW + vpaddq $T0, $ACC6, $ACC5 + vpmuludq %ymm15, $Y, $T0 + vpand $AND_MASK, $ACC0, $Y + vpaddq $OVERFLOW, $ACC7, $ACC6 + vpaddq $T0, $ACC8, $ACC7 + + #itr 5 + vpmuludq $B, $B, $T0 + vpmuludq 32*4(%rcx), $B, $OVERFLOW + vpaddq $T0, $ACC4, $ACC4 + vpmuludq 32*5(%rcx), $B, $T0 + vpaddq $OVERFLOW, $ACC5, $ACC5 + vpmuludq 32*6(%rcx), $B, $OVERFLOW + vpaddq $T0, $ACC6, $ACC6 + + vpmuludq $AND_MASK, $Y, $T0 + vpaddq $OVERFLOW, $ACC7, $ACC7 + vpmuludq 32*7(%rcx), $B, $ACC8 + vmovdqa 32*5($a_ptr), $B + vpaddq $T0, $ACC0, $OVERFLOW + vpsrlq $digit_size, $OVERFLOW, $OVERFLOW + vpaddq $T0, $ACC1, $ACC0 + vpaddq $T0, $ACC2, $ACC1 + vpmuludq 32*3+.LAVX2_POLY(%rip), $Y, $T0 + vpaddq $OVERFLOW, $ACC0, $ACC0 + vpaddq $T0, $ACC3, $ACC2 + vmovdqa $ACC4, $ACC3 + vpsllq \$18, $Y, $T0 + vmovdqa $ACC5, $ACC4 + vpmuludq %ymm14, $Y, $OVERFLOW + vpaddq $T0, $ACC6, $ACC5 + vpmuludq %ymm15, $Y, $T0 + vpand $AND_MASK, $ACC0, $Y + vpaddq $OVERFLOW, $ACC7, $ACC6 + vpaddq $T0, $ACC8, $ACC7 + + #itr 6 + vpmuludq $B, $B, $OVERFLOW + vpmuludq 32*5(%rcx), $B, $T0 + vpaddq $OVERFLOW, $ACC5, $ACC5 + vpmuludq 32*6(%rcx), $B, $OVERFLOW + vpaddq $T0, $ACC6, $ACC6 + + vpmuludq $AND_MASK, $Y, $T0 + vpaddq $OVERFLOW, $ACC7, $ACC7 + vpmuludq 32*7(%rcx), $B, $ACC8 + vmovdqa 32*6($a_ptr), $B + vpaddq $T0, $ACC0, $OVERFLOW + vpaddq $T0, $ACC1, $ACC0 + vpsrlq $digit_size, $OVERFLOW, $OVERFLOW + vpaddq $T0, $ACC2, $ACC1 + vpmuludq 32*3(%rax), $Y, $T0 + vpaddq $OVERFLOW, $ACC0, $ACC0 + vpaddq $T0, $ACC3, $ACC2 + vmovdqa $ACC4, $ACC3 + vpsllq \$18, $Y, $T0 + vmovdqa $ACC5, $ACC4 + vpmuludq %ymm14, $Y, $OVERFLOW + vpaddq $T0, $ACC6, $ACC5 + vpmuludq %ymm15, $Y, $T0 + vpand $AND_MASK, $ACC0, $Y + vpaddq $OVERFLOW, $ACC7, $ACC6 + vpaddq $T0, $ACC8, $ACC7 + + #itr 7 + vpmuludq $B, $B, $T0 + vpmuludq 32*6(%rcx), $B, $OVERFLOW + vpaddq $T0, $ACC6, $ACC6 + + vpmuludq $AND_MASK, $Y, $T0 + vpaddq $OVERFLOW, $ACC7, $ACC7 + vpmuludq 32*7(%rcx), $B, $ACC8 + vmovdqa 32*7($a_ptr), $B + vpaddq $T0, $ACC0, $OVERFLOW + vpsrlq $digit_size, $OVERFLOW, $OVERFLOW + vpaddq $T0, $ACC1, $ACC0 + vpaddq $T0, $ACC2, $ACC1 + vpmuludq 32*3(%rax), $Y, $T0 + vpaddq $OVERFLOW, $ACC0, $ACC0 + vpaddq $T0, $ACC3, $ACC2 + vmovdqa $ACC4, $ACC3 + vpsllq \$18, $Y, $T0 + vmovdqa $ACC5, $ACC4 + vpmuludq %ymm14, $Y, $OVERFLOW + vpaddq $T0, $ACC6, $ACC5 + vpmuludq %ymm15, $Y, $T0 + vpand $AND_MASK, $ACC0, $Y + vpaddq $OVERFLOW, $ACC7, $ACC6 + vpaddq $T0, $ACC8, $ACC7 + + #itr 8 + vpmuludq $B, $B, $OVERFLOW + + vpmuludq $AND_MASK, $Y, $T0 + vpaddq $OVERFLOW, $ACC7, $ACC7 + vpmuludq 32*7(%rcx), $B, $ACC8 + vmovdqa 32*8($a_ptr), $B + vpaddq $T0, $ACC0, $OVERFLOW + vpsrlq $digit_size, $OVERFLOW, $OVERFLOW + vpaddq $T0, $ACC1, $ACC0 + vpaddq $T0, $ACC2, $ACC1 + vpmuludq 32*3(%rax), $Y, $T0 + vpaddq $OVERFLOW, $ACC0, $ACC0 + vpaddq $T0, $ACC3, $ACC2 + vmovdqa $ACC4, $ACC3 + vpsllq \$18, $Y, $T0 + vmovdqa $ACC5, $ACC4 + vpmuludq %ymm14, $Y, $OVERFLOW + vpaddq $T0, $ACC6, $ACC5 + vpmuludq %ymm15, $Y, $T0 + vpand $AND_MASK, $ACC0, $Y + vpaddq $OVERFLOW, $ACC7, $ACC6 + vpaddq $T0, $ACC8, $ACC7 + + #itr 9 + vpmuludq $B, $B, $ACC8 + + vpmuludq $AND_MASK, $Y, $T0 + vpaddq $T0, $ACC0, $OVERFLOW + vpsrlq $digit_size, $OVERFLOW, $OVERFLOW + vpaddq $T0, $ACC1, $ACC0 + vpaddq $T0, $ACC2, $ACC1 + vpmuludq 32*3(%rax), $Y, $T0 + vpaddq $OVERFLOW, $ACC0, $ACC0 + vpaddq $T0, $ACC3, $ACC2 + vmovdqa $ACC4, $ACC3 + vpsllq \$18, $Y, $T0 + vmovdqa $ACC5, $ACC4 + vpmuludq %ymm14, $Y, $OVERFLOW + vpaddq $T0, $ACC6, $ACC5 + vpmuludq %ymm15, $Y, $T0 + vpaddq $OVERFLOW, $ACC7, $ACC6 + vpaddq $T0, $ACC8, $ACC7 + + vpxor $ACC8, $ACC8, $ACC8 + + ret +.size avx2_sqr_x4,.-avx2_sqr_x4 + +################################################################################ +# void avx2_sub_x4(void* RESULTx4, void *Ax4, void *Bx4); +.type avx2_sub_x4,\@abi-omnipotent +.align 32 +avx2_sub_x4: + vmovdqa 32*0($a_ptr), $ACC0 + lea 160($a_ptr), $a_ptr + lea .LAVX2_POLY_x8+128(%rip), %rax + lea 128($b_ptr), $b_ptr + vmovdqa 32*1-160($a_ptr), $ACC1 + vmovdqa 32*2-160($a_ptr), $ACC2 + vmovdqa 32*3-160($a_ptr), $ACC3 + vmovdqa 32*4-160($a_ptr), $ACC4 + vmovdqa 32*5-160($a_ptr), $ACC5 + vmovdqa 32*6-160($a_ptr), $ACC6 + vmovdqa 32*7-160($a_ptr), $ACC7 + vmovdqa 32*8-160($a_ptr), $ACC8 + + vpaddq 32*0-128(%rax), $ACC0, $ACC0 + vpaddq 32*1-128(%rax), $ACC1, $ACC1 + vpaddq 32*2-128(%rax), $ACC2, $ACC2 + vpaddq 32*3-128(%rax), $ACC3, $ACC3 + vpaddq 32*4-128(%rax), $ACC4, $ACC4 + vpaddq 32*5-128(%rax), $ACC5, $ACC5 + vpaddq 32*6-128(%rax), $ACC6, $ACC6 + vpaddq 32*7-128(%rax), $ACC7, $ACC7 + vpaddq 32*8-128(%rax), $ACC8, $ACC8 + + vpsubq 32*0-128($b_ptr), $ACC0, $ACC0 + vpsubq 32*1-128($b_ptr), $ACC1, $ACC1 + vpsubq 32*2-128($b_ptr), $ACC2, $ACC2 + vpsubq 32*3-128($b_ptr), $ACC3, $ACC3 + vpsubq 32*4-128($b_ptr), $ACC4, $ACC4 + vpsubq 32*5-128($b_ptr), $ACC5, $ACC5 + vpsubq 32*6-128($b_ptr), $ACC6, $ACC6 + vpsubq 32*7-128($b_ptr), $ACC7, $ACC7 + vpsubq 32*8-128($b_ptr), $ACC8, $ACC8 + + ret +.size avx2_sub_x4,.-avx2_sub_x4 + +.type avx2_select_n_store,\@abi-omnipotent +.align 32 +avx2_select_n_store: + vmovdqa `8+32*9*8`(%rsp), $Y + vpor `8+32*9*8+32`(%rsp), $Y, $Y + + vpandn $ACC0, $Y, $ACC0 + vpandn $ACC1, $Y, $ACC1 + vpandn $ACC2, $Y, $ACC2 + vpandn $ACC3, $Y, $ACC3 + vpandn $ACC4, $Y, $ACC4 + vpandn $ACC5, $Y, $ACC5 + vpandn $ACC6, $Y, $ACC6 + vmovdqa `8+32*9*8+32`(%rsp), $B + vpandn $ACC7, $Y, $ACC7 + vpandn `8+32*9*8`(%rsp), $B, $B + vpandn $ACC8, $Y, $ACC8 + + vpand 32*0(%rsi), $B, $T0 + lea 160(%rsi), %rax + vpand 32*1(%rsi), $B, $Y + vpxor $T0, $ACC0, $ACC0 + vpand 32*2(%rsi), $B, $T0 + vpxor $Y, $ACC1, $ACC1 + vpand 32*3(%rsi), $B, $Y + vpxor $T0, $ACC2, $ACC2 + vpand 32*4-160(%rax), $B, $T0 + vpxor $Y, $ACC3, $ACC3 + vpand 32*5-160(%rax), $B, $Y + vpxor $T0, $ACC4, $ACC4 + vpand 32*6-160(%rax), $B, $T0 + vpxor $Y, $ACC5, $ACC5 + vpand 32*7-160(%rax), $B, $Y + vpxor $T0, $ACC6, $ACC6 + vpand 32*8-160(%rax), $B, $T0 + vmovdqa `8+32*9*8+32`(%rsp), $B + vpxor $Y, $ACC7, $ACC7 + + vpand 32*0(%rdx), $B, $Y + lea 160(%rdx), %rax + vpxor $T0, $ACC8, $ACC8 + vpand 32*1(%rdx), $B, $T0 + vpxor $Y, $ACC0, $ACC0 + vpand 32*2(%rdx), $B, $Y + vpxor $T0, $ACC1, $ACC1 + vpand 32*3(%rdx), $B, $T0 + vpxor $Y, $ACC2, $ACC2 + vpand 32*4-160(%rax), $B, $Y + vpxor $T0, $ACC3, $ACC3 + vpand 32*5-160(%rax), $B, $T0 + vpxor $Y, $ACC4, $ACC4 + vpand 32*6-160(%rax), $B, $Y + vpxor $T0, $ACC5, $ACC5 + vpand 32*7-160(%rax), $B, $T0 + vpxor $Y, $ACC6, $ACC6 + vpand 32*8-160(%rax), $B, $Y + vpxor $T0, $ACC7, $ACC7 + vpxor $Y, $ACC8, $ACC8 + `&STORE` + + ret +.size avx2_select_n_store,.-avx2_select_n_store +___ +$code.=<<___ if (0); # inlined +################################################################################ +# void avx2_mul_by2_x4(void* RESULTx4, void *Ax4); +.type avx2_mul_by2_x4,\@abi-omnipotent +.align 32 +avx2_mul_by2_x4: + vmovdqa 32*0($a_ptr), $ACC0 + lea 160($a_ptr), %rax + vmovdqa 32*1($a_ptr), $ACC1 + vmovdqa 32*2($a_ptr), $ACC2 + vmovdqa 32*3($a_ptr), $ACC3 + vmovdqa 32*4-160(%rax), $ACC4 + vmovdqa 32*5-160(%rax), $ACC5 + vmovdqa 32*6-160(%rax), $ACC6 + vmovdqa 32*7-160(%rax), $ACC7 + vmovdqa 32*8-160(%rax), $ACC8 + + vpaddq $ACC0, $ACC0, $ACC0 + vpaddq $ACC1, $ACC1, $ACC1 + vpaddq $ACC2, $ACC2, $ACC2 + vpaddq $ACC3, $ACC3, $ACC3 + vpaddq $ACC4, $ACC4, $ACC4 + vpaddq $ACC5, $ACC5, $ACC5 + vpaddq $ACC6, $ACC6, $ACC6 + vpaddq $ACC7, $ACC7, $ACC7 + vpaddq $ACC8, $ACC8, $ACC8 + + ret +.size avx2_mul_by2_x4,.-avx2_mul_by2_x4 +___ +my ($r_ptr_in,$a_ptr_in,$b_ptr_in)=("%rdi","%rsi","%rdx"); +my ($r_ptr,$a_ptr,$b_ptr)=("%r8","%r9","%r10"); + +$code.=<<___; +################################################################################ +# void ecp_nistz256_avx2_point_add_affine_x4(void* RESULTx4, void *Ax4, void *Bx4); +.globl ecp_nistz256_avx2_point_add_affine_x4 +.type ecp_nistz256_avx2_point_add_affine_x4,\@function,3 +.align 32 +ecp_nistz256_avx2_point_add_affine_x4: + mov %rsp, %rax + push %rbp + vzeroupper +___ +$code.=<<___ if ($win64); + lea -16*10(%rsp), %rsp + vmovaps %xmm6, -8-16*10(%rax) + vmovaps %xmm7, -8-16*9(%rax) + vmovaps %xmm8, -8-16*8(%rax) + vmovaps %xmm9, -8-16*7(%rax) + vmovaps %xmm10, -8-16*6(%rax) + vmovaps %xmm11, -8-16*5(%rax) + vmovaps %xmm12, -8-16*4(%rax) + vmovaps %xmm13, -8-16*3(%rax) + vmovaps %xmm14, -8-16*2(%rax) + vmovaps %xmm15, -8-16*1(%rax) +___ +$code.=<<___; + lea -8(%rax), %rbp + +# Result + 32*0 = Result.X +# Result + 32*9 = Result.Y +# Result + 32*18 = Result.Z + +# A + 32*0 = A.X +# A + 32*9 = A.Y +# A + 32*18 = A.Z + +# B + 32*0 = B.X +# B + 32*9 = B.Y + + sub \$`32*9*8+32*2+32*8`, %rsp + and \$-64, %rsp + + mov $r_ptr_in, $r_ptr + mov $a_ptr_in, $a_ptr + mov $b_ptr_in, $b_ptr + + vmovdqa 32*0($a_ptr_in), %ymm0 + vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK + vpxor %ymm1, %ymm1, %ymm1 + lea 256($a_ptr_in), %rax # size optimization + vpor 32*1($a_ptr_in), %ymm0, %ymm0 + vpor 32*2($a_ptr_in), %ymm0, %ymm0 + vpor 32*3($a_ptr_in), %ymm0, %ymm0 + vpor 32*4-256(%rax), %ymm0, %ymm0 + lea 256(%rax), %rcx # size optimization + vpor 32*5-256(%rax), %ymm0, %ymm0 + vpor 32*6-256(%rax), %ymm0, %ymm0 + vpor 32*7-256(%rax), %ymm0, %ymm0 + vpor 32*8-256(%rax), %ymm0, %ymm0 + vpor 32*9-256(%rax), %ymm0, %ymm0 + vpor 32*10-256(%rax), %ymm0, %ymm0 + vpor 32*11-256(%rax), %ymm0, %ymm0 + vpor 32*12-512(%rcx), %ymm0, %ymm0 + vpor 32*13-512(%rcx), %ymm0, %ymm0 + vpor 32*14-512(%rcx), %ymm0, %ymm0 + vpor 32*15-512(%rcx), %ymm0, %ymm0 + vpor 32*16-512(%rcx), %ymm0, %ymm0 + vpor 32*17-512(%rcx), %ymm0, %ymm0 + vpcmpeqq %ymm1, %ymm0, %ymm0 + vmovdqa %ymm0, `32*9*8`(%rsp) + + vpxor %ymm1, %ymm1, %ymm1 + vmovdqa 32*0($b_ptr), %ymm0 + lea 256($b_ptr), %rax # size optimization + vpor 32*1($b_ptr), %ymm0, %ymm0 + vpor 32*2($b_ptr), %ymm0, %ymm0 + vpor 32*3($b_ptr), %ymm0, %ymm0 + vpor 32*4-256(%rax), %ymm0, %ymm0 + lea 256(%rax), %rcx # size optimization + vpor 32*5-256(%rax), %ymm0, %ymm0 + vpor 32*6-256(%rax), %ymm0, %ymm0 + vpor 32*7-256(%rax), %ymm0, %ymm0 + vpor 32*8-256(%rax), %ymm0, %ymm0 + vpor 32*9-256(%rax), %ymm0, %ymm0 + vpor 32*10-256(%rax), %ymm0, %ymm0 + vpor 32*11-256(%rax), %ymm0, %ymm0 + vpor 32*12-512(%rcx), %ymm0, %ymm0 + vpor 32*13-512(%rcx), %ymm0, %ymm0 + vpor 32*14-512(%rcx), %ymm0, %ymm0 + vpor 32*15-512(%rcx), %ymm0, %ymm0 + vpor 32*16-512(%rcx), %ymm0, %ymm0 + vpor 32*17-512(%rcx), %ymm0, %ymm0 + vpcmpeqq %ymm1, %ymm0, %ymm0 + vmovdqa %ymm0, `32*9*8+32`(%rsp) + + # Z1^2 = Z1*Z1 + lea `32*9*2`($a_ptr), %rsi + lea `32*9*2`(%rsp), %rdi + lea `32*9*8+32*2`(%rsp), %rcx # temporary vector + call avx2_sqr_x4 + call avx2_normalize_n_store + + # U2 = X2*Z1^2 + lea `32*9*0`($b_ptr), %rsi + lea `32*9*2`(%rsp), %rdx + lea `32*9*0`(%rsp), %rdi + call avx2_mul_x4 + #call avx2_normalize + `&STORE` + + # S2 = Z1*Z1^2 = Z1^3 + lea `32*9*2`($a_ptr), %rsi + lea `32*9*2`(%rsp), %rdx + lea `32*9*1`(%rsp), %rdi + call avx2_mul_x4 + call avx2_normalize_n_store + + # S2 = S2*Y2 = Y2*Z1^3 + lea `32*9*1`($b_ptr), %rsi + lea `32*9*1`(%rsp), %rdx + lea `32*9*1`(%rsp), %rdi + call avx2_mul_x4 + call avx2_normalize_n_store + + # H = U2 - U1 = U2 - X1 + lea `32*9*0`(%rsp), %rsi + lea `32*9*0`($a_ptr), %rdx + lea `32*9*3`(%rsp), %rdi + call avx2_sub_x4 + call avx2_normalize_n_store + + # R = S2 - S1 = S2 - Y1 + lea `32*9*1`(%rsp), %rsi + lea `32*9*1`($a_ptr), %rdx + lea `32*9*4`(%rsp), %rdi + call avx2_sub_x4 + call avx2_normalize_n_store + + # Z3 = H*Z1*Z2 + lea `32*9*3`(%rsp), %rsi + lea `32*9*2`($a_ptr), %rdx + lea `32*9*2`($r_ptr), %rdi + call avx2_mul_x4 + call avx2_normalize + + lea .LONE(%rip), %rsi + lea `32*9*2`($a_ptr), %rdx + call avx2_select_n_store + + # R^2 = R^2 + lea `32*9*4`(%rsp), %rsi + lea `32*9*6`(%rsp), %rdi + lea `32*9*8+32*2`(%rsp), %rcx # temporary vector + call avx2_sqr_x4 + call avx2_normalize_n_store + + # H^2 = H^2 + lea `32*9*3`(%rsp), %rsi + lea `32*9*5`(%rsp), %rdi + call avx2_sqr_x4 + call avx2_normalize_n_store + + # H^3 = H^2*H + lea `32*9*3`(%rsp), %rsi + lea `32*9*5`(%rsp), %rdx + lea `32*9*7`(%rsp), %rdi + call avx2_mul_x4 + call avx2_normalize_n_store + + # U2 = U1*H^2 + lea `32*9*0`($a_ptr), %rsi + lea `32*9*5`(%rsp), %rdx + lea `32*9*0`(%rsp), %rdi + call avx2_mul_x4 + #call avx2_normalize + `&STORE` + + # Hsqr = U2*2 + #lea 32*9*0(%rsp), %rsi + #lea 32*9*5(%rsp), %rdi + #call avx2_mul_by2_x4 + + vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4 + lea `32*9*5`(%rsp), %rdi + vpaddq $ACC1, $ACC1, $ACC1 + vpaddq $ACC2, $ACC2, $ACC2 + vpaddq $ACC3, $ACC3, $ACC3 + vpaddq $ACC4, $ACC4, $ACC4 + vpaddq $ACC5, $ACC5, $ACC5 + vpaddq $ACC6, $ACC6, $ACC6 + vpaddq $ACC7, $ACC7, $ACC7 + vpaddq $ACC8, $ACC8, $ACC8 + call avx2_normalize_n_store + + # X3 = R^2 - H^3 + #lea 32*9*6(%rsp), %rsi + #lea 32*9*7(%rsp), %rdx + #lea 32*9*5(%rsp), %rcx + #lea 32*9*0($r_ptr), %rdi + #call avx2_sub_x4 + #NORMALIZE + #STORE + + # X3 = X3 - U2*2 + #lea 32*9*0($r_ptr), %rsi + #lea 32*9*0($r_ptr), %rdi + #call avx2_sub_x4 + #NORMALIZE + #STORE + + lea `32*9*6+128`(%rsp), %rsi + lea .LAVX2_POLY_x2+128(%rip), %rax + lea `32*9*7+128`(%rsp), %rdx + lea `32*9*5+128`(%rsp), %rcx + lea `32*9*0`($r_ptr), %rdi + + vmovdqa 32*0-128(%rsi), $ACC0 + vmovdqa 32*1-128(%rsi), $ACC1 + vmovdqa 32*2-128(%rsi), $ACC2 + vmovdqa 32*3-128(%rsi), $ACC3 + vmovdqa 32*4-128(%rsi), $ACC4 + vmovdqa 32*5-128(%rsi), $ACC5 + vmovdqa 32*6-128(%rsi), $ACC6 + vmovdqa 32*7-128(%rsi), $ACC7 + vmovdqa 32*8-128(%rsi), $ACC8 + + vpaddq 32*0-128(%rax), $ACC0, $ACC0 + vpaddq 32*1-128(%rax), $ACC1, $ACC1 + vpaddq 32*2-128(%rax), $ACC2, $ACC2 + vpaddq 32*3-128(%rax), $ACC3, $ACC3 + vpaddq 32*4-128(%rax), $ACC4, $ACC4 + vpaddq 32*5-128(%rax), $ACC5, $ACC5 + vpaddq 32*6-128(%rax), $ACC6, $ACC6 + vpaddq 32*7-128(%rax), $ACC7, $ACC7 + vpaddq 32*8-128(%rax), $ACC8, $ACC8 + + vpsubq 32*0-128(%rdx), $ACC0, $ACC0 + vpsubq 32*1-128(%rdx), $ACC1, $ACC1 + vpsubq 32*2-128(%rdx), $ACC2, $ACC2 + vpsubq 32*3-128(%rdx), $ACC3, $ACC3 + vpsubq 32*4-128(%rdx), $ACC4, $ACC4 + vpsubq 32*5-128(%rdx), $ACC5, $ACC5 + vpsubq 32*6-128(%rdx), $ACC6, $ACC6 + vpsubq 32*7-128(%rdx), $ACC7, $ACC7 + vpsubq 32*8-128(%rdx), $ACC8, $ACC8 + + vpsubq 32*0-128(%rcx), $ACC0, $ACC0 + vpsubq 32*1-128(%rcx), $ACC1, $ACC1 + vpsubq 32*2-128(%rcx), $ACC2, $ACC2 + vpsubq 32*3-128(%rcx), $ACC3, $ACC3 + vpsubq 32*4-128(%rcx), $ACC4, $ACC4 + vpsubq 32*5-128(%rcx), $ACC5, $ACC5 + vpsubq 32*6-128(%rcx), $ACC6, $ACC6 + vpsubq 32*7-128(%rcx), $ACC7, $ACC7 + vpsubq 32*8-128(%rcx), $ACC8, $ACC8 + call avx2_normalize + + lea 32*0($b_ptr), %rsi + lea 32*0($a_ptr), %rdx + call avx2_select_n_store + + # H = U2 - X3 + lea `32*9*0`(%rsp), %rsi + lea `32*9*0`($r_ptr), %rdx + lea `32*9*3`(%rsp), %rdi + call avx2_sub_x4 + call avx2_normalize_n_store + + # + lea `32*9*3`(%rsp), %rsi + lea `32*9*4`(%rsp), %rdx + lea `32*9*3`(%rsp), %rdi + call avx2_mul_x4 + call avx2_normalize_n_store + + # + lea `32*9*7`(%rsp), %rsi + lea `32*9*1`($a_ptr), %rdx + lea `32*9*1`(%rsp), %rdi + call avx2_mul_x4 + call avx2_normalize_n_store + + # + lea `32*9*3`(%rsp), %rsi + lea `32*9*1`(%rsp), %rdx + lea `32*9*1`($r_ptr), %rdi + call avx2_sub_x4 + call avx2_normalize + + lea 32*9($b_ptr), %rsi + lea 32*9($a_ptr), %rdx + call avx2_select_n_store + + #lea 32*9*0($r_ptr), %rsi + #lea 32*9*0($r_ptr), %rdi + #call avx2_mul_by1_x4 + #NORMALIZE + #STORE + + lea `32*9*1`($r_ptr), %rsi + lea `32*9*1`($r_ptr), %rdi + call avx2_mul_by1_x4 + call avx2_normalize_n_store + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps %xmm6, -16*10(%rbp) + movaps %xmm7, -16*9(%rbp) + movaps %xmm8, -16*8(%rbp) + movaps %xmm9, -16*7(%rbp) + movaps %xmm10, -16*6(%rbp) + movaps %xmm11, -16*5(%rbp) + movaps %xmm12, -16*4(%rbp) + movaps %xmm13, -16*3(%rbp) + movaps %xmm14, -16*2(%rbp) + movaps %xmm15, -16*1(%rbp) +___ +$code.=<<___; + mov %rbp, %rsp + pop %rbp + ret +.size ecp_nistz256_avx2_point_add_affine_x4,.-ecp_nistz256_avx2_point_add_affine_x4 + +################################################################################ +# void ecp_nistz256_avx2_point_add_affines_x4(void* RESULTx4, void *Ax4, void *Bx4); +.globl ecp_nistz256_avx2_point_add_affines_x4 +.type ecp_nistz256_avx2_point_add_affines_x4,\@function,3 +.align 32 +ecp_nistz256_avx2_point_add_affines_x4: + mov %rsp, %rax + push %rbp + vzeroupper +___ +$code.=<<___ if ($win64); + lea -16*10(%rsp), %rsp + vmovaps %xmm6, -8-16*10(%rax) + vmovaps %xmm7, -8-16*9(%rax) + vmovaps %xmm8, -8-16*8(%rax) + vmovaps %xmm9, -8-16*7(%rax) + vmovaps %xmm10, -8-16*6(%rax) + vmovaps %xmm11, -8-16*5(%rax) + vmovaps %xmm12, -8-16*4(%rax) + vmovaps %xmm13, -8-16*3(%rax) + vmovaps %xmm14, -8-16*2(%rax) + vmovaps %xmm15, -8-16*1(%rax) +___ +$code.=<<___; + lea -8(%rax), %rbp + +# Result + 32*0 = Result.X +# Result + 32*9 = Result.Y +# Result + 32*18 = Result.Z + +# A + 32*0 = A.X +# A + 32*9 = A.Y + +# B + 32*0 = B.X +# B + 32*9 = B.Y + + sub \$`32*9*8+32*2+32*8`, %rsp + and \$-64, %rsp + + mov $r_ptr_in, $r_ptr + mov $a_ptr_in, $a_ptr + mov $b_ptr_in, $b_ptr + + vmovdqa 32*0($a_ptr_in), %ymm0 + vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK + vpxor %ymm1, %ymm1, %ymm1 + lea 256($a_ptr_in), %rax # size optimization + vpor 32*1($a_ptr_in), %ymm0, %ymm0 + vpor 32*2($a_ptr_in), %ymm0, %ymm0 + vpor 32*3($a_ptr_in), %ymm0, %ymm0 + vpor 32*4-256(%rax), %ymm0, %ymm0 + lea 256(%rax), %rcx # size optimization + vpor 32*5-256(%rax), %ymm0, %ymm0 + vpor 32*6-256(%rax), %ymm0, %ymm0 + vpor 32*7-256(%rax), %ymm0, %ymm0 + vpor 32*8-256(%rax), %ymm0, %ymm0 + vpor 32*9-256(%rax), %ymm0, %ymm0 + vpor 32*10-256(%rax), %ymm0, %ymm0 + vpor 32*11-256(%rax), %ymm0, %ymm0 + vpor 32*12-512(%rcx), %ymm0, %ymm0 + vpor 32*13-512(%rcx), %ymm0, %ymm0 + vpor 32*14-512(%rcx), %ymm0, %ymm0 + vpor 32*15-512(%rcx), %ymm0, %ymm0 + vpor 32*16-512(%rcx), %ymm0, %ymm0 + vpor 32*17-512(%rcx), %ymm0, %ymm0 + vpcmpeqq %ymm1, %ymm0, %ymm0 + vmovdqa %ymm0, `32*9*8`(%rsp) + + vpxor %ymm1, %ymm1, %ymm1 + vmovdqa 32*0($b_ptr), %ymm0 + lea 256($b_ptr), %rax # size optimization + vpor 32*1($b_ptr), %ymm0, %ymm0 + vpor 32*2($b_ptr), %ymm0, %ymm0 + vpor 32*3($b_ptr), %ymm0, %ymm0 + vpor 32*4-256(%rax), %ymm0, %ymm0 + lea 256(%rax), %rcx # size optimization + vpor 32*5-256(%rax), %ymm0, %ymm0 + vpor 32*6-256(%rax), %ymm0, %ymm0 + vpor 32*7-256(%rax), %ymm0, %ymm0 + vpor 32*8-256(%rax), %ymm0, %ymm0 + vpor 32*9-256(%rax), %ymm0, %ymm0 + vpor 32*10-256(%rax), %ymm0, %ymm0 + vpor 32*11-256(%rax), %ymm0, %ymm0 + vpor 32*12-512(%rcx), %ymm0, %ymm0 + vpor 32*13-512(%rcx), %ymm0, %ymm0 + vpor 32*14-512(%rcx), %ymm0, %ymm0 + vpor 32*15-512(%rcx), %ymm0, %ymm0 + vpor 32*16-512(%rcx), %ymm0, %ymm0 + vpor 32*17-512(%rcx), %ymm0, %ymm0 + vpcmpeqq %ymm1, %ymm0, %ymm0 + vmovdqa %ymm0, `32*9*8+32`(%rsp) + + # H = U2 - U1 = X2 - X1 + lea `32*9*0`($b_ptr), %rsi + lea `32*9*0`($a_ptr), %rdx + lea `32*9*3`(%rsp), %rdi + call avx2_sub_x4 + call avx2_normalize_n_store + + # R = S2 - S1 = Y2 - Y1 + lea `32*9*1`($b_ptr), %rsi + lea `32*9*1`($a_ptr), %rdx + lea `32*9*4`(%rsp), %rdi + call avx2_sub_x4 + call avx2_normalize_n_store + + # Z3 = H*Z1*Z2 = H + lea `32*9*3`(%rsp), %rsi + lea `32*9*2`($r_ptr), %rdi + call avx2_mul_by1_x4 + call avx2_normalize + + vmovdqa `32*9*8`(%rsp), $B + vpor `32*9*8+32`(%rsp), $B, $B + + vpandn $ACC0, $B, $ACC0 + lea .LONE+128(%rip), %rax + vpandn $ACC1, $B, $ACC1 + vpandn $ACC2, $B, $ACC2 + vpandn $ACC3, $B, $ACC3 + vpandn $ACC4, $B, $ACC4 + vpandn $ACC5, $B, $ACC5 + vpandn $ACC6, $B, $ACC6 + vpandn $ACC7, $B, $ACC7 + + vpand 32*0-128(%rax), $B, $T0 + vpandn $ACC8, $B, $ACC8 + vpand 32*1-128(%rax), $B, $Y + vpxor $T0, $ACC0, $ACC0 + vpand 32*2-128(%rax), $B, $T0 + vpxor $Y, $ACC1, $ACC1 + vpand 32*3-128(%rax), $B, $Y + vpxor $T0, $ACC2, $ACC2 + vpand 32*4-128(%rax), $B, $T0 + vpxor $Y, $ACC3, $ACC3 + vpand 32*5-128(%rax), $B, $Y + vpxor $T0, $ACC4, $ACC4 + vpand 32*6-128(%rax), $B, $T0 + vpxor $Y, $ACC5, $ACC5 + vpand 32*7-128(%rax), $B, $Y + vpxor $T0, $ACC6, $ACC6 + vpand 32*8-128(%rax), $B, $T0 + vpxor $Y, $ACC7, $ACC7 + vpxor $T0, $ACC8, $ACC8 + `&STORE` + + # R^2 = R^2 + lea `32*9*4`(%rsp), %rsi + lea `32*9*6`(%rsp), %rdi + lea `32*9*8+32*2`(%rsp), %rcx # temporary vector + call avx2_sqr_x4 + call avx2_normalize_n_store + + # H^2 = H^2 + lea `32*9*3`(%rsp), %rsi + lea `32*9*5`(%rsp), %rdi + call avx2_sqr_x4 + call avx2_normalize_n_store + + # H^3 = H^2*H + lea `32*9*3`(%rsp), %rsi + lea `32*9*5`(%rsp), %rdx + lea `32*9*7`(%rsp), %rdi + call avx2_mul_x4 + call avx2_normalize_n_store + + # U2 = U1*H^2 + lea `32*9*0`($a_ptr), %rsi + lea `32*9*5`(%rsp), %rdx + lea `32*9*0`(%rsp), %rdi + call avx2_mul_x4 + #call avx2_normalize + `&STORE` + + # Hsqr = U2*2 + #lea 32*9*0(%rsp), %rsi + #lea 32*9*5(%rsp), %rdi + #call avx2_mul_by2_x4 + + vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4 + lea `32*9*5`(%rsp), %rdi + vpaddq $ACC1, $ACC1, $ACC1 + vpaddq $ACC2, $ACC2, $ACC2 + vpaddq $ACC3, $ACC3, $ACC3 + vpaddq $ACC4, $ACC4, $ACC4 + vpaddq $ACC5, $ACC5, $ACC5 + vpaddq $ACC6, $ACC6, $ACC6 + vpaddq $ACC7, $ACC7, $ACC7 + vpaddq $ACC8, $ACC8, $ACC8 + call avx2_normalize_n_store + + # X3 = R^2 - H^3 + #lea 32*9*6(%rsp), %rsi + #lea 32*9*7(%rsp), %rdx + #lea 32*9*5(%rsp), %rcx + #lea 32*9*0($r_ptr), %rdi + #call avx2_sub_x4 + #NORMALIZE + #STORE + + # X3 = X3 - U2*2 + #lea 32*9*0($r_ptr), %rsi + #lea 32*9*0($r_ptr), %rdi + #call avx2_sub_x4 + #NORMALIZE + #STORE + + lea `32*9*6+128`(%rsp), %rsi + lea .LAVX2_POLY_x2+128(%rip), %rax + lea `32*9*7+128`(%rsp), %rdx + lea `32*9*5+128`(%rsp), %rcx + lea `32*9*0`($r_ptr), %rdi + + vmovdqa 32*0-128(%rsi), $ACC0 + vmovdqa 32*1-128(%rsi), $ACC1 + vmovdqa 32*2-128(%rsi), $ACC2 + vmovdqa 32*3-128(%rsi), $ACC3 + vmovdqa 32*4-128(%rsi), $ACC4 + vmovdqa 32*5-128(%rsi), $ACC5 + vmovdqa 32*6-128(%rsi), $ACC6 + vmovdqa 32*7-128(%rsi), $ACC7 + vmovdqa 32*8-128(%rsi), $ACC8 + + vpaddq 32*0-128(%rax), $ACC0, $ACC0 + vpaddq 32*1-128(%rax), $ACC1, $ACC1 + vpaddq 32*2-128(%rax), $ACC2, $ACC2 + vpaddq 32*3-128(%rax), $ACC3, $ACC3 + vpaddq 32*4-128(%rax), $ACC4, $ACC4 + vpaddq 32*5-128(%rax), $ACC5, $ACC5 + vpaddq 32*6-128(%rax), $ACC6, $ACC6 + vpaddq 32*7-128(%rax), $ACC7, $ACC7 + vpaddq 32*8-128(%rax), $ACC8, $ACC8 + + vpsubq 32*0-128(%rdx), $ACC0, $ACC0 + vpsubq 32*1-128(%rdx), $ACC1, $ACC1 + vpsubq 32*2-128(%rdx), $ACC2, $ACC2 + vpsubq 32*3-128(%rdx), $ACC3, $ACC3 + vpsubq 32*4-128(%rdx), $ACC4, $ACC4 + vpsubq 32*5-128(%rdx), $ACC5, $ACC5 + vpsubq 32*6-128(%rdx), $ACC6, $ACC6 + vpsubq 32*7-128(%rdx), $ACC7, $ACC7 + vpsubq 32*8-128(%rdx), $ACC8, $ACC8 + + vpsubq 32*0-128(%rcx), $ACC0, $ACC0 + vpsubq 32*1-128(%rcx), $ACC1, $ACC1 + vpsubq 32*2-128(%rcx), $ACC2, $ACC2 + vpsubq 32*3-128(%rcx), $ACC3, $ACC3 + vpsubq 32*4-128(%rcx), $ACC4, $ACC4 + vpsubq 32*5-128(%rcx), $ACC5, $ACC5 + vpsubq 32*6-128(%rcx), $ACC6, $ACC6 + vpsubq 32*7-128(%rcx), $ACC7, $ACC7 + vpsubq 32*8-128(%rcx), $ACC8, $ACC8 + call avx2_normalize + + lea 32*0($b_ptr), %rsi + lea 32*0($a_ptr), %rdx + call avx2_select_n_store + + # H = U2 - X3 + lea `32*9*0`(%rsp), %rsi + lea `32*9*0`($r_ptr), %rdx + lea `32*9*3`(%rsp), %rdi + call avx2_sub_x4 + call avx2_normalize_n_store + + # H = H*R + lea `32*9*3`(%rsp), %rsi + lea `32*9*4`(%rsp), %rdx + lea `32*9*3`(%rsp), %rdi + call avx2_mul_x4 + call avx2_normalize_n_store + + # S2 = S1 * H^3 + lea `32*9*7`(%rsp), %rsi + lea `32*9*1`($a_ptr), %rdx + lea `32*9*1`(%rsp), %rdi + call avx2_mul_x4 + call avx2_normalize_n_store + + # + lea `32*9*3`(%rsp), %rsi + lea `32*9*1`(%rsp), %rdx + lea `32*9*1`($r_ptr), %rdi + call avx2_sub_x4 + call avx2_normalize + + lea 32*9($b_ptr), %rsi + lea 32*9($a_ptr), %rdx + call avx2_select_n_store + + #lea 32*9*0($r_ptr), %rsi + #lea 32*9*0($r_ptr), %rdi + #call avx2_mul_by1_x4 + #NORMALIZE + #STORE + + lea `32*9*1`($r_ptr), %rsi + lea `32*9*1`($r_ptr), %rdi + call avx2_mul_by1_x4 + call avx2_normalize_n_store + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps %xmm6, -16*10(%rbp) + movaps %xmm7, -16*9(%rbp) + movaps %xmm8, -16*8(%rbp) + movaps %xmm9, -16*7(%rbp) + movaps %xmm10, -16*6(%rbp) + movaps %xmm11, -16*5(%rbp) + movaps %xmm12, -16*4(%rbp) + movaps %xmm13, -16*3(%rbp) + movaps %xmm14, -16*2(%rbp) + movaps %xmm15, -16*1(%rbp) +___ +$code.=<<___; + mov %rbp, %rsp + pop %rbp + ret +.size ecp_nistz256_avx2_point_add_affines_x4,.-ecp_nistz256_avx2_point_add_affines_x4 + +################################################################################ +# void ecp_nistz256_avx2_to_mont(void* RESULTx4, void *Ax4); +.globl ecp_nistz256_avx2_to_mont +.type ecp_nistz256_avx2_to_mont,\@function,2 +.align 32 +ecp_nistz256_avx2_to_mont: + vzeroupper +___ +$code.=<<___ if ($win64); + lea -8-16*10(%rsp), %rsp + vmovaps %xmm6, -8-16*10(%rax) + vmovaps %xmm7, -8-16*9(%rax) + vmovaps %xmm8, -8-16*8(%rax) + vmovaps %xmm9, -8-16*7(%rax) + vmovaps %xmm10, -8-16*6(%rax) + vmovaps %xmm11, -8-16*5(%rax) + vmovaps %xmm12, -8-16*4(%rax) + vmovaps %xmm13, -8-16*3(%rax) + vmovaps %xmm14, -8-16*2(%rax) + vmovaps %xmm15, -8-16*1(%rax) +___ +$code.=<<___; + vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK + lea .LTO_MONT_AVX2(%rip), %rdx + call avx2_mul_x4 + call avx2_normalize_n_store + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps 16*0(%rsp), %xmm6 + movaps 16*1(%rsp), %xmm7 + movaps 16*2(%rsp), %xmm8 + movaps 16*3(%rsp), %xmm9 + movaps 16*4(%rsp), %xmm10 + movaps 16*5(%rsp), %xmm11 + movaps 16*6(%rsp), %xmm12 + movaps 16*7(%rsp), %xmm13 + movaps 16*8(%rsp), %xmm14 + movaps 16*9(%rsp), %xmm15 + lea 8+16*10(%rsp), %rsp +___ +$code.=<<___; + ret +.size ecp_nistz256_avx2_to_mont,.-ecp_nistz256_avx2_to_mont + +################################################################################ +# void ecp_nistz256_avx2_from_mont(void* RESULTx4, void *Ax4); +.globl ecp_nistz256_avx2_from_mont +.type ecp_nistz256_avx2_from_mont,\@function,2 +.align 32 +ecp_nistz256_avx2_from_mont: + vzeroupper +___ +$code.=<<___ if ($win64); + lea -8-16*10(%rsp), %rsp + vmovaps %xmm6, -8-16*10(%rax) + vmovaps %xmm7, -8-16*9(%rax) + vmovaps %xmm8, -8-16*8(%rax) + vmovaps %xmm9, -8-16*7(%rax) + vmovaps %xmm10, -8-16*6(%rax) + vmovaps %xmm11, -8-16*5(%rax) + vmovaps %xmm12, -8-16*4(%rax) + vmovaps %xmm13, -8-16*3(%rax) + vmovaps %xmm14, -8-16*2(%rax) + vmovaps %xmm15, -8-16*1(%rax) +___ +$code.=<<___; + vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK + lea .LFROM_MONT_AVX2(%rip), %rdx + call avx2_mul_x4 + call avx2_normalize_n_store + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps 16*0(%rsp), %xmm6 + movaps 16*1(%rsp), %xmm7 + movaps 16*2(%rsp), %xmm8 + movaps 16*3(%rsp), %xmm9 + movaps 16*4(%rsp), %xmm10 + movaps 16*5(%rsp), %xmm11 + movaps 16*6(%rsp), %xmm12 + movaps 16*7(%rsp), %xmm13 + movaps 16*8(%rsp), %xmm14 + movaps 16*9(%rsp), %xmm15 + lea 8+16*10(%rsp), %rsp +___ +$code.=<<___; + ret +.size ecp_nistz256_avx2_from_mont,.-ecp_nistz256_avx2_from_mont + +################################################################################ +# void ecp_nistz256_avx2_set1(void* RESULTx4); +.globl ecp_nistz256_avx2_set1 +.type ecp_nistz256_avx2_set1,\@function,1 +.align 32 +ecp_nistz256_avx2_set1: + lea .LONE+128(%rip), %rax + lea 128(%rdi), %rdi + vzeroupper + vmovdqa 32*0-128(%rax), %ymm0 + vmovdqa 32*1-128(%rax), %ymm1 + vmovdqa 32*2-128(%rax), %ymm2 + vmovdqa 32*3-128(%rax), %ymm3 + vmovdqa 32*4-128(%rax), %ymm4 + vmovdqa 32*5-128(%rax), %ymm5 + vmovdqa %ymm0, 32*0-128(%rdi) + vmovdqa 32*6-128(%rax), %ymm0 + vmovdqa %ymm1, 32*1-128(%rdi) + vmovdqa 32*7-128(%rax), %ymm1 + vmovdqa %ymm2, 32*2-128(%rdi) + vmovdqa 32*8-128(%rax), %ymm2 + vmovdqa %ymm3, 32*3-128(%rdi) + vmovdqa %ymm4, 32*4-128(%rdi) + vmovdqa %ymm5, 32*5-128(%rdi) + vmovdqa %ymm0, 32*6-128(%rdi) + vmovdqa %ymm1, 32*7-128(%rdi) + vmovdqa %ymm2, 32*8-128(%rdi) + + vzeroupper + ret +.size ecp_nistz256_avx2_set1,.-ecp_nistz256_avx2_set1 +___ +} +{ +################################################################################ +# void ecp_nistz256_avx2_multi_gather_w7(void* RESULT, void *in, +# int index0, int index1, int index2, int index3); +################################################################################ + +my ($val,$in_t,$index0,$index1,$index2,$index3)=("%rdi","%rsi","%edx","%ecx","%r8d","%r9d"); +my ($INDEX0,$INDEX1,$INDEX2,$INDEX3)=map("%ymm$_",(0..3)); +my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11)); +my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15)); + +$code.=<<___; +.globl ecp_nistz256_avx2_multi_gather_w7 +.type ecp_nistz256_avx2_multi_gather_w7,\@function,6 +.align 32 +ecp_nistz256_avx2_multi_gather_w7: + vzeroupper +___ +$code.=<<___ if ($win64); + lea -8-16*10(%rsp), %rsp + vmovaps %xmm6, -8-16*10(%rax) + vmovaps %xmm7, -8-16*9(%rax) + vmovaps %xmm8, -8-16*8(%rax) + vmovaps %xmm9, -8-16*7(%rax) + vmovaps %xmm10, -8-16*6(%rax) + vmovaps %xmm11, -8-16*5(%rax) + vmovaps %xmm12, -8-16*4(%rax) + vmovaps %xmm13, -8-16*3(%rax) + vmovaps %xmm14, -8-16*2(%rax) + vmovaps %xmm15, -8-16*1(%rax) +___ +$code.=<<___; + lea .LIntOne(%rip), %rax + + vmovd $index0, %xmm0 + vmovd $index1, %xmm1 + vmovd $index2, %xmm2 + vmovd $index3, %xmm3 + + vpxor $R0a, $R0a, $R0a + vpxor $R0b, $R0b, $R0b + vpxor $R1a, $R1a, $R1a + vpxor $R1b, $R1b, $R1b + vpxor $R2a, $R2a, $R2a + vpxor $R2b, $R2b, $R2b + vpxor $R3a, $R3a, $R3a + vpxor $R3b, $R3b, $R3b + vmovdqa (%rax), $M0 + + vpermd $INDEX0, $R0a, $INDEX0 + vpermd $INDEX1, $R0a, $INDEX1 + vpermd $INDEX2, $R0a, $INDEX2 + vpermd $INDEX3, $R0a, $INDEX3 + + mov \$64, %ecx + lea 112($val), $val # size optimization + jmp .Lmulti_select_loop_avx2 + +# INDEX=0, corresponds to the point at infty (0,0) +.align 32 +.Lmulti_select_loop_avx2: + vpcmpeqd $INDEX0, $M0, $TMP0 + + vmovdqa `32*0+32*64*2*0`($in_t), $T0 + vmovdqa `32*1+32*64*2*0`($in_t), $T1 + vpand $TMP0, $T0, $T0 + vpand $TMP0, $T1, $T1 + vpxor $T0, $R0a, $R0a + vpxor $T1, $R0b, $R0b + + vpcmpeqd $INDEX1, $M0, $TMP0 + + vmovdqa `32*0+32*64*2*1`($in_t), $T0 + vmovdqa `32*1+32*64*2*1`($in_t), $T1 + vpand $TMP0, $T0, $T0 + vpand $TMP0, $T1, $T1 + vpxor $T0, $R1a, $R1a + vpxor $T1, $R1b, $R1b + + vpcmpeqd $INDEX2, $M0, $TMP0 + + vmovdqa `32*0+32*64*2*2`($in_t), $T0 + vmovdqa `32*1+32*64*2*2`($in_t), $T1 + vpand $TMP0, $T0, $T0 + vpand $TMP0, $T1, $T1 + vpxor $T0, $R2a, $R2a + vpxor $T1, $R2b, $R2b + + vpcmpeqd $INDEX3, $M0, $TMP0 + + vmovdqa `32*0+32*64*2*3`($in_t), $T0 + vmovdqa `32*1+32*64*2*3`($in_t), $T1 + vpand $TMP0, $T0, $T0 + vpand $TMP0, $T1, $T1 + vpxor $T0, $R3a, $R3a + vpxor $T1, $R3b, $R3b + + vpaddd (%rax), $M0, $M0 # increment + lea 32*2($in_t), $in_t + + dec %ecx + jnz .Lmulti_select_loop_avx2 + + vmovdqu $R0a, 32*0-112($val) + vmovdqu $R0b, 32*1-112($val) + vmovdqu $R1a, 32*2-112($val) + vmovdqu $R1b, 32*3-112($val) + vmovdqu $R2a, 32*4-112($val) + vmovdqu $R2b, 32*5-112($val) + vmovdqu $R3a, 32*6-112($val) + vmovdqu $R3b, 32*7-112($val) + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps 16*0(%rsp), %xmm6 + movaps 16*1(%rsp), %xmm7 + movaps 16*2(%rsp), %xmm8 + movaps 16*3(%rsp), %xmm9 + movaps 16*4(%rsp), %xmm10 + movaps 16*5(%rsp), %xmm11 + movaps 16*6(%rsp), %xmm12 + movaps 16*7(%rsp), %xmm13 + movaps 16*8(%rsp), %xmm14 + movaps 16*9(%rsp), %xmm15 + lea 8+16*10(%rsp), %rsp +___ +$code.=<<___; + ret +.size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7 + +.extern OPENSSL_ia32cap_P +.globl ecp_nistz_avx2_eligible +.type ecp_nistz_avx2_eligible,\@abi-omnipotent +.align 32 +ecp_nistz_avx2_eligible: + mov OPENSSL_ia32cap_P+8(%rip),%eax + shr \$5,%eax + and \$1,%eax + ret +.size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible +___ +} +}} else {{ # assembler is too old +$code.=<<___; +.text + +.globl ecp_nistz256_avx2_transpose_convert +.globl ecp_nistz256_avx2_convert_transpose_back +.globl ecp_nistz256_avx2_point_add_affine_x4 +.globl ecp_nistz256_avx2_point_add_affines_x4 +.globl ecp_nistz256_avx2_to_mont +.globl ecp_nistz256_avx2_from_mont +.globl ecp_nistz256_avx2_set1 +.globl ecp_nistz256_avx2_multi_gather_w7 +.type ecp_nistz256_avx2_multi_gather_w7,\@abi-omnipotent +ecp_nistz256_avx2_transpose_convert: +ecp_nistz256_avx2_convert_transpose_back: +ecp_nistz256_avx2_point_add_affine_x4: +ecp_nistz256_avx2_point_add_affines_x4: +ecp_nistz256_avx2_to_mont: +ecp_nistz256_avx2_from_mont: +ecp_nistz256_avx2_set1: +ecp_nistz256_avx2_multi_gather_w7: + .byte 0x0f,0x0b # ud2 + ret +.size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7 + +.globl ecp_nistz_avx2_eligible +.type ecp_nistz_avx2_eligible,\@abi-omnipotent +ecp_nistz_avx2_eligible: + xor %eax,%eax + ret +.size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible +___ +}} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; + + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-ppc64.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-ppc64.pl new file mode 100755 index 000000000..2bf54e2aa --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-ppc64.pl @@ -0,0 +1,2382 @@ +#! /usr/bin/env perl +# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# ECP_NISTZ256 module for PPC64. +# +# August 2016. +# +# Original ECP_NISTZ256 submission targeting x86_64 is detailed in +# http://eprint.iacr.org/2013/816. +# +# with/without -DECP_NISTZ256_ASM +# POWER7 +260-530% +# POWER8 +220-340% + +$flavour = shift; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +my $sp="r1"; + +{ +my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3, + $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) = + map("r$_",(3..12,22..31)); + +my ($acc6,$acc7)=($bp,$bi); # used in __ecp_nistz256_sqr_mont + +$code.=<<___; +.machine "any" +.text +___ +######################################################################## +# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 +# +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +open TABLE,"<ecp_nistz256_table.c" or +open TABLE,"<${dir}../ecp_nistz256_table.c" or +die "failed to open ecp_nistz256_table.c:",$!; + +use integer; + +foreach(<TABLE>) { + s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; +} +close TABLE; + +# See ecp_nistz256_table.c for explanation for why it's 64*16*37. +# 64*16*37-1 is because $#arr returns last valid index or @arr, not +# amount of elements. +die "insane number of elements" if ($#arr != 64*16*37-1); + +$code.=<<___; +.type ecp_nistz256_precomputed,\@object +.globl ecp_nistz256_precomputed +.align 12 +ecp_nistz256_precomputed: +___ +######################################################################## +# this conversion smashes P256_POINT_AFFINE by individual bytes with +# 64 byte interval, similar to +# 1111222233334444 +# 1234123412341234 +for(1..37) { + @tbl = splice(@arr,0,64*16); + for($i=0;$i<64;$i++) { + undef @line; + for($j=0;$j<64;$j++) { + push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; + } + $code.=".byte\t"; + $code.=join(',',map { sprintf "0x%02x",$_} @line); + $code.="\n"; + } +} + +$code.=<<___; +.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed +.asciz "ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>" + +# void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +# const BN_ULONG x2[4]); +.globl ecp_nistz256_mul_mont +.align 5 +ecp_nistz256_mul_mont: + stdu $sp,-128($sp) + mflr r0 + std r22,48($sp) + std r23,56($sp) + std r24,64($sp) + std r25,72($sp) + std r26,80($sp) + std r27,88($sp) + std r28,96($sp) + std r29,104($sp) + std r30,112($sp) + std r31,120($sp) + + ld $a0,0($ap) + ld $bi,0($bp) + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + bl __ecp_nistz256_mul_mont + + mtlr r0 + ld r22,48($sp) + ld r23,56($sp) + ld r24,64($sp) + ld r25,72($sp) + ld r26,80($sp) + ld r27,88($sp) + ld r28,96($sp) + ld r29,104($sp) + ld r30,112($sp) + ld r31,120($sp) + addi $sp,$sp,128 + blr + .long 0 + .byte 0,12,4,0,0x80,10,3,0 + .long 0 +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +# void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_sqr_mont +.align 4 +ecp_nistz256_sqr_mont: + stdu $sp,-128($sp) + mflr r0 + std r22,48($sp) + std r23,56($sp) + std r24,64($sp) + std r25,72($sp) + std r26,80($sp) + std r27,88($sp) + std r28,96($sp) + std r29,104($sp) + std r30,112($sp) + std r31,120($sp) + + ld $a0,0($ap) + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + bl __ecp_nistz256_sqr_mont + + mtlr r0 + ld r22,48($sp) + ld r23,56($sp) + ld r24,64($sp) + ld r25,72($sp) + ld r26,80($sp) + ld r27,88($sp) + ld r28,96($sp) + ld r29,104($sp) + ld r30,112($sp) + ld r31,120($sp) + addi $sp,$sp,128 + blr + .long 0 + .byte 0,12,4,0,0x80,10,2,0 + .long 0 +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +# void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4], +# const BN_ULONG x2[4]); +.globl ecp_nistz256_add +.align 4 +ecp_nistz256_add: + stdu $sp,-128($sp) + mflr r0 + std r28,96($sp) + std r29,104($sp) + std r30,112($sp) + std r31,120($sp) + + ld $acc0,0($ap) + ld $t0, 0($bp) + ld $acc1,8($ap) + ld $t1, 8($bp) + ld $acc2,16($ap) + ld $t2, 16($bp) + ld $acc3,24($ap) + ld $t3, 24($bp) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + bl __ecp_nistz256_add + + mtlr r0 + ld r28,96($sp) + ld r29,104($sp) + ld r30,112($sp) + ld r31,120($sp) + addi $sp,$sp,128 + blr + .long 0 + .byte 0,12,4,0,0x80,4,3,0 + .long 0 +.size ecp_nistz256_add,.-ecp_nistz256_add + +# void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_div_by_2 +.align 4 +ecp_nistz256_div_by_2: + stdu $sp,-128($sp) + mflr r0 + std r28,96($sp) + std r29,104($sp) + std r30,112($sp) + std r31,120($sp) + + ld $acc0,0($ap) + ld $acc1,8($ap) + ld $acc2,16($ap) + ld $acc3,24($ap) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + bl __ecp_nistz256_div_by_2 + + mtlr r0 + ld r28,96($sp) + ld r29,104($sp) + ld r30,112($sp) + ld r31,120($sp) + addi $sp,$sp,128 + blr + .long 0 + .byte 0,12,4,0,0x80,4,2,0 + .long 0 +.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 + +# void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_2 +.align 4 +ecp_nistz256_mul_by_2: + stdu $sp,-128($sp) + mflr r0 + std r28,96($sp) + std r29,104($sp) + std r30,112($sp) + std r31,120($sp) + + ld $acc0,0($ap) + ld $acc1,8($ap) + ld $acc2,16($ap) + ld $acc3,24($ap) + + mr $t0,$acc0 + mr $t1,$acc1 + mr $t2,$acc2 + mr $t3,$acc3 + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + bl __ecp_nistz256_add # ret = a+a // 2*a + + mtlr r0 + ld r28,96($sp) + ld r29,104($sp) + ld r30,112($sp) + ld r31,120($sp) + addi $sp,$sp,128 + blr + .long 0 + .byte 0,12,4,0,0x80,4,3,0 + .long 0 +.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 + +# void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_3 +.align 4 +ecp_nistz256_mul_by_3: + stdu $sp,-128($sp) + mflr r0 + std r28,96($sp) + std r29,104($sp) + std r30,112($sp) + std r31,120($sp) + + ld $acc0,0($ap) + ld $acc1,8($ap) + ld $acc2,16($ap) + ld $acc3,24($ap) + + mr $t0,$acc0 + std $acc0,64($sp) + mr $t1,$acc1 + std $acc1,72($sp) + mr $t2,$acc2 + std $acc2,80($sp) + mr $t3,$acc3 + std $acc3,88($sp) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + bl __ecp_nistz256_add # ret = a+a // 2*a + + ld $t0,64($sp) + ld $t1,72($sp) + ld $t2,80($sp) + ld $t3,88($sp) + + bl __ecp_nistz256_add # ret += a // 2*a+a=3*a + + mtlr r0 + ld r28,96($sp) + ld r29,104($sp) + ld r30,112($sp) + ld r31,120($sp) + addi $sp,$sp,128 + blr + .long 0 + .byte 0,12,4,0,0x80,4,2,0 + .long 0 +.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 + +# void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], +# const BN_ULONG x2[4]); +.globl ecp_nistz256_sub +.align 4 +ecp_nistz256_sub: + stdu $sp,-128($sp) + mflr r0 + std r28,96($sp) + std r29,104($sp) + std r30,112($sp) + std r31,120($sp) + + ld $acc0,0($ap) + ld $acc1,8($ap) + ld $acc2,16($ap) + ld $acc3,24($ap) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + bl __ecp_nistz256_sub_from + + mtlr r0 + ld r28,96($sp) + ld r29,104($sp) + ld r30,112($sp) + ld r31,120($sp) + addi $sp,$sp,128 + blr + .long 0 + .byte 0,12,4,0,0x80,4,3,0 + .long 0 +.size ecp_nistz256_sub,.-ecp_nistz256_sub + +# void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_neg +.align 4 +ecp_nistz256_neg: + stdu $sp,-128($sp) + mflr r0 + std r28,96($sp) + std r29,104($sp) + std r30,112($sp) + std r31,120($sp) + + mr $bp,$ap + li $acc0,0 + li $acc1,0 + li $acc2,0 + li $acc3,0 + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + bl __ecp_nistz256_sub_from + + mtlr r0 + ld r28,96($sp) + ld r29,104($sp) + ld r30,112($sp) + ld r31,120($sp) + addi $sp,$sp,128 + blr + .long 0 + .byte 0,12,4,0,0x80,4,2,0 + .long 0 +.size ecp_nistz256_neg,.-ecp_nistz256_neg + +# note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +# to $a0-$a3 and b[0] - to $bi +.type __ecp_nistz256_mul_mont,\@function +.align 4 +__ecp_nistz256_mul_mont: + mulld $acc0,$a0,$bi # a[0]*b[0] + mulhdu $t0,$a0,$bi + + mulld $acc1,$a1,$bi # a[1]*b[0] + mulhdu $t1,$a1,$bi + + mulld $acc2,$a2,$bi # a[2]*b[0] + mulhdu $t2,$a2,$bi + + mulld $acc3,$a3,$bi # a[3]*b[0] + mulhdu $t3,$a3,$bi + ld $bi,8($bp) # b[1] + + addc $acc1,$acc1,$t0 # accumulate high parts of multiplication + sldi $t0,$acc0,32 + adde $acc2,$acc2,$t1 + srdi $t1,$acc0,32 + adde $acc3,$acc3,$t2 + addze $acc4,$t3 + li $acc5,0 +___ +for($i=1;$i<4;$i++) { + ################################################################ + # Reduction iteration is normally performed by accumulating + # result of multiplication of modulus by "magic" digit [and + # omitting least significant word, which is guaranteed to + # be 0], but thanks to special form of modulus and "magic" + # digit being equal to least significant word, it can be + # performed with additions and subtractions alone. Indeed: + # + # ffff0001.00000000.0000ffff.ffffffff + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 + # - 0000abcd.efgh0000.00000000.00000000.abcdefgh + # + # or marking redundant operations: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- + # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- + # - 0000abcd.efgh0000.--------.--------.-------- + +$code.=<<___; + subfc $t2,$t0,$acc0 # "*0xffff0001" + subfe $t3,$t1,$acc0 + addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0] + adde $acc1,$acc2,$t1 + adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001 + adde $acc3,$acc4,$t3 + addze $acc4,$acc5 + + mulld $t0,$a0,$bi # lo(a[0]*b[i]) + mulld $t1,$a1,$bi # lo(a[1]*b[i]) + mulld $t2,$a2,$bi # lo(a[2]*b[i]) + mulld $t3,$a3,$bi # lo(a[3]*b[i]) + addc $acc0,$acc0,$t0 # accumulate low parts of multiplication + mulhdu $t0,$a0,$bi # hi(a[0]*b[i]) + adde $acc1,$acc1,$t1 + mulhdu $t1,$a1,$bi # hi(a[1]*b[i]) + adde $acc2,$acc2,$t2 + mulhdu $t2,$a2,$bi # hi(a[2]*b[i]) + adde $acc3,$acc3,$t3 + mulhdu $t3,$a3,$bi # hi(a[3]*b[i]) + addze $acc4,$acc4 +___ +$code.=<<___ if ($i<3); + ld $bi,8*($i+1)($bp) # b[$i+1] +___ +$code.=<<___; + addc $acc1,$acc1,$t0 # accumulate high parts of multiplication + sldi $t0,$acc0,32 + adde $acc2,$acc2,$t1 + srdi $t1,$acc0,32 + adde $acc3,$acc3,$t2 + adde $acc4,$acc4,$t3 + li $acc5,0 + addze $acc5,$acc5 +___ +} +$code.=<<___; + # last reduction + subfc $t2,$t0,$acc0 # "*0xffff0001" + subfe $t3,$t1,$acc0 + addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0] + adde $acc1,$acc2,$t1 + adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001 + adde $acc3,$acc4,$t3 + addze $acc4,$acc5 + + li $t2,0 + addic $acc0,$acc0,1 # ret -= modulus + subfe $acc1,$poly1,$acc1 + subfe $acc2,$t2,$acc2 + subfe $acc3,$poly3,$acc3 + subfe $acc4,$t2,$acc4 + + addc $acc0,$acc0,$acc4 # ret += modulus if borrow + and $t1,$poly1,$acc4 + and $t3,$poly3,$acc4 + adde $acc1,$acc1,$t1 + addze $acc2,$acc2 + adde $acc3,$acc3,$t3 + + std $acc0,0($rp) + std $acc1,8($rp) + std $acc2,16($rp) + std $acc3,24($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,1,0 + .long 0 +.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont + +# note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +# to $a0-$a3 +.type __ecp_nistz256_sqr_mont,\@function +.align 4 +__ecp_nistz256_sqr_mont: + ################################################################ + # | | | | | |a1*a0| | + # | | | | |a2*a0| | | + # | |a3*a2|a3*a0| | | | + # | | | |a2*a1| | | | + # | | |a3*a1| | | | | + # *| | | | | | | | 2| + # +|a3*a3|a2*a2|a1*a1|a0*a0| + # |--+--+--+--+--+--+--+--| + # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + # + # "can't overflow" below mark carrying into high part of + # multiplication result, which can't overflow, because it + # can never be all ones. + + mulld $acc1,$a1,$a0 # a[1]*a[0] + mulhdu $t1,$a1,$a0 + mulld $acc2,$a2,$a0 # a[2]*a[0] + mulhdu $t2,$a2,$a0 + mulld $acc3,$a3,$a0 # a[3]*a[0] + mulhdu $acc4,$a3,$a0 + + addc $acc2,$acc2,$t1 # accumulate high parts of multiplication + mulld $t0,$a2,$a1 # a[2]*a[1] + mulhdu $t1,$a2,$a1 + adde $acc3,$acc3,$t2 + mulld $t2,$a3,$a1 # a[3]*a[1] + mulhdu $t3,$a3,$a1 + addze $acc4,$acc4 # can't overflow + + mulld $acc5,$a3,$a2 # a[3]*a[2] + mulhdu $acc6,$a3,$a2 + + addc $t1,$t1,$t2 # accumulate high parts of multiplication + addze $t2,$t3 # can't overflow + + addc $acc3,$acc3,$t0 # accumulate low parts of multiplication + adde $acc4,$acc4,$t1 + adde $acc5,$acc5,$t2 + addze $acc6,$acc6 # can't overflow + + addc $acc1,$acc1,$acc1 # acc[1-6]*=2 + adde $acc2,$acc2,$acc2 + adde $acc3,$acc3,$acc3 + adde $acc4,$acc4,$acc4 + adde $acc5,$acc5,$acc5 + adde $acc6,$acc6,$acc6 + li $acc7,0 + addze $acc7,$acc7 + + mulld $acc0,$a0,$a0 # a[0]*a[0] + mulhdu $a0,$a0,$a0 + mulld $t1,$a1,$a1 # a[1]*a[1] + mulhdu $a1,$a1,$a1 + mulld $t2,$a2,$a2 # a[2]*a[2] + mulhdu $a2,$a2,$a2 + mulld $t3,$a3,$a3 # a[3]*a[3] + mulhdu $a3,$a3,$a3 + addc $acc1,$acc1,$a0 # +a[i]*a[i] + sldi $t0,$acc0,32 + adde $acc2,$acc2,$t1 + srdi $t1,$acc0,32 + adde $acc3,$acc3,$a1 + adde $acc4,$acc4,$t2 + adde $acc5,$acc5,$a2 + adde $acc6,$acc6,$t3 + adde $acc7,$acc7,$a3 +___ +for($i=0;$i<3;$i++) { # reductions, see commentary in + # multiplication for details +$code.=<<___; + subfc $t2,$t0,$acc0 # "*0xffff0001" + subfe $t3,$t1,$acc0 + addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0] + sldi $t0,$acc0,32 + adde $acc1,$acc2,$t1 + srdi $t1,$acc0,32 + adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001 + addze $acc3,$t3 # can't overflow +___ +} +$code.=<<___; + subfc $t2,$t0,$acc0 # "*0xffff0001" + subfe $t3,$t1,$acc0 + addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0] + adde $acc1,$acc2,$t1 + adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001 + addze $acc3,$t3 # can't overflow + + addc $acc0,$acc0,$acc4 # accumulate upper half + adde $acc1,$acc1,$acc5 + adde $acc2,$acc2,$acc6 + adde $acc3,$acc3,$acc7 + li $t2,0 + addze $acc4,$t2 + + addic $acc0,$acc0,1 # ret -= modulus + subfe $acc1,$poly1,$acc1 + subfe $acc2,$t2,$acc2 + subfe $acc3,$poly3,$acc3 + subfe $acc4,$t2,$acc4 + + addc $acc0,$acc0,$acc4 # ret += modulus if borrow + and $t1,$poly1,$acc4 + and $t3,$poly3,$acc4 + adde $acc1,$acc1,$t1 + addze $acc2,$acc2 + adde $acc3,$acc3,$t3 + + std $acc0,0($rp) + std $acc1,8($rp) + std $acc2,16($rp) + std $acc3,24($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,1,0 + .long 0 +.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont + +# Note that __ecp_nistz256_add expects both input vectors pre-loaded to +# $a0-$a3 and $t0-$t3. This is done because it's used in multiple +# contexts, e.g. in multiplication by 2 and 3... +.type __ecp_nistz256_add,\@function +.align 4 +__ecp_nistz256_add: + addc $acc0,$acc0,$t0 # ret = a+b + adde $acc1,$acc1,$t1 + adde $acc2,$acc2,$t2 + li $t2,0 + adde $acc3,$acc3,$t3 + addze $t0,$t2 + + # if a+b >= modulus, subtract modulus + # + # But since comparison implies subtraction, we subtract + # modulus and then add it back if subtraction borrowed. + + subic $acc0,$acc0,-1 + subfe $acc1,$poly1,$acc1 + subfe $acc2,$t2,$acc2 + subfe $acc3,$poly3,$acc3 + subfe $t0,$t2,$t0 + + addc $acc0,$acc0,$t0 + and $t1,$poly1,$t0 + and $t3,$poly3,$t0 + adde $acc1,$acc1,$t1 + addze $acc2,$acc2 + adde $acc3,$acc3,$t3 + + std $acc0,0($rp) + std $acc1,8($rp) + std $acc2,16($rp) + std $acc3,24($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size __ecp_nistz256_add,.-__ecp_nistz256_add + +.type __ecp_nistz256_sub_from,\@function +.align 4 +__ecp_nistz256_sub_from: + ld $t0,0($bp) + ld $t1,8($bp) + ld $t2,16($bp) + ld $t3,24($bp) + subfc $acc0,$t0,$acc0 # ret = a-b + subfe $acc1,$t1,$acc1 + subfe $acc2,$t2,$acc2 + subfe $acc3,$t3,$acc3 + subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0 + + # if a-b borrowed, add modulus + + addc $acc0,$acc0,$t0 # ret -= modulus & t0 + and $t1,$poly1,$t0 + and $t3,$poly3,$t0 + adde $acc1,$acc1,$t1 + addze $acc2,$acc2 + adde $acc3,$acc3,$t3 + + std $acc0,0($rp) + std $acc1,8($rp) + std $acc2,16($rp) + std $acc3,24($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from + +.type __ecp_nistz256_sub_morf,\@function +.align 4 +__ecp_nistz256_sub_morf: + ld $t0,0($bp) + ld $t1,8($bp) + ld $t2,16($bp) + ld $t3,24($bp) + subfc $acc0,$acc0,$t0 # ret = b-a + subfe $acc1,$acc1,$t1 + subfe $acc2,$acc2,$t2 + subfe $acc3,$acc3,$t3 + subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0 + + # if b-a borrowed, add modulus + + addc $acc0,$acc0,$t0 # ret -= modulus & t0 + and $t1,$poly1,$t0 + and $t3,$poly3,$t0 + adde $acc1,$acc1,$t1 + addze $acc2,$acc2 + adde $acc3,$acc3,$t3 + + std $acc0,0($rp) + std $acc1,8($rp) + std $acc2,16($rp) + std $acc3,24($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf + +.type __ecp_nistz256_div_by_2,\@function +.align 4 +__ecp_nistz256_div_by_2: + andi. $t0,$acc0,1 + addic $acc0,$acc0,-1 # a += modulus + neg $t0,$t0 + adde $acc1,$acc1,$poly1 + not $t0,$t0 + addze $acc2,$acc2 + li $t2,0 + adde $acc3,$acc3,$poly3 + and $t1,$poly1,$t0 + addze $ap,$t2 # ap = carry + and $t3,$poly3,$t0 + + subfc $acc0,$t0,$acc0 # a -= modulus if a was even + subfe $acc1,$t1,$acc1 + subfe $acc2,$t2,$acc2 + subfe $acc3,$t3,$acc3 + subfe $ap, $t2,$ap + + srdi $acc0,$acc0,1 + sldi $t0,$acc1,63 + srdi $acc1,$acc1,1 + sldi $t1,$acc2,63 + srdi $acc2,$acc2,1 + sldi $t2,$acc3,63 + srdi $acc3,$acc3,1 + sldi $t3,$ap,63 + or $acc0,$acc0,$t0 + or $acc1,$acc1,$t1 + or $acc2,$acc2,$t2 + or $acc3,$acc3,$t3 + + std $acc0,0($rp) + std $acc1,8($rp) + std $acc2,16($rp) + std $acc3,24($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,1,0 + .long 0 +.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 +___ +######################################################################## +# following subroutines are "literal" implementation of those found in +# ecp_nistz256.c +# +######################################################################## +# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); +# +if (1) { +my $FRAME=64+32*4+12*8; +my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3)); +# above map() describes stack layout with 4 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real) = map("r$_",(20,21)); + +$code.=<<___; +.globl ecp_nistz256_point_double +.align 5 +ecp_nistz256_point_double: + stdu $sp,-$FRAME($sp) + mflr r0 + std r20,$FRAME-8*12($sp) + std r21,$FRAME-8*11($sp) + std r22,$FRAME-8*10($sp) + std r23,$FRAME-8*9($sp) + std r24,$FRAME-8*8($sp) + std r25,$FRAME-8*7($sp) + std r26,$FRAME-8*6($sp) + std r27,$FRAME-8*5($sp) + std r28,$FRAME-8*4($sp) + std r29,$FRAME-8*3($sp) + std r30,$FRAME-8*2($sp) + std r31,$FRAME-8*1($sp) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 +.Ldouble_shortcut: + ld $acc0,32($ap) + ld $acc1,40($ap) + ld $acc2,48($ap) + ld $acc3,56($ap) + mr $t0,$acc0 + mr $t1,$acc1 + mr $t2,$acc2 + mr $t3,$acc3 + ld $a0,64($ap) # forward load for p256_sqr_mont + ld $a1,72($ap) + ld $a2,80($ap) + ld $a3,88($ap) + mr $rp_real,$rp + mr $ap_real,$ap + addi $rp,$sp,$S + bl __ecp_nistz256_add # p256_mul_by_2(S, in_y); + + addi $rp,$sp,$Zsqr + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Zsqr, in_z); + + ld $t0,0($ap_real) + ld $t1,8($ap_real) + ld $t2,16($ap_real) + ld $t3,24($ap_real) + mr $a0,$acc0 # put Zsqr aside for p256_sub + mr $a1,$acc1 + mr $a2,$acc2 + mr $a3,$acc3 + addi $rp,$sp,$M + bl __ecp_nistz256_add # p256_add(M, Zsqr, in_x); + + addi $bp,$ap_real,0 + mr $acc0,$a0 # restore Zsqr + mr $acc1,$a1 + mr $acc2,$a2 + mr $acc3,$a3 + ld $a0,$S+0($sp) # forward load for p256_sqr_mont + ld $a1,$S+8($sp) + ld $a2,$S+16($sp) + ld $a3,$S+24($sp) + addi $rp,$sp,$Zsqr + bl __ecp_nistz256_sub_morf # p256_sub(Zsqr, in_x, Zsqr); + + addi $rp,$sp,$S + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(S, S); + + ld $bi,32($ap_real) + ld $a0,64($ap_real) + ld $a1,72($ap_real) + ld $a2,80($ap_real) + ld $a3,88($ap_real) + addi $bp,$ap_real,32 + addi $rp,$sp,$tmp0 + bl __ecp_nistz256_mul_mont # p256_mul_mont(tmp0, in_z, in_y); + + mr $t0,$acc0 + mr $t1,$acc1 + mr $t2,$acc2 + mr $t3,$acc3 + ld $a0,$S+0($sp) # forward load for p256_sqr_mont + ld $a1,$S+8($sp) + ld $a2,$S+16($sp) + ld $a3,$S+24($sp) + addi $rp,$rp_real,64 + bl __ecp_nistz256_add # p256_mul_by_2(res_z, tmp0); + + addi $rp,$sp,$tmp0 + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(tmp0, S); + + ld $bi,$Zsqr($sp) # forward load for p256_mul_mont + ld $a0,$M+0($sp) + ld $a1,$M+8($sp) + ld $a2,$M+16($sp) + ld $a3,$M+24($sp) + addi $rp,$rp_real,32 + bl __ecp_nistz256_div_by_2 # p256_div_by_2(res_y, tmp0); + + addi $bp,$sp,$Zsqr + addi $rp,$sp,$M + bl __ecp_nistz256_mul_mont # p256_mul_mont(M, M, Zsqr); + + mr $t0,$acc0 # duplicate M + mr $t1,$acc1 + mr $t2,$acc2 + mr $t3,$acc3 + mr $a0,$acc0 # put M aside + mr $a1,$acc1 + mr $a2,$acc2 + mr $a3,$acc3 + addi $rp,$sp,$M + bl __ecp_nistz256_add + mr $t0,$a0 # restore M + mr $t1,$a1 + mr $t2,$a2 + mr $t3,$a3 + ld $bi,0($ap_real) # forward load for p256_mul_mont + ld $a0,$S+0($sp) + ld $a1,$S+8($sp) + ld $a2,$S+16($sp) + ld $a3,$S+24($sp) + bl __ecp_nistz256_add # p256_mul_by_3(M, M); + + addi $bp,$ap_real,0 + addi $rp,$sp,$S + bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, in_x); + + mr $t0,$acc0 + mr $t1,$acc1 + mr $t2,$acc2 + mr $t3,$acc3 + ld $a0,$M+0($sp) # forward load for p256_sqr_mont + ld $a1,$M+8($sp) + ld $a2,$M+16($sp) + ld $a3,$M+24($sp) + addi $rp,$sp,$tmp0 + bl __ecp_nistz256_add # p256_mul_by_2(tmp0, S); + + addi $rp,$rp_real,0 + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(res_x, M); + + addi $bp,$sp,$tmp0 + bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, tmp0); + + addi $bp,$sp,$S + addi $rp,$sp,$S + bl __ecp_nistz256_sub_morf # p256_sub(S, S, res_x); + + ld $bi,$M($sp) + mr $a0,$acc0 # copy S + mr $a1,$acc1 + mr $a2,$acc2 + mr $a3,$acc3 + addi $bp,$sp,$M + bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, M); + + addi $bp,$rp_real,32 + addi $rp,$rp_real,32 + bl __ecp_nistz256_sub_from # p256_sub(res_y, S, res_y); + + mtlr r0 + ld r20,$FRAME-8*12($sp) + ld r21,$FRAME-8*11($sp) + ld r22,$FRAME-8*10($sp) + ld r23,$FRAME-8*9($sp) + ld r24,$FRAME-8*8($sp) + ld r25,$FRAME-8*7($sp) + ld r26,$FRAME-8*6($sp) + ld r27,$FRAME-8*5($sp) + ld r28,$FRAME-8*4($sp) + ld r29,$FRAME-8*3($sp) + ld r30,$FRAME-8*2($sp) + ld r31,$FRAME-8*1($sp) + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,0,0x80,12,2,0 + .long 0 +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +___ +} + +######################################################################## +# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT *in2); +if (1) { +my $FRAME = 64 + 32*12 + 16*8; +my ($res_x,$res_y,$res_z, + $H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11)); +my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); +# above map() describes stack layout with 12 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21)); + +$code.=<<___; +.globl ecp_nistz256_point_add +.align 5 +ecp_nistz256_point_add: + stdu $sp,-$FRAME($sp) + mflr r0 + std r16,$FRAME-8*16($sp) + std r17,$FRAME-8*15($sp) + std r18,$FRAME-8*14($sp) + std r19,$FRAME-8*13($sp) + std r20,$FRAME-8*12($sp) + std r21,$FRAME-8*11($sp) + std r22,$FRAME-8*10($sp) + std r23,$FRAME-8*9($sp) + std r24,$FRAME-8*8($sp) + std r25,$FRAME-8*7($sp) + std r26,$FRAME-8*6($sp) + std r27,$FRAME-8*5($sp) + std r28,$FRAME-8*4($sp) + std r29,$FRAME-8*3($sp) + std r30,$FRAME-8*2($sp) + std r31,$FRAME-8*1($sp) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + ld $a0,64($bp) # in2_z + ld $a1,72($bp) + ld $a2,80($bp) + ld $a3,88($bp) + mr $rp_real,$rp + mr $ap_real,$ap + mr $bp_real,$bp + or $t0,$a0,$a1 + or $t2,$a2,$a3 + or $in2infty,$t0,$t2 + neg $t0,$in2infty + or $in2infty,$in2infty,$t0 + sradi $in2infty,$in2infty,63 # !in2infty + addi $rp,$sp,$Z2sqr + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z2sqr, in2_z); + + ld $a0,64($ap_real) # in1_z + ld $a1,72($ap_real) + ld $a2,80($ap_real) + ld $a3,88($ap_real) + or $t0,$a0,$a1 + or $t2,$a2,$a3 + or $in1infty,$t0,$t2 + neg $t0,$in1infty + or $in1infty,$in1infty,$t0 + sradi $in1infty,$in1infty,63 # !in1infty + addi $rp,$sp,$Z1sqr + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z); + + ld $bi,64($bp_real) + ld $a0,$Z2sqr+0($sp) + ld $a1,$Z2sqr+8($sp) + ld $a2,$Z2sqr+16($sp) + ld $a3,$Z2sqr+24($sp) + addi $bp,$bp_real,64 + addi $rp,$sp,$S1 + bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, Z2sqr, in2_z); + + ld $bi,64($ap_real) + ld $a0,$Z1sqr+0($sp) + ld $a1,$Z1sqr+8($sp) + ld $a2,$Z1sqr+16($sp) + ld $a3,$Z1sqr+24($sp) + addi $bp,$ap_real,64 + addi $rp,$sp,$S2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z); + + ld $bi,32($ap_real) + ld $a0,$S1+0($sp) + ld $a1,$S1+8($sp) + ld $a2,$S1+16($sp) + ld $a3,$S1+24($sp) + addi $bp,$ap_real,32 + addi $rp,$sp,$S1 + bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, S1, in1_y); + + ld $bi,32($bp_real) + ld $a0,$S2+0($sp) + ld $a1,$S2+8($sp) + ld $a2,$S2+16($sp) + ld $a3,$S2+24($sp) + addi $bp,$bp_real,32 + addi $rp,$sp,$S2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y); + + addi $bp,$sp,$S1 + ld $bi,$Z2sqr($sp) # forward load for p256_mul_mont + ld $a0,0($ap_real) + ld $a1,8($ap_real) + ld $a2,16($ap_real) + ld $a3,24($ap_real) + addi $rp,$sp,$R + bl __ecp_nistz256_sub_from # p256_sub(R, S2, S1); + + or $acc0,$acc0,$acc1 # see if result is zero + or $acc2,$acc2,$acc3 + or $temp,$acc0,$acc2 + + addi $bp,$sp,$Z2sqr + addi $rp,$sp,$U1 + bl __ecp_nistz256_mul_mont # p256_mul_mont(U1, in1_x, Z2sqr); + + ld $bi,$Z1sqr($sp) + ld $a0,0($bp_real) + ld $a1,8($bp_real) + ld $a2,16($bp_real) + ld $a3,24($bp_real) + addi $bp,$sp,$Z1sqr + addi $rp,$sp,$U2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in2_x, Z1sqr); + + addi $bp,$sp,$U1 + ld $a0,$R+0($sp) # forward load for p256_sqr_mont + ld $a1,$R+8($sp) + ld $a2,$R+16($sp) + ld $a3,$R+24($sp) + addi $rp,$sp,$H + bl __ecp_nistz256_sub_from # p256_sub(H, U2, U1); + + or $acc0,$acc0,$acc1 # see if result is zero + or $acc2,$acc2,$acc3 + or. $acc0,$acc0,$acc2 + bne .Ladd_proceed # is_equal(U1,U2)? + + and. $t0,$in1infty,$in2infty + beq .Ladd_proceed # (in1infty || in2infty)? + + cmpldi $temp,0 + beq .Ladd_double # is_equal(S1,S2)? + + xor $a0,$a0,$a0 + std $a0,0($rp_real) + std $a0,8($rp_real) + std $a0,16($rp_real) + std $a0,24($rp_real) + std $a0,32($rp_real) + std $a0,40($rp_real) + std $a0,48($rp_real) + std $a0,56($rp_real) + std $a0,64($rp_real) + std $a0,72($rp_real) + std $a0,80($rp_real) + std $a0,88($rp_real) + b .Ladd_done + +.align 4 +.Ladd_double: + ld $bp,0($sp) # back-link + mr $ap,$ap_real + mr $rp,$rp_real + ld r16,$FRAME-8*16($sp) + ld r17,$FRAME-8*15($sp) + ld r18,$FRAME-8*14($sp) + ld r19,$FRAME-8*13($sp) + stdu $bp,$FRAME-288($sp) # difference in stack frame sizes + b .Ldouble_shortcut + +.align 4 +.Ladd_proceed: + addi $rp,$sp,$Rsqr + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R); + + ld $bi,64($ap_real) + ld $a0,$H+0($sp) + ld $a1,$H+8($sp) + ld $a2,$H+16($sp) + ld $a3,$H+24($sp) + addi $bp,$ap_real,64 + addi $rp,$sp,$res_z + bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z); + + ld $a0,$H+0($sp) + ld $a1,$H+8($sp) + ld $a2,$H+16($sp) + ld $a3,$H+24($sp) + addi $rp,$sp,$Hsqr + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H); + + ld $bi,64($bp_real) + ld $a0,$res_z+0($sp) + ld $a1,$res_z+8($sp) + ld $a2,$res_z+16($sp) + ld $a3,$res_z+24($sp) + addi $bp,$bp_real,64 + addi $rp,$sp,$res_z + bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, res_z, in2_z); + + ld $bi,$H($sp) + ld $a0,$Hsqr+0($sp) + ld $a1,$Hsqr+8($sp) + ld $a2,$Hsqr+16($sp) + ld $a3,$Hsqr+24($sp) + addi $bp,$sp,$H + addi $rp,$sp,$Hcub + bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H); + + ld $bi,$Hsqr($sp) + ld $a0,$U1+0($sp) + ld $a1,$U1+8($sp) + ld $a2,$U1+16($sp) + ld $a3,$U1+24($sp) + addi $bp,$sp,$Hsqr + addi $rp,$sp,$U2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, U1, Hsqr); + + mr $t0,$acc0 + mr $t1,$acc1 + mr $t2,$acc2 + mr $t3,$acc3 + addi $rp,$sp,$Hsqr + bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2); + + addi $bp,$sp,$Rsqr + addi $rp,$sp,$res_x + bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr); + + addi $bp,$sp,$Hcub + bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub); + + addi $bp,$sp,$U2 + ld $bi,$Hcub($sp) # forward load for p256_mul_mont + ld $a0,$S1+0($sp) + ld $a1,$S1+8($sp) + ld $a2,$S1+16($sp) + ld $a3,$S1+24($sp) + addi $rp,$sp,$res_y + bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x); + + addi $bp,$sp,$Hcub + addi $rp,$sp,$S2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S1, Hcub); + + ld $bi,$R($sp) + ld $a0,$res_y+0($sp) + ld $a1,$res_y+8($sp) + ld $a2,$res_y+16($sp) + ld $a3,$res_y+24($sp) + addi $bp,$sp,$R + addi $rp,$sp,$res_y + bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R); + + addi $bp,$sp,$S2 + bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2); + + ld $t0,0($bp_real) # in2 + ld $t1,8($bp_real) + ld $t2,16($bp_real) + ld $t3,24($bp_real) + ld $a0,$res_x+0($sp) # res + ld $a1,$res_x+8($sp) + ld $a2,$res_x+16($sp) + ld $a3,$res_x+24($sp) +___ +for($i=0;$i<64;$i+=32) { # conditional moves +$code.=<<___; + ld $acc0,$i+0($ap_real) # in1 + ld $acc1,$i+8($ap_real) + ld $acc2,$i+16($ap_real) + ld $acc3,$i+24($ap_real) + andc $t0,$t0,$in1infty + andc $t1,$t1,$in1infty + andc $t2,$t2,$in1infty + andc $t3,$t3,$in1infty + and $a0,$a0,$in1infty + and $a1,$a1,$in1infty + and $a2,$a2,$in1infty + and $a3,$a3,$in1infty + or $t0,$t0,$a0 + or $t1,$t1,$a1 + or $t2,$t2,$a2 + or $t3,$t3,$a3 + andc $acc0,$acc0,$in2infty + andc $acc1,$acc1,$in2infty + andc $acc2,$acc2,$in2infty + andc $acc3,$acc3,$in2infty + and $t0,$t0,$in2infty + and $t1,$t1,$in2infty + and $t2,$t2,$in2infty + and $t3,$t3,$in2infty + or $acc0,$acc0,$t0 + or $acc1,$acc1,$t1 + or $acc2,$acc2,$t2 + or $acc3,$acc3,$t3 + + ld $t0,$i+32($bp_real) # in2 + ld $t1,$i+40($bp_real) + ld $t2,$i+48($bp_real) + ld $t3,$i+56($bp_real) + ld $a0,$res_x+$i+32($sp) + ld $a1,$res_x+$i+40($sp) + ld $a2,$res_x+$i+48($sp) + ld $a3,$res_x+$i+56($sp) + std $acc0,$i+0($rp_real) + std $acc1,$i+8($rp_real) + std $acc2,$i+16($rp_real) + std $acc3,$i+24($rp_real) +___ +} +$code.=<<___; + ld $acc0,$i+0($ap_real) # in1 + ld $acc1,$i+8($ap_real) + ld $acc2,$i+16($ap_real) + ld $acc3,$i+24($ap_real) + andc $t0,$t0,$in1infty + andc $t1,$t1,$in1infty + andc $t2,$t2,$in1infty + andc $t3,$t3,$in1infty + and $a0,$a0,$in1infty + and $a1,$a1,$in1infty + and $a2,$a2,$in1infty + and $a3,$a3,$in1infty + or $t0,$t0,$a0 + or $t1,$t1,$a1 + or $t2,$t2,$a2 + or $t3,$t3,$a3 + andc $acc0,$acc0,$in2infty + andc $acc1,$acc1,$in2infty + andc $acc2,$acc2,$in2infty + andc $acc3,$acc3,$in2infty + and $t0,$t0,$in2infty + and $t1,$t1,$in2infty + and $t2,$t2,$in2infty + and $t3,$t3,$in2infty + or $acc0,$acc0,$t0 + or $acc1,$acc1,$t1 + or $acc2,$acc2,$t2 + or $acc3,$acc3,$t3 + std $acc0,$i+0($rp_real) + std $acc1,$i+8($rp_real) + std $acc2,$i+16($rp_real) + std $acc3,$i+24($rp_real) + +.Ladd_done: + mtlr r0 + ld r16,$FRAME-8*16($sp) + ld r17,$FRAME-8*15($sp) + ld r18,$FRAME-8*14($sp) + ld r19,$FRAME-8*13($sp) + ld r20,$FRAME-8*12($sp) + ld r21,$FRAME-8*11($sp) + ld r22,$FRAME-8*10($sp) + ld r23,$FRAME-8*9($sp) + ld r24,$FRAME-8*8($sp) + ld r25,$FRAME-8*7($sp) + ld r26,$FRAME-8*6($sp) + ld r27,$FRAME-8*5($sp) + ld r28,$FRAME-8*4($sp) + ld r29,$FRAME-8*3($sp) + ld r30,$FRAME-8*2($sp) + ld r31,$FRAME-8*1($sp) + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,0,0x80,16,3,0 + .long 0 +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +___ +} + +######################################################################## +# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT_AFFINE *in2); +if (1) { +my $FRAME = 64 + 32*10 + 16*8; +my ($res_x,$res_y,$res_z, + $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9)); +my $Z1sqr = $S2; +# above map() describes stack layout with 10 temporary +# 256-bit vectors on top. +my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21)); + +$code.=<<___; +.globl ecp_nistz256_point_add_affine +.align 5 +ecp_nistz256_point_add_affine: + stdu $sp,-$FRAME($sp) + mflr r0 + std r16,$FRAME-8*16($sp) + std r17,$FRAME-8*15($sp) + std r18,$FRAME-8*14($sp) + std r19,$FRAME-8*13($sp) + std r20,$FRAME-8*12($sp) + std r21,$FRAME-8*11($sp) + std r22,$FRAME-8*10($sp) + std r23,$FRAME-8*9($sp) + std r24,$FRAME-8*8($sp) + std r25,$FRAME-8*7($sp) + std r26,$FRAME-8*6($sp) + std r27,$FRAME-8*5($sp) + std r28,$FRAME-8*4($sp) + std r29,$FRAME-8*3($sp) + std r30,$FRAME-8*2($sp) + std r31,$FRAME-8*1($sp) + + li $poly1,-1 + srdi $poly1,$poly1,32 # 0x00000000ffffffff + li $poly3,1 + orc $poly3,$poly3,$poly1 # 0xffffffff00000001 + + mr $rp_real,$rp + mr $ap_real,$ap + mr $bp_real,$bp + + ld $a0,64($ap) # in1_z + ld $a1,72($ap) + ld $a2,80($ap) + ld $a3,88($ap) + or $t0,$a0,$a1 + or $t2,$a2,$a3 + or $in1infty,$t0,$t2 + neg $t0,$in1infty + or $in1infty,$in1infty,$t0 + sradi $in1infty,$in1infty,63 # !in1infty + + ld $acc0,0($bp) # in2_x + ld $acc1,8($bp) + ld $acc2,16($bp) + ld $acc3,24($bp) + ld $t0,32($bp) # in2_y + ld $t1,40($bp) + ld $t2,48($bp) + ld $t3,56($bp) + or $acc0,$acc0,$acc1 + or $acc2,$acc2,$acc3 + or $acc0,$acc0,$acc2 + or $t0,$t0,$t1 + or $t2,$t2,$t3 + or $t0,$t0,$t2 + or $in2infty,$acc0,$t0 + neg $t0,$in2infty + or $in2infty,$in2infty,$t0 + sradi $in2infty,$in2infty,63 # !in2infty + + addi $rp,$sp,$Z1sqr + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z); + + mr $a0,$acc0 + mr $a1,$acc1 + mr $a2,$acc2 + mr $a3,$acc3 + ld $bi,0($bp_real) + addi $bp,$bp_real,0 + addi $rp,$sp,$U2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, Z1sqr, in2_x); + + addi $bp,$ap_real,0 + ld $bi,64($ap_real) # forward load for p256_mul_mont + ld $a0,$Z1sqr+0($sp) + ld $a1,$Z1sqr+8($sp) + ld $a2,$Z1sqr+16($sp) + ld $a3,$Z1sqr+24($sp) + addi $rp,$sp,$H + bl __ecp_nistz256_sub_from # p256_sub(H, U2, in1_x); + + addi $bp,$ap_real,64 + addi $rp,$sp,$S2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z); + + ld $bi,64($ap_real) + ld $a0,$H+0($sp) + ld $a1,$H+8($sp) + ld $a2,$H+16($sp) + ld $a3,$H+24($sp) + addi $bp,$ap_real,64 + addi $rp,$sp,$res_z + bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z); + + ld $bi,32($bp_real) + ld $a0,$S2+0($sp) + ld $a1,$S2+8($sp) + ld $a2,$S2+16($sp) + ld $a3,$S2+24($sp) + addi $bp,$bp_real,32 + addi $rp,$sp,$S2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y); + + addi $bp,$ap_real,32 + ld $a0,$H+0($sp) # forward load for p256_sqr_mont + ld $a1,$H+8($sp) + ld $a2,$H+16($sp) + ld $a3,$H+24($sp) + addi $rp,$sp,$R + bl __ecp_nistz256_sub_from # p256_sub(R, S2, in1_y); + + addi $rp,$sp,$Hsqr + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H); + + ld $a0,$R+0($sp) + ld $a1,$R+8($sp) + ld $a2,$R+16($sp) + ld $a3,$R+24($sp) + addi $rp,$sp,$Rsqr + bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R); + + ld $bi,$H($sp) + ld $a0,$Hsqr+0($sp) + ld $a1,$Hsqr+8($sp) + ld $a2,$Hsqr+16($sp) + ld $a3,$Hsqr+24($sp) + addi $bp,$sp,$H + addi $rp,$sp,$Hcub + bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H); + + ld $bi,0($ap_real) + ld $a0,$Hsqr+0($sp) + ld $a1,$Hsqr+8($sp) + ld $a2,$Hsqr+16($sp) + ld $a3,$Hsqr+24($sp) + addi $bp,$ap_real,0 + addi $rp,$sp,$U2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in1_x, Hsqr); + + mr $t0,$acc0 + mr $t1,$acc1 + mr $t2,$acc2 + mr $t3,$acc3 + addi $rp,$sp,$Hsqr + bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2); + + addi $bp,$sp,$Rsqr + addi $rp,$sp,$res_x + bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr); + + addi $bp,$sp,$Hcub + bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub); + + addi $bp,$sp,$U2 + ld $bi,32($ap_real) # forward load for p256_mul_mont + ld $a0,$Hcub+0($sp) + ld $a1,$Hcub+8($sp) + ld $a2,$Hcub+16($sp) + ld $a3,$Hcub+24($sp) + addi $rp,$sp,$res_y + bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x); + + addi $bp,$ap_real,32 + addi $rp,$sp,$S2 + bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, in1_y, Hcub); + + ld $bi,$R($sp) + ld $a0,$res_y+0($sp) + ld $a1,$res_y+8($sp) + ld $a2,$res_y+16($sp) + ld $a3,$res_y+24($sp) + addi $bp,$sp,$R + addi $rp,$sp,$res_y + bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R); + + addi $bp,$sp,$S2 + bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2); + + ld $t0,0($bp_real) # in2 + ld $t1,8($bp_real) + ld $t2,16($bp_real) + ld $t3,24($bp_real) + ld $a0,$res_x+0($sp) # res + ld $a1,$res_x+8($sp) + ld $a2,$res_x+16($sp) + ld $a3,$res_x+24($sp) +___ +for($i=0;$i<64;$i+=32) { # conditional moves +$code.=<<___; + ld $acc0,$i+0($ap_real) # in1 + ld $acc1,$i+8($ap_real) + ld $acc2,$i+16($ap_real) + ld $acc3,$i+24($ap_real) + andc $t0,$t0,$in1infty + andc $t1,$t1,$in1infty + andc $t2,$t2,$in1infty + andc $t3,$t3,$in1infty + and $a0,$a0,$in1infty + and $a1,$a1,$in1infty + and $a2,$a2,$in1infty + and $a3,$a3,$in1infty + or $t0,$t0,$a0 + or $t1,$t1,$a1 + or $t2,$t2,$a2 + or $t3,$t3,$a3 + andc $acc0,$acc0,$in2infty + andc $acc1,$acc1,$in2infty + andc $acc2,$acc2,$in2infty + andc $acc3,$acc3,$in2infty + and $t0,$t0,$in2infty + and $t1,$t1,$in2infty + and $t2,$t2,$in2infty + and $t3,$t3,$in2infty + or $acc0,$acc0,$t0 + or $acc1,$acc1,$t1 + or $acc2,$acc2,$t2 + or $acc3,$acc3,$t3 +___ +$code.=<<___ if ($i==0); + ld $t0,32($bp_real) # in2 + ld $t1,40($bp_real) + ld $t2,48($bp_real) + ld $t3,56($bp_real) +___ +$code.=<<___ if ($i==32); + li $t0,1 # Lone_mont + not $t1,$poly1 + li $t2,-1 + not $t3,$poly3 +___ +$code.=<<___; + ld $a0,$res_x+$i+32($sp) + ld $a1,$res_x+$i+40($sp) + ld $a2,$res_x+$i+48($sp) + ld $a3,$res_x+$i+56($sp) + std $acc0,$i+0($rp_real) + std $acc1,$i+8($rp_real) + std $acc2,$i+16($rp_real) + std $acc3,$i+24($rp_real) +___ +} +$code.=<<___; + ld $acc0,$i+0($ap_real) # in1 + ld $acc1,$i+8($ap_real) + ld $acc2,$i+16($ap_real) + ld $acc3,$i+24($ap_real) + andc $t0,$t0,$in1infty + andc $t1,$t1,$in1infty + andc $t2,$t2,$in1infty + andc $t3,$t3,$in1infty + and $a0,$a0,$in1infty + and $a1,$a1,$in1infty + and $a2,$a2,$in1infty + and $a3,$a3,$in1infty + or $t0,$t0,$a0 + or $t1,$t1,$a1 + or $t2,$t2,$a2 + or $t3,$t3,$a3 + andc $acc0,$acc0,$in2infty + andc $acc1,$acc1,$in2infty + andc $acc2,$acc2,$in2infty + andc $acc3,$acc3,$in2infty + and $t0,$t0,$in2infty + and $t1,$t1,$in2infty + and $t2,$t2,$in2infty + and $t3,$t3,$in2infty + or $acc0,$acc0,$t0 + or $acc1,$acc1,$t1 + or $acc2,$acc2,$t2 + or $acc3,$acc3,$t3 + std $acc0,$i+0($rp_real) + std $acc1,$i+8($rp_real) + std $acc2,$i+16($rp_real) + std $acc3,$i+24($rp_real) + + mtlr r0 + ld r16,$FRAME-8*16($sp) + ld r17,$FRAME-8*15($sp) + ld r18,$FRAME-8*14($sp) + ld r19,$FRAME-8*13($sp) + ld r20,$FRAME-8*12($sp) + ld r21,$FRAME-8*11($sp) + ld r22,$FRAME-8*10($sp) + ld r23,$FRAME-8*9($sp) + ld r24,$FRAME-8*8($sp) + ld r25,$FRAME-8*7($sp) + ld r26,$FRAME-8*6($sp) + ld r27,$FRAME-8*5($sp) + ld r28,$FRAME-8*4($sp) + ld r29,$FRAME-8*3($sp) + ld r30,$FRAME-8*2($sp) + ld r31,$FRAME-8*1($sp) + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,0,0x80,16,3,0 + .long 0 +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +___ +} +if (1) { +my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21)); +my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0"); + +$code.=<<___; +######################################################################## +# void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +# uint64_t b[4]); +.globl ecp_nistz256_ord_mul_mont +.align 5 +ecp_nistz256_ord_mul_mont: + stdu $sp,-160($sp) + std r18,48($sp) + std r19,56($sp) + std r20,64($sp) + std r21,72($sp) + std r22,80($sp) + std r23,88($sp) + std r24,96($sp) + std r25,104($sp) + std r26,112($sp) + std r27,120($sp) + std r28,128($sp) + std r29,136($sp) + std r30,144($sp) + std r31,152($sp) + + ld $a0,0($ap) + ld $bi,0($bp) + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + + lis $ordk,0xccd1 + lis $ord0,0xf3b9 + lis $ord1,0xbce6 + ori $ordk,$ordk,0xc8aa + ori $ord0,$ord0,0xcac2 + ori $ord1,$ord1,0xfaad + sldi $ordk,$ordk,32 + sldi $ord0,$ord0,32 + sldi $ord1,$ord1,32 + oris $ordk,$ordk,0xee00 + oris $ord0,$ord0,0xfc63 + oris $ord1,$ord1,0xa717 + ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f + ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551 + ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84 + li $ord2,-1 # 0xffffffffffffffff + sldi $ord3,$ord2,32 # 0xffffffff00000000 + li $zr,0 + + mulld $acc0,$a0,$bi # a[0]*b[0] + mulhdu $t0,$a0,$bi + + mulld $acc1,$a1,$bi # a[1]*b[0] + mulhdu $t1,$a1,$bi + + mulld $acc2,$a2,$bi # a[2]*b[0] + mulhdu $t2,$a2,$bi + + mulld $acc3,$a3,$bi # a[3]*b[0] + mulhdu $acc4,$a3,$bi + + mulld $t4,$acc0,$ordk + + addc $acc1,$acc1,$t0 # accumulate high parts of multiplication + adde $acc2,$acc2,$t1 + adde $acc3,$acc3,$t2 + addze $acc4,$acc4 + li $acc5,0 +___ +for ($i=1;$i<4;$i++) { + ################################################################ + # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx + # - 0000abcd.efgh0000.abcdefgh.00000000.00000000 + # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh +$code.=<<___; + ld $bi,8*$i($bp) # b[i] + + sldi $t0,$t4,32 + subfc $acc2,$t4,$acc2 + srdi $t1,$t4,32 + subfe $acc3,$t0,$acc3 + subfe $acc4,$t1,$acc4 + subfe $acc5,$zr,$acc5 + + addic $t0,$acc0,-1 # discarded + mulhdu $t1,$ord0,$t4 + mulld $t2,$ord1,$t4 + mulhdu $t3,$ord1,$t4 + + adde $t2,$t2,$t1 + mulld $t0,$a0,$bi + addze $t3,$t3 + mulld $t1,$a1,$bi + + addc $acc0,$acc1,$t2 + mulld $t2,$a2,$bi + adde $acc1,$acc2,$t3 + mulld $t3,$a3,$bi + adde $acc2,$acc3,$t4 + adde $acc3,$acc4,$t4 + addze $acc4,$acc5 + + addc $acc0,$acc0,$t0 # accumulate low parts + mulhdu $t0,$a0,$bi + adde $acc1,$acc1,$t1 + mulhdu $t1,$a1,$bi + adde $acc2,$acc2,$t2 + mulhdu $t2,$a2,$bi + adde $acc3,$acc3,$t3 + mulhdu $t3,$a3,$bi + addze $acc4,$acc4 + mulld $t4,$acc0,$ordk + addc $acc1,$acc1,$t0 # accumulate high parts + adde $acc2,$acc2,$t1 + adde $acc3,$acc3,$t2 + adde $acc4,$acc4,$t3 + addze $acc5,$zr +___ +} +$code.=<<___; + sldi $t0,$t4,32 # last reduction + subfc $acc2,$t4,$acc2 + srdi $t1,$t4,32 + subfe $acc3,$t0,$acc3 + subfe $acc4,$t1,$acc4 + subfe $acc5,$zr,$acc5 + + addic $t0,$acc0,-1 # discarded + mulhdu $t1,$ord0,$t4 + mulld $t2,$ord1,$t4 + mulhdu $t3,$ord1,$t4 + + adde $t2,$t2,$t1 + addze $t3,$t3 + + addc $acc0,$acc1,$t2 + adde $acc1,$acc2,$t3 + adde $acc2,$acc3,$t4 + adde $acc3,$acc4,$t4 + addze $acc4,$acc5 + + subfc $acc0,$ord0,$acc0 # ret -= modulus + subfe $acc1,$ord1,$acc1 + subfe $acc2,$ord2,$acc2 + subfe $acc3,$ord3,$acc3 + subfe $acc4,$zr,$acc4 + + and $t0,$ord0,$acc4 + and $t1,$ord1,$acc4 + addc $acc0,$acc0,$t0 # ret += modulus if borrow + and $t3,$ord3,$acc4 + adde $acc1,$acc1,$t1 + adde $acc2,$acc2,$acc4 + adde $acc3,$acc3,$t3 + + std $acc0,0($rp) + std $acc1,8($rp) + std $acc2,16($rp) + std $acc3,24($rp) + + ld r18,48($sp) + ld r19,56($sp) + ld r20,64($sp) + ld r21,72($sp) + ld r22,80($sp) + ld r23,88($sp) + ld r24,96($sp) + ld r25,104($sp) + ld r26,112($sp) + ld r27,120($sp) + ld r28,128($sp) + ld r29,136($sp) + ld r30,144($sp) + ld r31,152($sp) + addi $sp,$sp,160 + blr + .long 0 + .byte 0,12,4,0,0x80,14,3,0 + .long 0 +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont + +################################################################################ +# void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +# int rep); +.globl ecp_nistz256_ord_sqr_mont +.align 5 +ecp_nistz256_ord_sqr_mont: + stdu $sp,-160($sp) + std r18,48($sp) + std r19,56($sp) + std r20,64($sp) + std r21,72($sp) + std r22,80($sp) + std r23,88($sp) + std r24,96($sp) + std r25,104($sp) + std r26,112($sp) + std r27,120($sp) + std r28,128($sp) + std r29,136($sp) + std r30,144($sp) + std r31,152($sp) + + mtctr $bp + + ld $a0,0($ap) + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + + lis $ordk,0xccd1 + lis $ord0,0xf3b9 + lis $ord1,0xbce6 + ori $ordk,$ordk,0xc8aa + ori $ord0,$ord0,0xcac2 + ori $ord1,$ord1,0xfaad + sldi $ordk,$ordk,32 + sldi $ord0,$ord0,32 + sldi $ord1,$ord1,32 + oris $ordk,$ordk,0xee00 + oris $ord0,$ord0,0xfc63 + oris $ord1,$ord1,0xa717 + ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f + ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551 + ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84 + li $ord2,-1 # 0xffffffffffffffff + sldi $ord3,$ord2,32 # 0xffffffff00000000 + li $zr,0 + b .Loop_ord_sqr + +.align 5 +.Loop_ord_sqr: + ################################################################ + # | | | | | |a1*a0| | + # | | | | |a2*a0| | | + # | |a3*a2|a3*a0| | | | + # | | | |a2*a1| | | | + # | | |a3*a1| | | | | + # *| | | | | | | | 2| + # +|a3*a3|a2*a2|a1*a1|a0*a0| + # |--+--+--+--+--+--+--+--| + # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + # + # "can't overflow" below mark carrying into high part of + # multiplication result, which can't overflow, because it + # can never be all ones. + + mulld $acc1,$a1,$a0 # a[1]*a[0] + mulhdu $t1,$a1,$a0 + mulld $acc2,$a2,$a0 # a[2]*a[0] + mulhdu $t2,$a2,$a0 + mulld $acc3,$a3,$a0 # a[3]*a[0] + mulhdu $acc4,$a3,$a0 + + addc $acc2,$acc2,$t1 # accumulate high parts of multiplication + mulld $t0,$a2,$a1 # a[2]*a[1] + mulhdu $t1,$a2,$a1 + adde $acc3,$acc3,$t2 + mulld $t2,$a3,$a1 # a[3]*a[1] + mulhdu $t3,$a3,$a1 + addze $acc4,$acc4 # can't overflow + + mulld $acc5,$a3,$a2 # a[3]*a[2] + mulhdu $acc6,$a3,$a2 + + addc $t1,$t1,$t2 # accumulate high parts of multiplication + mulld $acc0,$a0,$a0 # a[0]*a[0] + addze $t2,$t3 # can't overflow + + addc $acc3,$acc3,$t0 # accumulate low parts of multiplication + mulhdu $a0,$a0,$a0 + adde $acc4,$acc4,$t1 + mulld $t1,$a1,$a1 # a[1]*a[1] + adde $acc5,$acc5,$t2 + mulhdu $a1,$a1,$a1 + addze $acc6,$acc6 # can't overflow + + addc $acc1,$acc1,$acc1 # acc[1-6]*=2 + mulld $t2,$a2,$a2 # a[2]*a[2] + adde $acc2,$acc2,$acc2 + mulhdu $a2,$a2,$a2 + adde $acc3,$acc3,$acc3 + mulld $t3,$a3,$a3 # a[3]*a[3] + adde $acc4,$acc4,$acc4 + mulhdu $a3,$a3,$a3 + adde $acc5,$acc5,$acc5 + adde $acc6,$acc6,$acc6 + addze $acc7,$zr + + addc $acc1,$acc1,$a0 # +a[i]*a[i] + mulld $t4,$acc0,$ordk + adde $acc2,$acc2,$t1 + adde $acc3,$acc3,$a1 + adde $acc4,$acc4,$t2 + adde $acc5,$acc5,$a2 + adde $acc6,$acc6,$t3 + adde $acc7,$acc7,$a3 +___ +for($i=0; $i<4; $i++) { # reductions +$code.=<<___; + addic $t0,$acc0,-1 # discarded + mulhdu $t1,$ord0,$t4 + mulld $t2,$ord1,$t4 + mulhdu $t3,$ord1,$t4 + + adde $t2,$t2,$t1 + addze $t3,$t3 + + addc $acc0,$acc1,$t2 + adde $acc1,$acc2,$t3 + adde $acc2,$acc3,$t4 + adde $acc3,$zr,$t4 # can't overflow +___ +$code.=<<___ if ($i<3); + mulld $t3,$acc0,$ordk +___ +$code.=<<___; + sldi $t0,$t4,32 + subfc $acc1,$t4,$acc1 + srdi $t1,$t4,32 + subfe $acc2,$t0,$acc2 + subfe $acc3,$t1,$acc3 # can't borrow +___ + ($t3,$t4) = ($t4,$t3); +} +$code.=<<___; + addc $acc0,$acc0,$acc4 # accumulate upper half + adde $acc1,$acc1,$acc5 + adde $acc2,$acc2,$acc6 + adde $acc3,$acc3,$acc7 + addze $acc4,$zr + + subfc $acc0,$ord0,$acc0 # ret -= modulus + subfe $acc1,$ord1,$acc1 + subfe $acc2,$ord2,$acc2 + subfe $acc3,$ord3,$acc3 + subfe $acc4,$zr,$acc4 + + and $t0,$ord0,$acc4 + and $t1,$ord1,$acc4 + addc $a0,$acc0,$t0 # ret += modulus if borrow + and $t3,$ord3,$acc4 + adde $a1,$acc1,$t1 + adde $a2,$acc2,$acc4 + adde $a3,$acc3,$t3 + + bdnz .Loop_ord_sqr + + std $a0,0($rp) + std $a1,8($rp) + std $a2,16($rp) + std $a3,24($rp) + + ld r18,48($sp) + ld r19,56($sp) + ld r20,64($sp) + ld r21,72($sp) + ld r22,80($sp) + ld r23,88($sp) + ld r24,96($sp) + ld r25,104($sp) + ld r26,112($sp) + ld r27,120($sp) + ld r28,128($sp) + ld r29,136($sp) + ld r30,144($sp) + ld r31,152($sp) + addi $sp,$sp,160 + blr + .long 0 + .byte 0,12,4,0,0x80,14,3,0 + .long 0 +.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +___ +} } + +######################################################################## +# scatter-gather subroutines +{ +my ($out,$inp,$index,$mask)=map("r$_",(3..7)); +$code.=<<___; +######################################################################## +# void ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp, +# int index); +.globl ecp_nistz256_scatter_w5 +.align 4 +ecp_nistz256_scatter_w5: + slwi $index,$index,2 + add $out,$out,$index + + ld r8, 0($inp) # X + ld r9, 8($inp) + ld r10,16($inp) + ld r11,24($inp) + + stw r8, 64*0-4($out) + srdi r8, r8, 32 + stw r9, 64*1-4($out) + srdi r9, r9, 32 + stw r10,64*2-4($out) + srdi r10,r10,32 + stw r11,64*3-4($out) + srdi r11,r11,32 + stw r8, 64*4-4($out) + stw r9, 64*5-4($out) + stw r10,64*6-4($out) + stw r11,64*7-4($out) + addi $out,$out,64*8 + + ld r8, 32($inp) # Y + ld r9, 40($inp) + ld r10,48($inp) + ld r11,56($inp) + + stw r8, 64*0-4($out) + srdi r8, r8, 32 + stw r9, 64*1-4($out) + srdi r9, r9, 32 + stw r10,64*2-4($out) + srdi r10,r10,32 + stw r11,64*3-4($out) + srdi r11,r11,32 + stw r8, 64*4-4($out) + stw r9, 64*5-4($out) + stw r10,64*6-4($out) + stw r11,64*7-4($out) + addi $out,$out,64*8 + + ld r8, 64($inp) # Z + ld r9, 72($inp) + ld r10,80($inp) + ld r11,88($inp) + + stw r8, 64*0-4($out) + srdi r8, r8, 32 + stw r9, 64*1-4($out) + srdi r9, r9, 32 + stw r10,64*2-4($out) + srdi r10,r10,32 + stw r11,64*3-4($out) + srdi r11,r11,32 + stw r8, 64*4-4($out) + stw r9, 64*5-4($out) + stw r10,64*6-4($out) + stw r11,64*7-4($out) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 + +######################################################################## +# void ecp_nistz256_gather_w5(P256_POINT *out, const void *inp, +# int index); +.globl ecp_nistz256_gather_w5 +.align 4 +ecp_nistz256_gather_w5: + neg r0,$index + sradi r0,r0,63 + + add $index,$index,r0 + slwi $index,$index,2 + add $inp,$inp,$index + + lwz r5, 64*0($inp) + lwz r6, 64*1($inp) + lwz r7, 64*2($inp) + lwz r8, 64*3($inp) + lwz r9, 64*4($inp) + lwz r10,64*5($inp) + lwz r11,64*6($inp) + lwz r12,64*7($inp) + addi $inp,$inp,64*8 + sldi r9, r9, 32 + sldi r10,r10,32 + sldi r11,r11,32 + sldi r12,r12,32 + or r5,r5,r9 + or r6,r6,r10 + or r7,r7,r11 + or r8,r8,r12 + and r5,r5,r0 + and r6,r6,r0 + and r7,r7,r0 + and r8,r8,r0 + std r5,0($out) # X + std r6,8($out) + std r7,16($out) + std r8,24($out) + + lwz r5, 64*0($inp) + lwz r6, 64*1($inp) + lwz r7, 64*2($inp) + lwz r8, 64*3($inp) + lwz r9, 64*4($inp) + lwz r10,64*5($inp) + lwz r11,64*6($inp) + lwz r12,64*7($inp) + addi $inp,$inp,64*8 + sldi r9, r9, 32 + sldi r10,r10,32 + sldi r11,r11,32 + sldi r12,r12,32 + or r5,r5,r9 + or r6,r6,r10 + or r7,r7,r11 + or r8,r8,r12 + and r5,r5,r0 + and r6,r6,r0 + and r7,r7,r0 + and r8,r8,r0 + std r5,32($out) # Y + std r6,40($out) + std r7,48($out) + std r8,56($out) + + lwz r5, 64*0($inp) + lwz r6, 64*1($inp) + lwz r7, 64*2($inp) + lwz r8, 64*3($inp) + lwz r9, 64*4($inp) + lwz r10,64*5($inp) + lwz r11,64*6($inp) + lwz r12,64*7($inp) + sldi r9, r9, 32 + sldi r10,r10,32 + sldi r11,r11,32 + sldi r12,r12,32 + or r5,r5,r9 + or r6,r6,r10 + or r7,r7,r11 + or r8,r8,r12 + and r5,r5,r0 + and r6,r6,r0 + and r7,r7,r0 + and r8,r8,r0 + std r5,64($out) # Z + std r6,72($out) + std r7,80($out) + std r8,88($out) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 + +######################################################################## +# void ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp, +# int index); +.globl ecp_nistz256_scatter_w7 +.align 4 +ecp_nistz256_scatter_w7: + li r0,8 + mtctr r0 + add $out,$out,$index + subi $inp,$inp,8 + +.Loop_scatter_w7: + ldu r0,8($inp) + stb r0,64*0($out) + srdi r0,r0,8 + stb r0,64*1($out) + srdi r0,r0,8 + stb r0,64*2($out) + srdi r0,r0,8 + stb r0,64*3($out) + srdi r0,r0,8 + stb r0,64*4($out) + srdi r0,r0,8 + stb r0,64*5($out) + srdi r0,r0,8 + stb r0,64*6($out) + srdi r0,r0,8 + stb r0,64*7($out) + addi $out,$out,64*8 + bdnz .Loop_scatter_w7 + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 + +######################################################################## +# void ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp, +# int index); +.globl ecp_nistz256_gather_w7 +.align 4 +ecp_nistz256_gather_w7: + li r0,8 + mtctr r0 + neg r0,$index + sradi r0,r0,63 + + add $index,$index,r0 + add $inp,$inp,$index + subi $out,$out,8 + +.Loop_gather_w7: + lbz r5, 64*0($inp) + lbz r6, 64*1($inp) + lbz r7, 64*2($inp) + lbz r8, 64*3($inp) + lbz r9, 64*4($inp) + lbz r10,64*5($inp) + lbz r11,64*6($inp) + lbz r12,64*7($inp) + addi $inp,$inp,64*8 + + sldi r6, r6, 8 + sldi r7, r7, 16 + sldi r8, r8, 24 + sldi r9, r9, 32 + sldi r10,r10,40 + sldi r11,r11,48 + sldi r12,r12,56 + + or r5,r5,r6 + or r7,r7,r8 + or r9,r9,r10 + or r11,r11,r12 + or r5,r5,r7 + or r9,r9,r11 + or r5,r5,r9 + and r5,r5,r0 + stdu r5,8($out) + bdnz .Loop_gather_w7 + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-sparcv9.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-sparcv9.pl new file mode 100755 index 000000000..042e12271 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-sparcv9.pl @@ -0,0 +1,3060 @@ +#! /usr/bin/env perl +# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# ECP_NISTZ256 module for SPARCv9. +# +# February 2015. +# +# Original ECP_NISTZ256 submission targeting x86_64 is detailed in +# http://eprint.iacr.org/2013/816. In the process of adaptation +# original .c module was made 32-bit savvy in order to make this +# implementation possible. +# +# with/without -DECP_NISTZ256_ASM +# UltraSPARC III +12-18% +# SPARC T4 +99-550% (+66-150% on 32-bit Solaris) +# +# Ranges denote minimum and maximum improvement coefficients depending +# on benchmark. Lower coefficients are for ECDSA sign, server-side +# operation. Keep in mind that +200% means 3x improvement. + +$output = pop; +open STDOUT,">$output"; + +$code.=<<___; +#include "sparc_arch.h" + +#define LOCALS (STACK_BIAS+STACK_FRAME) +#ifdef __arch64__ +.register %g2,#scratch +.register %g3,#scratch +# define STACK64_FRAME STACK_FRAME +# define LOCALS64 LOCALS +#else +# define STACK64_FRAME (2047+192) +# define LOCALS64 STACK64_FRAME +#endif + +.section ".text",#alloc,#execinstr +___ +######################################################################## +# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 +# +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +open TABLE,"<ecp_nistz256_table.c" or +open TABLE,"<${dir}../ecp_nistz256_table.c" or +die "failed to open ecp_nistz256_table.c:",$!; + +use integer; + +foreach(<TABLE>) { + s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; +} +close TABLE; + +# See ecp_nistz256_table.c for explanation for why it's 64*16*37. +# 64*16*37-1 is because $#arr returns last valid index or @arr, not +# amount of elements. +die "insane number of elements" if ($#arr != 64*16*37-1); + +$code.=<<___; +.globl ecp_nistz256_precomputed +.align 4096 +ecp_nistz256_precomputed: +___ +######################################################################## +# this conversion smashes P256_POINT_AFFINE by individual bytes with +# 64 byte interval, similar to +# 1111222233334444 +# 1234123412341234 +for(1..37) { + @tbl = splice(@arr,0,64*16); + for($i=0;$i<64;$i++) { + undef @line; + for($j=0;$j<64;$j++) { + push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; + } + $code.=".byte\t"; + $code.=join(',',map { sprintf "0x%02x",$_} @line); + $code.="\n"; + } +} + +{{{ +my ($rp,$ap,$bp)=map("%i$_",(0..2)); +my @acc=map("%l$_",(0..7)); +my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7)=(map("%o$_",(0..5)),"%g4","%g5"); +my ($bi,$a0,$mask,$carry)=(map("%i$_",(3..5)),"%g1"); +my ($rp_real,$ap_real)=("%g2","%g3"); + +$code.=<<___; +.type ecp_nistz256_precomputed,#object +.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed +.align 64 +.LRR: ! 2^512 mod P precomputed for NIST P256 polynomial +.long 0x00000003, 0x00000000, 0xffffffff, 0xfffffffb +.long 0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004 +.Lone: +.long 1,0,0,0,0,0,0,0 +.asciz "ECP_NISTZ256 for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" + +! void ecp_nistz256_to_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]); +.globl ecp_nistz256_to_mont +.align 64 +ecp_nistz256_to_mont: + save %sp,-STACK_FRAME,%sp + nop +1: call .+8 + add %o7,.LRR-1b,$bp + call __ecp_nistz256_mul_mont + nop + ret + restore +.type ecp_nistz256_to_mont,#function +.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont + +! void ecp_nistz256_from_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]); +.globl ecp_nistz256_from_mont +.align 32 +ecp_nistz256_from_mont: + save %sp,-STACK_FRAME,%sp + nop +1: call .+8 + add %o7,.Lone-1b,$bp + call __ecp_nistz256_mul_mont + nop + ret + restore +.type ecp_nistz256_from_mont,#function +.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont + +! void ecp_nistz256_mul_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8], +! const BN_ULONG %i2[8]); +.globl ecp_nistz256_mul_mont +.align 32 +ecp_nistz256_mul_mont: + save %sp,-STACK_FRAME,%sp + nop + call __ecp_nistz256_mul_mont + nop + ret + restore +.type ecp_nistz256_mul_mont,#function +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +! void ecp_nistz256_sqr_mont(BN_ULONG %i0[8],const BN_ULONG %i2[8]); +.globl ecp_nistz256_sqr_mont +.align 32 +ecp_nistz256_sqr_mont: + save %sp,-STACK_FRAME,%sp + mov $ap,$bp + call __ecp_nistz256_mul_mont + nop + ret + restore +.type ecp_nistz256_sqr_mont,#function +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont +___ + +######################################################################## +# Special thing to keep in mind is that $t0-$t7 hold 64-bit values, +# while all others are meant to keep 32. "Meant to" means that additions +# to @acc[0-7] do "contaminate" upper bits, but they are cleared before +# they can affect outcome (follow 'and' with $mask). Also keep in mind +# that addition with carry is addition with 32-bit carry, even though +# CPU is 64-bit. [Addition with 64-bit carry was introduced in T3, see +# below for VIS3 code paths.] + +$code.=<<___; +.align 32 +__ecp_nistz256_mul_mont: + ld [$bp+0],$bi ! b[0] + mov -1,$mask + ld [$ap+0],$a0 + srl $mask,0,$mask ! 0xffffffff + ld [$ap+4],$t1 + ld [$ap+8],$t2 + ld [$ap+12],$t3 + ld [$ap+16],$t4 + ld [$ap+20],$t5 + ld [$ap+24],$t6 + ld [$ap+28],$t7 + mulx $a0,$bi,$t0 ! a[0-7]*b[0], 64-bit results + mulx $t1,$bi,$t1 + mulx $t2,$bi,$t2 + mulx $t3,$bi,$t3 + mulx $t4,$bi,$t4 + mulx $t5,$bi,$t5 + mulx $t6,$bi,$t6 + mulx $t7,$bi,$t7 + srlx $t0,32,@acc[1] ! extract high parts + srlx $t1,32,@acc[2] + srlx $t2,32,@acc[3] + srlx $t3,32,@acc[4] + srlx $t4,32,@acc[5] + srlx $t5,32,@acc[6] + srlx $t6,32,@acc[7] + srlx $t7,32,@acc[0] ! "@acc[8]" + mov 0,$carry +___ +for($i=1;$i<8;$i++) { +$code.=<<___; + addcc @acc[1],$t1,@acc[1] ! accumulate high parts + ld [$bp+4*$i],$bi ! b[$i] + ld [$ap+4],$t1 ! re-load a[1-7] + addccc @acc[2],$t2,@acc[2] + addccc @acc[3],$t3,@acc[3] + ld [$ap+8],$t2 + ld [$ap+12],$t3 + addccc @acc[4],$t4,@acc[4] + addccc @acc[5],$t5,@acc[5] + ld [$ap+16],$t4 + ld [$ap+20],$t5 + addccc @acc[6],$t6,@acc[6] + addccc @acc[7],$t7,@acc[7] + ld [$ap+24],$t6 + ld [$ap+28],$t7 + addccc @acc[0],$carry,@acc[0] ! "@acc[8]" + addc %g0,%g0,$carry +___ + # Reduction iteration is normally performed by accumulating + # result of multiplication of modulus by "magic" digit [and + # omitting least significant word, which is guaranteed to + # be 0], but thanks to special form of modulus and "magic" + # digit being equal to least significant word, it can be + # performed with additions and subtractions alone. Indeed: + # + # ffff.0001.0000.0000.0000.ffff.ffff.ffff + # * abcd + # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd + # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 + # - abcd.0000.0000.0000.0000.0000.0000.abcd + # + # or marking redundant operations: + # + # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- + # + abcd.0000.abcd.0000.0000.abcd.----.----.---- + # - abcd.----.----.----.----.----.----.---- + +$code.=<<___; + ! multiplication-less reduction + addcc @acc[3],$t0,@acc[3] ! r[3]+=r[0] + addccc @acc[4],%g0,@acc[4] ! r[4]+=0 + and @acc[1],$mask,@acc[1] + and @acc[2],$mask,@acc[2] + addccc @acc[5],%g0,@acc[5] ! r[5]+=0 + addccc @acc[6],$t0,@acc[6] ! r[6]+=r[0] + and @acc[3],$mask,@acc[3] + and @acc[4],$mask,@acc[4] + addccc @acc[7],%g0,@acc[7] ! r[7]+=0 + addccc @acc[0],$t0,@acc[0] ! r[8]+=r[0] "@acc[8]" + and @acc[5],$mask,@acc[5] + and @acc[6],$mask,@acc[6] + addc $carry,%g0,$carry ! top-most carry + subcc @acc[7],$t0,@acc[7] ! r[7]-=r[0] + subccc @acc[0],%g0,@acc[0] ! r[8]-=0 "@acc[8]" + subc $carry,%g0,$carry ! top-most carry + and @acc[7],$mask,@acc[7] + and @acc[0],$mask,@acc[0] ! "@acc[8]" +___ + push(@acc,shift(@acc)); # rotate registers to "omit" acc[0] +$code.=<<___; + mulx $a0,$bi,$t0 ! a[0-7]*b[$i], 64-bit results + mulx $t1,$bi,$t1 + mulx $t2,$bi,$t2 + mulx $t3,$bi,$t3 + mulx $t4,$bi,$t4 + mulx $t5,$bi,$t5 + mulx $t6,$bi,$t6 + mulx $t7,$bi,$t7 + add @acc[0],$t0,$t0 ! accumulate low parts, can't overflow + add @acc[1],$t1,$t1 + srlx $t0,32,@acc[1] ! extract high parts + add @acc[2],$t2,$t2 + srlx $t1,32,@acc[2] + add @acc[3],$t3,$t3 + srlx $t2,32,@acc[3] + add @acc[4],$t4,$t4 + srlx $t3,32,@acc[4] + add @acc[5],$t5,$t5 + srlx $t4,32,@acc[5] + add @acc[6],$t6,$t6 + srlx $t5,32,@acc[6] + add @acc[7],$t7,$t7 + srlx $t6,32,@acc[7] + srlx $t7,32,@acc[0] ! "@acc[8]" +___ +} +$code.=<<___; + addcc @acc[1],$t1,@acc[1] ! accumulate high parts + addccc @acc[2],$t2,@acc[2] + addccc @acc[3],$t3,@acc[3] + addccc @acc[4],$t4,@acc[4] + addccc @acc[5],$t5,@acc[5] + addccc @acc[6],$t6,@acc[6] + addccc @acc[7],$t7,@acc[7] + addccc @acc[0],$carry,@acc[0] ! "@acc[8]" + addc %g0,%g0,$carry + + addcc @acc[3],$t0,@acc[3] ! multiplication-less reduction + addccc @acc[4],%g0,@acc[4] + addccc @acc[5],%g0,@acc[5] + addccc @acc[6],$t0,@acc[6] + addccc @acc[7],%g0,@acc[7] + addccc @acc[0],$t0,@acc[0] ! "@acc[8]" + addc $carry,%g0,$carry + subcc @acc[7],$t0,@acc[7] + subccc @acc[0],%g0,@acc[0] ! "@acc[8]" + subc $carry,%g0,$carry ! top-most carry +___ + push(@acc,shift(@acc)); # rotate registers to omit acc[0] +$code.=<<___; + ! Final step is "if result > mod, subtract mod", but we do it + ! "other way around", namely subtract modulus from result + ! and if it borrowed, add modulus back. + + subcc @acc[0],-1,@acc[0] ! subtract modulus + subccc @acc[1],-1,@acc[1] + subccc @acc[2],-1,@acc[2] + subccc @acc[3],0,@acc[3] + subccc @acc[4],0,@acc[4] + subccc @acc[5],0,@acc[5] + subccc @acc[6],1,@acc[6] + subccc @acc[7],-1,@acc[7] + subc $carry,0,$carry ! broadcast borrow bit + + ! Note that because mod has special form, i.e. consists of + ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by + ! using value of broadcasted borrow and the borrow bit itself. + ! To minimize dependency chain we first broadcast and then + ! extract the bit by negating (follow $bi). + + addcc @acc[0],$carry,@acc[0] ! add modulus or zero + addccc @acc[1],$carry,@acc[1] + neg $carry,$bi + st @acc[0],[$rp] + addccc @acc[2],$carry,@acc[2] + st @acc[1],[$rp+4] + addccc @acc[3],0,@acc[3] + st @acc[2],[$rp+8] + addccc @acc[4],0,@acc[4] + st @acc[3],[$rp+12] + addccc @acc[5],0,@acc[5] + st @acc[4],[$rp+16] + addccc @acc[6],$bi,@acc[6] + st @acc[5],[$rp+20] + addc @acc[7],$carry,@acc[7] + st @acc[6],[$rp+24] + retl + st @acc[7],[$rp+28] +.type __ecp_nistz256_mul_mont,#function +.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont + +! void ecp_nistz256_add(BN_ULONG %i0[8],const BN_ULONG %i1[8], +! const BN_ULONG %i2[8]); +.globl ecp_nistz256_add +.align 32 +ecp_nistz256_add: + save %sp,-STACK_FRAME,%sp + ld [$ap],@acc[0] + ld [$ap+4],@acc[1] + ld [$ap+8],@acc[2] + ld [$ap+12],@acc[3] + ld [$ap+16],@acc[4] + ld [$ap+20],@acc[5] + ld [$ap+24],@acc[6] + call __ecp_nistz256_add + ld [$ap+28],@acc[7] + ret + restore +.type ecp_nistz256_add,#function +.size ecp_nistz256_add,.-ecp_nistz256_add + +.align 32 +__ecp_nistz256_add: + ld [$bp+0],$t0 ! b[0] + ld [$bp+4],$t1 + ld [$bp+8],$t2 + ld [$bp+12],$t3 + addcc @acc[0],$t0,@acc[0] + ld [$bp+16],$t4 + ld [$bp+20],$t5 + addccc @acc[1],$t1,@acc[1] + ld [$bp+24],$t6 + ld [$bp+28],$t7 + addccc @acc[2],$t2,@acc[2] + addccc @acc[3],$t3,@acc[3] + addccc @acc[4],$t4,@acc[4] + addccc @acc[5],$t5,@acc[5] + addccc @acc[6],$t6,@acc[6] + addccc @acc[7],$t7,@acc[7] + addc %g0,%g0,$carry + +.Lreduce_by_sub: + + ! if a+b >= modulus, subtract modulus. + ! + ! But since comparison implies subtraction, we subtract + ! modulus and then add it back if subtraction borrowed. + + subcc @acc[0],-1,@acc[0] + subccc @acc[1],-1,@acc[1] + subccc @acc[2],-1,@acc[2] + subccc @acc[3], 0,@acc[3] + subccc @acc[4], 0,@acc[4] + subccc @acc[5], 0,@acc[5] + subccc @acc[6], 1,@acc[6] + subccc @acc[7],-1,@acc[7] + subc $carry,0,$carry + + ! Note that because mod has special form, i.e. consists of + ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by + ! using value of borrow and its negative. + + addcc @acc[0],$carry,@acc[0] ! add synthesized modulus + addccc @acc[1],$carry,@acc[1] + neg $carry,$bi + st @acc[0],[$rp] + addccc @acc[2],$carry,@acc[2] + st @acc[1],[$rp+4] + addccc @acc[3],0,@acc[3] + st @acc[2],[$rp+8] + addccc @acc[4],0,@acc[4] + st @acc[3],[$rp+12] + addccc @acc[5],0,@acc[5] + st @acc[4],[$rp+16] + addccc @acc[6],$bi,@acc[6] + st @acc[5],[$rp+20] + addc @acc[7],$carry,@acc[7] + st @acc[6],[$rp+24] + retl + st @acc[7],[$rp+28] +.type __ecp_nistz256_add,#function +.size __ecp_nistz256_add,.-__ecp_nistz256_add + +! void ecp_nistz256_mul_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]); +.globl ecp_nistz256_mul_by_2 +.align 32 +ecp_nistz256_mul_by_2: + save %sp,-STACK_FRAME,%sp + ld [$ap],@acc[0] + ld [$ap+4],@acc[1] + ld [$ap+8],@acc[2] + ld [$ap+12],@acc[3] + ld [$ap+16],@acc[4] + ld [$ap+20],@acc[5] + ld [$ap+24],@acc[6] + call __ecp_nistz256_mul_by_2 + ld [$ap+28],@acc[7] + ret + restore +.type ecp_nistz256_mul_by_2,#function +.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 + +.align 32 +__ecp_nistz256_mul_by_2: + addcc @acc[0],@acc[0],@acc[0] ! a+a=2*a + addccc @acc[1],@acc[1],@acc[1] + addccc @acc[2],@acc[2],@acc[2] + addccc @acc[3],@acc[3],@acc[3] + addccc @acc[4],@acc[4],@acc[4] + addccc @acc[5],@acc[5],@acc[5] + addccc @acc[6],@acc[6],@acc[6] + addccc @acc[7],@acc[7],@acc[7] + b .Lreduce_by_sub + addc %g0,%g0,$carry +.type __ecp_nistz256_mul_by_2,#function +.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2 + +! void ecp_nistz256_mul_by_3(BN_ULONG %i0[8],const BN_ULONG %i1[8]); +.globl ecp_nistz256_mul_by_3 +.align 32 +ecp_nistz256_mul_by_3: + save %sp,-STACK_FRAME,%sp + ld [$ap],@acc[0] + ld [$ap+4],@acc[1] + ld [$ap+8],@acc[2] + ld [$ap+12],@acc[3] + ld [$ap+16],@acc[4] + ld [$ap+20],@acc[5] + ld [$ap+24],@acc[6] + call __ecp_nistz256_mul_by_3 + ld [$ap+28],@acc[7] + ret + restore +.type ecp_nistz256_mul_by_3,#function +.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 + +.align 32 +__ecp_nistz256_mul_by_3: + addcc @acc[0],@acc[0],$t0 ! a+a=2*a + addccc @acc[1],@acc[1],$t1 + addccc @acc[2],@acc[2],$t2 + addccc @acc[3],@acc[3],$t3 + addccc @acc[4],@acc[4],$t4 + addccc @acc[5],@acc[5],$t5 + addccc @acc[6],@acc[6],$t6 + addccc @acc[7],@acc[7],$t7 + addc %g0,%g0,$carry + + subcc $t0,-1,$t0 ! .Lreduce_by_sub but without stores + subccc $t1,-1,$t1 + subccc $t2,-1,$t2 + subccc $t3, 0,$t3 + subccc $t4, 0,$t4 + subccc $t5, 0,$t5 + subccc $t6, 1,$t6 + subccc $t7,-1,$t7 + subc $carry,0,$carry + + addcc $t0,$carry,$t0 ! add synthesized modulus + addccc $t1,$carry,$t1 + neg $carry,$bi + addccc $t2,$carry,$t2 + addccc $t3,0,$t3 + addccc $t4,0,$t4 + addccc $t5,0,$t5 + addccc $t6,$bi,$t6 + addc $t7,$carry,$t7 + + addcc $t0,@acc[0],@acc[0] ! 2*a+a=3*a + addccc $t1,@acc[1],@acc[1] + addccc $t2,@acc[2],@acc[2] + addccc $t3,@acc[3],@acc[3] + addccc $t4,@acc[4],@acc[4] + addccc $t5,@acc[5],@acc[5] + addccc $t6,@acc[6],@acc[6] + addccc $t7,@acc[7],@acc[7] + b .Lreduce_by_sub + addc %g0,%g0,$carry +.type __ecp_nistz256_mul_by_3,#function +.size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3 + +! void ecp_nistz256_sub(BN_ULONG %i0[8],const BN_ULONG %i1[8], +! const BN_ULONG %i2[8]); +.globl ecp_nistz256_sub +.align 32 +ecp_nistz256_sub: + save %sp,-STACK_FRAME,%sp + ld [$ap],@acc[0] + ld [$ap+4],@acc[1] + ld [$ap+8],@acc[2] + ld [$ap+12],@acc[3] + ld [$ap+16],@acc[4] + ld [$ap+20],@acc[5] + ld [$ap+24],@acc[6] + call __ecp_nistz256_sub_from + ld [$ap+28],@acc[7] + ret + restore +.type ecp_nistz256_sub,#function +.size ecp_nistz256_sub,.-ecp_nistz256_sub + +! void ecp_nistz256_neg(BN_ULONG %i0[8],const BN_ULONG %i1[8]); +.globl ecp_nistz256_neg +.align 32 +ecp_nistz256_neg: + save %sp,-STACK_FRAME,%sp + mov $ap,$bp + mov 0,@acc[0] + mov 0,@acc[1] + mov 0,@acc[2] + mov 0,@acc[3] + mov 0,@acc[4] + mov 0,@acc[5] + mov 0,@acc[6] + call __ecp_nistz256_sub_from + mov 0,@acc[7] + ret + restore +.type ecp_nistz256_neg,#function +.size ecp_nistz256_neg,.-ecp_nistz256_neg + +.align 32 +__ecp_nistz256_sub_from: + ld [$bp+0],$t0 ! b[0] + ld [$bp+4],$t1 + ld [$bp+8],$t2 + ld [$bp+12],$t3 + subcc @acc[0],$t0,@acc[0] + ld [$bp+16],$t4 + ld [$bp+20],$t5 + subccc @acc[1],$t1,@acc[1] + subccc @acc[2],$t2,@acc[2] + ld [$bp+24],$t6 + ld [$bp+28],$t7 + subccc @acc[3],$t3,@acc[3] + subccc @acc[4],$t4,@acc[4] + subccc @acc[5],$t5,@acc[5] + subccc @acc[6],$t6,@acc[6] + subccc @acc[7],$t7,@acc[7] + subc %g0,%g0,$carry ! broadcast borrow bit + +.Lreduce_by_add: + + ! if a-b borrows, add modulus. + ! + ! Note that because mod has special form, i.e. consists of + ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by + ! using value of broadcasted borrow and the borrow bit itself. + ! To minimize dependency chain we first broadcast and then + ! extract the bit by negating (follow $bi). + + addcc @acc[0],$carry,@acc[0] ! add synthesized modulus + addccc @acc[1],$carry,@acc[1] + neg $carry,$bi + st @acc[0],[$rp] + addccc @acc[2],$carry,@acc[2] + st @acc[1],[$rp+4] + addccc @acc[3],0,@acc[3] + st @acc[2],[$rp+8] + addccc @acc[4],0,@acc[4] + st @acc[3],[$rp+12] + addccc @acc[5],0,@acc[5] + st @acc[4],[$rp+16] + addccc @acc[6],$bi,@acc[6] + st @acc[5],[$rp+20] + addc @acc[7],$carry,@acc[7] + st @acc[6],[$rp+24] + retl + st @acc[7],[$rp+28] +.type __ecp_nistz256_sub_from,#function +.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from + +.align 32 +__ecp_nistz256_sub_morf: + ld [$bp+0],$t0 ! b[0] + ld [$bp+4],$t1 + ld [$bp+8],$t2 + ld [$bp+12],$t3 + subcc $t0,@acc[0],@acc[0] + ld [$bp+16],$t4 + ld [$bp+20],$t5 + subccc $t1,@acc[1],@acc[1] + subccc $t2,@acc[2],@acc[2] + ld [$bp+24],$t6 + ld [$bp+28],$t7 + subccc $t3,@acc[3],@acc[3] + subccc $t4,@acc[4],@acc[4] + subccc $t5,@acc[5],@acc[5] + subccc $t6,@acc[6],@acc[6] + subccc $t7,@acc[7],@acc[7] + b .Lreduce_by_add + subc %g0,%g0,$carry ! broadcast borrow bit +.type __ecp_nistz256_sub_morf,#function +.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf + +! void ecp_nistz256_div_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]); +.globl ecp_nistz256_div_by_2 +.align 32 +ecp_nistz256_div_by_2: + save %sp,-STACK_FRAME,%sp + ld [$ap],@acc[0] + ld [$ap+4],@acc[1] + ld [$ap+8],@acc[2] + ld [$ap+12],@acc[3] + ld [$ap+16],@acc[4] + ld [$ap+20],@acc[5] + ld [$ap+24],@acc[6] + call __ecp_nistz256_div_by_2 + ld [$ap+28],@acc[7] + ret + restore +.type ecp_nistz256_div_by_2,#function +.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 + +.align 32 +__ecp_nistz256_div_by_2: + ! ret = (a is odd ? a+mod : a) >> 1 + + and @acc[0],1,$bi + neg $bi,$carry + addcc @acc[0],$carry,@acc[0] + addccc @acc[1],$carry,@acc[1] + addccc @acc[2],$carry,@acc[2] + addccc @acc[3],0,@acc[3] + addccc @acc[4],0,@acc[4] + addccc @acc[5],0,@acc[5] + addccc @acc[6],$bi,@acc[6] + addccc @acc[7],$carry,@acc[7] + addc %g0,%g0,$carry + + ! ret >>= 1 + + srl @acc[0],1,@acc[0] + sll @acc[1],31,$t0 + srl @acc[1],1,@acc[1] + or @acc[0],$t0,@acc[0] + sll @acc[2],31,$t1 + srl @acc[2],1,@acc[2] + or @acc[1],$t1,@acc[1] + sll @acc[3],31,$t2 + st @acc[0],[$rp] + srl @acc[3],1,@acc[3] + or @acc[2],$t2,@acc[2] + sll @acc[4],31,$t3 + st @acc[1],[$rp+4] + srl @acc[4],1,@acc[4] + or @acc[3],$t3,@acc[3] + sll @acc[5],31,$t4 + st @acc[2],[$rp+8] + srl @acc[5],1,@acc[5] + or @acc[4],$t4,@acc[4] + sll @acc[6],31,$t5 + st @acc[3],[$rp+12] + srl @acc[6],1,@acc[6] + or @acc[5],$t5,@acc[5] + sll @acc[7],31,$t6 + st @acc[4],[$rp+16] + srl @acc[7],1,@acc[7] + or @acc[6],$t6,@acc[6] + sll $carry,31,$t7 + st @acc[5],[$rp+20] + or @acc[7],$t7,@acc[7] + st @acc[6],[$rp+24] + retl + st @acc[7],[$rp+28] +.type __ecp_nistz256_div_by_2,#function +.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 +___ + +######################################################################## +# following subroutines are "literal" implementation of those found in +# ecp_nistz256.c +# +######################################################################## +# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); +# +{ +my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3)); +# above map() describes stack layout with 4 temporary +# 256-bit vectors on top. + +$code.=<<___; +#ifdef __PIC__ +SPARC_PIC_THUNK(%g1) +#endif + +.globl ecp_nistz256_point_double +.align 32 +ecp_nistz256_point_double: + SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) + ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0] + and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1 + cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK) + be ecp_nistz256_point_double_vis3 + nop + + save %sp,-STACK_FRAME-32*4,%sp + + mov $rp,$rp_real + mov $ap,$ap_real + +.Lpoint_double_shortcut: + ld [$ap+32],@acc[0] + ld [$ap+32+4],@acc[1] + ld [$ap+32+8],@acc[2] + ld [$ap+32+12],@acc[3] + ld [$ap+32+16],@acc[4] + ld [$ap+32+20],@acc[5] + ld [$ap+32+24],@acc[6] + ld [$ap+32+28],@acc[7] + call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(S, in_y); + add %sp,LOCALS+$S,$rp + + add $ap_real,64,$bp + add $ap_real,64,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(Zsqr, in_z); + add %sp,LOCALS+$Zsqr,$rp + + add $ap_real,0,$bp + call __ecp_nistz256_add ! p256_add(M, Zsqr, in_x); + add %sp,LOCALS+$M,$rp + + add %sp,LOCALS+$S,$bp + add %sp,LOCALS+$S,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(S, S); + add %sp,LOCALS+$S,$rp + + ld [$ap_real],@acc[0] + add %sp,LOCALS+$Zsqr,$bp + ld [$ap_real+4],@acc[1] + ld [$ap_real+8],@acc[2] + ld [$ap_real+12],@acc[3] + ld [$ap_real+16],@acc[4] + ld [$ap_real+20],@acc[5] + ld [$ap_real+24],@acc[6] + ld [$ap_real+28],@acc[7] + call __ecp_nistz256_sub_from ! p256_sub(Zsqr, in_x, Zsqr); + add %sp,LOCALS+$Zsqr,$rp + + add $ap_real,32,$bp + add $ap_real,64,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(tmp0, in_z, in_y); + add %sp,LOCALS+$tmp0,$rp + + call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(res_z, tmp0); + add $rp_real,64,$rp + + add %sp,LOCALS+$Zsqr,$bp + add %sp,LOCALS+$M,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(M, M, Zsqr); + add %sp,LOCALS+$M,$rp + + call __ecp_nistz256_mul_by_3 ! p256_mul_by_3(M, M); + add %sp,LOCALS+$M,$rp + + add %sp,LOCALS+$S,$bp + add %sp,LOCALS+$S,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(tmp0, S); + add %sp,LOCALS+$tmp0,$rp + + call __ecp_nistz256_div_by_2 ! p256_div_by_2(res_y, tmp0); + add $rp_real,32,$rp + + add $ap_real,0,$bp + add %sp,LOCALS+$S,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, in_x); + add %sp,LOCALS+$S,$rp + + call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(tmp0, S); + add %sp,LOCALS+$tmp0,$rp + + add %sp,LOCALS+$M,$bp + add %sp,LOCALS+$M,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(res_x, M); + add $rp_real,0,$rp + + add %sp,LOCALS+$tmp0,$bp + call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, tmp0); + add $rp_real,0,$rp + + add %sp,LOCALS+$S,$bp + call __ecp_nistz256_sub_morf ! p256_sub(S, S, res_x); + add %sp,LOCALS+$S,$rp + + add %sp,LOCALS+$M,$bp + add %sp,LOCALS+$S,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, M); + add %sp,LOCALS+$S,$rp + + add $rp_real,32,$bp + call __ecp_nistz256_sub_from ! p256_sub(res_y, S, res_y); + add $rp_real,32,$rp + + ret + restore +.type ecp_nistz256_point_double,#function +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +___ +} + +######################################################################## +# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT *in2); +{ +my ($res_x,$res_y,$res_z, + $H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2)=map(32*$_,(0..11)); +my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); + +# above map() describes stack layout with 12 temporary +# 256-bit vectors on top. Then we reserve some space for +# !in1infty, !in2infty, result of check for zero and return pointer. + +my $bp_real=$rp_real; + +$code.=<<___; +.globl ecp_nistz256_point_add +.align 32 +ecp_nistz256_point_add: + SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) + ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0] + and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1 + cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK) + be ecp_nistz256_point_add_vis3 + nop + + save %sp,-STACK_FRAME-32*12-32,%sp + + stx $rp,[%fp+STACK_BIAS-8] ! off-load $rp + mov $ap,$ap_real + mov $bp,$bp_real + + ld [$bp+64],$t0 ! in2_z + ld [$bp+64+4],$t1 + ld [$bp+64+8],$t2 + ld [$bp+64+12],$t3 + ld [$bp+64+16],$t4 + ld [$bp+64+20],$t5 + ld [$bp+64+24],$t6 + ld [$bp+64+28],$t7 + or $t1,$t0,$t0 + or $t3,$t2,$t2 + or $t5,$t4,$t4 + or $t7,$t6,$t6 + or $t2,$t0,$t0 + or $t6,$t4,$t4 + or $t4,$t0,$t0 ! !in2infty + movrnz $t0,-1,$t0 + st $t0,[%fp+STACK_BIAS-12] + + ld [$ap+64],$t0 ! in1_z + ld [$ap+64+4],$t1 + ld [$ap+64+8],$t2 + ld [$ap+64+12],$t3 + ld [$ap+64+16],$t4 + ld [$ap+64+20],$t5 + ld [$ap+64+24],$t6 + ld [$ap+64+28],$t7 + or $t1,$t0,$t0 + or $t3,$t2,$t2 + or $t5,$t4,$t4 + or $t7,$t6,$t6 + or $t2,$t0,$t0 + or $t6,$t4,$t4 + or $t4,$t0,$t0 ! !in1infty + movrnz $t0,-1,$t0 + st $t0,[%fp+STACK_BIAS-16] + + add $bp_real,64,$bp + add $bp_real,64,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z2sqr, in2_z); + add %sp,LOCALS+$Z2sqr,$rp + + add $ap_real,64,$bp + add $ap_real,64,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z); + add %sp,LOCALS+$Z1sqr,$rp + + add $bp_real,64,$bp + add %sp,LOCALS+$Z2sqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S1, Z2sqr, in2_z); + add %sp,LOCALS+$S1,$rp + + add $ap_real,64,$bp + add %sp,LOCALS+$Z1sqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z); + add %sp,LOCALS+$S2,$rp + + add $ap_real,32,$bp + add %sp,LOCALS+$S1,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S1, S1, in1_y); + add %sp,LOCALS+$S1,$rp + + add $bp_real,32,$bp + add %sp,LOCALS+$S2,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y); + add %sp,LOCALS+$S2,$rp + + add %sp,LOCALS+$S1,$bp + call __ecp_nistz256_sub_from ! p256_sub(R, S2, S1); + add %sp,LOCALS+$R,$rp + + or @acc[1],@acc[0],@acc[0] ! see if result is zero + or @acc[3],@acc[2],@acc[2] + or @acc[5],@acc[4],@acc[4] + or @acc[7],@acc[6],@acc[6] + or @acc[2],@acc[0],@acc[0] + or @acc[6],@acc[4],@acc[4] + or @acc[4],@acc[0],@acc[0] + st @acc[0],[%fp+STACK_BIAS-20] + + add $ap_real,0,$bp + add %sp,LOCALS+$Z2sqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(U1, in1_x, Z2sqr); + add %sp,LOCALS+$U1,$rp + + add $bp_real,0,$bp + add %sp,LOCALS+$Z1sqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in2_x, Z1sqr); + add %sp,LOCALS+$U2,$rp + + add %sp,LOCALS+$U1,$bp + call __ecp_nistz256_sub_from ! p256_sub(H, U2, U1); + add %sp,LOCALS+$H,$rp + + or @acc[1],@acc[0],@acc[0] ! see if result is zero + or @acc[3],@acc[2],@acc[2] + or @acc[5],@acc[4],@acc[4] + or @acc[7],@acc[6],@acc[6] + or @acc[2],@acc[0],@acc[0] + or @acc[6],@acc[4],@acc[4] + orcc @acc[4],@acc[0],@acc[0] + + bne,pt %icc,.Ladd_proceed ! is_equal(U1,U2)? + nop + + ld [%fp+STACK_BIAS-12],$t0 + ld [%fp+STACK_BIAS-16],$t1 + ld [%fp+STACK_BIAS-20],$t2 + andcc $t0,$t1,%g0 + be,pt %icc,.Ladd_proceed ! (in1infty || in2infty)? + nop + andcc $t2,$t2,%g0 + be,pt %icc,.Ladd_double ! is_equal(S1,S2)? + nop + + ldx [%fp+STACK_BIAS-8],$rp + st %g0,[$rp] + st %g0,[$rp+4] + st %g0,[$rp+8] + st %g0,[$rp+12] + st %g0,[$rp+16] + st %g0,[$rp+20] + st %g0,[$rp+24] + st %g0,[$rp+28] + st %g0,[$rp+32] + st %g0,[$rp+32+4] + st %g0,[$rp+32+8] + st %g0,[$rp+32+12] + st %g0,[$rp+32+16] + st %g0,[$rp+32+20] + st %g0,[$rp+32+24] + st %g0,[$rp+32+28] + st %g0,[$rp+64] + st %g0,[$rp+64+4] + st %g0,[$rp+64+8] + st %g0,[$rp+64+12] + st %g0,[$rp+64+16] + st %g0,[$rp+64+20] + st %g0,[$rp+64+24] + st %g0,[$rp+64+28] + b .Ladd_done + nop + +.align 16 +.Ladd_double: + ldx [%fp+STACK_BIAS-8],$rp_real + mov $ap_real,$ap + b .Lpoint_double_shortcut + add %sp,32*(12-4)+32,%sp ! difference in frame sizes + +.align 16 +.Ladd_proceed: + add %sp,LOCALS+$R,$bp + add %sp,LOCALS+$R,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R); + add %sp,LOCALS+$Rsqr,$rp + + add $ap_real,64,$bp + add %sp,LOCALS+$H,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z); + add %sp,LOCALS+$res_z,$rp + + add %sp,LOCALS+$H,$bp + add %sp,LOCALS+$H,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H); + add %sp,LOCALS+$Hsqr,$rp + + add $bp_real,64,$bp + add %sp,LOCALS+$res_z,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, res_z, in2_z); + add %sp,LOCALS+$res_z,$rp + + add %sp,LOCALS+$H,$bp + add %sp,LOCALS+$Hsqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H); + add %sp,LOCALS+$Hcub,$rp + + add %sp,LOCALS+$U1,$bp + add %sp,LOCALS+$Hsqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, U1, Hsqr); + add %sp,LOCALS+$U2,$rp + + call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2); + add %sp,LOCALS+$Hsqr,$rp + + add %sp,LOCALS+$Rsqr,$bp + call __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr); + add %sp,LOCALS+$res_x,$rp + + add %sp,LOCALS+$Hcub,$bp + call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, Hcub); + add %sp,LOCALS+$res_x,$rp + + add %sp,LOCALS+$U2,$bp + call __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x); + add %sp,LOCALS+$res_y,$rp + + add %sp,LOCALS+$Hcub,$bp + add %sp,LOCALS+$S1,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S1, Hcub); + add %sp,LOCALS+$S2,$rp + + add %sp,LOCALS+$R,$bp + add %sp,LOCALS+$res_y,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R); + add %sp,LOCALS+$res_y,$rp + + add %sp,LOCALS+$S2,$bp + call __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2); + add %sp,LOCALS+$res_y,$rp + + ld [%fp+STACK_BIAS-16],$t1 ! !in1infty + ld [%fp+STACK_BIAS-12],$t2 ! !in2infty + ldx [%fp+STACK_BIAS-8],$rp +___ +for($i=0;$i<96;$i+=8) { # conditional moves +$code.=<<___; + ld [%sp+LOCALS+$i],@acc[0] ! res + ld [%sp+LOCALS+$i+4],@acc[1] + ld [$bp_real+$i],@acc[2] ! in2 + ld [$bp_real+$i+4],@acc[3] + ld [$ap_real+$i],@acc[4] ! in1 + ld [$ap_real+$i+4],@acc[5] + movrz $t1,@acc[2],@acc[0] + movrz $t1,@acc[3],@acc[1] + movrz $t2,@acc[4],@acc[0] + movrz $t2,@acc[5],@acc[1] + st @acc[0],[$rp+$i] + st @acc[1],[$rp+$i+4] +___ +} +$code.=<<___; +.Ladd_done: + ret + restore +.type ecp_nistz256_point_add,#function +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +___ +} + +######################################################################## +# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT_AFFINE *in2); +{ +my ($res_x,$res_y,$res_z, + $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9)); +my $Z1sqr = $S2; +# above map() describes stack layout with 10 temporary +# 256-bit vectors on top. Then we reserve some space for +# !in1infty, !in2infty, result of check for zero and return pointer. + +my @ONE_mont=(1,0,0,-1,-1,-1,-2,0); +my $bp_real=$rp_real; + +$code.=<<___; +.globl ecp_nistz256_point_add_affine +.align 32 +ecp_nistz256_point_add_affine: + SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) + ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0] + and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1 + cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK) + be ecp_nistz256_point_add_affine_vis3 + nop + + save %sp,-STACK_FRAME-32*10-32,%sp + + stx $rp,[%fp+STACK_BIAS-8] ! off-load $rp + mov $ap,$ap_real + mov $bp,$bp_real + + ld [$ap+64],$t0 ! in1_z + ld [$ap+64+4],$t1 + ld [$ap+64+8],$t2 + ld [$ap+64+12],$t3 + ld [$ap+64+16],$t4 + ld [$ap+64+20],$t5 + ld [$ap+64+24],$t6 + ld [$ap+64+28],$t7 + or $t1,$t0,$t0 + or $t3,$t2,$t2 + or $t5,$t4,$t4 + or $t7,$t6,$t6 + or $t2,$t0,$t0 + or $t6,$t4,$t4 + or $t4,$t0,$t0 ! !in1infty + movrnz $t0,-1,$t0 + st $t0,[%fp+STACK_BIAS-16] + + ld [$bp],@acc[0] ! in2_x + ld [$bp+4],@acc[1] + ld [$bp+8],@acc[2] + ld [$bp+12],@acc[3] + ld [$bp+16],@acc[4] + ld [$bp+20],@acc[5] + ld [$bp+24],@acc[6] + ld [$bp+28],@acc[7] + ld [$bp+32],$t0 ! in2_y + ld [$bp+32+4],$t1 + ld [$bp+32+8],$t2 + ld [$bp+32+12],$t3 + ld [$bp+32+16],$t4 + ld [$bp+32+20],$t5 + ld [$bp+32+24],$t6 + ld [$bp+32+28],$t7 + or @acc[1],@acc[0],@acc[0] + or @acc[3],@acc[2],@acc[2] + or @acc[5],@acc[4],@acc[4] + or @acc[7],@acc[6],@acc[6] + or @acc[2],@acc[0],@acc[0] + or @acc[6],@acc[4],@acc[4] + or @acc[4],@acc[0],@acc[0] + or $t1,$t0,$t0 + or $t3,$t2,$t2 + or $t5,$t4,$t4 + or $t7,$t6,$t6 + or $t2,$t0,$t0 + or $t6,$t4,$t4 + or $t4,$t0,$t0 + or @acc[0],$t0,$t0 ! !in2infty + movrnz $t0,-1,$t0 + st $t0,[%fp+STACK_BIAS-12] + + add $ap_real,64,$bp + add $ap_real,64,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z); + add %sp,LOCALS+$Z1sqr,$rp + + add $bp_real,0,$bp + add %sp,LOCALS+$Z1sqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, Z1sqr, in2_x); + add %sp,LOCALS+$U2,$rp + + add $ap_real,0,$bp + call __ecp_nistz256_sub_from ! p256_sub(H, U2, in1_x); + add %sp,LOCALS+$H,$rp + + add $ap_real,64,$bp + add %sp,LOCALS+$Z1sqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z); + add %sp,LOCALS+$S2,$rp + + add $ap_real,64,$bp + add %sp,LOCALS+$H,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z); + add %sp,LOCALS+$res_z,$rp + + add $bp_real,32,$bp + add %sp,LOCALS+$S2,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y); + add %sp,LOCALS+$S2,$rp + + add $ap_real,32,$bp + call __ecp_nistz256_sub_from ! p256_sub(R, S2, in1_y); + add %sp,LOCALS+$R,$rp + + add %sp,LOCALS+$H,$bp + add %sp,LOCALS+$H,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H); + add %sp,LOCALS+$Hsqr,$rp + + add %sp,LOCALS+$R,$bp + add %sp,LOCALS+$R,$ap + call __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R); + add %sp,LOCALS+$Rsqr,$rp + + add %sp,LOCALS+$H,$bp + add %sp,LOCALS+$Hsqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H); + add %sp,LOCALS+$Hcub,$rp + + add $ap_real,0,$bp + add %sp,LOCALS+$Hsqr,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in1_x, Hsqr); + add %sp,LOCALS+$U2,$rp + + call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2); + add %sp,LOCALS+$Hsqr,$rp + + add %sp,LOCALS+$Rsqr,$bp + call __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr); + add %sp,LOCALS+$res_x,$rp + + add %sp,LOCALS+$Hcub,$bp + call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, Hcub); + add %sp,LOCALS+$res_x,$rp + + add %sp,LOCALS+$U2,$bp + call __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x); + add %sp,LOCALS+$res_y,$rp + + add $ap_real,32,$bp + add %sp,LOCALS+$Hcub,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, in1_y, Hcub); + add %sp,LOCALS+$S2,$rp + + add %sp,LOCALS+$R,$bp + add %sp,LOCALS+$res_y,$ap + call __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R); + add %sp,LOCALS+$res_y,$rp + + add %sp,LOCALS+$S2,$bp + call __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2); + add %sp,LOCALS+$res_y,$rp + + ld [%fp+STACK_BIAS-16],$t1 ! !in1infty + ld [%fp+STACK_BIAS-12],$t2 ! !in2infty + ldx [%fp+STACK_BIAS-8],$rp +___ +for($i=0;$i<64;$i+=8) { # conditional moves +$code.=<<___; + ld [%sp+LOCALS+$i],@acc[0] ! res + ld [%sp+LOCALS+$i+4],@acc[1] + ld [$bp_real+$i],@acc[2] ! in2 + ld [$bp_real+$i+4],@acc[3] + ld [$ap_real+$i],@acc[4] ! in1 + ld [$ap_real+$i+4],@acc[5] + movrz $t1,@acc[2],@acc[0] + movrz $t1,@acc[3],@acc[1] + movrz $t2,@acc[4],@acc[0] + movrz $t2,@acc[5],@acc[1] + st @acc[0],[$rp+$i] + st @acc[1],[$rp+$i+4] +___ +} +for(;$i<96;$i+=8) { +my $j=($i-64)/4; +$code.=<<___; + ld [%sp+LOCALS+$i],@acc[0] ! res + ld [%sp+LOCALS+$i+4],@acc[1] + ld [$ap_real+$i],@acc[4] ! in1 + ld [$ap_real+$i+4],@acc[5] + movrz $t1,@ONE_mont[$j],@acc[0] + movrz $t1,@ONE_mont[$j+1],@acc[1] + movrz $t2,@acc[4],@acc[0] + movrz $t2,@acc[5],@acc[1] + st @acc[0],[$rp+$i] + st @acc[1],[$rp+$i+4] +___ +} +$code.=<<___; + ret + restore +.type ecp_nistz256_point_add_affine,#function +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +___ +} }}} +{{{ +my ($out,$inp,$index)=map("%i$_",(0..2)); +my $mask="%o0"; + +$code.=<<___; +! void ecp_nistz256_scatter_w5(void *%i0,const P256_POINT *%i1, +! int %i2); +.globl ecp_nistz256_scatter_w5 +.align 32 +ecp_nistz256_scatter_w5: + save %sp,-STACK_FRAME,%sp + + sll $index,2,$index + add $out,$index,$out + + ld [$inp],%l0 ! X + ld [$inp+4],%l1 + ld [$inp+8],%l2 + ld [$inp+12],%l3 + ld [$inp+16],%l4 + ld [$inp+20],%l5 + ld [$inp+24],%l6 + ld [$inp+28],%l7 + add $inp,32,$inp + st %l0,[$out+64*0-4] + st %l1,[$out+64*1-4] + st %l2,[$out+64*2-4] + st %l3,[$out+64*3-4] + st %l4,[$out+64*4-4] + st %l5,[$out+64*5-4] + st %l6,[$out+64*6-4] + st %l7,[$out+64*7-4] + add $out,64*8,$out + + ld [$inp],%l0 ! Y + ld [$inp+4],%l1 + ld [$inp+8],%l2 + ld [$inp+12],%l3 + ld [$inp+16],%l4 + ld [$inp+20],%l5 + ld [$inp+24],%l6 + ld [$inp+28],%l7 + add $inp,32,$inp + st %l0,[$out+64*0-4] + st %l1,[$out+64*1-4] + st %l2,[$out+64*2-4] + st %l3,[$out+64*3-4] + st %l4,[$out+64*4-4] + st %l5,[$out+64*5-4] + st %l6,[$out+64*6-4] + st %l7,[$out+64*7-4] + add $out,64*8,$out + + ld [$inp],%l0 ! Z + ld [$inp+4],%l1 + ld [$inp+8],%l2 + ld [$inp+12],%l3 + ld [$inp+16],%l4 + ld [$inp+20],%l5 + ld [$inp+24],%l6 + ld [$inp+28],%l7 + st %l0,[$out+64*0-4] + st %l1,[$out+64*1-4] + st %l2,[$out+64*2-4] + st %l3,[$out+64*3-4] + st %l4,[$out+64*4-4] + st %l5,[$out+64*5-4] + st %l6,[$out+64*6-4] + st %l7,[$out+64*7-4] + + ret + restore +.type ecp_nistz256_scatter_w5,#function +.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 + +! void ecp_nistz256_gather_w5(P256_POINT *%i0,const void *%i1, +! int %i2); +.globl ecp_nistz256_gather_w5 +.align 32 +ecp_nistz256_gather_w5: + save %sp,-STACK_FRAME,%sp + + neg $index,$mask + srax $mask,63,$mask + + add $index,$mask,$index + sll $index,2,$index + add $inp,$index,$inp + + ld [$inp+64*0],%l0 + ld [$inp+64*1],%l1 + ld [$inp+64*2],%l2 + ld [$inp+64*3],%l3 + ld [$inp+64*4],%l4 + ld [$inp+64*5],%l5 + ld [$inp+64*6],%l6 + ld [$inp+64*7],%l7 + add $inp,64*8,$inp + and %l0,$mask,%l0 + and %l1,$mask,%l1 + st %l0,[$out] ! X + and %l2,$mask,%l2 + st %l1,[$out+4] + and %l3,$mask,%l3 + st %l2,[$out+8] + and %l4,$mask,%l4 + st %l3,[$out+12] + and %l5,$mask,%l5 + st %l4,[$out+16] + and %l6,$mask,%l6 + st %l5,[$out+20] + and %l7,$mask,%l7 + st %l6,[$out+24] + st %l7,[$out+28] + add $out,32,$out + + ld [$inp+64*0],%l0 + ld [$inp+64*1],%l1 + ld [$inp+64*2],%l2 + ld [$inp+64*3],%l3 + ld [$inp+64*4],%l4 + ld [$inp+64*5],%l5 + ld [$inp+64*6],%l6 + ld [$inp+64*7],%l7 + add $inp,64*8,$inp + and %l0,$mask,%l0 + and %l1,$mask,%l1 + st %l0,[$out] ! Y + and %l2,$mask,%l2 + st %l1,[$out+4] + and %l3,$mask,%l3 + st %l2,[$out+8] + and %l4,$mask,%l4 + st %l3,[$out+12] + and %l5,$mask,%l5 + st %l4,[$out+16] + and %l6,$mask,%l6 + st %l5,[$out+20] + and %l7,$mask,%l7 + st %l6,[$out+24] + st %l7,[$out+28] + add $out,32,$out + + ld [$inp+64*0],%l0 + ld [$inp+64*1],%l1 + ld [$inp+64*2],%l2 + ld [$inp+64*3],%l3 + ld [$inp+64*4],%l4 + ld [$inp+64*5],%l5 + ld [$inp+64*6],%l6 + ld [$inp+64*7],%l7 + and %l0,$mask,%l0 + and %l1,$mask,%l1 + st %l0,[$out] ! Z + and %l2,$mask,%l2 + st %l1,[$out+4] + and %l3,$mask,%l3 + st %l2,[$out+8] + and %l4,$mask,%l4 + st %l3,[$out+12] + and %l5,$mask,%l5 + st %l4,[$out+16] + and %l6,$mask,%l6 + st %l5,[$out+20] + and %l7,$mask,%l7 + st %l6,[$out+24] + st %l7,[$out+28] + + ret + restore +.type ecp_nistz256_gather_w5,#function +.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 + +! void ecp_nistz256_scatter_w7(void *%i0,const P256_POINT_AFFINE *%i1, +! int %i2); +.globl ecp_nistz256_scatter_w7 +.align 32 +ecp_nistz256_scatter_w7: + save %sp,-STACK_FRAME,%sp + nop + add $out,$index,$out + mov 64/4,$index +.Loop_scatter_w7: + ld [$inp],%l0 + add $inp,4,$inp + subcc $index,1,$index + stb %l0,[$out+64*0] + srl %l0,8,%l1 + stb %l1,[$out+64*1] + srl %l0,16,%l2 + stb %l2,[$out+64*2] + srl %l0,24,%l3 + stb %l3,[$out+64*3] + bne .Loop_scatter_w7 + add $out,64*4,$out + + ret + restore +.type ecp_nistz256_scatter_w7,#function +.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 + +! void ecp_nistz256_gather_w7(P256_POINT_AFFINE *%i0,const void *%i1, +! int %i2); +.globl ecp_nistz256_gather_w7 +.align 32 +ecp_nistz256_gather_w7: + save %sp,-STACK_FRAME,%sp + + neg $index,$mask + srax $mask,63,$mask + + add $index,$mask,$index + add $inp,$index,$inp + mov 64/4,$index + +.Loop_gather_w7: + ldub [$inp+64*0],%l0 + prefetch [$inp+3840+64*0],1 + subcc $index,1,$index + ldub [$inp+64*1],%l1 + prefetch [$inp+3840+64*1],1 + ldub [$inp+64*2],%l2 + prefetch [$inp+3840+64*2],1 + ldub [$inp+64*3],%l3 + prefetch [$inp+3840+64*3],1 + add $inp,64*4,$inp + sll %l1,8,%l1 + sll %l2,16,%l2 + or %l0,%l1,%l0 + sll %l3,24,%l3 + or %l0,%l2,%l0 + or %l0,%l3,%l0 + and %l0,$mask,%l0 + st %l0,[$out] + bne .Loop_gather_w7 + add $out,4,$out + + ret + restore +.type ecp_nistz256_gather_w7,#function +.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 +___ +}}} +{{{ +######################################################################## +# Following subroutines are VIS3 counterparts of those above that +# implement ones found in ecp_nistz256.c. Key difference is that they +# use 128-bit multiplication and addition with 64-bit carry, and in order +# to do that they perform conversion from uin32_t[8] to uint64_t[4] upon +# entry and vice versa on return. +# +my ($rp,$ap,$bp)=map("%i$_",(0..2)); +my ($t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("%l$_",(0..7)); +my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5)=map("%o$_",(0..5)); +my ($bi,$poly1,$poly3,$minus1)=(map("%i$_",(3..5)),"%g1"); +my ($rp_real,$ap_real)=("%g2","%g3"); +my ($acc6,$acc7)=($bp,$bi); # used in squaring + +$code.=<<___; +.align 32 +__ecp_nistz256_mul_by_2_vis3: + addcc $acc0,$acc0,$acc0 + addxccc $acc1,$acc1,$acc1 + addxccc $acc2,$acc2,$acc2 + addxccc $acc3,$acc3,$acc3 + b .Lreduce_by_sub_vis3 + addxc %g0,%g0,$acc4 ! did it carry? +.type __ecp_nistz256_mul_by_2_vis3,#function +.size __ecp_nistz256_mul_by_2_vis3,.-__ecp_nistz256_mul_by_2_vis3 + +.align 32 +__ecp_nistz256_add_vis3: + ldx [$bp+0],$t0 + ldx [$bp+8],$t1 + ldx [$bp+16],$t2 + ldx [$bp+24],$t3 + +__ecp_nistz256_add_noload_vis3: + + addcc $t0,$acc0,$acc0 + addxccc $t1,$acc1,$acc1 + addxccc $t2,$acc2,$acc2 + addxccc $t3,$acc3,$acc3 + addxc %g0,%g0,$acc4 ! did it carry? + +.Lreduce_by_sub_vis3: + + addcc $acc0,1,$t0 ! add -modulus, i.e. subtract + addxccc $acc1,$poly1,$t1 + addxccc $acc2,$minus1,$t2 + addxccc $acc3,$poly3,$t3 + addxc $acc4,$minus1,$acc4 + + movrz $acc4,$t0,$acc0 ! ret = borrow ? ret : ret-modulus + movrz $acc4,$t1,$acc1 + stx $acc0,[$rp] + movrz $acc4,$t2,$acc2 + stx $acc1,[$rp+8] + movrz $acc4,$t3,$acc3 + stx $acc2,[$rp+16] + retl + stx $acc3,[$rp+24] +.type __ecp_nistz256_add_vis3,#function +.size __ecp_nistz256_add_vis3,.-__ecp_nistz256_add_vis3 + +! Trouble with subtraction is that there is no subtraction with 64-bit +! borrow, only with 32-bit one. For this reason we "decompose" 64-bit +! $acc0-$acc3 to 32-bit values and pick b[4] in 32-bit pieces. But +! recall that SPARC is big-endian, which is why you'll observe that +! b[4] is accessed as 4-0-12-8-20-16-28-24. And prior reduction we +! "collect" result back to 64-bit $acc0-$acc3. +.align 32 +__ecp_nistz256_sub_from_vis3: + ld [$bp+4],$t0 + ld [$bp+0],$t1 + ld [$bp+12],$t2 + ld [$bp+8],$t3 + + srlx $acc0,32,$acc4 + not $poly1,$poly1 + srlx $acc1,32,$acc5 + subcc $acc0,$t0,$acc0 + ld [$bp+20],$t0 + subccc $acc4,$t1,$acc4 + ld [$bp+16],$t1 + subccc $acc1,$t2,$acc1 + ld [$bp+28],$t2 + and $acc0,$poly1,$acc0 + subccc $acc5,$t3,$acc5 + ld [$bp+24],$t3 + sllx $acc4,32,$acc4 + and $acc1,$poly1,$acc1 + sllx $acc5,32,$acc5 + or $acc0,$acc4,$acc0 + srlx $acc2,32,$acc4 + or $acc1,$acc5,$acc1 + srlx $acc3,32,$acc5 + subccc $acc2,$t0,$acc2 + subccc $acc4,$t1,$acc4 + subccc $acc3,$t2,$acc3 + and $acc2,$poly1,$acc2 + subccc $acc5,$t3,$acc5 + sllx $acc4,32,$acc4 + and $acc3,$poly1,$acc3 + sllx $acc5,32,$acc5 + or $acc2,$acc4,$acc2 + subc %g0,%g0,$acc4 ! did it borrow? + b .Lreduce_by_add_vis3 + or $acc3,$acc5,$acc3 +.type __ecp_nistz256_sub_from_vis3,#function +.size __ecp_nistz256_sub_from_vis3,.-__ecp_nistz256_sub_from_vis3 + +.align 32 +__ecp_nistz256_sub_morf_vis3: + ld [$bp+4],$t0 + ld [$bp+0],$t1 + ld [$bp+12],$t2 + ld [$bp+8],$t3 + + srlx $acc0,32,$acc4 + not $poly1,$poly1 + srlx $acc1,32,$acc5 + subcc $t0,$acc0,$acc0 + ld [$bp+20],$t0 + subccc $t1,$acc4,$acc4 + ld [$bp+16],$t1 + subccc $t2,$acc1,$acc1 + ld [$bp+28],$t2 + and $acc0,$poly1,$acc0 + subccc $t3,$acc5,$acc5 + ld [$bp+24],$t3 + sllx $acc4,32,$acc4 + and $acc1,$poly1,$acc1 + sllx $acc5,32,$acc5 + or $acc0,$acc4,$acc0 + srlx $acc2,32,$acc4 + or $acc1,$acc5,$acc1 + srlx $acc3,32,$acc5 + subccc $t0,$acc2,$acc2 + subccc $t1,$acc4,$acc4 + subccc $t2,$acc3,$acc3 + and $acc2,$poly1,$acc2 + subccc $t3,$acc5,$acc5 + sllx $acc4,32,$acc4 + and $acc3,$poly1,$acc3 + sllx $acc5,32,$acc5 + or $acc2,$acc4,$acc2 + subc %g0,%g0,$acc4 ! did it borrow? + or $acc3,$acc5,$acc3 + +.Lreduce_by_add_vis3: + + addcc $acc0,-1,$t0 ! add modulus + not $poly3,$t3 + addxccc $acc1,$poly1,$t1 + not $poly1,$poly1 ! restore $poly1 + addxccc $acc2,%g0,$t2 + addxc $acc3,$t3,$t3 + + movrnz $acc4,$t0,$acc0 ! if a-b borrowed, ret = ret+mod + movrnz $acc4,$t1,$acc1 + stx $acc0,[$rp] + movrnz $acc4,$t2,$acc2 + stx $acc1,[$rp+8] + movrnz $acc4,$t3,$acc3 + stx $acc2,[$rp+16] + retl + stx $acc3,[$rp+24] +.type __ecp_nistz256_sub_morf_vis3,#function +.size __ecp_nistz256_sub_morf_vis3,.-__ecp_nistz256_sub_morf_vis3 + +.align 32 +__ecp_nistz256_div_by_2_vis3: + ! ret = (a is odd ? a+mod : a) >> 1 + + not $poly1,$t1 + not $poly3,$t3 + and $acc0,1,$acc5 + addcc $acc0,-1,$t0 ! add modulus + addxccc $acc1,$t1,$t1 + addxccc $acc2,%g0,$t2 + addxccc $acc3,$t3,$t3 + addxc %g0,%g0,$acc4 ! carry bit + + movrnz $acc5,$t0,$acc0 + movrnz $acc5,$t1,$acc1 + movrnz $acc5,$t2,$acc2 + movrnz $acc5,$t3,$acc3 + movrz $acc5,%g0,$acc4 + + ! ret >>= 1 + + srlx $acc0,1,$acc0 + sllx $acc1,63,$t0 + srlx $acc1,1,$acc1 + or $acc0,$t0,$acc0 + sllx $acc2,63,$t1 + srlx $acc2,1,$acc2 + or $acc1,$t1,$acc1 + sllx $acc3,63,$t2 + stx $acc0,[$rp] + srlx $acc3,1,$acc3 + or $acc2,$t2,$acc2 + sllx $acc4,63,$t3 ! don't forget carry bit + stx $acc1,[$rp+8] + or $acc3,$t3,$acc3 + stx $acc2,[$rp+16] + retl + stx $acc3,[$rp+24] +.type __ecp_nistz256_div_by_2_vis3,#function +.size __ecp_nistz256_div_by_2_vis3,.-__ecp_nistz256_div_by_2_vis3 + +! compared to __ecp_nistz256_mul_mont it's almost 4x smaller and +! 4x faster [on T4]... +.align 32 +__ecp_nistz256_mul_mont_vis3: + mulx $a0,$bi,$acc0 + not $poly3,$poly3 ! 0xFFFFFFFF00000001 + umulxhi $a0,$bi,$t0 + mulx $a1,$bi,$acc1 + umulxhi $a1,$bi,$t1 + mulx $a2,$bi,$acc2 + umulxhi $a2,$bi,$t2 + mulx $a3,$bi,$acc3 + umulxhi $a3,$bi,$t3 + ldx [$bp+8],$bi ! b[1] + + addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication + sllx $acc0,32,$t0 + addxccc $acc2,$t1,$acc2 + srlx $acc0,32,$t1 + addxccc $acc3,$t2,$acc3 + addxc %g0,$t3,$acc4 + mov 0,$acc5 +___ +for($i=1;$i<4;$i++) { + # Reduction iteration is normally performed by accumulating + # result of multiplication of modulus by "magic" digit [and + # omitting least significant word, which is guaranteed to + # be 0], but thanks to special form of modulus and "magic" + # digit being equal to least significant word, it can be + # performed with additions and subtractions alone. Indeed: + # + # ffff0001.00000000.0000ffff.ffffffff + # * abcdefgh + # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh + # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 + # - 0000abcd.efgh0000.00000000.00000000.abcdefgh + # + # or marking redundant operations: + # + # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- + # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- + # - 0000abcd.efgh0000.--------.--------.-------- + # ^^^^^^^^ but this word is calculated with umulxhi, because + # there is no subtract with 64-bit borrow:-( + +$code.=<<___; + sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part + umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part + addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] + mulx $a0,$bi,$t0 + addxccc $acc2,$t1,$acc1 + mulx $a1,$bi,$t1 + addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 + mulx $a2,$bi,$t2 + addxccc $acc4,$t3,$acc3 + mulx $a3,$bi,$t3 + addxc $acc5,%g0,$acc4 + + addcc $acc0,$t0,$acc0 ! accumulate low parts of multiplication + umulxhi $a0,$bi,$t0 + addxccc $acc1,$t1,$acc1 + umulxhi $a1,$bi,$t1 + addxccc $acc2,$t2,$acc2 + umulxhi $a2,$bi,$t2 + addxccc $acc3,$t3,$acc3 + umulxhi $a3,$bi,$t3 + addxc $acc4,%g0,$acc4 +___ +$code.=<<___ if ($i<3); + ldx [$bp+8*($i+1)],$bi ! bp[$i+1] +___ +$code.=<<___; + addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication + sllx $acc0,32,$t0 + addxccc $acc2,$t1,$acc2 + srlx $acc0,32,$t1 + addxccc $acc3,$t2,$acc3 + addxccc $acc4,$t3,$acc4 + addxc %g0,%g0,$acc5 +___ +} +$code.=<<___; + sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part + umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part + addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] + addxccc $acc2,$t1,$acc1 + addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 + addxccc $acc4,$t3,$acc3 + b .Lmul_final_vis3 ! see below + addxc $acc5,%g0,$acc4 +.type __ecp_nistz256_mul_mont_vis3,#function +.size __ecp_nistz256_mul_mont_vis3,.-__ecp_nistz256_mul_mont_vis3 + +! compared to above __ecp_nistz256_mul_mont_vis3 it's 21% less +! instructions, but only 14% faster [on T4]... +.align 32 +__ecp_nistz256_sqr_mont_vis3: + ! | | | | | |a1*a0| | + ! | | | | |a2*a0| | | + ! | |a3*a2|a3*a0| | | | + ! | | | |a2*a1| | | | + ! | | |a3*a1| | | | | + ! *| | | | | | | | 2| + ! +|a3*a3|a2*a2|a1*a1|a0*a0| + ! |--+--+--+--+--+--+--+--| + ! |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + ! + ! "can't overflow" below mark carrying into high part of + ! multiplication result, which can't overflow, because it + ! can never be all ones. + + mulx $a1,$a0,$acc1 ! a[1]*a[0] + umulxhi $a1,$a0,$t1 + mulx $a2,$a0,$acc2 ! a[2]*a[0] + umulxhi $a2,$a0,$t2 + mulx $a3,$a0,$acc3 ! a[3]*a[0] + umulxhi $a3,$a0,$acc4 + + addcc $acc2,$t1,$acc2 ! accumulate high parts of multiplication + mulx $a2,$a1,$t0 ! a[2]*a[1] + umulxhi $a2,$a1,$t1 + addxccc $acc3,$t2,$acc3 + mulx $a3,$a1,$t2 ! a[3]*a[1] + umulxhi $a3,$a1,$t3 + addxc $acc4,%g0,$acc4 ! can't overflow + + mulx $a3,$a2,$acc5 ! a[3]*a[2] + not $poly3,$poly3 ! 0xFFFFFFFF00000001 + umulxhi $a3,$a2,$acc6 + + addcc $t2,$t1,$t1 ! accumulate high parts of multiplication + mulx $a0,$a0,$acc0 ! a[0]*a[0] + addxc $t3,%g0,$t2 ! can't overflow + + addcc $acc3,$t0,$acc3 ! accumulate low parts of multiplication + umulxhi $a0,$a0,$a0 + addxccc $acc4,$t1,$acc4 + mulx $a1,$a1,$t1 ! a[1]*a[1] + addxccc $acc5,$t2,$acc5 + umulxhi $a1,$a1,$a1 + addxc $acc6,%g0,$acc6 ! can't overflow + + addcc $acc1,$acc1,$acc1 ! acc[1-6]*=2 + mulx $a2,$a2,$t2 ! a[2]*a[2] + addxccc $acc2,$acc2,$acc2 + umulxhi $a2,$a2,$a2 + addxccc $acc3,$acc3,$acc3 + mulx $a3,$a3,$t3 ! a[3]*a[3] + addxccc $acc4,$acc4,$acc4 + umulxhi $a3,$a3,$a3 + addxccc $acc5,$acc5,$acc5 + addxccc $acc6,$acc6,$acc6 + addxc %g0,%g0,$acc7 + + addcc $acc1,$a0,$acc1 ! +a[i]*a[i] + addxccc $acc2,$t1,$acc2 + addxccc $acc3,$a1,$acc3 + addxccc $acc4,$t2,$acc4 + sllx $acc0,32,$t0 + addxccc $acc5,$a2,$acc5 + srlx $acc0,32,$t1 + addxccc $acc6,$t3,$acc6 + sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part + addxc $acc7,$a3,$acc7 +___ +for($i=0;$i<3;$i++) { # reductions, see commentary + # in multiplication for details +$code.=<<___; + umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part + addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] + sllx $acc0,32,$t0 + addxccc $acc2,$t1,$acc1 + srlx $acc0,32,$t1 + addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 + sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part + addxc %g0,$t3,$acc3 ! can't overflow +___ +} +$code.=<<___; + umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part + addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0] + addxccc $acc2,$t1,$acc1 + addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001 + addxc %g0,$t3,$acc3 ! can't overflow + + addcc $acc0,$acc4,$acc0 ! accumulate upper half + addxccc $acc1,$acc5,$acc1 + addxccc $acc2,$acc6,$acc2 + addxccc $acc3,$acc7,$acc3 + addxc %g0,%g0,$acc4 + +.Lmul_final_vis3: + + ! Final step is "if result > mod, subtract mod", but as comparison + ! means subtraction, we do the subtraction and then copy outcome + ! if it didn't borrow. But note that as we [have to] replace + ! subtraction with addition with negative, carry/borrow logic is + ! inverse. + + addcc $acc0,1,$t0 ! add -modulus, i.e. subtract + not $poly3,$poly3 ! restore 0x00000000FFFFFFFE + addxccc $acc1,$poly1,$t1 + addxccc $acc2,$minus1,$t2 + addxccc $acc3,$poly3,$t3 + addxccc $acc4,$minus1,%g0 ! did it carry? + + movcs %xcc,$t0,$acc0 + movcs %xcc,$t1,$acc1 + stx $acc0,[$rp] + movcs %xcc,$t2,$acc2 + stx $acc1,[$rp+8] + movcs %xcc,$t3,$acc3 + stx $acc2,[$rp+16] + retl + stx $acc3,[$rp+24] +.type __ecp_nistz256_sqr_mont_vis3,#function +.size __ecp_nistz256_sqr_mont_vis3,.-__ecp_nistz256_sqr_mont_vis3 +___ + +######################################################################## +# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); +# +{ +my ($res_x,$res_y,$res_z, + $in_x,$in_y,$in_z, + $S,$M,$Zsqr,$tmp0)=map(32*$_,(0..9)); +# above map() describes stack layout with 10 temporary +# 256-bit vectors on top. + +$code.=<<___; +.align 32 +ecp_nistz256_point_double_vis3: + save %sp,-STACK64_FRAME-32*10,%sp + + mov $rp,$rp_real +.Ldouble_shortcut_vis3: + mov -1,$minus1 + mov -2,$poly3 + sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000 + srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE + + ! convert input to uint64_t[4] + ld [$ap],$a0 ! in_x + ld [$ap+4],$t0 + ld [$ap+8],$a1 + ld [$ap+12],$t1 + ld [$ap+16],$a2 + ld [$ap+20],$t2 + ld [$ap+24],$a3 + ld [$ap+28],$t3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + ld [$ap+32],$acc0 ! in_y + or $a0,$t0,$a0 + ld [$ap+32+4],$t0 + sllx $t2,32,$t2 + ld [$ap+32+8],$acc1 + or $a1,$t1,$a1 + ld [$ap+32+12],$t1 + sllx $t3,32,$t3 + ld [$ap+32+16],$acc2 + or $a2,$t2,$a2 + ld [$ap+32+20],$t2 + or $a3,$t3,$a3 + ld [$ap+32+24],$acc3 + sllx $t0,32,$t0 + ld [$ap+32+28],$t3 + sllx $t1,32,$t1 + stx $a0,[%sp+LOCALS64+$in_x] + sllx $t2,32,$t2 + stx $a1,[%sp+LOCALS64+$in_x+8] + sllx $t3,32,$t3 + stx $a2,[%sp+LOCALS64+$in_x+16] + or $acc0,$t0,$acc0 + stx $a3,[%sp+LOCALS64+$in_x+24] + or $acc1,$t1,$acc1 + stx $acc0,[%sp+LOCALS64+$in_y] + or $acc2,$t2,$acc2 + stx $acc1,[%sp+LOCALS64+$in_y+8] + or $acc3,$t3,$acc3 + stx $acc2,[%sp+LOCALS64+$in_y+16] + stx $acc3,[%sp+LOCALS64+$in_y+24] + + ld [$ap+64],$a0 ! in_z + ld [$ap+64+4],$t0 + ld [$ap+64+8],$a1 + ld [$ap+64+12],$t1 + ld [$ap+64+16],$a2 + ld [$ap+64+20],$t2 + ld [$ap+64+24],$a3 + ld [$ap+64+28],$t3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + or $a0,$t0,$a0 + sllx $t2,32,$t2 + or $a1,$t1,$a1 + sllx $t3,32,$t3 + or $a2,$t2,$a2 + or $a3,$t3,$a3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + stx $a0,[%sp+LOCALS64+$in_z] + sllx $t2,32,$t2 + stx $a1,[%sp+LOCALS64+$in_z+8] + sllx $t3,32,$t3 + stx $a2,[%sp+LOCALS64+$in_z+16] + stx $a3,[%sp+LOCALS64+$in_z+24] + + ! in_y is still in $acc0-$acc3 + call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(S, in_y); + add %sp,LOCALS64+$S,$rp + + ! in_z is still in $a0-$a3 + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Zsqr, in_z); + add %sp,LOCALS64+$Zsqr,$rp + + mov $acc0,$a0 ! put Zsqr aside + mov $acc1,$a1 + mov $acc2,$a2 + mov $acc3,$a3 + + add %sp,LOCALS64+$in_x,$bp + call __ecp_nistz256_add_vis3 ! p256_add(M, Zsqr, in_x); + add %sp,LOCALS64+$M,$rp + + mov $a0,$acc0 ! restore Zsqr + ldx [%sp+LOCALS64+$S],$a0 ! forward load + mov $a1,$acc1 + ldx [%sp+LOCALS64+$S+8],$a1 + mov $a2,$acc2 + ldx [%sp+LOCALS64+$S+16],$a2 + mov $a3,$acc3 + ldx [%sp+LOCALS64+$S+24],$a3 + + add %sp,LOCALS64+$in_x,$bp + call __ecp_nistz256_sub_morf_vis3 ! p256_sub(Zsqr, in_x, Zsqr); + add %sp,LOCALS64+$Zsqr,$rp + + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(S, S); + add %sp,LOCALS64+$S,$rp + + ldx [%sp+LOCALS64+$in_z],$bi + ldx [%sp+LOCALS64+$in_y],$a0 + ldx [%sp+LOCALS64+$in_y+8],$a1 + ldx [%sp+LOCALS64+$in_y+16],$a2 + ldx [%sp+LOCALS64+$in_y+24],$a3 + add %sp,LOCALS64+$in_z,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(tmp0, in_z, in_y); + add %sp,LOCALS64+$tmp0,$rp + + ldx [%sp+LOCALS64+$M],$bi ! forward load + ldx [%sp+LOCALS64+$Zsqr],$a0 + ldx [%sp+LOCALS64+$Zsqr+8],$a1 + ldx [%sp+LOCALS64+$Zsqr+16],$a2 + ldx [%sp+LOCALS64+$Zsqr+24],$a3 + + call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(res_z, tmp0); + add %sp,LOCALS64+$res_z,$rp + + add %sp,LOCALS64+$M,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(M, M, Zsqr); + add %sp,LOCALS64+$M,$rp + + mov $acc0,$a0 ! put aside M + mov $acc1,$a1 + mov $acc2,$a2 + mov $acc3,$a3 + call __ecp_nistz256_mul_by_2_vis3 + add %sp,LOCALS64+$M,$rp + mov $a0,$t0 ! copy M + ldx [%sp+LOCALS64+$S],$a0 ! forward load + mov $a1,$t1 + ldx [%sp+LOCALS64+$S+8],$a1 + mov $a2,$t2 + ldx [%sp+LOCALS64+$S+16],$a2 + mov $a3,$t3 + ldx [%sp+LOCALS64+$S+24],$a3 + call __ecp_nistz256_add_noload_vis3 ! p256_mul_by_3(M, M); + add %sp,LOCALS64+$M,$rp + + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(tmp0, S); + add %sp,LOCALS64+$tmp0,$rp + + ldx [%sp+LOCALS64+$S],$bi ! forward load + ldx [%sp+LOCALS64+$in_x],$a0 + ldx [%sp+LOCALS64+$in_x+8],$a1 + ldx [%sp+LOCALS64+$in_x+16],$a2 + ldx [%sp+LOCALS64+$in_x+24],$a3 + + call __ecp_nistz256_div_by_2_vis3 ! p256_div_by_2(res_y, tmp0); + add %sp,LOCALS64+$res_y,$rp + + add %sp,LOCALS64+$S,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S, S, in_x); + add %sp,LOCALS64+$S,$rp + + ldx [%sp+LOCALS64+$M],$a0 ! forward load + ldx [%sp+LOCALS64+$M+8],$a1 + ldx [%sp+LOCALS64+$M+16],$a2 + ldx [%sp+LOCALS64+$M+24],$a3 + + call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(tmp0, S); + add %sp,LOCALS64+$tmp0,$rp + + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(res_x, M); + add %sp,LOCALS64+$res_x,$rp + + add %sp,LOCALS64+$tmp0,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, tmp0); + add %sp,LOCALS64+$res_x,$rp + + ldx [%sp+LOCALS64+$M],$a0 ! forward load + ldx [%sp+LOCALS64+$M+8],$a1 + ldx [%sp+LOCALS64+$M+16],$a2 + ldx [%sp+LOCALS64+$M+24],$a3 + + add %sp,LOCALS64+$S,$bp + call __ecp_nistz256_sub_morf_vis3 ! p256_sub(S, S, res_x); + add %sp,LOCALS64+$S,$rp + + mov $acc0,$bi + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S, S, M); + add %sp,LOCALS64+$S,$rp + + ldx [%sp+LOCALS64+$res_x],$a0 ! forward load + ldx [%sp+LOCALS64+$res_x+8],$a1 + ldx [%sp+LOCALS64+$res_x+16],$a2 + ldx [%sp+LOCALS64+$res_x+24],$a3 + + add %sp,LOCALS64+$res_y,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, S, res_y); + add %sp,LOCALS64+$res_y,$bp + + ! convert output to uint_32[8] + srlx $a0,32,$t0 + srlx $a1,32,$t1 + st $a0,[$rp_real] ! res_x + srlx $a2,32,$t2 + st $t0,[$rp_real+4] + srlx $a3,32,$t3 + st $a1,[$rp_real+8] + st $t1,[$rp_real+12] + st $a2,[$rp_real+16] + st $t2,[$rp_real+20] + st $a3,[$rp_real+24] + st $t3,[$rp_real+28] + + ldx [%sp+LOCALS64+$res_z],$a0 ! forward load + srlx $acc0,32,$t0 + ldx [%sp+LOCALS64+$res_z+8],$a1 + srlx $acc1,32,$t1 + ldx [%sp+LOCALS64+$res_z+16],$a2 + srlx $acc2,32,$t2 + ldx [%sp+LOCALS64+$res_z+24],$a3 + srlx $acc3,32,$t3 + st $acc0,[$rp_real+32] ! res_y + st $t0, [$rp_real+32+4] + st $acc1,[$rp_real+32+8] + st $t1, [$rp_real+32+12] + st $acc2,[$rp_real+32+16] + st $t2, [$rp_real+32+20] + st $acc3,[$rp_real+32+24] + st $t3, [$rp_real+32+28] + + srlx $a0,32,$t0 + srlx $a1,32,$t1 + st $a0,[$rp_real+64] ! res_z + srlx $a2,32,$t2 + st $t0,[$rp_real+64+4] + srlx $a3,32,$t3 + st $a1,[$rp_real+64+8] + st $t1,[$rp_real+64+12] + st $a2,[$rp_real+64+16] + st $t2,[$rp_real+64+20] + st $a3,[$rp_real+64+24] + st $t3,[$rp_real+64+28] + + ret + restore +.type ecp_nistz256_point_double_vis3,#function +.size ecp_nistz256_point_double_vis3,.-ecp_nistz256_point_double_vis3 +___ +} +######################################################################## +# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT *in2); +{ +my ($res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y,$in2_z, + $H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); +my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); + +# above map() describes stack layout with 18 temporary +# 256-bit vectors on top. Then we reserve some space for +# !in1infty, !in2infty and result of check for zero. + +$code.=<<___; +.align 32 +ecp_nistz256_point_add_vis3: + save %sp,-STACK64_FRAME-32*18-32,%sp + + mov $rp,$rp_real + mov -1,$minus1 + mov -2,$poly3 + sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000 + srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE + + ! convert input to uint64_t[4] + ld [$bp],$a0 ! in2_x + ld [$bp+4],$t0 + ld [$bp+8],$a1 + ld [$bp+12],$t1 + ld [$bp+16],$a2 + ld [$bp+20],$t2 + ld [$bp+24],$a3 + ld [$bp+28],$t3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + ld [$bp+32],$acc0 ! in2_y + or $a0,$t0,$a0 + ld [$bp+32+4],$t0 + sllx $t2,32,$t2 + ld [$bp+32+8],$acc1 + or $a1,$t1,$a1 + ld [$bp+32+12],$t1 + sllx $t3,32,$t3 + ld [$bp+32+16],$acc2 + or $a2,$t2,$a2 + ld [$bp+32+20],$t2 + or $a3,$t3,$a3 + ld [$bp+32+24],$acc3 + sllx $t0,32,$t0 + ld [$bp+32+28],$t3 + sllx $t1,32,$t1 + stx $a0,[%sp+LOCALS64+$in2_x] + sllx $t2,32,$t2 + stx $a1,[%sp+LOCALS64+$in2_x+8] + sllx $t3,32,$t3 + stx $a2,[%sp+LOCALS64+$in2_x+16] + or $acc0,$t0,$acc0 + stx $a3,[%sp+LOCALS64+$in2_x+24] + or $acc1,$t1,$acc1 + stx $acc0,[%sp+LOCALS64+$in2_y] + or $acc2,$t2,$acc2 + stx $acc1,[%sp+LOCALS64+$in2_y+8] + or $acc3,$t3,$acc3 + stx $acc2,[%sp+LOCALS64+$in2_y+16] + stx $acc3,[%sp+LOCALS64+$in2_y+24] + + ld [$bp+64],$acc0 ! in2_z + ld [$bp+64+4],$t0 + ld [$bp+64+8],$acc1 + ld [$bp+64+12],$t1 + ld [$bp+64+16],$acc2 + ld [$bp+64+20],$t2 + ld [$bp+64+24],$acc3 + ld [$bp+64+28],$t3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + ld [$ap],$a0 ! in1_x + or $acc0,$t0,$acc0 + ld [$ap+4],$t0 + sllx $t2,32,$t2 + ld [$ap+8],$a1 + or $acc1,$t1,$acc1 + ld [$ap+12],$t1 + sllx $t3,32,$t3 + ld [$ap+16],$a2 + or $acc2,$t2,$acc2 + ld [$ap+20],$t2 + or $acc3,$t3,$acc3 + ld [$ap+24],$a3 + sllx $t0,32,$t0 + ld [$ap+28],$t3 + sllx $t1,32,$t1 + stx $acc0,[%sp+LOCALS64+$in2_z] + sllx $t2,32,$t2 + stx $acc1,[%sp+LOCALS64+$in2_z+8] + sllx $t3,32,$t3 + stx $acc2,[%sp+LOCALS64+$in2_z+16] + stx $acc3,[%sp+LOCALS64+$in2_z+24] + + or $acc1,$acc0,$acc0 + or $acc3,$acc2,$acc2 + or $acc2,$acc0,$acc0 + movrnz $acc0,-1,$acc0 ! !in2infty + stx $acc0,[%fp+STACK_BIAS-8] + + or $a0,$t0,$a0 + ld [$ap+32],$acc0 ! in1_y + or $a1,$t1,$a1 + ld [$ap+32+4],$t0 + or $a2,$t2,$a2 + ld [$ap+32+8],$acc1 + or $a3,$t3,$a3 + ld [$ap+32+12],$t1 + ld [$ap+32+16],$acc2 + ld [$ap+32+20],$t2 + ld [$ap+32+24],$acc3 + sllx $t0,32,$t0 + ld [$ap+32+28],$t3 + sllx $t1,32,$t1 + stx $a0,[%sp+LOCALS64+$in1_x] + sllx $t2,32,$t2 + stx $a1,[%sp+LOCALS64+$in1_x+8] + sllx $t3,32,$t3 + stx $a2,[%sp+LOCALS64+$in1_x+16] + or $acc0,$t0,$acc0 + stx $a3,[%sp+LOCALS64+$in1_x+24] + or $acc1,$t1,$acc1 + stx $acc0,[%sp+LOCALS64+$in1_y] + or $acc2,$t2,$acc2 + stx $acc1,[%sp+LOCALS64+$in1_y+8] + or $acc3,$t3,$acc3 + stx $acc2,[%sp+LOCALS64+$in1_y+16] + stx $acc3,[%sp+LOCALS64+$in1_y+24] + + ldx [%sp+LOCALS64+$in2_z],$a0 ! forward load + ldx [%sp+LOCALS64+$in2_z+8],$a1 + ldx [%sp+LOCALS64+$in2_z+16],$a2 + ldx [%sp+LOCALS64+$in2_z+24],$a3 + + ld [$ap+64],$acc0 ! in1_z + ld [$ap+64+4],$t0 + ld [$ap+64+8],$acc1 + ld [$ap+64+12],$t1 + ld [$ap+64+16],$acc2 + ld [$ap+64+20],$t2 + ld [$ap+64+24],$acc3 + ld [$ap+64+28],$t3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + or $acc0,$t0,$acc0 + sllx $t2,32,$t2 + or $acc1,$t1,$acc1 + sllx $t3,32,$t3 + stx $acc0,[%sp+LOCALS64+$in1_z] + or $acc2,$t2,$acc2 + stx $acc1,[%sp+LOCALS64+$in1_z+8] + or $acc3,$t3,$acc3 + stx $acc2,[%sp+LOCALS64+$in1_z+16] + stx $acc3,[%sp+LOCALS64+$in1_z+24] + + or $acc1,$acc0,$acc0 + or $acc3,$acc2,$acc2 + or $acc2,$acc0,$acc0 + movrnz $acc0,-1,$acc0 ! !in1infty + stx $acc0,[%fp+STACK_BIAS-16] + + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z2sqr, in2_z); + add %sp,LOCALS64+$Z2sqr,$rp + + ldx [%sp+LOCALS64+$in1_z],$a0 + ldx [%sp+LOCALS64+$in1_z+8],$a1 + ldx [%sp+LOCALS64+$in1_z+16],$a2 + ldx [%sp+LOCALS64+$in1_z+24],$a3 + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z1sqr, in1_z); + add %sp,LOCALS64+$Z1sqr,$rp + + ldx [%sp+LOCALS64+$Z2sqr],$bi + ldx [%sp+LOCALS64+$in2_z],$a0 + ldx [%sp+LOCALS64+$in2_z+8],$a1 + ldx [%sp+LOCALS64+$in2_z+16],$a2 + ldx [%sp+LOCALS64+$in2_z+24],$a3 + add %sp,LOCALS64+$Z2sqr,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S1, Z2sqr, in2_z); + add %sp,LOCALS64+$S1,$rp + + ldx [%sp+LOCALS64+$Z1sqr],$bi + ldx [%sp+LOCALS64+$in1_z],$a0 + ldx [%sp+LOCALS64+$in1_z+8],$a1 + ldx [%sp+LOCALS64+$in1_z+16],$a2 + ldx [%sp+LOCALS64+$in1_z+24],$a3 + add %sp,LOCALS64+$Z1sqr,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, Z1sqr, in1_z); + add %sp,LOCALS64+$S2,$rp + + ldx [%sp+LOCALS64+$S1],$bi + ldx [%sp+LOCALS64+$in1_y],$a0 + ldx [%sp+LOCALS64+$in1_y+8],$a1 + ldx [%sp+LOCALS64+$in1_y+16],$a2 + ldx [%sp+LOCALS64+$in1_y+24],$a3 + add %sp,LOCALS64+$S1,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S1, S1, in1_y); + add %sp,LOCALS64+$S1,$rp + + ldx [%sp+LOCALS64+$S2],$bi + ldx [%sp+LOCALS64+$in2_y],$a0 + ldx [%sp+LOCALS64+$in2_y+8],$a1 + ldx [%sp+LOCALS64+$in2_y+16],$a2 + ldx [%sp+LOCALS64+$in2_y+24],$a3 + add %sp,LOCALS64+$S2,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S2, in2_y); + add %sp,LOCALS64+$S2,$rp + + ldx [%sp+LOCALS64+$Z2sqr],$bi ! forward load + ldx [%sp+LOCALS64+$in1_x],$a0 + ldx [%sp+LOCALS64+$in1_x+8],$a1 + ldx [%sp+LOCALS64+$in1_x+16],$a2 + ldx [%sp+LOCALS64+$in1_x+24],$a3 + + add %sp,LOCALS64+$S1,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(R, S2, S1); + add %sp,LOCALS64+$R,$rp + + or $acc1,$acc0,$acc0 ! see if result is zero + or $acc3,$acc2,$acc2 + or $acc2,$acc0,$acc0 + stx $acc0,[%fp+STACK_BIAS-24] + + add %sp,LOCALS64+$Z2sqr,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U1, in1_x, Z2sqr); + add %sp,LOCALS64+$U1,$rp + + ldx [%sp+LOCALS64+$Z1sqr],$bi + ldx [%sp+LOCALS64+$in2_x],$a0 + ldx [%sp+LOCALS64+$in2_x+8],$a1 + ldx [%sp+LOCALS64+$in2_x+16],$a2 + ldx [%sp+LOCALS64+$in2_x+24],$a3 + add %sp,LOCALS64+$Z1sqr,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, in2_x, Z1sqr); + add %sp,LOCALS64+$U2,$rp + + ldx [%sp+LOCALS64+$R],$a0 ! forward load + ldx [%sp+LOCALS64+$R+8],$a1 + ldx [%sp+LOCALS64+$R+16],$a2 + ldx [%sp+LOCALS64+$R+24],$a3 + + add %sp,LOCALS64+$U1,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(H, U2, U1); + add %sp,LOCALS64+$H,$rp + + or $acc1,$acc0,$acc0 ! see if result is zero + or $acc3,$acc2,$acc2 + orcc $acc2,$acc0,$acc0 + + bne,pt %xcc,.Ladd_proceed_vis3 ! is_equal(U1,U2)? + nop + + ldx [%fp+STACK_BIAS-8],$t0 + ldx [%fp+STACK_BIAS-16],$t1 + ldx [%fp+STACK_BIAS-24],$t2 + andcc $t0,$t1,%g0 + be,pt %xcc,.Ladd_proceed_vis3 ! (in1infty || in2infty)? + nop + andcc $t2,$t2,%g0 + be,a,pt %xcc,.Ldouble_shortcut_vis3 ! is_equal(S1,S2)? + add %sp,32*(12-10)+32,%sp ! difference in frame sizes + + st %g0,[$rp_real] + st %g0,[$rp_real+4] + st %g0,[$rp_real+8] + st %g0,[$rp_real+12] + st %g0,[$rp_real+16] + st %g0,[$rp_real+20] + st %g0,[$rp_real+24] + st %g0,[$rp_real+28] + st %g0,[$rp_real+32] + st %g0,[$rp_real+32+4] + st %g0,[$rp_real+32+8] + st %g0,[$rp_real+32+12] + st %g0,[$rp_real+32+16] + st %g0,[$rp_real+32+20] + st %g0,[$rp_real+32+24] + st %g0,[$rp_real+32+28] + st %g0,[$rp_real+64] + st %g0,[$rp_real+64+4] + st %g0,[$rp_real+64+8] + st %g0,[$rp_real+64+12] + st %g0,[$rp_real+64+16] + st %g0,[$rp_real+64+20] + st %g0,[$rp_real+64+24] + st %g0,[$rp_real+64+28] + b .Ladd_done_vis3 + nop + +.align 16 +.Ladd_proceed_vis3: + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Rsqr, R); + add %sp,LOCALS64+$Rsqr,$rp + + ldx [%sp+LOCALS64+$H],$bi + ldx [%sp+LOCALS64+$in1_z],$a0 + ldx [%sp+LOCALS64+$in1_z+8],$a1 + ldx [%sp+LOCALS64+$in1_z+16],$a2 + ldx [%sp+LOCALS64+$in1_z+24],$a3 + add %sp,LOCALS64+$H,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, H, in1_z); + add %sp,LOCALS64+$res_z,$rp + + ldx [%sp+LOCALS64+$H],$a0 + ldx [%sp+LOCALS64+$H+8],$a1 + ldx [%sp+LOCALS64+$H+16],$a2 + ldx [%sp+LOCALS64+$H+24],$a3 + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Hsqr, H); + add %sp,LOCALS64+$Hsqr,$rp + + ldx [%sp+LOCALS64+$res_z],$bi + ldx [%sp+LOCALS64+$in2_z],$a0 + ldx [%sp+LOCALS64+$in2_z+8],$a1 + ldx [%sp+LOCALS64+$in2_z+16],$a2 + ldx [%sp+LOCALS64+$in2_z+24],$a3 + add %sp,LOCALS64+$res_z,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, res_z, in2_z); + add %sp,LOCALS64+$res_z,$rp + + ldx [%sp+LOCALS64+$H],$bi + ldx [%sp+LOCALS64+$Hsqr],$a0 + ldx [%sp+LOCALS64+$Hsqr+8],$a1 + ldx [%sp+LOCALS64+$Hsqr+16],$a2 + ldx [%sp+LOCALS64+$Hsqr+24],$a3 + add %sp,LOCALS64+$H,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(Hcub, Hsqr, H); + add %sp,LOCALS64+$Hcub,$rp + + ldx [%sp+LOCALS64+$U1],$bi + ldx [%sp+LOCALS64+$Hsqr],$a0 + ldx [%sp+LOCALS64+$Hsqr+8],$a1 + ldx [%sp+LOCALS64+$Hsqr+16],$a2 + ldx [%sp+LOCALS64+$Hsqr+24],$a3 + add %sp,LOCALS64+$U1,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, U1, Hsqr); + add %sp,LOCALS64+$U2,$rp + + call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(Hsqr, U2); + add %sp,LOCALS64+$Hsqr,$rp + + add %sp,LOCALS64+$Rsqr,$bp + call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_x, Rsqr, Hsqr); + add %sp,LOCALS64+$res_x,$rp + + add %sp,LOCALS64+$Hcub,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, Hcub); + add %sp,LOCALS64+$res_x,$rp + + ldx [%sp+LOCALS64+$S1],$bi ! forward load + ldx [%sp+LOCALS64+$Hcub],$a0 + ldx [%sp+LOCALS64+$Hcub+8],$a1 + ldx [%sp+LOCALS64+$Hcub+16],$a2 + ldx [%sp+LOCALS64+$Hcub+24],$a3 + + add %sp,LOCALS64+$U2,$bp + call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_y, U2, res_x); + add %sp,LOCALS64+$res_y,$rp + + add %sp,LOCALS64+$S1,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S1, Hcub); + add %sp,LOCALS64+$S2,$rp + + ldx [%sp+LOCALS64+$R],$bi + ldx [%sp+LOCALS64+$res_y],$a0 + ldx [%sp+LOCALS64+$res_y+8],$a1 + ldx [%sp+LOCALS64+$res_y+16],$a2 + ldx [%sp+LOCALS64+$res_y+24],$a3 + add %sp,LOCALS64+$R,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_y, res_y, R); + add %sp,LOCALS64+$res_y,$rp + + add %sp,LOCALS64+$S2,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, res_y, S2); + add %sp,LOCALS64+$res_y,$rp + + ldx [%fp+STACK_BIAS-16],$t1 ! !in1infty + ldx [%fp+STACK_BIAS-8],$t2 ! !in2infty +___ +for($i=0;$i<96;$i+=16) { # conditional moves +$code.=<<___; + ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res + ldx [%sp+LOCALS64+$res_x+$i+8],$acc1 + ldx [%sp+LOCALS64+$in2_x+$i],$acc2 ! in2 + ldx [%sp+LOCALS64+$in2_x+$i+8],$acc3 + ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1 + ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5 + movrz $t1,$acc2,$acc0 + movrz $t1,$acc3,$acc1 + movrz $t2,$acc4,$acc0 + movrz $t2,$acc5,$acc1 + srlx $acc0,32,$acc2 + srlx $acc1,32,$acc3 + st $acc0,[$rp_real+$i] + st $acc2,[$rp_real+$i+4] + st $acc1,[$rp_real+$i+8] + st $acc3,[$rp_real+$i+12] +___ +} +$code.=<<___; +.Ladd_done_vis3: + ret + restore +.type ecp_nistz256_point_add_vis3,#function +.size ecp_nistz256_point_add_vis3,.-ecp_nistz256_point_add_vis3 +___ +} +######################################################################## +# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT_AFFINE *in2); +{ +my ($res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y, + $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); +my $Z1sqr = $S2; +# above map() describes stack layout with 15 temporary +# 256-bit vectors on top. Then we reserve some space for +# !in1infty and !in2infty. + +$code.=<<___; +.align 32 +ecp_nistz256_point_add_affine_vis3: + save %sp,-STACK64_FRAME-32*15-32,%sp + + mov $rp,$rp_real + mov -1,$minus1 + mov -2,$poly3 + sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000 + srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE + + ! convert input to uint64_t[4] + ld [$bp],$a0 ! in2_x + ld [$bp+4],$t0 + ld [$bp+8],$a1 + ld [$bp+12],$t1 + ld [$bp+16],$a2 + ld [$bp+20],$t2 + ld [$bp+24],$a3 + ld [$bp+28],$t3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + ld [$bp+32],$acc0 ! in2_y + or $a0,$t0,$a0 + ld [$bp+32+4],$t0 + sllx $t2,32,$t2 + ld [$bp+32+8],$acc1 + or $a1,$t1,$a1 + ld [$bp+32+12],$t1 + sllx $t3,32,$t3 + ld [$bp+32+16],$acc2 + or $a2,$t2,$a2 + ld [$bp+32+20],$t2 + or $a3,$t3,$a3 + ld [$bp+32+24],$acc3 + sllx $t0,32,$t0 + ld [$bp+32+28],$t3 + sllx $t1,32,$t1 + stx $a0,[%sp+LOCALS64+$in2_x] + sllx $t2,32,$t2 + stx $a1,[%sp+LOCALS64+$in2_x+8] + sllx $t3,32,$t3 + stx $a2,[%sp+LOCALS64+$in2_x+16] + or $acc0,$t0,$acc0 + stx $a3,[%sp+LOCALS64+$in2_x+24] + or $acc1,$t1,$acc1 + stx $acc0,[%sp+LOCALS64+$in2_y] + or $acc2,$t2,$acc2 + stx $acc1,[%sp+LOCALS64+$in2_y+8] + or $acc3,$t3,$acc3 + stx $acc2,[%sp+LOCALS64+$in2_y+16] + stx $acc3,[%sp+LOCALS64+$in2_y+24] + + or $a1,$a0,$a0 + or $a3,$a2,$a2 + or $acc1,$acc0,$acc0 + or $acc3,$acc2,$acc2 + or $a2,$a0,$a0 + or $acc2,$acc0,$acc0 + or $acc0,$a0,$a0 + movrnz $a0,-1,$a0 ! !in2infty + stx $a0,[%fp+STACK_BIAS-8] + + ld [$ap],$a0 ! in1_x + ld [$ap+4],$t0 + ld [$ap+8],$a1 + ld [$ap+12],$t1 + ld [$ap+16],$a2 + ld [$ap+20],$t2 + ld [$ap+24],$a3 + ld [$ap+28],$t3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + ld [$ap+32],$acc0 ! in1_y + or $a0,$t0,$a0 + ld [$ap+32+4],$t0 + sllx $t2,32,$t2 + ld [$ap+32+8],$acc1 + or $a1,$t1,$a1 + ld [$ap+32+12],$t1 + sllx $t3,32,$t3 + ld [$ap+32+16],$acc2 + or $a2,$t2,$a2 + ld [$ap+32+20],$t2 + or $a3,$t3,$a3 + ld [$ap+32+24],$acc3 + sllx $t0,32,$t0 + ld [$ap+32+28],$t3 + sllx $t1,32,$t1 + stx $a0,[%sp+LOCALS64+$in1_x] + sllx $t2,32,$t2 + stx $a1,[%sp+LOCALS64+$in1_x+8] + sllx $t3,32,$t3 + stx $a2,[%sp+LOCALS64+$in1_x+16] + or $acc0,$t0,$acc0 + stx $a3,[%sp+LOCALS64+$in1_x+24] + or $acc1,$t1,$acc1 + stx $acc0,[%sp+LOCALS64+$in1_y] + or $acc2,$t2,$acc2 + stx $acc1,[%sp+LOCALS64+$in1_y+8] + or $acc3,$t3,$acc3 + stx $acc2,[%sp+LOCALS64+$in1_y+16] + stx $acc3,[%sp+LOCALS64+$in1_y+24] + + ld [$ap+64],$a0 ! in1_z + ld [$ap+64+4],$t0 + ld [$ap+64+8],$a1 + ld [$ap+64+12],$t1 + ld [$ap+64+16],$a2 + ld [$ap+64+20],$t2 + ld [$ap+64+24],$a3 + ld [$ap+64+28],$t3 + sllx $t0,32,$t0 + sllx $t1,32,$t1 + or $a0,$t0,$a0 + sllx $t2,32,$t2 + or $a1,$t1,$a1 + sllx $t3,32,$t3 + stx $a0,[%sp+LOCALS64+$in1_z] + or $a2,$t2,$a2 + stx $a1,[%sp+LOCALS64+$in1_z+8] + or $a3,$t3,$a3 + stx $a2,[%sp+LOCALS64+$in1_z+16] + stx $a3,[%sp+LOCALS64+$in1_z+24] + + or $a1,$a0,$t0 + or $a3,$a2,$t2 + or $t2,$t0,$t0 + movrnz $t0,-1,$t0 ! !in1infty + stx $t0,[%fp+STACK_BIAS-16] + + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z1sqr, in1_z); + add %sp,LOCALS64+$Z1sqr,$rp + + ldx [%sp+LOCALS64+$in2_x],$bi + mov $acc0,$a0 + mov $acc1,$a1 + mov $acc2,$a2 + mov $acc3,$a3 + add %sp,LOCALS64+$in2_x,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, Z1sqr, in2_x); + add %sp,LOCALS64+$U2,$rp + + ldx [%sp+LOCALS64+$Z1sqr],$bi ! forward load + ldx [%sp+LOCALS64+$in1_z],$a0 + ldx [%sp+LOCALS64+$in1_z+8],$a1 + ldx [%sp+LOCALS64+$in1_z+16],$a2 + ldx [%sp+LOCALS64+$in1_z+24],$a3 + + add %sp,LOCALS64+$in1_x,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(H, U2, in1_x); + add %sp,LOCALS64+$H,$rp + + add %sp,LOCALS64+$Z1sqr,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, Z1sqr, in1_z); + add %sp,LOCALS64+$S2,$rp + + ldx [%sp+LOCALS64+$H],$bi + ldx [%sp+LOCALS64+$in1_z],$a0 + ldx [%sp+LOCALS64+$in1_z+8],$a1 + ldx [%sp+LOCALS64+$in1_z+16],$a2 + ldx [%sp+LOCALS64+$in1_z+24],$a3 + add %sp,LOCALS64+$H,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, H, in1_z); + add %sp,LOCALS64+$res_z,$rp + + ldx [%sp+LOCALS64+$S2],$bi + ldx [%sp+LOCALS64+$in2_y],$a0 + ldx [%sp+LOCALS64+$in2_y+8],$a1 + ldx [%sp+LOCALS64+$in2_y+16],$a2 + ldx [%sp+LOCALS64+$in2_y+24],$a3 + add %sp,LOCALS64+$S2,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S2, in2_y); + add %sp,LOCALS64+$S2,$rp + + ldx [%sp+LOCALS64+$H],$a0 ! forward load + ldx [%sp+LOCALS64+$H+8],$a1 + ldx [%sp+LOCALS64+$H+16],$a2 + ldx [%sp+LOCALS64+$H+24],$a3 + + add %sp,LOCALS64+$in1_y,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(R, S2, in1_y); + add %sp,LOCALS64+$R,$rp + + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Hsqr, H); + add %sp,LOCALS64+$Hsqr,$rp + + ldx [%sp+LOCALS64+$R],$a0 + ldx [%sp+LOCALS64+$R+8],$a1 + ldx [%sp+LOCALS64+$R+16],$a2 + ldx [%sp+LOCALS64+$R+24],$a3 + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Rsqr, R); + add %sp,LOCALS64+$Rsqr,$rp + + ldx [%sp+LOCALS64+$H],$bi + ldx [%sp+LOCALS64+$Hsqr],$a0 + ldx [%sp+LOCALS64+$Hsqr+8],$a1 + ldx [%sp+LOCALS64+$Hsqr+16],$a2 + ldx [%sp+LOCALS64+$Hsqr+24],$a3 + add %sp,LOCALS64+$H,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(Hcub, Hsqr, H); + add %sp,LOCALS64+$Hcub,$rp + + ldx [%sp+LOCALS64+$Hsqr],$bi + ldx [%sp+LOCALS64+$in1_x],$a0 + ldx [%sp+LOCALS64+$in1_x+8],$a1 + ldx [%sp+LOCALS64+$in1_x+16],$a2 + ldx [%sp+LOCALS64+$in1_x+24],$a3 + add %sp,LOCALS64+$Hsqr,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, in1_x, Hsqr); + add %sp,LOCALS64+$U2,$rp + + call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(Hsqr, U2); + add %sp,LOCALS64+$Hsqr,$rp + + add %sp,LOCALS64+$Rsqr,$bp + call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_x, Rsqr, Hsqr); + add %sp,LOCALS64+$res_x,$rp + + add %sp,LOCALS64+$Hcub,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, Hcub); + add %sp,LOCALS64+$res_x,$rp + + ldx [%sp+LOCALS64+$Hcub],$bi ! forward load + ldx [%sp+LOCALS64+$in1_y],$a0 + ldx [%sp+LOCALS64+$in1_y+8],$a1 + ldx [%sp+LOCALS64+$in1_y+16],$a2 + ldx [%sp+LOCALS64+$in1_y+24],$a3 + + add %sp,LOCALS64+$U2,$bp + call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_y, U2, res_x); + add %sp,LOCALS64+$res_y,$rp + + add %sp,LOCALS64+$Hcub,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, in1_y, Hcub); + add %sp,LOCALS64+$S2,$rp + + ldx [%sp+LOCALS64+$R],$bi + ldx [%sp+LOCALS64+$res_y],$a0 + ldx [%sp+LOCALS64+$res_y+8],$a1 + ldx [%sp+LOCALS64+$res_y+16],$a2 + ldx [%sp+LOCALS64+$res_y+24],$a3 + add %sp,LOCALS64+$R,$bp + call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_y, res_y, R); + add %sp,LOCALS64+$res_y,$rp + + add %sp,LOCALS64+$S2,$bp + call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, res_y, S2); + add %sp,LOCALS64+$res_y,$rp + + ldx [%fp+STACK_BIAS-16],$t1 ! !in1infty + ldx [%fp+STACK_BIAS-8],$t2 ! !in2infty +1: call .+8 + add %o7,.Lone_mont_vis3-1b,$bp +___ +for($i=0;$i<64;$i+=16) { # conditional moves +$code.=<<___; + ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res + ldx [%sp+LOCALS64+$res_x+$i+8],$acc1 + ldx [%sp+LOCALS64+$in2_x+$i],$acc2 ! in2 + ldx [%sp+LOCALS64+$in2_x+$i+8],$acc3 + ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1 + ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5 + movrz $t1,$acc2,$acc0 + movrz $t1,$acc3,$acc1 + movrz $t2,$acc4,$acc0 + movrz $t2,$acc5,$acc1 + srlx $acc0,32,$acc2 + srlx $acc1,32,$acc3 + st $acc0,[$rp_real+$i] + st $acc2,[$rp_real+$i+4] + st $acc1,[$rp_real+$i+8] + st $acc3,[$rp_real+$i+12] +___ +} +for(;$i<96;$i+=16) { +$code.=<<___; + ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res + ldx [%sp+LOCALS64+$res_x+$i+8],$acc1 + ldx [$bp+$i-64],$acc2 ! "in2" + ldx [$bp+$i-64+8],$acc3 + ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1 + ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5 + movrz $t1,$acc2,$acc0 + movrz $t1,$acc3,$acc1 + movrz $t2,$acc4,$acc0 + movrz $t2,$acc5,$acc1 + srlx $acc0,32,$acc2 + srlx $acc1,32,$acc3 + st $acc0,[$rp_real+$i] + st $acc2,[$rp_real+$i+4] + st $acc1,[$rp_real+$i+8] + st $acc3,[$rp_real+$i+12] +___ +} +$code.=<<___; + ret + restore +.type ecp_nistz256_point_add_affine_vis3,#function +.size ecp_nistz256_point_add_affine_vis3,.-ecp_nistz256_point_add_affine_vis3 +.align 64 +.Lone_mont_vis3: +.long 0x00000000,0x00000001, 0xffffffff,0x00000000 +.long 0xffffffff,0xffffffff, 0x00000000,0xfffffffe +.align 64 +___ +} }}} + +# Purpose of these subroutines is to explicitly encode VIS instructions, +# so that one can compile the module without having to specify VIS +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# Idea is to reserve for option to produce "universal" binary and let +# programmer detect if current CPU is VIS capable at run-time. +sub unvis3 { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); +my ($ref,$opf); +my %visopf = ( "addxc" => 0x011, + "addxccc" => 0x013, + "umulxhi" => 0x016 ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if ($opf=$visopf{$mnemonic}) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%([goli])([0-9])/); + $_=$bias{$1}+$2; + } + + return sprintf ".word\t0x%08x !%s", + 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ + &unvis3($1,$2,$3,$4) + /ge; + + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-x86.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-x86.pl new file mode 100755 index 000000000..e926d69b0 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-x86.pl @@ -0,0 +1,1862 @@ +#! /usr/bin/env perl +# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# ECP_NISTZ256 module for x86/SSE2. +# +# October 2014. +# +# Original ECP_NISTZ256 submission targeting x86_64 is detailed in +# http://eprint.iacr.org/2013/816. In the process of adaptation +# original .c module was made 32-bit savvy in order to make this +# implementation possible. +# +# with/without -DECP_NISTZ256_ASM +# Pentium +66-163% +# PIII +72-172% +# P4 +65-132% +# Core2 +90-215% +# Sandy Bridge +105-265% (contemporary i[57]-* are all close to this) +# Atom +65-155% +# Opteron +54-110% +# Bulldozer +99-240% +# VIA Nano +93-290% +# +# Ranges denote minimum and maximum improvement coefficients depending +# on benchmark. Lower coefficients are for ECDSA sign, server-side +# operation. Keep in mind that +200% means 3x improvement. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); + +$sse2=0; +for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } + +&external_label("OPENSSL_ia32cap_P") if ($sse2); + + +######################################################################## +# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 +# +open TABLE,"<ecp_nistz256_table.c" or +open TABLE,"<${dir}../ecp_nistz256_table.c" or +die "failed to open ecp_nistz256_table.c:",$!; + +use integer; + +foreach(<TABLE>) { + s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; +} +close TABLE; + +# See ecp_nistz256_table.c for explanation for why it's 64*16*37. +# 64*16*37-1 is because $#arr returns last valid index or @arr, not +# amount of elements. +die "insane number of elements" if ($#arr != 64*16*37-1); + +&public_label("ecp_nistz256_precomputed"); +&align(4096); +&set_label("ecp_nistz256_precomputed"); + +######################################################################## +# this conversion smashes P256_POINT_AFFINE by individual bytes with +# 64 byte interval, similar to +# 1111222233334444 +# 1234123412341234 +for(1..37) { + @tbl = splice(@arr,0,64*16); + for($i=0;$i<64;$i++) { + undef @line; + for($j=0;$j<64;$j++) { + push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; + } + &data_byte(join(',',map { sprintf "0x%02x",$_} @line)); + } +} + +######################################################################## +# Keep in mind that constants are stored least to most significant word +&static_label("RR"); +&set_label("RR",64); +&data_word(3,0,-1,-5,-2,-1,-3,4); # 2^512 mod P-256 + +&static_label("ONE_mont"); +&set_label("ONE_mont"); +&data_word(1,0,0,-1,-1,-1,-2,0); + +&static_label("ONE"); +&set_label("ONE"); +&data_word(1,0,0,0,0,0,0,0); +&asciz("ECP_NISZ256 for x86/SSE2, CRYPTOGAMS by <appro\@openssl.org>"); +&align(64); + +######################################################################## +# void ecp_nistz256_mul_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]); +&function_begin("ecp_nistz256_mul_by_2"); + &mov ("esi",&wparam(1)); + &mov ("edi",&wparam(0)); + &mov ("ebp","esi"); +######################################################################## +# common pattern for internal functions is that %edi is result pointer, +# %esi and %ebp are input ones, %ebp being optional. %edi is preserved. + &call ("_ecp_nistz256_add"); +&function_end("ecp_nistz256_mul_by_2"); + +######################################################################## +# void ecp_nistz256_mul_by_3(BN_ULONG edi[8],const BN_ULONG esi[8]); +&function_begin("ecp_nistz256_mul_by_3"); + &mov ("esi",&wparam(1)); + # multiplication by 3 is performed + # as 2*n+n, but we can't use output + # to store 2*n, because if output + # pointer equals to input, then + # we'll get 2*n+2*n. + &stack_push(8); # therefore we need to allocate + # 256-bit intermediate buffer. + &mov ("edi","esp"); + &mov ("ebp","esi"); + &call ("_ecp_nistz256_add"); + &lea ("esi",&DWP(0,"edi")); + &mov ("ebp",&wparam(1)); + &mov ("edi",&wparam(0)); + &call ("_ecp_nistz256_add"); + &stack_pop(8); +&function_end("ecp_nistz256_mul_by_3"); + +######################################################################## +# void ecp_nistz256_div_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]); +&function_begin("ecp_nistz256_div_by_2"); + &mov ("esi",&wparam(1)); + &mov ("edi",&wparam(0)); + &call ("_ecp_nistz256_div_by_2"); +&function_end("ecp_nistz256_div_by_2"); + +&function_begin_B("_ecp_nistz256_div_by_2"); + # tmp = a is odd ? a+mod : a + # + # note that because mod has special form, i.e. consists of + # 0xffffffff, 1 and 0s, we can conditionally synthesize it by + # assigning least significant bit of input to one register, + # %ebp, and its negative to another, %edx. + + &mov ("ebp",&DWP(0,"esi")); + &xor ("edx","edx"); + &mov ("ebx",&DWP(4,"esi")); + &mov ("eax","ebp"); + &and ("ebp",1); + &mov ("ecx",&DWP(8,"esi")); + &sub ("edx","ebp"); + + &add ("eax","edx"); + &adc ("ebx","edx"); + &mov (&DWP(0,"edi"),"eax"); + &adc ("ecx","edx"); + &mov (&DWP(4,"edi"),"ebx"); + &mov (&DWP(8,"edi"),"ecx"); + + &mov ("eax",&DWP(12,"esi")); + &mov ("ebx",&DWP(16,"esi")); + &adc ("eax",0); + &mov ("ecx",&DWP(20,"esi")); + &adc ("ebx",0); + &mov (&DWP(12,"edi"),"eax"); + &adc ("ecx",0); + &mov (&DWP(16,"edi"),"ebx"); + &mov (&DWP(20,"edi"),"ecx"); + + &mov ("eax",&DWP(24,"esi")); + &mov ("ebx",&DWP(28,"esi")); + &adc ("eax","ebp"); + &adc ("ebx","edx"); + &mov (&DWP(24,"edi"),"eax"); + &sbb ("esi","esi"); # broadcast carry bit + &mov (&DWP(28,"edi"),"ebx"); + + # ret = tmp >> 1 + + &mov ("eax",&DWP(0,"edi")); + &mov ("ebx",&DWP(4,"edi")); + &mov ("ecx",&DWP(8,"edi")); + &mov ("edx",&DWP(12,"edi")); + + &shr ("eax",1); + &mov ("ebp","ebx"); + &shl ("ebx",31); + &or ("eax","ebx"); + + &shr ("ebp",1); + &mov ("ebx","ecx"); + &shl ("ecx",31); + &mov (&DWP(0,"edi"),"eax"); + &or ("ebp","ecx"); + &mov ("eax",&DWP(16,"edi")); + + &shr ("ebx",1); + &mov ("ecx","edx"); + &shl ("edx",31); + &mov (&DWP(4,"edi"),"ebp"); + &or ("ebx","edx"); + &mov ("ebp",&DWP(20,"edi")); + + &shr ("ecx",1); + &mov ("edx","eax"); + &shl ("eax",31); + &mov (&DWP(8,"edi"),"ebx"); + &or ("ecx","eax"); + &mov ("ebx",&DWP(24,"edi")); + + &shr ("edx",1); + &mov ("eax","ebp"); + &shl ("ebp",31); + &mov (&DWP(12,"edi"),"ecx"); + &or ("edx","ebp"); + &mov ("ecx",&DWP(28,"edi")); + + &shr ("eax",1); + &mov ("ebp","ebx"); + &shl ("ebx",31); + &mov (&DWP(16,"edi"),"edx"); + &or ("eax","ebx"); + + &shr ("ebp",1); + &mov ("ebx","ecx"); + &shl ("ecx",31); + &mov (&DWP(20,"edi"),"eax"); + &or ("ebp","ecx"); + + &shr ("ebx",1); + &shl ("esi",31); + &mov (&DWP(24,"edi"),"ebp"); + &or ("ebx","esi"); # handle top-most carry bit + &mov (&DWP(28,"edi"),"ebx"); + + &ret (); +&function_end_B("_ecp_nistz256_div_by_2"); + +######################################################################## +# void ecp_nistz256_add(BN_ULONG edi[8],const BN_ULONG esi[8], +# const BN_ULONG ebp[8]); +&function_begin("ecp_nistz256_add"); + &mov ("esi",&wparam(1)); + &mov ("ebp",&wparam(2)); + &mov ("edi",&wparam(0)); + &call ("_ecp_nistz256_add"); +&function_end("ecp_nistz256_add"); + +&function_begin_B("_ecp_nistz256_add"); + &mov ("eax",&DWP(0,"esi")); + &mov ("ebx",&DWP(4,"esi")); + &mov ("ecx",&DWP(8,"esi")); + &add ("eax",&DWP(0,"ebp")); + &mov ("edx",&DWP(12,"esi")); + &adc ("ebx",&DWP(4,"ebp")); + &mov (&DWP(0,"edi"),"eax"); + &adc ("ecx",&DWP(8,"ebp")); + &mov (&DWP(4,"edi"),"ebx"); + &adc ("edx",&DWP(12,"ebp")); + &mov (&DWP(8,"edi"),"ecx"); + &mov (&DWP(12,"edi"),"edx"); + + &mov ("eax",&DWP(16,"esi")); + &mov ("ebx",&DWP(20,"esi")); + &mov ("ecx",&DWP(24,"esi")); + &adc ("eax",&DWP(16,"ebp")); + &mov ("edx",&DWP(28,"esi")); + &adc ("ebx",&DWP(20,"ebp")); + &mov (&DWP(16,"edi"),"eax"); + &adc ("ecx",&DWP(24,"ebp")); + &mov (&DWP(20,"edi"),"ebx"); + &mov ("esi",0); + &adc ("edx",&DWP(28,"ebp")); + &mov (&DWP(24,"edi"),"ecx"); + &adc ("esi",0); + &mov (&DWP(28,"edi"),"edx"); + + # if a+b >= modulus, subtract modulus. + # + # But since comparison implies subtraction, we subtract modulus + # to see if it borrows, and then subtract it for real if + # subtraction didn't borrow. + + &mov ("eax",&DWP(0,"edi")); + &mov ("ebx",&DWP(4,"edi")); + &mov ("ecx",&DWP(8,"edi")); + &sub ("eax",-1); + &mov ("edx",&DWP(12,"edi")); + &sbb ("ebx",-1); + &mov ("eax",&DWP(16,"edi")); + &sbb ("ecx",-1); + &mov ("ebx",&DWP(20,"edi")); + &sbb ("edx",0); + &mov ("ecx",&DWP(24,"edi")); + &sbb ("eax",0); + &mov ("edx",&DWP(28,"edi")); + &sbb ("ebx",0); + &sbb ("ecx",1); + &sbb ("edx",-1); + &sbb ("esi",0); + + # Note that because mod has special form, i.e. consists of + # 0xffffffff, 1 and 0s, we can conditionally synthesize it by + # by using borrow. + + ¬ ("esi"); + &mov ("eax",&DWP(0,"edi")); + &mov ("ebp","esi"); + &mov ("ebx",&DWP(4,"edi")); + &shr ("ebp",31); + &mov ("ecx",&DWP(8,"edi")); + &sub ("eax","esi"); + &mov ("edx",&DWP(12,"edi")); + &sbb ("ebx","esi"); + &mov (&DWP(0,"edi"),"eax"); + &sbb ("ecx","esi"); + &mov (&DWP(4,"edi"),"ebx"); + &sbb ("edx",0); + &mov (&DWP(8,"edi"),"ecx"); + &mov (&DWP(12,"edi"),"edx"); + + &mov ("eax",&DWP(16,"edi")); + &mov ("ebx",&DWP(20,"edi")); + &mov ("ecx",&DWP(24,"edi")); + &sbb ("eax",0); + &mov ("edx",&DWP(28,"edi")); + &sbb ("ebx",0); + &mov (&DWP(16,"edi"),"eax"); + &sbb ("ecx","ebp"); + &mov (&DWP(20,"edi"),"ebx"); + &sbb ("edx","esi"); + &mov (&DWP(24,"edi"),"ecx"); + &mov (&DWP(28,"edi"),"edx"); + + &ret (); +&function_end_B("_ecp_nistz256_add"); + +######################################################################## +# void ecp_nistz256_sub(BN_ULONG edi[8],const BN_ULONG esi[8], +# const BN_ULONG ebp[8]); +&function_begin("ecp_nistz256_sub"); + &mov ("esi",&wparam(1)); + &mov ("ebp",&wparam(2)); + &mov ("edi",&wparam(0)); + &call ("_ecp_nistz256_sub"); +&function_end("ecp_nistz256_sub"); + +&function_begin_B("_ecp_nistz256_sub"); + &mov ("eax",&DWP(0,"esi")); + &mov ("ebx",&DWP(4,"esi")); + &mov ("ecx",&DWP(8,"esi")); + &sub ("eax",&DWP(0,"ebp")); + &mov ("edx",&DWP(12,"esi")); + &sbb ("ebx",&DWP(4,"ebp")); + &mov (&DWP(0,"edi"),"eax"); + &sbb ("ecx",&DWP(8,"ebp")); + &mov (&DWP(4,"edi"),"ebx"); + &sbb ("edx",&DWP(12,"ebp")); + &mov (&DWP(8,"edi"),"ecx"); + &mov (&DWP(12,"edi"),"edx"); + + &mov ("eax",&DWP(16,"esi")); + &mov ("ebx",&DWP(20,"esi")); + &mov ("ecx",&DWP(24,"esi")); + &sbb ("eax",&DWP(16,"ebp")); + &mov ("edx",&DWP(28,"esi")); + &sbb ("ebx",&DWP(20,"ebp")); + &sbb ("ecx",&DWP(24,"ebp")); + &mov (&DWP(16,"edi"),"eax"); + &sbb ("edx",&DWP(28,"ebp")); + &mov (&DWP(20,"edi"),"ebx"); + &sbb ("esi","esi"); # broadcast borrow bit + &mov (&DWP(24,"edi"),"ecx"); + &mov (&DWP(28,"edi"),"edx"); + + # if a-b borrows, add modulus. + # + # Note that because mod has special form, i.e. consists of + # 0xffffffff, 1 and 0s, we can conditionally synthesize it by + # assigning borrow bit to one register, %ebp, and its negative + # to another, %esi. But we started by calculating %esi... + + &mov ("eax",&DWP(0,"edi")); + &mov ("ebp","esi"); + &mov ("ebx",&DWP(4,"edi")); + &shr ("ebp",31); + &mov ("ecx",&DWP(8,"edi")); + &add ("eax","esi"); + &mov ("edx",&DWP(12,"edi")); + &adc ("ebx","esi"); + &mov (&DWP(0,"edi"),"eax"); + &adc ("ecx","esi"); + &mov (&DWP(4,"edi"),"ebx"); + &adc ("edx",0); + &mov (&DWP(8,"edi"),"ecx"); + &mov (&DWP(12,"edi"),"edx"); + + &mov ("eax",&DWP(16,"edi")); + &mov ("ebx",&DWP(20,"edi")); + &mov ("ecx",&DWP(24,"edi")); + &adc ("eax",0); + &mov ("edx",&DWP(28,"edi")); + &adc ("ebx",0); + &mov (&DWP(16,"edi"),"eax"); + &adc ("ecx","ebp"); + &mov (&DWP(20,"edi"),"ebx"); + &adc ("edx","esi"); + &mov (&DWP(24,"edi"),"ecx"); + &mov (&DWP(28,"edi"),"edx"); + + &ret (); +&function_end_B("_ecp_nistz256_sub"); + +######################################################################## +# void ecp_nistz256_neg(BN_ULONG edi[8],const BN_ULONG esi[8]); +&function_begin("ecp_nistz256_neg"); + &mov ("ebp",&wparam(1)); + &mov ("edi",&wparam(0)); + + &xor ("eax","eax"); + &stack_push(8); + &mov (&DWP(0,"esp"),"eax"); + &mov ("esi","esp"); + &mov (&DWP(4,"esp"),"eax"); + &mov (&DWP(8,"esp"),"eax"); + &mov (&DWP(12,"esp"),"eax"); + &mov (&DWP(16,"esp"),"eax"); + &mov (&DWP(20,"esp"),"eax"); + &mov (&DWP(24,"esp"),"eax"); + &mov (&DWP(28,"esp"),"eax"); + + &call ("_ecp_nistz256_sub"); + + &stack_pop(8); +&function_end("ecp_nistz256_neg"); + +&function_begin_B("_picup_eax"); + &mov ("eax",&DWP(0,"esp")); + &ret (); +&function_end_B("_picup_eax"); + +######################################################################## +# void ecp_nistz256_to_mont(BN_ULONG edi[8],const BN_ULONG esi[8]); +&function_begin("ecp_nistz256_to_mont"); + &mov ("esi",&wparam(1)); + &call ("_picup_eax"); + &set_label("pic"); + &lea ("ebp",&DWP(&label("RR")."-".&label("pic"),"eax")); + if ($sse2) { + &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); + &mov ("eax",&DWP(0,"eax")); } + &mov ("edi",&wparam(0)); + &call ("_ecp_nistz256_mul_mont"); +&function_end("ecp_nistz256_to_mont"); + +######################################################################## +# void ecp_nistz256_from_mont(BN_ULONG edi[8],const BN_ULONG esi[8]); +&function_begin("ecp_nistz256_from_mont"); + &mov ("esi",&wparam(1)); + &call ("_picup_eax"); + &set_label("pic"); + &lea ("ebp",&DWP(&label("ONE")."-".&label("pic"),"eax")); + if ($sse2) { + &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); + &mov ("eax",&DWP(0,"eax")); } + &mov ("edi",&wparam(0)); + &call ("_ecp_nistz256_mul_mont"); +&function_end("ecp_nistz256_from_mont"); + +######################################################################## +# void ecp_nistz256_mul_mont(BN_ULONG edi[8],const BN_ULONG esi[8], +# const BN_ULONG ebp[8]); +&function_begin("ecp_nistz256_mul_mont"); + &mov ("esi",&wparam(1)); + &mov ("ebp",&wparam(2)); + if ($sse2) { + &call ("_picup_eax"); + &set_label("pic"); + &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); + &mov ("eax",&DWP(0,"eax")); } + &mov ("edi",&wparam(0)); + &call ("_ecp_nistz256_mul_mont"); +&function_end("ecp_nistz256_mul_mont"); + +######################################################################## +# void ecp_nistz256_sqr_mont(BN_ULONG edi[8],const BN_ULONG esi[8]); +&function_begin("ecp_nistz256_sqr_mont"); + &mov ("esi",&wparam(1)); + if ($sse2) { + &call ("_picup_eax"); + &set_label("pic"); + &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); + &mov ("eax",&DWP(0,"eax")); } + &mov ("edi",&wparam(0)); + &mov ("ebp","esi"); + &call ("_ecp_nistz256_mul_mont"); +&function_end("ecp_nistz256_sqr_mont"); + +&function_begin_B("_ecp_nistz256_mul_mont"); + if ($sse2) { + &and ("eax",1<<24|1<<26); + &cmp ("eax",1<<24|1<<26); # see if XMM+SSE2 is on + &jne (&label("mul_mont_ialu")); + + ######################################## + # SSE2 code path featuring 32x16-bit + # multiplications is ~2x faster than + # IALU counterpart (except on Atom)... + ######################################## + # stack layout: + # +------------------------------------+< %esp + # | 7 16-byte temporary XMM words, | + # | "sliding" toward lower address | + # . . + # +------------------------------------+ + # | unused XMM word | + # +------------------------------------+< +128,%ebx + # | 8 16-byte XMM words holding copies | + # | of a[i]<<64|a[i] | + # . . + # . . + # +------------------------------------+< +256 + &mov ("edx","esp"); + &sub ("esp",0x100); + + &movd ("xmm7",&DWP(0,"ebp")); # b[0] -> 0000.00xy + &lea ("ebp",&DWP(4,"ebp")); + &pcmpeqd("xmm6","xmm6"); + &psrlq ("xmm6",48); # compose 0xffff<<64|0xffff + + &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y + &and ("esp",-64); + &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y + &lea ("ebx",&DWP(0x80,"esp")); + + &movd ("xmm0",&DWP(4*0,"esi")); # a[0] -> 0000.00xy + &pshufd ("xmm0","xmm0",0b11001100); # 0000.00xy -> 00xy.00xy + &movd ("xmm1",&DWP(4*1,"esi")); # a[1] -> ... + &movdqa (&QWP(0x00,"ebx"),"xmm0"); # offload converted a[0] + &pmuludq("xmm0","xmm7"); # a[0]*b[0] + + &movd ("xmm2",&DWP(4*2,"esi")); + &pshufd ("xmm1","xmm1",0b11001100); + &movdqa (&QWP(0x10,"ebx"),"xmm1"); + &pmuludq("xmm1","xmm7"); # a[1]*b[0] + + &movq ("xmm4","xmm0"); # clear upper 64 bits + &pslldq("xmm4",6); + &paddq ("xmm4","xmm0"); + &movdqa("xmm5","xmm4"); + &psrldq("xmm4",10); # upper 32 bits of a[0]*b[0] + &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[0] + + # Upper half of a[0]*b[i] is carried into next multiplication + # iteration, while lower one "participates" in actual reduction. + # Normally latter is done by accumulating result of multiplication + # of modulus by "magic" digit, but thanks to special form of modulus + # and "magic" digit it can be performed only with additions and + # subtractions (see note in IALU section below). Note that we are + # not bothered with carry bits, they are accumulated in "flatten" + # phase after all multiplications and reductions. + + &movd ("xmm3",&DWP(4*3,"esi")); + &pshufd ("xmm2","xmm2",0b11001100); + &movdqa (&QWP(0x20,"ebx"),"xmm2"); + &pmuludq("xmm2","xmm7"); # a[2]*b[0] + &paddq ("xmm1","xmm4"); # a[1]*b[0]+hw(a[0]*b[0]), carry + &movdqa (&QWP(0x00,"esp"),"xmm1"); # t[0] + + &movd ("xmm0",&DWP(4*4,"esi")); + &pshufd ("xmm3","xmm3",0b11001100); + &movdqa (&QWP(0x30,"ebx"),"xmm3"); + &pmuludq("xmm3","xmm7"); # a[3]*b[0] + &movdqa (&QWP(0x10,"esp"),"xmm2"); + + &movd ("xmm1",&DWP(4*5,"esi")); + &pshufd ("xmm0","xmm0",0b11001100); + &movdqa (&QWP(0x40,"ebx"),"xmm0"); + &pmuludq("xmm0","xmm7"); # a[4]*b[0] + &paddq ("xmm3","xmm5"); # a[3]*b[0]+lw(a[0]*b[0]), reduction step + &movdqa (&QWP(0x20,"esp"),"xmm3"); + + &movd ("xmm2",&DWP(4*6,"esi")); + &pshufd ("xmm1","xmm1",0b11001100); + &movdqa (&QWP(0x50,"ebx"),"xmm1"); + &pmuludq("xmm1","xmm7"); # a[5]*b[0] + &movdqa (&QWP(0x30,"esp"),"xmm0"); + &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step + + &movd ("xmm3",&DWP(4*7,"esi")); + &pshufd ("xmm2","xmm2",0b11001100); + &movdqa (&QWP(0x60,"ebx"),"xmm2"); + &pmuludq("xmm2","xmm7"); # a[6]*b[0] + &movdqa (&QWP(0x40,"esp"),"xmm1"); + &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step + + &movd ("xmm0",&DWP(0,"ebp")); # b[1] -> 0000.00xy + &pshufd ("xmm3","xmm3",0b11001100); + &movdqa (&QWP(0x70,"ebx"),"xmm3"); + &pmuludq("xmm3","xmm7"); # a[7]*b[0] + + &pshuflw("xmm7","xmm0",0b11011100); # 0000.00xy -> 0000.0x0y + &movdqa ("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] + &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y + + &mov ("ecx",6); + &lea ("ebp",&DWP(4,"ebp")); + &jmp (&label("madd_sse2")); + +&set_label("madd_sse2",16); + &paddq ("xmm2","xmm5"); # a[6]*b[i-1]+lw(a[0]*b[i-1]), reduction step [modulo-scheduled] + &paddq ("xmm3","xmm4"); # a[7]*b[i-1]+lw(a[0]*b[i-1])*0xffffffff, reduction step [modulo-scheduled] + &movdqa ("xmm1",&QWP(0x10,"ebx")); + &pmuludq("xmm0","xmm7"); # a[0]*b[i] + &movdqa(&QWP(0x50,"esp"),"xmm2"); + + &movdqa ("xmm2",&QWP(0x20,"ebx")); + &pmuludq("xmm1","xmm7"); # a[1]*b[i] + &movdqa(&QWP(0x60,"esp"),"xmm3"); + &paddq ("xmm0",&QWP(0x00,"esp")); + + &movdqa ("xmm3",&QWP(0x30,"ebx")); + &pmuludq("xmm2","xmm7"); # a[2]*b[i] + &movq ("xmm4","xmm0"); # clear upper 64 bits + &pslldq("xmm4",6); + &paddq ("xmm1",&QWP(0x10,"esp")); + &paddq ("xmm4","xmm0"); + &movdqa("xmm5","xmm4"); + &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] + + &movdqa ("xmm0",&QWP(0x40,"ebx")); + &pmuludq("xmm3","xmm7"); # a[3]*b[i] + &paddq ("xmm1","xmm4"); # a[1]*b[i]+hw(a[0]*b[i]), carry + &paddq ("xmm2",&QWP(0x20,"esp")); + &movdqa (&QWP(0x00,"esp"),"xmm1"); + + &movdqa ("xmm1",&QWP(0x50,"ebx")); + &pmuludq("xmm0","xmm7"); # a[4]*b[i] + &paddq ("xmm3",&QWP(0x30,"esp")); + &movdqa (&QWP(0x10,"esp"),"xmm2"); + &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] + + &movdqa ("xmm2",&QWP(0x60,"ebx")); + &pmuludq("xmm1","xmm7"); # a[5]*b[i] + &paddq ("xmm3","xmm5"); # a[3]*b[i]+lw(a[0]*b[i]), reduction step + &paddq ("xmm0",&QWP(0x40,"esp")); + &movdqa (&QWP(0x20,"esp"),"xmm3"); + &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step + + &movdqa ("xmm3","xmm7"); + &pmuludq("xmm2","xmm7"); # a[6]*b[i] + &movd ("xmm7",&DWP(0,"ebp")); # b[i++] -> 0000.00xy + &lea ("ebp",&DWP(4,"ebp")); + &paddq ("xmm1",&QWP(0x50,"esp")); + &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step + &movdqa (&QWP(0x30,"esp"),"xmm0"); + &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y + + &pmuludq("xmm3",&QWP(0x70,"ebx")); # a[7]*b[i] + &pshufd("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y + &movdqa("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] + &movdqa (&QWP(0x40,"esp"),"xmm1"); + &paddq ("xmm2",&QWP(0x60,"esp")); + + &dec ("ecx"); + &jnz (&label("madd_sse2")); + + &paddq ("xmm2","xmm5"); # a[6]*b[6]+lw(a[0]*b[6]), reduction step [modulo-scheduled] + &paddq ("xmm3","xmm4"); # a[7]*b[6]+lw(a[0]*b[6])*0xffffffff, reduction step [modulo-scheduled] + &movdqa ("xmm1",&QWP(0x10,"ebx")); + &pmuludq("xmm0","xmm7"); # a[0]*b[7] + &movdqa(&QWP(0x50,"esp"),"xmm2"); + + &movdqa ("xmm2",&QWP(0x20,"ebx")); + &pmuludq("xmm1","xmm7"); # a[1]*b[7] + &movdqa(&QWP(0x60,"esp"),"xmm3"); + &paddq ("xmm0",&QWP(0x00,"esp")); + + &movdqa ("xmm3",&QWP(0x30,"ebx")); + &pmuludq("xmm2","xmm7"); # a[2]*b[7] + &movq ("xmm4","xmm0"); # clear upper 64 bits + &pslldq("xmm4",6); + &paddq ("xmm1",&QWP(0x10,"esp")); + &paddq ("xmm4","xmm0"); + &movdqa("xmm5","xmm4"); + &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] + + &movdqa ("xmm0",&QWP(0x40,"ebx")); + &pmuludq("xmm3","xmm7"); # a[3]*b[7] + &paddq ("xmm1","xmm4"); # a[1]*b[7]+hw(a[0]*b[7]), carry + &paddq ("xmm2",&QWP(0x20,"esp")); + &movdqa (&QWP(0x00,"esp"),"xmm1"); + + &movdqa ("xmm1",&QWP(0x50,"ebx")); + &pmuludq("xmm0","xmm7"); # a[4]*b[7] + &paddq ("xmm3",&QWP(0x30,"esp")); + &movdqa (&QWP(0x10,"esp"),"xmm2"); + &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] + + &movdqa ("xmm2",&QWP(0x60,"ebx")); + &pmuludq("xmm1","xmm7"); # a[5]*b[7] + &paddq ("xmm3","xmm5"); # reduction step + &paddq ("xmm0",&QWP(0x40,"esp")); + &movdqa (&QWP(0x20,"esp"),"xmm3"); + &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step + + &movdqa ("xmm3",&QWP(0x70,"ebx")); + &pmuludq("xmm2","xmm7"); # a[6]*b[7] + &paddq ("xmm1",&QWP(0x50,"esp")); + &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step + &movdqa (&QWP(0x30,"esp"),"xmm0"); + + &pmuludq("xmm3","xmm7"); # a[7]*b[7] + &pcmpeqd("xmm7","xmm7"); + &movdqa ("xmm0",&QWP(0x00,"esp")); + &pslldq ("xmm7",8); + &movdqa (&QWP(0x40,"esp"),"xmm1"); + &paddq ("xmm2",&QWP(0x60,"esp")); + + &paddq ("xmm2","xmm5"); # a[6]*b[7]+lw(a[0]*b[7]), reduction step + &paddq ("xmm3","xmm4"); # a[6]*b[7]+lw(a[0]*b[7])*0xffffffff, reduction step + &movdqa(&QWP(0x50,"esp"),"xmm2"); + &movdqa(&QWP(0x60,"esp"),"xmm3"); + + &movdqa ("xmm1",&QWP(0x10,"esp")); + &movdqa ("xmm2",&QWP(0x20,"esp")); + &movdqa ("xmm3",&QWP(0x30,"esp")); + + &movq ("xmm4","xmm0"); # "flatten" + &pand ("xmm0","xmm7"); + &xor ("ebp","ebp"); + &pslldq ("xmm4",6); + &movq ("xmm5","xmm1"); + &paddq ("xmm0","xmm4"); + &pand ("xmm1","xmm7"); + &psrldq ("xmm0",6); + &movd ("eax","xmm0"); + &psrldq ("xmm0",4); + + &paddq ("xmm5","xmm0"); + &movdqa ("xmm0",&QWP(0x40,"esp")); + &sub ("eax",-1); # start subtracting modulus, + # this is used to determine + # if result is larger/smaller + # than modulus (see below) + &pslldq ("xmm5",6); + &movq ("xmm4","xmm2"); + &paddq ("xmm1","xmm5"); + &pand ("xmm2","xmm7"); + &psrldq ("xmm1",6); + &mov (&DWP(4*0,"edi"),"eax"); + &movd ("eax","xmm1"); + &psrldq ("xmm1",4); + + &paddq ("xmm4","xmm1"); + &movdqa ("xmm1",&QWP(0x50,"esp")); + &sbb ("eax",-1); + &pslldq ("xmm4",6); + &movq ("xmm5","xmm3"); + &paddq ("xmm2","xmm4"); + &pand ("xmm3","xmm7"); + &psrldq ("xmm2",6); + &mov (&DWP(4*1,"edi"),"eax"); + &movd ("eax","xmm2"); + &psrldq ("xmm2",4); + + &paddq ("xmm5","xmm2"); + &movdqa ("xmm2",&QWP(0x60,"esp")); + &sbb ("eax",-1); + &pslldq ("xmm5",6); + &movq ("xmm4","xmm0"); + &paddq ("xmm3","xmm5"); + &pand ("xmm0","xmm7"); + &psrldq ("xmm3",6); + &mov (&DWP(4*2,"edi"),"eax"); + &movd ("eax","xmm3"); + &psrldq ("xmm3",4); + + &paddq ("xmm4","xmm3"); + &sbb ("eax",0); + &pslldq ("xmm4",6); + &movq ("xmm5","xmm1"); + &paddq ("xmm0","xmm4"); + &pand ("xmm1","xmm7"); + &psrldq ("xmm0",6); + &mov (&DWP(4*3,"edi"),"eax"); + &movd ("eax","xmm0"); + &psrldq ("xmm0",4); + + &paddq ("xmm5","xmm0"); + &sbb ("eax",0); + &pslldq ("xmm5",6); + &movq ("xmm4","xmm2"); + &paddq ("xmm1","xmm5"); + &pand ("xmm2","xmm7"); + &psrldq ("xmm1",6); + &movd ("ebx","xmm1"); + &psrldq ("xmm1",4); + &mov ("esp","edx"); + + &paddq ("xmm4","xmm1"); + &pslldq ("xmm4",6); + &paddq ("xmm2","xmm4"); + &psrldq ("xmm2",6); + &movd ("ecx","xmm2"); + &psrldq ("xmm2",4); + &sbb ("ebx",0); + &movd ("edx","xmm2"); + &pextrw ("esi","xmm2",2); # top-most overflow bit + &sbb ("ecx",1); + &sbb ("edx",-1); + &sbb ("esi",0); # borrow from subtraction + + # Final step is "if result > mod, subtract mod", and at this point + # we have result - mod written to output buffer, as well as borrow + # bit from this subtraction, and if borrow bit is set, we add + # modulus back. + # + # Note that because mod has special form, i.e. consists of + # 0xffffffff, 1 and 0s, we can conditionally synthesize it by + # assigning borrow bit to one register, %ebp, and its negative + # to another, %esi. But we started by calculating %esi... + + &sub ("ebp","esi"); + &add (&DWP(4*0,"edi"),"esi"); # add modulus or zero + &adc (&DWP(4*1,"edi"),"esi"); + &adc (&DWP(4*2,"edi"),"esi"); + &adc (&DWP(4*3,"edi"),0); + &adc ("eax",0); + &adc ("ebx",0); + &mov (&DWP(4*4,"edi"),"eax"); + &adc ("ecx","ebp"); + &mov (&DWP(4*5,"edi"),"ebx"); + &adc ("edx","esi"); + &mov (&DWP(4*6,"edi"),"ecx"); + &mov (&DWP(4*7,"edi"),"edx"); + + &ret (); + +&set_label("mul_mont_ialu",16); } + + ######################################## + # IALU code path suitable for all CPUs. + ######################################## + # stack layout: + # +------------------------------------+< %esp + # | 8 32-bit temporary words, accessed | + # | as circular buffer | + # . . + # . . + # +------------------------------------+< +32 + # | offloaded destination pointer | + # +------------------------------------+ + # | unused | + # +------------------------------------+< +40 + &sub ("esp",10*4); + + &mov ("eax",&DWP(0*4,"esi")); # a[0] + &mov ("ebx",&DWP(0*4,"ebp")); # b[0] + &mov (&DWP(8*4,"esp"),"edi"); # off-load dst ptr + + &mul ("ebx"); # a[0]*b[0] + &mov (&DWP(0*4,"esp"),"eax"); # t[0] + &mov ("eax",&DWP(1*4,"esi")); + &mov ("ecx","edx") + + &mul ("ebx"); # a[1]*b[0] + &add ("ecx","eax"); + &mov ("eax",&DWP(2*4,"esi")); + &adc ("edx",0); + &mov (&DWP(1*4,"esp"),"ecx"); # t[1] + &mov ("ecx","edx"); + + &mul ("ebx"); # a[2]*b[0] + &add ("ecx","eax"); + &mov ("eax",&DWP(3*4,"esi")); + &adc ("edx",0); + &mov (&DWP(2*4,"esp"),"ecx"); # t[2] + &mov ("ecx","edx"); + + &mul ("ebx"); # a[3]*b[0] + &add ("ecx","eax"); + &mov ("eax",&DWP(4*4,"esi")); + &adc ("edx",0); + &mov (&DWP(3*4,"esp"),"ecx"); # t[3] + &mov ("ecx","edx"); + + &mul ("ebx"); # a[4]*b[0] + &add ("ecx","eax"); + &mov ("eax",&DWP(5*4,"esi")); + &adc ("edx",0); + &mov (&DWP(4*4,"esp"),"ecx"); # t[4] + &mov ("ecx","edx"); + + &mul ("ebx"); # a[5]*b[0] + &add ("ecx","eax"); + &mov ("eax",&DWP(6*4,"esi")); + &adc ("edx",0); + &mov (&DWP(5*4,"esp"),"ecx"); # t[5] + &mov ("ecx","edx"); + + &mul ("ebx"); # a[6]*b[0] + &add ("ecx","eax"); + &mov ("eax",&DWP(7*4,"esi")); + &adc ("edx",0); + &mov (&DWP(6*4,"esp"),"ecx"); # t[6] + &mov ("ecx","edx"); + + &xor ("edi","edi"); # initial top-most carry + &mul ("ebx"); # a[7]*b[0] + &add ("ecx","eax"); # t[7] + &mov ("eax",&DWP(0*4,"esp")); # t[0] + &adc ("edx",0); # t[8] + +for ($i=0;$i<7;$i++) { + my $j=$i+1; + + # Reduction iteration is normally performed by accumulating + # result of multiplication of modulus by "magic" digit [and + # omitting least significant word, which is guaranteed to + # be 0], but thanks to special form of modulus and "magic" + # digit being equal to least significant word, it can be + # performed with additions and subtractions alone. Indeed: + # + # ffff.0001.0000.0000.0000.ffff.ffff.ffff + # * abcd + # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd + # + # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we + # rewrite above as: + # + # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd + # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 + # - abcd.0000.0000.0000.0000.0000.0000.abcd + # + # or marking redundant operations: + # + # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- + # + abcd.0000.abcd.0000.0000.abcd.----.----.---- + # - abcd.----.----.----.----.----.----.---- + + &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] + &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 + &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 + &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] + &adc ("ecx",0); # t[7]+=0 + &adc ("edx","eax"); # t[8]+=t[0] + &adc ("edi",0); # top-most carry + &mov ("ebx",&DWP($j*4,"ebp")); # b[i] + &sub ("ecx","eax"); # t[7]-=t[0] + &mov ("eax",&DWP(0*4,"esi")); # a[0] + &sbb ("edx",0); # t[8]-=0 + &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); + &sbb ("edi",0); # top-most carry, + # keep in mind that + # netto result is + # *addition* of value + # with (abcd<<32)-abcd + # on top, so that + # underflow is + # impossible, because + # (abcd<<32)-abcd + # doesn't underflow + &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); + + &mul ("ebx"); # a[0]*b[i] + &add ("eax",&DWP((($j+0)%8)*4,"esp")); + &adc ("edx",0); + &mov (&DWP((($j+0)%8)*4,"esp"),"eax"); + &mov ("eax",&DWP(1*4,"esi")); + &mov ("ecx","edx") + + &mul ("ebx"); # a[1]*b[i] + &add ("ecx",&DWP((($j+1)%8)*4,"esp")); + &adc ("edx",0); + &add ("ecx","eax"); + &adc ("edx",0); + &mov ("eax",&DWP(2*4,"esi")); + &mov (&DWP((($j+1)%8)*4,"esp"),"ecx"); + &mov ("ecx","edx"); + + &mul ("ebx"); # a[2]*b[i] + &add ("ecx",&DWP((($j+2)%8)*4,"esp")); + &adc ("edx",0); + &add ("ecx","eax"); + &adc ("edx",0); + &mov ("eax",&DWP(3*4,"esi")); + &mov (&DWP((($j+2)%8)*4,"esp"),"ecx"); + &mov ("ecx","edx"); + + &mul ("ebx"); # a[3]*b[i] + &add ("ecx",&DWP((($j+3)%8)*4,"esp")); + &adc ("edx",0); + &add ("ecx","eax"); + &adc ("edx",0); + &mov ("eax",&DWP(4*4,"esi")); + &mov (&DWP((($j+3)%8)*4,"esp"),"ecx"); + &mov ("ecx","edx"); + + &mul ("ebx"); # a[4]*b[i] + &add ("ecx",&DWP((($j+4)%8)*4,"esp")); + &adc ("edx",0); + &add ("ecx","eax"); + &adc ("edx",0); + &mov ("eax",&DWP(5*4,"esi")); + &mov (&DWP((($j+4)%8)*4,"esp"),"ecx"); + &mov ("ecx","edx"); + + &mul ("ebx"); # a[5]*b[i] + &add ("ecx",&DWP((($j+5)%8)*4,"esp")); + &adc ("edx",0); + &add ("ecx","eax"); + &adc ("edx",0); + &mov ("eax",&DWP(6*4,"esi")); + &mov (&DWP((($j+5)%8)*4,"esp"),"ecx"); + &mov ("ecx","edx"); + + &mul ("ebx"); # a[6]*b[i] + &add ("ecx",&DWP((($j+6)%8)*4,"esp")); + &adc ("edx",0); + &add ("ecx","eax"); + &adc ("edx",0); + &mov ("eax",&DWP(7*4,"esi")); + &mov (&DWP((($j+6)%8)*4,"esp"),"ecx"); + &mov ("ecx","edx"); + + &mul ("ebx"); # a[7]*b[i] + &add ("ecx",&DWP((($j+7)%8)*4,"esp")); + &adc ("edx",0); + &add ("ecx","eax"); # t[7] + &mov ("eax",&DWP((($j+0)%8)*4,"esp")); # t[0] + &adc ("edx","edi"); # t[8] + &mov ("edi",0); + &adc ("edi",0); # top-most carry +} + &mov ("ebp",&DWP(8*4,"esp")); # restore dst ptr + &xor ("esi","esi"); + my $j=$i+1; + + # last multiplication-less reduction + &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] + &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 + &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 + &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] + &adc ("ecx",0); # t[7]+=0 + &adc ("edx","eax"); # t[8]+=t[0] + &adc ("edi",0); # top-most carry + &mov ("ebx",&DWP((($j+1)%8)*4,"esp")); + &sub ("ecx","eax"); # t[7]-=t[0] + &mov ("eax",&DWP((($j+0)%8)*4,"esp")); + &sbb ("edx",0); # t[8]-=0 + &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); + &sbb ("edi",0); # top-most carry + &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); + + # Final step is "if result > mod, subtract mod", but we do it + # "other way around", namely write result - mod to output buffer + # and if subtraction borrowed, add modulus back. + + &mov ("ecx",&DWP((($j+2)%8)*4,"esp")); + &sub ("eax",-1); + &mov ("edx",&DWP((($j+3)%8)*4,"esp")); + &sbb ("ebx",-1); + &mov (&DWP(0*4,"ebp"),"eax"); + &sbb ("ecx",-1); + &mov (&DWP(1*4,"ebp"),"ebx"); + &sbb ("edx",0); + &mov (&DWP(2*4,"ebp"),"ecx"); + &mov (&DWP(3*4,"ebp"),"edx"); + + &mov ("eax",&DWP((($j+4)%8)*4,"esp")); + &mov ("ebx",&DWP((($j+5)%8)*4,"esp")); + &mov ("ecx",&DWP((($j+6)%8)*4,"esp")); + &sbb ("eax",0); + &mov ("edx",&DWP((($j+7)%8)*4,"esp")); + &sbb ("ebx",0); + &sbb ("ecx",1); + &sbb ("edx",-1); + &sbb ("edi",0); + + # Note that because mod has special form, i.e. consists of + # 0xffffffff, 1 and 0s, we can conditionally synthesize it by + # assigning borrow bit to one register, %ebp, and its negative + # to another, %esi. But we started by calculating %esi... + + &sub ("esi","edi"); + &add (&DWP(0*4,"ebp"),"edi"); # add modulus or zero + &adc (&DWP(1*4,"ebp"),"edi"); + &adc (&DWP(2*4,"ebp"),"edi"); + &adc (&DWP(3*4,"ebp"),0); + &adc ("eax",0); + &adc ("ebx",0); + &mov (&DWP(4*4,"ebp"),"eax"); + &adc ("ecx","esi"); + &mov (&DWP(5*4,"ebp"),"ebx"); + &adc ("edx","edi"); + &mov (&DWP(6*4,"ebp"),"ecx"); + &mov ("edi","ebp"); # fulfill contract + &mov (&DWP(7*4,"ebp"),"edx"); + + &add ("esp",10*4); + &ret (); +&function_end_B("_ecp_nistz256_mul_mont"); + +######################################################################## +# void ecp_nistz256_scatter_w5(void *edi,const P256_POINT *esi, +# int ebp); +&function_begin("ecp_nistz256_scatter_w5"); + &mov ("edi",&wparam(0)); + &mov ("esi",&wparam(1)); + &mov ("ebp",&wparam(2)); + + &lea ("edi",&DWP(128-4,"edi","ebp",4)); + &mov ("ebp",96/16); +&set_label("scatter_w5_loop"); + &mov ("eax",&DWP(0,"esi")); + &mov ("ebx",&DWP(4,"esi")); + &mov ("ecx",&DWP(8,"esi")); + &mov ("edx",&DWP(12,"esi")); + &lea ("esi",&DWP(16,"esi")); + &mov (&DWP(64*0-128,"edi"),"eax"); + &mov (&DWP(64*1-128,"edi"),"ebx"); + &mov (&DWP(64*2-128,"edi"),"ecx"); + &mov (&DWP(64*3-128,"edi"),"edx"); + &lea ("edi",&DWP(64*4,"edi")); + &dec ("ebp"); + &jnz (&label("scatter_w5_loop")); +&function_end("ecp_nistz256_scatter_w5"); + +######################################################################## +# void ecp_nistz256_gather_w5(P256_POINT *edi,const void *esi, +# int ebp); +&function_begin("ecp_nistz256_gather_w5"); + &mov ("esi",&wparam(1)); + &mov ("ebp",&wparam(2)); + + &lea ("esi",&DWP(0,"esi","ebp",4)); + &neg ("ebp"); + &sar ("ebp",31); + &mov ("edi",&wparam(0)); + &lea ("esi",&DWP(0,"esi","ebp",4)); + + for($i=0;$i<24;$i+=4) { + &mov ("eax",&DWP(64*($i+0),"esi")); + &mov ("ebx",&DWP(64*($i+1),"esi")); + &mov ("ecx",&DWP(64*($i+2),"esi")); + &mov ("edx",&DWP(64*($i+3),"esi")); + &and ("eax","ebp"); + &and ("ebx","ebp"); + &and ("ecx","ebp"); + &and ("edx","ebp"); + &mov (&DWP(4*($i+0),"edi"),"eax"); + &mov (&DWP(4*($i+1),"edi"),"ebx"); + &mov (&DWP(4*($i+2),"edi"),"ecx"); + &mov (&DWP(4*($i+3),"edi"),"edx"); + } +&function_end("ecp_nistz256_gather_w5"); + +######################################################################## +# void ecp_nistz256_scatter_w7(void *edi,const P256_POINT_AFFINE *esi, +# int ebp); +&function_begin("ecp_nistz256_scatter_w7"); + &mov ("edi",&wparam(0)); + &mov ("esi",&wparam(1)); + &mov ("ebp",&wparam(2)); + + &lea ("edi",&DWP(0,"edi","ebp")); + &mov ("ebp",64/4); +&set_label("scatter_w7_loop"); + &mov ("eax",&DWP(0,"esi")); + &lea ("esi",&DWP(4,"esi")); + &mov (&BP(64*0,"edi"),"al"); + &mov (&BP(64*1,"edi"),"ah"); + &shr ("eax",16); + &mov (&BP(64*2,"edi"),"al"); + &mov (&BP(64*3,"edi"),"ah"); + &lea ("edi",&DWP(64*4,"edi")); + &dec ("ebp"); + &jnz (&label("scatter_w7_loop")); +&function_end("ecp_nistz256_scatter_w7"); + +######################################################################## +# void ecp_nistz256_gather_w7(P256_POINT_AFFINE *edi,const void *esi, +# int ebp); +&function_begin("ecp_nistz256_gather_w7"); + &mov ("esi",&wparam(1)); + &mov ("ebp",&wparam(2)); + + &add ("esi","ebp"); + &neg ("ebp"), + &sar ("ebp",31); + &mov ("edi",&wparam(0)); + &lea ("esi",&DWP(0,"esi","ebp")); + + for($i=0;$i<64;$i+=4) { + &movz ("eax",&BP(64*($i+0),"esi")); + &movz ("ebx",&BP(64*($i+1),"esi")); + &movz ("ecx",&BP(64*($i+2),"esi")); + &and ("eax","ebp"); + &movz ("edx",&BP(64*($i+3),"esi")); + &and ("ebx","ebp"); + &mov (&BP($i+0,"edi"),"al"); + &and ("ecx","ebp"); + &mov (&BP($i+1,"edi"),"bl"); + &and ("edx","ebp"); + &mov (&BP($i+2,"edi"),"cl"); + &mov (&BP($i+3,"edi"),"dl"); + } +&function_end("ecp_nistz256_gather_w7"); + +######################################################################## +# following subroutines are "literal" implementation of those found in +# ecp_nistz256.c +# +######################################################################## +# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); +# +&static_label("point_double_shortcut"); +&function_begin("ecp_nistz256_point_double"); +{ my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); + + &mov ("esi",&wparam(1)); + + # above map() describes stack layout with 5 temporary + # 256-bit vectors on top, then we take extra word for + # OPENSSL_ia32cap_P copy. + &stack_push(8*5+1); + if ($sse2) { + &call ("_picup_eax"); + &set_label("pic"); + &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); + &mov ("ebp",&DWP(0,"edx")); } + +&set_label("point_double_shortcut"); + &mov ("eax",&DWP(0,"esi")); # copy in_x + &mov ("ebx",&DWP(4,"esi")); + &mov ("ecx",&DWP(8,"esi")); + &mov ("edx",&DWP(12,"esi")); + &mov (&DWP($in_x+0,"esp"),"eax"); + &mov (&DWP($in_x+4,"esp"),"ebx"); + &mov (&DWP($in_x+8,"esp"),"ecx"); + &mov (&DWP($in_x+12,"esp"),"edx"); + &mov ("eax",&DWP(16,"esi")); + &mov ("ebx",&DWP(20,"esi")); + &mov ("ecx",&DWP(24,"esi")); + &mov ("edx",&DWP(28,"esi")); + &mov (&DWP($in_x+16,"esp"),"eax"); + &mov (&DWP($in_x+20,"esp"),"ebx"); + &mov (&DWP($in_x+24,"esp"),"ecx"); + &mov (&DWP($in_x+28,"esp"),"edx"); + &mov (&DWP(32*5,"esp"),"ebp"); # OPENSSL_ia32cap_P copy + + &lea ("ebp",&DWP(32,"esi")); + &lea ("esi",&DWP(32,"esi")); + &lea ("edi",&DWP($S,"esp")); + &call ("_ecp_nistz256_add"); # p256_mul_by_2(S, in_y); + + &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy + &mov ("esi",64); + &add ("esi",&wparam(1)); + &lea ("edi",&DWP($Zsqr,"esp")); + &mov ("ebp","esi"); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Zsqr, in_z); + + &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($S,"esp")); + &lea ("ebp",&DWP($S,"esp")); + &lea ("edi",&DWP($S,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(S, S); + + &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy + &mov ("ebp",&wparam(1)); + &lea ("esi",&DWP(32,"ebp")); + &lea ("ebp",&DWP(64,"ebp")); + &lea ("edi",&DWP($tmp0,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(tmp0, in_z, in_y); + + &lea ("esi",&DWP($in_x,"esp")); + &lea ("ebp",&DWP($Zsqr,"esp")); + &lea ("edi",&DWP($M,"esp")); + &call ("_ecp_nistz256_add"); # p256_add(M, in_x, Zsqr); + + &mov ("edi",64); + &lea ("esi",&DWP($tmp0,"esp")); + &lea ("ebp",&DWP($tmp0,"esp")); + &add ("edi",&wparam(0)); + &call ("_ecp_nistz256_add"); # p256_mul_by_2(res_z, tmp0); + + &lea ("esi",&DWP($in_x,"esp")); + &lea ("ebp",&DWP($Zsqr,"esp")); + &lea ("edi",&DWP($Zsqr,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(Zsqr, in_x, Zsqr); + + &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($S,"esp")); + &lea ("ebp",&DWP($S,"esp")); + &lea ("edi",&DWP($tmp0,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(tmp0, S); + + &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($M,"esp")); + &lea ("ebp",&DWP($Zsqr,"esp")); + &lea ("edi",&DWP($M,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(M, M, Zsqr); + + &mov ("edi",32); + &lea ("esi",&DWP($tmp0,"esp")); + &add ("edi",&wparam(0)); + &call ("_ecp_nistz256_div_by_2"); # p256_div_by_2(res_y, tmp0); + + &lea ("esi",&DWP($M,"esp")); + &lea ("ebp",&DWP($M,"esp")); + &lea ("edi",&DWP($tmp0,"esp")); + &call ("_ecp_nistz256_add"); # 1/2 p256_mul_by_3(M, M); + + &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in_x,"esp")); + &lea ("ebp",&DWP($S,"esp")); + &lea ("edi",&DWP($S,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S, S, in_x); + + &lea ("esi",&DWP($tmp0,"esp")); + &lea ("ebp",&DWP($M,"esp")); + &lea ("edi",&DWP($M,"esp")); + &call ("_ecp_nistz256_add"); # 2/2 p256_mul_by_3(M, M); + + &lea ("esi",&DWP($S,"esp")); + &lea ("ebp",&DWP($S,"esp")); + &lea ("edi",&DWP($tmp0,"esp")); + &call ("_ecp_nistz256_add"); # p256_mul_by_2(tmp0, S); + + &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($M,"esp")); + &lea ("ebp",&DWP($M,"esp")); + &mov ("edi",&wparam(0)); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(res_x, M); + + &mov ("esi","edi"); # %edi is still res_x here + &lea ("ebp",&DWP($tmp0,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, tmp0); + + &lea ("esi",&DWP($S,"esp")); + &mov ("ebp","edi"); # %edi is still res_x + &lea ("edi",&DWP($S,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(S, S, res_x); + + &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy + &mov ("esi","edi"); # %edi is still &S + &lea ("ebp",&DWP($M,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S, S, M); + + &mov ("ebp",32); + &lea ("esi",&DWP($S,"esp")); + &add ("ebp",&wparam(0)); + &mov ("edi","ebp"); + &call ("_ecp_nistz256_sub"); # p256_sub(res_y, S, res_y); + + &stack_pop(8*5+1); +} &function_end("ecp_nistz256_point_double"); + +######################################################################## +# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, +# const P256_POINT *in2); +&function_begin("ecp_nistz256_point_add"); +{ my ($res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y,$in2_z, + $H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); + my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); + + &mov ("esi",&wparam(2)); + + # above map() describes stack layout with 18 temporary + # 256-bit vectors on top, then we take extra words for + # ~in1infty, ~in2infty, result of check for zero and + # OPENSSL_ia32cap_P copy. [one unused word for padding] + &stack_push(8*18+5); + if ($sse2) { + &call ("_picup_eax"); + &set_label("pic"); + &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); + &mov ("ebp",&DWP(0,"edx")); } + + &lea ("edi",&DWP($in2_x,"esp")); + for($i=0;$i<96;$i+=16) { + &mov ("eax",&DWP($i+0,"esi")); # copy in2 + &mov ("ebx",&DWP($i+4,"esi")); + &mov ("ecx",&DWP($i+8,"esi")); + &mov ("edx",&DWP($i+12,"esi")); + &mov (&DWP($i+0,"edi"),"eax"); + &mov (&DWP(32*18+12,"esp"),"ebp") if ($i==0); + &mov ("ebp","eax") if ($i==64); + &or ("ebp","eax") if ($i>64); + &mov (&DWP($i+4,"edi"),"ebx"); + &or ("ebp","ebx") if ($i>=64); + &mov (&DWP($i+8,"edi"),"ecx"); + &or ("ebp","ecx") if ($i>=64); + &mov (&DWP($i+12,"edi"),"edx"); + &or ("ebp","edx") if ($i>=64); + } + &xor ("eax","eax"); + &mov ("esi",&wparam(1)); + &sub ("eax","ebp"); + &or ("ebp","eax"); + &sar ("ebp",31); + &mov (&DWP(32*18+4,"esp"),"ebp"); # ~in2infty + + &lea ("edi",&DWP($in1_x,"esp")); + for($i=0;$i<96;$i+=16) { + &mov ("eax",&DWP($i+0,"esi")); # copy in1 + &mov ("ebx",&DWP($i+4,"esi")); + &mov ("ecx",&DWP($i+8,"esi")); + &mov ("edx",&DWP($i+12,"esi")); + &mov (&DWP($i+0,"edi"),"eax"); + &mov ("ebp","eax") if ($i==64); + &or ("ebp","eax") if ($i>64); + &mov (&DWP($i+4,"edi"),"ebx"); + &or ("ebp","ebx") if ($i>=64); + &mov (&DWP($i+8,"edi"),"ecx"); + &or ("ebp","ecx") if ($i>=64); + &mov (&DWP($i+12,"edi"),"edx"); + &or ("ebp","edx") if ($i>=64); + } + &xor ("eax","eax"); + &sub ("eax","ebp"); + &or ("ebp","eax"); + &sar ("ebp",31); + &mov (&DWP(32*18+0,"esp"),"ebp"); # ~in1infty + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in2_z,"esp")); + &lea ("ebp",&DWP($in2_z,"esp")); + &lea ("edi",&DWP($Z2sqr,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z2sqr, in2_z); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in1_z,"esp")); + &lea ("ebp",&DWP($in1_z,"esp")); + &lea ("edi",&DWP($Z1sqr,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z1sqr, in1_z); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($Z2sqr,"esp")); + &lea ("ebp",&DWP($in2_z,"esp")); + &lea ("edi",&DWP($S1,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S1, Z2sqr, in2_z); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($Z1sqr,"esp")); + &lea ("ebp",&DWP($in1_z,"esp")); + &lea ("edi",&DWP($S2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Z1sqr, in1_z); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in1_y,"esp")); + &lea ("ebp",&DWP($S1,"esp")); + &lea ("edi",&DWP($S1,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S1, S1, in1_y); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in2_y,"esp")); + &lea ("ebp",&DWP($S2,"esp")); + &lea ("edi",&DWP($S2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S2, in2_y); + + &lea ("esi",&DWP($S2,"esp")); + &lea ("ebp",&DWP($S1,"esp")); + &lea ("edi",&DWP($R,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(R, S2, S1); + + &or ("ebx","eax"); # see if result is zero + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &or ("ebx","ecx"); + &or ("ebx","edx"); + &or ("ebx",&DWP(0,"edi")); + &or ("ebx",&DWP(4,"edi")); + &lea ("esi",&DWP($in1_x,"esp")); + &or ("ebx",&DWP(8,"edi")); + &lea ("ebp",&DWP($Z2sqr,"esp")); + &or ("ebx",&DWP(12,"edi")); + &lea ("edi",&DWP($U1,"esp")); + &mov (&DWP(32*18+8,"esp"),"ebx"); + + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U1, in1_x, Z2sqr); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in2_x,"esp")); + &lea ("ebp",&DWP($Z1sqr,"esp")); + &lea ("edi",&DWP($U2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, in2_x, Z1sqr); + + &lea ("esi",&DWP($U2,"esp")); + &lea ("ebp",&DWP($U1,"esp")); + &lea ("edi",&DWP($H,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(H, U2, U1); + + &or ("eax","ebx"); # see if result is zero + &or ("eax","ecx"); + &or ("eax","edx"); + &or ("eax",&DWP(0,"edi")); + &or ("eax",&DWP(4,"edi")); + &or ("eax",&DWP(8,"edi")); + &or ("eax",&DWP(12,"edi")); # ~is_equal(U1,U2) + + &mov ("ebx",&DWP(32*18+0,"esp")); # ~in1infty + ¬ ("ebx"); # -1/0 -> 0/-1 + &or ("eax","ebx"); + &mov ("ebx",&DWP(32*18+4,"esp")); # ~in2infty + ¬ ("ebx"); # -1/0 -> 0/-1 + &or ("eax","ebx"); + &or ("eax",&DWP(32*18+8,"esp")); # ~is_equal(S1,S2) + + # if (~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + &data_byte(0x3e); # predict taken + &jnz (&label("add_proceed")); + +&set_label("add_double",16); + &mov ("esi",&wparam(1)); + &mov ("ebp",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &add ("esp",4*((8*18+5)-(8*5+1))); # difference in frame sizes + &jmp (&label("point_double_shortcut")); + +&set_label("add_proceed",16); + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($R,"esp")); + &lea ("ebp",&DWP($R,"esp")); + &lea ("edi",&DWP($Rsqr,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Rsqr, R); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($H,"esp")); + &lea ("ebp",&DWP($in1_z,"esp")); + &lea ("edi",&DWP($res_z,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, H, in1_z); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($H,"esp")); + &lea ("ebp",&DWP($H,"esp")); + &lea ("edi",&DWP($Hsqr,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Hsqr, H); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in2_z,"esp")); + &lea ("ebp",&DWP($res_z,"esp")); + &lea ("edi",&DWP($res_z,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, res_z, in2_z); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($Hsqr,"esp")); + &lea ("ebp",&DWP($U1,"esp")); + &lea ("edi",&DWP($U2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, U1, Hsqr); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($H,"esp")); + &lea ("ebp",&DWP($Hsqr,"esp")); + &lea ("edi",&DWP($Hcub,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(Hcub, Hsqr, H); + + &lea ("esi",&DWP($U2,"esp")); + &lea ("ebp",&DWP($U2,"esp")); + &lea ("edi",&DWP($Hsqr,"esp")); + &call ("_ecp_nistz256_add"); # p256_mul_by_2(Hsqr, U2); + + &lea ("esi",&DWP($Rsqr,"esp")); + &lea ("ebp",&DWP($Hsqr,"esp")); + &lea ("edi",&DWP($res_x,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_x, Rsqr, Hsqr); + + &lea ("esi",&DWP($res_x,"esp")); + &lea ("ebp",&DWP($Hcub,"esp")); + &lea ("edi",&DWP($res_x,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, Hcub); + + &lea ("esi",&DWP($U2,"esp")); + &lea ("ebp",&DWP($res_x,"esp")); + &lea ("edi",&DWP($res_y,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_y, U2, res_x); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($Hcub,"esp")); + &lea ("ebp",&DWP($S1,"esp")); + &lea ("edi",&DWP($S2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S1, Hcub); + + &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($R,"esp")); + &lea ("ebp",&DWP($res_y,"esp")); + &lea ("edi",&DWP($res_y,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_y, R, res_y); + + &lea ("esi",&DWP($res_y,"esp")); + &lea ("ebp",&DWP($S2,"esp")); + &lea ("edi",&DWP($res_y,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_y, res_y, S2); + + &mov ("ebp",&DWP(32*18+0,"esp")); # ~in1infty + &mov ("esi",&DWP(32*18+4,"esp")); # ~in2infty + &mov ("edi",&wparam(0)); + &mov ("edx","ebp"); + ¬ ("ebp"); + &and ("edx","esi"); # ~in1infty & ~in2infty + &and ("ebp","esi"); # in1infty & ~in2infty + ¬ ("esi"); # in2infty + + ######################################## + # conditional moves + for($i=64;$i<96;$i+=4) { + &mov ("eax","edx"); # ~in1infty & ~in2infty + &and ("eax",&DWP($res_x+$i,"esp")); + &mov ("ebx","ebp"); # in1infty & ~in2infty + &and ("ebx",&DWP($in2_x+$i,"esp")); + &mov ("ecx","esi"); # in2infty + &and ("ecx",&DWP($in1_x+$i,"esp")); + &or ("eax","ebx"); + &or ("eax","ecx"); + &mov (&DWP($i,"edi"),"eax"); + } + for($i=0;$i<64;$i+=4) { + &mov ("eax","edx"); # ~in1infty & ~in2infty + &and ("eax",&DWP($res_x+$i,"esp")); + &mov ("ebx","ebp"); # in1infty & ~in2infty + &and ("ebx",&DWP($in2_x+$i,"esp")); + &mov ("ecx","esi"); # in2infty + &and ("ecx",&DWP($in1_x+$i,"esp")); + &or ("eax","ebx"); + &or ("eax","ecx"); + &mov (&DWP($i,"edi"),"eax"); + } + &set_label("add_done"); + &stack_pop(8*18+5); +} &function_end("ecp_nistz256_point_add"); + +######################################################################## +# void ecp_nistz256_point_add_affine(P256_POINT *out, +# const P256_POINT *in1, +# const P256_POINT_AFFINE *in2); +&function_begin("ecp_nistz256_point_add_affine"); +{ + my ($res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y, + $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); + my $Z1sqr = $S2; + my @ONE_mont=(1,0,0,-1,-1,-1,-2,0); + + &mov ("esi",&wparam(1)); + + # above map() describes stack layout with 15 temporary + # 256-bit vectors on top, then we take extra words for + # ~in1infty, ~in2infty, and OPENSSL_ia32cap_P copy. + &stack_push(8*15+3); + if ($sse2) { + &call ("_picup_eax"); + &set_label("pic"); + &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); + &mov ("ebp",&DWP(0,"edx")); } + + &lea ("edi",&DWP($in1_x,"esp")); + for($i=0;$i<96;$i+=16) { + &mov ("eax",&DWP($i+0,"esi")); # copy in1 + &mov ("ebx",&DWP($i+4,"esi")); + &mov ("ecx",&DWP($i+8,"esi")); + &mov ("edx",&DWP($i+12,"esi")); + &mov (&DWP($i+0,"edi"),"eax"); + &mov (&DWP(32*15+8,"esp"),"ebp") if ($i==0); + &mov ("ebp","eax") if ($i==64); + &or ("ebp","eax") if ($i>64); + &mov (&DWP($i+4,"edi"),"ebx"); + &or ("ebp","ebx") if ($i>=64); + &mov (&DWP($i+8,"edi"),"ecx"); + &or ("ebp","ecx") if ($i>=64); + &mov (&DWP($i+12,"edi"),"edx"); + &or ("ebp","edx") if ($i>=64); + } + &xor ("eax","eax"); + &mov ("esi",&wparam(2)); + &sub ("eax","ebp"); + &or ("ebp","eax"); + &sar ("ebp",31); + &mov (&DWP(32*15+0,"esp"),"ebp"); # ~in1infty + + &lea ("edi",&DWP($in2_x,"esp")); + for($i=0;$i<64;$i+=16) { + &mov ("eax",&DWP($i+0,"esi")); # copy in2 + &mov ("ebx",&DWP($i+4,"esi")); + &mov ("ecx",&DWP($i+8,"esi")); + &mov ("edx",&DWP($i+12,"esi")); + &mov (&DWP($i+0,"edi"),"eax"); + &mov ("ebp","eax") if ($i==0); + &or ("ebp","eax") if ($i!=0); + &mov (&DWP($i+4,"edi"),"ebx"); + &or ("ebp","ebx"); + &mov (&DWP($i+8,"edi"),"ecx"); + &or ("ebp","ecx"); + &mov (&DWP($i+12,"edi"),"edx"); + &or ("ebp","edx"); + } + &xor ("ebx","ebx"); + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &sub ("ebx","ebp"); + &lea ("esi",&DWP($in1_z,"esp")); + &or ("ebx","ebp"); + &lea ("ebp",&DWP($in1_z,"esp")); + &sar ("ebx",31); + &lea ("edi",&DWP($Z1sqr,"esp")); + &mov (&DWP(32*15+4,"esp"),"ebx"); # ~in2infty + + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z1sqr, in1_z); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in2_x,"esp")); + &mov ("ebp","edi"); # %esi is stull &Z1sqr + &lea ("edi",&DWP($U2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, Z1sqr, in2_x); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in1_z,"esp")); + &lea ("ebp",&DWP($Z1sqr,"esp")); + &lea ("edi",&DWP($S2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Z1sqr, in1_z); + + &lea ("esi",&DWP($U2,"esp")); + &lea ("ebp",&DWP($in1_x,"esp")); + &lea ("edi",&DWP($H,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(H, U2, in1_x); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in2_y,"esp")); + &lea ("ebp",&DWP($S2,"esp")); + &lea ("edi",&DWP($S2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S2, in2_y); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in1_z,"esp")); + &lea ("ebp",&DWP($H,"esp")); + &lea ("edi",&DWP($res_z,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, H, in1_z); + + &lea ("esi",&DWP($S2,"esp")); + &lea ("ebp",&DWP($in1_y,"esp")); + &lea ("edi",&DWP($R,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(R, S2, in1_y); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($H,"esp")); + &lea ("ebp",&DWP($H,"esp")); + &lea ("edi",&DWP($Hsqr,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Hsqr, H); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($R,"esp")); + &lea ("ebp",&DWP($R,"esp")); + &lea ("edi",&DWP($Rsqr,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Rsqr, R); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($in1_x,"esp")); + &lea ("ebp",&DWP($Hsqr,"esp")); + &lea ("edi",&DWP($U2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, in1_x, Hsqr); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($H,"esp")); + &lea ("ebp",&DWP($Hsqr,"esp")); + &lea ("edi",&DWP($Hcub,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(Hcub, Hsqr, H); + + &lea ("esi",&DWP($U2,"esp")); + &lea ("ebp",&DWP($U2,"esp")); + &lea ("edi",&DWP($Hsqr,"esp")); + &call ("_ecp_nistz256_add"); # p256_mul_by_2(Hsqr, U2); + + &lea ("esi",&DWP($Rsqr,"esp")); + &lea ("ebp",&DWP($Hsqr,"esp")); + &lea ("edi",&DWP($res_x,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_x, Rsqr, Hsqr); + + &lea ("esi",&DWP($res_x,"esp")); + &lea ("ebp",&DWP($Hcub,"esp")); + &lea ("edi",&DWP($res_x,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, Hcub); + + &lea ("esi",&DWP($U2,"esp")); + &lea ("ebp",&DWP($res_x,"esp")); + &lea ("edi",&DWP($res_y,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_y, U2, res_x); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($Hcub,"esp")); + &lea ("ebp",&DWP($in1_y,"esp")); + &lea ("edi",&DWP($S2,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Hcub, in1_y); + + &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy + &lea ("esi",&DWP($R,"esp")); + &lea ("ebp",&DWP($res_y,"esp")); + &lea ("edi",&DWP($res_y,"esp")); + &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_y, res_y, R); + + &lea ("esi",&DWP($res_y,"esp")); + &lea ("ebp",&DWP($S2,"esp")); + &lea ("edi",&DWP($res_y,"esp")); + &call ("_ecp_nistz256_sub"); # p256_sub(res_y, res_y, S2); + + &mov ("ebp",&DWP(32*15+0,"esp")); # ~in1infty + &mov ("esi",&DWP(32*15+4,"esp")); # ~in2infty + &mov ("edi",&wparam(0)); + &mov ("edx","ebp"); + ¬ ("ebp"); + &and ("edx","esi"); # ~in1infty & ~in2infty + &and ("ebp","esi"); # in1infty & ~in2infty + ¬ ("esi"); # in2infty + + ######################################## + # conditional moves + for($i=64;$i<96;$i+=4) { + my $one=@ONE_mont[($i-64)/4]; + + &mov ("eax","edx"); + &and ("eax",&DWP($res_x+$i,"esp")); + &mov ("ebx","ebp") if ($one && $one!=-1); + &and ("ebx",$one) if ($one && $one!=-1); + &mov ("ecx","esi"); + &and ("ecx",&DWP($in1_x+$i,"esp")); + &or ("eax",$one==-1?"ebp":"ebx") if ($one); + &or ("eax","ecx"); + &mov (&DWP($i,"edi"),"eax"); + } + for($i=0;$i<64;$i+=4) { + &mov ("eax","edx"); # ~in1infty & ~in2infty + &and ("eax",&DWP($res_x+$i,"esp")); + &mov ("ebx","ebp"); # in1infty & ~in2infty + &and ("ebx",&DWP($in2_x+$i,"esp")); + &mov ("ecx","esi"); # in2infty + &and ("ecx",&DWP($in1_x+$i,"esp")); + &or ("eax","ebx"); + &or ("eax","ecx"); + &mov (&DWP($i,"edi"),"eax"); + } + &stack_pop(8*15+3); +} &function_end("ecp_nistz256_point_add_affine"); + +&asm_finish(); + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl new file mode 100755 index 000000000..de9b19451 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl @@ -0,0 +1,4739 @@ +#! /usr/bin/env perl +# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +# Copyright (c) 2014, Intel Corporation. All Rights Reserved. +# Copyright (c) 2015 CloudFlare, Inc. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3) +# (1) Intel Corporation, Israel Development Center, Haifa, Israel +# (2) University of Haifa, Israel +# (3) CloudFlare, Inc. +# +# Reference: +# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with +# 256 Bit Primes" + +# Further optimization by <appro@openssl.org>: +# +# this/original with/without -DECP_NISTZ256_ASM(*) +# Opteron +15-49% +150-195% +# Bulldozer +18-45% +175-240% +# P4 +24-46% +100-150% +# Westmere +18-34% +87-160% +# Sandy Bridge +14-35% +120-185% +# Ivy Bridge +11-35% +125-180% +# Haswell +10-37% +160-200% +# Broadwell +24-58% +210-270% +# Atom +20-50% +180-240% +# VIA Nano +50-160% +480-480% +# +# (*) "without -DECP_NISTZ256_ASM" refers to build with +# "enable-ec_nistp_64_gcc_128"; +# +# Ranges denote minimum and maximum improvement coefficients depending +# on benchmark. In "this/original" column lower coefficient is for +# ECDSA sign, while in "with/without" - for ECDH key agreement, and +# higher - for ECDSA sign, relatively fastest server-side operation. +# Keep in mind that +100% means 2x improvement. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); + $addx = ($1>=2.23); +} + +if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); + $addx = ($1>=2.10); +} + +if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); + $addx = ($1>=12); +} + +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { + my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 + $avx = ($ver>=3.0) + ($ver>=3.01); + $addx = ($ver>=3.03); +} + +$code.=<<___; +.text +.extern OPENSSL_ia32cap_P + +# The polynomial +.align 64 +.Lpoly: +.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 + +# 2^512 mod P precomputed for NIST P256 polynomial +.LRR: +.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd + +.LOne: +.long 1,1,1,1,1,1,1,1 +.LTwo: +.long 2,2,2,2,2,2,2,2 +.LThree: +.long 3,3,3,3,3,3,3,3 +.LONE_mont: +.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + +# Constants for computations modulo ord(p256) +.Lord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f +___ + +{ +################################################################################ +# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]); + +my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11)); +my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13"); +my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx"); + +$code.=<<___; + +.globl ecp_nistz256_mul_by_2 +.type ecp_nistz256_mul_by_2,\@function,2 +.align 64 +ecp_nistz256_mul_by_2: +.cfi_startproc + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 +.Lmul_by_2_body: + + mov 8*0($a_ptr), $a0 + xor $t4,$t4 + mov 8*1($a_ptr), $a1 + add $a0, $a0 # a0:a3+a0:a3 + mov 8*2($a_ptr), $a2 + adc $a1, $a1 + mov 8*3($a_ptr), $a3 + lea .Lpoly(%rip), $a_ptr + mov $a0, $t0 + adc $a2, $a2 + adc $a3, $a3 + mov $a1, $t1 + adc \$0, $t4 + + sub 8*0($a_ptr), $a0 + mov $a2, $t2 + sbb 8*1($a_ptr), $a1 + sbb 8*2($a_ptr), $a2 + mov $a3, $t3 + sbb 8*3($a_ptr), $a3 + sbb \$0, $t4 + + cmovc $t0, $a0 + cmovc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + mov 0(%rsp),%r13 +.cfi_restore %r13 + mov 8(%rsp),%r12 +.cfi_restore %r12 + lea 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lmul_by_2_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 + +################################################################################ +# void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]); +.globl ecp_nistz256_div_by_2 +.type ecp_nistz256_div_by_2,\@function,2 +.align 32 +ecp_nistz256_div_by_2: +.cfi_startproc + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 +.Ldiv_by_2_body: + + mov 8*0($a_ptr), $a0 + mov 8*1($a_ptr), $a1 + mov 8*2($a_ptr), $a2 + mov $a0, $t0 + mov 8*3($a_ptr), $a3 + lea .Lpoly(%rip), $a_ptr + + mov $a1, $t1 + xor $t4, $t4 + add 8*0($a_ptr), $a0 + mov $a2, $t2 + adc 8*1($a_ptr), $a1 + adc 8*2($a_ptr), $a2 + mov $a3, $t3 + adc 8*3($a_ptr), $a3 + adc \$0, $t4 + xor $a_ptr, $a_ptr # borrow $a_ptr + test \$1, $t0 + + cmovz $t0, $a0 + cmovz $t1, $a1 + cmovz $t2, $a2 + cmovz $t3, $a3 + cmovz $a_ptr, $t4 + + mov $a1, $t0 # a0:a3>>1 + shr \$1, $a0 + shl \$63, $t0 + mov $a2, $t1 + shr \$1, $a1 + or $t0, $a0 + shl \$63, $t1 + mov $a3, $t2 + shr \$1, $a2 + or $t1, $a1 + shl \$63, $t2 + shr \$1, $a3 + shl \$63, $t4 + or $t2, $a2 + or $t4, $a3 + + mov $a0, 8*0($r_ptr) + mov $a1, 8*1($r_ptr) + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + mov 0(%rsp),%r13 +.cfi_restore %r13 + mov 8(%rsp),%r12 +.cfi_restore %r12 + lea 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Ldiv_by_2_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 + +################################################################################ +# void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]); +.globl ecp_nistz256_mul_by_3 +.type ecp_nistz256_mul_by_3,\@function,2 +.align 32 +ecp_nistz256_mul_by_3: +.cfi_startproc + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 +.Lmul_by_3_body: + + mov 8*0($a_ptr), $a0 + xor $t4, $t4 + mov 8*1($a_ptr), $a1 + add $a0, $a0 # a0:a3+a0:a3 + mov 8*2($a_ptr), $a2 + adc $a1, $a1 + mov 8*3($a_ptr), $a3 + mov $a0, $t0 + adc $a2, $a2 + adc $a3, $a3 + mov $a1, $t1 + adc \$0, $t4 + + sub \$-1, $a0 + mov $a2, $t2 + sbb .Lpoly+8*1(%rip), $a1 + sbb \$0, $a2 + mov $a3, $t3 + sbb .Lpoly+8*3(%rip), $a3 + sbb \$0, $t4 + + cmovc $t0, $a0 + cmovc $t1, $a1 + cmovc $t2, $a2 + cmovc $t3, $a3 + + xor $t4, $t4 + add 8*0($a_ptr), $a0 # a0:a3+=a_ptr[0:3] + adc 8*1($a_ptr), $a1 + mov $a0, $t0 + adc 8*2($a_ptr), $a2 + adc 8*3($a_ptr), $a3 + mov $a1, $t1 + adc \$0, $t4 + + sub \$-1, $a0 + mov $a2, $t2 + sbb .Lpoly+8*1(%rip), $a1 + sbb \$0, $a2 + mov $a3, $t3 + sbb .Lpoly+8*3(%rip), $a3 + sbb \$0, $t4 + + cmovc $t0, $a0 + cmovc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + mov 0(%rsp),%r13 +.cfi_restore %r13 + mov 8(%rsp),%r12 +.cfi_restore %r12 + lea 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lmul_by_3_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 + +################################################################################ +# void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]); +.globl ecp_nistz256_add +.type ecp_nistz256_add,\@function,3 +.align 32 +ecp_nistz256_add: +.cfi_startproc + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 +.Ladd_body: + + mov 8*0($a_ptr), $a0 + xor $t4, $t4 + mov 8*1($a_ptr), $a1 + mov 8*2($a_ptr), $a2 + mov 8*3($a_ptr), $a3 + lea .Lpoly(%rip), $a_ptr + + add 8*0($b_ptr), $a0 + adc 8*1($b_ptr), $a1 + mov $a0, $t0 + adc 8*2($b_ptr), $a2 + adc 8*3($b_ptr), $a3 + mov $a1, $t1 + adc \$0, $t4 + + sub 8*0($a_ptr), $a0 + mov $a2, $t2 + sbb 8*1($a_ptr), $a1 + sbb 8*2($a_ptr), $a2 + mov $a3, $t3 + sbb 8*3($a_ptr), $a3 + sbb \$0, $t4 + + cmovc $t0, $a0 + cmovc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + mov 0(%rsp),%r13 +.cfi_restore %r13 + mov 8(%rsp),%r12 +.cfi_restore %r12 + lea 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Ladd_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_add,.-ecp_nistz256_add + +################################################################################ +# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]); +.globl ecp_nistz256_sub +.type ecp_nistz256_sub,\@function,3 +.align 32 +ecp_nistz256_sub: +.cfi_startproc + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 +.Lsub_body: + + mov 8*0($a_ptr), $a0 + xor $t4, $t4 + mov 8*1($a_ptr), $a1 + mov 8*2($a_ptr), $a2 + mov 8*3($a_ptr), $a3 + lea .Lpoly(%rip), $a_ptr + + sub 8*0($b_ptr), $a0 + sbb 8*1($b_ptr), $a1 + mov $a0, $t0 + sbb 8*2($b_ptr), $a2 + sbb 8*3($b_ptr), $a3 + mov $a1, $t1 + sbb \$0, $t4 + + add 8*0($a_ptr), $a0 + mov $a2, $t2 + adc 8*1($a_ptr), $a1 + adc 8*2($a_ptr), $a2 + mov $a3, $t3 + adc 8*3($a_ptr), $a3 + test $t4, $t4 + + cmovz $t0, $a0 + cmovz $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovz $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovz $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + mov 0(%rsp),%r13 +.cfi_restore %r13 + mov 8(%rsp),%r12 +.cfi_restore %r12 + lea 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lsub_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_sub,.-ecp_nistz256_sub + +################################################################################ +# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]); +.globl ecp_nistz256_neg +.type ecp_nistz256_neg,\@function,2 +.align 32 +ecp_nistz256_neg: +.cfi_startproc + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 +.Lneg_body: + + xor $a0, $a0 + xor $a1, $a1 + xor $a2, $a2 + xor $a3, $a3 + xor $t4, $t4 + + sub 8*0($a_ptr), $a0 + sbb 8*1($a_ptr), $a1 + sbb 8*2($a_ptr), $a2 + mov $a0, $t0 + sbb 8*3($a_ptr), $a3 + lea .Lpoly(%rip), $a_ptr + mov $a1, $t1 + sbb \$0, $t4 + + add 8*0($a_ptr), $a0 + mov $a2, $t2 + adc 8*1($a_ptr), $a1 + adc 8*2($a_ptr), $a2 + mov $a3, $t3 + adc 8*3($a_ptr), $a3 + test $t4, $t4 + + cmovz $t0, $a0 + cmovz $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovz $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovz $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + mov 0(%rsp),%r13 +.cfi_restore %r13 + mov 8(%rsp),%r12 +.cfi_restore %r12 + lea 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lneg_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_neg,.-ecp_nistz256_neg +___ +} +{ +my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); +my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); +my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax"); +my ($poly1,$poly3)=($acc6,$acc7); + +$code.=<<___; +################################################################################ +# void ecp_nistz256_ord_mul_mont( +# uint64_t res[4], +# uint64_t a[4], +# uint64_t b[4]); + +.globl ecp_nistz256_ord_mul_mont +.type ecp_nistz256_ord_mul_mont,\@function,3 +.align 32 +ecp_nistz256_ord_mul_mont: +.cfi_startproc +___ +$code.=<<___ if ($addx); + mov \$0x80100, %ecx + and OPENSSL_ia32cap_P+8(%rip), %ecx + cmp \$0x80100, %ecx + je .Lecp_nistz256_ord_mul_montx +___ +$code.=<<___; + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lord_mul_body: + + mov 8*0($b_org), %rax + mov $b_org, $b_ptr + lea .Lord(%rip), %r14 + mov .LordK(%rip), %r15 + + ################################# * b[0] + mov %rax, $t0 + mulq 8*0($a_ptr) + mov %rax, $acc0 + mov $t0, %rax + mov %rdx, $acc1 + + mulq 8*1($a_ptr) + add %rax, $acc1 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $acc2 + + mulq 8*2($a_ptr) + add %rax, $acc2 + mov $t0, %rax + adc \$0, %rdx + + mov $acc0, $acc5 + imulq %r15,$acc0 + + mov %rdx, $acc3 + mulq 8*3($a_ptr) + add %rax, $acc3 + mov $acc0, %rax + adc \$0, %rdx + mov %rdx, $acc4 + + ################################# First reduction step + mulq 8*0(%r14) + mov $acc0, $t1 + add %rax, $acc5 # guaranteed to be zero + mov $acc0, %rax + adc \$0, %rdx + mov %rdx, $t0 + + sub $acc0, $acc2 + sbb \$0, $acc0 # can't borrow + + mulq 8*1(%r14) + add $t0, $acc1 + adc \$0, %rdx + add %rax, $acc1 + mov $t1, %rax + adc %rdx, $acc2 + mov $t1, %rdx + adc \$0, $acc0 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc3 + mov 8*1($b_ptr), %rax + sbb %rdx, $t1 # can't borrow + + add $acc0, $acc3 + adc $t1, $acc4 + adc \$0, $acc5 + + ################################# * b[1] + mov %rax, $t0 + mulq 8*0($a_ptr) + add %rax, $acc1 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*1($a_ptr) + add $t1, $acc2 + adc \$0, %rdx + add %rax, $acc2 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*2($a_ptr) + add $t1, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t0, %rax + adc \$0, %rdx + + mov $acc1, $t0 + imulq %r15, $acc1 + + mov %rdx, $t1 + mulq 8*3($a_ptr) + add $t1, $acc4 + adc \$0, %rdx + xor $acc0, $acc0 + add %rax, $acc4 + mov $acc1, %rax + adc %rdx, $acc5 + adc \$0, $acc0 + + ################################# Second reduction step + mulq 8*0(%r14) + mov $acc1, $t1 + add %rax, $t0 # guaranteed to be zero + mov $acc1, %rax + adc %rdx, $t0 + + sub $acc1, $acc3 + sbb \$0, $acc1 # can't borrow + + mulq 8*1(%r14) + add $t0, $acc2 + adc \$0, %rdx + add %rax, $acc2 + mov $t1, %rax + adc %rdx, $acc3 + mov $t1, %rdx + adc \$0, $acc1 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc4 + mov 8*2($b_ptr), %rax + sbb %rdx, $t1 # can't borrow + + add $acc1, $acc4 + adc $t1, $acc5 + adc \$0, $acc0 + + ################################## * b[2] + mov %rax, $t0 + mulq 8*0($a_ptr) + add %rax, $acc2 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*1($a_ptr) + add $t1, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*2($a_ptr) + add $t1, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t0, %rax + adc \$0, %rdx + + mov $acc2, $t0 + imulq %r15, $acc2 + + mov %rdx, $t1 + mulq 8*3($a_ptr) + add $t1, $acc5 + adc \$0, %rdx + xor $acc1, $acc1 + add %rax, $acc5 + mov $acc2, %rax + adc %rdx, $acc0 + adc \$0, $acc1 + + ################################# Third reduction step + mulq 8*0(%r14) + mov $acc2, $t1 + add %rax, $t0 # guaranteed to be zero + mov $acc2, %rax + adc %rdx, $t0 + + sub $acc2, $acc4 + sbb \$0, $acc2 # can't borrow + + mulq 8*1(%r14) + add $t0, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t1, %rax + adc %rdx, $acc4 + mov $t1, %rdx + adc \$0, $acc2 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc5 + mov 8*3($b_ptr), %rax + sbb %rdx, $t1 # can't borrow + + add $acc2, $acc5 + adc $t1, $acc0 + adc \$0, $acc1 + + ################################# * b[3] + mov %rax, $t0 + mulq 8*0($a_ptr) + add %rax, $acc3 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*1($a_ptr) + add $t1, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq 8*2($a_ptr) + add $t1, $acc5 + adc \$0, %rdx + add %rax, $acc5 + mov $t0, %rax + adc \$0, %rdx + + mov $acc3, $t0 + imulq %r15, $acc3 + + mov %rdx, $t1 + mulq 8*3($a_ptr) + add $t1, $acc0 + adc \$0, %rdx + xor $acc2, $acc2 + add %rax, $acc0 + mov $acc3, %rax + adc %rdx, $acc1 + adc \$0, $acc2 + + ################################# Last reduction step + mulq 8*0(%r14) + mov $acc3, $t1 + add %rax, $t0 # guaranteed to be zero + mov $acc3, %rax + adc %rdx, $t0 + + sub $acc3, $acc5 + sbb \$0, $acc3 # can't borrow + + mulq 8*1(%r14) + add $t0, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t1, %rax + adc %rdx, $acc5 + mov $t1, %rdx + adc \$0, $acc3 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc0 + sbb %rdx, $t1 # can't borrow + + add $acc3, $acc0 + adc $t1, $acc1 + adc \$0, $acc2 + + ################################# Subtract ord + mov $acc4, $a_ptr + sub 8*0(%r14), $acc4 + mov $acc5, $acc3 + sbb 8*1(%r14), $acc5 + mov $acc0, $t0 + sbb 8*2(%r14), $acc0 + mov $acc1, $t1 + sbb 8*3(%r14), $acc1 + sbb \$0, $acc2 + + cmovc $a_ptr, $acc4 + cmovc $acc3, $acc5 + cmovc $t0, $acc0 + cmovc $t1, $acc1 + + mov $acc4, 8*0($r_ptr) + mov $acc5, 8*1($r_ptr) + mov $acc0, 8*2($r_ptr) + mov $acc1, 8*3($r_ptr) + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mul_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont + +################################################################################ +# void ecp_nistz256_ord_sqr_mont( +# uint64_t res[4], +# uint64_t a[4], +# int rep); + +.globl ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_sqr_mont,\@function,3 +.align 32 +ecp_nistz256_ord_sqr_mont: +.cfi_startproc +___ +$code.=<<___ if ($addx); + mov \$0x80100, %ecx + and OPENSSL_ia32cap_P+8(%rip), %ecx + cmp \$0x80100, %ecx + je .Lecp_nistz256_ord_sqr_montx +___ +$code.=<<___; + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lord_sqr_body: + + mov 8*0($a_ptr), $acc0 + mov 8*1($a_ptr), %rax + mov 8*2($a_ptr), $acc6 + mov 8*3($a_ptr), $acc7 + lea .Lord(%rip), $a_ptr # pointer to modulus + mov $b_org, $b_ptr + jmp .Loop_ord_sqr + +.align 32 +.Loop_ord_sqr: + ################################# a[1:] * a[0] + mov %rax, $t1 # put aside a[1] + mul $acc0 # a[1] * a[0] + mov %rax, $acc1 + movq $t1, %xmm1 # offload a[1] + mov $acc6, %rax + mov %rdx, $acc2 + + mul $acc0 # a[2] * a[0] + add %rax, $acc2 + mov $acc7, %rax + movq $acc6, %xmm2 # offload a[2] + adc \$0, %rdx + mov %rdx, $acc3 + + mul $acc0 # a[3] * a[0] + add %rax, $acc3 + mov $acc7, %rax + movq $acc7, %xmm3 # offload a[3] + adc \$0, %rdx + mov %rdx, $acc4 + + ################################# a[3] * a[2] + mul $acc6 # a[3] * a[2] + mov %rax, $acc5 + mov $acc6, %rax + mov %rdx, $acc6 + + ################################# a[2:] * a[1] + mul $t1 # a[2] * a[1] + add %rax, $acc3 + mov $acc7, %rax + adc \$0, %rdx + mov %rdx, $acc7 + + mul $t1 # a[3] * a[1] + add %rax, $acc4 + adc \$0, %rdx + + add $acc7, $acc4 + adc %rdx, $acc5 + adc \$0, $acc6 # can't overflow + + ################################# *2 + xor $acc7, $acc7 + mov $acc0, %rax + add $acc1, $acc1 + adc $acc2, $acc2 + adc $acc3, $acc3 + adc $acc4, $acc4 + adc $acc5, $acc5 + adc $acc6, $acc6 + adc \$0, $acc7 + + ################################# Missing products + mul %rax # a[0] * a[0] + mov %rax, $acc0 + movq %xmm1, %rax + mov %rdx, $t1 + + mul %rax # a[1] * a[1] + add $t1, $acc1 + adc %rax, $acc2 + movq %xmm2, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mul %rax # a[2] * a[2] + add $t1, $acc3 + adc %rax, $acc4 + movq %xmm3, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mov $acc0, $t0 + imulq 8*4($a_ptr), $acc0 # *= .LordK + + mul %rax # a[3] * a[3] + add $t1, $acc5 + adc %rax, $acc6 + mov 8*0($a_ptr), %rax # modulus[0] + adc %rdx, $acc7 # can't overflow + + ################################# First reduction step + mul $acc0 + mov $acc0, $t1 + add %rax, $t0 # guaranteed to be zero + mov 8*1($a_ptr), %rax # modulus[1] + adc %rdx, $t0 + + sub $acc0, $acc2 + sbb \$0, $t1 # can't borrow + + mul $acc0 + add $t0, $acc1 + adc \$0, %rdx + add %rax, $acc1 + mov $acc0, %rax + adc %rdx, $acc2 + mov $acc0, %rdx + adc \$0, $t1 # can't overflow + + mov $acc1, $t0 + imulq 8*4($a_ptr), $acc1 # *= .LordK + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc3 + mov 8*0($a_ptr), %rax + sbb %rdx, $acc0 # can't borrow + + add $t1, $acc3 + adc \$0, $acc0 # can't overflow + + ################################# Second reduction step + mul $acc1 + mov $acc1, $t1 + add %rax, $t0 # guaranteed to be zero + mov 8*1($a_ptr), %rax + adc %rdx, $t0 + + sub $acc1, $acc3 + sbb \$0, $t1 # can't borrow + + mul $acc1 + add $t0, $acc2 + adc \$0, %rdx + add %rax, $acc2 + mov $acc1, %rax + adc %rdx, $acc3 + mov $acc1, %rdx + adc \$0, $t1 # can't overflow + + mov $acc2, $t0 + imulq 8*4($a_ptr), $acc2 # *= .LordK + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc0 + mov 8*0($a_ptr), %rax + sbb %rdx, $acc1 # can't borrow + + add $t1, $acc0 + adc \$0, $acc1 # can't overflow + + ################################# Third reduction step + mul $acc2 + mov $acc2, $t1 + add %rax, $t0 # guaranteed to be zero + mov 8*1($a_ptr), %rax + adc %rdx, $t0 + + sub $acc2, $acc0 + sbb \$0, $t1 # can't borrow + + mul $acc2 + add $t0, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $acc2, %rax + adc %rdx, $acc0 + mov $acc2, %rdx + adc \$0, $t1 # can't overflow + + mov $acc3, $t0 + imulq 8*4($a_ptr), $acc3 # *= .LordK + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc1 + mov 8*0($a_ptr), %rax + sbb %rdx, $acc2 # can't borrow + + add $t1, $acc1 + adc \$0, $acc2 # can't overflow + + ################################# Last reduction step + mul $acc3 + mov $acc3, $t1 + add %rax, $t0 # guaranteed to be zero + mov 8*1($a_ptr), %rax + adc %rdx, $t0 + + sub $acc3, $acc1 + sbb \$0, $t1 # can't borrow + + mul $acc3 + add $t0, $acc0 + adc \$0, %rdx + add %rax, $acc0 + mov $acc3, %rax + adc %rdx, $acc1 + mov $acc3, %rdx + adc \$0, $t1 # can't overflow + + shl \$32, %rax + shr \$32, %rdx + sub %rax, $acc2 + sbb %rdx, $acc3 # can't borrow + + add $t1, $acc2 + adc \$0, $acc3 # can't overflow + + ################################# Add bits [511:256] of the sqr result + xor %rdx, %rdx + add $acc4, $acc0 + adc $acc5, $acc1 + mov $acc0, $acc4 + adc $acc6, $acc2 + adc $acc7, $acc3 + mov $acc1, %rax + adc \$0, %rdx + + ################################# Compare to modulus + sub 8*0($a_ptr), $acc0 + mov $acc2, $acc6 + sbb 8*1($a_ptr), $acc1 + sbb 8*2($a_ptr), $acc2 + mov $acc3, $acc7 + sbb 8*3($a_ptr), $acc3 + sbb \$0, %rdx + + cmovc $acc4, $acc0 + cmovnc $acc1, %rax + cmovnc $acc2, $acc6 + cmovnc $acc3, $acc7 + + dec $b_ptr + jnz .Loop_ord_sqr + + mov $acc0, 8*0($r_ptr) + mov %rax, 8*1($r_ptr) + pxor %xmm1, %xmm1 + mov $acc6, 8*2($r_ptr) + pxor %xmm2, %xmm2 + mov $acc7, 8*3($r_ptr) + pxor %xmm3, %xmm3 + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqr_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +___ + +$code.=<<___ if ($addx); +################################################################################ +.type ecp_nistz256_ord_mul_montx,\@function,3 +.align 32 +ecp_nistz256_ord_mul_montx: +.cfi_startproc +.Lecp_nistz256_ord_mul_montx: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lord_mulx_body: + + mov $b_org, $b_ptr + mov 8*0($b_org), %rdx + mov 8*0($a_ptr), $acc1 + mov 8*1($a_ptr), $acc2 + mov 8*2($a_ptr), $acc3 + mov 8*3($a_ptr), $acc4 + lea -128($a_ptr), $a_ptr # control u-op density + lea .Lord-128(%rip), %r14 + mov .LordK(%rip), %r15 + + ################################# Multiply by b[0] + mulx $acc1, $acc0, $acc1 + mulx $acc2, $t0, $acc2 + mulx $acc3, $t1, $acc3 + add $t0, $acc1 + mulx $acc4, $t0, $acc4 + mov $acc0, %rdx + mulx %r15, %rdx, %rax + adc $t1, $acc2 + adc $t0, $acc3 + adc \$0, $acc4 + + ################################# reduction + xor $acc5, $acc5 # $acc5=0, cf=0, of=0 + mulx 8*0+128(%r14), $t0, $t1 + adcx $t0, $acc0 # guaranteed to be zero + adox $t1, $acc1 + + mulx 8*1+128(%r14), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + + mulx 8*2+128(%r14), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*3+128(%r14), $t0, $t1 + mov 8*1($b_ptr), %rdx + adcx $t0, $acc3 + adox $t1, $acc4 + adcx $acc0, $acc4 + adox $acc0, $acc5 + adc \$0, $acc5 # cf=0, of=0 + + ################################# Multiply by b[1] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc1, %rdx + mulx %r15, %rdx, %rax + adcx $t0, $acc4 + adox $t1, $acc5 + + adcx $acc0, $acc5 + adox $acc0, $acc0 + adc \$0, $acc0 # cf=0, of=0 + + ################################# reduction + mulx 8*0+128(%r14), $t0, $t1 + adcx $t0, $acc1 # guaranteed to be zero + adox $t1, $acc2 + + mulx 8*1+128(%r14), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*2+128(%r14), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*3+128(%r14), $t0, $t1 + mov 8*2($b_ptr), %rdx + adcx $t0, $acc4 + adox $t1, $acc5 + adcx $acc1, $acc5 + adox $acc1, $acc0 + adc \$0, $acc0 # cf=0, of=0 + + ################################# Multiply by b[2] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc2, %rdx + mulx %r15, %rdx, %rax + adcx $t0, $acc5 + adox $t1, $acc0 + + adcx $acc1, $acc0 + adox $acc1, $acc1 + adc \$0, $acc1 # cf=0, of=0 + + ################################# reduction + mulx 8*0+128(%r14), $t0, $t1 + adcx $t0, $acc2 # guaranteed to be zero + adox $t1, $acc3 + + mulx 8*1+128(%r14), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*2+128(%r14), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*3+128(%r14), $t0, $t1 + mov 8*3($b_ptr), %rdx + adcx $t0, $acc5 + adox $t1, $acc0 + adcx $acc2, $acc0 + adox $acc2, $acc1 + adc \$0, $acc1 # cf=0, of=0 + + ################################# Multiply by b[3] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc5 + adox $t1, $acc0 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc3, %rdx + mulx %r15, %rdx, %rax + adcx $t0, $acc0 + adox $t1, $acc1 + + adcx $acc2, $acc1 + adox $acc2, $acc2 + adc \$0, $acc2 # cf=0, of=0 + + ################################# reduction + mulx 8*0+128(%r14), $t0, $t1 + adcx $t0, $acc3 # guaranteed to be zero + adox $t1, $acc4 + + mulx 8*1+128(%r14), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*2+128(%r14), $t0, $t1 + adcx $t0, $acc5 + adox $t1, $acc0 + + mulx 8*3+128(%r14), $t0, $t1 + lea 128(%r14),%r14 + mov $acc4, $t2 + adcx $t0, $acc0 + adox $t1, $acc1 + mov $acc5, $t3 + adcx $acc3, $acc1 + adox $acc3, $acc2 + adc \$0, $acc2 + + ################################# + # Branch-less conditional subtraction of P + mov $acc0, $t0 + sub 8*0(%r14), $acc4 + sbb 8*1(%r14), $acc5 + sbb 8*2(%r14), $acc0 + mov $acc1, $t1 + sbb 8*3(%r14), $acc1 + sbb \$0, $acc2 + + cmovc $t2, $acc4 + cmovc $t3, $acc5 + cmovc $t0, $acc0 + cmovc $t1, $acc1 + + mov $acc4, 8*0($r_ptr) + mov $acc5, 8*1($r_ptr) + mov $acc0, 8*2($r_ptr) + mov $acc1, 8*3($r_ptr) + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mulx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx + +.type ecp_nistz256_ord_sqr_montx,\@function,3 +.align 32 +ecp_nistz256_ord_sqr_montx: +.cfi_startproc +.Lecp_nistz256_ord_sqr_montx: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lord_sqrx_body: + + mov $b_org, $b_ptr + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), $acc6 + mov 8*2($a_ptr), $acc7 + mov 8*3($a_ptr), $acc0 + lea .Lord(%rip), $a_ptr + jmp .Loop_ord_sqrx + +.align 32 +.Loop_ord_sqrx: + mulx $acc6, $acc1, $acc2 # a[0]*a[1] + mulx $acc7, $t0, $acc3 # a[0]*a[2] + mov %rdx, %rax # offload a[0] + movq $acc6, %xmm1 # offload a[1] + mulx $acc0, $t1, $acc4 # a[0]*a[3] + mov $acc6, %rdx + add $t0, $acc2 + movq $acc7, %xmm2 # offload a[2] + adc $t1, $acc3 + adc \$0, $acc4 + xor $acc5, $acc5 # $acc5=0,cf=0,of=0 + ################################# + mulx $acc7, $t0, $t1 # a[1]*a[2] + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx $acc0, $t0, $t1 # a[1]*a[3] + mov $acc7, %rdx + adcx $t0, $acc4 + adox $t1, $acc5 + adc \$0, $acc5 + ################################# + mulx $acc0, $t0, $acc6 # a[2]*a[3] + mov %rax, %rdx + movq $acc0, %xmm3 # offload a[3] + xor $acc7, $acc7 # $acc7=0,cf=0,of=0 + adcx $acc1, $acc1 # acc1:6<<1 + adox $t0, $acc5 + adcx $acc2, $acc2 + adox $acc7, $acc6 # of=0 + + ################################# a[i]*a[i] + mulx %rdx, $acc0, $t1 + movq %xmm1, %rdx + adcx $acc3, $acc3 + adox $t1, $acc1 + adcx $acc4, $acc4 + mulx %rdx, $t0, $t4 + movq %xmm2, %rdx + adcx $acc5, $acc5 + adox $t0, $acc2 + adcx $acc6, $acc6 + mulx %rdx, $t0, $t1 + .byte 0x67 + movq %xmm3, %rdx + adox $t4, $acc3 + adcx $acc7, $acc7 + adox $t0, $acc4 + adox $t1, $acc5 + mulx %rdx, $t0, $t4 + adox $t0, $acc6 + adox $t4, $acc7 + + ################################# reduction + mov $acc0, %rdx + mulx 8*4($a_ptr), %rdx, $t0 + + xor %rax, %rax # cf=0, of=0 + mulx 8*0($a_ptr), $t0, $t1 + adcx $t0, $acc0 # guaranteed to be zero + adox $t1, $acc1 + mulx 8*1($a_ptr), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + mulx 8*2($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + mulx 8*3($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc0 # of=0 + adcx %rax, $acc0 # cf=0 + + ################################# + mov $acc1, %rdx + mulx 8*4($a_ptr), %rdx, $t0 + + mulx 8*0($a_ptr), $t0, $t1 + adox $t0, $acc1 # guaranteed to be zero + adcx $t1, $acc2 + mulx 8*1($a_ptr), $t0, $t1 + adox $t0, $acc2 + adcx $t1, $acc3 + mulx 8*2($a_ptr), $t0, $t1 + adox $t0, $acc3 + adcx $t1, $acc0 + mulx 8*3($a_ptr), $t0, $t1 + adox $t0, $acc0 + adcx $t1, $acc1 # cf=0 + adox %rax, $acc1 # of=0 + + ################################# + mov $acc2, %rdx + mulx 8*4($a_ptr), %rdx, $t0 + + mulx 8*0($a_ptr), $t0, $t1 + adcx $t0, $acc2 # guaranteed to be zero + adox $t1, $acc3 + mulx 8*1($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc0 + mulx 8*2($a_ptr), $t0, $t1 + adcx $t0, $acc0 + adox $t1, $acc1 + mulx 8*3($a_ptr), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 # of=0 + adcx %rax, $acc2 # cf=0 + + ################################# + mov $acc3, %rdx + mulx 8*4($a_ptr), %rdx, $t0 + + mulx 8*0($a_ptr), $t0, $t1 + adox $t0, $acc3 # guaranteed to be zero + adcx $t1, $acc0 + mulx 8*1($a_ptr), $t0, $t1 + adox $t0, $acc0 + adcx $t1, $acc1 + mulx 8*2($a_ptr), $t0, $t1 + adox $t0, $acc1 + adcx $t1, $acc2 + mulx 8*3($a_ptr), $t0, $t1 + adox $t0, $acc2 + adcx $t1, $acc3 + adox %rax, $acc3 + + ################################# accumulate upper half + add $acc0, $acc4 # add $acc4, $acc0 + adc $acc5, $acc1 + mov $acc4, %rdx + adc $acc6, $acc2 + adc $acc7, $acc3 + mov $acc1, $acc6 + adc \$0, %rax + + ################################# compare to modulus + sub 8*0($a_ptr), $acc4 + mov $acc2, $acc7 + sbb 8*1($a_ptr), $acc1 + sbb 8*2($a_ptr), $acc2 + mov $acc3, $acc0 + sbb 8*3($a_ptr), $acc3 + sbb \$0, %rax + + cmovnc $acc4, %rdx + cmovnc $acc1, $acc6 + cmovnc $acc2, $acc7 + cmovnc $acc3, $acc0 + + dec $b_ptr + jnz .Loop_ord_sqrx + + mov %rdx, 8*0($r_ptr) + mov $acc6, 8*1($r_ptr) + pxor %xmm1, %xmm1 + mov $acc7, 8*2($r_ptr) + pxor %xmm2, %xmm2 + mov $acc0, 8*3($r_ptr) + pxor %xmm3, %xmm3 + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqrx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx +___ + +$code.=<<___; +################################################################################ +# void ecp_nistz256_to_mont( +# uint64_t res[4], +# uint64_t in[4]); +.globl ecp_nistz256_to_mont +.type ecp_nistz256_to_mont,\@function,2 +.align 32 +ecp_nistz256_to_mont: +.cfi_startproc +___ +$code.=<<___ if ($addx); + mov \$0x80100, %ecx + and OPENSSL_ia32cap_P+8(%rip), %ecx +___ +$code.=<<___; + lea .LRR(%rip), $b_org + jmp .Lmul_mont +.cfi_endproc +.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont + +################################################################################ +# void ecp_nistz256_mul_mont( +# uint64_t res[4], +# uint64_t a[4], +# uint64_t b[4]); + +.globl ecp_nistz256_mul_mont +.type ecp_nistz256_mul_mont,\@function,3 +.align 32 +ecp_nistz256_mul_mont: +.cfi_startproc +___ +$code.=<<___ if ($addx); + mov \$0x80100, %ecx + and OPENSSL_ia32cap_P+8(%rip), %ecx +___ +$code.=<<___; +.Lmul_mont: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lmul_body: +___ +$code.=<<___ if ($addx); + cmp \$0x80100, %ecx + je .Lmul_montx +___ +$code.=<<___; + mov $b_org, $b_ptr + mov 8*0($b_org), %rax + mov 8*0($a_ptr), $acc1 + mov 8*1($a_ptr), $acc2 + mov 8*2($a_ptr), $acc3 + mov 8*3($a_ptr), $acc4 + + call __ecp_nistz256_mul_montq +___ +$code.=<<___ if ($addx); + jmp .Lmul_mont_done + +.align 32 +.Lmul_montx: + mov $b_org, $b_ptr + mov 8*0($b_org), %rdx + mov 8*0($a_ptr), $acc1 + mov 8*1($a_ptr), $acc2 + mov 8*2($a_ptr), $acc3 + mov 8*3($a_ptr), $acc4 + lea -128($a_ptr), $a_ptr # control u-op density + + call __ecp_nistz256_mul_montx +___ +$code.=<<___; +.Lmul_mont_done: + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmul_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +.type __ecp_nistz256_mul_montq,\@abi-omnipotent +.align 32 +__ecp_nistz256_mul_montq: +.cfi_startproc + ######################################################################## + # Multiply a by b[0] + mov %rax, $t1 + mulq $acc1 + mov .Lpoly+8*1(%rip),$poly1 + mov %rax, $acc0 + mov $t1, %rax + mov %rdx, $acc1 + + mulq $acc2 + mov .Lpoly+8*3(%rip),$poly3 + add %rax, $acc1 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $acc2 + + mulq $acc3 + add %rax, $acc2 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $acc3 + + mulq $acc4 + add %rax, $acc3 + mov $acc0, %rax + adc \$0, %rdx + xor $acc5, $acc5 + mov %rdx, $acc4 + + ######################################################################## + # First reduction step + # Basically now we want to multiply acc[0] by p256, + # and add the result to the acc. + # Due to the special form of p256 we do some optimizations + # + # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0] + # then we add acc[0] and get acc[0] x 2^96 + + mov $acc0, $t1 + shl \$32, $acc0 + mulq $poly3 + shr \$32, $t1 + add $acc0, $acc1 # +=acc[0]<<96 + adc $t1, $acc2 + adc %rax, $acc3 + mov 8*1($b_ptr), %rax + adc %rdx, $acc4 + adc \$0, $acc5 + xor $acc0, $acc0 + + ######################################################################## + # Multiply by b[1] + mov %rax, $t1 + mulq 8*0($a_ptr) + add %rax, $acc1 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*1($a_ptr) + add $t0, $acc2 + adc \$0, %rdx + add %rax, $acc2 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*2($a_ptr) + add $t0, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*3($a_ptr) + add $t0, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $acc1, %rax + adc %rdx, $acc5 + adc \$0, $acc0 + + ######################################################################## + # Second reduction step + mov $acc1, $t1 + shl \$32, $acc1 + mulq $poly3 + shr \$32, $t1 + add $acc1, $acc2 + adc $t1, $acc3 + adc %rax, $acc4 + mov 8*2($b_ptr), %rax + adc %rdx, $acc5 + adc \$0, $acc0 + xor $acc1, $acc1 + + ######################################################################## + # Multiply by b[2] + mov %rax, $t1 + mulq 8*0($a_ptr) + add %rax, $acc2 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*1($a_ptr) + add $t0, $acc3 + adc \$0, %rdx + add %rax, $acc3 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*2($a_ptr) + add $t0, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*3($a_ptr) + add $t0, $acc5 + adc \$0, %rdx + add %rax, $acc5 + mov $acc2, %rax + adc %rdx, $acc0 + adc \$0, $acc1 + + ######################################################################## + # Third reduction step + mov $acc2, $t1 + shl \$32, $acc2 + mulq $poly3 + shr \$32, $t1 + add $acc2, $acc3 + adc $t1, $acc4 + adc %rax, $acc5 + mov 8*3($b_ptr), %rax + adc %rdx, $acc0 + adc \$0, $acc1 + xor $acc2, $acc2 + + ######################################################################## + # Multiply by b[3] + mov %rax, $t1 + mulq 8*0($a_ptr) + add %rax, $acc3 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*1($a_ptr) + add $t0, $acc4 + adc \$0, %rdx + add %rax, $acc4 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*2($a_ptr) + add $t0, $acc5 + adc \$0, %rdx + add %rax, $acc5 + mov $t1, %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq 8*3($a_ptr) + add $t0, $acc0 + adc \$0, %rdx + add %rax, $acc0 + mov $acc3, %rax + adc %rdx, $acc1 + adc \$0, $acc2 + + ######################################################################## + # Final reduction step + mov $acc3, $t1 + shl \$32, $acc3 + mulq $poly3 + shr \$32, $t1 + add $acc3, $acc4 + adc $t1, $acc5 + mov $acc4, $t0 + adc %rax, $acc0 + adc %rdx, $acc1 + mov $acc5, $t1 + adc \$0, $acc2 + + ######################################################################## + # Branch-less conditional subtraction of P + sub \$-1, $acc4 # .Lpoly[0] + mov $acc0, $t2 + sbb $poly1, $acc5 # .Lpoly[1] + sbb \$0, $acc0 # .Lpoly[2] + mov $acc1, $t3 + sbb $poly3, $acc1 # .Lpoly[3] + sbb \$0, $acc2 + + cmovc $t0, $acc4 + cmovc $t1, $acc5 + mov $acc4, 8*0($r_ptr) + cmovc $t2, $acc0 + mov $acc5, 8*1($r_ptr) + cmovc $t3, $acc1 + mov $acc0, 8*2($r_ptr) + mov $acc1, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq + +################################################################################ +# void ecp_nistz256_sqr_mont( +# uint64_t res[4], +# uint64_t a[4]); + +# we optimize the square according to S.Gueron and V.Krasnov, +# "Speeding up Big-Number Squaring" +.globl ecp_nistz256_sqr_mont +.type ecp_nistz256_sqr_mont,\@function,2 +.align 32 +ecp_nistz256_sqr_mont: +.cfi_startproc +___ +$code.=<<___ if ($addx); + mov \$0x80100, %ecx + and OPENSSL_ia32cap_P+8(%rip), %ecx +___ +$code.=<<___; + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lsqr_body: +___ +$code.=<<___ if ($addx); + cmp \$0x80100, %ecx + je .Lsqr_montx +___ +$code.=<<___; + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), $acc6 + mov 8*2($a_ptr), $acc7 + mov 8*3($a_ptr), $acc0 + + call __ecp_nistz256_sqr_montq +___ +$code.=<<___ if ($addx); + jmp .Lsqr_mont_done + +.align 32 +.Lsqr_montx: + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), $acc6 + mov 8*2($a_ptr), $acc7 + mov 8*3($a_ptr), $acc0 + lea -128($a_ptr), $a_ptr # control u-op density + + call __ecp_nistz256_sqr_montx +___ +$code.=<<___; +.Lsqr_mont_done: + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqr_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +.type __ecp_nistz256_sqr_montq,\@abi-omnipotent +.align 32 +__ecp_nistz256_sqr_montq: +.cfi_startproc + mov %rax, $acc5 + mulq $acc6 # a[1]*a[0] + mov %rax, $acc1 + mov $acc7, %rax + mov %rdx, $acc2 + + mulq $acc5 # a[0]*a[2] + add %rax, $acc2 + mov $acc0, %rax + adc \$0, %rdx + mov %rdx, $acc3 + + mulq $acc5 # a[0]*a[3] + add %rax, $acc3 + mov $acc7, %rax + adc \$0, %rdx + mov %rdx, $acc4 + + ################################# + mulq $acc6 # a[1]*a[2] + add %rax, $acc3 + mov $acc0, %rax + adc \$0, %rdx + mov %rdx, $t1 + + mulq $acc6 # a[1]*a[3] + add %rax, $acc4 + mov $acc0, %rax + adc \$0, %rdx + add $t1, $acc4 + mov %rdx, $acc5 + adc \$0, $acc5 + + ################################# + mulq $acc7 # a[2]*a[3] + xor $acc7, $acc7 + add %rax, $acc5 + mov 8*0($a_ptr), %rax + mov %rdx, $acc6 + adc \$0, $acc6 + + add $acc1, $acc1 # acc1:6<<1 + adc $acc2, $acc2 + adc $acc3, $acc3 + adc $acc4, $acc4 + adc $acc5, $acc5 + adc $acc6, $acc6 + adc \$0, $acc7 + + mulq %rax + mov %rax, $acc0 + mov 8*1($a_ptr), %rax + mov %rdx, $t0 + + mulq %rax + add $t0, $acc1 + adc %rax, $acc2 + mov 8*2($a_ptr), %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq %rax + add $t0, $acc3 + adc %rax, $acc4 + mov 8*3($a_ptr), %rax + adc \$0, %rdx + mov %rdx, $t0 + + mulq %rax + add $t0, $acc5 + adc %rax, $acc6 + mov $acc0, %rax + adc %rdx, $acc7 + + mov .Lpoly+8*1(%rip), $a_ptr + mov .Lpoly+8*3(%rip), $t1 + + ########################################## + # Now the reduction + # First iteration + mov $acc0, $t0 + shl \$32, $acc0 + mulq $t1 + shr \$32, $t0 + add $acc0, $acc1 # +=acc[0]<<96 + adc $t0, $acc2 + adc %rax, $acc3 + mov $acc1, %rax + adc \$0, %rdx + + ########################################## + # Second iteration + mov $acc1, $t0 + shl \$32, $acc1 + mov %rdx, $acc0 + mulq $t1 + shr \$32, $t0 + add $acc1, $acc2 + adc $t0, $acc3 + adc %rax, $acc0 + mov $acc2, %rax + adc \$0, %rdx + + ########################################## + # Third iteration + mov $acc2, $t0 + shl \$32, $acc2 + mov %rdx, $acc1 + mulq $t1 + shr \$32, $t0 + add $acc2, $acc3 + adc $t0, $acc0 + adc %rax, $acc1 + mov $acc3, %rax + adc \$0, %rdx + + ########################################### + # Last iteration + mov $acc3, $t0 + shl \$32, $acc3 + mov %rdx, $acc2 + mulq $t1 + shr \$32, $t0 + add $acc3, $acc0 + adc $t0, $acc1 + adc %rax, $acc2 + adc \$0, %rdx + xor $acc3, $acc3 + + ############################################ + # Add the rest of the acc + add $acc0, $acc4 + adc $acc1, $acc5 + mov $acc4, $acc0 + adc $acc2, $acc6 + adc %rdx, $acc7 + mov $acc5, $acc1 + adc \$0, $acc3 + + sub \$-1, $acc4 # .Lpoly[0] + mov $acc6, $acc2 + sbb $a_ptr, $acc5 # .Lpoly[1] + sbb \$0, $acc6 # .Lpoly[2] + mov $acc7, $t0 + sbb $t1, $acc7 # .Lpoly[3] + sbb \$0, $acc3 + + cmovc $acc0, $acc4 + cmovc $acc1, $acc5 + mov $acc4, 8*0($r_ptr) + cmovc $acc2, $acc6 + mov $acc5, 8*1($r_ptr) + cmovc $t0, $acc7 + mov $acc6, 8*2($r_ptr) + mov $acc7, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq +___ + +if ($addx) { +$code.=<<___; +.type __ecp_nistz256_mul_montx,\@abi-omnipotent +.align 32 +__ecp_nistz256_mul_montx: +.cfi_startproc + ######################################################################## + # Multiply by b[0] + mulx $acc1, $acc0, $acc1 + mulx $acc2, $t0, $acc2 + mov \$32, $poly1 + xor $acc5, $acc5 # cf=0 + mulx $acc3, $t1, $acc3 + mov .Lpoly+8*3(%rip), $poly3 + adc $t0, $acc1 + mulx $acc4, $t0, $acc4 + mov $acc0, %rdx + adc $t1, $acc2 + shlx $poly1,$acc0,$t1 + adc $t0, $acc3 + shrx $poly1,$acc0,$t0 + adc \$0, $acc4 + + ######################################################################## + # First reduction step + add $t1, $acc1 + adc $t0, $acc2 + + mulx $poly3, $t0, $t1 + mov 8*1($b_ptr), %rdx + adc $t0, $acc3 + adc $t1, $acc4 + adc \$0, $acc5 + xor $acc0, $acc0 # $acc0=0,cf=0,of=0 + + ######################################################################## + # Multiply by b[1] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc1 + adox $t1, $acc2 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc1, %rdx + adcx $t0, $acc4 + shlx $poly1, $acc1, $t0 + adox $t1, $acc5 + shrx $poly1, $acc1, $t1 + + adcx $acc0, $acc5 + adox $acc0, $acc0 + adc \$0, $acc0 + + ######################################################################## + # Second reduction step + add $t0, $acc2 + adc $t1, $acc3 + + mulx $poly3, $t0, $t1 + mov 8*2($b_ptr), %rdx + adc $t0, $acc4 + adc $t1, $acc5 + adc \$0, $acc0 + xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 + + ######################################################################## + # Multiply by b[2] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc2 + adox $t1, $acc3 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc2, %rdx + adcx $t0, $acc5 + shlx $poly1, $acc2, $t0 + adox $t1, $acc0 + shrx $poly1, $acc2, $t1 + + adcx $acc1, $acc0 + adox $acc1, $acc1 + adc \$0, $acc1 + + ######################################################################## + # Third reduction step + add $t0, $acc3 + adc $t1, $acc4 + + mulx $poly3, $t0, $t1 + mov 8*3($b_ptr), %rdx + adc $t0, $acc5 + adc $t1, $acc0 + adc \$0, $acc1 + xor $acc2, $acc2 # $acc2=0,cf=0,of=0 + + ######################################################################## + # Multiply by b[3] + mulx 8*0+128($a_ptr), $t0, $t1 + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx 8*1+128($a_ptr), $t0, $t1 + adcx $t0, $acc4 + adox $t1, $acc5 + + mulx 8*2+128($a_ptr), $t0, $t1 + adcx $t0, $acc5 + adox $t1, $acc0 + + mulx 8*3+128($a_ptr), $t0, $t1 + mov $acc3, %rdx + adcx $t0, $acc0 + shlx $poly1, $acc3, $t0 + adox $t1, $acc1 + shrx $poly1, $acc3, $t1 + + adcx $acc2, $acc1 + adox $acc2, $acc2 + adc \$0, $acc2 + + ######################################################################## + # Fourth reduction step + add $t0, $acc4 + adc $t1, $acc5 + + mulx $poly3, $t0, $t1 + mov $acc4, $t2 + mov .Lpoly+8*1(%rip), $poly1 + adc $t0, $acc0 + mov $acc5, $t3 + adc $t1, $acc1 + adc \$0, $acc2 + + ######################################################################## + # Branch-less conditional subtraction of P + xor %eax, %eax + mov $acc0, $t0 + sbb \$-1, $acc4 # .Lpoly[0] + sbb $poly1, $acc5 # .Lpoly[1] + sbb \$0, $acc0 # .Lpoly[2] + mov $acc1, $t1 + sbb $poly3, $acc1 # .Lpoly[3] + sbb \$0, $acc2 + + cmovc $t2, $acc4 + cmovc $t3, $acc5 + mov $acc4, 8*0($r_ptr) + cmovc $t0, $acc0 + mov $acc5, 8*1($r_ptr) + cmovc $t1, $acc1 + mov $acc0, 8*2($r_ptr) + mov $acc1, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx + +.type __ecp_nistz256_sqr_montx,\@abi-omnipotent +.align 32 +__ecp_nistz256_sqr_montx: +.cfi_startproc + mulx $acc6, $acc1, $acc2 # a[0]*a[1] + mulx $acc7, $t0, $acc3 # a[0]*a[2] + xor %eax, %eax + adc $t0, $acc2 + mulx $acc0, $t1, $acc4 # a[0]*a[3] + mov $acc6, %rdx + adc $t1, $acc3 + adc \$0, $acc4 + xor $acc5, $acc5 # $acc5=0,cf=0,of=0 + + ################################# + mulx $acc7, $t0, $t1 # a[1]*a[2] + adcx $t0, $acc3 + adox $t1, $acc4 + + mulx $acc0, $t0, $t1 # a[1]*a[3] + mov $acc7, %rdx + adcx $t0, $acc4 + adox $t1, $acc5 + adc \$0, $acc5 + + ################################# + mulx $acc0, $t0, $acc6 # a[2]*a[3] + mov 8*0+128($a_ptr), %rdx + xor $acc7, $acc7 # $acc7=0,cf=0,of=0 + adcx $acc1, $acc1 # acc1:6<<1 + adox $t0, $acc5 + adcx $acc2, $acc2 + adox $acc7, $acc6 # of=0 + + mulx %rdx, $acc0, $t1 + mov 8*1+128($a_ptr), %rdx + adcx $acc3, $acc3 + adox $t1, $acc1 + adcx $acc4, $acc4 + mulx %rdx, $t0, $t4 + mov 8*2+128($a_ptr), %rdx + adcx $acc5, $acc5 + adox $t0, $acc2 + adcx $acc6, $acc6 + .byte 0x67 + mulx %rdx, $t0, $t1 + mov 8*3+128($a_ptr), %rdx + adox $t4, $acc3 + adcx $acc7, $acc7 + adox $t0, $acc4 + mov \$32, $a_ptr + adox $t1, $acc5 + .byte 0x67,0x67 + mulx %rdx, $t0, $t4 + mov .Lpoly+8*3(%rip), %rdx + adox $t0, $acc6 + shlx $a_ptr, $acc0, $t0 + adox $t4, $acc7 + shrx $a_ptr, $acc0, $t4 + mov %rdx,$t1 + + # reduction step 1 + add $t0, $acc1 + adc $t4, $acc2 + + mulx $acc0, $t0, $acc0 + adc $t0, $acc3 + shlx $a_ptr, $acc1, $t0 + adc \$0, $acc0 + shrx $a_ptr, $acc1, $t4 + + # reduction step 2 + add $t0, $acc2 + adc $t4, $acc3 + + mulx $acc1, $t0, $acc1 + adc $t0, $acc0 + shlx $a_ptr, $acc2, $t0 + adc \$0, $acc1 + shrx $a_ptr, $acc2, $t4 + + # reduction step 3 + add $t0, $acc3 + adc $t4, $acc0 + + mulx $acc2, $t0, $acc2 + adc $t0, $acc1 + shlx $a_ptr, $acc3, $t0 + adc \$0, $acc2 + shrx $a_ptr, $acc3, $t4 + + # reduction step 4 + add $t0, $acc0 + adc $t4, $acc1 + + mulx $acc3, $t0, $acc3 + adc $t0, $acc2 + adc \$0, $acc3 + + xor $t3, $t3 + add $acc0, $acc4 # accumulate upper half + mov .Lpoly+8*1(%rip), $a_ptr + adc $acc1, $acc5 + mov $acc4, $acc0 + adc $acc2, $acc6 + adc $acc3, $acc7 + mov $acc5, $acc1 + adc \$0, $t3 + + sub \$-1, $acc4 # .Lpoly[0] + mov $acc6, $acc2 + sbb $a_ptr, $acc5 # .Lpoly[1] + sbb \$0, $acc6 # .Lpoly[2] + mov $acc7, $acc3 + sbb $t1, $acc7 # .Lpoly[3] + sbb \$0, $t3 + + cmovc $acc0, $acc4 + cmovc $acc1, $acc5 + mov $acc4, 8*0($r_ptr) + cmovc $acc2, $acc6 + mov $acc5, 8*1($r_ptr) + cmovc $acc3, $acc7 + mov $acc6, 8*2($r_ptr) + mov $acc7, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx +___ +} +} +{ +my ($r_ptr,$in_ptr)=("%rdi","%rsi"); +my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11)); +my ($t0,$t1,$t2)=("%rcx","%r12","%r13"); + +$code.=<<___; +################################################################################ +# void ecp_nistz256_from_mont( +# uint64_t res[4], +# uint64_t in[4]); +# This one performs Montgomery multiplication by 1, so we only need the reduction + +.globl ecp_nistz256_from_mont +.type ecp_nistz256_from_mont,\@function,2 +.align 32 +ecp_nistz256_from_mont: +.cfi_startproc + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 +.Lfrom_body: + + mov 8*0($in_ptr), %rax + mov .Lpoly+8*3(%rip), $t2 + mov 8*1($in_ptr), $acc1 + mov 8*2($in_ptr), $acc2 + mov 8*3($in_ptr), $acc3 + mov %rax, $acc0 + mov .Lpoly+8*1(%rip), $t1 + + ######################################### + # First iteration + mov %rax, $t0 + shl \$32, $acc0 + mulq $t2 + shr \$32, $t0 + add $acc0, $acc1 + adc $t0, $acc2 + adc %rax, $acc3 + mov $acc1, %rax + adc \$0, %rdx + + ######################################### + # Second iteration + mov $acc1, $t0 + shl \$32, $acc1 + mov %rdx, $acc0 + mulq $t2 + shr \$32, $t0 + add $acc1, $acc2 + adc $t0, $acc3 + adc %rax, $acc0 + mov $acc2, %rax + adc \$0, %rdx + + ########################################## + # Third iteration + mov $acc2, $t0 + shl \$32, $acc2 + mov %rdx, $acc1 + mulq $t2 + shr \$32, $t0 + add $acc2, $acc3 + adc $t0, $acc0 + adc %rax, $acc1 + mov $acc3, %rax + adc \$0, %rdx + + ########################################### + # Last iteration + mov $acc3, $t0 + shl \$32, $acc3 + mov %rdx, $acc2 + mulq $t2 + shr \$32, $t0 + add $acc3, $acc0 + adc $t0, $acc1 + mov $acc0, $t0 + adc %rax, $acc2 + mov $acc1, $in_ptr + adc \$0, %rdx + + ########################################### + # Branch-less conditional subtraction + sub \$-1, $acc0 + mov $acc2, %rax + sbb $t1, $acc1 + sbb \$0, $acc2 + mov %rdx, $acc3 + sbb $t2, %rdx + sbb $t2, $t2 + + cmovnz $t0, $acc0 + cmovnz $in_ptr, $acc1 + mov $acc0, 8*0($r_ptr) + cmovnz %rax, $acc2 + mov $acc1, 8*1($r_ptr) + cmovz %rdx, $acc3 + mov $acc2, 8*2($r_ptr) + mov $acc3, 8*3($r_ptr) + + mov 0(%rsp),%r13 +.cfi_restore %r13 + mov 8(%rsp),%r12 +.cfi_restore %r12 + lea 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lfrom_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont +___ +} +{ +my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); +my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7)); +my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15)); +my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); + +$code.=<<___; +################################################################################ +# void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_scatter_w5 +.type ecp_nistz256_scatter_w5,\@abi-omnipotent +.align 32 +ecp_nistz256_scatter_w5: +.cfi_startproc + lea -3($index,$index,2), $index + movdqa 0x00($in_t), %xmm0 + shl \$5, $index + movdqa 0x10($in_t), %xmm1 + movdqa 0x20($in_t), %xmm2 + movdqa 0x30($in_t), %xmm3 + movdqa 0x40($in_t), %xmm4 + movdqa 0x50($in_t), %xmm5 + movdqa %xmm0, 0x00($val,$index) + movdqa %xmm1, 0x10($val,$index) + movdqa %xmm2, 0x20($val,$index) + movdqa %xmm3, 0x30($val,$index) + movdqa %xmm4, 0x40($val,$index) + movdqa %xmm5, 0x50($val,$index) + + ret +.cfi_endproc +.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 + +################################################################################ +# void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_gather_w5 +.type ecp_nistz256_gather_w5,\@abi-omnipotent +.align 32 +ecp_nistz256_gather_w5: +.cfi_startproc +___ +$code.=<<___ if ($avx>1); + mov OPENSSL_ia32cap_P+8(%rip), %eax + test \$`1<<5`, %eax + jnz .Lavx2_gather_w5 +___ +$code.=<<___ if ($win64); + lea -0x88(%rsp), %rax +.LSEH_begin_ecp_nistz256_gather_w5: + .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp + .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) + .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) + .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) + .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) + .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) + .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) + .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) + .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) + .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) + .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) +___ +$code.=<<___; + movdqa .LOne(%rip), $ONE + movd $index, $INDEX + + pxor $Ra, $Ra + pxor $Rb, $Rb + pxor $Rc, $Rc + pxor $Rd, $Rd + pxor $Re, $Re + pxor $Rf, $Rf + + movdqa $ONE, $M0 + pshufd \$0, $INDEX, $INDEX + + mov \$16, %rax +.Lselect_loop_sse_w5: + + movdqa $M0, $TMP0 + paddd $ONE, $M0 + pcmpeqd $INDEX, $TMP0 + + movdqa 16*0($in_t), $T0a + movdqa 16*1($in_t), $T0b + movdqa 16*2($in_t), $T0c + movdqa 16*3($in_t), $T0d + movdqa 16*4($in_t), $T0e + movdqa 16*5($in_t), $T0f + lea 16*6($in_t), $in_t + + pand $TMP0, $T0a + pand $TMP0, $T0b + por $T0a, $Ra + pand $TMP0, $T0c + por $T0b, $Rb + pand $TMP0, $T0d + por $T0c, $Rc + pand $TMP0, $T0e + por $T0d, $Rd + pand $TMP0, $T0f + por $T0e, $Re + por $T0f, $Rf + + dec %rax + jnz .Lselect_loop_sse_w5 + + movdqu $Ra, 16*0($val) + movdqu $Rb, 16*1($val) + movdqu $Rc, 16*2($val) + movdqu $Rd, 16*3($val) + movdqu $Re, 16*4($val) + movdqu $Rf, 16*5($val) +___ +$code.=<<___ if ($win64); + movaps (%rsp), %xmm6 + movaps 0x10(%rsp), %xmm7 + movaps 0x20(%rsp), %xmm8 + movaps 0x30(%rsp), %xmm9 + movaps 0x40(%rsp), %xmm10 + movaps 0x50(%rsp), %xmm11 + movaps 0x60(%rsp), %xmm12 + movaps 0x70(%rsp), %xmm13 + movaps 0x80(%rsp), %xmm14 + movaps 0x90(%rsp), %xmm15 + lea 0xa8(%rsp), %rsp +___ +$code.=<<___; + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_gather_w5: +.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 + +################################################################################ +# void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_scatter_w7 +.type ecp_nistz256_scatter_w7,\@abi-omnipotent +.align 32 +ecp_nistz256_scatter_w7: +.cfi_startproc + movdqu 0x00($in_t), %xmm0 + shl \$6, $index + movdqu 0x10($in_t), %xmm1 + movdqu 0x20($in_t), %xmm2 + movdqu 0x30($in_t), %xmm3 + movdqa %xmm0, 0x00($val,$index) + movdqa %xmm1, 0x10($val,$index) + movdqa %xmm2, 0x20($val,$index) + movdqa %xmm3, 0x30($val,$index) + + ret +.cfi_endproc +.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 + +################################################################################ +# void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_gather_w7 +.type ecp_nistz256_gather_w7,\@abi-omnipotent +.align 32 +ecp_nistz256_gather_w7: +.cfi_startproc +___ +$code.=<<___ if ($avx>1); + mov OPENSSL_ia32cap_P+8(%rip), %eax + test \$`1<<5`, %eax + jnz .Lavx2_gather_w7 +___ +$code.=<<___ if ($win64); + lea -0x88(%rsp), %rax +.LSEH_begin_ecp_nistz256_gather_w7: + .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp + .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) + .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) + .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) + .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) + .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) + .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) + .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) + .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) + .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) + .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) +___ +$code.=<<___; + movdqa .LOne(%rip), $M0 + movd $index, $INDEX + + pxor $Ra, $Ra + pxor $Rb, $Rb + pxor $Rc, $Rc + pxor $Rd, $Rd + + movdqa $M0, $ONE + pshufd \$0, $INDEX, $INDEX + mov \$64, %rax + +.Lselect_loop_sse_w7: + movdqa $M0, $TMP0 + paddd $ONE, $M0 + movdqa 16*0($in_t), $T0a + movdqa 16*1($in_t), $T0b + pcmpeqd $INDEX, $TMP0 + movdqa 16*2($in_t), $T0c + movdqa 16*3($in_t), $T0d + lea 16*4($in_t), $in_t + + pand $TMP0, $T0a + pand $TMP0, $T0b + por $T0a, $Ra + pand $TMP0, $T0c + por $T0b, $Rb + pand $TMP0, $T0d + por $T0c, $Rc + prefetcht0 255($in_t) + por $T0d, $Rd + + dec %rax + jnz .Lselect_loop_sse_w7 + + movdqu $Ra, 16*0($val) + movdqu $Rb, 16*1($val) + movdqu $Rc, 16*2($val) + movdqu $Rd, 16*3($val) +___ +$code.=<<___ if ($win64); + movaps (%rsp), %xmm6 + movaps 0x10(%rsp), %xmm7 + movaps 0x20(%rsp), %xmm8 + movaps 0x30(%rsp), %xmm9 + movaps 0x40(%rsp), %xmm10 + movaps 0x50(%rsp), %xmm11 + movaps 0x60(%rsp), %xmm12 + movaps 0x70(%rsp), %xmm13 + movaps 0x80(%rsp), %xmm14 + movaps 0x90(%rsp), %xmm15 + lea 0xa8(%rsp), %rsp +___ +$code.=<<___; + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_gather_w7: +.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 +___ +} +if ($avx>1) { +my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); +my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4)); +my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9)); +my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14)); + +$code.=<<___; +################################################################################ +# void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index); +.type ecp_nistz256_avx2_gather_w5,\@abi-omnipotent +.align 32 +ecp_nistz256_avx2_gather_w5: +.cfi_startproc +.Lavx2_gather_w5: + vzeroupper +___ +$code.=<<___ if ($win64); + lea -0x88(%rsp), %rax + mov %rsp,%r11 +.LSEH_begin_ecp_nistz256_avx2_gather_w5: + .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp + .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) + .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) + .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) + .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) + .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) + .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) + .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) + .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) + .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) + .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) +___ +$code.=<<___; + vmovdqa .LTwo(%rip), $TWO + + vpxor $Ra, $Ra, $Ra + vpxor $Rb, $Rb, $Rb + vpxor $Rc, $Rc, $Rc + + vmovdqa .LOne(%rip), $M0 + vmovdqa .LTwo(%rip), $M1 + + vmovd $index, %xmm1 + vpermd $INDEX, $Ra, $INDEX + + mov \$8, %rax +.Lselect_loop_avx2_w5: + + vmovdqa 32*0($in_t), $T0a + vmovdqa 32*1($in_t), $T0b + vmovdqa 32*2($in_t), $T0c + + vmovdqa 32*3($in_t), $T1a + vmovdqa 32*4($in_t), $T1b + vmovdqa 32*5($in_t), $T1c + + vpcmpeqd $INDEX, $M0, $TMP0 + vpcmpeqd $INDEX, $M1, $TMP1 + + vpaddd $TWO, $M0, $M0 + vpaddd $TWO, $M1, $M1 + lea 32*6($in_t), $in_t + + vpand $TMP0, $T0a, $T0a + vpand $TMP0, $T0b, $T0b + vpand $TMP0, $T0c, $T0c + vpand $TMP1, $T1a, $T1a + vpand $TMP1, $T1b, $T1b + vpand $TMP1, $T1c, $T1c + + vpxor $T0a, $Ra, $Ra + vpxor $T0b, $Rb, $Rb + vpxor $T0c, $Rc, $Rc + vpxor $T1a, $Ra, $Ra + vpxor $T1b, $Rb, $Rb + vpxor $T1c, $Rc, $Rc + + dec %rax + jnz .Lselect_loop_avx2_w5 + + vmovdqu $Ra, 32*0($val) + vmovdqu $Rb, 32*1($val) + vmovdqu $Rc, 32*2($val) + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp), %xmm6 + movaps 0x10(%rsp), %xmm7 + movaps 0x20(%rsp), %xmm8 + movaps 0x30(%rsp), %xmm9 + movaps 0x40(%rsp), %xmm10 + movaps 0x50(%rsp), %xmm11 + movaps 0x60(%rsp), %xmm12 + movaps 0x70(%rsp), %xmm13 + movaps 0x80(%rsp), %xmm14 + movaps 0x90(%rsp), %xmm15 + lea (%r11), %rsp +___ +$code.=<<___; + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_avx2_gather_w5: +.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5 +___ +} +if ($avx>1) { +my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); +my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3)); +my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7)); +my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11)); +my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15)); + +$code.=<<___; + +################################################################################ +# void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_avx2_gather_w7 +.type ecp_nistz256_avx2_gather_w7,\@abi-omnipotent +.align 32 +ecp_nistz256_avx2_gather_w7: +.cfi_startproc +.Lavx2_gather_w7: + vzeroupper +___ +$code.=<<___ if ($win64); + mov %rsp,%r11 + lea -0x88(%rsp), %rax +.LSEH_begin_ecp_nistz256_avx2_gather_w7: + .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp + .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) + .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) + .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) + .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) + .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) + .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) + .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) + .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) + .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) + .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) +___ +$code.=<<___; + vmovdqa .LThree(%rip), $THREE + + vpxor $Ra, $Ra, $Ra + vpxor $Rb, $Rb, $Rb + + vmovdqa .LOne(%rip), $M0 + vmovdqa .LTwo(%rip), $M1 + vmovdqa .LThree(%rip), $M2 + + vmovd $index, %xmm1 + vpermd $INDEX, $Ra, $INDEX + # Skip index = 0, because it is implicitly the point at infinity + + mov \$21, %rax +.Lselect_loop_avx2_w7: + + vmovdqa 32*0($in_t), $T0a + vmovdqa 32*1($in_t), $T0b + + vmovdqa 32*2($in_t), $T1a + vmovdqa 32*3($in_t), $T1b + + vmovdqa 32*4($in_t), $T2a + vmovdqa 32*5($in_t), $T2b + + vpcmpeqd $INDEX, $M0, $TMP0 + vpcmpeqd $INDEX, $M1, $TMP1 + vpcmpeqd $INDEX, $M2, $TMP2 + + vpaddd $THREE, $M0, $M0 + vpaddd $THREE, $M1, $M1 + vpaddd $THREE, $M2, $M2 + lea 32*6($in_t), $in_t + + vpand $TMP0, $T0a, $T0a + vpand $TMP0, $T0b, $T0b + vpand $TMP1, $T1a, $T1a + vpand $TMP1, $T1b, $T1b + vpand $TMP2, $T2a, $T2a + vpand $TMP2, $T2b, $T2b + + vpxor $T0a, $Ra, $Ra + vpxor $T0b, $Rb, $Rb + vpxor $T1a, $Ra, $Ra + vpxor $T1b, $Rb, $Rb + vpxor $T2a, $Ra, $Ra + vpxor $T2b, $Rb, $Rb + + dec %rax + jnz .Lselect_loop_avx2_w7 + + + vmovdqa 32*0($in_t), $T0a + vmovdqa 32*1($in_t), $T0b + + vpcmpeqd $INDEX, $M0, $TMP0 + + vpand $TMP0, $T0a, $T0a + vpand $TMP0, $T0b, $T0b + + vpxor $T0a, $Ra, $Ra + vpxor $T0b, $Rb, $Rb + + vmovdqu $Ra, 32*0($val) + vmovdqu $Rb, 32*1($val) + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp), %xmm6 + movaps 0x10(%rsp), %xmm7 + movaps 0x20(%rsp), %xmm8 + movaps 0x30(%rsp), %xmm9 + movaps 0x40(%rsp), %xmm10 + movaps 0x50(%rsp), %xmm11 + movaps 0x60(%rsp), %xmm12 + movaps 0x70(%rsp), %xmm13 + movaps 0x80(%rsp), %xmm14 + movaps 0x90(%rsp), %xmm15 + lea (%r11), %rsp +___ +$code.=<<___; + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_avx2_gather_w7: +.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 +___ +} else { +$code.=<<___; +.globl ecp_nistz256_avx2_gather_w7 +.type ecp_nistz256_avx2_gather_w7,\@function,3 +.align 32 +ecp_nistz256_avx2_gather_w7: +.cfi_startproc + .byte 0x0f,0x0b # ud2 + ret +.cfi_endproc +.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 +___ +} +{{{ +######################################################################## +# This block implements higher level point_double, point_add and +# point_add_affine. The key to performance in this case is to allow +# out-of-order execution logic to overlap computations from next step +# with tail processing from current step. By using tailored calling +# sequence we minimize inter-step overhead to give processor better +# shot at overlapping operations... +# +# You will notice that input data is copied to stack. Trouble is that +# there are no registers to spare for holding original pointers and +# reloading them, pointers, would create undesired dependencies on +# effective addresses calculation paths. In other words it's too done +# to favour out-of-order execution logic. +# <appro@openssl.org> + +my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); +my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); +my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4); +my ($poly1,$poly3)=($acc6,$acc7); + +sub load_for_mul () { +my ($a,$b,$src0) = @_; +my $bias = $src0 eq "%rax" ? 0 : -128; + +" mov $b, $src0 + lea $b, $b_ptr + mov 8*0+$a, $acc1 + mov 8*1+$a, $acc2 + lea $bias+$a, $a_ptr + mov 8*2+$a, $acc3 + mov 8*3+$a, $acc4" +} + +sub load_for_sqr () { +my ($a,$src0) = @_; +my $bias = $src0 eq "%rax" ? 0 : -128; + +" mov 8*0+$a, $src0 + mov 8*1+$a, $acc6 + lea $bias+$a, $a_ptr + mov 8*2+$a, $acc7 + mov 8*3+$a, $acc0" +} + + { +######################################################################## +# operate in 4-5-0-1 "name space" that matches multiplication output +# +my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); + +$code.=<<___; +.type __ecp_nistz256_add_toq,\@abi-omnipotent +.align 32 +__ecp_nistz256_add_toq: +.cfi_startproc + xor $t4,$t4 + add 8*0($b_ptr), $a0 + adc 8*1($b_ptr), $a1 + mov $a0, $t0 + adc 8*2($b_ptr), $a2 + adc 8*3($b_ptr), $a3 + mov $a1, $t1 + adc \$0, $t4 + + sub \$-1, $a0 + mov $a2, $t2 + sbb $poly1, $a1 + sbb \$0, $a2 + mov $a3, $t3 + sbb $poly3, $a3 + sbb \$0, $t4 + + cmovc $t0, $a0 + cmovc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq + +.type __ecp_nistz256_sub_fromq,\@abi-omnipotent +.align 32 +__ecp_nistz256_sub_fromq: +.cfi_startproc + sub 8*0($b_ptr), $a0 + sbb 8*1($b_ptr), $a1 + mov $a0, $t0 + sbb 8*2($b_ptr), $a2 + sbb 8*3($b_ptr), $a3 + mov $a1, $t1 + sbb $t4, $t4 + + add \$-1, $a0 + mov $a2, $t2 + adc $poly1, $a1 + adc \$0, $a2 + mov $a3, $t3 + adc $poly3, $a3 + test $t4, $t4 + + cmovz $t0, $a0 + cmovz $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovz $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovz $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq + +.type __ecp_nistz256_subq,\@abi-omnipotent +.align 32 +__ecp_nistz256_subq: +.cfi_startproc + sub $a0, $t0 + sbb $a1, $t1 + mov $t0, $a0 + sbb $a2, $t2 + sbb $a3, $t3 + mov $t1, $a1 + sbb $t4, $t4 + + add \$-1, $t0 + mov $t2, $a2 + adc $poly1, $t1 + adc \$0, $t2 + mov $t3, $a3 + adc $poly3, $t3 + test $t4, $t4 + + cmovnz $t0, $a0 + cmovnz $t1, $a1 + cmovnz $t2, $a2 + cmovnz $t3, $a3 + + ret +.cfi_endproc +.size __ecp_nistz256_subq,.-__ecp_nistz256_subq + +.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent +.align 32 +__ecp_nistz256_mul_by_2q: +.cfi_startproc + xor $t4, $t4 + add $a0, $a0 # a0:a3+a0:a3 + adc $a1, $a1 + mov $a0, $t0 + adc $a2, $a2 + adc $a3, $a3 + mov $a1, $t1 + adc \$0, $t4 + + sub \$-1, $a0 + mov $a2, $t2 + sbb $poly1, $a1 + sbb \$0, $a2 + mov $a3, $t3 + sbb $poly3, $a3 + sbb \$0, $t4 + + cmovc $t0, $a0 + cmovc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q +___ + } +sub gen_double () { + my $x = shift; + my ($src0,$sfx,$bias); + my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); + + if ($x ne "x") { + $src0 = "%rax"; + $sfx = ""; + $bias = 0; + +$code.=<<___; +.globl ecp_nistz256_point_double +.type ecp_nistz256_point_double,\@function,2 +.align 32 +ecp_nistz256_point_double: +.cfi_startproc +___ +$code.=<<___ if ($addx); + mov \$0x80100, %ecx + and OPENSSL_ia32cap_P+8(%rip), %ecx + cmp \$0x80100, %ecx + je .Lpoint_doublex +___ + } else { + $src0 = "%rdx"; + $sfx = "x"; + $bias = 128; + +$code.=<<___; +.type ecp_nistz256_point_doublex,\@function,2 +.align 32 +ecp_nistz256_point_doublex: +.cfi_startproc +.Lpoint_doublex: +___ + } +$code.=<<___; + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$32*5+8, %rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_double${x}_body: + +.Lpoint_double_shortcut$x: + movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x + mov $a_ptr, $b_ptr # backup copy + movdqu 0x10($a_ptr), %xmm1 + mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order + mov 0x20+8*1($a_ptr), $acc5 + mov 0x20+8*2($a_ptr), $acc0 + mov 0x20+8*3($a_ptr), $acc1 + mov .Lpoly+8*1(%rip), $poly1 + mov .Lpoly+8*3(%rip), $poly3 + movdqa %xmm0, $in_x(%rsp) + movdqa %xmm1, $in_x+0x10(%rsp) + lea 0x20($r_ptr), $acc2 + lea 0x40($r_ptr), $acc3 + movq $r_ptr, %xmm0 + movq $acc2, %xmm1 + movq $acc3, %xmm2 + + lea $S(%rsp), $r_ptr + call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); + + mov 0x40+8*0($a_ptr), $src0 + mov 0x40+8*1($a_ptr), $acc6 + mov 0x40+8*2($a_ptr), $acc7 + mov 0x40+8*3($a_ptr), $acc0 + lea 0x40-$bias($a_ptr), $a_ptr + lea $Zsqr(%rsp), $r_ptr + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); + + `&load_for_sqr("$S(%rsp)", "$src0")` + lea $S(%rsp), $r_ptr + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); + + mov 0x20($b_ptr), $src0 # $b_ptr is still valid + mov 0x40+8*0($b_ptr), $acc1 + mov 0x40+8*1($b_ptr), $acc2 + mov 0x40+8*2($b_ptr), $acc3 + mov 0x40+8*3($b_ptr), $acc4 + lea 0x40-$bias($b_ptr), $a_ptr + lea 0x20($b_ptr), $b_ptr + movq %xmm2, $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); + call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); + + mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order + mov $in_x+8*1(%rsp), $acc5 + lea $Zsqr(%rsp), $b_ptr + mov $in_x+8*2(%rsp), $acc0 + mov $in_x+8*3(%rsp), $acc1 + lea $M(%rsp), $r_ptr + call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); + + mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order + mov $in_x+8*1(%rsp), $acc5 + lea $Zsqr(%rsp), $b_ptr + mov $in_x+8*2(%rsp), $acc0 + mov $in_x+8*3(%rsp), $acc1 + lea $Zsqr(%rsp), $r_ptr + call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); + + `&load_for_sqr("$S(%rsp)", "$src0")` + movq %xmm1, $r_ptr + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); +___ +{ +######## ecp_nistz256_div_by_2(res_y, res_y); ########################## +# operate in 4-5-6-7 "name space" that matches squaring output +# +my ($poly1,$poly3)=($a_ptr,$t1); +my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); + +$code.=<<___; + xor $t4, $t4 + mov $a0, $t0 + add \$-1, $a0 + mov $a1, $t1 + adc $poly1, $a1 + mov $a2, $t2 + adc \$0, $a2 + mov $a3, $t3 + adc $poly3, $a3 + adc \$0, $t4 + xor $a_ptr, $a_ptr # borrow $a_ptr + test \$1, $t0 + + cmovz $t0, $a0 + cmovz $t1, $a1 + cmovz $t2, $a2 + cmovz $t3, $a3 + cmovz $a_ptr, $t4 + + mov $a1, $t0 # a0:a3>>1 + shr \$1, $a0 + shl \$63, $t0 + mov $a2, $t1 + shr \$1, $a1 + or $t0, $a0 + shl \$63, $t1 + mov $a3, $t2 + shr \$1, $a2 + or $t1, $a1 + shl \$63, $t2 + mov $a0, 8*0($r_ptr) + shr \$1, $a3 + mov $a1, 8*1($r_ptr) + shl \$63, $t4 + or $t2, $a2 + or $t4, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) +___ +} +$code.=<<___; + `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")` + lea $M(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); + + lea $tmp0(%rsp), $r_ptr + call __ecp_nistz256_mul_by_2$x + + lea $M(%rsp), $b_ptr + lea $M(%rsp), $r_ptr + call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M); + + `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` + lea $S(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); + + lea $tmp0(%rsp), $r_ptr + call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); + + `&load_for_sqr("$M(%rsp)", "$src0")` + movq %xmm0, $r_ptr + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); + + lea $tmp0(%rsp), $b_ptr + mov $acc6, $acc0 # harmonize sqr output and sub input + mov $acc7, $acc1 + mov $a_ptr, $poly1 + mov $t1, $poly3 + call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); + + mov $S+8*0(%rsp), $t0 + mov $S+8*1(%rsp), $t1 + mov $S+8*2(%rsp), $t2 + mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order + lea $S(%rsp), $r_ptr + call __ecp_nistz256_sub$x # p256_sub(S, S, res_x); + + mov $M(%rsp), $src0 + lea $M(%rsp), $b_ptr + mov $acc4, $acc6 # harmonize sub output and mul input + xor %ecx, %ecx + mov $acc4, $S+8*0(%rsp) # have to save:-( + mov $acc5, $acc2 + mov $acc5, $S+8*1(%rsp) + cmovz $acc0, $acc3 + mov $acc0, $S+8*2(%rsp) + lea $S-$bias(%rsp), $a_ptr + cmovz $acc1, $acc4 + mov $acc1, $S+8*3(%rsp) + mov $acc6, $acc1 + lea $S(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); + + movq %xmm1, $b_ptr + movq %xmm1, $r_ptr + call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); + + lea 32*5+56(%rsp), %rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbx +.cfi_restore %rbx + mov -8(%rsi),%rbp +.cfi_restore %rbp + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_double${x}_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx +___ +} +&gen_double("q"); + +sub gen_add () { + my $x = shift; + my ($src0,$sfx,$bias); + my ($H,$Hsqr,$R,$Rsqr,$Hcub, + $U1,$U2,$S1,$S2, + $res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); + my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); + + if ($x ne "x") { + $src0 = "%rax"; + $sfx = ""; + $bias = 0; + +$code.=<<___; +.globl ecp_nistz256_point_add +.type ecp_nistz256_point_add,\@function,3 +.align 32 +ecp_nistz256_point_add: +.cfi_startproc +___ +$code.=<<___ if ($addx); + mov \$0x80100, %ecx + and OPENSSL_ia32cap_P+8(%rip), %ecx + cmp \$0x80100, %ecx + je .Lpoint_addx +___ + } else { + $src0 = "%rdx"; + $sfx = "x"; + $bias = 128; + +$code.=<<___; +.type ecp_nistz256_point_addx,\@function,3 +.align 32 +ecp_nistz256_point_addx: +.cfi_startproc +.Lpoint_addx: +___ + } +$code.=<<___; + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$32*18+8, %rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_add${x}_body: + + movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr + movdqu 0x10($a_ptr), %xmm1 + movdqu 0x20($a_ptr), %xmm2 + movdqu 0x30($a_ptr), %xmm3 + movdqu 0x40($a_ptr), %xmm4 + movdqu 0x50($a_ptr), %xmm5 + mov $a_ptr, $b_ptr # reassign + mov $b_org, $a_ptr # reassign + movdqa %xmm0, $in1_x(%rsp) + movdqa %xmm1, $in1_x+0x10(%rsp) + movdqa %xmm2, $in1_y(%rsp) + movdqa %xmm3, $in1_y+0x10(%rsp) + movdqa %xmm4, $in1_z(%rsp) + movdqa %xmm5, $in1_z+0x10(%rsp) + por %xmm4, %xmm5 + + movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr + pshufd \$0xb1, %xmm5, %xmm3 + movdqu 0x10($a_ptr), %xmm1 + movdqu 0x20($a_ptr), %xmm2 + por %xmm3, %xmm5 + movdqu 0x30($a_ptr), %xmm3 + mov 0x40+8*0($a_ptr), $src0 # load original in2_z + mov 0x40+8*1($a_ptr), $acc6 + mov 0x40+8*2($a_ptr), $acc7 + mov 0x40+8*3($a_ptr), $acc0 + movdqa %xmm0, $in2_x(%rsp) + pshufd \$0x1e, %xmm5, %xmm4 + movdqa %xmm1, $in2_x+0x10(%rsp) + movdqu 0x40($a_ptr),%xmm0 # in2_z again + movdqu 0x50($a_ptr),%xmm1 + movdqa %xmm2, $in2_y(%rsp) + movdqa %xmm3, $in2_y+0x10(%rsp) + por %xmm4, %xmm5 + pxor %xmm4, %xmm4 + por %xmm0, %xmm1 + movq $r_ptr, %xmm0 # save $r_ptr + + lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid + mov $src0, $in2_z+8*0(%rsp) # make in2_z copy + mov $acc6, $in2_z+8*1(%rsp) + mov $acc7, $in2_z+8*2(%rsp) + mov $acc0, $in2_z+8*3(%rsp) + lea $Z2sqr(%rsp), $r_ptr # Z2^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); + + pcmpeqd %xmm4, %xmm5 + pshufd \$0xb1, %xmm1, %xmm4 + por %xmm1, %xmm4 + pshufd \$0, %xmm5, %xmm5 # in1infty + pshufd \$0x1e, %xmm4, %xmm3 + por %xmm3, %xmm4 + pxor %xmm3, %xmm3 + pcmpeqd %xmm3, %xmm4 + pshufd \$0, %xmm4, %xmm4 # in2infty + mov 0x40+8*0($b_ptr), $src0 # load original in1_z + mov 0x40+8*1($b_ptr), $acc6 + mov 0x40+8*2($b_ptr), $acc7 + mov 0x40+8*3($b_ptr), $acc0 + movq $b_ptr, %xmm1 + + lea 0x40-$bias($b_ptr), $a_ptr + lea $Z1sqr(%rsp), $r_ptr # Z1^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); + + `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` + lea $S1(%rsp), $r_ptr # S1 = Z2^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); + + `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr # S2 = Z1^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); + + `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` + lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); + + `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); + + lea $S1(%rsp), $b_ptr + lea $R(%rsp), $r_ptr # R = S2 - S1 + call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); + + or $acc5, $acc4 # see if result is zero + movdqa %xmm4, %xmm2 + or $acc0, $acc4 + or $acc1, $acc4 + por %xmm5, %xmm2 # in1infty || in2infty + movq $acc4, %xmm3 + + `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` + lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); + + `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` + lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); + + lea $U1(%rsp), $b_ptr + lea $H(%rsp), $r_ptr # H = U2 - U1 + call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1); + + or $acc5, $acc4 # see if result is zero + or $acc0, $acc4 + or $acc1, $acc4 # !is_equal(U1, U2) + + movq %xmm2, $acc0 # in1infty | in2infty + movq %xmm3, $acc1 # !is_equal(S1, S2) + + or $acc0, $acc4 + or $acc1, $acc4 + + # if (!is_equal(U1, U2) | in1infty | in2infty | !is_equal(S1, S2)) + .byte 0x3e # predict taken + jnz .Ladd_proceed$x + +.Ladd_double$x: + movq %xmm1, $a_ptr # restore $a_ptr + movq %xmm0, $r_ptr # restore $r_ptr + add \$`32*(18-5)`, %rsp # difference in frame sizes +.cfi_adjust_cfa_offset `-32*(18-5)` + jmp .Lpoint_double_shortcut$x +.cfi_adjust_cfa_offset `32*(18-5)` + +.align 32 +.Ladd_proceed$x: + `&load_for_sqr("$R(%rsp)", "$src0")` + lea $Rsqr(%rsp), $r_ptr # R^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); + + `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` + lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); + + `&load_for_sqr("$H(%rsp)", "$src0")` + lea $Hsqr(%rsp), $r_ptr # H^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); + + `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` + lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); + + `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` + lea $Hcub(%rsp), $r_ptr # H^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); + + `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` + lea $U2(%rsp), $r_ptr # U1*H^2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); +___ +{ +####################################################################### +# operate in 4-5-0-1 "name space" that matches multiplication output +# +my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); +my ($poly1, $poly3)=($acc6,$acc7); + +$code.=<<___; + #lea $U2(%rsp), $a_ptr + #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 + #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); + + xor $t4, $t4 + add $acc0, $acc0 # a0:a3+a0:a3 + lea $Rsqr(%rsp), $a_ptr + adc $acc1, $acc1 + mov $acc0, $t0 + adc $acc2, $acc2 + adc $acc3, $acc3 + mov $acc1, $t1 + adc \$0, $t4 + + sub \$-1, $acc0 + mov $acc2, $t2 + sbb $poly1, $acc1 + sbb \$0, $acc2 + mov $acc3, $t3 + sbb $poly3, $acc3 + sbb \$0, $t4 + + cmovc $t0, $acc0 + mov 8*0($a_ptr), $t0 + cmovc $t1, $acc1 + mov 8*1($a_ptr), $t1 + cmovc $t2, $acc2 + mov 8*2($a_ptr), $t2 + cmovc $t3, $acc3 + mov 8*3($a_ptr), $t3 + + call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); + + lea $Hcub(%rsp), $b_ptr + lea $res_x(%rsp), $r_ptr + call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); + + mov $U2+8*0(%rsp), $t0 + mov $U2+8*1(%rsp), $t1 + mov $U2+8*2(%rsp), $t2 + mov $U2+8*3(%rsp), $t3 + lea $res_y(%rsp), $r_ptr + + call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); + + mov $acc0, 8*0($r_ptr) # save the result, as + mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't + mov $acc2, 8*2($r_ptr) + mov $acc3, 8*3($r_ptr) +___ +} +$code.=<<___; + `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub); + + `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` + lea $res_y(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); + + lea $S2(%rsp), $b_ptr + lea $res_y(%rsp), $r_ptr + call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); + + movq %xmm0, $r_ptr # restore $r_ptr + + movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_z(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_z+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand $in2_z(%rsp), %xmm2 + pand $in2_z+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_z(%rsp), %xmm2 + pand $in1_z+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x40($r_ptr) + movdqu %xmm3, 0x50($r_ptr) + + movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_x(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_x+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand $in2_x(%rsp), %xmm2 + pand $in2_x+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_x(%rsp), %xmm2 + pand $in1_x+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x00($r_ptr) + movdqu %xmm3, 0x10($r_ptr) + + movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_y(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_y+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand $in2_y(%rsp), %xmm2 + pand $in2_y+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_y(%rsp), %xmm2 + pand $in1_y+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x20($r_ptr) + movdqu %xmm3, 0x30($r_ptr) + +.Ladd_done$x: + lea 32*18+56(%rsp), %rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbx +.cfi_restore %rbx + mov -8(%rsi),%rbp +.cfi_restore %rbp + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_add${x}_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx +___ +} +&gen_add("q"); + +sub gen_add_affine () { + my $x = shift; + my ($src0,$sfx,$bias); + my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr, + $res_x,$res_y,$res_z, + $in1_x,$in1_y,$in1_z, + $in2_x,$in2_y)=map(32*$_,(0..14)); + my $Z1sqr = $S2; + + if ($x ne "x") { + $src0 = "%rax"; + $sfx = ""; + $bias = 0; + +$code.=<<___; +.globl ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,\@function,3 +.align 32 +ecp_nistz256_point_add_affine: +.cfi_startproc +___ +$code.=<<___ if ($addx); + mov \$0x80100, %ecx + and OPENSSL_ia32cap_P+8(%rip), %ecx + cmp \$0x80100, %ecx + je .Lpoint_add_affinex +___ + } else { + $src0 = "%rdx"; + $sfx = "x"; + $bias = 128; + +$code.=<<___; +.type ecp_nistz256_point_add_affinex,\@function,3 +.align 32 +ecp_nistz256_point_add_affinex: +.cfi_startproc +.Lpoint_add_affinex: +___ + } +$code.=<<___; + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$32*15+8, %rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affine${x}_body: + + movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr + mov $b_org, $b_ptr # reassign + movdqu 0x10($a_ptr), %xmm1 + movdqu 0x20($a_ptr), %xmm2 + movdqu 0x30($a_ptr), %xmm3 + movdqu 0x40($a_ptr), %xmm4 + movdqu 0x50($a_ptr), %xmm5 + mov 0x40+8*0($a_ptr), $src0 # load original in1_z + mov 0x40+8*1($a_ptr), $acc6 + mov 0x40+8*2($a_ptr), $acc7 + mov 0x40+8*3($a_ptr), $acc0 + movdqa %xmm0, $in1_x(%rsp) + movdqa %xmm1, $in1_x+0x10(%rsp) + movdqa %xmm2, $in1_y(%rsp) + movdqa %xmm3, $in1_y+0x10(%rsp) + movdqa %xmm4, $in1_z(%rsp) + movdqa %xmm5, $in1_z+0x10(%rsp) + por %xmm4, %xmm5 + + movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr + pshufd \$0xb1, %xmm5, %xmm3 + movdqu 0x10($b_ptr), %xmm1 + movdqu 0x20($b_ptr), %xmm2 + por %xmm3, %xmm5 + movdqu 0x30($b_ptr), %xmm3 + movdqa %xmm0, $in2_x(%rsp) + pshufd \$0x1e, %xmm5, %xmm4 + movdqa %xmm1, $in2_x+0x10(%rsp) + por %xmm0, %xmm1 + movq $r_ptr, %xmm0 # save $r_ptr + movdqa %xmm2, $in2_y(%rsp) + movdqa %xmm3, $in2_y+0x10(%rsp) + por %xmm2, %xmm3 + por %xmm4, %xmm5 + pxor %xmm4, %xmm4 + por %xmm1, %xmm3 + + lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid + lea $Z1sqr(%rsp), $r_ptr # Z1^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); + + pcmpeqd %xmm4, %xmm5 + pshufd \$0xb1, %xmm3, %xmm4 + mov 0x00($b_ptr), $src0 # $b_ptr is still valid + #lea 0x00($b_ptr), $b_ptr + mov $acc4, $acc1 # harmonize sqr output and mul input + por %xmm3, %xmm4 + pshufd \$0, %xmm5, %xmm5 # in1infty + pshufd \$0x1e, %xmm4, %xmm3 + mov $acc5, $acc2 + por %xmm3, %xmm4 + pxor %xmm3, %xmm3 + mov $acc6, $acc3 + pcmpeqd %xmm3, %xmm4 + pshufd \$0, %xmm4, %xmm4 # in2infty + + lea $Z1sqr-$bias(%rsp), $a_ptr + mov $acc7, $acc4 + lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x); + + lea $in1_x(%rsp), $b_ptr + lea $H(%rsp), $r_ptr # H = U2 - U1 + call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x); + + `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr # S2 = Z1^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); + + `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` + lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); + + `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); + + lea $in1_y(%rsp), $b_ptr + lea $R(%rsp), $r_ptr # R = S2 - S1 + call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y); + + `&load_for_sqr("$H(%rsp)", "$src0")` + lea $Hsqr(%rsp), $r_ptr # H^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); + + `&load_for_sqr("$R(%rsp)", "$src0")` + lea $Rsqr(%rsp), $r_ptr # R^2 + call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); + + `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")` + lea $Hcub(%rsp), $r_ptr # H^3 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); + + `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")` + lea $U2(%rsp), $r_ptr # U1*H^2 + call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr); +___ +{ +####################################################################### +# operate in 4-5-0-1 "name space" that matches multiplication output +# +my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); +my ($poly1, $poly3)=($acc6,$acc7); + +$code.=<<___; + #lea $U2(%rsp), $a_ptr + #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 + #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); + + xor $t4, $t4 + add $acc0, $acc0 # a0:a3+a0:a3 + lea $Rsqr(%rsp), $a_ptr + adc $acc1, $acc1 + mov $acc0, $t0 + adc $acc2, $acc2 + adc $acc3, $acc3 + mov $acc1, $t1 + adc \$0, $t4 + + sub \$-1, $acc0 + mov $acc2, $t2 + sbb $poly1, $acc1 + sbb \$0, $acc2 + mov $acc3, $t3 + sbb $poly3, $acc3 + sbb \$0, $t4 + + cmovc $t0, $acc0 + mov 8*0($a_ptr), $t0 + cmovc $t1, $acc1 + mov 8*1($a_ptr), $t1 + cmovc $t2, $acc2 + mov 8*2($a_ptr), $t2 + cmovc $t3, $acc3 + mov 8*3($a_ptr), $t3 + + call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); + + lea $Hcub(%rsp), $b_ptr + lea $res_x(%rsp), $r_ptr + call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); + + mov $U2+8*0(%rsp), $t0 + mov $U2+8*1(%rsp), $t1 + mov $U2+8*2(%rsp), $t2 + mov $U2+8*3(%rsp), $t3 + lea $H(%rsp), $r_ptr + + call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x); + + mov $acc0, 8*0($r_ptr) # save the result, as + mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't + mov $acc2, 8*2($r_ptr) + mov $acc3, 8*3($r_ptr) +___ +} +$code.=<<___; + `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")` + lea $S2(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y); + + `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")` + lea $H(%rsp), $r_ptr + call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R); + + lea $S2(%rsp), $b_ptr + lea $res_y(%rsp), $r_ptr + call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2); + + movq %xmm0, $r_ptr # restore $r_ptr + + movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_z(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_z+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand .LONE_mont(%rip), %xmm2 + pand .LONE_mont+0x10(%rip), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_z(%rsp), %xmm2 + pand $in1_z+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x40($r_ptr) + movdqu %xmm3, 0x50($r_ptr) + + movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_x(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_x+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand $in2_x(%rsp), %xmm2 + pand $in2_x+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_x(%rsp), %xmm2 + pand $in1_x+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x00($r_ptr) + movdqu %xmm3, 0x10($r_ptr) + + movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); + movdqa %xmm5, %xmm1 + pandn $res_y(%rsp), %xmm0 + movdqa %xmm5, %xmm2 + pandn $res_y+0x10(%rsp), %xmm1 + movdqa %xmm5, %xmm3 + pand $in2_y(%rsp), %xmm2 + pand $in2_y+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + + movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pandn %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pand $in1_y(%rsp), %xmm2 + pand $in1_y+0x10(%rsp), %xmm3 + por %xmm0, %xmm2 + por %xmm1, %xmm3 + movdqu %xmm2, 0x20($r_ptr) + movdqu %xmm3, 0x30($r_ptr) + + lea 32*15+56(%rsp), %rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbx +.cfi_restore %rbx + mov -8(%rsi),%rbp +.cfi_restore %rbp + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affine${x}_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx +___ +} +&gen_add_affine("q"); + +######################################################################## +# AD*X magic +# +if ($addx) { { +######################################################################## +# operate in 4-5-0-1 "name space" that matches multiplication output +# +my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); + +$code.=<<___; +.type __ecp_nistz256_add_tox,\@abi-omnipotent +.align 32 +__ecp_nistz256_add_tox: +.cfi_startproc + xor $t4, $t4 + adc 8*0($b_ptr), $a0 + adc 8*1($b_ptr), $a1 + mov $a0, $t0 + adc 8*2($b_ptr), $a2 + adc 8*3($b_ptr), $a3 + mov $a1, $t1 + adc \$0, $t4 + + xor $t3, $t3 + sbb \$-1, $a0 + mov $a2, $t2 + sbb $poly1, $a1 + sbb \$0, $a2 + mov $a3, $t3 + sbb $poly3, $a3 + sbb \$0, $t4 + + cmovc $t0, $a0 + cmovc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox + +.type __ecp_nistz256_sub_fromx,\@abi-omnipotent +.align 32 +__ecp_nistz256_sub_fromx: +.cfi_startproc + xor $t4, $t4 + sbb 8*0($b_ptr), $a0 + sbb 8*1($b_ptr), $a1 + mov $a0, $t0 + sbb 8*2($b_ptr), $a2 + sbb 8*3($b_ptr), $a3 + mov $a1, $t1 + sbb \$0, $t4 + + xor $t3, $t3 + adc \$-1, $a0 + mov $a2, $t2 + adc $poly1, $a1 + adc \$0, $a2 + mov $a3, $t3 + adc $poly3, $a3 + + bt \$0, $t4 + cmovnc $t0, $a0 + cmovnc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovnc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovnc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx + +.type __ecp_nistz256_subx,\@abi-omnipotent +.align 32 +__ecp_nistz256_subx: +.cfi_startproc + xor $t4, $t4 + sbb $a0, $t0 + sbb $a1, $t1 + mov $t0, $a0 + sbb $a2, $t2 + sbb $a3, $t3 + mov $t1, $a1 + sbb \$0, $t4 + + xor $a3 ,$a3 + adc \$-1, $t0 + mov $t2, $a2 + adc $poly1, $t1 + adc \$0, $t2 + mov $t3, $a3 + adc $poly3, $t3 + + bt \$0, $t4 + cmovc $t0, $a0 + cmovc $t1, $a1 + cmovc $t2, $a2 + cmovc $t3, $a3 + + ret +.cfi_endproc +.size __ecp_nistz256_subx,.-__ecp_nistz256_subx + +.type __ecp_nistz256_mul_by_2x,\@abi-omnipotent +.align 32 +__ecp_nistz256_mul_by_2x: +.cfi_startproc + xor $t4, $t4 + adc $a0, $a0 # a0:a3+a0:a3 + adc $a1, $a1 + mov $a0, $t0 + adc $a2, $a2 + adc $a3, $a3 + mov $a1, $t1 + adc \$0, $t4 + + xor $t3, $t3 + sbb \$-1, $a0 + mov $a2, $t2 + sbb $poly1, $a1 + sbb \$0, $a2 + mov $a3, $t3 + sbb $poly3, $a3 + sbb \$0, $t4 + + cmovc $t0, $a0 + cmovc $t1, $a1 + mov $a0, 8*0($r_ptr) + cmovc $t2, $a2 + mov $a1, 8*1($r_ptr) + cmovc $t3, $a3 + mov $a2, 8*2($r_ptr) + mov $a3, 8*3($r_ptr) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x +___ + } +&gen_double("x"); +&gen_add("x"); +&gen_add_affine("x"); +} +}}} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind + +.type short_handler,\@abi-omnipotent +.align 16 +short_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->Rip<end of prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea 16(%rax),%rax + + mov -8(%rax),%r12 + mov -16(%rax),%r13 + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + + jmp .Lcommon_seh_tail +.size short_handler,.-short_handler + +.type full_handler,\@abi-omnipotent +.align 16 +full_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->Rip<end of prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 8(%r11),%r10d # HandlerData[2] + lea (%rax,%r10),%rax + + mov -8(%rax),%rbp + mov -16(%rax),%rbx + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size full_handler,.-full_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_ecp_nistz256_mul_by_2 + .rva .LSEH_end_ecp_nistz256_mul_by_2 + .rva .LSEH_info_ecp_nistz256_mul_by_2 + + .rva .LSEH_begin_ecp_nistz256_div_by_2 + .rva .LSEH_end_ecp_nistz256_div_by_2 + .rva .LSEH_info_ecp_nistz256_div_by_2 + + .rva .LSEH_begin_ecp_nistz256_mul_by_3 + .rva .LSEH_end_ecp_nistz256_mul_by_3 + .rva .LSEH_info_ecp_nistz256_mul_by_3 + + .rva .LSEH_begin_ecp_nistz256_add + .rva .LSEH_end_ecp_nistz256_add + .rva .LSEH_info_ecp_nistz256_add + + .rva .LSEH_begin_ecp_nistz256_sub + .rva .LSEH_end_ecp_nistz256_sub + .rva .LSEH_info_ecp_nistz256_sub + + .rva .LSEH_begin_ecp_nistz256_neg + .rva .LSEH_end_ecp_nistz256_neg + .rva .LSEH_info_ecp_nistz256_neg + + .rva .LSEH_begin_ecp_nistz256_ord_mul_mont + .rva .LSEH_end_ecp_nistz256_ord_mul_mont + .rva .LSEH_info_ecp_nistz256_ord_mul_mont + + .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont + .rva .LSEH_end_ecp_nistz256_ord_sqr_mont + .rva .LSEH_info_ecp_nistz256_ord_sqr_mont +___ +$code.=<<___ if ($addx); + .rva .LSEH_begin_ecp_nistz256_ord_mul_montx + .rva .LSEH_end_ecp_nistz256_ord_mul_montx + .rva .LSEH_info_ecp_nistz256_ord_mul_montx + + .rva .LSEH_begin_ecp_nistz256_ord_sqr_montx + .rva .LSEH_end_ecp_nistz256_ord_sqr_montx + .rva .LSEH_info_ecp_nistz256_ord_sqr_montx +___ +$code.=<<___; + .rva .LSEH_begin_ecp_nistz256_to_mont + .rva .LSEH_end_ecp_nistz256_to_mont + .rva .LSEH_info_ecp_nistz256_to_mont + + .rva .LSEH_begin_ecp_nistz256_mul_mont + .rva .LSEH_end_ecp_nistz256_mul_mont + .rva .LSEH_info_ecp_nistz256_mul_mont + + .rva .LSEH_begin_ecp_nistz256_sqr_mont + .rva .LSEH_end_ecp_nistz256_sqr_mont + .rva .LSEH_info_ecp_nistz256_sqr_mont + + .rva .LSEH_begin_ecp_nistz256_from_mont + .rva .LSEH_end_ecp_nistz256_from_mont + .rva .LSEH_info_ecp_nistz256_from_mont + + .rva .LSEH_begin_ecp_nistz256_gather_w5 + .rva .LSEH_end_ecp_nistz256_gather_w5 + .rva .LSEH_info_ecp_nistz256_gather_wX + + .rva .LSEH_begin_ecp_nistz256_gather_w7 + .rva .LSEH_end_ecp_nistz256_gather_w7 + .rva .LSEH_info_ecp_nistz256_gather_wX +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_ecp_nistz256_avx2_gather_w5 + .rva .LSEH_end_ecp_nistz256_avx2_gather_w5 + .rva .LSEH_info_ecp_nistz256_avx2_gather_wX + + .rva .LSEH_begin_ecp_nistz256_avx2_gather_w7 + .rva .LSEH_end_ecp_nistz256_avx2_gather_w7 + .rva .LSEH_info_ecp_nistz256_avx2_gather_wX +___ +$code.=<<___; + .rva .LSEH_begin_ecp_nistz256_point_double + .rva .LSEH_end_ecp_nistz256_point_double + .rva .LSEH_info_ecp_nistz256_point_double + + .rva .LSEH_begin_ecp_nistz256_point_add + .rva .LSEH_end_ecp_nistz256_point_add + .rva .LSEH_info_ecp_nistz256_point_add + + .rva .LSEH_begin_ecp_nistz256_point_add_affine + .rva .LSEH_end_ecp_nistz256_point_add_affine + .rva .LSEH_info_ecp_nistz256_point_add_affine +___ +$code.=<<___ if ($addx); + .rva .LSEH_begin_ecp_nistz256_point_doublex + .rva .LSEH_end_ecp_nistz256_point_doublex + .rva .LSEH_info_ecp_nistz256_point_doublex + + .rva .LSEH_begin_ecp_nistz256_point_addx + .rva .LSEH_end_ecp_nistz256_point_addx + .rva .LSEH_info_ecp_nistz256_point_addx + + .rva .LSEH_begin_ecp_nistz256_point_add_affinex + .rva .LSEH_end_ecp_nistz256_point_add_affinex + .rva .LSEH_info_ecp_nistz256_point_add_affinex +___ +$code.=<<___; + +.section .xdata +.align 8 +.LSEH_info_ecp_nistz256_mul_by_2: + .byte 9,0,0,0 + .rva short_handler + .rva .Lmul_by_2_body,.Lmul_by_2_epilogue # HandlerData[] +.LSEH_info_ecp_nistz256_div_by_2: + .byte 9,0,0,0 + .rva short_handler + .rva .Ldiv_by_2_body,.Ldiv_by_2_epilogue # HandlerData[] +.LSEH_info_ecp_nistz256_mul_by_3: + .byte 9,0,0,0 + .rva short_handler + .rva .Lmul_by_3_body,.Lmul_by_3_epilogue # HandlerData[] +.LSEH_info_ecp_nistz256_add: + .byte 9,0,0,0 + .rva short_handler + .rva .Ladd_body,.Ladd_epilogue # HandlerData[] +.LSEH_info_ecp_nistz256_sub: + .byte 9,0,0,0 + .rva short_handler + .rva .Lsub_body,.Lsub_epilogue # HandlerData[] +.LSEH_info_ecp_nistz256_neg: + .byte 9,0,0,0 + .rva short_handler + .rva .Lneg_body,.Lneg_epilogue # HandlerData[] +.LSEH_info_ecp_nistz256_ord_mul_mont: + .byte 9,0,0,0 + .rva full_handler + .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_ord_sqr_mont: + .byte 9,0,0,0 + .rva full_handler + .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[] + .long 48,0 +___ +$code.=<<___ if ($addx); +.LSEH_info_ecp_nistz256_ord_mul_montx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_ord_sqr_montx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[] + .long 48,0 +___ +$code.=<<___; +.LSEH_info_ecp_nistz256_to_mont: + .byte 9,0,0,0 + .rva full_handler + .rva .Lmul_body,.Lmul_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_mul_mont: + .byte 9,0,0,0 + .rva full_handler + .rva .Lmul_body,.Lmul_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_sqr_mont: + .byte 9,0,0,0 + .rva full_handler + .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] + .long 48,0 +.LSEH_info_ecp_nistz256_from_mont: + .byte 9,0,0,0 + .rva short_handler + .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[] +.LSEH_info_ecp_nistz256_gather_wX: + .byte 0x01,0x33,0x16,0x00 + .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 + .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 + .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 + .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 + .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 + .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 + .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 + .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 + .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 + .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 + .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 + .align 8 +___ +$code.=<<___ if ($avx>1); +.LSEH_info_ecp_nistz256_avx2_gather_wX: + .byte 0x01,0x36,0x17,0x0b + .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 + .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 + .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 + .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 + .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 + .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 + .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 + .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 + .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 + .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 + .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 + .byte 0x00,0xb3,0x00,0x00 # set_frame r11 + .align 8 +___ +$code.=<<___; +.LSEH_info_ecp_nistz256_point_double: + .byte 9,0,0,0 + .rva full_handler + .rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[] + .long 32*5+56,0 +.LSEH_info_ecp_nistz256_point_add: + .byte 9,0,0,0 + .rva full_handler + .rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[] + .long 32*18+56,0 +.LSEH_info_ecp_nistz256_point_add_affine: + .byte 9,0,0,0 + .rva full_handler + .rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[] + .long 32*15+56,0 +___ +$code.=<<___ if ($addx); +.align 8 +.LSEH_info_ecp_nistz256_point_doublex: + .byte 9,0,0,0 + .rva full_handler + .rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[] + .long 32*5+56,0 +.LSEH_info_ecp_nistz256_point_addx: + .byte 9,0,0,0 + .rva full_handler + .rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[] + .long 32*18+56,0 +.LSEH_info_ecp_nistz256_point_add_affinex: + .byte 9,0,0,0 + .rva full_handler + .rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[] + .long 32*15+56,0 +___ +} + +######################################################################## +# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 +# +open TABLE,"<ecp_nistz256_table.c" or +open TABLE,"<${dir}../ecp_nistz256_table.c" or +die "failed to open ecp_nistz256_table.c:",$!; + +use integer; + +foreach(<TABLE>) { + s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; +} +close TABLE; + +die "insane number of elements" if ($#arr != 64*16*37-1); + +print <<___; +.text +.globl ecp_nistz256_precomputed +.type ecp_nistz256_precomputed,\@object +.align 4096 +ecp_nistz256_precomputed: +___ +while (@line=splice(@arr,0,16)) { + print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n"; +} +print <<___; +.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT or die "error closing STDOUT: $!"; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/x25519-ppc64.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/x25519-ppc64.pl new file mode 100755 index 000000000..f4b523bf8 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/x25519-ppc64.pl @@ -0,0 +1,824 @@ +#! /usr/bin/env perl +# Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# X25519 lower-level primitives for PPC64. +# +# July 2018. +# +# Base 2^64 is faster than base 2^51 on pre-POWER8, most notably ~15% +# faster on PPC970/G5. POWER8 on the other hand seems to trip on own +# shoelaces when handling longer carry chains. As base 2^51 has just +# single-carry pairs, it's 25% faster than base 2^64. Since PPC970 is +# pretty old, base 2^64 implementation is not engaged. Comparison to +# compiler-generated code is complicated by the fact that not all +# compilers support 128-bit integers. When compiler doesn't, like xlc, +# this module delivers more than 2x improvement, and when it does, +# from 12% to 30% improvement was measured... + +$flavour = shift; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +my $sp = "r1"; +my ($rp,$ap,$bp) = map("r$_",3..5); + +####################################################### base 2^64 +if (0) { +my ($bi,$a0,$a1,$a2,$a3,$t0,$t1, $t2,$t3, + $acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = + map("r$_",(6..12,22..31)); +my $zero = "r0"; +my $FRAME = 16*8; + +$code.=<<___; +.text + +.globl x25519_fe64_mul +.type x25519_fe64_mul,\@function +.align 5 +x25519_fe64_mul: + stdu $sp,-$FRAME($sp) + std r22,`$FRAME-8*10`($sp) + std r23,`$FRAME-8*9`($sp) + std r24,`$FRAME-8*8`($sp) + std r25,`$FRAME-8*7`($sp) + std r26,`$FRAME-8*6`($sp) + std r27,`$FRAME-8*5`($sp) + std r28,`$FRAME-8*4`($sp) + std r29,`$FRAME-8*3`($sp) + std r30,`$FRAME-8*2`($sp) + std r31,`$FRAME-8*1`($sp) + + ld $bi,0($bp) + ld $a0,0($ap) + xor $zero,$zero,$zero + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + + mulld $acc0,$a0,$bi # a[0]*b[0] + mulhdu $t0,$a0,$bi + mulld $acc1,$a1,$bi # a[1]*b[0] + mulhdu $t1,$a1,$bi + mulld $acc2,$a2,$bi # a[2]*b[0] + mulhdu $t2,$a2,$bi + mulld $acc3,$a3,$bi # a[3]*b[0] + mulhdu $t3,$a3,$bi +___ +for(my @acc=($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7), + my $i=1; $i<4; shift(@acc), $i++) { +my $acc4 = $i==1? $zero : @acc[4]; + +$code.=<<___; + ld $bi,`8*$i`($bp) + addc @acc[1],@acc[1],$t0 # accumulate high parts + mulld $t0,$a0,$bi + adde @acc[2],@acc[2],$t1 + mulld $t1,$a1,$bi + adde @acc[3],@acc[3],$t2 + mulld $t2,$a2,$bi + adde @acc[4],$acc4,$t3 + mulld $t3,$a3,$bi + addc @acc[1],@acc[1],$t0 # accumulate low parts + mulhdu $t0,$a0,$bi + adde @acc[2],@acc[2],$t1 + mulhdu $t1,$a1,$bi + adde @acc[3],@acc[3],$t2 + mulhdu $t2,$a2,$bi + adde @acc[4],@acc[4],$t3 + mulhdu $t3,$a3,$bi + adde @acc[5],$zero,$zero +___ +} +$code.=<<___; + li $bi,38 + addc $acc4,$acc4,$t0 + mulld $t0,$acc4,$bi + adde $acc5,$acc5,$t1 + mulld $t1,$acc5,$bi + adde $acc6,$acc6,$t2 + mulld $t2,$acc6,$bi + adde $acc7,$acc7,$t3 + mulld $t3,$acc7,$bi + + addc $acc0,$acc0,$t0 + mulhdu $t0,$acc4,$bi + adde $acc1,$acc1,$t1 + mulhdu $t1,$acc5,$bi + adde $acc2,$acc2,$t2 + mulhdu $t2,$acc6,$bi + adde $acc3,$acc3,$t3 + mulhdu $t3,$acc7,$bi + adde $acc4,$zero,$zero + + addc $acc1,$acc1,$t0 + adde $acc2,$acc2,$t1 + adde $acc3,$acc3,$t2 + adde $acc4,$acc4,$t3 + + mulld $acc4,$acc4,$bi + + addc $acc0,$acc0,$acc4 + addze $acc1,$acc1 + addze $acc2,$acc2 + addze $acc3,$acc3 + + subfe $acc4,$acc4,$acc4 # carry -> ~mask + std $acc1,8($rp) + andc $acc4,$bi,$acc4 + std $acc2,16($rp) + add $acc0,$acc0,$acc4 + std $acc3,24($rp) + std $acc0,0($rp) + + ld r22,`$FRAME-8*10`($sp) + ld r23,`$FRAME-8*9`($sp) + ld r24,`$FRAME-8*8`($sp) + ld r25,`$FRAME-8*7`($sp) + ld r26,`$FRAME-8*6`($sp) + ld r27,`$FRAME-8*5`($sp) + ld r28,`$FRAME-8*4`($sp) + ld r29,`$FRAME-8*3`($sp) + ld r30,`$FRAME-8*2`($sp) + ld r31,`$FRAME-8*1`($sp) + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,0,0x80,10,3,0 + .long 0 +.size x25519_fe64_mul,.-x25519_fe64_mul + +.globl x25519_fe64_sqr +.type x25519_fe64_sqr,\@function +.align 5 +x25519_fe64_sqr: + stdu $sp,-$FRAME($sp) + std r22,`$FRAME-8*10`($sp) + std r23,`$FRAME-8*9`($sp) + std r24,`$FRAME-8*8`($sp) + std r25,`$FRAME-8*7`($sp) + std r26,`$FRAME-8*6`($sp) + std r27,`$FRAME-8*5`($sp) + std r28,`$FRAME-8*4`($sp) + std r29,`$FRAME-8*3`($sp) + std r30,`$FRAME-8*2`($sp) + std r31,`$FRAME-8*1`($sp) + + ld $a0,0($ap) + xor $zero,$zero,$zero + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + + ################################ + # | | | | | |a1*a0| | + # | | | | |a2*a0| | | + # | |a3*a2|a3*a0| | | | + # | | | |a2*a1| | | | + # | | |a3*a1| | | | | + # *| | | | | | | | 2| + # +|a3*a3|a2*a2|a1*a1|a0*a0| + # |--+--+--+--+--+--+--+--| + # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx + # + # "can't overflow" below mark carrying into high part of + # multiplication result, which can't overflow, because it + # can never be all ones. + + mulld $acc1,$a1,$a0 # a[1]*a[0] + mulhdu $t1,$a1,$a0 + mulld $acc2,$a2,$a0 # a[2]*a[0] + mulhdu $t2,$a2,$a0 + mulld $acc3,$a3,$a0 # a[3]*a[0] + mulhdu $acc4,$a3,$a0 + + addc $acc2,$acc2,$t1 # accumulate high parts of multiplication + mulld $t0,$a2,$a1 # a[2]*a[1] + mulhdu $t1,$a2,$a1 + adde $acc3,$acc3,$t2 + mulld $t2,$a3,$a1 # a[3]*a[1] + mulhdu $t3,$a3,$a1 + addze $acc4,$acc4 # can't overflow + + mulld $acc5,$a3,$a2 # a[3]*a[2] + mulhdu $acc6,$a3,$a2 + + addc $t1,$t1,$t2 # accumulate high parts of multiplication + mulld $acc0,$a0,$a0 # a[0]*a[0] + addze $t2,$t3 # can't overflow + + addc $acc3,$acc3,$t0 # accumulate low parts of multiplication + mulhdu $a0,$a0,$a0 + adde $acc4,$acc4,$t1 + mulld $t1,$a1,$a1 # a[1]*a[1] + adde $acc5,$acc5,$t2 + mulhdu $a1,$a1,$a1 + addze $acc6,$acc6 # can't overflow + + addc $acc1,$acc1,$acc1 # acc[1-6]*=2 + mulld $t2,$a2,$a2 # a[2]*a[2] + adde $acc2,$acc2,$acc2 + mulhdu $a2,$a2,$a2 + adde $acc3,$acc3,$acc3 + mulld $t3,$a3,$a3 # a[3]*a[3] + adde $acc4,$acc4,$acc4 + mulhdu $a3,$a3,$a3 + adde $acc5,$acc5,$acc5 + adde $acc6,$acc6,$acc6 + addze $acc7,$zero + + addc $acc1,$acc1,$a0 # +a[i]*a[i] + li $bi,38 + adde $acc2,$acc2,$t1 + adde $acc3,$acc3,$a1 + adde $acc4,$acc4,$t2 + adde $acc5,$acc5,$a2 + adde $acc6,$acc6,$t3 + adde $acc7,$acc7,$a3 + + mulld $t0,$acc4,$bi + mulld $t1,$acc5,$bi + mulld $t2,$acc6,$bi + mulld $t3,$acc7,$bi + + addc $acc0,$acc0,$t0 + mulhdu $t0,$acc4,$bi + adde $acc1,$acc1,$t1 + mulhdu $t1,$acc5,$bi + adde $acc2,$acc2,$t2 + mulhdu $t2,$acc6,$bi + adde $acc3,$acc3,$t3 + mulhdu $t3,$acc7,$bi + addze $acc4,$zero + + addc $acc1,$acc1,$t0 + adde $acc2,$acc2,$t1 + adde $acc3,$acc3,$t2 + adde $acc4,$acc4,$t3 + + mulld $acc4,$acc4,$bi + + addc $acc0,$acc0,$acc4 + addze $acc1,$acc1 + addze $acc2,$acc2 + addze $acc3,$acc3 + + subfe $acc4,$acc4,$acc4 # carry -> ~mask + std $acc1,8($rp) + andc $acc4,$bi,$acc4 + std $acc2,16($rp) + add $acc0,$acc0,$acc4 + std $acc3,24($rp) + std $acc0,0($rp) + + ld r22,`$FRAME-8*10`($sp) + ld r23,`$FRAME-8*9`($sp) + ld r24,`$FRAME-8*8`($sp) + ld r25,`$FRAME-8*7`($sp) + ld r26,`$FRAME-8*6`($sp) + ld r27,`$FRAME-8*5`($sp) + ld r28,`$FRAME-8*4`($sp) + ld r29,`$FRAME-8*3`($sp) + ld r30,`$FRAME-8*2`($sp) + ld r31,`$FRAME-8*1`($sp) + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,0,0x80,10,2,0 + .long 0 +.size x25519_fe64_sqr,.-x25519_fe64_sqr + +.globl x25519_fe64_mul121666 +.type x25519_fe64_mul121666,\@function +.align 5 +x25519_fe64_mul121666: + lis $bi,`65536>>16` + ori $bi,$bi,`121666-65536` + + ld $t0,0($ap) + ld $t1,8($ap) + ld $bp,16($ap) + ld $ap,24($ap) + + mulld $a0,$t0,$bi + mulhdu $t0,$t0,$bi + mulld $a1,$t1,$bi + mulhdu $t1,$t1,$bi + mulld $a2,$bp,$bi + mulhdu $bp,$bp,$bi + mulld $a3,$ap,$bi + mulhdu $ap,$ap,$bi + + addc $a1,$a1,$t0 + adde $a2,$a2,$t1 + adde $a3,$a3,$bp + addze $ap, $ap + + mulli $ap,$ap,38 + + addc $a0,$a0,$ap + addze $a1,$a1 + addze $a2,$a2 + addze $a3,$a3 + + subfe $t1,$t1,$t1 # carry -> ~mask + std $a1,8($rp) + andc $t0,$t0,$t1 + std $a2,16($rp) + add $a0,$a0,$t0 + std $a3,24($rp) + std $a0,0($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size x25519_fe64_mul121666,.-x25519_fe64_mul121666 + +.globl x25519_fe64_add +.type x25519_fe64_add,\@function +.align 5 +x25519_fe64_add: + ld $a0,0($ap) + ld $t0,0($bp) + ld $a1,8($ap) + ld $t1,8($bp) + ld $a2,16($ap) + ld $bi,16($bp) + ld $a3,24($ap) + ld $bp,24($bp) + + addc $a0,$a0,$t0 + adde $a1,$a1,$t1 + adde $a2,$a2,$bi + adde $a3,$a3,$bp + + li $t0,38 + subfe $t1,$t1,$t1 # carry -> ~mask + andc $t1,$t0,$t1 + + addc $a0,$a0,$t1 + addze $a1,$a1 + addze $a2,$a2 + addze $a3,$a3 + + subfe $t1,$t1,$t1 # carry -> ~mask + std $a1,8($rp) + andc $t0,$t0,$t1 + std $a2,16($rp) + add $a0,$a0,$t0 + std $a3,24($rp) + std $a0,0($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size x25519_fe64_add,.-x25519_fe64_add + +.globl x25519_fe64_sub +.type x25519_fe64_sub,\@function +.align 5 +x25519_fe64_sub: + ld $a0,0($ap) + ld $t0,0($bp) + ld $a1,8($ap) + ld $t1,8($bp) + ld $a2,16($ap) + ld $bi,16($bp) + ld $a3,24($ap) + ld $bp,24($bp) + + subfc $a0,$t0,$a0 + subfe $a1,$t1,$a1 + subfe $a2,$bi,$a2 + subfe $a3,$bp,$a3 + + li $t0,38 + subfe $t1,$t1,$t1 # borrow -> mask + xor $zero,$zero,$zero + and $t1,$t0,$t1 + + subfc $a0,$t1,$a0 + subfe $a1,$zero,$a1 + subfe $a2,$zero,$a2 + subfe $a3,$zero,$a3 + + subfe $t1,$t1,$t1 # borrow -> mask + std $a1,8($rp) + and $t0,$t0,$t1 + std $a2,16($rp) + subf $a0,$t0,$a0 + std $a3,24($rp) + std $a0,0($rp) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size x25519_fe64_sub,.-x25519_fe64_sub + +.globl x25519_fe64_tobytes +.type x25519_fe64_tobytes,\@function +.align 5 +x25519_fe64_tobytes: + ld $a3,24($ap) + ld $a0,0($ap) + ld $a1,8($ap) + ld $a2,16($ap) + + sradi $t0,$a3,63 # most significant bit -> mask + li $t1,19 + and $t0,$t0,$t1 + sldi $a3,$a3,1 + add $t0,$t0,$t1 # compare to modulus in the same go + srdi $a3,$a3,1 # most significant bit cleared + + addc $a0,$a0,$t0 + addze $a1,$a1 + addze $a2,$a2 + addze $a3,$a3 + + xor $zero,$zero,$zero + sradi $t0,$a3,63 # most significant bit -> mask + sldi $a3,$a3,1 + andc $t0,$t1,$t0 + srdi $a3,$a3,1 # most significant bit cleared + + subi $rp,$rp,1 + subfc $a0,$t0,$a0 + subfe $a1,$zero,$a1 + subfe $a2,$zero,$a2 + subfe $a3,$zero,$a3 + +___ +for (my @a=($a0,$a1,$a2,$a3), my $i=0; $i<4; shift(@a), $i++) { +$code.=<<___; + srdi $t0,@a[0],8 + stbu @a[0],1($rp) + srdi @a[0],@a[0],16 + stbu $t0,1($rp) + srdi $t0,@a[0],8 + stbu @a[0],1($rp) + srdi @a[0],@a[0],16 + stbu $t0,1($rp) + srdi $t0,@a[0],8 + stbu @a[0],1($rp) + srdi @a[0],@a[0],16 + stbu $t0,1($rp) + srdi $t0,@a[0],8 + stbu @a[0],1($rp) + stbu $t0,1($rp) +___ +} +$code.=<<___; + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size x25519_fe64_tobytes,.-x25519_fe64_tobytes +___ +} +####################################################### base 2^51 +{ +my ($bi,$a0,$a1,$a2,$a3,$a4,$t0, $t1, + $h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,$h4lo,$h4hi) = + map("r$_",(6..12,21..31)); +my $mask = "r0"; +my $FRAME = 18*8; + +$code.=<<___; +.text + +.globl x25519_fe51_mul +.type x25519_fe51_mul,\@function +.align 5 +x25519_fe51_mul: + stdu $sp,-$FRAME($sp) + std r21,`$FRAME-8*11`($sp) + std r22,`$FRAME-8*10`($sp) + std r23,`$FRAME-8*9`($sp) + std r24,`$FRAME-8*8`($sp) + std r25,`$FRAME-8*7`($sp) + std r26,`$FRAME-8*6`($sp) + std r27,`$FRAME-8*5`($sp) + std r28,`$FRAME-8*4`($sp) + std r29,`$FRAME-8*3`($sp) + std r30,`$FRAME-8*2`($sp) + std r31,`$FRAME-8*1`($sp) + + ld $bi,0($bp) + ld $a0,0($ap) + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + ld $a4,32($ap) + + mulld $h0lo,$a0,$bi # a[0]*b[0] + mulhdu $h0hi,$a0,$bi + + mulld $h1lo,$a1,$bi # a[1]*b[0] + mulhdu $h1hi,$a1,$bi + + mulld $h4lo,$a4,$bi # a[4]*b[0] + mulhdu $h4hi,$a4,$bi + ld $ap,8($bp) + mulli $a4,$a4,19 + + mulld $h2lo,$a2,$bi # a[2]*b[0] + mulhdu $h2hi,$a2,$bi + + mulld $h3lo,$a3,$bi # a[3]*b[0] + mulhdu $h3hi,$a3,$bi +___ +for(my @a=($a0,$a1,$a2,$a3,$a4), + my $i=1; $i<4; $i++) { + ($ap,$bi) = ($bi,$ap); +$code.=<<___; + mulld $t0,@a[4],$bi + mulhdu $t1,@a[4],$bi + addc $h0lo,$h0lo,$t0 + adde $h0hi,$h0hi,$t1 + + mulld $t0,@a[0],$bi + mulhdu $t1,@a[0],$bi + addc $h1lo,$h1lo,$t0 + adde $h1hi,$h1hi,$t1 + + mulld $t0,@a[3],$bi + mulhdu $t1,@a[3],$bi + ld $ap,`8*($i+1)`($bp) + mulli @a[3],@a[3],19 + addc $h4lo,$h4lo,$t0 + adde $h4hi,$h4hi,$t1 + + mulld $t0,@a[1],$bi + mulhdu $t1,@a[1],$bi + addc $h2lo,$h2lo,$t0 + adde $h2hi,$h2hi,$t1 + + mulld $t0,@a[2],$bi + mulhdu $t1,@a[2],$bi + addc $h3lo,$h3lo,$t0 + adde $h3hi,$h3hi,$t1 +___ + unshift(@a,pop(@a)); +} + ($ap,$bi) = ($bi,$ap); +$code.=<<___; + mulld $t0,$a1,$bi + mulhdu $t1,$a1,$bi + addc $h0lo,$h0lo,$t0 + adde $h0hi,$h0hi,$t1 + + mulld $t0,$a2,$bi + mulhdu $t1,$a2,$bi + addc $h1lo,$h1lo,$t0 + adde $h1hi,$h1hi,$t1 + + mulld $t0,$a3,$bi + mulhdu $t1,$a3,$bi + addc $h2lo,$h2lo,$t0 + adde $h2hi,$h2hi,$t1 + + mulld $t0,$a4,$bi + mulhdu $t1,$a4,$bi + addc $h3lo,$h3lo,$t0 + adde $h3hi,$h3hi,$t1 + + mulld $t0,$a0,$bi + mulhdu $t1,$a0,$bi + addc $h4lo,$h4lo,$t0 + adde $h4hi,$h4hi,$t1 + +.Lfe51_reduce: + li $mask,-1 + srdi $mask,$mask,13 # 0x7ffffffffffff + + srdi $t0,$h2lo,51 + and $a2,$h2lo,$mask + insrdi $t0,$h2hi,51,0 # h2>>51 + srdi $t1,$h0lo,51 + and $a0,$h0lo,$mask + insrdi $t1,$h0hi,51,0 # h0>>51 + addc $h3lo,$h3lo,$t0 + addze $h3hi,$h3hi + addc $h1lo,$h1lo,$t1 + addze $h1hi,$h1hi + + srdi $t0,$h3lo,51 + and $a3,$h3lo,$mask + insrdi $t0,$h3hi,51,0 # h3>>51 + srdi $t1,$h1lo,51 + and $a1,$h1lo,$mask + insrdi $t1,$h1hi,51,0 # h1>>51 + addc $h4lo,$h4lo,$t0 + addze $h4hi,$h4hi + add $a2,$a2,$t1 + + srdi $t0,$h4lo,51 + and $a4,$h4lo,$mask + insrdi $t0,$h4hi,51,0 + mulli $t0,$t0,19 # (h4 >> 51) * 19 + + add $a0,$a0,$t0 + + srdi $t1,$a2,51 + and $a2,$a2,$mask + add $a3,$a3,$t1 + + srdi $t0,$a0,51 + and $a0,$a0,$mask + add $a1,$a1,$t0 + + std $a2,16($rp) + std $a3,24($rp) + std $a4,32($rp) + std $a0,0($rp) + std $a1,8($rp) + + ld r21,`$FRAME-8*11`($sp) + ld r22,`$FRAME-8*10`($sp) + ld r23,`$FRAME-8*9`($sp) + ld r24,`$FRAME-8*8`($sp) + ld r25,`$FRAME-8*7`($sp) + ld r26,`$FRAME-8*6`($sp) + ld r27,`$FRAME-8*5`($sp) + ld r28,`$FRAME-8*4`($sp) + ld r29,`$FRAME-8*3`($sp) + ld r30,`$FRAME-8*2`($sp) + ld r31,`$FRAME-8*1`($sp) + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,0,0x80,11,3,0 + .long 0 +.size x25519_fe51_mul,.-x25519_fe51_mul +___ +{ +my ($a0,$a1,$a2,$a3,$a4,$t0,$t1) = ($a0,$a1,$a2,$a3,$a4,$t0,$t1); +$code.=<<___; +.globl x25519_fe51_sqr +.type x25519_fe51_sqr,\@function +.align 5 +x25519_fe51_sqr: + stdu $sp,-$FRAME($sp) + std r21,`$FRAME-8*11`($sp) + std r22,`$FRAME-8*10`($sp) + std r23,`$FRAME-8*9`($sp) + std r24,`$FRAME-8*8`($sp) + std r25,`$FRAME-8*7`($sp) + std r26,`$FRAME-8*6`($sp) + std r27,`$FRAME-8*5`($sp) + std r28,`$FRAME-8*4`($sp) + std r29,`$FRAME-8*3`($sp) + std r30,`$FRAME-8*2`($sp) + std r31,`$FRAME-8*1`($sp) + + ld $a0,0($ap) + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + ld $a4,32($ap) + + add $bi,$a0,$a0 # a[0]*2 + mulli $t1,$a4,19 # a[4]*19 + + mulld $h0lo,$a0,$a0 + mulhdu $h0hi,$a0,$a0 + mulld $h1lo,$a1,$bi + mulhdu $h1hi,$a1,$bi + mulld $h2lo,$a2,$bi + mulhdu $h2hi,$a2,$bi + mulld $h3lo,$a3,$bi + mulhdu $h3hi,$a3,$bi + mulld $h4lo,$a4,$bi + mulhdu $h4hi,$a4,$bi + add $bi,$a1,$a1 # a[1]*2 +___ + ($a4,$t1) = ($t1,$a4); +$code.=<<___; + mulld $t0,$t1,$a4 + mulhdu $t1,$t1,$a4 + addc $h3lo,$h3lo,$t0 + adde $h3hi,$h3hi,$t1 + + mulli $bp,$a3,19 # a[3]*19 + + mulld $t0,$a1,$a1 + mulhdu $t1,$a1,$a1 + addc $h2lo,$h2lo,$t0 + adde $h2hi,$h2hi,$t1 + mulld $t0,$a2,$bi + mulhdu $t1,$a2,$bi + addc $h3lo,$h3lo,$t0 + adde $h3hi,$h3hi,$t1 + mulld $t0,$a3,$bi + mulhdu $t1,$a3,$bi + addc $h4lo,$h4lo,$t0 + adde $h4hi,$h4hi,$t1 + mulld $t0,$a4,$bi + mulhdu $t1,$a4,$bi + add $bi,$a3,$a3 # a[3]*2 + addc $h0lo,$h0lo,$t0 + adde $h0hi,$h0hi,$t1 +___ + ($a3,$t1) = ($bp,$a3); +$code.=<<___; + mulld $t0,$t1,$a3 + mulhdu $t1,$t1,$a3 + addc $h1lo,$h1lo,$t0 + adde $h1hi,$h1hi,$t1 + mulld $t0,$bi,$a4 + mulhdu $t1,$bi,$a4 + add $bi,$a2,$a2 # a[2]*2 + addc $h2lo,$h2lo,$t0 + adde $h2hi,$h2hi,$t1 + + mulld $t0,$a2,$a2 + mulhdu $t1,$a2,$a2 + addc $h4lo,$h4lo,$t0 + adde $h4hi,$h4hi,$t1 + mulld $t0,$a3,$bi + mulhdu $t1,$a3,$bi + addc $h0lo,$h0lo,$t0 + adde $h0hi,$h0hi,$t1 + mulld $t0,$a4,$bi + mulhdu $t1,$a4,$bi + addc $h1lo,$h1lo,$t0 + adde $h1hi,$h1hi,$t1 + + b .Lfe51_reduce + .long 0 + .byte 0,12,4,0,0x80,11,2,0 + .long 0 +.size x25519_fe51_sqr,.-x25519_fe51_sqr +___ +} +$code.=<<___; +.globl x25519_fe51_mul121666 +.type x25519_fe51_mul121666,\@function +.align 5 +x25519_fe51_mul121666: + stdu $sp,-$FRAME($sp) + std r21,`$FRAME-8*11`($sp) + std r22,`$FRAME-8*10`($sp) + std r23,`$FRAME-8*9`($sp) + std r24,`$FRAME-8*8`($sp) + std r25,`$FRAME-8*7`($sp) + std r26,`$FRAME-8*6`($sp) + std r27,`$FRAME-8*5`($sp) + std r28,`$FRAME-8*4`($sp) + std r29,`$FRAME-8*3`($sp) + std r30,`$FRAME-8*2`($sp) + std r31,`$FRAME-8*1`($sp) + + lis $bi,`65536>>16` + ori $bi,$bi,`121666-65536` + ld $a0,0($ap) + ld $a1,8($ap) + ld $a2,16($ap) + ld $a3,24($ap) + ld $a4,32($ap) + + mulld $h0lo,$a0,$bi # a[0]*121666 + mulhdu $h0hi,$a0,$bi + mulld $h1lo,$a1,$bi # a[1]*121666 + mulhdu $h1hi,$a1,$bi + mulld $h2lo,$a2,$bi # a[2]*121666 + mulhdu $h2hi,$a2,$bi + mulld $h3lo,$a3,$bi # a[3]*121666 + mulhdu $h3hi,$a3,$bi + mulld $h4lo,$a4,$bi # a[4]*121666 + mulhdu $h4hi,$a4,$bi + + b .Lfe51_reduce + .long 0 + .byte 0,12,4,0,0x80,11,2,0 + .long 0 +.size x25519_fe51_mul121666,.-x25519_fe51_mul121666 +___ +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT or die "error closing STDOUT: $!"; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/x25519-x86_64.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/x25519-x86_64.pl new file mode 100755 index 000000000..3d9d1dc1a --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/ec/asm/x25519-x86_64.pl @@ -0,0 +1,1131 @@ +#!/usr/bin/env perl +# Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# X25519 lower-level primitives for x86_64. +# +# February 2018. +# +# This module implements radix 2^51 multiplication and squaring, and +# radix 2^64 multiplication, squaring, addition, subtraction and final +# reduction. Latter radix is used on ADCX/ADOX-capable processors such +# as Broadwell. On related note one should mention that there are +# vector implementations that provide significantly better performance +# on some processors(*), but they are large and overly complex. Which +# in combination with them being effectively processor-specific makes +# the undertaking hard to justify. The goal for this implementation +# is rather versatility and simplicity [and ultimately formal +# verification]. +# +# (*) For example sandy2x should provide ~30% improvement on Sandy +# Bridge, but only nominal ~5% on Haswell [and big loss on +# Broadwell and successors]. +# +###################################################################### +# Improvement coefficients: +# +# amd64-51(*) gcc-5.x(**) +# +# P4 +22% +40% +# Sandy Bridge -3% +11% +# Haswell -1% +13% +# Broadwell(***) +30% +35% +# Skylake(***) +33% +47% +# Silvermont +20% +26% +# Goldmont +40% +50% +# Bulldozer +20% +9% +# Ryzen(***) +43% +40% +# VIA +170% +120% +# +# (*) amd64-51 is popular assembly implementation with 2^51 radix, +# only multiplication and squaring subroutines were linked +# for comparison, but not complete ladder step; gain on most +# processors is because this module refrains from shld, and +# minor regression on others is because this does result in +# higher instruction count; +# (**) compiler is free to inline functions, in assembly one would +# need to implement ladder step to do that, and it will improve +# performance by several percent; +# (***) ADCX/ADOX result for 2^64 radix, there is no corresponding +# C implementation, so that comparison is always against +# 2^51 radix; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $addx = ($1>=2.23); +} + +if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $addx = ($1>=2.10); +} + +if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $addx = ($1>=12); +} + +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { + my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 + $addx = ($ver>=3.03); +} + +$code.=<<___; +.text + +.globl x25519_fe51_mul +.type x25519_fe51_mul,\@function,3 +.align 32 +x25519_fe51_mul: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*5(%rsp),%rsp +.cfi_adjust_cfa_offset 40 +.Lfe51_mul_body: + + mov 8*0(%rsi),%rax # f[0] + mov 8*0(%rdx),%r11 # load g[0-4] + mov 8*1(%rdx),%r12 + mov 8*2(%rdx),%r13 + mov 8*3(%rdx),%rbp + mov 8*4(%rdx),%r14 + + mov %rdi,8*4(%rsp) # offload 1st argument + mov %rax,%rdi + mulq %r11 # f[0]*g[0] + mov %r11,8*0(%rsp) # offload g[0] + mov %rax,%rbx # %rbx:%rcx = h0 + mov %rdi,%rax + mov %rdx,%rcx + mulq %r12 # f[0]*g[1] + mov %r12,8*1(%rsp) # offload g[1] + mov %rax,%r8 # %r8:%r9 = h1 + mov %rdi,%rax + lea (%r14,%r14,8),%r15 + mov %rdx,%r9 + mulq %r13 # f[0]*g[2] + mov %r13,8*2(%rsp) # offload g[2] + mov %rax,%r10 # %r10:%r11 = h2 + mov %rdi,%rax + lea (%r14,%r15,2),%rdi # g[4]*19 + mov %rdx,%r11 + mulq %rbp # f[0]*g[3] + mov %rax,%r12 # %r12:%r13 = h3 + mov 8*0(%rsi),%rax # f[0] + mov %rdx,%r13 + mulq %r14 # f[0]*g[4] + mov %rax,%r14 # %r14:%r15 = h4 + mov 8*1(%rsi),%rax # f[1] + mov %rdx,%r15 + + mulq %rdi # f[1]*g[4]*19 + add %rax,%rbx + mov 8*2(%rsi),%rax # f[2] + adc %rdx,%rcx + mulq %rdi # f[2]*g[4]*19 + add %rax,%r8 + mov 8*3(%rsi),%rax # f[3] + adc %rdx,%r9 + mulq %rdi # f[3]*g[4]*19 + add %rax,%r10 + mov 8*4(%rsi),%rax # f[4] + adc %rdx,%r11 + mulq %rdi # f[4]*g[4]*19 + imulq \$19,%rbp,%rdi # g[3]*19 + add %rax,%r12 + mov 8*1(%rsi),%rax # f[1] + adc %rdx,%r13 + mulq %rbp # f[1]*g[3] + mov 8*2(%rsp),%rbp # g[2] + add %rax,%r14 + mov 8*2(%rsi),%rax # f[2] + adc %rdx,%r15 + + mulq %rdi # f[2]*g[3]*19 + add %rax,%rbx + mov 8*3(%rsi),%rax # f[3] + adc %rdx,%rcx + mulq %rdi # f[3]*g[3]*19 + add %rax,%r8 + mov 8*4(%rsi),%rax # f[4] + adc %rdx,%r9 + mulq %rdi # f[4]*g[3]*19 + imulq \$19,%rbp,%rdi # g[2]*19 + add %rax,%r10 + mov 8*1(%rsi),%rax # f[1] + adc %rdx,%r11 + mulq %rbp # f[1]*g[2] + add %rax,%r12 + mov 8*2(%rsi),%rax # f[2] + adc %rdx,%r13 + mulq %rbp # f[2]*g[2] + mov 8*1(%rsp),%rbp # g[1] + add %rax,%r14 + mov 8*3(%rsi),%rax # f[3] + adc %rdx,%r15 + + mulq %rdi # f[3]*g[2]*19 + add %rax,%rbx + mov 8*4(%rsi),%rax # f[3] + adc %rdx,%rcx + mulq %rdi # f[4]*g[2]*19 + add %rax,%r8 + mov 8*1(%rsi),%rax # f[1] + adc %rdx,%r9 + mulq %rbp # f[1]*g[1] + imulq \$19,%rbp,%rdi + add %rax,%r10 + mov 8*2(%rsi),%rax # f[2] + adc %rdx,%r11 + mulq %rbp # f[2]*g[1] + add %rax,%r12 + mov 8*3(%rsi),%rax # f[3] + adc %rdx,%r13 + mulq %rbp # f[3]*g[1] + mov 8*0(%rsp),%rbp # g[0] + add %rax,%r14 + mov 8*4(%rsi),%rax # f[4] + adc %rdx,%r15 + + mulq %rdi # f[4]*g[1]*19 + add %rax,%rbx + mov 8*1(%rsi),%rax # f[1] + adc %rdx,%rcx + mul %rbp # f[1]*g[0] + add %rax,%r8 + mov 8*2(%rsi),%rax # f[2] + adc %rdx,%r9 + mul %rbp # f[2]*g[0] + add %rax,%r10 + mov 8*3(%rsi),%rax # f[3] + adc %rdx,%r11 + mul %rbp # f[3]*g[0] + add %rax,%r12 + mov 8*4(%rsi),%rax # f[4] + adc %rdx,%r13 + mulq %rbp # f[4]*g[0] + add %rax,%r14 + adc %rdx,%r15 + + mov 8*4(%rsp),%rdi # restore 1st argument + jmp .Lreduce51 +.Lfe51_mul_epilogue: +.cfi_endproc +.size x25519_fe51_mul,.-x25519_fe51_mul + +.globl x25519_fe51_sqr +.type x25519_fe51_sqr,\@function,2 +.align 32 +x25519_fe51_sqr: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*5(%rsp),%rsp +.cfi_adjust_cfa_offset 40 +.Lfe51_sqr_body: + + mov 8*0(%rsi),%rax # g[0] + mov 8*2(%rsi),%r15 # g[2] + mov 8*4(%rsi),%rbp # g[4] + + mov %rdi,8*4(%rsp) # offload 1st argument + lea (%rax,%rax),%r14 + mulq %rax # g[0]*g[0] + mov %rax,%rbx + mov 8*1(%rsi),%rax # g[1] + mov %rdx,%rcx + mulq %r14 # 2*g[0]*g[1] + mov %rax,%r8 + mov %r15,%rax + mov %r15,8*0(%rsp) # offload g[2] + mov %rdx,%r9 + mulq %r14 # 2*g[0]*g[2] + mov %rax,%r10 + mov 8*3(%rsi),%rax + mov %rdx,%r11 + imulq \$19,%rbp,%rdi # g[4]*19 + mulq %r14 # 2*g[0]*g[3] + mov %rax,%r12 + mov %rbp,%rax + mov %rdx,%r13 + mulq %r14 # 2*g[0]*g[4] + mov %rax,%r14 + mov %rbp,%rax + mov %rdx,%r15 + + mulq %rdi # g[4]*g[4]*19 + add %rax,%r12 + mov 8*1(%rsi),%rax # g[1] + adc %rdx,%r13 + + mov 8*3(%rsi),%rsi # g[3] + lea (%rax,%rax),%rbp + mulq %rax # g[1]*g[1] + add %rax,%r10 + mov 8*0(%rsp),%rax # g[2] + adc %rdx,%r11 + mulq %rbp # 2*g[1]*g[2] + add %rax,%r12 + mov %rbp,%rax + adc %rdx,%r13 + mulq %rsi # 2*g[1]*g[3] + add %rax,%r14 + mov %rbp,%rax + adc %rdx,%r15 + imulq \$19,%rsi,%rbp # g[3]*19 + mulq %rdi # 2*g[1]*g[4]*19 + add %rax,%rbx + lea (%rsi,%rsi),%rax + adc %rdx,%rcx + + mulq %rdi # 2*g[3]*g[4]*19 + add %rax,%r10 + mov %rsi,%rax + adc %rdx,%r11 + mulq %rbp # g[3]*g[3]*19 + add %rax,%r8 + mov 8*0(%rsp),%rax # g[2] + adc %rdx,%r9 + + lea (%rax,%rax),%rsi + mulq %rax # g[2]*g[2] + add %rax,%r14 + mov %rbp,%rax + adc %rdx,%r15 + mulq %rsi # 2*g[2]*g[3]*19 + add %rax,%rbx + mov %rsi,%rax + adc %rdx,%rcx + mulq %rdi # 2*g[2]*g[4]*19 + add %rax,%r8 + adc %rdx,%r9 + + mov 8*4(%rsp),%rdi # restore 1st argument + jmp .Lreduce51 + +.align 32 +.Lreduce51: + mov \$0x7ffffffffffff,%rbp + + mov %r10,%rdx + shr \$51,%r10 + shl \$13,%r11 + and %rbp,%rdx # %rdx = g2 = h2 & mask + or %r10,%r11 # h2>>51 + add %r11,%r12 + adc \$0,%r13 # h3 += h2>>51 + + mov %rbx,%rax + shr \$51,%rbx + shl \$13,%rcx + and %rbp,%rax # %rax = g0 = h0 & mask + or %rbx,%rcx # h0>>51 + add %rcx,%r8 # h1 += h0>>51 + adc \$0,%r9 + + mov %r12,%rbx + shr \$51,%r12 + shl \$13,%r13 + and %rbp,%rbx # %rbx = g3 = h3 & mask + or %r12,%r13 # h3>>51 + add %r13,%r14 # h4 += h3>>51 + adc \$0,%r15 + + mov %r8,%rcx + shr \$51,%r8 + shl \$13,%r9 + and %rbp,%rcx # %rcx = g1 = h1 & mask + or %r8,%r9 + add %r9,%rdx # g2 += h1>>51 + + mov %r14,%r10 + shr \$51,%r14 + shl \$13,%r15 + and %rbp,%r10 # %r10 = g4 = h0 & mask + or %r14,%r15 # h0>>51 + + lea (%r15,%r15,8),%r14 + lea (%r15,%r14,2),%r15 + add %r15,%rax # g0 += (h0>>51)*19 + + mov %rdx,%r8 + and %rbp,%rdx # g2 &= mask + shr \$51,%r8 + add %r8,%rbx # g3 += g2>>51 + + mov %rax,%r9 + and %rbp,%rax # g0 &= mask + shr \$51,%r9 + add %r9,%rcx # g1 += g0>>51 + + mov %rax,8*0(%rdi) # save the result + mov %rcx,8*1(%rdi) + mov %rdx,8*2(%rdi) + mov %rbx,8*3(%rdi) + mov %r10,8*4(%rdi) + + mov 8*5(%rsp),%r15 +.cfi_restore %r15 + mov 8*6(%rsp),%r14 +.cfi_restore %r14 + mov 8*7(%rsp),%r13 +.cfi_restore %r13 + mov 8*8(%rsp),%r12 +.cfi_restore %r12 + mov 8*9(%rsp),%rbx +.cfi_restore %rbx + mov 8*10(%rsp),%rbp +.cfi_restore %rbp + lea 8*11(%rsp),%rsp +.cfi_adjust_cfa_offset 88 +.Lfe51_sqr_epilogue: + ret +.cfi_endproc +.size x25519_fe51_sqr,.-x25519_fe51_sqr + +.globl x25519_fe51_mul121666 +.type x25519_fe51_mul121666,\@function,2 +.align 32 +x25519_fe51_mul121666: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*5(%rsp),%rsp +.cfi_adjust_cfa_offset 40 +.Lfe51_mul121666_body: + mov \$121666,%eax + + mulq 8*0(%rsi) + mov %rax,%rbx # %rbx:%rcx = h0 + mov \$121666,%eax + mov %rdx,%rcx + mulq 8*1(%rsi) + mov %rax,%r8 # %r8:%r9 = h1 + mov \$121666,%eax + mov %rdx,%r9 + mulq 8*2(%rsi) + mov %rax,%r10 # %r10:%r11 = h2 + mov \$121666,%eax + mov %rdx,%r11 + mulq 8*3(%rsi) + mov %rax,%r12 # %r12:%r13 = h3 + mov \$121666,%eax # f[0] + mov %rdx,%r13 + mulq 8*4(%rsi) + mov %rax,%r14 # %r14:%r15 = h4 + mov %rdx,%r15 + + jmp .Lreduce51 +.Lfe51_mul121666_epilogue: +.cfi_endproc +.size x25519_fe51_mul121666,.-x25519_fe51_mul121666 +___ +######################################################################## +# Base 2^64 subroutines modulo 2*(2^255-19) +# +if ($addx) { +my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15)); + +$code.=<<___; +.extern OPENSSL_ia32cap_P +.globl x25519_fe64_eligible +.type x25519_fe64_eligible,\@abi-omnipotent +.align 32 +x25519_fe64_eligible: +.cfi_startproc + mov OPENSSL_ia32cap_P+8(%rip),%ecx + xor %eax,%eax + and \$0x80100,%ecx + cmp \$0x80100,%ecx + cmove %ecx,%eax + ret +.cfi_endproc +.size x25519_fe64_eligible,.-x25519_fe64_eligible + +.globl x25519_fe64_mul +.type x25519_fe64_mul,\@function,3 +.align 32 +x25519_fe64_mul: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push %rdi # offload dst +.cfi_push %rdi + lea -8*2(%rsp),%rsp +.cfi_adjust_cfa_offset 16 +.Lfe64_mul_body: + + mov %rdx,%rax + mov 8*0(%rdx),%rbp # b[0] + mov 8*0(%rsi),%rdx # a[0] + mov 8*1(%rax),%rcx # b[1] + mov 8*2(%rax),$acc6 # b[2] + mov 8*3(%rax),$acc7 # b[3] + + mulx %rbp,$acc0,%rax # a[0]*b[0] + xor %edi,%edi # cf=0,of=0 + mulx %rcx,$acc1,%rbx # a[0]*b[1] + adcx %rax,$acc1 + mulx $acc6,$acc2,%rax # a[0]*b[2] + adcx %rbx,$acc2 + mulx $acc7,$acc3,$acc4 # a[0]*b[3] + mov 8*1(%rsi),%rdx # a[1] + adcx %rax,$acc3 + mov $acc6,(%rsp) # offload b[2] + adcx %rdi,$acc4 # cf=0 + + mulx %rbp,%rax,%rbx # a[1]*b[0] + adox %rax,$acc1 + adcx %rbx,$acc2 + mulx %rcx,%rax,%rbx # a[1]*b[1] + adox %rax,$acc2 + adcx %rbx,$acc3 + mulx $acc6,%rax,%rbx # a[1]*b[2] + adox %rax,$acc3 + adcx %rbx,$acc4 + mulx $acc7,%rax,$acc5 # a[1]*b[3] + mov 8*2(%rsi),%rdx # a[2] + adox %rax,$acc4 + adcx %rdi,$acc5 # cf=0 + adox %rdi,$acc5 # of=0 + + mulx %rbp,%rax,%rbx # a[2]*b[0] + adcx %rax,$acc2 + adox %rbx,$acc3 + mulx %rcx,%rax,%rbx # a[2]*b[1] + adcx %rax,$acc3 + adox %rbx,$acc4 + mulx $acc6,%rax,%rbx # a[2]*b[2] + adcx %rax,$acc4 + adox %rbx,$acc5 + mulx $acc7,%rax,$acc6 # a[2]*b[3] + mov 8*3(%rsi),%rdx # a[3] + adcx %rax,$acc5 + adox %rdi,$acc6 # of=0 + adcx %rdi,$acc6 # cf=0 + + mulx %rbp,%rax,%rbx # a[3]*b[0] + adox %rax,$acc3 + adcx %rbx,$acc4 + mulx %rcx,%rax,%rbx # a[3]*b[1] + adox %rax,$acc4 + adcx %rbx,$acc5 + mulx (%rsp),%rax,%rbx # a[3]*b[2] + adox %rax,$acc5 + adcx %rbx,$acc6 + mulx $acc7,%rax,$acc7 # a[3]*b[3] + mov \$38,%edx + adox %rax,$acc6 + adcx %rdi,$acc7 # cf=0 + adox %rdi,$acc7 # of=0 + + jmp .Lreduce64 +.Lfe64_mul_epilogue: +.cfi_endproc +.size x25519_fe64_mul,.-x25519_fe64_mul + +.globl x25519_fe64_sqr +.type x25519_fe64_sqr,\@function,2 +.align 32 +x25519_fe64_sqr: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push %rdi # offload dst +.cfi_push %rdi + lea -8*2(%rsp),%rsp +.cfi_adjust_cfa_offset 16 +.Lfe64_sqr_body: + + mov 8*0(%rsi),%rdx # a[0] + mov 8*1(%rsi),%rcx # a[1] + mov 8*2(%rsi),%rbp # a[2] + mov 8*3(%rsi),%rsi # a[3] + + ################################################################ + mulx %rdx,$acc0,$acc7 # a[0]*a[0] + mulx %rcx,$acc1,%rax # a[0]*a[1] + xor %edi,%edi # cf=0,of=0 + mulx %rbp,$acc2,%rbx # a[0]*a[2] + adcx %rax,$acc2 + mulx %rsi,$acc3,$acc4 # a[0]*a[3] + mov %rcx,%rdx # a[1] + adcx %rbx,$acc3 + adcx %rdi,$acc4 # cf=0 + + ################################################################ + mulx %rbp,%rax,%rbx # a[1]*a[2] + adox %rax,$acc3 + adcx %rbx,$acc4 + mulx %rsi,%rax,$acc5 # a[1]*a[3] + mov %rbp,%rdx # a[2] + adox %rax,$acc4 + adcx %rdi,$acc5 + + ################################################################ + mulx %rsi,%rax,$acc6 # a[2]*a[3] + mov %rcx,%rdx # a[1] + adox %rax,$acc5 + adcx %rdi,$acc6 # cf=0 + adox %rdi,$acc6 # of=0 + + adcx $acc1,$acc1 # acc1:6<<1 + adox $acc7,$acc1 + adcx $acc2,$acc2 + mulx %rdx,%rax,%rbx # a[1]*a[1] + mov %rbp,%rdx # a[2] + adcx $acc3,$acc3 + adox %rax,$acc2 + adcx $acc4,$acc4 + adox %rbx,$acc3 + mulx %rdx,%rax,%rbx # a[2]*a[2] + mov %rsi,%rdx # a[3] + adcx $acc5,$acc5 + adox %rax,$acc4 + adcx $acc6,$acc6 + adox %rbx,$acc5 + mulx %rdx,%rax,$acc7 # a[3]*a[3] + mov \$38,%edx + adox %rax,$acc6 + adcx %rdi,$acc7 # cf=0 + adox %rdi,$acc7 # of=0 + jmp .Lreduce64 + +.align 32 +.Lreduce64: + mulx $acc4,%rax,%rbx + adcx %rax,$acc0 + adox %rbx,$acc1 + mulx $acc5,%rax,%rbx + adcx %rax,$acc1 + adox %rbx,$acc2 + mulx $acc6,%rax,%rbx + adcx %rax,$acc2 + adox %rbx,$acc3 + mulx $acc7,%rax,$acc4 + adcx %rax,$acc3 + adox %rdi,$acc4 + adcx %rdi,$acc4 + + mov 8*2(%rsp),%rdi # restore dst + imulq %rdx,$acc4 + + add $acc4,$acc0 + adc \$0,$acc1 + adc \$0,$acc2 + adc \$0,$acc3 + + sbb %rax,%rax # cf -> mask + and \$38,%rax + + add %rax,$acc0 + mov $acc1,8*1(%rdi) + mov $acc2,8*2(%rdi) + mov $acc3,8*3(%rdi) + mov $acc0,8*0(%rdi) + + mov 8*3(%rsp),%r15 +.cfi_restore %r15 + mov 8*4(%rsp),%r14 +.cfi_restore %r14 + mov 8*5(%rsp),%r13 +.cfi_restore %r13 + mov 8*6(%rsp),%r12 +.cfi_restore %r12 + mov 8*7(%rsp),%rbx +.cfi_restore %rbx + mov 8*8(%rsp),%rbp +.cfi_restore %rbp + lea 8*9(%rsp),%rsp +.cfi_adjust_cfa_offset 88 +.Lfe64_sqr_epilogue: + ret +.cfi_endproc +.size x25519_fe64_sqr,.-x25519_fe64_sqr + +.globl x25519_fe64_mul121666 +.type x25519_fe64_mul121666,\@function,2 +.align 32 +x25519_fe64_mul121666: +.Lfe64_mul121666_body: +.cfi_startproc + mov \$121666,%edx + mulx 8*0(%rsi),$acc0,%rcx + mulx 8*1(%rsi),$acc1,%rax + add %rcx,$acc1 + mulx 8*2(%rsi),$acc2,%rcx + adc %rax,$acc2 + mulx 8*3(%rsi),$acc3,%rax + adc %rcx,$acc3 + adc \$0,%rax + + imulq \$38,%rax,%rax + + add %rax,$acc0 + adc \$0,$acc1 + adc \$0,$acc2 + adc \$0,$acc3 + + sbb %rax,%rax # cf -> mask + and \$38,%rax + + add %rax,$acc0 + mov $acc1,8*1(%rdi) + mov $acc2,8*2(%rdi) + mov $acc3,8*3(%rdi) + mov $acc0,8*0(%rdi) + +.Lfe64_mul121666_epilogue: + ret +.cfi_endproc +.size x25519_fe64_mul121666,.-x25519_fe64_mul121666 + +.globl x25519_fe64_add +.type x25519_fe64_add,\@function,3 +.align 32 +x25519_fe64_add: +.Lfe64_add_body: +.cfi_startproc + mov 8*0(%rsi),$acc0 + mov 8*1(%rsi),$acc1 + mov 8*2(%rsi),$acc2 + mov 8*3(%rsi),$acc3 + + add 8*0(%rdx),$acc0 + adc 8*1(%rdx),$acc1 + adc 8*2(%rdx),$acc2 + adc 8*3(%rdx),$acc3 + + sbb %rax,%rax # cf -> mask + and \$38,%rax + + add %rax,$acc0 + adc \$0,$acc1 + adc \$0,$acc2 + mov $acc1,8*1(%rdi) + adc \$0,$acc3 + mov $acc2,8*2(%rdi) + sbb %rax,%rax # cf -> mask + mov $acc3,8*3(%rdi) + and \$38,%rax + + add %rax,$acc0 + mov $acc0,8*0(%rdi) + +.Lfe64_add_epilogue: + ret +.cfi_endproc +.size x25519_fe64_add,.-x25519_fe64_add + +.globl x25519_fe64_sub +.type x25519_fe64_sub,\@function,3 +.align 32 +x25519_fe64_sub: +.Lfe64_sub_body: +.cfi_startproc + mov 8*0(%rsi),$acc0 + mov 8*1(%rsi),$acc1 + mov 8*2(%rsi),$acc2 + mov 8*3(%rsi),$acc3 + + sub 8*0(%rdx),$acc0 + sbb 8*1(%rdx),$acc1 + sbb 8*2(%rdx),$acc2 + sbb 8*3(%rdx),$acc3 + + sbb %rax,%rax # cf -> mask + and \$38,%rax + + sub %rax,$acc0 + sbb \$0,$acc1 + sbb \$0,$acc2 + mov $acc1,8*1(%rdi) + sbb \$0,$acc3 + mov $acc2,8*2(%rdi) + sbb %rax,%rax # cf -> mask + mov $acc3,8*3(%rdi) + and \$38,%rax + + sub %rax,$acc0 + mov $acc0,8*0(%rdi) + +.Lfe64_sub_epilogue: + ret +.cfi_endproc +.size x25519_fe64_sub,.-x25519_fe64_sub + +.globl x25519_fe64_tobytes +.type x25519_fe64_tobytes,\@function,2 +.align 32 +x25519_fe64_tobytes: +.Lfe64_to_body: +.cfi_startproc + mov 8*0(%rsi),$acc0 + mov 8*1(%rsi),$acc1 + mov 8*2(%rsi),$acc2 + mov 8*3(%rsi),$acc3 + + ################################# reduction modulo 2^255-19 + lea ($acc3,$acc3),%rax + sar \$63,$acc3 # most significant bit -> mask + shr \$1,%rax # most significant bit cleared + and \$19,$acc3 + add \$19,$acc3 # compare to modulus in the same go + + add $acc3,$acc0 + adc \$0,$acc1 + adc \$0,$acc2 + adc \$0,%rax + + lea (%rax,%rax),$acc3 + sar \$63,%rax # most significant bit -> mask + shr \$1,$acc3 # most significant bit cleared + not %rax + and \$19,%rax + + sub %rax,$acc0 + sbb \$0,$acc1 + sbb \$0,$acc2 + sbb \$0,$acc3 + + mov $acc0,8*0(%rdi) + mov $acc1,8*1(%rdi) + mov $acc2,8*2(%rdi) + mov $acc3,8*3(%rdi) + +.Lfe64_to_epilogue: + ret +.cfi_endproc +.size x25519_fe64_tobytes,.-x25519_fe64_tobytes +___ +} else { +$code.=<<___; +.globl x25519_fe64_eligible +.type x25519_fe64_eligible,\@abi-omnipotent +.align 32 +x25519_fe64_eligible: +.cfi_startproc + xor %eax,%eax + ret +.cfi_endproc +.size x25519_fe64_eligible,.-x25519_fe64_eligible + +.globl x25519_fe64_mul +.type x25519_fe64_mul,\@abi-omnipotent +.globl x25519_fe64_sqr +.globl x25519_fe64_mul121666 +.globl x25519_fe64_add +.globl x25519_fe64_sub +.globl x25519_fe64_tobytes +x25519_fe64_mul: +x25519_fe64_sqr: +x25519_fe64_mul121666: +x25519_fe64_add: +x25519_fe64_sub: +x25519_fe64_tobytes: +.cfi_startproc + .byte 0x0f,0x0b # ud2 + ret +.cfi_endproc +.size x25519_fe64_mul,.-x25519_fe64_mul +___ +} +$code.=<<___; +.asciz "X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind + +.type short_handler,\@abi-omnipotent +.align 16 +short_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->Rip<end of prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + jmp .Lcommon_seh_tail +.size short_handler,.-short_handler + +.type full_handler,\@abi-omnipotent +.align 16 +full_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->Rip<end of prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 8(%r11),%r10d # HandlerData[2] + lea (%rax,%r10),%rax + + mov -8(%rax),%rbp + mov -16(%rax),%rbx + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size full_handler,.-full_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_x25519_fe51_mul + .rva .LSEH_end_x25519_fe51_mul + .rva .LSEH_info_x25519_fe51_mul + + .rva .LSEH_begin_x25519_fe51_sqr + .rva .LSEH_end_x25519_fe51_sqr + .rva .LSEH_info_x25519_fe51_sqr + + .rva .LSEH_begin_x25519_fe51_mul121666 + .rva .LSEH_end_x25519_fe51_mul121666 + .rva .LSEH_info_x25519_fe51_mul121666 +___ +$code.=<<___ if ($addx); + .rva .LSEH_begin_x25519_fe64_mul + .rva .LSEH_end_x25519_fe64_mul + .rva .LSEH_info_x25519_fe64_mul + + .rva .LSEH_begin_x25519_fe64_sqr + .rva .LSEH_end_x25519_fe64_sqr + .rva .LSEH_info_x25519_fe64_sqr + + .rva .LSEH_begin_x25519_fe64_mul121666 + .rva .LSEH_end_x25519_fe64_mul121666 + .rva .LSEH_info_x25519_fe64_mul121666 + + .rva .LSEH_begin_x25519_fe64_add + .rva .LSEH_end_x25519_fe64_add + .rva .LSEH_info_x25519_fe64_add + + .rva .LSEH_begin_x25519_fe64_sub + .rva .LSEH_end_x25519_fe64_sub + .rva .LSEH_info_x25519_fe64_sub + + .rva .LSEH_begin_x25519_fe64_tobytes + .rva .LSEH_end_x25519_fe64_tobytes + .rva .LSEH_info_x25519_fe64_tobytes +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_x25519_fe51_mul: + .byte 9,0,0,0 + .rva full_handler + .rva .Lfe51_mul_body,.Lfe51_mul_epilogue # HandlerData[] + .long 88,0 +.LSEH_info_x25519_fe51_sqr: + .byte 9,0,0,0 + .rva full_handler + .rva .Lfe51_sqr_body,.Lfe51_sqr_epilogue # HandlerData[] + .long 88,0 +.LSEH_info_x25519_fe51_mul121666: + .byte 9,0,0,0 + .rva full_handler + .rva .Lfe51_mul121666_body,.Lfe51_mul121666_epilogue # HandlerData[] + .long 88,0 +___ +$code.=<<___ if ($addx); +.LSEH_info_x25519_fe64_mul: + .byte 9,0,0,0 + .rva full_handler + .rva .Lfe64_mul_body,.Lfe64_mul_epilogue # HandlerData[] + .long 72,0 +.LSEH_info_x25519_fe64_sqr: + .byte 9,0,0,0 + .rva full_handler + .rva .Lfe64_sqr_body,.Lfe64_sqr_epilogue # HandlerData[] + .long 72,0 +.LSEH_info_x25519_fe64_mul121666: + .byte 9,0,0,0 + .rva short_handler + .rva .Lfe64_mul121666_body,.Lfe64_mul121666_epilogue # HandlerData[] +.LSEH_info_x25519_fe64_add: + .byte 9,0,0,0 + .rva short_handler + .rva .Lfe64_add_body,.Lfe64_add_epilogue # HandlerData[] +.LSEH_info_x25519_fe64_sub: + .byte 9,0,0,0 + .rva short_handler + .rva .Lfe64_sub_body,.Lfe64_sub_epilogue # HandlerData[] +.LSEH_info_x25519_fe64_tobytes: + .byte 9,0,0,0 + .rva short_handler + .rva .Lfe64_to_body,.Lfe64_to_epilogue # HandlerData[] +___ +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT or die "error closing STDOUT: $!"; |