diff options
Diffstat (limited to 'roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305')
17 files changed, 13254 insertions, 0 deletions
diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-armv4.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-armv4.pl new file mode 100755 index 000000000..f77e1170f --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-armv4.pl @@ -0,0 +1,1253 @@ +#! /usr/bin/env perl +# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# IALU(*)/gcc-4.4 NEON +# +# ARM11xx(ARMv6) 7.78/+100% - +# Cortex-A5 6.35/+130% 3.00 +# Cortex-A8 6.25/+115% 2.36 +# Cortex-A9 5.10/+95% 2.55 +# Cortex-A15 3.85/+85% 1.25(**) +# Snapdragon S4 5.70/+100% 1.48(**) +# +# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data; +# (**) these are trade-off results, they can be improved by ~8% but at +# the cost of 15/12% regression on Cortex-A5/A7, it's even possible +# to improve Cortex-A9 result, but then A5/A7 loose more than 20%; + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); + +$code.=<<___; +#include "arm_arch.h" + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.globl poly1305_emit +.globl poly1305_blocks +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: +.Lpoly1305_init: + stmdb sp!,{r4-r11} + + eor r3,r3,r3 + cmp $inp,#0 + str r3,[$ctx,#0] @ zero hash value + str r3,[$ctx,#4] + str r3,[$ctx,#8] + str r3,[$ctx,#12] + str r3,[$ctx,#16] + str r3,[$ctx,#36] @ is_base2_26 + add $ctx,$ctx,#20 + +#ifdef __thumb2__ + it eq +#endif + moveq r0,#0 + beq .Lno_key + +#if __ARM_MAX_ARCH__>=7 + adr r11,.Lpoly1305_init + ldr r12,.LOPENSSL_armcap +#endif + ldrb r4,[$inp,#0] + mov r10,#0x0fffffff + ldrb r5,[$inp,#1] + and r3,r10,#-4 @ 0x0ffffffc + ldrb r6,[$inp,#2] + ldrb r7,[$inp,#3] + orr r4,r4,r5,lsl#8 + ldrb r5,[$inp,#4] + orr r4,r4,r6,lsl#16 + ldrb r6,[$inp,#5] + orr r4,r4,r7,lsl#24 + ldrb r7,[$inp,#6] + and r4,r4,r10 + +#if __ARM_MAX_ARCH__>=7 + ldr r12,[r11,r12] @ OPENSSL_armcap_P +# ifdef __APPLE__ + ldr r12,[r12] +# endif +#endif + ldrb r8,[$inp,#7] + orr r5,r5,r6,lsl#8 + ldrb r6,[$inp,#8] + orr r5,r5,r7,lsl#16 + ldrb r7,[$inp,#9] + orr r5,r5,r8,lsl#24 + ldrb r8,[$inp,#10] + and r5,r5,r3 + +#if __ARM_MAX_ARCH__>=7 + tst r12,#ARMV7_NEON @ check for NEON +# ifdef __APPLE__ + adr r9,poly1305_blocks_neon + adr r11,poly1305_blocks +# ifdef __thumb2__ + it ne +# endif + movne r11,r9 + adr r12,poly1305_emit + adr r10,poly1305_emit_neon +# ifdef __thumb2__ + it ne +# endif + movne r12,r10 +# else +# ifdef __thumb2__ + itete eq +# endif + addeq r12,r11,#(poly1305_emit-.Lpoly1305_init) + addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init) + addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init) + addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init) +# endif +# ifdef __thumb2__ + orr r12,r12,#1 @ thumb-ify address + orr r11,r11,#1 +# endif +#endif + ldrb r9,[$inp,#11] + orr r6,r6,r7,lsl#8 + ldrb r7,[$inp,#12] + orr r6,r6,r8,lsl#16 + ldrb r8,[$inp,#13] + orr r6,r6,r9,lsl#24 + ldrb r9,[$inp,#14] + and r6,r6,r3 + + ldrb r10,[$inp,#15] + orr r7,r7,r8,lsl#8 + str r4,[$ctx,#0] + orr r7,r7,r9,lsl#16 + str r5,[$ctx,#4] + orr r7,r7,r10,lsl#24 + str r6,[$ctx,#8] + and r7,r7,r3 + str r7,[$ctx,#12] +#if __ARM_MAX_ARCH__>=7 + stmia r2,{r11,r12} @ fill functions table + mov r0,#1 +#else + mov r0,#0 +#endif +.Lno_key: + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_init,.-poly1305_init +___ +{ +my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12)); +my ($s1,$s2,$s3)=($r1,$r2,$r3); + +$code.=<<___; +.type poly1305_blocks,%function +.align 5 +poly1305_blocks: +.Lpoly1305_blocks: + stmdb sp!,{r3-r11,lr} + + ands $len,$len,#-16 + beq .Lno_data + + cmp $padbit,#0 + add $len,$len,$inp @ end pointer + sub sp,sp,#32 + + ldmia $ctx,{$h0-$r3} @ load context + + str $ctx,[sp,#12] @ offload stuff + mov lr,$inp + str $len,[sp,#16] + str $r1,[sp,#20] + str $r2,[sp,#24] + str $r3,[sp,#28] + b .Loop + +.Loop: +#if __ARM_ARCH__<7 + ldrb r0,[lr],#16 @ load input +# ifdef __thumb2__ + it hi +# endif + addhi $h4,$h4,#1 @ 1<<128 + ldrb r1,[lr,#-15] + ldrb r2,[lr,#-14] + ldrb r3,[lr,#-13] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-12] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-11] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-10] + adds $h0,$h0,r3 @ accumulate input + + ldrb r3,[lr,#-9] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-8] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-7] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-6] + adcs $h1,$h1,r3 + + ldrb r3,[lr,#-5] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-4] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-3] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-2] + adcs $h2,$h2,r3 + + ldrb r3,[lr,#-1] + orr r1,r0,r1,lsl#8 + str lr,[sp,#8] @ offload input pointer + orr r2,r1,r2,lsl#16 + add $s1,$r1,$r1,lsr#2 + orr r3,r2,r3,lsl#24 +#else + ldr r0,[lr],#16 @ load input +# ifdef __thumb2__ + it hi +# endif + addhi $h4,$h4,#1 @ padbit + ldr r1,[lr,#-12] + ldr r2,[lr,#-8] + ldr r3,[lr,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + adds $h0,$h0,r0 @ accumulate input + str lr,[sp,#8] @ offload input pointer + adcs $h1,$h1,r1 + add $s1,$r1,$r1,lsr#2 + adcs $h2,$h2,r2 +#endif + add $s2,$r2,$r2,lsr#2 + adcs $h3,$h3,r3 + add $s3,$r3,$r3,lsr#2 + + umull r2,r3,$h1,$r0 + adc $h4,$h4,#0 + umull r0,r1,$h0,$r0 + umlal r2,r3,$h4,$s1 + umlal r0,r1,$h3,$s1 + ldr $r1,[sp,#20] @ reload $r1 + umlal r2,r3,$h2,$s3 + umlal r0,r1,$h1,$s3 + umlal r2,r3,$h3,$s2 + umlal r0,r1,$h2,$s2 + umlal r2,r3,$h0,$r1 + str r0,[sp,#0] @ future $h0 + mul r0,$s2,$h4 + ldr $r2,[sp,#24] @ reload $r2 + adds r2,r2,r1 @ d1+=d0>>32 + eor r1,r1,r1 + adc lr,r3,#0 @ future $h2 + str r2,[sp,#4] @ future $h1 + + mul r2,$s3,$h4 + eor r3,r3,r3 + umlal r0,r1,$h3,$s3 + ldr $r3,[sp,#28] @ reload $r3 + umlal r2,r3,$h3,$r0 + umlal r0,r1,$h2,$r0 + umlal r2,r3,$h2,$r1 + umlal r0,r1,$h1,$r1 + umlal r2,r3,$h1,$r2 + umlal r0,r1,$h0,$r2 + umlal r2,r3,$h0,$r3 + ldr $h0,[sp,#0] + mul $h4,$r0,$h4 + ldr $h1,[sp,#4] + + adds $h2,lr,r0 @ d2+=d1>>32 + ldr lr,[sp,#8] @ reload input pointer + adc r1,r1,#0 + adds $h3,r2,r1 @ d3+=d2>>32 + ldr r0,[sp,#16] @ reload end pointer + adc r3,r3,#0 + add $h4,$h4,r3 @ h4+=d3>>32 + + and r1,$h4,#-4 + and $h4,$h4,#3 + add r1,r1,r1,lsr#2 @ *=5 + adds $h0,$h0,r1 + adcs $h1,$h1,#0 + adcs $h2,$h2,#0 + adcs $h3,$h3,#0 + adc $h4,$h4,#0 + + cmp r0,lr @ done yet? + bhi .Loop + + ldr $ctx,[sp,#12] + add sp,sp,#32 + stmia $ctx,{$h0-$h4} @ store the result + +.Lno_data: +#if __ARM_ARCH__>=5 + ldmia sp!,{r3-r11,pc} +#else + ldmia sp!,{r3-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_blocks,.-poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce)=map("r$_",(0..2)); +my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11)); +my $g4=$h4; + +$code.=<<___; +.type poly1305_emit,%function +.align 5 +poly1305_emit: + stmdb sp!,{r4-r11} +.Lpoly1305_emit_enter: + + ldmia $ctx,{$h0-$h4} + adds $g0,$h0,#5 @ compare to modulus + adcs $g1,$h1,#0 + adcs $g2,$h2,#0 + adcs $g3,$h3,#0 + adc $g4,$h4,#0 + tst $g4,#4 @ did it carry/borrow? + +#ifdef __thumb2__ + it ne +#endif + movne $h0,$g0 + ldr $g0,[$nonce,#0] +#ifdef __thumb2__ + it ne +#endif + movne $h1,$g1 + ldr $g1,[$nonce,#4] +#ifdef __thumb2__ + it ne +#endif + movne $h2,$g2 + ldr $g2,[$nonce,#8] +#ifdef __thumb2__ + it ne +#endif + movne $h3,$g3 + ldr $g3,[$nonce,#12] + + adds $h0,$h0,$g0 + adcs $h1,$h1,$g1 + adcs $h2,$h2,$g2 + adc $h3,$h3,$g3 + +#if __ARM_ARCH__>=7 +# ifdef __ARMEB__ + rev $h0,$h0 + rev $h1,$h1 + rev $h2,$h2 + rev $h3,$h3 +# endif + str $h0,[$mac,#0] + str $h1,[$mac,#4] + str $h2,[$mac,#8] + str $h3,[$mac,#12] +#else + strb $h0,[$mac,#0] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#4] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#8] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#12] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#1] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#5] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#9] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#13] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#2] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#6] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#10] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#14] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#3] + strb $h1,[$mac,#7] + strb $h2,[$mac,#11] + strb $h3,[$mac,#15] +#endif + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_emit,.-poly1305_emit +___ +{ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); +my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); +my ($T0,$T1,$MASK) = map("q$_",(15,4,0)); + +my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.fpu neon + +.type poly1305_init_neon,%function +.align 5 +poly1305_init_neon: + ldr r4,[$ctx,#20] @ load key base 2^32 + ldr r5,[$ctx,#24] + ldr r6,[$ctx,#28] + ldr r7,[$ctx,#32] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + and r3,r3,#0x03ffffff + and r4,r4,#0x03ffffff + and r5,r5,#0x03ffffff + + vdup.32 $R0,r2 @ r^1 in both lanes + add r2,r3,r3,lsl#2 @ *5 + vdup.32 $R1,r3 + add r3,r4,r4,lsl#2 + vdup.32 $S1,r2 + vdup.32 $R2,r4 + add r4,r5,r5,lsl#2 + vdup.32 $S2,r3 + vdup.32 $R3,r5 + add r5,r6,r6,lsl#2 + vdup.32 $S3,r4 + vdup.32 $R4,r6 + vdup.32 $S4,r5 + + mov $zeros,#2 @ counter + +.Lsquare_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + + vmull.u32 $D0,$R0,${R0}[1] + vmull.u32 $D1,$R1,${R0}[1] + vmull.u32 $D2,$R2,${R0}[1] + vmull.u32 $D3,$R3,${R0}[1] + vmull.u32 $D4,$R4,${R0}[1] + + vmlal.u32 $D0,$R4,${S1}[1] + vmlal.u32 $D1,$R0,${R1}[1] + vmlal.u32 $D2,$R1,${R1}[1] + vmlal.u32 $D3,$R2,${R1}[1] + vmlal.u32 $D4,$R3,${R1}[1] + + vmlal.u32 $D0,$R3,${S2}[1] + vmlal.u32 $D1,$R4,${S2}[1] + vmlal.u32 $D3,$R1,${R2}[1] + vmlal.u32 $D2,$R0,${R2}[1] + vmlal.u32 $D4,$R2,${R2}[1] + + vmlal.u32 $D0,$R2,${S3}[1] + vmlal.u32 $D3,$R0,${R3}[1] + vmlal.u32 $D1,$R3,${S3}[1] + vmlal.u32 $D2,$R4,${S3}[1] + vmlal.u32 $D4,$R1,${R3}[1] + + vmlal.u32 $D3,$R4,${S4}[1] + vmlal.u32 $D0,$R1,${S4}[1] + vmlal.u32 $D1,$R2,${S4}[1] + vmlal.u32 $D2,$R3,${S4}[1] + vmlal.u32 $D4,$R0,${R4}[1] + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + @ and P. Schwabe + @ + @ H0>>+H1>>+H2>>+H3>>+H4 + @ H3>>+H4>>*5+H0>>+H1 + @ + @ Trivia. + @ + @ Result of multiplication of n-bit number by m-bit number is + @ n+m bits wide. However! Even though 2^n is a n+1-bit number, + @ m-bit number multiplied by 2^n is still n+m bits wide. + @ + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, + @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit + @ one is n+1 bits wide. + @ + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 + @ can be 27. However! In cases when their width exceeds 26 bits + @ they are limited by 2^26+2^6. This in turn means that *sum* + @ of the products with these values can still be viewed as sum + @ of 52-bit numbers as long as the amount of addends is not a + @ power of 2. For example, + @ + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, + @ + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than + @ 8 * (2^52) or 2^55. However, the value is then multiplied by + @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), + @ which is less than 32 * (2^52) or 2^57. And when processing + @ data we are looking at triple as many addends... + @ + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 + @ instruction accepts 2x32-bit input and writes 2x64-bit result. + @ This means that result of reduction have to be compressed upon + @ loop wrap-around. This can be done in the process of reduction + @ to minimize amount of instructions [as well as amount of + @ 128-bit instructions, which benefits low-end processors], but + @ one has to watch for H2 (which is narrower than H0) and 5*H4 + @ not being wider than 58 bits, so that result of right shift + @ by 26 bits fits in 32 bits. This is also useful on x86, + @ because it allows to use paddd in place for paddq, which + @ benefits Atom, where paddq is ridiculously slow. + + vshr.u64 $T0,$D3,#26 + vmovn.i64 $D3#lo,$D3 + vshr.u64 $T1,$D0,#26 + vmovn.i64 $D0#lo,$D0 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + vbic.i32 $D0#lo,#0xfc000000 + + vshrn.u64 $T0#lo,$D4,#26 + vmovn.i64 $D4#lo,$D4 + vshr.u64 $T1,$D1,#26 + vmovn.i64 $D1#lo,$D1 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + vbic.i32 $D4#lo,#0xfc000000 + vbic.i32 $D1#lo,#0xfc000000 + + vadd.i32 $D0#lo,$D0#lo,$T0#lo + vshl.u32 $T0#lo,$T0#lo,#2 + vshrn.u64 $T1#lo,$D2,#26 + vmovn.i64 $D2#lo,$D2 + vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 + vbic.i32 $D2#lo,#0xfc000000 + + vshr.u32 $T0#lo,$D0#lo,#26 + vbic.i32 $D0#lo,#0xfc000000 + vshr.u32 $T1#lo,$D3#lo,#26 + vbic.i32 $D3#lo,#0xfc000000 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 + + subs $zeros,$zeros,#1 + beq .Lsquare_break_neon + + add $tbl0,$ctx,#(48+0*9*4) + add $tbl1,$ctx,#(48+1*9*4) + + vtrn.32 $R0,$D0#lo @ r^2:r^1 + vtrn.32 $R2,$D2#lo + vtrn.32 $R3,$D3#lo + vtrn.32 $R1,$D1#lo + vtrn.32 $R4,$D4#lo + + vshl.u32 $S2,$R2,#2 @ *5 + vshl.u32 $S3,$R3,#2 + vshl.u32 $S1,$R1,#2 + vshl.u32 $S4,$R4,#2 + vadd.i32 $S2,$S2,$R2 + vadd.i32 $S1,$S1,$R1 + vadd.i32 $S3,$S3,$R3 + vadd.i32 $S4,$S4,$R4 + + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vst1.32 {${S4}[0]},[$tbl0,:32] + vst1.32 {${S4}[1]},[$tbl1,:32] + + b .Lsquare_neon + +.align 4 +.Lsquare_break_neon: + add $tbl0,$ctx,#(48+2*4*9) + add $tbl1,$ctx,#(48+3*4*9) + + vmov $R0,$D0#lo @ r^4:r^3 + vshl.u32 $S1,$D1#lo,#2 @ *5 + vmov $R1,$D1#lo + vshl.u32 $S2,$D2#lo,#2 + vmov $R2,$D2#lo + vshl.u32 $S3,$D3#lo,#2 + vmov $R3,$D3#lo + vshl.u32 $S4,$D4#lo,#2 + vmov $R4,$D4#lo + vadd.i32 $S1,$S1,$D1#lo + vadd.i32 $S2,$S2,$D2#lo + vadd.i32 $S3,$S3,$D3#lo + vadd.i32 $S4,$S4,$D4#lo + + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vst1.32 {${S4}[0]},[$tbl0] + vst1.32 {${S4}[1]},[$tbl1] + + ret @ bx lr +.size poly1305_init_neon,.-poly1305_init_neon + +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: + ldr ip,[$ctx,#36] @ is_base2_26 + ands $len,$len,#-16 + beq .Lno_data_neon + + cmp $len,#64 + bhs .Lenter_neon + tst ip,ip @ is_base2_26? + beq .Lpoly1305_blocks + +.Lenter_neon: + stmdb sp!,{r4-r7} + vstmdb sp!,{d8-d15} @ ABI specification says so + + tst ip,ip @ is_base2_26? + bne .Lbase2_26_neon + + stmdb sp!,{r1-r3,lr} + bl poly1305_init_neon + + ldr r4,[$ctx,#0] @ load hash value base 2^32 + ldr r5,[$ctx,#4] + ldr r6,[$ctx,#8] + ldr r7,[$ctx,#12] + ldr ip,[$ctx,#16] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + veor $D0#lo,$D0#lo,$D0#lo + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + veor $D1#lo,$D1#lo,$D1#lo + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + veor $D2#lo,$D2#lo,$D2#lo + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + veor $D3#lo,$D3#lo,$D3#lo + and r3,r3,#0x03ffffff + orr r6,r6,ip,lsl#24 + veor $D4#lo,$D4#lo,$D4#lo + and r4,r4,#0x03ffffff + mov r1,#1 + and r5,r5,#0x03ffffff + str r1,[$ctx,#36] @ is_base2_26 + + vmov.32 $D0#lo[0],r2 + vmov.32 $D1#lo[0],r3 + vmov.32 $D2#lo[0],r4 + vmov.32 $D3#lo[0],r5 + vmov.32 $D4#lo[0],r6 + adr $zeros,.Lzeros + + ldmia sp!,{r1-r3,lr} + b .Lbase2_32_neon + +.align 4 +.Lbase2_26_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ load hash value + + veor $D0#lo,$D0#lo,$D0#lo + veor $D1#lo,$D1#lo,$D1#lo + veor $D2#lo,$D2#lo,$D2#lo + veor $D3#lo,$D3#lo,$D3#lo + veor $D4#lo,$D4#lo,$D4#lo + vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! + adr $zeros,.Lzeros + vld1.32 {$D4#lo[0]},[$ctx] + sub $ctx,$ctx,#16 @ rewind + +.Lbase2_32_neon: + add $in2,$inp,#32 + mov $padbit,$padbit,lsl#24 + tst $len,#31 + beq .Leven + + vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! + vmov.32 $H4#lo[0],$padbit + sub $len,$len,#16 + add $in2,$inp,#32 + +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H3,$H3 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 +# endif + vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 + vshl.u32 $H3#lo,$H3#lo,#18 + + vsri.u32 $H3#lo,$H2#lo,#14 + vshl.u32 $H2#lo,$H2#lo,#12 + vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi + + vbic.i32 $H3#lo,#0xfc000000 + vsri.u32 $H2#lo,$H1#lo,#20 + vshl.u32 $H1#lo,$H1#lo,#6 + + vbic.i32 $H2#lo,#0xfc000000 + vsri.u32 $H1#lo,$H0#lo,#26 + vadd.i32 $H3#hi,$H3#lo,$D3#lo + + vbic.i32 $H0#lo,#0xfc000000 + vbic.i32 $H1#lo,#0xfc000000 + vadd.i32 $H2#hi,$H2#lo,$D2#lo + + vadd.i32 $H0#hi,$H0#lo,$D0#lo + vadd.i32 $H1#hi,$H1#lo,$D1#lo + + mov $tbl1,$zeros + add $tbl0,$ctx,#48 + + cmp $len,$len + b .Long_tail + +.align 4 +.Leven: + subs $len,$len,#64 + it lo + movlo $in2,$zeros + + vmov.i32 $H4,#1<<24 @ padbit, yes, always + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] + add $inp,$inp,#64 + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) + add $in2,$in2,#64 + itt hi + addhi $tbl1,$ctx,#(48+1*9*4) + addhi $tbl0,$ctx,#(48+3*9*4) + +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H3,$H3 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 +# endif + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 + vshl.u32 $H3,$H3,#18 + + vsri.u32 $H3,$H2,#14 + vshl.u32 $H2,$H2,#12 + + vbic.i32 $H3,#0xfc000000 + vsri.u32 $H2,$H1,#20 + vshl.u32 $H1,$H1,#6 + + vbic.i32 $H2,#0xfc000000 + vsri.u32 $H1,$H0,#26 + + vbic.i32 $H0,#0xfc000000 + vbic.i32 $H1,#0xfc000000 + + bls .Lskip_loop + + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + b .Loop_neon + +.align 5 +.Loop_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + @ \___________________/ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + @ \___________________/ \____________________/ + @ + @ Note that we start with inp[2:3]*r^2. This is because it + @ doesn't depend on reduction in previous iteration. + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ inp[2:3]*r^2 + + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] + vmull.u32 $D2,$H2#hi,${R0}[1] + vadd.i32 $H0#lo,$H0#lo,$D0#lo + vmull.u32 $D0,$H0#hi,${R0}[1] + vadd.i32 $H3#lo,$H3#lo,$D3#lo + vmull.u32 $D3,$H3#hi,${R0}[1] + vmlal.u32 $D2,$H1#hi,${R1}[1] + vadd.i32 $H1#lo,$H1#lo,$D1#lo + vmull.u32 $D1,$H1#hi,${R0}[1] + + vadd.i32 $H4#lo,$H4#lo,$D4#lo + vmull.u32 $D4,$H4#hi,${R0}[1] + subs $len,$len,#64 + vmlal.u32 $D0,$H4#hi,${S1}[1] + it lo + movlo $in2,$zeros + vmlal.u32 $D3,$H2#hi,${R1}[1] + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D1,$H0#hi,${R1}[1] + vmlal.u32 $D4,$H3#hi,${R1}[1] + + vmlal.u32 $D0,$H3#hi,${S2}[1] + vmlal.u32 $D3,$H1#hi,${R2}[1] + vmlal.u32 $D4,$H2#hi,${R2}[1] + vmlal.u32 $D1,$H4#hi,${S2}[1] + vmlal.u32 $D2,$H0#hi,${R2}[1] + + vmlal.u32 $D3,$H0#hi,${R3}[1] + vmlal.u32 $D0,$H2#hi,${S3}[1] + vmlal.u32 $D4,$H1#hi,${R3}[1] + vmlal.u32 $D1,$H3#hi,${S3}[1] + vmlal.u32 $D2,$H4#hi,${S3}[1] + + vmlal.u32 $D3,$H4#hi,${S4}[1] + vmlal.u32 $D0,$H1#hi,${S4}[1] + vmlal.u32 $D4,$H0#hi,${R4}[1] + vmlal.u32 $D1,$H2#hi,${S4}[1] + vmlal.u32 $D2,$H3#hi,${S4}[1] + + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) + add $in2,$in2,#64 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4 and accumulate + + vmlal.u32 $D3,$H3#lo,${R0}[0] + vmlal.u32 $D0,$H0#lo,${R0}[0] + vmlal.u32 $D4,$H4#lo,${R0}[0] + vmlal.u32 $D1,$H1#lo,${R0}[0] + vmlal.u32 $D2,$H2#lo,${R0}[0] + vld1.32 ${S4}[0],[$tbl0,:32] + + vmlal.u32 $D3,$H2#lo,${R1}[0] + vmlal.u32 $D0,$H4#lo,${S1}[0] + vmlal.u32 $D4,$H3#lo,${R1}[0] + vmlal.u32 $D1,$H0#lo,${R1}[0] + vmlal.u32 $D2,$H1#lo,${R1}[0] + + vmlal.u32 $D3,$H1#lo,${R2}[0] + vmlal.u32 $D0,$H3#lo,${S2}[0] + vmlal.u32 $D4,$H2#lo,${R2}[0] + vmlal.u32 $D1,$H4#lo,${S2}[0] + vmlal.u32 $D2,$H0#lo,${R2}[0] + + vmlal.u32 $D3,$H0#lo,${R3}[0] + vmlal.u32 $D0,$H2#lo,${S3}[0] + vmlal.u32 $D4,$H1#lo,${R3}[0] + vmlal.u32 $D1,$H3#lo,${S3}[0] + vmlal.u32 $D3,$H4#lo,${S4}[0] + + vmlal.u32 $D2,$H4#lo,${S3}[0] + vmlal.u32 $D0,$H1#lo,${S4}[0] + vmlal.u32 $D4,$H0#lo,${R4}[0] + vmov.i32 $H4,#1<<24 @ padbit, yes, always + vmlal.u32 $D1,$H2#lo,${S4}[0] + vmlal.u32 $D2,$H3#lo,${S4}[0] + + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] + add $inp,$inp,#64 +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 + vrev32.8 $H3,$H3 +# endif + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction interleaved with base 2^32 -> base 2^26 of + @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. + + vshr.u64 $T0,$D3,#26 + vmovn.i64 $D3#lo,$D3 + vshr.u64 $T1,$D0,#26 + vmovn.i64 $D0#lo,$D0 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vbic.i32 $D3#lo,#0xfc000000 + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + vshl.u32 $H3,$H3,#18 + vbic.i32 $D0#lo,#0xfc000000 + + vshrn.u64 $T0#lo,$D4,#26 + vmovn.i64 $D4#lo,$D4 + vshr.u64 $T1,$D1,#26 + vmovn.i64 $D1#lo,$D1 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + vsri.u32 $H3,$H2,#14 + vbic.i32 $D4#lo,#0xfc000000 + vshl.u32 $H2,$H2,#12 + vbic.i32 $D1#lo,#0xfc000000 + + vadd.i32 $D0#lo,$D0#lo,$T0#lo + vshl.u32 $T0#lo,$T0#lo,#2 + vbic.i32 $H3,#0xfc000000 + vshrn.u64 $T1#lo,$D2,#26 + vmovn.i64 $D2#lo,$D2 + vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] + vsri.u32 $H2,$H1,#20 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 + vshl.u32 $H1,$H1,#6 + vbic.i32 $D2#lo,#0xfc000000 + vbic.i32 $H2,#0xfc000000 + + vshrn.u64 $T0#lo,$D0,#26 @ re-narrow + vmovn.i64 $D0#lo,$D0 + vsri.u32 $H1,$H0,#26 + vbic.i32 $H0,#0xfc000000 + vshr.u32 $T1#lo,$D3#lo,#26 + vbic.i32 $D3#lo,#0xfc000000 + vbic.i32 $D0#lo,#0xfc000000 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 + vbic.i32 $H1,#0xfc000000 + + bhi .Loop_neon + +.Lskip_loop: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + add $tbl1,$ctx,#(48+0*9*4) + add $tbl0,$ctx,#(48+1*9*4) + adds $len,$len,#32 + it ne + movne $len,#0 + bne .Long_tail + + vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi + vadd.i32 $H0#hi,$H0#lo,$D0#lo + vadd.i32 $H3#hi,$H3#lo,$D3#lo + vadd.i32 $H1#hi,$H1#lo,$D1#lo + vadd.i32 $H4#hi,$H4#lo,$D4#lo + +.Long_tail: + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2 + + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant + vmull.u32 $D2,$H2#hi,$R0 + vadd.i32 $H0#lo,$H0#lo,$D0#lo + vmull.u32 $D0,$H0#hi,$R0 + vadd.i32 $H3#lo,$H3#lo,$D3#lo + vmull.u32 $D3,$H3#hi,$R0 + vadd.i32 $H1#lo,$H1#lo,$D1#lo + vmull.u32 $D1,$H1#hi,$R0 + vadd.i32 $H4#lo,$H4#lo,$D4#lo + vmull.u32 $D4,$H4#hi,$R0 + + vmlal.u32 $D0,$H4#hi,$S1 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vmlal.u32 $D3,$H2#hi,$R1 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vmlal.u32 $D1,$H0#hi,$R1 + vmlal.u32 $D4,$H3#hi,$R1 + vmlal.u32 $D2,$H1#hi,$R1 + + vmlal.u32 $D3,$H1#hi,$R2 + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D0,$H3#hi,$S2 + vld1.32 ${S4}[0],[$tbl0,:32] + vmlal.u32 $D4,$H2#hi,$R2 + vmlal.u32 $D1,$H4#hi,$S2 + vmlal.u32 $D2,$H0#hi,$R2 + + vmlal.u32 $D3,$H0#hi,$R3 + it ne + addne $tbl1,$ctx,#(48+2*9*4) + vmlal.u32 $D0,$H2#hi,$S3 + it ne + addne $tbl0,$ctx,#(48+3*9*4) + vmlal.u32 $D4,$H1#hi,$R3 + vmlal.u32 $D1,$H3#hi,$S3 + vmlal.u32 $D2,$H4#hi,$S3 + + vmlal.u32 $D3,$H4#hi,$S4 + vorn $MASK,$MASK,$MASK @ all-ones, can be redundant + vmlal.u32 $D0,$H1#hi,$S4 + vshr.u64 $MASK,$MASK,#38 + vmlal.u32 $D4,$H0#hi,$R4 + vmlal.u32 $D1,$H2#hi,$S4 + vmlal.u32 $D2,$H3#hi,$S4 + + beq .Lshort_tail + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4:r^3 and accumulate + + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 + + vmlal.u32 $D2,$H2#lo,$R0 + vmlal.u32 $D0,$H0#lo,$R0 + vmlal.u32 $D3,$H3#lo,$R0 + vmlal.u32 $D1,$H1#lo,$R0 + vmlal.u32 $D4,$H4#lo,$R0 + + vmlal.u32 $D0,$H4#lo,$S1 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vmlal.u32 $D3,$H2#lo,$R1 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vmlal.u32 $D1,$H0#lo,$R1 + vmlal.u32 $D4,$H3#lo,$R1 + vmlal.u32 $D2,$H1#lo,$R1 + + vmlal.u32 $D3,$H1#lo,$R2 + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D0,$H3#lo,$S2 + vld1.32 ${S4}[0],[$tbl0,:32] + vmlal.u32 $D4,$H2#lo,$R2 + vmlal.u32 $D1,$H4#lo,$S2 + vmlal.u32 $D2,$H0#lo,$R2 + + vmlal.u32 $D3,$H0#lo,$R3 + vmlal.u32 $D0,$H2#lo,$S3 + vmlal.u32 $D4,$H1#lo,$R3 + vmlal.u32 $D1,$H3#lo,$S3 + vmlal.u32 $D2,$H4#lo,$S3 + + vmlal.u32 $D3,$H4#lo,$S4 + vorn $MASK,$MASK,$MASK @ all-ones + vmlal.u32 $D0,$H1#lo,$S4 + vshr.u64 $MASK,$MASK,#38 + vmlal.u32 $D4,$H0#lo,$R4 + vmlal.u32 $D1,$H2#lo,$S4 + vmlal.u32 $D2,$H3#lo,$S4 + +.Lshort_tail: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ horizontal addition + + vadd.i64 $D3#lo,$D3#lo,$D3#hi + vadd.i64 $D0#lo,$D0#lo,$D0#hi + vadd.i64 $D4#lo,$D4#lo,$D4#hi + vadd.i64 $D1#lo,$D1#lo,$D1#hi + vadd.i64 $D2#lo,$D2#lo,$D2#hi + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction, but without narrowing + + vshr.u64 $T0,$D3,#26 + vand.i64 $D3,$D3,$MASK + vshr.u64 $T1,$D0,#26 + vand.i64 $D0,$D0,$MASK + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + + vshr.u64 $T0,$D4,#26 + vand.i64 $D4,$D4,$MASK + vshr.u64 $T1,$D1,#26 + vand.i64 $D1,$D1,$MASK + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + + vadd.i64 $D0,$D0,$T0 + vshl.u64 $T0,$T0,#2 + vshr.u64 $T1,$D2,#26 + vand.i64 $D2,$D2,$MASK + vadd.i64 $D0,$D0,$T0 @ h4 -> h0 + vadd.i64 $D3,$D3,$T1 @ h2 -> h3 + + vshr.u64 $T0,$D0,#26 + vand.i64 $D0,$D0,$MASK + vshr.u64 $T1,$D3,#26 + vand.i64 $D3,$D3,$MASK + vadd.i64 $D1,$D1,$T0 @ h0 -> h1 + vadd.i64 $D4,$D4,$T1 @ h3 -> h4 + + cmp $len,#0 + bne .Leven + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ store hash value + + vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! + vst1.32 {$D4#lo[0]},[$ctx] + + vldmia sp!,{d8-d15} @ epilogue + ldmia sp!,{r4-r7} +.Lno_data_neon: + ret @ bx lr +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.type poly1305_emit_neon,%function +.align 5 +poly1305_emit_neon: + ldr ip,[$ctx,#36] @ is_base2_26 + + stmdb sp!,{r4-r11} + + tst ip,ip + beq .Lpoly1305_emit_enter + + ldmia $ctx,{$h0-$h4} + eor $g0,$g0,$g0 + + adds $h0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 + mov $h1,$h1,lsr#6 + adcs $h1,$h1,$h2,lsl#20 + mov $h2,$h2,lsr#12 + adcs $h2,$h2,$h3,lsl#14 + mov $h3,$h3,lsr#18 + adcs $h3,$h3,$h4,lsl#8 + adc $h4,$g0,$h4,lsr#24 @ can be partially reduced ... + + and $g0,$h4,#-4 @ ... so reduce + and $h4,$h3,#3 + add $g0,$g0,$g0,lsr#2 @ *= 5 + adds $h0,$h0,$g0 + adcs $h1,$h1,#0 + adcs $h2,$h2,#0 + adcs $h3,$h3,#0 + adc $h4,$h4,#0 + + adds $g0,$h0,#5 @ compare to modulus + adcs $g1,$h1,#0 + adcs $g2,$h2,#0 + adcs $g3,$h3,#0 + adc $g4,$h4,#0 + tst $g4,#4 @ did it carry/borrow? + + it ne + movne $h0,$g0 + ldr $g0,[$nonce,#0] + it ne + movne $h1,$g1 + ldr $g1,[$nonce,#4] + it ne + movne $h2,$g2 + ldr $g2,[$nonce,#8] + it ne + movne $h3,$g3 + ldr $g3,[$nonce,#12] + + adds $h0,$h0,$g0 @ accumulate nonce + adcs $h1,$h1,$g1 + adcs $h2,$h2,$g2 + adc $h3,$h3,$g3 + +# ifdef __ARMEB__ + rev $h0,$h0 + rev $h1,$h1 + rev $h2,$h2 + rev $h3,$h3 +# endif + str $h0,[$mac,#0] @ store the result + str $h1,[$mac,#4] + str $h2,[$mac,#8] + str $h3,[$mac,#12] + + ldmia sp!,{r4-r11} + ret @ bx lr +.size poly1305_emit_neon,.-poly1305_emit_neon + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.Lpoly1305_init +#endif +___ +} } +$code.=<<___; +.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +#if __ARM_MAX_ARCH__>=7 +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-armv8.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-armv8.pl new file mode 100755 index 000000000..9bfee2759 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-armv8.pl @@ -0,0 +1,946 @@ +#! /usr/bin/env perl +# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for ARMv8. +# +# June 2015 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc-4.9 NEON +# +# Apple A7 1.86/+5% 0.72 +# Cortex-A53 2.69/+58% 1.47 +# Cortex-A57 2.70/+7% 1.14 +# Denver 1.64/+50% 1.18(*) +# X-Gene 2.13/+68% 2.27 +# Mongoose 1.77/+75% 1.12 +# Kryo 2.70/+55% 1.13 +# +# (*) estimate based on resources availability is less than 1.0, +# i.e. measured result is worse than expected, presumably binary +# translator is not almighty; + +$flavour=shift; +$output=shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); +my ($mac,$nonce)=($inp,$len); + +my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); + +$code.=<<___; +#include "arm_arch.h" + +.text + +// forward "declarations" are required for Apple +.extern OPENSSL_armcap_P +.globl poly1305_blocks +.globl poly1305_emit + +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: + cmp $inp,xzr + stp xzr,xzr,[$ctx] // zero hash value + stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] + + csel x0,xzr,x0,eq + b.eq .Lno_key + +#ifdef __ILP32__ + ldrsw $t1,.LOPENSSL_armcap_P +#else + ldr $t1,.LOPENSSL_armcap_P +#endif + adr $t0,.LOPENSSL_armcap_P + + ldp $r0,$r1,[$inp] // load key + mov $s1,#0xfffffffc0fffffff + movk $s1,#0x0fff,lsl#48 + ldr w17,[$t0,$t1] +#ifdef __ARMEB__ + rev $r0,$r0 // flip bytes + rev $r1,$r1 +#endif + and $r0,$r0,$s1 // &=0ffffffc0fffffff + and $s1,$s1,#-4 + and $r1,$r1,$s1 // &=0ffffffc0ffffffc + stp $r0,$r1,[$ctx,#32] // save key value + + tst w17,#ARMV7_NEON + + adr $d0,poly1305_blocks + adr $r0,poly1305_blocks_neon + adr $d1,poly1305_emit + adr $r1,poly1305_emit_neon + + csel $d0,$d0,$r0,eq + csel $d1,$d1,$r1,eq + +#ifdef __ILP32__ + stp w12,w13,[$len] +#else + stp $d0,$d1,[$len] +#endif + + mov x0,#1 +.Lno_key: + ret +.size poly1305_init,.-poly1305_init + +.type poly1305_blocks,%function +.align 5 +poly1305_blocks: + ands $len,$len,#-16 + b.eq .Lno_data + + ldp $h0,$h1,[$ctx] // load hash value + ldp $r0,$r1,[$ctx,#32] // load key value + ldr $h2,[$ctx,#16] + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + b .Loop + +.align 5 +.Loop: + ldp $t0,$t1,[$inp],#16 // load input + sub $len,$len,#16 +#ifdef __ARMEB__ + rev $t0,$t0 + rev $t1,$t1 +#endif + adds $h0,$h0,$t0 // accumulate input + adcs $h1,$h1,$t1 + + mul $d0,$h0,$r0 // h0*r0 + adc $h2,$h2,$padbit + umulh $d1,$h0,$r0 + + mul $t0,$h1,$s1 // h1*5*r1 + umulh $t1,$h1,$s1 + + adds $d0,$d0,$t0 + mul $t0,$h0,$r1 // h0*r1 + adc $d1,$d1,$t1 + umulh $d2,$h0,$r1 + + adds $d1,$d1,$t0 + mul $t0,$h1,$r0 // h1*r0 + adc $d2,$d2,xzr + umulh $t1,$h1,$r0 + + adds $d1,$d1,$t0 + mul $t0,$h2,$s1 // h2*5*r1 + adc $d2,$d2,$t1 + mul $t1,$h2,$r0 // h2*r0 + + adds $d1,$d1,$t0 + adc $d2,$d2,$t1 + + and $t0,$d2,#-4 // final reduction + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$d0,$t0 + adcs $h1,$d1,xzr + adc $h2,$h2,xzr + + cbnz $len,.Loop + + stp $h0,$h1,[$ctx] // store hash value + str $h2,[$ctx,#16] + +.Lno_data: + ret +.size poly1305_blocks,.-poly1305_blocks + +.type poly1305_emit,%function +.align 5 +poly1305_emit: + ldp $h0,$h1,[$ctx] // load hash base 2^64 + ldr $h2,[$ctx,#16] + ldp $t0,$t1,[$nonce] // load nonce + + adds $d0,$h0,#5 // compare to modulus + adcs $d1,$h1,xzr + adc $d2,$h2,xzr + + tst $d2,#-4 // see if it's carried/borrowed + + csel $h0,$h0,$d0,eq + csel $h1,$h1,$d1,eq + +#ifdef __ARMEB__ + ror $t0,$t0,#32 // flip nonce words + ror $t1,$t1,#32 +#endif + adds $h0,$h0,$t0 // accumulate nonce + adc $h1,$h1,$t1 +#ifdef __ARMEB__ + rev $h0,$h0 // flip output bytes + rev $h1,$h1 +#endif + stp $h0,$h1,[$mac] // write result + + ret +.size poly1305_emit,.-poly1305_emit +___ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8)); +my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); +my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); +my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); +my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); +my ($T0,$T1,$MASK) = map("v$_",(29..31)); + +my ($in2,$zeros)=("x16","x17"); +my $is_base2_26 = $zeros; # borrow + +$code.=<<___; +.type poly1305_mult,%function +.align 5 +poly1305_mult: + mul $d0,$h0,$r0 // h0*r0 + umulh $d1,$h0,$r0 + + mul $t0,$h1,$s1 // h1*5*r1 + umulh $t1,$h1,$s1 + + adds $d0,$d0,$t0 + mul $t0,$h0,$r1 // h0*r1 + adc $d1,$d1,$t1 + umulh $d2,$h0,$r1 + + adds $d1,$d1,$t0 + mul $t0,$h1,$r0 // h1*r0 + adc $d2,$d2,xzr + umulh $t1,$h1,$r0 + + adds $d1,$d1,$t0 + mul $t0,$h2,$s1 // h2*5*r1 + adc $d2,$d2,$t1 + mul $t1,$h2,$r0 // h2*r0 + + adds $d1,$d1,$t0 + adc $d2,$d2,$t1 + + and $t0,$d2,#-4 // final reduction + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$d0,$t0 + adcs $h1,$d1,xzr + adc $h2,$h2,xzr + + ret +.size poly1305_mult,.-poly1305_mult + +.type poly1305_splat,%function +.align 5 +poly1305_splat: + and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x13,$h0,#26,#26 + extr x14,$h1,$h0,#52 + and x14,x14,#0x03ffffff + ubfx x15,$h1,#14,#26 + extr x16,$h2,$h1,#40 + + str w12,[$ctx,#16*0] // r0 + add w12,w13,w13,lsl#2 // r1*5 + str w13,[$ctx,#16*1] // r1 + add w13,w14,w14,lsl#2 // r2*5 + str w12,[$ctx,#16*2] // s1 + str w14,[$ctx,#16*3] // r2 + add w14,w15,w15,lsl#2 // r3*5 + str w13,[$ctx,#16*4] // s2 + str w15,[$ctx,#16*5] // r3 + add w15,w16,w16,lsl#2 // r4*5 + str w14,[$ctx,#16*6] // s3 + str w16,[$ctx,#16*7] // r4 + str w15,[$ctx,#16*8] // s4 + + ret +.size poly1305_splat,.-poly1305_splat + +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: + ldr $is_base2_26,[$ctx,#24] + cmp $len,#128 + b.hs .Lblocks_neon + cbz $is_base2_26,poly1305_blocks + +.Lblocks_neon: + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + + ands $len,$len,#-16 + b.eq .Lno_data_neon + + cbz $is_base2_26,.Lbase2_64_neon + + ldp w10,w11,[$ctx] // load hash value base 2^26 + ldp w12,w13,[$ctx,#8] + ldr w14,[$ctx,#16] + + tst $len,#31 + b.eq .Leven_neon + + ldp $r0,$r1,[$ctx,#32] // load key value + + add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr $h1,x12,#12 + adds $h0,$h0,x12,lsl#52 + add $h1,$h1,x13,lsl#14 + adc $h1,$h1,xzr + lsr $h2,x14,#24 + adds $h1,$h1,x14,lsl#40 + adc $d2,$h2,xzr // can be partially reduced... + + ldp $d0,$d1,[$inp],#16 // load input + sub $len,$len,#16 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + + and $t0,$d2,#-4 // ... so reduce + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$h0,$t0 + adcs $h1,$h1,xzr + adc $h2,$h2,xzr + +#ifdef __ARMEB__ + rev $d0,$d0 + rev $d1,$d1 +#endif + adds $h0,$h0,$d0 // accumulate input + adcs $h1,$h1,$d1 + adc $h2,$h2,$padbit + + bl poly1305_mult + ldr x30,[sp,#8] + + cbz $padbit,.Lstore_base2_64_neon + + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + cbnz $len,.Leven_neon + + stp w10,w11,[$ctx] // store hash value base 2^26 + stp w12,w13,[$ctx,#8] + str w14,[$ctx,#16] + b .Lno_data_neon + +.align 4 +.Lstore_base2_64_neon: + stp $h0,$h1,[$ctx] // store hash value base 2^64 + stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed + b .Lno_data_neon + +.align 4 +.Lbase2_64_neon: + ldp $r0,$r1,[$ctx,#32] // load key value + + ldp $h0,$h1,[$ctx] // load hash value base 2^64 + ldr $h2,[$ctx,#16] + + tst $len,#31 + b.eq .Linit_neon + + ldp $d0,$d1,[$inp],#16 // load input + sub $len,$len,#16 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) +#ifdef __ARMEB__ + rev $d0,$d0 + rev $d1,$d1 +#endif + adds $h0,$h0,$d0 // accumulate input + adcs $h1,$h1,$d1 + adc $h2,$h2,$padbit + + bl poly1305_mult + +.Linit_neon: + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + stp d8,d9,[sp,#16] // meet ABI requirements + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + fmov ${H0},x10 + fmov ${H1},x11 + fmov ${H2},x12 + fmov ${H3},x13 + fmov ${H4},x14 + + ////////////////////////////////// initialize r^n table + mov $h0,$r0 // r^1 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + mov $h1,$r1 + mov $h2,xzr + add $ctx,$ctx,#48+12 + bl poly1305_splat + + bl poly1305_mult // r^2 + sub $ctx,$ctx,#4 + bl poly1305_splat + + bl poly1305_mult // r^3 + sub $ctx,$ctx,#4 + bl poly1305_splat + + bl poly1305_mult // r^4 + sub $ctx,$ctx,#4 + bl poly1305_splat + ldr x30,[sp,#8] + + add $in2,$inp,#32 + adr $zeros,.Lzeros + subs $len,$len,#64 + csel $in2,$zeros,$in2,lo + + mov x4,#1 + str x4,[$ctx,#-24] // set is_base2_26 + sub $ctx,$ctx,#48 // restore original $ctx + b .Ldo_neon + +.align 4 +.Leven_neon: + add $in2,$inp,#32 + adr $zeros,.Lzeros + subs $len,$len,#64 + csel $in2,$zeros,$in2,lo + + stp d8,d9,[sp,#16] // meet ABI requirements + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + fmov ${H0},x10 + fmov ${H1},x11 + fmov ${H2},x12 + fmov ${H3},x13 + fmov ${H4},x14 + +.Ldo_neon: + ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) + ldp x9,x13,[$in2],#48 + + lsl $padbit,$padbit,#24 + add x15,$ctx,#48 + +#ifdef __ARMEB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov $IN23_0,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,$padbit,x12,lsr#40 + add x13,$padbit,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov $IN23_1,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + fmov $IN23_2,x8 + fmov $IN23_3,x10 + fmov $IN23_4,x12 + + ldp x8,x12,[$inp],#16 // inp[0:1] + ldp x9,x13,[$inp],#48 + + ld1 {$R0,$R1,$S1,$R2},[x15],#64 + ld1 {$S2,$R3,$S3,$R4},[x15],#64 + ld1 {$S4},[x15] + +#ifdef __ARMEB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov $IN01_0,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,$padbit,x12,lsr#40 + add x13,$padbit,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov $IN01_1,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + movi $MASK.2d,#-1 + fmov $IN01_2,x8 + fmov $IN01_3,x10 + fmov $IN01_4,x12 + ushr $MASK.2d,$MASK.2d,#38 + + b.ls .Lskip_loop + +.align 4 +.Loop_neon: + //////////////////////////////////////////////////////////////// + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + // \___________________/ + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + // \___________________/ \____________________/ + // + // Note that we start with inp[2:3]*r^2. This is because it + // doesn't depend on reduction in previous iteration. + //////////////////////////////////////////////////////////////// + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 + + subs $len,$len,#64 + umull $ACC4,$IN23_0,${R4}[2] + csel $in2,$zeros,$in2,lo + umull $ACC3,$IN23_0,${R3}[2] + umull $ACC2,$IN23_0,${R2}[2] + ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) + umull $ACC1,$IN23_0,${R1}[2] + ldp x9,x13,[$in2],#48 + umull $ACC0,$IN23_0,${R0}[2] +#ifdef __ARMEB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + umlal $ACC4,$IN23_1,${R3}[2] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal $ACC3,$IN23_1,${R2}[2] + and x5,x9,#0x03ffffff + umlal $ACC2,$IN23_1,${R1}[2] + ubfx x6,x8,#26,#26 + umlal $ACC1,$IN23_1,${R0}[2] + ubfx x7,x9,#26,#26 + umlal $ACC0,$IN23_1,${S4}[2] + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + + umlal $ACC4,$IN23_2,${R2}[2] + extr x8,x12,x8,#52 + umlal $ACC3,$IN23_2,${R1}[2] + extr x9,x13,x9,#52 + umlal $ACC2,$IN23_2,${R0}[2] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal $ACC1,$IN23_2,${S4}[2] + fmov $IN23_0,x4 + umlal $ACC0,$IN23_2,${S3}[2] + and x8,x8,#0x03ffffff + + umlal $ACC4,$IN23_3,${R1}[2] + and x9,x9,#0x03ffffff + umlal $ACC3,$IN23_3,${R0}[2] + ubfx x10,x12,#14,#26 + umlal $ACC2,$IN23_3,${S4}[2] + ubfx x11,x13,#14,#26 + umlal $ACC1,$IN23_3,${S3}[2] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal $ACC0,$IN23_3,${S2}[2] + fmov $IN23_1,x6 + + add $IN01_2,$IN01_2,$H2 + add x12,$padbit,x12,lsr#40 + umlal $ACC4,$IN23_4,${R0}[2] + add x13,$padbit,x13,lsr#40 + umlal $ACC3,$IN23_4,${S4}[2] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal $ACC2,$IN23_4,${S3}[2] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal $ACC1,$IN23_4,${S2}[2] + fmov $IN23_2,x8 + umlal $ACC0,$IN23_4,${S1}[2] + fmov $IN23_3,x10 + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4 and accumulate + + add $IN01_0,$IN01_0,$H0 + fmov $IN23_4,x12 + umlal $ACC3,$IN01_2,${R1}[0] + ldp x8,x12,[$inp],#16 // inp[0:1] + umlal $ACC0,$IN01_2,${S3}[0] + ldp x9,x13,[$inp],#48 + umlal $ACC4,$IN01_2,${R2}[0] + umlal $ACC1,$IN01_2,${S4}[0] + umlal $ACC2,$IN01_2,${R0}[0] +#ifdef __ARMEB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + add $IN01_1,$IN01_1,$H1 + umlal $ACC3,$IN01_0,${R3}[0] + umlal $ACC4,$IN01_0,${R4}[0] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal $ACC2,$IN01_0,${R2}[0] + and x5,x9,#0x03ffffff + umlal $ACC0,$IN01_0,${R0}[0] + ubfx x6,x8,#26,#26 + umlal $ACC1,$IN01_0,${R1}[0] + ubfx x7,x9,#26,#26 + + add $IN01_3,$IN01_3,$H3 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + umlal $ACC3,$IN01_1,${R2}[0] + extr x8,x12,x8,#52 + umlal $ACC4,$IN01_1,${R3}[0] + extr x9,x13,x9,#52 + umlal $ACC0,$IN01_1,${S4}[0] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal $ACC2,$IN01_1,${R1}[0] + fmov $IN01_0,x4 + umlal $ACC1,$IN01_1,${R0}[0] + and x8,x8,#0x03ffffff + + add $IN01_4,$IN01_4,$H4 + and x9,x9,#0x03ffffff + umlal $ACC3,$IN01_3,${R0}[0] + ubfx x10,x12,#14,#26 + umlal $ACC0,$IN01_3,${S2}[0] + ubfx x11,x13,#14,#26 + umlal $ACC4,$IN01_3,${R1}[0] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal $ACC1,$IN01_3,${S3}[0] + fmov $IN01_1,x6 + umlal $ACC2,$IN01_3,${S4}[0] + add x12,$padbit,x12,lsr#40 + + umlal $ACC3,$IN01_4,${S4}[0] + add x13,$padbit,x13,lsr#40 + umlal $ACC0,$IN01_4,${S1}[0] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal $ACC4,$IN01_4,${R0}[0] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal $ACC1,$IN01_4,${S2}[0] + fmov $IN01_2,x8 + umlal $ACC2,$IN01_4,${S3}[0] + fmov $IN01_3,x10 + fmov $IN01_4,x12 + + ///////////////////////////////////////////////////////////////// + // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + // and P. Schwabe + // + // [see discussion in poly1305-armv4 module] + + ushr $T0.2d,$ACC3,#26 + xtn $H3,$ACC3 + ushr $T1.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + add $ACC4,$ACC4,$T0.2d // h3 -> h4 + bic $H3,#0xfc,lsl#24 // &=0x03ffffff + add $ACC1,$ACC1,$T1.2d // h0 -> h1 + + ushr $T0.2d,$ACC4,#26 + xtn $H4,$ACC4 + ushr $T1.2d,$ACC1,#26 + xtn $H1,$ACC1 + bic $H4,#0xfc,lsl#24 + add $ACC2,$ACC2,$T1.2d // h1 -> h2 + + add $ACC0,$ACC0,$T0.2d + shl $T0.2d,$T0.2d,#2 + shrn $T1.2s,$ACC2,#26 + xtn $H2,$ACC2 + add $ACC0,$ACC0,$T0.2d // h4 -> h0 + bic $H1,#0xfc,lsl#24 + add $H3,$H3,$T1.2s // h2 -> h3 + bic $H2,#0xfc,lsl#24 + + shrn $T0.2s,$ACC0,#26 + xtn $H0,$ACC0 + ushr $T1.2s,$H3,#26 + bic $H3,#0xfc,lsl#24 + bic $H0,#0xfc,lsl#24 + add $H1,$H1,$T0.2s // h0 -> h1 + add $H4,$H4,$T1.2s // h3 -> h4 + + b.hi .Loop_neon + +.Lskip_loop: + dup $IN23_2,${IN23_2}[0] + add $IN01_2,$IN01_2,$H2 + + //////////////////////////////////////////////////////////////// + // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + adds $len,$len,#32 + b.ne .Long_tail + + dup $IN23_2,${IN01_2}[0] + add $IN23_0,$IN01_0,$H0 + add $IN23_3,$IN01_3,$H3 + add $IN23_1,$IN01_1,$H1 + add $IN23_4,$IN01_4,$H4 + +.Long_tail: + dup $IN23_0,${IN23_0}[0] + umull2 $ACC0,$IN23_2,${S3} + umull2 $ACC3,$IN23_2,${R1} + umull2 $ACC4,$IN23_2,${R2} + umull2 $ACC2,$IN23_2,${R0} + umull2 $ACC1,$IN23_2,${S4} + + dup $IN23_1,${IN23_1}[0] + umlal2 $ACC0,$IN23_0,${R0} + umlal2 $ACC2,$IN23_0,${R2} + umlal2 $ACC3,$IN23_0,${R3} + umlal2 $ACC4,$IN23_0,${R4} + umlal2 $ACC1,$IN23_0,${R1} + + dup $IN23_3,${IN23_3}[0] + umlal2 $ACC0,$IN23_1,${S4} + umlal2 $ACC3,$IN23_1,${R2} + umlal2 $ACC2,$IN23_1,${R1} + umlal2 $ACC4,$IN23_1,${R3} + umlal2 $ACC1,$IN23_1,${R0} + + dup $IN23_4,${IN23_4}[0] + umlal2 $ACC3,$IN23_3,${R0} + umlal2 $ACC4,$IN23_3,${R1} + umlal2 $ACC0,$IN23_3,${S2} + umlal2 $ACC1,$IN23_3,${S3} + umlal2 $ACC2,$IN23_3,${S4} + + umlal2 $ACC3,$IN23_4,${S4} + umlal2 $ACC0,$IN23_4,${S1} + umlal2 $ACC4,$IN23_4,${R0} + umlal2 $ACC1,$IN23_4,${S2} + umlal2 $ACC2,$IN23_4,${S3} + + b.eq .Lshort_tail + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4:r^3 and accumulate + + add $IN01_0,$IN01_0,$H0 + umlal $ACC3,$IN01_2,${R1} + umlal $ACC0,$IN01_2,${S3} + umlal $ACC4,$IN01_2,${R2} + umlal $ACC1,$IN01_2,${S4} + umlal $ACC2,$IN01_2,${R0} + + add $IN01_1,$IN01_1,$H1 + umlal $ACC3,$IN01_0,${R3} + umlal $ACC0,$IN01_0,${R0} + umlal $ACC4,$IN01_0,${R4} + umlal $ACC1,$IN01_0,${R1} + umlal $ACC2,$IN01_0,${R2} + + add $IN01_3,$IN01_3,$H3 + umlal $ACC3,$IN01_1,${R2} + umlal $ACC0,$IN01_1,${S4} + umlal $ACC4,$IN01_1,${R3} + umlal $ACC1,$IN01_1,${R0} + umlal $ACC2,$IN01_1,${R1} + + add $IN01_4,$IN01_4,$H4 + umlal $ACC3,$IN01_3,${R0} + umlal $ACC0,$IN01_3,${S2} + umlal $ACC4,$IN01_3,${R1} + umlal $ACC1,$IN01_3,${S3} + umlal $ACC2,$IN01_3,${S4} + + umlal $ACC3,$IN01_4,${S4} + umlal $ACC0,$IN01_4,${S1} + umlal $ACC4,$IN01_4,${R0} + umlal $ACC1,$IN01_4,${S2} + umlal $ACC2,$IN01_4,${S3} + +.Lshort_tail: + //////////////////////////////////////////////////////////////// + // horizontal add + + addp $ACC3,$ACC3,$ACC3 + ldp d8,d9,[sp,#16] // meet ABI requirements + addp $ACC0,$ACC0,$ACC0 + ldp d10,d11,[sp,#32] + addp $ACC4,$ACC4,$ACC4 + ldp d12,d13,[sp,#48] + addp $ACC1,$ACC1,$ACC1 + ldp d14,d15,[sp,#64] + addp $ACC2,$ACC2,$ACC2 + + //////////////////////////////////////////////////////////////// + // lazy reduction, but without narrowing + + ushr $T0.2d,$ACC3,#26 + and $ACC3,$ACC3,$MASK.2d + ushr $T1.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + + add $ACC4,$ACC4,$T0.2d // h3 -> h4 + add $ACC1,$ACC1,$T1.2d // h0 -> h1 + + ushr $T0.2d,$ACC4,#26 + and $ACC4,$ACC4,$MASK.2d + ushr $T1.2d,$ACC1,#26 + and $ACC1,$ACC1,$MASK.2d + add $ACC2,$ACC2,$T1.2d // h1 -> h2 + + add $ACC0,$ACC0,$T0.2d + shl $T0.2d,$T0.2d,#2 + ushr $T1.2d,$ACC2,#26 + and $ACC2,$ACC2,$MASK.2d + add $ACC0,$ACC0,$T0.2d // h4 -> h0 + add $ACC3,$ACC3,$T1.2d // h2 -> h3 + + ushr $T0.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + ushr $T1.2d,$ACC3,#26 + and $ACC3,$ACC3,$MASK.2d + add $ACC1,$ACC1,$T0.2d // h0 -> h1 + add $ACC4,$ACC4,$T1.2d // h3 -> h4 + + //////////////////////////////////////////////////////////////// + // write the result, can be partially reduced + + st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 + st1 {$ACC4}[0],[$ctx] + +.Lno_data_neon: + .inst 0xd50323bf // autiasp + ldr x29,[sp],#80 + ret +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.type poly1305_emit_neon,%function +.align 5 +poly1305_emit_neon: + ldr $is_base2_26,[$ctx,#24] + cbz $is_base2_26,poly1305_emit + + ldp w10,w11,[$ctx] // load hash value base 2^26 + ldp w12,w13,[$ctx,#8] + ldr w14,[$ctx,#16] + + add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr $h1,x12,#12 + adds $h0,$h0,x12,lsl#52 + add $h1,$h1,x13,lsl#14 + adc $h1,$h1,xzr + lsr $h2,x14,#24 + adds $h1,$h1,x14,lsl#40 + adc $h2,$h2,xzr // can be partially reduced... + + ldp $t0,$t1,[$nonce] // load nonce + + and $d0,$h2,#-4 // ... so reduce + add $d0,$d0,$h2,lsr#2 + and $h2,$h2,#3 + adds $h0,$h0,$d0 + adcs $h1,$h1,xzr + adc $h2,$h2,xzr + + adds $d0,$h0,#5 // compare to modulus + adcs $d1,$h1,xzr + adc $d2,$h2,xzr + + tst $d2,#-4 // see if it's carried/borrowed + + csel $h0,$h0,$d0,eq + csel $h1,$h1,$d1,eq + +#ifdef __ARMEB__ + ror $t0,$t0,#32 // flip nonce words + ror $t1,$t1,#32 +#endif + adds $h0,$h0,$t0 // accumulate nonce + adc $h1,$h1,$t1 +#ifdef __ARMEB__ + rev $h0,$h0 // flip output bytes + rev $h1,$h1 +#endif + stp $h0,$h1,[$mac] // write result + + ret +.size poly1305_emit_neon,.-poly1305_emit_neon + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0 +.LOPENSSL_armcap_P: +#ifdef __ILP32__ +.long OPENSSL_armcap_P-. +#else +.quad OPENSSL_armcap_P-. +#endif +.asciz "Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +___ + +foreach (split("\n",$code)) { + s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or + s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or + (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or + (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or + (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or + (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or + (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1)); + + s/\.[124]([sd])\[/.$1\[/; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-c64xplus.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-c64xplus.pl new file mode 100755 index 000000000..93fef37e6 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-c64xplus.pl @@ -0,0 +1,331 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# Poly1305 hash for C64x+. +# +# October 2015 +# +# Performance is [incredible for a 32-bit processor] 1.82 cycles per +# processed byte. Comparison to compiler-generated code is problematic, +# because results were observed to vary from 2.1 to 7.6 cpb depending +# on compiler's ability to inline small functions. Compiler also +# disables interrupts for some reason, thus making interrupt response +# time dependent on input length. This module on the other hand is free +# from such limitation. + +$output=pop; +open STDOUT,">$output"; + +($CTXA,$INPB,$LEN,$PADBIT)=("A4","B4","A6","B6"); +($H0,$H1,$H2,$H3,$H4,$H4a)=("A8","B8","A10","B10","B2",$LEN); +($D0,$D1,$D2,$D3)= ("A9","B9","A11","B11"); +($R0,$R1,$R2,$R3,$S1,$S2,$S3,$S3b)=("A0","B0","A1","B1","A12","B12","A13","B13"); +($THREE,$R0b,$S2a)=("B7","B5","A5"); + +$code.=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .asg poly1305_init,_poly1305_init + .asg poly1305_blocks,_poly1305_blocks + .asg poly1305_emit,_poly1305_emit + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .LITTLE_ENDIAN + .asg MV,SWAP2 + .asg MV.L,SWAP4 + .endif + + .global _poly1305_init +_poly1305_init: + .asmfunc + LDNDW *${INPB}[0],B17:B16 ; load key material + LDNDW *${INPB}[1],A17:A16 + +|| ZERO B9:B8 +|| MVK -1,B0 + STDW B9:B8,*${CTXA}[0] ; initialize h1:h0 +|| SHRU B0,4,B0 ; 0x0fffffff +|| MVK -4,B1 + STDW B9:B8,*${CTXA}[1] ; initialize h3:h2 +|| AND B0,B1,B1 ; 0x0ffffffc + STW B8,*${CTXA}[4] ; initialize h4 + + .if .BIG_ENDIAN + SWAP2 B16,B17 +|| SWAP2 B17,B16 + SWAP2 A16,A17 +|| SWAP2 A17,A16 + SWAP4 B16,B16 +|| SWAP4 A16,A16 + SWAP4 B17,B17 +|| SWAP4 A17,A17 + .endif + + AND B16,B0,B20 ; r0 = key[0] & 0x0fffffff +|| AND B17,B1,B22 ; r1 = key[1] & 0x0ffffffc +|| EXTU B17,4,6,B16 ; r1>>2 + AND A16,B1,B21 ; r2 = key[2] & 0x0ffffffc +|| AND A17,B1,A23 ; r3 = key[3] & 0x0ffffffc +|| BNOP RA + SHRU B21,2,B18 +|| ADD B22,B16,B16 ; s1 = r1 + r1>>2 + + STDW B21:B20,*${CTXA}[3] ; save r2:r0 +|| ADD B21,B18,B18 ; s2 = r2 + r2>>2 +|| SHRU A23,2,B17 +|| MV A23,B23 + STDW B23:B22,*${CTXA}[4] ; save r3:r1 +|| ADD B23,B17,B19 ; s3 = r3 + r3>>2 +|| ADD B23,B17,B17 ; s3 = r3 + r3>>2 + STDW B17:B16,*${CTXA}[5] ; save s3:s1 + STDW B19:B18,*${CTXA}[6] ; save s3:s2 +|| ZERO A4 ; return 0 + .endasmfunc + + .global _poly1305_blocks + .align 32 +_poly1305_blocks: + .asmfunc stack_usage(40) + SHRU $LEN,4,A2 ; A2 is loop counter, number of blocks + [!A2] BNOP RA ; no data +|| [A2] STW FP,*SP--(40) ; save frame pointer and alloca(40) +|| [A2] MV SP,FP + [A2] STDW B13:B12,*SP[4] ; ABI says so +|| [A2] MV $CTXA,$S3b ; borrow $S3b + [A2] STDW B11:B10,*SP[3] +|| [A2] STDW A13:A12,*FP[-3] + [A2] STDW A11:A10,*FP[-4] + +|| [A2] LDDW *${S3b}[0],B25:B24 ; load h1:h0 + [A2] LDNW *${INPB}++[4],$D0 ; load inp[0] + [A2] LDNW *${INPB}[-3],$D1 ; load inp[1] + + LDDW *${CTXA}[1],B29:B28 ; load h3:h2, B28 is h2 + LDNW *${INPB}[-2],$D2 ; load inp[2] + LDNW *${INPB}[-1],$D3 ; load inp[3] + + LDDW *${CTXA}[3],$R2:$R0 ; load r2:r0 +|| LDDW *${S3b}[4],$R3:$R1 ; load r3:r1 +|| SWAP2 $D0,$D0 + + LDDW *${CTXA}[5],$S3:$S1 ; load s3:s1 +|| LDDW *${S3b}[6],$S3b:$S2 ; load s3:s2 +|| SWAP4 $D0,$D0 +|| SWAP2 $D1,$D1 + + ADDU $D0,B24,$D0:$H0 ; h0+=inp[0] +|| ADD $D0,B24,B27 ; B-copy of h0+inp[0] +|| SWAP4 $D1,$D1 + ADDU $D1,B25,$D1:$H1 ; h1+=inp[1] +|| MVK 3,$THREE +|| SWAP2 $D2,$D2 + LDW *${CTXA}[4],$H4 ; load h4 +|| SWAP4 $D2,$D2 +|| MV B29,B30 ; B30 is h3 + MV $R0,$R0b + +loop?: + MPY32U $H0,$R0,A17:A16 +|| MPY32U B27,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16 +|| ADDU $D0,$D1:$H1,B25:B24 ; ADDU $D0,$D1:$H1,$D1:$H1 +|| ADDU $D2,B28,$D2:$H2 ; h2+=inp[2] +|| SWAP2 $D3,$D3 + MPY32U $H0,$R2,A19:A18 +|| MPY32U B27,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18 +|| ADD $D0,$H1,A24 ; A-copy of B24 +|| SWAP4 $D3,$D3 +|| [A2] SUB A2,1,A2 ; decrement loop counter + + MPY32U A24,$S3,A21:A20 ; MPY32U $H1,$S3,A21:A20 +|| MPY32U B24,$R0b,B21:B20 ; MPY32U $H1,$R0,B21:B20 +|| ADDU B25,$D2:$H2,$D2:$H2 ; ADDU $D1,$D2:$H2,$D2:$H2 +|| ADDU $D3,B30,$D3:$H3 ; h3+=inp[3] +|| ADD B25,$H2,B25 ; B-copy of $H2 + MPY32U A24,$R1,A23:A22 ; MPY32U $H1,$R1,A23:A22 +|| MPY32U B24,$R2,B23:B22 ; MPY32U $H1,$R2,B23:B22 + + MPY32U $H2,$S2,A25:A24 +|| MPY32U B25,$S3b,B25:B24 ; MPY32U $H2,$S3,B25:B24 +|| ADDU $D2,$D3:$H3,$D3:$H3 +|| ADD $PADBIT,$H4,$H4 ; h4+=padbit + MPY32U $H2,$R0,A27:A26 +|| MPY32U $H2,$R1,B27:B26 +|| ADD $D3,$H4,$H4 +|| MV $S2,$S2a + + MPY32U $H3,$S1,A29:A28 +|| MPY32U $H3,$S2,B29:B28 +|| ADD A21,A17,A21 ; start accumulating "d3:d0" +|| ADD B21,B17,B21 +|| ADDU A20,A16,A17:A16 +|| ADDU B20,B16,B17:B16 +|| [A2] LDNW *${INPB}++[4],$D0 ; load inp[0] + MPY32U $H3,$S3,A31:A30 +|| MPY32U $H3,$R0b,B31:B30 +|| ADD A23,A19,A23 +|| ADD B23,B19,B23 +|| ADDU A22,A18,A19:A18 +|| ADDU B22,B18,B19:B18 +|| [A2] LDNW *${INPB}[-3],$D1 ; load inp[1] + + MPY32 $H4,$S1,B20 +|| MPY32 $H4,$S2a,A20 +|| ADD A25,A21,A21 +|| ADD B25,B21,B21 +|| ADDU A24,A17:A16,A17:A16 +|| ADDU B24,B17:B16,B17:B16 +|| [A2] LDNW *${INPB}[-2],$D2 ; load inp[2] + MPY32 $H4,$S3b,B22 +|| ADD A27,A23,A23 +|| ADD B27,B23,B23 +|| ADDU A26,A19:A18,A19:A18 +|| ADDU B26,B19:B18,B19:B18 +|| [A2] LDNW *${INPB}[-1],$D3 ; load inp[3] + + MPY32 $H4,$R0b,$H4 +|| ADD A29,A21,A21 ; final hi("d0") +|| ADD B29,B21,B21 ; final hi("d1") +|| ADDU A28,A17:A16,A17:A16 ; final lo("d0") +|| ADDU B28,B17:B16,B17:B16 + ADD A31,A23,A23 ; final hi("d2") +|| ADD B31,B23,B23 ; final hi("d3") +|| ADDU A30,A19:A18,A19:A18 +|| ADDU B30,B19:B18,B19:B18 + ADDU B20,B17:B16,B17:B16 ; final lo("d1") +|| ADDU A20,A19:A18,A19:A18 ; final lo("d2") + ADDU B22,B19:B18,B19:B18 ; final lo("d3") + +|| ADD A17,A21,A21 ; "flatten" "d3:d0" + MV A19,B29 ; move to avoid cross-path stalls + ADDU A21,B17:B16,B27:B26 ; B26 is h1 + ADD B21,B27,B27 +|| DMV B29,A18,B29:B28 ; move to avoid cross-path stalls + ADDU B27,B29:B28,B29:B28 ; B28 is h2 +|| [A2] SWAP2 $D0,$D0 + ADD A23,B29,B29 +|| [A2] SWAP4 $D0,$D0 + ADDU B29,B19:B18,B31:B30 ; B30 is h3 + ADD B23,B31,B31 +|| MV A16,B24 ; B24 is h0 +|| [A2] SWAP2 $D1,$D1 + ADD B31,$H4,$H4 +|| [A2] SWAP4 $D1,$D1 + + SHRU $H4,2,B16 ; last reduction step +|| AND $H4,$THREE,$H4 + ADDAW B16,B16,B16 ; 5*(h4>>2) +|| [A2] BNOP loop? + + ADDU B24,B16,B25:B24 ; B24 is h0 +|| [A2] SWAP2 $D2,$D2 + ADDU B26,B25,B27:B26 ; B26 is h1 +|| [A2] SWAP4 $D2,$D2 + ADDU B28,B27,B29:B28 ; B28 is h2 +|| [A2] ADDU $D0,B24,$D0:$H0 ; h0+=inp[0] +|| [A2] ADD $D0,B24,B27 ; B-copy of h0+inp[0] + ADDU B30,B29,B31:B30 ; B30 is h3 + ADD B31,$H4,$H4 +|| [A2] ADDU $D1,B26,$D1:$H1 ; h1+=inp[1] +;;===== branch to loop? is taken here + + LDDW *FP[-4],A11:A10 ; ABI says so + LDDW *FP[-3],A13:A12 +|| LDDW *SP[3],B11:B10 + LDDW *SP[4],B13:B12 +|| MV B26,B25 +|| BNOP RA + LDW *++SP(40),FP ; restore frame pointer +|| MV B30,B29 + STDW B25:B24,*${CTXA}[0] ; save h1:h0 + STDW B29:B28,*${CTXA}[1] ; save h3:h2 + STW $H4,*${CTXA}[4] ; save h4 + NOP 1 + .endasmfunc +___ +{ +my ($MAC,$NONCEA,$NONCEB)=($INPB,$LEN,$PADBIT); + +$code.=<<___; + .global _poly1305_emit + .align 32 +_poly1305_emit: + .asmfunc + LDDW *${CTXA}[0],A17:A16 ; load h1:h0 + LDDW *${CTXA}[1],A19:A18 ; load h3:h2 + LDW *${CTXA}[4],A20 ; load h4 + MV $NONCEA,$NONCEB + + MVK 5,A22 ; compare to modulus + ADDU A16,A22,A23:A22 +|| LDW *${NONCEA}[0],A8 +|| LDW *${NONCEB}[1],B8 + ADDU A17,A23,A25:A24 +|| LDW *${NONCEA}[2],A9 +|| LDW *${NONCEB}[3],B9 + ADDU A19,A25,A27:A26 + ADDU A19,A27,A29:A28 + ADD A20,A29,A29 + + SHRU A29,2,A2 ; check for overflow in 130-th bit + + [A2] MV A22,A16 ; select +|| [A2] MV A24,A17 + [A2] MV A26,A18 +|| [A2] MV A28,A19 + +|| ADDU A8,A16,A23:A22 ; accumulate nonce + ADDU B8,A17,A25:A24 +|| SWAP2 A22,A22 + ADDU A23,A25:A24,A25:A24 + ADDU A9,A18,A27:A26 +|| SWAP2 A24,A24 + ADDU A25,A27:A26,A27:A26 +|| ADD B9,A19,A28 + ADD A27,A28,A28 +|| SWAP2 A26,A26 + + .if .BIG_ENDIAN + SWAP2 A28,A28 +|| SWAP4 A22,A22 +|| SWAP4 A24,B24 + SWAP4 A26,A26 + SWAP4 A28,A28 +|| MV B24,A24 + .endif + + BNOP RA,1 + STNW A22,*${MAC}[0] ; write the result + STNW A24,*${MAC}[1] + STNW A26,*${MAC}[2] + STNW A28,*${MAC}[3] + .endasmfunc +___ +} +$code.=<<___; + .sect .const + .cstring "Poly1305 for C64x+, CRYPTOGAMS by <appro\@openssl.org>" + .align 4 +___ + +print $code; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-mips.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-mips.pl new file mode 100755 index 000000000..965825dc3 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-mips.pl @@ -0,0 +1,437 @@ +#! /usr/bin/env perl +# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# Poly1305 hash for MIPS64. +# +# May 2016 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc +# R1x000 5.64/+120% (big-endian) +# Octeon II 3.80/+280% (little-endian) + +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp [o32 can be +# excluded from the rule, because it's specified volatile]; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +# <appro@openssl.org> +# +###################################################################### + +$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 + +die "MIPS64 only" unless ($flavour =~ /64|n32/i); + +$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; + +($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); + +$code.=<<___; +#include "mips_arch.h" + +#ifdef MIPSEB +# define MSB 0 +# define LSB 7 +#else +# define MSB 7 +# define LSB 0 +#endif + +.text +.set noat +.set noreorder + +.align 5 +.globl poly1305_init +.ent poly1305_init +poly1305_init: + .frame $sp,0,$ra + .set reorder + + sd $zero,0($ctx) + sd $zero,8($ctx) + sd $zero,16($ctx) + + beqz $inp,.Lno_key + +#if defined(_MIPS_ARCH_MIPS64R6) + ld $in0,0($inp) + ld $in1,8($inp) +#else + ldl $in0,0+MSB($inp) + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + or $in1,$tmp3 +# endif +#endif + li $tmp0,1 + dsll $tmp0,32 + daddiu $tmp0,-63 + dsll $tmp0,28 + daddiu $tmp0,-1 # 0ffffffc0fffffff + + and $in0,$tmp0 + daddiu $tmp0,-3 # 0ffffffc0ffffffc + and $in1,$tmp0 + + sd $in0,24($ctx) + dsrl $tmp0,$in1,2 + sd $in1,32($ctx) + daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) + sd $tmp0,40($ctx) + +.Lno_key: + li $v0,0 # return 0 + jr $ra +.end poly1305_init +___ +{ +my ($h0,$h1,$h2,$r0,$r1,$s1,$d0,$d1,$d2) = + ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); + +$code.=<<___; +.align 5 +.globl poly1305_blocks +.ent poly1305_blocks +poly1305_blocks: + .set noreorder + dsrl $len,4 # number of complete blocks + bnez $len,poly1305_blocks_internal + nop + jr $ra + nop +.end poly1305_blocks + +.align 5 +.ent poly1305_blocks_internal +poly1305_blocks_internal: + .frame $sp,6*8,$ra + .mask $SAVED_REGS_MASK,-8 + .set noreorder + dsubu $sp,6*8 + sd $s5,40($sp) + sd $s4,32($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + sd $s3,24($sp) + sd $s2,16($sp) + sd $s1,8($sp) + sd $s0,0($sp) +___ +$code.=<<___; + .set reorder + + ld $h0,0($ctx) # load hash value + ld $h1,8($ctx) + ld $h2,16($ctx) + + ld $r0,24($ctx) # load key + ld $r1,32($ctx) + ld $s1,40($ctx) + +.Loop: +#if defined(_MIPS_ARCH_MIPS64R6) + ld $in0,0($inp) # load input + ld $in1,8($inp) +#else + ldl $in0,0+MSB($inp) # load input + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#endif + daddiu $len,-1 + daddiu $inp,16 +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + or $in1,$tmp3 +# endif +#endif + daddu $h0,$in0 # accumulate input + daddu $h1,$in1 + sltu $tmp0,$h0,$in0 + sltu $tmp1,$h1,$in1 + daddu $h1,$tmp0 + + dmultu ($r0,$h0) # h0*r0 + daddu $h2,$padbit + sltu $tmp0,$h1,$tmp0 + mflo ($d0,$r0,$h0) + mfhi ($d1,$r0,$h0) + + dmultu ($s1,$h1) # h1*5*r1 + daddu $tmp0,$tmp1 + daddu $h2,$tmp0 + mflo ($tmp0,$s1,$h1) + mfhi ($tmp1,$s1,$h1) + + dmultu ($r1,$h0) # h0*r1 + daddu $d0,$tmp0 + daddu $d1,$tmp1 + mflo ($tmp2,$r1,$h0) + mfhi ($d2,$r1,$h0) + sltu $tmp0,$d0,$tmp0 + daddu $d1,$tmp0 + + dmultu ($r0,$h1) # h1*r0 + daddu $d1,$tmp2 + sltu $tmp2,$d1,$tmp2 + mflo ($tmp0,$r0,$h1) + mfhi ($tmp1,$r0,$h1) + daddu $d2,$tmp2 + + dmultu ($s1,$h2) # h2*5*r1 + daddu $d1,$tmp0 + daddu $d2,$tmp1 + mflo ($tmp2,$s1,$h2) + + dmultu ($r0,$h2) # h2*r0 + sltu $tmp0,$d1,$tmp0 + daddu $d2,$tmp0 + mflo ($tmp3,$r0,$h2) + + daddu $d1,$tmp2 + daddu $d2,$tmp3 + sltu $tmp2,$d1,$tmp2 + daddu $d2,$tmp2 + + li $tmp0,-4 # final reduction + and $tmp0,$d2 + dsrl $tmp1,$d2,2 + andi $h2,$d2,3 + daddu $tmp0,$tmp1 + daddu $h0,$d0,$tmp0 + sltu $tmp0,$h0,$tmp0 + daddu $h1,$d1,$tmp0 + sltu $tmp0,$h1,$tmp0 + daddu $h2,$h2,$tmp0 + + bnez $len,.Loop + + sd $h0,0($ctx) # store hash value + sd $h1,8($ctx) + sd $h2,16($ctx) + + .set noreorder + ld $s5,40($sp) # epilogue + ld $s4,32($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue + ld $s3,24($sp) + ld $s2,16($sp) + ld $s1,8($sp) + ld $s0,0($sp) +___ +$code.=<<___; + jr $ra + daddu $sp,6*8 +.end poly1305_blocks_internal +___ +} +{ +my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); + +$code.=<<___; +.align 5 +.globl poly1305_emit +.ent poly1305_emit +poly1305_emit: + .frame $sp,0,$ra + .set reorder + + ld $tmp0,0($ctx) + ld $tmp1,8($ctx) + ld $tmp2,16($ctx) + + daddiu $in0,$tmp0,5 # compare to modulus + sltiu $tmp3,$in0,5 + daddu $in1,$tmp1,$tmp3 + sltu $tmp3,$in1,$tmp3 + daddu $tmp2,$tmp2,$tmp3 + + dsrl $tmp2,2 # see if it carried/borrowed + dsubu $tmp2,$zero,$tmp2 + nor $tmp3,$zero,$tmp2 + + and $in0,$tmp2 + and $tmp0,$tmp3 + and $in1,$tmp2 + and $tmp1,$tmp3 + or $in0,$tmp0 + or $in1,$tmp1 + + lwu $tmp0,0($nonce) # load nonce + lwu $tmp1,4($nonce) + lwu $tmp2,8($nonce) + lwu $tmp3,12($nonce) + dsll $tmp1,32 + dsll $tmp3,32 + or $tmp0,$tmp1 + or $tmp2,$tmp3 + + daddu $in0,$tmp0 # accumulate nonce + daddu $in1,$tmp2 + sltu $tmp0,$in0,$tmp0 + daddu $in1,$tmp0 + + dsrl $tmp0,$in0,8 # write mac value + dsrl $tmp1,$in0,16 + dsrl $tmp2,$in0,24 + sb $in0,0($mac) + dsrl $tmp3,$in0,32 + sb $tmp0,1($mac) + dsrl $tmp0,$in0,40 + sb $tmp1,2($mac) + dsrl $tmp1,$in0,48 + sb $tmp2,3($mac) + dsrl $tmp2,$in0,56 + sb $tmp3,4($mac) + dsrl $tmp3,$in1,8 + sb $tmp0,5($mac) + dsrl $tmp0,$in1,16 + sb $tmp1,6($mac) + dsrl $tmp1,$in1,24 + sb $tmp2,7($mac) + + sb $in1,8($mac) + dsrl $tmp2,$in1,32 + sb $tmp3,9($mac) + dsrl $tmp3,$in1,40 + sb $tmp0,10($mac) + dsrl $tmp0,$in1,48 + sb $tmp1,11($mac) + dsrl $tmp1,$in1,56 + sb $tmp2,12($mac) + sb $tmp3,13($mac) + sb $tmp0,14($mac) + sb $tmp1,15($mac) + + jr $ra +.end poly1305_emit +.rdata +.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +___ +} + +$output=pop and open STDOUT,">$output"; +print $code; +close STDOUT or die "error closing STDOUT: $!"; + diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-ppc.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-ppc.pl new file mode 100755 index 000000000..e5d6933ac --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-ppc.pl @@ -0,0 +1,645 @@ +#! /usr/bin/env perl +# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for PowerPC. +# +# June 2015 +# +# Numbers are cycles per processed byte with poly1305_blocks alone, +# and improvement coefficients relative to gcc-generated code. +# +# -m32 -m64 +# +# Freescale e300 14.8/+80% - +# PPC74x0 7.60/+60% - +# PPC970 7.00/+114% 3.51/+205% +# POWER7 3.75/+260% 1.93/+100% +# POWER8 - 2.03/+200% +# POWER9 - 2.00/+150% +# +# Do we need floating-point implementation for PPC? Results presented +# in poly1305_ieee754.c are tricky to compare to, because they are for +# compiler-generated code. On the other hand it's known that floating- +# point performance can be dominated by FPU latency, which means that +# there is limit even for ideally optimized (and even vectorized) code. +# And this limit is estimated to be higher than above -m64 results. Or +# in other words floating-point implementation can be meaningful to +# consider only in 32-bit application context. We probably have to +# recognize that 32-bit builds are getting less popular on high-end +# systems and therefore tend to target embedded ones, which might not +# even have FPU... +# +# On side note, Power ISA 2.07 enables vector base 2^26 implementation, +# and POWER8 might have capacity to break 1.0 cycle per byte barrier... + +$flavour = shift; + +if ($flavour =~ /64/) { + $SIZE_T =8; + $LRSAVE =2*$SIZE_T; + $UCMP ="cmpld"; + $STU ="stdu"; + $POP ="ld"; + $PUSH ="std"; +} elsif ($flavour =~ /32/) { + $SIZE_T =4; + $LRSAVE =$SIZE_T; + $UCMP ="cmplw"; + $STU ="stwu"; + $POP ="lwz"; + $PUSH ="stw"; +} else { die "nonsense $flavour"; } + +# Define endianness based on flavour +# i.e.: linux64le +$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; + +$FRAME=24*$SIZE_T; + +$sp="r1"; +my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6)); +my ($mac,$nonce)=($inp,$len); +my $mask = "r0"; + +$code=<<___; +.machine "any" +.text +___ + if ($flavour =~ /64/) { +############################################################################### +# base 2^64 implementation + +my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31)); + +$code.=<<___; +.globl .poly1305_init_int +.align 4 +.poly1305_init_int: + xor r0,r0,r0 + std r0,0($ctx) # zero hash value + std r0,8($ctx) + std r0,16($ctx) + + $UCMP $inp,r0 + beq- Lno_key +___ +$code.=<<___ if ($LITTLE_ENDIAN); + ld $d0,0($inp) # load key material + ld $d1,8($inp) +___ +$code.=<<___ if (!$LITTLE_ENDIAN); + li $h0,4 + lwbrx $d0,0,$inp # load key material + li $d1,8 + lwbrx $h0,$h0,$inp + li $h1,12 + lwbrx $d1,$d1,$inp + lwbrx $h1,$h1,$inp + insrdi $d0,$h0,32,0 + insrdi $d1,$h1,32,0 +___ +$code.=<<___; + lis $h1,0xfff # 0x0fff0000 + ori $h1,$h1,0xfffc # 0x0ffffffc + insrdi $h1,$h1,32,0 # 0x0ffffffc0ffffffc + ori $h0,$h1,3 # 0x0ffffffc0fffffff + + and $d0,$d0,$h0 + and $d1,$d1,$h1 + + std $d0,32($ctx) # store key + std $d1,40($ctx) + +Lno_key: + xor r3,r3,r3 + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 +.size .poly1305_init_int,.-.poly1305_init_int + +.globl .poly1305_blocks +.align 4 +.poly1305_blocks: + srdi. $len,$len,4 + beq- Labort + + $STU $sp,-$FRAME($sp) + mflr r0 + $PUSH r27,`$FRAME-$SIZE_T*5`($sp) + $PUSH r28,`$FRAME-$SIZE_T*4`($sp) + $PUSH r29,`$FRAME-$SIZE_T*3`($sp) + $PUSH r30,`$FRAME-$SIZE_T*2`($sp) + $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) + + ld $r0,32($ctx) # load key + ld $r1,40($ctx) + + ld $h0,0($ctx) # load hash value + ld $h1,8($ctx) + ld $h2,16($ctx) + + srdi $s1,$r1,2 + mtctr $len + add $s1,$s1,$r1 # s1 = r1 + r1>>2 + li $mask,3 + b Loop + +.align 4 +Loop: +___ +$code.=<<___ if ($LITTLE_ENDIAN); + ld $t0,0($inp) # load input + ld $t1,8($inp) +___ +$code.=<<___ if (!$LITTLE_ENDIAN); + li $d0,4 + lwbrx $t0,0,$inp # load input + li $t1,8 + lwbrx $d0,$d0,$inp + li $d1,12 + lwbrx $t1,$t1,$inp + lwbrx $d1,$d1,$inp + insrdi $t0,$d0,32,0 + insrdi $t1,$d1,32,0 +___ +$code.=<<___; + addi $inp,$inp,16 + + addc $h0,$h0,$t0 # accumulate input + adde $h1,$h1,$t1 + + mulld $d0,$h0,$r0 # h0*r0 + mulhdu $d1,$h0,$r0 + adde $h2,$h2,$padbit + + mulld $t0,$h1,$s1 # h1*5*r1 + mulhdu $t1,$h1,$s1 + addc $d0,$d0,$t0 + adde $d1,$d1,$t1 + + mulld $t0,$h0,$r1 # h0*r1 + mulhdu $d2,$h0,$r1 + addc $d1,$d1,$t0 + addze $d2,$d2 + + mulld $t0,$h1,$r0 # h1*r0 + mulhdu $t1,$h1,$r0 + addc $d1,$d1,$t0 + adde $d2,$d2,$t1 + + mulld $t0,$h2,$s1 # h2*5*r1 + mulld $t1,$h2,$r0 # h2*r0 + addc $d1,$d1,$t0 + adde $d2,$d2,$t1 + + andc $t0,$d2,$mask # final reduction step + and $h2,$d2,$mask + srdi $t1,$t0,2 + add $t0,$t0,$t1 + addc $h0,$d0,$t0 + addze $h1,$d1 + addze $h2,$h2 + + bdnz Loop + + std $h0,0($ctx) # store hash value + std $h1,8($ctx) + std $h2,16($ctx) + + $POP r27,`$FRAME-$SIZE_T*5`($sp) + $POP r28,`$FRAME-$SIZE_T*4`($sp) + $POP r29,`$FRAME-$SIZE_T*3`($sp) + $POP r30,`$FRAME-$SIZE_T*2`($sp) + $POP r31,`$FRAME-$SIZE_T*1`($sp) + addi $sp,$sp,$FRAME +Labort: + blr + .long 0 + .byte 0,12,4,1,0x80,5,4,0 +.size .poly1305_blocks,.-.poly1305_blocks + +.globl .poly1305_emit +.align 4 +.poly1305_emit: + ld $h0,0($ctx) # load hash + ld $h1,8($ctx) + ld $h2,16($ctx) + ld $padbit,0($nonce) # load nonce + ld $nonce,8($nonce) + + addic $d0,$h0,5 # compare to modulus + addze $d1,$h1 + addze $d2,$h2 + + srdi $mask,$d2,2 # did it carry/borrow? + neg $mask,$mask + + andc $h0,$h0,$mask + and $d0,$d0,$mask + andc $h1,$h1,$mask + and $d1,$d1,$mask + or $h0,$h0,$d0 + or $h1,$h1,$d1 +___ +$code.=<<___ if (!$LITTLE_ENDIAN); + rotldi $padbit,$padbit,32 # flip nonce words + rotldi $nonce,$nonce,32 +___ +$code.=<<___; + addc $h0,$h0,$padbit # accumulate nonce + adde $h1,$h1,$nonce +___ +$code.=<<___ if ($LITTLE_ENDIAN); + std $h0,0($mac) # write result + std $h1,8($mac) +___ +$code.=<<___ if (!$LITTLE_ENDIAN); + extrdi r0,$h0,32,0 + li $d0,4 + stwbrx $h0,0,$mac # write result + extrdi $h0,$h1,32,0 + li $d1,8 + stwbrx r0,$d0,$mac + li $d2,12 + stwbrx $h1,$d1,$mac + stwbrx $h0,$d2,$mac +___ +$code.=<<___; + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 +.size .poly1305_emit,.-.poly1305_emit +___ + } else { +############################################################################### +# base 2^32 implementation + +my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3, + $t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3 + ) = map("r$_",(7..12,14..31)); + +$code.=<<___; +.globl .poly1305_init_int +.align 4 +.poly1305_init_int: + xor r0,r0,r0 + stw r0,0($ctx) # zero hash value + stw r0,4($ctx) + stw r0,8($ctx) + stw r0,12($ctx) + stw r0,16($ctx) + + $UCMP $inp,r0 + beq- Lno_key +___ +$code.=<<___ if ($LITTLE_ENDIAN); + lw $h0,0($inp) # load key material + lw $h1,4($inp) + lw $h2,8($inp) + lw $h3,12($inp) +___ +$code.=<<___ if (!$LITTLE_ENDIAN); + li $h1,4 + lwbrx $h0,0,$inp # load key material + li $h2,8 + lwbrx $h1,$h1,$inp + li $h3,12 + lwbrx $h2,$h2,$inp + lwbrx $h3,$h3,$inp +___ +$code.=<<___; + lis $mask,0xf000 # 0xf0000000 + li $r0,-4 + andc $r0,$r0,$mask # 0x0ffffffc + + andc $h0,$h0,$mask + and $h1,$h1,$r0 + and $h2,$h2,$r0 + and $h3,$h3,$r0 + + stw $h0,32($ctx) # store key + stw $h1,36($ctx) + stw $h2,40($ctx) + stw $h3,44($ctx) + +Lno_key: + xor r3,r3,r3 + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 +.size .poly1305_init_int,.-.poly1305_init_int + +.globl .poly1305_blocks +.align 4 +.poly1305_blocks: + srwi. $len,$len,4 + beq- Labort + + $STU $sp,-$FRAME($sp) + mflr r0 + $PUSH r14,`$FRAME-$SIZE_T*18`($sp) + $PUSH r15,`$FRAME-$SIZE_T*17`($sp) + $PUSH r16,`$FRAME-$SIZE_T*16`($sp) + $PUSH r17,`$FRAME-$SIZE_T*15`($sp) + $PUSH r18,`$FRAME-$SIZE_T*14`($sp) + $PUSH r19,`$FRAME-$SIZE_T*13`($sp) + $PUSH r20,`$FRAME-$SIZE_T*12`($sp) + $PUSH r21,`$FRAME-$SIZE_T*11`($sp) + $PUSH r22,`$FRAME-$SIZE_T*10`($sp) + $PUSH r23,`$FRAME-$SIZE_T*9`($sp) + $PUSH r24,`$FRAME-$SIZE_T*8`($sp) + $PUSH r25,`$FRAME-$SIZE_T*7`($sp) + $PUSH r26,`$FRAME-$SIZE_T*6`($sp) + $PUSH r27,`$FRAME-$SIZE_T*5`($sp) + $PUSH r28,`$FRAME-$SIZE_T*4`($sp) + $PUSH r29,`$FRAME-$SIZE_T*3`($sp) + $PUSH r30,`$FRAME-$SIZE_T*2`($sp) + $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) + + lwz $r0,32($ctx) # load key + lwz $r1,36($ctx) + lwz $r2,40($ctx) + lwz $r3,44($ctx) + + lwz $h0,0($ctx) # load hash value + lwz $h1,4($ctx) + lwz $h2,8($ctx) + lwz $h3,12($ctx) + lwz $h4,16($ctx) + + srwi $s1,$r1,2 + srwi $s2,$r2,2 + srwi $s3,$r3,2 + add $s1,$s1,$r1 # si = ri + ri>>2 + add $s2,$s2,$r2 + add $s3,$s3,$r3 + mtctr $len + li $mask,3 + b Loop + +.align 4 +Loop: +___ +$code.=<<___ if ($LITTLE_ENDIAN); + lwz $d0,0($inp) # load input + lwz $d1,4($inp) + lwz $d2,8($inp) + lwz $d3,12($inp) +___ +$code.=<<___ if (!$LITTLE_ENDIAN); + li $d1,4 + lwbrx $d0,0,$inp # load input + li $d2,8 + lwbrx $d1,$d1,$inp + li $d3,12 + lwbrx $d2,$d2,$inp + lwbrx $d3,$d3,$inp +___ +$code.=<<___; + addi $inp,$inp,16 + + addc $h0,$h0,$d0 # accumulate input + adde $h1,$h1,$d1 + adde $h2,$h2,$d2 + + mullw $d0,$h0,$r0 # h0*r0 + mulhwu $D0,$h0,$r0 + + mullw $d1,$h0,$r1 # h0*r1 + mulhwu $D1,$h0,$r1 + + mullw $d2,$h0,$r2 # h0*r2 + mulhwu $D2,$h0,$r2 + + adde $h3,$h3,$d3 + adde $h4,$h4,$padbit + + mullw $d3,$h0,$r3 # h0*r3 + mulhwu $D3,$h0,$r3 + + mullw $t0,$h1,$s3 # h1*s3 + mulhwu $t1,$h1,$s3 + + mullw $t2,$h1,$r0 # h1*r0 + mulhwu $t3,$h1,$r0 + addc $d0,$d0,$t0 + adde $D0,$D0,$t1 + + mullw $t0,$h1,$r1 # h1*r1 + mulhwu $t1,$h1,$r1 + addc $d1,$d1,$t2 + adde $D1,$D1,$t3 + + mullw $t2,$h1,$r2 # h1*r2 + mulhwu $t3,$h1,$r2 + addc $d2,$d2,$t0 + adde $D2,$D2,$t1 + + mullw $t0,$h2,$s2 # h2*s2 + mulhwu $t1,$h2,$s2 + addc $d3,$d3,$t2 + adde $D3,$D3,$t3 + + mullw $t2,$h2,$s3 # h2*s3 + mulhwu $t3,$h2,$s3 + addc $d0,$d0,$t0 + adde $D0,$D0,$t1 + + mullw $t0,$h2,$r0 # h2*r0 + mulhwu $t1,$h2,$r0 + addc $d1,$d1,$t2 + adde $D1,$D1,$t3 + + mullw $t2,$h2,$r1 # h2*r1 + mulhwu $t3,$h2,$r1 + addc $d2,$d2,$t0 + adde $D2,$D2,$t1 + + mullw $t0,$h3,$s1 # h3*s1 + mulhwu $t1,$h3,$s1 + addc $d3,$d3,$t2 + adde $D3,$D3,$t3 + + mullw $t2,$h3,$s2 # h3*s2 + mulhwu $t3,$h3,$s2 + addc $d0,$d0,$t0 + adde $D0,$D0,$t1 + + mullw $t0,$h3,$s3 # h3*s3 + mulhwu $t1,$h3,$s3 + addc $d1,$d1,$t2 + adde $D1,$D1,$t3 + + mullw $t2,$h3,$r0 # h3*r0 + mulhwu $t3,$h3,$r0 + addc $d2,$d2,$t0 + adde $D2,$D2,$t1 + + mullw $t0,$h4,$s1 # h4*s1 + addc $d3,$d3,$t2 + adde $D3,$D3,$t3 + addc $d1,$d1,$t0 + + mullw $t1,$h4,$s2 # h4*s2 + addze $D1,$D1 + addc $d2,$d2,$t1 + addze $D2,$D2 + + mullw $t2,$h4,$s3 # h4*s3 + addc $d3,$d3,$t2 + addze $D3,$D3 + + mullw $h4,$h4,$r0 # h4*r0 + + addc $h1,$d1,$D0 + adde $h2,$d2,$D1 + adde $h3,$d3,$D2 + adde $h4,$h4,$D3 + + andc $D0,$h4,$mask # final reduction step + and $h4,$h4,$mask + srwi $D1,$D0,2 + add $D0,$D0,$D1 + addc $h0,$d0,$D0 + addze $h1,$h1 + addze $h2,$h2 + addze $h3,$h3 + addze $h4,$h4 + + bdnz Loop + + stw $h0,0($ctx) # store hash value + stw $h1,4($ctx) + stw $h2,8($ctx) + stw $h3,12($ctx) + stw $h4,16($ctx) + + $POP r14,`$FRAME-$SIZE_T*18`($sp) + $POP r15,`$FRAME-$SIZE_T*17`($sp) + $POP r16,`$FRAME-$SIZE_T*16`($sp) + $POP r17,`$FRAME-$SIZE_T*15`($sp) + $POP r18,`$FRAME-$SIZE_T*14`($sp) + $POP r19,`$FRAME-$SIZE_T*13`($sp) + $POP r20,`$FRAME-$SIZE_T*12`($sp) + $POP r21,`$FRAME-$SIZE_T*11`($sp) + $POP r22,`$FRAME-$SIZE_T*10`($sp) + $POP r23,`$FRAME-$SIZE_T*9`($sp) + $POP r24,`$FRAME-$SIZE_T*8`($sp) + $POP r25,`$FRAME-$SIZE_T*7`($sp) + $POP r26,`$FRAME-$SIZE_T*6`($sp) + $POP r27,`$FRAME-$SIZE_T*5`($sp) + $POP r28,`$FRAME-$SIZE_T*4`($sp) + $POP r29,`$FRAME-$SIZE_T*3`($sp) + $POP r30,`$FRAME-$SIZE_T*2`($sp) + $POP r31,`$FRAME-$SIZE_T*1`($sp) + addi $sp,$sp,$FRAME +Labort: + blr + .long 0 + .byte 0,12,4,1,0x80,18,4,0 +.size .poly1305_blocks,.-.poly1305_blocks + +.globl .poly1305_emit +.align 4 +.poly1305_emit: + $STU $sp,-$FRAME($sp) + mflr r0 + $PUSH r28,`$FRAME-$SIZE_T*4`($sp) + $PUSH r29,`$FRAME-$SIZE_T*3`($sp) + $PUSH r30,`$FRAME-$SIZE_T*2`($sp) + $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) + + lwz $h0,0($ctx) # load hash + lwz $h1,4($ctx) + lwz $h2,8($ctx) + lwz $h3,12($ctx) + lwz $h4,16($ctx) + + addic $d0,$h0,5 # compare to modulus + addze $d1,$h1 + addze $d2,$h2 + addze $d3,$h3 + addze $mask,$h4 + + srwi $mask,$mask,2 # did it carry/borrow? + neg $mask,$mask + + andc $h0,$h0,$mask + and $d0,$d0,$mask + andc $h1,$h1,$mask + and $d1,$d1,$mask + or $h0,$h0,$d0 + lwz $d0,0($nonce) # load nonce + andc $h2,$h2,$mask + and $d2,$d2,$mask + or $h1,$h1,$d1 + lwz $d1,4($nonce) + andc $h3,$h3,$mask + and $d3,$d3,$mask + or $h2,$h2,$d2 + lwz $d2,8($nonce) + or $h3,$h3,$d3 + lwz $d3,12($nonce) + + addc $h0,$h0,$d0 # accumulate nonce + adde $h1,$h1,$d1 + adde $h2,$h2,$d2 + adde $h3,$h3,$d3 +___ +$code.=<<___ if ($LITTLE_ENDIAN); + stw $h0,0($mac) # write result + stw $h1,4($mac) + stw $h2,8($mac) + stw $h3,12($mac) +___ +$code.=<<___ if (!$LITTLE_ENDIAN); + li $d1,4 + stwbrx $h0,0,$mac # write result + li $d2,8 + stwbrx $h1,$d1,$mac + li $d3,12 + stwbrx $h2,$d2,$mac + stwbrx $h3,$d3,$mac +___ +$code.=<<___; + $POP r28,`$FRAME-$SIZE_T*4`($sp) + $POP r29,`$FRAME-$SIZE_T*3`($sp) + $POP r30,`$FRAME-$SIZE_T*2`($sp) + $POP r31,`$FRAME-$SIZE_T*1`($sp) + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,1,0x80,4,3,0 +.size .poly1305_emit,.-.poly1305_emit +___ + } +$code.=<<___; +.asciz "Poly1305 for PPC, CRYPTOGAMS by <appro\@openssl.org>" +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT or die "error closing STDOUT: $!"; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-ppcfp.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-ppcfp.pl new file mode 100755 index 000000000..a9ab20714 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-ppcfp.pl @@ -0,0 +1,739 @@ +#! /usr/bin/env perl +# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for PowerPC FPU. +# +# June 2015 +# +# Numbers are cycles per processed byte with poly1305_blocks alone, +# and improvement coefficients relative to gcc-generated code. +# +# Freescale e300 9.78/+30% +# PPC74x0 6.92/+50% +# PPC970 6.03/+80% +# POWER7 3.50/+30% +# POWER8 3.75/+10% + +$flavour = shift; + +if ($flavour =~ /64/) { + $SIZE_T =8; + $LRSAVE =2*$SIZE_T; + $UCMP ="cmpld"; + $STU ="stdu"; + $POP ="ld"; + $PUSH ="std"; +} elsif ($flavour =~ /32/) { + $SIZE_T =4; + $LRSAVE =$SIZE_T; + $UCMP ="cmplw"; + $STU ="stwu"; + $POP ="lwz"; + $PUSH ="stw"; +} else { die "nonsense $flavour"; } + +$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0; + +$LWXLE = $LITTLE_ENDIAN ? "lwzx" : "lwbrx"; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; + +$LOCALS=6*$SIZE_T; +$FRAME=$LOCALS+6*8+18*8; + +my $sp="r1"; + +my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6)); +my ($in0,$in1,$in2,$in3,$i1,$i2,$i3) = map("r$_",(7..12,6)); + +my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi, + $two0,$two32,$two64,$two96,$two130,$five_two130, + $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi, + $s2lo,$s2hi,$s3lo,$s3hi, + $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("f$_",(0..31)); +# borrowings +my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi); +my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi); +my ($y0,$y1,$y2,$y3) = ($c3lo,$c3hi,$c1lo,$c1hi); + +$code.=<<___; +.machine "any" +.text + +.globl .poly1305_init_fpu +.align 6 +.poly1305_init_fpu: + $STU $sp,-$LOCALS($sp) # minimal frame + mflr $padbit + $PUSH $padbit,`$LOCALS+$LRSAVE`($sp) + + bl LPICmeup + + xor r0,r0,r0 + mtlr $padbit # restore lr + + lfd $two0,8*0($len) # load constants + lfd $two32,8*1($len) + lfd $two64,8*2($len) + lfd $two96,8*3($len) + lfd $two130,8*4($len) + lfd $five_two130,8*5($len) + + stfd $two0,8*0($ctx) # initial hash value, biased 0 + stfd $two32,8*1($ctx) + stfd $two64,8*2($ctx) + stfd $two96,8*3($ctx) + + $UCMP $inp,r0 + beq- Lno_key + + lfd $h3lo,8*13($len) # new fpscr + mffs $h3hi # old fpscr + + stfd $two0,8*4($ctx) # key "template" + stfd $two32,8*5($ctx) + stfd $two64,8*6($ctx) + stfd $two96,8*7($ctx) + + li $in1,4 + li $in2,8 + li $in3,12 + $LWXLE $in0,0,$inp # load key + $LWXLE $in1,$in1,$inp + $LWXLE $in2,$in2,$inp + $LWXLE $in3,$in3,$inp + + lis $i1,0xf000 # 0xf0000000 + ori $i2,$i1,3 # 0xf0000003 + andc $in0,$in0,$i1 # &=0x0fffffff + andc $in1,$in1,$i2 # &=0x0ffffffc + andc $in2,$in2,$i2 + andc $in3,$in3,$i2 + + stw $in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx) # fill "template" + stw $in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx) + stw $in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx) + stw $in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx) + + mtfsf 255,$h3lo # fpscr + stfd $two0,8*18($ctx) # copy constants to context + stfd $two32,8*19($ctx) + stfd $two64,8*20($ctx) + stfd $two96,8*21($ctx) + stfd $two130,8*22($ctx) + stfd $five_two130,8*23($ctx) + + lfd $h0lo,8*4($ctx) # load [biased] key + lfd $h1lo,8*5($ctx) + lfd $h2lo,8*6($ctx) + lfd $h3lo,8*7($ctx) + + fsub $h0lo,$h0lo,$two0 # r0 + fsub $h1lo,$h1lo,$two32 # r1 + fsub $h2lo,$h2lo,$two64 # r2 + fsub $h3lo,$h3lo,$two96 # r3 + + lfd $two0,8*6($len) # more constants + lfd $two32,8*7($len) + lfd $two64,8*8($len) + lfd $two96,8*9($len) + + fmul $h1hi,$h1lo,$five_two130 # s1 + fmul $h2hi,$h2lo,$five_two130 # s2 + stfd $h3hi,8*15($ctx) # borrow slot for original fpscr + fmul $h3hi,$h3lo,$five_two130 # s3 + + fadd $h0hi,$h0lo,$two0 + stfd $h1hi,8*12($ctx) # put aside for now + fadd $h1hi,$h1lo,$two32 + stfd $h2hi,8*13($ctx) + fadd $h2hi,$h2lo,$two64 + stfd $h3hi,8*14($ctx) + fadd $h3hi,$h3lo,$two96 + + fsub $h0hi,$h0hi,$two0 + fsub $h1hi,$h1hi,$two32 + fsub $h2hi,$h2hi,$two64 + fsub $h3hi,$h3hi,$two96 + + lfd $two0,8*10($len) # more constants + lfd $two32,8*11($len) + lfd $two64,8*12($len) + + fsub $h0lo,$h0lo,$h0hi + fsub $h1lo,$h1lo,$h1hi + fsub $h2lo,$h2lo,$h2hi + fsub $h3lo,$h3lo,$h3hi + + stfd $h0hi,8*5($ctx) # r0hi + stfd $h1hi,8*7($ctx) # r1hi + stfd $h2hi,8*9($ctx) # r2hi + stfd $h3hi,8*11($ctx) # r3hi + + stfd $h0lo,8*4($ctx) # r0lo + stfd $h1lo,8*6($ctx) # r1lo + stfd $h2lo,8*8($ctx) # r2lo + stfd $h3lo,8*10($ctx) # r3lo + + lfd $h1lo,8*12($ctx) # s1 + lfd $h2lo,8*13($ctx) # s2 + lfd $h3lo,8*14($ctx) # s3 + lfd $h0lo,8*15($ctx) # pull original fpscr + + fadd $h1hi,$h1lo,$two0 + fadd $h2hi,$h2lo,$two32 + fadd $h3hi,$h3lo,$two64 + + fsub $h1hi,$h1hi,$two0 + fsub $h2hi,$h2hi,$two32 + fsub $h3hi,$h3hi,$two64 + + fsub $h1lo,$h1lo,$h1hi + fsub $h2lo,$h2lo,$h2hi + fsub $h3lo,$h3lo,$h3hi + + stfd $h1hi,8*13($ctx) # s1hi + stfd $h2hi,8*15($ctx) # s2hi + stfd $h3hi,8*17($ctx) # s3hi + + stfd $h1lo,8*12($ctx) # s1lo + stfd $h2lo,8*14($ctx) # s2lo + stfd $h3lo,8*16($ctx) # s3lo + + mtfsf 255,$h0lo # restore fpscr +Lno_key: + xor r3,r3,r3 + addi $sp,$sp,$LOCALS + blr + .long 0 + .byte 0,12,4,1,0x80,0,2,0 +.size .poly1305_init_fpu,.-.poly1305_init_fpu + +.globl .poly1305_blocks_fpu +.align 4 +.poly1305_blocks_fpu: + srwi. $len,$len,4 + beq- Labort + + $STU $sp,-$FRAME($sp) + mflr r0 + stfd f14,`$FRAME-8*18`($sp) + stfd f15,`$FRAME-8*17`($sp) + stfd f16,`$FRAME-8*16`($sp) + stfd f17,`$FRAME-8*15`($sp) + stfd f18,`$FRAME-8*14`($sp) + stfd f19,`$FRAME-8*13`($sp) + stfd f20,`$FRAME-8*12`($sp) + stfd f21,`$FRAME-8*11`($sp) + stfd f22,`$FRAME-8*10`($sp) + stfd f23,`$FRAME-8*9`($sp) + stfd f24,`$FRAME-8*8`($sp) + stfd f25,`$FRAME-8*7`($sp) + stfd f26,`$FRAME-8*6`($sp) + stfd f27,`$FRAME-8*5`($sp) + stfd f28,`$FRAME-8*4`($sp) + stfd f29,`$FRAME-8*3`($sp) + stfd f30,`$FRAME-8*2`($sp) + stfd f31,`$FRAME-8*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) + + xor r0,r0,r0 + li $in3,1 + mtctr $len + neg $len,$len + stw r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp) + stw $in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp) + + lfd $two0,8*18($ctx) # load constants + lfd $two32,8*19($ctx) + lfd $two64,8*20($ctx) + lfd $two96,8*21($ctx) + lfd $two130,8*22($ctx) + lfd $five_two130,8*23($ctx) + + lfd $h0lo,8*0($ctx) # load [biased] hash value + lfd $h1lo,8*1($ctx) + lfd $h2lo,8*2($ctx) + lfd $h3lo,8*3($ctx) + + stfd $two0,`$LOCALS+8*0`($sp) # input "template" + oris $in3,$padbit,`(1023+52+96)<<4` + stfd $two32,`$LOCALS+8*1`($sp) + stfd $two64,`$LOCALS+8*2`($sp) + stw $in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp) + + li $i1,4 + li $i2,8 + li $i3,12 + $LWXLE $in0,0,$inp # load input + $LWXLE $in1,$i1,$inp + $LWXLE $in2,$i2,$inp + $LWXLE $in3,$i3,$inp + addi $inp,$inp,16 + + stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template" + stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp) + stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp) + stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp) + + mffs $x0 # original fpscr + lfd $x1,`$LOCALS+8*4`($sp) # new fpscr + lfd $r0lo,8*4($ctx) # load key + lfd $r0hi,8*5($ctx) + lfd $r1lo,8*6($ctx) + lfd $r1hi,8*7($ctx) + lfd $r2lo,8*8($ctx) + lfd $r2hi,8*9($ctx) + lfd $r3lo,8*10($ctx) + lfd $r3hi,8*11($ctx) + lfd $s1lo,8*12($ctx) + lfd $s1hi,8*13($ctx) + lfd $s2lo,8*14($ctx) + lfd $s2hi,8*15($ctx) + lfd $s3lo,8*16($ctx) + lfd $s3hi,8*17($ctx) + + stfd $x0,`$LOCALS+8*4`($sp) # save original fpscr + mtfsf 255,$x1 + + addic $len,$len,1 + addze r0,r0 + slwi. r0,r0,4 + sub $inp,$inp,r0 # conditional rewind + + lfd $x0,`$LOCALS+8*0`($sp) + lfd $x1,`$LOCALS+8*1`($sp) + lfd $x2,`$LOCALS+8*2`($sp) + lfd $x3,`$LOCALS+8*3`($sp) + + fsub $h0lo,$h0lo,$two0 # de-bias hash value + $LWXLE $in0,0,$inp # modulo-scheduled input load + fsub $h1lo,$h1lo,$two32 + $LWXLE $in1,$i1,$inp + fsub $h2lo,$h2lo,$two64 + $LWXLE $in2,$i2,$inp + fsub $h3lo,$h3lo,$two96 + $LWXLE $in3,$i3,$inp + + fsub $x0,$x0,$two0 # de-bias input + addi $inp,$inp,16 + fsub $x1,$x1,$two32 + fsub $x2,$x2,$two64 + fsub $x3,$x3,$two96 + + fadd $x0,$x0,$h0lo # accumulate input + stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) + fadd $x1,$x1,$h1lo + stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp) + fadd $x2,$x2,$h2lo + stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp) + fadd $x3,$x3,$h3lo + stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp) + + b Lentry + +.align 4 +Loop: + fsub $y0,$y0,$two0 # de-bias input + addic $len,$len,1 + fsub $y1,$y1,$two32 + addze r0,r0 + fsub $y2,$y2,$two64 + slwi. r0,r0,4 + fsub $y3,$y3,$two96 + sub $inp,$inp,r0 # conditional rewind + + fadd $h0lo,$h0lo,$y0 # accumulate input + fadd $h0hi,$h0hi,$y1 + fadd $h2lo,$h2lo,$y2 + fadd $h2hi,$h2hi,$y3 + + ######################################### base 2^48 -> base 2^32 + fadd $c1lo,$h1lo,$two64 + $LWXLE $in0,0,$inp # modulo-scheduled input load + fadd $c1hi,$h1hi,$two64 + $LWXLE $in1,$i1,$inp + fadd $c3lo,$h3lo,$two130 + $LWXLE $in2,$i2,$inp + fadd $c3hi,$h3hi,$two130 + $LWXLE $in3,$i3,$inp + fadd $c0lo,$h0lo,$two32 + addi $inp,$inp,16 + fadd $c0hi,$h0hi,$two32 + fadd $c2lo,$h2lo,$two96 + fadd $c2hi,$h2hi,$two96 + + fsub $c1lo,$c1lo,$two64 + stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template" + fsub $c1hi,$c1hi,$two64 + stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp) + fsub $c3lo,$c3lo,$two130 + stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp) + fsub $c3hi,$c3hi,$two130 + stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp) + fsub $c0lo,$c0lo,$two32 + fsub $c0hi,$c0hi,$two32 + fsub $c2lo,$c2lo,$two96 + fsub $c2hi,$c2hi,$two96 + + fsub $h1lo,$h1lo,$c1lo + fsub $h1hi,$h1hi,$c1hi + fsub $h3lo,$h3lo,$c3lo + fsub $h3hi,$h3hi,$c3hi + fsub $h2lo,$h2lo,$c2lo + fsub $h2hi,$h2hi,$c2hi + fsub $h0lo,$h0lo,$c0lo + fsub $h0hi,$h0hi,$c0hi + + fadd $h1lo,$h1lo,$c0lo + fadd $h1hi,$h1hi,$c0hi + fadd $h3lo,$h3lo,$c2lo + fadd $h3hi,$h3hi,$c2hi + fadd $h2lo,$h2lo,$c1lo + fadd $h2hi,$h2hi,$c1hi + fmadd $h0lo,$c3lo,$five_two130,$h0lo + fmadd $h0hi,$c3hi,$five_two130,$h0hi + + fadd $x1,$h1lo,$h1hi + lfd $s1lo,8*12($ctx) # reload constants + fadd $x3,$h3lo,$h3hi + lfd $s1hi,8*13($ctx) + fadd $x2,$h2lo,$h2hi + lfd $r3lo,8*10($ctx) + fadd $x0,$h0lo,$h0hi + lfd $r3hi,8*11($ctx) +Lentry: + fmul $h0lo,$s3lo,$x1 + fmul $h0hi,$s3hi,$x1 + fmul $h2lo,$r1lo,$x1 + fmul $h2hi,$r1hi,$x1 + fmul $h1lo,$r0lo,$x1 + fmul $h1hi,$r0hi,$x1 + fmul $h3lo,$r2lo,$x1 + fmul $h3hi,$r2hi,$x1 + + fmadd $h0lo,$s1lo,$x3,$h0lo + fmadd $h0hi,$s1hi,$x3,$h0hi + fmadd $h2lo,$s3lo,$x3,$h2lo + fmadd $h2hi,$s3hi,$x3,$h2hi + fmadd $h1lo,$s2lo,$x3,$h1lo + fmadd $h1hi,$s2hi,$x3,$h1hi + fmadd $h3lo,$r0lo,$x3,$h3lo + fmadd $h3hi,$r0hi,$x3,$h3hi + + fmadd $h0lo,$s2lo,$x2,$h0lo + fmadd $h0hi,$s2hi,$x2,$h0hi + fmadd $h2lo,$r0lo,$x2,$h2lo + fmadd $h2hi,$r0hi,$x2,$h2hi + fmadd $h1lo,$s3lo,$x2,$h1lo + fmadd $h1hi,$s3hi,$x2,$h1hi + fmadd $h3lo,$r1lo,$x2,$h3lo + fmadd $h3hi,$r1hi,$x2,$h3hi + + fmadd $h0lo,$r0lo,$x0,$h0lo + lfd $y0,`$LOCALS+8*0`($sp) # load [biased] input + fmadd $h0hi,$r0hi,$x0,$h0hi + lfd $y1,`$LOCALS+8*1`($sp) + fmadd $h2lo,$r2lo,$x0,$h2lo + lfd $y2,`$LOCALS+8*2`($sp) + fmadd $h2hi,$r2hi,$x0,$h2hi + lfd $y3,`$LOCALS+8*3`($sp) + fmadd $h1lo,$r1lo,$x0,$h1lo + fmadd $h1hi,$r1hi,$x0,$h1hi + fmadd $h3lo,$r3lo,$x0,$h3lo + fmadd $h3hi,$r3hi,$x0,$h3hi + + bdnz Loop + + ######################################### base 2^48 -> base 2^32 + fadd $c0lo,$h0lo,$two32 + fadd $c0hi,$h0hi,$two32 + fadd $c2lo,$h2lo,$two96 + fadd $c2hi,$h2hi,$two96 + fadd $c1lo,$h1lo,$two64 + fadd $c1hi,$h1hi,$two64 + fadd $c3lo,$h3lo,$two130 + fadd $c3hi,$h3hi,$two130 + + fsub $c0lo,$c0lo,$two32 + fsub $c0hi,$c0hi,$two32 + fsub $c2lo,$c2lo,$two96 + fsub $c2hi,$c2hi,$two96 + fsub $c1lo,$c1lo,$two64 + fsub $c1hi,$c1hi,$two64 + fsub $c3lo,$c3lo,$two130 + fsub $c3hi,$c3hi,$two130 + + fsub $h1lo,$h1lo,$c1lo + fsub $h1hi,$h1hi,$c1hi + fsub $h3lo,$h3lo,$c3lo + fsub $h3hi,$h3hi,$c3hi + fsub $h2lo,$h2lo,$c2lo + fsub $h2hi,$h2hi,$c2hi + fsub $h0lo,$h0lo,$c0lo + fsub $h0hi,$h0hi,$c0hi + + fadd $h1lo,$h1lo,$c0lo + fadd $h1hi,$h1hi,$c0hi + fadd $h3lo,$h3lo,$c2lo + fadd $h3hi,$h3hi,$c2hi + fadd $h2lo,$h2lo,$c1lo + fadd $h2hi,$h2hi,$c1hi + fmadd $h0lo,$c3lo,$five_two130,$h0lo + fmadd $h0hi,$c3hi,$five_two130,$h0hi + + fadd $x1,$h1lo,$h1hi + fadd $x3,$h3lo,$h3hi + fadd $x2,$h2lo,$h2hi + fadd $x0,$h0lo,$h0hi + + lfd $h0lo,`$LOCALS+8*4`($sp) # pull saved fpscr + fadd $x1,$x1,$two32 # bias + fadd $x3,$x3,$two96 + fadd $x2,$x2,$two64 + fadd $x0,$x0,$two0 + + stfd $x1,8*1($ctx) # store [biased] hash value + stfd $x3,8*3($ctx) + stfd $x2,8*2($ctx) + stfd $x0,8*0($ctx) + + mtfsf 255,$h0lo # restore original fpscr + lfd f14,`$FRAME-8*18`($sp) + lfd f15,`$FRAME-8*17`($sp) + lfd f16,`$FRAME-8*16`($sp) + lfd f17,`$FRAME-8*15`($sp) + lfd f18,`$FRAME-8*14`($sp) + lfd f19,`$FRAME-8*13`($sp) + lfd f20,`$FRAME-8*12`($sp) + lfd f21,`$FRAME-8*11`($sp) + lfd f22,`$FRAME-8*10`($sp) + lfd f23,`$FRAME-8*9`($sp) + lfd f24,`$FRAME-8*8`($sp) + lfd f25,`$FRAME-8*7`($sp) + lfd f26,`$FRAME-8*6`($sp) + lfd f27,`$FRAME-8*5`($sp) + lfd f28,`$FRAME-8*4`($sp) + lfd f29,`$FRAME-8*3`($sp) + lfd f30,`$FRAME-8*2`($sp) + lfd f31,`$FRAME-8*1`($sp) + addi $sp,$sp,$FRAME +Labort: + blr + .long 0 + .byte 0,12,4,1,0x80,0,4,0 +.size .poly1305_blocks_fpu,.-.poly1305_blocks_fpu +___ +{ +my ($mac,$nonce)=($inp,$len); + +my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3 + ) = map("r$_",(7..11,28..31)); +my $mask = "r0"; +my $FRAME = (6+4)*$SIZE_T; + +$code.=<<___; +.globl .poly1305_emit_fpu +.align 4 +.poly1305_emit_fpu: + $STU $sp,-$FRAME($sp) + mflr r0 + $PUSH r28,`$FRAME-$SIZE_T*4`($sp) + $PUSH r29,`$FRAME-$SIZE_T*3`($sp) + $PUSH r30,`$FRAME-$SIZE_T*2`($sp) + $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) + + lwz $d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx) # load hash + lwz $h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx) + lwz $d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx) + lwz $h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx) + lwz $d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx) + lwz $h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx) + lwz $d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx) + lwz $h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx) + + lis $mask,0xfff0 + andc $d0,$d0,$mask # mask exponent + andc $d1,$d1,$mask + andc $d2,$d2,$mask + andc $d3,$d3,$mask # can be partially reduced... + li $mask,3 + + srwi $padbit,$d3,2 # ... so reduce + and $h4,$d3,$mask + andc $d3,$d3,$mask + add $d3,$d3,$padbit +___ + if ($SIZE_T==4) { +$code.=<<___; + addc $h0,$h0,$d3 + adde $h1,$h1,$d0 + adde $h2,$h2,$d1 + adde $h3,$h3,$d2 + addze $h4,$h4 + + addic $d0,$h0,5 # compare to modulus + addze $d1,$h1 + addze $d2,$h2 + addze $d3,$h3 + addze $mask,$h4 + + srwi $mask,$mask,2 # did it carry/borrow? + neg $mask,$mask + srawi $mask,$mask,31 # mask + + andc $h0,$h0,$mask + and $d0,$d0,$mask + andc $h1,$h1,$mask + and $d1,$d1,$mask + or $h0,$h0,$d0 + lwz $d0,0($nonce) # load nonce + andc $h2,$h2,$mask + and $d2,$d2,$mask + or $h1,$h1,$d1 + lwz $d1,4($nonce) + andc $h3,$h3,$mask + and $d3,$d3,$mask + or $h2,$h2,$d2 + lwz $d2,8($nonce) + or $h3,$h3,$d3 + lwz $d3,12($nonce) + + addc $h0,$h0,$d0 # accumulate nonce + adde $h1,$h1,$d1 + adde $h2,$h2,$d2 + adde $h3,$h3,$d3 +___ + } else { +$code.=<<___; + add $h0,$h0,$d3 + add $h1,$h1,$d0 + add $h2,$h2,$d1 + add $h3,$h3,$d2 + + srdi $d0,$h0,32 + add $h1,$h1,$d0 + srdi $d1,$h1,32 + add $h2,$h2,$d1 + srdi $d2,$h2,32 + add $h3,$h3,$d2 + srdi $d3,$h3,32 + add $h4,$h4,$d3 + + insrdi $h0,$h1,32,0 + insrdi $h2,$h3,32,0 + + addic $d0,$h0,5 # compare to modulus + addze $d1,$h2 + addze $d2,$h4 + + srdi $mask,$d2,2 # did it carry/borrow? + neg $mask,$mask + sradi $mask,$mask,63 # mask + ld $d2,0($nonce) # load nonce + ld $d3,8($nonce) + + andc $h0,$h0,$mask + and $d0,$d0,$mask + andc $h2,$h2,$mask + and $d1,$d1,$mask + or $h0,$h0,$d0 + or $h2,$h2,$d1 +___ +$code.=<<___ if (!$LITTLE_ENDIAN); + rotldi $d2,$d2,32 # flip nonce words + rotldi $d3,$d3,32 +___ +$code.=<<___; + addc $h0,$h0,$d2 # accumulate nonce + adde $h2,$h2,$d3 + + srdi $h1,$h0,32 + srdi $h3,$h2,32 +___ + } +$code.=<<___ if ($LITTLE_ENDIAN); + stw $h0,0($mac) # write result + stw $h1,4($mac) + stw $h2,8($mac) + stw $h3,12($mac) +___ +$code.=<<___ if (!$LITTLE_ENDIAN); + li $d1,4 + stwbrx $h0,0,$mac # write result + li $d2,8 + stwbrx $h1,$d1,$mac + li $d3,12 + stwbrx $h2,$d2,$mac + stwbrx $h3,$d3,$mac +___ +$code.=<<___; + $POP r28,`$FRAME-$SIZE_T*4`($sp) + $POP r29,`$FRAME-$SIZE_T*3`($sp) + $POP r30,`$FRAME-$SIZE_T*2`($sp) + $POP r31,`$FRAME-$SIZE_T*1`($sp) + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,1,0x80,4,3,0 +.size .poly1305_emit_fpu,.-.poly1305_emit_fpu +___ +} +# Ugly hack here, because PPC assembler syntax seem to vary too +# much from platforms to platform... +$code.=<<___; +.align 6 +LPICmeup: + mflr r0 + bcl 20,31,\$+4 + mflr $len # vvvvvv "distance" between . and 1st data entry + addi $len,$len,`64-8` # borrow $len + mtlr r0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .space `64-9*4` + +.quad 0x4330000000000000 # 2^(52+0) +.quad 0x4530000000000000 # 2^(52+32) +.quad 0x4730000000000000 # 2^(52+64) +.quad 0x4930000000000000 # 2^(52+96) +.quad 0x4b50000000000000 # 2^(52+130) + +.quad 0x37f4000000000000 # 5/2^130 + +.quad 0x4430000000000000 # 2^(52+16+0) +.quad 0x4630000000000000 # 2^(52+16+32) +.quad 0x4830000000000000 # 2^(52+16+64) +.quad 0x4a30000000000000 # 2^(52+16+96) +.quad 0x3e30000000000000 # 2^(52+16+0-96) +.quad 0x4030000000000000 # 2^(52+16+32-96) +.quad 0x4230000000000000 # 2^(52+16+64-96) + +.quad 0x0000000000000001 # fpscr: truncate, no exceptions +.asciz "Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>" +.align 4 +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT or die "error closing STDOUT: $!"; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-s390x.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-s390x.pl new file mode 100755 index 000000000..bcc8fd3b8 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-s390x.pl @@ -0,0 +1,227 @@ +#! /usr/bin/env perl +# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for s390x. +# +# June 2015 +# +# ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated +# code. For older compiler improvement coefficient is >3x, because +# then base 2^64 and base 2^32 implementations are compared. +# +# On side note, z13 enables vector base 2^26 implementation... + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$sp="%r15"; + +my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5)); + +$code.=<<___; +.text + +.globl poly1305_init +.type poly1305_init,\@function +.align 16 +poly1305_init: + lghi %r0,0 + lghi %r1,-1 + stg %r0,0($ctx) # zero hash value + stg %r0,8($ctx) + stg %r0,16($ctx) + + cl${g}r $inp,%r0 + je .Lno_key + + lrvg %r4,0($inp) # load little-endian key + lrvg %r5,8($inp) + + nihl %r1,0xffc0 # 0xffffffc0ffffffff + srlg %r0,%r1,4 # 0x0ffffffc0fffffff + srlg %r1,%r1,4 + nill %r1,0xfffc # 0x0ffffffc0ffffffc + + ngr %r4,%r0 + ngr %r5,%r1 + + stg %r4,32($ctx) + stg %r5,40($ctx) + +.Lno_key: + lghi %r2,0 + br %r14 +.size poly1305_init,.-poly1305_init +___ +{ +my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14)); +my ($r0,$r1,$s1) = map("%r$_",(0..2)); + +$code.=<<___; +.globl poly1305_blocks +.type poly1305_blocks,\@function +.align 16 +poly1305_blocks: + srl${g} $len,4 # fixed-up in 64-bit build + lghi %r0,0 + cl${g}r $len,%r0 + je .Lno_data + + stm${g} %r6,%r14,`6*$SIZE_T`($sp) + + llgfr $padbit,$padbit # clear upper half, much needed with + # non-64-bit ABI + lg $r0,32($ctx) # load key + lg $r1,40($ctx) + + lg $h0,0($ctx) # load hash value + lg $h1,8($ctx) + lg $h2,16($ctx) + + st$g $ctx,`2*$SIZE_T`($sp) # off-load $ctx + srlg $s1,$r1,2 + algr $s1,$r1 # s1 = r1 + r1>>2 + j .Loop + +.align 16 +.Loop: + lrvg $d0lo,0($inp) # load little-endian input + lrvg $d1lo,8($inp) + la $inp,16($inp) + + algr $d0lo,$h0 # accumulate input + alcgr $d1lo,$h1 + + lgr $h0,$d0lo + mlgr $d0hi,$r0 # h0*r0 -> $d0hi:$d0lo + lgr $h1,$d1lo + mlgr $d1hi,$s1 # h1*5*r1 -> $d1hi:$d1lo + + mlgr $t0,$r1 # h0*r1 -> $t0:$h0 + mlgr $t1,$r0 # h1*r0 -> $t1:$h1 + alcgr $h2,$padbit + + algr $d0lo,$d1lo + lgr $d1lo,$h2 + alcgr $d0hi,$d1hi + lghi $d1hi,0 + + algr $h1,$h0 + alcgr $t1,$t0 + + msgr $d1lo,$s1 # h2*s1 + msgr $h2,$r0 # h2*r0 + + algr $h1,$d1lo + alcgr $t1,$d1hi # $d1hi is zero + + algr $h1,$d0hi + alcgr $h2,$t1 + + lghi $h0,-4 # final reduction step + ngr $h0,$h2 + srlg $t0,$h2,2 + algr $h0,$t0 + lghi $t1,3 + ngr $h2,$t1 + + algr $h0,$d0lo + alcgr $h1,$d1hi # $d1hi is still zero + alcgr $h2,$d1hi # $d1hi is still zero + + brct$g $len,.Loop + + l$g $ctx,`2*$SIZE_T`($sp) # restore $ctx + + stg $h0,0($ctx) # store hash value + stg $h1,8($ctx) + stg $h2,16($ctx) + + lm${g} %r6,%r14,`6*$SIZE_T`($sp) +.Lno_data: + br %r14 +.size poly1305_blocks,.-poly1305_blocks +___ +} +{ +my ($mac,$nonce)=($inp,$len); +my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9)); + +$code.=<<___; +.globl poly1305_emit +.type poly1305_emit,\@function +.align 16 +poly1305_emit: + stm${g} %r6,%r9,`6*$SIZE_T`($sp) + + lg $h0,0($ctx) + lg $h1,8($ctx) + lg $h2,16($ctx) + + lghi %r0,5 + lghi %r1,0 + lgr $d0,$h0 + lgr $d1,$h1 + + algr $h0,%r0 # compare to modulus + alcgr $h1,%r1 + alcgr $h2,%r1 + + srlg $h2,$h2,2 # did it borrow/carry? + slgr %r1,$h2 # 0-$h2>>2 + lg $h2,0($nonce) # load nonce + lghi %r0,-1 + lg $ctx,8($nonce) + xgr %r0,%r1 # ~%r1 + + ngr $h0,%r1 + ngr $d0,%r0 + ngr $h1,%r1 + ngr $d1,%r0 + ogr $h0,$d0 + rllg $d0,$h2,32 # flip nonce words + ogr $h1,$d1 + rllg $d1,$ctx,32 + + algr $h0,$d0 # accumulate nonce + alcgr $h1,$d1 + + strvg $h0,0($mac) # write little-endian result + strvg $h1,8($mac) + + lm${g} %r6,%r9,`6*$SIZE_T`($sp) + br %r14 +.size poly1305_emit,.-poly1305_emit + +.string "Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>" +___ +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\b(srlg\s+)(%r[0-9]+\s*,)\s*([0-9]+)/$1$2$2$3/gm; + +print $code; +close STDOUT or die "error closing STDOUT: $!"; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-sparcv9.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-sparcv9.pl new file mode 100755 index 000000000..997e0d834 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-sparcv9.pl @@ -0,0 +1,1120 @@ +#! /usr/bin/env perl +# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for SPARCv9, vanilla, as well +# as VIS3 and FMA extensions. +# +# May, August 2015 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU(*) FMA +# +# UltraSPARC III 12.3(**) +# SPARC T3 7.92 +# SPARC T4 1.70(***) 6.55 +# SPARC64 X 5.60 3.64 +# +# (*) Comparison to compiler-generated code is really problematic, +# because latter's performance varies too much depending on too +# many variables. For example, one can measure from 5x to 15x +# improvement on T4 for gcc-4.6. Well, in T4 case it's a bit +# unfair comparison, because compiler doesn't use VIS3, but +# given same initial conditions coefficient varies from 3x to 9x. +# (**) Pre-III performance should be even worse; floating-point +# performance for UltraSPARC I-IV on the other hand is reported +# to be 4.25 for hand-coded assembly, but they are just too old +# to care about. +# (***) Multi-process benchmark saturates at ~12.5x single-process +# result on 8-core processor, or ~21GBps per 2.85GHz socket. + +my $output = pop; +open STDOUT,">$output"; + +my ($ctx,$inp,$len,$padbit,$shl,$shr) = map("%i$_",(0..5)); +my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4) = map("%l$_",(0..7)); +my ($h0,$h1,$h2,$h3, $t0,$t1,$t2) = map("%o$_",(0..5,7)); +my ($d0,$d1,$d2,$d3) = map("%g$_",(1..4)); + +my $output = pop; +open STDOUT,">$stdout"; + +$code.=<<___; +#include "sparc_arch.h" + +#ifdef __arch64__ +.register %g2,#scratch +.register %g3,#scratch +# define STPTR stx +# define SIZE_T 8 +#else +# define STPTR st +# define SIZE_T 4 +#endif +#define LOCALS (STACK_BIAS+STACK_FRAME) + +.section ".text",#alloc,#execinstr + +#ifdef __PIC__ +SPARC_PIC_THUNK(%g1) +#endif + +.globl poly1305_init +.align 32 +poly1305_init: + save %sp,-STACK_FRAME-16,%sp + nop + + SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1) + ld [%g1],%g1 + + and %g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1 + cmp %g1,SPARCV9_FMADD + be .Lpoly1305_init_fma + nop + + stx %g0,[$ctx+0] + stx %g0,[$ctx+8] ! zero hash value + brz,pn $inp,.Lno_key + stx %g0,[$ctx+16] + + and $inp,7,$shr ! alignment factor + andn $inp,7,$inp + sll $shr,3,$shr ! *8 + neg $shr,$shl + + sethi %hi(0x0ffffffc),$t0 + set 8,$h1 + or $t0,%lo(0x0ffffffc),$t0 + set 16,$h2 + sllx $t0,32,$t1 + or $t0,$t1,$t1 ! 0x0ffffffc0ffffffc + or $t1,3,$t0 ! 0x0ffffffc0fffffff + + ldxa [$inp+%g0]0x88,$h0 ! load little-endian key + brz,pt $shr,.Lkey_aligned + ldxa [$inp+$h1]0x88,$h1 + + ldxa [$inp+$h2]0x88,$h2 + srlx $h0,$shr,$h0 + sllx $h1,$shl,$t2 + srlx $h1,$shr,$h1 + or $t2,$h0,$h0 + sllx $h2,$shl,$h2 + or $h2,$h1,$h1 + +.Lkey_aligned: + and $t0,$h0,$h0 + and $t1,$h1,$h1 + stx $h0,[$ctx+32+0] ! store key + stx $h1,[$ctx+32+8] + + andcc %g1,SPARCV9_VIS3,%g0 + be .Lno_key + nop + +1: call .+8 + add %o7,poly1305_blocks_vis3-1b,%o7 + + add %o7,poly1305_emit-poly1305_blocks_vis3,%o5 + STPTR %o7,[%i2] + STPTR %o5,[%i2+SIZE_T] + + ret + restore %g0,1,%o0 ! return 1 + +.Lno_key: + ret + restore %g0,%g0,%o0 ! return 0 +.type poly1305_init,#function +.size poly1305_init,.-poly1305_init + +.globl poly1305_blocks +.align 32 +poly1305_blocks: + save %sp,-STACK_FRAME,%sp + srln $len,4,$len + + brz,pn $len,.Lno_data + nop + + ld [$ctx+32+0],$r1 ! load key + ld [$ctx+32+4],$r0 + ld [$ctx+32+8],$r3 + ld [$ctx+32+12],$r2 + + ld [$ctx+0],$h1 ! load hash value + ld [$ctx+4],$h0 + ld [$ctx+8],$h3 + ld [$ctx+12],$h2 + ld [$ctx+16],$h4 + + and $inp,7,$shr ! alignment factor + andn $inp,7,$inp + set 8,$d1 + sll $shr,3,$shr ! *8 + set 16,$d2 + neg $shr,$shl + + srl $r1,2,$s1 + srl $r2,2,$s2 + add $r1,$s1,$s1 + srl $r3,2,$s3 + add $r2,$s2,$s2 + add $r3,$s3,$s3 + +.Loop: + ldxa [$inp+%g0]0x88,$d0 ! load little-endian input + brz,pt $shr,.Linp_aligned + ldxa [$inp+$d1]0x88,$d1 + + ldxa [$inp+$d2]0x88,$d2 + srlx $d0,$shr,$d0 + sllx $d1,$shl,$t1 + srlx $d1,$shr,$d1 + or $t1,$d0,$d0 + sllx $d2,$shl,$d2 + or $d2,$d1,$d1 + +.Linp_aligned: + srlx $d0,32,$t0 + addcc $d0,$h0,$h0 ! accumulate input + srlx $d1,32,$t1 + addccc $t0,$h1,$h1 + addccc $d1,$h2,$h2 + addccc $t1,$h3,$h3 + addc $padbit,$h4,$h4 + + umul $r0,$h0,$d0 + umul $r1,$h0,$d1 + umul $r2,$h0,$d2 + umul $r3,$h0,$d3 + sub $len,1,$len + add $inp,16,$inp + + umul $s3,$h1,$t0 + umul $r0,$h1,$t1 + umul $r1,$h1,$t2 + add $t0,$d0,$d0 + add $t1,$d1,$d1 + umul $r2,$h1,$t0 + add $t2,$d2,$d2 + add $t0,$d3,$d3 + + umul $s2,$h2,$t1 + umul $s3,$h2,$t2 + umul $r0,$h2,$t0 + add $t1,$d0,$d0 + add $t2,$d1,$d1 + umul $r1,$h2,$t1 + add $t0,$d2,$d2 + add $t1,$d3,$d3 + + umul $s1,$h3,$t2 + umul $s2,$h3,$t0 + umul $s3,$h3,$t1 + add $t2,$d0,$d0 + add $t0,$d1,$d1 + umul $r0,$h3,$t2 + add $t1,$d2,$d2 + add $t2,$d3,$d3 + + umul $s1,$h4,$t0 + umul $s2,$h4,$t1 + umul $s3,$h4,$t2 + umul $r0,$h4,$h4 + add $t0,$d1,$d1 + add $t1,$d2,$d2 + srlx $d0,32,$h1 + add $t2,$d3,$d3 + srlx $d1,32,$h2 + + addcc $d1,$h1,$h1 + srlx $d2,32,$h3 + set 8,$d1 + addccc $d2,$h2,$h2 + srlx $d3,32,$t0 + set 16,$d2 + addccc $d3,$h3,$h3 + addc $t0,$h4,$h4 + + srl $h4,2,$t0 ! final reduction step + andn $h4,3,$t1 + and $h4,3,$h4 + add $t1,$t0,$t0 + + addcc $t0,$d0,$h0 + addccc %g0,$h1,$h1 + addccc %g0,$h2,$h2 + addccc %g0,$h3,$h3 + brnz,pt $len,.Loop + addc %g0,$h4,$h4 + + st $h1,[$ctx+0] ! store hash value + st $h0,[$ctx+4] + st $h3,[$ctx+8] + st $h2,[$ctx+12] + st $h4,[$ctx+16] + +.Lno_data: + ret + restore +.type poly1305_blocks,#function +.size poly1305_blocks,.-poly1305_blocks +___ +######################################################################## +# VIS3 has umulxhi and addxc... +{ +my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7)); +my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4)); + +$code.=<<___; +.align 32 +poly1305_blocks_vis3: + save %sp,-STACK_FRAME,%sp + srln $len,4,$len + + brz,pn $len,.Lno_data + nop + + ldx [$ctx+32+0],$R0 ! load key + ldx [$ctx+32+8],$R1 + + ldx [$ctx+0],$H0 ! load hash value + ldx [$ctx+8],$H1 + ld [$ctx+16],$H2 + + and $inp,7,$shr ! alignment factor + andn $inp,7,$inp + set 8,$r1 + sll $shr,3,$shr ! *8 + set 16,$r2 + neg $shr,$shl + + srlx $R1,2,$S1 + b .Loop_vis3 + add $R1,$S1,$S1 + +.Loop_vis3: + ldxa [$inp+%g0]0x88,$D0 ! load little-endian input + brz,pt $shr,.Linp_aligned_vis3 + ldxa [$inp+$r1]0x88,$D1 + + ldxa [$inp+$r2]0x88,$D2 + srlx $D0,$shr,$D0 + sllx $D1,$shl,$T1 + srlx $D1,$shr,$D1 + or $T1,$D0,$D0 + sllx $D2,$shl,$D2 + or $D2,$D1,$D1 + +.Linp_aligned_vis3: + addcc $D0,$H0,$H0 ! accumulate input + sub $len,1,$len + addxccc $D1,$H1,$H1 + add $inp,16,$inp + + mulx $R0,$H0,$D0 ! r0*h0 + addxc $padbit,$H2,$H2 + umulxhi $R0,$H0,$D1 + mulx $S1,$H1,$T0 ! s1*h1 + umulxhi $S1,$H1,$T1 + addcc $T0,$D0,$D0 + mulx $R1,$H0,$T0 ! r1*h0 + addxc $T1,$D1,$D1 + umulxhi $R1,$H0,$D2 + addcc $T0,$D1,$D1 + mulx $R0,$H1,$T0 ! r0*h1 + addxc %g0,$D2,$D2 + umulxhi $R0,$H1,$T1 + addcc $T0,$D1,$D1 + mulx $S1,$H2,$T0 ! s1*h2 + addxc $T1,$D2,$D2 + mulx $R0,$H2,$T1 ! r0*h2 + addcc $T0,$D1,$D1 + addxc $T1,$D2,$D2 + + srlx $D2,2,$T0 ! final reduction step + andn $D2,3,$T1 + and $D2,3,$H2 + add $T1,$T0,$T0 + + addcc $T0,$D0,$H0 + addxccc %g0,$D1,$H1 + brnz,pt $len,.Loop_vis3 + addxc %g0,$H2,$H2 + + stx $H0,[$ctx+0] ! store hash value + stx $H1,[$ctx+8] + st $H2,[$ctx+16] + + ret + restore +.type poly1305_blocks_vis3,#function +.size poly1305_blocks_vis3,.-poly1305_blocks_vis3 +___ +} +my ($mac,$nonce) = ($inp,$len); + +$code.=<<___; +.globl poly1305_emit +.align 32 +poly1305_emit: + save %sp,-STACK_FRAME,%sp + + ld [$ctx+0],$h1 ! load hash value + ld [$ctx+4],$h0 + ld [$ctx+8],$h3 + ld [$ctx+12],$h2 + ld [$ctx+16],$h4 + + addcc $h0,5,$r0 ! compare to modulus + addccc $h1,0,$r1 + addccc $h2,0,$r2 + addccc $h3,0,$r3 + addc $h4,0,$h4 + andcc $h4,4,%g0 ! did it carry/borrow? + + movnz %icc,$r0,$h0 + ld [$nonce+0],$r0 ! load nonce + movnz %icc,$r1,$h1 + ld [$nonce+4],$r1 + movnz %icc,$r2,$h2 + ld [$nonce+8],$r2 + movnz %icc,$r3,$h3 + ld [$nonce+12],$r3 + + addcc $r0,$h0,$h0 ! accumulate nonce + addccc $r1,$h1,$h1 + addccc $r2,$h2,$h2 + addc $r3,$h3,$h3 + + srl $h0,8,$r0 + stb $h0,[$mac+0] ! store little-endian result + srl $h0,16,$r1 + stb $r0,[$mac+1] + srl $h0,24,$r2 + stb $r1,[$mac+2] + stb $r2,[$mac+3] + + srl $h1,8,$r0 + stb $h1,[$mac+4] + srl $h1,16,$r1 + stb $r0,[$mac+5] + srl $h1,24,$r2 + stb $r1,[$mac+6] + stb $r2,[$mac+7] + + srl $h2,8,$r0 + stb $h2,[$mac+8] + srl $h2,16,$r1 + stb $r0,[$mac+9] + srl $h2,24,$r2 + stb $r1,[$mac+10] + stb $r2,[$mac+11] + + srl $h3,8,$r0 + stb $h3,[$mac+12] + srl $h3,16,$r1 + stb $r0,[$mac+13] + srl $h3,24,$r2 + stb $r1,[$mac+14] + stb $r2,[$mac+15] + + ret + restore +.type poly1305_emit,#function +.size poly1305_emit,.-poly1305_emit +___ + +{ +my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3)); +my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4)); +my ($i1,$step,$shr,$shl) = map("%l$_",(0..7)); +my $i2=$step; + +my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi, + $two0,$two32,$two64,$two96,$two130,$five_two130, + $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi, + $s2lo,$s2hi,$s3lo,$s3hi, + $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31)); +# borrowings +my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi); +my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi); +my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo); + +$code.=<<___; +.align 32 +poly1305_init_fma: + save %sp,-STACK_FRAME-16,%sp + nop + +.Lpoly1305_init_fma: +1: call .+8 + add %o7,.Lconsts_fma-1b,%o7 + + ldd [%o7+8*0],$two0 ! load constants + ldd [%o7+8*1],$two32 + ldd [%o7+8*2],$two64 + ldd [%o7+8*3],$two96 + ldd [%o7+8*5],$five_two130 + + std $two0,[$ctx+8*0] ! initial hash value, biased 0 + std $two32,[$ctx+8*1] + std $two64,[$ctx+8*2] + std $two96,[$ctx+8*3] + + brz,pn $inp,.Lno_key_fma + nop + + stx %fsr,[%sp+LOCALS] ! save original %fsr + ldx [%o7+8*6],%fsr ! load new %fsr + + std $two0,[$ctx+8*4] ! key "template" + std $two32,[$ctx+8*5] + std $two64,[$ctx+8*6] + std $two96,[$ctx+8*7] + + and $inp,7,$shr + andn $inp,7,$inp ! align pointer + mov 8,$i1 + sll $shr,3,$shr + mov 16,$i2 + neg $shr,$shl + + ldxa [$inp+%g0]0x88,$in0 ! load little-endian key + ldxa [$inp+$i1]0x88,$in2 + + brz $shr,.Lkey_aligned_fma + sethi %hi(0xf0000000),$i1 ! 0xf0000000 + + ldxa [$inp+$i2]0x88,$in4 + + srlx $in0,$shr,$in0 ! align data + sllx $in2,$shl,$in1 + srlx $in2,$shr,$in2 + or $in1,$in0,$in0 + sllx $in4,$shl,$in3 + or $in3,$in2,$in2 + +.Lkey_aligned_fma: + or $i1,3,$i2 ! 0xf0000003 + srlx $in0,32,$in1 + andn $in0,$i1,$in0 ! &=0x0fffffff + andn $in1,$i2,$in1 ! &=0x0ffffffc + srlx $in2,32,$in3 + andn $in2,$i2,$in2 + andn $in3,$i2,$in3 + + st $in0,[$ctx+`8*4+4`] ! fill "template" + st $in1,[$ctx+`8*5+4`] + st $in2,[$ctx+`8*6+4`] + st $in3,[$ctx+`8*7+4`] + + ldd [$ctx+8*4],$h0lo ! load [biased] key + ldd [$ctx+8*5],$h1lo + ldd [$ctx+8*6],$h2lo + ldd [$ctx+8*7],$h3lo + + fsubd $h0lo,$two0, $h0lo ! r0 + ldd [%o7+8*7],$two0 ! more constants + fsubd $h1lo,$two32,$h1lo ! r1 + ldd [%o7+8*8],$two32 + fsubd $h2lo,$two64,$h2lo ! r2 + ldd [%o7+8*9],$two64 + fsubd $h3lo,$two96,$h3lo ! r3 + ldd [%o7+8*10],$two96 + + fmuld $five_two130,$h1lo,$s1lo ! s1 + fmuld $five_two130,$h2lo,$s2lo ! s2 + fmuld $five_two130,$h3lo,$s3lo ! s3 + + faddd $h0lo,$two0, $h0hi + faddd $h1lo,$two32,$h1hi + faddd $h2lo,$two64,$h2hi + faddd $h3lo,$two96,$h3hi + + fsubd $h0hi,$two0, $h0hi + ldd [%o7+8*11],$two0 ! more constants + fsubd $h1hi,$two32,$h1hi + ldd [%o7+8*12],$two32 + fsubd $h2hi,$two64,$h2hi + ldd [%o7+8*13],$two64 + fsubd $h3hi,$two96,$h3hi + + fsubd $h0lo,$h0hi,$h0lo + std $h0hi,[$ctx+8*5] ! r0hi + fsubd $h1lo,$h1hi,$h1lo + std $h1hi,[$ctx+8*7] ! r1hi + fsubd $h2lo,$h2hi,$h2lo + std $h2hi,[$ctx+8*9] ! r2hi + fsubd $h3lo,$h3hi,$h3lo + std $h3hi,[$ctx+8*11] ! r3hi + + faddd $s1lo,$two0, $s1hi + faddd $s2lo,$two32,$s2hi + faddd $s3lo,$two64,$s3hi + + fsubd $s1hi,$two0, $s1hi + fsubd $s2hi,$two32,$s2hi + fsubd $s3hi,$two64,$s3hi + + fsubd $s1lo,$s1hi,$s1lo + fsubd $s2lo,$s2hi,$s2lo + fsubd $s3lo,$s3hi,$s3lo + + ldx [%sp+LOCALS],%fsr ! restore %fsr + + std $h0lo,[$ctx+8*4] ! r0lo + std $h1lo,[$ctx+8*6] ! r1lo + std $h2lo,[$ctx+8*8] ! r2lo + std $h3lo,[$ctx+8*10] ! r3lo + + std $s1hi,[$ctx+8*13] + std $s2hi,[$ctx+8*15] + std $s3hi,[$ctx+8*17] + + std $s1lo,[$ctx+8*12] + std $s2lo,[$ctx+8*14] + std $s3lo,[$ctx+8*16] + + add %o7,poly1305_blocks_fma-.Lconsts_fma,%o0 + add %o7,poly1305_emit_fma-.Lconsts_fma,%o1 + STPTR %o0,[%i2] + STPTR %o1,[%i2+SIZE_T] + + ret + restore %g0,1,%o0 ! return 1 + +.Lno_key_fma: + ret + restore %g0,%g0,%o0 ! return 0 +.type poly1305_init_fma,#function +.size poly1305_init_fma,.-poly1305_init_fma + +.align 32 +poly1305_blocks_fma: + save %sp,-STACK_FRAME-48,%sp + srln $len,4,$len + + brz,pn $len,.Labort + sub $len,1,$len + +1: call .+8 + add %o7,.Lconsts_fma-1b,%o7 + + ldd [%o7+8*0],$two0 ! load constants + ldd [%o7+8*1],$two32 + ldd [%o7+8*2],$two64 + ldd [%o7+8*3],$two96 + ldd [%o7+8*4],$two130 + ldd [%o7+8*5],$five_two130 + + ldd [$ctx+8*0],$h0lo ! load [biased] hash value + ldd [$ctx+8*1],$h1lo + ldd [$ctx+8*2],$h2lo + ldd [$ctx+8*3],$h3lo + + std $two0,[%sp+LOCALS+8*0] ! input "template" + sethi %hi((1023+52+96)<<20),$in3 + std $two32,[%sp+LOCALS+8*1] + or $padbit,$in3,$in3 + std $two64,[%sp+LOCALS+8*2] + st $in3,[%sp+LOCALS+8*3] + + and $inp,7,$shr + andn $inp,7,$inp ! align pointer + mov 8,$i1 + sll $shr,3,$shr + mov 16,$step + neg $shr,$shl + + ldxa [$inp+%g0]0x88,$in0 ! load little-endian input + brz $shr,.Linp_aligned_fma + ldxa [$inp+$i1]0x88,$in2 + + ldxa [$inp+$step]0x88,$in4 + add $inp,8,$inp + + srlx $in0,$shr,$in0 ! align data + sllx $in2,$shl,$in1 + srlx $in2,$shr,$in2 + or $in1,$in0,$in0 + sllx $in4,$shl,$in3 + srlx $in4,$shr,$in4 ! pre-shift + or $in3,$in2,$in2 + +.Linp_aligned_fma: + srlx $in0,32,$in1 + movrz $len,0,$step + srlx $in2,32,$in3 + add $step,$inp,$inp ! conditional advance + + st $in0,[%sp+LOCALS+8*0+4] ! fill "template" + st $in1,[%sp+LOCALS+8*1+4] + st $in2,[%sp+LOCALS+8*2+4] + st $in3,[%sp+LOCALS+8*3+4] + + ldd [$ctx+8*4],$r0lo ! load key + ldd [$ctx+8*5],$r0hi + ldd [$ctx+8*6],$r1lo + ldd [$ctx+8*7],$r1hi + ldd [$ctx+8*8],$r2lo + ldd [$ctx+8*9],$r2hi + ldd [$ctx+8*10],$r3lo + ldd [$ctx+8*11],$r3hi + ldd [$ctx+8*12],$s1lo + ldd [$ctx+8*13],$s1hi + ldd [$ctx+8*14],$s2lo + ldd [$ctx+8*15],$s2hi + ldd [$ctx+8*16],$s3lo + ldd [$ctx+8*17],$s3hi + + stx %fsr,[%sp+LOCALS+8*4] ! save original %fsr + ldx [%o7+8*6],%fsr ! load new %fsr + + subcc $len,1,$len + movrz $len,0,$step + + ldd [%sp+LOCALS+8*0],$x0 ! load biased input + ldd [%sp+LOCALS+8*1],$x1 + ldd [%sp+LOCALS+8*2],$x2 + ldd [%sp+LOCALS+8*3],$x3 + + fsubd $h0lo,$two0, $h0lo ! de-bias hash value + fsubd $h1lo,$two32,$h1lo + ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load + fsubd $h2lo,$two64,$h2lo + fsubd $h3lo,$two96,$h3lo + ldxa [$inp+$i1]0x88,$in2 + + fsubd $x0,$two0, $x0 ! de-bias input + fsubd $x1,$two32,$x1 + fsubd $x2,$two64,$x2 + fsubd $x3,$two96,$x3 + + brz $shr,.Linp_aligned_fma2 + add $step,$inp,$inp ! conditional advance + + sllx $in0,$shl,$in1 ! align data + srlx $in0,$shr,$in3 + or $in1,$in4,$in0 + sllx $in2,$shl,$in1 + srlx $in2,$shr,$in4 ! pre-shift + or $in3,$in1,$in2 +.Linp_aligned_fma2: + srlx $in0,32,$in1 + srlx $in2,32,$in3 + + faddd $h0lo,$x0,$x0 ! accumulate input + stw $in0,[%sp+LOCALS+8*0+4] + faddd $h1lo,$x1,$x1 + stw $in1,[%sp+LOCALS+8*1+4] + faddd $h2lo,$x2,$x2 + stw $in2,[%sp+LOCALS+8*2+4] + faddd $h3lo,$x3,$x3 + stw $in3,[%sp+LOCALS+8*3+4] + + b .Lentry_fma + nop + +.align 16 +.Loop_fma: + ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load + ldxa [$inp+$i1]0x88,$in2 + movrz $len,0,$step + + faddd $y0,$h0lo,$h0lo ! accumulate input + faddd $y1,$h0hi,$h0hi + faddd $y2,$h2lo,$h2lo + faddd $y3,$h2hi,$h2hi + + brz,pn $shr,.Linp_aligned_fma3 + add $step,$inp,$inp ! conditional advance + + sllx $in0,$shl,$in1 ! align data + srlx $in0,$shr,$in3 + or $in1,$in4,$in0 + sllx $in2,$shl,$in1 + srlx $in2,$shr,$in4 ! pre-shift + or $in3,$in1,$in2 + +.Linp_aligned_fma3: + !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32 + faddd $two64,$h1lo,$c1lo + srlx $in0,32,$in1 + faddd $two64,$h1hi,$c1hi + srlx $in2,32,$in3 + faddd $two130,$h3lo,$c3lo + st $in0,[%sp+LOCALS+8*0+4] ! fill "template" + faddd $two130,$h3hi,$c3hi + st $in1,[%sp+LOCALS+8*1+4] + faddd $two32,$h0lo,$c0lo + st $in2,[%sp+LOCALS+8*2+4] + faddd $two32,$h0hi,$c0hi + st $in3,[%sp+LOCALS+8*3+4] + faddd $two96,$h2lo,$c2lo + faddd $two96,$h2hi,$c2hi + + fsubd $c1lo,$two64,$c1lo + fsubd $c1hi,$two64,$c1hi + fsubd $c3lo,$two130,$c3lo + fsubd $c3hi,$two130,$c3hi + fsubd $c0lo,$two32,$c0lo + fsubd $c0hi,$two32,$c0hi + fsubd $c2lo,$two96,$c2lo + fsubd $c2hi,$two96,$c2hi + + fsubd $h1lo,$c1lo,$h1lo + fsubd $h1hi,$c1hi,$h1hi + fsubd $h3lo,$c3lo,$h3lo + fsubd $h3hi,$c3hi,$h3hi + fsubd $h2lo,$c2lo,$h2lo + fsubd $h2hi,$c2hi,$h2hi + fsubd $h0lo,$c0lo,$h0lo + fsubd $h0hi,$c0hi,$h0hi + + faddd $h1lo,$c0lo,$h1lo + faddd $h1hi,$c0hi,$h1hi + faddd $h3lo,$c2lo,$h3lo + faddd $h3hi,$c2hi,$h3hi + faddd $h2lo,$c1lo,$h2lo + faddd $h2hi,$c1hi,$h2hi + fmaddd $five_two130,$c3lo,$h0lo,$h0lo + fmaddd $five_two130,$c3hi,$h0hi,$h0hi + + faddd $h1lo,$h1hi,$x1 + ldd [$ctx+8*12],$s1lo ! reload constants + faddd $h3lo,$h3hi,$x3 + ldd [$ctx+8*13],$s1hi + faddd $h2lo,$h2hi,$x2 + ldd [$ctx+8*10],$r3lo + faddd $h0lo,$h0hi,$x0 + ldd [$ctx+8*11],$r3hi + +.Lentry_fma: + fmuld $x1,$s3lo,$h0lo + fmuld $x1,$s3hi,$h0hi + fmuld $x1,$r1lo,$h2lo + fmuld $x1,$r1hi,$h2hi + fmuld $x1,$r0lo,$h1lo + fmuld $x1,$r0hi,$h1hi + fmuld $x1,$r2lo,$h3lo + fmuld $x1,$r2hi,$h3hi + + fmaddd $x3,$s1lo,$h0lo,$h0lo + fmaddd $x3,$s1hi,$h0hi,$h0hi + fmaddd $x3,$s3lo,$h2lo,$h2lo + fmaddd $x3,$s3hi,$h2hi,$h2hi + fmaddd $x3,$s2lo,$h1lo,$h1lo + fmaddd $x3,$s2hi,$h1hi,$h1hi + fmaddd $x3,$r0lo,$h3lo,$h3lo + fmaddd $x3,$r0hi,$h3hi,$h3hi + + fmaddd $x2,$s2lo,$h0lo,$h0lo + fmaddd $x2,$s2hi,$h0hi,$h0hi + fmaddd $x2,$r0lo,$h2lo,$h2lo + fmaddd $x2,$r0hi,$h2hi,$h2hi + fmaddd $x2,$s3lo,$h1lo,$h1lo + ldd [%sp+LOCALS+8*0],$y0 ! load [biased] input + fmaddd $x2,$s3hi,$h1hi,$h1hi + ldd [%sp+LOCALS+8*1],$y1 + fmaddd $x2,$r1lo,$h3lo,$h3lo + ldd [%sp+LOCALS+8*2],$y2 + fmaddd $x2,$r1hi,$h3hi,$h3hi + ldd [%sp+LOCALS+8*3],$y3 + + fmaddd $x0,$r0lo,$h0lo,$h0lo + fsubd $y0,$two0, $y0 ! de-bias input + fmaddd $x0,$r0hi,$h0hi,$h0hi + fsubd $y1,$two32,$y1 + fmaddd $x0,$r2lo,$h2lo,$h2lo + fsubd $y2,$two64,$y2 + fmaddd $x0,$r2hi,$h2hi,$h2hi + fsubd $y3,$two96,$y3 + fmaddd $x0,$r1lo,$h1lo,$h1lo + fmaddd $x0,$r1hi,$h1hi,$h1hi + fmaddd $x0,$r3lo,$h3lo,$h3lo + fmaddd $x0,$r3hi,$h3hi,$h3hi + + bcc SIZE_T_CC,.Loop_fma + subcc $len,1,$len + + !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32 + faddd $h0lo,$two32,$c0lo + faddd $h0hi,$two32,$c0hi + faddd $h2lo,$two96,$c2lo + faddd $h2hi,$two96,$c2hi + faddd $h1lo,$two64,$c1lo + faddd $h1hi,$two64,$c1hi + faddd $h3lo,$two130,$c3lo + faddd $h3hi,$two130,$c3hi + + fsubd $c0lo,$two32,$c0lo + fsubd $c0hi,$two32,$c0hi + fsubd $c2lo,$two96,$c2lo + fsubd $c2hi,$two96,$c2hi + fsubd $c1lo,$two64,$c1lo + fsubd $c1hi,$two64,$c1hi + fsubd $c3lo,$two130,$c3lo + fsubd $c3hi,$two130,$c3hi + + fsubd $h1lo,$c1lo,$h1lo + fsubd $h1hi,$c1hi,$h1hi + fsubd $h3lo,$c3lo,$h3lo + fsubd $h3hi,$c3hi,$h3hi + fsubd $h2lo,$c2lo,$h2lo + fsubd $h2hi,$c2hi,$h2hi + fsubd $h0lo,$c0lo,$h0lo + fsubd $h0hi,$c0hi,$h0hi + + faddd $h1lo,$c0lo,$h1lo + faddd $h1hi,$c0hi,$h1hi + faddd $h3lo,$c2lo,$h3lo + faddd $h3hi,$c2hi,$h3hi + faddd $h2lo,$c1lo,$h2lo + faddd $h2hi,$c1hi,$h2hi + fmaddd $five_two130,$c3lo,$h0lo,$h0lo + fmaddd $five_two130,$c3hi,$h0hi,$h0hi + + faddd $h1lo,$h1hi,$x1 + faddd $h3lo,$h3hi,$x3 + faddd $h2lo,$h2hi,$x2 + faddd $h0lo,$h0hi,$x0 + + faddd $x1,$two32,$x1 ! bias + faddd $x3,$two96,$x3 + faddd $x2,$two64,$x2 + faddd $x0,$two0, $x0 + + ldx [%sp+LOCALS+8*4],%fsr ! restore saved %fsr + + std $x1,[$ctx+8*1] ! store [biased] hash value + std $x3,[$ctx+8*3] + std $x2,[$ctx+8*2] + std $x0,[$ctx+8*0] + +.Labort: + ret + restore +.type poly1305_blocks_fma,#function +.size poly1305_blocks_fma,.-poly1305_blocks_fma +___ +{ +my ($mac,$nonce)=($inp,$len); + +my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask + ) = (map("%l$_",(0..5)),map("%o$_",(0..4))); + +$code.=<<___; +.align 32 +poly1305_emit_fma: + save %sp,-STACK_FRAME,%sp + + ld [$ctx+8*0+0],$d0 ! load hash + ld [$ctx+8*0+4],$h0 + ld [$ctx+8*1+0],$d1 + ld [$ctx+8*1+4],$h1 + ld [$ctx+8*2+0],$d2 + ld [$ctx+8*2+4],$h2 + ld [$ctx+8*3+0],$d3 + ld [$ctx+8*3+4],$h3 + + sethi %hi(0xfff00000),$mask + andn $d0,$mask,$d0 ! mask exponent + andn $d1,$mask,$d1 + andn $d2,$mask,$d2 + andn $d3,$mask,$d3 ! can be partially reduced... + mov 3,$mask + + srl $d3,2,$padbit ! ... so reduce + and $d3,$mask,$h4 + andn $d3,$mask,$d3 + add $padbit,$d3,$d3 + + addcc $d3,$h0,$h0 + addccc $d0,$h1,$h1 + addccc $d1,$h2,$h2 + addccc $d2,$h3,$h3 + addc %g0,$h4,$h4 + + addcc $h0,5,$d0 ! compare to modulus + addccc $h1,0,$d1 + addccc $h2,0,$d2 + addccc $h3,0,$d3 + addc $h4,0,$mask + + srl $mask,2,$mask ! did it carry/borrow? + neg $mask,$mask + sra $mask,31,$mask ! mask + + andn $h0,$mask,$h0 + and $d0,$mask,$d0 + andn $h1,$mask,$h1 + and $d1,$mask,$d1 + or $d0,$h0,$h0 + ld [$nonce+0],$d0 ! load nonce + andn $h2,$mask,$h2 + and $d2,$mask,$d2 + or $d1,$h1,$h1 + ld [$nonce+4],$d1 + andn $h3,$mask,$h3 + and $d3,$mask,$d3 + or $d2,$h2,$h2 + ld [$nonce+8],$d2 + or $d3,$h3,$h3 + ld [$nonce+12],$d3 + + addcc $d0,$h0,$h0 ! accumulate nonce + addccc $d1,$h1,$h1 + addccc $d2,$h2,$h2 + addc $d3,$h3,$h3 + + stb $h0,[$mac+0] ! write little-endian result + srl $h0,8,$h0 + stb $h1,[$mac+4] + srl $h1,8,$h1 + stb $h2,[$mac+8] + srl $h2,8,$h2 + stb $h3,[$mac+12] + srl $h3,8,$h3 + + stb $h0,[$mac+1] + srl $h0,8,$h0 + stb $h1,[$mac+5] + srl $h1,8,$h1 + stb $h2,[$mac+9] + srl $h2,8,$h2 + stb $h3,[$mac+13] + srl $h3,8,$h3 + + stb $h0,[$mac+2] + srl $h0,8,$h0 + stb $h1,[$mac+6] + srl $h1,8,$h1 + stb $h2,[$mac+10] + srl $h2,8,$h2 + stb $h3,[$mac+14] + srl $h3,8,$h3 + + stb $h0,[$mac+3] + stb $h1,[$mac+7] + stb $h2,[$mac+11] + stb $h3,[$mac+15] + + ret + restore +.type poly1305_emit_fma,#function +.size poly1305_emit_fma,.-poly1305_emit_fma +___ +} + +$code.=<<___; +.align 64 +.Lconsts_fma: +.word 0x43300000,0x00000000 ! 2^(52+0) +.word 0x45300000,0x00000000 ! 2^(52+32) +.word 0x47300000,0x00000000 ! 2^(52+64) +.word 0x49300000,0x00000000 ! 2^(52+96) +.word 0x4b500000,0x00000000 ! 2^(52+130) + +.word 0x37f40000,0x00000000 ! 5/2^130 +.word 0,1<<30 ! fsr: truncate, no exceptions + +.word 0x44300000,0x00000000 ! 2^(52+16+0) +.word 0x46300000,0x00000000 ! 2^(52+16+32) +.word 0x48300000,0x00000000 ! 2^(52+16+64) +.word 0x4a300000,0x00000000 ! 2^(52+16+96) +.word 0x3e300000,0x00000000 ! 2^(52+16+0-96) +.word 0x40300000,0x00000000 ! 2^(52+16+32-96) +.word 0x42300000,0x00000000 ! 2^(52+16+64-96) +.asciz "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>" +.align 4 +___ +} + +# Purpose of these subroutines is to explicitly encode VIS instructions, +# so that one can compile the module without having to specify VIS +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# Idea is to reserve for option to produce "universal" binary and let +# programmer detect if current CPU is VIS capable at run-time. +sub unvis3 { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); +my ($ref,$opf); +my %visopf = ( "addxc" => 0x011, + "addxccc" => 0x013, + "umulxhi" => 0x016 ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if ($opf=$visopf{$mnemonic}) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%([goli])([0-9])/); + $_=$bias{$1}+$2; + } + + return sprintf ".word\t0x%08x !%s", + 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +sub unfma { +my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_; +my ($ref,$opf); +my %fmaopf = ( "fmadds" => 0x1, + "fmaddd" => 0x2, + "fmsubs" => 0x5, + "fmsubd" => 0x6 ); + + $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd"; + + if ($opf=$fmaopf{$mnemonic}) { + foreach ($rs1,$rs2,$rs3,$rd) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($1&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + + return sprintf ".word\t0x%08x !%s", + 0x81b80000|$rd<<25|$rs1<<14|$rs3<<9|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ + &unvis3($1,$2,$3,$4) + /ge or + s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/ + &unfma($1,$2,$3,$4,$5) + /ge; + + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-x86.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-x86.pl new file mode 100755 index 000000000..4aaf63a0a --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-x86.pl @@ -0,0 +1,1815 @@ +#! /usr/bin/env perl +# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for x86. +# +# April 2015 +# +# Numbers are cycles per processed byte with poly1305_blocks alone, +# measured with rdtsc at fixed clock frequency. +# +# IALU/gcc-3.4(*) SSE2(**) AVX2 +# Pentium 15.7/+80% - +# PIII 6.21/+90% - +# P4 19.8/+40% 3.24 +# Core 2 4.85/+90% 1.80 +# Westmere 4.58/+100% 1.43 +# Sandy Bridge 3.90/+100% 1.36 +# Haswell 3.88/+70% 1.18 0.72 +# Skylake 3.10/+60% 1.14 0.62 +# Silvermont 11.0/+40% 4.80 +# Goldmont 4.10/+200% 2.10 +# VIA Nano 6.71/+90% 2.47 +# Sledgehammer 3.51/+180% 4.27 +# Bulldozer 4.53/+140% 1.31 +# +# (*) gcc 4.8 for some reason generated worse code; +# (**) besides SSE2 there are floating-point and AVX options; FP +# is deemed unnecessary, because pre-SSE2 processor are too +# old to care about, while it's not the fastest option on +# SSE2-capable ones; AVX is omitted, because it doesn't give +# a lot of improvement, 5-10% depending on processor; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); + +$sse2=$avx=0; +for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } + +if ($sse2) { + &static_label("const_sse2"); + &static_label("enter_blocks"); + &static_label("enter_emit"); + &external_label("OPENSSL_ia32cap_P"); + + if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); + } + + if (!$avx && $ARGV[0] eq "win32n" && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); + } + + if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); + } +} + +######################################################################## +# Layout of opaque area is following. +# +# unsigned __int32 h[5]; # current hash value base 2^32 +# unsigned __int32 pad; # is_base2_26 in vector context +# unsigned __int32 r[4]; # key value base 2^32 + +&align(64); +&function_begin("poly1305_init"); + &mov ("edi",&wparam(0)); # context + &mov ("esi",&wparam(1)); # key + &mov ("ebp",&wparam(2)); # function table + + &xor ("eax","eax"); + &mov (&DWP(4*0,"edi"),"eax"); # zero hash value + &mov (&DWP(4*1,"edi"),"eax"); + &mov (&DWP(4*2,"edi"),"eax"); + &mov (&DWP(4*3,"edi"),"eax"); + &mov (&DWP(4*4,"edi"),"eax"); + &mov (&DWP(4*5,"edi"),"eax"); # is_base2_26 + + &cmp ("esi",0); + &je (&label("nokey")); + + if ($sse2) { + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("ebx"); + + &lea ("eax",&DWP("poly1305_blocks-".&label("pic_point"),"ebx")); + &lea ("edx",&DWP("poly1305_emit-".&label("pic_point"),"ebx")); + + &picmeup("edi","OPENSSL_ia32cap_P","ebx",&label("pic_point")); + &mov ("ecx",&DWP(0,"edi")); + &and ("ecx",1<<26|1<<24); + &cmp ("ecx",1<<26|1<<24); # SSE2 and XMM? + &jne (&label("no_sse2")); + + &lea ("eax",&DWP("_poly1305_blocks_sse2-".&label("pic_point"),"ebx")); + &lea ("edx",&DWP("_poly1305_emit_sse2-".&label("pic_point"),"ebx")); + + if ($avx>1) { + &mov ("ecx",&DWP(8,"edi")); + &test ("ecx",1<<5); # AVX2? + &jz (&label("no_sse2")); + + &lea ("eax",&DWP("_poly1305_blocks_avx2-".&label("pic_point"),"ebx")); + } + &set_label("no_sse2"); + &mov ("edi",&wparam(0)); # reload context + &mov (&DWP(0,"ebp"),"eax"); # fill function table + &mov (&DWP(4,"ebp"),"edx"); + } + + &mov ("eax",&DWP(4*0,"esi")); # load input key + &mov ("ebx",&DWP(4*1,"esi")); + &mov ("ecx",&DWP(4*2,"esi")); + &mov ("edx",&DWP(4*3,"esi")); + &and ("eax",0x0fffffff); + &and ("ebx",0x0ffffffc); + &and ("ecx",0x0ffffffc); + &and ("edx",0x0ffffffc); + &mov (&DWP(4*6,"edi"),"eax"); + &mov (&DWP(4*7,"edi"),"ebx"); + &mov (&DWP(4*8,"edi"),"ecx"); + &mov (&DWP(4*9,"edi"),"edx"); + + &mov ("eax",$sse2); +&set_label("nokey"); +&function_end("poly1305_init"); + +($h0,$h1,$h2,$h3,$h4, + $d0,$d1,$d2,$d3, + $r0,$r1,$r2,$r3, + $s1,$s2,$s3)=map(4*$_,(0..15)); + +&function_begin("poly1305_blocks"); + &mov ("edi",&wparam(0)); # ctx + &mov ("esi",&wparam(1)); # inp + &mov ("ecx",&wparam(2)); # len +&set_label("enter_blocks"); + &and ("ecx",-15); + &jz (&label("nodata")); + + &stack_push(16); + &mov ("eax",&DWP(4*6,"edi")); # r0 + &mov ("ebx",&DWP(4*7,"edi")); # r1 + &lea ("ebp",&DWP(0,"esi","ecx")); # end of input + &mov ("ecx",&DWP(4*8,"edi")); # r2 + &mov ("edx",&DWP(4*9,"edi")); # r3 + + &mov (&wparam(2),"ebp"); + &mov ("ebp","esi"); + + &mov (&DWP($r0,"esp"),"eax"); # r0 + &mov ("eax","ebx"); + &shr ("eax",2); + &mov (&DWP($r1,"esp"),"ebx"); # r1 + &add ("eax","ebx"); # s1 + &mov ("ebx","ecx"); + &shr ("ebx",2); + &mov (&DWP($r2,"esp"),"ecx"); # r2 + &add ("ebx","ecx"); # s2 + &mov ("ecx","edx"); + &shr ("ecx",2); + &mov (&DWP($r3,"esp"),"edx"); # r3 + &add ("ecx","edx"); # s3 + &mov (&DWP($s1,"esp"),"eax"); # s1 + &mov (&DWP($s2,"esp"),"ebx"); # s2 + &mov (&DWP($s3,"esp"),"ecx"); # s3 + + &mov ("eax",&DWP(4*0,"edi")); # load hash value + &mov ("ebx",&DWP(4*1,"edi")); + &mov ("ecx",&DWP(4*2,"edi")); + &mov ("esi",&DWP(4*3,"edi")); + &mov ("edi",&DWP(4*4,"edi")); + &jmp (&label("loop")); + +&set_label("loop",32); + &add ("eax",&DWP(4*0,"ebp")); # accumulate input + &adc ("ebx",&DWP(4*1,"ebp")); + &adc ("ecx",&DWP(4*2,"ebp")); + &adc ("esi",&DWP(4*3,"ebp")); + &lea ("ebp",&DWP(4*4,"ebp")); + &adc ("edi",&wparam(3)); # padbit + + &mov (&DWP($h0,"esp"),"eax"); # put aside hash[+inp] + &mov (&DWP($h3,"esp"),"esi"); + + &mul (&DWP($r0,"esp")); # h0*r0 + &mov (&DWP($h4,"esp"),"edi"); + &mov ("edi","eax"); + &mov ("eax","ebx"); # h1 + &mov ("esi","edx"); + &mul (&DWP($s3,"esp")); # h1*s3 + &add ("edi","eax"); + &mov ("eax","ecx"); # h2 + &adc ("esi","edx"); + &mul (&DWP($s2,"esp")); # h2*s2 + &add ("edi","eax"); + &mov ("eax",&DWP($h3,"esp")); + &adc ("esi","edx"); + &mul (&DWP($s1,"esp")); # h3*s1 + &add ("edi","eax"); + &mov ("eax",&DWP($h0,"esp")); + &adc ("esi","edx"); + + &mul (&DWP($r1,"esp")); # h0*r1 + &mov (&DWP($d0,"esp"),"edi"); + &xor ("edi","edi"); + &add ("esi","eax"); + &mov ("eax","ebx"); # h1 + &adc ("edi","edx"); + &mul (&DWP($r0,"esp")); # h1*r0 + &add ("esi","eax"); + &mov ("eax","ecx"); # h2 + &adc ("edi","edx"); + &mul (&DWP($s3,"esp")); # h2*s3 + &add ("esi","eax"); + &mov ("eax",&DWP($h3,"esp")); + &adc ("edi","edx"); + &mul (&DWP($s2,"esp")); # h3*s2 + &add ("esi","eax"); + &mov ("eax",&DWP($h4,"esp")); + &adc ("edi","edx"); + &imul ("eax",&DWP($s1,"esp")); # h4*s1 + &add ("esi","eax"); + &mov ("eax",&DWP($h0,"esp")); + &adc ("edi",0); + + &mul (&DWP($r2,"esp")); # h0*r2 + &mov (&DWP($d1,"esp"),"esi"); + &xor ("esi","esi"); + &add ("edi","eax"); + &mov ("eax","ebx"); # h1 + &adc ("esi","edx"); + &mul (&DWP($r1,"esp")); # h1*r1 + &add ("edi","eax"); + &mov ("eax","ecx"); # h2 + &adc ("esi","edx"); + &mul (&DWP($r0,"esp")); # h2*r0 + &add ("edi","eax"); + &mov ("eax",&DWP($h3,"esp")); + &adc ("esi","edx"); + &mul (&DWP($s3,"esp")); # h3*s3 + &add ("edi","eax"); + &mov ("eax",&DWP($h4,"esp")); + &adc ("esi","edx"); + &imul ("eax",&DWP($s2,"esp")); # h4*s2 + &add ("edi","eax"); + &mov ("eax",&DWP($h0,"esp")); + &adc ("esi",0); + + &mul (&DWP($r3,"esp")); # h0*r3 + &mov (&DWP($d2,"esp"),"edi"); + &xor ("edi","edi"); + &add ("esi","eax"); + &mov ("eax","ebx"); # h1 + &adc ("edi","edx"); + &mul (&DWP($r2,"esp")); # h1*r2 + &add ("esi","eax"); + &mov ("eax","ecx"); # h2 + &adc ("edi","edx"); + &mul (&DWP($r1,"esp")); # h2*r1 + &add ("esi","eax"); + &mov ("eax",&DWP($h3,"esp")); + &adc ("edi","edx"); + &mul (&DWP($r0,"esp")); # h3*r0 + &add ("esi","eax"); + &mov ("ecx",&DWP($h4,"esp")); + &adc ("edi","edx"); + + &mov ("edx","ecx"); + &imul ("ecx",&DWP($s3,"esp")); # h4*s3 + &add ("esi","ecx"); + &mov ("eax",&DWP($d0,"esp")); + &adc ("edi",0); + + &imul ("edx",&DWP($r0,"esp")); # h4*r0 + &add ("edx","edi"); + + &mov ("ebx",&DWP($d1,"esp")); + &mov ("ecx",&DWP($d2,"esp")); + + &mov ("edi","edx"); # last reduction step + &shr ("edx",2); + &and ("edi",3); + &lea ("edx",&DWP(0,"edx","edx",4)); # *5 + &add ("eax","edx"); + &adc ("ebx",0); + &adc ("ecx",0); + &adc ("esi",0); + &adc ("edi",0); + + &cmp ("ebp",&wparam(2)); # done yet? + &jne (&label("loop")); + + &mov ("edx",&wparam(0)); # ctx + &stack_pop(16); + &mov (&DWP(4*0,"edx"),"eax"); # store hash value + &mov (&DWP(4*1,"edx"),"ebx"); + &mov (&DWP(4*2,"edx"),"ecx"); + &mov (&DWP(4*3,"edx"),"esi"); + &mov (&DWP(4*4,"edx"),"edi"); +&set_label("nodata"); +&function_end("poly1305_blocks"); + +&function_begin("poly1305_emit"); + &mov ("ebp",&wparam(0)); # context +&set_label("enter_emit"); + &mov ("edi",&wparam(1)); # output + &mov ("eax",&DWP(4*0,"ebp")); # load hash value + &mov ("ebx",&DWP(4*1,"ebp")); + &mov ("ecx",&DWP(4*2,"ebp")); + &mov ("edx",&DWP(4*3,"ebp")); + &mov ("esi",&DWP(4*4,"ebp")); + + &add ("eax",5); # compare to modulus + &adc ("ebx",0); + &adc ("ecx",0); + &adc ("edx",0); + &adc ("esi",0); + &shr ("esi",2); # did it carry/borrow? + &neg ("esi"); # do we choose hash-modulus? + + &and ("eax","esi"); + &and ("ebx","esi"); + &and ("ecx","esi"); + &and ("edx","esi"); + &mov (&DWP(4*0,"edi"),"eax"); + &mov (&DWP(4*1,"edi"),"ebx"); + &mov (&DWP(4*2,"edi"),"ecx"); + &mov (&DWP(4*3,"edi"),"edx"); + + ¬ ("esi"); # or original hash value? + &mov ("eax",&DWP(4*0,"ebp")); + &mov ("ebx",&DWP(4*1,"ebp")); + &mov ("ecx",&DWP(4*2,"ebp")); + &mov ("edx",&DWP(4*3,"ebp")); + &mov ("ebp",&wparam(2)); + &and ("eax","esi"); + &and ("ebx","esi"); + &and ("ecx","esi"); + &and ("edx","esi"); + &or ("eax",&DWP(4*0,"edi")); + &or ("ebx",&DWP(4*1,"edi")); + &or ("ecx",&DWP(4*2,"edi")); + &or ("edx",&DWP(4*3,"edi")); + + &add ("eax",&DWP(4*0,"ebp")); # accumulate key + &adc ("ebx",&DWP(4*1,"ebp")); + &adc ("ecx",&DWP(4*2,"ebp")); + &adc ("edx",&DWP(4*3,"ebp")); + + &mov (&DWP(4*0,"edi"),"eax"); + &mov (&DWP(4*1,"edi"),"ebx"); + &mov (&DWP(4*2,"edi"),"ecx"); + &mov (&DWP(4*3,"edi"),"edx"); +&function_end("poly1305_emit"); + +if ($sse2) { +######################################################################## +# Layout of opaque area is following. +# +# unsigned __int32 h[5]; # current hash value base 2^26 +# unsigned __int32 is_base2_26; +# unsigned __int32 r[4]; # key value base 2^32 +# unsigned __int32 pad[2]; +# struct { unsigned __int32 r^4, r^3, r^2, r^1; } r[9]; +# +# where r^n are base 2^26 digits of degrees of multiplier key. There are +# 5 digits, but last four are interleaved with multiples of 5, totalling +# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. + +my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("xmm$_",(0..7)); +my $MASK=$T2; # borrow and keep in mind + +&align (32); +&function_begin_B("_poly1305_init_sse2"); + &movdqu ($D4,&QWP(4*6,"edi")); # key base 2^32 + &lea ("edi",&DWP(16*3,"edi")); # size optimization + &mov ("ebp","esp"); + &sub ("esp",16*(9+5)); + &and ("esp",-16); + + #&pand ($D4,&QWP(96,"ebx")); # magic mask + &movq ($MASK,&QWP(64,"ebx")); + + &movdqa ($D0,$D4); + &movdqa ($D1,$D4); + &movdqa ($D2,$D4); + + &pand ($D0,$MASK); # -> base 2^26 + &psrlq ($D1,26); + &psrldq ($D2,6); + &pand ($D1,$MASK); + &movdqa ($D3,$D2); + &psrlq ($D2,4) + &psrlq ($D3,30); + &pand ($D2,$MASK); + &pand ($D3,$MASK); + &psrldq ($D4,13); + + &lea ("edx",&DWP(16*9,"esp")); # size optimization + &mov ("ecx",2); +&set_label("square"); + &movdqa (&QWP(16*0,"esp"),$D0); + &movdqa (&QWP(16*1,"esp"),$D1); + &movdqa (&QWP(16*2,"esp"),$D2); + &movdqa (&QWP(16*3,"esp"),$D3); + &movdqa (&QWP(16*4,"esp"),$D4); + + &movdqa ($T1,$D1); + &movdqa ($T0,$D2); + &pslld ($T1,2); + &pslld ($T0,2); + &paddd ($T1,$D1); # *5 + &paddd ($T0,$D2); # *5 + &movdqa (&QWP(16*5,"esp"),$T1); + &movdqa (&QWP(16*6,"esp"),$T0); + &movdqa ($T1,$D3); + &movdqa ($T0,$D4); + &pslld ($T1,2); + &pslld ($T0,2); + &paddd ($T1,$D3); # *5 + &paddd ($T0,$D4); # *5 + &movdqa (&QWP(16*7,"esp"),$T1); + &movdqa (&QWP(16*8,"esp"),$T0); + + &pshufd ($T1,$D0,0b01000100); + &movdqa ($T0,$D1); + &pshufd ($D1,$D1,0b01000100); + &pshufd ($D2,$D2,0b01000100); + &pshufd ($D3,$D3,0b01000100); + &pshufd ($D4,$D4,0b01000100); + &movdqa (&QWP(16*0,"edx"),$T1); + &movdqa (&QWP(16*1,"edx"),$D1); + &movdqa (&QWP(16*2,"edx"),$D2); + &movdqa (&QWP(16*3,"edx"),$D3); + &movdqa (&QWP(16*4,"edx"),$D4); + + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + &pmuludq ($D4,$D0); # h4*r0 + &pmuludq ($D3,$D0); # h3*r0 + &pmuludq ($D2,$D0); # h2*r0 + &pmuludq ($D1,$D0); # h1*r0 + &pmuludq ($D0,$T1); # h0*r0 + +sub pmuladd { +my $load = shift; +my $base = shift; $base = "esp" if (!defined($base)); + + ################################################################ + # As for choice to "rotate" $T0-$T2 in order to move paddq + # past next multiplication. While it makes code harder to read + # and doesn't have significant effect on most processors, it + # makes a lot of difference on Atom, up to 30% improvement. + + &movdqa ($T1,$T0); + &pmuludq ($T0,&QWP(16*3,$base)); # r1*h3 + &movdqa ($T2,$T1); + &pmuludq ($T1,&QWP(16*2,$base)); # r1*h2 + &paddq ($D4,$T0); + &movdqa ($T0,$T2); + &pmuludq ($T2,&QWP(16*1,$base)); # r1*h1 + &paddq ($D3,$T1); + &$load ($T1,5); # s1 + &pmuludq ($T0,&QWP(16*0,$base)); # r1*h0 + &paddq ($D2,$T2); + &pmuludq ($T1,&QWP(16*4,$base)); # s1*h4 + &$load ($T2,2); # r2^n + &paddq ($D1,$T0); + + &movdqa ($T0,$T2); + &pmuludq ($T2,&QWP(16*2,$base)); # r2*h2 + &paddq ($D0,$T1); + &movdqa ($T1,$T0); + &pmuludq ($T0,&QWP(16*1,$base)); # r2*h1 + &paddq ($D4,$T2); + &$load ($T2,6); # s2^n + &pmuludq ($T1,&QWP(16*0,$base)); # r2*h0 + &paddq ($D3,$T0); + &movdqa ($T0,$T2); + &pmuludq ($T2,&QWP(16*4,$base)); # s2*h4 + &paddq ($D2,$T1); + &pmuludq ($T0,&QWP(16*3,$base)); # s2*h3 + &$load ($T1,3); # r3^n + &paddq ($D1,$T2); + + &movdqa ($T2,$T1); + &pmuludq ($T1,&QWP(16*1,$base)); # r3*h1 + &paddq ($D0,$T0); + &$load ($T0,7); # s3^n + &pmuludq ($T2,&QWP(16*0,$base)); # r3*h0 + &paddq ($D4,$T1); + &movdqa ($T1,$T0); + &pmuludq ($T0,&QWP(16*4,$base)); # s3*h4 + &paddq ($D3,$T2); + &movdqa ($T2,$T1); + &pmuludq ($T1,&QWP(16*3,$base)); # s3*h3 + &paddq ($D2,$T0); + &pmuludq ($T2,&QWP(16*2,$base)); # s3*h2 + &$load ($T0,4); # r4^n + &paddq ($D1,$T1); + + &$load ($T1,8); # s4^n + &pmuludq ($T0,&QWP(16*0,$base)); # r4*h0 + &paddq ($D0,$T2); + &movdqa ($T2,$T1); + &pmuludq ($T1,&QWP(16*4,$base)); # s4*h4 + &paddq ($D4,$T0); + &movdqa ($T0,$T2); + &pmuludq ($T2,&QWP(16*1,$base)); # s4*h1 + &paddq ($D3,$T1); + &movdqa ($T1,$T0); + &pmuludq ($T0,&QWP(16*2,$base)); # s4*h2 + &paddq ($D0,$T2); + &pmuludq ($T1,&QWP(16*3,$base)); # s4*h3 + &movdqa ($MASK,&QWP(64,"ebx")); + &paddq ($D1,$T0); + &paddq ($D2,$T1); +} + &pmuladd (sub { my ($reg,$i)=@_; + &movdqa ($reg,&QWP(16*$i,"esp")); + },"edx"); + +sub lazy_reduction { +my $extra = shift; + + ################################################################ + # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + # and P. Schwabe + # + # [(*) see discussion in poly1305-armv4 module] + + &movdqa ($T0,$D3); + &pand ($D3,$MASK); + &psrlq ($T0,26); + &$extra () if (defined($extra)); + &paddq ($T0,$D4); # h3 -> h4 + &movdqa ($T1,$D0); + &pand ($D0,$MASK); + &psrlq ($T1,26); + &movdqa ($D4,$T0); + &paddq ($T1,$D1); # h0 -> h1 + &psrlq ($T0,26); + &pand ($D4,$MASK); + &movdqa ($D1,$T1); + &psrlq ($T1,26); + &paddd ($D0,$T0); # favour paddd when + # possible, because + # paddq is "broken" + # on Atom + &psllq ($T0,2); + &paddq ($T1,$D2); # h1 -> h2 + &paddq ($T0,$D0); # h4 -> h0 (*) + &pand ($D1,$MASK); + &movdqa ($D2,$T1); + &psrlq ($T1,26); + &pand ($D2,$MASK); + &paddd ($T1,$D3); # h2 -> h3 + &movdqa ($D0,$T0); + &psrlq ($T0,26); + &movdqa ($D3,$T1); + &psrlq ($T1,26); + &pand ($D0,$MASK); + &paddd ($D1,$T0); # h0 -> h1 + &pand ($D3,$MASK); + &paddd ($D4,$T1); # h3 -> h4 +} + &lazy_reduction (); + + &dec ("ecx"); + &jz (&label("square_break")); + + &punpcklqdq ($D0,&QWP(16*0,"esp")); # 0:r^1:0:r^2 + &punpcklqdq ($D1,&QWP(16*1,"esp")); + &punpcklqdq ($D2,&QWP(16*2,"esp")); + &punpcklqdq ($D3,&QWP(16*3,"esp")); + &punpcklqdq ($D4,&QWP(16*4,"esp")); + &jmp (&label("square")); + +&set_label("square_break"); + &psllq ($D0,32); # -> r^3:0:r^4:0 + &psllq ($D1,32); + &psllq ($D2,32); + &psllq ($D3,32); + &psllq ($D4,32); + &por ($D0,&QWP(16*0,"esp")); # r^3:r^1:r^4:r^2 + &por ($D1,&QWP(16*1,"esp")); + &por ($D2,&QWP(16*2,"esp")); + &por ($D3,&QWP(16*3,"esp")); + &por ($D4,&QWP(16*4,"esp")); + + &pshufd ($D0,$D0,0b10001101); # -> r^1:r^2:r^3:r^4 + &pshufd ($D1,$D1,0b10001101); + &pshufd ($D2,$D2,0b10001101); + &pshufd ($D3,$D3,0b10001101); + &pshufd ($D4,$D4,0b10001101); + + &movdqu (&QWP(16*0,"edi"),$D0); # save the table + &movdqu (&QWP(16*1,"edi"),$D1); + &movdqu (&QWP(16*2,"edi"),$D2); + &movdqu (&QWP(16*3,"edi"),$D3); + &movdqu (&QWP(16*4,"edi"),$D4); + + &movdqa ($T1,$D1); + &movdqa ($T0,$D2); + &pslld ($T1,2); + &pslld ($T0,2); + &paddd ($T1,$D1); # *5 + &paddd ($T0,$D2); # *5 + &movdqu (&QWP(16*5,"edi"),$T1); + &movdqu (&QWP(16*6,"edi"),$T0); + &movdqa ($T1,$D3); + &movdqa ($T0,$D4); + &pslld ($T1,2); + &pslld ($T0,2); + &paddd ($T1,$D3); # *5 + &paddd ($T0,$D4); # *5 + &movdqu (&QWP(16*7,"edi"),$T1); + &movdqu (&QWP(16*8,"edi"),$T0); + + &mov ("esp","ebp"); + &lea ("edi",&DWP(-16*3,"edi")); # size de-optimization + &ret (); +&function_end_B("_poly1305_init_sse2"); + +&align (32); +&function_begin("_poly1305_blocks_sse2"); + &mov ("edi",&wparam(0)); # ctx + &mov ("esi",&wparam(1)); # inp + &mov ("ecx",&wparam(2)); # len + + &mov ("eax",&DWP(4*5,"edi")); # is_base2_26 + &and ("ecx",-16); + &jz (&label("nodata")); + &cmp ("ecx",64); + &jae (&label("enter_sse2")); + &test ("eax","eax"); # is_base2_26? + &jz (&label("enter_blocks")); + +&set_label("enter_sse2",16); + &call (&label("pic_point")); +&set_label("pic_point"); + &blindpop("ebx"); + &lea ("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx")); + + &test ("eax","eax"); # is_base2_26? + &jnz (&label("base2_26")); + + &call ("_poly1305_init_sse2"); + + ################################################# base 2^32 -> base 2^26 + &mov ("eax",&DWP(0,"edi")); + &mov ("ecx",&DWP(3,"edi")); + &mov ("edx",&DWP(6,"edi")); + &mov ("esi",&DWP(9,"edi")); + &mov ("ebp",&DWP(13,"edi")); + &mov (&DWP(4*5,"edi"),1); # is_base2_26 + + &shr ("ecx",2); + &and ("eax",0x3ffffff); + &shr ("edx",4); + &and ("ecx",0x3ffffff); + &shr ("esi",6); + &and ("edx",0x3ffffff); + + &movd ($D0,"eax"); + &movd ($D1,"ecx"); + &movd ($D2,"edx"); + &movd ($D3,"esi"); + &movd ($D4,"ebp"); + + &mov ("esi",&wparam(1)); # [reload] inp + &mov ("ecx",&wparam(2)); # [reload] len + &jmp (&label("base2_32")); + +&set_label("base2_26",16); + &movd ($D0,&DWP(4*0,"edi")); # load hash value + &movd ($D1,&DWP(4*1,"edi")); + &movd ($D2,&DWP(4*2,"edi")); + &movd ($D3,&DWP(4*3,"edi")); + &movd ($D4,&DWP(4*4,"edi")); + &movdqa ($MASK,&QWP(64,"ebx")); + +&set_label("base2_32"); + &mov ("eax",&wparam(3)); # padbit + &mov ("ebp","esp"); + + &sub ("esp",16*(5+5+5+9+9)); + &and ("esp",-16); + + &lea ("edi",&DWP(16*3,"edi")); # size optimization + &shl ("eax",24); # padbit + + &test ("ecx",31); + &jz (&label("even")); + + ################################################################ + # process single block, with SSE2, because it's still faster + # even though half of result is discarded + + &movdqu ($T1,&QWP(0,"esi")); # input + &lea ("esi",&DWP(16,"esi")); + + &movdqa ($T0,$T1); # -> base 2^26 ... + &pand ($T1,$MASK); + &paddd ($D0,$T1); # ... and accumulate + + &movdqa ($T1,$T0); + &psrlq ($T0,26); + &psrldq ($T1,6); + &pand ($T0,$MASK); + &paddd ($D1,$T0); + + &movdqa ($T0,$T1); + &psrlq ($T1,4); + &pand ($T1,$MASK); + &paddd ($D2,$T1); + + &movdqa ($T1,$T0); + &psrlq ($T0,30); + &pand ($T0,$MASK); + &psrldq ($T1,7); + &paddd ($D3,$T0); + + &movd ($T0,"eax"); # padbit + &paddd ($D4,$T1); + &movd ($T1,&DWP(16*0+12,"edi")); # r0 + &paddd ($D4,$T0); + + &movdqa (&QWP(16*0,"esp"),$D0); + &movdqa (&QWP(16*1,"esp"),$D1); + &movdqa (&QWP(16*2,"esp"),$D2); + &movdqa (&QWP(16*3,"esp"),$D3); + &movdqa (&QWP(16*4,"esp"),$D4); + + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + &pmuludq ($D0,$T1); # h4*r0 + &pmuludq ($D1,$T1); # h3*r0 + &pmuludq ($D2,$T1); # h2*r0 + &movd ($T0,&DWP(16*1+12,"edi")); # r1 + &pmuludq ($D3,$T1); # h1*r0 + &pmuludq ($D4,$T1); # h0*r0 + + &pmuladd (sub { my ($reg,$i)=@_; + &movd ($reg,&DWP(16*$i+12,"edi")); + }); + + &lazy_reduction (); + + &sub ("ecx",16); + &jz (&label("done")); + +&set_label("even"); + &lea ("edx",&DWP(16*(5+5+5+9),"esp"));# size optimization + &lea ("eax",&DWP(-16*2,"esi")); + &sub ("ecx",64); + + ################################################################ + # expand and copy pre-calculated table to stack + + &movdqu ($T0,&QWP(16*0,"edi")); # r^1:r^2:r^3:r^4 + &pshufd ($T1,$T0,0b01000100); # duplicate r^3:r^4 + &cmovb ("esi","eax"); + &pshufd ($T0,$T0,0b11101110); # duplicate r^1:r^2 + &movdqa (&QWP(16*0,"edx"),$T1); + &lea ("eax",&DWP(16*10,"esp")); + &movdqu ($T1,&QWP(16*1,"edi")); + &movdqa (&QWP(16*(0-9),"edx"),$T0); + &pshufd ($T0,$T1,0b01000100); + &pshufd ($T1,$T1,0b11101110); + &movdqa (&QWP(16*1,"edx"),$T0); + &movdqu ($T0,&QWP(16*2,"edi")); + &movdqa (&QWP(16*(1-9),"edx"),$T1); + &pshufd ($T1,$T0,0b01000100); + &pshufd ($T0,$T0,0b11101110); + &movdqa (&QWP(16*2,"edx"),$T1); + &movdqu ($T1,&QWP(16*3,"edi")); + &movdqa (&QWP(16*(2-9),"edx"),$T0); + &pshufd ($T0,$T1,0b01000100); + &pshufd ($T1,$T1,0b11101110); + &movdqa (&QWP(16*3,"edx"),$T0); + &movdqu ($T0,&QWP(16*4,"edi")); + &movdqa (&QWP(16*(3-9),"edx"),$T1); + &pshufd ($T1,$T0,0b01000100); + &pshufd ($T0,$T0,0b11101110); + &movdqa (&QWP(16*4,"edx"),$T1); + &movdqu ($T1,&QWP(16*5,"edi")); + &movdqa (&QWP(16*(4-9),"edx"),$T0); + &pshufd ($T0,$T1,0b01000100); + &pshufd ($T1,$T1,0b11101110); + &movdqa (&QWP(16*5,"edx"),$T0); + &movdqu ($T0,&QWP(16*6,"edi")); + &movdqa (&QWP(16*(5-9),"edx"),$T1); + &pshufd ($T1,$T0,0b01000100); + &pshufd ($T0,$T0,0b11101110); + &movdqa (&QWP(16*6,"edx"),$T1); + &movdqu ($T1,&QWP(16*7,"edi")); + &movdqa (&QWP(16*(6-9),"edx"),$T0); + &pshufd ($T0,$T1,0b01000100); + &pshufd ($T1,$T1,0b11101110); + &movdqa (&QWP(16*7,"edx"),$T0); + &movdqu ($T0,&QWP(16*8,"edi")); + &movdqa (&QWP(16*(7-9),"edx"),$T1); + &pshufd ($T1,$T0,0b01000100); + &pshufd ($T0,$T0,0b11101110); + &movdqa (&QWP(16*8,"edx"),$T1); + &movdqa (&QWP(16*(8-9),"edx"),$T0); + +sub load_input { +my ($inpbase,$offbase)=@_; + + &movdqu ($T0,&QWP($inpbase+0,"esi")); # load input + &movdqu ($T1,&QWP($inpbase+16,"esi")); + &lea ("esi",&DWP(16*2,"esi")); + + &movdqa (&QWP($offbase+16*2,"esp"),$D2); + &movdqa (&QWP($offbase+16*3,"esp"),$D3); + &movdqa (&QWP($offbase+16*4,"esp"),$D4); + + &movdqa ($D2,$T0); # splat input + &movdqa ($D3,$T1); + &psrldq ($D2,6); + &psrldq ($D3,6); + &movdqa ($D4,$T0); + &punpcklqdq ($D2,$D3); # 2:3 + &punpckhqdq ($D4,$T1); # 4 + &punpcklqdq ($T0,$T1); # 0:1 + + &movdqa ($D3,$D2); + &psrlq ($D2,4); + &psrlq ($D3,30); + &movdqa ($T1,$T0); + &psrlq ($D4,40); # 4 + &psrlq ($T1,26); + &pand ($T0,$MASK); # 0 + &pand ($T1,$MASK); # 1 + &pand ($D2,$MASK); # 2 + &pand ($D3,$MASK); # 3 + &por ($D4,&QWP(0,"ebx")); # padbit, yes, always + + &movdqa (&QWP($offbase+16*0,"esp"),$D0) if ($offbase); + &movdqa (&QWP($offbase+16*1,"esp"),$D1) if ($offbase); +} + &load_input (16*2,16*5); + + &jbe (&label("skip_loop")); + &jmp (&label("loop")); + +&set_label("loop",32); + ################################################################ + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + # \___________________/ + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + # \___________________/ \____________________/ + ################################################################ + + &movdqa ($T2,&QWP(16*(0-9),"edx")); # r0^2 + &movdqa (&QWP(16*1,"eax"),$T1); + &movdqa (&QWP(16*2,"eax"),$D2); + &movdqa (&QWP(16*3,"eax"),$D3); + &movdqa (&QWP(16*4,"eax"),$D4); + + ################################################################ + # d4 = h4*r0 + h0*r4 + h1*r3 + h2*r2 + h3*r1 + # d3 = h3*r0 + h0*r3 + h1*r2 + h2*r1 + h4*5*r4 + # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 + # d1 = h1*r0 + h0*r1 + h2*5*r4 + h3*5*r3 + h4*5*r2 + # d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 + + &movdqa ($D1,$T0); + &pmuludq ($T0,$T2); # h0*r0 + &movdqa ($D0,$T1); + &pmuludq ($T1,$T2); # h1*r0 + &pmuludq ($D2,$T2); # h2*r0 + &pmuludq ($D3,$T2); # h3*r0 + &pmuludq ($D4,$T2); # h4*r0 + +sub pmuladd_alt { +my $addr = shift; + + &pmuludq ($D0,&$addr(8)); # h1*s4 + &movdqa ($T2,$D1); + &pmuludq ($D1,&$addr(1)); # h0*r1 + &paddq ($D0,$T0); + &movdqa ($T0,$T2); + &pmuludq ($T2,&$addr(2)); # h0*r2 + &paddq ($D1,$T1); + &movdqa ($T1,$T0); + &pmuludq ($T0,&$addr(3)); # h0*r3 + &paddq ($D2,$T2); + &movdqa ($T2,&QWP(16*1,"eax")); # pull h1 + &pmuludq ($T1,&$addr(4)); # h0*r4 + &paddq ($D3,$T0); + + &movdqa ($T0,$T2); + &pmuludq ($T2,&$addr(1)); # h1*r1 + &paddq ($D4,$T1); + &movdqa ($T1,$T0); + &pmuludq ($T0,&$addr(2)); # h1*r2 + &paddq ($D2,$T2); + &movdqa ($T2,&QWP(16*2,"eax")); # pull h2 + &pmuludq ($T1,&$addr(3)); # h1*r3 + &paddq ($D3,$T0); + &movdqa ($T0,$T2); + &pmuludq ($T2,&$addr(7)); # h2*s3 + &paddq ($D4,$T1); + &movdqa ($T1,$T0); + &pmuludq ($T0,&$addr(8)); # h2*s4 + &paddq ($D0,$T2); + + &movdqa ($T2,$T1); + &pmuludq ($T1,&$addr(1)); # h2*r1 + &paddq ($D1,$T0); + &movdqa ($T0,&QWP(16*3,"eax")); # pull h3 + &pmuludq ($T2,&$addr(2)); # h2*r2 + &paddq ($D3,$T1); + &movdqa ($T1,$T0); + &pmuludq ($T0,&$addr(6)); # h3*s2 + &paddq ($D4,$T2); + &movdqa ($T2,$T1); + &pmuludq ($T1,&$addr(7)); # h3*s3 + &paddq ($D0,$T0); + &movdqa ($T0,$T2); + &pmuludq ($T2,&$addr(8)); # h3*s4 + &paddq ($D1,$T1); + + &movdqa ($T1,&QWP(16*4,"eax")); # pull h4 + &pmuludq ($T0,&$addr(1)); # h3*r1 + &paddq ($D2,$T2); + &movdqa ($T2,$T1); + &pmuludq ($T1,&$addr(8)); # h4*s4 + &paddq ($D4,$T0); + &movdqa ($T0,$T2); + &pmuludq ($T2,&$addr(5)); # h4*s1 + &paddq ($D3,$T1); + &movdqa ($T1,$T0); + &pmuludq ($T0,&$addr(6)); # h4*s2 + &paddq ($D0,$T2); + &movdqa ($MASK,&QWP(64,"ebx")); + &pmuludq ($T1,&$addr(7)); # h4*s3 + &paddq ($D1,$T0); + &paddq ($D2,$T1); +} + &pmuladd_alt (sub { my $i=shift; &QWP(16*($i-9),"edx"); }); + + &load_input (-16*2,0); + &lea ("eax",&DWP(-16*2,"esi")); + &sub ("ecx",64); + + &paddd ($T0,&QWP(16*(5+0),"esp")); # add hash value + &paddd ($T1,&QWP(16*(5+1),"esp")); + &paddd ($D2,&QWP(16*(5+2),"esp")); + &paddd ($D3,&QWP(16*(5+3),"esp")); + &paddd ($D4,&QWP(16*(5+4),"esp")); + + &cmovb ("esi","eax"); + &lea ("eax",&DWP(16*10,"esp")); + + &movdqa ($T2,&QWP(16*0,"edx")); # r0^4 + &movdqa (&QWP(16*1,"esp"),$D1); + &movdqa (&QWP(16*1,"eax"),$T1); + &movdqa (&QWP(16*2,"eax"),$D2); + &movdqa (&QWP(16*3,"eax"),$D3); + &movdqa (&QWP(16*4,"eax"),$D4); + + ################################################################ + # d4 += h4*r0 + h0*r4 + h1*r3 + h2*r2 + h3*r1 + # d3 += h3*r0 + h0*r3 + h1*r2 + h2*r1 + h4*5*r4 + # d2 += h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 + # d1 += h1*r0 + h0*r1 + h2*5*r4 + h3*5*r3 + h4*5*r2 + # d0 += h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 + + &movdqa ($D1,$T0); + &pmuludq ($T0,$T2); # h0*r0 + &paddq ($T0,$D0); + &movdqa ($D0,$T1); + &pmuludq ($T1,$T2); # h1*r0 + &pmuludq ($D2,$T2); # h2*r0 + &pmuludq ($D3,$T2); # h3*r0 + &pmuludq ($D4,$T2); # h4*r0 + + &paddq ($T1,&QWP(16*1,"esp")); + &paddq ($D2,&QWP(16*2,"esp")); + &paddq ($D3,&QWP(16*3,"esp")); + &paddq ($D4,&QWP(16*4,"esp")); + + &pmuladd_alt (sub { my $i=shift; &QWP(16*$i,"edx"); }); + + &lazy_reduction (); + + &load_input (16*2,16*5); + + &ja (&label("loop")); + +&set_label("skip_loop"); + ################################################################ + # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + &pshufd ($T2,&QWP(16*(0-9),"edx"),0x10);# r0^n + &add ("ecx",32); + &jnz (&label("long_tail")); + + &paddd ($T0,$D0); # add hash value + &paddd ($T1,$D1); + &paddd ($D2,&QWP(16*7,"esp")); + &paddd ($D3,&QWP(16*8,"esp")); + &paddd ($D4,&QWP(16*9,"esp")); + +&set_label("long_tail"); + + &movdqa (&QWP(16*0,"eax"),$T0); + &movdqa (&QWP(16*1,"eax"),$T1); + &movdqa (&QWP(16*2,"eax"),$D2); + &movdqa (&QWP(16*3,"eax"),$D3); + &movdqa (&QWP(16*4,"eax"),$D4); + + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + &pmuludq ($T0,$T2); # h0*r0 + &pmuludq ($T1,$T2); # h1*r0 + &pmuludq ($D2,$T2); # h2*r0 + &movdqa ($D0,$T0); + &pshufd ($T0,&QWP(16*(1-9),"edx"),0x10);# r1^n + &pmuludq ($D3,$T2); # h3*r0 + &movdqa ($D1,$T1); + &pmuludq ($D4,$T2); # h4*r0 + + &pmuladd (sub { my ($reg,$i)=@_; + &pshufd ($reg,&QWP(16*($i-9),"edx"),0x10); + },"eax"); + + &jz (&label("short_tail")); + + &load_input (-16*2,0); + + &pshufd ($T2,&QWP(16*0,"edx"),0x10); # r0^n + &paddd ($T0,&QWP(16*5,"esp")); # add hash value + &paddd ($T1,&QWP(16*6,"esp")); + &paddd ($D2,&QWP(16*7,"esp")); + &paddd ($D3,&QWP(16*8,"esp")); + &paddd ($D4,&QWP(16*9,"esp")); + + ################################################################ + # multiply inp[0:1] by r^4:r^3 and accumulate + + &movdqa (&QWP(16*0,"esp"),$T0); + &pmuludq ($T0,$T2); # h0*r0 + &movdqa (&QWP(16*1,"esp"),$T1); + &pmuludq ($T1,$T2); # h1*r0 + &paddq ($D0,$T0); + &movdqa ($T0,$D2); + &pmuludq ($D2,$T2); # h2*r0 + &paddq ($D1,$T1); + &movdqa ($T1,$D3); + &pmuludq ($D3,$T2); # h3*r0 + &paddq ($D2,&QWP(16*2,"esp")); + &movdqa (&QWP(16*2,"esp"),$T0); + &pshufd ($T0,&QWP(16*1,"edx"),0x10); # r1^n + &paddq ($D3,&QWP(16*3,"esp")); + &movdqa (&QWP(16*3,"esp"),$T1); + &movdqa ($T1,$D4); + &pmuludq ($D4,$T2); # h4*r0 + &paddq ($D4,&QWP(16*4,"esp")); + &movdqa (&QWP(16*4,"esp"),$T1); + + &pmuladd (sub { my ($reg,$i)=@_; + &pshufd ($reg,&QWP(16*$i,"edx"),0x10); + }); + +&set_label("short_tail"); + + ################################################################ + # horizontal addition + + &pshufd ($T1,$D4,0b01001110); + &pshufd ($T0,$D3,0b01001110); + &paddq ($D4,$T1); + &paddq ($D3,$T0); + &pshufd ($T1,$D0,0b01001110); + &pshufd ($T0,$D1,0b01001110); + &paddq ($D0,$T1); + &paddq ($D1,$T0); + &pshufd ($T1,$D2,0b01001110); + #&paddq ($D2,$T1); + + &lazy_reduction (sub { &paddq ($D2,$T1) }); + +&set_label("done"); + &movd (&DWP(-16*3+4*0,"edi"),$D0); # store hash value + &movd (&DWP(-16*3+4*1,"edi"),$D1); + &movd (&DWP(-16*3+4*2,"edi"),$D2); + &movd (&DWP(-16*3+4*3,"edi"),$D3); + &movd (&DWP(-16*3+4*4,"edi"),$D4); + &mov ("esp","ebp"); +&set_label("nodata"); +&function_end("_poly1305_blocks_sse2"); + +&align (32); +&function_begin("_poly1305_emit_sse2"); + &mov ("ebp",&wparam(0)); # context + + &cmp (&DWP(4*5,"ebp"),0); # is_base2_26? + &je (&label("enter_emit")); + + &mov ("eax",&DWP(4*0,"ebp")); # load hash value + &mov ("edi",&DWP(4*1,"ebp")); + &mov ("ecx",&DWP(4*2,"ebp")); + &mov ("edx",&DWP(4*3,"ebp")); + &mov ("esi",&DWP(4*4,"ebp")); + + &mov ("ebx","edi"); # base 2^26 -> base 2^32 + &shl ("edi",26); + &shr ("ebx",6); + &add ("eax","edi"); + &mov ("edi","ecx"); + &adc ("ebx",0); + + &shl ("edi",20); + &shr ("ecx",12); + &add ("ebx","edi"); + &mov ("edi","edx"); + &adc ("ecx",0); + + &shl ("edi",14); + &shr ("edx",18); + &add ("ecx","edi"); + &mov ("edi","esi"); + &adc ("edx",0); + + &shl ("edi",8); + &shr ("esi",24); + &add ("edx","edi"); + &adc ("esi",0); # can be partially reduced + + &mov ("edi","esi"); # final reduction + &and ("esi",3); + &shr ("edi",2); + &lea ("ebp",&DWP(0,"edi","edi",4)); # *5 + &mov ("edi",&wparam(1)); # output + &add ("eax","ebp"); + &mov ("ebp",&wparam(2)); # key + &adc ("ebx",0); + &adc ("ecx",0); + &adc ("edx",0); + &adc ("esi",0); + + &movd ($D0,"eax"); # offload original hash value + &add ("eax",5); # compare to modulus + &movd ($D1,"ebx"); + &adc ("ebx",0); + &movd ($D2,"ecx"); + &adc ("ecx",0); + &movd ($D3,"edx"); + &adc ("edx",0); + &adc ("esi",0); + &shr ("esi",2); # did it carry/borrow? + + &neg ("esi"); # do we choose (hash-modulus) ... + &and ("eax","esi"); + &and ("ebx","esi"); + &and ("ecx","esi"); + &and ("edx","esi"); + &mov (&DWP(4*0,"edi"),"eax"); + &movd ("eax",$D0); + &mov (&DWP(4*1,"edi"),"ebx"); + &movd ("ebx",$D1); + &mov (&DWP(4*2,"edi"),"ecx"); + &movd ("ecx",$D2); + &mov (&DWP(4*3,"edi"),"edx"); + &movd ("edx",$D3); + + ¬ ("esi"); # ... or original hash value? + &and ("eax","esi"); + &and ("ebx","esi"); + &or ("eax",&DWP(4*0,"edi")); + &and ("ecx","esi"); + &or ("ebx",&DWP(4*1,"edi")); + &and ("edx","esi"); + &or ("ecx",&DWP(4*2,"edi")); + &or ("edx",&DWP(4*3,"edi")); + + &add ("eax",&DWP(4*0,"ebp")); # accumulate key + &adc ("ebx",&DWP(4*1,"ebp")); + &mov (&DWP(4*0,"edi"),"eax"); + &adc ("ecx",&DWP(4*2,"ebp")); + &mov (&DWP(4*1,"edi"),"ebx"); + &adc ("edx",&DWP(4*3,"ebp")); + &mov (&DWP(4*2,"edi"),"ecx"); + &mov (&DWP(4*3,"edi"),"edx"); +&function_end("_poly1305_emit_sse2"); + +if ($avx>1) { +######################################################################## +# Note that poly1305_init_avx2 operates on %xmm, I could have used +# poly1305_init_sse2... + +&align (32); +&function_begin_B("_poly1305_init_avx2"); + &vmovdqu ($D4,&QWP(4*6,"edi")); # key base 2^32 + &lea ("edi",&DWP(16*3,"edi")); # size optimization + &mov ("ebp","esp"); + &sub ("esp",16*(9+5)); + &and ("esp",-16); + + #&vpand ($D4,$D4,&QWP(96,"ebx")); # magic mask + &vmovdqa ($MASK,&QWP(64,"ebx")); + + &vpand ($D0,$D4,$MASK); # -> base 2^26 + &vpsrlq ($D1,$D4,26); + &vpsrldq ($D3,$D4,6); + &vpand ($D1,$D1,$MASK); + &vpsrlq ($D2,$D3,4) + &vpsrlq ($D3,$D3,30); + &vpand ($D2,$D2,$MASK); + &vpand ($D3,$D3,$MASK); + &vpsrldq ($D4,$D4,13); + + &lea ("edx",&DWP(16*9,"esp")); # size optimization + &mov ("ecx",2); +&set_label("square"); + &vmovdqa (&QWP(16*0,"esp"),$D0); + &vmovdqa (&QWP(16*1,"esp"),$D1); + &vmovdqa (&QWP(16*2,"esp"),$D2); + &vmovdqa (&QWP(16*3,"esp"),$D3); + &vmovdqa (&QWP(16*4,"esp"),$D4); + + &vpslld ($T1,$D1,2); + &vpslld ($T0,$D2,2); + &vpaddd ($T1,$T1,$D1); # *5 + &vpaddd ($T0,$T0,$D2); # *5 + &vmovdqa (&QWP(16*5,"esp"),$T1); + &vmovdqa (&QWP(16*6,"esp"),$T0); + &vpslld ($T1,$D3,2); + &vpslld ($T0,$D4,2); + &vpaddd ($T1,$T1,$D3); # *5 + &vpaddd ($T0,$T0,$D4); # *5 + &vmovdqa (&QWP(16*7,"esp"),$T1); + &vmovdqa (&QWP(16*8,"esp"),$T0); + + &vpshufd ($T0,$D0,0b01000100); + &vmovdqa ($T1,$D1); + &vpshufd ($D1,$D1,0b01000100); + &vpshufd ($D2,$D2,0b01000100); + &vpshufd ($D3,$D3,0b01000100); + &vpshufd ($D4,$D4,0b01000100); + &vmovdqa (&QWP(16*0,"edx"),$T0); + &vmovdqa (&QWP(16*1,"edx"),$D1); + &vmovdqa (&QWP(16*2,"edx"),$D2); + &vmovdqa (&QWP(16*3,"edx"),$D3); + &vmovdqa (&QWP(16*4,"edx"),$D4); + + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + &vpmuludq ($D4,$D4,$D0); # h4*r0 + &vpmuludq ($D3,$D3,$D0); # h3*r0 + &vpmuludq ($D2,$D2,$D0); # h2*r0 + &vpmuludq ($D1,$D1,$D0); # h1*r0 + &vpmuludq ($D0,$T0,$D0); # h0*r0 + + &vpmuludq ($T0,$T1,&QWP(16*3,"edx")); # r1*h3 + &vpaddq ($D4,$D4,$T0); + &vpmuludq ($T2,$T1,&QWP(16*2,"edx")); # r1*h2 + &vpaddq ($D3,$D3,$T2); + &vpmuludq ($T0,$T1,&QWP(16*1,"edx")); # r1*h1 + &vpaddq ($D2,$D2,$T0); + &vmovdqa ($T2,&QWP(16*5,"esp")); # s1 + &vpmuludq ($T1,$T1,&QWP(16*0,"edx")); # r1*h0 + &vpaddq ($D1,$D1,$T1); + &vmovdqa ($T0,&QWP(16*2,"esp")); # r2 + &vpmuludq ($T2,$T2,&QWP(16*4,"edx")); # s1*h4 + &vpaddq ($D0,$D0,$T2); + + &vpmuludq ($T1,$T0,&QWP(16*2,"edx")); # r2*h2 + &vpaddq ($D4,$D4,$T1); + &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # r2*h1 + &vpaddq ($D3,$D3,$T2); + &vmovdqa ($T1,&QWP(16*6,"esp")); # s2 + &vpmuludq ($T0,$T0,&QWP(16*0,"edx")); # r2*h0 + &vpaddq ($D2,$D2,$T0); + &vpmuludq ($T2,$T1,&QWP(16*4,"edx")); # s2*h4 + &vpaddq ($D1,$D1,$T2); + &vmovdqa ($T0,&QWP(16*3,"esp")); # r3 + &vpmuludq ($T1,$T1,&QWP(16*3,"edx")); # s2*h3 + &vpaddq ($D0,$D0,$T1); + + &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # r3*h1 + &vpaddq ($D4,$D4,$T2); + &vmovdqa ($T1,&QWP(16*7,"esp")); # s3 + &vpmuludq ($T0,$T0,&QWP(16*0,"edx")); # r3*h0 + &vpaddq ($D3,$D3,$T0); + &vpmuludq ($T2,$T1,&QWP(16*4,"edx")); # s3*h4 + &vpaddq ($D2,$D2,$T2); + &vpmuludq ($T0,$T1,&QWP(16*3,"edx")); # s3*h3 + &vpaddq ($D1,$D1,$T0); + &vmovdqa ($T2,&QWP(16*4,"esp")); # r4 + &vpmuludq ($T1,$T1,&QWP(16*2,"edx")); # s3*h2 + &vpaddq ($D0,$D0,$T1); + + &vmovdqa ($T0,&QWP(16*8,"esp")); # s4 + &vpmuludq ($T2,$T2,&QWP(16*0,"edx")); # r4*h0 + &vpaddq ($D4,$D4,$T2); + &vpmuludq ($T1,$T0,&QWP(16*4,"edx")); # s4*h4 + &vpaddq ($D3,$D3,$T1); + &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # s4*h1 + &vpaddq ($D0,$D0,$T2); + &vpmuludq ($T1,$T0,&QWP(16*2,"edx")); # s4*h2 + &vpaddq ($D1,$D1,$T1); + &vmovdqa ($MASK,&QWP(64,"ebx")); + &vpmuludq ($T0,$T0,&QWP(16*3,"edx")); # s4*h3 + &vpaddq ($D2,$D2,$T0); + + ################################################################ + # lazy reduction + &vpsrlq ($T0,$D3,26); + &vpand ($D3,$D3,$MASK); + &vpsrlq ($T1,$D0,26); + &vpand ($D0,$D0,$MASK); + &vpaddq ($D4,$D4,$T0); # h3 -> h4 + &vpaddq ($D1,$D1,$T1); # h0 -> h1 + &vpsrlq ($T0,$D4,26); + &vpand ($D4,$D4,$MASK); + &vpsrlq ($T1,$D1,26); + &vpand ($D1,$D1,$MASK); + &vpaddq ($D2,$D2,$T1); # h1 -> h2 + &vpaddd ($D0,$D0,$T0); + &vpsllq ($T0,$T0,2); + &vpsrlq ($T1,$D2,26); + &vpand ($D2,$D2,$MASK); + &vpaddd ($D0,$D0,$T0); # h4 -> h0 + &vpaddd ($D3,$D3,$T1); # h2 -> h3 + &vpsrlq ($T1,$D3,26); + &vpsrlq ($T0,$D0,26); + &vpand ($D0,$D0,$MASK); + &vpand ($D3,$D3,$MASK); + &vpaddd ($D1,$D1,$T0); # h0 -> h1 + &vpaddd ($D4,$D4,$T1); # h3 -> h4 + + &dec ("ecx"); + &jz (&label("square_break")); + + &vpunpcklqdq ($D0,$D0,&QWP(16*0,"esp")); # 0:r^1:0:r^2 + &vpunpcklqdq ($D1,$D1,&QWP(16*1,"esp")); + &vpunpcklqdq ($D2,$D2,&QWP(16*2,"esp")); + &vpunpcklqdq ($D3,$D3,&QWP(16*3,"esp")); + &vpunpcklqdq ($D4,$D4,&QWP(16*4,"esp")); + &jmp (&label("square")); + +&set_label("square_break"); + &vpsllq ($D0,$D0,32); # -> r^3:0:r^4:0 + &vpsllq ($D1,$D1,32); + &vpsllq ($D2,$D2,32); + &vpsllq ($D3,$D3,32); + &vpsllq ($D4,$D4,32); + &vpor ($D0,$D0,&QWP(16*0,"esp")); # r^3:r^1:r^4:r^2 + &vpor ($D1,$D1,&QWP(16*1,"esp")); + &vpor ($D2,$D2,&QWP(16*2,"esp")); + &vpor ($D3,$D3,&QWP(16*3,"esp")); + &vpor ($D4,$D4,&QWP(16*4,"esp")); + + &vpshufd ($D0,$D0,0b10001101); # -> r^1:r^2:r^3:r^4 + &vpshufd ($D1,$D1,0b10001101); + &vpshufd ($D2,$D2,0b10001101); + &vpshufd ($D3,$D3,0b10001101); + &vpshufd ($D4,$D4,0b10001101); + + &vmovdqu (&QWP(16*0,"edi"),$D0); # save the table + &vmovdqu (&QWP(16*1,"edi"),$D1); + &vmovdqu (&QWP(16*2,"edi"),$D2); + &vmovdqu (&QWP(16*3,"edi"),$D3); + &vmovdqu (&QWP(16*4,"edi"),$D4); + + &vpslld ($T1,$D1,2); + &vpslld ($T0,$D2,2); + &vpaddd ($T1,$T1,$D1); # *5 + &vpaddd ($T0,$T0,$D2); # *5 + &vmovdqu (&QWP(16*5,"edi"),$T1); + &vmovdqu (&QWP(16*6,"edi"),$T0); + &vpslld ($T1,$D3,2); + &vpslld ($T0,$D4,2); + &vpaddd ($T1,$T1,$D3); # *5 + &vpaddd ($T0,$T0,$D4); # *5 + &vmovdqu (&QWP(16*7,"edi"),$T1); + &vmovdqu (&QWP(16*8,"edi"),$T0); + + &mov ("esp","ebp"); + &lea ("edi",&DWP(-16*3,"edi")); # size de-optimization + &ret (); +&function_end_B("_poly1305_init_avx2"); + +######################################################################## +# now it's time to switch to %ymm + +my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("ymm$_",(0..7)); +my $MASK=$T2; + +sub X { my $reg=shift; $reg=~s/^ymm/xmm/; $reg; } + +&align (32); +&function_begin("_poly1305_blocks_avx2"); + &mov ("edi",&wparam(0)); # ctx + &mov ("esi",&wparam(1)); # inp + &mov ("ecx",&wparam(2)); # len + + &mov ("eax",&DWP(4*5,"edi")); # is_base2_26 + &and ("ecx",-16); + &jz (&label("nodata")); + &cmp ("ecx",64); + &jae (&label("enter_avx2")); + &test ("eax","eax"); # is_base2_26? + &jz (&label("enter_blocks")); + +&set_label("enter_avx2"); + &vzeroupper (); + + &call (&label("pic_point")); +&set_label("pic_point"); + &blindpop("ebx"); + &lea ("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx")); + + &test ("eax","eax"); # is_base2_26? + &jnz (&label("base2_26")); + + &call ("_poly1305_init_avx2"); + + ################################################# base 2^32 -> base 2^26 + &mov ("eax",&DWP(0,"edi")); + &mov ("ecx",&DWP(3,"edi")); + &mov ("edx",&DWP(6,"edi")); + &mov ("esi",&DWP(9,"edi")); + &mov ("ebp",&DWP(13,"edi")); + + &shr ("ecx",2); + &and ("eax",0x3ffffff); + &shr ("edx",4); + &and ("ecx",0x3ffffff); + &shr ("esi",6); + &and ("edx",0x3ffffff); + + &mov (&DWP(4*0,"edi"),"eax"); + &mov (&DWP(4*1,"edi"),"ecx"); + &mov (&DWP(4*2,"edi"),"edx"); + &mov (&DWP(4*3,"edi"),"esi"); + &mov (&DWP(4*4,"edi"),"ebp"); + &mov (&DWP(4*5,"edi"),1); # is_base2_26 + + &mov ("esi",&wparam(1)); # [reload] inp + &mov ("ecx",&wparam(2)); # [reload] len + +&set_label("base2_26"); + &mov ("eax",&wparam(3)); # padbit + &mov ("ebp","esp"); + + &sub ("esp",32*(5+9)); + &and ("esp",-512); # ensure that frame + # doesn't cross page + # boundary, which is + # essential for + # misaligned 32-byte + # loads + + ################################################################ + # expand and copy pre-calculated table to stack + + &vmovdqu (&X($D0),&QWP(16*(3+0),"edi")); + &lea ("edx",&DWP(32*5+128,"esp")); # +128 size optimization + &vmovdqu (&X($D1),&QWP(16*(3+1),"edi")); + &vmovdqu (&X($D2),&QWP(16*(3+2),"edi")); + &vmovdqu (&X($D3),&QWP(16*(3+3),"edi")); + &vmovdqu (&X($D4),&QWP(16*(3+4),"edi")); + &lea ("edi",&DWP(16*3,"edi")); # size optimization + &vpermq ($D0,$D0,0b01000000); # 00001234 -> 12343434 + &vpermq ($D1,$D1,0b01000000); + &vpermq ($D2,$D2,0b01000000); + &vpermq ($D3,$D3,0b01000000); + &vpermq ($D4,$D4,0b01000000); + &vpshufd ($D0,$D0,0b11001000); # 12343434 -> 14243444 + &vpshufd ($D1,$D1,0b11001000); + &vpshufd ($D2,$D2,0b11001000); + &vpshufd ($D3,$D3,0b11001000); + &vpshufd ($D4,$D4,0b11001000); + &vmovdqa (&QWP(32*0-128,"edx"),$D0); + &vmovdqu (&X($D0),&QWP(16*5,"edi")); + &vmovdqa (&QWP(32*1-128,"edx"),$D1); + &vmovdqu (&X($D1),&QWP(16*6,"edi")); + &vmovdqa (&QWP(32*2-128,"edx"),$D2); + &vmovdqu (&X($D2),&QWP(16*7,"edi")); + &vmovdqa (&QWP(32*3-128,"edx"),$D3); + &vmovdqu (&X($D3),&QWP(16*8,"edi")); + &vmovdqa (&QWP(32*4-128,"edx"),$D4); + &vpermq ($D0,$D0,0b01000000); + &vpermq ($D1,$D1,0b01000000); + &vpermq ($D2,$D2,0b01000000); + &vpermq ($D3,$D3,0b01000000); + &vpshufd ($D0,$D0,0b11001000); + &vpshufd ($D1,$D1,0b11001000); + &vpshufd ($D2,$D2,0b11001000); + &vpshufd ($D3,$D3,0b11001000); + &vmovdqa (&QWP(32*5-128,"edx"),$D0); + &vmovd (&X($D0),&DWP(-16*3+4*0,"edi"));# load hash value + &vmovdqa (&QWP(32*6-128,"edx"),$D1); + &vmovd (&X($D1),&DWP(-16*3+4*1,"edi")); + &vmovdqa (&QWP(32*7-128,"edx"),$D2); + &vmovd (&X($D2),&DWP(-16*3+4*2,"edi")); + &vmovdqa (&QWP(32*8-128,"edx"),$D3); + &vmovd (&X($D3),&DWP(-16*3+4*3,"edi")); + &vmovd (&X($D4),&DWP(-16*3+4*4,"edi")); + &vmovdqa ($MASK,&QWP(64,"ebx")); + &neg ("eax"); # padbit + + &test ("ecx",63); + &jz (&label("even")); + + &mov ("edx","ecx"); + &and ("ecx",-64); + &and ("edx",63); + + &vmovdqu (&X($T0),&QWP(16*0,"esi")); + &cmp ("edx",32); + &jb (&label("one")); + + &vmovdqu (&X($T1),&QWP(16*1,"esi")); + &je (&label("two")); + + &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1); + &lea ("esi",&DWP(16*3,"esi")); + &lea ("ebx",&DWP(8,"ebx")); # three padbits + &lea ("edx",&DWP(32*5+128+8,"esp")); # --:r^1:r^2:r^3 (*) + &jmp (&label("tail")); + +&set_label("two"); + &lea ("esi",&DWP(16*2,"esi")); + &lea ("ebx",&DWP(16,"ebx")); # two padbits + &lea ("edx",&DWP(32*5+128+16,"esp"));# --:--:r^1:r^2 (*) + &jmp (&label("tail")); + +&set_label("one"); + &lea ("esi",&DWP(16*1,"esi")); + &vpxor ($T1,$T1,$T1); + &lea ("ebx",&DWP(32,"ebx","eax",8)); # one or no padbits + &lea ("edx",&DWP(32*5+128+24,"esp"));# --:--:--:r^1 (*) + &jmp (&label("tail")); + +# (*) spots marked with '--' are data from next table entry, but they +# are multiplied by 0 and therefore rendered insignificant + +&set_label("even",32); + &vmovdqu (&X($T0),&QWP(16*0,"esi")); # load input + &vmovdqu (&X($T1),&QWP(16*1,"esi")); + &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1); + &vinserti128 ($T1,$T1,&QWP(16*3,"esi"),1); + &lea ("esi",&DWP(16*4,"esi")); + &sub ("ecx",64); + &jz (&label("tail")); + +&set_label("loop"); + ################################################################ + # ((inp[0]*r^4+r[4])*r^4+r[8])*r^4 + # ((inp[1]*r^4+r[5])*r^4+r[9])*r^3 + # ((inp[2]*r^4+r[6])*r^4+r[10])*r^2 + # ((inp[3]*r^4+r[7])*r^4+r[11])*r^1 + # \________/ \_______/ + ################################################################ + +sub vsplat_input { + &vmovdqa (&QWP(32*2,"esp"),$D2); + &vpsrldq ($D2,$T0,6); # splat input + &vmovdqa (&QWP(32*0,"esp"),$D0); + &vpsrldq ($D0,$T1,6); + &vmovdqa (&QWP(32*1,"esp"),$D1); + &vpunpckhqdq ($D1,$T0,$T1); # 4 + &vpunpcklqdq ($T0,$T0,$T1); # 0:1 + &vpunpcklqdq ($D2,$D2,$D0); # 2:3 + + &vpsrlq ($D0,$D2,30); + &vpsrlq ($D2,$D2,4); + &vpsrlq ($T1,$T0,26); + &vpsrlq ($D1,$D1,40); # 4 + &vpand ($D2,$D2,$MASK); # 2 + &vpand ($T0,$T0,$MASK); # 0 + &vpand ($T1,$T1,$MASK); # 1 + &vpand ($D0,$D0,$MASK); # 3 (*) + &vpor ($D1,$D1,&QWP(0,"ebx")); # padbit, yes, always + + # (*) note that output is counterintuitive, inp[3:4] is + # returned in $D1-2, while $D3-4 are preserved; +} + &vsplat_input (); + +sub vpmuladd { +my $addr = shift; + + &vpaddq ($D2,$D2,&QWP(32*2,"esp")); # add hash value + &vpaddq ($T0,$T0,&QWP(32*0,"esp")); + &vpaddq ($T1,$T1,&QWP(32*1,"esp")); + &vpaddq ($D0,$D0,$D3); + &vpaddq ($D1,$D1,$D4); + + ################################################################ + # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 + # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 + # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 + # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 + # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 + + &vpmuludq ($D3,$D2,&$addr(1)); # d3 = h2*r1 + &vmovdqa (QWP(32*1,"esp"),$T1); + &vpmuludq ($D4,$D2,&$addr(2)); # d4 = h2*r2 + &vmovdqa (QWP(32*3,"esp"),$D0); + &vpmuludq ($D0,$D2,&$addr(7)); # d0 = h2*s3 + &vmovdqa (QWP(32*4,"esp"),$D1); + &vpmuludq ($D1,$D2,&$addr(8)); # d1 = h2*s4 + &vpmuludq ($D2,$D2,&$addr(0)); # d2 = h2*r0 + + &vpmuludq ($T2,$T0,&$addr(3)); # h0*r3 + &vpaddq ($D3,$D3,$T2); # d3 += h0*r3 + &vpmuludq ($T1,$T0,&$addr(4)); # h0*r4 + &vpaddq ($D4,$D4,$T1); # d4 + h0*r4 + &vpmuludq ($T2,$T0,&$addr(0)); # h0*r0 + &vpaddq ($D0,$D0,$T2); # d0 + h0*r0 + &vmovdqa ($T2,&QWP(32*1,"esp")); # h1 + &vpmuludq ($T1,$T0,&$addr(1)); # h0*r1 + &vpaddq ($D1,$D1,$T1); # d1 += h0*r1 + &vpmuludq ($T0,$T0,&$addr(2)); # h0*r2 + &vpaddq ($D2,$D2,$T0); # d2 += h0*r2 + + &vpmuludq ($T1,$T2,&$addr(2)); # h1*r2 + &vpaddq ($D3,$D3,$T1); # d3 += h1*r2 + &vpmuludq ($T0,$T2,&$addr(3)); # h1*r3 + &vpaddq ($D4,$D4,$T0); # d4 += h1*r3 + &vpmuludq ($T1,$T2,&$addr(8)); # h1*s4 + &vpaddq ($D0,$D0,$T1); # d0 += h1*s4 + &vmovdqa ($T1,&QWP(32*3,"esp")); # h3 + &vpmuludq ($T0,$T2,&$addr(0)); # h1*r0 + &vpaddq ($D1,$D1,$T0); # d1 += h1*r0 + &vpmuludq ($T2,$T2,&$addr(1)); # h1*r1 + &vpaddq ($D2,$D2,$T2); # d2 += h1*r1 + + &vpmuludq ($T0,$T1,&$addr(0)); # h3*r0 + &vpaddq ($D3,$D3,$T0); # d3 += h3*r0 + &vpmuludq ($T2,$T1,&$addr(1)); # h3*r1 + &vpaddq ($D4,$D4,$T2); # d4 += h3*r1 + &vpmuludq ($T0,$T1,&$addr(6)); # h3*s2 + &vpaddq ($D0,$D0,$T0); # d0 += h3*s2 + &vmovdqa ($T0,&QWP(32*4,"esp")); # h4 + &vpmuludq ($T2,$T1,&$addr(7)); # h3*s3 + &vpaddq ($D1,$D1,$T2); # d1+= h3*s3 + &vpmuludq ($T1,$T1,&$addr(8)); # h3*s4 + &vpaddq ($D2,$D2,$T1); # d2 += h3*s4 + + &vpmuludq ($T2,$T0,&$addr(8)); # h4*s4 + &vpaddq ($D3,$D3,$T2); # d3 += h4*s4 + &vpmuludq ($T1,$T0,&$addr(5)); # h4*s1 + &vpaddq ($D0,$D0,$T1); # d0 += h4*s1 + &vpmuludq ($T2,$T0,&$addr(0)); # h4*r0 + &vpaddq ($D4,$D4,$T2); # d4 += h4*r0 + &vmovdqa ($MASK,&QWP(64,"ebx")); + &vpmuludq ($T1,$T0,&$addr(6)); # h4*s2 + &vpaddq ($D1,$D1,$T1); # d1 += h4*s2 + &vpmuludq ($T0,$T0,&$addr(7)); # h4*s3 + &vpaddq ($D2,$D2,$T0); # d2 += h4*s3 +} + &vpmuladd (sub { my $i=shift; &QWP(32*$i-128,"edx"); }); + +sub vlazy_reduction { + ################################################################ + # lazy reduction + + &vpsrlq ($T0,$D3,26); + &vpand ($D3,$D3,$MASK); + &vpsrlq ($T1,$D0,26); + &vpand ($D0,$D0,$MASK); + &vpaddq ($D4,$D4,$T0); # h3 -> h4 + &vpaddq ($D1,$D1,$T1); # h0 -> h1 + &vpsrlq ($T0,$D4,26); + &vpand ($D4,$D4,$MASK); + &vpsrlq ($T1,$D1,26); + &vpand ($D1,$D1,$MASK); + &vpaddq ($D2,$D2,$T1); # h1 -> h2 + &vpaddq ($D0,$D0,$T0); + &vpsllq ($T0,$T0,2); + &vpsrlq ($T1,$D2,26); + &vpand ($D2,$D2,$MASK); + &vpaddq ($D0,$D0,$T0); # h4 -> h0 + &vpaddq ($D3,$D3,$T1); # h2 -> h3 + &vpsrlq ($T1,$D3,26); + &vpsrlq ($T0,$D0,26); + &vpand ($D0,$D0,$MASK); + &vpand ($D3,$D3,$MASK); + &vpaddq ($D1,$D1,$T0); # h0 -> h1 + &vpaddq ($D4,$D4,$T1); # h3 -> h4 +} + &vlazy_reduction(); + + &vmovdqu (&X($T0),&QWP(16*0,"esi")); # load input + &vmovdqu (&X($T1),&QWP(16*1,"esi")); + &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1); + &vinserti128 ($T1,$T1,&QWP(16*3,"esi"),1); + &lea ("esi",&DWP(16*4,"esi")); + &sub ("ecx",64); + &jnz (&label("loop")); + +&set_label("tail"); + &vsplat_input (); + &and ("ebx",-64); # restore pointer + + &vpmuladd (sub { my $i=shift; &QWP(4+32*$i-128,"edx"); }); + + ################################################################ + # horizontal addition + + &vpsrldq ($T0,$D4,8); + &vpsrldq ($T1,$D3,8); + &vpaddq ($D4,$D4,$T0); + &vpsrldq ($T0,$D0,8); + &vpaddq ($D3,$D3,$T1); + &vpsrldq ($T1,$D1,8); + &vpaddq ($D0,$D0,$T0); + &vpsrldq ($T0,$D2,8); + &vpaddq ($D1,$D1,$T1); + &vpermq ($T1,$D4,2); # keep folding + &vpaddq ($D2,$D2,$T0); + &vpermq ($T0,$D3,2); + &vpaddq ($D4,$D4,$T1); + &vpermq ($T1,$D0,2); + &vpaddq ($D3,$D3,$T0); + &vpermq ($T0,$D1,2); + &vpaddq ($D0,$D0,$T1); + &vpermq ($T1,$D2,2); + &vpaddq ($D1,$D1,$T0); + &vpaddq ($D2,$D2,$T1); + + &vlazy_reduction(); + + &cmp ("ecx",0); + &je (&label("done")); + + ################################################################ + # clear all but single word + + &vpshufd (&X($D0),&X($D0),0b11111100); + &lea ("edx",&DWP(32*5+128,"esp")); # restore pointer + &vpshufd (&X($D1),&X($D1),0b11111100); + &vpshufd (&X($D2),&X($D2),0b11111100); + &vpshufd (&X($D3),&X($D3),0b11111100); + &vpshufd (&X($D4),&X($D4),0b11111100); + &jmp (&label("even")); + +&set_label("done",16); + &vmovd (&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value + &vmovd (&DWP(-16*3+4*1,"edi"),&X($D1)); + &vmovd (&DWP(-16*3+4*2,"edi"),&X($D2)); + &vmovd (&DWP(-16*3+4*3,"edi"),&X($D3)); + &vmovd (&DWP(-16*3+4*4,"edi"),&X($D4)); + &vzeroupper (); + &mov ("esp","ebp"); +&set_label("nodata"); +&function_end("_poly1305_blocks_avx2"); +} +&set_label("const_sse2",64); + &data_word(1<<24,0, 1<<24,0, 1<<24,0, 1<<24,0); + &data_word(0,0, 0,0, 0,0, 0,0); + &data_word(0x03ffffff,0,0x03ffffff,0, 0x03ffffff,0, 0x03ffffff,0); + &data_word(0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc); +} +&asciz ("Poly1305 for x86, CRYPTOGAMS by <appro\@openssl.org>"); +&align (4); + +&asm_finish(); + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-x86_64.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-x86_64.pl new file mode 100755 index 000000000..c014be1ca --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/asm/poly1305-x86_64.pl @@ -0,0 +1,4183 @@ +#! /usr/bin/env perl +# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for x86_64. +# +# March 2015 +# +# Initial release. +# +# December 2016 +# +# Add AVX512F+VL+BW code path. +# +# November 2017 +# +# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be +# executed even on Knights Landing. Trigger for modification was +# observation that AVX512 code paths can negatively affect overall +# Skylake-X system performance. Since we are likely to suppress +# AVX512F capability flag [at least on Skylake-X], conversion serves +# as kind of "investment protection". Note that next *lake processor, +# Cannolake, has AVX512IFMA code path to execute... +# +# Numbers are cycles per processed byte with poly1305_blocks alone, +# measured with rdtsc at fixed clock frequency. +# +# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 +# P4 4.46/+120% - +# Core 2 2.41/+90% - +# Westmere 1.88/+120% - +# Sandy Bridge 1.39/+140% 1.10 +# Haswell 1.14/+175% 1.11 0.65 +# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] +# Silvermont 2.83/+95% - +# Knights L 3.60/? 1.65 1.10 0.41(***) +# Goldmont 1.70/+180% - +# VIA Nano 1.82/+150% - +# Sledgehammer 1.38/+160% - +# Bulldozer 2.30/+130% 0.97 +# Ryzen 1.15/+200% 1.08 1.18 +# +# (*) improvement coefficients relative to clang are more modest and +# are ~50% on most processors, in both cases we are comparing to +# __int128 code; +# (**) SSE2 implementation was attempted, but among non-AVX processors +# it was faster than integer-only code only on older Intel P4 and +# Core processors, 50-30%, less newer processor is, but slower on +# contemporary ones, for example almost 2x slower on Atom, and as +# former are naturally disappearing, SSE2 is deemed unnecessary; +# (***) strangely enough performance seems to vary from core to core, +# listed result is best case; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { + $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12); + $avx += 2 if ($1==2.11 && $2>=8); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=12); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); +my ($mac,$nonce)=($inp,$len); # *_emit arguments +my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13)); +my ($h0,$h1,$h2)=("%r14","%rbx","%rbp"); + +sub poly1305_iteration { +# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 +# output: $h0-$h2 *= $r0-$r1 +$code.=<<___; + mulq $h0 # h0*r1 + mov %rax,$d2 + mov $r0,%rax + mov %rdx,$d3 + + mulq $h0 # h0*r0 + mov %rax,$h0 # future $h0 + mov $r0,%rax + mov %rdx,$d1 + + mulq $h1 # h1*r0 + add %rax,$d2 + mov $s1,%rax + adc %rdx,$d3 + + mulq $h1 # h1*s1 + mov $h2,$h1 # borrow $h1 + add %rax,$h0 + adc %rdx,$d1 + + imulq $s1,$h1 # h2*s1 + add $h1,$d2 + mov $d1,$h1 + adc \$0,$d3 + + imulq $r0,$h2 # h2*r0 + add $d2,$h1 + mov \$-4,%rax # mask value + adc $h2,$d3 + + and $d3,%rax # last reduction step + mov $d3,$h2 + shr \$2,$d3 + and \$3,$h2 + add $d3,%rax + add %rax,$h0 + adc \$0,$h1 + adc \$0,$h2 +___ +} + +######################################################################## +# Layout of opaque area is following. +# +# unsigned __int64 h[3]; # current hash value base 2^64 +# unsigned __int64 r[2]; # key value base 2^64 + +$code.=<<___; +.text + +.extern OPENSSL_ia32cap_P + +.globl poly1305_init +.hidden poly1305_init +.globl poly1305_blocks +.hidden poly1305_blocks +.globl poly1305_emit +.hidden poly1305_emit + +.type poly1305_init,\@function,3 +.align 32 +poly1305_init: +.cfi_startproc + xor %rax,%rax + mov %rax,0($ctx) # initialize hash value + mov %rax,8($ctx) + mov %rax,16($ctx) + + cmp \$0,$inp + je .Lno_key + + lea poly1305_blocks(%rip),%r10 + lea poly1305_emit(%rip),%r11 +___ +$code.=<<___ if ($avx); + mov OPENSSL_ia32cap_P+4(%rip),%r9 + lea poly1305_blocks_avx(%rip),%rax + lea poly1305_emit_avx(%rip),%rcx + bt \$`60-32`,%r9 # AVX? + cmovc %rax,%r10 + cmovc %rcx,%r11 +___ +$code.=<<___ if ($avx>1); + lea poly1305_blocks_avx2(%rip),%rax + bt \$`5+32`,%r9 # AVX2? + cmovc %rax,%r10 +___ +$code.=<<___ if ($avx>3); + mov \$`(1<<31|1<<21|1<<16)`,%rax + shr \$32,%r9 + and %rax,%r9 + cmp %rax,%r9 + je .Linit_base2_44 +___ +$code.=<<___; + mov \$0x0ffffffc0fffffff,%rax + mov \$0x0ffffffc0ffffffc,%rcx + and 0($inp),%rax + and 8($inp),%rcx + mov %rax,24($ctx) + mov %rcx,32($ctx) +___ +$code.=<<___ if ($flavour !~ /elf32/); + mov %r10,0(%rdx) + mov %r11,8(%rdx) +___ +$code.=<<___ if ($flavour =~ /elf32/); + mov %r10d,0(%rdx) + mov %r11d,4(%rdx) +___ +$code.=<<___; + mov \$1,%eax +.Lno_key: + ret +.cfi_endproc +.size poly1305_init,.-poly1305_init + +.type poly1305_blocks,\@function,4 +.align 32 +poly1305_blocks: +.cfi_startproc +.Lblocks: + shr \$4,$len + jz .Lno_data # too short + + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_body: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + mov 16($ctx),$h2 + + mov $s1,$r1 + shr \$2,$s1 + mov $r1,%rax + add $r1,$s1 # s1 = r1 + (r1 >> 2) + jmp .Loop + +.align 32 +.Loop: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 +___ + &poly1305_iteration(); +$code.=<<___; + mov $r1,%rax + dec %r15 # len-=16 + jnz .Loop + + mov $h0,0($ctx) # store hash value + mov $h1,8($ctx) + mov $h2,16($ctx) + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbp +.cfi_restore %rbp + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data: +.Lblocks_epilogue: + ret +.cfi_endproc +.size poly1305_blocks,.-poly1305_blocks + +.type poly1305_emit,\@function,3 +.align 32 +poly1305_emit: +.cfi_startproc +.Lemit: + mov 0($ctx),%r8 # load hash value + mov 8($ctx),%r9 + mov 16($ctx),%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + ret +.cfi_endproc +.size poly1305_emit,.-poly1305_emit +___ +if ($avx) { + +######################################################################## +# Layout of opaque area is following. +# +# unsigned __int32 h[5]; # current hash value base 2^26 +# unsigned __int32 is_base2_26; +# unsigned __int64 r[2]; # key value base 2^64 +# unsigned __int64 pad; +# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; +# +# where r^n are base 2^26 digits of degrees of multiplier key. There are +# 5 digits, but last four are interleaved with multiples of 5, totalling +# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. + +my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = + map("%xmm$_",(0..15)); + +$code.=<<___; +.type __poly1305_block,\@abi-omnipotent +.align 32 +__poly1305_block: +.cfi_startproc +___ + &poly1305_iteration(); +$code.=<<___; + ret +.cfi_endproc +.size __poly1305_block,.-__poly1305_block + +.type __poly1305_init_avx,\@abi-omnipotent +.align 32 +__poly1305_init_avx: +.cfi_startproc + mov $r0,$h0 + mov $r1,$h1 + xor $h2,$h2 + + lea 48+64($ctx),$ctx # size optimization + + mov $r1,%rax + call __poly1305_block # r^2 + + mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 + mov \$0x3ffffff,%edx + mov $h0,$d1 + and $h0#d,%eax + mov $r0,$d2 + and $r0#d,%edx + mov %eax,`16*0+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*0+4-64`($ctx) + shr \$26,$d2 + + mov \$0x3ffffff,%eax + mov \$0x3ffffff,%edx + and $d1#d,%eax + and $d2#d,%edx + mov %eax,`16*1+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*1+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*2+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*2+4-64`($ctx) + shr \$26,$d2 + + mov $h1,%rax + mov $r1,%rdx + shl \$12,%rax + shl \$12,%rdx + or $d1,%rax + or $d2,%rdx + and \$0x3ffffff,%eax + and \$0x3ffffff,%edx + mov %eax,`16*3+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*3+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*4+0-64`($ctx) + mov $h1,$d1 + mov %edx,`16*4+4-64`($ctx) + mov $r1,$d2 + + mov \$0x3ffffff,%eax + mov \$0x3ffffff,%edx + shr \$14,$d1 + shr \$14,$d2 + and $d1#d,%eax + and $d2#d,%edx + mov %eax,`16*5+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*5+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*6+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*6+4-64`($ctx) + shr \$26,$d2 + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+0-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d2#d,`16*7+4-64`($ctx) + lea ($d2,$d2,4),$d2 # *5 + mov $d1#d,`16*8+0-64`($ctx) + mov $d2#d,`16*8+4-64`($ctx) + + mov $r1,%rax + call __poly1305_block # r^3 + + mov \$0x3ffffff,%eax # save r^3 base 2^26 + mov $h0,$d1 + and $h0#d,%eax + shr \$26,$d1 + mov %eax,`16*0+12-64`($ctx) + + mov \$0x3ffffff,%edx + and $d1#d,%edx + mov %edx,`16*1+12-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*2+12-64`($ctx) + + mov $h1,%rax + shl \$12,%rax + or $d1,%rax + and \$0x3ffffff,%eax + mov %eax,`16*3+12-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov $h1,$d1 + mov %eax,`16*4+12-64`($ctx) + + mov \$0x3ffffff,%edx + shr \$14,$d1 + and $d1#d,%edx + mov %edx,`16*5+12-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*6+12-64`($ctx) + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+12-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d1#d,`16*8+12-64`($ctx) + + mov $r1,%rax + call __poly1305_block # r^4 + + mov \$0x3ffffff,%eax # save r^4 base 2^26 + mov $h0,$d1 + and $h0#d,%eax + shr \$26,$d1 + mov %eax,`16*0+8-64`($ctx) + + mov \$0x3ffffff,%edx + and $d1#d,%edx + mov %edx,`16*1+8-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*2+8-64`($ctx) + + mov $h1,%rax + shl \$12,%rax + or $d1,%rax + and \$0x3ffffff,%eax + mov %eax,`16*3+8-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov $h1,$d1 + mov %eax,`16*4+8-64`($ctx) + + mov \$0x3ffffff,%edx + shr \$14,$d1 + and $d1#d,%edx + mov %edx,`16*5+8-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*6+8-64`($ctx) + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+8-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d1#d,`16*8+8-64`($ctx) + + lea -48-64($ctx),$ctx # size [de-]optimization + ret +.cfi_endproc +.size __poly1305_init_avx,.-__poly1305_init_avx + +.type poly1305_blocks_avx,\@function,4 +.align 32 +poly1305_blocks_avx: +.cfi_startproc + mov 20($ctx),%r8d # is_base2_26 + cmp \$128,$len + jae .Lblocks_avx + test %r8d,%r8d + jz .Lblocks + +.Lblocks_avx: + and \$-16,$len + jz .Lno_data_avx + + vzeroupper + + test %r8d,%r8d + jz .Lbase2_64_avx + + test \$31,$len + jz .Leven_avx + + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_avx_body: + + mov $len,%r15 # reassign $len + + mov 0($ctx),$d1 # load hash value + mov 8($ctx),$d2 + mov 16($ctx),$h2#d + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + ################################# base 2^26 -> base 2^64 + mov $d1#d,$h0#d + and \$`-1*(1<<31)`,$d1 + mov $d2,$r1 # borrow $r1 + mov $d2#d,$h1#d + and \$`-1*(1<<31)`,$d2 + + shr \$6,$d1 + shl \$52,$r1 + add $d1,$h0 + shr \$12,$h1 + shr \$18,$d2 + add $r1,$h0 + adc $d2,$h1 + + mov $h2,$d1 + shl \$40,$d1 + shr \$24,$h2 + add $d1,$h1 + adc \$0,$h2 # can be partially reduced... + + mov \$-4,$d2 # ... so reduce + mov $h2,$d1 + and $h2,$d2 + shr \$2,$d1 + and \$3,$h2 + add $d2,$d1 # =*5 + add $d1,$h0 + adc \$0,$h1 + adc \$0,$h2 + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + + call __poly1305_block + + test $padbit,$padbit # if $padbit is zero, + jz .Lstore_base2_64_avx # store hash in base 2^64 format + + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$r0 + mov $h1,$r1 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$r0 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $r0,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$r1 + and \$0x3ffffff,$h1 # h[3] + or $r1,$h2 # h[4] + + sub \$16,%r15 + jz .Lstore_base2_26_avx + + vmovd %rax#d,$H0 + vmovd %rdx#d,$H1 + vmovd $h0#d,$H2 + vmovd $h1#d,$H3 + vmovd $h2#d,$H4 + jmp .Lproceed_avx + +.align 32 +.Lstore_base2_64_avx: + mov $h0,0($ctx) + mov $h1,8($ctx) + mov $h2,16($ctx) # note that is_base2_26 is zeroed + jmp .Ldone_avx + +.align 16 +.Lstore_base2_26_avx: + mov %rax#d,0($ctx) # store hash value base 2^26 + mov %rdx#d,4($ctx) + mov $h0#d,8($ctx) + mov $h1#d,12($ctx) + mov $h2#d,16($ctx) +.align 16 +.Ldone_avx: + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbp +.cfi_restore %rbp + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data_avx: +.Lblocks_avx_epilogue: + ret +.cfi_endproc + +.align 32 +.Lbase2_64_avx: +.cfi_startproc + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lbase2_64_avx_body: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + mov 16($ctx),$h2#d + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + test \$31,$len + jz .Linit_avx + + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + +.Linit_avx: + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$d1 + mov $h1,$d2 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$d1 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $d1,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$d2 + and \$0x3ffffff,$h1 # h[3] + or $d2,$h2 # h[4] + + vmovd %rax#d,$H0 + vmovd %rdx#d,$H1 + vmovd $h0#d,$H2 + vmovd $h1#d,$H3 + vmovd $h2#d,$H4 + movl \$1,20($ctx) # set is_base2_26 + + call __poly1305_init_avx + +.Lproceed_avx: + mov %r15,$len + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbp +.cfi_restore %rbp + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rax + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lbase2_64_avx_epilogue: + jmp .Ldo_avx +.cfi_endproc + +.align 32 +.Leven_avx: +.cfi_startproc + vmovd 4*0($ctx),$H0 # load hash value + vmovd 4*1($ctx),$H1 + vmovd 4*2($ctx),$H2 + vmovd 4*3($ctx),$H3 + vmovd 4*4($ctx),$H4 + +.Ldo_avx: +___ +$code.=<<___ if (!$win64); + lea -0x58(%rsp),%r11 +.cfi_def_cfa %r11,0x60 + sub \$0x178,%rsp +___ +$code.=<<___ if ($win64); + lea -0xf8(%rsp),%r11 + sub \$0x218,%rsp + vmovdqa %xmm6,0x50(%r11) + vmovdqa %xmm7,0x60(%r11) + vmovdqa %xmm8,0x70(%r11) + vmovdqa %xmm9,0x80(%r11) + vmovdqa %xmm10,0x90(%r11) + vmovdqa %xmm11,0xa0(%r11) + vmovdqa %xmm12,0xb0(%r11) + vmovdqa %xmm13,0xc0(%r11) + vmovdqa %xmm14,0xd0(%r11) + vmovdqa %xmm15,0xe0(%r11) +.Ldo_avx_body: +___ +$code.=<<___; + sub \$64,$len + lea -32($inp),%rax + cmovc %rax,$inp + + vmovdqu `16*3`($ctx),$D4 # preload r0^2 + lea `16*3+64`($ctx),$ctx # size optimization + lea .Lconst(%rip),%rcx + + ################################################################ + # load input + vmovdqu 16*2($inp),$T0 + vmovdqu 16*3($inp),$T1 + vmovdqa 64(%rcx),$MASK # .Lmask26 + + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + + vpsrlq \$40,$T4,$T4 # 4 + vpsrlq \$26,$T0,$T1 + vpand $MASK,$T0,$T0 # 0 + vpsrlq \$4,$T3,$T2 + vpand $MASK,$T1,$T1 # 1 + vpsrlq \$30,$T3,$T3 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + jbe .Lskip_loop_avx + + # expand and copy pre-calculated table to stack + vmovdqu `16*1-64`($ctx),$D1 + vmovdqu `16*2-64`($ctx),$D2 + vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 + vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 + vmovdqa $D3,-0x90(%r11) + vmovdqa $D0,0x00(%rsp) + vpshufd \$0xEE,$D1,$D4 + vmovdqu `16*3-64`($ctx),$D0 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D4,-0x80(%r11) + vmovdqa $D1,0x10(%rsp) + vpshufd \$0xEE,$D2,$D3 + vmovdqu `16*4-64`($ctx),$D1 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D3,-0x70(%r11) + vmovdqa $D2,0x20(%rsp) + vpshufd \$0xEE,$D0,$D4 + vmovdqu `16*5-64`($ctx),$D2 + vpshufd \$0x44,$D0,$D0 + vmovdqa $D4,-0x60(%r11) + vmovdqa $D0,0x30(%rsp) + vpshufd \$0xEE,$D1,$D3 + vmovdqu `16*6-64`($ctx),$D0 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D3,-0x50(%r11) + vmovdqa $D1,0x40(%rsp) + vpshufd \$0xEE,$D2,$D4 + vmovdqu `16*7-64`($ctx),$D1 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D4,-0x40(%r11) + vmovdqa $D2,0x50(%rsp) + vpshufd \$0xEE,$D0,$D3 + vmovdqu `16*8-64`($ctx),$D2 + vpshufd \$0x44,$D0,$D0 + vmovdqa $D3,-0x30(%r11) + vmovdqa $D0,0x60(%rsp) + vpshufd \$0xEE,$D1,$D4 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D4,-0x20(%r11) + vmovdqa $D1,0x70(%rsp) + vpshufd \$0xEE,$D2,$D3 + vmovdqa 0x00(%rsp),$D4 # preload r0^2 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D3,-0x10(%r11) + vmovdqa $D2,0x80(%rsp) + + jmp .Loop_avx + +.align 32 +.Loop_avx: + ################################################################ + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + # \___________________/ + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + # \___________________/ \____________________/ + # + # Note that we start with inp[2:3]*r^2. This is because it + # doesn't depend on reduction in previous iteration. + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # though note that $Tx and $Hx are "reversed" in this section, + # and $D4 is preloaded with r0^2... + + vpmuludq $T0,$D4,$D0 # d0 = h0*r0 + vpmuludq $T1,$D4,$D1 # d1 = h1*r0 + vmovdqa $H2,0x20(%r11) # offload hash + vpmuludq $T2,$D4,$D2 # d3 = h2*r0 + vmovdqa 0x10(%rsp),$H2 # r1^2 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0 + + vmovdqa $H0,0x00(%r11) # + vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 + vmovdqa $H1,0x10(%r11) # + vpmuludq $T3,$H2,$H1 # h3*r1 + vpaddq $H0,$D0,$D0 # d0 += h4*s1 + vpaddq $H1,$D4,$D4 # d4 += h3*r1 + vmovdqa $H3,0x30(%r11) # + vpmuludq $T2,$H2,$H0 # h2*r1 + vpmuludq $T1,$H2,$H1 # h1*r1 + vpaddq $H0,$D3,$D3 # d3 += h2*r1 + vmovdqa 0x30(%rsp),$H3 # r2^2 + vpaddq $H1,$D2,$D2 # d2 += h1*r1 + vmovdqa $H4,0x40(%r11) # + vpmuludq $T0,$H2,$H2 # h0*r1 + vpmuludq $T2,$H3,$H0 # h2*r2 + vpaddq $H2,$D1,$D1 # d1 += h0*r1 + + vmovdqa 0x40(%rsp),$H4 # s2^2 + vpaddq $H0,$D4,$D4 # d4 += h2*r2 + vpmuludq $T1,$H3,$H1 # h1*r2 + vpmuludq $T0,$H3,$H3 # h0*r2 + vpaddq $H1,$D3,$D3 # d3 += h1*r2 + vmovdqa 0x50(%rsp),$H2 # r3^2 + vpaddq $H3,$D2,$D2 # d2 += h0*r2 + vpmuludq $T4,$H4,$H0 # h4*s2 + vpmuludq $T3,$H4,$H4 # h3*s2 + vpaddq $H0,$D1,$D1 # d1 += h4*s2 + vmovdqa 0x60(%rsp),$H3 # s3^2 + vpaddq $H4,$D0,$D0 # d0 += h3*s2 + + vmovdqa 0x80(%rsp),$H4 # s4^2 + vpmuludq $T1,$H2,$H1 # h1*r3 + vpmuludq $T0,$H2,$H2 # h0*r3 + vpaddq $H1,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $T4,$H3,$H0 # h4*s3 + vpmuludq $T3,$H3,$H1 # h3*s3 + vpaddq $H0,$D2,$D2 # d2 += h4*s3 + vmovdqu 16*0($inp),$H0 # load input + vpaddq $H1,$D1,$D1 # d1 += h3*s3 + vpmuludq $T2,$H3,$H3 # h2*s3 + vpmuludq $T2,$H4,$T2 # h2*s4 + vpaddq $H3,$D0,$D0 # d0 += h2*s3 + + vmovdqu 16*1($inp),$H1 # + vpaddq $T2,$D1,$D1 # d1 += h2*s4 + vpmuludq $T3,$H4,$T3 # h3*s4 + vpmuludq $T4,$H4,$T4 # h4*s4 + vpsrldq \$6,$H0,$H2 # splat input + vpaddq $T3,$D2,$D2 # d2 += h3*s4 + vpaddq $T4,$D3,$D3 # d3 += h4*s4 + vpsrldq \$6,$H1,$H3 # + vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 + vpmuludq $T1,$H4,$T0 # h1*s4 + vpunpckhqdq $H1,$H0,$H4 # 4 + vpaddq $T4,$D4,$D4 # d4 += h0*r4 + vmovdqa -0x90(%r11),$T4 # r0^4 + vpaddq $T0,$D0,$D0 # d0 += h1*s4 + + vpunpcklqdq $H1,$H0,$H0 # 0:1 + vpunpcklqdq $H3,$H2,$H3 # 2:3 + + #vpsrlq \$40,$H4,$H4 # 4 + vpsrldq \$`40/8`,$H4,$H4 # 4 + vpsrlq \$26,$H0,$H1 + vpand $MASK,$H0,$H0 # 0 + vpsrlq \$4,$H3,$H2 + vpand $MASK,$H1,$H1 # 1 + vpand 0(%rcx),$H4,$H4 # .Lmask24 + vpsrlq \$30,$H3,$H3 + vpand $MASK,$H2,$H2 # 2 + vpand $MASK,$H3,$H3 # 3 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always + + vpaddq 0x00(%r11),$H0,$H0 # add hash value + vpaddq 0x10(%r11),$H1,$H1 + vpaddq 0x20(%r11),$H2,$H2 + vpaddq 0x30(%r11),$H3,$H3 + vpaddq 0x40(%r11),$H4,$H4 + + lea 16*2($inp),%rax + lea 16*4($inp),$inp + sub \$64,$len + cmovc %rax,$inp + + ################################################################ + # Now we accumulate (inp[0:1]+hash)*r^4 + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + vpmuludq $H0,$T4,$T0 # h0*r0 + vpmuludq $H1,$T4,$T1 # h1*r0 + vpaddq $T0,$D0,$D0 + vpaddq $T1,$D1,$D1 + vmovdqa -0x80(%r11),$T2 # r1^4 + vpmuludq $H2,$T4,$T0 # h2*r0 + vpmuludq $H3,$T4,$T1 # h3*r0 + vpaddq $T0,$D2,$D2 + vpaddq $T1,$D3,$D3 + vpmuludq $H4,$T4,$T4 # h4*r0 + vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 + vpaddq $T4,$D4,$D4 + + vpaddq $T0,$D0,$D0 # d0 += h4*s1 + vpmuludq $H2,$T2,$T1 # h2*r1 + vpmuludq $H3,$T2,$T0 # h3*r1 + vpaddq $T1,$D3,$D3 # d3 += h2*r1 + vmovdqa -0x60(%r11),$T3 # r2^4 + vpaddq $T0,$D4,$D4 # d4 += h3*r1 + vpmuludq $H1,$T2,$T1 # h1*r1 + vpmuludq $H0,$T2,$T2 # h0*r1 + vpaddq $T1,$D2,$D2 # d2 += h1*r1 + vpaddq $T2,$D1,$D1 # d1 += h0*r1 + + vmovdqa -0x50(%r11),$T4 # s2^4 + vpmuludq $H2,$T3,$T0 # h2*r2 + vpmuludq $H1,$T3,$T1 # h1*r2 + vpaddq $T0,$D4,$D4 # d4 += h2*r2 + vpaddq $T1,$D3,$D3 # d3 += h1*r2 + vmovdqa -0x40(%r11),$T2 # r3^4 + vpmuludq $H0,$T3,$T3 # h0*r2 + vpmuludq $H4,$T4,$T0 # h4*s2 + vpaddq $T3,$D2,$D2 # d2 += h0*r2 + vpaddq $T0,$D1,$D1 # d1 += h4*s2 + vmovdqa -0x30(%r11),$T3 # s3^4 + vpmuludq $H3,$T4,$T4 # h3*s2 + vpmuludq $H1,$T2,$T1 # h1*r3 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + + vmovdqa -0x10(%r11),$T4 # s4^4 + vpaddq $T1,$D4,$D4 # d4 += h1*r3 + vpmuludq $H0,$T2,$T2 # h0*r3 + vpmuludq $H4,$T3,$T0 # h4*s3 + vpaddq $T2,$D3,$D3 # d3 += h0*r3 + vpaddq $T0,$D2,$D2 # d2 += h4*s3 + vmovdqu 16*2($inp),$T0 # load input + vpmuludq $H3,$T3,$T2 # h3*s3 + vpmuludq $H2,$T3,$T3 # h2*s3 + vpaddq $T2,$D1,$D1 # d1 += h3*s3 + vmovdqu 16*3($inp),$T1 # + vpaddq $T3,$D0,$D0 # d0 += h2*s3 + + vpmuludq $H2,$T4,$H2 # h2*s4 + vpmuludq $H3,$T4,$H3 # h3*s4 + vpsrldq \$6,$T0,$T2 # splat input + vpaddq $H2,$D1,$D1 # d1 += h2*s4 + vpmuludq $H4,$T4,$H4 # h4*s4 + vpsrldq \$6,$T1,$T3 # + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 + vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 + vpmuludq $H1,$T4,$H0 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + + #vpsrlq \$40,$T4,$T4 # 4 + vpsrldq \$`40/8`,$T4,$T4 # 4 + vpsrlq \$26,$T0,$T1 + vmovdqa 0x00(%rsp),$D4 # preload r0^2 + vpand $MASK,$T0,$T0 # 0 + vpsrlq \$4,$T3,$T2 + vpand $MASK,$T1,$T1 # 1 + vpand 0(%rcx),$T4,$T4 # .Lmask24 + vpsrlq \$30,$T3,$T3 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + ################################################################ + # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + # and P. Schwabe + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D0 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D0,$H0,$H0 + vpsllq \$2,$D0,$D0 + vpaddq $D0,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + ja .Loop_avx + +.Lskip_loop_avx: + ################################################################ + # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 + add \$32,$len + jnz .Long_tail_avx + + vpaddq $H2,$T2,$T2 + vpaddq $H0,$T0,$T0 + vpaddq $H1,$T1,$T1 + vpaddq $H3,$T3,$T3 + vpaddq $H4,$T4,$T4 + +.Long_tail_avx: + vmovdqa $H2,0x20(%r11) + vmovdqa $H0,0x00(%r11) + vmovdqa $H1,0x10(%r11) + vmovdqa $H3,0x30(%r11) + vmovdqa $H4,0x40(%r11) + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + vpmuludq $T2,$D4,$D2 # d2 = h2*r0 + vpmuludq $T0,$D4,$D0 # d0 = h0*r0 + vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n + vpmuludq $T1,$D4,$D1 # d1 = h1*r0 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0 + + vpmuludq $T3,$H2,$H0 # h3*r1 + vpaddq $H0,$D4,$D4 # d4 += h3*r1 + vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n + vpmuludq $T2,$H2,$H1 # h2*r1 + vpaddq $H1,$D3,$D3 # d3 += h2*r1 + vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n + vpmuludq $T1,$H2,$H0 # h1*r1 + vpaddq $H0,$D2,$D2 # d2 += h1*r1 + vpmuludq $T0,$H2,$H2 # h0*r1 + vpaddq $H2,$D1,$D1 # d1 += h0*r1 + vpmuludq $T4,$H3,$H3 # h4*s1 + vpaddq $H3,$D0,$D0 # d0 += h4*s1 + + vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n + vpmuludq $T2,$H4,$H1 # h2*r2 + vpaddq $H1,$D4,$D4 # d4 += h2*r2 + vpmuludq $T1,$H4,$H0 # h1*r2 + vpaddq $H0,$D3,$D3 # d3 += h1*r2 + vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n + vpmuludq $T0,$H4,$H4 # h0*r2 + vpaddq $H4,$D2,$D2 # d2 += h0*r2 + vpmuludq $T4,$H2,$H1 # h4*s2 + vpaddq $H1,$D1,$D1 # d1 += h4*s2 + vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n + vpmuludq $T3,$H2,$H2 # h3*s2 + vpaddq $H2,$D0,$D0 # d0 += h3*s2 + + vpmuludq $T1,$H3,$H0 # h1*r3 + vpaddq $H0,$D4,$D4 # d4 += h1*r3 + vpmuludq $T0,$H3,$H3 # h0*r3 + vpaddq $H3,$D3,$D3 # d3 += h0*r3 + vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n + vpmuludq $T4,$H4,$H1 # h4*s3 + vpaddq $H1,$D2,$D2 # d2 += h4*s3 + vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n + vpmuludq $T3,$H4,$H0 # h3*s3 + vpaddq $H0,$D1,$D1 # d1 += h3*s3 + vpmuludq $T2,$H4,$H4 # h2*s3 + vpaddq $H4,$D0,$D0 # d0 += h2*s3 + + vpmuludq $T0,$H2,$H2 # h0*r4 + vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 + vpmuludq $T4,$H3,$H1 # h4*s4 + vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 + vpmuludq $T3,$H3,$H0 # h3*s4 + vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 + vpmuludq $T2,$H3,$H1 # h2*s4 + vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 + vpmuludq $T1,$H3,$H3 # h1*s4 + vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 + + jz .Lshort_tail_avx + + vmovdqu 16*0($inp),$H0 # load input + vmovdqu 16*1($inp),$H1 + + vpsrldq \$6,$H0,$H2 # splat input + vpsrldq \$6,$H1,$H3 + vpunpckhqdq $H1,$H0,$H4 # 4 + vpunpcklqdq $H1,$H0,$H0 # 0:1 + vpunpcklqdq $H3,$H2,$H3 # 2:3 + + vpsrlq \$40,$H4,$H4 # 4 + vpsrlq \$26,$H0,$H1 + vpand $MASK,$H0,$H0 # 0 + vpsrlq \$4,$H3,$H2 + vpand $MASK,$H1,$H1 # 1 + vpsrlq \$30,$H3,$H3 + vpand $MASK,$H2,$H2 # 2 + vpand $MASK,$H3,$H3 # 3 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always + + vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 + vpaddq 0x00(%r11),$H0,$H0 + vpaddq 0x10(%r11),$H1,$H1 + vpaddq 0x20(%r11),$H2,$H2 + vpaddq 0x30(%r11),$H3,$H3 + vpaddq 0x40(%r11),$H4,$H4 + + ################################################################ + # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate + + vpmuludq $H0,$T4,$T0 # h0*r0 + vpaddq $T0,$D0,$D0 # d0 += h0*r0 + vpmuludq $H1,$T4,$T1 # h1*r0 + vpaddq $T1,$D1,$D1 # d1 += h1*r0 + vpmuludq $H2,$T4,$T0 # h2*r0 + vpaddq $T0,$D2,$D2 # d2 += h2*r0 + vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n + vpmuludq $H3,$T4,$T1 # h3*r0 + vpaddq $T1,$D3,$D3 # d3 += h3*r0 + vpmuludq $H4,$T4,$T4 # h4*r0 + vpaddq $T4,$D4,$D4 # d4 += h4*r0 + + vpmuludq $H3,$T2,$T0 # h3*r1 + vpaddq $T0,$D4,$D4 # d4 += h3*r1 + vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 + vpmuludq $H2,$T2,$T1 # h2*r1 + vpaddq $T1,$D3,$D3 # d3 += h2*r1 + vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 + vpmuludq $H1,$T2,$T0 # h1*r1 + vpaddq $T0,$D2,$D2 # d2 += h1*r1 + vpmuludq $H0,$T2,$T2 # h0*r1 + vpaddq $T2,$D1,$D1 # d1 += h0*r1 + vpmuludq $H4,$T3,$T3 # h4*s1 + vpaddq $T3,$D0,$D0 # d0 += h4*s1 + + vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 + vpmuludq $H2,$T4,$T1 # h2*r2 + vpaddq $T1,$D4,$D4 # d4 += h2*r2 + vpmuludq $H1,$T4,$T0 # h1*r2 + vpaddq $T0,$D3,$D3 # d3 += h1*r2 + vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 + vpmuludq $H0,$T4,$T4 # h0*r2 + vpaddq $T4,$D2,$D2 # d2 += h0*r2 + vpmuludq $H4,$T2,$T1 # h4*s2 + vpaddq $T1,$D1,$D1 # d1 += h4*s2 + vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 + vpmuludq $H3,$T2,$T2 # h3*s2 + vpaddq $T2,$D0,$D0 # d0 += h3*s2 + + vpmuludq $H1,$T3,$T0 # h1*r3 + vpaddq $T0,$D4,$D4 # d4 += h1*r3 + vpmuludq $H0,$T3,$T3 # h0*r3 + vpaddq $T3,$D3,$D3 # d3 += h0*r3 + vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 + vpmuludq $H4,$T4,$T1 # h4*s3 + vpaddq $T1,$D2,$D2 # d2 += h4*s3 + vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 + vpmuludq $H3,$T4,$T0 # h3*s3 + vpaddq $T0,$D1,$D1 # d1 += h3*s3 + vpmuludq $H2,$T4,$T4 # h2*s3 + vpaddq $T4,$D0,$D0 # d0 += h2*s3 + + vpmuludq $H0,$T2,$T2 # h0*r4 + vpaddq $T2,$D4,$D4 # d4 += h0*r4 + vpmuludq $H4,$T3,$T1 # h4*s4 + vpaddq $T1,$D3,$D3 # d3 += h4*s4 + vpmuludq $H3,$T3,$T0 # h3*s4 + vpaddq $T0,$D2,$D2 # d2 += h3*s4 + vpmuludq $H2,$T3,$T1 # h2*s4 + vpaddq $T1,$D1,$D1 # d1 += h2*s4 + vpmuludq $H1,$T3,$T3 # h1*s4 + vpaddq $T3,$D0,$D0 # d0 += h1*s4 + +.Lshort_tail_avx: + ################################################################ + # horizontal addition + + vpsrldq \$8,$D4,$T4 + vpsrldq \$8,$D3,$T3 + vpsrldq \$8,$D1,$T1 + vpsrldq \$8,$D0,$T0 + vpsrldq \$8,$D2,$T2 + vpaddq $T3,$D3,$D3 + vpaddq $T4,$D4,$D4 + vpaddq $T0,$D0,$D0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$D2,$D2 + + ################################################################ + # lazy reduction + + vpsrlq \$26,$D3,$H3 + vpand $MASK,$D3,$D3 + vpaddq $H3,$D4,$D4 # h3 -> h4 + + vpsrlq \$26,$D0,$H0 + vpand $MASK,$D0,$D0 + vpaddq $H0,$D1,$D1 # h0 -> h1 + + vpsrlq \$26,$D4,$H4 + vpand $MASK,$D4,$D4 + + vpsrlq \$26,$D1,$H1 + vpand $MASK,$D1,$D1 + vpaddq $H1,$D2,$D2 # h1 -> h2 + + vpaddq $H4,$D0,$D0 + vpsllq \$2,$H4,$H4 + vpaddq $H4,$D0,$D0 # h4 -> h0 + + vpsrlq \$26,$D2,$H2 + vpand $MASK,$D2,$D2 + vpaddq $H2,$D3,$D3 # h2 -> h3 + + vpsrlq \$26,$D0,$H0 + vpand $MASK,$D0,$D0 + vpaddq $H0,$D1,$D1 # h0 -> h1 + + vpsrlq \$26,$D3,$H3 + vpand $MASK,$D3,$D3 + vpaddq $H3,$D4,$D4 # h3 -> h4 + + vmovd $D0,`4*0-48-64`($ctx) # save partially reduced + vmovd $D1,`4*1-48-64`($ctx) + vmovd $D2,`4*2-48-64`($ctx) + vmovd $D3,`4*3-48-64`($ctx) + vmovd $D4,`4*4-48-64`($ctx) +___ +$code.=<<___ if ($win64); + vmovdqa 0x50(%r11),%xmm6 + vmovdqa 0x60(%r11),%xmm7 + vmovdqa 0x70(%r11),%xmm8 + vmovdqa 0x80(%r11),%xmm9 + vmovdqa 0x90(%r11),%xmm10 + vmovdqa 0xa0(%r11),%xmm11 + vmovdqa 0xb0(%r11),%xmm12 + vmovdqa 0xc0(%r11),%xmm13 + vmovdqa 0xd0(%r11),%xmm14 + vmovdqa 0xe0(%r11),%xmm15 + lea 0xf8(%r11),%rsp +.Ldo_avx_epilogue: +___ +$code.=<<___ if (!$win64); + lea 0x58(%r11),%rsp +.cfi_def_cfa %rsp,8 +___ +$code.=<<___; + vzeroupper + ret +.cfi_endproc +.size poly1305_blocks_avx,.-poly1305_blocks_avx + +.type poly1305_emit_avx,\@function,3 +.align 32 +poly1305_emit_avx: +.cfi_startproc + cmpl \$0,20($ctx) # is_base2_26? + je .Lemit + + mov 0($ctx),%eax # load hash value base 2^26 + mov 4($ctx),%ecx + mov 8($ctx),%r8d + mov 12($ctx),%r11d + mov 16($ctx),%r10d + + shl \$26,%rcx # base 2^26 -> base 2^64 + mov %r8,%r9 + shl \$52,%r8 + add %rcx,%rax + shr \$12,%r9 + add %rax,%r8 # h0 + adc \$0,%r9 + + shl \$14,%r11 + mov %r10,%rax + shr \$24,%r10 + add %r11,%r9 + shl \$40,%rax + add %rax,%r9 # h1 + adc \$0,%r10 # h2 + + mov %r10,%rax # could be partially reduced, so reduce + mov %r10,%rcx + and \$3,%r10 + shr \$2,%rax + and \$-4,%rcx + add %rcx,%rax + add %rax,%r8 + adc \$0,%r9 + adc \$0,%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + ret +.cfi_endproc +.size poly1305_emit_avx,.-poly1305_emit_avx +___ + +if ($avx>1) { +my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = + map("%ymm$_",(0..15)); +my $S4=$MASK; + +$code.=<<___; +.type poly1305_blocks_avx2,\@function,4 +.align 32 +poly1305_blocks_avx2: +.cfi_startproc + mov 20($ctx),%r8d # is_base2_26 + cmp \$128,$len + jae .Lblocks_avx2 + test %r8d,%r8d + jz .Lblocks + +.Lblocks_avx2: + and \$-16,$len + jz .Lno_data_avx2 + + vzeroupper + + test %r8d,%r8d + jz .Lbase2_64_avx2 + + test \$63,$len + jz .Leven_avx2 + + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_avx2_body: + + mov $len,%r15 # reassign $len + + mov 0($ctx),$d1 # load hash value + mov 8($ctx),$d2 + mov 16($ctx),$h2#d + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + ################################# base 2^26 -> base 2^64 + mov $d1#d,$h0#d + and \$`-1*(1<<31)`,$d1 + mov $d2,$r1 # borrow $r1 + mov $d2#d,$h1#d + and \$`-1*(1<<31)`,$d2 + + shr \$6,$d1 + shl \$52,$r1 + add $d1,$h0 + shr \$12,$h1 + shr \$18,$d2 + add $r1,$h0 + adc $d2,$h1 + + mov $h2,$d1 + shl \$40,$d1 + shr \$24,$h2 + add $d1,$h1 + adc \$0,$h2 # can be partially reduced... + + mov \$-4,$d2 # ... so reduce + mov $h2,$d1 + and $h2,$d2 + shr \$2,$d1 + and \$3,$h2 + add $d2,$d1 # =*5 + add $d1,$h0 + adc \$0,$h1 + adc \$0,$h2 + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + +.Lbase2_26_pre_avx2: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + mov $r1,%rax + + test \$63,%r15 + jnz .Lbase2_26_pre_avx2 + + test $padbit,$padbit # if $padbit is zero, + jz .Lstore_base2_64_avx2 # store hash in base 2^64 format + + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$r0 + mov $h1,$r1 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$r0 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $r0,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$r1 + and \$0x3ffffff,$h1 # h[3] + or $r1,$h2 # h[4] + + test %r15,%r15 + jz .Lstore_base2_26_avx2 + + vmovd %rax#d,%x#$H0 + vmovd %rdx#d,%x#$H1 + vmovd $h0#d,%x#$H2 + vmovd $h1#d,%x#$H3 + vmovd $h2#d,%x#$H4 + jmp .Lproceed_avx2 + +.align 32 +.Lstore_base2_64_avx2: + mov $h0,0($ctx) + mov $h1,8($ctx) + mov $h2,16($ctx) # note that is_base2_26 is zeroed + jmp .Ldone_avx2 + +.align 16 +.Lstore_base2_26_avx2: + mov %rax#d,0($ctx) # store hash value base 2^26 + mov %rdx#d,4($ctx) + mov $h0#d,8($ctx) + mov $h1#d,12($ctx) + mov $h2#d,16($ctx) +.align 16 +.Ldone_avx2: + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbp +.cfi_restore %rbp + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data_avx2: +.Lblocks_avx2_epilogue: + ret +.cfi_endproc + +.align 32 +.Lbase2_64_avx2: +.cfi_startproc + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lbase2_64_avx2_body: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + mov 16($ctx),$h2#d + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + test \$63,$len + jz .Linit_avx2 + +.Lbase2_64_pre_avx2: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + mov $r1,%rax + + test \$63,%r15 + jnz .Lbase2_64_pre_avx2 + +.Linit_avx2: + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$d1 + mov $h1,$d2 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$d1 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $d1,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$d2 + and \$0x3ffffff,$h1 # h[3] + or $d2,$h2 # h[4] + + vmovd %rax#d,%x#$H0 + vmovd %rdx#d,%x#$H1 + vmovd $h0#d,%x#$H2 + vmovd $h1#d,%x#$H3 + vmovd $h2#d,%x#$H4 + movl \$1,20($ctx) # set is_base2_26 + + call __poly1305_init_avx + +.Lproceed_avx2: + mov %r15,$len # restore $len + mov OPENSSL_ia32cap_P+8(%rip),%r10d + mov \$`(1<<31|1<<30|1<<16)`,%r11d + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbp +.cfi_restore %rbp + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rax + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lbase2_64_avx2_epilogue: + jmp .Ldo_avx2 +.cfi_endproc + +.align 32 +.Leven_avx2: +.cfi_startproc + mov OPENSSL_ia32cap_P+8(%rip),%r10d + vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 + vmovd 4*1($ctx),%x#$H1 + vmovd 4*2($ctx),%x#$H2 + vmovd 4*3($ctx),%x#$H3 + vmovd 4*4($ctx),%x#$H4 + +.Ldo_avx2: +___ +$code.=<<___ if ($avx>2); + cmp \$512,$len + jb .Lskip_avx512 + and %r11d,%r10d + test \$`1<<16`,%r10d # check for AVX512F + jnz .Lblocks_avx512 +.Lskip_avx512: +___ +$code.=<<___ if (!$win64); + lea -8(%rsp),%r11 +.cfi_def_cfa %r11,16 + sub \$0x128,%rsp +___ +$code.=<<___ if ($win64); + lea -0xf8(%rsp),%r11 + sub \$0x1c8,%rsp + vmovdqa %xmm6,0x50(%r11) + vmovdqa %xmm7,0x60(%r11) + vmovdqa %xmm8,0x70(%r11) + vmovdqa %xmm9,0x80(%r11) + vmovdqa %xmm10,0x90(%r11) + vmovdqa %xmm11,0xa0(%r11) + vmovdqa %xmm12,0xb0(%r11) + vmovdqa %xmm13,0xc0(%r11) + vmovdqa %xmm14,0xd0(%r11) + vmovdqa %xmm15,0xe0(%r11) +.Ldo_avx2_body: +___ +$code.=<<___; + lea .Lconst(%rip),%rcx + lea 48+64($ctx),$ctx # size optimization + vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 + + # expand and copy pre-calculated table to stack + vmovdqu `16*0-64`($ctx),%x#$T2 + and \$-512,%rsp + vmovdqu `16*1-64`($ctx),%x#$T3 + vmovdqu `16*2-64`($ctx),%x#$T4 + vmovdqu `16*3-64`($ctx),%x#$D0 + vmovdqu `16*4-64`($ctx),%x#$D1 + vmovdqu `16*5-64`($ctx),%x#$D2 + lea 0x90(%rsp),%rax # size optimization + vmovdqu `16*6-64`($ctx),%x#$D3 + vpermd $T2,$T0,$T2 # 00003412 -> 14243444 + vmovdqu `16*7-64`($ctx),%x#$D4 + vpermd $T3,$T0,$T3 + vmovdqu `16*8-64`($ctx),%x#$MASK + vpermd $T4,$T0,$T4 + vmovdqa $T2,0x00(%rsp) + vpermd $D0,$T0,$D0 + vmovdqa $T3,0x20-0x90(%rax) + vpermd $D1,$T0,$D1 + vmovdqa $T4,0x40-0x90(%rax) + vpermd $D2,$T0,$D2 + vmovdqa $D0,0x60-0x90(%rax) + vpermd $D3,$T0,$D3 + vmovdqa $D1,0x80-0x90(%rax) + vpermd $D4,$T0,$D4 + vmovdqa $D2,0xa0-0x90(%rax) + vpermd $MASK,$T0,$MASK + vmovdqa $D3,0xc0-0x90(%rax) + vmovdqa $D4,0xe0-0x90(%rax) + vmovdqa $MASK,0x100-0x90(%rax) + vmovdqa 64(%rcx),$MASK # .Lmask26 + + ################################################################ + # load input + vmovdqu 16*0($inp),%x#$T0 + vmovdqu 16*1($inp),%x#$T1 + vinserti128 \$1,16*2($inp),$T0,$T0 + vinserti128 \$1,16*3($inp),$T1,$T1 + lea 16*4($inp),$inp + + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpunpcklqdq $T3,$T2,$T2 # 2:3 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + + vpsrlq \$30,$T2,$T3 + vpsrlq \$4,$T2,$T2 + vpsrlq \$26,$T0,$T1 + vpsrlq \$40,$T4,$T4 # 4 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T0,$T0 # 0 + vpand $MASK,$T1,$T1 # 1 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + vpaddq $H2,$T2,$H2 # accumulate input + sub \$64,$len + jz .Ltail_avx2 + jmp .Loop_avx2 + +.align 32 +.Loop_avx2: + ################################################################ + # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 + # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 + # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 + # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 + # \________/\__________/ + ################################################################ + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + vmovdqa `32*0`(%rsp),$T0 # r0^4 + vpaddq $H1,$T1,$H1 + vmovdqa `32*1`(%rsp),$T1 # r1^4 + vpaddq $H3,$T3,$H3 + vmovdqa `32*3`(%rsp),$T2 # r2^4 + vpaddq $H4,$T4,$H4 + vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 + vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # however, as h2 is "chronologically" first one available pull + # corresponding operations up, so it's + # + # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 + # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 + + vpmuludq $H2,$T0,$D2 # d2 = h2*r0 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + + vpmuludq $H0,$T1,$T4 # h0*r1 + vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp + vpaddq $T4,$D1,$D1 # d1 += h0*r1 + vpaddq $H2,$D2,$D2 # d2 += h1*r1 + vpmuludq $H3,$T1,$T4 # h3*r1 + vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 + vpaddq $T4,$D4,$D4 # d4 += h3*r1 + vpaddq $H2,$D0,$D0 # d0 += h4*s1 + vmovdqa `32*4-0x90`(%rax),$T1 # s2 + + vpmuludq $H0,$T0,$T4 # h0*r0 + vpmuludq $H1,$T0,$H2 # h1*r0 + vpaddq $T4,$D0,$D0 # d0 += h0*r0 + vpaddq $H2,$D1,$D1 # d1 += h1*r0 + vpmuludq $H3,$T0,$T4 # h3*r0 + vpmuludq $H4,$T0,$H2 # h4*r0 + vmovdqu 16*0($inp),%x#$T0 # load input + vpaddq $T4,$D3,$D3 # d3 += h3*r0 + vpaddq $H2,$D4,$D4 # d4 += h4*r0 + vinserti128 \$1,16*2($inp),$T0,$T0 + + vpmuludq $H3,$T1,$T4 # h3*s2 + vpmuludq $H4,$T1,$H2 # h4*s2 + vmovdqu 16*1($inp),%x#$T1 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + vpaddq $H2,$D1,$D1 # d1 += h4*s2 + vmovdqa `32*5-0x90`(%rax),$H2 # r3 + vpmuludq $H1,$T2,$T4 # h1*r2 + vpmuludq $H0,$T2,$T2 # h0*r2 + vpaddq $T4,$D3,$D3 # d3 += h1*r2 + vpaddq $T2,$D2,$D2 # d2 += h0*r2 + vinserti128 \$1,16*3($inp),$T1,$T1 + lea 16*4($inp),$inp + + vpmuludq $H1,$H2,$T4 # h1*r3 + vpmuludq $H0,$H2,$H2 # h0*r3 + vpsrldq \$6,$T0,$T2 # splat input + vpaddq $T4,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $H3,$T3,$T4 # h3*s3 + vpmuludq $H4,$T3,$H2 # h4*s3 + vpsrldq \$6,$T1,$T3 + vpaddq $T4,$D1,$D1 # d1 += h3*s3 + vpaddq $H2,$D2,$D2 # d2 += h4*s3 + vpunpckhqdq $T1,$T0,$T4 # 4 + + vpmuludq $H3,$S4,$H3 # h3*s4 + vpmuludq $H4,$S4,$H4 # h4*s4 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 + vpmuludq $H1,$S4,$H0 # h1*s4 + vmovdqa 64(%rcx),$MASK # .Lmask26 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + ################################################################ + # lazy reduction (interleaved with tail of input splat) + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$4,$T3,$T2 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpand $MASK,$T2,$T2 # 2 + vpsrlq \$26,$T0,$T1 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpaddq $T2,$H2,$H2 # modulo-scheduled + vpsrlq \$30,$T3,$T3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$40,$T4,$T4 # 4 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpand $MASK,$T0,$T0 # 0 + vpand $MASK,$T1,$T1 # 1 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + sub \$64,$len + jnz .Loop_avx2 + + .byte 0x66,0x90 +.Ltail_avx2: + ################################################################ + # while above multiplications were by r^4 in all lanes, in last + # iteration we multiply least significant lane by r^4 and most + # significant one by r, so copy of above except that references + # to the precomputed table are displaced by 4... + + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + vmovdqu `32*0+4`(%rsp),$T0 # r0^4 + vpaddq $H1,$T1,$H1 + vmovdqu `32*1+4`(%rsp),$T1 # r1^4 + vpaddq $H3,$T3,$H3 + vmovdqu `32*3+4`(%rsp),$T2 # r2^4 + vpaddq $H4,$T4,$H4 + vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 + vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 + + vpmuludq $H2,$T0,$D2 # d2 = h2*r0 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + + vpmuludq $H0,$T1,$T4 # h0*r1 + vpmuludq $H1,$T1,$H2 # h1*r1 + vpaddq $T4,$D1,$D1 # d1 += h0*r1 + vpaddq $H2,$D2,$D2 # d2 += h1*r1 + vpmuludq $H3,$T1,$T4 # h3*r1 + vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 + vpaddq $T4,$D4,$D4 # d4 += h3*r1 + vpaddq $H2,$D0,$D0 # d0 += h4*s1 + + vpmuludq $H0,$T0,$T4 # h0*r0 + vpmuludq $H1,$T0,$H2 # h1*r0 + vpaddq $T4,$D0,$D0 # d0 += h0*r0 + vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 + vpaddq $H2,$D1,$D1 # d1 += h1*r0 + vpmuludq $H3,$T0,$T4 # h3*r0 + vpmuludq $H4,$T0,$H2 # h4*r0 + vpaddq $T4,$D3,$D3 # d3 += h3*r0 + vpaddq $H2,$D4,$D4 # d4 += h4*r0 + + vpmuludq $H3,$T1,$T4 # h3*s2 + vpmuludq $H4,$T1,$H2 # h4*s2 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + vpaddq $H2,$D1,$D1 # d1 += h4*s2 + vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 + vpmuludq $H1,$T2,$T4 # h1*r2 + vpmuludq $H0,$T2,$T2 # h0*r2 + vpaddq $T4,$D3,$D3 # d3 += h1*r2 + vpaddq $T2,$D2,$D2 # d2 += h0*r2 + + vpmuludq $H1,$H2,$T4 # h1*r3 + vpmuludq $H0,$H2,$H2 # h0*r3 + vpaddq $T4,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $H3,$T3,$T4 # h3*s3 + vpmuludq $H4,$T3,$H2 # h4*s3 + vpaddq $T4,$D1,$D1 # d1 += h3*s3 + vpaddq $H2,$D2,$D2 # d2 += h4*s3 + + vpmuludq $H3,$S4,$H3 # h3*s4 + vpmuludq $H4,$S4,$H4 # h4*s4 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 + vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 + vpmuludq $H1,$S4,$H0 # h1*s4 + vmovdqa 64(%rcx),$MASK # .Lmask26 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + ################################################################ + # horizontal addition + + vpsrldq \$8,$D1,$T1 + vpsrldq \$8,$H2,$T2 + vpsrldq \$8,$H3,$T3 + vpsrldq \$8,$H4,$T4 + vpsrldq \$8,$H0,$T0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$H2,$H2 + vpaddq $T3,$H3,$H3 + vpaddq $T4,$H4,$H4 + vpaddq $T0,$H0,$H0 + + vpermq \$0x2,$H3,$T3 + vpermq \$0x2,$H4,$T4 + vpermq \$0x2,$H0,$T0 + vpermq \$0x2,$D1,$T1 + vpermq \$0x2,$H2,$T2 + vpaddq $T3,$H3,$H3 + vpaddq $T4,$H4,$H4 + vpaddq $T0,$H0,$H0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$H2,$H2 + + ################################################################ + # lazy reduction + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced + vmovd %x#$H1,`4*1-48-64`($ctx) + vmovd %x#$H2,`4*2-48-64`($ctx) + vmovd %x#$H3,`4*3-48-64`($ctx) + vmovd %x#$H4,`4*4-48-64`($ctx) +___ +$code.=<<___ if ($win64); + vmovdqa 0x50(%r11),%xmm6 + vmovdqa 0x60(%r11),%xmm7 + vmovdqa 0x70(%r11),%xmm8 + vmovdqa 0x80(%r11),%xmm9 + vmovdqa 0x90(%r11),%xmm10 + vmovdqa 0xa0(%r11),%xmm11 + vmovdqa 0xb0(%r11),%xmm12 + vmovdqa 0xc0(%r11),%xmm13 + vmovdqa 0xd0(%r11),%xmm14 + vmovdqa 0xe0(%r11),%xmm15 + lea 0xf8(%r11),%rsp +.Ldo_avx2_epilogue: +___ +$code.=<<___ if (!$win64); + lea 8(%r11),%rsp +.cfi_def_cfa %rsp,8 +___ +$code.=<<___; + vzeroupper + ret +.cfi_endproc +.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 +___ +####################################################################### +if ($avx>2) { +# On entry we have input length divisible by 64. But since inner loop +# processes 128 bytes per iteration, cases when length is not divisible +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this +# reason stack layout is kept identical to poly1305_blocks_avx2. If not +# for this tail, we wouldn't have to even allocate stack frame... + +my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); +my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); +my $PADBIT="%zmm30"; + +map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain +map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); +map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); +map(s/%y/%z/,($MASK)); + +$code.=<<___; +.type poly1305_blocks_avx512,\@function,4 +.align 32 +poly1305_blocks_avx512: +.cfi_startproc +.Lblocks_avx512: + mov \$15,%eax + kmovw %eax,%k2 +___ +$code.=<<___ if (!$win64); + lea -8(%rsp),%r11 +.cfi_def_cfa %r11,16 + sub \$0x128,%rsp +___ +$code.=<<___ if ($win64); + lea -0xf8(%rsp),%r11 + sub \$0x1c8,%rsp + vmovdqa %xmm6,0x50(%r11) + vmovdqa %xmm7,0x60(%r11) + vmovdqa %xmm8,0x70(%r11) + vmovdqa %xmm9,0x80(%r11) + vmovdqa %xmm10,0x90(%r11) + vmovdqa %xmm11,0xa0(%r11) + vmovdqa %xmm12,0xb0(%r11) + vmovdqa %xmm13,0xc0(%r11) + vmovdqa %xmm14,0xd0(%r11) + vmovdqa %xmm15,0xe0(%r11) +.Ldo_avx512_body: +___ +$code.=<<___; + lea .Lconst(%rip),%rcx + lea 48+64($ctx),$ctx # size optimization + vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 + + # expand pre-calculated table + vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} + and \$-512,%rsp + vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} + mov \$0x20,%rax + vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} + vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} + vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} + vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} + vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} + vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} + vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4} + vpermd $D0,$T2,$R0 # 00003412 -> 14243444 + vpbroadcastq 64(%rcx),$MASK # .Lmask26 + vpermd $D1,$T2,$R1 + vpermd $T0,$T2,$S1 + vpermd $D2,$T2,$R2 + vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 + vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 + vpermd $T1,$T2,$S2 + vmovdqu64 $R1,0x00(%rsp,%rax){%k2} + vpsrlq \$32,$R1,$T1 + vpermd $D3,$T2,$R3 + vmovdqa64 $S1,0x40(%rsp){%k2} + vpermd $T3,$T2,$S3 + vpermd $D4,$T2,$R4 + vmovdqu64 $R2,0x40(%rsp,%rax){%k2} + vpermd $T4,$T2,$S4 + vmovdqa64 $S2,0x80(%rsp){%k2} + vmovdqu64 $R3,0x80(%rsp,%rax){%k2} + vmovdqa64 $S3,0xc0(%rsp){%k2} + vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} + vmovdqa64 $S4,0x100(%rsp){%k2} + + ################################################################ + # calculate 5th through 8th powers of the key + # + # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 + # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 + # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 + # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 + # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 + + vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 + vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 + vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 + vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 + vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 + vpsrlq \$32,$R2,$T2 + + vpmuludq $T1,$S4,$M0 + vpmuludq $T1,$R0,$M1 + vpmuludq $T1,$R1,$M2 + vpmuludq $T1,$R2,$M3 + vpmuludq $T1,$R3,$M4 + vpsrlq \$32,$R3,$T3 + vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 + vpaddq $M1,$D1,$D1 # d1 += r1'*r0 + vpaddq $M2,$D2,$D2 # d2 += r1'*r1 + vpaddq $M3,$D3,$D3 # d3 += r1'*r2 + vpaddq $M4,$D4,$D4 # d4 += r1'*r3 + + vpmuludq $T2,$S3,$M0 + vpmuludq $T2,$S4,$M1 + vpmuludq $T2,$R1,$M3 + vpmuludq $T2,$R2,$M4 + vpmuludq $T2,$R0,$M2 + vpsrlq \$32,$R4,$T4 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 + vpaddq $M3,$D3,$D3 # d3 += r2'*r1 + vpaddq $M4,$D4,$D4 # d4 += r2'*r2 + vpaddq $M2,$D2,$D2 # d2 += r2'*r0 + + vpmuludq $T3,$S2,$M0 + vpmuludq $T3,$R0,$M3 + vpmuludq $T3,$R1,$M4 + vpmuludq $T3,$S3,$M1 + vpmuludq $T3,$S4,$M2 + vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 + vpaddq $M3,$D3,$D3 # d3 += r3'*r0 + vpaddq $M4,$D4,$D4 # d4 += r3'*r1 + vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 + vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 + + vpmuludq $T4,$S4,$M3 + vpmuludq $T4,$R0,$M4 + vpmuludq $T4,$S1,$M0 + vpmuludq $T4,$S2,$M1 + vpmuludq $T4,$S3,$M2 + vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 + vpaddq $M4,$D4,$D4 # d4 += r2'*r0 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 + vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 + + ################################################################ + # load input + vmovdqu64 16*0($inp),%z#$T3 + vmovdqu64 16*4($inp),%z#$T4 + lea 16*8($inp),$inp + + ################################################################ + # lazy reduction + + vpsrlq \$26,$D3,$M3 + vpandq $MASK,$D3,$D3 + vpaddq $M3,$D4,$D4 # d3 -> d4 + + vpsrlq \$26,$D0,$M0 + vpandq $MASK,$D0,$D0 + vpaddq $M0,$D1,$D1 # d0 -> d1 + + vpsrlq \$26,$D4,$M4 + vpandq $MASK,$D4,$D4 + + vpsrlq \$26,$D1,$M1 + vpandq $MASK,$D1,$D1 + vpaddq $M1,$D2,$D2 # d1 -> d2 + + vpaddq $M4,$D0,$D0 + vpsllq \$2,$M4,$M4 + vpaddq $M4,$D0,$D0 # d4 -> d0 + + vpsrlq \$26,$D2,$M2 + vpandq $MASK,$D2,$D2 + vpaddq $M2,$D3,$D3 # d2 -> d3 + + vpsrlq \$26,$D0,$M0 + vpandq $MASK,$D0,$D0 + vpaddq $M0,$D1,$D1 # d0 -> d1 + + vpsrlq \$26,$D3,$M3 + vpandq $MASK,$D3,$D3 + vpaddq $M3,$D4,$D4 # d3 -> d4 + + ################################################################ + # at this point we have 14243444 in $R0-$S4 and 05060708 in + # $D0-$D4, ... + + vpunpcklqdq $T4,$T3,$T0 # transpose input + vpunpckhqdq $T4,$T3,$T4 + + # ... since input 64-bit lanes are ordered as 73625140, we could + # "vperm" it to 76543210 (here and in each loop iteration), *or* + # we could just flow along, hence the goal for $R0-$S4 is + # 1858286838784888 ... + + vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: + mov \$0x7777,%eax + kmovw %eax,%k1 + + vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- + vpermd $R1,$M0,$R1 + vpermd $R2,$M0,$R2 + vpermd $R3,$M0,$R3 + vpermd $R4,$M0,$R4 + + vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 + vpermd $D1,$M0,${R1}{%k1} + vpermd $D2,$M0,${R2}{%k1} + vpermd $D3,$M0,${R3}{%k1} + vpermd $D4,$M0,${R4}{%k1} + + vpslld \$2,$R1,$S1 # *5 + vpslld \$2,$R2,$S2 + vpslld \$2,$R3,$S3 + vpslld \$2,$R4,$S4 + vpaddd $R1,$S1,$S1 + vpaddd $R2,$S2,$S2 + vpaddd $R3,$S3,$S3 + vpaddd $R4,$S4,$S4 + + vpbroadcastq 32(%rcx),$PADBIT # .L129 + + vpsrlq \$52,$T0,$T2 # splat input + vpsllq \$12,$T4,$T3 + vporq $T3,$T2,$T2 + vpsrlq \$26,$T0,$T1 + vpsrlq \$14,$T4,$T3 + vpsrlq \$40,$T4,$T4 # 4 + vpandq $MASK,$T2,$T2 # 2 + vpandq $MASK,$T0,$T0 # 0 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always + + vpaddq $H2,$T2,$H2 # accumulate input + sub \$192,$len + jbe .Ltail_avx512 + jmp .Loop_avx512 + +.align 32 +.Loop_avx512: + ################################################################ + # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 + # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 + # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 + # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 + # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 + # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 + # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 + # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 + # \________/\___________/ + ################################################################ + #vpaddq $H2,$T2,$H2 # accumulate input + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # however, as h2 is "chronologically" first one available pull + # corresponding operations up, so it's + # + # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 + # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 + # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 + # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 + # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 + + vpmuludq $H2,$R1,$D3 # d3 = h2*r1 + vpaddq $H0,$T0,$H0 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpandq $MASK,$T1,$T1 # 1 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T3,$T3 # 3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpmuludq $H2,$R0,$D2 # d2 = h2*r0 + vpaddq $H1,$T1,$H1 # accumulate input + vpaddq $H3,$T3,$H3 + vpaddq $H4,$T4,$H4 + + vmovdqu64 16*0($inp),$T3 # load input + vmovdqu64 16*4($inp),$T4 + lea 16*8($inp),$inp + vpmuludq $H0,$R3,$M3 + vpmuludq $H0,$R4,$M4 + vpmuludq $H0,$R0,$M0 + vpmuludq $H0,$R1,$M1 + vpaddq $M3,$D3,$D3 # d3 += h0*r3 + vpaddq $M4,$D4,$D4 # d4 += h0*r4 + vpaddq $M0,$D0,$D0 # d0 += h0*r0 + vpaddq $M1,$D1,$D1 # d1 += h0*r1 + + vpmuludq $H1,$R2,$M3 + vpmuludq $H1,$R3,$M4 + vpmuludq $H1,$S4,$M0 + vpmuludq $H0,$R2,$M2 + vpaddq $M3,$D3,$D3 # d3 += h1*r2 + vpaddq $M4,$D4,$D4 # d4 += h1*r3 + vpaddq $M0,$D0,$D0 # d0 += h1*s4 + vpaddq $M2,$D2,$D2 # d2 += h0*r2 + + vpunpcklqdq $T4,$T3,$T0 # transpose input + vpunpckhqdq $T4,$T3,$T4 + + vpmuludq $H3,$R0,$M3 + vpmuludq $H3,$R1,$M4 + vpmuludq $H1,$R0,$M1 + vpmuludq $H1,$R1,$M2 + vpaddq $M3,$D3,$D3 # d3 += h3*r0 + vpaddq $M4,$D4,$D4 # d4 += h3*r1 + vpaddq $M1,$D1,$D1 # d1 += h1*r0 + vpaddq $M2,$D2,$D2 # d2 += h1*r1 + + vpmuludq $H4,$S4,$M3 + vpmuludq $H4,$R0,$M4 + vpmuludq $H3,$S2,$M0 + vpmuludq $H3,$S3,$M1 + vpaddq $M3,$D3,$D3 # d3 += h4*s4 + vpmuludq $H3,$S4,$M2 + vpaddq $M4,$D4,$D4 # d4 += h4*r0 + vpaddq $M0,$D0,$D0 # d0 += h3*s2 + vpaddq $M1,$D1,$D1 # d1 += h3*s3 + vpaddq $M2,$D2,$D2 # d2 += h3*s4 + + vpmuludq $H4,$S1,$M0 + vpmuludq $H4,$S2,$M1 + vpmuludq $H4,$S3,$M2 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 + vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 + vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 + + ################################################################ + # lazy reduction (interleaved with input splat) + + vpsrlq \$52,$T0,$T2 # splat input + vpsllq \$12,$T4,$T3 + + vpsrlq \$26,$D3,$H3 + vpandq $MASK,$D3,$D3 + vpaddq $H3,$D4,$H4 # h3 -> h4 + + vporq $T3,$T2,$T2 + + vpsrlq \$26,$H0,$D0 + vpandq $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpandq $MASK,$T2,$T2 # 2 + + vpsrlq \$26,$H4,$D4 + vpandq $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpandq $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpaddq $T2,$H2,$H2 # modulo-scheduled + vpsrlq \$26,$T0,$T1 + + vpsrlq \$26,$H2,$D2 + vpandq $MASK,$H2,$H2 + vpaddq $D2,$D3,$H3 # h2 -> h3 + + vpsrlq \$14,$T4,$T3 + + vpsrlq \$26,$H0,$D0 + vpandq $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$40,$T4,$T4 # 4 + + vpsrlq \$26,$H3,$D3 + vpandq $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpandq $MASK,$T0,$T0 # 0 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always + + sub \$128,$len + ja .Loop_avx512 + +.Ltail_avx512: + ################################################################ + # while above multiplications were by r^8 in all lanes, in last + # iteration we multiply least significant lane by r^8 and most + # significant one by r, that's why table gets shifted... + + vpsrlq \$32,$R0,$R0 # 0105020603070408 + vpsrlq \$32,$R1,$R1 + vpsrlq \$32,$R2,$R2 + vpsrlq \$32,$S3,$S3 + vpsrlq \$32,$S4,$S4 + vpsrlq \$32,$R3,$R3 + vpsrlq \$32,$R4,$R4 + vpsrlq \$32,$S1,$S1 + vpsrlq \$32,$S2,$S2 + + ################################################################ + # load either next or last 64 byte of input + lea ($inp,$len),$inp + + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + + vpmuludq $H2,$R1,$D3 # d3 = h2*r1 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T1,$T1 # 1 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vpandq $MASK,$T3,$T3 # 3 + vpmuludq $H2,$R0,$D2 # d2 = h2*r0 + vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpaddq $H1,$T1,$H1 # accumulate input + vpaddq $H3,$T3,$H3 + vpaddq $H4,$T4,$H4 + + vmovdqu 16*0($inp),%x#$T0 + vpmuludq $H0,$R3,$M3 + vpmuludq $H0,$R4,$M4 + vpmuludq $H0,$R0,$M0 + vpmuludq $H0,$R1,$M1 + vpaddq $M3,$D3,$D3 # d3 += h0*r3 + vpaddq $M4,$D4,$D4 # d4 += h0*r4 + vpaddq $M0,$D0,$D0 # d0 += h0*r0 + vpaddq $M1,$D1,$D1 # d1 += h0*r1 + + vmovdqu 16*1($inp),%x#$T1 + vpmuludq $H1,$R2,$M3 + vpmuludq $H1,$R3,$M4 + vpmuludq $H1,$S4,$M0 + vpmuludq $H0,$R2,$M2 + vpaddq $M3,$D3,$D3 # d3 += h1*r2 + vpaddq $M4,$D4,$D4 # d4 += h1*r3 + vpaddq $M0,$D0,$D0 # d0 += h1*s4 + vpaddq $M2,$D2,$D2 # d2 += h0*r2 + + vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 + vpmuludq $H3,$R0,$M3 + vpmuludq $H3,$R1,$M4 + vpmuludq $H1,$R0,$M1 + vpmuludq $H1,$R1,$M2 + vpaddq $M3,$D3,$D3 # d3 += h3*r0 + vpaddq $M4,$D4,$D4 # d4 += h3*r1 + vpaddq $M1,$D1,$D1 # d1 += h1*r0 + vpaddq $M2,$D2,$D2 # d2 += h1*r1 + + vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 + vpmuludq $H4,$S4,$M3 + vpmuludq $H4,$R0,$M4 + vpmuludq $H3,$S2,$M0 + vpmuludq $H3,$S3,$M1 + vpmuludq $H3,$S4,$M2 + vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 + vpaddq $M4,$D4,$D4 # d4 += h4*r0 + vpaddq $M0,$D0,$D0 # d0 += h3*s2 + vpaddq $M1,$D1,$D1 # d1 += h3*s3 + vpaddq $M2,$D2,$D2 # d2 += h3*s4 + + vpmuludq $H4,$S1,$M0 + vpmuludq $H4,$S2,$M1 + vpmuludq $H4,$S3,$M2 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 + vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 + vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 + + ################################################################ + # horizontal addition + + mov \$1,%eax + vpermq \$0xb1,$H3,$D3 + vpermq \$0xb1,$D4,$H4 + vpermq \$0xb1,$H0,$D0 + vpermq \$0xb1,$H1,$D1 + vpermq \$0xb1,$H2,$D2 + vpaddq $D3,$H3,$H3 + vpaddq $D4,$H4,$H4 + vpaddq $D0,$H0,$H0 + vpaddq $D1,$H1,$H1 + vpaddq $D2,$H2,$H2 + + kmovw %eax,%k3 + vpermq \$0x2,$H3,$D3 + vpermq \$0x2,$H4,$D4 + vpermq \$0x2,$H0,$D0 + vpermq \$0x2,$H1,$D1 + vpermq \$0x2,$H2,$D2 + vpaddq $D3,$H3,$H3 + vpaddq $D4,$H4,$H4 + vpaddq $D0,$H0,$H0 + vpaddq $D1,$H1,$H1 + vpaddq $D2,$H2,$H2 + + vextracti64x4 \$0x1,$H3,%y#$D3 + vextracti64x4 \$0x1,$H4,%y#$D4 + vextracti64x4 \$0x1,$H0,%y#$D0 + vextracti64x4 \$0x1,$H1,%y#$D1 + vextracti64x4 \$0x1,$H2,%y#$D2 + vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case + vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 + vpaddq $D0,$H0,${H0}{%k3}{z} + vpaddq $D1,$H1,${H1}{%k3}{z} + vpaddq $D2,$H2,${H2}{%k3}{z} +___ +map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); +map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); +$code.=<<___; + ################################################################ + # lazy reduction (interleaved with input splat) + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpunpcklqdq $T3,$T2,$T2 # 2:3 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpsrlq \$30,$T2,$T3 + vpsrlq \$4,$T2,$T2 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpsrlq \$26,$T0,$T1 + vpsrlq \$40,$T4,$T4 # 4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T0,$T0 # 0 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 + vpand $MASK,$T1,$T1 # 1 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + vpaddq $D3,$H4,$H4 # h3 -> h4 + + lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 + add \$64,$len + jnz .Ltail_avx2 + + vpsubq $T2,$H2,$H2 # undo input accumulation + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced + vmovd %x#$H1,`4*1-48-64`($ctx) + vmovd %x#$H2,`4*2-48-64`($ctx) + vmovd %x#$H3,`4*3-48-64`($ctx) + vmovd %x#$H4,`4*4-48-64`($ctx) + vzeroall +___ +$code.=<<___ if ($win64); + movdqa 0x50(%r11),%xmm6 + movdqa 0x60(%r11),%xmm7 + movdqa 0x70(%r11),%xmm8 + movdqa 0x80(%r11),%xmm9 + movdqa 0x90(%r11),%xmm10 + movdqa 0xa0(%r11),%xmm11 + movdqa 0xb0(%r11),%xmm12 + movdqa 0xc0(%r11),%xmm13 + movdqa 0xd0(%r11),%xmm14 + movdqa 0xe0(%r11),%xmm15 + lea 0xf8(%r11),%rsp +.Ldo_avx512_epilogue: +___ +$code.=<<___ if (!$win64); + lea 8(%r11),%rsp +.cfi_def_cfa %rsp,8 +___ +$code.=<<___; + ret +.cfi_endproc +.size poly1305_blocks_avx512,.-poly1305_blocks_avx512 +___ +if ($avx>3) { +######################################################################## +# VPMADD52 version using 2^44 radix. +# +# One can argue that base 2^52 would be more natural. Well, even though +# some operations would be more natural, one has to recognize couple of +# things. Base 2^52 doesn't provide advantage over base 2^44 if you look +# at amount of multiply-n-accumulate operations. Secondly, it makes it +# impossible to pre-compute multiples of 5 [referred to as s[]/sN in +# reference implementations], which means that more such operations +# would have to be performed in inner loop, which in turn makes critical +# path longer. In other words, even though base 2^44 reduction might +# look less elegant, overall critical path is actually shorter... + +######################################################################## +# Layout of opaque area is following. +# +# unsigned __int64 h[3]; # current hash value base 2^44 +# unsigned __int64 s[2]; # key value*20 base 2^44 +# unsigned __int64 r[3]; # key value base 2^44 +# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; +# # r^n positions reflect +# # placement in register, not +# # memory, R[3] is R[1]*20 + +$code.=<<___; +.type poly1305_init_base2_44,\@function,3 +.align 32 +poly1305_init_base2_44: +.cfi_startproc + xor %rax,%rax + mov %rax,0($ctx) # initialize hash value + mov %rax,8($ctx) + mov %rax,16($ctx) + +.Linit_base2_44: + lea poly1305_blocks_vpmadd52(%rip),%r10 + lea poly1305_emit_base2_44(%rip),%r11 + + mov \$0x0ffffffc0fffffff,%rax + mov \$0x0ffffffc0ffffffc,%rcx + and 0($inp),%rax + mov \$0x00000fffffffffff,%r8 + and 8($inp),%rcx + mov \$0x00000fffffffffff,%r9 + and %rax,%r8 + shrd \$44,%rcx,%rax + mov %r8,40($ctx) # r0 + and %r9,%rax + shr \$24,%rcx + mov %rax,48($ctx) # r1 + lea (%rax,%rax,4),%rax # *5 + mov %rcx,56($ctx) # r2 + shl \$2,%rax # magic <<2 + lea (%rcx,%rcx,4),%rcx # *5 + shl \$2,%rcx # magic <<2 + mov %rax,24($ctx) # s1 + mov %rcx,32($ctx) # s2 + movq \$-1,64($ctx) # write impossible value +___ +$code.=<<___ if ($flavour !~ /elf32/); + mov %r10,0(%rdx) + mov %r11,8(%rdx) +___ +$code.=<<___ if ($flavour =~ /elf32/); + mov %r10d,0(%rdx) + mov %r11d,4(%rdx) +___ +$code.=<<___; + mov \$1,%eax + ret +.cfi_endproc +.size poly1305_init_base2_44,.-poly1305_init_base2_44 +___ +{ +my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); +my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); +my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52,\@function,4 +.align 32 +poly1305_blocks_vpmadd52: +.cfi_startproc + shr \$4,$len + jz .Lno_data_vpmadd52 # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + # if powers of the key are not calculated yet, process up to 3 + # blocks with this single-block subroutine, otherwise ensure that + # length is divisible by 2 blocks and pass the rest down to next + # subroutine... + + mov \$3,%rax + mov \$1,%r10 + cmp \$4,$len # is input long + cmovae %r10,%rax + test %r8,%r8 # is power value impossible? + cmovns %r10,%rax + + and $len,%rax # is input of favourable length? + jz .Lblocks_vpmadd52_4x + + sub %rax,$len + mov \$7,%r10d + mov \$1,%r11d + kmovw %r10d,%k7 + lea .L2_44_inp_permd(%rip),%r10 + kmovw %r11d,%k1 + + vmovq $padbit,%x#$PAD + vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd + vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift + vpermq \$0xcf,$PAD,$PAD + vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask + + vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value + vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys + vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} + vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} + + vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt + vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft + + jmp .Loop_vpmadd52 + +.align 32 +.Loop_vpmadd52: + vmovdqu32 0($inp),%x#$T0 # load input as ----3210 + lea 16($inp),$inp + + vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 + vpsrlvq $inp_shift,$T0,$T0 + vpandq $reduc_mask,$T0,$T0 + vporq $PAD,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo # accumulate input + + vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value + vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} + vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} + + vpxord $Dlo,$Dlo,$Dlo + vpxord $Dhi,$Dhi,$Dhi + + vpmadd52luq $r2r1r0,$H0,$Dlo + vpmadd52huq $r2r1r0,$H0,$Dhi + + vpmadd52luq $r1r0s2,$H1,$Dlo + vpmadd52huq $r1r0s2,$H1,$Dhi + + vpmadd52luq $r0s2s1,$H2,$Dlo + vpmadd52huq $r0s2s1,$H2,$Dhi + + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword + vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword + vpandq $reduc_mask,$Dlo,$Dlo + + vpaddq $T0,$Dhi,$Dhi + + vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword + + vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) + + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word + vpandq $reduc_mask,$Dlo,$Dlo + + vpermq \$0b10010011,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo + + vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} + + vpaddq $T0,$Dlo,$Dlo + vpsllq \$2,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo + + dec %rax # len-=16 + jnz .Loop_vpmadd52 + + vmovdqu64 $Dlo,0($ctx){%k7} # store hash value + + test $len,$len + jnz .Lblocks_vpmadd52_4x + +.Lno_data_vpmadd52: + ret +.cfi_endproc +.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 +___ +} +{ +######################################################################## +# As implied by its name 4x subroutine processes 4 blocks in parallel +# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power +# and is handled in 256-bit %ymm registers. + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_4x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_4x: +.cfi_startproc + shr \$4,$len + jz .Lno_data_vpmadd52_4x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + +.Lblocks_vpmadd52_4x: + vpbroadcastq $padbit,$PAD + + vmovdqa64 .Lx_mask44(%rip),$mask44 + mov \$5,%eax + vmovdqa64 .Lx_mask42(%rip),$mask42 + kmovw %eax,%k1 # used in 2x path + + test %r8,%r8 # is power value impossible? + js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? + jnz .Lblocks_vpmadd52_2x_do + +.Lblocks_vpmadd52_4x_do: + vpbroadcastq 64($ctx),$R0 # load 4th power of the key + vpbroadcastq 96($ctx),$R1 + vpbroadcastq 128($ctx),$R2 + vpbroadcastq 160($ctx),$S1 + +.Lblocks_vpmadd52_4x_key_loaded: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + test \$7,$len # is len 8*n? + jz .Lblocks_vpmadd52_8x + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 3-1-2-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$4,$len + jz .Ltail_vpmadd52_4x + jmp .Loop_vpmadd52_4x + ud2 + +.align 32 +.Linit_vpmadd52: + vmovq 24($ctx),%x#$S1 # load key + vmovq 56($ctx),%x#$H2 + vmovq 32($ctx),%x#$S2 + vmovq 40($ctx),%x#$R0 + vmovq 48($ctx),%x#$R1 + + vmovdqa $R0,$H0 + vmovdqa $R1,$H1 + vmovdqa $H2,$R2 + + mov \$2,%eax + +.Lmul_init_vpmadd52: + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + dec %eax + jz .Ldone_init_vpmadd52 + + vpunpcklqdq $R1,$H1,$R1 # 1,2 + vpbroadcastq %x#$H1,%x#$H1 # 2,2 + vpunpcklqdq $R2,$H2,$R2 + vpbroadcastq %x#$H2,%x#$H2 + vpunpcklqdq $R0,$H0,$R0 + vpbroadcastq %x#$H0,%x#$H0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R1,$S1,$S1 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S1,$S1 + vpsllq \$2,$S2,$S2 + + jmp .Lmul_init_vpmadd52 + ud2 + +.align 32 +.Ldone_init_vpmadd52: + vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 + vinserti128 \$1,%x#$R2,$H2,$R2 + vinserti128 \$1,%x#$R0,$H0,$R0 + + vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 + vpermq \$0b11011000,$R2,$R2 + vpermq \$0b11011000,$R0,$R0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpaddq $R1,$S1,$S1 + vpsllq \$2,$S1,$S1 + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? + jnz .Ldone_init_vpmadd52_2x + + vmovdqu64 $R0,64($ctx) # save key powers + vpbroadcastq %x#$R0,$R0 # broadcast 4th power + vmovdqu64 $R1,96($ctx) + vpbroadcastq %x#$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpbroadcastq %x#$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpbroadcastq %x#$S1,$S1 + + jmp .Lblocks_vpmadd52_4x_key_loaded + ud2 + +.align 32 +.Ldone_init_vpmadd52_2x: + vmovdqu64 $R0,64($ctx) # save key powers + vpsrldq \$8,$R0,$R0 # 0-1-0-2 + vmovdqu64 $R1,96($ctx) + vpsrldq \$8,$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpsrldq \$8,$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpsrldq \$8,$S1,$S1 + jmp .Lblocks_vpmadd52_2x_key_loaded + ud2 + +.align 32 +.Lblocks_vpmadd52_2x_do: + vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers + vmovdqu64 160+8($ctx),${S1}{%k1}{z} + vmovdqu64 64+8($ctx),${R0}{%k1}{z} + vmovdqu64 96+8($ctx),${R1}{%k1}{z} + +.Lblocks_vpmadd52_2x_key_loaded: + vmovdqu64 16*0($inp),$T2 # load data + vpxorq $T3,$T3,$T3 + lea 16*2($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as x-1-x-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + jmp .Ltail_vpmadd52_2x + ud2 + +.align 32 +.Loop_vpmadd52_4x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$4,$len # len-=64 + jnz .Loop_vpmadd52_4x + +.Ltail_vpmadd52_4x: + vmovdqu64 128($ctx),$R2 # load all key powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + +.Ltail_vpmadd52_2x: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + # at this point $len is + # either 4*n+2 or 0... + sub \$2,$len # len-=32 + ja .Lblocks_vpmadd52_4x_do + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_4x: + ret +.cfi_endproc +.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x +___ +} +{ +######################################################################## +# As implied by its name 8x subroutine processes 8 blocks in parallel... +# This is intermediate version, as it's used only in cases when input +# length is either 8*n, 8*n+1 or 8*n+2... + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); +my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_8x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_8x: +.cfi_startproc + shr \$4,$len + jz .Lno_data_vpmadd52_8x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + vmovdqa64 .Lx_mask44(%rip),$mask44 + vmovdqa64 .Lx_mask42(%rip),$mask42 + + test %r8,%r8 # is power value impossible? + js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + +.Lblocks_vpmadd52_8x: + ################################################################ + # fist we calculate more key powers + + vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + vpbroadcastq %x#$R2,$RR2 # broadcast 4th power + vpbroadcastq %x#$R0,$RR0 + vpbroadcastq %x#$R1,$RR1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $RR2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $RR2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $RR2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $RR2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $RR2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $RR2,$R0,$D2hi + + vpmadd52luq $RR0,$R0,$D0lo + vpmadd52huq $RR0,$R0,$D0hi + vpmadd52luq $RR0,$R1,$D1lo + vpmadd52huq $RR0,$R1,$D1hi + vpmadd52luq $RR0,$R2,$D2lo + vpmadd52huq $RR0,$R2,$D2hi + + vpmadd52luq $RR1,$S2,$D0lo + vpmadd52huq $RR1,$S2,$D0hi + vpmadd52luq $RR1,$R0,$D1lo + vpmadd52huq $RR1,$R0,$D1hi + vpmadd52luq $RR1,$R1,$D2lo + vpmadd52huq $RR1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$RR0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$RR1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$RR2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + + vpsrlq \$44,$RR0,$tmp # additional step + vpandq $mask44,$RR0,$RR0 + + vpaddq $tmp,$RR1,$RR1 + + ################################################################ + # At this point Rx holds 1324 powers, RRx - 5768, and the goal + # is 15263748, which reflects how data is loaded... + + vpunpcklqdq $R2,$RR2,$T2 # 3748 + vpunpckhqdq $R2,$RR2,$R2 # 1526 + vpunpcklqdq $R0,$RR0,$T0 + vpunpckhqdq $R0,$RR0,$R0 + vpunpcklqdq $R1,$RR1,$T1 + vpunpckhqdq $R1,$RR1,$R1 +___ +######## switch to %zmm +map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); +map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); + +$code.=<<___; + vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 + vshufi64x2 \$0x44,$R0,$T0,$RR0 + vshufi64x2 \$0x44,$R1,$T1,$RR1 + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + + vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 + vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 + vpaddq $RR2,$SS2,$SS2 + vpaddq $RR1,$SS1,$SS1 + vpsllq \$2,$SS2,$SS2 + vpsllq \$2,$SS1,$SS1 + + vpbroadcastq $padbit,$PAD + vpbroadcastq %x#$mask44,$mask44 + vpbroadcastq %x#$mask42,$mask42 + + vpbroadcastq %x#$SS1,$S1 # broadcast 8th power + vpbroadcastq %x#$SS2,$S2 + vpbroadcastq %x#$RR0,$R0 + vpbroadcastq %x#$RR1,$R1 + vpbroadcastq %x#$RR2,$R2 + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 73625140 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$8,$len + jz .Ltail_vpmadd52_8x + jmp .Loop_vpmadd52_8x + +.align 32 +.Loop_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$8,$len # len-=128 + jnz .Loop_vpmadd52_8x + +.Ltail_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$SS1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$SS1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$SS2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$SS2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$RR0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$RR0,$D2hi + + vpmadd52luq $H0,$RR0,$D0lo + vpmadd52huq $H0,$RR0,$D0hi + vpmadd52luq $H0,$RR1,$D1lo + vpmadd52huq $H0,$RR1,$D1hi + vpmadd52luq $H0,$RR2,$D2lo + vpmadd52huq $H0,$RR2,$D2hi + + vpmadd52luq $H1,$SS2,$D0lo + vpmadd52huq $H1,$SS2,$D0hi + vpmadd52luq $H1,$RR0,$D1lo + vpmadd52huq $H1,$RR0,$D1hi + vpmadd52luq $H1,$RR1,$D2lo + vpmadd52huq $H1,$RR1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vextracti64x4 \$1,$D0lo,%y#$T0 + vextracti64x4 \$1,$D0hi,%y#$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vextracti64x4 \$1,$D1lo,%y#$T1 + vextracti64x4 \$1,$D1hi,%y#$H1 + vextracti64x4 \$1,$D2lo,%y#$T2 + vextracti64x4 \$1,$D2hi,%y#$H2 +___ +######## switch back to %ymm +map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); + +$code.=<<___; + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + ################################################################ + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_8x: + ret +.cfi_endproc +.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x +___ +} +$code.=<<___; +.type poly1305_emit_base2_44,\@function,3 +.align 32 +poly1305_emit_base2_44: +.cfi_startproc + mov 0($ctx),%r8 # load hash value + mov 8($ctx),%r9 + mov 16($ctx),%r10 + + mov %r9,%rax + shr \$20,%r9 + shl \$44,%rax + mov %r10,%rcx + shr \$40,%r10 + shl \$24,%rcx + + add %rax,%r8 + adc %rcx,%r9 + adc \$0,%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + ret +.cfi_endproc +.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 +___ +} } } +$code.=<<___; +.align 64 +.Lconst: +.Lmask24: +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 +.L129: +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 +.Lmask26: +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 +.Lpermd_avx2: +.long 2,2,2,3,2,0,2,1 +.Lpermd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +.L2_44_inp_permd: +.long 0,1,1,2,2,3,7,7 +.L2_44_inp_shift: +.quad 0,12,24,64 +.L2_44_mask: +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff +.L2_44_shift_rgt: +.quad 44,44,42,64 +.L2_44_shift_lft: +.quad 8,8,10,64 + +.align 64 +.Lx_mask44: +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.Lx_mask42: +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +___ +} +$code.=<<___; +.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +.align 16 +___ + +{ # chacha20-poly1305 helpers +my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order +$code.=<<___; +.globl xor128_encrypt_n_pad +.type xor128_encrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_encrypt_n_pad: +.cfi_startproc + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_enc + nop +.Loop_enc_xmm: + movdqu ($inp,$otp),%xmm0 + pxor ($otp),%xmm0 + movdqu %xmm0,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_enc_xmm + + and \$15,%r10 # len % 16 + jz .Ldone_enc + +.Ltail_enc: + mov \$16,$len + sub %r10,$len + xor %eax,%eax +.Loop_enc_byte: + mov ($inp,$otp),%al + xor ($otp),%al + mov %al,($out,$otp) + mov %al,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_enc_byte + + xor %eax,%eax +.Loop_enc_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_enc_pad + +.Ldone_enc: + mov $otp,%rax + ret +.cfi_endproc +.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad + +.globl xor128_decrypt_n_pad +.type xor128_decrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_decrypt_n_pad: +.cfi_startproc + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_dec + nop +.Loop_dec_xmm: + movdqu ($inp,$otp),%xmm0 + movdqa ($otp),%xmm1 + pxor %xmm0,%xmm1 + movdqu %xmm1,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_dec_xmm + + pxor %xmm1,%xmm1 + and \$15,%r10 # len % 16 + jz .Ldone_dec + +.Ltail_dec: + mov \$16,$len + sub %r10,$len + xor %eax,%eax + xor %r11,%r11 +.Loop_dec_byte: + mov ($inp,$otp),%r11b + mov ($otp),%al + xor %r11b,%al + mov %al,($out,$otp) + mov %r11b,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_dec_byte + + xor %eax,%eax +.Loop_dec_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_dec_pad + +.Ldone_dec: + mov $otp,%rax + ret +.cfi_endproc +.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad +___ +} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lcommon_seh_tail + + lea 48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R14 + + jmp .Lcommon_seh_tail +.size se_handler,.-se_handler + +.type avx_handler,\@abi-omnipotent +.align 16 +avx_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 208($context),%rax # pull context->R11 + + lea 0x50(%rax),%rsi + lea 0xf8(%rax),%rax + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size avx_handler,.-avx_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_poly1305_init + .rva .LSEH_end_poly1305_init + .rva .LSEH_info_poly1305_init + + .rva .LSEH_begin_poly1305_blocks + .rva .LSEH_end_poly1305_blocks + .rva .LSEH_info_poly1305_blocks + + .rva .LSEH_begin_poly1305_emit + .rva .LSEH_end_poly1305_emit + .rva .LSEH_info_poly1305_emit +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_poly1305_blocks_avx + .rva .Lbase2_64_avx + .rva .LSEH_info_poly1305_blocks_avx_1 + + .rva .Lbase2_64_avx + .rva .Leven_avx + .rva .LSEH_info_poly1305_blocks_avx_2 + + .rva .Leven_avx + .rva .LSEH_end_poly1305_blocks_avx + .rva .LSEH_info_poly1305_blocks_avx_3 + + .rva .LSEH_begin_poly1305_emit_avx + .rva .LSEH_end_poly1305_emit_avx + .rva .LSEH_info_poly1305_emit_avx +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_poly1305_blocks_avx2 + .rva .Lbase2_64_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_1 + + .rva .Lbase2_64_avx2 + .rva .Leven_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_2 + + .rva .Leven_avx2 + .rva .LSEH_end_poly1305_blocks_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_3 +___ +$code.=<<___ if ($avx>2); + .rva .LSEH_begin_poly1305_blocks_avx512 + .rva .LSEH_end_poly1305_blocks_avx512 + .rva .LSEH_info_poly1305_blocks_avx512 +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_poly1305_init: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init + +.LSEH_info_poly1305_blocks: + .byte 9,0,0,0 + .rva se_handler + .rva .Lblocks_body,.Lblocks_epilogue + +.LSEH_info_poly1305_emit: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit +___ +$code.=<<___ if ($avx); +.LSEH_info_poly1305_blocks_avx_1: + .byte 9,0,0,0 + .rva se_handler + .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx_2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx_3: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_emit_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx +___ +$code.=<<___ if ($avx>1); +.LSEH_info_poly1305_blocks_avx2_1: + .byte 9,0,0,0 + .rva se_handler + .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx2_2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx2_3: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] +___ +$code.=<<___ if ($avx>2); +.LSEH_info_poly1305_blocks_avx512: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] +___ +} + +foreach (split('\n',$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + s/%r([a-z]+)#d/%e$1/g; + s/%r([0-9]+)#d/%r$1d/g; + s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT: $!"; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/build.info b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/build.info new file mode 100644 index 000000000..4e4dcca52 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/build.info @@ -0,0 +1,25 @@ +LIBS=../../libcrypto +SOURCE[../../libcrypto]=\ + poly1305_pmeth.c \ + poly1305_ameth.c \ + poly1305.c {- $target{poly1305_asm_src} -} + +GENERATE[poly1305-sparcv9.S]=asm/poly1305-sparcv9.pl $(PERLASM_SCHEME) +INCLUDE[poly1305-sparcv9.o]=.. +GENERATE[poly1305-x86.s]=asm/poly1305-x86.pl \ + $(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR) +GENERATE[poly1305-x86_64.s]=asm/poly1305-x86_64.pl $(PERLASM_SCHEME) +GENERATE[poly1305-ppc.s]=asm/poly1305-ppc.pl $(PERLASM_SCHEME) +GENERATE[poly1305-ppcfp.s]=asm/poly1305-ppcfp.pl $(PERLASM_SCHEME) +GENERATE[poly1305-armv4.S]=asm/poly1305-armv4.pl $(PERLASM_SCHEME) +INCLUDE[poly1305-armv4.o]=.. +GENERATE[poly1305-armv8.S]=asm/poly1305-armv8.pl $(PERLASM_SCHEME) +INCLUDE[poly1305-armv8.o]=.. +GENERATE[poly1305-mips.S]=asm/poly1305-mips.pl $(PERLASM_SCHEME) +INCLUDE[poly1305-mips.o]=.. +GENERATE[poly1305-s390x.S]=asm/poly1305-s390x.pl $(PERLASM_SCHEME) + +BEGINRAW[Makefile(unix)] +{- $builddir -}/poly1305-%.S: {- $sourcedir -}/asm/poly1305-%.pl + CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@ +ENDRAW[Makefile(unix)] diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305.c b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305.c new file mode 100644 index 000000000..e7f5b92c8 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305.c @@ -0,0 +1,531 @@ +/* + * Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include <stdlib.h> +#include <string.h> +#include <openssl/crypto.h> + +#include "crypto/poly1305.h" +#include "poly1305_local.h" + +size_t Poly1305_ctx_size(void) +{ + return sizeof(struct poly1305_context); +} + +/* pick 32-bit unsigned integer in little endian order */ +static unsigned int U8TOU32(const unsigned char *p) +{ + return (((unsigned int)(p[0] & 0xff)) | + ((unsigned int)(p[1] & 0xff) << 8) | + ((unsigned int)(p[2] & 0xff) << 16) | + ((unsigned int)(p[3] & 0xff) << 24)); +} + +/* + * Implementations can be classified by amount of significant bits in + * words making up the multi-precision value, or in other words radix + * or base of numerical representation, e.g. base 2^64, base 2^32, + * base 2^26. Complementary characteristic is how wide is the result of + * multiplication of pair of digits, e.g. it would take 128 bits to + * accommodate multiplication result in base 2^64 case. These are used + * interchangeably. To describe implementation that is. But interface + * is designed to isolate this so that low-level primitives implemented + * in assembly can be self-contained/self-coherent. + */ +#ifndef POLY1305_ASM +/* + * Even though there is __int128 reference implementation targeting + * 64-bit platforms provided below, it's not obvious that it's optimal + * choice for every one of them. Depending on instruction set overall + * amount of instructions can be comparable to one in __int64 + * implementation. Amount of multiplication instructions would be lower, + * but not necessarily overall. And in out-of-order execution context, + * it is the latter that can be crucial... + * + * On related note. Poly1305 author, D. J. Bernstein, discusses and + * provides floating-point implementations of the algorithm in question. + * It made a lot of sense by the time of introduction, because most + * then-modern processors didn't have pipelined integer multiplier. + * [Not to mention that some had non-constant timing for integer + * multiplications.] Floating-point instructions on the other hand could + * be issued every cycle, which allowed to achieve better performance. + * Nowadays, with SIMD and/or out-or-order execution, shared or + * even emulated FPU, it's more complicated, and floating-point + * implementation is not necessarily optimal choice in every situation, + * rather contrary... + * + * <appro@openssl.org> + */ + +typedef unsigned int u32; + +/* + * poly1305_blocks processes a multiple of POLY1305_BLOCK_SIZE blocks + * of |inp| no longer than |len|. Behaviour for |len| not divisible by + * block size is unspecified in general case, even though in reference + * implementation the trailing chunk is simply ignored. Per algorithm + * specification, every input block, complete or last partial, is to be + * padded with a bit past most significant byte. The latter kind is then + * padded with zeros till block size. This last partial block padding + * is caller(*)'s responsibility, and because of this the last partial + * block is always processed with separate call with |len| set to + * POLY1305_BLOCK_SIZE and |padbit| to 0. In all other cases |padbit| + * should be set to 1 to perform implicit padding with 128th bit. + * poly1305_blocks does not actually check for this constraint though, + * it's caller(*)'s responsibility to comply. + * + * (*) In the context "caller" is not application code, but higher + * level Poly1305_* from this very module, so that quirks are + * handled locally. + */ +static void +poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit); + +/* + * Type-agnostic "rip-off" from constant_time.h + */ +# define CONSTANT_TIME_CARRY(a,b) ( \ + (a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1) \ + ) + +# if (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16) && \ + (defined(__SIZEOF_LONG__) && __SIZEOF_LONG__==8) + +typedef unsigned long u64; +typedef __uint128_t u128; + +typedef struct { + u64 h[3]; + u64 r[2]; +} poly1305_internal; + +/* pick 32-bit unsigned integer in little endian order */ +static u64 U8TOU64(const unsigned char *p) +{ + return (((u64)(p[0] & 0xff)) | + ((u64)(p[1] & 0xff) << 8) | + ((u64)(p[2] & 0xff) << 16) | + ((u64)(p[3] & 0xff) << 24) | + ((u64)(p[4] & 0xff) << 32) | + ((u64)(p[5] & 0xff) << 40) | + ((u64)(p[6] & 0xff) << 48) | + ((u64)(p[7] & 0xff) << 56)); +} + +/* store a 32-bit unsigned integer in little endian */ +static void U64TO8(unsigned char *p, u64 v) +{ + p[0] = (unsigned char)((v) & 0xff); + p[1] = (unsigned char)((v >> 8) & 0xff); + p[2] = (unsigned char)((v >> 16) & 0xff); + p[3] = (unsigned char)((v >> 24) & 0xff); + p[4] = (unsigned char)((v >> 32) & 0xff); + p[5] = (unsigned char)((v >> 40) & 0xff); + p[6] = (unsigned char)((v >> 48) & 0xff); + p[7] = (unsigned char)((v >> 56) & 0xff); +} + +static void poly1305_init(void *ctx, const unsigned char key[16]) +{ + poly1305_internal *st = (poly1305_internal *) ctx; + + /* h = 0 */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + st->r[0] = U8TOU64(&key[0]) & 0x0ffffffc0fffffff; + st->r[1] = U8TOU64(&key[8]) & 0x0ffffffc0ffffffc; +} + +static void +poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit) +{ + poly1305_internal *st = (poly1305_internal *)ctx; + u64 r0, r1; + u64 s1; + u64 h0, h1, h2, c; + u128 d0, d1; + + r0 = st->r[0]; + r1 = st->r[1]; + + s1 = r1 + (r1 >> 2); + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + + while (len >= POLY1305_BLOCK_SIZE) { + /* h += m[i] */ + h0 = (u64)(d0 = (u128)h0 + U8TOU64(inp + 0)); + h1 = (u64)(d1 = (u128)h1 + (d0 >> 64) + U8TOU64(inp + 8)); + /* + * padbit can be zero only when original len was + * POLY1306_BLOCK_SIZE, but we don't check + */ + h2 += (u64)(d1 >> 64) + padbit; + + /* h *= r "%" p, where "%" stands for "partial remainder" */ + d0 = ((u128)h0 * r0) + + ((u128)h1 * s1); + d1 = ((u128)h0 * r1) + + ((u128)h1 * r0) + + (h2 * s1); + h2 = (h2 * r0); + + /* last reduction step: */ + /* a) h2:h0 = h2<<128 + d1<<64 + d0 */ + h0 = (u64)d0; + h1 = (u64)(d1 += d0 >> 64); + h2 += (u64)(d1 >> 64); + /* b) (h2:h0 += (h2:h0>>130) * 5) %= 2^130 */ + c = (h2 >> 2) + (h2 & ~3UL); + h2 &= 3; + h0 += c; + h1 += (c = CONSTANT_TIME_CARRY(h0,c)); + h2 += CONSTANT_TIME_CARRY(h1,c); + /* + * Occasional overflows to 3rd bit of h2 are taken care of + * "naturally". If after this point we end up at the top of + * this loop, then the overflow bit will be accounted for + * in next iteration. If we end up in poly1305_emit, then + * comparison to modulus below will still count as "carry + * into 131st bit", so that properly reduced value will be + * picked in conditional move. + */ + + inp += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + } + + st->h[0] = h0; + st->h[1] = h1; + st->h[2] = h2; +} + +static void poly1305_emit(void *ctx, unsigned char mac[16], + const u32 nonce[4]) +{ + poly1305_internal *st = (poly1305_internal *) ctx; + u64 h0, h1, h2; + u64 g0, g1, g2; + u128 t; + u64 mask; + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + + /* compare to modulus by computing h + -p */ + g0 = (u64)(t = (u128)h0 + 5); + g1 = (u64)(t = (u128)h1 + (t >> 64)); + g2 = h2 + (u64)(t >> 64); + + /* if there was carry into 131st bit, h1:h0 = g1:g0 */ + mask = 0 - (g2 >> 2); + g0 &= mask; + g1 &= mask; + mask = ~mask; + h0 = (h0 & mask) | g0; + h1 = (h1 & mask) | g1; + + /* mac = (h + nonce) % (2^128) */ + h0 = (u64)(t = (u128)h0 + nonce[0] + ((u64)nonce[1]<<32)); + h1 = (u64)(t = (u128)h1 + nonce[2] + ((u64)nonce[3]<<32) + (t >> 64)); + + U64TO8(mac + 0, h0); + U64TO8(mac + 8, h1); +} + +# else + +# if defined(_WIN32) && !defined(__MINGW32__) +typedef unsigned __int64 u64; +# elif defined(__arch64__) +typedef unsigned long u64; +# else +typedef unsigned long long u64; +# endif + +typedef struct { + u32 h[5]; + u32 r[4]; +} poly1305_internal; + +/* store a 32-bit unsigned integer in little endian */ +static void U32TO8(unsigned char *p, unsigned int v) +{ + p[0] = (unsigned char)((v) & 0xff); + p[1] = (unsigned char)((v >> 8) & 0xff); + p[2] = (unsigned char)((v >> 16) & 0xff); + p[3] = (unsigned char)((v >> 24) & 0xff); +} + +static void poly1305_init(void *ctx, const unsigned char key[16]) +{ + poly1305_internal *st = (poly1305_internal *) ctx; + + /* h = 0 */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + st->h[3] = 0; + st->h[4] = 0; + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + st->r[0] = U8TOU32(&key[0]) & 0x0fffffff; + st->r[1] = U8TOU32(&key[4]) & 0x0ffffffc; + st->r[2] = U8TOU32(&key[8]) & 0x0ffffffc; + st->r[3] = U8TOU32(&key[12]) & 0x0ffffffc; +} + +static void +poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit) +{ + poly1305_internal *st = (poly1305_internal *)ctx; + u32 r0, r1, r2, r3; + u32 s1, s2, s3; + u32 h0, h1, h2, h3, h4, c; + u64 d0, d1, d2, d3; + + r0 = st->r[0]; + r1 = st->r[1]; + r2 = st->r[2]; + r3 = st->r[3]; + + s1 = r1 + (r1 >> 2); + s2 = r2 + (r2 >> 2); + s3 = r3 + (r3 >> 2); + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + while (len >= POLY1305_BLOCK_SIZE) { + /* h += m[i] */ + h0 = (u32)(d0 = (u64)h0 + U8TOU32(inp + 0)); + h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + U8TOU32(inp + 4)); + h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + U8TOU32(inp + 8)); + h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + U8TOU32(inp + 12)); + h4 += (u32)(d3 >> 32) + padbit; + + /* h *= r "%" p, where "%" stands for "partial remainder" */ + d0 = ((u64)h0 * r0) + + ((u64)h1 * s3) + + ((u64)h2 * s2) + + ((u64)h3 * s1); + d1 = ((u64)h0 * r1) + + ((u64)h1 * r0) + + ((u64)h2 * s3) + + ((u64)h3 * s2) + + (h4 * s1); + d2 = ((u64)h0 * r2) + + ((u64)h1 * r1) + + ((u64)h2 * r0) + + ((u64)h3 * s3) + + (h4 * s2); + d3 = ((u64)h0 * r3) + + ((u64)h1 * r2) + + ((u64)h2 * r1) + + ((u64)h3 * r0) + + (h4 * s3); + h4 = (h4 * r0); + + /* last reduction step: */ + /* a) h4:h0 = h4<<128 + d3<<96 + d2<<64 + d1<<32 + d0 */ + h0 = (u32)d0; + h1 = (u32)(d1 += d0 >> 32); + h2 = (u32)(d2 += d1 >> 32); + h3 = (u32)(d3 += d2 >> 32); + h4 += (u32)(d3 >> 32); + /* b) (h4:h0 += (h4:h0>>130) * 5) %= 2^130 */ + c = (h4 >> 2) + (h4 & ~3U); + h4 &= 3; + h0 += c; + h1 += (c = CONSTANT_TIME_CARRY(h0,c)); + h2 += (c = CONSTANT_TIME_CARRY(h1,c)); + h3 += (c = CONSTANT_TIME_CARRY(h2,c)); + h4 += CONSTANT_TIME_CARRY(h3,c); + /* + * Occasional overflows to 3rd bit of h4 are taken care of + * "naturally". If after this point we end up at the top of + * this loop, then the overflow bit will be accounted for + * in next iteration. If we end up in poly1305_emit, then + * comparison to modulus below will still count as "carry + * into 131st bit", so that properly reduced value will be + * picked in conditional move. + */ + + inp += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + } + + st->h[0] = h0; + st->h[1] = h1; + st->h[2] = h2; + st->h[3] = h3; + st->h[4] = h4; +} + +static void poly1305_emit(void *ctx, unsigned char mac[16], + const u32 nonce[4]) +{ + poly1305_internal *st = (poly1305_internal *) ctx; + u32 h0, h1, h2, h3, h4; + u32 g0, g1, g2, g3, g4; + u64 t; + u32 mask; + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + /* compare to modulus by computing h + -p */ + g0 = (u32)(t = (u64)h0 + 5); + g1 = (u32)(t = (u64)h1 + (t >> 32)); + g2 = (u32)(t = (u64)h2 + (t >> 32)); + g3 = (u32)(t = (u64)h3 + (t >> 32)); + g4 = h4 + (u32)(t >> 32); + + /* if there was carry into 131st bit, h3:h0 = g3:g0 */ + mask = 0 - (g4 >> 2); + g0 &= mask; + g1 &= mask; + g2 &= mask; + g3 &= mask; + mask = ~mask; + h0 = (h0 & mask) | g0; + h1 = (h1 & mask) | g1; + h2 = (h2 & mask) | g2; + h3 = (h3 & mask) | g3; + + /* mac = (h + nonce) % (2^128) */ + h0 = (u32)(t = (u64)h0 + nonce[0]); + h1 = (u32)(t = (u64)h1 + (t >> 32) + nonce[1]); + h2 = (u32)(t = (u64)h2 + (t >> 32) + nonce[2]); + h3 = (u32)(t = (u64)h3 + (t >> 32) + nonce[3]); + + U32TO8(mac + 0, h0); + U32TO8(mac + 4, h1); + U32TO8(mac + 8, h2); + U32TO8(mac + 12, h3); +} +# endif +#else +int poly1305_init(void *ctx, const unsigned char key[16], void *func); +void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, + unsigned int padbit); +void poly1305_emit(void *ctx, unsigned char mac[16], + const unsigned int nonce[4]); +#endif + +void Poly1305_Init(POLY1305 *ctx, const unsigned char key[32]) +{ + ctx->nonce[0] = U8TOU32(&key[16]); + ctx->nonce[1] = U8TOU32(&key[20]); + ctx->nonce[2] = U8TOU32(&key[24]); + ctx->nonce[3] = U8TOU32(&key[28]); + +#ifndef POLY1305_ASM + poly1305_init(ctx->opaque, key); +#else + /* + * Unlike reference poly1305_init assembly counterpart is expected + * to return a value: non-zero if it initializes ctx->func, and zero + * otherwise. Latter is to simplify assembly in cases when there no + * multiple code paths to switch between. + */ + if (!poly1305_init(ctx->opaque, key, &ctx->func)) { + ctx->func.blocks = poly1305_blocks; + ctx->func.emit = poly1305_emit; + } +#endif + + ctx->num = 0; + +} + +#ifdef POLY1305_ASM +/* + * This "eclipses" poly1305_blocks and poly1305_emit, but it's + * conscious choice imposed by -Wshadow compiler warnings. + */ +# define poly1305_blocks (*poly1305_blocks_p) +# define poly1305_emit (*poly1305_emit_p) +#endif + +void Poly1305_Update(POLY1305 *ctx, const unsigned char *inp, size_t len) +{ +#ifdef POLY1305_ASM + /* + * As documented, poly1305_blocks is never called with input + * longer than single block and padbit argument set to 0. This + * property is fluently used in assembly modules to optimize + * padbit handling on loop boundary. + */ + poly1305_blocks_f poly1305_blocks_p = ctx->func.blocks; +#endif + size_t rem, num; + + if ((num = ctx->num)) { + rem = POLY1305_BLOCK_SIZE - num; + if (len >= rem) { + memcpy(ctx->data + num, inp, rem); + poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 1); + inp += rem; + len -= rem; + } else { + /* Still not enough data to process a block. */ + memcpy(ctx->data + num, inp, len); + ctx->num = num + len; + return; + } + } + + rem = len % POLY1305_BLOCK_SIZE; + len -= rem; + + if (len >= POLY1305_BLOCK_SIZE) { + poly1305_blocks(ctx->opaque, inp, len, 1); + inp += len; + } + + if (rem) + memcpy(ctx->data, inp, rem); + + ctx->num = rem; +} + +void Poly1305_Final(POLY1305 *ctx, unsigned char mac[16]) +{ +#ifdef POLY1305_ASM + poly1305_blocks_f poly1305_blocks_p = ctx->func.blocks; + poly1305_emit_f poly1305_emit_p = ctx->func.emit; +#endif + size_t num; + + if ((num = ctx->num)) { + ctx->data[num++] = 1; /* pad bit */ + while (num < POLY1305_BLOCK_SIZE) + ctx->data[num++] = 0; + poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 0); + } + + poly1305_emit(ctx->opaque, mac, ctx->nonce); + + /* zero out the state */ + OPENSSL_cleanse(ctx, sizeof(*ctx)); +} diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305_ameth.c b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305_ameth.c new file mode 100644 index 000000000..0c8a91dc7 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305_ameth.c @@ -0,0 +1,122 @@ +/* + * Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include <stdio.h> +#include "internal/cryptlib.h" +#include <openssl/evp.h> +#include "crypto/asn1.h" +#include "crypto/poly1305.h" +#include "poly1305_local.h" +#include "crypto/evp.h" + +/* + * POLY1305 "ASN1" method. This is just here to indicate the maximum + * POLY1305 output length and to free up a POLY1305 key. + */ + +static int poly1305_size(const EVP_PKEY *pkey) +{ + return POLY1305_DIGEST_SIZE; +} + +static void poly1305_key_free(EVP_PKEY *pkey) +{ + ASN1_OCTET_STRING *os = EVP_PKEY_get0(pkey); + if (os != NULL) { + if (os->data != NULL) + OPENSSL_cleanse(os->data, os->length); + ASN1_OCTET_STRING_free(os); + } +} + +static int poly1305_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) +{ + /* nothing, (including ASN1_PKEY_CTRL_DEFAULT_MD_NID), is supported */ + return -2; +} + +static int poly1305_pkey_public_cmp(const EVP_PKEY *a, const EVP_PKEY *b) +{ + return ASN1_OCTET_STRING_cmp(EVP_PKEY_get0(a), EVP_PKEY_get0(b)); +} + +static int poly1305_set_priv_key(EVP_PKEY *pkey, const unsigned char *priv, + size_t len) +{ + ASN1_OCTET_STRING *os; + + if (pkey->pkey.ptr != NULL || len != POLY1305_KEY_SIZE) + return 0; + + os = ASN1_OCTET_STRING_new(); + if (os == NULL) + return 0; + + if (!ASN1_OCTET_STRING_set(os, priv, len)) { + ASN1_OCTET_STRING_free(os); + return 0; + } + + pkey->pkey.ptr = os; + return 1; +} + +static int poly1305_get_priv_key(const EVP_PKEY *pkey, unsigned char *priv, + size_t *len) +{ + ASN1_OCTET_STRING *os = (ASN1_OCTET_STRING *)pkey->pkey.ptr; + + if (priv == NULL) { + *len = POLY1305_KEY_SIZE; + return 1; + } + + if (os == NULL || *len < POLY1305_KEY_SIZE) + return 0; + + memcpy(priv, ASN1_STRING_get0_data(os), ASN1_STRING_length(os)); + *len = POLY1305_KEY_SIZE; + + return 1; +} + +const EVP_PKEY_ASN1_METHOD poly1305_asn1_meth = { + EVP_PKEY_POLY1305, + EVP_PKEY_POLY1305, + 0, + + "POLY1305", + "OpenSSL POLY1305 method", + + 0, 0, poly1305_pkey_public_cmp, 0, + + 0, 0, 0, + + poly1305_size, + 0, 0, + 0, 0, 0, 0, 0, 0, 0, + + poly1305_key_free, + poly1305_pkey_ctrl, + NULL, + NULL, + + NULL, + NULL, + NULL, + + NULL, + NULL, + NULL, + + poly1305_set_priv_key, + NULL, + poly1305_get_priv_key, + NULL, +}; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305_base2_44.c b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305_base2_44.c new file mode 100644 index 000000000..b6313d01b --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305_base2_44.c @@ -0,0 +1,171 @@ +/* + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +/* + * This module is meant to be used as template for base 2^44 assembly + * implementation[s]. On side note compiler-generated code is not + * slower than compiler-generated base 2^64 code on [high-end] x86_64, + * even though amount of multiplications is 50% higher. Go figure... + */ +#include <stdlib.h> + +typedef unsigned char u8; +typedef unsigned int u32; +typedef unsigned long u64; +typedef unsigned __int128 u128; + +typedef struct { + u64 h[3]; + u64 s[2]; + u64 r[3]; +} poly1305_internal; + +#define POLY1305_BLOCK_SIZE 16 + +/* pick 64-bit unsigned integer in little endian order */ +static u64 U8TOU64(const unsigned char *p) +{ + return (((u64)(p[0] & 0xff)) | + ((u64)(p[1] & 0xff) << 8) | + ((u64)(p[2] & 0xff) << 16) | + ((u64)(p[3] & 0xff) << 24) | + ((u64)(p[4] & 0xff) << 32) | + ((u64)(p[5] & 0xff) << 40) | + ((u64)(p[6] & 0xff) << 48) | + ((u64)(p[7] & 0xff) << 56)); +} + +/* store a 64-bit unsigned integer in little endian */ +static void U64TO8(unsigned char *p, u64 v) +{ + p[0] = (unsigned char)((v) & 0xff); + p[1] = (unsigned char)((v >> 8) & 0xff); + p[2] = (unsigned char)((v >> 16) & 0xff); + p[3] = (unsigned char)((v >> 24) & 0xff); + p[4] = (unsigned char)((v >> 32) & 0xff); + p[5] = (unsigned char)((v >> 40) & 0xff); + p[6] = (unsigned char)((v >> 48) & 0xff); + p[7] = (unsigned char)((v >> 56) & 0xff); +} + +int poly1305_init(void *ctx, const unsigned char key[16]) +{ + poly1305_internal *st = (poly1305_internal *)ctx; + u64 r0, r1; + + /* h = 0 */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + + r0 = U8TOU64(&key[0]) & 0x0ffffffc0fffffff; + r1 = U8TOU64(&key[8]) & 0x0ffffffc0ffffffc; + + /* break r1:r0 to three 44-bit digits, masks are 1<<44-1 */ + st->r[0] = r0 & 0x0fffffffffff; + st->r[1] = ((r0 >> 44) | (r1 << 20)) & 0x0fffffffffff; + st->r[2] = (r1 >> 24); + + st->s[0] = (st->r[1] + (st->r[1] << 2)) << 2; + st->s[1] = (st->r[2] + (st->r[2] << 2)) << 2; + + return 0; +} + +void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, + u32 padbit) +{ + poly1305_internal *st = (poly1305_internal *)ctx; + u64 r0, r1, r2; + u64 s1, s2; + u64 h0, h1, h2, c; + u128 d0, d1, d2; + u64 pad = (u64)padbit << 40; + + r0 = st->r[0]; + r1 = st->r[1]; + r2 = st->r[2]; + + s1 = st->s[0]; + s2 = st->s[1]; + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + + while (len >= POLY1305_BLOCK_SIZE) { + u64 m0, m1; + + m0 = U8TOU64(inp + 0); + m1 = U8TOU64(inp + 8); + + /* h += m[i], m[i] is broken to 44-bit digits */ + h0 += m0 & 0x0fffffffffff; + h1 += ((m0 >> 44) | (m1 << 20)) & 0x0fffffffffff; + h2 += (m1 >> 24) + pad; + + /* h *= r "%" p, where "%" stands for "partial remainder" */ + d0 = ((u128)h0 * r0) + ((u128)h1 * s2) + ((u128)h2 * s1); + d1 = ((u128)h0 * r1) + ((u128)h1 * r0) + ((u128)h2 * s2); + d2 = ((u128)h0 * r2) + ((u128)h1 * r1) + ((u128)h2 * r0); + + /* "lazy" reduction step */ + h0 = (u64)d0 & 0x0fffffffffff; + h1 = (u64)(d1 += (u64)(d0 >> 44)) & 0x0fffffffffff; + h2 = (u64)(d2 += (u64)(d1 >> 44)) & 0x03ffffffffff; /* last 42 bits */ + + c = (d2 >> 42); + h0 += c + (c << 2); + + inp += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + } + + st->h[0] = h0; + st->h[1] = h1; + st->h[2] = h2; +} + +void poly1305_emit(void *ctx, unsigned char mac[16], const u32 nonce[4]) +{ + poly1305_internal *st = (poly1305_internal *) ctx; + u64 h0, h1, h2; + u64 g0, g1, g2; + u128 t; + u64 mask; + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + + /* after "lazy" reduction, convert 44+bit digits to 64-bit ones */ + h0 = (u64)(t = (u128)h0 + (h1 << 44)); h1 >>= 20; + h1 = (u64)(t = (u128)h1 + (h2 << 24) + (t >> 64)); h2 >>= 40; + h2 += (u64)(t >> 64); + + /* compare to modulus by computing h + -p */ + g0 = (u64)(t = (u128)h0 + 5); + g1 = (u64)(t = (u128)h1 + (t >> 64)); + g2 = h2 + (u64)(t >> 64); + + /* if there was carry into 131st bit, h1:h0 = g1:g0 */ + mask = 0 - (g2 >> 2); + g0 &= mask; + g1 &= mask; + mask = ~mask; + h0 = (h0 & mask) | g0; + h1 = (h1 & mask) | g1; + + /* mac = (h + nonce) % (2^128) */ + h0 = (u64)(t = (u128)h0 + nonce[0] + ((u64)nonce[1]<<32)); + h1 = (u64)(t = (u128)h1 + nonce[2] + ((u64)nonce[3]<<32) + (t >> 64)); + + U64TO8(mac + 0, h0); + U64TO8(mac + 8, h1); +} diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305_ieee754.c b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305_ieee754.c new file mode 100644 index 000000000..7cfd96864 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305_ieee754.c @@ -0,0 +1,488 @@ +/* + * Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +/* + * This module is meant to be used as template for non-x87 floating- + * point assembly modules. The template itself is x86_64-specific + * though, as it was debugged on x86_64. So that implementor would + * have to recognize platform-specific parts, UxTOy and inline asm, + * and act accordingly. + * + * Huh? x86_64-specific code as template for non-x87? Note seven, which + * is not a typo, but reference to 80-bit precision. This module on the + * other hand relies on 64-bit precision operations, which are default + * for x86_64 code. And since we are at it, just for sense of it, + * large-block performance in cycles per processed byte for *this* code + * is: + * gcc-4.8 icc-15.0 clang-3.4(*) + * + * Westmere 4.96 5.09 4.37 + * Sandy Bridge 4.95 4.90 4.17 + * Haswell 4.92 4.87 3.78 + * Bulldozer 4.67 4.49 4.68 + * VIA Nano 7.07 7.05 5.98 + * Silvermont 10.6 9.61 12.6 + * + * (*) clang managed to discover parallelism and deployed SIMD; + * + * And for range of other platforms with unspecified gcc versions: + * + * Freescale e300 12.5 + * PPC74x0 10.8 + * POWER6 4.92 + * POWER7 4.50 + * POWER8 4.10 + * + * z10 11.2 + * z196+ 7.30 + * + * UltraSPARC III 16.0 + * SPARC T4 16.1 + */ + +#if !(defined(__GNUC__) && __GNUC__>=2) +# error "this is gcc-specific template" +#endif + +#include <stdlib.h> + +typedef unsigned char u8; +typedef unsigned int u32; +typedef unsigned long long u64; +typedef union { double d; u64 u; } elem64; + +#define TWO(p) ((double)(1ULL<<(p))) +#define TWO0 TWO(0) +#define TWO32 TWO(32) +#define TWO64 (TWO32*TWO(32)) +#define TWO96 (TWO64*TWO(32)) +#define TWO130 (TWO96*TWO(34)) + +#define EXP(p) ((1023ULL+(p))<<52) + +#if defined(__x86_64__) || (defined(__PPC__) && defined(__LITTLE_ENDIAN__)) +# define U8TOU32(p) (*(const u32 *)(p)) +# define U32TO8(p,v) (*(u32 *)(p) = (v)) +#elif defined(__PPC__) +# define U8TOU32(p) ({u32 ret; asm ("lwbrx %0,0,%1":"=r"(ret):"b"(p)); ret; }) +# define U32TO8(p,v) asm ("stwbrx %0,0,%1"::"r"(v),"b"(p):"memory") +#elif defined(__s390x__) +# define U8TOU32(p) ({u32 ret; asm ("lrv %0,%1":"=d"(ret):"m"(*(u32 *)(p))); ret; }) +# define U32TO8(p,v) asm ("strv %1,%0":"=m"(*(u32 *)(p)):"d"(v)) +#endif + +#ifndef U8TOU32 +# define U8TOU32(p) ((u32)(p)[0] | (u32)(p)[1]<<8 | \ + (u32)(p)[2]<<16 | (u32)(p)[3]<<24 ) +#endif +#ifndef U32TO8 +# define U32TO8(p,v) ((p)[0] = (u8)(v), (p)[1] = (u8)((v)>>8), \ + (p)[2] = (u8)((v)>>16), (p)[3] = (u8)((v)>>24) ) +#endif + +typedef struct { + elem64 h[4]; + double r[8]; + double s[6]; +} poly1305_internal; + +/* "round toward zero (truncate), mask all exceptions" */ +#if defined(__x86_64__) +static const u32 mxcsr = 0x7f80; +#elif defined(__PPC__) +static const u64 one = 1; +#elif defined(__s390x__) +static const u32 fpc = 1; +#elif defined(__sparc__) +static const u64 fsr = 1ULL<<30; +#elif defined(__mips__) +static const u32 fcsr = 1; +#else +#error "unrecognized platform" +#endif + +int poly1305_init(void *ctx, const unsigned char key[16]) +{ + poly1305_internal *st = (poly1305_internal *) ctx; + elem64 r0, r1, r2, r3; + + /* h = 0, biased */ +#if 0 + st->h[0].d = TWO(52)*TWO0; + st->h[1].d = TWO(52)*TWO32; + st->h[2].d = TWO(52)*TWO64; + st->h[3].d = TWO(52)*TWO96; +#else + st->h[0].u = EXP(52+0); + st->h[1].u = EXP(52+32); + st->h[2].u = EXP(52+64); + st->h[3].u = EXP(52+96); +#endif + + if (key) { + /* + * set "truncate" rounding mode + */ +#if defined(__x86_64__) + u32 mxcsr_orig; + + asm volatile ("stmxcsr %0":"=m"(mxcsr_orig)); + asm volatile ("ldmxcsr %0"::"m"(mxcsr)); +#elif defined(__PPC__) + double fpscr_orig, fpscr = *(double *)&one; + + asm volatile ("mffs %0":"=f"(fpscr_orig)); + asm volatile ("mtfsf 255,%0"::"f"(fpscr)); +#elif defined(__s390x__) + u32 fpc_orig; + + asm volatile ("stfpc %0":"=m"(fpc_orig)); + asm volatile ("lfpc %0"::"m"(fpc)); +#elif defined(__sparc__) + u64 fsr_orig; + + asm volatile ("stx %%fsr,%0":"=m"(fsr_orig)); + asm volatile ("ldx %0,%%fsr"::"m"(fsr)); +#elif defined(__mips__) + u32 fcsr_orig; + + asm volatile ("cfc1 %0,$31":"=r"(fcsr_orig)); + asm volatile ("ctc1 %0,$31"::"r"(fcsr)); +#endif + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + r0.u = EXP(52+0) | (U8TOU32(&key[0]) & 0x0fffffff); + r1.u = EXP(52+32) | (U8TOU32(&key[4]) & 0x0ffffffc); + r2.u = EXP(52+64) | (U8TOU32(&key[8]) & 0x0ffffffc); + r3.u = EXP(52+96) | (U8TOU32(&key[12]) & 0x0ffffffc); + + st->r[0] = r0.d - TWO(52)*TWO0; + st->r[2] = r1.d - TWO(52)*TWO32; + st->r[4] = r2.d - TWO(52)*TWO64; + st->r[6] = r3.d - TWO(52)*TWO96; + + st->s[0] = st->r[2] * (5.0/TWO130); + st->s[2] = st->r[4] * (5.0/TWO130); + st->s[4] = st->r[6] * (5.0/TWO130); + + /* + * base 2^32 -> base 2^16 + */ + st->r[1] = (st->r[0] + TWO(52)*TWO(16)*TWO0) - + TWO(52)*TWO(16)*TWO0; + st->r[0] -= st->r[1]; + + st->r[3] = (st->r[2] + TWO(52)*TWO(16)*TWO32) - + TWO(52)*TWO(16)*TWO32; + st->r[2] -= st->r[3]; + + st->r[5] = (st->r[4] + TWO(52)*TWO(16)*TWO64) - + TWO(52)*TWO(16)*TWO64; + st->r[4] -= st->r[5]; + + st->r[7] = (st->r[6] + TWO(52)*TWO(16)*TWO96) - + TWO(52)*TWO(16)*TWO96; + st->r[6] -= st->r[7]; + + st->s[1] = (st->s[0] + TWO(52)*TWO(16)*TWO0/TWO96) - + TWO(52)*TWO(16)*TWO0/TWO96; + st->s[0] -= st->s[1]; + + st->s[3] = (st->s[2] + TWO(52)*TWO(16)*TWO32/TWO96) - + TWO(52)*TWO(16)*TWO32/TWO96; + st->s[2] -= st->s[3]; + + st->s[5] = (st->s[4] + TWO(52)*TWO(16)*TWO64/TWO96) - + TWO(52)*TWO(16)*TWO64/TWO96; + st->s[4] -= st->s[5]; + + /* + * restore original FPU control register + */ +#if defined(__x86_64__) + asm volatile ("ldmxcsr %0"::"m"(mxcsr_orig)); +#elif defined(__PPC__) + asm volatile ("mtfsf 255,%0"::"f"(fpscr_orig)); +#elif defined(__s390x__) + asm volatile ("lfpc %0"::"m"(fpc_orig)); +#elif defined(__sparc__) + asm volatile ("ldx %0,%%fsr"::"m"(fsr_orig)); +#elif defined(__mips__) + asm volatile ("ctc1 %0,$31"::"r"(fcsr_orig)); +#endif + } + + return 0; +} + +void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, + int padbit) +{ + poly1305_internal *st = (poly1305_internal *)ctx; + elem64 in0, in1, in2, in3; + u64 pad = (u64)padbit<<32; + + double x0, x1, x2, x3; + double h0lo, h0hi, h1lo, h1hi, h2lo, h2hi, h3lo, h3hi; + double c0lo, c0hi, c1lo, c1hi, c2lo, c2hi, c3lo, c3hi; + + const double r0lo = st->r[0]; + const double r0hi = st->r[1]; + const double r1lo = st->r[2]; + const double r1hi = st->r[3]; + const double r2lo = st->r[4]; + const double r2hi = st->r[5]; + const double r3lo = st->r[6]; + const double r3hi = st->r[7]; + + const double s1lo = st->s[0]; + const double s1hi = st->s[1]; + const double s2lo = st->s[2]; + const double s2hi = st->s[3]; + const double s3lo = st->s[4]; + const double s3hi = st->s[5]; + + /* + * set "truncate" rounding mode + */ +#if defined(__x86_64__) + u32 mxcsr_orig; + + asm volatile ("stmxcsr %0":"=m"(mxcsr_orig)); + asm volatile ("ldmxcsr %0"::"m"(mxcsr)); +#elif defined(__PPC__) + double fpscr_orig, fpscr = *(double *)&one; + + asm volatile ("mffs %0":"=f"(fpscr_orig)); + asm volatile ("mtfsf 255,%0"::"f"(fpscr)); +#elif defined(__s390x__) + u32 fpc_orig; + + asm volatile ("stfpc %0":"=m"(fpc_orig)); + asm volatile ("lfpc %0"::"m"(fpc)); +#elif defined(__sparc__) + u64 fsr_orig; + + asm volatile ("stx %%fsr,%0":"=m"(fsr_orig)); + asm volatile ("ldx %0,%%fsr"::"m"(fsr)); +#elif defined(__mips__) + u32 fcsr_orig; + + asm volatile ("cfc1 %0,$31":"=r"(fcsr_orig)); + asm volatile ("ctc1 %0,$31"::"r"(fcsr)); +#endif + + /* + * load base 2^32 and de-bias + */ + h0lo = st->h[0].d - TWO(52)*TWO0; + h1lo = st->h[1].d - TWO(52)*TWO32; + h2lo = st->h[2].d - TWO(52)*TWO64; + h3lo = st->h[3].d - TWO(52)*TWO96; + +#ifdef __clang__ + h0hi = 0; + h1hi = 0; + h2hi = 0; + h3hi = 0; +#else + in0.u = EXP(52+0) | U8TOU32(&inp[0]); + in1.u = EXP(52+32) | U8TOU32(&inp[4]); + in2.u = EXP(52+64) | U8TOU32(&inp[8]); + in3.u = EXP(52+96) | U8TOU32(&inp[12]) | pad; + + x0 = in0.d - TWO(52)*TWO0; + x1 = in1.d - TWO(52)*TWO32; + x2 = in2.d - TWO(52)*TWO64; + x3 = in3.d - TWO(52)*TWO96; + + x0 += h0lo; + x1 += h1lo; + x2 += h2lo; + x3 += h3lo; + + goto fast_entry; +#endif + + do { + in0.u = EXP(52+0) | U8TOU32(&inp[0]); + in1.u = EXP(52+32) | U8TOU32(&inp[4]); + in2.u = EXP(52+64) | U8TOU32(&inp[8]); + in3.u = EXP(52+96) | U8TOU32(&inp[12]) | pad; + + x0 = in0.d - TWO(52)*TWO0; + x1 = in1.d - TWO(52)*TWO32; + x2 = in2.d - TWO(52)*TWO64; + x3 = in3.d - TWO(52)*TWO96; + + /* + * note that there are multiple ways to accumulate input, e.g. + * one can as well accumulate to h0lo-h1lo-h1hi-h2hi... + */ + h0lo += x0; + h0hi += x1; + h2lo += x2; + h2hi += x3; + + /* + * carries that cross 32n-bit (and 130-bit) boundaries + */ + c0lo = (h0lo + TWO(52)*TWO32) - TWO(52)*TWO32; + c1lo = (h1lo + TWO(52)*TWO64) - TWO(52)*TWO64; + c2lo = (h2lo + TWO(52)*TWO96) - TWO(52)*TWO96; + c3lo = (h3lo + TWO(52)*TWO130) - TWO(52)*TWO130; + + c0hi = (h0hi + TWO(52)*TWO32) - TWO(52)*TWO32; + c1hi = (h1hi + TWO(52)*TWO64) - TWO(52)*TWO64; + c2hi = (h2hi + TWO(52)*TWO96) - TWO(52)*TWO96; + c3hi = (h3hi + TWO(52)*TWO130) - TWO(52)*TWO130; + + /* + * base 2^48 -> base 2^32 with last reduction step + */ + x1 = (h1lo - c1lo) + c0lo; + x2 = (h2lo - c2lo) + c1lo; + x3 = (h3lo - c3lo) + c2lo; + x0 = (h0lo - c0lo) + c3lo * (5.0/TWO130); + + x1 += (h1hi - c1hi) + c0hi; + x2 += (h2hi - c2hi) + c1hi; + x3 += (h3hi - c3hi) + c2hi; + x0 += (h0hi - c0hi) + c3hi * (5.0/TWO130); + +#ifndef __clang__ + fast_entry: +#endif + /* + * base 2^32 * base 2^16 = base 2^48 + */ + h0lo = s3lo * x1 + s2lo * x2 + s1lo * x3 + r0lo * x0; + h1lo = r0lo * x1 + s3lo * x2 + s2lo * x3 + r1lo * x0; + h2lo = r1lo * x1 + r0lo * x2 + s3lo * x3 + r2lo * x0; + h3lo = r2lo * x1 + r1lo * x2 + r0lo * x3 + r3lo * x0; + + h0hi = s3hi * x1 + s2hi * x2 + s1hi * x3 + r0hi * x0; + h1hi = r0hi * x1 + s3hi * x2 + s2hi * x3 + r1hi * x0; + h2hi = r1hi * x1 + r0hi * x2 + s3hi * x3 + r2hi * x0; + h3hi = r2hi * x1 + r1hi * x2 + r0hi * x3 + r3hi * x0; + + inp += 16; + len -= 16; + + } while (len >= 16); + + /* + * carries that cross 32n-bit (and 130-bit) boundaries + */ + c0lo = (h0lo + TWO(52)*TWO32) - TWO(52)*TWO32; + c1lo = (h1lo + TWO(52)*TWO64) - TWO(52)*TWO64; + c2lo = (h2lo + TWO(52)*TWO96) - TWO(52)*TWO96; + c3lo = (h3lo + TWO(52)*TWO130) - TWO(52)*TWO130; + + c0hi = (h0hi + TWO(52)*TWO32) - TWO(52)*TWO32; + c1hi = (h1hi + TWO(52)*TWO64) - TWO(52)*TWO64; + c2hi = (h2hi + TWO(52)*TWO96) - TWO(52)*TWO96; + c3hi = (h3hi + TWO(52)*TWO130) - TWO(52)*TWO130; + + /* + * base 2^48 -> base 2^32 with last reduction step + */ + x1 = (h1lo - c1lo) + c0lo; + x2 = (h2lo - c2lo) + c1lo; + x3 = (h3lo - c3lo) + c2lo; + x0 = (h0lo - c0lo) + c3lo * (5.0/TWO130); + + x1 += (h1hi - c1hi) + c0hi; + x2 += (h2hi - c2hi) + c1hi; + x3 += (h3hi - c3hi) + c2hi; + x0 += (h0hi - c0hi) + c3hi * (5.0/TWO130); + + /* + * store base 2^32, with bias + */ + st->h[1].d = x1 + TWO(52)*TWO32; + st->h[2].d = x2 + TWO(52)*TWO64; + st->h[3].d = x3 + TWO(52)*TWO96; + st->h[0].d = x0 + TWO(52)*TWO0; + + /* + * restore original FPU control register + */ +#if defined(__x86_64__) + asm volatile ("ldmxcsr %0"::"m"(mxcsr_orig)); +#elif defined(__PPC__) + asm volatile ("mtfsf 255,%0"::"f"(fpscr_orig)); +#elif defined(__s390x__) + asm volatile ("lfpc %0"::"m"(fpc_orig)); +#elif defined(__sparc__) + asm volatile ("ldx %0,%%fsr"::"m"(fsr_orig)); +#elif defined(__mips__) + asm volatile ("ctc1 %0,$31"::"r"(fcsr_orig)); +#endif +} + +void poly1305_emit(void *ctx, unsigned char mac[16], const u32 nonce[4]) +{ + poly1305_internal *st = (poly1305_internal *) ctx; + u64 h0, h1, h2, h3, h4; + u32 g0, g1, g2, g3, g4; + u64 t; + u32 mask; + + /* + * thanks to bias masking exponent gives integer result + */ + h0 = st->h[0].u & 0x000fffffffffffffULL; + h1 = st->h[1].u & 0x000fffffffffffffULL; + h2 = st->h[2].u & 0x000fffffffffffffULL; + h3 = st->h[3].u & 0x000fffffffffffffULL; + + /* + * can be partially reduced, so reduce... + */ + h4 = h3>>32; h3 &= 0xffffffffU; + g4 = h4&-4; + h4 &= 3; + g4 += g4>>2; + + h0 += g4; + h1 += h0>>32; h0 &= 0xffffffffU; + h2 += h1>>32; h1 &= 0xffffffffU; + h3 += h2>>32; h2 &= 0xffffffffU; + + /* compute h + -p */ + g0 = (u32)(t = h0 + 5); + g1 = (u32)(t = h1 + (t >> 32)); + g2 = (u32)(t = h2 + (t >> 32)); + g3 = (u32)(t = h3 + (t >> 32)); + g4 = h4 + (u32)(t >> 32); + + /* if there was carry, select g0-g3 */ + mask = 0 - (g4 >> 2); + g0 &= mask; + g1 &= mask; + g2 &= mask; + g3 &= mask; + mask = ~mask; + g0 |= (h0 & mask); + g1 |= (h1 & mask); + g2 |= (h2 & mask); + g3 |= (h3 & mask); + + /* mac = (h + nonce) % (2^128) */ + g0 = (u32)(t = (u64)g0 + nonce[0]); + g1 = (u32)(t = (u64)g1 + (t >> 32) + nonce[1]); + g2 = (u32)(t = (u64)g2 + (t >> 32) + nonce[2]); + g3 = (u32)(t = (u64)g3 + (t >> 32) + nonce[3]); + + U32TO8(mac + 0, g0); + U32TO8(mac + 4, g1); + U32TO8(mac + 8, g2); + U32TO8(mac + 12, g3); +} diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305_local.h b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305_local.h new file mode 100644 index 000000000..6d4d9dc5b --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305_local.h @@ -0,0 +1,27 @@ +/* + * Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +typedef void (*poly1305_blocks_f) (void *ctx, const unsigned char *inp, + size_t len, unsigned int padbit); +typedef void (*poly1305_emit_f) (void *ctx, unsigned char mac[16], + const unsigned int nonce[4]); + +struct poly1305_context { + double opaque[24]; /* large enough to hold internal state, declared + * 'double' to ensure at least 64-bit invariant + * alignment across all platforms and + * configurations */ + unsigned int nonce[4]; + unsigned char data[POLY1305_BLOCK_SIZE]; + size_t num; + struct { + poly1305_blocks_f blocks; + poly1305_emit_f emit; + } func; +}; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305_pmeth.c b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305_pmeth.c new file mode 100644 index 000000000..49a799a12 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/crypto/poly1305/poly1305_pmeth.c @@ -0,0 +1,194 @@ +/* + * Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include <stdio.h> +#include "internal/cryptlib.h" +#include <openssl/x509.h> +#include <openssl/x509v3.h> +#include <openssl/evp.h> +#include <openssl/err.h> +#include "crypto/poly1305.h" +#include "poly1305_local.h" +#include "crypto/evp.h" + +/* POLY1305 pkey context structure */ + +typedef struct { + ASN1_OCTET_STRING ktmp; /* Temp storage for key */ + POLY1305 ctx; +} POLY1305_PKEY_CTX; + +static int pkey_poly1305_init(EVP_PKEY_CTX *ctx) +{ + POLY1305_PKEY_CTX *pctx; + + if ((pctx = OPENSSL_zalloc(sizeof(*pctx))) == NULL) { + CRYPTOerr(CRYPTO_F_PKEY_POLY1305_INIT, ERR_R_MALLOC_FAILURE); + return 0; + } + pctx->ktmp.type = V_ASN1_OCTET_STRING; + + EVP_PKEY_CTX_set_data(ctx, pctx); + EVP_PKEY_CTX_set0_keygen_info(ctx, NULL, 0); + return 1; +} + +static void pkey_poly1305_cleanup(EVP_PKEY_CTX *ctx) +{ + POLY1305_PKEY_CTX *pctx = EVP_PKEY_CTX_get_data(ctx); + + if (pctx != NULL) { + OPENSSL_clear_free(pctx->ktmp.data, pctx->ktmp.length); + OPENSSL_clear_free(pctx, sizeof(*pctx)); + EVP_PKEY_CTX_set_data(ctx, NULL); + } +} + +static int pkey_poly1305_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) +{ + POLY1305_PKEY_CTX *sctx, *dctx; + + /* allocate memory for dst->data and a new POLY1305_CTX in dst->data->ctx */ + if (!pkey_poly1305_init(dst)) + return 0; + sctx = EVP_PKEY_CTX_get_data(src); + dctx = EVP_PKEY_CTX_get_data(dst); + if (ASN1_STRING_get0_data(&sctx->ktmp) != NULL && + !ASN1_STRING_copy(&dctx->ktmp, &sctx->ktmp)) { + /* cleanup and free the POLY1305_PKEY_CTX in dst->data */ + pkey_poly1305_cleanup(dst); + return 0; + } + memcpy(&dctx->ctx, &sctx->ctx, sizeof(POLY1305)); + return 1; +} + +static int pkey_poly1305_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) +{ + ASN1_OCTET_STRING *key; + POLY1305_PKEY_CTX *pctx = EVP_PKEY_CTX_get_data(ctx); + + if (ASN1_STRING_get0_data(&pctx->ktmp) == NULL) + return 0; + key = ASN1_OCTET_STRING_dup(&pctx->ktmp); + if (key == NULL) + return 0; + return EVP_PKEY_assign_POLY1305(pkey, key); +} + +static int int_update(EVP_MD_CTX *ctx, const void *data, size_t count) +{ + POLY1305_PKEY_CTX *pctx = EVP_PKEY_CTX_get_data(EVP_MD_CTX_pkey_ctx(ctx)); + + Poly1305_Update(&pctx->ctx, data, count); + return 1; +} + +static int poly1305_signctx_init(EVP_PKEY_CTX *ctx, EVP_MD_CTX *mctx) +{ + POLY1305_PKEY_CTX *pctx = ctx->data; + ASN1_OCTET_STRING *key = (ASN1_OCTET_STRING *)ctx->pkey->pkey.ptr; + + if (key->length != POLY1305_KEY_SIZE) + return 0; + EVP_MD_CTX_set_flags(mctx, EVP_MD_CTX_FLAG_NO_INIT); + EVP_MD_CTX_set_update_fn(mctx, int_update); + Poly1305_Init(&pctx->ctx, key->data); + return 1; +} +static int poly1305_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, + EVP_MD_CTX *mctx) +{ + POLY1305_PKEY_CTX *pctx = ctx->data; + + *siglen = POLY1305_DIGEST_SIZE; + if (sig != NULL) + Poly1305_Final(&pctx->ctx, sig); + return 1; +} + +static int pkey_poly1305_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) +{ + POLY1305_PKEY_CTX *pctx = EVP_PKEY_CTX_get_data(ctx); + const unsigned char *key; + size_t len; + + switch (type) { + + case EVP_PKEY_CTRL_MD: + /* ignore */ + break; + + case EVP_PKEY_CTRL_SET_MAC_KEY: + case EVP_PKEY_CTRL_DIGESTINIT: + if (type == EVP_PKEY_CTRL_SET_MAC_KEY) { + /* user explicitly setting the key */ + key = p2; + len = p1; + } else { + /* user indirectly setting the key via EVP_DigestSignInit */ + key = EVP_PKEY_get0_poly1305(EVP_PKEY_CTX_get0_pkey(ctx), &len); + } + if (key == NULL || len != POLY1305_KEY_SIZE || + !ASN1_OCTET_STRING_set(&pctx->ktmp, key, len)) + return 0; + Poly1305_Init(&pctx->ctx, ASN1_STRING_get0_data(&pctx->ktmp)); + break; + + default: + return -2; + + } + return 1; +} + +static int pkey_poly1305_ctrl_str(EVP_PKEY_CTX *ctx, + const char *type, const char *value) +{ + if (value == NULL) + return 0; + if (strcmp(type, "key") == 0) + return EVP_PKEY_CTX_str2ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, value); + if (strcmp(type, "hexkey") == 0) + return EVP_PKEY_CTX_hex2ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, value); + return -2; +} + +const EVP_PKEY_METHOD poly1305_pkey_meth = { + EVP_PKEY_POLY1305, + EVP_PKEY_FLAG_SIGCTX_CUSTOM, /* we don't deal with a separate MD */ + pkey_poly1305_init, + pkey_poly1305_copy, + pkey_poly1305_cleanup, + + 0, 0, + + 0, + pkey_poly1305_keygen, + + 0, 0, + + 0, 0, + + 0, 0, + + poly1305_signctx_init, + poly1305_signctx, + + 0, 0, + + 0, 0, + + 0, 0, + + 0, 0, + + pkey_poly1305_ctrl, + pkey_poly1305_ctrl_str +}; |