author     2023-10-10 14:33:42 +0000
committer  2023-10-10 14:33:42 +0000
commit     af1a266670d040d2f4083ff309d732d648afba2a (patch)
tree       2fc46203448ddcc6f81546d379abfaeb323575e9 /roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha
parent     e02cda008591317b1625707ff8e115a4841aa889 (diff)
Change-Id: Iaf8d18082d3991dec7c0ebbea540f092188eb4ec
Diffstat (limited to 'roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha')
7 files changed, 6241 insertions, 0 deletions
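The CMake file and perlasm generators added below are all implementations of the same primitive: the ChaCha20 quarter round from RFC 7539. As a reading aid only (not part of the patch), here is a minimal C sketch of that reference operation; the `rotl32` and `quarter_round` names are illustrative and do not come from these sources.

```c
#include <stdint.h>

/* Minimal reference sketch of the ChaCha20 quarter round (RFC 7539).
 * Illustrative only: the assembly generated by the scripts below computes
 * the same add/xor/rotate sequence, interleaved across registers and
 * SIMD lanes for throughput. */
static inline uint32_t rotl32(uint32_t v, int n) {
    return (v << n) | (v >> (32 - n));
}

static inline void quarter_round(uint32_t x[16], int a, int b, int c, int d) {
    x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 16);
    x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 12);
    x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d],  8);
    x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b],  7);
}
```

The left-rotations by 16, 12, 8 and 7 are the same operations that appear below as right-rotates by 16, 20, 24 and 25 (`ror#16`, `ror#20`, `ror#24`, `ror#25`), since a left rotate by n on a 32-bit word equals a right rotate by 32 - n.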
diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/CMakeLists.txt b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/CMakeLists.txt new file mode 100644 index 000000000..63de06119 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/CMakeLists.txt @@ -0,0 +1,48 @@ +include_directories(../../include) + +if (${ARCH} STREQUAL "arm") + set( + CHACHA_ARCH_SOURCES + + chacha-armv4.${ASM_EXT} + ) +endif() + +if (${ARCH} STREQUAL "aarch64") + set( + CHACHA_ARCH_SOURCES + + chacha-armv8.${ASM_EXT} + ) +endif() + +if (${ARCH} STREQUAL "x86") + set( + CHACHA_ARCH_SOURCES + + chacha-x86.${ASM_EXT} + ) +endif() + +if (${ARCH} STREQUAL "x86_64") + set( + CHACHA_ARCH_SOURCES + + chacha-x86_64.${ASM_EXT} + ) +endif() + +add_library( + chacha + + OBJECT + + chacha.c + + ${CHACHA_ARCH_SOURCES} +) + +perlasm(chacha-armv4.${ASM_EXT} asm/chacha-armv4.pl) +perlasm(chacha-armv8.${ASM_EXT} asm/chacha-armv8.pl) +perlasm(chacha-x86.${ASM_EXT} asm/chacha-x86.pl) +perlasm(chacha-x86_64.${ASM_EXT} asm/chacha-x86_64.pl) diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/asm/chacha-armv4.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/asm/chacha-armv4.pl new file mode 100755 index 000000000..13698e3a8 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/asm/chacha-armv4.pl @@ -0,0 +1,1151 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# December 2014 +# +# ChaCha20 for ARMv4. +# +# Performance in cycles per byte out of large buffer. 
+# +# IALU/gcc-4.4 1xNEON 3xNEON+1xIALU +# +# Cortex-A5 19.3(*)/+95% 21.8 14.1 +# Cortex-A8 10.5(*)/+160% 13.9 6.35 +# Cortex-A9 12.9(**)/+110% 14.3 6.50 +# Cortex-A15 11.0/+40% 16.0 5.00 +# Snapdragon S4 11.5/+125% 13.6 4.90 +# +# (*) most "favourable" result for aligned data on little-endian +# processor, result for misaligned data is 10-15% lower; +# (**) this result is a trade-off: it can be improved by 20%, +# but then Snapdragon S4 and Cortex-A8 results get +# 20-25% worse; + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x")); +my @t=map("r$_",(8..11)); + +sub ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my $odd = $d0&1; +my ($xc,$xc_) = (@t[0..1]); +my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]); +my @ret; + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' are permanently allocated in registers, @x[0..7], + # while 'c's and pair of 'd's are maintained in memory. If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. If you observe 'd' column, you'll + # notice that 15 and 13 are reused in next pair of rounds. + # This is why these two are chosen for offloading to memory, + # to make loads count more. 
+ push @ret,( + "&add (@x[$a0],@x[$a0],@x[$b0])", + "&mov ($xd,$xd,'ror#16')", + "&add (@x[$a1],@x[$a1],@x[$b1])", + "&mov ($xd_,$xd_,'ror#16')", + "&eor ($xd,$xd,@x[$a0],'ror#16')", + "&eor ($xd_,$xd_,@x[$a1],'ror#16')", + + "&add ($xc,$xc,$xd)", + "&mov (@x[$b0],@x[$b0],'ror#20')", + "&add ($xc_,$xc_,$xd_)", + "&mov (@x[$b1],@x[$b1],'ror#20')", + "&eor (@x[$b0],@x[$b0],$xc,'ror#20')", + "&eor (@x[$b1],@x[$b1],$xc_,'ror#20')", + + "&add (@x[$a0],@x[$a0],@x[$b0])", + "&mov ($xd,$xd,'ror#24')", + "&add (@x[$a1],@x[$a1],@x[$b1])", + "&mov ($xd_,$xd_,'ror#24')", + "&eor ($xd,$xd,@x[$a0],'ror#24')", + "&eor ($xd_,$xd_,@x[$a1],'ror#24')", + + "&add ($xc,$xc,$xd)", + "&mov (@x[$b0],@x[$b0],'ror#25')" ); + push @ret,( + "&str ($xd,'[sp,#4*(16+$d0)]')", + "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd); + push @ret,( + "&add ($xc_,$xc_,$xd_)", + "&mov (@x[$b1],@x[$b1],'ror#25')" ); + push @ret,( + "&str ($xd_,'[sp,#4*(16+$d1)]')", + "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd); + push @ret,( + "&eor (@x[$b0],@x[$b0],$xc,'ror#25')", + "&eor (@x[$b1],@x[$b1],$xc_,'ror#25')" ); + + $xd=@x[$d2] if (!$odd); + $xd_=@x[$d3] if ($odd); + push @ret,( + "&str ($xc,'[sp,#4*(16+$c0)]')", + "&ldr ($xc,'[sp,#4*(16+$c2)]')", + "&add (@x[$a2],@x[$a2],@x[$b2])", + "&mov ($xd,$xd,'ror#16')", + "&str ($xc_,'[sp,#4*(16+$c1)]')", + "&ldr ($xc_,'[sp,#4*(16+$c3)]')", + "&add (@x[$a3],@x[$a3],@x[$b3])", + "&mov ($xd_,$xd_,'ror#16')", + "&eor ($xd,$xd,@x[$a2],'ror#16')", + "&eor ($xd_,$xd_,@x[$a3],'ror#16')", + + "&add ($xc,$xc,$xd)", + "&mov (@x[$b2],@x[$b2],'ror#20')", + "&add ($xc_,$xc_,$xd_)", + "&mov (@x[$b3],@x[$b3],'ror#20')", + "&eor (@x[$b2],@x[$b2],$xc,'ror#20')", + "&eor (@x[$b3],@x[$b3],$xc_,'ror#20')", + + "&add (@x[$a2],@x[$a2],@x[$b2])", + "&mov ($xd,$xd,'ror#24')", + "&add (@x[$a3],@x[$a3],@x[$b3])", + "&mov ($xd_,$xd_,'ror#24')", + "&eor ($xd,$xd,@x[$a2],'ror#24')", + "&eor ($xd_,$xd_,@x[$a3],'ror#24')", + + "&add ($xc,$xc,$xd)", + "&mov (@x[$b2],@x[$b2],'ror#25')", + "&add ($xc_,$xc_,$xd_)", + "&mov (@x[$b3],@x[$b3],'ror#25')", + "&eor (@x[$b2],@x[$b2],$xc,'ror#25')", + "&eor (@x[$b3],@x[$b3],$xc_,'ror#25')" ); + + @ret; +} + +$code.=<<___; +#include <openssl/arm_arch.h> + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +#if defined(__thumb2__) || defined(__clang__) +#define ldrhsb ldrbhs +#endif + +.align 5 +.Lsigma: +.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral +.Lone: +.long 1,0,0,0 +#if __ARM_MAX_ARCH__>=7 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.LChaCha20_ctr32 +#else +.word -1 +#endif + +.globl ChaCha20_ctr32 +.type ChaCha20_ctr32,%function +.align 5 +ChaCha20_ctr32: +.LChaCha20_ctr32: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0-r2,r4-r11,lr} +#if __ARM_ARCH__<7 && !defined(__thumb2__) + sub r14,pc,#16 @ ChaCha20_ctr32 +#else + adr r14,.LChaCha20_ctr32 +#endif + cmp r2,#0 @ len==0? 
+#ifdef __thumb2__ + itt eq +#endif + addeq sp,sp,#4*3 + beq .Lno_data +#if __ARM_MAX_ARCH__>=7 + cmp r2,#192 @ test len + bls .Lshort + ldr r4,[r14,#-32] + ldr r4,[r14,r4] +# ifdef __APPLE__ + ldr r4,[r4] +# endif + tst r4,#ARMV7_NEON + bne .LChaCha20_neon +.Lshort: +#endif + ldmia r12,{r4-r7} @ load counter and nonce + sub sp,sp,#4*(16) @ off-load area + sub r14,r14,#64 @ .Lsigma + stmdb sp!,{r4-r7} @ copy counter and nonce + ldmia r3,{r4-r11} @ load key + ldmia r14,{r0-r3} @ load sigma + stmdb sp!,{r4-r11} @ copy key + stmdb sp!,{r0-r3} @ copy sigma + str r10,[sp,#4*(16+10)] @ off-load "@x[10]" + str r11,[sp,#4*(16+11)] @ off-load "@x[11]" + b .Loop_outer_enter + +.align 4 +.Loop_outer: + ldmia sp,{r0-r9} @ load key material + str @t[3],[sp,#4*(32+2)] @ save len + str r12, [sp,#4*(32+1)] @ save inp + str r14, [sp,#4*(32+0)] @ save out +.Loop_outer_enter: + ldr @t[3], [sp,#4*(15)] + ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load + ldr @t[2], [sp,#4*(13)] + ldr @x[14],[sp,#4*(14)] + str @t[3], [sp,#4*(16+15)] + mov @t[3],#10 + b .Loop + +.align 4 +.Loop: + subs @t[3],@t[3],#1 +___ + foreach (&ROUND(0, 4, 8,12)) { eval; } + foreach (&ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + bne .Loop + + ldr @t[3],[sp,#4*(32+2)] @ load len + + str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store + str @t[1], [sp,#4*(16+9)] + str @x[12],[sp,#4*(16+12)] + str @t[2], [sp,#4*(16+13)] + str @x[14],[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ @x[0-7] and second half at sp+4*(16+8) + + cmp @t[3],#64 @ done yet? +#ifdef __thumb2__ + itete lo +#endif + addlo r12,sp,#4*(0) @ shortcut or ... + ldrhs r12,[sp,#4*(32+1)] @ ... load inp + addlo r14,sp,#4*(0) @ shortcut or ... + ldrhs r14,[sp,#4*(32+0)] @ ... load out + + ldr @t[0],[sp,#4*(0)] @ load key material + ldr @t[1],[sp,#4*(1)] + +#if __ARM_ARCH__>=6 || !defined(__ARMEB__) +# if __ARM_ARCH__<7 + orr @t[2],r12,r14 + tst @t[2],#3 @ are input and output aligned? 
+ ldr @t[2],[sp,#4*(2)] + bne .Lunaligned + cmp @t[3],#64 @ restore flags +# else + ldr @t[2],[sp,#4*(2)] +# endif + ldr @t[3],[sp,#4*(3)] + + add @x[0],@x[0],@t[0] @ accumulate key material + add @x[1],@x[1],@t[1] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] + + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[0],@x[0],@t[0] @ xor with input + eorhs @x[1],@x[1],@t[1] + add @t[0],sp,#4*(4) + str @x[0],[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[2],@x[2],@t[2] + eorhs @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[1],[r14,#-12] + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@x[4],@t[0] @ accumulate key material + add @x[5],@x[5],@t[1] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] + add @x[6],@x[6],@t[2] + add @x[7],@x[7],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[4],@x[4],@t[0] + eorhs @x[5],@x[5],@t[1] + add @t[0],sp,#4*(8) + str @x[4],[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[6],@x[6],@t[2] + eorhs @x[7],@x[7],@t[3] + str @x[5],[r14,#-12] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[6],[r14,#-8] + add @x[0],sp,#4*(16+8) + str @x[7],[r14,#-4] + + ldmia @x[0],{@x[0]-@x[7]} @ load second half + + add @x[0],@x[0],@t[0] @ accumulate key material + add @x[1],@x[1],@t[1] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] +# ifdef __thumb2__ + itt hi +# endif + strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it + strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[0],@x[0],@t[0] + eorhs @x[1],@x[1],@t[1] + add @t[0],sp,#4*(12) + str @x[0],[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[2],@x[2],@t[2] + eorhs @x[3],@x[3],@t[3] + str @x[1],[r14,#-12] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@x[4],@t[0] @ accumulate key material + add @x[5],@x[5],@t[1] +# ifdef __thumb2__ + itt hi +# endif + addhi @t[0],@t[0],#1 @ next counter value + strhi @t[0],[sp,#4*(12)] @ save next counter value +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] + add @x[6],@x[6],@t[2] + add @x[7],@x[7],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[4],@x[4],@t[0] + eorhs @x[5],@x[5],@t[1] +# ifdef __thumb2__ + it ne +# endif + ldrne @t[0],[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + itt hs +# endif + eorhs 
@x[6],@x[6],@t[2] + eorhs @x[7],@x[7],@t[3] + str @x[4],[r14],#16 @ store output + str @x[5],[r14,#-12] +# ifdef __thumb2__ + it hs +# endif + subhs @t[3],@t[0],#64 @ len-=64 + str @x[6],[r14,#-8] + str @x[7],[r14,#-4] + bhi .Loop_outer + + beq .Ldone +# if __ARM_ARCH__<7 + b .Ltail + +.align 4 +.Lunaligned: @ unaligned endian-neutral path + cmp @t[3],#64 @ restore flags +# endif +#endif +#if __ARM_ARCH__<7 + ldr @t[3],[sp,#4*(3)] +___ +for ($i=0;$i<16;$i+=4) { +my $j=$i&0x7; + +$code.=<<___ if ($i==4); + add @x[0],sp,#4*(16+8) +___ +$code.=<<___ if ($i==8); + ldmia @x[0],{@x[0]-@x[7]} @ load second half +# ifdef __thumb2__ + itt hi +# endif + strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" + strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" +___ +$code.=<<___; + add @x[$j+0],@x[$j+0],@t[0] @ accumulate key material +___ +$code.=<<___ if ($i==12); +# ifdef __thumb2__ + itt hi +# endif + addhi @t[0],@t[0],#1 @ next counter value + strhi @t[0],[sp,#4*(12)] @ save next counter value +___ +$code.=<<___; + add @x[$j+1],@x[$j+1],@t[1] + add @x[$j+2],@x[$j+2],@t[2] +# ifdef __thumb2__ + itete lo +# endif + eorlo @t[0],@t[0],@t[0] @ zero or ... + ldrhsb @t[0],[r12],#16 @ ... load input + eorlo @t[1],@t[1],@t[1] + ldrhsb @t[1],[r12,#-12] + + add @x[$j+3],@x[$j+3],@t[3] +# ifdef __thumb2__ + itete lo +# endif + eorlo @t[2],@t[2],@t[2] + ldrhsb @t[2],[r12,#-8] + eorlo @t[3],@t[3],@t[3] + ldrhsb @t[3],[r12,#-4] + + eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero) + eor @x[$j+1],@t[1],@x[$j+1] +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[0],[r12,#-15] @ load more input + ldrhsb @t[1],[r12,#-11] + eor @x[$j+2],@t[2],@x[$j+2] + strb @x[$j+0],[r14],#16 @ store output + eor @x[$j+3],@t[3],@x[$j+3] +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[2],[r12,#-7] + ldrhsb @t[3],[r12,#-3] + strb @x[$j+1],[r14,#-12] + eor @x[$j+0],@t[0],@x[$j+0],lsr#8 + strb @x[$j+2],[r14,#-8] + eor @x[$j+1],@t[1],@x[$j+1],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[0],[r12,#-14] @ load more input + ldrhsb @t[1],[r12,#-10] + strb @x[$j+3],[r14,#-4] + eor @x[$j+2],@t[2],@x[$j+2],lsr#8 + strb @x[$j+0],[r14,#-15] + eor @x[$j+3],@t[3],@x[$j+3],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[2],[r12,#-6] + ldrhsb @t[3],[r12,#-2] + strb @x[$j+1],[r14,#-11] + eor @x[$j+0],@t[0],@x[$j+0],lsr#8 + strb @x[$j+2],[r14,#-7] + eor @x[$j+1],@t[1],@x[$j+1],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[0],[r12,#-13] @ load more input + ldrhsb @t[1],[r12,#-9] + strb @x[$j+3],[r14,#-3] + eor @x[$j+2],@t[2],@x[$j+2],lsr#8 + strb @x[$j+0],[r14,#-14] + eor @x[$j+3],@t[3],@x[$j+3],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[2],[r12,#-5] + ldrhsb @t[3],[r12,#-1] + strb @x[$j+1],[r14,#-10] + strb @x[$j+2],[r14,#-6] + eor @x[$j+0],@t[0],@x[$j+0],lsr#8 + strb @x[$j+3],[r14,#-2] + eor @x[$j+1],@t[1],@x[$j+1],lsr#8 + strb @x[$j+0],[r14,#-13] + eor @x[$j+2],@t[2],@x[$j+2],lsr#8 + strb @x[$j+1],[r14,#-9] + eor @x[$j+3],@t[3],@x[$j+3],lsr#8 + strb @x[$j+2],[r14,#-5] + strb @x[$j+3],[r14,#-1] +___ +$code.=<<___ if ($i<12); + add @t[0],sp,#4*(4+$i) + ldmia @t[0],{@t[0]-@t[3]} @ load key material +___ +} +$code.=<<___; +# ifdef __thumb2__ + it ne +# endif + ldrne @t[0],[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + it hs +# endif + subhs @t[3],@t[0],#64 @ len-=64 + bhi .Loop_outer + + beq .Ldone +#endif + +.Ltail: + ldr r12,[sp,#4*(32+1)] @ load inp + add @t[1],sp,#4*(0) + ldr r14,[sp,#4*(32+0)] @ load out + +.Loop_tail: + ldrb @t[2],[@t[1]],#1 @ read buffer on stack + ldrb @t[3],[r12],#1 @ read input + subs 
@t[0],@t[0],#1 + eor @t[3],@t[3],@t[2] + strb @t[3],[r14],#1 @ store output + bne .Loop_tail + +.Ldone: + add sp,sp,#4*(32+3) +.Lno_data: + ldmia sp!,{r4-r11,pc} +.size ChaCha20_ctr32,.-ChaCha20_ctr32 +___ + +{{{ +my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) = + map("q$_",(0..15)); + +sub NEONROUND { +my $odd = pop; +my ($a,$b,$c,$d,$t)=@_; + + ( + "&vadd_i32 ($a,$a,$b)", + "&veor ($d,$d,$a)", + "&vrev32_16 ($d,$d)", # vrot ($d,16) + + "&vadd_i32 ($c,$c,$d)", + "&veor ($t,$b,$c)", + "&vshr_u32 ($b,$t,20)", + "&vsli_32 ($b,$t,12)", + + "&vadd_i32 ($a,$a,$b)", + "&veor ($t,$d,$a)", + "&vshr_u32 ($d,$t,24)", + "&vsli_32 ($d,$t,8)", + + "&vadd_i32 ($c,$c,$d)", + "&veor ($t,$b,$c)", + "&vshr_u32 ($b,$t,25)", + "&vsli_32 ($b,$t,7)", + + "&vext_8 ($c,$c,$c,8)", + "&vext_8 ($b,$b,$b,$odd?12:4)", + "&vext_8 ($d,$d,$d,$odd?4:12)" + ); +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.type ChaCha20_neon,%function +.align 5 +ChaCha20_neon: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0-r2,r4-r11,lr} +.LChaCha20_neon: + adr r14,.Lsigma + vstmdb sp!,{d8-d15} @ ABI spec says so + stmdb sp!,{r0-r3} + + vld1.32 {$b0-$c0},[r3] @ load key + ldmia r3,{r4-r11} @ load key + + sub sp,sp,#4*(16+16) + vld1.32 {$d0},[r12] @ load counter and nonce + add r12,sp,#4*8 + ldmia r14,{r0-r3} @ load sigma + vld1.32 {$a0},[r14]! @ load sigma + vld1.32 {$t0},[r14] @ one + vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce + vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key + + str r10,[sp,#4*(16+10)] @ off-load "@x[10]" + str r11,[sp,#4*(16+11)] @ off-load "@x[11]" + vshl.i32 $t1#lo,$t0#lo,#1 @ two + vstr $t0#lo,[sp,#4*(16+0)] + vshl.i32 $t2#lo,$t0#lo,#2 @ four + vstr $t1#lo,[sp,#4*(16+2)] + vmov $a1,$a0 + vstr $t2#lo,[sp,#4*(16+4)] + vmov $a2,$a0 + vmov $b1,$b0 + vmov $b2,$b0 + b .Loop_neon_enter + +.align 4 +.Loop_neon_outer: + ldmia sp,{r0-r9} @ load key material + cmp @t[3],#64*2 @ if len<=64*2 + bls .Lbreak_neon @ switch to integer-only + vmov $a1,$a0 + str @t[3],[sp,#4*(32+2)] @ save len + vmov $a2,$a0 + str r12, [sp,#4*(32+1)] @ save inp + vmov $b1,$b0 + str r14, [sp,#4*(32+0)] @ save out + vmov $b2,$b0 +.Loop_neon_enter: + ldr @t[3], [sp,#4*(15)] + vadd.i32 $d1,$d0,$t0 @ counter+1 + ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load + vmov $c1,$c0 + ldr @t[2], [sp,#4*(13)] + vmov $c2,$c0 + ldr @x[14],[sp,#4*(14)] + vadd.i32 $d2,$d1,$t0 @ counter+2 + str @t[3], [sp,#4*(16+15)] + mov @t[3],#10 + add @x[12],@x[12],#3 @ counter+3 + b .Loop_neon + +.align 4 +.Loop_neon: + subs @t[3],@t[3],#1 +___ + my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0); + my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0); + my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0); + my @thread3=&ROUND(0,4,8,12); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } + + @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1); + @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1); + @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1); + @thread3=&ROUND(0,5,10,15); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } +$code.=<<___; + bne .Loop_neon + + add @t[3],sp,#32 + vld1.32 {$t0-$t1},[sp] @ load key material + vld1.32 {$t2-$t3},[@t[3]] + + ldr @t[3],[sp,#4*(32+2)] @ load len + + str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store + str @t[1], [sp,#4*(16+9)] + str @x[12],[sp,#4*(16+12)] + str @t[2], [sp,#4*(16+13)] + str 
@x[14],[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ @x[0-7] and second half at sp+4*(16+8) + + ldr r12,[sp,#4*(32+1)] @ load inp + ldr r14,[sp,#4*(32+0)] @ load out + + vadd.i32 $a0,$a0,$t0 @ accumulate key material + vadd.i32 $a1,$a1,$t0 + vadd.i32 $a2,$a2,$t0 + vldr $t0#lo,[sp,#4*(16+0)] @ one + + vadd.i32 $b0,$b0,$t1 + vadd.i32 $b1,$b1,$t1 + vadd.i32 $b2,$b2,$t1 + vldr $t1#lo,[sp,#4*(16+2)] @ two + + vadd.i32 $c0,$c0,$t2 + vadd.i32 $c1,$c1,$t2 + vadd.i32 $c2,$c2,$t2 + vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1 + vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2 + + vadd.i32 $d0,$d0,$t3 + vadd.i32 $d1,$d1,$t3 + vadd.i32 $d2,$d2,$t3 + + cmp @t[3],#64*4 + blo .Ltail_neon + + vld1.8 {$t0-$t1},[r12]! @ load input + mov @t[3],sp + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 @ xor with input + veor $b0,$b0,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a1,$a1,$t0 + vst1.8 {$a0-$b0},[r14]! @ store output + veor $b1,$b1,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c1,$c1,$t2 + vst1.8 {$c0-$d0},[r14]! + veor $d1,$d1,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a2,$a2,$t0 + vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration + veor $t0#hi,$t0#hi,$t0#hi + vldr $t0#lo,[sp,#4*(16+4)] @ four + veor $b2,$b2,$t1 + vld1.32 {$c0-$d0},[@t[3]] + veor $c2,$c2,$t2 + vst1.8 {$a1-$b1},[r14]! + veor $d2,$d2,$t3 + vst1.8 {$c1-$d1},[r14]! + + vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value + vldr $t0#lo,[sp,#4*(16+0)] @ one + + ldmia sp,{@t[0]-@t[3]} @ load key material + add @x[0],@x[0],@t[0] @ accumulate key material + ldr @t[0],[r12],#16 @ load input + vst1.8 {$a2-$b2},[r14]! + add @x[1],@x[1],@t[1] + ldr @t[1],[r12,#-12] + vst1.8 {$c2-$d2},[r14]! + add @x[2],@x[2],@t[2] + ldr @t[2],[r12,#-8] + add @x[3],@x[3],@t[3] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif + eor @x[0],@x[0],@t[0] @ xor with input + add @t[0],sp,#4*(4) + eor @x[1],@x[1],@t[1] + str @x[0],[r14],#16 @ store output + eor @x[2],@x[2],@t[2] + str @x[1],[r14,#-12] + eor @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@x[4],@t[0] @ accumulate key material + ldr @t[0],[r12],#16 @ load input + add @x[5],@x[5],@t[1] + ldr @t[1],[r12,#-12] + add @x[6],@x[6],@t[2] + ldr @t[2],[r12,#-8] + add @x[7],@x[7],@t[3] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + eor @x[4],@x[4],@t[0] + add @t[0],sp,#4*(8) + eor @x[5],@x[5],@t[1] + str @x[4],[r14],#16 @ store output + eor @x[6],@x[6],@t[2] + str @x[5],[r14,#-12] + eor @x[7],@x[7],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[6],[r14,#-8] + add @x[0],sp,#4*(16+8) + str @x[7],[r14,#-4] + + ldmia @x[0],{@x[0]-@x[7]} @ load second half + + add @x[0],@x[0],@t[0] @ accumulate key material + ldr @t[0],[r12],#16 @ load input + add @x[1],@x[1],@t[1] + ldr @t[1],[r12,#-12] +# ifdef __thumb2__ + it hi +# endif + strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it + add @x[2],@x[2],@t[2] + ldr @t[2],[r12,#-8] +# ifdef __thumb2__ + it hi +# endif + strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it + add @x[3],@x[3],@t[3] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif + eor @x[0],@x[0],@t[0] + add @t[0],sp,#4*(12) + eor @x[1],@x[1],@t[1] + str @x[0],[r14],#16 @ store output + eor @x[2],@x[2],@t[2] + str @x[1],[r14,#-12] + eor 
@x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@x[4],@t[0] @ accumulate key material + add @t[0],@t[0],#4 @ next counter value + add @x[5],@x[5],@t[1] + str @t[0],[sp,#4*(12)] @ save next counter value + ldr @t[0],[r12],#16 @ load input + add @x[6],@x[6],@t[2] + add @x[4],@x[4],#3 @ counter+3 + ldr @t[1],[r12,#-12] + add @x[7],@x[7],@t[3] + ldr @t[2],[r12,#-8] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + eor @x[4],@x[4],@t[0] +# ifdef __thumb2__ + it hi +# endif + ldrhi @t[0],[sp,#4*(32+2)] @ re-load len + eor @x[5],@x[5],@t[1] + eor @x[6],@x[6],@t[2] + str @x[4],[r14],#16 @ store output + eor @x[7],@x[7],@t[3] + str @x[5],[r14,#-12] + sub @t[3],@t[0],#64*4 @ len-=64*4 + str @x[6],[r14,#-8] + str @x[7],[r14,#-4] + bhi .Loop_neon_outer + + b .Ldone_neon + +.align 4 +.Lbreak_neon: + @ harmonize NEON and integer-only stack frames: load data + @ from NEON frame, but save to integer-only one; distance + @ between the two is 4*(32+4+16-32)=4*(20). + + str @t[3], [sp,#4*(20+32+2)] @ save len + add @t[3],sp,#4*(32+4) + str r12, [sp,#4*(20+32+1)] @ save inp + str r14, [sp,#4*(20+32+0)] @ save out + + ldr @x[12],[sp,#4*(16+10)] + ldr @x[14],[sp,#4*(16+11)] + vldmia @t[3],{d8-d15} @ fulfill ABI requirement + str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]" + str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]" + + ldr @t[3], [sp,#4*(15)] + ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load + ldr @t[2], [sp,#4*(13)] + ldr @x[14],[sp,#4*(14)] + str @t[3], [sp,#4*(20+16+15)] + add @t[3],sp,#4*(20) + vst1.32 {$a0-$b0},[@t[3]]! @ copy key + add sp,sp,#4*(20) @ switch frame + vst1.32 {$c0-$d0},[@t[3]] + mov @t[3],#10 + b .Loop @ go integer-only + +.align 4 +.Ltail_neon: + cmp @t[3],#64*3 + bhs .L192_or_more_neon + cmp @t[3],#64*2 + bhs .L128_or_more_neon + cmp @t[3],#64*1 + bhs .L64_or_more_neon + + add @t[0],sp,#4*(8) + vst1.8 {$a0-$b0},[sp] + add @t[2],sp,#4*(0) + vst1.8 {$c0-$d0},[@t[0]] + b .Loop_tail_neon + +.align 4 +.L64_or_more_neon: + vld1.8 {$t0-$t1},[r12]! + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 + veor $b0,$b0,$t1 + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vst1.8 {$a0-$b0},[r14]! + vst1.8 {$c0-$d0},[r14]! + + beq .Ldone_neon + + add @t[0],sp,#4*(8) + vst1.8 {$a1-$b1},[sp] + add @t[2],sp,#4*(0) + vst1.8 {$c1-$d1},[@t[0]] + sub @t[3],@t[3],#64*1 @ len-=64*1 + b .Loop_tail_neon + +.align 4 +.L128_or_more_neon: + vld1.8 {$t0-$t1},[r12]! + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 + veor $b0,$b0,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a1,$a1,$t0 + veor $b1,$b1,$t1 + vst1.8 {$a0-$b0},[r14]! + veor $c1,$c1,$t2 + vst1.8 {$c0-$d0},[r14]! + veor $d1,$d1,$t3 + vst1.8 {$a1-$b1},[r14]! + vst1.8 {$c1-$d1},[r14]! + + beq .Ldone_neon + + add @t[0],sp,#4*(8) + vst1.8 {$a2-$b2},[sp] + add @t[2],sp,#4*(0) + vst1.8 {$c2-$d2},[@t[0]] + sub @t[3],@t[3],#64*2 @ len-=64*2 + b .Loop_tail_neon + +.align 4 +.L192_or_more_neon: + vld1.8 {$t0-$t1},[r12]! + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 + veor $b0,$b0,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a1,$a1,$t0 + veor $b1,$b1,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c1,$c1,$t2 + vst1.8 {$a0-$b0},[r14]! + veor $d1,$d1,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a2,$a2,$t0 + vst1.8 {$c0-$d0},[r14]! + veor $b2,$b2,$t1 + vst1.8 {$a1-$b1},[r14]! + veor $c2,$c2,$t2 + vst1.8 {$c1-$d1},[r14]! 
+ veor $d2,$d2,$t3 + vst1.8 {$a2-$b2},[r14]! + vst1.8 {$c2-$d2},[r14]! + + beq .Ldone_neon + + ldmia sp,{@t[0]-@t[3]} @ load key material + add @x[0],@x[0],@t[0] @ accumulate key material + add @t[0],sp,#4*(4) + add @x[1],@x[1],@t[1] + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + + add @x[4],@x[4],@t[0] @ accumulate key material + add @t[0],sp,#4*(8) + add @x[5],@x[5],@t[1] + add @x[6],@x[6],@t[2] + add @x[7],@x[7],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + stmia sp,{@x[0]-@x[7]} + add @x[0],sp,#4*(16+8) + + ldmia @x[0],{@x[0]-@x[7]} @ load second half + + add @x[0],@x[0],@t[0] @ accumulate key material + add @t[0],sp,#4*(12) + add @x[1],@x[1],@t[1] + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + + add @x[4],@x[4],@t[0] @ accumulate key material + add @t[0],sp,#4*(8) + add @x[5],@x[5],@t[1] + add @x[4],@x[4],#3 @ counter+3 + add @x[6],@x[6],@t[2] + add @x[7],@x[7],@t[3] + ldr @t[3],[sp,#4*(32+2)] @ re-load len +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + stmia @t[0],{@x[0]-@x[7]} + add @t[2],sp,#4*(0) + sub @t[3],@t[3],#64*3 @ len-=64*3 + +.Loop_tail_neon: + ldrb @t[0],[@t[2]],#1 @ read buffer on stack + ldrb @t[1],[r12],#1 @ read input + subs @t[3],@t[3],#1 + eor @t[0],@t[0],@t[1] + strb @t[0],[r14],#1 @ store output + bne .Loop_tail_neon + +.Ldone_neon: + add sp,sp,#4*(32+4) + vldmia sp,{d8-d15} + add sp,sp,#4*(16+3) + ldmia sp!,{r4-r11,pc} +.size ChaCha20_neon,.-ChaCha20_neon +.comm OPENSSL_armcap_P,4,4 +#endif +___ +}}} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; + + print $_,"\n"; +} +close STDOUT; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/asm/chacha-armv8.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/asm/chacha-armv8.pl new file mode 100755 index 000000000..c2d04298a --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/asm/chacha-armv8.pl @@ -0,0 +1,1127 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# June 2015 +# +# ChaCha20 for ARMv8. +# +# Performance in cycles per byte out of large buffer. 
+# +# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU +# +# Apple A7 5.50/+49% 3.33 1.70 +# Cortex-A53 8.40/+80% 4.72 4.72(*) +# Cortex-A57 8.06/+43% 4.90 4.43(**) +# Denver 4.50/+82% 2.63 2.67(*) +# X-Gene 9.50/+46% 8.82 8.89(*) +# +# (*) it's expected that doubling interleave factor doesn't help +# all processors, only those with higher NEON latency and +# higher instruction issue rate; +# (**) expected improvement was actually higher; + +$flavour=shift; +$output=shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4)); + +my @x=map("x$_",(5..17,19..21)); +my @d=map("x$_",(22..28,30)); + +sub ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); + + ( + "&add_32 (@x[$a0],@x[$a0],@x[$b0])", + "&add_32 (@x[$a1],@x[$a1],@x[$b1])", + "&add_32 (@x[$a2],@x[$a2],@x[$b2])", + "&add_32 (@x[$a3],@x[$a3],@x[$b3])", + "&eor_32 (@x[$d0],@x[$d0],@x[$a0])", + "&eor_32 (@x[$d1],@x[$d1],@x[$a1])", + "&eor_32 (@x[$d2],@x[$d2],@x[$a2])", + "&eor_32 (@x[$d3],@x[$d3],@x[$a3])", + "&ror_32 (@x[$d0],@x[$d0],16)", + "&ror_32 (@x[$d1],@x[$d1],16)", + "&ror_32 (@x[$d2],@x[$d2],16)", + "&ror_32 (@x[$d3],@x[$d3],16)", + + "&add_32 (@x[$c0],@x[$c0],@x[$d0])", + "&add_32 (@x[$c1],@x[$c1],@x[$d1])", + "&add_32 (@x[$c2],@x[$c2],@x[$d2])", + "&add_32 (@x[$c3],@x[$c3],@x[$d3])", + "&eor_32 (@x[$b0],@x[$b0],@x[$c0])", + "&eor_32 (@x[$b1],@x[$b1],@x[$c1])", + "&eor_32 (@x[$b2],@x[$b2],@x[$c2])", + "&eor_32 (@x[$b3],@x[$b3],@x[$c3])", + "&ror_32 (@x[$b0],@x[$b0],20)", + "&ror_32 (@x[$b1],@x[$b1],20)", + "&ror_32 (@x[$b2],@x[$b2],20)", + "&ror_32 (@x[$b3],@x[$b3],20)", + + "&add_32 (@x[$a0],@x[$a0],@x[$b0])", + "&add_32 (@x[$a1],@x[$a1],@x[$b1])", + "&add_32 (@x[$a2],@x[$a2],@x[$b2])", + "&add_32 (@x[$a3],@x[$a3],@x[$b3])", + "&eor_32 (@x[$d0],@x[$d0],@x[$a0])", + "&eor_32 (@x[$d1],@x[$d1],@x[$a1])", + "&eor_32 (@x[$d2],@x[$d2],@x[$a2])", + "&eor_32 (@x[$d3],@x[$d3],@x[$a3])", + "&ror_32 (@x[$d0],@x[$d0],24)", + "&ror_32 (@x[$d1],@x[$d1],24)", + "&ror_32 (@x[$d2],@x[$d2],24)", + "&ror_32 (@x[$d3],@x[$d3],24)", + + "&add_32 (@x[$c0],@x[$c0],@x[$d0])", + "&add_32 (@x[$c1],@x[$c1],@x[$d1])", + "&add_32 (@x[$c2],@x[$c2],@x[$d2])", + "&add_32 (@x[$c3],@x[$c3],@x[$d3])", + "&eor_32 (@x[$b0],@x[$b0],@x[$c0])", + "&eor_32 (@x[$b1],@x[$b1],@x[$c1])", + "&eor_32 (@x[$b2],@x[$b2],@x[$c2])", + "&eor_32 (@x[$b3],@x[$b3],@x[$c3])", + "&ror_32 (@x[$b0],@x[$b0],25)", + "&ror_32 (@x[$b1],@x[$b1],25)", + "&ror_32 (@x[$b2],@x[$b2],25)", + "&ror_32 (@x[$b3],@x[$b3],25)" + ); +} + +$code.=<<___; +#include <openssl/arm_arch.h> + +.text + +.extern OPENSSL_armcap_P + +.align 5 +.Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +.Lone: +.long 1,0,0,0 +.LOPENSSL_armcap_P: +#ifdef __ILP32__ +.long OPENSSL_armcap_P-. +#else +.quad OPENSSL_armcap_P-. 
+#endif +.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" + +.globl ChaCha20_ctr32 +.type ChaCha20_ctr32,%function +.align 5 +ChaCha20_ctr32: + cbz $len,.Labort + adr @x[0],.LOPENSSL_armcap_P + cmp $len,#192 + b.lo .Lshort +#ifdef __ILP32__ + ldrsw @x[1],[@x[0]] +#else + ldr @x[1],[@x[0]] +#endif + ldr w17,[@x[1],@x[0]] + tst w17,#ARMV7_NEON + b.ne ChaCha20_neon + +.Lshort: + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adr @x[0],.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp @d[0],@d[1],[@x[0]] // load sigma + ldp @d[2],@d[3],[$key] // load key + ldp @d[4],@d[5],[$key,#16] + ldp @d[6],@d[7],[$ctr] // load counter +#ifdef __ARMEB__ + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 + ror @d[4],@d[4],#32 + ror @d[5],@d[5],#32 + ror @d[6],@d[6],#32 + ror @d[7],@d[7],#32 +#endif + +.Loop_outer: + mov.32 @x[0],@d[0] // unpack key block + lsr @x[1],@d[0],#32 + mov.32 @x[2],@d[1] + lsr @x[3],@d[1],#32 + mov.32 @x[4],@d[2] + lsr @x[5],@d[2],#32 + mov.32 @x[6],@d[3] + lsr @x[7],@d[3],#32 + mov.32 @x[8],@d[4] + lsr @x[9],@d[4],#32 + mov.32 @x[10],@d[5] + lsr @x[11],@d[5],#32 + mov.32 @x[12],@d[6] + lsr @x[13],@d[6],#32 + mov.32 @x[14],@d[7] + lsr @x[15],@d[7],#32 + + mov $ctr,#10 + subs $len,$len,#64 +.Loop: + sub $ctr,$ctr,#1 +___ + foreach (&ROUND(0, 4, 8,12)) { eval; } + foreach (&ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + cbnz $ctr,.Loop + + add.32 @x[0],@x[0],@d[0] // accumulate key block + add @x[1],@x[1],@d[0],lsr#32 + add.32 @x[2],@x[2],@d[1] + add @x[3],@x[3],@d[1],lsr#32 + add.32 @x[4],@x[4],@d[2] + add @x[5],@x[5],@d[2],lsr#32 + add.32 @x[6],@x[6],@d[3] + add @x[7],@x[7],@d[3],lsr#32 + add.32 @x[8],@x[8],@d[4] + add @x[9],@x[9],@d[4],lsr#32 + add.32 @x[10],@x[10],@d[5] + add @x[11],@x[11],@d[5],lsr#32 + add.32 @x[12],@x[12],@d[6] + add @x[13],@x[13],@d[6],lsr#32 + add.32 @x[14],@x[14],@d[7] + add @x[15],@x[15],@d[7],lsr#32 + + b.lo .Ltail + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor @x[10],@x[10],@x[11] + eor @x[12],@x[12],@x[13] + eor @x[14],@x[14],@x[15] + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#1 // increment counter + stp @x[4],@x[6],[$out,#16] + stp @x[8],@x[10],[$out,#32] + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + + b.hi .Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +.Labort: + ret + +.align 4 +.Ltail: + add $len,$len,#64 +.Less_than_64: + sub $out,$out,#1 + add $inp,$inp,$len + add $out,$out,$len + add $ctr,sp,$len + neg $len,$len + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + add 
@x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 +#ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + stp @x[0],@x[2],[sp,#0] + stp @x[4],@x[6],[sp,#16] + stp @x[8],@x[10],[sp,#32] + stp @x[12],@x[14],[sp,#48] + +.Loop_tail: + ldrb w10,[$inp,$len] + ldrb w11,[$ctr,$len] + add $len,$len,#1 + eor w10,w10,w11 + strb w10,[$out,$len] + cbnz $len,.Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret +.size ChaCha20_ctr32,.-ChaCha20_ctr32 +___ + +{{{ +my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) = + map("v$_.4s",(0..7,16..23)); +my (@K)=map("v$_.4s",(24..30)); +my $ONE="v31.4s"; + +sub NEONROUND { +my $odd = pop; +my ($a,$b,$c,$d,$t)=@_; + + ( + "&add ('$a','$a','$b')", + "&eor ('$d','$d','$a')", + "&rev32_16 ('$d','$d')", # vrot ($d,16) + + "&add ('$c','$c','$d')", + "&eor ('$t','$b','$c')", + "&ushr ('$b','$t',20)", + "&sli ('$b','$t',12)", + + "&add ('$a','$a','$b')", + "&eor ('$t','$d','$a')", + "&ushr ('$d','$t',24)", + "&sli ('$d','$t',8)", + + "&add ('$c','$c','$d')", + "&eor ('$t','$b','$c')", + "&ushr ('$b','$t',25)", + "&sli ('$b','$t',7)", + + "&ext ('$c','$c','$c',8)", + "&ext ('$d','$d','$d',$odd?4:12)", + "&ext ('$b','$b','$b',$odd?12:4)" + ); +} + +$code.=<<___; + +.type ChaCha20_neon,%function +.align 5 +ChaCha20_neon: + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adr @x[0],.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp $len,#512 + b.hs .L512_or_more_neon + + sub sp,sp,#64 + + ldp @d[0],@d[1],[@x[0]] // load sigma + ld1 {@K[0]},[@x[0]],#16 + ldp @d[2],@d[3],[$key] // load key + ldp @d[4],@d[5],[$key,#16] + ld1 {@K[1],@K[2]},[$key] + ldp @d[6],@d[7],[$ctr] // load counter + ld1 {@K[3]},[$ctr] + ld1 {$ONE},[@x[0]] +#ifdef __ARMEB__ + rev64 @K[0],@K[0] + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 + ror @d[4],@d[4],#32 + ror @d[5],@d[5],#32 + ror @d[6],@d[6],#32 + ror @d[7],@d[7],#32 +#endif + add @K[3],@K[3],$ONE // += 1 + add @K[4],@K[3],$ONE + add @K[5],@K[4],$ONE + shl $ONE,$ONE,#2 // 1 -> 4 + +.Loop_outer_neon: + mov.32 @x[0],@d[0] // unpack key block + lsr @x[1],@d[0],#32 + mov $A0,@K[0] + mov.32 @x[2],@d[1] + lsr @x[3],@d[1],#32 + mov $A1,@K[0] + mov.32 @x[4],@d[2] + lsr @x[5],@d[2],#32 + mov $A2,@K[0] + mov.32 @x[6],@d[3] + mov $B0,@K[1] + lsr @x[7],@d[3],#32 + mov $B1,@K[1] + mov.32 @x[8],@d[4] + mov $B2,@K[1] + lsr @x[9],@d[4],#32 + mov $D0,@K[3] + mov.32 @x[10],@d[5] + mov $D1,@K[4] + lsr @x[11],@d[5],#32 + mov $D2,@K[5] + mov.32 @x[12],@d[6] + mov $C0,@K[2] + lsr @x[13],@d[6],#32 + mov $C1,@K[2] + mov.32 @x[14],@d[7] + mov $C2,@K[2] + lsr @x[15],@d[7],#32 + + mov $ctr,#10 + subs $len,$len,#256 +.Loop_neon: + sub $ctr,$ctr,#1 +___ + my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); + my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); + my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); + my @thread3=&ROUND(0,4,8,12); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } + + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); + 
@thread3=&ROUND(0,5,10,15); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } +$code.=<<___; + cbnz $ctr,.Loop_neon + + add.32 @x[0],@x[0],@d[0] // accumulate key block + add $A0,$A0,@K[0] + add @x[1],@x[1],@d[0],lsr#32 + add $A1,$A1,@K[0] + add.32 @x[2],@x[2],@d[1] + add $A2,$A2,@K[0] + add @x[3],@x[3],@d[1],lsr#32 + add $C0,$C0,@K[2] + add.32 @x[4],@x[4],@d[2] + add $C1,$C1,@K[2] + add @x[5],@x[5],@d[2],lsr#32 + add $C2,$C2,@K[2] + add.32 @x[6],@x[6],@d[3] + add $D0,$D0,@K[3] + add @x[7],@x[7],@d[3],lsr#32 + add.32 @x[8],@x[8],@d[4] + add $D1,$D1,@K[4] + add @x[9],@x[9],@d[4],lsr#32 + add.32 @x[10],@x[10],@d[5] + add $D2,$D2,@K[5] + add @x[11],@x[11],@d[5],lsr#32 + add.32 @x[12],@x[12],@d[6] + add $B0,$B0,@K[1] + add @x[13],@x[13],@d[6],lsr#32 + add.32 @x[14],@x[14],@d[7] + add $B1,$B1,@K[1] + add @x[15],@x[15],@d[7],lsr#32 + add $B2,$B2,@K[1] + + b.lo .Ltail_neon + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + ld1.8 {$T0-$T3},[$inp],#64 + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor $A0,$A0,$T0 + eor @x[10],@x[10],@x[11] + eor $B0,$B0,$T1 + eor @x[12],@x[12],@x[13] + eor $C0,$C0,$T2 + eor @x[14],@x[14],@x[15] + eor $D0,$D0,$T3 + ld1.8 {$T0-$T3},[$inp],#64 + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#4 // increment counter + stp @x[4],@x[6],[$out,#16] + add @K[3],@K[3],$ONE // += 4 + stp @x[8],@x[10],[$out,#32] + add @K[4],@K[4],$ONE + stp @x[12],@x[14],[$out,#48] + add @K[5],@K[5],$ONE + add $out,$out,#64 + + st1.8 {$A0-$D0},[$out],#64 + ld1.8 {$A0-$D0},[$inp],#64 + + eor $A1,$A1,$T0 + eor $B1,$B1,$T1 + eor $C1,$C1,$T2 + eor $D1,$D1,$T3 + st1.8 {$A1-$D1},[$out],#64 + + eor $A2,$A2,$A0 + eor $B2,$B2,$B0 + eor $C2,$C2,$C0 + eor $D2,$D2,$D0 + st1.8 {$A2-$D2},[$out],#64 + + b.hi .Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret + +.Ltail_neon: + add $len,$len,#256 + cmp $len,#64 + b.lo .Less_than_64 + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor @x[10],@x[10],@x[11] + eor @x[12],@x[12],@x[13] + eor @x[14],@x[14],@x[15] + + stp @x[0],@x[2],[$out,#0] // store 
output + add @d[6],@d[6],#4 // increment counter + stp @x[4],@x[6],[$out,#16] + stp @x[8],@x[10],[$out,#32] + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + b.eq .Ldone_neon + sub $len,$len,#64 + cmp $len,#64 + b.lo .Less_than_128 + + ld1.8 {$T0-$T3},[$inp],#64 + eor $A0,$A0,$T0 + eor $B0,$B0,$T1 + eor $C0,$C0,$T2 + eor $D0,$D0,$T3 + st1.8 {$A0-$D0},[$out],#64 + b.eq .Ldone_neon + sub $len,$len,#64 + cmp $len,#64 + b.lo .Less_than_192 + + ld1.8 {$T0-$T3},[$inp],#64 + eor $A1,$A1,$T0 + eor $B1,$B1,$T1 + eor $C1,$C1,$T2 + eor $D1,$D1,$T3 + st1.8 {$A1-$D1},[$out],#64 + b.eq .Ldone_neon + sub $len,$len,#64 + + st1.8 {$A2-$D2},[sp] + b .Last_neon + +.Less_than_128: + st1.8 {$A0-$D0},[sp] + b .Last_neon +.Less_than_192: + st1.8 {$A1-$D1},[sp] + b .Last_neon + +.align 4 +.Last_neon: + sub $out,$out,#1 + add $inp,$inp,$len + add $out,$out,$len + add $ctr,sp,$len + neg $len,$len + +.Loop_tail_neon: + ldrb w10,[$inp,$len] + ldrb w11,[$ctr,$len] + add $len,$len,#1 + eor w10,w10,w11 + strb w10,[$out,$len] + cbnz $len,.Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +.Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret +.size ChaCha20_neon,.-ChaCha20_neon +___ +{ +my ($T0,$T1,$T2,$T3,$T4,$T5)=@K; +my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2, + $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23)); + +$code.=<<___; +.type ChaCha20_512_neon,%function +.align 5 +ChaCha20_512_neon: + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adr @x[0],.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +.L512_or_more_neon: + sub sp,sp,#128+64 + + ldp @d[0],@d[1],[@x[0]] // load sigma + ld1 {@K[0]},[@x[0]],#16 + ldp @d[2],@d[3],[$key] // load key + ldp @d[4],@d[5],[$key,#16] + ld1 {@K[1],@K[2]},[$key] + ldp @d[6],@d[7],[$ctr] // load counter + ld1 {@K[3]},[$ctr] + ld1 {$ONE},[@x[0]] +#ifdef __ARMEB__ + rev64 @K[0],@K[0] + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 + ror @d[4],@d[4],#32 + ror @d[5],@d[5],#32 + ror @d[6],@d[6],#32 + ror @d[7],@d[7],#32 +#endif + add @K[3],@K[3],$ONE // += 1 + stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part + add @K[3],@K[3],$ONE // not typo + str @K[2],[sp,#32] + add @K[4],@K[3],$ONE + add @K[5],@K[4],$ONE + add @K[6],@K[5],$ONE + shl $ONE,$ONE,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub $len,$len,#512 // not typo + +.Loop_outer_512_neon: + mov $A0,@K[0] + mov $A1,@K[0] + mov $A2,@K[0] + mov $A3,@K[0] + mov $A4,@K[0] + mov $A5,@K[0] + mov $B0,@K[1] + mov.32 @x[0],@d[0] // unpack key block + mov $B1,@K[1] + lsr @x[1],@d[0],#32 + mov $B2,@K[1] + mov.32 @x[2],@d[1] + mov $B3,@K[1] + lsr @x[3],@d[1],#32 + mov $B4,@K[1] + mov.32 @x[4],@d[2] + mov $B5,@K[1] + lsr @x[5],@d[2],#32 + mov $D0,@K[3] + mov.32 @x[6],@d[3] + mov $D1,@K[4] + lsr @x[7],@d[3],#32 + mov $D2,@K[5] + mov.32 @x[8],@d[4] + mov $D3,@K[6] + lsr @x[9],@d[4],#32 + mov $C0,@K[2] + mov.32 @x[10],@d[5] + mov $C1,@K[2] + lsr @x[11],@d[5],#32 + add $D4,$D0,$ONE // +4 + mov.32 @x[12],@d[6] + add $D5,$D1,$ONE // +4 + lsr @x[13],@d[6],#32 + mov $C2,@K[2] + mov.32 @x[14],@d[7] + mov $C3,@K[2] + lsr @x[15],@d[7],#32 + mov $C4,@K[2] + stp @K[3],@K[4],[sp,#48] // off-load key block, variable part + mov $C5,@K[2] + str @K[5],[sp,#80] + + mov 
$ctr,#5 + subs $len,$len,#512 +.Loop_upper_neon: + sub $ctr,$ctr,#1 +___ + my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); + my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); + my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); + my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); + my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); + my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); + my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + my $diff = ($#thread0+1)*6 - $#thread67 - 1; + my $i = 0; + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } + + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); + @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); + @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); + @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); + @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } +$code.=<<___; + cbnz $ctr,.Loop_upper_neon + + add.32 @x[0],@x[0],@d[0] // accumulate key block + add @x[1],@x[1],@d[0],lsr#32 + add.32 @x[2],@x[2],@d[1] + add @x[3],@x[3],@d[1],lsr#32 + add.32 @x[4],@x[4],@d[2] + add @x[5],@x[5],@d[2],lsr#32 + add.32 @x[6],@x[6],@d[3] + add @x[7],@x[7],@d[3],lsr#32 + add.32 @x[8],@x[8],@d[4] + add @x[9],@x[9],@d[4],lsr#32 + add.32 @x[10],@x[10],@d[5] + add @x[11],@x[11],@d[5],lsr#32 + add.32 @x[12],@x[12],@d[6] + add @x[13],@x[13],@d[6],lsr#32 + add.32 @x[14],@x[14],@d[7] + add @x[15],@x[15],@d[7],lsr#32 + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor @x[10],@x[10],@x[11] + eor @x[12],@x[12],@x[13] + eor @x[14],@x[14],@x[15] + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#1 // increment counter + mov.32 @x[0],@d[0] // unpack key block + lsr @x[1],@d[0],#32 + stp @x[4],@x[6],[$out,#16] + mov.32 @x[2],@d[1] + lsr @x[3],@d[1],#32 + stp @x[8],@x[10],[$out,#32] + mov.32 @x[4],@d[2] + lsr @x[5],@d[2],#32 + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + mov.32 @x[6],@d[3] + lsr @x[7],@d[3],#32 + mov.32 @x[8],@d[4] + lsr @x[9],@d[4],#32 + mov.32 @x[10],@d[5] + lsr @x[11],@d[5],#32 + mov.32 @x[12],@d[6] + lsr @x[13],@d[6],#32 + mov.32 @x[14],@d[7] + lsr @x[15],@d[7],#32 + + mov $ctr,#5 +.Loop_lower_neon: + sub $ctr,$ctr,#1 +___ + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); + 
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); + @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); + @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); + @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } + + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); + @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); + @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); + @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); + @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } +$code.=<<___; + cbnz $ctr,.Loop_lower_neon + + add.32 @x[0],@x[0],@d[0] // accumulate key block + ldp @K[0],@K[1],[sp,#0] + add @x[1],@x[1],@d[0],lsr#32 + ldp @K[2],@K[3],[sp,#32] + add.32 @x[2],@x[2],@d[1] + ldp @K[4],@K[5],[sp,#64] + add @x[3],@x[3],@d[1],lsr#32 + add $A0,$A0,@K[0] + add.32 @x[4],@x[4],@d[2] + add $A1,$A1,@K[0] + add @x[5],@x[5],@d[2],lsr#32 + add $A2,$A2,@K[0] + add.32 @x[6],@x[6],@d[3] + add $A3,$A3,@K[0] + add @x[7],@x[7],@d[3],lsr#32 + add $A4,$A4,@K[0] + add.32 @x[8],@x[8],@d[4] + add $A5,$A5,@K[0] + add @x[9],@x[9],@d[4],lsr#32 + add $C0,$C0,@K[2] + add.32 @x[10],@x[10],@d[5] + add $C1,$C1,@K[2] + add @x[11],@x[11],@d[5],lsr#32 + add $C2,$C2,@K[2] + add.32 @x[12],@x[12],@d[6] + add $C3,$C3,@K[2] + add @x[13],@x[13],@d[6],lsr#32 + add $C4,$C4,@K[2] + add.32 @x[14],@x[14],@d[7] + add $C5,$C5,@K[2] + add @x[15],@x[15],@d[7],lsr#32 + add $D4,$D4,$ONE // +4 + add @x[0],@x[0],@x[1],lsl#32 // pack + add $D5,$D5,$ONE // +4 + add @x[2],@x[2],@x[3],lsl#32 + add $D0,$D0,@K[3] + ldp @x[1],@x[3],[$inp,#0] // load input + add $D1,$D1,@K[4] + add @x[4],@x[4],@x[5],lsl#32 + add $D2,$D2,@K[5] + add @x[6],@x[6],@x[7],lsl#32 + add $D3,$D3,@K[6] + ldp @x[5],@x[7],[$inp,#16] + add $D4,$D4,@K[3] + add @x[8],@x[8],@x[9],lsl#32 + add $D5,$D5,@K[4] + add @x[10],@x[10],@x[11],lsl#32 + add $B0,$B0,@K[1] + ldp @x[9],@x[11],[$inp,#32] + add $B1,$B1,@K[1] + add @x[12],@x[12],@x[13],lsl#32 + add $B2,$B2,@K[1] + add @x[14],@x[14],@x[15],lsl#32 + add $B3,$B3,@K[1] + ldp @x[13],@x[15],[$inp,#48] + add $B4,$B4,@K[1] + add $inp,$inp,#64 + add $B5,$B5,@K[1] + +#ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + ld1.8 {$T0-$T3},[$inp],#64 + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor $A0,$A0,$T0 + eor @x[10],@x[10],@x[11] + eor $B0,$B0,$T1 + eor @x[12],@x[12],@x[13] + eor $C0,$C0,$T2 + eor @x[14],@x[14],@x[15] + eor $D0,$D0,$T3 + ld1.8 {$T0-$T3},[$inp],#64 + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#7 // increment counter + stp @x[4],@x[6],[$out,#16] + stp @x[8],@x[10],[$out,#32] + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + st1.8 {$A0-$D0},[$out],#64 + + ld1.8 {$A0-$D0},[$inp],#64 + eor $A1,$A1,$T0 + eor $B1,$B1,$T1 + eor $C1,$C1,$T2 + eor $D1,$D1,$T3 + st1.8 
{$A1-$D1},[$out],#64 + + ld1.8 {$A1-$D1},[$inp],#64 + eor $A2,$A2,$A0 + ldp @K[0],@K[1],[sp,#0] + eor $B2,$B2,$B0 + ldp @K[2],@K[3],[sp,#32] + eor $C2,$C2,$C0 + eor $D2,$D2,$D0 + st1.8 {$A2-$D2},[$out],#64 + + ld1.8 {$A2-$D2},[$inp],#64 + eor $A3,$A3,$A1 + eor $B3,$B3,$B1 + eor $C3,$C3,$C1 + eor $D3,$D3,$D1 + st1.8 {$A3-$D3},[$out],#64 + + ld1.8 {$A3-$D3},[$inp],#64 + eor $A4,$A4,$A2 + eor $B4,$B4,$B2 + eor $C4,$C4,$C2 + eor $D4,$D4,$D2 + st1.8 {$A4-$D4},[$out],#64 + + shl $A0,$ONE,#1 // 4 -> 8 + eor $A5,$A5,$A3 + eor $B5,$B5,$B3 + eor $C5,$C5,$C3 + eor $D5,$D5,$D3 + st1.8 {$A5-$D5},[$out],#64 + + add @K[3],@K[3],$A0 // += 8 + add @K[4],@K[4],$A0 + add @K[5],@K[5],$A0 + add @K[6],@K[6],$A0 + + b.hs .Loop_outer_512_neon + + adds $len,$len,#512 + ushr $A0,$ONE,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp @K[0],$ONE,[sp,#0] // wipe off-load area + stp @K[0],$ONE,[sp,#32] + stp @K[0],$ONE,[sp,#64] + + b.eq .Ldone_512_neon + + cmp $len,#192 + sub @K[3],@K[3],$A0 // -= 1 + sub @K[4],@K[4],$A0 + sub @K[5],@K[5],$A0 + add sp,sp,#128 + b.hs .Loop_outer_neon + + eor @K[1],@K[1],@K[1] + eor @K[2],@K[2],@K[2] + eor @K[3],@K[3],@K[3] + eor @K[4],@K[4],@K[4] + eor @K[5],@K[5],@K[5] + eor @K[6],@K[6],@K[6] + b .Loop_outer + +.Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret +.size ChaCha20_512_neon,.-ChaCha20_512_neon +___ +} +}}} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or + (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or + (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or + (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or + (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1)); + + #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; + + print $_,"\n"; +} +close STDOUT; # flush diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/asm/chacha-x86.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/asm/chacha-x86.pl new file mode 100755 index 000000000..f8bbb76df --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/asm/chacha-x86.pl @@ -0,0 +1,765 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# January 2015 +# +# ChaCha20 for x86. +# +# Performance in cycles per byte out of large buffer. +# +# 1xIALU/gcc 4xSSSE3 +# Pentium 17.5/+80% +# PIII 14.2/+60% +# P4 18.6/+84% +# Core2 9.56/+89% 4.83 +# Westmere 9.50/+45% 3.35 +# Sandy Bridge 10.5/+47% 3.20 +# Haswell 8.15/+50% 2.83 +# Skylake 7.53/+22% 2.75 +# Silvermont 17.4/+36% 8.35 +# Goldmont 13.4/+40% 4.36 +# Sledgehammer 10.2/+54% +# Bulldozer 13.4/+50% 4.38(*) +# +# (*) Bulldozer actually executes 4xXOP code path that delivers 3.55; +# +# Modified from upstream OpenSSL to remove the XOP code. 
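The x86 script body that follows (like the ARM ROUND/NEONROUND generators above) schedules the standard ChaCha20 column/diagonal double round. Here is a hedged, self-contained C sketch of that schedule, for orientation only; the `QR` and `chacha_double_round` names are illustrative and not identifiers from this patch. The index pairs are exactly the "even round" / "odd round" tables that appear in the QUARTERROUND and ROUND comments.

```c
#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* One quarter round over four words of the 4x4 ChaCha20 state. */
#define QR(a, b, c, d)                          \
    do {                                        \
        a += b; d ^= a; d = ROTL32(d, 16);      \
        c += d; b ^= c; b = ROTL32(b, 12);      \
        a += b; d ^= a; d = ROTL32(d,  8);      \
        c += d; b ^= c; b = ROTL32(b,  7);      \
    } while (0)

/* One double round: same index pattern as the "even round" /
 * "odd round" tables in the perlasm comments. */
static void chacha_double_round(uint32_t x[16]) {
    QR(x[0], x[4], x[ 8], x[12]);   /* even (column) round  */
    QR(x[1], x[5], x[ 9], x[13]);
    QR(x[2], x[6], x[10], x[14]);
    QR(x[3], x[7], x[11], x[15]);
    QR(x[0], x[5], x[10], x[15]);   /* odd (diagonal) round */
    QR(x[1], x[6], x[11], x[12]);
    QR(x[2], x[7], x[ 8], x[13]);
    QR(x[3], x[4], x[ 9], x[14]);
}
```

Each implementation in this patch runs ten such double rounds per 64-byte block (the `mov @t[3],#10`, `mov $ctr,#10` and `&mov ($b,10)` loop counters) and then adds the original input words back in — the "accumulate key block" / "accumulate key material" steps — before XORing the resulting keystream with the input.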
+ +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],"chacha-x86.pl",$ARGV[$#ARGV] eq "386"); + +$xmm=$ymm=1; +$gasver=999; # enable everything + +$a="eax"; +($b,$b_)=("ebx","ebp"); +($c,$c_)=("ecx","esi"); +($d,$d_)=("edx","edi"); + +sub QUARTERROUND { +my ($ai,$bi,$ci,$di,$i)=@_; +my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next +my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous + + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + + if ($i==0) { + my $j=4; + ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp)); + } elsif ($i==3) { + my $j=0; + ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn)); + } elsif ($i==4) { + my $j=4; + ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp)); + } elsif ($i==7) { + my $j=0; + ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn)); + } + + #&add ($a,$b); # see elsewhere + &xor ($d,$a); + &mov (&DWP(4*$cp,"esp"),$c_) if ($ai>0 && $ai<3); + &rol ($d,16); + &mov (&DWP(4*$bp,"esp"),$b_) if ($i!=0); + &add ($c,$d); + &mov ($c_,&DWP(4*$cn,"esp")) if ($ai>0 && $ai<3); + &xor ($b,$c); + &mov ($d_,&DWP(4*$dn,"esp")) if ($di!=$dn); + &rol ($b,12); + &mov ($b_,&DWP(4*$bn,"esp")) if ($i<7); + &mov ($b_,&DWP(128,"esp")) if ($i==7); # loop counter + &add ($a,$b); + &xor ($d,$a); + &mov (&DWP(4*$ai,"esp"),$a); + &rol ($d,8); + &mov ($a,&DWP(4*$an,"esp")); + &add ($c,$d); + &mov (&DWP(4*$di,"esp"),$d) if ($di!=$dn); + &mov ($d_,$d) if ($di==$dn); + &xor ($b,$c); + &add ($a,$b_) if ($i<7); # elsewhere + &rol ($b,7); + + ($b,$b_)=($b_,$b); + ($c,$c_)=($c_,$c); + ($d,$d_)=($d_,$d); +} + +&static_label("ssse3_shortcut"); +&static_label("ssse3_data"); +&static_label("pic_point"); + +&function_begin("ChaCha20_ctr32"); + &xor ("eax","eax"); + &cmp ("eax",&wparam(2)); # len==0? 
+ &je (&label("no_data")); +if ($xmm) { + &call (&label("pic_point")); +&set_label("pic_point"); + &blindpop("eax"); + &picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point")); + &test (&DWP(0,"ebp"),1<<24); # test FXSR bit + &jz (&label("x86")); + &test (&DWP(4,"ebp"),1<<9); # test SSSE3 bit + &jz (&label("x86")); + &jmp (&label("ssse3_shortcut")); +&set_label("x86"); +} + &mov ("esi",&wparam(3)); # key + &mov ("edi",&wparam(4)); # counter and nonce + + &stack_push(33); + + &mov ("eax",&DWP(4*0,"esi")); # copy key + &mov ("ebx",&DWP(4*1,"esi")); + &mov ("ecx",&DWP(4*2,"esi")); + &mov ("edx",&DWP(4*3,"esi")); + &mov (&DWP(64+4*4,"esp"),"eax"); + &mov (&DWP(64+4*5,"esp"),"ebx"); + &mov (&DWP(64+4*6,"esp"),"ecx"); + &mov (&DWP(64+4*7,"esp"),"edx"); + &mov ("eax",&DWP(4*4,"esi")); + &mov ("ebx",&DWP(4*5,"esi")); + &mov ("ecx",&DWP(4*6,"esi")); + &mov ("edx",&DWP(4*7,"esi")); + &mov (&DWP(64+4*8,"esp"),"eax"); + &mov (&DWP(64+4*9,"esp"),"ebx"); + &mov (&DWP(64+4*10,"esp"),"ecx"); + &mov (&DWP(64+4*11,"esp"),"edx"); + &mov ("eax",&DWP(4*0,"edi")); # copy counter and nonce + &mov ("ebx",&DWP(4*1,"edi")); + &mov ("ecx",&DWP(4*2,"edi")); + &mov ("edx",&DWP(4*3,"edi")); + &sub ("eax",1); + &mov (&DWP(64+4*12,"esp"),"eax"); + &mov (&DWP(64+4*13,"esp"),"ebx"); + &mov (&DWP(64+4*14,"esp"),"ecx"); + &mov (&DWP(64+4*15,"esp"),"edx"); + &jmp (&label("entry")); + +&set_label("outer_loop",16); + &mov (&wparam(1),$b); # save input + &mov (&wparam(0),$a); # save output + &mov (&wparam(2),$c); # save len +&set_label("entry"); + &mov ($a,0x61707865); + &mov (&DWP(4*1,"esp"),0x3320646e); + &mov (&DWP(4*2,"esp"),0x79622d32); + &mov (&DWP(4*3,"esp"),0x6b206574); + + &mov ($b, &DWP(64+4*5,"esp")); # copy key material + &mov ($b_,&DWP(64+4*6,"esp")); + &mov ($c, &DWP(64+4*10,"esp")); + &mov ($c_,&DWP(64+4*11,"esp")); + &mov ($d, &DWP(64+4*13,"esp")); + &mov ($d_,&DWP(64+4*14,"esp")); + &mov (&DWP(4*5,"esp"),$b); + &mov (&DWP(4*6,"esp"),$b_); + &mov (&DWP(4*10,"esp"),$c); + &mov (&DWP(4*11,"esp"),$c_); + &mov (&DWP(4*13,"esp"),$d); + &mov (&DWP(4*14,"esp"),$d_); + + &mov ($b, &DWP(64+4*7,"esp")); + &mov ($d_,&DWP(64+4*15,"esp")); + &mov ($d, &DWP(64+4*12,"esp")); + &mov ($b_,&DWP(64+4*4,"esp")); + &mov ($c, &DWP(64+4*8,"esp")); + &mov ($c_,&DWP(64+4*9,"esp")); + &add ($d,1); # counter value + &mov (&DWP(4*7,"esp"),$b); + &mov (&DWP(4*15,"esp"),$d_); + &mov (&DWP(64+4*12,"esp"),$d); # save counter value + + &mov ($b,10); # loop counter + &jmp (&label("loop")); + +&set_label("loop",16); + &add ($a,$b_); # elsewhere + &mov (&DWP(128,"esp"),$b); # save loop counter + &mov ($b,$b_); + &QUARTERROUND(0, 4, 8, 12, 0); + &QUARTERROUND(1, 5, 9, 13, 1); + &QUARTERROUND(2, 6,10, 14, 2); + &QUARTERROUND(3, 7,11, 15, 3); + &QUARTERROUND(0, 5,10, 15, 4); + &QUARTERROUND(1, 6,11, 12, 5); + &QUARTERROUND(2, 7, 8, 13, 6); + &QUARTERROUND(3, 4, 9, 14, 7); + &dec ($b); + &jnz (&label("loop")); + + &mov ($b,&wparam(2)); # load len + + &add ($a,0x61707865); # accumulate key material + &add ($b_,&DWP(64+4*4,"esp")); + &add ($c, &DWP(64+4*8,"esp")); + &add ($c_,&DWP(64+4*9,"esp")); + + &cmp ($b,64); + &jb (&label("tail")); + + &mov ($b,&wparam(1)); # load input pointer + &add ($d, &DWP(64+4*12,"esp")); + &add ($d_,&DWP(64+4*14,"esp")); + + &xor ($a, &DWP(4*0,$b)); # xor with input + &xor ($b_,&DWP(4*4,$b)); + &mov (&DWP(4*0,"esp"),$a); + &mov ($a,&wparam(0)); # load output pointer + &xor ($c, &DWP(4*8,$b)); + &xor ($c_,&DWP(4*9,$b)); + &xor ($d, &DWP(4*12,$b)); + &xor ($d_,&DWP(4*14,$b)); + &mov (&DWP(4*4,$a),$b_); # write output + 
&mov (&DWP(4*8,$a),$c); + &mov (&DWP(4*9,$a),$c_); + &mov (&DWP(4*12,$a),$d); + &mov (&DWP(4*14,$a),$d_); + + &mov ($b_,&DWP(4*1,"esp")); + &mov ($c, &DWP(4*2,"esp")); + &mov ($c_,&DWP(4*3,"esp")); + &mov ($d, &DWP(4*5,"esp")); + &mov ($d_,&DWP(4*6,"esp")); + &add ($b_,0x3320646e); # accumulate key material + &add ($c, 0x79622d32); + &add ($c_,0x6b206574); + &add ($d, &DWP(64+4*5,"esp")); + &add ($d_,&DWP(64+4*6,"esp")); + &xor ($b_,&DWP(4*1,$b)); + &xor ($c, &DWP(4*2,$b)); + &xor ($c_,&DWP(4*3,$b)); + &xor ($d, &DWP(4*5,$b)); + &xor ($d_,&DWP(4*6,$b)); + &mov (&DWP(4*1,$a),$b_); + &mov (&DWP(4*2,$a),$c); + &mov (&DWP(4*3,$a),$c_); + &mov (&DWP(4*5,$a),$d); + &mov (&DWP(4*6,$a),$d_); + + &mov ($b_,&DWP(4*7,"esp")); + &mov ($c, &DWP(4*10,"esp")); + &mov ($c_,&DWP(4*11,"esp")); + &mov ($d, &DWP(4*13,"esp")); + &mov ($d_,&DWP(4*15,"esp")); + &add ($b_,&DWP(64+4*7,"esp")); + &add ($c, &DWP(64+4*10,"esp")); + &add ($c_,&DWP(64+4*11,"esp")); + &add ($d, &DWP(64+4*13,"esp")); + &add ($d_,&DWP(64+4*15,"esp")); + &xor ($b_,&DWP(4*7,$b)); + &xor ($c, &DWP(4*10,$b)); + &xor ($c_,&DWP(4*11,$b)); + &xor ($d, &DWP(4*13,$b)); + &xor ($d_,&DWP(4*15,$b)); + &lea ($b,&DWP(4*16,$b)); + &mov (&DWP(4*7,$a),$b_); + &mov ($b_,&DWP(4*0,"esp")); + &mov (&DWP(4*10,$a),$c); + &mov ($c,&wparam(2)); # len + &mov (&DWP(4*11,$a),$c_); + &mov (&DWP(4*13,$a),$d); + &mov (&DWP(4*15,$a),$d_); + &mov (&DWP(4*0,$a),$b_); + &lea ($a,&DWP(4*16,$a)); + &sub ($c,64); + &jnz (&label("outer_loop")); + + &jmp (&label("done")); + +&set_label("tail"); + &add ($d, &DWP(64+4*12,"esp")); + &add ($d_,&DWP(64+4*14,"esp")); + &mov (&DWP(4*0,"esp"),$a); + &mov (&DWP(4*4,"esp"),$b_); + &mov (&DWP(4*8,"esp"),$c); + &mov (&DWP(4*9,"esp"),$c_); + &mov (&DWP(4*12,"esp"),$d); + &mov (&DWP(4*14,"esp"),$d_); + + &mov ($b_,&DWP(4*1,"esp")); + &mov ($c, &DWP(4*2,"esp")); + &mov ($c_,&DWP(4*3,"esp")); + &mov ($d, &DWP(4*5,"esp")); + &mov ($d_,&DWP(4*6,"esp")); + &add ($b_,0x3320646e); # accumulate key material + &add ($c, 0x79622d32); + &add ($c_,0x6b206574); + &add ($d, &DWP(64+4*5,"esp")); + &add ($d_,&DWP(64+4*6,"esp")); + &mov (&DWP(4*1,"esp"),$b_); + &mov (&DWP(4*2,"esp"),$c); + &mov (&DWP(4*3,"esp"),$c_); + &mov (&DWP(4*5,"esp"),$d); + &mov (&DWP(4*6,"esp"),$d_); + + &mov ($b_,&DWP(4*7,"esp")); + &mov ($c, &DWP(4*10,"esp")); + &mov ($c_,&DWP(4*11,"esp")); + &mov ($d, &DWP(4*13,"esp")); + &mov ($d_,&DWP(4*15,"esp")); + &add ($b_,&DWP(64+4*7,"esp")); + &add ($c, &DWP(64+4*10,"esp")); + &add ($c_,&DWP(64+4*11,"esp")); + &add ($d, &DWP(64+4*13,"esp")); + &add ($d_,&DWP(64+4*15,"esp")); + &mov (&DWP(4*7,"esp"),$b_); + &mov ($b_,&wparam(1)); # load input + &mov (&DWP(4*10,"esp"),$c); + &mov ($c,&wparam(0)); # load output + &mov (&DWP(4*11,"esp"),$c_); + &xor ($c_,$c_); + &mov (&DWP(4*13,"esp"),$d); + &mov (&DWP(4*15,"esp"),$d_); + + &xor ("eax","eax"); + &xor ("edx","edx"); +&set_label("tail_loop"); + &movb ("al",&BP(0,$c_,$b_)); + &movb ("dl",&BP(0,"esp",$c_)); + &lea ($c_,&DWP(1,$c_)); + &xor ("al","dl"); + &mov (&BP(-1,$c,$c_),"al"); + &dec ($b); + &jnz (&label("tail_loop")); + +&set_label("done"); + &stack_pop(33); +&set_label("no_data"); +&function_end("ChaCha20_ctr32"); + +if ($xmm) { +my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7)); +my ($out,$inp,$len)=("edi","esi","ecx"); + +sub QUARTERROUND_SSSE3 { +my ($ai,$bi,$ci,$di,$i)=@_; +my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next +my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous + + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + 
# 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + + if ($i==0) { + my $j=4; + ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp)); + } elsif ($i==3) { + my $j=0; + ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn)); + } elsif ($i==4) { + my $j=4; + ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp)); + } elsif ($i==7) { + my $j=0; + ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn)); + } + + #&paddd ($xa,$xb); # see elsewhere + #&pxor ($xd,$xa); # see elsewhere + &movdqa(&QWP(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3); + &pshufb ($xd,&QWP(0,"eax")); # rot16 + &movdqa(&QWP(16*$bp-128,"ebx"),$xb_) if ($i!=0); + &paddd ($xc,$xd); + &movdqa($xc_,&QWP(16*$cn-128,"ebx")) if ($ai>0 && $ai<3); + &pxor ($xb,$xc); + &movdqa($xb_,&QWP(16*$bn-128,"ebx")) if ($i<7); + &movdqa ($xa_,$xb); # borrow as temporary + &pslld ($xb,12); + &psrld ($xa_,20); + &por ($xb,$xa_); + &movdqa($xa_,&QWP(16*$an-128,"ebx")); + &paddd ($xa,$xb); + &movdqa($xd_,&QWP(16*$dn-128,"ebx")) if ($di!=$dn); + &pxor ($xd,$xa); + &movdqa (&QWP(16*$ai-128,"ebx"),$xa); + &pshufb ($xd,&QWP(16,"eax")); # rot8 + &paddd ($xc,$xd); + &movdqa (&QWP(16*$di-128,"ebx"),$xd) if ($di!=$dn); + &movdqa ($xd_,$xd) if ($di==$dn); + &pxor ($xb,$xc); + &paddd ($xa_,$xb_) if ($i<7); # elsewhere + &movdqa ($xa,$xb); # borrow as temporary + &pslld ($xb,7); + &psrld ($xa,25); + &pxor ($xd_,$xa_) if ($i<7); # elsewhere + &por ($xb,$xa); + + ($xa,$xa_)=($xa_,$xa); + ($xb,$xb_)=($xb_,$xb); + ($xc,$xc_)=($xc_,$xc); + ($xd,$xd_)=($xd_,$xd); +} + +&function_begin("ChaCha20_ssse3"); +&set_label("ssse3_shortcut"); + &mov ($out,&wparam(0)); + &mov ($inp,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ("edx",&wparam(3)); # key + &mov ("ebx",&wparam(4)); # counter and nonce + + &mov ("ebp","esp"); + &stack_push (131); + &and ("esp",-64); + &mov (&DWP(512,"esp"),"ebp"); + + &lea ("eax",&DWP(&label("ssse3_data")."-". + &label("pic_point"),"eax")); + &movdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce + +if (defined($gasver) && $gasver>=2.17) { # even though we encode + # pshufb manually, we + # handle only register + # operands, while this + # segment uses memory + # operand... 
+ &cmp ($len,64*4); + &jb (&label("1x")); + + &mov (&DWP(512+4,"esp"),"edx"); # offload pointers + &mov (&DWP(512+8,"esp"),"ebx"); + &sub ($len,64*4); # bias len + &lea ("ebp",&DWP(256+128,"esp")); # size optimization + + &movdqu ("xmm7",&QWP(0,"edx")); # key + &pshufd ("xmm0","xmm3",0x00); + &pshufd ("xmm1","xmm3",0x55); + &pshufd ("xmm2","xmm3",0xaa); + &pshufd ("xmm3","xmm3",0xff); + &paddd ("xmm0",&QWP(16*3,"eax")); # fix counters + &pshufd ("xmm4","xmm7",0x00); + &pshufd ("xmm5","xmm7",0x55); + &psubd ("xmm0",&QWP(16*4,"eax")); + &pshufd ("xmm6","xmm7",0xaa); + &pshufd ("xmm7","xmm7",0xff); + &movdqa (&QWP(16*12-128,"ebp"),"xmm0"); + &movdqa (&QWP(16*13-128,"ebp"),"xmm1"); + &movdqa (&QWP(16*14-128,"ebp"),"xmm2"); + &movdqa (&QWP(16*15-128,"ebp"),"xmm3"); + &movdqu ("xmm3",&QWP(16,"edx")); # key + &movdqa (&QWP(16*4-128,"ebp"),"xmm4"); + &movdqa (&QWP(16*5-128,"ebp"),"xmm5"); + &movdqa (&QWP(16*6-128,"ebp"),"xmm6"); + &movdqa (&QWP(16*7-128,"ebp"),"xmm7"); + &movdqa ("xmm7",&QWP(16*2,"eax")); # sigma + &lea ("ebx",&DWP(128,"esp")); # size optimization + + &pshufd ("xmm0","xmm3",0x00); + &pshufd ("xmm1","xmm3",0x55); + &pshufd ("xmm2","xmm3",0xaa); + &pshufd ("xmm3","xmm3",0xff); + &pshufd ("xmm4","xmm7",0x00); + &pshufd ("xmm5","xmm7",0x55); + &pshufd ("xmm6","xmm7",0xaa); + &pshufd ("xmm7","xmm7",0xff); + &movdqa (&QWP(16*8-128,"ebp"),"xmm0"); + &movdqa (&QWP(16*9-128,"ebp"),"xmm1"); + &movdqa (&QWP(16*10-128,"ebp"),"xmm2"); + &movdqa (&QWP(16*11-128,"ebp"),"xmm3"); + &movdqa (&QWP(16*0-128,"ebp"),"xmm4"); + &movdqa (&QWP(16*1-128,"ebp"),"xmm5"); + &movdqa (&QWP(16*2-128,"ebp"),"xmm6"); + &movdqa (&QWP(16*3-128,"ebp"),"xmm7"); + + &lea ($inp,&DWP(128,$inp)); # size optimization + &lea ($out,&DWP(128,$out)); # size optimization + &jmp (&label("outer_loop")); + +&set_label("outer_loop",16); + #&movdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material + &movdqa ("xmm1",&QWP(16*1-128,"ebp")); + &movdqa ("xmm2",&QWP(16*2-128,"ebp")); + &movdqa ("xmm3",&QWP(16*3-128,"ebp")); + #&movdqa ("xmm4",&QWP(16*4-128,"ebp")); + &movdqa ("xmm5",&QWP(16*5-128,"ebp")); + &movdqa ("xmm6",&QWP(16*6-128,"ebp")); + &movdqa ("xmm7",&QWP(16*7-128,"ebp")); + #&movdqa (&QWP(16*0-128,"ebx"),"xmm0"); + &movdqa (&QWP(16*1-128,"ebx"),"xmm1"); + &movdqa (&QWP(16*2-128,"ebx"),"xmm2"); + &movdqa (&QWP(16*3-128,"ebx"),"xmm3"); + #&movdqa (&QWP(16*4-128,"ebx"),"xmm4"); + &movdqa (&QWP(16*5-128,"ebx"),"xmm5"); + &movdqa (&QWP(16*6-128,"ebx"),"xmm6"); + &movdqa (&QWP(16*7-128,"ebx"),"xmm7"); + #&movdqa ("xmm0",&QWP(16*8-128,"ebp")); + #&movdqa ("xmm1",&QWP(16*9-128,"ebp")); + &movdqa ("xmm2",&QWP(16*10-128,"ebp")); + &movdqa ("xmm3",&QWP(16*11-128,"ebp")); + &movdqa ("xmm4",&QWP(16*12-128,"ebp")); + &movdqa ("xmm5",&QWP(16*13-128,"ebp")); + &movdqa ("xmm6",&QWP(16*14-128,"ebp")); + &movdqa ("xmm7",&QWP(16*15-128,"ebp")); + &paddd ("xmm4",&QWP(16*4,"eax")); # counter value + #&movdqa (&QWP(16*8-128,"ebx"),"xmm0"); + #&movdqa (&QWP(16*9-128,"ebx"),"xmm1"); + &movdqa (&QWP(16*10-128,"ebx"),"xmm2"); + &movdqa (&QWP(16*11-128,"ebx"),"xmm3"); + &movdqa (&QWP(16*12-128,"ebx"),"xmm4"); + &movdqa (&QWP(16*13-128,"ebx"),"xmm5"); + &movdqa (&QWP(16*14-128,"ebx"),"xmm6"); + &movdqa (&QWP(16*15-128,"ebx"),"xmm7"); + &movdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value + + &movdqa ($xa, &QWP(16*0-128,"ebp")); + &movdqa ($xd, "xmm4"); + &movdqa ($xb_,&QWP(16*4-128,"ebp")); + &movdqa ($xc, &QWP(16*8-128,"ebp")); + &movdqa ($xc_,&QWP(16*9-128,"ebp")); + + &mov ("edx",10); # loop counter + &nop (); + +&set_label("loop",16); + 
&paddd ($xa,$xb_); # elsewhere + &movdqa ($xb,$xb_); + &pxor ($xd,$xa); # elsewhere + &QUARTERROUND_SSSE3(0, 4, 8, 12, 0); + &QUARTERROUND_SSSE3(1, 5, 9, 13, 1); + &QUARTERROUND_SSSE3(2, 6,10, 14, 2); + &QUARTERROUND_SSSE3(3, 7,11, 15, 3); + &QUARTERROUND_SSSE3(0, 5,10, 15, 4); + &QUARTERROUND_SSSE3(1, 6,11, 12, 5); + &QUARTERROUND_SSSE3(2, 7, 8, 13, 6); + &QUARTERROUND_SSSE3(3, 4, 9, 14, 7); + &dec ("edx"); + &jnz (&label("loop")); + + &movdqa (&QWP(16*4-128,"ebx"),$xb_); + &movdqa (&QWP(16*8-128,"ebx"),$xc); + &movdqa (&QWP(16*9-128,"ebx"),$xc_); + &movdqa (&QWP(16*12-128,"ebx"),$xd); + &movdqa (&QWP(16*14-128,"ebx"),$xd_); + + my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7)); + + #&movdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there + &movdqa ($xa1,&QWP(16*1-128,"ebx")); + &movdqa ($xa2,&QWP(16*2-128,"ebx")); + &movdqa ($xa3,&QWP(16*3-128,"ebx")); + + for($i=0;$i<256;$i+=64) { + &paddd ($xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material + &paddd ($xa1,&QWP($i+16*1-128,"ebp")); + &paddd ($xa2,&QWP($i+16*2-128,"ebp")); + &paddd ($xa3,&QWP($i+16*3-128,"ebp")); + + &movdqa ($xt2,$xa0); # "de-interlace" data + &punpckldq ($xa0,$xa1); + &movdqa ($xt3,$xa2); + &punpckldq ($xa2,$xa3); + &punpckhdq ($xt2,$xa1); + &punpckhdq ($xt3,$xa3); + &movdqa ($xa1,$xa0); + &punpcklqdq ($xa0,$xa2); # "a0" + &movdqa ($xa3,$xt2); + &punpcklqdq ($xt2,$xt3); # "a2" + &punpckhqdq ($xa1,$xa2); # "a1" + &punpckhqdq ($xa3,$xt3); # "a3" + + #($xa2,$xt2)=($xt2,$xa2); + + &movdqu ($xt0,&QWP(64*0-128,$inp)); # load input + &movdqu ($xt1,&QWP(64*1-128,$inp)); + &movdqu ($xa2,&QWP(64*2-128,$inp)); + &movdqu ($xt3,&QWP(64*3-128,$inp)); + &lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp)); + &pxor ($xt0,$xa0); + &movdqa ($xa0,&QWP($i+16*4-128,"ebx")) if ($i<192); + &pxor ($xt1,$xa1); + &movdqa ($xa1,&QWP($i+16*5-128,"ebx")) if ($i<192); + &pxor ($xt2,$xa2); + &movdqa ($xa2,&QWP($i+16*6-128,"ebx")) if ($i<192); + &pxor ($xt3,$xa3); + &movdqa ($xa3,&QWP($i+16*7-128,"ebx")) if ($i<192); + &movdqu (&QWP(64*0-128,$out),$xt0); # store output + &movdqu (&QWP(64*1-128,$out),$xt1); + &movdqu (&QWP(64*2-128,$out),$xt2); + &movdqu (&QWP(64*3-128,$out),$xt3); + &lea ($out,&QWP($i<192?16:(64*4-16*3),$out)); + } + &sub ($len,64*4); + &jnc (&label("outer_loop")); + + &add ($len,64*4); + &jz (&label("done")); + + &mov ("ebx",&DWP(512+8,"esp")); # restore pointers + &lea ($inp,&DWP(-128,$inp)); + &mov ("edx",&DWP(512+4,"esp")); + &lea ($out,&DWP(-128,$out)); + + &movd ("xmm2",&DWP(16*12-128,"ebp")); # counter value + &movdqu ("xmm3",&QWP(0,"ebx")); + &paddd ("xmm2",&QWP(16*6,"eax")); # +four + &pand ("xmm3",&QWP(16*7,"eax")); + &por ("xmm3","xmm2"); # counter value +} +{ +my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7)); + +sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round + &paddd ($a,$b); + &pxor ($d,$a); + &pshufb ($d,$rot16); + + &paddd ($c,$d); + &pxor ($b,$c); + &movdqa ($t,$b); + &psrld ($b,20); + &pslld ($t,12); + &por ($b,$t); + + &paddd ($a,$b); + &pxor ($d,$a); + &pshufb ($d,$rot24); + + &paddd ($c,$d); + &pxor ($b,$c); + &movdqa ($t,$b); + &psrld ($b,25); + &pslld ($t,7); + &por ($b,$t); +} + +&set_label("1x"); + &movdqa ($a,&QWP(16*2,"eax")); # sigma + &movdqu ($b,&QWP(0,"edx")); + &movdqu ($c,&QWP(16,"edx")); + #&movdqu ($d,&QWP(0,"ebx")); # already loaded + &movdqa ($rot16,&QWP(0,"eax")); + &movdqa ($rot24,&QWP(16,"eax")); + &mov (&DWP(16*3,"esp"),"ebp"); + + &movdqa (&QWP(16*0,"esp"),$a); + &movdqa (&QWP(16*1,"esp"),$b); + &movdqa (&QWP(16*2,"esp"),$c); + &movdqa (&QWP(16*3,"esp"),$d); + 
&mov ("edx",10); + &jmp (&label("loop1x")); + +&set_label("outer1x",16); + &movdqa ($d,&QWP(16*5,"eax")); # one + &movdqa ($a,&QWP(16*0,"esp")); + &movdqa ($b,&QWP(16*1,"esp")); + &movdqa ($c,&QWP(16*2,"esp")); + &paddd ($d,&QWP(16*3,"esp")); + &mov ("edx",10); + &movdqa (&QWP(16*3,"esp"),$d); + &jmp (&label("loop1x")); + +&set_label("loop1x",16); + &SSSE3ROUND(); + &pshufd ($c,$c,0b01001110); + &pshufd ($b,$b,0b00111001); + &pshufd ($d,$d,0b10010011); + &nop (); + + &SSSE3ROUND(); + &pshufd ($c,$c,0b01001110); + &pshufd ($b,$b,0b10010011); + &pshufd ($d,$d,0b00111001); + + &dec ("edx"); + &jnz (&label("loop1x")); + + &paddd ($a,&QWP(16*0,"esp")); + &paddd ($b,&QWP(16*1,"esp")); + &paddd ($c,&QWP(16*2,"esp")); + &paddd ($d,&QWP(16*3,"esp")); + + &cmp ($len,64); + &jb (&label("tail")); + + &movdqu ($t,&QWP(16*0,$inp)); + &movdqu ($t1,&QWP(16*1,$inp)); + &pxor ($a,$t); # xor with input + &movdqu ($t,&QWP(16*2,$inp)); + &pxor ($b,$t1); + &movdqu ($t1,&QWP(16*3,$inp)); + &pxor ($c,$t); + &pxor ($d,$t1); + &lea ($inp,&DWP(16*4,$inp)); # inp+=64 + + &movdqu (&QWP(16*0,$out),$a); # write output + &movdqu (&QWP(16*1,$out),$b); + &movdqu (&QWP(16*2,$out),$c); + &movdqu (&QWP(16*3,$out),$d); + &lea ($out,&DWP(16*4,$out)); # inp+=64 + + &sub ($len,64); + &jnz (&label("outer1x")); + + &jmp (&label("done")); + +&set_label("tail"); + &movdqa (&QWP(16*0,"esp"),$a); + &movdqa (&QWP(16*1,"esp"),$b); + &movdqa (&QWP(16*2,"esp"),$c); + &movdqa (&QWP(16*3,"esp"),$d); + + &xor ("eax","eax"); + &xor ("edx","edx"); + &xor ("ebp","ebp"); + +&set_label("tail_loop"); + &movb ("al",&BP(0,"esp","ebp")); + &movb ("dl",&BP(0,$inp,"ebp")); + &lea ("ebp",&DWP(1,"ebp")); + &xor ("al","dl"); + &movb (&BP(-1,$out,"ebp"),"al"); + &dec ($len); + &jnz (&label("tail_loop")); +} +&set_label("done"); + &mov ("esp",&DWP(512,"esp")); +&function_end("ChaCha20_ssse3"); + +&align (64); +&set_label("ssse3_data"); +&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd); +&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe); +&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574); +&data_word(0,1,2,3); +&data_word(4,4,4,4); +&data_word(1,0,0,0); +&data_word(4,0,0,0); +&data_word(0,-1,-1,-1); +&align (64); +} +&asciz ("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>"); + +&asm_finish(); + +close STDOUT; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/asm/chacha-x86_64.pl b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/asm/chacha-x86_64.pl new file mode 100755 index 000000000..5ab6f879f --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/asm/chacha-x86_64.pl @@ -0,0 +1,2747 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# November 2014 +# +# ChaCha20 for x86_64. 
+# +# December 2016 +# +# Add AVX512F code path. +# +# Performance in cycles per byte out of large buffer. +# +# IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 8xAVX2 +# +# P4 9.48/+99% -/22.7(ii) - +# Core2 7.83/+55% 7.90/8.08 4.35 +# Westmere 7.19/+50% 5.60/6.70 3.00 +# Sandy Bridge 8.31/+42% 5.45/6.76 2.72 +# Ivy Bridge 6.71/+46% 5.40/6.49 2.41 +# Haswell 5.92/+43% 5.20/6.45 2.42 1.23 +# Skylake 5.87/+39% 4.70/- 2.31 1.19 +# Silvermont 12.0/+33% 7.75/7.40 7.03(iii) +# Goldmont 10.6/+17% 5.10/- 3.28 +# Sledgehammer 7.28/+52% -/14.2(ii) - +# Bulldozer 9.66/+28% 9.85/11.1 3.06(iv) +# VIA Nano 10.5/+46% 6.72/8.60 6.05 +# +# (i) compared to older gcc 3.x one can observe >2x improvement on +# most platforms; +# (ii) as it can be seen, SSE2 performance is too low on legacy +# processors; NxSSE2 results are naturally better, but not +# impressively better than IALU ones, which is why you won't +# find SSE2 code below; +# (iii) this is not optimal result for Atom because of MSROM +# limitations, SSE2 can do better, but gain is considered too +# low to justify the [maintenance] effort; +# (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20; +# +# Modified from upstream OpenSSL to remove the XOP code. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +$avx = 2; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +# input parameter block +($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8"); + +$code.=<<___; +.text + +.extern OPENSSL_ia32cap_P + +.align 64 +.Lzero: +.long 0,0,0,0 +.Lone: +.long 1,0,0,0 +.Linc: +.long 0,1,2,3 +.Lfour: +.long 4,4,4,4 +.Lincy: +.long 0,2,4,6,1,3,5,7 +.Leight: +.long 8,8,8,8,8,8,8,8 +.Lrot16: +.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd +.Lrot24: +.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe +.Lsigma: +.asciz "expand 32-byte k" +.align 64 +.Lzeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +.Lfourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +.Lincz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +.Lsixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 +.asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +___ + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)), + "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15))); +@t=("%esi","%edi"); + +sub ROUND { # critical path is 24 cycles per round +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my ($xc,$xc_)=map("\"$_\"",@t); +my @x=map("\"$_\"",@x); + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' and 'd's are permanently allocated in registers, + # @x[0..7,12..15], while 'c's are maintained in memory. 
If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. + + # Normally instructions would be interleaved to favour in-order + # execution. Generally out-of-order cores manage it gracefully, + # but not this time for some reason. As in-order execution + # cores are dying breed, old Atom is the only one around, + # instructions are left uninterleaved. Besides, Atom is better + # off executing 1xSSSE3 code anyway... + + ( + "&add (@x[$a0],@x[$b0])", # Q1 + "&xor (@x[$d0],@x[$a0])", + "&rol (@x[$d0],16)", + "&add (@x[$a1],@x[$b1])", # Q2 + "&xor (@x[$d1],@x[$a1])", + "&rol (@x[$d1],16)", + + "&add ($xc,@x[$d0])", + "&xor (@x[$b0],$xc)", + "&rol (@x[$b0],12)", + "&add ($xc_,@x[$d1])", + "&xor (@x[$b1],$xc_)", + "&rol (@x[$b1],12)", + + "&add (@x[$a0],@x[$b0])", + "&xor (@x[$d0],@x[$a0])", + "&rol (@x[$d0],8)", + "&add (@x[$a1],@x[$b1])", + "&xor (@x[$d1],@x[$a1])", + "&rol (@x[$d1],8)", + + "&add ($xc,@x[$d0])", + "&xor (@x[$b0],$xc)", + "&rol (@x[$b0],7)", + "&add ($xc_,@x[$d1])", + "&xor (@x[$b1],$xc_)", + "&rol (@x[$b1],7)", + + "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's + "&mov (\"4*$c1(%rsp)\",$xc_)", + "&mov ($xc,\"4*$c2(%rsp)\")", + "&mov ($xc_,\"4*$c3(%rsp)\")", + + "&add (@x[$a2],@x[$b2])", # Q3 + "&xor (@x[$d2],@x[$a2])", + "&rol (@x[$d2],16)", + "&add (@x[$a3],@x[$b3])", # Q4 + "&xor (@x[$d3],@x[$a3])", + "&rol (@x[$d3],16)", + + "&add ($xc,@x[$d2])", + "&xor (@x[$b2],$xc)", + "&rol (@x[$b2],12)", + "&add ($xc_,@x[$d3])", + "&xor (@x[$b3],$xc_)", + "&rol (@x[$b3],12)", + + "&add (@x[$a2],@x[$b2])", + "&xor (@x[$d2],@x[$a2])", + "&rol (@x[$d2],8)", + "&add (@x[$a3],@x[$b3])", + "&xor (@x[$d3],@x[$a3])", + "&rol (@x[$d3],8)", + + "&add ($xc,@x[$d2])", + "&xor (@x[$b2],$xc)", + "&rol (@x[$b2],7)", + "&add ($xc_,@x[$d3])", + "&xor (@x[$b3],$xc_)", + "&rol (@x[$b3],7)" + ); +} + +######################################################################## +# Generic code path that handles all lengths on pre-SSSE3 processors. 
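# Reference sketch (reader's aid, not part of the upstream module): the
# ($_&~3)+(($_+1)&3) index arithmetic inside ROUND() above derives the
# other three quarter-round tuples from the first one, so the two calls
# ROUND(0,4,8,12) and ROUND(0,5,10,15) cover the whole column/diagonal
# schedule shown in the comment.  A stand-alone check (_next_tuple is an
# illustrative name only):

sub _next_tuple { map { ($_ & ~3) + (($_ + 1) & 3) } @_ }

my @cols  = ([0,4,8,12]);   my @diags = ([0,5,10,15]);
push @cols,  [ _next_tuple(@{$cols[-1]})  ] for 1 .. 3;   # (1,5,9,13) (2,6,10,14) (3,7,11,15)
push @diags, [ _next_tuple(@{$diags[-1]}) ] for 1 .. 3;   # (1,6,11,12) (2,7,8,13) (3,4,9,14)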
+$code.=<<___; +.globl ChaCha20_ctr32 +.type ChaCha20_ctr32,\@function,5 +.align 64 +ChaCha20_ctr32: + cmp \$0,$len + je .Lno_data + mov OPENSSL_ia32cap_P+4(%rip),%r10 +___ +$code.=<<___ if ($avx>2); + bt \$48,%r10 # check for AVX512F + jc .LChaCha20_avx512 +___ +$code.=<<___; + test \$`1<<(41-32)`,%r10d + jnz .LChaCha20_ssse3 + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + sub \$64+24,%rsp +.Lctr32_body: + + #movdqa .Lsigma(%rip),%xmm0 + movdqu ($key),%xmm1 + movdqu 16($key),%xmm2 + movdqu ($counter),%xmm3 + movdqa .Lone(%rip),%xmm4 + + #movdqa %xmm0,4*0(%rsp) # key[0] + movdqa %xmm1,4*4(%rsp) # key[1] + movdqa %xmm2,4*8(%rsp) # key[2] + movdqa %xmm3,4*12(%rsp) # key[3] + mov $len,%rbp # reassign $len + jmp .Loop_outer + +.align 32 +.Loop_outer: + mov \$0x61707865,@x[0] # 'expa' + mov \$0x3320646e,@x[1] # 'nd 3' + mov \$0x79622d32,@x[2] # '2-by' + mov \$0x6b206574,@x[3] # 'te k' + mov 4*4(%rsp),@x[4] + mov 4*5(%rsp),@x[5] + mov 4*6(%rsp),@x[6] + mov 4*7(%rsp),@x[7] + movd %xmm3,@x[12] + mov 4*13(%rsp),@x[13] + mov 4*14(%rsp),@x[14] + mov 4*15(%rsp),@x[15] + + mov %rbp,64+0(%rsp) # save len + mov \$10,%ebp + mov $inp,64+8(%rsp) # save inp + movq %xmm2,%rsi # "@x[8]" + mov $out,64+16(%rsp) # save out + mov %rsi,%rdi + shr \$32,%rdi # "@x[9]" + jmp .Loop + +.align 32 +.Loop: +___ + foreach (&ROUND (0, 4, 8,12)) { eval; } + foreach (&ROUND (0, 5,10,15)) { eval; } + &dec ("%ebp"); + &jnz (".Loop"); + +$code.=<<___; + mov @t[1],4*9(%rsp) # modulo-scheduled + mov @t[0],4*8(%rsp) + mov 64(%rsp),%rbp # load len + movdqa %xmm2,%xmm1 + mov 64+8(%rsp),$inp # load inp + paddd %xmm4,%xmm3 # increment counter + mov 64+16(%rsp),$out # load out + + add \$0x61707865,@x[0] # 'expa' + add \$0x3320646e,@x[1] # 'nd 3' + add \$0x79622d32,@x[2] # '2-by' + add \$0x6b206574,@x[3] # 'te k' + add 4*4(%rsp),@x[4] + add 4*5(%rsp),@x[5] + add 4*6(%rsp),@x[6] + add 4*7(%rsp),@x[7] + add 4*12(%rsp),@x[12] + add 4*13(%rsp),@x[13] + add 4*14(%rsp),@x[14] + add 4*15(%rsp),@x[15] + paddd 4*8(%rsp),%xmm1 + + cmp \$64,%rbp + jb .Ltail + + xor 4*0($inp),@x[0] # xor with input + xor 4*1($inp),@x[1] + xor 4*2($inp),@x[2] + xor 4*3($inp),@x[3] + xor 4*4($inp),@x[4] + xor 4*5($inp),@x[5] + xor 4*6($inp),@x[6] + xor 4*7($inp),@x[7] + movdqu 4*8($inp),%xmm0 + xor 4*12($inp),@x[12] + xor 4*13($inp),@x[13] + xor 4*14($inp),@x[14] + xor 4*15($inp),@x[15] + lea 4*16($inp),$inp # inp+=64 + pxor %xmm1,%xmm0 + + movdqa %xmm2,4*8(%rsp) + movd %xmm3,4*12(%rsp) + + mov @x[0],4*0($out) # write output + mov @x[1],4*1($out) + mov @x[2],4*2($out) + mov @x[3],4*3($out) + mov @x[4],4*4($out) + mov @x[5],4*5($out) + mov @x[6],4*6($out) + mov @x[7],4*7($out) + movdqu %xmm0,4*8($out) + mov @x[12],4*12($out) + mov @x[13],4*13($out) + mov @x[14],4*14($out) + mov @x[15],4*15($out) + lea 4*16($out),$out # out+=64 + + sub \$64,%rbp + jnz .Loop_outer + + jmp .Ldone + +.align 16 +.Ltail: + mov @x[0],4*0(%rsp) + mov @x[1],4*1(%rsp) + xor %rbx,%rbx + mov @x[2],4*2(%rsp) + mov @x[3],4*3(%rsp) + mov @x[4],4*4(%rsp) + mov @x[5],4*5(%rsp) + mov @x[6],4*6(%rsp) + mov @x[7],4*7(%rsp) + movdqa %xmm1,4*8(%rsp) + mov @x[12],4*12(%rsp) + mov @x[13],4*13(%rsp) + mov @x[14],4*14(%rsp) + mov @x[15],4*15(%rsp) + +.Loop_tail: + movzb ($inp,%rbx),%eax + movzb (%rsp,%rbx),%edx + lea 1(%rbx),%rbx + xor %edx,%eax + mov %al,-1($out,%rbx) + dec %rbp + jnz .Loop_tail + +.Ldone: + lea 64+24+48(%rsp),%rsi + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp +.Lno_data: 
+ ret +.size ChaCha20_ctr32,.-ChaCha20_ctr32 +___ + +######################################################################## +# SSSE3 code path that handles shorter lengths +{ +my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7)); + +sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round + &paddd ($a,$b); + &pxor ($d,$a); + &pshufb ($d,$rot16); + + &paddd ($c,$d); + &pxor ($b,$c); + &movdqa ($t,$b); + &psrld ($b,20); + &pslld ($t,12); + &por ($b,$t); + + &paddd ($a,$b); + &pxor ($d,$a); + &pshufb ($d,$rot24); + + &paddd ($c,$d); + &pxor ($b,$c); + &movdqa ($t,$b); + &psrld ($b,25); + &pslld ($t,7); + &por ($b,$t); +} + +my $xframe = $win64 ? 32+8 : 8; + +$code.=<<___; +.type ChaCha20_ssse3,\@function,5 +.align 32 +ChaCha20_ssse3: +.LChaCha20_ssse3: + mov %rsp,%r9 # frame pointer +___ +$code.=<<___; + cmp \$128,$len # we might throw away some data, + ja .LChaCha20_4x # but overall it won't be slower + +.Ldo_sse3_after_all: + sub \$64+$xframe,%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,-0x28(%r9) + movaps %xmm7,-0x18(%r9) +.Lssse3_body: +___ +$code.=<<___; + movdqa .Lsigma(%rip),$a + movdqu ($key),$b + movdqu 16($key),$c + movdqu ($counter),$d + movdqa .Lrot16(%rip),$rot16 + movdqa .Lrot24(%rip),$rot24 + + movdqa $a,0x00(%rsp) + movdqa $b,0x10(%rsp) + movdqa $c,0x20(%rsp) + movdqa $d,0x30(%rsp) + mov \$10,$counter # reuse $counter + jmp .Loop_ssse3 + +.align 32 +.Loop_outer_ssse3: + movdqa .Lone(%rip),$d + movdqa 0x00(%rsp),$a + movdqa 0x10(%rsp),$b + movdqa 0x20(%rsp),$c + paddd 0x30(%rsp),$d + mov \$10,$counter + movdqa $d,0x30(%rsp) + jmp .Loop_ssse3 + +.align 32 +.Loop_ssse3: +___ + &SSSE3ROUND(); + &pshufd ($c,$c,0b01001110); + &pshufd ($b,$b,0b00111001); + &pshufd ($d,$d,0b10010011); + &nop (); + + &SSSE3ROUND(); + &pshufd ($c,$c,0b01001110); + &pshufd ($b,$b,0b10010011); + &pshufd ($d,$d,0b00111001); + + &dec ($counter); + &jnz (".Loop_ssse3"); + +$code.=<<___; + paddd 0x00(%rsp),$a + paddd 0x10(%rsp),$b + paddd 0x20(%rsp),$c + paddd 0x30(%rsp),$d + + cmp \$64,$len + jb .Ltail_ssse3 + + movdqu 0x00($inp),$t + movdqu 0x10($inp),$t1 + pxor $t,$a # xor with input + movdqu 0x20($inp),$t + pxor $t1,$b + movdqu 0x30($inp),$t1 + lea 0x40($inp),$inp # inp+=64 + pxor $t,$c + pxor $t1,$d + + movdqu $a,0x00($out) # write output + movdqu $b,0x10($out) + movdqu $c,0x20($out) + movdqu $d,0x30($out) + lea 0x40($out),$out # out+=64 + + sub \$64,$len + jnz .Loop_outer_ssse3 + + jmp .Ldone_ssse3 + +.align 16 +.Ltail_ssse3: + movdqa $a,0x00(%rsp) + movdqa $b,0x10(%rsp) + movdqa $c,0x20(%rsp) + movdqa $d,0x30(%rsp) + xor $counter,$counter + +.Loop_tail_ssse3: + movzb ($inp,$counter),%eax + movzb (%rsp,$counter),%ecx + lea 1($counter),$counter + xor %ecx,%eax + mov %al,-1($out,$counter) + dec $len + jnz .Loop_tail_ssse3 + +.Ldone_ssse3: +___ +$code.=<<___ if ($win64); + movaps -0x28(%r9),%xmm6 + movaps -0x18(%r9),%xmm7 +___ +$code.=<<___; + lea (%r9),%rsp +.Lssse3_epilogue: + ret +.size ChaCha20_ssse3,.-ChaCha20_ssse3 +___ +} + +######################################################################## +# SSSE3 code path that handles longer messages. 
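# Reference sketch (reader's aid, not part of the upstream module): the
# .Lrot16/.Lrot24 byte-shuffle masks let pshufb stand in for the 16-bit
# and 8-bit left rotations of the quarter round (an 8-bit left rotation
# is a 24-bit right rotation, hence the label).  A quick pure-Perl check
# of the per-dword byte patterns:

my $x   = 0x11223344;
my @b   = unpack "C4", pack "V", $x;             # little-endian bytes b0..b3
my $r16 = unpack "V", pack "C4", @b[2,3,0,1];    # .Lrot16 pattern => 0x33441122 == rotl($x,16)
my $r8  = unpack "V", pack "C4", @b[3,0,1,2];    # .Lrot24 pattern => 0x22334411 == rotl($x,8)

# (The 12- and 7-bit rotations have no byte-aligned equivalent and stay
#  as pslld/psrld/por pairs in the code.)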
+{ +# assign variables to favor Atom front-end +my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3, + $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15)); +my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); + +sub SSSE3_lane_ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); +my @x=map("\"$_\"",@xx); + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' and 'd's are permanently allocated in registers, + # @x[0..7,12..15], while 'c's are maintained in memory. If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. + + ( + "&paddd (@x[$a0],@x[$b0])", # Q1 + "&paddd (@x[$a1],@x[$b1])", # Q2 + "&pxor (@x[$d0],@x[$a0])", + "&pxor (@x[$d1],@x[$a1])", + "&pshufb (@x[$d0],$t1)", + "&pshufb (@x[$d1],$t1)", + + "&paddd ($xc,@x[$d0])", + "&paddd ($xc_,@x[$d1])", + "&pxor (@x[$b0],$xc)", + "&pxor (@x[$b1],$xc_)", + "&movdqa ($t0,@x[$b0])", + "&pslld (@x[$b0],12)", + "&psrld ($t0,20)", + "&movdqa ($t1,@x[$b1])", + "&pslld (@x[$b1],12)", + "&por (@x[$b0],$t0)", + "&psrld ($t1,20)", + "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) + "&por (@x[$b1],$t1)", + + "&paddd (@x[$a0],@x[$b0])", + "&paddd (@x[$a1],@x[$b1])", + "&pxor (@x[$d0],@x[$a0])", + "&pxor (@x[$d1],@x[$a1])", + "&pshufb (@x[$d0],$t0)", + "&pshufb (@x[$d1],$t0)", + + "&paddd ($xc,@x[$d0])", + "&paddd ($xc_,@x[$d1])", + "&pxor (@x[$b0],$xc)", + "&pxor (@x[$b1],$xc_)", + "&movdqa ($t1,@x[$b0])", + "&pslld (@x[$b0],7)", + "&psrld ($t1,25)", + "&movdqa ($t0,@x[$b1])", + "&pslld (@x[$b1],7)", + "&por (@x[$b0],$t1)", + "&psrld ($t0,25)", + "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) + "&por (@x[$b1],$t0)", + + "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's + "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)", + "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")", + "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")", + + "&paddd (@x[$a2],@x[$b2])", # Q3 + "&paddd (@x[$a3],@x[$b3])", # Q4 + "&pxor (@x[$d2],@x[$a2])", + "&pxor (@x[$d3],@x[$a3])", + "&pshufb (@x[$d2],$t1)", + "&pshufb (@x[$d3],$t1)", + + "&paddd ($xc,@x[$d2])", + "&paddd ($xc_,@x[$d3])", + "&pxor (@x[$b2],$xc)", + "&pxor (@x[$b3],$xc_)", + "&movdqa ($t0,@x[$b2])", + "&pslld (@x[$b2],12)", + "&psrld ($t0,20)", + "&movdqa ($t1,@x[$b3])", + "&pslld (@x[$b3],12)", + "&por (@x[$b2],$t0)", + "&psrld ($t1,20)", + "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) + "&por (@x[$b3],$t1)", + + "&paddd (@x[$a2],@x[$b2])", + "&paddd (@x[$a3],@x[$b3])", + "&pxor (@x[$d2],@x[$a2])", + "&pxor (@x[$d3],@x[$a3])", + "&pshufb (@x[$d2],$t0)", + "&pshufb (@x[$d3],$t0)", + + "&paddd ($xc,@x[$d2])", + "&paddd ($xc_,@x[$d3])", + "&pxor (@x[$b2],$xc)", + "&pxor (@x[$b3],$xc_)", + "&movdqa ($t1,@x[$b2])", + "&pslld (@x[$b2],7)", + "&psrld ($t1,25)", + "&movdqa ($t0,@x[$b3])", + "&pslld (@x[$b3],7)", + "&por (@x[$b2],$t1)", + "&psrld ($t0,25)", + "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) + "&por (@x[$b3],$t0)" + ); +} + +my $xframe = $win64 ? 
0xa8 : 8; + +$code.=<<___; +.type ChaCha20_4x,\@function,5 +.align 32 +ChaCha20_4x: +.LChaCha20_4x: + mov %rsp,%r9 # frame pointer + mov %r10,%r11 +___ +$code.=<<___ if ($avx>1); + shr \$32,%r10 # OPENSSL_ia32cap_P+8 + test \$`1<<5`,%r10 # test AVX2 + jnz .LChaCha20_8x +___ +$code.=<<___; + cmp \$192,$len + ja .Lproceed4x + + and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE + cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE + je .Ldo_sse3_after_all # to detect Atom + +.Lproceed4x: + sub \$0x140+$xframe,%rsp +___ + ################ stack layout + # +0x00 SIMD equivalent of @x[8-12] + # ... + # +0x40 constant copy of key[0-2] smashed by lanes + # ... + # +0x100 SIMD counters (with nonce smashed by lanes) + # ... + # +0x140 +$code.=<<___ if ($win64); + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L4x_body: +___ +$code.=<<___; + movdqa .Lsigma(%rip),$xa3 # key[0] + movdqu ($key),$xb3 # key[1] + movdqu 16($key),$xt3 # key[2] + movdqu ($counter),$xd3 # key[3] + lea 0x100(%rsp),%rcx # size optimization + lea .Lrot16(%rip),%r10 + lea .Lrot24(%rip),%r11 + + pshufd \$0x00,$xa3,$xa0 # smash key by lanes... + pshufd \$0x55,$xa3,$xa1 + movdqa $xa0,0x40(%rsp) # ... and offload + pshufd \$0xaa,$xa3,$xa2 + movdqa $xa1,0x50(%rsp) + pshufd \$0xff,$xa3,$xa3 + movdqa $xa2,0x60(%rsp) + movdqa $xa3,0x70(%rsp) + + pshufd \$0x00,$xb3,$xb0 + pshufd \$0x55,$xb3,$xb1 + movdqa $xb0,0x80-0x100(%rcx) + pshufd \$0xaa,$xb3,$xb2 + movdqa $xb1,0x90-0x100(%rcx) + pshufd \$0xff,$xb3,$xb3 + movdqa $xb2,0xa0-0x100(%rcx) + movdqa $xb3,0xb0-0x100(%rcx) + + pshufd \$0x00,$xt3,$xt0 # "$xc0" + pshufd \$0x55,$xt3,$xt1 # "$xc1" + movdqa $xt0,0xc0-0x100(%rcx) + pshufd \$0xaa,$xt3,$xt2 # "$xc2" + movdqa $xt1,0xd0-0x100(%rcx) + pshufd \$0xff,$xt3,$xt3 # "$xc3" + movdqa $xt2,0xe0-0x100(%rcx) + movdqa $xt3,0xf0-0x100(%rcx) + + pshufd \$0x00,$xd3,$xd0 + pshufd \$0x55,$xd3,$xd1 + paddd .Linc(%rip),$xd0 # don't save counters yet + pshufd \$0xaa,$xd3,$xd2 + movdqa $xd1,0x110-0x100(%rcx) + pshufd \$0xff,$xd3,$xd3 + movdqa $xd2,0x120-0x100(%rcx) + movdqa $xd3,0x130-0x100(%rcx) + + jmp .Loop_enter4x + +.align 32 +.Loop_outer4x: + movdqa 0x40(%rsp),$xa0 # re-load smashed key + movdqa 0x50(%rsp),$xa1 + movdqa 0x60(%rsp),$xa2 + movdqa 0x70(%rsp),$xa3 + movdqa 0x80-0x100(%rcx),$xb0 + movdqa 0x90-0x100(%rcx),$xb1 + movdqa 0xa0-0x100(%rcx),$xb2 + movdqa 0xb0-0x100(%rcx),$xb3 + movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" + movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" + movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" + movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" + movdqa 0x100-0x100(%rcx),$xd0 + movdqa 0x110-0x100(%rcx),$xd1 + movdqa 0x120-0x100(%rcx),$xd2 + movdqa 0x130-0x100(%rcx),$xd3 + paddd .Lfour(%rip),$xd0 # next SIMD counters + +.Loop_enter4x: + movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]" + movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]" + movdqa (%r10),$xt3 # .Lrot16(%rip) + mov \$10,%eax + movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters + jmp .Loop4x + +.align 32 +.Loop4x: +___ + foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; } + foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + dec %eax + jnz .Loop4x + + paddd 0x40(%rsp),$xa0 # accumulate key material + paddd 0x50(%rsp),$xa1 + paddd 0x60(%rsp),$xa2 + paddd 0x70(%rsp),$xa3 + + movdqa $xa0,$xt2 # "de-interlace" data + punpckldq $xa1,$xa0 + movdqa $xa2,$xt3 + punpckldq 
$xa3,$xa2 + punpckhdq $xa1,$xt2 + punpckhdq $xa3,$xt3 + movdqa $xa0,$xa1 + punpcklqdq $xa2,$xa0 # "a0" + movdqa $xt2,$xa3 + punpcklqdq $xt3,$xt2 # "a2" + punpckhqdq $xa2,$xa1 # "a1" + punpckhqdq $xt3,$xa3 # "a3" +___ + ($xa2,$xt2)=($xt2,$xa2); +$code.=<<___; + paddd 0x80-0x100(%rcx),$xb0 + paddd 0x90-0x100(%rcx),$xb1 + paddd 0xa0-0x100(%rcx),$xb2 + paddd 0xb0-0x100(%rcx),$xb3 + + movdqa $xa0,0x00(%rsp) # offload $xaN + movdqa $xa1,0x10(%rsp) + movdqa 0x20(%rsp),$xa0 # "xc2" + movdqa 0x30(%rsp),$xa1 # "xc3" + + movdqa $xb0,$xt2 + punpckldq $xb1,$xb0 + movdqa $xb2,$xt3 + punpckldq $xb3,$xb2 + punpckhdq $xb1,$xt2 + punpckhdq $xb3,$xt3 + movdqa $xb0,$xb1 + punpcklqdq $xb2,$xb0 # "b0" + movdqa $xt2,$xb3 + punpcklqdq $xt3,$xt2 # "b2" + punpckhqdq $xb2,$xb1 # "b1" + punpckhqdq $xt3,$xb3 # "b3" +___ + ($xb2,$xt2)=($xt2,$xb2); + my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); +$code.=<<___; + paddd 0xc0-0x100(%rcx),$xc0 + paddd 0xd0-0x100(%rcx),$xc1 + paddd 0xe0-0x100(%rcx),$xc2 + paddd 0xf0-0x100(%rcx),$xc3 + + movdqa $xa2,0x20(%rsp) # keep offloading $xaN + movdqa $xa3,0x30(%rsp) + + movdqa $xc0,$xt2 + punpckldq $xc1,$xc0 + movdqa $xc2,$xt3 + punpckldq $xc3,$xc2 + punpckhdq $xc1,$xt2 + punpckhdq $xc3,$xt3 + movdqa $xc0,$xc1 + punpcklqdq $xc2,$xc0 # "c0" + movdqa $xt2,$xc3 + punpcklqdq $xt3,$xt2 # "c2" + punpckhqdq $xc2,$xc1 # "c1" + punpckhqdq $xt3,$xc3 # "c3" +___ + ($xc2,$xt2)=($xt2,$xc2); + ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary +$code.=<<___; + paddd 0x100-0x100(%rcx),$xd0 + paddd 0x110-0x100(%rcx),$xd1 + paddd 0x120-0x100(%rcx),$xd2 + paddd 0x130-0x100(%rcx),$xd3 + + movdqa $xd0,$xt2 + punpckldq $xd1,$xd0 + movdqa $xd2,$xt3 + punpckldq $xd3,$xd2 + punpckhdq $xd1,$xt2 + punpckhdq $xd3,$xt3 + movdqa $xd0,$xd1 + punpcklqdq $xd2,$xd0 # "d0" + movdqa $xt2,$xd3 + punpcklqdq $xt3,$xt2 # "d2" + punpckhqdq $xd2,$xd1 # "d1" + punpckhqdq $xt3,$xd3 # "d3" +___ + ($xd2,$xt2)=($xt2,$xd2); +$code.=<<___; + cmp \$64*4,$len + jb .Ltail4x + + movdqu 0x00($inp),$xt0 # xor with input + movdqu 0x10($inp),$xt1 + movdqu 0x20($inp),$xt2 + movdqu 0x30($inp),$xt3 + pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? + pxor $xb0,$xt1 + pxor $xc0,$xt2 + pxor $xd0,$xt3 + + movdqu $xt0,0x00($out) + movdqu 0x40($inp),$xt0 + movdqu $xt1,0x10($out) + movdqu 0x50($inp),$xt1 + movdqu $xt2,0x20($out) + movdqu 0x60($inp),$xt2 + movdqu $xt3,0x30($out) + movdqu 0x70($inp),$xt3 + lea 0x80($inp),$inp # size optimization + pxor 0x10(%rsp),$xt0 + pxor $xb1,$xt1 + pxor $xc1,$xt2 + pxor $xd1,$xt3 + + movdqu $xt0,0x40($out) + movdqu 0x00($inp),$xt0 + movdqu $xt1,0x50($out) + movdqu 0x10($inp),$xt1 + movdqu $xt2,0x60($out) + movdqu 0x20($inp),$xt2 + movdqu $xt3,0x70($out) + lea 0x80($out),$out # size optimization + movdqu 0x30($inp),$xt3 + pxor 0x20(%rsp),$xt0 + pxor $xb2,$xt1 + pxor $xc2,$xt2 + pxor $xd2,$xt3 + + movdqu $xt0,0x00($out) + movdqu 0x40($inp),$xt0 + movdqu $xt1,0x10($out) + movdqu 0x50($inp),$xt1 + movdqu $xt2,0x20($out) + movdqu 0x60($inp),$xt2 + movdqu $xt3,0x30($out) + movdqu 0x70($inp),$xt3 + lea 0x80($inp),$inp # inp+=64*4 + pxor 0x30(%rsp),$xt0 + pxor $xb3,$xt1 + pxor $xc3,$xt2 + pxor $xd3,$xt3 + movdqu $xt0,0x40($out) + movdqu $xt1,0x50($out) + movdqu $xt2,0x60($out) + movdqu $xt3,0x70($out) + lea 0x80($out),$out # out+=64*4 + + sub \$64*4,$len + jnz .Loop_outer4x + + jmp .Ldone4x + +.Ltail4x: + cmp \$192,$len + jae .L192_or_more4x + cmp \$128,$len + jae .L128_or_more4x + cmp \$64,$len + jae .L64_or_more4x + + #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 
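	# (Reader's note, not upstream: fewer than 64 bytes remain at this
	#  point; block 0's keystream 'a' words already sit at 0x00(%rsp)
	#  from the offload above, so only b0/c0/d0 are stashed before the
	#  byte-by-byte XOR in .Loop_tail4x.)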
+ xor %r10,%r10 + #movdqa $xt0,0x00(%rsp) + movdqa $xb0,0x10(%rsp) + movdqa $xc0,0x20(%rsp) + movdqa $xd0,0x30(%rsp) + jmp .Loop_tail4x + +.align 32 +.L64_or_more4x: + movdqu 0x00($inp),$xt0 # xor with input + movdqu 0x10($inp),$xt1 + movdqu 0x20($inp),$xt2 + movdqu 0x30($inp),$xt3 + pxor 0x00(%rsp),$xt0 # $xaxN is offloaded, remember? + pxor $xb0,$xt1 + pxor $xc0,$xt2 + pxor $xd0,$xt3 + movdqu $xt0,0x00($out) + movdqu $xt1,0x10($out) + movdqu $xt2,0x20($out) + movdqu $xt3,0x30($out) + je .Ldone4x + + movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember? + lea 0x40($inp),$inp # inp+=64*1 + xor %r10,%r10 + movdqa $xt0,0x00(%rsp) + movdqa $xb1,0x10(%rsp) + lea 0x40($out),$out # out+=64*1 + movdqa $xc1,0x20(%rsp) + sub \$64,$len # len-=64*1 + movdqa $xd1,0x30(%rsp) + jmp .Loop_tail4x + +.align 32 +.L128_or_more4x: + movdqu 0x00($inp),$xt0 # xor with input + movdqu 0x10($inp),$xt1 + movdqu 0x20($inp),$xt2 + movdqu 0x30($inp),$xt3 + pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? + pxor $xb0,$xt1 + pxor $xc0,$xt2 + pxor $xd0,$xt3 + + movdqu $xt0,0x00($out) + movdqu 0x40($inp),$xt0 + movdqu $xt1,0x10($out) + movdqu 0x50($inp),$xt1 + movdqu $xt2,0x20($out) + movdqu 0x60($inp),$xt2 + movdqu $xt3,0x30($out) + movdqu 0x70($inp),$xt3 + pxor 0x10(%rsp),$xt0 + pxor $xb1,$xt1 + pxor $xc1,$xt2 + pxor $xd1,$xt3 + movdqu $xt0,0x40($out) + movdqu $xt1,0x50($out) + movdqu $xt2,0x60($out) + movdqu $xt3,0x70($out) + je .Ldone4x + + movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember? + lea 0x80($inp),$inp # inp+=64*2 + xor %r10,%r10 + movdqa $xt0,0x00(%rsp) + movdqa $xb2,0x10(%rsp) + lea 0x80($out),$out # out+=64*2 + movdqa $xc2,0x20(%rsp) + sub \$128,$len # len-=64*2 + movdqa $xd2,0x30(%rsp) + jmp .Loop_tail4x + +.align 32 +.L192_or_more4x: + movdqu 0x00($inp),$xt0 # xor with input + movdqu 0x10($inp),$xt1 + movdqu 0x20($inp),$xt2 + movdqu 0x30($inp),$xt3 + pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? + pxor $xb0,$xt1 + pxor $xc0,$xt2 + pxor $xd0,$xt3 + + movdqu $xt0,0x00($out) + movdqu 0x40($inp),$xt0 + movdqu $xt1,0x10($out) + movdqu 0x50($inp),$xt1 + movdqu $xt2,0x20($out) + movdqu 0x60($inp),$xt2 + movdqu $xt3,0x30($out) + movdqu 0x70($inp),$xt3 + lea 0x80($inp),$inp # size optimization + pxor 0x10(%rsp),$xt0 + pxor $xb1,$xt1 + pxor $xc1,$xt2 + pxor $xd1,$xt3 + + movdqu $xt0,0x40($out) + movdqu 0x00($inp),$xt0 + movdqu $xt1,0x50($out) + movdqu 0x10($inp),$xt1 + movdqu $xt2,0x60($out) + movdqu 0x20($inp),$xt2 + movdqu $xt3,0x70($out) + lea 0x80($out),$out # size optimization + movdqu 0x30($inp),$xt3 + pxor 0x20(%rsp),$xt0 + pxor $xb2,$xt1 + pxor $xc2,$xt2 + pxor $xd2,$xt3 + movdqu $xt0,0x00($out) + movdqu $xt1,0x10($out) + movdqu $xt2,0x20($out) + movdqu $xt3,0x30($out) + je .Ldone4x + + movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember? 
+ lea 0x40($inp),$inp # inp+=64*3 + xor %r10,%r10 + movdqa $xt0,0x00(%rsp) + movdqa $xb3,0x10(%rsp) + lea 0x40($out),$out # out+=64*3 + movdqa $xc3,0x20(%rsp) + sub \$192,$len # len-=64*3 + movdqa $xd3,0x30(%rsp) + +.Loop_tail4x: + movzb ($inp,%r10),%eax + movzb (%rsp,%r10),%ecx + lea 1(%r10),%r10 + xor %ecx,%eax + mov %al,-1($out,%r10) + dec $len + jnz .Loop_tail4x + +.Ldone4x: +___ +$code.=<<___ if ($win64); + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 +___ +$code.=<<___; + lea (%r9),%rsp +.L4x_epilogue: + ret +.size ChaCha20_4x,.-ChaCha20_4x +___ +} + +######################################################################## +# AVX2 code path +if ($avx>1) { +my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, + $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15)); +my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); + +sub AVX2_lane_ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); +my @x=map("\"$_\"",@xx); + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' and 'd's are permanently allocated in registers, + # @x[0..7,12..15], while 'c's are maintained in memory. If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. 
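	# (Reader's note, not upstream: like ROUND() and SSSE3_lane_ROUND()
	#  above, this sub returns a list of instruction strings that the
	#  caller evals; $t1 holds .Lrot16 on entry, loaded from %r10 at
	#  .Loop_enter8x, and .Lrot24 is re-broadcast from %r11 mid-sequence.)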
+ + ( + "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 + "&vpxor (@x[$d0],@x[$a0],@x[$d0])", + "&vpshufb (@x[$d0],@x[$d0],$t1)", + "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 + "&vpxor (@x[$d1],@x[$a1],@x[$d1])", + "&vpshufb (@x[$d1],@x[$d1],$t1)", + + "&vpaddd ($xc,$xc,@x[$d0])", + "&vpxor (@x[$b0],$xc,@x[$b0])", + "&vpslld ($t0,@x[$b0],12)", + "&vpsrld (@x[$b0],@x[$b0],20)", + "&vpor (@x[$b0],$t0,@x[$b0])", + "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) + "&vpaddd ($xc_,$xc_,@x[$d1])", + "&vpxor (@x[$b1],$xc_,@x[$b1])", + "&vpslld ($t1,@x[$b1],12)", + "&vpsrld (@x[$b1],@x[$b1],20)", + "&vpor (@x[$b1],$t1,@x[$b1])", + + "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", + "&vpxor (@x[$d0],@x[$a0],@x[$d0])", + "&vpshufb (@x[$d0],@x[$d0],$t0)", + "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", + "&vpxor (@x[$d1],@x[$a1],@x[$d1])", + "&vpshufb (@x[$d1],@x[$d1],$t0)", + + "&vpaddd ($xc,$xc,@x[$d0])", + "&vpxor (@x[$b0],$xc,@x[$b0])", + "&vpslld ($t1,@x[$b0],7)", + "&vpsrld (@x[$b0],@x[$b0],25)", + "&vpor (@x[$b0],$t1,@x[$b0])", + "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) + "&vpaddd ($xc_,$xc_,@x[$d1])", + "&vpxor (@x[$b1],$xc_,@x[$b1])", + "&vpslld ($t0,@x[$b1],7)", + "&vpsrld (@x[$b1],@x[$b1],25)", + "&vpor (@x[$b1],$t0,@x[$b1])", + + "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's + "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)", + "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")", + "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")", + + "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 + "&vpxor (@x[$d2],@x[$a2],@x[$d2])", + "&vpshufb (@x[$d2],@x[$d2],$t1)", + "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 + "&vpxor (@x[$d3],@x[$a3],@x[$d3])", + "&vpshufb (@x[$d3],@x[$d3],$t1)", + + "&vpaddd ($xc,$xc,@x[$d2])", + "&vpxor (@x[$b2],$xc,@x[$b2])", + "&vpslld ($t0,@x[$b2],12)", + "&vpsrld (@x[$b2],@x[$b2],20)", + "&vpor (@x[$b2],$t0,@x[$b2])", + "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) + "&vpaddd ($xc_,$xc_,@x[$d3])", + "&vpxor (@x[$b3],$xc_,@x[$b3])", + "&vpslld ($t1,@x[$b3],12)", + "&vpsrld (@x[$b3],@x[$b3],20)", + "&vpor (@x[$b3],$t1,@x[$b3])", + + "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", + "&vpxor (@x[$d2],@x[$a2],@x[$d2])", + "&vpshufb (@x[$d2],@x[$d2],$t0)", + "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", + "&vpxor (@x[$d3],@x[$a3],@x[$d3])", + "&vpshufb (@x[$d3],@x[$d3],$t0)", + + "&vpaddd ($xc,$xc,@x[$d2])", + "&vpxor (@x[$b2],$xc,@x[$b2])", + "&vpslld ($t1,@x[$b2],7)", + "&vpsrld (@x[$b2],@x[$b2],25)", + "&vpor (@x[$b2],$t1,@x[$b2])", + "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) + "&vpaddd ($xc_,$xc_,@x[$d3])", + "&vpxor (@x[$b3],$xc_,@x[$b3])", + "&vpslld ($t0,@x[$b3],7)", + "&vpsrld (@x[$b3],@x[$b3],25)", + "&vpor (@x[$b3],$t0,@x[$b3])" + ); +} + +my $xframe = $win64 ? 0xa8 : 8; + +$code.=<<___; +.type ChaCha20_8x,\@function,5 +.align 32 +ChaCha20_8x: +.LChaCha20_8x: + mov %rsp,%r9 # frame register + sub \$0x280+$xframe,%rsp + and \$-32,%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L8x_body: +___ +$code.=<<___; + vzeroupper + + ################ stack layout + # +0x00 SIMD equivalent of @x[8-12] + # ... + # +0x80 constant copy of key[0-2] smashed by lanes + # ... + # +0x200 SIMD counters (with nonce smashed by lanes) + # ... 
+ # +0x280 + + vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] + vbroadcasti128 ($key),$xb3 # key[1] + vbroadcasti128 16($key),$xt3 # key[2] + vbroadcasti128 ($counter),$xd3 # key[3] + lea 0x100(%rsp),%rcx # size optimization + lea 0x200(%rsp),%rax # size optimization + lea .Lrot16(%rip),%r10 + lea .Lrot24(%rip),%r11 + + vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... + vpshufd \$0x55,$xa3,$xa1 + vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload + vpshufd \$0xaa,$xa3,$xa2 + vmovdqa $xa1,0xa0-0x100(%rcx) + vpshufd \$0xff,$xa3,$xa3 + vmovdqa $xa2,0xc0-0x100(%rcx) + vmovdqa $xa3,0xe0-0x100(%rcx) + + vpshufd \$0x00,$xb3,$xb0 + vpshufd \$0x55,$xb3,$xb1 + vmovdqa $xb0,0x100-0x100(%rcx) + vpshufd \$0xaa,$xb3,$xb2 + vmovdqa $xb1,0x120-0x100(%rcx) + vpshufd \$0xff,$xb3,$xb3 + vmovdqa $xb2,0x140-0x100(%rcx) + vmovdqa $xb3,0x160-0x100(%rcx) + + vpshufd \$0x00,$xt3,$xt0 # "xc0" + vpshufd \$0x55,$xt3,$xt1 # "xc1" + vmovdqa $xt0,0x180-0x200(%rax) + vpshufd \$0xaa,$xt3,$xt2 # "xc2" + vmovdqa $xt1,0x1a0-0x200(%rax) + vpshufd \$0xff,$xt3,$xt3 # "xc3" + vmovdqa $xt2,0x1c0-0x200(%rax) + vmovdqa $xt3,0x1e0-0x200(%rax) + + vpshufd \$0x00,$xd3,$xd0 + vpshufd \$0x55,$xd3,$xd1 + vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet + vpshufd \$0xaa,$xd3,$xd2 + vmovdqa $xd1,0x220-0x200(%rax) + vpshufd \$0xff,$xd3,$xd3 + vmovdqa $xd2,0x240-0x200(%rax) + vmovdqa $xd3,0x260-0x200(%rax) + + jmp .Loop_enter8x + +.align 32 +.Loop_outer8x: + vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key + vmovdqa 0xa0-0x100(%rcx),$xa1 + vmovdqa 0xc0-0x100(%rcx),$xa2 + vmovdqa 0xe0-0x100(%rcx),$xa3 + vmovdqa 0x100-0x100(%rcx),$xb0 + vmovdqa 0x120-0x100(%rcx),$xb1 + vmovdqa 0x140-0x100(%rcx),$xb2 + vmovdqa 0x160-0x100(%rcx),$xb3 + vmovdqa 0x180-0x200(%rax),$xt0 # "xc0" + vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1" + vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2" + vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3" + vmovdqa 0x200-0x200(%rax),$xd0 + vmovdqa 0x220-0x200(%rax),$xd1 + vmovdqa 0x240-0x200(%rax),$xd2 + vmovdqa 0x260-0x200(%rax),$xd3 + vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters + +.Loop_enter8x: + vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]" + vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]" + vbroadcasti128 (%r10),$xt3 + vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters + mov \$10,%eax + jmp .Loop8x + +.align 32 +.Loop8x: +___ + foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } + foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + dec %eax + jnz .Loop8x + + lea 0x200(%rsp),%rax # size optimization + vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key + vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 + vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 + vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 + + vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data + vpunpckldq $xa3,$xa2,$xt3 + vpunpckhdq $xa1,$xa0,$xa0 + vpunpckhdq $xa3,$xa2,$xa2 + vpunpcklqdq $xt3,$xt2,$xa1 # "a0" + vpunpckhqdq $xt3,$xt2,$xt2 # "a1" + vpunpcklqdq $xa2,$xa0,$xa3 # "a2" + vpunpckhqdq $xa2,$xa0,$xa0 # "a3" +___ + ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); +$code.=<<___; + vpaddd 0x100-0x100(%rcx),$xb0,$xb0 + vpaddd 0x120-0x100(%rcx),$xb1,$xb1 + vpaddd 0x140-0x100(%rcx),$xb2,$xb2 + vpaddd 0x160-0x100(%rcx),$xb3,$xb3 + + vpunpckldq $xb1,$xb0,$xt2 + vpunpckldq $xb3,$xb2,$xt3 + vpunpckhdq $xb1,$xb0,$xb0 + vpunpckhdq $xb3,$xb2,$xb2 + vpunpcklqdq $xt3,$xt2,$xb1 # "b0" + vpunpckhqdq $xt3,$xt2,$xt2 # "b1" + vpunpcklqdq $xb2,$xb0,$xb3 # "b2" + vpunpckhqdq $xb2,$xb0,$xb0 # "b3" +___ + ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); +$code.=<<___; + vperm2i128 \$0x20,$xb0,$xa0,$xt3 
# "de-interlace" further + vperm2i128 \$0x31,$xb0,$xa0,$xb0 + vperm2i128 \$0x20,$xb1,$xa1,$xa0 + vperm2i128 \$0x31,$xb1,$xa1,$xb1 + vperm2i128 \$0x20,$xb2,$xa2,$xa1 + vperm2i128 \$0x31,$xb2,$xa2,$xb2 + vperm2i128 \$0x20,$xb3,$xa3,$xa2 + vperm2i128 \$0x31,$xb3,$xa3,$xb3 +___ + ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); + my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); +$code.=<<___; + vmovdqa $xa0,0x00(%rsp) # offload $xaN + vmovdqa $xa1,0x20(%rsp) + vmovdqa 0x40(%rsp),$xc2 # $xa0 + vmovdqa 0x60(%rsp),$xc3 # $xa1 + + vpaddd 0x180-0x200(%rax),$xc0,$xc0 + vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 + vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 + vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 + + vpunpckldq $xc1,$xc0,$xt2 + vpunpckldq $xc3,$xc2,$xt3 + vpunpckhdq $xc1,$xc0,$xc0 + vpunpckhdq $xc3,$xc2,$xc2 + vpunpcklqdq $xt3,$xt2,$xc1 # "c0" + vpunpckhqdq $xt3,$xt2,$xt2 # "c1" + vpunpcklqdq $xc2,$xc0,$xc3 # "c2" + vpunpckhqdq $xc2,$xc0,$xc0 # "c3" +___ + ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); +$code.=<<___; + vpaddd 0x200-0x200(%rax),$xd0,$xd0 + vpaddd 0x220-0x200(%rax),$xd1,$xd1 + vpaddd 0x240-0x200(%rax),$xd2,$xd2 + vpaddd 0x260-0x200(%rax),$xd3,$xd3 + + vpunpckldq $xd1,$xd0,$xt2 + vpunpckldq $xd3,$xd2,$xt3 + vpunpckhdq $xd1,$xd0,$xd0 + vpunpckhdq $xd3,$xd2,$xd2 + vpunpcklqdq $xt3,$xt2,$xd1 # "d0" + vpunpckhqdq $xt3,$xt2,$xt2 # "d1" + vpunpcklqdq $xd2,$xd0,$xd3 # "d2" + vpunpckhqdq $xd2,$xd0,$xd0 # "d3" +___ + ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); +$code.=<<___; + vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further + vperm2i128 \$0x31,$xd0,$xc0,$xd0 + vperm2i128 \$0x20,$xd1,$xc1,$xc0 + vperm2i128 \$0x31,$xd1,$xc1,$xd1 + vperm2i128 \$0x20,$xd2,$xc2,$xc1 + vperm2i128 \$0x31,$xd2,$xc2,$xd2 + vperm2i128 \$0x20,$xd3,$xc3,$xc2 + vperm2i128 \$0x31,$xd3,$xc3,$xd3 +___ + ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); + ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= + ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); + ($xa0,$xa1)=($xt2,$xt3); +$code.=<<___; + vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? 
+ vmovdqa 0x20(%rsp),$xa1 + + cmp \$64*8,$len + jb .Ltail8x + + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + lea 0x80($inp),$inp # size optimization + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + lea 0x80($out),$out # size optimization + + vpxor 0x00($inp),$xa1,$xa1 + vpxor 0x20($inp),$xb1,$xb1 + vpxor 0x40($inp),$xc1,$xc1 + vpxor 0x60($inp),$xd1,$xd1 + lea 0x80($inp),$inp # size optimization + vmovdqu $xa1,0x00($out) + vmovdqu $xb1,0x20($out) + vmovdqu $xc1,0x40($out) + vmovdqu $xd1,0x60($out) + lea 0x80($out),$out # size optimization + + vpxor 0x00($inp),$xa2,$xa2 + vpxor 0x20($inp),$xb2,$xb2 + vpxor 0x40($inp),$xc2,$xc2 + vpxor 0x60($inp),$xd2,$xd2 + lea 0x80($inp),$inp # size optimization + vmovdqu $xa2,0x00($out) + vmovdqu $xb2,0x20($out) + vmovdqu $xc2,0x40($out) + vmovdqu $xd2,0x60($out) + lea 0x80($out),$out # size optimization + + vpxor 0x00($inp),$xa3,$xa3 + vpxor 0x20($inp),$xb3,$xb3 + vpxor 0x40($inp),$xc3,$xc3 + vpxor 0x60($inp),$xd3,$xd3 + lea 0x80($inp),$inp # size optimization + vmovdqu $xa3,0x00($out) + vmovdqu $xb3,0x20($out) + vmovdqu $xc3,0x40($out) + vmovdqu $xd3,0x60($out) + lea 0x80($out),$out # size optimization + + sub \$64*8,$len + jnz .Loop_outer8x + + jmp .Ldone8x + +.Ltail8x: + cmp \$448,$len + jae .L448_or_more8x + cmp \$384,$len + jae .L384_or_more8x + cmp \$320,$len + jae .L320_or_more8x + cmp \$256,$len + jae .L256_or_more8x + cmp \$192,$len + jae .L192_or_more8x + cmp \$128,$len + jae .L128_or_more8x + cmp \$64,$len + jae .L64_or_more8x + + xor %r10,%r10 + vmovdqa $xa0,0x00(%rsp) + vmovdqa $xb0,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L64_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + je .Ldone8x + + lea 0x40($inp),$inp # inp+=64*1 + xor %r10,%r10 + vmovdqa $xc0,0x00(%rsp) + lea 0x40($out),$out # out+=64*1 + sub \$64,$len # len-=64*1 + vmovdqa $xd0,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L128_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + je .Ldone8x + + lea 0x80($inp),$inp # inp+=64*2 + xor %r10,%r10 + vmovdqa $xa1,0x00(%rsp) + lea 0x80($out),$out # out+=64*2 + sub \$128,$len # len-=64*2 + vmovdqa $xb1,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L192_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + je .Ldone8x + + lea 0xc0($inp),$inp # inp+=64*3 + xor %r10,%r10 + vmovdqa $xc1,0x00(%rsp) + lea 0xc0($out),$out # out+=64*3 + sub \$192,$len # len-=64*3 + vmovdqa $xd1,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L256_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vpxor 0xc0($inp),$xc1,$xc1 + vpxor 0xe0($inp),$xd1,$xd1 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu 
$xb1,0xa0($out) + vmovdqu $xc1,0xc0($out) + vmovdqu $xd1,0xe0($out) + je .Ldone8x + + lea 0x100($inp),$inp # inp+=64*4 + xor %r10,%r10 + vmovdqa $xa2,0x00(%rsp) + lea 0x100($out),$out # out+=64*4 + sub \$256,$len # len-=64*4 + vmovdqa $xb2,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L320_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vpxor 0xc0($inp),$xc1,$xc1 + vpxor 0xe0($inp),$xd1,$xd1 + vpxor 0x100($inp),$xa2,$xa2 + vpxor 0x120($inp),$xb2,$xb2 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + vmovdqu $xc1,0xc0($out) + vmovdqu $xd1,0xe0($out) + vmovdqu $xa2,0x100($out) + vmovdqu $xb2,0x120($out) + je .Ldone8x + + lea 0x140($inp),$inp # inp+=64*5 + xor %r10,%r10 + vmovdqa $xc2,0x00(%rsp) + lea 0x140($out),$out # out+=64*5 + sub \$320,$len # len-=64*5 + vmovdqa $xd2,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L384_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vpxor 0xc0($inp),$xc1,$xc1 + vpxor 0xe0($inp),$xd1,$xd1 + vpxor 0x100($inp),$xa2,$xa2 + vpxor 0x120($inp),$xb2,$xb2 + vpxor 0x140($inp),$xc2,$xc2 + vpxor 0x160($inp),$xd2,$xd2 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + vmovdqu $xc1,0xc0($out) + vmovdqu $xd1,0xe0($out) + vmovdqu $xa2,0x100($out) + vmovdqu $xb2,0x120($out) + vmovdqu $xc2,0x140($out) + vmovdqu $xd2,0x160($out) + je .Ldone8x + + lea 0x180($inp),$inp # inp+=64*6 + xor %r10,%r10 + vmovdqa $xa3,0x00(%rsp) + lea 0x180($out),$out # out+=64*6 + sub \$384,$len # len-=64*6 + vmovdqa $xb3,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L448_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vpxor 0xc0($inp),$xc1,$xc1 + vpxor 0xe0($inp),$xd1,$xd1 + vpxor 0x100($inp),$xa2,$xa2 + vpxor 0x120($inp),$xb2,$xb2 + vpxor 0x140($inp),$xc2,$xc2 + vpxor 0x160($inp),$xd2,$xd2 + vpxor 0x180($inp),$xa3,$xa3 + vpxor 0x1a0($inp),$xb3,$xb3 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + vmovdqu $xc1,0xc0($out) + vmovdqu $xd1,0xe0($out) + vmovdqu $xa2,0x100($out) + vmovdqu $xb2,0x120($out) + vmovdqu $xc2,0x140($out) + vmovdqu $xd2,0x160($out) + vmovdqu $xa3,0x180($out) + vmovdqu $xb3,0x1a0($out) + je .Ldone8x + + lea 0x1c0($inp),$inp # inp+=64*7 + xor %r10,%r10 + vmovdqa $xc3,0x00(%rsp) + lea 0x1c0($out),$out # out+=64*7 + sub \$448,$len # len-=64*7 + vmovdqa $xd3,0x20(%rsp) + +.Loop_tail8x: + movzb ($inp,%r10),%eax + movzb (%rsp,%r10),%ecx + lea 1(%r10),%r10 + xor %ecx,%eax + mov %al,-1($out,%r10) + dec $len + jnz .Loop_tail8x + +.Ldone8x: + vzeroall +___ +$code.=<<___ if ($win64); + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 +___ +$code.=<<___; + lea (%r9),%rsp +.L8x_epilogue: + ret +.size 
ChaCha20_8x,.-ChaCha20_8x +___ +} + +######################################################################## +# AVX512 code paths +if ($avx>2) { +# This one handles shorter inputs... + +my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20)); +my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); + +sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round + &vpaddd ($a,$a,$b); + &vpxord ($d,$d,$a); + &vprold ($d,$d,16); + + &vpaddd ($c,$c,$d); + &vpxord ($b,$b,$c); + &vprold ($b,$b,12); + + &vpaddd ($a,$a,$b); + &vpxord ($d,$d,$a); + &vprold ($d,$d,8); + + &vpaddd ($c,$c,$d); + &vpxord ($b,$b,$c); + &vprold ($b,$b,7); +} + +my $xframe = $win64 ? 32+8 : 8; + +$code.=<<___; +.type ChaCha20_avx512,\@function,5 +.align 32 +ChaCha20_avx512: +.LChaCha20_avx512: + mov %rsp,%r9 # frame pointer + cmp \$512,$len + ja .LChaCha20_16x + + sub \$64+$xframe,%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,-0x28(%r9) + movaps %xmm7,-0x18(%r9) +.Lavx512_body: +___ +$code.=<<___; + vbroadcasti32x4 .Lsigma(%rip),$a + vbroadcasti32x4 ($key),$b + vbroadcasti32x4 16($key),$c + vbroadcasti32x4 ($counter),$d + + vmovdqa32 $a,$a_ + vmovdqa32 $b,$b_ + vmovdqa32 $c,$c_ + vpaddd .Lzeroz(%rip),$d,$d + vmovdqa32 .Lfourz(%rip),$fourz + mov \$10,$counter # reuse $counter + vmovdqa32 $d,$d_ + jmp .Loop_avx512 + +.align 16 +.Loop_outer_avx512: + vmovdqa32 $a_,$a + vmovdqa32 $b_,$b + vmovdqa32 $c_,$c + vpaddd $fourz,$d_,$d + mov \$10,$counter + vmovdqa32 $d,$d_ + jmp .Loop_avx512 + +.align 32 +.Loop_avx512: +___ + &AVX512ROUND(); + &vpshufd ($c,$c,0b01001110); + &vpshufd ($b,$b,0b00111001); + &vpshufd ($d,$d,0b10010011); + + &AVX512ROUND(); + &vpshufd ($c,$c,0b01001110); + &vpshufd ($b,$b,0b10010011); + &vpshufd ($d,$d,0b00111001); + + &dec ($counter); + &jnz (".Loop_avx512"); + +$code.=<<___; + vpaddd $a_,$a,$a + vpaddd $b_,$b,$b + vpaddd $c_,$c,$c + vpaddd $d_,$d,$d + + sub \$64,$len + jb .Ltail64_avx512 + + vpxor 0x00($inp),%x#$a,$t0 # xor with input + vpxor 0x10($inp),%x#$b,$t1 + vpxor 0x20($inp),%x#$c,$t2 + vpxor 0x30($inp),%x#$d,$t3 + lea 0x40($inp),$inp # inp+=64 + + vmovdqu $t0,0x00($out) # write output + vmovdqu $t1,0x10($out) + vmovdqu $t2,0x20($out) + vmovdqu $t3,0x30($out) + lea 0x40($out),$out # out+=64 + + jz .Ldone_avx512 + + vextracti32x4 \$1,$a,$t0 + vextracti32x4 \$1,$b,$t1 + vextracti32x4 \$1,$c,$t2 + vextracti32x4 \$1,$d,$t3 + + sub \$64,$len + jb .Ltail_avx512 + + vpxor 0x00($inp),$t0,$t0 # xor with input + vpxor 0x10($inp),$t1,$t1 + vpxor 0x20($inp),$t2,$t2 + vpxor 0x30($inp),$t3,$t3 + lea 0x40($inp),$inp # inp+=64 + + vmovdqu $t0,0x00($out) # write output + vmovdqu $t1,0x10($out) + vmovdqu $t2,0x20($out) + vmovdqu $t3,0x30($out) + lea 0x40($out),$out # out+=64 + + jz .Ldone_avx512 + + vextracti32x4 \$2,$a,$t0 + vextracti32x4 \$2,$b,$t1 + vextracti32x4 \$2,$c,$t2 + vextracti32x4 \$2,$d,$t3 + + sub \$64,$len + jb .Ltail_avx512 + + vpxor 0x00($inp),$t0,$t0 # xor with input + vpxor 0x10($inp),$t1,$t1 + vpxor 0x20($inp),$t2,$t2 + vpxor 0x30($inp),$t3,$t3 + lea 0x40($inp),$inp # inp+=64 + + vmovdqu $t0,0x00($out) # write output + vmovdqu $t1,0x10($out) + vmovdqu $t2,0x20($out) + vmovdqu $t3,0x30($out) + lea 0x40($out),$out # out+=64 + + jz .Ldone_avx512 + + vextracti32x4 \$3,$a,$t0 + vextracti32x4 \$3,$b,$t1 + vextracti32x4 \$3,$c,$t2 + vextracti32x4 \$3,$d,$t3 + + sub \$64,$len + jb .Ltail_avx512 + + vpxor 0x00($inp),$t0,$t0 # xor with input + vpxor 0x10($inp),$t1,$t1 + vpxor 0x20($inp),$t2,$t2 + vpxor 0x30($inp),$t3,$t3 + lea 0x40($inp),$inp # inp+=64 + + vmovdqu $t0,0x00($out) # write 
output + vmovdqu $t1,0x10($out) + vmovdqu $t2,0x20($out) + vmovdqu $t3,0x30($out) + lea 0x40($out),$out # out+=64 + + jnz .Loop_outer_avx512 + + jmp .Ldone_avx512 + +.align 16 +.Ltail64_avx512: + vmovdqa %x#$a,0x00(%rsp) + vmovdqa %x#$b,0x10(%rsp) + vmovdqa %x#$c,0x20(%rsp) + vmovdqa %x#$d,0x30(%rsp) + add \$64,$len + jmp .Loop_tail_avx512 + +.align 16 +.Ltail_avx512: + vmovdqa $t0,0x00(%rsp) + vmovdqa $t1,0x10(%rsp) + vmovdqa $t2,0x20(%rsp) + vmovdqa $t3,0x30(%rsp) + add \$64,$len + +.Loop_tail_avx512: + movzb ($inp,$counter),%eax + movzb (%rsp,$counter),%ecx + lea 1($counter),$counter + xor %ecx,%eax + mov %al,-1($out,$counter) + dec $len + jnz .Loop_tail_avx512 + + vmovdqa32 $a_,0x00(%rsp) + +.Ldone_avx512: + vzeroall +___ +$code.=<<___ if ($win64); + movaps -0x28(%r9),%xmm6 + movaps -0x18(%r9),%xmm7 +___ +$code.=<<___; + lea (%r9),%rsp +.Lavx512_epilogue: + ret +.size ChaCha20_avx512,.-ChaCha20_avx512 +___ +} +if ($avx>2) { +# This one handles longer inputs... + +my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15)); +my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); +my @key=map("%zmm$_",(16..31)); +my ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; + +sub AVX512_lane_ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my @x=map("\"$_\"",@xx); + + ( + "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 + "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 + "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 + "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 + "&vpxord (@x[$d0],@x[$d0],@x[$a0])", + "&vpxord (@x[$d1],@x[$d1],@x[$a1])", + "&vpxord (@x[$d2],@x[$d2],@x[$a2])", + "&vpxord (@x[$d3],@x[$d3],@x[$a3])", + "&vprold (@x[$d0],@x[$d0],16)", + "&vprold (@x[$d1],@x[$d1],16)", + "&vprold (@x[$d2],@x[$d2],16)", + "&vprold (@x[$d3],@x[$d3],16)", + + "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", + "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", + "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", + "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", + "&vpxord (@x[$b0],@x[$b0],@x[$c0])", + "&vpxord (@x[$b1],@x[$b1],@x[$c1])", + "&vpxord (@x[$b2],@x[$b2],@x[$c2])", + "&vpxord (@x[$b3],@x[$b3],@x[$c3])", + "&vprold (@x[$b0],@x[$b0],12)", + "&vprold (@x[$b1],@x[$b1],12)", + "&vprold (@x[$b2],@x[$b2],12)", + "&vprold (@x[$b3],@x[$b3],12)", + + "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", + "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", + "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", + "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", + "&vpxord (@x[$d0],@x[$d0],@x[$a0])", + "&vpxord (@x[$d1],@x[$d1],@x[$a1])", + "&vpxord (@x[$d2],@x[$d2],@x[$a2])", + "&vpxord (@x[$d3],@x[$d3],@x[$a3])", + "&vprold (@x[$d0],@x[$d0],8)", + "&vprold (@x[$d1],@x[$d1],8)", + "&vprold (@x[$d2],@x[$d2],8)", + "&vprold (@x[$d3],@x[$d3],8)", + + "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", + "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", + "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", + "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", + "&vpxord (@x[$b0],@x[$b0],@x[$c0])", + "&vpxord (@x[$b1],@x[$b1],@x[$c1])", + "&vpxord (@x[$b2],@x[$b2],@x[$c2])", + "&vpxord (@x[$b3],@x[$b3],@x[$c3])", + "&vprold (@x[$b0],@x[$b0],7)", + "&vprold (@x[$b1],@x[$b1],7)", + "&vprold (@x[$b2],@x[$b2],7)", + "&vprold (@x[$b3],@x[$b3],7)" + ); +} + +my $xframe = $win64 ? 
0xa8 : 8; + +$code.=<<___; +.type ChaCha20_16x,\@function,5 +.align 32 +ChaCha20_16x: +.LChaCha20_16x: + mov %rsp,%r9 # frame register + sub \$64+$xframe,%rsp + and \$-64,%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L16x_body: +___ +$code.=<<___; + vzeroupper + + lea .Lsigma(%rip),%r10 + vbroadcasti32x4 (%r10),$xa3 # key[0] + vbroadcasti32x4 ($key),$xb3 # key[1] + vbroadcasti32x4 16($key),$xc3 # key[2] + vbroadcasti32x4 ($counter),$xd3 # key[3] + + vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... + vpshufd \$0x55,$xa3,$xa1 + vpshufd \$0xaa,$xa3,$xa2 + vpshufd \$0xff,$xa3,$xa3 + vmovdqa64 $xa0,@key[0] + vmovdqa64 $xa1,@key[1] + vmovdqa64 $xa2,@key[2] + vmovdqa64 $xa3,@key[3] + + vpshufd \$0x00,$xb3,$xb0 + vpshufd \$0x55,$xb3,$xb1 + vpshufd \$0xaa,$xb3,$xb2 + vpshufd \$0xff,$xb3,$xb3 + vmovdqa64 $xb0,@key[4] + vmovdqa64 $xb1,@key[5] + vmovdqa64 $xb2,@key[6] + vmovdqa64 $xb3,@key[7] + + vpshufd \$0x00,$xc3,$xc0 + vpshufd \$0x55,$xc3,$xc1 + vpshufd \$0xaa,$xc3,$xc2 + vpshufd \$0xff,$xc3,$xc3 + vmovdqa64 $xc0,@key[8] + vmovdqa64 $xc1,@key[9] + vmovdqa64 $xc2,@key[10] + vmovdqa64 $xc3,@key[11] + + vpshufd \$0x00,$xd3,$xd0 + vpshufd \$0x55,$xd3,$xd1 + vpshufd \$0xaa,$xd3,$xd2 + vpshufd \$0xff,$xd3,$xd3 + vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet + vmovdqa64 $xd0,@key[12] + vmovdqa64 $xd1,@key[13] + vmovdqa64 $xd2,@key[14] + vmovdqa64 $xd3,@key[15] + + mov \$10,%eax + jmp .Loop16x + +.align 32 +.Loop_outer16x: + vpbroadcastd 0(%r10),$xa0 # reload key + vpbroadcastd 4(%r10),$xa1 + vpbroadcastd 8(%r10),$xa2 + vpbroadcastd 12(%r10),$xa3 + vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters + vmovdqa64 @key[4],$xb0 + vmovdqa64 @key[5],$xb1 + vmovdqa64 @key[6],$xb2 + vmovdqa64 @key[7],$xb3 + vmovdqa64 @key[8],$xc0 + vmovdqa64 @key[9],$xc1 + vmovdqa64 @key[10],$xc2 + vmovdqa64 @key[11],$xc3 + vmovdqa64 @key[12],$xd0 + vmovdqa64 @key[13],$xd1 + vmovdqa64 @key[14],$xd2 + vmovdqa64 @key[15],$xd3 + + vmovdqa64 $xa0,@key[0] + vmovdqa64 $xa1,@key[1] + vmovdqa64 $xa2,@key[2] + vmovdqa64 $xa3,@key[3] + + mov \$10,%eax + jmp .Loop16x + +.align 32 +.Loop16x: +___ + foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } + foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + dec %eax + jnz .Loop16x + + vpaddd @key[0],$xa0,$xa0 # accumulate key + vpaddd @key[1],$xa1,$xa1 + vpaddd @key[2],$xa2,$xa2 + vpaddd @key[3],$xa3,$xa3 + + vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data + vpunpckldq $xa3,$xa2,$xt3 + vpunpckhdq $xa1,$xa0,$xa0 + vpunpckhdq $xa3,$xa2,$xa2 + vpunpcklqdq $xt3,$xt2,$xa1 # "a0" + vpunpckhqdq $xt3,$xt2,$xt2 # "a1" + vpunpcklqdq $xa2,$xa0,$xa3 # "a2" + vpunpckhqdq $xa2,$xa0,$xa0 # "a3" +___ + ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); +$code.=<<___; + vpaddd @key[4],$xb0,$xb0 + vpaddd @key[5],$xb1,$xb1 + vpaddd @key[6],$xb2,$xb2 + vpaddd @key[7],$xb3,$xb3 + + vpunpckldq $xb1,$xb0,$xt2 + vpunpckldq $xb3,$xb2,$xt3 + vpunpckhdq $xb1,$xb0,$xb0 + vpunpckhdq $xb3,$xb2,$xb2 + vpunpcklqdq $xt3,$xt2,$xb1 # "b0" + vpunpckhqdq $xt3,$xt2,$xt2 # "b1" + vpunpcklqdq $xb2,$xb0,$xb3 # "b2" + vpunpckhqdq $xb2,$xb0,$xb0 # "b3" +___ + ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); +$code.=<<___; + vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further + vshufi32x4 \$0xee,$xb0,$xa0,$xb0 + vshufi32x4 
\$0x44,$xb1,$xa1,$xa0 + vshufi32x4 \$0xee,$xb1,$xa1,$xb1 + vshufi32x4 \$0x44,$xb2,$xa2,$xa1 + vshufi32x4 \$0xee,$xb2,$xa2,$xb2 + vshufi32x4 \$0x44,$xb3,$xa3,$xa2 + vshufi32x4 \$0xee,$xb3,$xa3,$xb3 +___ + ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); +$code.=<<___; + vpaddd @key[8],$xc0,$xc0 + vpaddd @key[9],$xc1,$xc1 + vpaddd @key[10],$xc2,$xc2 + vpaddd @key[11],$xc3,$xc3 + + vpunpckldq $xc1,$xc0,$xt2 + vpunpckldq $xc3,$xc2,$xt3 + vpunpckhdq $xc1,$xc0,$xc0 + vpunpckhdq $xc3,$xc2,$xc2 + vpunpcklqdq $xt3,$xt2,$xc1 # "c0" + vpunpckhqdq $xt3,$xt2,$xt2 # "c1" + vpunpcklqdq $xc2,$xc0,$xc3 # "c2" + vpunpckhqdq $xc2,$xc0,$xc0 # "c3" +___ + ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); +$code.=<<___; + vpaddd @key[12],$xd0,$xd0 + vpaddd @key[13],$xd1,$xd1 + vpaddd @key[14],$xd2,$xd2 + vpaddd @key[15],$xd3,$xd3 + + vpunpckldq $xd1,$xd0,$xt2 + vpunpckldq $xd3,$xd2,$xt3 + vpunpckhdq $xd1,$xd0,$xd0 + vpunpckhdq $xd3,$xd2,$xd2 + vpunpcklqdq $xt3,$xt2,$xd1 # "d0" + vpunpckhqdq $xt3,$xt2,$xt2 # "d1" + vpunpcklqdq $xd2,$xd0,$xd3 # "d2" + vpunpckhqdq $xd2,$xd0,$xd0 # "d3" +___ + ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); +$code.=<<___; + vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further + vshufi32x4 \$0xee,$xd0,$xc0,$xd0 + vshufi32x4 \$0x44,$xd1,$xc1,$xc0 + vshufi32x4 \$0xee,$xd1,$xc1,$xd1 + vshufi32x4 \$0x44,$xd2,$xc2,$xc1 + vshufi32x4 \$0xee,$xd2,$xc2,$xd2 + vshufi32x4 \$0x44,$xd3,$xc3,$xc2 + vshufi32x4 \$0xee,$xd3,$xc3,$xd3 +___ + ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); +$code.=<<___; + vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further + vshufi32x4 \$0xdd,$xc0,$xa0,$xa0 + vshufi32x4 \$0x88,$xd0,$xb0,$xc0 + vshufi32x4 \$0xdd,$xd0,$xb0,$xd0 + vshufi32x4 \$0x88,$xc1,$xa1,$xt1 + vshufi32x4 \$0xdd,$xc1,$xa1,$xa1 + vshufi32x4 \$0x88,$xd1,$xb1,$xc1 + vshufi32x4 \$0xdd,$xd1,$xb1,$xd1 + vshufi32x4 \$0x88,$xc2,$xa2,$xt2 + vshufi32x4 \$0xdd,$xc2,$xa2,$xa2 + vshufi32x4 \$0x88,$xd2,$xb2,$xc2 + vshufi32x4 \$0xdd,$xd2,$xb2,$xd2 + vshufi32x4 \$0x88,$xc3,$xa3,$xt3 + vshufi32x4 \$0xdd,$xc3,$xa3,$xa3 + vshufi32x4 \$0x88,$xd3,$xb3,$xc3 + vshufi32x4 \$0xdd,$xd3,$xb3,$xd3 +___ + ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)= + ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3); + + ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1, + $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) = + ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); +$code.=<<___; + cmp \$64*16,$len + jb .Ltail16x + + vpxord 0x00($inp),$xa0,$xa0 # xor with input + vpxord 0x40($inp),$xb0,$xb0 + vpxord 0x80($inp),$xc0,$xc0 + vpxord 0xc0($inp),$xd0,$xd0 + vmovdqu32 $xa0,0x00($out) + vmovdqu32 $xb0,0x40($out) + vmovdqu32 $xc0,0x80($out) + vmovdqu32 $xd0,0xc0($out) + + vpxord 0x100($inp),$xa1,$xa1 + vpxord 0x140($inp),$xb1,$xb1 + vpxord 0x180($inp),$xc1,$xc1 + vpxord 0x1c0($inp),$xd1,$xd1 + vmovdqu32 $xa1,0x100($out) + vmovdqu32 $xb1,0x140($out) + vmovdqu32 $xc1,0x180($out) + vmovdqu32 $xd1,0x1c0($out) + + vpxord 0x200($inp),$xa2,$xa2 + vpxord 0x240($inp),$xb2,$xb2 + vpxord 0x280($inp),$xc2,$xc2 + vpxord 0x2c0($inp),$xd2,$xd2 + vmovdqu32 $xa2,0x200($out) + vmovdqu32 $xb2,0x240($out) + vmovdqu32 $xc2,0x280($out) + vmovdqu32 $xd2,0x2c0($out) + + vpxord 0x300($inp),$xa3,$xa3 + vpxord 0x340($inp),$xb3,$xb3 + vpxord 0x380($inp),$xc3,$xc3 + vpxord 0x3c0($inp),$xd3,$xd3 + lea 0x400($inp),$inp + vmovdqu32 $xa3,0x300($out) + vmovdqu32 $xb3,0x340($out) + vmovdqu32 $xc3,0x380($out) + vmovdqu32 $xd3,0x3c0($out) + lea 0x400($out),$out + + sub \$64*16,$len + jnz .Loop_outer16x + + jmp .Ldone16x + +.align 
32 +.Ltail16x: + xor %r10,%r10 + sub $inp,$out + cmp \$64*1,$len + jb .Less_than_64_16x + vpxord ($inp),$xa0,$xa0 # xor with input + vmovdqu32 $xa0,($out,$inp) + je .Ldone16x + vmovdqa32 $xb0,$xa0 + lea 64($inp),$inp + + cmp \$64*2,$len + jb .Less_than_64_16x + vpxord ($inp),$xb0,$xb0 + vmovdqu32 $xb0,($out,$inp) + je .Ldone16x + vmovdqa32 $xc0,$xa0 + lea 64($inp),$inp + + cmp \$64*3,$len + jb .Less_than_64_16x + vpxord ($inp),$xc0,$xc0 + vmovdqu32 $xc0,($out,$inp) + je .Ldone16x + vmovdqa32 $xd0,$xa0 + lea 64($inp),$inp + + cmp \$64*4,$len + jb .Less_than_64_16x + vpxord ($inp),$xd0,$xd0 + vmovdqu32 $xd0,($out,$inp) + je .Ldone16x + vmovdqa32 $xa1,$xa0 + lea 64($inp),$inp + + cmp \$64*5,$len + jb .Less_than_64_16x + vpxord ($inp),$xa1,$xa1 + vmovdqu32 $xa1,($out,$inp) + je .Ldone16x + vmovdqa32 $xb1,$xa0 + lea 64($inp),$inp + + cmp \$64*6,$len + jb .Less_than_64_16x + vpxord ($inp),$xb1,$xb1 + vmovdqu32 $xb1,($out,$inp) + je .Ldone16x + vmovdqa32 $xc1,$xa0 + lea 64($inp),$inp + + cmp \$64*7,$len + jb .Less_than_64_16x + vpxord ($inp),$xc1,$xc1 + vmovdqu32 $xc1,($out,$inp) + je .Ldone16x + vmovdqa32 $xd1,$xa0 + lea 64($inp),$inp + + cmp \$64*8,$len + jb .Less_than_64_16x + vpxord ($inp),$xd1,$xd1 + vmovdqu32 $xd1,($out,$inp) + je .Ldone16x + vmovdqa32 $xa2,$xa0 + lea 64($inp),$inp + + cmp \$64*9,$len + jb .Less_than_64_16x + vpxord ($inp),$xa2,$xa2 + vmovdqu32 $xa2,($out,$inp) + je .Ldone16x + vmovdqa32 $xb2,$xa0 + lea 64($inp),$inp + + cmp \$64*10,$len + jb .Less_than_64_16x + vpxord ($inp),$xb2,$xb2 + vmovdqu32 $xb2,($out,$inp) + je .Ldone16x + vmovdqa32 $xc2,$xa0 + lea 64($inp),$inp + + cmp \$64*11,$len + jb .Less_than_64_16x + vpxord ($inp),$xc2,$xc2 + vmovdqu32 $xc2,($out,$inp) + je .Ldone16x + vmovdqa32 $xd2,$xa0 + lea 64($inp),$inp + + cmp \$64*12,$len + jb .Less_than_64_16x + vpxord ($inp),$xd2,$xd2 + vmovdqu32 $xd2,($out,$inp) + je .Ldone16x + vmovdqa32 $xa3,$xa0 + lea 64($inp),$inp + + cmp \$64*13,$len + jb .Less_than_64_16x + vpxord ($inp),$xa3,$xa3 + vmovdqu32 $xa3,($out,$inp) + je .Ldone16x + vmovdqa32 $xb3,$xa0 + lea 64($inp),$inp + + cmp \$64*14,$len + jb .Less_than_64_16x + vpxord ($inp),$xb3,$xb3 + vmovdqu32 $xb3,($out,$inp) + je .Ldone16x + vmovdqa32 $xc3,$xa0 + lea 64($inp),$inp + + cmp \$64*15,$len + jb .Less_than_64_16x + vpxord ($inp),$xc3,$xc3 + vmovdqu32 $xc3,($out,$inp) + je .Ldone16x + vmovdqa32 $xd3,$xa0 + lea 64($inp),$inp + +.Less_than_64_16x: + vmovdqa32 $xa0,0x00(%rsp) + lea ($out,$inp),$out + and \$63,$len + +.Loop_tail16x: + movzb ($inp,%r10),%eax + movzb (%rsp,%r10),%ecx + lea 1(%r10),%r10 + xor %ecx,%eax + mov %al,-1($out,%r10) + dec $len + jnz .Loop_tail16x + + vpxord $xa0,$xa0,$xa0 + vmovdqa32 $xa0,0(%rsp) + +.Ldone16x: + vzeroall +___ +$code.=<<___ if ($win64); + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 +___ +$code.=<<___; + lea (%r9),%rsp +.L16x_epilogue: + ret +.size ChaCha20_16x,.-ChaCha20_16x +___ +} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 
120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + lea .Lctr32_body(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + lea .Lno_data(%rip),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lcommon_seh_tail + + lea 64+24+48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R14 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.type ssse3_handler,\@abi-omnipotent +.align 16 +ssse3_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 192($context),%rax # pull context->R9 + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea -0x28(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$4,%ecx + .long 0xa548f3fc # cld; rep movsq + + jmp .Lcommon_seh_tail +.size ssse3_handler,.-ssse3_handler + +.type full_handler,\@abi-omnipotent +.align 16 +full_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 192($context),%rax # pull context->R9 + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea -0xa8(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + 
.long 0xa548f3fc # cld; rep movsq + + jmp .Lcommon_seh_tail +.size full_handler,.-full_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_ChaCha20_ctr32 + .rva .LSEH_end_ChaCha20_ctr32 + .rva .LSEH_info_ChaCha20_ctr32 + + .rva .LSEH_begin_ChaCha20_ssse3 + .rva .LSEH_end_ChaCha20_ssse3 + .rva .LSEH_info_ChaCha20_ssse3 + + .rva .LSEH_begin_ChaCha20_4x + .rva .LSEH_end_ChaCha20_4x + .rva .LSEH_info_ChaCha20_4x +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_ChaCha20_8x + .rva .LSEH_end_ChaCha20_8x + .rva .LSEH_info_ChaCha20_8x +___ +$code.=<<___ if ($avx>2); + .rva .LSEH_begin_ChaCha20_avx512 + .rva .LSEH_end_ChaCha20_avx512 + .rva .LSEH_info_ChaCha20_avx512 + + .rva .LSEH_begin_ChaCha20_16x + .rva .LSEH_end_ChaCha20_16x + .rva .LSEH_info_ChaCha20_16x +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_ChaCha20_ctr32: + .byte 9,0,0,0 + .rva se_handler + +.LSEH_info_ChaCha20_ssse3: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lssse3_body,.Lssse3_epilogue + +.LSEH_info_ChaCha20_4x: + .byte 9,0,0,0 + .rva full_handler + .rva .L4x_body,.L4x_epilogue +___ +$code.=<<___ if ($avx>1); +.LSEH_info_ChaCha20_8x: + .byte 9,0,0,0 + .rva full_handler + .rva .L8x_body,.L8x_epilogue # HandlerData[] +___ +$code.=<<___ if ($avx>2); +.LSEH_info_ChaCha20_avx512: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[] + +.LSEH_info_ChaCha20_16x: + .byte 9,0,0,0 + .rva full_handler + .rva .L16x_body,.L16x_epilogue # HandlerData[] +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/%x#%[yz]/%x/g; # "down-shift" + + print $_,"\n"; +} + +close STDOUT; diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/chacha.c b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/chacha.c new file mode 100644 index 000000000..fe32596a2 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/chacha.c @@ -0,0 +1,167 @@ +/* Copyright (c) 2014, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +/* Adapted from the public domain, estream code by D. Bernstein. */ + +#include <openssl/chacha.h> + +#include <assert.h> +#include <string.h> + +#include <openssl/cpu.h> + +#include "../internal.h" + + +#define U8TO32_LITTLE(p) \ + (((uint32_t)((p)[0])) | ((uint32_t)((p)[1]) << 8) | \ + ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24)) + +#if !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \ + defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) + +/* ChaCha20_ctr32 is defined in asm/chacha-*.pl. 
*/ +void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]); + +void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len, + const uint8_t key[32], const uint8_t nonce[12], + uint32_t counter) { + assert(!buffers_alias(out, in_len, in, in_len) || in == out); + + uint32_t counter_nonce[4]; counter_nonce[0] = counter; + counter_nonce[1] = U8TO32_LITTLE(nonce + 0); + counter_nonce[2] = U8TO32_LITTLE(nonce + 4); + counter_nonce[3] = U8TO32_LITTLE(nonce + 8); + + const uint32_t *key_ptr = (const uint32_t *)key; +#if !defined(OPENSSL_X86) && !defined(OPENSSL_X86_64) + /* The assembly expects the key to be four-byte aligned. */ + uint32_t key_u32[8]; + if ((((uintptr_t)key) & 3) != 0) { + key_u32[0] = U8TO32_LITTLE(key + 0); + key_u32[1] = U8TO32_LITTLE(key + 4); + key_u32[2] = U8TO32_LITTLE(key + 8); + key_u32[3] = U8TO32_LITTLE(key + 12); + key_u32[4] = U8TO32_LITTLE(key + 16); + key_u32[5] = U8TO32_LITTLE(key + 20); + key_u32[6] = U8TO32_LITTLE(key + 24); + key_u32[7] = U8TO32_LITTLE(key + 28); + + key_ptr = key_u32; + } +#endif + + ChaCha20_ctr32(out, in, in_len, key_ptr, counter_nonce); +} + +#else + +/* sigma contains the ChaCha constants, which happen to be an ASCII string. */ +static const uint8_t sigma[16] = { 'e', 'x', 'p', 'a', 'n', 'd', ' ', '3', + '2', '-', 'b', 'y', 't', 'e', ' ', 'k' }; + +#define ROTATE(v, n) (((v) << (n)) | ((v) >> (32 - (n)))) + +#define U32TO8_LITTLE(p, v) \ + { \ + (p)[0] = (v >> 0) & 0xff; \ + (p)[1] = (v >> 8) & 0xff; \ + (p)[2] = (v >> 16) & 0xff; \ + (p)[3] = (v >> 24) & 0xff; \ + } + +/* QUARTERROUND updates a, b, c, d with a ChaCha "quarter" round. */ +#define QUARTERROUND(a, b, c, d) \ + x[a] += x[b]; x[d] = ROTATE(x[d] ^ x[a], 16); \ + x[c] += x[d]; x[b] = ROTATE(x[b] ^ x[c], 12); \ + x[a] += x[b]; x[d] = ROTATE(x[d] ^ x[a], 8); \ + x[c] += x[d]; x[b] = ROTATE(x[b] ^ x[c], 7); + +/* chacha_core performs 20 rounds of ChaCha on the input words in + * |input| and writes the 64 output bytes to |output|. 
*/ +static void chacha_core(uint8_t output[64], const uint32_t input[16]) { + uint32_t x[16]; + int i; + + OPENSSL_memcpy(x, input, sizeof(uint32_t) * 16); + for (i = 20; i > 0; i -= 2) { + QUARTERROUND(0, 4, 8, 12) + QUARTERROUND(1, 5, 9, 13) + QUARTERROUND(2, 6, 10, 14) + QUARTERROUND(3, 7, 11, 15) + QUARTERROUND(0, 5, 10, 15) + QUARTERROUND(1, 6, 11, 12) + QUARTERROUND(2, 7, 8, 13) + QUARTERROUND(3, 4, 9, 14) + } + + for (i = 0; i < 16; ++i) { + x[i] += input[i]; + } + for (i = 0; i < 16; ++i) { + U32TO8_LITTLE(output + 4 * i, x[i]); + } +} + +void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len, + const uint8_t key[32], const uint8_t nonce[12], + uint32_t counter) { + assert(!buffers_alias(out, in_len, in, in_len) || in == out); + + uint32_t input[16]; + uint8_t buf[64]; + size_t todo, i; + + input[0] = U8TO32_LITTLE(sigma + 0); + input[1] = U8TO32_LITTLE(sigma + 4); + input[2] = U8TO32_LITTLE(sigma + 8); + input[3] = U8TO32_LITTLE(sigma + 12); + + input[4] = U8TO32_LITTLE(key + 0); + input[5] = U8TO32_LITTLE(key + 4); + input[6] = U8TO32_LITTLE(key + 8); + input[7] = U8TO32_LITTLE(key + 12); + + input[8] = U8TO32_LITTLE(key + 16); + input[9] = U8TO32_LITTLE(key + 20); + input[10] = U8TO32_LITTLE(key + 24); + input[11] = U8TO32_LITTLE(key + 28); + + input[12] = counter; + input[13] = U8TO32_LITTLE(nonce + 0); + input[14] = U8TO32_LITTLE(nonce + 4); + input[15] = U8TO32_LITTLE(nonce + 8); + + while (in_len > 0) { + todo = sizeof(buf); + if (in_len < todo) { + todo = in_len; + } + + chacha_core(buf, input); + for (i = 0; i < todo; i++) { + out[i] = in[i] ^ buf[i]; + } + + out += todo; + in += todo; + in_len -= todo; + + input[12]++; + } +} + +#endif diff --git a/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/chacha_test.cc b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/chacha_test.cc new file mode 100644 index 000000000..a40653fa6 --- /dev/null +++ b/roms/edk2/CryptoPkg/Library/OpensslLib/openssl/boringssl/crypto/chacha/chacha_test.cc @@ -0,0 +1,236 @@ +/* Copyright (c) 2016, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ + +#include <stdio.h> +#include <stdint.h> +#include <string.h> + +#include <memory> + +#include <gtest/gtest.h> + +#include <openssl/crypto.h> +#include <openssl/chacha.h> + +#include "../internal.h" +#include "../test/test_util.h" + + +static const uint8_t kKey[32] = { + 0x98, 0xbe, 0xf1, 0x46, 0x9b, 0xe7, 0x26, 0x98, 0x37, 0xa4, 0x5b, + 0xfb, 0xc9, 0x2a, 0x5a, 0x6a, 0xc7, 0x62, 0x50, 0x7c, 0xf9, 0x64, + 0x43, 0xbf, 0x33, 0xb9, 0x6b, 0x1b, 0xd4, 0xc6, 0xf8, 0xf6, +}; + +static const uint8_t kNonce[12] = { + 0x44, 0xe7, 0x92, 0xd6, 0x33, 0x35, 0xab, 0xb1, 0x58, 0x2e, 0x92, 0x53, +}; + +static uint32_t kCounter = 42; + +static const uint8_t kInput[] = { + 0x58, 0x28, 0xd5, 0x30, 0x36, 0x2c, 0x60, 0x55, 0x29, 0xf8, 0xe1, 0x8c, + 0xae, 0x15, 0x15, 0x26, 0xf2, 0x3a, 0x73, 0xa0, 0xf3, 0x12, 0xa3, 0x88, + 0x5f, 0x2b, 0x74, 0x23, 0x3d, 0xc9, 0x05, 0x23, 0xc6, 0x54, 0x49, 0x1e, + 0x44, 0x88, 0x14, 0xd9, 0xda, 0x37, 0x15, 0xdc, 0xb7, 0xe4, 0x23, 0xb3, + 0x9d, 0x7e, 0x16, 0x68, 0x35, 0xfc, 0x02, 0x6d, 0xcc, 0x8a, 0xe5, 0xdd, + 0x5f, 0xe4, 0xd2, 0x56, 0x6f, 0x12, 0x9c, 0x9c, 0x7d, 0x6a, 0x38, 0x48, + 0xbd, 0xdf, 0xd9, 0xac, 0x1b, 0xa2, 0x4d, 0xc5, 0x43, 0x04, 0x3c, 0xd7, + 0x99, 0xe1, 0xa7, 0x13, 0x9c, 0x51, 0xc2, 0x6d, 0xf9, 0xcf, 0x07, 0x3b, + 0xe4, 0xbf, 0x93, 0xa3, 0xa9, 0xb4, 0xc5, 0xf0, 0x1a, 0xe4, 0x8d, 0x5f, + 0xc6, 0xc4, 0x7c, 0x69, 0x7a, 0xde, 0x1a, 0xc1, 0xc9, 0xcf, 0xc2, 0x4e, + 0x7a, 0x25, 0x2c, 0x32, 0xe9, 0x17, 0xba, 0x68, 0xf1, 0x37, 0x5d, 0x62, + 0x84, 0x46, 0xf5, 0x80, 0x7f, 0x1a, 0x71, 0xf7, 0xbe, 0x72, 0x4b, 0xb8, + 0x1c, 0xfe, 0x3e, 0xbd, 0xae, 0x0d, 0x73, 0x0d, 0x87, 0x4a, 0x31, 0xc3, + 0x3d, 0x46, 0x6f, 0xb3, 0xd7, 0x6b, 0xe3, 0xb8, 0x70, 0x17, 0x8e, 0x7a, + 0x6a, 0x0e, 0xbf, 0xa8, 0xbc, 0x2b, 0xdb, 0xfa, 0x4f, 0xb6, 0x26, 0x20, + 0xee, 0x63, 0xf0, 0x6d, 0x26, 0xac, 0x6a, 0x18, 0x37, 0x6e, 0x59, 0x81, + 0xd1, 0x60, 0xe6, 0x40, 0xd5, 0x6d, 0x68, 0xba, 0x8b, 0x65, 0x4a, 0xf9, + 0xf1, 0xae, 0x56, 0x24, 0x8f, 0xe3, 0x8e, 0xe7, 0x7e, 0x6f, 0xcf, 0x92, + 0xdf, 0xa9, 0x75, 0x3a, 0xd6, 0x2e, 0x1c, 0xaf, 0xf2, 0xd6, 0x8b, 0x39, + 0xad, 0xd2, 0x5d, 0xfb, 0xd7, 0xdf, 0x05, 0x57, 0x0d, 0xf7, 0xf6, 0x8f, + 0x2d, 0x14, 0xb0, 0x4e, 0x1a, 0x3c, 0x77, 0x04, 0xcd, 0x3c, 0x5c, 0x58, + 0x52, 0x10, 0x6f, 0xcf, 0x5c, 0x03, 0xc8, 0x5f, 0x85, 0x2b, 0x05, 0x82, + 0x60, 0xda, 0xcc, 0xcd, 0xd6, 0x88, 0xbf, 0xc0, 0x10, 0xb3, 0x6f, 0x54, + 0x54, 0x42, 0xbc, 0x4b, 0x77, 0x21, 0x4d, 0xee, 0x87, 0x45, 0x06, 0x4c, + 0x60, 0x38, 0xd2, 0x7e, 0x1d, 0x30, 0x6c, 0x55, 0xf0, 0x38, 0x80, 0x1c, + 0xde, 0x3d, 0xea, 0x68, 0x3e, 0xf6, 0x3e, 0x59, 0xcf, 0x0d, 0x08, 0xae, + 0x8c, 0x02, 0x0b, 0xc1, 0x72, 0x6a, 0xb4, 0x6d, 0xf3, 0xf7, 0xb3, 0xef, + 0x3a, 0xb1, 0x06, 0xf2, 0xf4, 0xd6, 0x69, 0x7b, 0x3e, 0xa2, 0x16, 0x31, + 0x31, 0x79, 0xb6, 0x33, 0xa9, 0xca, 0x8a, 0xa8, 0xbe, 0xf3, 0xe9, 0x38, + 0x28, 0xd1, 0xe1, 0x3b, 0x4e, 0x2e, 0x47, 0x35, 0xa4, 0x61, 0x14, 0x1e, + 0x42, 0x2c, 0x49, 0x55, 0xea, 0xe3, 0xb3, 0xce, 0x39, 0xd3, 0xb3, 0xef, + 0x4a, 0x4d, 0x78, 0x49, 0xbd, 0xf6, 0x7c, 0x0a, 0x2c, 0xd3, 0x26, 0xcb, + 0xd9, 0x6a, 0xad, 0x63, 0x93, 0xa7, 0x29, 0x92, 0xdc, 0x1f, 0xaf, 0x61, + 0x82, 0x80, 0x74, 0xb2, 0x9c, 0x4a, 0x86, 0x73, 0x50, 0xd8, 0xd1, 0xff, + 0xee, 0x1a, 0xe2, 0xdd, 0xa2, 0x61, 0xbd, 0x10, 0xc3, 0x5f, 0x67, 0x9f, + 0x29, 0xe4, 0xd3, 0x70, 0xe5, 0x67, 0x3a, 0xd2, 0x20, 0x00, 0xcc, 0x25, + 0x15, 0x96, 0x54, 0x45, 0x85, 0xed, 0x82, 0x88, 0x3b, 0x9f, 0x3b, 0xc3, + 0x04, 0xd4, 0x23, 0xb1, 0x0d, 0xdc, 0xc8, 0x26, 0x9d, 0x28, 0xb3, 0x25, + 0x4d, 0x52, 0xe5, 0x33, 0xf3, 0xed, 0x2c, 0xb8, 0x1a, 0xcf, 0xc3, 0x52, + 
0xb4, 0x2f, 0xc7, 0x79, 0x96, 0x14, 0x7d, 0x72, 0x27, 0x72, 0x85, 0xea, + 0x6d, 0x41, 0xa0, 0x22, 0x13, 0x6d, 0x06, 0x83, 0xa4, 0xdd, 0x0f, 0x69, + 0xd2, 0x01, 0xcd, 0xc6, 0xb8, 0x64, 0x5c, 0x2c, 0x79, 0xd1, 0xc7, 0xd3, + 0x31, 0xdb, 0x2c, 0xff, 0xda, 0xd0, 0x69, 0x31, 0xad, 0x83, 0x5f, 0xed, + 0x6a, 0x97, 0xe4, 0x00, 0x43, 0xb0, 0x2e, 0x97, 0xae, 0x00, 0x5f, 0x5c, + 0xb9, 0xe8, 0x39, 0x80, 0x10, 0xca, 0x0c, 0xfa, 0xf0, 0xb5, 0xcd, 0xaa, + 0x27, 0x11, 0x60, 0xd9, 0x21, 0x86, 0x93, 0x91, 0x9f, 0x2d, 0x1a, 0x8e, + 0xde, 0x0b, 0xb5, 0xcb, 0x05, 0x24, 0x30, 0x45, 0x4d, 0x11, 0x75, 0xfd, + 0xe5, 0xa0, 0xa9, 0x4e, 0x3a, 0x8c, 0x3b, 0x52, 0x5a, 0x37, 0x18, 0x05, + 0x4a, 0x7a, 0x09, 0x6a, 0xe6, 0xd5, 0xa9, 0xa6, 0x71, 0x47, 0x4c, 0x50, + 0xe1, 0x3e, 0x8a, 0x21, 0x2b, 0x4f, 0x0e, 0xe3, 0xcb, 0x72, 0xc5, 0x28, + 0x3e, 0x5a, 0x33, 0xec, 0x48, 0x92, 0x2e, 0xa1, 0x24, 0x57, 0x09, 0x0f, + 0x01, 0x85, 0x3b, 0x34, 0x39, 0x7e, 0xc7, 0x90, 0x62, 0xe2, 0xdc, 0x5d, + 0x0a, 0x2c, 0x51, 0x26, 0x95, 0x3a, 0x95, 0x92, 0xa5, 0x39, 0x8f, 0x0c, + 0x83, 0x0b, 0x9d, 0x38, 0xab, 0x98, 0x2a, 0xc4, 0x01, 0xc4, 0x0d, 0x77, + 0x13, 0xcb, 0xca, 0xf1, 0x28, 0x31, 0x52, 0x75, 0x27, 0x2c, 0xf0, 0x04, + 0x86, 0xc8, 0xf3, 0x3d, 0xf2, 0x9d, 0x8f, 0x55, 0x52, 0x40, 0x3f, 0xaa, + 0x22, 0x7f, 0xe7, 0x69, 0x3b, 0xee, 0x44, 0x09, 0xde, 0xff, 0xb0, 0x69, + 0x3a, 0xae, 0x74, 0xe9, 0x9d, 0x33, 0xae, 0x8b, 0x6d, 0x60, 0x04, 0xff, + 0x53, 0x3f, 0x88, 0xe9, 0x63, 0x9b, 0xb1, 0x6d, 0x2c, 0x22, 0x15, 0x5a, + 0x15, 0xd9, 0xe5, 0xcb, 0x03, 0x78, 0x3c, 0xca, 0x59, 0x8c, 0xc8, 0xc2, + 0x86, 0xff, 0xd2, 0x79, 0xd6, 0xc6, 0xec, 0x5b, 0xbb, 0xa0, 0xae, 0x01, + 0x20, 0x09, 0x2e, 0x38, 0x5d, 0xda, 0x5d, 0xe0, 0x59, 0x4e, 0xe5, 0x8b, + 0x84, 0x8f, 0xb6, 0xe0, 0x56, 0x9f, 0x21, 0xa1, 0xcf, 0xb2, 0x0f, 0x2c, + 0x93, 0xf8, 0xcf, 0x37, 0xc1, 0x9f, 0x32, 0x98, 0x21, 0x65, 0x52, 0x66, + 0x6e, 0xd3, 0x71, 0x98, 0x55, 0xb9, 0x46, 0x9f, 0x1a, 0x35, 0xc4, 0x47, + 0x69, 0x62, 0x70, 0x4b, 0x77, 0x9e, 0xe4, 0x21, 0xe6, 0x32, 0x5a, 0x26, + 0x05, 0xba, 0x57, 0x53, 0xd7, 0x9b, 0x55, 0x3c, 0xbb, 0x53, 0x79, 0x60, + 0x9c, 0xc8, 0x4d, 0xf7, 0xf5, 0x1d, 0x54, 0x02, 0x91, 0x68, 0x0e, 0xaa, + 0xca, 0x5a, 0x78, 0x0c, 0x28, 0x9a, 0xc3, 0xac, 0x49, 0xc0, 0xf4, 0x85, + 0xee, 0x59, 0x76, 0x7e, 0x28, 0x4e, 0xf1, 0x5c, 0x63, 0xf7, 0xce, 0x0e, + 0x2c, 0x21, 0xa0, 0x58, 0xe9, 0x01, 0xfd, 0xeb, 0xd1, 0xaf, 0xe6, 0xef, + 0x93, 0xb3, 0x95, 0x51, 0x60, 0xa2, 0x74, 0x40, 0x15, 0xe5, 0xf4, 0x0a, + 0xca, 0x6d, 0x9a, 0x37, 0x42, 0x4d, 0x5a, 0x58, 0x49, 0x0f, 0xe9, 0x02, + 0xfc, 0x77, 0xd8, 0x59, 0xde, 0xdd, 0xad, 0x4b, 0x99, 0x2e, 0x64, 0x73, + 0xad, 0x42, 0x2f, 0xf3, 0x2c, 0x0d, 0x49, 0xe4, 0x2e, 0x6c, 0xa4, 0x73, + 0x75, 0x18, 0x14, 0x85, 0xbb, 0x64, 0xb4, 0xa1, 0xb0, 0x6e, 0x01, 0xc0, + 0xcf, 0x17, 0x9c, 0xc5, 0x28, 0xc3, 0x2d, 0x6c, 0x17, 0x2a, 0x3d, 0x06, + 0x5c, 0xf3, 0xb4, 0x49, 0x75, 0xad, 0x17, 0x69, 0xd4, 0xca, 0x65, 0xae, + 0x44, 0x71, 0xa5, 0xf6, 0x0d, 0x0f, 0x8e, 0x37, 0xc7, 0x43, 0xce, 0x6b, + 0x08, 0xe9, 0xd1, 0x34, 0x48, 0x8f, 0xc9, 0xfc, 0xf3, 0x5d, 0x2d, 0xec, + 0x62, 0xd3, 0xf0, 0xb3, 0xfe, 0x2e, 0x40, 0x55, 0x76, 0x54, 0xc7, 0xb4, + 0x61, 0x16, 0xcc, 0x7c, 0x1c, 0x19, 0x24, 0xe6, 0x4d, 0xd4, 0xc3, 0x77, + 0x67, 0x1f, 0x3c, 0x74, 0x79, 0xa1, 0xf8, 0x85, 0x88, 0x1d, 0x6f, 0xa4, + 0x7e, 0x2c, 0x21, 0x9f, 0x49, 0xf5, 0xaa, 0x4e, 0xf3, 0x4a, 0xfa, 0x9d, + 0xbe, 0xf6, 0xce, 0xda, 0xb5, 0xab, 0x39, 0xbd, 0x16, 0x41, 0xa9, 0x4a, + 0xac, 0x09, 0x01, 0xca, +}; +static const uint8_t kOutput[] = { + 0x54, 0x30, 0x6a, 0x13, 0xda, 0x59, 0x6b, 0x6d, 0x59, 0x49, 0xc8, 0xc5, + 0xab, 
0x26, 0xd4, 0x8a, 0xad, 0xc0, 0x3d, 0xaf, 0x14, 0xb9, 0x15, 0xb8, + 0xca, 0xdf, 0x17, 0xa7, 0x03, 0xd3, 0xc5, 0x06, 0x01, 0xef, 0x21, 0xdd, + 0xa3, 0x0b, 0x9e, 0x48, 0xb8, 0x5e, 0x0b, 0x87, 0x9f, 0x95, 0x23, 0x68, + 0x85, 0x69, 0xd2, 0x5d, 0xaf, 0x57, 0xe9, 0x27, 0x11, 0x3d, 0x49, 0xfa, + 0xf1, 0x08, 0xcc, 0x15, 0xec, 0x1d, 0x19, 0x16, 0x12, 0x9b, 0xc8, 0x66, + 0x1f, 0xfa, 0x2c, 0x93, 0xf4, 0x99, 0x11, 0x27, 0x31, 0x0e, 0xd8, 0x46, + 0x47, 0x40, 0x11, 0x70, 0x01, 0xca, 0xe8, 0x5b, 0xc5, 0x91, 0xc8, 0x3a, + 0xdc, 0xaa, 0xf3, 0x4b, 0x80, 0xe5, 0xbc, 0x03, 0xd0, 0x89, 0x72, 0xbc, + 0xce, 0x2a, 0x76, 0x0c, 0xf5, 0xda, 0x4c, 0x10, 0x06, 0x35, 0x41, 0xb1, + 0xe6, 0xb4, 0xaa, 0x7a, 0xef, 0xf0, 0x62, 0x4a, 0xc5, 0x9f, 0x2c, 0xaf, + 0xb8, 0x2f, 0xd9, 0xd1, 0x01, 0x7a, 0x36, 0x2f, 0x3e, 0x83, 0xa5, 0xeb, + 0x81, 0x70, 0xa0, 0x57, 0x17, 0x46, 0xea, 0x9e, 0xcb, 0x0e, 0x74, 0xd3, + 0x44, 0x57, 0x1d, 0x40, 0x06, 0xf8, 0xb7, 0xcb, 0x5f, 0xf4, 0x79, 0xbd, + 0x11, 0x19, 0xd6, 0xee, 0xf8, 0xb0, 0xaa, 0xdd, 0x00, 0x62, 0xad, 0x3b, + 0x88, 0x9a, 0x88, 0x5b, 0x1b, 0x07, 0xc9, 0xae, 0x9e, 0xa6, 0x94, 0xe5, + 0x55, 0xdb, 0x45, 0x23, 0xb9, 0x2c, 0xcd, 0x29, 0xd3, 0x54, 0xc3, 0x88, + 0x1e, 0x5f, 0x52, 0xf2, 0x09, 0x00, 0x26, 0x26, 0x1a, 0xed, 0xf5, 0xc2, + 0xa9, 0x7d, 0xf9, 0x21, 0x5a, 0xaf, 0x6d, 0xab, 0x8e, 0x16, 0x84, 0x96, + 0xb5, 0x4f, 0xcf, 0x1e, 0xa3, 0xaf, 0x08, 0x9f, 0x79, 0x86, 0xc3, 0xbe, + 0x0c, 0x70, 0xcb, 0x8f, 0xf3, 0xc5, 0xf8, 0xe8, 0x4b, 0x21, 0x7d, 0x18, + 0xa9, 0xed, 0x8b, 0xfb, 0x6b, 0x5a, 0x6f, 0x26, 0x0b, 0x56, 0x04, 0x7c, + 0xfe, 0x0e, 0x1e, 0xc1, 0x3f, 0x82, 0xc5, 0x73, 0xbd, 0x53, 0x0c, 0xf0, + 0xe2, 0xc9, 0xf3, 0x3d, 0x1b, 0x6d, 0xba, 0x70, 0xc1, 0x6d, 0xb6, 0x00, + 0x28, 0xe1, 0xc4, 0x78, 0x62, 0x04, 0xda, 0x23, 0x86, 0xc3, 0xda, 0x74, + 0x3d, 0x7c, 0xd6, 0x76, 0x29, 0xb2, 0x27, 0x2e, 0xb2, 0x35, 0x42, 0x60, + 0x82, 0xcf, 0x30, 0x2c, 0x59, 0xe4, 0xe3, 0xd0, 0x74, 0x1f, 0x58, 0xe8, + 0xda, 0x47, 0x45, 0x73, 0x1c, 0x05, 0x93, 0xae, 0x75, 0xbe, 0x1f, 0x81, + 0xd8, 0xb7, 0xb3, 0xff, 0xfc, 0x8b, 0x52, 0x9e, 0xed, 0x8b, 0x37, 0x9f, + 0xe0, 0xb8, 0xa2, 0x66, 0xe1, 0x6a, 0xc5, 0x1f, 0x1d, 0xf0, 0xde, 0x3f, + 0x3d, 0xb0, 0x28, 0xf3, 0xaa, 0x4e, 0x4d, 0x31, 0xb0, 0x26, 0x79, 0x2b, + 0x08, 0x0f, 0xe9, 0x2f, 0x79, 0xb3, 0xc8, 0xdd, 0xa7, 0x89, 0xa8, 0xa8, + 0x1d, 0x59, 0x0e, 0x4f, 0x1e, 0x93, 0x1f, 0x70, 0x7f, 0x4e, 0x7e, 0xfe, + 0xb8, 0xca, 0x63, 0xe0, 0xa6, 0x05, 0xcc, 0xd7, 0xde, 0x2a, 0x49, 0x31, + 0x78, 0x5c, 0x5f, 0x44, 0xb2, 0x9b, 0x91, 0x99, 0x14, 0x29, 0x63, 0x09, + 0x12, 0xdd, 0x02, 0xd9, 0x7b, 0xe9, 0xf5, 0x12, 0x07, 0xd0, 0xe7, 0xe6, + 0xe8, 0xdd, 0xda, 0xa4, 0x73, 0xc4, 0x8e, 0xbd, 0x7b, 0xb7, 0xbb, 0xcb, + 0x83, 0x2f, 0x43, 0xf6, 0x1c, 0x50, 0xae, 0x9b, 0x2e, 0x52, 0x80, 0x18, + 0x85, 0xa8, 0x23, 0x52, 0x7a, 0x6a, 0xf7, 0x42, 0x36, 0xca, 0x91, 0x5a, + 0x3d, 0x2a, 0xa0, 0x35, 0x7d, 0x70, 0xfc, 0x4c, 0x18, 0x7c, 0x57, 0x72, + 0xcf, 0x9b, 0x29, 0xd6, 0xd0, 0xb4, 0xd7, 0xe6, 0x89, 0x70, 0x69, 0x22, + 0x5e, 0x45, 0x09, 0x4d, 0x49, 0x87, 0x84, 0x5f, 0x8a, 0x5f, 0xe4, 0x15, + 0xd3, 0xe3, 0x72, 0xaf, 0xb2, 0x30, 0x9c, 0xc1, 0xff, 0x8e, 0x6d, 0x2a, + 0x76, 0x9e, 0x08, 0x03, 0x7e, 0xe0, 0xc3, 0xc2, 0x97, 0x06, 0x6b, 0x33, + 0x2b, 0x08, 0xe3, 0xd5, 0x0b, 0xd8, 0x32, 0x67, 0x61, 0x10, 0xed, 0x6b, + 0xed, 0x50, 0xef, 0xd7, 0x1c, 0x1b, 0xe0, 0x6d, 0xa1, 0x64, 0x19, 0x34, + 0x2f, 0xe4, 0xe8, 0x54, 0xbf, 0x84, 0x0e, 0xdf, 0x0e, 0x8b, 0xd8, 0xdd, + 0x77, 0x96, 0xb8, 0x54, 0xab, 0xf2, 0x95, 0x59, 0x0d, 0x0d, 0x0a, 0x15, + 0x6e, 0x01, 0xf2, 0x24, 0xab, 0xa0, 0xd8, 0xdf, 0x38, 0xea, 0x97, 0x58, + 0x76, 
0x88, 0xbe, 0xaf, 0x45, 0xe3, 0x56, 0x4f, 0x68, 0xe8, 0x4b, 0xe7, + 0x2b, 0x22, 0x18, 0x96, 0x82, 0x89, 0x25, 0x34, 0xd1, 0xdd, 0x08, 0xea, + 0x7e, 0x21, 0xef, 0x57, 0x55, 0x43, 0xf7, 0xfa, 0xca, 0x1c, 0xde, 0x99, + 0x2e, 0x8b, 0xd8, 0xc3, 0xcf, 0x89, 0x4d, 0xfc, 0x3b, 0x7d, 0x4a, 0xc9, + 0x99, 0xc4, 0x31, 0xb6, 0x7a, 0xae, 0xf8, 0x49, 0xb2, 0x46, 0xc1, 0x60, + 0x05, 0x75, 0xf3, 0x3d, 0xf2, 0xc9, 0x84, 0xa4, 0xb9, 0x8a, 0x87, 0x2a, + 0x87, 0x5c, 0x0a, 0xbc, 0x51, 0x7d, 0x9a, 0xf5, 0xc9, 0x24, 0x2d, 0x5e, + 0xe6, 0xc6, 0xe3, 0xcd, 0x7e, 0xe4, 0xaf, 0x8a, 0x6c, 0x00, 0x04, 0xc8, + 0xd7, 0xa5, 0xad, 0xfa, 0xb2, 0x08, 0x4a, 0x26, 0x9b, 0x7c, 0xd0, 0xc6, + 0x13, 0xb1, 0xb9, 0x65, 0x3f, 0x70, 0x30, 0xf9, 0x98, 0x9d, 0x87, 0x99, + 0x57, 0x71, 0x3e, 0xb1, 0xc3, 0x24, 0xf0, 0xa6, 0xa2, 0x60, 0x9d, 0x66, + 0xd2, 0x5f, 0xae, 0xe3, 0x94, 0x87, 0xea, 0xd1, 0xea, 0x0d, 0x2a, 0x77, + 0xef, 0x31, 0xcc, 0xeb, 0xf9, 0x0c, 0xdc, 0x9c, 0x12, 0x80, 0xbb, 0xb0, + 0x8e, 0xab, 0x9a, 0x04, 0xcd, 0x4b, 0x95, 0x4f, 0x7a, 0x0b, 0x53, 0x7c, + 0x16, 0xcc, 0x0e, 0xb1, 0x73, 0x10, 0xdd, 0xaa, 0x76, 0x94, 0x90, 0xd9, + 0x8b, 0x66, 0x41, 0x31, 0xed, 0x8c, 0x7d, 0x74, 0xc4, 0x33, 0xfa, 0xc3, + 0x43, 0x8d, 0x10, 0xbc, 0x84, 0x4d, 0x0e, 0x95, 0x32, 0xdf, 0x17, 0x43, + 0x6d, 0xd2, 0x5e, 0x12, 0xb9, 0xed, 0x33, 0xd9, 0x97, 0x6f, 0x4a, 0xcd, + 0xc3, 0xcd, 0x81, 0x34, 0xbe, 0x7e, 0xa2, 0xd0, 0xa7, 0x91, 0x5d, 0x90, + 0xf6, 0x5e, 0x4a, 0x25, 0x0f, 0xcc, 0x24, 0xeb, 0xe1, 0xe4, 0x62, 0x6c, + 0x8f, 0x45, 0x36, 0x97, 0x5d, 0xda, 0x20, 0x2b, 0x86, 0x00, 0x8c, 0x94, + 0xa9, 0x6a, 0x69, 0xb2, 0xe9, 0xbb, 0x82, 0x8e, 0x41, 0x95, 0xb4, 0xb7, + 0xf1, 0x55, 0x52, 0x30, 0x39, 0x48, 0xb3, 0x25, 0x82, 0xa9, 0x10, 0x27, + 0x89, 0xb5, 0xe5, 0x1f, 0xab, 0x72, 0x3c, 0x70, 0x08, 0xce, 0xe6, 0x61, + 0xbf, 0x19, 0xc8, 0x90, 0x2b, 0x29, 0x30, 0x3e, 0xb8, 0x4c, 0x33, 0xf0, + 0xf0, 0x15, 0x2e, 0xb7, 0x25, 0xca, 0x99, 0x4b, 0x6f, 0x4b, 0x41, 0x50, + 0xee, 0x56, 0x99, 0xcf, 0x2b, 0xa4, 0xc4, 0x7c, 0x5c, 0xa6, 0xd4, 0x67, + 0x04, 0x5c, 0x5d, 0x5f, 0x26, 0x9e, 0x0f, 0xe2, 0x58, 0x68, 0x4c, 0x30, + 0xcd, 0xef, 0x46, 0xdb, 0x37, 0x6f, 0xbb, 0xc4, 0x80, 0xca, 0x8a, 0x54, + 0x5d, 0x71, 0x9d, 0x0c, 0xe8, 0xb8, 0x2c, 0x10, 0x90, 0x44, 0xa4, 0x88, + 0x3f, 0xbc, 0x15, 0x3c, 0xd2, 0xca, 0x0e, 0xc3, 0xe4, 0x6e, 0xef, 0xb0, + 0xcb, 0xfd, 0x61, 0x7c, 0x27, 0xf2, 0x25, 0xea, 0x71, 0x6d, 0xf7, 0x49, + 0x9c, 0x81, 0x27, 0xf0, 0x61, 0x33, 0xcf, 0x55, 0x68, 0xd3, 0x73, 0xa4, + 0xed, 0x35, 0x65, 0x2a, 0xf2, 0x3e, 0xcf, 0x90, 0x98, 0x54, 0x6d, 0x95, + 0x6a, 0x0c, 0x9c, 0x24, 0x0e, 0xb4, 0xb7, 0x9b, 0x8d, 0x6e, 0x1c, 0xbc, + 0xeb, 0x17, 0x10, 0x86, 0xda, 0x91, 0x6d, 0x89, 0x4c, 0xeb, 0xf5, 0x50, + 0x8f, 0x40, 0xcf, 0x4a, +}; + +static_assert(sizeof(kInput) == sizeof(kOutput), + "Input and output lengths don't match."); + +TEST(ChaChaTest, TestVector) { + // Run the test with the test vector at all lengths. + for (size_t len = 0; len <= sizeof(kInput); len++) { + SCOPED_TRACE(len); + + std::unique_ptr<uint8_t[]> buf(new uint8_t[len]); + CRYPTO_chacha_20(buf.get(), kInput, len, kKey, kNonce, kCounter); + EXPECT_EQ(Bytes(kOutput, len), Bytes(buf.get(), len)); + + // Test the in-place version. + OPENSSL_memcpy(buf.get(), kInput, len); + CRYPTO_chacha_20(buf.get(), buf.get(), len, kKey, kNonce, kCounter); + EXPECT_EQ(Bytes(kOutput, len), Bytes(buf.get(), len)); + } +} |
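
For reference, the exported entry point defined above, CRYPTO_chacha_20(), is a one-shot stream-cipher call: the same invocation encrypts and decrypts, since it only XORs the generated key stream into the buffer. The standalone C sketch below shows such a round trip; it assumes BoringSSL's <openssl/chacha.h> header and library are available to build against, and the all-zero key/nonce and message bytes are illustrative placeholders only.

/* Round-trip sketch for CRYPTO_chacha_20(); build against BoringSSL.
 * The key, nonce and message contents here are placeholders only. */
#include <assert.h>
#include <stdint.h>
#include <string.h>

#include <openssl/chacha.h>

int main(void) {
  static const uint8_t key[32] = {0};   /* use a randomly generated key */
  static const uint8_t nonce[12] = {0}; /* never reuse a (key, nonce) pair */
  const uint32_t counter = 0;           /* initial block counter */

  uint8_t msg[64];
  memset(msg, 0, sizeof(msg));
  memcpy(msg, "attack at dawn", 14);

  uint8_t ct[sizeof(msg)];
  CRYPTO_chacha_20(ct, msg, sizeof(msg), key, nonce, counter);  /* encrypt */

  uint8_t pt[sizeof(ct)];
  CRYPTO_chacha_20(pt, ct, sizeof(ct), key, nonce, counter);    /* decrypt */

  assert(memcmp(pt, msg, sizeof(msg)) == 0);
  return 0;
}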