From fcf26d1d2024758e485e061c634bc5866671b4f9 Mon Sep 17 00:00:00 2001 From: aoeu Date: Wed, 10 Mar 2021 10:42:33 +0000 Subject: [PATCH] Bug 1613235 - Add POWER ChaCha20 stream cipher vector acceleration. r=bbeurdouche Differential Revision: https://phabricator.services.mozilla.com/D107220 --HG-- extra : moz-landing-system : lando --- lib/freebl/Makefile | 3 + lib/freebl/chacha20-ppc64le.S | 546 +++++++++++++++++++++++++++ lib/freebl/chacha20poly1305-ppc.c | 588 ++++++++++++++++++++++++++++++ lib/freebl/chacha20poly1305.c | 47 +++ lib/freebl/freebl.gyp | 9 + lib/freebl/freebl_base.gypi | 6 + 6 files changed, 1199 insertions(+) create mode 100644 lib/freebl/chacha20-ppc64le.S create mode 100644 lib/freebl/chacha20poly1305-ppc.c diff --git a/lib/freebl/Makefile b/lib/freebl/Makefile index 269e34c5ce..10654360c6 100644 --- a/lib/freebl/Makefile +++ b/lib/freebl/Makefile @@ -298,6 +298,8 @@ ifdef USE_64 PPC_ABI := $(shell $(CC) -dM -E - < /dev/null | awk '$$2 == "_CALL_ELF" {print $$3}') ifeq ($(PPC_ABI),2) ASFILES += sha512-p8.s + EXTRA_SRCS += chacha20poly1305-ppc.c + ASFILES += chacha20-ppc64le.s endif endif # USE_64 endif # ppc @@ -762,6 +764,7 @@ $(OBJDIR)/$(PROG_PREFIX)gcm$(OBJ_SUFFIX): CFLAGS += -mcrypto -maltivec -mvsx $(OBJDIR)/$(PROG_PREFIX)rijndael$(OBJ_SUFFIX): CFLAGS += -mcrypto -maltivec -mvsx $(OBJDIR)/$(PROG_PREFIX)sha512$(OBJ_SUFFIX): CFLAGS += -mcrypto -maltivec -mvsx \ -funroll-loops -fpeel-loops +$(OBJDIR)/$(PROG_PREFIX)chacha20poly1305-ppc$(OBJ_SUFFIX): CFLAGS += -mcrypto -maltivec -mvsx endif endif diff --git a/lib/freebl/chacha20-ppc64le.S b/lib/freebl/chacha20-ppc64le.S new file mode 100644 index 0000000000..241bef41fc --- /dev/null +++ b/lib/freebl/chacha20-ppc64le.S @@ -0,0 +1,546 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
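+
+# chacha20vsx(len, dst, src, key, nonce, ctr) -- argument registers are the
+# rSIZE/rDST/rSRC/rKEY/rNONCE/rCNTR aliases defined below.
+#
+# Four ChaCha20 blocks are computed in parallel: each vector register
+# v0 - v15 carries one 32-bit state word for four independent blocks,
+# with the lanes differing only in the counter word, so each pass of the
+# main loop emits 256 bytes of keystream and advances the counter by 4.
+# A trailing partial block is generated into the 256-byte lblock scratch
+# area and xored into the output byte by byte, after which lblock is
+# cleared again.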
+ + +# vs0 - vs15 : buffer for xor +# vs32 - vs47 (v0 - v15) : 4 "converted" states +# vs48 - vs51 (v16 - v19) : original state +# vs52 - vs55 (v20 - v23) : "converted" constants +# vs56 (v24) : "converted" counter +# vs57 (v25) : increment for "converted" counter +# vs60 - vs63 (v28 - v31) : constants for rotate left or vpermxor + +#include + +.equ rSIZE, r3 +.equ rDST, r4 +.equ rSRC, r5 +.equ rKEY, r6 +.equ rNONCE, r7 +.equ rCNTR, r8 + +.abiversion 2 +.section ".data" +.align 5 +lblock: .skip 256 +cnts0: .long 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 +cnts1: .long 0x61707865, 0x61707865, 0x61707865, 0x61707865 +cnts2: .long 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e +cnts3: .long 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32 +cnts4: .long 0x6b206574, 0x6b206574, 0x6b206574, 0x6b206574 +st4: .long 0, 0, 0, 0 +cntr: .long 0, 0, 0, 0 +incr: .long 4, 4, 4, 4 +rotl1: .long 0x22330011, 0x66774455, 0xAABB8899, 0xEEFFCCDD +rotl2: .long 12, 12, 12, 12 +rotl3: .long 0x11223300, 0x55667744, 0x99AABB88, 0xDDEEFFCC +rotl4: .long 7, 7, 7, 7 + +.section ".text" +.align 5 +.globl chacha20vsx +.type chacha20vsx, @function +chacha20vsx: + # prologue + addis 2, r12, .TOC.-chacha20vsx@ha + addi 2, 2, .TOC.-chacha20vsx@l + .localentry chacha20vsx, .-chacha20vsx + std r14, -8(sp) + std r15, -16(sp) + std r16, -24(sp) + std r17, -32(sp) + std r18, -40(sp) + std r19, -48(sp) + std r20, -56(sp) + std r21, -64(sp) + std r22, -72(sp) + std r23, -80(sp) + std r24, -88(sp) + std r25, -96(sp) + std r26, -104(sp) + std r27, -112(sp) + std r28, -120(sp) + std r29, -128(sp) + std r30, -136(sp) + std r31, -144(sp) + + addi r14, sp, -160 + + li r16, -16 + li r17, -32 + li r18, -48 + li r19, -64 + li r20, -80 + li r21, -96 + li r22, -112 + li r23, -128 + li r24, -144 + li r25, -160 + li r26, -176 + li r27, -192 + li r28, -208 + + # save f14, f15 + stxvw4x vs14, 0, r14 + stxvw4x vs15, r16, r14 + + # save v20 - v31 + stxvw4x vs52, r17, r14 + stxvw4x vs53, r18, r14 + stxvw4x vs54, r19, r14 + stxvw4x vs55, r20, r14 + stxvw4x vs56, r21, r14 + stxvw4x vs57, r22, r14 + stxvw4x vs58, r23, r14 + stxvw4x vs59, r24, r14 + stxvw4x vs60, r25, r14 + stxvw4x vs61, r26, r14 + stxvw4x vs62, r27, r14 + stxvw4x vs63, r28, r14 + + # offset in src/dst + li r17, 16 + li r18, 32 + li r19, 48 + li r20, 64 + li r21, 80 + li r22, 96 + li r23, 112 + li r24, 128 + li r25, 144 + li r26, 160 + li r27, 176 + li r28, 192 + li r29, 208 + li r30, 224 + li r31, 240 + + # load const's address + addis r14, 2, cnts0@toc@ha + addi r14, r14, cnts0@toc@l + + # save nonce to st4 + lwz r15, 0(rNONCE) + stw r15, 84(r14) + lwz r15, 4(rNONCE) + stw r15, 88(r14) + lwz r15, 8(rNONCE) + stw r15, 92(r14) + + # load state to vectors + lxvw4x vs48, 0, r14 + lxvw4x vs49, 0, rKEY + lxvw4x vs50, r17, rKEY + lxvw4x vs51, r21, r14 + + # load consts for x4 rounds + lxvw4x vs52, r17, r14 + lxvw4x vs53, r18, r14 + lxvw4x vs54, r19, r14 + lxvw4x vs55, r20, r14 + + # counter + stw rCNTR, 96(r14) + addi rCNTR, rCNTR, 1 + stw rCNTR, 100(r14) + addi rCNTR, rCNTR, 1 + stw rCNTR, 104(r14) + addi rCNTR, rCNTR, 1 + stw rCNTR, 108(r14) + lxvw4x vs56, r22, r14 + + # load increment + lxvw4x vs57, r23, r14 + + # load rotl to vectors + lxvw4x vs60, r24, r14 + lxvw4x vs61, r25, r14 + lxvw4x vs62, r26, r14 + lxvw4x vs63, r27, r14 + + # counter for loop = size/256 + li r15, 256 + divdu. 
r16, rSIZE, r15 + beq lastblock + mtctr r16 + +mainloop: + # init 16 vectors (4 states x4) + vor v0, v20, v20 + vor v1, v21, v21 + vor v2, v22, v22 + vor v3, v23, v23 + vspltw v4, v17, v0 + vspltw v5, v17, v1 + vspltw v6, v17, v2 + vspltw v7, v17, v3 + vspltw v8, v18, v0 + vspltw v9, v18, v1 + vspltw v10, v18, v2 + vspltw v11, v18, v3 + vor v12, v24, v24 + vspltw v13, v19, v1 + vspltw v14, v19, v2 + vspltw v15, v19, v3 + +.macro _plus a b_y b_x + vadduwm \a, \a, \b_y*4+(\b_x)%4 + vadduwm \a+1, \a+1, \b_y*4+(\b_x+1)%4 + vadduwm \a+2, \a+2, \b_y*4+(\b_x+2)%4 + vadduwm \a+3, \a+3, \b_y*4+(\b_x+3)%4 +.endm + +.macro _xor a b_y b_x + vxor \a, \a, \b_y*4+(\b_x)%4 + vxor \a+1, \a+1, \b_y*4+(\b_x+1)%4 + vxor \a+2, \a+2, \b_y*4+(\b_x+2)%4 + vxor \a+3, \a+3, \b_y*4+(\b_x+3)%4 +.endm + +.macro _rotl a b + vrlw \a, \a, \b + vrlw \a+1, \a+1, \b + vrlw \a+2, \a+2, \b + vrlw \a+3, \a+3, \b +.endm + +.macro _pxor a b_y b_x c + vpermxor \a, \a, \b_y*4+(\b_x)%4, \c + vpermxor \a+1, \a+1, \b_y*4+(\b_x+1)%4, \c + vpermxor \a+2, \a+2, \b_y*4+(\b_x+2)%4, \c + vpermxor \a+3, \a+3, \b_y*4+(\b_x+3)%4, \c +.endm + +# 00 01 02 03 +# 04 05 06 07 +# 08 09 10 11 +# 12 13 14 15 +.macro doubleround + # column round + _plus v0, v1, v0 # a+=b + _pxor v12, v0, v0, v28 # d^=a; d<<<=16 + _plus v8, v3, v0 # c+=d + _xor v4, v2, v0 # b^=c + _rotl v4, v29 # b<<<=12 + _plus v0, v1, v0 # a+=b + _pxor v12, v0, v0, v30 # d^=a; d<<<=8 + _plus v8, v3, v0 # c+=d + _xor v4, v2, v0 # b^=c + _rotl v4, v31 # b<<<=7 + + # diagonal round + _plus v0, v1, v1 # a+=b + _pxor v12, v0, v1, v28 # d^=a; d<<<=16 + _plus v8, v3, v1 # c+=d + _xor v4, v2, v1 # b^=c + _rotl v4, v29 # b<<<=12 + _plus v0, v1, v1 # a+=b + _pxor v12, v0, v1, v30 # d^=a; d<<<=8 + _plus v8, v3, v1 # c+=d + _xor v4, v2, v1 # b^=c + _rotl v4, v31 # b<<<=7 +.endm + + doubleround # 1 + doubleround # 2 + doubleround # 3 + doubleround # 4 + doubleround # 5 + doubleround # 6 + doubleround # 7 + doubleround # 8 + doubleround # 9 + doubleround # 10 + + # counter += original counter + vadduwm v12, v12, v24 + +.macro convert a + vmrgew 26, 0+\a, 1+\a + vmrgew 27, 2+\a, 3+\a + vmrgow 0+\a, 0+\a, 1+\a + vmrgow 2+\a, 2+\a, 3+\a + xxmrghd 33+\a, 32+\a, 34+\a + xxmrgld 35+\a, 32+\a, 34+\a + xxmrghd 32+\a, 58, 59 + xxmrgld 34+\a, 58, 59 +.endm + + convert 0 + convert 4 + convert 8 + convert 12 + +.macro addition a + vadduwm 0+\a, 0+\a, 16 + vadduwm 4+\a, 4+\a, 17 + vadduwm 8+\a, 8+\a, 18 + vadduwm 12+\a, 12+\a, 19 +.endm + + addition 0 + addition 1 + addition 2 + addition 3 + + # load text/cipher + lxvw4x vs0, 0, rSRC + lxvw4x vs1, r17, rSRC + lxvw4x vs2, r18, rSRC + lxvw4x vs3, r19, rSRC + lxvw4x vs4, r20, rSRC + lxvw4x vs5, r21, rSRC + lxvw4x vs6, r22, rSRC + lxvw4x vs7, r23, rSRC + lxvw4x vs8, r24, rSRC + lxvw4x vs9, r25, rSRC + lxvw4x vs10, r26, rSRC + lxvw4x vs11, r27, rSRC + lxvw4x vs12, r28, rSRC + lxvw4x vs13, r29, rSRC + lxvw4x vs14, r30, rSRC + lxvw4x vs15, r31, rSRC + # xor (encrypt/decrypt) + xxlxor vs0, vs0, vs32 + xxlxor vs1, vs1, vs36 + xxlxor vs2, vs2, vs40 + xxlxor vs3, vs3, vs44 + xxlxor vs4, vs4, vs33 + xxlxor vs5, vs5, vs37 + xxlxor vs6, vs6, vs41 + xxlxor vs7, vs7, vs45 + xxlxor vs8, vs8, vs34 + xxlxor vs9, vs9, vs38 + xxlxor vs10, vs10, vs42 + xxlxor vs11, vs11, vs46 + xxlxor vs12, vs12, vs35 + xxlxor vs13, vs13, vs39 + xxlxor vs14, vs14, vs43 + xxlxor vs15, vs15, vs47 + # store cipher/text + stxvw4x vs0, 0, rDST + stxvw4x vs1, r17, rDST + stxvw4x vs2, r18, rDST + stxvw4x vs3, r19, rDST + stxvw4x vs4, r20, rDST + stxvw4x vs5, r21, rDST + stxvw4x vs6, r22, rDST + stxvw4x 
vs7, r23, rDST + stxvw4x vs8, r24, rDST + stxvw4x vs9, r25, rDST + stxvw4x vs10, r26, rDST + stxvw4x vs11, r27, rDST + stxvw4x vs12, r28, rDST + stxvw4x vs13, r29, rDST + stxvw4x vs14, r30, rDST + stxvw4x vs15, r31, rDST + + # src/dst increment + addi rSRC, rSRC, 256 + addi rDST, rDST, 256 + + # counter increment + vadduwm v24, v24, v25 + + bdnz mainloop + +lastblock: + # reminder + mulld r16, r16, r15 + subf. r16, r16, rSIZE + + # check reminder + beq exitsub + + addi r14, r14, -256 + # last block x4 + # init 16 vectors (4 states x4) + vor v0, v20, v20 + vor v1, v21, v21 + vor v2, v22, v22 + vor v3, v23, v23 + vspltw v4, v17, v0 + vspltw v5, v17, v1 + vspltw v6, v17, v2 + vspltw v7, v17, v3 + vspltw v8, v18, v0 + vspltw v9, v18, v1 + vspltw v10, v18, v2 + vspltw v11, v18, v3 + vor v12, v24, v24 + vspltw v13, v19, v1 + vspltw v14, v19, v2 + vspltw v15, v19, v3 + + doubleround # 1 + doubleround # 2 + doubleround # 3 + doubleround # 4 + doubleround # 5 + doubleround # 6 + doubleround # 7 + doubleround # 8 + doubleround # 9 + doubleround # 10 + + vadduwm v12, v12, v24 + + convert 0 + convert 4 + convert 8 + convert 12 + + addition 0 + addition 1 + addition 2 + addition 3 + + # store vectors + stxvw4x vs32, 0, r14 + stxvw4x vs36, r17, r14 + stxvw4x vs40, r18, r14 + stxvw4x vs44, r19, r14 + stxvw4x vs33, r20, r14 + stxvw4x vs37, r21, r14 + stxvw4x vs41, r22, r14 + stxvw4x vs45, r23, r14 + stxvw4x vs34, r24, r14 + stxvw4x vs38, r25, r14 + stxvw4x vs42, r26, r14 + stxvw4x vs46, r27, r14 + stxvw4x vs35, r28, r14 + stxvw4x vs39, r29, r14 + stxvw4x vs43, r30, r14 + stxvw4x vs47, r31, r14 + + mtctr r16 + addi rSIZE, r14, -1 + addi rSRC, rSRC, -1 + addi rDST, rDST, -1 +xorlast: + lbzu r15, 1(rSIZE) + lbzu r16, 1(rSRC) + xor r15, r15, r16 + stbu r15, 1(rDST) + bdnz xorlast + + # zeroing last block + xxlxor vs0, vs0, vs0 + stxvw4x vs0, 0, r14 + stxvw4x vs0, r17, r14 + stxvw4x vs0, r18, r14 + stxvw4x vs0, r19, r14 + stxvw4x vs0, r20, r14 + stxvw4x vs0, r21, r14 + stxvw4x vs0, r22, r14 + stxvw4x vs0, r23, r14 + stxvw4x vs0, r24, r14 + stxvw4x vs0, r25, r14 + stxvw4x vs0, r26, r14 + stxvw4x vs0, r27, r14 + stxvw4x vs0, r28, r14 + stxvw4x vs0, r29, r14 + stxvw4x vs0, r30, r14 + stxvw4x vs0, r31, r14 + +exitsub: + # zeroing volatile registers + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + xxlxor vs8, vs8, vs8 + xxlxor vs9, vs9, vs9 + xxlxor vs10, vs10, vs10 + xxlxor vs11, vs11, vs11 + xxlxor vs12, vs12, vs12 + xxlxor vs13, vs13, vs13 + + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + + li rSIZE, 0 + li rDST, 0 + li rSRC, 0 + li rKEY, 0 + li rNONCE, 0 + li rCNTR, 0 + + # epilogue + addi r14, sp, -160 + + li r16, -16 + li r17, -32 + li r18, -48 + li r19, -64 + li r20, -80 + li r21, -96 + li r22, -112 + li r23, -128 + li r24, -144 + li r25, -160 + li r26, -176 + li r27, -192 + li r28, -208 + + # load f14, f15 + lxvw4x vs14, 0, r14 + lxvw4x vs15, r16, r14 + + # load v20 - v31 + lxvw4x vs52, r17, r14 + lxvw4x 
vs53, r18, r14 + lxvw4x vs54, r19, r14 + lxvw4x vs55, r20, r14 + lxvw4x vs56, r21, r14 + lxvw4x vs57, r22, r14 + lxvw4x vs58, r23, r14 + lxvw4x vs59, r24, r14 + lxvw4x vs60, r25, r14 + lxvw4x vs61, r26, r14 + lxvw4x vs62, r27, r14 + lxvw4x vs63, r28, r14 + + ld r14, -8(sp) + ld r15, -16(sp) + ld r16, -24(sp) + ld r17, -32(sp) + ld r18, -40(sp) + ld r19, -48(sp) + ld r20, -56(sp) + ld r21, -64(sp) + ld r22, -72(sp) + ld r23, -80(sp) + ld r24, -88(sp) + ld r25, -96(sp) + ld r26, -104(sp) + ld r27, -112(sp) + ld r28, -120(sp) + ld r29, -128(sp) + ld r30, -136(sp) + ld r31, -144(sp) + + blr diff --git a/lib/freebl/chacha20poly1305-ppc.c b/lib/freebl/chacha20poly1305-ppc.c new file mode 100644 index 0000000000..55101ceb2a --- /dev/null +++ b/lib/freebl/chacha20poly1305-ppc.c @@ -0,0 +1,588 @@ +/* MIT License + * + * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "Hacl_Chacha20Poly1305_32.h" + +/* Forward declaration from chacha20-ppc64le.S */ +void chacha20vsx(uint32_t len, uint8_t *output, uint8_t *block, uint8_t *k, + uint8_t *nonce, uint32_t ctr); + +static inline void +poly1305_padded_32(uint64_t *ctx, uint32_t len, uint8_t *text) +{ + uint32_t n = len / (uint32_t)16U; + uint32_t r = len % (uint32_t)16U; + uint8_t *blocks = text; + uint8_t *rem = text + n * (uint32_t)16U; + uint64_t *pre0 = ctx + (uint32_t)5U; + uint64_t *acc0 = ctx; + uint32_t nb = n * (uint32_t)16U / (uint32_t)16U; + uint32_t rem1 = n * (uint32_t)16U % (uint32_t)16U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) { + uint8_t *block = blocks + i * (uint32_t)16U; + uint64_t e[5U] = { 0U }; + uint64_t u0 = load64_le(block); + uint64_t lo = u0; + uint64_t u = load64_le(block + (uint32_t)8U); + uint64_t hi = u; + uint64_t f0 = lo; + uint64_t f1 = hi; + uint64_t f010 = f0 & (uint64_t)0x3ffffffU; + uint64_t f110 = f0 >> (uint32_t)26U & (uint64_t)0x3ffffffU; + uint64_t f20 = f0 >> (uint32_t)52U | (f1 & (uint64_t)0x3fffU) << (uint32_t)12U; + uint64_t f30 = f1 >> (uint32_t)14U & (uint64_t)0x3ffffffU; + uint64_t f40 = f1 >> (uint32_t)40U; + uint64_t f01 = f010; + uint64_t f111 = f110; + uint64_t f2 = f20; + uint64_t f3 = f30; + uint64_t f41 = f40; + e[0U] = f01; + e[1U] = f111; + e[2U] = f2; + e[3U] = f3; + e[4U] = f41; + uint64_t b = (uint64_t)0x1000000U; + uint64_t mask = b; + uint64_t f4 = e[4U]; + e[4U] = f4 | mask; + uint64_t *r1 = pre0; + uint64_t *r5 = pre0 + (uint32_t)5U; + uint64_t r0 = r1[0U]; + uint64_t r11 = r1[1U]; + uint64_t r2 = r1[2U]; + uint64_t r3 = r1[3U]; + uint64_t r4 = r1[4U]; + uint64_t r51 = r5[1U]; + uint64_t r52 = r5[2U]; + uint64_t r53 = r5[3U]; + uint64_t r54 = r5[4U]; + uint64_t f10 = e[0U]; + uint64_t f11 = e[1U]; + uint64_t f12 = e[2U]; + uint64_t f13 = e[3U]; + uint64_t f14 = e[4U]; + uint64_t a0 = acc0[0U]; + uint64_t a1 = acc0[1U]; + uint64_t a2 = acc0[2U]; + uint64_t a3 = acc0[3U]; + uint64_t a4 = acc0[4U]; + uint64_t a01 = a0 + f10; + uint64_t a11 = a1 + f11; + uint64_t a21 = a2 + f12; + uint64_t a31 = a3 + f13; + uint64_t a41 = a4 + f14; + uint64_t a02 = r0 * a01; + uint64_t a12 = r11 * a01; + uint64_t a22 = r2 * a01; + uint64_t a32 = r3 * a01; + uint64_t a42 = r4 * a01; + uint64_t a03 = a02 + r54 * a11; + uint64_t a13 = a12 + r0 * a11; + uint64_t a23 = a22 + r11 * a11; + uint64_t a33 = a32 + r2 * a11; + uint64_t a43 = a42 + r3 * a11; + uint64_t a04 = a03 + r53 * a21; + uint64_t a14 = a13 + r54 * a21; + uint64_t a24 = a23 + r0 * a21; + uint64_t a34 = a33 + r11 * a21; + uint64_t a44 = a43 + r2 * a21; + uint64_t a05 = a04 + r52 * a31; + uint64_t a15 = a14 + r53 * a31; + uint64_t a25 = a24 + r54 * a31; + uint64_t a35 = a34 + r0 * a31; + uint64_t a45 = a44 + r11 * a31; + uint64_t a06 = a05 + r51 * a41; + uint64_t a16 = a15 + r52 * a41; + uint64_t a26 = a25 + r53 * a41; + uint64_t a36 = a35 + r54 * a41; + uint64_t a46 = a45 + r0 * a41; + uint64_t t0 = a06; + uint64_t t1 = a16; + uint64_t t2 = a26; + uint64_t t3 = a36; + uint64_t t4 = a46; + uint64_t mask26 = (uint64_t)0x3ffffffU; + uint64_t z0 = t0 >> (uint32_t)26U; + uint64_t z1 = t3 >> (uint32_t)26U; + uint64_t x0 = t0 & mask26; + uint64_t x3 = t3 & mask26; + uint64_t x1 = t1 + z0; + uint64_t x4 = t4 + z1; + uint64_t z01 = x1 >> (uint32_t)26U; + uint64_t z11 = x4 >> (uint32_t)26U; + uint64_t t = z11 << (uint32_t)2U; + uint64_t z12 = z11 + t; + uint64_t x11 = x1 & mask26; + uint64_t x41 = x4 & mask26; + uint64_t x2 = t2 + z01; + uint64_t x01 = x0 + z12; + uint64_t z02 = x2 >> 
(uint32_t)26U; + uint64_t z13 = x01 >> (uint32_t)26U; + uint64_t x21 = x2 & mask26; + uint64_t x02 = x01 & mask26; + uint64_t x31 = x3 + z02; + uint64_t x12 = x11 + z13; + uint64_t z03 = x31 >> (uint32_t)26U; + uint64_t x32 = x31 & mask26; + uint64_t x42 = x41 + z03; + uint64_t o0 = x02; + uint64_t o1 = x12; + uint64_t o2 = x21; + uint64_t o3 = x32; + uint64_t o4 = x42; + acc0[0U] = o0; + acc0[1U] = o1; + acc0[2U] = o2; + acc0[3U] = o3; + acc0[4U] = o4; + } + if (rem1 > (uint32_t)0U) { + uint8_t *last = blocks + nb * (uint32_t)16U; + uint64_t e[5U] = { 0U }; + uint8_t tmp[16U] = { 0U }; + memcpy(tmp, last, rem1 * sizeof(last[0U])); + uint64_t u0 = load64_le(tmp); + uint64_t lo = u0; + uint64_t u = load64_le(tmp + (uint32_t)8U); + uint64_t hi = u; + uint64_t f0 = lo; + uint64_t f1 = hi; + uint64_t f010 = f0 & (uint64_t)0x3ffffffU; + uint64_t f110 = f0 >> (uint32_t)26U & (uint64_t)0x3ffffffU; + uint64_t f20 = f0 >> (uint32_t)52U | (f1 & (uint64_t)0x3fffU) << (uint32_t)12U; + uint64_t f30 = f1 >> (uint32_t)14U & (uint64_t)0x3ffffffU; + uint64_t f40 = f1 >> (uint32_t)40U; + uint64_t f01 = f010; + uint64_t f111 = f110; + uint64_t f2 = f20; + uint64_t f3 = f30; + uint64_t f4 = f40; + e[0U] = f01; + e[1U] = f111; + e[2U] = f2; + e[3U] = f3; + e[4U] = f4; + uint64_t b = (uint64_t)1U << rem1 * (uint32_t)8U % (uint32_t)26U; + uint64_t mask = b; + uint64_t fi = e[rem1 * (uint32_t)8U / (uint32_t)26U]; + e[rem1 * (uint32_t)8U / (uint32_t)26U] = fi | mask; + uint64_t *r1 = pre0; + uint64_t *r5 = pre0 + (uint32_t)5U; + uint64_t r0 = r1[0U]; + uint64_t r11 = r1[1U]; + uint64_t r2 = r1[2U]; + uint64_t r3 = r1[3U]; + uint64_t r4 = r1[4U]; + uint64_t r51 = r5[1U]; + uint64_t r52 = r5[2U]; + uint64_t r53 = r5[3U]; + uint64_t r54 = r5[4U]; + uint64_t f10 = e[0U]; + uint64_t f11 = e[1U]; + uint64_t f12 = e[2U]; + uint64_t f13 = e[3U]; + uint64_t f14 = e[4U]; + uint64_t a0 = acc0[0U]; + uint64_t a1 = acc0[1U]; + uint64_t a2 = acc0[2U]; + uint64_t a3 = acc0[3U]; + uint64_t a4 = acc0[4U]; + uint64_t a01 = a0 + f10; + uint64_t a11 = a1 + f11; + uint64_t a21 = a2 + f12; + uint64_t a31 = a3 + f13; + uint64_t a41 = a4 + f14; + uint64_t a02 = r0 * a01; + uint64_t a12 = r11 * a01; + uint64_t a22 = r2 * a01; + uint64_t a32 = r3 * a01; + uint64_t a42 = r4 * a01; + uint64_t a03 = a02 + r54 * a11; + uint64_t a13 = a12 + r0 * a11; + uint64_t a23 = a22 + r11 * a11; + uint64_t a33 = a32 + r2 * a11; + uint64_t a43 = a42 + r3 * a11; + uint64_t a04 = a03 + r53 * a21; + uint64_t a14 = a13 + r54 * a21; + uint64_t a24 = a23 + r0 * a21; + uint64_t a34 = a33 + r11 * a21; + uint64_t a44 = a43 + r2 * a21; + uint64_t a05 = a04 + r52 * a31; + uint64_t a15 = a14 + r53 * a31; + uint64_t a25 = a24 + r54 * a31; + uint64_t a35 = a34 + r0 * a31; + uint64_t a45 = a44 + r11 * a31; + uint64_t a06 = a05 + r51 * a41; + uint64_t a16 = a15 + r52 * a41; + uint64_t a26 = a25 + r53 * a41; + uint64_t a36 = a35 + r54 * a41; + uint64_t a46 = a45 + r0 * a41; + uint64_t t0 = a06; + uint64_t t1 = a16; + uint64_t t2 = a26; + uint64_t t3 = a36; + uint64_t t4 = a46; + uint64_t mask26 = (uint64_t)0x3ffffffU; + uint64_t z0 = t0 >> (uint32_t)26U; + uint64_t z1 = t3 >> (uint32_t)26U; + uint64_t x0 = t0 & mask26; + uint64_t x3 = t3 & mask26; + uint64_t x1 = t1 + z0; + uint64_t x4 = t4 + z1; + uint64_t z01 = x1 >> (uint32_t)26U; + uint64_t z11 = x4 >> (uint32_t)26U; + uint64_t t = z11 << (uint32_t)2U; + uint64_t z12 = z11 + t; + uint64_t x11 = x1 & mask26; + uint64_t x41 = x4 & mask26; + uint64_t x2 = t2 + z01; + uint64_t x01 = x0 + z12; + uint64_t z02 = x2 >> 
(uint32_t)26U; + uint64_t z13 = x01 >> (uint32_t)26U; + uint64_t x21 = x2 & mask26; + uint64_t x02 = x01 & mask26; + uint64_t x31 = x3 + z02; + uint64_t x12 = x11 + z13; + uint64_t z03 = x31 >> (uint32_t)26U; + uint64_t x32 = x31 & mask26; + uint64_t x42 = x41 + z03; + uint64_t o0 = x02; + uint64_t o1 = x12; + uint64_t o2 = x21; + uint64_t o3 = x32; + uint64_t o4 = x42; + acc0[0U] = o0; + acc0[1U] = o1; + acc0[2U] = o2; + acc0[3U] = o3; + acc0[4U] = o4; + } + uint8_t tmp[16U] = { 0U }; + memcpy(tmp, rem, r * sizeof(rem[0U])); + if (r > (uint32_t)0U) { + uint64_t *pre = ctx + (uint32_t)5U; + uint64_t *acc = ctx; + uint64_t e[5U] = { 0U }; + uint64_t u0 = load64_le(tmp); + uint64_t lo = u0; + uint64_t u = load64_le(tmp + (uint32_t)8U); + uint64_t hi = u; + uint64_t f0 = lo; + uint64_t f1 = hi; + uint64_t f010 = f0 & (uint64_t)0x3ffffffU; + uint64_t f110 = f0 >> (uint32_t)26U & (uint64_t)0x3ffffffU; + uint64_t f20 = f0 >> (uint32_t)52U | (f1 & (uint64_t)0x3fffU) << (uint32_t)12U; + uint64_t f30 = f1 >> (uint32_t)14U & (uint64_t)0x3ffffffU; + uint64_t f40 = f1 >> (uint32_t)40U; + uint64_t f01 = f010; + uint64_t f111 = f110; + uint64_t f2 = f20; + uint64_t f3 = f30; + uint64_t f41 = f40; + e[0U] = f01; + e[1U] = f111; + e[2U] = f2; + e[3U] = f3; + e[4U] = f41; + uint64_t b = (uint64_t)0x1000000U; + uint64_t mask = b; + uint64_t f4 = e[4U]; + e[4U] = f4 | mask; + uint64_t *r1 = pre; + uint64_t *r5 = pre + (uint32_t)5U; + uint64_t r0 = r1[0U]; + uint64_t r11 = r1[1U]; + uint64_t r2 = r1[2U]; + uint64_t r3 = r1[3U]; + uint64_t r4 = r1[4U]; + uint64_t r51 = r5[1U]; + uint64_t r52 = r5[2U]; + uint64_t r53 = r5[3U]; + uint64_t r54 = r5[4U]; + uint64_t f10 = e[0U]; + uint64_t f11 = e[1U]; + uint64_t f12 = e[2U]; + uint64_t f13 = e[3U]; + uint64_t f14 = e[4U]; + uint64_t a0 = acc[0U]; + uint64_t a1 = acc[1U]; + uint64_t a2 = acc[2U]; + uint64_t a3 = acc[3U]; + uint64_t a4 = acc[4U]; + uint64_t a01 = a0 + f10; + uint64_t a11 = a1 + f11; + uint64_t a21 = a2 + f12; + uint64_t a31 = a3 + f13; + uint64_t a41 = a4 + f14; + uint64_t a02 = r0 * a01; + uint64_t a12 = r11 * a01; + uint64_t a22 = r2 * a01; + uint64_t a32 = r3 * a01; + uint64_t a42 = r4 * a01; + uint64_t a03 = a02 + r54 * a11; + uint64_t a13 = a12 + r0 * a11; + uint64_t a23 = a22 + r11 * a11; + uint64_t a33 = a32 + r2 * a11; + uint64_t a43 = a42 + r3 * a11; + uint64_t a04 = a03 + r53 * a21; + uint64_t a14 = a13 + r54 * a21; + uint64_t a24 = a23 + r0 * a21; + uint64_t a34 = a33 + r11 * a21; + uint64_t a44 = a43 + r2 * a21; + uint64_t a05 = a04 + r52 * a31; + uint64_t a15 = a14 + r53 * a31; + uint64_t a25 = a24 + r54 * a31; + uint64_t a35 = a34 + r0 * a31; + uint64_t a45 = a44 + r11 * a31; + uint64_t a06 = a05 + r51 * a41; + uint64_t a16 = a15 + r52 * a41; + uint64_t a26 = a25 + r53 * a41; + uint64_t a36 = a35 + r54 * a41; + uint64_t a46 = a45 + r0 * a41; + uint64_t t0 = a06; + uint64_t t1 = a16; + uint64_t t2 = a26; + uint64_t t3 = a36; + uint64_t t4 = a46; + uint64_t mask26 = (uint64_t)0x3ffffffU; + uint64_t z0 = t0 >> (uint32_t)26U; + uint64_t z1 = t3 >> (uint32_t)26U; + uint64_t x0 = t0 & mask26; + uint64_t x3 = t3 & mask26; + uint64_t x1 = t1 + z0; + uint64_t x4 = t4 + z1; + uint64_t z01 = x1 >> (uint32_t)26U; + uint64_t z11 = x4 >> (uint32_t)26U; + uint64_t t = z11 << (uint32_t)2U; + uint64_t z12 = z11 + t; + uint64_t x11 = x1 & mask26; + uint64_t x41 = x4 & mask26; + uint64_t x2 = t2 + z01; + uint64_t x01 = x0 + z12; + uint64_t z02 = x2 >> (uint32_t)26U; + uint64_t z13 = x01 >> (uint32_t)26U; + uint64_t x21 = x2 & mask26; + uint64_t x02 = x01 
& mask26; + uint64_t x31 = x3 + z02; + uint64_t x12 = x11 + z13; + uint64_t z03 = x31 >> (uint32_t)26U; + uint64_t x32 = x31 & mask26; + uint64_t x42 = x41 + z03; + uint64_t o0 = x02; + uint64_t o1 = x12; + uint64_t o2 = x21; + uint64_t o3 = x32; + uint64_t o4 = x42; + acc[0U] = o0; + acc[1U] = o1; + acc[2U] = o2; + acc[3U] = o3; + acc[4U] = o4; + return; + } +} + +static inline void +poly1305_do_32( + uint8_t *k, + uint32_t aadlen, + uint8_t *aad, + uint32_t mlen, + uint8_t *m, + uint8_t *out) +{ + uint64_t ctx[25U] = { 0U }; + uint8_t block[16U] = { 0U }; + Hacl_Poly1305_32_poly1305_init(ctx, k); + poly1305_padded_32(ctx, aadlen, aad); + poly1305_padded_32(ctx, mlen, m); + store64_le(block, (uint64_t)aadlen); + store64_le(block + (uint32_t)8U, (uint64_t)mlen); + uint64_t *pre = ctx + (uint32_t)5U; + uint64_t *acc = ctx; + uint64_t e[5U] = { 0U }; + uint64_t u0 = load64_le(block); + uint64_t lo = u0; + uint64_t u = load64_le(block + (uint32_t)8U); + uint64_t hi = u; + uint64_t f0 = lo; + uint64_t f1 = hi; + uint64_t f010 = f0 & (uint64_t)0x3ffffffU; + uint64_t f110 = f0 >> (uint32_t)26U & (uint64_t)0x3ffffffU; + uint64_t f20 = f0 >> (uint32_t)52U | (f1 & (uint64_t)0x3fffU) << (uint32_t)12U; + uint64_t f30 = f1 >> (uint32_t)14U & (uint64_t)0x3ffffffU; + uint64_t f40 = f1 >> (uint32_t)40U; + uint64_t f01 = f010; + uint64_t f111 = f110; + uint64_t f2 = f20; + uint64_t f3 = f30; + uint64_t f41 = f40; + e[0U] = f01; + e[1U] = f111; + e[2U] = f2; + e[3U] = f3; + e[4U] = f41; + uint64_t b = (uint64_t)0x1000000U; + uint64_t mask = b; + uint64_t f4 = e[4U]; + e[4U] = f4 | mask; + uint64_t *r = pre; + uint64_t *r5 = pre + (uint32_t)5U; + uint64_t r0 = r[0U]; + uint64_t r1 = r[1U]; + uint64_t r2 = r[2U]; + uint64_t r3 = r[3U]; + uint64_t r4 = r[4U]; + uint64_t r51 = r5[1U]; + uint64_t r52 = r5[2U]; + uint64_t r53 = r5[3U]; + uint64_t r54 = r5[4U]; + uint64_t f10 = e[0U]; + uint64_t f11 = e[1U]; + uint64_t f12 = e[2U]; + uint64_t f13 = e[3U]; + uint64_t f14 = e[4U]; + uint64_t a0 = acc[0U]; + uint64_t a1 = acc[1U]; + uint64_t a2 = acc[2U]; + uint64_t a3 = acc[3U]; + uint64_t a4 = acc[4U]; + uint64_t a01 = a0 + f10; + uint64_t a11 = a1 + f11; + uint64_t a21 = a2 + f12; + uint64_t a31 = a3 + f13; + uint64_t a41 = a4 + f14; + uint64_t a02 = r0 * a01; + uint64_t a12 = r1 * a01; + uint64_t a22 = r2 * a01; + uint64_t a32 = r3 * a01; + uint64_t a42 = r4 * a01; + uint64_t a03 = a02 + r54 * a11; + uint64_t a13 = a12 + r0 * a11; + uint64_t a23 = a22 + r1 * a11; + uint64_t a33 = a32 + r2 * a11; + uint64_t a43 = a42 + r3 * a11; + uint64_t a04 = a03 + r53 * a21; + uint64_t a14 = a13 + r54 * a21; + uint64_t a24 = a23 + r0 * a21; + uint64_t a34 = a33 + r1 * a21; + uint64_t a44 = a43 + r2 * a21; + uint64_t a05 = a04 + r52 * a31; + uint64_t a15 = a14 + r53 * a31; + uint64_t a25 = a24 + r54 * a31; + uint64_t a35 = a34 + r0 * a31; + uint64_t a45 = a44 + r1 * a31; + uint64_t a06 = a05 + r51 * a41; + uint64_t a16 = a15 + r52 * a41; + uint64_t a26 = a25 + r53 * a41; + uint64_t a36 = a35 + r54 * a41; + uint64_t a46 = a45 + r0 * a41; + uint64_t t0 = a06; + uint64_t t1 = a16; + uint64_t t2 = a26; + uint64_t t3 = a36; + uint64_t t4 = a46; + uint64_t mask26 = (uint64_t)0x3ffffffU; + uint64_t z0 = t0 >> (uint32_t)26U; + uint64_t z1 = t3 >> (uint32_t)26U; + uint64_t x0 = t0 & mask26; + uint64_t x3 = t3 & mask26; + uint64_t x1 = t1 + z0; + uint64_t x4 = t4 + z1; + uint64_t z01 = x1 >> (uint32_t)26U; + uint64_t z11 = x4 >> (uint32_t)26U; + uint64_t t = z11 << (uint32_t)2U; + uint64_t z12 = z11 + t; + uint64_t x11 = x1 & mask26; + 
uint64_t x41 = x4 & mask26; + uint64_t x2 = t2 + z01; + uint64_t x01 = x0 + z12; + uint64_t z02 = x2 >> (uint32_t)26U; + uint64_t z13 = x01 >> (uint32_t)26U; + uint64_t x21 = x2 & mask26; + uint64_t x02 = x01 & mask26; + uint64_t x31 = x3 + z02; + uint64_t x12 = x11 + z13; + uint64_t z03 = x31 >> (uint32_t)26U; + uint64_t x32 = x31 & mask26; + uint64_t x42 = x41 + z03; + uint64_t o0 = x02; + uint64_t o1 = x12; + uint64_t o2 = x21; + uint64_t o3 = x32; + uint64_t o4 = x42; + acc[0U] = o0; + acc[1U] = o1; + acc[2U] = o2; + acc[3U] = o3; + acc[4U] = o4; + Hacl_Poly1305_32_poly1305_finish(out, k, ctx); +} + +void +Chacha20Poly1305_vsx_aead_encrypt( + uint8_t *k, + uint8_t *n, + uint32_t aadlen, + uint8_t *aad, + uint32_t mlen, + uint8_t *m, + uint8_t *cipher, + uint8_t *mac) +{ + chacha20vsx(mlen, cipher, m, k, n, (uint32_t)1U); + uint8_t tmp[64U] = { 0U }; + chacha20vsx((uint32_t)64U, tmp, tmp, k, n, (uint32_t)0U); + uint8_t *key = tmp; + poly1305_do_32(key, aadlen, aad, mlen, cipher, mac); +} + +uint32_t +Chacha20Poly1305_vsx_aead_decrypt( + uint8_t *k, + uint8_t *n, + uint32_t aadlen, + uint8_t *aad, + uint32_t mlen, + uint8_t *m, + uint8_t *cipher, + uint8_t *mac) +{ + uint8_t computed_mac[16U] = { 0U }; + uint8_t tmp[64U] = { 0U }; + chacha20vsx((uint32_t)64U, tmp, tmp, k, n, (uint32_t)0U); + uint8_t *key = tmp; + poly1305_do_32(key, aadlen, aad, mlen, cipher, computed_mac); + uint8_t res = (uint8_t)255U; + for (uint32_t i = (uint32_t)0U; i < (uint32_t)16U; i++) { + uint8_t uu____0 = FStar_UInt8_eq_mask(computed_mac[i], mac[i]); + res = uu____0 & res; + } + uint8_t z = res; + if (z == (uint8_t)255U) { + chacha20vsx(mlen, m, cipher, k, n, (uint32_t)1U); + return (uint32_t)0U; + } + return (uint32_t)1U; +} diff --git a/lib/freebl/chacha20poly1305.c b/lib/freebl/chacha20poly1305.c index 5c294a9eaf..aa1a63fe41 100644 --- a/lib/freebl/chacha20poly1305.c +++ b/lib/freebl/chacha20poly1305.c @@ -69,6 +69,20 @@ Hacl_Chacha20Poly1305_32_aead_decrypt(uint8_t *k, uint8_t *n1, uint32_t aadlen, uint8_t *aad, uint32_t mlen, uint8_t *m, uint8_t *cipher, uint8_t *mac); +// Forward declaration from chacha20-ppc64le.S +void chacha20vsx(uint32_t len, uint8_t *output, uint8_t *block, uint8_t *k, + uint8_t *nonce, uint32_t ctr); + +// Forward declaration from chacha20poly1305-ppc.c +extern void +Chacha20Poly1305_vsx_aead_encrypt(uint8_t *k, uint8_t *n1, uint32_t aadlen, + uint8_t *aad, uint32_t mlen, uint8_t *m, + uint8_t *cipher, uint8_t *mac); +extern uint32_t +Chacha20Poly1305_vsx_aead_decrypt(uint8_t *k, uint8_t *n1, uint32_t aadlen, + uint8_t *aad, uint32_t mlen, uint8_t *m, + uint8_t *cipher, uint8_t *mac); + SECStatus ChaCha20Poly1305_InitContext(ChaCha20Poly1305Context *ctx, const unsigned char *key, unsigned int keyLen, @@ -144,6 +158,11 @@ ChaCha20Xor(uint8_t *output, uint8_t *block, uint32_t len, uint8_t *k, } #endif } else +#elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__) && \ + !defined(NSS_DISABLE_ALTIVEC) && !defined(NSS_DISABLE_CRYPTO_VSX) + if (__builtin_cpu_supports ("vsx")) { + chacha20vsx(len, output, block, k, nonce, ctr); + } else #endif { Hacl_Chacha20_chacha20_encrypt(len, output, block, k, nonce, ctr); @@ -212,6 +231,13 @@ ChaCha20Poly1305_Seal(const ChaCha20Poly1305Context *ctx, unsigned char *output, } #endif } else +#elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__) && \ + !defined(NSS_DISABLE_ALTIVEC) && !defined(NSS_DISABLE_CRYPTO_VSX) + if (__builtin_cpu_supports ("vsx")) { + Chacha20Poly1305_vsx_aead_encrypt( + (uint8_t *)ctx->key, (uint8_t *)nonce, adLen, 
(uint8_t *)ad, inputLen, + (uint8_t *)input, output, output + inputLen); + } else #endif { Hacl_Chacha20Poly1305_32_aead_encrypt( @@ -274,6 +300,13 @@ ChaCha20Poly1305_Open(const ChaCha20Poly1305Context *ctx, unsigned char *output, } #endif } else +#elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__) && \ + !defined(NSS_DISABLE_ALTIVEC) && !defined(NSS_DISABLE_CRYPTO_VSX) + if (__builtin_cpu_supports ("vsx")) { + res = Chacha20Poly1305_vsx_aead_decrypt( + (uint8_t *)ctx->key, (uint8_t *)nonce, adLen, (uint8_t *)ad, ciphertextLen, + (uint8_t *)output, (uint8_t *)input, (uint8_t *)input + ciphertextLen); + } else #endif { res = Hacl_Chacha20Poly1305_32_aead_decrypt( @@ -323,6 +356,13 @@ ChaCha20Poly1305_Encrypt(const ChaCha20Poly1305Context *ctx, (uint8_t *)ctx->key, (uint8_t *)nonce, adLen, (uint8_t *)ad, inputLen, (uint8_t *)input, output, outTag); } else +#elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__) && \ + !defined(NSS_DISABLE_ALTIVEC) && !defined(NSS_DISABLE_CRYPTO_VSX) + if (__builtin_cpu_supports ("vsx")) { + Chacha20Poly1305_vsx_aead_encrypt( + (uint8_t *)ctx->key, (uint8_t *)nonce, adLen, (uint8_t *)ad, inputLen, + (uint8_t *)input, output, outTag); + } else #endif { Hacl_Chacha20Poly1305_32_aead_encrypt( @@ -370,6 +410,13 @@ ChaCha20Poly1305_Decrypt(const ChaCha20Poly1305Context *ctx, (uint8_t *)ctx->key, (uint8_t *)nonce, adLen, (uint8_t *)ad, ciphertextLen, (uint8_t *)output, (uint8_t *)input, (uint8_t *)tagIn); } else +#elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__) && \ + !defined(NSS_DISABLE_ALTIVEC) && !defined(NSS_DISABLE_CRYPTO_VSX) + if (__builtin_cpu_supports ("vsx")) { + res = Chacha20Poly1305_vsx_aead_decrypt( + (uint8_t *)ctx->key, (uint8_t *)nonce, adLen, (uint8_t *)ad, ciphertextLen, + (uint8_t *)output, (uint8_t *)input, (uint8_t *)tagIn); + } else #endif { res = Hacl_Chacha20Poly1305_32_aead_decrypt( diff --git a/lib/freebl/freebl.gyp b/lib/freebl/freebl.gyp index 6578fac6a0..19807e4011 100644 --- a/lib/freebl/freebl.gyp +++ b/lib/freebl/freebl.gyp @@ -324,6 +324,14 @@ 'FREEBL_NO_DEPEND', ], }, + { + 'target_name': 'chacha20-ppc_lib', + 'type': 'static_library', + 'sources': [ + 'chacha20poly1305-ppc.c', + 'chacha20-ppc64le.S', + ] + }, { 'target_name': 'armv8_c_lib', 'type': 'static_library', @@ -410,6 +418,7 @@ 'dependencies': [ 'gcm-aes-ppc_c_lib', 'gcm-sha512-ppc_c_lib', + 'chacha20-ppc_lib', ], }], [ 'disable_altivec==1 and (target_arch=="ppc64" or target_arch=="ppc64le")', { diff --git a/lib/freebl/freebl_base.gypi b/lib/freebl/freebl_base.gypi index 39ec14982e..afbffac729 100644 --- a/lib/freebl/freebl_base.gypi +++ b/lib/freebl/freebl_base.gypi @@ -95,6 +95,12 @@ 'mpi/mpi_arm.c', ], }], + [ 'target_arch=="ppc64le"', { + 'sources': [ + 'chacha20poly1305-ppc.c', + 'chacha20-ppc64le.S', + ], + }] ], }], [ 'OS=="win"', {
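The patch exposes chacha20vsx and the Chacha20Poly1305_vsx_aead_* entry points but does not include a standalone example of calling them. The sketch below is a hypothetical round-trip check, not part of the patch: it assumes a ppc64le build of freebl that contains these files, copies the prototypes from chacha20poly1305-ppc.c, guards the call with the same __builtin_cpu_supports("vsx") check the patch adds in chacha20poly1305.c, and uses illustrative all-zero key/nonce/AAD values. The message is made longer than 256 bytes so both the four-block main loop and the byte-wise tail in chacha20-ppc64le.S are exercised.

    /* Hypothetical round-trip test for the VSX AEAD path (not part of the patch). */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Prototypes as declared in chacha20poly1305-ppc.c. */
    void Chacha20Poly1305_vsx_aead_encrypt(uint8_t *k, uint8_t *n, uint32_t aadlen,
                                            uint8_t *aad, uint32_t mlen, uint8_t *m,
                                            uint8_t *cipher, uint8_t *mac);
    uint32_t Chacha20Poly1305_vsx_aead_decrypt(uint8_t *k, uint8_t *n, uint32_t aadlen,
                                               uint8_t *aad, uint32_t mlen, uint8_t *m,
                                               uint8_t *cipher, uint8_t *mac);

    int main(void)
    {
        if (!__builtin_cpu_supports("vsx")) {
            puts("VSX not available; the generic HACL* path would be used instead.");
            return 0;
        }

        uint8_t key[32] = { 0 };   /* all-zero key, illustrative only */
        uint8_t nonce[12] = { 0 }; /* 96-bit IETF nonce */
        uint8_t aad[16] = { 0 };   /* additional authenticated data */
        uint8_t msg[300], cipher[300], plain[300], tag[16];

        /* > 256 bytes: exercises the 4-block main loop plus the partial-block tail. */
        memset(msg, 0x5a, sizeof(msg));

        Chacha20Poly1305_vsx_aead_encrypt(key, nonce, sizeof(aad), aad,
                                          sizeof(msg), msg, cipher, tag);

        /* Returns 0 when the recomputed Poly1305 tag matches 'tag'. */
        uint32_t rv = Chacha20Poly1305_vsx_aead_decrypt(key, nonce, sizeof(aad), aad,
                                                        sizeof(msg), plain, cipher, tag);

        if (rv == 0 && memcmp(msg, plain, sizeof(msg)) == 0)
            puts("round trip OK");
        else
            puts("tag mismatch or decrypt failure");
        return 0;
    }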