diff --git a/gtests/freebl_gtest/freebl_gtest.gyp b/gtests/freebl_gtest/freebl_gtest.gyp index 99f10fbd26..fa3c9fb2c3 100644 --- a/gtests/freebl_gtest/freebl_gtest.gyp +++ b/gtests/freebl_gtest/freebl_gtest.gyp @@ -14,6 +14,7 @@ 'mpi_unittest.cc', 'dh_unittest.cc', 'ecl_unittest.cc', + 'ghash_unittest.cc', '<(DEPTH)/gtests/common/gtests.cc' ], 'dependencies': [ diff --git a/gtests/freebl_gtest/freebl_util.h b/gtests/freebl_gtest/freebl_util.h new file mode 100644 index 0000000000..c992b9c4ea --- /dev/null +++ b/gtests/freebl_gtest/freebl_util.h @@ -0,0 +1,16 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this file, +// You can obtain one at http://mozilla.org/MPL/2.0/. + +#include +#include +#include + +std::vector hex_string_to_bytes(std::string s) { + std::vector bytes; + assert(s.length() % 2 == 0); + for (size_t i = 0; i < s.length(); i += 2) { + bytes.push_back(std::stoul(s.substr(i, 2), nullptr, 16)); + } + return bytes; +} diff --git a/gtests/freebl_gtest/ghash_unittest.cc b/gtests/freebl_gtest/ghash_unittest.cc new file mode 100644 index 0000000000..eeca2daaf2 --- /dev/null +++ b/gtests/freebl_gtest/ghash_unittest.cc @@ -0,0 +1,163 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this file, +// You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "gtest/gtest.h" + +#include "freebl_util.h" +#include "gcm.h" + +namespace nss_test { + +typedef struct ghash_kat_str { + std::string hash_key; + std::string additional_data; + std::string cipher_text; + std::string result; +} ghash_kat_value; + +/* + * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf + */ +const ghash_kat_value kKatValues[] = { + {"66e94bd4ef8a2c3b884cfa59ca342b2e", "", "", + "00000000000000000000000000000000"}, + + {"66e94bd4ef8a2c3b884cfa59ca342b2e", "", "0388dace60b6a392f328c2b971b2fe78", + "f38cbb1ad69223dcc3457ae5b6b0f885"}, + + {"b83b533708bf535d0aa6e52980d53b78", "", + "42831ec2217774244b7221b784d0d49ce3aa212f2c02a4e035c17e2329aca12e21d514b25" + "4" + "66931c7d8f6a5aac84aa051ba30b396a0aac973d58e091473f5985", + "7f1b32b81b820d02614f8895ac1d4eac"}, + + {"b83b533708bf535d0aa6e52980d53b78", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "42831ec2217774244b7221b784d0d49ce3aa212f2c02a4e035c17e2329aca12e21d514b25" + "4" + "66931c7d8f6a5aac84aa051ba30b396a0aac973d58e091", + "698e57f70e6ecc7fd9463b7260a9ae5f"}, + + {"b83b533708bf535d0aa6e52980d53b78", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "61353b4c2806934a777ff51fa22a4755699b2a714fcdc6f83766e5f97b6c742373806900e" + "4" + "9f24b22b097544d4896b424989b5e1ebac0f07c23f4598", + "df586bb4c249b92cb6922877e444d37b"}, + + {"b83b533708bf535d0aa6e52980d53b78", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "8ce24998625615b603a033aca13fb894be9112a5c3a211a8ba262a3cca7e2ca701e4a9a4f" + "b" + "a43c90ccdcb281d48c7c6fd62875d2aca417034c34aee5", + "1c5afe9760d3932f3c9a878aac3dc3de"}, + + {"aae06992acbf52a3e8f4a96ec9300bd7", "", "98e7247c07f0fe411c267e4384b0f600", + "e2c63f0ac44ad0e02efa05ab6743d4ce"}, + + {"466923ec9ae682214f2c082badb39249", "", + "3980ca0b3c00e841eb06fac4872a2757859e1ceaa6efd984628593b40ca1e19c7d773d00c" + "1" + "44c525ac619d18c84a3f4718e2448b2fe324d9ccda2710acade256", + "51110d40f6c8fff0eb1ae33445a889f0"}, + + 
{"466923ec9ae682214f2c082badb39249", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "3980ca0b3c00e841eb06fac4872a2757859e1ceaa6efd984628593b40ca1e19c7d773d00c" + "1" + "44c525ac619d18c84a3f4718e2448b2fe324d9ccda2710", + "ed2ce3062e4a8ec06db8b4c490e8a268"}, + + {"466923ec9ae682214f2c082badb39249", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "0f10f599ae14a154ed24b36e25324db8c566632ef2bbb34f8347280fc4507057fddc29df9" + "a" + "471f75c66541d4d4dad1c9e93a19a58e8b473fa0f062f7", + "1e6a133806607858ee80eaf237064089"}, + + {"466923ec9ae682214f2c082badb39249", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "d27e88681ce3243c4830165a8fdcf9ff1de9a1d8e6b447ef6ef7b79828666e4581e79012a" + "f" + "34ddd9e2f037589b292db3e67c036745fa22e7e9b7373b", + "82567fb0b4cc371801eadec005968e94"}, + + {"dc95c078a2408989ad48a21492842087", "", "cea7403d4d606b6e074ec5d3baf39d18", + "83de425c5edc5d498f382c441041ca92"}, + + {"acbef20579b4b8ebce889bac8732dad7", "", + "522dc1f099567d07f47f37a32a84427d643a8cdcbfe5c0c97598a2bd2555d1aa8cb08e485" + "9" + "0dbb3da7b08b1056828838c5f61e6393ba7a0abcc9f662898015ad", + "4db870d37cb75fcb46097c36230d1612"}, + + {"acbef20579b4b8ebce889bac8732dad7", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "522dc1f099567d07f47f37a32a84427d643a8cdcbfe5c0c97598a2bd2555d1aa8cb08e485" + "9" + "0dbb3da7b08b1056828838c5f61e6393ba7a0abcc9f662", + "8bd0c4d8aacd391e67cca447e8c38f65"}, + + {"acbef20579b4b8ebce889bac8732dad7", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "c3762df1ca787d32ae47c13bf19844cbaf1ae14d0b976afac52ff7d79bba9de0feb582d33" + "9" + "34a4f0954cc2363bc73f7862ac430e64abe499f47c9b1f", + "75a34288b8c68f811c52b2e9a2f97f63"}, + + {"acbef20579b4b8ebce889bac8732dad7", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "5a8def2f0c9e53f1f75d7853659e2a20eeb2b22aafde6419a058ab4f6f746bf40fc0c3b78" + "0" + "f244452da3ebf1c5d82cdea2418997200ef82e44ae7e3f", + "d5ffcf6fc5ac4d69722187421a7f170b"}, + + /* Extra, non-nist, test case to test 64-bit binary multiplication carry + * 
correctness. */ + {"0000000000000000fcefef64ffc4766c", "", "0000000000000000ffcef9ebbffdbd8b", + "3561e34e52d8b598f9937982512fff27"}}; + +class GHashTest : public ::testing::TestWithParam { + protected: + void TestGHash(const ghash_kat_value val, bool sw) { + // Read test data. + std::vector hash_key = hex_string_to_bytes(val.hash_key); + ASSERT_EQ(16UL, hash_key.size()); + std::vector additional_data = + hex_string_to_bytes(val.additional_data); + std::vector cipher_text = hex_string_to_bytes(val.cipher_text); + std::vector expected = hex_string_to_bytes(val.result); + ASSERT_EQ(16UL, expected.size()); + + // Prepare context. + gcmHashContext ghashCtx; + ASSERT_EQ(SECSuccess, gcmHash_InitContext(&ghashCtx, hash_key.data(), sw)); + + // Hash additional_data, cipher_text. + gcmHash_Reset(&ghashCtx, + const_cast(additional_data.data()), + additional_data.size(), 16); + gcmHash_Update(&ghashCtx, + const_cast(cipher_text.data()), + cipher_text.size(), 16); + + // Finalise (hash in the length). 
+ uint8_t result_bytes[16]; + unsigned int out_len; + ASSERT_EQ(SECSuccess, + gcmHash_Final(&ghashCtx, result_bytes, &out_len, 16, 16)); + ASSERT_EQ(16U, out_len); + EXPECT_EQ(expected, std::vector(result_bytes, result_bytes + 16)); + } +}; + +#ifdef NSS_X86_OR_X64 +TEST_P(GHashTest, KAT_X86_HW) { TestGHash(GetParam(), false); } +#endif +TEST_P(GHashTest, KAT_Sftw) { TestGHash(GetParam(), true); } + +INSTANTIATE_TEST_CASE_P(NISTTestVector, GHashTest, + ::testing::ValuesIn(kKatValues)); + +} // nss_test diff --git a/gtests/freebl_gtest/prng_kat_unittest.cc b/gtests/freebl_gtest/prng_kat_unittest.cc index 7bc9beea35..257da8c1b5 100644 --- a/gtests/freebl_gtest/prng_kat_unittest.cc +++ b/gtests/freebl_gtest/prng_kat_unittest.cc @@ -8,12 +8,12 @@ #include #include -#include #define GTEST_HAS_RTTI 0 #include "gtest/gtest.h" #include "blapi.h" +#include "freebl_util.h" namespace nss_test { @@ -44,15 +44,6 @@ std::string trim(std::string str) { return str.substr(strBegin, strRange); } -std::vector hex_string_to_bytes(std::string s) { - std::vector bytes; - assert(s.length() % 2 == 0); - for (size_t i = 0; i < s.length(); i += 2) { - bytes.push_back(std::stoul(s.substr(i, 2), nullptr, 16)); - } - return bytes; -} - std::vector read_option_s(std::string& s) { size_t start = s.find("=") + 1; assert(start > 0); diff --git a/lib/freebl/Makefile b/lib/freebl/Makefile index aa02f4be3f..914a0119c2 100644 --- a/lib/freebl/Makefile +++ b/lib/freebl/Makefile @@ -110,6 +110,7 @@ endif # NSS_X86_OR_X64 means the target is either x86 or x64 ifeq (,$(filter-out i386 x386 x86 x86_64,$(CPU_ARCH))) DEFINES += -DNSS_X86_OR_X64 + CFLAGS += -mpclmul -maes ifneq (,$(USE_64)$(USE_X32)) DEFINES += -DNSS_X64 else diff --git a/lib/freebl/freebl.gyp b/lib/freebl/freebl.gyp index f75474252b..6f0087e72d 100644 --- a/lib/freebl/freebl.gyp +++ b/lib/freebl/freebl.gyp @@ -153,6 +153,16 @@ 'MP_API_COMPATIBLE' ], 'conditions': [ + [ 'OS=="mac"', { + 'xcode_settings': { + # I'm not sure since when this is 
supported. + # But I hope that doesn't matter. We also assume this is x86/x64. + 'OTHER_CFLAGS': [ + '-mpclmul', + '-maes', + ], + }, + }], [ 'OS=="win" and target_arch=="ia32"', { 'msvs_settings': { 'VCCLCompilerTool': { @@ -243,6 +253,14 @@ 'MP_USE_UINT_DIGIT', ], }], + [ 'target_arch=="ia32" or target_arch=="x64"', { + 'cflags': [ + # enable isa option for pclmul and aes-ni; supported since gcc 4.4 + # This is only supported by x86/x64. It's not needed for Windows. + '-mpclmul', + '-maes', + ], + }], [ 'target_arch=="arm"', { 'defines': [ 'MP_ASSEMBLY_MULTIPLY', diff --git a/lib/freebl/gcm.c b/lib/freebl/gcm.c index 22121001b6..1a176f0672 100644 --- a/lib/freebl/gcm.c +++ b/lib/freebl/gcm.c @@ -1,6 +1,8 @@ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +/* Thanks to Thomas Pornin for the ideas on how to implement the constant-time + * binary multiplication. 
*/ #ifdef FREEBL_NO_DEPEND #include "stubs.h" @@ -15,438 +17,376 @@ #include -/************************************************************************** - * First implement the Galois hash function of GCM (gcmHash) * - **************************************************************************/ -#define GCM_HASH_LEN_LEN 8 /* gcm hash defines lengths to be 64 bits */ - -typedef struct gcmHashContextStr gcmHashContext; - -static SECStatus gcmHash_InitContext(gcmHashContext *hash, - const unsigned char *H, - unsigned int blocksize); -static void gcmHash_DestroyContext(gcmHashContext *ghash, PRBool freeit); -static SECStatus gcmHash_Update(gcmHashContext *ghash, - const unsigned char *buf, unsigned int len, - unsigned int blocksize); -static SECStatus gcmHash_Sync(gcmHashContext *ghash, unsigned int blocksize); -static SECStatus gcmHash_Final(gcmHashContext *gcm, unsigned char *outbuf, - unsigned int *outlen, unsigned int maxout, - unsigned int blocksize); -static SECStatus gcmHash_Reset(gcmHashContext *ghash, - const unsigned char *inbuf, - unsigned int inbufLen, unsigned int blocksize); - -/* compile time defines to select how the GF2 multiply is calculated. - * There are currently 2 algorithms implemented here: MPI and ALGORITHM_1. - * - * MPI uses the GF2m implemented in mpi to support GF2 ECC. - * ALGORITHM_1 is the Algorithm 1 in both NIST SP 800-38D and - * "The Galois/Counter Mode of Operation (GCM)", McGrew & Viega. - */ -#if !defined(GCM_USE_ALGORITHM_1) && !defined(GCM_USE_MPI) -#define GCM_USE_MPI 1 /* MPI is about 5x faster with the \ - * same or less complexity. It's possible to use \ - * tables to speed things up even more */ -#endif - -/* GCM defines the bit string to be LSB first, which is exactly - * opposite everyone else, including hardware. build array - * to reverse everything. 
*/ -static const unsigned char gcm_byte_rev[256] = { - 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, - 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, - 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, - 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, - 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, - 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, - 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, - 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, - 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, - 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, - 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, - 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, - 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, - 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, - 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, - 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, - 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, - 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, - 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, - 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, - 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, - 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, - 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, - 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, - 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, - 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, - 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, - 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, - 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, - 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, - 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, - 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff -}; - -#ifdef GCM_TRACE -#include - -#define GCM_TRACE_X(ghash, label) \ - { \ - unsigned char _X[MAX_BLOCK_SIZE]; \ - int i; \ - gcm_getX(ghash, _X, blocksize); \ - printf(label, (ghash)->m); \ - for (i = 0; i < blocksize; i++) \ - printf("%02x", _X[i]); \ - printf("\n"); \ - } -#define GCM_TRACE_BLOCK(label, buf, blocksize) \ - { \ - 
printf(label); \ - for (i = 0; i < blocksize; i++) \ - printf("%02x", buf[i]); \ - printf("\n"); \ - } -#else -#define GCM_TRACE_X(ghash, label) -#define GCM_TRACE_BLOCK(label, buf, blocksize) -#endif - -#ifdef GCM_USE_MPI - -#ifdef GCM_USE_ALGORITHM_1 -#error "Only define one of GCM_USE_MPI, GCM_USE_ALGORITHM_1" +#ifdef NSS_X86_OR_X64 +#include /* clmul */ #endif -/* use the MPI functions to calculate Xn = (Xn-1^C_i)*H mod poly */ -#include "mpi.h" -#include "secmpi.h" -#include "mplogic.h" -#include "mp_gf2m.h" - -/* state needed to handle GCM Hash function */ -struct gcmHashContextStr { - mp_int H; - mp_int X; - mp_int C_i; - const unsigned int *poly; - unsigned char buffer[MAX_BLOCK_SIZE]; - unsigned int bufLen; - int m; /* XXX what is m? */ - unsigned char counterBuf[2 * GCM_HASH_LEN_LEN]; - PRUint64 cLen; -}; -/* f = x^128 + x^7 + x^2 + x + 1 */ -static const unsigned int poly_128[] = { 128, 7, 2, 1, 0 }; +/* Forward declarations */ +SECStatus gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf, + unsigned int count, unsigned int blocksize); +SECStatus gcm_HashMult_sftw(gcmHashContext *ghash, const unsigned char *buf, + unsigned int count, unsigned int blocksize); +SECStatus gcm_HashMult_sftw32(gcmHashContext *ghash, const unsigned char *buf, + unsigned int count, unsigned int blocksize); -/* sigh, GCM defines the bit strings exactly backwards from everything else */ -static void -gcm_reverse(unsigned char *target, const unsigned char *src, - unsigned int blocksize) +uint64_t +get64(const unsigned char *bytes) { - unsigned int i; - for (i = 0; i < blocksize; i++) { - target[blocksize - i - 1] = gcm_byte_rev[src[i]]; - } + return ((uint64_t)bytes[0]) << 56 | + ((uint64_t)bytes[1]) << 48 | + ((uint64_t)bytes[2]) << 40 | + ((uint64_t)bytes[3]) << 32 | + ((uint64_t)bytes[4]) << 24 | + ((uint64_t)bytes[5]) << 16 | + ((uint64_t)bytes[6]) << 8 | + ((uint64_t)bytes[7]); } /* Initialize a gcmHashContext */ -static SECStatus 
-gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, - unsigned int blocksize) +SECStatus +gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw) { - mp_err err = MP_OKAY; - unsigned char H_rev[MAX_BLOCK_SIZE]; - - MP_DIGITS(&ghash->H) = 0; - MP_DIGITS(&ghash->X) = 0; - MP_DIGITS(&ghash->C_i) = 0; - CHECK_MPI_OK(mp_init(&ghash->H)); - CHECK_MPI_OK(mp_init(&ghash->X)); - CHECK_MPI_OK(mp_init(&ghash->C_i)); - - mp_zero(&ghash->X); - gcm_reverse(H_rev, H, blocksize); - CHECK_MPI_OK(mp_read_unsigned_octets(&ghash->H, H_rev, blocksize)); - - /* set the irreducible polynomial. Each blocksize has its own polynomial. - * for now only blocksize 16 (=128 bits) is defined */ - switch (blocksize) { - case 16: /* 128 bits */ - ghash->poly = poly_128; - break; - default: - PORT_SetError(SEC_ERROR_INVALID_ARGS); - goto cleanup; - } ghash->cLen = 0; ghash->bufLen = 0; - ghash->m = 0; PORT_Memset(ghash->counterBuf, 0, sizeof(ghash->counterBuf)); - return SECSuccess; -cleanup: - gcmHash_DestroyContext(ghash, PR_FALSE); - return SECFailure; -} - -/* Destroy a HashContext (Note we zero the digits so this function - * is idempotent if called with freeit == PR_FALSE */ -static void -gcmHash_DestroyContext(gcmHashContext *ghash, PRBool freeit) -{ - mp_clear(&ghash->H); - mp_clear(&ghash->X); - mp_clear(&ghash->C_i); - PORT_Memset(ghash, 0, sizeof(gcmHashContext)); - if (freeit) { - PORT_Free(ghash); - } -} - -static SECStatus -gcm_getX(gcmHashContext *ghash, unsigned char *T, unsigned int blocksize) -{ - int len; - mp_err err; - unsigned char tmp_buf[MAX_BLOCK_SIZE]; - unsigned char *X; - - len = mp_unsigned_octet_size(&ghash->X); - if (len <= 0) { - PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); - return SECFailure; - } - X = tmp_buf; - PORT_Assert((unsigned int)len <= blocksize); - if ((unsigned int)len > blocksize) { - PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); - return SECFailure; - } - /* zero pad the result */ - if (len != blocksize) { - 
PORT_Memset(X, 0, blocksize - len); - X += blocksize - len; - } - err = mp_to_unsigned_octets(&ghash->X, X, len); - if (err < 0) { + ghash->h_low = get64(H + 8); + ghash->h_high = get64(H); + if (clmul_support() && !sw) { +#ifdef NSS_X86_OR_X64 + ghash->ghash_mul = gcm_HashMult_hw; + ghash->x = _mm_setzero_si128(); + /* MSVC requires __m64 to load epi64. */ + ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high, + ghash->h_low >> 32, (uint32_t)ghash->h_low); + ghash->hw = PR_TRUE; +#else PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); return SECFailure; +#endif /* NSS_X86_OR_X64 */ + } else { +/* We fall back to the software implementation if we can't use / don't + * want to use pclmul. */ +#ifdef HAVE_INT128_SUPPORT + ghash->ghash_mul = gcm_HashMult_sftw; +#else + ghash->ghash_mul = gcm_HashMult_sftw32; +#endif + ghash->x_high = ghash->x_low = 0; + ghash->hw = PR_FALSE; } - gcm_reverse(T, tmp_buf, blocksize); return SECSuccess; } -static SECStatus -gcm_HashMult(gcmHashContext *ghash, const unsigned char *buf, - unsigned int count, unsigned int blocksize) -{ - SECStatus rv = SECFailure; - mp_err err = MP_OKAY; - unsigned char tmp_buf[MAX_BLOCK_SIZE]; - unsigned int i; - - for (i = 0; i < count; i++, buf += blocksize) { - ghash->m++; - gcm_reverse(tmp_buf, buf, blocksize); - CHECK_MPI_OK(mp_read_unsigned_octets(&ghash->C_i, tmp_buf, blocksize)); - CHECK_MPI_OK(mp_badd(&ghash->X, &ghash->C_i, &ghash->C_i)); - /* - * Looking to speed up GCM, this the the place to do it. - * There are two areas that can be exploited to speed up this code. - * - * 1) H is a constant in this multiply. We can precompute H * (0 - 255) - * at init time and this becomes an blockize xors of our table lookup. - * - * 2) poly is a constant for each blocksize. We can calculate the - * modulo reduction by a series of adds and shifts. 
- * - * For now we are after functionality, so we will go ahead and use - * the builtin bmulmod from mpi - */ - CHECK_MPI_OK(mp_bmulmod(&ghash->C_i, &ghash->H, - ghash->poly, &ghash->X)); - GCM_TRACE_X(ghash, "X%d = ") - } - rv = SECSuccess; -cleanup: - PORT_Memset(tmp_buf, 0, sizeof(tmp_buf)); - if (rv != SECSuccess) { - MP_TO_SEC_ERROR(err); - } - return rv; -} - -static void -gcm_zeroX(gcmHashContext *ghash) +#ifdef HAVE_INT128_SUPPORT +/* Binary multiplication x * y = r_high << 64 | r_low. */ +void +bmul(uint64_t x, uint64_t y, uint64_t *r_high, uint64_t *r_low) { - mp_zero(&ghash->X); - ghash->m = 0; + uint128_t x1, x2, x3, x4, x5; + uint128_t y1, y2, y3, y4, y5; + uint128_t r, z; + + uint128_t m1 = (uint128_t)0x2108421084210842 << 64 | 0x1084210842108421; + uint128_t m2 = (uint128_t)0x4210842108421084 << 64 | 0x2108421084210842; + uint128_t m3 = (uint128_t)0x8421084210842108 << 64 | 0x4210842108421084; + uint128_t m4 = (uint128_t)0x0842108421084210 << 64 | 0x8421084210842108; + uint128_t m5 = (uint128_t)0x1084210842108421 << 64 | 0x0842108421084210; + + x1 = x & m1; + y1 = y & m1; + x2 = x & m2; + y2 = y & m2; + x3 = x & m3; + y3 = y & m3; + x4 = x & m4; + y4 = y & m4; + x5 = x & m5; + y5 = y & m5; + + z = (x1 * y1) ^ (x2 * y5) ^ (x3 * y4) ^ (x4 * y3) ^ (x5 * y2); + r = z & m1; + z = (x1 * y2) ^ (x2 * y1) ^ (x3 * y5) ^ (x4 * y4) ^ (x5 * y3); + r |= z & m2; + z = (x1 * y3) ^ (x2 * y2) ^ (x3 * y1) ^ (x4 * y5) ^ (x5 * y4); + r |= z & m3; + z = (x1 * y4) ^ (x2 * y3) ^ (x3 * y2) ^ (x4 * y1) ^ (x5 * y5); + r |= z & m4; + z = (x1 * y5) ^ (x2 * y4) ^ (x3 * y3) ^ (x4 * y2) ^ (x5 * y1); + r |= z & m5; + + *r_high = (uint64_t)(r >> 64); + *r_low = (uint64_t)r; } -#endif - -#ifdef GCM_USE_ALGORITHM_1 -/* use algorithm 1 of McGrew & Viega "The Galois/Counter Mode of Operation" */ - -#define GCM_ARRAY_SIZE (MAX_BLOCK_SIZE / sizeof(unsigned long)) - -struct gcmHashContextStr { - unsigned long H[GCM_ARRAY_SIZE]; - unsigned long X[GCM_ARRAY_SIZE]; - unsigned long R; - 
unsigned char buffer[MAX_BLOCK_SIZE]; - unsigned int bufLen; - int m; - unsigned char counterBuf[2 * GCM_HASH_LEN_LEN]; - PRUint64 cLen; -}; - -static void -gcm_bytes_to_longs(unsigned long *l, const unsigned char *c, unsigned int len) +SECStatus +gcm_HashMult_sftw(gcmHashContext *ghash, const unsigned char *buf, + unsigned int count, unsigned int blocksize) { - int i, j; - int array_size = len / sizeof(unsigned long); - - PORT_Assert(len % sizeof(unsigned long) == 0); - for (i = 0; i < array_size; i++) { - unsigned long tmp = 0; - int byte_offset = i * sizeof(unsigned long); - for (j = sizeof(unsigned long) - 1; j >= 0; j--) { - tmp = (tmp << PR_BITS_PER_BYTE) | gcm_byte_rev[c[byte_offset + j]]; - } - l[i] = tmp; - } + uint64_t ci_low, ci_high; + size_t i; + uint64_t z2_low, z2_high, z0_low, z0_high, z1a_low, z1a_high; + uint128_t z_high = 0, z_low = 0; + + ci_low = ghash->x_low; + ci_high = ghash->x_high; + for (i = 0; i < count; i++, buf += 16) { + ci_low ^= get64(buf + 8); + ci_high ^= get64(buf); + + /* Do binary mult ghash->X = C * ghash->H (Karatsuba). */ + bmul(ci_high, ghash->h_high, &z2_high, &z2_low); + bmul(ci_low, ghash->h_low, &z0_high, &z0_low); + bmul(ci_high ^ ci_low, ghash->h_high ^ ghash->h_low, &z1a_high, &z1a_low); + z1a_high ^= z2_high ^ z0_high; + z1a_low ^= z2_low ^ z0_low; + z_high = ((uint128_t)z2_high << 64) | (z2_low ^ z1a_high); + z_low = (((uint128_t)z0_high << 64) | z0_low) ^ (((uint128_t)z1a_low) << 64); + + /* Shift one (multiply by x) as gcm spec is stupid. 
*/ + z_high = (z_high << 1) | (z_low >> 127); + z_low <<= 1; + + /* Reduce */ + z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); + z_high ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); + ci_low = (uint64_t)z_high; + ci_high = (uint64_t)(z_high >> 64); + } + ghash->x_low = ci_low; + ghash->x_high = ci_high; + return SECSuccess; } - -static void -gcm_longs_to_bytes(const unsigned long *l, unsigned char *c, unsigned int len) +#else +/* Binary multiplication x * y = r_high << 32 | r_low. */ +void +bmul32(uint32_t x, uint32_t y, uint32_t *r_high, uint32_t *r_low) { - int i, j; - int array_size = len / sizeof(unsigned long); - - PORT_Assert(len % sizeof(unsigned long) == 0); - for (i = 0; i < array_size; i++) { - unsigned long tmp = l[i]; - int byte_offset = i * sizeof(unsigned long); - for (j = 0; j < sizeof(unsigned long); j++) { - c[byte_offset + j] = gcm_byte_rev[tmp & 0xff]; - tmp = (tmp >> PR_BITS_PER_BYTE); - } - } + uint32_t x0, x1, x2, x3; + uint32_t y0, y1, y2, y3; + uint32_t m1 = (uint32_t)0x11111111; + uint32_t m2 = (uint32_t)0x22222222; + uint32_t m4 = (uint32_t)0x44444444; + uint32_t m8 = (uint32_t)0x88888888; + uint64_t z0, z1, z2, z3; + uint64_t z; + + x0 = x & m1; + x1 = x & m2; + x2 = x & m4; + x3 = x & m8; + y0 = y & m1; + y1 = y & m2; + y2 = y & m4; + y3 = y & m8; + z0 = ((uint64_t)x0 * y0) ^ ((uint64_t)x1 * y3) ^ + ((uint64_t)x2 * y2) ^ ((uint64_t)x3 * y1); + z1 = ((uint64_t)x0 * y1) ^ ((uint64_t)x1 * y0) ^ + ((uint64_t)x2 * y3) ^ ((uint64_t)x3 * y2); + z2 = ((uint64_t)x0 * y2) ^ ((uint64_t)x1 * y1) ^ + ((uint64_t)x2 * y0) ^ ((uint64_t)x3 * y3); + z3 = ((uint64_t)x0 * y3) ^ ((uint64_t)x1 * y2) ^ + ((uint64_t)x2 * y1) ^ ((uint64_t)x3 * y0); + z0 &= ((uint64_t)m1 << 32) | m1; + z1 &= ((uint64_t)m2 << 32) | m2; + z2 &= ((uint64_t)m4 << 32) | m4; + z3 &= ((uint64_t)m8 << 32) | m8; + z = z0 | z1 | z2 | z3; + *r_high = (uint32_t)(z >> 32); + *r_low = (uint32_t)z; } -/* Initialize a gcmHashContext */ -static SECStatus 
-gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, - unsigned int blocksize) +SECStatus +gcm_HashMult_sftw32(gcmHashContext *ghash, const unsigned char *buf, + unsigned int count, unsigned int blocksize) { - PORT_Memset(ghash->X, 0, sizeof(ghash->X)); - PORT_Memset(ghash->H, 0, sizeof(ghash->H)); - gcm_bytes_to_longs(ghash->H, H, blocksize); - - /* set the irreducible polynomial. Each blocksize has its own polynommial - * for now only blocksize 16 (=128 bits) is defined */ - switch (blocksize) { - case 16: /* 128 bits */ - ghash->R = (unsigned long)0x87; /* x^7 + x^2 + x +1 */ - break; - default: - PORT_SetError(SEC_ERROR_INVALID_ARGS); - goto cleanup; + size_t i; + uint64_t ci_low, ci_high; + uint64_t z_high_h, z_high_l, z_low_h, z_low_l; + uint32_t ci_high_h, ci_high_l, ci_low_h, ci_low_l; + uint32_t b_a_h, b_a_l, a_a_h, a_a_l, b_b_h, b_b_l; + uint32_t a_b_h, a_b_l, b_c_h, b_c_l, a_c_h, a_c_l, c_c_h, c_c_l; + uint32_t ci_highXlow_h, ci_highXlow_l, c_a_h, c_a_l, c_b_h, c_b_l; + + uint32_t h_high_h = (uint32_t)(ghash->h_high >> 32); + uint32_t h_high_l = (uint32_t)ghash->h_high; + uint32_t h_low_h = (uint32_t)(ghash->h_low >> 32); + uint32_t h_low_l = (uint32_t)ghash->h_low; + uint32_t h_highXlow_h = h_high_h ^ h_low_h; + uint32_t h_highXlow_l = h_high_l ^ h_low_l; + uint32_t h_highX_xored = h_highXlow_h ^ h_highXlow_l; + + for (i = 0; i < count; i++, buf += 16) { + ci_low = ghash->x_low ^ get64(buf + 8); + ci_high = ghash->x_high ^ get64(buf); + ci_low_h = (uint32_t)(ci_low >> 32); + ci_low_l = (uint32_t)ci_low; + ci_high_h = (uint32_t)(ci_high >> 32); + ci_high_l = (uint32_t)ci_high; + ci_highXlow_h = ci_high_h ^ ci_low_h; + ci_highXlow_l = ci_high_l ^ ci_low_l; + + /* Do binary mult ghash->X = C * ghash->H (recursive Karatsuba). 
*/ + bmul32(ci_high_h, h_high_h, &a_a_h, &a_a_l); + bmul32(ci_high_l, h_high_l, &a_b_h, &a_b_l); + bmul32(ci_high_h ^ ci_high_l, h_high_h ^ h_high_l, &a_c_h, &a_c_l); + a_c_h ^= a_a_h ^ a_b_h; + a_c_l ^= a_a_l ^ a_b_l; + a_a_l ^= a_c_h; + a_b_h ^= a_c_l; + /* ci_high * h_high = a_a_h:a_a_l:a_b_h:a_b_l */ + + bmul32(ci_low_h, h_low_h, &b_a_h, &b_a_l); + bmul32(ci_low_l, h_low_l, &b_b_h, &b_b_l); + bmul32(ci_low_h ^ ci_low_l, h_low_h ^ h_low_l, &b_c_h, &b_c_l); + b_c_h ^= b_a_h ^ b_b_h; + b_c_l ^= b_a_l ^ b_b_l; + b_a_l ^= b_c_h; + b_b_h ^= b_c_l; + /* ci_low * h_low = b_a_h:b_a_l:b_b_h:b_b_l */ + + bmul32(ci_highXlow_h, h_highXlow_h, &c_a_h, &c_a_l); + bmul32(ci_highXlow_l, h_highXlow_l, &c_b_h, &c_b_l); + bmul32(ci_highXlow_h ^ ci_highXlow_l, h_highX_xored, &c_c_h, &c_c_l); + c_c_h ^= c_a_h ^ c_b_h; + c_c_l ^= c_a_l ^ c_b_l; + c_a_l ^= c_c_h; + c_b_h ^= c_c_l; + /* (ci_high ^ ci_low) * (h_high ^ h_low) = c_a_h:c_a_l:c_b_h:c_b_l */ + + c_a_h ^= b_a_h ^ a_a_h; + c_a_l ^= b_a_l ^ a_a_l; + c_b_h ^= b_b_h ^ a_b_h; + c_b_l ^= b_b_l ^ a_b_l; + z_high_h = ((uint64_t)a_a_h << 32) | a_a_l; + z_high_l = (((uint64_t)a_b_h << 32) | a_b_l) ^ + (((uint64_t)c_a_h << 32) | c_a_l); + z_low_h = (((uint64_t)b_a_h << 32) | b_a_l) ^ + (((uint64_t)c_b_h << 32) | c_b_l); + z_low_l = ((uint64_t)b_b_h << 32) | b_b_l; + + /* Shift one (multiply by x) as gcm spec is stupid. 
/*
 * GHASH block multiplication using the x86 carry-less multiply intrinsic
 * (_mm_clmulepi64_si128).
 *
 * For each of the |count| 16-byte blocks starting at |buf|, the block is
 * XORed into the running hash ghash->x, the result is multiplied by
 * ghash->h, and the reduced 128-bit product is written back to ghash->x.
 *
 * |blocksize| is not used here: the loop always advances |buf| by 16 bytes.
 *
 * Returns SECSuccess once all blocks are processed. When the file is built
 * without NSS_X86_OR_X64, sets SEC_ERROR_LIBRARY_FAILURE and returns
 * SECFailure.
 */
SECStatus
gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
                unsigned int count, unsigned int blocksize)
{
#ifdef NSS_X86_OR_X64
    size_t i;
    /* z_high:z_low hold the 256-bit product; C..F and tmp are scratch
     * registers that are reused (and overwritten) throughout the loop. */
    pre_align __m128i z_high post_align;
    pre_align __m128i z_low post_align;
    pre_align __m128i C post_align;
    pre_align __m128i D post_align;
    pre_align __m128i E post_align;
    pre_align __m128i F post_align;
    pre_align __m128i bin post_align;
    pre_align __m128i Ci post_align;
    pre_align __m128i tmp post_align;

    for (i = 0; i < count; i++, buf += 16) {
        /* Load the block as one big-endian 128-bit value: buf[0] becomes
         * the most significant byte (_mm_set_epi16 takes the highest
         * 16-bit lane first). */
        bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1],
                            ((uint16_t)buf[2] << 8) | buf[3],
                            ((uint16_t)buf[4] << 8) | buf[5],
                            ((uint16_t)buf[6] << 8) | buf[7],
                            ((uint16_t)buf[8] << 8) | buf[9],
                            ((uint16_t)buf[10] << 8) | buf[11],
                            ((uint16_t)buf[12] << 8) | buf[13],
                            ((uint16_t)buf[14] << 8) | buf[15]);
        Ci = _mm_xor_si128(bin, ghash->x);

        /* Do binary mult ghash->X = Ci * ghash->H. */
        /* C = low(Ci) * low(h), D = high(Ci) * high(h);
         * E and F are the two cross products. */
        C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00);
        D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11);
        E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01);
        F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10);
        /* tmp is the middle term; its upper half is folded into z_high and
         * its lower half into the upper half of z_low. */
        tmp = _mm_xor_si128(E, F);
        z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8));
        z_high = _mm_unpackhi_epi64(z_high, D);
        z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C);
        z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low);

        /* Shift the 256-bit product z_high:z_low one bit to the left
         * (multiply by x). */
        /* E / F: top bit of the low 64-bit lane of z_low / z_high, placed
         * at bit 0 of the corresponding high lane. */
        C = _mm_slli_si128(z_low, 8);
        E = _mm_srli_epi64(C, 63);
        D = _mm_slli_si128(z_high, 8);
        F = _mm_srli_epi64(D, 63);
        /* Carry over */
        /* D is reused: top bit of z_low, placed at bit 0 of z_high. F was
         * derived from the previous D, so this must come after it. */
        C = _mm_srli_si128(z_low, 8);
        D = _mm_srli_epi64(C, 63);
        z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E);
        z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D);

        /* Reduce */
        /* C = low 64 bits of z_low moved into the high lane, so per-lane
         * shifts of C produce the 128-bit left shifts below. */
        C = _mm_slli_si128(z_low, 8);
        /* D = z_low << 127 */
        D = _mm_slli_epi64(C, 63);
        /* E = z_low << 126 */
        E = _mm_slli_epi64(C, 62);
        /* F = z_low << 121 */
        F = _mm_slli_epi64(C, 57);
        /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */
        z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F);
        /* C = high 64 bits of the updated z_low moved into the low lane;
         * it supplies the bits that cross the lane boundary in the 128-bit
         * right shifts below. */
        C = _mm_srli_si128(z_low, 8);
        /* D = z_low >> 1 */
        D = _mm_slli_epi64(C, 63);
        D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D);
        /* E = z_low >> 2 */
        E = _mm_slli_epi64(C, 62);
        E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E);
        /* F = z_low >> 7 */
        F = _mm_slli_epi64(C, 57);
        F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F);
        /* ghash->x = z_high ^ z_low ^ (z_low >> 1) ^ (z_low >> 2) ^
         *            (z_low >> 7); */
        ghash->x = _mm_xor_si128(_mm_xor_si128(
                                     _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E),
                                 F);
    }
    return SECSuccess;
#else
    PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
    return SECFailure;
#endif /* NSS_X86_OR_X64 */
}

/*
 * Reset the running hash value of |ghash| to zero.
 *
 * When ghash->hw is set, the __m128i accumulator ghash->x is cleared; this
 * is only possible in NSS_X86_OR_X64 builds, otherwise
 * SEC_ERROR_LIBRARY_FAILURE is set and SECFailure is returned. When
 * ghash->hw is not set, the two 64-bit halves x_high and x_low are cleared.
 *
 * Returns SECSuccess when the accumulator was cleared.
 */
static SECStatus
gcm_zeroX(gcmHashContext *ghash)
{
    if (ghash->hw) {
#ifdef NSS_X86_OR_X64
        ghash->x = _mm_setzero_si128();
        return SECSuccess;
#else
        PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
        return SECFailure;
#endif /* NSS_X86_OR_X64 */
    }

    ghash->x_high = ghash->x_low = 0;
    return SECSuccess;
}
/*
 * Store the 64-bit value |x| into the eight bytes starting at |bytes|,
 * most significant byte first (big-endian).
 *
 * The body is wrapped in do { } while (0) so that "WRITE64(a, b);" expands
 * to exactly one statement and stays correct inside an unbraced if/else.
 * Both arguments are copied into locals first, so each is evaluated exactly
 * once even when the caller passes an expression with side effects, and the
 * narrowing to a byte is made explicit.
 */
#define WRITE64(x, bytes)                                      \
    do {                                                       \
        const uint64_t write64_val_ = (x);                     \
        unsigned char *const write64_dst_ = (bytes);           \
        write64_dst_[0] = (unsigned char)(write64_val_ >> 56); \
        write64_dst_[1] = (unsigned char)(write64_val_ >> 48); \
        write64_dst_[2] = (unsigned char)(write64_val_ >> 40); \
        write64_dst_[3] = (unsigned char)(write64_val_ >> 32); \
        write64_dst_[4] = (unsigned char)(write64_val_ >> 24); \
        write64_dst_[5] = (unsigned char)(write64_val_ >> 16); \
        write64_dst_[6] = (unsigned char)(write64_val_ >> 8);  \
        write64_dst_[7] = (unsigned char)(write64_val_);       \
    } while (0)
*/ -static SECStatus +SECStatus gcmHash_Final(gcmHashContext *ghash, unsigned char *outbuf, unsigned int *outlen, unsigned int maxout, unsigned int blocksize) @@ -547,21 +497,31 @@ gcmHash_Final(gcmHashContext *ghash, unsigned char *outbuf, goto cleanup; } - rv = gcm_HashMult(ghash, ghash->counterBuf, (GCM_HASH_LEN_LEN * 2) / blocksize, - blocksize); + rv = ghash->ghash_mul(ghash, ghash->counterBuf, + (GCM_HASH_LEN_LEN * 2) / blocksize, + blocksize); if (rv != SECSuccess) { goto cleanup; } - GCM_TRACE_X(ghash, "GHASH(H,A,C) = ") - - rv = gcm_getX(ghash, T, blocksize); - if (rv != SECSuccess) { - goto cleanup; + if (ghash->hw) { +#ifdef NSS_X86_OR_X64 + uint64_t tmp_out[2]; + _mm_storeu_si128((__m128i *)tmp_out, ghash->x); + WRITE64(tmp_out[0], T + 8); + WRITE64(tmp_out[1], T); +#else + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + return SECFailure; +#endif /* NSS_X86_OR_X64 */ + } else { + WRITE64(ghash->x_low, T + 8); + WRITE64(ghash->x_high, T); } - if (maxout > blocksize) + if (maxout > blocksize) { maxout = blocksize; + } PORT_Memcpy(outbuf, T, maxout); *outlen = maxout; rv = SECSuccess; @@ -580,7 +540,10 @@ gcmHash_Reset(gcmHashContext *ghash, const unsigned char *AAD, ghash->cLen = 0; PORT_Memset(ghash->counterBuf, 0, GCM_HASH_LEN_LEN * 2); ghash->bufLen = 0; - gcm_zeroX(ghash); + rv = gcm_zeroX(ghash); + if (rv != SECSuccess) { + return rv; + } /* now kick things off by hashing the Additional Authenticated Data */ if (AADLen != 0) { @@ -602,7 +565,7 @@ gcmHash_Reset(gcmHashContext *ghash, const unsigned char *AAD, /* state to handle the full GCM operation (hash and counter) */ struct GCMContextStr { - gcmHashContext ghash_context; + gcmHashContext *ghash_context; CTRContext ctr_context; unsigned long tagBits; unsigned char tagKey[MAX_BLOCK_SIZE]; @@ -613,7 +576,7 @@ GCM_CreateContext(void *context, freeblCipherFunc cipher, const unsigned char *params, unsigned int blocksize) { GCMContext *gcm = NULL; - gcmHashContext *ghash; + gcmHashContext *ghash = NULL; 
unsigned char H[MAX_BLOCK_SIZE]; unsigned int tmp; PRBool freeCtr = PR_FALSE; @@ -621,6 +584,11 @@ GCM_CreateContext(void *context, freeblCipherFunc cipher, const CK_GCM_PARAMS *gcmParams = (const CK_GCM_PARAMS *)params; CK_AES_CTR_PARAMS ctrParams; SECStatus rv; +#ifdef DISABLE_HW_GCM + const PRBool sw = PR_TRUE; +#else + const PRBool sw = PR_FALSE; +#endif if (blocksize > MAX_BLOCK_SIZE || blocksize > sizeof(ctrParams.cb)) { PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); @@ -628,16 +596,26 @@ GCM_CreateContext(void *context, freeblCipherFunc cipher, } gcm = PORT_ZNew(GCMContext); if (gcm == NULL) { + PORT_SetError(SEC_ERROR_NO_MEMORY); return NULL; } - /* first fill in the ghash context */ - ghash = &gcm->ghash_context; + /* aligned_alloc is C11 so we have to do it the old way. */ + ghash = PORT_ZAlloc(sizeof(gcmHashContext) + 15); + if (ghash == NULL) { + PORT_SetError(SEC_ERROR_NO_MEMORY); + return NULL; + } + ghash->mem = ghash; + ghash = (gcmHashContext *)(((uintptr_t)ghash + 15) & ~(uintptr_t)0x0F); + + /* first plug in the ghash context */ + gcm->ghash_context = ghash; PORT_Memset(H, 0, blocksize); rv = (*cipher)(context, H, &tmp, blocksize, H, blocksize, blocksize); if (rv != SECSuccess) { goto loser; } - rv = gcmHash_InitContext(ghash, H, blocksize); + rv = gcmHash_InitContext(ghash, H, sw); if (rv != SECSuccess) { goto loser; } @@ -690,7 +668,7 @@ GCM_CreateContext(void *context, freeblCipherFunc cipher, CTR_DestroyContext(&gcm->ctr_context, PR_FALSE); } if (freeHash) { - gcmHash_DestroyContext(&gcm->ghash_context, PR_FALSE); + PORT_Free(gcm->ghash_context->mem); } if (gcm) { PORT_Free(gcm); @@ -705,7 +683,7 @@ GCM_DestroyContext(GCMContext *gcm, PRBool freeit) * gcm. 
call their destroy functions to free up any locally * allocated data (like mp_int's) */ CTR_DestroyContext(&gcm->ctr_context, PR_FALSE); - gcmHash_DestroyContext(&gcm->ghash_context, PR_FALSE); + PORT_Free(gcm->ghash_context->mem); PORT_Memset(&gcm->tagBits, 0, sizeof(gcm->tagBits)); PORT_Memset(gcm->tagKey, 0, sizeof(gcm->tagKey)); if (freeit) { @@ -738,18 +716,14 @@ gcm_GetTag(GCMContext *gcm, unsigned char *outbuf, return SECFailure; } maxout = tagBytes; - rv = gcmHash_Final(&gcm->ghash_context, outbuf, outlen, maxout, blocksize); + rv = gcmHash_Final(gcm->ghash_context, outbuf, outlen, maxout, blocksize); if (rv != SECSuccess) { return SECFailure; } - GCM_TRACE_BLOCK("GHASH=", outbuf, blocksize); - GCM_TRACE_BLOCK("Y0=", gcm->tagKey, blocksize); for (i = 0; i < *outlen; i++) { outbuf[i] ^= gcm->tagKey[i]; } - GCM_TRACE_BLOCK("Y0=", gcm->tagKey, blocksize); - GCM_TRACE_BLOCK("T=", outbuf, blocksize); /* mask off any extra bits we got */ if (extra) { outbuf[tagBytes - 1] &= ~((1 << extra) - 1); @@ -788,7 +762,7 @@ GCM_EncryptUpdate(GCMContext *gcm, unsigned char *outbuf, if (rv != SECSuccess) { return SECFailure; } - rv = gcmHash_Update(&gcm->ghash_context, outbuf, *outlen, blocksize); + rv = gcmHash_Update(gcm->ghash_context, outbuf, *outlen, blocksize); if (rv != SECSuccess) { PORT_Memset(outbuf, 0, *outlen); /* clear the output buffer */ *outlen = 0; @@ -836,7 +810,7 @@ GCM_DecryptUpdate(GCMContext *gcm, unsigned char *outbuf, intag = inbuf + inlen; /* verify the block */ - rv = gcmHash_Update(&gcm->ghash_context, inbuf, inlen, blocksize); + rv = gcmHash_Update(gcm->ghash_context, inbuf, inlen, blocksize); if (rv != SECSuccess) { return SECFailure; } diff --git a/lib/freebl/gcm.h b/lib/freebl/gcm.h index 1cdba534d0..0185d412b3 100644 --- a/lib/freebl/gcm.h +++ b/lib/freebl/gcm.h @@ -6,6 +6,17 @@ #define GCM_H 1 #include "blapii.h" +#include + +#ifdef NSS_X86_OR_X64 +#include /* __m128i */ +#endif + +SEC_BEGIN_PROTOS + +#ifdef HAVE_INT128_SUPPORT +typedef 
/* These functions are here only so we can test them */

/*
 * pre_align / post_align request 16-byte alignment for a declaration on
 * x86/x64 builds: __declspec(align(16)) before the declaration on Windows,
 * __attribute__((aligned(16))) after it elsewhere. On other platforms both
 * expand to nothing.
 */
#if defined(_WINDOWS) && defined(NSS_X86_OR_X64)
#define pre_align __declspec(align(16))
#define post_align
#elif defined(NSS_X86_OR_X64)
#define pre_align
#define post_align __attribute__((aligned(16)))
#else
#define pre_align
#define post_align
#endif

#define GCM_HASH_LEN_LEN 8 /* gcm hash defines lengths to be 64 bits */
typedef struct gcmHashContextStr gcmHashContext;
/* Signature of a GHASH block-multiply routine:
 * (context, input blocks, number of blocks, block size). */
typedef SECStatus (*ghash_t)(gcmHashContext *, const unsigned char *,
                             unsigned int, unsigned int);
/*
 * GHASH state. The structure is declared with pre_align/post_align so the
 * __m128i members can be 16-byte aligned on x86/x64 builds.
 */
pre_align struct gcmHashContextStr {
#ifdef NSS_X86_OR_X64
    /* Running hash (x) and multiplier (h) used by gcm_HashMult_hw. */
    __m128i x, h;
#endif
    /* Running hash as two 64-bit halves, used when hw is not set.
     * NOTE(review): h_high/h_low are presumably the hash key for the
     * software multiply -- confirm against gcmHash_InitContext. */
    uint64_t x_low, x_high, h_high, h_low;
    /* Holds a partial block between calls; bufLen is the number of bytes
     * currently buffered. */
    unsigned char buffer[MAX_BLOCK_SIZE];
    unsigned int bufLen;
    /* Hashed as the final input by gcmHash_Final; cleared by
     * gcmHash_Reset. */
    uint8_t counterBuf[16];
    /* Reset to 0 by gcmHash_Reset.
     * NOTE(review): presumably the running input length -- confirm. */
    uint64_t cLen;
    /* Multiply routine called by gcmHash_Update/Sync/Final. */
    ghash_t ghash_mul;
    /* Selects the __m128i state (x, h) over the 64-bit halves. */
    PRBool hw;
    /* Pointer handed to PORT_Free when the owning GCM context is
     * destroyed; GCM_CreateContext allocates the context with padding and
     * then rounds the pointer up to a 16-byte boundary. */
    gcmHashContext *mem;
} post_align;

/* Feed |len| bytes at |buf| into the hash; partial blocks are buffered. */
SECStatus gcmHash_Update(gcmHashContext *ghash, const unsigned char *buf,
                         unsigned int len, unsigned int blocksize);
/* Initialise |ghash| with hash key |H|. GCM_CreateContext passes PR_TRUE
 * for |sw| when DISABLE_HW_GCM is defined.
 * NOTE(review): |sw| presumably forces the software multiply -- confirm. */
SECStatus gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H,
                              PRBool sw);
/* Clear the hash state, then hash the additional authenticated data. */
SECStatus gcmHash_Reset(gcmHashContext *ghash, const unsigned char *AAD,
                        unsigned int AADLen, unsigned int blocksize);
/* Do the final sync, hash the lengths, and copy at most |maxout| (capped
 * at |blocksize|) bytes of the result to |outbuf|; *outlen receives the
 * number of bytes written. */
SECStatus gcmHash_Final(gcmHashContext *ghash, unsigned char *outbuf,
                        unsigned int *outlen, unsigned int maxout,
                        unsigned int blocksize);

SEC_END_PROTOS

#endif