nostrdb: ccan: sync with normal versions.
This is the version of CCAN which CLN was using at the time these were taken. Unfortunately lots of whitespace has been changed, but AFAICT no source changes. Here's the command I ran (with ../ccan checked out to 1ae4c432):

```
make update-ccan CCAN_NEW="alignof array_size build_assert check_type container_of cppmagic likely list mem short_types str structeq take tal tal/str typesafe_cb utf8 endian crypto/sha256"
```

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: William Casarin <jb55@jb55.com>
Commit a8d7d971b1 (parent 201cdd7edc), committed by Daniel D’Aquino
nostrdb/ccan/ccan/crypto/sha256/LICENSE (new symbolic link, 1 line)
@@ -0,0 +1 @@
../../../licenses/BSD-MIT
nostrdb/ccan/ccan/crypto/sha256/_info (new file, 61 lines)
@@ -0,0 +1,61 @@
#include "config.h"
#include <stdio.h>
#include <string.h>

/**
 * crypto/sha256 - implementation of SHA-2 with 256 bit digest.
 *
 * This code is either a wrapper for openssl (if CCAN_CRYPTO_SHA256_USE_OPENSSL
 * is defined) or an open-coded implementation based on Bitcoin's.
 *
 * License: BSD-MIT
 * Maintainer: Rusty Russell <rusty@rustcorp.com.au>
 *
 * Example:
 *	#include <ccan/crypto/sha256/sha256.h>
 *	#include <err.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	// Simple demonstration: identical strings will have the same hash, but
 *	// two different strings will not.
 *	int main(int argc, char *argv[])
 *	{
 *		struct sha256 hash1, hash2;
 *
 *		if (argc != 3)
 *			errx(1, "Usage: %s <string1> <string2>", argv[0]);
 *
 *		sha256(&hash1, argv[1], strlen(argv[1]));
 *		sha256(&hash2, argv[2], strlen(argv[2]));
 *		printf("Hash is %s\n", memcmp(&hash1, &hash2, sizeof(hash1))
 *			? "different" : "same");
 *		return 0;
 *	}
 */
int main(int argc, char *argv[])
{
	/* Expect exactly one argument */
	if (argc != 2)
		return 1;

	if (strcmp(argv[1], "depends") == 0) {
		printf("ccan/compiler\n");
		printf("ccan/endian\n");
		return 0;
	}

	if (strcmp(argv[1], "testdepends") == 0) {
		printf("ccan/str/hex\n");
		return 0;
	}

	if (strcmp(argv[1], "libs") == 0) {
#ifdef CCAN_CRYPTO_SHA256_USE_OPENSSL
		printf("crypto\n");
#endif
		return 0;
	}

	return 1;
}
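The `_info` documentation above shows the one-shot `sha256()` call. The vendored module also carries an incremental interface for data that arrives in pieces; here is a minimal sketch, assuming the header in this copy declares the usual CCAN entry points `sha256_init()`, `sha256_update()` and `sha256_done()` (they are not shown in this diff):

```
#include <ccan/crypto/sha256/sha256.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char msg[] = "hello world";
	struct sha256 one_shot, streamed;
	struct sha256_ctx ctx;

	/* One-shot hash of the whole buffer. */
	sha256(&one_shot, msg, strlen(msg));

	/* Same message fed in two chunks through the incremental API. */
	sha256_init(&ctx);
	sha256_update(&ctx, msg, 5);
	sha256_update(&ctx, msg + 5, strlen(msg) - 5);
	sha256_done(&ctx, &streamed);

	/* Both paths should produce the same digest. */
	assert(memcmp(&one_shot, &streamed, sizeof(one_shot)) == 0);
	printf("digests match\n");
	return 0;
}
```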
nostrdb/ccan/ccan/crypto/sha256/benchmarks/Makefile (new file, 20 lines)
@@ -0,0 +1,20 @@
CCANDIR := ../../../../
CFLAGS := -Wall -I$(CCANDIR) -O3 -flto -DCCAN_USE_ORIGINAL=1
LDFLAGS := -O3 -flto

INTEL_OBJS := sha256_avx1.o sha256_avx2_rorx2.o sha256_avx2_rorx8.o sha256_sse4.o

double-sha-bench: double-sha-bench.o ccan-time.o $(INTEL_OBJS) #ccan-crypto-sha256.o

$(INTEL_OBJS): %.o : %.asm

%.o : %.asm
	yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o $@ $<

clean:
	rm -f *.o

ccan-crypto-sha256.o: $(CCANDIR)/ccan/crypto/sha256/sha256.c
	$(CC) $(CFLAGS) -c -o $@ $<
ccan-time.o: $(CCANDIR)/ccan/time/time.c
	$(CC) $(CFLAGS) -c -o $@ $<
nostrdb/ccan/ccan/crypto/sha256/benchmarks/double-sha-bench.c (new file, 122 lines)
@@ -0,0 +1,122 @@
/* Bitcoin does a lot of SHA of SHA. Benchmark that. */
#include <ccan/crypto/sha256/sha256.c>
#include <ccan/time/time.h>
#include <stdio.h>

void sha256_avx(void *input_data, uint32_t digest[8], uint64_t num_blks);
void sha256_rorx(void *input_data, uint32_t digest[8], uint64_t num_blks);
void sha256_rorx_x8ms(void *input_data, uint32_t digest[8], uint64_t num_blks);
void sha256_sse4(void *input_data, uint32_t digest[8], uint64_t num_blks);

int main(int argc, char *argv[])
{
	struct timeabs start;
	struct timerel diff;
	size_t i, n;
	union {
		struct sha256 h;
		uint32_t u32[16];
		uint8_t u8[64];
	} block;

	n = atoi(argv[1] ? argv[1] : "1000000");
	memset(&block, 0, sizeof(block));
	sha256(&block.h, &n, sizeof(n));

	start = time_now();
	for (i = 0; i < n; i++) {
		sha256(&block.h, &block.h, sizeof(block.h));
	}
	diff = time_divide(time_between(time_now(), start), n);
	printf("Normal gave %02x%02x%02x%02x%02x%02x... in %llu nsec\n",
	       block.h.u.u8[0], block.h.u.u8[1], block.h.u.u8[2],
	       block.h.u.u8[3], block.h.u.u8[4], block.h.u.u8[5],
	       (unsigned long long)time_to_nsec(diff));

	/* Now, don't re-initialize every time; use Transform */
	memset(&block, 0, sizeof(block));
	sha256(&block.h, &n, sizeof(n));
	block.u8[sizeof(block.h)] = 0x80;
	/* Size is 256 bits */
	block.u8[sizeof(block)-2] = 1;

	start = time_now();
	for (i = 0; i < n; i++) {
		struct sha256_ctx ctx = SHA256_INIT;
		size_t j;
		Transform(ctx.s, block.u32);
		for (j = 0; j < sizeof(ctx.s) / sizeof(ctx.s[0]); j++)
			block.h.u.u32[j] = cpu_to_be32(ctx.s[j]);
	}
	diff = time_divide(time_between(time_now(), start), n);
	printf("Transform gave %02x%02x%02x%02x%02x%02x... in %llu nsec\n",
	       block.h.u.u8[0], block.h.u.u8[1], block.h.u.u8[2],
	       block.h.u.u8[3], block.h.u.u8[4], block.h.u.u8[5],
	       (unsigned long long)time_to_nsec(diff));

	/* Now, assembler variants */
	sha256(&block.h, &n, sizeof(n));

	start = time_now();
	for (i = 0; i < n; i++) {
		struct sha256_ctx ctx = SHA256_INIT;
		size_t j;
		sha256_rorx(block.u32, ctx.s, 1);
		for (j = 0; j < sizeof(ctx.s) / sizeof(ctx.s[0]); j++)
			block.h.u.u32[j] = cpu_to_be32(ctx.s[j]);
	}
	diff = time_divide(time_between(time_now(), start), n);
	printf("Asm rorx for %02x%02x%02x%02x%02x%02x... is %llu nsec\n",
	       block.h.u.u8[0], block.h.u.u8[1], block.h.u.u8[2],
	       block.h.u.u8[3], block.h.u.u8[4], block.h.u.u8[5],
	       (unsigned long long)time_to_nsec(diff));

	sha256(&block.h, &n, sizeof(n));

	start = time_now();
	for (i = 0; i < n; i++) {
		struct sha256_ctx ctx = SHA256_INIT;
		size_t j;
		sha256_sse4(block.u32, ctx.s, 1);
		for (j = 0; j < sizeof(ctx.s) / sizeof(ctx.s[0]); j++)
			block.h.u.u32[j] = cpu_to_be32(ctx.s[j]);
	}
	diff = time_divide(time_between(time_now(), start), n);
	printf("Asm SSE4 for %02x%02x%02x%02x%02x%02x... is %llu nsec\n",
	       block.h.u.u8[0], block.h.u.u8[1], block.h.u.u8[2],
	       block.h.u.u8[3], block.h.u.u8[4], block.h.u.u8[5],
	       (unsigned long long)time_to_nsec(diff));

	sha256(&block.h, &n, sizeof(n));
	start = time_now();
	for (i = 0; i < n; i++) {
		struct sha256_ctx ctx = SHA256_INIT;
		size_t j;
		sha256_rorx_x8ms(block.u32, ctx.s, 1);
		for (j = 0; j < sizeof(ctx.s) / sizeof(ctx.s[0]); j++)
			block.h.u.u32[j] = cpu_to_be32(ctx.s[j]);
	}
	diff = time_divide(time_between(time_now(), start), n);
	printf("Asm RORx-x8ms for %02x%02x%02x%02x%02x%02x... is %llu nsec\n",
	       block.h.u.u8[0], block.h.u.u8[1], block.h.u.u8[2],
	       block.h.u.u8[3], block.h.u.u8[4], block.h.u.u8[5],
	       (unsigned long long)time_to_nsec(diff));

	sha256(&block.h, &n, sizeof(n));
	start = time_now();
	for (i = 0; i < n; i++) {
		struct sha256_ctx ctx = SHA256_INIT;
		size_t j;
		sha256_avx(block.u32, ctx.s, 1);
		for (j = 0; j < sizeof(ctx.s) / sizeof(ctx.s[0]); j++)
			block.h.u.u32[j] = cpu_to_be32(ctx.s[j]);
	}
	diff = time_divide(time_between(time_now(), start), n);
	printf("Asm AVX for %02x%02x%02x%02x%02x%02x... is %llu nsec\n",
	       block.h.u.u8[0], block.h.u.u8[1], block.h.u.u8[2],
	       block.h.u.u8[3], block.h.u.u8[4], block.h.u.u8[5],
	       (unsigned long long)time_to_nsec(diff));

	return 0;
}
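The `Transform` fast path in the benchmark above works because a 32-byte message plus SHA-256 padding fits in a single 64-byte block: byte 32 holds the 0x80 terminator and the last eight bytes hold the big-endian bit length (256 = 0x0100, hence `block.u8[sizeof(block)-2] = 1`). The sketch below mirrors that construction outside the timing loop and compares it against the plain `sha256()` result; it assumes, like the benchmark's Makefile, that `sha256.c` is compiled in directly with `-I$(CCANDIR) -DCCAN_USE_ORIGINAL=1` so the internal `Transform()` and `cpu_to_be32()` are visible:

```
/* Sketch: verify the hand-padded single block matches the normal API. */
#include <ccan/crypto/sha256/sha256.c>	/* included directly, as the benchmark does */
#include <assert.h>
#include <string.h>

int main(void)
{
	const char msg[] = "some transaction bytes";
	struct sha256 inner, ref, fast;
	union {
		uint32_t u32[16];
		uint8_t u8[64];
	} blk;
	size_t j;

	/* First SHA-256, then the reference second hash via the normal API. */
	sha256(&inner, msg, strlen(msg));
	sha256(&ref, &inner, sizeof(inner));

	/* Build the one pre-padded block for a 32-byte message by hand. */
	memset(&blk, 0, sizeof(blk));
	memcpy(blk.u8, &inner, sizeof(inner));
	blk.u8[32] = 0x80;	/* padding terminator right after the message */
	blk.u8[62] = 1;		/* bit length 256 = 0x0100, big-endian in last 8 bytes */

	/* One compression over the block gives the digest directly. */
	struct sha256_ctx ctx = SHA256_INIT;
	Transform(ctx.s, blk.u32);
	for (j = 0; j < sizeof(ctx.s) / sizeof(ctx.s[0]); j++)
		fast.u.u32[j] = cpu_to_be32(ctx.s[j]);

	assert(memcmp(&ref, &fast, sizeof(ref)) == 0);
	return 0;
}
```

The digest prefixes the benchmark prints serve the same purpose: they let you eyeball that every variant computes the same hash.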
@@ -0,0 +1,32 @@
Copyright (c) 2012, Intel Corporation

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

* Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the
  distribution.

* Neither the name of the Intel Corporation nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.


THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
nostrdb/ccan/ccan/crypto/sha256/benchmarks/sha256_avx1.asm (new file, 586 lines)
@@ -0,0 +1,586 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright (c) 2012, Intel Corporation
|
||||
;
|
||||
; All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions are
|
||||
; met:
|
||||
;
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
;
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in the
|
||||
; documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
;
|
||||
; * Neither the name of the Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived from
|
||||
; this software without specific prior written permission.
|
||||
;
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
|
||||
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
|
||||
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;
|
||||
; Example YASM command lines:
|
||||
; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx1.obj -g cv8 sha256_avx1.asm
|
||||
; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx1.o sha256_avx1.asm
|
||||
;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;
|
||||
; This code is described in an Intel White-Paper:
|
||||
; "Fast SHA-256 Implementations on Intel Architecture Processors"
|
||||
;
|
||||
; To find it, surf to http://www.intel.com/p/en_US/embedded
|
||||
; and search for that title.
|
||||
; The paper is expected to be released roughly at the end of April, 2012
|
||||
;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; This code schedules 1 blocks at a time, with 4 lanes per block
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%define VMOVDQ vmovdqu ;; assume buffers not aligned
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
|
||||
|
||||
; addm [mem], reg
|
||||
; Add reg to mem using reg-mem add and store
|
||||
%macro addm 2
|
||||
add %2, %1
|
||||
mov %1, %2
|
||||
%endm
|
||||
|
||||
%macro MY_ROR 2
|
||||
shld %1,%1,(32-(%2))
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
|
||||
; Load xmm with mem and byte swap each dword
|
||||
%macro COPY_XMM_AND_BSWAP 3
|
||||
VMOVDQ %1, %2
|
||||
vpshufb %1, %1, %3
|
||||
%endmacro
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%define X0 xmm4
|
||||
%define X1 xmm5
|
||||
%define X2 xmm6
|
||||
%define X3 xmm7
|
||||
|
||||
%define XTMP0 xmm0
|
||||
%define XTMP1 xmm1
|
||||
%define XTMP2 xmm2
|
||||
%define XTMP3 xmm3
|
||||
%define XTMP4 xmm8
|
||||
%define XFER xmm9
|
||||
%define XTMP5 xmm11
|
||||
|
||||
%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
|
||||
%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00
|
||||
%define BYTE_FLIP_MASK xmm13
|
||||
|
||||
%ifdef LINUX
|
||||
%define NUM_BLKS rdx ; 3rd arg
|
||||
%define CTX rsi ; 2nd arg
|
||||
%define INP rdi ; 1st arg
|
||||
|
||||
%define SRND rdi ; clobbers INP
|
||||
%define c ecx
|
||||
%define d r8d
|
||||
%define e edx
|
||||
%else
|
||||
%define NUM_BLKS r8 ; 3rd arg
|
||||
%define CTX rdx ; 2nd arg
|
||||
%define INP rcx ; 1st arg
|
||||
|
||||
%define SRND rcx ; clobbers INP
|
||||
%define c edi
|
||||
%define d esi
|
||||
%define e r8d
|
||||
|
||||
%endif
|
||||
%define TBL rbp
|
||||
%define a eax
|
||||
%define b ebx
|
||||
|
||||
%define f r9d
|
||||
%define g r10d
|
||||
%define h r11d
|
||||
|
||||
%define y0 r13d
|
||||
%define y1 r14d
|
||||
%define y2 r15d
|
||||
|
||||
|
||||
_INP_END_SIZE equ 8
|
||||
_INP_SIZE equ 8
|
||||
_XFER_SIZE equ 8
|
||||
%ifdef LINUX
|
||||
_XMM_SAVE_SIZE equ 0
|
||||
%else
|
||||
_XMM_SAVE_SIZE equ 8*16
|
||||
%endif
|
||||
; STACK_SIZE plus pushes must be an odd multiple of 8
|
||||
_ALIGN_SIZE equ 8
|
||||
|
||||
_INP_END equ 0
|
||||
_INP equ _INP_END + _INP_END_SIZE
|
||||
_XFER equ _INP + _INP_SIZE
|
||||
_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
|
||||
STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
|
||||
|
||||
; rotate_Xs
|
||||
; Rotate values of symbols X0...X3
|
||||
%macro rotate_Xs 0
|
||||
%xdefine X_ X0
|
||||
%xdefine X0 X1
|
||||
%xdefine X1 X2
|
||||
%xdefine X2 X3
|
||||
%xdefine X3 X_
|
||||
%endm
|
||||
|
||||
; ROTATE_ARGS
|
||||
; Rotate values of symbols a...h
|
||||
%macro ROTATE_ARGS 0
|
||||
%xdefine TMP_ h
|
||||
%xdefine h g
|
||||
%xdefine g f
|
||||
%xdefine f e
|
||||
%xdefine e d
|
||||
%xdefine d c
|
||||
%xdefine c b
|
||||
%xdefine b a
|
||||
%xdefine a TMP_
|
||||
%endm
|
||||
|
||||
%macro FOUR_ROUNDS_AND_SCHED 0
|
||||
;; compute s0 four at a time and s1 two at a time
|
||||
;; compute W[-16] + W[-7] 4 at a time
|
||||
;vmovdqa XTMP0, X3
|
||||
mov y0, e ; y0 = e
|
||||
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
|
||||
mov y1, a ; y1 = a
|
||||
vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7]
|
||||
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
mov y2, f ; y2 = f
|
||||
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
;vmovdqa XTMP1, X1
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13)
|
||||
xor y2, g ; y2 = f^g
|
||||
vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
;; compute s0
|
||||
vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15]
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
|
||||
|
||||
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
|
||||
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
|
||||
vpsrld XTMP2, XTMP1, 7
|
||||
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
|
||||
vpslld XTMP3, XTMP1, (32-7)
|
||||
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
|
||||
vpor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7
|
||||
|
||||
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
|
||||
mov y0, e ; y0 = e
|
||||
mov y1, a ; y1 = a
|
||||
|
||||
|
||||
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
mov y2, f ; y2 = f
|
||||
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
|
||||
|
||||
vpsrld XTMP2, XTMP1,18
|
||||
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13)
|
||||
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
xor y2, g ; y2 = f^g
|
||||
|
||||
vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3
|
||||
|
||||
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
|
||||
vpslld XTMP1, XTMP1, (32-18)
|
||||
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
|
||||
vpxor XTMP3, XTMP3, XTMP1
|
||||
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
|
||||
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
|
||||
vpxor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
|
||||
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
|
||||
vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0
|
||||
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
;; compute low s1
|
||||
vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
|
||||
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
|
||||
|
||||
mov y0, e ; y0 = e
|
||||
mov y1, a ; y1 = a
|
||||
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
|
||||
|
||||
;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
|
||||
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
|
||||
mov y2, f ; y2 = f
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13)
|
||||
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
|
||||
vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
|
||||
|
||||
xor y2, g ; y2 = f^g
|
||||
|
||||
vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA}
|
||||
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
|
||||
vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xBxA}
|
||||
|
||||
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
vpxor XTMP2, XTMP2, XTMP3
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
|
||||
vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
;; compute high s1
|
||||
vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
|
||||
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
|
||||
mov y0, e ; y0 = e
|
||||
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
|
||||
mov y1, a ; y1 = a
|
||||
;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC}
|
||||
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
mov y2, f ; y2 = f
|
||||
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
|
||||
vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC}
|
||||
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13)
|
||||
xor y2, g ; y2 = f^g
|
||||
|
||||
vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC}
|
||||
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
|
||||
vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC}
|
||||
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
|
||||
vpxor XTMP2, XTMP2, XTMP3
|
||||
|
||||
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
|
||||
vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC}
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
rotate_Xs
|
||||
%endm
|
||||
|
||||
;; input is [rsp + _XFER + %1 * 4]
|
||||
%macro DO_ROUND 1
|
||||
mov y0, e ; y0 = e
|
||||
MY_ROR y0, (25-11) ; y0 = e >> (25-11)
|
||||
mov y1, a ; y1 = a
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
MY_ROR y1, (22-13) ; y1 = a >> (22-13)
|
||||
mov y2, f ; y2 = f
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13)
|
||||
MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
xor y2, g ; y2 = f^g
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
ROTATE_ARGS
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
|
||||
;; arg 1 : pointer to input data
|
||||
;; arg 2 : pointer to digest
|
||||
;; arg 3 : Num blocks
|
||||
section .text
|
||||
global sha256_avx
|
||||
align 32
|
||||
sha256_avx:
|
||||
push rbx
|
||||
%ifndef LINUX
|
||||
push rsi
|
||||
push rdi
|
||||
%endif
|
||||
push rbp
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
|
||||
sub rsp,STACK_SIZE
|
||||
%ifndef LINUX
|
||||
vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
|
||||
vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
|
||||
vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
|
||||
vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
|
||||
vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
|
||||
vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
|
||||
vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
|
||||
vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
|
||||
%endif
|
||||
|
||||
shl NUM_BLKS, 6 ; convert to bytes
|
||||
jz done_hash
|
||||
add NUM_BLKS, INP ; pointer to end of data
|
||||
mov [rsp + _INP_END], NUM_BLKS
|
||||
|
||||
;; load initial digest
|
||||
mov a,[4*0 + CTX]
|
||||
mov b,[4*1 + CTX]
|
||||
mov c,[4*2 + CTX]
|
||||
mov d,[4*3 + CTX]
|
||||
mov e,[4*4 + CTX]
|
||||
mov f,[4*5 + CTX]
|
||||
mov g,[4*6 + CTX]
|
||||
mov h,[4*7 + CTX]
|
||||
|
||||
vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
|
||||
vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
|
||||
vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
|
||||
|
||||
loop0:
|
||||
lea TBL,[K256 wrt rip]
|
||||
|
||||
;; byte swap first 16 dwords
|
||||
COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
|
||||
|
||||
mov [rsp + _INP], INP
|
||||
|
||||
;; schedule 48 input dwords, by doing 3 rounds of 16 each
|
||||
mov SRND, 3
|
||||
align 16
|
||||
loop1:
|
||||
vpaddd XFER, X0, [TBL + 0*16]
|
||||
vmovdqa [rsp + _XFER], XFER
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
vpaddd XFER, X0, [TBL + 1*16]
|
||||
vmovdqa [rsp + _XFER], XFER
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
vpaddd XFER, X0, [TBL + 2*16]
|
||||
vmovdqa [rsp + _XFER], XFER
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
vpaddd XFER, X0, [TBL + 3*16]
|
||||
vmovdqa [rsp + _XFER], XFER
|
||||
add TBL, 4*16
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
sub SRND, 1
|
||||
jne loop1
|
||||
|
||||
mov SRND, 2
|
||||
loop2:
|
||||
vpaddd XFER, X0, [TBL + 0*16]
|
||||
vmovdqa [rsp + _XFER], XFER
|
||||
DO_ROUND 0
|
||||
DO_ROUND 1
|
||||
DO_ROUND 2
|
||||
DO_ROUND 3
|
||||
|
||||
vpaddd XFER, X1, [TBL + 1*16]
|
||||
vmovdqa [rsp + _XFER], XFER
|
||||
add TBL, 2*16
|
||||
DO_ROUND 0
|
||||
DO_ROUND 1
|
||||
DO_ROUND 2
|
||||
DO_ROUND 3
|
||||
|
||||
vmovdqa X0, X2
|
||||
vmovdqa X1, X3
|
||||
|
||||
sub SRND, 1
|
||||
jne loop2
|
||||
|
||||
|
||||
addm [4*0 + CTX],a
|
||||
addm [4*1 + CTX],b
|
||||
addm [4*2 + CTX],c
|
||||
addm [4*3 + CTX],d
|
||||
addm [4*4 + CTX],e
|
||||
addm [4*5 + CTX],f
|
||||
addm [4*6 + CTX],g
|
||||
addm [4*7 + CTX],h
|
||||
|
||||
mov INP, [rsp + _INP]
|
||||
add INP, 64
|
||||
cmp INP, [rsp + _INP_END]
|
||||
jne loop0
|
||||
|
||||
done_hash:
|
||||
%ifndef LINUX
|
||||
vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
|
||||
vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
|
||||
vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
|
||||
vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
|
||||
vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
|
||||
vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
|
||||
vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
|
||||
vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
|
||||
%endif
|
||||
|
||||
|
||||
add rsp, STACK_SIZE
|
||||
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop rbp
|
||||
%ifndef LINUX
|
||||
pop rdi
|
||||
pop rsi
|
||||
%endif
|
||||
pop rbx
|
||||
|
||||
ret
|
||||
|
||||
|
||||
section .data
|
||||
align 64
|
||||
K256:
|
||||
dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||
dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||||
dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||||
dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||||
dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||||
dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||||
dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||||
dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||||
dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||||
dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||||
dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||||
dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||||
dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||||
dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
|
||||
PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
|
||||
|
||||
; shuffle xBxA -> 00BA
|
||||
_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
|
||||
|
||||
; shuffle xDxC -> DC00
|
||||
_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
|
||||
nostrdb/ccan/ccan/crypto/sha256/benchmarks/sha256_avx2_rorx2.asm (new file, 826 lines)
@@ -0,0 +1,826 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright (c) 2012, Intel Corporation
|
||||
;
|
||||
; All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions are
|
||||
; met:
|
||||
;
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
;
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in the
|
||||
; documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
;
|
||||
; * Neither the name of the Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived from
|
||||
; this software without specific prior written permission.
|
||||
;
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
|
||||
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
|
||||
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;
|
||||
; Example YASM command lines:
|
||||
; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx2_rorx2.obj -g cv8 sha256_avx2_rorx2.asm
|
||||
; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx2_rorx2.o sha256_avx2_rorx2.asm
|
||||
;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;
|
||||
; This code is described in an Intel White-Paper:
|
||||
; "Fast SHA-256 Implementations on Intel Architecture Processors"
|
||||
;
|
||||
; To find it, surf to http://www.intel.com/p/en_US/embedded
|
||||
; and search for that title.
|
||||
; The paper is expected to be released roughly at the end of April, 2012
|
||||
;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; This code schedules 2 blocks at a time, with 4 lanes per block
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%define VMOVDQ vmovdqu ;; assume buffers not aligned
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
|
||||
|
||||
; addm [mem], reg
|
||||
; Add reg to mem using reg-mem add and store
|
||||
%macro addm 2
|
||||
add %2, %1
|
||||
mov %1, %2
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%define X0 ymm4
|
||||
%define X1 ymm5
|
||||
%define X2 ymm6
|
||||
%define X3 ymm7
|
||||
|
||||
; XMM versions of above
|
||||
%define XWORD0 xmm4
|
||||
%define XWORD1 xmm5
|
||||
%define XWORD2 xmm6
|
||||
%define XWORD3 xmm7
|
||||
|
||||
%define XTMP0 ymm0
|
||||
%define XTMP1 ymm1
|
||||
%define XTMP2 ymm2
|
||||
%define XTMP3 ymm3
|
||||
%define XTMP4 ymm8
|
||||
%define XFER ymm9
|
||||
%define XTMP5 ymm11
|
||||
|
||||
%define SHUF_00BA ymm10 ; shuffle xBxA -> 00BA
|
||||
%define SHUF_DC00 ymm12 ; shuffle xDxC -> DC00
|
||||
%define BYTE_FLIP_MASK ymm13
|
||||
|
||||
%define X_BYTE_FLIP_MASK xmm13 ; XMM version of BYTE_FLIP_MASK
|
||||
|
||||
%ifdef LINUX
|
||||
%define NUM_BLKS rdx ; 3rd arg
|
||||
%define CTX rsi ; 2nd arg
|
||||
%define INP rdi ; 1st arg
|
||||
%define c ecx
|
||||
%define d r8d
|
||||
%define e edx ; clobbers NUM_BLKS
|
||||
%define y3 edi ; clobbers INP
|
||||
%else
|
||||
%define NUM_BLKS r8 ; 3rd arg
|
||||
%define CTX rdx ; 2nd arg
|
||||
%define INP rcx ; 1st arg
|
||||
%define c edi
|
||||
%define d esi
|
||||
%define e r8d ; clobbers NUM_BLKS
|
||||
%define y3 ecx ; clobbers INP
|
||||
|
||||
%endif
|
||||
|
||||
|
||||
%define TBL rbp
|
||||
%define SRND CTX ; SRND is same register as CTX
|
||||
|
||||
%define a eax
|
||||
%define b ebx
|
||||
%define f r9d
|
||||
%define g r10d
|
||||
%define h r11d
|
||||
%define old_h r11d
|
||||
|
||||
%define T1 r12d
|
||||
%define y0 r13d
|
||||
%define y1 r14d
|
||||
%define y2 r15d
|
||||
|
||||
|
||||
_XFER_SIZE equ 2*64*4 ; 2 blocks, 64 rounds, 4 bytes/round
|
||||
%ifdef LINUX
|
||||
_XMM_SAVE_SIZE equ 0
|
||||
%else
|
||||
_XMM_SAVE_SIZE equ 8*16
|
||||
%endif
|
||||
_INP_END_SIZE equ 8
|
||||
_INP_SIZE equ 8
|
||||
_CTX_SIZE equ 8
|
||||
_RSP_SIZE equ 8
|
||||
|
||||
_XFER equ 0
|
||||
_XMM_SAVE equ _XFER + _XFER_SIZE
|
||||
_INP_END equ _XMM_SAVE + _XMM_SAVE_SIZE
|
||||
_INP equ _INP_END + _INP_END_SIZE
|
||||
_CTX equ _INP + _INP_SIZE
|
||||
_RSP equ _CTX + _CTX_SIZE
|
||||
STACK_SIZE equ _RSP + _RSP_SIZE
|
||||
|
||||
; rotate_Xs
|
||||
; Rotate values of symbols X0...X3
|
||||
%macro rotate_Xs 0
|
||||
%xdefine X_ X0
|
||||
%xdefine X0 X1
|
||||
%xdefine X1 X2
|
||||
%xdefine X2 X3
|
||||
%xdefine X3 X_
|
||||
%endm
|
||||
|
||||
; ROTATE_ARGS
|
||||
; Rotate values of symbols a...h
|
||||
%macro ROTATE_ARGS 0
|
||||
%xdefine old_h h
|
||||
%xdefine TMP_ h
|
||||
%xdefine h g
|
||||
%xdefine g f
|
||||
%xdefine f e
|
||||
%xdefine e d
|
||||
%xdefine d c
|
||||
%xdefine c b
|
||||
%xdefine b a
|
||||
%xdefine a TMP_
|
||||
%endm
|
||||
|
||||
%macro FOUR_ROUNDS_AND_SCHED 1
|
||||
%define %%XFER %1
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
mov y3, a ; y3 = a ; MAJA
|
||||
rorx y0, e, 25 ; y0 = e >> 25 ; S1A
|
||||
rorx y1, e, 11 ; y1 = e >> 11 ; S1B
|
||||
|
||||
add h, dword[%%XFER+0*4] ; h = k + w + h ; --
|
||||
or y3, c ; y3 = a|c ; MAJA
|
||||
vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7]
|
||||
mov y2, f ; y2 = f ; CH
|
||||
rorx T1, a, 13 ; T1 = a >> 13 ; S0B
|
||||
|
||||
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
|
||||
xor y2, g ; y2 = f^g ; CH
|
||||
vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]; y1 = (e >> 6) ; S1
|
||||
rorx y1, e, 6 ; y1 = (e >> 6) ; S1
|
||||
|
||||
and y2, e ; y2 = (f^g)&e ; CH
|
||||
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
|
||||
rorx y1, a, 22 ; y1 = a >> 22 ; S0A
|
||||
add d, h ; d = k + w + h + d ; --
|
||||
|
||||
and y3, b ; y3 = (a|c)&b ; MAJA
|
||||
vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15]
|
||||
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
|
||||
rorx T1, a, 2 ; T1 = (a >> 2) ; S0
|
||||
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
|
||||
vpsrld XTMP2, XTMP1, 7
|
||||
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
|
||||
mov T1, a ; T1 = a ; MAJB
|
||||
and T1, c ; T1 = a&c ; MAJB
|
||||
|
||||
add y2, y0 ; y2 = S1 + CH ; --
|
||||
vpslld XTMP3, XTMP1, (32-7)
|
||||
or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
|
||||
add h, y1 ; h = k + w + h + S0 ; --
|
||||
|
||||
add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
|
||||
vpor XTMP3, XTMP3, XTMP2 ; XTMP3 = W[-15] ror 7
|
||||
|
||||
vpsrld XTMP2, XTMP1,18
|
||||
add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
|
||||
add h, y3 ; h = t1 + S0 + MAJ ; --
|
||||
|
||||
|
||||
ROTATE_ARGS
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
|
||||
mov y3, a ; y3 = a ; MAJA
|
||||
rorx y0, e, 25 ; y0 = e >> 25 ; S1A
|
||||
rorx y1, e, 11 ; y1 = e >> 11 ; S1B
|
||||
add h, dword[%%XFER+1*4] ; h = k + w + h ; --
|
||||
or y3, c ; y3 = a|c ; MAJA
|
||||
|
||||
|
||||
vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3
|
||||
mov y2, f ; y2 = f ; CH
|
||||
rorx T1, a, 13 ; T1 = a >> 13 ; S0B
|
||||
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
|
||||
xor y2, g ; y2 = f^g ; CH
|
||||
|
||||
|
||||
rorx y1, e, 6 ; y1 = (e >> 6) ; S1
|
||||
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
|
||||
rorx y1, a, 22 ; y1 = a >> 22 ; S0A
|
||||
and y2, e ; y2 = (f^g)&e ; CH
|
||||
add d, h ; d = k + w + h + d ; --
|
||||
|
||||
vpslld XTMP1, XTMP1, (32-18)
|
||||
and y3, b ; y3 = (a|c)&b ; MAJA
|
||||
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
|
||||
|
||||
vpxor XTMP3, XTMP3, XTMP1
|
||||
rorx T1, a, 2 ; T1 = (a >> 2) ; S0
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
|
||||
|
||||
vpxor XTMP3, XTMP3, XTMP2 ; XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
|
||||
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
|
||||
mov T1, a ; T1 = a ; MAJB
|
||||
and T1, c ; T1 = a&c ; MAJB
|
||||
add y2, y0 ; y2 = S1 + CH ; --
|
||||
|
||||
vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0
|
||||
vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
|
||||
or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
|
||||
add h, y1 ; h = k + w + h + S0 ; --
|
||||
|
||||
vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
|
||||
add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
|
||||
add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
|
||||
add h, y3 ; h = t1 + S0 + MAJ ; --
|
||||
|
||||
vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
|
||||
|
||||
|
||||
ROTATE_ARGS
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
mov y3, a ; y3 = a ; MAJA
|
||||
rorx y0, e, 25 ; y0 = e >> 25 ; S1A
|
||||
add h, [%%XFER+2*4] ; h = k + w + h ; --
|
||||
|
||||
vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
|
||||
rorx y1, e, 11 ; y1 = e >> 11 ; S1B
|
||||
or y3, c ; y3 = a|c ; MAJA
|
||||
mov y2, f ; y2 = f ; CH
|
||||
xor y2, g ; y2 = f^g ; CH
|
||||
|
||||
rorx T1, a, 13 ; T1 = a >> 13 ; S0B
|
||||
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
|
||||
vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
|
||||
and y2, e ; y2 = (f^g)&e ; CH
|
||||
|
||||
rorx y1, e, 6 ; y1 = (e >> 6) ; S1
|
||||
vpxor XTMP2, XTMP2, XTMP3
|
||||
add d, h ; d = k + w + h + d ; --
|
||||
and y3, b ; y3 = (a|c)&b ; MAJA
|
||||
|
||||
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
|
||||
rorx y1, a, 22 ; y1 = a >> 22 ; S0A
|
||||
vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
|
||||
|
||||
vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
|
||||
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
|
||||
rorx T1, a, 2 ; T1 = (a >> 2) ; S0
|
||||
vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
|
||||
|
||||
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
|
||||
mov T1, a ; T1 = a ; MAJB
|
||||
and T1, c ; T1 = a&c ; MAJB
|
||||
add y2, y0 ; y2 = S1 + CH ; --
|
||||
vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
|
||||
|
||||
or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
|
||||
add h, y1 ; h = k + w + h + S0 ; --
|
||||
add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
|
||||
add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
|
||||
|
||||
add h, y3 ; h = t1 + S0 + MAJ ; --
|
||||
|
||||
|
||||
ROTATE_ARGS
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
mov y3, a ; y3 = a ; MAJA
|
||||
rorx y0, e, 25 ; y0 = e >> 25 ; S1A
|
||||
rorx y1, e, 11 ; y1 = e >> 11 ; S1B
|
||||
add h, dword[%%XFER+3*4] ; h = k + w + h ; --
|
||||
or y3, c ; y3 = a|c ; MAJA
|
||||
|
||||
|
||||
vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC}
|
||||
mov y2, f ; y2 = f ; CH
|
||||
rorx T1, a, 13 ; T1 = a >> 13 ; S0B
|
||||
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
|
||||
xor y2, g ; y2 = f^g ; CH
|
||||
|
||||
|
||||
vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
|
||||
rorx y1, e, 6 ; y1 = (e >> 6) ; S1
|
||||
and y2, e ; y2 = (f^g)&e ; CH
|
||||
add d, h ; d = k + w + h + d ; --
|
||||
and y3, b ; y3 = (a|c)&b ; MAJA
|
||||
|
||||
vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
|
||||
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
|
||||
|
||||
vpxor XTMP2, XTMP2, XTMP3
|
||||
rorx y1, a, 22 ; y1 = a >> 22 ; S0A
|
||||
add y2, y0 ; y2 = S1 + CH ; --
|
||||
|
||||
vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC}
|
||||
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
|
||||
add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
|
||||
|
||||
rorx T1, a, 2 ; T1 = (a >> 2) ; S0
|
||||
vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
|
||||
|
||||
vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
|
||||
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
|
||||
mov T1, a ; T1 = a ; MAJB
|
||||
and T1, c ; T1 = a&c ; MAJB
|
||||
or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
|
||||
|
||||
add h, y1 ; h = k + w + h + S0 ; --
|
||||
add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
|
||||
add h, y3 ; h = t1 + S0 + MAJ ; --
|
||||
|
||||
ROTATE_ARGS
|
||||
rotate_Xs
|
||||
%endm
|
||||
|
||||
%macro DO_4ROUNDS 1
|
||||
%define %%XFER %1
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
mov y2, f ; y2 = f ; CH
|
||||
rorx y0, e, 25 ; y0 = e >> 25 ; S1A
|
||||
rorx y1, e, 11 ; y1 = e >> 11 ; S1B
|
||||
xor y2, g ; y2 = f^g ; CH
|
||||
|
||||
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
|
||||
rorx y1, e, 6 ; y1 = (e >> 6) ; S1
|
||||
and y2, e ; y2 = (f^g)&e ; CH
|
||||
|
||||
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
|
||||
rorx T1, a, 13 ; T1 = a >> 13 ; S0B
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
|
||||
rorx y1, a, 22 ; y1 = a >> 22 ; S0A
|
||||
mov y3, a ; y3 = a ; MAJA
|
||||
|
||||
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
|
||||
rorx T1, a, 2 ; T1 = (a >> 2) ; S0
|
||||
add h, dword[%%XFER + 4*0] ; h = k + w + h ; --
|
||||
or y3, c ; y3 = a|c ; MAJA
|
||||
|
||||
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
|
||||
mov T1, a ; T1 = a ; MAJB
|
||||
and y3, b ; y3 = (a|c)&b ; MAJA
|
||||
and T1, c ; T1 = a&c ; MAJB
|
||||
add y2, y0 ; y2 = S1 + CH ; --
|
||||
|
||||
|
||||
add d, h ; d = k + w + h + d ; --
|
||||
or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
|
||||
add h, y1 ; h = k + w + h + S0 ; --
|
||||
|
||||
add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
|
||||
|
||||
|
||||
;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
|
||||
|
||||
;add h, y3 ; h = t1 + S0 + MAJ ; --
|
||||
|
||||
ROTATE_ARGS
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
|
||||
mov y2, f ; y2 = f ; CH
|
||||
rorx y0, e, 25 ; y0 = e >> 25 ; S1A
|
||||
rorx y1, e, 11 ; y1 = e >> 11 ; S1B
|
||||
xor y2, g ; y2 = f^g ; CH
|
||||
|
||||
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
|
||||
rorx y1, e, 6 ; y1 = (e >> 6) ; S1
|
||||
and y2, e ; y2 = (f^g)&e ; CH
|
||||
add old_h, y3 ; h = t1 + S0 + MAJ ; --
|
||||
|
||||
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
|
||||
rorx T1, a, 13 ; T1 = a >> 13 ; S0B
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
|
||||
rorx y1, a, 22 ; y1 = a >> 22 ; S0A
|
||||
mov y3, a ; y3 = a ; MAJA
|
||||
|
||||
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
|
||||
rorx T1, a, 2 ; T1 = (a >> 2) ; S0
|
||||
add h, dword[%%XFER + 4*1] ; h = k + w + h ; --
|
||||
or y3, c ; y3 = a|c ; MAJA
|
||||
|
||||
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
|
||||
mov T1, a ; T1 = a ; MAJB
|
||||
and y3, b ; y3 = (a|c)&b ; MAJA
|
||||
and T1, c ; T1 = a&c ; MAJB
|
||||
add y2, y0 ; y2 = S1 + CH ; --
|
||||
|
||||
|
||||
add d, h ; d = k + w + h + d ; --
|
||||
or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
|
||||
add h, y1 ; h = k + w + h + S0 ; --
|
||||
|
||||
add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
|
||||
|
||||
|
||||
;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
|
||||
|
||||
;add h, y3 ; h = t1 + S0 + MAJ ; --
|
||||
|
||||
ROTATE_ARGS
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
|
||||
mov y2, f ; y2 = f ; CH
|
||||
rorx y0, e, 25 ; y0 = e >> 25 ; S1A
|
||||
rorx y1, e, 11 ; y1 = e >> 11 ; S1B
|
||||
xor y2, g ; y2 = f^g ; CH
|
||||
|
||||
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
|
||||
rorx y1, e, 6 ; y1 = (e >> 6) ; S1
|
||||
and y2, e ; y2 = (f^g)&e ; CH
|
||||
add old_h, y3 ; h = t1 + S0 + MAJ ; --
|
||||
|
||||
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
|
||||
rorx T1, a, 13 ; T1 = a >> 13 ; S0B
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
|
||||
rorx y1, a, 22 ; y1 = a >> 22 ; S0A
|
||||
mov y3, a ; y3 = a ; MAJA
|
||||
|
||||
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
|
||||
rorx T1, a, 2 ; T1 = (a >> 2) ; S0
|
||||
add h, dword[%%XFER + 4*2] ; h = k + w + h ; --
|
||||
or y3, c ; y3 = a|c ; MAJA
|
||||
|
||||
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
|
||||
mov T1, a ; T1 = a ; MAJB
|
||||
and y3, b ; y3 = (a|c)&b ; MAJA
|
||||
and T1, c ; T1 = a&c ; MAJB
|
||||
add y2, y0 ; y2 = S1 + CH ; --
|
||||
|
||||
|
||||
add d, h ; d = k + w + h + d ; --
|
||||
or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
|
||||
add h, y1 ; h = k + w + h + S0 ; --
|
||||
|
||||
add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
|
||||
|
||||
|
||||
;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
|
||||
|
||||
;add h, y3 ; h = t1 + S0 + MAJ ; --
|
||||
|
||||
ROTATE_ARGS
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
|
||||
mov y2, f ; y2 = f ; CH
|
||||
rorx y0, e, 25 ; y0 = e >> 25 ; S1A
|
||||
rorx y1, e, 11 ; y1 = e >> 11 ; S1B
|
||||
xor y2, g ; y2 = f^g ; CH
|
||||
|
||||
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
|
||||
rorx y1, e, 6 ; y1 = (e >> 6) ; S1
|
||||
and y2, e ; y2 = (f^g)&e ; CH
|
||||
add old_h, y3 ; h = t1 + S0 + MAJ ; --
|
||||
|
||||
xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
|
||||
rorx T1, a, 13 ; T1 = a >> 13 ; S0B
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
|
||||
rorx y1, a, 22 ; y1 = a >> 22 ; S0A
|
||||
mov y3, a ; y3 = a ; MAJA
|
||||
|
||||
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
|
||||
rorx T1, a, 2 ; T1 = (a >> 2) ; S0
|
||||
add h, dword[%%XFER + 4*3] ; h = k + w + h ; --
|
||||
or y3, c ; y3 = a|c ; MAJA
|
||||
|
||||
xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
|
||||
mov T1, a ; T1 = a ; MAJB
|
||||
and y3, b ; y3 = (a|c)&b ; MAJA
|
||||
and T1, c ; T1 = a&c ; MAJB
|
||||
add y2, y0 ; y2 = S1 + CH ; --
|
||||
|
||||
|
||||
add d, h ; d = k + w + h + d ; --
|
||||
or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
|
||||
add h, y1 ; h = k + w + h + S0 ; --
|
||||
|
||||
add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
|
||||
|
||||
|
||||
add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
|
||||
|
||||
add h, y3 ; h = t1 + S0 + MAJ ; --
|
||||
|
||||
ROTATE_ARGS
|
||||
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; void sha256_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
|
||||
;; arg 1 : pointer to input data
|
||||
;; arg 2 : pointer to digest
|
||||
;; arg 3 : Num blocks
|
||||
section .text
|
||||
global sha256_rorx
|
||||
align 32
|
||||
sha256_rorx:
|
||||
push rbx
|
||||
%ifndef LINUX
|
||||
push rsi
|
||||
push rdi
|
||||
%endif
|
||||
push rbp
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
|
||||
mov rax, rsp
|
||||
sub rsp,STACK_SIZE
|
||||
and rsp, -32
|
||||
mov [rsp + _RSP], rax
|
||||
|
||||
%ifndef LINUX
|
||||
vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
|
||||
vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
|
||||
vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
|
||||
vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
|
||||
vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
|
||||
vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
|
||||
vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
|
||||
vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
|
||||
%endif
|
||||
|
||||
shl NUM_BLKS, 6 ; convert to bytes
|
||||
jz done_hash
|
||||
lea NUM_BLKS, [NUM_BLKS + INP - 64] ; pointer to last block
|
||||
mov [rsp + _INP_END], NUM_BLKS
|
||||
|
||||
cmp INP, NUM_BLKS
|
||||
je only_one_block
|
||||
|
||||
;; load initial digest
|
||||
mov a,[4*0 + CTX]
|
||||
mov b,[4*1 + CTX]
|
||||
mov c,[4*2 + CTX]
|
||||
mov d,[4*3 + CTX]
|
||||
mov e,[4*4 + CTX]
|
||||
mov f,[4*5 + CTX]
|
||||
mov g,[4*6 + CTX]
|
||||
mov h,[4*7 + CTX]
|
||||
|
||||
vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
|
||||
vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
|
||||
vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
|
||||
|
||||
mov [rsp + _CTX], CTX
|
||||
|
||||
loop0:
|
||||
lea TBL,[K256 wrt rip]
|
||||
|
||||
;; Load first 16 dwords from two blocks
|
||||
VMOVDQ XTMP0, [INP + 0*32]
|
||||
VMOVDQ XTMP1, [INP + 1*32]
|
||||
VMOVDQ XTMP2, [INP + 2*32]
|
||||
VMOVDQ XTMP3, [INP + 3*32]
|
||||
|
||||
;; byte swap data
|
||||
vpshufb XTMP0, XTMP0, BYTE_FLIP_MASK
|
||||
vpshufb XTMP1, XTMP1, BYTE_FLIP_MASK
|
||||
vpshufb XTMP2, XTMP2, BYTE_FLIP_MASK
|
||||
vpshufb XTMP3, XTMP3, BYTE_FLIP_MASK
|
||||
|
||||
;; transpose data into high/low halves
|
||||
vperm2i128 X0, XTMP0, XTMP2, 0x20
|
||||
vperm2i128 X1, XTMP0, XTMP2, 0x31
|
||||
vperm2i128 X2, XTMP1, XTMP3, 0x20
|
||||
vperm2i128 X3, XTMP1, XTMP3, 0x31
|
||||
|
||||
last_block_enter:
|
||||
add INP, 64
|
||||
mov [rsp + _INP], INP
|
||||
|
||||
;; schedule 48 input dwords, by doing 3 rounds of 12 each
|
||||
xor SRND, SRND
|
||||
|
||||
align 16
|
||||
loop1:
|
||||
vpaddd XFER, X0, [TBL + SRND + 0*32]
|
||||
vmovdqa [rsp + _XFER + SRND + 0*32], XFER
|
||||
FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 0*32
|
||||
|
||||
vpaddd XFER, X0, [TBL + SRND + 1*32]
|
||||
vmovdqa [rsp + _XFER + SRND + 1*32], XFER
|
||||
FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 1*32
|
||||
|
||||
vpaddd XFER, X0, [TBL + SRND + 2*32]
|
||||
vmovdqa [rsp + _XFER + SRND + 2*32], XFER
|
||||
FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 2*32
|
||||
|
||||
vpaddd XFER, X0, [TBL + SRND + 3*32]
|
||||
vmovdqa [rsp + _XFER + SRND + 3*32], XFER
|
||||
FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 3*32
|
||||
|
||||
add SRND, 4*32
|
||||
cmp SRND, 3 * 4*32
|
||||
jb loop1
|
||||
|
||||
loop2:
|
||||
;; Do last 16 rounds with no scheduling
|
||||
vpaddd XFER, X0, [TBL + SRND + 0*32]
|
||||
vmovdqa [rsp + _XFER + SRND + 0*32], XFER
|
||||
DO_4ROUNDS rsp + _XFER + SRND + 0*32
|
||||
vpaddd XFER, X1, [TBL + SRND + 1*32]
|
||||
vmovdqa [rsp + _XFER + SRND + 1*32], XFER
|
||||
DO_4ROUNDS rsp + _XFER + SRND + 1*32
|
||||
add SRND, 2*32
|
||||
|
||||
vmovdqa X0, X2
|
||||
vmovdqa X1, X3
|
||||
|
||||
cmp SRND, 4 * 4*32
|
||||
jb loop2
|
||||
|
||||
mov CTX, [rsp + _CTX]
|
||||
mov INP, [rsp + _INP]
|
||||
|
||||
addm [4*0 + CTX],a
|
||||
addm [4*1 + CTX],b
|
||||
addm [4*2 + CTX],c
|
||||
addm [4*3 + CTX],d
|
||||
addm [4*4 + CTX],e
|
||||
addm [4*5 + CTX],f
|
||||
addm [4*6 + CTX],g
|
||||
addm [4*7 + CTX],h
|
||||
|
||||
cmp INP, [rsp + _INP_END]
|
||||
ja done_hash
|
||||
|
||||
;;;; Do second block using previously scheduled results
|
||||
xor SRND, SRND
|
||||
align 16
|
||||
loop3:
|
||||
DO_4ROUNDS rsp + _XFER + SRND + 0*32 + 16
|
||||
DO_4ROUNDS rsp + _XFER + SRND + 1*32 + 16
|
||||
add SRND, 2*32
|
||||
cmp SRND, 4 * 4*32
|
||||
jb loop3
|
||||
|
||||
mov CTX, [rsp + _CTX]
|
||||
mov INP, [rsp + _INP]
|
||||
add INP, 64
|
||||
|
||||
addm [4*0 + CTX],a
|
||||
addm [4*1 + CTX],b
|
||||
addm [4*2 + CTX],c
|
||||
addm [4*3 + CTX],d
|
||||
addm [4*4 + CTX],e
|
||||
addm [4*5 + CTX],f
|
||||
addm [4*6 + CTX],g
|
||||
addm [4*7 + CTX],h
|
||||
|
||||
cmp INP, [rsp + _INP_END]
|
||||
jb loop0
|
||||
ja done_hash
|
||||
|
||||
do_last_block:
|
||||
;;;; do last block
|
||||
lea TBL,[K256 wrt rip]
|
||||
|
||||
VMOVDQ XWORD0, [INP + 0*16]
|
||||
VMOVDQ XWORD1, [INP + 1*16]
|
||||
VMOVDQ XWORD2, [INP + 2*16]
|
||||
VMOVDQ XWORD3, [INP + 3*16]
|
||||
|
||||
vpshufb XWORD0, XWORD0, X_BYTE_FLIP_MASK
|
||||
vpshufb XWORD1, XWORD1, X_BYTE_FLIP_MASK
|
||||
vpshufb XWORD2, XWORD2, X_BYTE_FLIP_MASK
|
||||
vpshufb XWORD3, XWORD3, X_BYTE_FLIP_MASK
|
||||
|
||||
jmp last_block_enter
|
||||
|
||||
only_one_block:
|
||||
|
||||
;; load initial digest
|
||||
mov a,[4*0 + CTX]
|
||||
mov b,[4*1 + CTX]
|
||||
mov c,[4*2 + CTX]
|
||||
mov d,[4*3 + CTX]
|
||||
mov e,[4*4 + CTX]
|
||||
mov f,[4*5 + CTX]
|
||||
mov g,[4*6 + CTX]
|
||||
mov h,[4*7 + CTX]
|
||||
|
||||
vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
|
||||
vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
|
||||
vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
|
||||
|
||||
mov [rsp + _CTX], CTX
|
||||
jmp do_last_block
|
||||
|
||||
done_hash:
|
||||
%ifndef LINUX
|
||||
vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
|
||||
vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
|
||||
vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
|
||||
vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
|
||||
vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
|
||||
vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
|
||||
vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
|
||||
vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
|
||||
%endif
|
||||
|
||||
mov rsp, [rsp + _RSP]
|
||||
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rbp
|
||||
%ifndef LINUX
|
||||
pop rdi
|
||||
pop rsi
|
||||
%endif
|
||||
pop rbx
|
||||
|
||||
ret
|
||||
|
||||
section .data
|
||||
align 64
|
||||
K256:
|
||||
dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||
dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||
dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||||
dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||||
dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||||
dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||||
dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||||
dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||||
dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||||
dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||||
dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||||
dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||||
dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||||
dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||||
dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||||
dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||||
dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||||
dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||||
dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||||
dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||||
dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||||
dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||||
dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||||
dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||||
dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||||
dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||||
dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
|
||||
PSHUFFLE_BYTE_FLIP_MASK:
|
||||
ddq 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
|
||||
|
||||
; shuffle xBxA -> 00BA
|
||||
_SHUF_00BA:
|
||||
ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
|
||||
|
||||
; shuffle xDxC -> DC00
|
||||
_SHUF_DC00:
|
||||
ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
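As a reading aid only: the _SHUF_00BA / _SHUF_DC00 masks above and the s0/s1 steps in these benchmark kernels implement the standard SHA-256 message-schedule recurrence, several words per pass. Below is a scalar C sketch of that recurrence; ror32, s0, s1 and sha256_schedule are illustrative names, not functions provided by this commit.

```
#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

/* sigma0/sigma1 of the schedule: the "ror 7 ^ ror 18 ^ >> 3" and
 * "ror 17 ^ ror 19 ^ >> 10" combinations the vector code computes. */
static inline uint32_t s0(uint32_t x) { return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); }
static inline uint32_t s1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }

/* Extend the 16 big-endian input words of one block to the full 64-word schedule. */
static void sha256_schedule(uint32_t w[64])
{
	for (int t = 16; t < 64; t++)
		w[t] = s1(w[t - 2]) + w[t - 7] + s0(w[t - 15]) + w[t - 16];
}
```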
|
||||
1507
nostrdb/ccan/ccan/crypto/sha256/benchmarks/sha256_avx2_rorx8.asm
Normal file
File diff suppressed because it is too large
544
nostrdb/ccan/ccan/crypto/sha256/benchmarks/sha256_sse4.asm
Normal file
@@ -0,0 +1,544 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright (c) 2012, Intel Corporation
|
||||
;
|
||||
; All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions are
|
||||
; met:
|
||||
;
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
;
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in the
|
||||
; documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
;
|
||||
; * Neither the name of the Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived from
|
||||
; this software without specific prior written permission.
|
||||
;
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
|
||||
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
|
||||
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;
|
||||
; Example YASM command lines:
|
||||
; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
|
||||
; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
|
||||
;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;
|
||||
; This code is described in an Intel White-Paper:
|
||||
; "Fast SHA-256 Implementations on Intel Architecture Processors"
|
||||
;
|
||||
; To find it, surf to http://www.intel.com/p/en_US/embedded
|
||||
; and search for that title.
|
||||
; The paper is expected to be released roughly at the end of April, 2012
|
||||
;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; This code schedules 1 block at a time, with 4 lanes per block
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%define MOVDQ movdqu ;; assume buffers not aligned
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
|
||||
|
||||
; addm [mem], reg
|
||||
; Add reg to mem using reg-mem add and store
|
||||
%macro addm 2
|
||||
add %2, %1
|
||||
mov %1, %2
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
|
||||
; Load xmm with mem and byte swap each dword
|
||||
%macro COPY_XMM_AND_BSWAP 3
|
||||
MOVDQ %1, %2
|
||||
pshufb %1, %3
|
||||
%endmacro
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
%define X0 xmm4
|
||||
%define X1 xmm5
|
||||
%define X2 xmm6
|
||||
%define X3 xmm7
|
||||
|
||||
%define XTMP0 xmm0
|
||||
%define XTMP1 xmm1
|
||||
%define XTMP2 xmm2
|
||||
%define XTMP3 xmm3
|
||||
%define XTMP4 xmm8
|
||||
%define XFER xmm9
|
||||
|
||||
%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
|
||||
%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
|
||||
%define BYTE_FLIP_MASK xmm12
|
||||
|
||||
%ifdef LINUX
|
||||
%define NUM_BLKS rdx ; 3rd arg
|
||||
%define CTX rsi ; 2nd arg
|
||||
%define INP rdi ; 1st arg
|
||||
|
||||
%define SRND rdi ; clobbers INP
|
||||
%define c ecx
|
||||
%define d r8d
|
||||
%define e edx
|
||||
%else
|
||||
%define NUM_BLKS r8 ; 3rd arg
|
||||
%define CTX rdx ; 2nd arg
|
||||
%define INP rcx ; 1st arg
|
||||
|
||||
%define SRND rcx ; clobbers INP
|
||||
%define c edi
|
||||
%define d esi
|
||||
%define e r8d
|
||||
|
||||
%endif
|
||||
%define TBL rbp
|
||||
%define a eax
|
||||
%define b ebx
|
||||
|
||||
%define f r9d
|
||||
%define g r10d
|
||||
%define h r11d
|
||||
|
||||
%define y0 r13d
|
||||
%define y1 r14d
|
||||
%define y2 r15d
|
||||
|
||||
|
||||
|
||||
_INP_END_SIZE equ 8
|
||||
_INP_SIZE equ 8
|
||||
_XFER_SIZE equ 8
|
||||
%ifdef LINUX
|
||||
_XMM_SAVE_SIZE equ 0
|
||||
%else
|
||||
_XMM_SAVE_SIZE equ 7*16
|
||||
%endif
|
||||
; STACK_SIZE plus pushes must be an odd multiple of 8
|
||||
_ALIGN_SIZE equ 8
|
||||
|
||||
_INP_END equ 0
|
||||
_INP equ _INP_END + _INP_END_SIZE
|
||||
_XFER equ _INP + _INP_SIZE
|
||||
_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
|
||||
STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
|
||||
|
||||
; rotate_Xs
|
||||
; Rotate values of symbols X0...X3
|
||||
%macro rotate_Xs 0
|
||||
%xdefine X_ X0
|
||||
%xdefine X0 X1
|
||||
%xdefine X1 X2
|
||||
%xdefine X2 X3
|
||||
%xdefine X3 X_
|
||||
%endm
|
||||
|
||||
; ROTATE_ARGS
|
||||
; Rotate values of symbols a...h
|
||||
%macro ROTATE_ARGS 0
|
||||
%xdefine TMP_ h
|
||||
%xdefine h g
|
||||
%xdefine g f
|
||||
%xdefine f e
|
||||
%xdefine e d
|
||||
%xdefine d c
|
||||
%xdefine c b
|
||||
%xdefine b a
|
||||
%xdefine a TMP_
|
||||
%endm
|
||||
|
||||
%macro FOUR_ROUNDS_AND_SCHED 0
|
||||
;; compute s0 four at a time and s1 two at a time
|
||||
;; compute W[-16] + W[-7] 4 at a time
|
||||
movdqa XTMP0, X3
|
||||
mov y0, e ; y0 = e
|
||||
ror y0, (25-11) ; y0 = e >> (25-11)
|
||||
mov y1, a ; y1 = a
|
||||
palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
|
||||
ror y1, (22-13) ; y1 = a >> (22-13)
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
mov y2, f ; y2 = f
|
||||
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
movdqa XTMP1, X1
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13)
|
||||
xor y2, g ; y2 = f^g
|
||||
paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
;; compute s0
|
||||
palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
|
||||
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
|
||||
movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
pslld XTMP1, (32-7)
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
psrld XTMP2, 7
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
|
||||
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
|
||||
mov y0, e ; y0 = e
|
||||
mov y1, a ; y1 = a
|
||||
movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
|
||||
ror y0, (25-11) ; y0 = e >> (25-11)
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
mov y2, f ; y2 = f
|
||||
ror y1, (22-13) ; y1 = a >> (22-13)
|
||||
pslld XTMP3, (32-18)
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13)
|
||||
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
xor y2, g ; y2 = f^g
|
||||
psrld XTMP2, 18
|
||||
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
pxor XTMP1, XTMP3
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
|
||||
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
pxor XTMP1, XTMP4 ; XTMP1 = s0
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
;; compute low s1
|
||||
pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
|
||||
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
|
||||
mov y0, e ; y0 = e
|
||||
mov y1, a ; y1 = a
|
||||
ror y0, (25-11) ; y0 = e >> (25-11)
|
||||
movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
ror y1, (22-13) ; y1 = a >> (22-13)
|
||||
mov y2, f ; y2 = f
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13)
|
||||
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
|
||||
xor y2, g ; y2 = f^g
|
||||
psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
|
||||
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
pxor XTMP2, XTMP3
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
|
||||
pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
;; compute high s1
|
||||
pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
|
||||
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
|
||||
mov y0, e ; y0 = e
|
||||
ror y0, (25-11) ; y0 = e >> (25-11)
|
||||
mov y1, a ; y1 = a
|
||||
movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
|
||||
ror y1, (22-13) ; y1 = a >> (22-13)
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
mov y2, f ; y2 = f
|
||||
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13)
|
||||
xor y2, g ; y2 = f^g
|
||||
psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
pxor XTMP2, XTMP3
|
||||
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
|
||||
pxor X0, XTMP2 ; X0 = s1 {xDxC}
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
rotate_Xs
|
||||
%endm
|
||||
|
||||
;; input is [rsp + _XFER + %1 * 4]
|
||||
%macro DO_ROUND 1
|
||||
mov y0, e ; y0 = e
|
||||
ror y0, (25-11) ; y0 = e >> (25-11)
|
||||
mov y1, a ; y1 = a
|
||||
xor y0, e ; y0 = e ^ (e >> (25-11))
|
||||
ror y1, (22-13) ; y1 = a >> (22-13)
|
||||
mov y2, f ; y2 = f
|
||||
xor y1, a ; y1 = a ^ (a >> (22-13)
|
||||
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
xor y2, g ; y2 = f^g
|
||||
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
and y2, e ; y2 = (f^g)&e
|
||||
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
xor y2, g ; y2 = CH = ((f^g)&e)^g
|
||||
add y2, y0 ; y2 = S1 + CH
|
||||
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
|
||||
mov y0, a ; y0 = a
|
||||
add h, y2 ; h = h + S1 + CH + k + w
|
||||
mov y2, a ; y2 = a
|
||||
or y0, c ; y0 = a|c
|
||||
add d, h ; d = d + h + S1 + CH + k + w
|
||||
and y2, c ; y2 = a&c
|
||||
and y0, b ; y0 = (a|c)&b
|
||||
add h, y1 ; h = h + S1 + CH + k + w + S0
|
||||
or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
|
||||
ROTATE_ARGS
|
||||
%endm
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
|
||||
;; arg 1 : pointer to input data
|
||||
;; arg 2 : pointer to digest
|
||||
;; arg 3 : Num blocks
|
||||
section .text
|
||||
global sha256_sse4
|
||||
align 32
|
||||
sha256_sse4:
|
||||
push rbx
|
||||
%ifndef LINUX
|
||||
push rsi
|
||||
push rdi
|
||||
%endif
|
||||
push rbp
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
|
||||
sub rsp,STACK_SIZE
|
||||
%ifndef LINUX
|
||||
movdqa [rsp + _XMM_SAVE + 0*16],xmm6
|
||||
movdqa [rsp + _XMM_SAVE + 1*16],xmm7
|
||||
movdqa [rsp + _XMM_SAVE + 2*16],xmm8
|
||||
movdqa [rsp + _XMM_SAVE + 3*16],xmm9
|
||||
movdqa [rsp + _XMM_SAVE + 4*16],xmm10
|
||||
movdqa [rsp + _XMM_SAVE + 5*16],xmm11
|
||||
movdqa [rsp + _XMM_SAVE + 6*16],xmm12
|
||||
%endif
|
||||
|
||||
shl NUM_BLKS, 6 ; convert to bytes
|
||||
jz done_hash
|
||||
add NUM_BLKS, INP ; pointer to end of data
|
||||
mov [rsp + _INP_END], NUM_BLKS
|
||||
|
||||
;; load initial digest
|
||||
mov a,[4*0 + CTX]
|
||||
mov b,[4*1 + CTX]
|
||||
mov c,[4*2 + CTX]
|
||||
mov d,[4*3 + CTX]
|
||||
mov e,[4*4 + CTX]
|
||||
mov f,[4*5 + CTX]
|
||||
mov g,[4*6 + CTX]
|
||||
mov h,[4*7 + CTX]
|
||||
|
||||
movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
|
||||
movdqa SHUF_00BA, [_SHUF_00BA wrt rip]
|
||||
movdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
|
||||
|
||||
loop0:
|
||||
lea TBL,[K256 wrt rip]
|
||||
|
||||
;; byte swap first 16 dwords
|
||||
COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
|
||||
|
||||
mov [rsp + _INP], INP
|
||||
|
||||
;; schedule 48 input dwords, by doing 3 rounds of 16 each
|
||||
mov SRND, 3
|
||||
align 16
|
||||
loop1:
|
||||
movdqa XFER, [TBL + 0*16]
|
||||
paddd XFER, X0
|
||||
movdqa [rsp + _XFER], XFER
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
movdqa XFER, [TBL + 1*16]
|
||||
paddd XFER, X0
|
||||
movdqa [rsp + _XFER], XFER
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
movdqa XFER, [TBL + 2*16]
|
||||
paddd XFER, X0
|
||||
movdqa [rsp + _XFER], XFER
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
movdqa XFER, [TBL + 3*16]
|
||||
paddd XFER, X0
|
||||
movdqa [rsp + _XFER], XFER
|
||||
add TBL, 4*16
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
sub SRND, 1
|
||||
jne loop1
|
||||
|
||||
mov SRND, 2
|
||||
loop2:
|
||||
paddd X0, [TBL + 0*16]
|
||||
movdqa [rsp + _XFER], X0
|
||||
DO_ROUND 0
|
||||
DO_ROUND 1
|
||||
DO_ROUND 2
|
||||
DO_ROUND 3
|
||||
paddd X1, [TBL + 1*16]
|
||||
movdqa [rsp + _XFER], X1
|
||||
add TBL, 2*16
|
||||
DO_ROUND 0
|
||||
DO_ROUND 1
|
||||
DO_ROUND 2
|
||||
DO_ROUND 3
|
||||
|
||||
movdqa X0, X2
|
||||
movdqa X1, X3
|
||||
|
||||
sub SRND, 1
|
||||
jne loop2
|
||||
|
||||
addm [4*0 + CTX],a
|
||||
addm [4*1 + CTX],b
|
||||
addm [4*2 + CTX],c
|
||||
addm [4*3 + CTX],d
|
||||
addm [4*4 + CTX],e
|
||||
addm [4*5 + CTX],f
|
||||
addm [4*6 + CTX],g
|
||||
addm [4*7 + CTX],h
|
||||
|
||||
mov INP, [rsp + _INP]
|
||||
add INP, 64
|
||||
cmp INP, [rsp + _INP_END]
|
||||
jne loop0
|
||||
|
||||
done_hash:
|
||||
%ifndef LINUX
|
||||
movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
|
||||
movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
|
||||
movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
|
||||
movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
|
||||
movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
|
||||
movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
|
||||
movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
|
||||
%endif
|
||||
|
||||
add rsp, STACK_SIZE
|
||||
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop rbp
|
||||
%ifndef LINUX
|
||||
pop rdi
|
||||
pop rsi
|
||||
%endif
|
||||
pop rbx
|
||||
|
||||
ret
|
||||
|
||||
|
||||
section .data
|
||||
align 64
|
||||
K256:
|
||||
dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||
dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||||
dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||||
dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||||
dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||||
dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||||
dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||||
dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||||
dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||||
dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||||
dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||||
dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||||
dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||||
dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
|
||||
PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
|
||||
|
||||
; shuffle xBxA -> 00BA
|
||||
_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
|
||||
|
||||
; shuffle xDxC -> DC00
|
||||
_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
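The register comments in FOUR_ROUNDS_AND_SCHED and DO_ROUND (S1, CH, S0, MAJ, and the k + w dword staged at [rsp + _XFER]) all describe the ordinary scalar SHA-256 round. The sketch below restates that round in C using the same CH/MAJ formulations as the comments; sha256_round and its state-array layout are illustrative only, not something this commit defines.

```
#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

/* One scalar SHA-256 round; kw is K[t] + W[t], the value the asm stages in _XFER.
 * State order follows the a..h register assignments. */
static void sha256_round(uint32_t s[8], uint32_t kw)
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

	uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
	uint32_t ch  = ((f ^ g) & e) ^ g;       /* CH in the comments */
	uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
	uint32_t maj = ((a | c) & b) | (a & c); /* MAJ in the comments */
	uint32_t t1  = h + S1 + ch + kw;

	s[7] = g; s[6] = f; s[5] = e;
	s[4] = d + t1;          /* "d = d + h + S1 + CH + k + w" */
	s[3] = c; s[2] = b; s[1] = a;
	s[0] = t1 + S0 + maj;   /* "h = h + S1 + CH + k + w + S0 + MAJ" */
}
```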
|
||||
nostrdb/ccan/ccan/crypto/sha256/sha256.c
@@ -6,9 +6,9 @@
|
||||
* Distributed under the MIT software license, see the accompanying
|
||||
* file COPYING or http://www.opensource.org/licenses/mit-license.php.
|
||||
*/
|
||||
#include "sha256.h"
|
||||
#include "ccan/endian/endian.h"
|
||||
#include "ccan/compiler/compiler.h"
|
||||
#include <ccan/crypto/sha256/sha256.h>
|
||||
#include <ccan/endian/endian.h>
|
||||
#include <ccan/compiler/compiler.h>
|
||||
#include <stdbool.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
@@ -22,7 +22,7 @@ static void invalidate_sha256(struct sha256_ctx *ctx)
|
||||
#endif
|
||||
}
|
||||
|
||||
static void check_sha256(struct sha256_ctx *ctx)
|
||||
static void check_sha256(struct sha256_ctx *ctx UNUSED)
|
||||
{
|
||||
#ifdef CCAN_CRYPTO_SHA256_USE_OPENSSL
|
||||
assert(ctx->c.md_len != 0);
|
||||
@@ -167,6 +167,14 @@ static void Transform(uint32_t *s, const uint32_t *chunk)
|
||||
s[7] += h;
|
||||
}
|
||||
|
||||
static bool alignment_ok(const void *p UNUSED, size_t n UNUSED)
|
||||
{
|
||||
#if HAVE_UNALIGNED_ACCESS
|
||||
return true;
|
||||
#else
|
||||
return ((size_t)p % n == 0);
|
||||
#endif
|
||||
}
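/* Illustrative sketch, not a line of sha256.c: alignment_ok() is presumably
 * used by the 64-byte block loop in add() below to hand aligned input
 * straight to Transform() and to bounce unaligned input through the context
 * buffer, roughly like this hypothetical helper: */
static void transform_one_block(struct sha256_ctx *ctx, const void *p)
{
	if (alignment_ok(p, sizeof(uint32_t)))
		Transform(ctx->s, (const uint32_t *)p);
	else {
		memcpy(ctx->buf.u8, p, sizeof(ctx->buf));
		Transform(ctx->s, ctx->buf.u32);
	}
}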
|
||||
|
||||
static void add(struct sha256_ctx *ctx, const void *p, size_t len)
|
||||
{
|
||||
@@ -195,7 +203,7 @@ static void add(struct sha256_ctx *ctx, const void *p, size_t len)
|
||||
data += 64;
|
||||
len -= 64;
|
||||
}
|
||||
|
||||
|
||||
if (len) {
|
||||
/* Fill the buffer with what remains. */
|
||||
memcpy(ctx->buf.u8 + bufsize, data, len);
|
||||
@@ -240,7 +248,7 @@ void sha256(struct sha256 *sha, const void *p, size_t size)
|
||||
sha256_update(&ctx, p, size);
|
||||
sha256_done(&ctx, sha);
|
||||
}
|
||||
|
||||
|
||||
void sha256_u8(struct sha256_ctx *ctx, uint8_t v)
|
||||
{
|
||||
sha256_update(ctx, &v, sizeof(v));
|
||||
@@ -267,13 +275,13 @@ void sha256_le16(struct sha256_ctx *ctx, uint16_t v)
|
||||
leint16_t lev = cpu_to_le16(v);
|
||||
sha256_update(ctx, &lev, sizeof(lev));
|
||||
}
|
||||
|
||||
|
||||
void sha256_le32(struct sha256_ctx *ctx, uint32_t v)
|
||||
{
|
||||
leint32_t lev = cpu_to_le32(v);
|
||||
sha256_update(ctx, &lev, sizeof(lev));
|
||||
}
|
||||
|
||||
|
||||
void sha256_le64(struct sha256_ctx *ctx, uint64_t v)
|
||||
{
|
||||
leint64_t lev = cpu_to_le64(v);
|
||||
@@ -286,17 +294,15 @@ void sha256_be16(struct sha256_ctx *ctx, uint16_t v)
|
||||
beint16_t bev = cpu_to_be16(v);
|
||||
sha256_update(ctx, &bev, sizeof(bev));
|
||||
}
|
||||
|
||||
|
||||
void sha256_be32(struct sha256_ctx *ctx, uint32_t v)
|
||||
{
|
||||
beint32_t bev = cpu_to_be32(v);
|
||||
sha256_update(ctx, &bev, sizeof(bev));
|
||||
}
|
||||
|
||||
|
||||
void sha256_be64(struct sha256_ctx *ctx, uint64_t v)
|
||||
{
|
||||
beint64_t bev = cpu_to_be64(v);
|
||||
sha256_update(ctx, &bev, sizeof(bev));
|
||||
}
|
||||
|
||||
|
||||
|
||||
nostrdb/ccan/ccan/crypto/sha256/sha256.h
@@ -1,14 +1,7 @@
|
||||
|
||||
#ifndef CCAN_CRYPTO_SHA256_H
|
||||
#define CCAN_CRYPTO_SHA256_H
|
||||
|
||||
|
||||
/** Output length for `wally_sha256` */
|
||||
#define SHA256_LEN 32
|
||||
|
||||
|
||||
/* BSD-MIT - see LICENSE file for details */
|
||||
/* #include "config.h" */
|
||||
#include "config.h"
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
@@ -151,5 +144,4 @@ void sha256_le64(struct sha256_ctx *ctx, uint64_t v);
|
||||
void sha256_be16(struct sha256_ctx *ctx, uint16_t v);
|
||||
void sha256_be32(struct sha256_ctx *ctx, uint32_t v);
|
||||
void sha256_be64(struct sha256_ctx *ctx, uint64_t v);
|
||||
|
||||
#endif /* CCAN_CRYPTO_SHA256_H */