/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2009 Intel Corporation
* All Rights Reserved.
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Accelerated GHASH implementation with Intel PCLMULQDQ-NI
* instructions. This file contains an accelerated
* Galois Field Multiplication implementation.
*
* PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
* carry-less multiplication. More information about PCLMULQDQ can be
* found at:
* http://software.intel.com/en-us/articles/
* carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
*
*/
/*
* ====================================================================
* OpenSolaris OS modifications
*
* This source originates as file galois_hash_asm.c from
* Intel Corporation dated September 21, 2009.
*
* This OpenSolaris version has these major changes from the original source:
*
* 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
* /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
* definition for lint.
*
* 2. Formatted code, added comments, and added #includes and #defines.
*
* 3. If bit CR0.TS is set, clear and set the TS bit, after and before
* calling kpreempt_disable() and kpreempt_enable().
* If the TS bit is not set, save and restore %xmm registers at the beginning
* and end of function calls (%xmm* registers are not saved and restored
* during kernel thread preemption).
*
* 4. Removed code to perform hashing. This is already done with C macro
* GHASH in gcm.c. For better performance, this removed code should be
* reintegrated in the future to replace the C GHASH macro.
*
* 5. Added code to byte swap 16-byte input and output.
*
* 6. Folded in comments from the original C source with embedded assembly
* (SB_w_shift_xor.c)
*
* 7. Renamed function and reordered parameters to match OpenSolaris:
* Intel interface:
* void galois_hash_asm(unsigned char *hk, unsigned char *s,
* unsigned char *d, int length)
* OpenSolaris OS interface:
* void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
* ====================================================================
*/
/*
 * Lint-only placeholder for the assembly routine gcm_mul_pclmulqdq().
 *
 * The body is intentionally empty: it exists only so that lint sees a C
 * definition matching the documented OpenSolaris interface
 *	void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 * None of the three arguments is read or written here.
 *
 * NOTE(review): this stub is the lint branch that the '#else' (lint)
 * directive following it closes; confirm that the matching
 * '#if defined(lint) || defined(__lint)' guard is present above it.
 */
#include <sys/types.h>

/* ARGSUSED */
void
gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res)
{
}
#else /* lint */
#include <sys/asm_linkage.h>
#include <sys/controlregs.h>
#ifdef _KERNEL
#include <sys/machprivregs.h>
#endif
#ifdef _KERNEL
/*
* Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is,
* it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
* uses it to pass P2 to syscall.
* This also occurs with the STTS macro, but we don't care if
* P2 (%rsi) is modified just before function exit.
* The CLTS and STTS macros push and pop P1 (%rdi) already.
*/
/*
 * PROTECTED_CLTS: invoke CLTS without losing P2 (%rsi).
 *
 * Under i86xpv (__xpv) the CLTS macro clobbers %rsi, so it is saved on the
 * stack before CLTS and restored afterwards. On other kernels CLTS is used
 * as is.
 *
 * The last line of each definition must not end in a backslash; otherwise
 * the following #else / #endif directive is spliced into the macro body
 * and the conditional is left unterminated.
 */
#ifdef __xpv
#define	PROTECTED_CLTS \
	push	%rsi; \
	CLTS; \
	pop	%rsi
#else
#define	PROTECTED_CLTS \
	CLTS
#endif	/* __xpv */
/*
* If CR0_TS is not set, align stack (with push %rbp) and push
* %xmm0 - %xmm10 on stack, otherwise clear CR0_TS
*/
jnz 1f; \
jmp 2f; \
1: \
2:
/*
* If CR0_TS was not set above, pop %xmm0 - %xmm10 off stack,
* otherwise set CR0_TS.
*/
jnz 1f; \
jmp 2f; \
1: \
2: \
#else
#define PROTECTED_CLTS
#endif /* _KERNEL */
/*
* Use this mask to byte-swap a 16-byte integer with the pshufb instruction
*/
// static uint8_t byte_swap16_mask[] = {
// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
.text
/*
* void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
*
* Perform a carry-less multiplication (that is, use XOR instead of the
* multiply operator) on P1 and P2 and place the result in P3.
*
* Byte swap the input and the output.
*
* Note: x_in, y, and res all point to a block of 16-byte numbers
* (an array of two 64-bit integers).
*
* Note2: For kernel code, caller is responsible for ensuring
* kpreempt_disable() has been called. This is because %xmm registers are
* not saved/restored. Clear and set the CR0.TS bit on entry and exit,
* respectively, if TS is set on entry. Otherwise, if TS is not set,
* save and restore %xmm registers on the stack.
*
* Note3: Original Intel definition:
* void galois_hash_asm(unsigned char *hk, unsigned char *s,
* unsigned char *d, int length)
*
* Intel:
* Parameter 1: %rcx (copied to %xmm0) hk or x_in
* Parameter 2: %rdx (copied to %xmm1) s or y
* Parameter 3: %rdi (result) d or res
* OpenSolaris:
* Parameter 1: %rdi (copied to %xmm0) x_in
* Parameter 2: %rsi (copied to %xmm1) y
* Parameter 3: %rdx (result) res
*/
//
// Copy Parameters
//
//
// Byte swap 16-byte input
//
//
// Multiply with the hash key
//
// <xmm6:xmm3> holds the result
// of the carry-less multiplication of
// xmm0 by xmm1.
// We shift the result of the multiplication by one bit position
// to the left to compensate for the fact that the bits are reversed.
//
// First phase of the reduction
//
// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
// independently.
//
// Second phase of the reduction
//
// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
// shift operations.
//
// Byte swap 16-byte result
//
//
// Store the result
//
//
// Cleanup and Return
//
#endif /* lint || __lint */