104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson/*
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * CDDL HEADER START
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * The contents of this file are subject to the terms of the
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Common Development and Distribution License (the "License").
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * You may not use this file except in compliance with the License.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * or http://www.opensolaris.org/os/licensing.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * See the License for the specific language governing permissions
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * and limitations under the License.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * When distributing Covered Code, include this CDDL HEADER in each
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * If applicable, add the following below this CDDL HEADER, with the
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * fields enclosed by brackets "[]" replaced with your own identifying
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * information: Portions Copyright [yyyy] [name of copyright owner]
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * CDDL HEADER END
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson/*
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Copyright (c) 2009 Intel Corporation
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * All Rights Reserved.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson/*
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Use is subject to license terms.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson/*
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * instructions. This file contains an accelerated
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Galois Field Multiplication implementation.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * carry-less multiplication. More information about PCLMULQDQ can be
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * found at:
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * http://software.intel.com/en-us/articles/
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson/*
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * ====================================================================
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * OpenSolaris OS modifications
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * This source originates as file galois_hash_asm.c from
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Intel Corporation dated September 21, 2009.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * This OpenSolaris version has these major changes from the original source:
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * definition for lint.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * 2. Formatted code, added comments, and added #includes and #defines.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * calling kpreempt_disable() and kpreempt_enable().
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * If the TS bit is not set, save and restore %xmm registers at the beginning
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * and end of function calls (%xmm* registers are not saved and restored
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * during kernel thread preemption).
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson * 4. Removed code to perform hashing. This is already done with C macro
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * GHASH in gcm.c. For better performance, this removed code should be
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * reintegrated in the future to replace the C GHASH macro.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson * 5. Added code to byte swap 16-byte input and output.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson * 6. Folded in comments from the original C source with embedded assembly
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * (SB_w_shift_xor.c)
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson * 7. Renamed function and reordered parameters to match OpenSolaris:
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Intel interface:
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * void galois_hash_asm(unsigned char *hk, unsigned char *s,
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * unsigned char *d, int length)
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * OpenSolaris OS interface:
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * ====================================================================
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#if defined(lint) || defined(__lint)
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#include <sys/types.h>
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson/* ARGSUSED */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Andersonvoid
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Andersongcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) { /* empty stub, compiled only when lint/__lint is defined */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson}
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#else /* lint */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#include <sys/asm_linkage.h>
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#include <sys/controlregs.h>
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#ifdef _KERNEL
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#include <sys/machprivregs.h>
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#endif
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#ifdef _KERNEL
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson /*
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * PROTECTED_CLTS: invoke the CLTS macro while preserving P2 (%rsi).
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Note: under i86xpv the CLTS macro clobbers %rsi. That is, it calls
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * HYPERVISOR_fpu_taskswitch() which modifies %rsi when it uses it to
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * pass P2 to syscall. This also occurs with the STTS macro, but we
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * don't care if P2 (%rsi) is modified just before function exit.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * The CLTS and STTS macros push and pop P1 (%rdi) already.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#ifdef __xpv
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#define PROTECTED_CLTS \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson push %rsi; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson CLTS; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pop %rsi
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#else /* !__xpv */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#define PROTECTED_CLTS \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson CLTS
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#endif /* __xpv */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson /*
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Save %rbp/%rsp and read %cr0 into tmpreg. If CR0_TS is not set, align
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * %rsp and store %xmm0 - %xmm10 on the stack, otherwise clear CR0_TS.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson push %rbp; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson mov %rsp, %rbp; \
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson movq %cr0, tmpreg; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson testq $CR0_TS, tmpreg; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson jnz 1f; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson and $-XMM_ALIGN, %rsp; /* movaps below needs aligned slots */ \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson sub $[XMM_SIZE * 11], %rsp; /* one slot per saved register */ \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps %xmm0, 160(%rsp); \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps %xmm1, 144(%rsp); \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps %xmm2, 128(%rsp); \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps %xmm3, 112(%rsp); \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps %xmm4, 96(%rsp); \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps %xmm5, 80(%rsp); \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps %xmm6, 64(%rsp); \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps %xmm7, 48(%rsp); \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps %xmm8, 32(%rsp); \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps %xmm9, 16(%rsp); \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps %xmm10, (%rsp); \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson jmp 2f; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson1: \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson PROTECTED_CLTS; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson2:
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson /*
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * If tmpreg (the %cr0 value read on entry) has CR0_TS clear, reload %xmm0 -
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * %xmm10 from the stack, otherwise set CR0_TS; then restore %rsp and %rbp.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson testq $CR0_TS, tmpreg; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson jnz 1f; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps (%rsp), %xmm10; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps 16(%rsp), %xmm9; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps 32(%rsp), %xmm8; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps 48(%rsp), %xmm7; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps 64(%rsp), %xmm6; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps 80(%rsp), %xmm5; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps 96(%rsp), %xmm4; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps 112(%rsp), %xmm3; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps 128(%rsp), %xmm2; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps 144(%rsp), %xmm1; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps 160(%rsp), %xmm0; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson jmp 2f; \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson1: \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson STTS(tmpreg); \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson2: \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson mov %rbp, %rsp; /* undoes the alignment and save area, if any */ \
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pop %rbp
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#else
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#define PROTECTED_CLTS /* _KERNEL not defined: expands to nothing */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) /* _KERNEL not defined: expands to nothing */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg) /* _KERNEL not defined: expands to nothing */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson#endif /* _KERNEL */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson/*
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Use this mask to byte-swap a 16-byte integer with the pshufb instruction
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson// static uint8_t byte_swap16_mask[] = {
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson.text
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson.align XMM_ALIGN
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson.Lbyte_swap16_mask:
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 // selector i = 15 - i: reverses all 16 bytes
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson/*
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Perform a carry-less multiplication (that is, combine partial products
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson * with XOR instead of addition) on P1 and P2 and place the result in P3.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Byte swap the input and the output.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Note: x_in, y, and res each point to a 16-byte number
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * (an array of two 64-bit integers).
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Note2: For kernel code, caller is responsible for ensuring
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * kpreempt_disable() has been called. This is because %xmm registers are
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * not saved/restored. Clear and set the CR0.TS bit on entry and exit,
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson * respectively, if TS is set on entry. Otherwise, if TS is not set,
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * save and restore %xmm registers on the stack.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Note3: Original Intel definition:
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * void galois_hash_asm(unsigned char *hk, unsigned char *s,
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * unsigned char *d, int length)
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson *
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Note4: Register/parameter mapping:
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Intel:
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Parameter 1: %rcx (copied to %xmm0) hk or x_in
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Parameter 2: %rdx (copied to %xmm1) s or y
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Parameter 3: %rdi (result) d or res
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * OpenSolaris:
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Parameter 1: %rdi (copied to %xmm0) x_in
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Parameter 2: %rsi (copied to %xmm1) y
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson * Parameter 3: %rdx (result) res
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson */
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris AndersonENTRY_NP(gcm_mul_pclmulqdq)
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10) // kernel builds: %r10 = %cr0 as read on entry
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson //
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson // Copy Parameters
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson //
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson movdqu (%rdi), %xmm0 // P1: xmm0 = 16 bytes at x_in
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson movdqu (%rsi), %xmm1 // P2: xmm1 = 16 bytes at y
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson //
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson // Byte swap 16-byte input
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson //
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson lea .Lbyte_swap16_mask(%rip), %rax // %rax = address of the pshufb mask
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movaps (%rax), %xmm10 // aligned load; the mask follows .align XMM_ALIGN
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson pshufb %xmm10, %xmm0
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson pshufb %xmm10, %xmm1
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson //
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson // Multiply with the hash key
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson //
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movdqu %xmm0, %xmm3
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movdqu %xmm0, %xmm4
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movdqu %xmm0, %xmm5
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movdqu %xmm0, %xmm6
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson psrldq $8, %xmm4 // shift xmm4 by 64 bits to the right
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pslldq $8, %xmm5 // shift xmm5 by 64 bits to the left
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pxor %xmm5, %xmm3
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pxor %xmm4, %xmm6 // Register pair <xmm6:xmm3> holds the result
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson // of the carry-less multiplication of
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson // xmm0 by xmm1.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson // We shift the result of the multiplication by one bit position
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson // to the left to compensate for the fact that the bits are reversed.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movdqu %xmm3, %xmm7
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movdqu %xmm6, %xmm8
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pslld $1, %xmm3 // each dword << 1; the lost top bits are re-inserted below
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pslld $1, %xmm6
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson psrld $31, %xmm7 // xmm7/xmm8 = former top bit of each dword
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson psrld $31, %xmm8
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movdqu %xmm7, %xmm9
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pslldq $4, %xmm8 // move each carried bit up to the next dword
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pslldq $4, %xmm7
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson psrldq $12, %xmm9 // xmm9 = bit carried out of xmm3's top dword
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson por %xmm7, %xmm3
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson por %xmm8, %xmm6
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson por %xmm9, %xmm6 // carry from xmm3 enters the bottom of xmm6
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson //
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson // First phase of the reduction
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson //
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson // independently.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movdqu %xmm3, %xmm7
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movdqu %xmm3, %xmm8
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movdqu %xmm3, %xmm9
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pslld $31, %xmm7 // packed left shift, each dword << 31
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pslld $30, %xmm8 // packed left shift, each dword << 30
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pslld $25, %xmm9 // packed left shift, each dword << 25
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pxor %xmm8, %xmm7 // xor the shifted versions
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pxor %xmm9, %xmm7
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movdqu %xmm7, %xmm8
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pslldq $12, %xmm7
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson psrldq $4, %xmm8
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pxor %xmm7, %xmm3 // first phase of the reduction complete
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson //
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson // Second phase of the reduction
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson //
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson // shift operations.
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movdqu %xmm3, %xmm2
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movdqu %xmm3, %xmm4
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson movdqu %xmm3, %xmm5
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson psrld $1, %xmm2 // packed right shift, each dword >> 1
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson psrld $2, %xmm4 // packed right shift, each dword >> 2
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson psrld $7, %xmm5 // packed right shift, each dword >> 7
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pxor %xmm4, %xmm2 // xor the shifted versions
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pxor %xmm5, %xmm2
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pxor %xmm8, %xmm2
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pxor %xmm2, %xmm3
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson pxor %xmm3, %xmm6 // the result is in xmm6
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson //
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson // Byte swap 16-byte result
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson //
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson pshufb %xmm10, %xmm6 // %xmm10 has the swap mask
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson //
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson // Store the result
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson //
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson movdqu %xmm6, (%rdx) // P3: 16 bytes stored at res
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson //
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson // Cleanup and Return
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson //
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson SET_TS_OR_POP_XMM_REGISTERS(%r10) // tests the %cr0 copy left in %r10 by the entry macro
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson ret
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson SET_SIZE(gcm_mul_pclmulqdq)
104d3bde5b4ac46904f144d3676110fc57a69603Dan OpenSolaris Anderson
8de5c4f463386063e184a851437d58080c6c626cDan OpenSolaris Anderson#endif /* lint || __lint */