/*
 * sboxes-xopintrin128.c :
 * Bitslice DES for AMD Next Generation Processor with AVX/XOP
 */

/*
 * Copyright 2009 DumplingerBoy (Dango-Chu). All Right Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *  notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *  notice, this list of conditions and the following disclaimer in the
 *  documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <x86intrin.h>
#include <xopintrin.h>

/*
 * 128-bit constant with every bit set.  XOR-ing a value with it yields the
 * bitwise complement of that value.  The two 64-bit lanes are written as
 * -1 so the initializer already has the signed lane type.
 */
static const __m128i MM128_ALL_FF = { -1LL, -1LL };

/*
 * s1 - straight-line bitwise network over six 128-bit inputs.
 *
 * Per the file header this is bitslice DES, so this is presumably DES
 * S-box 1 (not verified against the S-box table here).  Every operation is
 * purely bitwise, so each of the 128 bit positions is evaluated
 * independently of the others.
 *
 * a1..a6     input words.
 * out1..out4 accumulators: each result is XOR-ed into the pointed-to word;
 *            the previous contents are not overwritten.
 *
 * _mm_cmov_si128(a, b, sel) takes bits from a where sel is 1 and from b
 * where sel is 0.  XOR with MM128_ALL_FF is a bitwise NOT.
 */
void s1(__m128i a1, __m128i a2, __m128i a3,
	__m128i a4, __m128i a5, __m128i a6,
	__m128i *out1, __m128i *out2,
	__m128i *out3, __m128i *out4)
{
	/* Nodes leading to out1 (several are reused further down). */
	const __m128i x1 = _mm_andnot_si128(a2, a1);
	const __m128i x2 = _mm_cmov_si128(a2, x1, a4);
	const __m128i x3 = _mm_xor_si128(x2, MM128_ALL_FF);	/* ~x2 */
	const __m128i x4 = _mm_xor_si128(x3, a6);
	const __m128i x5 = _mm_cmov_si128(x3, a6, a4);
	const __m128i x6 = _mm_xor_si128(x1, x5);
	const __m128i x7 = _mm_cmov_si128(x4, x6, a5);
	const __m128i x8 = _mm_cmov_si128(a1, a5, a2);
	const __m128i x9 = _mm_cmov_si128(x7, x8, a4);
	const __m128i x10 = _mm_cmov_si128(x3, a2, a1);
	const __m128i x11 = _mm_cmov_si128(x9, x10, a6);
	const __m128i x12 = _mm_xor_si128(a1, a2);
	const __m128i x13 = _mm_cmov_si128(a1, x3, a2);
	const __m128i x14 = _mm_cmov_si128(x12, x13, a6);
	const __m128i x15 = _mm_cmov_si128(x11, x14, a5);
	const __m128i x16 = _mm_cmov_si128(x7, x15, a3);
	*out1 = _mm_xor_si128(*out1, x16);

	/* Nodes leading to out4. */
	const __m128i x17 = _mm_cmov_si128(a4, a2, x6);
	const __m128i x18 = _mm_xor_si128(x14, x17);
	const __m128i x19 = _mm_xor_si128(x2, x18);
	const __m128i x20 = _mm_cmov_si128(x17, a1, x3);
	const __m128i x21 = _mm_cmov_si128(x19, x20, a6);
	const __m128i x22 = _mm_cmov_si128(x18, x21, a5);
	const __m128i x23 = _mm_cmov_si128(x2, x6, x19);
	const __m128i x24 = _mm_cmov_si128(x9, x20, x6);
	const __m128i x25 = _mm_cmov_si128(x23, x24, a5);
	const __m128i x26 = _mm_cmov_si128(x22, x25, a3);
	*out4 = _mm_xor_si128(*out4, x26);

	/* Nodes leading to out3. */
	const __m128i x27 = _mm_cmov_si128(x19, x24, x14);
	const __m128i x28 = _mm_xor_si128(a2, x27);
	const __m128i x29 = _mm_cmov_si128(x4, x13, a4);
	const __m128i x30 = _mm_cmov_si128(x28, x29, a5);
	const __m128i x31 = _mm_cmov_si128(a6, a2, x28);
	const __m128i x32 = _mm_cmov_si128(x4, x18, a1);
	const __m128i x33 = _mm_cmov_si128(x31, x32, a5);
	const __m128i x34 = _mm_cmov_si128(x30, x33, a3);
	*out3 = _mm_xor_si128(*out3, x34);

	/* Nodes leading to out2. */
	const __m128i x35 = _mm_cmov_si128(x21, a3, x11);
	const __m128i x36 = _mm_xor_si128(x17, x35);
	const __m128i x37 = _mm_xor_si128(a6, x9);
	const __m128i x38 = _mm_xor_si128(x37, x21);
	const __m128i x39 = _mm_cmov_si128(x36, x38, a5);
	const __m128i x40 = _mm_cmov_si128(x19, a2, x32);
	const __m128i x41 = _mm_xor_si128(x11, x40);
	const __m128i x42 = _mm_cmov_si128(x27, x2, x4);
	const __m128i x43 = _mm_xor_si128(x24, x42);
	const __m128i x44 = _mm_cmov_si128(x41, x43, a5);
	const __m128i x45 = _mm_cmov_si128(x39, x44, a3);
	*out2 = _mm_xor_si128(*out2, x45);
}

/*
 * s2 - straight-line bitwise network over six 128-bit inputs.
 *
 * Per the file header this is bitslice DES, so this is presumably DES
 * S-box 2 (not verified against the S-box table here).  Every operation is
 * purely bitwise, so each of the 128 bit positions is evaluated
 * independently of the others.
 *
 * a1..a6     input words.
 * out1..out4 accumulators: each result is XOR-ed into the pointed-to word;
 *            the previous contents are not overwritten.
 *
 * _mm_cmov_si128(a, b, sel) takes bits from a where sel is 1 and from b
 * where sel is 0.  XOR with MM128_ALL_FF is a bitwise NOT.
 */
void s2(__m128i a1, __m128i a2, __m128i a3,
	__m128i a4, __m128i a5, __m128i a6,
	__m128i *out1, __m128i *out2,
	__m128i *out3, __m128i *out4)
{
	/* Nodes leading to out4 (several are reused further down). */
	const __m128i x1 = _mm_xor_si128(a3, a5);
	const __m128i x2 = _mm_xor_si128(x1, MM128_ALL_FF);	/* ~x1 */
	const __m128i x3 = _mm_or_si128(a3, x2);
	const __m128i x4 = _mm_xor_si128(x3, a4);
	const __m128i x5 = _mm_cmov_si128(x2, x4, a2);
	const __m128i x6 = _mm_cmov_si128(a3, x4, a5);
	const __m128i x7 = _mm_cmov_si128(x6, a4, a2);
	const __m128i x8 = _mm_cmov_si128(x5, x7, a6);
	const __m128i x9 = _mm_xor_si128(a2, a5);
	const __m128i x10 = _mm_xor_si128(x9, x6);
	const __m128i x11 = _mm_cmov_si128(a3, x1, x7);
	const __m128i x12 = _mm_xor_si128(x6, x11);
	const __m128i x13 = _mm_cmov_si128(x10, x12, a6);
	const __m128i x14 = _mm_cmov_si128(x8, x13, a1);
	*out4 = _mm_xor_si128(*out4, x14);

	/* Nodes leading to out2. */
	const __m128i x15 = _mm_cmov_si128(a4, x10, a3);
	const __m128i x16 = _mm_xor_si128(a5, x15);
	const __m128i x17 = _mm_cmov_si128(x11, x4, x9);
	const __m128i x18 = _mm_xor_si128(x3, x17);
	const __m128i x19 = _mm_cmov_si128(x16, x18, a6);
	const __m128i x20 = _mm_xor_si128(x16, MM128_ALL_FF);	/* ~x16 */
	const __m128i x21 = _mm_cmov_si128(x3, a2, x9);
	const __m128i x22 = _mm_xor_si128(x5, x21);
	const __m128i x23 = _mm_cmov_si128(x20, x22, a6);
	const __m128i x24 = _mm_cmov_si128(x19, x23, a1);
	*out2 = _mm_xor_si128(*out2, x24);

	/* Nodes leading to out3. */
	const __m128i x25 = _mm_cmov_si128(x21, x18, x4);
	const __m128i x26 = _mm_xor_si128(x11, x25);
	const __m128i x27 = _mm_cmov_si128(x21, a5, x2);
	const __m128i x28 = _mm_xor_si128(x26, x27);
	const __m128i x29 = _mm_cmov_si128(x26, x28, a6);
	const __m128i x30 = _mm_cmov_si128(a3, x22, x26);
	const __m128i x31 = _mm_cmov_si128(x1, x3, a4);
	const __m128i x32 = _mm_xor_si128(a2, x31);
	const __m128i x33 = _mm_cmov_si128(x30, x32, a6);
	const __m128i x34 = _mm_cmov_si128(x29, x33, a1);
	*out3 = _mm_xor_si128(*out3, x34);

	/* Nodes leading to out1. */
	const __m128i x35 = _mm_xor_si128(x20, x30);
	const __m128i x36 = _mm_cmov_si128(x32, x1, a4);
	const __m128i x37 = _mm_cmov_si128(x35, x36, a6);
	const __m128i x38 = _mm_cmov_si128(x32, x1, x25);
	const __m128i x39 = _mm_xor_si128(x27, x38);
	const __m128i x40 = _mm_cmov_si128(x4, x35, x9);
	const __m128i x41 = _mm_cmov_si128(x39, x40, a6);
	const __m128i x42 = _mm_cmov_si128(x37, x41, a1);
	*out1 = _mm_xor_si128(*out1, x42);
}

/*
 * s3 - straight-line bitwise network over six 128-bit inputs.
 *
 * Per the file header this is bitslice DES, so this is presumably DES
 * S-box 3 (not verified against the S-box table here).  Every operation is
 * purely bitwise, so each of the 128 bit positions is evaluated
 * independently of the others.
 *
 * a1..a6     input words.
 * out1..out4 accumulators: each result is XOR-ed into the pointed-to word;
 *            the previous contents are not overwritten.
 *
 * _mm_cmov_si128(a, b, sel) takes bits from a where sel is 1 and from b
 * where sel is 0.  XOR with MM128_ALL_FF is a bitwise NOT.
 */
void s3(__m128i a1, __m128i a2, __m128i a3,
	__m128i a4, __m128i a5, __m128i a6,
	__m128i *out1, __m128i *out2,
	__m128i *out3, __m128i *out4)
{
	/* Nodes leading to out4 (several are reused further down). */
	const __m128i x1 = _mm_xor_si128(a2, a3);
	const __m128i x2 = _mm_xor_si128(x1, MM128_ALL_FF);	/* ~x1 */
	const __m128i x3 = _mm_xor_si128(x2, a6);
	const __m128i x4 = _mm_and_si128(a6, a2);
	const __m128i x5 = _mm_or_si128(x4, x1);
	const __m128i x6 = _mm_cmov_si128(x3, x5, a4);
	const __m128i x7 = _mm_cmov_si128(x3, a1, a2);
	const __m128i x8 = _mm_xor_si128(x6, x7);
	const __m128i x9 = _mm_cmov_si128(x6, x8, a5);
	const __m128i x10 = _mm_xor_si128(x3, MM128_ALL_FF);	/* ~x3 */
	const __m128i x11 = _mm_xor_si128(a4, a6);
	const __m128i x12 = _mm_xor_si128(x11, a2);
	const __m128i x13 = _mm_cmov_si128(x10, x12, a5);
	const __m128i x14 = _mm_cmov_si128(x9, x13, a1);
	*out4 = _mm_xor_si128(*out4, x14);

	/* Nodes leading to out1. */
	const __m128i x15 = _mm_cmov_si128(x7, a6, a3);
	const __m128i x16 = _mm_xor_si128(a4, x15);
	const __m128i x17 = _mm_or_si128(a3, x2);
	const __m128i x18 = _mm_xor_si128(x17, x12);
	const __m128i x19 = _mm_cmov_si128(x16, x18, a5);
	const __m128i x20 = _mm_or_si128(a2, x16);
	const __m128i x21 = _mm_cmov_si128(x6, a4, x20);
	const __m128i x22 = _mm_cmov_si128(x18, x1, a4);
	const __m128i x23 = _mm_xor_si128(x22, MM128_ALL_FF);	/* ~x22 */
	const __m128i x24 = _mm_cmov_si128(x21, x23, a5);
	const __m128i x25 = _mm_cmov_si128(x19, x24, a1);
	*out1 = _mm_xor_si128(*out1, x25);

	/* Nodes leading to out3. */
	const __m128i x26 = _mm_xor_si128(a2, x3);
	const __m128i x27 = _mm_xor_si128(x26, x20);
	const __m128i x28 = _mm_cmov_si128(a3, x4, x18);
	const __m128i x29 = _mm_xor_si128(x1, x28);
	const __m128i x30 = _mm_cmov_si128(x27, x29, a5);
	const __m128i x31 = _mm_cmov_si128(x27, x1, x29);
	const __m128i x32 = _mm_xor_si128(x11, x31);
	const __m128i x33 = _mm_or_si128(x23, x27);
	const __m128i x34 = _mm_xor_si128(x33, x32);
	const __m128i x35 = _mm_cmov_si128(x32, x34, a5);
	const __m128i x36 = _mm_cmov_si128(x30, x35, a1);
	*out3 = _mm_xor_si128(*out3, x36);

	/* Nodes leading to out2. */
	const __m128i x37 = _mm_cmov_si128(x28, x26, x6);
	const __m128i x38 = _mm_cmov_si128(x3, x2, x20);
	const __m128i x39 = _mm_cmov_si128(x37, x38, a5);
	const __m128i x40 = _mm_cmov_si128(x1, x12, x29);
	const __m128i x41 = _mm_cmov_si128(x32, x10, x18);
	const __m128i x42 = _mm_cmov_si128(x40, x41, a5);
	const __m128i x43 = _mm_cmov_si128(x39, x42, a1);
	*out2 = _mm_xor_si128(*out2, x43);
}

/*
 * s4 - straight-line bitwise network over six 128-bit inputs.
 *
 * Per the file header this is bitslice DES, so this is presumably DES
 * S-box 4 (not verified against the S-box table here).  Every operation is
 * purely bitwise, so each of the 128 bit positions is evaluated
 * independently of the others.
 *
 * a1..a6     input words.
 * out1..out4 accumulators: each result is XOR-ed into the pointed-to word;
 *            the previous contents are not overwritten.
 *
 * _mm_cmov_si128(a, b, sel) takes bits from a where sel is 1 and from b
 * where sel is 0.  XOR with MM128_ALL_FF is a bitwise NOT.
 */
void s4(__m128i a1, __m128i a2, __m128i a3,
	__m128i a4, __m128i a5, __m128i a6,
	__m128i *out1, __m128i *out2,
	__m128i *out3, __m128i *out4)
{
	/* Nodes leading to out4 (several are reused further down). */
	const __m128i x1 = _mm_or_si128(a2, a5);
	const __m128i x2 = _mm_xor_si128(x1, MM128_ALL_FF);	/* ~x1 */
	const __m128i x3 = _mm_cmov_si128(x2, a2, a3);
	const __m128i x4 = _mm_or_si128(a5, x2);
	const __m128i x5 = _mm_andnot_si128(x3, x4);
	const __m128i x6 = _mm_cmov_si128(x3, x5, a1);
	const __m128i x7 = _mm_xor_si128(x3, x4);
	const __m128i x8 = _mm_cmov_si128(a2, a5, a3);
	const __m128i x9 = _mm_xor_si128(x5, x8);
	const __m128i x10 = _mm_cmov_si128(x7, x9, a1);
	const __m128i x11 = _mm_cmov_si128(x6, x10, a4);
	const __m128i x12 = _mm_cmov_si128(x6, x10, x9);
	const __m128i x13 = _mm_xor_si128(x4, x12);
	const __m128i x14 = _mm_cmov_si128(x7, a2, x12);
	const __m128i x15 = _mm_xor_si128(a5, x14);
	const __m128i x16 = _mm_cmov_si128(x13, x15, a4);
	const __m128i x17 = _mm_cmov_si128(x11, x16, a6);
	*out4 = _mm_xor_si128(*out4, x17);

	/* Nodes leading to out1. */
	const __m128i x18 = _mm_cmov_si128(x6, a1, a3);
	const __m128i x19 = _mm_xor_si128(x10, x18);
	const __m128i x20 = _mm_xor_si128(x1, x19);
	const __m128i x21 = _mm_cmov_si128(x20, x6, a1);
	const __m128i x22 = _mm_cmov_si128(x19, x21, a4);
	const __m128i x23 = _mm_cmov_si128(x14, x2, x7);
	const __m128i x24 = _mm_xor_si128(a1, x23);
	const __m128i x25 = _mm_cmov_si128(a5, x15, x20);
	const __m128i x26 = _mm_xor_si128(x18, x25);
	const __m128i x27 = _mm_cmov_si128(x24, x26, a4);
	const __m128i x28 = _mm_cmov_si128(x22, x27, a6);
	*out1 = _mm_xor_si128(*out1, x28);

	/* out2: same pair as out1 with the operands swapped, then XOR a6. */
	const __m128i x29 = _mm_cmov_si128(x27, x22, a6);
	const __m128i x30 = _mm_xor_si128(a6, x29);
	*out2 = _mm_xor_si128(*out2, x30);

	/* out3: same pair as out4 with the operands swapped, then XOR a6. */
	const __m128i x31 = _mm_cmov_si128(x16, x11, a6);
	const __m128i x32 = _mm_xor_si128(a6, x31);
	*out3 = _mm_xor_si128(*out3, x32);
}

/*
 * s5 - straight-line bitwise network over six 128-bit inputs.
 *
 * Per the file header this is bitslice DES, so this is presumably DES
 * S-box 5 (not verified against the S-box table here).  Every operation is
 * purely bitwise, so each of the 128 bit positions is evaluated
 * independently of the others.
 *
 * a1..a6     input words.
 * out1..out4 accumulators: each result is XOR-ed into the pointed-to word;
 *            the previous contents are not overwritten.
 *
 * _mm_cmov_si128(a, b, sel) takes bits from a where sel is 1 and from b
 * where sel is 0.  XOR with MM128_ALL_FF is a bitwise NOT.
 */
void s5(__m128i a1, __m128i a2, __m128i a3,
	__m128i a4, __m128i a5, __m128i a6,
	__m128i *out1, __m128i *out2,
	__m128i *out3, __m128i *out4)
{
	/* Nodes leading to out2 (several are reused further down). */
	const __m128i x1 = _mm_xor_si128(a2, a6);
	const __m128i x2 = _mm_xor_si128(x1, MM128_ALL_FF);	/* ~x1 */
	const __m128i x3 = _mm_xor_si128(x2, a5);
	const __m128i x4 = _mm_cmov_si128(x3, x1, a3);
	const __m128i x5 = _mm_xor_si128(a3, x3);
	const __m128i x6 = _mm_cmov_si128(x4, x5, a1);
	const __m128i x7 = _mm_cmov_si128(x5, a5, a2);
	const __m128i x8 = _mm_cmov_si128(x2, a2, x3);
	const __m128i x9 = _mm_cmov_si128(x7, x8, a3);
	const __m128i x10 = _mm_cmov_si128(x8, a5, a3);
	const __m128i x11 = _mm_xor_si128(a6, x10);
	const __m128i x12 = _mm_cmov_si128(x9, x11, a1);
	const __m128i x13 = _mm_cmov_si128(x6, x12, a4);
	*out2 = _mm_xor_si128(*out2, x13);

	/* Nodes leading to out3. */
	const __m128i x14 = _mm_cmov_si128(x7, x4, x8);
	const __m128i x15 = _mm_xor_si128(x11, x14);
	const __m128i x16 = _mm_cmov_si128(x5, x2, a2);
	const __m128i x17 = _mm_cmov_si128(x11, a2, x14);
	const __m128i x18 = _mm_cmov_si128(x16, x17, a3);
	const __m128i x19 = _mm_cmov_si128(x15, x18, a1);
	const __m128i x20 = _mm_cmov_si128(a5, a1, x7);
	const __m128i x21 = _mm_xor_si128(x5, x20);
	const __m128i x22 = _mm_cmov_si128(x18, x3, x7);
	const __m128i x23 = _mm_xor_si128(x14, x22);
	const __m128i x24 = _mm_cmov_si128(x21, x23, a1);
	const __m128i x25 = _mm_cmov_si128(x19, x24, a4);
	*out3 = _mm_xor_si128(*out3, x25);

	/* Nodes leading to out4. */
	const __m128i x26 = _mm_cmov_si128(x3, x10, x23);
	const __m128i x27 = _mm_and_si128(x20, x26);
	const __m128i x28 = _mm_cmov_si128(x16, a2, a5);
	const __m128i x29 = _mm_xor_si128(a3, x28);
	const __m128i x30 = _mm_cmov_si128(x27, x29, a1);
	const __m128i x31 = _mm_cmov_si128(x8, a1, a5);
	const __m128i x32 = _mm_xor_si128(x22, x31);
	const __m128i x33 = _mm_cmov_si128(x26, x14, x4);
	const __m128i x34 = _mm_cmov_si128(x32, x33, a1);
	const __m128i x35 = _mm_cmov_si128(x30, x34, a4);
	*out4 = _mm_xor_si128(*out4, x35);

	/* Nodes leading to out1. */
	const __m128i x36 = _mm_cmov_si128(x31, x3, a3);
	const __m128i x37 = _mm_xor_si128(x36, MM128_ALL_FF);	/* ~x36 */
	const __m128i x38 = _mm_andnot_si128(a2, x5);
	const __m128i x39 = _mm_xor_si128(x38, x32);
	const __m128i x40 = _mm_cmov_si128(x37, x39, a1);
	const __m128i x41 = _mm_cmov_si128(x18, x4, x32);
	const __m128i x42 = _mm_andnot_si128(x15, x1);
	const __m128i x43 = _mm_xor_si128(x42, x20);
	const __m128i x44 = _mm_cmov_si128(x41, x43, a1);
	const __m128i x45 = _mm_cmov_si128(x40, x44, a4);
	*out1 = _mm_xor_si128(*out1, x45);
}

/*
 * s6 - straight-line bitwise network over six 128-bit inputs.
 *
 * Per the file header this is bitslice DES, so this is presumably DES
 * S-box 6 (not verified against the S-box table here).  Every operation is
 * purely bitwise, so each of the 128 bit positions is evaluated
 * independently of the others.
 *
 * a1..a6     input words.
 * out1..out4 accumulators: each result is XOR-ed into the pointed-to word;
 *            the previous contents are not overwritten.
 *
 * _mm_cmov_si128(a, b, sel) takes bits from a where sel is 1 and from b
 * where sel is 0.  XOR with MM128_ALL_FF is a bitwise NOT.
 */
void s6(__m128i a1, __m128i a2, __m128i a3,
	__m128i a4, __m128i a5, __m128i a6,
	__m128i *out1, __m128i *out2,
	__m128i *out3, __m128i *out4)
{
	/* Nodes leading to out1 (several are reused further down). */
	const __m128i x1 = _mm_cmov_si128(a2, a5, a4);
	const __m128i x2 = _mm_cmov_si128(a4, x1, a1);
	const __m128i x3 = _mm_xor_si128(a1, a4);
	const __m128i x4 = _mm_cmov_si128(x2, x3, a6);
	const __m128i x5 = _mm_xor_si128(a6, a2);
	const __m128i x6 = _mm_xor_si128(x5, x3);
	const __m128i x7 = _mm_cmov_si128(x4, x6, a3);
	const __m128i x8 = _mm_cmov_si128(x5, a1, x3);
	const __m128i x9 = _mm_xor_si128(x3, MM128_ALL_FF);	/* ~x3 */
	const __m128i x10 = _mm_cmov_si128(x8, x9, a6);
	const __m128i x11 = _mm_andnot_si128(x8, x6);
	const __m128i x12 = _mm_xor_si128(a2, MM128_ALL_FF);	/* ~a2 */
	const __m128i x13 = _mm_cmov_si128(x11, x12, a6);
	const __m128i x14 = _mm_cmov_si128(x10, x13, a3);
	const __m128i x15 = _mm_cmov_si128(x7, x14, a5);
	*out1 = _mm_xor_si128(*out1, x15);

	/* Nodes leading to out2. */
	const __m128i x16 = _mm_cmov_si128(x10, x4, x2);
	const __m128i x17 = _mm_xor_si128(x12, x16);
	const __m128i x18 = _mm_cmov_si128(x17, x3, x5);
	const __m128i x19 = _mm_xor_si128(x8, x18);
	const __m128i x20 = _mm_cmov_si128(x17, x19, a3);
	const __m128i x21 = _mm_cmov_si128(a4, x6, x16);
	const __m128i x22 = _mm_xor_si128(x1, x21);
	const __m128i x23 = _mm_xor_si128(x1, x6);
	const __m128i x24 = _mm_xor_si128(x23, MM128_ALL_FF);	/* ~x23 */
	const __m128i x25 = _mm_cmov_si128(x22, x24, a3);
	const __m128i x26 = _mm_cmov_si128(x20, x25, a5);
	*out2 = _mm_xor_si128(*out2, x26);

	/* Nodes leading to out3. */
	const __m128i x27 = _mm_andnot_si128(x22, x4);
	const __m128i x28 = _mm_xor_si128(x27, x2);
	const __m128i x29 = _mm_cmov_si128(x21, x18, a2);
	const __m128i x30 = _mm_cmov_si128(x28, x29, a3);
	const __m128i x31 = _mm_cmov_si128(a2, a6, a1);
	const __m128i x32 = _mm_xor_si128(a4, x31);
	const __m128i x33 = _mm_cmov_si128(x6, x32, a3);
	const __m128i x34 = _mm_cmov_si128(x30, x33, a5);
	*out3 = _mm_xor_si128(*out3, x34);

	/* Nodes leading to out4. */
	const __m128i x35 = _mm_or_si128(x29, x32);
	const __m128i x36 = _mm_xor_si128(x35, x6);
	const __m128i x37 = _mm_cmov_si128(x32, x5, x8);
	const __m128i x38 = _mm_xor_si128(x23, x37);
	const __m128i x39 = _mm_cmov_si128(x36, x38, a3);
	const __m128i x40 = _mm_cmov_si128(x5, x24, x32);
	const __m128i x41 = _mm_cmov_si128(x38, x1, a1);
	const __m128i x42 = _mm_cmov_si128(x40, x41, a3);
	const __m128i x43 = _mm_cmov_si128(x39, x42, a5);
	*out4 = _mm_xor_si128(*out4, x43);
}

/*
 * s7 - straight-line bitwise network over six 128-bit inputs.
 *
 * Per the file header this is bitslice DES, so this is presumably DES
 * S-box 7 (not verified against the S-box table here).  Every operation is
 * purely bitwise, so each of the 128 bit positions is evaluated
 * independently of the others.
 *
 * a1..a6     input words.
 * out1..out4 accumulators: each result is XOR-ed into the pointed-to word;
 *            the previous contents are not overwritten.
 *
 * _mm_cmov_si128(a, b, sel) takes bits from a where sel is 1 and from b
 * where sel is 0.  XOR with MM128_ALL_FF is a bitwise NOT.
 */
void s7(__m128i a1, __m128i a2, __m128i a3,
	__m128i a4, __m128i a5, __m128i a6,
	__m128i *out1, __m128i *out2,
	__m128i *out3, __m128i *out4)
{
	/* Nodes leading to out1 (several are reused further down). */
	const __m128i x1 = _mm_xor_si128(a4, a5);
	const __m128i x2 = _mm_xor_si128(x1, MM128_ALL_FF);	/* ~x1 */
	const __m128i x3 = _mm_andnot_si128(a5, a4);
	const __m128i x4 = _mm_cmov_si128(x2, x3, a2);
	const __m128i x5 = _mm_andnot_si128(a2, a5);
	const __m128i x6 = _mm_or_si128(x5, x4);
	const __m128i x7 = _mm_cmov_si128(x4, x6, a3);
	const __m128i x8 = _mm_cmov_si128(x3, x2, a2);
	const __m128i x9 = _mm_cmov_si128(x6, a2, a4);
	const __m128i x10 = _mm_cmov_si128(x8, x9, a3);
	const __m128i x11 = _mm_cmov_si128(x7, x10, a6);
	const __m128i x12 = _mm_cmov_si128(x4, x2, a4);
	const __m128i x13 = _mm_xor_si128(a3, x12);
	const __m128i x14 = _mm_cmov_si128(x9, x8, x1);
	const __m128i x15 = _mm_xor_si128(x12, MM128_ALL_FF);	/* ~x12 */
	const __m128i x16 = _mm_cmov_si128(x14, x15, a3);
	const __m128i x17 = _mm_cmov_si128(x13, x16, a6);
	const __m128i x18 = _mm_cmov_si128(x11, x17, a1);
	*out1 = _mm_xor_si128(*out1, x18);

	/* Nodes leading to out3. */
	const __m128i x19 = _mm_xor_si128(a5, x9);
	const __m128i x20 = _mm_xor_si128(x19, x13);
	const __m128i x21 = _mm_cmov_si128(x1, x7, x20);
	const __m128i x22 = _mm_xor_si128(x15, x21);
	const __m128i x23 = _mm_cmov_si128(x20, x22, a6);
	const __m128i x24 = _mm_cmov_si128(x16, a5, x20);
	const __m128i x25 = _mm_xor_si128(x19, x24);
	const __m128i x26 = _mm_xor_si128(a3, x6);
	const __m128i x27 = _mm_cmov_si128(x25, x26, a6);
	const __m128i x28 = _mm_cmov_si128(x23, x27, a1);
	*out3 = _mm_xor_si128(*out3, x28);

	/* Nodes leading to out2. */
	const __m128i x29 = _mm_cmov_si128(x19, x10, x13);
	const __m128i x30 = _mm_xor_si128(x2, x29);
	const __m128i x31 = _mm_cmov_si128(x30, x17, a6);
	const __m128i x32 = _mm_xor_si128(x16, x19);
	const __m128i x33 = _mm_xor_si128(x32, x30);
	const __m128i x34 = _mm_cmov_si128(x10, x20, x16);
	const __m128i x35 = _mm_cmov_si128(x33, x34, a6);
	const __m128i x36 = _mm_cmov_si128(x31, x35, a1);
	*out2 = _mm_xor_si128(*out2, x36);

	/* Nodes leading to out4. */
	const __m128i x37 = _mm_cmov_si128(x20, x10, x5);
	const __m128i x38 = _mm_cmov_si128(x1, x14, a3);
	const __m128i x39 = _mm_cmov_si128(x37, x38, a6);
	const __m128i x40 = _mm_cmov_si128(x16, x38, a2);
	const __m128i x41 = _mm_xor_si128(x38, MM128_ALL_FF);	/* ~x38 */
	const __m128i x42 = _mm_cmov_si128(x40, x41, a6);
	const __m128i x43 = _mm_cmov_si128(x39, x42, a1);
	*out4 = _mm_xor_si128(*out4, x43);
}

/*
 * s8 - straight-line bitwise network over six 128-bit inputs.
 *
 * Per the file header this is bitslice DES, so this is presumably DES
 * S-box 8 (not verified against the S-box table here).  Every operation is
 * purely bitwise, so each of the 128 bit positions is evaluated
 * independently of the others.
 *
 * a1..a6     input words.
 * out1..out4 accumulators: each result is XOR-ed into the pointed-to word;
 *            the previous contents are not overwritten.
 *
 * _mm_cmov_si128(a, b, sel) takes bits from a where sel is 1 and from b
 * where sel is 0.  XOR with MM128_ALL_FF is a bitwise NOT.
 */
void s8(__m128i a1, __m128i a2, __m128i a3,
	__m128i a4, __m128i a5, __m128i a6,
	__m128i *out1, __m128i *out2,
	__m128i *out3, __m128i *out4)
{
	/* Nodes leading to out3 (several are reused further down). */
	const __m128i x1 = _mm_xor_si128(a5, MM128_ALL_FF);	/* ~a5 */
	const __m128i x2 = _mm_cmov_si128(a3, x1, a4);
	const __m128i x3 = _mm_or_si128(a4, a3);
	const __m128i x4 = _mm_xor_si128(x3, x2);
	const __m128i x5 = _mm_cmov_si128(x2, x4, a2);
	const __m128i x6 = _mm_or_si128(a3, a5);
	const __m128i x7 = _mm_xor_si128(x6, a4);
	const __m128i x8 = _mm_xor_si128(a4, a3);
	const __m128i x9 = _mm_xor_si128(x8, MM128_ALL_FF);	/* ~x8 */
	const __m128i x10 = _mm_cmov_si128(x7, x9, a2);
	const __m128i x11 = _mm_cmov_si128(x5, x10, a6);
	const __m128i x12 = _mm_cmov_si128(a5, a4, a3);
	const __m128i x13 = _mm_cmov_si128(x7, a3, a5);
	const __m128i x14 = _mm_cmov_si128(x12, x13, a2);
	const __m128i x15 = _mm_xor_si128(a2, x13);
	const __m128i x16 = _mm_cmov_si128(x14, x15, a6);
	const __m128i x17 = _mm_cmov_si128(x11, x16, a1);
	*out3 = _mm_xor_si128(*out3, x17);

	/* Nodes leading to out2. */
	const __m128i x18 = _mm_xor_si128(a2, a5);
	const __m128i x19 = _mm_xor_si128(x18, x7);
	const __m128i x20 = _mm_xor_si128(a3, a5);
	const __m128i x21 = _mm_cmov_si128(x4, x1, x7);
	const __m128i x22 = _mm_cmov_si128(x20, x21, a2);
	const __m128i x23 = _mm_cmov_si128(x19, x22, a6);
	const __m128i x24 = _mm_cmov_si128(x5, x6, x10);
	const __m128i x25 = _mm_xor_si128(a6, x24);
	const __m128i x26 = _mm_cmov_si128(x23, x25, a1);
	*out2 = _mm_xor_si128(*out2, x26);

	/* Nodes leading to out1. */
	const __m128i x27 = _mm_cmov_si128(x18, a1, x10);
	const __m128i x28 = _mm_xor_si128(x20, x27);
	const __m128i x29 = _mm_cmov_si128(x7, x15, x18);
	const __m128i x30 = _mm_cmov_si128(x28, x29, a6);
	const __m128i x31 = _mm_xor_si128(x25, x28);
	const __m128i x32 = _mm_cmov_si128(x21, x22, x2);
	const __m128i x33 = _mm_cmov_si128(x31, x32, a6);
	const __m128i x34 = _mm_cmov_si128(x30, x33, a1);
	*out1 = _mm_xor_si128(*out1, x34);

	/* Nodes leading to out4. */
	const __m128i x35 = _mm_or_si128(a5, x21);
	const __m128i x36 = _mm_xor_si128(x35, x22);
	const __m128i x37 = _mm_xor_si128(x28, MM128_ALL_FF);	/* ~x28 */
	const __m128i x38 = _mm_cmov_si128(x36, x37, a6);
	const __m128i x39 = _mm_andnot_si128(x27, a6);
	const __m128i x40 = _mm_cmov_si128(x22, x31, x39);
	const __m128i x41 = _mm_cmov_si128(x38, x40, a1);
	*out4 = _mm_xor_si128(*out4, x41);
}
