/* sboxes-spu.c :
Bitslice DES faster implementation for Cell/SpursEngine SPU (GPL Ver.)

Copyright (C) 2008 Dumplinger Boy (Dango-Chu).

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/

/* Vector type used for every S-box input, intermediate and output in this file. */
typedef vector unsigned char spu_qword_t; 

/*
 * s1 - gate network for S-box 1 of the bitslice DES code.
 *
 * a1..a6     six input vectors, taken by value.
 * out1..out4 result vectors; every result is XORed into the value already
 *            stored at *outN instead of overwriting it.
 *
 * g<N> is the N-th gate of the network (44 in total); each one is written
 * exactly once.  Outputs are updated in the order out1, out4, out3, out2.
 */
static void s1(spu_qword_t a1, spu_qword_t a2, spu_qword_t a3,
	spu_qword_t a4, spu_qword_t a5, spu_qword_t a6,
	spu_qword_t *out1, spu_qword_t *out2,
	spu_qword_t *out3, spu_qword_t *out4)
{
	/* gates 1-16, ending with the update of *out1 */
	const spu_qword_t g1 = spu_eqv(a4, a5);
	const spu_qword_t g2 = spu_nor(a4, g1);
	const spu_qword_t g3 = spu_sel(g2, g1, a6);
	const spu_qword_t g4 = spu_eqv(a6, a5);
	const spu_qword_t g5 = spu_or(g4, g2);
	const spu_qword_t g6 = spu_sel(g5, g3, a3);
	const spu_qword_t g7 = spu_sel(g1, g2, a6);
	const spu_qword_t g8 = spu_eqv(a5, g7);
	const spu_qword_t g9 = spu_sel(g8, g7, a3);
	const spu_qword_t g10 = spu_sel(g9, g6, a2);
	const spu_qword_t g11 = spu_nor(a3, g4);
	const spu_qword_t g12 = spu_or(g11, g3);
	const spu_qword_t g13 = spu_andc(a3, a4);
	const spu_qword_t g14 = spu_eqv(g13, g12);
	const spu_qword_t g15 = spu_sel(g14, g12, a2);
	const spu_qword_t g16 = spu_sel(g15, g10, a1);
	*out1 = spu_xor(*out1, g16);

	/* gates 17-27, ending with the update of *out4 */
	const spu_qword_t g17 = spu_sel(g12, g2, a3);
	const spu_qword_t g18 = spu_xor(g5, g17);
	const spu_qword_t g19 = spu_sel(g13, g1, a6);
	const spu_qword_t g20 = spu_eqv(g18, g19);
	const spu_qword_t g21 = spu_sel(g20, g18, a2);
	const spu_qword_t g22 = spu_andc(g4, g9);
	const spu_qword_t g23 = spu_xor(g22, a3);
	const spu_qword_t g24 = spu_sel(a3, g20, a6);
	const spu_qword_t g25 = spu_xor(g9, g24);
	const spu_qword_t g26 = spu_sel(g25, g23, a2);
	const spu_qword_t g27 = spu_sel(g26, g21, a1);
	*out4 = spu_xor(*out4, g27);

	/* gates 28-36, ending with the update of *out3 */
	const spu_qword_t g28 = spu_sel(g7, a4, g18);
	const spu_qword_t g29 = spu_eqv(g11, g28);
	const spu_qword_t g30 = spu_sel(g28, g19, g17);
	const spu_qword_t g31 = spu_sel(g30, g29, a2);
	const spu_qword_t g32 = spu_xor(g5, g24);
	const spu_qword_t g33 = spu_sel(g4, a6, g9);
	const spu_qword_t g34 = spu_xor(g19, g33);
	const spu_qword_t g35 = spu_sel(g34, g32, a2);
	const spu_qword_t g36 = spu_sel(g35, g31, a1);
	*out3 = spu_xor(*out3, g36);

	/* gates 37-44, ending with the update of *out2 */
	const spu_qword_t g37 = spu_sel(a3, g7, g17);
	const spu_qword_t g38 = spu_eqv(g25, g37);
	const spu_qword_t g39 = spu_sel(g14, g19, g24);
	const spu_qword_t g40 = spu_sel(g39, g38, a2);
	const spu_qword_t g41 = spu_andc(g9, g3);
	const spu_qword_t g42 = spu_sel(g6, g29, a6);
	const spu_qword_t g43 = spu_sel(g42, g41, a2);
	const spu_qword_t g44 = spu_sel(g43, g40, a1);
	*out2 = spu_xor(*out2, g44);
}

/*
 * s2 - gate network for S-box 2 of the bitslice DES code.
 *
 * a1..a6     six input vectors, taken by value.
 * out1..out4 result vectors; every result is XORed into the value already
 *            stored at *outN instead of overwriting it.
 *
 * g<N> is the N-th gate of the network (41 in total); each one is written
 * exactly once.  Outputs are updated in the order out1, out4, out3, out2.
 */
static void s2(spu_qword_t a1, spu_qword_t a2, spu_qword_t a3,
	spu_qword_t a4, spu_qword_t a5, spu_qword_t a6,
	spu_qword_t *out1, spu_qword_t *out2,
	spu_qword_t *out3, spu_qword_t *out4)
{
	/* gates 1-14, ending with the update of *out1 */
	const spu_qword_t g1 = spu_or(a3, a6);
	const spu_qword_t g2 = spu_eqv(g1, a2);
	const spu_qword_t g3 = spu_orc(a2, a6);
	const spu_qword_t g4 = spu_nand(g3, g1);
	const spu_qword_t g5 = spu_sel(g4, g2, a4);
	const spu_qword_t g6 = spu_sel(g4, a2, a3);
	const spu_qword_t g7 = spu_xor(a6, g6);
	const spu_qword_t g8 = spu_eqv(g7, a4);
	const spu_qword_t g9 = spu_sel(g8, g5, a1);
	const spu_qword_t g10 = spu_andc(a2, g5);
	const spu_qword_t g11 = spu_eqv(g10, g7);
	const spu_qword_t g12 = spu_sel(g4, g2, g11);
	const spu_qword_t g13 = spu_sel(g12, g11, a1);
	const spu_qword_t g14 = spu_sel(g13, g9, a5);
	*out1 = spu_xor(*out1, g14);

	/* gates 15-24, ending with the update of *out4 */
	const spu_qword_t g15 = spu_sel(g5, g10, a6);
	const spu_qword_t g16 = spu_eqv(g8, g15);
	const spu_qword_t g17 = spu_xor(g3, g5);
	const spu_qword_t g18 = spu_eqv(g17, g8);
	const spu_qword_t g19 = spu_sel(g18, g16, a1);
	const spu_qword_t g20 = spu_sel(a2, a6, g11);
	const spu_qword_t g21 = spu_xor(g16, g20);
	const spu_qword_t g22 = spu_xor(a4, g4);
	const spu_qword_t g23 = spu_sel(g22, g21, a1);
	const spu_qword_t g24 = spu_sel(g23, g19, a5);
	*out4 = spu_xor(*out4, g24);

	/* gates 25-34, ending with the update of *out3 */
	const spu_qword_t g25 = spu_sel(a2, g10, a3);
	const spu_qword_t g26 = spu_eqv(g11, g25);
	const spu_qword_t g27 = spu_sel(g1, g12, g22);
	const spu_qword_t g28 = spu_xor(g2, g27);
	const spu_qword_t g29 = spu_sel(g28, g26, a1);
	const spu_qword_t g30 = spu_andc(a4, g25);
	const spu_qword_t g31 = spu_eqv(g30, g6);
	const spu_qword_t g32 = spu_sel(g17, g18, g2);
	const spu_qword_t g33 = spu_sel(g32, g31, a1);
	const spu_qword_t g34 = spu_sel(g33, g29, a5);
	*out3 = spu_xor(*out3, g34);

	/* gates 35-41, ending with the update of *out2 */
	const spu_qword_t g35 = spu_sel(a4, a6, g20);
	const spu_qword_t g36 = spu_eqv(g25, g35);
	const spu_qword_t g37 = spu_sel(g35, g4, g25);
	const spu_qword_t g38 = spu_sel(g37, g36, a1);
	const spu_qword_t g39 = spu_sel(g22, g37, g26);
	const spu_qword_t g40 = spu_eqv(a1, g39);
	const spu_qword_t g41 = spu_sel(g40, g38, a5);
	*out2 = spu_xor(*out2, g41);
}

/*
 * s3 - gate network for S-box 3 of the bitslice DES code.
 *
 * a1..a6     six input vectors, taken by value.
 * out1..out4 result vectors; every result is XORed into the value already
 *            stored at *outN instead of overwriting it.
 *
 * g<N> is the N-th gate of the network (41 in total); each one is written
 * exactly once.  Outputs are updated in the order out4, out1, out2, out3.
 */
static void s3(spu_qword_t a1, spu_qword_t a2, spu_qword_t a3,
	spu_qword_t a4, spu_qword_t a5, spu_qword_t a6,
	spu_qword_t *out1, spu_qword_t *out2,
	spu_qword_t *out3, spu_qword_t *out4)
{
	/* gates 1-14, ending with the update of *out4 */
	const spu_qword_t g1 = spu_xor(a3, a6);
	const spu_qword_t g2 = spu_and(g1, a5);
	const spu_qword_t g3 = spu_xor(a5, g1);
	const spu_qword_t g4 = spu_sel(g3, g2, a2);
	const spu_qword_t g5 = spu_xor(a3, a5);
	const spu_qword_t g6 = spu_orc(g5, g1);
	const spu_qword_t g7 = spu_nor(g5, g5);
	const spu_qword_t g8 = spu_sel(g7, g6, a2);
	const spu_qword_t g9 = spu_sel(g8, g4, a4);
	const spu_qword_t g10 = spu_sel(a6, g3, a5);
	const spu_qword_t g11 = spu_eqv(a2, g10);
	const spu_qword_t g12 = spu_eqv(a5, g11);
	const spu_qword_t g13 = spu_sel(g12, g11, a4);
	const spu_qword_t g14 = spu_sel(g13, g9, a1);
	*out4 = spu_xor(*out4, g14);

	/* gates 15-23, ending with the update of *out1 */
	const spu_qword_t g15 = spu_sel(g3, g12, a3);
	const spu_qword_t g16 = spu_andc(g15, g2);
	const spu_qword_t g17 = spu_eqv(g16, a4);
	const spu_qword_t g18 = spu_or(a2, g12);
	const spu_qword_t g19 = spu_nand(g18, g15);
	const spu_qword_t g20 = spu_xor(g1, g11);
	const spu_qword_t g21 = spu_and(g20, g8);
	const spu_qword_t g22 = spu_sel(g21, g19, a4);
	const spu_qword_t g23 = spu_sel(g22, g17, a1);
	*out1 = spu_xor(*out1, g23);

	/* gates 24-31, ending with the update of *out2 */
	const spu_qword_t g24 = spu_sel(g7, g3, g12);
	const spu_qword_t g25 = spu_eqv(a3, g12);
	const spu_qword_t g26 = spu_sel(g25, g24, a4);
	const spu_qword_t g27 = spu_sel(g19, g10, g12);
	const spu_qword_t g28 = spu_eqv(g5, g27);
	const spu_qword_t g29 = spu_sel(g4, g11, g5);
	const spu_qword_t g30 = spu_sel(g29, g28, a4);
	const spu_qword_t g31 = spu_sel(g30, g26, a1);
	*out2 = spu_xor(*out2, g31);

	/* gates 32-41, ending with the update of *out3 */
	const spu_qword_t g32 = spu_sel(g2, g19, g29);
	const spu_qword_t g33 = spu_eqv(g27, g32);
	const spu_qword_t g34 = spu_or(a3, g18);
	const spu_qword_t g35 = spu_xor(g34, g3);
	const spu_qword_t g36 = spu_sel(g35, g33, a4);
	const spu_qword_t g37 = spu_sel(g27, g5, g20);
	const spu_qword_t g38 = spu_orc(g4, g37);
	const spu_qword_t g39 = spu_eqv(g38, g11);
	const spu_qword_t g40 = spu_sel(g39, g37, a4);
	const spu_qword_t g41 = spu_sel(g40, g36, a1);
	*out3 = spu_xor(*out3, g41);
}

/*
 * s4 - gate network for S-box 4 of the bitslice DES code.
 *
 * a1..a6     six input vectors, taken by value.
 * out1..out4 result vectors; every result is XORed into the value already
 *            stored at *outN instead of overwriting it.
 *
 * g<N> is the N-th gate of the network (30 in total); each one is written
 * exactly once.  Outputs are updated in the order out4, out1, out2, out3.
 * The out2 and out3 results reuse the operands of the out1 and out4
 * results (g21/g25 and g10/g15) with their positions exchanged, followed
 * by an XOR with a6.
 */
static void s4(spu_qword_t a1, spu_qword_t a2, spu_qword_t a3,
	spu_qword_t a4, spu_qword_t a5, spu_qword_t a6,
	spu_qword_t *out1, spu_qword_t *out2,
	spu_qword_t *out3, spu_qword_t *out4)
{
	/* gates 1-16, ending with the update of *out4 */
	const spu_qword_t g1 = spu_nor(a3, a3);
	const spu_qword_t g2 = spu_nor(g1, a5);
	const spu_qword_t g3 = spu_sel(g2, g1, a2);
	const spu_qword_t g4 = spu_nor(g2, g3);
	const spu_qword_t g5 = spu_sel(g4, g3, a1);
	const spu_qword_t g6 = spu_sel(g1, g4, a5);
	const spu_qword_t g7 = spu_eqv(a2, g1);
	const spu_qword_t g8 = spu_eqv(g7, a5);
	const spu_qword_t g9 = spu_sel(g8, g6, a1);
	const spu_qword_t g10 = spu_sel(g9, g5, a4);
	const spu_qword_t g11 = spu_or(g2, g5);
	const spu_qword_t g12 = spu_xor(g11, g8);
	const spu_qword_t g13 = spu_sel(g6, a2, g5);
	const spu_qword_t g14 = spu_eqv(g7, g13);
	const spu_qword_t g15 = spu_sel(g14, g12, a4);
	const spu_qword_t g16 = spu_sel(g15, g10, a6);
	*out4 = spu_xor(*out4, g16);

	/* gates 17-26, ending with the update of *out1 */
	const spu_qword_t g17 = spu_sel(g5, a1, g1);
	const spu_qword_t g18 = spu_xor(g9, g17);
	const spu_qword_t g19 = spu_sel(g17, g2, g8);
	const spu_qword_t g20 = spu_sel(g5, g19, a1);
	const spu_qword_t g21 = spu_sel(g20, g18, a4);
	const spu_qword_t g22 = spu_sel(g20, a1, g13);
	const spu_qword_t g23 = spu_eqv(g8, g22);
	const spu_qword_t g24 = spu_sel(g19, g12, g22);
	const spu_qword_t g25 = spu_sel(g24, g23, a4);
	const spu_qword_t g26 = spu_sel(g25, g21, a6);
	*out1 = spu_xor(*out1, g26);

	/* gates 27-28, ending with the update of *out2 */
	const spu_qword_t g27 = spu_sel(g21, g25, a6);
	const spu_qword_t g28 = spu_xor(a6, g27);
	*out2 = spu_xor(*out2, g28);

	/* gates 29-30, ending with the update of *out3 */
	const spu_qword_t g29 = spu_sel(g10, g15, a6);
	const spu_qword_t g30 = spu_xor(a6, g29);
	*out3 = spu_xor(*out3, g30);
}

/*
 * s5 - gate network for S-box 5 of the bitslice DES code.
 *
 * a1..a6     six input vectors, taken by value.
 * out1..out4 result vectors; every result is XORed into the value already
 *            stored at *outN instead of overwriting it.
 *
 * g<N> is the N-th gate of the network (42 in total); each one is written
 * exactly once.  Outputs are updated in the order out2, out1, out3, out4.
 */
static void s5(spu_qword_t a1, spu_qword_t a2, spu_qword_t a3,
	spu_qword_t a4, spu_qword_t a5, spu_qword_t a6,
	spu_qword_t *out1, spu_qword_t *out2,
	spu_qword_t *out3, spu_qword_t *out4)
{
	/* gates 1-11, ending with the update of *out2 */
	const spu_qword_t g1 = spu_xor(a2, a6);
	const spu_qword_t g2 = spu_eqv(g1, a5);
	const spu_qword_t g3 = spu_eqv(g2, a1);
	const spu_qword_t g4 = spu_sel(g2, g1, a1);
	const spu_qword_t g5 = spu_sel(g4, g3, a3);
	const spu_qword_t g6 = spu_sel(a1, a6, g4);
	const spu_qword_t g7 = spu_eqv(a5, g6);
	const spu_qword_t g8 = spu_sel(g3, g6, a6);
	const spu_qword_t g9 = spu_xor(g1, g8);
	const spu_qword_t g10 = spu_sel(g9, g7, a3);
	const spu_qword_t g11 = spu_sel(g10, g5, a4);
	*out2 = spu_xor(*out2, g11);

	/* gates 12-22, ending with the update of *out1 */
	const spu_qword_t g12 = spu_sel(g1, g7, g9);
	const spu_qword_t g13 = spu_eqv(g3, g12);
	const spu_qword_t g14 = spu_nor(g2, g2);
	const spu_qword_t g15 = spu_sel(a2, a6, g8);
	const spu_qword_t g16 = spu_sel(g15, g14, a1);
	const spu_qword_t g17 = spu_sel(g16, g13, a3);
	const spu_qword_t g18 = spu_sel(g8, g15, g4);
	const spu_qword_t g19 = spu_xor(g16, g18);
	const spu_qword_t g20 = spu_sel(g1, g8, a5);
	const spu_qword_t g21 = spu_sel(g20, g19, a3);
	const spu_qword_t g22 = spu_sel(g21, g17, a4);
	*out1 = spu_xor(*out1, g22);

	/* gates 23-32, ending with the update of *out3 */
	const spu_qword_t g23 = spu_sel(a2, g4, g7);
	const spu_qword_t g24 = spu_xor(g18, g23);
	const spu_qword_t g25 = spu_sel(g14, g13, g18);
	const spu_qword_t g26 = spu_sel(g25, g24, a3);
	const spu_qword_t g27 = spu_sel(a6, g18, g7);
	const spu_qword_t g28 = spu_xor(g24, g27);
	const spu_qword_t g29 = spu_sel(a2, g3, g13);
	const spu_qword_t g30 = spu_eqv(g9, g29);
	const spu_qword_t g31 = spu_sel(g30, g28, a3);
	const spu_qword_t g32 = spu_sel(g31, g26, a4);
	*out3 = spu_xor(*out3, g32);

	/* gates 33-42, ending with the update of *out4 */
	const spu_qword_t g33 = spu_sel(a1, a5, g16);
	const spu_qword_t g34 = spu_eqv(g18, g33);
	const spu_qword_t g35 = spu_or(g1, g28);
	const spu_qword_t g36 = spu_xor(g35, g30);
	const spu_qword_t g37 = spu_sel(g36, g34, a3);
	const spu_qword_t g38 = spu_and(g25, g28);
	const spu_qword_t g39 = spu_eqv(g38, g9);
	const spu_qword_t g40 = spu_sel(g4, g16, g34);
	const spu_qword_t g41 = spu_sel(g40, g39, a3);
	const spu_qword_t g42 = spu_sel(g41, g37, a4);
	*out4 = spu_xor(*out4, g42);
}

/*
 * s6 - gate network for S-box 6 of the bitslice DES code.
 *
 * a1..a6     six input vectors, taken by value.
 * out1..out4 result vectors; every result is XORed into the value already
 *            stored at *outN instead of overwriting it.
 *
 * g<N> is the N-th gate of the network (41 in total); each one is written
 * exactly once.  Outputs are updated in the order out1, out2, out4, out3.
 */
static void s6(spu_qword_t a1, spu_qword_t a2, spu_qword_t a3,
	spu_qword_t a4, spu_qword_t a5, spu_qword_t a6,
	spu_qword_t *out1, spu_qword_t *out2,
	spu_qword_t *out3, spu_qword_t *out4)
{
	/* gates 1-13, ending with the update of *out1 */
	const spu_qword_t g1 = spu_sel(a5, a4, a1);
	const spu_qword_t g2 = spu_eqv(a1, a4);
	const spu_qword_t g3 = spu_or(a1, a4);
	const spu_qword_t g4 = spu_sel(g3, g2, a5);
	const spu_qword_t g5 = spu_sel(g4, g1, a2);
	const spu_qword_t g6 = spu_xor(a5, g2);
	const spu_qword_t g7 = spu_sel(g6, g5, a6);
	const spu_qword_t g8 = spu_sel(g4, g6, a2);
	const spu_qword_t g9 = spu_eqv(a5, g8);
	const spu_qword_t g10 = spu_and(a5, g2);
	const spu_qword_t g11 = spu_eqv(g10, a2);
	const spu_qword_t g12 = spu_sel(g11, g9, a6);
	const spu_qword_t g13 = spu_sel(g12, g7, a3);
	*out1 = spu_xor(*out1, g13);

	/* gates 14-25, ending with the update of *out2 */
	const spu_qword_t g14 = spu_andc(a1, g6);
	const spu_qword_t g15 = spu_orc(g6, g3);
	const spu_qword_t g16 = spu_sel(g15, g14, a2);
	const spu_qword_t g17 = spu_sel(g11, a2, g14);
	const spu_qword_t g18 = spu_eqv(g5, g17);
	const spu_qword_t g19 = spu_sel(g18, g16, a6);
	const spu_qword_t g20 = spu_sel(a2, g15, g18);
	const spu_qword_t g21 = spu_xor(g4, g20);
	const spu_qword_t g22 = spu_andc(a4, g18);
	const spu_qword_t g23 = spu_orc(g22, g21);
	const spu_qword_t g24 = spu_sel(g23, g21, a6);
	const spu_qword_t g25 = spu_sel(g24, g19, a3);
	*out2 = spu_xor(*out2, g25);

	/* gates 26-34, ending with the update of *out4 */
	const spu_qword_t g26 = spu_sel(g11, g4, g18);
	const spu_qword_t g27 = spu_xor(g3, g11);
	const spu_qword_t g28 = spu_sel(g27, g26, a6);
	const spu_qword_t g29 = spu_sel(a2, a5, a1);
	const spu_qword_t g30 = spu_eqv(g26, g29);
	const spu_qword_t g31 = spu_and(a4, g11);
	const spu_qword_t g32 = spu_eqv(g31, g6);
	const spu_qword_t g33 = spu_sel(g32, g30, a6);
	const spu_qword_t g34 = spu_sel(g33, g28, a3);
	*out4 = spu_xor(*out4, g34);

	/* gates 35-41, ending with the update of *out3 */
	const spu_qword_t g35 = spu_sel(g27, g17, g20);
	const spu_qword_t g36 = spu_sel(g21, g1, g27);
	const spu_qword_t g37 = spu_sel(g36, g35, a6);
	const spu_qword_t g38 = spu_orc(g16, g4);
	const spu_qword_t g39 = spu_sel(g3, g17, g8);
	const spu_qword_t g40 = spu_sel(g39, g38, a6);
	const spu_qword_t g41 = spu_sel(g40, g37, a3);
	*out3 = spu_xor(*out3, g41);
}

/*
 * s7 - gate network for S-box 7 of the bitslice DES code.
 *
 * a1..a6     six input vectors, taken by value.
 * out1..out4 result vectors; every result is XORed into the value already
 *            stored at *outN instead of overwriting it.
 *
 * g<N> is the N-th gate of the network (40 in total); each one is written
 * exactly once.  Outputs are updated in the order out2, out3, out1, out4.
 */
static void s7(spu_qword_t a1, spu_qword_t a2, spu_qword_t a3,
	spu_qword_t a4, spu_qword_t a5, spu_qword_t a6,
	spu_qword_t *out1, spu_qword_t *out2,
	spu_qword_t *out3, spu_qword_t *out4)
{
	/* gates 1-15, ending with the update of *out2 */
	const spu_qword_t g1 = spu_orc(a4, a3);
	const spu_qword_t g2 = spu_sel(a3, g1, a2);
	const spu_qword_t g3 = spu_eqv(g2, a5);
	const spu_qword_t g4 = spu_sel(a4, a2, a3);
	const spu_qword_t g5 = spu_sel(a3, a4, g3);
	const spu_qword_t g6 = spu_xor(g1, g5);
	const spu_qword_t g7 = spu_sel(g6, g4, a5);
	const spu_qword_t g8 = spu_sel(g7, g3, a1);
	const spu_qword_t g9 = spu_sel(g7, g5, g4);
	const spu_qword_t g10 = spu_xor(g1, g9);
	const spu_qword_t g11 = spu_sel(a2, a3, a4);
	const spu_qword_t g12 = spu_xor(g4, g11);
	const spu_qword_t g13 = spu_eqv(g12, a5);
	const spu_qword_t g14 = spu_sel(g13, g10, a1);
	const spu_qword_t g15 = spu_sel(g14, g8, a6);
	*out2 = spu_xor(*out2, g15);

	/* gates 16-25, ending with the update of *out3 */
	const spu_qword_t g16 = spu_xor(g6, g12);
	const spu_qword_t g17 = spu_sel(a3, g3, g6);
	const spu_qword_t g18 = spu_sel(g17, g16, a5);
	const spu_qword_t g19 = spu_xor(g9, g13);
	const spu_qword_t g20 = spu_sel(g19, g18, a1);
	const spu_qword_t g21 = spu_sel(g12, g17, a5);
	const spu_qword_t g22 = spu_sel(a5, a2, a4);
	const spu_qword_t g23 = spu_eqv(g18, g22);
	const spu_qword_t g24 = spu_sel(g23, g21, a1);
	const spu_qword_t g25 = spu_sel(g24, g20, a6);
	*out3 = spu_xor(*out3, g25);

	/* gates 26-32, ending with the update of *out1 */
	const spu_qword_t g26 = spu_sel(a4, g6, a5);
	const spu_qword_t g27 = spu_xor(a2, g26);
	const spu_qword_t g28 = spu_sel(g3, g23, a2);
	const spu_qword_t g29 = spu_sel(g28, g27, a1);
	const spu_qword_t g30 = spu_sel(g23, g12, a5);
	const spu_qword_t g31 = spu_sel(g10, g30, a1);
	const spu_qword_t g32 = spu_sel(g31, g29, a6);
	*out1 = spu_xor(*out1, g32);

	/* gates 33-40, ending with the update of *out4 */
	const spu_qword_t g33 = spu_orc(g9, g23);
	const spu_qword_t g34 = spu_eqv(g33, g30);
	const spu_qword_t g35 = spu_sel(g4, g19, a4);
	const spu_qword_t g36 = spu_eqv(g23, g35);
	const spu_qword_t g37 = spu_sel(g36, g34, a1);
	const spu_qword_t g38 = spu_xor(a1, g17);
	const spu_qword_t g39 = spu_eqv(g38, g35);
	const spu_qword_t g40 = spu_sel(g39, g37, a6);
	*out4 = spu_xor(*out4, g40);
}

/*
 * s8 - gate network for S-box 8 of the bitslice DES code.
 *
 * a1..a6     six input vectors, taken by value.
 * out1..out4 result vectors; every result is XORed into the value already
 *            stored at *outN instead of overwriting it.
 *
 * g<N> is the N-th gate of the network (40 in total); each one is written
 * exactly once.  Outputs are updated in the order out3, out2, out4, out1.
 */
static void s8(spu_qword_t a1, spu_qword_t a2, spu_qword_t a3,
	spu_qword_t a4, spu_qword_t a5, spu_qword_t a6,
	spu_qword_t *out1, spu_qword_t *out2,
	spu_qword_t *out3, spu_qword_t *out4)
{
	/* gates 1-15, ending with the update of *out3 */
	const spu_qword_t g1 = spu_eqv(a2, a3);
	const spu_qword_t g2 = spu_orc(a2, a3);
	const spu_qword_t g3 = spu_xor(g2, a5);
	const spu_qword_t g4 = spu_sel(g3, g1, a4);
	const spu_qword_t g5 = spu_andc(a3, a5);
	const spu_qword_t g6 = spu_xor(g5, a2);
	const spu_qword_t g7 = spu_sel(a3, a5, g1);
	const spu_qword_t g8 = spu_sel(g7, g6, a4);
	const spu_qword_t g9 = spu_sel(g8, g4, a1);
	const spu_qword_t g10 = spu_sel(a5, g1, g3);
	const spu_qword_t g11 = spu_xor(a4, g10);
	const spu_qword_t g12 = spu_orc(a4, a5);
	const spu_qword_t g13 = spu_eqv(g12, g6);
	const spu_qword_t g14 = spu_sel(g13, g11, a1);
	const spu_qword_t g15 = spu_sel(g14, g9, a6);
	*out3 = spu_xor(*out3, g15);

	/* gates 16-24, ending with the update of *out2 */
	const spu_qword_t g16 = spu_xor(a4, g6);
	const spu_qword_t g17 = spu_sel(g16, g13, a2);
	const spu_qword_t g18 = spu_eqv(g3, g17);
	const spu_qword_t g19 = spu_sel(g18, g16, a1);
	const spu_qword_t g20 = spu_sel(a4, g1, g2);
	const spu_qword_t g21 = spu_xor(a5, g20);
	const spu_qword_t g22 = spu_nor(g18, g18);
	const spu_qword_t g23 = spu_sel(g22, g21, a1);
	const spu_qword_t g24 = spu_sel(g23, g19, a6);
	*out2 = spu_xor(*out2, g24);

	/* gates 25-35, ending with the update of *out4 */
	const spu_qword_t g25 = spu_orc(a4, g5);
	const spu_qword_t g26 = spu_xor(g25, g21);
	const spu_qword_t g27 = spu_sel(g25, g21, g6);
	const spu_qword_t g28 = spu_sel(g27, g21, a4);
	const spu_qword_t g29 = spu_sel(g28, g26, a1);
	const spu_qword_t g30 = spu_sel(a2, g3, g11);
	const spu_qword_t g31 = spu_eqv(g21, g30);
	const spu_qword_t g32 = spu_xor(g11, g18);
	const spu_qword_t g33 = spu_eqv(g32, g31);
	const spu_qword_t g34 = spu_sel(g33, g31, a1);
	const spu_qword_t g35 = spu_sel(g34, g29, a6);
	*out4 = spu_xor(*out4, g35);

	/* gates 36-40, ending with the update of *out1 */
	const spu_qword_t g36 = spu_nor(g34, g34);
	const spu_qword_t g37 = spu_sel(g13, a3, g32);
	const spu_qword_t g38 = spu_sel(g21, g25, g32);
	const spu_qword_t g39 = spu_sel(g38, g37, a1);
	const spu_qword_t g40 = spu_sel(g39, g36, a6);
	*out1 = spu_xor(*out1, g40);
}
