/////////////////////////////////////////////////////////////////////////////
// This C file has been created automatically. Do not edit!!!
/////////////////////////////////////////////////////////////////////////////

// %%COMMENT
// TODO: Yet to be documented!!!


#include <string.h>
#include "mat24_functions.h"
#include "mm_op15.h"   



// Copy 2048 pairs of words from v_in to v_out, complementing selected pairs.
//
// A pair k (words 2*k and 2*k+1) is complemented when bit 12 of
// MAT24_THETA_TABLE[k] is set. Complementing a 4-bit field x yields 15 - x,
// i.e. the negative of x modulo 15. In the second word of a pair only the
// low 32 bits are complemented; the high 32 bits are copied unchanged.
//
// v_in  : source, 4096 words are read.
// v_out : destination, 4096 words are written.
static void invert15_xyz(uint_mmv_t *v_in, uint_mmv_t *v_out)
{
    const uint16_t *theta = MAT24_THETA_TABLE;
    uint_fast32_t row;

    for (row = 0; row < 2048; ++row) {
        // 0 or 1, taken from bit 12 of the table entry for this pair
        const uint_mmv_t negate = (uint_mmv_t)((theta[row] >> 12) & 0x1ULL);
        // All ones if the pair is to be negated, zero otherwise
        const uint_mmv_t flip_first = (0 - negate) & 0xffffffffffffffffULL;
        // Same selector, restricted to the low 32 bits of the second word
        const uint_mmv_t flip_second = flip_first & 0xffffffffULL;

        v_out[2 * row] = v_in[2 * row] ^ flip_first;
        v_out[2 * row + 1] = v_in[2 * row + 1] ^ flip_second;
    }
}



// %%EXPORT px
// Multiply a vector of integers mod 15 by the triality element t**exp.
//
// v_in  : input vector; each uint_mmv_t word holds 4-bit fields, one
//         integer mod 15 per field.
// exp   : exponent of t; it is reduced modulo 3 before use.
// v_out : output vector.
//
// If exp % 3 == 0 the first 15468 words of v_in are copied to v_out and
// nothing else is done, so both buffers must hold at least that many words.
// Otherwise the parts with tags A, B, C (3 * 48 words at the start of the
// vector), tag T (759 groups of 4 words starting at MM_OP15_OFS_T) and
// tags X, Y, Z (3 * 4096 words following tag T) are processed in turn.
//
// NOTE(review): v_in is read again after parts of v_out have been written
// (diagonal fix-up, X/Y/Z permutation), so the two buffers are presumably
// expected not to overlap - confirm against the callers.
void mm_op15_t(uint_mmv_t *v_in,  uint32_t exp, uint_mmv_t *v_out)
{
    uint_mmv_t i, j, exp1;
 
    exp %= 3;
    if (exp == 0) {
        // t**0 is the identity: plain copy of the whole vector
        for (i = 0; i < 15468; ++i) v_out[i] = v_in[i];
        return;
    }
    // exp1 is 0 for exp == 1 and all ones, i.e. (uint_mmv_t)(-1), for
    // exp == 2. It is used below as an XOR mask; XORing a 4-bit field
    // with 0xf maps x to 15 - x, which is -x modulo 15.
    exp1 = 0x1ULL - (uint_mmv_t)exp;

    // Do off-diagonal part of tags A, B, C
    // Each iteration handles one word of tag A (offset 0), tag B
    // (offset 48) and tag C (offset 96).
    for (i = 0; i < 48; ++i) {
        // %%MUL_MATRIX_T3 v_in, exp1, v_out

        // This is an automatically generated matrix operation, do not change!
        {
        uint_mmv_t r0, r1, r2, r3, r4;
        uint_mmv_t r5, r6;

        // Multiply the vector of integers mod 15 stored in
        // (v_in) by t**e, where t is the 3 times 3 triality
        // matrix [[0, 2,  -2], [1, 1, 1], [1,  -1, -1]] / 2.
        // and e = 1 if exp1 = 0, e = 2 if exp1 = 
        // (uint_mmv_t)(-1). The result is stored in (v_out).
        // 
        // v_in and v_out are pointers of type *uint_mmv_t.
        // Components with tags A, B, C referred by (v_in) 
        // are processed, one integer of type uint_mmv_t
        // for each tag.
        // 
        // 
        // Loading vector from rep 196884x with tags A,B,C
        // to v[0...2]. Here v_in refers to the tag A part. 
        // Negate v[2] if exp1 == -1.
        r0 = (v_in)[0];
        r1 = (v_in)[48];
        r2 = (v_in)[96] ^ ((exp1) & 0xffffffffffffffffULL);
        // Vector is now  r(i) for i = 0,1,2,3,4,5
        // exp1 is complemented here and again after the store below,
        // so it has its original value at the end of each iteration.
        exp1 = ~(exp1);
        // Split every word into its even-numbered 4-bit fields (kept in
        // r0, r1, r2) and its odd-numbered fields (moved to r3, r4, r5),
        // so that each field has 4 spare bits above it for carries.
        r3 = ((r0 >> 4) & 0xf0f0f0f0f0f0f0fULL);
        r0 = (r0 & 0xf0f0f0f0f0f0f0fULL);
        r4 = ((r1 >> 4) & 0xf0f0f0f0f0f0f0fULL);
        r1 = (r1 & 0xf0f0f0f0f0f0f0fULL);
        r5 = ((r2 >> 4) & 0xf0f0f0f0f0f0f0fULL);
        r2 = (r2 & 0xf0f0f0f0f0f0f0fULL);
        // Sum and difference of two operands; the difference is formed
        // by adding the complemented operand. After each addition the
        // carry bit (bit 4 of a field) is folded back into bit 0, which
        // is valid because 16 == 1 (mod 15).
        r6 = (r4 + (r5 ^ 0xf0f0f0f0f0f0f0fULL));
        r4 = (r4 + r5);
        r5 = (r6 & 0x1010101010101010ULL);
        r5 = ((r6 - r5) + (r5 >> 4));
        r6 = (r4 & 0x1010101010101010ULL);
        r4 = ((r4 - r6) + (r6 >> 4));
        // Rotate each 4-bit field right by one bit, i.e. divide by 2
        // modulo 15.
        r4 = (((r4 & 0x1111111111111111ULL) << 3)
            | ((r4 & 0xeeeeeeeeeeeeeeeeULL) >> 1));
        r5 = (((r5 & 0x1111111111111111ULL) << 3)
            | ((r5 & 0xeeeeeeeeeeeeeeeeULL) >> 1));
        r6 = (r3 + (r5 ^ 0xf0f0f0f0f0f0f0fULL));
        r3 = (r3 + r5);
        r5 = (r6 & 0x1010101010101010ULL);
        r5 = ((r6 - r5) + (r5 >> 4));
        r6 = (r3 & 0x1010101010101010ULL);
        r3 = ((r3 - r6) + (r6 >> 4));
        // Same sequence of steps for the even-numbered fields
        r6 = (r1 + (r2 ^ 0xf0f0f0f0f0f0f0fULL));
        r1 = (r1 + r2);
        r2 = (r6 & 0x1010101010101010ULL);
        r2 = ((r6 - r2) + (r2 >> 4));
        r6 = (r1 & 0x1010101010101010ULL);
        r1 = ((r1 - r6) + (r6 >> 4));
        r1 = (((r1 & 0x1111111111111111ULL) << 3)
            | ((r1 & 0xeeeeeeeeeeeeeeeeULL) >> 1));
        r2 = (((r2 & 0x1111111111111111ULL) << 3)
            | ((r2 & 0xeeeeeeeeeeeeeeeeULL) >> 1));
        r6 = (r0 + (r2 ^ 0xf0f0f0f0f0f0f0fULL));
        r0 = (r0 + r2);
        r2 = (r6 & 0x1010101010101010ULL);
        r2 = ((r6 - r2) + (r2 >> 4));
        r6 = (r0 & 0x1010101010101010ULL);
        r0 = ((r0 - r6) + (r6 >> 4));
        // Merge the odd-numbered fields back into their words
        r0 ^= (r3 << 4);
        r1 ^= (r4 << 4);
        r2 ^= (r5 << 4);
        // Store vector v[0...2] to rep 196884x with 
        // tags A,B,C. Here v_out refers to the tag A part. 
        // Negate v[2] if exp1 == -1.
        (v_out)[0] = r1;
        (v_out)[48] = r0;
        (v_out)[96]  = r2 ^ ((exp1) & 0xffffffffffffffffULL);
        exp1 = ~(exp1);
        // 45 lines of code, 85 operations
        }
        // End of automatically generated matrix operation.
 
        ++v_in; ++v_out;
    }

    // Rewind both pointers to the start of the tag A part
    v_in -= 48;
    v_out -= 48;
    // Do diagonal part of tags A, B, C
    for (i = 0; i < 24; ++i) {
        // Copy diagonal of A, zero diagonals of B and C
        // Row i occupies words 2*i and 2*i+1; the field with index i in
        // that row lies in word 2*i + (i >> 4) at bit position 4*i mod 64.
        uint_mmv_t mask = 0xfULL << ((i << 2) & 63);
        j = (i << 1) + (i >> 4);
        v_out[j] = (v_out[j] & ~mask) | (v_in[j] & mask);
        v_out[j + 48] &= ~mask;
        v_out[j + 96] &= ~mask;
        // Zero slack
        // (the high 32 bits of the second word of every row)
        j = ((i + 1) << 1) - 1;
        v_out[j] &= 0xffffffffULL;
        v_out[j + 48] &= 0xffffffffULL;
        v_out[j + 96] &= 0xffffffffULL;  
    }


    // Do tag T
    // 759 groups of 4 words (64 fields of 4 bits) are transformed
    // independently of each other.
    v_in += MM_OP15_OFS_T;
    v_out +=  MM_OP15_OFS_T;
    for (i = 0; i < 759; ++i) {
        // %%MUL_MATRIX_T64 v_in, exp1, v_out

        // This is an automatically generated matrix operation, do not change!
        {
        uint_mmv_t r0, r1, r2, r3, r4;
        uint_mmv_t r5, r6, r7, r8;

        // Multiply the vector of integers mod 15 stored
        // in (v_in) by t**e, where t is the 64 times 64 
        // triality matrix and e = 1 if exp1 = 0, e = 2 if
        // exp1 = (uint_mmv_t)(-1). The result is stored
        // in (v_out).
        // 
        // Loading vector v from array v_in; multiply v
        // with diagonal matrix if exp1 == -1.
        r0 = v_in[0] ^ ((exp1) & 0xf0fff0ffffff0ULL);
        r1 = v_in[1] ^ ((exp1) & 0xf000000f000f0fffULL);
        r2 = v_in[2] ^ ((exp1) & 0xf000000f000f0fffULL);
        r3 = v_in[3] ^ ((exp1) & 0xfff0f000f000000fULL);
        // Vector is now  r(i) for i = 0,1,2,3
        // exp1 is complemented here and again after the store at the
        // end of the block, so it is unchanged after each iteration.
        exp1 = ~(exp1);
        // Exchange component i with component 63-i if i 
        // has odd parity; fix it if i has even parity.
        // The selected fields of a word pair are collected in one word,
        // and the order of the 16 fields in that word is reversed by
        // swapping halves of 32, 16, 8 and 4 bits.
        r4 = ((r0 & 0xff0f00ff00f0ff0ULL)
            | (r1 & 0xf00f0ff00ff0f00fULL));
        r4 = ((r4 << 32) | (r4 >> 32));
        r4 = (((r4 & 0xffff0000ffffULL) << 16)
            | ((r4 >> 16) & 0xffff0000ffffULL));
        r4 = (((r4 & 0xff00ff00ff00ffULL) << 8)
            | ((r4 >> 8) & 0xff00ff00ff00ffULL));
        r4 = (((r4 & 0xf0f0f0f0f0f0f0fULL) << 4)
            | ((r4 >> 4) & 0xf0f0f0f0f0f0f0fULL));
        r5 = ((r2 & 0xf00f0ff00ff0f00fULL)
            | (r3 & 0xff0f00ff00f0ff0ULL));
        r5 = ((r5 << 32) | (r5 >> 32));
        r5 = (((r5 & 0xffff0000ffffULL) << 16)
            | ((r5 >> 16) & 0xffff0000ffffULL));
        r5 = (((r5 & 0xff00ff00ff00ffULL) << 8)
            | ((r5 >> 8) & 0xff00ff00ff00ffULL));
        r5 = (((r5 & 0xf0f0f0f0f0f0f0fULL) << 4)
            | ((r5 >> 4) & 0xf0f0f0f0f0f0f0fULL));
        r0 = ((r0 & 0xf00f0ff00ff0f00fULL)
            | (r5 & 0xff0f00ff00f0ff0ULL));
        r1 = ((r1 & 0xff0f00ff00f0ff0ULL)
            | (r5 & 0xf00f0ff00ff0f00fULL));
        r2 = ((r2 & 0xff0f00ff00f0ff0ULL)
            | (r4 & 0xf00f0ff00ff0f00fULL));
        r3 = ((r3 & 0xf00f0ff00ff0f00fULL)
            | (r4 & 0xff0f00ff00f0ff0ULL));
        // Expansion for Hadamard operation:
        // There is no space for a carry bit between bit fields. So 
        // we move bit field 2*i + 1  to bit field 2*i + 64.
        r4 = ((r0 >> 4) & 0xf0f0f0f0f0f0f0fULL);
        r0 = (r0 & 0xf0f0f0f0f0f0f0fULL);
        r5 = ((r1 >> 4) & 0xf0f0f0f0f0f0f0fULL);
        r1 = (r1 & 0xf0f0f0f0f0f0f0fULL);
        r6 = ((r2 >> 4) & 0xf0f0f0f0f0f0f0fULL);
        r2 = (r2 & 0xf0f0f0f0f0f0f0fULL);
        r7 = ((r3 >> 4) & 0xf0f0f0f0f0f0f0fULL);
        r3 = (r3 & 0xf0f0f0f0f0f0f0fULL);
        // Vector is now  r(i) for i = 0,1,2,3,4,5,6,7
        // In every butterfly step below, the operand to be subtracted is
        // complemented (x ^ 0xf is -x modulo 15) before the addition, and
        // the carry bit of each field (mask 0x10...) is then folded back
        // into bit 0 of that field, since 16 == 1 (mod 15).
        // Butterfly: v[i], v[i+2] = v[i]+v[i+2], v[i]-v[i+2]
        r8 = (((r0 << 8) & 0xf000f000f000f00ULL)
            | ((r0 & 0xf000f000f000f00ULL) >> 8));
        r0 = ((r0 ^ 0xf000f000f000f00ULL) + r8);
        r8 = (r0 & 0x1010101010101010ULL);
        r0 = ((r0 - r8) + (r8 >> 4));
        r8 = (((r1 << 8) & 0xf000f000f000f00ULL)
            | ((r1 & 0xf000f000f000f00ULL) >> 8));
        r1 = ((r1 ^ 0xf000f000f000f00ULL) + r8);
        r8 = (r1 & 0x1010101010101010ULL);
        r1 = ((r1 - r8) + (r8 >> 4));
        r8 = (((r2 << 8) & 0xf000f000f000f00ULL)
            | ((r2 & 0xf000f000f000f00ULL) >> 8));
        r2 = ((r2 ^ 0xf000f000f000f00ULL) + r8);
        r8 = (r2 & 0x1010101010101010ULL);
        r2 = ((r2 - r8) + (r8 >> 4));
        r8 = (((r3 << 8) & 0xf000f000f000f00ULL)
            | ((r3 & 0xf000f000f000f00ULL) >> 8));
        r3 = ((r3 ^ 0xf000f000f000f00ULL) + r8);
        r8 = (r3 & 0x1010101010101010ULL);
        r3 = ((r3 - r8) + (r8 >> 4));
        r8 = (((r4 << 8) & 0xf000f000f000f00ULL)
            | ((r4 & 0xf000f000f000f00ULL) >> 8));
        r4 = ((r4 ^ 0xf000f000f000f00ULL) + r8);
        r8 = (r4 & 0x1010101010101010ULL);
        r4 = ((r4 - r8) + (r8 >> 4));
        r8 = (((r5 << 8) & 0xf000f000f000f00ULL)
            | ((r5 & 0xf000f000f000f00ULL) >> 8));
        r5 = ((r5 ^ 0xf000f000f000f00ULL) + r8);
        r8 = (r5 & 0x1010101010101010ULL);
        r5 = ((r5 - r8) + (r8 >> 4));
        r8 = (((r6 << 8) & 0xf000f000f000f00ULL)
            | ((r6 & 0xf000f000f000f00ULL) >> 8));
        r6 = ((r6 ^ 0xf000f000f000f00ULL) + r8);
        r8 = (r6 & 0x1010101010101010ULL);
        r6 = ((r6 - r8) + (r8 >> 4));
        r8 = (((r7 << 8) & 0xf000f000f000f00ULL)
            | ((r7 & 0xf000f000f000f00ULL) >> 8));
        r7 = ((r7 ^ 0xf000f000f000f00ULL) + r8);
        r8 = (r7 & 0x1010101010101010ULL);
        r7 = ((r7 - r8) + (r8 >> 4));
        // Vector is now  r(i) for i = 0,1,2,3,4,5,6,7
        // Butterfly: v[i], v[i+4] = v[i]+v[i+4], v[i]-v[i+4]
        r8 = (((r0 << 16) & 0xf0f00000f0f0000ULL)
            | ((r0 & 0xf0f00000f0f0000ULL) >> 16));
        r0 = ((r0 ^ 0xf0f00000f0f0000ULL) + r8);
        r8 = (r0 & 0x1010101010101010ULL);
        r0 = ((r0 - r8) + (r8 >> 4));
        r8 = (((r1 << 16) & 0xf0f00000f0f0000ULL)
            | ((r1 & 0xf0f00000f0f0000ULL) >> 16));
        r1 = ((r1 ^ 0xf0f00000f0f0000ULL) + r8);
        r8 = (r1 & 0x1010101010101010ULL);
        r1 = ((r1 - r8) + (r8 >> 4));
        r8 = (((r2 << 16) & 0xf0f00000f0f0000ULL)
            | ((r2 & 0xf0f00000f0f0000ULL) >> 16));
        r2 = ((r2 ^ 0xf0f00000f0f0000ULL) + r8);
        r8 = (r2 & 0x1010101010101010ULL);
        r2 = ((r2 - r8) + (r8 >> 4));
        r8 = (((r3 << 16) & 0xf0f00000f0f0000ULL)
            | ((r3 & 0xf0f00000f0f0000ULL) >> 16));
        r3 = ((r3 ^ 0xf0f00000f0f0000ULL) + r8);
        r8 = (r3 & 0x1010101010101010ULL);
        r3 = ((r3 - r8) + (r8 >> 4));
        r8 = (((r4 << 16) & 0xf0f00000f0f0000ULL)
            | ((r4 & 0xf0f00000f0f0000ULL) >> 16));
        r4 = ((r4 ^ 0xf0f00000f0f0000ULL) + r8);
        r8 = (r4 & 0x1010101010101010ULL);
        r4 = ((r4 - r8) + (r8 >> 4));
        r8 = (((r5 << 16) & 0xf0f00000f0f0000ULL)
            | ((r5 & 0xf0f00000f0f0000ULL) >> 16));
        r5 = ((r5 ^ 0xf0f00000f0f0000ULL) + r8);
        r8 = (r5 & 0x1010101010101010ULL);
        r5 = ((r5 - r8) + (r8 >> 4));
        r8 = (((r6 << 16) & 0xf0f00000f0f0000ULL)
            | ((r6 & 0xf0f00000f0f0000ULL) >> 16));
        r6 = ((r6 ^ 0xf0f00000f0f0000ULL) + r8);
        r8 = (r6 & 0x1010101010101010ULL);
        r6 = ((r6 - r8) + (r8 >> 4));
        r8 = (((r7 << 16) & 0xf0f00000f0f0000ULL)
            | ((r7 & 0xf0f00000f0f0000ULL) >> 16));
        r7 = ((r7 ^ 0xf0f00000f0f0000ULL) + r8);
        r8 = (r7 & 0x1010101010101010ULL);
        r7 = ((r7 - r8) + (r8 >> 4));
        // Vector is now  r(i) for i = 0,1,2,3,4,5,6,7
        // Butterfly: v[i], v[i+8] = v[i]+v[i+8], v[i]-v[i+8]
        r8 = ((r0 << 32) | (r0 >> 32));
        r0 = ((r0 ^ 0xf0f0f0f00000000ULL) + r8);
        r8 = (r0 & 0x1010101010101010ULL);
        r0 = ((r0 - r8) + (r8 >> 4));
        r8 = ((r1 << 32) | (r1 >> 32));
        r1 = ((r1 ^ 0xf0f0f0f00000000ULL) + r8);
        r8 = (r1 & 0x1010101010101010ULL);
        r1 = ((r1 - r8) + (r8 >> 4));
        r8 = ((r2 << 32) | (r2 >> 32));
        r2 = ((r2 ^ 0xf0f0f0f00000000ULL) + r8);
        r8 = (r2 & 0x1010101010101010ULL);
        r2 = ((r2 - r8) + (r8 >> 4));
        r8 = ((r3 << 32) | (r3 >> 32));
        r3 = ((r3 ^ 0xf0f0f0f00000000ULL) + r8);
        r8 = (r3 & 0x1010101010101010ULL);
        r3 = ((r3 - r8) + (r8 >> 4));
        r8 = ((r4 << 32) | (r4 >> 32));
        r4 = ((r4 ^ 0xf0f0f0f00000000ULL) + r8);
        r8 = (r4 & 0x1010101010101010ULL);
        r4 = ((r4 - r8) + (r8 >> 4));
        r8 = ((r5 << 32) | (r5 >> 32));
        r5 = ((r5 ^ 0xf0f0f0f00000000ULL) + r8);
        r8 = (r5 & 0x1010101010101010ULL);
        r5 = ((r5 - r8) + (r8 >> 4));
        r8 = ((r6 << 32) | (r6 >> 32));
        r6 = ((r6 ^ 0xf0f0f0f00000000ULL) + r8);
        r8 = (r6 & 0x1010101010101010ULL);
        r6 = ((r6 - r8) + (r8 >> 4));
        r8 = ((r7 << 32) | (r7 >> 32));
        r7 = ((r7 ^ 0xf0f0f0f00000000ULL) + r8);
        r8 = (r7 & 0x1010101010101010ULL);
        r7 = ((r7 - r8) + (r8 >> 4));
        // Vector is now  r(i) for i = 0,1,2,3,4,5,6,7
        // From here on the two operands of a butterfly lie in different
        // words, so whole words are added and subtracted.
        // Butterfly: v[i], v[i+16] = v[i]+v[i+16], v[i]-v[i+16]
        r8 = (r0 + (r1 ^ 0xf0f0f0f0f0f0f0fULL));
        r0 = (r0 + r1);
        r1 = (r8 & 0x1010101010101010ULL);
        r1 = ((r8 - r1) + (r1 >> 4));
        r8 = (r0 & 0x1010101010101010ULL);
        r0 = ((r0 - r8) + (r8 >> 4));
        r8 = (r2 + (r3 ^ 0xf0f0f0f0f0f0f0fULL));
        r2 = (r2 + r3);
        r3 = (r8 & 0x1010101010101010ULL);
        r3 = ((r8 - r3) + (r3 >> 4));
        r8 = (r2 & 0x1010101010101010ULL);
        r2 = ((r2 - r8) + (r8 >> 4));
        r8 = (r4 + (r5 ^ 0xf0f0f0f0f0f0f0fULL));
        r4 = (r4 + r5);
        r5 = (r8 & 0x1010101010101010ULL);
        r5 = ((r8 - r5) + (r5 >> 4));
        r8 = (r4 & 0x1010101010101010ULL);
        r4 = ((r4 - r8) + (r8 >> 4));
        r8 = (r6 + (r7 ^ 0xf0f0f0f0f0f0f0fULL));
        r6 = (r6 + r7);
        r7 = (r8 & 0x1010101010101010ULL);
        r7 = ((r8 - r7) + (r7 >> 4));
        r8 = (r6 & 0x1010101010101010ULL);
        r6 = ((r6 - r8) + (r8 >> 4));
        // Vector is now  r(i) for i = 0,1,2,3,4,5,6,7
        // Butterfly: v[i], v[i+32] = v[i]+v[i+32], v[i]-v[i+32]
        r8 = (r0 + (r2 ^ 0xf0f0f0f0f0f0f0fULL));
        r0 = (r0 + r2);
        r2 = (r8 & 0x1010101010101010ULL);
        r2 = ((r8 - r2) + (r2 >> 4));
        r8 = (r0 & 0x1010101010101010ULL);
        r0 = ((r0 - r8) + (r8 >> 4));
        r8 = (r1 + (r3 ^ 0xf0f0f0f0f0f0f0fULL));
        r1 = (r1 + r3);
        r3 = (r8 & 0x1010101010101010ULL);
        r3 = ((r8 - r3) + (r3 >> 4));
        r8 = (r1 & 0x1010101010101010ULL);
        r1 = ((r1 - r8) + (r8 >> 4));
        r8 = (r4 + (r6 ^ 0xf0f0f0f0f0f0f0fULL));
        r4 = (r4 + r6);
        r6 = (r8 & 0x1010101010101010ULL);
        r6 = ((r8 - r6) + (r6 >> 4));
        r8 = (r4 & 0x1010101010101010ULL);
        r4 = ((r4 - r8) + (r8 >> 4));
        r8 = (r5 + (r7 ^ 0xf0f0f0f0f0f0f0fULL));
        r5 = (r5 + r7);
        r7 = (r8 & 0x1010101010101010ULL);
        r7 = ((r8 - r7) + (r7 >> 4));
        r8 = (r5 & 0x1010101010101010ULL);
        r5 = ((r5 - r8) + (r8 >> 4));
        // Vector is now  r(i) for i = 0,1,2,3,4,5,6,7
        // Butterfly: v[i], v[i+64] = v[i]+v[i+64], v[i]-v[i+64]
        r8 = (r0 + (r4 ^ 0xf0f0f0f0f0f0f0fULL));
        r0 = (r0 + r4);
        r4 = (r8 & 0x1010101010101010ULL);
        r4 = ((r8 - r4) + (r4 >> 4));
        r8 = (r0 & 0x1010101010101010ULL);
        r0 = ((r0 - r8) + (r8 >> 4));
        r8 = (r1 + (r5 ^ 0xf0f0f0f0f0f0f0fULL));
        r1 = (r1 + r5);
        r5 = (r8 & 0x1010101010101010ULL);
        r5 = ((r8 - r5) + (r5 >> 4));
        r8 = (r1 & 0x1010101010101010ULL);
        r1 = ((r1 - r8) + (r8 >> 4));
        r8 = (r2 + (r6 ^ 0xf0f0f0f0f0f0f0fULL));
        r2 = (r2 + r6);
        r6 = (r8 & 0x1010101010101010ULL);
        r6 = ((r8 - r6) + (r6 >> 4));
        r8 = (r2 & 0x1010101010101010ULL);
        r2 = ((r2 - r8) + (r8 >> 4));
        r8 = (r3 + (r7 ^ 0xf0f0f0f0f0f0f0fULL));
        r3 = (r3 + r7);
        r7 = (r8 & 0x1010101010101010ULL);
        r7 = ((r8 - r7) + (r7 >> 4));
        r8 = (r3 & 0x1010101010101010ULL);
        r3 = ((r3 - r8) + (r8 >> 4));
        // Vector is now  r(i) for i = 0,1,2,3,4,5,6,7
        // Reverse expansion for Hadamard operation
        r0 ^= (r4 << 4);
        r1 ^= (r5 << 4);
        r2 ^= (r6 << 4);
        r3 ^= (r7 << 4);
        // Vector is now  r(i) for i = 0,1,2,3
        // Multiply vector by scalar 2**-3 mod 15
        // Since 2**4 == 1 (mod 15), 2**-3 equals 2; the multiplication
        // is done by rotating each 4-bit field left by one bit.
        r0 = (((r0 & 0x7777777777777777ULL) << 1)
            | ((r0 & 0x8888888888888888ULL) >> 3));
        r1 = (((r1 & 0x7777777777777777ULL) << 1)
            | ((r1 & 0x8888888888888888ULL) >> 3));
        r2 = (((r2 & 0x7777777777777777ULL) << 1)
            | ((r2 & 0x8888888888888888ULL) >> 3));
        r3 = (((r3 & 0x7777777777777777ULL) << 1)
            | ((r3 & 0x8888888888888888ULL) >> 3));
        // Storing vector v to array v_out; multiply v
        // with diagonal matrix if exp1 == -1.
        v_out[0] = r0 ^ ((exp1) & 0xf0fff0ffffff0ULL);
        v_out[1] = r1 ^ ((exp1) & 0xf000000f000f0fffULL);
        v_out[2] = r2 ^ ((exp1) & 0xf000000f000f0fffULL);
        v_out[3] = r3 ^ ((exp1) & 0xfff0f000f000000fULL);
        exp1 = ~(exp1);
        // 208 lines of code, 492 operations
        }
        // End of automatically generated matrix operation.
 
        v_in += 4;
        v_out += 4;
    }

    // Do tags X, Y, and Z
    // v_in and v_out now point just behind the tag T part. The three
    // following areas of 4096 words each are permuted cyclically; the
    // direction of the cycle depends on exp1. According to the mapping
    // comments below, the area at offset 0 holds tag X, the area at
    // offset 4096 holds tag Z and the area at offset 8192 holds tag Y.
    {
         uint_mmv_t *pXYin, *pYZin, *pZXin;
         uint_mmv_t *pXYout, *pYZout, *pZXout;
         if (exp1 == 0) {
             // exp == 1: X -> Y, Y -> Z, Z -> X
             pXYin = v_in; 
             pXYout = v_out + 8192;  
             pYZin = v_in + 8192; 
             pYZout = v_out + 4096;  
             pZXin = v_in + 4096; 
             pZXout = v_out; 
         } else {
             // exp == 2: Y -> X, Z -> Y, X -> Z
             pXYout = v_out; 
             pXYin = v_in + 8192;  
             pYZout = v_out + 8192; 
             pYZin = v_in + 4096;  
             pZXout = v_out + 4096; 
             pZXin = v_in; 
         }

         // Map X to Y for t and Y to X for t**2
         // NOTE(review): mm15_neg_scalprod_d_i() is declared elsewhere;
         // judging by its name it applies sign changes in place to the
         // area just written - confirm against its definition.
         for (i = 0; i < 4096; ++i) pXYout[i] = pXYin[i];
         mm15_neg_scalprod_d_i(pXYout);
         
         // Map Y to Z for t and Z to Y for t**2
         invert15_xyz(pYZin, pYZout);
         mm15_neg_scalprod_d_i(pYZout);

         // Map Z to X for t and X to Z for t**2
         invert15_xyz(pZXin, pZXout);
    }
}




// %%EXPORT px
// Compute only the tag A part of v_in * t**exp for a vector of integers
// mod 15, where t is the triality element.
//
// v_in  : input vector; the parts with tags A, B and C are read from
//         word offsets 0, 48 and 96 (48 words each).
// exp   : exponent of t; it is reduced modulo 3 before use.
// v_out : output; exactly 48 words (the tag A part) are written.
//
// For exp % 3 == 0 the tag A part of v_in is copied. Otherwise every field
// of tag A is set to (B + C) / 2 for exp % 3 == 1 and to (B - C) / 2 for
// exp % 3 == 2 (mod 15), after which the diagonal fields of A are restored
// from v_in and the slack bits of each row are cleared.
void mm_op15_t_A(uint_mmv_t *v_in,  uint32_t exp, uint_mmv_t *v_out)
{
    uint_mmv_t k, pos, sign;

    exp %= 3;
    if (exp == 0) {
        // t**0 is the identity on tag A
        for (k = 0; k < 48; ++k) v_out[k] = v_in[k];
        return;
    }

    // 0 for exp == 1, all ones for exp == 2. XORing a 4-bit field with
    // 0xf maps x to 15 - x, i.e. negates it modulo 15.
    sign = 0x1ULL - (uint_mmv_t)exp;

    for (k = 0; k < 48; ++k) {
        uint_mmv_t b_word, c_word, even_sum, odd_sum, carry, merged;

        b_word = v_in[k + 48];
        c_word = v_in[k + 96] ^ (sign & 0xffffffffffffffffULL);

        // Add the odd-numbered 4-bit fields, shifted down so that every
        // field has room for a carry; then fold the carry (bit 4) back
        // into bit 0, which is valid since 16 == 1 (mod 15).
        odd_sum = ((b_word >> 4) & 0xf0f0f0f0f0f0f0fULL)
                + ((c_word >> 4) & 0xf0f0f0f0f0f0f0fULL);
        carry = odd_sum & 0x1010101010101010ULL;
        odd_sum = (odd_sum - carry) + (carry >> 4);

        // Same for the even-numbered fields
        even_sum = (b_word & 0xf0f0f0f0f0f0f0fULL)
                 + (c_word & 0xf0f0f0f0f0f0f0fULL);
        carry = even_sum & 0x1010101010101010ULL;
        even_sum = (even_sum - carry) + (carry >> 4);

        // Interleave both halves again, then rotate every field right by
        // one bit, which divides it by 2 modulo 15.
        merged = even_sum ^ (odd_sum << 4);
        v_out[k] = ((merged & 0x1111111111111111ULL) << 3)
                 | ((merged & 0xeeeeeeeeeeeeeeeeULL) >> 1);
    }

    // Diagonal of tag A: row k occupies words 2*k and 2*k+1, and its
    // diagonal field lies in word 2*k + (k >> 4) at bit position 4*k mod 64.
    for (k = 0; k < 24; ++k) {
        const uint_mmv_t diag = 0xfULL << ((k << 2) & 63);

        // Take the diagonal field unchanged from the input
        pos = (k << 1) + (k >> 4);
        v_out[pos] = (v_in[pos] & diag) | (v_out[pos] & ~diag);

        // Clear the high 32 bits of the second word of the row
        pos = ((k + 1) << 1) - 1;
        v_out[pos] &= 0xffffffffULL;
    }
}