NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=Example8 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=in Channels=56 Height=29 Width=20
Conv FromTensor=in ToTensor=out ToChannels=244 FilterH=5 FilterW=5 StrideH=2 StrideW=2 PaddingH=3 PaddingW=2 DilationH=1 DilationW=1 Groups=4
Output FromTensor=out

Top || Output Example8.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(Example8Params);
// Example8Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "r");
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

typedef struct Example8Params Example8Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// Example8Params* params = malloc(sizeof(Example8Params));
//
// ... Load params (read from a file, perhaps) ...
//
// Example8Net* net; // For example, 4 threads:
// char* err = Example8NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// Example8NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

typedef struct Example8Net Example8Net;

char* Example8NetCreate(
Example8Net**,
Example8Params*,
ptrdiff_t threads
);

void Example8NetDestroy(Example8Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// Example8Net* net;
//
// ... Create net ...
//
// Example8Engine* engine; // For example, 4 inference threads:
// char* err = Example8EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// Example8EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = Example8EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* inData = malloc(sizeof(float)*56*29*20);
// float* outData = malloc(sizeof(float)*244*16*10);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// Example8EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// inData, // The tensor arguments are sorted by name.
// outData
// );
//
// ... Read the output floats ...
//
// }
//
// free(inData);
// free(outData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

typedef struct Example8Engine Example8Engine;

char* Example8EngineCreate(
Example8Engine**,
Example8Net*,
ptrdiff_t threads
);

char* Example8EnginePthreadT(
Example8Engine*,
ptrdiff_t threadIdx,
pthread_t* to
);

void Example8EngineInference(
Example8Engine*,
float* inData,
float* outData
);

void Example8EngineDestroy(Example8Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

struct Example8Params {
float outBiases[244]; // 1x244x1x1
float outWeights[85400]; // 244x14x5x5
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output Example8.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f Example8.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "Example8.h"

static char* Example8Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
char* msg1 = malloc(276);
int step1 = sprintf(msg1, "Example8: line %td: ", lineNum1);
va_list ap1;
va_start(ap1, format1);
vsnprintf(msg1+step1, 276-step1, format1, ap1);
va_end(ap1);
return msg1;
}

typedef struct Example8ThreaderTask1 Example8ThreaderTask1;
typedef void (*Example8ThreaderCallee1)(Example8ThreaderTask1*, int64_t*);
typedef struct Example8ThreaderHub1 Example8ThreaderHub1;
typedef struct Example8ThreaderNode1 Example8ThreaderNode1;
typedef struct Example8ThreaderUnwind1 Example8ThreaderUnwind1;
typedef struct Example8ThreaderTeam1 Example8ThreaderTeam1;

struct Example8ThreaderTask1 {
Example8ThreaderCallee1 callee1;
void* any1;
ptrdiff_t nd1;
int64_t hull1[4];
};

struct Example8ThreaderHub1 {
pthread_mutex_t mut1;
pthread_cond_t cond1;
ptrdiff_t pending1;
ptrdiff_t offset1;
long mask1;
long status1[];
};

struct Example8ThreaderNode1 {
pthread_mutex_t mut2;
int64_t np1;
int64_t pt1[4];
Example8ThreaderTask1* task1;
pthread_cond_t cond2;
Example8ThreaderTeam1* team1;
pthread_t thr1;
} __attribute__((aligned(64)));

struct Example8ThreaderUnwind1 {
ptrdiff_t join1;
ptrdiff_t nodeConds1;
ptrdiff_t nodeMuts1;
ptrdiff_t hubCond1;
ptrdiff_t hubMut1;
void* nodes1;
void* hub1;
};

struct Example8ThreaderTeam1 {
ptrdiff_t nt1;
Example8ThreaderHub1* hub2;
Example8ThreaderNode1* nodes2;
Example8ThreaderUnwind1 unwind1;
};

static void Example8ThreaderInc1(
ptrdiff_t nd2,
int64_t*restrict hull2,
int64_t*restrict pt2
) {
for (ptrdiff_t i1 = 0; i1 < nd2; ++i1) {
int64_t elem1 = pt2[i1];
if (++elem1 == hull2[i1]) {
pt2[i1] = 0;
} else {
pt2[i1] = elem1;
break;
}
}
}

static void Example8ThreaderPut1(
ptrdiff_t nd3,
int64_t*restrict hull3,
int64_t*restrict pt3,
int64_t val1
) {
ptrdiff_t i2 = 0;
for (; i2 < nd3 && val1; ) {
int64_t wrap1 = hull3[i2];
int64_t carry1 = val1/wrap1;
pt3[i2++] = val1-carry1*wrap1;
val1 = carry1;
}
for (; i2 < nd3; pt3[i2++] = 0);
}

static void Example8ThreaderAdd1(
ptrdiff_t nd4,
int64_t*restrict hull4,
int64_t*restrict pt4,
int64_t*restrict plus1,
int64_t carry2
) {
for (ptrdiff_t i3 = 0; i3 < nd4; ++i3) {
int64_t wrap2 = hull4[i3];
int64_t sum1 = pt4[i3]+plus1[i3]+carry2;
if (sum1 < wrap2) {
pt4[i3] = sum1;
carry2 = 0;
} else {
pt4[i3] = sum1-wrap2;
carry2 = 1;
}
}
}

static void* Example8ThreaderMain1(void* arg1) {
Example8ThreaderNode1* node1 = arg1;
Example8ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
Example8ThreaderHub1* hub3 = team2->hub2;
Example8ThreaderNode1* nodes3 = team2->nodes2;
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
for (; ; ) {
Example8ThreaderTask1* task2 = node1->task1;
if (!task2) {
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
node1->task1 = 0;
Example8ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
Example8ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
++offset2;
mask2 = -1;
continue;
}
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
hand1 &= -hand1;
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
Example8ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
Example8ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

static void Example8ThreaderDestroy1(Example8ThreaderTeam1* team3) {
if (!team3) return;
Example8ThreaderNode1* nodes4 = team3->nodes2;
Example8ThreaderNode1* stop1 = nodes4+team3->unwind1.join1;
for (Example8ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_mutex_lock(&node3->mut2), 0); );
node3->np1 = -1;
node3->task1 = (Example8ThreaderTask1*)1;
for (; __builtin_expect(pthread_mutex_unlock(&node3->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node3->cond2), 0); );
}
for (Example8ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_join(node3->thr1, 0), 0); );
}
stop1 = nodes4+team3->unwind1.nodeConds1;
for (Example8ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_cond_destroy(&node3->cond2), 0); );
}
stop1 = nodes4+team3->unwind1.nodeMuts1;
for (Example8ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_mutex_destroy(&node3->mut2), 0); );
}
Example8ThreaderHub1* hub4 = team3->hub2;
if (team3->unwind1.hubCond1) {
for (; __builtin_expect(pthread_cond_destroy(&hub4->cond1), 0); );
}
if (team3->unwind1.hubMut1) {
for (; __builtin_expect(pthread_mutex_destroy(&hub4->mut1), 0); );
}
free(team3->unwind1.nodes1);
free(team3->unwind1.hub1);
free(team3);
}

static char* Example8ThreaderCreate1Up4(Example8ThreaderTeam1* team8, ptrdiff_t nt7) {
Example8ThreaderNode1* nodes5 = team8->nodes2;
for (Example8ThreaderNode1* node4 = nodes5; node4 != nodes5+nt7; ++node4) {
int err2 = pthread_mutex_init(&node4->mut2, 0);
if (__builtin_expect(err2, 0)) {
char* msg2 = Example8Errmsg1(__LINE__, "errno %d", err2);
team8->unwind1.nodeMuts1 = node4-nodes5;
team8->unwind1.nodeConds1 = node4-nodes5;
team8->unwind1.join1 = node4-nodes5;
return msg2;
}
node4->task1 = 0;
int err3 = pthread_cond_init(&node4->cond2, 0);
if (__builtin_expect(err3, 0)) {
char* msg3 = Example8Errmsg1(__LINE__, "errno %d", err3);
team8->unwind1.nodeMuts1 = node4-nodes5+1;
team8->unwind1.nodeConds1 = node4-nodes5;
team8->unwind1.join1 = node4-nodes5;
return msg3;
}
node4->team1 = team8;
int err4 = pthread_create(&node4->thr1, 0, Example8ThreaderMain1, node4);
if (__builtin_expect(err4, 0)) {
char* msg4 = Example8Errmsg1(__LINE__, "errno %d", err4);
team8->unwind1.nodeMuts1 = node4-nodes5+1;
team8->unwind1.nodeConds1 = node4-nodes5+1;
team8->unwind1.join1 = node4-nodes5;
return msg4;
}
}
team8->unwind1.nodeMuts1 = nt7;
team8->unwind1.nodeConds1 = nt7;
team8->unwind1.join1 = nt7;
return 0;
}

static char* Example8ThreaderCreate1Up3(Example8ThreaderTeam1* team7, ptrdiff_t nt6) {
Example8ThreaderHub1* hub5 = team7->hub2;
int err5 = pthread_mutex_init(&hub5->mut1, 0);
if (__builtin_expect(err5, 0)) {
return Example8Errmsg1(__LINE__, "errno %d", err5);
}
team7->unwind1.hubMut1 = 1;
int err6 = pthread_cond_init(&hub5->cond1, 0);
if (__builtin_expect(err6, 0)) {
return Example8Errmsg1(__LINE__, "errno %d", err6);
}
team7->unwind1.hubCond1 = 1;
return Example8ThreaderCreate1Up4(team7, nt6);
}

static char* Example8ThreaderCreate1Up2(Example8ThreaderTeam1* team6, ptrdiff_t nt5) {
size_t size2 = nt5*sizeof(Example8ThreaderNode1);
if (__builtin_expect(size2/sizeof(Example8ThreaderNode1) != (size_t)nt5, 0)) {
return Example8Errmsg1(__LINE__, "too many threads");
}
void* addr3 = malloc(size2+63);
if (__builtin_expect(!addr3, 0)) {
return Example8Errmsg1(__LINE__, "errno %d", errno);
}
team6->unwind1.nodes1 = addr3;
team6->nodes2 = (void*)(((size_t)addr3+63)&-64);
return Example8ThreaderCreate1Up3(team6, nt5);
}

static char* Example8ThreaderCreate1Up1(Example8ThreaderTeam1* team5, ptrdiff_t nt4) {
team5->nt1 = nt4;
size_t size1 = sizeof(Example8ThreaderHub1);
size1 += sizeof(long)*((size_t)nt4/(sizeof(long)*8)+1);
size1 = (size1+63)&-64;
void* addr2 = malloc(size1+63);
if (__builtin_expect(!addr2, 0)) {
return Example8Errmsg1(__LINE__, "errno %d", errno);
}
team5->unwind1.hub1 = addr2;
team5->hub2 = (void*)(((size_t)addr2+63)&-64);
return Example8ThreaderCreate1Up2(team5, nt4);
}

static char* Example8ThreaderCreate1(Example8ThreaderTeam1** team4, ptrdiff_t nt3) {
if (__builtin_expect(nt3 < 1, 0)) {
return Example8Errmsg1(__LINE__, "too few threads");
}
void* addr1 = calloc(1, sizeof(Example8ThreaderTeam1));
if (__builtin_expect(!addr1, 0)) {
return Example8Errmsg1(__LINE__, "errno %d", errno);
}
char* err1 = Example8ThreaderCreate1Up1(addr1, nt3);
if (__builtin_expect(!!err1, 0)) {
Example8ThreaderDestroy1(addr1);
} else {
*team4 = addr1;
}
return err1;
}

static char* Example8ThreaderPthreadT1(
pthread_t* thr2,
Example8ThreaderTeam1* team9,
ptrdiff_t idx1
) {
if (__builtin_expect(idx1 < 0 || idx1 >= team9->nt1, 0)) {
return Example8Errmsg1(__LINE__, "bad thread idx");
}
*thr2 = team9->nodes2[idx1].thr1;
return 0;
}

static void Example8ThreaderDo1(Example8ThreaderTeam1* team10, Example8ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return;
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
int64_t plus2[4];
Example8ThreaderPut1(nd6, task3->hull1, plus2, each1);
int64_t pt6[4] = {0};
Example8ThreaderHub1* hub6 = team10->hub2;
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
Example8ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
int64_t carry3 = i4 < more1;
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
Example8ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

static __m512 Example8Exp1(__m512 x1) {
x1 = _mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f));
x1 = _mm512_min_ps(x1, _mm512_set1_ps(8.872284e+01f));
__m512 t1 = _mm512_mul_ps(x1, _mm512_set1_ps(1.442695e+00f));
__m512 r1 = _mm512_roundscale_ps(t1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m512 f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-6.9314575e-01f), x1);
f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-1.4286068e-06f), f1);
__m512 g1 = _mm512_set1_ps(4.194439e-02f);
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(1.6800667e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(4.9999994e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.999569e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.9999964e-01f));
__m512i y1 = _mm512_slli_epi32(_mm512_cvtps_epi32(t1), 23);
return _mm512_castsi512_ps(_mm512_add_epi32(y1, _mm512_castps_si512(g1)));
}

static __m512 Example8Rsqrt1(__m512 x2) {
__m512 y2 = _mm512_rsqrt14_ps(x2);
__m512 z1 = _mm512_mul_ps(x2, y2);
__m512 a1 = _mm512_mul_ps(y2, _mm512_set1_ps(5e-01f));
__m512 b1 = _mm512_fnmadd_ps(y2, z1, _mm512_set1_ps(3e+00f));
return _mm512_mul_ps(a1, b1);
}

static void Example8StriderArrangeFilts1Callee1(Example8ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
ptrdiff_t b2 = pt7[0];
ptrdiff_t g2 = pt7[1];
ptrdiff_t e1 = 0;
char*restrict bfPtr1 = tensors2[2]+992*e1;
char*restrict wfPtr1 = tensors2[2]+1024+50282496*e1;
char*restrict wtPtr1 = tensors2[0]+39600*e1;
char*restrict biasPtr1 = tensors2[1];
ptrdiff_t i5 = 1*g2;
ptrdiff_t j1 = 5*b2;
ptrdiff_t jj1 = j1+(b2 < 5 ? 4 : 5);
if (j1 < 30) {
for (; j1 != 30; ++j1) {
for (ptrdiff_t k1 = 0; k1 < 14; ++k1) {
__m512 wt1 = _mm512_maskz_loadu_ps(31, wtPtr1+0+85400*i5+2800*j1+100*k1);
__m512 wt2 = _mm512_maskz_loadu_ps(31, wtPtr1+20+85400*i5+2800*j1+100*k1);
__m512 wt3 = _mm512_maskz_loadu_ps(31, wtPtr1+40+85400*i5+2800*j1+100*k1);
__m512 wt4 = _mm512_maskz_loadu_ps(31, wtPtr1+60+85400*i5+2800*j1+100*k1);
__m512 wt5 = _mm512_maskz_loadu_ps(31, wtPtr1+80+85400*i5+2800*j1+100*k1);
__m512 fft1 = _mm512_add_ps(wt1, _mm512_setzero_ps());
__m512 fft89 = _mm512_add_ps(wt2, _mm512_setzero_ps());
__m512 fft2 = _mm512_sub_ps(wt1, _mm512_setzero_ps());
__m512 fft90 = _mm512_sub_ps(wt2, _mm512_setzero_ps());
__m512 fft3 = _mm512_add_ps(wt3, _mm512_setzero_ps());
__m512 fft91 = _mm512_add_ps(wt4, _mm512_setzero_ps());
__m512 fft4 = _mm512_sub_ps(wt3, _mm512_setzero_ps());
__m512 fft92 = _mm512_sub_ps(wt4, _mm512_setzero_ps());
__m512 fft5 = _mm512_add_ps(wt5, _mm512_setzero_ps());
__m512 fft93 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft6 = _mm512_sub_ps(wt5, _mm512_setzero_ps());
__m512 fft94 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft95 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft8 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft96 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9 = _mm512_add_ps(fft1, fft5);
__m512 fft97 = _mm512_add_ps(fft89, fft93);
__m512 fft10 = _mm512_sub_ps(fft1, fft5);
__m512 fft98 = _mm512_sub_ps(fft89, fft93);
__m512 fft11 = _mm512_add_ps(fft3, fft7);
__m512 fft99 = _mm512_add_ps(fft91, fft95);
__m512 fft12 = _mm512_sub_ps(fft7, fft3);
__m512 fft100 = _mm512_sub_ps(fft95, fft91);
__m512 fft13 = _mm512_sub_ps(fft4, fft8);
__m512 fft101 = _mm512_sub_ps(fft92, fft96);
__m512 fft14 = _mm512_add_ps(fft4, fft8);
__m512 fft102 = _mm512_add_ps(fft92, fft96);
__m512 fft15 = _mm512_add_ps(fft9, fft11);
__m512 fft103 = _mm512_add_ps(fft97, fft99);
__m512 fft16 = _mm512_sub_ps(fft9, fft11);
__m512 fft104 = _mm512_sub_ps(fft97, fft99);
__m512 fft17 = _mm512_fmadd_ps(fft13, _mm512_set1_ps(7.0710677e-01f), fft2);
__m512 fft105 = _mm512_fmadd_ps(fft101, _mm512_set1_ps(7.0710677e-01f), fft90);
__m512 fft18 = _mm512_fnmsub_ps(fft14, _mm512_set1_ps(7.0710677e-01f), fft6);
__m512 fft106 = _mm512_fnmsub_ps(fft102, _mm512_set1_ps(7.0710677e-01f), fft94);
__m512 fft19 = _mm512_fnmadd_ps(fft13, _mm512_set1_ps(7.0710677e-01f), fft2);
__m512 fft107 = _mm512_fnmadd_ps(fft101, _mm512_set1_ps(7.0710677e-01f), fft90);
__m512 fft20 = _mm512_fnmadd_ps(fft14, _mm512_set1_ps(7.0710677e-01f), fft6);
__m512 fft108 = _mm512_fnmadd_ps(fft102, _mm512_set1_ps(7.0710677e-01f), fft94);
__m512 fft21 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft22 = _mm512_fmadd_ps(fft15, fft21, _mm512_shuffle_f32x4(fft15, fft15, 78));
__m512 fft109 = _mm512_fmadd_ps(fft103, fft21, _mm512_shuffle_f32x4(fft103, fft103, 78));
__m512 fft23 = _mm512_fmadd_ps(fft16, fft21, _mm512_shuffle_f32x4(fft16, fft16, 78));
__m512 fft110 = _mm512_fmadd_ps(fft104, fft21, _mm512_shuffle_f32x4(fft104, fft104, 78));
__m512 fft24 = _mm512_fmadd_ps(fft17, fft21, _mm512_shuffle_f32x4(fft17, fft17, 78));
__m512 fft111 = _mm512_fmadd_ps(fft105, fft21, _mm512_shuffle_f32x4(fft105, fft105, 78));
__m512 fft25 = _mm512_fmadd_ps(fft18, fft21, _mm512_shuffle_f32x4(fft18, fft18, 78));
__m512 fft112 = _mm512_fmadd_ps(fft106, fft21, _mm512_shuffle_f32x4(fft106, fft106, 78));
__m512 fft26 = _mm512_fmadd_ps(fft10, fft21, _mm512_shuffle_f32x4(fft10, fft10, 78));
__m512 fft113 = _mm512_fmadd_ps(fft98, fft21, _mm512_shuffle_f32x4(fft98, fft98, 78));
__m512 fft27 = _mm512_fmadd_ps(fft12, fft21, _mm512_shuffle_f32x4(fft12, fft12, 78));
__m512 fft114 = _mm512_fmadd_ps(fft100, fft21, _mm512_shuffle_f32x4(fft100, fft100, 78));
__m512 fft28 = _mm512_fmadd_ps(fft19, fft21, _mm512_shuffle_f32x4(fft19, fft19, 78));
__m512 fft115 = _mm512_fmadd_ps(fft107, fft21, _mm512_shuffle_f32x4(fft107, fft107, 78));
__m512 fft29 = _mm512_fmadd_ps(fft20, fft21, _mm512_shuffle_f32x4(fft20, fft20, 78));
__m512 fft116 = _mm512_fmadd_ps(fft108, fft21, _mm512_shuffle_f32x4(fft108, fft108, 78));
__m512 fft30 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft31 = _mm512_mul_ps(fft22, fft30);
__m512 fft117 = _mm512_mul_ps(fft109, fft30);
__m512 fft32 = _mm512_mul_ps(fft23, fft30);
__m512 fft118 = _mm512_mul_ps(fft110, fft30);
__m512 fft33 = _mm512_mul_ps(fft24, fft30);
__m512 fft119 = _mm512_mul_ps(fft111, fft30);
__m512 fft34 = _mm512_mul_ps(fft25, fft30);
__m512 fft120 = _mm512_mul_ps(fft112, fft30);
__m512 fft35 = _mm512_mul_ps(fft26, fft30);
__m512 fft121 = _mm512_mul_ps(fft113, fft30);
__m512 fft36 = _mm512_mul_ps(fft27, fft30);
__m512 fft122 = _mm512_mul_ps(fft114, fft30);
__m512 fft37 = _mm512_mul_ps(fft28, fft30);
__m512 fft123 = _mm512_mul_ps(fft115, fft30);
__m512 fft38 = _mm512_mul_ps(fft29, fft30);
__m512 fft124 = _mm512_mul_ps(fft116, fft30);
__m512 fft39 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft40 = _mm512_fmadd_ps(fft23, fft39, fft31);
__m512 fft125 = _mm512_fmadd_ps(fft110, fft39, fft117);
__m512 fft41 = _mm512_fnmadd_ps(fft22, fft39, fft32);
__m512 fft126 = _mm512_fnmadd_ps(fft109, fft39, fft118);
__m512 fft42 = _mm512_fmadd_ps(fft25, fft39, fft33);
__m512 fft127 = _mm512_fmadd_ps(fft112, fft39, fft119);
__m512 fft43 = _mm512_fnmadd_ps(fft24, fft39, fft34);
__m512 fft128 = _mm512_fnmadd_ps(fft111, fft39, fft120);
__m512 fft44 = _mm512_fmadd_ps(fft27, fft39, fft35);
__m512 fft129 = _mm512_fmadd_ps(fft114, fft39, fft121);
__m512 fft45 = _mm512_fnmadd_ps(fft26, fft39, fft36);
__m512 fft130 = _mm512_fnmadd_ps(fft113, fft39, fft122);
__m512 fft46 = _mm512_fmadd_ps(fft29, fft39, fft37);
__m512 fft131 = _mm512_fmadd_ps(fft116, fft39, fft123);
__m512 fft47 = _mm512_fnmadd_ps(fft28, fft39, fft38);
__m512 fft132 = _mm512_fnmadd_ps(fft115, fft39, fft124);
__m512 fft48 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft49 = _mm512_fmadd_ps(fft40, fft48, _mm512_shuffle_f32x4(fft40, fft40, 177));
__m512 fft133 = _mm512_fmadd_ps(fft125, fft48, _mm512_shuffle_f32x4(fft125, fft125, 177));
__m512 fft50 = _mm512_fmadd_ps(fft41, fft48, _mm512_shuffle_f32x4(fft41, fft41, 177));
__m512 fft134 = _mm512_fmadd_ps(fft126, fft48, _mm512_shuffle_f32x4(fft126, fft126, 177));
__m512 fft51 = _mm512_fmadd_ps(fft42, fft48, _mm512_shuffle_f32x4(fft42, fft42, 177));
__m512 fft135 = _mm512_fmadd_ps(fft127, fft48, _mm512_shuffle_f32x4(fft127, fft127, 177));
__m512 fft52 = _mm512_fmadd_ps(fft43, fft48, _mm512_shuffle_f32x4(fft43, fft43, 177));
__m512 fft136 = _mm512_fmadd_ps(fft128, fft48, _mm512_shuffle_f32x4(fft128, fft128, 177));
__m512 fft53 = _mm512_fmadd_ps(fft44, fft48, _mm512_shuffle_f32x4(fft44, fft44, 177));
__m512 fft137 = _mm512_fmadd_ps(fft129, fft48, _mm512_shuffle_f32x4(fft129, fft129, 177));
__m512 fft54 = _mm512_fmadd_ps(fft45, fft48, _mm512_shuffle_f32x4(fft45, fft45, 177));
__m512 fft138 = _mm512_fmadd_ps(fft130, fft48, _mm512_shuffle_f32x4(fft130, fft130, 177));
__m512 fft55 = _mm512_fmadd_ps(fft46, fft48, _mm512_shuffle_f32x4(fft46, fft46, 177));
__m512 fft139 = _mm512_fmadd_ps(fft131, fft48, _mm512_shuffle_f32x4(fft131, fft131, 177));
__m512 fft56 = _mm512_fmadd_ps(fft47, fft48, _mm512_shuffle_f32x4(fft47, fft47, 177));
__m512 fft140 = _mm512_fmadd_ps(fft132, fft48, _mm512_shuffle_f32x4(fft132, fft132, 177));
__m512 fft57 = _mm512_mask_mov_ps(fft49, 49344, fft50);
__m512 fft141 = _mm512_mask_mov_ps(fft133, 49344, fft134);
__m512 fft58 = _mm512_mask_sub_ps(fft50, 49344, _mm512_setzero_ps(), fft49);
__m512 fft142 = _mm512_mask_sub_ps(fft134, 49344, _mm512_setzero_ps(), fft133);
__m512 fft59 = _mm512_mask_mov_ps(fft51, 49344, fft52);
__m512 fft143 = _mm512_mask_mov_ps(fft135, 49344, fft136);
__m512 fft60 = _mm512_mask_sub_ps(fft52, 49344, _mm512_setzero_ps(), fft51);
__m512 fft144 = _mm512_mask_sub_ps(fft136, 49344, _mm512_setzero_ps(), fft135);
__m512 fft61 = _mm512_mask_mov_ps(fft53, 49344, fft54);
__m512 fft145 = _mm512_mask_mov_ps(fft137, 49344, fft138);
__m512 fft62 = _mm512_mask_sub_ps(fft54, 49344, _mm512_setzero_ps(), fft53);
__m512 fft146 = _mm512_mask_sub_ps(fft138, 49344, _mm512_setzero_ps(), fft137);
__m512 fft63 = _mm512_mask_mov_ps(fft55, 49344, fft56);
__m512 fft147 = _mm512_mask_mov_ps(fft139, 49344, fft140);
__m512 fft64 = _mm512_mask_sub_ps(fft56, 49344, _mm512_setzero_ps(), fft55);
__m512 fft148 = _mm512_mask_sub_ps(fft140, 49344, _mm512_setzero_ps(), fft139);
__m512 fft65 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft66 = _mm512_fmadd_ps(fft57, fft65, _mm512_shuffle_ps(fft57, fft57, 78));
__m512 fft149 = _mm512_fmadd_ps(fft141, fft65, _mm512_shuffle_ps(fft141, fft141, 78));
__m512 fft67 = _mm512_fmadd_ps(fft58, fft65, _mm512_shuffle_ps(fft58, fft58, 78));
__m512 fft150 = _mm512_fmadd_ps(fft142, fft65, _mm512_shuffle_ps(fft142, fft142, 78));
__m512 fft68 = _mm512_fmadd_ps(fft59, fft65, _mm512_shuffle_ps(fft59, fft59, 78));
__m512 fft151 = _mm512_fmadd_ps(fft143, fft65, _mm512_shuffle_ps(fft143, fft143, 78));
__m512 fft69 = _mm512_fmadd_ps(fft60, fft65, _mm512_shuffle_ps(fft60, fft60, 78));
__m512 fft152 = _mm512_fmadd_ps(fft144, fft65, _mm512_shuffle_ps(fft144, fft144, 78));
__m512 fft70 = _mm512_fmadd_ps(fft61, fft65, _mm512_shuffle_ps(fft61, fft61, 78));
__m512 fft153 = _mm512_fmadd_ps(fft145, fft65, _mm512_shuffle_ps(fft145, fft145, 78));
__m512 fft71 = _mm512_fmadd_ps(fft62, fft65, _mm512_shuffle_ps(fft62, fft62, 78));
__m512 fft154 = _mm512_fmadd_ps(fft146, fft65, _mm512_shuffle_ps(fft146, fft146, 78));
__m512 fft72 = _mm512_fmadd_ps(fft63, fft65, _mm512_shuffle_ps(fft63, fft63, 78));
__m512 fft155 = _mm512_fmadd_ps(fft147, fft65, _mm512_shuffle_ps(fft147, fft147, 78));
__m512 fft73 = _mm512_fmadd_ps(fft64, fft65, _mm512_shuffle_ps(fft64, fft64, 78));
__m512 fft156 = _mm512_fmadd_ps(fft148, fft65, _mm512_shuffle_ps(fft148, fft148, 78));
__m512i fft74 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft75 = _mm512_permutexvar_ps(fft74, fft66);
__m512 fft157 = _mm512_permutexvar_ps(fft74, fft149);
__m512i fft76 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft77 = _mm512_permutexvar_ps(fft76, fft66);
__m512 fft158 = _mm512_permutexvar_ps(fft76, fft149);
__m512 fft78 = _mm512_permutexvar_ps(fft74, fft67);
__m512 fft159 = _mm512_permutexvar_ps(fft74, fft150);
__m512 fft79 = _mm512_permutexvar_ps(fft76, fft67);
__m512 fft160 = _mm512_permutexvar_ps(fft76, fft150);
__m512 fft80 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft81 = _mm512_fmadd_ps(fft75, fft80, fft77);
__m512 fft161 = _mm512_fmadd_ps(fft157, fft80, fft158);
__m512 fft82 = _mm512_fnmadd_ps(fft79, fft80, fft78);
__m512 fft162 = _mm512_fnmadd_ps(fft160, fft80, fft159);
__m512 fft83 = _mm512_mask_mov_ps(fft79, 21845, fft81);
__m512 fft163 = _mm512_mask_mov_ps(fft160, 21845, fft161);
__m512 fft84 = _mm512_mask_mov_ps(fft75, 43176, fft81);
__m512 fft164 = _mm512_mask_mov_ps(fft157, 43176, fft161);
__m512 fft85 = _mm512_mask_mov_ps(fft83, 43176, fft82);
__m512 fft165 = _mm512_mask_mov_ps(fft163, 43176, fft162);
__m512 fft86 = _mm512_mask_mov_ps(fft84, 22102, fft82);
__m512 fft166 = _mm512_mask_mov_ps(fft164, 22102, fft162);
__m512 fft87 = _mm512_mask_mul_ps(fft85, 64764, fft85, _mm512_set1_ps(5e-01f));
__m512 fft167 = _mm512_mask_mul_ps(fft165, 64764, fft165, _mm512_set1_ps(5e-01f));
__m512 fft88 = _mm512_mask_mul_ps(fft86, 64764, fft86, _mm512_set1_ps(5e-01f));
__m512 fft168 = _mm512_mask_mul_ps(fft166, 64764, fft166, _mm512_set1_ps(5e-01f));
__m512 wf1 = fft87;
__m512 wf9 = fft167;
__m512 wf2 = fft88;
__m512 wf10 = fft168;
__m512 wf3 = fft68;
__m512 wf11 = fft151;
__m512 wf4 = fft69;
__m512 wf12 = fft152;
__m512 wf5 = fft70;
__m512 wf13 = fft153;
__m512 wf6 = fft71;
__m512 wf14 = fft154;
__m512 wf7 = fft72;
__m512 wf15 = fft155;
__m512 wf8 = fft73;
__m512 wf16 = fft156;
ptrdiff_t c1 = (size_t)(0+2*j1)/4;
ptrdiff_t m1 = (size_t)(0+2*j1)%4/2;
ptrdiff_t f2 = (size_t)(0+2*j1)%2;
__m512i eo1 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf3 = _mm512_permutexvar_ps(eo1, wf3);
wf4 = _mm512_permutexvar_ps(eo1, wf4);
__m512i wfs1 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf3, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs1 = _mm512_inserti64x4(wfs1, _mm512_cvtps_ph(wf4, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+27776+111104*i5+1792*c1+128*k1+64*m1+16*f2, 3855, wfs1);
_mm512_mask_storeu_epi32(wfPtr1+472176+111104*i5+1792*c1+128*k1+64*m1+16*f2, 61680, wfs1);
wf11 = _mm512_permutexvar_ps(eo1, wf11);
wf12 = _mm512_permutexvar_ps(eo1, wf12);
__m512i wfs2 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf11, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs2 = _mm512_inserti64x4(wfs2, _mm512_cvtps_ph(wf12, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+916608+111104*i5+1792*c1+128*k1+64*m1+16*f2, 3855, wfs2);
_mm512_mask_storeu_epi32(wfPtr1+1361008+111104*i5+1792*c1+128*k1+64*m1+16*f2, 61680, wfs2);
wf5 = _mm512_permutexvar_ps(eo1, wf5);
wf6 = _mm512_permutexvar_ps(eo1, wf6);
__m512i wfs3 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf5, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs3 = _mm512_inserti64x4(wfs3, _mm512_cvtps_ph(wf6, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+55552+111104*i5+1792*c1+128*k1+64*m1+16*f2, 3855, wfs3);
_mm512_mask_storeu_epi32(wfPtr1+499952+111104*i5+1792*c1+128*k1+64*m1+16*f2, 61680, wfs3);
wf13 = _mm512_permutexvar_ps(eo1, wf13);
wf14 = _mm512_permutexvar_ps(eo1, wf14);
__m512i wfs4 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf13, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs4 = _mm512_inserti64x4(wfs4, _mm512_cvtps_ph(wf14, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+944384+111104*i5+1792*c1+128*k1+64*m1+16*f2, 3855, wfs4);
_mm512_mask_storeu_epi32(wfPtr1+1388784+111104*i5+1792*c1+128*k1+64*m1+16*f2, 61680, wfs4);
wf7 = _mm512_permutexvar_ps(eo1, wf7);
wf8 = _mm512_permutexvar_ps(eo1, wf8);
__m512i wfs5 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf7, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs5 = _mm512_inserti64x4(wfs5, _mm512_cvtps_ph(wf8, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+83328+111104*i5+1792*c1+128*k1+64*m1+16*f2, 3855, wfs5);
_mm512_mask_storeu_epi32(wfPtr1+527728+111104*i5+1792*c1+128*k1+64*m1+16*f2, 61680, wfs5);
wf15 = _mm512_permutexvar_ps(eo1, wf15);
wf16 = _mm512_permutexvar_ps(eo1, wf16);
__m512i wfs6 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf15, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs6 = _mm512_inserti64x4(wfs6, _mm512_cvtps_ph(wf16, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+972160+111104*i5+1792*c1+128*k1+64*m1+16*f2, 3855, wfs6);
_mm512_mask_storeu_epi32(wfPtr1+1416560+111104*i5+1792*c1+128*k1+64*m1+16*f2, 61680, wfs6);
__m512i wfs7 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs7 = _mm512_inserti64x4(wfs7, _mm512_cvtps_ph(wf2, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+0+111104*i5+1792*c1+128*k1+64*m1+16*f2, 3855, wfs7);
_mm512_mask_storeu_epi32(wfPtr1+444400+111104*i5+1792*c1+128*k1+64*m1+16*f2, 61680, wfs7);
__m512i wfs8 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf9, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs8 = _mm512_inserti64x4(wfs8, _mm512_cvtps_ph(wf10, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+888832+111104*i5+1792*c1+128*k1+64*m1+16*f2, 3855, wfs8);
_mm512_mask_storeu_epi32(wfPtr1+1333232+111104*i5+1792*c1+128*k1+64*m1+16*f2, 61680, wfs8);
__m512 wt6 = _mm512_maskz_loadu_ps(31, wtPtr1+1400+85400*i5+2800*j1+100*k1);
__m512 wt7 = _mm512_maskz_loadu_ps(31, wtPtr1+1420+85400*i5+2800*j1+100*k1);
__m512 wt8 = _mm512_maskz_loadu_ps(31, wtPtr1+1440+85400*i5+2800*j1+100*k1);
__m512 wt9 = _mm512_maskz_loadu_ps(31, wtPtr1+1460+85400*i5+2800*j1+100*k1);
__m512 wt10 = _mm512_maskz_loadu_ps(31, wtPtr1+1480+85400*i5+2800*j1+100*k1);
__m512 fft169 = _mm512_add_ps(wt6, _mm512_setzero_ps());
__m512 fft257 = _mm512_add_ps(wt7, _mm512_setzero_ps());
__m512 fft170 = _mm512_sub_ps(wt6, _mm512_setzero_ps());
__m512 fft258 = _mm512_sub_ps(wt7, _mm512_setzero_ps());
__m512 fft171 = _mm512_add_ps(wt8, _mm512_setzero_ps());
__m512 fft259 = _mm512_add_ps(wt9, _mm512_setzero_ps());
__m512 fft172 = _mm512_sub_ps(wt8, _mm512_setzero_ps());
__m512 fft260 = _mm512_sub_ps(wt9, _mm512_setzero_ps());
__m512 fft173 = _mm512_add_ps(wt10, _mm512_setzero_ps());
__m512 fft261 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft174 = _mm512_sub_ps(wt10, _mm512_setzero_ps());
__m512 fft262 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft175 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft263 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft176 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft264 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft177 = _mm512_add_ps(fft169, fft173);
__m512 fft265 = _mm512_add_ps(fft257, fft261);
__m512 fft178 = _mm512_sub_ps(fft169, fft173);
__m512 fft266 = _mm512_sub_ps(fft257, fft261);
__m512 fft179 = _mm512_add_ps(fft171, fft175);
__m512 fft267 = _mm512_add_ps(fft259, fft263);
__m512 fft180 = _mm512_sub_ps(fft175, fft171);
__m512 fft268 = _mm512_sub_ps(fft263, fft259);
__m512 fft181 = _mm512_sub_ps(fft172, fft176);
__m512 fft269 = _mm512_sub_ps(fft260, fft264);
__m512 fft182 = _mm512_add_ps(fft172, fft176);
__m512 fft270 = _mm512_add_ps(fft260, fft264);
__m512 fft183 = _mm512_add_ps(fft177, fft179);
__m512 fft271 = _mm512_add_ps(fft265, fft267);
__m512 fft184 = _mm512_sub_ps(fft177, fft179);
__m512 fft272 = _mm512_sub_ps(fft265, fft267);
__m512 fft185 = _mm512_fmadd_ps(fft181, _mm512_set1_ps(7.0710677e-01f), fft170);
__m512 fft273 = _mm512_fmadd_ps(fft269, _mm512_set1_ps(7.0710677e-01f), fft258);
__m512 fft186 = _mm512_fnmsub_ps(fft182, _mm512_set1_ps(7.0710677e-01f), fft174);
__m512 fft274 = _mm512_fnmsub_ps(fft270, _mm512_set1_ps(7.0710677e-01f), fft262);
__m512 fft187 = _mm512_fnmadd_ps(fft181, _mm512_set1_ps(7.0710677e-01f), fft170);
__m512 fft275 = _mm512_fnmadd_ps(fft269, _mm512_set1_ps(7.0710677e-01f), fft258);
__m512 fft188 = _mm512_fnmadd_ps(fft182, _mm512_set1_ps(7.0710677e-01f), fft174);
__m512 fft276 = _mm512_fnmadd_ps(fft270, _mm512_set1_ps(7.0710677e-01f), fft262);
__m512 fft189 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft190 = _mm512_fmadd_ps(fft183, fft189, _mm512_shuffle_f32x4(fft183, fft183, 78));
__m512 fft277 = _mm512_fmadd_ps(fft271, fft189, _mm512_shuffle_f32x4(fft271, fft271, 78));
__m512 fft191 = _mm512_fmadd_ps(fft184, fft189, _mm512_shuffle_f32x4(fft184, fft184, 78));
__m512 fft278 = _mm512_fmadd_ps(fft272, fft189, _mm512_shuffle_f32x4(fft272, fft272, 78));
__m512 fft192 = _mm512_fmadd_ps(fft185, fft189, _mm512_shuffle_f32x4(fft185, fft185, 78));
__m512 fft279 = _mm512_fmadd_ps(fft273, fft189, _mm512_shuffle_f32x4(fft273, fft273, 78));
__m512 fft193 = _mm512_fmadd_ps(fft186, fft189, _mm512_shuffle_f32x4(fft186, fft186, 78));
__m512 fft280 = _mm512_fmadd_ps(fft274, fft189, _mm512_shuffle_f32x4(fft274, fft274, 78));
__m512 fft194 = _mm512_fmadd_ps(fft178, fft189, _mm512_shuffle_f32x4(fft178, fft178, 78));
__m512 fft281 = _mm512_fmadd_ps(fft266, fft189, _mm512_shuffle_f32x4(fft266, fft266, 78));
__m512 fft195 = _mm512_fmadd_ps(fft180, fft189, _mm512_shuffle_f32x4(fft180, fft180, 78));
__m512 fft282 = _mm512_fmadd_ps(fft268, fft189, _mm512_shuffle_f32x4(fft268, fft268, 78));
__m512 fft196 = _mm512_fmadd_ps(fft187, fft189, _mm512_shuffle_f32x4(fft187, fft187, 78));
__m512 fft283 = _mm512_fmadd_ps(fft275, fft189, _mm512_shuffle_f32x4(fft275, fft275, 78));
__m512 fft197 = _mm512_fmadd_ps(fft188, fft189, _mm512_shuffle_f32x4(fft188, fft188, 78));
__m512 fft284 = _mm512_fmadd_ps(fft276, fft189, _mm512_shuffle_f32x4(fft276, fft276, 78));
__m512 fft198 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft199 = _mm512_mul_ps(fft190, fft198);
__m512 fft285 = _mm512_mul_ps(fft277, fft198);
__m512 fft200 = _mm512_mul_ps(fft191, fft198);
__m512 fft286 = _mm512_mul_ps(fft278, fft198);
__m512 fft201 = _mm512_mul_ps(fft192, fft198);
__m512 fft287 = _mm512_mul_ps(fft279, fft198);
__m512 fft202 = _mm512_mul_ps(fft193, fft198);
__m512 fft288 = _mm512_mul_ps(fft280, fft198);
__m512 fft203 = _mm512_mul_ps(fft194, fft198);
__m512 fft289 = _mm512_mul_ps(fft281, fft198);
__m512 fft204 = _mm512_mul_ps(fft195, fft198);
__m512 fft290 = _mm512_mul_ps(fft282, fft198);
__m512 fft205 = _mm512_mul_ps(fft196, fft198);
__m512 fft291 = _mm512_mul_ps(fft283, fft198);
__m512 fft206 = _mm512_mul_ps(fft197, fft198);
__m512 fft292 = _mm512_mul_ps(fft284, fft198);
__m512 fft207 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft208 = _mm512_fmadd_ps(fft191, fft207, fft199);
__m512 fft293 = _mm512_fmadd_ps(fft278, fft207, fft285);
__m512 fft209 = _mm512_fnmadd_ps(fft190, fft207, fft200);
__m512 fft294 = _mm512_fnmadd_ps(fft277, fft207, fft286);
__m512 fft210 = _mm512_fmadd_ps(fft193, fft207, fft201);
__m512 fft295 = _mm512_fmadd_ps(fft280, fft207, fft287);
__m512 fft211 = _mm512_fnmadd_ps(fft192, fft207, fft202);
__m512 fft296 = _mm512_fnmadd_ps(fft279, fft207, fft288);
__m512 fft212 = _mm512_fmadd_ps(fft195, fft207, fft203);
__m512 fft297 = _mm512_fmadd_ps(fft282, fft207, fft289);
__m512 fft213 = _mm512_fnmadd_ps(fft194, fft207, fft204);
__m512 fft298 = _mm512_fnmadd_ps(fft281, fft207, fft290);
__m512 fft214 = _mm512_fmadd_ps(fft197, fft207, fft205);
__m512 fft299 = _mm512_fmadd_ps(fft284, fft207, fft291);
__m512 fft215 = _mm512_fnmadd_ps(fft196, fft207, fft206);
__m512 fft300 = _mm512_fnmadd_ps(fft283, fft207, fft292);
__m512 fft216 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft217 = _mm512_fmadd_ps(fft208, fft216, _mm512_shuffle_f32x4(fft208, fft208, 177));
__m512 fft301 = _mm512_fmadd_ps(fft293, fft216, _mm512_shuffle_f32x4(fft293, fft293, 177));
__m512 fft218 = _mm512_fmadd_ps(fft209, fft216, _mm512_shuffle_f32x4(fft209, fft209, 177));
__m512 fft302 = _mm512_fmadd_ps(fft294, fft216, _mm512_shuffle_f32x4(fft294, fft294, 177));
__m512 fft219 = _mm512_fmadd_ps(fft210, fft216, _mm512_shuffle_f32x4(fft210, fft210, 177));
__m512 fft303 = _mm512_fmadd_ps(fft295, fft216, _mm512_shuffle_f32x4(fft295, fft295, 177));
__m512 fft220 = _mm512_fmadd_ps(fft211, fft216, _mm512_shuffle_f32x4(fft211, fft211, 177));
__m512 fft304 = _mm512_fmadd_ps(fft296, fft216, _mm512_shuffle_f32x4(fft296, fft296, 177));
__m512 fft221 = _mm512_fmadd_ps(fft212, fft216, _mm512_shuffle_f32x4(fft212, fft212, 177));
__m512 fft305 = _mm512_fmadd_ps(fft297, fft216, _mm512_shuffle_f32x4(fft297, fft297, 177));
__m512 fft222 = _mm512_fmadd_ps(fft213, fft216, _mm512_shuffle_f32x4(fft213, fft213, 177));
__m512 fft306 = _mm512_fmadd_ps(fft298, fft216, _mm512_shuffle_f32x4(fft298, fft298, 177));
__m512 fft223 = _mm512_fmadd_ps(fft214, fft216, _mm512_shuffle_f32x4(fft214, fft214, 177));
__m512 fft307 = _mm512_fmadd_ps(fft299, fft216, _mm512_shuffle_f32x4(fft299, fft299, 177));
__m512 fft224 = _mm512_fmadd_ps(fft215, fft216, _mm512_shuffle_f32x4(fft215, fft215, 177));
__m512 fft308 = _mm512_fmadd_ps(fft300, fft216, _mm512_shuffle_f32x4(fft300, fft300, 177));
__m512 fft225 = _mm512_mask_mov_ps(fft217, 49344, fft218);
__m512 fft309 = _mm512_mask_mov_ps(fft301, 49344, fft302);
__m512 fft226 = _mm512_mask_sub_ps(fft218, 49344, _mm512_setzero_ps(), fft217);
__m512 fft310 = _mm512_mask_sub_ps(fft302, 49344, _mm512_setzero_ps(), fft301);
__m512 fft227 = _mm512_mask_mov_ps(fft219, 49344, fft220);
__m512 fft311 = _mm512_mask_mov_ps(fft303, 49344, fft304);
__m512 fft228 = _mm512_mask_sub_ps(fft220, 49344, _mm512_setzero_ps(), fft219);
__m512 fft312 = _mm512_mask_sub_ps(fft304, 49344, _mm512_setzero_ps(), fft303);
__m512 fft229 = _mm512_mask_mov_ps(fft221, 49344, fft222);
__m512 fft313 = _mm512_mask_mov_ps(fft305, 49344, fft306);
__m512 fft230 = _mm512_mask_sub_ps(fft222, 49344, _mm512_setzero_ps(), fft221);
__m512 fft314 = _mm512_mask_sub_ps(fft306, 49344, _mm512_setzero_ps(), fft305);
__m512 fft231 = _mm512_mask_mov_ps(fft223, 49344, fft224);
__m512 fft315 = _mm512_mask_mov_ps(fft307, 49344, fft308);
__m512 fft232 = _mm512_mask_sub_ps(fft224, 49344, _mm512_setzero_ps(), fft223);
__m512 fft316 = _mm512_mask_sub_ps(fft308, 49344, _mm512_setzero_ps(), fft307);
__m512 fft233 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft234 = _mm512_fmadd_ps(fft225, fft233, _mm512_shuffle_ps(fft225, fft225, 78));
__m512 fft317 = _mm512_fmadd_ps(fft309, fft233, _mm512_shuffle_ps(fft309, fft309, 78));
__m512 fft235 = _mm512_fmadd_ps(fft226, fft233, _mm512_shuffle_ps(fft226, fft226, 78));
__m512 fft318 = _mm512_fmadd_ps(fft310, fft233, _mm512_shuffle_ps(fft310, fft310, 78));
__m512 fft236 = _mm512_fmadd_ps(fft227, fft233, _mm512_shuffle_ps(fft227, fft227, 78));
__m512 fft319 = _mm512_fmadd_ps(fft311, fft233, _mm512_shuffle_ps(fft311, fft311, 78));
__m512 fft237 = _mm512_fmadd_ps(fft228, fft233, _mm512_shuffle_ps(fft228, fft228, 78));
__m512 fft320 = _mm512_fmadd_ps(fft312, fft233, _mm512_shuffle_ps(fft312, fft312, 78));
__m512 fft238 = _mm512_fmadd_ps(fft229, fft233, _mm512_shuffle_ps(fft229, fft229, 78));
__m512 fft321 = _mm512_fmadd_ps(fft313, fft233, _mm512_shuffle_ps(fft313, fft313, 78));
__m512 fft239 = _mm512_fmadd_ps(fft230, fft233, _mm512_shuffle_ps(fft230, fft230, 78));
__m512 fft322 = _mm512_fmadd_ps(fft314, fft233, _mm512_shuffle_ps(fft314, fft314, 78));
__m512 fft240 = _mm512_fmadd_ps(fft231, fft233, _mm512_shuffle_ps(fft231, fft231, 78));
__m512 fft323 = _mm512_fmadd_ps(fft315, fft233, _mm512_shuffle_ps(fft315, fft315, 78));
__m512 fft241 = _mm512_fmadd_ps(fft232, fft233, _mm512_shuffle_ps(fft232, fft232, 78));
__m512 fft324 = _mm512_fmadd_ps(fft316, fft233, _mm512_shuffle_ps(fft316, fft316, 78));
__m512i fft242 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft243 = _mm512_permutexvar_ps(fft242, fft234);
__m512 fft325 = _mm512_permutexvar_ps(fft242, fft317);
__m512i fft244 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft245 = _mm512_permutexvar_ps(fft244, fft234);
__m512 fft326 = _mm512_permutexvar_ps(fft244, fft317);
__m512 fft246 = _mm512_permutexvar_ps(fft242, fft235);
__m512 fft327 = _mm512_permutexvar_ps(fft242, fft318);
__m512 fft247 = _mm512_permutexvar_ps(fft244, fft235);
__m512 fft328 = _mm512_permutexvar_ps(fft244, fft318);
__m512 fft248 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft249 = _mm512_fmadd_ps(fft243, fft248, fft245);
__m512 fft329 = _mm512_fmadd_ps(fft325, fft248, fft326);
__m512 fft250 = _mm512_fnmadd_ps(fft247, fft248, fft246);
__m512 fft330 = _mm512_fnmadd_ps(fft328, fft248, fft327);
__m512 fft251 = _mm512_mask_mov_ps(fft247, 21845, fft249);
__m512 fft331 = _mm512_mask_mov_ps(fft328, 21845, fft329);
__m512 fft252 = _mm512_mask_mov_ps(fft243, 43176, fft249);
__m512 fft332 = _mm512_mask_mov_ps(fft325, 43176, fft329);
__m512 fft253 = _mm512_mask_mov_ps(fft251, 43176, fft250);
__m512 fft333 = _mm512_mask_mov_ps(fft331, 43176, fft330);
__m512 fft254 = _mm512_mask_mov_ps(fft252, 22102, fft250);
__m512 fft334 = _mm512_mask_mov_ps(fft332, 22102, fft330);
__m512 fft255 = _mm512_mask_mul_ps(fft253, 64764, fft253, _mm512_set1_ps(5e-01f));
__m512 fft335 = _mm512_mask_mul_ps(fft333, 64764, fft333, _mm512_set1_ps(5e-01f));
__m512 fft256 = _mm512_mask_mul_ps(fft254, 64764, fft254, _mm512_set1_ps(5e-01f));
__m512 fft336 = _mm512_mask_mul_ps(fft334, 64764, fft334, _mm512_set1_ps(5e-01f));
__m512 wf17 = fft255;
__m512 wf25 = fft335;
__m512 wf18 = fft256;
__m512 wf26 = fft336;
__m512 wf19 = fft236;
__m512 wf27 = fft319;
__m512 wf20 = fft237;
__m512 wf28 = fft320;
__m512 wf21 = fft238;
__m512 wf29 = fft321;
__m512 wf22 = fft239;
__m512 wf30 = fft322;
__m512 wf23 = fft240;
__m512 wf31 = fft323;
__m512 wf24 = fft241;
__m512 wf32 = fft324;
ptrdiff_t c2 = (size_t)(1+2*j1)/4;
ptrdiff_t m2 = (size_t)(1+2*j1)%4/2;
ptrdiff_t f3 = (size_t)(1+2*j1)%2;
__m512i eo2 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf19 = _mm512_permutexvar_ps(eo2, wf19);
wf20 = _mm512_permutexvar_ps(eo2, wf20);
__m512i wfs9 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf19, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs9 = _mm512_inserti64x4(wfs9, _mm512_cvtps_ph(wf20, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+27776+111104*i5+1792*c2+128*k1+64*m2+16*f3, 3855, wfs9);
_mm512_mask_storeu_epi32(wfPtr1+472176+111104*i5+1792*c2+128*k1+64*m2+16*f3, 61680, wfs9);
wf27 = _mm512_permutexvar_ps(eo2, wf27);
wf28 = _mm512_permutexvar_ps(eo2, wf28);
__m512i wfs10 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf27, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs10 = _mm512_inserti64x4(wfs10, _mm512_cvtps_ph(wf28, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+916608+111104*i5+1792*c2+128*k1+64*m2+16*f3, 3855, wfs10);
_mm512_mask_storeu_epi32(wfPtr1+1361008+111104*i5+1792*c2+128*k1+64*m2+16*f3, 61680, wfs10);
wf21 = _mm512_permutexvar_ps(eo2, wf21);
wf22 = _mm512_permutexvar_ps(eo2, wf22);
__m512i wfs11 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf21, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs11 = _mm512_inserti64x4(wfs11, _mm512_cvtps_ph(wf22, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+55552+111104*i5+1792*c2+128*k1+64*m2+16*f3, 3855, wfs11);
_mm512_mask_storeu_epi32(wfPtr1+499952+111104*i5+1792*c2+128*k1+64*m2+16*f3, 61680, wfs11);
wf29 = _mm512_permutexvar_ps(eo2, wf29);
wf30 = _mm512_permutexvar_ps(eo2, wf30);
__m512i wfs12 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf29, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs12 = _mm512_inserti64x4(wfs12, _mm512_cvtps_ph(wf30, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+944384+111104*i5+1792*c2+128*k1+64*m2+16*f3, 3855, wfs12);
_mm512_mask_storeu_epi32(wfPtr1+1388784+111104*i5+1792*c2+128*k1+64*m2+16*f3, 61680, wfs12);
wf23 = _mm512_permutexvar_ps(eo2, wf23);
wf24 = _mm512_permutexvar_ps(eo2, wf24);
__m512i wfs13 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf23, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs13 = _mm512_inserti64x4(wfs13, _mm512_cvtps_ph(wf24, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+83328+111104*i5+1792*c2+128*k1+64*m2+16*f3, 3855, wfs13);
_mm512_mask_storeu_epi32(wfPtr1+527728+111104*i5+1792*c2+128*k1+64*m2+16*f3, 61680, wfs13);
wf31 = _mm512_permutexvar_ps(eo2, wf31);
wf32 = _mm512_permutexvar_ps(eo2, wf32);
__m512i wfs14 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf31, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs14 = _mm512_inserti64x4(wfs14, _mm512_cvtps_ph(wf32, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+972160+111104*i5+1792*c2+128*k1+64*m2+16*f3, 3855, wfs14);
_mm512_mask_storeu_epi32(wfPtr1+1416560+111104*i5+1792*c2+128*k1+64*m2+16*f3, 61680, wfs14);
__m512i wfs15 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf17, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs15 = _mm512_inserti64x4(wfs15, _mm512_cvtps_ph(wf18, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+0+111104*i5+1792*c2+128*k1+64*m2+16*f3, 3855, wfs15);
_mm512_mask_storeu_epi32(wfPtr1+444400+111104*i5+1792*c2+128*k1+64*m2+16*f3, 61680, wfs15);
__m512i wfs16 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf25, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs16 = _mm512_inserti64x4(wfs16, _mm512_cvtps_ph(wf26, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+888832+111104*i5+1792*c2+128*k1+64*m2+16*f3, 3855, wfs16);
_mm512_mask_storeu_epi32(wfPtr1+1333232+111104*i5+1792*c2+128*k1+64*m2+16*f3, 61680, wfs16);
}
__m512 bias1 = _mm512_setzero_ps();
if (!e1) {
bias1 = _mm512_maskz_loadu_ps(3, biasPtr1-0+244*i5+8*j1);
bias1 = _mm512_mul_ps(bias1, _mm512_set1_ps(6.4e+01f));
}
_mm512_mask_storeu_ps(bfPtr1-0+248*i5+8*j1, 3, bias1);
if (j1 >= jj1) return;
}
}
if (j1 == 30) {
for (ptrdiff_t k2 = 0; k2 < 14; ++k2) {
__m512 wt11 = _mm512_maskz_loadu_ps(31, wtPtr1+0+85400*i5+2800*j1+100*k2);
__m512 wt12 = _mm512_maskz_loadu_ps(31, wtPtr1+20+85400*i5+2800*j1+100*k2);
__m512 wt13 = _mm512_maskz_loadu_ps(31, wtPtr1+40+85400*i5+2800*j1+100*k2);
__m512 wt14 = _mm512_maskz_loadu_ps(31, wtPtr1+60+85400*i5+2800*j1+100*k2);
__m512 wt15 = _mm512_maskz_loadu_ps(31, wtPtr1+80+85400*i5+2800*j1+100*k2);
__m512 fft337 = _mm512_add_ps(wt11, _mm512_setzero_ps());
__m512 fft425 = _mm512_add_ps(wt12, _mm512_setzero_ps());
__m512 fft338 = _mm512_sub_ps(wt11, _mm512_setzero_ps());
__m512 fft426 = _mm512_sub_ps(wt12, _mm512_setzero_ps());
__m512 fft339 = _mm512_add_ps(wt13, _mm512_setzero_ps());
__m512 fft427 = _mm512_add_ps(wt14, _mm512_setzero_ps());
__m512 fft340 = _mm512_sub_ps(wt13, _mm512_setzero_ps());
__m512 fft428 = _mm512_sub_ps(wt14, _mm512_setzero_ps());
__m512 fft341 = _mm512_add_ps(wt15, _mm512_setzero_ps());
__m512 fft429 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft342 = _mm512_sub_ps(wt15, _mm512_setzero_ps());
__m512 fft430 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft343 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft431 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft344 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft432 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft345 = _mm512_add_ps(fft337, fft341);
__m512 fft433 = _mm512_add_ps(fft425, fft429);
__m512 fft346 = _mm512_sub_ps(fft337, fft341);
__m512 fft434 = _mm512_sub_ps(fft425, fft429);
__m512 fft347 = _mm512_add_ps(fft339, fft343);
__m512 fft435 = _mm512_add_ps(fft427, fft431);
__m512 fft348 = _mm512_sub_ps(fft343, fft339);
__m512 fft436 = _mm512_sub_ps(fft431, fft427);
__m512 fft349 = _mm512_sub_ps(fft340, fft344);
__m512 fft437 = _mm512_sub_ps(fft428, fft432);
__m512 fft350 = _mm512_add_ps(fft340, fft344);
__m512 fft438 = _mm512_add_ps(fft428, fft432);
__m512 fft351 = _mm512_add_ps(fft345, fft347);
__m512 fft439 = _mm512_add_ps(fft433, fft435);
__m512 fft352 = _mm512_sub_ps(fft345, fft347);
__m512 fft440 = _mm512_sub_ps(fft433, fft435);
__m512 fft353 = _mm512_fmadd_ps(fft349, _mm512_set1_ps(7.0710677e-01f), fft338);
__m512 fft441 = _mm512_fmadd_ps(fft437, _mm512_set1_ps(7.0710677e-01f), fft426);
__m512 fft354 = _mm512_fnmsub_ps(fft350, _mm512_set1_ps(7.0710677e-01f), fft342);
__m512 fft442 = _mm512_fnmsub_ps(fft438, _mm512_set1_ps(7.0710677e-01f), fft430);
__m512 fft355 = _mm512_fnmadd_ps(fft349, _mm512_set1_ps(7.0710677e-01f), fft338);
__m512 fft443 = _mm512_fnmadd_ps(fft437, _mm512_set1_ps(7.0710677e-01f), fft426);
__m512 fft356 = _mm512_fnmadd_ps(fft350, _mm512_set1_ps(7.0710677e-01f), fft342);
__m512 fft444 = _mm512_fnmadd_ps(fft438, _mm512_set1_ps(7.0710677e-01f), fft430);
__m512 fft357 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft358 = _mm512_fmadd_ps(fft351, fft357, _mm512_shuffle_f32x4(fft351, fft351, 78));
__m512 fft445 = _mm512_fmadd_ps(fft439, fft357, _mm512_shuffle_f32x4(fft439, fft439, 78));
__m512 fft359 = _mm512_fmadd_ps(fft352, fft357, _mm512_shuffle_f32x4(fft352, fft352, 78));
__m512 fft446 = _mm512_fmadd_ps(fft440, fft357, _mm512_shuffle_f32x4(fft440, fft440, 78));
__m512 fft360 = _mm512_fmadd_ps(fft353, fft357, _mm512_shuffle_f32x4(fft353, fft353, 78));
__m512 fft447 = _mm512_fmadd_ps(fft441, fft357, _mm512_shuffle_f32x4(fft441, fft441, 78));
__m512 fft361 = _mm512_fmadd_ps(fft354, fft357, _mm512_shuffle_f32x4(fft354, fft354, 78));
__m512 fft448 = _mm512_fmadd_ps(fft442, fft357, _mm512_shuffle_f32x4(fft442, fft442, 78));
__m512 fft362 = _mm512_fmadd_ps(fft346, fft357, _mm512_shuffle_f32x4(fft346, fft346, 78));
__m512 fft449 = _mm512_fmadd_ps(fft434, fft357, _mm512_shuffle_f32x4(fft434, fft434, 78));
__m512 fft363 = _mm512_fmadd_ps(fft348, fft357, _mm512_shuffle_f32x4(fft348, fft348, 78));
__m512 fft450 = _mm512_fmadd_ps(fft436, fft357, _mm512_shuffle_f32x4(fft436, fft436, 78));
__m512 fft364 = _mm512_fmadd_ps(fft355, fft357, _mm512_shuffle_f32x4(fft355, fft355, 78));
__m512 fft451 = _mm512_fmadd_ps(fft443, fft357, _mm512_shuffle_f32x4(fft443, fft443, 78));
__m512 fft365 = _mm512_fmadd_ps(fft356, fft357, _mm512_shuffle_f32x4(fft356, fft356, 78));
__m512 fft452 = _mm512_fmadd_ps(fft444, fft357, _mm512_shuffle_f32x4(fft444, fft444, 78));
__m512 fft366 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft367 = _mm512_mul_ps(fft358, fft366);
__m512 fft453 = _mm512_mul_ps(fft445, fft366);
__m512 fft368 = _mm512_mul_ps(fft359, fft366);
__m512 fft454 = _mm512_mul_ps(fft446, fft366);
__m512 fft369 = _mm512_mul_ps(fft360, fft366);
__m512 fft455 = _mm512_mul_ps(fft447, fft366);
__m512 fft370 = _mm512_mul_ps(fft361, fft366);
__m512 fft456 = _mm512_mul_ps(fft448, fft366);
__m512 fft371 = _mm512_mul_ps(fft362, fft366);
__m512 fft457 = _mm512_mul_ps(fft449, fft366);
__m512 fft372 = _mm512_mul_ps(fft363, fft366);
__m512 fft458 = _mm512_mul_ps(fft450, fft366);
__m512 fft373 = _mm512_mul_ps(fft364, fft366);
__m512 fft459 = _mm512_mul_ps(fft451, fft366);
__m512 fft374 = _mm512_mul_ps(fft365, fft366);
__m512 fft460 = _mm512_mul_ps(fft452, fft366);
__m512 fft375 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft376 = _mm512_fmadd_ps(fft359, fft375, fft367);
__m512 fft461 = _mm512_fmadd_ps(fft446, fft375, fft453);
__m512 fft377 = _mm512_fnmadd_ps(fft358, fft375, fft368);
__m512 fft462 = _mm512_fnmadd_ps(fft445, fft375, fft454);
__m512 fft378 = _mm512_fmadd_ps(fft361, fft375, fft369);
__m512 fft463 = _mm512_fmadd_ps(fft448, fft375, fft455);
__m512 fft379 = _mm512_fnmadd_ps(fft360, fft375, fft370);
__m512 fft464 = _mm512_fnmadd_ps(fft447, fft375, fft456);
__m512 fft380 = _mm512_fmadd_ps(fft363, fft375, fft371);
__m512 fft465 = _mm512_fmadd_ps(fft450, fft375, fft457);
__m512 fft381 = _mm512_fnmadd_ps(fft362, fft375, fft372);
__m512 fft466 = _mm512_fnmadd_ps(fft449, fft375, fft458);
__m512 fft382 = _mm512_fmadd_ps(fft365, fft375, fft373);
__m512 fft467 = _mm512_fmadd_ps(fft452, fft375, fft459);
__m512 fft383 = _mm512_fnmadd_ps(fft364, fft375, fft374);
__m512 fft468 = _mm512_fnmadd_ps(fft451, fft375, fft460);
__m512 fft384 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft385 = _mm512_fmadd_ps(fft376, fft384, _mm512_shuffle_f32x4(fft376, fft376, 177));
__m512 fft469 = _mm512_fmadd_ps(fft461, fft384, _mm512_shuffle_f32x4(fft461, fft461, 177));
__m512 fft386 = _mm512_fmadd_ps(fft377, fft384, _mm512_shuffle_f32x4(fft377, fft377, 177));
__m512 fft470 = _mm512_fmadd_ps(fft462, fft384, _mm512_shuffle_f32x4(fft462, fft462, 177));
__m512 fft387 = _mm512_fmadd_ps(fft378, fft384, _mm512_shuffle_f32x4(fft378, fft378, 177));
__m512 fft471 = _mm512_fmadd_ps(fft463, fft384, _mm512_shuffle_f32x4(fft463, fft463, 177));
__m512 fft388 = _mm512_fmadd_ps(fft379, fft384, _mm512_shuffle_f32x4(fft379, fft379, 177));
__m512 fft472 = _mm512_fmadd_ps(fft464, fft384, _mm512_shuffle_f32x4(fft464, fft464, 177));
__m512 fft389 = _mm512_fmadd_ps(fft380, fft384, _mm512_shuffle_f32x4(fft380, fft380, 177));
__m512 fft473 = _mm512_fmadd_ps(fft465, fft384, _mm512_shuffle_f32x4(fft465, fft465, 177));
__m512 fft390 = _mm512_fmadd_ps(fft381, fft384, _mm512_shuffle_f32x4(fft381, fft381, 177));
__m512 fft474 = _mm512_fmadd_ps(fft466, fft384, _mm512_shuffle_f32x4(fft466, fft466, 177));
__m512 fft391 = _mm512_fmadd_ps(fft382, fft384, _mm512_shuffle_f32x4(fft382, fft382, 177));
__m512 fft475 = _mm512_fmadd_ps(fft467, fft384, _mm512_shuffle_f32x4(fft467, fft467, 177));
__m512 fft392 = _mm512_fmadd_ps(fft383, fft384, _mm512_shuffle_f32x4(fft383, fft383, 177));
__m512 fft476 = _mm512_fmadd_ps(fft468, fft384, _mm512_shuffle_f32x4(fft468, fft468, 177));
__m512 fft393 = _mm512_mask_mov_ps(fft385, 49344, fft386);
__m512 fft477 = _mm512_mask_mov_ps(fft469, 49344, fft470);
__m512 fft394 = _mm512_mask_sub_ps(fft386, 49344, _mm512_setzero_ps(), fft385);
__m512 fft478 = _mm512_mask_sub_ps(fft470, 49344, _mm512_setzero_ps(), fft469);
__m512 fft395 = _mm512_mask_mov_ps(fft387, 49344, fft388);
__m512 fft479 = _mm512_mask_mov_ps(fft471, 49344, fft472);
__m512 fft396 = _mm512_mask_sub_ps(fft388, 49344, _mm512_setzero_ps(), fft387);
__m512 fft480 = _mm512_mask_sub_ps(fft472, 49344, _mm512_setzero_ps(), fft471);
__m512 fft397 = _mm512_mask_mov_ps(fft389, 49344, fft390);
__m512 fft481 = _mm512_mask_mov_ps(fft473, 49344, fft474);
__m512 fft398 = _mm512_mask_sub_ps(fft390, 49344, _mm512_setzero_ps(), fft389);
__m512 fft482 = _mm512_mask_sub_ps(fft474, 49344, _mm512_setzero_ps(), fft473);
__m512 fft399 = _mm512_mask_mov_ps(fft391, 49344, fft392);
__m512 fft483 = _mm512_mask_mov_ps(fft475, 49344, fft476);
__m512 fft400 = _mm512_mask_sub_ps(fft392, 49344, _mm512_setzero_ps(), fft391);
__m512 fft484 = _mm512_mask_sub_ps(fft476, 49344, _mm512_setzero_ps(), fft475);
__m512 fft401 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft402 = _mm512_fmadd_ps(fft393, fft401, _mm512_shuffle_ps(fft393, fft393, 78));
__m512 fft485 = _mm512_fmadd_ps(fft477, fft401, _mm512_shuffle_ps(fft477, fft477, 78));
__m512 fft403 = _mm512_fmadd_ps(fft394, fft401, _mm512_shuffle_ps(fft394, fft394, 78));
__m512 fft486 = _mm512_fmadd_ps(fft478, fft401, _mm512_shuffle_ps(fft478, fft478, 78));
__m512 fft404 = _mm512_fmadd_ps(fft395, fft401, _mm512_shuffle_ps(fft395, fft395, 78));
__m512 fft487 = _mm512_fmadd_ps(fft479, fft401, _mm512_shuffle_ps(fft479, fft479, 78));
__m512 fft405 = _mm512_fmadd_ps(fft396, fft401, _mm512_shuffle_ps(fft396, fft396, 78));
__m512 fft488 = _mm512_fmadd_ps(fft480, fft401, _mm512_shuffle_ps(fft480, fft480, 78));
__m512 fft406 = _mm512_fmadd_ps(fft397, fft401, _mm512_shuffle_ps(fft397, fft397, 78));
__m512 fft489 = _mm512_fmadd_ps(fft481, fft401, _mm512_shuffle_ps(fft481, fft481, 78));
__m512 fft407 = _mm512_fmadd_ps(fft398, fft401, _mm512_shuffle_ps(fft398, fft398, 78));
__m512 fft490 = _mm512_fmadd_ps(fft482, fft401, _mm512_shuffle_ps(fft482, fft482, 78));
__m512 fft408 = _mm512_fmadd_ps(fft399, fft401, _mm512_shuffle_ps(fft399, fft399, 78));
__m512 fft491 = _mm512_fmadd_ps(fft483, fft401, _mm512_shuffle_ps(fft483, fft483, 78));
__m512 fft409 = _mm512_fmadd_ps(fft400, fft401, _mm512_shuffle_ps(fft400, fft400, 78));
__m512 fft492 = _mm512_fmadd_ps(fft484, fft401, _mm512_shuffle_ps(fft484, fft484, 78));
__m512i fft410 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft411 = _mm512_permutexvar_ps(fft410, fft402);
__m512 fft493 = _mm512_permutexvar_ps(fft410, fft485);
__m512i fft412 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft413 = _mm512_permutexvar_ps(fft412, fft402);
__m512 fft494 = _mm512_permutexvar_ps(fft412, fft485);
__m512 fft414 = _mm512_permutexvar_ps(fft410, fft403);
__m512 fft495 = _mm512_permutexvar_ps(fft410, fft486);
__m512 fft415 = _mm512_permutexvar_ps(fft412, fft403);
__m512 fft496 = _mm512_permutexvar_ps(fft412, fft486);
__m512 fft416 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft417 = _mm512_fmadd_ps(fft411, fft416, fft413);
__m512 fft497 = _mm512_fmadd_ps(fft493, fft416, fft494);
__m512 fft418 = _mm512_fnmadd_ps(fft415, fft416, fft414);
__m512 fft498 = _mm512_fnmadd_ps(fft496, fft416, fft495);
__m512 fft419 = _mm512_mask_mov_ps(fft415, 21845, fft417);
__m512 fft499 = _mm512_mask_mov_ps(fft496, 21845, fft497);
__m512 fft420 = _mm512_mask_mov_ps(fft411, 43176, fft417);
__m512 fft500 = _mm512_mask_mov_ps(fft493, 43176, fft497);
__m512 fft421 = _mm512_mask_mov_ps(fft419, 43176, fft418);
__m512 fft501 = _mm512_mask_mov_ps(fft499, 43176, fft498);
__m512 fft422 = _mm512_mask_mov_ps(fft420, 22102, fft418);
__m512 fft502 = _mm512_mask_mov_ps(fft500, 22102, fft498);
__m512 fft423 = _mm512_mask_mul_ps(fft421, 64764, fft421, _mm512_set1_ps(5e-01f));
__m512 fft503 = _mm512_mask_mul_ps(fft501, 64764, fft501, _mm512_set1_ps(5e-01f));
__m512 fft424 = _mm512_mask_mul_ps(fft422, 64764, fft422, _mm512_set1_ps(5e-01f));
__m512 fft504 = _mm512_mask_mul_ps(fft502, 64764, fft502, _mm512_set1_ps(5e-01f));
__m512 wf33 = fft423;
__m512 wf41 = fft503;
__m512 wf34 = fft424;
__m512 wf42 = fft504;
__m512 wf35 = fft404;
__m512 wf43 = fft487;
__m512 wf36 = fft405;
__m512 wf44 = fft488;
__m512 wf37 = fft406;
__m512 wf45 = fft489;
__m512 wf38 = fft407;
__m512 wf46 = fft490;
__m512 wf39 = fft408;
__m512 wf47 = fft491;
__m512 wf40 = fft409;
__m512 wf48 = fft492;
ptrdiff_t c3 = (size_t)(0+2*j1)/4;
ptrdiff_t m3 = (size_t)(0+2*j1)%4/2;
ptrdiff_t f4 = (size_t)(0+2*j1)%2;
__m512i eo3 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf35 = _mm512_permutexvar_ps(eo3, wf35);
wf36 = _mm512_permutexvar_ps(eo3, wf36);
__m512i wfs17 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf35, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs17 = _mm512_inserti64x4(wfs17, _mm512_cvtps_ph(wf36, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep1 = _mm512_shuffle_i32x4(wfs17, wfs17, 160);
_mm512_mask_storeu_epi32(wfPtr1+27776+111104*i5+1792*c3+64*k2+64*m3+16*f4, 65535, rep1);
__m512i rep2 = _mm512_shuffle_i32x4(wfs17, wfs17, 245);
_mm512_mask_storeu_epi32(wfPtr1+472192+111104*i5+1792*c3+64*k2+64*m3+16*f4, 65535, rep2);
wf43 = _mm512_permutexvar_ps(eo3, wf43);
wf44 = _mm512_permutexvar_ps(eo3, wf44);
__m512i wfs18 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf43, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs18 = _mm512_inserti64x4(wfs18, _mm512_cvtps_ph(wf44, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep3 = _mm512_shuffle_i32x4(wfs18, wfs18, 160);
_mm512_mask_storeu_epi32(wfPtr1+916608+111104*i5+1792*c3+64*k2+64*m3+16*f4, 65535, rep3);
__m512i rep4 = _mm512_shuffle_i32x4(wfs18, wfs18, 245);
_mm512_mask_storeu_epi32(wfPtr1+1361024+111104*i5+1792*c3+64*k2+64*m3+16*f4, 65535, rep4);
wf37 = _mm512_permutexvar_ps(eo3, wf37);
wf38 = _mm512_permutexvar_ps(eo3, wf38);
__m512i wfs19 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf37, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs19 = _mm512_inserti64x4(wfs19, _mm512_cvtps_ph(wf38, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep5 = _mm512_shuffle_i32x4(wfs19, wfs19, 160);
_mm512_mask_storeu_epi32(wfPtr1+55552+111104*i5+1792*c3+64*k2+64*m3+16*f4, 65535, rep5);
__m512i rep6 = _mm512_shuffle_i32x4(wfs19, wfs19, 245);
_mm512_mask_storeu_epi32(wfPtr1+499968+111104*i5+1792*c3+64*k2+64*m3+16*f4, 65535, rep6);
wf45 = _mm512_permutexvar_ps(eo3, wf45);
wf46 = _mm512_permutexvar_ps(eo3, wf46);
__m512i wfs20 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf45, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs20 = _mm512_inserti64x4(wfs20, _mm512_cvtps_ph(wf46, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep7 = _mm512_shuffle_i32x4(wfs20, wfs20, 160);
_mm512_mask_storeu_epi32(wfPtr1+944384+111104*i5+1792*c3+64*k2+64*m3+16*f4, 65535, rep7);
__m512i rep8 = _mm512_shuffle_i32x4(wfs20, wfs20, 245);
_mm512_mask_storeu_epi32(wfPtr1+1388800+111104*i5+1792*c3+64*k2+64*m3+16*f4, 65535, rep8);
wf39 = _mm512_permutexvar_ps(eo3, wf39);
wf40 = _mm512_permutexvar_ps(eo3, wf40);
__m512i wfs21 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf39, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs21 = _mm512_inserti64x4(wfs21, _mm512_cvtps_ph(wf40, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep9 = _mm512_shuffle_i32x4(wfs21, wfs21, 160);
_mm512_mask_storeu_epi32(wfPtr1+83328+111104*i5+1792*c3+64*k2+64*m3+16*f4, 65535, rep9);
__m512i rep10 = _mm512_shuffle_i32x4(wfs21, wfs21, 245);
_mm512_mask_storeu_epi32(wfPtr1+527744+111104*i5+1792*c3+64*k2+64*m3+16*f4, 65535, rep10);
wf47 = _mm512_permutexvar_ps(eo3, wf47);
wf48 = _mm512_permutexvar_ps(eo3, wf48);
__m512i wfs22 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf47, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs22 = _mm512_inserti64x4(wfs22, _mm512_cvtps_ph(wf48, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep11 = _mm512_shuffle_i32x4(wfs22, wfs22, 160);
_mm512_mask_storeu_epi32(wfPtr1+972160+111104*i5+1792*c3+64*k2+64*m3+16*f4, 65535, rep11);
__m512i rep12 = _mm512_shuffle_i32x4(wfs22, wfs22, 245);
_mm512_mask_storeu_epi32(wfPtr1+1416576+111104*i5+1792*c3+64*k2+64*m3+16*f4, 65535, rep12);
__m512i wfs23 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf33, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs23 = _mm512_inserti64x4(wfs23, _mm512_cvtps_ph(wf34, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep13 = _mm512_shuffle_i32x4(wfs23, wfs23, 160);
_mm512_mask_storeu_epi32(wfPtr1+0+111104*i5+1792*c3+64*k2+64*m3+16*f4, 65535, rep13);
__m512i rep14 = _mm512_shuffle_i32x4(wfs23, wfs23, 245);
_mm512_mask_storeu_epi32(wfPtr1+444416+111104*i5+1792*c3+64*k2+64*m3+16*f4, 65535, rep14);
__m512i wfs24 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf41, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs24 = _mm512_inserti64x4(wfs24, _mm512_cvtps_ph(wf42, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep15 = _mm512_shuffle_i32x4(wfs24, wfs24, 160);
_mm512_mask_storeu_epi32(wfPtr1+888832+111104*i5+1792*c3+64*k2+64*m3+16*f4, 65535, rep15);
__m512i rep16 = _mm512_shuffle_i32x4(wfs24, wfs24, 245);
_mm512_mask_storeu_epi32(wfPtr1+1333248+111104*i5+1792*c3+64*k2+64*m3+16*f4, 65535, rep16);
}
__m512 bias2 = _mm512_setzero_ps();
if (!e1) {
bias2 = _mm512_maskz_loadu_ps(1, biasPtr1-0+244*i5+8*j1);
bias2 = _mm512_mul_ps(bias2, _mm512_set1_ps(6.4e+01f));
}
_mm512_mask_storeu_ps(bfPtr1-0+248*i5+8*j1, 1, bias2);
if (j1 >= jj1) return;
j1 = 31;
}
}

static void Example8StriderArrangeFilts1(Example8ThreaderTeam1* team13, char** tensors1) {
Example8ThreaderTask1 task5;
task5.callee1 = Example8StriderArrangeFilts1Callee1;
task5.any1 = tensors1;
task5.nd1 = 3;
task5.hull1[0] = 6;
task5.hull1[1] = 4;
task5.hull1[2] = 1;
Example8ThreaderDo1(team13, &task5);
}

static void Example8StriderArrangeDats1Callee1(Example8ThreaderTask1* task6, int64_t* pt8) {
char** tensors4 = task6->any1;
ptrdiff_t s1 = 0;
ptrdiff_t c4 = 0;
ptrdiff_t g3 = pt8[2];
ptrdiff_t e2 = 0;
char*restrict datPtr1 = tensors4[0]-248+918720*e2;
char*restrict dfPtr1 = tensors4[1]+9732096*e2;
ptrdiff_t i6 = 2*g3;
ptrdiff_t ii1 = i6+1;
for (; i6 <= ii1; ++i6) {
ptrdiff_t j2 = 1*c4;
ptrdiff_t rel1 = j2-0;
ptrdiff_t base1 = 0;
ptrdiff_t h1 = base1+0;
ptrdiff_t w1 = 0;
ptrdiff_t k3 = 14*s1;
ptrdiff_t kk1 = k3+13;
for (; k3 <= kk1; ++k3) {
ptrdiff_t b3 = 0;
ptrdiff_t m4 = (size_t)b3/2;
ptrdiff_t f5 = (size_t)b3%2;
__m512 dat1 = _mm512_maskz_loadu_ps(65532, datPtr1+240+32480*i6+2320*k3+80*h1+4*w1+0*b3);
__m512 dat2 = _mm512_maskz_loadu_ps(65532, datPtr1+320+32480*i6+2320*k3+80*h1+4*w1+0*b3);
__m512 dat3 = _mm512_maskz_loadu_ps(65532, datPtr1+400+32480*i6+2320*k3+80*h1+4*w1+0*b3);
__m512 dat4 = _mm512_maskz_loadu_ps(65532, datPtr1+480+32480*i6+2320*k3+80*h1+4*w1+0*b3);
__m512 dat5 = _mm512_maskz_loadu_ps(65532, datPtr1+560+32480*i6+2320*k3+80*h1+4*w1+0*b3);
__m512 dat6 = _mm512_maskz_loadu_ps(65532, datPtr1+640+32480*i6+2320*k3+80*h1+4*w1+0*b3);
__m512 dat7 = _mm512_maskz_loadu_ps(65532, datPtr1+720+32480*i6+2320*k3+80*h1+4*w1+0*b3);
__m512 dat8 = _mm512_maskz_loadu_ps(65532, datPtr1+800+32480*i6+2320*k3+80*h1+4*w1+0*b3);
__m512 dat9 = _mm512_maskz_loadu_ps(65532, datPtr1+880+32480*i6+2320*k3+80*h1+4*w1+0*b3);
__m512 dat10 = _mm512_maskz_loadu_ps(65532, datPtr1+960+32480*i6+2320*k3+80*h1+4*w1+0*b3);
__m512 dat11 = _mm512_maskz_loadu_ps(65532, datPtr1+1040+32480*i6+2320*k3+80*h1+4*w1+0*b3);
__m512 dat12 = _mm512_maskz_loadu_ps(65532, datPtr1+1120+32480*i6+2320*k3+80*h1+4*w1+0*b3);
__m512 dat13 = _mm512_maskz_loadu_ps(65532, datPtr1+1200+32480*i6+2320*k3+80*h1+4*w1+0*b3);
__m512 fft505 = _mm512_add_ps(_mm512_setzero_ps(), dat6);
__m512 fft593 = _mm512_add_ps(_mm512_setzero_ps(), dat7);
__m512 fft506 = _mm512_sub_ps(_mm512_setzero_ps(), dat6);
__m512 fft594 = _mm512_sub_ps(_mm512_setzero_ps(), dat7);
__m512 fft507 = _mm512_add_ps(_mm512_setzero_ps(), dat8);
__m512 fft595 = _mm512_add_ps(dat1, dat9);
__m512 fft508 = _mm512_sub_ps(_mm512_setzero_ps(), dat8);
__m512 fft596 = _mm512_sub_ps(dat1, dat9);
__m512 fft509 = _mm512_add_ps(dat2, dat10);
__m512 fft597 = _mm512_add_ps(dat3, dat11);
__m512 fft510 = _mm512_sub_ps(dat2, dat10);
__m512 fft598 = _mm512_sub_ps(dat3, dat11);
__m512 fft511 = _mm512_add_ps(dat4, dat12);
__m512 fft599 = _mm512_add_ps(dat5, dat13);
__m512 fft512 = _mm512_sub_ps(dat4, dat12);
__m512 fft600 = _mm512_sub_ps(dat5, dat13);
__m512 fft513 = _mm512_add_ps(fft505, fft509);
__m512 fft601 = _mm512_add_ps(fft593, fft597);
__m512 fft514 = _mm512_sub_ps(fft505, fft509);
__m512 fft602 = _mm512_sub_ps(fft593, fft597);
__m512 fft515 = _mm512_add_ps(fft507, fft511);
__m512 fft603 = _mm512_add_ps(fft595, fft599);
__m512 fft516 = _mm512_sub_ps(fft511, fft507);
__m512 fft604 = _mm512_sub_ps(fft599, fft595);
__m512 fft517 = _mm512_sub_ps(fft508, fft512);
__m512 fft605 = _mm512_sub_ps(fft596, fft600);
__m512 fft518 = _mm512_add_ps(fft508, fft512);
__m512 fft606 = _mm512_add_ps(fft596, fft600);
__m512 fft519 = _mm512_add_ps(fft513, fft515);
__m512 fft607 = _mm512_add_ps(fft601, fft603);
__m512 fft520 = _mm512_sub_ps(fft513, fft515);
__m512 fft608 = _mm512_sub_ps(fft601, fft603);
__m512 fft521 = _mm512_fmadd_ps(fft517, _mm512_set1_ps(7.0710677e-01f), fft506);
__m512 fft609 = _mm512_fmadd_ps(fft605, _mm512_set1_ps(7.0710677e-01f), fft594);
__m512 fft522 = _mm512_fnmsub_ps(fft518, _mm512_set1_ps(7.0710677e-01f), fft510);
__m512 fft610 = _mm512_fnmsub_ps(fft606, _mm512_set1_ps(7.0710677e-01f), fft598);
__m512 fft523 = _mm512_fnmadd_ps(fft517, _mm512_set1_ps(7.0710677e-01f), fft506);
__m512 fft611 = _mm512_fnmadd_ps(fft605, _mm512_set1_ps(7.0710677e-01f), fft594);
__m512 fft524 = _mm512_fnmadd_ps(fft518, _mm512_set1_ps(7.0710677e-01f), fft510);
__m512 fft612 = _mm512_fnmadd_ps(fft606, _mm512_set1_ps(7.0710677e-01f), fft598);
__m512 fft525 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft526 = _mm512_fmadd_ps(fft519, fft525, _mm512_shuffle_f32x4(fft519, fft519, 78));
__m512 fft613 = _mm512_fmadd_ps(fft607, fft525, _mm512_shuffle_f32x4(fft607, fft607, 78));
__m512 fft527 = _mm512_fmadd_ps(fft520, fft525, _mm512_shuffle_f32x4(fft520, fft520, 78));
__m512 fft614 = _mm512_fmadd_ps(fft608, fft525, _mm512_shuffle_f32x4(fft608, fft608, 78));
__m512 fft528 = _mm512_fmadd_ps(fft521, fft525, _mm512_shuffle_f32x4(fft521, fft521, 78));
__m512 fft615 = _mm512_fmadd_ps(fft609, fft525, _mm512_shuffle_f32x4(fft609, fft609, 78));
__m512 fft529 = _mm512_fmadd_ps(fft522, fft525, _mm512_shuffle_f32x4(fft522, fft522, 78));
__m512 fft616 = _mm512_fmadd_ps(fft610, fft525, _mm512_shuffle_f32x4(fft610, fft610, 78));
__m512 fft530 = _mm512_fmadd_ps(fft514, fft525, _mm512_shuffle_f32x4(fft514, fft514, 78));
__m512 fft617 = _mm512_fmadd_ps(fft602, fft525, _mm512_shuffle_f32x4(fft602, fft602, 78));
__m512 fft531 = _mm512_fmadd_ps(fft516, fft525, _mm512_shuffle_f32x4(fft516, fft516, 78));
__m512 fft618 = _mm512_fmadd_ps(fft604, fft525, _mm512_shuffle_f32x4(fft604, fft604, 78));
__m512 fft532 = _mm512_fmadd_ps(fft523, fft525, _mm512_shuffle_f32x4(fft523, fft523, 78));
__m512 fft619 = _mm512_fmadd_ps(fft611, fft525, _mm512_shuffle_f32x4(fft611, fft611, 78));
__m512 fft533 = _mm512_fmadd_ps(fft524, fft525, _mm512_shuffle_f32x4(fft524, fft524, 78));
__m512 fft620 = _mm512_fmadd_ps(fft612, fft525, _mm512_shuffle_f32x4(fft612, fft612, 78));
__m512 fft534 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft535 = _mm512_mul_ps(fft526, fft534);
__m512 fft621 = _mm512_mul_ps(fft613, fft534);
__m512 fft536 = _mm512_mul_ps(fft527, fft534);
__m512 fft622 = _mm512_mul_ps(fft614, fft534);
__m512 fft537 = _mm512_mul_ps(fft528, fft534);
__m512 fft623 = _mm512_mul_ps(fft615, fft534);
__m512 fft538 = _mm512_mul_ps(fft529, fft534);
__m512 fft624 = _mm512_mul_ps(fft616, fft534);
__m512 fft539 = _mm512_mul_ps(fft530, fft534);
__m512 fft625 = _mm512_mul_ps(fft617, fft534);
__m512 fft540 = _mm512_mul_ps(fft531, fft534);
__m512 fft626 = _mm512_mul_ps(fft618, fft534);
__m512 fft541 = _mm512_mul_ps(fft532, fft534);
__m512 fft627 = _mm512_mul_ps(fft619, fft534);
__m512 fft542 = _mm512_mul_ps(fft533, fft534);
__m512 fft628 = _mm512_mul_ps(fft620, fft534);
__m512 fft543 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft544 = _mm512_fmadd_ps(fft527, fft543, fft535);
__m512 fft629 = _mm512_fmadd_ps(fft614, fft543, fft621);
__m512 fft545 = _mm512_fnmadd_ps(fft526, fft543, fft536);
__m512 fft630 = _mm512_fnmadd_ps(fft613, fft543, fft622);
__m512 fft546 = _mm512_fmadd_ps(fft529, fft543, fft537);
__m512 fft631 = _mm512_fmadd_ps(fft616, fft543, fft623);
__m512 fft547 = _mm512_fnmadd_ps(fft528, fft543, fft538);
__m512 fft632 = _mm512_fnmadd_ps(fft615, fft543, fft624);
__m512 fft548 = _mm512_fmadd_ps(fft531, fft543, fft539);
__m512 fft633 = _mm512_fmadd_ps(fft618, fft543, fft625);
__m512 fft549 = _mm512_fnmadd_ps(fft530, fft543, fft540);
__m512 fft634 = _mm512_fnmadd_ps(fft617, fft543, fft626);
__m512 fft550 = _mm512_fmadd_ps(fft533, fft543, fft541);
__m512 fft635 = _mm512_fmadd_ps(fft620, fft543, fft627);
__m512 fft551 = _mm512_fnmadd_ps(fft532, fft543, fft542);
__m512 fft636 = _mm512_fnmadd_ps(fft619, fft543, fft628);
__m512 fft552 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft553 = _mm512_fmadd_ps(fft544, fft552, _mm512_shuffle_f32x4(fft544, fft544, 177));
__m512 fft637 = _mm512_fmadd_ps(fft629, fft552, _mm512_shuffle_f32x4(fft629, fft629, 177));
__m512 fft554 = _mm512_fmadd_ps(fft545, fft552, _mm512_shuffle_f32x4(fft545, fft545, 177));
__m512 fft638 = _mm512_fmadd_ps(fft630, fft552, _mm512_shuffle_f32x4(fft630, fft630, 177));
__m512 fft555 = _mm512_fmadd_ps(fft546, fft552, _mm512_shuffle_f32x4(fft546, fft546, 177));
__m512 fft639 = _mm512_fmadd_ps(fft631, fft552, _mm512_shuffle_f32x4(fft631, fft631, 177));
__m512 fft556 = _mm512_fmadd_ps(fft547, fft552, _mm512_shuffle_f32x4(fft547, fft547, 177));
__m512 fft640 = _mm512_fmadd_ps(fft632, fft552, _mm512_shuffle_f32x4(fft632, fft632, 177));
__m512 fft557 = _mm512_fmadd_ps(fft548, fft552, _mm512_shuffle_f32x4(fft548, fft548, 177));
__m512 fft641 = _mm512_fmadd_ps(fft633, fft552, _mm512_shuffle_f32x4(fft633, fft633, 177));
__m512 fft558 = _mm512_fmadd_ps(fft549, fft552, _mm512_shuffle_f32x4(fft549, fft549, 177));
__m512 fft642 = _mm512_fmadd_ps(fft634, fft552, _mm512_shuffle_f32x4(fft634, fft634, 177));
__m512 fft559 = _mm512_fmadd_ps(fft550, fft552, _mm512_shuffle_f32x4(fft550, fft550, 177));
__m512 fft643 = _mm512_fmadd_ps(fft635, fft552, _mm512_shuffle_f32x4(fft635, fft635, 177));
__m512 fft560 = _mm512_fmadd_ps(fft551, fft552, _mm512_shuffle_f32x4(fft551, fft551, 177));
__m512 fft644 = _mm512_fmadd_ps(fft636, fft552, _mm512_shuffle_f32x4(fft636, fft636, 177));
__m512 fft561 = _mm512_mask_mov_ps(fft553, 49344, fft554);
__m512 fft645 = _mm512_mask_mov_ps(fft637, 49344, fft638);
__m512 fft562 = _mm512_mask_sub_ps(fft554, 49344, _mm512_setzero_ps(), fft553);
__m512 fft646 = _mm512_mask_sub_ps(fft638, 49344, _mm512_setzero_ps(), fft637);
__m512 fft563 = _mm512_mask_mov_ps(fft555, 49344, fft556);
__m512 fft647 = _mm512_mask_mov_ps(fft639, 49344, fft640);
__m512 fft564 = _mm512_mask_sub_ps(fft556, 49344, _mm512_setzero_ps(), fft555);
__m512 fft648 = _mm512_mask_sub_ps(fft640, 49344, _mm512_setzero_ps(), fft639);
__m512 fft565 = _mm512_mask_mov_ps(fft557, 49344, fft558);
__m512 fft649 = _mm512_mask_mov_ps(fft641, 49344, fft642);
__m512 fft566 = _mm512_mask_sub_ps(fft558, 49344, _mm512_setzero_ps(), fft557);
__m512 fft650 = _mm512_mask_sub_ps(fft642, 49344, _mm512_setzero_ps(), fft641);
__m512 fft567 = _mm512_mask_mov_ps(fft559, 49344, fft560);
__m512 fft651 = _mm512_mask_mov_ps(fft643, 49344, fft644);
__m512 fft568 = _mm512_mask_sub_ps(fft560, 49344, _mm512_setzero_ps(), fft559);
__m512 fft652 = _mm512_mask_sub_ps(fft644, 49344, _mm512_setzero_ps(), fft643);
__m512 fft569 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft570 = _mm512_fmadd_ps(fft561, fft569, _mm512_shuffle_ps(fft561, fft561, 78));
__m512 fft653 = _mm512_fmadd_ps(fft645, fft569, _mm512_shuffle_ps(fft645, fft645, 78));
__m512 fft571 = _mm512_fmadd_ps(fft562, fft569, _mm512_shuffle_ps(fft562, fft562, 78));
__m512 fft654 = _mm512_fmadd_ps(fft646, fft569, _mm512_shuffle_ps(fft646, fft646, 78));
__m512 fft572 = _mm512_fmadd_ps(fft563, fft569, _mm512_shuffle_ps(fft563, fft563, 78));
__m512 fft655 = _mm512_fmadd_ps(fft647, fft569, _mm512_shuffle_ps(fft647, fft647, 78));
__m512 fft573 = _mm512_fmadd_ps(fft564, fft569, _mm512_shuffle_ps(fft564, fft564, 78));
__m512 fft656 = _mm512_fmadd_ps(fft648, fft569, _mm512_shuffle_ps(fft648, fft648, 78));
__m512 fft574 = _mm512_fmadd_ps(fft565, fft569, _mm512_shuffle_ps(fft565, fft565, 78));
__m512 fft657 = _mm512_fmadd_ps(fft649, fft569, _mm512_shuffle_ps(fft649, fft649, 78));
__m512 fft575 = _mm512_fmadd_ps(fft566, fft569, _mm512_shuffle_ps(fft566, fft566, 78));
__m512 fft658 = _mm512_fmadd_ps(fft650, fft569, _mm512_shuffle_ps(fft650, fft650, 78));
__m512 fft576 = _mm512_fmadd_ps(fft567, fft569, _mm512_shuffle_ps(fft567, fft567, 78));
__m512 fft659 = _mm512_fmadd_ps(fft651, fft569, _mm512_shuffle_ps(fft651, fft651, 78));
__m512 fft577 = _mm512_fmadd_ps(fft568, fft569, _mm512_shuffle_ps(fft568, fft568, 78));
__m512 fft660 = _mm512_fmadd_ps(fft652, fft569, _mm512_shuffle_ps(fft652, fft652, 78));
__m512i fft578 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft579 = _mm512_permutexvar_ps(fft578, fft570);
__m512 fft661 = _mm512_permutexvar_ps(fft578, fft653);
__m512i fft580 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft581 = _mm512_permutexvar_ps(fft580, fft570);
__m512 fft662 = _mm512_permutexvar_ps(fft580, fft653);
__m512 fft582 = _mm512_permutexvar_ps(fft578, fft571);
__m512 fft663 = _mm512_permutexvar_ps(fft578, fft654);
__m512 fft583 = _mm512_permutexvar_ps(fft580, fft571);
__m512 fft664 = _mm512_permutexvar_ps(fft580, fft654);
__m512 fft584 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft585 = _mm512_fmadd_ps(fft579, fft584, fft581);
__m512 fft665 = _mm512_fmadd_ps(fft661, fft584, fft662);
__m512 fft586 = _mm512_fnmadd_ps(fft583, fft584, fft582);
__m512 fft666 = _mm512_fnmadd_ps(fft664, fft584, fft663);
__m512 fft587 = _mm512_mask_mov_ps(fft583, 21845, fft585);
__m512 fft667 = _mm512_mask_mov_ps(fft664, 21845, fft665);
__m512 fft588 = _mm512_mask_mov_ps(fft579, 43176, fft585);
__m512 fft668 = _mm512_mask_mov_ps(fft661, 43176, fft665);
__m512 fft589 = _mm512_mask_mov_ps(fft587, 43176, fft586);
__m512 fft669 = _mm512_mask_mov_ps(fft667, 43176, fft666);
__m512 fft590 = _mm512_mask_mov_ps(fft588, 22102, fft586);
__m512 fft670 = _mm512_mask_mov_ps(fft668, 22102, fft666);
__m512 fft591 = _mm512_mask_mul_ps(fft589, 64764, fft589, _mm512_set1_ps(5e-01f));
__m512 fft671 = _mm512_mask_mul_ps(fft669, 64764, fft669, _mm512_set1_ps(5e-01f));
__m512 fft592 = _mm512_mask_mul_ps(fft590, 64764, fft590, _mm512_set1_ps(5e-01f));
__m512 fft672 = _mm512_mask_mul_ps(fft670, 64764, fft670, _mm512_set1_ps(5e-01f));
__m512 df1 = fft591;
__m512 df9 = fft671;
__m512 df2 = fft592;
__m512 df10 = fft672;
__m512 df3 = fft572;
__m512 df11 = fft655;
__m512 df4 = fft573;
__m512 df12 = fft656;
__m512 df5 = fft574;
__m512 df13 = fft657;
__m512 df6 = fft575;
__m512 df14 = fft658;
__m512 df7 = fft576;
__m512 df15 = fft659;
__m512 df8 = fft577;
__m512 df16 = fft660;
__m512i eo4 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df3 = _mm512_permutexvar_ps(eo4, df3);
df4 = _mm512_permutexvar_ps(eo4, df4);
_mm512_mask_storeu_ps(dfPtr1+5376+21504*i6+5376*j2+384*k3+128*m4+32*f5, 255, df3);
_mm512_mask_storeu_ps(dfPtr1+5440+21504*i6+5376*j2+384*k3+128*m4+32*f5, 255, df4);
_mm512_mask_storeu_ps(dfPtr1+91360+21504*i6+5376*j2+384*k3+128*m4+32*f5, 65280, df3);
_mm512_mask_storeu_ps(dfPtr1+91424+21504*i6+5376*j2+384*k3+128*m4+32*f5, 65280, df4);
df11 = _mm512_permutexvar_ps(eo4, df11);
df12 = _mm512_permutexvar_ps(eo4, df12);
_mm512_mask_storeu_ps(dfPtr1+177408+21504*i6+5376*j2+384*k3+128*m4+32*f5, 255, df11);
_mm512_mask_storeu_ps(dfPtr1+177472+21504*i6+5376*j2+384*k3+128*m4+32*f5, 255, df12);
_mm512_mask_storeu_ps(dfPtr1+263392+21504*i6+5376*j2+384*k3+128*m4+32*f5, 65280, df11);
_mm512_mask_storeu_ps(dfPtr1+263456+21504*i6+5376*j2+384*k3+128*m4+32*f5, 65280, df12);
df5 = _mm512_permutexvar_ps(eo4, df5);
df6 = _mm512_permutexvar_ps(eo4, df6);
_mm512_mask_storeu_ps(dfPtr1+10752+21504*i6+5376*j2+384*k3+128*m4+32*f5, 255, df5);
_mm512_mask_storeu_ps(dfPtr1+10816+21504*i6+5376*j2+384*k3+128*m4+32*f5, 255, df6);
_mm512_mask_storeu_ps(dfPtr1+96736+21504*i6+5376*j2+384*k3+128*m4+32*f5, 65280, df5);
_mm512_mask_storeu_ps(dfPtr1+96800+21504*i6+5376*j2+384*k3+128*m4+32*f5, 65280, df6);
df13 = _mm512_permutexvar_ps(eo4, df13);
df14 = _mm512_permutexvar_ps(eo4, df14);
_mm512_mask_storeu_ps(dfPtr1+182784+21504*i6+5376*j2+384*k3+128*m4+32*f5, 255, df13);
_mm512_mask_storeu_ps(dfPtr1+182848+21504*i6+5376*j2+384*k3+128*m4+32*f5, 255, df14);
_mm512_mask_storeu_ps(dfPtr1+268768+21504*i6+5376*j2+384*k3+128*m4+32*f5, 65280, df13);
_mm512_mask_storeu_ps(dfPtr1+268832+21504*i6+5376*j2+384*k3+128*m4+32*f5, 65280, df14);
df7 = _mm512_permutexvar_ps(eo4, df7);
df8 = _mm512_permutexvar_ps(eo4, df8);
_mm512_mask_storeu_ps(dfPtr1+16128+21504*i6+5376*j2+384*k3+128*m4+32*f5, 255, df7);
_mm512_mask_storeu_ps(dfPtr1+16192+21504*i6+5376*j2+384*k3+128*m4+32*f5, 255, df8);
_mm512_mask_storeu_ps(dfPtr1+102112+21504*i6+5376*j2+384*k3+128*m4+32*f5, 65280, df7);
_mm512_mask_storeu_ps(dfPtr1+102176+21504*i6+5376*j2+384*k3+128*m4+32*f5, 65280, df8);
df15 = _mm512_permutexvar_ps(eo4, df15);
df16 = _mm512_permutexvar_ps(eo4, df16);
_mm512_mask_storeu_ps(dfPtr1+188160+21504*i6+5376*j2+384*k3+128*m4+32*f5, 255, df15);
_mm512_mask_storeu_ps(dfPtr1+188224+21504*i6+5376*j2+384*k3+128*m4+32*f5, 255, df16);
_mm512_mask_storeu_ps(dfPtr1+274144+21504*i6+5376*j2+384*k3+128*m4+32*f5, 65280, df15);
_mm512_mask_storeu_ps(dfPtr1+274208+21504*i6+5376*j2+384*k3+128*m4+32*f5, 65280, df16);
_mm512_mask_storeu_ps(dfPtr1+0+21504*i6+5376*j2+384*k3+128*m4+32*f5, 255, df1);
_mm512_mask_storeu_ps(dfPtr1+64+21504*i6+5376*j2+384*k3+128*m4+32*f5, 255, df2);
_mm512_mask_storeu_ps(dfPtr1+85984+21504*i6+5376*j2+384*k3+128*m4+32*f5, 65280, df1);
_mm512_mask_storeu_ps(dfPtr1+86048+21504*i6+5376*j2+384*k3+128*m4+32*f5, 65280, df2);
_mm512_mask_storeu_ps(dfPtr1+172032+21504*i6+5376*j2+384*k3+128*m4+32*f5, 255, df9);
_mm512_mask_storeu_ps(dfPtr1+172096+21504*i6+5376*j2+384*k3+128*m4+32*f5, 255, df10);
_mm512_mask_storeu_ps(dfPtr1+258016+21504*i6+5376*j2+384*k3+128*m4+32*f5, 65280, df9);
_mm512_mask_storeu_ps(dfPtr1+258080+21504*i6+5376*j2+384*k3+128*m4+32*f5, 65280, df10);
ptrdiff_t b4 = 1;
ptrdiff_t m5 = (size_t)b4/2;
ptrdiff_t f6 = (size_t)b4%2;
__m512 dat14 = _mm512_maskz_loadu_ps(1023, datPtr1+288+32480*i6+2320*k3+80*h1+4*w1+0*b4);
__m512 dat15 = _mm512_maskz_loadu_ps(1023, datPtr1+368+32480*i6+2320*k3+80*h1+4*w1+0*b4);
__m512 dat16 = _mm512_maskz_loadu_ps(1023, datPtr1+448+32480*i6+2320*k3+80*h1+4*w1+0*b4);
__m512 dat17 = _mm512_maskz_loadu_ps(1023, datPtr1+528+32480*i6+2320*k3+80*h1+4*w1+0*b4);
__m512 dat18 = _mm512_maskz_loadu_ps(1023, datPtr1+608+32480*i6+2320*k3+80*h1+4*w1+0*b4);
__m512 dat19 = _mm512_maskz_loadu_ps(1023, datPtr1+688+32480*i6+2320*k3+80*h1+4*w1+0*b4);
__m512 dat20 = _mm512_maskz_loadu_ps(1023, datPtr1+768+32480*i6+2320*k3+80*h1+4*w1+0*b4);
__m512 dat21 = _mm512_maskz_loadu_ps(1023, datPtr1+848+32480*i6+2320*k3+80*h1+4*w1+0*b4);
__m512 dat22 = _mm512_maskz_loadu_ps(1023, datPtr1+928+32480*i6+2320*k3+80*h1+4*w1+0*b4);
__m512 dat23 = _mm512_maskz_loadu_ps(1023, datPtr1+1008+32480*i6+2320*k3+80*h1+4*w1+0*b4);
__m512 dat24 = _mm512_maskz_loadu_ps(1023, datPtr1+1088+32480*i6+2320*k3+80*h1+4*w1+0*b4);
__m512 dat25 = _mm512_maskz_loadu_ps(1023, datPtr1+1168+32480*i6+2320*k3+80*h1+4*w1+0*b4);
__m512 dat26 = _mm512_maskz_loadu_ps(1023, datPtr1+1248+32480*i6+2320*k3+80*h1+4*w1+0*b4);
__m512 fft673 = _mm512_add_ps(_mm512_setzero_ps(), dat19);
__m512 fft761 = _mm512_add_ps(_mm512_setzero_ps(), dat20);
__m512 fft674 = _mm512_sub_ps(_mm512_setzero_ps(), dat19);
__m512 fft762 = _mm512_sub_ps(_mm512_setzero_ps(), dat20);
__m512 fft675 = _mm512_add_ps(_mm512_setzero_ps(), dat21);
__m512 fft763 = _mm512_add_ps(dat14, dat22);
__m512 fft676 = _mm512_sub_ps(_mm512_setzero_ps(), dat21);
__m512 fft764 = _mm512_sub_ps(dat14, dat22);
__m512 fft677 = _mm512_add_ps(dat15, dat23);
__m512 fft765 = _mm512_add_ps(dat16, dat24);
__m512 fft678 = _mm512_sub_ps(dat15, dat23);
__m512 fft766 = _mm512_sub_ps(dat16, dat24);
__m512 fft679 = _mm512_add_ps(dat17, dat25);
__m512 fft767 = _mm512_add_ps(dat18, dat26);
__m512 fft680 = _mm512_sub_ps(dat17, dat25);
__m512 fft768 = _mm512_sub_ps(dat18, dat26);
__m512 fft681 = _mm512_add_ps(fft673, fft677);
__m512 fft769 = _mm512_add_ps(fft761, fft765);
__m512 fft682 = _mm512_sub_ps(fft673, fft677);
__m512 fft770 = _mm512_sub_ps(fft761, fft765);
__m512 fft683 = _mm512_add_ps(fft675, fft679);
__m512 fft771 = _mm512_add_ps(fft763, fft767);
__m512 fft684 = _mm512_sub_ps(fft679, fft675);
__m512 fft772 = _mm512_sub_ps(fft767, fft763);
__m512 fft685 = _mm512_sub_ps(fft676, fft680);
__m512 fft773 = _mm512_sub_ps(fft764, fft768);
__m512 fft686 = _mm512_add_ps(fft676, fft680);
__m512 fft774 = _mm512_add_ps(fft764, fft768);
__m512 fft687 = _mm512_add_ps(fft681, fft683);
__m512 fft775 = _mm512_add_ps(fft769, fft771);
__m512 fft688 = _mm512_sub_ps(fft681, fft683);
__m512 fft776 = _mm512_sub_ps(fft769, fft771);
__m512 fft689 = _mm512_fmadd_ps(fft685, _mm512_set1_ps(7.0710677e-01f), fft674);
__m512 fft777 = _mm512_fmadd_ps(fft773, _mm512_set1_ps(7.0710677e-01f), fft762);
__m512 fft690 = _mm512_fnmsub_ps(fft686, _mm512_set1_ps(7.0710677e-01f), fft678);
__m512 fft778 = _mm512_fnmsub_ps(fft774, _mm512_set1_ps(7.0710677e-01f), fft766);
__m512 fft691 = _mm512_fnmadd_ps(fft685, _mm512_set1_ps(7.0710677e-01f), fft674);
__m512 fft779 = _mm512_fnmadd_ps(fft773, _mm512_set1_ps(7.0710677e-01f), fft762);
__m512 fft692 = _mm512_fnmadd_ps(fft686, _mm512_set1_ps(7.0710677e-01f), fft678);
__m512 fft780 = _mm512_fnmadd_ps(fft774, _mm512_set1_ps(7.0710677e-01f), fft766);
__m512 fft693 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft694 = _mm512_fmadd_ps(fft687, fft693, _mm512_shuffle_f32x4(fft687, fft687, 78));
__m512 fft781 = _mm512_fmadd_ps(fft775, fft693, _mm512_shuffle_f32x4(fft775, fft775, 78));
__m512 fft695 = _mm512_fmadd_ps(fft688, fft693, _mm512_shuffle_f32x4(fft688, fft688, 78));
__m512 fft782 = _mm512_fmadd_ps(fft776, fft693, _mm512_shuffle_f32x4(fft776, fft776, 78));
__m512 fft696 = _mm512_fmadd_ps(fft689, fft693, _mm512_shuffle_f32x4(fft689, fft689, 78));
__m512 fft783 = _mm512_fmadd_ps(fft777, fft693, _mm512_shuffle_f32x4(fft777, fft777, 78));
__m512 fft697 = _mm512_fmadd_ps(fft690, fft693, _mm512_shuffle_f32x4(fft690, fft690, 78));
__m512 fft784 = _mm512_fmadd_ps(fft778, fft693, _mm512_shuffle_f32x4(fft778, fft778, 78));
__m512 fft698 = _mm512_fmadd_ps(fft682, fft693, _mm512_shuffle_f32x4(fft682, fft682, 78));
__m512 fft785 = _mm512_fmadd_ps(fft770, fft693, _mm512_shuffle_f32x4(fft770, fft770, 78));
__m512 fft699 = _mm512_fmadd_ps(fft684, fft693, _mm512_shuffle_f32x4(fft684, fft684, 78));
__m512 fft786 = _mm512_fmadd_ps(fft772, fft693, _mm512_shuffle_f32x4(fft772, fft772, 78));
__m512 fft700 = _mm512_fmadd_ps(fft691, fft693, _mm512_shuffle_f32x4(fft691, fft691, 78));
__m512 fft787 = _mm512_fmadd_ps(fft779, fft693, _mm512_shuffle_f32x4(fft779, fft779, 78));
__m512 fft701 = _mm512_fmadd_ps(fft692, fft693, _mm512_shuffle_f32x4(fft692, fft692, 78));
__m512 fft788 = _mm512_fmadd_ps(fft780, fft693, _mm512_shuffle_f32x4(fft780, fft780, 78));
__m512 fft702 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft703 = _mm512_mul_ps(fft694, fft702);
__m512 fft789 = _mm512_mul_ps(fft781, fft702);
__m512 fft704 = _mm512_mul_ps(fft695, fft702);
__m512 fft790 = _mm512_mul_ps(fft782, fft702);
__m512 fft705 = _mm512_mul_ps(fft696, fft702);
__m512 fft791 = _mm512_mul_ps(fft783, fft702);
__m512 fft706 = _mm512_mul_ps(fft697, fft702);
__m512 fft792 = _mm512_mul_ps(fft784, fft702);
__m512 fft707 = _mm512_mul_ps(fft698, fft702);
__m512 fft793 = _mm512_mul_ps(fft785, fft702);
__m512 fft708 = _mm512_mul_ps(fft699, fft702);
__m512 fft794 = _mm512_mul_ps(fft786, fft702);
__m512 fft709 = _mm512_mul_ps(fft700, fft702);
__m512 fft795 = _mm512_mul_ps(fft787, fft702);
__m512 fft710 = _mm512_mul_ps(fft701, fft702);
__m512 fft796 = _mm512_mul_ps(fft788, fft702);
__m512 fft711 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft712 = _mm512_fmadd_ps(fft695, fft711, fft703);
__m512 fft797 = _mm512_fmadd_ps(fft782, fft711, fft789);
__m512 fft713 = _mm512_fnmadd_ps(fft694, fft711, fft704);
__m512 fft798 = _mm512_fnmadd_ps(fft781, fft711, fft790);
__m512 fft714 = _mm512_fmadd_ps(fft697, fft711, fft705);
__m512 fft799 = _mm512_fmadd_ps(fft784, fft711, fft791);
__m512 fft715 = _mm512_fnmadd_ps(fft696, fft711, fft706);
__m512 fft800 = _mm512_fnmadd_ps(fft783, fft711, fft792);
__m512 fft716 = _mm512_fmadd_ps(fft699, fft711, fft707);
__m512 fft801 = _mm512_fmadd_ps(fft786, fft711, fft793);
__m512 fft717 = _mm512_fnmadd_ps(fft698, fft711, fft708);
__m512 fft802 = _mm512_fnmadd_ps(fft785, fft711, fft794);
__m512 fft718 = _mm512_fmadd_ps(fft701, fft711, fft709);
__m512 fft803 = _mm512_fmadd_ps(fft788, fft711, fft795);
__m512 fft719 = _mm512_fnmadd_ps(fft700, fft711, fft710);
__m512 fft804 = _mm512_fnmadd_ps(fft787, fft711, fft796);
__m512 fft720 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft721 = _mm512_fmadd_ps(fft712, fft720, _mm512_shuffle_f32x4(fft712, fft712, 177));
__m512 fft805 = _mm512_fmadd_ps(fft797, fft720, _mm512_shuffle_f32x4(fft797, fft797, 177));
__m512 fft722 = _mm512_fmadd_ps(fft713, fft720, _mm512_shuffle_f32x4(fft713, fft713, 177));
__m512 fft806 = _mm512_fmadd_ps(fft798, fft720, _mm512_shuffle_f32x4(fft798, fft798, 177));
__m512 fft723 = _mm512_fmadd_ps(fft714, fft720, _mm512_shuffle_f32x4(fft714, fft714, 177));
__m512 fft807 = _mm512_fmadd_ps(fft799, fft720, _mm512_shuffle_f32x4(fft799, fft799, 177));
__m512 fft724 = _mm512_fmadd_ps(fft715, fft720, _mm512_shuffle_f32x4(fft715, fft715, 177));
__m512 fft808 = _mm512_fmadd_ps(fft800, fft720, _mm512_shuffle_f32x4(fft800, fft800, 177));
__m512 fft725 = _mm512_fmadd_ps(fft716, fft720, _mm512_shuffle_f32x4(fft716, fft716, 177));
__m512 fft809 = _mm512_fmadd_ps(fft801, fft720, _mm512_shuffle_f32x4(fft801, fft801, 177));
__m512 fft726 = _mm512_fmadd_ps(fft717, fft720, _mm512_shuffle_f32x4(fft717, fft717, 177));
__m512 fft810 = _mm512_fmadd_ps(fft802, fft720, _mm512_shuffle_f32x4(fft802, fft802, 177));
__m512 fft727 = _mm512_fmadd_ps(fft718, fft720, _mm512_shuffle_f32x4(fft718, fft718, 177));
__m512 fft811 = _mm512_fmadd_ps(fft803, fft720, _mm512_shuffle_f32x4(fft803, fft803, 177));
__m512 fft728 = _mm512_fmadd_ps(fft719, fft720, _mm512_shuffle_f32x4(fft719, fft719, 177));
__m512 fft812 = _mm512_fmadd_ps(fft804, fft720, _mm512_shuffle_f32x4(fft804, fft804, 177));
__m512 fft729 = _mm512_mask_mov_ps(fft721, 49344, fft722);
__m512 fft813 = _mm512_mask_mov_ps(fft805, 49344, fft806);
__m512 fft730 = _mm512_mask_sub_ps(fft722, 49344, _mm512_setzero_ps(), fft721);
__m512 fft814 = _mm512_mask_sub_ps(fft806, 49344, _mm512_setzero_ps(), fft805);
__m512 fft731 = _mm512_mask_mov_ps(fft723, 49344, fft724);
__m512 fft815 = _mm512_mask_mov_ps(fft807, 49344, fft808);
__m512 fft732 = _mm512_mask_sub_ps(fft724, 49344, _mm512_setzero_ps(), fft723);
__m512 fft816 = _mm512_mask_sub_ps(fft808, 49344, _mm512_setzero_ps(), fft807);
__m512 fft733 = _mm512_mask_mov_ps(fft725, 49344, fft726);
__m512 fft817 = _mm512_mask_mov_ps(fft809, 49344, fft810);
__m512 fft734 = _mm512_mask_sub_ps(fft726, 49344, _mm512_setzero_ps(), fft725);
__m512 fft818 = _mm512_mask_sub_ps(fft810, 49344, _mm512_setzero_ps(), fft809);
__m512 fft735 = _mm512_mask_mov_ps(fft727, 49344, fft728);
__m512 fft819 = _mm512_mask_mov_ps(fft811, 49344, fft812);
__m512 fft736 = _mm512_mask_sub_ps(fft728, 49344, _mm512_setzero_ps(), fft727);
__m512 fft820 = _mm512_mask_sub_ps(fft812, 49344, _mm512_setzero_ps(), fft811);
__m512 fft737 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft738 = _mm512_fmadd_ps(fft729, fft737, _mm512_shuffle_ps(fft729, fft729, 78));
__m512 fft821 = _mm512_fmadd_ps(fft813, fft737, _mm512_shuffle_ps(fft813, fft813, 78));
__m512 fft739 = _mm512_fmadd_ps(fft730, fft737, _mm512_shuffle_ps(fft730, fft730, 78));
__m512 fft822 = _mm512_fmadd_ps(fft814, fft737, _mm512_shuffle_ps(fft814, fft814, 78));
__m512 fft740 = _mm512_fmadd_ps(fft731, fft737, _mm512_shuffle_ps(fft731, fft731, 78));
__m512 fft823 = _mm512_fmadd_ps(fft815, fft737, _mm512_shuffle_ps(fft815, fft815, 78));
__m512 fft741 = _mm512_fmadd_ps(fft732, fft737, _mm512_shuffle_ps(fft732, fft732, 78));
__m512 fft824 = _mm512_fmadd_ps(fft816, fft737, _mm512_shuffle_ps(fft816, fft816, 78));
__m512 fft742 = _mm512_fmadd_ps(fft733, fft737, _mm512_shuffle_ps(fft733, fft733, 78));
__m512 fft825 = _mm512_fmadd_ps(fft817, fft737, _mm512_shuffle_ps(fft817, fft817, 78));
__m512 fft743 = _mm512_fmadd_ps(fft734, fft737, _mm512_shuffle_ps(fft734, fft734, 78));
__m512 fft826 = _mm512_fmadd_ps(fft818, fft737, _mm512_shuffle_ps(fft818, fft818, 78));
__m512 fft744 = _mm512_fmadd_ps(fft735, fft737, _mm512_shuffle_ps(fft735, fft735, 78));
__m512 fft827 = _mm512_fmadd_ps(fft819, fft737, _mm512_shuffle_ps(fft819, fft819, 78));
__m512 fft745 = _mm512_fmadd_ps(fft736, fft737, _mm512_shuffle_ps(fft736, fft736, 78));
__m512 fft828 = _mm512_fmadd_ps(fft820, fft737, _mm512_shuffle_ps(fft820, fft820, 78));
__m512i fft746 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft747 = _mm512_permutexvar_ps(fft746, fft738);
__m512 fft829 = _mm512_permutexvar_ps(fft746, fft821);
__m512i fft748 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft749 = _mm512_permutexvar_ps(fft748, fft738);
__m512 fft830 = _mm512_permutexvar_ps(fft748, fft821);
__m512 fft750 = _mm512_permutexvar_ps(fft746, fft739);
__m512 fft831 = _mm512_permutexvar_ps(fft746, fft822);
__m512 fft751 = _mm512_permutexvar_ps(fft748, fft739);
__m512 fft832 = _mm512_permutexvar_ps(fft748, fft822);
__m512 fft752 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft753 = _mm512_fmadd_ps(fft747, fft752, fft749);
__m512 fft833 = _mm512_fmadd_ps(fft829, fft752, fft830);
__m512 fft754 = _mm512_fnmadd_ps(fft751, fft752, fft750);
__m512 fft834 = _mm512_fnmadd_ps(fft832, fft752, fft831);
__m512 fft755 = _mm512_mask_mov_ps(fft751, 21845, fft753);
__m512 fft835 = _mm512_mask_mov_ps(fft832, 21845, fft833);
__m512 fft756 = _mm512_mask_mov_ps(fft747, 43176, fft753);
__m512 fft836 = _mm512_mask_mov_ps(fft829, 43176, fft833);
__m512 fft757 = _mm512_mask_mov_ps(fft755, 43176, fft754);
__m512 fft837 = _mm512_mask_mov_ps(fft835, 43176, fft834);
__m512 fft758 = _mm512_mask_mov_ps(fft756, 22102, fft754);
__m512 fft838 = _mm512_mask_mov_ps(fft836, 22102, fft834);
__m512 fft759 = _mm512_mask_mul_ps(fft757, 64764, fft757, _mm512_set1_ps(5e-01f));
__m512 fft839 = _mm512_mask_mul_ps(fft837, 64764, fft837, _mm512_set1_ps(5e-01f));
__m512 fft760 = _mm512_mask_mul_ps(fft758, 64764, fft758, _mm512_set1_ps(5e-01f));
__m512 fft840 = _mm512_mask_mul_ps(fft838, 64764, fft838, _mm512_set1_ps(5e-01f));
__m512 df17 = fft759;
__m512 df25 = fft839;
__m512 df18 = fft760;
__m512 df26 = fft840;
__m512 df19 = fft740;
__m512 df27 = fft823;
__m512 df20 = fft741;
__m512 df28 = fft824;
__m512 df21 = fft742;
__m512 df29 = fft825;
__m512 df22 = fft743;
__m512 df30 = fft826;
__m512 df23 = fft744;
__m512 df31 = fft827;
__m512 df24 = fft745;
__m512 df32 = fft828;
__m512i eo5 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df19 = _mm512_permutexvar_ps(eo5, df19);
df20 = _mm512_permutexvar_ps(eo5, df20);
_mm512_mask_storeu_ps(dfPtr1+5376+21504*i6+5376*j2+384*k3+128*m5+32*f6, 255, df19);
_mm512_mask_storeu_ps(dfPtr1+5440+21504*i6+5376*j2+384*k3+128*m5+32*f6, 255, df20);
_mm512_mask_storeu_ps(dfPtr1+91360+21504*i6+5376*j2+384*k3+128*m5+32*f6, 65280, df19);
_mm512_mask_storeu_ps(dfPtr1+91424+21504*i6+5376*j2+384*k3+128*m5+32*f6, 65280, df20);
df27 = _mm512_permutexvar_ps(eo5, df27);
df28 = _mm512_permutexvar_ps(eo5, df28);
_mm512_mask_storeu_ps(dfPtr1+177408+21504*i6+5376*j2+384*k3+128*m5+32*f6, 255, df27);
_mm512_mask_storeu_ps(dfPtr1+177472+21504*i6+5376*j2+384*k3+128*m5+32*f6, 255, df28);
_mm512_mask_storeu_ps(dfPtr1+263392+21504*i6+5376*j2+384*k3+128*m5+32*f6, 65280, df27);
_mm512_mask_storeu_ps(dfPtr1+263456+21504*i6+5376*j2+384*k3+128*m5+32*f6, 65280, df28);
df21 = _mm512_permutexvar_ps(eo5, df21);
df22 = _mm512_permutexvar_ps(eo5, df22);
_mm512_mask_storeu_ps(dfPtr1+10752+21504*i6+5376*j2+384*k3+128*m5+32*f6, 255, df21);
_mm512_mask_storeu_ps(dfPtr1+10816+21504*i6+5376*j2+384*k3+128*m5+32*f6, 255, df22);
_mm512_mask_storeu_ps(dfPtr1+96736+21504*i6+5376*j2+384*k3+128*m5+32*f6, 65280, df21);
_mm512_mask_storeu_ps(dfPtr1+96800+21504*i6+5376*j2+384*k3+128*m5+32*f6, 65280, df22);
df29 = _mm512_permutexvar_ps(eo5, df29);
df30 = _mm512_permutexvar_ps(eo5, df30);
_mm512_mask_storeu_ps(dfPtr1+182784+21504*i6+5376*j2+384*k3+128*m5+32*f6, 255, df29);
_mm512_mask_storeu_ps(dfPtr1+182848+21504*i6+5376*j2+384*k3+128*m5+32*f6, 255, df30);
_mm512_mask_storeu_ps(dfPtr1+268768+21504*i6+5376*j2+384*k3+128*m5+32*f6, 65280, df29);
_mm512_mask_storeu_ps(dfPtr1+268832+21504*i6+5376*j2+384*k3+128*m5+32*f6, 65280, df30);
df23 = _mm512_permutexvar_ps(eo5, df23);
df24 = _mm512_permutexvar_ps(eo5, df24);
_mm512_mask_storeu_ps(dfPtr1+16128+21504*i6+5376*j2+384*k3+128*m5+32*f6, 255, df23);
_mm512_mask_storeu_ps(dfPtr1+16192+21504*i6+5376*j2+384*k3+128*m5+32*f6, 255, df24);
_mm512_mask_storeu_ps(dfPtr1+102112+21504*i6+5376*j2+384*k3+128*m5+32*f6, 65280, df23);
_mm512_mask_storeu_ps(dfPtr1+102176+21504*i6+5376*j2+384*k3+128*m5+32*f6, 65280, df24);
df31 = _mm512_permutexvar_ps(eo5, df31);
df32 = _mm512_permutexvar_ps(eo5, df32);
_mm512_mask_storeu_ps(dfPtr1+188160+21504*i6+5376*j2+384*k3+128*m5+32*f6, 255, df31);
_mm512_mask_storeu_ps(dfPtr1+188224+21504*i6+5376*j2+384*k3+128*m5+32*f6, 255, df32);
_mm512_mask_storeu_ps(dfPtr1+274144+21504*i6+5376*j2+384*k3+128*m5+32*f6, 65280, df31);
_mm512_mask_storeu_ps(dfPtr1+274208+21504*i6+5376*j2+384*k3+128*m5+32*f6, 65280, df32);
_mm512_mask_storeu_ps(dfPtr1+0+21504*i6+5376*j2+384*k3+128*m5+32*f6, 255, df17);
_mm512_mask_storeu_ps(dfPtr1+64+21504*i6+5376*j2+384*k3+128*m5+32*f6, 255, df18);
_mm512_mask_storeu_ps(dfPtr1+85984+21504*i6+5376*j2+384*k3+128*m5+32*f6, 65280, df17);
_mm512_mask_storeu_ps(dfPtr1+86048+21504*i6+5376*j2+384*k3+128*m5+32*f6, 65280, df18);
_mm512_mask_storeu_ps(dfPtr1+172032+21504*i6+5376*j2+384*k3+128*m5+32*f6, 255, df25);
_mm512_mask_storeu_ps(dfPtr1+172096+21504*i6+5376*j2+384*k3+128*m5+32*f6, 255, df26);
_mm512_mask_storeu_ps(dfPtr1+258016+21504*i6+5376*j2+384*k3+128*m5+32*f6, 65280, df25);
_mm512_mask_storeu_ps(dfPtr1+258080+21504*i6+5376*j2+384*k3+128*m5+32*f6, 65280, df26);
ptrdiff_t b5 = 2;
ptrdiff_t m6 = (size_t)b5/2;
ptrdiff_t f7 = (size_t)b5%2;
__m512 dat27 = _mm512_maskz_loadu_ps(65532, datPtr1+960+32480*i6+2320*k3+80*h1+4*w1+0*b5);
__m512 dat28 = _mm512_maskz_loadu_ps(65532, datPtr1+1040+32480*i6+2320*k3+80*h1+4*w1+0*b5);
__m512 dat29 = _mm512_maskz_loadu_ps(65532, datPtr1+1120+32480*i6+2320*k3+80*h1+4*w1+0*b5);
__m512 dat30 = _mm512_maskz_loadu_ps(65532, datPtr1+1200+32480*i6+2320*k3+80*h1+4*w1+0*b5);
__m512 dat31 = _mm512_maskz_loadu_ps(65532, datPtr1+1280+32480*i6+2320*k3+80*h1+4*w1+0*b5);
__m512 dat32 = _mm512_maskz_loadu_ps(65532, datPtr1+1360+32480*i6+2320*k3+80*h1+4*w1+0*b5);
__m512 dat33 = _mm512_maskz_loadu_ps(65532, datPtr1+1440+32480*i6+2320*k3+80*h1+4*w1+0*b5);
__m512 dat34 = _mm512_maskz_loadu_ps(65532, datPtr1+1520+32480*i6+2320*k3+80*h1+4*w1+0*b5);
__m512 dat35 = _mm512_maskz_loadu_ps(65532, datPtr1+1600+32480*i6+2320*k3+80*h1+4*w1+0*b5);
__m512 dat36 = _mm512_maskz_loadu_ps(65532, datPtr1+1680+32480*i6+2320*k3+80*h1+4*w1+0*b5);
__m512 dat37 = _mm512_maskz_loadu_ps(65532, datPtr1+1760+32480*i6+2320*k3+80*h1+4*w1+0*b5);
__m512 dat38 = _mm512_maskz_loadu_ps(65532, datPtr1+1840+32480*i6+2320*k3+80*h1+4*w1+0*b5);
__m512 dat39 = _mm512_maskz_loadu_ps(65532, datPtr1+1920+32480*i6+2320*k3+80*h1+4*w1+0*b5);
__m512 dat40 = _mm512_maskz_loadu_ps(65532, datPtr1+2000+32480*i6+2320*k3+80*h1+4*w1+0*b5);
__m512 dat41 = _mm512_maskz_loadu_ps(65532, datPtr1+2080+32480*i6+2320*k3+80*h1+4*w1+0*b5);
__m512 dat42 = _mm512_maskz_loadu_ps(65532, datPtr1+2160+32480*i6+2320*k3+80*h1+4*w1+0*b5);
__m512 fft841 = _mm512_add_ps(dat27, dat35);
__m512 fft929 = _mm512_add_ps(dat28, dat36);
__m512 fft842 = _mm512_sub_ps(dat27, dat35);
__m512 fft930 = _mm512_sub_ps(dat28, dat36);
__m512 fft843 = _mm512_add_ps(dat29, dat37);
__m512 fft931 = _mm512_add_ps(dat30, dat38);
__m512 fft844 = _mm512_sub_ps(dat29, dat37);
__m512 fft932 = _mm512_sub_ps(dat30, dat38);
__m512 fft845 = _mm512_add_ps(dat31, dat39);
__m512 fft933 = _mm512_add_ps(dat32, dat40);
__m512 fft846 = _mm512_sub_ps(dat31, dat39);
__m512 fft934 = _mm512_sub_ps(dat32, dat40);
__m512 fft847 = _mm512_add_ps(dat33, dat41);
__m512 fft935 = _mm512_add_ps(dat34, dat42);
__m512 fft848 = _mm512_sub_ps(dat33, dat41);
__m512 fft936 = _mm512_sub_ps(dat34, dat42);
__m512 fft849 = _mm512_add_ps(fft841, fft845);
__m512 fft937 = _mm512_add_ps(fft929, fft933);
__m512 fft850 = _mm512_sub_ps(fft841, fft845);
__m512 fft938 = _mm512_sub_ps(fft929, fft933);
__m512 fft851 = _mm512_add_ps(fft843, fft847);
__m512 fft939 = _mm512_add_ps(fft931, fft935);
__m512 fft852 = _mm512_sub_ps(fft847, fft843);
__m512 fft940 = _mm512_sub_ps(fft935, fft931);
__m512 fft853 = _mm512_sub_ps(fft844, fft848);
__m512 fft941 = _mm512_sub_ps(fft932, fft936);
__m512 fft854 = _mm512_add_ps(fft844, fft848);
__m512 fft942 = _mm512_add_ps(fft932, fft936);
__m512 fft855 = _mm512_add_ps(fft849, fft851);
__m512 fft943 = _mm512_add_ps(fft937, fft939);
__m512 fft856 = _mm512_sub_ps(fft849, fft851);
__m512 fft944 = _mm512_sub_ps(fft937, fft939);
__m512 fft857 = _mm512_fmadd_ps(fft853, _mm512_set1_ps(7.0710677e-01f), fft842);
__m512 fft945 = _mm512_fmadd_ps(fft941, _mm512_set1_ps(7.0710677e-01f), fft930);
__m512 fft858 = _mm512_fnmsub_ps(fft854, _mm512_set1_ps(7.0710677e-01f), fft846);
__m512 fft946 = _mm512_fnmsub_ps(fft942, _mm512_set1_ps(7.0710677e-01f), fft934);
__m512 fft859 = _mm512_fnmadd_ps(fft853, _mm512_set1_ps(7.0710677e-01f), fft842);
__m512 fft947 = _mm512_fnmadd_ps(fft941, _mm512_set1_ps(7.0710677e-01f), fft930);
__m512 fft860 = _mm512_fnmadd_ps(fft854, _mm512_set1_ps(7.0710677e-01f), fft846);
__m512 fft948 = _mm512_fnmadd_ps(fft942, _mm512_set1_ps(7.0710677e-01f), fft934);
__m512 fft861 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft862 = _mm512_fmadd_ps(fft855, fft861, _mm512_shuffle_f32x4(fft855, fft855, 78));
__m512 fft949 = _mm512_fmadd_ps(fft943, fft861, _mm512_shuffle_f32x4(fft943, fft943, 78));
__m512 fft863 = _mm512_fmadd_ps(fft856, fft861, _mm512_shuffle_f32x4(fft856, fft856, 78));
__m512 fft950 = _mm512_fmadd_ps(fft944, fft861, _mm512_shuffle_f32x4(fft944, fft944, 78));
__m512 fft864 = _mm512_fmadd_ps(fft857, fft861, _mm512_shuffle_f32x4(fft857, fft857, 78));
__m512 fft951 = _mm512_fmadd_ps(fft945, fft861, _mm512_shuffle_f32x4(fft945, fft945, 78));
__m512 fft865 = _mm512_fmadd_ps(fft858, fft861, _mm512_shuffle_f32x4(fft858, fft858, 78));
__m512 fft952 = _mm512_fmadd_ps(fft946, fft861, _mm512_shuffle_f32x4(fft946, fft946, 78));
__m512 fft866 = _mm512_fmadd_ps(fft850, fft861, _mm512_shuffle_f32x4(fft850, fft850, 78));
__m512 fft953 = _mm512_fmadd_ps(fft938, fft861, _mm512_shuffle_f32x4(fft938, fft938, 78));
__m512 fft867 = _mm512_fmadd_ps(fft852, fft861, _mm512_shuffle_f32x4(fft852, fft852, 78));
__m512 fft954 = _mm512_fmadd_ps(fft940, fft861, _mm512_shuffle_f32x4(fft940, fft940, 78));
__m512 fft868 = _mm512_fmadd_ps(fft859, fft861, _mm512_shuffle_f32x4(fft859, fft859, 78));
__m512 fft955 = _mm512_fmadd_ps(fft947, fft861, _mm512_shuffle_f32x4(fft947, fft947, 78));
__m512 fft869 = _mm512_fmadd_ps(fft860, fft861, _mm512_shuffle_f32x4(fft860, fft860, 78));
__m512 fft956 = _mm512_fmadd_ps(fft948, fft861, _mm512_shuffle_f32x4(fft948, fft948, 78));
__m512 fft870 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft871 = _mm512_mul_ps(fft862, fft870);
__m512 fft957 = _mm512_mul_ps(fft949, fft870);
__m512 fft872 = _mm512_mul_ps(fft863, fft870);
__m512 fft958 = _mm512_mul_ps(fft950, fft870);
__m512 fft873 = _mm512_mul_ps(fft864, fft870);
__m512 fft959 = _mm512_mul_ps(fft951, fft870);
__m512 fft874 = _mm512_mul_ps(fft865, fft870);
__m512 fft960 = _mm512_mul_ps(fft952, fft870);
__m512 fft875 = _mm512_mul_ps(fft866, fft870);
__m512 fft961 = _mm512_mul_ps(fft953, fft870);
__m512 fft876 = _mm512_mul_ps(fft867, fft870);
__m512 fft962 = _mm512_mul_ps(fft954, fft870);
__m512 fft877 = _mm512_mul_ps(fft868, fft870);
__m512 fft963 = _mm512_mul_ps(fft955, fft870);
__m512 fft878 = _mm512_mul_ps(fft869, fft870);
__m512 fft964 = _mm512_mul_ps(fft956, fft870);
__m512 fft879 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft880 = _mm512_fmadd_ps(fft863, fft879, fft871);
__m512 fft965 = _mm512_fmadd_ps(fft950, fft879, fft957);
__m512 fft881 = _mm512_fnmadd_ps(fft862, fft879, fft872);
__m512 fft966 = _mm512_fnmadd_ps(fft949, fft879, fft958);
__m512 fft882 = _mm512_fmadd_ps(fft865, fft879, fft873);
__m512 fft967 = _mm512_fmadd_ps(fft952, fft879, fft959);
__m512 fft883 = _mm512_fnmadd_ps(fft864, fft879, fft874);
__m512 fft968 = _mm512_fnmadd_ps(fft951, fft879, fft960);
__m512 fft884 = _mm512_fmadd_ps(fft867, fft879, fft875);
__m512 fft969 = _mm512_fmadd_ps(fft954, fft879, fft961);
__m512 fft885 = _mm512_fnmadd_ps(fft866, fft879, fft876);
__m512 fft970 = _mm512_fnmadd_ps(fft953, fft879, fft962);
__m512 fft886 = _mm512_fmadd_ps(fft869, fft879, fft877);
__m512 fft971 = _mm512_fmadd_ps(fft956, fft879, fft963);
__m512 fft887 = _mm512_fnmadd_ps(fft868, fft879, fft878);
__m512 fft972 = _mm512_fnmadd_ps(fft955, fft879, fft964);
__m512 fft888 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft889 = _mm512_fmadd_ps(fft880, fft888, _mm512_shuffle_f32x4(fft880, fft880, 177));
__m512 fft973 = _mm512_fmadd_ps(fft965, fft888, _mm512_shuffle_f32x4(fft965, fft965, 177));
__m512 fft890 = _mm512_fmadd_ps(fft881, fft888, _mm512_shuffle_f32x4(fft881, fft881, 177));
__m512 fft974 = _mm512_fmadd_ps(fft966, fft888, _mm512_shuffle_f32x4(fft966, fft966, 177));
__m512 fft891 = _mm512_fmadd_ps(fft882, fft888, _mm512_shuffle_f32x4(fft882, fft882, 177));
__m512 fft975 = _mm512_fmadd_ps(fft967, fft888, _mm512_shuffle_f32x4(fft967, fft967, 177));
__m512 fft892 = _mm512_fmadd_ps(fft883, fft888, _mm512_shuffle_f32x4(fft883, fft883, 177));
__m512 fft976 = _mm512_fmadd_ps(fft968, fft888, _mm512_shuffle_f32x4(fft968, fft968, 177));
__m512 fft893 = _mm512_fmadd_ps(fft884, fft888, _mm512_shuffle_f32x4(fft884, fft884, 177));
__m512 fft977 = _mm512_fmadd_ps(fft969, fft888, _mm512_shuffle_f32x4(fft969, fft969, 177));
__m512 fft894 = _mm512_fmadd_ps(fft885, fft888, _mm512_shuffle_f32x4(fft885, fft885, 177));
__m512 fft978 = _mm512_fmadd_ps(fft970, fft888, _mm512_shuffle_f32x4(fft970, fft970, 177));
__m512 fft895 = _mm512_fmadd_ps(fft886, fft888, _mm512_shuffle_f32x4(fft886, fft886, 177));
__m512 fft979 = _mm512_fmadd_ps(fft971, fft888, _mm512_shuffle_f32x4(fft971, fft971, 177));
__m512 fft896 = _mm512_fmadd_ps(fft887, fft888, _mm512_shuffle_f32x4(fft887, fft887, 177));
__m512 fft980 = _mm512_fmadd_ps(fft972, fft888, _mm512_shuffle_f32x4(fft972, fft972, 177));
__m512 fft897 = _mm512_mask_mov_ps(fft889, 49344, fft890);
__m512 fft981 = _mm512_mask_mov_ps(fft973, 49344, fft974);
__m512 fft898 = _mm512_mask_sub_ps(fft890, 49344, _mm512_setzero_ps(), fft889);
__m512 fft982 = _mm512_mask_sub_ps(fft974, 49344, _mm512_setzero_ps(), fft973);
__m512 fft899 = _mm512_mask_mov_ps(fft891, 49344, fft892);
__m512 fft983 = _mm512_mask_mov_ps(fft975, 49344, fft976);
__m512 fft900 = _mm512_mask_sub_ps(fft892, 49344, _mm512_setzero_ps(), fft891);
__m512 fft984 = _mm512_mask_sub_ps(fft976, 49344, _mm512_setzero_ps(), fft975);
__m512 fft901 = _mm512_mask_mov_ps(fft893, 49344, fft894);
__m512 fft985 = _mm512_mask_mov_ps(fft977, 49344, fft978);
__m512 fft902 = _mm512_mask_sub_ps(fft894, 49344, _mm512_setzero_ps(), fft893);
__m512 fft986 = _mm512_mask_sub_ps(fft978, 49344, _mm512_setzero_ps(), fft977);
__m512 fft903 = _mm512_mask_mov_ps(fft895, 49344, fft896);
__m512 fft987 = _mm512_mask_mov_ps(fft979, 49344, fft980);
__m512 fft904 = _mm512_mask_sub_ps(fft896, 49344, _mm512_setzero_ps(), fft895);
__m512 fft988 = _mm512_mask_sub_ps(fft980, 49344, _mm512_setzero_ps(), fft979);
__m512 fft905 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft906 = _mm512_fmadd_ps(fft897, fft905, _mm512_shuffle_ps(fft897, fft897, 78));
__m512 fft989 = _mm512_fmadd_ps(fft981, fft905, _mm512_shuffle_ps(fft981, fft981, 78));
__m512 fft907 = _mm512_fmadd_ps(fft898, fft905, _mm512_shuffle_ps(fft898, fft898, 78));
__m512 fft990 = _mm512_fmadd_ps(fft982, fft905, _mm512_shuffle_ps(fft982, fft982, 78));
__m512 fft908 = _mm512_fmadd_ps(fft899, fft905, _mm512_shuffle_ps(fft899, fft899, 78));
__m512 fft991 = _mm512_fmadd_ps(fft983, fft905, _mm512_shuffle_ps(fft983, fft983, 78));
__m512 fft909 = _mm512_fmadd_ps(fft900, fft905, _mm512_shuffle_ps(fft900, fft900, 78));
__m512 fft992 = _mm512_fmadd_ps(fft984, fft905, _mm512_shuffle_ps(fft984, fft984, 78));
__m512 fft910 = _mm512_fmadd_ps(fft901, fft905, _mm512_shuffle_ps(fft901, fft901, 78));
__m512 fft993 = _mm512_fmadd_ps(fft985, fft905, _mm512_shuffle_ps(fft985, fft985, 78));
__m512 fft911 = _mm512_fmadd_ps(fft902, fft905, _mm512_shuffle_ps(fft902, fft902, 78));
__m512 fft994 = _mm512_fmadd_ps(fft986, fft905, _mm512_shuffle_ps(fft986, fft986, 78));
__m512 fft912 = _mm512_fmadd_ps(fft903, fft905, _mm512_shuffle_ps(fft903, fft903, 78));
__m512 fft995 = _mm512_fmadd_ps(fft987, fft905, _mm512_shuffle_ps(fft987, fft987, 78));
__m512 fft913 = _mm512_fmadd_ps(fft904, fft905, _mm512_shuffle_ps(fft904, fft904, 78));
__m512 fft996 = _mm512_fmadd_ps(fft988, fft905, _mm512_shuffle_ps(fft988, fft988, 78));
__m512i fft914 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft915 = _mm512_permutexvar_ps(fft914, fft906);
__m512 fft997 = _mm512_permutexvar_ps(fft914, fft989);
__m512i fft916 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft917 = _mm512_permutexvar_ps(fft916, fft906);
__m512 fft998 = _mm512_permutexvar_ps(fft916, fft989);
__m512 fft918 = _mm512_permutexvar_ps(fft914, fft907);
__m512 fft999 = _mm512_permutexvar_ps(fft914, fft990);
__m512 fft919 = _mm512_permutexvar_ps(fft916, fft907);
__m512 fft1000 = _mm512_permutexvar_ps(fft916, fft990);
__m512 fft920 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft921 = _mm512_fmadd_ps(fft915, fft920, fft917);
__m512 fft1001 = _mm512_fmadd_ps(fft997, fft920, fft998);
__m512 fft922 = _mm512_fnmadd_ps(fft919, fft920, fft918);
__m512 fft1002 = _mm512_fnmadd_ps(fft1000, fft920, fft999);
__m512 fft923 = _mm512_mask_mov_ps(fft919, 21845, fft921);
__m512 fft1003 = _mm512_mask_mov_ps(fft1000, 21845, fft1001);
__m512 fft924 = _mm512_mask_mov_ps(fft915, 43176, fft921);
__m512 fft1004 = _mm512_mask_mov_ps(fft997, 43176, fft1001);
__m512 fft925 = _mm512_mask_mov_ps(fft923, 43176, fft922);
__m512 fft1005 = _mm512_mask_mov_ps(fft1003, 43176, fft1002);
__m512 fft926 = _mm512_mask_mov_ps(fft924, 22102, fft922);
__m512 fft1006 = _mm512_mask_mov_ps(fft1004, 22102, fft1002);
__m512 fft927 = _mm512_mask_mul_ps(fft925, 64764, fft925, _mm512_set1_ps(5e-01f));
__m512 fft1007 = _mm512_mask_mul_ps(fft1005, 64764, fft1005, _mm512_set1_ps(5e-01f));
__m512 fft928 = _mm512_mask_mul_ps(fft926, 64764, fft926, _mm512_set1_ps(5e-01f));
__m512 fft1008 = _mm512_mask_mul_ps(fft1006, 64764, fft1006, _mm512_set1_ps(5e-01f));
__m512 df33 = fft927;
__m512 df41 = fft1007;
__m512 df34 = fft928;
__m512 df42 = fft1008;
__m512 df35 = fft908;
__m512 df43 = fft991;
__m512 df36 = fft909;
__m512 df44 = fft992;
__m512 df37 = fft910;
__m512 df45 = fft993;
__m512 df38 = fft911;
__m512 df46 = fft994;
__m512 df39 = fft912;
__m512 df47 = fft995;
__m512 df40 = fft913;
__m512 df48 = fft996;
__m512i eo6 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df35 = _mm512_permutexvar_ps(eo6, df35);
df36 = _mm512_permutexvar_ps(eo6, df36);
_mm512_mask_storeu_ps(dfPtr1+5376+21504*i6+5376*j2+384*k3+128*m6+32*f7, 255, df35);
_mm512_mask_storeu_ps(dfPtr1+5440+21504*i6+5376*j2+384*k3+128*m6+32*f7, 255, df36);
_mm512_mask_storeu_ps(dfPtr1+91360+21504*i6+5376*j2+384*k3+128*m6+32*f7, 65280, df35);
_mm512_mask_storeu_ps(dfPtr1+91424+21504*i6+5376*j2+384*k3+128*m6+32*f7, 65280, df36);
df43 = _mm512_permutexvar_ps(eo6, df43);
df44 = _mm512_permutexvar_ps(eo6, df44);
_mm512_mask_storeu_ps(dfPtr1+177408+21504*i6+5376*j2+384*k3+128*m6+32*f7, 255, df43);
_mm512_mask_storeu_ps(dfPtr1+177472+21504*i6+5376*j2+384*k3+128*m6+32*f7, 255, df44);
_mm512_mask_storeu_ps(dfPtr1+263392+21504*i6+5376*j2+384*k3+128*m6+32*f7, 65280, df43);
_mm512_mask_storeu_ps(dfPtr1+263456+21504*i6+5376*j2+384*k3+128*m6+32*f7, 65280, df44);
df37 = _mm512_permutexvar_ps(eo6, df37);
df38 = _mm512_permutexvar_ps(eo6, df38);
_mm512_mask_storeu_ps(dfPtr1+10752+21504*i6+5376*j2+384*k3+128*m6+32*f7, 255, df37);
_mm512_mask_storeu_ps(dfPtr1+10816+21504*i6+5376*j2+384*k3+128*m6+32*f7, 255, df38);
_mm512_mask_storeu_ps(dfPtr1+96736+21504*i6+5376*j2+384*k3+128*m6+32*f7, 65280, df37);
_mm512_mask_storeu_ps(dfPtr1+96800+21504*i6+5376*j2+384*k3+128*m6+32*f7, 65280, df38);
df45 = _mm512_permutexvar_ps(eo6, df45);
df46 = _mm512_permutexvar_ps(eo6, df46);
_mm512_mask_storeu_ps(dfPtr1+182784+21504*i6+5376*j2+384*k3+128*m6+32*f7, 255, df45);
_mm512_mask_storeu_ps(dfPtr1+182848+21504*i6+5376*j2+384*k3+128*m6+32*f7, 255, df46);
_mm512_mask_storeu_ps(dfPtr1+268768+21504*i6+5376*j2+384*k3+128*m6+32*f7, 65280, df45);
_mm512_mask_storeu_ps(dfPtr1+268832+21504*i6+5376*j2+384*k3+128*m6+32*f7, 65280, df46);
df39 = _mm512_permutexvar_ps(eo6, df39);
df40 = _mm512_permutexvar_ps(eo6, df40);
_mm512_mask_storeu_ps(dfPtr1+16128+21504*i6+5376*j2+384*k3+128*m6+32*f7, 255, df39);
_mm512_mask_storeu_ps(dfPtr1+16192+21504*i6+5376*j2+384*k3+128*m6+32*f7, 255, df40);
_mm512_mask_storeu_ps(dfPtr1+102112+21504*i6+5376*j2+384*k3+128*m6+32*f7, 65280, df39);
_mm512_mask_storeu_ps(dfPtr1+102176+21504*i6+5376*j2+384*k3+128*m6+32*f7, 65280, df40);
df47 = _mm512_permutexvar_ps(eo6, df47);
df48 = _mm512_permutexvar_ps(eo6, df48);
_mm512_mask_storeu_ps(dfPtr1+188160+21504*i6+5376*j2+384*k3+128*m6+32*f7, 255, df47);
_mm512_mask_storeu_ps(dfPtr1+188224+21504*i6+5376*j2+384*k3+128*m6+32*f7, 255, df48);
_mm512_mask_storeu_ps(dfPtr1+274144+21504*i6+5376*j2+384*k3+128*m6+32*f7, 65280, df47);
_mm512_mask_storeu_ps(dfPtr1+274208+21504*i6+5376*j2+384*k3+128*m6+32*f7, 65280, df48);
_mm512_mask_storeu_ps(dfPtr1+0+21504*i6+5376*j2+384*k3+128*m6+32*f7, 255, df33);
_mm512_mask_storeu_ps(dfPtr1+64+21504*i6+5376*j2+384*k3+128*m6+32*f7, 255, df34);
_mm512_mask_storeu_ps(dfPtr1+85984+21504*i6+5376*j2+384*k3+128*m6+32*f7, 65280, df33);
_mm512_mask_storeu_ps(dfPtr1+86048+21504*i6+5376*j2+384*k3+128*m6+32*f7, 65280, df34);
_mm512_mask_storeu_ps(dfPtr1+172032+21504*i6+5376*j2+384*k3+128*m6+32*f7, 255, df41);
_mm512_mask_storeu_ps(dfPtr1+172096+21504*i6+5376*j2+384*k3+128*m6+32*f7, 255, df42);
_mm512_mask_storeu_ps(dfPtr1+258016+21504*i6+5376*j2+384*k3+128*m6+32*f7, 65280, df41);
_mm512_mask_storeu_ps(dfPtr1+258080+21504*i6+5376*j2+384*k3+128*m6+32*f7, 65280, df42);
ptrdiff_t b6 = 3;
ptrdiff_t m7 = (size_t)b6/2;
ptrdiff_t f8 = (size_t)b6%2;
__m512 dat43 = _mm512_maskz_loadu_ps(1023, datPtr1+1008+32480*i6+2320*k3+80*h1+4*w1+0*b6);
__m512 dat44 = _mm512_maskz_loadu_ps(1023, datPtr1+1088+32480*i6+2320*k3+80*h1+4*w1+0*b6);
__m512 dat45 = _mm512_maskz_loadu_ps(1023, datPtr1+1168+32480*i6+2320*k3+80*h1+4*w1+0*b6);
__m512 dat46 = _mm512_maskz_loadu_ps(1023, datPtr1+1248+32480*i6+2320*k3+80*h1+4*w1+0*b6);
__m512 dat47 = _mm512_maskz_loadu_ps(1023, datPtr1+1328+32480*i6+2320*k3+80*h1+4*w1+0*b6);
__m512 dat48 = _mm512_maskz_loadu_ps(1023, datPtr1+1408+32480*i6+2320*k3+80*h1+4*w1+0*b6);
__m512 dat49 = _mm512_maskz_loadu_ps(1023, datPtr1+1488+32480*i6+2320*k3+80*h1+4*w1+0*b6);
__m512 dat50 = _mm512_maskz_loadu_ps(1023, datPtr1+1568+32480*i6+2320*k3+80*h1+4*w1+0*b6);
__m512 dat51 = _mm512_maskz_loadu_ps(1023, datPtr1+1648+32480*i6+2320*k3+80*h1+4*w1+0*b6);
__m512 dat52 = _mm512_maskz_loadu_ps(1023, datPtr1+1728+32480*i6+2320*k3+80*h1+4*w1+0*b6);
__m512 dat53 = _mm512_maskz_loadu_ps(1023, datPtr1+1808+32480*i6+2320*k3+80*h1+4*w1+0*b6);
__m512 dat54 = _mm512_maskz_loadu_ps(1023, datPtr1+1888+32480*i6+2320*k3+80*h1+4*w1+0*b6);
__m512 dat55 = _mm512_maskz_loadu_ps(1023, datPtr1+1968+32480*i6+2320*k3+80*h1+4*w1+0*b6);
__m512 dat56 = _mm512_maskz_loadu_ps(1023, datPtr1+2048+32480*i6+2320*k3+80*h1+4*w1+0*b6);
__m512 dat57 = _mm512_maskz_loadu_ps(1023, datPtr1+2128+32480*i6+2320*k3+80*h1+4*w1+0*b6);
__m512 dat58 = _mm512_maskz_loadu_ps(1023, datPtr1+2208+32480*i6+2320*k3+80*h1+4*w1+0*b6);
__m512 fft1009 = _mm512_add_ps(dat43, dat51);
__m512 fft1097 = _mm512_add_ps(dat44, dat52);
__m512 fft1010 = _mm512_sub_ps(dat43, dat51);
__m512 fft1098 = _mm512_sub_ps(dat44, dat52);
__m512 fft1011 = _mm512_add_ps(dat45, dat53);
__m512 fft1099 = _mm512_add_ps(dat46, dat54);
__m512 fft1012 = _mm512_sub_ps(dat45, dat53);
__m512 fft1100 = _mm512_sub_ps(dat46, dat54);
__m512 fft1013 = _mm512_add_ps(dat47, dat55);
__m512 fft1101 = _mm512_add_ps(dat48, dat56);
__m512 fft1014 = _mm512_sub_ps(dat47, dat55);
__m512 fft1102 = _mm512_sub_ps(dat48, dat56);
__m512 fft1015 = _mm512_add_ps(dat49, dat57);
__m512 fft1103 = _mm512_add_ps(dat50, dat58);
__m512 fft1016 = _mm512_sub_ps(dat49, dat57);
__m512 fft1104 = _mm512_sub_ps(dat50, dat58);
__m512 fft1017 = _mm512_add_ps(fft1009, fft1013);
__m512 fft1105 = _mm512_add_ps(fft1097, fft1101);
__m512 fft1018 = _mm512_sub_ps(fft1009, fft1013);
__m512 fft1106 = _mm512_sub_ps(fft1097, fft1101);
__m512 fft1019 = _mm512_add_ps(fft1011, fft1015);
__m512 fft1107 = _mm512_add_ps(fft1099, fft1103);
__m512 fft1020 = _mm512_sub_ps(fft1015, fft1011);
__m512 fft1108 = _mm512_sub_ps(fft1103, fft1099);
__m512 fft1021 = _mm512_sub_ps(fft1012, fft1016);
__m512 fft1109 = _mm512_sub_ps(fft1100, fft1104);
__m512 fft1022 = _mm512_add_ps(fft1012, fft1016);
__m512 fft1110 = _mm512_add_ps(fft1100, fft1104);
__m512 fft1023 = _mm512_add_ps(fft1017, fft1019);
__m512 fft1111 = _mm512_add_ps(fft1105, fft1107);
__m512 fft1024 = _mm512_sub_ps(fft1017, fft1019);
__m512 fft1112 = _mm512_sub_ps(fft1105, fft1107);
__m512 fft1025 = _mm512_fmadd_ps(fft1021, _mm512_set1_ps(7.0710677e-01f), fft1010);
__m512 fft1113 = _mm512_fmadd_ps(fft1109, _mm512_set1_ps(7.0710677e-01f), fft1098);
__m512 fft1026 = _mm512_fnmsub_ps(fft1022, _mm512_set1_ps(7.0710677e-01f), fft1014);
__m512 fft1114 = _mm512_fnmsub_ps(fft1110, _mm512_set1_ps(7.0710677e-01f), fft1102);
__m512 fft1027 = _mm512_fnmadd_ps(fft1021, _mm512_set1_ps(7.0710677e-01f), fft1010);
__m512 fft1115 = _mm512_fnmadd_ps(fft1109, _mm512_set1_ps(7.0710677e-01f), fft1098);
__m512 fft1028 = _mm512_fnmadd_ps(fft1022, _mm512_set1_ps(7.0710677e-01f), fft1014);
__m512 fft1116 = _mm512_fnmadd_ps(fft1110, _mm512_set1_ps(7.0710677e-01f), fft1102);
__m512 fft1029 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1030 = _mm512_fmadd_ps(fft1023, fft1029, _mm512_shuffle_f32x4(fft1023, fft1023, 78));
__m512 fft1117 = _mm512_fmadd_ps(fft1111, fft1029, _mm512_shuffle_f32x4(fft1111, fft1111, 78));
__m512 fft1031 = _mm512_fmadd_ps(fft1024, fft1029, _mm512_shuffle_f32x4(fft1024, fft1024, 78));
__m512 fft1118 = _mm512_fmadd_ps(fft1112, fft1029, _mm512_shuffle_f32x4(fft1112, fft1112, 78));
__m512 fft1032 = _mm512_fmadd_ps(fft1025, fft1029, _mm512_shuffle_f32x4(fft1025, fft1025, 78));
__m512 fft1119 = _mm512_fmadd_ps(fft1113, fft1029, _mm512_shuffle_f32x4(fft1113, fft1113, 78));
__m512 fft1033 = _mm512_fmadd_ps(fft1026, fft1029, _mm512_shuffle_f32x4(fft1026, fft1026, 78));
__m512 fft1120 = _mm512_fmadd_ps(fft1114, fft1029, _mm512_shuffle_f32x4(fft1114, fft1114, 78));
__m512 fft1034 = _mm512_fmadd_ps(fft1018, fft1029, _mm512_shuffle_f32x4(fft1018, fft1018, 78));
__m512 fft1121 = _mm512_fmadd_ps(fft1106, fft1029, _mm512_shuffle_f32x4(fft1106, fft1106, 78));
__m512 fft1035 = _mm512_fmadd_ps(fft1020, fft1029, _mm512_shuffle_f32x4(fft1020, fft1020, 78));
__m512 fft1122 = _mm512_fmadd_ps(fft1108, fft1029, _mm512_shuffle_f32x4(fft1108, fft1108, 78));
__m512 fft1036 = _mm512_fmadd_ps(fft1027, fft1029, _mm512_shuffle_f32x4(fft1027, fft1027, 78));
__m512 fft1123 = _mm512_fmadd_ps(fft1115, fft1029, _mm512_shuffle_f32x4(fft1115, fft1115, 78));
__m512 fft1037 = _mm512_fmadd_ps(fft1028, fft1029, _mm512_shuffle_f32x4(fft1028, fft1028, 78));
__m512 fft1124 = _mm512_fmadd_ps(fft1116, fft1029, _mm512_shuffle_f32x4(fft1116, fft1116, 78));
__m512 fft1038 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1039 = _mm512_mul_ps(fft1030, fft1038);
__m512 fft1125 = _mm512_mul_ps(fft1117, fft1038);
__m512 fft1040 = _mm512_mul_ps(fft1031, fft1038);
__m512 fft1126 = _mm512_mul_ps(fft1118, fft1038);
__m512 fft1041 = _mm512_mul_ps(fft1032, fft1038);
__m512 fft1127 = _mm512_mul_ps(fft1119, fft1038);
__m512 fft1042 = _mm512_mul_ps(fft1033, fft1038);
__m512 fft1128 = _mm512_mul_ps(fft1120, fft1038);
__m512 fft1043 = _mm512_mul_ps(fft1034, fft1038);
__m512 fft1129 = _mm512_mul_ps(fft1121, fft1038);
__m512 fft1044 = _mm512_mul_ps(fft1035, fft1038);
__m512 fft1130 = _mm512_mul_ps(fft1122, fft1038);
__m512 fft1045 = _mm512_mul_ps(fft1036, fft1038);
__m512 fft1131 = _mm512_mul_ps(fft1123, fft1038);
__m512 fft1046 = _mm512_mul_ps(fft1037, fft1038);
__m512 fft1132 = _mm512_mul_ps(fft1124, fft1038);
__m512 fft1047 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1048 = _mm512_fmadd_ps(fft1031, fft1047, fft1039);
__m512 fft1133 = _mm512_fmadd_ps(fft1118, fft1047, fft1125);
__m512 fft1049 = _mm512_fnmadd_ps(fft1030, fft1047, fft1040);
__m512 fft1134 = _mm512_fnmadd_ps(fft1117, fft1047, fft1126);
__m512 fft1050 = _mm512_fmadd_ps(fft1033, fft1047, fft1041);
__m512 fft1135 = _mm512_fmadd_ps(fft1120, fft1047, fft1127);
__m512 fft1051 = _mm512_fnmadd_ps(fft1032, fft1047, fft1042);
__m512 fft1136 = _mm512_fnmadd_ps(fft1119, fft1047, fft1128);
__m512 fft1052 = _mm512_fmadd_ps(fft1035, fft1047, fft1043);
__m512 fft1137 = _mm512_fmadd_ps(fft1122, fft1047, fft1129);
__m512 fft1053 = _mm512_fnmadd_ps(fft1034, fft1047, fft1044);
__m512 fft1138 = _mm512_fnmadd_ps(fft1121, fft1047, fft1130);
__m512 fft1054 = _mm512_fmadd_ps(fft1037, fft1047, fft1045);
__m512 fft1139 = _mm512_fmadd_ps(fft1124, fft1047, fft1131);
__m512 fft1055 = _mm512_fnmadd_ps(fft1036, fft1047, fft1046);
__m512 fft1140 = _mm512_fnmadd_ps(fft1123, fft1047, fft1132);
__m512 fft1056 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1057 = _mm512_fmadd_ps(fft1048, fft1056, _mm512_shuffle_f32x4(fft1048, fft1048, 177));
__m512 fft1141 = _mm512_fmadd_ps(fft1133, fft1056, _mm512_shuffle_f32x4(fft1133, fft1133, 177));
__m512 fft1058 = _mm512_fmadd_ps(fft1049, fft1056, _mm512_shuffle_f32x4(fft1049, fft1049, 177));
__m512 fft1142 = _mm512_fmadd_ps(fft1134, fft1056, _mm512_shuffle_f32x4(fft1134, fft1134, 177));
__m512 fft1059 = _mm512_fmadd_ps(fft1050, fft1056, _mm512_shuffle_f32x4(fft1050, fft1050, 177));
__m512 fft1143 = _mm512_fmadd_ps(fft1135, fft1056, _mm512_shuffle_f32x4(fft1135, fft1135, 177));
__m512 fft1060 = _mm512_fmadd_ps(fft1051, fft1056, _mm512_shuffle_f32x4(fft1051, fft1051, 177));
__m512 fft1144 = _mm512_fmadd_ps(fft1136, fft1056, _mm512_shuffle_f32x4(fft1136, fft1136, 177));
__m512 fft1061 = _mm512_fmadd_ps(fft1052, fft1056, _mm512_shuffle_f32x4(fft1052, fft1052, 177));
__m512 fft1145 = _mm512_fmadd_ps(fft1137, fft1056, _mm512_shuffle_f32x4(fft1137, fft1137, 177));
__m512 fft1062 = _mm512_fmadd_ps(fft1053, fft1056, _mm512_shuffle_f32x4(fft1053, fft1053, 177));
__m512 fft1146 = _mm512_fmadd_ps(fft1138, fft1056, _mm512_shuffle_f32x4(fft1138, fft1138, 177));
__m512 fft1063 = _mm512_fmadd_ps(fft1054, fft1056, _mm512_shuffle_f32x4(fft1054, fft1054, 177));
__m512 fft1147 = _mm512_fmadd_ps(fft1139, fft1056, _mm512_shuffle_f32x4(fft1139, fft1139, 177));
__m512 fft1064 = _mm512_fmadd_ps(fft1055, fft1056, _mm512_shuffle_f32x4(fft1055, fft1055, 177));
__m512 fft1148 = _mm512_fmadd_ps(fft1140, fft1056, _mm512_shuffle_f32x4(fft1140, fft1140, 177));
__m512 fft1065 = _mm512_mask_mov_ps(fft1057, 49344, fft1058);
__m512 fft1149 = _mm512_mask_mov_ps(fft1141, 49344, fft1142);
__m512 fft1066 = _mm512_mask_sub_ps(fft1058, 49344, _mm512_setzero_ps(), fft1057);
__m512 fft1150 = _mm512_mask_sub_ps(fft1142, 49344, _mm512_setzero_ps(), fft1141);
__m512 fft1067 = _mm512_mask_mov_ps(fft1059, 49344, fft1060);
__m512 fft1151 = _mm512_mask_mov_ps(fft1143, 49344, fft1144);
__m512 fft1068 = _mm512_mask_sub_ps(fft1060, 49344, _mm512_setzero_ps(), fft1059);
__m512 fft1152 = _mm512_mask_sub_ps(fft1144, 49344, _mm512_setzero_ps(), fft1143);
__m512 fft1069 = _mm512_mask_mov_ps(fft1061, 49344, fft1062);
__m512 fft1153 = _mm512_mask_mov_ps(fft1145, 49344, fft1146);
__m512 fft1070 = _mm512_mask_sub_ps(fft1062, 49344, _mm512_setzero_ps(), fft1061);
__m512 fft1154 = _mm512_mask_sub_ps(fft1146, 49344, _mm512_setzero_ps(), fft1145);
__m512 fft1071 = _mm512_mask_mov_ps(fft1063, 49344, fft1064);
__m512 fft1155 = _mm512_mask_mov_ps(fft1147, 49344, fft1148);
__m512 fft1072 = _mm512_mask_sub_ps(fft1064, 49344, _mm512_setzero_ps(), fft1063);
__m512 fft1156 = _mm512_mask_sub_ps(fft1148, 49344, _mm512_setzero_ps(), fft1147);
__m512 fft1073 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1074 = _mm512_fmadd_ps(fft1065, fft1073, _mm512_shuffle_ps(fft1065, fft1065, 78));
__m512 fft1157 = _mm512_fmadd_ps(fft1149, fft1073, _mm512_shuffle_ps(fft1149, fft1149, 78));
__m512 fft1075 = _mm512_fmadd_ps(fft1066, fft1073, _mm512_shuffle_ps(fft1066, fft1066, 78));
__m512 fft1158 = _mm512_fmadd_ps(fft1150, fft1073, _mm512_shuffle_ps(fft1150, fft1150, 78));
__m512 fft1076 = _mm512_fmadd_ps(fft1067, fft1073, _mm512_shuffle_ps(fft1067, fft1067, 78));
__m512 fft1159 = _mm512_fmadd_ps(fft1151, fft1073, _mm512_shuffle_ps(fft1151, fft1151, 78));
__m512 fft1077 = _mm512_fmadd_ps(fft1068, fft1073, _mm512_shuffle_ps(fft1068, fft1068, 78));
__m512 fft1160 = _mm512_fmadd_ps(fft1152, fft1073, _mm512_shuffle_ps(fft1152, fft1152, 78));
__m512 fft1078 = _mm512_fmadd_ps(fft1069, fft1073, _mm512_shuffle_ps(fft1069, fft1069, 78));
__m512 fft1161 = _mm512_fmadd_ps(fft1153, fft1073, _mm512_shuffle_ps(fft1153, fft1153, 78));
__m512 fft1079 = _mm512_fmadd_ps(fft1070, fft1073, _mm512_shuffle_ps(fft1070, fft1070, 78));
__m512 fft1162 = _mm512_fmadd_ps(fft1154, fft1073, _mm512_shuffle_ps(fft1154, fft1154, 78));
__m512 fft1080 = _mm512_fmadd_ps(fft1071, fft1073, _mm512_shuffle_ps(fft1071, fft1071, 78));
__m512 fft1163 = _mm512_fmadd_ps(fft1155, fft1073, _mm512_shuffle_ps(fft1155, fft1155, 78));
__m512 fft1081 = _mm512_fmadd_ps(fft1072, fft1073, _mm512_shuffle_ps(fft1072, fft1072, 78));
__m512 fft1164 = _mm512_fmadd_ps(fft1156, fft1073, _mm512_shuffle_ps(fft1156, fft1156, 78));
__m512i fft1082 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1083 = _mm512_permutexvar_ps(fft1082, fft1074);
__m512 fft1165 = _mm512_permutexvar_ps(fft1082, fft1157);
__m512i fft1084 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1085 = _mm512_permutexvar_ps(fft1084, fft1074);
__m512 fft1166 = _mm512_permutexvar_ps(fft1084, fft1157);
__m512 fft1086 = _mm512_permutexvar_ps(fft1082, fft1075);
__m512 fft1167 = _mm512_permutexvar_ps(fft1082, fft1158);
__m512 fft1087 = _mm512_permutexvar_ps(fft1084, fft1075);
__m512 fft1168 = _mm512_permutexvar_ps(fft1084, fft1158);
__m512 fft1088 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1089 = _mm512_fmadd_ps(fft1083, fft1088, fft1085);
__m512 fft1169 = _mm512_fmadd_ps(fft1165, fft1088, fft1166);
__m512 fft1090 = _mm512_fnmadd_ps(fft1087, fft1088, fft1086);
__m512 fft1170 = _mm512_fnmadd_ps(fft1168, fft1088, fft1167);
__m512 fft1091 = _mm512_mask_mov_ps(fft1087, 21845, fft1089);
__m512 fft1171 = _mm512_mask_mov_ps(fft1168, 21845, fft1169);
__m512 fft1092 = _mm512_mask_mov_ps(fft1083, 43176, fft1089);
__m512 fft1172 = _mm512_mask_mov_ps(fft1165, 43176, fft1169);
__m512 fft1093 = _mm512_mask_mov_ps(fft1091, 43176, fft1090);
__m512 fft1173 = _mm512_mask_mov_ps(fft1171, 43176, fft1170);
__m512 fft1094 = _mm512_mask_mov_ps(fft1092, 22102, fft1090);
__m512 fft1174 = _mm512_mask_mov_ps(fft1172, 22102, fft1170);
__m512 fft1095 = _mm512_mask_mul_ps(fft1093, 64764, fft1093, _mm512_set1_ps(5e-01f));
__m512 fft1175 = _mm512_mask_mul_ps(fft1173, 64764, fft1173, _mm512_set1_ps(5e-01f));
__m512 fft1096 = _mm512_mask_mul_ps(fft1094, 64764, fft1094, _mm512_set1_ps(5e-01f));
__m512 fft1176 = _mm512_mask_mul_ps(fft1174, 64764, fft1174, _mm512_set1_ps(5e-01f));
__m512 df49 = fft1095;
__m512 df57 = fft1175;
__m512 df50 = fft1096;
__m512 df58 = fft1176;
__m512 df51 = fft1076;
__m512 df59 = fft1159;
__m512 df52 = fft1077;
__m512 df60 = fft1160;
__m512 df53 = fft1078;
__m512 df61 = fft1161;
__m512 df54 = fft1079;
__m512 df62 = fft1162;
__m512 df55 = fft1080;
__m512 df63 = fft1163;
__m512 df56 = fft1081;
__m512 df64 = fft1164;
__m512i eo7 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df51 = _mm512_permutexvar_ps(eo7, df51);
df52 = _mm512_permutexvar_ps(eo7, df52);
_mm512_mask_storeu_ps(dfPtr1+5376+21504*i6+5376*j2+384*k3+128*m7+32*f8, 255, df51);
_mm512_mask_storeu_ps(dfPtr1+5440+21504*i6+5376*j2+384*k3+128*m7+32*f8, 255, df52);
_mm512_mask_storeu_ps(dfPtr1+91360+21504*i6+5376*j2+384*k3+128*m7+32*f8, 65280, df51);
_mm512_mask_storeu_ps(dfPtr1+91424+21504*i6+5376*j2+384*k3+128*m7+32*f8, 65280, df52);
df59 = _mm512_permutexvar_ps(eo7, df59);
df60 = _mm512_permutexvar_ps(eo7, df60);
_mm512_mask_storeu_ps(dfPtr1+177408+21504*i6+5376*j2+384*k3+128*m7+32*f8, 255, df59);
_mm512_mask_storeu_ps(dfPtr1+177472+21504*i6+5376*j2+384*k3+128*m7+32*f8, 255, df60);
_mm512_mask_storeu_ps(dfPtr1+263392+21504*i6+5376*j2+384*k3+128*m7+32*f8, 65280, df59);
_mm512_mask_storeu_ps(dfPtr1+263456+21504*i6+5376*j2+384*k3+128*m7+32*f8, 65280, df60);
df53 = _mm512_permutexvar_ps(eo7, df53);
df54 = _mm512_permutexvar_ps(eo7, df54);
_mm512_mask_storeu_ps(dfPtr1+10752+21504*i6+5376*j2+384*k3+128*m7+32*f8, 255, df53);
_mm512_mask_storeu_ps(dfPtr1+10816+21504*i6+5376*j2+384*k3+128*m7+32*f8, 255, df54);
_mm512_mask_storeu_ps(dfPtr1+96736+21504*i6+5376*j2+384*k3+128*m7+32*f8, 65280, df53);
_mm512_mask_storeu_ps(dfPtr1+96800+21504*i6+5376*j2+384*k3+128*m7+32*f8, 65280, df54);
df61 = _mm512_permutexvar_ps(eo7, df61);
df62 = _mm512_permutexvar_ps(eo7, df62);
_mm512_mask_storeu_ps(dfPtr1+182784+21504*i6+5376*j2+384*k3+128*m7+32*f8, 255, df61);
_mm512_mask_storeu_ps(dfPtr1+182848+21504*i6+5376*j2+384*k3+128*m7+32*f8, 255, df62);
_mm512_mask_storeu_ps(dfPtr1+268768+21504*i6+5376*j2+384*k3+128*m7+32*f8, 65280, df61);
_mm512_mask_storeu_ps(dfPtr1+268832+21504*i6+5376*j2+384*k3+128*m7+32*f8, 65280, df62);
df55 = _mm512_permutexvar_ps(eo7, df55);
df56 = _mm512_permutexvar_ps(eo7, df56);
_mm512_mask_storeu_ps(dfPtr1+16128+21504*i6+5376*j2+384*k3+128*m7+32*f8, 255, df55);
_mm512_mask_storeu_ps(dfPtr1+16192+21504*i6+5376*j2+384*k3+128*m7+32*f8, 255, df56);
_mm512_mask_storeu_ps(dfPtr1+102112+21504*i6+5376*j2+384*k3+128*m7+32*f8, 65280, df55);
_mm512_mask_storeu_ps(dfPtr1+102176+21504*i6+5376*j2+384*k3+128*m7+32*f8, 65280, df56);
df63 = _mm512_permutexvar_ps(eo7, df63);
df64 = _mm512_permutexvar_ps(eo7, df64);
_mm512_mask_storeu_ps(dfPtr1+188160+21504*i6+5376*j2+384*k3+128*m7+32*f8, 255, df63);
_mm512_mask_storeu_ps(dfPtr1+188224+21504*i6+5376*j2+384*k3+128*m7+32*f8, 255, df64);
_mm512_mask_storeu_ps(dfPtr1+274144+21504*i6+5376*j2+384*k3+128*m7+32*f8, 65280, df63);
_mm512_mask_storeu_ps(dfPtr1+274208+21504*i6+5376*j2+384*k3+128*m7+32*f8, 65280, df64);
_mm512_mask_storeu_ps(dfPtr1+0+21504*i6+5376*j2+384*k3+128*m7+32*f8, 255, df49);
_mm512_mask_storeu_ps(dfPtr1+64+21504*i6+5376*j2+384*k3+128*m7+32*f8, 255, df50);
_mm512_mask_storeu_ps(dfPtr1+85984+21504*i6+5376*j2+384*k3+128*m7+32*f8, 65280, df49);
_mm512_mask_storeu_ps(dfPtr1+86048+21504*i6+5376*j2+384*k3+128*m7+32*f8, 65280, df50);
_mm512_mask_storeu_ps(dfPtr1+172032+21504*i6+5376*j2+384*k3+128*m7+32*f8, 255, df57);
_mm512_mask_storeu_ps(dfPtr1+172096+21504*i6+5376*j2+384*k3+128*m7+32*f8, 255, df58);
_mm512_mask_storeu_ps(dfPtr1+258016+21504*i6+5376*j2+384*k3+128*m7+32*f8, 65280, df57);
_mm512_mask_storeu_ps(dfPtr1+258080+21504*i6+5376*j2+384*k3+128*m7+32*f8, 65280, df58);
ptrdiff_t b7 = 4;
ptrdiff_t m8 = (size_t)b7/2;
ptrdiff_t f9 = (size_t)b7%2;
__m512 dat59 = _mm512_maskz_loadu_ps(65532, datPtr1+1920+32480*i6+2320*k3+80*h1+4*w1+0*b7);
__m512 dat60 = _mm512_maskz_loadu_ps(65532, datPtr1+2000+32480*i6+2320*k3+80*h1+4*w1+0*b7);
__m512 dat61 = _mm512_maskz_loadu_ps(65532, datPtr1+2080+32480*i6+2320*k3+80*h1+4*w1+0*b7);
__m512 dat62 = _mm512_maskz_loadu_ps(65532, datPtr1+2160+32480*i6+2320*k3+80*h1+4*w1+0*b7);
__m512 dat63 = _mm512_maskz_loadu_ps(65532, datPtr1+2240+32480*i6+2320*k3+80*h1+4*w1+0*b7);
__m512 dat64 = _mm512_maskz_loadu_ps(65532, datPtr1+2320+32480*i6+2320*k3+80*h1+4*w1+0*b7);
__m512 dat65 = _mm512_maskz_loadu_ps(65532, datPtr1+2400+32480*i6+2320*k3+80*h1+4*w1+0*b7);
__m512 dat66 = _mm512_maskz_loadu_ps(65532, datPtr1+2480+32480*i6+2320*k3+80*h1+4*w1+0*b7);
__m512 fft1177 = _mm512_add_ps(dat59, _mm512_setzero_ps());
__m512 fft1265 = _mm512_add_ps(dat60, _mm512_setzero_ps());
__m512 fft1178 = _mm512_sub_ps(dat59, _mm512_setzero_ps());
__m512 fft1266 = _mm512_sub_ps(dat60, _mm512_setzero_ps());
__m512 fft1179 = _mm512_add_ps(dat61, _mm512_setzero_ps());
__m512 fft1267 = _mm512_add_ps(dat62, _mm512_setzero_ps());
__m512 fft1180 = _mm512_sub_ps(dat61, _mm512_setzero_ps());
__m512 fft1268 = _mm512_sub_ps(dat62, _mm512_setzero_ps());
__m512 fft1181 = _mm512_add_ps(dat63, _mm512_setzero_ps());
__m512 fft1269 = _mm512_add_ps(dat64, _mm512_setzero_ps());
__m512 fft1182 = _mm512_sub_ps(dat63, _mm512_setzero_ps());
__m512 fft1270 = _mm512_sub_ps(dat64, _mm512_setzero_ps());
__m512 fft1183 = _mm512_add_ps(dat65, _mm512_setzero_ps());
__m512 fft1271 = _mm512_add_ps(dat66, _mm512_setzero_ps());
__m512 fft1184 = _mm512_sub_ps(dat65, _mm512_setzero_ps());
__m512 fft1272 = _mm512_sub_ps(dat66, _mm512_setzero_ps());
__m512 fft1185 = _mm512_add_ps(fft1177, fft1181);
__m512 fft1273 = _mm512_add_ps(fft1265, fft1269);
__m512 fft1186 = _mm512_sub_ps(fft1177, fft1181);
__m512 fft1274 = _mm512_sub_ps(fft1265, fft1269);
__m512 fft1187 = _mm512_add_ps(fft1179, fft1183);
__m512 fft1275 = _mm512_add_ps(fft1267, fft1271);
__m512 fft1188 = _mm512_sub_ps(fft1183, fft1179);
__m512 fft1276 = _mm512_sub_ps(fft1271, fft1267);
__m512 fft1189 = _mm512_sub_ps(fft1180, fft1184);
__m512 fft1277 = _mm512_sub_ps(fft1268, fft1272);
__m512 fft1190 = _mm512_add_ps(fft1180, fft1184);
__m512 fft1278 = _mm512_add_ps(fft1268, fft1272);
__m512 fft1191 = _mm512_add_ps(fft1185, fft1187);
__m512 fft1279 = _mm512_add_ps(fft1273, fft1275);
__m512 fft1192 = _mm512_sub_ps(fft1185, fft1187);
__m512 fft1280 = _mm512_sub_ps(fft1273, fft1275);
__m512 fft1193 = _mm512_fmadd_ps(fft1189, _mm512_set1_ps(7.0710677e-01f), fft1178);
__m512 fft1281 = _mm512_fmadd_ps(fft1277, _mm512_set1_ps(7.0710677e-01f), fft1266);
__m512 fft1194 = _mm512_fnmsub_ps(fft1190, _mm512_set1_ps(7.0710677e-01f), fft1182);
__m512 fft1282 = _mm512_fnmsub_ps(fft1278, _mm512_set1_ps(7.0710677e-01f), fft1270);
__m512 fft1195 = _mm512_fnmadd_ps(fft1189, _mm512_set1_ps(7.0710677e-01f), fft1178);
__m512 fft1283 = _mm512_fnmadd_ps(fft1277, _mm512_set1_ps(7.0710677e-01f), fft1266);
__m512 fft1196 = _mm512_fnmadd_ps(fft1190, _mm512_set1_ps(7.0710677e-01f), fft1182);
__m512 fft1284 = _mm512_fnmadd_ps(fft1278, _mm512_set1_ps(7.0710677e-01f), fft1270);
__m512 fft1197 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1198 = _mm512_fmadd_ps(fft1191, fft1197, _mm512_shuffle_f32x4(fft1191, fft1191, 78));
__m512 fft1285 = _mm512_fmadd_ps(fft1279, fft1197, _mm512_shuffle_f32x4(fft1279, fft1279, 78));
__m512 fft1199 = _mm512_fmadd_ps(fft1192, fft1197, _mm512_shuffle_f32x4(fft1192, fft1192, 78));
__m512 fft1286 = _mm512_fmadd_ps(fft1280, fft1197, _mm512_shuffle_f32x4(fft1280, fft1280, 78));
__m512 fft1200 = _mm512_fmadd_ps(fft1193, fft1197, _mm512_shuffle_f32x4(fft1193, fft1193, 78));
__m512 fft1287 = _mm512_fmadd_ps(fft1281, fft1197, _mm512_shuffle_f32x4(fft1281, fft1281, 78));
__m512 fft1201 = _mm512_fmadd_ps(fft1194, fft1197, _mm512_shuffle_f32x4(fft1194, fft1194, 78));
__m512 fft1288 = _mm512_fmadd_ps(fft1282, fft1197, _mm512_shuffle_f32x4(fft1282, fft1282, 78));
__m512 fft1202 = _mm512_fmadd_ps(fft1186, fft1197, _mm512_shuffle_f32x4(fft1186, fft1186, 78));
__m512 fft1289 = _mm512_fmadd_ps(fft1274, fft1197, _mm512_shuffle_f32x4(fft1274, fft1274, 78));
__m512 fft1203 = _mm512_fmadd_ps(fft1188, fft1197, _mm512_shuffle_f32x4(fft1188, fft1188, 78));
__m512 fft1290 = _mm512_fmadd_ps(fft1276, fft1197, _mm512_shuffle_f32x4(fft1276, fft1276, 78));
__m512 fft1204 = _mm512_fmadd_ps(fft1195, fft1197, _mm512_shuffle_f32x4(fft1195, fft1195, 78));
__m512 fft1291 = _mm512_fmadd_ps(fft1283, fft1197, _mm512_shuffle_f32x4(fft1283, fft1283, 78));
__m512 fft1205 = _mm512_fmadd_ps(fft1196, fft1197, _mm512_shuffle_f32x4(fft1196, fft1196, 78));
__m512 fft1292 = _mm512_fmadd_ps(fft1284, fft1197, _mm512_shuffle_f32x4(fft1284, fft1284, 78));
__m512 fft1206 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1207 = _mm512_mul_ps(fft1198, fft1206);
__m512 fft1293 = _mm512_mul_ps(fft1285, fft1206);
__m512 fft1208 = _mm512_mul_ps(fft1199, fft1206);
__m512 fft1294 = _mm512_mul_ps(fft1286, fft1206);
__m512 fft1209 = _mm512_mul_ps(fft1200, fft1206);
__m512 fft1295 = _mm512_mul_ps(fft1287, fft1206);
__m512 fft1210 = _mm512_mul_ps(fft1201, fft1206);
__m512 fft1296 = _mm512_mul_ps(fft1288, fft1206);
__m512 fft1211 = _mm512_mul_ps(fft1202, fft1206);
__m512 fft1297 = _mm512_mul_ps(fft1289, fft1206);
__m512 fft1212 = _mm512_mul_ps(fft1203, fft1206);
__m512 fft1298 = _mm512_mul_ps(fft1290, fft1206);
__m512 fft1213 = _mm512_mul_ps(fft1204, fft1206);
__m512 fft1299 = _mm512_mul_ps(fft1291, fft1206);
__m512 fft1214 = _mm512_mul_ps(fft1205, fft1206);
__m512 fft1300 = _mm512_mul_ps(fft1292, fft1206);
__m512 fft1215 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1216 = _mm512_fmadd_ps(fft1199, fft1215, fft1207);
__m512 fft1301 = _mm512_fmadd_ps(fft1286, fft1215, fft1293);
__m512 fft1217 = _mm512_fnmadd_ps(fft1198, fft1215, fft1208);
__m512 fft1302 = _mm512_fnmadd_ps(fft1285, fft1215, fft1294);
__m512 fft1218 = _mm512_fmadd_ps(fft1201, fft1215, fft1209);
__m512 fft1303 = _mm512_fmadd_ps(fft1288, fft1215, fft1295);
__m512 fft1219 = _mm512_fnmadd_ps(fft1200, fft1215, fft1210);
__m512 fft1304 = _mm512_fnmadd_ps(fft1287, fft1215, fft1296);
__m512 fft1220 = _mm512_fmadd_ps(fft1203, fft1215, fft1211);
__m512 fft1305 = _mm512_fmadd_ps(fft1290, fft1215, fft1297);
__m512 fft1221 = _mm512_fnmadd_ps(fft1202, fft1215, fft1212);
__m512 fft1306 = _mm512_fnmadd_ps(fft1289, fft1215, fft1298);
__m512 fft1222 = _mm512_fmadd_ps(fft1205, fft1215, fft1213);
__m512 fft1307 = _mm512_fmadd_ps(fft1292, fft1215, fft1299);
__m512 fft1223 = _mm512_fnmadd_ps(fft1204, fft1215, fft1214);
__m512 fft1308 = _mm512_fnmadd_ps(fft1291, fft1215, fft1300);
__m512 fft1224 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1225 = _mm512_fmadd_ps(fft1216, fft1224, _mm512_shuffle_f32x4(fft1216, fft1216, 177));
__m512 fft1309 = _mm512_fmadd_ps(fft1301, fft1224, _mm512_shuffle_f32x4(fft1301, fft1301, 177));
__m512 fft1226 = _mm512_fmadd_ps(fft1217, fft1224, _mm512_shuffle_f32x4(fft1217, fft1217, 177));
__m512 fft1310 = _mm512_fmadd_ps(fft1302, fft1224, _mm512_shuffle_f32x4(fft1302, fft1302, 177));
__m512 fft1227 = _mm512_fmadd_ps(fft1218, fft1224, _mm512_shuffle_f32x4(fft1218, fft1218, 177));
__m512 fft1311 = _mm512_fmadd_ps(fft1303, fft1224, _mm512_shuffle_f32x4(fft1303, fft1303, 177));
__m512 fft1228 = _mm512_fmadd_ps(fft1219, fft1224, _mm512_shuffle_f32x4(fft1219, fft1219, 177));
__m512 fft1312 = _mm512_fmadd_ps(fft1304, fft1224, _mm512_shuffle_f32x4(fft1304, fft1304, 177));
__m512 fft1229 = _mm512_fmadd_ps(fft1220, fft1224, _mm512_shuffle_f32x4(fft1220, fft1220, 177));
__m512 fft1313 = _mm512_fmadd_ps(fft1305, fft1224, _mm512_shuffle_f32x4(fft1305, fft1305, 177));
__m512 fft1230 = _mm512_fmadd_ps(fft1221, fft1224, _mm512_shuffle_f32x4(fft1221, fft1221, 177));
__m512 fft1314 = _mm512_fmadd_ps(fft1306, fft1224, _mm512_shuffle_f32x4(fft1306, fft1306, 177));
__m512 fft1231 = _mm512_fmadd_ps(fft1222, fft1224, _mm512_shuffle_f32x4(fft1222, fft1222, 177));
__m512 fft1315 = _mm512_fmadd_ps(fft1307, fft1224, _mm512_shuffle_f32x4(fft1307, fft1307, 177));
__m512 fft1232 = _mm512_fmadd_ps(fft1223, fft1224, _mm512_shuffle_f32x4(fft1223, fft1223, 177));
__m512 fft1316 = _mm512_fmadd_ps(fft1308, fft1224, _mm512_shuffle_f32x4(fft1308, fft1308, 177));
__m512 fft1233 = _mm512_mask_mov_ps(fft1225, 49344, fft1226);
__m512 fft1317 = _mm512_mask_mov_ps(fft1309, 49344, fft1310);
__m512 fft1234 = _mm512_mask_sub_ps(fft1226, 49344, _mm512_setzero_ps(), fft1225);
__m512 fft1318 = _mm512_mask_sub_ps(fft1310, 49344, _mm512_setzero_ps(), fft1309);
__m512 fft1235 = _mm512_mask_mov_ps(fft1227, 49344, fft1228);
__m512 fft1319 = _mm512_mask_mov_ps(fft1311, 49344, fft1312);
__m512 fft1236 = _mm512_mask_sub_ps(fft1228, 49344, _mm512_setzero_ps(), fft1227);
__m512 fft1320 = _mm512_mask_sub_ps(fft1312, 49344, _mm512_setzero_ps(), fft1311);
__m512 fft1237 = _mm512_mask_mov_ps(fft1229, 49344, fft1230);
__m512 fft1321 = _mm512_mask_mov_ps(fft1313, 49344, fft1314);
__m512 fft1238 = _mm512_mask_sub_ps(fft1230, 49344, _mm512_setzero_ps(), fft1229);
__m512 fft1322 = _mm512_mask_sub_ps(fft1314, 49344, _mm512_setzero_ps(), fft1313);
__m512 fft1239 = _mm512_mask_mov_ps(fft1231, 49344, fft1232);
__m512 fft1323 = _mm512_mask_mov_ps(fft1315, 49344, fft1316);
__m512 fft1240 = _mm512_mask_sub_ps(fft1232, 49344, _mm512_setzero_ps(), fft1231);
__m512 fft1324 = _mm512_mask_sub_ps(fft1316, 49344, _mm512_setzero_ps(), fft1315);
__m512 fft1241 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1242 = _mm512_fmadd_ps(fft1233, fft1241, _mm512_shuffle_ps(fft1233, fft1233, 78));
__m512 fft1325 = _mm512_fmadd_ps(fft1317, fft1241, _mm512_shuffle_ps(fft1317, fft1317, 78));
__m512 fft1243 = _mm512_fmadd_ps(fft1234, fft1241, _mm512_shuffle_ps(fft1234, fft1234, 78));
__m512 fft1326 = _mm512_fmadd_ps(fft1318, fft1241, _mm512_shuffle_ps(fft1318, fft1318, 78));
__m512 fft1244 = _mm512_fmadd_ps(fft1235, fft1241, _mm512_shuffle_ps(fft1235, fft1235, 78));
__m512 fft1327 = _mm512_fmadd_ps(fft1319, fft1241, _mm512_shuffle_ps(fft1319, fft1319, 78));
__m512 fft1245 = _mm512_fmadd_ps(fft1236, fft1241, _mm512_shuffle_ps(fft1236, fft1236, 78));
__m512 fft1328 = _mm512_fmadd_ps(fft1320, fft1241, _mm512_shuffle_ps(fft1320, fft1320, 78));
__m512 fft1246 = _mm512_fmadd_ps(fft1237, fft1241, _mm512_shuffle_ps(fft1237, fft1237, 78));
__m512 fft1329 = _mm512_fmadd_ps(fft1321, fft1241, _mm512_shuffle_ps(fft1321, fft1321, 78));
__m512 fft1247 = _mm512_fmadd_ps(fft1238, fft1241, _mm512_shuffle_ps(fft1238, fft1238, 78));
__m512 fft1330 = _mm512_fmadd_ps(fft1322, fft1241, _mm512_shuffle_ps(fft1322, fft1322, 78));
__m512 fft1248 = _mm512_fmadd_ps(fft1239, fft1241, _mm512_shuffle_ps(fft1239, fft1239, 78));
__m512 fft1331 = _mm512_fmadd_ps(fft1323, fft1241, _mm512_shuffle_ps(fft1323, fft1323, 78));
__m512 fft1249 = _mm512_fmadd_ps(fft1240, fft1241, _mm512_shuffle_ps(fft1240, fft1240, 78));
__m512 fft1332 = _mm512_fmadd_ps(fft1324, fft1241, _mm512_shuffle_ps(fft1324, fft1324, 78));
__m512i fft1250 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1251 = _mm512_permutexvar_ps(fft1250, fft1242);
__m512 fft1333 = _mm512_permutexvar_ps(fft1250, fft1325);
__m512i fft1252 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1253 = _mm512_permutexvar_ps(fft1252, fft1242);
__m512 fft1334 = _mm512_permutexvar_ps(fft1252, fft1325);
__m512 fft1254 = _mm512_permutexvar_ps(fft1250, fft1243);
__m512 fft1335 = _mm512_permutexvar_ps(fft1250, fft1326);
__m512 fft1255 = _mm512_permutexvar_ps(fft1252, fft1243);
__m512 fft1336 = _mm512_permutexvar_ps(fft1252, fft1326);
__m512 fft1256 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1257 = _mm512_fmadd_ps(fft1251, fft1256, fft1253);
__m512 fft1337 = _mm512_fmadd_ps(fft1333, fft1256, fft1334);
__m512 fft1258 = _mm512_fnmadd_ps(fft1255, fft1256, fft1254);
__m512 fft1338 = _mm512_fnmadd_ps(fft1336, fft1256, fft1335);
__m512 fft1259 = _mm512_mask_mov_ps(fft1255, 21845, fft1257);
__m512 fft1339 = _mm512_mask_mov_ps(fft1336, 21845, fft1337);
__m512 fft1260 = _mm512_mask_mov_ps(fft1251, 43176, fft1257);
__m512 fft1340 = _mm512_mask_mov_ps(fft1333, 43176, fft1337);
__m512 fft1261 = _mm512_mask_mov_ps(fft1259, 43176, fft1258);
__m512 fft1341 = _mm512_mask_mov_ps(fft1339, 43176, fft1338);
__m512 fft1262 = _mm512_mask_mov_ps(fft1260, 22102, fft1258);
__m512 fft1342 = _mm512_mask_mov_ps(fft1340, 22102, fft1338);
__m512 fft1263 = _mm512_mask_mul_ps(fft1261, 64764, fft1261, _mm512_set1_ps(5e-01f));
__m512 fft1343 = _mm512_mask_mul_ps(fft1341, 64764, fft1341, _mm512_set1_ps(5e-01f));
__m512 fft1264 = _mm512_mask_mul_ps(fft1262, 64764, fft1262, _mm512_set1_ps(5e-01f));
__m512 fft1344 = _mm512_mask_mul_ps(fft1342, 64764, fft1342, _mm512_set1_ps(5e-01f));
__m512 df65 = fft1263;
__m512 df73 = fft1343;
__m512 df66 = fft1264;
__m512 df74 = fft1344;
__m512 df67 = fft1244;
__m512 df75 = fft1327;
__m512 df68 = fft1245;
__m512 df76 = fft1328;
__m512 df69 = fft1246;
__m512 df77 = fft1329;
__m512 df70 = fft1247;
__m512 df78 = fft1330;
__m512 df71 = fft1248;
__m512 df79 = fft1331;
__m512 df72 = fft1249;
__m512 df80 = fft1332;
__m512i eo8 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df67 = _mm512_permutexvar_ps(eo8, df67);
df68 = _mm512_permutexvar_ps(eo8, df68);
_mm512_mask_storeu_ps(dfPtr1+5376+21504*i6+5376*j2+384*k3+128*m8+32*f9, 255, df67);
_mm512_mask_storeu_ps(dfPtr1+5440+21504*i6+5376*j2+384*k3+128*m8+32*f9, 255, df68);
_mm512_mask_storeu_ps(dfPtr1+91360+21504*i6+5376*j2+384*k3+128*m8+32*f9, 65280, df67);
_mm512_mask_storeu_ps(dfPtr1+91424+21504*i6+5376*j2+384*k3+128*m8+32*f9, 65280, df68);
df75 = _mm512_permutexvar_ps(eo8, df75);
df76 = _mm512_permutexvar_ps(eo8, df76);
_mm512_mask_storeu_ps(dfPtr1+177408+21504*i6+5376*j2+384*k3+128*m8+32*f9, 255, df75);
_mm512_mask_storeu_ps(dfPtr1+177472+21504*i6+5376*j2+384*k3+128*m8+32*f9, 255, df76);
_mm512_mask_storeu_ps(dfPtr1+263392+21504*i6+5376*j2+384*k3+128*m8+32*f9, 65280, df75);
_mm512_mask_storeu_ps(dfPtr1+263456+21504*i6+5376*j2+384*k3+128*m8+32*f9, 65280, df76);
df69 = _mm512_permutexvar_ps(eo8, df69);
df70 = _mm512_permutexvar_ps(eo8, df70);
_mm512_mask_storeu_ps(dfPtr1+10752+21504*i6+5376*j2+384*k3+128*m8+32*f9, 255, df69);
_mm512_mask_storeu_ps(dfPtr1+10816+21504*i6+5376*j2+384*k3+128*m8+32*f9, 255, df70);
_mm512_mask_storeu_ps(dfPtr1+96736+21504*i6+5376*j2+384*k3+128*m8+32*f9, 65280, df69);
_mm512_mask_storeu_ps(dfPtr1+96800+21504*i6+5376*j2+384*k3+128*m8+32*f9, 65280, df70);
df77 = _mm512_permutexvar_ps(eo8, df77);
df78 = _mm512_permutexvar_ps(eo8, df78);
_mm512_mask_storeu_ps(dfPtr1+182784+21504*i6+5376*j2+384*k3+128*m8+32*f9, 255, df77);
_mm512_mask_storeu_ps(dfPtr1+182848+21504*i6+5376*j2+384*k3+128*m8+32*f9, 255, df78);
_mm512_mask_storeu_ps(dfPtr1+268768+21504*i6+5376*j2+384*k3+128*m8+32*f9, 65280, df77);
_mm512_mask_storeu_ps(dfPtr1+268832+21504*i6+5376*j2+384*k3+128*m8+32*f9, 65280, df78);
df71 = _mm512_permutexvar_ps(eo8, df71);
df72 = _mm512_permutexvar_ps(eo8, df72);
_mm512_mask_storeu_ps(dfPtr1+16128+21504*i6+5376*j2+384*k3+128*m8+32*f9, 255, df71);
_mm512_mask_storeu_ps(dfPtr1+16192+21504*i6+5376*j2+384*k3+128*m8+32*f9, 255, df72);
_mm512_mask_storeu_ps(dfPtr1+102112+21504*i6+5376*j2+384*k3+128*m8+32*f9, 65280, df71);
_mm512_mask_storeu_ps(dfPtr1+102176+21504*i6+5376*j2+384*k3+128*m8+32*f9, 65280, df72);
df79 = _mm512_permutexvar_ps(eo8, df79);
df80 = _mm512_permutexvar_ps(eo8, df80);
_mm512_mask_storeu_ps(dfPtr1+188160+21504*i6+5376*j2+384*k3+128*m8+32*f9, 255, df79);
_mm512_mask_storeu_ps(dfPtr1+188224+21504*i6+5376*j2+384*k3+128*m8+32*f9, 255, df80);
_mm512_mask_storeu_ps(dfPtr1+274144+21504*i6+5376*j2+384*k3+128*m8+32*f9, 65280, df79);
_mm512_mask_storeu_ps(dfPtr1+274208+21504*i6+5376*j2+384*k3+128*m8+32*f9, 65280, df80);
_mm512_mask_storeu_ps(dfPtr1+0+21504*i6+5376*j2+384*k3+128*m8+32*f9, 255, df65);
_mm512_mask_storeu_ps(dfPtr1+64+21504*i6+5376*j2+384*k3+128*m8+32*f9, 255, df66);
_mm512_mask_storeu_ps(dfPtr1+85984+21504*i6+5376*j2+384*k3+128*m8+32*f9, 65280, df65);
_mm512_mask_storeu_ps(dfPtr1+86048+21504*i6+5376*j2+384*k3+128*m8+32*f9, 65280, df66);
_mm512_mask_storeu_ps(dfPtr1+172032+21504*i6+5376*j2+384*k3+128*m8+32*f9, 255, df73);
_mm512_mask_storeu_ps(dfPtr1+172096+21504*i6+5376*j2+384*k3+128*m8+32*f9, 255, df74);
_mm512_mask_storeu_ps(dfPtr1+258016+21504*i6+5376*j2+384*k3+128*m8+32*f9, 65280, df73);
_mm512_mask_storeu_ps(dfPtr1+258080+21504*i6+5376*j2+384*k3+128*m8+32*f9, 65280, df74);
ptrdiff_t b8 = 5;
ptrdiff_t m9 = (size_t)b8/2;
ptrdiff_t f10 = (size_t)b8%2;
__m512 dat67 = _mm512_maskz_loadu_ps(1023, datPtr1+1968+32480*i6+2320*k3+80*h1+4*w1+0*b8);
__m512 dat68 = _mm512_maskz_loadu_ps(1023, datPtr1+2048+32480*i6+2320*k3+80*h1+4*w1+0*b8);
__m512 dat69 = _mm512_maskz_loadu_ps(1023, datPtr1+2128+32480*i6+2320*k3+80*h1+4*w1+0*b8);
__m512 dat70 = _mm512_maskz_loadu_ps(1023, datPtr1+2208+32480*i6+2320*k3+80*h1+4*w1+0*b8);
__m512 dat71 = _mm512_maskz_loadu_ps(1023, datPtr1+2288+32480*i6+2320*k3+80*h1+4*w1+0*b8);
__m512 dat72 = _mm512_maskz_loadu_ps(1023, datPtr1+2368+32480*i6+2320*k3+80*h1+4*w1+0*b8);
__m512 dat73 = _mm512_maskz_loadu_ps(1023, datPtr1+2448+32480*i6+2320*k3+80*h1+4*w1+0*b8);
__m512 dat74 = _mm512_maskz_loadu_ps(1023, datPtr1+2528+32480*i6+2320*k3+80*h1+4*w1+0*b8);
__m512 fft1345 = _mm512_add_ps(dat67, _mm512_setzero_ps());
__m512 fft1433 = _mm512_add_ps(dat68, _mm512_setzero_ps());
__m512 fft1346 = _mm512_sub_ps(dat67, _mm512_setzero_ps());
__m512 fft1434 = _mm512_sub_ps(dat68, _mm512_setzero_ps());
__m512 fft1347 = _mm512_add_ps(dat69, _mm512_setzero_ps());
__m512 fft1435 = _mm512_add_ps(dat70, _mm512_setzero_ps());
__m512 fft1348 = _mm512_sub_ps(dat69, _mm512_setzero_ps());
__m512 fft1436 = _mm512_sub_ps(dat70, _mm512_setzero_ps());
__m512 fft1349 = _mm512_add_ps(dat71, _mm512_setzero_ps());
__m512 fft1437 = _mm512_add_ps(dat72, _mm512_setzero_ps());
__m512 fft1350 = _mm512_sub_ps(dat71, _mm512_setzero_ps());
__m512 fft1438 = _mm512_sub_ps(dat72, _mm512_setzero_ps());
__m512 fft1351 = _mm512_add_ps(dat73, _mm512_setzero_ps());
__m512 fft1439 = _mm512_add_ps(dat74, _mm512_setzero_ps());
__m512 fft1352 = _mm512_sub_ps(dat73, _mm512_setzero_ps());
__m512 fft1440 = _mm512_sub_ps(dat74, _mm512_setzero_ps());
__m512 fft1353 = _mm512_add_ps(fft1345, fft1349);
__m512 fft1441 = _mm512_add_ps(fft1433, fft1437);
__m512 fft1354 = _mm512_sub_ps(fft1345, fft1349);
__m512 fft1442 = _mm512_sub_ps(fft1433, fft1437);
__m512 fft1355 = _mm512_add_ps(fft1347, fft1351);
__m512 fft1443 = _mm512_add_ps(fft1435, fft1439);
__m512 fft1356 = _mm512_sub_ps(fft1351, fft1347);
__m512 fft1444 = _mm512_sub_ps(fft1439, fft1435);
__m512 fft1357 = _mm512_sub_ps(fft1348, fft1352);
__m512 fft1445 = _mm512_sub_ps(fft1436, fft1440);
__m512 fft1358 = _mm512_add_ps(fft1348, fft1352);
__m512 fft1446 = _mm512_add_ps(fft1436, fft1440);
__m512 fft1359 = _mm512_add_ps(fft1353, fft1355);
__m512 fft1447 = _mm512_add_ps(fft1441, fft1443);
__m512 fft1360 = _mm512_sub_ps(fft1353, fft1355);
__m512 fft1448 = _mm512_sub_ps(fft1441, fft1443);
__m512 fft1361 = _mm512_fmadd_ps(fft1357, _mm512_set1_ps(7.0710677e-01f), fft1346);
__m512 fft1449 = _mm512_fmadd_ps(fft1445, _mm512_set1_ps(7.0710677e-01f), fft1434);
__m512 fft1362 = _mm512_fnmsub_ps(fft1358, _mm512_set1_ps(7.0710677e-01f), fft1350);
__m512 fft1450 = _mm512_fnmsub_ps(fft1446, _mm512_set1_ps(7.0710677e-01f), fft1438);
__m512 fft1363 = _mm512_fnmadd_ps(fft1357, _mm512_set1_ps(7.0710677e-01f), fft1346);
__m512 fft1451 = _mm512_fnmadd_ps(fft1445, _mm512_set1_ps(7.0710677e-01f), fft1434);
__m512 fft1364 = _mm512_fnmadd_ps(fft1358, _mm512_set1_ps(7.0710677e-01f), fft1350);
__m512 fft1452 = _mm512_fnmadd_ps(fft1446, _mm512_set1_ps(7.0710677e-01f), fft1438);
__m512 fft1365 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1366 = _mm512_fmadd_ps(fft1359, fft1365, _mm512_shuffle_f32x4(fft1359, fft1359, 78));
__m512 fft1453 = _mm512_fmadd_ps(fft1447, fft1365, _mm512_shuffle_f32x4(fft1447, fft1447, 78));
__m512 fft1367 = _mm512_fmadd_ps(fft1360, fft1365, _mm512_shuffle_f32x4(fft1360, fft1360, 78));
__m512 fft1454 = _mm512_fmadd_ps(fft1448, fft1365, _mm512_shuffle_f32x4(fft1448, fft1448, 78));
__m512 fft1368 = _mm512_fmadd_ps(fft1361, fft1365, _mm512_shuffle_f32x4(fft1361, fft1361, 78));
__m512 fft1455 = _mm512_fmadd_ps(fft1449, fft1365, _mm512_shuffle_f32x4(fft1449, fft1449, 78));
__m512 fft1369 = _mm512_fmadd_ps(fft1362, fft1365, _mm512_shuffle_f32x4(fft1362, fft1362, 78));
__m512 fft1456 = _mm512_fmadd_ps(fft1450, fft1365, _mm512_shuffle_f32x4(fft1450, fft1450, 78));
__m512 fft1370 = _mm512_fmadd_ps(fft1354, fft1365, _mm512_shuffle_f32x4(fft1354, fft1354, 78));
__m512 fft1457 = _mm512_fmadd_ps(fft1442, fft1365, _mm512_shuffle_f32x4(fft1442, fft1442, 78));
__m512 fft1371 = _mm512_fmadd_ps(fft1356, fft1365, _mm512_shuffle_f32x4(fft1356, fft1356, 78));
__m512 fft1458 = _mm512_fmadd_ps(fft1444, fft1365, _mm512_shuffle_f32x4(fft1444, fft1444, 78));
__m512 fft1372 = _mm512_fmadd_ps(fft1363, fft1365, _mm512_shuffle_f32x4(fft1363, fft1363, 78));
__m512 fft1459 = _mm512_fmadd_ps(fft1451, fft1365, _mm512_shuffle_f32x4(fft1451, fft1451, 78));
__m512 fft1373 = _mm512_fmadd_ps(fft1364, fft1365, _mm512_shuffle_f32x4(fft1364, fft1364, 78));
__m512 fft1460 = _mm512_fmadd_ps(fft1452, fft1365, _mm512_shuffle_f32x4(fft1452, fft1452, 78));
__m512 fft1374 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1375 = _mm512_mul_ps(fft1366, fft1374);
__m512 fft1461 = _mm512_mul_ps(fft1453, fft1374);
__m512 fft1376 = _mm512_mul_ps(fft1367, fft1374);
__m512 fft1462 = _mm512_mul_ps(fft1454, fft1374);
__m512 fft1377 = _mm512_mul_ps(fft1368, fft1374);
__m512 fft1463 = _mm512_mul_ps(fft1455, fft1374);
__m512 fft1378 = _mm512_mul_ps(fft1369, fft1374);
__m512 fft1464 = _mm512_mul_ps(fft1456, fft1374);
__m512 fft1379 = _mm512_mul_ps(fft1370, fft1374);
__m512 fft1465 = _mm512_mul_ps(fft1457, fft1374);
__m512 fft1380 = _mm512_mul_ps(fft1371, fft1374);
__m512 fft1466 = _mm512_mul_ps(fft1458, fft1374);
__m512 fft1381 = _mm512_mul_ps(fft1372, fft1374);
__m512 fft1467 = _mm512_mul_ps(fft1459, fft1374);
__m512 fft1382 = _mm512_mul_ps(fft1373, fft1374);
__m512 fft1468 = _mm512_mul_ps(fft1460, fft1374);
__m512 fft1383 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1384 = _mm512_fmadd_ps(fft1367, fft1383, fft1375);
__m512 fft1469 = _mm512_fmadd_ps(fft1454, fft1383, fft1461);
__m512 fft1385 = _mm512_fnmadd_ps(fft1366, fft1383, fft1376);
__m512 fft1470 = _mm512_fnmadd_ps(fft1453, fft1383, fft1462);
__m512 fft1386 = _mm512_fmadd_ps(fft1369, fft1383, fft1377);
__m512 fft1471 = _mm512_fmadd_ps(fft1456, fft1383, fft1463);
__m512 fft1387 = _mm512_fnmadd_ps(fft1368, fft1383, fft1378);
__m512 fft1472 = _mm512_fnmadd_ps(fft1455, fft1383, fft1464);
__m512 fft1388 = _mm512_fmadd_ps(fft1371, fft1383, fft1379);
__m512 fft1473 = _mm512_fmadd_ps(fft1458, fft1383, fft1465);
__m512 fft1389 = _mm512_fnmadd_ps(fft1370, fft1383, fft1380);
__m512 fft1474 = _mm512_fnmadd_ps(fft1457, fft1383, fft1466);
__m512 fft1390 = _mm512_fmadd_ps(fft1373, fft1383, fft1381);
__m512 fft1475 = _mm512_fmadd_ps(fft1460, fft1383, fft1467);
__m512 fft1391 = _mm512_fnmadd_ps(fft1372, fft1383, fft1382);
__m512 fft1476 = _mm512_fnmadd_ps(fft1459, fft1383, fft1468);
__m512 fft1392 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1393 = _mm512_fmadd_ps(fft1384, fft1392, _mm512_shuffle_f32x4(fft1384, fft1384, 177));
__m512 fft1477 = _mm512_fmadd_ps(fft1469, fft1392, _mm512_shuffle_f32x4(fft1469, fft1469, 177));
__m512 fft1394 = _mm512_fmadd_ps(fft1385, fft1392, _mm512_shuffle_f32x4(fft1385, fft1385, 177));
__m512 fft1478 = _mm512_fmadd_ps(fft1470, fft1392, _mm512_shuffle_f32x4(fft1470, fft1470, 177));
__m512 fft1395 = _mm512_fmadd_ps(fft1386, fft1392, _mm512_shuffle_f32x4(fft1386, fft1386, 177));
__m512 fft1479 = _mm512_fmadd_ps(fft1471, fft1392, _mm512_shuffle_f32x4(fft1471, fft1471, 177));
__m512 fft1396 = _mm512_fmadd_ps(fft1387, fft1392, _mm512_shuffle_f32x4(fft1387, fft1387, 177));
__m512 fft1480 = _mm512_fmadd_ps(fft1472, fft1392, _mm512_shuffle_f32x4(fft1472, fft1472, 177));
__m512 fft1397 = _mm512_fmadd_ps(fft1388, fft1392, _mm512_shuffle_f32x4(fft1388, fft1388, 177));
__m512 fft1481 = _mm512_fmadd_ps(fft1473, fft1392, _mm512_shuffle_f32x4(fft1473, fft1473, 177));
__m512 fft1398 = _mm512_fmadd_ps(fft1389, fft1392, _mm512_shuffle_f32x4(fft1389, fft1389, 177));
__m512 fft1482 = _mm512_fmadd_ps(fft1474, fft1392, _mm512_shuffle_f32x4(fft1474, fft1474, 177));
__m512 fft1399 = _mm512_fmadd_ps(fft1390, fft1392, _mm512_shuffle_f32x4(fft1390, fft1390, 177));
__m512 fft1483 = _mm512_fmadd_ps(fft1475, fft1392, _mm512_shuffle_f32x4(fft1475, fft1475, 177));
__m512 fft1400 = _mm512_fmadd_ps(fft1391, fft1392, _mm512_shuffle_f32x4(fft1391, fft1391, 177));
__m512 fft1484 = _mm512_fmadd_ps(fft1476, fft1392, _mm512_shuffle_f32x4(fft1476, fft1476, 177));
__m512 fft1401 = _mm512_mask_mov_ps(fft1393, 49344, fft1394);
__m512 fft1485 = _mm512_mask_mov_ps(fft1477, 49344, fft1478);
__m512 fft1402 = _mm512_mask_sub_ps(fft1394, 49344, _mm512_setzero_ps(), fft1393);
__m512 fft1486 = _mm512_mask_sub_ps(fft1478, 49344, _mm512_setzero_ps(), fft1477);
__m512 fft1403 = _mm512_mask_mov_ps(fft1395, 49344, fft1396);
__m512 fft1487 = _mm512_mask_mov_ps(fft1479, 49344, fft1480);
__m512 fft1404 = _mm512_mask_sub_ps(fft1396, 49344, _mm512_setzero_ps(), fft1395);
__m512 fft1488 = _mm512_mask_sub_ps(fft1480, 49344, _mm512_setzero_ps(), fft1479);
__m512 fft1405 = _mm512_mask_mov_ps(fft1397, 49344, fft1398);
__m512 fft1489 = _mm512_mask_mov_ps(fft1481, 49344, fft1482);
__m512 fft1406 = _mm512_mask_sub_ps(fft1398, 49344, _mm512_setzero_ps(), fft1397);
__m512 fft1490 = _mm512_mask_sub_ps(fft1482, 49344, _mm512_setzero_ps(), fft1481);
__m512 fft1407 = _mm512_mask_mov_ps(fft1399, 49344, fft1400);
__m512 fft1491 = _mm512_mask_mov_ps(fft1483, 49344, fft1484);
__m512 fft1408 = _mm512_mask_sub_ps(fft1400, 49344, _mm512_setzero_ps(), fft1399);
__m512 fft1492 = _mm512_mask_sub_ps(fft1484, 49344, _mm512_setzero_ps(), fft1483);
__m512 fft1409 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1410 = _mm512_fmadd_ps(fft1401, fft1409, _mm512_shuffle_ps(fft1401, fft1401, 78));
__m512 fft1493 = _mm512_fmadd_ps(fft1485, fft1409, _mm512_shuffle_ps(fft1485, fft1485, 78));
__m512 fft1411 = _mm512_fmadd_ps(fft1402, fft1409, _mm512_shuffle_ps(fft1402, fft1402, 78));
__m512 fft1494 = _mm512_fmadd_ps(fft1486, fft1409, _mm512_shuffle_ps(fft1486, fft1486, 78));
__m512 fft1412 = _mm512_fmadd_ps(fft1403, fft1409, _mm512_shuffle_ps(fft1403, fft1403, 78));
__m512 fft1495 = _mm512_fmadd_ps(fft1487, fft1409, _mm512_shuffle_ps(fft1487, fft1487, 78));
__m512 fft1413 = _mm512_fmadd_ps(fft1404, fft1409, _mm512_shuffle_ps(fft1404, fft1404, 78));
__m512 fft1496 = _mm512_fmadd_ps(fft1488, fft1409, _mm512_shuffle_ps(fft1488, fft1488, 78));
__m512 fft1414 = _mm512_fmadd_ps(fft1405, fft1409, _mm512_shuffle_ps(fft1405, fft1405, 78));
__m512 fft1497 = _mm512_fmadd_ps(fft1489, fft1409, _mm512_shuffle_ps(fft1489, fft1489, 78));
__m512 fft1415 = _mm512_fmadd_ps(fft1406, fft1409, _mm512_shuffle_ps(fft1406, fft1406, 78));
__m512 fft1498 = _mm512_fmadd_ps(fft1490, fft1409, _mm512_shuffle_ps(fft1490, fft1490, 78));
__m512 fft1416 = _mm512_fmadd_ps(fft1407, fft1409, _mm512_shuffle_ps(fft1407, fft1407, 78));
__m512 fft1499 = _mm512_fmadd_ps(fft1491, fft1409, _mm512_shuffle_ps(fft1491, fft1491, 78));
__m512 fft1417 = _mm512_fmadd_ps(fft1408, fft1409, _mm512_shuffle_ps(fft1408, fft1408, 78));
__m512 fft1500 = _mm512_fmadd_ps(fft1492, fft1409, _mm512_shuffle_ps(fft1492, fft1492, 78));
__m512i fft1418 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1419 = _mm512_permutexvar_ps(fft1418, fft1410);
__m512 fft1501 = _mm512_permutexvar_ps(fft1418, fft1493);
__m512i fft1420 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1421 = _mm512_permutexvar_ps(fft1420, fft1410);
__m512 fft1502 = _mm512_permutexvar_ps(fft1420, fft1493);
__m512 fft1422 = _mm512_permutexvar_ps(fft1418, fft1411);
__m512 fft1503 = _mm512_permutexvar_ps(fft1418, fft1494);
__m512 fft1423 = _mm512_permutexvar_ps(fft1420, fft1411);
__m512 fft1504 = _mm512_permutexvar_ps(fft1420, fft1494);
__m512 fft1424 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1425 = _mm512_fmadd_ps(fft1419, fft1424, fft1421);
__m512 fft1505 = _mm512_fmadd_ps(fft1501, fft1424, fft1502);
__m512 fft1426 = _mm512_fnmadd_ps(fft1423, fft1424, fft1422);
__m512 fft1506 = _mm512_fnmadd_ps(fft1504, fft1424, fft1503);
__m512 fft1427 = _mm512_mask_mov_ps(fft1423, 21845, fft1425);
__m512 fft1507 = _mm512_mask_mov_ps(fft1504, 21845, fft1505);
__m512 fft1428 = _mm512_mask_mov_ps(fft1419, 43176, fft1425);
__m512 fft1508 = _mm512_mask_mov_ps(fft1501, 43176, fft1505);
__m512 fft1429 = _mm512_mask_mov_ps(fft1427, 43176, fft1426);
__m512 fft1509 = _mm512_mask_mov_ps(fft1507, 43176, fft1506);
__m512 fft1430 = _mm512_mask_mov_ps(fft1428, 22102, fft1426);
__m512 fft1510 = _mm512_mask_mov_ps(fft1508, 22102, fft1506);
__m512 fft1431 = _mm512_mask_mul_ps(fft1429, 64764, fft1429, _mm512_set1_ps(5e-01f));
__m512 fft1511 = _mm512_mask_mul_ps(fft1509, 64764, fft1509, _mm512_set1_ps(5e-01f));
__m512 fft1432 = _mm512_mask_mul_ps(fft1430, 64764, fft1430, _mm512_set1_ps(5e-01f));
__m512 fft1512 = _mm512_mask_mul_ps(fft1510, 64764, fft1510, _mm512_set1_ps(5e-01f));
__m512 df81 = fft1431;
__m512 df89 = fft1511;
__m512 df82 = fft1432;
__m512 df90 = fft1512;
__m512 df83 = fft1412;
__m512 df91 = fft1495;
__m512 df84 = fft1413;
__m512 df92 = fft1496;
__m512 df85 = fft1414;
__m512 df93 = fft1497;
__m512 df86 = fft1415;
__m512 df94 = fft1498;
__m512 df87 = fft1416;
__m512 df95 = fft1499;
__m512 df88 = fft1417;
__m512 df96 = fft1500;
__m512i eo9 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df83 = _mm512_permutexvar_ps(eo9, df83);
df84 = _mm512_permutexvar_ps(eo9, df84);
_mm512_mask_storeu_ps(dfPtr1+5376+21504*i6+5376*j2+384*k3+128*m9+32*f10, 255, df83);
_mm512_mask_storeu_ps(dfPtr1+5440+21504*i6+5376*j2+384*k3+128*m9+32*f10, 255, df84);
_mm512_mask_storeu_ps(dfPtr1+91360+21504*i6+5376*j2+384*k3+128*m9+32*f10, 65280, df83);
_mm512_mask_storeu_ps(dfPtr1+91424+21504*i6+5376*j2+384*k3+128*m9+32*f10, 65280, df84);
df91 = _mm512_permutexvar_ps(eo9, df91);
df92 = _mm512_permutexvar_ps(eo9, df92);
_mm512_mask_storeu_ps(dfPtr1+177408+21504*i6+5376*j2+384*k3+128*m9+32*f10, 255, df91);
_mm512_mask_storeu_ps(dfPtr1+177472+21504*i6+5376*j2+384*k3+128*m9+32*f10, 255, df92);
_mm512_mask_storeu_ps(dfPtr1+263392+21504*i6+5376*j2+384*k3+128*m9+32*f10, 65280, df91);
_mm512_mask_storeu_ps(dfPtr1+263456+21504*i6+5376*j2+384*k3+128*m9+32*f10, 65280, df92);
df85 = _mm512_permutexvar_ps(eo9, df85);
df86 = _mm512_permutexvar_ps(eo9, df86);
_mm512_mask_storeu_ps(dfPtr1+10752+21504*i6+5376*j2+384*k3+128*m9+32*f10, 255, df85);
_mm512_mask_storeu_ps(dfPtr1+10816+21504*i6+5376*j2+384*k3+128*m9+32*f10, 255, df86);
_mm512_mask_storeu_ps(dfPtr1+96736+21504*i6+5376*j2+384*k3+128*m9+32*f10, 65280, df85);
_mm512_mask_storeu_ps(dfPtr1+96800+21504*i6+5376*j2+384*k3+128*m9+32*f10, 65280, df86);
df93 = _mm512_permutexvar_ps(eo9, df93);
df94 = _mm512_permutexvar_ps(eo9, df94);
_mm512_mask_storeu_ps(dfPtr1+182784+21504*i6+5376*j2+384*k3+128*m9+32*f10, 255, df93);
_mm512_mask_storeu_ps(dfPtr1+182848+21504*i6+5376*j2+384*k3+128*m9+32*f10, 255, df94);
_mm512_mask_storeu_ps(dfPtr1+268768+21504*i6+5376*j2+384*k3+128*m9+32*f10, 65280, df93);
_mm512_mask_storeu_ps(dfPtr1+268832+21504*i6+5376*j2+384*k3+128*m9+32*f10, 65280, df94);
df87 = _mm512_permutexvar_ps(eo9, df87);
df88 = _mm512_permutexvar_ps(eo9, df88);
_mm512_mask_storeu_ps(dfPtr1+16128+21504*i6+5376*j2+384*k3+128*m9+32*f10, 255, df87);
_mm512_mask_storeu_ps(dfPtr1+16192+21504*i6+5376*j2+384*k3+128*m9+32*f10, 255, df88);
_mm512_mask_storeu_ps(dfPtr1+102112+21504*i6+5376*j2+384*k3+128*m9+32*f10, 65280, df87);
_mm512_mask_storeu_ps(dfPtr1+102176+21504*i6+5376*j2+384*k3+128*m9+32*f10, 65280, df88);
df95 = _mm512_permutexvar_ps(eo9, df95);
df96 = _mm512_permutexvar_ps(eo9, df96);
_mm512_mask_storeu_ps(dfPtr1+188160+21504*i6+5376*j2+384*k3+128*m9+32*f10, 255, df95);
_mm512_mask_storeu_ps(dfPtr1+188224+21504*i6+5376*j2+384*k3+128*m9+32*f10, 255, df96);
_mm512_mask_storeu_ps(dfPtr1+274144+21504*i6+5376*j2+384*k3+128*m9+32*f10, 65280, df95);
_mm512_mask_storeu_ps(dfPtr1+274208+21504*i6+5376*j2+384*k3+128*m9+32*f10, 65280, df96);
_mm512_mask_storeu_ps(dfPtr1+0+21504*i6+5376*j2+384*k3+128*m9+32*f10, 255, df81);
_mm512_mask_storeu_ps(dfPtr1+64+21504*i6+5376*j2+384*k3+128*m9+32*f10, 255, df82);
_mm512_mask_storeu_ps(dfPtr1+85984+21504*i6+5376*j2+384*k3+128*m9+32*f10, 65280, df81);
_mm512_mask_storeu_ps(dfPtr1+86048+21504*i6+5376*j2+384*k3+128*m9+32*f10, 65280, df82);
_mm512_mask_storeu_ps(dfPtr1+172032+21504*i6+5376*j2+384*k3+128*m9+32*f10, 255, df89);
_mm512_mask_storeu_ps(dfPtr1+172096+21504*i6+5376*j2+384*k3+128*m9+32*f10, 255, df90);
_mm512_mask_storeu_ps(dfPtr1+258016+21504*i6+5376*j2+384*k3+128*m9+32*f10, 65280, df89);
_mm512_mask_storeu_ps(dfPtr1+258080+21504*i6+5376*j2+384*k3+128*m9+32*f10, 65280, df90);
}
++j2;
}
}

static void Example8StriderArrangeDats1(Example8ThreaderTeam1* team15, char** tensors3) {
Example8ThreaderTask1 task7;
task7.callee1 = Example8StriderArrangeDats1Callee1;
task7.any1 = tensors3;
task7.nd1 = 4;
task7.hull1[0] = 1;
task7.hull1[1] = 1;
task7.hull1[2] = 2;
task7.hull1[3] = 1;
Example8ThreaderDo1(team15, &task7);
}

static void Example8StriderProduceSums1Callee1(Example8ThreaderTask1* task8, int64_t* pt9) {
void** tuple2 = task8->any1;
char** tensors6 = tuple2[0];
ptrdiff_t e3 = 0;
ptrdiff_t z2 = (ptrdiff_t)tuple2[2];
ptrdiff_t g4 = pt9[3];
ptrdiff_t p1 = pt9[2];
ptrdiff_t d1 = 0;
ptrdiff_t w2 = 0;
if (__builtin_expect(!(e3|z2), 0)) {
z2 = 0;
char*restrict bfPtr2 = tensors6[0]+992*e3;
char*restrict wfPtr2 = tensors6[0]+1024+50282496*e3+444416*z2;
char*restrict dfPtr2 = tensors6[1]+9732096*e3+86016*z2;
char*restrict sfPtr1 = tensors6[2];
ptrdiff_t i7 = 1*g4;
ptrdiff_t j3 = 2*p1;
ptrdiff_t jj2 = j3+1;
if (__builtin_expect(!j3, 0)) {
ptrdiff_t k4 = 1*d1;
for (; k4 != 1; ++k4) {
ptrdiff_t l1 = 16*w2;
for (; l1 != 15; ++l1) {
__m512 sfRe1 = _mm512_setzero_ps();
__m512 sfIm1 = _mm512_setzero_ps();
__m512 sfRe7 = _mm512_setzero_ps();
__m512 sfIm7 = _mm512_setzero_ps();
sfRe1 = _mm512_mask_mov_ps(sfRe1, 1, _mm512_set1_ps(*(float*)(bfPtr2+0+248*i7+16*l1)));
sfRe1 = _mm512_mask_mov_ps(sfRe1, 256, _mm512_set1_ps(*(float*)(bfPtr2+4+248*i7+16*l1)));
sfRe7 = _mm512_mask_mov_ps(sfRe7, 1, _mm512_set1_ps(*(float*)(bfPtr2+8+248*i7+16*l1)));
sfRe7 = _mm512_mask_mov_ps(sfRe7, 256, _mm512_set1_ps(*(float*)(bfPtr2+12+248*i7+16*l1)));
__m512 sfRe2 = sfRe1;
__m512 sfIm2 = sfIm1;
__m512 sfRe3 = sfRe1;
__m512 sfIm3 = sfIm1;
__m512 sfRe4 = sfRe1;
__m512 sfIm4 = sfIm1;
__m512 sfRe5 = sfRe1;
__m512 sfIm5 = sfIm1;
__m512 sfRe6 = sfRe1;
__m512 sfIm6 = sfIm1;
__m512 sfRe8 = sfRe7;
__m512 sfIm8 = sfIm7;
__m512 sfRe9 = sfRe7;
__m512 sfIm9 = sfIm7;
__m512 sfRe10 = sfRe7;
__m512 sfIm10 = sfIm7;
__m512 sfRe11 = sfRe7;
__m512 sfIm11 = sfIm7;
__m512 sfRe12 = sfRe7;
__m512 sfIm12 = sfIm7;
for (ptrdiff_t s2 = 0; s2 < 14; ++s2) {
__m512i wfLd1 = _mm512_loadu_si512(wfPtr2+0+111104*i7+27776*j3+1792*l1+128*s2);
__m512 wfRe1 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd1));
__m512 wfIm1 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd1, 1));
__m512 wfMx1 = _mm512_mask_mov_ps(wfIm1, 64764, wfRe1);
__m512i wfLd2 = _mm512_loadu_si512(wfPtr2+64+111104*i7+27776*j3+1792*l1+128*s2);
__m512 wfRe2 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd2));
__m512 wfIm2 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd2, 1));
__m512 wfMx2 = _mm512_mask_mov_ps(wfIm2, 64764, wfRe2);
__m512 dfRe1 = _mm512_loadu_ps(dfPtr2+0+21504*i7+5376*j3+5376*k4+384*s2);
__m512 dfIm1 = _mm512_loadu_ps(dfPtr2+64+21504*i7+5376*j3+5376*k4+384*s2);
sfRe1 = _mm512_fmadd_ps(wfRe1, dfRe1, sfRe1);
sfRe1 = _mm512_mask3_fmadd_ps(wfIm1, dfIm1, sfRe1, 64764);
sfIm1 = _mm512_fmadd_ps(wfMx1, dfIm1, sfIm1);
sfIm1 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe1, sfIm1, 64764);
sfRe7 = _mm512_fmadd_ps(wfRe2, dfRe1, sfRe7);
sfRe7 = _mm512_mask3_fmadd_ps(wfIm2, dfIm1, sfRe7, 64764);
sfIm7 = _mm512_fmadd_ps(wfMx2, dfIm1, sfIm7);
sfIm7 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe1, sfIm7, 64764);
dfRe1 = _mm512_shuffle_f32x4(dfRe1, dfRe1, 78);
dfIm1 = _mm512_shuffle_f32x4(dfIm1, dfIm1, 78);
sfRe2 = _mm512_fmadd_ps(wfRe1, dfRe1, sfRe2);
sfRe2 = _mm512_mask3_fmadd_ps(wfIm1, dfIm1, sfRe2, 64764);
sfIm2 = _mm512_fmadd_ps(wfMx1, dfIm1, sfIm2);
sfIm2 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe1, sfIm2, 64764);
sfRe8 = _mm512_fmadd_ps(wfRe2, dfRe1, sfRe8);
sfRe8 = _mm512_mask3_fmadd_ps(wfIm2, dfIm1, sfRe8, 64764);
sfIm8 = _mm512_fmadd_ps(wfMx2, dfIm1, sfIm8);
sfIm8 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe1, sfIm8, 64764);
__m512 dfRe2 = _mm512_loadu_ps(dfPtr2+128+21504*i7+5376*j3+5376*k4+384*s2);
__m512 dfIm2 = _mm512_loadu_ps(dfPtr2+192+21504*i7+5376*j3+5376*k4+384*s2);
sfRe3 = _mm512_fmadd_ps(wfRe1, dfRe2, sfRe3);
sfRe3 = _mm512_mask3_fmadd_ps(wfIm1, dfIm2, sfRe3, 64764);
sfIm3 = _mm512_fmadd_ps(wfMx1, dfIm2, sfIm3);
sfIm3 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe2, sfIm3, 64764);
sfRe9 = _mm512_fmadd_ps(wfRe2, dfRe2, sfRe9);
sfRe9 = _mm512_mask3_fmadd_ps(wfIm2, dfIm2, sfRe9, 64764);
sfIm9 = _mm512_fmadd_ps(wfMx2, dfIm2, sfIm9);
sfIm9 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe2, sfIm9, 64764);
dfRe2 = _mm512_shuffle_f32x4(dfRe2, dfRe2, 78);
dfIm2 = _mm512_shuffle_f32x4(dfIm2, dfIm2, 78);
sfRe4 = _mm512_fmadd_ps(wfRe1, dfRe2, sfRe4);
sfRe4 = _mm512_mask3_fmadd_ps(wfIm1, dfIm2, sfRe4, 64764);
sfIm4 = _mm512_fmadd_ps(wfMx1, dfIm2, sfIm4);
sfIm4 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe2, sfIm4, 64764);
sfRe10 = _mm512_fmadd_ps(wfRe2, dfRe2, sfRe10);
sfRe10 = _mm512_mask3_fmadd_ps(wfIm2, dfIm2, sfRe10, 64764);
sfIm10 = _mm512_fmadd_ps(wfMx2, dfIm2, sfIm10);
sfIm10 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe2, sfIm10, 64764);
__m512 dfRe3 = _mm512_loadu_ps(dfPtr2+256+21504*i7+5376*j3+5376*k4+384*s2);
__m512 dfIm3 = _mm512_loadu_ps(dfPtr2+320+21504*i7+5376*j3+5376*k4+384*s2);
sfRe5 = _mm512_fmadd_ps(wfRe1, dfRe3, sfRe5);
sfRe5 = _mm512_mask3_fmadd_ps(wfIm1, dfIm3, sfRe5, 64764);
sfIm5 = _mm512_fmadd_ps(wfMx1, dfIm3, sfIm5);
sfIm5 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe3, sfIm5, 64764);
sfRe11 = _mm512_fmadd_ps(wfRe2, dfRe3, sfRe11);
sfRe11 = _mm512_mask3_fmadd_ps(wfIm2, dfIm3, sfRe11, 64764);
sfIm11 = _mm512_fmadd_ps(wfMx2, dfIm3, sfIm11);
sfIm11 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe3, sfIm11, 64764);
dfRe3 = _mm512_shuffle_f32x4(dfRe3, dfRe3, 78);
dfIm3 = _mm512_shuffle_f32x4(dfIm3, dfIm3, 78);
sfRe6 = _mm512_fmadd_ps(wfRe1, dfRe3, sfRe6);
sfRe6 = _mm512_mask3_fmadd_ps(wfIm1, dfIm3, sfRe6, 64764);
sfIm6 = _mm512_fmadd_ps(wfMx1, dfIm3, sfIm6);
sfIm6 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe3, sfIm6, 64764);
sfRe12 = _mm512_fmadd_ps(wfRe2, dfRe3, sfRe12);
sfRe12 = _mm512_mask3_fmadd_ps(wfIm2, dfIm3, sfRe12, 64764);
sfIm12 = _mm512_fmadd_ps(wfMx2, dfIm3, sfIm12);
sfIm12 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe3, sfIm12, 64764);
}
_mm512_storeu_ps(sfPtr1+0+93696*i7+23424*j3+23424*k4+1536*l1, sfRe1);
_mm512_storeu_ps(sfPtr1+64+93696*i7+23424*j3+23424*k4+1536*l1, sfIm1);
_mm512_storeu_ps(sfPtr1+128+93696*i7+23424*j3+23424*k4+1536*l1, sfRe2);
_mm512_storeu_ps(sfPtr1+192+93696*i7+23424*j3+23424*k4+1536*l1, sfIm2);
_mm512_storeu_ps(sfPtr1+256+93696*i7+23424*j3+23424*k4+1536*l1, sfRe3);
_mm512_storeu_ps(sfPtr1+320+93696*i7+23424*j3+23424*k4+1536*l1, sfIm3);
_mm512_storeu_ps(sfPtr1+384+93696*i7+23424*j3+23424*k4+1536*l1, sfRe4);
_mm512_storeu_ps(sfPtr1+448+93696*i7+23424*j3+23424*k4+1536*l1, sfIm4);
_mm512_storeu_ps(sfPtr1+512+93696*i7+23424*j3+23424*k4+1536*l1, sfRe5);
_mm512_storeu_ps(sfPtr1+576+93696*i7+23424*j3+23424*k4+1536*l1, sfIm5);
_mm512_storeu_ps(sfPtr1+640+93696*i7+23424*j3+23424*k4+1536*l1, sfRe6);
_mm512_storeu_ps(sfPtr1+704+93696*i7+23424*j3+23424*k4+1536*l1, sfIm6);
_mm512_storeu_ps(sfPtr1+768+93696*i7+23424*j3+23424*k4+1536*l1, sfRe7);
_mm512_storeu_ps(sfPtr1+832+93696*i7+23424*j3+23424*k4+1536*l1, sfIm7);
_mm512_storeu_ps(sfPtr1+896+93696*i7+23424*j3+23424*k4+1536*l1, sfRe8);
_mm512_storeu_ps(sfPtr1+960+93696*i7+23424*j3+23424*k4+1536*l1, sfIm8);
_mm512_storeu_ps(sfPtr1+1024+93696*i7+23424*j3+23424*k4+1536*l1, sfRe9);
_mm512_storeu_ps(sfPtr1+1088+93696*i7+23424*j3+23424*k4+1536*l1, sfIm9);
_mm512_storeu_ps(sfPtr1+1152+93696*i7+23424*j3+23424*k4+1536*l1, sfRe10);
_mm512_storeu_ps(sfPtr1+1216+93696*i7+23424*j3+23424*k4+1536*l1, sfIm10);
_mm512_storeu_ps(sfPtr1+1280+93696*i7+23424*j3+23424*k4+1536*l1, sfRe11);
_mm512_storeu_ps(sfPtr1+1344+93696*i7+23424*j3+23424*k4+1536*l1, sfIm11);
_mm512_storeu_ps(sfPtr1+1408+93696*i7+23424*j3+23424*k4+1536*l1, sfRe12);
_mm512_storeu_ps(sfPtr1+1472+93696*i7+23424*j3+23424*k4+1536*l1, sfIm12);
}
__m512 sfRe13 = _mm512_setzero_ps();
__m512 sfIm13 = _mm512_setzero_ps();
sfRe13 = _mm512_mask_mov_ps(sfRe13, 257, _mm512_set1_ps(*(float*)(bfPtr2+0+248*i7+16*l1)));
__m512 sfRe14 = sfRe13;
__m512 sfIm14 = sfIm13;
__m512 sfRe15 = sfRe13;
__m512 sfIm15 = sfIm13;
for (ptrdiff_t s3 = 0; s3 < 14; ++s3) {
__m512i wfLd3 = _mm512_loadu_si512(wfPtr2+0+111104*i7+27776*j3+1792*l1+64*s3);
__m512 wfRe3 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd3));
__m512 wfIm3 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd3, 1));
__m512 wfMx3 = _mm512_mask_mov_ps(wfIm3, 64764, wfRe3);
__m512 dfRe4 = _mm512_loadu_ps(dfPtr2+0+21504*i7+5376*j3+5376*k4+384*s3);
__m512 dfIm4 = _mm512_loadu_ps(dfPtr2+64+21504*i7+5376*j3+5376*k4+384*s3);
sfRe13 = _mm512_fmadd_ps(wfRe3, dfRe4, sfRe13);
sfRe13 = _mm512_mask3_fmadd_ps(wfIm3, dfIm4, sfRe13, 64764);
sfIm13 = _mm512_fmadd_ps(wfMx3, dfIm4, sfIm13);
sfIm13 = _mm512_mask3_fnmadd_ps(wfIm3, dfRe4, sfIm13, 64764);
__m512 dfRe5 = _mm512_loadu_ps(dfPtr2+128+21504*i7+5376*j3+5376*k4+384*s3);
__m512 dfIm5 = _mm512_loadu_ps(dfPtr2+192+21504*i7+5376*j3+5376*k4+384*s3);
sfRe14 = _mm512_fmadd_ps(wfRe3, dfRe5, sfRe14);
sfRe14 = _mm512_mask3_fmadd_ps(wfIm3, dfIm5, sfRe14, 64764);
sfIm14 = _mm512_fmadd_ps(wfMx3, dfIm5, sfIm14);
sfIm14 = _mm512_mask3_fnmadd_ps(wfIm3, dfRe5, sfIm14, 64764);
__m512 dfRe6 = _mm512_loadu_ps(dfPtr2+256+21504*i7+5376*j3+5376*k4+384*s3);
__m512 dfIm6 = _mm512_loadu_ps(dfPtr2+320+21504*i7+5376*j3+5376*k4+384*s3);
sfRe15 = _mm512_fmadd_ps(wfRe3, dfRe6, sfRe15);
sfRe15 = _mm512_mask3_fmadd_ps(wfIm3, dfIm6, sfRe15, 64764);
sfIm15 = _mm512_fmadd_ps(wfMx3, dfIm6, sfIm15);
sfIm15 = _mm512_mask3_fnmadd_ps(wfIm3, dfRe6, sfIm15, 64764);
}
_mm512_storeu_ps(sfPtr1+0+93696*i7+23424*j3+23424*k4+1536*l1, sfRe13);
_mm512_storeu_ps(sfPtr1+64+93696*i7+23424*j3+23424*k4+1536*l1, sfIm13);
_mm512_storeu_ps(sfPtr1+128+93696*i7+23424*j3+23424*k4+1536*l1, sfRe14);
_mm512_storeu_ps(sfPtr1+192+93696*i7+23424*j3+23424*k4+1536*l1, sfIm14);
_mm512_storeu_ps(sfPtr1+256+93696*i7+23424*j3+23424*k4+1536*l1, sfRe15);
_mm512_storeu_ps(sfPtr1+320+93696*i7+23424*j3+23424*k4+1536*l1, sfIm15);
}
j3 = 1;
}
for (; j3 <= jj2; ++j3) {
ptrdiff_t k5 = 1*d1;
for (; k5 != 1; ++k5) {
ptrdiff_t l2 = 16*w2;
for (; l2 != 15; ++l2) {
__m512 sfRe16 = _mm512_setzero_ps();
__m512 sfIm16 = _mm512_setzero_ps();
__m512 sfRe22 = _mm512_setzero_ps();
__m512 sfIm22 = _mm512_setzero_ps();
(void)bfPtr2;
__m512 sfRe17 = sfRe16;
__m512 sfIm17 = sfIm16;
__m512 sfRe18 = sfRe16;
__m512 sfIm18 = sfIm16;
__m512 sfRe19 = sfRe16;
__m512 sfIm19 = sfIm16;
__m512 sfRe20 = sfRe16;
__m512 sfIm20 = sfIm16;
__m512 sfRe21 = sfRe16;
__m512 sfIm21 = sfIm16;
__m512 sfRe23 = sfRe22;
__m512 sfIm23 = sfIm22;
__m512 sfRe24 = sfRe22;
__m512 sfIm24 = sfIm22;
__m512 sfRe25 = sfRe22;
__m512 sfIm25 = sfIm22;
__m512 sfRe26 = sfRe22;
__m512 sfIm26 = sfIm22;
__m512 sfRe27 = sfRe22;
__m512 sfIm27 = sfIm22;
for (ptrdiff_t s4 = 0; s4 < 14; ++s4) {
__m512i wfLd4 = _mm512_loadu_si512(wfPtr2+0+111104*i7+27776*j3+1792*l2+128*s4);
__m512 wfRe4 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd4));
__m512 wfIm4 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd4, 1));
__m512i wfLd5 = _mm512_loadu_si512(wfPtr2+64+111104*i7+27776*j3+1792*l2+128*s4);
__m512 wfRe5 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd5));
__m512 wfIm5 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd5, 1));
__m512 dfRe7 = _mm512_loadu_ps(dfPtr2+0+21504*i7+5376*j3+5376*k5+384*s4);
__m512 dfIm7 = _mm512_loadu_ps(dfPtr2+64+21504*i7+5376*j3+5376*k5+384*s4);
sfRe16 = _mm512_fmadd_ps(wfRe4, dfRe7, sfRe16);
sfRe16 = _mm512_fmadd_ps(wfIm4, dfIm7, sfRe16);
sfIm16 = _mm512_fmadd_ps(wfRe4, dfIm7, sfIm16);
sfIm16 = _mm512_fnmadd_ps(wfIm4, dfRe7, sfIm16);
sfRe22 = _mm512_fmadd_ps(wfRe5, dfRe7, sfRe22);
sfRe22 = _mm512_fmadd_ps(wfIm5, dfIm7, sfRe22);
sfIm22 = _mm512_fmadd_ps(wfRe5, dfIm7, sfIm22);
sfIm22 = _mm512_fnmadd_ps(wfIm5, dfRe7, sfIm22);
dfRe7 = _mm512_shuffle_f32x4(dfRe7, dfRe7, 78);
dfIm7 = _mm512_shuffle_f32x4(dfIm7, dfIm7, 78);
sfRe17 = _mm512_fmadd_ps(wfRe4, dfRe7, sfRe17);
sfRe17 = _mm512_fmadd_ps(wfIm4, dfIm7, sfRe17);
sfIm17 = _mm512_fmadd_ps(wfRe4, dfIm7, sfIm17);
sfIm17 = _mm512_fnmadd_ps(wfIm4, dfRe7, sfIm17);
sfRe23 = _mm512_fmadd_ps(wfRe5, dfRe7, sfRe23);
sfRe23 = _mm512_fmadd_ps(wfIm5, dfIm7, sfRe23);
sfIm23 = _mm512_fmadd_ps(wfRe5, dfIm7, sfIm23);
sfIm23 = _mm512_fnmadd_ps(wfIm5, dfRe7, sfIm23);
__m512 dfRe8 = _mm512_loadu_ps(dfPtr2+128+21504*i7+5376*j3+5376*k5+384*s4);
__m512 dfIm8 = _mm512_loadu_ps(dfPtr2+192+21504*i7+5376*j3+5376*k5+384*s4);
sfRe18 = _mm512_fmadd_ps(wfRe4, dfRe8, sfRe18);
sfRe18 = _mm512_fmadd_ps(wfIm4, dfIm8, sfRe18);
sfIm18 = _mm512_fmadd_ps(wfRe4, dfIm8, sfIm18);
sfIm18 = _mm512_fnmadd_ps(wfIm4, dfRe8, sfIm18);
sfRe24 = _mm512_fmadd_ps(wfRe5, dfRe8, sfRe24);
sfRe24 = _mm512_fmadd_ps(wfIm5, dfIm8, sfRe24);
sfIm24 = _mm512_fmadd_ps(wfRe5, dfIm8, sfIm24);
sfIm24 = _mm512_fnmadd_ps(wfIm5, dfRe8, sfIm24);
dfRe8 = _mm512_shuffle_f32x4(dfRe8, dfRe8, 78);
dfIm8 = _mm512_shuffle_f32x4(dfIm8, dfIm8, 78);
sfRe19 = _mm512_fmadd_ps(wfRe4, dfRe8, sfRe19);
sfRe19 = _mm512_fmadd_ps(wfIm4, dfIm8, sfRe19);
sfIm19 = _mm512_fmadd_ps(wfRe4, dfIm8, sfIm19);
sfIm19 = _mm512_fnmadd_ps(wfIm4, dfRe8, sfIm19);
sfRe25 = _mm512_fmadd_ps(wfRe5, dfRe8, sfRe25);
sfRe25 = _mm512_fmadd_ps(wfIm5, dfIm8, sfRe25);
sfIm25 = _mm512_fmadd_ps(wfRe5, dfIm8, sfIm25);
sfIm25 = _mm512_fnmadd_ps(wfIm5, dfRe8, sfIm25);
__m512 dfRe9 = _mm512_loadu_ps(dfPtr2+256+21504*i7+5376*j3+5376*k5+384*s4);
__m512 dfIm9 = _mm512_loadu_ps(dfPtr2+320+21504*i7+5376*j3+5376*k5+384*s4);
sfRe20 = _mm512_fmadd_ps(wfRe4, dfRe9, sfRe20);
sfRe20 = _mm512_fmadd_ps(wfIm4, dfIm9, sfRe20);
sfIm20 = _mm512_fmadd_ps(wfRe4, dfIm9, sfIm20);
sfIm20 = _mm512_fnmadd_ps(wfIm4, dfRe9, sfIm20);
sfRe26 = _mm512_fmadd_ps(wfRe5, dfRe9, sfRe26);
sfRe26 = _mm512_fmadd_ps(wfIm5, dfIm9, sfRe26);
sfIm26 = _mm512_fmadd_ps(wfRe5, dfIm9, sfIm26);
sfIm26 = _mm512_fnmadd_ps(wfIm5, dfRe9, sfIm26);
dfRe9 = _mm512_shuffle_f32x4(dfRe9, dfRe9, 78);
dfIm9 = _mm512_shuffle_f32x4(dfIm9, dfIm9, 78);
sfRe21 = _mm512_fmadd_ps(wfRe4, dfRe9, sfRe21);
sfRe21 = _mm512_fmadd_ps(wfIm4, dfIm9, sfRe21);
sfIm21 = _mm512_fmadd_ps(wfRe4, dfIm9, sfIm21);
sfIm21 = _mm512_fnmadd_ps(wfIm4, dfRe9, sfIm21);
sfRe27 = _mm512_fmadd_ps(wfRe5, dfRe9, sfRe27);
sfRe27 = _mm512_fmadd_ps(wfIm5, dfIm9, sfRe27);
sfIm27 = _mm512_fmadd_ps(wfRe5, dfIm9, sfIm27);
sfIm27 = _mm512_fnmadd_ps(wfIm5, dfRe9, sfIm27);
}
_mm512_storeu_ps(sfPtr1+0+93696*i7+23424*j3+23424*k5+1536*l2, sfRe16);
_mm512_storeu_ps(sfPtr1+64+93696*i7+23424*j3+23424*k5+1536*l2, sfIm16);
_mm512_storeu_ps(sfPtr1+128+93696*i7+23424*j3+23424*k5+1536*l2, sfRe17);
_mm512_storeu_ps(sfPtr1+192+93696*i7+23424*j3+23424*k5+1536*l2, sfIm17);
_mm512_storeu_ps(sfPtr1+256+93696*i7+23424*j3+23424*k5+1536*l2, sfRe18);
_mm512_storeu_ps(sfPtr1+320+93696*i7+23424*j3+23424*k5+1536*l2, sfIm18);
_mm512_storeu_ps(sfPtr1+384+93696*i7+23424*j3+23424*k5+1536*l2, sfRe19);
_mm512_storeu_ps(sfPtr1+448+93696*i7+23424*j3+23424*k5+1536*l2, sfIm19);
_mm512_storeu_ps(sfPtr1+512+93696*i7+23424*j3+23424*k5+1536*l2, sfRe20);
_mm512_storeu_ps(sfPtr1+576+93696*i7+23424*j3+23424*k5+1536*l2, sfIm20);
_mm512_storeu_ps(sfPtr1+640+93696*i7+23424*j3+23424*k5+1536*l2, sfRe21);
_mm512_storeu_ps(sfPtr1+704+93696*i7+23424*j3+23424*k5+1536*l2, sfIm21);
_mm512_storeu_ps(sfPtr1+768+93696*i7+23424*j3+23424*k5+1536*l2, sfRe22);
_mm512_storeu_ps(sfPtr1+832+93696*i7+23424*j3+23424*k5+1536*l2, sfIm22);
_mm512_storeu_ps(sfPtr1+896+93696*i7+23424*j3+23424*k5+1536*l2, sfRe23);
_mm512_storeu_ps(sfPtr1+960+93696*i7+23424*j3+23424*k5+1536*l2, sfIm23);
_mm512_storeu_ps(sfPtr1+1024+93696*i7+23424*j3+23424*k5+1536*l2, sfRe24);
_mm512_storeu_ps(sfPtr1+1088+93696*i7+23424*j3+23424*k5+1536*l2, sfIm24);
_mm512_storeu_ps(sfPtr1+1152+93696*i7+23424*j3+23424*k5+1536*l2, sfRe25);
_mm512_storeu_ps(sfPtr1+1216+93696*i7+23424*j3+23424*k5+1536*l2, sfIm25);
_mm512_storeu_ps(sfPtr1+1280+93696*i7+23424*j3+23424*k5+1536*l2, sfRe26);
_mm512_storeu_ps(sfPtr1+1344+93696*i7+23424*j3+23424*k5+1536*l2, sfIm26);
_mm512_storeu_ps(sfPtr1+1408+93696*i7+23424*j3+23424*k5+1536*l2, sfRe27);
_mm512_storeu_ps(sfPtr1+1472+93696*i7+23424*j3+23424*k5+1536*l2, sfIm27);
}
__m512 sfRe28 = _mm512_setzero_ps();
__m512 sfIm28 = _mm512_setzero_ps();
(void)bfPtr2;
__m512 sfRe29 = sfRe28;
__m512 sfIm29 = sfIm28;
__m512 sfRe30 = sfRe28;
__m512 sfIm30 = sfIm28;
for (ptrdiff_t s5 = 0; s5 < 14; ++s5) {
__m512i wfLd6 = _mm512_loadu_si512(wfPtr2+0+111104*i7+27776*j3+1792*l2+64*s5);
__m512 wfRe6 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd6));
__m512 wfIm6 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd6, 1));
__m512 dfRe10 = _mm512_loadu_ps(dfPtr2+0+21504*i7+5376*j3+5376*k5+384*s5);
__m512 dfIm10 = _mm512_loadu_ps(dfPtr2+64+21504*i7+5376*j3+5376*k5+384*s5);
sfRe28 = _mm512_fmadd_ps(wfRe6, dfRe10, sfRe28);
sfRe28 = _mm512_fmadd_ps(wfIm6, dfIm10, sfRe28);
sfIm28 = _mm512_fmadd_ps(wfRe6, dfIm10, sfIm28);
sfIm28 = _mm512_fnmadd_ps(wfIm6, dfRe10, sfIm28);
__m512 dfRe11 = _mm512_loadu_ps(dfPtr2+128+21504*i7+5376*j3+5376*k5+384*s5);
__m512 dfIm11 = _mm512_loadu_ps(dfPtr2+192+21504*i7+5376*j3+5376*k5+384*s5);
sfRe29 = _mm512_fmadd_ps(wfRe6, dfRe11, sfRe29);
sfRe29 = _mm512_fmadd_ps(wfIm6, dfIm11, sfRe29);
sfIm29 = _mm512_fmadd_ps(wfRe6, dfIm11, sfIm29);
sfIm29 = _mm512_fnmadd_ps(wfIm6, dfRe11, sfIm29);
__m512 dfRe12 = _mm512_loadu_ps(dfPtr2+256+21504*i7+5376*j3+5376*k5+384*s5);
__m512 dfIm12 = _mm512_loadu_ps(dfPtr2+320+21504*i7+5376*j3+5376*k5+384*s5);
sfRe30 = _mm512_fmadd_ps(wfRe6, dfRe12, sfRe30);
sfRe30 = _mm512_fmadd_ps(wfIm6, dfIm12, sfRe30);
sfIm30 = _mm512_fmadd_ps(wfRe6, dfIm12, sfIm30);
sfIm30 = _mm512_fnmadd_ps(wfIm6, dfRe12, sfIm30);
}
_mm512_storeu_ps(sfPtr1+0+93696*i7+23424*j3+23424*k5+1536*l2, sfRe28);
_mm512_storeu_ps(sfPtr1+64+93696*i7+23424*j3+23424*k5+1536*l2, sfIm28);
_mm512_storeu_ps(sfPtr1+128+93696*i7+23424*j3+23424*k5+1536*l2, sfRe29);
_mm512_storeu_ps(sfPtr1+192+93696*i7+23424*j3+23424*k5+1536*l2, sfIm29);
_mm512_storeu_ps(sfPtr1+256+93696*i7+23424*j3+23424*k5+1536*l2, sfRe30);
_mm512_storeu_ps(sfPtr1+320+93696*i7+23424*j3+23424*k5+1536*l2, sfIm30);
}
}
return;
}
char*restrict bfPtr3 = tensors6[0]+992*e3;
char*restrict wfPtr3 = tensors6[0]+1024+50282496*e3+444416*z2;
char*restrict dfPtr3 = tensors6[1]+9732096*e3+86016*z2;
char*restrict sfPtr2 = tensors6[2];
ptrdiff_t i8 = 1*g4;
ptrdiff_t j4 = 2*p1;
ptrdiff_t jj3 = j4+1;
if (__builtin_expect(!j4, 0)) {
ptrdiff_t k6 = 1*d1;
for (; k6 != 1; ++k6) {
ptrdiff_t l3 = 16*w2;
for (; l3 != 15; ++l3) {
__m512 sfRe31 = _mm512_setzero_ps();
__m512 sfIm31 = _mm512_setzero_ps();
__m512 sfRe37 = _mm512_setzero_ps();
__m512 sfIm37 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe32 = sfRe31;
__m512 sfIm32 = sfIm31;
__m512 sfRe33 = sfRe31;
__m512 sfIm33 = sfIm31;
__m512 sfRe34 = sfRe31;
__m512 sfIm34 = sfIm31;
__m512 sfRe35 = sfRe31;
__m512 sfIm35 = sfIm31;
__m512 sfRe36 = sfRe31;
__m512 sfIm36 = sfIm31;
__m512 sfRe38 = sfRe37;
__m512 sfIm38 = sfIm37;
__m512 sfRe39 = sfRe37;
__m512 sfIm39 = sfIm37;
__m512 sfRe40 = sfRe37;
__m512 sfIm40 = sfIm37;
__m512 sfRe41 = sfRe37;
__m512 sfIm41 = sfIm37;
__m512 sfRe42 = sfRe37;
__m512 sfIm42 = sfIm37;
for (ptrdiff_t s6 = 0; s6 < 14; ++s6) {
__m512i wfLd7 = _mm512_loadu_si512(wfPtr3+0+111104*i8+27776*j4+1792*l3+128*s6);
__m512 wfRe7 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd7));
__m512 wfIm7 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd7, 1));
__m512 wfMx4 = _mm512_mask_mov_ps(wfIm7, 64764, wfRe7);
__m512i wfLd8 = _mm512_loadu_si512(wfPtr3+64+111104*i8+27776*j4+1792*l3+128*s6);
__m512 wfRe8 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd8));
__m512 wfIm8 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd8, 1));
__m512 wfMx5 = _mm512_mask_mov_ps(wfIm8, 64764, wfRe8);
__m512 dfRe13 = _mm512_loadu_ps(dfPtr3+0+21504*i8+5376*j4+5376*k6+384*s6);
__m512 dfIm13 = _mm512_loadu_ps(dfPtr3+64+21504*i8+5376*j4+5376*k6+384*s6);
sfRe31 = _mm512_fmadd_ps(wfRe7, dfRe13, sfRe31);
sfRe31 = _mm512_mask3_fmadd_ps(wfIm7, dfIm13, sfRe31, 64764);
sfIm31 = _mm512_fmadd_ps(wfMx4, dfIm13, sfIm31);
sfIm31 = _mm512_mask3_fnmadd_ps(wfIm7, dfRe13, sfIm31, 64764);
sfRe37 = _mm512_fmadd_ps(wfRe8, dfRe13, sfRe37);
sfRe37 = _mm512_mask3_fmadd_ps(wfIm8, dfIm13, sfRe37, 64764);
sfIm37 = _mm512_fmadd_ps(wfMx5, dfIm13, sfIm37);
sfIm37 = _mm512_mask3_fnmadd_ps(wfIm8, dfRe13, sfIm37, 64764);
dfRe13 = _mm512_shuffle_f32x4(dfRe13, dfRe13, 78);
dfIm13 = _mm512_shuffle_f32x4(dfIm13, dfIm13, 78);
sfRe32 = _mm512_fmadd_ps(wfRe7, dfRe13, sfRe32);
sfRe32 = _mm512_mask3_fmadd_ps(wfIm7, dfIm13, sfRe32, 64764);
sfIm32 = _mm512_fmadd_ps(wfMx4, dfIm13, sfIm32);
sfIm32 = _mm512_mask3_fnmadd_ps(wfIm7, dfRe13, sfIm32, 64764);
sfRe38 = _mm512_fmadd_ps(wfRe8, dfRe13, sfRe38);
sfRe38 = _mm512_mask3_fmadd_ps(wfIm8, dfIm13, sfRe38, 64764);
sfIm38 = _mm512_fmadd_ps(wfMx5, dfIm13, sfIm38);
sfIm38 = _mm512_mask3_fnmadd_ps(wfIm8, dfRe13, sfIm38, 64764);
__m512 dfRe14 = _mm512_loadu_ps(dfPtr3+128+21504*i8+5376*j4+5376*k6+384*s6);
__m512 dfIm14 = _mm512_loadu_ps(dfPtr3+192+21504*i8+5376*j4+5376*k6+384*s6);
sfRe33 = _mm512_fmadd_ps(wfRe7, dfRe14, sfRe33);
sfRe33 = _mm512_mask3_fmadd_ps(wfIm7, dfIm14, sfRe33, 64764);
sfIm33 = _mm512_fmadd_ps(wfMx4, dfIm14, sfIm33);
sfIm33 = _mm512_mask3_fnmadd_ps(wfIm7, dfRe14, sfIm33, 64764);
sfRe39 = _mm512_fmadd_ps(wfRe8, dfRe14, sfRe39);
sfRe39 = _mm512_mask3_fmadd_ps(wfIm8, dfIm14, sfRe39, 64764);
sfIm39 = _mm512_fmadd_ps(wfMx5, dfIm14, sfIm39);
sfIm39 = _mm512_mask3_fnmadd_ps(wfIm8, dfRe14, sfIm39, 64764);
dfRe14 = _mm512_shuffle_f32x4(dfRe14, dfRe14, 78);
dfIm14 = _mm512_shuffle_f32x4(dfIm14, dfIm14, 78);
sfRe34 = _mm512_fmadd_ps(wfRe7, dfRe14, sfRe34);
sfRe34 = _mm512_mask3_fmadd_ps(wfIm7, dfIm14, sfRe34, 64764);
sfIm34 = _mm512_fmadd_ps(wfMx4, dfIm14, sfIm34);
sfIm34 = _mm512_mask3_fnmadd_ps(wfIm7, dfRe14, sfIm34, 64764);
sfRe40 = _mm512_fmadd_ps(wfRe8, dfRe14, sfRe40);
sfRe40 = _mm512_mask3_fmadd_ps(wfIm8, dfIm14, sfRe40, 64764);
sfIm40 = _mm512_fmadd_ps(wfMx5, dfIm14, sfIm40);
sfIm40 = _mm512_mask3_fnmadd_ps(wfIm8, dfRe14, sfIm40, 64764);
__m512 dfRe15 = _mm512_loadu_ps(dfPtr3+256+21504*i8+5376*j4+5376*k6+384*s6);
__m512 dfIm15 = _mm512_loadu_ps(dfPtr3+320+21504*i8+5376*j4+5376*k6+384*s6);
sfRe35 = _mm512_fmadd_ps(wfRe7, dfRe15, sfRe35);
sfRe35 = _mm512_mask3_fmadd_ps(wfIm7, dfIm15, sfRe35, 64764);
sfIm35 = _mm512_fmadd_ps(wfMx4, dfIm15, sfIm35);
sfIm35 = _mm512_mask3_fnmadd_ps(wfIm7, dfRe15, sfIm35, 64764);
sfRe41 = _mm512_fmadd_ps(wfRe8, dfRe15, sfRe41);
sfRe41 = _mm512_mask3_fmadd_ps(wfIm8, dfIm15, sfRe41, 64764);
sfIm41 = _mm512_fmadd_ps(wfMx5, dfIm15, sfIm41);
sfIm41 = _mm512_mask3_fnmadd_ps(wfIm8, dfRe15, sfIm41, 64764);
dfRe15 = _mm512_shuffle_f32x4(dfRe15, dfRe15, 78);
dfIm15 = _mm512_shuffle_f32x4(dfIm15, dfIm15, 78);
sfRe36 = _mm512_fmadd_ps(wfRe7, dfRe15, sfRe36);
sfRe36 = _mm512_mask3_fmadd_ps(wfIm7, dfIm15, sfRe36, 64764);
sfIm36 = _mm512_fmadd_ps(wfMx4, dfIm15, sfIm36);
sfIm36 = _mm512_mask3_fnmadd_ps(wfIm7, dfRe15, sfIm36, 64764);
sfRe42 = _mm512_fmadd_ps(wfRe8, dfRe15, sfRe42);
sfRe42 = _mm512_mask3_fmadd_ps(wfIm8, dfIm15, sfRe42, 64764);
sfIm42 = _mm512_fmadd_ps(wfMx5, dfIm15, sfIm42);
sfIm42 = _mm512_mask3_fnmadd_ps(wfIm8, dfRe15, sfIm42, 64764);
}
sfRe31 = _mm512_add_ps(sfRe31, _mm512_loadu_ps(sfPtr2+0+93696*i8+23424*j4+23424*k6+1536*l3));
sfIm31 = _mm512_add_ps(sfIm31, _mm512_loadu_ps(sfPtr2+64+93696*i8+23424*j4+23424*k6+1536*l3));
sfRe32 = _mm512_add_ps(sfRe32, _mm512_loadu_ps(sfPtr2+128+93696*i8+23424*j4+23424*k6+1536*l3));
sfIm32 = _mm512_add_ps(sfIm32, _mm512_loadu_ps(sfPtr2+192+93696*i8+23424*j4+23424*k6+1536*l3));
sfRe33 = _mm512_add_ps(sfRe33, _mm512_loadu_ps(sfPtr2+256+93696*i8+23424*j4+23424*k6+1536*l3));
sfIm33 = _mm512_add_ps(sfIm33, _mm512_loadu_ps(sfPtr2+320+93696*i8+23424*j4+23424*k6+1536*l3));
sfRe34 = _mm512_add_ps(sfRe34, _mm512_loadu_ps(sfPtr2+384+93696*i8+23424*j4+23424*k6+1536*l3));
sfIm34 = _mm512_add_ps(sfIm34, _mm512_loadu_ps(sfPtr2+448+93696*i8+23424*j4+23424*k6+1536*l3));
sfRe35 = _mm512_add_ps(sfRe35, _mm512_loadu_ps(sfPtr2+512+93696*i8+23424*j4+23424*k6+1536*l3));
sfIm35 = _mm512_add_ps(sfIm35, _mm512_loadu_ps(sfPtr2+576+93696*i8+23424*j4+23424*k6+1536*l3));
sfRe36 = _mm512_add_ps(sfRe36, _mm512_loadu_ps(sfPtr2+640+93696*i8+23424*j4+23424*k6+1536*l3));
sfIm36 = _mm512_add_ps(sfIm36, _mm512_loadu_ps(sfPtr2+704+93696*i8+23424*j4+23424*k6+1536*l3));
sfRe37 = _mm512_add_ps(sfRe37, _mm512_loadu_ps(sfPtr2+768+93696*i8+23424*j4+23424*k6+1536*l3));
sfIm37 = _mm512_add_ps(sfIm37, _mm512_loadu_ps(sfPtr2+832+93696*i8+23424*j4+23424*k6+1536*l3));
sfRe38 = _mm512_add_ps(sfRe38, _mm512_loadu_ps(sfPtr2+896+93696*i8+23424*j4+23424*k6+1536*l3));
sfIm38 = _mm512_add_ps(sfIm38, _mm512_loadu_ps(sfPtr2+960+93696*i8+23424*j4+23424*k6+1536*l3));
sfRe39 = _mm512_add_ps(sfRe39, _mm512_loadu_ps(sfPtr2+1024+93696*i8+23424*j4+23424*k6+1536*l3));
sfIm39 = _mm512_add_ps(sfIm39, _mm512_loadu_ps(sfPtr2+1088+93696*i8+23424*j4+23424*k6+1536*l3));
sfRe40 = _mm512_add_ps(sfRe40, _mm512_loadu_ps(sfPtr2+1152+93696*i8+23424*j4+23424*k6+1536*l3));
sfIm40 = _mm512_add_ps(sfIm40, _mm512_loadu_ps(sfPtr2+1216+93696*i8+23424*j4+23424*k6+1536*l3));
sfRe41 = _mm512_add_ps(sfRe41, _mm512_loadu_ps(sfPtr2+1280+93696*i8+23424*j4+23424*k6+1536*l3));
sfIm41 = _mm512_add_ps(sfIm41, _mm512_loadu_ps(sfPtr2+1344+93696*i8+23424*j4+23424*k6+1536*l3));
sfRe42 = _mm512_add_ps(sfRe42, _mm512_loadu_ps(sfPtr2+1408+93696*i8+23424*j4+23424*k6+1536*l3));
sfIm42 = _mm512_add_ps(sfIm42, _mm512_loadu_ps(sfPtr2+1472+93696*i8+23424*j4+23424*k6+1536*l3));
_mm512_storeu_ps(sfPtr2+0+93696*i8+23424*j4+23424*k6+1536*l3, sfRe31);
_mm512_storeu_ps(sfPtr2+64+93696*i8+23424*j4+23424*k6+1536*l3, sfIm31);
_mm512_storeu_ps(sfPtr2+128+93696*i8+23424*j4+23424*k6+1536*l3, sfRe32);
_mm512_storeu_ps(sfPtr2+192+93696*i8+23424*j4+23424*k6+1536*l3, sfIm32);
_mm512_storeu_ps(sfPtr2+256+93696*i8+23424*j4+23424*k6+1536*l3, sfRe33);
_mm512_storeu_ps(sfPtr2+320+93696*i8+23424*j4+23424*k6+1536*l3, sfIm33);
_mm512_storeu_ps(sfPtr2+384+93696*i8+23424*j4+23424*k6+1536*l3, sfRe34);
_mm512_storeu_ps(sfPtr2+448+93696*i8+23424*j4+23424*k6+1536*l3, sfIm34);
_mm512_storeu_ps(sfPtr2+512+93696*i8+23424*j4+23424*k6+1536*l3, sfRe35);
_mm512_storeu_ps(sfPtr2+576+93696*i8+23424*j4+23424*k6+1536*l3, sfIm35);
_mm512_storeu_ps(sfPtr2+640+93696*i8+23424*j4+23424*k6+1536*l3, sfRe36);
_mm512_storeu_ps(sfPtr2+704+93696*i8+23424*j4+23424*k6+1536*l3, sfIm36);
_mm512_storeu_ps(sfPtr2+768+93696*i8+23424*j4+23424*k6+1536*l3, sfRe37);
_mm512_storeu_ps(sfPtr2+832+93696*i8+23424*j4+23424*k6+1536*l3, sfIm37);
_mm512_storeu_ps(sfPtr2+896+93696*i8+23424*j4+23424*k6+1536*l3, sfRe38);
_mm512_storeu_ps(sfPtr2+960+93696*i8+23424*j4+23424*k6+1536*l3, sfIm38);
_mm512_storeu_ps(sfPtr2+1024+93696*i8+23424*j4+23424*k6+1536*l3, sfRe39);
_mm512_storeu_ps(sfPtr2+1088+93696*i8+23424*j4+23424*k6+1536*l3, sfIm39);
_mm512_storeu_ps(sfPtr2+1152+93696*i8+23424*j4+23424*k6+1536*l3, sfRe40);
_mm512_storeu_ps(sfPtr2+1216+93696*i8+23424*j4+23424*k6+1536*l3, sfIm40);
_mm512_storeu_ps(sfPtr2+1280+93696*i8+23424*j4+23424*k6+1536*l3, sfRe41);
_mm512_storeu_ps(sfPtr2+1344+93696*i8+23424*j4+23424*k6+1536*l3, sfIm41);
_mm512_storeu_ps(sfPtr2+1408+93696*i8+23424*j4+23424*k6+1536*l3, sfRe42);
_mm512_storeu_ps(sfPtr2+1472+93696*i8+23424*j4+23424*k6+1536*l3, sfIm42);
}
__m512 sfRe43 = _mm512_setzero_ps();
__m512 sfIm43 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe44 = sfRe43;
__m512 sfIm44 = sfIm43;
__m512 sfRe45 = sfRe43;
__m512 sfIm45 = sfIm43;
for (ptrdiff_t s7 = 0; s7 < 14; ++s7) {
__m512i wfLd9 = _mm512_loadu_si512(wfPtr3+0+111104*i8+27776*j4+1792*l3+64*s7);
__m512 wfRe9 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd9));
__m512 wfIm9 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd9, 1));
__m512 wfMx6 = _mm512_mask_mov_ps(wfIm9, 64764, wfRe9);
__m512 dfRe16 = _mm512_loadu_ps(dfPtr3+0+21504*i8+5376*j4+5376*k6+384*s7);
__m512 dfIm16 = _mm512_loadu_ps(dfPtr3+64+21504*i8+5376*j4+5376*k6+384*s7);
sfRe43 = _mm512_fmadd_ps(wfRe9, dfRe16, sfRe43);
sfRe43 = _mm512_mask3_fmadd_ps(wfIm9, dfIm16, sfRe43, 64764);
sfIm43 = _mm512_fmadd_ps(wfMx6, dfIm16, sfIm43);
sfIm43 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe16, sfIm43, 64764);
__m512 dfRe17 = _mm512_loadu_ps(dfPtr3+128+21504*i8+5376*j4+5376*k6+384*s7);
__m512 dfIm17 = _mm512_loadu_ps(dfPtr3+192+21504*i8+5376*j4+5376*k6+384*s7);
sfRe44 = _mm512_fmadd_ps(wfRe9, dfRe17, sfRe44);
sfRe44 = _mm512_mask3_fmadd_ps(wfIm9, dfIm17, sfRe44, 64764);
sfIm44 = _mm512_fmadd_ps(wfMx6, dfIm17, sfIm44);
sfIm44 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe17, sfIm44, 64764);
__m512 dfRe18 = _mm512_loadu_ps(dfPtr3+256+21504*i8+5376*j4+5376*k6+384*s7);
__m512 dfIm18 = _mm512_loadu_ps(dfPtr3+320+21504*i8+5376*j4+5376*k6+384*s7);
sfRe45 = _mm512_fmadd_ps(wfRe9, dfRe18, sfRe45);
sfRe45 = _mm512_mask3_fmadd_ps(wfIm9, dfIm18, sfRe45, 64764);
sfIm45 = _mm512_fmadd_ps(wfMx6, dfIm18, sfIm45);
sfIm45 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe18, sfIm45, 64764);
}
sfRe43 = _mm512_add_ps(sfRe43, _mm512_loadu_ps(sfPtr2+0+93696*i8+23424*j4+23424*k6+1536*l3));
sfIm43 = _mm512_add_ps(sfIm43, _mm512_loadu_ps(sfPtr2+64+93696*i8+23424*j4+23424*k6+1536*l3));
sfRe44 = _mm512_add_ps(sfRe44, _mm512_loadu_ps(sfPtr2+128+93696*i8+23424*j4+23424*k6+1536*l3));
sfIm44 = _mm512_add_ps(sfIm44, _mm512_loadu_ps(sfPtr2+192+93696*i8+23424*j4+23424*k6+1536*l3));
sfRe45 = _mm512_add_ps(sfRe45, _mm512_loadu_ps(sfPtr2+256+93696*i8+23424*j4+23424*k6+1536*l3));
sfIm45 = _mm512_add_ps(sfIm45, _mm512_loadu_ps(sfPtr2+320+93696*i8+23424*j4+23424*k6+1536*l3));
_mm512_storeu_ps(sfPtr2+0+93696*i8+23424*j4+23424*k6+1536*l3, sfRe43);
_mm512_storeu_ps(sfPtr2+64+93696*i8+23424*j4+23424*k6+1536*l3, sfIm43);
_mm512_storeu_ps(sfPtr2+128+93696*i8+23424*j4+23424*k6+1536*l3, sfRe44);
_mm512_storeu_ps(sfPtr2+192+93696*i8+23424*j4+23424*k6+1536*l3, sfIm44);
_mm512_storeu_ps(sfPtr2+256+93696*i8+23424*j4+23424*k6+1536*l3, sfRe45);
_mm512_storeu_ps(sfPtr2+320+93696*i8+23424*j4+23424*k6+1536*l3, sfIm45);
}
j4 = 1;
}
for (; j4 <= jj3; ++j4) {
ptrdiff_t k7 = 1*d1;
for (; k7 != 1; ++k7) {
ptrdiff_t l4 = 16*w2;
for (; l4 != 15; ++l4) {
__m512 sfRe46 = _mm512_setzero_ps();
__m512 sfIm46 = _mm512_setzero_ps();
__m512 sfRe52 = _mm512_setzero_ps();
__m512 sfIm52 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe47 = sfRe46;
__m512 sfIm47 = sfIm46;
__m512 sfRe48 = sfRe46;
__m512 sfIm48 = sfIm46;
__m512 sfRe49 = sfRe46;
__m512 sfIm49 = sfIm46;
__m512 sfRe50 = sfRe46;
__m512 sfIm50 = sfIm46;
__m512 sfRe51 = sfRe46;
__m512 sfIm51 = sfIm46;
__m512 sfRe53 = sfRe52;
__m512 sfIm53 = sfIm52;
__m512 sfRe54 = sfRe52;
__m512 sfIm54 = sfIm52;
__m512 sfRe55 = sfRe52;
__m512 sfIm55 = sfIm52;
__m512 sfRe56 = sfRe52;
__m512 sfIm56 = sfIm52;
__m512 sfRe57 = sfRe52;
__m512 sfIm57 = sfIm52;
for (ptrdiff_t s8 = 0; s8 < 14; ++s8) {
__m512i wfLd10 = _mm512_loadu_si512(wfPtr3+0+111104*i8+27776*j4+1792*l4+128*s8);
__m512 wfRe10 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd10));
__m512 wfIm10 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd10, 1));
__m512i wfLd11 = _mm512_loadu_si512(wfPtr3+64+111104*i8+27776*j4+1792*l4+128*s8);
__m512 wfRe11 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd11));
__m512 wfIm11 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd11, 1));
__m512 dfRe19 = _mm512_loadu_ps(dfPtr3+0+21504*i8+5376*j4+5376*k7+384*s8);
__m512 dfIm19 = _mm512_loadu_ps(dfPtr3+64+21504*i8+5376*j4+5376*k7+384*s8);
sfRe46 = _mm512_fmadd_ps(wfRe10, dfRe19, sfRe46);
sfRe46 = _mm512_fmadd_ps(wfIm10, dfIm19, sfRe46);
sfIm46 = _mm512_fmadd_ps(wfRe10, dfIm19, sfIm46);
sfIm46 = _mm512_fnmadd_ps(wfIm10, dfRe19, sfIm46);
sfRe52 = _mm512_fmadd_ps(wfRe11, dfRe19, sfRe52);
sfRe52 = _mm512_fmadd_ps(wfIm11, dfIm19, sfRe52);
sfIm52 = _mm512_fmadd_ps(wfRe11, dfIm19, sfIm52);
sfIm52 = _mm512_fnmadd_ps(wfIm11, dfRe19, sfIm52);
dfRe19 = _mm512_shuffle_f32x4(dfRe19, dfRe19, 78);
dfIm19 = _mm512_shuffle_f32x4(dfIm19, dfIm19, 78);
sfRe47 = _mm512_fmadd_ps(wfRe10, dfRe19, sfRe47);
sfRe47 = _mm512_fmadd_ps(wfIm10, dfIm19, sfRe47);
sfIm47 = _mm512_fmadd_ps(wfRe10, dfIm19, sfIm47);
sfIm47 = _mm512_fnmadd_ps(wfIm10, dfRe19, sfIm47);
sfRe53 = _mm512_fmadd_ps(wfRe11, dfRe19, sfRe53);
sfRe53 = _mm512_fmadd_ps(wfIm11, dfIm19, sfRe53);
sfIm53 = _mm512_fmadd_ps(wfRe11, dfIm19, sfIm53);
sfIm53 = _mm512_fnmadd_ps(wfIm11, dfRe19, sfIm53);
__m512 dfRe20 = _mm512_loadu_ps(dfPtr3+128+21504*i8+5376*j4+5376*k7+384*s8);
__m512 dfIm20 = _mm512_loadu_ps(dfPtr3+192+21504*i8+5376*j4+5376*k7+384*s8);
sfRe48 = _mm512_fmadd_ps(wfRe10, dfRe20, sfRe48);
sfRe48 = _mm512_fmadd_ps(wfIm10, dfIm20, sfRe48);
sfIm48 = _mm512_fmadd_ps(wfRe10, dfIm20, sfIm48);
sfIm48 = _mm512_fnmadd_ps(wfIm10, dfRe20, sfIm48);
sfRe54 = _mm512_fmadd_ps(wfRe11, dfRe20, sfRe54);
sfRe54 = _mm512_fmadd_ps(wfIm11, dfIm20, sfRe54);
sfIm54 = _mm512_fmadd_ps(wfRe11, dfIm20, sfIm54);
sfIm54 = _mm512_fnmadd_ps(wfIm11, dfRe20, sfIm54);
dfRe20 = _mm512_shuffle_f32x4(dfRe20, dfRe20, 78);
dfIm20 = _mm512_shuffle_f32x4(dfIm20, dfIm20, 78);
sfRe49 = _mm512_fmadd_ps(wfRe10, dfRe20, sfRe49);
sfRe49 = _mm512_fmadd_ps(wfIm10, dfIm20, sfRe49);
sfIm49 = _mm512_fmadd_ps(wfRe10, dfIm20, sfIm49);
sfIm49 = _mm512_fnmadd_ps(wfIm10, dfRe20, sfIm49);
sfRe55 = _mm512_fmadd_ps(wfRe11, dfRe20, sfRe55);
sfRe55 = _mm512_fmadd_ps(wfIm11, dfIm20, sfRe55);
sfIm55 = _mm512_fmadd_ps(wfRe11, dfIm20, sfIm55);
sfIm55 = _mm512_fnmadd_ps(wfIm11, dfRe20, sfIm55);
__m512 dfRe21 = _mm512_loadu_ps(dfPtr3+256+21504*i8+5376*j4+5376*k7+384*s8);
__m512 dfIm21 = _mm512_loadu_ps(dfPtr3+320+21504*i8+5376*j4+5376*k7+384*s8);
sfRe50 = _mm512_fmadd_ps(wfRe10, dfRe21, sfRe50);
sfRe50 = _mm512_fmadd_ps(wfIm10, dfIm21, sfRe50);
sfIm50 = _mm512_fmadd_ps(wfRe10, dfIm21, sfIm50);
sfIm50 = _mm512_fnmadd_ps(wfIm10, dfRe21, sfIm50);
sfRe56 = _mm512_fmadd_ps(wfRe11, dfRe21, sfRe56);
sfRe56 = _mm512_fmadd_ps(wfIm11, dfIm21, sfRe56);
sfIm56 = _mm512_fmadd_ps(wfRe11, dfIm21, sfIm56);
sfIm56 = _mm512_fnmadd_ps(wfIm11, dfRe21, sfIm56);
dfRe21 = _mm512_shuffle_f32x4(dfRe21, dfRe21, 78);
dfIm21 = _mm512_shuffle_f32x4(dfIm21, dfIm21, 78);
sfRe51 = _mm512_fmadd_ps(wfRe10, dfRe21, sfRe51);
sfRe51 = _mm512_fmadd_ps(wfIm10, dfIm21, sfRe51);
sfIm51 = _mm512_fmadd_ps(wfRe10, dfIm21, sfIm51);
sfIm51 = _mm512_fnmadd_ps(wfIm10, dfRe21, sfIm51);
sfRe57 = _mm512_fmadd_ps(wfRe11, dfRe21, sfRe57);
sfRe57 = _mm512_fmadd_ps(wfIm11, dfIm21, sfRe57);
sfIm57 = _mm512_fmadd_ps(wfRe11, dfIm21, sfIm57);
sfIm57 = _mm512_fnmadd_ps(wfIm11, dfRe21, sfIm57);
}
sfRe46 = _mm512_add_ps(sfRe46, _mm512_loadu_ps(sfPtr2+0+93696*i8+23424*j4+23424*k7+1536*l4));
sfIm46 = _mm512_add_ps(sfIm46, _mm512_loadu_ps(sfPtr2+64+93696*i8+23424*j4+23424*k7+1536*l4));
sfRe47 = _mm512_add_ps(sfRe47, _mm512_loadu_ps(sfPtr2+128+93696*i8+23424*j4+23424*k7+1536*l4));
sfIm47 = _mm512_add_ps(sfIm47, _mm512_loadu_ps(sfPtr2+192+93696*i8+23424*j4+23424*k7+1536*l4));
sfRe48 = _mm512_add_ps(sfRe48, _mm512_loadu_ps(sfPtr2+256+93696*i8+23424*j4+23424*k7+1536*l4));
sfIm48 = _mm512_add_ps(sfIm48, _mm512_loadu_ps(sfPtr2+320+93696*i8+23424*j4+23424*k7+1536*l4));
sfRe49 = _mm512_add_ps(sfRe49, _mm512_loadu_ps(sfPtr2+384+93696*i8+23424*j4+23424*k7+1536*l4));
sfIm49 = _mm512_add_ps(sfIm49, _mm512_loadu_ps(sfPtr2+448+93696*i8+23424*j4+23424*k7+1536*l4));
sfRe50 = _mm512_add_ps(sfRe50, _mm512_loadu_ps(sfPtr2+512+93696*i8+23424*j4+23424*k7+1536*l4));
sfIm50 = _mm512_add_ps(sfIm50, _mm512_loadu_ps(sfPtr2+576+93696*i8+23424*j4+23424*k7+1536*l4));
sfRe51 = _mm512_add_ps(sfRe51, _mm512_loadu_ps(sfPtr2+640+93696*i8+23424*j4+23424*k7+1536*l4));
sfIm51 = _mm512_add_ps(sfIm51, _mm512_loadu_ps(sfPtr2+704+93696*i8+23424*j4+23424*k7+1536*l4));
sfRe52 = _mm512_add_ps(sfRe52, _mm512_loadu_ps(sfPtr2+768+93696*i8+23424*j4+23424*k7+1536*l4));
sfIm52 = _mm512_add_ps(sfIm52, _mm512_loadu_ps(sfPtr2+832+93696*i8+23424*j4+23424*k7+1536*l4));
sfRe53 = _mm512_add_ps(sfRe53, _mm512_loadu_ps(sfPtr2+896+93696*i8+23424*j4+23424*k7+1536*l4));
sfIm53 = _mm512_add_ps(sfIm53, _mm512_loadu_ps(sfPtr2+960+93696*i8+23424*j4+23424*k7+1536*l4));
sfRe54 = _mm512_add_ps(sfRe54, _mm512_loadu_ps(sfPtr2+1024+93696*i8+23424*j4+23424*k7+1536*l4));
sfIm54 = _mm512_add_ps(sfIm54, _mm512_loadu_ps(sfPtr2+1088+93696*i8+23424*j4+23424*k7+1536*l4));
sfRe55 = _mm512_add_ps(sfRe55, _mm512_loadu_ps(sfPtr2+1152+93696*i8+23424*j4+23424*k7+1536*l4));
sfIm55 = _mm512_add_ps(sfIm55, _mm512_loadu_ps(sfPtr2+1216+93696*i8+23424*j4+23424*k7+1536*l4));
sfRe56 = _mm512_add_ps(sfRe56, _mm512_loadu_ps(sfPtr2+1280+93696*i8+23424*j4+23424*k7+1536*l4));
sfIm56 = _mm512_add_ps(sfIm56, _mm512_loadu_ps(sfPtr2+1344+93696*i8+23424*j4+23424*k7+1536*l4));
sfRe57 = _mm512_add_ps(sfRe57, _mm512_loadu_ps(sfPtr2+1408+93696*i8+23424*j4+23424*k7+1536*l4));
sfIm57 = _mm512_add_ps(sfIm57, _mm512_loadu_ps(sfPtr2+1472+93696*i8+23424*j4+23424*k7+1536*l4));
_mm512_storeu_ps(sfPtr2+0+93696*i8+23424*j4+23424*k7+1536*l4, sfRe46);
_mm512_storeu_ps(sfPtr2+64+93696*i8+23424*j4+23424*k7+1536*l4, sfIm46);
_mm512_storeu_ps(sfPtr2+128+93696*i8+23424*j4+23424*k7+1536*l4, sfRe47);
_mm512_storeu_ps(sfPtr2+192+93696*i8+23424*j4+23424*k7+1536*l4, sfIm47);
_mm512_storeu_ps(sfPtr2+256+93696*i8+23424*j4+23424*k7+1536*l4, sfRe48);
_mm512_storeu_ps(sfPtr2+320+93696*i8+23424*j4+23424*k7+1536*l4, sfIm48);
_mm512_storeu_ps(sfPtr2+384+93696*i8+23424*j4+23424*k7+1536*l4, sfRe49);
_mm512_storeu_ps(sfPtr2+448+93696*i8+23424*j4+23424*k7+1536*l4, sfIm49);
_mm512_storeu_ps(sfPtr2+512+93696*i8+23424*j4+23424*k7+1536*l4, sfRe50);
_mm512_storeu_ps(sfPtr2+576+93696*i8+23424*j4+23424*k7+1536*l4, sfIm50);
_mm512_storeu_ps(sfPtr2+640+93696*i8+23424*j4+23424*k7+1536*l4, sfRe51);
_mm512_storeu_ps(sfPtr2+704+93696*i8+23424*j4+23424*k7+1536*l4, sfIm51);
_mm512_storeu_ps(sfPtr2+768+93696*i8+23424*j4+23424*k7+1536*l4, sfRe52);
_mm512_storeu_ps(sfPtr2+832+93696*i8+23424*j4+23424*k7+1536*l4, sfIm52);
_mm512_storeu_ps(sfPtr2+896+93696*i8+23424*j4+23424*k7+1536*l4, sfRe53);
_mm512_storeu_ps(sfPtr2+960+93696*i8+23424*j4+23424*k7+1536*l4, sfIm53);
_mm512_storeu_ps(sfPtr2+1024+93696*i8+23424*j4+23424*k7+1536*l4, sfRe54);
_mm512_storeu_ps(sfPtr2+1088+93696*i8+23424*j4+23424*k7+1536*l4, sfIm54);
_mm512_storeu_ps(sfPtr2+1152+93696*i8+23424*j4+23424*k7+1536*l4, sfRe55);
_mm512_storeu_ps(sfPtr2+1216+93696*i8+23424*j4+23424*k7+1536*l4, sfIm55);
_mm512_storeu_ps(sfPtr2+1280+93696*i8+23424*j4+23424*k7+1536*l4, sfRe56);
_mm512_storeu_ps(sfPtr2+1344+93696*i8+23424*j4+23424*k7+1536*l4, sfIm56);
_mm512_storeu_ps(sfPtr2+1408+93696*i8+23424*j4+23424*k7+1536*l4, sfRe57);
_mm512_storeu_ps(sfPtr2+1472+93696*i8+23424*j4+23424*k7+1536*l4, sfIm57);
}
__m512 sfRe58 = _mm512_setzero_ps();
__m512 sfIm58 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe59 = sfRe58;
__m512 sfIm59 = sfIm58;
__m512 sfRe60 = sfRe58;
__m512 sfIm60 = sfIm58;
for (ptrdiff_t s9 = 0; s9 < 14; ++s9) {
__m512i wfLd12 = _mm512_loadu_si512(wfPtr3+0+111104*i8+27776*j4+1792*l4+64*s9);
__m512 wfRe12 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd12));
__m512 wfIm12 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd12, 1));
__m512 dfRe22 = _mm512_loadu_ps(dfPtr3+0+21504*i8+5376*j4+5376*k7+384*s9);
__m512 dfIm22 = _mm512_loadu_ps(dfPtr3+64+21504*i8+5376*j4+5376*k7+384*s9);
sfRe58 = _mm512_fmadd_ps(wfRe12, dfRe22, sfRe58);
sfRe58 = _mm512_fmadd_ps(wfIm12, dfIm22, sfRe58);
sfIm58 = _mm512_fmadd_ps(wfRe12, dfIm22, sfIm58);
sfIm58 = _mm512_fnmadd_ps(wfIm12, dfRe22, sfIm58);
__m512 dfRe23 = _mm512_loadu_ps(dfPtr3+128+21504*i8+5376*j4+5376*k7+384*s9);
__m512 dfIm23 = _mm512_loadu_ps(dfPtr3+192+21504*i8+5376*j4+5376*k7+384*s9);
sfRe59 = _mm512_fmadd_ps(wfRe12, dfRe23, sfRe59);
sfRe59 = _mm512_fmadd_ps(wfIm12, dfIm23, sfRe59);
sfIm59 = _mm512_fmadd_ps(wfRe12, dfIm23, sfIm59);
sfIm59 = _mm512_fnmadd_ps(wfIm12, dfRe23, sfIm59);
__m512 dfRe24 = _mm512_loadu_ps(dfPtr3+256+21504*i8+5376*j4+5376*k7+384*s9);
__m512 dfIm24 = _mm512_loadu_ps(dfPtr3+320+21504*i8+5376*j4+5376*k7+384*s9);
sfRe60 = _mm512_fmadd_ps(wfRe12, dfRe24, sfRe60);
sfRe60 = _mm512_fmadd_ps(wfIm12, dfIm24, sfRe60);
sfIm60 = _mm512_fmadd_ps(wfRe12, dfIm24, sfIm60);
sfIm60 = _mm512_fnmadd_ps(wfIm12, dfRe24, sfIm60);
}
sfRe58 = _mm512_add_ps(sfRe58, _mm512_loadu_ps(sfPtr2+0+93696*i8+23424*j4+23424*k7+1536*l4));
sfIm58 = _mm512_add_ps(sfIm58, _mm512_loadu_ps(sfPtr2+64+93696*i8+23424*j4+23424*k7+1536*l4));
sfRe59 = _mm512_add_ps(sfRe59, _mm512_loadu_ps(sfPtr2+128+93696*i8+23424*j4+23424*k7+1536*l4));
sfIm59 = _mm512_add_ps(sfIm59, _mm512_loadu_ps(sfPtr2+192+93696*i8+23424*j4+23424*k7+1536*l4));
sfRe60 = _mm512_add_ps(sfRe60, _mm512_loadu_ps(sfPtr2+256+93696*i8+23424*j4+23424*k7+1536*l4));
sfIm60 = _mm512_add_ps(sfIm60, _mm512_loadu_ps(sfPtr2+320+93696*i8+23424*j4+23424*k7+1536*l4));
_mm512_storeu_ps(sfPtr2+0+93696*i8+23424*j4+23424*k7+1536*l4, sfRe58);
_mm512_storeu_ps(sfPtr2+64+93696*i8+23424*j4+23424*k7+1536*l4, sfIm58);
_mm512_storeu_ps(sfPtr2+128+93696*i8+23424*j4+23424*k7+1536*l4, sfRe59);
_mm512_storeu_ps(sfPtr2+192+93696*i8+23424*j4+23424*k7+1536*l4, sfIm59);
_mm512_storeu_ps(sfPtr2+256+93696*i8+23424*j4+23424*k7+1536*l4, sfRe60);
_mm512_storeu_ps(sfPtr2+320+93696*i8+23424*j4+23424*k7+1536*l4, sfIm60);
}
}
}

static void Example8StriderProduceSums1(Example8ThreaderTeam1* team16, char** tensors5) {
void* tuple1[3];
tuple1[0] = tensors5;
for (ptrdiff_t e4 = 0; e4 < 1; ++e4) {
tuple1[1] = (void*)e4;
for (ptrdiff_t z3 = 0; z3 < 4; ++z3) {
tuple1[2] = (void*)z3;
Example8ThreaderTask1 task9;
task9.callee1 = Example8StriderProduceSums1Callee1;
task9.any1 = tuple1;
task9.nd1 = 4;
task9.hull1[0] = 1;
task9.hull1[1] = 1;
task9.hull1[2] = 2;
task9.hull1[3] = 4;
Example8ThreaderDo1(team16, &task9);
}
}
}

static void Example8StriderConsumeSums1Callee1(Example8ThreaderTask1* task10, int64_t* pt10) {
char** tensors8 = task10->any1;
ptrdiff_t w3 = 0;
ptrdiff_t d2 = 0;
ptrdiff_t g5 = pt10[2];
char*restrict sfPtr3 = tensors8[0];
char*restrict datPtr2 = tensors8[1];
ptrdiff_t i9 = 2*g5;
ptrdiff_t ii2 = i9+1;
for (; i9 <= ii2; ++i9) {
ptrdiff_t j5 = 1*d2;
ptrdiff_t rel2 = j5-0;
ptrdiff_t base2 = 0;
ptrdiff_t toH1 = base2+0;
ptrdiff_t toW1 = 0;
ptrdiff_t k8 = 16*w3;
for (; k8 != 15; ++k8) {
ptrdiff_t r2 = 0;
for (; r2 != 2; ++r2) {
ptrdiff_t t2 = 0;
__m512 sfRe61 = _mm512_loadu_ps(sfPtr3+0+93696*i9+23424*j5+1536*k8+768*r2+256*t2);
__m512 sfIm61 = _mm512_loadu_ps(sfPtr3+64+93696*i9+23424*j5+1536*k8+768*r2+256*t2);
__m512 sfRe65 = _mm512_loadu_ps(sfPtr3+128+93696*i9+23424*j5+1536*k8+768*r2+256*t2);
__m512 sfIm65 = _mm512_loadu_ps(sfPtr3+192+93696*i9+23424*j5+1536*k8+768*r2+256*t2);
__m512 sfRe62 = _mm512_loadu_ps(sfPtr3+23424+93696*i9+23424*j5+1536*k8+768*r2+256*t2);
__m512 sfIm62 = _mm512_loadu_ps(sfPtr3+23488+93696*i9+23424*j5+1536*k8+768*r2+256*t2);
__m512 sfRe66 = _mm512_loadu_ps(sfPtr3+23552+93696*i9+23424*j5+1536*k8+768*r2+256*t2);
__m512 sfIm66 = _mm512_loadu_ps(sfPtr3+23616+93696*i9+23424*j5+1536*k8+768*r2+256*t2);
__m512 sfRe63 = _mm512_loadu_ps(sfPtr3+46848+93696*i9+23424*j5+1536*k8+768*r2+256*t2);
__m512 sfIm63 = _mm512_loadu_ps(sfPtr3+46912+93696*i9+23424*j5+1536*k8+768*r2+256*t2);
__m512 sfRe67 = _mm512_loadu_ps(sfPtr3+46976+93696*i9+23424*j5+1536*k8+768*r2+256*t2);
__m512 sfIm67 = _mm512_loadu_ps(sfPtr3+47040+93696*i9+23424*j5+1536*k8+768*r2+256*t2);
__m512 sfRe64 = _mm512_loadu_ps(sfPtr3+70272+93696*i9+23424*j5+1536*k8+768*r2+256*t2);
__m512 sfIm64 = _mm512_loadu_ps(sfPtr3+70336+93696*i9+23424*j5+1536*k8+768*r2+256*t2);
__m512 sfRe68 = _mm512_loadu_ps(sfPtr3+70400+93696*i9+23424*j5+1536*k8+768*r2+256*t2);
__m512 sfIm68 = _mm512_loadu_ps(sfPtr3+70464+93696*i9+23424*j5+1536*k8+768*r2+256*t2);
__m512i ifft1 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2 = _mm512_permutexvar_ps(ifft1, sfRe61);
__m512 ifft93 = _mm512_permutexvar_ps(ifft1, sfRe65);
__m512i ifft3 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4 = _mm512_permutexvar_ps(ifft3, sfRe61);
__m512 ifft94 = _mm512_permutexvar_ps(ifft3, sfRe65);
__m512 ifft5 = _mm512_permutexvar_ps(ifft1, sfIm61);
__m512 ifft95 = _mm512_permutexvar_ps(ifft1, sfIm65);
__m512 ifft6 = _mm512_permutexvar_ps(ifft3, sfIm61);
__m512 ifft96 = _mm512_permutexvar_ps(ifft3, sfIm65);
__m512 ifft7 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft8 = _mm512_mask_fmadd_ps(ifft6, 65021, ifft7, ifft2);
__m512 ifft97 = _mm512_mask_fmadd_ps(ifft96, 65021, ifft7, ifft93);
__m512 ifft9 = _mm512_mask_fnmadd_ps(ifft5, 65021, ifft7, ifft4);
__m512 ifft98 = _mm512_mask_fnmadd_ps(ifft95, 65021, ifft7, ifft94);
__m512 ifft10 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft11 = _mm512_fmadd_ps(ifft8, ifft10, _mm512_shuffle_ps(ifft8, ifft8, 177));
__m512 ifft99 = _mm512_fmadd_ps(ifft97, ifft10, _mm512_shuffle_ps(ifft97, ifft97, 177));
__m512 ifft12 = _mm512_fmadd_ps(ifft9, ifft10, _mm512_shuffle_ps(ifft9, ifft9, 177));
__m512 ifft100 = _mm512_fmadd_ps(ifft98, ifft10, _mm512_shuffle_ps(ifft98, ifft98, 177));
__m512 ifft13 = _mm512_fmadd_ps(sfRe62, ifft10, _mm512_shuffle_ps(sfRe62, sfRe62, 177));
__m512 ifft101 = _mm512_fmadd_ps(sfRe66, ifft10, _mm512_shuffle_ps(sfRe66, sfRe66, 177));
__m512 ifft14 = _mm512_fmadd_ps(sfIm62, ifft10, _mm512_shuffle_ps(sfIm62, sfIm62, 177));
__m512 ifft102 = _mm512_fmadd_ps(sfIm66, ifft10, _mm512_shuffle_ps(sfIm66, sfIm66, 177));
__m512 ifft15 = _mm512_fmadd_ps(sfRe63, ifft10, _mm512_shuffle_ps(sfRe63, sfRe63, 177));
__m512 ifft103 = _mm512_fmadd_ps(sfRe67, ifft10, _mm512_shuffle_ps(sfRe67, sfRe67, 177));
__m512 ifft16 = _mm512_fmadd_ps(sfIm63, ifft10, _mm512_shuffle_ps(sfIm63, sfIm63, 177));
__m512 ifft104 = _mm512_fmadd_ps(sfIm67, ifft10, _mm512_shuffle_ps(sfIm67, sfIm67, 177));
__m512 ifft17 = _mm512_fmadd_ps(sfRe64, ifft10, _mm512_shuffle_ps(sfRe64, sfRe64, 177));
__m512 ifft105 = _mm512_fmadd_ps(sfRe68, ifft10, _mm512_shuffle_ps(sfRe68, sfRe68, 177));
__m512 ifft18 = _mm512_fmadd_ps(sfIm64, ifft10, _mm512_shuffle_ps(sfIm64, sfIm64, 177));
__m512 ifft106 = _mm512_fmadd_ps(sfIm68, ifft10, _mm512_shuffle_ps(sfIm68, sfIm68, 177));
__m512 ifft19 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft20 = _mm512_mul_ps(ifft11, ifft19);
__m512 ifft107 = _mm512_mul_ps(ifft99, ifft19);
__m512 ifft21 = _mm512_mul_ps(ifft12, ifft19);
__m512 ifft108 = _mm512_mul_ps(ifft100, ifft19);
__m512 ifft22 = _mm512_mul_ps(ifft13, ifft19);
__m512 ifft109 = _mm512_mul_ps(ifft101, ifft19);
__m512 ifft23 = _mm512_mul_ps(ifft14, ifft19);
__m512 ifft110 = _mm512_mul_ps(ifft102, ifft19);
__m512 ifft24 = _mm512_mul_ps(ifft15, ifft19);
__m512 ifft111 = _mm512_mul_ps(ifft103, ifft19);
__m512 ifft25 = _mm512_mul_ps(ifft16, ifft19);
__m512 ifft112 = _mm512_mul_ps(ifft104, ifft19);
__m512 ifft26 = _mm512_mul_ps(ifft17, ifft19);
__m512 ifft113 = _mm512_mul_ps(ifft105, ifft19);
__m512 ifft27 = _mm512_mul_ps(ifft18, ifft19);
__m512 ifft114 = _mm512_mul_ps(ifft106, ifft19);
__m512 ifft28 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft29 = _mm512_fnmadd_ps(ifft12, ifft28, ifft20);
__m512 ifft115 = _mm512_fnmadd_ps(ifft100, ifft28, ifft107);
__m512 ifft30 = _mm512_fmadd_ps(ifft11, ifft28, ifft21);
__m512 ifft116 = _mm512_fmadd_ps(ifft99, ifft28, ifft108);
__m512 ifft31 = _mm512_fnmadd_ps(ifft14, ifft28, ifft22);
__m512 ifft117 = _mm512_fnmadd_ps(ifft102, ifft28, ifft109);
__m512 ifft32 = _mm512_fmadd_ps(ifft13, ifft28, ifft23);
__m512 ifft118 = _mm512_fmadd_ps(ifft101, ifft28, ifft110);
__m512 ifft33 = _mm512_fnmadd_ps(ifft16, ifft28, ifft24);
__m512 ifft119 = _mm512_fnmadd_ps(ifft104, ifft28, ifft111);
__m512 ifft34 = _mm512_fmadd_ps(ifft15, ifft28, ifft25);
__m512 ifft120 = _mm512_fmadd_ps(ifft103, ifft28, ifft112);
__m512 ifft35 = _mm512_fnmadd_ps(ifft18, ifft28, ifft26);
__m512 ifft121 = _mm512_fnmadd_ps(ifft106, ifft28, ifft113);
__m512 ifft36 = _mm512_fmadd_ps(ifft17, ifft28, ifft27);
__m512 ifft122 = _mm512_fmadd_ps(ifft105, ifft28, ifft114);
__m512 ifft37 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft38 = _mm512_fmadd_ps(ifft29, ifft37, _mm512_shuffle_ps(ifft29, ifft29, 78));
__m512 ifft123 = _mm512_fmadd_ps(ifft115, ifft37, _mm512_shuffle_ps(ifft115, ifft115, 78));
__m512 ifft39 = _mm512_fmadd_ps(ifft30, ifft37, _mm512_shuffle_ps(ifft30, ifft30, 78));
__m512 ifft124 = _mm512_fmadd_ps(ifft116, ifft37, _mm512_shuffle_ps(ifft116, ifft116, 78));
__m512 ifft40 = _mm512_fmadd_ps(ifft31, ifft37, _mm512_shuffle_ps(ifft31, ifft31, 78));
__m512 ifft125 = _mm512_fmadd_ps(ifft117, ifft37, _mm512_shuffle_ps(ifft117, ifft117, 78));
__m512 ifft41 = _mm512_fmadd_ps(ifft32, ifft37, _mm512_shuffle_ps(ifft32, ifft32, 78));
__m512 ifft126 = _mm512_fmadd_ps(ifft118, ifft37, _mm512_shuffle_ps(ifft118, ifft118, 78));
__m512 ifft42 = _mm512_fmadd_ps(ifft33, ifft37, _mm512_shuffle_ps(ifft33, ifft33, 78));
__m512 ifft127 = _mm512_fmadd_ps(ifft119, ifft37, _mm512_shuffle_ps(ifft119, ifft119, 78));
__m512 ifft43 = _mm512_fmadd_ps(ifft34, ifft37, _mm512_shuffle_ps(ifft34, ifft34, 78));
__m512 ifft128 = _mm512_fmadd_ps(ifft120, ifft37, _mm512_shuffle_ps(ifft120, ifft120, 78));
__m512 ifft44 = _mm512_fmadd_ps(ifft35, ifft37, _mm512_shuffle_ps(ifft35, ifft35, 78));
__m512 ifft129 = _mm512_fmadd_ps(ifft121, ifft37, _mm512_shuffle_ps(ifft121, ifft121, 78));
__m512 ifft45 = _mm512_fmadd_ps(ifft36, ifft37, _mm512_shuffle_ps(ifft36, ifft36, 78));
__m512 ifft130 = _mm512_fmadd_ps(ifft122, ifft37, _mm512_shuffle_ps(ifft122, ifft122, 78));
__m512 ifft46 = _mm512_mask_sub_ps(ifft38, 49344, _mm512_setzero_ps(), ifft39);
__m512 ifft131 = _mm512_mask_sub_ps(ifft123, 49344, _mm512_setzero_ps(), ifft124);
__m512 ifft47 = _mm512_mask_mov_ps(ifft39, 49344, ifft38);
__m512 ifft132 = _mm512_mask_mov_ps(ifft124, 49344, ifft123);
__m512 ifft48 = _mm512_mask_sub_ps(ifft40, 49344, _mm512_setzero_ps(), ifft41);
__m512 ifft133 = _mm512_mask_sub_ps(ifft125, 49344, _mm512_setzero_ps(), ifft126);
__m512 ifft49 = _mm512_mask_mov_ps(ifft41, 49344, ifft40);
__m512 ifft134 = _mm512_mask_mov_ps(ifft126, 49344, ifft125);
__m512 ifft50 = _mm512_mask_sub_ps(ifft42, 49344, _mm512_setzero_ps(), ifft43);
__m512 ifft135 = _mm512_mask_sub_ps(ifft127, 49344, _mm512_setzero_ps(), ifft128);
__m512 ifft51 = _mm512_mask_mov_ps(ifft43, 49344, ifft42);
__m512 ifft136 = _mm512_mask_mov_ps(ifft128, 49344, ifft127);
__m512 ifft52 = _mm512_mask_sub_ps(ifft44, 49344, _mm512_setzero_ps(), ifft45);
__m512 ifft137 = _mm512_mask_sub_ps(ifft129, 49344, _mm512_setzero_ps(), ifft130);
__m512 ifft53 = _mm512_mask_mov_ps(ifft45, 49344, ifft44);
__m512 ifft138 = _mm512_mask_mov_ps(ifft130, 49344, ifft129);
__m512 ifft54 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft55 = _mm512_fmadd_ps(ifft46, ifft54, _mm512_shuffle_f32x4(ifft46, ifft46, 177));
__m512 ifft139 = _mm512_fmadd_ps(ifft131, ifft54, _mm512_shuffle_f32x4(ifft131, ifft131, 177));
__m512 ifft56 = _mm512_fmadd_ps(ifft47, ifft54, _mm512_shuffle_f32x4(ifft47, ifft47, 177));
__m512 ifft140 = _mm512_fmadd_ps(ifft132, ifft54, _mm512_shuffle_f32x4(ifft132, ifft132, 177));
__m512 ifft57 = _mm512_fmadd_ps(ifft48, ifft54, _mm512_shuffle_f32x4(ifft48, ifft48, 177));
__m512 ifft141 = _mm512_fmadd_ps(ifft133, ifft54, _mm512_shuffle_f32x4(ifft133, ifft133, 177));
__m512 ifft58 = _mm512_fmadd_ps(ifft49, ifft54, _mm512_shuffle_f32x4(ifft49, ifft49, 177));
__m512 ifft142 = _mm512_fmadd_ps(ifft134, ifft54, _mm512_shuffle_f32x4(ifft134, ifft134, 177));
__m512 ifft59 = _mm512_fmadd_ps(ifft50, ifft54, _mm512_shuffle_f32x4(ifft50, ifft50, 177));
__m512 ifft143 = _mm512_fmadd_ps(ifft135, ifft54, _mm512_shuffle_f32x4(ifft135, ifft135, 177));
__m512 ifft60 = _mm512_fnmsub_ps(ifft51, ifft54, _mm512_shuffle_f32x4(ifft51, ifft51, 177));
__m512 ifft144 = _mm512_fnmsub_ps(ifft136, ifft54, _mm512_shuffle_f32x4(ifft136, ifft136, 177));
__m512 ifft61 = _mm512_fmadd_ps(ifft52, ifft54, _mm512_shuffle_f32x4(ifft52, ifft52, 177));
__m512 ifft145 = _mm512_fmadd_ps(ifft137, ifft54, _mm512_shuffle_f32x4(ifft137, ifft137, 177));
__m512 ifft62 = _mm512_fmadd_ps(ifft53, ifft54, _mm512_shuffle_f32x4(ifft53, ifft53, 177));
__m512 ifft146 = _mm512_fmadd_ps(ifft138, ifft54, _mm512_shuffle_f32x4(ifft138, ifft138, 177));
__m512 ifft63 = _mm512_add_ps(ifft55, ifft56);
__m512 ifft147 = _mm512_add_ps(ifft139, ifft140);
__m512 ifft64 = _mm512_sub_ps(ifft55, ifft56);
__m512 ifft148 = _mm512_sub_ps(ifft139, ifft140);
__m512 ifft65 = _mm512_sub_ps(ifft57, ifft61);
__m512 ifft149 = _mm512_sub_ps(ifft141, ifft145);
__m512 ifft66 = _mm512_add_ps(ifft58, ifft62);
__m512 ifft150 = _mm512_add_ps(ifft142, ifft146);
__m512 ifft67 = _mm512_add_ps(ifft57, ifft61);
__m512 ifft151 = _mm512_add_ps(ifft141, ifft145);
__m512 ifft68 = _mm512_sub_ps(ifft58, ifft62);
__m512 ifft152 = _mm512_sub_ps(ifft142, ifft146);
__m512 ifft69 = _mm512_mul_ps(ifft59, _mm512_set1_ps(3.125e-02f));
__m512 ifft153 = _mm512_mul_ps(ifft143, _mm512_set1_ps(3.125e-02f));
__m512 ifft70 = _mm512_mul_ps(ifft60, _mm512_set1_ps(3.125e-02f));
__m512 ifft154 = _mm512_mul_ps(ifft144, _mm512_set1_ps(3.125e-02f));
__m512 ifft71 = _mm512_fmadd_ps(ifft63, _mm512_set1_ps(1.5625e-02f), ifft69);
__m512 ifft155 = _mm512_fmadd_ps(ifft147, _mm512_set1_ps(1.5625e-02f), ifft153);
__m512 ifft72 = _mm512_fmsub_ps(ifft63, _mm512_set1_ps(1.5625e-02f), ifft69);
__m512 ifft156 = _mm512_fmsub_ps(ifft147, _mm512_set1_ps(1.5625e-02f), ifft153);
__m512 ifft73 = _mm512_fmadd_ps(ifft64, _mm512_set1_ps(1.5625e-02f), ifft70);
__m512 ifft157 = _mm512_fmadd_ps(ifft148, _mm512_set1_ps(1.5625e-02f), ifft154);
__m512 ifft74 = _mm512_fmsub_ps(ifft64, _mm512_set1_ps(1.5625e-02f), ifft70);
__m512 ifft158 = _mm512_fmsub_ps(ifft148, _mm512_set1_ps(1.5625e-02f), ifft154);
__m512 ifft75 = _mm512_add_ps(ifft65, ifft66);
__m512 ifft159 = _mm512_add_ps(ifft149, ifft150);
__m512 ifft76 = _mm512_sub_ps(ifft65, ifft66);
__m512 ifft160 = _mm512_sub_ps(ifft149, ifft150);
__m512 ifft77 = _mm512_fnmadd_ps(ifft75, _mm512_set1_ps(7.0710677e-01f), ifft67);
__m512 ifft161 = _mm512_fnmadd_ps(ifft159, _mm512_set1_ps(7.0710677e-01f), ifft151);
__m512 ifft78 = _mm512_fmadd_ps(ifft75, _mm512_set1_ps(7.0710677e-01f), ifft67);
__m512 ifft162 = _mm512_fmadd_ps(ifft159, _mm512_set1_ps(7.0710677e-01f), ifft151);
__m512 ifft79 = _mm512_fmadd_ps(ifft76, _mm512_set1_ps(7.0710677e-01f), ifft68);
__m512 ifft163 = _mm512_fmadd_ps(ifft160, _mm512_set1_ps(7.0710677e-01f), ifft152);
__m512 ifft80 = _mm512_fmsub_ps(ifft76, _mm512_set1_ps(7.0710677e-01f), ifft68);
__m512 ifft164 = _mm512_fmsub_ps(ifft160, _mm512_set1_ps(7.0710677e-01f), ifft152);
__m512 ifft81 = _mm512_add_ps(ifft77, ifft78);
__m512 ifft165 = _mm512_add_ps(ifft161, ifft162);
__m512 ifft82 = _mm512_sub_ps(ifft77, ifft78);
__m512 ifft166 = _mm512_sub_ps(ifft161, ifft162);
__m512 ifft83 = _mm512_add_ps(ifft79, ifft80);
__m512 ifft167 = _mm512_add_ps(ifft163, ifft164);
__m512 ifft84 = _mm512_sub_ps(ifft79, ifft80);
__m512 ifft168 = _mm512_sub_ps(ifft163, ifft164);
__m512 ifft85 = _mm512_fmadd_ps(ifft81, _mm512_set1_ps(1.5625e-02f), ifft71);
__m512 ifft169 = _mm512_fmadd_ps(ifft165, _mm512_set1_ps(1.5625e-02f), ifft155);
__m512 ifft86 = _mm512_fnmadd_ps(ifft81, _mm512_set1_ps(1.5625e-02f), ifft71);
__m512 ifft170 = _mm512_fnmadd_ps(ifft165, _mm512_set1_ps(1.5625e-02f), ifft155);
__m512 ifft87 = _mm512_fmadd_ps(ifft83, _mm512_set1_ps(1.5625e-02f), ifft73);
__m512 ifft171 = _mm512_fmadd_ps(ifft167, _mm512_set1_ps(1.5625e-02f), ifft157);
__m512 ifft88 = _mm512_fnmadd_ps(ifft83, _mm512_set1_ps(1.5625e-02f), ifft73);
__m512 ifft172 = _mm512_fnmadd_ps(ifft167, _mm512_set1_ps(1.5625e-02f), ifft157);
__m512 ifft89 = _mm512_fnmadd_ps(ifft84, _mm512_set1_ps(1.5625e-02f), ifft72);
__m512 ifft173 = _mm512_fnmadd_ps(ifft168, _mm512_set1_ps(1.5625e-02f), ifft156);
__m512 ifft90 = _mm512_fmadd_ps(ifft84, _mm512_set1_ps(1.5625e-02f), ifft72);
__m512 ifft174 = _mm512_fmadd_ps(ifft168, _mm512_set1_ps(1.5625e-02f), ifft156);
__m512 ifft91 = _mm512_fmadd_ps(ifft82, _mm512_set1_ps(1.5625e-02f), ifft74);
__m512 ifft175 = _mm512_fmadd_ps(ifft166, _mm512_set1_ps(1.5625e-02f), ifft158);
__m512 ifft92 = _mm512_fnmadd_ps(ifft82, _mm512_set1_ps(1.5625e-02f), ifft74);
__m512 ifft176 = _mm512_fnmadd_ps(ifft166, _mm512_set1_ps(1.5625e-02f), ifft158);
__m512 dat75 = ifft85;
__m512 dat81 = ifft169;
__m512 dat76 = ifft87;
__m512 dat82 = ifft171;
__m512 dat77 = ifft89;
__m512 dat83 = ifft173;
__m512 dat78 = ifft91;
__m512 dat84 = ifft175;
__m512 dat79 = ifft86;
__m512 dat85 = ifft170;
__m512 dat80 = ifft88;
__m512 dat86 = ifft172;
(void)ifft90;
(void)ifft174;
(void)ifft92;
(void)ifft176;
__m512i pm1 = _mm512_set_epi32(3, 2, 1, 0, 21, 20, 19, 18, 17, 16, 5, 4, 3, 2, 1, 0);
__m512 pack1 = _mm512_permutex2var_ps(dat75, pm1, dat81);
__m512i pm2 = _mm512_set_epi32(27, 26, 25, 24, 13, 12, 11, 10, 9, 8, 29, 28, 27, 26, 25, 24);
__m512 pack2 = _mm512_permutex2var_ps(dat75, pm2, dat81);
__m512 pack3 = _mm512_permutex2var_ps(dat76, pm1, dat82);
__m512 pack4 = _mm512_permutex2var_ps(dat76, pm2, dat82);
__m512 pack5 = _mm512_permutex2var_ps(dat77, pm1, dat83);
__m512 pack6 = _mm512_permutex2var_ps(dat77, pm2, dat83);
__m512 pack7 = _mm512_permutex2var_ps(dat78, pm1, dat84);
__m512 pack8 = _mm512_permutex2var_ps(dat78, pm2, dat84);
__m512 pack9 = _mm512_permutex2var_ps(dat79, pm1, dat85);
__m512 pack10 = _mm512_permutex2var_ps(dat79, pm2, dat85);
__m512 pack11 = _mm512_permutex2var_ps(dat80, pm1, dat86);
__m512 pack12 = _mm512_permutex2var_ps(dat80, pm2, dat86);
_mm512_mask_storeu_ps(datPtr2+0+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t2, 1023, pack1);
_mm512_mask_storeu_ps(datPtr2+640+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t2, 1023, pack2);
_mm512_mask_storeu_ps(datPtr2+40+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t2, 1023, pack3);
_mm512_mask_storeu_ps(datPtr2+680+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t2, 1023, pack4);
_mm512_mask_storeu_ps(datPtr2+80+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t2, 1023, pack5);
_mm512_mask_storeu_ps(datPtr2+720+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t2, 1023, pack6);
_mm512_mask_storeu_ps(datPtr2+120+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t2, 1023, pack7);
_mm512_mask_storeu_ps(datPtr2+760+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t2, 1023, pack8);
_mm512_mask_storeu_ps(datPtr2+160+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t2, 1023, pack9);
_mm512_mask_storeu_ps(datPtr2+800+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t2, 1023, pack10);
_mm512_mask_storeu_ps(datPtr2+200+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t2, 1023, pack11);
_mm512_mask_storeu_ps(datPtr2+840+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t2, 1023, pack12);
ptrdiff_t t3 = 0;
__m512 sfRe69 = _mm512_loadu_ps(sfPtr3+256+93696*i9+23424*j5+1536*k8+768*r2+256*t3);
__m512 sfIm69 = _mm512_loadu_ps(sfPtr3+320+93696*i9+23424*j5+1536*k8+768*r2+256*t3);
__m512 sfRe73 = _mm512_loadu_ps(sfPtr3+384+93696*i9+23424*j5+1536*k8+768*r2+256*t3);
__m512 sfIm73 = _mm512_loadu_ps(sfPtr3+448+93696*i9+23424*j5+1536*k8+768*r2+256*t3);
__m512 sfRe70 = _mm512_loadu_ps(sfPtr3+23680+93696*i9+23424*j5+1536*k8+768*r2+256*t3);
__m512 sfIm70 = _mm512_loadu_ps(sfPtr3+23744+93696*i9+23424*j5+1536*k8+768*r2+256*t3);
__m512 sfRe74 = _mm512_loadu_ps(sfPtr3+23808+93696*i9+23424*j5+1536*k8+768*r2+256*t3);
__m512 sfIm74 = _mm512_loadu_ps(sfPtr3+23872+93696*i9+23424*j5+1536*k8+768*r2+256*t3);
__m512 sfRe71 = _mm512_loadu_ps(sfPtr3+47104+93696*i9+23424*j5+1536*k8+768*r2+256*t3);
__m512 sfIm71 = _mm512_loadu_ps(sfPtr3+47168+93696*i9+23424*j5+1536*k8+768*r2+256*t3);
__m512 sfRe75 = _mm512_loadu_ps(sfPtr3+47232+93696*i9+23424*j5+1536*k8+768*r2+256*t3);
__m512 sfIm75 = _mm512_loadu_ps(sfPtr3+47296+93696*i9+23424*j5+1536*k8+768*r2+256*t3);
__m512 sfRe72 = _mm512_loadu_ps(sfPtr3+70528+93696*i9+23424*j5+1536*k8+768*r2+256*t3);
__m512 sfIm72 = _mm512_loadu_ps(sfPtr3+70592+93696*i9+23424*j5+1536*k8+768*r2+256*t3);
__m512 sfRe76 = _mm512_loadu_ps(sfPtr3+70656+93696*i9+23424*j5+1536*k8+768*r2+256*t3);
__m512 sfIm76 = _mm512_loadu_ps(sfPtr3+70720+93696*i9+23424*j5+1536*k8+768*r2+256*t3);
__m512i ifft177 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft178 = _mm512_permutexvar_ps(ifft177, sfRe69);
__m512 ifft269 = _mm512_permutexvar_ps(ifft177, sfRe73);
__m512i ifft179 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft180 = _mm512_permutexvar_ps(ifft179, sfRe69);
__m512 ifft270 = _mm512_permutexvar_ps(ifft179, sfRe73);
__m512 ifft181 = _mm512_permutexvar_ps(ifft177, sfIm69);
__m512 ifft271 = _mm512_permutexvar_ps(ifft177, sfIm73);
__m512 ifft182 = _mm512_permutexvar_ps(ifft179, sfIm69);
__m512 ifft272 = _mm512_permutexvar_ps(ifft179, sfIm73);
__m512 ifft183 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft184 = _mm512_mask_fmadd_ps(ifft182, 65021, ifft183, ifft178);
__m512 ifft273 = _mm512_mask_fmadd_ps(ifft272, 65021, ifft183, ifft269);
__m512 ifft185 = _mm512_mask_fnmadd_ps(ifft181, 65021, ifft183, ifft180);
__m512 ifft274 = _mm512_mask_fnmadd_ps(ifft271, 65021, ifft183, ifft270);
__m512 ifft186 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft187 = _mm512_fmadd_ps(ifft184, ifft186, _mm512_shuffle_ps(ifft184, ifft184, 177));
__m512 ifft275 = _mm512_fmadd_ps(ifft273, ifft186, _mm512_shuffle_ps(ifft273, ifft273, 177));
__m512 ifft188 = _mm512_fmadd_ps(ifft185, ifft186, _mm512_shuffle_ps(ifft185, ifft185, 177));
__m512 ifft276 = _mm512_fmadd_ps(ifft274, ifft186, _mm512_shuffle_ps(ifft274, ifft274, 177));
__m512 ifft189 = _mm512_fmadd_ps(sfRe70, ifft186, _mm512_shuffle_ps(sfRe70, sfRe70, 177));
__m512 ifft277 = _mm512_fmadd_ps(sfRe74, ifft186, _mm512_shuffle_ps(sfRe74, sfRe74, 177));
__m512 ifft190 = _mm512_fmadd_ps(sfIm70, ifft186, _mm512_shuffle_ps(sfIm70, sfIm70, 177));
__m512 ifft278 = _mm512_fmadd_ps(sfIm74, ifft186, _mm512_shuffle_ps(sfIm74, sfIm74, 177));
__m512 ifft191 = _mm512_fmadd_ps(sfRe71, ifft186, _mm512_shuffle_ps(sfRe71, sfRe71, 177));
__m512 ifft279 = _mm512_fmadd_ps(sfRe75, ifft186, _mm512_shuffle_ps(sfRe75, sfRe75, 177));
__m512 ifft192 = _mm512_fmadd_ps(sfIm71, ifft186, _mm512_shuffle_ps(sfIm71, sfIm71, 177));
__m512 ifft280 = _mm512_fmadd_ps(sfIm75, ifft186, _mm512_shuffle_ps(sfIm75, sfIm75, 177));
__m512 ifft193 = _mm512_fmadd_ps(sfRe72, ifft186, _mm512_shuffle_ps(sfRe72, sfRe72, 177));
__m512 ifft281 = _mm512_fmadd_ps(sfRe76, ifft186, _mm512_shuffle_ps(sfRe76, sfRe76, 177));
__m512 ifft194 = _mm512_fmadd_ps(sfIm72, ifft186, _mm512_shuffle_ps(sfIm72, sfIm72, 177));
__m512 ifft282 = _mm512_fmadd_ps(sfIm76, ifft186, _mm512_shuffle_ps(sfIm76, sfIm76, 177));
__m512 ifft195 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft196 = _mm512_mul_ps(ifft187, ifft195);
__m512 ifft283 = _mm512_mul_ps(ifft275, ifft195);
__m512 ifft197 = _mm512_mul_ps(ifft188, ifft195);
__m512 ifft284 = _mm512_mul_ps(ifft276, ifft195);
__m512 ifft198 = _mm512_mul_ps(ifft189, ifft195);
__m512 ifft285 = _mm512_mul_ps(ifft277, ifft195);
__m512 ifft199 = _mm512_mul_ps(ifft190, ifft195);
__m512 ifft286 = _mm512_mul_ps(ifft278, ifft195);
__m512 ifft200 = _mm512_mul_ps(ifft191, ifft195);
__m512 ifft287 = _mm512_mul_ps(ifft279, ifft195);
__m512 ifft201 = _mm512_mul_ps(ifft192, ifft195);
__m512 ifft288 = _mm512_mul_ps(ifft280, ifft195);
__m512 ifft202 = _mm512_mul_ps(ifft193, ifft195);
__m512 ifft289 = _mm512_mul_ps(ifft281, ifft195);
__m512 ifft203 = _mm512_mul_ps(ifft194, ifft195);
__m512 ifft290 = _mm512_mul_ps(ifft282, ifft195);
__m512 ifft204 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft205 = _mm512_fnmadd_ps(ifft188, ifft204, ifft196);
__m512 ifft291 = _mm512_fnmadd_ps(ifft276, ifft204, ifft283);
__m512 ifft206 = _mm512_fmadd_ps(ifft187, ifft204, ifft197);
__m512 ifft292 = _mm512_fmadd_ps(ifft275, ifft204, ifft284);
__m512 ifft207 = _mm512_fnmadd_ps(ifft190, ifft204, ifft198);
__m512 ifft293 = _mm512_fnmadd_ps(ifft278, ifft204, ifft285);
__m512 ifft208 = _mm512_fmadd_ps(ifft189, ifft204, ifft199);
__m512 ifft294 = _mm512_fmadd_ps(ifft277, ifft204, ifft286);
__m512 ifft209 = _mm512_fnmadd_ps(ifft192, ifft204, ifft200);
__m512 ifft295 = _mm512_fnmadd_ps(ifft280, ifft204, ifft287);
__m512 ifft210 = _mm512_fmadd_ps(ifft191, ifft204, ifft201);
__m512 ifft296 = _mm512_fmadd_ps(ifft279, ifft204, ifft288);
__m512 ifft211 = _mm512_fnmadd_ps(ifft194, ifft204, ifft202);
__m512 ifft297 = _mm512_fnmadd_ps(ifft282, ifft204, ifft289);
__m512 ifft212 = _mm512_fmadd_ps(ifft193, ifft204, ifft203);
__m512 ifft298 = _mm512_fmadd_ps(ifft281, ifft204, ifft290);
__m512 ifft213 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft214 = _mm512_fmadd_ps(ifft205, ifft213, _mm512_shuffle_ps(ifft205, ifft205, 78));
__m512 ifft299 = _mm512_fmadd_ps(ifft291, ifft213, _mm512_shuffle_ps(ifft291, ifft291, 78));
__m512 ifft215 = _mm512_fmadd_ps(ifft206, ifft213, _mm512_shuffle_ps(ifft206, ifft206, 78));
__m512 ifft300 = _mm512_fmadd_ps(ifft292, ifft213, _mm512_shuffle_ps(ifft292, ifft292, 78));
__m512 ifft216 = _mm512_fmadd_ps(ifft207, ifft213, _mm512_shuffle_ps(ifft207, ifft207, 78));
__m512 ifft301 = _mm512_fmadd_ps(ifft293, ifft213, _mm512_shuffle_ps(ifft293, ifft293, 78));
__m512 ifft217 = _mm512_fmadd_ps(ifft208, ifft213, _mm512_shuffle_ps(ifft208, ifft208, 78));
__m512 ifft302 = _mm512_fmadd_ps(ifft294, ifft213, _mm512_shuffle_ps(ifft294, ifft294, 78));
__m512 ifft218 = _mm512_fmadd_ps(ifft209, ifft213, _mm512_shuffle_ps(ifft209, ifft209, 78));
__m512 ifft303 = _mm512_fmadd_ps(ifft295, ifft213, _mm512_shuffle_ps(ifft295, ifft295, 78));
__m512 ifft219 = _mm512_fmadd_ps(ifft210, ifft213, _mm512_shuffle_ps(ifft210, ifft210, 78));
__m512 ifft304 = _mm512_fmadd_ps(ifft296, ifft213, _mm512_shuffle_ps(ifft296, ifft296, 78));
__m512 ifft220 = _mm512_fmadd_ps(ifft211, ifft213, _mm512_shuffle_ps(ifft211, ifft211, 78));
__m512 ifft305 = _mm512_fmadd_ps(ifft297, ifft213, _mm512_shuffle_ps(ifft297, ifft297, 78));
__m512 ifft221 = _mm512_fmadd_ps(ifft212, ifft213, _mm512_shuffle_ps(ifft212, ifft212, 78));
__m512 ifft306 = _mm512_fmadd_ps(ifft298, ifft213, _mm512_shuffle_ps(ifft298, ifft298, 78));
__m512 ifft222 = _mm512_mask_sub_ps(ifft214, 49344, _mm512_setzero_ps(), ifft215);
__m512 ifft307 = _mm512_mask_sub_ps(ifft299, 49344, _mm512_setzero_ps(), ifft300);
__m512 ifft223 = _mm512_mask_mov_ps(ifft215, 49344, ifft214);
__m512 ifft308 = _mm512_mask_mov_ps(ifft300, 49344, ifft299);
__m512 ifft224 = _mm512_mask_sub_ps(ifft216, 49344, _mm512_setzero_ps(), ifft217);
__m512 ifft309 = _mm512_mask_sub_ps(ifft301, 49344, _mm512_setzero_ps(), ifft302);
__m512 ifft225 = _mm512_mask_mov_ps(ifft217, 49344, ifft216);
__m512 ifft310 = _mm512_mask_mov_ps(ifft302, 49344, ifft301);
__m512 ifft226 = _mm512_mask_sub_ps(ifft218, 49344, _mm512_setzero_ps(), ifft219);
__m512 ifft311 = _mm512_mask_sub_ps(ifft303, 49344, _mm512_setzero_ps(), ifft304);
__m512 ifft227 = _mm512_mask_mov_ps(ifft219, 49344, ifft218);
__m512 ifft312 = _mm512_mask_mov_ps(ifft304, 49344, ifft303);
__m512 ifft228 = _mm512_mask_sub_ps(ifft220, 49344, _mm512_setzero_ps(), ifft221);
__m512 ifft313 = _mm512_mask_sub_ps(ifft305, 49344, _mm512_setzero_ps(), ifft306);
__m512 ifft229 = _mm512_mask_mov_ps(ifft221, 49344, ifft220);
__m512 ifft314 = _mm512_mask_mov_ps(ifft306, 49344, ifft305);
__m512 ifft230 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft231 = _mm512_fmadd_ps(ifft222, ifft230, _mm512_shuffle_f32x4(ifft222, ifft222, 177));
__m512 ifft315 = _mm512_fmadd_ps(ifft307, ifft230, _mm512_shuffle_f32x4(ifft307, ifft307, 177));
__m512 ifft232 = _mm512_fmadd_ps(ifft223, ifft230, _mm512_shuffle_f32x4(ifft223, ifft223, 177));
__m512 ifft316 = _mm512_fmadd_ps(ifft308, ifft230, _mm512_shuffle_f32x4(ifft308, ifft308, 177));
__m512 ifft233 = _mm512_fmadd_ps(ifft224, ifft230, _mm512_shuffle_f32x4(ifft224, ifft224, 177));
__m512 ifft317 = _mm512_fmadd_ps(ifft309, ifft230, _mm512_shuffle_f32x4(ifft309, ifft309, 177));
__m512 ifft234 = _mm512_fmadd_ps(ifft225, ifft230, _mm512_shuffle_f32x4(ifft225, ifft225, 177));
__m512 ifft318 = _mm512_fmadd_ps(ifft310, ifft230, _mm512_shuffle_f32x4(ifft310, ifft310, 177));
__m512 ifft235 = _mm512_fmadd_ps(ifft226, ifft230, _mm512_shuffle_f32x4(ifft226, ifft226, 177));
__m512 ifft319 = _mm512_fmadd_ps(ifft311, ifft230, _mm512_shuffle_f32x4(ifft311, ifft311, 177));
__m512 ifft236 = _mm512_fnmsub_ps(ifft227, ifft230, _mm512_shuffle_f32x4(ifft227, ifft227, 177));
__m512 ifft320 = _mm512_fnmsub_ps(ifft312, ifft230, _mm512_shuffle_f32x4(ifft312, ifft312, 177));
__m512 ifft237 = _mm512_fmadd_ps(ifft228, ifft230, _mm512_shuffle_f32x4(ifft228, ifft228, 177));
__m512 ifft321 = _mm512_fmadd_ps(ifft313, ifft230, _mm512_shuffle_f32x4(ifft313, ifft313, 177));
__m512 ifft238 = _mm512_fmadd_ps(ifft229, ifft230, _mm512_shuffle_f32x4(ifft229, ifft229, 177));
__m512 ifft322 = _mm512_fmadd_ps(ifft314, ifft230, _mm512_shuffle_f32x4(ifft314, ifft314, 177));
__m512 ifft239 = _mm512_add_ps(ifft231, ifft232);
__m512 ifft323 = _mm512_add_ps(ifft315, ifft316);
__m512 ifft240 = _mm512_sub_ps(ifft231, ifft232);
__m512 ifft324 = _mm512_sub_ps(ifft315, ifft316);
__m512 ifft241 = _mm512_sub_ps(ifft233, ifft237);
__m512 ifft325 = _mm512_sub_ps(ifft317, ifft321);
__m512 ifft242 = _mm512_add_ps(ifft234, ifft238);
__m512 ifft326 = _mm512_add_ps(ifft318, ifft322);
__m512 ifft243 = _mm512_add_ps(ifft233, ifft237);
__m512 ifft327 = _mm512_add_ps(ifft317, ifft321);
__m512 ifft244 = _mm512_sub_ps(ifft234, ifft238);
__m512 ifft328 = _mm512_sub_ps(ifft318, ifft322);
__m512 ifft245 = _mm512_mul_ps(ifft235, _mm512_set1_ps(3.125e-02f));
__m512 ifft329 = _mm512_mul_ps(ifft319, _mm512_set1_ps(3.125e-02f));
__m512 ifft246 = _mm512_mul_ps(ifft236, _mm512_set1_ps(3.125e-02f));
__m512 ifft330 = _mm512_mul_ps(ifft320, _mm512_set1_ps(3.125e-02f));
__m512 ifft247 = _mm512_fmadd_ps(ifft239, _mm512_set1_ps(1.5625e-02f), ifft245);
__m512 ifft331 = _mm512_fmadd_ps(ifft323, _mm512_set1_ps(1.5625e-02f), ifft329);
__m512 ifft248 = _mm512_fmsub_ps(ifft239, _mm512_set1_ps(1.5625e-02f), ifft245);
__m512 ifft332 = _mm512_fmsub_ps(ifft323, _mm512_set1_ps(1.5625e-02f), ifft329);
__m512 ifft249 = _mm512_fmadd_ps(ifft240, _mm512_set1_ps(1.5625e-02f), ifft246);
__m512 ifft333 = _mm512_fmadd_ps(ifft324, _mm512_set1_ps(1.5625e-02f), ifft330);
__m512 ifft250 = _mm512_fmsub_ps(ifft240, _mm512_set1_ps(1.5625e-02f), ifft246);
__m512 ifft334 = _mm512_fmsub_ps(ifft324, _mm512_set1_ps(1.5625e-02f), ifft330);
__m512 ifft251 = _mm512_add_ps(ifft241, ifft242);
__m512 ifft335 = _mm512_add_ps(ifft325, ifft326);
__m512 ifft252 = _mm512_sub_ps(ifft241, ifft242);
__m512 ifft336 = _mm512_sub_ps(ifft325, ifft326);
__m512 ifft253 = _mm512_fnmadd_ps(ifft251, _mm512_set1_ps(7.0710677e-01f), ifft243);
__m512 ifft337 = _mm512_fnmadd_ps(ifft335, _mm512_set1_ps(7.0710677e-01f), ifft327);
__m512 ifft254 = _mm512_fmadd_ps(ifft251, _mm512_set1_ps(7.0710677e-01f), ifft243);
__m512 ifft338 = _mm512_fmadd_ps(ifft335, _mm512_set1_ps(7.0710677e-01f), ifft327);
__m512 ifft255 = _mm512_fmadd_ps(ifft252, _mm512_set1_ps(7.0710677e-01f), ifft244);
__m512 ifft339 = _mm512_fmadd_ps(ifft336, _mm512_set1_ps(7.0710677e-01f), ifft328);
__m512 ifft256 = _mm512_fmsub_ps(ifft252, _mm512_set1_ps(7.0710677e-01f), ifft244);
__m512 ifft340 = _mm512_fmsub_ps(ifft336, _mm512_set1_ps(7.0710677e-01f), ifft328);
__m512 ifft257 = _mm512_add_ps(ifft253, ifft254);
__m512 ifft341 = _mm512_add_ps(ifft337, ifft338);
__m512 ifft258 = _mm512_sub_ps(ifft253, ifft254);
__m512 ifft342 = _mm512_sub_ps(ifft337, ifft338);
__m512 ifft259 = _mm512_add_ps(ifft255, ifft256);
__m512 ifft343 = _mm512_add_ps(ifft339, ifft340);
__m512 ifft260 = _mm512_sub_ps(ifft255, ifft256);
__m512 ifft344 = _mm512_sub_ps(ifft339, ifft340);
__m512 ifft261 = _mm512_fmadd_ps(ifft257, _mm512_set1_ps(1.5625e-02f), ifft247);
__m512 ifft345 = _mm512_fmadd_ps(ifft341, _mm512_set1_ps(1.5625e-02f), ifft331);
__m512 ifft262 = _mm512_fnmadd_ps(ifft257, _mm512_set1_ps(1.5625e-02f), ifft247);
__m512 ifft346 = _mm512_fnmadd_ps(ifft341, _mm512_set1_ps(1.5625e-02f), ifft331);
__m512 ifft263 = _mm512_fmadd_ps(ifft259, _mm512_set1_ps(1.5625e-02f), ifft249);
__m512 ifft347 = _mm512_fmadd_ps(ifft343, _mm512_set1_ps(1.5625e-02f), ifft333);
__m512 ifft264 = _mm512_fnmadd_ps(ifft259, _mm512_set1_ps(1.5625e-02f), ifft249);
__m512 ifft348 = _mm512_fnmadd_ps(ifft343, _mm512_set1_ps(1.5625e-02f), ifft333);
__m512 ifft265 = _mm512_fnmadd_ps(ifft260, _mm512_set1_ps(1.5625e-02f), ifft248);
__m512 ifft349 = _mm512_fnmadd_ps(ifft344, _mm512_set1_ps(1.5625e-02f), ifft332);
__m512 ifft266 = _mm512_fmadd_ps(ifft260, _mm512_set1_ps(1.5625e-02f), ifft248);
__m512 ifft350 = _mm512_fmadd_ps(ifft344, _mm512_set1_ps(1.5625e-02f), ifft332);
__m512 ifft267 = _mm512_fmadd_ps(ifft258, _mm512_set1_ps(1.5625e-02f), ifft250);
__m512 ifft351 = _mm512_fmadd_ps(ifft342, _mm512_set1_ps(1.5625e-02f), ifft334);
__m512 ifft268 = _mm512_fnmadd_ps(ifft258, _mm512_set1_ps(1.5625e-02f), ifft250);
__m512 ifft352 = _mm512_fnmadd_ps(ifft342, _mm512_set1_ps(1.5625e-02f), ifft334);
__m512 dat87 = ifft261;
__m512 dat93 = ifft345;
__m512 dat88 = ifft263;
__m512 dat94 = ifft347;
__m512 dat89 = ifft265;
__m512 dat95 = ifft349;
__m512 dat90 = ifft267;
__m512 dat96 = ifft351;
__m512 dat91 = ifft262;
__m512 dat97 = ifft346;
__m512 dat92 = ifft264;
__m512 dat98 = ifft348;
(void)ifft266;
(void)ifft350;
(void)ifft268;
(void)ifft352;
__m512i pm3 = _mm512_set_epi32(3, 2, 1, 0, 21, 20, 19, 18, 17, 16, 5, 4, 3, 2, 1, 0);
__m512 pack13 = _mm512_permutex2var_ps(dat87, pm3, dat93);
__m512i pm4 = _mm512_set_epi32(27, 26, 25, 24, 13, 12, 11, 10, 9, 8, 29, 28, 27, 26, 25, 24);
__m512 pack14 = _mm512_permutex2var_ps(dat87, pm4, dat93);
__m512 pack15 = _mm512_permutex2var_ps(dat88, pm3, dat94);
__m512 pack16 = _mm512_permutex2var_ps(dat88, pm4, dat94);
__m512 pack17 = _mm512_permutex2var_ps(dat89, pm3, dat95);
__m512 pack18 = _mm512_permutex2var_ps(dat89, pm4, dat95);
__m512 pack19 = _mm512_permutex2var_ps(dat90, pm3, dat96);
__m512 pack20 = _mm512_permutex2var_ps(dat90, pm4, dat96);
__m512 pack21 = _mm512_permutex2var_ps(dat91, pm3, dat97);
__m512 pack22 = _mm512_permutex2var_ps(dat91, pm4, dat97);
__m512 pack23 = _mm512_permutex2var_ps(dat92, pm3, dat98);
__m512 pack24 = _mm512_permutex2var_ps(dat92, pm4, dat98);
_mm512_mask_storeu_ps(datPtr2+240+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t3, 1023, pack13);
_mm512_mask_storeu_ps(datPtr2+880+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t3, 1023, pack14);
_mm512_mask_storeu_ps(datPtr2+280+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t3, 1023, pack15);
_mm512_mask_storeu_ps(datPtr2+920+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t3, 1023, pack16);
_mm512_mask_storeu_ps(datPtr2+320+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t3, 1023, pack17);
_mm512_mask_storeu_ps(datPtr2+960+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t3, 1023, pack18);
_mm512_mask_storeu_ps(datPtr2+360+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t3, 1023, pack19);
_mm512_mask_storeu_ps(datPtr2+1000+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t3, 1023, pack20);
_mm512_mask_storeu_ps(datPtr2+400+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t3, 1023, pack21);
_mm512_mask_storeu_ps(datPtr2+1040+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t3, 1023, pack22);
_mm512_mask_storeu_ps(datPtr2+440+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t3, 1023, pack23);
_mm512_mask_storeu_ps(datPtr2+1080+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t3, 1023, pack24);
ptrdiff_t t4 = 0;
__m512 sfRe77 = _mm512_loadu_ps(sfPtr3+512+93696*i9+23424*j5+1536*k8+768*r2+256*t4);
__m512 sfIm77 = _mm512_loadu_ps(sfPtr3+576+93696*i9+23424*j5+1536*k8+768*r2+256*t4);
__m512 sfRe81 = _mm512_loadu_ps(sfPtr3+640+93696*i9+23424*j5+1536*k8+768*r2+256*t4);
__m512 sfIm81 = _mm512_loadu_ps(sfPtr3+704+93696*i9+23424*j5+1536*k8+768*r2+256*t4);
__m512 sfRe78 = _mm512_loadu_ps(sfPtr3+23936+93696*i9+23424*j5+1536*k8+768*r2+256*t4);
__m512 sfIm78 = _mm512_loadu_ps(sfPtr3+24000+93696*i9+23424*j5+1536*k8+768*r2+256*t4);
__m512 sfRe82 = _mm512_loadu_ps(sfPtr3+24064+93696*i9+23424*j5+1536*k8+768*r2+256*t4);
__m512 sfIm82 = _mm512_loadu_ps(sfPtr3+24128+93696*i9+23424*j5+1536*k8+768*r2+256*t4);
__m512 sfRe79 = _mm512_loadu_ps(sfPtr3+47360+93696*i9+23424*j5+1536*k8+768*r2+256*t4);
__m512 sfIm79 = _mm512_loadu_ps(sfPtr3+47424+93696*i9+23424*j5+1536*k8+768*r2+256*t4);
__m512 sfRe83 = _mm512_loadu_ps(sfPtr3+47488+93696*i9+23424*j5+1536*k8+768*r2+256*t4);
__m512 sfIm83 = _mm512_loadu_ps(sfPtr3+47552+93696*i9+23424*j5+1536*k8+768*r2+256*t4);
__m512 sfRe80 = _mm512_loadu_ps(sfPtr3+70784+93696*i9+23424*j5+1536*k8+768*r2+256*t4);
__m512 sfIm80 = _mm512_loadu_ps(sfPtr3+70848+93696*i9+23424*j5+1536*k8+768*r2+256*t4);
__m512 sfRe84 = _mm512_loadu_ps(sfPtr3+70912+93696*i9+23424*j5+1536*k8+768*r2+256*t4);
__m512 sfIm84 = _mm512_loadu_ps(sfPtr3+70976+93696*i9+23424*j5+1536*k8+768*r2+256*t4);
__m512i ifft353 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft354 = _mm512_permutexvar_ps(ifft353, sfRe77);
__m512 ifft445 = _mm512_permutexvar_ps(ifft353, sfRe81);
__m512i ifft355 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft356 = _mm512_permutexvar_ps(ifft355, sfRe77);
__m512 ifft446 = _mm512_permutexvar_ps(ifft355, sfRe81);
__m512 ifft357 = _mm512_permutexvar_ps(ifft353, sfIm77);
__m512 ifft447 = _mm512_permutexvar_ps(ifft353, sfIm81);
__m512 ifft358 = _mm512_permutexvar_ps(ifft355, sfIm77);
__m512 ifft448 = _mm512_permutexvar_ps(ifft355, sfIm81);
__m512 ifft359 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft360 = _mm512_mask_fmadd_ps(ifft358, 65021, ifft359, ifft354);
__m512 ifft449 = _mm512_mask_fmadd_ps(ifft448, 65021, ifft359, ifft445);
__m512 ifft361 = _mm512_mask_fnmadd_ps(ifft357, 65021, ifft359, ifft356);
__m512 ifft450 = _mm512_mask_fnmadd_ps(ifft447, 65021, ifft359, ifft446);
__m512 ifft362 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft363 = _mm512_fmadd_ps(ifft360, ifft362, _mm512_shuffle_ps(ifft360, ifft360, 177));
__m512 ifft451 = _mm512_fmadd_ps(ifft449, ifft362, _mm512_shuffle_ps(ifft449, ifft449, 177));
__m512 ifft364 = _mm512_fmadd_ps(ifft361, ifft362, _mm512_shuffle_ps(ifft361, ifft361, 177));
__m512 ifft452 = _mm512_fmadd_ps(ifft450, ifft362, _mm512_shuffle_ps(ifft450, ifft450, 177));
__m512 ifft365 = _mm512_fmadd_ps(sfRe78, ifft362, _mm512_shuffle_ps(sfRe78, sfRe78, 177));
__m512 ifft453 = _mm512_fmadd_ps(sfRe82, ifft362, _mm512_shuffle_ps(sfRe82, sfRe82, 177));
__m512 ifft366 = _mm512_fmadd_ps(sfIm78, ifft362, _mm512_shuffle_ps(sfIm78, sfIm78, 177));
__m512 ifft454 = _mm512_fmadd_ps(sfIm82, ifft362, _mm512_shuffle_ps(sfIm82, sfIm82, 177));
__m512 ifft367 = _mm512_fmadd_ps(sfRe79, ifft362, _mm512_shuffle_ps(sfRe79, sfRe79, 177));
__m512 ifft455 = _mm512_fmadd_ps(sfRe83, ifft362, _mm512_shuffle_ps(sfRe83, sfRe83, 177));
__m512 ifft368 = _mm512_fmadd_ps(sfIm79, ifft362, _mm512_shuffle_ps(sfIm79, sfIm79, 177));
__m512 ifft456 = _mm512_fmadd_ps(sfIm83, ifft362, _mm512_shuffle_ps(sfIm83, sfIm83, 177));
__m512 ifft369 = _mm512_fmadd_ps(sfRe80, ifft362, _mm512_shuffle_ps(sfRe80, sfRe80, 177));
__m512 ifft457 = _mm512_fmadd_ps(sfRe84, ifft362, _mm512_shuffle_ps(sfRe84, sfRe84, 177));
__m512 ifft370 = _mm512_fmadd_ps(sfIm80, ifft362, _mm512_shuffle_ps(sfIm80, sfIm80, 177));
__m512 ifft458 = _mm512_fmadd_ps(sfIm84, ifft362, _mm512_shuffle_ps(sfIm84, sfIm84, 177));
__m512 ifft371 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft372 = _mm512_mul_ps(ifft363, ifft371);
__m512 ifft459 = _mm512_mul_ps(ifft451, ifft371);
__m512 ifft373 = _mm512_mul_ps(ifft364, ifft371);
__m512 ifft460 = _mm512_mul_ps(ifft452, ifft371);
__m512 ifft374 = _mm512_mul_ps(ifft365, ifft371);
__m512 ifft461 = _mm512_mul_ps(ifft453, ifft371);
__m512 ifft375 = _mm512_mul_ps(ifft366, ifft371);
__m512 ifft462 = _mm512_mul_ps(ifft454, ifft371);
__m512 ifft376 = _mm512_mul_ps(ifft367, ifft371);
__m512 ifft463 = _mm512_mul_ps(ifft455, ifft371);
__m512 ifft377 = _mm512_mul_ps(ifft368, ifft371);
__m512 ifft464 = _mm512_mul_ps(ifft456, ifft371);
__m512 ifft378 = _mm512_mul_ps(ifft369, ifft371);
__m512 ifft465 = _mm512_mul_ps(ifft457, ifft371);
__m512 ifft379 = _mm512_mul_ps(ifft370, ifft371);
__m512 ifft466 = _mm512_mul_ps(ifft458, ifft371);
__m512 ifft380 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft381 = _mm512_fnmadd_ps(ifft364, ifft380, ifft372);
__m512 ifft467 = _mm512_fnmadd_ps(ifft452, ifft380, ifft459);
__m512 ifft382 = _mm512_fmadd_ps(ifft363, ifft380, ifft373);
__m512 ifft468 = _mm512_fmadd_ps(ifft451, ifft380, ifft460);
__m512 ifft383 = _mm512_fnmadd_ps(ifft366, ifft380, ifft374);
__m512 ifft469 = _mm512_fnmadd_ps(ifft454, ifft380, ifft461);
__m512 ifft384 = _mm512_fmadd_ps(ifft365, ifft380, ifft375);
__m512 ifft470 = _mm512_fmadd_ps(ifft453, ifft380, ifft462);
__m512 ifft385 = _mm512_fnmadd_ps(ifft368, ifft380, ifft376);
__m512 ifft471 = _mm512_fnmadd_ps(ifft456, ifft380, ifft463);
__m512 ifft386 = _mm512_fmadd_ps(ifft367, ifft380, ifft377);
__m512 ifft472 = _mm512_fmadd_ps(ifft455, ifft380, ifft464);
__m512 ifft387 = _mm512_fnmadd_ps(ifft370, ifft380, ifft378);
__m512 ifft473 = _mm512_fnmadd_ps(ifft458, ifft380, ifft465);
__m512 ifft388 = _mm512_fmadd_ps(ifft369, ifft380, ifft379);
__m512 ifft474 = _mm512_fmadd_ps(ifft457, ifft380, ifft466);
__m512 ifft389 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft390 = _mm512_fmadd_ps(ifft381, ifft389, _mm512_shuffle_ps(ifft381, ifft381, 78));
__m512 ifft475 = _mm512_fmadd_ps(ifft467, ifft389, _mm512_shuffle_ps(ifft467, ifft467, 78));
__m512 ifft391 = _mm512_fmadd_ps(ifft382, ifft389, _mm512_shuffle_ps(ifft382, ifft382, 78));
__m512 ifft476 = _mm512_fmadd_ps(ifft468, ifft389, _mm512_shuffle_ps(ifft468, ifft468, 78));
__m512 ifft392 = _mm512_fmadd_ps(ifft383, ifft389, _mm512_shuffle_ps(ifft383, ifft383, 78));
__m512 ifft477 = _mm512_fmadd_ps(ifft469, ifft389, _mm512_shuffle_ps(ifft469, ifft469, 78));
__m512 ifft393 = _mm512_fmadd_ps(ifft384, ifft389, _mm512_shuffle_ps(ifft384, ifft384, 78));
__m512 ifft478 = _mm512_fmadd_ps(ifft470, ifft389, _mm512_shuffle_ps(ifft470, ifft470, 78));
__m512 ifft394 = _mm512_fmadd_ps(ifft385, ifft389, _mm512_shuffle_ps(ifft385, ifft385, 78));
__m512 ifft479 = _mm512_fmadd_ps(ifft471, ifft389, _mm512_shuffle_ps(ifft471, ifft471, 78));
__m512 ifft395 = _mm512_fmadd_ps(ifft386, ifft389, _mm512_shuffle_ps(ifft386, ifft386, 78));
__m512 ifft480 = _mm512_fmadd_ps(ifft472, ifft389, _mm512_shuffle_ps(ifft472, ifft472, 78));
__m512 ifft396 = _mm512_fmadd_ps(ifft387, ifft389, _mm512_shuffle_ps(ifft387, ifft387, 78));
__m512 ifft481 = _mm512_fmadd_ps(ifft473, ifft389, _mm512_shuffle_ps(ifft473, ifft473, 78));
__m512 ifft397 = _mm512_fmadd_ps(ifft388, ifft389, _mm512_shuffle_ps(ifft388, ifft388, 78));
__m512 ifft482 = _mm512_fmadd_ps(ifft474, ifft389, _mm512_shuffle_ps(ifft474, ifft474, 78));
__m512 ifft398 = _mm512_mask_sub_ps(ifft390, 49344, _mm512_setzero_ps(), ifft391);
__m512 ifft483 = _mm512_mask_sub_ps(ifft475, 49344, _mm512_setzero_ps(), ifft476);
__m512 ifft399 = _mm512_mask_mov_ps(ifft391, 49344, ifft390);
__m512 ifft484 = _mm512_mask_mov_ps(ifft476, 49344, ifft475);
__m512 ifft400 = _mm512_mask_sub_ps(ifft392, 49344, _mm512_setzero_ps(), ifft393);
__m512 ifft485 = _mm512_mask_sub_ps(ifft477, 49344, _mm512_setzero_ps(), ifft478);
__m512 ifft401 = _mm512_mask_mov_ps(ifft393, 49344, ifft392);
__m512 ifft486 = _mm512_mask_mov_ps(ifft478, 49344, ifft477);
__m512 ifft402 = _mm512_mask_sub_ps(ifft394, 49344, _mm512_setzero_ps(), ifft395);
__m512 ifft487 = _mm512_mask_sub_ps(ifft479, 49344, _mm512_setzero_ps(), ifft480);
__m512 ifft403 = _mm512_mask_mov_ps(ifft395, 49344, ifft394);
__m512 ifft488 = _mm512_mask_mov_ps(ifft480, 49344, ifft479);
__m512 ifft404 = _mm512_mask_sub_ps(ifft396, 49344, _mm512_setzero_ps(), ifft397);
__m512 ifft489 = _mm512_mask_sub_ps(ifft481, 49344, _mm512_setzero_ps(), ifft482);
__m512 ifft405 = _mm512_mask_mov_ps(ifft397, 49344, ifft396);
__m512 ifft490 = _mm512_mask_mov_ps(ifft482, 49344, ifft481);
__m512 ifft406 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft407 = _mm512_fmadd_ps(ifft398, ifft406, _mm512_shuffle_f32x4(ifft398, ifft398, 177));
__m512 ifft491 = _mm512_fmadd_ps(ifft483, ifft406, _mm512_shuffle_f32x4(ifft483, ifft483, 177));
__m512 ifft408 = _mm512_fmadd_ps(ifft399, ifft406, _mm512_shuffle_f32x4(ifft399, ifft399, 177));
__m512 ifft492 = _mm512_fmadd_ps(ifft484, ifft406, _mm512_shuffle_f32x4(ifft484, ifft484, 177));
__m512 ifft409 = _mm512_fmadd_ps(ifft400, ifft406, _mm512_shuffle_f32x4(ifft400, ifft400, 177));
__m512 ifft493 = _mm512_fmadd_ps(ifft485, ifft406, _mm512_shuffle_f32x4(ifft485, ifft485, 177));
__m512 ifft410 = _mm512_fmadd_ps(ifft401, ifft406, _mm512_shuffle_f32x4(ifft401, ifft401, 177));
__m512 ifft494 = _mm512_fmadd_ps(ifft486, ifft406, _mm512_shuffle_f32x4(ifft486, ifft486, 177));
__m512 ifft411 = _mm512_fmadd_ps(ifft402, ifft406, _mm512_shuffle_f32x4(ifft402, ifft402, 177));
__m512 ifft495 = _mm512_fmadd_ps(ifft487, ifft406, _mm512_shuffle_f32x4(ifft487, ifft487, 177));
__m512 ifft412 = _mm512_fnmsub_ps(ifft403, ifft406, _mm512_shuffle_f32x4(ifft403, ifft403, 177));
__m512 ifft496 = _mm512_fnmsub_ps(ifft488, ifft406, _mm512_shuffle_f32x4(ifft488, ifft488, 177));
__m512 ifft413 = _mm512_fmadd_ps(ifft404, ifft406, _mm512_shuffle_f32x4(ifft404, ifft404, 177));
__m512 ifft497 = _mm512_fmadd_ps(ifft489, ifft406, _mm512_shuffle_f32x4(ifft489, ifft489, 177));
__m512 ifft414 = _mm512_fmadd_ps(ifft405, ifft406, _mm512_shuffle_f32x4(ifft405, ifft405, 177));
__m512 ifft498 = _mm512_fmadd_ps(ifft490, ifft406, _mm512_shuffle_f32x4(ifft490, ifft490, 177));
__m512 ifft415 = _mm512_add_ps(ifft407, ifft408);
__m512 ifft499 = _mm512_add_ps(ifft491, ifft492);
__m512 ifft416 = _mm512_sub_ps(ifft407, ifft408);
__m512 ifft500 = _mm512_sub_ps(ifft491, ifft492);
__m512 ifft417 = _mm512_sub_ps(ifft409, ifft413);
__m512 ifft501 = _mm512_sub_ps(ifft493, ifft497);
__m512 ifft418 = _mm512_add_ps(ifft410, ifft414);
__m512 ifft502 = _mm512_add_ps(ifft494, ifft498);
__m512 ifft419 = _mm512_add_ps(ifft409, ifft413);
__m512 ifft503 = _mm512_add_ps(ifft493, ifft497);
__m512 ifft420 = _mm512_sub_ps(ifft410, ifft414);
__m512 ifft504 = _mm512_sub_ps(ifft494, ifft498);
__m512 ifft421 = _mm512_mul_ps(ifft411, _mm512_set1_ps(3.125e-02f));
__m512 ifft505 = _mm512_mul_ps(ifft495, _mm512_set1_ps(3.125e-02f));
__m512 ifft422 = _mm512_mul_ps(ifft412, _mm512_set1_ps(3.125e-02f));
__m512 ifft506 = _mm512_mul_ps(ifft496, _mm512_set1_ps(3.125e-02f));
__m512 ifft423 = _mm512_fmadd_ps(ifft415, _mm512_set1_ps(1.5625e-02f), ifft421);
__m512 ifft507 = _mm512_fmadd_ps(ifft499, _mm512_set1_ps(1.5625e-02f), ifft505);
__m512 ifft424 = _mm512_fmsub_ps(ifft415, _mm512_set1_ps(1.5625e-02f), ifft421);
__m512 ifft508 = _mm512_fmsub_ps(ifft499, _mm512_set1_ps(1.5625e-02f), ifft505);
__m512 ifft425 = _mm512_fmadd_ps(ifft416, _mm512_set1_ps(1.5625e-02f), ifft422);
__m512 ifft509 = _mm512_fmadd_ps(ifft500, _mm512_set1_ps(1.5625e-02f), ifft506);
__m512 ifft426 = _mm512_fmsub_ps(ifft416, _mm512_set1_ps(1.5625e-02f), ifft422);
__m512 ifft510 = _mm512_fmsub_ps(ifft500, _mm512_set1_ps(1.5625e-02f), ifft506);
__m512 ifft427 = _mm512_add_ps(ifft417, ifft418);
__m512 ifft511 = _mm512_add_ps(ifft501, ifft502);
__m512 ifft428 = _mm512_sub_ps(ifft417, ifft418);
__m512 ifft512 = _mm512_sub_ps(ifft501, ifft502);
__m512 ifft429 = _mm512_fnmadd_ps(ifft427, _mm512_set1_ps(7.0710677e-01f), ifft419);
__m512 ifft513 = _mm512_fnmadd_ps(ifft511, _mm512_set1_ps(7.0710677e-01f), ifft503);
__m512 ifft430 = _mm512_fmadd_ps(ifft427, _mm512_set1_ps(7.0710677e-01f), ifft419);
__m512 ifft514 = _mm512_fmadd_ps(ifft511, _mm512_set1_ps(7.0710677e-01f), ifft503);
__m512 ifft431 = _mm512_fmadd_ps(ifft428, _mm512_set1_ps(7.0710677e-01f), ifft420);
__m512 ifft515 = _mm512_fmadd_ps(ifft512, _mm512_set1_ps(7.0710677e-01f), ifft504);
__m512 ifft432 = _mm512_fmsub_ps(ifft428, _mm512_set1_ps(7.0710677e-01f), ifft420);
__m512 ifft516 = _mm512_fmsub_ps(ifft512, _mm512_set1_ps(7.0710677e-01f), ifft504);
__m512 ifft433 = _mm512_add_ps(ifft429, ifft430);
__m512 ifft517 = _mm512_add_ps(ifft513, ifft514);
__m512 ifft434 = _mm512_sub_ps(ifft429, ifft430);
__m512 ifft518 = _mm512_sub_ps(ifft513, ifft514);
__m512 ifft435 = _mm512_add_ps(ifft431, ifft432);
__m512 ifft519 = _mm512_add_ps(ifft515, ifft516);
__m512 ifft436 = _mm512_sub_ps(ifft431, ifft432);
__m512 ifft520 = _mm512_sub_ps(ifft515, ifft516);
__m512 ifft437 = _mm512_fmadd_ps(ifft433, _mm512_set1_ps(1.5625e-02f), ifft423);
__m512 ifft521 = _mm512_fmadd_ps(ifft517, _mm512_set1_ps(1.5625e-02f), ifft507);
__m512 ifft438 = _mm512_fnmadd_ps(ifft433, _mm512_set1_ps(1.5625e-02f), ifft423);
__m512 ifft522 = _mm512_fnmadd_ps(ifft517, _mm512_set1_ps(1.5625e-02f), ifft507);
__m512 ifft439 = _mm512_fmadd_ps(ifft435, _mm512_set1_ps(1.5625e-02f), ifft425);
__m512 ifft523 = _mm512_fmadd_ps(ifft519, _mm512_set1_ps(1.5625e-02f), ifft509);
__m512 ifft440 = _mm512_fnmadd_ps(ifft435, _mm512_set1_ps(1.5625e-02f), ifft425);
__m512 ifft524 = _mm512_fnmadd_ps(ifft519, _mm512_set1_ps(1.5625e-02f), ifft509);
__m512 ifft441 = _mm512_fnmadd_ps(ifft436, _mm512_set1_ps(1.5625e-02f), ifft424);
__m512 ifft525 = _mm512_fnmadd_ps(ifft520, _mm512_set1_ps(1.5625e-02f), ifft508);
__m512 ifft442 = _mm512_fmadd_ps(ifft436, _mm512_set1_ps(1.5625e-02f), ifft424);
__m512 ifft526 = _mm512_fmadd_ps(ifft520, _mm512_set1_ps(1.5625e-02f), ifft508);
__m512 ifft443 = _mm512_fmadd_ps(ifft434, _mm512_set1_ps(1.5625e-02f), ifft426);
__m512 ifft527 = _mm512_fmadd_ps(ifft518, _mm512_set1_ps(1.5625e-02f), ifft510);
__m512 ifft444 = _mm512_fnmadd_ps(ifft434, _mm512_set1_ps(1.5625e-02f), ifft426);
__m512 ifft528 = _mm512_fnmadd_ps(ifft518, _mm512_set1_ps(1.5625e-02f), ifft510);
__m512 dat99 = ifft437;
__m512 dat103 = ifft521;
__m512 dat100 = ifft439;
__m512 dat104 = ifft523;
__m512 dat101 = ifft441;
__m512 dat105 = ifft525;
__m512 dat102 = ifft443;
__m512 dat106 = ifft527;
(void)ifft438;
(void)ifft522;
(void)ifft440;
(void)ifft524;
(void)ifft442;
(void)ifft526;
(void)ifft444;
(void)ifft528;
__m512i pm5 = _mm512_set_epi32(3, 2, 1, 0, 21, 20, 19, 18, 17, 16, 5, 4, 3, 2, 1, 0);
__m512 pack25 = _mm512_permutex2var_ps(dat99, pm5, dat103);
__m512i pm6 = _mm512_set_epi32(27, 26, 25, 24, 13, 12, 11, 10, 9, 8, 29, 28, 27, 26, 25, 24);
__m512 pack26 = _mm512_permutex2var_ps(dat99, pm6, dat103);
__m512 pack27 = _mm512_permutex2var_ps(dat100, pm5, dat104);
__m512 pack28 = _mm512_permutex2var_ps(dat100, pm6, dat104);
__m512 pack29 = _mm512_permutex2var_ps(dat101, pm5, dat105);
__m512 pack30 = _mm512_permutex2var_ps(dat101, pm6, dat105);
__m512 pack31 = _mm512_permutex2var_ps(dat102, pm5, dat106);
__m512 pack32 = _mm512_permutex2var_ps(dat102, pm6, dat106);
_mm512_mask_storeu_ps(datPtr2+480+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t4, 1023, pack25);
_mm512_mask_storeu_ps(datPtr2+1120+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t4, 1023, pack26);
_mm512_mask_storeu_ps(datPtr2+520+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t4, 1023, pack27);
_mm512_mask_storeu_ps(datPtr2+1160+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t4, 1023, pack28);
_mm512_mask_storeu_ps(datPtr2+560+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t4, 1023, pack29);
_mm512_mask_storeu_ps(datPtr2+1200+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t4, 1023, pack30);
_mm512_mask_storeu_ps(datPtr2+600+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t4, 1023, pack31);
_mm512_mask_storeu_ps(datPtr2+1240+39040*i9+2560*k8+1280*r2+40*toH1+4*toW1+0*t4, 1023, pack32);
}
}
ptrdiff_t r3 = 0;
ptrdiff_t t5 = 0;
__m512 sfRe85 = _mm512_loadu_ps(sfPtr3+0+93696*i9+23424*j5+1536*k8+768*r3+256*t5);
__m512 sfIm85 = _mm512_loadu_ps(sfPtr3+64+93696*i9+23424*j5+1536*k8+768*r3+256*t5);
__m512 sfRe89 = _mm512_loadu_ps(sfPtr3+128+93696*i9+23424*j5+1536*k8+768*r3+256*t5);
__m512 sfIm89 = _mm512_loadu_ps(sfPtr3+192+93696*i9+23424*j5+1536*k8+768*r3+256*t5);
__m512 sfRe86 = _mm512_loadu_ps(sfPtr3+23424+93696*i9+23424*j5+1536*k8+768*r3+256*t5);
__m512 sfIm86 = _mm512_loadu_ps(sfPtr3+23488+93696*i9+23424*j5+1536*k8+768*r3+256*t5);
__m512 sfRe90 = _mm512_loadu_ps(sfPtr3+23552+93696*i9+23424*j5+1536*k8+768*r3+256*t5);
__m512 sfIm90 = _mm512_loadu_ps(sfPtr3+23616+93696*i9+23424*j5+1536*k8+768*r3+256*t5);
__m512 sfRe87 = _mm512_loadu_ps(sfPtr3+46848+93696*i9+23424*j5+1536*k8+768*r3+256*t5);
__m512 sfIm87 = _mm512_loadu_ps(sfPtr3+46912+93696*i9+23424*j5+1536*k8+768*r3+256*t5);
__m512 sfRe91 = _mm512_loadu_ps(sfPtr3+46976+93696*i9+23424*j5+1536*k8+768*r3+256*t5);
__m512 sfIm91 = _mm512_loadu_ps(sfPtr3+47040+93696*i9+23424*j5+1536*k8+768*r3+256*t5);
__m512 sfRe88 = _mm512_loadu_ps(sfPtr3+70272+93696*i9+23424*j5+1536*k8+768*r3+256*t5);
__m512 sfIm88 = _mm512_loadu_ps(sfPtr3+70336+93696*i9+23424*j5+1536*k8+768*r3+256*t5);
__m512 sfRe92 = _mm512_loadu_ps(sfPtr3+70400+93696*i9+23424*j5+1536*k8+768*r3+256*t5);
__m512 sfIm92 = _mm512_loadu_ps(sfPtr3+70464+93696*i9+23424*j5+1536*k8+768*r3+256*t5);
__m512i ifft529 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft530 = _mm512_permutexvar_ps(ifft529, sfRe85);
__m512 ifft621 = _mm512_permutexvar_ps(ifft529, sfRe89);
__m512i ifft531 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft532 = _mm512_permutexvar_ps(ifft531, sfRe85);
__m512 ifft622 = _mm512_permutexvar_ps(ifft531, sfRe89);
__m512 ifft533 = _mm512_permutexvar_ps(ifft529, sfIm85);
__m512 ifft623 = _mm512_permutexvar_ps(ifft529, sfIm89);
__m512 ifft534 = _mm512_permutexvar_ps(ifft531, sfIm85);
__m512 ifft624 = _mm512_permutexvar_ps(ifft531, sfIm89);
__m512 ifft535 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft536 = _mm512_mask_fmadd_ps(ifft534, 65021, ifft535, ifft530);
__m512 ifft625 = _mm512_mask_fmadd_ps(ifft624, 65021, ifft535, ifft621);
__m512 ifft537 = _mm512_mask_fnmadd_ps(ifft533, 65021, ifft535, ifft532);
__m512 ifft626 = _mm512_mask_fnmadd_ps(ifft623, 65021, ifft535, ifft622);
__m512 ifft538 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft539 = _mm512_fmadd_ps(ifft536, ifft538, _mm512_shuffle_ps(ifft536, ifft536, 177));
__m512 ifft627 = _mm512_fmadd_ps(ifft625, ifft538, _mm512_shuffle_ps(ifft625, ifft625, 177));
__m512 ifft540 = _mm512_fmadd_ps(ifft537, ifft538, _mm512_shuffle_ps(ifft537, ifft537, 177));
__m512 ifft628 = _mm512_fmadd_ps(ifft626, ifft538, _mm512_shuffle_ps(ifft626, ifft626, 177));
__m512 ifft541 = _mm512_fmadd_ps(sfRe86, ifft538, _mm512_shuffle_ps(sfRe86, sfRe86, 177));
__m512 ifft629 = _mm512_fmadd_ps(sfRe90, ifft538, _mm512_shuffle_ps(sfRe90, sfRe90, 177));
__m512 ifft542 = _mm512_fmadd_ps(sfIm86, ifft538, _mm512_shuffle_ps(sfIm86, sfIm86, 177));
__m512 ifft630 = _mm512_fmadd_ps(sfIm90, ifft538, _mm512_shuffle_ps(sfIm90, sfIm90, 177));
__m512 ifft543 = _mm512_fmadd_ps(sfRe87, ifft538, _mm512_shuffle_ps(sfRe87, sfRe87, 177));
__m512 ifft631 = _mm512_fmadd_ps(sfRe91, ifft538, _mm512_shuffle_ps(sfRe91, sfRe91, 177));
__m512 ifft544 = _mm512_fmadd_ps(sfIm87, ifft538, _mm512_shuffle_ps(sfIm87, sfIm87, 177));
__m512 ifft632 = _mm512_fmadd_ps(sfIm91, ifft538, _mm512_shuffle_ps(sfIm91, sfIm91, 177));
__m512 ifft545 = _mm512_fmadd_ps(sfRe88, ifft538, _mm512_shuffle_ps(sfRe88, sfRe88, 177));
__m512 ifft633 = _mm512_fmadd_ps(sfRe92, ifft538, _mm512_shuffle_ps(sfRe92, sfRe92, 177));
__m512 ifft546 = _mm512_fmadd_ps(sfIm88, ifft538, _mm512_shuffle_ps(sfIm88, sfIm88, 177));
__m512 ifft634 = _mm512_fmadd_ps(sfIm92, ifft538, _mm512_shuffle_ps(sfIm92, sfIm92, 177));
__m512 ifft547 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft548 = _mm512_mul_ps(ifft539, ifft547);
__m512 ifft635 = _mm512_mul_ps(ifft627, ifft547);
__m512 ifft549 = _mm512_mul_ps(ifft540, ifft547);
__m512 ifft636 = _mm512_mul_ps(ifft628, ifft547);
__m512 ifft550 = _mm512_mul_ps(ifft541, ifft547);
__m512 ifft637 = _mm512_mul_ps(ifft629, ifft547);
__m512 ifft551 = _mm512_mul_ps(ifft542, ifft547);
__m512 ifft638 = _mm512_mul_ps(ifft630, ifft547);
__m512 ifft552 = _mm512_mul_ps(ifft543, ifft547);
__m512 ifft639 = _mm512_mul_ps(ifft631, ifft547);
__m512 ifft553 = _mm512_mul_ps(ifft544, ifft547);
__m512 ifft640 = _mm512_mul_ps(ifft632, ifft547);
__m512 ifft554 = _mm512_mul_ps(ifft545, ifft547);
__m512 ifft641 = _mm512_mul_ps(ifft633, ifft547);
__m512 ifft555 = _mm512_mul_ps(ifft546, ifft547);
__m512 ifft642 = _mm512_mul_ps(ifft634, ifft547);
__m512 ifft556 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft557 = _mm512_fnmadd_ps(ifft540, ifft556, ifft548);
__m512 ifft643 = _mm512_fnmadd_ps(ifft628, ifft556, ifft635);
__m512 ifft558 = _mm512_fmadd_ps(ifft539, ifft556, ifft549);
__m512 ifft644 = _mm512_fmadd_ps(ifft627, ifft556, ifft636);
__m512 ifft559 = _mm512_fnmadd_ps(ifft542, ifft556, ifft550);
__m512 ifft645 = _mm512_fnmadd_ps(ifft630, ifft556, ifft637);
__m512 ifft560 = _mm512_fmadd_ps(ifft541, ifft556, ifft551);
__m512 ifft646 = _mm512_fmadd_ps(ifft629, ifft556, ifft638);
__m512 ifft561 = _mm512_fnmadd_ps(ifft544, ifft556, ifft552);
__m512 ifft647 = _mm512_fnmadd_ps(ifft632, ifft556, ifft639);
__m512 ifft562 = _mm512_fmadd_ps(ifft543, ifft556, ifft553);
__m512 ifft648 = _mm512_fmadd_ps(ifft631, ifft556, ifft640);
__m512 ifft563 = _mm512_fnmadd_ps(ifft546, ifft556, ifft554);
__m512 ifft649 = _mm512_fnmadd_ps(ifft634, ifft556, ifft641);
__m512 ifft564 = _mm512_fmadd_ps(ifft545, ifft556, ifft555);
__m512 ifft650 = _mm512_fmadd_ps(ifft633, ifft556, ifft642);
__m512 ifft565 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft566 = _mm512_fmadd_ps(ifft557, ifft565, _mm512_shuffle_ps(ifft557, ifft557, 78));
__m512 ifft651 = _mm512_fmadd_ps(ifft643, ifft565, _mm512_shuffle_ps(ifft643, ifft643, 78));
__m512 ifft567 = _mm512_fmadd_ps(ifft558, ifft565, _mm512_shuffle_ps(ifft558, ifft558, 78));
__m512 ifft652 = _mm512_fmadd_ps(ifft644, ifft565, _mm512_shuffle_ps(ifft644, ifft644, 78));
__m512 ifft568 = _mm512_fmadd_ps(ifft559, ifft565, _mm512_shuffle_ps(ifft559, ifft559, 78));
__m512 ifft653 = _mm512_fmadd_ps(ifft645, ifft565, _mm512_shuffle_ps(ifft645, ifft645, 78));
__m512 ifft569 = _mm512_fmadd_ps(ifft560, ifft565, _mm512_shuffle_ps(ifft560, ifft560, 78));
__m512 ifft654 = _mm512_fmadd_ps(ifft646, ifft565, _mm512_shuffle_ps(ifft646, ifft646, 78));
__m512 ifft570 = _mm512_fmadd_ps(ifft561, ifft565, _mm512_shuffle_ps(ifft561, ifft561, 78));
__m512 ifft655 = _mm512_fmadd_ps(ifft647, ifft565, _mm512_shuffle_ps(ifft647, ifft647, 78));
__m512 ifft571 = _mm512_fmadd_ps(ifft562, ifft565, _mm512_shuffle_ps(ifft562, ifft562, 78));
__m512 ifft656 = _mm512_fmadd_ps(ifft648, ifft565, _mm512_shuffle_ps(ifft648, ifft648, 78));
__m512 ifft572 = _mm512_fmadd_ps(ifft563, ifft565, _mm512_shuffle_ps(ifft563, ifft563, 78));
__m512 ifft657 = _mm512_fmadd_ps(ifft649, ifft565, _mm512_shuffle_ps(ifft649, ifft649, 78));
__m512 ifft573 = _mm512_fmadd_ps(ifft564, ifft565, _mm512_shuffle_ps(ifft564, ifft564, 78));
__m512 ifft658 = _mm512_fmadd_ps(ifft650, ifft565, _mm512_shuffle_ps(ifft650, ifft650, 78));
__m512 ifft574 = _mm512_mask_sub_ps(ifft566, 49344, _mm512_setzero_ps(), ifft567);
__m512 ifft659 = _mm512_mask_sub_ps(ifft651, 49344, _mm512_setzero_ps(), ifft652);
__m512 ifft575 = _mm512_mask_mov_ps(ifft567, 49344, ifft566);
__m512 ifft660 = _mm512_mask_mov_ps(ifft652, 49344, ifft651);
__m512 ifft576 = _mm512_mask_sub_ps(ifft568, 49344, _mm512_setzero_ps(), ifft569);
__m512 ifft661 = _mm512_mask_sub_ps(ifft653, 49344, _mm512_setzero_ps(), ifft654);
__m512 ifft577 = _mm512_mask_mov_ps(ifft569, 49344, ifft568);
__m512 ifft662 = _mm512_mask_mov_ps(ifft654, 49344, ifft653);
__m512 ifft578 = _mm512_mask_sub_ps(ifft570, 49344, _mm512_setzero_ps(), ifft571);
__m512 ifft663 = _mm512_mask_sub_ps(ifft655, 49344, _mm512_setzero_ps(), ifft656);
__m512 ifft579 = _mm512_mask_mov_ps(ifft571, 49344, ifft570);
__m512 ifft664 = _mm512_mask_mov_ps(ifft656, 49344, ifft655);
__m512 ifft580 = _mm512_mask_sub_ps(ifft572, 49344, _mm512_setzero_ps(), ifft573);
__m512 ifft665 = _mm512_mask_sub_ps(ifft657, 49344, _mm512_setzero_ps(), ifft658);
__m512 ifft581 = _mm512_mask_mov_ps(ifft573, 49344, ifft572);
__m512 ifft666 = _mm512_mask_mov_ps(ifft658, 49344, ifft657);
__m512 ifft582 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft583 = _mm512_fmadd_ps(ifft574, ifft582, _mm512_shuffle_f32x4(ifft574, ifft574, 177));
__m512 ifft667 = _mm512_fmadd_ps(ifft659, ifft582, _mm512_shuffle_f32x4(ifft659, ifft659, 177));
__m512 ifft584 = _mm512_fmadd_ps(ifft575, ifft582, _mm512_shuffle_f32x4(ifft575, ifft575, 177));
__m512 ifft668 = _mm512_fmadd_ps(ifft660, ifft582, _mm512_shuffle_f32x4(ifft660, ifft660, 177));
__m512 ifft585 = _mm512_fmadd_ps(ifft576, ifft582, _mm512_shuffle_f32x4(ifft576, ifft576, 177));
__m512 ifft669 = _mm512_fmadd_ps(ifft661, ifft582, _mm512_shuffle_f32x4(ifft661, ifft661, 177));
__m512 ifft586 = _mm512_fmadd_ps(ifft577, ifft582, _mm512_shuffle_f32x4(ifft577, ifft577, 177));
__m512 ifft670 = _mm512_fmadd_ps(ifft662, ifft582, _mm512_shuffle_f32x4(ifft662, ifft662, 177));
__m512 ifft587 = _mm512_fmadd_ps(ifft578, ifft582, _mm512_shuffle_f32x4(ifft578, ifft578, 177));
__m512 ifft671 = _mm512_fmadd_ps(ifft663, ifft582, _mm512_shuffle_f32x4(ifft663, ifft663, 177));
__m512 ifft588 = _mm512_fnmsub_ps(ifft579, ifft582, _mm512_shuffle_f32x4(ifft579, ifft579, 177));
__m512 ifft672 = _mm512_fnmsub_ps(ifft664, ifft582, _mm512_shuffle_f32x4(ifft664, ifft664, 177));
__m512 ifft589 = _mm512_fmadd_ps(ifft580, ifft582, _mm512_shuffle_f32x4(ifft580, ifft580, 177));
__m512 ifft673 = _mm512_fmadd_ps(ifft665, ifft582, _mm512_shuffle_f32x4(ifft665, ifft665, 177));
__m512 ifft590 = _mm512_fmadd_ps(ifft581, ifft582, _mm512_shuffle_f32x4(ifft581, ifft581, 177));
__m512 ifft674 = _mm512_fmadd_ps(ifft666, ifft582, _mm512_shuffle_f32x4(ifft666, ifft666, 177));
__m512 ifft591 = _mm512_add_ps(ifft583, ifft584);
__m512 ifft675 = _mm512_add_ps(ifft667, ifft668);
__m512 ifft592 = _mm512_sub_ps(ifft583, ifft584);
__m512 ifft676 = _mm512_sub_ps(ifft667, ifft668);
__m512 ifft593 = _mm512_sub_ps(ifft585, ifft589);
__m512 ifft677 = _mm512_sub_ps(ifft669, ifft673);
__m512 ifft594 = _mm512_add_ps(ifft586, ifft590);
__m512 ifft678 = _mm512_add_ps(ifft670, ifft674);
__m512 ifft595 = _mm512_add_ps(ifft585, ifft589);
__m512 ifft679 = _mm512_add_ps(ifft669, ifft673);
__m512 ifft596 = _mm512_sub_ps(ifft586, ifft590);
__m512 ifft680 = _mm512_sub_ps(ifft670, ifft674);
__m512 ifft597 = _mm512_mul_ps(ifft587, _mm512_set1_ps(3.125e-02f));
__m512 ifft681 = _mm512_mul_ps(ifft671, _mm512_set1_ps(3.125e-02f));
__m512 ifft598 = _mm512_mul_ps(ifft588, _mm512_set1_ps(3.125e-02f));
__m512 ifft682 = _mm512_mul_ps(ifft672, _mm512_set1_ps(3.125e-02f));
__m512 ifft599 = _mm512_fmadd_ps(ifft591, _mm512_set1_ps(1.5625e-02f), ifft597);
__m512 ifft683 = _mm512_fmadd_ps(ifft675, _mm512_set1_ps(1.5625e-02f), ifft681);
__m512 ifft600 = _mm512_fmsub_ps(ifft591, _mm512_set1_ps(1.5625e-02f), ifft597);
__m512 ifft684 = _mm512_fmsub_ps(ifft675, _mm512_set1_ps(1.5625e-02f), ifft681);
__m512 ifft601 = _mm512_fmadd_ps(ifft592, _mm512_set1_ps(1.5625e-02f), ifft598);
__m512 ifft685 = _mm512_fmadd_ps(ifft676, _mm512_set1_ps(1.5625e-02f), ifft682);
__m512 ifft602 = _mm512_fmsub_ps(ifft592, _mm512_set1_ps(1.5625e-02f), ifft598);
__m512 ifft686 = _mm512_fmsub_ps(ifft676, _mm512_set1_ps(1.5625e-02f), ifft682);
__m512 ifft603 = _mm512_add_ps(ifft593, ifft594);
__m512 ifft687 = _mm512_add_ps(ifft677, ifft678);
__m512 ifft604 = _mm512_sub_ps(ifft593, ifft594);
__m512 ifft688 = _mm512_sub_ps(ifft677, ifft678);
__m512 ifft605 = _mm512_fnmadd_ps(ifft603, _mm512_set1_ps(7.0710677e-01f), ifft595);
__m512 ifft689 = _mm512_fnmadd_ps(ifft687, _mm512_set1_ps(7.0710677e-01f), ifft679);
__m512 ifft606 = _mm512_fmadd_ps(ifft603, _mm512_set1_ps(7.0710677e-01f), ifft595);
__m512 ifft690 = _mm512_fmadd_ps(ifft687, _mm512_set1_ps(7.0710677e-01f), ifft679);
__m512 ifft607 = _mm512_fmadd_ps(ifft604, _mm512_set1_ps(7.0710677e-01f), ifft596);
__m512 ifft691 = _mm512_fmadd_ps(ifft688, _mm512_set1_ps(7.0710677e-01f), ifft680);
__m512 ifft608 = _mm512_fmsub_ps(ifft604, _mm512_set1_ps(7.0710677e-01f), ifft596);
__m512 ifft692 = _mm512_fmsub_ps(ifft688, _mm512_set1_ps(7.0710677e-01f), ifft680);
__m512 ifft609 = _mm512_add_ps(ifft605, ifft606);
__m512 ifft693 = _mm512_add_ps(ifft689, ifft690);
__m512 ifft610 = _mm512_sub_ps(ifft605, ifft606);
__m512 ifft694 = _mm512_sub_ps(ifft689, ifft690);
__m512 ifft611 = _mm512_add_ps(ifft607, ifft608);
__m512 ifft695 = _mm512_add_ps(ifft691, ifft692);
__m512 ifft612 = _mm512_sub_ps(ifft607, ifft608);
__m512 ifft696 = _mm512_sub_ps(ifft691, ifft692);
__m512 ifft613 = _mm512_fmadd_ps(ifft609, _mm512_set1_ps(1.5625e-02f), ifft599);
__m512 ifft697 = _mm512_fmadd_ps(ifft693, _mm512_set1_ps(1.5625e-02f), ifft683);
__m512 ifft614 = _mm512_fnmadd_ps(ifft609, _mm512_set1_ps(1.5625e-02f), ifft599);
__m512 ifft698 = _mm512_fnmadd_ps(ifft693, _mm512_set1_ps(1.5625e-02f), ifft683);
__m512 ifft615 = _mm512_fmadd_ps(ifft611, _mm512_set1_ps(1.5625e-02f), ifft601);
__m512 ifft699 = _mm512_fmadd_ps(ifft695, _mm512_set1_ps(1.5625e-02f), ifft685);
__m512 ifft616 = _mm512_fnmadd_ps(ifft611, _mm512_set1_ps(1.5625e-02f), ifft601);
__m512 ifft700 = _mm512_fnmadd_ps(ifft695, _mm512_set1_ps(1.5625e-02f), ifft685);
__m512 ifft617 = _mm512_fnmadd_ps(ifft612, _mm512_set1_ps(1.5625e-02f), ifft600);
__m512 ifft701 = _mm512_fnmadd_ps(ifft696, _mm512_set1_ps(1.5625e-02f), ifft684);
__m512 ifft618 = _mm512_fmadd_ps(ifft612, _mm512_set1_ps(1.5625e-02f), ifft600);
__m512 ifft702 = _mm512_fmadd_ps(ifft696, _mm512_set1_ps(1.5625e-02f), ifft684);
__m512 ifft619 = _mm512_fmadd_ps(ifft610, _mm512_set1_ps(1.5625e-02f), ifft602);
__m512 ifft703 = _mm512_fmadd_ps(ifft694, _mm512_set1_ps(1.5625e-02f), ifft686);
__m512 ifft620 = _mm512_fnmadd_ps(ifft610, _mm512_set1_ps(1.5625e-02f), ifft602);
__m512 ifft704 = _mm512_fnmadd_ps(ifft694, _mm512_set1_ps(1.5625e-02f), ifft686);
__m512 dat107 = ifft613;
__m512 dat113 = ifft697;
__m512 dat108 = ifft615;
__m512 dat114 = ifft699;
__m512 dat109 = ifft617;
__m512 dat115 = ifft701;
__m512 dat110 = ifft619;
__m512 dat116 = ifft703;
__m512 dat111 = ifft614;
__m512 dat117 = ifft698;
__m512 dat112 = ifft616;
__m512 dat118 = ifft700;
(void)ifft618;
(void)ifft702;
(void)ifft620;
(void)ifft704;
__m512i pm7 = _mm512_set_epi32(19, 18, 17, 16, 13, 12, 11, 10, 9, 8, 5, 4, 3, 2, 1, 0);
__m512 pack33 = _mm512_permutexvar_ps(pm7, dat107);
__m512 pack34 = _mm512_permutexvar_ps(pm7, dat113);
__m512 pack35 = _mm512_permutexvar_ps(pm7, dat108);
__m512 pack36 = _mm512_permutexvar_ps(pm7, dat114);
__m512 pack37 = _mm512_permutexvar_ps(pm7, dat109);
__m512 pack38 = _mm512_permutexvar_ps(pm7, dat115);
__m512 pack39 = _mm512_permutexvar_ps(pm7, dat110);
__m512 pack40 = _mm512_permutexvar_ps(pm7, dat116);
__m512 pack41 = _mm512_permutexvar_ps(pm7, dat111);
__m512 pack42 = _mm512_permutexvar_ps(pm7, dat117);
__m512 pack43 = _mm512_permutexvar_ps(pm7, dat112);
__m512 pack44 = _mm512_permutexvar_ps(pm7, dat118);
_mm512_mask_storeu_ps(datPtr2+0+39040*i9+2560*k8+1280*r3+40*toH1+4*toW1+0*t5, 1023, pack33);
_mm512_mask_storeu_ps(datPtr2+240+39040*i9+2560*k8+1280*r3+40*toH1+4*toW1+0*t5, 1023, pack34);
_mm512_mask_storeu_ps(datPtr2+40+39040*i9+2560*k8+1280*r3+40*toH1+4*toW1+0*t5, 1023, pack35);
_mm512_mask_storeu_ps(datPtr2+280+39040*i9+2560*k8+1280*r3+40*toH1+4*toW1+0*t5, 1023, pack36);
_mm512_mask_storeu_ps(datPtr2+80+39040*i9+2560*k8+1280*r3+40*toH1+4*toW1+0*t5, 1023, pack37);
_mm512_mask_storeu_ps(datPtr2+320+39040*i9+2560*k8+1280*r3+40*toH1+4*toW1+0*t5, 1023, pack38);
_mm512_mask_storeu_ps(datPtr2+120+39040*i9+2560*k8+1280*r3+40*toH1+4*toW1+0*t5, 1023, pack39);
_mm512_mask_storeu_ps(datPtr2+360+39040*i9+2560*k8+1280*r3+40*toH1+4*toW1+0*t5, 1023, pack40);
_mm512_mask_storeu_ps(datPtr2+160+39040*i9+2560*k8+1280*r3+40*toH1+4*toW1+0*t5, 1023, pack41);
_mm512_mask_storeu_ps(datPtr2+400+39040*i9+2560*k8+1280*r3+40*toH1+4*toW1+0*t5, 1023, pack42);
_mm512_mask_storeu_ps(datPtr2+200+39040*i9+2560*k8+1280*r3+40*toH1+4*toW1+0*t5, 1023, pack43);
_mm512_mask_storeu_ps(datPtr2+440+39040*i9+2560*k8+1280*r3+40*toH1+4*toW1+0*t5, 1023, pack44);
ptrdiff_t t6 = 0;
__m512 sfRe93 = _mm512_loadu_ps(sfPtr3+256+93696*i9+23424*j5+1536*k8+768*r3+128*t6);
__m512 sfIm93 = _mm512_loadu_ps(sfPtr3+320+93696*i9+23424*j5+1536*k8+768*r3+128*t6);
__m512 sfRe94 = _mm512_loadu_ps(sfPtr3+23680+93696*i9+23424*j5+1536*k8+768*r3+128*t6);
__m512 sfIm94 = _mm512_loadu_ps(sfPtr3+23744+93696*i9+23424*j5+1536*k8+768*r3+128*t6);
__m512 sfRe95 = _mm512_loadu_ps(sfPtr3+47104+93696*i9+23424*j5+1536*k8+768*r3+128*t6);
__m512 sfIm95 = _mm512_loadu_ps(sfPtr3+47168+93696*i9+23424*j5+1536*k8+768*r3+128*t6);
__m512 sfRe96 = _mm512_loadu_ps(sfPtr3+70528+93696*i9+23424*j5+1536*k8+768*r3+128*t6);
__m512 sfIm96 = _mm512_loadu_ps(sfPtr3+70592+93696*i9+23424*j5+1536*k8+768*r3+128*t6);
__m512i ifft705 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft706 = _mm512_permutexvar_ps(ifft705, sfRe93);
__m512i ifft707 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft708 = _mm512_permutexvar_ps(ifft707, sfRe93);
__m512 ifft709 = _mm512_permutexvar_ps(ifft705, sfIm93);
__m512 ifft710 = _mm512_permutexvar_ps(ifft707, sfIm93);
__m512 ifft711 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft712 = _mm512_mask_fmadd_ps(ifft710, 65021, ifft711, ifft706);
__m512 ifft713 = _mm512_mask_fnmadd_ps(ifft709, 65021, ifft711, ifft708);
__m512 ifft714 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft715 = _mm512_fmadd_ps(ifft712, ifft714, _mm512_shuffle_ps(ifft712, ifft712, 177));
__m512 ifft716 = _mm512_fmadd_ps(ifft713, ifft714, _mm512_shuffle_ps(ifft713, ifft713, 177));
__m512 ifft717 = _mm512_fmadd_ps(sfRe94, ifft714, _mm512_shuffle_ps(sfRe94, sfRe94, 177));
__m512 ifft718 = _mm512_fmadd_ps(sfIm94, ifft714, _mm512_shuffle_ps(sfIm94, sfIm94, 177));
__m512 ifft719 = _mm512_fmadd_ps(sfRe95, ifft714, _mm512_shuffle_ps(sfRe95, sfRe95, 177));
__m512 ifft720 = _mm512_fmadd_ps(sfIm95, ifft714, _mm512_shuffle_ps(sfIm95, sfIm95, 177));
__m512 ifft721 = _mm512_fmadd_ps(sfRe96, ifft714, _mm512_shuffle_ps(sfRe96, sfRe96, 177));
__m512 ifft722 = _mm512_fmadd_ps(sfIm96, ifft714, _mm512_shuffle_ps(sfIm96, sfIm96, 177));
__m512 ifft723 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft724 = _mm512_mul_ps(ifft715, ifft723);
__m512 ifft725 = _mm512_mul_ps(ifft716, ifft723);
__m512 ifft726 = _mm512_mul_ps(ifft717, ifft723);
__m512 ifft727 = _mm512_mul_ps(ifft718, ifft723);
__m512 ifft728 = _mm512_mul_ps(ifft719, ifft723);
__m512 ifft729 = _mm512_mul_ps(ifft720, ifft723);
__m512 ifft730 = _mm512_mul_ps(ifft721, ifft723);
__m512 ifft731 = _mm512_mul_ps(ifft722, ifft723);
__m512 ifft732 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft733 = _mm512_fnmadd_ps(ifft716, ifft732, ifft724);
__m512 ifft734 = _mm512_fmadd_ps(ifft715, ifft732, ifft725);
__m512 ifft735 = _mm512_fnmadd_ps(ifft718, ifft732, ifft726);
__m512 ifft736 = _mm512_fmadd_ps(ifft717, ifft732, ifft727);
__m512 ifft737 = _mm512_fnmadd_ps(ifft720, ifft732, ifft728);
__m512 ifft738 = _mm512_fmadd_ps(ifft719, ifft732, ifft729);
__m512 ifft739 = _mm512_fnmadd_ps(ifft722, ifft732, ifft730);
__m512 ifft740 = _mm512_fmadd_ps(ifft721, ifft732, ifft731);
__m512 ifft741 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft742 = _mm512_fmadd_ps(ifft733, ifft741, _mm512_shuffle_ps(ifft733, ifft733, 78));
__m512 ifft743 = _mm512_fmadd_ps(ifft734, ifft741, _mm512_shuffle_ps(ifft734, ifft734, 78));
__m512 ifft744 = _mm512_fmadd_ps(ifft735, ifft741, _mm512_shuffle_ps(ifft735, ifft735, 78));
__m512 ifft745 = _mm512_fmadd_ps(ifft736, ifft741, _mm512_shuffle_ps(ifft736, ifft736, 78));
__m512 ifft746 = _mm512_fmadd_ps(ifft737, ifft741, _mm512_shuffle_ps(ifft737, ifft737, 78));
__m512 ifft747 = _mm512_fmadd_ps(ifft738, ifft741, _mm512_shuffle_ps(ifft738, ifft738, 78));
__m512 ifft748 = _mm512_fmadd_ps(ifft739, ifft741, _mm512_shuffle_ps(ifft739, ifft739, 78));
__m512 ifft749 = _mm512_fmadd_ps(ifft740, ifft741, _mm512_shuffle_ps(ifft740, ifft740, 78));
__m512 ifft750 = _mm512_mask_sub_ps(ifft742, 49344, _mm512_setzero_ps(), ifft743);
__m512 ifft751 = _mm512_mask_mov_ps(ifft743, 49344, ifft742);
__m512 ifft752 = _mm512_mask_sub_ps(ifft744, 49344, _mm512_setzero_ps(), ifft745);
__m512 ifft753 = _mm512_mask_mov_ps(ifft745, 49344, ifft744);
__m512 ifft754 = _mm512_mask_sub_ps(ifft746, 49344, _mm512_setzero_ps(), ifft747);
__m512 ifft755 = _mm512_mask_mov_ps(ifft747, 49344, ifft746);
__m512 ifft756 = _mm512_mask_sub_ps(ifft748, 49344, _mm512_setzero_ps(), ifft749);
__m512 ifft757 = _mm512_mask_mov_ps(ifft749, 49344, ifft748);
__m512 ifft758 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft759 = _mm512_fmadd_ps(ifft750, ifft758, _mm512_shuffle_f32x4(ifft750, ifft750, 177));
__m512 ifft760 = _mm512_fmadd_ps(ifft751, ifft758, _mm512_shuffle_f32x4(ifft751, ifft751, 177));
__m512 ifft761 = _mm512_fmadd_ps(ifft752, ifft758, _mm512_shuffle_f32x4(ifft752, ifft752, 177));
__m512 ifft762 = _mm512_fmadd_ps(ifft753, ifft758, _mm512_shuffle_f32x4(ifft753, ifft753, 177));
__m512 ifft763 = _mm512_fmadd_ps(ifft754, ifft758, _mm512_shuffle_f32x4(ifft754, ifft754, 177));
__m512 ifft764 = _mm512_fnmsub_ps(ifft755, ifft758, _mm512_shuffle_f32x4(ifft755, ifft755, 177));
__m512 ifft765 = _mm512_fmadd_ps(ifft756, ifft758, _mm512_shuffle_f32x4(ifft756, ifft756, 177));
__m512 ifft766 = _mm512_fmadd_ps(ifft757, ifft758, _mm512_shuffle_f32x4(ifft757, ifft757, 177));
__m512 ifft767 = _mm512_add_ps(ifft759, ifft760);
__m512 ifft768 = _mm512_sub_ps(ifft759, ifft760);
__m512 ifft769 = _mm512_sub_ps(ifft761, ifft765);
__m512 ifft770 = _mm512_add_ps(ifft762, ifft766);
__m512 ifft771 = _mm512_add_ps(ifft761, ifft765);
__m512 ifft772 = _mm512_sub_ps(ifft762, ifft766);
__m512 ifft773 = _mm512_mul_ps(ifft763, _mm512_set1_ps(3.125e-02f));
__m512 ifft774 = _mm512_mul_ps(ifft764, _mm512_set1_ps(3.125e-02f));
__m512 ifft775 = _mm512_fmadd_ps(ifft767, _mm512_set1_ps(1.5625e-02f), ifft773);
__m512 ifft776 = _mm512_fmsub_ps(ifft767, _mm512_set1_ps(1.5625e-02f), ifft773);
__m512 ifft777 = _mm512_fmadd_ps(ifft768, _mm512_set1_ps(1.5625e-02f), ifft774);
__m512 ifft778 = _mm512_fmsub_ps(ifft768, _mm512_set1_ps(1.5625e-02f), ifft774);
__m512 ifft779 = _mm512_add_ps(ifft769, ifft770);
__m512 ifft780 = _mm512_sub_ps(ifft769, ifft770);
__m512 ifft781 = _mm512_fnmadd_ps(ifft779, _mm512_set1_ps(7.0710677e-01f), ifft771);
__m512 ifft782 = _mm512_fmadd_ps(ifft779, _mm512_set1_ps(7.0710677e-01f), ifft771);
__m512 ifft783 = _mm512_fmadd_ps(ifft780, _mm512_set1_ps(7.0710677e-01f), ifft772);
__m512 ifft784 = _mm512_fmsub_ps(ifft780, _mm512_set1_ps(7.0710677e-01f), ifft772);
__m512 ifft785 = _mm512_add_ps(ifft781, ifft782);
__m512 ifft786 = _mm512_sub_ps(ifft781, ifft782);
__m512 ifft787 = _mm512_add_ps(ifft783, ifft784);
__m512 ifft788 = _mm512_sub_ps(ifft783, ifft784);
__m512 ifft789 = _mm512_fmadd_ps(ifft785, _mm512_set1_ps(1.5625e-02f), ifft775);
__m512 ifft790 = _mm512_fnmadd_ps(ifft785, _mm512_set1_ps(1.5625e-02f), ifft775);
__m512 ifft791 = _mm512_fmadd_ps(ifft787, _mm512_set1_ps(1.5625e-02f), ifft777);
__m512 ifft792 = _mm512_fnmadd_ps(ifft787, _mm512_set1_ps(1.5625e-02f), ifft777);
__m512 ifft793 = _mm512_fnmadd_ps(ifft788, _mm512_set1_ps(1.5625e-02f), ifft776);
__m512 ifft794 = _mm512_fmadd_ps(ifft788, _mm512_set1_ps(1.5625e-02f), ifft776);
__m512 ifft795 = _mm512_fmadd_ps(ifft786, _mm512_set1_ps(1.5625e-02f), ifft778);
__m512 ifft796 = _mm512_fnmadd_ps(ifft786, _mm512_set1_ps(1.5625e-02f), ifft778);
__m512 dat119 = ifft789;
__m512 dat120 = ifft791;
__m512 dat121 = ifft793;
__m512 dat122 = ifft795;
(void)ifft790;
(void)ifft792;
(void)ifft794;
(void)ifft796;
__m512i pm8 = _mm512_set_epi32(19, 18, 17, 16, 13, 12, 11, 10, 9, 8, 5, 4, 3, 2, 1, 0);
__m512 pack45 = _mm512_permutexvar_ps(pm8, dat119);
__m512 pack46 = _mm512_permutexvar_ps(pm8, dat120);
__m512 pack47 = _mm512_permutexvar_ps(pm8, dat121);
__m512 pack48 = _mm512_permutexvar_ps(pm8, dat122);
_mm512_mask_storeu_ps(datPtr2+480+39040*i9+2560*k8+1280*r3+40*toH1+4*toW1+0*t6, 1023, pack45);
_mm512_mask_storeu_ps(datPtr2+520+39040*i9+2560*k8+1280*r3+40*toH1+4*toW1+0*t6, 1023, pack46);
_mm512_mask_storeu_ps(datPtr2+560+39040*i9+2560*k8+1280*r3+40*toH1+4*toW1+0*t6, 1023, pack47);
_mm512_mask_storeu_ps(datPtr2+600+39040*i9+2560*k8+1280*r3+40*toH1+4*toW1+0*t6, 1023, pack48);
++j5;
}
}

static void Example8StriderConsumeSums1(Example8ThreaderTeam1* team17, char** tensors7) {
Example8ThreaderTask1 task11;
task11.callee1 = Example8StriderConsumeSums1Callee1;
task11.any1 = tensors7;
task11.nd1 = 3;
task11.hull1[0] = 1;
task11.hull1[1] = 1;
task11.hull1[2] = 2;
Example8ThreaderDo1(team17, &task11);
}

struct Example8Net {
char* alloc1;
char* align1;
};

void Example8NetDestroy(Example8Net* net2) {
free(net2->alloc1);
free(net2);
}

char* Example8NetCreate(
Example8Net** net1,
Example8Params* params1,
ptrdiff_t threads1
) {
if (__builtin_expect(!__builtin_cpu_supports("avx512f"), 0)) {
return Example8Errmsg1(__LINE__, "CPU does not support AVX512F");
}
char* alloc3 = malloc(1778751);
if (__builtin_expect(!alloc3, 0)) {
return Example8Errmsg1(__LINE__, "errno %d", errno);
}
char* align3 = (void*)(((size_t)alloc3+63)&-64);
Example8ThreaderTeam1* team12 = 0;
char* err8 = Example8ThreaderCreate1(&team12, threads1);
if (__builtin_expect(!!err8, 0)) {
free(alloc3);
return err8;
}
{
char* tensors12[] = {
(char*)params1->outWeights,
(char*)params1->outBiases,
align3+0
};
Example8StriderArrangeFilts1(team12, tensors12);
}
Example8ThreaderDestroy1(team12);
Example8Net* net5 = malloc(sizeof(Example8Net));
if (__builtin_expect(!net5, 0)) {
char* msg6 = Example8Errmsg1(__LINE__, "errno %d", errno);
free(alloc3);
return msg6;
}
net5->alloc1 = alloc3;
net5->align1 = align3;
*net1 = net5;
return 0;
}

struct Example8Engine {
Example8Net* net3;
Example8ThreaderTeam1* team11;
char* alloc2;
char* align2;
};

char* Example8EnginePthreadT(
Example8Engine* eng2,
ptrdiff_t idx2,
pthread_t* to1
) {
return Example8ThreaderPthreadT1(to1, eng2->team11, idx2);
}

void Example8EngineDestroy(Example8Engine* eng3) {
Example8ThreaderDestroy1(eng3->team11);
free(eng3->alloc2);
free(eng3);
}

char* Example8EngineCreate(
Example8Engine** eng4,
Example8Net* net4,
ptrdiff_t threads2
) {
Example8Engine* eng5 = malloc(sizeof(Example8Engine));
if (__builtin_expect(!eng5, 0)) {
return Example8Errmsg1(__LINE__, "errno %d", errno);
}
char* alloc4 = malloc(718911);
if (__builtin_expect(!alloc4, 0)) {
char* msg5 = Example8Errmsg1(__LINE__, "errno %d", errno);
free(eng5);
return msg5;
}
eng5->alloc2 = alloc4;
eng5->align2 = (void*)(((size_t)alloc4+63)&-64);
char* err7 = Example8ThreaderCreate1(&eng5->team11, threads2);
if (__builtin_expect(!!err7, 0)) {
free(eng5);
free(alloc4);
return err7;
}
eng5->net3 = net4;
*eng4 = eng5;
return 0;
}

void Example8EngineInference(
Example8Engine* eng1,
float* inData,
float* outData
) {
char* netAlign1 = eng1->net3->align1;
Example8ThreaderTeam1* team14 = eng1->team11;
char* align4 = eng1->align2;
{
char* tensors9[] = {
(char*)inData,
align4+0
};
Example8StriderArrangeDats1(team14, tensors9);
char* tensors10[] = {
netAlign1+0,
align4+0,
align4+344064
};
Example8StriderProduceSums1(team14, tensors10);
char* tensors11[] = {
align4+344064,
(char*)outData
};
Example8StriderConsumeSums1(team14, tensors11);
}
}

// End of file.

Top