NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=Example9 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=in Channels=285 Height=18 Width=30
Conv FromTensor=in ToTensor=out ToChannels=395 FilterH=7 FilterW=7 StrideH=2 StrideW=2 PaddingH=1 PaddingW=0 DilationH=1 DilationW=1 Groups=5
Output FromTensor=out

Top || Output Example9.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(Example9Params);
// Example9Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "rb"); // Binary mode: raw floats.
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

typedef struct Example9Params Example9Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// Example9Params* params = malloc(sizeof(Example9Params));
//
// ... Load params (read from a file, perhaps) ...
//
// Example9Net* net; // For example, 4 threads:
// char* err = Example9NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// Example9NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

typedef struct Example9Net Example9Net;

// Builds a Net from the trained parameters in the Params struct, using
// `threads` temporary threads. Returns 0 on success. On failure returns a
// heap-allocated error string (the caller must free it) and leaves the
// Net pointer unmodified. The Params struct is not modified.
char* Example9NetCreate(
Example9Net**,
Example9Params*,
ptrdiff_t threads
);

// Frees the memory held by a Net. Call it only after all inference that
// depends on the Net is complete.
void Example9NetDestroy(Example9Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// Example9Net* net;
//
// ... Create net ...
//
// Example9Engine* engine; // For example, 4 inference threads:
// char* err = Example9EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// Example9EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = Example9EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* inData = malloc(sizeof(float)*285*18*30);
// float* outData = malloc(sizeof(float)*395*7*12);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// Example9EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// inData, // The tensor arguments are sorted by name.
// outData
// );
//
// ... Read the output floats ...
//
// }
//
// free(inData);
// free(outData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

typedef struct Example9Engine Example9Engine;

// Creates an Engine bound to the given Net with `threads` inference
// threads. Returns 0 on success. On failure returns a heap-allocated
// error string (the caller must free it) and leaves the Engine pointer
// unmodified.
char* Example9EngineCreate(
Example9Engine**,
Example9Net*,
ptrdiff_t threads
);

// Writes to `*to` the pthread_t of the inference thread at index
// `threadIdx` (threads are indexed 0..N-1). Returns 0 on success, or a
// heap-allocated error string if the thread index is invalid.
char* Example9EnginePthreadT(
Example9Engine*,
ptrdiff_t threadIdx,
pthread_t* to
);

// Runs one inference. Reads the packed CHW float input tensor `inData`
// (285x18x30) and writes the packed CHW float output tensor `outData`
// (395x7x12). Both tensors are owned by the caller. Cannot fail.
void Example9EngineInference(
Example9Engine*,
float* inData,
float* outData
);

// Terminates the Engine's inference threads and frees its memory. The
// Net that the Engine points to is destroyed separately.
void Example9EngineDestroy(Example9Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

// Trained parameters for the single convolution that produces tensor
// "out". The struct is packed so that it can be filled directly from a
// file of concatenated float tensors.
// Element counts: 395 biases (one per output channel) and
// 395*57*7*7 = 1103235 weights, where 57 = 285 input channels / 5 groups.
struct Example9Params {
float outBiases[395]; // 1x395x1x1
float outWeights[1103235]; // 395x57x7x7
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output Example9.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f Example9.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "Example9.h"

// Formats an error message into a freshly malloc'd buffer and returns it.
// The text is "Example9: line <lineNum1>: " followed by the printf-style
// expansion of format1 and its variadic arguments, truncated to fit the
// buffer. The caller owns the returned string and must free() it.
// NOTE: the malloc result is not checked before it is written to.
static char* Example9Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
    enum { kBufBytes = 276 };
    char* text = malloc(kBufBytes);
    // Write the fixed prefix first and remember how long it is.
    int prefixLen = sprintf(text, "Example9: line %td: ", lineNum1);
    va_list args;
    va_start(args, format1);
    // Append the caller's message in whatever room is left.
    vsnprintf(text+prefixLen, kBufBytes-prefixLen, format1, args);
    va_end(args);
    return text;
}

// Forward declarations for the thread-team ("threader") types defined
// below, plus the callback type that a task runs once per point.
typedef struct Example9ThreaderTask1 Example9ThreaderTask1;
// Callee signature: receives the task and the coordinates of one point.
typedef void (*Example9ThreaderCallee1)(Example9ThreaderTask1*, int64_t*);
typedef struct Example9ThreaderHub1 Example9ThreaderHub1;
typedef struct Example9ThreaderNode1 Example9ThreaderNode1;
typedef struct Example9ThreaderUnwind1 Example9ThreaderUnwind1;
typedef struct Example9ThreaderTeam1 Example9ThreaderTeam1;

// A unit of parallel work: a callback to be invoked once for every point
// of an nd1-dimensional index space whose per-dimension extents are hull1.
struct Example9ThreaderTask1 {
// Function called as callee1(task, point) for each point.
Example9ThreaderCallee1 callee1;
// Opaque pointer made available to the callee through the task.
void* any1;
// Number of dimensions of hull1 that are in use (at most 4).
ptrdiff_t nd1;
// Extent of each dimension; index 0 varies fastest.
int64_t hull1[4];
};

// State shared by all threads of a team while a task is running.
struct Example9ThreaderHub1 {
// Guards every field below.
pthread_mutex_t mut1;
// Signaled by the worker that drops pending1 to zero.
pthread_cond_t cond1;
// Number of workers that have not yet finished the current task.
ptrdiff_t pending1;
// Word index into status1 where the next bitmap scan starts.
ptrdiff_t offset1;
// Bit mask applied to status1[offset1] when the next scan starts.
long mask1;
// Bitmap with one bit per node; a node's bit is cleared once its points
// have been drained. Allocated with nt/(bits per long)+1 words.
long status1[];
};

// Per-thread state. Each node is aligned to 64 bytes.
struct Example9ThreaderNode1 {
// Guards np1, pt1, and task1.
pthread_mutex_t mut2;
// Number of points still to be run from this node; -1 requests shutdown.
int64_t np1;
// Coordinates of the next point to run from this node.
int64_t pt1[4];
// Task assigned to this node, or null when there is nothing to do.
Example9ThreaderTask1* task1;
// Signaled after task1 has been set.
pthread_cond_t cond2;
// Back-pointer to the owning team.
Example9ThreaderTeam1* team1;
// Handle of the thread that runs Example9ThreaderMain1 on this node.
pthread_t thr1;
} __attribute__((aligned(64)));

// Records how much of a team was successfully constructed so that
// Example9ThreaderDestroy1 tears down exactly that much.
struct Example9ThreaderUnwind1 {
// Number of leading nodes whose threads were created (and must be joined).
ptrdiff_t join1;
// Number of leading nodes whose condition variables were initialized.
ptrdiff_t nodeConds1;
// Number of leading nodes whose mutexes were initialized.
ptrdiff_t nodeMuts1;
// Nonzero if the hub's condition variable was initialized.
ptrdiff_t hubCond1;
// Nonzero if the hub's mutex was initialized.
ptrdiff_t hubMut1;
// Unaligned malloc result backing the node array (passed to free).
void* nodes1;
// Unaligned malloc result backing the hub (passed to free).
void* hub1;
};

// A team of worker threads together with the state they share.
struct Example9ThreaderTeam1 {
// Number of threads (and nodes) in the team.
ptrdiff_t nt1;
// 64-byte-aligned pointer to the shared hub.
Example9ThreaderHub1* hub2;
// 64-byte-aligned pointer to the array of nt1 nodes.
Example9ThreaderNode1* nodes2;
// Teardown bookkeeping filled in during construction.
Example9ThreaderUnwind1 unwind1;
};

// Advances the multi-dimensional counter pt2 by one step. Dimension 0 is
// the fastest-varying digit; a digit that reaches its extent in hull2
// resets to zero and the increment carries into the next dimension. When
// every digit wraps, the counter ends up at all zeros.
static void Example9ThreaderInc1(
    ptrdiff_t nd2,
    int64_t*restrict hull2,
    int64_t*restrict pt2
) {
    ptrdiff_t dim = 0;
    while (dim < nd2) {
        int64_t bumped = pt2[dim]+1;
        if (bumped != hull2[dim]) {
            // No wrap in this dimension: store and stop carrying.
            pt2[dim] = bumped;
            return;
        }
        // Wrapped: reset this digit and carry into the next one.
        pt2[dim] = 0;
        ++dim;
    }
}

// Converts the linear index val1 into mixed-radix coordinates pt3, where
// hull3 gives the radix of each dimension and dimension 0 is the least
// significant digit. Dimensions left over once the value is exhausted are
// set to zero; any value beyond the last dimension is discarded.
static void Example9ThreaderPut1(
    ptrdiff_t nd3,
    int64_t*restrict hull3,
    int64_t*restrict pt3,
    int64_t val1
) {
    for (ptrdiff_t dim = 0; dim < nd3; ++dim) {
        if (!val1) {
            // Nothing left to distribute: remaining digits are zero.
            pt3[dim] = 0;
            continue;
        }
        int64_t radix = hull3[dim];
        int64_t quotient = val1/radix;
        pt3[dim] = val1%radix;
        val1 = quotient;
    }
}

// Adds the mixed-radix number plus1 (and an incoming carry carry2) to the
// counter pt4 in place. hull4 holds the radix of each dimension, with
// dimension 0 least significant. At most one radix is subtracted per
// digit, and a carry out of the last dimension is dropped.
static void Example9ThreaderAdd1(
    ptrdiff_t nd4,
    int64_t*restrict hull4,
    int64_t*restrict pt4,
    int64_t*restrict plus1,
    int64_t carry2
) {
    int64_t carry = carry2;
    for (ptrdiff_t dim = 0; dim < nd4; ++dim) {
        int64_t radix = hull4[dim];
        int64_t total = pt4[dim]+plus1[dim]+carry;
        // Carry out exactly when the digit sum reaches the radix.
        carry = total >= radix;
        pt4[dim] = carry ? total-radix : total;
    }
}

// Thread entry point for one team member. arg1 is the thread's own node.
// The thread sleeps until a task is posted on its node, runs the points
// assigned to that node, then scans the hub's status bitmap for other
// nodes that still have points and runs those too. When no such node is
// left it decrements the hub's pending count (signaling the hub condition
// if it reaches zero) and goes back to waiting. A negative point count on
// its node makes the thread return.
// Every pthread call is retried in a spin loop until it returns zero.
static void* Example9ThreaderMain1(void* arg1) {
Example9ThreaderNode1* node1 = arg1;
Example9ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
Example9ThreaderHub1* hub3 = team2->hub2;
Example9ThreaderNode1* nodes3 = team2->nodes2;
// This thread's index within the node array (also its bit in status1).
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
for (; ; ) {
// The node mutex is held at the top of every iteration.
Example9ThreaderTask1* task2 = node1->task1;
if (!task2) {
// No task posted yet: wait, then re-check.
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
// Shutdown request (Example9ThreaderDestroy1 sets np1 to -1).
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
// Consume the task pointer so the next iteration waits for a new one.
node1->task1 = 0;
Example9ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Drain this node's own points. Each point is claimed under the mutex
// (copy coordinates, decrement count, advance the stored point) and the
// callee runs with the mutex released.
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
Example9ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// Mark this node as drained by clearing its bit in the status bitmap.
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
// Scan the bitmap (hub mutex held) for a node whose bit is still set.
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
// Nothing in this word: move on to the next full word.
++offset2;
mask2 = -1;
continue;
}
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
// Bit nt2 lies past the last node and is never cleared, so reaching
// it means the end of the bitmap. Restart from word 0 once; if the
// scan already restarted, there is no node left to help.
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
// Isolate the lowest set bit (the chosen node).
hand1 &= -hand1;
// Publish the scan position with the chosen bit removed from the mask,
// so the next scan that reads it starts past this node.
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
Example9ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
// Drain the chosen node's remaining points, same protocol as above.
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
Example9ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// The chosen node has no points left: clear its bit and resume the
// scan from the position currently published in the hub.
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// This thread is finished with the task.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
// The last thread to finish wakes the waiter in Example9ThreaderDo1.
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Tears down a (possibly partially constructed) team. The counters in
// team3->unwind1 say how many threads must be joined and how many
// mutexes/condition variables must be destroyed. A null team is a no-op.
// Every pthread call is retried until it returns zero.
static void Example9ThreaderDestroy1(Example9ThreaderTeam1* team3) {
    if (!team3) return;
    Example9ThreaderUnwind1* undo = &team3->unwind1;
    Example9ThreaderNode1* pool = team3->nodes2;
    ptrdiff_t started = undo->join1;
    // Post a shutdown request (np1 = -1 with a non-null dummy task) to
    // every thread that was started, and wake it up.
    for (ptrdiff_t i = 0; i < started; ++i) {
        Example9ThreaderNode1* cur = pool+i;
        while (__builtin_expect(pthread_mutex_lock(&cur->mut2), 0)) {}
        cur->np1 = -1;
        cur->task1 = (Example9ThreaderTask1*)1;
        while (__builtin_expect(pthread_mutex_unlock(&cur->mut2), 0)) {}
        while (__builtin_expect(pthread_cond_signal(&cur->cond2), 0)) {}
    }
    // Wait for all of those threads to exit.
    for (ptrdiff_t i = 0; i < started; ++i) {
        while (__builtin_expect(pthread_join(pool[i].thr1, 0), 0)) {}
    }
    // Destroy the per-node synchronization objects that were initialized.
    ptrdiff_t conds = undo->nodeConds1;
    for (ptrdiff_t i = 0; i < conds; ++i) {
        while (__builtin_expect(pthread_cond_destroy(&pool[i].cond2), 0)) {}
    }
    ptrdiff_t muts = undo->nodeMuts1;
    for (ptrdiff_t i = 0; i < muts; ++i) {
        while (__builtin_expect(pthread_mutex_destroy(&pool[i].mut2), 0)) {}
    }
    // Destroy the hub's synchronization objects if they were initialized.
    Example9ThreaderHub1* hub = team3->hub2;
    if (undo->hubCond1) {
        while (__builtin_expect(pthread_cond_destroy(&hub->cond1), 0)) {}
    }
    if (undo->hubMut1) {
        while (__builtin_expect(pthread_mutex_destroy(&hub->mut1), 0)) {}
    }
    // Release the raw (unaligned) allocations, then the team itself.
    free(undo->nodes1);
    free(undo->hub1);
    free(team3);
}

// Final construction step: for each of the nt7 nodes, initialize its
// mutex and condition variable and start its thread. On the first
// failure, records in team8->unwind1 how many mutexes, condition
// variables, and threads exist so far, and returns an error string. On
// success all three counters are nt7 and the function returns 0.
static char* Example9ThreaderCreate1Up4(Example9ThreaderTeam1* team8, ptrdiff_t nt7) {
    Example9ThreaderNode1* pool = team8->nodes2;
    // Defaults describe full success; a failure overrides them below.
    ptrdiff_t mutsDone = nt7;
    ptrdiff_t condsDone = nt7;
    ptrdiff_t threadsDone = nt7;
    char* failure = 0;
    for (ptrdiff_t idx = 0; idx != nt7; ++idx) {
        Example9ThreaderNode1* cur = pool+idx;
        int rc = pthread_mutex_init(&cur->mut2, 0);
        if (__builtin_expect(rc, 0)) {
            failure = Example9Errmsg1(__LINE__, "errno %d", rc);
            mutsDone = idx;
            condsDone = idx;
            threadsDone = idx;
            break;
        }
        cur->task1 = 0;
        rc = pthread_cond_init(&cur->cond2, 0);
        if (__builtin_expect(rc, 0)) {
            failure = Example9Errmsg1(__LINE__, "errno %d", rc);
            // This node's mutex exists but its condition variable does not.
            mutsDone = idx+1;
            condsDone = idx;
            threadsDone = idx;
            break;
        }
        cur->team1 = team8;
        rc = pthread_create(&cur->thr1, 0, Example9ThreaderMain1, cur);
        if (__builtin_expect(rc, 0)) {
            failure = Example9Errmsg1(__LINE__, "errno %d", rc);
            // Mutex and condition variable exist; the thread does not.
            mutsDone = idx+1;
            condsDone = idx+1;
            threadsDone = idx;
            break;
        }
    }
    team8->unwind1.nodeMuts1 = mutsDone;
    team8->unwind1.nodeConds1 = condsDone;
    team8->unwind1.join1 = threadsDone;
    return failure;
}

// Construction step 3: initialize the hub's mutex and condition
// variable, flagging each one in team7->unwind1 once it exists, then
// continue with per-node setup. Returns 0 on success or an error string.
static char* Example9ThreaderCreate1Up3(Example9ThreaderTeam1* team7, ptrdiff_t nt6) {
    Example9ThreaderHub1* hub = team7->hub2;
    int rc = pthread_mutex_init(&hub->mut1, 0);
    if (__builtin_expect(rc, 0)) {
        return Example9Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubMut1 = 1;
    rc = pthread_cond_init(&hub->cond1, 0);
    if (__builtin_expect(rc, 0)) {
        return Example9Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubCond1 = 1;
    return Example9ThreaderCreate1Up4(team7, nt6);
}

// Construction step 2: allocate the node array for nt5 threads with 63
// bytes of slack, keep the raw pointer for free(), and store the
// 64-byte-aligned pointer in the team. Fails with "too many threads" if
// the byte count overflows size_t. Returns 0 on success or an error string.
static char* Example9ThreaderCreate1Up2(Example9ThreaderTeam1* team6, ptrdiff_t nt5) {
    const size_t nodeBytes = sizeof(Example9ThreaderNode1);
    size_t total = nt5*nodeBytes;
    // Dividing back detects wrap-around in the multiplication above.
    if (__builtin_expect(total/nodeBytes != (size_t)nt5, 0)) {
        return Example9Errmsg1(__LINE__, "too many threads");
    }
    void* raw = malloc(total+63);
    if (__builtin_expect(!raw, 0)) {
        return Example9Errmsg1(__LINE__, "errno %d", errno);
    }
    team6->unwind1.nodes1 = raw;
    // Round the address up to the next multiple of 64.
    size_t aligned = ((size_t)raw+63) & ~(size_t)63;
    team6->nodes2 = (void*)aligned;
    return Example9ThreaderCreate1Up3(team6, nt5);
}

// Construction step 1: record the thread count and allocate the hub,
// including a status bitmap of nt4/(bits per long)+1 words. The size is
// rounded up to a multiple of 64 and 63 bytes of slack are added so the
// hub can be placed at a 64-byte-aligned address. The raw pointer is kept
// for free(). Returns 0 on success or an error string.
static char* Example9ThreaderCreate1Up1(Example9ThreaderTeam1* team5, ptrdiff_t nt4) {
    team5->nt1 = nt4;
    const size_t bitsPerWord = sizeof(long)*8;
    size_t words = (size_t)nt4/bitsPerWord+1;
    size_t bytes = sizeof(Example9ThreaderHub1)+sizeof(long)*words;
    bytes = (bytes+63) & ~(size_t)63;
    void* raw = malloc(bytes+63);
    if (__builtin_expect(!raw, 0)) {
        return Example9Errmsg1(__LINE__, "errno %d", errno);
    }
    team5->unwind1.hub1 = raw;
    // Round the address up to the next multiple of 64.
    size_t aligned = ((size_t)raw+63) & ~(size_t)63;
    team5->hub2 = (void*)aligned;
    return Example9ThreaderCreate1Up2(team5, nt4);
}

// Creates a team of nt3 worker threads and stores it in *team4.
// Returns 0 on success. On failure returns a heap-allocated error string,
// releases whatever was partially built, and leaves *team4 untouched.
// Requesting fewer than one thread is an error.
static char* Example9ThreaderCreate1(Example9ThreaderTeam1** team4, ptrdiff_t nt3) {
    if (__builtin_expect(nt3 < 1, 0)) {
        return Example9Errmsg1(__LINE__, "too few threads");
    }
    // Zero-filled so that the unwind counters start out empty.
    Example9ThreaderTeam1* fresh = calloc(1, sizeof(Example9ThreaderTeam1));
    if (__builtin_expect(!fresh, 0)) {
        return Example9Errmsg1(__LINE__, "errno %d", errno);
    }
    char* failure = Example9ThreaderCreate1Up1(fresh, nt3);
    if (__builtin_expect(!!failure, 0)) {
        Example9ThreaderDestroy1(fresh);
        return failure;
    }
    *team4 = fresh;
    return 0;
}

// Looks up the pthread_t of the team thread at index idx1 and stores it
// in *thr2. Returns 0 on success, or a heap-allocated error string
// ("bad thread idx") when idx1 is outside [0, nt1).
static char* Example9ThreaderPthreadT1(
    pthread_t* thr2,
    Example9ThreaderTeam1* team9,
    ptrdiff_t idx1
) {
    if (__builtin_expect(0 <= idx1 && idx1 < team9->nt1, 1)) {
        Example9ThreaderNode1* chosen = team9->nodes2+idx1;
        *thr2 = chosen->thr1;
        return 0;
    }
    return Example9Errmsg1(__LINE__, "bad thread idx");
}

// Runs task3 on the team and blocks until every thread has finished it.
// The task's index space (the product of its hull extents) is split into
// nt8 contiguous ranges, one per node; the first tot1%nt8 nodes get one
// extra point. Does nothing if the task has fewer than one dimension.
// Every pthread call is retried in a spin loop until it returns zero.
static void Example9ThreaderDo1(Example9ThreaderTeam1* team10, Example9ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return;
// Total number of points = product of the extents.
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
// plus2 = the per-node share (each1) expressed as coordinates.
int64_t plus2[4];
Example9ThreaderPut1(nd6, task3->hull1, plus2, each1);
// pt6 = coordinates of the first point of the node being set up.
int64_t pt6[4] = {0};
Example9ThreaderHub1* hub6 = team10->hub2;
// The hub mutex stays held from here until the condition wait below
// releases it, so a worker that finishes early blocks on the hub until
// the hub fields have been initialized.
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
Example9ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
// The first more1 nodes take one extra point.
int64_t carry3 = i4 < more1;
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
// Advance the start point by this node's share (each1 plus the extra).
Example9ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Reset the scan position and set every status bit, including the bits
// past the last node in the final word.
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
// Wait until every thread has decremented the pending count.
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

// Approximates exp(x) for 16 packed floats.
// Steps as coded: clamp x; compute t = x*1.442695 (approximately
// log2(e)); round t to the nearest integer r; form the reduced argument
// f = x - r*ln(2), with ln(2) applied as two constants (a large part and
// a small correction); evaluate a degree-4 polynomial in f by Horner's
// rule; finally add the integer, shifted left by 23 bits, to the bit
// pattern of the polynomial result.
static __m512 Example9Exp1(__m512 x1) {
// Clamp the input to [-87.33654, 88.72284].
// NOTE(review): presumably chosen so the result stays a finite, normal
// float -- confirm.
x1 = _mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f));
x1 = _mm512_min_ps(x1, _mm512_set1_ps(8.872284e+01f));
__m512 t1 = _mm512_mul_ps(x1, _mm512_set1_ps(1.442695e+00f));
__m512 r1 = _mm512_roundscale_ps(t1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
// f = x - r*0.69314575 - r*1.4286068e-06
__m512 f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-6.9314575e-01f), x1);
f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-1.4286068e-06f), f1);
// Horner evaluation of the polynomial in f.
__m512 g1 = _mm512_set1_ps(4.194439e-02f);
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(1.6800667e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(4.9999994e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.999569e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.9999964e-01f));
// Convert t to an integer and shift it into the float exponent position.
// NOTE(review): the conversion uses the current rounding mode; assumed
// to be round-to-nearest so that it agrees with r1 -- confirm.
__m512i y1 = _mm512_slli_epi32(_mm512_cvtps_epi32(t1), 23);
// The integer add on the bit pattern adjusts the exponent of g1.
return _mm512_castsi512_ps(_mm512_add_epi32(y1, _mm512_castps_si512(g1)));
}

// Approximates 1/sqrt(x) for 16 packed floats: takes the hardware
// estimate y from _mm512_rsqrt14_ps and refines it with one step of
// the form (y*0.5) * (3 - x*y*y).
static __m512 Example9Rsqrt1(__m512 x2) {
__m512 y2 = _mm512_rsqrt14_ps(x2);
// z = x*y
__m512 z1 = _mm512_mul_ps(x2, y2);
// a = y/2
__m512 a1 = _mm512_mul_ps(y2, _mm512_set1_ps(5e-01f));
// b = 3 - y*z = 3 - x*y*y
__m512 b1 = _mm512_fnmadd_ps(y2, z1, _mm512_set1_ps(3e+00f));
return _mm512_mul_ps(a1, b1);
}

static void Example9StriderArrangeFilts1Callee1(Example9ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
ptrdiff_t b2 = pt7[0];
ptrdiff_t g2 = pt7[1];
ptrdiff_t e1 = 0;
char*restrict bfPtr1 = tensors2[2]+1600*e1;
char*restrict wfPtr1 = tensors2[2]+1600+81100800*e1;
char*restrict wtPtr1 = tensors2[0]+77616*e1;
char*restrict biasPtr1 = tensors2[1];
ptrdiff_t i5 = 1*g2;
ptrdiff_t j1 = 2*b2;
ptrdiff_t jj1 = j1+1;
if (j1 < 38) {
for (; j1 != 38; ++j1) {
for (ptrdiff_t k1 = 0; k1 < 57; ++k1) {
__m512 wt1 = _mm512_maskz_loadu_ps(127, wtPtr1+0+882588*i5+22344*j1+196*k1);
__m512 wt2 = _mm512_maskz_loadu_ps(127, wtPtr1+28+882588*i5+22344*j1+196*k1);
__m512 wt3 = _mm512_maskz_loadu_ps(127, wtPtr1+56+882588*i5+22344*j1+196*k1);
__m512 wt4 = _mm512_maskz_loadu_ps(127, wtPtr1+84+882588*i5+22344*j1+196*k1);
__m512 wt5 = _mm512_maskz_loadu_ps(127, wtPtr1+112+882588*i5+22344*j1+196*k1);
__m512 wt6 = _mm512_maskz_loadu_ps(127, wtPtr1+140+882588*i5+22344*j1+196*k1);
__m512 wt7 = _mm512_maskz_loadu_ps(127, wtPtr1+168+882588*i5+22344*j1+196*k1);
__m512 fft1 = _mm512_add_ps(wt1, _mm512_setzero_ps());
__m512 fft89 = _mm512_add_ps(wt2, _mm512_setzero_ps());
__m512 fft2 = _mm512_sub_ps(wt1, _mm512_setzero_ps());
__m512 fft90 = _mm512_sub_ps(wt2, _mm512_setzero_ps());
__m512 fft3 = _mm512_add_ps(wt3, _mm512_setzero_ps());
__m512 fft91 = _mm512_add_ps(wt4, _mm512_setzero_ps());
__m512 fft4 = _mm512_sub_ps(wt3, _mm512_setzero_ps());
__m512 fft92 = _mm512_sub_ps(wt4, _mm512_setzero_ps());
__m512 fft5 = _mm512_add_ps(wt5, _mm512_setzero_ps());
__m512 fft93 = _mm512_add_ps(wt6, _mm512_setzero_ps());
__m512 fft6 = _mm512_sub_ps(wt5, _mm512_setzero_ps());
__m512 fft94 = _mm512_sub_ps(wt6, _mm512_setzero_ps());
__m512 fft7 = _mm512_add_ps(wt7, _mm512_setzero_ps());
__m512 fft95 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft8 = _mm512_sub_ps(wt7, _mm512_setzero_ps());
__m512 fft96 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9 = _mm512_add_ps(fft1, fft5);
__m512 fft97 = _mm512_add_ps(fft89, fft93);
__m512 fft10 = _mm512_sub_ps(fft1, fft5);
__m512 fft98 = _mm512_sub_ps(fft89, fft93);
__m512 fft11 = _mm512_add_ps(fft3, fft7);
__m512 fft99 = _mm512_add_ps(fft91, fft95);
__m512 fft12 = _mm512_sub_ps(fft7, fft3);
__m512 fft100 = _mm512_sub_ps(fft95, fft91);
__m512 fft13 = _mm512_sub_ps(fft4, fft8);
__m512 fft101 = _mm512_sub_ps(fft92, fft96);
__m512 fft14 = _mm512_add_ps(fft4, fft8);
__m512 fft102 = _mm512_add_ps(fft92, fft96);
__m512 fft15 = _mm512_add_ps(fft9, fft11);
__m512 fft103 = _mm512_add_ps(fft97, fft99);
__m512 fft16 = _mm512_sub_ps(fft9, fft11);
__m512 fft104 = _mm512_sub_ps(fft97, fft99);
__m512 fft17 = _mm512_fmadd_ps(fft13, _mm512_set1_ps(7.0710677e-01f), fft2);
__m512 fft105 = _mm512_fmadd_ps(fft101, _mm512_set1_ps(7.0710677e-01f), fft90);
__m512 fft18 = _mm512_fnmsub_ps(fft14, _mm512_set1_ps(7.0710677e-01f), fft6);
__m512 fft106 = _mm512_fnmsub_ps(fft102, _mm512_set1_ps(7.0710677e-01f), fft94);
__m512 fft19 = _mm512_fnmadd_ps(fft13, _mm512_set1_ps(7.0710677e-01f), fft2);
__m512 fft107 = _mm512_fnmadd_ps(fft101, _mm512_set1_ps(7.0710677e-01f), fft90);
__m512 fft20 = _mm512_fnmadd_ps(fft14, _mm512_set1_ps(7.0710677e-01f), fft6);
__m512 fft108 = _mm512_fnmadd_ps(fft102, _mm512_set1_ps(7.0710677e-01f), fft94);
__m512 fft21 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft22 = _mm512_fmadd_ps(fft15, fft21, _mm512_shuffle_f32x4(fft15, fft15, 78));
__m512 fft109 = _mm512_fmadd_ps(fft103, fft21, _mm512_shuffle_f32x4(fft103, fft103, 78));
__m512 fft23 = _mm512_fmadd_ps(fft16, fft21, _mm512_shuffle_f32x4(fft16, fft16, 78));
__m512 fft110 = _mm512_fmadd_ps(fft104, fft21, _mm512_shuffle_f32x4(fft104, fft104, 78));
__m512 fft24 = _mm512_fmadd_ps(fft17, fft21, _mm512_shuffle_f32x4(fft17, fft17, 78));
__m512 fft111 = _mm512_fmadd_ps(fft105, fft21, _mm512_shuffle_f32x4(fft105, fft105, 78));
__m512 fft25 = _mm512_fmadd_ps(fft18, fft21, _mm512_shuffle_f32x4(fft18, fft18, 78));
__m512 fft112 = _mm512_fmadd_ps(fft106, fft21, _mm512_shuffle_f32x4(fft106, fft106, 78));
__m512 fft26 = _mm512_fmadd_ps(fft10, fft21, _mm512_shuffle_f32x4(fft10, fft10, 78));
__m512 fft113 = _mm512_fmadd_ps(fft98, fft21, _mm512_shuffle_f32x4(fft98, fft98, 78));
__m512 fft27 = _mm512_fmadd_ps(fft12, fft21, _mm512_shuffle_f32x4(fft12, fft12, 78));
__m512 fft114 = _mm512_fmadd_ps(fft100, fft21, _mm512_shuffle_f32x4(fft100, fft100, 78));
__m512 fft28 = _mm512_fmadd_ps(fft19, fft21, _mm512_shuffle_f32x4(fft19, fft19, 78));
__m512 fft115 = _mm512_fmadd_ps(fft107, fft21, _mm512_shuffle_f32x4(fft107, fft107, 78));
__m512 fft29 = _mm512_fmadd_ps(fft20, fft21, _mm512_shuffle_f32x4(fft20, fft20, 78));
__m512 fft116 = _mm512_fmadd_ps(fft108, fft21, _mm512_shuffle_f32x4(fft108, fft108, 78));
__m512 fft30 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft31 = _mm512_mul_ps(fft22, fft30);
__m512 fft117 = _mm512_mul_ps(fft109, fft30);
__m512 fft32 = _mm512_mul_ps(fft23, fft30);
__m512 fft118 = _mm512_mul_ps(fft110, fft30);
__m512 fft33 = _mm512_mul_ps(fft24, fft30);
__m512 fft119 = _mm512_mul_ps(fft111, fft30);
__m512 fft34 = _mm512_mul_ps(fft25, fft30);
__m512 fft120 = _mm512_mul_ps(fft112, fft30);
__m512 fft35 = _mm512_mul_ps(fft26, fft30);
__m512 fft121 = _mm512_mul_ps(fft113, fft30);
__m512 fft36 = _mm512_mul_ps(fft27, fft30);
__m512 fft122 = _mm512_mul_ps(fft114, fft30);
__m512 fft37 = _mm512_mul_ps(fft28, fft30);
__m512 fft123 = _mm512_mul_ps(fft115, fft30);
__m512 fft38 = _mm512_mul_ps(fft29, fft30);
__m512 fft124 = _mm512_mul_ps(fft116, fft30);
__m512 fft39 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft40 = _mm512_fmadd_ps(fft23, fft39, fft31);
__m512 fft125 = _mm512_fmadd_ps(fft110, fft39, fft117);
__m512 fft41 = _mm512_fnmadd_ps(fft22, fft39, fft32);
__m512 fft126 = _mm512_fnmadd_ps(fft109, fft39, fft118);
__m512 fft42 = _mm512_fmadd_ps(fft25, fft39, fft33);
__m512 fft127 = _mm512_fmadd_ps(fft112, fft39, fft119);
__m512 fft43 = _mm512_fnmadd_ps(fft24, fft39, fft34);
__m512 fft128 = _mm512_fnmadd_ps(fft111, fft39, fft120);
__m512 fft44 = _mm512_fmadd_ps(fft27, fft39, fft35);
__m512 fft129 = _mm512_fmadd_ps(fft114, fft39, fft121);
__m512 fft45 = _mm512_fnmadd_ps(fft26, fft39, fft36);
__m512 fft130 = _mm512_fnmadd_ps(fft113, fft39, fft122);
__m512 fft46 = _mm512_fmadd_ps(fft29, fft39, fft37);
__m512 fft131 = _mm512_fmadd_ps(fft116, fft39, fft123);
__m512 fft47 = _mm512_fnmadd_ps(fft28, fft39, fft38);
__m512 fft132 = _mm512_fnmadd_ps(fft115, fft39, fft124);
__m512 fft48 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft49 = _mm512_fmadd_ps(fft40, fft48, _mm512_shuffle_f32x4(fft40, fft40, 177));
__m512 fft133 = _mm512_fmadd_ps(fft125, fft48, _mm512_shuffle_f32x4(fft125, fft125, 177));
__m512 fft50 = _mm512_fmadd_ps(fft41, fft48, _mm512_shuffle_f32x4(fft41, fft41, 177));
__m512 fft134 = _mm512_fmadd_ps(fft126, fft48, _mm512_shuffle_f32x4(fft126, fft126, 177));
__m512 fft51 = _mm512_fmadd_ps(fft42, fft48, _mm512_shuffle_f32x4(fft42, fft42, 177));
__m512 fft135 = _mm512_fmadd_ps(fft127, fft48, _mm512_shuffle_f32x4(fft127, fft127, 177));
__m512 fft52 = _mm512_fmadd_ps(fft43, fft48, _mm512_shuffle_f32x4(fft43, fft43, 177));
__m512 fft136 = _mm512_fmadd_ps(fft128, fft48, _mm512_shuffle_f32x4(fft128, fft128, 177));
__m512 fft53 = _mm512_fmadd_ps(fft44, fft48, _mm512_shuffle_f32x4(fft44, fft44, 177));
__m512 fft137 = _mm512_fmadd_ps(fft129, fft48, _mm512_shuffle_f32x4(fft129, fft129, 177));
__m512 fft54 = _mm512_fmadd_ps(fft45, fft48, _mm512_shuffle_f32x4(fft45, fft45, 177));
__m512 fft138 = _mm512_fmadd_ps(fft130, fft48, _mm512_shuffle_f32x4(fft130, fft130, 177));
__m512 fft55 = _mm512_fmadd_ps(fft46, fft48, _mm512_shuffle_f32x4(fft46, fft46, 177));
__m512 fft139 = _mm512_fmadd_ps(fft131, fft48, _mm512_shuffle_f32x4(fft131, fft131, 177));
__m512 fft56 = _mm512_fmadd_ps(fft47, fft48, _mm512_shuffle_f32x4(fft47, fft47, 177));
__m512 fft140 = _mm512_fmadd_ps(fft132, fft48, _mm512_shuffle_f32x4(fft132, fft132, 177));
__m512 fft57 = _mm512_mask_mov_ps(fft49, 49344, fft50);
__m512 fft141 = _mm512_mask_mov_ps(fft133, 49344, fft134);
__m512 fft58 = _mm512_mask_sub_ps(fft50, 49344, _mm512_setzero_ps(), fft49);
__m512 fft142 = _mm512_mask_sub_ps(fft134, 49344, _mm512_setzero_ps(), fft133);
__m512 fft59 = _mm512_mask_mov_ps(fft51, 49344, fft52);
__m512 fft143 = _mm512_mask_mov_ps(fft135, 49344, fft136);
__m512 fft60 = _mm512_mask_sub_ps(fft52, 49344, _mm512_setzero_ps(), fft51);
__m512 fft144 = _mm512_mask_sub_ps(fft136, 49344, _mm512_setzero_ps(), fft135);
__m512 fft61 = _mm512_mask_mov_ps(fft53, 49344, fft54);
__m512 fft145 = _mm512_mask_mov_ps(fft137, 49344, fft138);
__m512 fft62 = _mm512_mask_sub_ps(fft54, 49344, _mm512_setzero_ps(), fft53);
__m512 fft146 = _mm512_mask_sub_ps(fft138, 49344, _mm512_setzero_ps(), fft137);
__m512 fft63 = _mm512_mask_mov_ps(fft55, 49344, fft56);
__m512 fft147 = _mm512_mask_mov_ps(fft139, 49344, fft140);
__m512 fft64 = _mm512_mask_sub_ps(fft56, 49344, _mm512_setzero_ps(), fft55);
__m512 fft148 = _mm512_mask_sub_ps(fft140, 49344, _mm512_setzero_ps(), fft139);
__m512 fft65 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft66 = _mm512_fmadd_ps(fft57, fft65, _mm512_shuffle_ps(fft57, fft57, 78));
__m512 fft149 = _mm512_fmadd_ps(fft141, fft65, _mm512_shuffle_ps(fft141, fft141, 78));
__m512 fft67 = _mm512_fmadd_ps(fft58, fft65, _mm512_shuffle_ps(fft58, fft58, 78));
__m512 fft150 = _mm512_fmadd_ps(fft142, fft65, _mm512_shuffle_ps(fft142, fft142, 78));
__m512 fft68 = _mm512_fmadd_ps(fft59, fft65, _mm512_shuffle_ps(fft59, fft59, 78));
__m512 fft151 = _mm512_fmadd_ps(fft143, fft65, _mm512_shuffle_ps(fft143, fft143, 78));
__m512 fft69 = _mm512_fmadd_ps(fft60, fft65, _mm512_shuffle_ps(fft60, fft60, 78));
__m512 fft152 = _mm512_fmadd_ps(fft144, fft65, _mm512_shuffle_ps(fft144, fft144, 78));
__m512 fft70 = _mm512_fmadd_ps(fft61, fft65, _mm512_shuffle_ps(fft61, fft61, 78));
__m512 fft153 = _mm512_fmadd_ps(fft145, fft65, _mm512_shuffle_ps(fft145, fft145, 78));
__m512 fft71 = _mm512_fmadd_ps(fft62, fft65, _mm512_shuffle_ps(fft62, fft62, 78));
__m512 fft154 = _mm512_fmadd_ps(fft146, fft65, _mm512_shuffle_ps(fft146, fft146, 78));
__m512 fft72 = _mm512_fmadd_ps(fft63, fft65, _mm512_shuffle_ps(fft63, fft63, 78));
__m512 fft155 = _mm512_fmadd_ps(fft147, fft65, _mm512_shuffle_ps(fft147, fft147, 78));
__m512 fft73 = _mm512_fmadd_ps(fft64, fft65, _mm512_shuffle_ps(fft64, fft64, 78));
__m512 fft156 = _mm512_fmadd_ps(fft148, fft65, _mm512_shuffle_ps(fft148, fft148, 78));
__m512i fft74 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft75 = _mm512_permutexvar_ps(fft74, fft66);
__m512 fft157 = _mm512_permutexvar_ps(fft74, fft149);
__m512i fft76 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft77 = _mm512_permutexvar_ps(fft76, fft66);
__m512 fft158 = _mm512_permutexvar_ps(fft76, fft149);
__m512 fft78 = _mm512_permutexvar_ps(fft74, fft67);
__m512 fft159 = _mm512_permutexvar_ps(fft74, fft150);
__m512 fft79 = _mm512_permutexvar_ps(fft76, fft67);
__m512 fft160 = _mm512_permutexvar_ps(fft76, fft150);
__m512 fft80 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft81 = _mm512_fmadd_ps(fft75, fft80, fft77);
__m512 fft161 = _mm512_fmadd_ps(fft157, fft80, fft158);
__m512 fft82 = _mm512_fnmadd_ps(fft79, fft80, fft78);
__m512 fft162 = _mm512_fnmadd_ps(fft160, fft80, fft159);
__m512 fft83 = _mm512_mask_mov_ps(fft79, 21845, fft81);
__m512 fft163 = _mm512_mask_mov_ps(fft160, 21845, fft161);
__m512 fft84 = _mm512_mask_mov_ps(fft75, 43176, fft81);
__m512 fft164 = _mm512_mask_mov_ps(fft157, 43176, fft161);
__m512 fft85 = _mm512_mask_mov_ps(fft83, 43176, fft82);
__m512 fft165 = _mm512_mask_mov_ps(fft163, 43176, fft162);
__m512 fft86 = _mm512_mask_mov_ps(fft84, 22102, fft82);
__m512 fft166 = _mm512_mask_mov_ps(fft164, 22102, fft162);
__m512 fft87 = _mm512_mask_mul_ps(fft85, 64764, fft85, _mm512_set1_ps(5e-01f));
__m512 fft167 = _mm512_mask_mul_ps(fft165, 64764, fft165, _mm512_set1_ps(5e-01f));
__m512 fft88 = _mm512_mask_mul_ps(fft86, 64764, fft86, _mm512_set1_ps(5e-01f));
__m512 fft168 = _mm512_mask_mul_ps(fft166, 64764, fft166, _mm512_set1_ps(5e-01f));
__m512 wf1 = fft87;
__m512 wf9 = fft167;
__m512 wf2 = fft88;
__m512 wf10 = fft168;
__m512 wf3 = fft68;
__m512 wf11 = fft151;
__m512 wf4 = fft69;
__m512 wf12 = fft152;
__m512 wf5 = fft70;
__m512 wf13 = fft153;
__m512 wf6 = fft71;
__m512 wf14 = fft154;
__m512 wf7 = fft72;
__m512 wf15 = fft155;
__m512 wf8 = fft73;
__m512 wf16 = fft156;
ptrdiff_t c1 = (size_t)(0+2*j1)/4;
ptrdiff_t m1 = (size_t)(0+2*j1)%4/2;
ptrdiff_t f2 = (size_t)(0+2*j1)%2;
__m512i eo1 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf3 = _mm512_permutexvar_ps(eo1, wf3);
wf4 = _mm512_permutexvar_ps(eo1, wf4);
__m512i wfs1 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf3, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs1 = _mm512_inserti64x4(wfs1, _mm512_cvtps_ph(wf4, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+145920+583680*i5+7296*c1+128*k1+64*m1+16*f2, 3855, wfs1);
_mm512_mask_storeu_epi32(wfPtr1+3064304+583680*i5+7296*c1+128*k1+64*m1+16*f2, 61680, wfs1);
wf11 = _mm512_permutexvar_ps(eo1, wf11);
wf12 = _mm512_permutexvar_ps(eo1, wf12);
__m512i wfs2 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf11, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs2 = _mm512_inserti64x4(wfs2, _mm512_cvtps_ph(wf12, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+5982720+583680*i5+7296*c1+128*k1+64*m1+16*f2, 3855, wfs2);
_mm512_mask_storeu_epi32(wfPtr1+8901104+583680*i5+7296*c1+128*k1+64*m1+16*f2, 61680, wfs2);
wf5 = _mm512_permutexvar_ps(eo1, wf5);
wf6 = _mm512_permutexvar_ps(eo1, wf6);
__m512i wfs3 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf5, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs3 = _mm512_inserti64x4(wfs3, _mm512_cvtps_ph(wf6, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+291840+583680*i5+7296*c1+128*k1+64*m1+16*f2, 3855, wfs3);
_mm512_mask_storeu_epi32(wfPtr1+3210224+583680*i5+7296*c1+128*k1+64*m1+16*f2, 61680, wfs3);
wf13 = _mm512_permutexvar_ps(eo1, wf13);
wf14 = _mm512_permutexvar_ps(eo1, wf14);
__m512i wfs4 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf13, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs4 = _mm512_inserti64x4(wfs4, _mm512_cvtps_ph(wf14, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+6128640+583680*i5+7296*c1+128*k1+64*m1+16*f2, 3855, wfs4);
_mm512_mask_storeu_epi32(wfPtr1+9047024+583680*i5+7296*c1+128*k1+64*m1+16*f2, 61680, wfs4);
wf7 = _mm512_permutexvar_ps(eo1, wf7);
wf8 = _mm512_permutexvar_ps(eo1, wf8);
__m512i wfs5 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf7, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs5 = _mm512_inserti64x4(wfs5, _mm512_cvtps_ph(wf8, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+437760+583680*i5+7296*c1+128*k1+64*m1+16*f2, 3855, wfs5);
_mm512_mask_storeu_epi32(wfPtr1+3356144+583680*i5+7296*c1+128*k1+64*m1+16*f2, 61680, wfs5);
wf15 = _mm512_permutexvar_ps(eo1, wf15);
wf16 = _mm512_permutexvar_ps(eo1, wf16);
__m512i wfs6 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf15, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs6 = _mm512_inserti64x4(wfs6, _mm512_cvtps_ph(wf16, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+6274560+583680*i5+7296*c1+128*k1+64*m1+16*f2, 3855, wfs6);
_mm512_mask_storeu_epi32(wfPtr1+9192944+583680*i5+7296*c1+128*k1+64*m1+16*f2, 61680, wfs6);
__m512i wfs7 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs7 = _mm512_inserti64x4(wfs7, _mm512_cvtps_ph(wf2, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+0+583680*i5+7296*c1+128*k1+64*m1+16*f2, 3855, wfs7);
_mm512_mask_storeu_epi32(wfPtr1+2918384+583680*i5+7296*c1+128*k1+64*m1+16*f2, 61680, wfs7);
__m512i wfs8 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf9, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs8 = _mm512_inserti64x4(wfs8, _mm512_cvtps_ph(wf10, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+5836800+583680*i5+7296*c1+128*k1+64*m1+16*f2, 3855, wfs8);
_mm512_mask_storeu_epi32(wfPtr1+8755184+583680*i5+7296*c1+128*k1+64*m1+16*f2, 61680, wfs8);
__m512 wt8 = _mm512_maskz_loadu_ps(127, wtPtr1+11172+882588*i5+22344*j1+196*k1);
__m512 wt9 = _mm512_maskz_loadu_ps(127, wtPtr1+11200+882588*i5+22344*j1+196*k1);
__m512 wt10 = _mm512_maskz_loadu_ps(127, wtPtr1+11228+882588*i5+22344*j1+196*k1);
__m512 wt11 = _mm512_maskz_loadu_ps(127, wtPtr1+11256+882588*i5+22344*j1+196*k1);
__m512 wt12 = _mm512_maskz_loadu_ps(127, wtPtr1+11284+882588*i5+22344*j1+196*k1);
__m512 wt13 = _mm512_maskz_loadu_ps(127, wtPtr1+11312+882588*i5+22344*j1+196*k1);
__m512 wt14 = _mm512_maskz_loadu_ps(127, wtPtr1+11340+882588*i5+22344*j1+196*k1);
__m512 fft169 = _mm512_add_ps(wt8, _mm512_setzero_ps());
__m512 fft257 = _mm512_add_ps(wt9, _mm512_setzero_ps());
__m512 fft170 = _mm512_sub_ps(wt8, _mm512_setzero_ps());
__m512 fft258 = _mm512_sub_ps(wt9, _mm512_setzero_ps());
__m512 fft171 = _mm512_add_ps(wt10, _mm512_setzero_ps());
__m512 fft259 = _mm512_add_ps(wt11, _mm512_setzero_ps());
__m512 fft172 = _mm512_sub_ps(wt10, _mm512_setzero_ps());
__m512 fft260 = _mm512_sub_ps(wt11, _mm512_setzero_ps());
__m512 fft173 = _mm512_add_ps(wt12, _mm512_setzero_ps());
__m512 fft261 = _mm512_add_ps(wt13, _mm512_setzero_ps());
__m512 fft174 = _mm512_sub_ps(wt12, _mm512_setzero_ps());
__m512 fft262 = _mm512_sub_ps(wt13, _mm512_setzero_ps());
__m512 fft175 = _mm512_add_ps(wt14, _mm512_setzero_ps());
__m512 fft263 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft176 = _mm512_sub_ps(wt14, _mm512_setzero_ps());
__m512 fft264 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft177 = _mm512_add_ps(fft169, fft173);
__m512 fft265 = _mm512_add_ps(fft257, fft261);
__m512 fft178 = _mm512_sub_ps(fft169, fft173);
__m512 fft266 = _mm512_sub_ps(fft257, fft261);
__m512 fft179 = _mm512_add_ps(fft171, fft175);
__m512 fft267 = _mm512_add_ps(fft259, fft263);
__m512 fft180 = _mm512_sub_ps(fft175, fft171);
__m512 fft268 = _mm512_sub_ps(fft263, fft259);
__m512 fft181 = _mm512_sub_ps(fft172, fft176);
__m512 fft269 = _mm512_sub_ps(fft260, fft264);
__m512 fft182 = _mm512_add_ps(fft172, fft176);
__m512 fft270 = _mm512_add_ps(fft260, fft264);
__m512 fft183 = _mm512_add_ps(fft177, fft179);
__m512 fft271 = _mm512_add_ps(fft265, fft267);
__m512 fft184 = _mm512_sub_ps(fft177, fft179);
__m512 fft272 = _mm512_sub_ps(fft265, fft267);
__m512 fft185 = _mm512_fmadd_ps(fft181, _mm512_set1_ps(7.0710677e-01f), fft170);
__m512 fft273 = _mm512_fmadd_ps(fft269, _mm512_set1_ps(7.0710677e-01f), fft258);
__m512 fft186 = _mm512_fnmsub_ps(fft182, _mm512_set1_ps(7.0710677e-01f), fft174);
__m512 fft274 = _mm512_fnmsub_ps(fft270, _mm512_set1_ps(7.0710677e-01f), fft262);
__m512 fft187 = _mm512_fnmadd_ps(fft181, _mm512_set1_ps(7.0710677e-01f), fft170);
__m512 fft275 = _mm512_fnmadd_ps(fft269, _mm512_set1_ps(7.0710677e-01f), fft258);
__m512 fft188 = _mm512_fnmadd_ps(fft182, _mm512_set1_ps(7.0710677e-01f), fft174);
__m512 fft276 = _mm512_fnmadd_ps(fft270, _mm512_set1_ps(7.0710677e-01f), fft262);
__m512 fft189 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft190 = _mm512_fmadd_ps(fft183, fft189, _mm512_shuffle_f32x4(fft183, fft183, 78));
__m512 fft277 = _mm512_fmadd_ps(fft271, fft189, _mm512_shuffle_f32x4(fft271, fft271, 78));
__m512 fft191 = _mm512_fmadd_ps(fft184, fft189, _mm512_shuffle_f32x4(fft184, fft184, 78));
__m512 fft278 = _mm512_fmadd_ps(fft272, fft189, _mm512_shuffle_f32x4(fft272, fft272, 78));
__m512 fft192 = _mm512_fmadd_ps(fft185, fft189, _mm512_shuffle_f32x4(fft185, fft185, 78));
__m512 fft279 = _mm512_fmadd_ps(fft273, fft189, _mm512_shuffle_f32x4(fft273, fft273, 78));
__m512 fft193 = _mm512_fmadd_ps(fft186, fft189, _mm512_shuffle_f32x4(fft186, fft186, 78));
__m512 fft280 = _mm512_fmadd_ps(fft274, fft189, _mm512_shuffle_f32x4(fft274, fft274, 78));
__m512 fft194 = _mm512_fmadd_ps(fft178, fft189, _mm512_shuffle_f32x4(fft178, fft178, 78));
__m512 fft281 = _mm512_fmadd_ps(fft266, fft189, _mm512_shuffle_f32x4(fft266, fft266, 78));
__m512 fft195 = _mm512_fmadd_ps(fft180, fft189, _mm512_shuffle_f32x4(fft180, fft180, 78));
__m512 fft282 = _mm512_fmadd_ps(fft268, fft189, _mm512_shuffle_f32x4(fft268, fft268, 78));
__m512 fft196 = _mm512_fmadd_ps(fft187, fft189, _mm512_shuffle_f32x4(fft187, fft187, 78));
__m512 fft283 = _mm512_fmadd_ps(fft275, fft189, _mm512_shuffle_f32x4(fft275, fft275, 78));
__m512 fft197 = _mm512_fmadd_ps(fft188, fft189, _mm512_shuffle_f32x4(fft188, fft188, 78));
__m512 fft284 = _mm512_fmadd_ps(fft276, fft189, _mm512_shuffle_f32x4(fft276, fft276, 78));
__m512 fft198 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft199 = _mm512_mul_ps(fft190, fft198);
__m512 fft285 = _mm512_mul_ps(fft277, fft198);
__m512 fft200 = _mm512_mul_ps(fft191, fft198);
__m512 fft286 = _mm512_mul_ps(fft278, fft198);
__m512 fft201 = _mm512_mul_ps(fft192, fft198);
__m512 fft287 = _mm512_mul_ps(fft279, fft198);
__m512 fft202 = _mm512_mul_ps(fft193, fft198);
__m512 fft288 = _mm512_mul_ps(fft280, fft198);
__m512 fft203 = _mm512_mul_ps(fft194, fft198);
__m512 fft289 = _mm512_mul_ps(fft281, fft198);
__m512 fft204 = _mm512_mul_ps(fft195, fft198);
__m512 fft290 = _mm512_mul_ps(fft282, fft198);
__m512 fft205 = _mm512_mul_ps(fft196, fft198);
__m512 fft291 = _mm512_mul_ps(fft283, fft198);
__m512 fft206 = _mm512_mul_ps(fft197, fft198);
__m512 fft292 = _mm512_mul_ps(fft284, fft198);
__m512 fft207 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft208 = _mm512_fmadd_ps(fft191, fft207, fft199);
__m512 fft293 = _mm512_fmadd_ps(fft278, fft207, fft285);
__m512 fft209 = _mm512_fnmadd_ps(fft190, fft207, fft200);
__m512 fft294 = _mm512_fnmadd_ps(fft277, fft207, fft286);
__m512 fft210 = _mm512_fmadd_ps(fft193, fft207, fft201);
__m512 fft295 = _mm512_fmadd_ps(fft280, fft207, fft287);
__m512 fft211 = _mm512_fnmadd_ps(fft192, fft207, fft202);
__m512 fft296 = _mm512_fnmadd_ps(fft279, fft207, fft288);
__m512 fft212 = _mm512_fmadd_ps(fft195, fft207, fft203);
__m512 fft297 = _mm512_fmadd_ps(fft282, fft207, fft289);
__m512 fft213 = _mm512_fnmadd_ps(fft194, fft207, fft204);
__m512 fft298 = _mm512_fnmadd_ps(fft281, fft207, fft290);
__m512 fft214 = _mm512_fmadd_ps(fft197, fft207, fft205);
__m512 fft299 = _mm512_fmadd_ps(fft284, fft207, fft291);
__m512 fft215 = _mm512_fnmadd_ps(fft196, fft207, fft206);
__m512 fft300 = _mm512_fnmadd_ps(fft283, fft207, fft292);
__m512 fft216 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft217 = _mm512_fmadd_ps(fft208, fft216, _mm512_shuffle_f32x4(fft208, fft208, 177));
__m512 fft301 = _mm512_fmadd_ps(fft293, fft216, _mm512_shuffle_f32x4(fft293, fft293, 177));
__m512 fft218 = _mm512_fmadd_ps(fft209, fft216, _mm512_shuffle_f32x4(fft209, fft209, 177));
__m512 fft302 = _mm512_fmadd_ps(fft294, fft216, _mm512_shuffle_f32x4(fft294, fft294, 177));
__m512 fft219 = _mm512_fmadd_ps(fft210, fft216, _mm512_shuffle_f32x4(fft210, fft210, 177));
__m512 fft303 = _mm512_fmadd_ps(fft295, fft216, _mm512_shuffle_f32x4(fft295, fft295, 177));
__m512 fft220 = _mm512_fmadd_ps(fft211, fft216, _mm512_shuffle_f32x4(fft211, fft211, 177));
__m512 fft304 = _mm512_fmadd_ps(fft296, fft216, _mm512_shuffle_f32x4(fft296, fft296, 177));
__m512 fft221 = _mm512_fmadd_ps(fft212, fft216, _mm512_shuffle_f32x4(fft212, fft212, 177));
__m512 fft305 = _mm512_fmadd_ps(fft297, fft216, _mm512_shuffle_f32x4(fft297, fft297, 177));
__m512 fft222 = _mm512_fmadd_ps(fft213, fft216, _mm512_shuffle_f32x4(fft213, fft213, 177));
__m512 fft306 = _mm512_fmadd_ps(fft298, fft216, _mm512_shuffle_f32x4(fft298, fft298, 177));
__m512 fft223 = _mm512_fmadd_ps(fft214, fft216, _mm512_shuffle_f32x4(fft214, fft214, 177));
__m512 fft307 = _mm512_fmadd_ps(fft299, fft216, _mm512_shuffle_f32x4(fft299, fft299, 177));
__m512 fft224 = _mm512_fmadd_ps(fft215, fft216, _mm512_shuffle_f32x4(fft215, fft215, 177));
__m512 fft308 = _mm512_fmadd_ps(fft300, fft216, _mm512_shuffle_f32x4(fft300, fft300, 177));
__m512 fft225 = _mm512_mask_mov_ps(fft217, 49344, fft218);
__m512 fft309 = _mm512_mask_mov_ps(fft301, 49344, fft302);
__m512 fft226 = _mm512_mask_sub_ps(fft218, 49344, _mm512_setzero_ps(), fft217);
__m512 fft310 = _mm512_mask_sub_ps(fft302, 49344, _mm512_setzero_ps(), fft301);
__m512 fft227 = _mm512_mask_mov_ps(fft219, 49344, fft220);
__m512 fft311 = _mm512_mask_mov_ps(fft303, 49344, fft304);
__m512 fft228 = _mm512_mask_sub_ps(fft220, 49344, _mm512_setzero_ps(), fft219);
__m512 fft312 = _mm512_mask_sub_ps(fft304, 49344, _mm512_setzero_ps(), fft303);
__m512 fft229 = _mm512_mask_mov_ps(fft221, 49344, fft222);
__m512 fft313 = _mm512_mask_mov_ps(fft305, 49344, fft306);
__m512 fft230 = _mm512_mask_sub_ps(fft222, 49344, _mm512_setzero_ps(), fft221);
__m512 fft314 = _mm512_mask_sub_ps(fft306, 49344, _mm512_setzero_ps(), fft305);
__m512 fft231 = _mm512_mask_mov_ps(fft223, 49344, fft224);
__m512 fft315 = _mm512_mask_mov_ps(fft307, 49344, fft308);
__m512 fft232 = _mm512_mask_sub_ps(fft224, 49344, _mm512_setzero_ps(), fft223);
__m512 fft316 = _mm512_mask_sub_ps(fft308, 49344, _mm512_setzero_ps(), fft307);
__m512 fft233 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft234 = _mm512_fmadd_ps(fft225, fft233, _mm512_shuffle_ps(fft225, fft225, 78));
__m512 fft317 = _mm512_fmadd_ps(fft309, fft233, _mm512_shuffle_ps(fft309, fft309, 78));
__m512 fft235 = _mm512_fmadd_ps(fft226, fft233, _mm512_shuffle_ps(fft226, fft226, 78));
__m512 fft318 = _mm512_fmadd_ps(fft310, fft233, _mm512_shuffle_ps(fft310, fft310, 78));
__m512 fft236 = _mm512_fmadd_ps(fft227, fft233, _mm512_shuffle_ps(fft227, fft227, 78));
__m512 fft319 = _mm512_fmadd_ps(fft311, fft233, _mm512_shuffle_ps(fft311, fft311, 78));
__m512 fft237 = _mm512_fmadd_ps(fft228, fft233, _mm512_shuffle_ps(fft228, fft228, 78));
__m512 fft320 = _mm512_fmadd_ps(fft312, fft233, _mm512_shuffle_ps(fft312, fft312, 78));
__m512 fft238 = _mm512_fmadd_ps(fft229, fft233, _mm512_shuffle_ps(fft229, fft229, 78));
__m512 fft321 = _mm512_fmadd_ps(fft313, fft233, _mm512_shuffle_ps(fft313, fft313, 78));
__m512 fft239 = _mm512_fmadd_ps(fft230, fft233, _mm512_shuffle_ps(fft230, fft230, 78));
__m512 fft322 = _mm512_fmadd_ps(fft314, fft233, _mm512_shuffle_ps(fft314, fft314, 78));
__m512 fft240 = _mm512_fmadd_ps(fft231, fft233, _mm512_shuffle_ps(fft231, fft231, 78));
__m512 fft323 = _mm512_fmadd_ps(fft315, fft233, _mm512_shuffle_ps(fft315, fft315, 78));
__m512 fft241 = _mm512_fmadd_ps(fft232, fft233, _mm512_shuffle_ps(fft232, fft232, 78));
__m512 fft324 = _mm512_fmadd_ps(fft316, fft233, _mm512_shuffle_ps(fft316, fft316, 78));
__m512i fft242 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft243 = _mm512_permutexvar_ps(fft242, fft234);
__m512 fft325 = _mm512_permutexvar_ps(fft242, fft317);
__m512i fft244 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft245 = _mm512_permutexvar_ps(fft244, fft234);
__m512 fft326 = _mm512_permutexvar_ps(fft244, fft317);
__m512 fft246 = _mm512_permutexvar_ps(fft242, fft235);
__m512 fft327 = _mm512_permutexvar_ps(fft242, fft318);
__m512 fft247 = _mm512_permutexvar_ps(fft244, fft235);
__m512 fft328 = _mm512_permutexvar_ps(fft244, fft318);
__m512 fft248 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft249 = _mm512_fmadd_ps(fft243, fft248, fft245);
__m512 fft329 = _mm512_fmadd_ps(fft325, fft248, fft326);
__m512 fft250 = _mm512_fnmadd_ps(fft247, fft248, fft246);
__m512 fft330 = _mm512_fnmadd_ps(fft328, fft248, fft327);
__m512 fft251 = _mm512_mask_mov_ps(fft247, 21845, fft249);
__m512 fft331 = _mm512_mask_mov_ps(fft328, 21845, fft329);
__m512 fft252 = _mm512_mask_mov_ps(fft243, 43176, fft249);
__m512 fft332 = _mm512_mask_mov_ps(fft325, 43176, fft329);
__m512 fft253 = _mm512_mask_mov_ps(fft251, 43176, fft250);
__m512 fft333 = _mm512_mask_mov_ps(fft331, 43176, fft330);
__m512 fft254 = _mm512_mask_mov_ps(fft252, 22102, fft250);
__m512 fft334 = _mm512_mask_mov_ps(fft332, 22102, fft330);
__m512 fft255 = _mm512_mask_mul_ps(fft253, 64764, fft253, _mm512_set1_ps(5e-01f));
__m512 fft335 = _mm512_mask_mul_ps(fft333, 64764, fft333, _mm512_set1_ps(5e-01f));
__m512 fft256 = _mm512_mask_mul_ps(fft254, 64764, fft254, _mm512_set1_ps(5e-01f));
__m512 fft336 = _mm512_mask_mul_ps(fft334, 64764, fft334, _mm512_set1_ps(5e-01f));
__m512 wf17 = fft255;
__m512 wf25 = fft335;
__m512 wf18 = fft256;
__m512 wf26 = fft336;
__m512 wf19 = fft236;
__m512 wf27 = fft319;
__m512 wf20 = fft237;
__m512 wf28 = fft320;
__m512 wf21 = fft238;
__m512 wf29 = fft321;
__m512 wf22 = fft239;
__m512 wf30 = fft322;
__m512 wf23 = fft240;
__m512 wf31 = fft323;
__m512 wf24 = fft241;
__m512 wf32 = fft324;
ptrdiff_t c2 = (size_t)(1+2*j1)/4;
ptrdiff_t m2 = (size_t)(1+2*j1)%4/2;
ptrdiff_t f3 = (size_t)(1+2*j1)%2;
__m512i eo2 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf19 = _mm512_permutexvar_ps(eo2, wf19);
wf20 = _mm512_permutexvar_ps(eo2, wf20);
__m512i wfs9 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf19, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs9 = _mm512_inserti64x4(wfs9, _mm512_cvtps_ph(wf20, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+145920+583680*i5+7296*c2+128*k1+64*m2+16*f3, 3855, wfs9);
_mm512_mask_storeu_epi32(wfPtr1+3064304+583680*i5+7296*c2+128*k1+64*m2+16*f3, 61680, wfs9);
wf27 = _mm512_permutexvar_ps(eo2, wf27);
wf28 = _mm512_permutexvar_ps(eo2, wf28);
__m512i wfs10 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf27, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs10 = _mm512_inserti64x4(wfs10, _mm512_cvtps_ph(wf28, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+5982720+583680*i5+7296*c2+128*k1+64*m2+16*f3, 3855, wfs10);
_mm512_mask_storeu_epi32(wfPtr1+8901104+583680*i5+7296*c2+128*k1+64*m2+16*f3, 61680, wfs10);
wf21 = _mm512_permutexvar_ps(eo2, wf21);
wf22 = _mm512_permutexvar_ps(eo2, wf22);
__m512i wfs11 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf21, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs11 = _mm512_inserti64x4(wfs11, _mm512_cvtps_ph(wf22, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+291840+583680*i5+7296*c2+128*k1+64*m2+16*f3, 3855, wfs11);
_mm512_mask_storeu_epi32(wfPtr1+3210224+583680*i5+7296*c2+128*k1+64*m2+16*f3, 61680, wfs11);
wf29 = _mm512_permutexvar_ps(eo2, wf29);
wf30 = _mm512_permutexvar_ps(eo2, wf30);
__m512i wfs12 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf29, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs12 = _mm512_inserti64x4(wfs12, _mm512_cvtps_ph(wf30, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+6128640+583680*i5+7296*c2+128*k1+64*m2+16*f3, 3855, wfs12);
_mm512_mask_storeu_epi32(wfPtr1+9047024+583680*i5+7296*c2+128*k1+64*m2+16*f3, 61680, wfs12);
wf23 = _mm512_permutexvar_ps(eo2, wf23);
wf24 = _mm512_permutexvar_ps(eo2, wf24);
__m512i wfs13 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf23, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs13 = _mm512_inserti64x4(wfs13, _mm512_cvtps_ph(wf24, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+437760+583680*i5+7296*c2+128*k1+64*m2+16*f3, 3855, wfs13);
_mm512_mask_storeu_epi32(wfPtr1+3356144+583680*i5+7296*c2+128*k1+64*m2+16*f3, 61680, wfs13);
wf31 = _mm512_permutexvar_ps(eo2, wf31);
wf32 = _mm512_permutexvar_ps(eo2, wf32);
__m512i wfs14 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf31, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs14 = _mm512_inserti64x4(wfs14, _mm512_cvtps_ph(wf32, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+6274560+583680*i5+7296*c2+128*k1+64*m2+16*f3, 3855, wfs14);
_mm512_mask_storeu_epi32(wfPtr1+9192944+583680*i5+7296*c2+128*k1+64*m2+16*f3, 61680, wfs14);
__m512i wfs15 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf17, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs15 = _mm512_inserti64x4(wfs15, _mm512_cvtps_ph(wf18, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+0+583680*i5+7296*c2+128*k1+64*m2+16*f3, 3855, wfs15);
_mm512_mask_storeu_epi32(wfPtr1+2918384+583680*i5+7296*c2+128*k1+64*m2+16*f3, 61680, wfs15);
__m512i wfs16 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf25, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs16 = _mm512_inserti64x4(wfs16, _mm512_cvtps_ph(wf26, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+5836800+583680*i5+7296*c2+128*k1+64*m2+16*f3, 3855, wfs16);
_mm512_mask_storeu_epi32(wfPtr1+8755184+583680*i5+7296*c2+128*k1+64*m2+16*f3, 61680, wfs16);
}
__m512 bias1 = _mm512_setzero_ps();
if (!e1) {
bias1 = _mm512_maskz_loadu_ps(3, biasPtr1-0+316*i5+8*j1);
bias1 = _mm512_mul_ps(bias1, _mm512_set1_ps(6.4e+01f));
}
_mm512_mask_storeu_ps(bfPtr1-0+320*i5+8*j1, 3, bias1);
if (j1 >= jj1) return;
}
}
if (j1 == 38) {
for (ptrdiff_t k2 = 0; k2 < 57; ++k2) {
__m512 wt15 = _mm512_maskz_loadu_ps(127, wtPtr1+0+882588*i5+22344*j1+196*k2);
__m512 wt16 = _mm512_maskz_loadu_ps(127, wtPtr1+28+882588*i5+22344*j1+196*k2);
__m512 wt17 = _mm512_maskz_loadu_ps(127, wtPtr1+56+882588*i5+22344*j1+196*k2);
__m512 wt18 = _mm512_maskz_loadu_ps(127, wtPtr1+84+882588*i5+22344*j1+196*k2);
__m512 wt19 = _mm512_maskz_loadu_ps(127, wtPtr1+112+882588*i5+22344*j1+196*k2);
__m512 wt20 = _mm512_maskz_loadu_ps(127, wtPtr1+140+882588*i5+22344*j1+196*k2);
__m512 wt21 = _mm512_maskz_loadu_ps(127, wtPtr1+168+882588*i5+22344*j1+196*k2);
__m512 fft337 = _mm512_add_ps(wt15, _mm512_setzero_ps());
__m512 fft425 = _mm512_add_ps(wt16, _mm512_setzero_ps());
__m512 fft338 = _mm512_sub_ps(wt15, _mm512_setzero_ps());
__m512 fft426 = _mm512_sub_ps(wt16, _mm512_setzero_ps());
__m512 fft339 = _mm512_add_ps(wt17, _mm512_setzero_ps());
__m512 fft427 = _mm512_add_ps(wt18, _mm512_setzero_ps());
__m512 fft340 = _mm512_sub_ps(wt17, _mm512_setzero_ps());
__m512 fft428 = _mm512_sub_ps(wt18, _mm512_setzero_ps());
__m512 fft341 = _mm512_add_ps(wt19, _mm512_setzero_ps());
__m512 fft429 = _mm512_add_ps(wt20, _mm512_setzero_ps());
__m512 fft342 = _mm512_sub_ps(wt19, _mm512_setzero_ps());
__m512 fft430 = _mm512_sub_ps(wt20, _mm512_setzero_ps());
__m512 fft343 = _mm512_add_ps(wt21, _mm512_setzero_ps());
__m512 fft431 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft344 = _mm512_sub_ps(wt21, _mm512_setzero_ps());
__m512 fft432 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft345 = _mm512_add_ps(fft337, fft341);
__m512 fft433 = _mm512_add_ps(fft425, fft429);
__m512 fft346 = _mm512_sub_ps(fft337, fft341);
__m512 fft434 = _mm512_sub_ps(fft425, fft429);
__m512 fft347 = _mm512_add_ps(fft339, fft343);
__m512 fft435 = _mm512_add_ps(fft427, fft431);
__m512 fft348 = _mm512_sub_ps(fft343, fft339);
__m512 fft436 = _mm512_sub_ps(fft431, fft427);
__m512 fft349 = _mm512_sub_ps(fft340, fft344);
__m512 fft437 = _mm512_sub_ps(fft428, fft432);
__m512 fft350 = _mm512_add_ps(fft340, fft344);
__m512 fft438 = _mm512_add_ps(fft428, fft432);
__m512 fft351 = _mm512_add_ps(fft345, fft347);
__m512 fft439 = _mm512_add_ps(fft433, fft435);
__m512 fft352 = _mm512_sub_ps(fft345, fft347);
__m512 fft440 = _mm512_sub_ps(fft433, fft435);
__m512 fft353 = _mm512_fmadd_ps(fft349, _mm512_set1_ps(7.0710677e-01f), fft338);
__m512 fft441 = _mm512_fmadd_ps(fft437, _mm512_set1_ps(7.0710677e-01f), fft426);
__m512 fft354 = _mm512_fnmsub_ps(fft350, _mm512_set1_ps(7.0710677e-01f), fft342);
__m512 fft442 = _mm512_fnmsub_ps(fft438, _mm512_set1_ps(7.0710677e-01f), fft430);
__m512 fft355 = _mm512_fnmadd_ps(fft349, _mm512_set1_ps(7.0710677e-01f), fft338);
__m512 fft443 = _mm512_fnmadd_ps(fft437, _mm512_set1_ps(7.0710677e-01f), fft426);
__m512 fft356 = _mm512_fnmadd_ps(fft350, _mm512_set1_ps(7.0710677e-01f), fft342);
__m512 fft444 = _mm512_fnmadd_ps(fft438, _mm512_set1_ps(7.0710677e-01f), fft430);
__m512 fft357 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft358 = _mm512_fmadd_ps(fft351, fft357, _mm512_shuffle_f32x4(fft351, fft351, 78));
__m512 fft445 = _mm512_fmadd_ps(fft439, fft357, _mm512_shuffle_f32x4(fft439, fft439, 78));
__m512 fft359 = _mm512_fmadd_ps(fft352, fft357, _mm512_shuffle_f32x4(fft352, fft352, 78));
__m512 fft446 = _mm512_fmadd_ps(fft440, fft357, _mm512_shuffle_f32x4(fft440, fft440, 78));
__m512 fft360 = _mm512_fmadd_ps(fft353, fft357, _mm512_shuffle_f32x4(fft353, fft353, 78));
__m512 fft447 = _mm512_fmadd_ps(fft441, fft357, _mm512_shuffle_f32x4(fft441, fft441, 78));
__m512 fft361 = _mm512_fmadd_ps(fft354, fft357, _mm512_shuffle_f32x4(fft354, fft354, 78));
__m512 fft448 = _mm512_fmadd_ps(fft442, fft357, _mm512_shuffle_f32x4(fft442, fft442, 78));
__m512 fft362 = _mm512_fmadd_ps(fft346, fft357, _mm512_shuffle_f32x4(fft346, fft346, 78));
__m512 fft449 = _mm512_fmadd_ps(fft434, fft357, _mm512_shuffle_f32x4(fft434, fft434, 78));
__m512 fft363 = _mm512_fmadd_ps(fft348, fft357, _mm512_shuffle_f32x4(fft348, fft348, 78));
__m512 fft450 = _mm512_fmadd_ps(fft436, fft357, _mm512_shuffle_f32x4(fft436, fft436, 78));
__m512 fft364 = _mm512_fmadd_ps(fft355, fft357, _mm512_shuffle_f32x4(fft355, fft355, 78));
__m512 fft451 = _mm512_fmadd_ps(fft443, fft357, _mm512_shuffle_f32x4(fft443, fft443, 78));
__m512 fft365 = _mm512_fmadd_ps(fft356, fft357, _mm512_shuffle_f32x4(fft356, fft356, 78));
__m512 fft452 = _mm512_fmadd_ps(fft444, fft357, _mm512_shuffle_f32x4(fft444, fft444, 78));
__m512 fft366 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft367 = _mm512_mul_ps(fft358, fft366);
__m512 fft453 = _mm512_mul_ps(fft445, fft366);
__m512 fft368 = _mm512_mul_ps(fft359, fft366);
__m512 fft454 = _mm512_mul_ps(fft446, fft366);
__m512 fft369 = _mm512_mul_ps(fft360, fft366);
__m512 fft455 = _mm512_mul_ps(fft447, fft366);
__m512 fft370 = _mm512_mul_ps(fft361, fft366);
__m512 fft456 = _mm512_mul_ps(fft448, fft366);
__m512 fft371 = _mm512_mul_ps(fft362, fft366);
__m512 fft457 = _mm512_mul_ps(fft449, fft366);
__m512 fft372 = _mm512_mul_ps(fft363, fft366);
__m512 fft458 = _mm512_mul_ps(fft450, fft366);
__m512 fft373 = _mm512_mul_ps(fft364, fft366);
__m512 fft459 = _mm512_mul_ps(fft451, fft366);
__m512 fft374 = _mm512_mul_ps(fft365, fft366);
__m512 fft460 = _mm512_mul_ps(fft452, fft366);
__m512 fft375 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft376 = _mm512_fmadd_ps(fft359, fft375, fft367);
__m512 fft461 = _mm512_fmadd_ps(fft446, fft375, fft453);
__m512 fft377 = _mm512_fnmadd_ps(fft358, fft375, fft368);
__m512 fft462 = _mm512_fnmadd_ps(fft445, fft375, fft454);
__m512 fft378 = _mm512_fmadd_ps(fft361, fft375, fft369);
__m512 fft463 = _mm512_fmadd_ps(fft448, fft375, fft455);
__m512 fft379 = _mm512_fnmadd_ps(fft360, fft375, fft370);
__m512 fft464 = _mm512_fnmadd_ps(fft447, fft375, fft456);
__m512 fft380 = _mm512_fmadd_ps(fft363, fft375, fft371);
__m512 fft465 = _mm512_fmadd_ps(fft450, fft375, fft457);
__m512 fft381 = _mm512_fnmadd_ps(fft362, fft375, fft372);
__m512 fft466 = _mm512_fnmadd_ps(fft449, fft375, fft458);
__m512 fft382 = _mm512_fmadd_ps(fft365, fft375, fft373);
__m512 fft467 = _mm512_fmadd_ps(fft452, fft375, fft459);
__m512 fft383 = _mm512_fnmadd_ps(fft364, fft375, fft374);
__m512 fft468 = _mm512_fnmadd_ps(fft451, fft375, fft460);
__m512 fft384 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft385 = _mm512_fmadd_ps(fft376, fft384, _mm512_shuffle_f32x4(fft376, fft376, 177));
__m512 fft469 = _mm512_fmadd_ps(fft461, fft384, _mm512_shuffle_f32x4(fft461, fft461, 177));
__m512 fft386 = _mm512_fmadd_ps(fft377, fft384, _mm512_shuffle_f32x4(fft377, fft377, 177));
__m512 fft470 = _mm512_fmadd_ps(fft462, fft384, _mm512_shuffle_f32x4(fft462, fft462, 177));
__m512 fft387 = _mm512_fmadd_ps(fft378, fft384, _mm512_shuffle_f32x4(fft378, fft378, 177));
__m512 fft471 = _mm512_fmadd_ps(fft463, fft384, _mm512_shuffle_f32x4(fft463, fft463, 177));
__m512 fft388 = _mm512_fmadd_ps(fft379, fft384, _mm512_shuffle_f32x4(fft379, fft379, 177));
__m512 fft472 = _mm512_fmadd_ps(fft464, fft384, _mm512_shuffle_f32x4(fft464, fft464, 177));
__m512 fft389 = _mm512_fmadd_ps(fft380, fft384, _mm512_shuffle_f32x4(fft380, fft380, 177));
__m512 fft473 = _mm512_fmadd_ps(fft465, fft384, _mm512_shuffle_f32x4(fft465, fft465, 177));
__m512 fft390 = _mm512_fmadd_ps(fft381, fft384, _mm512_shuffle_f32x4(fft381, fft381, 177));
__m512 fft474 = _mm512_fmadd_ps(fft466, fft384, _mm512_shuffle_f32x4(fft466, fft466, 177));
__m512 fft391 = _mm512_fmadd_ps(fft382, fft384, _mm512_shuffle_f32x4(fft382, fft382, 177));
__m512 fft475 = _mm512_fmadd_ps(fft467, fft384, _mm512_shuffle_f32x4(fft467, fft467, 177));
__m512 fft392 = _mm512_fmadd_ps(fft383, fft384, _mm512_shuffle_f32x4(fft383, fft383, 177));
__m512 fft476 = _mm512_fmadd_ps(fft468, fft384, _mm512_shuffle_f32x4(fft468, fft468, 177));
__m512 fft393 = _mm512_mask_mov_ps(fft385, 49344, fft386);
__m512 fft477 = _mm512_mask_mov_ps(fft469, 49344, fft470);
__m512 fft394 = _mm512_mask_sub_ps(fft386, 49344, _mm512_setzero_ps(), fft385);
__m512 fft478 = _mm512_mask_sub_ps(fft470, 49344, _mm512_setzero_ps(), fft469);
__m512 fft395 = _mm512_mask_mov_ps(fft387, 49344, fft388);
__m512 fft479 = _mm512_mask_mov_ps(fft471, 49344, fft472);
__m512 fft396 = _mm512_mask_sub_ps(fft388, 49344, _mm512_setzero_ps(), fft387);
__m512 fft480 = _mm512_mask_sub_ps(fft472, 49344, _mm512_setzero_ps(), fft471);
__m512 fft397 = _mm512_mask_mov_ps(fft389, 49344, fft390);
__m512 fft481 = _mm512_mask_mov_ps(fft473, 49344, fft474);
__m512 fft398 = _mm512_mask_sub_ps(fft390, 49344, _mm512_setzero_ps(), fft389);
__m512 fft482 = _mm512_mask_sub_ps(fft474, 49344, _mm512_setzero_ps(), fft473);
__m512 fft399 = _mm512_mask_mov_ps(fft391, 49344, fft392);
__m512 fft483 = _mm512_mask_mov_ps(fft475, 49344, fft476);
__m512 fft400 = _mm512_mask_sub_ps(fft392, 49344, _mm512_setzero_ps(), fft391);
__m512 fft484 = _mm512_mask_sub_ps(fft476, 49344, _mm512_setzero_ps(), fft475);
__m512 fft401 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft402 = _mm512_fmadd_ps(fft393, fft401, _mm512_shuffle_ps(fft393, fft393, 78));
__m512 fft485 = _mm512_fmadd_ps(fft477, fft401, _mm512_shuffle_ps(fft477, fft477, 78));
__m512 fft403 = _mm512_fmadd_ps(fft394, fft401, _mm512_shuffle_ps(fft394, fft394, 78));
__m512 fft486 = _mm512_fmadd_ps(fft478, fft401, _mm512_shuffle_ps(fft478, fft478, 78));
__m512 fft404 = _mm512_fmadd_ps(fft395, fft401, _mm512_shuffle_ps(fft395, fft395, 78));
__m512 fft487 = _mm512_fmadd_ps(fft479, fft401, _mm512_shuffle_ps(fft479, fft479, 78));
__m512 fft405 = _mm512_fmadd_ps(fft396, fft401, _mm512_shuffle_ps(fft396, fft396, 78));
__m512 fft488 = _mm512_fmadd_ps(fft480, fft401, _mm512_shuffle_ps(fft480, fft480, 78));
__m512 fft406 = _mm512_fmadd_ps(fft397, fft401, _mm512_shuffle_ps(fft397, fft397, 78));
__m512 fft489 = _mm512_fmadd_ps(fft481, fft401, _mm512_shuffle_ps(fft481, fft481, 78));
__m512 fft407 = _mm512_fmadd_ps(fft398, fft401, _mm512_shuffle_ps(fft398, fft398, 78));
__m512 fft490 = _mm512_fmadd_ps(fft482, fft401, _mm512_shuffle_ps(fft482, fft482, 78));
__m512 fft408 = _mm512_fmadd_ps(fft399, fft401, _mm512_shuffle_ps(fft399, fft399, 78));
__m512 fft491 = _mm512_fmadd_ps(fft483, fft401, _mm512_shuffle_ps(fft483, fft483, 78));
__m512 fft409 = _mm512_fmadd_ps(fft400, fft401, _mm512_shuffle_ps(fft400, fft400, 78));
__m512 fft492 = _mm512_fmadd_ps(fft484, fft401, _mm512_shuffle_ps(fft484, fft484, 78));
__m512i fft410 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft411 = _mm512_permutexvar_ps(fft410, fft402);
__m512 fft493 = _mm512_permutexvar_ps(fft410, fft485);
__m512i fft412 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft413 = _mm512_permutexvar_ps(fft412, fft402);
__m512 fft494 = _mm512_permutexvar_ps(fft412, fft485);
__m512 fft414 = _mm512_permutexvar_ps(fft410, fft403);
__m512 fft495 = _mm512_permutexvar_ps(fft410, fft486);
__m512 fft415 = _mm512_permutexvar_ps(fft412, fft403);
__m512 fft496 = _mm512_permutexvar_ps(fft412, fft486);
__m512 fft416 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft417 = _mm512_fmadd_ps(fft411, fft416, fft413);
__m512 fft497 = _mm512_fmadd_ps(fft493, fft416, fft494);
__m512 fft418 = _mm512_fnmadd_ps(fft415, fft416, fft414);
__m512 fft498 = _mm512_fnmadd_ps(fft496, fft416, fft495);
__m512 fft419 = _mm512_mask_mov_ps(fft415, 21845, fft417);
__m512 fft499 = _mm512_mask_mov_ps(fft496, 21845, fft497);
__m512 fft420 = _mm512_mask_mov_ps(fft411, 43176, fft417);
__m512 fft500 = _mm512_mask_mov_ps(fft493, 43176, fft497);
__m512 fft421 = _mm512_mask_mov_ps(fft419, 43176, fft418);
__m512 fft501 = _mm512_mask_mov_ps(fft499, 43176, fft498);
__m512 fft422 = _mm512_mask_mov_ps(fft420, 22102, fft418);
__m512 fft502 = _mm512_mask_mov_ps(fft500, 22102, fft498);
__m512 fft423 = _mm512_mask_mul_ps(fft421, 64764, fft421, _mm512_set1_ps(5e-01f));
__m512 fft503 = _mm512_mask_mul_ps(fft501, 64764, fft501, _mm512_set1_ps(5e-01f));
__m512 fft424 = _mm512_mask_mul_ps(fft422, 64764, fft422, _mm512_set1_ps(5e-01f));
__m512 fft504 = _mm512_mask_mul_ps(fft502, 64764, fft502, _mm512_set1_ps(5e-01f));
__m512 wf33 = fft423;
__m512 wf41 = fft503;
__m512 wf34 = fft424;
__m512 wf42 = fft504;
__m512 wf35 = fft404;
__m512 wf43 = fft487;
__m512 wf36 = fft405;
__m512 wf44 = fft488;
__m512 wf37 = fft406;
__m512 wf45 = fft489;
__m512 wf38 = fft407;
__m512 wf46 = fft490;
__m512 wf39 = fft408;
__m512 wf47 = fft491;
__m512 wf40 = fft409;
__m512 wf48 = fft492;
ptrdiff_t c3 = (size_t)(0+2*j1)/4;
ptrdiff_t m3 = (size_t)(0+2*j1)%4/2;
ptrdiff_t f4 = (size_t)(0+2*j1)%2;
__m512i eo3 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf35 = _mm512_permutexvar_ps(eo3, wf35);
wf36 = _mm512_permutexvar_ps(eo3, wf36);
__m512i wfs17 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf35, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs17 = _mm512_inserti64x4(wfs17, _mm512_cvtps_ph(wf36, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+145920+583680*i5+7296*c3+128*k2+64*m3+16*f4, 3855, wfs17);
_mm512_mask_storeu_epi32(wfPtr1+3064304+583680*i5+7296*c3+128*k2+64*m3+16*f4, 61680, wfs17);
wf43 = _mm512_permutexvar_ps(eo3, wf43);
wf44 = _mm512_permutexvar_ps(eo3, wf44);
__m512i wfs18 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf43, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs18 = _mm512_inserti64x4(wfs18, _mm512_cvtps_ph(wf44, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+5982720+583680*i5+7296*c3+128*k2+64*m3+16*f4, 3855, wfs18);
_mm512_mask_storeu_epi32(wfPtr1+8901104+583680*i5+7296*c3+128*k2+64*m3+16*f4, 61680, wfs18);
wf37 = _mm512_permutexvar_ps(eo3, wf37);
wf38 = _mm512_permutexvar_ps(eo3, wf38);
__m512i wfs19 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf37, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs19 = _mm512_inserti64x4(wfs19, _mm512_cvtps_ph(wf38, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+291840+583680*i5+7296*c3+128*k2+64*m3+16*f4, 3855, wfs19);
_mm512_mask_storeu_epi32(wfPtr1+3210224+583680*i5+7296*c3+128*k2+64*m3+16*f4, 61680, wfs19);
wf45 = _mm512_permutexvar_ps(eo3, wf45);
wf46 = _mm512_permutexvar_ps(eo3, wf46);
__m512i wfs20 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf45, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs20 = _mm512_inserti64x4(wfs20, _mm512_cvtps_ph(wf46, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+6128640+583680*i5+7296*c3+128*k2+64*m3+16*f4, 3855, wfs20);
_mm512_mask_storeu_epi32(wfPtr1+9047024+583680*i5+7296*c3+128*k2+64*m3+16*f4, 61680, wfs20);
wf39 = _mm512_permutexvar_ps(eo3, wf39);
wf40 = _mm512_permutexvar_ps(eo3, wf40);
__m512i wfs21 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf39, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs21 = _mm512_inserti64x4(wfs21, _mm512_cvtps_ph(wf40, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+437760+583680*i5+7296*c3+128*k2+64*m3+16*f4, 3855, wfs21);
_mm512_mask_storeu_epi32(wfPtr1+3356144+583680*i5+7296*c3+128*k2+64*m3+16*f4, 61680, wfs21);
wf47 = _mm512_permutexvar_ps(eo3, wf47);
wf48 = _mm512_permutexvar_ps(eo3, wf48);
__m512i wfs22 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf47, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs22 = _mm512_inserti64x4(wfs22, _mm512_cvtps_ph(wf48, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+6274560+583680*i5+7296*c3+128*k2+64*m3+16*f4, 3855, wfs22);
_mm512_mask_storeu_epi32(wfPtr1+9192944+583680*i5+7296*c3+128*k2+64*m3+16*f4, 61680, wfs22);
__m512i wfs23 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf33, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs23 = _mm512_inserti64x4(wfs23, _mm512_cvtps_ph(wf34, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+0+583680*i5+7296*c3+128*k2+64*m3+16*f4, 3855, wfs23);
_mm512_mask_storeu_epi32(wfPtr1+2918384+583680*i5+7296*c3+128*k2+64*m3+16*f4, 61680, wfs23);
__m512i wfs24 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf41, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs24 = _mm512_inserti64x4(wfs24, _mm512_cvtps_ph(wf42, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+5836800+583680*i5+7296*c3+128*k2+64*m3+16*f4, 3855, wfs24);
_mm512_mask_storeu_epi32(wfPtr1+8755184+583680*i5+7296*c3+128*k2+64*m3+16*f4, 61680, wfs24);
__m512 wt22 = _mm512_maskz_loadu_ps(127, wtPtr1+11172+882588*i5+22344*j1+196*k2);
__m512 wt23 = _mm512_maskz_loadu_ps(127, wtPtr1+11200+882588*i5+22344*j1+196*k2);
__m512 wt24 = _mm512_maskz_loadu_ps(127, wtPtr1+11228+882588*i5+22344*j1+196*k2);
__m512 wt25 = _mm512_maskz_loadu_ps(127, wtPtr1+11256+882588*i5+22344*j1+196*k2);
__m512 wt26 = _mm512_maskz_loadu_ps(127, wtPtr1+11284+882588*i5+22344*j1+196*k2);
__m512 wt27 = _mm512_maskz_loadu_ps(127, wtPtr1+11312+882588*i5+22344*j1+196*k2);
__m512 wt28 = _mm512_maskz_loadu_ps(127, wtPtr1+11340+882588*i5+22344*j1+196*k2);
__m512 fft505 = _mm512_add_ps(wt22, _mm512_setzero_ps());
__m512 fft593 = _mm512_add_ps(wt23, _mm512_setzero_ps());
__m512 fft506 = _mm512_sub_ps(wt22, _mm512_setzero_ps());
__m512 fft594 = _mm512_sub_ps(wt23, _mm512_setzero_ps());
__m512 fft507 = _mm512_add_ps(wt24, _mm512_setzero_ps());
__m512 fft595 = _mm512_add_ps(wt25, _mm512_setzero_ps());
__m512 fft508 = _mm512_sub_ps(wt24, _mm512_setzero_ps());
__m512 fft596 = _mm512_sub_ps(wt25, _mm512_setzero_ps());
__m512 fft509 = _mm512_add_ps(wt26, _mm512_setzero_ps());
__m512 fft597 = _mm512_add_ps(wt27, _mm512_setzero_ps());
__m512 fft510 = _mm512_sub_ps(wt26, _mm512_setzero_ps());
__m512 fft598 = _mm512_sub_ps(wt27, _mm512_setzero_ps());
__m512 fft511 = _mm512_add_ps(wt28, _mm512_setzero_ps());
__m512 fft599 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft512 = _mm512_sub_ps(wt28, _mm512_setzero_ps());
__m512 fft600 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft513 = _mm512_add_ps(fft505, fft509);
__m512 fft601 = _mm512_add_ps(fft593, fft597);
__m512 fft514 = _mm512_sub_ps(fft505, fft509);
__m512 fft602 = _mm512_sub_ps(fft593, fft597);
__m512 fft515 = _mm512_add_ps(fft507, fft511);
__m512 fft603 = _mm512_add_ps(fft595, fft599);
__m512 fft516 = _mm512_sub_ps(fft511, fft507);
__m512 fft604 = _mm512_sub_ps(fft599, fft595);
__m512 fft517 = _mm512_sub_ps(fft508, fft512);
__m512 fft605 = _mm512_sub_ps(fft596, fft600);
__m512 fft518 = _mm512_add_ps(fft508, fft512);
__m512 fft606 = _mm512_add_ps(fft596, fft600);
__m512 fft519 = _mm512_add_ps(fft513, fft515);
__m512 fft607 = _mm512_add_ps(fft601, fft603);
__m512 fft520 = _mm512_sub_ps(fft513, fft515);
__m512 fft608 = _mm512_sub_ps(fft601, fft603);
__m512 fft521 = _mm512_fmadd_ps(fft517, _mm512_set1_ps(7.0710677e-01f), fft506);
__m512 fft609 = _mm512_fmadd_ps(fft605, _mm512_set1_ps(7.0710677e-01f), fft594);
__m512 fft522 = _mm512_fnmsub_ps(fft518, _mm512_set1_ps(7.0710677e-01f), fft510);
__m512 fft610 = _mm512_fnmsub_ps(fft606, _mm512_set1_ps(7.0710677e-01f), fft598);
__m512 fft523 = _mm512_fnmadd_ps(fft517, _mm512_set1_ps(7.0710677e-01f), fft506);
__m512 fft611 = _mm512_fnmadd_ps(fft605, _mm512_set1_ps(7.0710677e-01f), fft594);
__m512 fft524 = _mm512_fnmadd_ps(fft518, _mm512_set1_ps(7.0710677e-01f), fft510);
__m512 fft612 = _mm512_fnmadd_ps(fft606, _mm512_set1_ps(7.0710677e-01f), fft598);
__m512 fft525 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft526 = _mm512_fmadd_ps(fft519, fft525, _mm512_shuffle_f32x4(fft519, fft519, 78));
__m512 fft613 = _mm512_fmadd_ps(fft607, fft525, _mm512_shuffle_f32x4(fft607, fft607, 78));
__m512 fft527 = _mm512_fmadd_ps(fft520, fft525, _mm512_shuffle_f32x4(fft520, fft520, 78));
__m512 fft614 = _mm512_fmadd_ps(fft608, fft525, _mm512_shuffle_f32x4(fft608, fft608, 78));
__m512 fft528 = _mm512_fmadd_ps(fft521, fft525, _mm512_shuffle_f32x4(fft521, fft521, 78));
__m512 fft615 = _mm512_fmadd_ps(fft609, fft525, _mm512_shuffle_f32x4(fft609, fft609, 78));
__m512 fft529 = _mm512_fmadd_ps(fft522, fft525, _mm512_shuffle_f32x4(fft522, fft522, 78));
__m512 fft616 = _mm512_fmadd_ps(fft610, fft525, _mm512_shuffle_f32x4(fft610, fft610, 78));
__m512 fft530 = _mm512_fmadd_ps(fft514, fft525, _mm512_shuffle_f32x4(fft514, fft514, 78));
__m512 fft617 = _mm512_fmadd_ps(fft602, fft525, _mm512_shuffle_f32x4(fft602, fft602, 78));
__m512 fft531 = _mm512_fmadd_ps(fft516, fft525, _mm512_shuffle_f32x4(fft516, fft516, 78));
__m512 fft618 = _mm512_fmadd_ps(fft604, fft525, _mm512_shuffle_f32x4(fft604, fft604, 78));
__m512 fft532 = _mm512_fmadd_ps(fft523, fft525, _mm512_shuffle_f32x4(fft523, fft523, 78));
__m512 fft619 = _mm512_fmadd_ps(fft611, fft525, _mm512_shuffle_f32x4(fft611, fft611, 78));
__m512 fft533 = _mm512_fmadd_ps(fft524, fft525, _mm512_shuffle_f32x4(fft524, fft524, 78));
__m512 fft620 = _mm512_fmadd_ps(fft612, fft525, _mm512_shuffle_f32x4(fft612, fft612, 78));
__m512 fft534 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft535 = _mm512_mul_ps(fft526, fft534);
__m512 fft621 = _mm512_mul_ps(fft613, fft534);
__m512 fft536 = _mm512_mul_ps(fft527, fft534);
__m512 fft622 = _mm512_mul_ps(fft614, fft534);
__m512 fft537 = _mm512_mul_ps(fft528, fft534);
__m512 fft623 = _mm512_mul_ps(fft615, fft534);
__m512 fft538 = _mm512_mul_ps(fft529, fft534);
__m512 fft624 = _mm512_mul_ps(fft616, fft534);
__m512 fft539 = _mm512_mul_ps(fft530, fft534);
__m512 fft625 = _mm512_mul_ps(fft617, fft534);
__m512 fft540 = _mm512_mul_ps(fft531, fft534);
__m512 fft626 = _mm512_mul_ps(fft618, fft534);
__m512 fft541 = _mm512_mul_ps(fft532, fft534);
__m512 fft627 = _mm512_mul_ps(fft619, fft534);
__m512 fft542 = _mm512_mul_ps(fft533, fft534);
__m512 fft628 = _mm512_mul_ps(fft620, fft534);
__m512 fft543 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft544 = _mm512_fmadd_ps(fft527, fft543, fft535);
__m512 fft629 = _mm512_fmadd_ps(fft614, fft543, fft621);
__m512 fft545 = _mm512_fnmadd_ps(fft526, fft543, fft536);
__m512 fft630 = _mm512_fnmadd_ps(fft613, fft543, fft622);
__m512 fft546 = _mm512_fmadd_ps(fft529, fft543, fft537);
__m512 fft631 = _mm512_fmadd_ps(fft616, fft543, fft623);
__m512 fft547 = _mm512_fnmadd_ps(fft528, fft543, fft538);
__m512 fft632 = _mm512_fnmadd_ps(fft615, fft543, fft624);
__m512 fft548 = _mm512_fmadd_ps(fft531, fft543, fft539);
__m512 fft633 = _mm512_fmadd_ps(fft618, fft543, fft625);
__m512 fft549 = _mm512_fnmadd_ps(fft530, fft543, fft540);
__m512 fft634 = _mm512_fnmadd_ps(fft617, fft543, fft626);
__m512 fft550 = _mm512_fmadd_ps(fft533, fft543, fft541);
__m512 fft635 = _mm512_fmadd_ps(fft620, fft543, fft627);
__m512 fft551 = _mm512_fnmadd_ps(fft532, fft543, fft542);
__m512 fft636 = _mm512_fnmadd_ps(fft619, fft543, fft628);
__m512 fft552 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft553 = _mm512_fmadd_ps(fft544, fft552, _mm512_shuffle_f32x4(fft544, fft544, 177));
__m512 fft637 = _mm512_fmadd_ps(fft629, fft552, _mm512_shuffle_f32x4(fft629, fft629, 177));
__m512 fft554 = _mm512_fmadd_ps(fft545, fft552, _mm512_shuffle_f32x4(fft545, fft545, 177));
__m512 fft638 = _mm512_fmadd_ps(fft630, fft552, _mm512_shuffle_f32x4(fft630, fft630, 177));
__m512 fft555 = _mm512_fmadd_ps(fft546, fft552, _mm512_shuffle_f32x4(fft546, fft546, 177));
__m512 fft639 = _mm512_fmadd_ps(fft631, fft552, _mm512_shuffle_f32x4(fft631, fft631, 177));
__m512 fft556 = _mm512_fmadd_ps(fft547, fft552, _mm512_shuffle_f32x4(fft547, fft547, 177));
__m512 fft640 = _mm512_fmadd_ps(fft632, fft552, _mm512_shuffle_f32x4(fft632, fft632, 177));
__m512 fft557 = _mm512_fmadd_ps(fft548, fft552, _mm512_shuffle_f32x4(fft548, fft548, 177));
__m512 fft641 = _mm512_fmadd_ps(fft633, fft552, _mm512_shuffle_f32x4(fft633, fft633, 177));
__m512 fft558 = _mm512_fmadd_ps(fft549, fft552, _mm512_shuffle_f32x4(fft549, fft549, 177));
__m512 fft642 = _mm512_fmadd_ps(fft634, fft552, _mm512_shuffle_f32x4(fft634, fft634, 177));
__m512 fft559 = _mm512_fmadd_ps(fft550, fft552, _mm512_shuffle_f32x4(fft550, fft550, 177));
__m512 fft643 = _mm512_fmadd_ps(fft635, fft552, _mm512_shuffle_f32x4(fft635, fft635, 177));
__m512 fft560 = _mm512_fmadd_ps(fft551, fft552, _mm512_shuffle_f32x4(fft551, fft551, 177));
__m512 fft644 = _mm512_fmadd_ps(fft636, fft552, _mm512_shuffle_f32x4(fft636, fft636, 177));
__m512 fft561 = _mm512_mask_mov_ps(fft553, 49344, fft554);
__m512 fft645 = _mm512_mask_mov_ps(fft637, 49344, fft638);
__m512 fft562 = _mm512_mask_sub_ps(fft554, 49344, _mm512_setzero_ps(), fft553);
__m512 fft646 = _mm512_mask_sub_ps(fft638, 49344, _mm512_setzero_ps(), fft637);
__m512 fft563 = _mm512_mask_mov_ps(fft555, 49344, fft556);
__m512 fft647 = _mm512_mask_mov_ps(fft639, 49344, fft640);
__m512 fft564 = _mm512_mask_sub_ps(fft556, 49344, _mm512_setzero_ps(), fft555);
__m512 fft648 = _mm512_mask_sub_ps(fft640, 49344, _mm512_setzero_ps(), fft639);
__m512 fft565 = _mm512_mask_mov_ps(fft557, 49344, fft558);
__m512 fft649 = _mm512_mask_mov_ps(fft641, 49344, fft642);
__m512 fft566 = _mm512_mask_sub_ps(fft558, 49344, _mm512_setzero_ps(), fft557);
__m512 fft650 = _mm512_mask_sub_ps(fft642, 49344, _mm512_setzero_ps(), fft641);
__m512 fft567 = _mm512_mask_mov_ps(fft559, 49344, fft560);
__m512 fft651 = _mm512_mask_mov_ps(fft643, 49344, fft644);
__m512 fft568 = _mm512_mask_sub_ps(fft560, 49344, _mm512_setzero_ps(), fft559);
__m512 fft652 = _mm512_mask_sub_ps(fft644, 49344, _mm512_setzero_ps(), fft643);
__m512 fft569 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft570 = _mm512_fmadd_ps(fft561, fft569, _mm512_shuffle_ps(fft561, fft561, 78));
__m512 fft653 = _mm512_fmadd_ps(fft645, fft569, _mm512_shuffle_ps(fft645, fft645, 78));
__m512 fft571 = _mm512_fmadd_ps(fft562, fft569, _mm512_shuffle_ps(fft562, fft562, 78));
__m512 fft654 = _mm512_fmadd_ps(fft646, fft569, _mm512_shuffle_ps(fft646, fft646, 78));
__m512 fft572 = _mm512_fmadd_ps(fft563, fft569, _mm512_shuffle_ps(fft563, fft563, 78));
__m512 fft655 = _mm512_fmadd_ps(fft647, fft569, _mm512_shuffle_ps(fft647, fft647, 78));
__m512 fft573 = _mm512_fmadd_ps(fft564, fft569, _mm512_shuffle_ps(fft564, fft564, 78));
__m512 fft656 = _mm512_fmadd_ps(fft648, fft569, _mm512_shuffle_ps(fft648, fft648, 78));
__m512 fft574 = _mm512_fmadd_ps(fft565, fft569, _mm512_shuffle_ps(fft565, fft565, 78));
__m512 fft657 = _mm512_fmadd_ps(fft649, fft569, _mm512_shuffle_ps(fft649, fft649, 78));
__m512 fft575 = _mm512_fmadd_ps(fft566, fft569, _mm512_shuffle_ps(fft566, fft566, 78));
__m512 fft658 = _mm512_fmadd_ps(fft650, fft569, _mm512_shuffle_ps(fft650, fft650, 78));
__m512 fft576 = _mm512_fmadd_ps(fft567, fft569, _mm512_shuffle_ps(fft567, fft567, 78));
__m512 fft659 = _mm512_fmadd_ps(fft651, fft569, _mm512_shuffle_ps(fft651, fft651, 78));
__m512 fft577 = _mm512_fmadd_ps(fft568, fft569, _mm512_shuffle_ps(fft568, fft568, 78));
__m512 fft660 = _mm512_fmadd_ps(fft652, fft569, _mm512_shuffle_ps(fft652, fft652, 78));
__m512i fft578 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft579 = _mm512_permutexvar_ps(fft578, fft570);
__m512 fft661 = _mm512_permutexvar_ps(fft578, fft653);
__m512i fft580 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft581 = _mm512_permutexvar_ps(fft580, fft570);
__m512 fft662 = _mm512_permutexvar_ps(fft580, fft653);
__m512 fft582 = _mm512_permutexvar_ps(fft578, fft571);
__m512 fft663 = _mm512_permutexvar_ps(fft578, fft654);
__m512 fft583 = _mm512_permutexvar_ps(fft580, fft571);
__m512 fft664 = _mm512_permutexvar_ps(fft580, fft654);
__m512 fft584 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft585 = _mm512_fmadd_ps(fft579, fft584, fft581);
__m512 fft665 = _mm512_fmadd_ps(fft661, fft584, fft662);
__m512 fft586 = _mm512_fnmadd_ps(fft583, fft584, fft582);
__m512 fft666 = _mm512_fnmadd_ps(fft664, fft584, fft663);
__m512 fft587 = _mm512_mask_mov_ps(fft583, 21845, fft585);
__m512 fft667 = _mm512_mask_mov_ps(fft664, 21845, fft665);
__m512 fft588 = _mm512_mask_mov_ps(fft579, 43176, fft585);
__m512 fft668 = _mm512_mask_mov_ps(fft661, 43176, fft665);
__m512 fft589 = _mm512_mask_mov_ps(fft587, 43176, fft586);
__m512 fft669 = _mm512_mask_mov_ps(fft667, 43176, fft666);
__m512 fft590 = _mm512_mask_mov_ps(fft588, 22102, fft586);
__m512 fft670 = _mm512_mask_mov_ps(fft668, 22102, fft666);
__m512 fft591 = _mm512_mask_mul_ps(fft589, 64764, fft589, _mm512_set1_ps(5e-01f));
__m512 fft671 = _mm512_mask_mul_ps(fft669, 64764, fft669, _mm512_set1_ps(5e-01f));
__m512 fft592 = _mm512_mask_mul_ps(fft590, 64764, fft590, _mm512_set1_ps(5e-01f));
__m512 fft672 = _mm512_mask_mul_ps(fft670, 64764, fft670, _mm512_set1_ps(5e-01f));
__m512 wf49 = fft591;
__m512 wf57 = fft671;
__m512 wf50 = fft592;
__m512 wf58 = fft672;
__m512 wf51 = fft572;
__m512 wf59 = fft655;
__m512 wf52 = fft573;
__m512 wf60 = fft656;
__m512 wf53 = fft574;
__m512 wf61 = fft657;
__m512 wf54 = fft575;
__m512 wf62 = fft658;
__m512 wf55 = fft576;
__m512 wf63 = fft659;
__m512 wf56 = fft577;
__m512 wf64 = fft660;
ptrdiff_t c4 = (size_t)(1+2*j1)/4;
ptrdiff_t m4 = (size_t)(1+2*j1)%4/2;
ptrdiff_t f5 = (size_t)(1+2*j1)%2;
__m512i eo4 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf51 = _mm512_permutexvar_ps(eo4, wf51);
wf52 = _mm512_permutexvar_ps(eo4, wf52);
__m512i wfs25 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf51, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs25 = _mm512_inserti64x4(wfs25, _mm512_cvtps_ph(wf52, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+145920+583680*i5+7296*c4+128*k2+64*m4+16*f5, 3855, wfs25);
_mm512_mask_storeu_epi32(wfPtr1+3064304+583680*i5+7296*c4+128*k2+64*m4+16*f5, 61680, wfs25);
wf59 = _mm512_permutexvar_ps(eo4, wf59);
wf60 = _mm512_permutexvar_ps(eo4, wf60);
__m512i wfs26 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf59, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs26 = _mm512_inserti64x4(wfs26, _mm512_cvtps_ph(wf60, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+5982720+583680*i5+7296*c4+128*k2+64*m4+16*f5, 3855, wfs26);
_mm512_mask_storeu_epi32(wfPtr1+8901104+583680*i5+7296*c4+128*k2+64*m4+16*f5, 61680, wfs26);
wf53 = _mm512_permutexvar_ps(eo4, wf53);
wf54 = _mm512_permutexvar_ps(eo4, wf54);
__m512i wfs27 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf53, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs27 = _mm512_inserti64x4(wfs27, _mm512_cvtps_ph(wf54, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+291840+583680*i5+7296*c4+128*k2+64*m4+16*f5, 3855, wfs27);
_mm512_mask_storeu_epi32(wfPtr1+3210224+583680*i5+7296*c4+128*k2+64*m4+16*f5, 61680, wfs27);
wf61 = _mm512_permutexvar_ps(eo4, wf61);
wf62 = _mm512_permutexvar_ps(eo4, wf62);
__m512i wfs28 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf61, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs28 = _mm512_inserti64x4(wfs28, _mm512_cvtps_ph(wf62, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+6128640+583680*i5+7296*c4+128*k2+64*m4+16*f5, 3855, wfs28);
_mm512_mask_storeu_epi32(wfPtr1+9047024+583680*i5+7296*c4+128*k2+64*m4+16*f5, 61680, wfs28);
wf55 = _mm512_permutexvar_ps(eo4, wf55);
wf56 = _mm512_permutexvar_ps(eo4, wf56);
__m512i wfs29 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf55, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs29 = _mm512_inserti64x4(wfs29, _mm512_cvtps_ph(wf56, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+437760+583680*i5+7296*c4+128*k2+64*m4+16*f5, 3855, wfs29);
_mm512_mask_storeu_epi32(wfPtr1+3356144+583680*i5+7296*c4+128*k2+64*m4+16*f5, 61680, wfs29);
wf63 = _mm512_permutexvar_ps(eo4, wf63);
wf64 = _mm512_permutexvar_ps(eo4, wf64);
__m512i wfs30 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf63, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs30 = _mm512_inserti64x4(wfs30, _mm512_cvtps_ph(wf64, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+6274560+583680*i5+7296*c4+128*k2+64*m4+16*f5, 3855, wfs30);
_mm512_mask_storeu_epi32(wfPtr1+9192944+583680*i5+7296*c4+128*k2+64*m4+16*f5, 61680, wfs30);
__m512i wfs31 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf49, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs31 = _mm512_inserti64x4(wfs31, _mm512_cvtps_ph(wf50, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+0+583680*i5+7296*c4+128*k2+64*m4+16*f5, 3855, wfs31);
_mm512_mask_storeu_epi32(wfPtr1+2918384+583680*i5+7296*c4+128*k2+64*m4+16*f5, 61680, wfs31);
__m512i wfs32 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf57, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs32 = _mm512_inserti64x4(wfs32, _mm512_cvtps_ph(wf58, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+5836800+583680*i5+7296*c4+128*k2+64*m4+16*f5, 3855, wfs32);
_mm512_mask_storeu_epi32(wfPtr1+8755184+583680*i5+7296*c4+128*k2+64*m4+16*f5, 61680, wfs32);
}
__m512 bias2 = _mm512_setzero_ps();
if (!e1) {
bias2 = _mm512_maskz_loadu_ps(3, biasPtr1-0+316*i5+8*j1);
bias2 = _mm512_mul_ps(bias2, _mm512_set1_ps(6.4e+01f));
}
_mm512_mask_storeu_ps(bfPtr1-0+320*i5+8*j1, 3, bias2);
if (j1 >= jj1) return;
j1 = 39;
}
if (j1 == 39) {
for (ptrdiff_t k3 = 0; k3 < 57; ++k3) {
__m512 wt29 = _mm512_maskz_loadu_ps(127, wtPtr1+0+882588*i5+22344*j1+196*k3);
__m512 wt30 = _mm512_maskz_loadu_ps(127, wtPtr1+28+882588*i5+22344*j1+196*k3);
__m512 wt31 = _mm512_maskz_loadu_ps(127, wtPtr1+56+882588*i5+22344*j1+196*k3);
__m512 wt32 = _mm512_maskz_loadu_ps(127, wtPtr1+84+882588*i5+22344*j1+196*k3);
__m512 wt33 = _mm512_maskz_loadu_ps(127, wtPtr1+112+882588*i5+22344*j1+196*k3);
__m512 wt34 = _mm512_maskz_loadu_ps(127, wtPtr1+140+882588*i5+22344*j1+196*k3);
__m512 wt35 = _mm512_maskz_loadu_ps(127, wtPtr1+168+882588*i5+22344*j1+196*k3);
__m512 fft673 = _mm512_add_ps(wt29, _mm512_setzero_ps());
__m512 fft761 = _mm512_add_ps(wt30, _mm512_setzero_ps());
__m512 fft674 = _mm512_sub_ps(wt29, _mm512_setzero_ps());
__m512 fft762 = _mm512_sub_ps(wt30, _mm512_setzero_ps());
__m512 fft675 = _mm512_add_ps(wt31, _mm512_setzero_ps());
__m512 fft763 = _mm512_add_ps(wt32, _mm512_setzero_ps());
__m512 fft676 = _mm512_sub_ps(wt31, _mm512_setzero_ps());
__m512 fft764 = _mm512_sub_ps(wt32, _mm512_setzero_ps());
__m512 fft677 = _mm512_add_ps(wt33, _mm512_setzero_ps());
__m512 fft765 = _mm512_add_ps(wt34, _mm512_setzero_ps());
__m512 fft678 = _mm512_sub_ps(wt33, _mm512_setzero_ps());
__m512 fft766 = _mm512_sub_ps(wt34, _mm512_setzero_ps());
__m512 fft679 = _mm512_add_ps(wt35, _mm512_setzero_ps());
__m512 fft767 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft680 = _mm512_sub_ps(wt35, _mm512_setzero_ps());
__m512 fft768 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft681 = _mm512_add_ps(fft673, fft677);
__m512 fft769 = _mm512_add_ps(fft761, fft765);
__m512 fft682 = _mm512_sub_ps(fft673, fft677);
__m512 fft770 = _mm512_sub_ps(fft761, fft765);
__m512 fft683 = _mm512_add_ps(fft675, fft679);
__m512 fft771 = _mm512_add_ps(fft763, fft767);
__m512 fft684 = _mm512_sub_ps(fft679, fft675);
__m512 fft772 = _mm512_sub_ps(fft767, fft763);
__m512 fft685 = _mm512_sub_ps(fft676, fft680);
__m512 fft773 = _mm512_sub_ps(fft764, fft768);
__m512 fft686 = _mm512_add_ps(fft676, fft680);
__m512 fft774 = _mm512_add_ps(fft764, fft768);
__m512 fft687 = _mm512_add_ps(fft681, fft683);
__m512 fft775 = _mm512_add_ps(fft769, fft771);
__m512 fft688 = _mm512_sub_ps(fft681, fft683);
__m512 fft776 = _mm512_sub_ps(fft769, fft771);
__m512 fft689 = _mm512_fmadd_ps(fft685, _mm512_set1_ps(7.0710677e-01f), fft674);
__m512 fft777 = _mm512_fmadd_ps(fft773, _mm512_set1_ps(7.0710677e-01f), fft762);
__m512 fft690 = _mm512_fnmsub_ps(fft686, _mm512_set1_ps(7.0710677e-01f), fft678);
__m512 fft778 = _mm512_fnmsub_ps(fft774, _mm512_set1_ps(7.0710677e-01f), fft766);
__m512 fft691 = _mm512_fnmadd_ps(fft685, _mm512_set1_ps(7.0710677e-01f), fft674);
__m512 fft779 = _mm512_fnmadd_ps(fft773, _mm512_set1_ps(7.0710677e-01f), fft762);
__m512 fft692 = _mm512_fnmadd_ps(fft686, _mm512_set1_ps(7.0710677e-01f), fft678);
__m512 fft780 = _mm512_fnmadd_ps(fft774, _mm512_set1_ps(7.0710677e-01f), fft766);
__m512 fft693 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft694 = _mm512_fmadd_ps(fft687, fft693, _mm512_shuffle_f32x4(fft687, fft687, 78));
__m512 fft781 = _mm512_fmadd_ps(fft775, fft693, _mm512_shuffle_f32x4(fft775, fft775, 78));
__m512 fft695 = _mm512_fmadd_ps(fft688, fft693, _mm512_shuffle_f32x4(fft688, fft688, 78));
__m512 fft782 = _mm512_fmadd_ps(fft776, fft693, _mm512_shuffle_f32x4(fft776, fft776, 78));
__m512 fft696 = _mm512_fmadd_ps(fft689, fft693, _mm512_shuffle_f32x4(fft689, fft689, 78));
__m512 fft783 = _mm512_fmadd_ps(fft777, fft693, _mm512_shuffle_f32x4(fft777, fft777, 78));
__m512 fft697 = _mm512_fmadd_ps(fft690, fft693, _mm512_shuffle_f32x4(fft690, fft690, 78));
__m512 fft784 = _mm512_fmadd_ps(fft778, fft693, _mm512_shuffle_f32x4(fft778, fft778, 78));
__m512 fft698 = _mm512_fmadd_ps(fft682, fft693, _mm512_shuffle_f32x4(fft682, fft682, 78));
__m512 fft785 = _mm512_fmadd_ps(fft770, fft693, _mm512_shuffle_f32x4(fft770, fft770, 78));
__m512 fft699 = _mm512_fmadd_ps(fft684, fft693, _mm512_shuffle_f32x4(fft684, fft684, 78));
__m512 fft786 = _mm512_fmadd_ps(fft772, fft693, _mm512_shuffle_f32x4(fft772, fft772, 78));
__m512 fft700 = _mm512_fmadd_ps(fft691, fft693, _mm512_shuffle_f32x4(fft691, fft691, 78));
__m512 fft787 = _mm512_fmadd_ps(fft779, fft693, _mm512_shuffle_f32x4(fft779, fft779, 78));
__m512 fft701 = _mm512_fmadd_ps(fft692, fft693, _mm512_shuffle_f32x4(fft692, fft692, 78));
__m512 fft788 = _mm512_fmadd_ps(fft780, fft693, _mm512_shuffle_f32x4(fft780, fft780, 78));
__m512 fft702 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft703 = _mm512_mul_ps(fft694, fft702);
__m512 fft789 = _mm512_mul_ps(fft781, fft702);
__m512 fft704 = _mm512_mul_ps(fft695, fft702);
__m512 fft790 = _mm512_mul_ps(fft782, fft702);
__m512 fft705 = _mm512_mul_ps(fft696, fft702);
__m512 fft791 = _mm512_mul_ps(fft783, fft702);
__m512 fft706 = _mm512_mul_ps(fft697, fft702);
__m512 fft792 = _mm512_mul_ps(fft784, fft702);
__m512 fft707 = _mm512_mul_ps(fft698, fft702);
__m512 fft793 = _mm512_mul_ps(fft785, fft702);
__m512 fft708 = _mm512_mul_ps(fft699, fft702);
__m512 fft794 = _mm512_mul_ps(fft786, fft702);
__m512 fft709 = _mm512_mul_ps(fft700, fft702);
__m512 fft795 = _mm512_mul_ps(fft787, fft702);
__m512 fft710 = _mm512_mul_ps(fft701, fft702);
__m512 fft796 = _mm512_mul_ps(fft788, fft702);
__m512 fft711 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft712 = _mm512_fmadd_ps(fft695, fft711, fft703);
__m512 fft797 = _mm512_fmadd_ps(fft782, fft711, fft789);
__m512 fft713 = _mm512_fnmadd_ps(fft694, fft711, fft704);
__m512 fft798 = _mm512_fnmadd_ps(fft781, fft711, fft790);
__m512 fft714 = _mm512_fmadd_ps(fft697, fft711, fft705);
__m512 fft799 = _mm512_fmadd_ps(fft784, fft711, fft791);
__m512 fft715 = _mm512_fnmadd_ps(fft696, fft711, fft706);
__m512 fft800 = _mm512_fnmadd_ps(fft783, fft711, fft792);
__m512 fft716 = _mm512_fmadd_ps(fft699, fft711, fft707);
__m512 fft801 = _mm512_fmadd_ps(fft786, fft711, fft793);
__m512 fft717 = _mm512_fnmadd_ps(fft698, fft711, fft708);
__m512 fft802 = _mm512_fnmadd_ps(fft785, fft711, fft794);
__m512 fft718 = _mm512_fmadd_ps(fft701, fft711, fft709);
__m512 fft803 = _mm512_fmadd_ps(fft788, fft711, fft795);
__m512 fft719 = _mm512_fnmadd_ps(fft700, fft711, fft710);
__m512 fft804 = _mm512_fnmadd_ps(fft787, fft711, fft796);
__m512 fft720 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft721 = _mm512_fmadd_ps(fft712, fft720, _mm512_shuffle_f32x4(fft712, fft712, 177));
__m512 fft805 = _mm512_fmadd_ps(fft797, fft720, _mm512_shuffle_f32x4(fft797, fft797, 177));
__m512 fft722 = _mm512_fmadd_ps(fft713, fft720, _mm512_shuffle_f32x4(fft713, fft713, 177));
__m512 fft806 = _mm512_fmadd_ps(fft798, fft720, _mm512_shuffle_f32x4(fft798, fft798, 177));
__m512 fft723 = _mm512_fmadd_ps(fft714, fft720, _mm512_shuffle_f32x4(fft714, fft714, 177));
__m512 fft807 = _mm512_fmadd_ps(fft799, fft720, _mm512_shuffle_f32x4(fft799, fft799, 177));
__m512 fft724 = _mm512_fmadd_ps(fft715, fft720, _mm512_shuffle_f32x4(fft715, fft715, 177));
__m512 fft808 = _mm512_fmadd_ps(fft800, fft720, _mm512_shuffle_f32x4(fft800, fft800, 177));
__m512 fft725 = _mm512_fmadd_ps(fft716, fft720, _mm512_shuffle_f32x4(fft716, fft716, 177));
__m512 fft809 = _mm512_fmadd_ps(fft801, fft720, _mm512_shuffle_f32x4(fft801, fft801, 177));
__m512 fft726 = _mm512_fmadd_ps(fft717, fft720, _mm512_shuffle_f32x4(fft717, fft717, 177));
__m512 fft810 = _mm512_fmadd_ps(fft802, fft720, _mm512_shuffle_f32x4(fft802, fft802, 177));
__m512 fft727 = _mm512_fmadd_ps(fft718, fft720, _mm512_shuffle_f32x4(fft718, fft718, 177));
__m512 fft811 = _mm512_fmadd_ps(fft803, fft720, _mm512_shuffle_f32x4(fft803, fft803, 177));
__m512 fft728 = _mm512_fmadd_ps(fft719, fft720, _mm512_shuffle_f32x4(fft719, fft719, 177));
__m512 fft812 = _mm512_fmadd_ps(fft804, fft720, _mm512_shuffle_f32x4(fft804, fft804, 177));
__m512 fft729 = _mm512_mask_mov_ps(fft721, 49344, fft722);
__m512 fft813 = _mm512_mask_mov_ps(fft805, 49344, fft806);
__m512 fft730 = _mm512_mask_sub_ps(fft722, 49344, _mm512_setzero_ps(), fft721);
__m512 fft814 = _mm512_mask_sub_ps(fft806, 49344, _mm512_setzero_ps(), fft805);
__m512 fft731 = _mm512_mask_mov_ps(fft723, 49344, fft724);
__m512 fft815 = _mm512_mask_mov_ps(fft807, 49344, fft808);
__m512 fft732 = _mm512_mask_sub_ps(fft724, 49344, _mm512_setzero_ps(), fft723);
__m512 fft816 = _mm512_mask_sub_ps(fft808, 49344, _mm512_setzero_ps(), fft807);
__m512 fft733 = _mm512_mask_mov_ps(fft725, 49344, fft726);
__m512 fft817 = _mm512_mask_mov_ps(fft809, 49344, fft810);
__m512 fft734 = _mm512_mask_sub_ps(fft726, 49344, _mm512_setzero_ps(), fft725);
__m512 fft818 = _mm512_mask_sub_ps(fft810, 49344, _mm512_setzero_ps(), fft809);
__m512 fft735 = _mm512_mask_mov_ps(fft727, 49344, fft728);
__m512 fft819 = _mm512_mask_mov_ps(fft811, 49344, fft812);
__m512 fft736 = _mm512_mask_sub_ps(fft728, 49344, _mm512_setzero_ps(), fft727);
__m512 fft820 = _mm512_mask_sub_ps(fft812, 49344, _mm512_setzero_ps(), fft811);
__m512 fft737 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft738 = _mm512_fmadd_ps(fft729, fft737, _mm512_shuffle_ps(fft729, fft729, 78));
__m512 fft821 = _mm512_fmadd_ps(fft813, fft737, _mm512_shuffle_ps(fft813, fft813, 78));
__m512 fft739 = _mm512_fmadd_ps(fft730, fft737, _mm512_shuffle_ps(fft730, fft730, 78));
__m512 fft822 = _mm512_fmadd_ps(fft814, fft737, _mm512_shuffle_ps(fft814, fft814, 78));
__m512 fft740 = _mm512_fmadd_ps(fft731, fft737, _mm512_shuffle_ps(fft731, fft731, 78));
__m512 fft823 = _mm512_fmadd_ps(fft815, fft737, _mm512_shuffle_ps(fft815, fft815, 78));
__m512 fft741 = _mm512_fmadd_ps(fft732, fft737, _mm512_shuffle_ps(fft732, fft732, 78));
__m512 fft824 = _mm512_fmadd_ps(fft816, fft737, _mm512_shuffle_ps(fft816, fft816, 78));
__m512 fft742 = _mm512_fmadd_ps(fft733, fft737, _mm512_shuffle_ps(fft733, fft733, 78));
__m512 fft825 = _mm512_fmadd_ps(fft817, fft737, _mm512_shuffle_ps(fft817, fft817, 78));
__m512 fft743 = _mm512_fmadd_ps(fft734, fft737, _mm512_shuffle_ps(fft734, fft734, 78));
__m512 fft826 = _mm512_fmadd_ps(fft818, fft737, _mm512_shuffle_ps(fft818, fft818, 78));
__m512 fft744 = _mm512_fmadd_ps(fft735, fft737, _mm512_shuffle_ps(fft735, fft735, 78));
__m512 fft827 = _mm512_fmadd_ps(fft819, fft737, _mm512_shuffle_ps(fft819, fft819, 78));
__m512 fft745 = _mm512_fmadd_ps(fft736, fft737, _mm512_shuffle_ps(fft736, fft736, 78));
__m512 fft828 = _mm512_fmadd_ps(fft820, fft737, _mm512_shuffle_ps(fft820, fft820, 78));
__m512i fft746 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft747 = _mm512_permutexvar_ps(fft746, fft738);
__m512 fft829 = _mm512_permutexvar_ps(fft746, fft821);
__m512i fft748 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft749 = _mm512_permutexvar_ps(fft748, fft738);
__m512 fft830 = _mm512_permutexvar_ps(fft748, fft821);
__m512 fft750 = _mm512_permutexvar_ps(fft746, fft739);
__m512 fft831 = _mm512_permutexvar_ps(fft746, fft822);
__m512 fft751 = _mm512_permutexvar_ps(fft748, fft739);
__m512 fft832 = _mm512_permutexvar_ps(fft748, fft822);
__m512 fft752 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft753 = _mm512_fmadd_ps(fft747, fft752, fft749);
__m512 fft833 = _mm512_fmadd_ps(fft829, fft752, fft830);
__m512 fft754 = _mm512_fnmadd_ps(fft751, fft752, fft750);
__m512 fft834 = _mm512_fnmadd_ps(fft832, fft752, fft831);
__m512 fft755 = _mm512_mask_mov_ps(fft751, 21845, fft753);
__m512 fft835 = _mm512_mask_mov_ps(fft832, 21845, fft833);
__m512 fft756 = _mm512_mask_mov_ps(fft747, 43176, fft753);
__m512 fft836 = _mm512_mask_mov_ps(fft829, 43176, fft833);
__m512 fft757 = _mm512_mask_mov_ps(fft755, 43176, fft754);
__m512 fft837 = _mm512_mask_mov_ps(fft835, 43176, fft834);
__m512 fft758 = _mm512_mask_mov_ps(fft756, 22102, fft754);
__m512 fft838 = _mm512_mask_mov_ps(fft836, 22102, fft834);
__m512 fft759 = _mm512_mask_mul_ps(fft757, 64764, fft757, _mm512_set1_ps(5e-01f));
__m512 fft839 = _mm512_mask_mul_ps(fft837, 64764, fft837, _mm512_set1_ps(5e-01f));
__m512 fft760 = _mm512_mask_mul_ps(fft758, 64764, fft758, _mm512_set1_ps(5e-01f));
__m512 fft840 = _mm512_mask_mul_ps(fft838, 64764, fft838, _mm512_set1_ps(5e-01f));
__m512 wf65 = fft759;
__m512 wf73 = fft839;
__m512 wf66 = fft760;
__m512 wf74 = fft840;
__m512 wf67 = fft740;
__m512 wf75 = fft823;
__m512 wf68 = fft741;
__m512 wf76 = fft824;
__m512 wf69 = fft742;
__m512 wf77 = fft825;
__m512 wf70 = fft743;
__m512 wf78 = fft826;
__m512 wf71 = fft744;
__m512 wf79 = fft827;
__m512 wf72 = fft745;
__m512 wf80 = fft828;
ptrdiff_t c5 = (size_t)(0+2*j1)/4;
ptrdiff_t m5 = (size_t)(0+2*j1)%4/2;
ptrdiff_t f6 = (size_t)(0+2*j1)%2;
__m512i eo5 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf67 = _mm512_permutexvar_ps(eo5, wf67);
wf68 = _mm512_permutexvar_ps(eo5, wf68);
__m512i wfs33 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf67, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs33 = _mm512_inserti64x4(wfs33, _mm512_cvtps_ph(wf68, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep1 = _mm512_shuffle_i32x4(wfs33, wfs33, 160);
_mm512_mask_storeu_epi32(wfPtr1+145920+583680*i5+7296*c5+128*k3+64*m5+16*f6, 65535, rep1);
__m512i rep2 = _mm512_shuffle_i32x4(wfs33, wfs33, 245);
_mm512_mask_storeu_epi32(wfPtr1+3064320+583680*i5+7296*c5+128*k3+64*m5+16*f6, 65535, rep2);
wf75 = _mm512_permutexvar_ps(eo5, wf75);
wf76 = _mm512_permutexvar_ps(eo5, wf76);
__m512i wfs34 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf75, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs34 = _mm512_inserti64x4(wfs34, _mm512_cvtps_ph(wf76, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep3 = _mm512_shuffle_i32x4(wfs34, wfs34, 160);
_mm512_mask_storeu_epi32(wfPtr1+5982720+583680*i5+7296*c5+128*k3+64*m5+16*f6, 65535, rep3);
__m512i rep4 = _mm512_shuffle_i32x4(wfs34, wfs34, 245);
_mm512_mask_storeu_epi32(wfPtr1+8901120+583680*i5+7296*c5+128*k3+64*m5+16*f6, 65535, rep4);
wf69 = _mm512_permutexvar_ps(eo5, wf69);
wf70 = _mm512_permutexvar_ps(eo5, wf70);
__m512i wfs35 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf69, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs35 = _mm512_inserti64x4(wfs35, _mm512_cvtps_ph(wf70, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep5 = _mm512_shuffle_i32x4(wfs35, wfs35, 160);
_mm512_mask_storeu_epi32(wfPtr1+291840+583680*i5+7296*c5+128*k3+64*m5+16*f6, 65535, rep5);
__m512i rep6 = _mm512_shuffle_i32x4(wfs35, wfs35, 245);
_mm512_mask_storeu_epi32(wfPtr1+3210240+583680*i5+7296*c5+128*k3+64*m5+16*f6, 65535, rep6);
wf77 = _mm512_permutexvar_ps(eo5, wf77);
wf78 = _mm512_permutexvar_ps(eo5, wf78);
__m512i wfs36 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf77, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs36 = _mm512_inserti64x4(wfs36, _mm512_cvtps_ph(wf78, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep7 = _mm512_shuffle_i32x4(wfs36, wfs36, 160);
_mm512_mask_storeu_epi32(wfPtr1+6128640+583680*i5+7296*c5+128*k3+64*m5+16*f6, 65535, rep7);
__m512i rep8 = _mm512_shuffle_i32x4(wfs36, wfs36, 245);
_mm512_mask_storeu_epi32(wfPtr1+9047040+583680*i5+7296*c5+128*k3+64*m5+16*f6, 65535, rep8);
wf71 = _mm512_permutexvar_ps(eo5, wf71);
wf72 = _mm512_permutexvar_ps(eo5, wf72);
__m512i wfs37 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf71, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs37 = _mm512_inserti64x4(wfs37, _mm512_cvtps_ph(wf72, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep9 = _mm512_shuffle_i32x4(wfs37, wfs37, 160);
_mm512_mask_storeu_epi32(wfPtr1+437760+583680*i5+7296*c5+128*k3+64*m5+16*f6, 65535, rep9);
__m512i rep10 = _mm512_shuffle_i32x4(wfs37, wfs37, 245);
_mm512_mask_storeu_epi32(wfPtr1+3356160+583680*i5+7296*c5+128*k3+64*m5+16*f6, 65535, rep10);
wf79 = _mm512_permutexvar_ps(eo5, wf79);
wf80 = _mm512_permutexvar_ps(eo5, wf80);
__m512i wfs38 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf79, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs38 = _mm512_inserti64x4(wfs38, _mm512_cvtps_ph(wf80, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep11 = _mm512_shuffle_i32x4(wfs38, wfs38, 160);
_mm512_mask_storeu_epi32(wfPtr1+6274560+583680*i5+7296*c5+128*k3+64*m5+16*f6, 65535, rep11);
__m512i rep12 = _mm512_shuffle_i32x4(wfs38, wfs38, 245);
_mm512_mask_storeu_epi32(wfPtr1+9192960+583680*i5+7296*c5+128*k3+64*m5+16*f6, 65535, rep12);
__m512i wfs39 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf65, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs39 = _mm512_inserti64x4(wfs39, _mm512_cvtps_ph(wf66, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep13 = _mm512_shuffle_i32x4(wfs39, wfs39, 160);
_mm512_mask_storeu_epi32(wfPtr1+0+583680*i5+7296*c5+128*k3+64*m5+16*f6, 65535, rep13);
__m512i rep14 = _mm512_shuffle_i32x4(wfs39, wfs39, 245);
_mm512_mask_storeu_epi32(wfPtr1+2918400+583680*i5+7296*c5+128*k3+64*m5+16*f6, 65535, rep14);
__m512i wfs40 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf73, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs40 = _mm512_inserti64x4(wfs40, _mm512_cvtps_ph(wf74, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep15 = _mm512_shuffle_i32x4(wfs40, wfs40, 160);
_mm512_mask_storeu_epi32(wfPtr1+5836800+583680*i5+7296*c5+128*k3+64*m5+16*f6, 65535, rep15);
__m512i rep16 = _mm512_shuffle_i32x4(wfs40, wfs40, 245);
_mm512_mask_storeu_epi32(wfPtr1+8755200+583680*i5+7296*c5+128*k3+64*m5+16*f6, 65535, rep16);
}
__m512 bias3 = _mm512_setzero_ps();
if (!e1) {
bias3 = _mm512_maskz_loadu_ps(1, biasPtr1-0+316*i5+8*j1);
bias3 = _mm512_mul_ps(bias3, _mm512_set1_ps(6.4e+01f));
}
_mm512_mask_storeu_ps(bfPtr1-0+320*i5+8*j1, 1, bias3);
if (j1 >= jj1) return;
j1 = 40;
}
}

// Submits the filter-arrangement work to the threader.
//
// Fills in a task descriptor whose callee is
// Example9StriderArrangeFilts1Callee1, attaches the caller's tensor
// pointer array as the task's opaque payload, and passes the task to
// Example9ThreaderDo1 together with the given team.
//
// team13:   threader team forwarded unchanged to Example9ThreaderDo1.
// tensors1: array of tensor base pointers; stored in the task's any1
//           field for the callee to pick up.
static void Example9StriderArrangeFilts1(Example9ThreaderTeam1* team13, char** tensors1) {
	Example9ThreaderTask1 arrangeFilts;
	// Three-dimensional hull of extents 20 x 5 x 1.
	// NOTE(review): presumably the index space over which the threader
	// invokes the callee -- confirm against Example9ThreaderDo1.
	arrangeFilts.hull1[0] = 20;
	arrangeFilts.hull1[1] = 5;
	arrangeFilts.hull1[2] = 1;
	arrangeFilts.nd1 = 3;
	arrangeFilts.any1 = tensors1;
	arrangeFilts.callee1 = Example9StriderArrangeFilts1Callee1;
	Example9ThreaderDo1(team13, &arrangeFilts);
}

static void Example9StriderArrangeDats1Callee1(Example9ThreaderTask1* task6, int64_t* pt8) {
char** tensors4 = task6->any1;
ptrdiff_t s1 = pt8[0];
ptrdiff_t c6 = 0;
ptrdiff_t g3 = pt8[2];
ptrdiff_t e2 = 0;
char*restrict datPtr1 = tensors4[0]-120+855360*e2;
char*restrict dfPtr1 = tensors4[1]+12165120*e2;
ptrdiff_t i6 = 1*g3;
ptrdiff_t j2 = 1*c6;
ptrdiff_t rel1 = j2-0;
ptrdiff_t base1 = 0;
ptrdiff_t h1 = base1+0;
ptrdiff_t w1 = 0;
ptrdiff_t k4 = 28*s1;
ptrdiff_t kk1 = k4+(s1 < 1 ? 27 : 28);
for (; k4 <= kk1; ++k4) {
for (ptrdiff_t b3 = 0; b3 < 2; ++b3) {
ptrdiff_t m6 = (size_t)b3/2;
ptrdiff_t f7 = (size_t)b3%2;
__m512 dat1 = _mm512_maskz_loadu_ps(65535, datPtr1+120+123120*i6+2160*k4+120*h1+4*w1+40*b3);
__m512 dat2 = _mm512_maskz_loadu_ps(65535, datPtr1+240+123120*i6+2160*k4+120*h1+4*w1+40*b3);
__m512 dat3 = _mm512_maskz_loadu_ps(65535, datPtr1+360+123120*i6+2160*k4+120*h1+4*w1+40*b3);
__m512 dat4 = _mm512_maskz_loadu_ps(65535, datPtr1+480+123120*i6+2160*k4+120*h1+4*w1+40*b3);
__m512 dat5 = _mm512_maskz_loadu_ps(65535, datPtr1+600+123120*i6+2160*k4+120*h1+4*w1+40*b3);
__m512 dat6 = _mm512_maskz_loadu_ps(65535, datPtr1+720+123120*i6+2160*k4+120*h1+4*w1+40*b3);
__m512 dat7 = _mm512_maskz_loadu_ps(65535, datPtr1+840+123120*i6+2160*k4+120*h1+4*w1+40*b3);
__m512 dat8 = _mm512_maskz_loadu_ps(65535, datPtr1+960+123120*i6+2160*k4+120*h1+4*w1+40*b3);
__m512 dat9 = _mm512_maskz_loadu_ps(65535, datPtr1+1080+123120*i6+2160*k4+120*h1+4*w1+40*b3);
__m512 dat10 = _mm512_maskz_loadu_ps(65535, datPtr1+1200+123120*i6+2160*k4+120*h1+4*w1+40*b3);
__m512 dat11 = _mm512_maskz_loadu_ps(65535, datPtr1+1320+123120*i6+2160*k4+120*h1+4*w1+40*b3);
__m512 dat12 = _mm512_maskz_loadu_ps(65535, datPtr1+1440+123120*i6+2160*k4+120*h1+4*w1+40*b3);
__m512 dat13 = _mm512_maskz_loadu_ps(65535, datPtr1+1560+123120*i6+2160*k4+120*h1+4*w1+40*b3);
__m512 dat14 = _mm512_maskz_loadu_ps(65535, datPtr1+1680+123120*i6+2160*k4+120*h1+4*w1+40*b3);
__m512 dat15 = _mm512_maskz_loadu_ps(65535, datPtr1+1800+123120*i6+2160*k4+120*h1+4*w1+40*b3);
__m512 fft841 = _mm512_add_ps(_mm512_setzero_ps(), dat8);
__m512 fft929 = _mm512_add_ps(dat1, dat9);
__m512 fft842 = _mm512_sub_ps(_mm512_setzero_ps(), dat8);
__m512 fft930 = _mm512_sub_ps(dat1, dat9);
__m512 fft843 = _mm512_add_ps(dat2, dat10);
__m512 fft931 = _mm512_add_ps(dat3, dat11);
__m512 fft844 = _mm512_sub_ps(dat2, dat10);
__m512 fft932 = _mm512_sub_ps(dat3, dat11);
__m512 fft845 = _mm512_add_ps(dat4, dat12);
__m512 fft933 = _mm512_add_ps(dat5, dat13);
__m512 fft846 = _mm512_sub_ps(dat4, dat12);
__m512 fft934 = _mm512_sub_ps(dat5, dat13);
__m512 fft847 = _mm512_add_ps(dat6, dat14);
__m512 fft935 = _mm512_add_ps(dat7, dat15);
__m512 fft848 = _mm512_sub_ps(dat6, dat14);
__m512 fft936 = _mm512_sub_ps(dat7, dat15);
__m512 fft849 = _mm512_add_ps(fft841, fft845);
__m512 fft937 = _mm512_add_ps(fft929, fft933);
__m512 fft850 = _mm512_sub_ps(fft841, fft845);
__m512 fft938 = _mm512_sub_ps(fft929, fft933);
__m512 fft851 = _mm512_add_ps(fft843, fft847);
__m512 fft939 = _mm512_add_ps(fft931, fft935);
__m512 fft852 = _mm512_sub_ps(fft847, fft843);
__m512 fft940 = _mm512_sub_ps(fft935, fft931);
__m512 fft853 = _mm512_sub_ps(fft844, fft848);
__m512 fft941 = _mm512_sub_ps(fft932, fft936);
__m512 fft854 = _mm512_add_ps(fft844, fft848);
__m512 fft942 = _mm512_add_ps(fft932, fft936);
__m512 fft855 = _mm512_add_ps(fft849, fft851);
__m512 fft943 = _mm512_add_ps(fft937, fft939);
__m512 fft856 = _mm512_sub_ps(fft849, fft851);
__m512 fft944 = _mm512_sub_ps(fft937, fft939);
__m512 fft857 = _mm512_fmadd_ps(fft853, _mm512_set1_ps(7.0710677e-01f), fft842);
__m512 fft945 = _mm512_fmadd_ps(fft941, _mm512_set1_ps(7.0710677e-01f), fft930);
__m512 fft858 = _mm512_fnmsub_ps(fft854, _mm512_set1_ps(7.0710677e-01f), fft846);
__m512 fft946 = _mm512_fnmsub_ps(fft942, _mm512_set1_ps(7.0710677e-01f), fft934);
__m512 fft859 = _mm512_fnmadd_ps(fft853, _mm512_set1_ps(7.0710677e-01f), fft842);
__m512 fft947 = _mm512_fnmadd_ps(fft941, _mm512_set1_ps(7.0710677e-01f), fft930);
__m512 fft860 = _mm512_fnmadd_ps(fft854, _mm512_set1_ps(7.0710677e-01f), fft846);
__m512 fft948 = _mm512_fnmadd_ps(fft942, _mm512_set1_ps(7.0710677e-01f), fft934);
__m512 fft861 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft862 = _mm512_fmadd_ps(fft855, fft861, _mm512_shuffle_f32x4(fft855, fft855, 78));
__m512 fft949 = _mm512_fmadd_ps(fft943, fft861, _mm512_shuffle_f32x4(fft943, fft943, 78));
__m512 fft863 = _mm512_fmadd_ps(fft856, fft861, _mm512_shuffle_f32x4(fft856, fft856, 78));
__m512 fft950 = _mm512_fmadd_ps(fft944, fft861, _mm512_shuffle_f32x4(fft944, fft944, 78));
__m512 fft864 = _mm512_fmadd_ps(fft857, fft861, _mm512_shuffle_f32x4(fft857, fft857, 78));
__m512 fft951 = _mm512_fmadd_ps(fft945, fft861, _mm512_shuffle_f32x4(fft945, fft945, 78));
__m512 fft865 = _mm512_fmadd_ps(fft858, fft861, _mm512_shuffle_f32x4(fft858, fft858, 78));
__m512 fft952 = _mm512_fmadd_ps(fft946, fft861, _mm512_shuffle_f32x4(fft946, fft946, 78));
__m512 fft866 = _mm512_fmadd_ps(fft850, fft861, _mm512_shuffle_f32x4(fft850, fft850, 78));
__m512 fft953 = _mm512_fmadd_ps(fft938, fft861, _mm512_shuffle_f32x4(fft938, fft938, 78));
__m512 fft867 = _mm512_fmadd_ps(fft852, fft861, _mm512_shuffle_f32x4(fft852, fft852, 78));
__m512 fft954 = _mm512_fmadd_ps(fft940, fft861, _mm512_shuffle_f32x4(fft940, fft940, 78));
__m512 fft868 = _mm512_fmadd_ps(fft859, fft861, _mm512_shuffle_f32x4(fft859, fft859, 78));
__m512 fft955 = _mm512_fmadd_ps(fft947, fft861, _mm512_shuffle_f32x4(fft947, fft947, 78));
__m512 fft869 = _mm512_fmadd_ps(fft860, fft861, _mm512_shuffle_f32x4(fft860, fft860, 78));
__m512 fft956 = _mm512_fmadd_ps(fft948, fft861, _mm512_shuffle_f32x4(fft948, fft948, 78));
__m512 fft870 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft871 = _mm512_mul_ps(fft862, fft870);
__m512 fft957 = _mm512_mul_ps(fft949, fft870);
__m512 fft872 = _mm512_mul_ps(fft863, fft870);
__m512 fft958 = _mm512_mul_ps(fft950, fft870);
__m512 fft873 = _mm512_mul_ps(fft864, fft870);
__m512 fft959 = _mm512_mul_ps(fft951, fft870);
__m512 fft874 = _mm512_mul_ps(fft865, fft870);
__m512 fft960 = _mm512_mul_ps(fft952, fft870);
__m512 fft875 = _mm512_mul_ps(fft866, fft870);
__m512 fft961 = _mm512_mul_ps(fft953, fft870);
__m512 fft876 = _mm512_mul_ps(fft867, fft870);
__m512 fft962 = _mm512_mul_ps(fft954, fft870);
__m512 fft877 = _mm512_mul_ps(fft868, fft870);
__m512 fft963 = _mm512_mul_ps(fft955, fft870);
__m512 fft878 = _mm512_mul_ps(fft869, fft870);
__m512 fft964 = _mm512_mul_ps(fft956, fft870);
__m512 fft879 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft880 = _mm512_fmadd_ps(fft863, fft879, fft871);
__m512 fft965 = _mm512_fmadd_ps(fft950, fft879, fft957);
__m512 fft881 = _mm512_fnmadd_ps(fft862, fft879, fft872);
__m512 fft966 = _mm512_fnmadd_ps(fft949, fft879, fft958);
__m512 fft882 = _mm512_fmadd_ps(fft865, fft879, fft873);
__m512 fft967 = _mm512_fmadd_ps(fft952, fft879, fft959);
__m512 fft883 = _mm512_fnmadd_ps(fft864, fft879, fft874);
__m512 fft968 = _mm512_fnmadd_ps(fft951, fft879, fft960);
__m512 fft884 = _mm512_fmadd_ps(fft867, fft879, fft875);
__m512 fft969 = _mm512_fmadd_ps(fft954, fft879, fft961);
__m512 fft885 = _mm512_fnmadd_ps(fft866, fft879, fft876);
__m512 fft970 = _mm512_fnmadd_ps(fft953, fft879, fft962);
__m512 fft886 = _mm512_fmadd_ps(fft869, fft879, fft877);
__m512 fft971 = _mm512_fmadd_ps(fft956, fft879, fft963);
__m512 fft887 = _mm512_fnmadd_ps(fft868, fft879, fft878);
__m512 fft972 = _mm512_fnmadd_ps(fft955, fft879, fft964);
__m512 fft888 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft889 = _mm512_fmadd_ps(fft880, fft888, _mm512_shuffle_f32x4(fft880, fft880, 177));
__m512 fft973 = _mm512_fmadd_ps(fft965, fft888, _mm512_shuffle_f32x4(fft965, fft965, 177));
__m512 fft890 = _mm512_fmadd_ps(fft881, fft888, _mm512_shuffle_f32x4(fft881, fft881, 177));
__m512 fft974 = _mm512_fmadd_ps(fft966, fft888, _mm512_shuffle_f32x4(fft966, fft966, 177));
__m512 fft891 = _mm512_fmadd_ps(fft882, fft888, _mm512_shuffle_f32x4(fft882, fft882, 177));
__m512 fft975 = _mm512_fmadd_ps(fft967, fft888, _mm512_shuffle_f32x4(fft967, fft967, 177));
__m512 fft892 = _mm512_fmadd_ps(fft883, fft888, _mm512_shuffle_f32x4(fft883, fft883, 177));
__m512 fft976 = _mm512_fmadd_ps(fft968, fft888, _mm512_shuffle_f32x4(fft968, fft968, 177));
__m512 fft893 = _mm512_fmadd_ps(fft884, fft888, _mm512_shuffle_f32x4(fft884, fft884, 177));
__m512 fft977 = _mm512_fmadd_ps(fft969, fft888, _mm512_shuffle_f32x4(fft969, fft969, 177));
__m512 fft894 = _mm512_fmadd_ps(fft885, fft888, _mm512_shuffle_f32x4(fft885, fft885, 177));
__m512 fft978 = _mm512_fmadd_ps(fft970, fft888, _mm512_shuffle_f32x4(fft970, fft970, 177));
__m512 fft895 = _mm512_fmadd_ps(fft886, fft888, _mm512_shuffle_f32x4(fft886, fft886, 177));
__m512 fft979 = _mm512_fmadd_ps(fft971, fft888, _mm512_shuffle_f32x4(fft971, fft971, 177));
__m512 fft896 = _mm512_fmadd_ps(fft887, fft888, _mm512_shuffle_f32x4(fft887, fft887, 177));
__m512 fft980 = _mm512_fmadd_ps(fft972, fft888, _mm512_shuffle_f32x4(fft972, fft972, 177));
__m512 fft897 = _mm512_mask_mov_ps(fft889, 49344, fft890);
__m512 fft981 = _mm512_mask_mov_ps(fft973, 49344, fft974);
__m512 fft898 = _mm512_mask_sub_ps(fft890, 49344, _mm512_setzero_ps(), fft889);
__m512 fft982 = _mm512_mask_sub_ps(fft974, 49344, _mm512_setzero_ps(), fft973);
__m512 fft899 = _mm512_mask_mov_ps(fft891, 49344, fft892);
__m512 fft983 = _mm512_mask_mov_ps(fft975, 49344, fft976);
__m512 fft900 = _mm512_mask_sub_ps(fft892, 49344, _mm512_setzero_ps(), fft891);
__m512 fft984 = _mm512_mask_sub_ps(fft976, 49344, _mm512_setzero_ps(), fft975);
__m512 fft901 = _mm512_mask_mov_ps(fft893, 49344, fft894);
__m512 fft985 = _mm512_mask_mov_ps(fft977, 49344, fft978);
__m512 fft902 = _mm512_mask_sub_ps(fft894, 49344, _mm512_setzero_ps(), fft893);
__m512 fft986 = _mm512_mask_sub_ps(fft978, 49344, _mm512_setzero_ps(), fft977);
__m512 fft903 = _mm512_mask_mov_ps(fft895, 49344, fft896);
__m512 fft987 = _mm512_mask_mov_ps(fft979, 49344, fft980);
__m512 fft904 = _mm512_mask_sub_ps(fft896, 49344, _mm512_setzero_ps(), fft895);
__m512 fft988 = _mm512_mask_sub_ps(fft980, 49344, _mm512_setzero_ps(), fft979);
__m512 fft905 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft906 = _mm512_fmadd_ps(fft897, fft905, _mm512_shuffle_ps(fft897, fft897, 78));
__m512 fft989 = _mm512_fmadd_ps(fft981, fft905, _mm512_shuffle_ps(fft981, fft981, 78));
__m512 fft907 = _mm512_fmadd_ps(fft898, fft905, _mm512_shuffle_ps(fft898, fft898, 78));
__m512 fft990 = _mm512_fmadd_ps(fft982, fft905, _mm512_shuffle_ps(fft982, fft982, 78));
__m512 fft908 = _mm512_fmadd_ps(fft899, fft905, _mm512_shuffle_ps(fft899, fft899, 78));
__m512 fft991 = _mm512_fmadd_ps(fft983, fft905, _mm512_shuffle_ps(fft983, fft983, 78));
__m512 fft909 = _mm512_fmadd_ps(fft900, fft905, _mm512_shuffle_ps(fft900, fft900, 78));
__m512 fft992 = _mm512_fmadd_ps(fft984, fft905, _mm512_shuffle_ps(fft984, fft984, 78));
__m512 fft910 = _mm512_fmadd_ps(fft901, fft905, _mm512_shuffle_ps(fft901, fft901, 78));
__m512 fft993 = _mm512_fmadd_ps(fft985, fft905, _mm512_shuffle_ps(fft985, fft985, 78));
__m512 fft911 = _mm512_fmadd_ps(fft902, fft905, _mm512_shuffle_ps(fft902, fft902, 78));
__m512 fft994 = _mm512_fmadd_ps(fft986, fft905, _mm512_shuffle_ps(fft986, fft986, 78));
__m512 fft912 = _mm512_fmadd_ps(fft903, fft905, _mm512_shuffle_ps(fft903, fft903, 78));
__m512 fft995 = _mm512_fmadd_ps(fft987, fft905, _mm512_shuffle_ps(fft987, fft987, 78));
__m512 fft913 = _mm512_fmadd_ps(fft904, fft905, _mm512_shuffle_ps(fft904, fft904, 78));
__m512 fft996 = _mm512_fmadd_ps(fft988, fft905, _mm512_shuffle_ps(fft988, fft988, 78));
__m512i fft914 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft915 = _mm512_permutexvar_ps(fft914, fft906);
__m512 fft997 = _mm512_permutexvar_ps(fft914, fft989);
__m512i fft916 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft917 = _mm512_permutexvar_ps(fft916, fft906);
__m512 fft998 = _mm512_permutexvar_ps(fft916, fft989);
__m512 fft918 = _mm512_permutexvar_ps(fft914, fft907);
__m512 fft999 = _mm512_permutexvar_ps(fft914, fft990);
__m512 fft919 = _mm512_permutexvar_ps(fft916, fft907);
__m512 fft1000 = _mm512_permutexvar_ps(fft916, fft990);
__m512 fft920 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft921 = _mm512_fmadd_ps(fft915, fft920, fft917);
__m512 fft1001 = _mm512_fmadd_ps(fft997, fft920, fft998);
__m512 fft922 = _mm512_fnmadd_ps(fft919, fft920, fft918);
__m512 fft1002 = _mm512_fnmadd_ps(fft1000, fft920, fft999);
__m512 fft923 = _mm512_mask_mov_ps(fft919, 21845, fft921);
__m512 fft1003 = _mm512_mask_mov_ps(fft1000, 21845, fft1001);
__m512 fft924 = _mm512_mask_mov_ps(fft915, 43176, fft921);
__m512 fft1004 = _mm512_mask_mov_ps(fft997, 43176, fft1001);
__m512 fft925 = _mm512_mask_mov_ps(fft923, 43176, fft922);
__m512 fft1005 = _mm512_mask_mov_ps(fft1003, 43176, fft1002);
__m512 fft926 = _mm512_mask_mov_ps(fft924, 22102, fft922);
__m512 fft1006 = _mm512_mask_mov_ps(fft1004, 22102, fft1002);
__m512 fft927 = _mm512_mask_mul_ps(fft925, 64764, fft925, _mm512_set1_ps(5e-01f));
__m512 fft1007 = _mm512_mask_mul_ps(fft1005, 64764, fft1005, _mm512_set1_ps(5e-01f));
__m512 fft928 = _mm512_mask_mul_ps(fft926, 64764, fft926, _mm512_set1_ps(5e-01f));
__m512 fft1008 = _mm512_mask_mul_ps(fft1006, 64764, fft1006, _mm512_set1_ps(5e-01f));
__m512 df1 = fft927;
__m512 df9 = fft1007;
__m512 df2 = fft928;
__m512 df10 = fft1008;
__m512 df3 = fft908;
__m512 df11 = fft991;
__m512 df4 = fft909;
__m512 df12 = fft992;
__m512 df5 = fft910;
__m512 df13 = fft993;
__m512 df6 = fft911;
__m512 df14 = fft994;
__m512 df7 = fft912;
__m512 df15 = fft995;
__m512 df8 = fft913;
__m512 df16 = fft996;
__m512i eo6 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df3 = _mm512_permutexvar_ps(eo6, df3);
df4 = _mm512_permutexvar_ps(eo6, df4);
_mm512_mask_storeu_ps(dfPtr1+21888+87552*i6+21888*j2+384*k4+128*m6+32*f7, 255, df3);
_mm512_mask_storeu_ps(dfPtr1+21952+87552*i6+21888*j2+384*k4+128*m6+32*f7, 255, df4);
_mm512_mask_storeu_ps(dfPtr1+459616+87552*i6+21888*j2+384*k4+128*m6+32*f7, 65280, df3);
_mm512_mask_storeu_ps(dfPtr1+459680+87552*i6+21888*j2+384*k4+128*m6+32*f7, 65280, df4);
df11 = _mm512_permutexvar_ps(eo6, df11);
df12 = _mm512_permutexvar_ps(eo6, df12);
_mm512_mask_storeu_ps(dfPtr1+897408+87552*i6+21888*j2+384*k4+128*m6+32*f7, 255, df11);
_mm512_mask_storeu_ps(dfPtr1+897472+87552*i6+21888*j2+384*k4+128*m6+32*f7, 255, df12);
_mm512_mask_storeu_ps(dfPtr1+1335136+87552*i6+21888*j2+384*k4+128*m6+32*f7, 65280, df11);
_mm512_mask_storeu_ps(dfPtr1+1335200+87552*i6+21888*j2+384*k4+128*m6+32*f7, 65280, df12);
df5 = _mm512_permutexvar_ps(eo6, df5);
df6 = _mm512_permutexvar_ps(eo6, df6);
_mm512_mask_storeu_ps(dfPtr1+43776+87552*i6+21888*j2+384*k4+128*m6+32*f7, 255, df5);
_mm512_mask_storeu_ps(dfPtr1+43840+87552*i6+21888*j2+384*k4+128*m6+32*f7, 255, df6);
_mm512_mask_storeu_ps(dfPtr1+481504+87552*i6+21888*j2+384*k4+128*m6+32*f7, 65280, df5);
_mm512_mask_storeu_ps(dfPtr1+481568+87552*i6+21888*j2+384*k4+128*m6+32*f7, 65280, df6);
df13 = _mm512_permutexvar_ps(eo6, df13);
df14 = _mm512_permutexvar_ps(eo6, df14);
_mm512_mask_storeu_ps(dfPtr1+919296+87552*i6+21888*j2+384*k4+128*m6+32*f7, 255, df13);
_mm512_mask_storeu_ps(dfPtr1+919360+87552*i6+21888*j2+384*k4+128*m6+32*f7, 255, df14);
_mm512_mask_storeu_ps(dfPtr1+1357024+87552*i6+21888*j2+384*k4+128*m6+32*f7, 65280, df13);
_mm512_mask_storeu_ps(dfPtr1+1357088+87552*i6+21888*j2+384*k4+128*m6+32*f7, 65280, df14);
df7 = _mm512_permutexvar_ps(eo6, df7);
df8 = _mm512_permutexvar_ps(eo6, df8);
_mm512_mask_storeu_ps(dfPtr1+65664+87552*i6+21888*j2+384*k4+128*m6+32*f7, 255, df7);
_mm512_mask_storeu_ps(dfPtr1+65728+87552*i6+21888*j2+384*k4+128*m6+32*f7, 255, df8);
_mm512_mask_storeu_ps(dfPtr1+503392+87552*i6+21888*j2+384*k4+128*m6+32*f7, 65280, df7);
_mm512_mask_storeu_ps(dfPtr1+503456+87552*i6+21888*j2+384*k4+128*m6+32*f7, 65280, df8);
df15 = _mm512_permutexvar_ps(eo6, df15);
df16 = _mm512_permutexvar_ps(eo6, df16);
_mm512_mask_storeu_ps(dfPtr1+941184+87552*i6+21888*j2+384*k4+128*m6+32*f7, 255, df15);
_mm512_mask_storeu_ps(dfPtr1+941248+87552*i6+21888*j2+384*k4+128*m6+32*f7, 255, df16);
_mm512_mask_storeu_ps(dfPtr1+1378912+87552*i6+21888*j2+384*k4+128*m6+32*f7, 65280, df15);
_mm512_mask_storeu_ps(dfPtr1+1378976+87552*i6+21888*j2+384*k4+128*m6+32*f7, 65280, df16);
_mm512_mask_storeu_ps(dfPtr1+0+87552*i6+21888*j2+384*k4+128*m6+32*f7, 255, df1);
_mm512_mask_storeu_ps(dfPtr1+64+87552*i6+21888*j2+384*k4+128*m6+32*f7, 255, df2);
_mm512_mask_storeu_ps(dfPtr1+437728+87552*i6+21888*j2+384*k4+128*m6+32*f7, 65280, df1);
_mm512_mask_storeu_ps(dfPtr1+437792+87552*i6+21888*j2+384*k4+128*m6+32*f7, 65280, df2);
_mm512_mask_storeu_ps(dfPtr1+875520+87552*i6+21888*j2+384*k4+128*m6+32*f7, 255, df9);
_mm512_mask_storeu_ps(dfPtr1+875584+87552*i6+21888*j2+384*k4+128*m6+32*f7, 255, df10);
_mm512_mask_storeu_ps(dfPtr1+1313248+87552*i6+21888*j2+384*k4+128*m6+32*f7, 65280, df9);
_mm512_mask_storeu_ps(dfPtr1+1313312+87552*i6+21888*j2+384*k4+128*m6+32*f7, 65280, df10);
}
ptrdiff_t b4 = 2;
ptrdiff_t m7 = (size_t)b4/2;
ptrdiff_t f8 = (size_t)b4%2;
__m512 dat16 = _mm512_maskz_loadu_ps(1023, datPtr1+200+123120*i6+2160*k4+120*h1+4*w1+0*b4);
__m512 dat17 = _mm512_maskz_loadu_ps(1023, datPtr1+320+123120*i6+2160*k4+120*h1+4*w1+0*b4);
__m512 dat18 = _mm512_maskz_loadu_ps(1023, datPtr1+440+123120*i6+2160*k4+120*h1+4*w1+0*b4);
__m512 dat19 = _mm512_maskz_loadu_ps(1023, datPtr1+560+123120*i6+2160*k4+120*h1+4*w1+0*b4);
__m512 dat20 = _mm512_maskz_loadu_ps(1023, datPtr1+680+123120*i6+2160*k4+120*h1+4*w1+0*b4);
__m512 dat21 = _mm512_maskz_loadu_ps(1023, datPtr1+800+123120*i6+2160*k4+120*h1+4*w1+0*b4);
__m512 dat22 = _mm512_maskz_loadu_ps(1023, datPtr1+920+123120*i6+2160*k4+120*h1+4*w1+0*b4);
__m512 dat23 = _mm512_maskz_loadu_ps(1023, datPtr1+1040+123120*i6+2160*k4+120*h1+4*w1+0*b4);
__m512 dat24 = _mm512_maskz_loadu_ps(1023, datPtr1+1160+123120*i6+2160*k4+120*h1+4*w1+0*b4);
__m512 dat25 = _mm512_maskz_loadu_ps(1023, datPtr1+1280+123120*i6+2160*k4+120*h1+4*w1+0*b4);
__m512 dat26 = _mm512_maskz_loadu_ps(1023, datPtr1+1400+123120*i6+2160*k4+120*h1+4*w1+0*b4);
__m512 dat27 = _mm512_maskz_loadu_ps(1023, datPtr1+1520+123120*i6+2160*k4+120*h1+4*w1+0*b4);
__m512 dat28 = _mm512_maskz_loadu_ps(1023, datPtr1+1640+123120*i6+2160*k4+120*h1+4*w1+0*b4);
__m512 dat29 = _mm512_maskz_loadu_ps(1023, datPtr1+1760+123120*i6+2160*k4+120*h1+4*w1+0*b4);
__m512 dat30 = _mm512_maskz_loadu_ps(1023, datPtr1+1880+123120*i6+2160*k4+120*h1+4*w1+0*b4);
__m512 fft1009 = _mm512_add_ps(_mm512_setzero_ps(), dat23);
__m512 fft1097 = _mm512_add_ps(dat16, dat24);
__m512 fft1010 = _mm512_sub_ps(_mm512_setzero_ps(), dat23);
__m512 fft1098 = _mm512_sub_ps(dat16, dat24);
__m512 fft1011 = _mm512_add_ps(dat17, dat25);
__m512 fft1099 = _mm512_add_ps(dat18, dat26);
__m512 fft1012 = _mm512_sub_ps(dat17, dat25);
__m512 fft1100 = _mm512_sub_ps(dat18, dat26);
__m512 fft1013 = _mm512_add_ps(dat19, dat27);
__m512 fft1101 = _mm512_add_ps(dat20, dat28);
__m512 fft1014 = _mm512_sub_ps(dat19, dat27);
__m512 fft1102 = _mm512_sub_ps(dat20, dat28);
__m512 fft1015 = _mm512_add_ps(dat21, dat29);
__m512 fft1103 = _mm512_add_ps(dat22, dat30);
__m512 fft1016 = _mm512_sub_ps(dat21, dat29);
__m512 fft1104 = _mm512_sub_ps(dat22, dat30);
__m512 fft1017 = _mm512_add_ps(fft1009, fft1013);
__m512 fft1105 = _mm512_add_ps(fft1097, fft1101);
__m512 fft1018 = _mm512_sub_ps(fft1009, fft1013);
__m512 fft1106 = _mm512_sub_ps(fft1097, fft1101);
__m512 fft1019 = _mm512_add_ps(fft1011, fft1015);
__m512 fft1107 = _mm512_add_ps(fft1099, fft1103);
__m512 fft1020 = _mm512_sub_ps(fft1015, fft1011);
__m512 fft1108 = _mm512_sub_ps(fft1103, fft1099);
__m512 fft1021 = _mm512_sub_ps(fft1012, fft1016);
__m512 fft1109 = _mm512_sub_ps(fft1100, fft1104);
__m512 fft1022 = _mm512_add_ps(fft1012, fft1016);
__m512 fft1110 = _mm512_add_ps(fft1100, fft1104);
__m512 fft1023 = _mm512_add_ps(fft1017, fft1019);
__m512 fft1111 = _mm512_add_ps(fft1105, fft1107);
__m512 fft1024 = _mm512_sub_ps(fft1017, fft1019);
__m512 fft1112 = _mm512_sub_ps(fft1105, fft1107);
__m512 fft1025 = _mm512_fmadd_ps(fft1021, _mm512_set1_ps(7.0710677e-01f), fft1010);
__m512 fft1113 = _mm512_fmadd_ps(fft1109, _mm512_set1_ps(7.0710677e-01f), fft1098);
__m512 fft1026 = _mm512_fnmsub_ps(fft1022, _mm512_set1_ps(7.0710677e-01f), fft1014);
__m512 fft1114 = _mm512_fnmsub_ps(fft1110, _mm512_set1_ps(7.0710677e-01f), fft1102);
__m512 fft1027 = _mm512_fnmadd_ps(fft1021, _mm512_set1_ps(7.0710677e-01f), fft1010);
__m512 fft1115 = _mm512_fnmadd_ps(fft1109, _mm512_set1_ps(7.0710677e-01f), fft1098);
__m512 fft1028 = _mm512_fnmadd_ps(fft1022, _mm512_set1_ps(7.0710677e-01f), fft1014);
__m512 fft1116 = _mm512_fnmadd_ps(fft1110, _mm512_set1_ps(7.0710677e-01f), fft1102);
__m512 fft1029 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1030 = _mm512_fmadd_ps(fft1023, fft1029, _mm512_shuffle_f32x4(fft1023, fft1023, 78));
__m512 fft1117 = _mm512_fmadd_ps(fft1111, fft1029, _mm512_shuffle_f32x4(fft1111, fft1111, 78));
__m512 fft1031 = _mm512_fmadd_ps(fft1024, fft1029, _mm512_shuffle_f32x4(fft1024, fft1024, 78));
__m512 fft1118 = _mm512_fmadd_ps(fft1112, fft1029, _mm512_shuffle_f32x4(fft1112, fft1112, 78));
__m512 fft1032 = _mm512_fmadd_ps(fft1025, fft1029, _mm512_shuffle_f32x4(fft1025, fft1025, 78));
__m512 fft1119 = _mm512_fmadd_ps(fft1113, fft1029, _mm512_shuffle_f32x4(fft1113, fft1113, 78));
__m512 fft1033 = _mm512_fmadd_ps(fft1026, fft1029, _mm512_shuffle_f32x4(fft1026, fft1026, 78));
__m512 fft1120 = _mm512_fmadd_ps(fft1114, fft1029, _mm512_shuffle_f32x4(fft1114, fft1114, 78));
__m512 fft1034 = _mm512_fmadd_ps(fft1018, fft1029, _mm512_shuffle_f32x4(fft1018, fft1018, 78));
__m512 fft1121 = _mm512_fmadd_ps(fft1106, fft1029, _mm512_shuffle_f32x4(fft1106, fft1106, 78));
__m512 fft1035 = _mm512_fmadd_ps(fft1020, fft1029, _mm512_shuffle_f32x4(fft1020, fft1020, 78));
__m512 fft1122 = _mm512_fmadd_ps(fft1108, fft1029, _mm512_shuffle_f32x4(fft1108, fft1108, 78));
__m512 fft1036 = _mm512_fmadd_ps(fft1027, fft1029, _mm512_shuffle_f32x4(fft1027, fft1027, 78));
__m512 fft1123 = _mm512_fmadd_ps(fft1115, fft1029, _mm512_shuffle_f32x4(fft1115, fft1115, 78));
__m512 fft1037 = _mm512_fmadd_ps(fft1028, fft1029, _mm512_shuffle_f32x4(fft1028, fft1028, 78));
__m512 fft1124 = _mm512_fmadd_ps(fft1116, fft1029, _mm512_shuffle_f32x4(fft1116, fft1116, 78));
__m512 fft1038 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1039 = _mm512_mul_ps(fft1030, fft1038);
__m512 fft1125 = _mm512_mul_ps(fft1117, fft1038);
__m512 fft1040 = _mm512_mul_ps(fft1031, fft1038);
__m512 fft1126 = _mm512_mul_ps(fft1118, fft1038);
__m512 fft1041 = _mm512_mul_ps(fft1032, fft1038);
__m512 fft1127 = _mm512_mul_ps(fft1119, fft1038);
__m512 fft1042 = _mm512_mul_ps(fft1033, fft1038);
__m512 fft1128 = _mm512_mul_ps(fft1120, fft1038);
__m512 fft1043 = _mm512_mul_ps(fft1034, fft1038);
__m512 fft1129 = _mm512_mul_ps(fft1121, fft1038);
__m512 fft1044 = _mm512_mul_ps(fft1035, fft1038);
__m512 fft1130 = _mm512_mul_ps(fft1122, fft1038);
__m512 fft1045 = _mm512_mul_ps(fft1036, fft1038);
__m512 fft1131 = _mm512_mul_ps(fft1123, fft1038);
__m512 fft1046 = _mm512_mul_ps(fft1037, fft1038);
__m512 fft1132 = _mm512_mul_ps(fft1124, fft1038);
__m512 fft1047 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1048 = _mm512_fmadd_ps(fft1031, fft1047, fft1039);
__m512 fft1133 = _mm512_fmadd_ps(fft1118, fft1047, fft1125);
__m512 fft1049 = _mm512_fnmadd_ps(fft1030, fft1047, fft1040);
__m512 fft1134 = _mm512_fnmadd_ps(fft1117, fft1047, fft1126);
__m512 fft1050 = _mm512_fmadd_ps(fft1033, fft1047, fft1041);
__m512 fft1135 = _mm512_fmadd_ps(fft1120, fft1047, fft1127);
__m512 fft1051 = _mm512_fnmadd_ps(fft1032, fft1047, fft1042);
__m512 fft1136 = _mm512_fnmadd_ps(fft1119, fft1047, fft1128);
__m512 fft1052 = _mm512_fmadd_ps(fft1035, fft1047, fft1043);
__m512 fft1137 = _mm512_fmadd_ps(fft1122, fft1047, fft1129);
__m512 fft1053 = _mm512_fnmadd_ps(fft1034, fft1047, fft1044);
__m512 fft1138 = _mm512_fnmadd_ps(fft1121, fft1047, fft1130);
__m512 fft1054 = _mm512_fmadd_ps(fft1037, fft1047, fft1045);
__m512 fft1139 = _mm512_fmadd_ps(fft1124, fft1047, fft1131);
__m512 fft1055 = _mm512_fnmadd_ps(fft1036, fft1047, fft1046);
__m512 fft1140 = _mm512_fnmadd_ps(fft1123, fft1047, fft1132);
__m512 fft1056 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1057 = _mm512_fmadd_ps(fft1048, fft1056, _mm512_shuffle_f32x4(fft1048, fft1048, 177));
__m512 fft1141 = _mm512_fmadd_ps(fft1133, fft1056, _mm512_shuffle_f32x4(fft1133, fft1133, 177));
__m512 fft1058 = _mm512_fmadd_ps(fft1049, fft1056, _mm512_shuffle_f32x4(fft1049, fft1049, 177));
__m512 fft1142 = _mm512_fmadd_ps(fft1134, fft1056, _mm512_shuffle_f32x4(fft1134, fft1134, 177));
__m512 fft1059 = _mm512_fmadd_ps(fft1050, fft1056, _mm512_shuffle_f32x4(fft1050, fft1050, 177));
__m512 fft1143 = _mm512_fmadd_ps(fft1135, fft1056, _mm512_shuffle_f32x4(fft1135, fft1135, 177));
__m512 fft1060 = _mm512_fmadd_ps(fft1051, fft1056, _mm512_shuffle_f32x4(fft1051, fft1051, 177));
__m512 fft1144 = _mm512_fmadd_ps(fft1136, fft1056, _mm512_shuffle_f32x4(fft1136, fft1136, 177));
__m512 fft1061 = _mm512_fmadd_ps(fft1052, fft1056, _mm512_shuffle_f32x4(fft1052, fft1052, 177));
__m512 fft1145 = _mm512_fmadd_ps(fft1137, fft1056, _mm512_shuffle_f32x4(fft1137, fft1137, 177));
__m512 fft1062 = _mm512_fmadd_ps(fft1053, fft1056, _mm512_shuffle_f32x4(fft1053, fft1053, 177));
__m512 fft1146 = _mm512_fmadd_ps(fft1138, fft1056, _mm512_shuffle_f32x4(fft1138, fft1138, 177));
__m512 fft1063 = _mm512_fmadd_ps(fft1054, fft1056, _mm512_shuffle_f32x4(fft1054, fft1054, 177));
__m512 fft1147 = _mm512_fmadd_ps(fft1139, fft1056, _mm512_shuffle_f32x4(fft1139, fft1139, 177));
__m512 fft1064 = _mm512_fmadd_ps(fft1055, fft1056, _mm512_shuffle_f32x4(fft1055, fft1055, 177));
__m512 fft1148 = _mm512_fmadd_ps(fft1140, fft1056, _mm512_shuffle_f32x4(fft1140, fft1140, 177));
__m512 fft1065 = _mm512_mask_mov_ps(fft1057, 49344, fft1058);
__m512 fft1149 = _mm512_mask_mov_ps(fft1141, 49344, fft1142);
__m512 fft1066 = _mm512_mask_sub_ps(fft1058, 49344, _mm512_setzero_ps(), fft1057);
__m512 fft1150 = _mm512_mask_sub_ps(fft1142, 49344, _mm512_setzero_ps(), fft1141);
__m512 fft1067 = _mm512_mask_mov_ps(fft1059, 49344, fft1060);
__m512 fft1151 = _mm512_mask_mov_ps(fft1143, 49344, fft1144);
__m512 fft1068 = _mm512_mask_sub_ps(fft1060, 49344, _mm512_setzero_ps(), fft1059);
__m512 fft1152 = _mm512_mask_sub_ps(fft1144, 49344, _mm512_setzero_ps(), fft1143);
__m512 fft1069 = _mm512_mask_mov_ps(fft1061, 49344, fft1062);
__m512 fft1153 = _mm512_mask_mov_ps(fft1145, 49344, fft1146);
__m512 fft1070 = _mm512_mask_sub_ps(fft1062, 49344, _mm512_setzero_ps(), fft1061);
__m512 fft1154 = _mm512_mask_sub_ps(fft1146, 49344, _mm512_setzero_ps(), fft1145);
__m512 fft1071 = _mm512_mask_mov_ps(fft1063, 49344, fft1064);
__m512 fft1155 = _mm512_mask_mov_ps(fft1147, 49344, fft1148);
__m512 fft1072 = _mm512_mask_sub_ps(fft1064, 49344, _mm512_setzero_ps(), fft1063);
__m512 fft1156 = _mm512_mask_sub_ps(fft1148, 49344, _mm512_setzero_ps(), fft1147);
__m512 fft1073 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1074 = _mm512_fmadd_ps(fft1065, fft1073, _mm512_shuffle_ps(fft1065, fft1065, 78));
__m512 fft1157 = _mm512_fmadd_ps(fft1149, fft1073, _mm512_shuffle_ps(fft1149, fft1149, 78));
__m512 fft1075 = _mm512_fmadd_ps(fft1066, fft1073, _mm512_shuffle_ps(fft1066, fft1066, 78));
__m512 fft1158 = _mm512_fmadd_ps(fft1150, fft1073, _mm512_shuffle_ps(fft1150, fft1150, 78));
__m512 fft1076 = _mm512_fmadd_ps(fft1067, fft1073, _mm512_shuffle_ps(fft1067, fft1067, 78));
__m512 fft1159 = _mm512_fmadd_ps(fft1151, fft1073, _mm512_shuffle_ps(fft1151, fft1151, 78));
__m512 fft1077 = _mm512_fmadd_ps(fft1068, fft1073, _mm512_shuffle_ps(fft1068, fft1068, 78));
__m512 fft1160 = _mm512_fmadd_ps(fft1152, fft1073, _mm512_shuffle_ps(fft1152, fft1152, 78));
__m512 fft1078 = _mm512_fmadd_ps(fft1069, fft1073, _mm512_shuffle_ps(fft1069, fft1069, 78));
__m512 fft1161 = _mm512_fmadd_ps(fft1153, fft1073, _mm512_shuffle_ps(fft1153, fft1153, 78));
__m512 fft1079 = _mm512_fmadd_ps(fft1070, fft1073, _mm512_shuffle_ps(fft1070, fft1070, 78));
__m512 fft1162 = _mm512_fmadd_ps(fft1154, fft1073, _mm512_shuffle_ps(fft1154, fft1154, 78));
__m512 fft1080 = _mm512_fmadd_ps(fft1071, fft1073, _mm512_shuffle_ps(fft1071, fft1071, 78));
__m512 fft1163 = _mm512_fmadd_ps(fft1155, fft1073, _mm512_shuffle_ps(fft1155, fft1155, 78));
__m512 fft1081 = _mm512_fmadd_ps(fft1072, fft1073, _mm512_shuffle_ps(fft1072, fft1072, 78));
__m512 fft1164 = _mm512_fmadd_ps(fft1156, fft1073, _mm512_shuffle_ps(fft1156, fft1156, 78));
__m512i fft1082 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1083 = _mm512_permutexvar_ps(fft1082, fft1074);
__m512 fft1165 = _mm512_permutexvar_ps(fft1082, fft1157);
__m512i fft1084 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1085 = _mm512_permutexvar_ps(fft1084, fft1074);
__m512 fft1166 = _mm512_permutexvar_ps(fft1084, fft1157);
__m512 fft1086 = _mm512_permutexvar_ps(fft1082, fft1075);
__m512 fft1167 = _mm512_permutexvar_ps(fft1082, fft1158);
__m512 fft1087 = _mm512_permutexvar_ps(fft1084, fft1075);
__m512 fft1168 = _mm512_permutexvar_ps(fft1084, fft1158);
__m512 fft1088 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1089 = _mm512_fmadd_ps(fft1083, fft1088, fft1085);
__m512 fft1169 = _mm512_fmadd_ps(fft1165, fft1088, fft1166);
__m512 fft1090 = _mm512_fnmadd_ps(fft1087, fft1088, fft1086);
__m512 fft1170 = _mm512_fnmadd_ps(fft1168, fft1088, fft1167);
__m512 fft1091 = _mm512_mask_mov_ps(fft1087, 21845, fft1089);
__m512 fft1171 = _mm512_mask_mov_ps(fft1168, 21845, fft1169);
__m512 fft1092 = _mm512_mask_mov_ps(fft1083, 43176, fft1089);
__m512 fft1172 = _mm512_mask_mov_ps(fft1165, 43176, fft1169);
__m512 fft1093 = _mm512_mask_mov_ps(fft1091, 43176, fft1090);
__m512 fft1173 = _mm512_mask_mov_ps(fft1171, 43176, fft1170);
__m512 fft1094 = _mm512_mask_mov_ps(fft1092, 22102, fft1090);
__m512 fft1174 = _mm512_mask_mov_ps(fft1172, 22102, fft1170);
__m512 fft1095 = _mm512_mask_mul_ps(fft1093, 64764, fft1093, _mm512_set1_ps(5e-01f));
__m512 fft1175 = _mm512_mask_mul_ps(fft1173, 64764, fft1173, _mm512_set1_ps(5e-01f));
__m512 fft1096 = _mm512_mask_mul_ps(fft1094, 64764, fft1094, _mm512_set1_ps(5e-01f));
__m512 fft1176 = _mm512_mask_mul_ps(fft1174, 64764, fft1174, _mm512_set1_ps(5e-01f));
__m512 df17 = fft1095;
__m512 df25 = fft1175;
__m512 df18 = fft1096;
__m512 df26 = fft1176;
__m512 df19 = fft1076;
__m512 df27 = fft1159;
__m512 df20 = fft1077;
__m512 df28 = fft1160;
__m512 df21 = fft1078;
__m512 df29 = fft1161;
__m512 df22 = fft1079;
__m512 df30 = fft1162;
__m512 df23 = fft1080;
__m512 df31 = fft1163;
__m512 df24 = fft1081;
__m512 df32 = fft1164;
__m512i eo7 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df19 = _mm512_permutexvar_ps(eo7, df19);
df20 = _mm512_permutexvar_ps(eo7, df20);
_mm512_mask_storeu_ps(dfPtr1+21888+87552*i6+21888*j2+384*k4+128*m7+32*f8, 255, df19);
_mm512_mask_storeu_ps(dfPtr1+21952+87552*i6+21888*j2+384*k4+128*m7+32*f8, 255, df20);
_mm512_mask_storeu_ps(dfPtr1+459616+87552*i6+21888*j2+384*k4+128*m7+32*f8, 65280, df19);
_mm512_mask_storeu_ps(dfPtr1+459680+87552*i6+21888*j2+384*k4+128*m7+32*f8, 65280, df20);
df27 = _mm512_permutexvar_ps(eo7, df27);
df28 = _mm512_permutexvar_ps(eo7, df28);
_mm512_mask_storeu_ps(dfPtr1+897408+87552*i6+21888*j2+384*k4+128*m7+32*f8, 255, df27);
_mm512_mask_storeu_ps(dfPtr1+897472+87552*i6+21888*j2+384*k4+128*m7+32*f8, 255, df28);
_mm512_mask_storeu_ps(dfPtr1+1335136+87552*i6+21888*j2+384*k4+128*m7+32*f8, 65280, df27);
_mm512_mask_storeu_ps(dfPtr1+1335200+87552*i6+21888*j2+384*k4+128*m7+32*f8, 65280, df28);
df21 = _mm512_permutexvar_ps(eo7, df21);
df22 = _mm512_permutexvar_ps(eo7, df22);
_mm512_mask_storeu_ps(dfPtr1+43776+87552*i6+21888*j2+384*k4+128*m7+32*f8, 255, df21);
_mm512_mask_storeu_ps(dfPtr1+43840+87552*i6+21888*j2+384*k4+128*m7+32*f8, 255, df22);
_mm512_mask_storeu_ps(dfPtr1+481504+87552*i6+21888*j2+384*k4+128*m7+32*f8, 65280, df21);
_mm512_mask_storeu_ps(dfPtr1+481568+87552*i6+21888*j2+384*k4+128*m7+32*f8, 65280, df22);
df29 = _mm512_permutexvar_ps(eo7, df29);
df30 = _mm512_permutexvar_ps(eo7, df30);
_mm512_mask_storeu_ps(dfPtr1+919296+87552*i6+21888*j2+384*k4+128*m7+32*f8, 255, df29);
_mm512_mask_storeu_ps(dfPtr1+919360+87552*i6+21888*j2+384*k4+128*m7+32*f8, 255, df30);
_mm512_mask_storeu_ps(dfPtr1+1357024+87552*i6+21888*j2+384*k4+128*m7+32*f8, 65280, df29);
_mm512_mask_storeu_ps(dfPtr1+1357088+87552*i6+21888*j2+384*k4+128*m7+32*f8, 65280, df30);
df23 = _mm512_permutexvar_ps(eo7, df23);
df24 = _mm512_permutexvar_ps(eo7, df24);
_mm512_mask_storeu_ps(dfPtr1+65664+87552*i6+21888*j2+384*k4+128*m7+32*f8, 255, df23);
_mm512_mask_storeu_ps(dfPtr1+65728+87552*i6+21888*j2+384*k4+128*m7+32*f8, 255, df24);
_mm512_mask_storeu_ps(dfPtr1+503392+87552*i6+21888*j2+384*k4+128*m7+32*f8, 65280, df23);
_mm512_mask_storeu_ps(dfPtr1+503456+87552*i6+21888*j2+384*k4+128*m7+32*f8, 65280, df24);
df31 = _mm512_permutexvar_ps(eo7, df31);
df32 = _mm512_permutexvar_ps(eo7, df32);
_mm512_mask_storeu_ps(dfPtr1+941184+87552*i6+21888*j2+384*k4+128*m7+32*f8, 255, df31);
_mm512_mask_storeu_ps(dfPtr1+941248+87552*i6+21888*j2+384*k4+128*m7+32*f8, 255, df32);
_mm512_mask_storeu_ps(dfPtr1+1378912+87552*i6+21888*j2+384*k4+128*m7+32*f8, 65280, df31);
_mm512_mask_storeu_ps(dfPtr1+1378976+87552*i6+21888*j2+384*k4+128*m7+32*f8, 65280, df32);
_mm512_mask_storeu_ps(dfPtr1+0+87552*i6+21888*j2+384*k4+128*m7+32*f8, 255, df17);
_mm512_mask_storeu_ps(dfPtr1+64+87552*i6+21888*j2+384*k4+128*m7+32*f8, 255, df18);
_mm512_mask_storeu_ps(dfPtr1+437728+87552*i6+21888*j2+384*k4+128*m7+32*f8, 65280, df17);
_mm512_mask_storeu_ps(dfPtr1+437792+87552*i6+21888*j2+384*k4+128*m7+32*f8, 65280, df18);
_mm512_mask_storeu_ps(dfPtr1+875520+87552*i6+21888*j2+384*k4+128*m7+32*f8, 255, df25);
_mm512_mask_storeu_ps(dfPtr1+875584+87552*i6+21888*j2+384*k4+128*m7+32*f8, 255, df26);
_mm512_mask_storeu_ps(dfPtr1+1313248+87552*i6+21888*j2+384*k4+128*m7+32*f8, 65280, df25);
_mm512_mask_storeu_ps(dfPtr1+1313312+87552*i6+21888*j2+384*k4+128*m7+32*f8, 65280, df26);
// Processes the two blocks b5 = 3 and b5 = 4: loads nine 16-lane vectors from
// datPtr1, runs them through two independent transform chains (the fft1177..
// and fft1265.. number series, which share only the constant vectors), and
// scatters the sixteen result vectors to dfPtr1 with half-vector masked stores.
// NOTE(review): the fft* names and the 0.70710677 (= 1/sqrt(2)) twiddle suggest
// a 16-point real FFT per tile — confirm against the generator.
for (ptrdiff_t b5 = 3; b5 < 5; ++b5) {
// Split b5 into the two indices used in the store offsets (128*m8 + 32*f9).
ptrdiff_t m8 = (size_t)b5/2;
ptrdiff_t f9 = (size_t)b5%2;
// Nine full loads (mask 65535 = all 16 lanes) at offsets 1080, 1200, ..., 2040
// (stride 120), shifted by 40*b5.
// NOTE(review): strides look like byte offsets (120 = 30 floats, matching
// Width=30 in the graph config) — confirm datPtr1's element type.
__m512 dat31 = _mm512_maskz_loadu_ps(65535, datPtr1+1080+123120*i6+2160*k4+120*h1+4*w1+40*b5);
__m512 dat32 = _mm512_maskz_loadu_ps(65535, datPtr1+1200+123120*i6+2160*k4+120*h1+4*w1+40*b5);
__m512 dat33 = _mm512_maskz_loadu_ps(65535, datPtr1+1320+123120*i6+2160*k4+120*h1+4*w1+40*b5);
__m512 dat34 = _mm512_maskz_loadu_ps(65535, datPtr1+1440+123120*i6+2160*k4+120*h1+4*w1+40*b5);
__m512 dat35 = _mm512_maskz_loadu_ps(65535, datPtr1+1560+123120*i6+2160*k4+120*h1+4*w1+40*b5);
__m512 dat36 = _mm512_maskz_loadu_ps(65535, datPtr1+1680+123120*i6+2160*k4+120*h1+4*w1+40*b5);
__m512 dat37 = _mm512_maskz_loadu_ps(65535, datPtr1+1800+123120*i6+2160*k4+120*h1+4*w1+40*b5);
__m512 dat38 = _mm512_maskz_loadu_ps(65535, datPtr1+1920+123120*i6+2160*k4+120*h1+4*w1+40*b5);
__m512 dat39 = _mm512_maskz_loadu_ps(65535, datPtr1+2040+123120*i6+2160*k4+120*h1+4*w1+40*b5);
// First butterfly stage across whole vectors. Only dat31 has a real partner
// (dat39); every other input is paired with a zero vector.
// NOTE(review): presumably the missing partners are implicit zero padding of
// the transform input — confirm.
__m512 fft1177 = _mm512_add_ps(dat31, dat39);
__m512 fft1265 = _mm512_add_ps(dat32, _mm512_setzero_ps());
__m512 fft1178 = _mm512_sub_ps(dat31, dat39);
__m512 fft1266 = _mm512_sub_ps(dat32, _mm512_setzero_ps());
__m512 fft1179 = _mm512_add_ps(dat33, _mm512_setzero_ps());
__m512 fft1267 = _mm512_add_ps(dat34, _mm512_setzero_ps());
__m512 fft1180 = _mm512_sub_ps(dat33, _mm512_setzero_ps());
__m512 fft1268 = _mm512_sub_ps(dat34, _mm512_setzero_ps());
__m512 fft1181 = _mm512_add_ps(dat35, _mm512_setzero_ps());
__m512 fft1269 = _mm512_add_ps(dat36, _mm512_setzero_ps());
__m512 fft1182 = _mm512_sub_ps(dat35, _mm512_setzero_ps());
__m512 fft1270 = _mm512_sub_ps(dat36, _mm512_setzero_ps());
__m512 fft1183 = _mm512_add_ps(dat37, _mm512_setzero_ps());
__m512 fft1271 = _mm512_add_ps(dat38, _mm512_setzero_ps());
__m512 fft1184 = _mm512_sub_ps(dat37, _mm512_setzero_ps());
__m512 fft1272 = _mm512_sub_ps(dat38, _mm512_setzero_ps());
// Second and third stages: sums/differences of the first-stage results.
__m512 fft1185 = _mm512_add_ps(fft1177, fft1181);
__m512 fft1273 = _mm512_add_ps(fft1265, fft1269);
__m512 fft1186 = _mm512_sub_ps(fft1177, fft1181);
__m512 fft1274 = _mm512_sub_ps(fft1265, fft1269);
__m512 fft1187 = _mm512_add_ps(fft1179, fft1183);
__m512 fft1275 = _mm512_add_ps(fft1267, fft1271);
__m512 fft1188 = _mm512_sub_ps(fft1183, fft1179);
__m512 fft1276 = _mm512_sub_ps(fft1271, fft1267);
__m512 fft1189 = _mm512_sub_ps(fft1180, fft1184);
__m512 fft1277 = _mm512_sub_ps(fft1268, fft1272);
__m512 fft1190 = _mm512_add_ps(fft1180, fft1184);
__m512 fft1278 = _mm512_add_ps(fft1268, fft1272);
__m512 fft1191 = _mm512_add_ps(fft1185, fft1187);
__m512 fft1279 = _mm512_add_ps(fft1273, fft1275);
__m512 fft1192 = _mm512_sub_ps(fft1185, fft1187);
__m512 fft1280 = _mm512_sub_ps(fft1273, fft1275);
// Combine with the 1/sqrt(2) factor: fmadd = a*b+c, fnmsub = -(a*b)-c,
// fnmadd = -(a*b)+c.
__m512 fft1193 = _mm512_fmadd_ps(fft1189, _mm512_set1_ps(7.0710677e-01f), fft1178);
__m512 fft1281 = _mm512_fmadd_ps(fft1277, _mm512_set1_ps(7.0710677e-01f), fft1266);
__m512 fft1194 = _mm512_fnmsub_ps(fft1190, _mm512_set1_ps(7.0710677e-01f), fft1182);
__m512 fft1282 = _mm512_fnmsub_ps(fft1278, _mm512_set1_ps(7.0710677e-01f), fft1270);
__m512 fft1195 = _mm512_fnmadd_ps(fft1189, _mm512_set1_ps(7.0710677e-01f), fft1178);
__m512 fft1283 = _mm512_fnmadd_ps(fft1277, _mm512_set1_ps(7.0710677e-01f), fft1266);
__m512 fft1196 = _mm512_fnmadd_ps(fft1190, _mm512_set1_ps(7.0710677e-01f), fft1182);
__m512 fft1284 = _mm512_fnmadd_ps(fft1278, _mm512_set1_ps(7.0710677e-01f), fft1270);
// In-register butterfly between the two 256-bit halves: shuffle_f32x4 with
// imm 78 swaps the halves, and the sign vector (+1 low half, -1 high half)
// yields x+partner in lanes 0-7 and partner-x in lanes 8-15.
__m512 fft1197 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1198 = _mm512_fmadd_ps(fft1191, fft1197, _mm512_shuffle_f32x4(fft1191, fft1191, 78));
__m512 fft1285 = _mm512_fmadd_ps(fft1279, fft1197, _mm512_shuffle_f32x4(fft1279, fft1279, 78));
__m512 fft1199 = _mm512_fmadd_ps(fft1192, fft1197, _mm512_shuffle_f32x4(fft1192, fft1192, 78));
__m512 fft1286 = _mm512_fmadd_ps(fft1280, fft1197, _mm512_shuffle_f32x4(fft1280, fft1280, 78));
__m512 fft1200 = _mm512_fmadd_ps(fft1193, fft1197, _mm512_shuffle_f32x4(fft1193, fft1193, 78));
__m512 fft1287 = _mm512_fmadd_ps(fft1281, fft1197, _mm512_shuffle_f32x4(fft1281, fft1281, 78));
__m512 fft1201 = _mm512_fmadd_ps(fft1194, fft1197, _mm512_shuffle_f32x4(fft1194, fft1194, 78));
__m512 fft1288 = _mm512_fmadd_ps(fft1282, fft1197, _mm512_shuffle_f32x4(fft1282, fft1282, 78));
__m512 fft1202 = _mm512_fmadd_ps(fft1186, fft1197, _mm512_shuffle_f32x4(fft1186, fft1186, 78));
__m512 fft1289 = _mm512_fmadd_ps(fft1274, fft1197, _mm512_shuffle_f32x4(fft1274, fft1274, 78));
__m512 fft1203 = _mm512_fmadd_ps(fft1188, fft1197, _mm512_shuffle_f32x4(fft1188, fft1188, 78));
__m512 fft1290 = _mm512_fmadd_ps(fft1276, fft1197, _mm512_shuffle_f32x4(fft1276, fft1276, 78));
__m512 fft1204 = _mm512_fmadd_ps(fft1195, fft1197, _mm512_shuffle_f32x4(fft1195, fft1195, 78));
__m512 fft1291 = _mm512_fmadd_ps(fft1283, fft1197, _mm512_shuffle_f32x4(fft1283, fft1283, 78));
__m512 fft1205 = _mm512_fmadd_ps(fft1196, fft1197, _mm512_shuffle_f32x4(fft1196, fft1196, 78));
__m512 fft1292 = _mm512_fmadd_ps(fft1284, fft1197, _mm512_shuffle_f32x4(fft1284, fft1284, 78));
// Per-lane scaling by the first coefficient vector (identity in lanes 0-9,
// then 1/sqrt(2), 0, -1/sqrt(2) in pairs for lanes 10-15).
__m512 fft1206 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1207 = _mm512_mul_ps(fft1198, fft1206);
__m512 fft1293 = _mm512_mul_ps(fft1285, fft1206);
__m512 fft1208 = _mm512_mul_ps(fft1199, fft1206);
__m512 fft1294 = _mm512_mul_ps(fft1286, fft1206);
__m512 fft1209 = _mm512_mul_ps(fft1200, fft1206);
__m512 fft1295 = _mm512_mul_ps(fft1287, fft1206);
__m512 fft1210 = _mm512_mul_ps(fft1201, fft1206);
__m512 fft1296 = _mm512_mul_ps(fft1288, fft1206);
__m512 fft1211 = _mm512_mul_ps(fft1202, fft1206);
__m512 fft1297 = _mm512_mul_ps(fft1289, fft1206);
__m512 fft1212 = _mm512_mul_ps(fft1203, fft1206);
__m512 fft1298 = _mm512_mul_ps(fft1290, fft1206);
__m512 fft1213 = _mm512_mul_ps(fft1204, fft1206);
__m512 fft1299 = _mm512_mul_ps(fft1291, fft1206);
__m512 fft1214 = _mm512_mul_ps(fft1205, fft1206);
__m512 fft1300 = _mm512_mul_ps(fft1292, fft1206);
// Cross terms: each vector is combined with its pair partner scaled by the
// second coefficient vector (non-zero only in lanes 10-15), added for one
// member of the pair and subtracted for the other.
__m512 fft1215 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1216 = _mm512_fmadd_ps(fft1199, fft1215, fft1207);
__m512 fft1301 = _mm512_fmadd_ps(fft1286, fft1215, fft1293);
__m512 fft1217 = _mm512_fnmadd_ps(fft1198, fft1215, fft1208);
__m512 fft1302 = _mm512_fnmadd_ps(fft1285, fft1215, fft1294);
__m512 fft1218 = _mm512_fmadd_ps(fft1201, fft1215, fft1209);
__m512 fft1303 = _mm512_fmadd_ps(fft1288, fft1215, fft1295);
__m512 fft1219 = _mm512_fnmadd_ps(fft1200, fft1215, fft1210);
__m512 fft1304 = _mm512_fnmadd_ps(fft1287, fft1215, fft1296);
__m512 fft1220 = _mm512_fmadd_ps(fft1203, fft1215, fft1211);
__m512 fft1305 = _mm512_fmadd_ps(fft1290, fft1215, fft1297);
__m512 fft1221 = _mm512_fnmadd_ps(fft1202, fft1215, fft1212);
__m512 fft1306 = _mm512_fnmadd_ps(fft1289, fft1215, fft1298);
__m512 fft1222 = _mm512_fmadd_ps(fft1205, fft1215, fft1213);
__m512 fft1307 = _mm512_fmadd_ps(fft1292, fft1215, fft1299);
__m512 fft1223 = _mm512_fnmadd_ps(fft1204, fft1215, fft1214);
__m512 fft1308 = _mm512_fnmadd_ps(fft1291, fft1215, fft1300);
// Butterfly between adjacent 128-bit lanes: shuffle_f32x4 with imm 177 swaps
// lanes 0<->1 and 2<->3 (in units of four floats).
__m512 fft1224 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1225 = _mm512_fmadd_ps(fft1216, fft1224, _mm512_shuffle_f32x4(fft1216, fft1216, 177));
__m512 fft1309 = _mm512_fmadd_ps(fft1301, fft1224, _mm512_shuffle_f32x4(fft1301, fft1301, 177));
__m512 fft1226 = _mm512_fmadd_ps(fft1217, fft1224, _mm512_shuffle_f32x4(fft1217, fft1217, 177));
__m512 fft1310 = _mm512_fmadd_ps(fft1302, fft1224, _mm512_shuffle_f32x4(fft1302, fft1302, 177));
__m512 fft1227 = _mm512_fmadd_ps(fft1218, fft1224, _mm512_shuffle_f32x4(fft1218, fft1218, 177));
__m512 fft1311 = _mm512_fmadd_ps(fft1303, fft1224, _mm512_shuffle_f32x4(fft1303, fft1303, 177));
__m512 fft1228 = _mm512_fmadd_ps(fft1219, fft1224, _mm512_shuffle_f32x4(fft1219, fft1219, 177));
__m512 fft1312 = _mm512_fmadd_ps(fft1304, fft1224, _mm512_shuffle_f32x4(fft1304, fft1304, 177));
__m512 fft1229 = _mm512_fmadd_ps(fft1220, fft1224, _mm512_shuffle_f32x4(fft1220, fft1220, 177));
__m512 fft1313 = _mm512_fmadd_ps(fft1305, fft1224, _mm512_shuffle_f32x4(fft1305, fft1305, 177));
__m512 fft1230 = _mm512_fmadd_ps(fft1221, fft1224, _mm512_shuffle_f32x4(fft1221, fft1221, 177));
__m512 fft1314 = _mm512_fmadd_ps(fft1306, fft1224, _mm512_shuffle_f32x4(fft1306, fft1306, 177));
__m512 fft1231 = _mm512_fmadd_ps(fft1222, fft1224, _mm512_shuffle_f32x4(fft1222, fft1222, 177));
__m512 fft1315 = _mm512_fmadd_ps(fft1307, fft1224, _mm512_shuffle_f32x4(fft1307, fft1307, 177));
__m512 fft1232 = _mm512_fmadd_ps(fft1223, fft1224, _mm512_shuffle_f32x4(fft1223, fft1223, 177));
__m512 fft1316 = _mm512_fmadd_ps(fft1308, fft1224, _mm512_shuffle_f32x4(fft1308, fft1308, 177));
// Mask 49344 = 0xC0C0 selects lanes 6, 7, 14, 15. In those lanes the pair
// members are exchanged: the first output takes the partner's value, the
// second takes the negated value of the first (0 - x); other lanes pass through.
__m512 fft1233 = _mm512_mask_mov_ps(fft1225, 49344, fft1226);
__m512 fft1317 = _mm512_mask_mov_ps(fft1309, 49344, fft1310);
__m512 fft1234 = _mm512_mask_sub_ps(fft1226, 49344, _mm512_setzero_ps(), fft1225);
__m512 fft1318 = _mm512_mask_sub_ps(fft1310, 49344, _mm512_setzero_ps(), fft1309);
__m512 fft1235 = _mm512_mask_mov_ps(fft1227, 49344, fft1228);
__m512 fft1319 = _mm512_mask_mov_ps(fft1311, 49344, fft1312);
__m512 fft1236 = _mm512_mask_sub_ps(fft1228, 49344, _mm512_setzero_ps(), fft1227);
__m512 fft1320 = _mm512_mask_sub_ps(fft1312, 49344, _mm512_setzero_ps(), fft1311);
__m512 fft1237 = _mm512_mask_mov_ps(fft1229, 49344, fft1230);
__m512 fft1321 = _mm512_mask_mov_ps(fft1313, 49344, fft1314);
__m512 fft1238 = _mm512_mask_sub_ps(fft1230, 49344, _mm512_setzero_ps(), fft1229);
__m512 fft1322 = _mm512_mask_sub_ps(fft1314, 49344, _mm512_setzero_ps(), fft1313);
__m512 fft1239 = _mm512_mask_mov_ps(fft1231, 49344, fft1232);
__m512 fft1323 = _mm512_mask_mov_ps(fft1315, 49344, fft1316);
__m512 fft1240 = _mm512_mask_sub_ps(fft1232, 49344, _mm512_setzero_ps(), fft1231);
__m512 fft1324 = _mm512_mask_sub_ps(fft1316, 49344, _mm512_setzero_ps(), fft1315);
// Butterfly inside each 128-bit lane: shuffle_ps with imm 78 swaps the two
// 64-bit pairs of every lane.
__m512 fft1241 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1242 = _mm512_fmadd_ps(fft1233, fft1241, _mm512_shuffle_ps(fft1233, fft1233, 78));
__m512 fft1325 = _mm512_fmadd_ps(fft1317, fft1241, _mm512_shuffle_ps(fft1317, fft1317, 78));
__m512 fft1243 = _mm512_fmadd_ps(fft1234, fft1241, _mm512_shuffle_ps(fft1234, fft1234, 78));
__m512 fft1326 = _mm512_fmadd_ps(fft1318, fft1241, _mm512_shuffle_ps(fft1318, fft1318, 78));
__m512 fft1244 = _mm512_fmadd_ps(fft1235, fft1241, _mm512_shuffle_ps(fft1235, fft1235, 78));
__m512 fft1327 = _mm512_fmadd_ps(fft1319, fft1241, _mm512_shuffle_ps(fft1319, fft1319, 78));
__m512 fft1245 = _mm512_fmadd_ps(fft1236, fft1241, _mm512_shuffle_ps(fft1236, fft1236, 78));
__m512 fft1328 = _mm512_fmadd_ps(fft1320, fft1241, _mm512_shuffle_ps(fft1320, fft1320, 78));
__m512 fft1246 = _mm512_fmadd_ps(fft1237, fft1241, _mm512_shuffle_ps(fft1237, fft1237, 78));
__m512 fft1329 = _mm512_fmadd_ps(fft1321, fft1241, _mm512_shuffle_ps(fft1321, fft1321, 78));
__m512 fft1247 = _mm512_fmadd_ps(fft1238, fft1241, _mm512_shuffle_ps(fft1238, fft1238, 78));
__m512 fft1330 = _mm512_fmadd_ps(fft1322, fft1241, _mm512_shuffle_ps(fft1322, fft1322, 78));
__m512 fft1248 = _mm512_fmadd_ps(fft1239, fft1241, _mm512_shuffle_ps(fft1239, fft1239, 78));
__m512 fft1331 = _mm512_fmadd_ps(fft1323, fft1241, _mm512_shuffle_ps(fft1323, fft1323, 78));
__m512 fft1249 = _mm512_fmadd_ps(fft1240, fft1241, _mm512_shuffle_ps(fft1240, fft1240, 78));
__m512 fft1332 = _mm512_fmadd_ps(fft1324, fft1241, _mm512_shuffle_ps(fft1324, fft1324, 78));
// Extra post-processing applied only to the first pair of each chain
// (fft1242/fft1243 and fft1325/fft1326): two lane permutations of each vector
// are formed and then recombined below.
__m512i fft1250 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1251 = _mm512_permutexvar_ps(fft1250, fft1242);
__m512 fft1333 = _mm512_permutexvar_ps(fft1250, fft1325);
__m512i fft1252 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1253 = _mm512_permutexvar_ps(fft1252, fft1242);
__m512 fft1334 = _mm512_permutexvar_ps(fft1252, fft1325);
__m512 fft1254 = _mm512_permutexvar_ps(fft1250, fft1243);
__m512 fft1335 = _mm512_permutexvar_ps(fft1250, fft1326);
__m512 fft1255 = _mm512_permutexvar_ps(fft1252, fft1243);
__m512 fft1336 = _mm512_permutexvar_ps(fft1252, fft1326);
// Alternating +1/-1 weights; lanes 0, 1, 8 and 9 have weight 0, so there the
// fused ops reduce to a copy of the addend.
__m512 fft1256 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1257 = _mm512_fmadd_ps(fft1251, fft1256, fft1253);
__m512 fft1337 = _mm512_fmadd_ps(fft1333, fft1256, fft1334);
__m512 fft1258 = _mm512_fnmadd_ps(fft1255, fft1256, fft1254);
__m512 fft1338 = _mm512_fnmadd_ps(fft1336, fft1256, fft1335);
// Lane-wise merges: 21845 = 0x5555 (even lanes), 43176 = 0xA8A8
// (lanes 3,5,7,11,13,15), 22102 = 0x5656 (lanes 1,2,4,6,9,10,12,14).
__m512 fft1259 = _mm512_mask_mov_ps(fft1255, 21845, fft1257);
__m512 fft1339 = _mm512_mask_mov_ps(fft1336, 21845, fft1337);
__m512 fft1260 = _mm512_mask_mov_ps(fft1251, 43176, fft1257);
__m512 fft1340 = _mm512_mask_mov_ps(fft1333, 43176, fft1337);
__m512 fft1261 = _mm512_mask_mov_ps(fft1259, 43176, fft1258);
__m512 fft1341 = _mm512_mask_mov_ps(fft1339, 43176, fft1338);
__m512 fft1262 = _mm512_mask_mov_ps(fft1260, 22102, fft1258);
__m512 fft1342 = _mm512_mask_mov_ps(fft1340, 22102, fft1338);
// Halve every lane except 0, 1, 8 and 9 (mask 64764 = 0xFCFC).
__m512 fft1263 = _mm512_mask_mul_ps(fft1261, 64764, fft1261, _mm512_set1_ps(5e-01f));
__m512 fft1343 = _mm512_mask_mul_ps(fft1341, 64764, fft1341, _mm512_set1_ps(5e-01f));
__m512 fft1264 = _mm512_mask_mul_ps(fft1262, 64764, fft1262, _mm512_set1_ps(5e-01f));
__m512 fft1344 = _mm512_mask_mul_ps(fft1342, 64764, fft1342, _mm512_set1_ps(5e-01f));
// Name the sixteen outputs: df33..df40 come from the first chain, df41..df48
// from the second.
__m512 df33 = fft1263;
__m512 df41 = fft1343;
__m512 df34 = fft1264;
__m512 df42 = fft1344;
__m512 df35 = fft1244;
__m512 df43 = fft1327;
__m512 df36 = fft1245;
__m512 df44 = fft1328;
__m512 df37 = fft1246;
__m512 df45 = fft1329;
__m512 df38 = fft1247;
__m512 df46 = fft1330;
__m512 df39 = fft1248;
__m512 df47 = fft1331;
__m512 df40 = fft1249;
__m512 df48 = fft1332;
// eo8 gathers even-indexed source lanes into lanes 0-7 and odd-indexed source
// lanes into lanes 8-15. It is applied to df35..df40 and df43..df48 only; the
// first pair of each chain (df33/df34, df41/df42) is stored without it.
// Every vector is written with two masked stores: mask 255 writes lanes 0-7,
// mask 65280 writes lanes 8-15, each to a different dfPtr1 offset.
// NOTE(review): offset units depend on dfPtr1's type, which is declared
// outside this view — confirm before changing any constant.
__m512i eo8 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df35 = _mm512_permutexvar_ps(eo8, df35);
df36 = _mm512_permutexvar_ps(eo8, df36);
_mm512_mask_storeu_ps(dfPtr1+21888+87552*i6+21888*j2+384*k4+128*m8+32*f9, 255, df35);
_mm512_mask_storeu_ps(dfPtr1+21952+87552*i6+21888*j2+384*k4+128*m8+32*f9, 255, df36);
_mm512_mask_storeu_ps(dfPtr1+459616+87552*i6+21888*j2+384*k4+128*m8+32*f9, 65280, df35);
_mm512_mask_storeu_ps(dfPtr1+459680+87552*i6+21888*j2+384*k4+128*m8+32*f9, 65280, df36);
df43 = _mm512_permutexvar_ps(eo8, df43);
df44 = _mm512_permutexvar_ps(eo8, df44);
_mm512_mask_storeu_ps(dfPtr1+897408+87552*i6+21888*j2+384*k4+128*m8+32*f9, 255, df43);
_mm512_mask_storeu_ps(dfPtr1+897472+87552*i6+21888*j2+384*k4+128*m8+32*f9, 255, df44);
_mm512_mask_storeu_ps(dfPtr1+1335136+87552*i6+21888*j2+384*k4+128*m8+32*f9, 65280, df43);
_mm512_mask_storeu_ps(dfPtr1+1335200+87552*i6+21888*j2+384*k4+128*m8+32*f9, 65280, df44);
df37 = _mm512_permutexvar_ps(eo8, df37);
df38 = _mm512_permutexvar_ps(eo8, df38);
_mm512_mask_storeu_ps(dfPtr1+43776+87552*i6+21888*j2+384*k4+128*m8+32*f9, 255, df37);
_mm512_mask_storeu_ps(dfPtr1+43840+87552*i6+21888*j2+384*k4+128*m8+32*f9, 255, df38);
_mm512_mask_storeu_ps(dfPtr1+481504+87552*i6+21888*j2+384*k4+128*m8+32*f9, 65280, df37);
_mm512_mask_storeu_ps(dfPtr1+481568+87552*i6+21888*j2+384*k4+128*m8+32*f9, 65280, df38);
df45 = _mm512_permutexvar_ps(eo8, df45);
df46 = _mm512_permutexvar_ps(eo8, df46);
_mm512_mask_storeu_ps(dfPtr1+919296+87552*i6+21888*j2+384*k4+128*m8+32*f9, 255, df45);
_mm512_mask_storeu_ps(dfPtr1+919360+87552*i6+21888*j2+384*k4+128*m8+32*f9, 255, df46);
_mm512_mask_storeu_ps(dfPtr1+1357024+87552*i6+21888*j2+384*k4+128*m8+32*f9, 65280, df45);
_mm512_mask_storeu_ps(dfPtr1+1357088+87552*i6+21888*j2+384*k4+128*m8+32*f9, 65280, df46);
df39 = _mm512_permutexvar_ps(eo8, df39);
df40 = _mm512_permutexvar_ps(eo8, df40);
_mm512_mask_storeu_ps(dfPtr1+65664+87552*i6+21888*j2+384*k4+128*m8+32*f9, 255, df39);
_mm512_mask_storeu_ps(dfPtr1+65728+87552*i6+21888*j2+384*k4+128*m8+32*f9, 255, df40);
_mm512_mask_storeu_ps(dfPtr1+503392+87552*i6+21888*j2+384*k4+128*m8+32*f9, 65280, df39);
_mm512_mask_storeu_ps(dfPtr1+503456+87552*i6+21888*j2+384*k4+128*m8+32*f9, 65280, df40);
df47 = _mm512_permutexvar_ps(eo8, df47);
df48 = _mm512_permutexvar_ps(eo8, df48);
_mm512_mask_storeu_ps(dfPtr1+941184+87552*i6+21888*j2+384*k4+128*m8+32*f9, 255, df47);
_mm512_mask_storeu_ps(dfPtr1+941248+87552*i6+21888*j2+384*k4+128*m8+32*f9, 255, df48);
_mm512_mask_storeu_ps(dfPtr1+1378912+87552*i6+21888*j2+384*k4+128*m8+32*f9, 65280, df47);
_mm512_mask_storeu_ps(dfPtr1+1378976+87552*i6+21888*j2+384*k4+128*m8+32*f9, 65280, df48);
// First pair of each chain, stored in the lane order produced above.
_mm512_mask_storeu_ps(dfPtr1+0+87552*i6+21888*j2+384*k4+128*m8+32*f9, 255, df33);
_mm512_mask_storeu_ps(dfPtr1+64+87552*i6+21888*j2+384*k4+128*m8+32*f9, 255, df34);
_mm512_mask_storeu_ps(dfPtr1+437728+87552*i6+21888*j2+384*k4+128*m8+32*f9, 65280, df33);
_mm512_mask_storeu_ps(dfPtr1+437792+87552*i6+21888*j2+384*k4+128*m8+32*f9, 65280, df34);
_mm512_mask_storeu_ps(dfPtr1+875520+87552*i6+21888*j2+384*k4+128*m8+32*f9, 255, df41);
_mm512_mask_storeu_ps(dfPtr1+875584+87552*i6+21888*j2+384*k4+128*m8+32*f9, 255, df42);
_mm512_mask_storeu_ps(dfPtr1+1313248+87552*i6+21888*j2+384*k4+128*m8+32*f9, 65280, df41);
_mm512_mask_storeu_ps(dfPtr1+1313312+87552*i6+21888*j2+384*k4+128*m8+32*f9, 65280, df42);
}
ptrdiff_t b6 = 5;
ptrdiff_t m9 = (size_t)b6/2;
ptrdiff_t f10 = (size_t)b6%2;
__m512 dat40 = _mm512_maskz_loadu_ps(1023, datPtr1+1280+123120*i6+2160*k4+120*h1+4*w1+0*b6);
__m512 dat41 = _mm512_maskz_loadu_ps(1023, datPtr1+1400+123120*i6+2160*k4+120*h1+4*w1+0*b6);
__m512 dat42 = _mm512_maskz_loadu_ps(1023, datPtr1+1520+123120*i6+2160*k4+120*h1+4*w1+0*b6);
__m512 dat43 = _mm512_maskz_loadu_ps(1023, datPtr1+1640+123120*i6+2160*k4+120*h1+4*w1+0*b6);
__m512 dat44 = _mm512_maskz_loadu_ps(1023, datPtr1+1760+123120*i6+2160*k4+120*h1+4*w1+0*b6);
__m512 dat45 = _mm512_maskz_loadu_ps(1023, datPtr1+1880+123120*i6+2160*k4+120*h1+4*w1+0*b6);
__m512 dat46 = _mm512_maskz_loadu_ps(1023, datPtr1+2000+123120*i6+2160*k4+120*h1+4*w1+0*b6);
__m512 dat47 = _mm512_maskz_loadu_ps(1023, datPtr1+2120+123120*i6+2160*k4+120*h1+4*w1+0*b6);
__m512 dat48 = _mm512_maskz_loadu_ps(1023, datPtr1+2240+123120*i6+2160*k4+120*h1+4*w1+0*b6);
__m512 fft1345 = _mm512_add_ps(dat40, dat48);
__m512 fft1433 = _mm512_add_ps(dat41, _mm512_setzero_ps());
__m512 fft1346 = _mm512_sub_ps(dat40, dat48);
__m512 fft1434 = _mm512_sub_ps(dat41, _mm512_setzero_ps());
__m512 fft1347 = _mm512_add_ps(dat42, _mm512_setzero_ps());
__m512 fft1435 = _mm512_add_ps(dat43, _mm512_setzero_ps());
__m512 fft1348 = _mm512_sub_ps(dat42, _mm512_setzero_ps());
__m512 fft1436 = _mm512_sub_ps(dat43, _mm512_setzero_ps());
__m512 fft1349 = _mm512_add_ps(dat44, _mm512_setzero_ps());
__m512 fft1437 = _mm512_add_ps(dat45, _mm512_setzero_ps());
__m512 fft1350 = _mm512_sub_ps(dat44, _mm512_setzero_ps());
__m512 fft1438 = _mm512_sub_ps(dat45, _mm512_setzero_ps());
__m512 fft1351 = _mm512_add_ps(dat46, _mm512_setzero_ps());
__m512 fft1439 = _mm512_add_ps(dat47, _mm512_setzero_ps());
__m512 fft1352 = _mm512_sub_ps(dat46, _mm512_setzero_ps());
__m512 fft1440 = _mm512_sub_ps(dat47, _mm512_setzero_ps());
__m512 fft1353 = _mm512_add_ps(fft1345, fft1349);
__m512 fft1441 = _mm512_add_ps(fft1433, fft1437);
__m512 fft1354 = _mm512_sub_ps(fft1345, fft1349);
__m512 fft1442 = _mm512_sub_ps(fft1433, fft1437);
__m512 fft1355 = _mm512_add_ps(fft1347, fft1351);
__m512 fft1443 = _mm512_add_ps(fft1435, fft1439);
__m512 fft1356 = _mm512_sub_ps(fft1351, fft1347);
__m512 fft1444 = _mm512_sub_ps(fft1439, fft1435);
__m512 fft1357 = _mm512_sub_ps(fft1348, fft1352);
__m512 fft1445 = _mm512_sub_ps(fft1436, fft1440);
__m512 fft1358 = _mm512_add_ps(fft1348, fft1352);
__m512 fft1446 = _mm512_add_ps(fft1436, fft1440);
__m512 fft1359 = _mm512_add_ps(fft1353, fft1355);
__m512 fft1447 = _mm512_add_ps(fft1441, fft1443);
__m512 fft1360 = _mm512_sub_ps(fft1353, fft1355);
__m512 fft1448 = _mm512_sub_ps(fft1441, fft1443);
__m512 fft1361 = _mm512_fmadd_ps(fft1357, _mm512_set1_ps(7.0710677e-01f), fft1346);
__m512 fft1449 = _mm512_fmadd_ps(fft1445, _mm512_set1_ps(7.0710677e-01f), fft1434);
__m512 fft1362 = _mm512_fnmsub_ps(fft1358, _mm512_set1_ps(7.0710677e-01f), fft1350);
__m512 fft1450 = _mm512_fnmsub_ps(fft1446, _mm512_set1_ps(7.0710677e-01f), fft1438);
__m512 fft1363 = _mm512_fnmadd_ps(fft1357, _mm512_set1_ps(7.0710677e-01f), fft1346);
__m512 fft1451 = _mm512_fnmadd_ps(fft1445, _mm512_set1_ps(7.0710677e-01f), fft1434);
__m512 fft1364 = _mm512_fnmadd_ps(fft1358, _mm512_set1_ps(7.0710677e-01f), fft1350);
__m512 fft1452 = _mm512_fnmadd_ps(fft1446, _mm512_set1_ps(7.0710677e-01f), fft1438);
__m512 fft1365 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1366 = _mm512_fmadd_ps(fft1359, fft1365, _mm512_shuffle_f32x4(fft1359, fft1359, 78));
__m512 fft1453 = _mm512_fmadd_ps(fft1447, fft1365, _mm512_shuffle_f32x4(fft1447, fft1447, 78));
__m512 fft1367 = _mm512_fmadd_ps(fft1360, fft1365, _mm512_shuffle_f32x4(fft1360, fft1360, 78));
__m512 fft1454 = _mm512_fmadd_ps(fft1448, fft1365, _mm512_shuffle_f32x4(fft1448, fft1448, 78));
__m512 fft1368 = _mm512_fmadd_ps(fft1361, fft1365, _mm512_shuffle_f32x4(fft1361, fft1361, 78));
__m512 fft1455 = _mm512_fmadd_ps(fft1449, fft1365, _mm512_shuffle_f32x4(fft1449, fft1449, 78));
__m512 fft1369 = _mm512_fmadd_ps(fft1362, fft1365, _mm512_shuffle_f32x4(fft1362, fft1362, 78));
__m512 fft1456 = _mm512_fmadd_ps(fft1450, fft1365, _mm512_shuffle_f32x4(fft1450, fft1450, 78));
__m512 fft1370 = _mm512_fmadd_ps(fft1354, fft1365, _mm512_shuffle_f32x4(fft1354, fft1354, 78));
__m512 fft1457 = _mm512_fmadd_ps(fft1442, fft1365, _mm512_shuffle_f32x4(fft1442, fft1442, 78));
__m512 fft1371 = _mm512_fmadd_ps(fft1356, fft1365, _mm512_shuffle_f32x4(fft1356, fft1356, 78));
__m512 fft1458 = _mm512_fmadd_ps(fft1444, fft1365, _mm512_shuffle_f32x4(fft1444, fft1444, 78));
__m512 fft1372 = _mm512_fmadd_ps(fft1363, fft1365, _mm512_shuffle_f32x4(fft1363, fft1363, 78));
__m512 fft1459 = _mm512_fmadd_ps(fft1451, fft1365, _mm512_shuffle_f32x4(fft1451, fft1451, 78));
__m512 fft1373 = _mm512_fmadd_ps(fft1364, fft1365, _mm512_shuffle_f32x4(fft1364, fft1364, 78));
__m512 fft1460 = _mm512_fmadd_ps(fft1452, fft1365, _mm512_shuffle_f32x4(fft1452, fft1452, 78));
__m512 fft1374 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1375 = _mm512_mul_ps(fft1366, fft1374);
__m512 fft1461 = _mm512_mul_ps(fft1453, fft1374);
__m512 fft1376 = _mm512_mul_ps(fft1367, fft1374);
__m512 fft1462 = _mm512_mul_ps(fft1454, fft1374);
__m512 fft1377 = _mm512_mul_ps(fft1368, fft1374);
__m512 fft1463 = _mm512_mul_ps(fft1455, fft1374);
__m512 fft1378 = _mm512_mul_ps(fft1369, fft1374);
__m512 fft1464 = _mm512_mul_ps(fft1456, fft1374);
__m512 fft1379 = _mm512_mul_ps(fft1370, fft1374);
__m512 fft1465 = _mm512_mul_ps(fft1457, fft1374);
__m512 fft1380 = _mm512_mul_ps(fft1371, fft1374);
__m512 fft1466 = _mm512_mul_ps(fft1458, fft1374);
__m512 fft1381 = _mm512_mul_ps(fft1372, fft1374);
__m512 fft1467 = _mm512_mul_ps(fft1459, fft1374);
__m512 fft1382 = _mm512_mul_ps(fft1373, fft1374);
__m512 fft1468 = _mm512_mul_ps(fft1460, fft1374);
__m512 fft1383 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1384 = _mm512_fmadd_ps(fft1367, fft1383, fft1375);
__m512 fft1469 = _mm512_fmadd_ps(fft1454, fft1383, fft1461);
__m512 fft1385 = _mm512_fnmadd_ps(fft1366, fft1383, fft1376);
__m512 fft1470 = _mm512_fnmadd_ps(fft1453, fft1383, fft1462);
__m512 fft1386 = _mm512_fmadd_ps(fft1369, fft1383, fft1377);
__m512 fft1471 = _mm512_fmadd_ps(fft1456, fft1383, fft1463);
__m512 fft1387 = _mm512_fnmadd_ps(fft1368, fft1383, fft1378);
__m512 fft1472 = _mm512_fnmadd_ps(fft1455, fft1383, fft1464);
__m512 fft1388 = _mm512_fmadd_ps(fft1371, fft1383, fft1379);
__m512 fft1473 = _mm512_fmadd_ps(fft1458, fft1383, fft1465);
__m512 fft1389 = _mm512_fnmadd_ps(fft1370, fft1383, fft1380);
__m512 fft1474 = _mm512_fnmadd_ps(fft1457, fft1383, fft1466);
__m512 fft1390 = _mm512_fmadd_ps(fft1373, fft1383, fft1381);
__m512 fft1475 = _mm512_fmadd_ps(fft1460, fft1383, fft1467);
__m512 fft1391 = _mm512_fnmadd_ps(fft1372, fft1383, fft1382);
__m512 fft1476 = _mm512_fnmadd_ps(fft1459, fft1383, fft1468);
__m512 fft1392 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1393 = _mm512_fmadd_ps(fft1384, fft1392, _mm512_shuffle_f32x4(fft1384, fft1384, 177));
__m512 fft1477 = _mm512_fmadd_ps(fft1469, fft1392, _mm512_shuffle_f32x4(fft1469, fft1469, 177));
__m512 fft1394 = _mm512_fmadd_ps(fft1385, fft1392, _mm512_shuffle_f32x4(fft1385, fft1385, 177));
__m512 fft1478 = _mm512_fmadd_ps(fft1470, fft1392, _mm512_shuffle_f32x4(fft1470, fft1470, 177));
__m512 fft1395 = _mm512_fmadd_ps(fft1386, fft1392, _mm512_shuffle_f32x4(fft1386, fft1386, 177));
__m512 fft1479 = _mm512_fmadd_ps(fft1471, fft1392, _mm512_shuffle_f32x4(fft1471, fft1471, 177));
__m512 fft1396 = _mm512_fmadd_ps(fft1387, fft1392, _mm512_shuffle_f32x4(fft1387, fft1387, 177));
__m512 fft1480 = _mm512_fmadd_ps(fft1472, fft1392, _mm512_shuffle_f32x4(fft1472, fft1472, 177));
__m512 fft1397 = _mm512_fmadd_ps(fft1388, fft1392, _mm512_shuffle_f32x4(fft1388, fft1388, 177));
__m512 fft1481 = _mm512_fmadd_ps(fft1473, fft1392, _mm512_shuffle_f32x4(fft1473, fft1473, 177));
__m512 fft1398 = _mm512_fmadd_ps(fft1389, fft1392, _mm512_shuffle_f32x4(fft1389, fft1389, 177));
__m512 fft1482 = _mm512_fmadd_ps(fft1474, fft1392, _mm512_shuffle_f32x4(fft1474, fft1474, 177));
__m512 fft1399 = _mm512_fmadd_ps(fft1390, fft1392, _mm512_shuffle_f32x4(fft1390, fft1390, 177));
__m512 fft1483 = _mm512_fmadd_ps(fft1475, fft1392, _mm512_shuffle_f32x4(fft1475, fft1475, 177));
__m512 fft1400 = _mm512_fmadd_ps(fft1391, fft1392, _mm512_shuffle_f32x4(fft1391, fft1391, 177));
__m512 fft1484 = _mm512_fmadd_ps(fft1476, fft1392, _mm512_shuffle_f32x4(fft1476, fft1476, 177));
__m512 fft1401 = _mm512_mask_mov_ps(fft1393, 49344, fft1394);
__m512 fft1485 = _mm512_mask_mov_ps(fft1477, 49344, fft1478);
__m512 fft1402 = _mm512_mask_sub_ps(fft1394, 49344, _mm512_setzero_ps(), fft1393);
__m512 fft1486 = _mm512_mask_sub_ps(fft1478, 49344, _mm512_setzero_ps(), fft1477);
__m512 fft1403 = _mm512_mask_mov_ps(fft1395, 49344, fft1396);
__m512 fft1487 = _mm512_mask_mov_ps(fft1479, 49344, fft1480);
__m512 fft1404 = _mm512_mask_sub_ps(fft1396, 49344, _mm512_setzero_ps(), fft1395);
__m512 fft1488 = _mm512_mask_sub_ps(fft1480, 49344, _mm512_setzero_ps(), fft1479);
__m512 fft1405 = _mm512_mask_mov_ps(fft1397, 49344, fft1398);
__m512 fft1489 = _mm512_mask_mov_ps(fft1481, 49344, fft1482);
__m512 fft1406 = _mm512_mask_sub_ps(fft1398, 49344, _mm512_setzero_ps(), fft1397);
__m512 fft1490 = _mm512_mask_sub_ps(fft1482, 49344, _mm512_setzero_ps(), fft1481);
__m512 fft1407 = _mm512_mask_mov_ps(fft1399, 49344, fft1400);
__m512 fft1491 = _mm512_mask_mov_ps(fft1483, 49344, fft1484);
__m512 fft1408 = _mm512_mask_sub_ps(fft1400, 49344, _mm512_setzero_ps(), fft1399);
__m512 fft1492 = _mm512_mask_sub_ps(fft1484, 49344, _mm512_setzero_ps(), fft1483);
__m512 fft1409 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1410 = _mm512_fmadd_ps(fft1401, fft1409, _mm512_shuffle_ps(fft1401, fft1401, 78));
__m512 fft1493 = _mm512_fmadd_ps(fft1485, fft1409, _mm512_shuffle_ps(fft1485, fft1485, 78));
__m512 fft1411 = _mm512_fmadd_ps(fft1402, fft1409, _mm512_shuffle_ps(fft1402, fft1402, 78));
__m512 fft1494 = _mm512_fmadd_ps(fft1486, fft1409, _mm512_shuffle_ps(fft1486, fft1486, 78));
__m512 fft1412 = _mm512_fmadd_ps(fft1403, fft1409, _mm512_shuffle_ps(fft1403, fft1403, 78));
__m512 fft1495 = _mm512_fmadd_ps(fft1487, fft1409, _mm512_shuffle_ps(fft1487, fft1487, 78));
__m512 fft1413 = _mm512_fmadd_ps(fft1404, fft1409, _mm512_shuffle_ps(fft1404, fft1404, 78));
__m512 fft1496 = _mm512_fmadd_ps(fft1488, fft1409, _mm512_shuffle_ps(fft1488, fft1488, 78));
__m512 fft1414 = _mm512_fmadd_ps(fft1405, fft1409, _mm512_shuffle_ps(fft1405, fft1405, 78));
__m512 fft1497 = _mm512_fmadd_ps(fft1489, fft1409, _mm512_shuffle_ps(fft1489, fft1489, 78));
__m512 fft1415 = _mm512_fmadd_ps(fft1406, fft1409, _mm512_shuffle_ps(fft1406, fft1406, 78));
__m512 fft1498 = _mm512_fmadd_ps(fft1490, fft1409, _mm512_shuffle_ps(fft1490, fft1490, 78));
__m512 fft1416 = _mm512_fmadd_ps(fft1407, fft1409, _mm512_shuffle_ps(fft1407, fft1407, 78));
__m512 fft1499 = _mm512_fmadd_ps(fft1491, fft1409, _mm512_shuffle_ps(fft1491, fft1491, 78));
__m512 fft1417 = _mm512_fmadd_ps(fft1408, fft1409, _mm512_shuffle_ps(fft1408, fft1408, 78));
__m512 fft1500 = _mm512_fmadd_ps(fft1492, fft1409, _mm512_shuffle_ps(fft1492, fft1492, 78));
__m512i fft1418 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1419 = _mm512_permutexvar_ps(fft1418, fft1410);
__m512 fft1501 = _mm512_permutexvar_ps(fft1418, fft1493);
__m512i fft1420 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1421 = _mm512_permutexvar_ps(fft1420, fft1410);
__m512 fft1502 = _mm512_permutexvar_ps(fft1420, fft1493);
__m512 fft1422 = _mm512_permutexvar_ps(fft1418, fft1411);
__m512 fft1503 = _mm512_permutexvar_ps(fft1418, fft1494);
__m512 fft1423 = _mm512_permutexvar_ps(fft1420, fft1411);
__m512 fft1504 = _mm512_permutexvar_ps(fft1420, fft1494);
__m512 fft1424 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1425 = _mm512_fmadd_ps(fft1419, fft1424, fft1421);
__m512 fft1505 = _mm512_fmadd_ps(fft1501, fft1424, fft1502);
__m512 fft1426 = _mm512_fnmadd_ps(fft1423, fft1424, fft1422);
__m512 fft1506 = _mm512_fnmadd_ps(fft1504, fft1424, fft1503);
__m512 fft1427 = _mm512_mask_mov_ps(fft1423, 21845, fft1425);
__m512 fft1507 = _mm512_mask_mov_ps(fft1504, 21845, fft1505);
__m512 fft1428 = _mm512_mask_mov_ps(fft1419, 43176, fft1425);
__m512 fft1508 = _mm512_mask_mov_ps(fft1501, 43176, fft1505);
__m512 fft1429 = _mm512_mask_mov_ps(fft1427, 43176, fft1426);
__m512 fft1509 = _mm512_mask_mov_ps(fft1507, 43176, fft1506);
__m512 fft1430 = _mm512_mask_mov_ps(fft1428, 22102, fft1426);
__m512 fft1510 = _mm512_mask_mov_ps(fft1508, 22102, fft1506);
__m512 fft1431 = _mm512_mask_mul_ps(fft1429, 64764, fft1429, _mm512_set1_ps(5e-01f));
__m512 fft1511 = _mm512_mask_mul_ps(fft1509, 64764, fft1509, _mm512_set1_ps(5e-01f));
__m512 fft1432 = _mm512_mask_mul_ps(fft1430, 64764, fft1430, _mm512_set1_ps(5e-01f));
__m512 fft1512 = _mm512_mask_mul_ps(fft1510, 64764, fft1510, _mm512_set1_ps(5e-01f));
__m512 df49 = fft1431;
__m512 df57 = fft1511;
__m512 df50 = fft1432;
__m512 df58 = fft1512;
__m512 df51 = fft1412;
__m512 df59 = fft1495;
__m512 df52 = fft1413;
__m512 df60 = fft1496;
__m512 df53 = fft1414;
__m512 df61 = fft1497;
__m512 df54 = fft1415;
__m512 df62 = fft1498;
__m512 df55 = fft1416;
__m512 df63 = fft1499;
__m512 df56 = fft1417;
__m512 df64 = fft1500;
__m512i eo9 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df51 = _mm512_permutexvar_ps(eo9, df51);
df52 = _mm512_permutexvar_ps(eo9, df52);
_mm512_mask_storeu_ps(dfPtr1+21888+87552*i6+21888*j2+384*k4+128*m9+32*f10, 255, df51);
_mm512_mask_storeu_ps(dfPtr1+21952+87552*i6+21888*j2+384*k4+128*m9+32*f10, 255, df52);
_mm512_mask_storeu_ps(dfPtr1+459616+87552*i6+21888*j2+384*k4+128*m9+32*f10, 65280, df51);
_mm512_mask_storeu_ps(dfPtr1+459680+87552*i6+21888*j2+384*k4+128*m9+32*f10, 65280, df52);
df59 = _mm512_permutexvar_ps(eo9, df59);
df60 = _mm512_permutexvar_ps(eo9, df60);
_mm512_mask_storeu_ps(dfPtr1+897408+87552*i6+21888*j2+384*k4+128*m9+32*f10, 255, df59);
_mm512_mask_storeu_ps(dfPtr1+897472+87552*i6+21888*j2+384*k4+128*m9+32*f10, 255, df60);
_mm512_mask_storeu_ps(dfPtr1+1335136+87552*i6+21888*j2+384*k4+128*m9+32*f10, 65280, df59);
_mm512_mask_storeu_ps(dfPtr1+1335200+87552*i6+21888*j2+384*k4+128*m9+32*f10, 65280, df60);
df53 = _mm512_permutexvar_ps(eo9, df53);
df54 = _mm512_permutexvar_ps(eo9, df54);
_mm512_mask_storeu_ps(dfPtr1+43776+87552*i6+21888*j2+384*k4+128*m9+32*f10, 255, df53);
_mm512_mask_storeu_ps(dfPtr1+43840+87552*i6+21888*j2+384*k4+128*m9+32*f10, 255, df54);
_mm512_mask_storeu_ps(dfPtr1+481504+87552*i6+21888*j2+384*k4+128*m9+32*f10, 65280, df53);
_mm512_mask_storeu_ps(dfPtr1+481568+87552*i6+21888*j2+384*k4+128*m9+32*f10, 65280, df54);
df61 = _mm512_permutexvar_ps(eo9, df61);
df62 = _mm512_permutexvar_ps(eo9, df62);
_mm512_mask_storeu_ps(dfPtr1+919296+87552*i6+21888*j2+384*k4+128*m9+32*f10, 255, df61);
_mm512_mask_storeu_ps(dfPtr1+919360+87552*i6+21888*j2+384*k4+128*m9+32*f10, 255, df62);
_mm512_mask_storeu_ps(dfPtr1+1357024+87552*i6+21888*j2+384*k4+128*m9+32*f10, 65280, df61);
_mm512_mask_storeu_ps(dfPtr1+1357088+87552*i6+21888*j2+384*k4+128*m9+32*f10, 65280, df62);
df55 = _mm512_permutexvar_ps(eo9, df55);
df56 = _mm512_permutexvar_ps(eo9, df56);
_mm512_mask_storeu_ps(dfPtr1+65664+87552*i6+21888*j2+384*k4+128*m9+32*f10, 255, df55);
_mm512_mask_storeu_ps(dfPtr1+65728+87552*i6+21888*j2+384*k4+128*m9+32*f10, 255, df56);
_mm512_mask_storeu_ps(dfPtr1+503392+87552*i6+21888*j2+384*k4+128*m9+32*f10, 65280, df55);
_mm512_mask_storeu_ps(dfPtr1+503456+87552*i6+21888*j2+384*k4+128*m9+32*f10, 65280, df56);
df63 = _mm512_permutexvar_ps(eo9, df63);
df64 = _mm512_permutexvar_ps(eo9, df64);
_mm512_mask_storeu_ps(dfPtr1+941184+87552*i6+21888*j2+384*k4+128*m9+32*f10, 255, df63);
_mm512_mask_storeu_ps(dfPtr1+941248+87552*i6+21888*j2+384*k4+128*m9+32*f10, 255, df64);
_mm512_mask_storeu_ps(dfPtr1+1378912+87552*i6+21888*j2+384*k4+128*m9+32*f10, 65280, df63);
_mm512_mask_storeu_ps(dfPtr1+1378976+87552*i6+21888*j2+384*k4+128*m9+32*f10, 65280, df64);
_mm512_mask_storeu_ps(dfPtr1+0+87552*i6+21888*j2+384*k4+128*m9+32*f10, 255, df49);
_mm512_mask_storeu_ps(dfPtr1+64+87552*i6+21888*j2+384*k4+128*m9+32*f10, 255, df50);
_mm512_mask_storeu_ps(dfPtr1+437728+87552*i6+21888*j2+384*k4+128*m9+32*f10, 65280, df49);
_mm512_mask_storeu_ps(dfPtr1+437792+87552*i6+21888*j2+384*k4+128*m9+32*f10, 65280, df50);
_mm512_mask_storeu_ps(dfPtr1+875520+87552*i6+21888*j2+384*k4+128*m9+32*f10, 255, df57);
_mm512_mask_storeu_ps(dfPtr1+875584+87552*i6+21888*j2+384*k4+128*m9+32*f10, 255, df58);
_mm512_mask_storeu_ps(dfPtr1+1313248+87552*i6+21888*j2+384*k4+128*m9+32*f10, 65280, df57);
_mm512_mask_storeu_ps(dfPtr1+1313312+87552*i6+21888*j2+384*k4+128*m9+32*f10, 65280, df58);
}
++j2;
}

// Launches the data-arrangement step on the given thread team.
//
// Builds a 4-dimensional task descriptor that points at
// Example9StriderArrangeDats1Callee1, carries the tensor pointer table as
// its opaque argument, and uses the hull {2, 1, 5, 1}; the descriptor is
// then handed to Example9ThreaderDo1.
//
// NOTE(review): hull1 presumably gives the per-dimension extent of the
// iteration space that the threader splits across workers -- confirm
// against the Example9ThreaderDo1 implementation.
static void Example9StriderArrangeDats1(Example9ThreaderTeam1* team15, char** tensors3) {
    Example9ThreaderTask1 arrangeTask = {
        .callee1 = Example9StriderArrangeDats1Callee1,
        .any1 = tensors3,
        .nd1 = 4,
        .hull1 = {2, 1, 5, 1},
    };
    Example9ThreaderDo1(team15, &arrangeTask);
}

static void Example9StriderProduceSums1Callee1(Example9ThreaderTask1* task8, int64_t* pt9) {
void** tuple2 = task8->any1;
char** tensors6 = tuple2[0];
ptrdiff_t e3 = 0;
ptrdiff_t z2 = (ptrdiff_t)tuple2[2];
ptrdiff_t g4 = pt9[3];
ptrdiff_t p1 = pt9[2];
ptrdiff_t d1 = 0;
ptrdiff_t w2 = pt9[0];
if (__builtin_expect(!(e3|z2), 0)) {
z2 = 0;
char*restrict bfPtr2 = tensors6[0]+1600*e3;
char*restrict wfPtr2 = tensors6[0]+1600+81100800*e3+2918400*z2;
char*restrict dfPtr2 = tensors6[1]+12165120*e3+437760*z2;
char*restrict sfPtr1 = tensors6[2];
ptrdiff_t i7 = 1*g4;
ptrdiff_t j3 = 1*p1;
ptrdiff_t jj2 = j3+0;
if (__builtin_expect(!j3, 0)) {
ptrdiff_t k5 = 1*d1;
for (; k5 != 1; ++k5) {
ptrdiff_t l1 = 5*w2;
ptrdiff_t ll1 = l1+4;
for (; l1 != 19; ++l1) {
__m512 sfRe1 = _mm512_setzero_ps();
__m512 sfIm1 = _mm512_setzero_ps();
__m512 sfRe7 = _mm512_setzero_ps();
__m512 sfIm7 = _mm512_setzero_ps();
sfRe1 = _mm512_mask_mov_ps(sfRe1, 1, _mm512_set1_ps(*(float*)(bfPtr2+0+320*i7+16*l1)));
sfRe1 = _mm512_mask_mov_ps(sfRe1, 256, _mm512_set1_ps(*(float*)(bfPtr2+4+320*i7+16*l1)));
sfRe7 = _mm512_mask_mov_ps(sfRe7, 1, _mm512_set1_ps(*(float*)(bfPtr2+8+320*i7+16*l1)));
sfRe7 = _mm512_mask_mov_ps(sfRe7, 256, _mm512_set1_ps(*(float*)(bfPtr2+12+320*i7+16*l1)));
__m512 sfRe2 = sfRe1;
__m512 sfIm2 = sfIm1;
__m512 sfRe3 = sfRe1;
__m512 sfIm3 = sfIm1;
__m512 sfRe4 = sfRe1;
__m512 sfIm4 = sfIm1;
__m512 sfRe5 = sfRe1;
__m512 sfIm5 = sfIm1;
__m512 sfRe6 = sfRe1;
__m512 sfIm6 = sfIm1;
__m512 sfRe8 = sfRe7;
__m512 sfIm8 = sfIm7;
__m512 sfRe9 = sfRe7;
__m512 sfIm9 = sfIm7;
__m512 sfRe10 = sfRe7;
__m512 sfIm10 = sfIm7;
__m512 sfRe11 = sfRe7;
__m512 sfIm11 = sfIm7;
__m512 sfRe12 = sfRe7;
__m512 sfIm12 = sfIm7;
for (ptrdiff_t s2 = 0; s2 < 57; ++s2) {
__m512i wfLd1 = _mm512_loadu_si512(wfPtr2+0+583680*i7+145920*j3+7296*l1+128*s2);
__m512 wfRe1 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd1));
__m512 wfIm1 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd1, 1));
__m512 wfMx1 = _mm512_mask_mov_ps(wfIm1, 64764, wfRe1);
__m512i wfLd2 = _mm512_loadu_si512(wfPtr2+64+583680*i7+145920*j3+7296*l1+128*s2);
__m512 wfRe2 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd2));
__m512 wfIm2 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd2, 1));
__m512 wfMx2 = _mm512_mask_mov_ps(wfIm2, 64764, wfRe2);
__m512 dfRe1 = _mm512_loadu_ps(dfPtr2+0+87552*i7+21888*j3+21888*k5+384*s2);
__m512 dfIm1 = _mm512_loadu_ps(dfPtr2+64+87552*i7+21888*j3+21888*k5+384*s2);
sfRe1 = _mm512_fmadd_ps(wfRe1, dfRe1, sfRe1);
sfRe1 = _mm512_mask3_fmadd_ps(wfIm1, dfIm1, sfRe1, 64764);
sfIm1 = _mm512_fmadd_ps(wfMx1, dfIm1, sfIm1);
sfIm1 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe1, sfIm1, 64764);
sfRe7 = _mm512_fmadd_ps(wfRe2, dfRe1, sfRe7);
sfRe7 = _mm512_mask3_fmadd_ps(wfIm2, dfIm1, sfRe7, 64764);
sfIm7 = _mm512_fmadd_ps(wfMx2, dfIm1, sfIm7);
sfIm7 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe1, sfIm7, 64764);
dfRe1 = _mm512_shuffle_f32x4(dfRe1, dfRe1, 78);
dfIm1 = _mm512_shuffle_f32x4(dfIm1, dfIm1, 78);
sfRe2 = _mm512_fmadd_ps(wfRe1, dfRe1, sfRe2);
sfRe2 = _mm512_mask3_fmadd_ps(wfIm1, dfIm1, sfRe2, 64764);
sfIm2 = _mm512_fmadd_ps(wfMx1, dfIm1, sfIm2);
sfIm2 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe1, sfIm2, 64764);
sfRe8 = _mm512_fmadd_ps(wfRe2, dfRe1, sfRe8);
sfRe8 = _mm512_mask3_fmadd_ps(wfIm2, dfIm1, sfRe8, 64764);
sfIm8 = _mm512_fmadd_ps(wfMx2, dfIm1, sfIm8);
sfIm8 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe1, sfIm8, 64764);
__m512 dfRe2 = _mm512_loadu_ps(dfPtr2+128+87552*i7+21888*j3+21888*k5+384*s2);
__m512 dfIm2 = _mm512_loadu_ps(dfPtr2+192+87552*i7+21888*j3+21888*k5+384*s2);
sfRe3 = _mm512_fmadd_ps(wfRe1, dfRe2, sfRe3);
sfRe3 = _mm512_mask3_fmadd_ps(wfIm1, dfIm2, sfRe3, 64764);
sfIm3 = _mm512_fmadd_ps(wfMx1, dfIm2, sfIm3);
sfIm3 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe2, sfIm3, 64764);
sfRe9 = _mm512_fmadd_ps(wfRe2, dfRe2, sfRe9);
sfRe9 = _mm512_mask3_fmadd_ps(wfIm2, dfIm2, sfRe9, 64764);
sfIm9 = _mm512_fmadd_ps(wfMx2, dfIm2, sfIm9);
sfIm9 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe2, sfIm9, 64764);
dfRe2 = _mm512_shuffle_f32x4(dfRe2, dfRe2, 78);
dfIm2 = _mm512_shuffle_f32x4(dfIm2, dfIm2, 78);
sfRe4 = _mm512_fmadd_ps(wfRe1, dfRe2, sfRe4);
sfRe4 = _mm512_mask3_fmadd_ps(wfIm1, dfIm2, sfRe4, 64764);
sfIm4 = _mm512_fmadd_ps(wfMx1, dfIm2, sfIm4);
sfIm4 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe2, sfIm4, 64764);
sfRe10 = _mm512_fmadd_ps(wfRe2, dfRe2, sfRe10);
sfRe10 = _mm512_mask3_fmadd_ps(wfIm2, dfIm2, sfRe10, 64764);
sfIm10 = _mm512_fmadd_ps(wfMx2, dfIm2, sfIm10);
sfIm10 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe2, sfIm10, 64764);
__m512 dfRe3 = _mm512_loadu_ps(dfPtr2+256+87552*i7+21888*j3+21888*k5+384*s2);
__m512 dfIm3 = _mm512_loadu_ps(dfPtr2+320+87552*i7+21888*j3+21888*k5+384*s2);
sfRe5 = _mm512_fmadd_ps(wfRe1, dfRe3, sfRe5);
sfRe5 = _mm512_mask3_fmadd_ps(wfIm1, dfIm3, sfRe5, 64764);
sfIm5 = _mm512_fmadd_ps(wfMx1, dfIm3, sfIm5);
sfIm5 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe3, sfIm5, 64764);
sfRe11 = _mm512_fmadd_ps(wfRe2, dfRe3, sfRe11);
sfRe11 = _mm512_mask3_fmadd_ps(wfIm2, dfIm3, sfRe11, 64764);
sfIm11 = _mm512_fmadd_ps(wfMx2, dfIm3, sfIm11);
sfIm11 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe3, sfIm11, 64764);
dfRe3 = _mm512_shuffle_f32x4(dfRe3, dfRe3, 78);
dfIm3 = _mm512_shuffle_f32x4(dfIm3, dfIm3, 78);
sfRe6 = _mm512_fmadd_ps(wfRe1, dfRe3, sfRe6);
sfRe6 = _mm512_mask3_fmadd_ps(wfIm1, dfIm3, sfRe6, 64764);
sfIm6 = _mm512_fmadd_ps(wfMx1, dfIm3, sfIm6);
sfIm6 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe3, sfIm6, 64764);
sfRe12 = _mm512_fmadd_ps(wfRe2, dfRe3, sfRe12);
sfRe12 = _mm512_mask3_fmadd_ps(wfIm2, dfIm3, sfRe12, 64764);
sfIm12 = _mm512_fmadd_ps(wfMx2, dfIm3, sfIm12);
sfIm12 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe3, sfIm12, 64764);
}
_mm512_storeu_ps(sfPtr1+0+121344*i7+30336*j3+30336*k5+1536*l1, sfRe1);
_mm512_storeu_ps(sfPtr1+64+121344*i7+30336*j3+30336*k5+1536*l1, sfIm1);
_mm512_storeu_ps(sfPtr1+128+121344*i7+30336*j3+30336*k5+1536*l1, sfRe2);
_mm512_storeu_ps(sfPtr1+192+121344*i7+30336*j3+30336*k5+1536*l1, sfIm2);
_mm512_storeu_ps(sfPtr1+256+121344*i7+30336*j3+30336*k5+1536*l1, sfRe3);
_mm512_storeu_ps(sfPtr1+320+121344*i7+30336*j3+30336*k5+1536*l1, sfIm3);
_mm512_storeu_ps(sfPtr1+384+121344*i7+30336*j3+30336*k5+1536*l1, sfRe4);
_mm512_storeu_ps(sfPtr1+448+121344*i7+30336*j3+30336*k5+1536*l1, sfIm4);
_mm512_storeu_ps(sfPtr1+512+121344*i7+30336*j3+30336*k5+1536*l1, sfRe5);
_mm512_storeu_ps(sfPtr1+576+121344*i7+30336*j3+30336*k5+1536*l1, sfIm5);
_mm512_storeu_ps(sfPtr1+640+121344*i7+30336*j3+30336*k5+1536*l1, sfRe6);
_mm512_storeu_ps(sfPtr1+704+121344*i7+30336*j3+30336*k5+1536*l1, sfIm6);
_mm512_storeu_ps(sfPtr1+768+121344*i7+30336*j3+30336*k5+1536*l1, sfRe7);
_mm512_storeu_ps(sfPtr1+832+121344*i7+30336*j3+30336*k5+1536*l1, sfIm7);
_mm512_storeu_ps(sfPtr1+896+121344*i7+30336*j3+30336*k5+1536*l1, sfRe8);
_mm512_storeu_ps(sfPtr1+960+121344*i7+30336*j3+30336*k5+1536*l1, sfIm8);
_mm512_storeu_ps(sfPtr1+1024+121344*i7+30336*j3+30336*k5+1536*l1, sfRe9);
_mm512_storeu_ps(sfPtr1+1088+121344*i7+30336*j3+30336*k5+1536*l1, sfIm9);
_mm512_storeu_ps(sfPtr1+1152+121344*i7+30336*j3+30336*k5+1536*l1, sfRe10);
_mm512_storeu_ps(sfPtr1+1216+121344*i7+30336*j3+30336*k5+1536*l1, sfIm10);
_mm512_storeu_ps(sfPtr1+1280+121344*i7+30336*j3+30336*k5+1536*l1, sfRe11);
_mm512_storeu_ps(sfPtr1+1344+121344*i7+30336*j3+30336*k5+1536*l1, sfIm11);
_mm512_storeu_ps(sfPtr1+1408+121344*i7+30336*j3+30336*k5+1536*l1, sfRe12);
_mm512_storeu_ps(sfPtr1+1472+121344*i7+30336*j3+30336*k5+1536*l1, sfIm12);
if (l1 >= ll1) return;
}
__m512 sfRe13 = _mm512_setzero_ps();
__m512 sfIm13 = _mm512_setzero_ps();
__m512 sfRe19 = _mm512_setzero_ps();
__m512 sfIm19 = _mm512_setzero_ps();
sfRe13 = _mm512_mask_mov_ps(sfRe13, 1, _mm512_set1_ps(*(float*)(bfPtr2+0+320*i7+16*l1)));
sfRe13 = _mm512_mask_mov_ps(sfRe13, 256, _mm512_set1_ps(*(float*)(bfPtr2+4+320*i7+16*l1)));
sfRe19 = _mm512_mask_mov_ps(sfRe19, 257, _mm512_set1_ps(*(float*)(bfPtr2+8+320*i7+16*l1)));
__m512 sfRe14 = sfRe13;
__m512 sfIm14 = sfIm13;
__m512 sfRe15 = sfRe13;
__m512 sfIm15 = sfIm13;
__m512 sfRe16 = sfRe13;
__m512 sfIm16 = sfIm13;
__m512 sfRe17 = sfRe13;
__m512 sfIm17 = sfIm13;
__m512 sfRe18 = sfRe13;
__m512 sfIm18 = sfIm13;
__m512 sfRe20 = sfRe19;
__m512 sfIm20 = sfIm19;
__m512 sfRe21 = sfRe19;
__m512 sfIm21 = sfIm19;
for (ptrdiff_t s3 = 0; s3 < 57; ++s3) {
__m512i wfLd3 = _mm512_loadu_si512(wfPtr2+0+583680*i7+145920*j3+7296*l1+128*s3);
__m512 wfRe3 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd3));
__m512 wfIm3 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd3, 1));
__m512 wfMx3 = _mm512_mask_mov_ps(wfIm3, 64764, wfRe3);
__m512i wfLd4 = _mm512_loadu_si512(wfPtr2+64+583680*i7+145920*j3+7296*l1+128*s3);
__m512 wfRe4 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd4));
__m512 wfIm4 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd4, 1));
__m512 wfMx4 = _mm512_mask_mov_ps(wfIm4, 64764, wfRe4);
__m512 dfRe4 = _mm512_loadu_ps(dfPtr2+0+87552*i7+21888*j3+21888*k5+384*s3);
__m512 dfIm4 = _mm512_loadu_ps(dfPtr2+64+87552*i7+21888*j3+21888*k5+384*s3);
sfRe13 = _mm512_fmadd_ps(wfRe3, dfRe4, sfRe13);
sfRe13 = _mm512_mask3_fmadd_ps(wfIm3, dfIm4, sfRe13, 64764);
sfIm13 = _mm512_fmadd_ps(wfMx3, dfIm4, sfIm13);
sfIm13 = _mm512_mask3_fnmadd_ps(wfIm3, dfRe4, sfIm13, 64764);
sfRe19 = _mm512_fmadd_ps(wfRe4, dfRe4, sfRe19);
sfRe19 = _mm512_mask3_fmadd_ps(wfIm4, dfIm4, sfRe19, 64764);
sfIm19 = _mm512_fmadd_ps(wfMx4, dfIm4, sfIm19);
sfIm19 = _mm512_mask3_fnmadd_ps(wfIm4, dfRe4, sfIm19, 64764);
dfRe4 = _mm512_shuffle_f32x4(dfRe4, dfRe4, 78);
dfIm4 = _mm512_shuffle_f32x4(dfIm4, dfIm4, 78);
sfRe14 = _mm512_fmadd_ps(wfRe3, dfRe4, sfRe14);
sfRe14 = _mm512_mask3_fmadd_ps(wfIm3, dfIm4, sfRe14, 64764);
sfIm14 = _mm512_fmadd_ps(wfMx3, dfIm4, sfIm14);
sfIm14 = _mm512_mask3_fnmadd_ps(wfIm3, dfRe4, sfIm14, 64764);
__m512 dfRe5 = _mm512_loadu_ps(dfPtr2+128+87552*i7+21888*j3+21888*k5+384*s3);
__m512 dfIm5 = _mm512_loadu_ps(dfPtr2+192+87552*i7+21888*j3+21888*k5+384*s3);
sfRe15 = _mm512_fmadd_ps(wfRe3, dfRe5, sfRe15);
sfRe15 = _mm512_mask3_fmadd_ps(wfIm3, dfIm5, sfRe15, 64764);
sfIm15 = _mm512_fmadd_ps(wfMx3, dfIm5, sfIm15);
sfIm15 = _mm512_mask3_fnmadd_ps(wfIm3, dfRe5, sfIm15, 64764);
sfRe20 = _mm512_fmadd_ps(wfRe4, dfRe5, sfRe20);
sfRe20 = _mm512_mask3_fmadd_ps(wfIm4, dfIm5, sfRe20, 64764);
sfIm20 = _mm512_fmadd_ps(wfMx4, dfIm5, sfIm20);
sfIm20 = _mm512_mask3_fnmadd_ps(wfIm4, dfRe5, sfIm20, 64764);
dfRe5 = _mm512_shuffle_f32x4(dfRe5, dfRe5, 78);
dfIm5 = _mm512_shuffle_f32x4(dfIm5, dfIm5, 78);
sfRe16 = _mm512_fmadd_ps(wfRe3, dfRe5, sfRe16);
sfRe16 = _mm512_mask3_fmadd_ps(wfIm3, dfIm5, sfRe16, 64764);
sfIm16 = _mm512_fmadd_ps(wfMx3, dfIm5, sfIm16);
sfIm16 = _mm512_mask3_fnmadd_ps(wfIm3, dfRe5, sfIm16, 64764);
__m512 dfRe6 = _mm512_loadu_ps(dfPtr2+256+87552*i7+21888*j3+21888*k5+384*s3);
__m512 dfIm6 = _mm512_loadu_ps(dfPtr2+320+87552*i7+21888*j3+21888*k5+384*s3);
sfRe17 = _mm512_fmadd_ps(wfRe3, dfRe6, sfRe17);
sfRe17 = _mm512_mask3_fmadd_ps(wfIm3, dfIm6, sfRe17, 64764);
sfIm17 = _mm512_fmadd_ps(wfMx3, dfIm6, sfIm17);
sfIm17 = _mm512_mask3_fnmadd_ps(wfIm3, dfRe6, sfIm17, 64764);
sfRe21 = _mm512_fmadd_ps(wfRe4, dfRe6, sfRe21);
sfRe21 = _mm512_mask3_fmadd_ps(wfIm4, dfIm6, sfRe21, 64764);
sfIm21 = _mm512_fmadd_ps(wfMx4, dfIm6, sfIm21);
sfIm21 = _mm512_mask3_fnmadd_ps(wfIm4, dfRe6, sfIm21, 64764);
dfRe6 = _mm512_shuffle_f32x4(dfRe6, dfRe6, 78);
dfIm6 = _mm512_shuffle_f32x4(dfIm6, dfIm6, 78);
sfRe18 = _mm512_fmadd_ps(wfRe3, dfRe6, sfRe18);
sfRe18 = _mm512_mask3_fmadd_ps(wfIm3, dfIm6, sfRe18, 64764);
sfIm18 = _mm512_fmadd_ps(wfMx3, dfIm6, sfIm18);
sfIm18 = _mm512_mask3_fnmadd_ps(wfIm3, dfRe6, sfIm18, 64764);
}
_mm512_storeu_ps(sfPtr1+0+121344*i7+30336*j3+30336*k5+1536*l1, sfRe13);
_mm512_storeu_ps(sfPtr1+64+121344*i7+30336*j3+30336*k5+1536*l1, sfIm13);
_mm512_storeu_ps(sfPtr1+128+121344*i7+30336*j3+30336*k5+1536*l1, sfRe14);
_mm512_storeu_ps(sfPtr1+192+121344*i7+30336*j3+30336*k5+1536*l1, sfIm14);
_mm512_storeu_ps(sfPtr1+256+121344*i7+30336*j3+30336*k5+1536*l1, sfRe15);
_mm512_storeu_ps(sfPtr1+320+121344*i7+30336*j3+30336*k5+1536*l1, sfIm15);
_mm512_storeu_ps(sfPtr1+384+121344*i7+30336*j3+30336*k5+1536*l1, sfRe16);
_mm512_storeu_ps(sfPtr1+448+121344*i7+30336*j3+30336*k5+1536*l1, sfIm16);
_mm512_storeu_ps(sfPtr1+512+121344*i7+30336*j3+30336*k5+1536*l1, sfRe17);
_mm512_storeu_ps(sfPtr1+576+121344*i7+30336*j3+30336*k5+1536*l1, sfIm17);
_mm512_storeu_ps(sfPtr1+640+121344*i7+30336*j3+30336*k5+1536*l1, sfRe18);
_mm512_storeu_ps(sfPtr1+704+121344*i7+30336*j3+30336*k5+1536*l1, sfIm18);
_mm512_storeu_ps(sfPtr1+768+121344*i7+30336*j3+30336*k5+1536*l1, sfRe19);
_mm512_storeu_ps(sfPtr1+832+121344*i7+30336*j3+30336*k5+1536*l1, sfIm19);
_mm512_storeu_ps(sfPtr1+896+121344*i7+30336*j3+30336*k5+1536*l1, sfRe20);
_mm512_storeu_ps(sfPtr1+960+121344*i7+30336*j3+30336*k5+1536*l1, sfIm20);
_mm512_storeu_ps(sfPtr1+1024+121344*i7+30336*j3+30336*k5+1536*l1, sfRe21);
_mm512_storeu_ps(sfPtr1+1088+121344*i7+30336*j3+30336*k5+1536*l1, sfIm21);
}
j3 = 1;
}
for (; j3 <= jj2; ++j3) {
ptrdiff_t k6 = 1*d1;
for (; k6 != 1; ++k6) {
ptrdiff_t l2 = 5*w2;
ptrdiff_t ll2 = l2+4;
for (; l2 != 19; ++l2) {
__m512 sfRe22 = _mm512_setzero_ps();
__m512 sfIm22 = _mm512_setzero_ps();
__m512 sfRe28 = _mm512_setzero_ps();
__m512 sfIm28 = _mm512_setzero_ps();
(void)bfPtr2;
__m512 sfRe23 = sfRe22;
__m512 sfIm23 = sfIm22;
__m512 sfRe24 = sfRe22;
__m512 sfIm24 = sfIm22;
__m512 sfRe25 = sfRe22;
__m512 sfIm25 = sfIm22;
__m512 sfRe26 = sfRe22;
__m512 sfIm26 = sfIm22;
__m512 sfRe27 = sfRe22;
__m512 sfIm27 = sfIm22;
__m512 sfRe29 = sfRe28;
__m512 sfIm29 = sfIm28;
__m512 sfRe30 = sfRe28;
__m512 sfIm30 = sfIm28;
__m512 sfRe31 = sfRe28;
__m512 sfIm31 = sfIm28;
__m512 sfRe32 = sfRe28;
__m512 sfIm32 = sfIm28;
__m512 sfRe33 = sfRe28;
__m512 sfIm33 = sfIm28;
for (ptrdiff_t s4 = 0; s4 < 57; ++s4) {
__m512i wfLd5 = _mm512_loadu_si512(wfPtr2+0+583680*i7+145920*j3+7296*l2+128*s4);
__m512 wfRe5 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd5));
__m512 wfIm5 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd5, 1));
__m512i wfLd6 = _mm512_loadu_si512(wfPtr2+64+583680*i7+145920*j3+7296*l2+128*s4);
__m512 wfRe6 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd6));
__m512 wfIm6 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd6, 1));
__m512 dfRe7 = _mm512_loadu_ps(dfPtr2+0+87552*i7+21888*j3+21888*k6+384*s4);
__m512 dfIm7 = _mm512_loadu_ps(dfPtr2+64+87552*i7+21888*j3+21888*k6+384*s4);
sfRe22 = _mm512_fmadd_ps(wfRe5, dfRe7, sfRe22);
sfRe22 = _mm512_fmadd_ps(wfIm5, dfIm7, sfRe22);
sfIm22 = _mm512_fmadd_ps(wfRe5, dfIm7, sfIm22);
sfIm22 = _mm512_fnmadd_ps(wfIm5, dfRe7, sfIm22);
sfRe28 = _mm512_fmadd_ps(wfRe6, dfRe7, sfRe28);
sfRe28 = _mm512_fmadd_ps(wfIm6, dfIm7, sfRe28);
sfIm28 = _mm512_fmadd_ps(wfRe6, dfIm7, sfIm28);
sfIm28 = _mm512_fnmadd_ps(wfIm6, dfRe7, sfIm28);
dfRe7 = _mm512_shuffle_f32x4(dfRe7, dfRe7, 78);
dfIm7 = _mm512_shuffle_f32x4(dfIm7, dfIm7, 78);
sfRe23 = _mm512_fmadd_ps(wfRe5, dfRe7, sfRe23);
sfRe23 = _mm512_fmadd_ps(wfIm5, dfIm7, sfRe23);
sfIm23 = _mm512_fmadd_ps(wfRe5, dfIm7, sfIm23);
sfIm23 = _mm512_fnmadd_ps(wfIm5, dfRe7, sfIm23);
sfRe29 = _mm512_fmadd_ps(wfRe6, dfRe7, sfRe29);
sfRe29 = _mm512_fmadd_ps(wfIm6, dfIm7, sfRe29);
sfIm29 = _mm512_fmadd_ps(wfRe6, dfIm7, sfIm29);
sfIm29 = _mm512_fnmadd_ps(wfIm6, dfRe7, sfIm29);
__m512 dfRe8 = _mm512_loadu_ps(dfPtr2+128+87552*i7+21888*j3+21888*k6+384*s4);
__m512 dfIm8 = _mm512_loadu_ps(dfPtr2+192+87552*i7+21888*j3+21888*k6+384*s4);
sfRe24 = _mm512_fmadd_ps(wfRe5, dfRe8, sfRe24);
sfRe24 = _mm512_fmadd_ps(wfIm5, dfIm8, sfRe24);
sfIm24 = _mm512_fmadd_ps(wfRe5, dfIm8, sfIm24);
sfIm24 = _mm512_fnmadd_ps(wfIm5, dfRe8, sfIm24);
sfRe30 = _mm512_fmadd_ps(wfRe6, dfRe8, sfRe30);
sfRe30 = _mm512_fmadd_ps(wfIm6, dfIm8, sfRe30);
sfIm30 = _mm512_fmadd_ps(wfRe6, dfIm8, sfIm30);
sfIm30 = _mm512_fnmadd_ps(wfIm6, dfRe8, sfIm30);
dfRe8 = _mm512_shuffle_f32x4(dfRe8, dfRe8, 78);
dfIm8 = _mm512_shuffle_f32x4(dfIm8, dfIm8, 78);
sfRe25 = _mm512_fmadd_ps(wfRe5, dfRe8, sfRe25);
sfRe25 = _mm512_fmadd_ps(wfIm5, dfIm8, sfRe25);
sfIm25 = _mm512_fmadd_ps(wfRe5, dfIm8, sfIm25);
sfIm25 = _mm512_fnmadd_ps(wfIm5, dfRe8, sfIm25);
sfRe31 = _mm512_fmadd_ps(wfRe6, dfRe8, sfRe31);
sfRe31 = _mm512_fmadd_ps(wfIm6, dfIm8, sfRe31);
sfIm31 = _mm512_fmadd_ps(wfRe6, dfIm8, sfIm31);
sfIm31 = _mm512_fnmadd_ps(wfIm6, dfRe8, sfIm31);
__m512 dfRe9 = _mm512_loadu_ps(dfPtr2+256+87552*i7+21888*j3+21888*k6+384*s4);
__m512 dfIm9 = _mm512_loadu_ps(dfPtr2+320+87552*i7+21888*j3+21888*k6+384*s4);
sfRe26 = _mm512_fmadd_ps(wfRe5, dfRe9, sfRe26);
sfRe26 = _mm512_fmadd_ps(wfIm5, dfIm9, sfRe26);
sfIm26 = _mm512_fmadd_ps(wfRe5, dfIm9, sfIm26);
sfIm26 = _mm512_fnmadd_ps(wfIm5, dfRe9, sfIm26);
sfRe32 = _mm512_fmadd_ps(wfRe6, dfRe9, sfRe32);
sfRe32 = _mm512_fmadd_ps(wfIm6, dfIm9, sfRe32);
sfIm32 = _mm512_fmadd_ps(wfRe6, dfIm9, sfIm32);
sfIm32 = _mm512_fnmadd_ps(wfIm6, dfRe9, sfIm32);
dfRe9 = _mm512_shuffle_f32x4(dfRe9, dfRe9, 78);
dfIm9 = _mm512_shuffle_f32x4(dfIm9, dfIm9, 78);
sfRe27 = _mm512_fmadd_ps(wfRe5, dfRe9, sfRe27);
sfRe27 = _mm512_fmadd_ps(wfIm5, dfIm9, sfRe27);
sfIm27 = _mm512_fmadd_ps(wfRe5, dfIm9, sfIm27);
sfIm27 = _mm512_fnmadd_ps(wfIm5, dfRe9, sfIm27);
sfRe33 = _mm512_fmadd_ps(wfRe6, dfRe9, sfRe33);
sfRe33 = _mm512_fmadd_ps(wfIm6, dfIm9, sfRe33);
sfIm33 = _mm512_fmadd_ps(wfRe6, dfIm9, sfIm33);
sfIm33 = _mm512_fnmadd_ps(wfIm6, dfRe9, sfIm33);
}
_mm512_storeu_ps(sfPtr1+0+121344*i7+30336*j3+30336*k6+1536*l2, sfRe22);
_mm512_storeu_ps(sfPtr1+64+121344*i7+30336*j3+30336*k6+1536*l2, sfIm22);
_mm512_storeu_ps(sfPtr1+128+121344*i7+30336*j3+30336*k6+1536*l2, sfRe23);
_mm512_storeu_ps(sfPtr1+192+121344*i7+30336*j3+30336*k6+1536*l2, sfIm23);
_mm512_storeu_ps(sfPtr1+256+121344*i7+30336*j3+30336*k6+1536*l2, sfRe24);
_mm512_storeu_ps(sfPtr1+320+121344*i7+30336*j3+30336*k6+1536*l2, sfIm24);
_mm512_storeu_ps(sfPtr1+384+121344*i7+30336*j3+30336*k6+1536*l2, sfRe25);
_mm512_storeu_ps(sfPtr1+448+121344*i7+30336*j3+30336*k6+1536*l2, sfIm25);
_mm512_storeu_ps(sfPtr1+512+121344*i7+30336*j3+30336*k6+1536*l2, sfRe26);
_mm512_storeu_ps(sfPtr1+576+121344*i7+30336*j3+30336*k6+1536*l2, sfIm26);
_mm512_storeu_ps(sfPtr1+640+121344*i7+30336*j3+30336*k6+1536*l2, sfRe27);
_mm512_storeu_ps(sfPtr1+704+121344*i7+30336*j3+30336*k6+1536*l2, sfIm27);
_mm512_storeu_ps(sfPtr1+768+121344*i7+30336*j3+30336*k6+1536*l2, sfRe28);
_mm512_storeu_ps(sfPtr1+832+121344*i7+30336*j3+30336*k6+1536*l2, sfIm28);
_mm512_storeu_ps(sfPtr1+896+121344*i7+30336*j3+30336*k6+1536*l2, sfRe29);
_mm512_storeu_ps(sfPtr1+960+121344*i7+30336*j3+30336*k6+1536*l2, sfIm29);
_mm512_storeu_ps(sfPtr1+1024+121344*i7+30336*j3+30336*k6+1536*l2, sfRe30);
_mm512_storeu_ps(sfPtr1+1088+121344*i7+30336*j3+30336*k6+1536*l2, sfIm30);
_mm512_storeu_ps(sfPtr1+1152+121344*i7+30336*j3+30336*k6+1536*l2, sfRe31);
_mm512_storeu_ps(sfPtr1+1216+121344*i7+30336*j3+30336*k6+1536*l2, sfIm31);
_mm512_storeu_ps(sfPtr1+1280+121344*i7+30336*j3+30336*k6+1536*l2, sfRe32);
_mm512_storeu_ps(sfPtr1+1344+121344*i7+30336*j3+30336*k6+1536*l2, sfIm32);
_mm512_storeu_ps(sfPtr1+1408+121344*i7+30336*j3+30336*k6+1536*l2, sfRe33);
_mm512_storeu_ps(sfPtr1+1472+121344*i7+30336*j3+30336*k6+1536*l2, sfIm33);
if (l2 >= ll2) return;
}
__m512 sfRe34 = _mm512_setzero_ps();
__m512 sfIm34 = _mm512_setzero_ps();
__m512 sfRe40 = _mm512_setzero_ps();
__m512 sfIm40 = _mm512_setzero_ps();
(void)bfPtr2;
__m512 sfRe35 = sfRe34;
__m512 sfIm35 = sfIm34;
__m512 sfRe36 = sfRe34;
__m512 sfIm36 = sfIm34;
__m512 sfRe37 = sfRe34;
__m512 sfIm37 = sfIm34;
__m512 sfRe38 = sfRe34;
__m512 sfIm38 = sfIm34;
__m512 sfRe39 = sfRe34;
__m512 sfIm39 = sfIm34;
__m512 sfRe41 = sfRe40;
__m512 sfIm41 = sfIm40;
__m512 sfRe42 = sfRe40;
__m512 sfIm42 = sfIm40;
for (ptrdiff_t s5 = 0; s5 < 57; ++s5) {
__m512i wfLd7 = _mm512_loadu_si512(wfPtr2+0+583680*i7+145920*j3+7296*l2+128*s5);
__m512 wfRe7 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd7));
__m512 wfIm7 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd7, 1));
__m512i wfLd8 = _mm512_loadu_si512(wfPtr2+64+583680*i7+145920*j3+7296*l2+128*s5);
__m512 wfRe8 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd8));
__m512 wfIm8 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd8, 1));
__m512 dfRe10 = _mm512_loadu_ps(dfPtr2+0+87552*i7+21888*j3+21888*k6+384*s5);
__m512 dfIm10 = _mm512_loadu_ps(dfPtr2+64+87552*i7+21888*j3+21888*k6+384*s5);
sfRe34 = _mm512_fmadd_ps(wfRe7, dfRe10, sfRe34);
sfRe34 = _mm512_fmadd_ps(wfIm7, dfIm10, sfRe34);
sfIm34 = _mm512_fmadd_ps(wfRe7, dfIm10, sfIm34);
sfIm34 = _mm512_fnmadd_ps(wfIm7, dfRe10, sfIm34);
sfRe40 = _mm512_fmadd_ps(wfRe8, dfRe10, sfRe40);
sfRe40 = _mm512_fmadd_ps(wfIm8, dfIm10, sfRe40);
sfIm40 = _mm512_fmadd_ps(wfRe8, dfIm10, sfIm40);
sfIm40 = _mm512_fnmadd_ps(wfIm8, dfRe10, sfIm40);
dfRe10 = _mm512_shuffle_f32x4(dfRe10, dfRe10, 78);
dfIm10 = _mm512_shuffle_f32x4(dfIm10, dfIm10, 78);
sfRe35 = _mm512_fmadd_ps(wfRe7, dfRe10, sfRe35);
sfRe35 = _mm512_fmadd_ps(wfIm7, dfIm10, sfRe35);
sfIm35 = _mm512_fmadd_ps(wfRe7, dfIm10, sfIm35);
sfIm35 = _mm512_fnmadd_ps(wfIm7, dfRe10, sfIm35);
__m512 dfRe11 = _mm512_loadu_ps(dfPtr2+128+87552*i7+21888*j3+21888*k6+384*s5);
__m512 dfIm11 = _mm512_loadu_ps(dfPtr2+192+87552*i7+21888*j3+21888*k6+384*s5);
sfRe36 = _mm512_fmadd_ps(wfRe7, dfRe11, sfRe36);
sfRe36 = _mm512_fmadd_ps(wfIm7, dfIm11, sfRe36);
sfIm36 = _mm512_fmadd_ps(wfRe7, dfIm11, sfIm36);
sfIm36 = _mm512_fnmadd_ps(wfIm7, dfRe11, sfIm36);
sfRe41 = _mm512_fmadd_ps(wfRe8, dfRe11, sfRe41);
sfRe41 = _mm512_fmadd_ps(wfIm8, dfIm11, sfRe41);
sfIm41 = _mm512_fmadd_ps(wfRe8, dfIm11, sfIm41);
sfIm41 = _mm512_fnmadd_ps(wfIm8, dfRe11, sfIm41);
dfRe11 = _mm512_shuffle_f32x4(dfRe11, dfRe11, 78);
dfIm11 = _mm512_shuffle_f32x4(dfIm11, dfIm11, 78);
sfRe37 = _mm512_fmadd_ps(wfRe7, dfRe11, sfRe37);
sfRe37 = _mm512_fmadd_ps(wfIm7, dfIm11, sfRe37);
sfIm37 = _mm512_fmadd_ps(wfRe7, dfIm11, sfIm37);
sfIm37 = _mm512_fnmadd_ps(wfIm7, dfRe11, sfIm37);
__m512 dfRe12 = _mm512_loadu_ps(dfPtr2+256+87552*i7+21888*j3+21888*k6+384*s5);
__m512 dfIm12 = _mm512_loadu_ps(dfPtr2+320+87552*i7+21888*j3+21888*k6+384*s5);
sfRe38 = _mm512_fmadd_ps(wfRe7, dfRe12, sfRe38);
sfRe38 = _mm512_fmadd_ps(wfIm7, dfIm12, sfRe38);
sfIm38 = _mm512_fmadd_ps(wfRe7, dfIm12, sfIm38);
sfIm38 = _mm512_fnmadd_ps(wfIm7, dfRe12, sfIm38);
sfRe42 = _mm512_fmadd_ps(wfRe8, dfRe12, sfRe42);
sfRe42 = _mm512_fmadd_ps(wfIm8, dfIm12, sfRe42);
sfIm42 = _mm512_fmadd_ps(wfRe8, dfIm12, sfIm42);
sfIm42 = _mm512_fnmadd_ps(wfIm8, dfRe12, sfIm42);
dfRe12 = _mm512_shuffle_f32x4(dfRe12, dfRe12, 78);
dfIm12 = _mm512_shuffle_f32x4(dfIm12, dfIm12, 78);
sfRe39 = _mm512_fmadd_ps(wfRe7, dfRe12, sfRe39);
sfRe39 = _mm512_fmadd_ps(wfIm7, dfIm12, sfRe39);
sfIm39 = _mm512_fmadd_ps(wfRe7, dfIm12, sfIm39);
sfIm39 = _mm512_fnmadd_ps(wfIm7, dfRe12, sfIm39);
}
_mm512_storeu_ps(sfPtr1+0+121344*i7+30336*j3+30336*k6+1536*l2, sfRe34);
_mm512_storeu_ps(sfPtr1+64+121344*i7+30336*j3+30336*k6+1536*l2, sfIm34);
_mm512_storeu_ps(sfPtr1+128+121344*i7+30336*j3+30336*k6+1536*l2, sfRe35);
_mm512_storeu_ps(sfPtr1+192+121344*i7+30336*j3+30336*k6+1536*l2, sfIm35);
_mm512_storeu_ps(sfPtr1+256+121344*i7+30336*j3+30336*k6+1536*l2, sfRe36);
_mm512_storeu_ps(sfPtr1+320+121344*i7+30336*j3+30336*k6+1536*l2, sfIm36);
_mm512_storeu_ps(sfPtr1+384+121344*i7+30336*j3+30336*k6+1536*l2, sfRe37);
_mm512_storeu_ps(sfPtr1+448+121344*i7+30336*j3+30336*k6+1536*l2, sfIm37);
_mm512_storeu_ps(sfPtr1+512+121344*i7+30336*j3+30336*k6+1536*l2, sfRe38);
_mm512_storeu_ps(sfPtr1+576+121344*i7+30336*j3+30336*k6+1536*l2, sfIm38);
_mm512_storeu_ps(sfPtr1+640+121344*i7+30336*j3+30336*k6+1536*l2, sfRe39);
_mm512_storeu_ps(sfPtr1+704+121344*i7+30336*j3+30336*k6+1536*l2, sfIm39);
_mm512_storeu_ps(sfPtr1+768+121344*i7+30336*j3+30336*k6+1536*l2, sfRe40);
_mm512_storeu_ps(sfPtr1+832+121344*i7+30336*j3+30336*k6+1536*l2, sfIm40);
_mm512_storeu_ps(sfPtr1+896+121344*i7+30336*j3+30336*k6+1536*l2, sfRe41);
_mm512_storeu_ps(sfPtr1+960+121344*i7+30336*j3+30336*k6+1536*l2, sfIm41);
_mm512_storeu_ps(sfPtr1+1024+121344*i7+30336*j3+30336*k6+1536*l2, sfRe42);
_mm512_storeu_ps(sfPtr1+1088+121344*i7+30336*j3+30336*k6+1536*l2, sfIm42);
}
}
return;
}
char*restrict bfPtr3 = tensors6[0]+1600*e3;
char*restrict wfPtr3 = tensors6[0]+1600+81100800*e3+2918400*z2;
char*restrict dfPtr3 = tensors6[1]+12165120*e3+437760*z2;
char*restrict sfPtr2 = tensors6[2];
ptrdiff_t i8 = 1*g4;
ptrdiff_t j4 = 1*p1;
ptrdiff_t jj3 = j4+0;
if (__builtin_expect(!j4, 0)) {
ptrdiff_t k7 = 1*d1;
for (; k7 != 1; ++k7) {
ptrdiff_t l3 = 5*w2;
ptrdiff_t ll3 = l3+4;
for (; l3 != 19; ++l3) {
__m512 sfRe43 = _mm512_setzero_ps();
__m512 sfIm43 = _mm512_setzero_ps();
__m512 sfRe49 = _mm512_setzero_ps();
__m512 sfIm49 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe44 = sfRe43;
__m512 sfIm44 = sfIm43;
__m512 sfRe45 = sfRe43;
__m512 sfIm45 = sfIm43;
__m512 sfRe46 = sfRe43;
__m512 sfIm46 = sfIm43;
__m512 sfRe47 = sfRe43;
__m512 sfIm47 = sfIm43;
__m512 sfRe48 = sfRe43;
__m512 sfIm48 = sfIm43;
__m512 sfRe50 = sfRe49;
__m512 sfIm50 = sfIm49;
__m512 sfRe51 = sfRe49;
__m512 sfIm51 = sfIm49;
__m512 sfRe52 = sfRe49;
__m512 sfIm52 = sfIm49;
__m512 sfRe53 = sfRe49;
__m512 sfIm53 = sfIm49;
__m512 sfRe54 = sfRe49;
__m512 sfIm54 = sfIm49;
for (ptrdiff_t s6 = 0; s6 < 57; ++s6) {
__m512i wfLd9 = _mm512_loadu_si512(wfPtr3+0+583680*i8+145920*j4+7296*l3+128*s6);
__m512 wfRe9 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd9));
__m512 wfIm9 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd9, 1));
__m512 wfMx5 = _mm512_mask_mov_ps(wfIm9, 64764, wfRe9);
__m512i wfLd10 = _mm512_loadu_si512(wfPtr3+64+583680*i8+145920*j4+7296*l3+128*s6);
__m512 wfRe10 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd10));
__m512 wfIm10 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd10, 1));
__m512 wfMx6 = _mm512_mask_mov_ps(wfIm10, 64764, wfRe10);
__m512 dfRe13 = _mm512_loadu_ps(dfPtr3+0+87552*i8+21888*j4+21888*k7+384*s6);
__m512 dfIm13 = _mm512_loadu_ps(dfPtr3+64+87552*i8+21888*j4+21888*k7+384*s6);
sfRe43 = _mm512_fmadd_ps(wfRe9, dfRe13, sfRe43);
sfRe43 = _mm512_mask3_fmadd_ps(wfIm9, dfIm13, sfRe43, 64764);
sfIm43 = _mm512_fmadd_ps(wfMx5, dfIm13, sfIm43);
sfIm43 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe13, sfIm43, 64764);
sfRe49 = _mm512_fmadd_ps(wfRe10, dfRe13, sfRe49);
sfRe49 = _mm512_mask3_fmadd_ps(wfIm10, dfIm13, sfRe49, 64764);
sfIm49 = _mm512_fmadd_ps(wfMx6, dfIm13, sfIm49);
sfIm49 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe13, sfIm49, 64764);
dfRe13 = _mm512_shuffle_f32x4(dfRe13, dfRe13, 78);
dfIm13 = _mm512_shuffle_f32x4(dfIm13, dfIm13, 78);
sfRe44 = _mm512_fmadd_ps(wfRe9, dfRe13, sfRe44);
sfRe44 = _mm512_mask3_fmadd_ps(wfIm9, dfIm13, sfRe44, 64764);
sfIm44 = _mm512_fmadd_ps(wfMx5, dfIm13, sfIm44);
sfIm44 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe13, sfIm44, 64764);
sfRe50 = _mm512_fmadd_ps(wfRe10, dfRe13, sfRe50);
sfRe50 = _mm512_mask3_fmadd_ps(wfIm10, dfIm13, sfRe50, 64764);
sfIm50 = _mm512_fmadd_ps(wfMx6, dfIm13, sfIm50);
sfIm50 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe13, sfIm50, 64764);
__m512 dfRe14 = _mm512_loadu_ps(dfPtr3+128+87552*i8+21888*j4+21888*k7+384*s6);
__m512 dfIm14 = _mm512_loadu_ps(dfPtr3+192+87552*i8+21888*j4+21888*k7+384*s6);
sfRe45 = _mm512_fmadd_ps(wfRe9, dfRe14, sfRe45);
sfRe45 = _mm512_mask3_fmadd_ps(wfIm9, dfIm14, sfRe45, 64764);
sfIm45 = _mm512_fmadd_ps(wfMx5, dfIm14, sfIm45);
sfIm45 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe14, sfIm45, 64764);
sfRe51 = _mm512_fmadd_ps(wfRe10, dfRe14, sfRe51);
sfRe51 = _mm512_mask3_fmadd_ps(wfIm10, dfIm14, sfRe51, 64764);
sfIm51 = _mm512_fmadd_ps(wfMx6, dfIm14, sfIm51);
sfIm51 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe14, sfIm51, 64764);
dfRe14 = _mm512_shuffle_f32x4(dfRe14, dfRe14, 78);
dfIm14 = _mm512_shuffle_f32x4(dfIm14, dfIm14, 78);
sfRe46 = _mm512_fmadd_ps(wfRe9, dfRe14, sfRe46);
sfRe46 = _mm512_mask3_fmadd_ps(wfIm9, dfIm14, sfRe46, 64764);
sfIm46 = _mm512_fmadd_ps(wfMx5, dfIm14, sfIm46);
sfIm46 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe14, sfIm46, 64764);
sfRe52 = _mm512_fmadd_ps(wfRe10, dfRe14, sfRe52);
sfRe52 = _mm512_mask3_fmadd_ps(wfIm10, dfIm14, sfRe52, 64764);
sfIm52 = _mm512_fmadd_ps(wfMx6, dfIm14, sfIm52);
sfIm52 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe14, sfIm52, 64764);
__m512 dfRe15 = _mm512_loadu_ps(dfPtr3+256+87552*i8+21888*j4+21888*k7+384*s6);
__m512 dfIm15 = _mm512_loadu_ps(dfPtr3+320+87552*i8+21888*j4+21888*k7+384*s6);
sfRe47 = _mm512_fmadd_ps(wfRe9, dfRe15, sfRe47);
sfRe47 = _mm512_mask3_fmadd_ps(wfIm9, dfIm15, sfRe47, 64764);
sfIm47 = _mm512_fmadd_ps(wfMx5, dfIm15, sfIm47);
sfIm47 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe15, sfIm47, 64764);
sfRe53 = _mm512_fmadd_ps(wfRe10, dfRe15, sfRe53);
sfRe53 = _mm512_mask3_fmadd_ps(wfIm10, dfIm15, sfRe53, 64764);
sfIm53 = _mm512_fmadd_ps(wfMx6, dfIm15, sfIm53);
sfIm53 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe15, sfIm53, 64764);
dfRe15 = _mm512_shuffle_f32x4(dfRe15, dfRe15, 78);
dfIm15 = _mm512_shuffle_f32x4(dfIm15, dfIm15, 78);
sfRe48 = _mm512_fmadd_ps(wfRe9, dfRe15, sfRe48);
sfRe48 = _mm512_mask3_fmadd_ps(wfIm9, dfIm15, sfRe48, 64764);
sfIm48 = _mm512_fmadd_ps(wfMx5, dfIm15, sfIm48);
sfIm48 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe15, sfIm48, 64764);
sfRe54 = _mm512_fmadd_ps(wfRe10, dfRe15, sfRe54);
sfRe54 = _mm512_mask3_fmadd_ps(wfIm10, dfIm15, sfRe54, 64764);
sfIm54 = _mm512_fmadd_ps(wfMx6, dfIm15, sfIm54);
sfIm54 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe15, sfIm54, 64764);
}
sfRe43 = _mm512_add_ps(sfRe43, _mm512_loadu_ps(sfPtr2+0+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm43 = _mm512_add_ps(sfIm43, _mm512_loadu_ps(sfPtr2+64+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe44 = _mm512_add_ps(sfRe44, _mm512_loadu_ps(sfPtr2+128+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm44 = _mm512_add_ps(sfIm44, _mm512_loadu_ps(sfPtr2+192+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe45 = _mm512_add_ps(sfRe45, _mm512_loadu_ps(sfPtr2+256+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm45 = _mm512_add_ps(sfIm45, _mm512_loadu_ps(sfPtr2+320+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe46 = _mm512_add_ps(sfRe46, _mm512_loadu_ps(sfPtr2+384+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm46 = _mm512_add_ps(sfIm46, _mm512_loadu_ps(sfPtr2+448+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe47 = _mm512_add_ps(sfRe47, _mm512_loadu_ps(sfPtr2+512+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm47 = _mm512_add_ps(sfIm47, _mm512_loadu_ps(sfPtr2+576+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe48 = _mm512_add_ps(sfRe48, _mm512_loadu_ps(sfPtr2+640+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm48 = _mm512_add_ps(sfIm48, _mm512_loadu_ps(sfPtr2+704+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe49 = _mm512_add_ps(sfRe49, _mm512_loadu_ps(sfPtr2+768+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm49 = _mm512_add_ps(sfIm49, _mm512_loadu_ps(sfPtr2+832+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe50 = _mm512_add_ps(sfRe50, _mm512_loadu_ps(sfPtr2+896+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm50 = _mm512_add_ps(sfIm50, _mm512_loadu_ps(sfPtr2+960+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe51 = _mm512_add_ps(sfRe51, _mm512_loadu_ps(sfPtr2+1024+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm51 = _mm512_add_ps(sfIm51, _mm512_loadu_ps(sfPtr2+1088+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe52 = _mm512_add_ps(sfRe52, _mm512_loadu_ps(sfPtr2+1152+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm52 = _mm512_add_ps(sfIm52, _mm512_loadu_ps(sfPtr2+1216+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe53 = _mm512_add_ps(sfRe53, _mm512_loadu_ps(sfPtr2+1280+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm53 = _mm512_add_ps(sfIm53, _mm512_loadu_ps(sfPtr2+1344+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe54 = _mm512_add_ps(sfRe54, _mm512_loadu_ps(sfPtr2+1408+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm54 = _mm512_add_ps(sfIm54, _mm512_loadu_ps(sfPtr2+1472+121344*i8+30336*j4+30336*k7+1536*l3));
_mm512_storeu_ps(sfPtr2+0+121344*i8+30336*j4+30336*k7+1536*l3, sfRe43);
_mm512_storeu_ps(sfPtr2+64+121344*i8+30336*j4+30336*k7+1536*l3, sfIm43);
_mm512_storeu_ps(sfPtr2+128+121344*i8+30336*j4+30336*k7+1536*l3, sfRe44);
_mm512_storeu_ps(sfPtr2+192+121344*i8+30336*j4+30336*k7+1536*l3, sfIm44);
_mm512_storeu_ps(sfPtr2+256+121344*i8+30336*j4+30336*k7+1536*l3, sfRe45);
_mm512_storeu_ps(sfPtr2+320+121344*i8+30336*j4+30336*k7+1536*l3, sfIm45);
_mm512_storeu_ps(sfPtr2+384+121344*i8+30336*j4+30336*k7+1536*l3, sfRe46);
_mm512_storeu_ps(sfPtr2+448+121344*i8+30336*j4+30336*k7+1536*l3, sfIm46);
_mm512_storeu_ps(sfPtr2+512+121344*i8+30336*j4+30336*k7+1536*l3, sfRe47);
_mm512_storeu_ps(sfPtr2+576+121344*i8+30336*j4+30336*k7+1536*l3, sfIm47);
_mm512_storeu_ps(sfPtr2+640+121344*i8+30336*j4+30336*k7+1536*l3, sfRe48);
_mm512_storeu_ps(sfPtr2+704+121344*i8+30336*j4+30336*k7+1536*l3, sfIm48);
_mm512_storeu_ps(sfPtr2+768+121344*i8+30336*j4+30336*k7+1536*l3, sfRe49);
_mm512_storeu_ps(sfPtr2+832+121344*i8+30336*j4+30336*k7+1536*l3, sfIm49);
_mm512_storeu_ps(sfPtr2+896+121344*i8+30336*j4+30336*k7+1536*l3, sfRe50);
_mm512_storeu_ps(sfPtr2+960+121344*i8+30336*j4+30336*k7+1536*l3, sfIm50);
_mm512_storeu_ps(sfPtr2+1024+121344*i8+30336*j4+30336*k7+1536*l3, sfRe51);
_mm512_storeu_ps(sfPtr2+1088+121344*i8+30336*j4+30336*k7+1536*l3, sfIm51);
_mm512_storeu_ps(sfPtr2+1152+121344*i8+30336*j4+30336*k7+1536*l3, sfRe52);
_mm512_storeu_ps(sfPtr2+1216+121344*i8+30336*j4+30336*k7+1536*l3, sfIm52);
_mm512_storeu_ps(sfPtr2+1280+121344*i8+30336*j4+30336*k7+1536*l3, sfRe53);
_mm512_storeu_ps(sfPtr2+1344+121344*i8+30336*j4+30336*k7+1536*l3, sfIm53);
_mm512_storeu_ps(sfPtr2+1408+121344*i8+30336*j4+30336*k7+1536*l3, sfRe54);
_mm512_storeu_ps(sfPtr2+1472+121344*i8+30336*j4+30336*k7+1536*l3, sfIm54);
if (l3 >= ll3) return;
}
__m512 sfRe55 = _mm512_setzero_ps();
__m512 sfIm55 = _mm512_setzero_ps();
__m512 sfRe61 = _mm512_setzero_ps();
__m512 sfIm61 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe56 = sfRe55;
__m512 sfIm56 = sfIm55;
__m512 sfRe57 = sfRe55;
__m512 sfIm57 = sfIm55;
__m512 sfRe58 = sfRe55;
__m512 sfIm58 = sfIm55;
__m512 sfRe59 = sfRe55;
__m512 sfIm59 = sfIm55;
__m512 sfRe60 = sfRe55;
__m512 sfIm60 = sfIm55;
__m512 sfRe62 = sfRe61;
__m512 sfIm62 = sfIm61;
__m512 sfRe63 = sfRe61;
__m512 sfIm63 = sfIm61;
for (ptrdiff_t s7 = 0; s7 < 57; ++s7) {
__m512i wfLd11 = _mm512_loadu_si512(wfPtr3+0+583680*i8+145920*j4+7296*l3+128*s7);
__m512 wfRe11 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd11));
__m512 wfIm11 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd11, 1));
__m512 wfMx7 = _mm512_mask_mov_ps(wfIm11, 64764, wfRe11);
__m512i wfLd12 = _mm512_loadu_si512(wfPtr3+64+583680*i8+145920*j4+7296*l3+128*s7);
__m512 wfRe12 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd12));
__m512 wfIm12 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd12, 1));
__m512 wfMx8 = _mm512_mask_mov_ps(wfIm12, 64764, wfRe12);
__m512 dfRe16 = _mm512_loadu_ps(dfPtr3+0+87552*i8+21888*j4+21888*k7+384*s7);
__m512 dfIm16 = _mm512_loadu_ps(dfPtr3+64+87552*i8+21888*j4+21888*k7+384*s7);
sfRe55 = _mm512_fmadd_ps(wfRe11, dfRe16, sfRe55);
sfRe55 = _mm512_mask3_fmadd_ps(wfIm11, dfIm16, sfRe55, 64764);
sfIm55 = _mm512_fmadd_ps(wfMx7, dfIm16, sfIm55);
sfIm55 = _mm512_mask3_fnmadd_ps(wfIm11, dfRe16, sfIm55, 64764);
sfRe61 = _mm512_fmadd_ps(wfRe12, dfRe16, sfRe61);
sfRe61 = _mm512_mask3_fmadd_ps(wfIm12, dfIm16, sfRe61, 64764);
sfIm61 = _mm512_fmadd_ps(wfMx8, dfIm16, sfIm61);
sfIm61 = _mm512_mask3_fnmadd_ps(wfIm12, dfRe16, sfIm61, 64764);
dfRe16 = _mm512_shuffle_f32x4(dfRe16, dfRe16, 78);
dfIm16 = _mm512_shuffle_f32x4(dfIm16, dfIm16, 78);
sfRe56 = _mm512_fmadd_ps(wfRe11, dfRe16, sfRe56);
sfRe56 = _mm512_mask3_fmadd_ps(wfIm11, dfIm16, sfRe56, 64764);
sfIm56 = _mm512_fmadd_ps(wfMx7, dfIm16, sfIm56);
sfIm56 = _mm512_mask3_fnmadd_ps(wfIm11, dfRe16, sfIm56, 64764);
__m512 dfRe17 = _mm512_loadu_ps(dfPtr3+128+87552*i8+21888*j4+21888*k7+384*s7);
__m512 dfIm17 = _mm512_loadu_ps(dfPtr3+192+87552*i8+21888*j4+21888*k7+384*s7);
sfRe57 = _mm512_fmadd_ps(wfRe11, dfRe17, sfRe57);
sfRe57 = _mm512_mask3_fmadd_ps(wfIm11, dfIm17, sfRe57, 64764);
sfIm57 = _mm512_fmadd_ps(wfMx7, dfIm17, sfIm57);
sfIm57 = _mm512_mask3_fnmadd_ps(wfIm11, dfRe17, sfIm57, 64764);
sfRe62 = _mm512_fmadd_ps(wfRe12, dfRe17, sfRe62);
sfRe62 = _mm512_mask3_fmadd_ps(wfIm12, dfIm17, sfRe62, 64764);
sfIm62 = _mm512_fmadd_ps(wfMx8, dfIm17, sfIm62);
sfIm62 = _mm512_mask3_fnmadd_ps(wfIm12, dfRe17, sfIm62, 64764);
dfRe17 = _mm512_shuffle_f32x4(dfRe17, dfRe17, 78);
dfIm17 = _mm512_shuffle_f32x4(dfIm17, dfIm17, 78);
sfRe58 = _mm512_fmadd_ps(wfRe11, dfRe17, sfRe58);
sfRe58 = _mm512_mask3_fmadd_ps(wfIm11, dfIm17, sfRe58, 64764);
sfIm58 = _mm512_fmadd_ps(wfMx7, dfIm17, sfIm58);
sfIm58 = _mm512_mask3_fnmadd_ps(wfIm11, dfRe17, sfIm58, 64764);
__m512 dfRe18 = _mm512_loadu_ps(dfPtr3+256+87552*i8+21888*j4+21888*k7+384*s7);
__m512 dfIm18 = _mm512_loadu_ps(dfPtr3+320+87552*i8+21888*j4+21888*k7+384*s7);
sfRe59 = _mm512_fmadd_ps(wfRe11, dfRe18, sfRe59);
sfRe59 = _mm512_mask3_fmadd_ps(wfIm11, dfIm18, sfRe59, 64764);
sfIm59 = _mm512_fmadd_ps(wfMx7, dfIm18, sfIm59);
sfIm59 = _mm512_mask3_fnmadd_ps(wfIm11, dfRe18, sfIm59, 64764);
sfRe63 = _mm512_fmadd_ps(wfRe12, dfRe18, sfRe63);
sfRe63 = _mm512_mask3_fmadd_ps(wfIm12, dfIm18, sfRe63, 64764);
sfIm63 = _mm512_fmadd_ps(wfMx8, dfIm18, sfIm63);
sfIm63 = _mm512_mask3_fnmadd_ps(wfIm12, dfRe18, sfIm63, 64764);
dfRe18 = _mm512_shuffle_f32x4(dfRe18, dfRe18, 78);
dfIm18 = _mm512_shuffle_f32x4(dfIm18, dfIm18, 78);
sfRe60 = _mm512_fmadd_ps(wfRe11, dfRe18, sfRe60);
sfRe60 = _mm512_mask3_fmadd_ps(wfIm11, dfIm18, sfRe60, 64764);
sfIm60 = _mm512_fmadd_ps(wfMx7, dfIm18, sfIm60);
sfIm60 = _mm512_mask3_fnmadd_ps(wfIm11, dfRe18, sfIm60, 64764);
}
sfRe55 = _mm512_add_ps(sfRe55, _mm512_loadu_ps(sfPtr2+0+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm55 = _mm512_add_ps(sfIm55, _mm512_loadu_ps(sfPtr2+64+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe56 = _mm512_add_ps(sfRe56, _mm512_loadu_ps(sfPtr2+128+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm56 = _mm512_add_ps(sfIm56, _mm512_loadu_ps(sfPtr2+192+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe57 = _mm512_add_ps(sfRe57, _mm512_loadu_ps(sfPtr2+256+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm57 = _mm512_add_ps(sfIm57, _mm512_loadu_ps(sfPtr2+320+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe58 = _mm512_add_ps(sfRe58, _mm512_loadu_ps(sfPtr2+384+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm58 = _mm512_add_ps(sfIm58, _mm512_loadu_ps(sfPtr2+448+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe59 = _mm512_add_ps(sfRe59, _mm512_loadu_ps(sfPtr2+512+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm59 = _mm512_add_ps(sfIm59, _mm512_loadu_ps(sfPtr2+576+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe60 = _mm512_add_ps(sfRe60, _mm512_loadu_ps(sfPtr2+640+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm60 = _mm512_add_ps(sfIm60, _mm512_loadu_ps(sfPtr2+704+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe61 = _mm512_add_ps(sfRe61, _mm512_loadu_ps(sfPtr2+768+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm61 = _mm512_add_ps(sfIm61, _mm512_loadu_ps(sfPtr2+832+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe62 = _mm512_add_ps(sfRe62, _mm512_loadu_ps(sfPtr2+896+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm62 = _mm512_add_ps(sfIm62, _mm512_loadu_ps(sfPtr2+960+121344*i8+30336*j4+30336*k7+1536*l3));
sfRe63 = _mm512_add_ps(sfRe63, _mm512_loadu_ps(sfPtr2+1024+121344*i8+30336*j4+30336*k7+1536*l3));
sfIm63 = _mm512_add_ps(sfIm63, _mm512_loadu_ps(sfPtr2+1088+121344*i8+30336*j4+30336*k7+1536*l3));
_mm512_storeu_ps(sfPtr2+0+121344*i8+30336*j4+30336*k7+1536*l3, sfRe55);
_mm512_storeu_ps(sfPtr2+64+121344*i8+30336*j4+30336*k7+1536*l3, sfIm55);
_mm512_storeu_ps(sfPtr2+128+121344*i8+30336*j4+30336*k7+1536*l3, sfRe56);
_mm512_storeu_ps(sfPtr2+192+121344*i8+30336*j4+30336*k7+1536*l3, sfIm56);
_mm512_storeu_ps(sfPtr2+256+121344*i8+30336*j4+30336*k7+1536*l3, sfRe57);
_mm512_storeu_ps(sfPtr2+320+121344*i8+30336*j4+30336*k7+1536*l3, sfIm57);
_mm512_storeu_ps(sfPtr2+384+121344*i8+30336*j4+30336*k7+1536*l3, sfRe58);
_mm512_storeu_ps(sfPtr2+448+121344*i8+30336*j4+30336*k7+1536*l3, sfIm58);
_mm512_storeu_ps(sfPtr2+512+121344*i8+30336*j4+30336*k7+1536*l3, sfRe59);
_mm512_storeu_ps(sfPtr2+576+121344*i8+30336*j4+30336*k7+1536*l3, sfIm59);
_mm512_storeu_ps(sfPtr2+640+121344*i8+30336*j4+30336*k7+1536*l3, sfRe60);
_mm512_storeu_ps(sfPtr2+704+121344*i8+30336*j4+30336*k7+1536*l3, sfIm60);
_mm512_storeu_ps(sfPtr2+768+121344*i8+30336*j4+30336*k7+1536*l3, sfRe61);
_mm512_storeu_ps(sfPtr2+832+121344*i8+30336*j4+30336*k7+1536*l3, sfIm61);
_mm512_storeu_ps(sfPtr2+896+121344*i8+30336*j4+30336*k7+1536*l3, sfRe62);
_mm512_storeu_ps(sfPtr2+960+121344*i8+30336*j4+30336*k7+1536*l3, sfIm62);
_mm512_storeu_ps(sfPtr2+1024+121344*i8+30336*j4+30336*k7+1536*l3, sfRe63);
_mm512_storeu_ps(sfPtr2+1088+121344*i8+30336*j4+30336*k7+1536*l3, sfIm63);
}
j4 = 1;
}
for (; j4 <= jj3; ++j4) {
ptrdiff_t k8 = 1*d1;
for (; k8 != 1; ++k8) {
ptrdiff_t l4 = 5*w2;
ptrdiff_t ll4 = l4+4;
for (; l4 != 19; ++l4) {
__m512 sfRe64 = _mm512_setzero_ps();
__m512 sfIm64 = _mm512_setzero_ps();
__m512 sfRe70 = _mm512_setzero_ps();
__m512 sfIm70 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe65 = sfRe64;
__m512 sfIm65 = sfIm64;
__m512 sfRe66 = sfRe64;
__m512 sfIm66 = sfIm64;
__m512 sfRe67 = sfRe64;
__m512 sfIm67 = sfIm64;
__m512 sfRe68 = sfRe64;
__m512 sfIm68 = sfIm64;
__m512 sfRe69 = sfRe64;
__m512 sfIm69 = sfIm64;
__m512 sfRe71 = sfRe70;
__m512 sfIm71 = sfIm70;
__m512 sfRe72 = sfRe70;
__m512 sfIm72 = sfIm70;
__m512 sfRe73 = sfRe70;
__m512 sfIm73 = sfIm70;
__m512 sfRe74 = sfRe70;
__m512 sfIm74 = sfIm70;
__m512 sfRe75 = sfRe70;
__m512 sfIm75 = sfIm70;
for (ptrdiff_t s8 = 0; s8 < 57; ++s8) {
__m512i wfLd13 = _mm512_loadu_si512(wfPtr3+0+583680*i8+145920*j4+7296*l4+128*s8);
__m512 wfRe13 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd13));
__m512 wfIm13 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd13, 1));
__m512i wfLd14 = _mm512_loadu_si512(wfPtr3+64+583680*i8+145920*j4+7296*l4+128*s8);
__m512 wfRe14 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd14));
__m512 wfIm14 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd14, 1));
__m512 dfRe19 = _mm512_loadu_ps(dfPtr3+0+87552*i8+21888*j4+21888*k8+384*s8);
__m512 dfIm19 = _mm512_loadu_ps(dfPtr3+64+87552*i8+21888*j4+21888*k8+384*s8);
sfRe64 = _mm512_fmadd_ps(wfRe13, dfRe19, sfRe64);
sfRe64 = _mm512_fmadd_ps(wfIm13, dfIm19, sfRe64);
sfIm64 = _mm512_fmadd_ps(wfRe13, dfIm19, sfIm64);
sfIm64 = _mm512_fnmadd_ps(wfIm13, dfRe19, sfIm64);
sfRe70 = _mm512_fmadd_ps(wfRe14, dfRe19, sfRe70);
sfRe70 = _mm512_fmadd_ps(wfIm14, dfIm19, sfRe70);
sfIm70 = _mm512_fmadd_ps(wfRe14, dfIm19, sfIm70);
sfIm70 = _mm512_fnmadd_ps(wfIm14, dfRe19, sfIm70);
dfRe19 = _mm512_shuffle_f32x4(dfRe19, dfRe19, 78);
dfIm19 = _mm512_shuffle_f32x4(dfIm19, dfIm19, 78);
sfRe65 = _mm512_fmadd_ps(wfRe13, dfRe19, sfRe65);
sfRe65 = _mm512_fmadd_ps(wfIm13, dfIm19, sfRe65);
sfIm65 = _mm512_fmadd_ps(wfRe13, dfIm19, sfIm65);
sfIm65 = _mm512_fnmadd_ps(wfIm13, dfRe19, sfIm65);
sfRe71 = _mm512_fmadd_ps(wfRe14, dfRe19, sfRe71);
sfRe71 = _mm512_fmadd_ps(wfIm14, dfIm19, sfRe71);
sfIm71 = _mm512_fmadd_ps(wfRe14, dfIm19, sfIm71);
sfIm71 = _mm512_fnmadd_ps(wfIm14, dfRe19, sfIm71);
__m512 dfRe20 = _mm512_loadu_ps(dfPtr3+128+87552*i8+21888*j4+21888*k8+384*s8);
__m512 dfIm20 = _mm512_loadu_ps(dfPtr3+192+87552*i8+21888*j4+21888*k8+384*s8);
sfRe66 = _mm512_fmadd_ps(wfRe13, dfRe20, sfRe66);
sfRe66 = _mm512_fmadd_ps(wfIm13, dfIm20, sfRe66);
sfIm66 = _mm512_fmadd_ps(wfRe13, dfIm20, sfIm66);
sfIm66 = _mm512_fnmadd_ps(wfIm13, dfRe20, sfIm66);
sfRe72 = _mm512_fmadd_ps(wfRe14, dfRe20, sfRe72);
sfRe72 = _mm512_fmadd_ps(wfIm14, dfIm20, sfRe72);
sfIm72 = _mm512_fmadd_ps(wfRe14, dfIm20, sfIm72);
sfIm72 = _mm512_fnmadd_ps(wfIm14, dfRe20, sfIm72);
dfRe20 = _mm512_shuffle_f32x4(dfRe20, dfRe20, 78);
dfIm20 = _mm512_shuffle_f32x4(dfIm20, dfIm20, 78);
sfRe67 = _mm512_fmadd_ps(wfRe13, dfRe20, sfRe67);
sfRe67 = _mm512_fmadd_ps(wfIm13, dfIm20, sfRe67);
sfIm67 = _mm512_fmadd_ps(wfRe13, dfIm20, sfIm67);
sfIm67 = _mm512_fnmadd_ps(wfIm13, dfRe20, sfIm67);
sfRe73 = _mm512_fmadd_ps(wfRe14, dfRe20, sfRe73);
sfRe73 = _mm512_fmadd_ps(wfIm14, dfIm20, sfRe73);
sfIm73 = _mm512_fmadd_ps(wfRe14, dfIm20, sfIm73);
sfIm73 = _mm512_fnmadd_ps(wfIm14, dfRe20, sfIm73);
__m512 dfRe21 = _mm512_loadu_ps(dfPtr3+256+87552*i8+21888*j4+21888*k8+384*s8);
__m512 dfIm21 = _mm512_loadu_ps(dfPtr3+320+87552*i8+21888*j4+21888*k8+384*s8);
sfRe68 = _mm512_fmadd_ps(wfRe13, dfRe21, sfRe68);
sfRe68 = _mm512_fmadd_ps(wfIm13, dfIm21, sfRe68);
sfIm68 = _mm512_fmadd_ps(wfRe13, dfIm21, sfIm68);
sfIm68 = _mm512_fnmadd_ps(wfIm13, dfRe21, sfIm68);
sfRe74 = _mm512_fmadd_ps(wfRe14, dfRe21, sfRe74);
sfRe74 = _mm512_fmadd_ps(wfIm14, dfIm21, sfRe74);
sfIm74 = _mm512_fmadd_ps(wfRe14, dfIm21, sfIm74);
sfIm74 = _mm512_fnmadd_ps(wfIm14, dfRe21, sfIm74);
dfRe21 = _mm512_shuffle_f32x4(dfRe21, dfRe21, 78);
dfIm21 = _mm512_shuffle_f32x4(dfIm21, dfIm21, 78);
sfRe69 = _mm512_fmadd_ps(wfRe13, dfRe21, sfRe69);
sfRe69 = _mm512_fmadd_ps(wfIm13, dfIm21, sfRe69);
sfIm69 = _mm512_fmadd_ps(wfRe13, dfIm21, sfIm69);
sfIm69 = _mm512_fnmadd_ps(wfIm13, dfRe21, sfIm69);
sfRe75 = _mm512_fmadd_ps(wfRe14, dfRe21, sfRe75);
sfRe75 = _mm512_fmadd_ps(wfIm14, dfIm21, sfRe75);
sfIm75 = _mm512_fmadd_ps(wfRe14, dfIm21, sfIm75);
sfIm75 = _mm512_fnmadd_ps(wfIm14, dfRe21, sfIm75);
}
sfRe64 = _mm512_add_ps(sfRe64, _mm512_loadu_ps(sfPtr2+0+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm64 = _mm512_add_ps(sfIm64, _mm512_loadu_ps(sfPtr2+64+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe65 = _mm512_add_ps(sfRe65, _mm512_loadu_ps(sfPtr2+128+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm65 = _mm512_add_ps(sfIm65, _mm512_loadu_ps(sfPtr2+192+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe66 = _mm512_add_ps(sfRe66, _mm512_loadu_ps(sfPtr2+256+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm66 = _mm512_add_ps(sfIm66, _mm512_loadu_ps(sfPtr2+320+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe67 = _mm512_add_ps(sfRe67, _mm512_loadu_ps(sfPtr2+384+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm67 = _mm512_add_ps(sfIm67, _mm512_loadu_ps(sfPtr2+448+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe68 = _mm512_add_ps(sfRe68, _mm512_loadu_ps(sfPtr2+512+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm68 = _mm512_add_ps(sfIm68, _mm512_loadu_ps(sfPtr2+576+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe69 = _mm512_add_ps(sfRe69, _mm512_loadu_ps(sfPtr2+640+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm69 = _mm512_add_ps(sfIm69, _mm512_loadu_ps(sfPtr2+704+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe70 = _mm512_add_ps(sfRe70, _mm512_loadu_ps(sfPtr2+768+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm70 = _mm512_add_ps(sfIm70, _mm512_loadu_ps(sfPtr2+832+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe71 = _mm512_add_ps(sfRe71, _mm512_loadu_ps(sfPtr2+896+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm71 = _mm512_add_ps(sfIm71, _mm512_loadu_ps(sfPtr2+960+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe72 = _mm512_add_ps(sfRe72, _mm512_loadu_ps(sfPtr2+1024+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm72 = _mm512_add_ps(sfIm72, _mm512_loadu_ps(sfPtr2+1088+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe73 = _mm512_add_ps(sfRe73, _mm512_loadu_ps(sfPtr2+1152+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm73 = _mm512_add_ps(sfIm73, _mm512_loadu_ps(sfPtr2+1216+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe74 = _mm512_add_ps(sfRe74, _mm512_loadu_ps(sfPtr2+1280+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm74 = _mm512_add_ps(sfIm74, _mm512_loadu_ps(sfPtr2+1344+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe75 = _mm512_add_ps(sfRe75, _mm512_loadu_ps(sfPtr2+1408+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm75 = _mm512_add_ps(sfIm75, _mm512_loadu_ps(sfPtr2+1472+121344*i8+30336*j4+30336*k8+1536*l4));
_mm512_storeu_ps(sfPtr2+0+121344*i8+30336*j4+30336*k8+1536*l4, sfRe64);
_mm512_storeu_ps(sfPtr2+64+121344*i8+30336*j4+30336*k8+1536*l4, sfIm64);
_mm512_storeu_ps(sfPtr2+128+121344*i8+30336*j4+30336*k8+1536*l4, sfRe65);
_mm512_storeu_ps(sfPtr2+192+121344*i8+30336*j4+30336*k8+1536*l4, sfIm65);
_mm512_storeu_ps(sfPtr2+256+121344*i8+30336*j4+30336*k8+1536*l4, sfRe66);
_mm512_storeu_ps(sfPtr2+320+121344*i8+30336*j4+30336*k8+1536*l4, sfIm66);
_mm512_storeu_ps(sfPtr2+384+121344*i8+30336*j4+30336*k8+1536*l4, sfRe67);
_mm512_storeu_ps(sfPtr2+448+121344*i8+30336*j4+30336*k8+1536*l4, sfIm67);
_mm512_storeu_ps(sfPtr2+512+121344*i8+30336*j4+30336*k8+1536*l4, sfRe68);
_mm512_storeu_ps(sfPtr2+576+121344*i8+30336*j4+30336*k8+1536*l4, sfIm68);
_mm512_storeu_ps(sfPtr2+640+121344*i8+30336*j4+30336*k8+1536*l4, sfRe69);
_mm512_storeu_ps(sfPtr2+704+121344*i8+30336*j4+30336*k8+1536*l4, sfIm69);
_mm512_storeu_ps(sfPtr2+768+121344*i8+30336*j4+30336*k8+1536*l4, sfRe70);
_mm512_storeu_ps(sfPtr2+832+121344*i8+30336*j4+30336*k8+1536*l4, sfIm70);
_mm512_storeu_ps(sfPtr2+896+121344*i8+30336*j4+30336*k8+1536*l4, sfRe71);
_mm512_storeu_ps(sfPtr2+960+121344*i8+30336*j4+30336*k8+1536*l4, sfIm71);
_mm512_storeu_ps(sfPtr2+1024+121344*i8+30336*j4+30336*k8+1536*l4, sfRe72);
_mm512_storeu_ps(sfPtr2+1088+121344*i8+30336*j4+30336*k8+1536*l4, sfIm72);
_mm512_storeu_ps(sfPtr2+1152+121344*i8+30336*j4+30336*k8+1536*l4, sfRe73);
_mm512_storeu_ps(sfPtr2+1216+121344*i8+30336*j4+30336*k8+1536*l4, sfIm73);
_mm512_storeu_ps(sfPtr2+1280+121344*i8+30336*j4+30336*k8+1536*l4, sfRe74);
_mm512_storeu_ps(sfPtr2+1344+121344*i8+30336*j4+30336*k8+1536*l4, sfIm74);
_mm512_storeu_ps(sfPtr2+1408+121344*i8+30336*j4+30336*k8+1536*l4, sfRe75);
_mm512_storeu_ps(sfPtr2+1472+121344*i8+30336*j4+30336*k8+1536*l4, sfIm75);
if (l4 >= ll4) return;
}
__m512 sfRe76 = _mm512_setzero_ps();
__m512 sfIm76 = _mm512_setzero_ps();
__m512 sfRe82 = _mm512_setzero_ps();
__m512 sfIm82 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe77 = sfRe76;
__m512 sfIm77 = sfIm76;
__m512 sfRe78 = sfRe76;
__m512 sfIm78 = sfIm76;
__m512 sfRe79 = sfRe76;
__m512 sfIm79 = sfIm76;
__m512 sfRe80 = sfRe76;
__m512 sfIm80 = sfIm76;
__m512 sfRe81 = sfRe76;
__m512 sfIm81 = sfIm76;
__m512 sfRe83 = sfRe82;
__m512 sfIm83 = sfIm82;
__m512 sfRe84 = sfRe82;
__m512 sfIm84 = sfIm82;
for (ptrdiff_t s9 = 0; s9 < 57; ++s9) {
__m512i wfLd15 = _mm512_loadu_si512(wfPtr3+0+583680*i8+145920*j4+7296*l4+128*s9);
__m512 wfRe15 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd15));
__m512 wfIm15 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd15, 1));
__m512i wfLd16 = _mm512_loadu_si512(wfPtr3+64+583680*i8+145920*j4+7296*l4+128*s9);
__m512 wfRe16 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd16));
__m512 wfIm16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd16, 1));
__m512 dfRe22 = _mm512_loadu_ps(dfPtr3+0+87552*i8+21888*j4+21888*k8+384*s9);
__m512 dfIm22 = _mm512_loadu_ps(dfPtr3+64+87552*i8+21888*j4+21888*k8+384*s9);
sfRe76 = _mm512_fmadd_ps(wfRe15, dfRe22, sfRe76);
sfRe76 = _mm512_fmadd_ps(wfIm15, dfIm22, sfRe76);
sfIm76 = _mm512_fmadd_ps(wfRe15, dfIm22, sfIm76);
sfIm76 = _mm512_fnmadd_ps(wfIm15, dfRe22, sfIm76);
sfRe82 = _mm512_fmadd_ps(wfRe16, dfRe22, sfRe82);
sfRe82 = _mm512_fmadd_ps(wfIm16, dfIm22, sfRe82);
sfIm82 = _mm512_fmadd_ps(wfRe16, dfIm22, sfIm82);
sfIm82 = _mm512_fnmadd_ps(wfIm16, dfRe22, sfIm82);
dfRe22 = _mm512_shuffle_f32x4(dfRe22, dfRe22, 78);
dfIm22 = _mm512_shuffle_f32x4(dfIm22, dfIm22, 78);
sfRe77 = _mm512_fmadd_ps(wfRe15, dfRe22, sfRe77);
sfRe77 = _mm512_fmadd_ps(wfIm15, dfIm22, sfRe77);
sfIm77 = _mm512_fmadd_ps(wfRe15, dfIm22, sfIm77);
sfIm77 = _mm512_fnmadd_ps(wfIm15, dfRe22, sfIm77);
__m512 dfRe23 = _mm512_loadu_ps(dfPtr3+128+87552*i8+21888*j4+21888*k8+384*s9);
__m512 dfIm23 = _mm512_loadu_ps(dfPtr3+192+87552*i8+21888*j4+21888*k8+384*s9);
sfRe78 = _mm512_fmadd_ps(wfRe15, dfRe23, sfRe78);
sfRe78 = _mm512_fmadd_ps(wfIm15, dfIm23, sfRe78);
sfIm78 = _mm512_fmadd_ps(wfRe15, dfIm23, sfIm78);
sfIm78 = _mm512_fnmadd_ps(wfIm15, dfRe23, sfIm78);
sfRe83 = _mm512_fmadd_ps(wfRe16, dfRe23, sfRe83);
sfRe83 = _mm512_fmadd_ps(wfIm16, dfIm23, sfRe83);
sfIm83 = _mm512_fmadd_ps(wfRe16, dfIm23, sfIm83);
sfIm83 = _mm512_fnmadd_ps(wfIm16, dfRe23, sfIm83);
dfRe23 = _mm512_shuffle_f32x4(dfRe23, dfRe23, 78);
dfIm23 = _mm512_shuffle_f32x4(dfIm23, dfIm23, 78);
sfRe79 = _mm512_fmadd_ps(wfRe15, dfRe23, sfRe79);
sfRe79 = _mm512_fmadd_ps(wfIm15, dfIm23, sfRe79);
sfIm79 = _mm512_fmadd_ps(wfRe15, dfIm23, sfIm79);
sfIm79 = _mm512_fnmadd_ps(wfIm15, dfRe23, sfIm79);
__m512 dfRe24 = _mm512_loadu_ps(dfPtr3+256+87552*i8+21888*j4+21888*k8+384*s9);
__m512 dfIm24 = _mm512_loadu_ps(dfPtr3+320+87552*i8+21888*j4+21888*k8+384*s9);
sfRe80 = _mm512_fmadd_ps(wfRe15, dfRe24, sfRe80);
sfRe80 = _mm512_fmadd_ps(wfIm15, dfIm24, sfRe80);
sfIm80 = _mm512_fmadd_ps(wfRe15, dfIm24, sfIm80);
sfIm80 = _mm512_fnmadd_ps(wfIm15, dfRe24, sfIm80);
sfRe84 = _mm512_fmadd_ps(wfRe16, dfRe24, sfRe84);
sfRe84 = _mm512_fmadd_ps(wfIm16, dfIm24, sfRe84);
sfIm84 = _mm512_fmadd_ps(wfRe16, dfIm24, sfIm84);
sfIm84 = _mm512_fnmadd_ps(wfIm16, dfRe24, sfIm84);
dfRe24 = _mm512_shuffle_f32x4(dfRe24, dfRe24, 78);
dfIm24 = _mm512_shuffle_f32x4(dfIm24, dfIm24, 78);
sfRe81 = _mm512_fmadd_ps(wfRe15, dfRe24, sfRe81);
sfRe81 = _mm512_fmadd_ps(wfIm15, dfIm24, sfRe81);
sfIm81 = _mm512_fmadd_ps(wfRe15, dfIm24, sfIm81);
sfIm81 = _mm512_fnmadd_ps(wfIm15, dfRe24, sfIm81);
}
sfRe76 = _mm512_add_ps(sfRe76, _mm512_loadu_ps(sfPtr2+0+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm76 = _mm512_add_ps(sfIm76, _mm512_loadu_ps(sfPtr2+64+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe77 = _mm512_add_ps(sfRe77, _mm512_loadu_ps(sfPtr2+128+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm77 = _mm512_add_ps(sfIm77, _mm512_loadu_ps(sfPtr2+192+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe78 = _mm512_add_ps(sfRe78, _mm512_loadu_ps(sfPtr2+256+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm78 = _mm512_add_ps(sfIm78, _mm512_loadu_ps(sfPtr2+320+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe79 = _mm512_add_ps(sfRe79, _mm512_loadu_ps(sfPtr2+384+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm79 = _mm512_add_ps(sfIm79, _mm512_loadu_ps(sfPtr2+448+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe80 = _mm512_add_ps(sfRe80, _mm512_loadu_ps(sfPtr2+512+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm80 = _mm512_add_ps(sfIm80, _mm512_loadu_ps(sfPtr2+576+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe81 = _mm512_add_ps(sfRe81, _mm512_loadu_ps(sfPtr2+640+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm81 = _mm512_add_ps(sfIm81, _mm512_loadu_ps(sfPtr2+704+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe82 = _mm512_add_ps(sfRe82, _mm512_loadu_ps(sfPtr2+768+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm82 = _mm512_add_ps(sfIm82, _mm512_loadu_ps(sfPtr2+832+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe83 = _mm512_add_ps(sfRe83, _mm512_loadu_ps(sfPtr2+896+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm83 = _mm512_add_ps(sfIm83, _mm512_loadu_ps(sfPtr2+960+121344*i8+30336*j4+30336*k8+1536*l4));
sfRe84 = _mm512_add_ps(sfRe84, _mm512_loadu_ps(sfPtr2+1024+121344*i8+30336*j4+30336*k8+1536*l4));
sfIm84 = _mm512_add_ps(sfIm84, _mm512_loadu_ps(sfPtr2+1088+121344*i8+30336*j4+30336*k8+1536*l4));
_mm512_storeu_ps(sfPtr2+0+121344*i8+30336*j4+30336*k8+1536*l4, sfRe76);
_mm512_storeu_ps(sfPtr2+64+121344*i8+30336*j4+30336*k8+1536*l4, sfIm76);
_mm512_storeu_ps(sfPtr2+128+121344*i8+30336*j4+30336*k8+1536*l4, sfRe77);
_mm512_storeu_ps(sfPtr2+192+121344*i8+30336*j4+30336*k8+1536*l4, sfIm77);
_mm512_storeu_ps(sfPtr2+256+121344*i8+30336*j4+30336*k8+1536*l4, sfRe78);
_mm512_storeu_ps(sfPtr2+320+121344*i8+30336*j4+30336*k8+1536*l4, sfIm78);
_mm512_storeu_ps(sfPtr2+384+121344*i8+30336*j4+30336*k8+1536*l4, sfRe79);
_mm512_storeu_ps(sfPtr2+448+121344*i8+30336*j4+30336*k8+1536*l4, sfIm79);
_mm512_storeu_ps(sfPtr2+512+121344*i8+30336*j4+30336*k8+1536*l4, sfRe80);
_mm512_storeu_ps(sfPtr2+576+121344*i8+30336*j4+30336*k8+1536*l4, sfIm80);
_mm512_storeu_ps(sfPtr2+640+121344*i8+30336*j4+30336*k8+1536*l4, sfRe81);
_mm512_storeu_ps(sfPtr2+704+121344*i8+30336*j4+30336*k8+1536*l4, sfIm81);
_mm512_storeu_ps(sfPtr2+768+121344*i8+30336*j4+30336*k8+1536*l4, sfRe82);
_mm512_storeu_ps(sfPtr2+832+121344*i8+30336*j4+30336*k8+1536*l4, sfIm82);
_mm512_storeu_ps(sfPtr2+896+121344*i8+30336*j4+30336*k8+1536*l4, sfRe83);
_mm512_storeu_ps(sfPtr2+960+121344*i8+30336*j4+30336*k8+1536*l4, sfIm83);
_mm512_storeu_ps(sfPtr2+1024+121344*i8+30336*j4+30336*k8+1536*l4, sfRe84);
_mm512_storeu_ps(sfPtr2+1088+121344*i8+30336*j4+30336*k8+1536*l4, sfIm84);
}
}
}

// Example9StriderProduceSums1 dispatches Example9StriderProduceSums1Callee1
// through Example9ThreaderDo1, once for every (outer, inner) index pair.
//
// team16:   thread team, handed unchanged to Example9ThreaderDo1.
// tensors5: tensor pointer array, forwarded to the callee in argument slot 0.
//
// Each dispatch describes a 4-dimensional task with hull extents {4, 1, 4, 5}.
// The callee receives a three-slot argument tuple through task.any1:
//   slot 0: tensors5
//   slot 1: outer loop index (only the value 0 occurs, the loop runs once)
//   slot 2: inner loop index (0 through 3)
// NOTE(review): what the two indices and the four hull dimensions stand for
// is decided by the callee and is not visible from this function.
// NOTE(review): the tuple and the task live on this function's stack and the
// tuple is rewritten between dispatches, so Example9ThreaderDo1 presumably
// does not return until the callee is done with them -- confirm in the
// threader implementation.
static void Example9StriderProduceSums1(Example9ThreaderTeam1* team16, char** tensors5) {
    enum { kHullDims = 4 };
    static const int kHullExtent[kHullDims] = {4, 1, 4, 5};

    void* calleeArgs[3];
    calleeArgs[0] = tensors5;

    for (ptrdiff_t outer = 0; outer < 1; ++outer) {
        // Loop indices travel to the callee as pointer-sized integers.
        calleeArgs[1] = (void*)outer;

        for (ptrdiff_t inner = 0; inner < 4; ++inner) {
            calleeArgs[2] = (void*)inner;

            // A fresh task descriptor is filled in for every dispatch.
            Example9ThreaderTask1 job;
            for (int axis = 0; axis < kHullDims; ++axis) {
                job.hull1[axis] = kHullExtent[axis];
            }
            job.nd1 = kHullDims;
            job.any1 = calleeArgs;
            job.callee1 = Example9StriderProduceSums1Callee1;

            Example9ThreaderDo1(team16, &job);
        }
    }
}

static void Example9StriderConsumeSums1Callee1(Example9ThreaderTask1* task10, int64_t* pt10) {
char** tensors8 = task10->any1;
ptrdiff_t w3 = 0;
ptrdiff_t d2 = 0;
ptrdiff_t g5 = pt10[2];
char*restrict sfPtr3 = tensors8[0];
char*restrict datPtr2 = tensors8[1];
ptrdiff_t i9 = 2*g5;
ptrdiff_t ii1 = i9+(g5 < 1 ? 1 : 2);
for (; i9 <= ii1; ++i9) {
ptrdiff_t j5 = 1*d2;
ptrdiff_t rel2 = j5-0;
ptrdiff_t base2 = 0;
ptrdiff_t toH1 = base2+0;
ptrdiff_t toW1 = 0;
ptrdiff_t k9 = 20*w3;
for (; k9 != 19; ++k9) {
ptrdiff_t r2 = 0;
for (; r2 != 2; ++r2) {
ptrdiff_t t2 = 0;
__m512 sfRe85 = _mm512_loadu_ps(sfPtr3+0+121344*i9+30336*j5+1536*k9+768*r2+256*t2);
__m512 sfIm85 = _mm512_loadu_ps(sfPtr3+64+121344*i9+30336*j5+1536*k9+768*r2+256*t2);
__m512 sfRe89 = _mm512_loadu_ps(sfPtr3+128+121344*i9+30336*j5+1536*k9+768*r2+256*t2);
__m512 sfIm89 = _mm512_loadu_ps(sfPtr3+192+121344*i9+30336*j5+1536*k9+768*r2+256*t2);
__m512 sfRe86 = _mm512_loadu_ps(sfPtr3+30336+121344*i9+30336*j5+1536*k9+768*r2+256*t2);
__m512 sfIm86 = _mm512_loadu_ps(sfPtr3+30400+121344*i9+30336*j5+1536*k9+768*r2+256*t2);
__m512 sfRe90 = _mm512_loadu_ps(sfPtr3+30464+121344*i9+30336*j5+1536*k9+768*r2+256*t2);
__m512 sfIm90 = _mm512_loadu_ps(sfPtr3+30528+121344*i9+30336*j5+1536*k9+768*r2+256*t2);
__m512 sfRe87 = _mm512_loadu_ps(sfPtr3+60672+121344*i9+30336*j5+1536*k9+768*r2+256*t2);
__m512 sfIm87 = _mm512_loadu_ps(sfPtr3+60736+121344*i9+30336*j5+1536*k9+768*r2+256*t2);
__m512 sfRe91 = _mm512_loadu_ps(sfPtr3+60800+121344*i9+30336*j5+1536*k9+768*r2+256*t2);
__m512 sfIm91 = _mm512_loadu_ps(sfPtr3+60864+121344*i9+30336*j5+1536*k9+768*r2+256*t2);
__m512 sfRe88 = _mm512_loadu_ps(sfPtr3+91008+121344*i9+30336*j5+1536*k9+768*r2+256*t2);
__m512 sfIm88 = _mm512_loadu_ps(sfPtr3+91072+121344*i9+30336*j5+1536*k9+768*r2+256*t2);
__m512 sfRe92 = _mm512_loadu_ps(sfPtr3+91136+121344*i9+30336*j5+1536*k9+768*r2+256*t2);
__m512 sfIm92 = _mm512_loadu_ps(sfPtr3+91200+121344*i9+30336*j5+1536*k9+768*r2+256*t2);
__m512i ifft1 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2 = _mm512_permutexvar_ps(ifft1, sfRe85);
__m512 ifft93 = _mm512_permutexvar_ps(ifft1, sfRe89);
__m512i ifft3 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4 = _mm512_permutexvar_ps(ifft3, sfRe85);
__m512 ifft94 = _mm512_permutexvar_ps(ifft3, sfRe89);
__m512 ifft5 = _mm512_permutexvar_ps(ifft1, sfIm85);
__m512 ifft95 = _mm512_permutexvar_ps(ifft1, sfIm89);
__m512 ifft6 = _mm512_permutexvar_ps(ifft3, sfIm85);
__m512 ifft96 = _mm512_permutexvar_ps(ifft3, sfIm89);
__m512 ifft7 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft8 = _mm512_mask_fmadd_ps(ifft6, 65021, ifft7, ifft2);
__m512 ifft97 = _mm512_mask_fmadd_ps(ifft96, 65021, ifft7, ifft93);
__m512 ifft9 = _mm512_mask_fnmadd_ps(ifft5, 65021, ifft7, ifft4);
__m512 ifft98 = _mm512_mask_fnmadd_ps(ifft95, 65021, ifft7, ifft94);
__m512 ifft10 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft11 = _mm512_fmadd_ps(ifft8, ifft10, _mm512_shuffle_ps(ifft8, ifft8, 177));
__m512 ifft99 = _mm512_fmadd_ps(ifft97, ifft10, _mm512_shuffle_ps(ifft97, ifft97, 177));
__m512 ifft12 = _mm512_fmadd_ps(ifft9, ifft10, _mm512_shuffle_ps(ifft9, ifft9, 177));
__m512 ifft100 = _mm512_fmadd_ps(ifft98, ifft10, _mm512_shuffle_ps(ifft98, ifft98, 177));
__m512 ifft13 = _mm512_fmadd_ps(sfRe86, ifft10, _mm512_shuffle_ps(sfRe86, sfRe86, 177));
__m512 ifft101 = _mm512_fmadd_ps(sfRe90, ifft10, _mm512_shuffle_ps(sfRe90, sfRe90, 177));
__m512 ifft14 = _mm512_fmadd_ps(sfIm86, ifft10, _mm512_shuffle_ps(sfIm86, sfIm86, 177));
__m512 ifft102 = _mm512_fmadd_ps(sfIm90, ifft10, _mm512_shuffle_ps(sfIm90, sfIm90, 177));
__m512 ifft15 = _mm512_fmadd_ps(sfRe87, ifft10, _mm512_shuffle_ps(sfRe87, sfRe87, 177));
__m512 ifft103 = _mm512_fmadd_ps(sfRe91, ifft10, _mm512_shuffle_ps(sfRe91, sfRe91, 177));
__m512 ifft16 = _mm512_fmadd_ps(sfIm87, ifft10, _mm512_shuffle_ps(sfIm87, sfIm87, 177));
__m512 ifft104 = _mm512_fmadd_ps(sfIm91, ifft10, _mm512_shuffle_ps(sfIm91, sfIm91, 177));
__m512 ifft17 = _mm512_fmadd_ps(sfRe88, ifft10, _mm512_shuffle_ps(sfRe88, sfRe88, 177));
__m512 ifft105 = _mm512_fmadd_ps(sfRe92, ifft10, _mm512_shuffle_ps(sfRe92, sfRe92, 177));
__m512 ifft18 = _mm512_fmadd_ps(sfIm88, ifft10, _mm512_shuffle_ps(sfIm88, sfIm88, 177));
__m512 ifft106 = _mm512_fmadd_ps(sfIm92, ifft10, _mm512_shuffle_ps(sfIm92, sfIm92, 177));
__m512 ifft19 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft20 = _mm512_mul_ps(ifft11, ifft19);
__m512 ifft107 = _mm512_mul_ps(ifft99, ifft19);
__m512 ifft21 = _mm512_mul_ps(ifft12, ifft19);
__m512 ifft108 = _mm512_mul_ps(ifft100, ifft19);
__m512 ifft22 = _mm512_mul_ps(ifft13, ifft19);
__m512 ifft109 = _mm512_mul_ps(ifft101, ifft19);
__m512 ifft23 = _mm512_mul_ps(ifft14, ifft19);
__m512 ifft110 = _mm512_mul_ps(ifft102, ifft19);
__m512 ifft24 = _mm512_mul_ps(ifft15, ifft19);
__m512 ifft111 = _mm512_mul_ps(ifft103, ifft19);
__m512 ifft25 = _mm512_mul_ps(ifft16, ifft19);
__m512 ifft112 = _mm512_mul_ps(ifft104, ifft19);
__m512 ifft26 = _mm512_mul_ps(ifft17, ifft19);
__m512 ifft113 = _mm512_mul_ps(ifft105, ifft19);
__m512 ifft27 = _mm512_mul_ps(ifft18, ifft19);
__m512 ifft114 = _mm512_mul_ps(ifft106, ifft19);
__m512 ifft28 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft29 = _mm512_fnmadd_ps(ifft12, ifft28, ifft20);
__m512 ifft115 = _mm512_fnmadd_ps(ifft100, ifft28, ifft107);
__m512 ifft30 = _mm512_fmadd_ps(ifft11, ifft28, ifft21);
__m512 ifft116 = _mm512_fmadd_ps(ifft99, ifft28, ifft108);
__m512 ifft31 = _mm512_fnmadd_ps(ifft14, ifft28, ifft22);
__m512 ifft117 = _mm512_fnmadd_ps(ifft102, ifft28, ifft109);
__m512 ifft32 = _mm512_fmadd_ps(ifft13, ifft28, ifft23);
__m512 ifft118 = _mm512_fmadd_ps(ifft101, ifft28, ifft110);
__m512 ifft33 = _mm512_fnmadd_ps(ifft16, ifft28, ifft24);
__m512 ifft119 = _mm512_fnmadd_ps(ifft104, ifft28, ifft111);
__m512 ifft34 = _mm512_fmadd_ps(ifft15, ifft28, ifft25);
__m512 ifft120 = _mm512_fmadd_ps(ifft103, ifft28, ifft112);
__m512 ifft35 = _mm512_fnmadd_ps(ifft18, ifft28, ifft26);
__m512 ifft121 = _mm512_fnmadd_ps(ifft106, ifft28, ifft113);
__m512 ifft36 = _mm512_fmadd_ps(ifft17, ifft28, ifft27);
__m512 ifft122 = _mm512_fmadd_ps(ifft105, ifft28, ifft114);
__m512 ifft37 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft38 = _mm512_fmadd_ps(ifft29, ifft37, _mm512_shuffle_ps(ifft29, ifft29, 78));
__m512 ifft123 = _mm512_fmadd_ps(ifft115, ifft37, _mm512_shuffle_ps(ifft115, ifft115, 78));
__m512 ifft39 = _mm512_fmadd_ps(ifft30, ifft37, _mm512_shuffle_ps(ifft30, ifft30, 78));
__m512 ifft124 = _mm512_fmadd_ps(ifft116, ifft37, _mm512_shuffle_ps(ifft116, ifft116, 78));
__m512 ifft40 = _mm512_fmadd_ps(ifft31, ifft37, _mm512_shuffle_ps(ifft31, ifft31, 78));
__m512 ifft125 = _mm512_fmadd_ps(ifft117, ifft37, _mm512_shuffle_ps(ifft117, ifft117, 78));
__m512 ifft41 = _mm512_fmadd_ps(ifft32, ifft37, _mm512_shuffle_ps(ifft32, ifft32, 78));
__m512 ifft126 = _mm512_fmadd_ps(ifft118, ifft37, _mm512_shuffle_ps(ifft118, ifft118, 78));
__m512 ifft42 = _mm512_fmadd_ps(ifft33, ifft37, _mm512_shuffle_ps(ifft33, ifft33, 78));
__m512 ifft127 = _mm512_fmadd_ps(ifft119, ifft37, _mm512_shuffle_ps(ifft119, ifft119, 78));
__m512 ifft43 = _mm512_fmadd_ps(ifft34, ifft37, _mm512_shuffle_ps(ifft34, ifft34, 78));
__m512 ifft128 = _mm512_fmadd_ps(ifft120, ifft37, _mm512_shuffle_ps(ifft120, ifft120, 78));
__m512 ifft44 = _mm512_fmadd_ps(ifft35, ifft37, _mm512_shuffle_ps(ifft35, ifft35, 78));
__m512 ifft129 = _mm512_fmadd_ps(ifft121, ifft37, _mm512_shuffle_ps(ifft121, ifft121, 78));
__m512 ifft45 = _mm512_fmadd_ps(ifft36, ifft37, _mm512_shuffle_ps(ifft36, ifft36, 78));
__m512 ifft130 = _mm512_fmadd_ps(ifft122, ifft37, _mm512_shuffle_ps(ifft122, ifft122, 78));
__m512 ifft46 = _mm512_mask_sub_ps(ifft38, 49344, _mm512_setzero_ps(), ifft39);
__m512 ifft131 = _mm512_mask_sub_ps(ifft123, 49344, _mm512_setzero_ps(), ifft124);
__m512 ifft47 = _mm512_mask_mov_ps(ifft39, 49344, ifft38);
__m512 ifft132 = _mm512_mask_mov_ps(ifft124, 49344, ifft123);
__m512 ifft48 = _mm512_mask_sub_ps(ifft40, 49344, _mm512_setzero_ps(), ifft41);
__m512 ifft133 = _mm512_mask_sub_ps(ifft125, 49344, _mm512_setzero_ps(), ifft126);
__m512 ifft49 = _mm512_mask_mov_ps(ifft41, 49344, ifft40);
__m512 ifft134 = _mm512_mask_mov_ps(ifft126, 49344, ifft125);
__m512 ifft50 = _mm512_mask_sub_ps(ifft42, 49344, _mm512_setzero_ps(), ifft43);
__m512 ifft135 = _mm512_mask_sub_ps(ifft127, 49344, _mm512_setzero_ps(), ifft128);
__m512 ifft51 = _mm512_mask_mov_ps(ifft43, 49344, ifft42);
__m512 ifft136 = _mm512_mask_mov_ps(ifft128, 49344, ifft127);
__m512 ifft52 = _mm512_mask_sub_ps(ifft44, 49344, _mm512_setzero_ps(), ifft45);
__m512 ifft137 = _mm512_mask_sub_ps(ifft129, 49344, _mm512_setzero_ps(), ifft130);
__m512 ifft53 = _mm512_mask_mov_ps(ifft45, 49344, ifft44);
__m512 ifft138 = _mm512_mask_mov_ps(ifft130, 49344, ifft129);
__m512 ifft54 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft55 = _mm512_fmadd_ps(ifft46, ifft54, _mm512_shuffle_f32x4(ifft46, ifft46, 177));
__m512 ifft139 = _mm512_fmadd_ps(ifft131, ifft54, _mm512_shuffle_f32x4(ifft131, ifft131, 177));
__m512 ifft56 = _mm512_fmadd_ps(ifft47, ifft54, _mm512_shuffle_f32x4(ifft47, ifft47, 177));
__m512 ifft140 = _mm512_fmadd_ps(ifft132, ifft54, _mm512_shuffle_f32x4(ifft132, ifft132, 177));
__m512 ifft57 = _mm512_fmadd_ps(ifft48, ifft54, _mm512_shuffle_f32x4(ifft48, ifft48, 177));
__m512 ifft141 = _mm512_fmadd_ps(ifft133, ifft54, _mm512_shuffle_f32x4(ifft133, ifft133, 177));
__m512 ifft58 = _mm512_fmadd_ps(ifft49, ifft54, _mm512_shuffle_f32x4(ifft49, ifft49, 177));
__m512 ifft142 = _mm512_fmadd_ps(ifft134, ifft54, _mm512_shuffle_f32x4(ifft134, ifft134, 177));
__m512 ifft59 = _mm512_fmadd_ps(ifft50, ifft54, _mm512_shuffle_f32x4(ifft50, ifft50, 177));
__m512 ifft143 = _mm512_fmadd_ps(ifft135, ifft54, _mm512_shuffle_f32x4(ifft135, ifft135, 177));
__m512 ifft60 = _mm512_fnmsub_ps(ifft51, ifft54, _mm512_shuffle_f32x4(ifft51, ifft51, 177));
__m512 ifft144 = _mm512_fnmsub_ps(ifft136, ifft54, _mm512_shuffle_f32x4(ifft136, ifft136, 177));
__m512 ifft61 = _mm512_fmadd_ps(ifft52, ifft54, _mm512_shuffle_f32x4(ifft52, ifft52, 177));
__m512 ifft145 = _mm512_fmadd_ps(ifft137, ifft54, _mm512_shuffle_f32x4(ifft137, ifft137, 177));
__m512 ifft62 = _mm512_fmadd_ps(ifft53, ifft54, _mm512_shuffle_f32x4(ifft53, ifft53, 177));
__m512 ifft146 = _mm512_fmadd_ps(ifft138, ifft54, _mm512_shuffle_f32x4(ifft138, ifft138, 177));
__m512 ifft63 = _mm512_add_ps(ifft55, ifft56);
__m512 ifft147 = _mm512_add_ps(ifft139, ifft140);
__m512 ifft64 = _mm512_sub_ps(ifft55, ifft56);
__m512 ifft148 = _mm512_sub_ps(ifft139, ifft140);
__m512 ifft65 = _mm512_sub_ps(ifft57, ifft61);
__m512 ifft149 = _mm512_sub_ps(ifft141, ifft145);
__m512 ifft66 = _mm512_add_ps(ifft58, ifft62);
__m512 ifft150 = _mm512_add_ps(ifft142, ifft146);
__m512 ifft67 = _mm512_add_ps(ifft57, ifft61);
__m512 ifft151 = _mm512_add_ps(ifft141, ifft145);
__m512 ifft68 = _mm512_sub_ps(ifft58, ifft62);
__m512 ifft152 = _mm512_sub_ps(ifft142, ifft146);
__m512 ifft69 = _mm512_mul_ps(ifft59, _mm512_set1_ps(3.125e-02f));
__m512 ifft153 = _mm512_mul_ps(ifft143, _mm512_set1_ps(3.125e-02f));
__m512 ifft70 = _mm512_mul_ps(ifft60, _mm512_set1_ps(3.125e-02f));
__m512 ifft154 = _mm512_mul_ps(ifft144, _mm512_set1_ps(3.125e-02f));
__m512 ifft71 = _mm512_fmadd_ps(ifft63, _mm512_set1_ps(1.5625e-02f), ifft69);
__m512 ifft155 = _mm512_fmadd_ps(ifft147, _mm512_set1_ps(1.5625e-02f), ifft153);
__m512 ifft72 = _mm512_fmsub_ps(ifft63, _mm512_set1_ps(1.5625e-02f), ifft69);
__m512 ifft156 = _mm512_fmsub_ps(ifft147, _mm512_set1_ps(1.5625e-02f), ifft153);
__m512 ifft73 = _mm512_fmadd_ps(ifft64, _mm512_set1_ps(1.5625e-02f), ifft70);
__m512 ifft157 = _mm512_fmadd_ps(ifft148, _mm512_set1_ps(1.5625e-02f), ifft154);
__m512 ifft74 = _mm512_fmsub_ps(ifft64, _mm512_set1_ps(1.5625e-02f), ifft70);
__m512 ifft158 = _mm512_fmsub_ps(ifft148, _mm512_set1_ps(1.5625e-02f), ifft154);
__m512 ifft75 = _mm512_add_ps(ifft65, ifft66);
__m512 ifft159 = _mm512_add_ps(ifft149, ifft150);
__m512 ifft76 = _mm512_sub_ps(ifft65, ifft66);
__m512 ifft160 = _mm512_sub_ps(ifft149, ifft150);
__m512 ifft77 = _mm512_fnmadd_ps(ifft75, _mm512_set1_ps(7.0710677e-01f), ifft67);
__m512 ifft161 = _mm512_fnmadd_ps(ifft159, _mm512_set1_ps(7.0710677e-01f), ifft151);
__m512 ifft78 = _mm512_fmadd_ps(ifft75, _mm512_set1_ps(7.0710677e-01f), ifft67);
__m512 ifft162 = _mm512_fmadd_ps(ifft159, _mm512_set1_ps(7.0710677e-01f), ifft151);
__m512 ifft79 = _mm512_fmadd_ps(ifft76, _mm512_set1_ps(7.0710677e-01f), ifft68);
__m512 ifft163 = _mm512_fmadd_ps(ifft160, _mm512_set1_ps(7.0710677e-01f), ifft152);
__m512 ifft80 = _mm512_fmsub_ps(ifft76, _mm512_set1_ps(7.0710677e-01f), ifft68);
__m512 ifft164 = _mm512_fmsub_ps(ifft160, _mm512_set1_ps(7.0710677e-01f), ifft152);
__m512 ifft81 = _mm512_add_ps(ifft77, ifft78);
__m512 ifft165 = _mm512_add_ps(ifft161, ifft162);
__m512 ifft82 = _mm512_sub_ps(ifft77, ifft78);
__m512 ifft166 = _mm512_sub_ps(ifft161, ifft162);
__m512 ifft83 = _mm512_add_ps(ifft79, ifft80);
__m512 ifft167 = _mm512_add_ps(ifft163, ifft164);
__m512 ifft84 = _mm512_sub_ps(ifft79, ifft80);
__m512 ifft168 = _mm512_sub_ps(ifft163, ifft164);
__m512 ifft85 = _mm512_fmadd_ps(ifft81, _mm512_set1_ps(1.5625e-02f), ifft71);
__m512 ifft169 = _mm512_fmadd_ps(ifft165, _mm512_set1_ps(1.5625e-02f), ifft155);
__m512 ifft86 = _mm512_fnmadd_ps(ifft81, _mm512_set1_ps(1.5625e-02f), ifft71);
__m512 ifft170 = _mm512_fnmadd_ps(ifft165, _mm512_set1_ps(1.5625e-02f), ifft155);
__m512 ifft87 = _mm512_fmadd_ps(ifft83, _mm512_set1_ps(1.5625e-02f), ifft73);
__m512 ifft171 = _mm512_fmadd_ps(ifft167, _mm512_set1_ps(1.5625e-02f), ifft157);
__m512 ifft88 = _mm512_fnmadd_ps(ifft83, _mm512_set1_ps(1.5625e-02f), ifft73);
__m512 ifft172 = _mm512_fnmadd_ps(ifft167, _mm512_set1_ps(1.5625e-02f), ifft157);
__m512 ifft89 = _mm512_fnmadd_ps(ifft84, _mm512_set1_ps(1.5625e-02f), ifft72);
__m512 ifft173 = _mm512_fnmadd_ps(ifft168, _mm512_set1_ps(1.5625e-02f), ifft156);
__m512 ifft90 = _mm512_fmadd_ps(ifft84, _mm512_set1_ps(1.5625e-02f), ifft72);
__m512 ifft174 = _mm512_fmadd_ps(ifft168, _mm512_set1_ps(1.5625e-02f), ifft156);
__m512 ifft91 = _mm512_fmadd_ps(ifft82, _mm512_set1_ps(1.5625e-02f), ifft74);
__m512 ifft175 = _mm512_fmadd_ps(ifft166, _mm512_set1_ps(1.5625e-02f), ifft158);
__m512 ifft92 = _mm512_fnmadd_ps(ifft82, _mm512_set1_ps(1.5625e-02f), ifft74);
__m512 ifft176 = _mm512_fnmadd_ps(ifft166, _mm512_set1_ps(1.5625e-02f), ifft158);
__m512 dat49 = ifft85;
__m512 dat54 = ifft169;
__m512 dat50 = ifft87;
__m512 dat55 = ifft171;
__m512 dat51 = ifft89;
__m512 dat56 = ifft173;
__m512 dat52 = ifft91;
__m512 dat57 = ifft175;
__m512 dat53 = ifft86;
__m512 dat58 = ifft170;
(void)ifft88;
(void)ifft172;
(void)ifft90;
(void)ifft174;
(void)ifft92;
(void)ifft176;
__m512i pm1 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack1 = _mm512_permutex2var_ps(dat49, pm1, dat54);
__m512i pm2 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack2 = _mm512_permutex2var_ps(dat49, pm2, dat54);
__m512 pack3 = _mm512_permutex2var_ps(dat50, pm1, dat55);
__m512 pack4 = _mm512_permutex2var_ps(dat50, pm2, dat55);
__m512 pack5 = _mm512_permutex2var_ps(dat51, pm1, dat56);
__m512 pack6 = _mm512_permutex2var_ps(dat51, pm2, dat56);
__m512 pack7 = _mm512_permutex2var_ps(dat52, pm1, dat57);
__m512 pack8 = _mm512_permutex2var_ps(dat52, pm2, dat57);
__m512 pack9 = _mm512_permutex2var_ps(dat53, pm1, dat58);
__m512 pack10 = _mm512_permutex2var_ps(dat53, pm2, dat58);
_mm512_mask_storeu_ps(datPtr2+0+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+40*t2, 1023, pack1);
_mm512_mask_storeu_ps(datPtr2+336+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+40*t2, 1023, pack2);
_mm512_mask_storeu_ps(datPtr2+48+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+40*t2, 1023, pack3);
_mm512_mask_storeu_ps(datPtr2+384+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+40*t2, 1023, pack4);
_mm512_mask_storeu_ps(datPtr2+96+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+40*t2, 1023, pack5);
_mm512_mask_storeu_ps(datPtr2+432+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+40*t2, 1023, pack6);
_mm512_mask_storeu_ps(datPtr2+144+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+40*t2, 1023, pack7);
_mm512_mask_storeu_ps(datPtr2+480+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+40*t2, 1023, pack8);
_mm512_mask_storeu_ps(datPtr2+192+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+40*t2, 1023, pack9);
_mm512_mask_storeu_ps(datPtr2+528+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+40*t2, 1023, pack10);
ptrdiff_t t3 = 0;
__m512 sfRe93 = _mm512_loadu_ps(sfPtr3+256+121344*i9+30336*j5+1536*k9+768*r2+256*t3);
__m512 sfIm93 = _mm512_loadu_ps(sfPtr3+320+121344*i9+30336*j5+1536*k9+768*r2+256*t3);
__m512 sfRe97 = _mm512_loadu_ps(sfPtr3+384+121344*i9+30336*j5+1536*k9+768*r2+256*t3);
__m512 sfIm97 = _mm512_loadu_ps(sfPtr3+448+121344*i9+30336*j5+1536*k9+768*r2+256*t3);
__m512 sfRe94 = _mm512_loadu_ps(sfPtr3+30592+121344*i9+30336*j5+1536*k9+768*r2+256*t3);
__m512 sfIm94 = _mm512_loadu_ps(sfPtr3+30656+121344*i9+30336*j5+1536*k9+768*r2+256*t3);
__m512 sfRe98 = _mm512_loadu_ps(sfPtr3+30720+121344*i9+30336*j5+1536*k9+768*r2+256*t3);
__m512 sfIm98 = _mm512_loadu_ps(sfPtr3+30784+121344*i9+30336*j5+1536*k9+768*r2+256*t3);
__m512 sfRe95 = _mm512_loadu_ps(sfPtr3+60928+121344*i9+30336*j5+1536*k9+768*r2+256*t3);
__m512 sfIm95 = _mm512_loadu_ps(sfPtr3+60992+121344*i9+30336*j5+1536*k9+768*r2+256*t3);
__m512 sfRe99 = _mm512_loadu_ps(sfPtr3+61056+121344*i9+30336*j5+1536*k9+768*r2+256*t3);
__m512 sfIm99 = _mm512_loadu_ps(sfPtr3+61120+121344*i9+30336*j5+1536*k9+768*r2+256*t3);
__m512 sfRe96 = _mm512_loadu_ps(sfPtr3+91264+121344*i9+30336*j5+1536*k9+768*r2+256*t3);
__m512 sfIm96 = _mm512_loadu_ps(sfPtr3+91328+121344*i9+30336*j5+1536*k9+768*r2+256*t3);
__m512 sfRe100 = _mm512_loadu_ps(sfPtr3+91392+121344*i9+30336*j5+1536*k9+768*r2+256*t3);
__m512 sfIm100 = _mm512_loadu_ps(sfPtr3+91456+121344*i9+30336*j5+1536*k9+768*r2+256*t3);
__m512i ifft177 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft178 = _mm512_permutexvar_ps(ifft177, sfRe93);
__m512 ifft269 = _mm512_permutexvar_ps(ifft177, sfRe97);
__m512i ifft179 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft180 = _mm512_permutexvar_ps(ifft179, sfRe93);
__m512 ifft270 = _mm512_permutexvar_ps(ifft179, sfRe97);
__m512 ifft181 = _mm512_permutexvar_ps(ifft177, sfIm93);
__m512 ifft271 = _mm512_permutexvar_ps(ifft177, sfIm97);
__m512 ifft182 = _mm512_permutexvar_ps(ifft179, sfIm93);
__m512 ifft272 = _mm512_permutexvar_ps(ifft179, sfIm97);
__m512 ifft183 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft184 = _mm512_mask_fmadd_ps(ifft182, 65021, ifft183, ifft178);
__m512 ifft273 = _mm512_mask_fmadd_ps(ifft272, 65021, ifft183, ifft269);
__m512 ifft185 = _mm512_mask_fnmadd_ps(ifft181, 65021, ifft183, ifft180);
__m512 ifft274 = _mm512_mask_fnmadd_ps(ifft271, 65021, ifft183, ifft270);
__m512 ifft186 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft187 = _mm512_fmadd_ps(ifft184, ifft186, _mm512_shuffle_ps(ifft184, ifft184, 177));
__m512 ifft275 = _mm512_fmadd_ps(ifft273, ifft186, _mm512_shuffle_ps(ifft273, ifft273, 177));
__m512 ifft188 = _mm512_fmadd_ps(ifft185, ifft186, _mm512_shuffle_ps(ifft185, ifft185, 177));
__m512 ifft276 = _mm512_fmadd_ps(ifft274, ifft186, _mm512_shuffle_ps(ifft274, ifft274, 177));
__m512 ifft189 = _mm512_fmadd_ps(sfRe94, ifft186, _mm512_shuffle_ps(sfRe94, sfRe94, 177));
__m512 ifft277 = _mm512_fmadd_ps(sfRe98, ifft186, _mm512_shuffle_ps(sfRe98, sfRe98, 177));
__m512 ifft190 = _mm512_fmadd_ps(sfIm94, ifft186, _mm512_shuffle_ps(sfIm94, sfIm94, 177));
__m512 ifft278 = _mm512_fmadd_ps(sfIm98, ifft186, _mm512_shuffle_ps(sfIm98, sfIm98, 177));
__m512 ifft191 = _mm512_fmadd_ps(sfRe95, ifft186, _mm512_shuffle_ps(sfRe95, sfRe95, 177));
__m512 ifft279 = _mm512_fmadd_ps(sfRe99, ifft186, _mm512_shuffle_ps(sfRe99, sfRe99, 177));
__m512 ifft192 = _mm512_fmadd_ps(sfIm95, ifft186, _mm512_shuffle_ps(sfIm95, sfIm95, 177));
__m512 ifft280 = _mm512_fmadd_ps(sfIm99, ifft186, _mm512_shuffle_ps(sfIm99, sfIm99, 177));
__m512 ifft193 = _mm512_fmadd_ps(sfRe96, ifft186, _mm512_shuffle_ps(sfRe96, sfRe96, 177));
__m512 ifft281 = _mm512_fmadd_ps(sfRe100, ifft186, _mm512_shuffle_ps(sfRe100, sfRe100, 177));
__m512 ifft194 = _mm512_fmadd_ps(sfIm96, ifft186, _mm512_shuffle_ps(sfIm96, sfIm96, 177));
__m512 ifft282 = _mm512_fmadd_ps(sfIm100, ifft186, _mm512_shuffle_ps(sfIm100, sfIm100, 177));
__m512 ifft195 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft196 = _mm512_mul_ps(ifft187, ifft195);
__m512 ifft283 = _mm512_mul_ps(ifft275, ifft195);
__m512 ifft197 = _mm512_mul_ps(ifft188, ifft195);
__m512 ifft284 = _mm512_mul_ps(ifft276, ifft195);
__m512 ifft198 = _mm512_mul_ps(ifft189, ifft195);
__m512 ifft285 = _mm512_mul_ps(ifft277, ifft195);
__m512 ifft199 = _mm512_mul_ps(ifft190, ifft195);
__m512 ifft286 = _mm512_mul_ps(ifft278, ifft195);
__m512 ifft200 = _mm512_mul_ps(ifft191, ifft195);
__m512 ifft287 = _mm512_mul_ps(ifft279, ifft195);
__m512 ifft201 = _mm512_mul_ps(ifft192, ifft195);
__m512 ifft288 = _mm512_mul_ps(ifft280, ifft195);
__m512 ifft202 = _mm512_mul_ps(ifft193, ifft195);
__m512 ifft289 = _mm512_mul_ps(ifft281, ifft195);
__m512 ifft203 = _mm512_mul_ps(ifft194, ifft195);
__m512 ifft290 = _mm512_mul_ps(ifft282, ifft195);
__m512 ifft204 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft205 = _mm512_fnmadd_ps(ifft188, ifft204, ifft196);
__m512 ifft291 = _mm512_fnmadd_ps(ifft276, ifft204, ifft283);
__m512 ifft206 = _mm512_fmadd_ps(ifft187, ifft204, ifft197);
__m512 ifft292 = _mm512_fmadd_ps(ifft275, ifft204, ifft284);
__m512 ifft207 = _mm512_fnmadd_ps(ifft190, ifft204, ifft198);
__m512 ifft293 = _mm512_fnmadd_ps(ifft278, ifft204, ifft285);
__m512 ifft208 = _mm512_fmadd_ps(ifft189, ifft204, ifft199);
__m512 ifft294 = _mm512_fmadd_ps(ifft277, ifft204, ifft286);
__m512 ifft209 = _mm512_fnmadd_ps(ifft192, ifft204, ifft200);
__m512 ifft295 = _mm512_fnmadd_ps(ifft280, ifft204, ifft287);
__m512 ifft210 = _mm512_fmadd_ps(ifft191, ifft204, ifft201);
__m512 ifft296 = _mm512_fmadd_ps(ifft279, ifft204, ifft288);
__m512 ifft211 = _mm512_fnmadd_ps(ifft194, ifft204, ifft202);
__m512 ifft297 = _mm512_fnmadd_ps(ifft282, ifft204, ifft289);
__m512 ifft212 = _mm512_fmadd_ps(ifft193, ifft204, ifft203);
__m512 ifft298 = _mm512_fmadd_ps(ifft281, ifft204, ifft290);
__m512 ifft213 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft214 = _mm512_fmadd_ps(ifft205, ifft213, _mm512_shuffle_ps(ifft205, ifft205, 78));
__m512 ifft299 = _mm512_fmadd_ps(ifft291, ifft213, _mm512_shuffle_ps(ifft291, ifft291, 78));
__m512 ifft215 = _mm512_fmadd_ps(ifft206, ifft213, _mm512_shuffle_ps(ifft206, ifft206, 78));
__m512 ifft300 = _mm512_fmadd_ps(ifft292, ifft213, _mm512_shuffle_ps(ifft292, ifft292, 78));
__m512 ifft216 = _mm512_fmadd_ps(ifft207, ifft213, _mm512_shuffle_ps(ifft207, ifft207, 78));
__m512 ifft301 = _mm512_fmadd_ps(ifft293, ifft213, _mm512_shuffle_ps(ifft293, ifft293, 78));
__m512 ifft217 = _mm512_fmadd_ps(ifft208, ifft213, _mm512_shuffle_ps(ifft208, ifft208, 78));
__m512 ifft302 = _mm512_fmadd_ps(ifft294, ifft213, _mm512_shuffle_ps(ifft294, ifft294, 78));
__m512 ifft218 = _mm512_fmadd_ps(ifft209, ifft213, _mm512_shuffle_ps(ifft209, ifft209, 78));
__m512 ifft303 = _mm512_fmadd_ps(ifft295, ifft213, _mm512_shuffle_ps(ifft295, ifft295, 78));
__m512 ifft219 = _mm512_fmadd_ps(ifft210, ifft213, _mm512_shuffle_ps(ifft210, ifft210, 78));
__m512 ifft304 = _mm512_fmadd_ps(ifft296, ifft213, _mm512_shuffle_ps(ifft296, ifft296, 78));
__m512 ifft220 = _mm512_fmadd_ps(ifft211, ifft213, _mm512_shuffle_ps(ifft211, ifft211, 78));
__m512 ifft305 = _mm512_fmadd_ps(ifft297, ifft213, _mm512_shuffle_ps(ifft297, ifft297, 78));
__m512 ifft221 = _mm512_fmadd_ps(ifft212, ifft213, _mm512_shuffle_ps(ifft212, ifft212, 78));
__m512 ifft306 = _mm512_fmadd_ps(ifft298, ifft213, _mm512_shuffle_ps(ifft298, ifft298, 78));
__m512 ifft222 = _mm512_mask_sub_ps(ifft214, 49344, _mm512_setzero_ps(), ifft215);
__m512 ifft307 = _mm512_mask_sub_ps(ifft299, 49344, _mm512_setzero_ps(), ifft300);
__m512 ifft223 = _mm512_mask_mov_ps(ifft215, 49344, ifft214);
__m512 ifft308 = _mm512_mask_mov_ps(ifft300, 49344, ifft299);
__m512 ifft224 = _mm512_mask_sub_ps(ifft216, 49344, _mm512_setzero_ps(), ifft217);
__m512 ifft309 = _mm512_mask_sub_ps(ifft301, 49344, _mm512_setzero_ps(), ifft302);
__m512 ifft225 = _mm512_mask_mov_ps(ifft217, 49344, ifft216);
__m512 ifft310 = _mm512_mask_mov_ps(ifft302, 49344, ifft301);
__m512 ifft226 = _mm512_mask_sub_ps(ifft218, 49344, _mm512_setzero_ps(), ifft219);
__m512 ifft311 = _mm512_mask_sub_ps(ifft303, 49344, _mm512_setzero_ps(), ifft304);
__m512 ifft227 = _mm512_mask_mov_ps(ifft219, 49344, ifft218);
__m512 ifft312 = _mm512_mask_mov_ps(ifft304, 49344, ifft303);
__m512 ifft228 = _mm512_mask_sub_ps(ifft220, 49344, _mm512_setzero_ps(), ifft221);
__m512 ifft313 = _mm512_mask_sub_ps(ifft305, 49344, _mm512_setzero_ps(), ifft306);
__m512 ifft229 = _mm512_mask_mov_ps(ifft221, 49344, ifft220);
__m512 ifft314 = _mm512_mask_mov_ps(ifft306, 49344, ifft305);
__m512 ifft230 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft231 = _mm512_fmadd_ps(ifft222, ifft230, _mm512_shuffle_f32x4(ifft222, ifft222, 177));
__m512 ifft315 = _mm512_fmadd_ps(ifft307, ifft230, _mm512_shuffle_f32x4(ifft307, ifft307, 177));
__m512 ifft232 = _mm512_fmadd_ps(ifft223, ifft230, _mm512_shuffle_f32x4(ifft223, ifft223, 177));
__m512 ifft316 = _mm512_fmadd_ps(ifft308, ifft230, _mm512_shuffle_f32x4(ifft308, ifft308, 177));
__m512 ifft233 = _mm512_fmadd_ps(ifft224, ifft230, _mm512_shuffle_f32x4(ifft224, ifft224, 177));
__m512 ifft317 = _mm512_fmadd_ps(ifft309, ifft230, _mm512_shuffle_f32x4(ifft309, ifft309, 177));
__m512 ifft234 = _mm512_fmadd_ps(ifft225, ifft230, _mm512_shuffle_f32x4(ifft225, ifft225, 177));
__m512 ifft318 = _mm512_fmadd_ps(ifft310, ifft230, _mm512_shuffle_f32x4(ifft310, ifft310, 177));
__m512 ifft235 = _mm512_fmadd_ps(ifft226, ifft230, _mm512_shuffle_f32x4(ifft226, ifft226, 177));
__m512 ifft319 = _mm512_fmadd_ps(ifft311, ifft230, _mm512_shuffle_f32x4(ifft311, ifft311, 177));
__m512 ifft236 = _mm512_fnmsub_ps(ifft227, ifft230, _mm512_shuffle_f32x4(ifft227, ifft227, 177));
__m512 ifft320 = _mm512_fnmsub_ps(ifft312, ifft230, _mm512_shuffle_f32x4(ifft312, ifft312, 177));
__m512 ifft237 = _mm512_fmadd_ps(ifft228, ifft230, _mm512_shuffle_f32x4(ifft228, ifft228, 177));
__m512 ifft321 = _mm512_fmadd_ps(ifft313, ifft230, _mm512_shuffle_f32x4(ifft313, ifft313, 177));
__m512 ifft238 = _mm512_fmadd_ps(ifft229, ifft230, _mm512_shuffle_f32x4(ifft229, ifft229, 177));
__m512 ifft322 = _mm512_fmadd_ps(ifft314, ifft230, _mm512_shuffle_f32x4(ifft314, ifft314, 177));
__m512 ifft239 = _mm512_add_ps(ifft231, ifft232);
__m512 ifft323 = _mm512_add_ps(ifft315, ifft316);
__m512 ifft240 = _mm512_sub_ps(ifft231, ifft232);
__m512 ifft324 = _mm512_sub_ps(ifft315, ifft316);
__m512 ifft241 = _mm512_sub_ps(ifft233, ifft237);
__m512 ifft325 = _mm512_sub_ps(ifft317, ifft321);
__m512 ifft242 = _mm512_add_ps(ifft234, ifft238);
__m512 ifft326 = _mm512_add_ps(ifft318, ifft322);
__m512 ifft243 = _mm512_add_ps(ifft233, ifft237);
__m512 ifft327 = _mm512_add_ps(ifft317, ifft321);
__m512 ifft244 = _mm512_sub_ps(ifft234, ifft238);
__m512 ifft328 = _mm512_sub_ps(ifft318, ifft322);
__m512 ifft245 = _mm512_mul_ps(ifft235, _mm512_set1_ps(3.125e-02f));
__m512 ifft329 = _mm512_mul_ps(ifft319, _mm512_set1_ps(3.125e-02f));
__m512 ifft246 = _mm512_mul_ps(ifft236, _mm512_set1_ps(3.125e-02f));
__m512 ifft330 = _mm512_mul_ps(ifft320, _mm512_set1_ps(3.125e-02f));
__m512 ifft247 = _mm512_fmadd_ps(ifft239, _mm512_set1_ps(1.5625e-02f), ifft245);
__m512 ifft331 = _mm512_fmadd_ps(ifft323, _mm512_set1_ps(1.5625e-02f), ifft329);
__m512 ifft248 = _mm512_fmsub_ps(ifft239, _mm512_set1_ps(1.5625e-02f), ifft245);
__m512 ifft332 = _mm512_fmsub_ps(ifft323, _mm512_set1_ps(1.5625e-02f), ifft329);
__m512 ifft249 = _mm512_fmadd_ps(ifft240, _mm512_set1_ps(1.5625e-02f), ifft246);
__m512 ifft333 = _mm512_fmadd_ps(ifft324, _mm512_set1_ps(1.5625e-02f), ifft330);
__m512 ifft250 = _mm512_fmsub_ps(ifft240, _mm512_set1_ps(1.5625e-02f), ifft246);
__m512 ifft334 = _mm512_fmsub_ps(ifft324, _mm512_set1_ps(1.5625e-02f), ifft330);
__m512 ifft251 = _mm512_add_ps(ifft241, ifft242);
__m512 ifft335 = _mm512_add_ps(ifft325, ifft326);
__m512 ifft252 = _mm512_sub_ps(ifft241, ifft242);
__m512 ifft336 = _mm512_sub_ps(ifft325, ifft326);
__m512 ifft253 = _mm512_fnmadd_ps(ifft251, _mm512_set1_ps(7.0710677e-01f), ifft243);
__m512 ifft337 = _mm512_fnmadd_ps(ifft335, _mm512_set1_ps(7.0710677e-01f), ifft327);
__m512 ifft254 = _mm512_fmadd_ps(ifft251, _mm512_set1_ps(7.0710677e-01f), ifft243);
__m512 ifft338 = _mm512_fmadd_ps(ifft335, _mm512_set1_ps(7.0710677e-01f), ifft327);
__m512 ifft255 = _mm512_fmadd_ps(ifft252, _mm512_set1_ps(7.0710677e-01f), ifft244);
__m512 ifft339 = _mm512_fmadd_ps(ifft336, _mm512_set1_ps(7.0710677e-01f), ifft328);
__m512 ifft256 = _mm512_fmsub_ps(ifft252, _mm512_set1_ps(7.0710677e-01f), ifft244);
__m512 ifft340 = _mm512_fmsub_ps(ifft336, _mm512_set1_ps(7.0710677e-01f), ifft328);
__m512 ifft257 = _mm512_add_ps(ifft253, ifft254);
__m512 ifft341 = _mm512_add_ps(ifft337, ifft338);
__m512 ifft258 = _mm512_sub_ps(ifft253, ifft254);
__m512 ifft342 = _mm512_sub_ps(ifft337, ifft338);
__m512 ifft259 = _mm512_add_ps(ifft255, ifft256);
__m512 ifft343 = _mm512_add_ps(ifft339, ifft340);
__m512 ifft260 = _mm512_sub_ps(ifft255, ifft256);
__m512 ifft344 = _mm512_sub_ps(ifft339, ifft340);
__m512 ifft261 = _mm512_fmadd_ps(ifft257, _mm512_set1_ps(1.5625e-02f), ifft247);
__m512 ifft345 = _mm512_fmadd_ps(ifft341, _mm512_set1_ps(1.5625e-02f), ifft331);
__m512 ifft262 = _mm512_fnmadd_ps(ifft257, _mm512_set1_ps(1.5625e-02f), ifft247);
__m512 ifft346 = _mm512_fnmadd_ps(ifft341, _mm512_set1_ps(1.5625e-02f), ifft331);
__m512 ifft263 = _mm512_fmadd_ps(ifft259, _mm512_set1_ps(1.5625e-02f), ifft249);
__m512 ifft347 = _mm512_fmadd_ps(ifft343, _mm512_set1_ps(1.5625e-02f), ifft333);
__m512 ifft264 = _mm512_fnmadd_ps(ifft259, _mm512_set1_ps(1.5625e-02f), ifft249);
__m512 ifft348 = _mm512_fnmadd_ps(ifft343, _mm512_set1_ps(1.5625e-02f), ifft333);
__m512 ifft265 = _mm512_fnmadd_ps(ifft260, _mm512_set1_ps(1.5625e-02f), ifft248);
__m512 ifft349 = _mm512_fnmadd_ps(ifft344, _mm512_set1_ps(1.5625e-02f), ifft332);
__m512 ifft266 = _mm512_fmadd_ps(ifft260, _mm512_set1_ps(1.5625e-02f), ifft248);
__m512 ifft350 = _mm512_fmadd_ps(ifft344, _mm512_set1_ps(1.5625e-02f), ifft332);
__m512 ifft267 = _mm512_fmadd_ps(ifft258, _mm512_set1_ps(1.5625e-02f), ifft250);
__m512 ifft351 = _mm512_fmadd_ps(ifft342, _mm512_set1_ps(1.5625e-02f), ifft334);
__m512 ifft268 = _mm512_fnmadd_ps(ifft258, _mm512_set1_ps(1.5625e-02f), ifft250);
__m512 ifft352 = _mm512_fnmadd_ps(ifft342, _mm512_set1_ps(1.5625e-02f), ifft334);
// Give the results that will be written out their "dat" names.
// dat59..dat63 come from the chain ending in ifft261..ifft268 and
// dat64..dat68 from the chain ending in ifft345..ifft352.
__m512 dat59 = ifft261;
__m512 dat64 = ifft345;
__m512 dat60 = ifft263;
__m512 dat65 = ifft347;
__m512 dat61 = ifft265;
__m512 dat66 = ifft349;
__m512 dat62 = ifft267;
__m512 dat67 = ifft351;
__m512 dat63 = ifft262;
__m512 dat68 = ifft346;
// These six results are never stored; the (void) casts only reference them
// (presumably to keep the compiler from warning about unused variables).
(void)ifft264;
(void)ifft348;
(void)ifft266;
(void)ifft350;
(void)ifft268;
(void)ifft352;
// Masked stores into datPtr2. The mask picks which vector elements are
// written: 3 = elements 0..1, 31 = elements 0..4, 768 (0x300) = elements
// 8..9, 7936 (0x1F00) = elements 8..12. The "0*t3" term is always zero.
// dat59/dat60 and dat64/dat65 are each stored twice (low part and high
// part); dat61..dat63 only store elements 0..1 and dat66..dat68 only
// store elements 8..9.
_mm512_mask_storeu_ps(datPtr2+40+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+0*t3, 3, dat59);
_mm512_mask_storeu_ps(datPtr2+544+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+0*t3, 7936, dat59);
_mm512_mask_storeu_ps(datPtr2+240+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+0*t3, 31, dat64);
_mm512_mask_storeu_ps(datPtr2+344+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+0*t3, 768, dat64);
_mm512_mask_storeu_ps(datPtr2+88+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+0*t3, 3, dat60);
_mm512_mask_storeu_ps(datPtr2+592+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+0*t3, 7936, dat60);
_mm512_mask_storeu_ps(datPtr2+288+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+0*t3, 31, dat65);
_mm512_mask_storeu_ps(datPtr2+392+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+0*t3, 768, dat65);
_mm512_mask_storeu_ps(datPtr2+136+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+0*t3, 3, dat61);
_mm512_mask_storeu_ps(datPtr2+440+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+0*t3, 768, dat66);
_mm512_mask_storeu_ps(datPtr2+184+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+0*t3, 3, dat62);
_mm512_mask_storeu_ps(datPtr2+488+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+0*t3, 768, dat67);
_mm512_mask_storeu_ps(datPtr2+232+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+0*t3, 3, dat63);
_mm512_mask_storeu_ps(datPtr2+536+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+0*t3, 768, dat68);
// t4 is fixed at 0 for this section, so every "256*t4" term below (and the
// "40*t4" term in the stores that follow) contributes nothing.
ptrdiff_t t4 = 0;
// Load sixteen vectors from sfPtr3. They form four groups whose base
// offsets (512, 30848, 61184, 91520) are 30336 apart; inside each group the
// four loads are 64 apart: Re/Im for one chain (sfRe101..104, sfIm101..104)
// followed by Re/Im for a second chain (sfRe105..108, sfIm105..108).
// NOTE(review): offsets are in units of sfPtr3's pointee type, which is
// declared outside this view - confirm before relying on byte distances.
__m512 sfRe101 = _mm512_loadu_ps(sfPtr3+512+121344*i9+30336*j5+1536*k9+768*r2+256*t4);
__m512 sfIm101 = _mm512_loadu_ps(sfPtr3+576+121344*i9+30336*j5+1536*k9+768*r2+256*t4);
__m512 sfRe105 = _mm512_loadu_ps(sfPtr3+640+121344*i9+30336*j5+1536*k9+768*r2+256*t4);
__m512 sfIm105 = _mm512_loadu_ps(sfPtr3+704+121344*i9+30336*j5+1536*k9+768*r2+256*t4);
__m512 sfRe102 = _mm512_loadu_ps(sfPtr3+30848+121344*i9+30336*j5+1536*k9+768*r2+256*t4);
__m512 sfIm102 = _mm512_loadu_ps(sfPtr3+30912+121344*i9+30336*j5+1536*k9+768*r2+256*t4);
__m512 sfRe106 = _mm512_loadu_ps(sfPtr3+30976+121344*i9+30336*j5+1536*k9+768*r2+256*t4);
__m512 sfIm106 = _mm512_loadu_ps(sfPtr3+31040+121344*i9+30336*j5+1536*k9+768*r2+256*t4);
__m512 sfRe103 = _mm512_loadu_ps(sfPtr3+61184+121344*i9+30336*j5+1536*k9+768*r2+256*t4);
__m512 sfIm103 = _mm512_loadu_ps(sfPtr3+61248+121344*i9+30336*j5+1536*k9+768*r2+256*t4);
__m512 sfRe107 = _mm512_loadu_ps(sfPtr3+61312+121344*i9+30336*j5+1536*k9+768*r2+256*t4);
__m512 sfIm107 = _mm512_loadu_ps(sfPtr3+61376+121344*i9+30336*j5+1536*k9+768*r2+256*t4);
__m512 sfRe104 = _mm512_loadu_ps(sfPtr3+91520+121344*i9+30336*j5+1536*k9+768*r2+256*t4);
__m512 sfIm104 = _mm512_loadu_ps(sfPtr3+91584+121344*i9+30336*j5+1536*k9+768*r2+256*t4);
__m512 sfRe108 = _mm512_loadu_ps(sfPtr3+91648+121344*i9+30336*j5+1536*k9+768*r2+256*t4);
__m512 sfIm108 = _mm512_loadu_ps(sfPtr3+91712+121344*i9+30336*j5+1536*k9+768*r2+256*t4);
// Pre-processing applied only to the first loaded Re/Im pair of each chain
// (sfRe101/sfIm101 and sfRe105/sfIm105); the other loaded vectors enter the
// next stage unmodified.
// Two index vectors rearrange elements within each 8-element half of the
// register (indices 0..7 stay in the low half, 8..15 in the high half).
__m512i ifft353 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft354 = _mm512_permutexvar_ps(ifft353, sfRe101);
__m512 ifft445 = _mm512_permutexvar_ps(ifft353, sfRe105);
__m512i ifft355 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft356 = _mm512_permutexvar_ps(ifft355, sfRe101);
__m512 ifft446 = _mm512_permutexvar_ps(ifft355, sfRe105);
__m512 ifft357 = _mm512_permutexvar_ps(ifft353, sfIm101);
__m512 ifft447 = _mm512_permutexvar_ps(ifft353, sfIm105);
__m512 ifft358 = _mm512_permutexvar_ps(ifft355, sfIm101);
__m512 ifft448 = _mm512_permutexvar_ps(ifft355, sfIm105);
// Per-element coefficients (listed from element 15 down to element 0):
// elements 0, 1, 8, 9 are 0; the rest alternate -1 (even) / +1 (odd).
__m512 ifft359 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
// Mask 65021 (0xFDFD) selects every element except 1 and 9. Selected
// elements get first*ifft359 + third (fmadd) or -(first*ifft359) + third
// (fnmadd); elements 1 and 9 keep the first operand's value unchanged.
__m512 ifft360 = _mm512_mask_fmadd_ps(ifft358, 65021, ifft359, ifft354);
__m512 ifft449 = _mm512_mask_fmadd_ps(ifft448, 65021, ifft359, ifft445);
__m512 ifft361 = _mm512_mask_fnmadd_ps(ifft357, 65021, ifft359, ifft356);
__m512 ifft450 = _mm512_mask_fnmadd_ps(ifft447, 65021, ifft359, ifft446);
// Butterfly across adjacent elements. ifft362 holds +1 in even elements and
// -1 in odd elements; _mm512_shuffle_ps(x, x, 177) swaps the two elements
// of every adjacent pair. So x*ifft362 + swapped(x) yields
//   even element: x[2k] + x[2k+1]
//   odd element:  x[2k] - x[2k+1]
// It is applied to all eight Re/Im vectors of both chains (the first pair
// of each chain in its pre-processed form ifft360/ifft361, ifft449/ifft450).
__m512 ifft362 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft363 = _mm512_fmadd_ps(ifft360, ifft362, _mm512_shuffle_ps(ifft360, ifft360, 177));
__m512 ifft451 = _mm512_fmadd_ps(ifft449, ifft362, _mm512_shuffle_ps(ifft449, ifft449, 177));
__m512 ifft364 = _mm512_fmadd_ps(ifft361, ifft362, _mm512_shuffle_ps(ifft361, ifft361, 177));
__m512 ifft452 = _mm512_fmadd_ps(ifft450, ifft362, _mm512_shuffle_ps(ifft450, ifft450, 177));
__m512 ifft365 = _mm512_fmadd_ps(sfRe102, ifft362, _mm512_shuffle_ps(sfRe102, sfRe102, 177));
__m512 ifft453 = _mm512_fmadd_ps(sfRe106, ifft362, _mm512_shuffle_ps(sfRe106, sfRe106, 177));
__m512 ifft366 = _mm512_fmadd_ps(sfIm102, ifft362, _mm512_shuffle_ps(sfIm102, sfIm102, 177));
__m512 ifft454 = _mm512_fmadd_ps(sfIm106, ifft362, _mm512_shuffle_ps(sfIm106, sfIm106, 177));
__m512 ifft367 = _mm512_fmadd_ps(sfRe103, ifft362, _mm512_shuffle_ps(sfRe103, sfRe103, 177));
__m512 ifft455 = _mm512_fmadd_ps(sfRe107, ifft362, _mm512_shuffle_ps(sfRe107, sfRe107, 177));
__m512 ifft368 = _mm512_fmadd_ps(sfIm103, ifft362, _mm512_shuffle_ps(sfIm103, sfIm103, 177));
__m512 ifft456 = _mm512_fmadd_ps(sfIm107, ifft362, _mm512_shuffle_ps(sfIm107, sfIm107, 177));
__m512 ifft369 = _mm512_fmadd_ps(sfRe104, ifft362, _mm512_shuffle_ps(sfRe104, sfRe104, 177));
__m512 ifft457 = _mm512_fmadd_ps(sfRe108, ifft362, _mm512_shuffle_ps(sfRe108, sfRe108, 177));
__m512 ifft370 = _mm512_fmadd_ps(sfIm104, ifft362, _mm512_shuffle_ps(sfIm104, sfIm104, 177));
__m512 ifft458 = _mm512_fmadd_ps(sfIm108, ifft362, _mm512_shuffle_ps(sfIm108, sfIm108, 177));
// Per-element coefficient multiply on each (Re-derived, Im-derived) pair.
// With c = ifft371 and s = ifft380, the two steps below compute
//   re' = re*c - im*s      (mul, then fnmadd)
//   im' = re*s + im*c      (mul, then fmadd)
// which has the form of an element-wise complex multiplication by (c, s).
// NOTE(review): presumably these are FFT twiddle factors - confirm against
// the generator.
// c, element 0..7 (repeated for 8..15): 1, 1, 1, 0, 1, 0.70710677, 1, -0.70710677
__m512 ifft371 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft372 = _mm512_mul_ps(ifft363, ifft371);
__m512 ifft459 = _mm512_mul_ps(ifft451, ifft371);
__m512 ifft373 = _mm512_mul_ps(ifft364, ifft371);
__m512 ifft460 = _mm512_mul_ps(ifft452, ifft371);
__m512 ifft374 = _mm512_mul_ps(ifft365, ifft371);
__m512 ifft461 = _mm512_mul_ps(ifft453, ifft371);
__m512 ifft375 = _mm512_mul_ps(ifft366, ifft371);
__m512 ifft462 = _mm512_mul_ps(ifft454, ifft371);
__m512 ifft376 = _mm512_mul_ps(ifft367, ifft371);
__m512 ifft463 = _mm512_mul_ps(ifft455, ifft371);
__m512 ifft377 = _mm512_mul_ps(ifft368, ifft371);
__m512 ifft464 = _mm512_mul_ps(ifft456, ifft371);
__m512 ifft378 = _mm512_mul_ps(ifft369, ifft371);
__m512 ifft465 = _mm512_mul_ps(ifft457, ifft371);
__m512 ifft379 = _mm512_mul_ps(ifft370, ifft371);
__m512 ifft466 = _mm512_mul_ps(ifft458, ifft371);
// s, element 0..7 (repeated for 8..15): 0, 0, 0, 1, 0, 0.70710677, 0, 0.70710677
__m512 ifft380 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft381 = _mm512_fnmadd_ps(ifft364, ifft380, ifft372);
__m512 ifft467 = _mm512_fnmadd_ps(ifft452, ifft380, ifft459);
__m512 ifft382 = _mm512_fmadd_ps(ifft363, ifft380, ifft373);
__m512 ifft468 = _mm512_fmadd_ps(ifft451, ifft380, ifft460);
__m512 ifft383 = _mm512_fnmadd_ps(ifft366, ifft380, ifft374);
__m512 ifft469 = _mm512_fnmadd_ps(ifft454, ifft380, ifft461);
__m512 ifft384 = _mm512_fmadd_ps(ifft365, ifft380, ifft375);
__m512 ifft470 = _mm512_fmadd_ps(ifft453, ifft380, ifft462);
__m512 ifft385 = _mm512_fnmadd_ps(ifft368, ifft380, ifft376);
__m512 ifft471 = _mm512_fnmadd_ps(ifft456, ifft380, ifft463);
__m512 ifft386 = _mm512_fmadd_ps(ifft367, ifft380, ifft377);
__m512 ifft472 = _mm512_fmadd_ps(ifft455, ifft380, ifft464);
__m512 ifft387 = _mm512_fnmadd_ps(ifft370, ifft380, ifft378);
__m512 ifft473 = _mm512_fnmadd_ps(ifft458, ifft380, ifft465);
__m512 ifft388 = _mm512_fmadd_ps(ifft369, ifft380, ifft379);
__m512 ifft474 = _mm512_fmadd_ps(ifft457, ifft380, ifft466);
// Butterfly across element pairs two apart. Within every group of four
// elements ifft389 is (+1, +1, -1, -1) and _mm512_shuffle_ps(x, x, 78)
// swaps the low and high two elements, so x*ifft389 + swapped(x) yields
//   elements 0,1: x[0]+x[2], x[1]+x[3]
//   elements 2,3: x[0]-x[2], x[1]-x[3]
__m512 ifft389 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft390 = _mm512_fmadd_ps(ifft381, ifft389, _mm512_shuffle_ps(ifft381, ifft381, 78));
__m512 ifft475 = _mm512_fmadd_ps(ifft467, ifft389, _mm512_shuffle_ps(ifft467, ifft467, 78));
__m512 ifft391 = _mm512_fmadd_ps(ifft382, ifft389, _mm512_shuffle_ps(ifft382, ifft382, 78));
__m512 ifft476 = _mm512_fmadd_ps(ifft468, ifft389, _mm512_shuffle_ps(ifft468, ifft468, 78));
__m512 ifft392 = _mm512_fmadd_ps(ifft383, ifft389, _mm512_shuffle_ps(ifft383, ifft383, 78));
__m512 ifft477 = _mm512_fmadd_ps(ifft469, ifft389, _mm512_shuffle_ps(ifft469, ifft469, 78));
__m512 ifft393 = _mm512_fmadd_ps(ifft384, ifft389, _mm512_shuffle_ps(ifft384, ifft384, 78));
__m512 ifft478 = _mm512_fmadd_ps(ifft470, ifft389, _mm512_shuffle_ps(ifft470, ifft470, 78));
__m512 ifft394 = _mm512_fmadd_ps(ifft385, ifft389, _mm512_shuffle_ps(ifft385, ifft385, 78));
__m512 ifft479 = _mm512_fmadd_ps(ifft471, ifft389, _mm512_shuffle_ps(ifft471, ifft471, 78));
__m512 ifft395 = _mm512_fmadd_ps(ifft386, ifft389, _mm512_shuffle_ps(ifft386, ifft386, 78));
__m512 ifft480 = _mm512_fmadd_ps(ifft472, ifft389, _mm512_shuffle_ps(ifft472, ifft472, 78));
__m512 ifft396 = _mm512_fmadd_ps(ifft387, ifft389, _mm512_shuffle_ps(ifft387, ifft387, 78));
__m512 ifft481 = _mm512_fmadd_ps(ifft473, ifft389, _mm512_shuffle_ps(ifft473, ifft473, 78));
__m512 ifft397 = _mm512_fmadd_ps(ifft388, ifft389, _mm512_shuffle_ps(ifft388, ifft388, 78));
__m512 ifft482 = _mm512_fmadd_ps(ifft474, ifft389, _mm512_shuffle_ps(ifft474, ifft474, 78));
// Fix-up on elements 6, 7, 14, 15 only (mask 49344 = 0xC0C0). For each
// (first, second) pair of vectors from the step above:
//   mask_sub: new first  = first,  except masked elements become -second
//   mask_mov: new second = second, except masked elements become  first
// i.e. on the masked elements the pair (a, b) is replaced by (-b, a).
__m512 ifft398 = _mm512_mask_sub_ps(ifft390, 49344, _mm512_setzero_ps(), ifft391);
__m512 ifft483 = _mm512_mask_sub_ps(ifft475, 49344, _mm512_setzero_ps(), ifft476);
__m512 ifft399 = _mm512_mask_mov_ps(ifft391, 49344, ifft390);
__m512 ifft484 = _mm512_mask_mov_ps(ifft476, 49344, ifft475);
__m512 ifft400 = _mm512_mask_sub_ps(ifft392, 49344, _mm512_setzero_ps(), ifft393);
__m512 ifft485 = _mm512_mask_sub_ps(ifft477, 49344, _mm512_setzero_ps(), ifft478);
__m512 ifft401 = _mm512_mask_mov_ps(ifft393, 49344, ifft392);
__m512 ifft486 = _mm512_mask_mov_ps(ifft478, 49344, ifft477);
__m512 ifft402 = _mm512_mask_sub_ps(ifft394, 49344, _mm512_setzero_ps(), ifft395);
__m512 ifft487 = _mm512_mask_sub_ps(ifft479, 49344, _mm512_setzero_ps(), ifft480);
__m512 ifft403 = _mm512_mask_mov_ps(ifft395, 49344, ifft394);
__m512 ifft488 = _mm512_mask_mov_ps(ifft480, 49344, ifft479);
__m512 ifft404 = _mm512_mask_sub_ps(ifft396, 49344, _mm512_setzero_ps(), ifft397);
__m512 ifft489 = _mm512_mask_sub_ps(ifft481, 49344, _mm512_setzero_ps(), ifft482);
__m512 ifft405 = _mm512_mask_mov_ps(ifft397, 49344, ifft396);
__m512 ifft490 = _mm512_mask_mov_ps(ifft482, 49344, ifft481);
// Butterfly across adjacent 128-bit lanes. ifft406 is +1 in lanes 0 and 2
// and -1 in lanes 1 and 3; _mm512_shuffle_f32x4(x, x, 177) swaps lane 0
// with lane 1 and lane 2 with lane 3. So x*ifft406 + swapped(x) yields
//   lanes 0, 2: x[lane] + x[lane+1]
//   lanes 1, 3: x[lane-1] - x[lane]
__m512 ifft406 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft407 = _mm512_fmadd_ps(ifft398, ifft406, _mm512_shuffle_f32x4(ifft398, ifft398, 177));
__m512 ifft491 = _mm512_fmadd_ps(ifft483, ifft406, _mm512_shuffle_f32x4(ifft483, ifft483, 177));
__m512 ifft408 = _mm512_fmadd_ps(ifft399, ifft406, _mm512_shuffle_f32x4(ifft399, ifft399, 177));
__m512 ifft492 = _mm512_fmadd_ps(ifft484, ifft406, _mm512_shuffle_f32x4(ifft484, ifft484, 177));
__m512 ifft409 = _mm512_fmadd_ps(ifft400, ifft406, _mm512_shuffle_f32x4(ifft400, ifft400, 177));
__m512 ifft493 = _mm512_fmadd_ps(ifft485, ifft406, _mm512_shuffle_f32x4(ifft485, ifft485, 177));
__m512 ifft410 = _mm512_fmadd_ps(ifft401, ifft406, _mm512_shuffle_f32x4(ifft401, ifft401, 177));
__m512 ifft494 = _mm512_fmadd_ps(ifft486, ifft406, _mm512_shuffle_f32x4(ifft486, ifft486, 177));
__m512 ifft411 = _mm512_fmadd_ps(ifft402, ifft406, _mm512_shuffle_f32x4(ifft402, ifft402, 177));
__m512 ifft495 = _mm512_fmadd_ps(ifft487, ifft406, _mm512_shuffle_f32x4(ifft487, ifft487, 177));
// The sixth vector of each chain uses fnmsub, -(x*ifft406) - swapped(x),
// i.e. the same butterfly with the whole result negated.
__m512 ifft412 = _mm512_fnmsub_ps(ifft403, ifft406, _mm512_shuffle_f32x4(ifft403, ifft403, 177));
__m512 ifft496 = _mm512_fnmsub_ps(ifft488, ifft406, _mm512_shuffle_f32x4(ifft488, ifft488, 177));
__m512 ifft413 = _mm512_fmadd_ps(ifft404, ifft406, _mm512_shuffle_f32x4(ifft404, ifft404, 177));
__m512 ifft497 = _mm512_fmadd_ps(ifft489, ifft406, _mm512_shuffle_f32x4(ifft489, ifft489, 177));
__m512 ifft414 = _mm512_fmadd_ps(ifft405, ifft406, _mm512_shuffle_f32x4(ifft405, ifft405, 177));
__m512 ifft498 = _mm512_fmadd_ps(ifft490, ifft406, _mm512_shuffle_f32x4(ifft490, ifft490, 177));
// Cross-vector butterflies: from here on whole vectors are combined with
// each other (no more in-register shuffles). Every statement appears twice,
// once for the chain rooted at ifft407..ifft414 and once for the chain
// rooted at ifft491..ifft498.
// First level: sums and differences of the 1st/2nd, 3rd/7th and 4th/8th
// vectors of each chain.
__m512 ifft415 = _mm512_add_ps(ifft407, ifft408);
__m512 ifft499 = _mm512_add_ps(ifft491, ifft492);
__m512 ifft416 = _mm512_sub_ps(ifft407, ifft408);
__m512 ifft500 = _mm512_sub_ps(ifft491, ifft492);
__m512 ifft417 = _mm512_sub_ps(ifft409, ifft413);
__m512 ifft501 = _mm512_sub_ps(ifft493, ifft497);
__m512 ifft418 = _mm512_add_ps(ifft410, ifft414);
__m512 ifft502 = _mm512_add_ps(ifft494, ifft498);
__m512 ifft419 = _mm512_add_ps(ifft409, ifft413);
__m512 ifft503 = _mm512_add_ps(ifft493, ifft497);
__m512 ifft420 = _mm512_sub_ps(ifft410, ifft414);
__m512 ifft504 = _mm512_sub_ps(ifft494, ifft498);
// The 5th and 6th vectors are scaled by 3.125e-02f (= 1/32) ...
__m512 ifft421 = _mm512_mul_ps(ifft411, _mm512_set1_ps(3.125e-02f));
__m512 ifft505 = _mm512_mul_ps(ifft495, _mm512_set1_ps(3.125e-02f));
__m512 ifft422 = _mm512_mul_ps(ifft412, _mm512_set1_ps(3.125e-02f));
__m512 ifft506 = _mm512_mul_ps(ifft496, _mm512_set1_ps(3.125e-02f));
// ... and added to / subtracted from the first-level sum and difference,
// which are scaled by 1.5625e-02f (= 1/64) in the same fused operation.
__m512 ifft423 = _mm512_fmadd_ps(ifft415, _mm512_set1_ps(1.5625e-02f), ifft421);
__m512 ifft507 = _mm512_fmadd_ps(ifft499, _mm512_set1_ps(1.5625e-02f), ifft505);
__m512 ifft424 = _mm512_fmsub_ps(ifft415, _mm512_set1_ps(1.5625e-02f), ifft421);
__m512 ifft508 = _mm512_fmsub_ps(ifft499, _mm512_set1_ps(1.5625e-02f), ifft505);
__m512 ifft425 = _mm512_fmadd_ps(ifft416, _mm512_set1_ps(1.5625e-02f), ifft422);
__m512 ifft509 = _mm512_fmadd_ps(ifft500, _mm512_set1_ps(1.5625e-02f), ifft506);
__m512 ifft426 = _mm512_fmsub_ps(ifft416, _mm512_set1_ps(1.5625e-02f), ifft422);
__m512 ifft510 = _mm512_fmsub_ps(ifft500, _mm512_set1_ps(1.5625e-02f), ifft506);
// Second level on the 3rd/4th/7th/8th-vector terms; the products use
// 7.0710677e-01f (approximately 1/sqrt(2)).
__m512 ifft427 = _mm512_add_ps(ifft417, ifft418);
__m512 ifft511 = _mm512_add_ps(ifft501, ifft502);
__m512 ifft428 = _mm512_sub_ps(ifft417, ifft418);
__m512 ifft512 = _mm512_sub_ps(ifft501, ifft502);
__m512 ifft429 = _mm512_fnmadd_ps(ifft427, _mm512_set1_ps(7.0710677e-01f), ifft419);
__m512 ifft513 = _mm512_fnmadd_ps(ifft511, _mm512_set1_ps(7.0710677e-01f), ifft503);
__m512 ifft430 = _mm512_fmadd_ps(ifft427, _mm512_set1_ps(7.0710677e-01f), ifft419);
__m512 ifft514 = _mm512_fmadd_ps(ifft511, _mm512_set1_ps(7.0710677e-01f), ifft503);
__m512 ifft431 = _mm512_fmadd_ps(ifft428, _mm512_set1_ps(7.0710677e-01f), ifft420);
__m512 ifft515 = _mm512_fmadd_ps(ifft512, _mm512_set1_ps(7.0710677e-01f), ifft504);
__m512 ifft432 = _mm512_fmsub_ps(ifft428, _mm512_set1_ps(7.0710677e-01f), ifft420);
__m512 ifft516 = _mm512_fmsub_ps(ifft512, _mm512_set1_ps(7.0710677e-01f), ifft504);
__m512 ifft433 = _mm512_add_ps(ifft429, ifft430);
__m512 ifft517 = _mm512_add_ps(ifft513, ifft514);
__m512 ifft434 = _mm512_sub_ps(ifft429, ifft430);
__m512 ifft518 = _mm512_sub_ps(ifft513, ifft514);
__m512 ifft435 = _mm512_add_ps(ifft431, ifft432);
__m512 ifft519 = _mm512_add_ps(ifft515, ifft516);
__m512 ifft436 = _mm512_sub_ps(ifft431, ifft432);
__m512 ifft520 = _mm512_sub_ps(ifft515, ifft516);
// Final level: each second-level term is scaled by 1/64 and added to or
// subtracted from the matching already-scaled term above, giving eight
// result vectors per chain (ifft437..ifft444 and ifft521..ifft528).
__m512 ifft437 = _mm512_fmadd_ps(ifft433, _mm512_set1_ps(1.5625e-02f), ifft423);
__m512 ifft521 = _mm512_fmadd_ps(ifft517, _mm512_set1_ps(1.5625e-02f), ifft507);
__m512 ifft438 = _mm512_fnmadd_ps(ifft433, _mm512_set1_ps(1.5625e-02f), ifft423);
__m512 ifft522 = _mm512_fnmadd_ps(ifft517, _mm512_set1_ps(1.5625e-02f), ifft507);
__m512 ifft439 = _mm512_fmadd_ps(ifft435, _mm512_set1_ps(1.5625e-02f), ifft425);
__m512 ifft523 = _mm512_fmadd_ps(ifft519, _mm512_set1_ps(1.5625e-02f), ifft509);
__m512 ifft440 = _mm512_fnmadd_ps(ifft435, _mm512_set1_ps(1.5625e-02f), ifft425);
__m512 ifft524 = _mm512_fnmadd_ps(ifft519, _mm512_set1_ps(1.5625e-02f), ifft509);
__m512 ifft441 = _mm512_fnmadd_ps(ifft436, _mm512_set1_ps(1.5625e-02f), ifft424);
__m512 ifft525 = _mm512_fnmadd_ps(ifft520, _mm512_set1_ps(1.5625e-02f), ifft508);
__m512 ifft442 = _mm512_fmadd_ps(ifft436, _mm512_set1_ps(1.5625e-02f), ifft424);
__m512 ifft526 = _mm512_fmadd_ps(ifft520, _mm512_set1_ps(1.5625e-02f), ifft508);
__m512 ifft443 = _mm512_fmadd_ps(ifft434, _mm512_set1_ps(1.5625e-02f), ifft426);
__m512 ifft527 = _mm512_fmadd_ps(ifft518, _mm512_set1_ps(1.5625e-02f), ifft510);
__m512 ifft444 = _mm512_fnmadd_ps(ifft434, _mm512_set1_ps(1.5625e-02f), ifft426);
__m512 ifft528 = _mm512_fnmadd_ps(ifft518, _mm512_set1_ps(1.5625e-02f), ifft510);
// Only the first two result vectors of each chain are kept in this section:
// dat69/dat70 from one chain, dat71/dat72 from the other.
__m512 dat69 = ifft437;
__m512 dat71 = ifft521;
__m512 dat70 = ifft439;
__m512 dat72 = ifft523;
// The remaining twelve results are not stored; the (void) casts only
// reference them (presumably to avoid unused-variable warnings).
(void)ifft441;
(void)ifft525;
(void)ifft443;
(void)ifft527;
(void)ifft438;
(void)ifft522;
(void)ifft440;
(void)ifft524;
(void)ifft442;
(void)ifft526;
(void)ifft444;
(void)ifft528;
// Pack pieces of two vectors into one register before storing. In
// _mm512_permutex2var_ps(a, idx, b) indices 0..15 pick from a and 16..31
// pick from b. The stores below use mask 127, so only elements 0..6 of each
// packed vector reach memory:
//   pm3: a[0..4] followed by b[0..1]
//   pm4: b[8..12] followed by a[8..9]
__m512i pm3 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack11 = _mm512_permutex2var_ps(dat69, pm3, dat71);
__m512i pm4 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack12 = _mm512_permutex2var_ps(dat69, pm4, dat71);
__m512 pack13 = _mm512_permutex2var_ps(dat70, pm3, dat72);
__m512 pack14 = _mm512_permutex2var_ps(dat70, pm4, dat72);
// Write seven elements per packed vector into datPtr2. t4 is 0 in this
// section, so the "40*t4" term does not move the destination.
_mm512_mask_storeu_ps(datPtr2+260+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+40*t4, 127, pack11);
_mm512_mask_storeu_ps(datPtr2+596+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+40*t4, 127, pack12);
_mm512_mask_storeu_ps(datPtr2+308+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+40*t4, 127, pack13);
_mm512_mask_storeu_ps(datPtr2+644+26544*i9+1344*k9+672*r2+48*toH1+4*toW1+40*t4, 127, pack14);
}
}
ptrdiff_t r3 = 0;
for (; r3 != 1; ++r3) {
ptrdiff_t t5 = 0;
__m512 sfRe109 = _mm512_loadu_ps(sfPtr3+0+121344*i9+30336*j5+1536*k9+768*r3+256*t5);
__m512 sfIm109 = _mm512_loadu_ps(sfPtr3+64+121344*i9+30336*j5+1536*k9+768*r3+256*t5);
__m512 sfRe113 = _mm512_loadu_ps(sfPtr3+128+121344*i9+30336*j5+1536*k9+768*r3+256*t5);
__m512 sfIm113 = _mm512_loadu_ps(sfPtr3+192+121344*i9+30336*j5+1536*k9+768*r3+256*t5);
__m512 sfRe110 = _mm512_loadu_ps(sfPtr3+30336+121344*i9+30336*j5+1536*k9+768*r3+256*t5);
__m512 sfIm110 = _mm512_loadu_ps(sfPtr3+30400+121344*i9+30336*j5+1536*k9+768*r3+256*t5);
__m512 sfRe114 = _mm512_loadu_ps(sfPtr3+30464+121344*i9+30336*j5+1536*k9+768*r3+256*t5);
__m512 sfIm114 = _mm512_loadu_ps(sfPtr3+30528+121344*i9+30336*j5+1536*k9+768*r3+256*t5);
__m512 sfRe111 = _mm512_loadu_ps(sfPtr3+60672+121344*i9+30336*j5+1536*k9+768*r3+256*t5);
__m512 sfIm111 = _mm512_loadu_ps(sfPtr3+60736+121344*i9+30336*j5+1536*k9+768*r3+256*t5);
__m512 sfRe115 = _mm512_loadu_ps(sfPtr3+60800+121344*i9+30336*j5+1536*k9+768*r3+256*t5);
__m512 sfIm115 = _mm512_loadu_ps(sfPtr3+60864+121344*i9+30336*j5+1536*k9+768*r3+256*t5);
__m512 sfRe112 = _mm512_loadu_ps(sfPtr3+91008+121344*i9+30336*j5+1536*k9+768*r3+256*t5);
__m512 sfIm112 = _mm512_loadu_ps(sfPtr3+91072+121344*i9+30336*j5+1536*k9+768*r3+256*t5);
__m512 sfRe116 = _mm512_loadu_ps(sfPtr3+91136+121344*i9+30336*j5+1536*k9+768*r3+256*t5);
__m512 sfIm116 = _mm512_loadu_ps(sfPtr3+91200+121344*i9+30336*j5+1536*k9+768*r3+256*t5);
__m512i ifft529 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft530 = _mm512_permutexvar_ps(ifft529, sfRe109);
__m512 ifft621 = _mm512_permutexvar_ps(ifft529, sfRe113);
__m512i ifft531 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft532 = _mm512_permutexvar_ps(ifft531, sfRe109);
__m512 ifft622 = _mm512_permutexvar_ps(ifft531, sfRe113);
__m512 ifft533 = _mm512_permutexvar_ps(ifft529, sfIm109);
__m512 ifft623 = _mm512_permutexvar_ps(ifft529, sfIm113);
__m512 ifft534 = _mm512_permutexvar_ps(ifft531, sfIm109);
__m512 ifft624 = _mm512_permutexvar_ps(ifft531, sfIm113);
__m512 ifft535 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft536 = _mm512_mask_fmadd_ps(ifft534, 65021, ifft535, ifft530);
__m512 ifft625 = _mm512_mask_fmadd_ps(ifft624, 65021, ifft535, ifft621);
__m512 ifft537 = _mm512_mask_fnmadd_ps(ifft533, 65021, ifft535, ifft532);
__m512 ifft626 = _mm512_mask_fnmadd_ps(ifft623, 65021, ifft535, ifft622);
__m512 ifft538 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft539 = _mm512_fmadd_ps(ifft536, ifft538, _mm512_shuffle_ps(ifft536, ifft536, 177));
__m512 ifft627 = _mm512_fmadd_ps(ifft625, ifft538, _mm512_shuffle_ps(ifft625, ifft625, 177));
__m512 ifft540 = _mm512_fmadd_ps(ifft537, ifft538, _mm512_shuffle_ps(ifft537, ifft537, 177));
__m512 ifft628 = _mm512_fmadd_ps(ifft626, ifft538, _mm512_shuffle_ps(ifft626, ifft626, 177));
__m512 ifft541 = _mm512_fmadd_ps(sfRe110, ifft538, _mm512_shuffle_ps(sfRe110, sfRe110, 177));
__m512 ifft629 = _mm512_fmadd_ps(sfRe114, ifft538, _mm512_shuffle_ps(sfRe114, sfRe114, 177));
__m512 ifft542 = _mm512_fmadd_ps(sfIm110, ifft538, _mm512_shuffle_ps(sfIm110, sfIm110, 177));
__m512 ifft630 = _mm512_fmadd_ps(sfIm114, ifft538, _mm512_shuffle_ps(sfIm114, sfIm114, 177));
__m512 ifft543 = _mm512_fmadd_ps(sfRe111, ifft538, _mm512_shuffle_ps(sfRe111, sfRe111, 177));
__m512 ifft631 = _mm512_fmadd_ps(sfRe115, ifft538, _mm512_shuffle_ps(sfRe115, sfRe115, 177));
__m512 ifft544 = _mm512_fmadd_ps(sfIm111, ifft538, _mm512_shuffle_ps(sfIm111, sfIm111, 177));
__m512 ifft632 = _mm512_fmadd_ps(sfIm115, ifft538, _mm512_shuffle_ps(sfIm115, sfIm115, 177));
__m512 ifft545 = _mm512_fmadd_ps(sfRe112, ifft538, _mm512_shuffle_ps(sfRe112, sfRe112, 177));
__m512 ifft633 = _mm512_fmadd_ps(sfRe116, ifft538, _mm512_shuffle_ps(sfRe116, sfRe116, 177));
__m512 ifft546 = _mm512_fmadd_ps(sfIm112, ifft538, _mm512_shuffle_ps(sfIm112, sfIm112, 177));
__m512 ifft634 = _mm512_fmadd_ps(sfIm116, ifft538, _mm512_shuffle_ps(sfIm116, sfIm116, 177));
__m512 ifft547 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft548 = _mm512_mul_ps(ifft539, ifft547);
__m512 ifft635 = _mm512_mul_ps(ifft627, ifft547);
__m512 ifft549 = _mm512_mul_ps(ifft540, ifft547);
__m512 ifft636 = _mm512_mul_ps(ifft628, ifft547);
__m512 ifft550 = _mm512_mul_ps(ifft541, ifft547);
__m512 ifft637 = _mm512_mul_ps(ifft629, ifft547);
__m512 ifft551 = _mm512_mul_ps(ifft542, ifft547);
__m512 ifft638 = _mm512_mul_ps(ifft630, ifft547);
__m512 ifft552 = _mm512_mul_ps(ifft543, ifft547);
__m512 ifft639 = _mm512_mul_ps(ifft631, ifft547);
__m512 ifft553 = _mm512_mul_ps(ifft544, ifft547);
__m512 ifft640 = _mm512_mul_ps(ifft632, ifft547);
__m512 ifft554 = _mm512_mul_ps(ifft545, ifft547);
__m512 ifft641 = _mm512_mul_ps(ifft633, ifft547);
__m512 ifft555 = _mm512_mul_ps(ifft546, ifft547);
__m512 ifft642 = _mm512_mul_ps(ifft634, ifft547);
__m512 ifft556 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft557 = _mm512_fnmadd_ps(ifft540, ifft556, ifft548);
__m512 ifft643 = _mm512_fnmadd_ps(ifft628, ifft556, ifft635);
__m512 ifft558 = _mm512_fmadd_ps(ifft539, ifft556, ifft549);
__m512 ifft644 = _mm512_fmadd_ps(ifft627, ifft556, ifft636);
__m512 ifft559 = _mm512_fnmadd_ps(ifft542, ifft556, ifft550);
__m512 ifft645 = _mm512_fnmadd_ps(ifft630, ifft556, ifft637);
__m512 ifft560 = _mm512_fmadd_ps(ifft541, ifft556, ifft551);
__m512 ifft646 = _mm512_fmadd_ps(ifft629, ifft556, ifft638);
__m512 ifft561 = _mm512_fnmadd_ps(ifft544, ifft556, ifft552);
__m512 ifft647 = _mm512_fnmadd_ps(ifft632, ifft556, ifft639);
__m512 ifft562 = _mm512_fmadd_ps(ifft543, ifft556, ifft553);
__m512 ifft648 = _mm512_fmadd_ps(ifft631, ifft556, ifft640);
__m512 ifft563 = _mm512_fnmadd_ps(ifft546, ifft556, ifft554);
__m512 ifft649 = _mm512_fnmadd_ps(ifft634, ifft556, ifft641);
__m512 ifft564 = _mm512_fmadd_ps(ifft545, ifft556, ifft555);
__m512 ifft650 = _mm512_fmadd_ps(ifft633, ifft556, ifft642);
__m512 ifft565 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft566 = _mm512_fmadd_ps(ifft557, ifft565, _mm512_shuffle_ps(ifft557, ifft557, 78));
__m512 ifft651 = _mm512_fmadd_ps(ifft643, ifft565, _mm512_shuffle_ps(ifft643, ifft643, 78));
__m512 ifft567 = _mm512_fmadd_ps(ifft558, ifft565, _mm512_shuffle_ps(ifft558, ifft558, 78));
__m512 ifft652 = _mm512_fmadd_ps(ifft644, ifft565, _mm512_shuffle_ps(ifft644, ifft644, 78));
__m512 ifft568 = _mm512_fmadd_ps(ifft559, ifft565, _mm512_shuffle_ps(ifft559, ifft559, 78));
__m512 ifft653 = _mm512_fmadd_ps(ifft645, ifft565, _mm512_shuffle_ps(ifft645, ifft645, 78));
// Tail of a pass that starts above this excerpt. Each result is
// x*ifft565 + (x with the two 64-bit halves of every 128-bit lane swapped,
// shuffle immediate 78). ifft565 and the inputs are defined above this excerpt.
// Two statement chains are interleaved throughout (ifft5xx/6xx numbering);
// within the visible lines they share only the constant vectors.
__m512 ifft569 = _mm512_fmadd_ps(ifft560, ifft565, _mm512_shuffle_ps(ifft560, ifft560, 78));
__m512 ifft654 = _mm512_fmadd_ps(ifft646, ifft565, _mm512_shuffle_ps(ifft646, ifft646, 78));
__m512 ifft570 = _mm512_fmadd_ps(ifft561, ifft565, _mm512_shuffle_ps(ifft561, ifft561, 78));
__m512 ifft655 = _mm512_fmadd_ps(ifft647, ifft565, _mm512_shuffle_ps(ifft647, ifft647, 78));
__m512 ifft571 = _mm512_fmadd_ps(ifft562, ifft565, _mm512_shuffle_ps(ifft562, ifft562, 78));
__m512 ifft656 = _mm512_fmadd_ps(ifft648, ifft565, _mm512_shuffle_ps(ifft648, ifft648, 78));
__m512 ifft572 = _mm512_fmadd_ps(ifft563, ifft565, _mm512_shuffle_ps(ifft563, ifft563, 78));
__m512 ifft657 = _mm512_fmadd_ps(ifft649, ifft565, _mm512_shuffle_ps(ifft649, ifft649, 78));
__m512 ifft573 = _mm512_fmadd_ps(ifft564, ifft565, _mm512_shuffle_ps(ifft564, ifft564, 78));
__m512 ifft658 = _mm512_fmadd_ps(ifft650, ifft565, _mm512_shuffle_ps(ifft650, ifft650, 78));
// Mask 49344 == 0xC0C0 selects lanes 6, 7, 14 and 15. For each vector pair
// (a, b), in those lanes only, the new first vector becomes -b and the new
// second vector becomes a; every other lane passes through unchanged.
// NOTE(review): if (a, b) hold real/imaginary parts this is a multiply by i
// in the selected lanes -- confirm against the generator.
__m512 ifft574 = _mm512_mask_sub_ps(ifft566, 49344, _mm512_setzero_ps(), ifft567);
__m512 ifft659 = _mm512_mask_sub_ps(ifft651, 49344, _mm512_setzero_ps(), ifft652);
__m512 ifft575 = _mm512_mask_mov_ps(ifft567, 49344, ifft566);
__m512 ifft660 = _mm512_mask_mov_ps(ifft652, 49344, ifft651);
__m512 ifft576 = _mm512_mask_sub_ps(ifft568, 49344, _mm512_setzero_ps(), ifft569);
__m512 ifft661 = _mm512_mask_sub_ps(ifft653, 49344, _mm512_setzero_ps(), ifft654);
__m512 ifft577 = _mm512_mask_mov_ps(ifft569, 49344, ifft568);
__m512 ifft662 = _mm512_mask_mov_ps(ifft654, 49344, ifft653);
__m512 ifft578 = _mm512_mask_sub_ps(ifft570, 49344, _mm512_setzero_ps(), ifft571);
__m512 ifft663 = _mm512_mask_sub_ps(ifft655, 49344, _mm512_setzero_ps(), ifft656);
__m512 ifft579 = _mm512_mask_mov_ps(ifft571, 49344, ifft570);
__m512 ifft664 = _mm512_mask_mov_ps(ifft656, 49344, ifft655);
__m512 ifft580 = _mm512_mask_sub_ps(ifft572, 49344, _mm512_setzero_ps(), ifft573);
__m512 ifft665 = _mm512_mask_sub_ps(ifft657, 49344, _mm512_setzero_ps(), ifft658);
__m512 ifft581 = _mm512_mask_mov_ps(ifft573, 49344, ifft572);
__m512 ifft666 = _mm512_mask_mov_ps(ifft658, 49344, ifft657);
// Butterfly across adjacent 128-bit lanes: x*sign + (x with 128-bit lanes
// 0<->1 and 2<->3 swapped, shuffle_f32x4 immediate 177). The sign vector is
// +1 in lanes 0-3 and 8-11, -1 in lanes 4-7 and 12-15, so the lower lane of
// each pair receives the sum and the upper lane the difference.
__m512 ifft582 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft583 = _mm512_fmadd_ps(ifft574, ifft582, _mm512_shuffle_f32x4(ifft574, ifft574, 177));
__m512 ifft667 = _mm512_fmadd_ps(ifft659, ifft582, _mm512_shuffle_f32x4(ifft659, ifft659, 177));
__m512 ifft584 = _mm512_fmadd_ps(ifft575, ifft582, _mm512_shuffle_f32x4(ifft575, ifft575, 177));
__m512 ifft668 = _mm512_fmadd_ps(ifft660, ifft582, _mm512_shuffle_f32x4(ifft660, ifft660, 177));
__m512 ifft585 = _mm512_fmadd_ps(ifft576, ifft582, _mm512_shuffle_f32x4(ifft576, ifft576, 177));
__m512 ifft669 = _mm512_fmadd_ps(ifft661, ifft582, _mm512_shuffle_f32x4(ifft661, ifft661, 177));
__m512 ifft586 = _mm512_fmadd_ps(ifft577, ifft582, _mm512_shuffle_f32x4(ifft577, ifft577, 177));
__m512 ifft670 = _mm512_fmadd_ps(ifft662, ifft582, _mm512_shuffle_f32x4(ifft662, ifft662, 177));
__m512 ifft587 = _mm512_fmadd_ps(ifft578, ifft582, _mm512_shuffle_f32x4(ifft578, ifft578, 177));
__m512 ifft671 = _mm512_fmadd_ps(ifft663, ifft582, _mm512_shuffle_f32x4(ifft663, ifft663, 177));
// These two use fnmsub, i.e. -(x*sign) - shuffled: the same butterfly with
// the whole result negated.
__m512 ifft588 = _mm512_fnmsub_ps(ifft579, ifft582, _mm512_shuffle_f32x4(ifft579, ifft579, 177));
__m512 ifft672 = _mm512_fnmsub_ps(ifft664, ifft582, _mm512_shuffle_f32x4(ifft664, ifft664, 177));
__m512 ifft589 = _mm512_fmadd_ps(ifft580, ifft582, _mm512_shuffle_f32x4(ifft580, ifft580, 177));
__m512 ifft673 = _mm512_fmadd_ps(ifft665, ifft582, _mm512_shuffle_f32x4(ifft665, ifft665, 177));
__m512 ifft590 = _mm512_fmadd_ps(ifft581, ifft582, _mm512_shuffle_f32x4(ifft581, ifft581, 177));
__m512 ifft674 = _mm512_fmadd_ps(ifft666, ifft582, _mm512_shuffle_f32x4(ifft666, ifft666, 177));
// From here on the arithmetic is lane-wise between different vectors:
// an add/sub network over the eight vectors of each chain.
__m512 ifft591 = _mm512_add_ps(ifft583, ifft584);
__m512 ifft675 = _mm512_add_ps(ifft667, ifft668);
__m512 ifft592 = _mm512_sub_ps(ifft583, ifft584);
__m512 ifft676 = _mm512_sub_ps(ifft667, ifft668);
__m512 ifft593 = _mm512_sub_ps(ifft585, ifft589);
__m512 ifft677 = _mm512_sub_ps(ifft669, ifft673);
__m512 ifft594 = _mm512_add_ps(ifft586, ifft590);
__m512 ifft678 = _mm512_add_ps(ifft670, ifft674);
__m512 ifft595 = _mm512_add_ps(ifft585, ifft589);
__m512 ifft679 = _mm512_add_ps(ifft669, ifft673);
__m512 ifft596 = _mm512_sub_ps(ifft586, ifft590);
__m512 ifft680 = _mm512_sub_ps(ifft670, ifft674);
// Scale factors: 3.125e-02 == 1/32 and 1.5625e-02 == 1/64.
// NOTE(review): presumably the inverse-transform normalization folded into
// the last passes -- confirm against the generator.
__m512 ifft597 = _mm512_mul_ps(ifft587, _mm512_set1_ps(3.125e-02f));
__m512 ifft681 = _mm512_mul_ps(ifft671, _mm512_set1_ps(3.125e-02f));
__m512 ifft598 = _mm512_mul_ps(ifft588, _mm512_set1_ps(3.125e-02f));
__m512 ifft682 = _mm512_mul_ps(ifft672, _mm512_set1_ps(3.125e-02f));
__m512 ifft599 = _mm512_fmadd_ps(ifft591, _mm512_set1_ps(1.5625e-02f), ifft597);
__m512 ifft683 = _mm512_fmadd_ps(ifft675, _mm512_set1_ps(1.5625e-02f), ifft681);
__m512 ifft600 = _mm512_fmsub_ps(ifft591, _mm512_set1_ps(1.5625e-02f), ifft597);
__m512 ifft684 = _mm512_fmsub_ps(ifft675, _mm512_set1_ps(1.5625e-02f), ifft681);
__m512 ifft601 = _mm512_fmadd_ps(ifft592, _mm512_set1_ps(1.5625e-02f), ifft598);
__m512 ifft685 = _mm512_fmadd_ps(ifft676, _mm512_set1_ps(1.5625e-02f), ifft682);
__m512 ifft602 = _mm512_fmsub_ps(ifft592, _mm512_set1_ps(1.5625e-02f), ifft598);
__m512 ifft686 = _mm512_fmsub_ps(ifft676, _mm512_set1_ps(1.5625e-02f), ifft682);
__m512 ifft603 = _mm512_add_ps(ifft593, ifft594);
__m512 ifft687 = _mm512_add_ps(ifft677, ifft678);
__m512 ifft604 = _mm512_sub_ps(ifft593, ifft594);
__m512 ifft688 = _mm512_sub_ps(ifft677, ifft678);
// 7.0710677e-01 is sqrt(1/2) rounded to float.
__m512 ifft605 = _mm512_fnmadd_ps(ifft603, _mm512_set1_ps(7.0710677e-01f), ifft595);
__m512 ifft689 = _mm512_fnmadd_ps(ifft687, _mm512_set1_ps(7.0710677e-01f), ifft679);
__m512 ifft606 = _mm512_fmadd_ps(ifft603, _mm512_set1_ps(7.0710677e-01f), ifft595);
__m512 ifft690 = _mm512_fmadd_ps(ifft687, _mm512_set1_ps(7.0710677e-01f), ifft679);
__m512 ifft607 = _mm512_fmadd_ps(ifft604, _mm512_set1_ps(7.0710677e-01f), ifft596);
__m512 ifft691 = _mm512_fmadd_ps(ifft688, _mm512_set1_ps(7.0710677e-01f), ifft680);
__m512 ifft608 = _mm512_fmsub_ps(ifft604, _mm512_set1_ps(7.0710677e-01f), ifft596);
__m512 ifft692 = _mm512_fmsub_ps(ifft688, _mm512_set1_ps(7.0710677e-01f), ifft680);
__m512 ifft609 = _mm512_add_ps(ifft605, ifft606);
__m512 ifft693 = _mm512_add_ps(ifft689, ifft690);
__m512 ifft610 = _mm512_sub_ps(ifft605, ifft606);
__m512 ifft694 = _mm512_sub_ps(ifft689, ifft690);
__m512 ifft611 = _mm512_add_ps(ifft607, ifft608);
__m512 ifft695 = _mm512_add_ps(ifft691, ifft692);
__m512 ifft612 = _mm512_sub_ps(ifft607, ifft608);
__m512 ifft696 = _mm512_sub_ps(ifft691, ifft692);
// Final combine: eight result vectors per chain, each base +/- term/64.
__m512 ifft613 = _mm512_fmadd_ps(ifft609, _mm512_set1_ps(1.5625e-02f), ifft599);
__m512 ifft697 = _mm512_fmadd_ps(ifft693, _mm512_set1_ps(1.5625e-02f), ifft683);
__m512 ifft614 = _mm512_fnmadd_ps(ifft609, _mm512_set1_ps(1.5625e-02f), ifft599);
__m512 ifft698 = _mm512_fnmadd_ps(ifft693, _mm512_set1_ps(1.5625e-02f), ifft683);
__m512 ifft615 = _mm512_fmadd_ps(ifft611, _mm512_set1_ps(1.5625e-02f), ifft601);
__m512 ifft699 = _mm512_fmadd_ps(ifft695, _mm512_set1_ps(1.5625e-02f), ifft685);
__m512 ifft616 = _mm512_fnmadd_ps(ifft611, _mm512_set1_ps(1.5625e-02f), ifft601);
__m512 ifft700 = _mm512_fnmadd_ps(ifft695, _mm512_set1_ps(1.5625e-02f), ifft685);
__m512 ifft617 = _mm512_fnmadd_ps(ifft612, _mm512_set1_ps(1.5625e-02f), ifft600);
__m512 ifft701 = _mm512_fnmadd_ps(ifft696, _mm512_set1_ps(1.5625e-02f), ifft684);
__m512 ifft618 = _mm512_fmadd_ps(ifft612, _mm512_set1_ps(1.5625e-02f), ifft600);
__m512 ifft702 = _mm512_fmadd_ps(ifft696, _mm512_set1_ps(1.5625e-02f), ifft684);
__m512 ifft619 = _mm512_fmadd_ps(ifft610, _mm512_set1_ps(1.5625e-02f), ifft602);
__m512 ifft703 = _mm512_fmadd_ps(ifft694, _mm512_set1_ps(1.5625e-02f), ifft686);
__m512 ifft620 = _mm512_fnmadd_ps(ifft610, _mm512_set1_ps(1.5625e-02f), ifft602);
__m512 ifft704 = _mm512_fnmadd_ps(ifft694, _mm512_set1_ps(1.5625e-02f), ifft686);
// Five of the eight result vectors per chain are kept as dat73..dat82.
__m512 dat73 = ifft613;
__m512 dat78 = ifft697;
__m512 dat74 = ifft615;
__m512 dat79 = ifft699;
__m512 dat75 = ifft617;
__m512 dat80 = ifft701;
__m512 dat76 = ifft619;
__m512 dat81 = ifft703;
__m512 dat77 = ifft614;
__m512 dat82 = ifft698;
// The remaining three per chain are not used; the (void) casts mark them
// as intentionally unused.
(void)ifft616;
(void)ifft700;
(void)ifft618;
(void)ifft702;
(void)ifft620;
(void)ifft704;
// Pack pairs of result vectors into 10-float groups and store them.
// _mm512_permutex2var_ps(a, idx, b) takes lane idx from a when idx < 16 and
// lane idx-16 from b otherwise.
// pm5: output lanes 0-4 <- a[0..4], lanes 5-9 <- b[0..4].
// pm6: output lanes 0-4 <- b[8..12], lanes 5-9 <- a[8..12].
// Output lanes 10-15 are also filled but are never written to memory below.
__m512i pm5 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack15 = _mm512_permutex2var_ps(dat73, pm5, dat78);
__m512i pm6 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack16 = _mm512_permutex2var_ps(dat73, pm6, dat78);
__m512 pack17 = _mm512_permutex2var_ps(dat74, pm5, dat79);
__m512 pack18 = _mm512_permutex2var_ps(dat74, pm6, dat79);
__m512 pack19 = _mm512_permutex2var_ps(dat75, pm5, dat80);
__m512 pack20 = _mm512_permutex2var_ps(dat75, pm6, dat80);
__m512 pack21 = _mm512_permutex2var_ps(dat76, pm5, dat81);
__m512 pack22 = _mm512_permutex2var_ps(dat76, pm6, dat81);
__m512 pack23 = _mm512_permutex2var_ps(dat77, pm5, dat82);
__m512 pack24 = _mm512_permutex2var_ps(dat77, pm6, dat82);
// Mask 1023 == 0x3FF: only lanes 0-9 (ten floats) of each vector are written.
// The ten stores differ only in the leading constant offset; the index terms
// (i9, k9, r3, toH1, toW1, t5) are defined above this excerpt.
// NOTE(review): the offsets look like byte offsets (datPtr2 presumably char*);
// the pointer's declaration is not visible here -- confirm.
_mm512_mask_storeu_ps(datPtr2+0+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+40*t5, 1023, pack15);
_mm512_mask_storeu_ps(datPtr2+336+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+40*t5, 1023, pack16);
_mm512_mask_storeu_ps(datPtr2+48+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+40*t5, 1023, pack17);
_mm512_mask_storeu_ps(datPtr2+384+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+40*t5, 1023, pack18);
_mm512_mask_storeu_ps(datPtr2+96+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+40*t5, 1023, pack19);
_mm512_mask_storeu_ps(datPtr2+432+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+40*t5, 1023, pack20);
_mm512_mask_storeu_ps(datPtr2+144+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+40*t5, 1023, pack21);
_mm512_mask_storeu_ps(datPtr2+480+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+40*t5, 1023, pack22);
_mm512_mask_storeu_ps(datPtr2+192+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+40*t5, 1023, pack23);
_mm512_mask_storeu_ps(datPtr2+528+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+40*t5, 1023, pack24);
// Load the 16 input vectors for this step from sfPtr3 (unaligned loads).
// t6 is fixed at 0, so the 256*t6 term contributes nothing.
// Each sfReN/sfImN pair sits at consecutive constant offsets 64 apart; the
// four groups (117/121, 118/122, 119/123, 120/124) start at constant offsets
// 256, 30592, 60928 and 91264. i9, j5, k9 and r3 are defined above this
// excerpt.
// NOTE(review): names suggest real/imaginary parts of frequency-domain sums;
// offsets look like bytes (64 == one 512-bit vector) -- confirm.
ptrdiff_t t6 = 0;
__m512 sfRe117 = _mm512_loadu_ps(sfPtr3+256+121344*i9+30336*j5+1536*k9+768*r3+256*t6);
__m512 sfIm117 = _mm512_loadu_ps(sfPtr3+320+121344*i9+30336*j5+1536*k9+768*r3+256*t6);
__m512 sfRe121 = _mm512_loadu_ps(sfPtr3+384+121344*i9+30336*j5+1536*k9+768*r3+256*t6);
__m512 sfIm121 = _mm512_loadu_ps(sfPtr3+448+121344*i9+30336*j5+1536*k9+768*r3+256*t6);
__m512 sfRe118 = _mm512_loadu_ps(sfPtr3+30592+121344*i9+30336*j5+1536*k9+768*r3+256*t6);
__m512 sfIm118 = _mm512_loadu_ps(sfPtr3+30656+121344*i9+30336*j5+1536*k9+768*r3+256*t6);
__m512 sfRe122 = _mm512_loadu_ps(sfPtr3+30720+121344*i9+30336*j5+1536*k9+768*r3+256*t6);
__m512 sfIm122 = _mm512_loadu_ps(sfPtr3+30784+121344*i9+30336*j5+1536*k9+768*r3+256*t6);
__m512 sfRe119 = _mm512_loadu_ps(sfPtr3+60928+121344*i9+30336*j5+1536*k9+768*r3+256*t6);
__m512 sfIm119 = _mm512_loadu_ps(sfPtr3+60992+121344*i9+30336*j5+1536*k9+768*r3+256*t6);
__m512 sfRe123 = _mm512_loadu_ps(sfPtr3+61056+121344*i9+30336*j5+1536*k9+768*r3+256*t6);
__m512 sfIm123 = _mm512_loadu_ps(sfPtr3+61120+121344*i9+30336*j5+1536*k9+768*r3+256*t6);
__m512 sfRe120 = _mm512_loadu_ps(sfPtr3+91264+121344*i9+30336*j5+1536*k9+768*r3+256*t6);
__m512 sfIm120 = _mm512_loadu_ps(sfPtr3+91328+121344*i9+30336*j5+1536*k9+768*r3+256*t6);
__m512 sfRe124 = _mm512_loadu_ps(sfPtr3+91392+121344*i9+30336*j5+1536*k9+768*r3+256*t6);
__m512 sfIm124 = _mm512_loadu_ps(sfPtr3+91456+121344*i9+30336*j5+1536*k9+768*r3+256*t6);
// Transform of the 16 loaded vectors into dat83..dat92.
// Two statement chains are interleaved: one consumes sfRe/sfIm117..120, the
// other sfRe/sfIm121..124; they share only the constant vectors.
// NOTE(review): the ifft prefix and the 1/64 scaling suggest an inverse FFT
// over a 16x16 tile -- confirm against the generator.
//
// Step 1 (first Re/Im pair of each chain only): lane permutation followed by
// a masked multiply-add. Mask 65021 == 0xFDFD covers every lane except 1 and
// 9; in those two lanes the result keeps the permuted sfIm (ifft710/ifft800)
// or permuted sfIm (ifft709/ifft799) value unchanged.
// NOTE(review): looks like unpacking of a specially packed first row
// (e.g. DC/Nyquist terms) -- confirm.
__m512i ifft705 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft706 = _mm512_permutexvar_ps(ifft705, sfRe117);
__m512 ifft797 = _mm512_permutexvar_ps(ifft705, sfRe121);
__m512i ifft707 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft708 = _mm512_permutexvar_ps(ifft707, sfRe117);
__m512 ifft798 = _mm512_permutexvar_ps(ifft707, sfRe121);
__m512 ifft709 = _mm512_permutexvar_ps(ifft705, sfIm117);
__m512 ifft799 = _mm512_permutexvar_ps(ifft705, sfIm121);
__m512 ifft710 = _mm512_permutexvar_ps(ifft707, sfIm117);
__m512 ifft800 = _mm512_permutexvar_ps(ifft707, sfIm121);
__m512 ifft711 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft712 = _mm512_mask_fmadd_ps(ifft710, 65021, ifft711, ifft706);
__m512 ifft801 = _mm512_mask_fmadd_ps(ifft800, 65021, ifft711, ifft797);
__m512 ifft713 = _mm512_mask_fnmadd_ps(ifft709, 65021, ifft711, ifft708);
__m512 ifft802 = _mm512_mask_fnmadd_ps(ifft799, 65021, ifft711, ifft798);
// Step 2: butterfly on adjacent lane pairs. x*sign + (x with lanes 2k and
// 2k+1 swapped, shuffle immediate 177); sign is +1 in even lanes and -1 in
// odd lanes, so even lanes get the sum and odd lanes the difference.
__m512 ifft714 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft715 = _mm512_fmadd_ps(ifft712, ifft714, _mm512_shuffle_ps(ifft712, ifft712, 177));
__m512 ifft803 = _mm512_fmadd_ps(ifft801, ifft714, _mm512_shuffle_ps(ifft801, ifft801, 177));
__m512 ifft716 = _mm512_fmadd_ps(ifft713, ifft714, _mm512_shuffle_ps(ifft713, ifft713, 177));
__m512 ifft804 = _mm512_fmadd_ps(ifft802, ifft714, _mm512_shuffle_ps(ifft802, ifft802, 177));
__m512 ifft717 = _mm512_fmadd_ps(sfRe118, ifft714, _mm512_shuffle_ps(sfRe118, sfRe118, 177));
__m512 ifft805 = _mm512_fmadd_ps(sfRe122, ifft714, _mm512_shuffle_ps(sfRe122, sfRe122, 177));
__m512 ifft718 = _mm512_fmadd_ps(sfIm118, ifft714, _mm512_shuffle_ps(sfIm118, sfIm118, 177));
__m512 ifft806 = _mm512_fmadd_ps(sfIm122, ifft714, _mm512_shuffle_ps(sfIm122, sfIm122, 177));
__m512 ifft719 = _mm512_fmadd_ps(sfRe119, ifft714, _mm512_shuffle_ps(sfRe119, sfRe119, 177));
__m512 ifft807 = _mm512_fmadd_ps(sfRe123, ifft714, _mm512_shuffle_ps(sfRe123, sfRe123, 177));
__m512 ifft720 = _mm512_fmadd_ps(sfIm119, ifft714, _mm512_shuffle_ps(sfIm119, sfIm119, 177));
__m512 ifft808 = _mm512_fmadd_ps(sfIm123, ifft714, _mm512_shuffle_ps(sfIm123, sfIm123, 177));
__m512 ifft721 = _mm512_fmadd_ps(sfRe120, ifft714, _mm512_shuffle_ps(sfRe120, sfRe120, 177));
__m512 ifft809 = _mm512_fmadd_ps(sfRe124, ifft714, _mm512_shuffle_ps(sfRe124, sfRe124, 177));
__m512 ifft722 = _mm512_fmadd_ps(sfIm120, ifft714, _mm512_shuffle_ps(sfIm120, sfIm120, 177));
__m512 ifft810 = _mm512_fmadd_ps(sfIm124, ifft714, _mm512_shuffle_ps(sfIm124, sfIm124, 177));
// Step 3: per-lane factor multiply in complex-multiply form. With
// c = ifft723 and s = ifft732, each (re, im) vector pair becomes
// (re*c - im*s, re*s + im*c). The products with c are formed first, then
// the s terms are fused in below.
__m512 ifft723 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft724 = _mm512_mul_ps(ifft715, ifft723);
__m512 ifft811 = _mm512_mul_ps(ifft803, ifft723);
__m512 ifft725 = _mm512_mul_ps(ifft716, ifft723);
__m512 ifft812 = _mm512_mul_ps(ifft804, ifft723);
__m512 ifft726 = _mm512_mul_ps(ifft717, ifft723);
__m512 ifft813 = _mm512_mul_ps(ifft805, ifft723);
__m512 ifft727 = _mm512_mul_ps(ifft718, ifft723);
__m512 ifft814 = _mm512_mul_ps(ifft806, ifft723);
__m512 ifft728 = _mm512_mul_ps(ifft719, ifft723);
__m512 ifft815 = _mm512_mul_ps(ifft807, ifft723);
__m512 ifft729 = _mm512_mul_ps(ifft720, ifft723);
__m512 ifft816 = _mm512_mul_ps(ifft808, ifft723);
__m512 ifft730 = _mm512_mul_ps(ifft721, ifft723);
__m512 ifft817 = _mm512_mul_ps(ifft809, ifft723);
__m512 ifft731 = _mm512_mul_ps(ifft722, ifft723);
__m512 ifft818 = _mm512_mul_ps(ifft810, ifft723);
__m512 ifft732 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft733 = _mm512_fnmadd_ps(ifft716, ifft732, ifft724);
__m512 ifft819 = _mm512_fnmadd_ps(ifft804, ifft732, ifft811);
__m512 ifft734 = _mm512_fmadd_ps(ifft715, ifft732, ifft725);
__m512 ifft820 = _mm512_fmadd_ps(ifft803, ifft732, ifft812);
__m512 ifft735 = _mm512_fnmadd_ps(ifft718, ifft732, ifft726);
__m512 ifft821 = _mm512_fnmadd_ps(ifft806, ifft732, ifft813);
__m512 ifft736 = _mm512_fmadd_ps(ifft717, ifft732, ifft727);
__m512 ifft822 = _mm512_fmadd_ps(ifft805, ifft732, ifft814);
__m512 ifft737 = _mm512_fnmadd_ps(ifft720, ifft732, ifft728);
__m512 ifft823 = _mm512_fnmadd_ps(ifft808, ifft732, ifft815);
__m512 ifft738 = _mm512_fmadd_ps(ifft719, ifft732, ifft729);
__m512 ifft824 = _mm512_fmadd_ps(ifft807, ifft732, ifft816);
__m512 ifft739 = _mm512_fnmadd_ps(ifft722, ifft732, ifft730);
__m512 ifft825 = _mm512_fnmadd_ps(ifft810, ifft732, ifft817);
__m512 ifft740 = _mm512_fmadd_ps(ifft721, ifft732, ifft731);
__m512 ifft826 = _mm512_fmadd_ps(ifft809, ifft732, ifft818);
// Step 4: butterfly at distance two lanes. x*sign + (x with the two 64-bit
// halves of every 128-bit lane swapped, shuffle immediate 78); sign is +1 in
// the lower two lanes of each group of four and -1 in the upper two.
__m512 ifft741 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft742 = _mm512_fmadd_ps(ifft733, ifft741, _mm512_shuffle_ps(ifft733, ifft733, 78));
__m512 ifft827 = _mm512_fmadd_ps(ifft819, ifft741, _mm512_shuffle_ps(ifft819, ifft819, 78));
__m512 ifft743 = _mm512_fmadd_ps(ifft734, ifft741, _mm512_shuffle_ps(ifft734, ifft734, 78));
__m512 ifft828 = _mm512_fmadd_ps(ifft820, ifft741, _mm512_shuffle_ps(ifft820, ifft820, 78));
__m512 ifft744 = _mm512_fmadd_ps(ifft735, ifft741, _mm512_shuffle_ps(ifft735, ifft735, 78));
__m512 ifft829 = _mm512_fmadd_ps(ifft821, ifft741, _mm512_shuffle_ps(ifft821, ifft821, 78));
__m512 ifft745 = _mm512_fmadd_ps(ifft736, ifft741, _mm512_shuffle_ps(ifft736, ifft736, 78));
__m512 ifft830 = _mm512_fmadd_ps(ifft822, ifft741, _mm512_shuffle_ps(ifft822, ifft822, 78));
__m512 ifft746 = _mm512_fmadd_ps(ifft737, ifft741, _mm512_shuffle_ps(ifft737, ifft737, 78));
__m512 ifft831 = _mm512_fmadd_ps(ifft823, ifft741, _mm512_shuffle_ps(ifft823, ifft823, 78));
__m512 ifft747 = _mm512_fmadd_ps(ifft738, ifft741, _mm512_shuffle_ps(ifft738, ifft738, 78));
__m512 ifft832 = _mm512_fmadd_ps(ifft824, ifft741, _mm512_shuffle_ps(ifft824, ifft824, 78));
__m512 ifft748 = _mm512_fmadd_ps(ifft739, ifft741, _mm512_shuffle_ps(ifft739, ifft739, 78));
__m512 ifft833 = _mm512_fmadd_ps(ifft825, ifft741, _mm512_shuffle_ps(ifft825, ifft825, 78));
__m512 ifft749 = _mm512_fmadd_ps(ifft740, ifft741, _mm512_shuffle_ps(ifft740, ifft740, 78));
__m512 ifft834 = _mm512_fmadd_ps(ifft826, ifft741, _mm512_shuffle_ps(ifft826, ifft826, 78));
// Step 5: mask 49344 == 0xC0C0 selects lanes 6, 7, 14 and 15. In those lanes
// each pair (a, b) becomes (-b, a); all other lanes pass through unchanged.
__m512 ifft750 = _mm512_mask_sub_ps(ifft742, 49344, _mm512_setzero_ps(), ifft743);
__m512 ifft835 = _mm512_mask_sub_ps(ifft827, 49344, _mm512_setzero_ps(), ifft828);
__m512 ifft751 = _mm512_mask_mov_ps(ifft743, 49344, ifft742);
__m512 ifft836 = _mm512_mask_mov_ps(ifft828, 49344, ifft827);
__m512 ifft752 = _mm512_mask_sub_ps(ifft744, 49344, _mm512_setzero_ps(), ifft745);
__m512 ifft837 = _mm512_mask_sub_ps(ifft829, 49344, _mm512_setzero_ps(), ifft830);
__m512 ifft753 = _mm512_mask_mov_ps(ifft745, 49344, ifft744);
__m512 ifft838 = _mm512_mask_mov_ps(ifft830, 49344, ifft829);
__m512 ifft754 = _mm512_mask_sub_ps(ifft746, 49344, _mm512_setzero_ps(), ifft747);
__m512 ifft839 = _mm512_mask_sub_ps(ifft831, 49344, _mm512_setzero_ps(), ifft832);
__m512 ifft755 = _mm512_mask_mov_ps(ifft747, 49344, ifft746);
__m512 ifft840 = _mm512_mask_mov_ps(ifft832, 49344, ifft831);
__m512 ifft756 = _mm512_mask_sub_ps(ifft748, 49344, _mm512_setzero_ps(), ifft749);
__m512 ifft841 = _mm512_mask_sub_ps(ifft833, 49344, _mm512_setzero_ps(), ifft834);
__m512 ifft757 = _mm512_mask_mov_ps(ifft749, 49344, ifft748);
__m512 ifft842 = _mm512_mask_mov_ps(ifft834, 49344, ifft833);
// Step 6: butterfly across adjacent 128-bit lanes. x*sign + (x with 128-bit
// lanes 0<->1 and 2<->3 swapped, shuffle_f32x4 immediate 177); sign is +1 in
// lanes 0-3 and 8-11, -1 in lanes 4-7 and 12-15.
__m512 ifft758 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft759 = _mm512_fmadd_ps(ifft750, ifft758, _mm512_shuffle_f32x4(ifft750, ifft750, 177));
__m512 ifft843 = _mm512_fmadd_ps(ifft835, ifft758, _mm512_shuffle_f32x4(ifft835, ifft835, 177));
__m512 ifft760 = _mm512_fmadd_ps(ifft751, ifft758, _mm512_shuffle_f32x4(ifft751, ifft751, 177));
__m512 ifft844 = _mm512_fmadd_ps(ifft836, ifft758, _mm512_shuffle_f32x4(ifft836, ifft836, 177));
__m512 ifft761 = _mm512_fmadd_ps(ifft752, ifft758, _mm512_shuffle_f32x4(ifft752, ifft752, 177));
__m512 ifft845 = _mm512_fmadd_ps(ifft837, ifft758, _mm512_shuffle_f32x4(ifft837, ifft837, 177));
__m512 ifft762 = _mm512_fmadd_ps(ifft753, ifft758, _mm512_shuffle_f32x4(ifft753, ifft753, 177));
__m512 ifft846 = _mm512_fmadd_ps(ifft838, ifft758, _mm512_shuffle_f32x4(ifft838, ifft838, 177));
__m512 ifft763 = _mm512_fmadd_ps(ifft754, ifft758, _mm512_shuffle_f32x4(ifft754, ifft754, 177));
__m512 ifft847 = _mm512_fmadd_ps(ifft839, ifft758, _mm512_shuffle_f32x4(ifft839, ifft839, 177));
// These two use fnmsub, i.e. -(x*sign) - shuffled: the same butterfly with
// the whole result negated.
__m512 ifft764 = _mm512_fnmsub_ps(ifft755, ifft758, _mm512_shuffle_f32x4(ifft755, ifft755, 177));
__m512 ifft848 = _mm512_fnmsub_ps(ifft840, ifft758, _mm512_shuffle_f32x4(ifft840, ifft840, 177));
__m512 ifft765 = _mm512_fmadd_ps(ifft756, ifft758, _mm512_shuffle_f32x4(ifft756, ifft756, 177));
__m512 ifft849 = _mm512_fmadd_ps(ifft841, ifft758, _mm512_shuffle_f32x4(ifft841, ifft841, 177));
__m512 ifft766 = _mm512_fmadd_ps(ifft757, ifft758, _mm512_shuffle_f32x4(ifft757, ifft757, 177));
__m512 ifft850 = _mm512_fmadd_ps(ifft842, ifft758, _mm512_shuffle_f32x4(ifft842, ifft842, 177));
// Step 7: lane-wise add/sub network across the eight vectors of each chain.
__m512 ifft767 = _mm512_add_ps(ifft759, ifft760);
__m512 ifft851 = _mm512_add_ps(ifft843, ifft844);
__m512 ifft768 = _mm512_sub_ps(ifft759, ifft760);
__m512 ifft852 = _mm512_sub_ps(ifft843, ifft844);
__m512 ifft769 = _mm512_sub_ps(ifft761, ifft765);
__m512 ifft853 = _mm512_sub_ps(ifft845, ifft849);
__m512 ifft770 = _mm512_add_ps(ifft762, ifft766);
__m512 ifft854 = _mm512_add_ps(ifft846, ifft850);
__m512 ifft771 = _mm512_add_ps(ifft761, ifft765);
__m512 ifft855 = _mm512_add_ps(ifft845, ifft849);
__m512 ifft772 = _mm512_sub_ps(ifft762, ifft766);
__m512 ifft856 = _mm512_sub_ps(ifft846, ifft850);
// Scale factors: 3.125e-02 == 1/32 and 1.5625e-02 == 1/64.
__m512 ifft773 = _mm512_mul_ps(ifft763, _mm512_set1_ps(3.125e-02f));
__m512 ifft857 = _mm512_mul_ps(ifft847, _mm512_set1_ps(3.125e-02f));
__m512 ifft774 = _mm512_mul_ps(ifft764, _mm512_set1_ps(3.125e-02f));
__m512 ifft858 = _mm512_mul_ps(ifft848, _mm512_set1_ps(3.125e-02f));
__m512 ifft775 = _mm512_fmadd_ps(ifft767, _mm512_set1_ps(1.5625e-02f), ifft773);
__m512 ifft859 = _mm512_fmadd_ps(ifft851, _mm512_set1_ps(1.5625e-02f), ifft857);
__m512 ifft776 = _mm512_fmsub_ps(ifft767, _mm512_set1_ps(1.5625e-02f), ifft773);
__m512 ifft860 = _mm512_fmsub_ps(ifft851, _mm512_set1_ps(1.5625e-02f), ifft857);
__m512 ifft777 = _mm512_fmadd_ps(ifft768, _mm512_set1_ps(1.5625e-02f), ifft774);
__m512 ifft861 = _mm512_fmadd_ps(ifft852, _mm512_set1_ps(1.5625e-02f), ifft858);
__m512 ifft778 = _mm512_fmsub_ps(ifft768, _mm512_set1_ps(1.5625e-02f), ifft774);
__m512 ifft862 = _mm512_fmsub_ps(ifft852, _mm512_set1_ps(1.5625e-02f), ifft858);
__m512 ifft779 = _mm512_add_ps(ifft769, ifft770);
__m512 ifft863 = _mm512_add_ps(ifft853, ifft854);
__m512 ifft780 = _mm512_sub_ps(ifft769, ifft770);
__m512 ifft864 = _mm512_sub_ps(ifft853, ifft854);
// 7.0710677e-01 is sqrt(1/2) rounded to float.
__m512 ifft781 = _mm512_fnmadd_ps(ifft779, _mm512_set1_ps(7.0710677e-01f), ifft771);
__m512 ifft865 = _mm512_fnmadd_ps(ifft863, _mm512_set1_ps(7.0710677e-01f), ifft855);
__m512 ifft782 = _mm512_fmadd_ps(ifft779, _mm512_set1_ps(7.0710677e-01f), ifft771);
__m512 ifft866 = _mm512_fmadd_ps(ifft863, _mm512_set1_ps(7.0710677e-01f), ifft855);
__m512 ifft783 = _mm512_fmadd_ps(ifft780, _mm512_set1_ps(7.0710677e-01f), ifft772);
__m512 ifft867 = _mm512_fmadd_ps(ifft864, _mm512_set1_ps(7.0710677e-01f), ifft856);
__m512 ifft784 = _mm512_fmsub_ps(ifft780, _mm512_set1_ps(7.0710677e-01f), ifft772);
__m512 ifft868 = _mm512_fmsub_ps(ifft864, _mm512_set1_ps(7.0710677e-01f), ifft856);
__m512 ifft785 = _mm512_add_ps(ifft781, ifft782);
__m512 ifft869 = _mm512_add_ps(ifft865, ifft866);
__m512 ifft786 = _mm512_sub_ps(ifft781, ifft782);
__m512 ifft870 = _mm512_sub_ps(ifft865, ifft866);
__m512 ifft787 = _mm512_add_ps(ifft783, ifft784);
__m512 ifft871 = _mm512_add_ps(ifft867, ifft868);
__m512 ifft788 = _mm512_sub_ps(ifft783, ifft784);
__m512 ifft872 = _mm512_sub_ps(ifft867, ifft868);
// Final combine: eight result vectors per chain, each base +/- term/64.
__m512 ifft789 = _mm512_fmadd_ps(ifft785, _mm512_set1_ps(1.5625e-02f), ifft775);
__m512 ifft873 = _mm512_fmadd_ps(ifft869, _mm512_set1_ps(1.5625e-02f), ifft859);
__m512 ifft790 = _mm512_fnmadd_ps(ifft785, _mm512_set1_ps(1.5625e-02f), ifft775);
__m512 ifft874 = _mm512_fnmadd_ps(ifft869, _mm512_set1_ps(1.5625e-02f), ifft859);
__m512 ifft791 = _mm512_fmadd_ps(ifft787, _mm512_set1_ps(1.5625e-02f), ifft777);
__m512 ifft875 = _mm512_fmadd_ps(ifft871, _mm512_set1_ps(1.5625e-02f), ifft861);
__m512 ifft792 = _mm512_fnmadd_ps(ifft787, _mm512_set1_ps(1.5625e-02f), ifft777);
__m512 ifft876 = _mm512_fnmadd_ps(ifft871, _mm512_set1_ps(1.5625e-02f), ifft861);
__m512 ifft793 = _mm512_fnmadd_ps(ifft788, _mm512_set1_ps(1.5625e-02f), ifft776);
__m512 ifft877 = _mm512_fnmadd_ps(ifft872, _mm512_set1_ps(1.5625e-02f), ifft860);
__m512 ifft794 = _mm512_fmadd_ps(ifft788, _mm512_set1_ps(1.5625e-02f), ifft776);
__m512 ifft878 = _mm512_fmadd_ps(ifft872, _mm512_set1_ps(1.5625e-02f), ifft860);
__m512 ifft795 = _mm512_fmadd_ps(ifft786, _mm512_set1_ps(1.5625e-02f), ifft778);
__m512 ifft879 = _mm512_fmadd_ps(ifft870, _mm512_set1_ps(1.5625e-02f), ifft862);
__m512 ifft796 = _mm512_fnmadd_ps(ifft786, _mm512_set1_ps(1.5625e-02f), ifft778);
__m512 ifft880 = _mm512_fnmadd_ps(ifft870, _mm512_set1_ps(1.5625e-02f), ifft862);
// Five of the eight result vectors per chain are kept as dat83..dat92.
__m512 dat83 = ifft789;
__m512 dat88 = ifft873;
__m512 dat84 = ifft791;
__m512 dat89 = ifft875;
__m512 dat85 = ifft793;
__m512 dat90 = ifft877;
__m512 dat86 = ifft795;
__m512 dat91 = ifft879;
__m512 dat87 = ifft790;
__m512 dat92 = ifft874;
// The remaining three per chain are not used; the (void) casts mark them
// as intentionally unused.
(void)ifft792;
(void)ifft876;
(void)ifft794;
(void)ifft878;
(void)ifft796;
(void)ifft880;
// Store selected lanes of dat83..dat92 to datPtr2. A masked store writes
// lane i at (address + i floats), so the lane position is part of the
// destination. Masks used: 3 == lanes 0-1, 31 == lanes 0-4,
// 768 == 0x300 == lanes 8-9, 7936 == 0x1F00 == lanes 8-12.
// The 0*t6 term is always zero. Only dat83, dat84, dat88 and dat89 are stored
// twice; dat85..dat87 get only the mask-3 store and dat90..dat92 only the
// mask-768 store.
// NOTE(review): the uneven coverage presumably reflects the edge of the
// output tile -- confirm against the generator.
_mm512_mask_storeu_ps(datPtr2+40+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+0*t6, 3, dat83);
_mm512_mask_storeu_ps(datPtr2+544+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+0*t6, 7936, dat83);
_mm512_mask_storeu_ps(datPtr2+240+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+0*t6, 31, dat88);
_mm512_mask_storeu_ps(datPtr2+344+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+0*t6, 768, dat88);
_mm512_mask_storeu_ps(datPtr2+88+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+0*t6, 3, dat84);
_mm512_mask_storeu_ps(datPtr2+592+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+0*t6, 7936, dat84);
_mm512_mask_storeu_ps(datPtr2+288+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+0*t6, 31, dat89);
_mm512_mask_storeu_ps(datPtr2+392+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+0*t6, 768, dat89);
_mm512_mask_storeu_ps(datPtr2+136+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+0*t6, 3, dat85);
_mm512_mask_storeu_ps(datPtr2+440+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+0*t6, 768, dat90);
_mm512_mask_storeu_ps(datPtr2+184+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+0*t6, 3, dat86);
_mm512_mask_storeu_ps(datPtr2+488+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+0*t6, 768, dat91);
_mm512_mask_storeu_ps(datPtr2+232+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+0*t6, 3, dat87);
_mm512_mask_storeu_ps(datPtr2+536+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+0*t6, 768, dat92);
// Load the 16 input vectors for the next step from sfPtr3 (unaligned loads).
// t7 is fixed at 0, so the 256*t7 term contributes nothing.
// Same layout as the previous load group, shifted by 256: the four groups
// (125/129, 126/130, 127/131, 128/132) start at constant offsets 512, 30848,
// 61184 and 91520, with each Re/Im vector 64 apart. i9, j5, k9 and r3 are
// defined above this excerpt.
// NOTE(review): offsets look like bytes (64 == one 512-bit vector) -- confirm.
ptrdiff_t t7 = 0;
__m512 sfRe125 = _mm512_loadu_ps(sfPtr3+512+121344*i9+30336*j5+1536*k9+768*r3+256*t7);
__m512 sfIm125 = _mm512_loadu_ps(sfPtr3+576+121344*i9+30336*j5+1536*k9+768*r3+256*t7);
__m512 sfRe129 = _mm512_loadu_ps(sfPtr3+640+121344*i9+30336*j5+1536*k9+768*r3+256*t7);
__m512 sfIm129 = _mm512_loadu_ps(sfPtr3+704+121344*i9+30336*j5+1536*k9+768*r3+256*t7);
__m512 sfRe126 = _mm512_loadu_ps(sfPtr3+30848+121344*i9+30336*j5+1536*k9+768*r3+256*t7);
__m512 sfIm126 = _mm512_loadu_ps(sfPtr3+30912+121344*i9+30336*j5+1536*k9+768*r3+256*t7);
__m512 sfRe130 = _mm512_loadu_ps(sfPtr3+30976+121344*i9+30336*j5+1536*k9+768*r3+256*t7);
__m512 sfIm130 = _mm512_loadu_ps(sfPtr3+31040+121344*i9+30336*j5+1536*k9+768*r3+256*t7);
__m512 sfRe127 = _mm512_loadu_ps(sfPtr3+61184+121344*i9+30336*j5+1536*k9+768*r3+256*t7);
__m512 sfIm127 = _mm512_loadu_ps(sfPtr3+61248+121344*i9+30336*j5+1536*k9+768*r3+256*t7);
__m512 sfRe131 = _mm512_loadu_ps(sfPtr3+61312+121344*i9+30336*j5+1536*k9+768*r3+256*t7);
__m512 sfIm131 = _mm512_loadu_ps(sfPtr3+61376+121344*i9+30336*j5+1536*k9+768*r3+256*t7);
__m512 sfRe128 = _mm512_loadu_ps(sfPtr3+91520+121344*i9+30336*j5+1536*k9+768*r3+256*t7);
__m512 sfIm128 = _mm512_loadu_ps(sfPtr3+91584+121344*i9+30336*j5+1536*k9+768*r3+256*t7);
__m512 sfRe132 = _mm512_loadu_ps(sfPtr3+91648+121344*i9+30336*j5+1536*k9+768*r3+256*t7);
__m512 sfIm132 = _mm512_loadu_ps(sfPtr3+91712+121344*i9+30336*j5+1536*k9+768*r3+256*t7);
// Start of the transform of sfRe/sfIm125..132; the sequence continues past
// the end of this excerpt. Two statement chains are interleaved (125..128 and
// 129..132) and share only the constant vectors.
//
// Step 1 (first Re/Im pair of each chain only): lane permutation followed by
// a masked multiply-add. Mask 65021 == 0xFDFD covers every lane except 1 and
// 9, where the permuted sfIm value passes through unchanged.
// NOTE(review): looks like unpacking of a specially packed first row -- confirm.
__m512i ifft881 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft882 = _mm512_permutexvar_ps(ifft881, sfRe125);
__m512 ifft973 = _mm512_permutexvar_ps(ifft881, sfRe129);
__m512i ifft883 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft884 = _mm512_permutexvar_ps(ifft883, sfRe125);
__m512 ifft974 = _mm512_permutexvar_ps(ifft883, sfRe129);
__m512 ifft885 = _mm512_permutexvar_ps(ifft881, sfIm125);
__m512 ifft975 = _mm512_permutexvar_ps(ifft881, sfIm129);
__m512 ifft886 = _mm512_permutexvar_ps(ifft883, sfIm125);
__m512 ifft976 = _mm512_permutexvar_ps(ifft883, sfIm129);
__m512 ifft887 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft888 = _mm512_mask_fmadd_ps(ifft886, 65021, ifft887, ifft882);
__m512 ifft977 = _mm512_mask_fmadd_ps(ifft976, 65021, ifft887, ifft973);
__m512 ifft889 = _mm512_mask_fnmadd_ps(ifft885, 65021, ifft887, ifft884);
__m512 ifft978 = _mm512_mask_fnmadd_ps(ifft975, 65021, ifft887, ifft974);
// Step 2: butterfly on adjacent lane pairs. x*sign + (x with lanes 2k and
// 2k+1 swapped, shuffle immediate 177); even lanes get the sum, odd lanes
// the difference.
__m512 ifft890 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft891 = _mm512_fmadd_ps(ifft888, ifft890, _mm512_shuffle_ps(ifft888, ifft888, 177));
__m512 ifft979 = _mm512_fmadd_ps(ifft977, ifft890, _mm512_shuffle_ps(ifft977, ifft977, 177));
__m512 ifft892 = _mm512_fmadd_ps(ifft889, ifft890, _mm512_shuffle_ps(ifft889, ifft889, 177));
__m512 ifft980 = _mm512_fmadd_ps(ifft978, ifft890, _mm512_shuffle_ps(ifft978, ifft978, 177));
__m512 ifft893 = _mm512_fmadd_ps(sfRe126, ifft890, _mm512_shuffle_ps(sfRe126, sfRe126, 177));
__m512 ifft981 = _mm512_fmadd_ps(sfRe130, ifft890, _mm512_shuffle_ps(sfRe130, sfRe130, 177));
__m512 ifft894 = _mm512_fmadd_ps(sfIm126, ifft890, _mm512_shuffle_ps(sfIm126, sfIm126, 177));
__m512 ifft982 = _mm512_fmadd_ps(sfIm130, ifft890, _mm512_shuffle_ps(sfIm130, sfIm130, 177));
__m512 ifft895 = _mm512_fmadd_ps(sfRe127, ifft890, _mm512_shuffle_ps(sfRe127, sfRe127, 177));
__m512 ifft983 = _mm512_fmadd_ps(sfRe131, ifft890, _mm512_shuffle_ps(sfRe131, sfRe131, 177));
__m512 ifft896 = _mm512_fmadd_ps(sfIm127, ifft890, _mm512_shuffle_ps(sfIm127, sfIm127, 177));
__m512 ifft984 = _mm512_fmadd_ps(sfIm131, ifft890, _mm512_shuffle_ps(sfIm131, sfIm131, 177));
__m512 ifft897 = _mm512_fmadd_ps(sfRe128, ifft890, _mm512_shuffle_ps(sfRe128, sfRe128, 177));
__m512 ifft985 = _mm512_fmadd_ps(sfRe132, ifft890, _mm512_shuffle_ps(sfRe132, sfRe132, 177));
__m512 ifft898 = _mm512_fmadd_ps(sfIm128, ifft890, _mm512_shuffle_ps(sfIm128, sfIm128, 177));
__m512 ifft986 = _mm512_fmadd_ps(sfIm132, ifft890, _mm512_shuffle_ps(sfIm132, sfIm132, 177));
// Step 3: per-lane factor multiply in complex-multiply form. With
// c = ifft899 and s = ifft908, each (re, im) vector pair becomes
// (re*c - im*s, re*s + im*c).
__m512 ifft899 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft900 = _mm512_mul_ps(ifft891, ifft899);
__m512 ifft987 = _mm512_mul_ps(ifft979, ifft899);
__m512 ifft901 = _mm512_mul_ps(ifft892, ifft899);
__m512 ifft988 = _mm512_mul_ps(ifft980, ifft899);
__m512 ifft902 = _mm512_mul_ps(ifft893, ifft899);
__m512 ifft989 = _mm512_mul_ps(ifft981, ifft899);
__m512 ifft903 = _mm512_mul_ps(ifft894, ifft899);
__m512 ifft990 = _mm512_mul_ps(ifft982, ifft899);
__m512 ifft904 = _mm512_mul_ps(ifft895, ifft899);
__m512 ifft991 = _mm512_mul_ps(ifft983, ifft899);
__m512 ifft905 = _mm512_mul_ps(ifft896, ifft899);
__m512 ifft992 = _mm512_mul_ps(ifft984, ifft899);
__m512 ifft906 = _mm512_mul_ps(ifft897, ifft899);
__m512 ifft993 = _mm512_mul_ps(ifft985, ifft899);
__m512 ifft907 = _mm512_mul_ps(ifft898, ifft899);
__m512 ifft994 = _mm512_mul_ps(ifft986, ifft899);
__m512 ifft908 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft909 = _mm512_fnmadd_ps(ifft892, ifft908, ifft900);
__m512 ifft995 = _mm512_fnmadd_ps(ifft980, ifft908, ifft987);
__m512 ifft910 = _mm512_fmadd_ps(ifft891, ifft908, ifft901);
__m512 ifft996 = _mm512_fmadd_ps(ifft979, ifft908, ifft988);
__m512 ifft911 = _mm512_fnmadd_ps(ifft894, ifft908, ifft902);
__m512 ifft997 = _mm512_fnmadd_ps(ifft982, ifft908, ifft989);
__m512 ifft912 = _mm512_fmadd_ps(ifft893, ifft908, ifft903);
__m512 ifft998 = _mm512_fmadd_ps(ifft981, ifft908, ifft990);
__m512 ifft913 = _mm512_fnmadd_ps(ifft896, ifft908, ifft904);
__m512 ifft999 = _mm512_fnmadd_ps(ifft984, ifft908, ifft991);
__m512 ifft914 = _mm512_fmadd_ps(ifft895, ifft908, ifft905);
__m512 ifft1000 = _mm512_fmadd_ps(ifft983, ifft908, ifft992);
__m512 ifft915 = _mm512_fnmadd_ps(ifft898, ifft908, ifft906);
__m512 ifft1001 = _mm512_fnmadd_ps(ifft986, ifft908, ifft993);
__m512 ifft916 = _mm512_fmadd_ps(ifft897, ifft908, ifft907);
__m512 ifft1002 = _mm512_fmadd_ps(ifft985, ifft908, ifft994);
// Step 4: butterfly at distance two lanes. x*sign + (x with the two 64-bit
// halves of every 128-bit lane swapped, shuffle immediate 78).
__m512 ifft917 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft918 = _mm512_fmadd_ps(ifft909, ifft917, _mm512_shuffle_ps(ifft909, ifft909, 78));
__m512 ifft1003 = _mm512_fmadd_ps(ifft995, ifft917, _mm512_shuffle_ps(ifft995, ifft995, 78));
__m512 ifft919 = _mm512_fmadd_ps(ifft910, ifft917, _mm512_shuffle_ps(ifft910, ifft910, 78));
__m512 ifft1004 = _mm512_fmadd_ps(ifft996, ifft917, _mm512_shuffle_ps(ifft996, ifft996, 78));
__m512 ifft920 = _mm512_fmadd_ps(ifft911, ifft917, _mm512_shuffle_ps(ifft911, ifft911, 78));
__m512 ifft1005 = _mm512_fmadd_ps(ifft997, ifft917, _mm512_shuffle_ps(ifft997, ifft997, 78));
__m512 ifft921 = _mm512_fmadd_ps(ifft912, ifft917, _mm512_shuffle_ps(ifft912, ifft912, 78));
__m512 ifft1006 = _mm512_fmadd_ps(ifft998, ifft917, _mm512_shuffle_ps(ifft998, ifft998, 78));
__m512 ifft922 = _mm512_fmadd_ps(ifft913, ifft917, _mm512_shuffle_ps(ifft913, ifft913, 78));
__m512 ifft1007 = _mm512_fmadd_ps(ifft999, ifft917, _mm512_shuffle_ps(ifft999, ifft999, 78));
__m512 ifft923 = _mm512_fmadd_ps(ifft914, ifft917, _mm512_shuffle_ps(ifft914, ifft914, 78));
__m512 ifft1008 = _mm512_fmadd_ps(ifft1000, ifft917, _mm512_shuffle_ps(ifft1000, ifft1000, 78));
__m512 ifft924 = _mm512_fmadd_ps(ifft915, ifft917, _mm512_shuffle_ps(ifft915, ifft915, 78));
__m512 ifft1009 = _mm512_fmadd_ps(ifft1001, ifft917, _mm512_shuffle_ps(ifft1001, ifft1001, 78));
__m512 ifft925 = _mm512_fmadd_ps(ifft916, ifft917, _mm512_shuffle_ps(ifft916, ifft916, 78));
__m512 ifft1010 = _mm512_fmadd_ps(ifft1002, ifft917, _mm512_shuffle_ps(ifft1002, ifft1002, 78));
// Step 5: mask 49344 == 0xC0C0 selects lanes 6, 7, 14 and 15. In those lanes
// each pair (a, b) becomes (-b, a); all other lanes pass through unchanged.
__m512 ifft926 = _mm512_mask_sub_ps(ifft918, 49344, _mm512_setzero_ps(), ifft919);
__m512 ifft1011 = _mm512_mask_sub_ps(ifft1003, 49344, _mm512_setzero_ps(), ifft1004);
__m512 ifft927 = _mm512_mask_mov_ps(ifft919, 49344, ifft918);
__m512 ifft1012 = _mm512_mask_mov_ps(ifft1004, 49344, ifft1003);
__m512 ifft928 = _mm512_mask_sub_ps(ifft920, 49344, _mm512_setzero_ps(), ifft921);
__m512 ifft1013 = _mm512_mask_sub_ps(ifft1005, 49344, _mm512_setzero_ps(), ifft1006);
__m512 ifft929 = _mm512_mask_mov_ps(ifft921, 49344, ifft920);
__m512 ifft1014 = _mm512_mask_mov_ps(ifft1006, 49344, ifft1005);
__m512 ifft930 = _mm512_mask_sub_ps(ifft922, 49344, _mm512_setzero_ps(), ifft923);
__m512 ifft1015 = _mm512_mask_sub_ps(ifft1007, 49344, _mm512_setzero_ps(), ifft1008);
__m512 ifft931 = _mm512_mask_mov_ps(ifft923, 49344, ifft922);
__m512 ifft1016 = _mm512_mask_mov_ps(ifft1008, 49344, ifft1007);
__m512 ifft932 = _mm512_mask_sub_ps(ifft924, 49344, _mm512_setzero_ps(), ifft925);
__m512 ifft1017 = _mm512_mask_sub_ps(ifft1009, 49344, _mm512_setzero_ps(), ifft1010);
__m512 ifft933 = _mm512_mask_mov_ps(ifft925, 49344, ifft924);
__m512 ifft1018 = _mm512_mask_mov_ps(ifft1010, 49344, ifft1009);
// Step 6: butterfly across adjacent 128-bit lanes. x*sign + (x with 128-bit
// lanes 0<->1 and 2<->3 swapped, shuffle_f32x4 immediate 177). The rest of
// this pass lies beyond the end of this excerpt.
__m512 ifft934 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft935 = _mm512_fmadd_ps(ifft926, ifft934, _mm512_shuffle_f32x4(ifft926, ifft926, 177));
__m512 ifft1019 = _mm512_fmadd_ps(ifft1011, ifft934, _mm512_shuffle_f32x4(ifft1011, ifft1011, 177));
__m512 ifft936 = _mm512_fmadd_ps(ifft927, ifft934, _mm512_shuffle_f32x4(ifft927, ifft927, 177));
__m512 ifft1020 = _mm512_fmadd_ps(ifft1012, ifft934, _mm512_shuffle_f32x4(ifft1012, ifft1012, 177));
__m512 ifft937 = _mm512_fmadd_ps(ifft928, ifft934, _mm512_shuffle_f32x4(ifft928, ifft928, 177));
__m512 ifft1021 = _mm512_fmadd_ps(ifft1013, ifft934, _mm512_shuffle_f32x4(ifft1013, ifft1013, 177));
__m512 ifft938 = _mm512_fmadd_ps(ifft929, ifft934, _mm512_shuffle_f32x4(ifft929, ifft929, 177));
__m512 ifft1022 = _mm512_fmadd_ps(ifft1014, ifft934, _mm512_shuffle_f32x4(ifft1014, ifft1014, 177));
__m512 ifft939 = _mm512_fmadd_ps(ifft930, ifft934, _mm512_shuffle_f32x4(ifft930, ifft930, 177));
__m512 ifft1023 = _mm512_fmadd_ps(ifft1015, ifft934, _mm512_shuffle_f32x4(ifft1015, ifft1015, 177));
// fnmsub here, i.e. -(x*sign) - shuffled: the same butterfly, negated.
__m512 ifft940 = _mm512_fnmsub_ps(ifft931, ifft934, _mm512_shuffle_f32x4(ifft931, ifft931, 177));
__m512 ifft1024 = _mm512_fnmsub_ps(ifft1016, ifft934, _mm512_shuffle_f32x4(ifft1016, ifft1016, 177));
__m512 ifft941 = _mm512_fmadd_ps(ifft932, ifft934, _mm512_shuffle_f32x4(ifft932, ifft932, 177));
__m512 ifft1025 = _mm512_fmadd_ps(ifft1017, ifft934, _mm512_shuffle_f32x4(ifft1017, ifft1017, 177));
__m512 ifft942 = _mm512_fmadd_ps(ifft933, ifft934, _mm512_shuffle_f32x4(ifft933, ifft933, 177));
__m512 ifft1026 = _mm512_fmadd_ps(ifft1018, ifft934, _mm512_shuffle_f32x4(ifft1018, ifft1018, 177));
__m512 ifft943 = _mm512_add_ps(ifft935, ifft936);
__m512 ifft1027 = _mm512_add_ps(ifft1019, ifft1020);
__m512 ifft944 = _mm512_sub_ps(ifft935, ifft936);
__m512 ifft1028 = _mm512_sub_ps(ifft1019, ifft1020);
__m512 ifft945 = _mm512_sub_ps(ifft937, ifft941);
__m512 ifft1029 = _mm512_sub_ps(ifft1021, ifft1025);
__m512 ifft946 = _mm512_add_ps(ifft938, ifft942);
__m512 ifft1030 = _mm512_add_ps(ifft1022, ifft1026);
__m512 ifft947 = _mm512_add_ps(ifft937, ifft941);
__m512 ifft1031 = _mm512_add_ps(ifft1021, ifft1025);
__m512 ifft948 = _mm512_sub_ps(ifft938, ifft942);
__m512 ifft1032 = _mm512_sub_ps(ifft1022, ifft1026);
__m512 ifft949 = _mm512_mul_ps(ifft939, _mm512_set1_ps(3.125e-02f));
__m512 ifft1033 = _mm512_mul_ps(ifft1023, _mm512_set1_ps(3.125e-02f));
__m512 ifft950 = _mm512_mul_ps(ifft940, _mm512_set1_ps(3.125e-02f));
__m512 ifft1034 = _mm512_mul_ps(ifft1024, _mm512_set1_ps(3.125e-02f));
__m512 ifft951 = _mm512_fmadd_ps(ifft943, _mm512_set1_ps(1.5625e-02f), ifft949);
__m512 ifft1035 = _mm512_fmadd_ps(ifft1027, _mm512_set1_ps(1.5625e-02f), ifft1033);
__m512 ifft952 = _mm512_fmsub_ps(ifft943, _mm512_set1_ps(1.5625e-02f), ifft949);
__m512 ifft1036 = _mm512_fmsub_ps(ifft1027, _mm512_set1_ps(1.5625e-02f), ifft1033);
__m512 ifft953 = _mm512_fmadd_ps(ifft944, _mm512_set1_ps(1.5625e-02f), ifft950);
__m512 ifft1037 = _mm512_fmadd_ps(ifft1028, _mm512_set1_ps(1.5625e-02f), ifft1034);
__m512 ifft954 = _mm512_fmsub_ps(ifft944, _mm512_set1_ps(1.5625e-02f), ifft950);
__m512 ifft1038 = _mm512_fmsub_ps(ifft1028, _mm512_set1_ps(1.5625e-02f), ifft1034);
__m512 ifft955 = _mm512_add_ps(ifft945, ifft946);
__m512 ifft1039 = _mm512_add_ps(ifft1029, ifft1030);
__m512 ifft956 = _mm512_sub_ps(ifft945, ifft946);
__m512 ifft1040 = _mm512_sub_ps(ifft1029, ifft1030);
__m512 ifft957 = _mm512_fnmadd_ps(ifft955, _mm512_set1_ps(7.0710677e-01f), ifft947);
__m512 ifft1041 = _mm512_fnmadd_ps(ifft1039, _mm512_set1_ps(7.0710677e-01f), ifft1031);
__m512 ifft958 = _mm512_fmadd_ps(ifft955, _mm512_set1_ps(7.0710677e-01f), ifft947);
__m512 ifft1042 = _mm512_fmadd_ps(ifft1039, _mm512_set1_ps(7.0710677e-01f), ifft1031);
__m512 ifft959 = _mm512_fmadd_ps(ifft956, _mm512_set1_ps(7.0710677e-01f), ifft948);
__m512 ifft1043 = _mm512_fmadd_ps(ifft1040, _mm512_set1_ps(7.0710677e-01f), ifft1032);
__m512 ifft960 = _mm512_fmsub_ps(ifft956, _mm512_set1_ps(7.0710677e-01f), ifft948);
__m512 ifft1044 = _mm512_fmsub_ps(ifft1040, _mm512_set1_ps(7.0710677e-01f), ifft1032);
__m512 ifft961 = _mm512_add_ps(ifft957, ifft958);
__m512 ifft1045 = _mm512_add_ps(ifft1041, ifft1042);
__m512 ifft962 = _mm512_sub_ps(ifft957, ifft958);
__m512 ifft1046 = _mm512_sub_ps(ifft1041, ifft1042);
__m512 ifft963 = _mm512_add_ps(ifft959, ifft960);
__m512 ifft1047 = _mm512_add_ps(ifft1043, ifft1044);
__m512 ifft964 = _mm512_sub_ps(ifft959, ifft960);
__m512 ifft1048 = _mm512_sub_ps(ifft1043, ifft1044);
__m512 ifft965 = _mm512_fmadd_ps(ifft961, _mm512_set1_ps(1.5625e-02f), ifft951);
__m512 ifft1049 = _mm512_fmadd_ps(ifft1045, _mm512_set1_ps(1.5625e-02f), ifft1035);
__m512 ifft966 = _mm512_fnmadd_ps(ifft961, _mm512_set1_ps(1.5625e-02f), ifft951);
__m512 ifft1050 = _mm512_fnmadd_ps(ifft1045, _mm512_set1_ps(1.5625e-02f), ifft1035);
__m512 ifft967 = _mm512_fmadd_ps(ifft963, _mm512_set1_ps(1.5625e-02f), ifft953);
__m512 ifft1051 = _mm512_fmadd_ps(ifft1047, _mm512_set1_ps(1.5625e-02f), ifft1037);
__m512 ifft968 = _mm512_fnmadd_ps(ifft963, _mm512_set1_ps(1.5625e-02f), ifft953);
__m512 ifft1052 = _mm512_fnmadd_ps(ifft1047, _mm512_set1_ps(1.5625e-02f), ifft1037);
__m512 ifft969 = _mm512_fnmadd_ps(ifft964, _mm512_set1_ps(1.5625e-02f), ifft952);
__m512 ifft1053 = _mm512_fnmadd_ps(ifft1048, _mm512_set1_ps(1.5625e-02f), ifft1036);
__m512 ifft970 = _mm512_fmadd_ps(ifft964, _mm512_set1_ps(1.5625e-02f), ifft952);
__m512 ifft1054 = _mm512_fmadd_ps(ifft1048, _mm512_set1_ps(1.5625e-02f), ifft1036);
__m512 ifft971 = _mm512_fmadd_ps(ifft962, _mm512_set1_ps(1.5625e-02f), ifft954);
__m512 ifft1055 = _mm512_fmadd_ps(ifft1046, _mm512_set1_ps(1.5625e-02f), ifft1038);
__m512 ifft972 = _mm512_fnmadd_ps(ifft962, _mm512_set1_ps(1.5625e-02f), ifft954);
__m512 ifft1056 = _mm512_fnmadd_ps(ifft1046, _mm512_set1_ps(1.5625e-02f), ifft1038);
__m512 dat93 = ifft965;
__m512 dat95 = ifft1049;
__m512 dat94 = ifft967;
__m512 dat96 = ifft1051;
(void)ifft969;
(void)ifft1053;
(void)ifft971;
(void)ifft1055;
(void)ifft966;
(void)ifft1050;
(void)ifft968;
(void)ifft1052;
(void)ifft970;
(void)ifft1054;
(void)ifft972;
(void)ifft1056;
__m512i pm7 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack25 = _mm512_permutex2var_ps(dat93, pm7, dat95);
__m512i pm8 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack26 = _mm512_permutex2var_ps(dat93, pm8, dat95);
__m512 pack27 = _mm512_permutex2var_ps(dat94, pm7, dat96);
__m512 pack28 = _mm512_permutex2var_ps(dat94, pm8, dat96);
_mm512_mask_storeu_ps(datPtr2+260+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+40*t7, 127, pack25);
_mm512_mask_storeu_ps(datPtr2+596+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+40*t7, 127, pack26);
_mm512_mask_storeu_ps(datPtr2+308+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+40*t7, 127, pack27);
_mm512_mask_storeu_ps(datPtr2+644+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+40*t7, 127, pack28);
}
ptrdiff_t t8 = 0;
__m512 sfRe133 = _mm512_loadu_ps(sfPtr3+0+121344*i9+30336*j5+1536*k9+768*r3+256*t8);
__m512 sfIm133 = _mm512_loadu_ps(sfPtr3+64+121344*i9+30336*j5+1536*k9+768*r3+256*t8);
__m512 sfRe137 = _mm512_loadu_ps(sfPtr3+128+121344*i9+30336*j5+1536*k9+768*r3+256*t8);
__m512 sfIm137 = _mm512_loadu_ps(sfPtr3+192+121344*i9+30336*j5+1536*k9+768*r3+256*t8);
__m512 sfRe134 = _mm512_loadu_ps(sfPtr3+30336+121344*i9+30336*j5+1536*k9+768*r3+256*t8);
__m512 sfIm134 = _mm512_loadu_ps(sfPtr3+30400+121344*i9+30336*j5+1536*k9+768*r3+256*t8);
__m512 sfRe138 = _mm512_loadu_ps(sfPtr3+30464+121344*i9+30336*j5+1536*k9+768*r3+256*t8);
__m512 sfIm138 = _mm512_loadu_ps(sfPtr3+30528+121344*i9+30336*j5+1536*k9+768*r3+256*t8);
__m512 sfRe135 = _mm512_loadu_ps(sfPtr3+60672+121344*i9+30336*j5+1536*k9+768*r3+256*t8);
__m512 sfIm135 = _mm512_loadu_ps(sfPtr3+60736+121344*i9+30336*j5+1536*k9+768*r3+256*t8);
__m512 sfRe139 = _mm512_loadu_ps(sfPtr3+60800+121344*i9+30336*j5+1536*k9+768*r3+256*t8);
__m512 sfIm139 = _mm512_loadu_ps(sfPtr3+60864+121344*i9+30336*j5+1536*k9+768*r3+256*t8);
__m512 sfRe136 = _mm512_loadu_ps(sfPtr3+91008+121344*i9+30336*j5+1536*k9+768*r3+256*t8);
__m512 sfIm136 = _mm512_loadu_ps(sfPtr3+91072+121344*i9+30336*j5+1536*k9+768*r3+256*t8);
__m512 sfRe140 = _mm512_loadu_ps(sfPtr3+91136+121344*i9+30336*j5+1536*k9+768*r3+256*t8);
__m512 sfIm140 = _mm512_loadu_ps(sfPtr3+91200+121344*i9+30336*j5+1536*k9+768*r3+256*t8);
__m512i ifft1057 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft1058 = _mm512_permutexvar_ps(ifft1057, sfRe133);
__m512 ifft1149 = _mm512_permutexvar_ps(ifft1057, sfRe137);
__m512i ifft1059 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft1060 = _mm512_permutexvar_ps(ifft1059, sfRe133);
__m512 ifft1150 = _mm512_permutexvar_ps(ifft1059, sfRe137);
__m512 ifft1061 = _mm512_permutexvar_ps(ifft1057, sfIm133);
__m512 ifft1151 = _mm512_permutexvar_ps(ifft1057, sfIm137);
__m512 ifft1062 = _mm512_permutexvar_ps(ifft1059, sfIm133);
__m512 ifft1152 = _mm512_permutexvar_ps(ifft1059, sfIm137);
__m512 ifft1063 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft1064 = _mm512_mask_fmadd_ps(ifft1062, 65021, ifft1063, ifft1058);
__m512 ifft1153 = _mm512_mask_fmadd_ps(ifft1152, 65021, ifft1063, ifft1149);
__m512 ifft1065 = _mm512_mask_fnmadd_ps(ifft1061, 65021, ifft1063, ifft1060);
__m512 ifft1154 = _mm512_mask_fnmadd_ps(ifft1151, 65021, ifft1063, ifft1150);
__m512 ifft1066 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft1067 = _mm512_fmadd_ps(ifft1064, ifft1066, _mm512_shuffle_ps(ifft1064, ifft1064, 177));
__m512 ifft1155 = _mm512_fmadd_ps(ifft1153, ifft1066, _mm512_shuffle_ps(ifft1153, ifft1153, 177));
__m512 ifft1068 = _mm512_fmadd_ps(ifft1065, ifft1066, _mm512_shuffle_ps(ifft1065, ifft1065, 177));
__m512 ifft1156 = _mm512_fmadd_ps(ifft1154, ifft1066, _mm512_shuffle_ps(ifft1154, ifft1154, 177));
__m512 ifft1069 = _mm512_fmadd_ps(sfRe134, ifft1066, _mm512_shuffle_ps(sfRe134, sfRe134, 177));
__m512 ifft1157 = _mm512_fmadd_ps(sfRe138, ifft1066, _mm512_shuffle_ps(sfRe138, sfRe138, 177));
__m512 ifft1070 = _mm512_fmadd_ps(sfIm134, ifft1066, _mm512_shuffle_ps(sfIm134, sfIm134, 177));
__m512 ifft1158 = _mm512_fmadd_ps(sfIm138, ifft1066, _mm512_shuffle_ps(sfIm138, sfIm138, 177));
__m512 ifft1071 = _mm512_fmadd_ps(sfRe135, ifft1066, _mm512_shuffle_ps(sfRe135, sfRe135, 177));
__m512 ifft1159 = _mm512_fmadd_ps(sfRe139, ifft1066, _mm512_shuffle_ps(sfRe139, sfRe139, 177));
__m512 ifft1072 = _mm512_fmadd_ps(sfIm135, ifft1066, _mm512_shuffle_ps(sfIm135, sfIm135, 177));
__m512 ifft1160 = _mm512_fmadd_ps(sfIm139, ifft1066, _mm512_shuffle_ps(sfIm139, sfIm139, 177));
__m512 ifft1073 = _mm512_fmadd_ps(sfRe136, ifft1066, _mm512_shuffle_ps(sfRe136, sfRe136, 177));
__m512 ifft1161 = _mm512_fmadd_ps(sfRe140, ifft1066, _mm512_shuffle_ps(sfRe140, sfRe140, 177));
__m512 ifft1074 = _mm512_fmadd_ps(sfIm136, ifft1066, _mm512_shuffle_ps(sfIm136, sfIm136, 177));
__m512 ifft1162 = _mm512_fmadd_ps(sfIm140, ifft1066, _mm512_shuffle_ps(sfIm140, sfIm140, 177));
__m512 ifft1075 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft1076 = _mm512_mul_ps(ifft1067, ifft1075);
__m512 ifft1163 = _mm512_mul_ps(ifft1155, ifft1075);
__m512 ifft1077 = _mm512_mul_ps(ifft1068, ifft1075);
__m512 ifft1164 = _mm512_mul_ps(ifft1156, ifft1075);
__m512 ifft1078 = _mm512_mul_ps(ifft1069, ifft1075);
__m512 ifft1165 = _mm512_mul_ps(ifft1157, ifft1075);
__m512 ifft1079 = _mm512_mul_ps(ifft1070, ifft1075);
__m512 ifft1166 = _mm512_mul_ps(ifft1158, ifft1075);
__m512 ifft1080 = _mm512_mul_ps(ifft1071, ifft1075);
__m512 ifft1167 = _mm512_mul_ps(ifft1159, ifft1075);
__m512 ifft1081 = _mm512_mul_ps(ifft1072, ifft1075);
__m512 ifft1168 = _mm512_mul_ps(ifft1160, ifft1075);
__m512 ifft1082 = _mm512_mul_ps(ifft1073, ifft1075);
__m512 ifft1169 = _mm512_mul_ps(ifft1161, ifft1075);
__m512 ifft1083 = _mm512_mul_ps(ifft1074, ifft1075);
__m512 ifft1170 = _mm512_mul_ps(ifft1162, ifft1075);
__m512 ifft1084 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft1085 = _mm512_fnmadd_ps(ifft1068, ifft1084, ifft1076);
__m512 ifft1171 = _mm512_fnmadd_ps(ifft1156, ifft1084, ifft1163);
__m512 ifft1086 = _mm512_fmadd_ps(ifft1067, ifft1084, ifft1077);
__m512 ifft1172 = _mm512_fmadd_ps(ifft1155, ifft1084, ifft1164);
__m512 ifft1087 = _mm512_fnmadd_ps(ifft1070, ifft1084, ifft1078);
__m512 ifft1173 = _mm512_fnmadd_ps(ifft1158, ifft1084, ifft1165);
__m512 ifft1088 = _mm512_fmadd_ps(ifft1069, ifft1084, ifft1079);
__m512 ifft1174 = _mm512_fmadd_ps(ifft1157, ifft1084, ifft1166);
__m512 ifft1089 = _mm512_fnmadd_ps(ifft1072, ifft1084, ifft1080);
__m512 ifft1175 = _mm512_fnmadd_ps(ifft1160, ifft1084, ifft1167);
__m512 ifft1090 = _mm512_fmadd_ps(ifft1071, ifft1084, ifft1081);
__m512 ifft1176 = _mm512_fmadd_ps(ifft1159, ifft1084, ifft1168);
__m512 ifft1091 = _mm512_fnmadd_ps(ifft1074, ifft1084, ifft1082);
__m512 ifft1177 = _mm512_fnmadd_ps(ifft1162, ifft1084, ifft1169);
__m512 ifft1092 = _mm512_fmadd_ps(ifft1073, ifft1084, ifft1083);
__m512 ifft1178 = _mm512_fmadd_ps(ifft1161, ifft1084, ifft1170);
__m512 ifft1093 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft1094 = _mm512_fmadd_ps(ifft1085, ifft1093, _mm512_shuffle_ps(ifft1085, ifft1085, 78));
__m512 ifft1179 = _mm512_fmadd_ps(ifft1171, ifft1093, _mm512_shuffle_ps(ifft1171, ifft1171, 78));
__m512 ifft1095 = _mm512_fmadd_ps(ifft1086, ifft1093, _mm512_shuffle_ps(ifft1086, ifft1086, 78));
__m512 ifft1180 = _mm512_fmadd_ps(ifft1172, ifft1093, _mm512_shuffle_ps(ifft1172, ifft1172, 78));
__m512 ifft1096 = _mm512_fmadd_ps(ifft1087, ifft1093, _mm512_shuffle_ps(ifft1087, ifft1087, 78));
__m512 ifft1181 = _mm512_fmadd_ps(ifft1173, ifft1093, _mm512_shuffle_ps(ifft1173, ifft1173, 78));
__m512 ifft1097 = _mm512_fmadd_ps(ifft1088, ifft1093, _mm512_shuffle_ps(ifft1088, ifft1088, 78));
__m512 ifft1182 = _mm512_fmadd_ps(ifft1174, ifft1093, _mm512_shuffle_ps(ifft1174, ifft1174, 78));
__m512 ifft1098 = _mm512_fmadd_ps(ifft1089, ifft1093, _mm512_shuffle_ps(ifft1089, ifft1089, 78));
__m512 ifft1183 = _mm512_fmadd_ps(ifft1175, ifft1093, _mm512_shuffle_ps(ifft1175, ifft1175, 78));
__m512 ifft1099 = _mm512_fmadd_ps(ifft1090, ifft1093, _mm512_shuffle_ps(ifft1090, ifft1090, 78));
__m512 ifft1184 = _mm512_fmadd_ps(ifft1176, ifft1093, _mm512_shuffle_ps(ifft1176, ifft1176, 78));
__m512 ifft1100 = _mm512_fmadd_ps(ifft1091, ifft1093, _mm512_shuffle_ps(ifft1091, ifft1091, 78));
__m512 ifft1185 = _mm512_fmadd_ps(ifft1177, ifft1093, _mm512_shuffle_ps(ifft1177, ifft1177, 78));
__m512 ifft1101 = _mm512_fmadd_ps(ifft1092, ifft1093, _mm512_shuffle_ps(ifft1092, ifft1092, 78));
__m512 ifft1186 = _mm512_fmadd_ps(ifft1178, ifft1093, _mm512_shuffle_ps(ifft1178, ifft1178, 78));
__m512 ifft1102 = _mm512_mask_sub_ps(ifft1094, 49344, _mm512_setzero_ps(), ifft1095);
__m512 ifft1187 = _mm512_mask_sub_ps(ifft1179, 49344, _mm512_setzero_ps(), ifft1180);
__m512 ifft1103 = _mm512_mask_mov_ps(ifft1095, 49344, ifft1094);
__m512 ifft1188 = _mm512_mask_mov_ps(ifft1180, 49344, ifft1179);
__m512 ifft1104 = _mm512_mask_sub_ps(ifft1096, 49344, _mm512_setzero_ps(), ifft1097);
__m512 ifft1189 = _mm512_mask_sub_ps(ifft1181, 49344, _mm512_setzero_ps(), ifft1182);
__m512 ifft1105 = _mm512_mask_mov_ps(ifft1097, 49344, ifft1096);
__m512 ifft1190 = _mm512_mask_mov_ps(ifft1182, 49344, ifft1181);
__m512 ifft1106 = _mm512_mask_sub_ps(ifft1098, 49344, _mm512_setzero_ps(), ifft1099);
__m512 ifft1191 = _mm512_mask_sub_ps(ifft1183, 49344, _mm512_setzero_ps(), ifft1184);
__m512 ifft1107 = _mm512_mask_mov_ps(ifft1099, 49344, ifft1098);
__m512 ifft1192 = _mm512_mask_mov_ps(ifft1184, 49344, ifft1183);
__m512 ifft1108 = _mm512_mask_sub_ps(ifft1100, 49344, _mm512_setzero_ps(), ifft1101);
__m512 ifft1193 = _mm512_mask_sub_ps(ifft1185, 49344, _mm512_setzero_ps(), ifft1186);
__m512 ifft1109 = _mm512_mask_mov_ps(ifft1101, 49344, ifft1100);
__m512 ifft1194 = _mm512_mask_mov_ps(ifft1186, 49344, ifft1185);
__m512 ifft1110 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft1111 = _mm512_fmadd_ps(ifft1102, ifft1110, _mm512_shuffle_f32x4(ifft1102, ifft1102, 177));
__m512 ifft1195 = _mm512_fmadd_ps(ifft1187, ifft1110, _mm512_shuffle_f32x4(ifft1187, ifft1187, 177));
__m512 ifft1112 = _mm512_fmadd_ps(ifft1103, ifft1110, _mm512_shuffle_f32x4(ifft1103, ifft1103, 177));
__m512 ifft1196 = _mm512_fmadd_ps(ifft1188, ifft1110, _mm512_shuffle_f32x4(ifft1188, ifft1188, 177));
__m512 ifft1113 = _mm512_fmadd_ps(ifft1104, ifft1110, _mm512_shuffle_f32x4(ifft1104, ifft1104, 177));
__m512 ifft1197 = _mm512_fmadd_ps(ifft1189, ifft1110, _mm512_shuffle_f32x4(ifft1189, ifft1189, 177));
__m512 ifft1114 = _mm512_fmadd_ps(ifft1105, ifft1110, _mm512_shuffle_f32x4(ifft1105, ifft1105, 177));
__m512 ifft1198 = _mm512_fmadd_ps(ifft1190, ifft1110, _mm512_shuffle_f32x4(ifft1190, ifft1190, 177));
__m512 ifft1115 = _mm512_fmadd_ps(ifft1106, ifft1110, _mm512_shuffle_f32x4(ifft1106, ifft1106, 177));
__m512 ifft1199 = _mm512_fmadd_ps(ifft1191, ifft1110, _mm512_shuffle_f32x4(ifft1191, ifft1191, 177));
__m512 ifft1116 = _mm512_fnmsub_ps(ifft1107, ifft1110, _mm512_shuffle_f32x4(ifft1107, ifft1107, 177));
__m512 ifft1200 = _mm512_fnmsub_ps(ifft1192, ifft1110, _mm512_shuffle_f32x4(ifft1192, ifft1192, 177));
__m512 ifft1117 = _mm512_fmadd_ps(ifft1108, ifft1110, _mm512_shuffle_f32x4(ifft1108, ifft1108, 177));
__m512 ifft1201 = _mm512_fmadd_ps(ifft1193, ifft1110, _mm512_shuffle_f32x4(ifft1193, ifft1193, 177));
__m512 ifft1118 = _mm512_fmadd_ps(ifft1109, ifft1110, _mm512_shuffle_f32x4(ifft1109, ifft1109, 177));
__m512 ifft1202 = _mm512_fmadd_ps(ifft1194, ifft1110, _mm512_shuffle_f32x4(ifft1194, ifft1194, 177));
__m512 ifft1119 = _mm512_add_ps(ifft1111, ifft1112);
__m512 ifft1203 = _mm512_add_ps(ifft1195, ifft1196);
__m512 ifft1120 = _mm512_sub_ps(ifft1111, ifft1112);
__m512 ifft1204 = _mm512_sub_ps(ifft1195, ifft1196);
__m512 ifft1121 = _mm512_sub_ps(ifft1113, ifft1117);
__m512 ifft1205 = _mm512_sub_ps(ifft1197, ifft1201);
__m512 ifft1122 = _mm512_add_ps(ifft1114, ifft1118);
__m512 ifft1206 = _mm512_add_ps(ifft1198, ifft1202);
__m512 ifft1123 = _mm512_add_ps(ifft1113, ifft1117);
__m512 ifft1207 = _mm512_add_ps(ifft1197, ifft1201);
__m512 ifft1124 = _mm512_sub_ps(ifft1114, ifft1118);
__m512 ifft1208 = _mm512_sub_ps(ifft1198, ifft1202);
__m512 ifft1125 = _mm512_mul_ps(ifft1115, _mm512_set1_ps(3.125e-02f));
__m512 ifft1209 = _mm512_mul_ps(ifft1199, _mm512_set1_ps(3.125e-02f));
__m512 ifft1126 = _mm512_mul_ps(ifft1116, _mm512_set1_ps(3.125e-02f));
__m512 ifft1210 = _mm512_mul_ps(ifft1200, _mm512_set1_ps(3.125e-02f));
__m512 ifft1127 = _mm512_fmadd_ps(ifft1119, _mm512_set1_ps(1.5625e-02f), ifft1125);
__m512 ifft1211 = _mm512_fmadd_ps(ifft1203, _mm512_set1_ps(1.5625e-02f), ifft1209);
__m512 ifft1128 = _mm512_fmsub_ps(ifft1119, _mm512_set1_ps(1.5625e-02f), ifft1125);
__m512 ifft1212 = _mm512_fmsub_ps(ifft1203, _mm512_set1_ps(1.5625e-02f), ifft1209);
__m512 ifft1129 = _mm512_fmadd_ps(ifft1120, _mm512_set1_ps(1.5625e-02f), ifft1126);
__m512 ifft1213 = _mm512_fmadd_ps(ifft1204, _mm512_set1_ps(1.5625e-02f), ifft1210);
__m512 ifft1130 = _mm512_fmsub_ps(ifft1120, _mm512_set1_ps(1.5625e-02f), ifft1126);
__m512 ifft1214 = _mm512_fmsub_ps(ifft1204, _mm512_set1_ps(1.5625e-02f), ifft1210);
__m512 ifft1131 = _mm512_add_ps(ifft1121, ifft1122);
__m512 ifft1215 = _mm512_add_ps(ifft1205, ifft1206);
__m512 ifft1132 = _mm512_sub_ps(ifft1121, ifft1122);
__m512 ifft1216 = _mm512_sub_ps(ifft1205, ifft1206);
__m512 ifft1133 = _mm512_fnmadd_ps(ifft1131, _mm512_set1_ps(7.0710677e-01f), ifft1123);
__m512 ifft1217 = _mm512_fnmadd_ps(ifft1215, _mm512_set1_ps(7.0710677e-01f), ifft1207);
__m512 ifft1134 = _mm512_fmadd_ps(ifft1131, _mm512_set1_ps(7.0710677e-01f), ifft1123);
__m512 ifft1218 = _mm512_fmadd_ps(ifft1215, _mm512_set1_ps(7.0710677e-01f), ifft1207);
__m512 ifft1135 = _mm512_fmadd_ps(ifft1132, _mm512_set1_ps(7.0710677e-01f), ifft1124);
__m512 ifft1219 = _mm512_fmadd_ps(ifft1216, _mm512_set1_ps(7.0710677e-01f), ifft1208);
__m512 ifft1136 = _mm512_fmsub_ps(ifft1132, _mm512_set1_ps(7.0710677e-01f), ifft1124);
__m512 ifft1220 = _mm512_fmsub_ps(ifft1216, _mm512_set1_ps(7.0710677e-01f), ifft1208);
__m512 ifft1137 = _mm512_add_ps(ifft1133, ifft1134);
__m512 ifft1221 = _mm512_add_ps(ifft1217, ifft1218);
__m512 ifft1138 = _mm512_sub_ps(ifft1133, ifft1134);
__m512 ifft1222 = _mm512_sub_ps(ifft1217, ifft1218);
__m512 ifft1139 = _mm512_add_ps(ifft1135, ifft1136);
__m512 ifft1223 = _mm512_add_ps(ifft1219, ifft1220);
__m512 ifft1140 = _mm512_sub_ps(ifft1135, ifft1136);
__m512 ifft1224 = _mm512_sub_ps(ifft1219, ifft1220);
__m512 ifft1141 = _mm512_fmadd_ps(ifft1137, _mm512_set1_ps(1.5625e-02f), ifft1127);
__m512 ifft1225 = _mm512_fmadd_ps(ifft1221, _mm512_set1_ps(1.5625e-02f), ifft1211);
__m512 ifft1142 = _mm512_fnmadd_ps(ifft1137, _mm512_set1_ps(1.5625e-02f), ifft1127);
__m512 ifft1226 = _mm512_fnmadd_ps(ifft1221, _mm512_set1_ps(1.5625e-02f), ifft1211);
__m512 ifft1143 = _mm512_fmadd_ps(ifft1139, _mm512_set1_ps(1.5625e-02f), ifft1129);
__m512 ifft1227 = _mm512_fmadd_ps(ifft1223, _mm512_set1_ps(1.5625e-02f), ifft1213);
__m512 ifft1144 = _mm512_fnmadd_ps(ifft1139, _mm512_set1_ps(1.5625e-02f), ifft1129);
__m512 ifft1228 = _mm512_fnmadd_ps(ifft1223, _mm512_set1_ps(1.5625e-02f), ifft1213);
__m512 ifft1145 = _mm512_fnmadd_ps(ifft1140, _mm512_set1_ps(1.5625e-02f), ifft1128);
__m512 ifft1229 = _mm512_fnmadd_ps(ifft1224, _mm512_set1_ps(1.5625e-02f), ifft1212);
__m512 ifft1146 = _mm512_fmadd_ps(ifft1140, _mm512_set1_ps(1.5625e-02f), ifft1128);
__m512 ifft1230 = _mm512_fmadd_ps(ifft1224, _mm512_set1_ps(1.5625e-02f), ifft1212);
__m512 ifft1147 = _mm512_fmadd_ps(ifft1138, _mm512_set1_ps(1.5625e-02f), ifft1130);
__m512 ifft1231 = _mm512_fmadd_ps(ifft1222, _mm512_set1_ps(1.5625e-02f), ifft1214);
__m512 ifft1148 = _mm512_fnmadd_ps(ifft1138, _mm512_set1_ps(1.5625e-02f), ifft1130);
__m512 ifft1232 = _mm512_fnmadd_ps(ifft1222, _mm512_set1_ps(1.5625e-02f), ifft1214);
__m512 dat97 = ifft1141;
__m512 dat102 = ifft1225;
__m512 dat98 = ifft1143;
__m512 dat103 = ifft1227;
__m512 dat99 = ifft1145;
__m512 dat104 = ifft1229;
__m512 dat100 = ifft1147;
__m512 dat105 = ifft1231;
__m512 dat101 = ifft1142;
__m512 dat106 = ifft1226;
(void)ifft1144;
(void)ifft1228;
(void)ifft1146;
(void)ifft1230;
(void)ifft1148;
(void)ifft1232;
__m512i pm9 = _mm512_set_epi32(24, 20, 19, 18, 17, 16, 12, 11, 10, 9, 8, 4, 3, 2, 1, 0);
__m512 pack29 = _mm512_permutex2var_ps(dat97, pm9, dat102);
__m512 pack30 = _mm512_permutex2var_ps(dat98, pm9, dat103);
__m512 pack31 = _mm512_permutex2var_ps(dat99, pm9, dat104);
__m512 pack32 = _mm512_permutex2var_ps(dat100, pm9, dat105);
__m512 pack33 = _mm512_permutex2var_ps(dat101, pm9, dat106);
_mm512_mask_storeu_ps(datPtr2+0+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+80*t8, 4095, pack29);
_mm512_mask_storeu_ps(datPtr2+208+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+80*t8, 7936, dat102);
_mm512_mask_storeu_ps(datPtr2+48+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+80*t8, 4095, pack30);
_mm512_mask_storeu_ps(datPtr2+256+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+80*t8, 7936, dat103);
_mm512_mask_storeu_ps(datPtr2+96+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+80*t8, 4095, pack31);
_mm512_mask_storeu_ps(datPtr2+144+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+80*t8, 4095, pack32);
_mm512_mask_storeu_ps(datPtr2+192+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+80*t8, 4095, pack33);
ptrdiff_t t9 = 0;
__m512 sfRe141 = _mm512_loadu_ps(sfPtr3+256+121344*i9+30336*j5+1536*k9+768*r3+128*t9);
__m512 sfIm141 = _mm512_loadu_ps(sfPtr3+320+121344*i9+30336*j5+1536*k9+768*r3+128*t9);
__m512 sfRe142 = _mm512_loadu_ps(sfPtr3+30592+121344*i9+30336*j5+1536*k9+768*r3+128*t9);
__m512 sfIm142 = _mm512_loadu_ps(sfPtr3+30656+121344*i9+30336*j5+1536*k9+768*r3+128*t9);
__m512 sfRe143 = _mm512_loadu_ps(sfPtr3+60928+121344*i9+30336*j5+1536*k9+768*r3+128*t9);
__m512 sfIm143 = _mm512_loadu_ps(sfPtr3+60992+121344*i9+30336*j5+1536*k9+768*r3+128*t9);
__m512 sfRe144 = _mm512_loadu_ps(sfPtr3+91264+121344*i9+30336*j5+1536*k9+768*r3+128*t9);
__m512 sfIm144 = _mm512_loadu_ps(sfPtr3+91328+121344*i9+30336*j5+1536*k9+768*r3+128*t9);
__m512i ifft1233 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft1234 = _mm512_permutexvar_ps(ifft1233, sfRe141);
__m512i ifft1235 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft1236 = _mm512_permutexvar_ps(ifft1235, sfRe141);
__m512 ifft1237 = _mm512_permutexvar_ps(ifft1233, sfIm141);
__m512 ifft1238 = _mm512_permutexvar_ps(ifft1235, sfIm141);
__m512 ifft1239 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft1240 = _mm512_mask_fmadd_ps(ifft1238, 65021, ifft1239, ifft1234);
__m512 ifft1241 = _mm512_mask_fnmadd_ps(ifft1237, 65021, ifft1239, ifft1236);
__m512 ifft1242 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft1243 = _mm512_fmadd_ps(ifft1240, ifft1242, _mm512_shuffle_ps(ifft1240, ifft1240, 177));
__m512 ifft1244 = _mm512_fmadd_ps(ifft1241, ifft1242, _mm512_shuffle_ps(ifft1241, ifft1241, 177));
__m512 ifft1245 = _mm512_fmadd_ps(sfRe142, ifft1242, _mm512_shuffle_ps(sfRe142, sfRe142, 177));
__m512 ifft1246 = _mm512_fmadd_ps(sfIm142, ifft1242, _mm512_shuffle_ps(sfIm142, sfIm142, 177));
__m512 ifft1247 = _mm512_fmadd_ps(sfRe143, ifft1242, _mm512_shuffle_ps(sfRe143, sfRe143, 177));
__m512 ifft1248 = _mm512_fmadd_ps(sfIm143, ifft1242, _mm512_shuffle_ps(sfIm143, sfIm143, 177));
__m512 ifft1249 = _mm512_fmadd_ps(sfRe144, ifft1242, _mm512_shuffle_ps(sfRe144, sfRe144, 177));
__m512 ifft1250 = _mm512_fmadd_ps(sfIm144, ifft1242, _mm512_shuffle_ps(sfIm144, sfIm144, 177));
__m512 ifft1251 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft1252 = _mm512_mul_ps(ifft1243, ifft1251);
__m512 ifft1253 = _mm512_mul_ps(ifft1244, ifft1251);
__m512 ifft1254 = _mm512_mul_ps(ifft1245, ifft1251);
__m512 ifft1255 = _mm512_mul_ps(ifft1246, ifft1251);
__m512 ifft1256 = _mm512_mul_ps(ifft1247, ifft1251);
__m512 ifft1257 = _mm512_mul_ps(ifft1248, ifft1251);
__m512 ifft1258 = _mm512_mul_ps(ifft1249, ifft1251);
__m512 ifft1259 = _mm512_mul_ps(ifft1250, ifft1251);
__m512 ifft1260 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft1261 = _mm512_fnmadd_ps(ifft1244, ifft1260, ifft1252);
__m512 ifft1262 = _mm512_fmadd_ps(ifft1243, ifft1260, ifft1253);
__m512 ifft1263 = _mm512_fnmadd_ps(ifft1246, ifft1260, ifft1254);
__m512 ifft1264 = _mm512_fmadd_ps(ifft1245, ifft1260, ifft1255);
__m512 ifft1265 = _mm512_fnmadd_ps(ifft1248, ifft1260, ifft1256);
__m512 ifft1266 = _mm512_fmadd_ps(ifft1247, ifft1260, ifft1257);
__m512 ifft1267 = _mm512_fnmadd_ps(ifft1250, ifft1260, ifft1258);
__m512 ifft1268 = _mm512_fmadd_ps(ifft1249, ifft1260, ifft1259);
__m512 ifft1269 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft1270 = _mm512_fmadd_ps(ifft1261, ifft1269, _mm512_shuffle_ps(ifft1261, ifft1261, 78));
__m512 ifft1271 = _mm512_fmadd_ps(ifft1262, ifft1269, _mm512_shuffle_ps(ifft1262, ifft1262, 78));
__m512 ifft1272 = _mm512_fmadd_ps(ifft1263, ifft1269, _mm512_shuffle_ps(ifft1263, ifft1263, 78));
__m512 ifft1273 = _mm512_fmadd_ps(ifft1264, ifft1269, _mm512_shuffle_ps(ifft1264, ifft1264, 78));
__m512 ifft1274 = _mm512_fmadd_ps(ifft1265, ifft1269, _mm512_shuffle_ps(ifft1265, ifft1265, 78));
__m512 ifft1275 = _mm512_fmadd_ps(ifft1266, ifft1269, _mm512_shuffle_ps(ifft1266, ifft1266, 78));
__m512 ifft1276 = _mm512_fmadd_ps(ifft1267, ifft1269, _mm512_shuffle_ps(ifft1267, ifft1267, 78));
__m512 ifft1277 = _mm512_fmadd_ps(ifft1268, ifft1269, _mm512_shuffle_ps(ifft1268, ifft1268, 78));
__m512 ifft1278 = _mm512_mask_sub_ps(ifft1270, 49344, _mm512_setzero_ps(), ifft1271);
__m512 ifft1279 = _mm512_mask_mov_ps(ifft1271, 49344, ifft1270);
__m512 ifft1280 = _mm512_mask_sub_ps(ifft1272, 49344, _mm512_setzero_ps(), ifft1273);
__m512 ifft1281 = _mm512_mask_mov_ps(ifft1273, 49344, ifft1272);
__m512 ifft1282 = _mm512_mask_sub_ps(ifft1274, 49344, _mm512_setzero_ps(), ifft1275);
__m512 ifft1283 = _mm512_mask_mov_ps(ifft1275, 49344, ifft1274);
__m512 ifft1284 = _mm512_mask_sub_ps(ifft1276, 49344, _mm512_setzero_ps(), ifft1277);
__m512 ifft1285 = _mm512_mask_mov_ps(ifft1277, 49344, ifft1276);
__m512 ifft1286 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft1287 = _mm512_fmadd_ps(ifft1278, ifft1286, _mm512_shuffle_f32x4(ifft1278, ifft1278, 177));
__m512 ifft1288 = _mm512_fmadd_ps(ifft1279, ifft1286, _mm512_shuffle_f32x4(ifft1279, ifft1279, 177));
__m512 ifft1289 = _mm512_fmadd_ps(ifft1280, ifft1286, _mm512_shuffle_f32x4(ifft1280, ifft1280, 177));
__m512 ifft1290 = _mm512_fmadd_ps(ifft1281, ifft1286, _mm512_shuffle_f32x4(ifft1281, ifft1281, 177));
__m512 ifft1291 = _mm512_fmadd_ps(ifft1282, ifft1286, _mm512_shuffle_f32x4(ifft1282, ifft1282, 177));
__m512 ifft1292 = _mm512_fnmsub_ps(ifft1283, ifft1286, _mm512_shuffle_f32x4(ifft1283, ifft1283, 177));
__m512 ifft1293 = _mm512_fmadd_ps(ifft1284, ifft1286, _mm512_shuffle_f32x4(ifft1284, ifft1284, 177));
__m512 ifft1294 = _mm512_fmadd_ps(ifft1285, ifft1286, _mm512_shuffle_f32x4(ifft1285, ifft1285, 177));
__m512 ifft1295 = _mm512_add_ps(ifft1287, ifft1288);
__m512 ifft1296 = _mm512_sub_ps(ifft1287, ifft1288);
__m512 ifft1297 = _mm512_sub_ps(ifft1289, ifft1293);
__m512 ifft1298 = _mm512_add_ps(ifft1290, ifft1294);
__m512 ifft1299 = _mm512_add_ps(ifft1289, ifft1293);
__m512 ifft1300 = _mm512_sub_ps(ifft1290, ifft1294);
__m512 ifft1301 = _mm512_mul_ps(ifft1291, _mm512_set1_ps(3.125e-02f));
__m512 ifft1302 = _mm512_mul_ps(ifft1292, _mm512_set1_ps(3.125e-02f));
__m512 ifft1303 = _mm512_fmadd_ps(ifft1295, _mm512_set1_ps(1.5625e-02f), ifft1301);
__m512 ifft1304 = _mm512_fmsub_ps(ifft1295, _mm512_set1_ps(1.5625e-02f), ifft1301);
__m512 ifft1305 = _mm512_fmadd_ps(ifft1296, _mm512_set1_ps(1.5625e-02f), ifft1302);
__m512 ifft1306 = _mm512_fmsub_ps(ifft1296, _mm512_set1_ps(1.5625e-02f), ifft1302);
__m512 ifft1307 = _mm512_add_ps(ifft1297, ifft1298);
__m512 ifft1308 = _mm512_sub_ps(ifft1297, ifft1298);
__m512 ifft1309 = _mm512_fnmadd_ps(ifft1307, _mm512_set1_ps(7.0710677e-01f), ifft1299);
__m512 ifft1310 = _mm512_fmadd_ps(ifft1307, _mm512_set1_ps(7.0710677e-01f), ifft1299);
__m512 ifft1311 = _mm512_fmadd_ps(ifft1308, _mm512_set1_ps(7.0710677e-01f), ifft1300);
__m512 ifft1312 = _mm512_fmsub_ps(ifft1308, _mm512_set1_ps(7.0710677e-01f), ifft1300);
__m512 ifft1313 = _mm512_add_ps(ifft1309, ifft1310);
__m512 ifft1314 = _mm512_sub_ps(ifft1309, ifft1310);
__m512 ifft1315 = _mm512_add_ps(ifft1311, ifft1312);
__m512 ifft1316 = _mm512_sub_ps(ifft1311, ifft1312);
__m512 ifft1317 = _mm512_fmadd_ps(ifft1313, _mm512_set1_ps(1.5625e-02f), ifft1303);
__m512 ifft1318 = _mm512_fnmadd_ps(ifft1313, _mm512_set1_ps(1.5625e-02f), ifft1303);
__m512 ifft1319 = _mm512_fmadd_ps(ifft1315, _mm512_set1_ps(1.5625e-02f), ifft1305);
__m512 ifft1320 = _mm512_fnmadd_ps(ifft1315, _mm512_set1_ps(1.5625e-02f), ifft1305);
__m512 ifft1321 = _mm512_fnmadd_ps(ifft1316, _mm512_set1_ps(1.5625e-02f), ifft1304);
__m512 ifft1322 = _mm512_fmadd_ps(ifft1316, _mm512_set1_ps(1.5625e-02f), ifft1304);
__m512 ifft1323 = _mm512_fmadd_ps(ifft1314, _mm512_set1_ps(1.5625e-02f), ifft1306);
__m512 ifft1324 = _mm512_fnmadd_ps(ifft1314, _mm512_set1_ps(1.5625e-02f), ifft1306);
__m512 dat107 = ifft1317;
__m512 dat108 = ifft1319;
(void)ifft1321;
(void)ifft1323;
(void)ifft1318;
(void)ifft1320;
(void)ifft1322;
(void)ifft1324;
__m512i pm10 = _mm512_set_epi32(24, 20, 19, 18, 17, 16, 12, 11, 10, 9, 8, 4, 3, 2, 1, 0);
__m512 pack34 = _mm512_permutexvar_ps(pm10, dat107);
__m512 pack35 = _mm512_permutexvar_ps(pm10, dat108);
_mm512_mask_storeu_ps(datPtr2+260+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+40*t9, 127, pack34);
_mm512_mask_storeu_ps(datPtr2+308+26544*i9+1344*k9+672*r3+48*toH1+4*toW1+40*t9, 127, pack35);
++j5;
}
}

// Packages Example9StriderConsumeSums1Callee1 into a task and submits it to
// Example9ThreaderDo1 on team17. tensors7 travels unchanged in the task's
// any1 field. The task is set up with nd1 = 3 and hull1 = {1, 1, 2}.
static void Example9StriderConsumeSums1(Example9ThreaderTeam1* team17, char** tensors7) {
    Example9ThreaderTask1 job;
    // Extents first, then the callee and its opaque argument; only the
    // fields the original set are written here.
    job.nd1 = 3;
    job.hull1[2] = 2;
    job.hull1[1] = 1;
    job.hull1[0] = 1;
    job.any1 = tensors7;
    job.callee1 = Example9StriderConsumeSums1Callee1;
    Example9ThreaderDo1(team17, &job);
}

// Buffer owned by a net; both fields are filled in by Example9NetCreate.
struct Example9Net {
// Pointer returned by malloc; this is the pointer Example9NetDestroy frees.
char* alloc1;
// alloc1 rounded up to a multiple of 64; Example9EngineInference reads from here.
char* align1;
};

// Releases a net produced by Example9NetCreate: the backing buffer first,
// then the struct itself.
//
// net2 may be null, in which case nothing happens (same contract as free()).
void Example9NetDestroy(Example9Net* net2) {
    if (!net2) {
        // Nothing was created, so there is nothing to release.
        return;
    }
    free(net2->alloc1);
    free(net2);
}

// Builds an Example9Net from params1.
//
// Returns 0 on success after storing the new net in *net1. On failure
// returns the message built by Example9Errmsg1, or the error handed back by
// Example9ThreaderCreate1, and leaves *net1 untouched; everything allocated
// here is freed before returning.
//
// threads1 is forwarded to Example9ThreaderCreate1 for a temporary team that
// runs Example9StriderArrangeFilts1 and is destroyed before this function
// returns.
char* Example9NetCreate(
    Example9Net** net1,
    Example9Params* params1,
    ptrdiff_t threads1
) {
    if (__builtin_expect(!__builtin_cpu_supports("avx512f"), 0)) {
        return Example9Errmsg1(__LINE__, "CPU does not support AVX512F");
    }
    char* raw = malloc(11675263);
    if (__builtin_expect(!raw, 0)) {
        return Example9Errmsg1(__LINE__, "errno %d", errno);
    }
    // Round the malloc result up to the next multiple of 64.
    char* aligned = (void*)(((size_t)raw+63)&-64);
    Example9ThreaderTeam1* workers = 0;
    char* teamErr = Example9ThreaderCreate1(&workers, threads1);
    if (__builtin_expect(!!teamErr, 0)) {
        free(raw);
        return teamErr;
    }
    // Argument pack for the filter arrangement pass: weights, biases, and
    // the aligned buffer.
    char* filtArgs[3];
    filtArgs[0] = (char*)params1->outWeights;
    filtArgs[1] = (char*)params1->outBiases;
    filtArgs[2] = aligned+0;
    Example9StriderArrangeFilts1(workers, filtArgs);
    Example9ThreaderDestroy1(workers);
    Example9Net* made = malloc(sizeof(Example9Net));
    if (__builtin_expect(!made, 0)) {
        // Build the message before free() so errno is read first.
        char* oom = Example9Errmsg1(__LINE__, "errno %d", errno);
        free(raw);
        return oom;
    }
    made->align1 = aligned;
    made->alloc1 = raw;
    *net1 = made;
    return 0;
}

// Per-engine state; all fields are filled in by Example9EngineCreate.
struct Example9Engine {
// Net handed to Example9EngineCreate. Example9EngineDestroy does not free it.
Example9Net* net3;
// Team created by Example9ThreaderCreate1; destroyed in Example9EngineDestroy.
Example9ThreaderTeam1* team11;
// Pointer returned by malloc for the working buffer; freed in Example9EngineDestroy.
char* alloc2;
// alloc2 rounded up to a multiple of 64; base for the offsets used in Example9EngineInference.
char* align2;
};

// Forwards to Example9ThreaderPthreadT1 using the engine's thread team.
// idx2 and to1 are passed through unchanged, and whatever the callee
// returns is returned as-is.
char* Example9EnginePthreadT(
    Example9Engine* eng2,
    ptrdiff_t idx2,
    pthread_t* to1
) {
    Example9ThreaderTeam1* workers = eng2->team11;
    return Example9ThreaderPthreadT1(to1, workers, idx2);
}

// Tears down an engine produced by Example9EngineCreate: destroys its thread
// team, frees its working buffer, then frees the struct. The net the engine
// points at is not touched.
//
// eng3 may be null, in which case nothing happens (same contract as free()).
void Example9EngineDestroy(Example9Engine* eng3) {
    if (!eng3) {
        // Nothing was created, so there is nothing to release.
        return;
    }
    Example9ThreaderDestroy1(eng3->team11);
    free(eng3->alloc2);
    free(eng3);
}

// Creates an engine bound to net4.
//
// Returns 0 on success after storing the new engine in *eng4. On failure
// returns the message built by Example9Errmsg1, or the error handed back by
// Example9ThreaderCreate1, and leaves *eng4 untouched; everything allocated
// here is freed before returning.
//
// threads2 is forwarded to Example9ThreaderCreate1. net4 is only stored in
// the engine; it is not copied.
char* Example9EngineCreate(
    Example9Engine** eng4,
    Example9Net* net4,
    ptrdiff_t threads2
) {
    Example9Engine* made = malloc(sizeof(Example9Engine));
    if (__builtin_expect(!made, 0)) {
        return Example9Errmsg1(__LINE__, "errno %d", errno);
    }
    char* raw = malloc(2357823);
    if (__builtin_expect(!raw, 0)) {
        // Build the message before free() so errno is read first.
        char* oom = Example9Errmsg1(__LINE__, "errno %d", errno);
        free(made);
        return oom;
    }
    // Keep both the malloc pointer (for free) and its 64-byte-rounded form.
    made->align2 = (void*)(((size_t)raw+63)&-64);
    made->alloc2 = raw;
    char* teamErr = Example9ThreaderCreate1(&made->team11, threads2);
    if (__builtin_expect(!!teamErr, 0)) {
        free(made);
        free(raw);
        return teamErr;
    }
    made->net3 = net4;
    *eng4 = made;
    return 0;
}

// Runs one inference pass on the engine's thread team as three calls, in
// this order:
//   1. Example9StriderArrangeDats1 with (inData, working buffer + 0)
//   2. Example9StriderProduceSums1 with (net buffer + 0, working buffer + 0,
//      working buffer + 1751040)
//   3. Example9StriderConsumeSums1 with (working buffer + 1751040, outData)
//
// NOTE(review): presumably each stage reads its leading arguments and writes
// its last one, so outData receives the final result -- confirm against the
// Strider callees.
void Example9EngineInference(
    Example9Engine* eng1,
    float* inData,
    float* outData
) {
    Example9ThreaderTeam1* workers = eng1->team11;
    char* filts = eng1->net3->align1;
    char* scratch = eng1->align2;
    // The two regions of the engine's working buffer used by the stages.
    char* datBuf = scratch+0;
    char* sumBuf = scratch+1751040;
    {
        char* args[] = { (char*)inData, datBuf };
        Example9StriderArrangeDats1(workers, args);
    }
    {
        char* args[] = { filts+0, datBuf, sumBuf };
        Example9StriderProduceSums1(workers, args);
    }
    {
        char* args[] = { sumBuf, (char*)outData };
        Example9StriderConsumeSums1(workers, args);
    }
}

// End of file.

Top