NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=Example7 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=in Channels=826 Height=14 Width=21
Conv FromTensor=in ToTensor=out ToChannels=287 FilterH=4 FilterW=3 StrideH=2 StrideW=2 PaddingH=0 PaddingW=1 DilationH=1 DilationW=1 Groups=7
Output FromTensor=out

Top || Output Example7.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(Example7Params);
// Example7Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "rb"); // binary mode: raw floats
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

typedef struct Example7Params Example7Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// Example7Params* params = malloc(sizeof(Example7Params));
//
// ... Load params (read from a file, perhaps) ...
//
// Example7Net* net; // For example, 4 threads:
// char* err = Example7NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// Example7NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

typedef struct Example7Net Example7Net;

char* Example7NetCreate(
Example7Net**,
Example7Params*,
ptrdiff_t threads
);

void Example7NetDestroy(Example7Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// Example7Net* net;
//
// ... Create net ...
//
// Example7Engine* engine; // For example, 4 inference threads:
// char* err = Example7EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// Example7EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = Example7EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* inData = malloc(sizeof(float)*826*14*21);
// float* outData = malloc(sizeof(float)*287*6*11);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// Example7EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// inData, // The tensor arguments are sorted by name.
// outData
// );
//
// ... Read the output floats ...
//
// }
//
// free(inData);
// free(outData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

typedef struct Example7Engine Example7Engine;

char* Example7EngineCreate(
Example7Engine**,
Example7Net*,
ptrdiff_t threads
);

char* Example7EnginePthreadT(
Example7Engine*,
ptrdiff_t threadIdx,
pthread_t* to
);

void Example7EngineInference(
Example7Engine*,
float* inData,
float* outData
);

void Example7EngineDestroy(Example7Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

// Trained parameters for the single Conv layer declared in the graph.
// The weight tensor has 118 input channels per filter because the
// convolution is grouped: 826 input channels / 7 groups = 118.
struct Example7Params {
float outBiases[287]; // 1x287x1x1
float outWeights[406392]; // 287x118x4x3
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output Example7.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f Example7.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "Example7.h"

// Build a heap-allocated error message of the form
// "Example7: line <num>: <formatted detail>". The caller owns the
// returned string and must free() it.
static char* Example7Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
enum { msgCap1 = 276 };
char* buf1 = malloc(msgCap1);
int used1 = snprintf(buf1, msgCap1, "Example7: line %td: ", lineNum1);
va_list args1;
va_start(args1, format1);
vsnprintf(buf1+used1, msgCap1-used1, format1, args1);
va_end(args1);
return buf1;
}

// Lightweight fork/join thread pool ("threader") with work stealing.
// A task describes an N-dimensional iteration space (hull1) plus a
// callee invoked once per coordinate; Do1 splits the space across the
// team's nodes and idle worker threads steal leftovers from busy ones.
typedef struct Example7ThreaderTask1 Example7ThreaderTask1;
typedef void (*Example7ThreaderCallee1)(Example7ThreaderTask1*, int64_t*);
typedef struct Example7ThreaderHub1 Example7ThreaderHub1;
typedef struct Example7ThreaderNode1 Example7ThreaderNode1;
typedef struct Example7ThreaderUnwind1 Example7ThreaderUnwind1;
typedef struct Example7ThreaderTeam1 Example7ThreaderTeam1;

struct Example7ThreaderTask1 {
Example7ThreaderCallee1 callee1; // called once per coordinate, outside any lock
void* any1; // opaque payload handed to callee1
ptrdiff_t nd1; // number of significant dimensions (at most 4)
int64_t hull1[4]; // per-dimension bounds; dimension 0 varies fastest
};

struct Example7ThreaderHub1 {
pthread_mutex_t mut1; // guards every field below
pthread_cond_t cond1; // signaled when the last node finishes a task
ptrdiff_t pending1; // nodes that have not yet reported completion
ptrdiff_t offset1; // steal-scan cursor: word index into status1
long mask1; // bits of status1[offset1] still unscanned
long status1[]; // bitset, one busy bit per node; bits past the last node stay set as scan terminators
};

struct Example7ThreaderNode1 {
pthread_mutex_t mut2; // guards np1, pt1, task1
int64_t np1; // remaining work items; negative is the shutdown sentinel
int64_t pt1[4]; // next coordinate this node will execute
Example7ThreaderTask1* task1; // non-null when work (or shutdown) is assigned
pthread_cond_t cond2; // signaled when task1 is set
Example7ThreaderTeam1* team1; // back-pointer to the owning team
pthread_t thr1; // worker thread running Example7ThreaderMain1
} __attribute__((aligned(64))); // 64-byte aligned — presumably one cache line per node to avoid false sharing

struct Example7ThreaderUnwind1 {
ptrdiff_t join1; // threads successfully created (must be joined)
ptrdiff_t nodeConds1; // node condition variables successfully initialized
ptrdiff_t nodeMuts1; // node mutexes successfully initialized
ptrdiff_t hubCond1; // 1 if the hub condition variable was initialized
ptrdiff_t hubMut1; // 1 if the hub mutex was initialized
void* nodes1; // unaligned malloc base backing nodes2
void* hub1; // unaligned malloc base backing hub2
};

struct Example7ThreaderTeam1 {
ptrdiff_t nt1; // number of worker threads (nodes)
Example7ThreaderHub1* hub2; // 64-byte-aligned view of hub1
Example7ThreaderNode1* nodes2; // 64-byte-aligned view of nodes1
Example7ThreaderUnwind1 unwind1; // rollback bookkeeping for Destroy1
};

// Advance the mixed-radix coordinate pt2 by one step within hull2
// (dimension 0 is fastest). After the final coordinate it wraps back
// to all zeros.
static void Example7ThreaderInc1(
ptrdiff_t nd2,
int64_t*restrict hull2,
int64_t*restrict pt2
) {
ptrdiff_t dim1 = 0;
while (dim1 < nd2) {
int64_t next1 = pt2[dim1]+1;
if (next1 != hull2[dim1]) {
pt2[dim1] = next1;
return;
}
pt2[dim1] = 0; // this digit wrapped; carry into the next dimension
++dim1;
}
}

// Decompose the linear index val1 into a mixed-radix coordinate pt3,
// where hull3 gives the radix of each dimension (dimension 0 fastest).
// Dimensions beyond the last nonzero digit are cleared.
static void Example7ThreaderPut1(
ptrdiff_t nd3,
int64_t*restrict hull3,
int64_t*restrict pt3,
int64_t val1
) {
ptrdiff_t dim2 = 0;
while (dim2 < nd3 && val1) {
int64_t radix1 = hull3[dim2];
pt3[dim2] = val1%radix1;
val1 /= radix1;
++dim2;
}
while (dim2 < nd3) pt3[dim2++] = 0;
}

// Mixed-radix addition: pt4 += plus1 + carry2 under the per-dimension
// radices in hull4. Assumes each input digit is already in range, so a
// per-digit sum overflows its radix by less than one full wrap.
static void Example7ThreaderAdd1(
ptrdiff_t nd4,
int64_t*restrict hull4,
int64_t*restrict pt4,
int64_t*restrict plus1,
int64_t carry2
) {
for (ptrdiff_t dim3 = 0; dim3 < nd4; ++dim3) {
int64_t sum1 = pt4[dim3]+plus1[dim3]+carry2;
int64_t radix2 = hull4[dim3];
carry2 = sum1 >= radix2; // at most one wrap per digit
pt4[dim3] = carry2 ? sum1-radix2 : sum1;
}
}

// Worker thread main loop. Each node waits for a task, drains its own
// slice of the iteration space, then scans the hub's status bitset and
// steals remaining work from other nodes before reporting completion.
// All pthread calls are retried in empty loops on (unexpected) failure.
static void* Example7ThreaderMain1(void* arg1) {
Example7ThreaderNode1* node1 = arg1;
Example7ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
Example7ThreaderHub1* hub3 = team2->hub2;
Example7ThreaderNode1* nodes3 = team2->nodes2;
size_t role1 = node1-nodes3; // this thread's index within the team
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
for (; ; ) {
Example7ThreaderTask1* task2 = node1->task1;
if (!task2) {
// Nothing assigned: sleep until Do1 (or Destroy1) signals us.
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
// Negative item count is the shutdown sentinel set by Destroy1.
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
node1->task1 = 0;
Example7ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Drain this node's own queue: np1 items starting at coordinate pt1.
// The node mutex is released while the callee runs.
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
Example7ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
// Own queue empty: clear our busy bit, then look for work to steal.
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
++offset2;
mask2 = -1;
continue;
}
// Lowest set bit picks the next victim. Bit index nt2 (always set
// by Do1) acts as the terminator marking the end of the bitset.
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
if (wrapped1) break; // second full pass found no busy node: done
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
hand1 &= -hand1; // isolate the chosen victim's bit
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1; // exclude it from other threads' scans
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
Example7ThreaderNode1* node2 = nodes3+target1;
// Drain the victim node's queue exactly like our own.
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
Example7ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
hub3->status1[offset2] &= ~hand1; // victim's queue is now empty
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
// The last thread to finish wakes Do1, which waits on the hub cond.
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Tear down a team in reverse creation order. The unwind1 counters
// record how many of each resource were successfully created, so this
// is safe to call on a partially constructed team (and on NULL).
static void Example7ThreaderDestroy1(Example7ThreaderTeam1* team3) {
if (!team3) return;
Example7ThreaderNode1* nodes4 = team3->nodes2;
Example7ThreaderNode1* stop1 = nodes4+team3->unwind1.join1;
// Ask every started thread to exit: negative np1 is the shutdown
// sentinel checked in Example7ThreaderMain1.
for (Example7ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_mutex_lock(&node3->mut2), 0); );
node3->np1 = -1;
node3->task1 = (Example7ThreaderTask1*)1; // non-null dummy so the wait loop wakes
for (; __builtin_expect(pthread_mutex_unlock(&node3->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node3->cond2), 0); );
}
for (Example7ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_join(node3->thr1, 0), 0); );
}
// Destroy only the node cond vars / mutexes that were initialized.
stop1 = nodes4+team3->unwind1.nodeConds1;
for (Example7ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_cond_destroy(&node3->cond2), 0); );
}
stop1 = nodes4+team3->unwind1.nodeMuts1;
for (Example7ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_mutex_destroy(&node3->mut2), 0); );
}
Example7ThreaderHub1* hub4 = team3->hub2;
if (team3->unwind1.hubCond1) {
for (; __builtin_expect(pthread_cond_destroy(&hub4->cond1), 0); );
}
if (team3->unwind1.hubMut1) {
for (; __builtin_expect(pthread_mutex_destroy(&hub4->mut1), 0); );
}
// Free the unaligned malloc bases, not the aligned views.
free(team3->unwind1.nodes1);
free(team3->unwind1.hub1);
free(team3);
}

// Initialize each node (mutex, condition variable, worker thread), in
// that order. On any failure the unwind1 counters are set to exactly
// how many of each resource exist so Destroy1 can roll back precisely.
static char* Example7ThreaderCreate1Up4(Example7ThreaderTeam1* team8, ptrdiff_t nt7) {
Example7ThreaderNode1* nodes5 = team8->nodes2;
for (Example7ThreaderNode1* node4 = nodes5; node4 != nodes5+nt7; ++node4) {
int err2 = pthread_mutex_init(&node4->mut2, 0);
if (__builtin_expect(err2, 0)) {
char* msg2 = Example7Errmsg1(__LINE__, "errno %d", err2);
// This node's mutex was not created: all three counts stop here.
team8->unwind1.nodeMuts1 = node4-nodes5;
team8->unwind1.nodeConds1 = node4-nodes5;
team8->unwind1.join1 = node4-nodes5;
return msg2;
}
node4->task1 = 0; // no work assigned yet
int err3 = pthread_cond_init(&node4->cond2, 0);
if (__builtin_expect(err3, 0)) {
char* msg3 = Example7Errmsg1(__LINE__, "errno %d", err3);
// Mutex exists but the cond does not: mutex count includes this node.
team8->unwind1.nodeMuts1 = node4-nodes5+1;
team8->unwind1.nodeConds1 = node4-nodes5;
team8->unwind1.join1 = node4-nodes5;
return msg3;
}
node4->team1 = team8;
int err4 = pthread_create(&node4->thr1, 0, Example7ThreaderMain1, node4);
if (__builtin_expect(err4, 0)) {
char* msg4 = Example7Errmsg1(__LINE__, "errno %d", err4);
// Mutex and cond exist but the thread was not started.
team8->unwind1.nodeMuts1 = node4-nodes5+1;
team8->unwind1.nodeConds1 = node4-nodes5+1;
team8->unwind1.join1 = node4-nodes5;
return msg4;
}
}
// Full success: every node owns all three resources.
team8->unwind1.nodeMuts1 = nt7;
team8->unwind1.nodeConds1 = nt7;
team8->unwind1.join1 = nt7;
return 0;
}

// Initialize the hub's mutex and condition variable, recording each
// success in the unwind bookkeeping, then continue with node setup.
// Returns an error string (caller frees) or 0 on success.
static char* Example7ThreaderCreate1Up3(Example7ThreaderTeam1* team7, ptrdiff_t nt6) {
Example7ThreaderHub1* hubPtr1 = team7->hub2;
int rc1 = pthread_mutex_init(&hubPtr1->mut1, 0);
if (__builtin_expect(rc1, 0)) return Example7Errmsg1(__LINE__, "errno %d", rc1);
team7->unwind1.hubMut1 = 1; // mutex now needs destruction on unwind
int rc2 = pthread_cond_init(&hubPtr1->cond1, 0);
if (__builtin_expect(rc2, 0)) return Example7Errmsg1(__LINE__, "errno %d", rc2);
team7->unwind1.hubCond1 = 1; // cond now needs destruction on unwind
return Example7ThreaderCreate1Up4(team7, nt6);
}

// Allocate the node array with room to align it to 64 bytes, guarding
// against size_t overflow of the byte count, then continue with hub
// and node initialization. Returns an error string or 0 on success.
static char* Example7ThreaderCreate1Up2(Example7ThreaderTeam1* team6, ptrdiff_t nt5) {
size_t bytes1 = (size_t)nt5*sizeof(Example7ThreaderNode1);
int overflow1 = bytes1/sizeof(Example7ThreaderNode1) != (size_t)nt5;
if (__builtin_expect(overflow1, 0)) {
return Example7Errmsg1(__LINE__, "too many threads");
}
void* raw1 = malloc(bytes1+63); // 63 spare bytes for alignment
if (__builtin_expect(!raw1, 0)) {
return Example7Errmsg1(__LINE__, "errno %d", errno);
}
team6->unwind1.nodes1 = raw1; // keep the unaligned base for free()
team6->nodes2 = (void*)(((size_t)raw1+63)&-64); // round up to 64
return Example7ThreaderCreate1Up3(team6, nt5);
}

// Record the thread count and allocate the hub: the fixed struct plus
// a status bitset with at least one bit per node (rounded up a word),
// padded so the whole hub can be aligned to 64 bytes.
static char* Example7ThreaderCreate1Up1(Example7ThreaderTeam1* team5, ptrdiff_t nt4) {
team5->nt1 = nt4;
size_t bytes2 = sizeof(Example7ThreaderHub1);
bytes2 += sizeof(long)*((size_t)nt4/(sizeof(long)*8)+1); // status1 words
bytes2 = (bytes2+63)&-64; // round the hub footprint to 64
void* raw2 = malloc(bytes2+63); // 63 spare bytes for alignment
if (__builtin_expect(!raw2, 0)) {
return Example7Errmsg1(__LINE__, "errno %d", errno);
}
team5->unwind1.hub1 = raw2; // keep the unaligned base for free()
team5->hub2 = (void*)(((size_t)raw2+63)&-64); // round up to 64
return Example7ThreaderCreate1Up2(team5, nt4);
}

// Create a team of nt3 worker threads. On success stores the new team
// through team4 and returns 0; on failure leaves team4 untouched,
// rolls back any partial construction, and returns an error string
// that the caller must free.
static char* Example7ThreaderCreate1(Example7ThreaderTeam1** team4, ptrdiff_t nt3) {
if (__builtin_expect(nt3 < 1, 0)) {
return Example7Errmsg1(__LINE__, "too few threads");
}
// calloc zeroes the unwind counters, so Destroy1 on a fresh team is a no-op.
Example7ThreaderTeam1* fresh1 = calloc(1, sizeof(Example7ThreaderTeam1));
if (__builtin_expect(!fresh1, 0)) {
return Example7Errmsg1(__LINE__, "errno %d", errno);
}
char* err1 = Example7ThreaderCreate1Up1(fresh1, nt3);
if (__builtin_expect(!!err1, 0)) {
Example7ThreaderDestroy1(fresh1); // partial rollback via unwind counters
return err1;
}
*team4 = fresh1;
return 0;
}

// Fetch the pthread_t identifier of the team's idx1-th worker thread.
// Returns 0 on success, or an error string (caller frees) when idx1
// is out of range.
static char* Example7ThreaderPthreadT1(
pthread_t* thr2,
Example7ThreaderTeam1* team9,
ptrdiff_t idx1
) {
int valid1 = idx1 >= 0 && idx1 < team9->nt1;
if (__builtin_expect(!valid1, 0)) {
return Example7Errmsg1(__LINE__, "bad thread idx");
}
*thr2 = team9->nodes2[idx1].thr1;
return 0;
}

// Distribute task3 across the team and block until it completes. The
// hull's total item count is split as evenly as possible: every node
// gets tot1/nt8 items and the first tot1%nt8 nodes get one extra.
static void Example7ThreaderDo1(Example7ThreaderTeam1* team10, Example7ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return; // zero-dimensional task: nothing to iterate
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
int64_t plus2[4];
// plus2 is each1 expressed as a mixed-radix coordinate step, used to
// advance each node's starting coordinate past the previous slice.
Example7ThreaderPut1(nd6, task3->hull1, plus2, each1);
int64_t pt6[4] = {0};
Example7ThreaderHub1* hub6 = team10->hub2;
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
Example7ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
int64_t carry3 = i4 < more1; // first more1 nodes take one extra item
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6)); // starting coordinate of this slice
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
Example7ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Mark every node busy: all bits set, including the bits past nt8
// that serve as scan terminators in Example7ThreaderMain1. Then wait
// (holding the hub mutex) for the unfinished-node count to hit zero.
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

// Vectorized single-precision exp(x). Splits x = r*ln2 + f, evaluates
// a polynomial approximation of exp(f), and merges r into the float
// exponent bits to apply the 2^r scaling without a multiply.
static __m512 Example7Exp1(__m512 x1) {
x1 = _mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f)); // clamp: exp underflows near -87.3
x1 = _mm512_min_ps(x1, _mm512_set1_ps(8.872284e+01f)); // clamp: exp overflows near 88.7
__m512 t1 = _mm512_mul_ps(x1, _mm512_set1_ps(1.442695e+00f)); // x * log2(e)
__m512 r1 = _mm512_roundscale_ps(t1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
// f = x - r*ln2, with ln2 split into high/low parts (Cody-Waite style)
// so the subtraction loses less precision.
__m512 f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-6.9314575e-01f), x1);
f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-1.4286068e-06f), f1);
// Degree-4 polynomial in f (Horner form) approximating exp(f).
__m512 g1 = _mm512_set1_ps(4.194439e-02f);
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(1.6800667e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(4.9999994e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.999569e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.9999964e-01f));
// cvt rounds t1 to nearest (matching r1); shifting by 23 places the
// integer into the exponent field, and the add scales g1 by 2^r.
__m512i y1 = _mm512_slli_epi32(_mm512_cvtps_epi32(t1), 23);
return _mm512_castsi512_ps(_mm512_add_epi32(y1, _mm512_castps_si512(g1)));
}

// Reciprocal square root: the hardware rsqrt14 estimate refined by one
// Newton-Raphson step, y' = (y/2) * (3 - x*y*y). Identical operation
// sequence to the original, so results are bit-for-bit the same.
static __m512 Example7Rsqrt1(__m512 x2) {
__m512 est1 = _mm512_rsqrt14_ps(x2);
__m512 prod1 = _mm512_mul_ps(x2, est1);
__m512 halfEst1 = _mm512_mul_ps(est1, _mm512_set1_ps(5e-01f));
__m512 resid1 = _mm512_fnmadd_ps(est1, prod1, _mm512_set1_ps(3e+00f));
return _mm512_mul_ps(halfEst1, resid1);
}

static void Example7StriderArrangeFilts1Callee1(Example7ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
ptrdiff_t b2 = pt7[0];
ptrdiff_t g2 = pt7[1];
ptrdiff_t e1 = 0;
char*restrict bfPtr1 = tensors2[2]+1176*e1;
char*restrict wfPtr1 = tensors2[2]+1216+99499008*e1;
char*restrict wtPtr1 = tensors2[0]+31728*e1;
char*restrict biasPtr1 = tensors2[1];
ptrdiff_t i5 = 1*g2;
ptrdiff_t j1 = 1*b2;
ptrdiff_t jj1 = j1+0;
if (j1 < 20) {
for (; j1 != 20; ++j1) {
for (ptrdiff_t k1 = 0; k1 < 118; ++k1) {
__m512 wt1 = _mm512_maskz_loadu_ps(7, wtPtr1+0+232224*i5+11328*j1+48*k1);
__m512 wt2 = _mm512_maskz_loadu_ps(7, wtPtr1+12+232224*i5+11328*j1+48*k1);
__m512 wt3 = _mm512_maskz_loadu_ps(7, wtPtr1+24+232224*i5+11328*j1+48*k1);
__m512 wt4 = _mm512_maskz_loadu_ps(7, wtPtr1+36+232224*i5+11328*j1+48*k1);
__m512 fft1 = _mm512_add_ps(wt1, _mm512_setzero_ps());
__m512 fft89 = _mm512_add_ps(wt2, _mm512_setzero_ps());
__m512 fft2 = _mm512_sub_ps(wt1, _mm512_setzero_ps());
__m512 fft90 = _mm512_sub_ps(wt2, _mm512_setzero_ps());
__m512 fft3 = _mm512_add_ps(wt3, _mm512_setzero_ps());
__m512 fft91 = _mm512_add_ps(wt4, _mm512_setzero_ps());
__m512 fft4 = _mm512_sub_ps(wt3, _mm512_setzero_ps());
__m512 fft92 = _mm512_sub_ps(wt4, _mm512_setzero_ps());
__m512 fft5 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft93 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft6 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft94 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft95 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft8 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft96 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9 = _mm512_add_ps(fft1, fft5);
__m512 fft97 = _mm512_add_ps(fft89, fft93);
__m512 fft10 = _mm512_sub_ps(fft1, fft5);
__m512 fft98 = _mm512_sub_ps(fft89, fft93);
__m512 fft11 = _mm512_add_ps(fft3, fft7);
__m512 fft99 = _mm512_add_ps(fft91, fft95);
__m512 fft12 = _mm512_sub_ps(fft7, fft3);
__m512 fft100 = _mm512_sub_ps(fft95, fft91);
__m512 fft13 = _mm512_sub_ps(fft4, fft8);
__m512 fft101 = _mm512_sub_ps(fft92, fft96);
__m512 fft14 = _mm512_add_ps(fft4, fft8);
__m512 fft102 = _mm512_add_ps(fft92, fft96);
__m512 fft15 = _mm512_add_ps(fft9, fft11);
__m512 fft103 = _mm512_add_ps(fft97, fft99);
__m512 fft16 = _mm512_sub_ps(fft9, fft11);
__m512 fft104 = _mm512_sub_ps(fft97, fft99);
__m512 fft17 = _mm512_fmadd_ps(fft13, _mm512_set1_ps(7.0710677e-01f), fft2);
__m512 fft105 = _mm512_fmadd_ps(fft101, _mm512_set1_ps(7.0710677e-01f), fft90);
__m512 fft18 = _mm512_fnmsub_ps(fft14, _mm512_set1_ps(7.0710677e-01f), fft6);
__m512 fft106 = _mm512_fnmsub_ps(fft102, _mm512_set1_ps(7.0710677e-01f), fft94);
__m512 fft19 = _mm512_fnmadd_ps(fft13, _mm512_set1_ps(7.0710677e-01f), fft2);
__m512 fft107 = _mm512_fnmadd_ps(fft101, _mm512_set1_ps(7.0710677e-01f), fft90);
__m512 fft20 = _mm512_fnmadd_ps(fft14, _mm512_set1_ps(7.0710677e-01f), fft6);
__m512 fft108 = _mm512_fnmadd_ps(fft102, _mm512_set1_ps(7.0710677e-01f), fft94);
__m512 fft21 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft22 = _mm512_fmadd_ps(fft15, fft21, _mm512_shuffle_f32x4(fft15, fft15, 78));
__m512 fft109 = _mm512_fmadd_ps(fft103, fft21, _mm512_shuffle_f32x4(fft103, fft103, 78));
__m512 fft23 = _mm512_fmadd_ps(fft16, fft21, _mm512_shuffle_f32x4(fft16, fft16, 78));
__m512 fft110 = _mm512_fmadd_ps(fft104, fft21, _mm512_shuffle_f32x4(fft104, fft104, 78));
__m512 fft24 = _mm512_fmadd_ps(fft17, fft21, _mm512_shuffle_f32x4(fft17, fft17, 78));
__m512 fft111 = _mm512_fmadd_ps(fft105, fft21, _mm512_shuffle_f32x4(fft105, fft105, 78));
__m512 fft25 = _mm512_fmadd_ps(fft18, fft21, _mm512_shuffle_f32x4(fft18, fft18, 78));
__m512 fft112 = _mm512_fmadd_ps(fft106, fft21, _mm512_shuffle_f32x4(fft106, fft106, 78));
__m512 fft26 = _mm512_fmadd_ps(fft10, fft21, _mm512_shuffle_f32x4(fft10, fft10, 78));
__m512 fft113 = _mm512_fmadd_ps(fft98, fft21, _mm512_shuffle_f32x4(fft98, fft98, 78));
__m512 fft27 = _mm512_fmadd_ps(fft12, fft21, _mm512_shuffle_f32x4(fft12, fft12, 78));
__m512 fft114 = _mm512_fmadd_ps(fft100, fft21, _mm512_shuffle_f32x4(fft100, fft100, 78));
__m512 fft28 = _mm512_fmadd_ps(fft19, fft21, _mm512_shuffle_f32x4(fft19, fft19, 78));
__m512 fft115 = _mm512_fmadd_ps(fft107, fft21, _mm512_shuffle_f32x4(fft107, fft107, 78));
__m512 fft29 = _mm512_fmadd_ps(fft20, fft21, _mm512_shuffle_f32x4(fft20, fft20, 78));
__m512 fft116 = _mm512_fmadd_ps(fft108, fft21, _mm512_shuffle_f32x4(fft108, fft108, 78));
__m512 fft30 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft31 = _mm512_mul_ps(fft22, fft30);
__m512 fft117 = _mm512_mul_ps(fft109, fft30);
__m512 fft32 = _mm512_mul_ps(fft23, fft30);
__m512 fft118 = _mm512_mul_ps(fft110, fft30);
__m512 fft33 = _mm512_mul_ps(fft24, fft30);
__m512 fft119 = _mm512_mul_ps(fft111, fft30);
__m512 fft34 = _mm512_mul_ps(fft25, fft30);
__m512 fft120 = _mm512_mul_ps(fft112, fft30);
__m512 fft35 = _mm512_mul_ps(fft26, fft30);
__m512 fft121 = _mm512_mul_ps(fft113, fft30);
__m512 fft36 = _mm512_mul_ps(fft27, fft30);
__m512 fft122 = _mm512_mul_ps(fft114, fft30);
__m512 fft37 = _mm512_mul_ps(fft28, fft30);
__m512 fft123 = _mm512_mul_ps(fft115, fft30);
__m512 fft38 = _mm512_mul_ps(fft29, fft30);
__m512 fft124 = _mm512_mul_ps(fft116, fft30);
__m512 fft39 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft40 = _mm512_fmadd_ps(fft23, fft39, fft31);
__m512 fft125 = _mm512_fmadd_ps(fft110, fft39, fft117);
__m512 fft41 = _mm512_fnmadd_ps(fft22, fft39, fft32);
__m512 fft126 = _mm512_fnmadd_ps(fft109, fft39, fft118);
__m512 fft42 = _mm512_fmadd_ps(fft25, fft39, fft33);
__m512 fft127 = _mm512_fmadd_ps(fft112, fft39, fft119);
__m512 fft43 = _mm512_fnmadd_ps(fft24, fft39, fft34);
__m512 fft128 = _mm512_fnmadd_ps(fft111, fft39, fft120);
__m512 fft44 = _mm512_fmadd_ps(fft27, fft39, fft35);
__m512 fft129 = _mm512_fmadd_ps(fft114, fft39, fft121);
__m512 fft45 = _mm512_fnmadd_ps(fft26, fft39, fft36);
__m512 fft130 = _mm512_fnmadd_ps(fft113, fft39, fft122);
__m512 fft46 = _mm512_fmadd_ps(fft29, fft39, fft37);
__m512 fft131 = _mm512_fmadd_ps(fft116, fft39, fft123);
__m512 fft47 = _mm512_fnmadd_ps(fft28, fft39, fft38);
__m512 fft132 = _mm512_fnmadd_ps(fft115, fft39, fft124);
__m512 fft48 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft49 = _mm512_fmadd_ps(fft40, fft48, _mm512_shuffle_f32x4(fft40, fft40, 177));
__m512 fft133 = _mm512_fmadd_ps(fft125, fft48, _mm512_shuffle_f32x4(fft125, fft125, 177));
__m512 fft50 = _mm512_fmadd_ps(fft41, fft48, _mm512_shuffle_f32x4(fft41, fft41, 177));
__m512 fft134 = _mm512_fmadd_ps(fft126, fft48, _mm512_shuffle_f32x4(fft126, fft126, 177));
__m512 fft51 = _mm512_fmadd_ps(fft42, fft48, _mm512_shuffle_f32x4(fft42, fft42, 177));
__m512 fft135 = _mm512_fmadd_ps(fft127, fft48, _mm512_shuffle_f32x4(fft127, fft127, 177));
__m512 fft52 = _mm512_fmadd_ps(fft43, fft48, _mm512_shuffle_f32x4(fft43, fft43, 177));
__m512 fft136 = _mm512_fmadd_ps(fft128, fft48, _mm512_shuffle_f32x4(fft128, fft128, 177));
__m512 fft53 = _mm512_fmadd_ps(fft44, fft48, _mm512_shuffle_f32x4(fft44, fft44, 177));
__m512 fft137 = _mm512_fmadd_ps(fft129, fft48, _mm512_shuffle_f32x4(fft129, fft129, 177));
__m512 fft54 = _mm512_fmadd_ps(fft45, fft48, _mm512_shuffle_f32x4(fft45, fft45, 177));
__m512 fft138 = _mm512_fmadd_ps(fft130, fft48, _mm512_shuffle_f32x4(fft130, fft130, 177));
__m512 fft55 = _mm512_fmadd_ps(fft46, fft48, _mm512_shuffle_f32x4(fft46, fft46, 177));
__m512 fft139 = _mm512_fmadd_ps(fft131, fft48, _mm512_shuffle_f32x4(fft131, fft131, 177));
__m512 fft56 = _mm512_fmadd_ps(fft47, fft48, _mm512_shuffle_f32x4(fft47, fft47, 177));
__m512 fft140 = _mm512_fmadd_ps(fft132, fft48, _mm512_shuffle_f32x4(fft132, fft132, 177));
__m512 fft57 = _mm512_mask_mov_ps(fft49, 49344, fft50);
__m512 fft141 = _mm512_mask_mov_ps(fft133, 49344, fft134);
__m512 fft58 = _mm512_mask_sub_ps(fft50, 49344, _mm512_setzero_ps(), fft49);
__m512 fft142 = _mm512_mask_sub_ps(fft134, 49344, _mm512_setzero_ps(), fft133);
__m512 fft59 = _mm512_mask_mov_ps(fft51, 49344, fft52);
__m512 fft143 = _mm512_mask_mov_ps(fft135, 49344, fft136);
__m512 fft60 = _mm512_mask_sub_ps(fft52, 49344, _mm512_setzero_ps(), fft51);
__m512 fft144 = _mm512_mask_sub_ps(fft136, 49344, _mm512_setzero_ps(), fft135);
__m512 fft61 = _mm512_mask_mov_ps(fft53, 49344, fft54);
__m512 fft145 = _mm512_mask_mov_ps(fft137, 49344, fft138);
__m512 fft62 = _mm512_mask_sub_ps(fft54, 49344, _mm512_setzero_ps(), fft53);
__m512 fft146 = _mm512_mask_sub_ps(fft138, 49344, _mm512_setzero_ps(), fft137);
__m512 fft63 = _mm512_mask_mov_ps(fft55, 49344, fft56);
__m512 fft147 = _mm512_mask_mov_ps(fft139, 49344, fft140);
__m512 fft64 = _mm512_mask_sub_ps(fft56, 49344, _mm512_setzero_ps(), fft55);
__m512 fft148 = _mm512_mask_sub_ps(fft140, 49344, _mm512_setzero_ps(), fft139);
__m512 fft65 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft66 = _mm512_fmadd_ps(fft57, fft65, _mm512_shuffle_ps(fft57, fft57, 78));
__m512 fft149 = _mm512_fmadd_ps(fft141, fft65, _mm512_shuffle_ps(fft141, fft141, 78));
__m512 fft67 = _mm512_fmadd_ps(fft58, fft65, _mm512_shuffle_ps(fft58, fft58, 78));
__m512 fft150 = _mm512_fmadd_ps(fft142, fft65, _mm512_shuffle_ps(fft142, fft142, 78));
__m512 fft68 = _mm512_fmadd_ps(fft59, fft65, _mm512_shuffle_ps(fft59, fft59, 78));
__m512 fft151 = _mm512_fmadd_ps(fft143, fft65, _mm512_shuffle_ps(fft143, fft143, 78));
__m512 fft69 = _mm512_fmadd_ps(fft60, fft65, _mm512_shuffle_ps(fft60, fft60, 78));
__m512 fft152 = _mm512_fmadd_ps(fft144, fft65, _mm512_shuffle_ps(fft144, fft144, 78));
__m512 fft70 = _mm512_fmadd_ps(fft61, fft65, _mm512_shuffle_ps(fft61, fft61, 78));
__m512 fft153 = _mm512_fmadd_ps(fft145, fft65, _mm512_shuffle_ps(fft145, fft145, 78));
__m512 fft71 = _mm512_fmadd_ps(fft62, fft65, _mm512_shuffle_ps(fft62, fft62, 78));
__m512 fft154 = _mm512_fmadd_ps(fft146, fft65, _mm512_shuffle_ps(fft146, fft146, 78));
__m512 fft72 = _mm512_fmadd_ps(fft63, fft65, _mm512_shuffle_ps(fft63, fft63, 78));
__m512 fft155 = _mm512_fmadd_ps(fft147, fft65, _mm512_shuffle_ps(fft147, fft147, 78));
__m512 fft73 = _mm512_fmadd_ps(fft64, fft65, _mm512_shuffle_ps(fft64, fft64, 78));
__m512 fft156 = _mm512_fmadd_ps(fft148, fft65, _mm512_shuffle_ps(fft148, fft148, 78));
__m512i fft74 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft75 = _mm512_permutexvar_ps(fft74, fft66);
__m512 fft157 = _mm512_permutexvar_ps(fft74, fft149);
__m512i fft76 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft77 = _mm512_permutexvar_ps(fft76, fft66);
__m512 fft158 = _mm512_permutexvar_ps(fft76, fft149);
__m512 fft78 = _mm512_permutexvar_ps(fft74, fft67);
__m512 fft159 = _mm512_permutexvar_ps(fft74, fft150);
__m512 fft79 = _mm512_permutexvar_ps(fft76, fft67);
__m512 fft160 = _mm512_permutexvar_ps(fft76, fft150);
__m512 fft80 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft81 = _mm512_fmadd_ps(fft75, fft80, fft77);
__m512 fft161 = _mm512_fmadd_ps(fft157, fft80, fft158);
__m512 fft82 = _mm512_fnmadd_ps(fft79, fft80, fft78);
__m512 fft162 = _mm512_fnmadd_ps(fft160, fft80, fft159);
__m512 fft83 = _mm512_mask_mov_ps(fft79, 21845, fft81);
__m512 fft163 = _mm512_mask_mov_ps(fft160, 21845, fft161);
__m512 fft84 = _mm512_mask_mov_ps(fft75, 43176, fft81);
__m512 fft164 = _mm512_mask_mov_ps(fft157, 43176, fft161);
__m512 fft85 = _mm512_mask_mov_ps(fft83, 43176, fft82);
__m512 fft165 = _mm512_mask_mov_ps(fft163, 43176, fft162);
__m512 fft86 = _mm512_mask_mov_ps(fft84, 22102, fft82);
__m512 fft166 = _mm512_mask_mov_ps(fft164, 22102, fft162);
__m512 fft87 = _mm512_mask_mul_ps(fft85, 64764, fft85, _mm512_set1_ps(5e-01f));
__m512 fft167 = _mm512_mask_mul_ps(fft165, 64764, fft165, _mm512_set1_ps(5e-01f));
__m512 fft88 = _mm512_mask_mul_ps(fft86, 64764, fft86, _mm512_set1_ps(5e-01f));
__m512 fft168 = _mm512_mask_mul_ps(fft166, 64764, fft166, _mm512_set1_ps(5e-01f));
__m512 wf1 = fft87;
__m512 wf9 = fft167;
__m512 wf2 = fft88;
__m512 wf10 = fft168;
__m512 wf3 = fft68;
__m512 wf11 = fft151;
__m512 wf4 = fft69;
__m512 wf12 = fft152;
__m512 wf5 = fft70;
__m512 wf13 = fft153;
__m512 wf6 = fft71;
__m512 wf14 = fft154;
__m512 wf7 = fft72;
__m512 wf15 = fft155;
__m512 wf8 = fft73;
__m512 wf16 = fft156;
ptrdiff_t c1 = (size_t)(0+2*j1)/4;
ptrdiff_t m1 = (size_t)(0+2*j1)%4/2;
ptrdiff_t f2 = (size_t)(0+2*j1)%2;
__m512i eo1 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf3 = _mm512_permutexvar_ps(eo1, wf3);
wf4 = _mm512_permutexvar_ps(eo1, wf4);
__m512i wfs1 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf3, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs1 = _mm512_inserti64x4(wfs1, _mm512_cvtps_ph(wf4, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+158592+634368*i5+15104*c1+128*k1+64*m1+16*f2, 3855, wfs1);
_mm512_mask_storeu_epi32(wfPtr1+4599152+634368*i5+15104*c1+128*k1+64*m1+16*f2, 61680, wfs1);
wf11 = _mm512_permutexvar_ps(eo1, wf11);
wf12 = _mm512_permutexvar_ps(eo1, wf12);
__m512i wfs2 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf11, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs2 = _mm512_inserti64x4(wfs2, _mm512_cvtps_ph(wf12, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+9039744+634368*i5+15104*c1+128*k1+64*m1+16*f2, 3855, wfs2);
_mm512_mask_storeu_epi32(wfPtr1+13480304+634368*i5+15104*c1+128*k1+64*m1+16*f2, 61680, wfs2);
wf5 = _mm512_permutexvar_ps(eo1, wf5);
wf6 = _mm512_permutexvar_ps(eo1, wf6);
__m512i wfs3 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf5, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs3 = _mm512_inserti64x4(wfs3, _mm512_cvtps_ph(wf6, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+317184+634368*i5+15104*c1+128*k1+64*m1+16*f2, 3855, wfs3);
_mm512_mask_storeu_epi32(wfPtr1+4757744+634368*i5+15104*c1+128*k1+64*m1+16*f2, 61680, wfs3);
wf13 = _mm512_permutexvar_ps(eo1, wf13);
wf14 = _mm512_permutexvar_ps(eo1, wf14);
__m512i wfs4 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf13, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs4 = _mm512_inserti64x4(wfs4, _mm512_cvtps_ph(wf14, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+9198336+634368*i5+15104*c1+128*k1+64*m1+16*f2, 3855, wfs4);
_mm512_mask_storeu_epi32(wfPtr1+13638896+634368*i5+15104*c1+128*k1+64*m1+16*f2, 61680, wfs4);
wf7 = _mm512_permutexvar_ps(eo1, wf7);
wf8 = _mm512_permutexvar_ps(eo1, wf8);
__m512i wfs5 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf7, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs5 = _mm512_inserti64x4(wfs5, _mm512_cvtps_ph(wf8, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+475776+634368*i5+15104*c1+128*k1+64*m1+16*f2, 3855, wfs5);
_mm512_mask_storeu_epi32(wfPtr1+4916336+634368*i5+15104*c1+128*k1+64*m1+16*f2, 61680, wfs5);
wf15 = _mm512_permutexvar_ps(eo1, wf15);
wf16 = _mm512_permutexvar_ps(eo1, wf16);
__m512i wfs6 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf15, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs6 = _mm512_inserti64x4(wfs6, _mm512_cvtps_ph(wf16, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+9356928+634368*i5+15104*c1+128*k1+64*m1+16*f2, 3855, wfs6);
_mm512_mask_storeu_epi32(wfPtr1+13797488+634368*i5+15104*c1+128*k1+64*m1+16*f2, 61680, wfs6);
__m512i wfs7 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs7 = _mm512_inserti64x4(wfs7, _mm512_cvtps_ph(wf2, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+0+634368*i5+15104*c1+128*k1+64*m1+16*f2, 3855, wfs7);
_mm512_mask_storeu_epi32(wfPtr1+4440560+634368*i5+15104*c1+128*k1+64*m1+16*f2, 61680, wfs7);
__m512i wfs8 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf9, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs8 = _mm512_inserti64x4(wfs8, _mm512_cvtps_ph(wf10, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+8881152+634368*i5+15104*c1+128*k1+64*m1+16*f2, 3855, wfs8);
_mm512_mask_storeu_epi32(wfPtr1+13321712+634368*i5+15104*c1+128*k1+64*m1+16*f2, 61680, wfs8);
__m512 wt5 = _mm512_maskz_loadu_ps(7, wtPtr1+5664+232224*i5+11328*j1+48*k1);
__m512 wt6 = _mm512_maskz_loadu_ps(7, wtPtr1+5676+232224*i5+11328*j1+48*k1);
__m512 wt7 = _mm512_maskz_loadu_ps(7, wtPtr1+5688+232224*i5+11328*j1+48*k1);
__m512 wt8 = _mm512_maskz_loadu_ps(7, wtPtr1+5700+232224*i5+11328*j1+48*k1);
__m512 fft169 = _mm512_add_ps(wt5, _mm512_setzero_ps());
__m512 fft257 = _mm512_add_ps(wt6, _mm512_setzero_ps());
__m512 fft170 = _mm512_sub_ps(wt5, _mm512_setzero_ps());
__m512 fft258 = _mm512_sub_ps(wt6, _mm512_setzero_ps());
__m512 fft171 = _mm512_add_ps(wt7, _mm512_setzero_ps());
__m512 fft259 = _mm512_add_ps(wt8, _mm512_setzero_ps());
__m512 fft172 = _mm512_sub_ps(wt7, _mm512_setzero_ps());
__m512 fft260 = _mm512_sub_ps(wt8, _mm512_setzero_ps());
__m512 fft173 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft261 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft174 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft262 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft175 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft263 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft176 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft264 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft177 = _mm512_add_ps(fft169, fft173);
__m512 fft265 = _mm512_add_ps(fft257, fft261);
__m512 fft178 = _mm512_sub_ps(fft169, fft173);
__m512 fft266 = _mm512_sub_ps(fft257, fft261);
__m512 fft179 = _mm512_add_ps(fft171, fft175);
__m512 fft267 = _mm512_add_ps(fft259, fft263);
__m512 fft180 = _mm512_sub_ps(fft175, fft171);
__m512 fft268 = _mm512_sub_ps(fft263, fft259);
__m512 fft181 = _mm512_sub_ps(fft172, fft176);
__m512 fft269 = _mm512_sub_ps(fft260, fft264);
__m512 fft182 = _mm512_add_ps(fft172, fft176);
__m512 fft270 = _mm512_add_ps(fft260, fft264);
__m512 fft183 = _mm512_add_ps(fft177, fft179);
__m512 fft271 = _mm512_add_ps(fft265, fft267);
__m512 fft184 = _mm512_sub_ps(fft177, fft179);
__m512 fft272 = _mm512_sub_ps(fft265, fft267);
__m512 fft185 = _mm512_fmadd_ps(fft181, _mm512_set1_ps(7.0710677e-01f), fft170);
__m512 fft273 = _mm512_fmadd_ps(fft269, _mm512_set1_ps(7.0710677e-01f), fft258);
__m512 fft186 = _mm512_fnmsub_ps(fft182, _mm512_set1_ps(7.0710677e-01f), fft174);
__m512 fft274 = _mm512_fnmsub_ps(fft270, _mm512_set1_ps(7.0710677e-01f), fft262);
__m512 fft187 = _mm512_fnmadd_ps(fft181, _mm512_set1_ps(7.0710677e-01f), fft170);
__m512 fft275 = _mm512_fnmadd_ps(fft269, _mm512_set1_ps(7.0710677e-01f), fft258);
__m512 fft188 = _mm512_fnmadd_ps(fft182, _mm512_set1_ps(7.0710677e-01f), fft174);
__m512 fft276 = _mm512_fnmadd_ps(fft270, _mm512_set1_ps(7.0710677e-01f), fft262);
__m512 fft189 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft190 = _mm512_fmadd_ps(fft183, fft189, _mm512_shuffle_f32x4(fft183, fft183, 78));
__m512 fft277 = _mm512_fmadd_ps(fft271, fft189, _mm512_shuffle_f32x4(fft271, fft271, 78));
__m512 fft191 = _mm512_fmadd_ps(fft184, fft189, _mm512_shuffle_f32x4(fft184, fft184, 78));
__m512 fft278 = _mm512_fmadd_ps(fft272, fft189, _mm512_shuffle_f32x4(fft272, fft272, 78));
__m512 fft192 = _mm512_fmadd_ps(fft185, fft189, _mm512_shuffle_f32x4(fft185, fft185, 78));
__m512 fft279 = _mm512_fmadd_ps(fft273, fft189, _mm512_shuffle_f32x4(fft273, fft273, 78));
__m512 fft193 = _mm512_fmadd_ps(fft186, fft189, _mm512_shuffle_f32x4(fft186, fft186, 78));
__m512 fft280 = _mm512_fmadd_ps(fft274, fft189, _mm512_shuffle_f32x4(fft274, fft274, 78));
__m512 fft194 = _mm512_fmadd_ps(fft178, fft189, _mm512_shuffle_f32x4(fft178, fft178, 78));
__m512 fft281 = _mm512_fmadd_ps(fft266, fft189, _mm512_shuffle_f32x4(fft266, fft266, 78));
__m512 fft195 = _mm512_fmadd_ps(fft180, fft189, _mm512_shuffle_f32x4(fft180, fft180, 78));
__m512 fft282 = _mm512_fmadd_ps(fft268, fft189, _mm512_shuffle_f32x4(fft268, fft268, 78));
__m512 fft196 = _mm512_fmadd_ps(fft187, fft189, _mm512_shuffle_f32x4(fft187, fft187, 78));
__m512 fft283 = _mm512_fmadd_ps(fft275, fft189, _mm512_shuffle_f32x4(fft275, fft275, 78));
__m512 fft197 = _mm512_fmadd_ps(fft188, fft189, _mm512_shuffle_f32x4(fft188, fft188, 78));
__m512 fft284 = _mm512_fmadd_ps(fft276, fft189, _mm512_shuffle_f32x4(fft276, fft276, 78));
__m512 fft198 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft199 = _mm512_mul_ps(fft190, fft198);
__m512 fft285 = _mm512_mul_ps(fft277, fft198);
__m512 fft200 = _mm512_mul_ps(fft191, fft198);
__m512 fft286 = _mm512_mul_ps(fft278, fft198);
__m512 fft201 = _mm512_mul_ps(fft192, fft198);
__m512 fft287 = _mm512_mul_ps(fft279, fft198);
__m512 fft202 = _mm512_mul_ps(fft193, fft198);
__m512 fft288 = _mm512_mul_ps(fft280, fft198);
__m512 fft203 = _mm512_mul_ps(fft194, fft198);
__m512 fft289 = _mm512_mul_ps(fft281, fft198);
__m512 fft204 = _mm512_mul_ps(fft195, fft198);
__m512 fft290 = _mm512_mul_ps(fft282, fft198);
__m512 fft205 = _mm512_mul_ps(fft196, fft198);
__m512 fft291 = _mm512_mul_ps(fft283, fft198);
__m512 fft206 = _mm512_mul_ps(fft197, fft198);
__m512 fft292 = _mm512_mul_ps(fft284, fft198);
__m512 fft207 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft208 = _mm512_fmadd_ps(fft191, fft207, fft199);
__m512 fft293 = _mm512_fmadd_ps(fft278, fft207, fft285);
__m512 fft209 = _mm512_fnmadd_ps(fft190, fft207, fft200);
__m512 fft294 = _mm512_fnmadd_ps(fft277, fft207, fft286);
__m512 fft210 = _mm512_fmadd_ps(fft193, fft207, fft201);
__m512 fft295 = _mm512_fmadd_ps(fft280, fft207, fft287);
__m512 fft211 = _mm512_fnmadd_ps(fft192, fft207, fft202);
__m512 fft296 = _mm512_fnmadd_ps(fft279, fft207, fft288);
__m512 fft212 = _mm512_fmadd_ps(fft195, fft207, fft203);
__m512 fft297 = _mm512_fmadd_ps(fft282, fft207, fft289);
__m512 fft213 = _mm512_fnmadd_ps(fft194, fft207, fft204);
__m512 fft298 = _mm512_fnmadd_ps(fft281, fft207, fft290);
__m512 fft214 = _mm512_fmadd_ps(fft197, fft207, fft205);
__m512 fft299 = _mm512_fmadd_ps(fft284, fft207, fft291);
__m512 fft215 = _mm512_fnmadd_ps(fft196, fft207, fft206);
__m512 fft300 = _mm512_fnmadd_ps(fft283, fft207, fft292);
__m512 fft216 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft217 = _mm512_fmadd_ps(fft208, fft216, _mm512_shuffle_f32x4(fft208, fft208, 177));
__m512 fft301 = _mm512_fmadd_ps(fft293, fft216, _mm512_shuffle_f32x4(fft293, fft293, 177));
__m512 fft218 = _mm512_fmadd_ps(fft209, fft216, _mm512_shuffle_f32x4(fft209, fft209, 177));
__m512 fft302 = _mm512_fmadd_ps(fft294, fft216, _mm512_shuffle_f32x4(fft294, fft294, 177));
__m512 fft219 = _mm512_fmadd_ps(fft210, fft216, _mm512_shuffle_f32x4(fft210, fft210, 177));
__m512 fft303 = _mm512_fmadd_ps(fft295, fft216, _mm512_shuffle_f32x4(fft295, fft295, 177));
__m512 fft220 = _mm512_fmadd_ps(fft211, fft216, _mm512_shuffle_f32x4(fft211, fft211, 177));
__m512 fft304 = _mm512_fmadd_ps(fft296, fft216, _mm512_shuffle_f32x4(fft296, fft296, 177));
__m512 fft221 = _mm512_fmadd_ps(fft212, fft216, _mm512_shuffle_f32x4(fft212, fft212, 177));
__m512 fft305 = _mm512_fmadd_ps(fft297, fft216, _mm512_shuffle_f32x4(fft297, fft297, 177));
__m512 fft222 = _mm512_fmadd_ps(fft213, fft216, _mm512_shuffle_f32x4(fft213, fft213, 177));
__m512 fft306 = _mm512_fmadd_ps(fft298, fft216, _mm512_shuffle_f32x4(fft298, fft298, 177));
__m512 fft223 = _mm512_fmadd_ps(fft214, fft216, _mm512_shuffle_f32x4(fft214, fft214, 177));
__m512 fft307 = _mm512_fmadd_ps(fft299, fft216, _mm512_shuffle_f32x4(fft299, fft299, 177));
__m512 fft224 = _mm512_fmadd_ps(fft215, fft216, _mm512_shuffle_f32x4(fft215, fft215, 177));
__m512 fft308 = _mm512_fmadd_ps(fft300, fft216, _mm512_shuffle_f32x4(fft300, fft300, 177));
__m512 fft225 = _mm512_mask_mov_ps(fft217, 49344, fft218);
__m512 fft309 = _mm512_mask_mov_ps(fft301, 49344, fft302);
__m512 fft226 = _mm512_mask_sub_ps(fft218, 49344, _mm512_setzero_ps(), fft217);
__m512 fft310 = _mm512_mask_sub_ps(fft302, 49344, _mm512_setzero_ps(), fft301);
__m512 fft227 = _mm512_mask_mov_ps(fft219, 49344, fft220);
__m512 fft311 = _mm512_mask_mov_ps(fft303, 49344, fft304);
__m512 fft228 = _mm512_mask_sub_ps(fft220, 49344, _mm512_setzero_ps(), fft219);
__m512 fft312 = _mm512_mask_sub_ps(fft304, 49344, _mm512_setzero_ps(), fft303);
__m512 fft229 = _mm512_mask_mov_ps(fft221, 49344, fft222);
__m512 fft313 = _mm512_mask_mov_ps(fft305, 49344, fft306);
__m512 fft230 = _mm512_mask_sub_ps(fft222, 49344, _mm512_setzero_ps(), fft221);
__m512 fft314 = _mm512_mask_sub_ps(fft306, 49344, _mm512_setzero_ps(), fft305);
__m512 fft231 = _mm512_mask_mov_ps(fft223, 49344, fft224);
__m512 fft315 = _mm512_mask_mov_ps(fft307, 49344, fft308);
__m512 fft232 = _mm512_mask_sub_ps(fft224, 49344, _mm512_setzero_ps(), fft223);
__m512 fft316 = _mm512_mask_sub_ps(fft308, 49344, _mm512_setzero_ps(), fft307);
__m512 fft233 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft234 = _mm512_fmadd_ps(fft225, fft233, _mm512_shuffle_ps(fft225, fft225, 78));
__m512 fft317 = _mm512_fmadd_ps(fft309, fft233, _mm512_shuffle_ps(fft309, fft309, 78));
__m512 fft235 = _mm512_fmadd_ps(fft226, fft233, _mm512_shuffle_ps(fft226, fft226, 78));
__m512 fft318 = _mm512_fmadd_ps(fft310, fft233, _mm512_shuffle_ps(fft310, fft310, 78));
__m512 fft236 = _mm512_fmadd_ps(fft227, fft233, _mm512_shuffle_ps(fft227, fft227, 78));
__m512 fft319 = _mm512_fmadd_ps(fft311, fft233, _mm512_shuffle_ps(fft311, fft311, 78));
__m512 fft237 = _mm512_fmadd_ps(fft228, fft233, _mm512_shuffle_ps(fft228, fft228, 78));
__m512 fft320 = _mm512_fmadd_ps(fft312, fft233, _mm512_shuffle_ps(fft312, fft312, 78));
__m512 fft238 = _mm512_fmadd_ps(fft229, fft233, _mm512_shuffle_ps(fft229, fft229, 78));
__m512 fft321 = _mm512_fmadd_ps(fft313, fft233, _mm512_shuffle_ps(fft313, fft313, 78));
__m512 fft239 = _mm512_fmadd_ps(fft230, fft233, _mm512_shuffle_ps(fft230, fft230, 78));
__m512 fft322 = _mm512_fmadd_ps(fft314, fft233, _mm512_shuffle_ps(fft314, fft314, 78));
__m512 fft240 = _mm512_fmadd_ps(fft231, fft233, _mm512_shuffle_ps(fft231, fft231, 78));
__m512 fft323 = _mm512_fmadd_ps(fft315, fft233, _mm512_shuffle_ps(fft315, fft315, 78));
__m512 fft241 = _mm512_fmadd_ps(fft232, fft233, _mm512_shuffle_ps(fft232, fft232, 78));
__m512 fft324 = _mm512_fmadd_ps(fft316, fft233, _mm512_shuffle_ps(fft316, fft316, 78));
__m512i fft242 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft243 = _mm512_permutexvar_ps(fft242, fft234);
__m512 fft325 = _mm512_permutexvar_ps(fft242, fft317);
__m512i fft244 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft245 = _mm512_permutexvar_ps(fft244, fft234);
__m512 fft326 = _mm512_permutexvar_ps(fft244, fft317);
__m512 fft246 = _mm512_permutexvar_ps(fft242, fft235);
__m512 fft327 = _mm512_permutexvar_ps(fft242, fft318);
__m512 fft247 = _mm512_permutexvar_ps(fft244, fft235);
__m512 fft328 = _mm512_permutexvar_ps(fft244, fft318);
__m512 fft248 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft249 = _mm512_fmadd_ps(fft243, fft248, fft245);
__m512 fft329 = _mm512_fmadd_ps(fft325, fft248, fft326);
__m512 fft250 = _mm512_fnmadd_ps(fft247, fft248, fft246);
__m512 fft330 = _mm512_fnmadd_ps(fft328, fft248, fft327);
__m512 fft251 = _mm512_mask_mov_ps(fft247, 21845, fft249);
__m512 fft331 = _mm512_mask_mov_ps(fft328, 21845, fft329);
__m512 fft252 = _mm512_mask_mov_ps(fft243, 43176, fft249);
__m512 fft332 = _mm512_mask_mov_ps(fft325, 43176, fft329);
__m512 fft253 = _mm512_mask_mov_ps(fft251, 43176, fft250);
__m512 fft333 = _mm512_mask_mov_ps(fft331, 43176, fft330);
__m512 fft254 = _mm512_mask_mov_ps(fft252, 22102, fft250);
__m512 fft334 = _mm512_mask_mov_ps(fft332, 22102, fft330);
__m512 fft255 = _mm512_mask_mul_ps(fft253, 64764, fft253, _mm512_set1_ps(5e-01f));
__m512 fft335 = _mm512_mask_mul_ps(fft333, 64764, fft333, _mm512_set1_ps(5e-01f));
__m512 fft256 = _mm512_mask_mul_ps(fft254, 64764, fft254, _mm512_set1_ps(5e-01f));
__m512 fft336 = _mm512_mask_mul_ps(fft334, 64764, fft334, _mm512_set1_ps(5e-01f));
__m512 wf17 = fft255;
__m512 wf25 = fft335;
__m512 wf18 = fft256;
__m512 wf26 = fft336;
__m512 wf19 = fft236;
__m512 wf27 = fft319;
__m512 wf20 = fft237;
__m512 wf28 = fft320;
__m512 wf21 = fft238;
__m512 wf29 = fft321;
__m512 wf22 = fft239;
__m512 wf30 = fft322;
__m512 wf23 = fft240;
__m512 wf31 = fft323;
__m512 wf24 = fft241;
__m512 wf32 = fft324;
ptrdiff_t c2 = (size_t)(1+2*j1)/4;
ptrdiff_t m2 = (size_t)(1+2*j1)%4/2;
ptrdiff_t f3 = (size_t)(1+2*j1)%2;
__m512i eo2 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf19 = _mm512_permutexvar_ps(eo2, wf19);
wf20 = _mm512_permutexvar_ps(eo2, wf20);
__m512i wfs9 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf19, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs9 = _mm512_inserti64x4(wfs9, _mm512_cvtps_ph(wf20, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+158592+634368*i5+15104*c2+128*k1+64*m2+16*f3, 3855, wfs9);
_mm512_mask_storeu_epi32(wfPtr1+4599152+634368*i5+15104*c2+128*k1+64*m2+16*f3, 61680, wfs9);
wf27 = _mm512_permutexvar_ps(eo2, wf27);
wf28 = _mm512_permutexvar_ps(eo2, wf28);
__m512i wfs10 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf27, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs10 = _mm512_inserti64x4(wfs10, _mm512_cvtps_ph(wf28, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+9039744+634368*i5+15104*c2+128*k1+64*m2+16*f3, 3855, wfs10);
_mm512_mask_storeu_epi32(wfPtr1+13480304+634368*i5+15104*c2+128*k1+64*m2+16*f3, 61680, wfs10);
wf21 = _mm512_permutexvar_ps(eo2, wf21);
wf22 = _mm512_permutexvar_ps(eo2, wf22);
__m512i wfs11 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf21, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs11 = _mm512_inserti64x4(wfs11, _mm512_cvtps_ph(wf22, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+317184+634368*i5+15104*c2+128*k1+64*m2+16*f3, 3855, wfs11);
_mm512_mask_storeu_epi32(wfPtr1+4757744+634368*i5+15104*c2+128*k1+64*m2+16*f3, 61680, wfs11);
wf29 = _mm512_permutexvar_ps(eo2, wf29);
wf30 = _mm512_permutexvar_ps(eo2, wf30);
__m512i wfs12 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf29, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs12 = _mm512_inserti64x4(wfs12, _mm512_cvtps_ph(wf30, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+9198336+634368*i5+15104*c2+128*k1+64*m2+16*f3, 3855, wfs12);
_mm512_mask_storeu_epi32(wfPtr1+13638896+634368*i5+15104*c2+128*k1+64*m2+16*f3, 61680, wfs12);
wf23 = _mm512_permutexvar_ps(eo2, wf23);
wf24 = _mm512_permutexvar_ps(eo2, wf24);
__m512i wfs13 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf23, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs13 = _mm512_inserti64x4(wfs13, _mm512_cvtps_ph(wf24, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+475776+634368*i5+15104*c2+128*k1+64*m2+16*f3, 3855, wfs13);
_mm512_mask_storeu_epi32(wfPtr1+4916336+634368*i5+15104*c2+128*k1+64*m2+16*f3, 61680, wfs13);
wf31 = _mm512_permutexvar_ps(eo2, wf31);
wf32 = _mm512_permutexvar_ps(eo2, wf32);
__m512i wfs14 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf31, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs14 = _mm512_inserti64x4(wfs14, _mm512_cvtps_ph(wf32, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+9356928+634368*i5+15104*c2+128*k1+64*m2+16*f3, 3855, wfs14);
_mm512_mask_storeu_epi32(wfPtr1+13797488+634368*i5+15104*c2+128*k1+64*m2+16*f3, 61680, wfs14);
__m512i wfs15 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf17, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs15 = _mm512_inserti64x4(wfs15, _mm512_cvtps_ph(wf18, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+0+634368*i5+15104*c2+128*k1+64*m2+16*f3, 3855, wfs15);
_mm512_mask_storeu_epi32(wfPtr1+4440560+634368*i5+15104*c2+128*k1+64*m2+16*f3, 61680, wfs15);
__m512i wfs16 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf25, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs16 = _mm512_inserti64x4(wfs16, _mm512_cvtps_ph(wf26, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+8881152+634368*i5+15104*c2+128*k1+64*m2+16*f3, 3855, wfs16);
_mm512_mask_storeu_epi32(wfPtr1+13321712+634368*i5+15104*c2+128*k1+64*m2+16*f3, 61680, wfs16);
}
__m512 bias1 = _mm512_setzero_ps();
if (!e1) {
bias1 = _mm512_maskz_loadu_ps(3, biasPtr1-0+164*i5+8*j1);
bias1 = _mm512_mul_ps(bias1, _mm512_set1_ps(6.4e+01f));
}
_mm512_mask_storeu_ps(bfPtr1-0+168*i5+8*j1, 3, bias1);
if (j1 >= jj1) return;
}
}
if (j1 == 20) {
for (ptrdiff_t k2 = 0; k2 < 118; ++k2) {
__m512 wt9 = _mm512_maskz_loadu_ps(7, wtPtr1+0+232224*i5+11328*j1+48*k2);
__m512 wt10 = _mm512_maskz_loadu_ps(7, wtPtr1+12+232224*i5+11328*j1+48*k2);
__m512 wt11 = _mm512_maskz_loadu_ps(7, wtPtr1+24+232224*i5+11328*j1+48*k2);
__m512 wt12 = _mm512_maskz_loadu_ps(7, wtPtr1+36+232224*i5+11328*j1+48*k2);
__m512 fft337 = _mm512_add_ps(wt9, _mm512_setzero_ps());
__m512 fft425 = _mm512_add_ps(wt10, _mm512_setzero_ps());
__m512 fft338 = _mm512_sub_ps(wt9, _mm512_setzero_ps());
__m512 fft426 = _mm512_sub_ps(wt10, _mm512_setzero_ps());
__m512 fft339 = _mm512_add_ps(wt11, _mm512_setzero_ps());
__m512 fft427 = _mm512_add_ps(wt12, _mm512_setzero_ps());
__m512 fft340 = _mm512_sub_ps(wt11, _mm512_setzero_ps());
__m512 fft428 = _mm512_sub_ps(wt12, _mm512_setzero_ps());
__m512 fft341 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft429 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft342 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft430 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft343 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft431 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft344 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft432 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft345 = _mm512_add_ps(fft337, fft341);
__m512 fft433 = _mm512_add_ps(fft425, fft429);
__m512 fft346 = _mm512_sub_ps(fft337, fft341);
__m512 fft434 = _mm512_sub_ps(fft425, fft429);
__m512 fft347 = _mm512_add_ps(fft339, fft343);
__m512 fft435 = _mm512_add_ps(fft427, fft431);
__m512 fft348 = _mm512_sub_ps(fft343, fft339);
__m512 fft436 = _mm512_sub_ps(fft431, fft427);
__m512 fft349 = _mm512_sub_ps(fft340, fft344);
__m512 fft437 = _mm512_sub_ps(fft428, fft432);
__m512 fft350 = _mm512_add_ps(fft340, fft344);
__m512 fft438 = _mm512_add_ps(fft428, fft432);
__m512 fft351 = _mm512_add_ps(fft345, fft347);
__m512 fft439 = _mm512_add_ps(fft433, fft435);
__m512 fft352 = _mm512_sub_ps(fft345, fft347);
__m512 fft440 = _mm512_sub_ps(fft433, fft435);
__m512 fft353 = _mm512_fmadd_ps(fft349, _mm512_set1_ps(7.0710677e-01f), fft338);
__m512 fft441 = _mm512_fmadd_ps(fft437, _mm512_set1_ps(7.0710677e-01f), fft426);
__m512 fft354 = _mm512_fnmsub_ps(fft350, _mm512_set1_ps(7.0710677e-01f), fft342);
__m512 fft442 = _mm512_fnmsub_ps(fft438, _mm512_set1_ps(7.0710677e-01f), fft430);
__m512 fft355 = _mm512_fnmadd_ps(fft349, _mm512_set1_ps(7.0710677e-01f), fft338);
__m512 fft443 = _mm512_fnmadd_ps(fft437, _mm512_set1_ps(7.0710677e-01f), fft426);
__m512 fft356 = _mm512_fnmadd_ps(fft350, _mm512_set1_ps(7.0710677e-01f), fft342);
__m512 fft444 = _mm512_fnmadd_ps(fft438, _mm512_set1_ps(7.0710677e-01f), fft430);
__m512 fft357 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft358 = _mm512_fmadd_ps(fft351, fft357, _mm512_shuffle_f32x4(fft351, fft351, 78));
__m512 fft445 = _mm512_fmadd_ps(fft439, fft357, _mm512_shuffle_f32x4(fft439, fft439, 78));
__m512 fft359 = _mm512_fmadd_ps(fft352, fft357, _mm512_shuffle_f32x4(fft352, fft352, 78));
__m512 fft446 = _mm512_fmadd_ps(fft440, fft357, _mm512_shuffle_f32x4(fft440, fft440, 78));
__m512 fft360 = _mm512_fmadd_ps(fft353, fft357, _mm512_shuffle_f32x4(fft353, fft353, 78));
__m512 fft447 = _mm512_fmadd_ps(fft441, fft357, _mm512_shuffle_f32x4(fft441, fft441, 78));
__m512 fft361 = _mm512_fmadd_ps(fft354, fft357, _mm512_shuffle_f32x4(fft354, fft354, 78));
__m512 fft448 = _mm512_fmadd_ps(fft442, fft357, _mm512_shuffle_f32x4(fft442, fft442, 78));
__m512 fft362 = _mm512_fmadd_ps(fft346, fft357, _mm512_shuffle_f32x4(fft346, fft346, 78));
__m512 fft449 = _mm512_fmadd_ps(fft434, fft357, _mm512_shuffle_f32x4(fft434, fft434, 78));
__m512 fft363 = _mm512_fmadd_ps(fft348, fft357, _mm512_shuffle_f32x4(fft348, fft348, 78));
__m512 fft450 = _mm512_fmadd_ps(fft436, fft357, _mm512_shuffle_f32x4(fft436, fft436, 78));
__m512 fft364 = _mm512_fmadd_ps(fft355, fft357, _mm512_shuffle_f32x4(fft355, fft355, 78));
__m512 fft451 = _mm512_fmadd_ps(fft443, fft357, _mm512_shuffle_f32x4(fft443, fft443, 78));
__m512 fft365 = _mm512_fmadd_ps(fft356, fft357, _mm512_shuffle_f32x4(fft356, fft356, 78));
__m512 fft452 = _mm512_fmadd_ps(fft444, fft357, _mm512_shuffle_f32x4(fft444, fft444, 78));
__m512 fft366 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft367 = _mm512_mul_ps(fft358, fft366);
__m512 fft453 = _mm512_mul_ps(fft445, fft366);
__m512 fft368 = _mm512_mul_ps(fft359, fft366);
__m512 fft454 = _mm512_mul_ps(fft446, fft366);
__m512 fft369 = _mm512_mul_ps(fft360, fft366);
__m512 fft455 = _mm512_mul_ps(fft447, fft366);
__m512 fft370 = _mm512_mul_ps(fft361, fft366);
__m512 fft456 = _mm512_mul_ps(fft448, fft366);
__m512 fft371 = _mm512_mul_ps(fft362, fft366);
__m512 fft457 = _mm512_mul_ps(fft449, fft366);
__m512 fft372 = _mm512_mul_ps(fft363, fft366);
__m512 fft458 = _mm512_mul_ps(fft450, fft366);
__m512 fft373 = _mm512_mul_ps(fft364, fft366);
__m512 fft459 = _mm512_mul_ps(fft451, fft366);
__m512 fft374 = _mm512_mul_ps(fft365, fft366);
__m512 fft460 = _mm512_mul_ps(fft452, fft366);
__m512 fft375 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft376 = _mm512_fmadd_ps(fft359, fft375, fft367);
__m512 fft461 = _mm512_fmadd_ps(fft446, fft375, fft453);
__m512 fft377 = _mm512_fnmadd_ps(fft358, fft375, fft368);
__m512 fft462 = _mm512_fnmadd_ps(fft445, fft375, fft454);
__m512 fft378 = _mm512_fmadd_ps(fft361, fft375, fft369);
__m512 fft463 = _mm512_fmadd_ps(fft448, fft375, fft455);
__m512 fft379 = _mm512_fnmadd_ps(fft360, fft375, fft370);
__m512 fft464 = _mm512_fnmadd_ps(fft447, fft375, fft456);
__m512 fft380 = _mm512_fmadd_ps(fft363, fft375, fft371);
__m512 fft465 = _mm512_fmadd_ps(fft450, fft375, fft457);
__m512 fft381 = _mm512_fnmadd_ps(fft362, fft375, fft372);
__m512 fft466 = _mm512_fnmadd_ps(fft449, fft375, fft458);
__m512 fft382 = _mm512_fmadd_ps(fft365, fft375, fft373);
__m512 fft467 = _mm512_fmadd_ps(fft452, fft375, fft459);
__m512 fft383 = _mm512_fnmadd_ps(fft364, fft375, fft374);
__m512 fft468 = _mm512_fnmadd_ps(fft451, fft375, fft460);
__m512 fft384 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft385 = _mm512_fmadd_ps(fft376, fft384, _mm512_shuffle_f32x4(fft376, fft376, 177));
__m512 fft469 = _mm512_fmadd_ps(fft461, fft384, _mm512_shuffle_f32x4(fft461, fft461, 177));
__m512 fft386 = _mm512_fmadd_ps(fft377, fft384, _mm512_shuffle_f32x4(fft377, fft377, 177));
__m512 fft470 = _mm512_fmadd_ps(fft462, fft384, _mm512_shuffle_f32x4(fft462, fft462, 177));
__m512 fft387 = _mm512_fmadd_ps(fft378, fft384, _mm512_shuffle_f32x4(fft378, fft378, 177));
__m512 fft471 = _mm512_fmadd_ps(fft463, fft384, _mm512_shuffle_f32x4(fft463, fft463, 177));
__m512 fft388 = _mm512_fmadd_ps(fft379, fft384, _mm512_shuffle_f32x4(fft379, fft379, 177));
__m512 fft472 = _mm512_fmadd_ps(fft464, fft384, _mm512_shuffle_f32x4(fft464, fft464, 177));
__m512 fft389 = _mm512_fmadd_ps(fft380, fft384, _mm512_shuffle_f32x4(fft380, fft380, 177));
__m512 fft473 = _mm512_fmadd_ps(fft465, fft384, _mm512_shuffle_f32x4(fft465, fft465, 177));
__m512 fft390 = _mm512_fmadd_ps(fft381, fft384, _mm512_shuffle_f32x4(fft381, fft381, 177));
__m512 fft474 = _mm512_fmadd_ps(fft466, fft384, _mm512_shuffle_f32x4(fft466, fft466, 177));
__m512 fft391 = _mm512_fmadd_ps(fft382, fft384, _mm512_shuffle_f32x4(fft382, fft382, 177));
__m512 fft475 = _mm512_fmadd_ps(fft467, fft384, _mm512_shuffle_f32x4(fft467, fft467, 177));
__m512 fft392 = _mm512_fmadd_ps(fft383, fft384, _mm512_shuffle_f32x4(fft383, fft383, 177));
__m512 fft476 = _mm512_fmadd_ps(fft468, fft384, _mm512_shuffle_f32x4(fft468, fft468, 177));
__m512 fft393 = _mm512_mask_mov_ps(fft385, 49344, fft386);
__m512 fft477 = _mm512_mask_mov_ps(fft469, 49344, fft470);
__m512 fft394 = _mm512_mask_sub_ps(fft386, 49344, _mm512_setzero_ps(), fft385);
__m512 fft478 = _mm512_mask_sub_ps(fft470, 49344, _mm512_setzero_ps(), fft469);
__m512 fft395 = _mm512_mask_mov_ps(fft387, 49344, fft388);
__m512 fft479 = _mm512_mask_mov_ps(fft471, 49344, fft472);
__m512 fft396 = _mm512_mask_sub_ps(fft388, 49344, _mm512_setzero_ps(), fft387);
__m512 fft480 = _mm512_mask_sub_ps(fft472, 49344, _mm512_setzero_ps(), fft471);
__m512 fft397 = _mm512_mask_mov_ps(fft389, 49344, fft390);
__m512 fft481 = _mm512_mask_mov_ps(fft473, 49344, fft474);
__m512 fft398 = _mm512_mask_sub_ps(fft390, 49344, _mm512_setzero_ps(), fft389);
__m512 fft482 = _mm512_mask_sub_ps(fft474, 49344, _mm512_setzero_ps(), fft473);
__m512 fft399 = _mm512_mask_mov_ps(fft391, 49344, fft392);
__m512 fft483 = _mm512_mask_mov_ps(fft475, 49344, fft476);
__m512 fft400 = _mm512_mask_sub_ps(fft392, 49344, _mm512_setzero_ps(), fft391);
__m512 fft484 = _mm512_mask_sub_ps(fft476, 49344, _mm512_setzero_ps(), fft475);
__m512 fft401 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft402 = _mm512_fmadd_ps(fft393, fft401, _mm512_shuffle_ps(fft393, fft393, 78));
__m512 fft485 = _mm512_fmadd_ps(fft477, fft401, _mm512_shuffle_ps(fft477, fft477, 78));
__m512 fft403 = _mm512_fmadd_ps(fft394, fft401, _mm512_shuffle_ps(fft394, fft394, 78));
__m512 fft486 = _mm512_fmadd_ps(fft478, fft401, _mm512_shuffle_ps(fft478, fft478, 78));
__m512 fft404 = _mm512_fmadd_ps(fft395, fft401, _mm512_shuffle_ps(fft395, fft395, 78));
__m512 fft487 = _mm512_fmadd_ps(fft479, fft401, _mm512_shuffle_ps(fft479, fft479, 78));
__m512 fft405 = _mm512_fmadd_ps(fft396, fft401, _mm512_shuffle_ps(fft396, fft396, 78));
__m512 fft488 = _mm512_fmadd_ps(fft480, fft401, _mm512_shuffle_ps(fft480, fft480, 78));
__m512 fft406 = _mm512_fmadd_ps(fft397, fft401, _mm512_shuffle_ps(fft397, fft397, 78));
__m512 fft489 = _mm512_fmadd_ps(fft481, fft401, _mm512_shuffle_ps(fft481, fft481, 78));
__m512 fft407 = _mm512_fmadd_ps(fft398, fft401, _mm512_shuffle_ps(fft398, fft398, 78));
__m512 fft490 = _mm512_fmadd_ps(fft482, fft401, _mm512_shuffle_ps(fft482, fft482, 78));
__m512 fft408 = _mm512_fmadd_ps(fft399, fft401, _mm512_shuffle_ps(fft399, fft399, 78));
__m512 fft491 = _mm512_fmadd_ps(fft483, fft401, _mm512_shuffle_ps(fft483, fft483, 78));
__m512 fft409 = _mm512_fmadd_ps(fft400, fft401, _mm512_shuffle_ps(fft400, fft400, 78));
__m512 fft492 = _mm512_fmadd_ps(fft484, fft401, _mm512_shuffle_ps(fft484, fft484, 78));
__m512i fft410 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft411 = _mm512_permutexvar_ps(fft410, fft402);
__m512 fft493 = _mm512_permutexvar_ps(fft410, fft485);
__m512i fft412 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft413 = _mm512_permutexvar_ps(fft412, fft402);
__m512 fft494 = _mm512_permutexvar_ps(fft412, fft485);
__m512 fft414 = _mm512_permutexvar_ps(fft410, fft403);
__m512 fft495 = _mm512_permutexvar_ps(fft410, fft486);
__m512 fft415 = _mm512_permutexvar_ps(fft412, fft403);
__m512 fft496 = _mm512_permutexvar_ps(fft412, fft486);
__m512 fft416 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft417 = _mm512_fmadd_ps(fft411, fft416, fft413);
__m512 fft497 = _mm512_fmadd_ps(fft493, fft416, fft494);
__m512 fft418 = _mm512_fnmadd_ps(fft415, fft416, fft414);
__m512 fft498 = _mm512_fnmadd_ps(fft496, fft416, fft495);
__m512 fft419 = _mm512_mask_mov_ps(fft415, 21845, fft417);
__m512 fft499 = _mm512_mask_mov_ps(fft496, 21845, fft497);
__m512 fft420 = _mm512_mask_mov_ps(fft411, 43176, fft417);
__m512 fft500 = _mm512_mask_mov_ps(fft493, 43176, fft497);
__m512 fft421 = _mm512_mask_mov_ps(fft419, 43176, fft418);
__m512 fft501 = _mm512_mask_mov_ps(fft499, 43176, fft498);
__m512 fft422 = _mm512_mask_mov_ps(fft420, 22102, fft418);
__m512 fft502 = _mm512_mask_mov_ps(fft500, 22102, fft498);
__m512 fft423 = _mm512_mask_mul_ps(fft421, 64764, fft421, _mm512_set1_ps(5e-01f));
__m512 fft503 = _mm512_mask_mul_ps(fft501, 64764, fft501, _mm512_set1_ps(5e-01f));
__m512 fft424 = _mm512_mask_mul_ps(fft422, 64764, fft422, _mm512_set1_ps(5e-01f));
__m512 fft504 = _mm512_mask_mul_ps(fft502, 64764, fft502, _mm512_set1_ps(5e-01f));
__m512 wf33 = fft423;
__m512 wf41 = fft503;
__m512 wf34 = fft424;
__m512 wf42 = fft504;
__m512 wf35 = fft404;
__m512 wf43 = fft487;
__m512 wf36 = fft405;
__m512 wf44 = fft488;
__m512 wf37 = fft406;
__m512 wf45 = fft489;
__m512 wf38 = fft407;
__m512 wf46 = fft490;
__m512 wf39 = fft408;
__m512 wf47 = fft491;
__m512 wf40 = fft409;
__m512 wf48 = fft492;
ptrdiff_t c3 = (size_t)(0+2*j1)/4;
ptrdiff_t m3 = (size_t)(0+2*j1)%4/2;
ptrdiff_t f4 = (size_t)(0+2*j1)%2;
__m512i eo3 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf35 = _mm512_permutexvar_ps(eo3, wf35);
wf36 = _mm512_permutexvar_ps(eo3, wf36);
__m512i wfs17 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf35, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs17 = _mm512_inserti64x4(wfs17, _mm512_cvtps_ph(wf36, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep1 = _mm512_shuffle_i32x4(wfs17, wfs17, 160);
_mm512_mask_storeu_epi32(wfPtr1+158592+634368*i5+15104*c3+64*k2+64*m3+16*f4, 65535, rep1);
__m512i rep2 = _mm512_shuffle_i32x4(wfs17, wfs17, 245);
_mm512_mask_storeu_epi32(wfPtr1+4599168+634368*i5+15104*c3+64*k2+64*m3+16*f4, 65535, rep2);
wf43 = _mm512_permutexvar_ps(eo3, wf43);
wf44 = _mm512_permutexvar_ps(eo3, wf44);
__m512i wfs18 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf43, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs18 = _mm512_inserti64x4(wfs18, _mm512_cvtps_ph(wf44, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep3 = _mm512_shuffle_i32x4(wfs18, wfs18, 160);
_mm512_mask_storeu_epi32(wfPtr1+9039744+634368*i5+15104*c3+64*k2+64*m3+16*f4, 65535, rep3);
__m512i rep4 = _mm512_shuffle_i32x4(wfs18, wfs18, 245);
_mm512_mask_storeu_epi32(wfPtr1+13480320+634368*i5+15104*c3+64*k2+64*m3+16*f4, 65535, rep4);
wf37 = _mm512_permutexvar_ps(eo3, wf37);
wf38 = _mm512_permutexvar_ps(eo3, wf38);
__m512i wfs19 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf37, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs19 = _mm512_inserti64x4(wfs19, _mm512_cvtps_ph(wf38, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep5 = _mm512_shuffle_i32x4(wfs19, wfs19, 160);
_mm512_mask_storeu_epi32(wfPtr1+317184+634368*i5+15104*c3+64*k2+64*m3+16*f4, 65535, rep5);
__m512i rep6 = _mm512_shuffle_i32x4(wfs19, wfs19, 245);
_mm512_mask_storeu_epi32(wfPtr1+4757760+634368*i5+15104*c3+64*k2+64*m3+16*f4, 65535, rep6);
wf45 = _mm512_permutexvar_ps(eo3, wf45);
wf46 = _mm512_permutexvar_ps(eo3, wf46);
__m512i wfs20 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf45, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs20 = _mm512_inserti64x4(wfs20, _mm512_cvtps_ph(wf46, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep7 = _mm512_shuffle_i32x4(wfs20, wfs20, 160);
_mm512_mask_storeu_epi32(wfPtr1+9198336+634368*i5+15104*c3+64*k2+64*m3+16*f4, 65535, rep7);
__m512i rep8 = _mm512_shuffle_i32x4(wfs20, wfs20, 245);
_mm512_mask_storeu_epi32(wfPtr1+13638912+634368*i5+15104*c3+64*k2+64*m3+16*f4, 65535, rep8);
wf39 = _mm512_permutexvar_ps(eo3, wf39);
wf40 = _mm512_permutexvar_ps(eo3, wf40);
__m512i wfs21 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf39, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs21 = _mm512_inserti64x4(wfs21, _mm512_cvtps_ph(wf40, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep9 = _mm512_shuffle_i32x4(wfs21, wfs21, 160);
_mm512_mask_storeu_epi32(wfPtr1+475776+634368*i5+15104*c3+64*k2+64*m3+16*f4, 65535, rep9);
__m512i rep10 = _mm512_shuffle_i32x4(wfs21, wfs21, 245);
_mm512_mask_storeu_epi32(wfPtr1+4916352+634368*i5+15104*c3+64*k2+64*m3+16*f4, 65535, rep10);
wf47 = _mm512_permutexvar_ps(eo3, wf47);
wf48 = _mm512_permutexvar_ps(eo3, wf48);
__m512i wfs22 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf47, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs22 = _mm512_inserti64x4(wfs22, _mm512_cvtps_ph(wf48, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep11 = _mm512_shuffle_i32x4(wfs22, wfs22, 160);
_mm512_mask_storeu_epi32(wfPtr1+9356928+634368*i5+15104*c3+64*k2+64*m3+16*f4, 65535, rep11);
__m512i rep12 = _mm512_shuffle_i32x4(wfs22, wfs22, 245);
_mm512_mask_storeu_epi32(wfPtr1+13797504+634368*i5+15104*c3+64*k2+64*m3+16*f4, 65535, rep12);
__m512i wfs23 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf33, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs23 = _mm512_inserti64x4(wfs23, _mm512_cvtps_ph(wf34, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep13 = _mm512_shuffle_i32x4(wfs23, wfs23, 160);
_mm512_mask_storeu_epi32(wfPtr1+0+634368*i5+15104*c3+64*k2+64*m3+16*f4, 65535, rep13);
__m512i rep14 = _mm512_shuffle_i32x4(wfs23, wfs23, 245);
_mm512_mask_storeu_epi32(wfPtr1+4440576+634368*i5+15104*c3+64*k2+64*m3+16*f4, 65535, rep14);
__m512i wfs24 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf41, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs24 = _mm512_inserti64x4(wfs24, _mm512_cvtps_ph(wf42, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep15 = _mm512_shuffle_i32x4(wfs24, wfs24, 160);
_mm512_mask_storeu_epi32(wfPtr1+8881152+634368*i5+15104*c3+64*k2+64*m3+16*f4, 65535, rep15);
__m512i rep16 = _mm512_shuffle_i32x4(wfs24, wfs24, 245);
_mm512_mask_storeu_epi32(wfPtr1+13321728+634368*i5+15104*c3+64*k2+64*m3+16*f4, 65535, rep16);
}
__m512 bias2 = _mm512_setzero_ps();
if (!e1) {
bias2 = _mm512_maskz_loadu_ps(1, biasPtr1-0+164*i5+8*j1);
bias2 = _mm512_mul_ps(bias2, _mm512_set1_ps(6.4e+01f));
}
_mm512_mask_storeu_ps(bfPtr1-0+168*i5+8*j1, 1, bias2);
if (j1 >= jj1) return;
j1 = 21;
}
}

// Entry point for the filter-arrangement pass: package the tensor-pointer
// array into a threader task and fan it out across the team.
//
// team13   - worker team that executes the task.
// tensors1 - tensor pointer array consumed by the callee (filters in,
//            arranged filter buffer out).
//
// The task hull is 21 x 7 x 1: 21 output-channel slices per group and
// 7 filter groups (per the Conv Groups=7 config); the callee receives
// one (j, g) coordinate pair per invocation.
static void Example7StriderArrangeFilts1(Example7ThreaderTeam1* team13, char** tensors1) {
Example7ThreaderTask1 arrangeTask = {
.callee1 = Example7StriderArrangeFilts1Callee1,
.any1 = tensors1,
.nd1 = 3,
.hull1 = {21, 7, 1},
};
Example7ThreaderDo1(team13, &arrangeTask);
}

static void Example7StriderArrangeDats1Callee1(Example7ThreaderTask1* task6, int64_t* pt8) {
char** tensors4 = task6->any1;
ptrdiff_t s1 = 0;
ptrdiff_t c4 = 0;
ptrdiff_t g3 = pt8[2];
ptrdiff_t e2 = 0;
char*restrict datPtr1 = tensors4[0]-4+777336*e2;
char*restrict dfPtr1 = tensors4[1]+9476096*e2;
ptrdiff_t i6 = 1*g3;
ptrdiff_t j2 = 1*c4;
ptrdiff_t rel1 = j2-0;
ptrdiff_t base1 = 0;
ptrdiff_t h1 = base1+0;
ptrdiff_t w1 = 0;
ptrdiff_t k3 = 118*s1;
ptrdiff_t kk1 = k3+117;
for (; k3 <= kk1; ++k3) {
ptrdiff_t b3 = 0;
ptrdiff_t m4 = (size_t)b3/2;
ptrdiff_t f5 = (size_t)b3%2;
__m512 dat1 = _mm512_maskz_loadu_ps(65534, datPtr1+0+138768*i6+1176*k3+84*h1+4*w1+0*b3);
__m512 dat2 = _mm512_maskz_loadu_ps(65534, datPtr1+84+138768*i6+1176*k3+84*h1+4*w1+0*b3);
__m512 dat3 = _mm512_maskz_loadu_ps(65534, datPtr1+168+138768*i6+1176*k3+84*h1+4*w1+0*b3);
__m512 dat4 = _mm512_maskz_loadu_ps(65534, datPtr1+252+138768*i6+1176*k3+84*h1+4*w1+0*b3);
__m512 dat5 = _mm512_maskz_loadu_ps(65534, datPtr1+336+138768*i6+1176*k3+84*h1+4*w1+0*b3);
__m512 dat6 = _mm512_maskz_loadu_ps(65534, datPtr1+420+138768*i6+1176*k3+84*h1+4*w1+0*b3);
__m512 dat7 = _mm512_maskz_loadu_ps(65534, datPtr1+504+138768*i6+1176*k3+84*h1+4*w1+0*b3);
__m512 dat8 = _mm512_maskz_loadu_ps(65534, datPtr1+588+138768*i6+1176*k3+84*h1+4*w1+0*b3);
__m512 dat9 = _mm512_maskz_loadu_ps(65534, datPtr1+672+138768*i6+1176*k3+84*h1+4*w1+0*b3);
__m512 dat10 = _mm512_maskz_loadu_ps(65534, datPtr1+756+138768*i6+1176*k3+84*h1+4*w1+0*b3);
__m512 dat11 = _mm512_maskz_loadu_ps(65534, datPtr1+840+138768*i6+1176*k3+84*h1+4*w1+0*b3);
__m512 dat12 = _mm512_maskz_loadu_ps(65534, datPtr1+924+138768*i6+1176*k3+84*h1+4*w1+0*b3);
__m512 dat13 = _mm512_maskz_loadu_ps(65534, datPtr1+1008+138768*i6+1176*k3+84*h1+4*w1+0*b3);
__m512 dat14 = _mm512_maskz_loadu_ps(65534, datPtr1+1092+138768*i6+1176*k3+84*h1+4*w1+0*b3);
__m512 fft505 = _mm512_add_ps(dat1, dat9);
__m512 fft593 = _mm512_add_ps(dat2, dat10);
__m512 fft506 = _mm512_sub_ps(dat1, dat9);
__m512 fft594 = _mm512_sub_ps(dat2, dat10);
__m512 fft507 = _mm512_add_ps(dat3, dat11);
__m512 fft595 = _mm512_add_ps(dat4, dat12);
__m512 fft508 = _mm512_sub_ps(dat3, dat11);
__m512 fft596 = _mm512_sub_ps(dat4, dat12);
__m512 fft509 = _mm512_add_ps(dat5, dat13);
__m512 fft597 = _mm512_add_ps(dat6, dat14);
__m512 fft510 = _mm512_sub_ps(dat5, dat13);
__m512 fft598 = _mm512_sub_ps(dat6, dat14);
__m512 fft511 = _mm512_add_ps(dat7, _mm512_setzero_ps());
__m512 fft599 = _mm512_add_ps(dat8, _mm512_setzero_ps());
__m512 fft512 = _mm512_sub_ps(dat7, _mm512_setzero_ps());
__m512 fft600 = _mm512_sub_ps(dat8, _mm512_setzero_ps());
__m512 fft513 = _mm512_add_ps(fft505, fft509);
__m512 fft601 = _mm512_add_ps(fft593, fft597);
__m512 fft514 = _mm512_sub_ps(fft505, fft509);
__m512 fft602 = _mm512_sub_ps(fft593, fft597);
__m512 fft515 = _mm512_add_ps(fft507, fft511);
__m512 fft603 = _mm512_add_ps(fft595, fft599);
__m512 fft516 = _mm512_sub_ps(fft511, fft507);
__m512 fft604 = _mm512_sub_ps(fft599, fft595);
__m512 fft517 = _mm512_sub_ps(fft508, fft512);
__m512 fft605 = _mm512_sub_ps(fft596, fft600);
__m512 fft518 = _mm512_add_ps(fft508, fft512);
__m512 fft606 = _mm512_add_ps(fft596, fft600);
__m512 fft519 = _mm512_add_ps(fft513, fft515);
__m512 fft607 = _mm512_add_ps(fft601, fft603);
__m512 fft520 = _mm512_sub_ps(fft513, fft515);
__m512 fft608 = _mm512_sub_ps(fft601, fft603);
__m512 fft521 = _mm512_fmadd_ps(fft517, _mm512_set1_ps(7.0710677e-01f), fft506);
__m512 fft609 = _mm512_fmadd_ps(fft605, _mm512_set1_ps(7.0710677e-01f), fft594);
__m512 fft522 = _mm512_fnmsub_ps(fft518, _mm512_set1_ps(7.0710677e-01f), fft510);
__m512 fft610 = _mm512_fnmsub_ps(fft606, _mm512_set1_ps(7.0710677e-01f), fft598);
__m512 fft523 = _mm512_fnmadd_ps(fft517, _mm512_set1_ps(7.0710677e-01f), fft506);
__m512 fft611 = _mm512_fnmadd_ps(fft605, _mm512_set1_ps(7.0710677e-01f), fft594);
__m512 fft524 = _mm512_fnmadd_ps(fft518, _mm512_set1_ps(7.0710677e-01f), fft510);
__m512 fft612 = _mm512_fnmadd_ps(fft606, _mm512_set1_ps(7.0710677e-01f), fft598);
__m512 fft525 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft526 = _mm512_fmadd_ps(fft519, fft525, _mm512_shuffle_f32x4(fft519, fft519, 78));
__m512 fft613 = _mm512_fmadd_ps(fft607, fft525, _mm512_shuffle_f32x4(fft607, fft607, 78));
__m512 fft527 = _mm512_fmadd_ps(fft520, fft525, _mm512_shuffle_f32x4(fft520, fft520, 78));
__m512 fft614 = _mm512_fmadd_ps(fft608, fft525, _mm512_shuffle_f32x4(fft608, fft608, 78));
__m512 fft528 = _mm512_fmadd_ps(fft521, fft525, _mm512_shuffle_f32x4(fft521, fft521, 78));
__m512 fft615 = _mm512_fmadd_ps(fft609, fft525, _mm512_shuffle_f32x4(fft609, fft609, 78));
__m512 fft529 = _mm512_fmadd_ps(fft522, fft525, _mm512_shuffle_f32x4(fft522, fft522, 78));
__m512 fft616 = _mm512_fmadd_ps(fft610, fft525, _mm512_shuffle_f32x4(fft610, fft610, 78));
__m512 fft530 = _mm512_fmadd_ps(fft514, fft525, _mm512_shuffle_f32x4(fft514, fft514, 78));
__m512 fft617 = _mm512_fmadd_ps(fft602, fft525, _mm512_shuffle_f32x4(fft602, fft602, 78));
__m512 fft531 = _mm512_fmadd_ps(fft516, fft525, _mm512_shuffle_f32x4(fft516, fft516, 78));
__m512 fft618 = _mm512_fmadd_ps(fft604, fft525, _mm512_shuffle_f32x4(fft604, fft604, 78));
__m512 fft532 = _mm512_fmadd_ps(fft523, fft525, _mm512_shuffle_f32x4(fft523, fft523, 78));
__m512 fft619 = _mm512_fmadd_ps(fft611, fft525, _mm512_shuffle_f32x4(fft611, fft611, 78));
__m512 fft533 = _mm512_fmadd_ps(fft524, fft525, _mm512_shuffle_f32x4(fft524, fft524, 78));
__m512 fft620 = _mm512_fmadd_ps(fft612, fft525, _mm512_shuffle_f32x4(fft612, fft612, 78));
__m512 fft534 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft535 = _mm512_mul_ps(fft526, fft534);
__m512 fft621 = _mm512_mul_ps(fft613, fft534);
__m512 fft536 = _mm512_mul_ps(fft527, fft534);
__m512 fft622 = _mm512_mul_ps(fft614, fft534);
__m512 fft537 = _mm512_mul_ps(fft528, fft534);
__m512 fft623 = _mm512_mul_ps(fft615, fft534);
__m512 fft538 = _mm512_mul_ps(fft529, fft534);
__m512 fft624 = _mm512_mul_ps(fft616, fft534);
__m512 fft539 = _mm512_mul_ps(fft530, fft534);
__m512 fft625 = _mm512_mul_ps(fft617, fft534);
__m512 fft540 = _mm512_mul_ps(fft531, fft534);
__m512 fft626 = _mm512_mul_ps(fft618, fft534);
__m512 fft541 = _mm512_mul_ps(fft532, fft534);
__m512 fft627 = _mm512_mul_ps(fft619, fft534);
__m512 fft542 = _mm512_mul_ps(fft533, fft534);
__m512 fft628 = _mm512_mul_ps(fft620, fft534);
__m512 fft543 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft544 = _mm512_fmadd_ps(fft527, fft543, fft535);
__m512 fft629 = _mm512_fmadd_ps(fft614, fft543, fft621);
__m512 fft545 = _mm512_fnmadd_ps(fft526, fft543, fft536);
__m512 fft630 = _mm512_fnmadd_ps(fft613, fft543, fft622);
__m512 fft546 = _mm512_fmadd_ps(fft529, fft543, fft537);
__m512 fft631 = _mm512_fmadd_ps(fft616, fft543, fft623);
__m512 fft547 = _mm512_fnmadd_ps(fft528, fft543, fft538);
__m512 fft632 = _mm512_fnmadd_ps(fft615, fft543, fft624);
__m512 fft548 = _mm512_fmadd_ps(fft531, fft543, fft539);
__m512 fft633 = _mm512_fmadd_ps(fft618, fft543, fft625);
__m512 fft549 = _mm512_fnmadd_ps(fft530, fft543, fft540);
__m512 fft634 = _mm512_fnmadd_ps(fft617, fft543, fft626);
__m512 fft550 = _mm512_fmadd_ps(fft533, fft543, fft541);
__m512 fft635 = _mm512_fmadd_ps(fft620, fft543, fft627);
__m512 fft551 = _mm512_fnmadd_ps(fft532, fft543, fft542);
__m512 fft636 = _mm512_fnmadd_ps(fft619, fft543, fft628);
__m512 fft552 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft553 = _mm512_fmadd_ps(fft544, fft552, _mm512_shuffle_f32x4(fft544, fft544, 177));
__m512 fft637 = _mm512_fmadd_ps(fft629, fft552, _mm512_shuffle_f32x4(fft629, fft629, 177));
__m512 fft554 = _mm512_fmadd_ps(fft545, fft552, _mm512_shuffle_f32x4(fft545, fft545, 177));
__m512 fft638 = _mm512_fmadd_ps(fft630, fft552, _mm512_shuffle_f32x4(fft630, fft630, 177));
__m512 fft555 = _mm512_fmadd_ps(fft546, fft552, _mm512_shuffle_f32x4(fft546, fft546, 177));
__m512 fft639 = _mm512_fmadd_ps(fft631, fft552, _mm512_shuffle_f32x4(fft631, fft631, 177));
__m512 fft556 = _mm512_fmadd_ps(fft547, fft552, _mm512_shuffle_f32x4(fft547, fft547, 177));
__m512 fft640 = _mm512_fmadd_ps(fft632, fft552, _mm512_shuffle_f32x4(fft632, fft632, 177));
__m512 fft557 = _mm512_fmadd_ps(fft548, fft552, _mm512_shuffle_f32x4(fft548, fft548, 177));
__m512 fft641 = _mm512_fmadd_ps(fft633, fft552, _mm512_shuffle_f32x4(fft633, fft633, 177));
__m512 fft558 = _mm512_fmadd_ps(fft549, fft552, _mm512_shuffle_f32x4(fft549, fft549, 177));
__m512 fft642 = _mm512_fmadd_ps(fft634, fft552, _mm512_shuffle_f32x4(fft634, fft634, 177));
__m512 fft559 = _mm512_fmadd_ps(fft550, fft552, _mm512_shuffle_f32x4(fft550, fft550, 177));
__m512 fft643 = _mm512_fmadd_ps(fft635, fft552, _mm512_shuffle_f32x4(fft635, fft635, 177));
__m512 fft560 = _mm512_fmadd_ps(fft551, fft552, _mm512_shuffle_f32x4(fft551, fft551, 177));
__m512 fft644 = _mm512_fmadd_ps(fft636, fft552, _mm512_shuffle_f32x4(fft636, fft636, 177));
__m512 fft561 = _mm512_mask_mov_ps(fft553, 49344, fft554);
__m512 fft645 = _mm512_mask_mov_ps(fft637, 49344, fft638);
__m512 fft562 = _mm512_mask_sub_ps(fft554, 49344, _mm512_setzero_ps(), fft553);
__m512 fft646 = _mm512_mask_sub_ps(fft638, 49344, _mm512_setzero_ps(), fft637);
__m512 fft563 = _mm512_mask_mov_ps(fft555, 49344, fft556);
__m512 fft647 = _mm512_mask_mov_ps(fft639, 49344, fft640);
__m512 fft564 = _mm512_mask_sub_ps(fft556, 49344, _mm512_setzero_ps(), fft555);
__m512 fft648 = _mm512_mask_sub_ps(fft640, 49344, _mm512_setzero_ps(), fft639);
__m512 fft565 = _mm512_mask_mov_ps(fft557, 49344, fft558);
__m512 fft649 = _mm512_mask_mov_ps(fft641, 49344, fft642);
__m512 fft566 = _mm512_mask_sub_ps(fft558, 49344, _mm512_setzero_ps(), fft557);
__m512 fft650 = _mm512_mask_sub_ps(fft642, 49344, _mm512_setzero_ps(), fft641);
__m512 fft567 = _mm512_mask_mov_ps(fft559, 49344, fft560);
__m512 fft651 = _mm512_mask_mov_ps(fft643, 49344, fft644);
__m512 fft568 = _mm512_mask_sub_ps(fft560, 49344, _mm512_setzero_ps(), fft559);
__m512 fft652 = _mm512_mask_sub_ps(fft644, 49344, _mm512_setzero_ps(), fft643);
__m512 fft569 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft570 = _mm512_fmadd_ps(fft561, fft569, _mm512_shuffle_ps(fft561, fft561, 78));
__m512 fft653 = _mm512_fmadd_ps(fft645, fft569, _mm512_shuffle_ps(fft645, fft645, 78));
__m512 fft571 = _mm512_fmadd_ps(fft562, fft569, _mm512_shuffle_ps(fft562, fft562, 78));
__m512 fft654 = _mm512_fmadd_ps(fft646, fft569, _mm512_shuffle_ps(fft646, fft646, 78));
__m512 fft572 = _mm512_fmadd_ps(fft563, fft569, _mm512_shuffle_ps(fft563, fft563, 78));
__m512 fft655 = _mm512_fmadd_ps(fft647, fft569, _mm512_shuffle_ps(fft647, fft647, 78));
__m512 fft573 = _mm512_fmadd_ps(fft564, fft569, _mm512_shuffle_ps(fft564, fft564, 78));
__m512 fft656 = _mm512_fmadd_ps(fft648, fft569, _mm512_shuffle_ps(fft648, fft648, 78));
__m512 fft574 = _mm512_fmadd_ps(fft565, fft569, _mm512_shuffle_ps(fft565, fft565, 78));
__m512 fft657 = _mm512_fmadd_ps(fft649, fft569, _mm512_shuffle_ps(fft649, fft649, 78));
__m512 fft575 = _mm512_fmadd_ps(fft566, fft569, _mm512_shuffle_ps(fft566, fft566, 78));
__m512 fft658 = _mm512_fmadd_ps(fft650, fft569, _mm512_shuffle_ps(fft650, fft650, 78));
__m512 fft576 = _mm512_fmadd_ps(fft567, fft569, _mm512_shuffle_ps(fft567, fft567, 78));
__m512 fft659 = _mm512_fmadd_ps(fft651, fft569, _mm512_shuffle_ps(fft651, fft651, 78));
__m512 fft577 = _mm512_fmadd_ps(fft568, fft569, _mm512_shuffle_ps(fft568, fft568, 78));
__m512 fft660 = _mm512_fmadd_ps(fft652, fft569, _mm512_shuffle_ps(fft652, fft652, 78));
__m512i fft578 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft579 = _mm512_permutexvar_ps(fft578, fft570);
__m512 fft661 = _mm512_permutexvar_ps(fft578, fft653);
__m512i fft580 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft581 = _mm512_permutexvar_ps(fft580, fft570);
__m512 fft662 = _mm512_permutexvar_ps(fft580, fft653);
__m512 fft582 = _mm512_permutexvar_ps(fft578, fft571);
__m512 fft663 = _mm512_permutexvar_ps(fft578, fft654);
__m512 fft583 = _mm512_permutexvar_ps(fft580, fft571);
__m512 fft664 = _mm512_permutexvar_ps(fft580, fft654);
__m512 fft584 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft585 = _mm512_fmadd_ps(fft579, fft584, fft581);
__m512 fft665 = _mm512_fmadd_ps(fft661, fft584, fft662);
__m512 fft586 = _mm512_fnmadd_ps(fft583, fft584, fft582);
__m512 fft666 = _mm512_fnmadd_ps(fft664, fft584, fft663);
__m512 fft587 = _mm512_mask_mov_ps(fft583, 21845, fft585);
__m512 fft667 = _mm512_mask_mov_ps(fft664, 21845, fft665);
__m512 fft588 = _mm512_mask_mov_ps(fft579, 43176, fft585);
__m512 fft668 = _mm512_mask_mov_ps(fft661, 43176, fft665);
__m512 fft589 = _mm512_mask_mov_ps(fft587, 43176, fft586);
__m512 fft669 = _mm512_mask_mov_ps(fft667, 43176, fft666);
__m512 fft590 = _mm512_mask_mov_ps(fft588, 22102, fft586);
__m512 fft670 = _mm512_mask_mov_ps(fft668, 22102, fft666);
__m512 fft591 = _mm512_mask_mul_ps(fft589, 64764, fft589, _mm512_set1_ps(5e-01f));
__m512 fft671 = _mm512_mask_mul_ps(fft669, 64764, fft669, _mm512_set1_ps(5e-01f));
__m512 fft592 = _mm512_mask_mul_ps(fft590, 64764, fft590, _mm512_set1_ps(5e-01f));
__m512 fft672 = _mm512_mask_mul_ps(fft670, 64764, fft670, _mm512_set1_ps(5e-01f));
__m512 df1 = fft591;
__m512 df9 = fft671;
__m512 df2 = fft592;
__m512 df10 = fft672;
__m512 df3 = fft572;
__m512 df11 = fft655;
__m512 df4 = fft573;
__m512 df12 = fft656;
__m512 df5 = fft574;
__m512 df13 = fft657;
__m512 df6 = fft575;
__m512 df14 = fft658;
__m512 df7 = fft576;
__m512 df15 = fft659;
__m512 df8 = fft577;
__m512 df16 = fft660;
__m512i eo4 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df3 = _mm512_permutexvar_ps(eo4, df3);
df4 = _mm512_permutexvar_ps(eo4, df4);
_mm512_mask_storeu_ps(dfPtr1+15104+60416*i6+45312*j2+128*k3+128*m4+32*f5, 255, df3);
_mm512_mask_storeu_ps(dfPtr1+15168+60416*i6+45312*j2+128*k3+128*m4+32*f5, 255, df4);
_mm512_mask_storeu_ps(dfPtr1+437984+60416*i6+45312*j2+128*k3+128*m4+32*f5, 65280, df3);
_mm512_mask_storeu_ps(dfPtr1+438048+60416*i6+45312*j2+128*k3+128*m4+32*f5, 65280, df4);
df11 = _mm512_permutexvar_ps(eo4, df11);
df12 = _mm512_permutexvar_ps(eo4, df12);
_mm512_mask_storeu_ps(dfPtr1+860928+60416*i6+45312*j2+128*k3+128*m4+32*f5, 255, df11);
_mm512_mask_storeu_ps(dfPtr1+860992+60416*i6+45312*j2+128*k3+128*m4+32*f5, 255, df12);
_mm512_mask_storeu_ps(dfPtr1+1283808+60416*i6+45312*j2+128*k3+128*m4+32*f5, 65280, df11);
_mm512_mask_storeu_ps(dfPtr1+1283872+60416*i6+45312*j2+128*k3+128*m4+32*f5, 65280, df12);
df5 = _mm512_permutexvar_ps(eo4, df5);
df6 = _mm512_permutexvar_ps(eo4, df6);
_mm512_mask_storeu_ps(dfPtr1+30208+60416*i6+45312*j2+128*k3+128*m4+32*f5, 255, df5);
_mm512_mask_storeu_ps(dfPtr1+30272+60416*i6+45312*j2+128*k3+128*m4+32*f5, 255, df6);
_mm512_mask_storeu_ps(dfPtr1+453088+60416*i6+45312*j2+128*k3+128*m4+32*f5, 65280, df5);
_mm512_mask_storeu_ps(dfPtr1+453152+60416*i6+45312*j2+128*k3+128*m4+32*f5, 65280, df6);
df13 = _mm512_permutexvar_ps(eo4, df13);
df14 = _mm512_permutexvar_ps(eo4, df14);
_mm512_mask_storeu_ps(dfPtr1+876032+60416*i6+45312*j2+128*k3+128*m4+32*f5, 255, df13);
_mm512_mask_storeu_ps(dfPtr1+876096+60416*i6+45312*j2+128*k3+128*m4+32*f5, 255, df14);
_mm512_mask_storeu_ps(dfPtr1+1298912+60416*i6+45312*j2+128*k3+128*m4+32*f5, 65280, df13);
_mm512_mask_storeu_ps(dfPtr1+1298976+60416*i6+45312*j2+128*k3+128*m4+32*f5, 65280, df14);
df7 = _mm512_permutexvar_ps(eo4, df7);
df8 = _mm512_permutexvar_ps(eo4, df8);
_mm512_mask_storeu_ps(dfPtr1+45312+60416*i6+45312*j2+128*k3+128*m4+32*f5, 255, df7);
_mm512_mask_storeu_ps(dfPtr1+45376+60416*i6+45312*j2+128*k3+128*m4+32*f5, 255, df8);
_mm512_mask_storeu_ps(dfPtr1+468192+60416*i6+45312*j2+128*k3+128*m4+32*f5, 65280, df7);
_mm512_mask_storeu_ps(dfPtr1+468256+60416*i6+45312*j2+128*k3+128*m4+32*f5, 65280, df8);
df15 = _mm512_permutexvar_ps(eo4, df15);
df16 = _mm512_permutexvar_ps(eo4, df16);
_mm512_mask_storeu_ps(dfPtr1+891136+60416*i6+45312*j2+128*k3+128*m4+32*f5, 255, df15);
_mm512_mask_storeu_ps(dfPtr1+891200+60416*i6+45312*j2+128*k3+128*m4+32*f5, 255, df16);
_mm512_mask_storeu_ps(dfPtr1+1314016+60416*i6+45312*j2+128*k3+128*m4+32*f5, 65280, df15);
_mm512_mask_storeu_ps(dfPtr1+1314080+60416*i6+45312*j2+128*k3+128*m4+32*f5, 65280, df16);
_mm512_mask_storeu_ps(dfPtr1+0+60416*i6+45312*j2+128*k3+128*m4+32*f5, 255, df1);
_mm512_mask_storeu_ps(dfPtr1+64+60416*i6+45312*j2+128*k3+128*m4+32*f5, 255, df2);
_mm512_mask_storeu_ps(dfPtr1+422880+60416*i6+45312*j2+128*k3+128*m4+32*f5, 65280, df1);
_mm512_mask_storeu_ps(dfPtr1+422944+60416*i6+45312*j2+128*k3+128*m4+32*f5, 65280, df2);
_mm512_mask_storeu_ps(dfPtr1+845824+60416*i6+45312*j2+128*k3+128*m4+32*f5, 255, df9);
_mm512_mask_storeu_ps(dfPtr1+845888+60416*i6+45312*j2+128*k3+128*m4+32*f5, 255, df10);
_mm512_mask_storeu_ps(dfPtr1+1268704+60416*i6+45312*j2+128*k3+128*m4+32*f5, 65280, df9);
_mm512_mask_storeu_ps(dfPtr1+1268768+60416*i6+45312*j2+128*k3+128*m4+32*f5, 65280, df10);
ptrdiff_t b4 = 1;
ptrdiff_t m5 = (size_t)b4/2;
ptrdiff_t f6 = (size_t)b4%2;
__m512 dat15 = _mm512_maskz_loadu_ps(255, datPtr1+56+138768*i6+1176*k3+84*h1+4*w1+0*b4);
__m512 dat16 = _mm512_maskz_loadu_ps(255, datPtr1+140+138768*i6+1176*k3+84*h1+4*w1+0*b4);
__m512 dat17 = _mm512_maskz_loadu_ps(255, datPtr1+224+138768*i6+1176*k3+84*h1+4*w1+0*b4);
__m512 dat18 = _mm512_maskz_loadu_ps(255, datPtr1+308+138768*i6+1176*k3+84*h1+4*w1+0*b4);
__m512 dat19 = _mm512_maskz_loadu_ps(255, datPtr1+392+138768*i6+1176*k3+84*h1+4*w1+0*b4);
__m512 dat20 = _mm512_maskz_loadu_ps(255, datPtr1+476+138768*i6+1176*k3+84*h1+4*w1+0*b4);
__m512 dat21 = _mm512_maskz_loadu_ps(255, datPtr1+560+138768*i6+1176*k3+84*h1+4*w1+0*b4);
__m512 dat22 = _mm512_maskz_loadu_ps(255, datPtr1+644+138768*i6+1176*k3+84*h1+4*w1+0*b4);
__m512 dat23 = _mm512_maskz_loadu_ps(255, datPtr1+728+138768*i6+1176*k3+84*h1+4*w1+0*b4);
__m512 dat24 = _mm512_maskz_loadu_ps(255, datPtr1+812+138768*i6+1176*k3+84*h1+4*w1+0*b4);
__m512 dat25 = _mm512_maskz_loadu_ps(255, datPtr1+896+138768*i6+1176*k3+84*h1+4*w1+0*b4);
__m512 dat26 = _mm512_maskz_loadu_ps(255, datPtr1+980+138768*i6+1176*k3+84*h1+4*w1+0*b4);
__m512 dat27 = _mm512_maskz_loadu_ps(255, datPtr1+1064+138768*i6+1176*k3+84*h1+4*w1+0*b4);
__m512 dat28 = _mm512_maskz_loadu_ps(255, datPtr1+1148+138768*i6+1176*k3+84*h1+4*w1+0*b4);
__m512 fft673 = _mm512_add_ps(dat15, dat23);
__m512 fft761 = _mm512_add_ps(dat16, dat24);
__m512 fft674 = _mm512_sub_ps(dat15, dat23);
__m512 fft762 = _mm512_sub_ps(dat16, dat24);
__m512 fft675 = _mm512_add_ps(dat17, dat25);
__m512 fft763 = _mm512_add_ps(dat18, dat26);
__m512 fft676 = _mm512_sub_ps(dat17, dat25);
__m512 fft764 = _mm512_sub_ps(dat18, dat26);
__m512 fft677 = _mm512_add_ps(dat19, dat27);
__m512 fft765 = _mm512_add_ps(dat20, dat28);
__m512 fft678 = _mm512_sub_ps(dat19, dat27);
__m512 fft766 = _mm512_sub_ps(dat20, dat28);
__m512 fft679 = _mm512_add_ps(dat21, _mm512_setzero_ps());
__m512 fft767 = _mm512_add_ps(dat22, _mm512_setzero_ps());
__m512 fft680 = _mm512_sub_ps(dat21, _mm512_setzero_ps());
__m512 fft768 = _mm512_sub_ps(dat22, _mm512_setzero_ps());
__m512 fft681 = _mm512_add_ps(fft673, fft677);
__m512 fft769 = _mm512_add_ps(fft761, fft765);
__m512 fft682 = _mm512_sub_ps(fft673, fft677);
__m512 fft770 = _mm512_sub_ps(fft761, fft765);
__m512 fft683 = _mm512_add_ps(fft675, fft679);
__m512 fft771 = _mm512_add_ps(fft763, fft767);
__m512 fft684 = _mm512_sub_ps(fft679, fft675);
__m512 fft772 = _mm512_sub_ps(fft767, fft763);
__m512 fft685 = _mm512_sub_ps(fft676, fft680);
__m512 fft773 = _mm512_sub_ps(fft764, fft768);
__m512 fft686 = _mm512_add_ps(fft676, fft680);
__m512 fft774 = _mm512_add_ps(fft764, fft768);
__m512 fft687 = _mm512_add_ps(fft681, fft683);
__m512 fft775 = _mm512_add_ps(fft769, fft771);
__m512 fft688 = _mm512_sub_ps(fft681, fft683);
__m512 fft776 = _mm512_sub_ps(fft769, fft771);
__m512 fft689 = _mm512_fmadd_ps(fft685, _mm512_set1_ps(7.0710677e-01f), fft674);
__m512 fft777 = _mm512_fmadd_ps(fft773, _mm512_set1_ps(7.0710677e-01f), fft762);
__m512 fft690 = _mm512_fnmsub_ps(fft686, _mm512_set1_ps(7.0710677e-01f), fft678);
__m512 fft778 = _mm512_fnmsub_ps(fft774, _mm512_set1_ps(7.0710677e-01f), fft766);
__m512 fft691 = _mm512_fnmadd_ps(fft685, _mm512_set1_ps(7.0710677e-01f), fft674);
__m512 fft779 = _mm512_fnmadd_ps(fft773, _mm512_set1_ps(7.0710677e-01f), fft762);
__m512 fft692 = _mm512_fnmadd_ps(fft686, _mm512_set1_ps(7.0710677e-01f), fft678);
__m512 fft780 = _mm512_fnmadd_ps(fft774, _mm512_set1_ps(7.0710677e-01f), fft766);
__m512 fft693 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft694 = _mm512_fmadd_ps(fft687, fft693, _mm512_shuffle_f32x4(fft687, fft687, 78));
__m512 fft781 = _mm512_fmadd_ps(fft775, fft693, _mm512_shuffle_f32x4(fft775, fft775, 78));
__m512 fft695 = _mm512_fmadd_ps(fft688, fft693, _mm512_shuffle_f32x4(fft688, fft688, 78));
__m512 fft782 = _mm512_fmadd_ps(fft776, fft693, _mm512_shuffle_f32x4(fft776, fft776, 78));
__m512 fft696 = _mm512_fmadd_ps(fft689, fft693, _mm512_shuffle_f32x4(fft689, fft689, 78));
__m512 fft783 = _mm512_fmadd_ps(fft777, fft693, _mm512_shuffle_f32x4(fft777, fft777, 78));
__m512 fft697 = _mm512_fmadd_ps(fft690, fft693, _mm512_shuffle_f32x4(fft690, fft690, 78));
__m512 fft784 = _mm512_fmadd_ps(fft778, fft693, _mm512_shuffle_f32x4(fft778, fft778, 78));
__m512 fft698 = _mm512_fmadd_ps(fft682, fft693, _mm512_shuffle_f32x4(fft682, fft682, 78));
__m512 fft785 = _mm512_fmadd_ps(fft770, fft693, _mm512_shuffle_f32x4(fft770, fft770, 78));
__m512 fft699 = _mm512_fmadd_ps(fft684, fft693, _mm512_shuffle_f32x4(fft684, fft684, 78));
__m512 fft786 = _mm512_fmadd_ps(fft772, fft693, _mm512_shuffle_f32x4(fft772, fft772, 78));
__m512 fft700 = _mm512_fmadd_ps(fft691, fft693, _mm512_shuffle_f32x4(fft691, fft691, 78));
__m512 fft787 = _mm512_fmadd_ps(fft779, fft693, _mm512_shuffle_f32x4(fft779, fft779, 78));
__m512 fft701 = _mm512_fmadd_ps(fft692, fft693, _mm512_shuffle_f32x4(fft692, fft692, 78));
__m512 fft788 = _mm512_fmadd_ps(fft780, fft693, _mm512_shuffle_f32x4(fft780, fft780, 78));
__m512 fft702 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft703 = _mm512_mul_ps(fft694, fft702);
__m512 fft789 = _mm512_mul_ps(fft781, fft702);
__m512 fft704 = _mm512_mul_ps(fft695, fft702);
__m512 fft790 = _mm512_mul_ps(fft782, fft702);
__m512 fft705 = _mm512_mul_ps(fft696, fft702);
__m512 fft791 = _mm512_mul_ps(fft783, fft702);
__m512 fft706 = _mm512_mul_ps(fft697, fft702);
__m512 fft792 = _mm512_mul_ps(fft784, fft702);
__m512 fft707 = _mm512_mul_ps(fft698, fft702);
__m512 fft793 = _mm512_mul_ps(fft785, fft702);
__m512 fft708 = _mm512_mul_ps(fft699, fft702);
__m512 fft794 = _mm512_mul_ps(fft786, fft702);
__m512 fft709 = _mm512_mul_ps(fft700, fft702);
__m512 fft795 = _mm512_mul_ps(fft787, fft702);
__m512 fft710 = _mm512_mul_ps(fft701, fft702);
__m512 fft796 = _mm512_mul_ps(fft788, fft702);
__m512 fft711 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft712 = _mm512_fmadd_ps(fft695, fft711, fft703);
__m512 fft797 = _mm512_fmadd_ps(fft782, fft711, fft789);
__m512 fft713 = _mm512_fnmadd_ps(fft694, fft711, fft704);
__m512 fft798 = _mm512_fnmadd_ps(fft781, fft711, fft790);
__m512 fft714 = _mm512_fmadd_ps(fft697, fft711, fft705);
__m512 fft799 = _mm512_fmadd_ps(fft784, fft711, fft791);
__m512 fft715 = _mm512_fnmadd_ps(fft696, fft711, fft706);
__m512 fft800 = _mm512_fnmadd_ps(fft783, fft711, fft792);
__m512 fft716 = _mm512_fmadd_ps(fft699, fft711, fft707);
__m512 fft801 = _mm512_fmadd_ps(fft786, fft711, fft793);
__m512 fft717 = _mm512_fnmadd_ps(fft698, fft711, fft708);
__m512 fft802 = _mm512_fnmadd_ps(fft785, fft711, fft794);
__m512 fft718 = _mm512_fmadd_ps(fft701, fft711, fft709);
__m512 fft803 = _mm512_fmadd_ps(fft788, fft711, fft795);
__m512 fft719 = _mm512_fnmadd_ps(fft700, fft711, fft710);
__m512 fft804 = _mm512_fnmadd_ps(fft787, fft711, fft796);
__m512 fft720 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft721 = _mm512_fmadd_ps(fft712, fft720, _mm512_shuffle_f32x4(fft712, fft712, 177));
__m512 fft805 = _mm512_fmadd_ps(fft797, fft720, _mm512_shuffle_f32x4(fft797, fft797, 177));
__m512 fft722 = _mm512_fmadd_ps(fft713, fft720, _mm512_shuffle_f32x4(fft713, fft713, 177));
__m512 fft806 = _mm512_fmadd_ps(fft798, fft720, _mm512_shuffle_f32x4(fft798, fft798, 177));
__m512 fft723 = _mm512_fmadd_ps(fft714, fft720, _mm512_shuffle_f32x4(fft714, fft714, 177));
__m512 fft807 = _mm512_fmadd_ps(fft799, fft720, _mm512_shuffle_f32x4(fft799, fft799, 177));
__m512 fft724 = _mm512_fmadd_ps(fft715, fft720, _mm512_shuffle_f32x4(fft715, fft715, 177));
__m512 fft808 = _mm512_fmadd_ps(fft800, fft720, _mm512_shuffle_f32x4(fft800, fft800, 177));
__m512 fft725 = _mm512_fmadd_ps(fft716, fft720, _mm512_shuffle_f32x4(fft716, fft716, 177));
__m512 fft809 = _mm512_fmadd_ps(fft801, fft720, _mm512_shuffle_f32x4(fft801, fft801, 177));
__m512 fft726 = _mm512_fmadd_ps(fft717, fft720, _mm512_shuffle_f32x4(fft717, fft717, 177));
__m512 fft810 = _mm512_fmadd_ps(fft802, fft720, _mm512_shuffle_f32x4(fft802, fft802, 177));
__m512 fft727 = _mm512_fmadd_ps(fft718, fft720, _mm512_shuffle_f32x4(fft718, fft718, 177));
__m512 fft811 = _mm512_fmadd_ps(fft803, fft720, _mm512_shuffle_f32x4(fft803, fft803, 177));
__m512 fft728 = _mm512_fmadd_ps(fft719, fft720, _mm512_shuffle_f32x4(fft719, fft719, 177));
__m512 fft812 = _mm512_fmadd_ps(fft804, fft720, _mm512_shuffle_f32x4(fft804, fft804, 177));
__m512 fft729 = _mm512_mask_mov_ps(fft721, 49344, fft722);
__m512 fft813 = _mm512_mask_mov_ps(fft805, 49344, fft806);
__m512 fft730 = _mm512_mask_sub_ps(fft722, 49344, _mm512_setzero_ps(), fft721);
__m512 fft814 = _mm512_mask_sub_ps(fft806, 49344, _mm512_setzero_ps(), fft805);
__m512 fft731 = _mm512_mask_mov_ps(fft723, 49344, fft724);
__m512 fft815 = _mm512_mask_mov_ps(fft807, 49344, fft808);
__m512 fft732 = _mm512_mask_sub_ps(fft724, 49344, _mm512_setzero_ps(), fft723);
__m512 fft816 = _mm512_mask_sub_ps(fft808, 49344, _mm512_setzero_ps(), fft807);
__m512 fft733 = _mm512_mask_mov_ps(fft725, 49344, fft726);
__m512 fft817 = _mm512_mask_mov_ps(fft809, 49344, fft810);
__m512 fft734 = _mm512_mask_sub_ps(fft726, 49344, _mm512_setzero_ps(), fft725);
__m512 fft818 = _mm512_mask_sub_ps(fft810, 49344, _mm512_setzero_ps(), fft809);
__m512 fft735 = _mm512_mask_mov_ps(fft727, 49344, fft728);
__m512 fft819 = _mm512_mask_mov_ps(fft811, 49344, fft812);
__m512 fft736 = _mm512_mask_sub_ps(fft728, 49344, _mm512_setzero_ps(), fft727);
__m512 fft820 = _mm512_mask_sub_ps(fft812, 49344, _mm512_setzero_ps(), fft811);
__m512 fft737 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft738 = _mm512_fmadd_ps(fft729, fft737, _mm512_shuffle_ps(fft729, fft729, 78));
__m512 fft821 = _mm512_fmadd_ps(fft813, fft737, _mm512_shuffle_ps(fft813, fft813, 78));
__m512 fft739 = _mm512_fmadd_ps(fft730, fft737, _mm512_shuffle_ps(fft730, fft730, 78));
__m512 fft822 = _mm512_fmadd_ps(fft814, fft737, _mm512_shuffle_ps(fft814, fft814, 78));
__m512 fft740 = _mm512_fmadd_ps(fft731, fft737, _mm512_shuffle_ps(fft731, fft731, 78));
__m512 fft823 = _mm512_fmadd_ps(fft815, fft737, _mm512_shuffle_ps(fft815, fft815, 78));
__m512 fft741 = _mm512_fmadd_ps(fft732, fft737, _mm512_shuffle_ps(fft732, fft732, 78));
__m512 fft824 = _mm512_fmadd_ps(fft816, fft737, _mm512_shuffle_ps(fft816, fft816, 78));
__m512 fft742 = _mm512_fmadd_ps(fft733, fft737, _mm512_shuffle_ps(fft733, fft733, 78));
__m512 fft825 = _mm512_fmadd_ps(fft817, fft737, _mm512_shuffle_ps(fft817, fft817, 78));
__m512 fft743 = _mm512_fmadd_ps(fft734, fft737, _mm512_shuffle_ps(fft734, fft734, 78));
__m512 fft826 = _mm512_fmadd_ps(fft818, fft737, _mm512_shuffle_ps(fft818, fft818, 78));
__m512 fft744 = _mm512_fmadd_ps(fft735, fft737, _mm512_shuffle_ps(fft735, fft735, 78));
__m512 fft827 = _mm512_fmadd_ps(fft819, fft737, _mm512_shuffle_ps(fft819, fft819, 78));
__m512 fft745 = _mm512_fmadd_ps(fft736, fft737, _mm512_shuffle_ps(fft736, fft736, 78));
__m512 fft828 = _mm512_fmadd_ps(fft820, fft737, _mm512_shuffle_ps(fft820, fft820, 78));
__m512i fft746 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft747 = _mm512_permutexvar_ps(fft746, fft738);
__m512 fft829 = _mm512_permutexvar_ps(fft746, fft821);
__m512i fft748 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft749 = _mm512_permutexvar_ps(fft748, fft738);
__m512 fft830 = _mm512_permutexvar_ps(fft748, fft821);
__m512 fft750 = _mm512_permutexvar_ps(fft746, fft739);
__m512 fft831 = _mm512_permutexvar_ps(fft746, fft822);
__m512 fft751 = _mm512_permutexvar_ps(fft748, fft739);
__m512 fft832 = _mm512_permutexvar_ps(fft748, fft822);
__m512 fft752 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft753 = _mm512_fmadd_ps(fft747, fft752, fft749);
__m512 fft833 = _mm512_fmadd_ps(fft829, fft752, fft830);
__m512 fft754 = _mm512_fnmadd_ps(fft751, fft752, fft750);
__m512 fft834 = _mm512_fnmadd_ps(fft832, fft752, fft831);
__m512 fft755 = _mm512_mask_mov_ps(fft751, 21845, fft753);
__m512 fft835 = _mm512_mask_mov_ps(fft832, 21845, fft833);
__m512 fft756 = _mm512_mask_mov_ps(fft747, 43176, fft753);
__m512 fft836 = _mm512_mask_mov_ps(fft829, 43176, fft833);
__m512 fft757 = _mm512_mask_mov_ps(fft755, 43176, fft754);
__m512 fft837 = _mm512_mask_mov_ps(fft835, 43176, fft834);
__m512 fft758 = _mm512_mask_mov_ps(fft756, 22102, fft754);
__m512 fft838 = _mm512_mask_mov_ps(fft836, 22102, fft834);
__m512 fft759 = _mm512_mask_mul_ps(fft757, 64764, fft757, _mm512_set1_ps(5e-01f));
__m512 fft839 = _mm512_mask_mul_ps(fft837, 64764, fft837, _mm512_set1_ps(5e-01f));
__m512 fft760 = _mm512_mask_mul_ps(fft758, 64764, fft758, _mm512_set1_ps(5e-01f));
__m512 fft840 = _mm512_mask_mul_ps(fft838, 64764, fft838, _mm512_set1_ps(5e-01f));
__m512 df17 = fft759;
__m512 df25 = fft839;
__m512 df18 = fft760;
__m512 df26 = fft840;
__m512 df19 = fft740;
__m512 df27 = fft823;
__m512 df20 = fft741;
__m512 df28 = fft824;
__m512 df21 = fft742;
__m512 df29 = fft825;
__m512 df22 = fft743;
__m512 df30 = fft826;
__m512 df23 = fft744;
__m512 df31 = fft827;
__m512 df24 = fft745;
__m512 df32 = fft828;
__m512i eo5 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df19 = _mm512_permutexvar_ps(eo5, df19);
df20 = _mm512_permutexvar_ps(eo5, df20);
_mm512_mask_storeu_ps(dfPtr1+15104+60416*i6+45312*j2+128*k3+128*m5+32*f6, 255, df19);
_mm512_mask_storeu_ps(dfPtr1+15168+60416*i6+45312*j2+128*k3+128*m5+32*f6, 255, df20);
_mm512_mask_storeu_ps(dfPtr1+437984+60416*i6+45312*j2+128*k3+128*m5+32*f6, 65280, df19);
_mm512_mask_storeu_ps(dfPtr1+438048+60416*i6+45312*j2+128*k3+128*m5+32*f6, 65280, df20);
df27 = _mm512_permutexvar_ps(eo5, df27);
df28 = _mm512_permutexvar_ps(eo5, df28);
_mm512_mask_storeu_ps(dfPtr1+860928+60416*i6+45312*j2+128*k3+128*m5+32*f6, 255, df27);
_mm512_mask_storeu_ps(dfPtr1+860992+60416*i6+45312*j2+128*k3+128*m5+32*f6, 255, df28);
_mm512_mask_storeu_ps(dfPtr1+1283808+60416*i6+45312*j2+128*k3+128*m5+32*f6, 65280, df27);
_mm512_mask_storeu_ps(dfPtr1+1283872+60416*i6+45312*j2+128*k3+128*m5+32*f6, 65280, df28);
df21 = _mm512_permutexvar_ps(eo5, df21);
df22 = _mm512_permutexvar_ps(eo5, df22);
_mm512_mask_storeu_ps(dfPtr1+30208+60416*i6+45312*j2+128*k3+128*m5+32*f6, 255, df21);
_mm512_mask_storeu_ps(dfPtr1+30272+60416*i6+45312*j2+128*k3+128*m5+32*f6, 255, df22);
_mm512_mask_storeu_ps(dfPtr1+453088+60416*i6+45312*j2+128*k3+128*m5+32*f6, 65280, df21);
_mm512_mask_storeu_ps(dfPtr1+453152+60416*i6+45312*j2+128*k3+128*m5+32*f6, 65280, df22);
df29 = _mm512_permutexvar_ps(eo5, df29);
df30 = _mm512_permutexvar_ps(eo5, df30);
_mm512_mask_storeu_ps(dfPtr1+876032+60416*i6+45312*j2+128*k3+128*m5+32*f6, 255, df29);
_mm512_mask_storeu_ps(dfPtr1+876096+60416*i6+45312*j2+128*k3+128*m5+32*f6, 255, df30);
_mm512_mask_storeu_ps(dfPtr1+1298912+60416*i6+45312*j2+128*k3+128*m5+32*f6, 65280, df29);
_mm512_mask_storeu_ps(dfPtr1+1298976+60416*i6+45312*j2+128*k3+128*m5+32*f6, 65280, df30);
df23 = _mm512_permutexvar_ps(eo5, df23);
df24 = _mm512_permutexvar_ps(eo5, df24);
_mm512_mask_storeu_ps(dfPtr1+45312+60416*i6+45312*j2+128*k3+128*m5+32*f6, 255, df23);
_mm512_mask_storeu_ps(dfPtr1+45376+60416*i6+45312*j2+128*k3+128*m5+32*f6, 255, df24);
_mm512_mask_storeu_ps(dfPtr1+468192+60416*i6+45312*j2+128*k3+128*m5+32*f6, 65280, df23);
_mm512_mask_storeu_ps(dfPtr1+468256+60416*i6+45312*j2+128*k3+128*m5+32*f6, 65280, df24);
df31 = _mm512_permutexvar_ps(eo5, df31);
df32 = _mm512_permutexvar_ps(eo5, df32);
_mm512_mask_storeu_ps(dfPtr1+891136+60416*i6+45312*j2+128*k3+128*m5+32*f6, 255, df31);
_mm512_mask_storeu_ps(dfPtr1+891200+60416*i6+45312*j2+128*k3+128*m5+32*f6, 255, df32);
_mm512_mask_storeu_ps(dfPtr1+1314016+60416*i6+45312*j2+128*k3+128*m5+32*f6, 65280, df31);
_mm512_mask_storeu_ps(dfPtr1+1314080+60416*i6+45312*j2+128*k3+128*m5+32*f6, 65280, df32);
_mm512_mask_storeu_ps(dfPtr1+0+60416*i6+45312*j2+128*k3+128*m5+32*f6, 255, df17);
_mm512_mask_storeu_ps(dfPtr1+64+60416*i6+45312*j2+128*k3+128*m5+32*f6, 255, df18);
_mm512_mask_storeu_ps(dfPtr1+422880+60416*i6+45312*j2+128*k3+128*m5+32*f6, 65280, df17);
_mm512_mask_storeu_ps(dfPtr1+422944+60416*i6+45312*j2+128*k3+128*m5+32*f6, 65280, df18);
_mm512_mask_storeu_ps(dfPtr1+845824+60416*i6+45312*j2+128*k3+128*m5+32*f6, 255, df25);
_mm512_mask_storeu_ps(dfPtr1+845888+60416*i6+45312*j2+128*k3+128*m5+32*f6, 255, df26);
_mm512_mask_storeu_ps(dfPtr1+1268704+60416*i6+45312*j2+128*k3+128*m5+32*f6, 65280, df25);
_mm512_mask_storeu_ps(dfPtr1+1268768+60416*i6+45312*j2+128*k3+128*m5+32*f6, 65280, df26);
}
++j2;
}

/* Dispatch the dat-arrangement pass as one threader task whose 4-D hull
 * is {1, 1, 7, 1}: only the third axis (the 7 channel groups) fans out
 * across threads. Behavior is identical to the generated original. */
static void Example7StriderArrangeDats1(Example7ThreaderTeam1* team15, char** tensors3) {
    Example7ThreaderTask1 work = {
        .callee1 = Example7StriderArrangeDats1Callee1,
        .any1 = tensors3,
        .nd1 = 4,
        .hull1 = {1, 1, 7, 1},
    };
    Example7ThreaderDo1(team15, &work);
}

// Strider produce-sums task body: multiplies frequency-domain data
// factors (dfPtr2/dfPtr3, fp32) by weight factors (wfPtr2/wfPtr3,
// stored as fp16 pairs and widened with _mm512_cvtph_ps) and writes or
// accumulates complex partial sums into sfPtr1/sfPtr2.
//
// pt9 holds this task's coordinates inside the hull {3, 1, 4, 7}
// declared by Example7StriderProduceSums1: pt9[0] (w2) selects a band
// of filter blocks, pt9[2] (p1) and pt9[3] (g4) select the j/i slices.
// tuple2[2] (z2) is the chunk index 0..3 from the dispatcher's z3 loop;
// tuple2[1] is never read here (e3 is fixed at 0).
//
// Mask constants: 64764 == 0xFCFC selects lanes 2-7 and 10-15, which
// carry interleaved complex bins; lanes 0,1,8,9 are handled without an
// imaginary contribution — presumably real-only (DC/Nyquist-style)
// bins, TODO confirm against the arrangement pass. 257 == lanes 0 and
// 8; 1 and 256 address those lanes individually.
static void Example7StriderProduceSums1Callee1(Example7ThreaderTask1* task8, int64_t* pt9) {
void** tuple2 = task8->any1;
char** tensors6 = tuple2[0];
ptrdiff_t e3 = 0;
ptrdiff_t z2 = (ptrdiff_t)tuple2[2];
ptrdiff_t g4 = pt9[3];
ptrdiff_t p1 = pt9[2];
ptrdiff_t d1 = 0;
ptrdiff_t w2 = pt9[0];
// First chunk (z2 == 0): sums are written fresh (no read-modify-write
// of sfPtr1). Later chunks take the accumulate path below.
if (__builtin_expect(!(e3|z2), 0)) {
z2 = 0;
char*restrict bfPtr2 = tensors6[0]+1176*e3;
char*restrict wfPtr2 = tensors6[0]+1216+99499008*e3+4440576*z2;
char*restrict dfPtr2 = tensors6[1]+9476096*e3+422912*z2;
char*restrict sfPtr1 = tensors6[2];
ptrdiff_t i7 = 1*g4;
ptrdiff_t j3 = 1*p1;
ptrdiff_t jj2 = j3+0;
// j == 0 slice: masked complex arithmetic, and the sums are seeded
// from bfPtr2 — presumably bias terms folded into lanes 0 and 8,
// TODO confirm against the bias arrangement.
if (__builtin_expect(!j3, 0)) {
ptrdiff_t k4 = 1*d1;
ptrdiff_t l1 = 3*w2;
// This task covers filter blocks l1 .. ll1 (early return after the
// store once ll1 is reached); l1 == 10 is the remainder block below.
ptrdiff_t ll1 = l1+(w2 < 2 ? 2 : 4);
for (; l1 != 10; ++l1) {
__m512 sfRe1 = _mm512_setzero_ps();
__m512 sfIm1 = _mm512_setzero_ps();
__m512 sfRe3 = _mm512_setzero_ps();
__m512 sfIm3 = _mm512_setzero_ps();
sfRe1 = _mm512_mask_mov_ps(sfRe1, 1, _mm512_set1_ps(*(float*)(bfPtr2+0+168*i7+16*l1)));
sfRe1 = _mm512_mask_mov_ps(sfRe1, 256, _mm512_set1_ps(*(float*)(bfPtr2+4+168*i7+16*l1)));
sfRe3 = _mm512_mask_mov_ps(sfRe3, 1, _mm512_set1_ps(*(float*)(bfPtr2+8+168*i7+16*l1)));
sfRe3 = _mm512_mask_mov_ps(sfRe3, 256, _mm512_set1_ps(*(float*)(bfPtr2+12+168*i7+16*l1)));
__m512 sfRe2 = sfRe1;
__m512 sfIm2 = sfIm1;
__m512 sfRe4 = sfRe3;
__m512 sfIm4 = sfIm3;
// Reduce over 118 frequency positions. Each wf load packs real
// halves in the low 256 bits and imaginary halves in the high 256
// bits, both as fp16; wfMx merges real into the complex lanes so a
// single FMA produces re*im + im*re cross terms for sfIm.
for (ptrdiff_t s2 = 0; s2 < 118; ++s2) {
__m512i wfLd1 = _mm512_loadu_si512(wfPtr2+0+634368*i7+158592*j3+15104*l1+128*s2);
__m512 wfRe1 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd1));
__m512 wfIm1 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd1, 1));
__m512 wfMx1 = _mm512_mask_mov_ps(wfIm1, 64764, wfRe1);
__m512i wfLd2 = _mm512_loadu_si512(wfPtr2+64+634368*i7+158592*j3+15104*l1+128*s2);
__m512 wfRe2 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd2));
__m512 wfIm2 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd2, 1));
__m512 wfMx2 = _mm512_mask_mov_ps(wfIm2, 64764, wfRe2);
__m512 dfRe1 = _mm512_loadu_ps(dfPtr2+0+60416*i7+15104*j3+45312*k4+128*s2);
__m512 dfIm1 = _mm512_loadu_ps(dfPtr2+64+60416*i7+15104*j3+45312*k4+128*s2);
sfRe1 = _mm512_fmadd_ps(wfRe1, dfRe1, sfRe1);
sfRe1 = _mm512_mask3_fmadd_ps(wfIm1, dfIm1, sfRe1, 64764);
sfIm1 = _mm512_fmadd_ps(wfMx1, dfIm1, sfIm1);
sfIm1 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe1, sfIm1, 64764);
sfRe3 = _mm512_fmadd_ps(wfRe2, dfRe1, sfRe3);
sfRe3 = _mm512_mask3_fmadd_ps(wfIm2, dfIm1, sfRe3, 64764);
sfIm3 = _mm512_fmadd_ps(wfMx2, dfIm1, sfIm3);
sfIm3 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe1, sfIm3, 64764);
// Swap the two 256-bit halves of the data and accumulate again:
// the same weights are applied to the other data position packed
// in this vector.
dfRe1 = _mm512_shuffle_f32x4(dfRe1, dfRe1, 78);
dfIm1 = _mm512_shuffle_f32x4(dfIm1, dfIm1, 78);
sfRe2 = _mm512_fmadd_ps(wfRe1, dfRe1, sfRe2);
sfRe2 = _mm512_mask3_fmadd_ps(wfIm1, dfIm1, sfRe2, 64764);
sfIm2 = _mm512_fmadd_ps(wfMx1, dfIm1, sfIm2);
sfIm2 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe1, sfIm2, 64764);
sfRe4 = _mm512_fmadd_ps(wfRe2, dfRe1, sfRe4);
sfRe4 = _mm512_mask3_fmadd_ps(wfIm2, dfIm1, sfRe4, 64764);
sfIm4 = _mm512_fmadd_ps(wfMx2, dfIm1, sfIm4);
sfIm4 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe1, sfIm4, 64764);
}
_mm512_storeu_ps(sfPtr1+0+20992*i7+5248*j3+15744*k4+512*l1, sfRe1);
_mm512_storeu_ps(sfPtr1+64+20992*i7+5248*j3+15744*k4+512*l1, sfIm1);
_mm512_storeu_ps(sfPtr1+128+20992*i7+5248*j3+15744*k4+512*l1, sfRe2);
_mm512_storeu_ps(sfPtr1+192+20992*i7+5248*j3+15744*k4+512*l1, sfIm2);
_mm512_storeu_ps(sfPtr1+256+20992*i7+5248*j3+15744*k4+512*l1, sfRe3);
_mm512_storeu_ps(sfPtr1+320+20992*i7+5248*j3+15744*k4+512*l1, sfIm3);
_mm512_storeu_ps(sfPtr1+384+20992*i7+5248*j3+15744*k4+512*l1, sfRe4);
_mm512_storeu_ps(sfPtr1+448+20992*i7+5248*j3+15744*k4+512*l1, sfIm4);
if (l1 >= ll1) return;
}
// Remainder filter block (l1 == 10): a single sum pair, weight
// stride 64 bytes instead of 128, one bias scalar broadcast into
// lanes 0 and 8 (mask 257).
__m512 sfRe5 = _mm512_setzero_ps();
__m512 sfIm5 = _mm512_setzero_ps();
sfRe5 = _mm512_mask_mov_ps(sfRe5, 257, _mm512_set1_ps(*(float*)(bfPtr2+0+168*i7+16*l1)));
for (ptrdiff_t s3 = 0; s3 < 118; ++s3) {
__m512i wfLd3 = _mm512_loadu_si512(wfPtr2+0+634368*i7+158592*j3+15104*l1+64*s3);
__m512 wfRe3 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd3));
__m512 wfIm3 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd3, 1));
__m512 wfMx3 = _mm512_mask_mov_ps(wfIm3, 64764, wfRe3);
__m512 dfRe2 = _mm512_loadu_ps(dfPtr2+0+60416*i7+15104*j3+45312*k4+128*s3);
__m512 dfIm2 = _mm512_loadu_ps(dfPtr2+64+60416*i7+15104*j3+45312*k4+128*s3);
sfRe5 = _mm512_fmadd_ps(wfRe3, dfRe2, sfRe5);
sfRe5 = _mm512_mask3_fmadd_ps(wfIm3, dfIm2, sfRe5, 64764);
sfIm5 = _mm512_fmadd_ps(wfMx3, dfIm2, sfIm5);
sfIm5 = _mm512_mask3_fnmadd_ps(wfIm3, dfRe2, sfIm5, 64764);
}
_mm512_storeu_ps(sfPtr1+0+20992*i7+5248*j3+15744*k4+512*l1, sfRe5);
_mm512_storeu_ps(sfPtr1+64+20992*i7+5248*j3+15744*k4+512*l1, sfIm5);
j3 = 1;
}
// Slices j >= 1: all lanes are fully complex (unmasked FMAs) and no
// bias is applied.
for (; j3 <= jj2; ++j3) {
ptrdiff_t k5 = 1*d1;
ptrdiff_t l2 = 3*w2;
ptrdiff_t ll2 = l2+(w2 < 2 ? 2 : 4);
for (; l2 != 10; ++l2) {
__m512 sfRe6 = _mm512_setzero_ps();
__m512 sfIm6 = _mm512_setzero_ps();
__m512 sfRe8 = _mm512_setzero_ps();
__m512 sfIm8 = _mm512_setzero_ps();
(void)bfPtr2;
__m512 sfRe7 = sfRe6;
__m512 sfIm7 = sfIm6;
__m512 sfRe9 = sfRe8;
__m512 sfIm9 = sfIm8;
for (ptrdiff_t s4 = 0; s4 < 118; ++s4) {
__m512i wfLd4 = _mm512_loadu_si512(wfPtr2+0+634368*i7+158592*j3+15104*l2+128*s4);
__m512 wfRe4 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd4));
__m512 wfIm4 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd4, 1));
__m512i wfLd5 = _mm512_loadu_si512(wfPtr2+64+634368*i7+158592*j3+15104*l2+128*s4);
__m512 wfRe5 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd5));
__m512 wfIm5 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd5, 1));
__m512 dfRe3 = _mm512_loadu_ps(dfPtr2+0+60416*i7+15104*j3+45312*k5+128*s4);
__m512 dfIm3 = _mm512_loadu_ps(dfPtr2+64+60416*i7+15104*j3+45312*k5+128*s4);
sfRe6 = _mm512_fmadd_ps(wfRe4, dfRe3, sfRe6);
sfRe6 = _mm512_fmadd_ps(wfIm4, dfIm3, sfRe6);
sfIm6 = _mm512_fmadd_ps(wfRe4, dfIm3, sfIm6);
sfIm6 = _mm512_fnmadd_ps(wfIm4, dfRe3, sfIm6);
sfRe8 = _mm512_fmadd_ps(wfRe5, dfRe3, sfRe8);
sfRe8 = _mm512_fmadd_ps(wfIm5, dfIm3, sfRe8);
sfIm8 = _mm512_fmadd_ps(wfRe5, dfIm3, sfIm8);
sfIm8 = _mm512_fnmadd_ps(wfIm5, dfRe3, sfIm8);
dfRe3 = _mm512_shuffle_f32x4(dfRe3, dfRe3, 78);
dfIm3 = _mm512_shuffle_f32x4(dfIm3, dfIm3, 78);
sfRe7 = _mm512_fmadd_ps(wfRe4, dfRe3, sfRe7);
sfRe7 = _mm512_fmadd_ps(wfIm4, dfIm3, sfRe7);
sfIm7 = _mm512_fmadd_ps(wfRe4, dfIm3, sfIm7);
sfIm7 = _mm512_fnmadd_ps(wfIm4, dfRe3, sfIm7);
sfRe9 = _mm512_fmadd_ps(wfRe5, dfRe3, sfRe9);
sfRe9 = _mm512_fmadd_ps(wfIm5, dfIm3, sfRe9);
sfIm9 = _mm512_fmadd_ps(wfRe5, dfIm3, sfIm9);
sfIm9 = _mm512_fnmadd_ps(wfIm5, dfRe3, sfIm9);
}
_mm512_storeu_ps(sfPtr1+0+20992*i7+5248*j3+15744*k5+512*l2, sfRe6);
_mm512_storeu_ps(sfPtr1+64+20992*i7+5248*j3+15744*k5+512*l2, sfIm6);
_mm512_storeu_ps(sfPtr1+128+20992*i7+5248*j3+15744*k5+512*l2, sfRe7);
_mm512_storeu_ps(sfPtr1+192+20992*i7+5248*j3+15744*k5+512*l2, sfIm7);
_mm512_storeu_ps(sfPtr1+256+20992*i7+5248*j3+15744*k5+512*l2, sfRe8);
_mm512_storeu_ps(sfPtr1+320+20992*i7+5248*j3+15744*k5+512*l2, sfIm8);
_mm512_storeu_ps(sfPtr1+384+20992*i7+5248*j3+15744*k5+512*l2, sfRe9);
_mm512_storeu_ps(sfPtr1+448+20992*i7+5248*j3+15744*k5+512*l2, sfIm9);
if (l2 >= ll2) return;
}
__m512 sfRe10 = _mm512_setzero_ps();
__m512 sfIm10 = _mm512_setzero_ps();
(void)bfPtr2;
for (ptrdiff_t s5 = 0; s5 < 118; ++s5) {
__m512i wfLd6 = _mm512_loadu_si512(wfPtr2+0+634368*i7+158592*j3+15104*l2+64*s5);
__m512 wfRe6 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd6));
__m512 wfIm6 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd6, 1));
__m512 dfRe4 = _mm512_loadu_ps(dfPtr2+0+60416*i7+15104*j3+45312*k5+128*s5);
__m512 dfIm4 = _mm512_loadu_ps(dfPtr2+64+60416*i7+15104*j3+45312*k5+128*s5);
sfRe10 = _mm512_fmadd_ps(wfRe6, dfRe4, sfRe10);
sfRe10 = _mm512_fmadd_ps(wfIm6, dfIm4, sfRe10);
sfIm10 = _mm512_fmadd_ps(wfRe6, dfIm4, sfIm10);
sfIm10 = _mm512_fnmadd_ps(wfIm6, dfRe4, sfIm10);
}
_mm512_storeu_ps(sfPtr1+0+20992*i7+5248*j3+15744*k5+512*l2, sfRe10);
_mm512_storeu_ps(sfPtr1+64+20992*i7+5248*j3+15744*k5+512*l2, sfIm10);
}
return;
}
// Chunks z2 >= 1: identical structure to the branch above, but each
// partial sum is accumulated onto the sums already in sfPtr2 (and no
// bias seeding — bias was applied by the z2 == 0 pass).
char*restrict bfPtr3 = tensors6[0]+1176*e3;
char*restrict wfPtr3 = tensors6[0]+1216+99499008*e3+4440576*z2;
char*restrict dfPtr3 = tensors6[1]+9476096*e3+422912*z2;
char*restrict sfPtr2 = tensors6[2];
ptrdiff_t i8 = 1*g4;
ptrdiff_t j4 = 1*p1;
ptrdiff_t jj3 = j4+0;
if (__builtin_expect(!j4, 0)) {
ptrdiff_t k6 = 1*d1;
ptrdiff_t l3 = 3*w2;
ptrdiff_t ll3 = l3+(w2 < 2 ? 2 : 4);
for (; l3 != 10; ++l3) {
__m512 sfRe11 = _mm512_setzero_ps();
__m512 sfIm11 = _mm512_setzero_ps();
__m512 sfRe13 = _mm512_setzero_ps();
__m512 sfIm13 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe12 = sfRe11;
__m512 sfIm12 = sfIm11;
__m512 sfRe14 = sfRe13;
__m512 sfIm14 = sfIm13;
for (ptrdiff_t s6 = 0; s6 < 118; ++s6) {
__m512i wfLd7 = _mm512_loadu_si512(wfPtr3+0+634368*i8+158592*j4+15104*l3+128*s6);
__m512 wfRe7 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd7));
__m512 wfIm7 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd7, 1));
__m512 wfMx4 = _mm512_mask_mov_ps(wfIm7, 64764, wfRe7);
__m512i wfLd8 = _mm512_loadu_si512(wfPtr3+64+634368*i8+158592*j4+15104*l3+128*s6);
__m512 wfRe8 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd8));
__m512 wfIm8 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd8, 1));
__m512 wfMx5 = _mm512_mask_mov_ps(wfIm8, 64764, wfRe8);
__m512 dfRe5 = _mm512_loadu_ps(dfPtr3+0+60416*i8+15104*j4+45312*k6+128*s6);
__m512 dfIm5 = _mm512_loadu_ps(dfPtr3+64+60416*i8+15104*j4+45312*k6+128*s6);
sfRe11 = _mm512_fmadd_ps(wfRe7, dfRe5, sfRe11);
sfRe11 = _mm512_mask3_fmadd_ps(wfIm7, dfIm5, sfRe11, 64764);
sfIm11 = _mm512_fmadd_ps(wfMx4, dfIm5, sfIm11);
sfIm11 = _mm512_mask3_fnmadd_ps(wfIm7, dfRe5, sfIm11, 64764);
sfRe13 = _mm512_fmadd_ps(wfRe8, dfRe5, sfRe13);
sfRe13 = _mm512_mask3_fmadd_ps(wfIm8, dfIm5, sfRe13, 64764);
sfIm13 = _mm512_fmadd_ps(wfMx5, dfIm5, sfIm13);
sfIm13 = _mm512_mask3_fnmadd_ps(wfIm8, dfRe5, sfIm13, 64764);
dfRe5 = _mm512_shuffle_f32x4(dfRe5, dfRe5, 78);
dfIm5 = _mm512_shuffle_f32x4(dfIm5, dfIm5, 78);
sfRe12 = _mm512_fmadd_ps(wfRe7, dfRe5, sfRe12);
sfRe12 = _mm512_mask3_fmadd_ps(wfIm7, dfIm5, sfRe12, 64764);
sfIm12 = _mm512_fmadd_ps(wfMx4, dfIm5, sfIm12);
sfIm12 = _mm512_mask3_fnmadd_ps(wfIm7, dfRe5, sfIm12, 64764);
sfRe14 = _mm512_fmadd_ps(wfRe8, dfRe5, sfRe14);
sfRe14 = _mm512_mask3_fmadd_ps(wfIm8, dfIm5, sfRe14, 64764);
sfIm14 = _mm512_fmadd_ps(wfMx5, dfIm5, sfIm14);
sfIm14 = _mm512_mask3_fnmadd_ps(wfIm8, dfRe5, sfIm14, 64764);
}
// Read-modify-write: add this chunk's partial sums to the running
// totals produced by earlier chunks.
sfRe11 = _mm512_add_ps(sfRe11, _mm512_loadu_ps(sfPtr2+0+20992*i8+5248*j4+15744*k6+512*l3));
sfIm11 = _mm512_add_ps(sfIm11, _mm512_loadu_ps(sfPtr2+64+20992*i8+5248*j4+15744*k6+512*l3));
sfRe12 = _mm512_add_ps(sfRe12, _mm512_loadu_ps(sfPtr2+128+20992*i8+5248*j4+15744*k6+512*l3));
sfIm12 = _mm512_add_ps(sfIm12, _mm512_loadu_ps(sfPtr2+192+20992*i8+5248*j4+15744*k6+512*l3));
sfRe13 = _mm512_add_ps(sfRe13, _mm512_loadu_ps(sfPtr2+256+20992*i8+5248*j4+15744*k6+512*l3));
sfIm13 = _mm512_add_ps(sfIm13, _mm512_loadu_ps(sfPtr2+320+20992*i8+5248*j4+15744*k6+512*l3));
sfRe14 = _mm512_add_ps(sfRe14, _mm512_loadu_ps(sfPtr2+384+20992*i8+5248*j4+15744*k6+512*l3));
sfIm14 = _mm512_add_ps(sfIm14, _mm512_loadu_ps(sfPtr2+448+20992*i8+5248*j4+15744*k6+512*l3));
_mm512_storeu_ps(sfPtr2+0+20992*i8+5248*j4+15744*k6+512*l3, sfRe11);
_mm512_storeu_ps(sfPtr2+64+20992*i8+5248*j4+15744*k6+512*l3, sfIm11);
_mm512_storeu_ps(sfPtr2+128+20992*i8+5248*j4+15744*k6+512*l3, sfRe12);
_mm512_storeu_ps(sfPtr2+192+20992*i8+5248*j4+15744*k6+512*l3, sfIm12);
_mm512_storeu_ps(sfPtr2+256+20992*i8+5248*j4+15744*k6+512*l3, sfRe13);
_mm512_storeu_ps(sfPtr2+320+20992*i8+5248*j4+15744*k6+512*l3, sfIm13);
_mm512_storeu_ps(sfPtr2+384+20992*i8+5248*j4+15744*k6+512*l3, sfRe14);
_mm512_storeu_ps(sfPtr2+448+20992*i8+5248*j4+15744*k6+512*l3, sfIm14);
if (l3 >= ll3) return;
}
__m512 sfRe15 = _mm512_setzero_ps();
__m512 sfIm15 = _mm512_setzero_ps();
(void)bfPtr3;
for (ptrdiff_t s7 = 0; s7 < 118; ++s7) {
__m512i wfLd9 = _mm512_loadu_si512(wfPtr3+0+634368*i8+158592*j4+15104*l3+64*s7);
__m512 wfRe9 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd9));
__m512 wfIm9 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd9, 1));
__m512 wfMx6 = _mm512_mask_mov_ps(wfIm9, 64764, wfRe9);
__m512 dfRe6 = _mm512_loadu_ps(dfPtr3+0+60416*i8+15104*j4+45312*k6+128*s7);
__m512 dfIm6 = _mm512_loadu_ps(dfPtr3+64+60416*i8+15104*j4+45312*k6+128*s7);
sfRe15 = _mm512_fmadd_ps(wfRe9, dfRe6, sfRe15);
sfRe15 = _mm512_mask3_fmadd_ps(wfIm9, dfIm6, sfRe15, 64764);
sfIm15 = _mm512_fmadd_ps(wfMx6, dfIm6, sfIm15);
sfIm15 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe6, sfIm15, 64764);
}
sfRe15 = _mm512_add_ps(sfRe15, _mm512_loadu_ps(sfPtr2+0+20992*i8+5248*j4+15744*k6+512*l3));
sfIm15 = _mm512_add_ps(sfIm15, _mm512_loadu_ps(sfPtr2+64+20992*i8+5248*j4+15744*k6+512*l3));
_mm512_storeu_ps(sfPtr2+0+20992*i8+5248*j4+15744*k6+512*l3, sfRe15);
_mm512_storeu_ps(sfPtr2+64+20992*i8+5248*j4+15744*k6+512*l3, sfIm15);
j4 = 1;
}
for (; j4 <= jj3; ++j4) {
ptrdiff_t k7 = 1*d1;
ptrdiff_t l4 = 3*w2;
ptrdiff_t ll4 = l4+(w2 < 2 ? 2 : 4);
for (; l4 != 10; ++l4) {
__m512 sfRe16 = _mm512_setzero_ps();
__m512 sfIm16 = _mm512_setzero_ps();
__m512 sfRe18 = _mm512_setzero_ps();
__m512 sfIm18 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe17 = sfRe16;
__m512 sfIm17 = sfIm16;
__m512 sfRe19 = sfRe18;
__m512 sfIm19 = sfIm18;
for (ptrdiff_t s8 = 0; s8 < 118; ++s8) {
__m512i wfLd10 = _mm512_loadu_si512(wfPtr3+0+634368*i8+158592*j4+15104*l4+128*s8);
__m512 wfRe10 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd10));
__m512 wfIm10 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd10, 1));
__m512i wfLd11 = _mm512_loadu_si512(wfPtr3+64+634368*i8+158592*j4+15104*l4+128*s8);
__m512 wfRe11 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd11));
__m512 wfIm11 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd11, 1));
__m512 dfRe7 = _mm512_loadu_ps(dfPtr3+0+60416*i8+15104*j4+45312*k7+128*s8);
__m512 dfIm7 = _mm512_loadu_ps(dfPtr3+64+60416*i8+15104*j4+45312*k7+128*s8);
sfRe16 = _mm512_fmadd_ps(wfRe10, dfRe7, sfRe16);
sfRe16 = _mm512_fmadd_ps(wfIm10, dfIm7, sfRe16);
sfIm16 = _mm512_fmadd_ps(wfRe10, dfIm7, sfIm16);
sfIm16 = _mm512_fnmadd_ps(wfIm10, dfRe7, sfIm16);
sfRe18 = _mm512_fmadd_ps(wfRe11, dfRe7, sfRe18);
sfRe18 = _mm512_fmadd_ps(wfIm11, dfIm7, sfRe18);
sfIm18 = _mm512_fmadd_ps(wfRe11, dfIm7, sfIm18);
sfIm18 = _mm512_fnmadd_ps(wfIm11, dfRe7, sfIm18);
dfRe7 = _mm512_shuffle_f32x4(dfRe7, dfRe7, 78);
dfIm7 = _mm512_shuffle_f32x4(dfIm7, dfIm7, 78);
sfRe17 = _mm512_fmadd_ps(wfRe10, dfRe7, sfRe17);
sfRe17 = _mm512_fmadd_ps(wfIm10, dfIm7, sfRe17);
sfIm17 = _mm512_fmadd_ps(wfRe10, dfIm7, sfIm17);
sfIm17 = _mm512_fnmadd_ps(wfIm10, dfRe7, sfIm17);
sfRe19 = _mm512_fmadd_ps(wfRe11, dfRe7, sfRe19);
sfRe19 = _mm512_fmadd_ps(wfIm11, dfIm7, sfRe19);
sfIm19 = _mm512_fmadd_ps(wfRe11, dfIm7, sfIm19);
sfIm19 = _mm512_fnmadd_ps(wfIm11, dfRe7, sfIm19);
}
sfRe16 = _mm512_add_ps(sfRe16, _mm512_loadu_ps(sfPtr2+0+20992*i8+5248*j4+15744*k7+512*l4));
sfIm16 = _mm512_add_ps(sfIm16, _mm512_loadu_ps(sfPtr2+64+20992*i8+5248*j4+15744*k7+512*l4));
sfRe17 = _mm512_add_ps(sfRe17, _mm512_loadu_ps(sfPtr2+128+20992*i8+5248*j4+15744*k7+512*l4));
sfIm17 = _mm512_add_ps(sfIm17, _mm512_loadu_ps(sfPtr2+192+20992*i8+5248*j4+15744*k7+512*l4));
sfRe18 = _mm512_add_ps(sfRe18, _mm512_loadu_ps(sfPtr2+256+20992*i8+5248*j4+15744*k7+512*l4));
sfIm18 = _mm512_add_ps(sfIm18, _mm512_loadu_ps(sfPtr2+320+20992*i8+5248*j4+15744*k7+512*l4));
sfRe19 = _mm512_add_ps(sfRe19, _mm512_loadu_ps(sfPtr2+384+20992*i8+5248*j4+15744*k7+512*l4));
sfIm19 = _mm512_add_ps(sfIm19, _mm512_loadu_ps(sfPtr2+448+20992*i8+5248*j4+15744*k7+512*l4));
_mm512_storeu_ps(sfPtr2+0+20992*i8+5248*j4+15744*k7+512*l4, sfRe16);
_mm512_storeu_ps(sfPtr2+64+20992*i8+5248*j4+15744*k7+512*l4, sfIm16);
_mm512_storeu_ps(sfPtr2+128+20992*i8+5248*j4+15744*k7+512*l4, sfRe17);
_mm512_storeu_ps(sfPtr2+192+20992*i8+5248*j4+15744*k7+512*l4, sfIm17);
_mm512_storeu_ps(sfPtr2+256+20992*i8+5248*j4+15744*k7+512*l4, sfRe18);
_mm512_storeu_ps(sfPtr2+320+20992*i8+5248*j4+15744*k7+512*l4, sfIm18);
_mm512_storeu_ps(sfPtr2+384+20992*i8+5248*j4+15744*k7+512*l4, sfRe19);
_mm512_storeu_ps(sfPtr2+448+20992*i8+5248*j4+15744*k7+512*l4, sfIm19);
if (l4 >= ll4) return;
}
__m512 sfRe20 = _mm512_setzero_ps();
__m512 sfIm20 = _mm512_setzero_ps();
(void)bfPtr3;
for (ptrdiff_t s9 = 0; s9 < 118; ++s9) {
__m512i wfLd12 = _mm512_loadu_si512(wfPtr3+0+634368*i8+158592*j4+15104*l4+64*s9);
__m512 wfRe12 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd12));
__m512 wfIm12 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd12, 1));
__m512 dfRe8 = _mm512_loadu_ps(dfPtr3+0+60416*i8+15104*j4+45312*k7+128*s9);
__m512 dfIm8 = _mm512_loadu_ps(dfPtr3+64+60416*i8+15104*j4+45312*k7+128*s9);
sfRe20 = _mm512_fmadd_ps(wfRe12, dfRe8, sfRe20);
sfRe20 = _mm512_fmadd_ps(wfIm12, dfIm8, sfRe20);
sfIm20 = _mm512_fmadd_ps(wfRe12, dfIm8, sfIm20);
sfIm20 = _mm512_fnmadd_ps(wfIm12, dfRe8, sfIm20);
}
sfRe20 = _mm512_add_ps(sfRe20, _mm512_loadu_ps(sfPtr2+0+20992*i8+5248*j4+15744*k7+512*l4));
sfIm20 = _mm512_add_ps(sfIm20, _mm512_loadu_ps(sfPtr2+64+20992*i8+5248*j4+15744*k7+512*l4));
_mm512_storeu_ps(sfPtr2+0+20992*i8+5248*j4+15744*k7+512*l4, sfRe20);
_mm512_storeu_ps(sfPtr2+64+20992*i8+5248*j4+15744*k7+512*l4, sfIm20);
}
}

/* Dispatch the produce-sums pass: for each chunk index z3 (0..3), run one
 * threader task over the hull {3, 1, 4, 7}. The tensor list and the two
 * loop indices travel to the callee through a 3-slot pointer tuple. The
 * tuple is reused across iterations — presumably Example7ThreaderDo1 is
 * synchronous, so each task reads its slots before the next overwrite. */
static void Example7StriderProduceSums1(Example7ThreaderTeam1* team16, char** tensors5) {
    void* args[3];
    args[0] = tensors5;
    for (ptrdiff_t e4 = 0; e4 < 1; ++e4) {
        args[1] = (void*)e4;
        for (ptrdiff_t z3 = 0; z3 < 4; ++z3) {
            args[2] = (void*)z3;
            Example7ThreaderTask1 job = {
                .callee1 = Example7StriderProduceSums1Callee1,
                .any1 = args,
                .nd1 = 4,
                .hull1 = {3, 1, 4, 7},
            };
            Example7ThreaderDo1(team16, &job);
        }
    }
}

// Callee for Example7StriderConsumeSums1: reads the Fourier-domain sums
// (sfPtr3), applies the generated inverse-FFT stages, scales, and writes
// the spatial convolution output rows into datPtr2.
// The task hull is {1,1,1}, so w3/d2/g5 below are effectively constants.
static void Example7StriderConsumeSums1Callee1(Example7ThreaderTask1* task10, int64_t* pt10) {
char** tensors8 = task10->any1;
ptrdiff_t w3 = 0;
ptrdiff_t d2 = 0;
ptrdiff_t g5 = 0;
// Task coordinates are unused because the hull is a single point.
(void)pt10;
char*restrict sfPtr3 = tensors8[0];
char*restrict datPtr2 = tensors8[1];
// i9 iterates over 7 consecutive blocks (g5 == 0, so i9 runs 0..6).
ptrdiff_t i9 = 7*g5;
ptrdiff_t ii1 = i9+6;
for (; i9 <= ii1; ++i9) {
ptrdiff_t j5 = 1*d2;
ptrdiff_t rel2 = j5-0;
ptrdiff_t base2 = 0;
ptrdiff_t toH1 = base2+0;
ptrdiff_t toW1 = 0;
// Main tiles: k8 runs 0..9; the leftover tile (k8 == 10) is handled
// by the shorter epilogue after this loop.
ptrdiff_t k8 = 11*w3;
for (; k8 != 10; ++k8) {
ptrdiff_t r2 = 0;
for (; r2 != 2; ++r2) {
ptrdiff_t t2 = 0;
// Load the four real/imaginary sum pairs for each of the four
// 5248-byte sections of the sum buffer.
__m512 sfRe21 = _mm512_loadu_ps(sfPtr3+0+20992*i9+15744*j5+512*k8+256*r2+256*t2);
__m512 sfIm21 = _mm512_loadu_ps(sfPtr3+64+20992*i9+15744*j5+512*k8+256*r2+256*t2);
__m512 sfRe25 = _mm512_loadu_ps(sfPtr3+128+20992*i9+15744*j5+512*k8+256*r2+256*t2);
__m512 sfIm25 = _mm512_loadu_ps(sfPtr3+192+20992*i9+15744*j5+512*k8+256*r2+256*t2);
__m512 sfRe22 = _mm512_loadu_ps(sfPtr3+5248+20992*i9+15744*j5+512*k8+256*r2+256*t2);
__m512 sfIm22 = _mm512_loadu_ps(sfPtr3+5312+20992*i9+15744*j5+512*k8+256*r2+256*t2);
__m512 sfRe26 = _mm512_loadu_ps(sfPtr3+5376+20992*i9+15744*j5+512*k8+256*r2+256*t2);
__m512 sfIm26 = _mm512_loadu_ps(sfPtr3+5440+20992*i9+15744*j5+512*k8+256*r2+256*t2);
__m512 sfRe23 = _mm512_loadu_ps(sfPtr3+10496+20992*i9+15744*j5+512*k8+256*r2+256*t2);
__m512 sfIm23 = _mm512_loadu_ps(sfPtr3+10560+20992*i9+15744*j5+512*k8+256*r2+256*t2);
__m512 sfRe27 = _mm512_loadu_ps(sfPtr3+10624+20992*i9+15744*j5+512*k8+256*r2+256*t2);
__m512 sfIm27 = _mm512_loadu_ps(sfPtr3+10688+20992*i9+15744*j5+512*k8+256*r2+256*t2);
__m512 sfRe24 = _mm512_loadu_ps(sfPtr3+15744+20992*i9+15744*j5+512*k8+256*r2+256*t2);
__m512 sfIm24 = _mm512_loadu_ps(sfPtr3+15808+20992*i9+15744*j5+512*k8+256*r2+256*t2);
__m512 sfRe28 = _mm512_loadu_ps(sfPtr3+15872+20992*i9+15744*j5+512*k8+256*r2+256*t2);
__m512 sfIm28 = _mm512_loadu_ps(sfPtr3+15936+20992*i9+15744*j5+512*k8+256*r2+256*t2);
// Inverse-FFT stage 1: unpack the conjugate-symmetric spectrum via
// lane permutes; the masks/sign vectors recombine Re/Im halves.
__m512i ifft1 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2 = _mm512_permutexvar_ps(ifft1, sfRe21);
__m512 ifft93 = _mm512_permutexvar_ps(ifft1, sfRe25);
__m512i ifft3 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4 = _mm512_permutexvar_ps(ifft3, sfRe21);
__m512 ifft94 = _mm512_permutexvar_ps(ifft3, sfRe25);
__m512 ifft5 = _mm512_permutexvar_ps(ifft1, sfIm21);
__m512 ifft95 = _mm512_permutexvar_ps(ifft1, sfIm25);
__m512 ifft6 = _mm512_permutexvar_ps(ifft3, sfIm21);
__m512 ifft96 = _mm512_permutexvar_ps(ifft3, sfIm25);
__m512 ifft7 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft8 = _mm512_mask_fmadd_ps(ifft6, 65021, ifft7, ifft2);
__m512 ifft97 = _mm512_mask_fmadd_ps(ifft96, 65021, ifft7, ifft93);
__m512 ifft9 = _mm512_mask_fnmadd_ps(ifft5, 65021, ifft7, ifft4);
__m512 ifft98 = _mm512_mask_fnmadd_ps(ifft95, 65021, ifft7, ifft94);
// Stage 2: radix-2 butterflies between adjacent lanes
// (shuffle immediate 177 swaps lane pairs).
__m512 ifft10 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft11 = _mm512_fmadd_ps(ifft8, ifft10, _mm512_shuffle_ps(ifft8, ifft8, 177));
__m512 ifft99 = _mm512_fmadd_ps(ifft97, ifft10, _mm512_shuffle_ps(ifft97, ifft97, 177));
__m512 ifft12 = _mm512_fmadd_ps(ifft9, ifft10, _mm512_shuffle_ps(ifft9, ifft9, 177));
__m512 ifft100 = _mm512_fmadd_ps(ifft98, ifft10, _mm512_shuffle_ps(ifft98, ifft98, 177));
__m512 ifft13 = _mm512_fmadd_ps(sfRe22, ifft10, _mm512_shuffle_ps(sfRe22, sfRe22, 177));
__m512 ifft101 = _mm512_fmadd_ps(sfRe26, ifft10, _mm512_shuffle_ps(sfRe26, sfRe26, 177));
__m512 ifft14 = _mm512_fmadd_ps(sfIm22, ifft10, _mm512_shuffle_ps(sfIm22, sfIm22, 177));
__m512 ifft102 = _mm512_fmadd_ps(sfIm26, ifft10, _mm512_shuffle_ps(sfIm26, sfIm26, 177));
__m512 ifft15 = _mm512_fmadd_ps(sfRe23, ifft10, _mm512_shuffle_ps(sfRe23, sfRe23, 177));
__m512 ifft103 = _mm512_fmadd_ps(sfRe27, ifft10, _mm512_shuffle_ps(sfRe27, sfRe27, 177));
__m512 ifft16 = _mm512_fmadd_ps(sfIm23, ifft10, _mm512_shuffle_ps(sfIm23, sfIm23, 177));
__m512 ifft104 = _mm512_fmadd_ps(sfIm27, ifft10, _mm512_shuffle_ps(sfIm27, sfIm27, 177));
__m512 ifft17 = _mm512_fmadd_ps(sfRe24, ifft10, _mm512_shuffle_ps(sfRe24, sfRe24, 177));
__m512 ifft105 = _mm512_fmadd_ps(sfRe28, ifft10, _mm512_shuffle_ps(sfRe28, sfRe28, 177));
__m512 ifft18 = _mm512_fmadd_ps(sfIm24, ifft10, _mm512_shuffle_ps(sfIm24, sfIm24, 177));
__m512 ifft106 = _mm512_fmadd_ps(sfIm28, ifft10, _mm512_shuffle_ps(sfIm28, sfIm28, 177));
// Stage 3: apply twiddle factors (+-1/sqrt(2) = +-7.0710677e-01).
__m512 ifft19 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft20 = _mm512_mul_ps(ifft11, ifft19);
__m512 ifft107 = _mm512_mul_ps(ifft99, ifft19);
__m512 ifft21 = _mm512_mul_ps(ifft12, ifft19);
__m512 ifft108 = _mm512_mul_ps(ifft100, ifft19);
__m512 ifft22 = _mm512_mul_ps(ifft13, ifft19);
__m512 ifft109 = _mm512_mul_ps(ifft101, ifft19);
__m512 ifft23 = _mm512_mul_ps(ifft14, ifft19);
__m512 ifft110 = _mm512_mul_ps(ifft102, ifft19);
__m512 ifft24 = _mm512_mul_ps(ifft15, ifft19);
__m512 ifft111 = _mm512_mul_ps(ifft103, ifft19);
__m512 ifft25 = _mm512_mul_ps(ifft16, ifft19);
__m512 ifft112 = _mm512_mul_ps(ifft104, ifft19);
__m512 ifft26 = _mm512_mul_ps(ifft17, ifft19);
__m512 ifft113 = _mm512_mul_ps(ifft105, ifft19);
__m512 ifft27 = _mm512_mul_ps(ifft18, ifft19);
__m512 ifft114 = _mm512_mul_ps(ifft106, ifft19);
__m512 ifft28 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft29 = _mm512_fnmadd_ps(ifft12, ifft28, ifft20);
__m512 ifft115 = _mm512_fnmadd_ps(ifft100, ifft28, ifft107);
__m512 ifft30 = _mm512_fmadd_ps(ifft11, ifft28, ifft21);
__m512 ifft116 = _mm512_fmadd_ps(ifft99, ifft28, ifft108);
__m512 ifft31 = _mm512_fnmadd_ps(ifft14, ifft28, ifft22);
__m512 ifft117 = _mm512_fnmadd_ps(ifft102, ifft28, ifft109);
__m512 ifft32 = _mm512_fmadd_ps(ifft13, ifft28, ifft23);
__m512 ifft118 = _mm512_fmadd_ps(ifft101, ifft28, ifft110);
__m512 ifft33 = _mm512_fnmadd_ps(ifft16, ifft28, ifft24);
__m512 ifft119 = _mm512_fnmadd_ps(ifft104, ifft28, ifft111);
__m512 ifft34 = _mm512_fmadd_ps(ifft15, ifft28, ifft25);
__m512 ifft120 = _mm512_fmadd_ps(ifft103, ifft28, ifft112);
__m512 ifft35 = _mm512_fnmadd_ps(ifft18, ifft28, ifft26);
__m512 ifft121 = _mm512_fnmadd_ps(ifft106, ifft28, ifft113);
__m512 ifft36 = _mm512_fmadd_ps(ifft17, ifft28, ifft27);
__m512 ifft122 = _mm512_fmadd_ps(ifft105, ifft28, ifft114);
// Stage 4: butterflies between lane pairs two apart
// (shuffle immediate 78 swaps 64-bit halves within each 128-bit lane).
__m512 ifft37 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft38 = _mm512_fmadd_ps(ifft29, ifft37, _mm512_shuffle_ps(ifft29, ifft29, 78));
__m512 ifft123 = _mm512_fmadd_ps(ifft115, ifft37, _mm512_shuffle_ps(ifft115, ifft115, 78));
__m512 ifft39 = _mm512_fmadd_ps(ifft30, ifft37, _mm512_shuffle_ps(ifft30, ifft30, 78));
__m512 ifft124 = _mm512_fmadd_ps(ifft116, ifft37, _mm512_shuffle_ps(ifft116, ifft116, 78));
__m512 ifft40 = _mm512_fmadd_ps(ifft31, ifft37, _mm512_shuffle_ps(ifft31, ifft31, 78));
__m512 ifft125 = _mm512_fmadd_ps(ifft117, ifft37, _mm512_shuffle_ps(ifft117, ifft117, 78));
__m512 ifft41 = _mm512_fmadd_ps(ifft32, ifft37, _mm512_shuffle_ps(ifft32, ifft32, 78));
__m512 ifft126 = _mm512_fmadd_ps(ifft118, ifft37, _mm512_shuffle_ps(ifft118, ifft118, 78));
__m512 ifft42 = _mm512_fmadd_ps(ifft33, ifft37, _mm512_shuffle_ps(ifft33, ifft33, 78));
__m512 ifft127 = _mm512_fmadd_ps(ifft119, ifft37, _mm512_shuffle_ps(ifft119, ifft119, 78));
__m512 ifft43 = _mm512_fmadd_ps(ifft34, ifft37, _mm512_shuffle_ps(ifft34, ifft34, 78));
__m512 ifft128 = _mm512_fmadd_ps(ifft120, ifft37, _mm512_shuffle_ps(ifft120, ifft120, 78));
__m512 ifft44 = _mm512_fmadd_ps(ifft35, ifft37, _mm512_shuffle_ps(ifft35, ifft35, 78));
__m512 ifft129 = _mm512_fmadd_ps(ifft121, ifft37, _mm512_shuffle_ps(ifft121, ifft121, 78));
__m512 ifft45 = _mm512_fmadd_ps(ifft36, ifft37, _mm512_shuffle_ps(ifft36, ifft36, 78));
__m512 ifft130 = _mm512_fmadd_ps(ifft122, ifft37, _mm512_shuffle_ps(ifft122, ifft122, 78));
// Stage 5: multiply selected lanes (mask 49344) by -i
// (swap Re/Im with sign flip).
__m512 ifft46 = _mm512_mask_sub_ps(ifft38, 49344, _mm512_setzero_ps(), ifft39);
__m512 ifft131 = _mm512_mask_sub_ps(ifft123, 49344, _mm512_setzero_ps(), ifft124);
__m512 ifft47 = _mm512_mask_mov_ps(ifft39, 49344, ifft38);
__m512 ifft132 = _mm512_mask_mov_ps(ifft124, 49344, ifft123);
__m512 ifft48 = _mm512_mask_sub_ps(ifft40, 49344, _mm512_setzero_ps(), ifft41);
__m512 ifft133 = _mm512_mask_sub_ps(ifft125, 49344, _mm512_setzero_ps(), ifft126);
__m512 ifft49 = _mm512_mask_mov_ps(ifft41, 49344, ifft40);
__m512 ifft134 = _mm512_mask_mov_ps(ifft126, 49344, ifft125);
__m512 ifft50 = _mm512_mask_sub_ps(ifft42, 49344, _mm512_setzero_ps(), ifft43);
__m512 ifft135 = _mm512_mask_sub_ps(ifft127, 49344, _mm512_setzero_ps(), ifft128);
__m512 ifft51 = _mm512_mask_mov_ps(ifft43, 49344, ifft42);
__m512 ifft136 = _mm512_mask_mov_ps(ifft128, 49344, ifft127);
__m512 ifft52 = _mm512_mask_sub_ps(ifft44, 49344, _mm512_setzero_ps(), ifft45);
__m512 ifft137 = _mm512_mask_sub_ps(ifft129, 49344, _mm512_setzero_ps(), ifft130);
__m512 ifft53 = _mm512_mask_mov_ps(ifft45, 49344, ifft44);
__m512 ifft138 = _mm512_mask_mov_ps(ifft130, 49344, ifft129);
// Stage 6: butterflies between the two 128-bit halves of each 256-bit
// lane (shuffle_f32x4 immediate 177 swaps adjacent 128-bit lanes).
__m512 ifft54 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft55 = _mm512_fmadd_ps(ifft46, ifft54, _mm512_shuffle_f32x4(ifft46, ifft46, 177));
__m512 ifft139 = _mm512_fmadd_ps(ifft131, ifft54, _mm512_shuffle_f32x4(ifft131, ifft131, 177));
__m512 ifft56 = _mm512_fmadd_ps(ifft47, ifft54, _mm512_shuffle_f32x4(ifft47, ifft47, 177));
__m512 ifft140 = _mm512_fmadd_ps(ifft132, ifft54, _mm512_shuffle_f32x4(ifft132, ifft132, 177));
__m512 ifft57 = _mm512_fmadd_ps(ifft48, ifft54, _mm512_shuffle_f32x4(ifft48, ifft48, 177));
__m512 ifft141 = _mm512_fmadd_ps(ifft133, ifft54, _mm512_shuffle_f32x4(ifft133, ifft133, 177));
__m512 ifft58 = _mm512_fmadd_ps(ifft49, ifft54, _mm512_shuffle_f32x4(ifft49, ifft49, 177));
__m512 ifft142 = _mm512_fmadd_ps(ifft134, ifft54, _mm512_shuffle_f32x4(ifft134, ifft134, 177));
__m512 ifft59 = _mm512_fmadd_ps(ifft50, ifft54, _mm512_shuffle_f32x4(ifft50, ifft50, 177));
__m512 ifft143 = _mm512_fmadd_ps(ifft135, ifft54, _mm512_shuffle_f32x4(ifft135, ifft135, 177));
__m512 ifft60 = _mm512_fnmsub_ps(ifft51, ifft54, _mm512_shuffle_f32x4(ifft51, ifft51, 177));
__m512 ifft144 = _mm512_fnmsub_ps(ifft136, ifft54, _mm512_shuffle_f32x4(ifft136, ifft136, 177));
__m512 ifft61 = _mm512_fmadd_ps(ifft52, ifft54, _mm512_shuffle_f32x4(ifft52, ifft52, 177));
__m512 ifft145 = _mm512_fmadd_ps(ifft137, ifft54, _mm512_shuffle_f32x4(ifft137, ifft137, 177));
__m512 ifft62 = _mm512_fmadd_ps(ifft53, ifft54, _mm512_shuffle_f32x4(ifft53, ifft53, 177));
__m512 ifft146 = _mm512_fmadd_ps(ifft138, ifft54, _mm512_shuffle_f32x4(ifft138, ifft138, 177));
// Final recombination with normalization: 3.125e-02 = 1/32,
// 1.5625e-02 = 1/64 (inverse-transform scaling).
__m512 ifft63 = _mm512_add_ps(ifft55, ifft56);
__m512 ifft147 = _mm512_add_ps(ifft139, ifft140);
__m512 ifft64 = _mm512_sub_ps(ifft55, ifft56);
__m512 ifft148 = _mm512_sub_ps(ifft139, ifft140);
__m512 ifft65 = _mm512_sub_ps(ifft57, ifft61);
__m512 ifft149 = _mm512_sub_ps(ifft141, ifft145);
__m512 ifft66 = _mm512_add_ps(ifft58, ifft62);
__m512 ifft150 = _mm512_add_ps(ifft142, ifft146);
__m512 ifft67 = _mm512_add_ps(ifft57, ifft61);
__m512 ifft151 = _mm512_add_ps(ifft141, ifft145);
__m512 ifft68 = _mm512_sub_ps(ifft58, ifft62);
__m512 ifft152 = _mm512_sub_ps(ifft142, ifft146);
__m512 ifft69 = _mm512_mul_ps(ifft59, _mm512_set1_ps(3.125e-02f));
__m512 ifft153 = _mm512_mul_ps(ifft143, _mm512_set1_ps(3.125e-02f));
__m512 ifft70 = _mm512_mul_ps(ifft60, _mm512_set1_ps(3.125e-02f));
__m512 ifft154 = _mm512_mul_ps(ifft144, _mm512_set1_ps(3.125e-02f));
__m512 ifft71 = _mm512_fmadd_ps(ifft63, _mm512_set1_ps(1.5625e-02f), ifft69);
__m512 ifft155 = _mm512_fmadd_ps(ifft147, _mm512_set1_ps(1.5625e-02f), ifft153);
__m512 ifft72 = _mm512_fmsub_ps(ifft63, _mm512_set1_ps(1.5625e-02f), ifft69);
__m512 ifft156 = _mm512_fmsub_ps(ifft147, _mm512_set1_ps(1.5625e-02f), ifft153);
__m512 ifft73 = _mm512_fmadd_ps(ifft64, _mm512_set1_ps(1.5625e-02f), ifft70);
__m512 ifft157 = _mm512_fmadd_ps(ifft148, _mm512_set1_ps(1.5625e-02f), ifft154);
__m512 ifft74 = _mm512_fmsub_ps(ifft64, _mm512_set1_ps(1.5625e-02f), ifft70);
__m512 ifft158 = _mm512_fmsub_ps(ifft148, _mm512_set1_ps(1.5625e-02f), ifft154);
__m512 ifft75 = _mm512_add_ps(ifft65, ifft66);
__m512 ifft159 = _mm512_add_ps(ifft149, ifft150);
__m512 ifft76 = _mm512_sub_ps(ifft65, ifft66);
__m512 ifft160 = _mm512_sub_ps(ifft149, ifft150);
__m512 ifft77 = _mm512_fnmadd_ps(ifft75, _mm512_set1_ps(7.0710677e-01f), ifft67);
__m512 ifft161 = _mm512_fnmadd_ps(ifft159, _mm512_set1_ps(7.0710677e-01f), ifft151);
__m512 ifft78 = _mm512_fmadd_ps(ifft75, _mm512_set1_ps(7.0710677e-01f), ifft67);
__m512 ifft162 = _mm512_fmadd_ps(ifft159, _mm512_set1_ps(7.0710677e-01f), ifft151);
__m512 ifft79 = _mm512_fmadd_ps(ifft76, _mm512_set1_ps(7.0710677e-01f), ifft68);
__m512 ifft163 = _mm512_fmadd_ps(ifft160, _mm512_set1_ps(7.0710677e-01f), ifft152);
__m512 ifft80 = _mm512_fmsub_ps(ifft76, _mm512_set1_ps(7.0710677e-01f), ifft68);
__m512 ifft164 = _mm512_fmsub_ps(ifft160, _mm512_set1_ps(7.0710677e-01f), ifft152);
__m512 ifft81 = _mm512_add_ps(ifft77, ifft78);
__m512 ifft165 = _mm512_add_ps(ifft161, ifft162);
__m512 ifft82 = _mm512_sub_ps(ifft77, ifft78);
__m512 ifft166 = _mm512_sub_ps(ifft161, ifft162);
__m512 ifft83 = _mm512_add_ps(ifft79, ifft80);
__m512 ifft167 = _mm512_add_ps(ifft163, ifft164);
__m512 ifft84 = _mm512_sub_ps(ifft79, ifft80);
__m512 ifft168 = _mm512_sub_ps(ifft163, ifft164);
__m512 ifft85 = _mm512_fmadd_ps(ifft81, _mm512_set1_ps(1.5625e-02f), ifft71);
__m512 ifft169 = _mm512_fmadd_ps(ifft165, _mm512_set1_ps(1.5625e-02f), ifft155);
__m512 ifft86 = _mm512_fnmadd_ps(ifft81, _mm512_set1_ps(1.5625e-02f), ifft71);
__m512 ifft170 = _mm512_fnmadd_ps(ifft165, _mm512_set1_ps(1.5625e-02f), ifft155);
__m512 ifft87 = _mm512_fmadd_ps(ifft83, _mm512_set1_ps(1.5625e-02f), ifft73);
__m512 ifft171 = _mm512_fmadd_ps(ifft167, _mm512_set1_ps(1.5625e-02f), ifft157);
__m512 ifft88 = _mm512_fnmadd_ps(ifft83, _mm512_set1_ps(1.5625e-02f), ifft73);
__m512 ifft172 = _mm512_fnmadd_ps(ifft167, _mm512_set1_ps(1.5625e-02f), ifft157);
__m512 ifft89 = _mm512_fnmadd_ps(ifft84, _mm512_set1_ps(1.5625e-02f), ifft72);
__m512 ifft173 = _mm512_fnmadd_ps(ifft168, _mm512_set1_ps(1.5625e-02f), ifft156);
__m512 ifft90 = _mm512_fmadd_ps(ifft84, _mm512_set1_ps(1.5625e-02f), ifft72);
__m512 ifft174 = _mm512_fmadd_ps(ifft168, _mm512_set1_ps(1.5625e-02f), ifft156);
__m512 ifft91 = _mm512_fmadd_ps(ifft82, _mm512_set1_ps(1.5625e-02f), ifft74);
__m512 ifft175 = _mm512_fmadd_ps(ifft166, _mm512_set1_ps(1.5625e-02f), ifft158);
__m512 ifft92 = _mm512_fnmadd_ps(ifft82, _mm512_set1_ps(1.5625e-02f), ifft74);
__m512 ifft176 = _mm512_fnmadd_ps(ifft166, _mm512_set1_ps(1.5625e-02f), ifft158);
// Keep the spatial rows that fall inside the output tensor; the
// remaining IFFT results are overlap/padding and are discarded.
__m512 dat29 = ifft85;
__m512 dat35 = ifft169;
__m512 dat30 = ifft87;
__m512 dat36 = ifft171;
__m512 dat31 = ifft89;
__m512 dat37 = ifft173;
__m512 dat32 = ifft91;
__m512 dat38 = ifft175;
__m512 dat33 = ifft86;
__m512 dat39 = ifft170;
__m512 dat34 = ifft88;
__m512 dat40 = ifft172;
(void)ifft90;
(void)ifft174;
(void)ifft92;
(void)ifft176;
// Pack the useful lanes of each dat pair into contiguous vectors,
// then store 11 floats per row (mask 2047 = low 11 lanes; row stride
// in datPtr2 is 44 bytes = 11 floats).
__m512i pm1 = _mm512_set_epi32(1, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0);
__m512 pack1 = _mm512_permutex2var_ps(dat29, pm1, dat35);
__m512i pm2 = _mm512_set_epi32(25, 24, 14, 13, 12, 11, 10, 9, 8, 30, 29, 28, 27, 26, 25, 24);
__m512 pack2 = _mm512_permutex2var_ps(dat29, pm2, dat35);
__m512 pack3 = _mm512_permutex2var_ps(dat30, pm1, dat36);
__m512 pack4 = _mm512_permutex2var_ps(dat30, pm2, dat36);
__m512 pack5 = _mm512_permutex2var_ps(dat31, pm1, dat37);
__m512 pack6 = _mm512_permutex2var_ps(dat31, pm2, dat37);
__m512 pack7 = _mm512_permutex2var_ps(dat32, pm1, dat38);
__m512 pack8 = _mm512_permutex2var_ps(dat32, pm2, dat38);
__m512 pack9 = _mm512_permutex2var_ps(dat33, pm1, dat39);
__m512 pack10 = _mm512_permutex2var_ps(dat33, pm2, dat39);
__m512 pack11 = _mm512_permutex2var_ps(dat34, pm1, dat40);
__m512 pack12 = _mm512_permutex2var_ps(dat34, pm2, dat40);
_mm512_mask_storeu_ps(datPtr2+0+10824*i9+1056*k8+528*r2+44*toH1+4*toW1+0*t2, 2047, pack1);
_mm512_mask_storeu_ps(datPtr2+264+10824*i9+1056*k8+528*r2+44*toH1+4*toW1+0*t2, 2047, pack2);
_mm512_mask_storeu_ps(datPtr2+44+10824*i9+1056*k8+528*r2+44*toH1+4*toW1+0*t2, 2047, pack3);
_mm512_mask_storeu_ps(datPtr2+308+10824*i9+1056*k8+528*r2+44*toH1+4*toW1+0*t2, 2047, pack4);
_mm512_mask_storeu_ps(datPtr2+88+10824*i9+1056*k8+528*r2+44*toH1+4*toW1+0*t2, 2047, pack5);
_mm512_mask_storeu_ps(datPtr2+352+10824*i9+1056*k8+528*r2+44*toH1+4*toW1+0*t2, 2047, pack6);
_mm512_mask_storeu_ps(datPtr2+132+10824*i9+1056*k8+528*r2+44*toH1+4*toW1+0*t2, 2047, pack7);
_mm512_mask_storeu_ps(datPtr2+396+10824*i9+1056*k8+528*r2+44*toH1+4*toW1+0*t2, 2047, pack8);
_mm512_mask_storeu_ps(datPtr2+176+10824*i9+1056*k8+528*r2+44*toH1+4*toW1+0*t2, 2047, pack9);
_mm512_mask_storeu_ps(datPtr2+440+10824*i9+1056*k8+528*r2+44*toH1+4*toW1+0*t2, 2047, pack10);
_mm512_mask_storeu_ps(datPtr2+220+10824*i9+1056*k8+528*r2+44*toH1+4*toW1+0*t2, 2047, pack11);
_mm512_mask_storeu_ps(datPtr2+484+10824*i9+1056*k8+528*r2+44*toH1+4*toW1+0*t2, 2047, pack12);
}
}
// Epilogue: the final (narrower) tile of the block, k8 == 10 here.
// Same inverse-FFT pipeline as above but for a single sum pair per
// section, producing six output rows.
ptrdiff_t r3 = 0;
ptrdiff_t t3 = 0;
__m512 sfRe29 = _mm512_loadu_ps(sfPtr3+0+20992*i9+15744*j5+512*k8+256*r3+128*t3);
__m512 sfIm29 = _mm512_loadu_ps(sfPtr3+64+20992*i9+15744*j5+512*k8+256*r3+128*t3);
__m512 sfRe30 = _mm512_loadu_ps(sfPtr3+5248+20992*i9+15744*j5+512*k8+256*r3+128*t3);
__m512 sfIm30 = _mm512_loadu_ps(sfPtr3+5312+20992*i9+15744*j5+512*k8+256*r3+128*t3);
__m512 sfRe31 = _mm512_loadu_ps(sfPtr3+10496+20992*i9+15744*j5+512*k8+256*r3+128*t3);
__m512 sfIm31 = _mm512_loadu_ps(sfPtr3+10560+20992*i9+15744*j5+512*k8+256*r3+128*t3);
__m512 sfRe32 = _mm512_loadu_ps(sfPtr3+15744+20992*i9+15744*j5+512*k8+256*r3+128*t3);
__m512 sfIm32 = _mm512_loadu_ps(sfPtr3+15808+20992*i9+15744*j5+512*k8+256*r3+128*t3);
__m512i ifft177 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft178 = _mm512_permutexvar_ps(ifft177, sfRe29);
__m512i ifft179 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft180 = _mm512_permutexvar_ps(ifft179, sfRe29);
__m512 ifft181 = _mm512_permutexvar_ps(ifft177, sfIm29);
__m512 ifft182 = _mm512_permutexvar_ps(ifft179, sfIm29);
__m512 ifft183 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft184 = _mm512_mask_fmadd_ps(ifft182, 65021, ifft183, ifft178);
__m512 ifft185 = _mm512_mask_fnmadd_ps(ifft181, 65021, ifft183, ifft180);
__m512 ifft186 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft187 = _mm512_fmadd_ps(ifft184, ifft186, _mm512_shuffle_ps(ifft184, ifft184, 177));
__m512 ifft188 = _mm512_fmadd_ps(ifft185, ifft186, _mm512_shuffle_ps(ifft185, ifft185, 177));
__m512 ifft189 = _mm512_fmadd_ps(sfRe30, ifft186, _mm512_shuffle_ps(sfRe30, sfRe30, 177));
__m512 ifft190 = _mm512_fmadd_ps(sfIm30, ifft186, _mm512_shuffle_ps(sfIm30, sfIm30, 177));
__m512 ifft191 = _mm512_fmadd_ps(sfRe31, ifft186, _mm512_shuffle_ps(sfRe31, sfRe31, 177));
__m512 ifft192 = _mm512_fmadd_ps(sfIm31, ifft186, _mm512_shuffle_ps(sfIm31, sfIm31, 177));
__m512 ifft193 = _mm512_fmadd_ps(sfRe32, ifft186, _mm512_shuffle_ps(sfRe32, sfRe32, 177));
__m512 ifft194 = _mm512_fmadd_ps(sfIm32, ifft186, _mm512_shuffle_ps(sfIm32, sfIm32, 177));
__m512 ifft195 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft196 = _mm512_mul_ps(ifft187, ifft195);
__m512 ifft197 = _mm512_mul_ps(ifft188, ifft195);
__m512 ifft198 = _mm512_mul_ps(ifft189, ifft195);
__m512 ifft199 = _mm512_mul_ps(ifft190, ifft195);
__m512 ifft200 = _mm512_mul_ps(ifft191, ifft195);
__m512 ifft201 = _mm512_mul_ps(ifft192, ifft195);
__m512 ifft202 = _mm512_mul_ps(ifft193, ifft195);
__m512 ifft203 = _mm512_mul_ps(ifft194, ifft195);
__m512 ifft204 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft205 = _mm512_fnmadd_ps(ifft188, ifft204, ifft196);
__m512 ifft206 = _mm512_fmadd_ps(ifft187, ifft204, ifft197);
__m512 ifft207 = _mm512_fnmadd_ps(ifft190, ifft204, ifft198);
__m512 ifft208 = _mm512_fmadd_ps(ifft189, ifft204, ifft199);
__m512 ifft209 = _mm512_fnmadd_ps(ifft192, ifft204, ifft200);
__m512 ifft210 = _mm512_fmadd_ps(ifft191, ifft204, ifft201);
__m512 ifft211 = _mm512_fnmadd_ps(ifft194, ifft204, ifft202);
__m512 ifft212 = _mm512_fmadd_ps(ifft193, ifft204, ifft203);
__m512 ifft213 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft214 = _mm512_fmadd_ps(ifft205, ifft213, _mm512_shuffle_ps(ifft205, ifft205, 78));
__m512 ifft215 = _mm512_fmadd_ps(ifft206, ifft213, _mm512_shuffle_ps(ifft206, ifft206, 78));
__m512 ifft216 = _mm512_fmadd_ps(ifft207, ifft213, _mm512_shuffle_ps(ifft207, ifft207, 78));
__m512 ifft217 = _mm512_fmadd_ps(ifft208, ifft213, _mm512_shuffle_ps(ifft208, ifft208, 78));
__m512 ifft218 = _mm512_fmadd_ps(ifft209, ifft213, _mm512_shuffle_ps(ifft209, ifft209, 78));
__m512 ifft219 = _mm512_fmadd_ps(ifft210, ifft213, _mm512_shuffle_ps(ifft210, ifft210, 78));
__m512 ifft220 = _mm512_fmadd_ps(ifft211, ifft213, _mm512_shuffle_ps(ifft211, ifft211, 78));
__m512 ifft221 = _mm512_fmadd_ps(ifft212, ifft213, _mm512_shuffle_ps(ifft212, ifft212, 78));
__m512 ifft222 = _mm512_mask_sub_ps(ifft214, 49344, _mm512_setzero_ps(), ifft215);
__m512 ifft223 = _mm512_mask_mov_ps(ifft215, 49344, ifft214);
__m512 ifft224 = _mm512_mask_sub_ps(ifft216, 49344, _mm512_setzero_ps(), ifft217);
__m512 ifft225 = _mm512_mask_mov_ps(ifft217, 49344, ifft216);
__m512 ifft226 = _mm512_mask_sub_ps(ifft218, 49344, _mm512_setzero_ps(), ifft219);
__m512 ifft227 = _mm512_mask_mov_ps(ifft219, 49344, ifft218);
__m512 ifft228 = _mm512_mask_sub_ps(ifft220, 49344, _mm512_setzero_ps(), ifft221);
__m512 ifft229 = _mm512_mask_mov_ps(ifft221, 49344, ifft220);
__m512 ifft230 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft231 = _mm512_fmadd_ps(ifft222, ifft230, _mm512_shuffle_f32x4(ifft222, ifft222, 177));
__m512 ifft232 = _mm512_fmadd_ps(ifft223, ifft230, _mm512_shuffle_f32x4(ifft223, ifft223, 177));
__m512 ifft233 = _mm512_fmadd_ps(ifft224, ifft230, _mm512_shuffle_f32x4(ifft224, ifft224, 177));
__m512 ifft234 = _mm512_fmadd_ps(ifft225, ifft230, _mm512_shuffle_f32x4(ifft225, ifft225, 177));
__m512 ifft235 = _mm512_fmadd_ps(ifft226, ifft230, _mm512_shuffle_f32x4(ifft226, ifft226, 177));
__m512 ifft236 = _mm512_fnmsub_ps(ifft227, ifft230, _mm512_shuffle_f32x4(ifft227, ifft227, 177));
__m512 ifft237 = _mm512_fmadd_ps(ifft228, ifft230, _mm512_shuffle_f32x4(ifft228, ifft228, 177));
__m512 ifft238 = _mm512_fmadd_ps(ifft229, ifft230, _mm512_shuffle_f32x4(ifft229, ifft229, 177));
__m512 ifft239 = _mm512_add_ps(ifft231, ifft232);
__m512 ifft240 = _mm512_sub_ps(ifft231, ifft232);
__m512 ifft241 = _mm512_sub_ps(ifft233, ifft237);
__m512 ifft242 = _mm512_add_ps(ifft234, ifft238);
__m512 ifft243 = _mm512_add_ps(ifft233, ifft237);
__m512 ifft244 = _mm512_sub_ps(ifft234, ifft238);
__m512 ifft245 = _mm512_mul_ps(ifft235, _mm512_set1_ps(3.125e-02f));
__m512 ifft246 = _mm512_mul_ps(ifft236, _mm512_set1_ps(3.125e-02f));
__m512 ifft247 = _mm512_fmadd_ps(ifft239, _mm512_set1_ps(1.5625e-02f), ifft245);
__m512 ifft248 = _mm512_fmsub_ps(ifft239, _mm512_set1_ps(1.5625e-02f), ifft245);
__m512 ifft249 = _mm512_fmadd_ps(ifft240, _mm512_set1_ps(1.5625e-02f), ifft246);
__m512 ifft250 = _mm512_fmsub_ps(ifft240, _mm512_set1_ps(1.5625e-02f), ifft246);
__m512 ifft251 = _mm512_add_ps(ifft241, ifft242);
__m512 ifft252 = _mm512_sub_ps(ifft241, ifft242);
__m512 ifft253 = _mm512_fnmadd_ps(ifft251, _mm512_set1_ps(7.0710677e-01f), ifft243);
__m512 ifft254 = _mm512_fmadd_ps(ifft251, _mm512_set1_ps(7.0710677e-01f), ifft243);
__m512 ifft255 = _mm512_fmadd_ps(ifft252, _mm512_set1_ps(7.0710677e-01f), ifft244);
__m512 ifft256 = _mm512_fmsub_ps(ifft252, _mm512_set1_ps(7.0710677e-01f), ifft244);
__m512 ifft257 = _mm512_add_ps(ifft253, ifft254);
__m512 ifft258 = _mm512_sub_ps(ifft253, ifft254);
__m512 ifft259 = _mm512_add_ps(ifft255, ifft256);
__m512 ifft260 = _mm512_sub_ps(ifft255, ifft256);
__m512 ifft261 = _mm512_fmadd_ps(ifft257, _mm512_set1_ps(1.5625e-02f), ifft247);
__m512 ifft262 = _mm512_fnmadd_ps(ifft257, _mm512_set1_ps(1.5625e-02f), ifft247);
__m512 ifft263 = _mm512_fmadd_ps(ifft259, _mm512_set1_ps(1.5625e-02f), ifft249);
__m512 ifft264 = _mm512_fnmadd_ps(ifft259, _mm512_set1_ps(1.5625e-02f), ifft249);
__m512 ifft265 = _mm512_fnmadd_ps(ifft260, _mm512_set1_ps(1.5625e-02f), ifft248);
__m512 ifft266 = _mm512_fmadd_ps(ifft260, _mm512_set1_ps(1.5625e-02f), ifft248);
__m512 ifft267 = _mm512_fmadd_ps(ifft258, _mm512_set1_ps(1.5625e-02f), ifft250);
__m512 ifft268 = _mm512_fnmadd_ps(ifft258, _mm512_set1_ps(1.5625e-02f), ifft250);
// Six output rows survive; the last two IFFT results are discarded.
__m512 dat41 = ifft261;
__m512 dat42 = ifft263;
__m512 dat43 = ifft265;
__m512 dat44 = ifft267;
__m512 dat45 = ifft262;
__m512 dat46 = ifft264;
(void)ifft266;
(void)ifft268;
// Compact the wanted lanes and store 11 floats per row (mask 2047).
__m512i pm3 = _mm512_set_epi32(17, 16, 14, 13, 12, 11, 10, 9, 8, 6, 5, 4, 3, 2, 1, 0);
__m512 pack13 = _mm512_permutexvar_ps(pm3, dat41);
__m512 pack14 = _mm512_permutexvar_ps(pm3, dat42);
__m512 pack15 = _mm512_permutexvar_ps(pm3, dat43);
__m512 pack16 = _mm512_permutexvar_ps(pm3, dat44);
__m512 pack17 = _mm512_permutexvar_ps(pm3, dat45);
__m512 pack18 = _mm512_permutexvar_ps(pm3, dat46);
_mm512_mask_storeu_ps(datPtr2+0+10824*i9+1056*k8+528*r3+44*toH1+4*toW1+0*t3, 2047, pack13);
_mm512_mask_storeu_ps(datPtr2+44+10824*i9+1056*k8+528*r3+44*toH1+4*toW1+0*t3, 2047, pack14);
_mm512_mask_storeu_ps(datPtr2+88+10824*i9+1056*k8+528*r3+44*toH1+4*toW1+0*t3, 2047, pack15);
_mm512_mask_storeu_ps(datPtr2+132+10824*i9+1056*k8+528*r3+44*toH1+4*toW1+0*t3, 2047, pack16);
_mm512_mask_storeu_ps(datPtr2+176+10824*i9+1056*k8+528*r3+44*toH1+4*toW1+0*t3, 2047, pack17);
_mm512_mask_storeu_ps(datPtr2+220+10824*i9+1056*k8+528*r3+44*toH1+4*toW1+0*t3, 2047, pack18);
++j5;
}
}

// Run the sum-consuming (inverse transform + output store) pass as a
// single team task. tensors7 holds {sum buffer, output tensor}; the
// 3-dimensional hull is a single point, so the callee does all the work
// in one invocation.
static void Example7StriderConsumeSums1(Example7ThreaderTeam1* team17, char** tensors7) {
  Example7ThreaderTask1 work;
  work.callee1 = Example7StriderConsumeSums1Callee1;
  work.any1 = tensors7;
  work.nd1 = 3;
  for (ptrdiff_t axis = 0; axis < 3; ++axis) {
    work.hull1[axis] = 1;
  }
  Example7ThreaderDo1(team17, &work);
}

// A net holds the arranged (transform-domain) weights/biases produced at
// creation time; it is immutable afterward and shared by all engines.
struct Example7Net {
char* alloc1; // raw malloc'd backing allocation (freed by Example7NetDestroy)
char* align1; // alloc1 rounded up to the next 64-byte boundary for AVX-512 use
};

// Release the net's backing allocation and the net object itself.
// Tolerates a null pointer (mirrors free()'s contract); the previous
// version dereferenced net2 unconditionally.
void Example7NetDestroy(Example7Net* net2) {
if (!net2) return;
free(net2->alloc1);
free(net2);
}

// Create a net: verify AVX-512F support, allocate the arranged-parameter
// buffer, rearrange the user-supplied weights/biases into it with a
// temporary thread team, and return the net via *net1.
// Returns 0 on success, or a heap-allocated error message (caller frees).
char* Example7NetCreate(
Example7Net** net1,
Example7Params* params1,
ptrdiff_t threads1
) {
// The generated kernels require AVX-512F; refuse to run without it.
if (__builtin_expect(!__builtin_cpu_supports("avx512f"), 0)) {
return Example7Errmsg1(__LINE__, "CPU does not support AVX512F");
}
// Backing storage for the arranged filter data (size fixed at codegen).
char* alloc3 = malloc(17763583);
if (__builtin_expect(!alloc3, 0)) {
return Example7Errmsg1(__LINE__, "errno %d", errno);
}
// Round up to a 64-byte boundary for aligned AVX-512 access.
char* align3 = (void*)(((size_t)alloc3+63)&-64);
// A temporary team arranges the filters once; it is destroyed below.
Example7ThreaderTeam1* team12 = 0;
char* err8 = Example7ThreaderCreate1(&team12, threads1);
if (__builtin_expect(!!err8, 0)) {
free(alloc3);
return err8;
}
{
// {source weights, source biases, destination arranged buffer}
char* tensors12[] = {
(char*)params1->outWeights,
(char*)params1->outBiases,
align3+0
};
Example7StriderArrangeFilts1(team12, tensors12);
}
Example7ThreaderDestroy1(team12);
Example7Net* net5 = malloc(sizeof(Example7Net));
if (__builtin_expect(!net5, 0)) {
char* msg6 = Example7Errmsg1(__LINE__, "errno %d", errno);
free(alloc3);
return msg6;
}
net5->alloc1 = alloc3;
net5->align1 = align3;
*net1 = net5;
return 0;
}

// An engine pairs a (shared, immutable) net with its own thread team and
// its own scratch space, so multiple engines can run inference concurrently.
struct Example7Engine {
Example7Net* net3; // borrowed: the net must outlive the engine
Example7ThreaderTeam1* team11; // worker threads owned by this engine
char* alloc2; // raw malloc'd scratch allocation (freed by Example7EngineDestroy)
char* align2; // alloc2 rounded up to the next 64-byte boundary
};

// Expose the pthread_t of worker thread idx2 of the engine's team
// (e.g. for affinity control). Returns 0 on success or an error message.
char* Example7EnginePthreadT(
Example7Engine* eng2,
ptrdiff_t idx2,
pthread_t* to1
) {
return Example7ThreaderPthreadT1(to1, eng2->team11, idx2);
}

// Tear down the engine: stop its thread team, free its scratch space,
// then the engine itself. Tolerates a null pointer (mirrors free()'s
// contract); the previous version dereferenced eng3 unconditionally.
// The shared net is NOT destroyed here.
void Example7EngineDestroy(Example7Engine* eng3) {
if (!eng3) return;
Example7ThreaderDestroy1(eng3->team11);
free(eng3->alloc2);
free(eng3);
}

// Create an engine bound to net4 with its own thread team and scratch
// buffer; returned via *eng4. Returns 0 on success, or a heap-allocated
// error message (caller frees), with all partial allocations released.
char* Example7EngineCreate(
Example7Engine** eng4,
Example7Net* net4,
ptrdiff_t threads2
) {
Example7Engine* eng5 = malloc(sizeof(Example7Engine));
if (__builtin_expect(!eng5, 0)) {
return Example7Errmsg1(__LINE__, "errno %d", errno);
}
// Per-engine scratch space for intermediate tensors (size fixed at codegen).
char* alloc4 = malloc(1838655);
if (__builtin_expect(!alloc4, 0)) {
char* msg5 = Example7Errmsg1(__LINE__, "errno %d", errno);
free(eng5);
return msg5;
}
eng5->alloc2 = alloc4;
// Round up to a 64-byte boundary for aligned AVX-512 access.
eng5->align2 = (void*)(((size_t)alloc4+63)&-64);
char* err7 = Example7ThreaderCreate1(&eng5->team11, threads2);
if (__builtin_expect(!!err7, 0)) {
free(eng5);
free(alloc4);
return err7;
}
eng5->net3 = net4;
*eng4 = eng5;
return 0;
}

// Run one inference pass: inData is the "in" tensor, outData receives
// the "out" tensor. Not safe for concurrent calls on the same engine
// (they share its scratch buffer); use one engine per concurrent stream.
void Example7EngineInference(
Example7Engine* eng1,
float* inData,
float* outData
) {
  Example7ThreaderTeam1* workers = eng1->team11;
  char* weights = eng1->net3->align1;
  char* scratch = eng1->align2;
  // Stage 1: rearrange the input tensor into transform-domain tiles.
  char* arrangeArgs[] = {
    (char*)inData,
    scratch + 0
  };
  Example7StriderArrangeDats1(workers, arrangeArgs);
  // Stage 2: multiply the arranged data by the arranged filters,
  // accumulating sums at scratch offset 1691648.
  char* produceArgs[] = {
    weights + 0,
    scratch + 0,
    scratch + 1691648
  };
  Example7StriderProduceSums1(workers, produceArgs);
  // Stage 3: inverse-transform the sums and write the output tensor.
  char* consumeArgs[] = {
    scratch + 1691648,
    (char*)outData
  };
  Example7StriderConsumeSums1(workers, consumeArgs);
}

// End of file.

Top