NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=Example29 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=in1 Channels=53 Height=13 Width=32
Input ToTensor=in2 Channels=53 Height=13 Width=32
Input ToTensor=in3 Channels=93 Height=4 Width=13
BatchNorm FromTensor=in1 ToTensor=bn1 Epsilon=0.00001
Activation FromTensor=bn1 ToTensor=act1 Kind=ReLU Param=0
Add FromTensor1=act1 FromTensor2=in2 ToTensor=add1
BatchNorm FromTensor=add1 ToTensor=bn2 Epsilon=0.00001
Conv FromTensor=bn2 ToTensor=conv ToChannels=93 FilterH=7 FilterW=7 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=conv ToTensor=bn3 Epsilon=0.00001
Activation FromTensor=bn3 ToTensor=act2 Kind=ReLU Param=0.8125
Add FromTensor1=act2 FromTensor2=in3 ToTensor=add2
BatchNorm FromTensor=add2 ToTensor=bn4 Epsilon=0.00001
Output FromTensor=bn4

Top || Output Example29.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(Example29Params);
// Example29Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "rb");
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

typedef struct Example29Params Example29Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// Example29Params* params = malloc(sizeof(Example29Params));
//
// ... Load params (read from a file, perhaps) ...
//
// Example29Net* net; // For example, 4 threads:
// char* err = Example29NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// Example29NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

typedef struct Example29Net Example29Net;

// Build a Net from the trained parameters (see the usage example in
// the comment above). On failure returns a heap-allocated message the
// caller must free; on success returns 0 and stores the new Net
// through the first argument. The params struct is not modified and
// may be freed once this returns.
char* Example29NetCreate(
Example29Net**,
Example29Params*,
ptrdiff_t threads
);

// Free all memory owned by the Net. Call only once every Engine that
// references it has been destroyed and all dependent inference is
// complete (see the comment above).
void Example29NetDestroy(Example29Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// Example29Net* net;
//
// ... Create net ...
//
// Example29Engine* engine; // For example, 4 inference threads:
// char* err = Example29EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// Example29EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = Example29EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* bn4Data = malloc(sizeof(float)*93*4*13);
// float* in1Data = malloc(sizeof(float)*53*13*32);
// float* in2Data = malloc(sizeof(float)*53*13*32);
// float* in3Data = malloc(sizeof(float)*93*4*13);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// Example29EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// bn4Data, // The tensor arguments are sorted by name.
// in1Data,
// in2Data,
// in3Data
// );
//
// ... Read the output floats ...
//
// }
//
// free(bn4Data);
// free(in1Data);
// free(in2Data);
// free(in3Data);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

typedef struct Example29Engine Example29Engine;

// Create an Engine bound to the given Net with the given number of
// inference threads. On failure returns a heap-allocated message the
// caller must free; on success returns 0 and stores the new Engine
// through the first argument.
char* Example29EngineCreate(
Example29Engine**,
Example29Net*,
ptrdiff_t threads
);

// Copy the pthread_t of inference thread threadIdx (0-based) into
// *to. Fails only if threadIdx is out of range (see above).
char* Example29EnginePthreadT(
Example29Engine*,
ptrdiff_t threadIdx,
pthread_t* to
);

// Run one inference pass; cannot fail (see the comment above). The
// tensor arguments are sorted by name: bn4Data is the output
// (93x4x13 floats, CHW, fully packed); in1Data/in2Data (53x13x32)
// and in3Data (93x4x13) are the inputs.
void Example29EngineInference(
Example29Engine*,
float* bn4Data,
float* in1Data,
float* in2Data,
float* in3Data
);

// Terminate the Engine's inference threads and free its memory. The
// Net it references is not affected.
void Example29EngineDestroy(Example29Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

// Trained-parameter layout. Fields are sorted by name (bytewise
// lexical), each holding an NxCxHxW tensor with no padding, and the
// struct is packed so the whole block can be loaded with a single
// fread of the concatenated tensors (see the comment near the top of
// this header).
struct Example29Params {
float bn1Means[53]; // 1x53x1x1
float bn1Scales[53]; // 1x53x1x1
float bn1Shifts[53]; // 1x53x1x1
float bn1Variances[53]; // 1x53x1x1
float bn2Means[53]; // 1x53x1x1
float bn2Scales[53]; // 1x53x1x1
float bn2Shifts[53]; // 1x53x1x1
float bn2Variances[53]; // 1x53x1x1
float bn3Means[93]; // 1x93x1x1
float bn3Scales[93]; // 1x93x1x1
float bn3Shifts[93]; // 1x93x1x1
float bn3Variances[93]; // 1x93x1x1
float bn4Means[93]; // 1x93x1x1
float bn4Scales[93]; // 1x93x1x1
float bn4Shifts[93]; // 1x93x1x1
float bn4Variances[93]; // 1x93x1x1
float convBiases[93]; // 1x93x1x1
float convWeights[241521]; // 93x53x7x7 (= 93*53*7*7 floats)
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output Example29.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f Example29.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "Example29.h"

// Format a heap-allocated diagnostic of the form
// "Example29: line <N>: <detail>". The caller owns (and must free)
// the returned buffer. NOTE(review): the 277-byte malloc result is
// used unchecked, matching the generated code's no-OOM assumption.
static char* Example29Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
  char* text = malloc(277);
  int prefix = snprintf(text, 277, "Example29: line %td: ", lineNum1);
  va_list rest;
  va_start(rest, format1);
  vsnprintf(text+prefix, 277-prefix, format1, rest);
  va_end(rest);
  return text;
}

// Generic fork/join work-distribution machinery ("threader"). A Team
// owns nt1 worker threads (Nodes) plus a Hub used to detect task
// completion and to guide work stealing between workers.
typedef struct Example29ThreaderTask1 Example29ThreaderTask1;
typedef void (*Example29ThreaderCallee1)(Example29ThreaderTask1*, int64_t*);
typedef struct Example29ThreaderHub1 Example29ThreaderHub1;
typedef struct Example29ThreaderNode1 Example29ThreaderNode1;
typedef struct Example29ThreaderUnwind1 Example29ThreaderUnwind1;
typedef struct Example29ThreaderTeam1 Example29ThreaderTeam1;

// One task: callee1 is invoked once per coordinate of an
// nd1-dimensional iteration space with bounds hull1 (least
// significant dimension first).
struct Example29ThreaderTask1 {
Example29ThreaderCallee1 callee1;
void* any1; // opaque payload interpreted by the callee
ptrdiff_t nd1; // number of live dimensions in hull1 (at most 4)
int64_t hull1[4];
};

// Shared completion/stealing state, guarded by mut1.
struct Example29ThreaderHub1 {
pthread_mutex_t mut1;
pthread_cond_t cond1; // signaled when pending1 reaches zero
ptrdiff_t pending1; // workers that have not yet finished the task
ptrdiff_t offset1; // steal-scan cursor: word index into status1
long mask1; // steal-scan cursor: bits still to check in that word
long status1[]; // one ready bit per worker, plus a sentinel bit
};

// Per-worker state; mut2 guards np1, pt1, and task1. The struct is
// 64-byte aligned so each worker occupies its own cache line(s).
struct Example29ThreaderNode1 {
pthread_mutex_t mut2;
int64_t np1; // items left in this worker's slice; -1 requests exit
int64_t pt1[4]; // next coordinate of this worker's slice
Example29ThreaderTask1* task1; // nonzero when work (or exit) is posted
pthread_cond_t cond2; // signaled when task1 is posted
Example29ThreaderTeam1* team1; // backpointer to the owning team
pthread_t thr1;
} __attribute__((aligned(64)));

// Counts of successfully created resources, consulted by Destroy1 so
// a partially constructed team can be torn down safely.
struct Example29ThreaderUnwind1 {
ptrdiff_t join1; // threads started (must be joined)
ptrdiff_t nodeConds1; // node condition variables initialized
ptrdiff_t nodeMuts1; // node mutexes initialized
ptrdiff_t hubCond1; // 1 if hub cond initialized
ptrdiff_t hubMut1; // 1 if hub mutex initialized
void* nodes1; // raw (unaligned) allocation behind nodes2
void* hub1; // raw (unaligned) allocation behind hub2
};

struct Example29ThreaderTeam1 {
ptrdiff_t nt1; // worker count
Example29ThreaderHub1* hub2; // 64-byte-aligned view into unwind1.hub1
Example29ThreaderNode1* nodes2; // 64-byte-aligned view into unwind1.nodes1
Example29ThreaderUnwind1 unwind1;
};

// Advance a mixed-radix counter by one. pt2 holds nd2 digits, least
// significant first; digit d wraps at hull2[d]. Equivalent to ++ on a
// multi-dimensional loop index.
static void Example29ThreaderInc1(
ptrdiff_t nd2,
int64_t*restrict hull2,
int64_t*restrict pt2
) {
  for (ptrdiff_t digit = 0; digit < nd2; ++digit) {
    int64_t bumped = pt2[digit]+1;
    if (bumped != hull2[digit]) {
      pt2[digit] = bumped;
      return; // no carry: done
    }
    pt2[digit] = 0; // wrapped: carry into the next digit
  }
}

// Write val1 into pt3 as nd3 mixed-radix digits (least significant
// first), where digit d has radix hull3[d]. Digits above the highest
// nonzero one are cleared.
static void Example29ThreaderPut1(
ptrdiff_t nd3,
int64_t*restrict hull3,
int64_t*restrict pt3,
int64_t val1
) {
  ptrdiff_t digit = 0;
  while (digit < nd3 && val1 != 0) {
    int64_t radix = hull3[digit];
    pt3[digit] = val1%radix;
    val1 /= radix;
    ++digit;
  }
  while (digit < nd3) {
    pt3[digit] = 0;
    ++digit;
  }
}

// pt4 += plus1 + carry2 in mixed radix (radix of digit d is hull4[d],
// least significant first). Both addends are digit-wise in range, so
// each column needs at most one subtraction; a carry out of the top
// digit is dropped.
static void Example29ThreaderAdd1(
ptrdiff_t nd4,
int64_t*restrict hull4,
int64_t*restrict pt4,
int64_t*restrict plus1,
int64_t carry2
) {
  for (ptrdiff_t digit = 0; digit < nd4; ++digit) {
    int64_t radix = hull4[digit];
    int64_t column = pt4[digit]+plus1[digit]+carry2;
    if (column >= radix) {
      column -= radix;
      carry2 = 1;
    } else {
      carry2 = 0;
    }
    pt4[digit] = column;
  }
}

// Worker-thread entry point (one per Node). Protocol: Do1 posts a
// task by setting np1 (item count), pt1 (starting coordinate), and
// task1, then signals cond2. A worker drains its own slice of the
// flattened iteration space, then scans the hub's status bitmask to
// steal leftover work from other workers, and finally decrements the
// hub's pending count (waking Do1 when it reaches zero).
static void* Example29ThreaderMain1(void* arg1) {
Example29ThreaderNode1* node1 = arg1;
Example29ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
Example29ThreaderHub1* hub3 = team2->hub2;
Example29ThreaderNode1* nodes3 = team2->nodes2;
size_t role1 = node1-nodes3; // this worker's index, also its status bit
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
for (; ; ) {
Example29ThreaderTask1* task2 = node1->task1;
if (!task2) {
// No task posted yet: sleep until Do1 (or Destroy1) signals.
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
// np1 == -1 is the termination request from Destroy1.
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
node1->task1 = 0;
Example29ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Drain this worker's own slice: under the lock, claim the current
// coordinate and advance the shared cursor; run the callee unlocked.
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
Example29ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// Own slice is empty: retire this worker's ready bit, then scan the
// status words for other workers whose slices still hold work.
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
++offset2;
mask2 = -1;
continue;
}
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
// Bit nt2 is a sentinel just past the real workers; reaching it
// twice without finding real work in between (wrapped1) means
// every slice has been drained.
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
// Claim the victim's bit, publish the scan cursor, and drain the
// victim's slice exactly like our own.
hand1 &= -hand1;
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
Example29ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
Example29ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// Victim fully drained: clear its bit and resume the shared scan.
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// This worker is done with the task; the last one to finish wakes
// Do1, which is waiting on the hub's condition variable.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Tear down a (possibly only partially constructed) team. The counts
// in unwind1 record how many node mutexes/conds/threads and which hub
// objects were successfully created, so only those are destroyed.
static void Example29ThreaderDestroy1(Example29ThreaderTeam1* team3) {
if (!team3) return;
Example29ThreaderNode1* nodes4 = team3->nodes2;
Example29ThreaderNode1* stop1 = nodes4+team3->unwind1.join1;
// Ask every started thread to exit: np1 < 0 is the termination
// signal, and a nonzero task1 releases the wait loop in ThreaderMain1.
for (Example29ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_mutex_lock(&node3->mut2), 0); );
node3->np1 = -1;
node3->task1 = (Example29ThreaderTask1*)1;
for (; __builtin_expect(pthread_mutex_unlock(&node3->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node3->cond2), 0); );
}
for (Example29ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_join(node3->thr1, 0), 0); );
}
// Destroy only the condition variables and mutexes that were
// successfully initialized, per the unwind counts.
stop1 = nodes4+team3->unwind1.nodeConds1;
for (Example29ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_cond_destroy(&node3->cond2), 0); );
}
stop1 = nodes4+team3->unwind1.nodeMuts1;
for (Example29ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_mutex_destroy(&node3->mut2), 0); );
}
Example29ThreaderHub1* hub4 = team3->hub2;
if (team3->unwind1.hubCond1) {
for (; __builtin_expect(pthread_cond_destroy(&hub4->cond1), 0); );
}
if (team3->unwind1.hubMut1) {
for (; __builtin_expect(pthread_mutex_destroy(&hub4->mut1), 0); );
}
// nodes1/hub1 are the raw (unaligned) allocations behind the aligned
// nodes2/hub2 views.
free(team3->unwind1.nodes1);
free(team3->unwind1.hub1);
free(team3);
}

// Initialize each worker's mutex, condition variable, and thread, in
// that order. On any failure the unwind counters are set to the exact
// number of each resource fully created before the failure (so
// Destroy1 releases just those), and a heap-allocated error message
// is returned.
static char* Example29ThreaderCreate1Up4(Example29ThreaderTeam1* team8, ptrdiff_t nt7) {
Example29ThreaderNode1* nodes5 = team8->nodes2;
for (Example29ThreaderNode1* node4 = nodes5; node4 != nodes5+nt7; ++node4) {
int err2 = pthread_mutex_init(&node4->mut2, 0);
if (__builtin_expect(err2, 0)) {
char* msg2 = Example29Errmsg1(__LINE__, "errno %d", err2);
// Nothing on this node was created: counts exclude node4.
team8->unwind1.nodeMuts1 = node4-nodes5;
team8->unwind1.nodeConds1 = node4-nodes5;
team8->unwind1.join1 = node4-nodes5;
return msg2;
}
node4->task1 = 0;
int err3 = pthread_cond_init(&node4->cond2, 0);
if (__builtin_expect(err3, 0)) {
char* msg3 = Example29Errmsg1(__LINE__, "errno %d", err3);
// This node's mutex exists (+1); its cond and thread do not.
team8->unwind1.nodeMuts1 = node4-nodes5+1;
team8->unwind1.nodeConds1 = node4-nodes5;
team8->unwind1.join1 = node4-nodes5;
return msg3;
}
node4->team1 = team8;
int err4 = pthread_create(&node4->thr1, 0, Example29ThreaderMain1, node4);
if (__builtin_expect(err4, 0)) {
char* msg4 = Example29Errmsg1(__LINE__, "errno %d", err4);
// Mutex and cond exist (+1); the thread was never started.
team8->unwind1.nodeMuts1 = node4-nodes5+1;
team8->unwind1.nodeConds1 = node4-nodes5+1;
team8->unwind1.join1 = node4-nodes5;
return msg4;
}
}
team8->unwind1.nodeMuts1 = nt7;
team8->unwind1.nodeConds1 = nt7;
team8->unwind1.join1 = nt7;
return 0;
}

// Initialize the hub's mutex and condition variable, recording each
// success in unwind1 so Destroy1 knows what to release, then continue
// with per-node setup.
static char* Example29ThreaderCreate1Up3(Example29ThreaderTeam1* team7, ptrdiff_t nt6) {
Example29ThreaderHub1* hub5 = team7->hub2;
int err5 = pthread_mutex_init(&hub5->mut1, 0);
if (__builtin_expect(err5, 0)) {
return Example29Errmsg1(__LINE__, "errno %d", err5);
}
team7->unwind1.hubMut1 = 1;
int err6 = pthread_cond_init(&hub5->cond1, 0);
if (__builtin_expect(err6, 0)) {
return Example29Errmsg1(__LINE__, "errno %d", err6);
}
team7->unwind1.hubCond1 = 1;
return Example29ThreaderCreate1Up4(team7, nt6);
}

// Allocate the worker array: check nt*sizeof(node) for overflow,
// over-allocate by 63 bytes, and round the usable pointer up to a
// 64-byte boundary to honor the nodes' aligned(64) attribute.
static char* Example29ThreaderCreate1Up2(Example29ThreaderTeam1* team6, ptrdiff_t nt5) {
size_t size2 = nt5*sizeof(Example29ThreaderNode1);
if (__builtin_expect(size2/sizeof(Example29ThreaderNode1) != (size_t)nt5, 0)) {
return Example29Errmsg1(__LINE__, "too many threads");
}
void* addr3 = malloc(size2+63);
if (__builtin_expect(!addr3, 0)) {
return Example29Errmsg1(__LINE__, "errno %d", errno);
}
team6->unwind1.nodes1 = addr3; // raw pointer kept for free()
team6->nodes2 = (void*)(((size_t)addr3+63)&-64); // aligned view
return Example29ThreaderCreate1Up3(team6, nt5);
}

// Allocate the hub: the base struct plus enough longs for one ready
// bit per worker and the sentinel bit, rounded up to a 64-byte
// multiple and aligned the same way the node array is.
static char* Example29ThreaderCreate1Up1(Example29ThreaderTeam1* team5, ptrdiff_t nt4) {
team5->nt1 = nt4;
size_t size1 = sizeof(Example29ThreaderHub1);
size1 += sizeof(long)*((size_t)nt4/(sizeof(long)*8)+1);
size1 = (size1+63)&-64;
void* addr2 = malloc(size1+63);
if (__builtin_expect(!addr2, 0)) {
return Example29Errmsg1(__LINE__, "errno %d", errno);
}
team5->unwind1.hub1 = addr2; // raw pointer kept for free()
team5->hub2 = (void*)(((size_t)addr2+63)&-64); // aligned view
return Example29ThreaderCreate1Up2(team5, nt4);
}

// Entry point for team construction: validate the thread count,
// zero-allocate the team (so all unwind counters start cleared), and
// delegate to the Up* helpers. On failure the partially built team is
// destroyed and the error message returned; on success the team is
// stored through team4 and 0 returned.
static char* Example29ThreaderCreate1(Example29ThreaderTeam1** team4, ptrdiff_t nt3) {
if (__builtin_expect(nt3 < 1, 0)) {
return Example29Errmsg1(__LINE__, "too few threads");
}
void* addr1 = calloc(1, sizeof(Example29ThreaderTeam1));
if (__builtin_expect(!addr1, 0)) {
return Example29Errmsg1(__LINE__, "errno %d", errno);
}
char* err1 = Example29ThreaderCreate1Up1(addr1, nt3);
if (__builtin_expect(!!err1, 0)) {
Example29ThreaderDestroy1(addr1);
} else {
*team4 = addr1;
}
return err1;
}

// Look up the pthread_t of worker idx1 (0-based). Returns 0 and
// writes the identifier through thr2 on success; for an out-of-range
// index returns a heap-allocated error message (caller frees).
static char* Example29ThreaderPthreadT1(
pthread_t* thr2,
Example29ThreaderTeam1* team9,
ptrdiff_t idx1
) {
  int inRange = 0 <= idx1 && idx1 < team9->nt1;
  if (__builtin_expect(!inRange, 0)) {
    return Example29Errmsg1(__LINE__, "bad thread idx");
  }
  Example29ThreaderNode1* node = team9->nodes2+idx1;
  *thr2 = node->thr1;
  return 0;
}

// Run task3 across the whole team and block until it is complete.
static void Example29ThreaderDo1(Example29ThreaderTeam1* team10, Example29ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return;
// Total iteration count is the product of the hull bounds.
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
// Every worker gets each1 items; the first more1 workers get one extra.
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
// plus2 is each1 rewritten as mixed-radix digits of the hull: the
// stride between consecutive workers' starting coordinates.
int64_t plus2[4];
Example29ThreaderPut1(nd6, task3->hull1, plus2, each1);
int64_t pt6[4] = {0};
Example29ThreaderHub1* hub6 = team10->hub2;
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
Example29ThreaderNode1* node5 = team10->nodes2;
// Seed each worker with its slice (count + starting coordinate) and
// wake it.
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
int64_t carry3 = i4 < more1;
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
// Advance the start coordinate past the slice just handed out.
Example29ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Arm every ready bit (including the sentinel past the last worker),
// then sleep until the workers have drained everything: they
// decrement pending1, and the last one signals cond1.
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

// Approximate exp(x) lane-wise for 16 floats. Range reduction: with
// r = round(x*log2(e)), compute f = x - r*ln(2) (ln(2) split into
// high/low parts for accuracy), approximate exp(f) with a degree-4
// polynomial, then scale by 2^r by adding r into the IEEE exponent
// field. Inputs are clamped first so the scaling cannot overflow the
// exponent field.
static __m512 Example29Exp1(__m512 x1) {
x1 = _mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f));
x1 = _mm512_min_ps(x1, _mm512_set1_ps(8.872284e+01f));
__m512 t1 = _mm512_mul_ps(x1, _mm512_set1_ps(1.442695e+00f)); // x*log2(e)
__m512 r1 = _mm512_roundscale_ps(t1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m512 f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-6.9314575e-01f), x1); // x - r*ln2_hi
f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-1.4286068e-06f), f1); // ... - r*ln2_lo
// Degree-4 polynomial approximating exp(f), Horner form.
__m512 g1 = _mm512_set1_ps(4.194439e-02f);
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(1.6800667e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(4.9999994e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.999569e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.9999964e-01f));
// Multiply by 2^r via an integer add into the exponent bits.
__m512i y1 = _mm512_slli_epi32(_mm512_cvtps_epi32(t1), 23);
return _mm512_castsi512_ps(_mm512_add_epi32(y1, _mm512_castps_si512(g1)));
}

// Reciprocal square root, lane-wise: start from the hardware
// 2^-14-relative-error estimate and apply one Newton-Raphson
// refinement, y' = (y/2)*(3 - x*y*y).
static __m512 Example29Rsqrt1(__m512 x2) {
__m512 y2 = _mm512_rsqrt14_ps(x2);
__m512 z1 = _mm512_mul_ps(x2, y2);
__m512 a1 = _mm512_mul_ps(y2, _mm512_set1_ps(5e-01f)); // y/2
__m512 b1 = _mm512_fnmadd_ps(y2, z1, _mm512_set1_ps(3e+00f)); // 3 - x*y*y
return _mm512_mul_ps(a1, b1);
}

// Fold one 53-channel BatchNorm's parameters into per-channel
// (multiply, add) pairs so inference can apply bn(x) = x*mul + add:
// mul = scale/sqrt(variance+eps), add = shift - mean*mul, with
// eps = 1e-5 baked in (Epsilon=0.00001 in the graph). The pairs are
// written to mas1 interleaved: (mul0, add0, mul1, add1, ...).
static void Example29BnSimplify1(
float*restrict means1,
float*restrict variances1,
float*restrict scales1,
float*restrict shifts1,
char*restrict mas1
) {
__m512 eps1 = _mm512_set1_ps(1e-05f);
// Permutation patterns that interleave a mul vector with an add
// vector: xlo1 yields the pairs for channels 0-7 of a 16-lane group,
// xhi1 the pairs for channels 8-15.
__m512i xlo1 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi1 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
// 53 channels = 3 full 16-lane vectors + a 5-lane tail (mask 31).
__m512 va1 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*0);
__m512 va2 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*1);
__m512 va3 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*2);
__m512 va4 = _mm512_maskz_loadu_ps(31, variances1+(ptrdiff_t)16*3);
__m512 rcp1 = Example29Rsqrt1(_mm512_add_ps(eps1, va1));
__m512 rcp2 = Example29Rsqrt1(_mm512_add_ps(eps1, va2));
__m512 rcp3 = Example29Rsqrt1(_mm512_add_ps(eps1, va3));
__m512 rcp4 = Example29Rsqrt1(_mm512_add_ps(eps1, va4));
__m512 sc1 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*0);
__m512 sc2 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*1);
__m512 sc3 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*2);
__m512 sc4 = _mm512_maskz_loadu_ps(31, scales1+(ptrdiff_t)16*3);
// mul = scale * 1/sqrt(variance+eps)
__m512 mul1 = _mm512_mul_ps(rcp1, sc1);
__m512 mul2 = _mm512_mul_ps(rcp2, sc2);
__m512 mul3 = _mm512_mul_ps(rcp3, sc3);
__m512 mul4 = _mm512_mul_ps(rcp4, sc4);
__m512 me1 = _mm512_loadu_ps(means1+(ptrdiff_t)16*0);
__m512 me2 = _mm512_loadu_ps(means1+(ptrdiff_t)16*1);
__m512 me3 = _mm512_loadu_ps(means1+(ptrdiff_t)16*2);
__m512 me4 = _mm512_maskz_loadu_ps(31, means1+(ptrdiff_t)16*3);
__m512 sh1 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*0);
__m512 sh2 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*1);
__m512 sh3 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*2);
__m512 sh4 = _mm512_maskz_loadu_ps(31, shifts1+(ptrdiff_t)16*3);
// add = shift - mean*mul
__m512 add1 = _mm512_fnmadd_ps(me1, mul1, sh1);
__m512 add2 = _mm512_fnmadd_ps(me2, mul2, sh2);
__m512 add3 = _mm512_fnmadd_ps(me3, mul3, sh3);
__m512 add4 = _mm512_fnmadd_ps(me4, mul4, sh4);
// Interleave (mul, add) pairs; each 16-channel group becomes 128
// output bytes (lo then hi).
__m512 lo1 = _mm512_permutex2var_ps(mul1, xlo1, add1);
__m512 lo2 = _mm512_permutex2var_ps(mul2, xlo1, add2);
__m512 lo3 = _mm512_permutex2var_ps(mul3, xlo1, add3);
__m512 lo4 = _mm512_permutex2var_ps(mul4, xlo1, add4);
__m512 hi1 = _mm512_permutex2var_ps(mul1, xhi1, add1);
__m512 hi2 = _mm512_permutex2var_ps(mul2, xhi1, add2);
__m512 hi3 = _mm512_permutex2var_ps(mul3, xhi1, add3);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*0, lo1);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*1, hi1);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*2, lo2);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*3, hi2);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*4, lo3);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*5, hi3);
// Tail store: mask 1023 keeps 10 floats, the 5 (mul, add) pairs of
// the last 5 channels.
_mm512_mask_storeu_ps(mas1+(ptrdiff_t)64*6, 1023, lo4);
}

// Same BatchNorm folding as Example29BnSimplify1 (mul/add pairs with
// eps = 1e-5 baked in), but for the 93-channel BatchNorms: a
// generated loop with trip count 1 handles the first 80 channels in
// five 16-lane vectors, then masked loads/stores handle the
// 13-channel tail (load mask 8191 = 13 lanes; the tail's 13 pairs =
// 26 floats are stored as one full vector plus a 10-float masked
// store).
static void Example29BnSimplify2(
float*restrict means2,
float*restrict variances2,
float*restrict scales2,
float*restrict shifts2,
char*restrict mas2
) {
__m512 eps2 = _mm512_set1_ps(1e-05f);
// Interleaving permutations: xlo2 pairs channels 0-7 of a 16-lane
// group, xhi2 channels 8-15 (same patterns as in BnSimplify1).
__m512i xlo2 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi2 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i5 = 0; i5 < 1; ++i5) {
__m512 va5 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*0+(ptrdiff_t)80*i5);
__m512 va6 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*1+(ptrdiff_t)80*i5);
__m512 va7 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*2+(ptrdiff_t)80*i5);
__m512 va8 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*3+(ptrdiff_t)80*i5);
__m512 va9 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*4+(ptrdiff_t)80*i5);
__m512 rcp5 = Example29Rsqrt1(_mm512_add_ps(eps2, va5));
__m512 rcp6 = Example29Rsqrt1(_mm512_add_ps(eps2, va6));
__m512 rcp7 = Example29Rsqrt1(_mm512_add_ps(eps2, va7));
__m512 rcp8 = Example29Rsqrt1(_mm512_add_ps(eps2, va8));
__m512 rcp9 = Example29Rsqrt1(_mm512_add_ps(eps2, va9));
__m512 sc5 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*0+(ptrdiff_t)80*i5);
__m512 sc6 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*1+(ptrdiff_t)80*i5);
__m512 sc7 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*2+(ptrdiff_t)80*i5);
__m512 sc8 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*3+(ptrdiff_t)80*i5);
__m512 sc9 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*4+(ptrdiff_t)80*i5);
// mul = scale * 1/sqrt(variance+eps)
__m512 mul5 = _mm512_mul_ps(rcp5, sc5);
__m512 mul6 = _mm512_mul_ps(rcp6, sc6);
__m512 mul7 = _mm512_mul_ps(rcp7, sc7);
__m512 mul8 = _mm512_mul_ps(rcp8, sc8);
__m512 mul9 = _mm512_mul_ps(rcp9, sc9);
__m512 me5 = _mm512_loadu_ps(means2+(ptrdiff_t)16*0+(ptrdiff_t)80*i5);
__m512 me6 = _mm512_loadu_ps(means2+(ptrdiff_t)16*1+(ptrdiff_t)80*i5);
__m512 me7 = _mm512_loadu_ps(means2+(ptrdiff_t)16*2+(ptrdiff_t)80*i5);
__m512 me8 = _mm512_loadu_ps(means2+(ptrdiff_t)16*3+(ptrdiff_t)80*i5);
__m512 me9 = _mm512_loadu_ps(means2+(ptrdiff_t)16*4+(ptrdiff_t)80*i5);
__m512 sh5 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*0+(ptrdiff_t)80*i5);
__m512 sh6 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*1+(ptrdiff_t)80*i5);
__m512 sh7 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*2+(ptrdiff_t)80*i5);
__m512 sh8 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*3+(ptrdiff_t)80*i5);
__m512 sh9 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*4+(ptrdiff_t)80*i5);
// add = shift - mean*mul
__m512 add5 = _mm512_fnmadd_ps(me5, mul5, sh5);
__m512 add6 = _mm512_fnmadd_ps(me6, mul6, sh6);
__m512 add7 = _mm512_fnmadd_ps(me7, mul7, sh7);
__m512 add8 = _mm512_fnmadd_ps(me8, mul8, sh8);
__m512 add9 = _mm512_fnmadd_ps(me9, mul9, sh9);
// Interleave (mul, add) pairs; each 16-channel group becomes 128
// output bytes (lo then hi).
__m512 lo5 = _mm512_permutex2var_ps(mul5, xlo2, add5);
__m512 lo6 = _mm512_permutex2var_ps(mul6, xlo2, add6);
__m512 lo7 = _mm512_permutex2var_ps(mul7, xlo2, add7);
__m512 lo8 = _mm512_permutex2var_ps(mul8, xlo2, add8);
__m512 lo9 = _mm512_permutex2var_ps(mul9, xlo2, add9);
__m512 hi4 = _mm512_permutex2var_ps(mul5, xhi2, add5);
__m512 hi5 = _mm512_permutex2var_ps(mul6, xhi2, add6);
__m512 hi6 = _mm512_permutex2var_ps(mul7, xhi2, add7);
__m512 hi7 = _mm512_permutex2var_ps(mul8, xhi2, add8);
__m512 hi8 = _mm512_permutex2var_ps(mul9, xhi2, add9);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*0+(ptrdiff_t)640*i5, lo5);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*1+(ptrdiff_t)640*i5, hi4);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*2+(ptrdiff_t)640*i5, lo6);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*3+(ptrdiff_t)640*i5, hi5);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*4+(ptrdiff_t)640*i5, lo7);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*5+(ptrdiff_t)640*i5, hi6);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*6+(ptrdiff_t)640*i5, lo8);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*7+(ptrdiff_t)640*i5, hi7);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*8+(ptrdiff_t)640*i5, lo9);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*9+(ptrdiff_t)640*i5, hi8);
}
// Tail: channels 80-92 (mask 8191 = 13 lanes).
__m512 va10 = _mm512_maskz_loadu_ps(8191, variances2+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 rcp10 = Example29Rsqrt1(_mm512_add_ps(eps2, va10));
__m512 sc10 = _mm512_maskz_loadu_ps(8191, scales2+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 mul10 = _mm512_mul_ps(rcp10, sc10);
__m512 me10 = _mm512_maskz_loadu_ps(8191, means2+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 sh10 = _mm512_maskz_loadu_ps(8191, shifts2+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 add10 = _mm512_fnmadd_ps(me10, mul10, sh10);
__m512 lo10 = _mm512_permutex2var_ps(mul10, xlo2, add10);
__m512 hi9 = _mm512_permutex2var_ps(mul10, xhi2, add10);
// 13 tail pairs = 26 floats: one full store plus a 10-float masked
// store (mask 1023).
_mm512_storeu_ps(mas2+(ptrdiff_t)64*0+(ptrdiff_t)640*1, lo10);
_mm512_mask_storeu_ps(mas2+(ptrdiff_t)64*1+(ptrdiff_t)640*1, 1023, hi9);
}

static void Example29StriderArrangeFilts1Callee1(Example29ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
ptrdiff_t b2 = pt7[0];
ptrdiff_t g2 = 0;
ptrdiff_t e1 = 0;
char*restrict bfPtr1 = tensors2[4]+376*e1;
char*restrict wfPtr1 = tensors2[4]+384+23871488*e1;
char*restrict wtPtr1 = tensors2[0]+97216*e1;
char*restrict biasPtr1 = tensors2[1];
char*restrict bnPtr1 = tensors2[2]+(ptrdiff_t)8*496*e1;
char*restrict bnPtr2 = tensors2[3];
ptrdiff_t i6 = 1*g2;
ptrdiff_t j1 = 2*b2;
ptrdiff_t jj1 = j1+(b2 < 22 ? 1 : 2);
if (j1 < 46) {
for (; j1 != 46; ++j1) {
__m512 bf1 = _mm512_setzero_ps();
__m512 bf2 = _mm512_setzero_ps();
__m512 postMul1 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(0+93*i6+2*j1))[0]);
__m512 postMul2 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(1+93*i6+2*j1))[0]);
for (ptrdiff_t k1 = 0; k1 < 53; ++k1) {
__m512 preMul1 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(k1+53*i6))[0]);
__m512 preAdd1 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(k1+53*i6))[1]);
__m512 wt1 = _mm512_maskz_loadu_ps(127, wtPtr1+0+966084*i6+20776*j1+196*k1);
__m512 wt2 = _mm512_maskz_loadu_ps(127, wtPtr1+28+966084*i6+20776*j1+196*k1);
__m512 wt3 = _mm512_maskz_loadu_ps(127, wtPtr1+56+966084*i6+20776*j1+196*k1);
__m512 wt4 = _mm512_maskz_loadu_ps(127, wtPtr1+84+966084*i6+20776*j1+196*k1);
__m512 wt5 = _mm512_maskz_loadu_ps(127, wtPtr1+112+966084*i6+20776*j1+196*k1);
__m512 wt6 = _mm512_maskz_loadu_ps(127, wtPtr1+140+966084*i6+20776*j1+196*k1);
__m512 wt7 = _mm512_maskz_loadu_ps(127, wtPtr1+168+966084*i6+20776*j1+196*k1);
wt1 = _mm512_mul_ps(postMul1, wt1);
wt2 = _mm512_mul_ps(postMul1, wt2);
wt3 = _mm512_mul_ps(postMul1, wt3);
wt4 = _mm512_mul_ps(postMul1, wt4);
wt5 = _mm512_mul_ps(postMul1, wt5);
wt6 = _mm512_mul_ps(postMul1, wt6);
wt7 = _mm512_mul_ps(postMul1, wt7);
bf1 = _mm512_fmadd_ps(preAdd1, wt1, bf1);
wt1 = _mm512_mul_ps(preMul1, wt1);
bf1 = _mm512_fmadd_ps(preAdd1, wt2, bf1);
wt2 = _mm512_mul_ps(preMul1, wt2);
bf1 = _mm512_fmadd_ps(preAdd1, wt3, bf1);
wt3 = _mm512_mul_ps(preMul1, wt3);
bf1 = _mm512_fmadd_ps(preAdd1, wt4, bf1);
wt4 = _mm512_mul_ps(preMul1, wt4);
bf1 = _mm512_fmadd_ps(preAdd1, wt5, bf1);
wt5 = _mm512_mul_ps(preMul1, wt5);
bf1 = _mm512_fmadd_ps(preAdd1, wt6, bf1);
wt6 = _mm512_mul_ps(preMul1, wt6);
bf1 = _mm512_fmadd_ps(preAdd1, wt7, bf1);
wt7 = _mm512_mul_ps(preMul1, wt7);
__m512 fft1 = _mm512_add_ps(wt1, _mm512_setzero_ps());
__m512 fft89 = _mm512_add_ps(wt2, _mm512_setzero_ps());
__m512 fft2 = _mm512_sub_ps(wt1, _mm512_setzero_ps());
__m512 fft90 = _mm512_sub_ps(wt2, _mm512_setzero_ps());
__m512 fft3 = _mm512_add_ps(wt3, _mm512_setzero_ps());
__m512 fft91 = _mm512_add_ps(wt4, _mm512_setzero_ps());
__m512 fft4 = _mm512_sub_ps(wt3, _mm512_setzero_ps());
__m512 fft92 = _mm512_sub_ps(wt4, _mm512_setzero_ps());
__m512 fft5 = _mm512_add_ps(wt5, _mm512_setzero_ps());
__m512 fft93 = _mm512_add_ps(wt6, _mm512_setzero_ps());
__m512 fft6 = _mm512_sub_ps(wt5, _mm512_setzero_ps());
__m512 fft94 = _mm512_sub_ps(wt6, _mm512_setzero_ps());
__m512 fft7 = _mm512_add_ps(wt7, _mm512_setzero_ps());
__m512 fft95 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft8 = _mm512_sub_ps(wt7, _mm512_setzero_ps());
__m512 fft96 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9 = _mm512_add_ps(fft1, fft5);
__m512 fft97 = _mm512_add_ps(fft89, fft93);
__m512 fft10 = _mm512_sub_ps(fft1, fft5);
__m512 fft98 = _mm512_sub_ps(fft89, fft93);
__m512 fft11 = _mm512_add_ps(fft3, fft7);
__m512 fft99 = _mm512_add_ps(fft91, fft95);
__m512 fft12 = _mm512_sub_ps(fft7, fft3);
__m512 fft100 = _mm512_sub_ps(fft95, fft91);
__m512 fft13 = _mm512_sub_ps(fft4, fft8);
__m512 fft101 = _mm512_sub_ps(fft92, fft96);
__m512 fft14 = _mm512_add_ps(fft4, fft8);
__m512 fft102 = _mm512_add_ps(fft92, fft96);
__m512 fft15 = _mm512_add_ps(fft9, fft11);
__m512 fft103 = _mm512_add_ps(fft97, fft99);
__m512 fft16 = _mm512_sub_ps(fft9, fft11);
__m512 fft104 = _mm512_sub_ps(fft97, fft99);
__m512 fft17 = _mm512_fmadd_ps(fft13, _mm512_set1_ps(7.0710677e-01f), fft2);
__m512 fft105 = _mm512_fmadd_ps(fft101, _mm512_set1_ps(7.0710677e-01f), fft90);
__m512 fft18 = _mm512_fnmsub_ps(fft14, _mm512_set1_ps(7.0710677e-01f), fft6);
__m512 fft106 = _mm512_fnmsub_ps(fft102, _mm512_set1_ps(7.0710677e-01f), fft94);
__m512 fft19 = _mm512_fnmadd_ps(fft13, _mm512_set1_ps(7.0710677e-01f), fft2);
__m512 fft107 = _mm512_fnmadd_ps(fft101, _mm512_set1_ps(7.0710677e-01f), fft90);
__m512 fft20 = _mm512_fnmadd_ps(fft14, _mm512_set1_ps(7.0710677e-01f), fft6);
__m512 fft108 = _mm512_fnmadd_ps(fft102, _mm512_set1_ps(7.0710677e-01f), fft94);
__m512 fft21 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft22 = _mm512_fmadd_ps(fft15, fft21, _mm512_shuffle_f32x4(fft15, fft15, 78));
__m512 fft109 = _mm512_fmadd_ps(fft103, fft21, _mm512_shuffle_f32x4(fft103, fft103, 78));
__m512 fft23 = _mm512_fmadd_ps(fft16, fft21, _mm512_shuffle_f32x4(fft16, fft16, 78));
__m512 fft110 = _mm512_fmadd_ps(fft104, fft21, _mm512_shuffle_f32x4(fft104, fft104, 78));
__m512 fft24 = _mm512_fmadd_ps(fft17, fft21, _mm512_shuffle_f32x4(fft17, fft17, 78));
__m512 fft111 = _mm512_fmadd_ps(fft105, fft21, _mm512_shuffle_f32x4(fft105, fft105, 78));
__m512 fft25 = _mm512_fmadd_ps(fft18, fft21, _mm512_shuffle_f32x4(fft18, fft18, 78));
__m512 fft112 = _mm512_fmadd_ps(fft106, fft21, _mm512_shuffle_f32x4(fft106, fft106, 78));
__m512 fft26 = _mm512_fmadd_ps(fft10, fft21, _mm512_shuffle_f32x4(fft10, fft10, 78));
__m512 fft113 = _mm512_fmadd_ps(fft98, fft21, _mm512_shuffle_f32x4(fft98, fft98, 78));
__m512 fft27 = _mm512_fmadd_ps(fft12, fft21, _mm512_shuffle_f32x4(fft12, fft12, 78));
__m512 fft114 = _mm512_fmadd_ps(fft100, fft21, _mm512_shuffle_f32x4(fft100, fft100, 78));
__m512 fft28 = _mm512_fmadd_ps(fft19, fft21, _mm512_shuffle_f32x4(fft19, fft19, 78));
__m512 fft115 = _mm512_fmadd_ps(fft107, fft21, _mm512_shuffle_f32x4(fft107, fft107, 78));
__m512 fft29 = _mm512_fmadd_ps(fft20, fft21, _mm512_shuffle_f32x4(fft20, fft20, 78));
__m512 fft116 = _mm512_fmadd_ps(fft108, fft21, _mm512_shuffle_f32x4(fft108, fft108, 78));
__m512 fft30 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft31 = _mm512_mul_ps(fft22, fft30);
__m512 fft117 = _mm512_mul_ps(fft109, fft30);
__m512 fft32 = _mm512_mul_ps(fft23, fft30);
__m512 fft118 = _mm512_mul_ps(fft110, fft30);
__m512 fft33 = _mm512_mul_ps(fft24, fft30);
__m512 fft119 = _mm512_mul_ps(fft111, fft30);
__m512 fft34 = _mm512_mul_ps(fft25, fft30);
__m512 fft120 = _mm512_mul_ps(fft112, fft30);
__m512 fft35 = _mm512_mul_ps(fft26, fft30);
__m512 fft121 = _mm512_mul_ps(fft113, fft30);
__m512 fft36 = _mm512_mul_ps(fft27, fft30);
__m512 fft122 = _mm512_mul_ps(fft114, fft30);
__m512 fft37 = _mm512_mul_ps(fft28, fft30);
__m512 fft123 = _mm512_mul_ps(fft115, fft30);
__m512 fft38 = _mm512_mul_ps(fft29, fft30);
__m512 fft124 = _mm512_mul_ps(fft116, fft30);
__m512 fft39 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft40 = _mm512_fmadd_ps(fft23, fft39, fft31);
__m512 fft125 = _mm512_fmadd_ps(fft110, fft39, fft117);
__m512 fft41 = _mm512_fnmadd_ps(fft22, fft39, fft32);
__m512 fft126 = _mm512_fnmadd_ps(fft109, fft39, fft118);
__m512 fft42 = _mm512_fmadd_ps(fft25, fft39, fft33);
__m512 fft127 = _mm512_fmadd_ps(fft112, fft39, fft119);
__m512 fft43 = _mm512_fnmadd_ps(fft24, fft39, fft34);
__m512 fft128 = _mm512_fnmadd_ps(fft111, fft39, fft120);
__m512 fft44 = _mm512_fmadd_ps(fft27, fft39, fft35);
__m512 fft129 = _mm512_fmadd_ps(fft114, fft39, fft121);
__m512 fft45 = _mm512_fnmadd_ps(fft26, fft39, fft36);
__m512 fft130 = _mm512_fnmadd_ps(fft113, fft39, fft122);
__m512 fft46 = _mm512_fmadd_ps(fft29, fft39, fft37);
__m512 fft131 = _mm512_fmadd_ps(fft116, fft39, fft123);
__m512 fft47 = _mm512_fnmadd_ps(fft28, fft39, fft38);
__m512 fft132 = _mm512_fnmadd_ps(fft115, fft39, fft124);
__m512 fft48 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft49 = _mm512_fmadd_ps(fft40, fft48, _mm512_shuffle_f32x4(fft40, fft40, 177));
__m512 fft133 = _mm512_fmadd_ps(fft125, fft48, _mm512_shuffle_f32x4(fft125, fft125, 177));
__m512 fft50 = _mm512_fmadd_ps(fft41, fft48, _mm512_shuffle_f32x4(fft41, fft41, 177));
__m512 fft134 = _mm512_fmadd_ps(fft126, fft48, _mm512_shuffle_f32x4(fft126, fft126, 177));
__m512 fft51 = _mm512_fmadd_ps(fft42, fft48, _mm512_shuffle_f32x4(fft42, fft42, 177));
__m512 fft135 = _mm512_fmadd_ps(fft127, fft48, _mm512_shuffle_f32x4(fft127, fft127, 177));
__m512 fft52 = _mm512_fmadd_ps(fft43, fft48, _mm512_shuffle_f32x4(fft43, fft43, 177));
__m512 fft136 = _mm512_fmadd_ps(fft128, fft48, _mm512_shuffle_f32x4(fft128, fft128, 177));
__m512 fft53 = _mm512_fmadd_ps(fft44, fft48, _mm512_shuffle_f32x4(fft44, fft44, 177));
__m512 fft137 = _mm512_fmadd_ps(fft129, fft48, _mm512_shuffle_f32x4(fft129, fft129, 177));
__m512 fft54 = _mm512_fmadd_ps(fft45, fft48, _mm512_shuffle_f32x4(fft45, fft45, 177));
__m512 fft138 = _mm512_fmadd_ps(fft130, fft48, _mm512_shuffle_f32x4(fft130, fft130, 177));
__m512 fft55 = _mm512_fmadd_ps(fft46, fft48, _mm512_shuffle_f32x4(fft46, fft46, 177));
__m512 fft139 = _mm512_fmadd_ps(fft131, fft48, _mm512_shuffle_f32x4(fft131, fft131, 177));
__m512 fft56 = _mm512_fmadd_ps(fft47, fft48, _mm512_shuffle_f32x4(fft47, fft47, 177));
__m512 fft140 = _mm512_fmadd_ps(fft132, fft48, _mm512_shuffle_f32x4(fft132, fft132, 177));
__m512 fft57 = _mm512_mask_mov_ps(fft49, 49344, fft50);
__m512 fft141 = _mm512_mask_mov_ps(fft133, 49344, fft134);
__m512 fft58 = _mm512_mask_sub_ps(fft50, 49344, _mm512_setzero_ps(), fft49);
__m512 fft142 = _mm512_mask_sub_ps(fft134, 49344, _mm512_setzero_ps(), fft133);
__m512 fft59 = _mm512_mask_mov_ps(fft51, 49344, fft52);
__m512 fft143 = _mm512_mask_mov_ps(fft135, 49344, fft136);
__m512 fft60 = _mm512_mask_sub_ps(fft52, 49344, _mm512_setzero_ps(), fft51);
__m512 fft144 = _mm512_mask_sub_ps(fft136, 49344, _mm512_setzero_ps(), fft135);
__m512 fft61 = _mm512_mask_mov_ps(fft53, 49344, fft54);
__m512 fft145 = _mm512_mask_mov_ps(fft137, 49344, fft138);
__m512 fft62 = _mm512_mask_sub_ps(fft54, 49344, _mm512_setzero_ps(), fft53);
__m512 fft146 = _mm512_mask_sub_ps(fft138, 49344, _mm512_setzero_ps(), fft137);
__m512 fft63 = _mm512_mask_mov_ps(fft55, 49344, fft56);
__m512 fft147 = _mm512_mask_mov_ps(fft139, 49344, fft140);
__m512 fft64 = _mm512_mask_sub_ps(fft56, 49344, _mm512_setzero_ps(), fft55);
__m512 fft148 = _mm512_mask_sub_ps(fft140, 49344, _mm512_setzero_ps(), fft139);
__m512 fft65 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft66 = _mm512_fmadd_ps(fft57, fft65, _mm512_shuffle_ps(fft57, fft57, 78));
__m512 fft149 = _mm512_fmadd_ps(fft141, fft65, _mm512_shuffle_ps(fft141, fft141, 78));
__m512 fft67 = _mm512_fmadd_ps(fft58, fft65, _mm512_shuffle_ps(fft58, fft58, 78));
__m512 fft150 = _mm512_fmadd_ps(fft142, fft65, _mm512_shuffle_ps(fft142, fft142, 78));
__m512 fft68 = _mm512_fmadd_ps(fft59, fft65, _mm512_shuffle_ps(fft59, fft59, 78));
__m512 fft151 = _mm512_fmadd_ps(fft143, fft65, _mm512_shuffle_ps(fft143, fft143, 78));
__m512 fft69 = _mm512_fmadd_ps(fft60, fft65, _mm512_shuffle_ps(fft60, fft60, 78));
__m512 fft152 = _mm512_fmadd_ps(fft144, fft65, _mm512_shuffle_ps(fft144, fft144, 78));
__m512 fft70 = _mm512_fmadd_ps(fft61, fft65, _mm512_shuffle_ps(fft61, fft61, 78));
__m512 fft153 = _mm512_fmadd_ps(fft145, fft65, _mm512_shuffle_ps(fft145, fft145, 78));
__m512 fft71 = _mm512_fmadd_ps(fft62, fft65, _mm512_shuffle_ps(fft62, fft62, 78));
__m512 fft154 = _mm512_fmadd_ps(fft146, fft65, _mm512_shuffle_ps(fft146, fft146, 78));
__m512 fft72 = _mm512_fmadd_ps(fft63, fft65, _mm512_shuffle_ps(fft63, fft63, 78));
__m512 fft155 = _mm512_fmadd_ps(fft147, fft65, _mm512_shuffle_ps(fft147, fft147, 78));
__m512 fft73 = _mm512_fmadd_ps(fft64, fft65, _mm512_shuffle_ps(fft64, fft64, 78));
__m512 fft156 = _mm512_fmadd_ps(fft148, fft65, _mm512_shuffle_ps(fft148, fft148, 78));
__m512i fft74 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft75 = _mm512_permutexvar_ps(fft74, fft66);
__m512 fft157 = _mm512_permutexvar_ps(fft74, fft149);
__m512i fft76 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft77 = _mm512_permutexvar_ps(fft76, fft66);
__m512 fft158 = _mm512_permutexvar_ps(fft76, fft149);
__m512 fft78 = _mm512_permutexvar_ps(fft74, fft67);
__m512 fft159 = _mm512_permutexvar_ps(fft74, fft150);
__m512 fft79 = _mm512_permutexvar_ps(fft76, fft67);
__m512 fft160 = _mm512_permutexvar_ps(fft76, fft150);
__m512 fft80 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft81 = _mm512_fmadd_ps(fft75, fft80, fft77);
__m512 fft161 = _mm512_fmadd_ps(fft157, fft80, fft158);
__m512 fft82 = _mm512_fnmadd_ps(fft79, fft80, fft78);
__m512 fft162 = _mm512_fnmadd_ps(fft160, fft80, fft159);
__m512 fft83 = _mm512_mask_mov_ps(fft79, 21845, fft81);
__m512 fft163 = _mm512_mask_mov_ps(fft160, 21845, fft161);
__m512 fft84 = _mm512_mask_mov_ps(fft75, 43176, fft81);
__m512 fft164 = _mm512_mask_mov_ps(fft157, 43176, fft161);
__m512 fft85 = _mm512_mask_mov_ps(fft83, 43176, fft82);
__m512 fft165 = _mm512_mask_mov_ps(fft163, 43176, fft162);
__m512 fft86 = _mm512_mask_mov_ps(fft84, 22102, fft82);
__m512 fft166 = _mm512_mask_mov_ps(fft164, 22102, fft162);
__m512 fft87 = _mm512_mask_mul_ps(fft85, 64764, fft85, _mm512_set1_ps(5e-01f));
__m512 fft167 = _mm512_mask_mul_ps(fft165, 64764, fft165, _mm512_set1_ps(5e-01f));
__m512 fft88 = _mm512_mask_mul_ps(fft86, 64764, fft86, _mm512_set1_ps(5e-01f));
__m512 fft168 = _mm512_mask_mul_ps(fft166, 64764, fft166, _mm512_set1_ps(5e-01f));
__m512 wf1 = fft87;
__m512 wf9 = fft167;
__m512 wf2 = fft88;
__m512 wf10 = fft168;
__m512 wf3 = fft68;
__m512 wf11 = fft151;
__m512 wf4 = fft69;
__m512 wf12 = fft152;
__m512 wf5 = fft70;
__m512 wf13 = fft153;
__m512 wf6 = fft71;
__m512 wf14 = fft154;
__m512 wf7 = fft72;
__m512 wf15 = fft155;
__m512 wf8 = fft73;
__m512 wf16 = fft156;
ptrdiff_t c1 = (size_t)(0+2*j1)/4;
ptrdiff_t m1 = (size_t)(0+2*j1)%4/2;
ptrdiff_t f2 = (size_t)(0+2*j1)%2;
__m512i eo1 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf3 = _mm512_permutexvar_ps(eo1, wf3);
wf4 = _mm512_permutexvar_ps(eo1, wf4);
__m512i wfs1 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf3, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs1 = _mm512_inserti64x4(wfs1, _mm512_cvtps_ph(wf4, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+159424+637696*i6+6784*c1+128*k1+64*m1+16*f2, 3855, wfs1);
_mm512_mask_storeu_epi32(wfPtr1+797104+637696*i6+6784*c1+128*k1+64*m1+16*f2, 61680, wfs1);
wf11 = _mm512_permutexvar_ps(eo1, wf11);
wf12 = _mm512_permutexvar_ps(eo1, wf12);
__m512i wfs2 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf11, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs2 = _mm512_inserti64x4(wfs2, _mm512_cvtps_ph(wf12, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+1434816+637696*i6+6784*c1+128*k1+64*m1+16*f2, 3855, wfs2);
_mm512_mask_storeu_epi32(wfPtr1+2072496+637696*i6+6784*c1+128*k1+64*m1+16*f2, 61680, wfs2);
wf5 = _mm512_permutexvar_ps(eo1, wf5);
wf6 = _mm512_permutexvar_ps(eo1, wf6);
__m512i wfs3 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf5, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs3 = _mm512_inserti64x4(wfs3, _mm512_cvtps_ph(wf6, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+318848+637696*i6+6784*c1+128*k1+64*m1+16*f2, 3855, wfs3);
_mm512_mask_storeu_epi32(wfPtr1+956528+637696*i6+6784*c1+128*k1+64*m1+16*f2, 61680, wfs3);
wf13 = _mm512_permutexvar_ps(eo1, wf13);
wf14 = _mm512_permutexvar_ps(eo1, wf14);
__m512i wfs4 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf13, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs4 = _mm512_inserti64x4(wfs4, _mm512_cvtps_ph(wf14, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+1594240+637696*i6+6784*c1+128*k1+64*m1+16*f2, 3855, wfs4);
_mm512_mask_storeu_epi32(wfPtr1+2231920+637696*i6+6784*c1+128*k1+64*m1+16*f2, 61680, wfs4);
wf7 = _mm512_permutexvar_ps(eo1, wf7);
wf8 = _mm512_permutexvar_ps(eo1, wf8);
__m512i wfs5 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf7, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs5 = _mm512_inserti64x4(wfs5, _mm512_cvtps_ph(wf8, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+478272+637696*i6+6784*c1+128*k1+64*m1+16*f2, 3855, wfs5);
_mm512_mask_storeu_epi32(wfPtr1+1115952+637696*i6+6784*c1+128*k1+64*m1+16*f2, 61680, wfs5);
wf15 = _mm512_permutexvar_ps(eo1, wf15);
wf16 = _mm512_permutexvar_ps(eo1, wf16);
__m512i wfs6 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf15, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs6 = _mm512_inserti64x4(wfs6, _mm512_cvtps_ph(wf16, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+1753664+637696*i6+6784*c1+128*k1+64*m1+16*f2, 3855, wfs6);
_mm512_mask_storeu_epi32(wfPtr1+2391344+637696*i6+6784*c1+128*k1+64*m1+16*f2, 61680, wfs6);
__m512i wfs7 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs7 = _mm512_inserti64x4(wfs7, _mm512_cvtps_ph(wf2, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+0+637696*i6+6784*c1+128*k1+64*m1+16*f2, 3855, wfs7);
_mm512_mask_storeu_epi32(wfPtr1+637680+637696*i6+6784*c1+128*k1+64*m1+16*f2, 61680, wfs7);
__m512i wfs8 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf9, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs8 = _mm512_inserti64x4(wfs8, _mm512_cvtps_ph(wf10, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+1275392+637696*i6+6784*c1+128*k1+64*m1+16*f2, 3855, wfs8);
_mm512_mask_storeu_epi32(wfPtr1+1913072+637696*i6+6784*c1+128*k1+64*m1+16*f2, 61680, wfs8);
__m512 wt8 = _mm512_maskz_loadu_ps(127, wtPtr1+10388+966084*i6+20776*j1+196*k1);
__m512 wt9 = _mm512_maskz_loadu_ps(127, wtPtr1+10416+966084*i6+20776*j1+196*k1);
__m512 wt10 = _mm512_maskz_loadu_ps(127, wtPtr1+10444+966084*i6+20776*j1+196*k1);
__m512 wt11 = _mm512_maskz_loadu_ps(127, wtPtr1+10472+966084*i6+20776*j1+196*k1);
__m512 wt12 = _mm512_maskz_loadu_ps(127, wtPtr1+10500+966084*i6+20776*j1+196*k1);
__m512 wt13 = _mm512_maskz_loadu_ps(127, wtPtr1+10528+966084*i6+20776*j1+196*k1);
__m512 wt14 = _mm512_maskz_loadu_ps(127, wtPtr1+10556+966084*i6+20776*j1+196*k1);
wt8 = _mm512_mul_ps(postMul2, wt8);
wt9 = _mm512_mul_ps(postMul2, wt9);
wt10 = _mm512_mul_ps(postMul2, wt10);
wt11 = _mm512_mul_ps(postMul2, wt11);
wt12 = _mm512_mul_ps(postMul2, wt12);
wt13 = _mm512_mul_ps(postMul2, wt13);
wt14 = _mm512_mul_ps(postMul2, wt14);
bf2 = _mm512_fmadd_ps(preAdd1, wt8, bf2);
wt8 = _mm512_mul_ps(preMul1, wt8);
bf2 = _mm512_fmadd_ps(preAdd1, wt9, bf2);
wt9 = _mm512_mul_ps(preMul1, wt9);
bf2 = _mm512_fmadd_ps(preAdd1, wt10, bf2);
wt10 = _mm512_mul_ps(preMul1, wt10);
bf2 = _mm512_fmadd_ps(preAdd1, wt11, bf2);
wt11 = _mm512_mul_ps(preMul1, wt11);
bf2 = _mm512_fmadd_ps(preAdd1, wt12, bf2);
wt12 = _mm512_mul_ps(preMul1, wt12);
bf2 = _mm512_fmadd_ps(preAdd1, wt13, bf2);
wt13 = _mm512_mul_ps(preMul1, wt13);
bf2 = _mm512_fmadd_ps(preAdd1, wt14, bf2);
wt14 = _mm512_mul_ps(preMul1, wt14);
__m512 fft169 = _mm512_add_ps(wt8, _mm512_setzero_ps());
__m512 fft257 = _mm512_add_ps(wt9, _mm512_setzero_ps());
__m512 fft170 = _mm512_sub_ps(wt8, _mm512_setzero_ps());
__m512 fft258 = _mm512_sub_ps(wt9, _mm512_setzero_ps());
__m512 fft171 = _mm512_add_ps(wt10, _mm512_setzero_ps());
__m512 fft259 = _mm512_add_ps(wt11, _mm512_setzero_ps());
__m512 fft172 = _mm512_sub_ps(wt10, _mm512_setzero_ps());
__m512 fft260 = _mm512_sub_ps(wt11, _mm512_setzero_ps());
__m512 fft173 = _mm512_add_ps(wt12, _mm512_setzero_ps());
__m512 fft261 = _mm512_add_ps(wt13, _mm512_setzero_ps());
__m512 fft174 = _mm512_sub_ps(wt12, _mm512_setzero_ps());
__m512 fft262 = _mm512_sub_ps(wt13, _mm512_setzero_ps());
__m512 fft175 = _mm512_add_ps(wt14, _mm512_setzero_ps());
__m512 fft263 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft176 = _mm512_sub_ps(wt14, _mm512_setzero_ps());
__m512 fft264 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft177 = _mm512_add_ps(fft169, fft173);
__m512 fft265 = _mm512_add_ps(fft257, fft261);
__m512 fft178 = _mm512_sub_ps(fft169, fft173);
__m512 fft266 = _mm512_sub_ps(fft257, fft261);
__m512 fft179 = _mm512_add_ps(fft171, fft175);
__m512 fft267 = _mm512_add_ps(fft259, fft263);
__m512 fft180 = _mm512_sub_ps(fft175, fft171);
__m512 fft268 = _mm512_sub_ps(fft263, fft259);
__m512 fft181 = _mm512_sub_ps(fft172, fft176);
__m512 fft269 = _mm512_sub_ps(fft260, fft264);
__m512 fft182 = _mm512_add_ps(fft172, fft176);
__m512 fft270 = _mm512_add_ps(fft260, fft264);
__m512 fft183 = _mm512_add_ps(fft177, fft179);
__m512 fft271 = _mm512_add_ps(fft265, fft267);
__m512 fft184 = _mm512_sub_ps(fft177, fft179);
__m512 fft272 = _mm512_sub_ps(fft265, fft267);
__m512 fft185 = _mm512_fmadd_ps(fft181, _mm512_set1_ps(7.0710677e-01f), fft170);
__m512 fft273 = _mm512_fmadd_ps(fft269, _mm512_set1_ps(7.0710677e-01f), fft258);
__m512 fft186 = _mm512_fnmsub_ps(fft182, _mm512_set1_ps(7.0710677e-01f), fft174);
__m512 fft274 = _mm512_fnmsub_ps(fft270, _mm512_set1_ps(7.0710677e-01f), fft262);
__m512 fft187 = _mm512_fnmadd_ps(fft181, _mm512_set1_ps(7.0710677e-01f), fft170);
__m512 fft275 = _mm512_fnmadd_ps(fft269, _mm512_set1_ps(7.0710677e-01f), fft258);
__m512 fft188 = _mm512_fnmadd_ps(fft182, _mm512_set1_ps(7.0710677e-01f), fft174);
__m512 fft276 = _mm512_fnmadd_ps(fft270, _mm512_set1_ps(7.0710677e-01f), fft262);
__m512 fft189 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft190 = _mm512_fmadd_ps(fft183, fft189, _mm512_shuffle_f32x4(fft183, fft183, 78));
__m512 fft277 = _mm512_fmadd_ps(fft271, fft189, _mm512_shuffle_f32x4(fft271, fft271, 78));
__m512 fft191 = _mm512_fmadd_ps(fft184, fft189, _mm512_shuffle_f32x4(fft184, fft184, 78));
__m512 fft278 = _mm512_fmadd_ps(fft272, fft189, _mm512_shuffle_f32x4(fft272, fft272, 78));
__m512 fft192 = _mm512_fmadd_ps(fft185, fft189, _mm512_shuffle_f32x4(fft185, fft185, 78));
__m512 fft279 = _mm512_fmadd_ps(fft273, fft189, _mm512_shuffle_f32x4(fft273, fft273, 78));
__m512 fft193 = _mm512_fmadd_ps(fft186, fft189, _mm512_shuffle_f32x4(fft186, fft186, 78));
__m512 fft280 = _mm512_fmadd_ps(fft274, fft189, _mm512_shuffle_f32x4(fft274, fft274, 78));
__m512 fft194 = _mm512_fmadd_ps(fft178, fft189, _mm512_shuffle_f32x4(fft178, fft178, 78));
__m512 fft281 = _mm512_fmadd_ps(fft266, fft189, _mm512_shuffle_f32x4(fft266, fft266, 78));
__m512 fft195 = _mm512_fmadd_ps(fft180, fft189, _mm512_shuffle_f32x4(fft180, fft180, 78));
__m512 fft282 = _mm512_fmadd_ps(fft268, fft189, _mm512_shuffle_f32x4(fft268, fft268, 78));
__m512 fft196 = _mm512_fmadd_ps(fft187, fft189, _mm512_shuffle_f32x4(fft187, fft187, 78));
__m512 fft283 = _mm512_fmadd_ps(fft275, fft189, _mm512_shuffle_f32x4(fft275, fft275, 78));
__m512 fft197 = _mm512_fmadd_ps(fft188, fft189, _mm512_shuffle_f32x4(fft188, fft188, 78));
__m512 fft284 = _mm512_fmadd_ps(fft276, fft189, _mm512_shuffle_f32x4(fft276, fft276, 78));
__m512 fft198 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft199 = _mm512_mul_ps(fft190, fft198);
__m512 fft285 = _mm512_mul_ps(fft277, fft198);
__m512 fft200 = _mm512_mul_ps(fft191, fft198);
__m512 fft286 = _mm512_mul_ps(fft278, fft198);
__m512 fft201 = _mm512_mul_ps(fft192, fft198);
__m512 fft287 = _mm512_mul_ps(fft279, fft198);
__m512 fft202 = _mm512_mul_ps(fft193, fft198);
__m512 fft288 = _mm512_mul_ps(fft280, fft198);
__m512 fft203 = _mm512_mul_ps(fft194, fft198);
__m512 fft289 = _mm512_mul_ps(fft281, fft198);
__m512 fft204 = _mm512_mul_ps(fft195, fft198);
__m512 fft290 = _mm512_mul_ps(fft282, fft198);
__m512 fft205 = _mm512_mul_ps(fft196, fft198);
__m512 fft291 = _mm512_mul_ps(fft283, fft198);
__m512 fft206 = _mm512_mul_ps(fft197, fft198);
__m512 fft292 = _mm512_mul_ps(fft284, fft198);
__m512 fft207 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft208 = _mm512_fmadd_ps(fft191, fft207, fft199);
__m512 fft293 = _mm512_fmadd_ps(fft278, fft207, fft285);
__m512 fft209 = _mm512_fnmadd_ps(fft190, fft207, fft200);
__m512 fft294 = _mm512_fnmadd_ps(fft277, fft207, fft286);
__m512 fft210 = _mm512_fmadd_ps(fft193, fft207, fft201);
__m512 fft295 = _mm512_fmadd_ps(fft280, fft207, fft287);
__m512 fft211 = _mm512_fnmadd_ps(fft192, fft207, fft202);
__m512 fft296 = _mm512_fnmadd_ps(fft279, fft207, fft288);
__m512 fft212 = _mm512_fmadd_ps(fft195, fft207, fft203);
__m512 fft297 = _mm512_fmadd_ps(fft282, fft207, fft289);
__m512 fft213 = _mm512_fnmadd_ps(fft194, fft207, fft204);
__m512 fft298 = _mm512_fnmadd_ps(fft281, fft207, fft290);
__m512 fft214 = _mm512_fmadd_ps(fft197, fft207, fft205);
__m512 fft299 = _mm512_fmadd_ps(fft284, fft207, fft291);
__m512 fft215 = _mm512_fnmadd_ps(fft196, fft207, fft206);
__m512 fft300 = _mm512_fnmadd_ps(fft283, fft207, fft292);
__m512 fft216 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft217 = _mm512_fmadd_ps(fft208, fft216, _mm512_shuffle_f32x4(fft208, fft208, 177));
__m512 fft301 = _mm512_fmadd_ps(fft293, fft216, _mm512_shuffle_f32x4(fft293, fft293, 177));
__m512 fft218 = _mm512_fmadd_ps(fft209, fft216, _mm512_shuffle_f32x4(fft209, fft209, 177));
__m512 fft302 = _mm512_fmadd_ps(fft294, fft216, _mm512_shuffle_f32x4(fft294, fft294, 177));
__m512 fft219 = _mm512_fmadd_ps(fft210, fft216, _mm512_shuffle_f32x4(fft210, fft210, 177));
__m512 fft303 = _mm512_fmadd_ps(fft295, fft216, _mm512_shuffle_f32x4(fft295, fft295, 177));
__m512 fft220 = _mm512_fmadd_ps(fft211, fft216, _mm512_shuffle_f32x4(fft211, fft211, 177));
__m512 fft304 = _mm512_fmadd_ps(fft296, fft216, _mm512_shuffle_f32x4(fft296, fft296, 177));
__m512 fft221 = _mm512_fmadd_ps(fft212, fft216, _mm512_shuffle_f32x4(fft212, fft212, 177));
__m512 fft305 = _mm512_fmadd_ps(fft297, fft216, _mm512_shuffle_f32x4(fft297, fft297, 177));
__m512 fft222 = _mm512_fmadd_ps(fft213, fft216, _mm512_shuffle_f32x4(fft213, fft213, 177));
__m512 fft306 = _mm512_fmadd_ps(fft298, fft216, _mm512_shuffle_f32x4(fft298, fft298, 177));
__m512 fft223 = _mm512_fmadd_ps(fft214, fft216, _mm512_shuffle_f32x4(fft214, fft214, 177));
__m512 fft307 = _mm512_fmadd_ps(fft299, fft216, _mm512_shuffle_f32x4(fft299, fft299, 177));
__m512 fft224 = _mm512_fmadd_ps(fft215, fft216, _mm512_shuffle_f32x4(fft215, fft215, 177));
__m512 fft308 = _mm512_fmadd_ps(fft300, fft216, _mm512_shuffle_f32x4(fft300, fft300, 177));
__m512 fft225 = _mm512_mask_mov_ps(fft217, 49344, fft218);
__m512 fft309 = _mm512_mask_mov_ps(fft301, 49344, fft302);
__m512 fft226 = _mm512_mask_sub_ps(fft218, 49344, _mm512_setzero_ps(), fft217);
__m512 fft310 = _mm512_mask_sub_ps(fft302, 49344, _mm512_setzero_ps(), fft301);
__m512 fft227 = _mm512_mask_mov_ps(fft219, 49344, fft220);
__m512 fft311 = _mm512_mask_mov_ps(fft303, 49344, fft304);
__m512 fft228 = _mm512_mask_sub_ps(fft220, 49344, _mm512_setzero_ps(), fft219);
__m512 fft312 = _mm512_mask_sub_ps(fft304, 49344, _mm512_setzero_ps(), fft303);
__m512 fft229 = _mm512_mask_mov_ps(fft221, 49344, fft222);
__m512 fft313 = _mm512_mask_mov_ps(fft305, 49344, fft306);
__m512 fft230 = _mm512_mask_sub_ps(fft222, 49344, _mm512_setzero_ps(), fft221);
__m512 fft314 = _mm512_mask_sub_ps(fft306, 49344, _mm512_setzero_ps(), fft305);
__m512 fft231 = _mm512_mask_mov_ps(fft223, 49344, fft224);
__m512 fft315 = _mm512_mask_mov_ps(fft307, 49344, fft308);
__m512 fft232 = _mm512_mask_sub_ps(fft224, 49344, _mm512_setzero_ps(), fft223);
__m512 fft316 = _mm512_mask_sub_ps(fft308, 49344, _mm512_setzero_ps(), fft307);
__m512 fft233 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft234 = _mm512_fmadd_ps(fft225, fft233, _mm512_shuffle_ps(fft225, fft225, 78));
__m512 fft317 = _mm512_fmadd_ps(fft309, fft233, _mm512_shuffle_ps(fft309, fft309, 78));
__m512 fft235 = _mm512_fmadd_ps(fft226, fft233, _mm512_shuffle_ps(fft226, fft226, 78));
__m512 fft318 = _mm512_fmadd_ps(fft310, fft233, _mm512_shuffle_ps(fft310, fft310, 78));
__m512 fft236 = _mm512_fmadd_ps(fft227, fft233, _mm512_shuffle_ps(fft227, fft227, 78));
__m512 fft319 = _mm512_fmadd_ps(fft311, fft233, _mm512_shuffle_ps(fft311, fft311, 78));
__m512 fft237 = _mm512_fmadd_ps(fft228, fft233, _mm512_shuffle_ps(fft228, fft228, 78));
__m512 fft320 = _mm512_fmadd_ps(fft312, fft233, _mm512_shuffle_ps(fft312, fft312, 78));
__m512 fft238 = _mm512_fmadd_ps(fft229, fft233, _mm512_shuffle_ps(fft229, fft229, 78));
__m512 fft321 = _mm512_fmadd_ps(fft313, fft233, _mm512_shuffle_ps(fft313, fft313, 78));
__m512 fft239 = _mm512_fmadd_ps(fft230, fft233, _mm512_shuffle_ps(fft230, fft230, 78));
__m512 fft322 = _mm512_fmadd_ps(fft314, fft233, _mm512_shuffle_ps(fft314, fft314, 78));
__m512 fft240 = _mm512_fmadd_ps(fft231, fft233, _mm512_shuffle_ps(fft231, fft231, 78));
__m512 fft323 = _mm512_fmadd_ps(fft315, fft233, _mm512_shuffle_ps(fft315, fft315, 78));
__m512 fft241 = _mm512_fmadd_ps(fft232, fft233, _mm512_shuffle_ps(fft232, fft232, 78));
__m512 fft324 = _mm512_fmadd_ps(fft316, fft233, _mm512_shuffle_ps(fft316, fft316, 78));
__m512i fft242 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft243 = _mm512_permutexvar_ps(fft242, fft234);
__m512 fft325 = _mm512_permutexvar_ps(fft242, fft317);
__m512i fft244 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft245 = _mm512_permutexvar_ps(fft244, fft234);
__m512 fft326 = _mm512_permutexvar_ps(fft244, fft317);
__m512 fft246 = _mm512_permutexvar_ps(fft242, fft235);
__m512 fft327 = _mm512_permutexvar_ps(fft242, fft318);
__m512 fft247 = _mm512_permutexvar_ps(fft244, fft235);
__m512 fft328 = _mm512_permutexvar_ps(fft244, fft318);
__m512 fft248 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft249 = _mm512_fmadd_ps(fft243, fft248, fft245);
__m512 fft329 = _mm512_fmadd_ps(fft325, fft248, fft326);
__m512 fft250 = _mm512_fnmadd_ps(fft247, fft248, fft246);
__m512 fft330 = _mm512_fnmadd_ps(fft328, fft248, fft327);
__m512 fft251 = _mm512_mask_mov_ps(fft247, 21845, fft249);
__m512 fft331 = _mm512_mask_mov_ps(fft328, 21845, fft329);
__m512 fft252 = _mm512_mask_mov_ps(fft243, 43176, fft249);
__m512 fft332 = _mm512_mask_mov_ps(fft325, 43176, fft329);
__m512 fft253 = _mm512_mask_mov_ps(fft251, 43176, fft250);
__m512 fft333 = _mm512_mask_mov_ps(fft331, 43176, fft330);
__m512 fft254 = _mm512_mask_mov_ps(fft252, 22102, fft250);
__m512 fft334 = _mm512_mask_mov_ps(fft332, 22102, fft330);
__m512 fft255 = _mm512_mask_mul_ps(fft253, 64764, fft253, _mm512_set1_ps(5e-01f));
__m512 fft335 = _mm512_mask_mul_ps(fft333, 64764, fft333, _mm512_set1_ps(5e-01f));
__m512 fft256 = _mm512_mask_mul_ps(fft254, 64764, fft254, _mm512_set1_ps(5e-01f));
__m512 fft336 = _mm512_mask_mul_ps(fft334, 64764, fft334, _mm512_set1_ps(5e-01f));
__m512 wf17 = fft255;
__m512 wf25 = fft335;
__m512 wf18 = fft256;
__m512 wf26 = fft336;
__m512 wf19 = fft236;
__m512 wf27 = fft319;
__m512 wf20 = fft237;
__m512 wf28 = fft320;
__m512 wf21 = fft238;
__m512 wf29 = fft321;
__m512 wf22 = fft239;
__m512 wf30 = fft322;
__m512 wf23 = fft240;
__m512 wf31 = fft323;
__m512 wf24 = fft241;
__m512 wf32 = fft324;
ptrdiff_t c2 = (size_t)(1+2*j1)/4;
ptrdiff_t m2 = (size_t)(1+2*j1)%4/2;
ptrdiff_t f3 = (size_t)(1+2*j1)%2;
__m512i eo2 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf19 = _mm512_permutexvar_ps(eo2, wf19);
wf20 = _mm512_permutexvar_ps(eo2, wf20);
__m512i wfs9 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf19, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs9 = _mm512_inserti64x4(wfs9, _mm512_cvtps_ph(wf20, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+159424+637696*i6+6784*c2+128*k1+64*m2+16*f3, 3855, wfs9);
_mm512_mask_storeu_epi32(wfPtr1+797104+637696*i6+6784*c2+128*k1+64*m2+16*f3, 61680, wfs9);
wf27 = _mm512_permutexvar_ps(eo2, wf27);
wf28 = _mm512_permutexvar_ps(eo2, wf28);
__m512i wfs10 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf27, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs10 = _mm512_inserti64x4(wfs10, _mm512_cvtps_ph(wf28, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+1434816+637696*i6+6784*c2+128*k1+64*m2+16*f3, 3855, wfs10);
_mm512_mask_storeu_epi32(wfPtr1+2072496+637696*i6+6784*c2+128*k1+64*m2+16*f3, 61680, wfs10);
wf21 = _mm512_permutexvar_ps(eo2, wf21);
wf22 = _mm512_permutexvar_ps(eo2, wf22);
__m512i wfs11 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf21, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs11 = _mm512_inserti64x4(wfs11, _mm512_cvtps_ph(wf22, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+318848+637696*i6+6784*c2+128*k1+64*m2+16*f3, 3855, wfs11);
_mm512_mask_storeu_epi32(wfPtr1+956528+637696*i6+6784*c2+128*k1+64*m2+16*f3, 61680, wfs11);
wf29 = _mm512_permutexvar_ps(eo2, wf29);
wf30 = _mm512_permutexvar_ps(eo2, wf30);
__m512i wfs12 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf29, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs12 = _mm512_inserti64x4(wfs12, _mm512_cvtps_ph(wf30, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+1594240+637696*i6+6784*c2+128*k1+64*m2+16*f3, 3855, wfs12);
_mm512_mask_storeu_epi32(wfPtr1+2231920+637696*i6+6784*c2+128*k1+64*m2+16*f3, 61680, wfs12);
wf23 = _mm512_permutexvar_ps(eo2, wf23);
wf24 = _mm512_permutexvar_ps(eo2, wf24);
__m512i wfs13 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf23, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs13 = _mm512_inserti64x4(wfs13, _mm512_cvtps_ph(wf24, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+478272+637696*i6+6784*c2+128*k1+64*m2+16*f3, 3855, wfs13);
_mm512_mask_storeu_epi32(wfPtr1+1115952+637696*i6+6784*c2+128*k1+64*m2+16*f3, 61680, wfs13);
wf31 = _mm512_permutexvar_ps(eo2, wf31);
wf32 = _mm512_permutexvar_ps(eo2, wf32);
__m512i wfs14 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf31, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs14 = _mm512_inserti64x4(wfs14, _mm512_cvtps_ph(wf32, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+1753664+637696*i6+6784*c2+128*k1+64*m2+16*f3, 3855, wfs14);
_mm512_mask_storeu_epi32(wfPtr1+2391344+637696*i6+6784*c2+128*k1+64*m2+16*f3, 61680, wfs14);
__m512i wfs15 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf17, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs15 = _mm512_inserti64x4(wfs15, _mm512_cvtps_ph(wf18, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+0+637696*i6+6784*c2+128*k1+64*m2+16*f3, 3855, wfs15);
_mm512_mask_storeu_epi32(wfPtr1+637680+637696*i6+6784*c2+128*k1+64*m2+16*f3, 61680, wfs15);
__m512i wfs16 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf25, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs16 = _mm512_inserti64x4(wfs16, _mm512_cvtps_ph(wf26, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+1275392+637696*i6+6784*c2+128*k1+64*m2+16*f3, 3855, wfs16);
_mm512_mask_storeu_epi32(wfPtr1+1913072+637696*i6+6784*c2+128*k1+64*m2+16*f3, 61680, wfs16);
}
__m512 upper3 = _mm512_shuffle_f32x4(bf1, bf1, 14);
bf1 = _mm512_add_ps(bf1, upper3);
__m512 upper5 = _mm512_shuffle_f32x4(bf2, bf2, 14);
bf2 = _mm512_add_ps(bf2, upper5);
__m512 upper2 = _mm512_shuffle_f32x4(bf1, bf1, 1);
__m512 upper4 = _mm512_shuffle_f32x4(bf2, bf2, 1);
bf1 = _mm512_add_ps(bf1, upper2);
bf2 = _mm512_add_ps(bf2, upper4);
__m512 upper1 = _mm512_shuffle_ps(bf1, bf2, 238);
bf1 = _mm512_shuffle_ps(bf1, bf2, 68);
bf1 = _mm512_add_ps(bf1, upper1);
__m512 upper6 = _mm512_shuffle_ps(bf1, bf1, 13);
bf1 = _mm512_shuffle_ps(bf1, bf1, 8);
bf1 = _mm512_add_ps(bf1, upper6);
__m512 bias1 = _mm512_setzero_ps();
if (!e1) {
bias1 = _mm512_maskz_loadu_ps(3, biasPtr1-0+372*i6+8*j1);
__m512i pmMul1 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd1 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 mas3 = _mm512_maskz_loadu_ps(15, bnPtr2+(ptrdiff_t)8*(0+93*i6+2*j1));
__m512 postMul3 = _mm512_permutexvar_ps(pmMul1, mas3);
__m512 postAdd1 = _mm512_permutexvar_ps(pmAdd1, mas3);
bias1 = _mm512_fmadd_ps(bias1, postMul3, postAdd1);
bf1 = _mm512_add_ps(bf1, bias1);
}
bf1 = _mm512_mul_ps(bf1, _mm512_set1_ps(6.4e+01f));
_mm512_mask_storeu_ps(bfPtr1-0+376*i6+8*j1, 3, bf1);
if (j1 >= jj1) return;
}
}
if (j1 == 46) {
__m512 bf3 = _mm512_setzero_ps();
__m512 postMul4 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(0+93*i6+2*j1))[0]);
for (ptrdiff_t k2 = 0; k2 < 53; ++k2) {
__m512 preMul2 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(k2+53*i6))[0]);
__m512 preAdd2 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(k2+53*i6))[1]);
__m512 wt15 = _mm512_maskz_loadu_ps(127, wtPtr1+0+966084*i6+20776*j1+196*k2);
__m512 wt16 = _mm512_maskz_loadu_ps(127, wtPtr1+28+966084*i6+20776*j1+196*k2);
__m512 wt17 = _mm512_maskz_loadu_ps(127, wtPtr1+56+966084*i6+20776*j1+196*k2);
__m512 wt18 = _mm512_maskz_loadu_ps(127, wtPtr1+84+966084*i6+20776*j1+196*k2);
__m512 wt19 = _mm512_maskz_loadu_ps(127, wtPtr1+112+966084*i6+20776*j1+196*k2);
__m512 wt20 = _mm512_maskz_loadu_ps(127, wtPtr1+140+966084*i6+20776*j1+196*k2);
__m512 wt21 = _mm512_maskz_loadu_ps(127, wtPtr1+168+966084*i6+20776*j1+196*k2);
wt15 = _mm512_mul_ps(postMul4, wt15);
wt16 = _mm512_mul_ps(postMul4, wt16);
wt17 = _mm512_mul_ps(postMul4, wt17);
wt18 = _mm512_mul_ps(postMul4, wt18);
wt19 = _mm512_mul_ps(postMul4, wt19);
wt20 = _mm512_mul_ps(postMul4, wt20);
wt21 = _mm512_mul_ps(postMul4, wt21);
bf3 = _mm512_fmadd_ps(preAdd2, wt15, bf3);
wt15 = _mm512_mul_ps(preMul2, wt15);
bf3 = _mm512_fmadd_ps(preAdd2, wt16, bf3);
wt16 = _mm512_mul_ps(preMul2, wt16);
bf3 = _mm512_fmadd_ps(preAdd2, wt17, bf3);
wt17 = _mm512_mul_ps(preMul2, wt17);
bf3 = _mm512_fmadd_ps(preAdd2, wt18, bf3);
wt18 = _mm512_mul_ps(preMul2, wt18);
bf3 = _mm512_fmadd_ps(preAdd2, wt19, bf3);
wt19 = _mm512_mul_ps(preMul2, wt19);
bf3 = _mm512_fmadd_ps(preAdd2, wt20, bf3);
wt20 = _mm512_mul_ps(preMul2, wt20);
bf3 = _mm512_fmadd_ps(preAdd2, wt21, bf3);
wt21 = _mm512_mul_ps(preMul2, wt21);
__m512 fft337 = _mm512_add_ps(wt15, _mm512_setzero_ps());
__m512 fft425 = _mm512_add_ps(wt16, _mm512_setzero_ps());
__m512 fft338 = _mm512_sub_ps(wt15, _mm512_setzero_ps());
__m512 fft426 = _mm512_sub_ps(wt16, _mm512_setzero_ps());
__m512 fft339 = _mm512_add_ps(wt17, _mm512_setzero_ps());
__m512 fft427 = _mm512_add_ps(wt18, _mm512_setzero_ps());
__m512 fft340 = _mm512_sub_ps(wt17, _mm512_setzero_ps());
__m512 fft428 = _mm512_sub_ps(wt18, _mm512_setzero_ps());
__m512 fft341 = _mm512_add_ps(wt19, _mm512_setzero_ps());
__m512 fft429 = _mm512_add_ps(wt20, _mm512_setzero_ps());
__m512 fft342 = _mm512_sub_ps(wt19, _mm512_setzero_ps());
__m512 fft430 = _mm512_sub_ps(wt20, _mm512_setzero_ps());
__m512 fft343 = _mm512_add_ps(wt21, _mm512_setzero_ps());
__m512 fft431 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft344 = _mm512_sub_ps(wt21, _mm512_setzero_ps());
__m512 fft432 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft345 = _mm512_add_ps(fft337, fft341);
__m512 fft433 = _mm512_add_ps(fft425, fft429);
__m512 fft346 = _mm512_sub_ps(fft337, fft341);
__m512 fft434 = _mm512_sub_ps(fft425, fft429);
__m512 fft347 = _mm512_add_ps(fft339, fft343);
__m512 fft435 = _mm512_add_ps(fft427, fft431);
__m512 fft348 = _mm512_sub_ps(fft343, fft339);
__m512 fft436 = _mm512_sub_ps(fft431, fft427);
__m512 fft349 = _mm512_sub_ps(fft340, fft344);
__m512 fft437 = _mm512_sub_ps(fft428, fft432);
__m512 fft350 = _mm512_add_ps(fft340, fft344);
__m512 fft438 = _mm512_add_ps(fft428, fft432);
__m512 fft351 = _mm512_add_ps(fft345, fft347);
__m512 fft439 = _mm512_add_ps(fft433, fft435);
__m512 fft352 = _mm512_sub_ps(fft345, fft347);
__m512 fft440 = _mm512_sub_ps(fft433, fft435);
__m512 fft353 = _mm512_fmadd_ps(fft349, _mm512_set1_ps(7.0710677e-01f), fft338);
__m512 fft441 = _mm512_fmadd_ps(fft437, _mm512_set1_ps(7.0710677e-01f), fft426);
__m512 fft354 = _mm512_fnmsub_ps(fft350, _mm512_set1_ps(7.0710677e-01f), fft342);
__m512 fft442 = _mm512_fnmsub_ps(fft438, _mm512_set1_ps(7.0710677e-01f), fft430);
__m512 fft355 = _mm512_fnmadd_ps(fft349, _mm512_set1_ps(7.0710677e-01f), fft338);
__m512 fft443 = _mm512_fnmadd_ps(fft437, _mm512_set1_ps(7.0710677e-01f), fft426);
__m512 fft356 = _mm512_fnmadd_ps(fft350, _mm512_set1_ps(7.0710677e-01f), fft342);
__m512 fft444 = _mm512_fnmadd_ps(fft438, _mm512_set1_ps(7.0710677e-01f), fft430);
__m512 fft357 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft358 = _mm512_fmadd_ps(fft351, fft357, _mm512_shuffle_f32x4(fft351, fft351, 78));
__m512 fft445 = _mm512_fmadd_ps(fft439, fft357, _mm512_shuffle_f32x4(fft439, fft439, 78));
__m512 fft359 = _mm512_fmadd_ps(fft352, fft357, _mm512_shuffle_f32x4(fft352, fft352, 78));
__m512 fft446 = _mm512_fmadd_ps(fft440, fft357, _mm512_shuffle_f32x4(fft440, fft440, 78));
__m512 fft360 = _mm512_fmadd_ps(fft353, fft357, _mm512_shuffle_f32x4(fft353, fft353, 78));
__m512 fft447 = _mm512_fmadd_ps(fft441, fft357, _mm512_shuffle_f32x4(fft441, fft441, 78));
__m512 fft361 = _mm512_fmadd_ps(fft354, fft357, _mm512_shuffle_f32x4(fft354, fft354, 78));
__m512 fft448 = _mm512_fmadd_ps(fft442, fft357, _mm512_shuffle_f32x4(fft442, fft442, 78));
__m512 fft362 = _mm512_fmadd_ps(fft346, fft357, _mm512_shuffle_f32x4(fft346, fft346, 78));
__m512 fft449 = _mm512_fmadd_ps(fft434, fft357, _mm512_shuffle_f32x4(fft434, fft434, 78));
__m512 fft363 = _mm512_fmadd_ps(fft348, fft357, _mm512_shuffle_f32x4(fft348, fft348, 78));
__m512 fft450 = _mm512_fmadd_ps(fft436, fft357, _mm512_shuffle_f32x4(fft436, fft436, 78));
__m512 fft364 = _mm512_fmadd_ps(fft355, fft357, _mm512_shuffle_f32x4(fft355, fft355, 78));
__m512 fft451 = _mm512_fmadd_ps(fft443, fft357, _mm512_shuffle_f32x4(fft443, fft443, 78));
__m512 fft365 = _mm512_fmadd_ps(fft356, fft357, _mm512_shuffle_f32x4(fft356, fft356, 78));
__m512 fft452 = _mm512_fmadd_ps(fft444, fft357, _mm512_shuffle_f32x4(fft444, fft444, 78));
__m512 fft366 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft367 = _mm512_mul_ps(fft358, fft366);
__m512 fft453 = _mm512_mul_ps(fft445, fft366);
__m512 fft368 = _mm512_mul_ps(fft359, fft366);
__m512 fft454 = _mm512_mul_ps(fft446, fft366);
__m512 fft369 = _mm512_mul_ps(fft360, fft366);
__m512 fft455 = _mm512_mul_ps(fft447, fft366);
__m512 fft370 = _mm512_mul_ps(fft361, fft366);
__m512 fft456 = _mm512_mul_ps(fft448, fft366);
__m512 fft371 = _mm512_mul_ps(fft362, fft366);
__m512 fft457 = _mm512_mul_ps(fft449, fft366);
__m512 fft372 = _mm512_mul_ps(fft363, fft366);
__m512 fft458 = _mm512_mul_ps(fft450, fft366);
__m512 fft373 = _mm512_mul_ps(fft364, fft366);
__m512 fft459 = _mm512_mul_ps(fft451, fft366);
__m512 fft374 = _mm512_mul_ps(fft365, fft366);
__m512 fft460 = _mm512_mul_ps(fft452, fft366);
__m512 fft375 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft376 = _mm512_fmadd_ps(fft359, fft375, fft367);
__m512 fft461 = _mm512_fmadd_ps(fft446, fft375, fft453);
__m512 fft377 = _mm512_fnmadd_ps(fft358, fft375, fft368);
__m512 fft462 = _mm512_fnmadd_ps(fft445, fft375, fft454);
__m512 fft378 = _mm512_fmadd_ps(fft361, fft375, fft369);
__m512 fft463 = _mm512_fmadd_ps(fft448, fft375, fft455);
__m512 fft379 = _mm512_fnmadd_ps(fft360, fft375, fft370);
__m512 fft464 = _mm512_fnmadd_ps(fft447, fft375, fft456);
__m512 fft380 = _mm512_fmadd_ps(fft363, fft375, fft371);
__m512 fft465 = _mm512_fmadd_ps(fft450, fft375, fft457);
__m512 fft381 = _mm512_fnmadd_ps(fft362, fft375, fft372);
__m512 fft466 = _mm512_fnmadd_ps(fft449, fft375, fft458);
__m512 fft382 = _mm512_fmadd_ps(fft365, fft375, fft373);
__m512 fft467 = _mm512_fmadd_ps(fft452, fft375, fft459);
__m512 fft383 = _mm512_fnmadd_ps(fft364, fft375, fft374);
__m512 fft468 = _mm512_fnmadd_ps(fft451, fft375, fft460);
__m512 fft384 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft385 = _mm512_fmadd_ps(fft376, fft384, _mm512_shuffle_f32x4(fft376, fft376, 177));
__m512 fft469 = _mm512_fmadd_ps(fft461, fft384, _mm512_shuffle_f32x4(fft461, fft461, 177));
__m512 fft386 = _mm512_fmadd_ps(fft377, fft384, _mm512_shuffle_f32x4(fft377, fft377, 177));
__m512 fft470 = _mm512_fmadd_ps(fft462, fft384, _mm512_shuffle_f32x4(fft462, fft462, 177));
__m512 fft387 = _mm512_fmadd_ps(fft378, fft384, _mm512_shuffle_f32x4(fft378, fft378, 177));
__m512 fft471 = _mm512_fmadd_ps(fft463, fft384, _mm512_shuffle_f32x4(fft463, fft463, 177));
__m512 fft388 = _mm512_fmadd_ps(fft379, fft384, _mm512_shuffle_f32x4(fft379, fft379, 177));
__m512 fft472 = _mm512_fmadd_ps(fft464, fft384, _mm512_shuffle_f32x4(fft464, fft464, 177));
__m512 fft389 = _mm512_fmadd_ps(fft380, fft384, _mm512_shuffle_f32x4(fft380, fft380, 177));
__m512 fft473 = _mm512_fmadd_ps(fft465, fft384, _mm512_shuffle_f32x4(fft465, fft465, 177));
__m512 fft390 = _mm512_fmadd_ps(fft381, fft384, _mm512_shuffle_f32x4(fft381, fft381, 177));
__m512 fft474 = _mm512_fmadd_ps(fft466, fft384, _mm512_shuffle_f32x4(fft466, fft466, 177));
__m512 fft391 = _mm512_fmadd_ps(fft382, fft384, _mm512_shuffle_f32x4(fft382, fft382, 177));
__m512 fft475 = _mm512_fmadd_ps(fft467, fft384, _mm512_shuffle_f32x4(fft467, fft467, 177));
__m512 fft392 = _mm512_fmadd_ps(fft383, fft384, _mm512_shuffle_f32x4(fft383, fft383, 177));
__m512 fft476 = _mm512_fmadd_ps(fft468, fft384, _mm512_shuffle_f32x4(fft468, fft468, 177));
__m512 fft393 = _mm512_mask_mov_ps(fft385, 49344, fft386);
__m512 fft477 = _mm512_mask_mov_ps(fft469, 49344, fft470);
__m512 fft394 = _mm512_mask_sub_ps(fft386, 49344, _mm512_setzero_ps(), fft385);
__m512 fft478 = _mm512_mask_sub_ps(fft470, 49344, _mm512_setzero_ps(), fft469);
__m512 fft395 = _mm512_mask_mov_ps(fft387, 49344, fft388);
__m512 fft479 = _mm512_mask_mov_ps(fft471, 49344, fft472);
__m512 fft396 = _mm512_mask_sub_ps(fft388, 49344, _mm512_setzero_ps(), fft387);
__m512 fft480 = _mm512_mask_sub_ps(fft472, 49344, _mm512_setzero_ps(), fft471);
__m512 fft397 = _mm512_mask_mov_ps(fft389, 49344, fft390);
__m512 fft481 = _mm512_mask_mov_ps(fft473, 49344, fft474);
__m512 fft398 = _mm512_mask_sub_ps(fft390, 49344, _mm512_setzero_ps(), fft389);
__m512 fft482 = _mm512_mask_sub_ps(fft474, 49344, _mm512_setzero_ps(), fft473);
__m512 fft399 = _mm512_mask_mov_ps(fft391, 49344, fft392);
__m512 fft483 = _mm512_mask_mov_ps(fft475, 49344, fft476);
__m512 fft400 = _mm512_mask_sub_ps(fft392, 49344, _mm512_setzero_ps(), fft391);
__m512 fft484 = _mm512_mask_sub_ps(fft476, 49344, _mm512_setzero_ps(), fft475);
__m512 fft401 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft402 = _mm512_fmadd_ps(fft393, fft401, _mm512_shuffle_ps(fft393, fft393, 78));
__m512 fft485 = _mm512_fmadd_ps(fft477, fft401, _mm512_shuffle_ps(fft477, fft477, 78));
__m512 fft403 = _mm512_fmadd_ps(fft394, fft401, _mm512_shuffle_ps(fft394, fft394, 78));
__m512 fft486 = _mm512_fmadd_ps(fft478, fft401, _mm512_shuffle_ps(fft478, fft478, 78));
__m512 fft404 = _mm512_fmadd_ps(fft395, fft401, _mm512_shuffle_ps(fft395, fft395, 78));
__m512 fft487 = _mm512_fmadd_ps(fft479, fft401, _mm512_shuffle_ps(fft479, fft479, 78));
__m512 fft405 = _mm512_fmadd_ps(fft396, fft401, _mm512_shuffle_ps(fft396, fft396, 78));
__m512 fft488 = _mm512_fmadd_ps(fft480, fft401, _mm512_shuffle_ps(fft480, fft480, 78));
__m512 fft406 = _mm512_fmadd_ps(fft397, fft401, _mm512_shuffle_ps(fft397, fft397, 78));
__m512 fft489 = _mm512_fmadd_ps(fft481, fft401, _mm512_shuffle_ps(fft481, fft481, 78));
__m512 fft407 = _mm512_fmadd_ps(fft398, fft401, _mm512_shuffle_ps(fft398, fft398, 78));
__m512 fft490 = _mm512_fmadd_ps(fft482, fft401, _mm512_shuffle_ps(fft482, fft482, 78));
__m512 fft408 = _mm512_fmadd_ps(fft399, fft401, _mm512_shuffle_ps(fft399, fft399, 78));
__m512 fft491 = _mm512_fmadd_ps(fft483, fft401, _mm512_shuffle_ps(fft483, fft483, 78));
__m512 fft409 = _mm512_fmadd_ps(fft400, fft401, _mm512_shuffle_ps(fft400, fft400, 78));
__m512 fft492 = _mm512_fmadd_ps(fft484, fft401, _mm512_shuffle_ps(fft484, fft484, 78));
__m512i fft410 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft411 = _mm512_permutexvar_ps(fft410, fft402);
__m512 fft493 = _mm512_permutexvar_ps(fft410, fft485);
__m512i fft412 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft413 = _mm512_permutexvar_ps(fft412, fft402);
__m512 fft494 = _mm512_permutexvar_ps(fft412, fft485);
__m512 fft414 = _mm512_permutexvar_ps(fft410, fft403);
__m512 fft495 = _mm512_permutexvar_ps(fft410, fft486);
__m512 fft415 = _mm512_permutexvar_ps(fft412, fft403);
__m512 fft496 = _mm512_permutexvar_ps(fft412, fft486);
__m512 fft416 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft417 = _mm512_fmadd_ps(fft411, fft416, fft413);
__m512 fft497 = _mm512_fmadd_ps(fft493, fft416, fft494);
__m512 fft418 = _mm512_fnmadd_ps(fft415, fft416, fft414);
__m512 fft498 = _mm512_fnmadd_ps(fft496, fft416, fft495);
__m512 fft419 = _mm512_mask_mov_ps(fft415, 21845, fft417);
__m512 fft499 = _mm512_mask_mov_ps(fft496, 21845, fft497);
__m512 fft420 = _mm512_mask_mov_ps(fft411, 43176, fft417);
__m512 fft500 = _mm512_mask_mov_ps(fft493, 43176, fft497);
__m512 fft421 = _mm512_mask_mov_ps(fft419, 43176, fft418);
__m512 fft501 = _mm512_mask_mov_ps(fft499, 43176, fft498);
__m512 fft422 = _mm512_mask_mov_ps(fft420, 22102, fft418);
__m512 fft502 = _mm512_mask_mov_ps(fft500, 22102, fft498);
__m512 fft423 = _mm512_mask_mul_ps(fft421, 64764, fft421, _mm512_set1_ps(5e-01f));
__m512 fft503 = _mm512_mask_mul_ps(fft501, 64764, fft501, _mm512_set1_ps(5e-01f));
__m512 fft424 = _mm512_mask_mul_ps(fft422, 64764, fft422, _mm512_set1_ps(5e-01f));
__m512 fft504 = _mm512_mask_mul_ps(fft502, 64764, fft502, _mm512_set1_ps(5e-01f));
__m512 wf33 = fft423;
__m512 wf41 = fft503;
__m512 wf34 = fft424;
__m512 wf42 = fft504;
__m512 wf35 = fft404;
__m512 wf43 = fft487;
__m512 wf36 = fft405;
__m512 wf44 = fft488;
__m512 wf37 = fft406;
__m512 wf45 = fft489;
__m512 wf38 = fft407;
__m512 wf46 = fft490;
__m512 wf39 = fft408;
__m512 wf47 = fft491;
__m512 wf40 = fft409;
__m512 wf48 = fft492;
ptrdiff_t c3 = (size_t)(0+2*j1)/4;
ptrdiff_t m3 = (size_t)(0+2*j1)%4/2;
ptrdiff_t f4 = (size_t)(0+2*j1)%2;
__m512i eo3 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf35 = _mm512_permutexvar_ps(eo3, wf35);
wf36 = _mm512_permutexvar_ps(eo3, wf36);
__m512i wfs17 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf35, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs17 = _mm512_inserti64x4(wfs17, _mm512_cvtps_ph(wf36, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep1 = _mm512_shuffle_i32x4(wfs17, wfs17, 160);
_mm512_mask_storeu_epi32(wfPtr1+159424+637696*i6+6784*c3+64*k2+64*m3+16*f4, 65535, rep1);
__m512i rep2 = _mm512_shuffle_i32x4(wfs17, wfs17, 245);
_mm512_mask_storeu_epi32(wfPtr1+797120+637696*i6+6784*c3+64*k2+64*m3+16*f4, 65535, rep2);
wf43 = _mm512_permutexvar_ps(eo3, wf43);
wf44 = _mm512_permutexvar_ps(eo3, wf44);
__m512i wfs18 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf43, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs18 = _mm512_inserti64x4(wfs18, _mm512_cvtps_ph(wf44, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep3 = _mm512_shuffle_i32x4(wfs18, wfs18, 160);
_mm512_mask_storeu_epi32(wfPtr1+1434816+637696*i6+6784*c3+64*k2+64*m3+16*f4, 65535, rep3);
__m512i rep4 = _mm512_shuffle_i32x4(wfs18, wfs18, 245);
_mm512_mask_storeu_epi32(wfPtr1+2072512+637696*i6+6784*c3+64*k2+64*m3+16*f4, 65535, rep4);
wf37 = _mm512_permutexvar_ps(eo3, wf37);
wf38 = _mm512_permutexvar_ps(eo3, wf38);
__m512i wfs19 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf37, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs19 = _mm512_inserti64x4(wfs19, _mm512_cvtps_ph(wf38, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep5 = _mm512_shuffle_i32x4(wfs19, wfs19, 160);
_mm512_mask_storeu_epi32(wfPtr1+318848+637696*i6+6784*c3+64*k2+64*m3+16*f4, 65535, rep5);
__m512i rep6 = _mm512_shuffle_i32x4(wfs19, wfs19, 245);
_mm512_mask_storeu_epi32(wfPtr1+956544+637696*i6+6784*c3+64*k2+64*m3+16*f4, 65535, rep6);
wf45 = _mm512_permutexvar_ps(eo3, wf45);
wf46 = _mm512_permutexvar_ps(eo3, wf46);
__m512i wfs20 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf45, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs20 = _mm512_inserti64x4(wfs20, _mm512_cvtps_ph(wf46, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep7 = _mm512_shuffle_i32x4(wfs20, wfs20, 160);
_mm512_mask_storeu_epi32(wfPtr1+1594240+637696*i6+6784*c3+64*k2+64*m3+16*f4, 65535, rep7);
__m512i rep8 = _mm512_shuffle_i32x4(wfs20, wfs20, 245);
_mm512_mask_storeu_epi32(wfPtr1+2231936+637696*i6+6784*c3+64*k2+64*m3+16*f4, 65535, rep8);
wf39 = _mm512_permutexvar_ps(eo3, wf39);
wf40 = _mm512_permutexvar_ps(eo3, wf40);
__m512i wfs21 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf39, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs21 = _mm512_inserti64x4(wfs21, _mm512_cvtps_ph(wf40, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep9 = _mm512_shuffle_i32x4(wfs21, wfs21, 160);
_mm512_mask_storeu_epi32(wfPtr1+478272+637696*i6+6784*c3+64*k2+64*m3+16*f4, 65535, rep9);
__m512i rep10 = _mm512_shuffle_i32x4(wfs21, wfs21, 245);
_mm512_mask_storeu_epi32(wfPtr1+1115968+637696*i6+6784*c3+64*k2+64*m3+16*f4, 65535, rep10);
wf47 = _mm512_permutexvar_ps(eo3, wf47);
wf48 = _mm512_permutexvar_ps(eo3, wf48);
__m512i wfs22 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf47, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs22 = _mm512_inserti64x4(wfs22, _mm512_cvtps_ph(wf48, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep11 = _mm512_shuffle_i32x4(wfs22, wfs22, 160);
_mm512_mask_storeu_epi32(wfPtr1+1753664+637696*i6+6784*c3+64*k2+64*m3+16*f4, 65535, rep11);
__m512i rep12 = _mm512_shuffle_i32x4(wfs22, wfs22, 245);
_mm512_mask_storeu_epi32(wfPtr1+2391360+637696*i6+6784*c3+64*k2+64*m3+16*f4, 65535, rep12);
__m512i wfs23 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf33, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs23 = _mm512_inserti64x4(wfs23, _mm512_cvtps_ph(wf34, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep13 = _mm512_shuffle_i32x4(wfs23, wfs23, 160);
_mm512_mask_storeu_epi32(wfPtr1+0+637696*i6+6784*c3+64*k2+64*m3+16*f4, 65535, rep13);
__m512i rep14 = _mm512_shuffle_i32x4(wfs23, wfs23, 245);
_mm512_mask_storeu_epi32(wfPtr1+637696+637696*i6+6784*c3+64*k2+64*m3+16*f4, 65535, rep14);
__m512i wfs24 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf41, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs24 = _mm512_inserti64x4(wfs24, _mm512_cvtps_ph(wf42, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
__m512i rep15 = _mm512_shuffle_i32x4(wfs24, wfs24, 160);
_mm512_mask_storeu_epi32(wfPtr1+1275392+637696*i6+6784*c3+64*k2+64*m3+16*f4, 65535, rep15);
__m512i rep16 = _mm512_shuffle_i32x4(wfs24, wfs24, 245);
_mm512_mask_storeu_epi32(wfPtr1+1913088+637696*i6+6784*c3+64*k2+64*m3+16*f4, 65535, rep16);
}
__m512 upper10 = _mm512_shuffle_f32x4(bf3, bf3, 14);
bf3 = _mm512_add_ps(bf3, upper10);
__m512 upper9 = _mm512_shuffle_f32x4(bf3, bf3, 1);
bf3 = _mm512_add_ps(bf3, upper9);
__m512 upper8 = _mm512_shuffle_ps(bf3, bf3, 14);
bf3 = _mm512_add_ps(bf3, upper8);
__m512 upper7 = _mm512_shuffle_ps(bf3, bf3, 1);
bf3 = _mm512_add_ps(bf3, upper7);
__m512 bias2 = _mm512_setzero_ps();
if (!e1) {
bias2 = _mm512_maskz_loadu_ps(1, biasPtr1-0+372*i6+8*j1);
__m512i pmMul2 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd2 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 mas4 = _mm512_maskz_loadu_ps(3, bnPtr2+(ptrdiff_t)8*(0+93*i6+2*j1));
__m512 postMul5 = _mm512_permutexvar_ps(pmMul2, mas4);
__m512 postAdd2 = _mm512_permutexvar_ps(pmAdd2, mas4);
bias2 = _mm512_fmadd_ps(bias2, postMul5, postAdd2);
bf3 = _mm512_add_ps(bf3, bias2);
}
bf3 = _mm512_mul_ps(bf3, _mm512_set1_ps(6.4e+01f));
_mm512_mask_storeu_ps(bfPtr1-0+376*i6+8*j1, 1, bf3);
if (j1 >= jj1) return;
j1 = 47;
}
}

static void Example29StriderArrangeFilts1(Example29ThreaderTeam1* team13, char** tensors1) {
/* Arrange the strided-convolution filters: dispatch the filter-transform
callee over a 23x1x1 task hull on the given threader team. The tensors
array is threaded through untouched via the task's any1 slot. */
Example29ThreaderTask1 arrangeTask = {
.callee1 = Example29StriderArrangeFilts1Callee1,
.any1 = tensors1,
.nd1 = 3,
.hull1 = {23, 1, 1},
};
Example29ThreaderDo1(team13, &arrangeTask);
}

static void Example29StriderArrangeDats1Callee1(Example29ThreaderTask1* task6, int64_t* pt8) {
char** tensors4 = task6->any1;
ptrdiff_t s1 = 0;
ptrdiff_t c4 = 0;
ptrdiff_t g3 = 0;
ptrdiff_t e2 = 0;
(void)pt8;
char*restrict datPtr1 = tensors4[0]-0+825344*e2;
char*restrict bnPtr3 = tensors4[1]+(ptrdiff_t)8*496*e2;
char*restrict datPtr2 = tensors4[2]-0+825344*e2;
char*restrict dfPtr1 = tensors4[3]+2031616*e2;
ptrdiff_t i7 = 1*g3;
ptrdiff_t j2 = 1*c4;
ptrdiff_t rel1 = j2-0;
ptrdiff_t base1 = 0;
ptrdiff_t h1 = base1+0;
ptrdiff_t w1 = 0;
ptrdiff_t k3 = 53*s1;
ptrdiff_t kk1 = k3+52;
for (; k3 <= kk1; ++k3) {
__m512 bnMul1 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(k3+53*i7))[0]);
__m512 bnAdd1 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(k3+53*i7))[1]);
for (ptrdiff_t b3 = 0; b3 < 2; ++b3) {
ptrdiff_t m4 = (size_t)b3/2;
ptrdiff_t f5 = (size_t)b3%2;
__m512 dat1 = _mm512_maskz_loadu_ps(65535, datPtr1+0+88192*i7+1664*k3+128*h1+4*w1+40*b3);
dat1 = _mm512_mask_fmadd_ps(dat1, 65535, bnMul1, bnAdd1);
dat1 = _mm512_max_ps(_mm512_setzero_ps(), dat1);
dat1 = _mm512_add_ps(dat1, _mm512_maskz_loadu_ps(65535, datPtr2+0+88192*i7+1664*k3+128*h1+4*w1+40*b3));
__m512 dat2 = _mm512_maskz_loadu_ps(65535, datPtr1+128+88192*i7+1664*k3+128*h1+4*w1+40*b3);
dat2 = _mm512_mask_fmadd_ps(dat2, 65535, bnMul1, bnAdd1);
dat2 = _mm512_max_ps(_mm512_setzero_ps(), dat2);
dat2 = _mm512_add_ps(dat2, _mm512_maskz_loadu_ps(65535, datPtr2+128+88192*i7+1664*k3+128*h1+4*w1+40*b3));
__m512 dat3 = _mm512_maskz_loadu_ps(65535, datPtr1+256+88192*i7+1664*k3+128*h1+4*w1+40*b3);
dat3 = _mm512_mask_fmadd_ps(dat3, 65535, bnMul1, bnAdd1);
dat3 = _mm512_max_ps(_mm512_setzero_ps(), dat3);
dat3 = _mm512_add_ps(dat3, _mm512_maskz_loadu_ps(65535, datPtr2+256+88192*i7+1664*k3+128*h1+4*w1+40*b3));
__m512 dat4 = _mm512_maskz_loadu_ps(65535, datPtr1+384+88192*i7+1664*k3+128*h1+4*w1+40*b3);
dat4 = _mm512_mask_fmadd_ps(dat4, 65535, bnMul1, bnAdd1);
dat4 = _mm512_max_ps(_mm512_setzero_ps(), dat4);
dat4 = _mm512_add_ps(dat4, _mm512_maskz_loadu_ps(65535, datPtr2+384+88192*i7+1664*k3+128*h1+4*w1+40*b3));
__m512 dat5 = _mm512_maskz_loadu_ps(65535, datPtr1+512+88192*i7+1664*k3+128*h1+4*w1+40*b3);
dat5 = _mm512_mask_fmadd_ps(dat5, 65535, bnMul1, bnAdd1);
dat5 = _mm512_max_ps(_mm512_setzero_ps(), dat5);
dat5 = _mm512_add_ps(dat5, _mm512_maskz_loadu_ps(65535, datPtr2+512+88192*i7+1664*k3+128*h1+4*w1+40*b3));
__m512 dat6 = _mm512_maskz_loadu_ps(65535, datPtr1+640+88192*i7+1664*k3+128*h1+4*w1+40*b3);
dat6 = _mm512_mask_fmadd_ps(dat6, 65535, bnMul1, bnAdd1);
dat6 = _mm512_max_ps(_mm512_setzero_ps(), dat6);
dat6 = _mm512_add_ps(dat6, _mm512_maskz_loadu_ps(65535, datPtr2+640+88192*i7+1664*k3+128*h1+4*w1+40*b3));
__m512 dat7 = _mm512_maskz_loadu_ps(65535, datPtr1+768+88192*i7+1664*k3+128*h1+4*w1+40*b3);
dat7 = _mm512_mask_fmadd_ps(dat7, 65535, bnMul1, bnAdd1);
dat7 = _mm512_max_ps(_mm512_setzero_ps(), dat7);
dat7 = _mm512_add_ps(dat7, _mm512_maskz_loadu_ps(65535, datPtr2+768+88192*i7+1664*k3+128*h1+4*w1+40*b3));
__m512 dat8 = _mm512_maskz_loadu_ps(65535, datPtr1+896+88192*i7+1664*k3+128*h1+4*w1+40*b3);
dat8 = _mm512_mask_fmadd_ps(dat8, 65535, bnMul1, bnAdd1);
dat8 = _mm512_max_ps(_mm512_setzero_ps(), dat8);
dat8 = _mm512_add_ps(dat8, _mm512_maskz_loadu_ps(65535, datPtr2+896+88192*i7+1664*k3+128*h1+4*w1+40*b3));
__m512 dat9 = _mm512_maskz_loadu_ps(65535, datPtr1+1024+88192*i7+1664*k3+128*h1+4*w1+40*b3);
dat9 = _mm512_mask_fmadd_ps(dat9, 65535, bnMul1, bnAdd1);
dat9 = _mm512_max_ps(_mm512_setzero_ps(), dat9);
dat9 = _mm512_add_ps(dat9, _mm512_maskz_loadu_ps(65535, datPtr2+1024+88192*i7+1664*k3+128*h1+4*w1+40*b3));
__m512 dat10 = _mm512_maskz_loadu_ps(65535, datPtr1+1152+88192*i7+1664*k3+128*h1+4*w1+40*b3);
dat10 = _mm512_mask_fmadd_ps(dat10, 65535, bnMul1, bnAdd1);
dat10 = _mm512_max_ps(_mm512_setzero_ps(), dat10);
dat10 = _mm512_add_ps(dat10, _mm512_maskz_loadu_ps(65535, datPtr2+1152+88192*i7+1664*k3+128*h1+4*w1+40*b3));
__m512 dat11 = _mm512_maskz_loadu_ps(65535, datPtr1+1280+88192*i7+1664*k3+128*h1+4*w1+40*b3);
dat11 = _mm512_mask_fmadd_ps(dat11, 65535, bnMul1, bnAdd1);
dat11 = _mm512_max_ps(_mm512_setzero_ps(), dat11);
dat11 = _mm512_add_ps(dat11, _mm512_maskz_loadu_ps(65535, datPtr2+1280+88192*i7+1664*k3+128*h1+4*w1+40*b3));
__m512 dat12 = _mm512_maskz_loadu_ps(65535, datPtr1+1408+88192*i7+1664*k3+128*h1+4*w1+40*b3);
dat12 = _mm512_mask_fmadd_ps(dat12, 65535, bnMul1, bnAdd1);
dat12 = _mm512_max_ps(_mm512_setzero_ps(), dat12);
dat12 = _mm512_add_ps(dat12, _mm512_maskz_loadu_ps(65535, datPtr2+1408+88192*i7+1664*k3+128*h1+4*w1+40*b3));
__m512 dat13 = _mm512_maskz_loadu_ps(65535, datPtr1+1536+88192*i7+1664*k3+128*h1+4*w1+40*b3);
dat13 = _mm512_mask_fmadd_ps(dat13, 65535, bnMul1, bnAdd1);
dat13 = _mm512_max_ps(_mm512_setzero_ps(), dat13);
dat13 = _mm512_add_ps(dat13, _mm512_maskz_loadu_ps(65535, datPtr2+1536+88192*i7+1664*k3+128*h1+4*w1+40*b3));
__m512 fft505 = _mm512_add_ps(dat1, dat9);
__m512 fft593 = _mm512_add_ps(dat2, dat10);
__m512 fft506 = _mm512_sub_ps(dat1, dat9);
__m512 fft594 = _mm512_sub_ps(dat2, dat10);
__m512 fft507 = _mm512_add_ps(dat3, dat11);
__m512 fft595 = _mm512_add_ps(dat4, dat12);
__m512 fft508 = _mm512_sub_ps(dat3, dat11);
__m512 fft596 = _mm512_sub_ps(dat4, dat12);
__m512 fft509 = _mm512_add_ps(dat5, dat13);
__m512 fft597 = _mm512_add_ps(dat6, _mm512_setzero_ps());
__m512 fft510 = _mm512_sub_ps(dat5, dat13);
__m512 fft598 = _mm512_sub_ps(dat6, _mm512_setzero_ps());
__m512 fft511 = _mm512_add_ps(dat7, _mm512_setzero_ps());
__m512 fft599 = _mm512_add_ps(dat8, _mm512_setzero_ps());
__m512 fft512 = _mm512_sub_ps(dat7, _mm512_setzero_ps());
__m512 fft600 = _mm512_sub_ps(dat8, _mm512_setzero_ps());
__m512 fft513 = _mm512_add_ps(fft505, fft509);
__m512 fft601 = _mm512_add_ps(fft593, fft597);
__m512 fft514 = _mm512_sub_ps(fft505, fft509);
__m512 fft602 = _mm512_sub_ps(fft593, fft597);
__m512 fft515 = _mm512_add_ps(fft507, fft511);
__m512 fft603 = _mm512_add_ps(fft595, fft599);
__m512 fft516 = _mm512_sub_ps(fft511, fft507);
__m512 fft604 = _mm512_sub_ps(fft599, fft595);
__m512 fft517 = _mm512_sub_ps(fft508, fft512);
__m512 fft605 = _mm512_sub_ps(fft596, fft600);
__m512 fft518 = _mm512_add_ps(fft508, fft512);
__m512 fft606 = _mm512_add_ps(fft596, fft600);
__m512 fft519 = _mm512_add_ps(fft513, fft515);
__m512 fft607 = _mm512_add_ps(fft601, fft603);
__m512 fft520 = _mm512_sub_ps(fft513, fft515);
__m512 fft608 = _mm512_sub_ps(fft601, fft603);
__m512 fft521 = _mm512_fmadd_ps(fft517, _mm512_set1_ps(7.0710677e-01f), fft506);
__m512 fft609 = _mm512_fmadd_ps(fft605, _mm512_set1_ps(7.0710677e-01f), fft594);
__m512 fft522 = _mm512_fnmsub_ps(fft518, _mm512_set1_ps(7.0710677e-01f), fft510);
__m512 fft610 = _mm512_fnmsub_ps(fft606, _mm512_set1_ps(7.0710677e-01f), fft598);
__m512 fft523 = _mm512_fnmadd_ps(fft517, _mm512_set1_ps(7.0710677e-01f), fft506);
__m512 fft611 = _mm512_fnmadd_ps(fft605, _mm512_set1_ps(7.0710677e-01f), fft594);
__m512 fft524 = _mm512_fnmadd_ps(fft518, _mm512_set1_ps(7.0710677e-01f), fft510);
__m512 fft612 = _mm512_fnmadd_ps(fft606, _mm512_set1_ps(7.0710677e-01f), fft598);
__m512 fft525 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft526 = _mm512_fmadd_ps(fft519, fft525, _mm512_shuffle_f32x4(fft519, fft519, 78));
__m512 fft613 = _mm512_fmadd_ps(fft607, fft525, _mm512_shuffle_f32x4(fft607, fft607, 78));
__m512 fft527 = _mm512_fmadd_ps(fft520, fft525, _mm512_shuffle_f32x4(fft520, fft520, 78));
__m512 fft614 = _mm512_fmadd_ps(fft608, fft525, _mm512_shuffle_f32x4(fft608, fft608, 78));
__m512 fft528 = _mm512_fmadd_ps(fft521, fft525, _mm512_shuffle_f32x4(fft521, fft521, 78));
__m512 fft615 = _mm512_fmadd_ps(fft609, fft525, _mm512_shuffle_f32x4(fft609, fft609, 78));
__m512 fft529 = _mm512_fmadd_ps(fft522, fft525, _mm512_shuffle_f32x4(fft522, fft522, 78));
__m512 fft616 = _mm512_fmadd_ps(fft610, fft525, _mm512_shuffle_f32x4(fft610, fft610, 78));
__m512 fft530 = _mm512_fmadd_ps(fft514, fft525, _mm512_shuffle_f32x4(fft514, fft514, 78));
__m512 fft617 = _mm512_fmadd_ps(fft602, fft525, _mm512_shuffle_f32x4(fft602, fft602, 78));
__m512 fft531 = _mm512_fmadd_ps(fft516, fft525, _mm512_shuffle_f32x4(fft516, fft516, 78));
__m512 fft618 = _mm512_fmadd_ps(fft604, fft525, _mm512_shuffle_f32x4(fft604, fft604, 78));
__m512 fft532 = _mm512_fmadd_ps(fft523, fft525, _mm512_shuffle_f32x4(fft523, fft523, 78));
__m512 fft619 = _mm512_fmadd_ps(fft611, fft525, _mm512_shuffle_f32x4(fft611, fft611, 78));
__m512 fft533 = _mm512_fmadd_ps(fft524, fft525, _mm512_shuffle_f32x4(fft524, fft524, 78));
__m512 fft620 = _mm512_fmadd_ps(fft612, fft525, _mm512_shuffle_f32x4(fft612, fft612, 78));
__m512 fft534 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft535 = _mm512_mul_ps(fft526, fft534);
__m512 fft621 = _mm512_mul_ps(fft613, fft534);
__m512 fft536 = _mm512_mul_ps(fft527, fft534);
__m512 fft622 = _mm512_mul_ps(fft614, fft534);
__m512 fft537 = _mm512_mul_ps(fft528, fft534);
__m512 fft623 = _mm512_mul_ps(fft615, fft534);
__m512 fft538 = _mm512_mul_ps(fft529, fft534);
__m512 fft624 = _mm512_mul_ps(fft616, fft534);
__m512 fft539 = _mm512_mul_ps(fft530, fft534);
__m512 fft625 = _mm512_mul_ps(fft617, fft534);
__m512 fft540 = _mm512_mul_ps(fft531, fft534);
__m512 fft626 = _mm512_mul_ps(fft618, fft534);
__m512 fft541 = _mm512_mul_ps(fft532, fft534);
__m512 fft627 = _mm512_mul_ps(fft619, fft534);
__m512 fft542 = _mm512_mul_ps(fft533, fft534);
__m512 fft628 = _mm512_mul_ps(fft620, fft534);
__m512 fft543 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft544 = _mm512_fmadd_ps(fft527, fft543, fft535);
__m512 fft629 = _mm512_fmadd_ps(fft614, fft543, fft621);
__m512 fft545 = _mm512_fnmadd_ps(fft526, fft543, fft536);
__m512 fft630 = _mm512_fnmadd_ps(fft613, fft543, fft622);
__m512 fft546 = _mm512_fmadd_ps(fft529, fft543, fft537);
__m512 fft631 = _mm512_fmadd_ps(fft616, fft543, fft623);
__m512 fft547 = _mm512_fnmadd_ps(fft528, fft543, fft538);
__m512 fft632 = _mm512_fnmadd_ps(fft615, fft543, fft624);
__m512 fft548 = _mm512_fmadd_ps(fft531, fft543, fft539);
__m512 fft633 = _mm512_fmadd_ps(fft618, fft543, fft625);
__m512 fft549 = _mm512_fnmadd_ps(fft530, fft543, fft540);
__m512 fft634 = _mm512_fnmadd_ps(fft617, fft543, fft626);
__m512 fft550 = _mm512_fmadd_ps(fft533, fft543, fft541);
__m512 fft635 = _mm512_fmadd_ps(fft620, fft543, fft627);
__m512 fft551 = _mm512_fnmadd_ps(fft532, fft543, fft542);
__m512 fft636 = _mm512_fnmadd_ps(fft619, fft543, fft628);
__m512 fft552 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft553 = _mm512_fmadd_ps(fft544, fft552, _mm512_shuffle_f32x4(fft544, fft544, 177));
__m512 fft637 = _mm512_fmadd_ps(fft629, fft552, _mm512_shuffle_f32x4(fft629, fft629, 177));
__m512 fft554 = _mm512_fmadd_ps(fft545, fft552, _mm512_shuffle_f32x4(fft545, fft545, 177));
__m512 fft638 = _mm512_fmadd_ps(fft630, fft552, _mm512_shuffle_f32x4(fft630, fft630, 177));
__m512 fft555 = _mm512_fmadd_ps(fft546, fft552, _mm512_shuffle_f32x4(fft546, fft546, 177));
__m512 fft639 = _mm512_fmadd_ps(fft631, fft552, _mm512_shuffle_f32x4(fft631, fft631, 177));
__m512 fft556 = _mm512_fmadd_ps(fft547, fft552, _mm512_shuffle_f32x4(fft547, fft547, 177));
__m512 fft640 = _mm512_fmadd_ps(fft632, fft552, _mm512_shuffle_f32x4(fft632, fft632, 177));
__m512 fft557 = _mm512_fmadd_ps(fft548, fft552, _mm512_shuffle_f32x4(fft548, fft548, 177));
__m512 fft641 = _mm512_fmadd_ps(fft633, fft552, _mm512_shuffle_f32x4(fft633, fft633, 177));
__m512 fft558 = _mm512_fmadd_ps(fft549, fft552, _mm512_shuffle_f32x4(fft549, fft549, 177));
__m512 fft642 = _mm512_fmadd_ps(fft634, fft552, _mm512_shuffle_f32x4(fft634, fft634, 177));
__m512 fft559 = _mm512_fmadd_ps(fft550, fft552, _mm512_shuffle_f32x4(fft550, fft550, 177));
__m512 fft643 = _mm512_fmadd_ps(fft635, fft552, _mm512_shuffle_f32x4(fft635, fft635, 177));
__m512 fft560 = _mm512_fmadd_ps(fft551, fft552, _mm512_shuffle_f32x4(fft551, fft551, 177));
__m512 fft644 = _mm512_fmadd_ps(fft636, fft552, _mm512_shuffle_f32x4(fft636, fft636, 177));
__m512 fft561 = _mm512_mask_mov_ps(fft553, 49344, fft554);
__m512 fft645 = _mm512_mask_mov_ps(fft637, 49344, fft638);
__m512 fft562 = _mm512_mask_sub_ps(fft554, 49344, _mm512_setzero_ps(), fft553);
__m512 fft646 = _mm512_mask_sub_ps(fft638, 49344, _mm512_setzero_ps(), fft637);
__m512 fft563 = _mm512_mask_mov_ps(fft555, 49344, fft556);
__m512 fft647 = _mm512_mask_mov_ps(fft639, 49344, fft640);
__m512 fft564 = _mm512_mask_sub_ps(fft556, 49344, _mm512_setzero_ps(), fft555);
__m512 fft648 = _mm512_mask_sub_ps(fft640, 49344, _mm512_setzero_ps(), fft639);
__m512 fft565 = _mm512_mask_mov_ps(fft557, 49344, fft558);
__m512 fft649 = _mm512_mask_mov_ps(fft641, 49344, fft642);
__m512 fft566 = _mm512_mask_sub_ps(fft558, 49344, _mm512_setzero_ps(), fft557);
__m512 fft650 = _mm512_mask_sub_ps(fft642, 49344, _mm512_setzero_ps(), fft641);
__m512 fft567 = _mm512_mask_mov_ps(fft559, 49344, fft560);
__m512 fft651 = _mm512_mask_mov_ps(fft643, 49344, fft644);
__m512 fft568 = _mm512_mask_sub_ps(fft560, 49344, _mm512_setzero_ps(), fft559);
__m512 fft652 = _mm512_mask_sub_ps(fft644, 49344, _mm512_setzero_ps(), fft643);
__m512 fft569 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft570 = _mm512_fmadd_ps(fft561, fft569, _mm512_shuffle_ps(fft561, fft561, 78));
__m512 fft653 = _mm512_fmadd_ps(fft645, fft569, _mm512_shuffle_ps(fft645, fft645, 78));
__m512 fft571 = _mm512_fmadd_ps(fft562, fft569, _mm512_shuffle_ps(fft562, fft562, 78));
__m512 fft654 = _mm512_fmadd_ps(fft646, fft569, _mm512_shuffle_ps(fft646, fft646, 78));
__m512 fft572 = _mm512_fmadd_ps(fft563, fft569, _mm512_shuffle_ps(fft563, fft563, 78));
__m512 fft655 = _mm512_fmadd_ps(fft647, fft569, _mm512_shuffle_ps(fft647, fft647, 78));
__m512 fft573 = _mm512_fmadd_ps(fft564, fft569, _mm512_shuffle_ps(fft564, fft564, 78));
__m512 fft656 = _mm512_fmadd_ps(fft648, fft569, _mm512_shuffle_ps(fft648, fft648, 78));
__m512 fft574 = _mm512_fmadd_ps(fft565, fft569, _mm512_shuffle_ps(fft565, fft565, 78));
__m512 fft657 = _mm512_fmadd_ps(fft649, fft569, _mm512_shuffle_ps(fft649, fft649, 78));
__m512 fft575 = _mm512_fmadd_ps(fft566, fft569, _mm512_shuffle_ps(fft566, fft566, 78));
__m512 fft658 = _mm512_fmadd_ps(fft650, fft569, _mm512_shuffle_ps(fft650, fft650, 78));
__m512 fft576 = _mm512_fmadd_ps(fft567, fft569, _mm512_shuffle_ps(fft567, fft567, 78));
__m512 fft659 = _mm512_fmadd_ps(fft651, fft569, _mm512_shuffle_ps(fft651, fft651, 78));
__m512 fft577 = _mm512_fmadd_ps(fft568, fft569, _mm512_shuffle_ps(fft568, fft568, 78));
__m512 fft660 = _mm512_fmadd_ps(fft652, fft569, _mm512_shuffle_ps(fft652, fft652, 78));
__m512i fft578 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft579 = _mm512_permutexvar_ps(fft578, fft570);
__m512 fft661 = _mm512_permutexvar_ps(fft578, fft653);
__m512i fft580 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft581 = _mm512_permutexvar_ps(fft580, fft570);
__m512 fft662 = _mm512_permutexvar_ps(fft580, fft653);
__m512 fft582 = _mm512_permutexvar_ps(fft578, fft571);
__m512 fft663 = _mm512_permutexvar_ps(fft578, fft654);
__m512 fft583 = _mm512_permutexvar_ps(fft580, fft571);
__m512 fft664 = _mm512_permutexvar_ps(fft580, fft654);
__m512 fft584 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft585 = _mm512_fmadd_ps(fft579, fft584, fft581);
__m512 fft665 = _mm512_fmadd_ps(fft661, fft584, fft662);
__m512 fft586 = _mm512_fnmadd_ps(fft583, fft584, fft582);
__m512 fft666 = _mm512_fnmadd_ps(fft664, fft584, fft663);
__m512 fft587 = _mm512_mask_mov_ps(fft583, 21845, fft585);
__m512 fft667 = _mm512_mask_mov_ps(fft664, 21845, fft665);
__m512 fft588 = _mm512_mask_mov_ps(fft579, 43176, fft585);
__m512 fft668 = _mm512_mask_mov_ps(fft661, 43176, fft665);
__m512 fft589 = _mm512_mask_mov_ps(fft587, 43176, fft586);
__m512 fft669 = _mm512_mask_mov_ps(fft667, 43176, fft666);
__m512 fft590 = _mm512_mask_mov_ps(fft588, 22102, fft586);
__m512 fft670 = _mm512_mask_mov_ps(fft668, 22102, fft666);
__m512 fft591 = _mm512_mask_mul_ps(fft589, 64764, fft589, _mm512_set1_ps(5e-01f));
__m512 fft671 = _mm512_mask_mul_ps(fft669, 64764, fft669, _mm512_set1_ps(5e-01f));
__m512 fft592 = _mm512_mask_mul_ps(fft590, 64764, fft590, _mm512_set1_ps(5e-01f));
__m512 fft672 = _mm512_mask_mul_ps(fft670, 64764, fft670, _mm512_set1_ps(5e-01f));
__m512 df1 = fft591;
__m512 df9 = fft671;
__m512 df2 = fft592;
__m512 df10 = fft672;
__m512 df3 = fft572;
__m512 df11 = fft655;
__m512 df4 = fft573;
__m512 df12 = fft656;
__m512 df5 = fft574;
__m512 df13 = fft657;
__m512 df6 = fft575;
__m512 df14 = fft658;
__m512 df7 = fft576;
__m512 df15 = fft659;
__m512 df8 = fft577;
__m512 df16 = fft660;
__m512i eo4 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df3 = _mm512_permutexvar_ps(eo4, df3);
df4 = _mm512_permutexvar_ps(eo4, df4);
_mm512_mask_storeu_ps(dfPtr1+13568+54272*i7+20352*j2+256*k3+128*m4+32*f5, 255, df3);
_mm512_mask_storeu_ps(dfPtr1+13632+54272*i7+20352*j2+256*k3+128*m4+32*f5, 255, df4);
_mm512_mask_storeu_ps(dfPtr1+67808+54272*i7+20352*j2+256*k3+128*m4+32*f5, 65280, df3);
_mm512_mask_storeu_ps(dfPtr1+67872+54272*i7+20352*j2+256*k3+128*m4+32*f5, 65280, df4);
df11 = _mm512_permutexvar_ps(eo4, df11);
df12 = _mm512_permutexvar_ps(eo4, df12);
_mm512_mask_storeu_ps(dfPtr1+122112+54272*i7+20352*j2+256*k3+128*m4+32*f5, 255, df11);
_mm512_mask_storeu_ps(dfPtr1+122176+54272*i7+20352*j2+256*k3+128*m4+32*f5, 255, df12);
_mm512_mask_storeu_ps(dfPtr1+176352+54272*i7+20352*j2+256*k3+128*m4+32*f5, 65280, df11);
_mm512_mask_storeu_ps(dfPtr1+176416+54272*i7+20352*j2+256*k3+128*m4+32*f5, 65280, df12);
df5 = _mm512_permutexvar_ps(eo4, df5);
df6 = _mm512_permutexvar_ps(eo4, df6);
_mm512_mask_storeu_ps(dfPtr1+27136+54272*i7+20352*j2+256*k3+128*m4+32*f5, 255, df5);
_mm512_mask_storeu_ps(dfPtr1+27200+54272*i7+20352*j2+256*k3+128*m4+32*f5, 255, df6);
_mm512_mask_storeu_ps(dfPtr1+81376+54272*i7+20352*j2+256*k3+128*m4+32*f5, 65280, df5);
_mm512_mask_storeu_ps(dfPtr1+81440+54272*i7+20352*j2+256*k3+128*m4+32*f5, 65280, df6);
df13 = _mm512_permutexvar_ps(eo4, df13);
df14 = _mm512_permutexvar_ps(eo4, df14);
_mm512_mask_storeu_ps(dfPtr1+135680+54272*i7+20352*j2+256*k3+128*m4+32*f5, 255, df13);
_mm512_mask_storeu_ps(dfPtr1+135744+54272*i7+20352*j2+256*k3+128*m4+32*f5, 255, df14);
_mm512_mask_storeu_ps(dfPtr1+189920+54272*i7+20352*j2+256*k3+128*m4+32*f5, 65280, df13);
_mm512_mask_storeu_ps(dfPtr1+189984+54272*i7+20352*j2+256*k3+128*m4+32*f5, 65280, df14);
df7 = _mm512_permutexvar_ps(eo4, df7);
df8 = _mm512_permutexvar_ps(eo4, df8);
_mm512_mask_storeu_ps(dfPtr1+40704+54272*i7+20352*j2+256*k3+128*m4+32*f5, 255, df7);
_mm512_mask_storeu_ps(dfPtr1+40768+54272*i7+20352*j2+256*k3+128*m4+32*f5, 255, df8);
_mm512_mask_storeu_ps(dfPtr1+94944+54272*i7+20352*j2+256*k3+128*m4+32*f5, 65280, df7);
_mm512_mask_storeu_ps(dfPtr1+95008+54272*i7+20352*j2+256*k3+128*m4+32*f5, 65280, df8);
df15 = _mm512_permutexvar_ps(eo4, df15);
df16 = _mm512_permutexvar_ps(eo4, df16);
_mm512_mask_storeu_ps(dfPtr1+149248+54272*i7+20352*j2+256*k3+128*m4+32*f5, 255, df15);
_mm512_mask_storeu_ps(dfPtr1+149312+54272*i7+20352*j2+256*k3+128*m4+32*f5, 255, df16);
_mm512_mask_storeu_ps(dfPtr1+203488+54272*i7+20352*j2+256*k3+128*m4+32*f5, 65280, df15);
_mm512_mask_storeu_ps(dfPtr1+203552+54272*i7+20352*j2+256*k3+128*m4+32*f5, 65280, df16);
_mm512_mask_storeu_ps(dfPtr1+0+54272*i7+20352*j2+256*k3+128*m4+32*f5, 255, df1);
_mm512_mask_storeu_ps(dfPtr1+64+54272*i7+20352*j2+256*k3+128*m4+32*f5, 255, df2);
_mm512_mask_storeu_ps(dfPtr1+54240+54272*i7+20352*j2+256*k3+128*m4+32*f5, 65280, df1);
_mm512_mask_storeu_ps(dfPtr1+54304+54272*i7+20352*j2+256*k3+128*m4+32*f5, 65280, df2);
_mm512_mask_storeu_ps(dfPtr1+108544+54272*i7+20352*j2+256*k3+128*m4+32*f5, 255, df9);
_mm512_mask_storeu_ps(dfPtr1+108608+54272*i7+20352*j2+256*k3+128*m4+32*f5, 255, df10);
_mm512_mask_storeu_ps(dfPtr1+162784+54272*i7+20352*j2+256*k3+128*m4+32*f5, 65280, df9);
_mm512_mask_storeu_ps(dfPtr1+162848+54272*i7+20352*j2+256*k3+128*m4+32*f5, 65280, df10);
}
ptrdiff_t b4 = 2;
ptrdiff_t m5 = (size_t)b4/2;
ptrdiff_t f6 = (size_t)b4%2;
__m512 dat14 = _mm512_maskz_loadu_ps(4095, datPtr1+80+88192*i7+1664*k3+128*h1+4*w1+0*b4);
dat14 = _mm512_mask_fmadd_ps(dat14, 4095, bnMul1, bnAdd1);
dat14 = _mm512_max_ps(_mm512_setzero_ps(), dat14);
dat14 = _mm512_add_ps(dat14, _mm512_maskz_loadu_ps(4095, datPtr2+80+88192*i7+1664*k3+128*h1+4*w1+0*b4));
__m512 dat15 = _mm512_maskz_loadu_ps(4095, datPtr1+208+88192*i7+1664*k3+128*h1+4*w1+0*b4);
dat15 = _mm512_mask_fmadd_ps(dat15, 4095, bnMul1, bnAdd1);
dat15 = _mm512_max_ps(_mm512_setzero_ps(), dat15);
dat15 = _mm512_add_ps(dat15, _mm512_maskz_loadu_ps(4095, datPtr2+208+88192*i7+1664*k3+128*h1+4*w1+0*b4));
__m512 dat16 = _mm512_maskz_loadu_ps(4095, datPtr1+336+88192*i7+1664*k3+128*h1+4*w1+0*b4);
dat16 = _mm512_mask_fmadd_ps(dat16, 4095, bnMul1, bnAdd1);
dat16 = _mm512_max_ps(_mm512_setzero_ps(), dat16);
dat16 = _mm512_add_ps(dat16, _mm512_maskz_loadu_ps(4095, datPtr2+336+88192*i7+1664*k3+128*h1+4*w1+0*b4));
__m512 dat17 = _mm512_maskz_loadu_ps(4095, datPtr1+464+88192*i7+1664*k3+128*h1+4*w1+0*b4);
dat17 = _mm512_mask_fmadd_ps(dat17, 4095, bnMul1, bnAdd1);
dat17 = _mm512_max_ps(_mm512_setzero_ps(), dat17);
dat17 = _mm512_add_ps(dat17, _mm512_maskz_loadu_ps(4095, datPtr2+464+88192*i7+1664*k3+128*h1+4*w1+0*b4));
__m512 dat18 = _mm512_maskz_loadu_ps(4095, datPtr1+592+88192*i7+1664*k3+128*h1+4*w1+0*b4);
dat18 = _mm512_mask_fmadd_ps(dat18, 4095, bnMul1, bnAdd1);
dat18 = _mm512_max_ps(_mm512_setzero_ps(), dat18);
dat18 = _mm512_add_ps(dat18, _mm512_maskz_loadu_ps(4095, datPtr2+592+88192*i7+1664*k3+128*h1+4*w1+0*b4));
__m512 dat19 = _mm512_maskz_loadu_ps(4095, datPtr1+720+88192*i7+1664*k3+128*h1+4*w1+0*b4);
dat19 = _mm512_mask_fmadd_ps(dat19, 4095, bnMul1, bnAdd1);
dat19 = _mm512_max_ps(_mm512_setzero_ps(), dat19);
dat19 = _mm512_add_ps(dat19, _mm512_maskz_loadu_ps(4095, datPtr2+720+88192*i7+1664*k3+128*h1+4*w1+0*b4));
__m512 dat20 = _mm512_maskz_loadu_ps(4095, datPtr1+848+88192*i7+1664*k3+128*h1+4*w1+0*b4);
dat20 = _mm512_mask_fmadd_ps(dat20, 4095, bnMul1, bnAdd1);
dat20 = _mm512_max_ps(_mm512_setzero_ps(), dat20);
dat20 = _mm512_add_ps(dat20, _mm512_maskz_loadu_ps(4095, datPtr2+848+88192*i7+1664*k3+128*h1+4*w1+0*b4));
__m512 dat21 = _mm512_maskz_loadu_ps(4095, datPtr1+976+88192*i7+1664*k3+128*h1+4*w1+0*b4);
dat21 = _mm512_mask_fmadd_ps(dat21, 4095, bnMul1, bnAdd1);
dat21 = _mm512_max_ps(_mm512_setzero_ps(), dat21);
dat21 = _mm512_add_ps(dat21, _mm512_maskz_loadu_ps(4095, datPtr2+976+88192*i7+1664*k3+128*h1+4*w1+0*b4));
__m512 dat22 = _mm512_maskz_loadu_ps(4095, datPtr1+1104+88192*i7+1664*k3+128*h1+4*w1+0*b4);
dat22 = _mm512_mask_fmadd_ps(dat22, 4095, bnMul1, bnAdd1);
dat22 = _mm512_max_ps(_mm512_setzero_ps(), dat22);
dat22 = _mm512_add_ps(dat22, _mm512_maskz_loadu_ps(4095, datPtr2+1104+88192*i7+1664*k3+128*h1+4*w1+0*b4));
__m512 dat23 = _mm512_maskz_loadu_ps(4095, datPtr1+1232+88192*i7+1664*k3+128*h1+4*w1+0*b4);
dat23 = _mm512_mask_fmadd_ps(dat23, 4095, bnMul1, bnAdd1);
dat23 = _mm512_max_ps(_mm512_setzero_ps(), dat23);
dat23 = _mm512_add_ps(dat23, _mm512_maskz_loadu_ps(4095, datPtr2+1232+88192*i7+1664*k3+128*h1+4*w1+0*b4));
__m512 dat24 = _mm512_maskz_loadu_ps(4095, datPtr1+1360+88192*i7+1664*k3+128*h1+4*w1+0*b4);
dat24 = _mm512_mask_fmadd_ps(dat24, 4095, bnMul1, bnAdd1);
dat24 = _mm512_max_ps(_mm512_setzero_ps(), dat24);
dat24 = _mm512_add_ps(dat24, _mm512_maskz_loadu_ps(4095, datPtr2+1360+88192*i7+1664*k3+128*h1+4*w1+0*b4));
__m512 dat25 = _mm512_maskz_loadu_ps(4095, datPtr1+1488+88192*i7+1664*k3+128*h1+4*w1+0*b4);
dat25 = _mm512_mask_fmadd_ps(dat25, 4095, bnMul1, bnAdd1);
dat25 = _mm512_max_ps(_mm512_setzero_ps(), dat25);
dat25 = _mm512_add_ps(dat25, _mm512_maskz_loadu_ps(4095, datPtr2+1488+88192*i7+1664*k3+128*h1+4*w1+0*b4));
__m512 dat26 = _mm512_maskz_loadu_ps(4095, datPtr1+1616+88192*i7+1664*k3+128*h1+4*w1+0*b4);
dat26 = _mm512_mask_fmadd_ps(dat26, 4095, bnMul1, bnAdd1);
dat26 = _mm512_max_ps(_mm512_setzero_ps(), dat26);
dat26 = _mm512_add_ps(dat26, _mm512_maskz_loadu_ps(4095, datPtr2+1616+88192*i7+1664*k3+128*h1+4*w1+0*b4));
__m512 fft673 = _mm512_add_ps(dat14, dat22);
__m512 fft761 = _mm512_add_ps(dat15, dat23);
__m512 fft674 = _mm512_sub_ps(dat14, dat22);
__m512 fft762 = _mm512_sub_ps(dat15, dat23);
__m512 fft675 = _mm512_add_ps(dat16, dat24);
__m512 fft763 = _mm512_add_ps(dat17, dat25);
__m512 fft676 = _mm512_sub_ps(dat16, dat24);
__m512 fft764 = _mm512_sub_ps(dat17, dat25);
__m512 fft677 = _mm512_add_ps(dat18, dat26);
__m512 fft765 = _mm512_add_ps(dat19, _mm512_setzero_ps());
__m512 fft678 = _mm512_sub_ps(dat18, dat26);
__m512 fft766 = _mm512_sub_ps(dat19, _mm512_setzero_ps());
__m512 fft679 = _mm512_add_ps(dat20, _mm512_setzero_ps());
__m512 fft767 = _mm512_add_ps(dat21, _mm512_setzero_ps());
__m512 fft680 = _mm512_sub_ps(dat20, _mm512_setzero_ps());
__m512 fft768 = _mm512_sub_ps(dat21, _mm512_setzero_ps());
__m512 fft681 = _mm512_add_ps(fft673, fft677);
__m512 fft769 = _mm512_add_ps(fft761, fft765);
__m512 fft682 = _mm512_sub_ps(fft673, fft677);
__m512 fft770 = _mm512_sub_ps(fft761, fft765);
__m512 fft683 = _mm512_add_ps(fft675, fft679);
__m512 fft771 = _mm512_add_ps(fft763, fft767);
__m512 fft684 = _mm512_sub_ps(fft679, fft675);
__m512 fft772 = _mm512_sub_ps(fft767, fft763);
__m512 fft685 = _mm512_sub_ps(fft676, fft680);
__m512 fft773 = _mm512_sub_ps(fft764, fft768);
__m512 fft686 = _mm512_add_ps(fft676, fft680);
__m512 fft774 = _mm512_add_ps(fft764, fft768);
__m512 fft687 = _mm512_add_ps(fft681, fft683);
__m512 fft775 = _mm512_add_ps(fft769, fft771);
__m512 fft688 = _mm512_sub_ps(fft681, fft683);
__m512 fft776 = _mm512_sub_ps(fft769, fft771);
__m512 fft689 = _mm512_fmadd_ps(fft685, _mm512_set1_ps(7.0710677e-01f), fft674);
__m512 fft777 = _mm512_fmadd_ps(fft773, _mm512_set1_ps(7.0710677e-01f), fft762);
__m512 fft690 = _mm512_fnmsub_ps(fft686, _mm512_set1_ps(7.0710677e-01f), fft678);
__m512 fft778 = _mm512_fnmsub_ps(fft774, _mm512_set1_ps(7.0710677e-01f), fft766);
__m512 fft691 = _mm512_fnmadd_ps(fft685, _mm512_set1_ps(7.0710677e-01f), fft674);
__m512 fft779 = _mm512_fnmadd_ps(fft773, _mm512_set1_ps(7.0710677e-01f), fft762);
__m512 fft692 = _mm512_fnmadd_ps(fft686, _mm512_set1_ps(7.0710677e-01f), fft678);
__m512 fft780 = _mm512_fnmadd_ps(fft774, _mm512_set1_ps(7.0710677e-01f), fft766);
__m512 fft693 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft694 = _mm512_fmadd_ps(fft687, fft693, _mm512_shuffle_f32x4(fft687, fft687, 78));
__m512 fft781 = _mm512_fmadd_ps(fft775, fft693, _mm512_shuffle_f32x4(fft775, fft775, 78));
__m512 fft695 = _mm512_fmadd_ps(fft688, fft693, _mm512_shuffle_f32x4(fft688, fft688, 78));
__m512 fft782 = _mm512_fmadd_ps(fft776, fft693, _mm512_shuffle_f32x4(fft776, fft776, 78));
__m512 fft696 = _mm512_fmadd_ps(fft689, fft693, _mm512_shuffle_f32x4(fft689, fft689, 78));
__m512 fft783 = _mm512_fmadd_ps(fft777, fft693, _mm512_shuffle_f32x4(fft777, fft777, 78));
__m512 fft697 = _mm512_fmadd_ps(fft690, fft693, _mm512_shuffle_f32x4(fft690, fft690, 78));
__m512 fft784 = _mm512_fmadd_ps(fft778, fft693, _mm512_shuffle_f32x4(fft778, fft778, 78));
__m512 fft698 = _mm512_fmadd_ps(fft682, fft693, _mm512_shuffle_f32x4(fft682, fft682, 78));
__m512 fft785 = _mm512_fmadd_ps(fft770, fft693, _mm512_shuffle_f32x4(fft770, fft770, 78));
__m512 fft699 = _mm512_fmadd_ps(fft684, fft693, _mm512_shuffle_f32x4(fft684, fft684, 78));
__m512 fft786 = _mm512_fmadd_ps(fft772, fft693, _mm512_shuffle_f32x4(fft772, fft772, 78));
__m512 fft700 = _mm512_fmadd_ps(fft691, fft693, _mm512_shuffle_f32x4(fft691, fft691, 78));
__m512 fft787 = _mm512_fmadd_ps(fft779, fft693, _mm512_shuffle_f32x4(fft779, fft779, 78));
__m512 fft701 = _mm512_fmadd_ps(fft692, fft693, _mm512_shuffle_f32x4(fft692, fft692, 78));
__m512 fft788 = _mm512_fmadd_ps(fft780, fft693, _mm512_shuffle_f32x4(fft780, fft780, 78));
__m512 fft702 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft703 = _mm512_mul_ps(fft694, fft702);
__m512 fft789 = _mm512_mul_ps(fft781, fft702);
__m512 fft704 = _mm512_mul_ps(fft695, fft702);
__m512 fft790 = _mm512_mul_ps(fft782, fft702);
__m512 fft705 = _mm512_mul_ps(fft696, fft702);
__m512 fft791 = _mm512_mul_ps(fft783, fft702);
__m512 fft706 = _mm512_mul_ps(fft697, fft702);
__m512 fft792 = _mm512_mul_ps(fft784, fft702);
__m512 fft707 = _mm512_mul_ps(fft698, fft702);
__m512 fft793 = _mm512_mul_ps(fft785, fft702);
__m512 fft708 = _mm512_mul_ps(fft699, fft702);
__m512 fft794 = _mm512_mul_ps(fft786, fft702);
__m512 fft709 = _mm512_mul_ps(fft700, fft702);
__m512 fft795 = _mm512_mul_ps(fft787, fft702);
__m512 fft710 = _mm512_mul_ps(fft701, fft702);
__m512 fft796 = _mm512_mul_ps(fft788, fft702);
__m512 fft711 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft712 = _mm512_fmadd_ps(fft695, fft711, fft703);
__m512 fft797 = _mm512_fmadd_ps(fft782, fft711, fft789);
__m512 fft713 = _mm512_fnmadd_ps(fft694, fft711, fft704);
__m512 fft798 = _mm512_fnmadd_ps(fft781, fft711, fft790);
__m512 fft714 = _mm512_fmadd_ps(fft697, fft711, fft705);
__m512 fft799 = _mm512_fmadd_ps(fft784, fft711, fft791);
__m512 fft715 = _mm512_fnmadd_ps(fft696, fft711, fft706);
__m512 fft800 = _mm512_fnmadd_ps(fft783, fft711, fft792);
__m512 fft716 = _mm512_fmadd_ps(fft699, fft711, fft707);
__m512 fft801 = _mm512_fmadd_ps(fft786, fft711, fft793);
__m512 fft717 = _mm512_fnmadd_ps(fft698, fft711, fft708);
__m512 fft802 = _mm512_fnmadd_ps(fft785, fft711, fft794);
__m512 fft718 = _mm512_fmadd_ps(fft701, fft711, fft709);
__m512 fft803 = _mm512_fmadd_ps(fft788, fft711, fft795);
__m512 fft719 = _mm512_fnmadd_ps(fft700, fft711, fft710);
__m512 fft804 = _mm512_fnmadd_ps(fft787, fft711, fft796);
__m512 fft720 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft721 = _mm512_fmadd_ps(fft712, fft720, _mm512_shuffle_f32x4(fft712, fft712, 177));
__m512 fft805 = _mm512_fmadd_ps(fft797, fft720, _mm512_shuffle_f32x4(fft797, fft797, 177));
__m512 fft722 = _mm512_fmadd_ps(fft713, fft720, _mm512_shuffle_f32x4(fft713, fft713, 177));
__m512 fft806 = _mm512_fmadd_ps(fft798, fft720, _mm512_shuffle_f32x4(fft798, fft798, 177));
__m512 fft723 = _mm512_fmadd_ps(fft714, fft720, _mm512_shuffle_f32x4(fft714, fft714, 177));
__m512 fft807 = _mm512_fmadd_ps(fft799, fft720, _mm512_shuffle_f32x4(fft799, fft799, 177));
__m512 fft724 = _mm512_fmadd_ps(fft715, fft720, _mm512_shuffle_f32x4(fft715, fft715, 177));
__m512 fft808 = _mm512_fmadd_ps(fft800, fft720, _mm512_shuffle_f32x4(fft800, fft800, 177));
__m512 fft725 = _mm512_fmadd_ps(fft716, fft720, _mm512_shuffle_f32x4(fft716, fft716, 177));
__m512 fft809 = _mm512_fmadd_ps(fft801, fft720, _mm512_shuffle_f32x4(fft801, fft801, 177));
__m512 fft726 = _mm512_fmadd_ps(fft717, fft720, _mm512_shuffle_f32x4(fft717, fft717, 177));
__m512 fft810 = _mm512_fmadd_ps(fft802, fft720, _mm512_shuffle_f32x4(fft802, fft802, 177));
__m512 fft727 = _mm512_fmadd_ps(fft718, fft720, _mm512_shuffle_f32x4(fft718, fft718, 177));
__m512 fft811 = _mm512_fmadd_ps(fft803, fft720, _mm512_shuffle_f32x4(fft803, fft803, 177));
__m512 fft728 = _mm512_fmadd_ps(fft719, fft720, _mm512_shuffle_f32x4(fft719, fft719, 177));
__m512 fft812 = _mm512_fmadd_ps(fft804, fft720, _mm512_shuffle_f32x4(fft804, fft804, 177));
__m512 fft729 = _mm512_mask_mov_ps(fft721, 49344, fft722);
__m512 fft813 = _mm512_mask_mov_ps(fft805, 49344, fft806);
__m512 fft730 = _mm512_mask_sub_ps(fft722, 49344, _mm512_setzero_ps(), fft721);
__m512 fft814 = _mm512_mask_sub_ps(fft806, 49344, _mm512_setzero_ps(), fft805);
__m512 fft731 = _mm512_mask_mov_ps(fft723, 49344, fft724);
__m512 fft815 = _mm512_mask_mov_ps(fft807, 49344, fft808);
__m512 fft732 = _mm512_mask_sub_ps(fft724, 49344, _mm512_setzero_ps(), fft723);
__m512 fft816 = _mm512_mask_sub_ps(fft808, 49344, _mm512_setzero_ps(), fft807);
__m512 fft733 = _mm512_mask_mov_ps(fft725, 49344, fft726);
__m512 fft817 = _mm512_mask_mov_ps(fft809, 49344, fft810);
__m512 fft734 = _mm512_mask_sub_ps(fft726, 49344, _mm512_setzero_ps(), fft725);
__m512 fft818 = _mm512_mask_sub_ps(fft810, 49344, _mm512_setzero_ps(), fft809);
__m512 fft735 = _mm512_mask_mov_ps(fft727, 49344, fft728);
__m512 fft819 = _mm512_mask_mov_ps(fft811, 49344, fft812);
__m512 fft736 = _mm512_mask_sub_ps(fft728, 49344, _mm512_setzero_ps(), fft727);
__m512 fft820 = _mm512_mask_sub_ps(fft812, 49344, _mm512_setzero_ps(), fft811);
__m512 fft737 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft738 = _mm512_fmadd_ps(fft729, fft737, _mm512_shuffle_ps(fft729, fft729, 78));
__m512 fft821 = _mm512_fmadd_ps(fft813, fft737, _mm512_shuffle_ps(fft813, fft813, 78));
__m512 fft739 = _mm512_fmadd_ps(fft730, fft737, _mm512_shuffle_ps(fft730, fft730, 78));
__m512 fft822 = _mm512_fmadd_ps(fft814, fft737, _mm512_shuffle_ps(fft814, fft814, 78));
__m512 fft740 = _mm512_fmadd_ps(fft731, fft737, _mm512_shuffle_ps(fft731, fft731, 78));
__m512 fft823 = _mm512_fmadd_ps(fft815, fft737, _mm512_shuffle_ps(fft815, fft815, 78));
__m512 fft741 = _mm512_fmadd_ps(fft732, fft737, _mm512_shuffle_ps(fft732, fft732, 78));
__m512 fft824 = _mm512_fmadd_ps(fft816, fft737, _mm512_shuffle_ps(fft816, fft816, 78));
__m512 fft742 = _mm512_fmadd_ps(fft733, fft737, _mm512_shuffle_ps(fft733, fft733, 78));
__m512 fft825 = _mm512_fmadd_ps(fft817, fft737, _mm512_shuffle_ps(fft817, fft817, 78));
__m512 fft743 = _mm512_fmadd_ps(fft734, fft737, _mm512_shuffle_ps(fft734, fft734, 78));
__m512 fft826 = _mm512_fmadd_ps(fft818, fft737, _mm512_shuffle_ps(fft818, fft818, 78));
__m512 fft744 = _mm512_fmadd_ps(fft735, fft737, _mm512_shuffle_ps(fft735, fft735, 78));
__m512 fft827 = _mm512_fmadd_ps(fft819, fft737, _mm512_shuffle_ps(fft819, fft819, 78));
__m512 fft745 = _mm512_fmadd_ps(fft736, fft737, _mm512_shuffle_ps(fft736, fft736, 78));
__m512 fft828 = _mm512_fmadd_ps(fft820, fft737, _mm512_shuffle_ps(fft820, fft820, 78));
__m512i fft746 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft747 = _mm512_permutexvar_ps(fft746, fft738);
__m512 fft829 = _mm512_permutexvar_ps(fft746, fft821);
__m512i fft748 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft749 = _mm512_permutexvar_ps(fft748, fft738);
__m512 fft830 = _mm512_permutexvar_ps(fft748, fft821);
__m512 fft750 = _mm512_permutexvar_ps(fft746, fft739);
__m512 fft831 = _mm512_permutexvar_ps(fft746, fft822);
__m512 fft751 = _mm512_permutexvar_ps(fft748, fft739);
__m512 fft832 = _mm512_permutexvar_ps(fft748, fft822);
__m512 fft752 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft753 = _mm512_fmadd_ps(fft747, fft752, fft749);
__m512 fft833 = _mm512_fmadd_ps(fft829, fft752, fft830);
__m512 fft754 = _mm512_fnmadd_ps(fft751, fft752, fft750);
__m512 fft834 = _mm512_fnmadd_ps(fft832, fft752, fft831);
__m512 fft755 = _mm512_mask_mov_ps(fft751, 21845, fft753);
__m512 fft835 = _mm512_mask_mov_ps(fft832, 21845, fft833);
__m512 fft756 = _mm512_mask_mov_ps(fft747, 43176, fft753);
__m512 fft836 = _mm512_mask_mov_ps(fft829, 43176, fft833);
__m512 fft757 = _mm512_mask_mov_ps(fft755, 43176, fft754);
__m512 fft837 = _mm512_mask_mov_ps(fft835, 43176, fft834);
__m512 fft758 = _mm512_mask_mov_ps(fft756, 22102, fft754);
__m512 fft838 = _mm512_mask_mov_ps(fft836, 22102, fft834);
__m512 fft759 = _mm512_mask_mul_ps(fft757, 64764, fft757, _mm512_set1_ps(5e-01f));
__m512 fft839 = _mm512_mask_mul_ps(fft837, 64764, fft837, _mm512_set1_ps(5e-01f));
__m512 fft760 = _mm512_mask_mul_ps(fft758, 64764, fft758, _mm512_set1_ps(5e-01f));
__m512 fft840 = _mm512_mask_mul_ps(fft838, 64764, fft838, _mm512_set1_ps(5e-01f));
__m512 df17 = fft759;
__m512 df25 = fft839;
__m512 df18 = fft760;
__m512 df26 = fft840;
__m512 df19 = fft740;
__m512 df27 = fft823;
__m512 df20 = fft741;
__m512 df28 = fft824;
__m512 df21 = fft742;
__m512 df29 = fft825;
__m512 df22 = fft743;
__m512 df30 = fft826;
__m512 df23 = fft744;
__m512 df31 = fft827;
__m512 df24 = fft745;
__m512 df32 = fft828;
__m512i eo5 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df19 = _mm512_permutexvar_ps(eo5, df19);
df20 = _mm512_permutexvar_ps(eo5, df20);
__m512 rep17 = _mm512_shuffle_f32x4(df19, df19, 68);
_mm512_mask_storeu_ps(dfPtr1+13568+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep17);
__m512 rep18 = _mm512_shuffle_f32x4(df20, df20, 68);
_mm512_mask_storeu_ps(dfPtr1+13632+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep18);
__m512 rep19 = _mm512_shuffle_f32x4(df19, df19, 238);
_mm512_mask_storeu_ps(dfPtr1+67840+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep19);
__m512 rep20 = _mm512_shuffle_f32x4(df20, df20, 238);
_mm512_mask_storeu_ps(dfPtr1+67904+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep20);
df27 = _mm512_permutexvar_ps(eo5, df27);
df28 = _mm512_permutexvar_ps(eo5, df28);
__m512 rep21 = _mm512_shuffle_f32x4(df27, df27, 68);
_mm512_mask_storeu_ps(dfPtr1+122112+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep21);
__m512 rep22 = _mm512_shuffle_f32x4(df28, df28, 68);
_mm512_mask_storeu_ps(dfPtr1+122176+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep22);
__m512 rep23 = _mm512_shuffle_f32x4(df27, df27, 238);
_mm512_mask_storeu_ps(dfPtr1+176384+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep23);
__m512 rep24 = _mm512_shuffle_f32x4(df28, df28, 238);
_mm512_mask_storeu_ps(dfPtr1+176448+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep24);
df21 = _mm512_permutexvar_ps(eo5, df21);
df22 = _mm512_permutexvar_ps(eo5, df22);
__m512 rep25 = _mm512_shuffle_f32x4(df21, df21, 68);
_mm512_mask_storeu_ps(dfPtr1+27136+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep25);
__m512 rep26 = _mm512_shuffle_f32x4(df22, df22, 68);
_mm512_mask_storeu_ps(dfPtr1+27200+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep26);
__m512 rep27 = _mm512_shuffle_f32x4(df21, df21, 238);
_mm512_mask_storeu_ps(dfPtr1+81408+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep27);
__m512 rep28 = _mm512_shuffle_f32x4(df22, df22, 238);
_mm512_mask_storeu_ps(dfPtr1+81472+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep28);
df29 = _mm512_permutexvar_ps(eo5, df29);
df30 = _mm512_permutexvar_ps(eo5, df30);
__m512 rep29 = _mm512_shuffle_f32x4(df29, df29, 68);
_mm512_mask_storeu_ps(dfPtr1+135680+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep29);
__m512 rep30 = _mm512_shuffle_f32x4(df30, df30, 68);
_mm512_mask_storeu_ps(dfPtr1+135744+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep30);
__m512 rep31 = _mm512_shuffle_f32x4(df29, df29, 238);
_mm512_mask_storeu_ps(dfPtr1+189952+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep31);
__m512 rep32 = _mm512_shuffle_f32x4(df30, df30, 238);
_mm512_mask_storeu_ps(dfPtr1+190016+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep32);
df23 = _mm512_permutexvar_ps(eo5, df23);
df24 = _mm512_permutexvar_ps(eo5, df24);
__m512 rep33 = _mm512_shuffle_f32x4(df23, df23, 68);
_mm512_mask_storeu_ps(dfPtr1+40704+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep33);
__m512 rep34 = _mm512_shuffle_f32x4(df24, df24, 68);
_mm512_mask_storeu_ps(dfPtr1+40768+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep34);
__m512 rep35 = _mm512_shuffle_f32x4(df23, df23, 238);
_mm512_mask_storeu_ps(dfPtr1+94976+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep35);
__m512 rep36 = _mm512_shuffle_f32x4(df24, df24, 238);
_mm512_mask_storeu_ps(dfPtr1+95040+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep36);
df31 = _mm512_permutexvar_ps(eo5, df31);
df32 = _mm512_permutexvar_ps(eo5, df32);
__m512 rep37 = _mm512_shuffle_f32x4(df31, df31, 68);
_mm512_mask_storeu_ps(dfPtr1+149248+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep37);
__m512 rep38 = _mm512_shuffle_f32x4(df32, df32, 68);
_mm512_mask_storeu_ps(dfPtr1+149312+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep38);
__m512 rep39 = _mm512_shuffle_f32x4(df31, df31, 238);
_mm512_mask_storeu_ps(dfPtr1+203520+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep39);
__m512 rep40 = _mm512_shuffle_f32x4(df32, df32, 238);
_mm512_mask_storeu_ps(dfPtr1+203584+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep40);
__m512 rep41 = _mm512_shuffle_f32x4(df17, df17, 68);
_mm512_mask_storeu_ps(dfPtr1+0+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep41);
__m512 rep42 = _mm512_shuffle_f32x4(df18, df18, 68);
_mm512_mask_storeu_ps(dfPtr1+64+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep42);
__m512 rep43 = _mm512_shuffle_f32x4(df17, df17, 238);
_mm512_mask_storeu_ps(dfPtr1+54272+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep43);
__m512 rep44 = _mm512_shuffle_f32x4(df18, df18, 238);
_mm512_mask_storeu_ps(dfPtr1+54336+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep44);
__m512 rep45 = _mm512_shuffle_f32x4(df25, df25, 68);
_mm512_mask_storeu_ps(dfPtr1+108544+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep45);
__m512 rep46 = _mm512_shuffle_f32x4(df26, df26, 68);
_mm512_mask_storeu_ps(dfPtr1+108608+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep46);
__m512 rep47 = _mm512_shuffle_f32x4(df25, df25, 238);
_mm512_mask_storeu_ps(dfPtr1+162816+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep47);
__m512 rep48 = _mm512_shuffle_f32x4(df26, df26, 238);
_mm512_mask_storeu_ps(dfPtr1+162880+54272*i7+20352*j2+256*k3+128*m5+32*f6, 65535, rep48);
}
++j2;
}

/*
 * Dispatch the ArrangeDats1 callee once over the threader team.
 * The hull is 1 in all four dimensions, so the whole arrangement
 * runs as a single task unit; the tensor pointer array is passed
 * through the task's opaque payload.
 */
static void Example29StriderArrangeDats1(Example29ThreaderTeam1* team15, char** tensors3) {
    Example29ThreaderTask1 arrangeTask;
    arrangeTask.callee1 = Example29StriderArrangeDats1Callee1;
    arrangeTask.any1 = tensors3;
    arrangeTask.nd1 = 4;
    for (ptrdiff_t dim = 0; dim < 4; ++dim) {
        arrangeTask.hull1[dim] = 1;
    }
    Example29ThreaderDo1(team15, &arrangeTask);
}

// Threader callee producing frequency-domain sums for the strided Conv:
// multiplies arranged filter blocks (wfPtr*, stored as fp16 with real halves
// in the low 256 bits and imaginary halves in the high 256 bits of each
// 64-byte load) against arranged data blocks (dfPtr*, fp32 Re/Im pairs) and
// writes/accumulates complex accumulators into sfPtr*.
//   pt9[0] (w2) selects a group of up to 6 iterations of the inner block
//   index l; pt9[2] (p1) selects the frequency-block index j.
//   tuple2[2] (z2) is the input slice: slice 0 initializes sfPtr, later
//   slices read-modify-write it.
// Mask 64764 == 0xFCFC excludes lanes 0,1,8,9 from the imaginary-part
// arithmetic in the j==0 path; presumably those lanes carry purely real
// (DC/Nyquist-style) terms — NOTE(review): inferred from the masked
// fmadd/fnmadd pattern, confirm against the NN-512 generator.
static void Example29StriderProduceSums1Callee1(Example29ThreaderTask1* task8, int64_t* pt9) {
void** tuple2 = task8->any1;
char** tensors6 = tuple2[0];
ptrdiff_t e3 = 0;
ptrdiff_t z2 = (ptrdiff_t)tuple2[2];
ptrdiff_t g4 = 0;
ptrdiff_t p1 = pt9[2];
ptrdiff_t d1 = 0;
ptrdiff_t w2 = pt9[0];
// First-slice path (e3 is constant 0, so this tests z2 == 0): sums are
// stored into sfPtr directly and the bias table (bfPtr2) is folded in.
if (__builtin_expect(!(e3|z2), 0)) {
z2 = 0;
char*restrict bfPtr2 = tensors6[0]+376*e3;
char*restrict wfPtr2 = tensors6[0]+384+23871488*e3+637696*z2;
char*restrict dfPtr2 = tensors6[1]+2031616*e3+54272*z2;
char*restrict sfPtr1 = tensors6[2];
ptrdiff_t i8 = 1*g4;
ptrdiff_t j3 = 1*p1;
ptrdiff_t jj2 = j3+0;
// j == 0 frequency block: seed the real accumulators with broadcast bias
// values (write masks 1/256 place them in lanes 0 and 8) and use the
// 0xFCFC-masked complex arithmetic.
if (__builtin_expect(!j3, 0)) {
ptrdiff_t k4 = 1*d1;
ptrdiff_t l1 = 6*w2;
ptrdiff_t ll1 = l1+5;
// Full blocks l = 0..22; this task's slice covers at most 6 of them
// (l1 >= ll1 exits early). Reaching l1 == 23 falls through to the
// half-width remainder block below.
for (; l1 != 23; ++l1) {
__m512 sfRe1 = _mm512_setzero_ps();
__m512 sfIm1 = _mm512_setzero_ps();
__m512 sfRe4 = _mm512_setzero_ps();
__m512 sfIm4 = _mm512_setzero_ps();
sfRe1 = _mm512_mask_mov_ps(sfRe1, 1, _mm512_set1_ps(*(float*)(bfPtr2+0+376*i8+16*l1)));
sfRe1 = _mm512_mask_mov_ps(sfRe1, 256, _mm512_set1_ps(*(float*)(bfPtr2+4+376*i8+16*l1)));
sfRe4 = _mm512_mask_mov_ps(sfRe4, 1, _mm512_set1_ps(*(float*)(bfPtr2+8+376*i8+16*l1)));
sfRe4 = _mm512_mask_mov_ps(sfRe4, 256, _mm512_set1_ps(*(float*)(bfPtr2+12+376*i8+16*l1)));
__m512 sfRe2 = sfRe1;
__m512 sfIm2 = sfIm1;
__m512 sfRe3 = sfRe1;
__m512 sfIm3 = sfIm1;
__m512 sfRe5 = sfRe4;
__m512 sfIm5 = sfIm4;
__m512 sfRe6 = sfRe4;
__m512 sfIm6 = sfIm4;
// Reduce over the 53 input channels. Per channel: two fp16 filter
// loads (wfLd1/wfLd2) and two fp32 data Re/Im pairs. The update is a
// conjugated complex MAC on the masked lanes:
//   sfRe += wfRe*dfRe (+ wfIm*dfIm on 0xFCFC)
//   sfIm += wfMx*dfIm (- wfIm*dfRe on 0xFCFC)
// where wfMx copies wfRe into the 0xFCFC lanes of wfIm.
for (ptrdiff_t s2 = 0; s2 < 53; ++s2) {
__m512i wfLd1 = _mm512_loadu_si512(wfPtr2+0+637696*i8+159424*j3+6784*l1+128*s2);
__m512 wfRe1 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd1));
__m512 wfIm1 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd1, 1));
__m512 wfMx1 = _mm512_mask_mov_ps(wfIm1, 64764, wfRe1);
__m512i wfLd2 = _mm512_loadu_si512(wfPtr2+64+637696*i8+159424*j3+6784*l1+128*s2);
__m512 wfRe2 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd2));
__m512 wfIm2 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd2, 1));
__m512 wfMx2 = _mm512_mask_mov_ps(wfIm2, 64764, wfRe2);
__m512 dfRe1 = _mm512_loadu_ps(dfPtr2+0+54272*i8+13568*j3+20352*k4+256*s2);
__m512 dfIm1 = _mm512_loadu_ps(dfPtr2+64+54272*i8+13568*j3+20352*k4+256*s2);
sfRe1 = _mm512_fmadd_ps(wfRe1, dfRe1, sfRe1);
sfRe1 = _mm512_mask3_fmadd_ps(wfIm1, dfIm1, sfRe1, 64764);
sfIm1 = _mm512_fmadd_ps(wfMx1, dfIm1, sfIm1);
sfIm1 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe1, sfIm1, 64764);
sfRe4 = _mm512_fmadd_ps(wfRe2, dfRe1, sfRe4);
sfRe4 = _mm512_mask3_fmadd_ps(wfIm2, dfIm1, sfRe4, 64764);
sfIm4 = _mm512_fmadd_ps(wfMx2, dfIm1, sfIm4);
sfIm4 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe1, sfIm4, 64764);
// imm 78 = [2,3,0,1]: swap the low/high 256-bit halves of the data
// block, then accumulate the swapped combination into sfRe2/sfRe5.
dfRe1 = _mm512_shuffle_f32x4(dfRe1, dfRe1, 78);
dfIm1 = _mm512_shuffle_f32x4(dfIm1, dfIm1, 78);
sfRe2 = _mm512_fmadd_ps(wfRe1, dfRe1, sfRe2);
sfRe2 = _mm512_mask3_fmadd_ps(wfIm1, dfIm1, sfRe2, 64764);
sfIm2 = _mm512_fmadd_ps(wfMx1, dfIm1, sfIm2);
sfIm2 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe1, sfIm2, 64764);
sfRe5 = _mm512_fmadd_ps(wfRe2, dfRe1, sfRe5);
sfRe5 = _mm512_mask3_fmadd_ps(wfIm2, dfIm1, sfRe5, 64764);
sfIm5 = _mm512_fmadd_ps(wfMx2, dfIm1, sfIm5);
sfIm5 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe1, sfIm5, 64764);
__m512 dfRe2 = _mm512_loadu_ps(dfPtr2+128+54272*i8+13568*j3+20352*k4+256*s2);
__m512 dfIm2 = _mm512_loadu_ps(dfPtr2+192+54272*i8+13568*j3+20352*k4+256*s2);
sfRe3 = _mm512_fmadd_ps(wfRe1, dfRe2, sfRe3);
sfRe3 = _mm512_mask3_fmadd_ps(wfIm1, dfIm2, sfRe3, 64764);
sfIm3 = _mm512_fmadd_ps(wfMx1, dfIm2, sfIm3);
sfIm3 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe2, sfIm3, 64764);
sfRe6 = _mm512_fmadd_ps(wfRe2, dfRe2, sfRe6);
sfRe6 = _mm512_mask3_fmadd_ps(wfIm2, dfIm2, sfRe6, 64764);
sfIm6 = _mm512_fmadd_ps(wfMx2, dfIm2, sfIm6);
sfIm6 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe2, sfIm6, 64764);
}
// First slice: plain stores (no accumulation with prior contents).
_mm512_storeu_ps(sfPtr1+0+71424*i8+17856*j3+35712*k4+768*l1, sfRe1);
_mm512_storeu_ps(sfPtr1+64+71424*i8+17856*j3+35712*k4+768*l1, sfIm1);
_mm512_storeu_ps(sfPtr1+128+71424*i8+17856*j3+35712*k4+768*l1, sfRe2);
_mm512_storeu_ps(sfPtr1+192+71424*i8+17856*j3+35712*k4+768*l1, sfIm2);
_mm512_storeu_ps(sfPtr1+256+71424*i8+17856*j3+35712*k4+768*l1, sfRe3);
_mm512_storeu_ps(sfPtr1+320+71424*i8+17856*j3+35712*k4+768*l1, sfIm3);
_mm512_storeu_ps(sfPtr1+384+71424*i8+17856*j3+35712*k4+768*l1, sfRe4);
_mm512_storeu_ps(sfPtr1+448+71424*i8+17856*j3+35712*k4+768*l1, sfIm4);
_mm512_storeu_ps(sfPtr1+512+71424*i8+17856*j3+35712*k4+768*l1, sfRe5);
_mm512_storeu_ps(sfPtr1+576+71424*i8+17856*j3+35712*k4+768*l1, sfIm5);
_mm512_storeu_ps(sfPtr1+640+71424*i8+17856*j3+35712*k4+768*l1, sfRe6);
_mm512_storeu_ps(sfPtr1+704+71424*i8+17856*j3+35712*k4+768*l1, sfIm6);
if (l1 >= ll1) return;
}
// Remainder block l1 == 23: a single filter register per channel
// (64-byte stride instead of 128) and a single accumulator pair; bias
// broadcast into lanes 0 and 8 at once (mask 257).
__m512 sfRe7 = _mm512_setzero_ps();
__m512 sfIm7 = _mm512_setzero_ps();
sfRe7 = _mm512_mask_mov_ps(sfRe7, 257, _mm512_set1_ps(*(float*)(bfPtr2+0+376*i8+16*l1)));
__m512 sfRe8 = sfRe7;
__m512 sfIm8 = sfIm7;
for (ptrdiff_t s3 = 0; s3 < 53; ++s3) {
__m512i wfLd3 = _mm512_loadu_si512(wfPtr2+0+637696*i8+159424*j3+6784*l1+64*s3);
__m512 wfRe3 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd3));
__m512 wfIm3 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd3, 1));
__m512 wfMx3 = _mm512_mask_mov_ps(wfIm3, 64764, wfRe3);
__m512 dfRe3 = _mm512_loadu_ps(dfPtr2+0+54272*i8+13568*j3+20352*k4+256*s3);
__m512 dfIm3 = _mm512_loadu_ps(dfPtr2+64+54272*i8+13568*j3+20352*k4+256*s3);
sfRe7 = _mm512_fmadd_ps(wfRe3, dfRe3, sfRe7);
sfRe7 = _mm512_mask3_fmadd_ps(wfIm3, dfIm3, sfRe7, 64764);
sfIm7 = _mm512_fmadd_ps(wfMx3, dfIm3, sfIm7);
sfIm7 = _mm512_mask3_fnmadd_ps(wfIm3, dfRe3, sfIm7, 64764);
__m512 dfRe4 = _mm512_loadu_ps(dfPtr2+128+54272*i8+13568*j3+20352*k4+256*s3);
__m512 dfIm4 = _mm512_loadu_ps(dfPtr2+192+54272*i8+13568*j3+20352*k4+256*s3);
sfRe8 = _mm512_fmadd_ps(wfRe3, dfRe4, sfRe8);
sfRe8 = _mm512_mask3_fmadd_ps(wfIm3, dfIm4, sfRe8, 64764);
sfIm8 = _mm512_fmadd_ps(wfMx3, dfIm4, sfIm8);
sfIm8 = _mm512_mask3_fnmadd_ps(wfIm3, dfRe4, sfIm8, 64764);
}
// imm 68 = [a0,a1,b0,b1]: pack the low 256-bit halves of Re and Im
// into one register so the remainder writes only 192 bytes.
sfRe8 = _mm512_shuffle_f32x4(sfRe8, sfIm8, 68);
_mm512_storeu_ps(sfPtr1+0+71424*i8+17856*j3+35712*k4+768*l1, sfRe7);
_mm512_storeu_ps(sfPtr1+64+71424*i8+17856*j3+35712*k4+768*l1, sfIm7);
_mm512_storeu_ps(sfPtr1+128+71424*i8+17856*j3+35712*k4+768*l1, sfRe8);
j3 = 1;
}
// Frequency blocks j >= 1: no bias seeding and full (unmasked) complex
// arithmetic in every lane. jj2 was captured before the j == 0 special
// case, so at most one block index is processed per task invocation.
for (; j3 <= jj2; ++j3) {
ptrdiff_t k5 = 1*d1;
ptrdiff_t l2 = 6*w2;
ptrdiff_t ll2 = l2+5;
for (; l2 != 23; ++l2) {
__m512 sfRe9 = _mm512_setzero_ps();
__m512 sfIm9 = _mm512_setzero_ps();
__m512 sfRe12 = _mm512_setzero_ps();
__m512 sfIm12 = _mm512_setzero_ps();
(void)bfPtr2;
__m512 sfRe10 = sfRe9;
__m512 sfIm10 = sfIm9;
__m512 sfRe11 = sfRe9;
__m512 sfIm11 = sfIm9;
__m512 sfRe13 = sfRe12;
__m512 sfIm13 = sfIm12;
__m512 sfRe14 = sfRe12;
__m512 sfIm14 = sfIm12;
for (ptrdiff_t s4 = 0; s4 < 53; ++s4) {
__m512i wfLd4 = _mm512_loadu_si512(wfPtr2+0+637696*i8+159424*j3+6784*l2+128*s4);
__m512 wfRe4 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd4));
__m512 wfIm4 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd4, 1));
__m512i wfLd5 = _mm512_loadu_si512(wfPtr2+64+637696*i8+159424*j3+6784*l2+128*s4);
__m512 wfRe5 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd5));
__m512 wfIm5 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd5, 1));
__m512 dfRe5 = _mm512_loadu_ps(dfPtr2+0+54272*i8+13568*j3+20352*k5+256*s4);
__m512 dfIm5 = _mm512_loadu_ps(dfPtr2+64+54272*i8+13568*j3+20352*k5+256*s4);
sfRe9 = _mm512_fmadd_ps(wfRe4, dfRe5, sfRe9);
sfRe9 = _mm512_fmadd_ps(wfIm4, dfIm5, sfRe9);
sfIm9 = _mm512_fmadd_ps(wfRe4, dfIm5, sfIm9);
sfIm9 = _mm512_fnmadd_ps(wfIm4, dfRe5, sfIm9);
sfRe12 = _mm512_fmadd_ps(wfRe5, dfRe5, sfRe12);
sfRe12 = _mm512_fmadd_ps(wfIm5, dfIm5, sfRe12);
sfIm12 = _mm512_fmadd_ps(wfRe5, dfIm5, sfIm12);
sfIm12 = _mm512_fnmadd_ps(wfIm5, dfRe5, sfIm12);
dfRe5 = _mm512_shuffle_f32x4(dfRe5, dfRe5, 78);
dfIm5 = _mm512_shuffle_f32x4(dfIm5, dfIm5, 78);
sfRe10 = _mm512_fmadd_ps(wfRe4, dfRe5, sfRe10);
sfRe10 = _mm512_fmadd_ps(wfIm4, dfIm5, sfRe10);
sfIm10 = _mm512_fmadd_ps(wfRe4, dfIm5, sfIm10);
sfIm10 = _mm512_fnmadd_ps(wfIm4, dfRe5, sfIm10);
sfRe13 = _mm512_fmadd_ps(wfRe5, dfRe5, sfRe13);
sfRe13 = _mm512_fmadd_ps(wfIm5, dfIm5, sfRe13);
sfIm13 = _mm512_fmadd_ps(wfRe5, dfIm5, sfIm13);
sfIm13 = _mm512_fnmadd_ps(wfIm5, dfRe5, sfIm13);
__m512 dfRe6 = _mm512_loadu_ps(dfPtr2+128+54272*i8+13568*j3+20352*k5+256*s4);
__m512 dfIm6 = _mm512_loadu_ps(dfPtr2+192+54272*i8+13568*j3+20352*k5+256*s4);
sfRe11 = _mm512_fmadd_ps(wfRe4, dfRe6, sfRe11);
sfRe11 = _mm512_fmadd_ps(wfIm4, dfIm6, sfRe11);
sfIm11 = _mm512_fmadd_ps(wfRe4, dfIm6, sfIm11);
sfIm11 = _mm512_fnmadd_ps(wfIm4, dfRe6, sfIm11);
sfRe14 = _mm512_fmadd_ps(wfRe5, dfRe6, sfRe14);
sfRe14 = _mm512_fmadd_ps(wfIm5, dfIm6, sfRe14);
sfIm14 = _mm512_fmadd_ps(wfRe5, dfIm6, sfIm14);
sfIm14 = _mm512_fnmadd_ps(wfIm5, dfRe6, sfIm14);
}
_mm512_storeu_ps(sfPtr1+0+71424*i8+17856*j3+35712*k5+768*l2, sfRe9);
_mm512_storeu_ps(sfPtr1+64+71424*i8+17856*j3+35712*k5+768*l2, sfIm9);
_mm512_storeu_ps(sfPtr1+128+71424*i8+17856*j3+35712*k5+768*l2, sfRe10);
_mm512_storeu_ps(sfPtr1+192+71424*i8+17856*j3+35712*k5+768*l2, sfIm10);
_mm512_storeu_ps(sfPtr1+256+71424*i8+17856*j3+35712*k5+768*l2, sfRe11);
_mm512_storeu_ps(sfPtr1+320+71424*i8+17856*j3+35712*k5+768*l2, sfIm11);
_mm512_storeu_ps(sfPtr1+384+71424*i8+17856*j3+35712*k5+768*l2, sfRe12);
_mm512_storeu_ps(sfPtr1+448+71424*i8+17856*j3+35712*k5+768*l2, sfIm12);
_mm512_storeu_ps(sfPtr1+512+71424*i8+17856*j3+35712*k5+768*l2, sfRe13);
_mm512_storeu_ps(sfPtr1+576+71424*i8+17856*j3+35712*k5+768*l2, sfIm13);
_mm512_storeu_ps(sfPtr1+640+71424*i8+17856*j3+35712*k5+768*l2, sfRe14);
_mm512_storeu_ps(sfPtr1+704+71424*i8+17856*j3+35712*k5+768*l2, sfIm14);
if (l2 >= ll2) return;
}
// Remainder block for j >= 1 (unmasked arithmetic).
__m512 sfRe15 = _mm512_setzero_ps();
__m512 sfIm15 = _mm512_setzero_ps();
(void)bfPtr2;
__m512 sfRe16 = sfRe15;
__m512 sfIm16 = sfIm15;
for (ptrdiff_t s5 = 0; s5 < 53; ++s5) {
__m512i wfLd6 = _mm512_loadu_si512(wfPtr2+0+637696*i8+159424*j3+6784*l2+64*s5);
__m512 wfRe6 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd6));
__m512 wfIm6 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd6, 1));
__m512 dfRe7 = _mm512_loadu_ps(dfPtr2+0+54272*i8+13568*j3+20352*k5+256*s5);
__m512 dfIm7 = _mm512_loadu_ps(dfPtr2+64+54272*i8+13568*j3+20352*k5+256*s5);
sfRe15 = _mm512_fmadd_ps(wfRe6, dfRe7, sfRe15);
sfRe15 = _mm512_fmadd_ps(wfIm6, dfIm7, sfRe15);
sfIm15 = _mm512_fmadd_ps(wfRe6, dfIm7, sfIm15);
sfIm15 = _mm512_fnmadd_ps(wfIm6, dfRe7, sfIm15);
__m512 dfRe8 = _mm512_loadu_ps(dfPtr2+128+54272*i8+13568*j3+20352*k5+256*s5);
__m512 dfIm8 = _mm512_loadu_ps(dfPtr2+192+54272*i8+13568*j3+20352*k5+256*s5);
sfRe16 = _mm512_fmadd_ps(wfRe6, dfRe8, sfRe16);
sfRe16 = _mm512_fmadd_ps(wfIm6, dfIm8, sfRe16);
sfIm16 = _mm512_fmadd_ps(wfRe6, dfIm8, sfIm16);
sfIm16 = _mm512_fnmadd_ps(wfIm6, dfRe8, sfIm16);
}
sfRe16 = _mm512_shuffle_f32x4(sfRe16, sfIm16, 68);
_mm512_storeu_ps(sfPtr1+0+71424*i8+17856*j3+35712*k5+768*l2, sfRe15);
_mm512_storeu_ps(sfPtr1+64+71424*i8+17856*j3+35712*k5+768*l2, sfIm15);
_mm512_storeu_ps(sfPtr1+128+71424*i8+17856*j3+35712*k5+768*l2, sfRe16);
}
return;
}
// Later-slice path (z2 != 0): identical structure to the first-slice path
// above, except that each accumulator is added to the partial sums already
// in sfPtr before storing, and the bias is not re-applied.
char*restrict bfPtr3 = tensors6[0]+376*e3;
char*restrict wfPtr3 = tensors6[0]+384+23871488*e3+637696*z2;
char*restrict dfPtr3 = tensors6[1]+2031616*e3+54272*z2;
char*restrict sfPtr2 = tensors6[2];
ptrdiff_t i9 = 1*g4;
ptrdiff_t j4 = 1*p1;
ptrdiff_t jj3 = j4+0;
// j == 0 frequency block (masked complex arithmetic, accumulating).
if (__builtin_expect(!j4, 0)) {
ptrdiff_t k6 = 1*d1;
ptrdiff_t l3 = 6*w2;
ptrdiff_t ll3 = l3+5;
for (; l3 != 23; ++l3) {
__m512 sfRe17 = _mm512_setzero_ps();
__m512 sfIm17 = _mm512_setzero_ps();
__m512 sfRe20 = _mm512_setzero_ps();
__m512 sfIm20 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe18 = sfRe17;
__m512 sfIm18 = sfIm17;
__m512 sfRe19 = sfRe17;
__m512 sfIm19 = sfIm17;
__m512 sfRe21 = sfRe20;
__m512 sfIm21 = sfIm20;
__m512 sfRe22 = sfRe20;
__m512 sfIm22 = sfIm20;
for (ptrdiff_t s6 = 0; s6 < 53; ++s6) {
__m512i wfLd7 = _mm512_loadu_si512(wfPtr3+0+637696*i9+159424*j4+6784*l3+128*s6);
__m512 wfRe7 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd7));
__m512 wfIm7 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd7, 1));
__m512 wfMx4 = _mm512_mask_mov_ps(wfIm7, 64764, wfRe7);
__m512i wfLd8 = _mm512_loadu_si512(wfPtr3+64+637696*i9+159424*j4+6784*l3+128*s6);
__m512 wfRe8 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd8));
__m512 wfIm8 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd8, 1));
__m512 wfMx5 = _mm512_mask_mov_ps(wfIm8, 64764, wfRe8);
__m512 dfRe9 = _mm512_loadu_ps(dfPtr3+0+54272*i9+13568*j4+20352*k6+256*s6);
__m512 dfIm9 = _mm512_loadu_ps(dfPtr3+64+54272*i9+13568*j4+20352*k6+256*s6);
sfRe17 = _mm512_fmadd_ps(wfRe7, dfRe9, sfRe17);
sfRe17 = _mm512_mask3_fmadd_ps(wfIm7, dfIm9, sfRe17, 64764);
sfIm17 = _mm512_fmadd_ps(wfMx4, dfIm9, sfIm17);
sfIm17 = _mm512_mask3_fnmadd_ps(wfIm7, dfRe9, sfIm17, 64764);
sfRe20 = _mm512_fmadd_ps(wfRe8, dfRe9, sfRe20);
sfRe20 = _mm512_mask3_fmadd_ps(wfIm8, dfIm9, sfRe20, 64764);
sfIm20 = _mm512_fmadd_ps(wfMx5, dfIm9, sfIm20);
sfIm20 = _mm512_mask3_fnmadd_ps(wfIm8, dfRe9, sfIm20, 64764);
dfRe9 = _mm512_shuffle_f32x4(dfRe9, dfRe9, 78);
dfIm9 = _mm512_shuffle_f32x4(dfIm9, dfIm9, 78);
sfRe18 = _mm512_fmadd_ps(wfRe7, dfRe9, sfRe18);
sfRe18 = _mm512_mask3_fmadd_ps(wfIm7, dfIm9, sfRe18, 64764);
sfIm18 = _mm512_fmadd_ps(wfMx4, dfIm9, sfIm18);
sfIm18 = _mm512_mask3_fnmadd_ps(wfIm7, dfRe9, sfIm18, 64764);
sfRe21 = _mm512_fmadd_ps(wfRe8, dfRe9, sfRe21);
sfRe21 = _mm512_mask3_fmadd_ps(wfIm8, dfIm9, sfRe21, 64764);
sfIm21 = _mm512_fmadd_ps(wfMx5, dfIm9, sfIm21);
sfIm21 = _mm512_mask3_fnmadd_ps(wfIm8, dfRe9, sfIm21, 64764);
__m512 dfRe10 = _mm512_loadu_ps(dfPtr3+128+54272*i9+13568*j4+20352*k6+256*s6);
__m512 dfIm10 = _mm512_loadu_ps(dfPtr3+192+54272*i9+13568*j4+20352*k6+256*s6);
sfRe19 = _mm512_fmadd_ps(wfRe7, dfRe10, sfRe19);
sfRe19 = _mm512_mask3_fmadd_ps(wfIm7, dfIm10, sfRe19, 64764);
sfIm19 = _mm512_fmadd_ps(wfMx4, dfIm10, sfIm19);
sfIm19 = _mm512_mask3_fnmadd_ps(wfIm7, dfRe10, sfIm19, 64764);
sfRe22 = _mm512_fmadd_ps(wfRe8, dfRe10, sfRe22);
sfRe22 = _mm512_mask3_fmadd_ps(wfIm8, dfIm10, sfRe22, 64764);
sfIm22 = _mm512_fmadd_ps(wfMx5, dfIm10, sfIm22);
sfIm22 = _mm512_mask3_fnmadd_ps(wfIm8, dfRe10, sfIm22, 64764);
}
// Read-modify-write: accumulate onto the sums produced by earlier slices.
sfRe17 = _mm512_add_ps(sfRe17, _mm512_loadu_ps(sfPtr2+0+71424*i9+17856*j4+35712*k6+768*l3));
sfIm17 = _mm512_add_ps(sfIm17, _mm512_loadu_ps(sfPtr2+64+71424*i9+17856*j4+35712*k6+768*l3));
sfRe18 = _mm512_add_ps(sfRe18, _mm512_loadu_ps(sfPtr2+128+71424*i9+17856*j4+35712*k6+768*l3));
sfIm18 = _mm512_add_ps(sfIm18, _mm512_loadu_ps(sfPtr2+192+71424*i9+17856*j4+35712*k6+768*l3));
sfRe19 = _mm512_add_ps(sfRe19, _mm512_loadu_ps(sfPtr2+256+71424*i9+17856*j4+35712*k6+768*l3));
sfIm19 = _mm512_add_ps(sfIm19, _mm512_loadu_ps(sfPtr2+320+71424*i9+17856*j4+35712*k6+768*l3));
sfRe20 = _mm512_add_ps(sfRe20, _mm512_loadu_ps(sfPtr2+384+71424*i9+17856*j4+35712*k6+768*l3));
sfIm20 = _mm512_add_ps(sfIm20, _mm512_loadu_ps(sfPtr2+448+71424*i9+17856*j4+35712*k6+768*l3));
sfRe21 = _mm512_add_ps(sfRe21, _mm512_loadu_ps(sfPtr2+512+71424*i9+17856*j4+35712*k6+768*l3));
sfIm21 = _mm512_add_ps(sfIm21, _mm512_loadu_ps(sfPtr2+576+71424*i9+17856*j4+35712*k6+768*l3));
sfRe22 = _mm512_add_ps(sfRe22, _mm512_loadu_ps(sfPtr2+640+71424*i9+17856*j4+35712*k6+768*l3));
sfIm22 = _mm512_add_ps(sfIm22, _mm512_loadu_ps(sfPtr2+704+71424*i9+17856*j4+35712*k6+768*l3));
_mm512_storeu_ps(sfPtr2+0+71424*i9+17856*j4+35712*k6+768*l3, sfRe17);
_mm512_storeu_ps(sfPtr2+64+71424*i9+17856*j4+35712*k6+768*l3, sfIm17);
_mm512_storeu_ps(sfPtr2+128+71424*i9+17856*j4+35712*k6+768*l3, sfRe18);
_mm512_storeu_ps(sfPtr2+192+71424*i9+17856*j4+35712*k6+768*l3, sfIm18);
_mm512_storeu_ps(sfPtr2+256+71424*i9+17856*j4+35712*k6+768*l3, sfRe19);
_mm512_storeu_ps(sfPtr2+320+71424*i9+17856*j4+35712*k6+768*l3, sfIm19);
_mm512_storeu_ps(sfPtr2+384+71424*i9+17856*j4+35712*k6+768*l3, sfRe20);
_mm512_storeu_ps(sfPtr2+448+71424*i9+17856*j4+35712*k6+768*l3, sfIm20);
_mm512_storeu_ps(sfPtr2+512+71424*i9+17856*j4+35712*k6+768*l3, sfRe21);
_mm512_storeu_ps(sfPtr2+576+71424*i9+17856*j4+35712*k6+768*l3, sfIm21);
_mm512_storeu_ps(sfPtr2+640+71424*i9+17856*j4+35712*k6+768*l3, sfRe22);
_mm512_storeu_ps(sfPtr2+704+71424*i9+17856*j4+35712*k6+768*l3, sfIm22);
if (l3 >= ll3) return;
}
// Remainder block l3 == 23, accumulating variant.
__m512 sfRe23 = _mm512_setzero_ps();
__m512 sfIm23 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe24 = sfRe23;
__m512 sfIm24 = sfIm23;
for (ptrdiff_t s7 = 0; s7 < 53; ++s7) {
__m512i wfLd9 = _mm512_loadu_si512(wfPtr3+0+637696*i9+159424*j4+6784*l3+64*s7);
__m512 wfRe9 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd9));
__m512 wfIm9 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd9, 1));
__m512 wfMx6 = _mm512_mask_mov_ps(wfIm9, 64764, wfRe9);
__m512 dfRe11 = _mm512_loadu_ps(dfPtr3+0+54272*i9+13568*j4+20352*k6+256*s7);
__m512 dfIm11 = _mm512_loadu_ps(dfPtr3+64+54272*i9+13568*j4+20352*k6+256*s7);
sfRe23 = _mm512_fmadd_ps(wfRe9, dfRe11, sfRe23);
sfRe23 = _mm512_mask3_fmadd_ps(wfIm9, dfIm11, sfRe23, 64764);
sfIm23 = _mm512_fmadd_ps(wfMx6, dfIm11, sfIm23);
sfIm23 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe11, sfIm23, 64764);
__m512 dfRe12 = _mm512_loadu_ps(dfPtr3+128+54272*i9+13568*j4+20352*k6+256*s7);
__m512 dfIm12 = _mm512_loadu_ps(dfPtr3+192+54272*i9+13568*j4+20352*k6+256*s7);
sfRe24 = _mm512_fmadd_ps(wfRe9, dfRe12, sfRe24);
sfRe24 = _mm512_mask3_fmadd_ps(wfIm9, dfIm12, sfRe24, 64764);
sfIm24 = _mm512_fmadd_ps(wfMx6, dfIm12, sfIm24);
sfIm24 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe12, sfIm24, 64764);
}
sfRe23 = _mm512_add_ps(sfRe23, _mm512_loadu_ps(sfPtr2+0+71424*i9+17856*j4+35712*k6+768*l3));
sfIm23 = _mm512_add_ps(sfIm23, _mm512_loadu_ps(sfPtr2+64+71424*i9+17856*j4+35712*k6+768*l3));
sfRe24 = _mm512_shuffle_f32x4(sfRe24, sfIm24, 68);
sfRe24 = _mm512_add_ps(sfRe24, _mm512_loadu_ps(sfPtr2+128+71424*i9+17856*j4+35712*k6+768*l3));
_mm512_storeu_ps(sfPtr2+0+71424*i9+17856*j4+35712*k6+768*l3, sfRe23);
_mm512_storeu_ps(sfPtr2+64+71424*i9+17856*j4+35712*k6+768*l3, sfIm23);
_mm512_storeu_ps(sfPtr2+128+71424*i9+17856*j4+35712*k6+768*l3, sfRe24);
j4 = 1;
}
// Frequency blocks j >= 1, accumulating variant (unmasked arithmetic).
for (; j4 <= jj3; ++j4) {
ptrdiff_t k7 = 1*d1;
ptrdiff_t l4 = 6*w2;
ptrdiff_t ll4 = l4+5;
for (; l4 != 23; ++l4) {
__m512 sfRe25 = _mm512_setzero_ps();
__m512 sfIm25 = _mm512_setzero_ps();
__m512 sfRe28 = _mm512_setzero_ps();
__m512 sfIm28 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe26 = sfRe25;
__m512 sfIm26 = sfIm25;
__m512 sfRe27 = sfRe25;
__m512 sfIm27 = sfIm25;
__m512 sfRe29 = sfRe28;
__m512 sfIm29 = sfIm28;
__m512 sfRe30 = sfRe28;
__m512 sfIm30 = sfIm28;
for (ptrdiff_t s8 = 0; s8 < 53; ++s8) {
__m512i wfLd10 = _mm512_loadu_si512(wfPtr3+0+637696*i9+159424*j4+6784*l4+128*s8);
__m512 wfRe10 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd10));
__m512 wfIm10 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd10, 1));
__m512i wfLd11 = _mm512_loadu_si512(wfPtr3+64+637696*i9+159424*j4+6784*l4+128*s8);
__m512 wfRe11 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd11));
__m512 wfIm11 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd11, 1));
__m512 dfRe13 = _mm512_loadu_ps(dfPtr3+0+54272*i9+13568*j4+20352*k7+256*s8);
__m512 dfIm13 = _mm512_loadu_ps(dfPtr3+64+54272*i9+13568*j4+20352*k7+256*s8);
sfRe25 = _mm512_fmadd_ps(wfRe10, dfRe13, sfRe25);
sfRe25 = _mm512_fmadd_ps(wfIm10, dfIm13, sfRe25);
sfIm25 = _mm512_fmadd_ps(wfRe10, dfIm13, sfIm25);
sfIm25 = _mm512_fnmadd_ps(wfIm10, dfRe13, sfIm25);
sfRe28 = _mm512_fmadd_ps(wfRe11, dfRe13, sfRe28);
sfRe28 = _mm512_fmadd_ps(wfIm11, dfIm13, sfRe28);
sfIm28 = _mm512_fmadd_ps(wfRe11, dfIm13, sfIm28);
sfIm28 = _mm512_fnmadd_ps(wfIm11, dfRe13, sfIm28);
dfRe13 = _mm512_shuffle_f32x4(dfRe13, dfRe13, 78);
dfIm13 = _mm512_shuffle_f32x4(dfIm13, dfIm13, 78);
sfRe26 = _mm512_fmadd_ps(wfRe10, dfRe13, sfRe26);
sfRe26 = _mm512_fmadd_ps(wfIm10, dfIm13, sfRe26);
sfIm26 = _mm512_fmadd_ps(wfRe10, dfIm13, sfIm26);
sfIm26 = _mm512_fnmadd_ps(wfIm10, dfRe13, sfIm26);
sfRe29 = _mm512_fmadd_ps(wfRe11, dfRe13, sfRe29);
sfRe29 = _mm512_fmadd_ps(wfIm11, dfIm13, sfRe29);
sfIm29 = _mm512_fmadd_ps(wfRe11, dfIm13, sfIm29);
sfIm29 = _mm512_fnmadd_ps(wfIm11, dfRe13, sfIm29);
__m512 dfRe14 = _mm512_loadu_ps(dfPtr3+128+54272*i9+13568*j4+20352*k7+256*s8);
__m512 dfIm14 = _mm512_loadu_ps(dfPtr3+192+54272*i9+13568*j4+20352*k7+256*s8);
sfRe27 = _mm512_fmadd_ps(wfRe10, dfRe14, sfRe27);
sfRe27 = _mm512_fmadd_ps(wfIm10, dfIm14, sfRe27);
sfIm27 = _mm512_fmadd_ps(wfRe10, dfIm14, sfIm27);
sfIm27 = _mm512_fnmadd_ps(wfIm10, dfRe14, sfIm27);
sfRe30 = _mm512_fmadd_ps(wfRe11, dfRe14, sfRe30);
sfRe30 = _mm512_fmadd_ps(wfIm11, dfIm14, sfRe30);
sfIm30 = _mm512_fmadd_ps(wfRe11, dfIm14, sfIm30);
sfIm30 = _mm512_fnmadd_ps(wfIm11, dfRe14, sfIm30);
}
sfRe25 = _mm512_add_ps(sfRe25, _mm512_loadu_ps(sfPtr2+0+71424*i9+17856*j4+35712*k7+768*l4));
sfIm25 = _mm512_add_ps(sfIm25, _mm512_loadu_ps(sfPtr2+64+71424*i9+17856*j4+35712*k7+768*l4));
sfRe26 = _mm512_add_ps(sfRe26, _mm512_loadu_ps(sfPtr2+128+71424*i9+17856*j4+35712*k7+768*l4));
sfIm26 = _mm512_add_ps(sfIm26, _mm512_loadu_ps(sfPtr2+192+71424*i9+17856*j4+35712*k7+768*l4));
sfRe27 = _mm512_add_ps(sfRe27, _mm512_loadu_ps(sfPtr2+256+71424*i9+17856*j4+35712*k7+768*l4));
sfIm27 = _mm512_add_ps(sfIm27, _mm512_loadu_ps(sfPtr2+320+71424*i9+17856*j4+35712*k7+768*l4));
sfRe28 = _mm512_add_ps(sfRe28, _mm512_loadu_ps(sfPtr2+384+71424*i9+17856*j4+35712*k7+768*l4));
sfIm28 = _mm512_add_ps(sfIm28, _mm512_loadu_ps(sfPtr2+448+71424*i9+17856*j4+35712*k7+768*l4));
sfRe29 = _mm512_add_ps(sfRe29, _mm512_loadu_ps(sfPtr2+512+71424*i9+17856*j4+35712*k7+768*l4));
sfIm29 = _mm512_add_ps(sfIm29, _mm512_loadu_ps(sfPtr2+576+71424*i9+17856*j4+35712*k7+768*l4));
sfRe30 = _mm512_add_ps(sfRe30, _mm512_loadu_ps(sfPtr2+640+71424*i9+17856*j4+35712*k7+768*l4));
sfIm30 = _mm512_add_ps(sfIm30, _mm512_loadu_ps(sfPtr2+704+71424*i9+17856*j4+35712*k7+768*l4));
_mm512_storeu_ps(sfPtr2+0+71424*i9+17856*j4+35712*k7+768*l4, sfRe25);
_mm512_storeu_ps(sfPtr2+64+71424*i9+17856*j4+35712*k7+768*l4, sfIm25);
_mm512_storeu_ps(sfPtr2+128+71424*i9+17856*j4+35712*k7+768*l4, sfRe26);
_mm512_storeu_ps(sfPtr2+192+71424*i9+17856*j4+35712*k7+768*l4, sfIm26);
_mm512_storeu_ps(sfPtr2+256+71424*i9+17856*j4+35712*k7+768*l4, sfRe27);
_mm512_storeu_ps(sfPtr2+320+71424*i9+17856*j4+35712*k7+768*l4, sfIm27);
_mm512_storeu_ps(sfPtr2+384+71424*i9+17856*j4+35712*k7+768*l4, sfRe28);
_mm512_storeu_ps(sfPtr2+448+71424*i9+17856*j4+35712*k7+768*l4, sfIm28);
_mm512_storeu_ps(sfPtr2+512+71424*i9+17856*j4+35712*k7+768*l4, sfRe29);
_mm512_storeu_ps(sfPtr2+576+71424*i9+17856*j4+35712*k7+768*l4, sfIm29);
_mm512_storeu_ps(sfPtr2+640+71424*i9+17856*j4+35712*k7+768*l4, sfRe30);
_mm512_storeu_ps(sfPtr2+704+71424*i9+17856*j4+35712*k7+768*l4, sfIm30);
if (l4 >= ll4) return;
}
// Remainder block l4 == 23, accumulating variant (unmasked arithmetic).
__m512 sfRe31 = _mm512_setzero_ps();
__m512 sfIm31 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe32 = sfRe31;
__m512 sfIm32 = sfIm31;
for (ptrdiff_t s9 = 0; s9 < 53; ++s9) {
__m512i wfLd12 = _mm512_loadu_si512(wfPtr3+0+637696*i9+159424*j4+6784*l4+64*s9);
__m512 wfRe12 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd12));
__m512 wfIm12 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd12, 1));
__m512 dfRe15 = _mm512_loadu_ps(dfPtr3+0+54272*i9+13568*j4+20352*k7+256*s9);
__m512 dfIm15 = _mm512_loadu_ps(dfPtr3+64+54272*i9+13568*j4+20352*k7+256*s9);
sfRe31 = _mm512_fmadd_ps(wfRe12, dfRe15, sfRe31);
sfRe31 = _mm512_fmadd_ps(wfIm12, dfIm15, sfRe31);
sfIm31 = _mm512_fmadd_ps(wfRe12, dfIm15, sfIm31);
sfIm31 = _mm512_fnmadd_ps(wfIm12, dfRe15, sfIm31);
__m512 dfRe16 = _mm512_loadu_ps(dfPtr3+128+54272*i9+13568*j4+20352*k7+256*s9);
__m512 dfIm16 = _mm512_loadu_ps(dfPtr3+192+54272*i9+13568*j4+20352*k7+256*s9);
sfRe32 = _mm512_fmadd_ps(wfRe12, dfRe16, sfRe32);
sfRe32 = _mm512_fmadd_ps(wfIm12, dfIm16, sfRe32);
sfIm32 = _mm512_fmadd_ps(wfRe12, dfIm16, sfIm32);
sfIm32 = _mm512_fnmadd_ps(wfIm12, dfRe16, sfIm32);
}
sfRe31 = _mm512_add_ps(sfRe31, _mm512_loadu_ps(sfPtr2+0+71424*i9+17856*j4+35712*k7+768*l4));
sfIm31 = _mm512_add_ps(sfIm31, _mm512_loadu_ps(sfPtr2+64+71424*i9+17856*j4+35712*k7+768*l4));
sfRe32 = _mm512_shuffle_f32x4(sfRe32, sfIm32, 68);
sfRe32 = _mm512_add_ps(sfRe32, _mm512_loadu_ps(sfPtr2+128+71424*i9+17856*j4+35712*k7+768*l4));
_mm512_storeu_ps(sfPtr2+0+71424*i9+17856*j4+35712*k7+768*l4, sfRe31);
_mm512_storeu_ps(sfPtr2+64+71424*i9+17856*j4+35712*k7+768*l4, sfIm31);
_mm512_storeu_ps(sfPtr2+128+71424*i9+17856*j4+35712*k7+768*l4, sfRe32);
}
}

/*
 * Drive the ProduceSums1 callee: for each (outer index, input slice)
 * pair, pack both indices into the shared tuple payload and run a task
 * over the team with hull {4, 1, 4, 1}. The outer loop runs once and
 * the slice loop four times; the callee reads tuple[2] to decide
 * whether to initialize or accumulate its sums.
 */
static void Example29StriderProduceSums1(Example29ThreaderTeam1* team16, char** tensors5) {
    void* payload[3];
    payload[0] = tensors5;
    for (ptrdiff_t outer = 0; outer < 1; ++outer) {
        payload[1] = (void*)outer;
        for (ptrdiff_t slice = 0; slice < 4; ++slice) {
            payload[2] = (void*)slice;
            Example29ThreaderTask1 sumTask;
            sumTask.callee1 = Example29StriderProduceSums1Callee1;
            sumTask.any1 = payload;
            sumTask.nd1 = 4;
            sumTask.hull1[0] = 4;
            sumTask.hull1[1] = 1;
            sumTask.hull1[2] = 4;
            sumTask.hull1[3] = 1;
            Example29ThreaderDo1(team16, &sumTask);
        }
    }
}

static void Example29StriderConsumeSums1Callee1(Example29ThreaderTask1* task10, int64_t* pt10) {
char** tensors8 = task10->any1;
ptrdiff_t w3 = 0;
ptrdiff_t d2 = 0;
ptrdiff_t g5 = 0;
(void)pt10;
char*restrict sfPtr3 = tensors8[0];
char*restrict datPtr3 = tensors8[1];
char*restrict bnPtr4 = tensors8[2];
char*restrict datPtr4 = tensors8[3];
ptrdiff_t i10 = 1*g5;
ptrdiff_t j5 = 1*d2;
ptrdiff_t rel2 = j5-0;
ptrdiff_t base2 = 0;
ptrdiff_t toH1 = base2+0;
ptrdiff_t toW1 = 0;
ptrdiff_t k8 = 24*w3;
for (; k8 != 23; ++k8) {
ptrdiff_t r2 = 0;
for (; r2 != 2; ++r2) {
__m512 bnMul2 = _mm512_set1_ps(((float*)bnPtr4+(ptrdiff_t)2*(0+93*i10+4*k8+2*r2))[0]);
__m512 bnAdd2 = _mm512_set1_ps(((float*)bnPtr4+(ptrdiff_t)2*(0+93*i10+4*k8+2*r2))[1]);
__m512 bnMul3 = _mm512_set1_ps(((float*)bnPtr4+(ptrdiff_t)2*(1+93*i10+4*k8+2*r2))[0]);
__m512 bnAdd3 = _mm512_set1_ps(((float*)bnPtr4+(ptrdiff_t)2*(1+93*i10+4*k8+2*r2))[1]);
ptrdiff_t t2 = 0;
__m512 sfRe33 = _mm512_loadu_ps(sfPtr3+0+71424*i10+35712*j5+768*k8+384*r2+256*t2);
__m512 sfIm33 = _mm512_loadu_ps(sfPtr3+64+71424*i10+35712*j5+768*k8+384*r2+256*t2);
__m512 sfRe37 = _mm512_loadu_ps(sfPtr3+128+71424*i10+35712*j5+768*k8+384*r2+256*t2);
__m512 sfIm37 = _mm512_loadu_ps(sfPtr3+192+71424*i10+35712*j5+768*k8+384*r2+256*t2);
__m512 sfRe34 = _mm512_loadu_ps(sfPtr3+17856+71424*i10+35712*j5+768*k8+384*r2+256*t2);
__m512 sfIm34 = _mm512_loadu_ps(sfPtr3+17920+71424*i10+35712*j5+768*k8+384*r2+256*t2);
__m512 sfRe38 = _mm512_loadu_ps(sfPtr3+17984+71424*i10+35712*j5+768*k8+384*r2+256*t2);
__m512 sfIm38 = _mm512_loadu_ps(sfPtr3+18048+71424*i10+35712*j5+768*k8+384*r2+256*t2);
__m512 sfRe35 = _mm512_loadu_ps(sfPtr3+35712+71424*i10+35712*j5+768*k8+384*r2+256*t2);
__m512 sfIm35 = _mm512_loadu_ps(sfPtr3+35776+71424*i10+35712*j5+768*k8+384*r2+256*t2);
__m512 sfRe39 = _mm512_loadu_ps(sfPtr3+35840+71424*i10+35712*j5+768*k8+384*r2+256*t2);
__m512 sfIm39 = _mm512_loadu_ps(sfPtr3+35904+71424*i10+35712*j5+768*k8+384*r2+256*t2);
__m512 sfRe36 = _mm512_loadu_ps(sfPtr3+53568+71424*i10+35712*j5+768*k8+384*r2+256*t2);
__m512 sfIm36 = _mm512_loadu_ps(sfPtr3+53632+71424*i10+35712*j5+768*k8+384*r2+256*t2);
__m512 sfRe40 = _mm512_loadu_ps(sfPtr3+53696+71424*i10+35712*j5+768*k8+384*r2+256*t2);
__m512 sfIm40 = _mm512_loadu_ps(sfPtr3+53760+71424*i10+35712*j5+768*k8+384*r2+256*t2);
__m512i ifft1 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2 = _mm512_permutexvar_ps(ifft1, sfRe33);
__m512 ifft93 = _mm512_permutexvar_ps(ifft1, sfRe37);
__m512i ifft3 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4 = _mm512_permutexvar_ps(ifft3, sfRe33);
__m512 ifft94 = _mm512_permutexvar_ps(ifft3, sfRe37);
__m512 ifft5 = _mm512_permutexvar_ps(ifft1, sfIm33);
__m512 ifft95 = _mm512_permutexvar_ps(ifft1, sfIm37);
__m512 ifft6 = _mm512_permutexvar_ps(ifft3, sfIm33);
__m512 ifft96 = _mm512_permutexvar_ps(ifft3, sfIm37);
__m512 ifft7 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft8 = _mm512_mask_fmadd_ps(ifft6, 65021, ifft7, ifft2);
__m512 ifft97 = _mm512_mask_fmadd_ps(ifft96, 65021, ifft7, ifft93);
__m512 ifft9 = _mm512_mask_fnmadd_ps(ifft5, 65021, ifft7, ifft4);
__m512 ifft98 = _mm512_mask_fnmadd_ps(ifft95, 65021, ifft7, ifft94);
__m512 ifft10 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft11 = _mm512_fmadd_ps(ifft8, ifft10, _mm512_shuffle_ps(ifft8, ifft8, 177));
__m512 ifft99 = _mm512_fmadd_ps(ifft97, ifft10, _mm512_shuffle_ps(ifft97, ifft97, 177));
__m512 ifft12 = _mm512_fmadd_ps(ifft9, ifft10, _mm512_shuffle_ps(ifft9, ifft9, 177));
__m512 ifft100 = _mm512_fmadd_ps(ifft98, ifft10, _mm512_shuffle_ps(ifft98, ifft98, 177));
__m512 ifft13 = _mm512_fmadd_ps(sfRe34, ifft10, _mm512_shuffle_ps(sfRe34, sfRe34, 177));
__m512 ifft101 = _mm512_fmadd_ps(sfRe38, ifft10, _mm512_shuffle_ps(sfRe38, sfRe38, 177));
__m512 ifft14 = _mm512_fmadd_ps(sfIm34, ifft10, _mm512_shuffle_ps(sfIm34, sfIm34, 177));
__m512 ifft102 = _mm512_fmadd_ps(sfIm38, ifft10, _mm512_shuffle_ps(sfIm38, sfIm38, 177));
__m512 ifft15 = _mm512_fmadd_ps(sfRe35, ifft10, _mm512_shuffle_ps(sfRe35, sfRe35, 177));
__m512 ifft103 = _mm512_fmadd_ps(sfRe39, ifft10, _mm512_shuffle_ps(sfRe39, sfRe39, 177));
__m512 ifft16 = _mm512_fmadd_ps(sfIm35, ifft10, _mm512_shuffle_ps(sfIm35, sfIm35, 177));
__m512 ifft104 = _mm512_fmadd_ps(sfIm39, ifft10, _mm512_shuffle_ps(sfIm39, sfIm39, 177));
__m512 ifft17 = _mm512_fmadd_ps(sfRe36, ifft10, _mm512_shuffle_ps(sfRe36, sfRe36, 177));
__m512 ifft105 = _mm512_fmadd_ps(sfRe40, ifft10, _mm512_shuffle_ps(sfRe40, sfRe40, 177));
__m512 ifft18 = _mm512_fmadd_ps(sfIm36, ifft10, _mm512_shuffle_ps(sfIm36, sfIm36, 177));
__m512 ifft106 = _mm512_fmadd_ps(sfIm40, ifft10, _mm512_shuffle_ps(sfIm40, sfIm40, 177));
__m512 ifft19 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft20 = _mm512_mul_ps(ifft11, ifft19);
__m512 ifft107 = _mm512_mul_ps(ifft99, ifft19);
__m512 ifft21 = _mm512_mul_ps(ifft12, ifft19);
__m512 ifft108 = _mm512_mul_ps(ifft100, ifft19);
__m512 ifft22 = _mm512_mul_ps(ifft13, ifft19);
__m512 ifft109 = _mm512_mul_ps(ifft101, ifft19);
__m512 ifft23 = _mm512_mul_ps(ifft14, ifft19);
__m512 ifft110 = _mm512_mul_ps(ifft102, ifft19);
__m512 ifft24 = _mm512_mul_ps(ifft15, ifft19);
__m512 ifft111 = _mm512_mul_ps(ifft103, ifft19);
__m512 ifft25 = _mm512_mul_ps(ifft16, ifft19);
__m512 ifft112 = _mm512_mul_ps(ifft104, ifft19);
__m512 ifft26 = _mm512_mul_ps(ifft17, ifft19);
__m512 ifft113 = _mm512_mul_ps(ifft105, ifft19);
__m512 ifft27 = _mm512_mul_ps(ifft18, ifft19);
__m512 ifft114 = _mm512_mul_ps(ifft106, ifft19);
__m512 ifft28 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft29 = _mm512_fnmadd_ps(ifft12, ifft28, ifft20);
__m512 ifft115 = _mm512_fnmadd_ps(ifft100, ifft28, ifft107);
__m512 ifft30 = _mm512_fmadd_ps(ifft11, ifft28, ifft21);
__m512 ifft116 = _mm512_fmadd_ps(ifft99, ifft28, ifft108);
__m512 ifft31 = _mm512_fnmadd_ps(ifft14, ifft28, ifft22);
__m512 ifft117 = _mm512_fnmadd_ps(ifft102, ifft28, ifft109);
__m512 ifft32 = _mm512_fmadd_ps(ifft13, ifft28, ifft23);
__m512 ifft118 = _mm512_fmadd_ps(ifft101, ifft28, ifft110);
__m512 ifft33 = _mm512_fnmadd_ps(ifft16, ifft28, ifft24);
__m512 ifft119 = _mm512_fnmadd_ps(ifft104, ifft28, ifft111);
__m512 ifft34 = _mm512_fmadd_ps(ifft15, ifft28, ifft25);
__m512 ifft120 = _mm512_fmadd_ps(ifft103, ifft28, ifft112);
__m512 ifft35 = _mm512_fnmadd_ps(ifft18, ifft28, ifft26);
__m512 ifft121 = _mm512_fnmadd_ps(ifft106, ifft28, ifft113);
__m512 ifft36 = _mm512_fmadd_ps(ifft17, ifft28, ifft27);
__m512 ifft122 = _mm512_fmadd_ps(ifft105, ifft28, ifft114);
__m512 ifft37 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft38 = _mm512_fmadd_ps(ifft29, ifft37, _mm512_shuffle_ps(ifft29, ifft29, 78));
__m512 ifft123 = _mm512_fmadd_ps(ifft115, ifft37, _mm512_shuffle_ps(ifft115, ifft115, 78));
__m512 ifft39 = _mm512_fmadd_ps(ifft30, ifft37, _mm512_shuffle_ps(ifft30, ifft30, 78));
__m512 ifft124 = _mm512_fmadd_ps(ifft116, ifft37, _mm512_shuffle_ps(ifft116, ifft116, 78));
__m512 ifft40 = _mm512_fmadd_ps(ifft31, ifft37, _mm512_shuffle_ps(ifft31, ifft31, 78));
__m512 ifft125 = _mm512_fmadd_ps(ifft117, ifft37, _mm512_shuffle_ps(ifft117, ifft117, 78));
__m512 ifft41 = _mm512_fmadd_ps(ifft32, ifft37, _mm512_shuffle_ps(ifft32, ifft32, 78));
__m512 ifft126 = _mm512_fmadd_ps(ifft118, ifft37, _mm512_shuffle_ps(ifft118, ifft118, 78));
__m512 ifft42 = _mm512_fmadd_ps(ifft33, ifft37, _mm512_shuffle_ps(ifft33, ifft33, 78));
__m512 ifft127 = _mm512_fmadd_ps(ifft119, ifft37, _mm512_shuffle_ps(ifft119, ifft119, 78));
__m512 ifft43 = _mm512_fmadd_ps(ifft34, ifft37, _mm512_shuffle_ps(ifft34, ifft34, 78));
__m512 ifft128 = _mm512_fmadd_ps(ifft120, ifft37, _mm512_shuffle_ps(ifft120, ifft120, 78));
__m512 ifft44 = _mm512_fmadd_ps(ifft35, ifft37, _mm512_shuffle_ps(ifft35, ifft35, 78));
__m512 ifft129 = _mm512_fmadd_ps(ifft121, ifft37, _mm512_shuffle_ps(ifft121, ifft121, 78));
__m512 ifft45 = _mm512_fmadd_ps(ifft36, ifft37, _mm512_shuffle_ps(ifft36, ifft36, 78));
__m512 ifft130 = _mm512_fmadd_ps(ifft122, ifft37, _mm512_shuffle_ps(ifft122, ifft122, 78));
__m512 ifft46 = _mm512_mask_sub_ps(ifft38, 49344, _mm512_setzero_ps(), ifft39);
__m512 ifft131 = _mm512_mask_sub_ps(ifft123, 49344, _mm512_setzero_ps(), ifft124);
__m512 ifft47 = _mm512_mask_mov_ps(ifft39, 49344, ifft38);
__m512 ifft132 = _mm512_mask_mov_ps(ifft124, 49344, ifft123);
__m512 ifft48 = _mm512_mask_sub_ps(ifft40, 49344, _mm512_setzero_ps(), ifft41);
__m512 ifft133 = _mm512_mask_sub_ps(ifft125, 49344, _mm512_setzero_ps(), ifft126);
__m512 ifft49 = _mm512_mask_mov_ps(ifft41, 49344, ifft40);
__m512 ifft134 = _mm512_mask_mov_ps(ifft126, 49344, ifft125);
__m512 ifft50 = _mm512_mask_sub_ps(ifft42, 49344, _mm512_setzero_ps(), ifft43);
__m512 ifft135 = _mm512_mask_sub_ps(ifft127, 49344, _mm512_setzero_ps(), ifft128);
__m512 ifft51 = _mm512_mask_mov_ps(ifft43, 49344, ifft42);
__m512 ifft136 = _mm512_mask_mov_ps(ifft128, 49344, ifft127);
__m512 ifft52 = _mm512_mask_sub_ps(ifft44, 49344, _mm512_setzero_ps(), ifft45);
__m512 ifft137 = _mm512_mask_sub_ps(ifft129, 49344, _mm512_setzero_ps(), ifft130);
__m512 ifft53 = _mm512_mask_mov_ps(ifft45, 49344, ifft44);
__m512 ifft138 = _mm512_mask_mov_ps(ifft130, 49344, ifft129);
__m512 ifft54 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft55 = _mm512_fmadd_ps(ifft46, ifft54, _mm512_shuffle_f32x4(ifft46, ifft46, 177));
__m512 ifft139 = _mm512_fmadd_ps(ifft131, ifft54, _mm512_shuffle_f32x4(ifft131, ifft131, 177));
__m512 ifft56 = _mm512_fmadd_ps(ifft47, ifft54, _mm512_shuffle_f32x4(ifft47, ifft47, 177));
__m512 ifft140 = _mm512_fmadd_ps(ifft132, ifft54, _mm512_shuffle_f32x4(ifft132, ifft132, 177));
__m512 ifft57 = _mm512_fmadd_ps(ifft48, ifft54, _mm512_shuffle_f32x4(ifft48, ifft48, 177));
__m512 ifft141 = _mm512_fmadd_ps(ifft133, ifft54, _mm512_shuffle_f32x4(ifft133, ifft133, 177));
__m512 ifft58 = _mm512_fmadd_ps(ifft49, ifft54, _mm512_shuffle_f32x4(ifft49, ifft49, 177));
__m512 ifft142 = _mm512_fmadd_ps(ifft134, ifft54, _mm512_shuffle_f32x4(ifft134, ifft134, 177));
__m512 ifft59 = _mm512_fmadd_ps(ifft50, ifft54, _mm512_shuffle_f32x4(ifft50, ifft50, 177));
__m512 ifft143 = _mm512_fmadd_ps(ifft135, ifft54, _mm512_shuffle_f32x4(ifft135, ifft135, 177));
__m512 ifft60 = _mm512_fnmsub_ps(ifft51, ifft54, _mm512_shuffle_f32x4(ifft51, ifft51, 177));
__m512 ifft144 = _mm512_fnmsub_ps(ifft136, ifft54, _mm512_shuffle_f32x4(ifft136, ifft136, 177));
__m512 ifft61 = _mm512_fmadd_ps(ifft52, ifft54, _mm512_shuffle_f32x4(ifft52, ifft52, 177));
__m512 ifft145 = _mm512_fmadd_ps(ifft137, ifft54, _mm512_shuffle_f32x4(ifft137, ifft137, 177));
__m512 ifft62 = _mm512_fmadd_ps(ifft53, ifft54, _mm512_shuffle_f32x4(ifft53, ifft53, 177));
__m512 ifft146 = _mm512_fmadd_ps(ifft138, ifft54, _mm512_shuffle_f32x4(ifft138, ifft138, 177));
__m512 ifft63 = _mm512_add_ps(ifft55, ifft56);
__m512 ifft147 = _mm512_add_ps(ifft139, ifft140);
__m512 ifft64 = _mm512_sub_ps(ifft55, ifft56);
__m512 ifft148 = _mm512_sub_ps(ifft139, ifft140);
__m512 ifft65 = _mm512_sub_ps(ifft57, ifft61);
__m512 ifft149 = _mm512_sub_ps(ifft141, ifft145);
__m512 ifft66 = _mm512_add_ps(ifft58, ifft62);
__m512 ifft150 = _mm512_add_ps(ifft142, ifft146);
__m512 ifft67 = _mm512_add_ps(ifft57, ifft61);
__m512 ifft151 = _mm512_add_ps(ifft141, ifft145);
__m512 ifft68 = _mm512_sub_ps(ifft58, ifft62);
__m512 ifft152 = _mm512_sub_ps(ifft142, ifft146);
__m512 ifft69 = _mm512_mul_ps(ifft59, _mm512_set1_ps(3.125e-02f));
__m512 ifft153 = _mm512_mul_ps(ifft143, _mm512_set1_ps(3.125e-02f));
__m512 ifft70 = _mm512_mul_ps(ifft60, _mm512_set1_ps(3.125e-02f));
__m512 ifft154 = _mm512_mul_ps(ifft144, _mm512_set1_ps(3.125e-02f));
__m512 ifft71 = _mm512_fmadd_ps(ifft63, _mm512_set1_ps(1.5625e-02f), ifft69);
__m512 ifft155 = _mm512_fmadd_ps(ifft147, _mm512_set1_ps(1.5625e-02f), ifft153);
__m512 ifft72 = _mm512_fmsub_ps(ifft63, _mm512_set1_ps(1.5625e-02f), ifft69);
__m512 ifft156 = _mm512_fmsub_ps(ifft147, _mm512_set1_ps(1.5625e-02f), ifft153);
__m512 ifft73 = _mm512_fmadd_ps(ifft64, _mm512_set1_ps(1.5625e-02f), ifft70);
__m512 ifft157 = _mm512_fmadd_ps(ifft148, _mm512_set1_ps(1.5625e-02f), ifft154);
__m512 ifft74 = _mm512_fmsub_ps(ifft64, _mm512_set1_ps(1.5625e-02f), ifft70);
__m512 ifft158 = _mm512_fmsub_ps(ifft148, _mm512_set1_ps(1.5625e-02f), ifft154);
__m512 ifft75 = _mm512_add_ps(ifft65, ifft66);
__m512 ifft159 = _mm512_add_ps(ifft149, ifft150);
__m512 ifft76 = _mm512_sub_ps(ifft65, ifft66);
__m512 ifft160 = _mm512_sub_ps(ifft149, ifft150);
__m512 ifft77 = _mm512_fnmadd_ps(ifft75, _mm512_set1_ps(7.0710677e-01f), ifft67);
__m512 ifft161 = _mm512_fnmadd_ps(ifft159, _mm512_set1_ps(7.0710677e-01f), ifft151);
__m512 ifft78 = _mm512_fmadd_ps(ifft75, _mm512_set1_ps(7.0710677e-01f), ifft67);
__m512 ifft162 = _mm512_fmadd_ps(ifft159, _mm512_set1_ps(7.0710677e-01f), ifft151);
__m512 ifft79 = _mm512_fmadd_ps(ifft76, _mm512_set1_ps(7.0710677e-01f), ifft68);
__m512 ifft163 = _mm512_fmadd_ps(ifft160, _mm512_set1_ps(7.0710677e-01f), ifft152);
__m512 ifft80 = _mm512_fmsub_ps(ifft76, _mm512_set1_ps(7.0710677e-01f), ifft68);
__m512 ifft164 = _mm512_fmsub_ps(ifft160, _mm512_set1_ps(7.0710677e-01f), ifft152);
__m512 ifft81 = _mm512_add_ps(ifft77, ifft78);
__m512 ifft165 = _mm512_add_ps(ifft161, ifft162);
__m512 ifft82 = _mm512_sub_ps(ifft77, ifft78);
__m512 ifft166 = _mm512_sub_ps(ifft161, ifft162);
__m512 ifft83 = _mm512_add_ps(ifft79, ifft80);
__m512 ifft167 = _mm512_add_ps(ifft163, ifft164);
__m512 ifft84 = _mm512_sub_ps(ifft79, ifft80);
__m512 ifft168 = _mm512_sub_ps(ifft163, ifft164);
__m512 ifft85 = _mm512_fmadd_ps(ifft81, _mm512_set1_ps(1.5625e-02f), ifft71);
__m512 ifft169 = _mm512_fmadd_ps(ifft165, _mm512_set1_ps(1.5625e-02f), ifft155);
__m512 ifft86 = _mm512_fnmadd_ps(ifft81, _mm512_set1_ps(1.5625e-02f), ifft71);
__m512 ifft170 = _mm512_fnmadd_ps(ifft165, _mm512_set1_ps(1.5625e-02f), ifft155);
__m512 ifft87 = _mm512_fmadd_ps(ifft83, _mm512_set1_ps(1.5625e-02f), ifft73);
__m512 ifft171 = _mm512_fmadd_ps(ifft167, _mm512_set1_ps(1.5625e-02f), ifft157);
__m512 ifft88 = _mm512_fnmadd_ps(ifft83, _mm512_set1_ps(1.5625e-02f), ifft73);
__m512 ifft172 = _mm512_fnmadd_ps(ifft167, _mm512_set1_ps(1.5625e-02f), ifft157);
__m512 ifft89 = _mm512_fnmadd_ps(ifft84, _mm512_set1_ps(1.5625e-02f), ifft72);
__m512 ifft173 = _mm512_fnmadd_ps(ifft168, _mm512_set1_ps(1.5625e-02f), ifft156);
__m512 ifft90 = _mm512_fmadd_ps(ifft84, _mm512_set1_ps(1.5625e-02f), ifft72);
__m512 ifft174 = _mm512_fmadd_ps(ifft168, _mm512_set1_ps(1.5625e-02f), ifft156);
__m512 ifft91 = _mm512_fmadd_ps(ifft82, _mm512_set1_ps(1.5625e-02f), ifft74);
__m512 ifft175 = _mm512_fmadd_ps(ifft166, _mm512_set1_ps(1.5625e-02f), ifft158);
__m512 ifft92 = _mm512_fnmadd_ps(ifft82, _mm512_set1_ps(1.5625e-02f), ifft74);
__m512 ifft176 = _mm512_fnmadd_ps(ifft166, _mm512_set1_ps(1.5625e-02f), ifft158);
__m512 dat27 = ifft85;
__m512 dat31 = ifft169;
__m512 dat28 = ifft87;
__m512 dat32 = ifft171;
__m512 dat29 = ifft89;
__m512 dat33 = ifft173;
__m512 dat30 = ifft91;
__m512 dat34 = ifft175;
(void)ifft86;
(void)ifft170;
(void)ifft88;
(void)ifft172;
(void)ifft90;
(void)ifft174;
(void)ifft92;
(void)ifft176;
__m512i pm1 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack1 = _mm512_permutex2var_ps(dat27, pm1, dat31);
__m512i pm2 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack2 = _mm512_permutex2var_ps(dat27, pm2, dat31);
__m512 pack3 = _mm512_permutex2var_ps(dat28, pm1, dat32);
__m512 pack4 = _mm512_permutex2var_ps(dat28, pm2, dat32);
__m512 pack5 = _mm512_permutex2var_ps(dat29, pm1, dat33);
__m512 pack6 = _mm512_permutex2var_ps(dat29, pm2, dat33);
__m512 pack7 = _mm512_permutex2var_ps(dat30, pm1, dat34);
__m512 pack8 = _mm512_permutex2var_ps(dat30, pm2, dat34);
__mmask16 mask3 = _mm512_cmp_ps_mask(pack1, _mm512_setzero_ps(), _CMP_LT_OQ);
pack1 = _mm512_mask_mul_ps(pack1, mask3, pack1, _mm512_set1_ps(8.125e-01f));
pack1 = _mm512_add_ps(pack1, _mm512_maskz_loadu_ps(1023, datPtr3+0+19344*i10+832*k8+416*r2+52*toH1+4*toW1+40*t2));
pack1 = _mm512_fmadd_ps(pack1, bnMul2, bnAdd2);
__mmask16 mask4 = _mm512_cmp_ps_mask(pack2, _mm512_setzero_ps(), _CMP_LT_OQ);
pack2 = _mm512_mask_mul_ps(pack2, mask4, pack2, _mm512_set1_ps(8.125e-01f));
pack2 = _mm512_add_ps(pack2, _mm512_maskz_loadu_ps(1023, datPtr3+208+19344*i10+832*k8+416*r2+52*toH1+4*toW1+40*t2));
pack2 = _mm512_fmadd_ps(pack2, bnMul3, bnAdd3);
__mmask16 mask5 = _mm512_cmp_ps_mask(pack3, _mm512_setzero_ps(), _CMP_LT_OQ);
pack3 = _mm512_mask_mul_ps(pack3, mask5, pack3, _mm512_set1_ps(8.125e-01f));
pack3 = _mm512_add_ps(pack3, _mm512_maskz_loadu_ps(1023, datPtr3+52+19344*i10+832*k8+416*r2+52*toH1+4*toW1+40*t2));
pack3 = _mm512_fmadd_ps(pack3, bnMul2, bnAdd2);
__mmask16 mask6 = _mm512_cmp_ps_mask(pack4, _mm512_setzero_ps(), _CMP_LT_OQ);
pack4 = _mm512_mask_mul_ps(pack4, mask6, pack4, _mm512_set1_ps(8.125e-01f));
pack4 = _mm512_add_ps(pack4, _mm512_maskz_loadu_ps(1023, datPtr3+260+19344*i10+832*k8+416*r2+52*toH1+4*toW1+40*t2));
pack4 = _mm512_fmadd_ps(pack4, bnMul3, bnAdd3);
__mmask16 mask7 = _mm512_cmp_ps_mask(pack5, _mm512_setzero_ps(), _CMP_LT_OQ);
pack5 = _mm512_mask_mul_ps(pack5, mask7, pack5, _mm512_set1_ps(8.125e-01f));
pack5 = _mm512_add_ps(pack5, _mm512_maskz_loadu_ps(1023, datPtr3+104+19344*i10+832*k8+416*r2+52*toH1+4*toW1+40*t2));
pack5 = _mm512_fmadd_ps(pack5, bnMul2, bnAdd2);
__mmask16 mask8 = _mm512_cmp_ps_mask(pack6, _mm512_setzero_ps(), _CMP_LT_OQ);
pack6 = _mm512_mask_mul_ps(pack6, mask8, pack6, _mm512_set1_ps(8.125e-01f));
pack6 = _mm512_add_ps(pack6, _mm512_maskz_loadu_ps(1023, datPtr3+312+19344*i10+832*k8+416*r2+52*toH1+4*toW1+40*t2));
pack6 = _mm512_fmadd_ps(pack6, bnMul3, bnAdd3);
__mmask16 mask9 = _mm512_cmp_ps_mask(pack7, _mm512_setzero_ps(), _CMP_LT_OQ);
pack7 = _mm512_mask_mul_ps(pack7, mask9, pack7, _mm512_set1_ps(8.125e-01f));
pack7 = _mm512_add_ps(pack7, _mm512_maskz_loadu_ps(1023, datPtr3+156+19344*i10+832*k8+416*r2+52*toH1+4*toW1+40*t2));
pack7 = _mm512_fmadd_ps(pack7, bnMul2, bnAdd2);
__mmask16 mask10 = _mm512_cmp_ps_mask(pack8, _mm512_setzero_ps(), _CMP_LT_OQ);
pack8 = _mm512_mask_mul_ps(pack8, mask10, pack8, _mm512_set1_ps(8.125e-01f));
pack8 = _mm512_add_ps(pack8, _mm512_maskz_loadu_ps(1023, datPtr3+364+19344*i10+832*k8+416*r2+52*toH1+4*toW1+40*t2));
pack8 = _mm512_fmadd_ps(pack8, bnMul3, bnAdd3);
_mm512_mask_storeu_ps(datPtr4+0+19344*i10+832*k8+416*r2+52*toH1+4*toW1+40*t2, 1023, pack1);
_mm512_mask_storeu_ps(datPtr4+208+19344*i10+832*k8+416*r2+52*toH1+4*toW1+40*t2, 1023, pack2);
_mm512_mask_storeu_ps(datPtr4+52+19344*i10+832*k8+416*r2+52*toH1+4*toW1+40*t2, 1023, pack3);
_mm512_mask_storeu_ps(datPtr4+260+19344*i10+832*k8+416*r2+52*toH1+4*toW1+40*t2, 1023, pack4);
_mm512_mask_storeu_ps(datPtr4+104+19344*i10+832*k8+416*r2+52*toH1+4*toW1+40*t2, 1023, pack5);
_mm512_mask_storeu_ps(datPtr4+312+19344*i10+832*k8+416*r2+52*toH1+4*toW1+40*t2, 1023, pack6);
_mm512_mask_storeu_ps(datPtr4+156+19344*i10+832*k8+416*r2+52*toH1+4*toW1+40*t2, 1023, pack7);
_mm512_mask_storeu_ps(datPtr4+364+19344*i10+832*k8+416*r2+52*toH1+4*toW1+40*t2, 1023, pack8);
ptrdiff_t t3 = 0;
__m512 sfRe41 = _mm512_loadu_ps(sfPtr3+256+71424*i10+35712*j5+768*k8+384*r2+0*t3);
__m512 sfIm41 = _mm512_loadu_ps(sfPtr3+320+71424*i10+35712*j5+768*k8+384*r2+0*t3);
__m512 sfRe42 = _mm512_loadu_ps(sfPtr3+18112+71424*i10+35712*j5+768*k8+384*r2+0*t3);
__m512 sfIm42 = _mm512_loadu_ps(sfPtr3+18176+71424*i10+35712*j5+768*k8+384*r2+0*t3);
__m512 sfRe43 = _mm512_loadu_ps(sfPtr3+35968+71424*i10+35712*j5+768*k8+384*r2+0*t3);
__m512 sfIm43 = _mm512_loadu_ps(sfPtr3+36032+71424*i10+35712*j5+768*k8+384*r2+0*t3);
__m512 sfRe44 = _mm512_loadu_ps(sfPtr3+53824+71424*i10+35712*j5+768*k8+384*r2+0*t3);
__m512 sfIm44 = _mm512_loadu_ps(sfPtr3+53888+71424*i10+35712*j5+768*k8+384*r2+0*t3);
__m512i ifft177 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft178 = _mm512_permutexvar_ps(ifft177, sfRe41);
__m512i ifft179 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft180 = _mm512_permutexvar_ps(ifft179, sfRe41);
__m512 ifft181 = _mm512_permutexvar_ps(ifft177, sfIm41);
__m512 ifft182 = _mm512_permutexvar_ps(ifft179, sfIm41);
__m512 ifft183 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft184 = _mm512_mask_fmadd_ps(ifft182, 65021, ifft183, ifft178);
__m512 ifft185 = _mm512_mask_fnmadd_ps(ifft181, 65021, ifft183, ifft180);
__m512 ifft186 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft187 = _mm512_fmadd_ps(ifft184, ifft186, _mm512_shuffle_ps(ifft184, ifft184, 177));
__m512 ifft188 = _mm512_fmadd_ps(ifft185, ifft186, _mm512_shuffle_ps(ifft185, ifft185, 177));
__m512 ifft189 = _mm512_fmadd_ps(sfRe42, ifft186, _mm512_shuffle_ps(sfRe42, sfRe42, 177));
__m512 ifft190 = _mm512_fmadd_ps(sfIm42, ifft186, _mm512_shuffle_ps(sfIm42, sfIm42, 177));
__m512 ifft191 = _mm512_fmadd_ps(sfRe43, ifft186, _mm512_shuffle_ps(sfRe43, sfRe43, 177));
__m512 ifft192 = _mm512_fmadd_ps(sfIm43, ifft186, _mm512_shuffle_ps(sfIm43, sfIm43, 177));
__m512 ifft193 = _mm512_fmadd_ps(sfRe44, ifft186, _mm512_shuffle_ps(sfRe44, sfRe44, 177));
__m512 ifft194 = _mm512_fmadd_ps(sfIm44, ifft186, _mm512_shuffle_ps(sfIm44, sfIm44, 177));
__m512 ifft195 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft196 = _mm512_mul_ps(ifft187, ifft195);
__m512 ifft197 = _mm512_mul_ps(ifft188, ifft195);
__m512 ifft198 = _mm512_mul_ps(ifft189, ifft195);
__m512 ifft199 = _mm512_mul_ps(ifft190, ifft195);
__m512 ifft200 = _mm512_mul_ps(ifft191, ifft195);
__m512 ifft201 = _mm512_mul_ps(ifft192, ifft195);
__m512 ifft202 = _mm512_mul_ps(ifft193, ifft195);
__m512 ifft203 = _mm512_mul_ps(ifft194, ifft195);
__m512 ifft204 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft205 = _mm512_fnmadd_ps(ifft188, ifft204, ifft196);
__m512 ifft206 = _mm512_fmadd_ps(ifft187, ifft204, ifft197);
__m512 ifft207 = _mm512_fnmadd_ps(ifft190, ifft204, ifft198);
__m512 ifft208 = _mm512_fmadd_ps(ifft189, ifft204, ifft199);
__m512 ifft209 = _mm512_fnmadd_ps(ifft192, ifft204, ifft200);
__m512 ifft210 = _mm512_fmadd_ps(ifft191, ifft204, ifft201);
__m512 ifft211 = _mm512_fnmadd_ps(ifft194, ifft204, ifft202);
__m512 ifft212 = _mm512_fmadd_ps(ifft193, ifft204, ifft203);
__m512 ifft213 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft214 = _mm512_fmadd_ps(ifft205, ifft213, _mm512_shuffle_ps(ifft205, ifft205, 78));
__m512 ifft215 = _mm512_fmadd_ps(ifft206, ifft213, _mm512_shuffle_ps(ifft206, ifft206, 78));
__m512 ifft216 = _mm512_fmadd_ps(ifft207, ifft213, _mm512_shuffle_ps(ifft207, ifft207, 78));
__m512 ifft217 = _mm512_fmadd_ps(ifft208, ifft213, _mm512_shuffle_ps(ifft208, ifft208, 78));
__m512 ifft218 = _mm512_fmadd_ps(ifft209, ifft213, _mm512_shuffle_ps(ifft209, ifft209, 78));
__m512 ifft219 = _mm512_fmadd_ps(ifft210, ifft213, _mm512_shuffle_ps(ifft210, ifft210, 78));
__m512 ifft220 = _mm512_fmadd_ps(ifft211, ifft213, _mm512_shuffle_ps(ifft211, ifft211, 78));
__m512 ifft221 = _mm512_fmadd_ps(ifft212, ifft213, _mm512_shuffle_ps(ifft212, ifft212, 78));
__m512 ifft222 = _mm512_mask_sub_ps(ifft214, 49344, _mm512_setzero_ps(), ifft215);
__m512 ifft223 = _mm512_mask_mov_ps(ifft215, 49344, ifft214);
__m512 ifft224 = _mm512_mask_sub_ps(ifft216, 49344, _mm512_setzero_ps(), ifft217);
__m512 ifft225 = _mm512_mask_mov_ps(ifft217, 49344, ifft216);
__m512 ifft226 = _mm512_mask_sub_ps(ifft218, 49344, _mm512_setzero_ps(), ifft219);
__m512 ifft227 = _mm512_mask_mov_ps(ifft219, 49344, ifft218);
__m512 ifft228 = _mm512_mask_sub_ps(ifft220, 49344, _mm512_setzero_ps(), ifft221);
__m512 ifft229 = _mm512_mask_mov_ps(ifft221, 49344, ifft220);
__m512 ifft230 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft231 = _mm512_fmadd_ps(ifft222, ifft230, _mm512_shuffle_f32x4(ifft222, ifft222, 177));
__m512 ifft232 = _mm512_fmadd_ps(ifft223, ifft230, _mm512_shuffle_f32x4(ifft223, ifft223, 177));
__m512 ifft233 = _mm512_fmadd_ps(ifft224, ifft230, _mm512_shuffle_f32x4(ifft224, ifft224, 177));
__m512 ifft234 = _mm512_fmadd_ps(ifft225, ifft230, _mm512_shuffle_f32x4(ifft225, ifft225, 177));
__m512 ifft235 = _mm512_fmadd_ps(ifft226, ifft230, _mm512_shuffle_f32x4(ifft226, ifft226, 177));
__m512 ifft236 = _mm512_fnmsub_ps(ifft227, ifft230, _mm512_shuffle_f32x4(ifft227, ifft227, 177));
__m512 ifft237 = _mm512_fmadd_ps(ifft228, ifft230, _mm512_shuffle_f32x4(ifft228, ifft228, 177));
__m512 ifft238 = _mm512_fmadd_ps(ifft229, ifft230, _mm512_shuffle_f32x4(ifft229, ifft229, 177));
__m512 ifft239 = _mm512_add_ps(ifft231, ifft232);
__m512 ifft240 = _mm512_sub_ps(ifft231, ifft232);
__m512 ifft241 = _mm512_sub_ps(ifft233, ifft237);
__m512 ifft242 = _mm512_add_ps(ifft234, ifft238);
__m512 ifft243 = _mm512_add_ps(ifft233, ifft237);
__m512 ifft244 = _mm512_sub_ps(ifft234, ifft238);
__m512 ifft245 = _mm512_mul_ps(ifft235, _mm512_set1_ps(3.125e-02f));
__m512 ifft246 = _mm512_mul_ps(ifft236, _mm512_set1_ps(3.125e-02f));
__m512 ifft247 = _mm512_fmadd_ps(ifft239, _mm512_set1_ps(1.5625e-02f), ifft245);
__m512 ifft248 = _mm512_fmsub_ps(ifft239, _mm512_set1_ps(1.5625e-02f), ifft245);
__m512 ifft249 = _mm512_fmadd_ps(ifft240, _mm512_set1_ps(1.5625e-02f), ifft246);
__m512 ifft250 = _mm512_fmsub_ps(ifft240, _mm512_set1_ps(1.5625e-02f), ifft246);
__m512 ifft251 = _mm512_add_ps(ifft241, ifft242);
__m512 ifft252 = _mm512_sub_ps(ifft241, ifft242);
__m512 ifft253 = _mm512_fnmadd_ps(ifft251, _mm512_set1_ps(7.0710677e-01f), ifft243);
__m512 ifft254 = _mm512_fmadd_ps(ifft251, _mm512_set1_ps(7.0710677e-01f), ifft243);
__m512 ifft255 = _mm512_fmadd_ps(ifft252, _mm512_set1_ps(7.0710677e-01f), ifft244);
__m512 ifft256 = _mm512_fmsub_ps(ifft252, _mm512_set1_ps(7.0710677e-01f), ifft244);
__m512 ifft257 = _mm512_add_ps(ifft253, ifft254);
__m512 ifft258 = _mm512_sub_ps(ifft253, ifft254);
__m512 ifft259 = _mm512_add_ps(ifft255, ifft256);
__m512 ifft260 = _mm512_sub_ps(ifft255, ifft256);
__m512 ifft261 = _mm512_fmadd_ps(ifft257, _mm512_set1_ps(1.5625e-02f), ifft247);
__m512 ifft262 = _mm512_fnmadd_ps(ifft257, _mm512_set1_ps(1.5625e-02f), ifft247);
__m512 ifft263 = _mm512_fmadd_ps(ifft259, _mm512_set1_ps(1.5625e-02f), ifft249);
__m512 ifft264 = _mm512_fnmadd_ps(ifft259, _mm512_set1_ps(1.5625e-02f), ifft249);
__m512 ifft265 = _mm512_fnmadd_ps(ifft260, _mm512_set1_ps(1.5625e-02f), ifft248);
__m512 ifft266 = _mm512_fmadd_ps(ifft260, _mm512_set1_ps(1.5625e-02f), ifft248);
__m512 ifft267 = _mm512_fmadd_ps(ifft258, _mm512_set1_ps(1.5625e-02f), ifft250);
__m512 ifft268 = _mm512_fnmadd_ps(ifft258, _mm512_set1_ps(1.5625e-02f), ifft250);
__m512 dat35 = ifft261;
__m512 dat36 = ifft263;
__m512 dat37 = ifft265;
__m512 dat38 = ifft267;
(void)ifft262;
(void)ifft264;
(void)ifft266;
(void)ifft268;
__mmask16 mask11 = _mm512_cmp_ps_mask(dat35, _mm512_setzero_ps(), _CMP_LT_OQ);
dat35 = _mm512_mask_mul_ps(dat35, mask11, dat35, _mm512_set1_ps(8.125e-01f));
dat35 = _mm512_add_ps(dat35, _mm512_maskz_loadu_ps(7, datPtr3+40+19344*i10+832*k8+416*r2+52*toH1+4*toW1+0*t3));
dat35 = _mm512_add_ps(dat35, _mm512_maskz_loadu_ps(1792, datPtr3+216+19344*i10+832*k8+416*r2+52*toH1+4*toW1+0*t3));
dat35 = _mm512_mask_fmadd_ps(dat35, 7, bnMul2, bnAdd2);
dat35 = _mm512_mask_fmadd_ps(dat35, 1792, bnMul3, bnAdd3);
__mmask16 mask12 = _mm512_cmp_ps_mask(dat36, _mm512_setzero_ps(), _CMP_LT_OQ);
dat36 = _mm512_mask_mul_ps(dat36, mask12, dat36, _mm512_set1_ps(8.125e-01f));
dat36 = _mm512_add_ps(dat36, _mm512_maskz_loadu_ps(7, datPtr3+92+19344*i10+832*k8+416*r2+52*toH1+4*toW1+0*t3));
dat36 = _mm512_add_ps(dat36, _mm512_maskz_loadu_ps(1792, datPtr3+268+19344*i10+832*k8+416*r2+52*toH1+4*toW1+0*t3));
dat36 = _mm512_mask_fmadd_ps(dat36, 7, bnMul2, bnAdd2);
dat36 = _mm512_mask_fmadd_ps(dat36, 1792, bnMul3, bnAdd3);
__mmask16 mask13 = _mm512_cmp_ps_mask(dat37, _mm512_setzero_ps(), _CMP_LT_OQ);
dat37 = _mm512_mask_mul_ps(dat37, mask13, dat37, _mm512_set1_ps(8.125e-01f));
dat37 = _mm512_add_ps(dat37, _mm512_maskz_loadu_ps(7, datPtr3+144+19344*i10+832*k8+416*r2+52*toH1+4*toW1+0*t3));
dat37 = _mm512_add_ps(dat37, _mm512_maskz_loadu_ps(1792, datPtr3+320+19344*i10+832*k8+416*r2+52*toH1+4*toW1+0*t3));
dat37 = _mm512_mask_fmadd_ps(dat37, 7, bnMul2, bnAdd2);
dat37 = _mm512_mask_fmadd_ps(dat37, 1792, bnMul3, bnAdd3);
__mmask16 mask14 = _mm512_cmp_ps_mask(dat38, _mm512_setzero_ps(), _CMP_LT_OQ);
dat38 = _mm512_mask_mul_ps(dat38, mask14, dat38, _mm512_set1_ps(8.125e-01f));
dat38 = _mm512_add_ps(dat38, _mm512_maskz_loadu_ps(7, datPtr3+196+19344*i10+832*k8+416*r2+52*toH1+4*toW1+0*t3));
dat38 = _mm512_add_ps(dat38, _mm512_maskz_loadu_ps(1792, datPtr3+372+19344*i10+832*k8+416*r2+52*toH1+4*toW1+0*t3));
dat38 = _mm512_mask_fmadd_ps(dat38, 7, bnMul2, bnAdd2);
dat38 = _mm512_mask_fmadd_ps(dat38, 1792, bnMul3, bnAdd3);
_mm512_mask_storeu_ps(datPtr4+40+19344*i10+832*k8+416*r2+52*toH1+4*toW1+0*t3, 7, dat35);
_mm512_mask_storeu_ps(datPtr4+216+19344*i10+832*k8+416*r2+52*toH1+4*toW1+0*t3, 1792, dat35);
_mm512_mask_storeu_ps(datPtr4+92+19344*i10+832*k8+416*r2+52*toH1+4*toW1+0*t3, 7, dat36);
_mm512_mask_storeu_ps(datPtr4+268+19344*i10+832*k8+416*r2+52*toH1+4*toW1+0*t3, 1792, dat36);
_mm512_mask_storeu_ps(datPtr4+144+19344*i10+832*k8+416*r2+52*toH1+4*toW1+0*t3, 7, dat37);
_mm512_mask_storeu_ps(datPtr4+320+19344*i10+832*k8+416*r2+52*toH1+4*toW1+0*t3, 1792, dat37);
_mm512_mask_storeu_ps(datPtr4+196+19344*i10+832*k8+416*r2+52*toH1+4*toW1+0*t3, 7, dat38);
_mm512_mask_storeu_ps(datPtr4+372+19344*i10+832*k8+416*r2+52*toH1+4*toW1+0*t3, 1792, dat38);
}
}
ptrdiff_t r3 = 0;
__m512 bnMul4 = _mm512_set1_ps(((float*)bnPtr4+(ptrdiff_t)2*(0+93*i10+4*k8+2*r3))[0]);
__m512 bnAdd4 = _mm512_set1_ps(((float*)bnPtr4+(ptrdiff_t)2*(0+93*i10+4*k8+2*r3))[1]);
ptrdiff_t t4 = 0;
__m512 sfRe45 = _mm512_loadu_ps(sfPtr3+0+71424*i10+35712*j5+768*k8+384*r3+128*t4);
__m512 sfIm45 = _mm512_loadu_ps(sfPtr3+64+71424*i10+35712*j5+768*k8+384*r3+128*t4);
__m512 sfRe49 = _mm512_loadu_ps(sfPtr3+128+71424*i10+35712*j5+768*k8+384*r3+128*t4);
__m512 sfIm49 = _mm512_shuffle_f32x4(sfRe49, sfRe49, 78);
__m512 sfRe46 = _mm512_loadu_ps(sfPtr3+17856+71424*i10+35712*j5+768*k8+384*r3+128*t4);
__m512 sfIm46 = _mm512_loadu_ps(sfPtr3+17920+71424*i10+35712*j5+768*k8+384*r3+128*t4);
__m512 sfRe50 = _mm512_loadu_ps(sfPtr3+17984+71424*i10+35712*j5+768*k8+384*r3+128*t4);
__m512 sfIm50 = _mm512_shuffle_f32x4(sfRe50, sfRe50, 78);
__m512 sfRe47 = _mm512_loadu_ps(sfPtr3+35712+71424*i10+35712*j5+768*k8+384*r3+128*t4);
__m512 sfIm47 = _mm512_loadu_ps(sfPtr3+35776+71424*i10+35712*j5+768*k8+384*r3+128*t4);
__m512 sfRe51 = _mm512_loadu_ps(sfPtr3+35840+71424*i10+35712*j5+768*k8+384*r3+128*t4);
__m512 sfIm51 = _mm512_shuffle_f32x4(sfRe51, sfRe51, 78);
__m512 sfRe48 = _mm512_loadu_ps(sfPtr3+53568+71424*i10+35712*j5+768*k8+384*r3+128*t4);
__m512 sfIm48 = _mm512_loadu_ps(sfPtr3+53632+71424*i10+35712*j5+768*k8+384*r3+128*t4);
__m512 sfRe52 = _mm512_loadu_ps(sfPtr3+53696+71424*i10+35712*j5+768*k8+384*r3+128*t4);
__m512 sfIm52 = _mm512_shuffle_f32x4(sfRe52, sfRe52, 78);
__m512i ifft269 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft270 = _mm512_permutexvar_ps(ifft269, sfRe45);
__m512 ifft361 = _mm512_permutexvar_ps(ifft269, sfRe49);
__m512i ifft271 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft272 = _mm512_permutexvar_ps(ifft271, sfRe45);
__m512 ifft362 = _mm512_permutexvar_ps(ifft271, sfRe49);
__m512 ifft273 = _mm512_permutexvar_ps(ifft269, sfIm45);
__m512 ifft363 = _mm512_permutexvar_ps(ifft269, sfIm49);
__m512 ifft274 = _mm512_permutexvar_ps(ifft271, sfIm45);
__m512 ifft364 = _mm512_permutexvar_ps(ifft271, sfIm49);
__m512 ifft275 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft276 = _mm512_mask_fmadd_ps(ifft274, 65021, ifft275, ifft270);
__m512 ifft365 = _mm512_mask_fmadd_ps(ifft364, 65021, ifft275, ifft361);
__m512 ifft277 = _mm512_mask_fnmadd_ps(ifft273, 65021, ifft275, ifft272);
__m512 ifft366 = _mm512_mask_fnmadd_ps(ifft363, 65021, ifft275, ifft362);
__m512 ifft278 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft279 = _mm512_fmadd_ps(ifft276, ifft278, _mm512_shuffle_ps(ifft276, ifft276, 177));
__m512 ifft367 = _mm512_fmadd_ps(ifft365, ifft278, _mm512_shuffle_ps(ifft365, ifft365, 177));
__m512 ifft280 = _mm512_fmadd_ps(ifft277, ifft278, _mm512_shuffle_ps(ifft277, ifft277, 177));
__m512 ifft368 = _mm512_fmadd_ps(ifft366, ifft278, _mm512_shuffle_ps(ifft366, ifft366, 177));
__m512 ifft281 = _mm512_fmadd_ps(sfRe46, ifft278, _mm512_shuffle_ps(sfRe46, sfRe46, 177));
__m512 ifft369 = _mm512_fmadd_ps(sfRe50, ifft278, _mm512_shuffle_ps(sfRe50, sfRe50, 177));
__m512 ifft282 = _mm512_fmadd_ps(sfIm46, ifft278, _mm512_shuffle_ps(sfIm46, sfIm46, 177));
__m512 ifft370 = _mm512_fmadd_ps(sfIm50, ifft278, _mm512_shuffle_ps(sfIm50, sfIm50, 177));
__m512 ifft283 = _mm512_fmadd_ps(sfRe47, ifft278, _mm512_shuffle_ps(sfRe47, sfRe47, 177));
__m512 ifft371 = _mm512_fmadd_ps(sfRe51, ifft278, _mm512_shuffle_ps(sfRe51, sfRe51, 177));
__m512 ifft284 = _mm512_fmadd_ps(sfIm47, ifft278, _mm512_shuffle_ps(sfIm47, sfIm47, 177));
__m512 ifft372 = _mm512_fmadd_ps(sfIm51, ifft278, _mm512_shuffle_ps(sfIm51, sfIm51, 177));
__m512 ifft285 = _mm512_fmadd_ps(sfRe48, ifft278, _mm512_shuffle_ps(sfRe48, sfRe48, 177));
__m512 ifft373 = _mm512_fmadd_ps(sfRe52, ifft278, _mm512_shuffle_ps(sfRe52, sfRe52, 177));
__m512 ifft286 = _mm512_fmadd_ps(sfIm48, ifft278, _mm512_shuffle_ps(sfIm48, sfIm48, 177));
__m512 ifft374 = _mm512_fmadd_ps(sfIm52, ifft278, _mm512_shuffle_ps(sfIm52, sfIm52, 177));
__m512 ifft287 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft288 = _mm512_mul_ps(ifft279, ifft287);
__m512 ifft375 = _mm512_mul_ps(ifft367, ifft287);
__m512 ifft289 = _mm512_mul_ps(ifft280, ifft287);
__m512 ifft376 = _mm512_mul_ps(ifft368, ifft287);
__m512 ifft290 = _mm512_mul_ps(ifft281, ifft287);
__m512 ifft377 = _mm512_mul_ps(ifft369, ifft287);
__m512 ifft291 = _mm512_mul_ps(ifft282, ifft287);
__m512 ifft378 = _mm512_mul_ps(ifft370, ifft287);
__m512 ifft292 = _mm512_mul_ps(ifft283, ifft287);
__m512 ifft379 = _mm512_mul_ps(ifft371, ifft287);
__m512 ifft293 = _mm512_mul_ps(ifft284, ifft287);
__m512 ifft380 = _mm512_mul_ps(ifft372, ifft287);
__m512 ifft294 = _mm512_mul_ps(ifft285, ifft287);
__m512 ifft381 = _mm512_mul_ps(ifft373, ifft287);
__m512 ifft295 = _mm512_mul_ps(ifft286, ifft287);
__m512 ifft382 = _mm512_mul_ps(ifft374, ifft287);
__m512 ifft296 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft297 = _mm512_fnmadd_ps(ifft280, ifft296, ifft288);
__m512 ifft383 = _mm512_fnmadd_ps(ifft368, ifft296, ifft375);
__m512 ifft298 = _mm512_fmadd_ps(ifft279, ifft296, ifft289);
__m512 ifft384 = _mm512_fmadd_ps(ifft367, ifft296, ifft376);
__m512 ifft299 = _mm512_fnmadd_ps(ifft282, ifft296, ifft290);
__m512 ifft385 = _mm512_fnmadd_ps(ifft370, ifft296, ifft377);
__m512 ifft300 = _mm512_fmadd_ps(ifft281, ifft296, ifft291);
__m512 ifft386 = _mm512_fmadd_ps(ifft369, ifft296, ifft378);
__m512 ifft301 = _mm512_fnmadd_ps(ifft284, ifft296, ifft292);
__m512 ifft387 = _mm512_fnmadd_ps(ifft372, ifft296, ifft379);
__m512 ifft302 = _mm512_fmadd_ps(ifft283, ifft296, ifft293);
__m512 ifft388 = _mm512_fmadd_ps(ifft371, ifft296, ifft380);
__m512 ifft303 = _mm512_fnmadd_ps(ifft286, ifft296, ifft294);
__m512 ifft389 = _mm512_fnmadd_ps(ifft374, ifft296, ifft381);
__m512 ifft304 = _mm512_fmadd_ps(ifft285, ifft296, ifft295);
__m512 ifft390 = _mm512_fmadd_ps(ifft373, ifft296, ifft382);
__m512 ifft305 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft306 = _mm512_fmadd_ps(ifft297, ifft305, _mm512_shuffle_ps(ifft297, ifft297, 78));
__m512 ifft391 = _mm512_fmadd_ps(ifft383, ifft305, _mm512_shuffle_ps(ifft383, ifft383, 78));
__m512 ifft307 = _mm512_fmadd_ps(ifft298, ifft305, _mm512_shuffle_ps(ifft298, ifft298, 78));
__m512 ifft392 = _mm512_fmadd_ps(ifft384, ifft305, _mm512_shuffle_ps(ifft384, ifft384, 78));
__m512 ifft308 = _mm512_fmadd_ps(ifft299, ifft305, _mm512_shuffle_ps(ifft299, ifft299, 78));
__m512 ifft393 = _mm512_fmadd_ps(ifft385, ifft305, _mm512_shuffle_ps(ifft385, ifft385, 78));
__m512 ifft309 = _mm512_fmadd_ps(ifft300, ifft305, _mm512_shuffle_ps(ifft300, ifft300, 78));
__m512 ifft394 = _mm512_fmadd_ps(ifft386, ifft305, _mm512_shuffle_ps(ifft386, ifft386, 78));
__m512 ifft310 = _mm512_fmadd_ps(ifft301, ifft305, _mm512_shuffle_ps(ifft301, ifft301, 78));
__m512 ifft395 = _mm512_fmadd_ps(ifft387, ifft305, _mm512_shuffle_ps(ifft387, ifft387, 78));
__m512 ifft311 = _mm512_fmadd_ps(ifft302, ifft305, _mm512_shuffle_ps(ifft302, ifft302, 78));
__m512 ifft396 = _mm512_fmadd_ps(ifft388, ifft305, _mm512_shuffle_ps(ifft388, ifft388, 78));
__m512 ifft312 = _mm512_fmadd_ps(ifft303, ifft305, _mm512_shuffle_ps(ifft303, ifft303, 78));
__m512 ifft397 = _mm512_fmadd_ps(ifft389, ifft305, _mm512_shuffle_ps(ifft389, ifft389, 78));
__m512 ifft313 = _mm512_fmadd_ps(ifft304, ifft305, _mm512_shuffle_ps(ifft304, ifft304, 78));
__m512 ifft398 = _mm512_fmadd_ps(ifft390, ifft305, _mm512_shuffle_ps(ifft390, ifft390, 78));
__m512 ifft314 = _mm512_mask_sub_ps(ifft306, 49344, _mm512_setzero_ps(), ifft307);
__m512 ifft399 = _mm512_mask_sub_ps(ifft391, 49344, _mm512_setzero_ps(), ifft392);
__m512 ifft315 = _mm512_mask_mov_ps(ifft307, 49344, ifft306);
__m512 ifft400 = _mm512_mask_mov_ps(ifft392, 49344, ifft391);
__m512 ifft316 = _mm512_mask_sub_ps(ifft308, 49344, _mm512_setzero_ps(), ifft309);
__m512 ifft401 = _mm512_mask_sub_ps(ifft393, 49344, _mm512_setzero_ps(), ifft394);
__m512 ifft317 = _mm512_mask_mov_ps(ifft309, 49344, ifft308);
__m512 ifft402 = _mm512_mask_mov_ps(ifft394, 49344, ifft393);
__m512 ifft318 = _mm512_mask_sub_ps(ifft310, 49344, _mm512_setzero_ps(), ifft311);
__m512 ifft403 = _mm512_mask_sub_ps(ifft395, 49344, _mm512_setzero_ps(), ifft396);
__m512 ifft319 = _mm512_mask_mov_ps(ifft311, 49344, ifft310);
__m512 ifft404 = _mm512_mask_mov_ps(ifft396, 49344, ifft395);
__m512 ifft320 = _mm512_mask_sub_ps(ifft312, 49344, _mm512_setzero_ps(), ifft313);
__m512 ifft405 = _mm512_mask_sub_ps(ifft397, 49344, _mm512_setzero_ps(), ifft398);
__m512 ifft321 = _mm512_mask_mov_ps(ifft313, 49344, ifft312);
__m512 ifft406 = _mm512_mask_mov_ps(ifft398, 49344, ifft397);
__m512 ifft322 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft323 = _mm512_fmadd_ps(ifft314, ifft322, _mm512_shuffle_f32x4(ifft314, ifft314, 177));
__m512 ifft407 = _mm512_fmadd_ps(ifft399, ifft322, _mm512_shuffle_f32x4(ifft399, ifft399, 177));
__m512 ifft324 = _mm512_fmadd_ps(ifft315, ifft322, _mm512_shuffle_f32x4(ifft315, ifft315, 177));
__m512 ifft408 = _mm512_fmadd_ps(ifft400, ifft322, _mm512_shuffle_f32x4(ifft400, ifft400, 177));
__m512 ifft325 = _mm512_fmadd_ps(ifft316, ifft322, _mm512_shuffle_f32x4(ifft316, ifft316, 177));
__m512 ifft409 = _mm512_fmadd_ps(ifft401, ifft322, _mm512_shuffle_f32x4(ifft401, ifft401, 177));
__m512 ifft326 = _mm512_fmadd_ps(ifft317, ifft322, _mm512_shuffle_f32x4(ifft317, ifft317, 177));
__m512 ifft410 = _mm512_fmadd_ps(ifft402, ifft322, _mm512_shuffle_f32x4(ifft402, ifft402, 177));
__m512 ifft327 = _mm512_fmadd_ps(ifft318, ifft322, _mm512_shuffle_f32x4(ifft318, ifft318, 177));
__m512 ifft411 = _mm512_fmadd_ps(ifft403, ifft322, _mm512_shuffle_f32x4(ifft403, ifft403, 177));
__m512 ifft328 = _mm512_fnmsub_ps(ifft319, ifft322, _mm512_shuffle_f32x4(ifft319, ifft319, 177));
__m512 ifft412 = _mm512_fnmsub_ps(ifft404, ifft322, _mm512_shuffle_f32x4(ifft404, ifft404, 177));
__m512 ifft329 = _mm512_fmadd_ps(ifft320, ifft322, _mm512_shuffle_f32x4(ifft320, ifft320, 177));
__m512 ifft413 = _mm512_fmadd_ps(ifft405, ifft322, _mm512_shuffle_f32x4(ifft405, ifft405, 177));
__m512 ifft330 = _mm512_fmadd_ps(ifft321, ifft322, _mm512_shuffle_f32x4(ifft321, ifft321, 177));
__m512 ifft414 = _mm512_fmadd_ps(ifft406, ifft322, _mm512_shuffle_f32x4(ifft406, ifft406, 177));
__m512 ifft331 = _mm512_add_ps(ifft323, ifft324);
__m512 ifft415 = _mm512_add_ps(ifft407, ifft408);
__m512 ifft332 = _mm512_sub_ps(ifft323, ifft324);
__m512 ifft416 = _mm512_sub_ps(ifft407, ifft408);
__m512 ifft333 = _mm512_sub_ps(ifft325, ifft329);
__m512 ifft417 = _mm512_sub_ps(ifft409, ifft413);
__m512 ifft334 = _mm512_add_ps(ifft326, ifft330);
__m512 ifft418 = _mm512_add_ps(ifft410, ifft414);
__m512 ifft335 = _mm512_add_ps(ifft325, ifft329);
__m512 ifft419 = _mm512_add_ps(ifft409, ifft413);
__m512 ifft336 = _mm512_sub_ps(ifft326, ifft330);
__m512 ifft420 = _mm512_sub_ps(ifft410, ifft414);
__m512 ifft337 = _mm512_mul_ps(ifft327, _mm512_set1_ps(3.125e-02f));
__m512 ifft421 = _mm512_mul_ps(ifft411, _mm512_set1_ps(3.125e-02f));
__m512 ifft338 = _mm512_mul_ps(ifft328, _mm512_set1_ps(3.125e-02f));
__m512 ifft422 = _mm512_mul_ps(ifft412, _mm512_set1_ps(3.125e-02f));
__m512 ifft339 = _mm512_fmadd_ps(ifft331, _mm512_set1_ps(1.5625e-02f), ifft337);
__m512 ifft423 = _mm512_fmadd_ps(ifft415, _mm512_set1_ps(1.5625e-02f), ifft421);
__m512 ifft340 = _mm512_fmsub_ps(ifft331, _mm512_set1_ps(1.5625e-02f), ifft337);
__m512 ifft424 = _mm512_fmsub_ps(ifft415, _mm512_set1_ps(1.5625e-02f), ifft421);
__m512 ifft341 = _mm512_fmadd_ps(ifft332, _mm512_set1_ps(1.5625e-02f), ifft338);
__m512 ifft425 = _mm512_fmadd_ps(ifft416, _mm512_set1_ps(1.5625e-02f), ifft422);
__m512 ifft342 = _mm512_fmsub_ps(ifft332, _mm512_set1_ps(1.5625e-02f), ifft338);
__m512 ifft426 = _mm512_fmsub_ps(ifft416, _mm512_set1_ps(1.5625e-02f), ifft422);
__m512 ifft343 = _mm512_add_ps(ifft333, ifft334);
__m512 ifft427 = _mm512_add_ps(ifft417, ifft418);
__m512 ifft344 = _mm512_sub_ps(ifft333, ifft334);
__m512 ifft428 = _mm512_sub_ps(ifft417, ifft418);
__m512 ifft345 = _mm512_fnmadd_ps(ifft343, _mm512_set1_ps(7.0710677e-01f), ifft335);
__m512 ifft429 = _mm512_fnmadd_ps(ifft427, _mm512_set1_ps(7.0710677e-01f), ifft419);
__m512 ifft346 = _mm512_fmadd_ps(ifft343, _mm512_set1_ps(7.0710677e-01f), ifft335);
__m512 ifft430 = _mm512_fmadd_ps(ifft427, _mm512_set1_ps(7.0710677e-01f), ifft419);
__m512 ifft347 = _mm512_fmadd_ps(ifft344, _mm512_set1_ps(7.0710677e-01f), ifft336);
__m512 ifft431 = _mm512_fmadd_ps(ifft428, _mm512_set1_ps(7.0710677e-01f), ifft420);
__m512 ifft348 = _mm512_fmsub_ps(ifft344, _mm512_set1_ps(7.0710677e-01f), ifft336);
__m512 ifft432 = _mm512_fmsub_ps(ifft428, _mm512_set1_ps(7.0710677e-01f), ifft420);
__m512 ifft349 = _mm512_add_ps(ifft345, ifft346);
__m512 ifft433 = _mm512_add_ps(ifft429, ifft430);
__m512 ifft350 = _mm512_sub_ps(ifft345, ifft346);
__m512 ifft434 = _mm512_sub_ps(ifft429, ifft430);
__m512 ifft351 = _mm512_add_ps(ifft347, ifft348);
__m512 ifft435 = _mm512_add_ps(ifft431, ifft432);
__m512 ifft352 = _mm512_sub_ps(ifft347, ifft348);
__m512 ifft436 = _mm512_sub_ps(ifft431, ifft432);
__m512 ifft353 = _mm512_fmadd_ps(ifft349, _mm512_set1_ps(1.5625e-02f), ifft339);
__m512 ifft437 = _mm512_fmadd_ps(ifft433, _mm512_set1_ps(1.5625e-02f), ifft423);
__m512 ifft354 = _mm512_fnmadd_ps(ifft349, _mm512_set1_ps(1.5625e-02f), ifft339);
__m512 ifft438 = _mm512_fnmadd_ps(ifft433, _mm512_set1_ps(1.5625e-02f), ifft423);
__m512 ifft355 = _mm512_fmadd_ps(ifft351, _mm512_set1_ps(1.5625e-02f), ifft341);
__m512 ifft439 = _mm512_fmadd_ps(ifft435, _mm512_set1_ps(1.5625e-02f), ifft425);
__m512 ifft356 = _mm512_fnmadd_ps(ifft351, _mm512_set1_ps(1.5625e-02f), ifft341);
__m512 ifft440 = _mm512_fnmadd_ps(ifft435, _mm512_set1_ps(1.5625e-02f), ifft425);
__m512 ifft357 = _mm512_fnmadd_ps(ifft352, _mm512_set1_ps(1.5625e-02f), ifft340);
__m512 ifft441 = _mm512_fnmadd_ps(ifft436, _mm512_set1_ps(1.5625e-02f), ifft424);
__m512 ifft358 = _mm512_fmadd_ps(ifft352, _mm512_set1_ps(1.5625e-02f), ifft340);
__m512 ifft442 = _mm512_fmadd_ps(ifft436, _mm512_set1_ps(1.5625e-02f), ifft424);
__m512 ifft359 = _mm512_fmadd_ps(ifft350, _mm512_set1_ps(1.5625e-02f), ifft342);
__m512 ifft443 = _mm512_fmadd_ps(ifft434, _mm512_set1_ps(1.5625e-02f), ifft426);
__m512 ifft360 = _mm512_fnmadd_ps(ifft350, _mm512_set1_ps(1.5625e-02f), ifft342);
__m512 ifft444 = _mm512_fnmadd_ps(ifft434, _mm512_set1_ps(1.5625e-02f), ifft426);
__m512 dat39 = ifft353;
__m512 dat43 = ifft437;
__m512 dat40 = ifft355;
__m512 dat44 = ifft439;
__m512 dat41 = ifft357;
__m512 dat45 = ifft441;
__m512 dat42 = ifft359;
__m512 dat46 = ifft443;
(void)ifft354;
(void)ifft438;
(void)ifft356;
(void)ifft440;
(void)ifft358;
(void)ifft442;
(void)ifft360;
(void)ifft444;
__m512i pm3 = _mm512_set_epi32(24, 20, 19, 18, 17, 16, 12, 11, 10, 9, 8, 4, 3, 2, 1, 0);
__m512 pack9 = _mm512_permutex2var_ps(dat39, pm3, dat43);
__m512 pack10 = _mm512_permutex2var_ps(dat40, pm3, dat44);
__m512 pack11 = _mm512_permutex2var_ps(dat41, pm3, dat45);
__m512 pack12 = _mm512_permutex2var_ps(dat42, pm3, dat46);
__mmask16 mask15 = _mm512_cmp_ps_mask(pack9, _mm512_setzero_ps(), _CMP_LT_OQ);
pack9 = _mm512_mask_mul_ps(pack9, mask15, pack9, _mm512_set1_ps(8.125e-01f));
pack9 = _mm512_add_ps(pack9, _mm512_maskz_loadu_ps(8191, datPtr3+0+19344*i10+832*k8+416*r3+52*toH1+4*toW1+60*t4));
pack9 = _mm512_fmadd_ps(pack9, bnMul4, bnAdd4);
__mmask16 mask16 = _mm512_cmp_ps_mask(pack10, _mm512_setzero_ps(), _CMP_LT_OQ);
pack10 = _mm512_mask_mul_ps(pack10, mask16, pack10, _mm512_set1_ps(8.125e-01f));
pack10 = _mm512_add_ps(pack10, _mm512_maskz_loadu_ps(8191, datPtr3+52+19344*i10+832*k8+416*r3+52*toH1+4*toW1+60*t4));
pack10 = _mm512_fmadd_ps(pack10, bnMul4, bnAdd4);
__mmask16 mask17 = _mm512_cmp_ps_mask(pack11, _mm512_setzero_ps(), _CMP_LT_OQ);
pack11 = _mm512_mask_mul_ps(pack11, mask17, pack11, _mm512_set1_ps(8.125e-01f));
pack11 = _mm512_add_ps(pack11, _mm512_maskz_loadu_ps(8191, datPtr3+104+19344*i10+832*k8+416*r3+52*toH1+4*toW1+60*t4));
pack11 = _mm512_fmadd_ps(pack11, bnMul4, bnAdd4);
__mmask16 mask18 = _mm512_cmp_ps_mask(pack12, _mm512_setzero_ps(), _CMP_LT_OQ);
pack12 = _mm512_mask_mul_ps(pack12, mask18, pack12, _mm512_set1_ps(8.125e-01f));
pack12 = _mm512_add_ps(pack12, _mm512_maskz_loadu_ps(8191, datPtr3+156+19344*i10+832*k8+416*r3+52*toH1+4*toW1+60*t4));
pack12 = _mm512_fmadd_ps(pack12, bnMul4, bnAdd4);
_mm512_mask_storeu_ps(datPtr4+0+19344*i10+832*k8+416*r3+52*toH1+4*toW1+60*t4, 8191, pack9);
_mm512_mask_storeu_ps(datPtr4+52+19344*i10+832*k8+416*r3+52*toH1+4*toW1+60*t4, 8191, pack10);
_mm512_mask_storeu_ps(datPtr4+104+19344*i10+832*k8+416*r3+52*toH1+4*toW1+60*t4, 8191, pack11);
_mm512_mask_storeu_ps(datPtr4+156+19344*i10+832*k8+416*r3+52*toH1+4*toW1+60*t4, 8191, pack12);
++j5;
}

// Consume the strided-convolution sums (final BN/ReLU/add epilogue).
//
// Wraps the callee in a single degenerate 1x1x1 task so the work is
// dispatched through the threader like every other stage.
static void Example29StriderConsumeSums1(Example29ThreaderTeam1* team17, char** tensors7) {
    Example29ThreaderTask1 task;
    task.callee1 = Example29StriderConsumeSums1Callee1;
    task.any1 = tensors7;
    task.nd1 = 3;
    for (int d = 0; d < 3; ++d) {
        task.hull1[d] = 1;
    }
    Example29ThreaderDo1(team17, &task);
}

// Persistent model state shared by all engines created from this net:
// one heap allocation holding the simplified batch-norm parameters and
// the pre-arranged convolution filters.
struct Example29Net {
char* alloc1; // raw malloc'd buffer (owned; released by Example29NetDestroy)
char* align1; // alloc1 rounded up to a 64-byte boundary for aligned AVX-512 access
};

// Release a net created by Example29NetCreate: frees the net's single
// backing allocation, then the net struct itself.
//
// Robustness: accept a null pointer as a harmless no-op (mirrors the
// free(NULL) contract), instead of dereferencing it.
void Example29NetDestroy(Example29Net* net2) {
    if (!net2) {
        return;
    }
    free(net2->alloc1);
    free(net2);
}

// Build a new Example29Net from user-supplied parameters.
//
// Simplifies the four batch-norm parameter sets and pre-arranges the
// convolution filters using a temporary thread team. The bn1/bn4 results
// and the arranged filters persist inside the net's aligned buffer; the
// bn2/bn3 results are consumed by Example29StriderArrangeFilts1 below and
// live only in a temporary buffer.
//
// On success stores the new net through net1 and returns 0. On failure
// returns a heap-allocated error message (ownership passes to the caller,
// who frees it) and releases everything allocated here.
char* Example29NetCreate(
Example29Net** net1,
Example29Params* params1,
ptrdiff_t threads1
) {
// The generated kernels are AVX-512F; refuse to run elsewhere.
if (__builtin_expect(!__builtin_cpu_supports("avx512f"), 0)) {
return Example29Errmsg1(__LINE__, "CPU does not support AVX512F");
}
// Persistent store: simplified bn1 params at +0, bn4 at +448, arranged
// conv filters at +1216 (offsets are from the 64-byte-aligned base, so
// the allocation carries slack for the round-up below).
char* alloc3 = malloc(2552447);
if (__builtin_expect(!alloc3, 0)) {
return Example29Errmsg1(__LINE__, "errno %d", errno);
}
char* align3 = (void*)(((size_t)alloc3+63)&-64); // round up to 64-byte boundary
// Scratch for bn2/bn3 simplification; only needed while arranging filters.
char* tmpAlloc1 = malloc(1255);
if (__builtin_expect(!tmpAlloc1, 0)) {
char* msg6 = Example29Errmsg1(__LINE__, "errno %d", errno);
free(alloc3);
return msg6;
}
char* tmpAlign1 = (void*)(((size_t)tmpAlloc1+63)&-64); // 64-byte align
// Temporary team: filter arrangement is parallelized even at create time.
Example29ThreaderTeam1* team12 = 0;
char* err8 = Example29ThreaderCreate1(&team12, threads1);
if (__builtin_expect(!!err8, 0)) {
free(tmpAlloc1);
free(alloc3);
return err8;
}
{
// bn1 is applied while arranging the inputs at inference time (the net's
// +0 slot is passed to Example29StriderArrangeDats1); keep it in the net.
Example29BnSimplify1(
params1->bn1Means,
params1->bn1Variances,
params1->bn1Scales,
params1->bn1Shifts,
align3+0
);
// bn4 is applied to the final output at inference time; also persistent.
Example29BnSimplify2(
params1->bn4Means,
params1->bn4Variances,
params1->bn4Scales,
params1->bn4Shifts,
align3+448
);
// bn2/bn3 are handed to the filter arrangement below; scratch only.
Example29BnSimplify1(
params1->bn2Means,
params1->bn2Variances,
params1->bn2Scales,
params1->bn2Shifts,
tmpAlign1+0
);
Example29BnSimplify2(
params1->bn3Means,
params1->bn3Variances,
params1->bn3Scales,
params1->bn3Shifts,
tmpAlign1+448
);
char* tensors12[] = {
(char*)params1->convWeights,
(char*)params1->convBiases,
tmpAlign1+0,
tmpAlign1+448,
align3+1216
};
Example29StriderArrangeFilts1(team12, tensors12);
}
// Arrangement done: the team and the bn2/bn3 scratch are no longer needed.
Example29ThreaderDestroy1(team12);
free(tmpAlloc1);
Example29Net* net5 = malloc(sizeof(Example29Net));
if (__builtin_expect(!net5, 0)) {
char* msg7 = Example29Errmsg1(__LINE__, "errno %d", errno);
free(alloc3);
return msg7;
}
net5->alloc1 = alloc3;
net5->align1 = align3;
*net1 = net5;
return 0;
}

// One inference execution context: a thread team plus per-engine scratch
// memory, referencing a shared net that must outlive the engine (the net
// is not freed by Example29EngineDestroy).
struct Example29Engine {
Example29Net* net3; // borrowed; owned by the caller of Example29EngineCreate
Example29ThreaderTeam1* team11; // owned; destroyed in Example29EngineDestroy
char* alloc2; // raw scratch allocation (owned)
char* align2; // alloc2 rounded up to a 64-byte boundary
};

// Report the pthread_t of worker thread idx2 in this engine's team.
//
// Returns 0 on success, otherwise a heap-allocated error message that the
// caller must free.
char* Example29EnginePthreadT(
Example29Engine* eng2,
ptrdiff_t idx2,
pthread_t* to1
) {
    Example29ThreaderTeam1* team = eng2->team11;
    return Example29ThreaderPthreadT1(to1, team, idx2);
}

// Release an engine created by Example29EngineCreate: tears down the
// thread team and frees the scratch buffer and the engine struct. The
// referenced net is NOT freed (it is owned by the caller).
//
// Robustness: accept a null pointer as a harmless no-op (mirrors the
// free(NULL) contract), instead of dereferencing it.
void Example29EngineDestroy(Example29Engine* eng3) {
    if (!eng3) {
        return;
    }
    Example29ThreaderDestroy1(eng3->team11);
    free(eng3->alloc2);
    free(eng3);
}

// Create an inference engine bound to an existing net.
//
// Allocates the engine struct and its 64-byte-aligned scratch buffer,
// then spins up the thread team. On success stores the engine through
// eng4 and returns 0; on failure returns a heap-allocated error message
// (caller frees) and releases everything allocated here. net4 is
// borrowed, not owned.
char* Example29EngineCreate(
Example29Engine** eng4,
Example29Net* net4,
ptrdiff_t threads2
) {
    Example29Engine* eng = malloc(sizeof *eng);
    if (__builtin_expect(!eng, 0)) {
        return Example29Errmsg1(__LINE__, "errno %d", errno);
    }
    // Scratch for intermediate tensors produced during inference.
    char* scratch = malloc(288575);
    if (__builtin_expect(!scratch, 0)) {
        char* msg = Example29Errmsg1(__LINE__, "errno %d", errno);
        free(eng);
        return msg;
    }
    eng->alloc2 = scratch;
    eng->align2 = (void*)(((size_t)scratch+63)&-64); // 64-byte align
    char* err = Example29ThreaderCreate1(&eng->team11, threads2);
    if (__builtin_expect(!!err, 0)) {
        free(eng);
        free(scratch);
        return err;
    }
    eng->net3 = net4;
    *eng4 = eng;
    return 0;
}

// Run one inference pass: consumes in1/in2/in3 and writes the bn4 output
// tensor. The three strider stages communicate through the engine's
// scratch buffer; the net's aligned buffer supplies the simplified BN
// parameters and the arranged filters.
void Example29EngineInference(
Example29Engine* eng1,
float* bn4Data,
float* in1Data,
float* in2Data,
float* in3Data
) {
    char* weights = eng1->net3->align1;
    char* scratch = eng1->align2;
    Example29ThreaderTeam1* team = eng1->team11;
    {
        // Stage 1: arrange in1/in2 (with the bn1 params at weights+0)
        // into scratch+0.
        char* arrangeDats[] = {
            (char*)in1Data,
            weights,
            (char*)in2Data,
            scratch
        };
        Example29StriderArrangeDats1(team, arrangeDats);
        // Stage 2: convolve the arranged data against the arranged
        // filters (weights+1216), producing sums at scratch+217088.
        char* produceSums[] = {
            weights+1216,
            scratch,
            scratch+217088
        };
        Example29StriderProduceSums1(team, produceSums);
        // Stage 3: consume the sums together with in3 and the bn4 params
        // (weights+448), writing the final output into bn4Data.
        char* consumeSums[] = {
            scratch+217088,
            (char*)in3Data,
            weights+448,
            (char*)bn4Data
        };
        Example29StriderConsumeSums1(team, consumeSums);
    }
}

// End of file.

Top