NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=Example13 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=in Channels=12294 Height=38 Width=23
Conv FromTensor=in ToTensor=out ToChannels=7662 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=6
Output FromTensor=out

Top || Output Example13.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(Example13Params);
// Example13Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "rb");
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

typedef struct Example13Params Example13Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// Example13Params* params = malloc(sizeof(Example13Params));
//
// ... Load params (read from a file, perhaps) ...
//
// Example13Net* net; // For example, 4 threads:
// char* err = Example13NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// Example13NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

typedef struct Example13Net Example13Net;

// Creates a Net from the given parameter struct. Per the usage notes in
// this header: a nonzero return value is a heap-allocated error string
// that the caller must free, and on failure the Net pointer passed as
// the first argument is left unmodified. The params struct is not
// modified and is no longer needed after the call. `threads` is the
// number of temporary threads used to create the Net (these threads are
// not used for inference).
char* Example13NetCreate(
Example13Net**,
Example13Params*,
ptrdiff_t threads
);

// Destroys the Net to free its memory. Per the notes in this header,
// call it once all inference that depends on the Net is complete.
void Example13NetDestroy(Example13Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// Example13Net* net;
//
// ... Create net ...
//
// Example13Engine* engine; // For example, 4 inference threads:
// char* err = Example13EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// Example13EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = Example13EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* inData = malloc(sizeof(float)*12294*38*23);
// float* outData = malloc(sizeof(float)*7662*36*21);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// Example13EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// inData, // The tensor arguments are sorted by name.
// outData
// );
//
// ... Read the output floats ...
//
// }
//
// free(inData);
// free(outData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

typedef struct Example13Engine Example13Engine;

// Creates an Engine (inference threads, scratch memory, and a pointer to
// the Net). Per the usage notes in this header: a nonzero return value is
// a heap-allocated error string that the caller must free, and on failure
// the Engine pointer passed as the first argument is left unmodified.
// `threads` is the number of inference threads.
char* Example13EngineCreate(
Example13Engine**,
Example13Net*,
ptrdiff_t threads
);

// Writes to `to` the pthread_t identifier of the Engine thread with index
// `threadIdx` (threads are indexed 0, 1, ..., N-1). Per the notes in this
// header, the call can only fail if the thread index is invalid; a nonzero
// return value is an error string.
char* Example13EnginePthreadT(
Example13Engine*,
ptrdiff_t threadIdx,
pthread_t* to
);

// Runs inference: reads floats from inData and writes floats to outData.
// Both tensors are owned by the caller and are fully packed CHW 32-bit
// floats; the example in this header sizes them as 12294*38*23 floats for
// inData and 7662*36*21 floats for outData. Per the notes in this header,
// this function cannot fail.
void Example13EngineInference(
Example13Engine*,
float* inData,
float* outData
);

// Destroys the Engine: per the usage example in this header, this
// terminates the Engine's threads and frees its memory. The Net is
// destroyed separately.
void Example13EngineDestroy(Example13Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

// Trained parameters for the single Conv layer of the input graph. The
// struct is declared packed, so the two arrays are laid out back to back.
struct Example13Params {
// One bias per output channel (ToChannels=7662 in the input graph).
float outBiases[7662]; // 1x7662x1x1
// 7662*2049*3*3 = 141294942 weights. The 2049 matches the graph's
// 12294 input channels divided by Groups=6, with a 3x3 filter.
float outWeights[141294942]; // 7662x2049x3x3
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output Example13.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f Example13.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "Example13.h"

// Formats an error message into a freshly malloc'd buffer and returns it.
// The text is "Example13: line <lineNum1>: " followed by the printf-style
// expansion of format1 and its variadic arguments. The formatted tail is
// truncated by vsnprintf so that the whole message fits in the buffer.
// The result of malloc is not checked here.
static char* Example13Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
    enum { capacity = 277 };
    char* text = malloc(capacity);
    // Write the fixed prefix first; its length tells us where the tail goes.
    int prefixLen = sprintf(text, "Example13: line %td: ", lineNum1);
    va_list args;
    va_start(args, format1);
    vsnprintf(text+prefixLen, capacity-prefixLen, format1, args);
    va_end(args);
    return text;
}

// Forward declarations for the thread-team ("threader") types used below.
// Task: a description of one batch of parallel work.
typedef struct Example13ThreaderTask1 Example13ThreaderTask1;
// Callee: the function run for each unit of work. It receives the task and
// a pointer to an array of int64_t coordinates identifying the unit.
typedef void (*Example13ThreaderCallee1)(Example13ThreaderTask1*, int64_t*);
// Hub: state shared by the whole team (one per team).
typedef struct Example13ThreaderHub1 Example13ThreaderHub1;
// Node: per-thread state (one per thread).
typedef struct Example13ThreaderNode1 Example13ThreaderNode1;
// Unwind: bookkeeping of what has been set up, used during teardown.
typedef struct Example13ThreaderUnwind1 Example13ThreaderUnwind1;
// Team: the top-level handle tying together the hub and the nodes.
typedef struct Example13ThreaderTeam1 Example13ThreaderTeam1;

// One batch of parallel work: callee1 is invoked once for each point of a
// grid with nd1 dimensions whose extents are given by hull1.
struct Example13ThreaderTask1 {
// Function invoked for each grid point (receives this task and the point).
Example13ThreaderCallee1 callee1;
// Caller-supplied pointer; the visible threader code only passes it along
// inside the task and never dereferences it itself.
void* any1;
// Number of dimensions in use (the arrays below hold at most 4).
ptrdiff_t nd1;
// Extent of each dimension; coordinate i wraps back to 0 when it reaches
// hull1[i]. Dimension 0 is the one that advances first.
int64_t hull1[4];
};

// Team-wide shared state. In the visible code every field other than the
// mutex and condition variable is read and written while mut1 is held.
struct Example13ThreaderHub1 {
// Guards the fields below.
pthread_mutex_t mut1;
// Signalled by the worker that brings pending1 down to zero.
pthread_cond_t cond1;
// Number of workers that have not yet finished the current task.
ptrdiff_t pending1;
// Search cursor into status1: index of the word to examine next ...
ptrdiff_t offset1;
// ... and the mask of bits within that word still to be considered.
long mask1;
// Bitmap with one bit per node; all bits are set when a task is
// dispatched and a node's bit is cleared once its work is used up.
// Sized at allocation time to hold nt/(bits per long)+1 words, so the
// bit at index nt also exists; the search loop uses it as an end marker.
long status1[];
};

// Per-thread state. The struct is aligned to 64 bytes.
struct Example13ThreaderNode1 {
// Guards np1, pt1 and task1 in the visible code.
pthread_mutex_t mut2;
// Number of grid points still assigned to this node; a negative value
// (set during teardown) tells the thread to exit.
int64_t np1;
// Coordinates of the next grid point this node will hand out.
int64_t pt1[4];
// The task to run, or 0 when the thread has nothing to pick up.
Example13ThreaderTask1* task1;
// Signalled after task1 has been set, to wake the thread.
pthread_cond_t cond2;
// Back-pointer to the owning team.
Example13ThreaderTeam1* team1;
// Identifier of the thread that runs this node.
pthread_t thr1;
} __attribute__((aligned(64)));

// Records how far team construction got, so that teardown releases only
// what was actually set up. The team is allocated with calloc, so every
// field starts at zero.
struct Example13ThreaderUnwind1 {
// Number of node threads that were created and must be joined.
ptrdiff_t join1;
// Number of node condition variables that were initialized.
ptrdiff_t nodeConds1;
// Number of node mutexes that were initialized.
ptrdiff_t nodeMuts1;
// Nonzero once the hub condition variable has been initialized.
ptrdiff_t hubCond1;
// Nonzero once the hub mutex has been initialized.
ptrdiff_t hubMut1;
// Unaligned pointer returned by malloc for the node array (passed to free).
void* nodes1;
// Unaligned pointer returned by malloc for the hub (passed to free).
void* hub1;
};

// A team of worker threads plus the state they share.
struct Example13ThreaderTeam1 {
// Number of threads (and nodes) in the team.
ptrdiff_t nt1;
// 64-byte-aligned pointer to the shared hub.
Example13ThreaderHub1* hub2;
// 64-byte-aligned pointer to the array of nt1 nodes.
Example13ThreaderNode1* nodes2;
// Teardown bookkeeping filled in during construction.
Example13ThreaderUnwind1 unwind1;
};

// Advances the point pt2 by one step through a grid with nd2 dimensions
// and extents hull2. Dimension 0 moves first; a coordinate that reaches
// its extent is reset to 0 and the step carries into the next dimension.
// A carry out of the last dimension is dropped (the point wraps to zero).
static void Example13ThreaderInc1(
    ptrdiff_t nd2,
    int64_t*restrict hull2,
    int64_t*restrict pt2
) {
    ptrdiff_t axis = 0;
    while (axis < nd2) {
        int64_t next = pt2[axis]+1;
        if (next != hull2[axis]) {
            // No wrap on this axis: store and stop carrying.
            pt2[axis] = next;
            return;
        }
        pt2[axis] = 0;
        ++axis;
    }
}

// Writes into pt3 the grid coordinates of linear index val1, treating the
// extents hull3 as the radices of a mixed-radix number (dimension 0 is the
// least significant digit). Dimensions left over once the value is used up
// are set to 0; any value remaining after the last dimension is discarded.
static void Example13ThreaderPut1(
    ptrdiff_t nd3,
    int64_t*restrict hull3,
    int64_t*restrict pt3,
    int64_t val1
) {
    ptrdiff_t axis = 0;
    while (axis < nd3 && val1 != 0) {
        int64_t radix = hull3[axis];
        pt3[axis] = val1%radix;
        val1 /= radix;
        ++axis;
    }
    // Zero-fill the dimensions the value did not reach.
    while (axis < nd3) {
        pt3[axis] = 0;
        ++axis;
    }
}

// Adds the offset plus1 (and an initial carry2) to the point pt4, one
// dimension at a time, within a grid of nd4 dimensions and extents hull4.
// When a coordinate reaches or exceeds its extent, the extent is
// subtracted once and a carry of 1 goes into the next dimension. The carry
// out of the last dimension is dropped.
static void Example13ThreaderAdd1(
    ptrdiff_t nd4,
    int64_t*restrict hull4,
    int64_t*restrict pt4,
    int64_t*restrict plus1,
    int64_t carry2
) {
    for (ptrdiff_t axis = 0; axis < nd4; ++axis) {
        int64_t total = pt4[axis]+plus1[axis]+carry2;
        int64_t limit = hull4[axis];
        carry2 = total >= limit;
        pt4[axis] = carry2 ? total-limit : total;
    }
}

// Start routine of every worker thread; arg1 is the thread's own node.
//
// The thread sleeps on its node until a task is posted, runs the grid
// points assigned to its own node, then looks through the hub's status
// bitmap for other nodes that may still have points left and runs those
// too. When it finds no more, it decrements the hub's pending count
// (signalling the hub condition variable if it reached zero) and goes back
// to waiting. It returns 0 when it finds a task posted with a negative
// point count, which is how teardown asks the thread to exit.
//
// Every pthread call is wrapped in a loop that repeats the call until it
// returns 0.
static void* Example13ThreaderMain1(void* arg1) {
Example13ThreaderNode1* node1 = arg1;
Example13ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
Example13ThreaderHub1* hub3 = team2->hub2;
Example13ThreaderNode1* nodes3 = team2->nodes2;
// Index of this node within the team's node array.
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
// The node mutex is held at the top of every iteration of this loop.
for (; ; ) {
Example13ThreaderTask1* task2 = node1->task1;
if (!task2) {
// Nothing posted yet: sleep until signalled, then re-check.
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
// Exit request: release the node mutex and end the thread.
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
// Consume the posted task so the next pass waits for a new one.
node1->task1 = 0;
Example13ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Run this node's own points. For each one: copy the current point,
// advance the node's state, and drop the mutex while the callee runs.
// np1 is re-read after relocking because another thread may also be
// taking points from this node.
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
Example13ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// Own points are used up: clear this node's bit in the status bitmap.
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
// Resume the search from the cursor shared through the hub.
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
// Look for another node whose bit is still set and help with its points.
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
// No candidate left in this word: move to the next word, all bits.
++offset2;
mask2 = -1;
continue;
}
// Lowest set bit gives the candidate node index.
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
// Index nt2 is one past the last node: the end of the bitmap was
// reached. Restart from the beginning once; if the end is reached
// again without finding a node, stop searching.
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
// Isolate the candidate's bit and store a cursor that excludes it,
// so the next search resumes past this node.
hand1 &= -hand1;
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
Example13ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
// Same take-a-point loop as above, applied to the other node.
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
Example13ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// That node has no points left: clear its bit, then reload the shared
// cursor (other threads may have moved it in the meantime).
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// This worker is done with the task; the one that brings the count to
// zero signals the hub condition variable.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
// Re-acquire the node mutex before looping back to wait for a task.
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Tears down a team, releasing only what the unwind record says was set
// up. A null team is ignored. Steps, in order: ask every created thread to
// exit and join it, destroy the node condition variables, destroy the node
// mutexes, destroy the hub condition variable and mutex if they were
// initialized, then free the node array, the hub and the team itself.
// Every pthread call is repeated until it returns 0.
static void Example13ThreaderDestroy1(Example13ThreaderTeam1* team3) {
    if (!team3) return;
    Example13ThreaderNode1* nodes = team3->nodes2;

    // Post an exit request (negative point count plus a non-null task
    // pointer) to every thread that was actually started, and wake it.
    ptrdiff_t started = team3->unwind1.join1;
    for (ptrdiff_t i = 0; i != started; ++i) {
        Example13ThreaderNode1* node = nodes+i;
        while (__builtin_expect(pthread_mutex_lock(&node->mut2), 0)) {}
        node->np1 = -1;
        node->task1 = (Example13ThreaderTask1*)1;
        while (__builtin_expect(pthread_mutex_unlock(&node->mut2), 0)) {}
        while (__builtin_expect(pthread_cond_signal(&node->cond2), 0)) {}
    }
    for (ptrdiff_t i = 0; i != started; ++i) {
        while (__builtin_expect(pthread_join(nodes[i].thr1, 0), 0)) {}
    }

    ptrdiff_t conds = team3->unwind1.nodeConds1;
    for (ptrdiff_t i = 0; i != conds; ++i) {
        while (__builtin_expect(pthread_cond_destroy(&nodes[i].cond2), 0)) {}
    }
    ptrdiff_t muts = team3->unwind1.nodeMuts1;
    for (ptrdiff_t i = 0; i != muts; ++i) {
        while (__builtin_expect(pthread_mutex_destroy(&nodes[i].mut2), 0)) {}
    }

    Example13ThreaderHub1* hub = team3->hub2;
    if (team3->unwind1.hubCond1) {
        while (__builtin_expect(pthread_cond_destroy(&hub->cond1), 0)) {}
    }
    if (team3->unwind1.hubMut1) {
        while (__builtin_expect(pthread_mutex_destroy(&hub->mut1), 0)) {}
    }

    // Free the original (unaligned) allocations, then the team.
    free(team3->unwind1.nodes1);
    free(team3->unwind1.hub1);
    free(team3);
}

// Last construction stage: for each of the nt7 nodes, initializes the
// node's mutex and condition variable and starts its thread. Returns 0 on
// success. On the first failure it returns an error string carrying the
// pthread error code, after recording in team8->unwind1 how many mutexes,
// condition variables and threads exist so teardown can release exactly
// those.
static char* Example13ThreaderCreate1Up4(Example13ThreaderTeam1* team8, ptrdiff_t nt7) {
    Example13ThreaderNode1* base = team8->nodes2;
    Example13ThreaderUnwind1* undo = &team8->unwind1;
    for (ptrdiff_t i = 0; i != nt7; ++i) {
        Example13ThreaderNode1* node = base+i;
        int rc = pthread_mutex_init(&node->mut2, 0);
        if (__builtin_expect(rc, 0)) {
            char* text = Example13Errmsg1(__LINE__, "errno %d", rc);
            // Nothing belonging to node i exists yet.
            undo->nodeMuts1 = i;
            undo->nodeConds1 = i;
            undo->join1 = i;
            return text;
        }
        // No task posted: the new thread will wait on the condition variable.
        node->task1 = 0;
        rc = pthread_cond_init(&node->cond2, 0);
        if (__builtin_expect(rc, 0)) {
            char* text = Example13Errmsg1(__LINE__, "errno %d", rc);
            // Node i has a mutex but no condition variable or thread.
            undo->nodeMuts1 = i+1;
            undo->nodeConds1 = i;
            undo->join1 = i;
            return text;
        }
        node->team1 = team8;
        rc = pthread_create(&node->thr1, 0, Example13ThreaderMain1, node);
        if (__builtin_expect(rc, 0)) {
            char* text = Example13Errmsg1(__LINE__, "errno %d", rc);
            // Node i has a mutex and a condition variable but no thread.
            undo->nodeMuts1 = i+1;
            undo->nodeConds1 = i+1;
            undo->join1 = i;
            return text;
        }
    }
    undo->nodeMuts1 = nt7;
    undo->nodeConds1 = nt7;
    undo->join1 = nt7;
    return 0;
}

// Third construction stage: initializes the hub's mutex and condition
// variable, marking each in team7->unwind1 once it exists, then continues
// with the per-node stage. Returns 0 on success or an error string
// carrying the pthread error code.
static char* Example13ThreaderCreate1Up3(Example13ThreaderTeam1* team7, ptrdiff_t nt6) {
    Example13ThreaderHub1* hub = team7->hub2;
    int rc = pthread_mutex_init(&hub->mut1, 0);
    if (__builtin_expect(rc, 0)) {
        return Example13Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubMut1 = 1;
    rc = pthread_cond_init(&hub->cond1, 0);
    if (__builtin_expect(rc, 0)) {
        return Example13Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubCond1 = 1;
    return Example13ThreaderCreate1Up4(team7, nt6);
}

// Second construction stage: allocates the array of nt5 nodes. The byte
// count is checked for multiplication overflow first. The block is
// over-allocated by 63 bytes so that a 64-byte-aligned start address can be
// chosen inside it; the raw pointer is kept in the unwind record for
// free(). Returns 0 on success or an error string.
static char* Example13ThreaderCreate1Up2(Example13ThreaderTeam1* team6, ptrdiff_t nt5) {
    const size_t nodeSize = sizeof(Example13ThreaderNode1);
    size_t bytes = nt5*nodeSize;
    // Dividing back must recover the count, otherwise the product wrapped.
    if (__builtin_expect(bytes/nodeSize != (size_t)nt5, 0)) {
        return Example13Errmsg1(__LINE__, "too many threads");
    }
    void* raw = malloc(bytes+63);
    if (__builtin_expect(!raw, 0)) {
        return Example13Errmsg1(__LINE__, "errno %d", errno);
    }
    team6->unwind1.nodes1 = raw;
    // Round the address up to the next multiple of 64.
    team6->nodes2 = (void*)(((size_t)raw+63)&~(size_t)63);
    return Example13ThreaderCreate1Up3(team6, nt5);
}

// First construction stage: records the thread count and allocates the
// hub. The hub's trailing status bitmap gets nt4/(bits per long)+1 words.
// The size is rounded up to a multiple of 64 and the block is
// over-allocated by 63 bytes so that a 64-byte-aligned start address can be
// chosen inside it; the raw pointer is kept in the unwind record for
// free(). Returns 0 on success or an error string.
static char* Example13ThreaderCreate1Up1(Example13ThreaderTeam1* team5, ptrdiff_t nt4) {
    team5->nt1 = nt4;
    const size_t bitsPerWord = sizeof(long)*8;
    const size_t words = (size_t)nt4/bitsPerWord+1;
    size_t bytes = sizeof(Example13ThreaderHub1)+sizeof(long)*words;
    bytes = (bytes+63)&~(size_t)63;
    void* raw = malloc(bytes+63);
    if (__builtin_expect(!raw, 0)) {
        return Example13Errmsg1(__LINE__, "errno %d", errno);
    }
    team5->unwind1.hub1 = raw;
    // Round the address up to the next multiple of 64.
    team5->hub2 = (void*)(((size_t)raw+63)&~(size_t)63);
    return Example13ThreaderCreate1Up2(team5, nt4);
}

// Creates a team of nt3 worker threads. On success the team is stored
// through team4 and 0 is returned. On failure an error string is returned,
// everything built so far is torn down, and *team4 is left untouched.
// Thread counts below 1 are rejected.
static char* Example13ThreaderCreate1(Example13ThreaderTeam1** team4, ptrdiff_t nt3) {
    if (__builtin_expect(nt3 < 1, 0)) {
        return Example13Errmsg1(__LINE__, "too few threads");
    }
    // Zero-filled so the unwind record starts out describing "nothing built".
    Example13ThreaderTeam1* team = calloc(1, sizeof(Example13ThreaderTeam1));
    if (__builtin_expect(!team, 0)) {
        return Example13Errmsg1(__LINE__, "errno %d", errno);
    }
    char* failure = Example13ThreaderCreate1Up1(team, nt3);
    if (__builtin_expect(!!failure, 0)) {
        Example13ThreaderDestroy1(team);
        return failure;
    }
    *team4 = team;
    return failure;
}

// Copies into *thr2 the pthread_t of the team thread with index idx1.
// Returns 0 on success, or an error string when idx1 is outside the range
// [0, number of threads); *thr2 is not written in that case.
static char* Example13ThreaderPthreadT1(
    pthread_t* thr2,
    Example13ThreaderTeam1* team9,
    ptrdiff_t idx1
) {
    int inRange = idx1 >= 0 && idx1 < team9->nt1;
    if (__builtin_expect(!inRange, 0)) {
        return Example13Errmsg1(__LINE__, "bad thread idx");
    }
    Example13ThreaderNode1* node = team9->nodes2+idx1;
    *thr2 = node->thr1;
    return 0;
}

// Runs task3 on the team and returns once every worker has finished it.
// Does nothing when the task has fewer than one dimension.
//
// The total number of grid points (the product of the extents) is split as
// evenly as possible: every node gets total/threads points, and the first
// total%threads nodes get one extra. Each node is handed its point count
// and starting coordinates, then woken. The hub's search cursor and status
// bitmap are reset, the pending count is set to the number of threads, and
// the caller sleeps on the hub condition variable until that count reaches
// zero. The hub mutex is held throughout except while sleeping. Every
// pthread call is repeated until it returns 0.
static void Example13ThreaderDo1(Example13ThreaderTeam1* team10, Example13ThreaderTask1* task3) {
    const ptrdiff_t dims = task3->nd1;
    if (dims < 1) return;

    int64_t total = task3->hull1[0];
    for (ptrdiff_t d = 1; d < dims; ++d) {
        total *= task3->hull1[d];
    }
    const ptrdiff_t threads = team10->nt1;
    const int64_t share = total/threads;
    const ptrdiff_t extra = total%threads;

    // Coordinates of `share` points, used as the stride between node starts.
    int64_t stride[4];
    Example13ThreaderPut1(dims, task3->hull1, stride, share);
    int64_t start[4] = {0};

    Example13ThreaderHub1* hub = team10->hub2;
    while (__builtin_expect(pthread_mutex_lock(&hub->mut1), 0)) {}

    Example13ThreaderNode1* node = team10->nodes2;
    ptrdiff_t idx = 0;
    for (;;) {
        // The first `extra` nodes take one additional point each.
        const int64_t bonus = idx < extra;
        while (__builtin_expect(pthread_mutex_lock(&node->mut2), 0)) {}
        node->np1 = share+bonus;
        memcpy(node->pt1, start, sizeof(start));
        node->task1 = task3;
        while (__builtin_expect(pthread_mutex_unlock(&node->mut2), 0)) {}
        while (__builtin_expect(pthread_cond_signal(&node->cond2), 0)) {}
        if (++idx == threads) break;
        // Advance the start point past what this node was given.
        Example13ThreaderAdd1(dims, task3->hull1, start, stride, bonus);
        ++node;
    }

    // Reset the search cursor and set every status bit, including the
    // word that contains the bit at index `threads`.
    hub->offset1 = 0;
    hub->mask1 = -1;
    const ptrdiff_t lastWord = (size_t)threads/(sizeof(long)*8);
    for (ptrdiff_t w = 0; w <= lastWord; ++w) {
        hub->status1[w] = -1;
    }

    hub->pending1 = threads;
    while (hub->pending1) {
        while (__builtin_expect(pthread_cond_wait(&hub->cond1, &hub->mut1), 0)) {}
    }
    while (__builtin_expect(pthread_mutex_unlock(&hub->mut1), 0)) {}
}

// Per-lane approximation of e^x for 16 packed floats.
//
// The input is clamped to [-87.33654, 88.72284]. It is then scaled by
// 1.442695 (about log2(e)) and rounded to the nearest integer n. The
// remainder f = x - n*ln(2) is formed with two fused multiply-adds whose
// constants together approximate ln(2). A degree-4 polynomial in f is
// evaluated with Horner's scheme, and n is added into the exponent field of
// the result (an integer add of n shifted left by 23 bits), which
// multiplies the polynomial value by 2^n.
static __m512 Example13Exp1(__m512 x1) {
    static const float horner[] = {
        1.6800667e-01f,
        4.9999994e-01f,
        9.999569e-01f,
        9.9999964e-01f,
    };
    __m512 clamped = _mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f));
    clamped = _mm512_min_ps(clamped, _mm512_set1_ps(8.872284e+01f));
    __m512 scaled = _mm512_mul_ps(clamped, _mm512_set1_ps(1.442695e+00f));
    __m512 whole = _mm512_roundscale_ps(scaled, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
    // Remainder, subtracting n*ln(2) in a high part and a low part.
    __m512 frac = _mm512_fmadd_ps(whole, _mm512_set1_ps(-6.9314575e-01f), clamped);
    frac = _mm512_fmadd_ps(whole, _mm512_set1_ps(-1.4286068e-06f), frac);
    __m512 poly = _mm512_set1_ps(4.194439e-02f);
    for (size_t i = 0; i < sizeof(horner)/sizeof(horner[0]); ++i) {
        poly = _mm512_fmadd_ps(poly, frac, _mm512_set1_ps(horner[i]));
    }
    // Move n into the position of the float exponent field and add it in.
    __m512i exponent = _mm512_slli_epi32(_mm512_cvtps_epi32(scaled), 23);
    return _mm512_castsi512_ps(_mm512_add_epi32(exponent, _mm512_castps_si512(poly)));
}

// Per-lane approximation of 1/sqrt(x) for 16 packed floats: starts from
// the hardware estimate y (_mm512_rsqrt14_ps) and refines it with one step
// of the form (0.5*y) * (3 - y*(x*y)).
static __m512 Example13Rsqrt1(__m512 x2) {
    __m512 estimate = _mm512_rsqrt14_ps(x2);
    __m512 halfEstimate = _mm512_mul_ps(estimate, _mm512_set1_ps(5e-01f));
    __m512 xTimesEstimate = _mm512_mul_ps(x2, estimate);
    // 3 - y*(x*y), computed as a fused negated multiply-add.
    __m512 correction = _mm512_fnmadd_ps(estimate, xTimesEstimate, _mm512_set1_ps(3e+00f));
    return _mm512_mul_ps(halfEstimate, correction);
}

static void Example13ThreeArrangeFilts1Callee1(Example13ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
ptrdiff_t b2 = pt7[0];
ptrdiff_t g2 = pt7[1];
ptrdiff_t e1 = pt7[2];
if (e1 < 4) {
char*restrict bfPtr1 = tensors2[2]+30648*e1;
char*restrict wfPtr1 = tensors2[2]+153280+388371456*e1;
char*restrict wtPtr1 = tensors2[0]+14256*e1;
char*restrict biasPtr1 = tensors2[1];
ptrdiff_t i5 = 1*g2;
ptrdiff_t j1 = 1*b2;
ptrdiff_t jj1 = j1+0;
if (j1 < 319) {
for (; j1 != 319; ++j1) {
ptrdiff_t k1 = 0+1*j1;
ptrdiff_t cut1 = 0;
ptrdiff_t s1 = 0;
for (; s1 != 396; ++s1) {
__m512 wt1 = _mm512_maskz_loadu_ps(511, wtPtr1+0+94196628*i5+295056*j1+36*s1);
__m512 wt2 = _mm512_maskz_loadu_ps(511, wtPtr1+73764+94196628*i5+295056*j1+36*s1);
__m512 wt3 = _mm512_maskz_loadu_ps(511, wtPtr1+147528+94196628*i5+295056*j1+36*s1);
__m512 wt4 = _mm512_maskz_loadu_ps(511, wtPtr1+221292+94196628*i5+295056*j1+36*s1);
__m512i pm1 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm2 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp1 = _mm512_permutex2var_ps(wt1, pm1, wt3);
__m512 tmp2 = _mm512_permutex2var_ps(wt2, pm1, wt4);
__m512 tmp3 = _mm512_permutex2var_ps(wt1, pm2, wt3);
__m512 tmp4 = _mm512_permutex2var_ps(wt2, pm2, wt4);
__m512 in1 = _mm512_permutex2var_ps(tmp1, pm1, tmp2);
__m512 in2 = _mm512_permutex2var_ps(tmp1, pm2, tmp2);
__m512 in3 = _mm512_permutex2var_ps(tmp3, pm1, tmp4);
__m512 tmp17 = _mm512_fmadd_ps(in1, _mm512_set1_ps(4e+00f), in3);
__m512 tmp18 = _mm512_add_ps(in1, in3);
__m512 tmp19 = _mm512_fmadd_ps(in3, _mm512_set1_ps(4e+00f), in1);
__m512 tmp20 = _mm512_add_ps(in2, tmp18);
__m512 tmp21 = _mm512_fmadd_ps(in2, _mm512_set1_ps(2e+00f), tmp19);
tmp19 = _mm512_fnmadd_ps(in2, _mm512_set1_ps(2e+00f), tmp19);
__m512 tmp22 = _mm512_fnmadd_ps(in2, _mm512_set1_ps(2e+00f), tmp17);
tmp17 = _mm512_fmadd_ps(in2, _mm512_set1_ps(2e+00f), tmp17);
tmp18 = _mm512_sub_ps(tmp18, in2);
__m512 tmp39 = _mm512_unpacklo_ps(in1, tmp20);
__m512 tmp40 = _mm512_unpackhi_ps(in1, tmp20);
__m512 tmp41 = _mm512_unpacklo_ps(tmp18, tmp21);
__m512 tmp42 = _mm512_unpackhi_ps(tmp18, tmp21);
__m512 tmp43 = _mm512_unpacklo_ps(tmp19, tmp17);
__m512 tmp44 = _mm512_unpackhi_ps(tmp19, tmp17);
__m512 tmp45 = _mm512_unpacklo_ps(tmp22, in3);
__m512 tmp46 = _mm512_unpackhi_ps(tmp22, in3);
__m512 tmp47 = _mm512_shuffle_ps(tmp39, tmp41, 68);
__m512 tmp48 = _mm512_shuffle_ps(tmp39, tmp41, 238);
__m512 tmp49 = _mm512_shuffle_ps(tmp40, tmp42, 68);
__m512 tmp50 = _mm512_shuffle_ps(tmp40, tmp42, 238);
__m512 tmp51 = _mm512_shuffle_ps(tmp43, tmp45, 68);
__m512 tmp52 = _mm512_shuffle_ps(tmp43, tmp45, 238);
__m512 tmp53 = _mm512_shuffle_ps(tmp44, tmp46, 68);
__m512 tmp54 = _mm512_shuffle_ps(tmp44, tmp46, 238);
__m512 tmp55 = _mm512_shuffle_f32x4(tmp47, tmp51, 136);
__m512 tmp56 = _mm512_shuffle_f32x4(tmp47, tmp51, 221);
__m512 tmp57 = _mm512_shuffle_f32x4(tmp48, tmp52, 136);
__m512 tmp58 = _mm512_shuffle_f32x4(tmp48, tmp52, 221);
__m512 tmp59 = _mm512_shuffle_f32x4(tmp49, tmp53, 136);
__m512 tmp60 = _mm512_shuffle_f32x4(tmp49, tmp53, 221);
__m512 tmp61 = _mm512_shuffle_f32x4(tmp50, tmp54, 136);
__m512 tmp62 = _mm512_shuffle_f32x4(tmp50, tmp54, 221);
in1 = _mm512_shuffle_f32x4(tmp55, tmp55, 136);
__m512 tmp23 = _mm512_shuffle_f32x4(tmp55, tmp55, 221);
tmp20 = _mm512_shuffle_f32x4(tmp57, tmp57, 136);
__m512 tmp24 = _mm512_shuffle_f32x4(tmp57, tmp57, 221);
tmp18 = _mm512_shuffle_f32x4(tmp59, tmp59, 136);
__m512 tmp25 = _mm512_shuffle_f32x4(tmp59, tmp59, 221);
tmp21 = _mm512_shuffle_f32x4(tmp61, tmp61, 136);
__m512 tmp26 = _mm512_shuffle_f32x4(tmp61, tmp61, 221);
tmp19 = _mm512_shuffle_f32x4(tmp56, tmp56, 136);
tmp17 = _mm512_shuffle_f32x4(tmp58, tmp58, 136);
tmp22 = _mm512_shuffle_f32x4(tmp60, tmp60, 136);
in3 = _mm512_shuffle_f32x4(tmp62, tmp62, 136);
in1 = _mm512_shuffle_f32x4(in1, tmp21, 68);
tmp20 = _mm512_shuffle_f32x4(tmp20, tmp19, 68);
tmp18 = _mm512_shuffle_f32x4(tmp18, tmp17, 68);
tmp22 = _mm512_shuffle_f32x4(tmp22, tmp24, 68);
in3 = _mm512_shuffle_f32x4(in3, tmp25, 68);
tmp23 = _mm512_shuffle_f32x4(tmp23, tmp26, 68);
__m512 tmp27 = _mm512_fmadd_ps(in1, _mm512_set1_ps(4e+00f), tmp18);
__m512 tmp33 = _mm512_fmadd_ps(tmp22, _mm512_set1_ps(4e+00f), tmp23);
__m512 tmp28 = _mm512_add_ps(in1, tmp18);
__m512 tmp34 = _mm512_add_ps(tmp22, tmp23);
__m512 tmp29 = _mm512_fmadd_ps(tmp18, _mm512_set1_ps(4e+00f), in1);
__m512 tmp35 = _mm512_fmadd_ps(tmp23, _mm512_set1_ps(4e+00f), tmp22);
__m512 tmp30 = _mm512_add_ps(tmp20, tmp28);
__m512 tmp36 = _mm512_add_ps(in3, tmp34);
__m512 tmp31 = _mm512_fmadd_ps(tmp20, _mm512_set1_ps(2e+00f), tmp29);
__m512 tmp37 = _mm512_fmadd_ps(in3, _mm512_set1_ps(2e+00f), tmp35);
tmp29 = _mm512_fnmadd_ps(tmp20, _mm512_set1_ps(2e+00f), tmp29);
tmp35 = _mm512_fnmadd_ps(in3, _mm512_set1_ps(2e+00f), tmp35);
__m512 tmp32 = _mm512_fnmadd_ps(tmp20, _mm512_set1_ps(2e+00f), tmp27);
__m512 tmp38 = _mm512_fnmadd_ps(in3, _mm512_set1_ps(2e+00f), tmp33);
tmp27 = _mm512_fmadd_ps(tmp20, _mm512_set1_ps(2e+00f), tmp27);
tmp33 = _mm512_fmadd_ps(in3, _mm512_set1_ps(2e+00f), tmp33);
tmp28 = _mm512_sub_ps(tmp28, tmp20);
tmp34 = _mm512_sub_ps(tmp34, in3);
in1 = _mm512_mul_ps(in1, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp30 = _mm512_mul_ps(tmp30, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp28 = _mm512_mul_ps(tmp28, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp31 = _mm512_mul_ps(tmp31, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp29 = _mm512_mul_ps(tmp29, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp27 = _mm512_mul_ps(tmp27, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp32 = _mm512_mul_ps(tmp32, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp18 = _mm512_mul_ps(tmp18, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp22 = _mm512_mul_ps(tmp22, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp36 = _mm512_mul_ps(tmp36, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp34 = _mm512_mul_ps(tmp34, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp37 = _mm512_mul_ps(tmp37, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp35 = _mm512_mul_ps(tmp35, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp33 = _mm512_mul_ps(tmp33, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp38 = _mm512_mul_ps(tmp38, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp23 = _mm512_mul_ps(tmp23, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
__m512 out1 = _mm512_shuffle_f32x4(in1, tmp30, 68);
__m512 out5 = _mm512_shuffle_f32x4(in1, tmp30, 238);
__m512 out2 = _mm512_shuffle_f32x4(tmp28, tmp31, 68);
__m512 out6 = _mm512_shuffle_f32x4(tmp28, tmp31, 238);
__m512 out3 = _mm512_shuffle_f32x4(tmp29, tmp27, 68);
__m512 out7 = _mm512_shuffle_f32x4(tmp29, tmp27, 238);
__m512 out4 = _mm512_shuffle_f32x4(tmp32, tmp18, 68);
__m512 out8 = _mm512_shuffle_f32x4(tmp32, tmp18, 238);
__m512 out9 = _mm512_shuffle_f32x4(tmp22, tmp36, 68);
__m512 out13 = _mm512_shuffle_f32x4(tmp22, tmp36, 238);
__m512 out10 = _mm512_shuffle_f32x4(tmp34, tmp37, 68);
__m512 out14 = _mm512_shuffle_f32x4(tmp34, tmp37, 238);
__m512 out11 = _mm512_shuffle_f32x4(tmp35, tmp33, 68);
__m512 out15 = _mm512_shuffle_f32x4(tmp35, tmp33, 238);
__m512 out12 = _mm512_shuffle_f32x4(tmp38, tmp23, 68);
__m512 out16 = _mm512_shuffle_f32x4(tmp38, tmp23, 238);
ptrdiff_t off1 = 32*cut1;
ptrdiff_t off2 = (size_t)(cut1+1)/4*50688+(size_t)(cut1+1)%4*32;
ptrdiff_t off3 = (size_t)(cut1+2)/4*50688+(size_t)(cut1+2)%4*32;
ptrdiff_t off4 = (size_t)(cut1+3)/4*50688+(size_t)(cut1+3)%4*32;
__m512i wf1 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf2 = _mm512_castsi256_si512(_mm512_cvtps_ph(out5, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf3 = _mm512_castsi256_si512(_mm512_cvtps_ph(out9, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf4 = _mm512_castsi256_si512(_mm512_cvtps_ph(out13, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf5 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf6 = _mm512_castsi256_si512(_mm512_cvtps_ph(out6, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf7 = _mm512_castsi256_si512(_mm512_cvtps_ph(out10, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf8 = _mm512_castsi256_si512(_mm512_cvtps_ph(out14, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf9 = _mm512_castsi256_si512(_mm512_cvtps_ph(out3, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf10 = _mm512_castsi256_si512(_mm512_cvtps_ph(out7, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf11 = _mm512_castsi256_si512(_mm512_cvtps_ph(out11, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf12 = _mm512_castsi256_si512(_mm512_cvtps_ph(out15, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf13 = _mm512_castsi256_si512(_mm512_cvtps_ph(out4, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf14 = _mm512_castsi256_si512(_mm512_cvtps_ph(out8, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf15 = _mm512_castsi256_si512(_mm512_cvtps_ph(out12, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf16 = _mm512_castsi256_si512(_mm512_cvtps_ph(out16, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
_mm512_mask_storeu_epi32(wfPtr1+0+64728576*i5+50688*k1+off1+128*s1, 255, wf1);
_mm512_mask_storeu_epi32(wfPtr1+0+64728576*i5+50688*k1+off2+128*s1, 255, wf2);
_mm512_mask_storeu_epi32(wfPtr1+0+64728576*i5+50688*k1+off3+128*s1, 255, wf3);
_mm512_mask_storeu_epi32(wfPtr1+0+64728576*i5+50688*k1+off4+128*s1, 255, wf4);
_mm512_mask_storeu_epi32(wfPtr1+16182144+64728576*i5+50688*k1+off1+128*s1, 255, wf5);
_mm512_mask_storeu_epi32(wfPtr1+16182144+64728576*i5+50688*k1+off2+128*s1, 255, wf6);
_mm512_mask_storeu_epi32(wfPtr1+16182144+64728576*i5+50688*k1+off3+128*s1, 255, wf7);
_mm512_mask_storeu_epi32(wfPtr1+16182144+64728576*i5+50688*k1+off4+128*s1, 255, wf8);
_mm512_mask_storeu_epi32(wfPtr1+32364288+64728576*i5+50688*k1+off1+128*s1, 255, wf9);
_mm512_mask_storeu_epi32(wfPtr1+32364288+64728576*i5+50688*k1+off2+128*s1, 255, wf10);
_mm512_mask_storeu_epi32(wfPtr1+32364288+64728576*i5+50688*k1+off3+128*s1, 255, wf11);
_mm512_mask_storeu_epi32(wfPtr1+32364288+64728576*i5+50688*k1+off4+128*s1, 255, wf12);
_mm512_mask_storeu_epi32(wfPtr1+48546432+64728576*i5+50688*k1+off1+128*s1, 255, wf13);
_mm512_mask_storeu_epi32(wfPtr1+48546432+64728576*i5+50688*k1+off2+128*s1, 255, wf14);
_mm512_mask_storeu_epi32(wfPtr1+48546432+64728576*i5+50688*k1+off3+128*s1, 255, wf15);
_mm512_mask_storeu_epi32(wfPtr1+48546432+64728576*i5+50688*k1+off4+128*s1, 255, wf16);
}
__m512 bias1 = _mm512_setzero_ps();
if (!e1) {
bias1 = _mm512_maskz_loadu_ps(15, biasPtr1-0+5108*i5+16*j1);
}
_mm512_mask_storeu_ps(bfPtr1-0+5108*i5+16*j1, 15, bias1);
if (j1 >= jj1) return;
}
}
if (j1 == 319) {
ptrdiff_t k2 = 0+1*j1;
ptrdiff_t cut2 = 0;
ptrdiff_t s2 = 0;
for (; s2 != 99; ++s2) {
__m512 wt5 = _mm512_maskz_loadu_ps(511, wtPtr1+0+94196628*i5+295056*j1+144*s2);
__m512 wt6 = _mm512_maskz_loadu_ps(511, wtPtr1+36+94196628*i5+295056*j1+144*s2);
__m512 wt7 = _mm512_maskz_loadu_ps(511, wtPtr1+72+94196628*i5+295056*j1+144*s2);
__m512 wt8 = _mm512_maskz_loadu_ps(511, wtPtr1+108+94196628*i5+295056*j1+144*s2);
__m512i pm3 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm4 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp5 = _mm512_permutex2var_ps(wt5, pm3, wt7);
__m512 tmp6 = _mm512_permutex2var_ps(wt6, pm3, wt8);
__m512 tmp7 = _mm512_permutex2var_ps(wt5, pm4, wt7);
__m512 tmp8 = _mm512_permutex2var_ps(wt6, pm4, wt8);
__m512 in4 = _mm512_permutex2var_ps(tmp5, pm3, tmp6);
__m512 in5 = _mm512_permutex2var_ps(tmp5, pm4, tmp6);
__m512 in6 = _mm512_permutex2var_ps(tmp7, pm3, tmp8);
__m512 tmp63 = _mm512_fmadd_ps(in4, _mm512_set1_ps(4e+00f), in6);
__m512 tmp64 = _mm512_add_ps(in4, in6);
__m512 tmp65 = _mm512_fmadd_ps(in6, _mm512_set1_ps(4e+00f), in4);
__m512 tmp66 = _mm512_add_ps(in5, tmp64);
__m512 tmp67 = _mm512_fmadd_ps(in5, _mm512_set1_ps(2e+00f), tmp65);
tmp65 = _mm512_fnmadd_ps(in5, _mm512_set1_ps(2e+00f), tmp65);
__m512 tmp68 = _mm512_fnmadd_ps(in5, _mm512_set1_ps(2e+00f), tmp63);
tmp63 = _mm512_fmadd_ps(in5, _mm512_set1_ps(2e+00f), tmp63);
tmp64 = _mm512_sub_ps(tmp64, in5);
__m512 tmp85 = _mm512_unpacklo_ps(in4, tmp66);
__m512 tmp86 = _mm512_unpackhi_ps(in4, tmp66);
__m512 tmp87 = _mm512_unpacklo_ps(tmp64, tmp67);
__m512 tmp88 = _mm512_unpackhi_ps(tmp64, tmp67);
__m512 tmp89 = _mm512_unpacklo_ps(tmp65, tmp63);
__m512 tmp90 = _mm512_unpackhi_ps(tmp65, tmp63);
__m512 tmp91 = _mm512_unpacklo_ps(tmp68, in6);
__m512 tmp92 = _mm512_unpackhi_ps(tmp68, in6);
__m512 tmp93 = _mm512_shuffle_ps(tmp85, tmp87, 68);
__m512 tmp94 = _mm512_shuffle_ps(tmp85, tmp87, 238);
__m512 tmp95 = _mm512_shuffle_ps(tmp86, tmp88, 68);
__m512 tmp96 = _mm512_shuffle_ps(tmp86, tmp88, 238);
__m512 tmp97 = _mm512_shuffle_ps(tmp89, tmp91, 68);
__m512 tmp98 = _mm512_shuffle_ps(tmp89, tmp91, 238);
__m512 tmp99 = _mm512_shuffle_ps(tmp90, tmp92, 68);
__m512 tmp100 = _mm512_shuffle_ps(tmp90, tmp92, 238);
__m512 tmp101 = _mm512_shuffle_f32x4(tmp93, tmp97, 136);
__m512 tmp102 = _mm512_shuffle_f32x4(tmp93, tmp97, 221);
__m512 tmp103 = _mm512_shuffle_f32x4(tmp94, tmp98, 136);
__m512 tmp104 = _mm512_shuffle_f32x4(tmp94, tmp98, 221);
__m512 tmp105 = _mm512_shuffle_f32x4(tmp95, tmp99, 136);
__m512 tmp106 = _mm512_shuffle_f32x4(tmp95, tmp99, 221);
__m512 tmp107 = _mm512_shuffle_f32x4(tmp96, tmp100, 136);
__m512 tmp108 = _mm512_shuffle_f32x4(tmp96, tmp100, 221);
in4 = _mm512_shuffle_f32x4(tmp101, tmp101, 136);
__m512 tmp69 = _mm512_shuffle_f32x4(tmp101, tmp101, 221);
tmp66 = _mm512_shuffle_f32x4(tmp103, tmp103, 136);
__m512 tmp70 = _mm512_shuffle_f32x4(tmp103, tmp103, 221);
tmp64 = _mm512_shuffle_f32x4(tmp105, tmp105, 136);
__m512 tmp71 = _mm512_shuffle_f32x4(tmp105, tmp105, 221);
tmp67 = _mm512_shuffle_f32x4(tmp107, tmp107, 136);
__m512 tmp72 = _mm512_shuffle_f32x4(tmp107, tmp107, 221);
tmp65 = _mm512_shuffle_f32x4(tmp102, tmp102, 136);
tmp63 = _mm512_shuffle_f32x4(tmp104, tmp104, 136);
tmp68 = _mm512_shuffle_f32x4(tmp106, tmp106, 136);
in6 = _mm512_shuffle_f32x4(tmp108, tmp108, 136);
in4 = _mm512_shuffle_f32x4(in4, tmp67, 68);
tmp66 = _mm512_shuffle_f32x4(tmp66, tmp65, 68);
tmp64 = _mm512_shuffle_f32x4(tmp64, tmp63, 68);
tmp68 = _mm512_shuffle_f32x4(tmp68, tmp70, 68);
in6 = _mm512_shuffle_f32x4(in6, tmp71, 68);
tmp69 = _mm512_shuffle_f32x4(tmp69, tmp72, 68);
__m512 tmp73 = _mm512_fmadd_ps(in4, _mm512_set1_ps(4e+00f), tmp64);
__m512 tmp79 = _mm512_fmadd_ps(tmp68, _mm512_set1_ps(4e+00f), tmp69);
__m512 tmp74 = _mm512_add_ps(in4, tmp64);
__m512 tmp80 = _mm512_add_ps(tmp68, tmp69);
__m512 tmp75 = _mm512_fmadd_ps(tmp64, _mm512_set1_ps(4e+00f), in4);
__m512 tmp81 = _mm512_fmadd_ps(tmp69, _mm512_set1_ps(4e+00f), tmp68);
__m512 tmp76 = _mm512_add_ps(tmp66, tmp74);
__m512 tmp82 = _mm512_add_ps(in6, tmp80);
__m512 tmp77 = _mm512_fmadd_ps(tmp66, _mm512_set1_ps(2e+00f), tmp75);
__m512 tmp83 = _mm512_fmadd_ps(in6, _mm512_set1_ps(2e+00f), tmp81);
tmp75 = _mm512_fnmadd_ps(tmp66, _mm512_set1_ps(2e+00f), tmp75);
tmp81 = _mm512_fnmadd_ps(in6, _mm512_set1_ps(2e+00f), tmp81);
__m512 tmp78 = _mm512_fnmadd_ps(tmp66, _mm512_set1_ps(2e+00f), tmp73);
__m512 tmp84 = _mm512_fnmadd_ps(in6, _mm512_set1_ps(2e+00f), tmp79);
tmp73 = _mm512_fmadd_ps(tmp66, _mm512_set1_ps(2e+00f), tmp73);
tmp79 = _mm512_fmadd_ps(in6, _mm512_set1_ps(2e+00f), tmp79);
tmp74 = _mm512_sub_ps(tmp74, tmp66);
tmp80 = _mm512_sub_ps(tmp80, in6);
in4 = _mm512_mul_ps(in4, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp76 = _mm512_mul_ps(tmp76, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp74 = _mm512_mul_ps(tmp74, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp77 = _mm512_mul_ps(tmp77, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp75 = _mm512_mul_ps(tmp75, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp73 = _mm512_mul_ps(tmp73, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp78 = _mm512_mul_ps(tmp78, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp64 = _mm512_mul_ps(tmp64, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp68 = _mm512_mul_ps(tmp68, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp82 = _mm512_mul_ps(tmp82, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp80 = _mm512_mul_ps(tmp80, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp83 = _mm512_mul_ps(tmp83, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp81 = _mm512_mul_ps(tmp81, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp79 = _mm512_mul_ps(tmp79, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp84 = _mm512_mul_ps(tmp84, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp69 = _mm512_mul_ps(tmp69, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
__m512 out17 = _mm512_shuffle_f32x4(in4, tmp76, 68);
__m512 out21 = _mm512_shuffle_f32x4(in4, tmp76, 238);
__m512 out18 = _mm512_shuffle_f32x4(tmp74, tmp77, 68);
__m512 out22 = _mm512_shuffle_f32x4(tmp74, tmp77, 238);
__m512 out19 = _mm512_shuffle_f32x4(tmp75, tmp73, 68);
__m512 out23 = _mm512_shuffle_f32x4(tmp75, tmp73, 238);
__m512 out20 = _mm512_shuffle_f32x4(tmp78, tmp64, 68);
__m512 out24 = _mm512_shuffle_f32x4(tmp78, tmp64, 238);
__m512 out25 = _mm512_shuffle_f32x4(tmp68, tmp82, 68);
__m512 out29 = _mm512_shuffle_f32x4(tmp68, tmp82, 238);
__m512 out26 = _mm512_shuffle_f32x4(tmp80, tmp83, 68);
__m512 out30 = _mm512_shuffle_f32x4(tmp80, tmp83, 238);
__m512 out27 = _mm512_shuffle_f32x4(tmp81, tmp79, 68);
__m512 out31 = _mm512_shuffle_f32x4(tmp81, tmp79, 238);
__m512 out28 = _mm512_shuffle_f32x4(tmp84, tmp69, 68);
__m512 out32 = _mm512_shuffle_f32x4(tmp84, tmp69, 238);
ptrdiff_t off5 = 32*cut2;
__m512i wf17 = _mm512_castsi256_si512(_mm512_cvtps_ph(out17, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf18 = _mm512_castsi256_si512(_mm512_cvtps_ph(out21, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf19 = _mm512_castsi256_si512(_mm512_cvtps_ph(out25, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf20 = _mm512_castsi256_si512(_mm512_cvtps_ph(out29, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf21 = _mm512_castsi256_si512(_mm512_cvtps_ph(out18, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf22 = _mm512_castsi256_si512(_mm512_cvtps_ph(out22, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf23 = _mm512_castsi256_si512(_mm512_cvtps_ph(out26, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf24 = _mm512_castsi256_si512(_mm512_cvtps_ph(out30, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf25 = _mm512_castsi256_si512(_mm512_cvtps_ph(out19, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf26 = _mm512_castsi256_si512(_mm512_cvtps_ph(out23, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf27 = _mm512_castsi256_si512(_mm512_cvtps_ph(out27, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf28 = _mm512_castsi256_si512(_mm512_cvtps_ph(out31, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf29 = _mm512_castsi256_si512(_mm512_cvtps_ph(out20, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf30 = _mm512_castsi256_si512(_mm512_cvtps_ph(out24, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf31 = _mm512_castsi256_si512(_mm512_cvtps_ph(out28, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf32 = _mm512_castsi256_si512(_mm512_cvtps_ph(out32, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
_mm512_mask_storeu_epi32(wfPtr1+0+64728576*i5+50688*k2+off5+128*s2, 255, wf17);
_mm512_mask_storeu_epi32(wfPtr1+32+64728576*i5+50688*k2+off5+128*s2, 255, wf18);
_mm512_mask_storeu_epi32(wfPtr1+64+64728576*i5+50688*k2+off5+128*s2, 255, wf19);
_mm512_mask_storeu_epi32(wfPtr1+96+64728576*i5+50688*k2+off5+128*s2, 255, wf20);
_mm512_mask_storeu_epi32(wfPtr1+16182144+64728576*i5+50688*k2+off5+128*s2, 255, wf21);
_mm512_mask_storeu_epi32(wfPtr1+16182176+64728576*i5+50688*k2+off5+128*s2, 255, wf22);
_mm512_mask_storeu_epi32(wfPtr1+16182208+64728576*i5+50688*k2+off5+128*s2, 255, wf23);
_mm512_mask_storeu_epi32(wfPtr1+16182240+64728576*i5+50688*k2+off5+128*s2, 255, wf24);
_mm512_mask_storeu_epi32(wfPtr1+32364288+64728576*i5+50688*k2+off5+128*s2, 255, wf25);
_mm512_mask_storeu_epi32(wfPtr1+32364320+64728576*i5+50688*k2+off5+128*s2, 255, wf26);
_mm512_mask_storeu_epi32(wfPtr1+32364352+64728576*i5+50688*k2+off5+128*s2, 255, wf27);
_mm512_mask_storeu_epi32(wfPtr1+32364384+64728576*i5+50688*k2+off5+128*s2, 255, wf28);
_mm512_mask_storeu_epi32(wfPtr1+48546432+64728576*i5+50688*k2+off5+128*s2, 255, wf29);
_mm512_mask_storeu_epi32(wfPtr1+48546464+64728576*i5+50688*k2+off5+128*s2, 255, wf30);
_mm512_mask_storeu_epi32(wfPtr1+48546496+64728576*i5+50688*k2+off5+128*s2, 255, wf31);
_mm512_mask_storeu_epi32(wfPtr1+48546528+64728576*i5+50688*k2+off5+128*s2, 255, wf32);
}
__m512 bias2 = _mm512_setzero_ps();
if (!e1) {
bias2 = _mm512_maskz_loadu_ps(1, biasPtr1-0+5108*i5+16*j1);
}
_mm512_mask_storeu_ps(bfPtr1-0+5108*i5+16*j1, 1, bias2);
if (j1 >= jj1) return;
j1 = 320;
}
return;
}
e1 = 4;
char*restrict bfPtr2 = tensors2[2]+30648*e1;
char*restrict wfPtr2 = tensors2[2]+153280+388371456*e1;
char*restrict wtPtr2 = tensors2[0]+14256*e1;
ptrdiff_t i6 = 1*g2;
ptrdiff_t j2 = 1*b2;
ptrdiff_t jj2 = j2+0;
if (j2 < 319) {
for (; j2 != 319; ++j2) {
ptrdiff_t k3 = 0+1*j2;
ptrdiff_t cut3 = 0;
ptrdiff_t s3 = 0;
for (; s3 != 465; ++s3) {
__m512 wt9 = _mm512_maskz_loadu_ps(511, wtPtr2+0+94196628*i6+295056*j2+36*s3);
__m512 wt10 = _mm512_maskz_loadu_ps(511, wtPtr2+73764+94196628*i6+295056*j2+36*s3);
__m512 wt11 = _mm512_maskz_loadu_ps(511, wtPtr2+147528+94196628*i6+295056*j2+36*s3);
__m512 wt12 = _mm512_maskz_loadu_ps(511, wtPtr2+221292+94196628*i6+295056*j2+36*s3);
__m512i pm5 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm6 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp9 = _mm512_permutex2var_ps(wt9, pm5, wt11);
__m512 tmp10 = _mm512_permutex2var_ps(wt10, pm5, wt12);
__m512 tmp11 = _mm512_permutex2var_ps(wt9, pm6, wt11);
__m512 tmp12 = _mm512_permutex2var_ps(wt10, pm6, wt12);
__m512 in7 = _mm512_permutex2var_ps(tmp9, pm5, tmp10);
__m512 in8 = _mm512_permutex2var_ps(tmp9, pm6, tmp10);
__m512 in9 = _mm512_permutex2var_ps(tmp11, pm5, tmp12);
__m512 tmp109 = _mm512_fmadd_ps(in7, _mm512_set1_ps(4e+00f), in9);
__m512 tmp110 = _mm512_add_ps(in7, in9);
__m512 tmp111 = _mm512_fmadd_ps(in9, _mm512_set1_ps(4e+00f), in7);
__m512 tmp112 = _mm512_add_ps(in8, tmp110);
__m512 tmp113 = _mm512_fmadd_ps(in8, _mm512_set1_ps(2e+00f), tmp111);
tmp111 = _mm512_fnmadd_ps(in8, _mm512_set1_ps(2e+00f), tmp111);
__m512 tmp114 = _mm512_fnmadd_ps(in8, _mm512_set1_ps(2e+00f), tmp109);
tmp109 = _mm512_fmadd_ps(in8, _mm512_set1_ps(2e+00f), tmp109);
tmp110 = _mm512_sub_ps(tmp110, in8);
__m512 tmp131 = _mm512_unpacklo_ps(in7, tmp112);
__m512 tmp132 = _mm512_unpackhi_ps(in7, tmp112);
__m512 tmp133 = _mm512_unpacklo_ps(tmp110, tmp113);
__m512 tmp134 = _mm512_unpackhi_ps(tmp110, tmp113);
__m512 tmp135 = _mm512_unpacklo_ps(tmp111, tmp109);
__m512 tmp136 = _mm512_unpackhi_ps(tmp111, tmp109);
__m512 tmp137 = _mm512_unpacklo_ps(tmp114, in9);
__m512 tmp138 = _mm512_unpackhi_ps(tmp114, in9);
__m512 tmp139 = _mm512_shuffle_ps(tmp131, tmp133, 68);
__m512 tmp140 = _mm512_shuffle_ps(tmp131, tmp133, 238);
__m512 tmp141 = _mm512_shuffle_ps(tmp132, tmp134, 68);
__m512 tmp142 = _mm512_shuffle_ps(tmp132, tmp134, 238);
__m512 tmp143 = _mm512_shuffle_ps(tmp135, tmp137, 68);
__m512 tmp144 = _mm512_shuffle_ps(tmp135, tmp137, 238);
__m512 tmp145 = _mm512_shuffle_ps(tmp136, tmp138, 68);
__m512 tmp146 = _mm512_shuffle_ps(tmp136, tmp138, 238);
__m512 tmp147 = _mm512_shuffle_f32x4(tmp139, tmp143, 136);
__m512 tmp148 = _mm512_shuffle_f32x4(tmp139, tmp143, 221);
__m512 tmp149 = _mm512_shuffle_f32x4(tmp140, tmp144, 136);
__m512 tmp150 = _mm512_shuffle_f32x4(tmp140, tmp144, 221);
__m512 tmp151 = _mm512_shuffle_f32x4(tmp141, tmp145, 136);
__m512 tmp152 = _mm512_shuffle_f32x4(tmp141, tmp145, 221);
__m512 tmp153 = _mm512_shuffle_f32x4(tmp142, tmp146, 136);
__m512 tmp154 = _mm512_shuffle_f32x4(tmp142, tmp146, 221);
in7 = _mm512_shuffle_f32x4(tmp147, tmp147, 136);
__m512 tmp115 = _mm512_shuffle_f32x4(tmp147, tmp147, 221);
tmp112 = _mm512_shuffle_f32x4(tmp149, tmp149, 136);
__m512 tmp116 = _mm512_shuffle_f32x4(tmp149, tmp149, 221);
tmp110 = _mm512_shuffle_f32x4(tmp151, tmp151, 136);
__m512 tmp117 = _mm512_shuffle_f32x4(tmp151, tmp151, 221);
tmp113 = _mm512_shuffle_f32x4(tmp153, tmp153, 136);
__m512 tmp118 = _mm512_shuffle_f32x4(tmp153, tmp153, 221);
tmp111 = _mm512_shuffle_f32x4(tmp148, tmp148, 136);
tmp109 = _mm512_shuffle_f32x4(tmp150, tmp150, 136);
tmp114 = _mm512_shuffle_f32x4(tmp152, tmp152, 136);
in9 = _mm512_shuffle_f32x4(tmp154, tmp154, 136);
in7 = _mm512_shuffle_f32x4(in7, tmp113, 68);
tmp112 = _mm512_shuffle_f32x4(tmp112, tmp111, 68);
tmp110 = _mm512_shuffle_f32x4(tmp110, tmp109, 68);
tmp114 = _mm512_shuffle_f32x4(tmp114, tmp116, 68);
in9 = _mm512_shuffle_f32x4(in9, tmp117, 68);
tmp115 = _mm512_shuffle_f32x4(tmp115, tmp118, 68);
__m512 tmp119 = _mm512_fmadd_ps(in7, _mm512_set1_ps(4e+00f), tmp110);
__m512 tmp125 = _mm512_fmadd_ps(tmp114, _mm512_set1_ps(4e+00f), tmp115);
__m512 tmp120 = _mm512_add_ps(in7, tmp110);
__m512 tmp126 = _mm512_add_ps(tmp114, tmp115);
__m512 tmp121 = _mm512_fmadd_ps(tmp110, _mm512_set1_ps(4e+00f), in7);
__m512 tmp127 = _mm512_fmadd_ps(tmp115, _mm512_set1_ps(4e+00f), tmp114);
__m512 tmp122 = _mm512_add_ps(tmp112, tmp120);
__m512 tmp128 = _mm512_add_ps(in9, tmp126);
__m512 tmp123 = _mm512_fmadd_ps(tmp112, _mm512_set1_ps(2e+00f), tmp121);
__m512 tmp129 = _mm512_fmadd_ps(in9, _mm512_set1_ps(2e+00f), tmp127);
tmp121 = _mm512_fnmadd_ps(tmp112, _mm512_set1_ps(2e+00f), tmp121);
tmp127 = _mm512_fnmadd_ps(in9, _mm512_set1_ps(2e+00f), tmp127);
__m512 tmp124 = _mm512_fnmadd_ps(tmp112, _mm512_set1_ps(2e+00f), tmp119);
__m512 tmp130 = _mm512_fnmadd_ps(in9, _mm512_set1_ps(2e+00f), tmp125);
tmp119 = _mm512_fmadd_ps(tmp112, _mm512_set1_ps(2e+00f), tmp119);
tmp125 = _mm512_fmadd_ps(in9, _mm512_set1_ps(2e+00f), tmp125);
tmp120 = _mm512_sub_ps(tmp120, tmp112);
tmp126 = _mm512_sub_ps(tmp126, in9);
in7 = _mm512_mul_ps(in7, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp122 = _mm512_mul_ps(tmp122, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp120 = _mm512_mul_ps(tmp120, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp123 = _mm512_mul_ps(tmp123, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp121 = _mm512_mul_ps(tmp121, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp119 = _mm512_mul_ps(tmp119, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp124 = _mm512_mul_ps(tmp124, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp110 = _mm512_mul_ps(tmp110, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp114 = _mm512_mul_ps(tmp114, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp128 = _mm512_mul_ps(tmp128, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp126 = _mm512_mul_ps(tmp126, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp129 = _mm512_mul_ps(tmp129, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp127 = _mm512_mul_ps(tmp127, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp125 = _mm512_mul_ps(tmp125, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp130 = _mm512_mul_ps(tmp130, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp115 = _mm512_mul_ps(tmp115, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
__m512 out33 = _mm512_shuffle_f32x4(in7, tmp122, 68);
__m512 out37 = _mm512_shuffle_f32x4(in7, tmp122, 238);
__m512 out34 = _mm512_shuffle_f32x4(tmp120, tmp123, 68);
__m512 out38 = _mm512_shuffle_f32x4(tmp120, tmp123, 238);
__m512 out35 = _mm512_shuffle_f32x4(tmp121, tmp119, 68);
__m512 out39 = _mm512_shuffle_f32x4(tmp121, tmp119, 238);
__m512 out36 = _mm512_shuffle_f32x4(tmp124, tmp110, 68);
__m512 out40 = _mm512_shuffle_f32x4(tmp124, tmp110, 238);
__m512 out41 = _mm512_shuffle_f32x4(tmp114, tmp128, 68);
__m512 out45 = _mm512_shuffle_f32x4(tmp114, tmp128, 238);
__m512 out42 = _mm512_shuffle_f32x4(tmp126, tmp129, 68);
__m512 out46 = _mm512_shuffle_f32x4(tmp126, tmp129, 238);
__m512 out43 = _mm512_shuffle_f32x4(tmp127, tmp125, 68);
__m512 out47 = _mm512_shuffle_f32x4(tmp127, tmp125, 238);
__m512 out44 = _mm512_shuffle_f32x4(tmp130, tmp115, 68);
__m512 out48 = _mm512_shuffle_f32x4(tmp130, tmp115, 238);
ptrdiff_t off6 = 32*cut3;
ptrdiff_t off7 = (size_t)(cut3+1)/4*59520+(size_t)(cut3+1)%4*32;
ptrdiff_t off8 = (size_t)(cut3+2)/4*59520+(size_t)(cut3+2)%4*32;
ptrdiff_t off9 = (size_t)(cut3+3)/4*59520+(size_t)(cut3+3)%4*32;
__m512i wf33 = _mm512_castsi256_si512(_mm512_cvtps_ph(out33, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf34 = _mm512_castsi256_si512(_mm512_cvtps_ph(out37, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf35 = _mm512_castsi256_si512(_mm512_cvtps_ph(out41, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf36 = _mm512_castsi256_si512(_mm512_cvtps_ph(out45, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf37 = _mm512_castsi256_si512(_mm512_cvtps_ph(out34, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf38 = _mm512_castsi256_si512(_mm512_cvtps_ph(out38, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf39 = _mm512_castsi256_si512(_mm512_cvtps_ph(out42, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf40 = _mm512_castsi256_si512(_mm512_cvtps_ph(out46, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf41 = _mm512_castsi256_si512(_mm512_cvtps_ph(out35, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf42 = _mm512_castsi256_si512(_mm512_cvtps_ph(out39, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf43 = _mm512_castsi256_si512(_mm512_cvtps_ph(out43, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf44 = _mm512_castsi256_si512(_mm512_cvtps_ph(out47, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf45 = _mm512_castsi256_si512(_mm512_cvtps_ph(out36, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf46 = _mm512_castsi256_si512(_mm512_cvtps_ph(out40, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf47 = _mm512_castsi256_si512(_mm512_cvtps_ph(out44, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf48 = _mm512_castsi256_si512(_mm512_cvtps_ph(out48, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
_mm512_mask_storeu_epi32(wfPtr2+0+76007168*i6+59520*k3+off6+128*s3, 255, wf33);
_mm512_mask_storeu_epi32(wfPtr2+0+76007168*i6+59520*k3+off7+128*s3, 255, wf34);
_mm512_mask_storeu_epi32(wfPtr2+0+76007168*i6+59520*k3+off8+128*s3, 255, wf35);
_mm512_mask_storeu_epi32(wfPtr2+0+76007168*i6+59520*k3+off9+128*s3, 255, wf36);
_mm512_mask_storeu_epi32(wfPtr2+19001792+76007168*i6+59520*k3+off6+128*s3, 255, wf37);
_mm512_mask_storeu_epi32(wfPtr2+19001792+76007168*i6+59520*k3+off7+128*s3, 255, wf38);
_mm512_mask_storeu_epi32(wfPtr2+19001792+76007168*i6+59520*k3+off8+128*s3, 255, wf39);
_mm512_mask_storeu_epi32(wfPtr2+19001792+76007168*i6+59520*k3+off9+128*s3, 255, wf40);
_mm512_mask_storeu_epi32(wfPtr2+38003584+76007168*i6+59520*k3+off6+128*s3, 255, wf41);
_mm512_mask_storeu_epi32(wfPtr2+38003584+76007168*i6+59520*k3+off7+128*s3, 255, wf42);
_mm512_mask_storeu_epi32(wfPtr2+38003584+76007168*i6+59520*k3+off8+128*s3, 255, wf43);
_mm512_mask_storeu_epi32(wfPtr2+38003584+76007168*i6+59520*k3+off9+128*s3, 255, wf44);
_mm512_mask_storeu_epi32(wfPtr2+57005376+76007168*i6+59520*k3+off6+128*s3, 255, wf45);
_mm512_mask_storeu_epi32(wfPtr2+57005376+76007168*i6+59520*k3+off7+128*s3, 255, wf46);
_mm512_mask_storeu_epi32(wfPtr2+57005376+76007168*i6+59520*k3+off8+128*s3, 255, wf47);
_mm512_mask_storeu_epi32(wfPtr2+57005376+76007168*i6+59520*k3+off9+128*s3, 255, wf48);
}
_mm512_mask_storeu_ps(bfPtr2-0+5108*i6+16*j2, 15, _mm512_setzero_ps());
if (j2 >= jj2) return;
}
}
if (j2 == 319) {
ptrdiff_t k4 = 0+1*j2;
ptrdiff_t cut4 = 0;
ptrdiff_t s4 = 0;
for (; s4 != 116; ++s4) {
__m512 wt13 = _mm512_maskz_loadu_ps(511, wtPtr2+0+94196628*i6+295056*j2+144*s4);
__m512 wt14 = _mm512_maskz_loadu_ps(511, wtPtr2+36+94196628*i6+295056*j2+144*s4);
__m512 wt15 = _mm512_maskz_loadu_ps(511, wtPtr2+72+94196628*i6+295056*j2+144*s4);
__m512 wt16 = _mm512_maskz_loadu_ps(511, wtPtr2+108+94196628*i6+295056*j2+144*s4);
__m512i pm7 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm8 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp13 = _mm512_permutex2var_ps(wt13, pm7, wt15);
__m512 tmp14 = _mm512_permutex2var_ps(wt14, pm7, wt16);
__m512 tmp15 = _mm512_permutex2var_ps(wt13, pm8, wt15);
__m512 tmp16 = _mm512_permutex2var_ps(wt14, pm8, wt16);
__m512 in10 = _mm512_permutex2var_ps(tmp13, pm7, tmp14);
__m512 in11 = _mm512_permutex2var_ps(tmp13, pm8, tmp14);
__m512 in12 = _mm512_permutex2var_ps(tmp15, pm7, tmp16);
__m512 tmp155 = _mm512_fmadd_ps(in10, _mm512_set1_ps(4e+00f), in12);
__m512 tmp156 = _mm512_add_ps(in10, in12);
__m512 tmp157 = _mm512_fmadd_ps(in12, _mm512_set1_ps(4e+00f), in10);
__m512 tmp158 = _mm512_add_ps(in11, tmp156);
__m512 tmp159 = _mm512_fmadd_ps(in11, _mm512_set1_ps(2e+00f), tmp157);
tmp157 = _mm512_fnmadd_ps(in11, _mm512_set1_ps(2e+00f), tmp157);
__m512 tmp160 = _mm512_fnmadd_ps(in11, _mm512_set1_ps(2e+00f), tmp155);
tmp155 = _mm512_fmadd_ps(in11, _mm512_set1_ps(2e+00f), tmp155);
tmp156 = _mm512_sub_ps(tmp156, in11);
__m512 tmp177 = _mm512_unpacklo_ps(in10, tmp158);
__m512 tmp178 = _mm512_unpackhi_ps(in10, tmp158);
__m512 tmp179 = _mm512_unpacklo_ps(tmp156, tmp159);
__m512 tmp180 = _mm512_unpackhi_ps(tmp156, tmp159);
__m512 tmp181 = _mm512_unpacklo_ps(tmp157, tmp155);
__m512 tmp182 = _mm512_unpackhi_ps(tmp157, tmp155);
__m512 tmp183 = _mm512_unpacklo_ps(tmp160, in12);
__m512 tmp184 = _mm512_unpackhi_ps(tmp160, in12);
__m512 tmp185 = _mm512_shuffle_ps(tmp177, tmp179, 68);
__m512 tmp186 = _mm512_shuffle_ps(tmp177, tmp179, 238);
__m512 tmp187 = _mm512_shuffle_ps(tmp178, tmp180, 68);
__m512 tmp188 = _mm512_shuffle_ps(tmp178, tmp180, 238);
__m512 tmp189 = _mm512_shuffle_ps(tmp181, tmp183, 68);
__m512 tmp190 = _mm512_shuffle_ps(tmp181, tmp183, 238);
__m512 tmp191 = _mm512_shuffle_ps(tmp182, tmp184, 68);
__m512 tmp192 = _mm512_shuffle_ps(tmp182, tmp184, 238);
__m512 tmp193 = _mm512_shuffle_f32x4(tmp185, tmp189, 136);
__m512 tmp194 = _mm512_shuffle_f32x4(tmp185, tmp189, 221);
__m512 tmp195 = _mm512_shuffle_f32x4(tmp186, tmp190, 136);
__m512 tmp196 = _mm512_shuffle_f32x4(tmp186, tmp190, 221);
__m512 tmp197 = _mm512_shuffle_f32x4(tmp187, tmp191, 136);
__m512 tmp198 = _mm512_shuffle_f32x4(tmp187, tmp191, 221);
__m512 tmp199 = _mm512_shuffle_f32x4(tmp188, tmp192, 136);
__m512 tmp200 = _mm512_shuffle_f32x4(tmp188, tmp192, 221);
in10 = _mm512_shuffle_f32x4(tmp193, tmp193, 136);
__m512 tmp161 = _mm512_shuffle_f32x4(tmp193, tmp193, 221);
tmp158 = _mm512_shuffle_f32x4(tmp195, tmp195, 136);
__m512 tmp162 = _mm512_shuffle_f32x4(tmp195, tmp195, 221);
tmp156 = _mm512_shuffle_f32x4(tmp197, tmp197, 136);
__m512 tmp163 = _mm512_shuffle_f32x4(tmp197, tmp197, 221);
tmp159 = _mm512_shuffle_f32x4(tmp199, tmp199, 136);
__m512 tmp164 = _mm512_shuffle_f32x4(tmp199, tmp199, 221);
tmp157 = _mm512_shuffle_f32x4(tmp194, tmp194, 136);
tmp155 = _mm512_shuffle_f32x4(tmp196, tmp196, 136);
tmp160 = _mm512_shuffle_f32x4(tmp198, tmp198, 136);
in12 = _mm512_shuffle_f32x4(tmp200, tmp200, 136);
in10 = _mm512_shuffle_f32x4(in10, tmp159, 68);
tmp158 = _mm512_shuffle_f32x4(tmp158, tmp157, 68);
tmp156 = _mm512_shuffle_f32x4(tmp156, tmp155, 68);
tmp160 = _mm512_shuffle_f32x4(tmp160, tmp162, 68);
in12 = _mm512_shuffle_f32x4(in12, tmp163, 68);
tmp161 = _mm512_shuffle_f32x4(tmp161, tmp164, 68);
__m512 tmp165 = _mm512_fmadd_ps(in10, _mm512_set1_ps(4e+00f), tmp156);
__m512 tmp171 = _mm512_fmadd_ps(tmp160, _mm512_set1_ps(4e+00f), tmp161);
__m512 tmp166 = _mm512_add_ps(in10, tmp156);
__m512 tmp172 = _mm512_add_ps(tmp160, tmp161);
__m512 tmp167 = _mm512_fmadd_ps(tmp156, _mm512_set1_ps(4e+00f), in10);
__m512 tmp173 = _mm512_fmadd_ps(tmp161, _mm512_set1_ps(4e+00f), tmp160);
__m512 tmp168 = _mm512_add_ps(tmp158, tmp166);
__m512 tmp174 = _mm512_add_ps(in12, tmp172);
__m512 tmp169 = _mm512_fmadd_ps(tmp158, _mm512_set1_ps(2e+00f), tmp167);
__m512 tmp175 = _mm512_fmadd_ps(in12, _mm512_set1_ps(2e+00f), tmp173);
tmp167 = _mm512_fnmadd_ps(tmp158, _mm512_set1_ps(2e+00f), tmp167);
tmp173 = _mm512_fnmadd_ps(in12, _mm512_set1_ps(2e+00f), tmp173);
__m512 tmp170 = _mm512_fnmadd_ps(tmp158, _mm512_set1_ps(2e+00f), tmp165);
__m512 tmp176 = _mm512_fnmadd_ps(in12, _mm512_set1_ps(2e+00f), tmp171);
tmp165 = _mm512_fmadd_ps(tmp158, _mm512_set1_ps(2e+00f), tmp165);
tmp171 = _mm512_fmadd_ps(in12, _mm512_set1_ps(2e+00f), tmp171);
tmp166 = _mm512_sub_ps(tmp166, tmp158);
tmp172 = _mm512_sub_ps(tmp172, in12);
in10 = _mm512_mul_ps(in10, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp168 = _mm512_mul_ps(tmp168, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp166 = _mm512_mul_ps(tmp166, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp169 = _mm512_mul_ps(tmp169, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp167 = _mm512_mul_ps(tmp167, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp165 = _mm512_mul_ps(tmp165, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp170 = _mm512_mul_ps(tmp170, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp156 = _mm512_mul_ps(tmp156, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp160 = _mm512_mul_ps(tmp160, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp174 = _mm512_mul_ps(tmp174, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp172 = _mm512_mul_ps(tmp172, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp175 = _mm512_mul_ps(tmp175, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp173 = _mm512_mul_ps(tmp173, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp171 = _mm512_mul_ps(tmp171, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp176 = _mm512_mul_ps(tmp176, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp161 = _mm512_mul_ps(tmp161, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
__m512 out49 = _mm512_shuffle_f32x4(in10, tmp168, 68);
__m512 out53 = _mm512_shuffle_f32x4(in10, tmp168, 238);
__m512 out50 = _mm512_shuffle_f32x4(tmp166, tmp169, 68);
__m512 out54 = _mm512_shuffle_f32x4(tmp166, tmp169, 238);
__m512 out51 = _mm512_shuffle_f32x4(tmp167, tmp165, 68);
__m512 out55 = _mm512_shuffle_f32x4(tmp167, tmp165, 238);
__m512 out52 = _mm512_shuffle_f32x4(tmp170, tmp156, 68);
__m512 out56 = _mm512_shuffle_f32x4(tmp170, tmp156, 238);
__m512 out57 = _mm512_shuffle_f32x4(tmp160, tmp174, 68);
__m512 out61 = _mm512_shuffle_f32x4(tmp160, tmp174, 238);
__m512 out58 = _mm512_shuffle_f32x4(tmp172, tmp175, 68);
__m512 out62 = _mm512_shuffle_f32x4(tmp172, tmp175, 238);
__m512 out59 = _mm512_shuffle_f32x4(tmp173, tmp171, 68);
__m512 out63 = _mm512_shuffle_f32x4(tmp173, tmp171, 238);
__m512 out60 = _mm512_shuffle_f32x4(tmp176, tmp161, 68);
__m512 out64 = _mm512_shuffle_f32x4(tmp176, tmp161, 238);
ptrdiff_t off10 = 32*cut4;
__m512i wf49 = _mm512_castsi256_si512(_mm512_cvtps_ph(out49, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf50 = _mm512_castsi256_si512(_mm512_cvtps_ph(out53, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf51 = _mm512_castsi256_si512(_mm512_cvtps_ph(out57, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf52 = _mm512_castsi256_si512(_mm512_cvtps_ph(out61, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf53 = _mm512_castsi256_si512(_mm512_cvtps_ph(out50, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf54 = _mm512_castsi256_si512(_mm512_cvtps_ph(out54, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf55 = _mm512_castsi256_si512(_mm512_cvtps_ph(out58, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf56 = _mm512_castsi256_si512(_mm512_cvtps_ph(out62, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf57 = _mm512_castsi256_si512(_mm512_cvtps_ph(out51, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf58 = _mm512_castsi256_si512(_mm512_cvtps_ph(out55, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf59 = _mm512_castsi256_si512(_mm512_cvtps_ph(out59, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf60 = _mm512_castsi256_si512(_mm512_cvtps_ph(out63, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf61 = _mm512_castsi256_si512(_mm512_cvtps_ph(out52, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf62 = _mm512_castsi256_si512(_mm512_cvtps_ph(out56, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf63 = _mm512_castsi256_si512(_mm512_cvtps_ph(out60, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf64 = _mm512_castsi256_si512(_mm512_cvtps_ph(out64, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
_mm512_mask_storeu_epi32(wfPtr2+0+76007168*i6+59520*k4+off10+128*s4, 255, wf49);
_mm512_mask_storeu_epi32(wfPtr2+32+76007168*i6+59520*k4+off10+128*s4, 255, wf50);
_mm512_mask_storeu_epi32(wfPtr2+64+76007168*i6+59520*k4+off10+128*s4, 255, wf51);
_mm512_mask_storeu_epi32(wfPtr2+96+76007168*i6+59520*k4+off10+128*s4, 255, wf52);
_mm512_mask_storeu_epi32(wfPtr2+19001792+76007168*i6+59520*k4+off10+128*s4, 255, wf53);
_mm512_mask_storeu_epi32(wfPtr2+19001824+76007168*i6+59520*k4+off10+128*s4, 255, wf54);
_mm512_mask_storeu_epi32(wfPtr2+19001856+76007168*i6+59520*k4+off10+128*s4, 255, wf55);
_mm512_mask_storeu_epi32(wfPtr2+19001888+76007168*i6+59520*k4+off10+128*s4, 255, wf56);
_mm512_mask_storeu_epi32(wfPtr2+38003584+76007168*i6+59520*k4+off10+128*s4, 255, wf57);
_mm512_mask_storeu_epi32(wfPtr2+38003616+76007168*i6+59520*k4+off10+128*s4, 255, wf58);
_mm512_mask_storeu_epi32(wfPtr2+38003648+76007168*i6+59520*k4+off10+128*s4, 255, wf59);
_mm512_mask_storeu_epi32(wfPtr2+38003680+76007168*i6+59520*k4+off10+128*s4, 255, wf60);
_mm512_mask_storeu_epi32(wfPtr2+57005376+76007168*i6+59520*k4+off10+128*s4, 255, wf61);
_mm512_mask_storeu_epi32(wfPtr2+57005408+76007168*i6+59520*k4+off10+128*s4, 255, wf62);
_mm512_mask_storeu_epi32(wfPtr2+57005440+76007168*i6+59520*k4+off10+128*s4, 255, wf63);
_mm512_mask_storeu_epi32(wfPtr2+57005472+76007168*i6+59520*k4+off10+128*s4, 255, wf64);
}
__m512 wt17 = _mm512_maskz_loadu_ps(511, wtPtr2+0+94196628*i6+295056*j2+144*s4);
__m512i via1 = _mm512_castps_si512(wt17);
__m512 in13 = wt17;
__m512 in14 = _mm512_castsi512_ps(_mm512_alignr_epi32(via1, via1, 3));
__m512 in15 = _mm512_castsi512_ps(_mm512_alignr_epi32(via1, via1, 6));
__m512 tmp201 = _mm512_fmadd_ps(in13, _mm512_set1_ps(4e+00f), in15);
__m512 tmp202 = _mm512_add_ps(in13, in15);
__m512 tmp203 = _mm512_fmadd_ps(in15, _mm512_set1_ps(4e+00f), in13);
__m512 tmp204 = _mm512_add_ps(in14, tmp202);
__m512 tmp205 = _mm512_fmadd_ps(in14, _mm512_set1_ps(2e+00f), tmp203);
tmp203 = _mm512_fnmadd_ps(in14, _mm512_set1_ps(2e+00f), tmp203);
__m512 tmp206 = _mm512_fnmadd_ps(in14, _mm512_set1_ps(2e+00f), tmp201);
tmp201 = _mm512_fmadd_ps(in14, _mm512_set1_ps(2e+00f), tmp201);
tmp202 = _mm512_sub_ps(tmp202, in14);
__m512 tmp213 = _mm512_unpacklo_ps(in13, tmp204);
__m512 tmp214 = _mm512_unpackhi_ps(in13, tmp204);
__m512 tmp215 = _mm512_unpacklo_ps(tmp202, tmp205);
__m512 tmp216 = _mm512_unpackhi_ps(tmp202, tmp205);
__m512 tmp217 = _mm512_unpacklo_ps(tmp203, tmp201);
__m512 tmp218 = _mm512_unpackhi_ps(tmp203, tmp201);
__m512 tmp219 = _mm512_unpacklo_ps(tmp206, in15);
__m512 tmp220 = _mm512_unpackhi_ps(tmp206, in15);
__m512 tmp221 = _mm512_shuffle_ps(tmp213, tmp215, 68);
__m512 tmp222 = _mm512_shuffle_ps(tmp213, tmp215, 238);
__m512 tmp223 = _mm512_shuffle_ps(tmp214, tmp216, 68);
__m512 tmp224 = _mm512_shuffle_ps(tmp217, tmp219, 68);
__m512 tmp225 = _mm512_shuffle_ps(tmp217, tmp219, 238);
__m512 tmp226 = _mm512_shuffle_ps(tmp218, tmp220, 68);
__m512 tmp227 = _mm512_shuffle_f32x4(tmp221, tmp224, 136);
__m512 tmp228 = _mm512_shuffle_f32x4(tmp222, tmp225, 136);
__m512 tmp229 = _mm512_shuffle_f32x4(tmp223, tmp226, 136);
in13 = _mm512_shuffle_f32x4(tmp227, tmp227, 136);
tmp204 = _mm512_shuffle_f32x4(tmp228, tmp228, 136);
tmp202 = _mm512_shuffle_f32x4(tmp229, tmp229, 136);
__m512 tmp207 = _mm512_fmadd_ps(in13, _mm512_set1_ps(4e+00f), tmp202);
__m512 tmp208 = _mm512_add_ps(in13, tmp202);
__m512 tmp209 = _mm512_fmadd_ps(tmp202, _mm512_set1_ps(4e+00f), in13);
__m512 tmp210 = _mm512_add_ps(tmp204, tmp208);
__m512 tmp211 = _mm512_fmadd_ps(tmp204, _mm512_set1_ps(2e+00f), tmp209);
tmp209 = _mm512_fnmadd_ps(tmp204, _mm512_set1_ps(2e+00f), tmp209);
__m512 tmp212 = _mm512_fnmadd_ps(tmp204, _mm512_set1_ps(2e+00f), tmp207);
tmp207 = _mm512_fmadd_ps(tmp204, _mm512_set1_ps(2e+00f), tmp207);
tmp208 = _mm512_sub_ps(tmp208, tmp204);
in13 = _mm512_mul_ps(in13, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp210 = _mm512_mul_ps(tmp210, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp208 = _mm512_mul_ps(tmp208, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp211 = _mm512_mul_ps(tmp211, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp209 = _mm512_mul_ps(tmp209, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp207 = _mm512_mul_ps(tmp207, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp212 = _mm512_mul_ps(tmp212, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp202 = _mm512_mul_ps(tmp202, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
__m512 out65 = _mm512_shuffle_f32x4(in13, tmp210, 68);
__m512 out66 = _mm512_shuffle_f32x4(tmp208, tmp211, 68);
__m512 out67 = _mm512_shuffle_f32x4(tmp209, tmp207, 68);
__m512 out68 = _mm512_shuffle_f32x4(tmp212, tmp202, 68);
ptrdiff_t off11 = 32*cut4;
__m512i wf65 = _mm512_castsi256_si512(_mm512_cvtps_ph(out65, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf66 = _mm512_castsi256_si512(_mm512_cvtps_ph(out66, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf67 = _mm512_castsi256_si512(_mm512_cvtps_ph(out67, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf68 = _mm512_castsi256_si512(_mm512_cvtps_ph(out68, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
_mm512_mask_storeu_epi32(wfPtr2+0+76007168*i6+59520*k4+off11+128*s4, 255, wf65);
_mm512_mask_storeu_epi32(wfPtr2+19001792+76007168*i6+59520*k4+off11+128*s4, 255, wf66);
_mm512_mask_storeu_epi32(wfPtr2+38003584+76007168*i6+59520*k4+off11+128*s4, 255, wf67);
_mm512_mask_storeu_epi32(wfPtr2+57005376+76007168*i6+59520*k4+off11+128*s4, 255, wf68);
_mm512_mask_storeu_ps(bfPtr2-0+5108*i6+16*j2, 1, _mm512_setzero_ps());
if (j2 >= jj2) return;
j2 = 320;
}
}

/*
 * Launch the filter-arrangement callee on a thread team.
 *
 * Fills in a three-dimensional task descriptor whose hull extents are
 * 320, 6 and 5 (dimensions 0, 1 and 2 respectively), attaches the tensor
 * pointer table as the task's opaque payload, and hands the descriptor to
 * Example13ThreaderDo1 together with the team.
 *
 * team13:   thread team passed through unchanged to Example13ThreaderDo1.
 * tensors1: pointer table stored in the task's any1 field for the callee.
 *
 * NOTE(review): how Example13ThreaderDo1 partitions the hull across
 * threads, and whether it returns only after all work is done, is not
 * visible here -- confirm against the threader implementation.
 */
static void Example13ThreeArrangeFilts1(Example13ThreaderTeam1* team13, char** tensors1) {
    Example13ThreaderTask1 arrangeJob;

    /* Iteration space: 3 dimensions with extents 320 x 6 x 5. */
    arrangeJob.nd1 = 3;
    arrangeJob.hull1[2] = 5;
    arrangeJob.hull1[1] = 6;
    arrangeJob.hull1[0] = 320;

    /* Work function and the payload it receives through the task. */
    arrangeJob.any1 = tensors1;
    arrangeJob.callee1 = Example13ThreeArrangeFilts1Callee1;

    Example13ThreaderDo1(team13, &arrangeJob);
}

static void Example13ThreeArrangeDats1Callee1(Example13ThreaderTask1* task6, int64_t* pt8) {
char** tensors4 = task6->any1;
ptrdiff_t s5 = pt8[0];
ptrdiff_t c1 = pt8[1];
ptrdiff_t g3 = pt8[2];
ptrdiff_t e2 = pt8[3];
if (e2 < 4) {
char*restrict datPtr1 = tensors4[0]-0+1384416*e2;
char*restrict dfPtr1 = tensors4[1]+14598144*e2;
ptrdiff_t i7 = 1*g3;
ptrdiff_t j3 = 1*c1;
ptrdiff_t last1 = j3+0;
ptrdiff_t rel1 = (size_t)(j3-0)%2;
ptrdiff_t base1 = 0+(size_t)(j3-0)/2*18;
for (; ; rel1 = 0, base1 += 18) {
if (rel1 < 1) {
ptrdiff_t h1 = base1+0;
ptrdiff_t w1 = 0;
ptrdiff_t k5 = 0;
for (; k5 != 49; ++k5) {
__m512 dat1 = _mm512_maskz_loadu_ps(16383, datPtr1+0+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat2 = _mm512_maskz_loadu_ps(2047, datPtr1+48+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512i pm9 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in16 = _mm512_permutexvar_ps(pm9, dat1);
__m512i pm10 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in24 = _mm512_permutexvar_ps(pm10, dat2);
__m512 dat3 = _mm512_maskz_loadu_ps(16383, datPtr1+92+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat4 = _mm512_maskz_loadu_ps(2047, datPtr1+140+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in17 = _mm512_permutexvar_ps(pm9, dat3);
__m512 in25 = _mm512_permutexvar_ps(pm10, dat4);
__m512 dat5 = _mm512_maskz_loadu_ps(16383, datPtr1+184+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat6 = _mm512_maskz_loadu_ps(2047, datPtr1+232+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in18 = _mm512_permutexvar_ps(pm9, dat5);
__m512 in26 = _mm512_permutexvar_ps(pm10, dat6);
__m512 dat7 = _mm512_maskz_loadu_ps(16383, datPtr1+276+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat8 = _mm512_maskz_loadu_ps(2047, datPtr1+324+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in19 = _mm512_permutexvar_ps(pm9, dat7);
__m512 in27 = _mm512_permutexvar_ps(pm10, dat8);
__m512 dat9 = _mm512_maskz_loadu_ps(16383, datPtr1+368+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat10 = _mm512_maskz_loadu_ps(2047, datPtr1+416+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in20 = _mm512_permutexvar_ps(pm9, dat9);
__m512 in28 = _mm512_permutexvar_ps(pm10, dat10);
__m512 dat11 = _mm512_maskz_loadu_ps(16383, datPtr1+460+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat12 = _mm512_maskz_loadu_ps(2047, datPtr1+508+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in21 = _mm512_permutexvar_ps(pm9, dat11);
__m512 in29 = _mm512_permutexvar_ps(pm10, dat12);
__m512 dat13 = _mm512_maskz_loadu_ps(16383, datPtr1+552+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat14 = _mm512_maskz_loadu_ps(2047, datPtr1+600+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in22 = _mm512_permutexvar_ps(pm9, dat13);
__m512 in30 = _mm512_permutexvar_ps(pm10, dat14);
__m512 dat15 = _mm512_maskz_loadu_ps(16383, datPtr1+644+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat16 = _mm512_maskz_loadu_ps(2047, datPtr1+692+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in23 = _mm512_permutexvar_ps(pm9, dat15);
__m512 in31 = _mm512_permutexvar_ps(pm10, dat16);
__m512 tmp230 = _mm512_add_ps(in17, in21);
__m512 tmp234 = _mm512_add_ps(in25, in29);
__m512 tmp231 = _mm512_sub_ps(in20, in18);
__m512 tmp235 = _mm512_sub_ps(in28, in26);
__m512 tmp232 = _mm512_add_ps(in18, in22);
__m512 tmp236 = _mm512_add_ps(in26, in30);
in16 = _mm512_sub_ps(in16, in22);
in24 = _mm512_sub_ps(in24, in30);
tmp230 = _mm512_fmadd_ps(in19, _mm512_set1_ps(-4.25e+00f), tmp230);
tmp234 = _mm512_fmadd_ps(in27, _mm512_set1_ps(-4.25e+00f), tmp234);
tmp232 = _mm512_fmadd_ps(in20, _mm512_set1_ps(-4.25e+00f), tmp232);
tmp236 = _mm512_fmadd_ps(in28, _mm512_set1_ps(-4.25e+00f), tmp236);
in16 = _mm512_fmadd_ps(tmp231, _mm512_set1_ps(5.25e+00f), in16);
in24 = _mm512_fmadd_ps(tmp235, _mm512_set1_ps(5.25e+00f), in24);
tmp231 = _mm512_fmadd_ps(in18, _mm512_set1_ps(2.5e-01f), in22);
tmp235 = _mm512_fmadd_ps(in26, _mm512_set1_ps(2.5e-01f), in30);
in18 = _mm512_fmadd_ps(in18, _mm512_set1_ps(4e+00f), in22);
in26 = _mm512_fmadd_ps(in26, _mm512_set1_ps(4e+00f), in30);
__m512 tmp233 = _mm512_sub_ps(tmp232, tmp230);
__m512 tmp237 = _mm512_sub_ps(tmp236, tmp234);
tmp232 = _mm512_add_ps(tmp230, tmp232);
tmp236 = _mm512_add_ps(tmp234, tmp236);
tmp230 = _mm512_fmadd_ps(in17, _mm512_set1_ps(2.5e-01f), in21);
tmp234 = _mm512_fmadd_ps(in25, _mm512_set1_ps(2.5e-01f), in29);
tmp231 = _mm512_fmadd_ps(in20, _mm512_set1_ps(-1.25e+00f), tmp231);
tmp235 = _mm512_fmadd_ps(in28, _mm512_set1_ps(-1.25e+00f), tmp235);
in20 = _mm512_fmadd_ps(in20, _mm512_set1_ps(-5e+00f), in18);
in28 = _mm512_fmadd_ps(in28, _mm512_set1_ps(-5e+00f), in26);
tmp230 = _mm512_fmadd_ps(in19, _mm512_set1_ps(-1.25e+00f), tmp230);
tmp234 = _mm512_fmadd_ps(in27, _mm512_set1_ps(-1.25e+00f), tmp234);
in22 = _mm512_fmadd_ps(tmp230, _mm512_set1_ps(2e+00f), tmp231);
in30 = _mm512_fmadd_ps(tmp234, _mm512_set1_ps(2e+00f), tmp235);
tmp231 = _mm512_fnmadd_ps(tmp230, _mm512_set1_ps(2e+00f), tmp231);
tmp235 = _mm512_fnmadd_ps(tmp234, _mm512_set1_ps(2e+00f), tmp235);
tmp230 = _mm512_fmadd_ps(in21, _mm512_set1_ps(2.5e-01f), in17);
tmp234 = _mm512_fmadd_ps(in29, _mm512_set1_ps(2.5e-01f), in25);
in17 = _mm512_sub_ps(in23, in17);
in25 = _mm512_sub_ps(in31, in25);
tmp230 = _mm512_fmadd_ps(in19, _mm512_set1_ps(-1.25e+00f), tmp230);
tmp234 = _mm512_fmadd_ps(in27, _mm512_set1_ps(-1.25e+00f), tmp234);
in19 = _mm512_sub_ps(in19, in21);
in27 = _mm512_sub_ps(in27, in29);
in19 = _mm512_fmadd_ps(in19, _mm512_set1_ps(5.25e+00f), in17);
in27 = _mm512_fmadd_ps(in27, _mm512_set1_ps(5.25e+00f), in25);
in18 = _mm512_fmadd_ps(tmp230, _mm512_set1_ps(2e+00f), in20);
in26 = _mm512_fmadd_ps(tmp234, _mm512_set1_ps(2e+00f), in28);
in20 = _mm512_fnmadd_ps(tmp230, _mm512_set1_ps(2e+00f), in20);
in28 = _mm512_fnmadd_ps(tmp234, _mm512_set1_ps(2e+00f), in28);
__m512 tmp246 = _mm512_unpacklo_ps(in16, tmp232);
__m512 tmp247 = _mm512_unpackhi_ps(in16, tmp232);
__m512 tmp248 = _mm512_unpacklo_ps(tmp233, in22);
__m512 tmp249 = _mm512_unpackhi_ps(tmp233, in22);
__m512 tmp250 = _mm512_unpacklo_ps(tmp231, in18);
__m512 tmp251 = _mm512_unpackhi_ps(tmp231, in18);
__m512 tmp252 = _mm512_unpacklo_ps(in20, in19);
__m512 tmp253 = _mm512_unpackhi_ps(in20, in19);
__m512 tmp254 = _mm512_unpacklo_ps(in24, tmp236);
__m512 tmp255 = _mm512_unpackhi_ps(in24, tmp236);
__m512 tmp256 = _mm512_unpacklo_ps(tmp237, in30);
__m512 tmp257 = _mm512_unpackhi_ps(tmp237, in30);
__m512 tmp258 = _mm512_unpacklo_ps(tmp235, in26);
__m512 tmp259 = _mm512_unpackhi_ps(tmp235, in26);
__m512 tmp260 = _mm512_unpacklo_ps(in28, in27);
__m512 tmp261 = _mm512_unpackhi_ps(in28, in27);
__m512 tmp262 = _mm512_shuffle_ps(tmp246, tmp248, 68);
__m512 tmp263 = _mm512_shuffle_ps(tmp246, tmp248, 238);
__m512 tmp264 = _mm512_shuffle_ps(tmp247, tmp249, 68);
__m512 tmp265 = _mm512_shuffle_ps(tmp247, tmp249, 238);
__m512 tmp266 = _mm512_shuffle_ps(tmp250, tmp252, 68);
__m512 tmp267 = _mm512_shuffle_ps(tmp250, tmp252, 238);
__m512 tmp268 = _mm512_shuffle_ps(tmp251, tmp253, 68);
__m512 tmp269 = _mm512_shuffle_ps(tmp251, tmp253, 238);
__m512 tmp270 = _mm512_shuffle_ps(tmp254, tmp256, 68);
__m512 tmp271 = _mm512_shuffle_ps(tmp254, tmp256, 238);
__m512 tmp272 = _mm512_shuffle_ps(tmp255, tmp257, 68);
__m512 tmp273 = _mm512_shuffle_ps(tmp255, tmp257, 238);
__m512 tmp274 = _mm512_shuffle_ps(tmp258, tmp260, 68);
__m512 tmp275 = _mm512_shuffle_ps(tmp258, tmp260, 238);
__m512 tmp276 = _mm512_shuffle_ps(tmp259, tmp261, 68);
__m512 tmp277 = _mm512_shuffle_ps(tmp259, tmp261, 238);
__m512 tmp278 = _mm512_shuffle_f32x4(tmp262, tmp266, 136);
__m512 tmp279 = _mm512_shuffle_f32x4(tmp262, tmp266, 221);
__m512 tmp280 = _mm512_shuffle_f32x4(tmp263, tmp267, 136);
__m512 tmp281 = _mm512_shuffle_f32x4(tmp263, tmp267, 221);
__m512 tmp282 = _mm512_shuffle_f32x4(tmp264, tmp268, 136);
__m512 tmp283 = _mm512_shuffle_f32x4(tmp264, tmp268, 221);
__m512 tmp284 = _mm512_shuffle_f32x4(tmp265, tmp269, 136);
__m512 tmp285 = _mm512_shuffle_f32x4(tmp265, tmp269, 221);
__m512 tmp286 = _mm512_shuffle_f32x4(tmp270, tmp274, 136);
__m512 tmp287 = _mm512_shuffle_f32x4(tmp270, tmp274, 221);
__m512 tmp288 = _mm512_shuffle_f32x4(tmp271, tmp275, 136);
__m512 tmp289 = _mm512_shuffle_f32x4(tmp271, tmp275, 221);
__m512 tmp290 = _mm512_shuffle_f32x4(tmp272, tmp276, 136);
__m512 tmp291 = _mm512_shuffle_f32x4(tmp272, tmp276, 221);
__m512 tmp292 = _mm512_shuffle_f32x4(tmp273, tmp277, 136);
__m512 tmp293 = _mm512_shuffle_f32x4(tmp273, tmp277, 221);
in16 = _mm512_shuffle_f32x4(tmp278, tmp286, 136);
in24 = _mm512_shuffle_f32x4(tmp278, tmp286, 221);
tmp232 = _mm512_shuffle_f32x4(tmp280, tmp288, 136);
tmp236 = _mm512_shuffle_f32x4(tmp280, tmp288, 221);
tmp233 = _mm512_shuffle_f32x4(tmp282, tmp290, 136);
tmp237 = _mm512_shuffle_f32x4(tmp282, tmp290, 221);
in22 = _mm512_shuffle_f32x4(tmp284, tmp292, 136);
in30 = _mm512_shuffle_f32x4(tmp284, tmp292, 221);
tmp231 = _mm512_shuffle_f32x4(tmp279, tmp287, 136);
tmp235 = _mm512_shuffle_f32x4(tmp279, tmp287, 221);
in18 = _mm512_shuffle_f32x4(tmp281, tmp289, 136);
in26 = _mm512_shuffle_f32x4(tmp281, tmp289, 221);
in20 = _mm512_shuffle_f32x4(tmp283, tmp291, 136);
in28 = _mm512_shuffle_f32x4(tmp283, tmp291, 221);
in19 = _mm512_shuffle_f32x4(tmp285, tmp293, 136);
in27 = _mm512_shuffle_f32x4(tmp285, tmp293, 221);
__m512 tmp238 = _mm512_add_ps(tmp232, in18);
__m512 tmp242 = _mm512_add_ps(tmp236, in26);
__m512 tmp239 = _mm512_sub_ps(tmp231, tmp233);
__m512 tmp243 = _mm512_sub_ps(tmp235, tmp237);
__m512 tmp240 = _mm512_add_ps(tmp233, in20);
__m512 tmp244 = _mm512_add_ps(tmp237, in28);
in16 = _mm512_sub_ps(in16, in20);
in24 = _mm512_sub_ps(in24, in28);
tmp238 = _mm512_fmadd_ps(in22, _mm512_set1_ps(-4.25e+00f), tmp238);
tmp242 = _mm512_fmadd_ps(in30, _mm512_set1_ps(-4.25e+00f), tmp242);
tmp240 = _mm512_fmadd_ps(tmp231, _mm512_set1_ps(-4.25e+00f), tmp240);
tmp244 = _mm512_fmadd_ps(tmp235, _mm512_set1_ps(-4.25e+00f), tmp244);
in16 = _mm512_fmadd_ps(tmp239, _mm512_set1_ps(5.25e+00f), in16);
in24 = _mm512_fmadd_ps(tmp243, _mm512_set1_ps(5.25e+00f), in24);
tmp239 = _mm512_fmadd_ps(tmp233, _mm512_set1_ps(2.5e-01f), in20);
tmp243 = _mm512_fmadd_ps(tmp237, _mm512_set1_ps(2.5e-01f), in28);
tmp233 = _mm512_fmadd_ps(tmp233, _mm512_set1_ps(4e+00f), in20);
tmp237 = _mm512_fmadd_ps(tmp237, _mm512_set1_ps(4e+00f), in28);
__m512 tmp241 = _mm512_sub_ps(tmp240, tmp238);
__m512 tmp245 = _mm512_sub_ps(tmp244, tmp242);
tmp240 = _mm512_add_ps(tmp238, tmp240);
tmp244 = _mm512_add_ps(tmp242, tmp244);
tmp238 = _mm512_fmadd_ps(tmp232, _mm512_set1_ps(2.5e-01f), in18);
tmp242 = _mm512_fmadd_ps(tmp236, _mm512_set1_ps(2.5e-01f), in26);
tmp239 = _mm512_fmadd_ps(tmp231, _mm512_set1_ps(-1.25e+00f), tmp239);
tmp243 = _mm512_fmadd_ps(tmp235, _mm512_set1_ps(-1.25e+00f), tmp243);
tmp231 = _mm512_fmadd_ps(tmp231, _mm512_set1_ps(-5e+00f), tmp233);
tmp235 = _mm512_fmadd_ps(tmp235, _mm512_set1_ps(-5e+00f), tmp237);
tmp238 = _mm512_fmadd_ps(in22, _mm512_set1_ps(-1.25e+00f), tmp238);
tmp242 = _mm512_fmadd_ps(in30, _mm512_set1_ps(-1.25e+00f), tmp242);
in20 = _mm512_fmadd_ps(tmp238, _mm512_set1_ps(2e+00f), tmp239);
in28 = _mm512_fmadd_ps(tmp242, _mm512_set1_ps(2e+00f), tmp243);
tmp239 = _mm512_fnmadd_ps(tmp238, _mm512_set1_ps(2e+00f), tmp239);
tmp243 = _mm512_fnmadd_ps(tmp242, _mm512_set1_ps(2e+00f), tmp243);
tmp238 = _mm512_fmadd_ps(in18, _mm512_set1_ps(2.5e-01f), tmp232);
tmp242 = _mm512_fmadd_ps(in26, _mm512_set1_ps(2.5e-01f), tmp236);
tmp232 = _mm512_sub_ps(in19, tmp232);
tmp236 = _mm512_sub_ps(in27, tmp236);
tmp238 = _mm512_fmadd_ps(in22, _mm512_set1_ps(-1.25e+00f), tmp238);
tmp242 = _mm512_fmadd_ps(in30, _mm512_set1_ps(-1.25e+00f), tmp242);
in22 = _mm512_sub_ps(in22, in18);
in30 = _mm512_sub_ps(in30, in26);
in22 = _mm512_fmadd_ps(in22, _mm512_set1_ps(5.25e+00f), tmp232);
in30 = _mm512_fmadd_ps(in30, _mm512_set1_ps(5.25e+00f), tmp236);
tmp233 = _mm512_fmadd_ps(tmp238, _mm512_set1_ps(2e+00f), tmp231);
tmp237 = _mm512_fmadd_ps(tmp242, _mm512_set1_ps(2e+00f), tmp235);
tmp231 = _mm512_fnmadd_ps(tmp238, _mm512_set1_ps(2e+00f), tmp231);
tmp235 = _mm512_fnmadd_ps(tmp242, _mm512_set1_ps(2e+00f), tmp235);
__m512 out69 = _mm512_shuffle_f32x4(in16, tmp240, 68);
__m512 out77 = _mm512_shuffle_f32x4(in16, tmp240, 238);
__m512 out70 = _mm512_shuffle_f32x4(tmp241, in20, 68);
__m512 out78 = _mm512_shuffle_f32x4(tmp241, in20, 238);
__m512 out71 = _mm512_shuffle_f32x4(tmp239, tmp233, 68);
__m512 out79 = _mm512_shuffle_f32x4(tmp239, tmp233, 238);
__m512 out72 = _mm512_shuffle_f32x4(tmp231, in22, 68);
__m512 out80 = _mm512_shuffle_f32x4(tmp231, in22, 238);
__m512 out73 = _mm512_shuffle_f32x4(in24, tmp244, 68);
__m512 out81 = _mm512_shuffle_f32x4(in24, tmp244, 238);
__m512 out74 = _mm512_shuffle_f32x4(tmp245, in28, 68);
__m512 out82 = _mm512_shuffle_f32x4(tmp245, in28, 238);
__m512 out75 = _mm512_shuffle_f32x4(tmp243, tmp237, 68);
__m512 out83 = _mm512_shuffle_f32x4(tmp243, tmp237, 238);
__m512 out76 = _mm512_shuffle_f32x4(tmp235, in30, 68);
__m512 out84 = _mm512_shuffle_f32x4(tmp235, in30, 238);
_mm512_storeu_ps(dfPtr1+0+2433024*i7+152064*j3+38016*s5+768*k5, out69);
_mm512_storeu_ps(dfPtr1+128+2433024*i7+152064*j3+38016*s5+768*k5, out77);
_mm512_storeu_ps(dfPtr1+64+2433024*i7+152064*j3+38016*s5+768*k5, out73);
_mm512_storeu_ps(dfPtr1+192+2433024*i7+152064*j3+38016*s5+768*k5, out81);
_mm512_storeu_ps(dfPtr1+608256+2433024*i7+152064*j3+38016*s5+768*k5, out70);
_mm512_storeu_ps(dfPtr1+608384+2433024*i7+152064*j3+38016*s5+768*k5, out78);
_mm512_storeu_ps(dfPtr1+608320+2433024*i7+152064*j3+38016*s5+768*k5, out74);
_mm512_storeu_ps(dfPtr1+608448+2433024*i7+152064*j3+38016*s5+768*k5, out82);
_mm512_storeu_ps(dfPtr1+1216512+2433024*i7+152064*j3+38016*s5+768*k5, out71);
_mm512_storeu_ps(dfPtr1+1216640+2433024*i7+152064*j3+38016*s5+768*k5, out79);
_mm512_storeu_ps(dfPtr1+1216576+2433024*i7+152064*j3+38016*s5+768*k5, out75);
_mm512_storeu_ps(dfPtr1+1216704+2433024*i7+152064*j3+38016*s5+768*k5, out83);
_mm512_storeu_ps(dfPtr1+1824768+2433024*i7+152064*j3+38016*s5+768*k5, out72);
_mm512_storeu_ps(dfPtr1+1824896+2433024*i7+152064*j3+38016*s5+768*k5, out80);
_mm512_storeu_ps(dfPtr1+1824832+2433024*i7+152064*j3+38016*s5+768*k5, out76);
_mm512_storeu_ps(dfPtr1+1824960+2433024*i7+152064*j3+38016*s5+768*k5, out84);
__m512 dat17 = _mm512_maskz_loadu_ps(16383, datPtr1+552+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat18 = _mm512_maskz_loadu_ps(16383, datPtr1+3496+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512i pm11 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in32 = _mm512_permutexvar_ps(pm11, dat17);
__m512 in40 = _mm512_permutexvar_ps(pm11, dat18);
__m512 dat19 = _mm512_maskz_loadu_ps(16383, datPtr1+644+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat20 = _mm512_maskz_loadu_ps(16383, datPtr1+3588+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in33 = _mm512_permutexvar_ps(pm11, dat19);
__m512 in41 = _mm512_permutexvar_ps(pm11, dat20);
__m512 dat21 = _mm512_maskz_loadu_ps(16383, datPtr1+736+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat22 = _mm512_maskz_loadu_ps(16383, datPtr1+3680+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in34 = _mm512_permutexvar_ps(pm11, dat21);
__m512 in42 = _mm512_permutexvar_ps(pm11, dat22);
__m512 dat23 = _mm512_maskz_loadu_ps(16383, datPtr1+828+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat24 = _mm512_maskz_loadu_ps(16383, datPtr1+3772+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in35 = _mm512_permutexvar_ps(pm11, dat23);
__m512 in43 = _mm512_permutexvar_ps(pm11, dat24);
__m512 dat25 = _mm512_maskz_loadu_ps(16383, datPtr1+920+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat26 = _mm512_maskz_loadu_ps(16383, datPtr1+3864+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in36 = _mm512_permutexvar_ps(pm11, dat25);
__m512 in44 = _mm512_permutexvar_ps(pm11, dat26);
__m512 dat27 = _mm512_maskz_loadu_ps(16383, datPtr1+1012+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat28 = _mm512_maskz_loadu_ps(16383, datPtr1+3956+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in37 = _mm512_permutexvar_ps(pm11, dat27);
__m512 in45 = _mm512_permutexvar_ps(pm11, dat28);
__m512 dat29 = _mm512_maskz_loadu_ps(16383, datPtr1+1104+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat30 = _mm512_maskz_loadu_ps(16383, datPtr1+4048+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in38 = _mm512_permutexvar_ps(pm11, dat29);
__m512 in46 = _mm512_permutexvar_ps(pm11, dat30);
__m512 dat31 = _mm512_maskz_loadu_ps(16383, datPtr1+1196+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat32 = _mm512_maskz_loadu_ps(16383, datPtr1+4140+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in39 = _mm512_permutexvar_ps(pm11, dat31);
__m512 in47 = _mm512_permutexvar_ps(pm11, dat32);
__m512 tmp294 = _mm512_add_ps(in33, in37);
__m512 tmp298 = _mm512_add_ps(in41, in45);
__m512 tmp295 = _mm512_sub_ps(in36, in34);
__m512 tmp299 = _mm512_sub_ps(in44, in42);
__m512 tmp296 = _mm512_add_ps(in34, in38);
__m512 tmp300 = _mm512_add_ps(in42, in46);
in32 = _mm512_sub_ps(in32, in38);
in40 = _mm512_sub_ps(in40, in46);
tmp294 = _mm512_fmadd_ps(in35, _mm512_set1_ps(-4.25e+00f), tmp294);
tmp298 = _mm512_fmadd_ps(in43, _mm512_set1_ps(-4.25e+00f), tmp298);
tmp296 = _mm512_fmadd_ps(in36, _mm512_set1_ps(-4.25e+00f), tmp296);
tmp300 = _mm512_fmadd_ps(in44, _mm512_set1_ps(-4.25e+00f), tmp300);
in32 = _mm512_fmadd_ps(tmp295, _mm512_set1_ps(5.25e+00f), in32);
in40 = _mm512_fmadd_ps(tmp299, _mm512_set1_ps(5.25e+00f), in40);
tmp295 = _mm512_fmadd_ps(in34, _mm512_set1_ps(2.5e-01f), in38);
tmp299 = _mm512_fmadd_ps(in42, _mm512_set1_ps(2.5e-01f), in46);
in34 = _mm512_fmadd_ps(in34, _mm512_set1_ps(4e+00f), in38);
in42 = _mm512_fmadd_ps(in42, _mm512_set1_ps(4e+00f), in46);
__m512 tmp297 = _mm512_sub_ps(tmp296, tmp294);
__m512 tmp301 = _mm512_sub_ps(tmp300, tmp298);
tmp296 = _mm512_add_ps(tmp294, tmp296);
tmp300 = _mm512_add_ps(tmp298, tmp300);
tmp294 = _mm512_fmadd_ps(in33, _mm512_set1_ps(2.5e-01f), in37);
tmp298 = _mm512_fmadd_ps(in41, _mm512_set1_ps(2.5e-01f), in45);
tmp295 = _mm512_fmadd_ps(in36, _mm512_set1_ps(-1.25e+00f), tmp295);
tmp299 = _mm512_fmadd_ps(in44, _mm512_set1_ps(-1.25e+00f), tmp299);
in36 = _mm512_fmadd_ps(in36, _mm512_set1_ps(-5e+00f), in34);
in44 = _mm512_fmadd_ps(in44, _mm512_set1_ps(-5e+00f), in42);
tmp294 = _mm512_fmadd_ps(in35, _mm512_set1_ps(-1.25e+00f), tmp294);
tmp298 = _mm512_fmadd_ps(in43, _mm512_set1_ps(-1.25e+00f), tmp298);
in38 = _mm512_fmadd_ps(tmp294, _mm512_set1_ps(2e+00f), tmp295);
in46 = _mm512_fmadd_ps(tmp298, _mm512_set1_ps(2e+00f), tmp299);
tmp295 = _mm512_fnmadd_ps(tmp294, _mm512_set1_ps(2e+00f), tmp295);
tmp299 = _mm512_fnmadd_ps(tmp298, _mm512_set1_ps(2e+00f), tmp299);
tmp294 = _mm512_fmadd_ps(in37, _mm512_set1_ps(2.5e-01f), in33);
tmp298 = _mm512_fmadd_ps(in45, _mm512_set1_ps(2.5e-01f), in41);
in33 = _mm512_sub_ps(in39, in33);
in41 = _mm512_sub_ps(in47, in41);
tmp294 = _mm512_fmadd_ps(in35, _mm512_set1_ps(-1.25e+00f), tmp294);
tmp298 = _mm512_fmadd_ps(in43, _mm512_set1_ps(-1.25e+00f), tmp298);
in35 = _mm512_sub_ps(in35, in37);
in43 = _mm512_sub_ps(in43, in45);
in35 = _mm512_fmadd_ps(in35, _mm512_set1_ps(5.25e+00f), in33);
in43 = _mm512_fmadd_ps(in43, _mm512_set1_ps(5.25e+00f), in41);
in34 = _mm512_fmadd_ps(tmp294, _mm512_set1_ps(2e+00f), in36);
in42 = _mm512_fmadd_ps(tmp298, _mm512_set1_ps(2e+00f), in44);
in36 = _mm512_fnmadd_ps(tmp294, _mm512_set1_ps(2e+00f), in36);
in44 = _mm512_fnmadd_ps(tmp298, _mm512_set1_ps(2e+00f), in44);
__m512 tmp310 = _mm512_unpacklo_ps(in32, tmp296);
__m512 tmp311 = _mm512_unpackhi_ps(in32, tmp296);
__m512 tmp312 = _mm512_unpacklo_ps(tmp297, in38);
__m512 tmp313 = _mm512_unpackhi_ps(tmp297, in38);
__m512 tmp314 = _mm512_unpacklo_ps(tmp295, in34);
__m512 tmp315 = _mm512_unpackhi_ps(tmp295, in34);
__m512 tmp316 = _mm512_unpacklo_ps(in36, in35);
__m512 tmp317 = _mm512_unpackhi_ps(in36, in35);
__m512 tmp318 = _mm512_unpacklo_ps(in40, tmp300);
__m512 tmp319 = _mm512_unpackhi_ps(in40, tmp300);
__m512 tmp320 = _mm512_unpacklo_ps(tmp301, in46);
__m512 tmp321 = _mm512_unpackhi_ps(tmp301, in46);
__m512 tmp322 = _mm512_unpacklo_ps(tmp299, in42);
__m512 tmp323 = _mm512_unpackhi_ps(tmp299, in42);
__m512 tmp324 = _mm512_unpacklo_ps(in44, in43);
__m512 tmp325 = _mm512_unpackhi_ps(in44, in43);
__m512 tmp326 = _mm512_shuffle_ps(tmp310, tmp312, 68);
__m512 tmp327 = _mm512_shuffle_ps(tmp310, tmp312, 238);
__m512 tmp328 = _mm512_shuffle_ps(tmp311, tmp313, 68);
__m512 tmp329 = _mm512_shuffle_ps(tmp311, tmp313, 238);
__m512 tmp330 = _mm512_shuffle_ps(tmp314, tmp316, 68);
__m512 tmp331 = _mm512_shuffle_ps(tmp314, tmp316, 238);
__m512 tmp332 = _mm512_shuffle_ps(tmp315, tmp317, 68);
__m512 tmp333 = _mm512_shuffle_ps(tmp315, tmp317, 238);
__m512 tmp334 = _mm512_shuffle_ps(tmp318, tmp320, 68);
__m512 tmp335 = _mm512_shuffle_ps(tmp318, tmp320, 238);
__m512 tmp336 = _mm512_shuffle_ps(tmp319, tmp321, 68);
__m512 tmp337 = _mm512_shuffle_ps(tmp319, tmp321, 238);
__m512 tmp338 = _mm512_shuffle_ps(tmp322, tmp324, 68);
__m512 tmp339 = _mm512_shuffle_ps(tmp322, tmp324, 238);
__m512 tmp340 = _mm512_shuffle_ps(tmp323, tmp325, 68);
__m512 tmp341 = _mm512_shuffle_ps(tmp323, tmp325, 238);
__m512 tmp342 = _mm512_shuffle_f32x4(tmp326, tmp330, 136);
__m512 tmp343 = _mm512_shuffle_f32x4(tmp326, tmp330, 221);
__m512 tmp344 = _mm512_shuffle_f32x4(tmp327, tmp331, 136);
__m512 tmp345 = _mm512_shuffle_f32x4(tmp327, tmp331, 221);
__m512 tmp346 = _mm512_shuffle_f32x4(tmp328, tmp332, 136);
__m512 tmp347 = _mm512_shuffle_f32x4(tmp328, tmp332, 221);
__m512 tmp348 = _mm512_shuffle_f32x4(tmp329, tmp333, 136);
__m512 tmp349 = _mm512_shuffle_f32x4(tmp329, tmp333, 221);
__m512 tmp350 = _mm512_shuffle_f32x4(tmp334, tmp338, 136);
__m512 tmp351 = _mm512_shuffle_f32x4(tmp334, tmp338, 221);
__m512 tmp352 = _mm512_shuffle_f32x4(tmp335, tmp339, 136);
__m512 tmp353 = _mm512_shuffle_f32x4(tmp335, tmp339, 221);
__m512 tmp354 = _mm512_shuffle_f32x4(tmp336, tmp340, 136);
__m512 tmp355 = _mm512_shuffle_f32x4(tmp336, tmp340, 221);
__m512 tmp356 = _mm512_shuffle_f32x4(tmp337, tmp341, 136);
__m512 tmp357 = _mm512_shuffle_f32x4(tmp337, tmp341, 221);
in32 = _mm512_shuffle_f32x4(tmp342, tmp350, 136);
in40 = _mm512_shuffle_f32x4(tmp342, tmp350, 221);
tmp296 = _mm512_shuffle_f32x4(tmp344, tmp352, 136);
tmp300 = _mm512_shuffle_f32x4(tmp344, tmp352, 221);
tmp297 = _mm512_shuffle_f32x4(tmp346, tmp354, 136);
tmp301 = _mm512_shuffle_f32x4(tmp346, tmp354, 221);
in38 = _mm512_shuffle_f32x4(tmp348, tmp356, 136);
in46 = _mm512_shuffle_f32x4(tmp348, tmp356, 221);
tmp295 = _mm512_shuffle_f32x4(tmp343, tmp351, 136);
tmp299 = _mm512_shuffle_f32x4(tmp343, tmp351, 221);
in34 = _mm512_shuffle_f32x4(tmp345, tmp353, 136);
in42 = _mm512_shuffle_f32x4(tmp345, tmp353, 221);
in36 = _mm512_shuffle_f32x4(tmp347, tmp355, 136);
in44 = _mm512_shuffle_f32x4(tmp347, tmp355, 221);
in35 = _mm512_shuffle_f32x4(tmp349, tmp357, 136);
in43 = _mm512_shuffle_f32x4(tmp349, tmp357, 221);
__m512 tmp302 = _mm512_add_ps(tmp296, in34);
__m512 tmp306 = _mm512_add_ps(tmp300, in42);
__m512 tmp303 = _mm512_sub_ps(tmp295, tmp297);
__m512 tmp307 = _mm512_sub_ps(tmp299, tmp301);
__m512 tmp304 = _mm512_add_ps(tmp297, in36);
__m512 tmp308 = _mm512_add_ps(tmp301, in44);
in32 = _mm512_sub_ps(in32, in36);
in40 = _mm512_sub_ps(in40, in44);
tmp302 = _mm512_fmadd_ps(in38, _mm512_set1_ps(-4.25e+00f), tmp302);
tmp306 = _mm512_fmadd_ps(in46, _mm512_set1_ps(-4.25e+00f), tmp306);
tmp304 = _mm512_fmadd_ps(tmp295, _mm512_set1_ps(-4.25e+00f), tmp304);
tmp308 = _mm512_fmadd_ps(tmp299, _mm512_set1_ps(-4.25e+00f), tmp308);
in32 = _mm512_fmadd_ps(tmp303, _mm512_set1_ps(5.25e+00f), in32);
in40 = _mm512_fmadd_ps(tmp307, _mm512_set1_ps(5.25e+00f), in40);
tmp303 = _mm512_fmadd_ps(tmp297, _mm512_set1_ps(2.5e-01f), in36);
tmp307 = _mm512_fmadd_ps(tmp301, _mm512_set1_ps(2.5e-01f), in44);
tmp297 = _mm512_fmadd_ps(tmp297, _mm512_set1_ps(4e+00f), in36);
tmp301 = _mm512_fmadd_ps(tmp301, _mm512_set1_ps(4e+00f), in44);
__m512 tmp305 = _mm512_sub_ps(tmp304, tmp302);
__m512 tmp309 = _mm512_sub_ps(tmp308, tmp306);
tmp304 = _mm512_add_ps(tmp302, tmp304);
tmp308 = _mm512_add_ps(tmp306, tmp308);
tmp302 = _mm512_fmadd_ps(tmp296, _mm512_set1_ps(2.5e-01f), in34);
tmp306 = _mm512_fmadd_ps(tmp300, _mm512_set1_ps(2.5e-01f), in42);
tmp303 = _mm512_fmadd_ps(tmp295, _mm512_set1_ps(-1.25e+00f), tmp303);
tmp307 = _mm512_fmadd_ps(tmp299, _mm512_set1_ps(-1.25e+00f), tmp307);
tmp295 = _mm512_fmadd_ps(tmp295, _mm512_set1_ps(-5e+00f), tmp297);
tmp299 = _mm512_fmadd_ps(tmp299, _mm512_set1_ps(-5e+00f), tmp301);
tmp302 = _mm512_fmadd_ps(in38, _mm512_set1_ps(-1.25e+00f), tmp302);
tmp306 = _mm512_fmadd_ps(in46, _mm512_set1_ps(-1.25e+00f), tmp306);
in36 = _mm512_fmadd_ps(tmp302, _mm512_set1_ps(2e+00f), tmp303);
in44 = _mm512_fmadd_ps(tmp306, _mm512_set1_ps(2e+00f), tmp307);
tmp303 = _mm512_fnmadd_ps(tmp302, _mm512_set1_ps(2e+00f), tmp303);
tmp307 = _mm512_fnmadd_ps(tmp306, _mm512_set1_ps(2e+00f), tmp307);
tmp302 = _mm512_fmadd_ps(in34, _mm512_set1_ps(2.5e-01f), tmp296);
tmp306 = _mm512_fmadd_ps(in42, _mm512_set1_ps(2.5e-01f), tmp300);
tmp296 = _mm512_sub_ps(in35, tmp296);
tmp300 = _mm512_sub_ps(in43, tmp300);
tmp302 = _mm512_fmadd_ps(in38, _mm512_set1_ps(-1.25e+00f), tmp302);
tmp306 = _mm512_fmadd_ps(in46, _mm512_set1_ps(-1.25e+00f), tmp306);
in38 = _mm512_sub_ps(in38, in34);
in46 = _mm512_sub_ps(in46, in42);
in38 = _mm512_fmadd_ps(in38, _mm512_set1_ps(5.25e+00f), tmp296);
in46 = _mm512_fmadd_ps(in46, _mm512_set1_ps(5.25e+00f), tmp300);
tmp297 = _mm512_fmadd_ps(tmp302, _mm512_set1_ps(2e+00f), tmp295);
tmp301 = _mm512_fmadd_ps(tmp306, _mm512_set1_ps(2e+00f), tmp299);
tmp295 = _mm512_fnmadd_ps(tmp302, _mm512_set1_ps(2e+00f), tmp295);
tmp299 = _mm512_fnmadd_ps(tmp306, _mm512_set1_ps(2e+00f), tmp299);
__m512 out85 = _mm512_shuffle_f32x4(in32, tmp304, 68);
__m512 out93 = _mm512_shuffle_f32x4(in32, tmp304, 238);
__m512 out86 = _mm512_shuffle_f32x4(tmp305, in36, 68);
__m512 out94 = _mm512_shuffle_f32x4(tmp305, in36, 238);
__m512 out87 = _mm512_shuffle_f32x4(tmp303, tmp297, 68);
__m512 out95 = _mm512_shuffle_f32x4(tmp303, tmp297, 238);
__m512 out88 = _mm512_shuffle_f32x4(tmp295, in38, 68);
__m512 out96 = _mm512_shuffle_f32x4(tmp295, in38, 238);
__m512 out89 = _mm512_shuffle_f32x4(in40, tmp308, 68);
__m512 out97 = _mm512_shuffle_f32x4(in40, tmp308, 238);
__m512 out90 = _mm512_shuffle_f32x4(tmp309, in44, 68);
__m512 out98 = _mm512_shuffle_f32x4(tmp309, in44, 238);
__m512 out91 = _mm512_shuffle_f32x4(tmp307, tmp301, 68);
__m512 out99 = _mm512_shuffle_f32x4(tmp307, tmp301, 238);
__m512 out92 = _mm512_shuffle_f32x4(tmp299, in46, 68);
__m512 out100 = _mm512_shuffle_f32x4(tmp299, in46, 238);
_mm512_storeu_ps(dfPtr1+256+2433024*i7+152064*j3+38016*s5+768*k5, out85);
_mm512_storeu_ps(dfPtr1+384+2433024*i7+152064*j3+38016*s5+768*k5, out93);
_mm512_storeu_ps(dfPtr1+320+2433024*i7+152064*j3+38016*s5+768*k5, out89);
_mm512_storeu_ps(dfPtr1+448+2433024*i7+152064*j3+38016*s5+768*k5, out97);
_mm512_storeu_ps(dfPtr1+608512+2433024*i7+152064*j3+38016*s5+768*k5, out86);
_mm512_storeu_ps(dfPtr1+608640+2433024*i7+152064*j3+38016*s5+768*k5, out94);
_mm512_storeu_ps(dfPtr1+608576+2433024*i7+152064*j3+38016*s5+768*k5, out90);
_mm512_storeu_ps(dfPtr1+608704+2433024*i7+152064*j3+38016*s5+768*k5, out98);
_mm512_storeu_ps(dfPtr1+1216768+2433024*i7+152064*j3+38016*s5+768*k5, out87);
_mm512_storeu_ps(dfPtr1+1216896+2433024*i7+152064*j3+38016*s5+768*k5, out95);
_mm512_storeu_ps(dfPtr1+1216832+2433024*i7+152064*j3+38016*s5+768*k5, out91);
_mm512_storeu_ps(dfPtr1+1216960+2433024*i7+152064*j3+38016*s5+768*k5, out99);
_mm512_storeu_ps(dfPtr1+1825024+2433024*i7+152064*j3+38016*s5+768*k5, out88);
_mm512_storeu_ps(dfPtr1+1825152+2433024*i7+152064*j3+38016*s5+768*k5, out96);
_mm512_storeu_ps(dfPtr1+1825088+2433024*i7+152064*j3+38016*s5+768*k5, out92);
_mm512_storeu_ps(dfPtr1+1825216+2433024*i7+152064*j3+38016*s5+768*k5, out100);
__m512 dat33 = _mm512_maskz_loadu_ps(2047, datPtr1+3544+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat34 = _mm512_maskz_loadu_ps(16383, datPtr1+4048+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512i pm12 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in48 = _mm512_permutexvar_ps(pm12, dat33);
__m512i pm13 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in56 = _mm512_permutexvar_ps(pm13, dat34);
__m512 dat35 = _mm512_maskz_loadu_ps(2047, datPtr1+3636+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat36 = _mm512_maskz_loadu_ps(16383, datPtr1+4140+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in49 = _mm512_permutexvar_ps(pm12, dat35);
__m512 in57 = _mm512_permutexvar_ps(pm13, dat36);
__m512 dat37 = _mm512_maskz_loadu_ps(2047, datPtr1+3728+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat38 = _mm512_maskz_loadu_ps(16383, datPtr1+4232+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in50 = _mm512_permutexvar_ps(pm12, dat37);
__m512 in58 = _mm512_permutexvar_ps(pm13, dat38);
__m512 dat39 = _mm512_maskz_loadu_ps(2047, datPtr1+3820+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat40 = _mm512_maskz_loadu_ps(16383, datPtr1+4324+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in51 = _mm512_permutexvar_ps(pm12, dat39);
__m512 in59 = _mm512_permutexvar_ps(pm13, dat40);
__m512 dat41 = _mm512_maskz_loadu_ps(2047, datPtr1+3912+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat42 = _mm512_maskz_loadu_ps(16383, datPtr1+4416+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in52 = _mm512_permutexvar_ps(pm12, dat41);
__m512 in60 = _mm512_permutexvar_ps(pm13, dat42);
__m512 dat43 = _mm512_maskz_loadu_ps(2047, datPtr1+4004+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat44 = _mm512_maskz_loadu_ps(16383, datPtr1+4508+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in53 = _mm512_permutexvar_ps(pm12, dat43);
__m512 in61 = _mm512_permutexvar_ps(pm13, dat44);
__m512 dat45 = _mm512_maskz_loadu_ps(2047, datPtr1+4096+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat46 = _mm512_maskz_loadu_ps(16383, datPtr1+4600+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in54 = _mm512_permutexvar_ps(pm12, dat45);
__m512 in62 = _mm512_permutexvar_ps(pm13, dat46);
__m512 dat47 = _mm512_maskz_loadu_ps(2047, datPtr1+4188+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat48 = _mm512_maskz_loadu_ps(16383, datPtr1+4692+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in55 = _mm512_permutexvar_ps(pm12, dat47);
__m512 in63 = _mm512_permutexvar_ps(pm13, dat48);
__m512 tmp358 = _mm512_add_ps(in49, in53);
__m512 tmp362 = _mm512_add_ps(in57, in61);
__m512 tmp359 = _mm512_sub_ps(in52, in50);
__m512 tmp363 = _mm512_sub_ps(in60, in58);
__m512 tmp360 = _mm512_add_ps(in50, in54);
__m512 tmp364 = _mm512_add_ps(in58, in62);
in48 = _mm512_sub_ps(in48, in54);
in56 = _mm512_sub_ps(in56, in62);
tmp358 = _mm512_fmadd_ps(in51, _mm512_set1_ps(-4.25e+00f), tmp358);
tmp362 = _mm512_fmadd_ps(in59, _mm512_set1_ps(-4.25e+00f), tmp362);
tmp360 = _mm512_fmadd_ps(in52, _mm512_set1_ps(-4.25e+00f), tmp360);
tmp364 = _mm512_fmadd_ps(in60, _mm512_set1_ps(-4.25e+00f), tmp364);
in48 = _mm512_fmadd_ps(tmp359, _mm512_set1_ps(5.25e+00f), in48);
in56 = _mm512_fmadd_ps(tmp363, _mm512_set1_ps(5.25e+00f), in56);
tmp359 = _mm512_fmadd_ps(in50, _mm512_set1_ps(2.5e-01f), in54);
tmp363 = _mm512_fmadd_ps(in58, _mm512_set1_ps(2.5e-01f), in62);
in50 = _mm512_fmadd_ps(in50, _mm512_set1_ps(4e+00f), in54);
in58 = _mm512_fmadd_ps(in58, _mm512_set1_ps(4e+00f), in62);
__m512 tmp361 = _mm512_sub_ps(tmp360, tmp358);
__m512 tmp365 = _mm512_sub_ps(tmp364, tmp362);
tmp360 = _mm512_add_ps(tmp358, tmp360);
tmp364 = _mm512_add_ps(tmp362, tmp364);
tmp358 = _mm512_fmadd_ps(in49, _mm512_set1_ps(2.5e-01f), in53);
tmp362 = _mm512_fmadd_ps(in57, _mm512_set1_ps(2.5e-01f), in61);
tmp359 = _mm512_fmadd_ps(in52, _mm512_set1_ps(-1.25e+00f), tmp359);
tmp363 = _mm512_fmadd_ps(in60, _mm512_set1_ps(-1.25e+00f), tmp363);
in52 = _mm512_fmadd_ps(in52, _mm512_set1_ps(-5e+00f), in50);
in60 = _mm512_fmadd_ps(in60, _mm512_set1_ps(-5e+00f), in58);
tmp358 = _mm512_fmadd_ps(in51, _mm512_set1_ps(-1.25e+00f), tmp358);
tmp362 = _mm512_fmadd_ps(in59, _mm512_set1_ps(-1.25e+00f), tmp362);
in54 = _mm512_fmadd_ps(tmp358, _mm512_set1_ps(2e+00f), tmp359);
in62 = _mm512_fmadd_ps(tmp362, _mm512_set1_ps(2e+00f), tmp363);
tmp359 = _mm512_fnmadd_ps(tmp358, _mm512_set1_ps(2e+00f), tmp359);
tmp363 = _mm512_fnmadd_ps(tmp362, _mm512_set1_ps(2e+00f), tmp363);
tmp358 = _mm512_fmadd_ps(in53, _mm512_set1_ps(2.5e-01f), in49);
tmp362 = _mm512_fmadd_ps(in61, _mm512_set1_ps(2.5e-01f), in57);
in49 = _mm512_sub_ps(in55, in49);
in57 = _mm512_sub_ps(in63, in57);
tmp358 = _mm512_fmadd_ps(in51, _mm512_set1_ps(-1.25e+00f), tmp358);
tmp362 = _mm512_fmadd_ps(in59, _mm512_set1_ps(-1.25e+00f), tmp362);
in51 = _mm512_sub_ps(in51, in53);
in59 = _mm512_sub_ps(in59, in61);
in51 = _mm512_fmadd_ps(in51, _mm512_set1_ps(5.25e+00f), in49);
in59 = _mm512_fmadd_ps(in59, _mm512_set1_ps(5.25e+00f), in57);
in50 = _mm512_fmadd_ps(tmp358, _mm512_set1_ps(2e+00f), in52);
in58 = _mm512_fmadd_ps(tmp362, _mm512_set1_ps(2e+00f), in60);
in52 = _mm512_fnmadd_ps(tmp358, _mm512_set1_ps(2e+00f), in52);
in60 = _mm512_fnmadd_ps(tmp362, _mm512_set1_ps(2e+00f), in60);
__m512 tmp374 = _mm512_unpacklo_ps(in48, tmp360);
__m512 tmp375 = _mm512_unpackhi_ps(in48, tmp360);
__m512 tmp376 = _mm512_unpacklo_ps(tmp361, in54);
__m512 tmp377 = _mm512_unpackhi_ps(tmp361, in54);
__m512 tmp378 = _mm512_unpacklo_ps(tmp359, in50);
__m512 tmp379 = _mm512_unpackhi_ps(tmp359, in50);
__m512 tmp380 = _mm512_unpacklo_ps(in52, in51);
__m512 tmp381 = _mm512_unpackhi_ps(in52, in51);
__m512 tmp382 = _mm512_unpacklo_ps(in56, tmp364);
__m512 tmp383 = _mm512_unpackhi_ps(in56, tmp364);
__m512 tmp384 = _mm512_unpacklo_ps(tmp365, in62);
__m512 tmp385 = _mm512_unpackhi_ps(tmp365, in62);
__m512 tmp386 = _mm512_unpacklo_ps(tmp363, in58);
__m512 tmp387 = _mm512_unpackhi_ps(tmp363, in58);
__m512 tmp388 = _mm512_unpacklo_ps(in60, in59);
__m512 tmp389 = _mm512_unpackhi_ps(in60, in59);
__m512 tmp390 = _mm512_shuffle_ps(tmp374, tmp376, 68);
__m512 tmp391 = _mm512_shuffle_ps(tmp374, tmp376, 238);
__m512 tmp392 = _mm512_shuffle_ps(tmp375, tmp377, 68);
__m512 tmp393 = _mm512_shuffle_ps(tmp375, tmp377, 238);
__m512 tmp394 = _mm512_shuffle_ps(tmp378, tmp380, 68);
__m512 tmp395 = _mm512_shuffle_ps(tmp378, tmp380, 238);
__m512 tmp396 = _mm512_shuffle_ps(tmp379, tmp381, 68);
__m512 tmp397 = _mm512_shuffle_ps(tmp379, tmp381, 238);
__m512 tmp398 = _mm512_shuffle_ps(tmp382, tmp384, 68);
__m512 tmp399 = _mm512_shuffle_ps(tmp382, tmp384, 238);
__m512 tmp400 = _mm512_shuffle_ps(tmp383, tmp385, 68);
__m512 tmp401 = _mm512_shuffle_ps(tmp383, tmp385, 238);
__m512 tmp402 = _mm512_shuffle_ps(tmp386, tmp388, 68);
__m512 tmp403 = _mm512_shuffle_ps(tmp386, tmp388, 238);
__m512 tmp404 = _mm512_shuffle_ps(tmp387, tmp389, 68);
__m512 tmp405 = _mm512_shuffle_ps(tmp387, tmp389, 238);
__m512 tmp406 = _mm512_shuffle_f32x4(tmp390, tmp394, 136);
__m512 tmp407 = _mm512_shuffle_f32x4(tmp390, tmp394, 221);
__m512 tmp408 = _mm512_shuffle_f32x4(tmp391, tmp395, 136);
__m512 tmp409 = _mm512_shuffle_f32x4(tmp391, tmp395, 221);
__m512 tmp410 = _mm512_shuffle_f32x4(tmp392, tmp396, 136);
__m512 tmp411 = _mm512_shuffle_f32x4(tmp392, tmp396, 221);
__m512 tmp412 = _mm512_shuffle_f32x4(tmp393, tmp397, 136);
__m512 tmp413 = _mm512_shuffle_f32x4(tmp393, tmp397, 221);
__m512 tmp414 = _mm512_shuffle_f32x4(tmp398, tmp402, 136);
__m512 tmp415 = _mm512_shuffle_f32x4(tmp398, tmp402, 221);
__m512 tmp416 = _mm512_shuffle_f32x4(tmp399, tmp403, 136);
__m512 tmp417 = _mm512_shuffle_f32x4(tmp399, tmp403, 221);
__m512 tmp418 = _mm512_shuffle_f32x4(tmp400, tmp404, 136);
__m512 tmp419 = _mm512_shuffle_f32x4(tmp400, tmp404, 221);
__m512 tmp420 = _mm512_shuffle_f32x4(tmp401, tmp405, 136);
__m512 tmp421 = _mm512_shuffle_f32x4(tmp401, tmp405, 221);
in48 = _mm512_shuffle_f32x4(tmp406, tmp414, 136);
in56 = _mm512_shuffle_f32x4(tmp406, tmp414, 221);
tmp360 = _mm512_shuffle_f32x4(tmp408, tmp416, 136);
tmp364 = _mm512_shuffle_f32x4(tmp408, tmp416, 221);
tmp361 = _mm512_shuffle_f32x4(tmp410, tmp418, 136);
tmp365 = _mm512_shuffle_f32x4(tmp410, tmp418, 221);
in54 = _mm512_shuffle_f32x4(tmp412, tmp420, 136);
in62 = _mm512_shuffle_f32x4(tmp412, tmp420, 221);
tmp359 = _mm512_shuffle_f32x4(tmp407, tmp415, 136);
tmp363 = _mm512_shuffle_f32x4(tmp407, tmp415, 221);
in50 = _mm512_shuffle_f32x4(tmp409, tmp417, 136);
in58 = _mm512_shuffle_f32x4(tmp409, tmp417, 221);
in52 = _mm512_shuffle_f32x4(tmp411, tmp419, 136);
in60 = _mm512_shuffle_f32x4(tmp411, tmp419, 221);
in51 = _mm512_shuffle_f32x4(tmp413, tmp421, 136);
in59 = _mm512_shuffle_f32x4(tmp413, tmp421, 221);
__m512 tmp366 = _mm512_add_ps(tmp360, in50);
__m512 tmp370 = _mm512_add_ps(tmp364, in58);
__m512 tmp367 = _mm512_sub_ps(tmp359, tmp361);
__m512 tmp371 = _mm512_sub_ps(tmp363, tmp365);
__m512 tmp368 = _mm512_add_ps(tmp361, in52);
__m512 tmp372 = _mm512_add_ps(tmp365, in60);
in48 = _mm512_sub_ps(in48, in52);
in56 = _mm512_sub_ps(in56, in60);
tmp366 = _mm512_fmadd_ps(in54, _mm512_set1_ps(-4.25e+00f), tmp366);
tmp370 = _mm512_fmadd_ps(in62, _mm512_set1_ps(-4.25e+00f), tmp370);
tmp368 = _mm512_fmadd_ps(tmp359, _mm512_set1_ps(-4.25e+00f), tmp368);
tmp372 = _mm512_fmadd_ps(tmp363, _mm512_set1_ps(-4.25e+00f), tmp372);
in48 = _mm512_fmadd_ps(tmp367, _mm512_set1_ps(5.25e+00f), in48);
in56 = _mm512_fmadd_ps(tmp371, _mm512_set1_ps(5.25e+00f), in56);
tmp367 = _mm512_fmadd_ps(tmp361, _mm512_set1_ps(2.5e-01f), in52);
tmp371 = _mm512_fmadd_ps(tmp365, _mm512_set1_ps(2.5e-01f), in60);
tmp361 = _mm512_fmadd_ps(tmp361, _mm512_set1_ps(4e+00f), in52);
tmp365 = _mm512_fmadd_ps(tmp365, _mm512_set1_ps(4e+00f), in60);
__m512 tmp369 = _mm512_sub_ps(tmp368, tmp366);
__m512 tmp373 = _mm512_sub_ps(tmp372, tmp370);
tmp368 = _mm512_add_ps(tmp366, tmp368);
tmp372 = _mm512_add_ps(tmp370, tmp372);
tmp366 = _mm512_fmadd_ps(tmp360, _mm512_set1_ps(2.5e-01f), in50);
tmp370 = _mm512_fmadd_ps(tmp364, _mm512_set1_ps(2.5e-01f), in58);
tmp367 = _mm512_fmadd_ps(tmp359, _mm512_set1_ps(-1.25e+00f), tmp367);
tmp371 = _mm512_fmadd_ps(tmp363, _mm512_set1_ps(-1.25e+00f), tmp371);
tmp359 = _mm512_fmadd_ps(tmp359, _mm512_set1_ps(-5e+00f), tmp361);
tmp363 = _mm512_fmadd_ps(tmp363, _mm512_set1_ps(-5e+00f), tmp365);
tmp366 = _mm512_fmadd_ps(in54, _mm512_set1_ps(-1.25e+00f), tmp366);
tmp370 = _mm512_fmadd_ps(in62, _mm512_set1_ps(-1.25e+00f), tmp370);
in52 = _mm512_fmadd_ps(tmp366, _mm512_set1_ps(2e+00f), tmp367);
in60 = _mm512_fmadd_ps(tmp370, _mm512_set1_ps(2e+00f), tmp371);
tmp367 = _mm512_fnmadd_ps(tmp366, _mm512_set1_ps(2e+00f), tmp367);
tmp371 = _mm512_fnmadd_ps(tmp370, _mm512_set1_ps(2e+00f), tmp371);
tmp366 = _mm512_fmadd_ps(in50, _mm512_set1_ps(2.5e-01f), tmp360);
tmp370 = _mm512_fmadd_ps(in58, _mm512_set1_ps(2.5e-01f), tmp364);
tmp360 = _mm512_sub_ps(in51, tmp360);
tmp364 = _mm512_sub_ps(in59, tmp364);
tmp366 = _mm512_fmadd_ps(in54, _mm512_set1_ps(-1.25e+00f), tmp366);
tmp370 = _mm512_fmadd_ps(in62, _mm512_set1_ps(-1.25e+00f), tmp370);
in54 = _mm512_sub_ps(in54, in50);
in62 = _mm512_sub_ps(in62, in58);
in54 = _mm512_fmadd_ps(in54, _mm512_set1_ps(5.25e+00f), tmp360);
in62 = _mm512_fmadd_ps(in62, _mm512_set1_ps(5.25e+00f), tmp364);
tmp361 = _mm512_fmadd_ps(tmp366, _mm512_set1_ps(2e+00f), tmp359);
tmp365 = _mm512_fmadd_ps(tmp370, _mm512_set1_ps(2e+00f), tmp363);
tmp359 = _mm512_fnmadd_ps(tmp366, _mm512_set1_ps(2e+00f), tmp359);
tmp363 = _mm512_fnmadd_ps(tmp370, _mm512_set1_ps(2e+00f), tmp363);
__m512 out101 = _mm512_shuffle_f32x4(in48, tmp368, 68);
__m512 out109 = _mm512_shuffle_f32x4(in48, tmp368, 238);
__m512 out102 = _mm512_shuffle_f32x4(tmp369, in52, 68);
__m512 out110 = _mm512_shuffle_f32x4(tmp369, in52, 238);
__m512 out103 = _mm512_shuffle_f32x4(tmp367, tmp361, 68);
__m512 out111 = _mm512_shuffle_f32x4(tmp367, tmp361, 238);
__m512 out104 = _mm512_shuffle_f32x4(tmp359, in54, 68);
__m512 out112 = _mm512_shuffle_f32x4(tmp359, in54, 238);
__m512 out105 = _mm512_shuffle_f32x4(in56, tmp372, 68);
__m512 out113 = _mm512_shuffle_f32x4(in56, tmp372, 238);
__m512 out106 = _mm512_shuffle_f32x4(tmp373, in60, 68);
__m512 out114 = _mm512_shuffle_f32x4(tmp373, in60, 238);
__m512 out107 = _mm512_shuffle_f32x4(tmp371, tmp365, 68);
__m512 out115 = _mm512_shuffle_f32x4(tmp371, tmp365, 238);
__m512 out108 = _mm512_shuffle_f32x4(tmp363, in62, 68);
__m512 out116 = _mm512_shuffle_f32x4(tmp363, in62, 238);
_mm512_storeu_ps(dfPtr1+512+2433024*i7+152064*j3+38016*s5+768*k5, out101);
_mm512_storeu_ps(dfPtr1+640+2433024*i7+152064*j3+38016*s5+768*k5, out109);
_mm512_storeu_ps(dfPtr1+576+2433024*i7+152064*j3+38016*s5+768*k5, out105);
_mm512_storeu_ps(dfPtr1+704+2433024*i7+152064*j3+38016*s5+768*k5, out113);
_mm512_storeu_ps(dfPtr1+608768+2433024*i7+152064*j3+38016*s5+768*k5, out102);
_mm512_storeu_ps(dfPtr1+608896+2433024*i7+152064*j3+38016*s5+768*k5, out110);
_mm512_storeu_ps(dfPtr1+608832+2433024*i7+152064*j3+38016*s5+768*k5, out106);
_mm512_storeu_ps(dfPtr1+608960+2433024*i7+152064*j3+38016*s5+768*k5, out114);
_mm512_storeu_ps(dfPtr1+1217024+2433024*i7+152064*j3+38016*s5+768*k5, out103);
_mm512_storeu_ps(dfPtr1+1217152+2433024*i7+152064*j3+38016*s5+768*k5, out111);
_mm512_storeu_ps(dfPtr1+1217088+2433024*i7+152064*j3+38016*s5+768*k5, out107);
_mm512_storeu_ps(dfPtr1+1217216+2433024*i7+152064*j3+38016*s5+768*k5, out115);
_mm512_storeu_ps(dfPtr1+1825280+2433024*i7+152064*j3+38016*s5+768*k5, out104);
_mm512_storeu_ps(dfPtr1+1825408+2433024*i7+152064*j3+38016*s5+768*k5, out112);
_mm512_storeu_ps(dfPtr1+1825344+2433024*i7+152064*j3+38016*s5+768*k5, out108);
_mm512_storeu_ps(dfPtr1+1825472+2433024*i7+152064*j3+38016*s5+768*k5, out116);
}
__m512 dat49 = _mm512_maskz_loadu_ps(16383, datPtr1+0+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat50 = _mm512_maskz_loadu_ps(2047, datPtr1+48+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512i pm14 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in64 = _mm512_permutexvar_ps(pm14, dat49);
__m512i pm15 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in72 = _mm512_permutexvar_ps(pm15, dat50);
__m512 dat51 = _mm512_maskz_loadu_ps(16383, datPtr1+92+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat52 = _mm512_maskz_loadu_ps(2047, datPtr1+140+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in65 = _mm512_permutexvar_ps(pm14, dat51);
__m512 in73 = _mm512_permutexvar_ps(pm15, dat52);
__m512 dat53 = _mm512_maskz_loadu_ps(16383, datPtr1+184+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat54 = _mm512_maskz_loadu_ps(2047, datPtr1+232+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in66 = _mm512_permutexvar_ps(pm14, dat53);
__m512 in74 = _mm512_permutexvar_ps(pm15, dat54);
__m512 dat55 = _mm512_maskz_loadu_ps(16383, datPtr1+276+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat56 = _mm512_maskz_loadu_ps(2047, datPtr1+324+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in67 = _mm512_permutexvar_ps(pm14, dat55);
__m512 in75 = _mm512_permutexvar_ps(pm15, dat56);
__m512 dat57 = _mm512_maskz_loadu_ps(16383, datPtr1+368+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat58 = _mm512_maskz_loadu_ps(2047, datPtr1+416+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in68 = _mm512_permutexvar_ps(pm14, dat57);
__m512 in76 = _mm512_permutexvar_ps(pm15, dat58);
__m512 dat59 = _mm512_maskz_loadu_ps(16383, datPtr1+460+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat60 = _mm512_maskz_loadu_ps(2047, datPtr1+508+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in69 = _mm512_permutexvar_ps(pm14, dat59);
__m512 in77 = _mm512_permutexvar_ps(pm15, dat60);
__m512 dat61 = _mm512_maskz_loadu_ps(16383, datPtr1+552+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat62 = _mm512_maskz_loadu_ps(2047, datPtr1+600+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in70 = _mm512_permutexvar_ps(pm14, dat61);
__m512 in78 = _mm512_permutexvar_ps(pm15, dat62);
__m512 dat63 = _mm512_maskz_loadu_ps(16383, datPtr1+644+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 dat64 = _mm512_maskz_loadu_ps(2047, datPtr1+692+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in71 = _mm512_permutexvar_ps(pm14, dat63);
__m512 in79 = _mm512_permutexvar_ps(pm15, dat64);
__m512 tmp422 = _mm512_add_ps(in65, in69);
__m512 tmp426 = _mm512_add_ps(in73, in77);
__m512 tmp423 = _mm512_sub_ps(in68, in66);
__m512 tmp427 = _mm512_sub_ps(in76, in74);
__m512 tmp424 = _mm512_add_ps(in66, in70);
__m512 tmp428 = _mm512_add_ps(in74, in78);
in64 = _mm512_sub_ps(in64, in70);
in72 = _mm512_sub_ps(in72, in78);
tmp422 = _mm512_fmadd_ps(in67, _mm512_set1_ps(-4.25e+00f), tmp422);
tmp426 = _mm512_fmadd_ps(in75, _mm512_set1_ps(-4.25e+00f), tmp426);
tmp424 = _mm512_fmadd_ps(in68, _mm512_set1_ps(-4.25e+00f), tmp424);
tmp428 = _mm512_fmadd_ps(in76, _mm512_set1_ps(-4.25e+00f), tmp428);
in64 = _mm512_fmadd_ps(tmp423, _mm512_set1_ps(5.25e+00f), in64);
in72 = _mm512_fmadd_ps(tmp427, _mm512_set1_ps(5.25e+00f), in72);
tmp423 = _mm512_fmadd_ps(in66, _mm512_set1_ps(2.5e-01f), in70);
tmp427 = _mm512_fmadd_ps(in74, _mm512_set1_ps(2.5e-01f), in78);
in66 = _mm512_fmadd_ps(in66, _mm512_set1_ps(4e+00f), in70);
in74 = _mm512_fmadd_ps(in74, _mm512_set1_ps(4e+00f), in78);
__m512 tmp425 = _mm512_sub_ps(tmp424, tmp422);
__m512 tmp429 = _mm512_sub_ps(tmp428, tmp426);
tmp424 = _mm512_add_ps(tmp422, tmp424);
tmp428 = _mm512_add_ps(tmp426, tmp428);
tmp422 = _mm512_fmadd_ps(in65, _mm512_set1_ps(2.5e-01f), in69);
tmp426 = _mm512_fmadd_ps(in73, _mm512_set1_ps(2.5e-01f), in77);
tmp423 = _mm512_fmadd_ps(in68, _mm512_set1_ps(-1.25e+00f), tmp423);
tmp427 = _mm512_fmadd_ps(in76, _mm512_set1_ps(-1.25e+00f), tmp427);
in68 = _mm512_fmadd_ps(in68, _mm512_set1_ps(-5e+00f), in66);
in76 = _mm512_fmadd_ps(in76, _mm512_set1_ps(-5e+00f), in74);
tmp422 = _mm512_fmadd_ps(in67, _mm512_set1_ps(-1.25e+00f), tmp422);
tmp426 = _mm512_fmadd_ps(in75, _mm512_set1_ps(-1.25e+00f), tmp426);
in70 = _mm512_fmadd_ps(tmp422, _mm512_set1_ps(2e+00f), tmp423);
in78 = _mm512_fmadd_ps(tmp426, _mm512_set1_ps(2e+00f), tmp427);
tmp423 = _mm512_fnmadd_ps(tmp422, _mm512_set1_ps(2e+00f), tmp423);
tmp427 = _mm512_fnmadd_ps(tmp426, _mm512_set1_ps(2e+00f), tmp427);
tmp422 = _mm512_fmadd_ps(in69, _mm512_set1_ps(2.5e-01f), in65);
tmp426 = _mm512_fmadd_ps(in77, _mm512_set1_ps(2.5e-01f), in73);
in65 = _mm512_sub_ps(in71, in65);
in73 = _mm512_sub_ps(in79, in73);
tmp422 = _mm512_fmadd_ps(in67, _mm512_set1_ps(-1.25e+00f), tmp422);
tmp426 = _mm512_fmadd_ps(in75, _mm512_set1_ps(-1.25e+00f), tmp426);
in67 = _mm512_sub_ps(in67, in69);
in75 = _mm512_sub_ps(in75, in77);
in67 = _mm512_fmadd_ps(in67, _mm512_set1_ps(5.25e+00f), in65);
in75 = _mm512_fmadd_ps(in75, _mm512_set1_ps(5.25e+00f), in73);
in66 = _mm512_fmadd_ps(tmp422, _mm512_set1_ps(2e+00f), in68);
in74 = _mm512_fmadd_ps(tmp426, _mm512_set1_ps(2e+00f), in76);
in68 = _mm512_fnmadd_ps(tmp422, _mm512_set1_ps(2e+00f), in68);
in76 = _mm512_fnmadd_ps(tmp426, _mm512_set1_ps(2e+00f), in76);
__m512 tmp438 = _mm512_unpacklo_ps(in64, tmp424);
__m512 tmp439 = _mm512_unpackhi_ps(in64, tmp424);
__m512 tmp440 = _mm512_unpacklo_ps(tmp425, in70);
__m512 tmp441 = _mm512_unpackhi_ps(tmp425, in70);
__m512 tmp442 = _mm512_unpacklo_ps(tmp423, in66);
__m512 tmp443 = _mm512_unpackhi_ps(tmp423, in66);
__m512 tmp444 = _mm512_unpacklo_ps(in68, in67);
__m512 tmp445 = _mm512_unpackhi_ps(in68, in67);
__m512 tmp446 = _mm512_unpacklo_ps(in72, tmp428);
__m512 tmp447 = _mm512_unpackhi_ps(in72, tmp428);
__m512 tmp448 = _mm512_unpacklo_ps(tmp429, in78);
__m512 tmp449 = _mm512_unpackhi_ps(tmp429, in78);
__m512 tmp450 = _mm512_unpacklo_ps(tmp427, in74);
__m512 tmp451 = _mm512_unpackhi_ps(tmp427, in74);
__m512 tmp452 = _mm512_unpacklo_ps(in76, in75);
__m512 tmp453 = _mm512_unpackhi_ps(in76, in75);
__m512 tmp454 = _mm512_shuffle_ps(tmp438, tmp440, 68);
__m512 tmp455 = _mm512_shuffle_ps(tmp438, tmp440, 238);
__m512 tmp456 = _mm512_shuffle_ps(tmp439, tmp441, 68);
__m512 tmp457 = _mm512_shuffle_ps(tmp439, tmp441, 238);
__m512 tmp458 = _mm512_shuffle_ps(tmp442, tmp444, 68);
__m512 tmp459 = _mm512_shuffle_ps(tmp442, tmp444, 238);
__m512 tmp460 = _mm512_shuffle_ps(tmp443, tmp445, 68);
__m512 tmp461 = _mm512_shuffle_ps(tmp443, tmp445, 238);
__m512 tmp462 = _mm512_shuffle_ps(tmp446, tmp448, 68);
__m512 tmp463 = _mm512_shuffle_ps(tmp446, tmp448, 238);
__m512 tmp464 = _mm512_shuffle_ps(tmp447, tmp449, 68);
__m512 tmp465 = _mm512_shuffle_ps(tmp447, tmp449, 238);
__m512 tmp466 = _mm512_shuffle_ps(tmp450, tmp452, 68);
__m512 tmp467 = _mm512_shuffle_ps(tmp450, tmp452, 238);
__m512 tmp468 = _mm512_shuffle_ps(tmp451, tmp453, 68);
__m512 tmp469 = _mm512_shuffle_ps(tmp451, tmp453, 238);
__m512 tmp470 = _mm512_shuffle_f32x4(tmp454, tmp458, 136);
__m512 tmp471 = _mm512_shuffle_f32x4(tmp454, tmp458, 221);
__m512 tmp472 = _mm512_shuffle_f32x4(tmp455, tmp459, 136);
__m512 tmp473 = _mm512_shuffle_f32x4(tmp455, tmp459, 221);
__m512 tmp474 = _mm512_shuffle_f32x4(tmp456, tmp460, 136);
__m512 tmp475 = _mm512_shuffle_f32x4(tmp456, tmp460, 221);
__m512 tmp476 = _mm512_shuffle_f32x4(tmp457, tmp461, 136);
__m512 tmp477 = _mm512_shuffle_f32x4(tmp457, tmp461, 221);
__m512 tmp478 = _mm512_shuffle_f32x4(tmp462, tmp466, 136);
__m512 tmp479 = _mm512_shuffle_f32x4(tmp462, tmp466, 221);
__m512 tmp480 = _mm512_shuffle_f32x4(tmp463, tmp467, 136);
__m512 tmp481 = _mm512_shuffle_f32x4(tmp463, tmp467, 221);
__m512 tmp482 = _mm512_shuffle_f32x4(tmp464, tmp468, 136);
__m512 tmp483 = _mm512_shuffle_f32x4(tmp464, tmp468, 221);
__m512 tmp484 = _mm512_shuffle_f32x4(tmp465, tmp469, 136);
__m512 tmp485 = _mm512_shuffle_f32x4(tmp465, tmp469, 221);
in64 = _mm512_shuffle_f32x4(tmp470, tmp478, 136);
in72 = _mm512_shuffle_f32x4(tmp470, tmp478, 221);
tmp424 = _mm512_shuffle_f32x4(tmp472, tmp480, 136);
tmp428 = _mm512_shuffle_f32x4(tmp472, tmp480, 221);
tmp425 = _mm512_shuffle_f32x4(tmp474, tmp482, 136);
tmp429 = _mm512_shuffle_f32x4(tmp474, tmp482, 221);
in70 = _mm512_shuffle_f32x4(tmp476, tmp484, 136);
in78 = _mm512_shuffle_f32x4(tmp476, tmp484, 221);
tmp423 = _mm512_shuffle_f32x4(tmp471, tmp479, 136);
tmp427 = _mm512_shuffle_f32x4(tmp471, tmp479, 221);
in66 = _mm512_shuffle_f32x4(tmp473, tmp481, 136);
in74 = _mm512_shuffle_f32x4(tmp473, tmp481, 221);
in68 = _mm512_shuffle_f32x4(tmp475, tmp483, 136);
in76 = _mm512_shuffle_f32x4(tmp475, tmp483, 221);
in67 = _mm512_shuffle_f32x4(tmp477, tmp485, 136);
in75 = _mm512_shuffle_f32x4(tmp477, tmp485, 221);
__m512 tmp430 = _mm512_add_ps(tmp424, in66);
__m512 tmp434 = _mm512_add_ps(tmp428, in74);
__m512 tmp431 = _mm512_sub_ps(tmp423, tmp425);
__m512 tmp435 = _mm512_sub_ps(tmp427, tmp429);
__m512 tmp432 = _mm512_add_ps(tmp425, in68);
__m512 tmp436 = _mm512_add_ps(tmp429, in76);
in64 = _mm512_sub_ps(in64, in68);
in72 = _mm512_sub_ps(in72, in76);
tmp430 = _mm512_fmadd_ps(in70, _mm512_set1_ps(-4.25e+00f), tmp430);
tmp434 = _mm512_fmadd_ps(in78, _mm512_set1_ps(-4.25e+00f), tmp434);
tmp432 = _mm512_fmadd_ps(tmp423, _mm512_set1_ps(-4.25e+00f), tmp432);
tmp436 = _mm512_fmadd_ps(tmp427, _mm512_set1_ps(-4.25e+00f), tmp436);
in64 = _mm512_fmadd_ps(tmp431, _mm512_set1_ps(5.25e+00f), in64);
in72 = _mm512_fmadd_ps(tmp435, _mm512_set1_ps(5.25e+00f), in72);
tmp431 = _mm512_fmadd_ps(tmp425, _mm512_set1_ps(2.5e-01f), in68);
tmp435 = _mm512_fmadd_ps(tmp429, _mm512_set1_ps(2.5e-01f), in76);
tmp425 = _mm512_fmadd_ps(tmp425, _mm512_set1_ps(4e+00f), in68);
tmp429 = _mm512_fmadd_ps(tmp429, _mm512_set1_ps(4e+00f), in76);
__m512 tmp433 = _mm512_sub_ps(tmp432, tmp430);
__m512 tmp437 = _mm512_sub_ps(tmp436, tmp434);
tmp432 = _mm512_add_ps(tmp430, tmp432);
tmp436 = _mm512_add_ps(tmp434, tmp436);
tmp430 = _mm512_fmadd_ps(tmp424, _mm512_set1_ps(2.5e-01f), in66);
tmp434 = _mm512_fmadd_ps(tmp428, _mm512_set1_ps(2.5e-01f), in74);
tmp431 = _mm512_fmadd_ps(tmp423, _mm512_set1_ps(-1.25e+00f), tmp431);
tmp435 = _mm512_fmadd_ps(tmp427, _mm512_set1_ps(-1.25e+00f), tmp435);
tmp423 = _mm512_fmadd_ps(tmp423, _mm512_set1_ps(-5e+00f), tmp425);
tmp427 = _mm512_fmadd_ps(tmp427, _mm512_set1_ps(-5e+00f), tmp429);
tmp430 = _mm512_fmadd_ps(in70, _mm512_set1_ps(-1.25e+00f), tmp430);
tmp434 = _mm512_fmadd_ps(in78, _mm512_set1_ps(-1.25e+00f), tmp434);
in68 = _mm512_fmadd_ps(tmp430, _mm512_set1_ps(2e+00f), tmp431);
in76 = _mm512_fmadd_ps(tmp434, _mm512_set1_ps(2e+00f), tmp435);
tmp431 = _mm512_fnmadd_ps(tmp430, _mm512_set1_ps(2e+00f), tmp431);
tmp435 = _mm512_fnmadd_ps(tmp434, _mm512_set1_ps(2e+00f), tmp435);
tmp430 = _mm512_fmadd_ps(in66, _mm512_set1_ps(2.5e-01f), tmp424);
tmp434 = _mm512_fmadd_ps(in74, _mm512_set1_ps(2.5e-01f), tmp428);
tmp424 = _mm512_sub_ps(in67, tmp424);
tmp428 = _mm512_sub_ps(in75, tmp428);
tmp430 = _mm512_fmadd_ps(in70, _mm512_set1_ps(-1.25e+00f), tmp430);
tmp434 = _mm512_fmadd_ps(in78, _mm512_set1_ps(-1.25e+00f), tmp434);
in70 = _mm512_sub_ps(in70, in66);
in78 = _mm512_sub_ps(in78, in74);
in70 = _mm512_fmadd_ps(in70, _mm512_set1_ps(5.25e+00f), tmp424);
in78 = _mm512_fmadd_ps(in78, _mm512_set1_ps(5.25e+00f), tmp428);
tmp425 = _mm512_fmadd_ps(tmp430, _mm512_set1_ps(2e+00f), tmp423);
tmp429 = _mm512_fmadd_ps(tmp434, _mm512_set1_ps(2e+00f), tmp427);
tmp423 = _mm512_fnmadd_ps(tmp430, _mm512_set1_ps(2e+00f), tmp423);
tmp427 = _mm512_fnmadd_ps(tmp434, _mm512_set1_ps(2e+00f), tmp427);
__m512 out117 = _mm512_shuffle_f32x4(in64, tmp432, 68);
__m512 out125 = _mm512_shuffle_f32x4(in64, tmp432, 238);
__m512 out118 = _mm512_shuffle_f32x4(tmp433, in68, 68);
__m512 out126 = _mm512_shuffle_f32x4(tmp433, in68, 238);
__m512 out119 = _mm512_shuffle_f32x4(tmp431, tmp425, 68);
__m512 out127 = _mm512_shuffle_f32x4(tmp431, tmp425, 238);
__m512 out120 = _mm512_shuffle_f32x4(tmp423, in70, 68);
__m512 out128 = _mm512_shuffle_f32x4(tmp423, in70, 238);
__m512 out121 = _mm512_shuffle_f32x4(in72, tmp436, 68);
__m512 out129 = _mm512_shuffle_f32x4(in72, tmp436, 238);
__m512 out122 = _mm512_shuffle_f32x4(tmp437, in76, 68);
__m512 out130 = _mm512_shuffle_f32x4(tmp437, in76, 238);
__m512 out123 = _mm512_shuffle_f32x4(tmp435, tmp429, 68);
__m512 out131 = _mm512_shuffle_f32x4(tmp435, tmp429, 238);
__m512 out124 = _mm512_shuffle_f32x4(tmp427, in78, 68);
__m512 out132 = _mm512_shuffle_f32x4(tmp427, in78, 238);
_mm512_storeu_ps(dfPtr1+0+2433024*i7+152064*j3+38016*s5+768*k5, out117);
_mm512_storeu_ps(dfPtr1+128+2433024*i7+152064*j3+38016*s5+768*k5, out125);
_mm512_storeu_ps(dfPtr1+64+2433024*i7+152064*j3+38016*s5+768*k5, out121);
_mm512_storeu_ps(dfPtr1+192+2433024*i7+152064*j3+38016*s5+768*k5, out129);
_mm512_storeu_ps(dfPtr1+608256+2433024*i7+152064*j3+38016*s5+768*k5, out118);
_mm512_storeu_ps(dfPtr1+608384+2433024*i7+152064*j3+38016*s5+768*k5, out126);
_mm512_storeu_ps(dfPtr1+608320+2433024*i7+152064*j3+38016*s5+768*k5, out122);
_mm512_storeu_ps(dfPtr1+608448+2433024*i7+152064*j3+38016*s5+768*k5, out130);
_mm512_storeu_ps(dfPtr1+1216512+2433024*i7+152064*j3+38016*s5+768*k5, out119);
_mm512_storeu_ps(dfPtr1+1216640+2433024*i7+152064*j3+38016*s5+768*k5, out127);
_mm512_storeu_ps(dfPtr1+1216576+2433024*i7+152064*j3+38016*s5+768*k5, out123);
_mm512_storeu_ps(dfPtr1+1216704+2433024*i7+152064*j3+38016*s5+768*k5, out131);
_mm512_storeu_ps(dfPtr1+1824768+2433024*i7+152064*j3+38016*s5+768*k5, out120);
_mm512_storeu_ps(dfPtr1+1824896+2433024*i7+152064*j3+38016*s5+768*k5, out128);
_mm512_storeu_ps(dfPtr1+1824832+2433024*i7+152064*j3+38016*s5+768*k5, out124);
_mm512_storeu_ps(dfPtr1+1824960+2433024*i7+152064*j3+38016*s5+768*k5, out132);
__m512 dat65 = _mm512_maskz_loadu_ps(16383, datPtr1+552+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512i pm16 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in80 = _mm512_permutexvar_ps(pm16, dat65);
__m512 dat66 = _mm512_maskz_loadu_ps(16383, datPtr1+644+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in81 = _mm512_permutexvar_ps(pm16, dat66);
__m512 dat67 = _mm512_maskz_loadu_ps(16383, datPtr1+736+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in82 = _mm512_permutexvar_ps(pm16, dat67);
__m512 dat68 = _mm512_maskz_loadu_ps(16383, datPtr1+828+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in83 = _mm512_permutexvar_ps(pm16, dat68);
__m512 dat69 = _mm512_maskz_loadu_ps(16383, datPtr1+920+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in84 = _mm512_permutexvar_ps(pm16, dat69);
__m512 dat70 = _mm512_maskz_loadu_ps(16383, datPtr1+1012+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in85 = _mm512_permutexvar_ps(pm16, dat70);
__m512 dat71 = _mm512_maskz_loadu_ps(16383, datPtr1+1104+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in86 = _mm512_permutexvar_ps(pm16, dat71);
__m512 dat72 = _mm512_maskz_loadu_ps(16383, datPtr1+1196+7163304*i7+92*h1+4*w1+346104*s5+6992*k5);
__m512 in87 = _mm512_permutexvar_ps(pm16, dat72);
__m512 tmp486 = _mm512_add_ps(in81, in85);
__m512 tmp487 = _mm512_sub_ps(in84, in82);
__m512 tmp488 = _mm512_add_ps(in82, in86);
in80 = _mm512_sub_ps(in80, in86);
tmp486 = _mm512_fmadd_ps(in83, _mm512_set1_ps(-4.25e+00f), tmp486);
tmp488 = _mm512_fmadd_ps(in84, _mm512_set1_ps(-4.25e+00f), tmp488);
in80 = _mm512_fmadd_ps(tmp487, _mm512_set1_ps(5.25e+00f), in80);
tmp487 = _mm512_fmadd_ps(in82, _mm512_set1_ps(2.5e-01f), in86);
in82 = _mm512_fmadd_ps(in82, _mm512_set1_ps(4e+00f), in86);
__m512 tmp489 = _mm512_sub_ps(tmp488, tmp486);
tmp488 = _mm512_add_ps(tmp486, tmp488);
tmp486 = _mm512_fmadd_ps(in81, _mm512_set1_ps(2.5e-01f), in85);
tmp487 = _mm512_fmadd_ps(in84, _mm512_set1_ps(-1.25e+00f), tmp487);
in84 = _mm512_fmadd_ps(in84, _mm512_set1_ps(-5e+00f), in82);
tmp486 = _mm512_fmadd_ps(in83, _mm512_set1_ps(-1.25e+00f), tmp486);
in86 = _mm512_fmadd_ps(tmp486, _mm512_set1_ps(2e+00f), tmp487);
tmp487 = _mm512_fnmadd_ps(tmp486, _mm512_set1_ps(2e+00f), tmp487);
tmp486 = _mm512_fmadd_ps(in85, _mm512_set1_ps(2.5e-01f), in81);
in81 = _mm512_sub_ps(in87, in81);
tmp486 = _mm512_fmadd_ps(in83, _mm512_set1_ps(-1.25e+00f), tmp486);
in83 = _mm512_sub_ps(in83, in85);
in83 = _mm512_fmadd_ps(in83, _mm512_set1_ps(5.25e+00f), in81);
in82 = _mm512_fmadd_ps(tmp486, _mm512_set1_ps(2e+00f), in84);
in84 = _mm512_fnmadd_ps(tmp486, _mm512_set1_ps(2e+00f), in84);
__m512 tmp506 = _mm512_unpacklo_ps(in80, tmp488);
__m512 tmp507 = _mm512_unpackhi_ps(in80, tmp488);
__m512 tmp508 = _mm512_unpacklo_ps(tmp489, in86);
__m512 tmp509 = _mm512_unpackhi_ps(tmp489, in86);
__m512 tmp510 = _mm512_unpacklo_ps(tmp487, in82);
__m512 tmp511 = _mm512_unpackhi_ps(tmp487, in82);
__m512 tmp512 = _mm512_unpacklo_ps(in84, in83);
__m512 tmp513 = _mm512_unpackhi_ps(in84, in83);
__m512 tmp514 = _mm512_shuffle_ps(tmp506, tmp508, 68);
__m512 tmp515 = _mm512_shuffle_ps(tmp506, tmp508, 238);
__m512 tmp516 = _mm512_shuffle_ps(tmp507, tmp509, 68);
__m512 tmp517 = _mm512_shuffle_ps(tmp507, tmp509, 238);
__m512 tmp518 = _mm512_shuffle_ps(tmp510, tmp512, 68);
__m512 tmp519 = _mm512_shuffle_ps(tmp510, tmp512, 238);
__m512 tmp520 = _mm512_shuffle_ps(tmp511, tmp513, 68);
__m512 tmp521 = _mm512_shuffle_ps(tmp511, tmp513, 238);
__m512 tmp522 = _mm512_shuffle_f32x4(tmp514, tmp518, 136);
__m512 tmp523 = _mm512_shuffle_f32x4(tmp514, tmp518, 221);
__m512 tmp524 = _mm512_shuffle_f32x4(tmp515, tmp519, 136);
__m512 tmp525 = _mm512_shuffle_f32x4(tmp515, tmp519, 221);
__m512 tmp526 = _mm512_shuffle_f32x4(tmp516, tmp520, 136);
__m512 tmp527 = _mm512_shuffle_f32x4(tmp516, tmp520, 221);
__m512 tmp528 = _mm512_shuffle_f32x4(tmp517, tmp521, 136);
__m512 tmp529 = _mm512_shuffle_f32x4(tmp517, tmp521, 221);
in80 = _mm512_shuffle_f32x4(tmp522, tmp522, 136);
__m512 tmp490 = _mm512_shuffle_f32x4(tmp522, tmp522, 221);
tmp488 = _mm512_shuffle_f32x4(tmp524, tmp524, 136);
__m512 tmp491 = _mm512_shuffle_f32x4(tmp524, tmp524, 221);
tmp489 = _mm512_shuffle_f32x4(tmp526, tmp526, 136);
__m512 tmp492 = _mm512_shuffle_f32x4(tmp526, tmp526, 221);
in86 = _mm512_shuffle_f32x4(tmp528, tmp528, 136);
__m512 tmp493 = _mm512_shuffle_f32x4(tmp528, tmp528, 221);
tmp487 = _mm512_shuffle_f32x4(tmp523, tmp523, 136);
__m512 tmp494 = _mm512_shuffle_f32x4(tmp523, tmp523, 221);
in82 = _mm512_shuffle_f32x4(tmp525, tmp525, 136);
__m512 tmp495 = _mm512_shuffle_f32x4(tmp525, tmp525, 221);
in84 = _mm512_shuffle_f32x4(tmp527, tmp527, 136);
__m512 tmp496 = _mm512_shuffle_f32x4(tmp527, tmp527, 221);
in83 = _mm512_shuffle_f32x4(tmp529, tmp529, 136);
__m512 tmp497 = _mm512_shuffle_f32x4(tmp529, tmp529, 221);
__m512 tmp498 = _mm512_add_ps(tmp488, in82);
__m512 tmp502 = _mm512_add_ps(tmp491, tmp495);
__m512 tmp499 = _mm512_sub_ps(tmp487, tmp489);
__m512 tmp503 = _mm512_sub_ps(tmp494, tmp492);
__m512 tmp500 = _mm512_add_ps(tmp489, in84);
__m512 tmp504 = _mm512_add_ps(tmp492, tmp496);
in80 = _mm512_sub_ps(in80, in84);
tmp490 = _mm512_sub_ps(tmp490, tmp496);
tmp498 = _mm512_fmadd_ps(in86, _mm512_set1_ps(-4.25e+00f), tmp498);
tmp502 = _mm512_fmadd_ps(tmp493, _mm512_set1_ps(-4.25e+00f), tmp502);
tmp500 = _mm512_fmadd_ps(tmp487, _mm512_set1_ps(-4.25e+00f), tmp500);
tmp504 = _mm512_fmadd_ps(tmp494, _mm512_set1_ps(-4.25e+00f), tmp504);
in80 = _mm512_fmadd_ps(tmp499, _mm512_set1_ps(5.25e+00f), in80);
tmp490 = _mm512_fmadd_ps(tmp503, _mm512_set1_ps(5.25e+00f), tmp490);
tmp499 = _mm512_fmadd_ps(tmp489, _mm512_set1_ps(2.5e-01f), in84);
tmp503 = _mm512_fmadd_ps(tmp492, _mm512_set1_ps(2.5e-01f), tmp496);
tmp489 = _mm512_fmadd_ps(tmp489, _mm512_set1_ps(4e+00f), in84);
tmp492 = _mm512_fmadd_ps(tmp492, _mm512_set1_ps(4e+00f), tmp496);
__m512 tmp501 = _mm512_sub_ps(tmp500, tmp498);
__m512 tmp505 = _mm512_sub_ps(tmp504, tmp502);
tmp500 = _mm512_add_ps(tmp498, tmp500);
tmp504 = _mm512_add_ps(tmp502, tmp504);
tmp498 = _mm512_fmadd_ps(tmp488, _mm512_set1_ps(2.5e-01f), in82);
tmp502 = _mm512_fmadd_ps(tmp491, _mm512_set1_ps(2.5e-01f), tmp495);
tmp499 = _mm512_fmadd_ps(tmp487, _mm512_set1_ps(-1.25e+00f), tmp499);
tmp503 = _mm512_fmadd_ps(tmp494, _mm512_set1_ps(-1.25e+00f), tmp503);
tmp487 = _mm512_fmadd_ps(tmp487, _mm512_set1_ps(-5e+00f), tmp489);
tmp494 = _mm512_fmadd_ps(tmp494, _mm512_set1_ps(-5e+00f), tmp492);
tmp498 = _mm512_fmadd_ps(in86, _mm512_set1_ps(-1.25e+00f), tmp498);
tmp502 = _mm512_fmadd_ps(tmp493, _mm512_set1_ps(-1.25e+00f), tmp502);
in84 = _mm512_fmadd_ps(tmp498, _mm512_set1_ps(2e+00f), tmp499);
tmp496 = _mm512_fmadd_ps(tmp502, _mm512_set1_ps(2e+00f), tmp503);
tmp499 = _mm512_fnmadd_ps(tmp498, _mm512_set1_ps(2e+00f), tmp499);
tmp503 = _mm512_fnmadd_ps(tmp502, _mm512_set1_ps(2e+00f), tmp503);
tmp498 = _mm512_fmadd_ps(in82, _mm512_set1_ps(2.5e-01f), tmp488);
tmp502 = _mm512_fmadd_ps(tmp495, _mm512_set1_ps(2.5e-01f), tmp491);
tmp488 = _mm512_sub_ps(in83, tmp488);
tmp491 = _mm512_sub_ps(tmp497, tmp491);
tmp498 = _mm512_fmadd_ps(in86, _mm512_set1_ps(-1.25e+00f), tmp498);
tmp502 = _mm512_fmadd_ps(tmp493, _mm512_set1_ps(-1.25e+00f), tmp502);
in86 = _mm512_sub_ps(in86, in82);
tmp493 = _mm512_sub_ps(tmp493, tmp495);
in86 = _mm512_fmadd_ps(in86, _mm512_set1_ps(5.25e+00f), tmp488);
tmp493 = _mm512_fmadd_ps(tmp493, _mm512_set1_ps(5.25e+00f), tmp491);
tmp489 = _mm512_fmadd_ps(tmp498, _mm512_set1_ps(2e+00f), tmp487);
tmp492 = _mm512_fmadd_ps(tmp502, _mm512_set1_ps(2e+00f), tmp494);
tmp487 = _mm512_fnmadd_ps(tmp498, _mm512_set1_ps(2e+00f), tmp487);
tmp494 = _mm512_fnmadd_ps(tmp502, _mm512_set1_ps(2e+00f), tmp494);
__m512 out133 = _mm512_shuffle_f32x4(in80, tmp500, 68);
__m512 out134 = _mm512_shuffle_f32x4(tmp501, in84, 68);
__m512 out135 = _mm512_shuffle_f32x4(tmp499, tmp489, 68);
__m512 out136 = _mm512_shuffle_f32x4(tmp487, in86, 68);
__m512 out137 = _mm512_shuffle_f32x4(tmp490, tmp504, 68);
__m512 out138 = _mm512_shuffle_f32x4(tmp505, tmp496, 68);
__m512 out139 = _mm512_shuffle_f32x4(tmp503, tmp492, 68);
__m512 out140 = _mm512_shuffle_f32x4(tmp494, tmp493, 68);
_mm512_storeu_ps(dfPtr1+256+2433024*i7+152064*j3+38016*s5+768*k5, out133);
_mm512_storeu_ps(dfPtr1+320+2433024*i7+152064*j3+38016*s5+768*k5, out137);
_mm512_storeu_ps(dfPtr1+608512+2433024*i7+152064*j3+38016*s5+768*k5, out134);
_mm512_storeu_ps(dfPtr1+608576+2433024*i7+152064*j3+38016*s5+768*k5, out138);
_mm512_storeu_ps(dfPtr1+1216768+2433024*i7+152064*j3+38016*s5+768*k5, out135);
_mm512_storeu_ps(dfPtr1+1216832+2433024*i7+152064*j3+38016*s5+768*k5, out139);
_mm512_storeu_ps(dfPtr1+1825024+2433024*i7+152064*j3+38016*s5+768*k5, out136);
_mm512_storeu_ps(dfPtr1+1825088+2433024*i7+152064*j3+38016*s5+768*k5, out140);
if (j3 >= last1) return;
++j3;
rel1 = 1;
}
ptrdiff_t h2 = base1+6;
ptrdiff_t w2 = 12;
ptrdiff_t k6 = 0;
for (; k6 != 49; ++k6) {
__m512 dat73 = _mm512_maskz_loadu_ps(2047, datPtr1+0+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat74 = _mm512_maskz_loadu_ps(16383, datPtr1+504+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512i pm17 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in88 = _mm512_permutexvar_ps(pm17, dat73);
__m512i pm18 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in96 = _mm512_permutexvar_ps(pm18, dat74);
__m512 dat75 = _mm512_maskz_loadu_ps(2047, datPtr1+92+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat76 = _mm512_maskz_loadu_ps(16383, datPtr1+596+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in89 = _mm512_permutexvar_ps(pm17, dat75);
__m512 in97 = _mm512_permutexvar_ps(pm18, dat76);
__m512 dat77 = _mm512_maskz_loadu_ps(2047, datPtr1+184+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat78 = _mm512_maskz_loadu_ps(16383, datPtr1+688+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in90 = _mm512_permutexvar_ps(pm17, dat77);
__m512 in98 = _mm512_permutexvar_ps(pm18, dat78);
__m512 dat79 = _mm512_maskz_loadu_ps(2047, datPtr1+276+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat80 = _mm512_maskz_loadu_ps(16383, datPtr1+780+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in91 = _mm512_permutexvar_ps(pm17, dat79);
__m512 in99 = _mm512_permutexvar_ps(pm18, dat80);
__m512 dat81 = _mm512_maskz_loadu_ps(2047, datPtr1+368+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat82 = _mm512_maskz_loadu_ps(16383, datPtr1+872+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in92 = _mm512_permutexvar_ps(pm17, dat81);
__m512 in100 = _mm512_permutexvar_ps(pm18, dat82);
__m512 dat83 = _mm512_maskz_loadu_ps(2047, datPtr1+460+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat84 = _mm512_maskz_loadu_ps(16383, datPtr1+964+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in93 = _mm512_permutexvar_ps(pm17, dat83);
__m512 in101 = _mm512_permutexvar_ps(pm18, dat84);
__m512 dat85 = _mm512_maskz_loadu_ps(2047, datPtr1+552+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat86 = _mm512_maskz_loadu_ps(16383, datPtr1+1056+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in94 = _mm512_permutexvar_ps(pm17, dat85);
__m512 in102 = _mm512_permutexvar_ps(pm18, dat86);
__m512 dat87 = _mm512_maskz_loadu_ps(2047, datPtr1+644+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat88 = _mm512_maskz_loadu_ps(16383, datPtr1+1148+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in95 = _mm512_permutexvar_ps(pm17, dat87);
__m512 in103 = _mm512_permutexvar_ps(pm18, dat88);
__m512 tmp530 = _mm512_add_ps(in89, in93);
__m512 tmp534 = _mm512_add_ps(in97, in101);
__m512 tmp531 = _mm512_sub_ps(in92, in90);
__m512 tmp535 = _mm512_sub_ps(in100, in98);
__m512 tmp532 = _mm512_add_ps(in90, in94);
__m512 tmp536 = _mm512_add_ps(in98, in102);
in88 = _mm512_sub_ps(in88, in94);
in96 = _mm512_sub_ps(in96, in102);
tmp530 = _mm512_fmadd_ps(in91, _mm512_set1_ps(-4.25e+00f), tmp530);
tmp534 = _mm512_fmadd_ps(in99, _mm512_set1_ps(-4.25e+00f), tmp534);
tmp532 = _mm512_fmadd_ps(in92, _mm512_set1_ps(-4.25e+00f), tmp532);
tmp536 = _mm512_fmadd_ps(in100, _mm512_set1_ps(-4.25e+00f), tmp536);
in88 = _mm512_fmadd_ps(tmp531, _mm512_set1_ps(5.25e+00f), in88);
in96 = _mm512_fmadd_ps(tmp535, _mm512_set1_ps(5.25e+00f), in96);
tmp531 = _mm512_fmadd_ps(in90, _mm512_set1_ps(2.5e-01f), in94);
tmp535 = _mm512_fmadd_ps(in98, _mm512_set1_ps(2.5e-01f), in102);
in90 = _mm512_fmadd_ps(in90, _mm512_set1_ps(4e+00f), in94);
in98 = _mm512_fmadd_ps(in98, _mm512_set1_ps(4e+00f), in102);
__m512 tmp533 = _mm512_sub_ps(tmp532, tmp530);
__m512 tmp537 = _mm512_sub_ps(tmp536, tmp534);
tmp532 = _mm512_add_ps(tmp530, tmp532);
tmp536 = _mm512_add_ps(tmp534, tmp536);
tmp530 = _mm512_fmadd_ps(in89, _mm512_set1_ps(2.5e-01f), in93);
tmp534 = _mm512_fmadd_ps(in97, _mm512_set1_ps(2.5e-01f), in101);
tmp531 = _mm512_fmadd_ps(in92, _mm512_set1_ps(-1.25e+00f), tmp531);
tmp535 = _mm512_fmadd_ps(in100, _mm512_set1_ps(-1.25e+00f), tmp535);
in92 = _mm512_fmadd_ps(in92, _mm512_set1_ps(-5e+00f), in90);
in100 = _mm512_fmadd_ps(in100, _mm512_set1_ps(-5e+00f), in98);
tmp530 = _mm512_fmadd_ps(in91, _mm512_set1_ps(-1.25e+00f), tmp530);
tmp534 = _mm512_fmadd_ps(in99, _mm512_set1_ps(-1.25e+00f), tmp534);
in94 = _mm512_fmadd_ps(tmp530, _mm512_set1_ps(2e+00f), tmp531);
in102 = _mm512_fmadd_ps(tmp534, _mm512_set1_ps(2e+00f), tmp535);
tmp531 = _mm512_fnmadd_ps(tmp530, _mm512_set1_ps(2e+00f), tmp531);
tmp535 = _mm512_fnmadd_ps(tmp534, _mm512_set1_ps(2e+00f), tmp535);
tmp530 = _mm512_fmadd_ps(in93, _mm512_set1_ps(2.5e-01f), in89);
tmp534 = _mm512_fmadd_ps(in101, _mm512_set1_ps(2.5e-01f), in97);
in89 = _mm512_sub_ps(in95, in89);
in97 = _mm512_sub_ps(in103, in97);
tmp530 = _mm512_fmadd_ps(in91, _mm512_set1_ps(-1.25e+00f), tmp530);
tmp534 = _mm512_fmadd_ps(in99, _mm512_set1_ps(-1.25e+00f), tmp534);
in91 = _mm512_sub_ps(in91, in93);
in99 = _mm512_sub_ps(in99, in101);
in91 = _mm512_fmadd_ps(in91, _mm512_set1_ps(5.25e+00f), in89);
in99 = _mm512_fmadd_ps(in99, _mm512_set1_ps(5.25e+00f), in97);
in90 = _mm512_fmadd_ps(tmp530, _mm512_set1_ps(2e+00f), in92);
in98 = _mm512_fmadd_ps(tmp534, _mm512_set1_ps(2e+00f), in100);
in92 = _mm512_fnmadd_ps(tmp530, _mm512_set1_ps(2e+00f), in92);
in100 = _mm512_fnmadd_ps(tmp534, _mm512_set1_ps(2e+00f), in100);
__m512 tmp546 = _mm512_unpacklo_ps(in88, tmp532);
__m512 tmp547 = _mm512_unpackhi_ps(in88, tmp532);
__m512 tmp548 = _mm512_unpacklo_ps(tmp533, in94);
__m512 tmp549 = _mm512_unpackhi_ps(tmp533, in94);
__m512 tmp550 = _mm512_unpacklo_ps(tmp531, in90);
__m512 tmp551 = _mm512_unpackhi_ps(tmp531, in90);
__m512 tmp552 = _mm512_unpacklo_ps(in92, in91);
__m512 tmp553 = _mm512_unpackhi_ps(in92, in91);
__m512 tmp554 = _mm512_unpacklo_ps(in96, tmp536);
__m512 tmp555 = _mm512_unpackhi_ps(in96, tmp536);
__m512 tmp556 = _mm512_unpacklo_ps(tmp537, in102);
__m512 tmp557 = _mm512_unpackhi_ps(tmp537, in102);
__m512 tmp558 = _mm512_unpacklo_ps(tmp535, in98);
__m512 tmp559 = _mm512_unpackhi_ps(tmp535, in98);
__m512 tmp560 = _mm512_unpacklo_ps(in100, in99);
__m512 tmp561 = _mm512_unpackhi_ps(in100, in99);
__m512 tmp562 = _mm512_shuffle_ps(tmp546, tmp548, 68);
__m512 tmp563 = _mm512_shuffle_ps(tmp546, tmp548, 238);
__m512 tmp564 = _mm512_shuffle_ps(tmp547, tmp549, 68);
__m512 tmp565 = _mm512_shuffle_ps(tmp547, tmp549, 238);
__m512 tmp566 = _mm512_shuffle_ps(tmp550, tmp552, 68);
__m512 tmp567 = _mm512_shuffle_ps(tmp550, tmp552, 238);
__m512 tmp568 = _mm512_shuffle_ps(tmp551, tmp553, 68);
__m512 tmp569 = _mm512_shuffle_ps(tmp551, tmp553, 238);
__m512 tmp570 = _mm512_shuffle_ps(tmp554, tmp556, 68);
__m512 tmp571 = _mm512_shuffle_ps(tmp554, tmp556, 238);
__m512 tmp572 = _mm512_shuffle_ps(tmp555, tmp557, 68);
__m512 tmp573 = _mm512_shuffle_ps(tmp555, tmp557, 238);
__m512 tmp574 = _mm512_shuffle_ps(tmp558, tmp560, 68);
__m512 tmp575 = _mm512_shuffle_ps(tmp558, tmp560, 238);
__m512 tmp576 = _mm512_shuffle_ps(tmp559, tmp561, 68);
__m512 tmp577 = _mm512_shuffle_ps(tmp559, tmp561, 238);
__m512 tmp578 = _mm512_shuffle_f32x4(tmp562, tmp566, 136);
__m512 tmp579 = _mm512_shuffle_f32x4(tmp562, tmp566, 221);
__m512 tmp580 = _mm512_shuffle_f32x4(tmp563, tmp567, 136);
__m512 tmp581 = _mm512_shuffle_f32x4(tmp563, tmp567, 221);
__m512 tmp582 = _mm512_shuffle_f32x4(tmp564, tmp568, 136);
__m512 tmp583 = _mm512_shuffle_f32x4(tmp564, tmp568, 221);
__m512 tmp584 = _mm512_shuffle_f32x4(tmp565, tmp569, 136);
__m512 tmp585 = _mm512_shuffle_f32x4(tmp565, tmp569, 221);
__m512 tmp586 = _mm512_shuffle_f32x4(tmp570, tmp574, 136);
__m512 tmp587 = _mm512_shuffle_f32x4(tmp570, tmp574, 221);
__m512 tmp588 = _mm512_shuffle_f32x4(tmp571, tmp575, 136);
__m512 tmp589 = _mm512_shuffle_f32x4(tmp571, tmp575, 221);
__m512 tmp590 = _mm512_shuffle_f32x4(tmp572, tmp576, 136);
__m512 tmp591 = _mm512_shuffle_f32x4(tmp572, tmp576, 221);
__m512 tmp592 = _mm512_shuffle_f32x4(tmp573, tmp577, 136);
__m512 tmp593 = _mm512_shuffle_f32x4(tmp573, tmp577, 221);
in88 = _mm512_shuffle_f32x4(tmp578, tmp586, 136);
in96 = _mm512_shuffle_f32x4(tmp578, tmp586, 221);
tmp532 = _mm512_shuffle_f32x4(tmp580, tmp588, 136);
tmp536 = _mm512_shuffle_f32x4(tmp580, tmp588, 221);
tmp533 = _mm512_shuffle_f32x4(tmp582, tmp590, 136);
tmp537 = _mm512_shuffle_f32x4(tmp582, tmp590, 221);
in94 = _mm512_shuffle_f32x4(tmp584, tmp592, 136);
in102 = _mm512_shuffle_f32x4(tmp584, tmp592, 221);
tmp531 = _mm512_shuffle_f32x4(tmp579, tmp587, 136);
tmp535 = _mm512_shuffle_f32x4(tmp579, tmp587, 221);
in90 = _mm512_shuffle_f32x4(tmp581, tmp589, 136);
in98 = _mm512_shuffle_f32x4(tmp581, tmp589, 221);
in92 = _mm512_shuffle_f32x4(tmp583, tmp591, 136);
in100 = _mm512_shuffle_f32x4(tmp583, tmp591, 221);
in91 = _mm512_shuffle_f32x4(tmp585, tmp593, 136);
in99 = _mm512_shuffle_f32x4(tmp585, tmp593, 221);
__m512 tmp538 = _mm512_add_ps(tmp532, in90);
__m512 tmp542 = _mm512_add_ps(tmp536, in98);
__m512 tmp539 = _mm512_sub_ps(tmp531, tmp533);
__m512 tmp543 = _mm512_sub_ps(tmp535, tmp537);
__m512 tmp540 = _mm512_add_ps(tmp533, in92);
__m512 tmp544 = _mm512_add_ps(tmp537, in100);
in88 = _mm512_sub_ps(in88, in92);
in96 = _mm512_sub_ps(in96, in100);
tmp538 = _mm512_fmadd_ps(in94, _mm512_set1_ps(-4.25e+00f), tmp538);
tmp542 = _mm512_fmadd_ps(in102, _mm512_set1_ps(-4.25e+00f), tmp542);
tmp540 = _mm512_fmadd_ps(tmp531, _mm512_set1_ps(-4.25e+00f), tmp540);
tmp544 = _mm512_fmadd_ps(tmp535, _mm512_set1_ps(-4.25e+00f), tmp544);
in88 = _mm512_fmadd_ps(tmp539, _mm512_set1_ps(5.25e+00f), in88);
in96 = _mm512_fmadd_ps(tmp543, _mm512_set1_ps(5.25e+00f), in96);
tmp539 = _mm512_fmadd_ps(tmp533, _mm512_set1_ps(2.5e-01f), in92);
tmp543 = _mm512_fmadd_ps(tmp537, _mm512_set1_ps(2.5e-01f), in100);
tmp533 = _mm512_fmadd_ps(tmp533, _mm512_set1_ps(4e+00f), in92);
tmp537 = _mm512_fmadd_ps(tmp537, _mm512_set1_ps(4e+00f), in100);
__m512 tmp541 = _mm512_sub_ps(tmp540, tmp538);
__m512 tmp545 = _mm512_sub_ps(tmp544, tmp542);
tmp540 = _mm512_add_ps(tmp538, tmp540);
tmp544 = _mm512_add_ps(tmp542, tmp544);
tmp538 = _mm512_fmadd_ps(tmp532, _mm512_set1_ps(2.5e-01f), in90);
tmp542 = _mm512_fmadd_ps(tmp536, _mm512_set1_ps(2.5e-01f), in98);
tmp539 = _mm512_fmadd_ps(tmp531, _mm512_set1_ps(-1.25e+00f), tmp539);
tmp543 = _mm512_fmadd_ps(tmp535, _mm512_set1_ps(-1.25e+00f), tmp543);
tmp531 = _mm512_fmadd_ps(tmp531, _mm512_set1_ps(-5e+00f), tmp533);
tmp535 = _mm512_fmadd_ps(tmp535, _mm512_set1_ps(-5e+00f), tmp537);
tmp538 = _mm512_fmadd_ps(in94, _mm512_set1_ps(-1.25e+00f), tmp538);
tmp542 = _mm512_fmadd_ps(in102, _mm512_set1_ps(-1.25e+00f), tmp542);
in92 = _mm512_fmadd_ps(tmp538, _mm512_set1_ps(2e+00f), tmp539);
in100 = _mm512_fmadd_ps(tmp542, _mm512_set1_ps(2e+00f), tmp543);
tmp539 = _mm512_fnmadd_ps(tmp538, _mm512_set1_ps(2e+00f), tmp539);
tmp543 = _mm512_fnmadd_ps(tmp542, _mm512_set1_ps(2e+00f), tmp543);
tmp538 = _mm512_fmadd_ps(in90, _mm512_set1_ps(2.5e-01f), tmp532);
tmp542 = _mm512_fmadd_ps(in98, _mm512_set1_ps(2.5e-01f), tmp536);
tmp532 = _mm512_sub_ps(in91, tmp532);
tmp536 = _mm512_sub_ps(in99, tmp536);
tmp538 = _mm512_fmadd_ps(in94, _mm512_set1_ps(-1.25e+00f), tmp538);
tmp542 = _mm512_fmadd_ps(in102, _mm512_set1_ps(-1.25e+00f), tmp542);
in94 = _mm512_sub_ps(in94, in90);
in102 = _mm512_sub_ps(in102, in98);
in94 = _mm512_fmadd_ps(in94, _mm512_set1_ps(5.25e+00f), tmp532);
in102 = _mm512_fmadd_ps(in102, _mm512_set1_ps(5.25e+00f), tmp536);
tmp533 = _mm512_fmadd_ps(tmp538, _mm512_set1_ps(2e+00f), tmp531);
tmp537 = _mm512_fmadd_ps(tmp542, _mm512_set1_ps(2e+00f), tmp535);
tmp531 = _mm512_fnmadd_ps(tmp538, _mm512_set1_ps(2e+00f), tmp531);
tmp535 = _mm512_fnmadd_ps(tmp542, _mm512_set1_ps(2e+00f), tmp535);
__m512 out141 = _mm512_shuffle_f32x4(in88, tmp540, 68);
__m512 out149 = _mm512_shuffle_f32x4(in88, tmp540, 238);
__m512 out142 = _mm512_shuffle_f32x4(tmp541, in92, 68);
__m512 out150 = _mm512_shuffle_f32x4(tmp541, in92, 238);
__m512 out143 = _mm512_shuffle_f32x4(tmp539, tmp533, 68);
__m512 out151 = _mm512_shuffle_f32x4(tmp539, tmp533, 238);
__m512 out144 = _mm512_shuffle_f32x4(tmp531, in94, 68);
__m512 out152 = _mm512_shuffle_f32x4(tmp531, in94, 238);
__m512 out145 = _mm512_shuffle_f32x4(in96, tmp544, 68);
__m512 out153 = _mm512_shuffle_f32x4(in96, tmp544, 238);
__m512 out146 = _mm512_shuffle_f32x4(tmp545, in100, 68);
__m512 out154 = _mm512_shuffle_f32x4(tmp545, in100, 238);
__m512 out147 = _mm512_shuffle_f32x4(tmp543, tmp537, 68);
__m512 out155 = _mm512_shuffle_f32x4(tmp543, tmp537, 238);
__m512 out148 = _mm512_shuffle_f32x4(tmp535, in102, 68);
__m512 out156 = _mm512_shuffle_f32x4(tmp535, in102, 238);
_mm512_storeu_ps(dfPtr1+0+2433024*i7+152064*j3+38016*s5+768*k6, out141);
_mm512_storeu_ps(dfPtr1+128+2433024*i7+152064*j3+38016*s5+768*k6, out149);
_mm512_storeu_ps(dfPtr1+64+2433024*i7+152064*j3+38016*s5+768*k6, out145);
_mm512_storeu_ps(dfPtr1+192+2433024*i7+152064*j3+38016*s5+768*k6, out153);
_mm512_storeu_ps(dfPtr1+608256+2433024*i7+152064*j3+38016*s5+768*k6, out142);
_mm512_storeu_ps(dfPtr1+608384+2433024*i7+152064*j3+38016*s5+768*k6, out150);
_mm512_storeu_ps(dfPtr1+608320+2433024*i7+152064*j3+38016*s5+768*k6, out146);
_mm512_storeu_ps(dfPtr1+608448+2433024*i7+152064*j3+38016*s5+768*k6, out154);
_mm512_storeu_ps(dfPtr1+1216512+2433024*i7+152064*j3+38016*s5+768*k6, out143);
_mm512_storeu_ps(dfPtr1+1216640+2433024*i7+152064*j3+38016*s5+768*k6, out151);
_mm512_storeu_ps(dfPtr1+1216576+2433024*i7+152064*j3+38016*s5+768*k6, out147);
_mm512_storeu_ps(dfPtr1+1216704+2433024*i7+152064*j3+38016*s5+768*k6, out155);
_mm512_storeu_ps(dfPtr1+1824768+2433024*i7+152064*j3+38016*s5+768*k6, out144);
_mm512_storeu_ps(dfPtr1+1824896+2433024*i7+152064*j3+38016*s5+768*k6, out152);
_mm512_storeu_ps(dfPtr1+1824832+2433024*i7+152064*j3+38016*s5+768*k6, out148);
_mm512_storeu_ps(dfPtr1+1824960+2433024*i7+152064*j3+38016*s5+768*k6, out156);
// Load stage for in104..in119: sixteen masked loads from datPtr1, each followed by a lane permutation.
// Mask 2047 enables the low 11 lanes; the remaining lanes of every dat vector are zero (maskz).
// Two series are interleaved: dat89, dat91, ... start at +552 and dat90, dat92, ... start at +3496; both advance by 92 per step.
// NOTE(review): 92 matches one 23-wide float32 row of the graph's Input if datPtr1 is a byte pointer; its declaration is outside this view, confirm.
// pm19 keeps source lanes 0..7 in lanes 0..7, copies source lanes 6..10 into lanes 8..12, and fills lanes 13..15 from source lane 15, which the mask zeroed.
__m512 dat89 = _mm512_maskz_loadu_ps(2047, datPtr1+552+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat90 = _mm512_maskz_loadu_ps(2047, datPtr1+3496+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512i pm19 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in104 = _mm512_permutexvar_ps(pm19, dat89);
__m512 in112 = _mm512_permutexvar_ps(pm19, dat90);
__m512 dat91 = _mm512_maskz_loadu_ps(2047, datPtr1+644+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat92 = _mm512_maskz_loadu_ps(2047, datPtr1+3588+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in105 = _mm512_permutexvar_ps(pm19, dat91);
__m512 in113 = _mm512_permutexvar_ps(pm19, dat92);
__m512 dat93 = _mm512_maskz_loadu_ps(2047, datPtr1+736+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat94 = _mm512_maskz_loadu_ps(2047, datPtr1+3680+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in106 = _mm512_permutexvar_ps(pm19, dat93);
__m512 in114 = _mm512_permutexvar_ps(pm19, dat94);
__m512 dat95 = _mm512_maskz_loadu_ps(2047, datPtr1+828+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat96 = _mm512_maskz_loadu_ps(2047, datPtr1+3772+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in107 = _mm512_permutexvar_ps(pm19, dat95);
__m512 in115 = _mm512_permutexvar_ps(pm19, dat96);
__m512 dat97 = _mm512_maskz_loadu_ps(2047, datPtr1+920+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat98 = _mm512_maskz_loadu_ps(2047, datPtr1+3864+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in108 = _mm512_permutexvar_ps(pm19, dat97);
__m512 in116 = _mm512_permutexvar_ps(pm19, dat98);
__m512 dat99 = _mm512_maskz_loadu_ps(2047, datPtr1+1012+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat100 = _mm512_maskz_loadu_ps(2047, datPtr1+3956+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in109 = _mm512_permutexvar_ps(pm19, dat99);
__m512 in117 = _mm512_permutexvar_ps(pm19, dat100);
__m512 dat101 = _mm512_maskz_loadu_ps(2047, datPtr1+1104+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat102 = _mm512_maskz_loadu_ps(2047, datPtr1+4048+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in110 = _mm512_permutexvar_ps(pm19, dat101);
__m512 in118 = _mm512_permutexvar_ps(pm19, dat102);
__m512 dat103 = _mm512_maskz_loadu_ps(2047, datPtr1+1196+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat104 = _mm512_maskz_loadu_ps(2047, datPtr1+4140+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in111 = _mm512_permutexvar_ps(pm19, dat103);
__m512 in119 = _mm512_permutexvar_ps(pm19, dat104);
// First transform pass: lane-wise linear combinations of the eight vectors in104..in111.
// Statements come in pairs; the second of each pair applies the identical step to in112..in119.
// NOTE(review): the weights (5.25, -4.25, 0.25, -1.25, 4, -5, 2) match the 8-point Winograd F(6,3) input transform; confirm against the generator.
// The eight results per group end up in in104, tmp596, tmp597, in110, tmp595, in106, in108, in107
// (second group: in112, tmp600, tmp601, in118, tmp599, in114, in116, in115).
__m512 tmp594 = _mm512_add_ps(in105, in109);
__m512 tmp598 = _mm512_add_ps(in113, in117);
__m512 tmp595 = _mm512_sub_ps(in108, in106);
__m512 tmp599 = _mm512_sub_ps(in116, in114);
__m512 tmp596 = _mm512_add_ps(in106, in110);
__m512 tmp600 = _mm512_add_ps(in114, in118);
// in104 becomes in104 - in110 + 5.25*(in108 - in106) once the fmadd with tmp595 below has run.
in104 = _mm512_sub_ps(in104, in110);
in112 = _mm512_sub_ps(in112, in118);
// tmp594 = in105 + in109 - 4.25*in107 and tmp596 = in106 + in110 - 4.25*in108.
tmp594 = _mm512_fmadd_ps(in107, _mm512_set1_ps(-4.25e+00f), tmp594);
tmp598 = _mm512_fmadd_ps(in115, _mm512_set1_ps(-4.25e+00f), tmp598);
tmp596 = _mm512_fmadd_ps(in108, _mm512_set1_ps(-4.25e+00f), tmp596);
tmp600 = _mm512_fmadd_ps(in116, _mm512_set1_ps(-4.25e+00f), tmp600);
in104 = _mm512_fmadd_ps(tmp595, _mm512_set1_ps(5.25e+00f), in104);
in112 = _mm512_fmadd_ps(tmp599, _mm512_set1_ps(5.25e+00f), in112);
// tmp595 is reused here: it now starts accumulating 0.25*in106 + in110 - 1.25*in108.
tmp595 = _mm512_fmadd_ps(in106, _mm512_set1_ps(2.5e-01f), in110);
tmp599 = _mm512_fmadd_ps(in114, _mm512_set1_ps(2.5e-01f), in118);
// in106 is overwritten with 4*in106 + in110 while in110 still holds its loaded value.
in106 = _mm512_fmadd_ps(in106, _mm512_set1_ps(4e+00f), in110);
in114 = _mm512_fmadd_ps(in114, _mm512_set1_ps(4e+00f), in118);
// Difference and sum of the two accumulators: tmp597 = tmp596 - tmp594, then tmp596 = tmp594 + tmp596.
__m512 tmp597 = _mm512_sub_ps(tmp596, tmp594);
__m512 tmp601 = _mm512_sub_ps(tmp600, tmp598);
tmp596 = _mm512_add_ps(tmp594, tmp596);
tmp600 = _mm512_add_ps(tmp598, tmp600);
// tmp594 is reused: 0.25*in105 + in109 - 1.25*in107 after the fmadd with in107 below.
tmp594 = _mm512_fmadd_ps(in105, _mm512_set1_ps(2.5e-01f), in109);
tmp598 = _mm512_fmadd_ps(in113, _mm512_set1_ps(2.5e-01f), in117);
tmp595 = _mm512_fmadd_ps(in108, _mm512_set1_ps(-1.25e+00f), tmp595);
tmp599 = _mm512_fmadd_ps(in116, _mm512_set1_ps(-1.25e+00f), tmp599);
in108 = _mm512_fmadd_ps(in108, _mm512_set1_ps(-5e+00f), in106);
in116 = _mm512_fmadd_ps(in116, _mm512_set1_ps(-5e+00f), in114);
tmp594 = _mm512_fmadd_ps(in107, _mm512_set1_ps(-1.25e+00f), tmp594);
tmp598 = _mm512_fmadd_ps(in115, _mm512_set1_ps(-1.25e+00f), tmp598);
// in110 = tmp595 + 2*tmp594, then tmp595 = tmp595 - 2*tmp594 (fnmadd negates the product).
in110 = _mm512_fmadd_ps(tmp594, _mm512_set1_ps(2e+00f), tmp595);
in118 = _mm512_fmadd_ps(tmp598, _mm512_set1_ps(2e+00f), tmp599);
tmp595 = _mm512_fnmadd_ps(tmp594, _mm512_set1_ps(2e+00f), tmp595);
tmp599 = _mm512_fnmadd_ps(tmp598, _mm512_set1_ps(2e+00f), tmp599);
// tmp594 is reused again: 0.25*in109 + in105 - 1.25*in107; in105 must be read here before it is overwritten.
tmp594 = _mm512_fmadd_ps(in109, _mm512_set1_ps(2.5e-01f), in105);
tmp598 = _mm512_fmadd_ps(in117, _mm512_set1_ps(2.5e-01f), in113);
// in105 and in107 together form in111 - in105 + 5.25*(in107 - in109); the result is kept in in107.
in105 = _mm512_sub_ps(in111, in105);
in113 = _mm512_sub_ps(in119, in113);
tmp594 = _mm512_fmadd_ps(in107, _mm512_set1_ps(-1.25e+00f), tmp594);
tmp598 = _mm512_fmadd_ps(in115, _mm512_set1_ps(-1.25e+00f), tmp598);
in107 = _mm512_sub_ps(in107, in109);
in115 = _mm512_sub_ps(in115, in117);
in107 = _mm512_fmadd_ps(in107, _mm512_set1_ps(5.25e+00f), in105);
in115 = _mm512_fmadd_ps(in115, _mm512_set1_ps(5.25e+00f), in113);
// in106 = in108 + 2*tmp594, then in108 = in108 - 2*tmp594.
in106 = _mm512_fmadd_ps(tmp594, _mm512_set1_ps(2e+00f), in108);
in114 = _mm512_fmadd_ps(tmp598, _mm512_set1_ps(2e+00f), in116);
in108 = _mm512_fnmadd_ps(tmp594, _mm512_set1_ps(2e+00f), in108);
in116 = _mm512_fnmadd_ps(tmp598, _mm512_set1_ps(2e+00f), in116);
// Transpose the sixteen first-pass vectors, viewed as a 16x16 float matrix, so lane index and vector index swap roles.
// Step 1: unpacklo/unpackhi interleave elements of neighbouring vector pairs inside each 128-bit lane.
__m512 tmp611 = _mm512_unpacklo_ps(in104, tmp596);
__m512 tmp612 = _mm512_unpackhi_ps(in104, tmp596);
__m512 tmp613 = _mm512_unpacklo_ps(tmp597, in110);
__m512 tmp614 = _mm512_unpackhi_ps(tmp597, in110);
__m512 tmp615 = _mm512_unpacklo_ps(tmp595, in106);
__m512 tmp616 = _mm512_unpackhi_ps(tmp595, in106);
__m512 tmp617 = _mm512_unpacklo_ps(in108, in107);
__m512 tmp618 = _mm512_unpackhi_ps(in108, in107);
__m512 tmp619 = _mm512_unpacklo_ps(in112, tmp600);
__m512 tmp620 = _mm512_unpackhi_ps(in112, tmp600);
__m512 tmp621 = _mm512_unpacklo_ps(tmp601, in118);
__m512 tmp622 = _mm512_unpackhi_ps(tmp601, in118);
__m512 tmp623 = _mm512_unpacklo_ps(tmp599, in114);
__m512 tmp624 = _mm512_unpackhi_ps(tmp599, in114);
__m512 tmp625 = _mm512_unpacklo_ps(in116, in115);
__m512 tmp626 = _mm512_unpackhi_ps(in116, in115);
// Step 2: shuffle_ps with 68 takes the low element pair of both operands, 238 the high pair,
// so each 128-bit lane now holds one column element from four consecutive vectors.
__m512 tmp627 = _mm512_shuffle_ps(tmp611, tmp613, 68);
__m512 tmp628 = _mm512_shuffle_ps(tmp611, tmp613, 238);
__m512 tmp629 = _mm512_shuffle_ps(tmp612, tmp614, 68);
__m512 tmp630 = _mm512_shuffle_ps(tmp612, tmp614, 238);
__m512 tmp631 = _mm512_shuffle_ps(tmp615, tmp617, 68);
__m512 tmp632 = _mm512_shuffle_ps(tmp615, tmp617, 238);
__m512 tmp633 = _mm512_shuffle_ps(tmp616, tmp618, 68);
__m512 tmp634 = _mm512_shuffle_ps(tmp616, tmp618, 238);
__m512 tmp635 = _mm512_shuffle_ps(tmp619, tmp621, 68);
__m512 tmp636 = _mm512_shuffle_ps(tmp619, tmp621, 238);
__m512 tmp637 = _mm512_shuffle_ps(tmp620, tmp622, 68);
__m512 tmp638 = _mm512_shuffle_ps(tmp620, tmp622, 238);
__m512 tmp639 = _mm512_shuffle_ps(tmp623, tmp625, 68);
__m512 tmp640 = _mm512_shuffle_ps(tmp623, tmp625, 238);
__m512 tmp641 = _mm512_shuffle_ps(tmp624, tmp626, 68);
__m512 tmp642 = _mm512_shuffle_ps(tmp624, tmp626, 238);
// Step 3: shuffle_f32x4 with 136 picks 128-bit lanes 0 and 2 of each operand, 221 picks lanes 1 and 3.
__m512 tmp643 = _mm512_shuffle_f32x4(tmp627, tmp631, 136);
__m512 tmp644 = _mm512_shuffle_f32x4(tmp627, tmp631, 221);
__m512 tmp645 = _mm512_shuffle_f32x4(tmp628, tmp632, 136);
__m512 tmp646 = _mm512_shuffle_f32x4(tmp628, tmp632, 221);
__m512 tmp647 = _mm512_shuffle_f32x4(tmp629, tmp633, 136);
__m512 tmp648 = _mm512_shuffle_f32x4(tmp629, tmp633, 221);
__m512 tmp649 = _mm512_shuffle_f32x4(tmp630, tmp634, 136);
__m512 tmp650 = _mm512_shuffle_f32x4(tmp630, tmp634, 221);
__m512 tmp651 = _mm512_shuffle_f32x4(tmp635, tmp639, 136);
__m512 tmp652 = _mm512_shuffle_f32x4(tmp635, tmp639, 221);
__m512 tmp653 = _mm512_shuffle_f32x4(tmp636, tmp640, 136);
__m512 tmp654 = _mm512_shuffle_f32x4(tmp636, tmp640, 221);
__m512 tmp655 = _mm512_shuffle_f32x4(tmp637, tmp641, 136);
__m512 tmp656 = _mm512_shuffle_f32x4(tmp637, tmp641, 221);
__m512 tmp657 = _mm512_shuffle_f32x4(tmp638, tmp642, 136);
__m512 tmp658 = _mm512_shuffle_f32x4(tmp638, tmp642, 221);
// Step 4: a second shuffle_f32x4 round joins the halves built from the first eight and the last eight vectors.
// With 136 the result is source column c (c = 0..7), with 221 it is column c + 8.
// Only 13 columns are materialised: columns 13..15 have no 221 counterpart for the last three statements.
// Those columns come from lanes 13..15, which pm19 filled from the masked-off lane 15, so they are zero on entry to the first pass.
in104 = _mm512_shuffle_f32x4(tmp643, tmp651, 136);
in112 = _mm512_shuffle_f32x4(tmp643, tmp651, 221);
tmp596 = _mm512_shuffle_f32x4(tmp645, tmp653, 136);
tmp600 = _mm512_shuffle_f32x4(tmp645, tmp653, 221);
tmp597 = _mm512_shuffle_f32x4(tmp647, tmp655, 136);
tmp601 = _mm512_shuffle_f32x4(tmp647, tmp655, 221);
in110 = _mm512_shuffle_f32x4(tmp649, tmp657, 136);
in118 = _mm512_shuffle_f32x4(tmp649, tmp657, 221);
tmp595 = _mm512_shuffle_f32x4(tmp644, tmp652, 136);
tmp599 = _mm512_shuffle_f32x4(tmp644, tmp652, 221);
in106 = _mm512_shuffle_f32x4(tmp646, tmp654, 136);
in108 = _mm512_shuffle_f32x4(tmp648, tmp656, 136);
in107 = _mm512_shuffle_f32x4(tmp650, tmp658, 136);
// Second transform pass, applied to the transposed vectors with the same coefficient pattern as the first pass.
// Left group (in104, tmp596, tmp597, in110, tmp595, in106, in108, in107) uses the full eight-input formulas.
// Right group has only five inputs (in112, tmp600, tmp601, in118, tmp599); the other three columns were not
// materialised by the transpose, so their terms are absent and several right-group statements
// reduce to a copy, a plain multiply, a negation, or a self-assignment.
// Results: in104, tmp604, tmp605, in108, tmp603, tmp597, tmp595, in110 (left) and
// in112, tmp608, tmp609, tmp610, tmp607, tmp601, tmp599, in118 (right).
__m512 tmp602 = _mm512_add_ps(tmp596, in106);
// Right-group counterpart of the add above with its second operand missing: a plain copy.
__m512 tmp606 = tmp600;
__m512 tmp603 = _mm512_sub_ps(tmp595, tmp597);
__m512 tmp607 = _mm512_sub_ps(tmp599, tmp601);
__m512 tmp604 = _mm512_add_ps(tmp597, in108);
__m512 tmp608 = tmp601;
in104 = _mm512_sub_ps(in104, in108);
// No-op self-assignment: the right group has no counterpart of in108 to subtract.
in112 = in112;
tmp602 = _mm512_fmadd_ps(in110, _mm512_set1_ps(-4.25e+00f), tmp602);
tmp606 = _mm512_fmadd_ps(in118, _mm512_set1_ps(-4.25e+00f), tmp606);
tmp604 = _mm512_fmadd_ps(tmp595, _mm512_set1_ps(-4.25e+00f), tmp604);
tmp608 = _mm512_fmadd_ps(tmp599, _mm512_set1_ps(-4.25e+00f), tmp608);
in104 = _mm512_fmadd_ps(tmp603, _mm512_set1_ps(5.25e+00f), in104);
in112 = _mm512_fmadd_ps(tmp607, _mm512_set1_ps(5.25e+00f), in112);
tmp603 = _mm512_fmadd_ps(tmp597, _mm512_set1_ps(2.5e-01f), in108);
// Right group: the addend is missing, so the fmadd of the left group becomes a multiply.
tmp607 = _mm512_mul_ps(tmp601, _mm512_set1_ps(2.5e-01f));
tmp597 = _mm512_fmadd_ps(tmp597, _mm512_set1_ps(4e+00f), in108);
tmp601 = _mm512_mul_ps(tmp601, _mm512_set1_ps(4e+00f));
__m512 tmp605 = _mm512_sub_ps(tmp604, tmp602);
__m512 tmp609 = _mm512_sub_ps(tmp608, tmp606);
tmp604 = _mm512_add_ps(tmp602, tmp604);
tmp608 = _mm512_add_ps(tmp606, tmp608);
tmp602 = _mm512_fmadd_ps(tmp596, _mm512_set1_ps(2.5e-01f), in106);
tmp606 = _mm512_mul_ps(tmp600, _mm512_set1_ps(2.5e-01f));
tmp603 = _mm512_fmadd_ps(tmp595, _mm512_set1_ps(-1.25e+00f), tmp603);
tmp607 = _mm512_fmadd_ps(tmp599, _mm512_set1_ps(-1.25e+00f), tmp607);
tmp595 = _mm512_fmadd_ps(tmp595, _mm512_set1_ps(-5e+00f), tmp597);
tmp599 = _mm512_fmadd_ps(tmp599, _mm512_set1_ps(-5e+00f), tmp601);
tmp602 = _mm512_fmadd_ps(in110, _mm512_set1_ps(-1.25e+00f), tmp602);
tmp606 = _mm512_fmadd_ps(in118, _mm512_set1_ps(-1.25e+00f), tmp606);
// The left result overwrites in108; the right group has no such variable, so a fresh tmp610 is declared.
in108 = _mm512_fmadd_ps(tmp602, _mm512_set1_ps(2e+00f), tmp603);
__m512 tmp610 = _mm512_fmadd_ps(tmp606, _mm512_set1_ps(2e+00f), tmp607);
tmp603 = _mm512_fnmadd_ps(tmp602, _mm512_set1_ps(2e+00f), tmp603);
tmp607 = _mm512_fnmadd_ps(tmp606, _mm512_set1_ps(2e+00f), tmp607);
tmp602 = _mm512_fmadd_ps(in106, _mm512_set1_ps(2.5e-01f), tmp596);
// Right group: the 0.25-weighted input is missing, leaving a copy of tmp600.
tmp606 = tmp600;
tmp596 = _mm512_sub_ps(in107, tmp596);
// Right group: the minuend is missing, so this is the negation 0 - tmp600.
tmp600 = _mm512_sub_ps(_mm512_setzero_ps(), tmp600);
tmp602 = _mm512_fmadd_ps(in110, _mm512_set1_ps(-1.25e+00f), tmp602);
tmp606 = _mm512_fmadd_ps(in118, _mm512_set1_ps(-1.25e+00f), tmp606);
in110 = _mm512_sub_ps(in110, in106);
// No-op self-assignment: the right group has no counterpart of in106 to subtract.
in118 = in118;
in110 = _mm512_fmadd_ps(in110, _mm512_set1_ps(5.25e+00f), tmp596);
in118 = _mm512_fmadd_ps(in118, _mm512_set1_ps(5.25e+00f), tmp600);
tmp597 = _mm512_fmadd_ps(tmp602, _mm512_set1_ps(2e+00f), tmp595);
tmp601 = _mm512_fmadd_ps(tmp606, _mm512_set1_ps(2e+00f), tmp599);
tmp595 = _mm512_fnmadd_ps(tmp602, _mm512_set1_ps(2e+00f), tmp595);
tmp599 = _mm512_fnmadd_ps(tmp606, _mm512_set1_ps(2e+00f), tmp599);
// Pack and store the sixteen second-pass results.
// Each pair of result vectors is recombined by 256-bit halves: immediate 68 joins the low halves of both
// operands, 238 joins their high halves.
__m512 out157 = _mm512_shuffle_f32x4(in104, tmp604, 68);
__m512 out165 = _mm512_shuffle_f32x4(in104, tmp604, 238);
__m512 out158 = _mm512_shuffle_f32x4(tmp605, in108, 68);
__m512 out166 = _mm512_shuffle_f32x4(tmp605, in108, 238);
__m512 out159 = _mm512_shuffle_f32x4(tmp603, tmp597, 68);
__m512 out167 = _mm512_shuffle_f32x4(tmp603, tmp597, 238);
__m512 out160 = _mm512_shuffle_f32x4(tmp595, in110, 68);
__m512 out168 = _mm512_shuffle_f32x4(tmp595, in110, 238);
__m512 out161 = _mm512_shuffle_f32x4(in112, tmp608, 68);
__m512 out169 = _mm512_shuffle_f32x4(in112, tmp608, 238);
__m512 out162 = _mm512_shuffle_f32x4(tmp609, tmp610, 68);
__m512 out170 = _mm512_shuffle_f32x4(tmp609, tmp610, 238);
__m512 out163 = _mm512_shuffle_f32x4(tmp607, tmp601, 68);
__m512 out171 = _mm512_shuffle_f32x4(tmp607, tmp601, 238);
__m512 out164 = _mm512_shuffle_f32x4(tmp599, in118, 68);
__m512 out172 = _mm512_shuffle_f32x4(tmp599, in118, 238);
// Unaligned stores to dfPtr1 in four groups whose constant offsets are 608256 apart (256, 608512, 1216768, 1825024).
// Inside a group the stores go to +0, +128, +64 and +192 relative to the group's first offset:
// low halves of the left pair, high halves of the left pair, then low and high halves of the right pair.
// NOTE(review): the 64-unit spacing equals one 512-bit vector if dfPtr1 is a byte pointer; its declaration is not visible here.
_mm512_storeu_ps(dfPtr1+256+2433024*i7+152064*j3+38016*s5+768*k6, out157);
_mm512_storeu_ps(dfPtr1+384+2433024*i7+152064*j3+38016*s5+768*k6, out165);
_mm512_storeu_ps(dfPtr1+320+2433024*i7+152064*j3+38016*s5+768*k6, out161);
_mm512_storeu_ps(dfPtr1+448+2433024*i7+152064*j3+38016*s5+768*k6, out169);
_mm512_storeu_ps(dfPtr1+608512+2433024*i7+152064*j3+38016*s5+768*k6, out158);
_mm512_storeu_ps(dfPtr1+608640+2433024*i7+152064*j3+38016*s5+768*k6, out166);
_mm512_storeu_ps(dfPtr1+608576+2433024*i7+152064*j3+38016*s5+768*k6, out162);
_mm512_storeu_ps(dfPtr1+608704+2433024*i7+152064*j3+38016*s5+768*k6, out170);
_mm512_storeu_ps(dfPtr1+1216768+2433024*i7+152064*j3+38016*s5+768*k6, out159);
_mm512_storeu_ps(dfPtr1+1216896+2433024*i7+152064*j3+38016*s5+768*k6, out167);
_mm512_storeu_ps(dfPtr1+1216832+2433024*i7+152064*j3+38016*s5+768*k6, out163);
_mm512_storeu_ps(dfPtr1+1216960+2433024*i7+152064*j3+38016*s5+768*k6, out171);
_mm512_storeu_ps(dfPtr1+1825024+2433024*i7+152064*j3+38016*s5+768*k6, out160);
_mm512_storeu_ps(dfPtr1+1825152+2433024*i7+152064*j3+38016*s5+768*k6, out168);
_mm512_storeu_ps(dfPtr1+1825088+2433024*i7+152064*j3+38016*s5+768*k6, out164);
_mm512_storeu_ps(dfPtr1+1825216+2433024*i7+152064*j3+38016*s5+768*k6, out172);
// Load stage for in120..in135: sixteen masked loads from datPtr1, each followed by a lane permutation.
// The first series (dat105, dat107, ...) uses mask 16383 (low 14 lanes) and starts at +4000;
// the second series (dat106, dat108, ...) uses mask 2047 (low 11 lanes) and starts 48 further on at +4048.
// Both series advance by 92 per step, and masked-off lanes read as zero.
// pm20 keeps source lanes 0..7 and copies source lanes 6..13 into lanes 8..15, so every lane holds loaded data.
// pm21 keeps source lanes 0..7, copies source lanes 6..10 into lanes 8..12, and fills lanes 13..15 from the zeroed lane 15.
__m512 dat105 = _mm512_maskz_loadu_ps(16383, datPtr1+4000+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat106 = _mm512_maskz_loadu_ps(2047, datPtr1+4048+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512i pm20 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in120 = _mm512_permutexvar_ps(pm20, dat105);
__m512i pm21 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in128 = _mm512_permutexvar_ps(pm21, dat106);
__m512 dat107 = _mm512_maskz_loadu_ps(16383, datPtr1+4092+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat108 = _mm512_maskz_loadu_ps(2047, datPtr1+4140+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in121 = _mm512_permutexvar_ps(pm20, dat107);
__m512 in129 = _mm512_permutexvar_ps(pm21, dat108);
__m512 dat109 = _mm512_maskz_loadu_ps(16383, datPtr1+4184+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat110 = _mm512_maskz_loadu_ps(2047, datPtr1+4232+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in122 = _mm512_permutexvar_ps(pm20, dat109);
__m512 in130 = _mm512_permutexvar_ps(pm21, dat110);
__m512 dat111 = _mm512_maskz_loadu_ps(16383, datPtr1+4276+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat112 = _mm512_maskz_loadu_ps(2047, datPtr1+4324+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in123 = _mm512_permutexvar_ps(pm20, dat111);
__m512 in131 = _mm512_permutexvar_ps(pm21, dat112);
__m512 dat113 = _mm512_maskz_loadu_ps(16383, datPtr1+4368+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat114 = _mm512_maskz_loadu_ps(2047, datPtr1+4416+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in124 = _mm512_permutexvar_ps(pm20, dat113);
__m512 in132 = _mm512_permutexvar_ps(pm21, dat114);
__m512 dat115 = _mm512_maskz_loadu_ps(16383, datPtr1+4460+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat116 = _mm512_maskz_loadu_ps(2047, datPtr1+4508+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in125 = _mm512_permutexvar_ps(pm20, dat115);
__m512 in133 = _mm512_permutexvar_ps(pm21, dat116);
__m512 dat117 = _mm512_maskz_loadu_ps(16383, datPtr1+4552+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat118 = _mm512_maskz_loadu_ps(2047, datPtr1+4600+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in126 = _mm512_permutexvar_ps(pm20, dat117);
__m512 in134 = _mm512_permutexvar_ps(pm21, dat118);
__m512 dat119 = _mm512_maskz_loadu_ps(16383, datPtr1+4644+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat120 = _mm512_maskz_loadu_ps(2047, datPtr1+4692+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in127 = _mm512_permutexvar_ps(pm20, dat119);
__m512 in135 = _mm512_permutexvar_ps(pm21, dat120);
// First transform pass: lane-wise linear combinations of the eight vectors in120..in127.
// Statements come in pairs; the second of each pair applies the identical step to in128..in135.
// NOTE(review): the weights (5.25, -4.25, 0.25, -1.25, 4, -5, 2) match the 8-point Winograd F(6,3) input transform; confirm against the generator.
// The eight results per group end up in in120, tmp661, tmp662, in126, tmp660, in122, in124, in123
// (second group: in128, tmp665, tmp666, in134, tmp664, in130, in132, in131).
__m512 tmp659 = _mm512_add_ps(in121, in125);
__m512 tmp663 = _mm512_add_ps(in129, in133);
__m512 tmp660 = _mm512_sub_ps(in124, in122);
__m512 tmp664 = _mm512_sub_ps(in132, in130);
__m512 tmp661 = _mm512_add_ps(in122, in126);
__m512 tmp665 = _mm512_add_ps(in130, in134);
// in120 becomes in120 - in126 + 5.25*(in124 - in122) once the fmadd with tmp660 below has run.
in120 = _mm512_sub_ps(in120, in126);
in128 = _mm512_sub_ps(in128, in134);
// tmp659 = in121 + in125 - 4.25*in123 and tmp661 = in122 + in126 - 4.25*in124.
tmp659 = _mm512_fmadd_ps(in123, _mm512_set1_ps(-4.25e+00f), tmp659);
tmp663 = _mm512_fmadd_ps(in131, _mm512_set1_ps(-4.25e+00f), tmp663);
tmp661 = _mm512_fmadd_ps(in124, _mm512_set1_ps(-4.25e+00f), tmp661);
tmp665 = _mm512_fmadd_ps(in132, _mm512_set1_ps(-4.25e+00f), tmp665);
in120 = _mm512_fmadd_ps(tmp660, _mm512_set1_ps(5.25e+00f), in120);
in128 = _mm512_fmadd_ps(tmp664, _mm512_set1_ps(5.25e+00f), in128);
// tmp660 is reused here: it now starts accumulating 0.25*in122 + in126 - 1.25*in124.
tmp660 = _mm512_fmadd_ps(in122, _mm512_set1_ps(2.5e-01f), in126);
tmp664 = _mm512_fmadd_ps(in130, _mm512_set1_ps(2.5e-01f), in134);
// in122 is overwritten with 4*in122 + in126 while in126 still holds its loaded value.
in122 = _mm512_fmadd_ps(in122, _mm512_set1_ps(4e+00f), in126);
in130 = _mm512_fmadd_ps(in130, _mm512_set1_ps(4e+00f), in134);
// Difference and sum of the two accumulators: tmp662 = tmp661 - tmp659, then tmp661 = tmp659 + tmp661.
__m512 tmp662 = _mm512_sub_ps(tmp661, tmp659);
__m512 tmp666 = _mm512_sub_ps(tmp665, tmp663);
tmp661 = _mm512_add_ps(tmp659, tmp661);
tmp665 = _mm512_add_ps(tmp663, tmp665);
// tmp659 is reused: 0.25*in121 + in125 - 1.25*in123 after the fmadd with in123 below.
tmp659 = _mm512_fmadd_ps(in121, _mm512_set1_ps(2.5e-01f), in125);
tmp663 = _mm512_fmadd_ps(in129, _mm512_set1_ps(2.5e-01f), in133);
tmp660 = _mm512_fmadd_ps(in124, _mm512_set1_ps(-1.25e+00f), tmp660);
tmp664 = _mm512_fmadd_ps(in132, _mm512_set1_ps(-1.25e+00f), tmp664);
in124 = _mm512_fmadd_ps(in124, _mm512_set1_ps(-5e+00f), in122);
in132 = _mm512_fmadd_ps(in132, _mm512_set1_ps(-5e+00f), in130);
tmp659 = _mm512_fmadd_ps(in123, _mm512_set1_ps(-1.25e+00f), tmp659);
tmp663 = _mm512_fmadd_ps(in131, _mm512_set1_ps(-1.25e+00f), tmp663);
// in126 = tmp660 + 2*tmp659, then tmp660 = tmp660 - 2*tmp659 (fnmadd negates the product).
in126 = _mm512_fmadd_ps(tmp659, _mm512_set1_ps(2e+00f), tmp660);
in134 = _mm512_fmadd_ps(tmp663, _mm512_set1_ps(2e+00f), tmp664);
tmp660 = _mm512_fnmadd_ps(tmp659, _mm512_set1_ps(2e+00f), tmp660);
tmp664 = _mm512_fnmadd_ps(tmp663, _mm512_set1_ps(2e+00f), tmp664);
// tmp659 is reused again: 0.25*in125 + in121 - 1.25*in123; in121 must be read here before it is overwritten.
tmp659 = _mm512_fmadd_ps(in125, _mm512_set1_ps(2.5e-01f), in121);
tmp663 = _mm512_fmadd_ps(in133, _mm512_set1_ps(2.5e-01f), in129);
// in121 and in123 together form in127 - in121 + 5.25*(in123 - in125); the result is kept in in123.
in121 = _mm512_sub_ps(in127, in121);
in129 = _mm512_sub_ps(in135, in129);
tmp659 = _mm512_fmadd_ps(in123, _mm512_set1_ps(-1.25e+00f), tmp659);
tmp663 = _mm512_fmadd_ps(in131, _mm512_set1_ps(-1.25e+00f), tmp663);
in123 = _mm512_sub_ps(in123, in125);
in131 = _mm512_sub_ps(in131, in133);
in123 = _mm512_fmadd_ps(in123, _mm512_set1_ps(5.25e+00f), in121);
in131 = _mm512_fmadd_ps(in131, _mm512_set1_ps(5.25e+00f), in129);
// in122 = in124 + 2*tmp659, then in124 = in124 - 2*tmp659.
in122 = _mm512_fmadd_ps(tmp659, _mm512_set1_ps(2e+00f), in124);
in130 = _mm512_fmadd_ps(tmp663, _mm512_set1_ps(2e+00f), in132);
in124 = _mm512_fnmadd_ps(tmp659, _mm512_set1_ps(2e+00f), in124);
in132 = _mm512_fnmadd_ps(tmp663, _mm512_set1_ps(2e+00f), in132);
// Transpose the sixteen first-pass vectors, viewed as a 16x16 float matrix, so lane index and vector index swap roles.
// Step 1: unpacklo/unpackhi interleave elements of neighbouring vector pairs inside each 128-bit lane.
__m512 tmp675 = _mm512_unpacklo_ps(in120, tmp661);
__m512 tmp676 = _mm512_unpackhi_ps(in120, tmp661);
__m512 tmp677 = _mm512_unpacklo_ps(tmp662, in126);
__m512 tmp678 = _mm512_unpackhi_ps(tmp662, in126);
__m512 tmp679 = _mm512_unpacklo_ps(tmp660, in122);
__m512 tmp680 = _mm512_unpackhi_ps(tmp660, in122);
__m512 tmp681 = _mm512_unpacklo_ps(in124, in123);
__m512 tmp682 = _mm512_unpackhi_ps(in124, in123);
__m512 tmp683 = _mm512_unpacklo_ps(in128, tmp665);
__m512 tmp684 = _mm512_unpackhi_ps(in128, tmp665);
__m512 tmp685 = _mm512_unpacklo_ps(tmp666, in134);
__m512 tmp686 = _mm512_unpackhi_ps(tmp666, in134);
__m512 tmp687 = _mm512_unpacklo_ps(tmp664, in130);
__m512 tmp688 = _mm512_unpackhi_ps(tmp664, in130);
__m512 tmp689 = _mm512_unpacklo_ps(in132, in131);
__m512 tmp690 = _mm512_unpackhi_ps(in132, in131);
// Step 2: shuffle_ps with 68 takes the low element pair of both operands, 238 the high pair,
// so each 128-bit lane now holds one column element from four consecutive vectors.
__m512 tmp691 = _mm512_shuffle_ps(tmp675, tmp677, 68);
__m512 tmp692 = _mm512_shuffle_ps(tmp675, tmp677, 238);
__m512 tmp693 = _mm512_shuffle_ps(tmp676, tmp678, 68);
__m512 tmp694 = _mm512_shuffle_ps(tmp676, tmp678, 238);
__m512 tmp695 = _mm512_shuffle_ps(tmp679, tmp681, 68);
__m512 tmp696 = _mm512_shuffle_ps(tmp679, tmp681, 238);
__m512 tmp697 = _mm512_shuffle_ps(tmp680, tmp682, 68);
__m512 tmp698 = _mm512_shuffle_ps(tmp680, tmp682, 238);
__m512 tmp699 = _mm512_shuffle_ps(tmp683, tmp685, 68);
__m512 tmp700 = _mm512_shuffle_ps(tmp683, tmp685, 238);
__m512 tmp701 = _mm512_shuffle_ps(tmp684, tmp686, 68);
__m512 tmp702 = _mm512_shuffle_ps(tmp684, tmp686, 238);
__m512 tmp703 = _mm512_shuffle_ps(tmp687, tmp689, 68);
__m512 tmp704 = _mm512_shuffle_ps(tmp687, tmp689, 238);
__m512 tmp705 = _mm512_shuffle_ps(tmp688, tmp690, 68);
__m512 tmp706 = _mm512_shuffle_ps(tmp688, tmp690, 238);
// Step 3: shuffle_f32x4 with 136 picks 128-bit lanes 0 and 2 of each operand, 221 picks lanes 1 and 3.
__m512 tmp707 = _mm512_shuffle_f32x4(tmp691, tmp695, 136);
__m512 tmp708 = _mm512_shuffle_f32x4(tmp691, tmp695, 221);
__m512 tmp709 = _mm512_shuffle_f32x4(tmp692, tmp696, 136);
__m512 tmp710 = _mm512_shuffle_f32x4(tmp692, tmp696, 221);
__m512 tmp711 = _mm512_shuffle_f32x4(tmp693, tmp697, 136);
__m512 tmp712 = _mm512_shuffle_f32x4(tmp693, tmp697, 221);
__m512 tmp713 = _mm512_shuffle_f32x4(tmp694, tmp698, 136);
__m512 tmp714 = _mm512_shuffle_f32x4(tmp694, tmp698, 221);
__m512 tmp715 = _mm512_shuffle_f32x4(tmp699, tmp703, 136);
__m512 tmp716 = _mm512_shuffle_f32x4(tmp699, tmp703, 221);
__m512 tmp717 = _mm512_shuffle_f32x4(tmp700, tmp704, 136);
__m512 tmp718 = _mm512_shuffle_f32x4(tmp700, tmp704, 221);
__m512 tmp719 = _mm512_shuffle_f32x4(tmp701, tmp705, 136);
__m512 tmp720 = _mm512_shuffle_f32x4(tmp701, tmp705, 221);
__m512 tmp721 = _mm512_shuffle_f32x4(tmp702, tmp706, 136);
__m512 tmp722 = _mm512_shuffle_f32x4(tmp702, tmp706, 221);
// Step 4: a second shuffle_f32x4 round joins the halves built from the first eight and the last eight vectors.
// With 136 the result is source column c (c = 0..7), with 221 it is column c + 8; all sixteen columns are kept here.
in120 = _mm512_shuffle_f32x4(tmp707, tmp715, 136);
in128 = _mm512_shuffle_f32x4(tmp707, tmp715, 221);
tmp661 = _mm512_shuffle_f32x4(tmp709, tmp717, 136);
tmp665 = _mm512_shuffle_f32x4(tmp709, tmp717, 221);
tmp662 = _mm512_shuffle_f32x4(tmp711, tmp719, 136);
tmp666 = _mm512_shuffle_f32x4(tmp711, tmp719, 221);
in126 = _mm512_shuffle_f32x4(tmp713, tmp721, 136);
in134 = _mm512_shuffle_f32x4(tmp713, tmp721, 221);
tmp660 = _mm512_shuffle_f32x4(tmp708, tmp716, 136);
tmp664 = _mm512_shuffle_f32x4(tmp708, tmp716, 221);
in122 = _mm512_shuffle_f32x4(tmp710, tmp718, 136);
in130 = _mm512_shuffle_f32x4(tmp710, tmp718, 221);
in124 = _mm512_shuffle_f32x4(tmp712, tmp720, 136);
in132 = _mm512_shuffle_f32x4(tmp712, tmp720, 221);
in123 = _mm512_shuffle_f32x4(tmp714, tmp722, 136);
in131 = _mm512_shuffle_f32x4(tmp714, tmp722, 221);
// Second transform pass, applied to the transposed vectors with the same coefficient pattern as the first pass.
// Statements come in pairs: the first works on the left group (in120, tmp661, tmp662, in126, tmp660, in122, in124, in123),
// the second on the right group (in128, tmp665, tmp666, in134, tmp664, in130, in132, in131).
// Results: in120, tmp669, tmp670, in124, tmp668, tmp662, tmp660, in126 (left) and
// in128, tmp673, tmp674, in132, tmp672, tmp666, tmp664, in134 (right).
__m512 tmp667 = _mm512_add_ps(tmp661, in122);
__m512 tmp671 = _mm512_add_ps(tmp665, in130);
__m512 tmp668 = _mm512_sub_ps(tmp660, tmp662);
__m512 tmp672 = _mm512_sub_ps(tmp664, tmp666);
__m512 tmp669 = _mm512_add_ps(tmp662, in124);
__m512 tmp673 = _mm512_add_ps(tmp666, in132);
// in120 becomes in120 - in124 + 5.25*(tmp660 - tmp662) once the fmadd with tmp668 below has run.
in120 = _mm512_sub_ps(in120, in124);
in128 = _mm512_sub_ps(in128, in132);
tmp667 = _mm512_fmadd_ps(in126, _mm512_set1_ps(-4.25e+00f), tmp667);
tmp671 = _mm512_fmadd_ps(in134, _mm512_set1_ps(-4.25e+00f), tmp671);
tmp669 = _mm512_fmadd_ps(tmp660, _mm512_set1_ps(-4.25e+00f), tmp669);
tmp673 = _mm512_fmadd_ps(tmp664, _mm512_set1_ps(-4.25e+00f), tmp673);
in120 = _mm512_fmadd_ps(tmp668, _mm512_set1_ps(5.25e+00f), in120);
in128 = _mm512_fmadd_ps(tmp672, _mm512_set1_ps(5.25e+00f), in128);
// tmp668 is reused: it now starts accumulating 0.25*tmp662 + in124 - 1.25*tmp660.
tmp668 = _mm512_fmadd_ps(tmp662, _mm512_set1_ps(2.5e-01f), in124);
tmp672 = _mm512_fmadd_ps(tmp666, _mm512_set1_ps(2.5e-01f), in132);
tmp662 = _mm512_fmadd_ps(tmp662, _mm512_set1_ps(4e+00f), in124);
tmp666 = _mm512_fmadd_ps(tmp666, _mm512_set1_ps(4e+00f), in132);
// Difference and sum of the two accumulators: tmp670 = tmp669 - tmp667, then tmp669 = tmp667 + tmp669.
__m512 tmp670 = _mm512_sub_ps(tmp669, tmp667);
__m512 tmp674 = _mm512_sub_ps(tmp673, tmp671);
tmp669 = _mm512_add_ps(tmp667, tmp669);
tmp673 = _mm512_add_ps(tmp671, tmp673);
tmp667 = _mm512_fmadd_ps(tmp661, _mm512_set1_ps(2.5e-01f), in122);
tmp671 = _mm512_fmadd_ps(tmp665, _mm512_set1_ps(2.5e-01f), in130);
tmp668 = _mm512_fmadd_ps(tmp660, _mm512_set1_ps(-1.25e+00f), tmp668);
tmp672 = _mm512_fmadd_ps(tmp664, _mm512_set1_ps(-1.25e+00f), tmp672);
tmp660 = _mm512_fmadd_ps(tmp660, _mm512_set1_ps(-5e+00f), tmp662);
tmp664 = _mm512_fmadd_ps(tmp664, _mm512_set1_ps(-5e+00f), tmp666);
tmp667 = _mm512_fmadd_ps(in126, _mm512_set1_ps(-1.25e+00f), tmp667);
tmp671 = _mm512_fmadd_ps(in134, _mm512_set1_ps(-1.25e+00f), tmp671);
// in124 = tmp668 + 2*tmp667, then tmp668 = tmp668 - 2*tmp667 (fnmadd negates the product).
in124 = _mm512_fmadd_ps(tmp667, _mm512_set1_ps(2e+00f), tmp668);
in132 = _mm512_fmadd_ps(tmp671, _mm512_set1_ps(2e+00f), tmp672);
tmp668 = _mm512_fnmadd_ps(tmp667, _mm512_set1_ps(2e+00f), tmp668);
tmp672 = _mm512_fnmadd_ps(tmp671, _mm512_set1_ps(2e+00f), tmp672);
// tmp661 must be read here before the subtraction below overwrites it.
tmp667 = _mm512_fmadd_ps(in122, _mm512_set1_ps(2.5e-01f), tmp661);
tmp671 = _mm512_fmadd_ps(in130, _mm512_set1_ps(2.5e-01f), tmp665);
// tmp661 and in126 together form in123 - tmp661 + 5.25*(in126 - in122); the result is kept in in126.
tmp661 = _mm512_sub_ps(in123, tmp661);
tmp665 = _mm512_sub_ps(in131, tmp665);
tmp667 = _mm512_fmadd_ps(in126, _mm512_set1_ps(-1.25e+00f), tmp667);
tmp671 = _mm512_fmadd_ps(in134, _mm512_set1_ps(-1.25e+00f), tmp671);
in126 = _mm512_sub_ps(in126, in122);
in134 = _mm512_sub_ps(in134, in130);
in126 = _mm512_fmadd_ps(in126, _mm512_set1_ps(5.25e+00f), tmp661);
in134 = _mm512_fmadd_ps(in134, _mm512_set1_ps(5.25e+00f), tmp665);
// tmp662 = tmp660 + 2*tmp667, then tmp660 = tmp660 - 2*tmp667.
tmp662 = _mm512_fmadd_ps(tmp667, _mm512_set1_ps(2e+00f), tmp660);
tmp666 = _mm512_fmadd_ps(tmp671, _mm512_set1_ps(2e+00f), tmp664);
tmp660 = _mm512_fnmadd_ps(tmp667, _mm512_set1_ps(2e+00f), tmp660);
tmp664 = _mm512_fnmadd_ps(tmp671, _mm512_set1_ps(2e+00f), tmp664);
// Pack and store the sixteen second-pass results.
// Each pair of result vectors is recombined by 256-bit halves: immediate 68 joins the low halves of both
// operands, 238 joins their high halves.
__m512 out173 = _mm512_shuffle_f32x4(in120, tmp669, 68);
__m512 out181 = _mm512_shuffle_f32x4(in120, tmp669, 238);
__m512 out174 = _mm512_shuffle_f32x4(tmp670, in124, 68);
__m512 out182 = _mm512_shuffle_f32x4(tmp670, in124, 238);
__m512 out175 = _mm512_shuffle_f32x4(tmp668, tmp662, 68);
__m512 out183 = _mm512_shuffle_f32x4(tmp668, tmp662, 238);
__m512 out176 = _mm512_shuffle_f32x4(tmp660, in126, 68);
__m512 out184 = _mm512_shuffle_f32x4(tmp660, in126, 238);
__m512 out177 = _mm512_shuffle_f32x4(in128, tmp673, 68);
__m512 out185 = _mm512_shuffle_f32x4(in128, tmp673, 238);
__m512 out178 = _mm512_shuffle_f32x4(tmp674, in132, 68);
__m512 out186 = _mm512_shuffle_f32x4(tmp674, in132, 238);
__m512 out179 = _mm512_shuffle_f32x4(tmp672, tmp666, 68);
__m512 out187 = _mm512_shuffle_f32x4(tmp672, tmp666, 238);
__m512 out180 = _mm512_shuffle_f32x4(tmp664, in134, 68);
__m512 out188 = _mm512_shuffle_f32x4(tmp664, in134, 238);
// Unaligned stores to dfPtr1 in four groups whose constant offsets are 608256 apart (512, 608768, 1217024, 1825280).
// Inside a group the stores go to +0, +128, +64 and +192 relative to the group's first offset:
// low halves of the left pair, high halves of the left pair, then low and high halves of the right pair.
// NOTE(review): the 64-unit spacing equals one 512-bit vector if dfPtr1 is a byte pointer; its declaration is not visible here.
_mm512_storeu_ps(dfPtr1+512+2433024*i7+152064*j3+38016*s5+768*k6, out173);
_mm512_storeu_ps(dfPtr1+640+2433024*i7+152064*j3+38016*s5+768*k6, out181);
_mm512_storeu_ps(dfPtr1+576+2433024*i7+152064*j3+38016*s5+768*k6, out177);
_mm512_storeu_ps(dfPtr1+704+2433024*i7+152064*j3+38016*s5+768*k6, out185);
_mm512_storeu_ps(dfPtr1+608768+2433024*i7+152064*j3+38016*s5+768*k6, out174);
_mm512_storeu_ps(dfPtr1+608896+2433024*i7+152064*j3+38016*s5+768*k6, out182);
_mm512_storeu_ps(dfPtr1+608832+2433024*i7+152064*j3+38016*s5+768*k6, out178);
_mm512_storeu_ps(dfPtr1+608960+2433024*i7+152064*j3+38016*s5+768*k6, out186);
_mm512_storeu_ps(dfPtr1+1217024+2433024*i7+152064*j3+38016*s5+768*k6, out175);
_mm512_storeu_ps(dfPtr1+1217152+2433024*i7+152064*j3+38016*s5+768*k6, out183);
_mm512_storeu_ps(dfPtr1+1217088+2433024*i7+152064*j3+38016*s5+768*k6, out179);
_mm512_storeu_ps(dfPtr1+1217216+2433024*i7+152064*j3+38016*s5+768*k6, out187);
_mm512_storeu_ps(dfPtr1+1825280+2433024*i7+152064*j3+38016*s5+768*k6, out176);
_mm512_storeu_ps(dfPtr1+1825408+2433024*i7+152064*j3+38016*s5+768*k6, out184);
_mm512_storeu_ps(dfPtr1+1825344+2433024*i7+152064*j3+38016*s5+768*k6, out180);
_mm512_storeu_ps(dfPtr1+1825472+2433024*i7+152064*j3+38016*s5+768*k6, out188);
}
__m512 dat121 = _mm512_maskz_loadu_ps(2047, datPtr1+0+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat122 = _mm512_maskz_loadu_ps(16383, datPtr1+504+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512i pm22 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in136 = _mm512_permutexvar_ps(pm22, dat121);
__m512i pm23 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in144 = _mm512_permutexvar_ps(pm23, dat122);
__m512 dat123 = _mm512_maskz_loadu_ps(2047, datPtr1+92+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat124 = _mm512_maskz_loadu_ps(16383, datPtr1+596+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in137 = _mm512_permutexvar_ps(pm22, dat123);
__m512 in145 = _mm512_permutexvar_ps(pm23, dat124);
__m512 dat125 = _mm512_maskz_loadu_ps(2047, datPtr1+184+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat126 = _mm512_maskz_loadu_ps(16383, datPtr1+688+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in138 = _mm512_permutexvar_ps(pm22, dat125);
__m512 in146 = _mm512_permutexvar_ps(pm23, dat126);
__m512 dat127 = _mm512_maskz_loadu_ps(2047, datPtr1+276+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat128 = _mm512_maskz_loadu_ps(16383, datPtr1+780+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in139 = _mm512_permutexvar_ps(pm22, dat127);
__m512 in147 = _mm512_permutexvar_ps(pm23, dat128);
__m512 dat129 = _mm512_maskz_loadu_ps(2047, datPtr1+368+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat130 = _mm512_maskz_loadu_ps(16383, datPtr1+872+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in140 = _mm512_permutexvar_ps(pm22, dat129);
__m512 in148 = _mm512_permutexvar_ps(pm23, dat130);
__m512 dat131 = _mm512_maskz_loadu_ps(2047, datPtr1+460+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat132 = _mm512_maskz_loadu_ps(16383, datPtr1+964+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in141 = _mm512_permutexvar_ps(pm22, dat131);
__m512 in149 = _mm512_permutexvar_ps(pm23, dat132);
__m512 dat133 = _mm512_maskz_loadu_ps(2047, datPtr1+552+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat134 = _mm512_maskz_loadu_ps(16383, datPtr1+1056+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in142 = _mm512_permutexvar_ps(pm22, dat133);
__m512 in150 = _mm512_permutexvar_ps(pm23, dat134);
__m512 dat135 = _mm512_maskz_loadu_ps(2047, datPtr1+644+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 dat136 = _mm512_maskz_loadu_ps(16383, datPtr1+1148+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in143 = _mm512_permutexvar_ps(pm22, dat135);
__m512 in151 = _mm512_permutexvar_ps(pm23, dat136);
__m512 tmp723 = _mm512_add_ps(in137, in141);
__m512 tmp727 = _mm512_add_ps(in145, in149);
__m512 tmp724 = _mm512_sub_ps(in140, in138);
__m512 tmp728 = _mm512_sub_ps(in148, in146);
__m512 tmp725 = _mm512_add_ps(in138, in142);
__m512 tmp729 = _mm512_add_ps(in146, in150);
in136 = _mm512_sub_ps(in136, in142);
in144 = _mm512_sub_ps(in144, in150);
tmp723 = _mm512_fmadd_ps(in139, _mm512_set1_ps(-4.25e+00f), tmp723);
tmp727 = _mm512_fmadd_ps(in147, _mm512_set1_ps(-4.25e+00f), tmp727);
tmp725 = _mm512_fmadd_ps(in140, _mm512_set1_ps(-4.25e+00f), tmp725);
tmp729 = _mm512_fmadd_ps(in148, _mm512_set1_ps(-4.25e+00f), tmp729);
in136 = _mm512_fmadd_ps(tmp724, _mm512_set1_ps(5.25e+00f), in136);
in144 = _mm512_fmadd_ps(tmp728, _mm512_set1_ps(5.25e+00f), in144);
tmp724 = _mm512_fmadd_ps(in138, _mm512_set1_ps(2.5e-01f), in142);
tmp728 = _mm512_fmadd_ps(in146, _mm512_set1_ps(2.5e-01f), in150);
in138 = _mm512_fmadd_ps(in138, _mm512_set1_ps(4e+00f), in142);
in146 = _mm512_fmadd_ps(in146, _mm512_set1_ps(4e+00f), in150);
__m512 tmp726 = _mm512_sub_ps(tmp725, tmp723);
__m512 tmp730 = _mm512_sub_ps(tmp729, tmp727);
tmp725 = _mm512_add_ps(tmp723, tmp725);
tmp729 = _mm512_add_ps(tmp727, tmp729);
tmp723 = _mm512_fmadd_ps(in137, _mm512_set1_ps(2.5e-01f), in141);
tmp727 = _mm512_fmadd_ps(in145, _mm512_set1_ps(2.5e-01f), in149);
tmp724 = _mm512_fmadd_ps(in140, _mm512_set1_ps(-1.25e+00f), tmp724);
tmp728 = _mm512_fmadd_ps(in148, _mm512_set1_ps(-1.25e+00f), tmp728);
in140 = _mm512_fmadd_ps(in140, _mm512_set1_ps(-5e+00f), in138);
in148 = _mm512_fmadd_ps(in148, _mm512_set1_ps(-5e+00f), in146);
tmp723 = _mm512_fmadd_ps(in139, _mm512_set1_ps(-1.25e+00f), tmp723);
tmp727 = _mm512_fmadd_ps(in147, _mm512_set1_ps(-1.25e+00f), tmp727);
in142 = _mm512_fmadd_ps(tmp723, _mm512_set1_ps(2e+00f), tmp724);
in150 = _mm512_fmadd_ps(tmp727, _mm512_set1_ps(2e+00f), tmp728);
tmp724 = _mm512_fnmadd_ps(tmp723, _mm512_set1_ps(2e+00f), tmp724);
tmp728 = _mm512_fnmadd_ps(tmp727, _mm512_set1_ps(2e+00f), tmp728);
tmp723 = _mm512_fmadd_ps(in141, _mm512_set1_ps(2.5e-01f), in137);
tmp727 = _mm512_fmadd_ps(in149, _mm512_set1_ps(2.5e-01f), in145);
in137 = _mm512_sub_ps(in143, in137);
in145 = _mm512_sub_ps(in151, in145);
tmp723 = _mm512_fmadd_ps(in139, _mm512_set1_ps(-1.25e+00f), tmp723);
tmp727 = _mm512_fmadd_ps(in147, _mm512_set1_ps(-1.25e+00f), tmp727);
in139 = _mm512_sub_ps(in139, in141);
in147 = _mm512_sub_ps(in147, in149);
in139 = _mm512_fmadd_ps(in139, _mm512_set1_ps(5.25e+00f), in137);
in147 = _mm512_fmadd_ps(in147, _mm512_set1_ps(5.25e+00f), in145);
in138 = _mm512_fmadd_ps(tmp723, _mm512_set1_ps(2e+00f), in140);
in146 = _mm512_fmadd_ps(tmp727, _mm512_set1_ps(2e+00f), in148);
in140 = _mm512_fnmadd_ps(tmp723, _mm512_set1_ps(2e+00f), in140);
in148 = _mm512_fnmadd_ps(tmp727, _mm512_set1_ps(2e+00f), in148);
__m512 tmp739 = _mm512_unpacklo_ps(in136, tmp725);
__m512 tmp740 = _mm512_unpackhi_ps(in136, tmp725);
__m512 tmp741 = _mm512_unpacklo_ps(tmp726, in142);
__m512 tmp742 = _mm512_unpackhi_ps(tmp726, in142);
__m512 tmp743 = _mm512_unpacklo_ps(tmp724, in138);
__m512 tmp744 = _mm512_unpackhi_ps(tmp724, in138);
__m512 tmp745 = _mm512_unpacklo_ps(in140, in139);
__m512 tmp746 = _mm512_unpackhi_ps(in140, in139);
__m512 tmp747 = _mm512_unpacklo_ps(in144, tmp729);
__m512 tmp748 = _mm512_unpackhi_ps(in144, tmp729);
__m512 tmp749 = _mm512_unpacklo_ps(tmp730, in150);
__m512 tmp750 = _mm512_unpackhi_ps(tmp730, in150);
__m512 tmp751 = _mm512_unpacklo_ps(tmp728, in146);
__m512 tmp752 = _mm512_unpackhi_ps(tmp728, in146);
__m512 tmp753 = _mm512_unpacklo_ps(in148, in147);
__m512 tmp754 = _mm512_unpackhi_ps(in148, in147);
__m512 tmp755 = _mm512_shuffle_ps(tmp739, tmp741, 68);
__m512 tmp756 = _mm512_shuffle_ps(tmp739, tmp741, 238);
__m512 tmp757 = _mm512_shuffle_ps(tmp740, tmp742, 68);
__m512 tmp758 = _mm512_shuffle_ps(tmp740, tmp742, 238);
__m512 tmp759 = _mm512_shuffle_ps(tmp743, tmp745, 68);
__m512 tmp760 = _mm512_shuffle_ps(tmp743, tmp745, 238);
__m512 tmp761 = _mm512_shuffle_ps(tmp744, tmp746, 68);
__m512 tmp762 = _mm512_shuffle_ps(tmp744, tmp746, 238);
__m512 tmp763 = _mm512_shuffle_ps(tmp747, tmp749, 68);
__m512 tmp764 = _mm512_shuffle_ps(tmp747, tmp749, 238);
__m512 tmp765 = _mm512_shuffle_ps(tmp748, tmp750, 68);
__m512 tmp766 = _mm512_shuffle_ps(tmp748, tmp750, 238);
__m512 tmp767 = _mm512_shuffle_ps(tmp751, tmp753, 68);
__m512 tmp768 = _mm512_shuffle_ps(tmp751, tmp753, 238);
__m512 tmp769 = _mm512_shuffle_ps(tmp752, tmp754, 68);
__m512 tmp770 = _mm512_shuffle_ps(tmp752, tmp754, 238);
__m512 tmp771 = _mm512_shuffle_f32x4(tmp755, tmp759, 136);
__m512 tmp772 = _mm512_shuffle_f32x4(tmp755, tmp759, 221);
__m512 tmp773 = _mm512_shuffle_f32x4(tmp756, tmp760, 136);
__m512 tmp774 = _mm512_shuffle_f32x4(tmp756, tmp760, 221);
__m512 tmp775 = _mm512_shuffle_f32x4(tmp757, tmp761, 136);
__m512 tmp776 = _mm512_shuffle_f32x4(tmp757, tmp761, 221);
__m512 tmp777 = _mm512_shuffle_f32x4(tmp758, tmp762, 136);
__m512 tmp778 = _mm512_shuffle_f32x4(tmp758, tmp762, 221);
__m512 tmp779 = _mm512_shuffle_f32x4(tmp763, tmp767, 136);
__m512 tmp780 = _mm512_shuffle_f32x4(tmp763, tmp767, 221);
__m512 tmp781 = _mm512_shuffle_f32x4(tmp764, tmp768, 136);
__m512 tmp782 = _mm512_shuffle_f32x4(tmp764, tmp768, 221);
__m512 tmp783 = _mm512_shuffle_f32x4(tmp765, tmp769, 136);
__m512 tmp784 = _mm512_shuffle_f32x4(tmp765, tmp769, 221);
__m512 tmp785 = _mm512_shuffle_f32x4(tmp766, tmp770, 136);
__m512 tmp786 = _mm512_shuffle_f32x4(tmp766, tmp770, 221);
in136 = _mm512_shuffle_f32x4(tmp771, tmp779, 136);
in144 = _mm512_shuffle_f32x4(tmp771, tmp779, 221);
tmp725 = _mm512_shuffle_f32x4(tmp773, tmp781, 136);
tmp729 = _mm512_shuffle_f32x4(tmp773, tmp781, 221);
tmp726 = _mm512_shuffle_f32x4(tmp775, tmp783, 136);
tmp730 = _mm512_shuffle_f32x4(tmp775, tmp783, 221);
in142 = _mm512_shuffle_f32x4(tmp777, tmp785, 136);
in150 = _mm512_shuffle_f32x4(tmp777, tmp785, 221);
tmp724 = _mm512_shuffle_f32x4(tmp772, tmp780, 136);
tmp728 = _mm512_shuffle_f32x4(tmp772, tmp780, 221);
in138 = _mm512_shuffle_f32x4(tmp774, tmp782, 136);
in146 = _mm512_shuffle_f32x4(tmp774, tmp782, 221);
in140 = _mm512_shuffle_f32x4(tmp776, tmp784, 136);
in148 = _mm512_shuffle_f32x4(tmp776, tmp784, 221);
in139 = _mm512_shuffle_f32x4(tmp778, tmp786, 136);
in147 = _mm512_shuffle_f32x4(tmp778, tmp786, 221);
__m512 tmp731 = _mm512_add_ps(tmp725, in138);
__m512 tmp735 = _mm512_add_ps(tmp729, in146);
__m512 tmp732 = _mm512_sub_ps(tmp724, tmp726);
__m512 tmp736 = _mm512_sub_ps(tmp728, tmp730);
__m512 tmp733 = _mm512_add_ps(tmp726, in140);
__m512 tmp737 = _mm512_add_ps(tmp730, in148);
in136 = _mm512_sub_ps(in136, in140);
in144 = _mm512_sub_ps(in144, in148);
tmp731 = _mm512_fmadd_ps(in142, _mm512_set1_ps(-4.25e+00f), tmp731);
tmp735 = _mm512_fmadd_ps(in150, _mm512_set1_ps(-4.25e+00f), tmp735);
tmp733 = _mm512_fmadd_ps(tmp724, _mm512_set1_ps(-4.25e+00f), tmp733);
tmp737 = _mm512_fmadd_ps(tmp728, _mm512_set1_ps(-4.25e+00f), tmp737);
in136 = _mm512_fmadd_ps(tmp732, _mm512_set1_ps(5.25e+00f), in136);
in144 = _mm512_fmadd_ps(tmp736, _mm512_set1_ps(5.25e+00f), in144);
tmp732 = _mm512_fmadd_ps(tmp726, _mm512_set1_ps(2.5e-01f), in140);
tmp736 = _mm512_fmadd_ps(tmp730, _mm512_set1_ps(2.5e-01f), in148);
tmp726 = _mm512_fmadd_ps(tmp726, _mm512_set1_ps(4e+00f), in140);
tmp730 = _mm512_fmadd_ps(tmp730, _mm512_set1_ps(4e+00f), in148);
__m512 tmp734 = _mm512_sub_ps(tmp733, tmp731);
__m512 tmp738 = _mm512_sub_ps(tmp737, tmp735);
tmp733 = _mm512_add_ps(tmp731, tmp733);
tmp737 = _mm512_add_ps(tmp735, tmp737);
tmp731 = _mm512_fmadd_ps(tmp725, _mm512_set1_ps(2.5e-01f), in138);
tmp735 = _mm512_fmadd_ps(tmp729, _mm512_set1_ps(2.5e-01f), in146);
tmp732 = _mm512_fmadd_ps(tmp724, _mm512_set1_ps(-1.25e+00f), tmp732);
tmp736 = _mm512_fmadd_ps(tmp728, _mm512_set1_ps(-1.25e+00f), tmp736);
tmp724 = _mm512_fmadd_ps(tmp724, _mm512_set1_ps(-5e+00f), tmp726);
tmp728 = _mm512_fmadd_ps(tmp728, _mm512_set1_ps(-5e+00f), tmp730);
tmp731 = _mm512_fmadd_ps(in142, _mm512_set1_ps(-1.25e+00f), tmp731);
tmp735 = _mm512_fmadd_ps(in150, _mm512_set1_ps(-1.25e+00f), tmp735);
in140 = _mm512_fmadd_ps(tmp731, _mm512_set1_ps(2e+00f), tmp732);
in148 = _mm512_fmadd_ps(tmp735, _mm512_set1_ps(2e+00f), tmp736);
tmp732 = _mm512_fnmadd_ps(tmp731, _mm512_set1_ps(2e+00f), tmp732);
tmp736 = _mm512_fnmadd_ps(tmp735, _mm512_set1_ps(2e+00f), tmp736);
tmp731 = _mm512_fmadd_ps(in138, _mm512_set1_ps(2.5e-01f), tmp725);
tmp735 = _mm512_fmadd_ps(in146, _mm512_set1_ps(2.5e-01f), tmp729);
tmp725 = _mm512_sub_ps(in139, tmp725);
tmp729 = _mm512_sub_ps(in147, tmp729);
tmp731 = _mm512_fmadd_ps(in142, _mm512_set1_ps(-1.25e+00f), tmp731);
tmp735 = _mm512_fmadd_ps(in150, _mm512_set1_ps(-1.25e+00f), tmp735);
in142 = _mm512_sub_ps(in142, in138);
in150 = _mm512_sub_ps(in150, in146);
in142 = _mm512_fmadd_ps(in142, _mm512_set1_ps(5.25e+00f), tmp725);
in150 = _mm512_fmadd_ps(in150, _mm512_set1_ps(5.25e+00f), tmp729);
tmp726 = _mm512_fmadd_ps(tmp731, _mm512_set1_ps(2e+00f), tmp724);
tmp730 = _mm512_fmadd_ps(tmp735, _mm512_set1_ps(2e+00f), tmp728);
tmp724 = _mm512_fnmadd_ps(tmp731, _mm512_set1_ps(2e+00f), tmp724);
tmp728 = _mm512_fnmadd_ps(tmp735, _mm512_set1_ps(2e+00f), tmp728);
__m512 out189 = _mm512_shuffle_f32x4(in136, tmp733, 68);
__m512 out197 = _mm512_shuffle_f32x4(in136, tmp733, 238);
__m512 out190 = _mm512_shuffle_f32x4(tmp734, in140, 68);
__m512 out198 = _mm512_shuffle_f32x4(tmp734, in140, 238);
__m512 out191 = _mm512_shuffle_f32x4(tmp732, tmp726, 68);
__m512 out199 = _mm512_shuffle_f32x4(tmp732, tmp726, 238);
__m512 out192 = _mm512_shuffle_f32x4(tmp724, in142, 68);
__m512 out200 = _mm512_shuffle_f32x4(tmp724, in142, 238);
__m512 out193 = _mm512_shuffle_f32x4(in144, tmp737, 68);
__m512 out201 = _mm512_shuffle_f32x4(in144, tmp737, 238);
__m512 out194 = _mm512_shuffle_f32x4(tmp738, in148, 68);
__m512 out202 = _mm512_shuffle_f32x4(tmp738, in148, 238);
__m512 out195 = _mm512_shuffle_f32x4(tmp736, tmp730, 68);
__m512 out203 = _mm512_shuffle_f32x4(tmp736, tmp730, 238);
__m512 out196 = _mm512_shuffle_f32x4(tmp728, in150, 68);
__m512 out204 = _mm512_shuffle_f32x4(tmp728, in150, 238);
_mm512_storeu_ps(dfPtr1+0+2433024*i7+152064*j3+38016*s5+768*k6, out189);
_mm512_storeu_ps(dfPtr1+128+2433024*i7+152064*j3+38016*s5+768*k6, out197);
_mm512_storeu_ps(dfPtr1+64+2433024*i7+152064*j3+38016*s5+768*k6, out193);
_mm512_storeu_ps(dfPtr1+192+2433024*i7+152064*j3+38016*s5+768*k6, out201);
_mm512_storeu_ps(dfPtr1+608256+2433024*i7+152064*j3+38016*s5+768*k6, out190);
_mm512_storeu_ps(dfPtr1+608384+2433024*i7+152064*j3+38016*s5+768*k6, out198);
_mm512_storeu_ps(dfPtr1+608320+2433024*i7+152064*j3+38016*s5+768*k6, out194);
_mm512_storeu_ps(dfPtr1+608448+2433024*i7+152064*j3+38016*s5+768*k6, out202);
_mm512_storeu_ps(dfPtr1+1216512+2433024*i7+152064*j3+38016*s5+768*k6, out191);
_mm512_storeu_ps(dfPtr1+1216640+2433024*i7+152064*j3+38016*s5+768*k6, out199);
_mm512_storeu_ps(dfPtr1+1216576+2433024*i7+152064*j3+38016*s5+768*k6, out195);
_mm512_storeu_ps(dfPtr1+1216704+2433024*i7+152064*j3+38016*s5+768*k6, out203);
_mm512_storeu_ps(dfPtr1+1824768+2433024*i7+152064*j3+38016*s5+768*k6, out192);
_mm512_storeu_ps(dfPtr1+1824896+2433024*i7+152064*j3+38016*s5+768*k6, out200);
_mm512_storeu_ps(dfPtr1+1824832+2433024*i7+152064*j3+38016*s5+768*k6, out196);
_mm512_storeu_ps(dfPtr1+1824960+2433024*i7+152064*j3+38016*s5+768*k6, out204);
// Load eight rows at offsets 552, 644, ..., 1196 (92 apart) from the current
// datPtr1 position. Mask 2047 (0x7FF) reads only the first 11 floats of each
// row; lanes 11..15 are zeroed by the maskz load.
// NOTE(review): offsets are bytes if datPtr1 is a char pointer like datPtr2
// below (declaration not visible here); 92 bytes would then be one 23-float row.
__m512 dat137 = _mm512_maskz_loadu_ps(2047, datPtr1+552+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
// pm24 splits each row into two 8-lane windows: lanes 0..7 take elements 0..7,
// lanes 8..12 take elements 6..10 (overlapping the first window by two), and
// lanes 13..15 take element 15, which the masked load left at zero.
__m512i pm24 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in152 = _mm512_permutexvar_ps(pm24, dat137);
__m512 dat138 = _mm512_maskz_loadu_ps(2047, datPtr1+644+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in153 = _mm512_permutexvar_ps(pm24, dat138);
__m512 dat139 = _mm512_maskz_loadu_ps(2047, datPtr1+736+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in154 = _mm512_permutexvar_ps(pm24, dat139);
__m512 dat140 = _mm512_maskz_loadu_ps(2047, datPtr1+828+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in155 = _mm512_permutexvar_ps(pm24, dat140);
__m512 dat141 = _mm512_maskz_loadu_ps(2047, datPtr1+920+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in156 = _mm512_permutexvar_ps(pm24, dat141);
__m512 dat142 = _mm512_maskz_loadu_ps(2047, datPtr1+1012+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in157 = _mm512_permutexvar_ps(pm24, dat142);
__m512 dat143 = _mm512_maskz_loadu_ps(2047, datPtr1+1104+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in158 = _mm512_permutexvar_ps(pm24, dat143);
__m512 dat144 = _mm512_maskz_loadu_ps(2047, datPtr1+1196+7163304*i7+92*h2+4*w2+346104*s5+6992*k6);
__m512 in159 = _mm512_permutexvar_ps(pm24, dat144);
// First pass: combine the eight row vectors in152..in159 into eight linear
// combinations, reusing the input registers as outputs. The results end up in
// in152, tmp789, tmp790, in158, tmp788, in154, in156, in155 (the order in
// which the unpack step below consumes them).
// NOTE(review): the coefficients (5.25, 4.25, 1.25, 0.25, 2, 4, 5) look like
// the 8-point Winograd F(6,3) input transform -- confirm against the generator.
__m512 tmp787 = _mm512_add_ps(in153, in157);
__m512 tmp788 = _mm512_sub_ps(in156, in154);
__m512 tmp789 = _mm512_add_ps(in154, in158);
in152 = _mm512_sub_ps(in152, in158);
tmp787 = _mm512_fmadd_ps(in155, _mm512_set1_ps(-4.25e+00f), tmp787);
tmp789 = _mm512_fmadd_ps(in156, _mm512_set1_ps(-4.25e+00f), tmp789);
in152 = _mm512_fmadd_ps(tmp788, _mm512_set1_ps(5.25e+00f), in152);
tmp788 = _mm512_fmadd_ps(in154, _mm512_set1_ps(2.5e-01f), in158);
in154 = _mm512_fmadd_ps(in154, _mm512_set1_ps(4e+00f), in158);
__m512 tmp790 = _mm512_sub_ps(tmp789, tmp787);
tmp789 = _mm512_add_ps(tmp787, tmp789);
tmp787 = _mm512_fmadd_ps(in153, _mm512_set1_ps(2.5e-01f), in157);
tmp788 = _mm512_fmadd_ps(in156, _mm512_set1_ps(-1.25e+00f), tmp788);
in156 = _mm512_fmadd_ps(in156, _mm512_set1_ps(-5e+00f), in154);
tmp787 = _mm512_fmadd_ps(in155, _mm512_set1_ps(-1.25e+00f), tmp787);
in158 = _mm512_fmadd_ps(tmp787, _mm512_set1_ps(2e+00f), tmp788);
tmp788 = _mm512_fnmadd_ps(tmp787, _mm512_set1_ps(2e+00f), tmp788);
tmp787 = _mm512_fmadd_ps(in157, _mm512_set1_ps(2.5e-01f), in153);
in153 = _mm512_sub_ps(in159, in153);
tmp787 = _mm512_fmadd_ps(in155, _mm512_set1_ps(-1.25e+00f), tmp787);
in155 = _mm512_sub_ps(in155, in157);
in155 = _mm512_fmadd_ps(in155, _mm512_set1_ps(5.25e+00f), in153);
in154 = _mm512_fmadd_ps(tmp787, _mm512_set1_ps(2e+00f), in156);
in156 = _mm512_fnmadd_ps(tmp787, _mm512_set1_ps(2e+00f), in156);
// Transpose, step 1: interleave pairs of results within each 128-bit lane.
__m512 tmp805 = _mm512_unpacklo_ps(in152, tmp789);
__m512 tmp806 = _mm512_unpackhi_ps(in152, tmp789);
__m512 tmp807 = _mm512_unpacklo_ps(tmp790, in158);
__m512 tmp808 = _mm512_unpackhi_ps(tmp790, in158);
__m512 tmp809 = _mm512_unpacklo_ps(tmp788, in154);
__m512 tmp810 = _mm512_unpackhi_ps(tmp788, in154);
__m512 tmp811 = _mm512_unpacklo_ps(in156, in155);
__m512 tmp812 = _mm512_unpackhi_ps(in156, in155);
// Transpose, step 2: imm 68 / 238 pick the low / high 64-bit halves, so each
// 128-bit lane now holds one column taken from four consecutive results.
__m512 tmp813 = _mm512_shuffle_ps(tmp805, tmp807, 68);
__m512 tmp814 = _mm512_shuffle_ps(tmp805, tmp807, 238);
__m512 tmp815 = _mm512_shuffle_ps(tmp806, tmp808, 68);
__m512 tmp816 = _mm512_shuffle_ps(tmp806, tmp808, 238);
__m512 tmp817 = _mm512_shuffle_ps(tmp809, tmp811, 68);
__m512 tmp818 = _mm512_shuffle_ps(tmp809, tmp811, 238);
__m512 tmp819 = _mm512_shuffle_ps(tmp810, tmp812, 68);
__m512 tmp820 = _mm512_shuffle_ps(tmp810, tmp812, 238);
// Transpose, step 3: imm 136 gathers 128-bit lanes 0 and 2 of both operands,
// imm 221 gathers lanes 1 and 3, pairing results 0..3 with results 4..7.
__m512 tmp821 = _mm512_shuffle_f32x4(tmp813, tmp817, 136);
__m512 tmp822 = _mm512_shuffle_f32x4(tmp813, tmp817, 221);
__m512 tmp823 = _mm512_shuffle_f32x4(tmp814, tmp818, 136);
__m512 tmp824 = _mm512_shuffle_f32x4(tmp814, tmp818, 221);
__m512 tmp825 = _mm512_shuffle_f32x4(tmp815, tmp819, 136);
__m512 tmp826 = _mm512_shuffle_f32x4(tmp815, tmp819, 221);
__m512 tmp827 = _mm512_shuffle_f32x4(tmp816, tmp820, 136);
__m512 tmp828 = _mm512_shuffle_f32x4(tmp816, tmp820, 221);
// Transpose, step 4: split each vector into a column of the first window
// (imm 136) and a column of the second window (imm 221); both operands are the
// same vector, so the upper 256 bits merely repeat the lower 256. Only five
// second-window columns are extracted (tmp791..tmp795). The other three come
// from lanes 13..15 of the permuted rows, which are always zero, so they are
// never materialised.
in152 = _mm512_shuffle_f32x4(tmp821, tmp821, 136);
__m512 tmp791 = _mm512_shuffle_f32x4(tmp821, tmp821, 221);
tmp789 = _mm512_shuffle_f32x4(tmp823, tmp823, 136);
__m512 tmp792 = _mm512_shuffle_f32x4(tmp823, tmp823, 221);
tmp790 = _mm512_shuffle_f32x4(tmp825, tmp825, 136);
__m512 tmp793 = _mm512_shuffle_f32x4(tmp825, tmp825, 221);
in158 = _mm512_shuffle_f32x4(tmp827, tmp827, 136);
__m512 tmp794 = _mm512_shuffle_f32x4(tmp827, tmp827, 221);
tmp788 = _mm512_shuffle_f32x4(tmp822, tmp822, 136);
__m512 tmp795 = _mm512_shuffle_f32x4(tmp822, tmp822, 221);
in154 = _mm512_shuffle_f32x4(tmp824, tmp824, 136);
in156 = _mm512_shuffle_f32x4(tmp826, tmp826, 136);
in155 = _mm512_shuffle_f32x4(tmp828, tmp828, 136);
// Second pass: the same linear combinations as the first pass, now applied to
// the transposed columns, with first-window and second-window statements
// interleaved. The second-window statements are the reduced forms obtained
// when the three absent operands are zero: plain copies instead of adds, mul
// instead of fmadd, and 0 - x instead of a subtraction.
__m512 tmp796 = _mm512_add_ps(tmp789, in154);
__m512 tmp800 = tmp792;
__m512 tmp797 = _mm512_sub_ps(tmp788, tmp790);
__m512 tmp801 = _mm512_sub_ps(tmp795, tmp793);
__m512 tmp798 = _mm512_add_ps(tmp790, in156);
__m512 tmp802 = tmp793;
in152 = _mm512_sub_ps(in152, in156);
// No-op self-assignment: second-window counterpart of the subtraction above,
// whose subtrahend is one of the zero columns.
tmp791 = tmp791;
tmp796 = _mm512_fmadd_ps(in158, _mm512_set1_ps(-4.25e+00f), tmp796);
tmp800 = _mm512_fmadd_ps(tmp794, _mm512_set1_ps(-4.25e+00f), tmp800);
tmp798 = _mm512_fmadd_ps(tmp788, _mm512_set1_ps(-4.25e+00f), tmp798);
tmp802 = _mm512_fmadd_ps(tmp795, _mm512_set1_ps(-4.25e+00f), tmp802);
in152 = _mm512_fmadd_ps(tmp797, _mm512_set1_ps(5.25e+00f), in152);
tmp791 = _mm512_fmadd_ps(tmp801, _mm512_set1_ps(5.25e+00f), tmp791);
tmp797 = _mm512_fmadd_ps(tmp790, _mm512_set1_ps(2.5e-01f), in156);
tmp801 = _mm512_mul_ps(tmp793, _mm512_set1_ps(2.5e-01f));
tmp790 = _mm512_fmadd_ps(tmp790, _mm512_set1_ps(4e+00f), in156);
tmp793 = _mm512_mul_ps(tmp793, _mm512_set1_ps(4e+00f));
__m512 tmp799 = _mm512_sub_ps(tmp798, tmp796);
__m512 tmp803 = _mm512_sub_ps(tmp802, tmp800);
tmp798 = _mm512_add_ps(tmp796, tmp798);
tmp802 = _mm512_add_ps(tmp800, tmp802);
tmp796 = _mm512_fmadd_ps(tmp789, _mm512_set1_ps(2.5e-01f), in154);
tmp800 = _mm512_mul_ps(tmp792, _mm512_set1_ps(2.5e-01f));
tmp797 = _mm512_fmadd_ps(tmp788, _mm512_set1_ps(-1.25e+00f), tmp797);
tmp801 = _mm512_fmadd_ps(tmp795, _mm512_set1_ps(-1.25e+00f), tmp801);
tmp788 = _mm512_fmadd_ps(tmp788, _mm512_set1_ps(-5e+00f), tmp790);
tmp795 = _mm512_fmadd_ps(tmp795, _mm512_set1_ps(-5e+00f), tmp793);
tmp796 = _mm512_fmadd_ps(in158, _mm512_set1_ps(-1.25e+00f), tmp796);
tmp800 = _mm512_fmadd_ps(tmp794, _mm512_set1_ps(-1.25e+00f), tmp800);
in156 = _mm512_fmadd_ps(tmp796, _mm512_set1_ps(2e+00f), tmp797);
__m512 tmp804 = _mm512_fmadd_ps(tmp800, _mm512_set1_ps(2e+00f), tmp801);
tmp797 = _mm512_fnmadd_ps(tmp796, _mm512_set1_ps(2e+00f), tmp797);
tmp801 = _mm512_fnmadd_ps(tmp800, _mm512_set1_ps(2e+00f), tmp801);
tmp796 = _mm512_fmadd_ps(in154, _mm512_set1_ps(2.5e-01f), tmp789);
tmp800 = tmp792;
tmp789 = _mm512_sub_ps(in155, tmp789);
// Second-window counterpart of the line above with a zero minuend: 0 - tmp792.
tmp792 = _mm512_sub_ps(_mm512_setzero_ps(), tmp792);
tmp796 = _mm512_fmadd_ps(in158, _mm512_set1_ps(-1.25e+00f), tmp796);
tmp800 = _mm512_fmadd_ps(tmp794, _mm512_set1_ps(-1.25e+00f), tmp800);
in158 = _mm512_sub_ps(in158, in154);
// No-op self-assignment: second-window counterpart of the subtraction above,
// whose subtrahend is one of the zero columns.
tmp794 = tmp794;
in158 = _mm512_fmadd_ps(in158, _mm512_set1_ps(5.25e+00f), tmp789);
tmp794 = _mm512_fmadd_ps(tmp794, _mm512_set1_ps(5.25e+00f), tmp792);
tmp790 = _mm512_fmadd_ps(tmp796, _mm512_set1_ps(2e+00f), tmp788);
tmp793 = _mm512_fmadd_ps(tmp800, _mm512_set1_ps(2e+00f), tmp795);
tmp788 = _mm512_fnmadd_ps(tmp796, _mm512_set1_ps(2e+00f), tmp788);
tmp795 = _mm512_fnmadd_ps(tmp800, _mm512_set1_ps(2e+00f), tmp795);
// Pack the results in pairs: imm 68 takes the low 256 bits of each operand, so
// every outNNN holds eight floats of one result followed by eight floats of
// the next. out205..out208 carry the first window, out209..out212 the second.
__m512 out205 = _mm512_shuffle_f32x4(in152, tmp798, 68);
__m512 out206 = _mm512_shuffle_f32x4(tmp799, in156, 68);
__m512 out207 = _mm512_shuffle_f32x4(tmp797, tmp790, 68);
__m512 out208 = _mm512_shuffle_f32x4(tmp788, in158, 68);
__m512 out209 = _mm512_shuffle_f32x4(tmp791, tmp802, 68);
__m512 out210 = _mm512_shuffle_f32x4(tmp803, tmp804, 68);
__m512 out211 = _mm512_shuffle_f32x4(tmp801, tmp793, 68);
__m512 out212 = _mm512_shuffle_f32x4(tmp795, tmp794, 68);
// Unaligned 64-byte stores. First-window vectors go to offsets 256, 608512,
// 1216768 and 1825024 (608256 apart); each second-window vector is stored 64
// past its first-window partner.
// NOTE(review): offsets are bytes if dfPtr1 is a char pointer like dfPtr2
// below (declaration not visible here).
_mm512_storeu_ps(dfPtr1+256+2433024*i7+152064*j3+38016*s5+768*k6, out205);
_mm512_storeu_ps(dfPtr1+320+2433024*i7+152064*j3+38016*s5+768*k6, out209);
_mm512_storeu_ps(dfPtr1+608512+2433024*i7+152064*j3+38016*s5+768*k6, out206);
_mm512_storeu_ps(dfPtr1+608576+2433024*i7+152064*j3+38016*s5+768*k6, out210);
_mm512_storeu_ps(dfPtr1+1216768+2433024*i7+152064*j3+38016*s5+768*k6, out207);
_mm512_storeu_ps(dfPtr1+1216832+2433024*i7+152064*j3+38016*s5+768*k6, out211);
_mm512_storeu_ps(dfPtr1+1825024+2433024*i7+152064*j3+38016*s5+768*k6, out208);
_mm512_storeu_ps(dfPtr1+1825088+2433024*i7+152064*j3+38016*s5+768*k6, out212);
if (j3 >= last1) return;
++j3;
if (j3 >= 4) break;
}
return;
}
// Set up pointers and indices for the loop that follows, with e2 pinned to 4.
// e2, tensors4, g3 and c1 are declared outside this view.
e2 = 4;
// Byte pointers: source data in tensors4[0], destination in tensors4[1], each
// advanced by a fixed per-e2 stride.
char*restrict datPtr2 = tensors4[0]-0+1384416*e2;
char*restrict dfPtr2 = tensors4[1]+14598144*e2;
// i8 and j4 are copies of g3 and c1; the address expressions below scale them.
ptrdiff_t i8 = 1*g3;
ptrdiff_t j4 = 1*c1;
// last2 equals j4 here. NOTE(review): presumably the loop stops once j4 reaches
// last2, as the preceding loop does with j3/last1 -- the loop end is not
// visible in this view.
ptrdiff_t last2 = j4+0;
// Split j4 into a position within a pair (rel2 = j4 % 2) and a base row index
// (base2 = j4 / 2 * 18); both are computed in unsigned arithmetic via size_t.
ptrdiff_t rel2 = (size_t)(j4-0)%2;
ptrdiff_t base2 = 0+(size_t)(j4-0)/2*18;
for (; ; rel2 = 0, base2 += 18) {
if (rel2 < 1) {
ptrdiff_t h3 = base2+0;
ptrdiff_t w3 = 0;
if (s5 < 3) {
ptrdiff_t k7 = 0;
for (; k7 != 58; ++k7) {
__m512 dat145 = _mm512_maskz_loadu_ps(16383, datPtr2+0+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat146 = _mm512_maskz_loadu_ps(2047, datPtr2+48+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512i pm25 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in160 = _mm512_permutexvar_ps(pm25, dat145);
__m512i pm26 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in168 = _mm512_permutexvar_ps(pm26, dat146);
__m512 dat147 = _mm512_maskz_loadu_ps(16383, datPtr2+92+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat148 = _mm512_maskz_loadu_ps(2047, datPtr2+140+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in161 = _mm512_permutexvar_ps(pm25, dat147);
__m512 in169 = _mm512_permutexvar_ps(pm26, dat148);
__m512 dat149 = _mm512_maskz_loadu_ps(16383, datPtr2+184+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat150 = _mm512_maskz_loadu_ps(2047, datPtr2+232+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in162 = _mm512_permutexvar_ps(pm25, dat149);
__m512 in170 = _mm512_permutexvar_ps(pm26, dat150);
__m512 dat151 = _mm512_maskz_loadu_ps(16383, datPtr2+276+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat152 = _mm512_maskz_loadu_ps(2047, datPtr2+324+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in163 = _mm512_permutexvar_ps(pm25, dat151);
__m512 in171 = _mm512_permutexvar_ps(pm26, dat152);
__m512 dat153 = _mm512_maskz_loadu_ps(16383, datPtr2+368+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat154 = _mm512_maskz_loadu_ps(2047, datPtr2+416+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in164 = _mm512_permutexvar_ps(pm25, dat153);
__m512 in172 = _mm512_permutexvar_ps(pm26, dat154);
__m512 dat155 = _mm512_maskz_loadu_ps(16383, datPtr2+460+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat156 = _mm512_maskz_loadu_ps(2047, datPtr2+508+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in165 = _mm512_permutexvar_ps(pm25, dat155);
__m512 in173 = _mm512_permutexvar_ps(pm26, dat156);
__m512 dat157 = _mm512_maskz_loadu_ps(16383, datPtr2+552+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat158 = _mm512_maskz_loadu_ps(2047, datPtr2+600+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in166 = _mm512_permutexvar_ps(pm25, dat157);
__m512 in174 = _mm512_permutexvar_ps(pm26, dat158);
__m512 dat159 = _mm512_maskz_loadu_ps(16383, datPtr2+644+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat160 = _mm512_maskz_loadu_ps(2047, datPtr2+692+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in167 = _mm512_permutexvar_ps(pm25, dat159);
__m512 in175 = _mm512_permutexvar_ps(pm26, dat160);
__m512 tmp829 = _mm512_add_ps(in161, in165);
__m512 tmp833 = _mm512_add_ps(in169, in173);
__m512 tmp830 = _mm512_sub_ps(in164, in162);
__m512 tmp834 = _mm512_sub_ps(in172, in170);
__m512 tmp831 = _mm512_add_ps(in162, in166);
__m512 tmp835 = _mm512_add_ps(in170, in174);
in160 = _mm512_sub_ps(in160, in166);
in168 = _mm512_sub_ps(in168, in174);
tmp829 = _mm512_fmadd_ps(in163, _mm512_set1_ps(-4.25e+00f), tmp829);
tmp833 = _mm512_fmadd_ps(in171, _mm512_set1_ps(-4.25e+00f), tmp833);
tmp831 = _mm512_fmadd_ps(in164, _mm512_set1_ps(-4.25e+00f), tmp831);
tmp835 = _mm512_fmadd_ps(in172, _mm512_set1_ps(-4.25e+00f), tmp835);
in160 = _mm512_fmadd_ps(tmp830, _mm512_set1_ps(5.25e+00f), in160);
in168 = _mm512_fmadd_ps(tmp834, _mm512_set1_ps(5.25e+00f), in168);
tmp830 = _mm512_fmadd_ps(in162, _mm512_set1_ps(2.5e-01f), in166);
tmp834 = _mm512_fmadd_ps(in170, _mm512_set1_ps(2.5e-01f), in174);
in162 = _mm512_fmadd_ps(in162, _mm512_set1_ps(4e+00f), in166);
in170 = _mm512_fmadd_ps(in170, _mm512_set1_ps(4e+00f), in174);
__m512 tmp832 = _mm512_sub_ps(tmp831, tmp829);
__m512 tmp836 = _mm512_sub_ps(tmp835, tmp833);
tmp831 = _mm512_add_ps(tmp829, tmp831);
tmp835 = _mm512_add_ps(tmp833, tmp835);
tmp829 = _mm512_fmadd_ps(in161, _mm512_set1_ps(2.5e-01f), in165);
tmp833 = _mm512_fmadd_ps(in169, _mm512_set1_ps(2.5e-01f), in173);
tmp830 = _mm512_fmadd_ps(in164, _mm512_set1_ps(-1.25e+00f), tmp830);
tmp834 = _mm512_fmadd_ps(in172, _mm512_set1_ps(-1.25e+00f), tmp834);
in164 = _mm512_fmadd_ps(in164, _mm512_set1_ps(-5e+00f), in162);
in172 = _mm512_fmadd_ps(in172, _mm512_set1_ps(-5e+00f), in170);
tmp829 = _mm512_fmadd_ps(in163, _mm512_set1_ps(-1.25e+00f), tmp829);
tmp833 = _mm512_fmadd_ps(in171, _mm512_set1_ps(-1.25e+00f), tmp833);
in166 = _mm512_fmadd_ps(tmp829, _mm512_set1_ps(2e+00f), tmp830);
in174 = _mm512_fmadd_ps(tmp833, _mm512_set1_ps(2e+00f), tmp834);
tmp830 = _mm512_fnmadd_ps(tmp829, _mm512_set1_ps(2e+00f), tmp830);
tmp834 = _mm512_fnmadd_ps(tmp833, _mm512_set1_ps(2e+00f), tmp834);
tmp829 = _mm512_fmadd_ps(in165, _mm512_set1_ps(2.5e-01f), in161);
tmp833 = _mm512_fmadd_ps(in173, _mm512_set1_ps(2.5e-01f), in169);
in161 = _mm512_sub_ps(in167, in161);
in169 = _mm512_sub_ps(in175, in169);
tmp829 = _mm512_fmadd_ps(in163, _mm512_set1_ps(-1.25e+00f), tmp829);
tmp833 = _mm512_fmadd_ps(in171, _mm512_set1_ps(-1.25e+00f), tmp833);
in163 = _mm512_sub_ps(in163, in165);
in171 = _mm512_sub_ps(in171, in173);
in163 = _mm512_fmadd_ps(in163, _mm512_set1_ps(5.25e+00f), in161);
in171 = _mm512_fmadd_ps(in171, _mm512_set1_ps(5.25e+00f), in169);
in162 = _mm512_fmadd_ps(tmp829, _mm512_set1_ps(2e+00f), in164);
in170 = _mm512_fmadd_ps(tmp833, _mm512_set1_ps(2e+00f), in172);
in164 = _mm512_fnmadd_ps(tmp829, _mm512_set1_ps(2e+00f), in164);
in172 = _mm512_fnmadd_ps(tmp833, _mm512_set1_ps(2e+00f), in172);
__m512 tmp845 = _mm512_unpacklo_ps(in160, tmp831);
__m512 tmp846 = _mm512_unpackhi_ps(in160, tmp831);
__m512 tmp847 = _mm512_unpacklo_ps(tmp832, in166);
__m512 tmp848 = _mm512_unpackhi_ps(tmp832, in166);
__m512 tmp849 = _mm512_unpacklo_ps(tmp830, in162);
__m512 tmp850 = _mm512_unpackhi_ps(tmp830, in162);
__m512 tmp851 = _mm512_unpacklo_ps(in164, in163);
__m512 tmp852 = _mm512_unpackhi_ps(in164, in163);
__m512 tmp853 = _mm512_unpacklo_ps(in168, tmp835);
__m512 tmp854 = _mm512_unpackhi_ps(in168, tmp835);
__m512 tmp855 = _mm512_unpacklo_ps(tmp836, in174);
__m512 tmp856 = _mm512_unpackhi_ps(tmp836, in174);
__m512 tmp857 = _mm512_unpacklo_ps(tmp834, in170);
__m512 tmp858 = _mm512_unpackhi_ps(tmp834, in170);
__m512 tmp859 = _mm512_unpacklo_ps(in172, in171);
__m512 tmp860 = _mm512_unpackhi_ps(in172, in171);
__m512 tmp861 = _mm512_shuffle_ps(tmp845, tmp847, 68);
__m512 tmp862 = _mm512_shuffle_ps(tmp845, tmp847, 238);
__m512 tmp863 = _mm512_shuffle_ps(tmp846, tmp848, 68);
__m512 tmp864 = _mm512_shuffle_ps(tmp846, tmp848, 238);
__m512 tmp865 = _mm512_shuffle_ps(tmp849, tmp851, 68);
__m512 tmp866 = _mm512_shuffle_ps(tmp849, tmp851, 238);
__m512 tmp867 = _mm512_shuffle_ps(tmp850, tmp852, 68);
__m512 tmp868 = _mm512_shuffle_ps(tmp850, tmp852, 238);
__m512 tmp869 = _mm512_shuffle_ps(tmp853, tmp855, 68);
__m512 tmp870 = _mm512_shuffle_ps(tmp853, tmp855, 238);
__m512 tmp871 = _mm512_shuffle_ps(tmp854, tmp856, 68);
__m512 tmp872 = _mm512_shuffle_ps(tmp854, tmp856, 238);
__m512 tmp873 = _mm512_shuffle_ps(tmp857, tmp859, 68);
__m512 tmp874 = _mm512_shuffle_ps(tmp857, tmp859, 238);
__m512 tmp875 = _mm512_shuffle_ps(tmp858, tmp860, 68);
__m512 tmp876 = _mm512_shuffle_ps(tmp858, tmp860, 238);
__m512 tmp877 = _mm512_shuffle_f32x4(tmp861, tmp865, 136);
__m512 tmp878 = _mm512_shuffle_f32x4(tmp861, tmp865, 221);
__m512 tmp879 = _mm512_shuffle_f32x4(tmp862, tmp866, 136);
__m512 tmp880 = _mm512_shuffle_f32x4(tmp862, tmp866, 221);
__m512 tmp881 = _mm512_shuffle_f32x4(tmp863, tmp867, 136);
__m512 tmp882 = _mm512_shuffle_f32x4(tmp863, tmp867, 221);
__m512 tmp883 = _mm512_shuffle_f32x4(tmp864, tmp868, 136);
__m512 tmp884 = _mm512_shuffle_f32x4(tmp864, tmp868, 221);
__m512 tmp885 = _mm512_shuffle_f32x4(tmp869, tmp873, 136);
__m512 tmp886 = _mm512_shuffle_f32x4(tmp869, tmp873, 221);
__m512 tmp887 = _mm512_shuffle_f32x4(tmp870, tmp874, 136);
__m512 tmp888 = _mm512_shuffle_f32x4(tmp870, tmp874, 221);
__m512 tmp889 = _mm512_shuffle_f32x4(tmp871, tmp875, 136);
__m512 tmp890 = _mm512_shuffle_f32x4(tmp871, tmp875, 221);
__m512 tmp891 = _mm512_shuffle_f32x4(tmp872, tmp876, 136);
__m512 tmp892 = _mm512_shuffle_f32x4(tmp872, tmp876, 221);
in160 = _mm512_shuffle_f32x4(tmp877, tmp885, 136);
in168 = _mm512_shuffle_f32x4(tmp877, tmp885, 221);
tmp831 = _mm512_shuffle_f32x4(tmp879, tmp887, 136);
tmp835 = _mm512_shuffle_f32x4(tmp879, tmp887, 221);
tmp832 = _mm512_shuffle_f32x4(tmp881, tmp889, 136);
tmp836 = _mm512_shuffle_f32x4(tmp881, tmp889, 221);
in166 = _mm512_shuffle_f32x4(tmp883, tmp891, 136);
in174 = _mm512_shuffle_f32x4(tmp883, tmp891, 221);
tmp830 = _mm512_shuffle_f32x4(tmp878, tmp886, 136);
tmp834 = _mm512_shuffle_f32x4(tmp878, tmp886, 221);
in162 = _mm512_shuffle_f32x4(tmp880, tmp888, 136);
in170 = _mm512_shuffle_f32x4(tmp880, tmp888, 221);
in164 = _mm512_shuffle_f32x4(tmp882, tmp890, 136);
in172 = _mm512_shuffle_f32x4(tmp882, tmp890, 221);
in163 = _mm512_shuffle_f32x4(tmp884, tmp892, 136);
in171 = _mm512_shuffle_f32x4(tmp884, tmp892, 221);
__m512 tmp837 = _mm512_add_ps(tmp831, in162);
__m512 tmp841 = _mm512_add_ps(tmp835, in170);
__m512 tmp838 = _mm512_sub_ps(tmp830, tmp832);
__m512 tmp842 = _mm512_sub_ps(tmp834, tmp836);
__m512 tmp839 = _mm512_add_ps(tmp832, in164);
__m512 tmp843 = _mm512_add_ps(tmp836, in172);
in160 = _mm512_sub_ps(in160, in164);
in168 = _mm512_sub_ps(in168, in172);
tmp837 = _mm512_fmadd_ps(in166, _mm512_set1_ps(-4.25e+00f), tmp837);
tmp841 = _mm512_fmadd_ps(in174, _mm512_set1_ps(-4.25e+00f), tmp841);
tmp839 = _mm512_fmadd_ps(tmp830, _mm512_set1_ps(-4.25e+00f), tmp839);
tmp843 = _mm512_fmadd_ps(tmp834, _mm512_set1_ps(-4.25e+00f), tmp843);
in160 = _mm512_fmadd_ps(tmp838, _mm512_set1_ps(5.25e+00f), in160);
in168 = _mm512_fmadd_ps(tmp842, _mm512_set1_ps(5.25e+00f), in168);
tmp838 = _mm512_fmadd_ps(tmp832, _mm512_set1_ps(2.5e-01f), in164);
tmp842 = _mm512_fmadd_ps(tmp836, _mm512_set1_ps(2.5e-01f), in172);
tmp832 = _mm512_fmadd_ps(tmp832, _mm512_set1_ps(4e+00f), in164);
tmp836 = _mm512_fmadd_ps(tmp836, _mm512_set1_ps(4e+00f), in172);
__m512 tmp840 = _mm512_sub_ps(tmp839, tmp837);
__m512 tmp844 = _mm512_sub_ps(tmp843, tmp841);
tmp839 = _mm512_add_ps(tmp837, tmp839);
tmp843 = _mm512_add_ps(tmp841, tmp843);
tmp837 = _mm512_fmadd_ps(tmp831, _mm512_set1_ps(2.5e-01f), in162);
tmp841 = _mm512_fmadd_ps(tmp835, _mm512_set1_ps(2.5e-01f), in170);
tmp838 = _mm512_fmadd_ps(tmp830, _mm512_set1_ps(-1.25e+00f), tmp838);
tmp842 = _mm512_fmadd_ps(tmp834, _mm512_set1_ps(-1.25e+00f), tmp842);
tmp830 = _mm512_fmadd_ps(tmp830, _mm512_set1_ps(-5e+00f), tmp832);
tmp834 = _mm512_fmadd_ps(tmp834, _mm512_set1_ps(-5e+00f), tmp836);
tmp837 = _mm512_fmadd_ps(in166, _mm512_set1_ps(-1.25e+00f), tmp837);
tmp841 = _mm512_fmadd_ps(in174, _mm512_set1_ps(-1.25e+00f), tmp841);
in164 = _mm512_fmadd_ps(tmp837, _mm512_set1_ps(2e+00f), tmp838);
in172 = _mm512_fmadd_ps(tmp841, _mm512_set1_ps(2e+00f), tmp842);
tmp838 = _mm512_fnmadd_ps(tmp837, _mm512_set1_ps(2e+00f), tmp838);
tmp842 = _mm512_fnmadd_ps(tmp841, _mm512_set1_ps(2e+00f), tmp842);
tmp837 = _mm512_fmadd_ps(in162, _mm512_set1_ps(2.5e-01f), tmp831);
tmp841 = _mm512_fmadd_ps(in170, _mm512_set1_ps(2.5e-01f), tmp835);
tmp831 = _mm512_sub_ps(in163, tmp831);
tmp835 = _mm512_sub_ps(in171, tmp835);
tmp837 = _mm512_fmadd_ps(in166, _mm512_set1_ps(-1.25e+00f), tmp837);
tmp841 = _mm512_fmadd_ps(in174, _mm512_set1_ps(-1.25e+00f), tmp841);
in166 = _mm512_sub_ps(in166, in162);
in174 = _mm512_sub_ps(in174, in170);
in166 = _mm512_fmadd_ps(in166, _mm512_set1_ps(5.25e+00f), tmp831);
in174 = _mm512_fmadd_ps(in174, _mm512_set1_ps(5.25e+00f), tmp835);
tmp832 = _mm512_fmadd_ps(tmp837, _mm512_set1_ps(2e+00f), tmp830);
tmp836 = _mm512_fmadd_ps(tmp841, _mm512_set1_ps(2e+00f), tmp834);
tmp830 = _mm512_fnmadd_ps(tmp837, _mm512_set1_ps(2e+00f), tmp830);
tmp834 = _mm512_fnmadd_ps(tmp841, _mm512_set1_ps(2e+00f), tmp834);
__m512 out213 = _mm512_shuffle_f32x4(in160, tmp839, 68);
__m512 out221 = _mm512_shuffle_f32x4(in160, tmp839, 238);
__m512 out214 = _mm512_shuffle_f32x4(tmp840, in164, 68);
__m512 out222 = _mm512_shuffle_f32x4(tmp840, in164, 238);
__m512 out215 = _mm512_shuffle_f32x4(tmp838, tmp832, 68);
__m512 out223 = _mm512_shuffle_f32x4(tmp838, tmp832, 238);
__m512 out216 = _mm512_shuffle_f32x4(tmp830, in166, 68);
__m512 out224 = _mm512_shuffle_f32x4(tmp830, in166, 238);
__m512 out217 = _mm512_shuffle_f32x4(in168, tmp843, 68);
__m512 out225 = _mm512_shuffle_f32x4(in168, tmp843, 238);
__m512 out218 = _mm512_shuffle_f32x4(tmp844, in172, 68);
__m512 out226 = _mm512_shuffle_f32x4(tmp844, in172, 238);
__m512 out219 = _mm512_shuffle_f32x4(tmp842, tmp836, 68);
__m512 out227 = _mm512_shuffle_f32x4(tmp842, tmp836, 238);
__m512 out220 = _mm512_shuffle_f32x4(tmp834, in174, 68);
__m512 out228 = _mm512_shuffle_f32x4(tmp834, in174, 238);
_mm512_storeu_ps(dfPtr2+0+2856960*i8+178560*j4+44544*s5+768*k7, out213);
_mm512_storeu_ps(dfPtr2+128+2856960*i8+178560*j4+44544*s5+768*k7, out221);
_mm512_storeu_ps(dfPtr2+64+2856960*i8+178560*j4+44544*s5+768*k7, out217);
_mm512_storeu_ps(dfPtr2+192+2856960*i8+178560*j4+44544*s5+768*k7, out225);
_mm512_storeu_ps(dfPtr2+714240+2856960*i8+178560*j4+44544*s5+768*k7, out214);
_mm512_storeu_ps(dfPtr2+714368+2856960*i8+178560*j4+44544*s5+768*k7, out222);
_mm512_storeu_ps(dfPtr2+714304+2856960*i8+178560*j4+44544*s5+768*k7, out218);
_mm512_storeu_ps(dfPtr2+714432+2856960*i8+178560*j4+44544*s5+768*k7, out226);
_mm512_storeu_ps(dfPtr2+1428480+2856960*i8+178560*j4+44544*s5+768*k7, out215);
_mm512_storeu_ps(dfPtr2+1428608+2856960*i8+178560*j4+44544*s5+768*k7, out223);
_mm512_storeu_ps(dfPtr2+1428544+2856960*i8+178560*j4+44544*s5+768*k7, out219);
_mm512_storeu_ps(dfPtr2+1428672+2856960*i8+178560*j4+44544*s5+768*k7, out227);
_mm512_storeu_ps(dfPtr2+2142720+2856960*i8+178560*j4+44544*s5+768*k7, out216);
_mm512_storeu_ps(dfPtr2+2142848+2856960*i8+178560*j4+44544*s5+768*k7, out224);
_mm512_storeu_ps(dfPtr2+2142784+2856960*i8+178560*j4+44544*s5+768*k7, out220);
_mm512_storeu_ps(dfPtr2+2142912+2856960*i8+178560*j4+44544*s5+768*k7, out228);
// ---- Tile unit: loads dat161..dat176, stores out229..out244 ----
// Reads sixteen partial rows from datPtr2, runs an 8-input linear
// combination across registers, reshuffles lanes, runs the same
// combination again, and writes sixteen vectors to dfPtr2.
//
// Load stage. Every load is masked with 16383 (0x3FFF): lanes 0-13 are
// read, lanes 14-15 are zeroed. Odd-numbered dat registers use offsets
// 552, 644, ..., 1196 and even-numbered ones 3496, 3588, ..., 4140; both
// series advance by 92 per load.
// NOTE(review): 92 looks like one image row (23 floats * 4 bytes, with
// Width=23 in the graph config) and 3496 like one 38-row channel, which
// would make datPtr2 a byte pointer -- its declaration is outside this
// chunk, so confirm.
//
// pm27 sends destination lanes 0-7 to source lanes 0-7 and destination
// lanes 8-15 to source lanes 6-13, so each permuted register carries two
// 8-wide windows that overlap by two elements.
__m512 dat161 = _mm512_maskz_loadu_ps(16383, datPtr2+552+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat162 = _mm512_maskz_loadu_ps(16383, datPtr2+3496+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512i pm27 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in176 = _mm512_permutexvar_ps(pm27, dat161);
__m512 in184 = _mm512_permutexvar_ps(pm27, dat162);
__m512 dat163 = _mm512_maskz_loadu_ps(16383, datPtr2+644+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat164 = _mm512_maskz_loadu_ps(16383, datPtr2+3588+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in177 = _mm512_permutexvar_ps(pm27, dat163);
__m512 in185 = _mm512_permutexvar_ps(pm27, dat164);
__m512 dat165 = _mm512_maskz_loadu_ps(16383, datPtr2+736+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat166 = _mm512_maskz_loadu_ps(16383, datPtr2+3680+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in178 = _mm512_permutexvar_ps(pm27, dat165);
__m512 in186 = _mm512_permutexvar_ps(pm27, dat166);
__m512 dat167 = _mm512_maskz_loadu_ps(16383, datPtr2+828+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat168 = _mm512_maskz_loadu_ps(16383, datPtr2+3772+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in179 = _mm512_permutexvar_ps(pm27, dat167);
__m512 in187 = _mm512_permutexvar_ps(pm27, dat168);
__m512 dat169 = _mm512_maskz_loadu_ps(16383, datPtr2+920+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat170 = _mm512_maskz_loadu_ps(16383, datPtr2+3864+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in180 = _mm512_permutexvar_ps(pm27, dat169);
__m512 in188 = _mm512_permutexvar_ps(pm27, dat170);
__m512 dat171 = _mm512_maskz_loadu_ps(16383, datPtr2+1012+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat172 = _mm512_maskz_loadu_ps(16383, datPtr2+3956+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in181 = _mm512_permutexvar_ps(pm27, dat171);
__m512 in189 = _mm512_permutexvar_ps(pm27, dat172);
__m512 dat173 = _mm512_maskz_loadu_ps(16383, datPtr2+1104+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat174 = _mm512_maskz_loadu_ps(16383, datPtr2+4048+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in182 = _mm512_permutexvar_ps(pm27, dat173);
__m512 in190 = _mm512_permutexvar_ps(pm27, dat174);
__m512 dat175 = _mm512_maskz_loadu_ps(16383, datPtr2+1196+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat176 = _mm512_maskz_loadu_ps(16383, datPtr2+4140+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in183 = _mm512_permutexvar_ps(pm27, dat175);
__m512 in191 = _mm512_permutexvar_ps(pm27, dat176);
// First pass. Lane-wise linear combination of the eight registers
// in176..in183 (and, interleaved line by line, in184..in191). Registers
// are overwritten as soon as their old value is no longer needed, so the
// statement order is significant. The eight results of the first group
// end up in in176, tmp895, tmp896, in182, tmp894, in178, in180, in179;
// the second group's in in184, tmp899, tmp900, in190, tmp898, in186,
// in188, in187.
// NOTE(review): the constants (5.25, 4.25, 1.25, 0.25, 2, 4, 5) match the
// 8-point input transform of Winograd F(6,3) -- presumably that is what
// this implements; confirm against the generator.
__m512 tmp893 = _mm512_add_ps(in177, in181);
__m512 tmp897 = _mm512_add_ps(in185, in189);
__m512 tmp894 = _mm512_sub_ps(in180, in178);
__m512 tmp898 = _mm512_sub_ps(in188, in186);
__m512 tmp895 = _mm512_add_ps(in178, in182);
__m512 tmp899 = _mm512_add_ps(in186, in190);
in176 = _mm512_sub_ps(in176, in182);
in184 = _mm512_sub_ps(in184, in190);
tmp893 = _mm512_fmadd_ps(in179, _mm512_set1_ps(-4.25e+00f), tmp893);
tmp897 = _mm512_fmadd_ps(in187, _mm512_set1_ps(-4.25e+00f), tmp897);
tmp895 = _mm512_fmadd_ps(in180, _mm512_set1_ps(-4.25e+00f), tmp895);
tmp899 = _mm512_fmadd_ps(in188, _mm512_set1_ps(-4.25e+00f), tmp899);
in176 = _mm512_fmadd_ps(tmp894, _mm512_set1_ps(5.25e+00f), in176);
in184 = _mm512_fmadd_ps(tmp898, _mm512_set1_ps(5.25e+00f), in184);
tmp894 = _mm512_fmadd_ps(in178, _mm512_set1_ps(2.5e-01f), in182);
tmp898 = _mm512_fmadd_ps(in186, _mm512_set1_ps(2.5e-01f), in190);
in178 = _mm512_fmadd_ps(in178, _mm512_set1_ps(4e+00f), in182);
in186 = _mm512_fmadd_ps(in186, _mm512_set1_ps(4e+00f), in190);
__m512 tmp896 = _mm512_sub_ps(tmp895, tmp893);
__m512 tmp900 = _mm512_sub_ps(tmp899, tmp897);
tmp895 = _mm512_add_ps(tmp893, tmp895);
tmp899 = _mm512_add_ps(tmp897, tmp899);
tmp893 = _mm512_fmadd_ps(in177, _mm512_set1_ps(2.5e-01f), in181);
tmp897 = _mm512_fmadd_ps(in185, _mm512_set1_ps(2.5e-01f), in189);
tmp894 = _mm512_fmadd_ps(in180, _mm512_set1_ps(-1.25e+00f), tmp894);
tmp898 = _mm512_fmadd_ps(in188, _mm512_set1_ps(-1.25e+00f), tmp898);
in180 = _mm512_fmadd_ps(in180, _mm512_set1_ps(-5e+00f), in178);
in188 = _mm512_fmadd_ps(in188, _mm512_set1_ps(-5e+00f), in186);
tmp893 = _mm512_fmadd_ps(in179, _mm512_set1_ps(-1.25e+00f), tmp893);
tmp897 = _mm512_fmadd_ps(in187, _mm512_set1_ps(-1.25e+00f), tmp897);
in182 = _mm512_fmadd_ps(tmp893, _mm512_set1_ps(2e+00f), tmp894);
in190 = _mm512_fmadd_ps(tmp897, _mm512_set1_ps(2e+00f), tmp898);
tmp894 = _mm512_fnmadd_ps(tmp893, _mm512_set1_ps(2e+00f), tmp894);
tmp898 = _mm512_fnmadd_ps(tmp897, _mm512_set1_ps(2e+00f), tmp898);
tmp893 = _mm512_fmadd_ps(in181, _mm512_set1_ps(2.5e-01f), in177);
tmp897 = _mm512_fmadd_ps(in189, _mm512_set1_ps(2.5e-01f), in185);
in177 = _mm512_sub_ps(in183, in177);
in185 = _mm512_sub_ps(in191, in185);
tmp893 = _mm512_fmadd_ps(in179, _mm512_set1_ps(-1.25e+00f), tmp893);
tmp897 = _mm512_fmadd_ps(in187, _mm512_set1_ps(-1.25e+00f), tmp897);
in179 = _mm512_sub_ps(in179, in181);
in187 = _mm512_sub_ps(in187, in189);
in179 = _mm512_fmadd_ps(in179, _mm512_set1_ps(5.25e+00f), in177);
in187 = _mm512_fmadd_ps(in187, _mm512_set1_ps(5.25e+00f), in185);
in178 = _mm512_fmadd_ps(tmp893, _mm512_set1_ps(2e+00f), in180);
in186 = _mm512_fmadd_ps(tmp897, _mm512_set1_ps(2e+00f), in188);
in180 = _mm512_fnmadd_ps(tmp893, _mm512_set1_ps(2e+00f), in180);
in188 = _mm512_fnmadd_ps(tmp897, _mm512_set1_ps(2e+00f), in188);
// Reshuffle stage, step 1: interleave adjacent result registers pairwise
// within each 128-bit lane.
__m512 tmp909 = _mm512_unpacklo_ps(in176, tmp895);
__m512 tmp910 = _mm512_unpackhi_ps(in176, tmp895);
__m512 tmp911 = _mm512_unpacklo_ps(tmp896, in182);
__m512 tmp912 = _mm512_unpackhi_ps(tmp896, in182);
__m512 tmp913 = _mm512_unpacklo_ps(tmp894, in178);
__m512 tmp914 = _mm512_unpackhi_ps(tmp894, in178);
__m512 tmp915 = _mm512_unpacklo_ps(in180, in179);
__m512 tmp916 = _mm512_unpackhi_ps(in180, in179);
__m512 tmp917 = _mm512_unpacklo_ps(in184, tmp899);
__m512 tmp918 = _mm512_unpackhi_ps(in184, tmp899);
__m512 tmp919 = _mm512_unpacklo_ps(tmp900, in190);
__m512 tmp920 = _mm512_unpackhi_ps(tmp900, in190);
__m512 tmp921 = _mm512_unpacklo_ps(tmp898, in186);
__m512 tmp922 = _mm512_unpackhi_ps(tmp898, in186);
__m512 tmp923 = _mm512_unpacklo_ps(in188, in187);
__m512 tmp924 = _mm512_unpackhi_ps(in188, in187);
// Step 2: shuffle_ps with 68 / 238 picks the low / high element pairs of
// both operands, completing a 4x4 transpose inside every 128-bit lane
// (e.g. each 128-bit lane of tmp925 holds that lane's element 0 from
// in176, tmp895, tmp896, in182, in that order).
__m512 tmp925 = _mm512_shuffle_ps(tmp909, tmp911, 68);
__m512 tmp926 = _mm512_shuffle_ps(tmp909, tmp911, 238);
__m512 tmp927 = _mm512_shuffle_ps(tmp910, tmp912, 68);
__m512 tmp928 = _mm512_shuffle_ps(tmp910, tmp912, 238);
__m512 tmp929 = _mm512_shuffle_ps(tmp913, tmp915, 68);
__m512 tmp930 = _mm512_shuffle_ps(tmp913, tmp915, 238);
__m512 tmp931 = _mm512_shuffle_ps(tmp914, tmp916, 68);
__m512 tmp932 = _mm512_shuffle_ps(tmp914, tmp916, 238);
__m512 tmp933 = _mm512_shuffle_ps(tmp917, tmp919, 68);
__m512 tmp934 = _mm512_shuffle_ps(tmp917, tmp919, 238);
__m512 tmp935 = _mm512_shuffle_ps(tmp918, tmp920, 68);
__m512 tmp936 = _mm512_shuffle_ps(tmp918, tmp920, 238);
__m512 tmp937 = _mm512_shuffle_ps(tmp921, tmp923, 68);
__m512 tmp938 = _mm512_shuffle_ps(tmp921, tmp923, 238);
__m512 tmp939 = _mm512_shuffle_ps(tmp922, tmp924, 68);
__m512 tmp940 = _mm512_shuffle_ps(tmp922, tmp924, 238);
// Step 3: regroup whole 128-bit lanes. Immediate 136 takes lanes 0 and 2
// of each operand, 221 takes lanes 1 and 3.
__m512 tmp941 = _mm512_shuffle_f32x4(tmp925, tmp929, 136);
__m512 tmp942 = _mm512_shuffle_f32x4(tmp925, tmp929, 221);
__m512 tmp943 = _mm512_shuffle_f32x4(tmp926, tmp930, 136);
__m512 tmp944 = _mm512_shuffle_f32x4(tmp926, tmp930, 221);
__m512 tmp945 = _mm512_shuffle_f32x4(tmp927, tmp931, 136);
__m512 tmp946 = _mm512_shuffle_f32x4(tmp927, tmp931, 221);
__m512 tmp947 = _mm512_shuffle_f32x4(tmp928, tmp932, 136);
__m512 tmp948 = _mm512_shuffle_f32x4(tmp928, tmp932, 221);
__m512 tmp949 = _mm512_shuffle_f32x4(tmp933, tmp937, 136);
__m512 tmp950 = _mm512_shuffle_f32x4(tmp933, tmp937, 221);
__m512 tmp951 = _mm512_shuffle_f32x4(tmp934, tmp938, 136);
__m512 tmp952 = _mm512_shuffle_f32x4(tmp934, tmp938, 221);
__m512 tmp953 = _mm512_shuffle_f32x4(tmp935, tmp939, 136);
__m512 tmp954 = _mm512_shuffle_f32x4(tmp935, tmp939, 221);
__m512 tmp955 = _mm512_shuffle_f32x4(tmp936, tmp940, 136);
__m512 tmp956 = _mm512_shuffle_f32x4(tmp936, tmp940, 221);
// Step 4: a second 128-bit lane regroup that mixes the two register
// groups, written back into the sixteen working variables of pass one.
in176 = _mm512_shuffle_f32x4(tmp941, tmp949, 136);
in184 = _mm512_shuffle_f32x4(tmp941, tmp949, 221);
tmp895 = _mm512_shuffle_f32x4(tmp943, tmp951, 136);
tmp899 = _mm512_shuffle_f32x4(tmp943, tmp951, 221);
tmp896 = _mm512_shuffle_f32x4(tmp945, tmp953, 136);
tmp900 = _mm512_shuffle_f32x4(tmp945, tmp953, 221);
in182 = _mm512_shuffle_f32x4(tmp947, tmp955, 136);
in190 = _mm512_shuffle_f32x4(tmp947, tmp955, 221);
tmp894 = _mm512_shuffle_f32x4(tmp942, tmp950, 136);
tmp898 = _mm512_shuffle_f32x4(tmp942, tmp950, 221);
in178 = _mm512_shuffle_f32x4(tmp944, tmp952, 136);
in186 = _mm512_shuffle_f32x4(tmp944, tmp952, 221);
in180 = _mm512_shuffle_f32x4(tmp946, tmp954, 136);
in188 = _mm512_shuffle_f32x4(tmp946, tmp954, 221);
in179 = _mm512_shuffle_f32x4(tmp948, tmp956, 136);
in187 = _mm512_shuffle_f32x4(tmp948, tmp956, 221);
// Second pass. Same operation sequence and constants as the first pass,
// with (in176, tmp895, tmp896, in182, tmp894, in178, in180, in179) taking
// the roles of the eight inputs (and the in184.. group in parallel).
// Results land in in176, tmp903, tmp904, in180, tmp902, tmp896, tmp894,
// in182 (second group: in184, tmp907, tmp908, in188, tmp906, tmp900,
// tmp898, in190).
__m512 tmp901 = _mm512_add_ps(tmp895, in178);
__m512 tmp905 = _mm512_add_ps(tmp899, in186);
__m512 tmp902 = _mm512_sub_ps(tmp894, tmp896);
__m512 tmp906 = _mm512_sub_ps(tmp898, tmp900);
__m512 tmp903 = _mm512_add_ps(tmp896, in180);
__m512 tmp907 = _mm512_add_ps(tmp900, in188);
in176 = _mm512_sub_ps(in176, in180);
in184 = _mm512_sub_ps(in184, in188);
tmp901 = _mm512_fmadd_ps(in182, _mm512_set1_ps(-4.25e+00f), tmp901);
tmp905 = _mm512_fmadd_ps(in190, _mm512_set1_ps(-4.25e+00f), tmp905);
tmp903 = _mm512_fmadd_ps(tmp894, _mm512_set1_ps(-4.25e+00f), tmp903);
tmp907 = _mm512_fmadd_ps(tmp898, _mm512_set1_ps(-4.25e+00f), tmp907);
in176 = _mm512_fmadd_ps(tmp902, _mm512_set1_ps(5.25e+00f), in176);
in184 = _mm512_fmadd_ps(tmp906, _mm512_set1_ps(5.25e+00f), in184);
tmp902 = _mm512_fmadd_ps(tmp896, _mm512_set1_ps(2.5e-01f), in180);
tmp906 = _mm512_fmadd_ps(tmp900, _mm512_set1_ps(2.5e-01f), in188);
tmp896 = _mm512_fmadd_ps(tmp896, _mm512_set1_ps(4e+00f), in180);
tmp900 = _mm512_fmadd_ps(tmp900, _mm512_set1_ps(4e+00f), in188);
__m512 tmp904 = _mm512_sub_ps(tmp903, tmp901);
__m512 tmp908 = _mm512_sub_ps(tmp907, tmp905);
tmp903 = _mm512_add_ps(tmp901, tmp903);
tmp907 = _mm512_add_ps(tmp905, tmp907);
tmp901 = _mm512_fmadd_ps(tmp895, _mm512_set1_ps(2.5e-01f), in178);
tmp905 = _mm512_fmadd_ps(tmp899, _mm512_set1_ps(2.5e-01f), in186);
tmp902 = _mm512_fmadd_ps(tmp894, _mm512_set1_ps(-1.25e+00f), tmp902);
tmp906 = _mm512_fmadd_ps(tmp898, _mm512_set1_ps(-1.25e+00f), tmp906);
tmp894 = _mm512_fmadd_ps(tmp894, _mm512_set1_ps(-5e+00f), tmp896);
tmp898 = _mm512_fmadd_ps(tmp898, _mm512_set1_ps(-5e+00f), tmp900);
tmp901 = _mm512_fmadd_ps(in182, _mm512_set1_ps(-1.25e+00f), tmp901);
tmp905 = _mm512_fmadd_ps(in190, _mm512_set1_ps(-1.25e+00f), tmp905);
in180 = _mm512_fmadd_ps(tmp901, _mm512_set1_ps(2e+00f), tmp902);
in188 = _mm512_fmadd_ps(tmp905, _mm512_set1_ps(2e+00f), tmp906);
tmp902 = _mm512_fnmadd_ps(tmp901, _mm512_set1_ps(2e+00f), tmp902);
tmp906 = _mm512_fnmadd_ps(tmp905, _mm512_set1_ps(2e+00f), tmp906);
tmp901 = _mm512_fmadd_ps(in178, _mm512_set1_ps(2.5e-01f), tmp895);
tmp905 = _mm512_fmadd_ps(in186, _mm512_set1_ps(2.5e-01f), tmp899);
tmp895 = _mm512_sub_ps(in179, tmp895);
tmp899 = _mm512_sub_ps(in187, tmp899);
tmp901 = _mm512_fmadd_ps(in182, _mm512_set1_ps(-1.25e+00f), tmp901);
tmp905 = _mm512_fmadd_ps(in190, _mm512_set1_ps(-1.25e+00f), tmp905);
in182 = _mm512_sub_ps(in182, in178);
in190 = _mm512_sub_ps(in190, in186);
in182 = _mm512_fmadd_ps(in182, _mm512_set1_ps(5.25e+00f), tmp895);
in190 = _mm512_fmadd_ps(in190, _mm512_set1_ps(5.25e+00f), tmp899);
tmp896 = _mm512_fmadd_ps(tmp901, _mm512_set1_ps(2e+00f), tmp894);
tmp900 = _mm512_fmadd_ps(tmp905, _mm512_set1_ps(2e+00f), tmp898);
tmp894 = _mm512_fnmadd_ps(tmp901, _mm512_set1_ps(2e+00f), tmp894);
tmp898 = _mm512_fnmadd_ps(tmp905, _mm512_set1_ps(2e+00f), tmp898);
// Pack for storing: immediate 68 concatenates the low 256 bits of the two
// operands, 238 concatenates their high 256 bits.
__m512 out229 = _mm512_shuffle_f32x4(in176, tmp903, 68);
__m512 out237 = _mm512_shuffle_f32x4(in176, tmp903, 238);
__m512 out230 = _mm512_shuffle_f32x4(tmp904, in180, 68);
__m512 out238 = _mm512_shuffle_f32x4(tmp904, in180, 238);
__m512 out231 = _mm512_shuffle_f32x4(tmp902, tmp896, 68);
__m512 out239 = _mm512_shuffle_f32x4(tmp902, tmp896, 238);
__m512 out232 = _mm512_shuffle_f32x4(tmp894, in182, 68);
__m512 out240 = _mm512_shuffle_f32x4(tmp894, in182, 238);
__m512 out233 = _mm512_shuffle_f32x4(in184, tmp907, 68);
__m512 out241 = _mm512_shuffle_f32x4(in184, tmp907, 238);
__m512 out234 = _mm512_shuffle_f32x4(tmp908, in188, 68);
__m512 out242 = _mm512_shuffle_f32x4(tmp908, in188, 238);
__m512 out235 = _mm512_shuffle_f32x4(tmp906, tmp900, 68);
__m512 out243 = _mm512_shuffle_f32x4(tmp906, tmp900, 238);
__m512 out236 = _mm512_shuffle_f32x4(tmp898, in190, 68);
__m512 out244 = _mm512_shuffle_f32x4(tmp898, in190, 238);
// Store stage. Sixteen unaligned stores in four groups whose base offsets
// are 714240 apart (256, 714496, 1428736, 2142976); within a group the
// four vectors cover offsets base, base+64, base+128, base+192.
// NOTE(review): presumably dfPtr2 is a byte pointer, making each 64-step
// exactly one __m512 -- its declaration is outside this chunk; confirm.
_mm512_storeu_ps(dfPtr2+256+2856960*i8+178560*j4+44544*s5+768*k7, out229);
_mm512_storeu_ps(dfPtr2+384+2856960*i8+178560*j4+44544*s5+768*k7, out237);
_mm512_storeu_ps(dfPtr2+320+2856960*i8+178560*j4+44544*s5+768*k7, out233);
_mm512_storeu_ps(dfPtr2+448+2856960*i8+178560*j4+44544*s5+768*k7, out241);
_mm512_storeu_ps(dfPtr2+714496+2856960*i8+178560*j4+44544*s5+768*k7, out230);
_mm512_storeu_ps(dfPtr2+714624+2856960*i8+178560*j4+44544*s5+768*k7, out238);
_mm512_storeu_ps(dfPtr2+714560+2856960*i8+178560*j4+44544*s5+768*k7, out234);
_mm512_storeu_ps(dfPtr2+714688+2856960*i8+178560*j4+44544*s5+768*k7, out242);
_mm512_storeu_ps(dfPtr2+1428736+2856960*i8+178560*j4+44544*s5+768*k7, out231);
_mm512_storeu_ps(dfPtr2+1428864+2856960*i8+178560*j4+44544*s5+768*k7, out239);
_mm512_storeu_ps(dfPtr2+1428800+2856960*i8+178560*j4+44544*s5+768*k7, out235);
_mm512_storeu_ps(dfPtr2+1428928+2856960*i8+178560*j4+44544*s5+768*k7, out243);
_mm512_storeu_ps(dfPtr2+2142976+2856960*i8+178560*j4+44544*s5+768*k7, out232);
_mm512_storeu_ps(dfPtr2+2143104+2856960*i8+178560*j4+44544*s5+768*k7, out240);
_mm512_storeu_ps(dfPtr2+2143040+2856960*i8+178560*j4+44544*s5+768*k7, out236);
_mm512_storeu_ps(dfPtr2+2143168+2856960*i8+178560*j4+44544*s5+768*k7, out244);
// ---- Tile unit: loads dat177..dat192, stores out245..out260 ----
// Same pipeline as the neighbouring tile units (load, 8-input linear
// combination, lane reshuffle, second combination, store), but the two
// register groups are loaded with different masks and permutations.
//
// Load stage. Odd-numbered dat registers (offsets 3544, 3636, ..., 4188)
// use mask 2047 (0x7FF): lanes 0-10 are read, lanes 11-15 are zeroed.
// Even-numbered ones (offsets 4048, 4140, ..., 4692) use mask 16383
// (0x3FFF): lanes 0-13 are read. Both series advance by 92 per load.
// NOTE(review): 92 looks like one image row (23 floats * 4 bytes, with
// Width=23 in the graph config), so the 11-lane loads would be reading
// the last columns of a row -- datPtr2's declaration is outside this
// chunk, so confirm.
//
// pm28 (for the 11-lane loads) sends destination lanes 0-7 to source
// lanes 0-7, lanes 8-12 to source lanes 6-10, and lanes 13-15 to source
// lane 15, which the mask has zeroed -- so the upper window is padded
// with three zeros. pm29 (for the 14-lane loads) sends lanes 0-7 to
// source lanes 0-7 and lanes 8-15 to source lanes 6-13.
__m512 dat177 = _mm512_maskz_loadu_ps(2047, datPtr2+3544+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat178 = _mm512_maskz_loadu_ps(16383, datPtr2+4048+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512i pm28 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in192 = _mm512_permutexvar_ps(pm28, dat177);
__m512i pm29 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in200 = _mm512_permutexvar_ps(pm29, dat178);
__m512 dat179 = _mm512_maskz_loadu_ps(2047, datPtr2+3636+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat180 = _mm512_maskz_loadu_ps(16383, datPtr2+4140+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in193 = _mm512_permutexvar_ps(pm28, dat179);
__m512 in201 = _mm512_permutexvar_ps(pm29, dat180);
__m512 dat181 = _mm512_maskz_loadu_ps(2047, datPtr2+3728+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat182 = _mm512_maskz_loadu_ps(16383, datPtr2+4232+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in194 = _mm512_permutexvar_ps(pm28, dat181);
__m512 in202 = _mm512_permutexvar_ps(pm29, dat182);
__m512 dat183 = _mm512_maskz_loadu_ps(2047, datPtr2+3820+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat184 = _mm512_maskz_loadu_ps(16383, datPtr2+4324+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in195 = _mm512_permutexvar_ps(pm28, dat183);
__m512 in203 = _mm512_permutexvar_ps(pm29, dat184);
__m512 dat185 = _mm512_maskz_loadu_ps(2047, datPtr2+3912+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat186 = _mm512_maskz_loadu_ps(16383, datPtr2+4416+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in196 = _mm512_permutexvar_ps(pm28, dat185);
__m512 in204 = _mm512_permutexvar_ps(pm29, dat186);
__m512 dat187 = _mm512_maskz_loadu_ps(2047, datPtr2+4004+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat188 = _mm512_maskz_loadu_ps(16383, datPtr2+4508+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in197 = _mm512_permutexvar_ps(pm28, dat187);
__m512 in205 = _mm512_permutexvar_ps(pm29, dat188);
__m512 dat189 = _mm512_maskz_loadu_ps(2047, datPtr2+4096+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat190 = _mm512_maskz_loadu_ps(16383, datPtr2+4600+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in198 = _mm512_permutexvar_ps(pm28, dat189);
__m512 in206 = _mm512_permutexvar_ps(pm29, dat190);
__m512 dat191 = _mm512_maskz_loadu_ps(2047, datPtr2+4188+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 dat192 = _mm512_maskz_loadu_ps(16383, datPtr2+4692+7163304*i8+92*h3+4*w3+405536*s5+6992*k7);
__m512 in199 = _mm512_permutexvar_ps(pm28, dat191);
__m512 in207 = _mm512_permutexvar_ps(pm29, dat192);
// First pass. Lane-wise linear combination of the eight registers
// in192..in199 (and, interleaved line by line, in200..in207). Registers
// are overwritten as soon as their old value is no longer needed, so the
// statement order is significant. The eight results of the first group
// end up in in192, tmp959, tmp960, in198, tmp958, in194, in196, in195;
// the second group's in in200, tmp963, tmp964, in206, tmp962, in202,
// in204, in203.
// NOTE(review): the constants (5.25, 4.25, 1.25, 0.25, 2, 4, 5) match the
// 8-point input transform of Winograd F(6,3) -- presumably that is what
// this implements; confirm against the generator.
__m512 tmp957 = _mm512_add_ps(in193, in197);
__m512 tmp961 = _mm512_add_ps(in201, in205);
__m512 tmp958 = _mm512_sub_ps(in196, in194);
__m512 tmp962 = _mm512_sub_ps(in204, in202);
__m512 tmp959 = _mm512_add_ps(in194, in198);
__m512 tmp963 = _mm512_add_ps(in202, in206);
in192 = _mm512_sub_ps(in192, in198);
in200 = _mm512_sub_ps(in200, in206);
tmp957 = _mm512_fmadd_ps(in195, _mm512_set1_ps(-4.25e+00f), tmp957);
tmp961 = _mm512_fmadd_ps(in203, _mm512_set1_ps(-4.25e+00f), tmp961);
tmp959 = _mm512_fmadd_ps(in196, _mm512_set1_ps(-4.25e+00f), tmp959);
tmp963 = _mm512_fmadd_ps(in204, _mm512_set1_ps(-4.25e+00f), tmp963);
in192 = _mm512_fmadd_ps(tmp958, _mm512_set1_ps(5.25e+00f), in192);
in200 = _mm512_fmadd_ps(tmp962, _mm512_set1_ps(5.25e+00f), in200);
tmp958 = _mm512_fmadd_ps(in194, _mm512_set1_ps(2.5e-01f), in198);
tmp962 = _mm512_fmadd_ps(in202, _mm512_set1_ps(2.5e-01f), in206);
in194 = _mm512_fmadd_ps(in194, _mm512_set1_ps(4e+00f), in198);
in202 = _mm512_fmadd_ps(in202, _mm512_set1_ps(4e+00f), in206);
__m512 tmp960 = _mm512_sub_ps(tmp959, tmp957);
__m512 tmp964 = _mm512_sub_ps(tmp963, tmp961);
tmp959 = _mm512_add_ps(tmp957, tmp959);
tmp963 = _mm512_add_ps(tmp961, tmp963);
tmp957 = _mm512_fmadd_ps(in193, _mm512_set1_ps(2.5e-01f), in197);
tmp961 = _mm512_fmadd_ps(in201, _mm512_set1_ps(2.5e-01f), in205);
tmp958 = _mm512_fmadd_ps(in196, _mm512_set1_ps(-1.25e+00f), tmp958);
tmp962 = _mm512_fmadd_ps(in204, _mm512_set1_ps(-1.25e+00f), tmp962);
in196 = _mm512_fmadd_ps(in196, _mm512_set1_ps(-5e+00f), in194);
in204 = _mm512_fmadd_ps(in204, _mm512_set1_ps(-5e+00f), in202);
tmp957 = _mm512_fmadd_ps(in195, _mm512_set1_ps(-1.25e+00f), tmp957);
tmp961 = _mm512_fmadd_ps(in203, _mm512_set1_ps(-1.25e+00f), tmp961);
in198 = _mm512_fmadd_ps(tmp957, _mm512_set1_ps(2e+00f), tmp958);
in206 = _mm512_fmadd_ps(tmp961, _mm512_set1_ps(2e+00f), tmp962);
tmp958 = _mm512_fnmadd_ps(tmp957, _mm512_set1_ps(2e+00f), tmp958);
tmp962 = _mm512_fnmadd_ps(tmp961, _mm512_set1_ps(2e+00f), tmp962);
tmp957 = _mm512_fmadd_ps(in197, _mm512_set1_ps(2.5e-01f), in193);
tmp961 = _mm512_fmadd_ps(in205, _mm512_set1_ps(2.5e-01f), in201);
in193 = _mm512_sub_ps(in199, in193);
in201 = _mm512_sub_ps(in207, in201);
tmp957 = _mm512_fmadd_ps(in195, _mm512_set1_ps(-1.25e+00f), tmp957);
tmp961 = _mm512_fmadd_ps(in203, _mm512_set1_ps(-1.25e+00f), tmp961);
in195 = _mm512_sub_ps(in195, in197);
in203 = _mm512_sub_ps(in203, in205);
in195 = _mm512_fmadd_ps(in195, _mm512_set1_ps(5.25e+00f), in193);
in203 = _mm512_fmadd_ps(in203, _mm512_set1_ps(5.25e+00f), in201);
in194 = _mm512_fmadd_ps(tmp957, _mm512_set1_ps(2e+00f), in196);
in202 = _mm512_fmadd_ps(tmp961, _mm512_set1_ps(2e+00f), in204);
in196 = _mm512_fnmadd_ps(tmp957, _mm512_set1_ps(2e+00f), in196);
in204 = _mm512_fnmadd_ps(tmp961, _mm512_set1_ps(2e+00f), in204);
// Reshuffle stage, step 1: interleave adjacent result registers pairwise
// within each 128-bit lane.
__m512 tmp973 = _mm512_unpacklo_ps(in192, tmp959);
__m512 tmp974 = _mm512_unpackhi_ps(in192, tmp959);
__m512 tmp975 = _mm512_unpacklo_ps(tmp960, in198);
__m512 tmp976 = _mm512_unpackhi_ps(tmp960, in198);
__m512 tmp977 = _mm512_unpacklo_ps(tmp958, in194);
__m512 tmp978 = _mm512_unpackhi_ps(tmp958, in194);
__m512 tmp979 = _mm512_unpacklo_ps(in196, in195);
__m512 tmp980 = _mm512_unpackhi_ps(in196, in195);
__m512 tmp981 = _mm512_unpacklo_ps(in200, tmp963);
__m512 tmp982 = _mm512_unpackhi_ps(in200, tmp963);
__m512 tmp983 = _mm512_unpacklo_ps(tmp964, in206);
__m512 tmp984 = _mm512_unpackhi_ps(tmp964, in206);
__m512 tmp985 = _mm512_unpacklo_ps(tmp962, in202);
__m512 tmp986 = _mm512_unpackhi_ps(tmp962, in202);
__m512 tmp987 = _mm512_unpacklo_ps(in204, in203);
__m512 tmp988 = _mm512_unpackhi_ps(in204, in203);
// Step 2: shuffle_ps with 68 / 238 picks the low / high element pairs of
// both operands, completing a 4x4 transpose inside every 128-bit lane
// (e.g. each 128-bit lane of tmp989 holds that lane's element 0 from
// in192, tmp959, tmp960, in198, in that order).
__m512 tmp989 = _mm512_shuffle_ps(tmp973, tmp975, 68);
__m512 tmp990 = _mm512_shuffle_ps(tmp973, tmp975, 238);
__m512 tmp991 = _mm512_shuffle_ps(tmp974, tmp976, 68);
__m512 tmp992 = _mm512_shuffle_ps(tmp974, tmp976, 238);
__m512 tmp993 = _mm512_shuffle_ps(tmp977, tmp979, 68);
__m512 tmp994 = _mm512_shuffle_ps(tmp977, tmp979, 238);
__m512 tmp995 = _mm512_shuffle_ps(tmp978, tmp980, 68);
__m512 tmp996 = _mm512_shuffle_ps(tmp978, tmp980, 238);
__m512 tmp997 = _mm512_shuffle_ps(tmp981, tmp983, 68);
__m512 tmp998 = _mm512_shuffle_ps(tmp981, tmp983, 238);
__m512 tmp999 = _mm512_shuffle_ps(tmp982, tmp984, 68);
__m512 tmp1000 = _mm512_shuffle_ps(tmp982, tmp984, 238);
__m512 tmp1001 = _mm512_shuffle_ps(tmp985, tmp987, 68);
__m512 tmp1002 = _mm512_shuffle_ps(tmp985, tmp987, 238);
__m512 tmp1003 = _mm512_shuffle_ps(tmp986, tmp988, 68);
__m512 tmp1004 = _mm512_shuffle_ps(tmp986, tmp988, 238);
// Step 3: regroup whole 128-bit lanes. Immediate 136 takes lanes 0 and 2
// of each operand, 221 takes lanes 1 and 3.
__m512 tmp1005 = _mm512_shuffle_f32x4(tmp989, tmp993, 136);
__m512 tmp1006 = _mm512_shuffle_f32x4(tmp989, tmp993, 221);
__m512 tmp1007 = _mm512_shuffle_f32x4(tmp990, tmp994, 136);
__m512 tmp1008 = _mm512_shuffle_f32x4(tmp990, tmp994, 221);
__m512 tmp1009 = _mm512_shuffle_f32x4(tmp991, tmp995, 136);
__m512 tmp1010 = _mm512_shuffle_f32x4(tmp991, tmp995, 221);
__m512 tmp1011 = _mm512_shuffle_f32x4(tmp992, tmp996, 136);
__m512 tmp1012 = _mm512_shuffle_f32x4(tmp992, tmp996, 221);
__m512 tmp1013 = _mm512_shuffle_f32x4(tmp997, tmp1001, 136);
__m512 tmp1014 = _mm512_shuffle_f32x4(tmp997, tmp1001, 221);
__m512 tmp1015 = _mm512_shuffle_f32x4(tmp998, tmp1002, 136);
__m512 tmp1016 = _mm512_shuffle_f32x4(tmp998, tmp1002, 221);
__m512 tmp1017 = _mm512_shuffle_f32x4(tmp999, tmp1003, 136);
__m512 tmp1018 = _mm512_shuffle_f32x4(tmp999, tmp1003, 221);
__m512 tmp1019 = _mm512_shuffle_f32x4(tmp1000, tmp1004, 136);
__m512 tmp1020 = _mm512_shuffle_f32x4(tmp1000, tmp1004, 221);
// Step 4: a second 128-bit lane regroup that mixes the two register
// groups, written back into the sixteen working variables of pass one.
in192 = _mm512_shuffle_f32x4(tmp1005, tmp1013, 136);
in200 = _mm512_shuffle_f32x4(tmp1005, tmp1013, 221);
tmp959 = _mm512_shuffle_f32x4(tmp1007, tmp1015, 136);
tmp963 = _mm512_shuffle_f32x4(tmp1007, tmp1015, 221);
tmp960 = _mm512_shuffle_f32x4(tmp1009, tmp1017, 136);
tmp964 = _mm512_shuffle_f32x4(tmp1009, tmp1017, 221);
in198 = _mm512_shuffle_f32x4(tmp1011, tmp1019, 136);
in206 = _mm512_shuffle_f32x4(tmp1011, tmp1019, 221);
tmp958 = _mm512_shuffle_f32x4(tmp1006, tmp1014, 136);
tmp962 = _mm512_shuffle_f32x4(tmp1006, tmp1014, 221);
in194 = _mm512_shuffle_f32x4(tmp1008, tmp1016, 136);
in202 = _mm512_shuffle_f32x4(tmp1008, tmp1016, 221);
in196 = _mm512_shuffle_f32x4(tmp1010, tmp1018, 136);
in204 = _mm512_shuffle_f32x4(tmp1010, tmp1018, 221);
in195 = _mm512_shuffle_f32x4(tmp1012, tmp1020, 136);
in203 = _mm512_shuffle_f32x4(tmp1012, tmp1020, 221);
// Second pass. Same operation sequence and constants as the first pass,
// with (in192, tmp959, tmp960, in198, tmp958, in194, in196, in195) taking
// the roles of the eight inputs (and the in200.. group in parallel).
// Results land in in192, tmp967, tmp968, in196, tmp966, tmp960, tmp958,
// in198 (second group: in200, tmp971, tmp972, in204, tmp970, tmp964,
// tmp962, in206).
__m512 tmp965 = _mm512_add_ps(tmp959, in194);
__m512 tmp969 = _mm512_add_ps(tmp963, in202);
__m512 tmp966 = _mm512_sub_ps(tmp958, tmp960);
__m512 tmp970 = _mm512_sub_ps(tmp962, tmp964);
__m512 tmp967 = _mm512_add_ps(tmp960, in196);
__m512 tmp971 = _mm512_add_ps(tmp964, in204);
in192 = _mm512_sub_ps(in192, in196);
in200 = _mm512_sub_ps(in200, in204);
tmp965 = _mm512_fmadd_ps(in198, _mm512_set1_ps(-4.25e+00f), tmp965);
tmp969 = _mm512_fmadd_ps(in206, _mm512_set1_ps(-4.25e+00f), tmp969);
tmp967 = _mm512_fmadd_ps(tmp958, _mm512_set1_ps(-4.25e+00f), tmp967);
tmp971 = _mm512_fmadd_ps(tmp962, _mm512_set1_ps(-4.25e+00f), tmp971);
in192 = _mm512_fmadd_ps(tmp966, _mm512_set1_ps(5.25e+00f), in192);
in200 = _mm512_fmadd_ps(tmp970, _mm512_set1_ps(5.25e+00f), in200);
tmp966 = _mm512_fmadd_ps(tmp960, _mm512_set1_ps(2.5e-01f), in196);
tmp970 = _mm512_fmadd_ps(tmp964, _mm512_set1_ps(2.5e-01f), in204);
tmp960 = _mm512_fmadd_ps(tmp960, _mm512_set1_ps(4e+00f), in196);
tmp964 = _mm512_fmadd_ps(tmp964, _mm512_set1_ps(4e+00f), in204);
__m512 tmp968 = _mm512_sub_ps(tmp967, tmp965);
__m512 tmp972 = _mm512_sub_ps(tmp971, tmp969);
tmp967 = _mm512_add_ps(tmp965, tmp967);
tmp971 = _mm512_add_ps(tmp969, tmp971);
tmp965 = _mm512_fmadd_ps(tmp959, _mm512_set1_ps(2.5e-01f), in194);
tmp969 = _mm512_fmadd_ps(tmp963, _mm512_set1_ps(2.5e-01f), in202);
tmp966 = _mm512_fmadd_ps(tmp958, _mm512_set1_ps(-1.25e+00f), tmp966);
tmp970 = _mm512_fmadd_ps(tmp962, _mm512_set1_ps(-1.25e+00f), tmp970);
tmp958 = _mm512_fmadd_ps(tmp958, _mm512_set1_ps(-5e+00f), tmp960);
tmp962 = _mm512_fmadd_ps(tmp962, _mm512_set1_ps(-5e+00f), tmp964);
tmp965 = _mm512_fmadd_ps(in198, _mm512_set1_ps(-1.25e+00f), tmp965);
tmp969 = _mm512_fmadd_ps(in206, _mm512_set1_ps(-1.25e+00f), tmp969);
in196 = _mm512_fmadd_ps(tmp965, _mm512_set1_ps(2e+00f), tmp966);
in204 = _mm512_fmadd_ps(tmp969, _mm512_set1_ps(2e+00f), tmp970);
tmp966 = _mm512_fnmadd_ps(tmp965, _mm512_set1_ps(2e+00f), tmp966);
tmp970 = _mm512_fnmadd_ps(tmp969, _mm512_set1_ps(2e+00f), tmp970);
tmp965 = _mm512_fmadd_ps(in194, _mm512_set1_ps(2.5e-01f), tmp959);
tmp969 = _mm512_fmadd_ps(in202, _mm512_set1_ps(2.5e-01f), tmp963);
tmp959 = _mm512_sub_ps(in195, tmp959);
tmp963 = _mm512_sub_ps(in203, tmp963);
tmp965 = _mm512_fmadd_ps(in198, _mm512_set1_ps(-1.25e+00f), tmp965);
tmp969 = _mm512_fmadd_ps(in206, _mm512_set1_ps(-1.25e+00f), tmp969);
in198 = _mm512_sub_ps(in198, in194);
in206 = _mm512_sub_ps(in206, in202);
in198 = _mm512_fmadd_ps(in198, _mm512_set1_ps(5.25e+00f), tmp959);
in206 = _mm512_fmadd_ps(in206, _mm512_set1_ps(5.25e+00f), tmp963);
tmp960 = _mm512_fmadd_ps(tmp965, _mm512_set1_ps(2e+00f), tmp958);
tmp964 = _mm512_fmadd_ps(tmp969, _mm512_set1_ps(2e+00f), tmp962);
tmp958 = _mm512_fnmadd_ps(tmp965, _mm512_set1_ps(2e+00f), tmp958);
tmp962 = _mm512_fnmadd_ps(tmp969, _mm512_set1_ps(2e+00f), tmp962);
// Pack for storing: immediate 68 concatenates the low 256 bits of the two
// operands, 238 concatenates their high 256 bits.
__m512 out245 = _mm512_shuffle_f32x4(in192, tmp967, 68);
__m512 out253 = _mm512_shuffle_f32x4(in192, tmp967, 238);
__m512 out246 = _mm512_shuffle_f32x4(tmp968, in196, 68);
__m512 out254 = _mm512_shuffle_f32x4(tmp968, in196, 238);
__m512 out247 = _mm512_shuffle_f32x4(tmp966, tmp960, 68);
__m512 out255 = _mm512_shuffle_f32x4(tmp966, tmp960, 238);
__m512 out248 = _mm512_shuffle_f32x4(tmp958, in198, 68);
__m512 out256 = _mm512_shuffle_f32x4(tmp958, in198, 238);
__m512 out249 = _mm512_shuffle_f32x4(in200, tmp971, 68);
__m512 out257 = _mm512_shuffle_f32x4(in200, tmp971, 238);
__m512 out250 = _mm512_shuffle_f32x4(tmp972, in204, 68);
__m512 out258 = _mm512_shuffle_f32x4(tmp972, in204, 238);
__m512 out251 = _mm512_shuffle_f32x4(tmp970, tmp964, 68);
__m512 out259 = _mm512_shuffle_f32x4(tmp970, tmp964, 238);
__m512 out252 = _mm512_shuffle_f32x4(tmp962, in206, 68);
__m512 out260 = _mm512_shuffle_f32x4(tmp962, in206, 238);
// Store stage. Sixteen unaligned stores in four groups whose base offsets
// are 714240 apart (512, 714752, 1428992, 2143232); within a group the
// four vectors cover offsets base, base+64, base+128, base+192.
// NOTE(review): presumably dfPtr2 is a byte pointer, making each 64-step
// exactly one __m512 -- its declaration is outside this chunk; confirm.
_mm512_storeu_ps(dfPtr2+512+2856960*i8+178560*j4+44544*s5+768*k7, out245);
_mm512_storeu_ps(dfPtr2+640+2856960*i8+178560*j4+44544*s5+768*k7, out253);
_mm512_storeu_ps(dfPtr2+576+2856960*i8+178560*j4+44544*s5+768*k7, out249);
_mm512_storeu_ps(dfPtr2+704+2856960*i8+178560*j4+44544*s5+768*k7, out257);
_mm512_storeu_ps(dfPtr2+714752+2856960*i8+178560*j4+44544*s5+768*k7, out246);
_mm512_storeu_ps(dfPtr2+714880+2856960*i8+178560*j4+44544*s5+768*k7, out254);
_mm512_storeu_ps(dfPtr2+714816+2856960*i8+178560*j4+44544*s5+768*k7, out250);
_mm512_storeu_ps(dfPtr2+714944+2856960*i8+178560*j4+44544*s5+768*k7, out258);
_mm512_storeu_ps(dfPtr2+1428992+2856960*i8+178560*j4+44544*s5+768*k7, out247);
_mm512_storeu_ps(dfPtr2+1429120+2856960*i8+178560*j4+44544*s5+768*k7, out255);
_mm512_storeu_ps(dfPtr2+1429056+2856960*i8+178560*j4+44544*s5+768*k7, out251);
_mm512_storeu_ps(dfPtr2+1429184+2856960*i8+178560*j4+44544*s5+768*k7, out259);
_mm512_storeu_ps(dfPtr2+2143232+2856960*i8+178560*j4+44544*s5+768*k7, out248);
_mm512_storeu_ps(dfPtr2+2143360+2856960*i8+178560*j4+44544*s5+768*k7, out256);
_mm512_storeu_ps(dfPtr2+2143296+2856960*i8+178560*j4+44544*s5+768*k7, out252);
_mm512_storeu_ps(dfPtr2+2143424+2856960*i8+178560*j4+44544*s5+768*k7, out260);
}
} else {
ptrdiff_t ss1 = 3;
ptrdiff_t k8 = 0;
for (; k8 != 58; ++k8) {
__m512 dat193 = _mm512_maskz_loadu_ps(16383, datPtr2+0+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat194 = _mm512_maskz_loadu_ps(2047, datPtr2+48+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512i pm30 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in208 = _mm512_permutexvar_ps(pm30, dat193);
__m512i pm31 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in216 = _mm512_permutexvar_ps(pm31, dat194);
__m512 dat195 = _mm512_maskz_loadu_ps(16383, datPtr2+92+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat196 = _mm512_maskz_loadu_ps(2047, datPtr2+140+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in209 = _mm512_permutexvar_ps(pm30, dat195);
__m512 in217 = _mm512_permutexvar_ps(pm31, dat196);
__m512 dat197 = _mm512_maskz_loadu_ps(16383, datPtr2+184+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat198 = _mm512_maskz_loadu_ps(2047, datPtr2+232+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in210 = _mm512_permutexvar_ps(pm30, dat197);
__m512 in218 = _mm512_permutexvar_ps(pm31, dat198);
__m512 dat199 = _mm512_maskz_loadu_ps(16383, datPtr2+276+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat200 = _mm512_maskz_loadu_ps(2047, datPtr2+324+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in211 = _mm512_permutexvar_ps(pm30, dat199);
__m512 in219 = _mm512_permutexvar_ps(pm31, dat200);
__m512 dat201 = _mm512_maskz_loadu_ps(16383, datPtr2+368+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat202 = _mm512_maskz_loadu_ps(2047, datPtr2+416+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in212 = _mm512_permutexvar_ps(pm30, dat201);
__m512 in220 = _mm512_permutexvar_ps(pm31, dat202);
__m512 dat203 = _mm512_maskz_loadu_ps(16383, datPtr2+460+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat204 = _mm512_maskz_loadu_ps(2047, datPtr2+508+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in213 = _mm512_permutexvar_ps(pm30, dat203);
__m512 in221 = _mm512_permutexvar_ps(pm31, dat204);
__m512 dat205 = _mm512_maskz_loadu_ps(16383, datPtr2+552+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat206 = _mm512_maskz_loadu_ps(2047, datPtr2+600+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in214 = _mm512_permutexvar_ps(pm30, dat205);
__m512 in222 = _mm512_permutexvar_ps(pm31, dat206);
__m512 dat207 = _mm512_maskz_loadu_ps(16383, datPtr2+644+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat208 = _mm512_maskz_loadu_ps(2047, datPtr2+692+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in215 = _mm512_permutexvar_ps(pm30, dat207);
__m512 in223 = _mm512_permutexvar_ps(pm31, dat208);
// First 1-D transform pass. Two independent 8-vector sets are processed with
// interleaved statements: x0..x7 = in208..in215 and, in parallel,
// in216..in223. All operations are lane-wise, so each of the 16 lanes is
// transformed independently. In exact arithmetic the results are:
//   in208   = x0 - x6 + 5.25*(x4 - x2)
//   tmp1023 = (x2 + x6 - 4.25*x4) + (x1 + x5 - 4.25*x3)
//   tmp1024 = (x2 + x6 - 4.25*x4) - (x1 + x5 - 4.25*x3)
//   in214   = (0.25*x2 + x6 - 1.25*x4) + 2*(0.25*x1 + x5 - 1.25*x3)
//   tmp1022 = (0.25*x2 + x6 - 1.25*x4) - 2*(0.25*x1 + x5 - 1.25*x3)
//   in210   = (4*x2 + x6 - 5*x4) + 2*(x1 + 0.25*x5 - 1.25*x3)
//   in212   = (4*x2 + x6 - 5*x4) - 2*(x1 + 0.25*x5 - 1.25*x3)
//   in211   = x7 - x1 + 5.25*(x3 - x5)
// (second set: in216, tmp1027, tmp1028, in222, tmp1026, in218, in220, in219).
// Inputs are overwritten in place, so statement order matters.
// NOTE(review): the coefficients match the 8x8 input-transform matrix of
// Winograd F(6x6, 3x3) convolution -- confirm against the generator.
__m512 tmp1021 = _mm512_add_ps(in209, in213);
__m512 tmp1025 = _mm512_add_ps(in217, in221);
__m512 tmp1022 = _mm512_sub_ps(in212, in210);
__m512 tmp1026 = _mm512_sub_ps(in220, in218);
__m512 tmp1023 = _mm512_add_ps(in210, in214);
__m512 tmp1027 = _mm512_add_ps(in218, in222);
in208 = _mm512_sub_ps(in208, in214);
in216 = _mm512_sub_ps(in216, in222);
tmp1021 = _mm512_fmadd_ps(in211, _mm512_set1_ps(-4.25e+00f), tmp1021);
tmp1025 = _mm512_fmadd_ps(in219, _mm512_set1_ps(-4.25e+00f), tmp1025);
tmp1023 = _mm512_fmadd_ps(in212, _mm512_set1_ps(-4.25e+00f), tmp1023);
tmp1027 = _mm512_fmadd_ps(in220, _mm512_set1_ps(-4.25e+00f), tmp1027);
in208 = _mm512_fmadd_ps(tmp1022, _mm512_set1_ps(5.25e+00f), in208);
in216 = _mm512_fmadd_ps(tmp1026, _mm512_set1_ps(5.25e+00f), in216);
tmp1022 = _mm512_fmadd_ps(in210, _mm512_set1_ps(2.5e-01f), in214);
tmp1026 = _mm512_fmadd_ps(in218, _mm512_set1_ps(2.5e-01f), in222);
in210 = _mm512_fmadd_ps(in210, _mm512_set1_ps(4e+00f), in214);
in218 = _mm512_fmadd_ps(in218, _mm512_set1_ps(4e+00f), in222);
__m512 tmp1024 = _mm512_sub_ps(tmp1023, tmp1021);
__m512 tmp1028 = _mm512_sub_ps(tmp1027, tmp1025);
tmp1023 = _mm512_add_ps(tmp1021, tmp1023);
tmp1027 = _mm512_add_ps(tmp1025, tmp1027);
tmp1021 = _mm512_fmadd_ps(in209, _mm512_set1_ps(2.5e-01f), in213);
tmp1025 = _mm512_fmadd_ps(in217, _mm512_set1_ps(2.5e-01f), in221);
tmp1022 = _mm512_fmadd_ps(in212, _mm512_set1_ps(-1.25e+00f), tmp1022);
tmp1026 = _mm512_fmadd_ps(in220, _mm512_set1_ps(-1.25e+00f), tmp1026);
in212 = _mm512_fmadd_ps(in212, _mm512_set1_ps(-5e+00f), in210);
in220 = _mm512_fmadd_ps(in220, _mm512_set1_ps(-5e+00f), in218);
tmp1021 = _mm512_fmadd_ps(in211, _mm512_set1_ps(-1.25e+00f), tmp1021);
tmp1025 = _mm512_fmadd_ps(in219, _mm512_set1_ps(-1.25e+00f), tmp1025);
in214 = _mm512_fmadd_ps(tmp1021, _mm512_set1_ps(2e+00f), tmp1022);
in222 = _mm512_fmadd_ps(tmp1025, _mm512_set1_ps(2e+00f), tmp1026);
tmp1022 = _mm512_fnmadd_ps(tmp1021, _mm512_set1_ps(2e+00f), tmp1022);
tmp1026 = _mm512_fnmadd_ps(tmp1025, _mm512_set1_ps(2e+00f), tmp1026);
tmp1021 = _mm512_fmadd_ps(in213, _mm512_set1_ps(2.5e-01f), in209);
tmp1025 = _mm512_fmadd_ps(in221, _mm512_set1_ps(2.5e-01f), in217);
in209 = _mm512_sub_ps(in215, in209);
in217 = _mm512_sub_ps(in223, in217);
tmp1021 = _mm512_fmadd_ps(in211, _mm512_set1_ps(-1.25e+00f), tmp1021);
tmp1025 = _mm512_fmadd_ps(in219, _mm512_set1_ps(-1.25e+00f), tmp1025);
in211 = _mm512_sub_ps(in211, in213);
in219 = _mm512_sub_ps(in219, in221);
in211 = _mm512_fmadd_ps(in211, _mm512_set1_ps(5.25e+00f), in209);
in219 = _mm512_fmadd_ps(in219, _mm512_set1_ps(5.25e+00f), in217);
in210 = _mm512_fmadd_ps(tmp1021, _mm512_set1_ps(2e+00f), in212);
in218 = _mm512_fmadd_ps(tmp1025, _mm512_set1_ps(2e+00f), in220);
in212 = _mm512_fnmadd_ps(tmp1021, _mm512_set1_ps(2e+00f), in212);
in220 = _mm512_fnmadd_ps(tmp1025, _mm512_set1_ps(2e+00f), in220);
// Transpose the 16x16 float matrix whose rows are, in order:
//   in208, tmp1023, tmp1024, in214, tmp1022, in210, in212, in211,
//   in216, tmp1027, tmp1028, in222, tmp1026, in218, in220, in219.
// Step 1: interleave adjacent row pairs inside each 128-bit lane.
__m512 tmp1037 = _mm512_unpacklo_ps(in208, tmp1023);
__m512 tmp1038 = _mm512_unpackhi_ps(in208, tmp1023);
__m512 tmp1039 = _mm512_unpacklo_ps(tmp1024, in214);
__m512 tmp1040 = _mm512_unpackhi_ps(tmp1024, in214);
__m512 tmp1041 = _mm512_unpacklo_ps(tmp1022, in210);
__m512 tmp1042 = _mm512_unpackhi_ps(tmp1022, in210);
__m512 tmp1043 = _mm512_unpacklo_ps(in212, in211);
__m512 tmp1044 = _mm512_unpackhi_ps(in212, in211);
__m512 tmp1045 = _mm512_unpacklo_ps(in216, tmp1027);
__m512 tmp1046 = _mm512_unpackhi_ps(in216, tmp1027);
__m512 tmp1047 = _mm512_unpacklo_ps(tmp1028, in222);
__m512 tmp1048 = _mm512_unpackhi_ps(tmp1028, in222);
__m512 tmp1049 = _mm512_unpacklo_ps(tmp1026, in218);
__m512 tmp1050 = _mm512_unpackhi_ps(tmp1026, in218);
__m512 tmp1051 = _mm512_unpacklo_ps(in220, in219);
__m512 tmp1052 = _mm512_unpackhi_ps(in220, in219);
// Step 2: immediate 68 keeps the low two floats of each operand's lane and 238
// the high two, so every 128-bit lane now holds one column of four rows.
__m512 tmp1053 = _mm512_shuffle_ps(tmp1037, tmp1039, 68);
__m512 tmp1054 = _mm512_shuffle_ps(tmp1037, tmp1039, 238);
__m512 tmp1055 = _mm512_shuffle_ps(tmp1038, tmp1040, 68);
__m512 tmp1056 = _mm512_shuffle_ps(tmp1038, tmp1040, 238);
__m512 tmp1057 = _mm512_shuffle_ps(tmp1041, tmp1043, 68);
__m512 tmp1058 = _mm512_shuffle_ps(tmp1041, tmp1043, 238);
__m512 tmp1059 = _mm512_shuffle_ps(tmp1042, tmp1044, 68);
__m512 tmp1060 = _mm512_shuffle_ps(tmp1042, tmp1044, 238);
__m512 tmp1061 = _mm512_shuffle_ps(tmp1045, tmp1047, 68);
__m512 tmp1062 = _mm512_shuffle_ps(tmp1045, tmp1047, 238);
__m512 tmp1063 = _mm512_shuffle_ps(tmp1046, tmp1048, 68);
__m512 tmp1064 = _mm512_shuffle_ps(tmp1046, tmp1048, 238);
__m512 tmp1065 = _mm512_shuffle_ps(tmp1049, tmp1051, 68);
__m512 tmp1066 = _mm512_shuffle_ps(tmp1049, tmp1051, 238);
__m512 tmp1067 = _mm512_shuffle_ps(tmp1050, tmp1052, 68);
__m512 tmp1068 = _mm512_shuffle_ps(tmp1050, tmp1052, 238);
// Step 3: immediate 136 gathers 128-bit lanes 0 and 2 of each operand, 221
// gathers lanes 1 and 3; this merges rows 0-3 with rows 4-7 (and 8-11 with 12-15).
__m512 tmp1069 = _mm512_shuffle_f32x4(tmp1053, tmp1057, 136);
__m512 tmp1070 = _mm512_shuffle_f32x4(tmp1053, tmp1057, 221);
__m512 tmp1071 = _mm512_shuffle_f32x4(tmp1054, tmp1058, 136);
__m512 tmp1072 = _mm512_shuffle_f32x4(tmp1054, tmp1058, 221);
__m512 tmp1073 = _mm512_shuffle_f32x4(tmp1055, tmp1059, 136);
__m512 tmp1074 = _mm512_shuffle_f32x4(tmp1055, tmp1059, 221);
__m512 tmp1075 = _mm512_shuffle_f32x4(tmp1056, tmp1060, 136);
__m512 tmp1076 = _mm512_shuffle_f32x4(tmp1056, tmp1060, 221);
__m512 tmp1077 = _mm512_shuffle_f32x4(tmp1061, tmp1065, 136);
__m512 tmp1078 = _mm512_shuffle_f32x4(tmp1061, tmp1065, 221);
__m512 tmp1079 = _mm512_shuffle_f32x4(tmp1062, tmp1066, 136);
__m512 tmp1080 = _mm512_shuffle_f32x4(tmp1062, tmp1066, 221);
__m512 tmp1081 = _mm512_shuffle_f32x4(tmp1063, tmp1067, 136);
__m512 tmp1082 = _mm512_shuffle_f32x4(tmp1063, tmp1067, 221);
__m512 tmp1083 = _mm512_shuffle_f32x4(tmp1064, tmp1068, 136);
__m512 tmp1084 = _mm512_shuffle_f32x4(tmp1064, tmp1068, 221);
// Step 4: merge the rows 0-7 half with the rows 8-15 half. Columns 0..7 land in
// in208, tmp1023, tmp1024, in214, tmp1022, in210, in212, in211 and columns
// 8..15 in in216, tmp1027, tmp1028, in222, tmp1026, in218, in220, in219.
in208 = _mm512_shuffle_f32x4(tmp1069, tmp1077, 136);
in216 = _mm512_shuffle_f32x4(tmp1069, tmp1077, 221);
tmp1023 = _mm512_shuffle_f32x4(tmp1071, tmp1079, 136);
tmp1027 = _mm512_shuffle_f32x4(tmp1071, tmp1079, 221);
tmp1024 = _mm512_shuffle_f32x4(tmp1073, tmp1081, 136);
tmp1028 = _mm512_shuffle_f32x4(tmp1073, tmp1081, 221);
in214 = _mm512_shuffle_f32x4(tmp1075, tmp1083, 136);
in222 = _mm512_shuffle_f32x4(tmp1075, tmp1083, 221);
tmp1022 = _mm512_shuffle_f32x4(tmp1070, tmp1078, 136);
tmp1026 = _mm512_shuffle_f32x4(tmp1070, tmp1078, 221);
in210 = _mm512_shuffle_f32x4(tmp1072, tmp1080, 136);
in218 = _mm512_shuffle_f32x4(tmp1072, tmp1080, 221);
in212 = _mm512_shuffle_f32x4(tmp1074, tmp1082, 136);
in220 = _mm512_shuffle_f32x4(tmp1074, tmp1082, 221);
in211 = _mm512_shuffle_f32x4(tmp1076, tmp1084, 136);
in219 = _mm512_shuffle_f32x4(tmp1076, tmp1084, 221);
// Second 1-D transform pass, applied to the transposed data. Two sets are
// processed with interleaved statements: x0..x7 = in208, tmp1023, tmp1024,
// in214, tmp1022, in210, in212, in211 and, in parallel, in216, tmp1027,
// tmp1028, in222, tmp1026, in218, in220, in219. In exact arithmetic:
//   in208   = x0 - x6 + 5.25*(x4 - x2)
//   tmp1031 = (x2 + x6 - 4.25*x4) + (x1 + x5 - 4.25*x3)
//   tmp1032 = (x2 + x6 - 4.25*x4) - (x1 + x5 - 4.25*x3)
//   in212   = (0.25*x2 + x6 - 1.25*x4) + 2*(0.25*x1 + x5 - 1.25*x3)
//   tmp1030 = (0.25*x2 + x6 - 1.25*x4) - 2*(0.25*x1 + x5 - 1.25*x3)
//   tmp1024 = (4*x2 + x6 - 5*x4) + 2*(x1 + 0.25*x5 - 1.25*x3)
//   tmp1022 = (4*x2 + x6 - 5*x4) - 2*(x1 + 0.25*x5 - 1.25*x3)
//   in214   = x7 - x1 + 5.25*(x3 - x5)
// (second set: in216, tmp1035, tmp1036, in220, tmp1034, tmp1028, tmp1026, in222).
// Inputs are overwritten in place, so statement order matters.
__m512 tmp1029 = _mm512_add_ps(tmp1023, in210);
__m512 tmp1033 = _mm512_add_ps(tmp1027, in218);
__m512 tmp1030 = _mm512_sub_ps(tmp1022, tmp1024);
__m512 tmp1034 = _mm512_sub_ps(tmp1026, tmp1028);
__m512 tmp1031 = _mm512_add_ps(tmp1024, in212);
__m512 tmp1035 = _mm512_add_ps(tmp1028, in220);
in208 = _mm512_sub_ps(in208, in212);
in216 = _mm512_sub_ps(in216, in220);
tmp1029 = _mm512_fmadd_ps(in214, _mm512_set1_ps(-4.25e+00f), tmp1029);
tmp1033 = _mm512_fmadd_ps(in222, _mm512_set1_ps(-4.25e+00f), tmp1033);
tmp1031 = _mm512_fmadd_ps(tmp1022, _mm512_set1_ps(-4.25e+00f), tmp1031);
tmp1035 = _mm512_fmadd_ps(tmp1026, _mm512_set1_ps(-4.25e+00f), tmp1035);
in208 = _mm512_fmadd_ps(tmp1030, _mm512_set1_ps(5.25e+00f), in208);
in216 = _mm512_fmadd_ps(tmp1034, _mm512_set1_ps(5.25e+00f), in216);
tmp1030 = _mm512_fmadd_ps(tmp1024, _mm512_set1_ps(2.5e-01f), in212);
tmp1034 = _mm512_fmadd_ps(tmp1028, _mm512_set1_ps(2.5e-01f), in220);
tmp1024 = _mm512_fmadd_ps(tmp1024, _mm512_set1_ps(4e+00f), in212);
tmp1028 = _mm512_fmadd_ps(tmp1028, _mm512_set1_ps(4e+00f), in220);
__m512 tmp1032 = _mm512_sub_ps(tmp1031, tmp1029);
__m512 tmp1036 = _mm512_sub_ps(tmp1035, tmp1033);
tmp1031 = _mm512_add_ps(tmp1029, tmp1031);
tmp1035 = _mm512_add_ps(tmp1033, tmp1035);
tmp1029 = _mm512_fmadd_ps(tmp1023, _mm512_set1_ps(2.5e-01f), in210);
tmp1033 = _mm512_fmadd_ps(tmp1027, _mm512_set1_ps(2.5e-01f), in218);
tmp1030 = _mm512_fmadd_ps(tmp1022, _mm512_set1_ps(-1.25e+00f), tmp1030);
tmp1034 = _mm512_fmadd_ps(tmp1026, _mm512_set1_ps(-1.25e+00f), tmp1034);
tmp1022 = _mm512_fmadd_ps(tmp1022, _mm512_set1_ps(-5e+00f), tmp1024);
tmp1026 = _mm512_fmadd_ps(tmp1026, _mm512_set1_ps(-5e+00f), tmp1028);
tmp1029 = _mm512_fmadd_ps(in214, _mm512_set1_ps(-1.25e+00f), tmp1029);
tmp1033 = _mm512_fmadd_ps(in222, _mm512_set1_ps(-1.25e+00f), tmp1033);
in212 = _mm512_fmadd_ps(tmp1029, _mm512_set1_ps(2e+00f), tmp1030);
in220 = _mm512_fmadd_ps(tmp1033, _mm512_set1_ps(2e+00f), tmp1034);
tmp1030 = _mm512_fnmadd_ps(tmp1029, _mm512_set1_ps(2e+00f), tmp1030);
tmp1034 = _mm512_fnmadd_ps(tmp1033, _mm512_set1_ps(2e+00f), tmp1034);
tmp1029 = _mm512_fmadd_ps(in210, _mm512_set1_ps(2.5e-01f), tmp1023);
tmp1033 = _mm512_fmadd_ps(in218, _mm512_set1_ps(2.5e-01f), tmp1027);
tmp1023 = _mm512_sub_ps(in211, tmp1023);
tmp1027 = _mm512_sub_ps(in219, tmp1027);
tmp1029 = _mm512_fmadd_ps(in214, _mm512_set1_ps(-1.25e+00f), tmp1029);
tmp1033 = _mm512_fmadd_ps(in222, _mm512_set1_ps(-1.25e+00f), tmp1033);
in214 = _mm512_sub_ps(in214, in210);
in222 = _mm512_sub_ps(in222, in218);
in214 = _mm512_fmadd_ps(in214, _mm512_set1_ps(5.25e+00f), tmp1023);
in222 = _mm512_fmadd_ps(in222, _mm512_set1_ps(5.25e+00f), tmp1027);
tmp1024 = _mm512_fmadd_ps(tmp1029, _mm512_set1_ps(2e+00f), tmp1022);
tmp1028 = _mm512_fmadd_ps(tmp1033, _mm512_set1_ps(2e+00f), tmp1026);
tmp1022 = _mm512_fnmadd_ps(tmp1029, _mm512_set1_ps(2e+00f), tmp1022);
tmp1026 = _mm512_fnmadd_ps(tmp1033, _mm512_set1_ps(2e+00f), tmp1026);
// Pack the transformed vectors in pairs: immediate 68 concatenates the low
// 256 bits of both operands, 238 concatenates their high 256 bits.
__m512 out261 = _mm512_shuffle_f32x4(in208, tmp1031, 68);
__m512 out269 = _mm512_shuffle_f32x4(in208, tmp1031, 238);
__m512 out262 = _mm512_shuffle_f32x4(tmp1032, in212, 68);
__m512 out270 = _mm512_shuffle_f32x4(tmp1032, in212, 238);
__m512 out263 = _mm512_shuffle_f32x4(tmp1030, tmp1024, 68);
__m512 out271 = _mm512_shuffle_f32x4(tmp1030, tmp1024, 238);
__m512 out264 = _mm512_shuffle_f32x4(tmp1022, in214, 68);
__m512 out272 = _mm512_shuffle_f32x4(tmp1022, in214, 238);
__m512 out265 = _mm512_shuffle_f32x4(in216, tmp1035, 68);
__m512 out273 = _mm512_shuffle_f32x4(in216, tmp1035, 238);
__m512 out266 = _mm512_shuffle_f32x4(tmp1036, in220, 68);
__m512 out274 = _mm512_shuffle_f32x4(tmp1036, in220, 238);
__m512 out267 = _mm512_shuffle_f32x4(tmp1034, tmp1028, 68);
__m512 out275 = _mm512_shuffle_f32x4(tmp1034, tmp1028, 238);
__m512 out268 = _mm512_shuffle_f32x4(tmp1026, in222, 68);
__m512 out276 = _mm512_shuffle_f32x4(tmp1026, in222, 238);
// Store the 16 packed vectors (full, unmasked 16-float stores) into four
// regions of dfPtr2 that start 714240 apart, at offsets 0, 64, 128 and 192
// within each region.
// NOTE(review): the 64-unit spacing between 16-float stores suggests dfPtr2 is
// a byte pointer -- confirm at its declaration.
_mm512_storeu_ps(dfPtr2+0+2856960*i8+178560*j4+44544*ss1+768*k8, out261);
_mm512_storeu_ps(dfPtr2+128+2856960*i8+178560*j4+44544*ss1+768*k8, out269);
_mm512_storeu_ps(dfPtr2+64+2856960*i8+178560*j4+44544*ss1+768*k8, out265);
_mm512_storeu_ps(dfPtr2+192+2856960*i8+178560*j4+44544*ss1+768*k8, out273);
_mm512_storeu_ps(dfPtr2+714240+2856960*i8+178560*j4+44544*ss1+768*k8, out262);
_mm512_storeu_ps(dfPtr2+714368+2856960*i8+178560*j4+44544*ss1+768*k8, out270);
_mm512_storeu_ps(dfPtr2+714304+2856960*i8+178560*j4+44544*ss1+768*k8, out266);
_mm512_storeu_ps(dfPtr2+714432+2856960*i8+178560*j4+44544*ss1+768*k8, out274);
_mm512_storeu_ps(dfPtr2+1428480+2856960*i8+178560*j4+44544*ss1+768*k8, out263);
_mm512_storeu_ps(dfPtr2+1428608+2856960*i8+178560*j4+44544*ss1+768*k8, out271);
_mm512_storeu_ps(dfPtr2+1428544+2856960*i8+178560*j4+44544*ss1+768*k8, out267);
_mm512_storeu_ps(dfPtr2+1428672+2856960*i8+178560*j4+44544*ss1+768*k8, out275);
_mm512_storeu_ps(dfPtr2+2142720+2856960*i8+178560*j4+44544*ss1+768*k8, out264);
_mm512_storeu_ps(dfPtr2+2142848+2856960*i8+178560*j4+44544*ss1+768*k8, out272);
_mm512_storeu_ps(dfPtr2+2142784+2856960*i8+178560*j4+44544*ss1+768*k8, out268);
_mm512_storeu_ps(dfPtr2+2142912+2856960*i8+178560*j4+44544*ss1+768*k8, out276);
// Load two sets of eight vectors from datPtr2. The first set (in224..in231)
// starts at offset 552 and the second (in232..in239) at offset 3496; successive
// vectors within a set are 92 apart. Mask 16383 (0x3FFF) loads 14 floats and
// zeroes the top two lanes.
// pm32 copies elements 0..7 to lanes 0..7 and elements 6..13 to lanes 8..15,
// so each vector holds two 8-float windows that overlap by two elements.
// NOTE(review): 92 = 23 floats * 4 bytes and 3496 = 38 * 92, matching the
// Width=23 / Height=38 input in the file header; presumably datPtr2 is a byte
// pointer, the step of 92 is one image row and 3496 is one channel -- confirm.
__m512 dat209 = _mm512_maskz_loadu_ps(16383, datPtr2+552+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat210 = _mm512_maskz_loadu_ps(16383, datPtr2+3496+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512i pm32 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in224 = _mm512_permutexvar_ps(pm32, dat209);
__m512 in232 = _mm512_permutexvar_ps(pm32, dat210);
__m512 dat211 = _mm512_maskz_loadu_ps(16383, datPtr2+644+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat212 = _mm512_maskz_loadu_ps(16383, datPtr2+3588+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in225 = _mm512_permutexvar_ps(pm32, dat211);
__m512 in233 = _mm512_permutexvar_ps(pm32, dat212);
__m512 dat213 = _mm512_maskz_loadu_ps(16383, datPtr2+736+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat214 = _mm512_maskz_loadu_ps(16383, datPtr2+3680+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in226 = _mm512_permutexvar_ps(pm32, dat213);
__m512 in234 = _mm512_permutexvar_ps(pm32, dat214);
__m512 dat215 = _mm512_maskz_loadu_ps(16383, datPtr2+828+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat216 = _mm512_maskz_loadu_ps(16383, datPtr2+3772+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in227 = _mm512_permutexvar_ps(pm32, dat215);
__m512 in235 = _mm512_permutexvar_ps(pm32, dat216);
__m512 dat217 = _mm512_maskz_loadu_ps(16383, datPtr2+920+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat218 = _mm512_maskz_loadu_ps(16383, datPtr2+3864+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in228 = _mm512_permutexvar_ps(pm32, dat217);
__m512 in236 = _mm512_permutexvar_ps(pm32, dat218);
__m512 dat219 = _mm512_maskz_loadu_ps(16383, datPtr2+1012+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat220 = _mm512_maskz_loadu_ps(16383, datPtr2+3956+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in229 = _mm512_permutexvar_ps(pm32, dat219);
__m512 in237 = _mm512_permutexvar_ps(pm32, dat220);
__m512 dat221 = _mm512_maskz_loadu_ps(16383, datPtr2+1104+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat222 = _mm512_maskz_loadu_ps(16383, datPtr2+4048+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in230 = _mm512_permutexvar_ps(pm32, dat221);
__m512 in238 = _mm512_permutexvar_ps(pm32, dat222);
__m512 dat223 = _mm512_maskz_loadu_ps(16383, datPtr2+1196+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat224 = _mm512_maskz_loadu_ps(16383, datPtr2+4140+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in231 = _mm512_permutexvar_ps(pm32, dat223);
__m512 in239 = _mm512_permutexvar_ps(pm32, dat224);
// First 1-D transform pass. Two independent 8-vector sets are processed with
// interleaved statements: x0..x7 = in224..in231 and, in parallel,
// in232..in239. All operations are lane-wise, so each of the 16 lanes is
// transformed independently. In exact arithmetic the results are:
//   in224   = x0 - x6 + 5.25*(x4 - x2)
//   tmp1087 = (x2 + x6 - 4.25*x4) + (x1 + x5 - 4.25*x3)
//   tmp1088 = (x2 + x6 - 4.25*x4) - (x1 + x5 - 4.25*x3)
//   in230   = (0.25*x2 + x6 - 1.25*x4) + 2*(0.25*x1 + x5 - 1.25*x3)
//   tmp1086 = (0.25*x2 + x6 - 1.25*x4) - 2*(0.25*x1 + x5 - 1.25*x3)
//   in226   = (4*x2 + x6 - 5*x4) + 2*(x1 + 0.25*x5 - 1.25*x3)
//   in228   = (4*x2 + x6 - 5*x4) - 2*(x1 + 0.25*x5 - 1.25*x3)
//   in227   = x7 - x1 + 5.25*(x3 - x5)
// (second set: in232, tmp1091, tmp1092, in238, tmp1090, in234, in236, in235).
// Inputs are overwritten in place, so statement order matters.
// NOTE(review): the coefficients match the 8x8 input-transform matrix of
// Winograd F(6x6, 3x3) convolution -- confirm against the generator.
__m512 tmp1085 = _mm512_add_ps(in225, in229);
__m512 tmp1089 = _mm512_add_ps(in233, in237);
__m512 tmp1086 = _mm512_sub_ps(in228, in226);
__m512 tmp1090 = _mm512_sub_ps(in236, in234);
__m512 tmp1087 = _mm512_add_ps(in226, in230);
__m512 tmp1091 = _mm512_add_ps(in234, in238);
in224 = _mm512_sub_ps(in224, in230);
in232 = _mm512_sub_ps(in232, in238);
tmp1085 = _mm512_fmadd_ps(in227, _mm512_set1_ps(-4.25e+00f), tmp1085);
tmp1089 = _mm512_fmadd_ps(in235, _mm512_set1_ps(-4.25e+00f), tmp1089);
tmp1087 = _mm512_fmadd_ps(in228, _mm512_set1_ps(-4.25e+00f), tmp1087);
tmp1091 = _mm512_fmadd_ps(in236, _mm512_set1_ps(-4.25e+00f), tmp1091);
in224 = _mm512_fmadd_ps(tmp1086, _mm512_set1_ps(5.25e+00f), in224);
in232 = _mm512_fmadd_ps(tmp1090, _mm512_set1_ps(5.25e+00f), in232);
tmp1086 = _mm512_fmadd_ps(in226, _mm512_set1_ps(2.5e-01f), in230);
tmp1090 = _mm512_fmadd_ps(in234, _mm512_set1_ps(2.5e-01f), in238);
in226 = _mm512_fmadd_ps(in226, _mm512_set1_ps(4e+00f), in230);
in234 = _mm512_fmadd_ps(in234, _mm512_set1_ps(4e+00f), in238);
__m512 tmp1088 = _mm512_sub_ps(tmp1087, tmp1085);
__m512 tmp1092 = _mm512_sub_ps(tmp1091, tmp1089);
tmp1087 = _mm512_add_ps(tmp1085, tmp1087);
tmp1091 = _mm512_add_ps(tmp1089, tmp1091);
tmp1085 = _mm512_fmadd_ps(in225, _mm512_set1_ps(2.5e-01f), in229);
tmp1089 = _mm512_fmadd_ps(in233, _mm512_set1_ps(2.5e-01f), in237);
tmp1086 = _mm512_fmadd_ps(in228, _mm512_set1_ps(-1.25e+00f), tmp1086);
tmp1090 = _mm512_fmadd_ps(in236, _mm512_set1_ps(-1.25e+00f), tmp1090);
in228 = _mm512_fmadd_ps(in228, _mm512_set1_ps(-5e+00f), in226);
in236 = _mm512_fmadd_ps(in236, _mm512_set1_ps(-5e+00f), in234);
tmp1085 = _mm512_fmadd_ps(in227, _mm512_set1_ps(-1.25e+00f), tmp1085);
tmp1089 = _mm512_fmadd_ps(in235, _mm512_set1_ps(-1.25e+00f), tmp1089);
in230 = _mm512_fmadd_ps(tmp1085, _mm512_set1_ps(2e+00f), tmp1086);
in238 = _mm512_fmadd_ps(tmp1089, _mm512_set1_ps(2e+00f), tmp1090);
tmp1086 = _mm512_fnmadd_ps(tmp1085, _mm512_set1_ps(2e+00f), tmp1086);
tmp1090 = _mm512_fnmadd_ps(tmp1089, _mm512_set1_ps(2e+00f), tmp1090);
tmp1085 = _mm512_fmadd_ps(in229, _mm512_set1_ps(2.5e-01f), in225);
tmp1089 = _mm512_fmadd_ps(in237, _mm512_set1_ps(2.5e-01f), in233);
in225 = _mm512_sub_ps(in231, in225);
in233 = _mm512_sub_ps(in239, in233);
tmp1085 = _mm512_fmadd_ps(in227, _mm512_set1_ps(-1.25e+00f), tmp1085);
tmp1089 = _mm512_fmadd_ps(in235, _mm512_set1_ps(-1.25e+00f), tmp1089);
in227 = _mm512_sub_ps(in227, in229);
in235 = _mm512_sub_ps(in235, in237);
in227 = _mm512_fmadd_ps(in227, _mm512_set1_ps(5.25e+00f), in225);
in235 = _mm512_fmadd_ps(in235, _mm512_set1_ps(5.25e+00f), in233);
in226 = _mm512_fmadd_ps(tmp1085, _mm512_set1_ps(2e+00f), in228);
in234 = _mm512_fmadd_ps(tmp1089, _mm512_set1_ps(2e+00f), in236);
in228 = _mm512_fnmadd_ps(tmp1085, _mm512_set1_ps(2e+00f), in228);
in236 = _mm512_fnmadd_ps(tmp1089, _mm512_set1_ps(2e+00f), in236);
// Transpose the 16x16 float matrix whose rows are, in order:
//   in224, tmp1087, tmp1088, in230, tmp1086, in226, in228, in227,
//   in232, tmp1091, tmp1092, in238, tmp1090, in234, in236, in235.
// Step 1: interleave adjacent row pairs inside each 128-bit lane.
__m512 tmp1101 = _mm512_unpacklo_ps(in224, tmp1087);
__m512 tmp1102 = _mm512_unpackhi_ps(in224, tmp1087);
__m512 tmp1103 = _mm512_unpacklo_ps(tmp1088, in230);
__m512 tmp1104 = _mm512_unpackhi_ps(tmp1088, in230);
__m512 tmp1105 = _mm512_unpacklo_ps(tmp1086, in226);
__m512 tmp1106 = _mm512_unpackhi_ps(tmp1086, in226);
__m512 tmp1107 = _mm512_unpacklo_ps(in228, in227);
__m512 tmp1108 = _mm512_unpackhi_ps(in228, in227);
__m512 tmp1109 = _mm512_unpacklo_ps(in232, tmp1091);
__m512 tmp1110 = _mm512_unpackhi_ps(in232, tmp1091);
__m512 tmp1111 = _mm512_unpacklo_ps(tmp1092, in238);
__m512 tmp1112 = _mm512_unpackhi_ps(tmp1092, in238);
__m512 tmp1113 = _mm512_unpacklo_ps(tmp1090, in234);
__m512 tmp1114 = _mm512_unpackhi_ps(tmp1090, in234);
__m512 tmp1115 = _mm512_unpacklo_ps(in236, in235);
__m512 tmp1116 = _mm512_unpackhi_ps(in236, in235);
// Step 2: immediate 68 keeps the low two floats of each operand's lane and 238
// the high two, so every 128-bit lane now holds one column of four rows.
__m512 tmp1117 = _mm512_shuffle_ps(tmp1101, tmp1103, 68);
__m512 tmp1118 = _mm512_shuffle_ps(tmp1101, tmp1103, 238);
__m512 tmp1119 = _mm512_shuffle_ps(tmp1102, tmp1104, 68);
__m512 tmp1120 = _mm512_shuffle_ps(tmp1102, tmp1104, 238);
__m512 tmp1121 = _mm512_shuffle_ps(tmp1105, tmp1107, 68);
__m512 tmp1122 = _mm512_shuffle_ps(tmp1105, tmp1107, 238);
__m512 tmp1123 = _mm512_shuffle_ps(tmp1106, tmp1108, 68);
__m512 tmp1124 = _mm512_shuffle_ps(tmp1106, tmp1108, 238);
__m512 tmp1125 = _mm512_shuffle_ps(tmp1109, tmp1111, 68);
__m512 tmp1126 = _mm512_shuffle_ps(tmp1109, tmp1111, 238);
__m512 tmp1127 = _mm512_shuffle_ps(tmp1110, tmp1112, 68);
__m512 tmp1128 = _mm512_shuffle_ps(tmp1110, tmp1112, 238);
__m512 tmp1129 = _mm512_shuffle_ps(tmp1113, tmp1115, 68);
__m512 tmp1130 = _mm512_shuffle_ps(tmp1113, tmp1115, 238);
__m512 tmp1131 = _mm512_shuffle_ps(tmp1114, tmp1116, 68);
__m512 tmp1132 = _mm512_shuffle_ps(tmp1114, tmp1116, 238);
// Step 3: immediate 136 gathers 128-bit lanes 0 and 2 of each operand, 221
// gathers lanes 1 and 3; this merges rows 0-3 with rows 4-7 (and 8-11 with 12-15).
__m512 tmp1133 = _mm512_shuffle_f32x4(tmp1117, tmp1121, 136);
__m512 tmp1134 = _mm512_shuffle_f32x4(tmp1117, tmp1121, 221);
__m512 tmp1135 = _mm512_shuffle_f32x4(tmp1118, tmp1122, 136);
__m512 tmp1136 = _mm512_shuffle_f32x4(tmp1118, tmp1122, 221);
__m512 tmp1137 = _mm512_shuffle_f32x4(tmp1119, tmp1123, 136);
__m512 tmp1138 = _mm512_shuffle_f32x4(tmp1119, tmp1123, 221);
__m512 tmp1139 = _mm512_shuffle_f32x4(tmp1120, tmp1124, 136);
__m512 tmp1140 = _mm512_shuffle_f32x4(tmp1120, tmp1124, 221);
__m512 tmp1141 = _mm512_shuffle_f32x4(tmp1125, tmp1129, 136);
__m512 tmp1142 = _mm512_shuffle_f32x4(tmp1125, tmp1129, 221);
__m512 tmp1143 = _mm512_shuffle_f32x4(tmp1126, tmp1130, 136);
__m512 tmp1144 = _mm512_shuffle_f32x4(tmp1126, tmp1130, 221);
__m512 tmp1145 = _mm512_shuffle_f32x4(tmp1127, tmp1131, 136);
__m512 tmp1146 = _mm512_shuffle_f32x4(tmp1127, tmp1131, 221);
__m512 tmp1147 = _mm512_shuffle_f32x4(tmp1128, tmp1132, 136);
__m512 tmp1148 = _mm512_shuffle_f32x4(tmp1128, tmp1132, 221);
// Step 4: merge the rows 0-7 half with the rows 8-15 half. Columns 0..7 land in
// in224, tmp1087, tmp1088, in230, tmp1086, in226, in228, in227 and columns
// 8..15 in in232, tmp1091, tmp1092, in238, tmp1090, in234, in236, in235.
in224 = _mm512_shuffle_f32x4(tmp1133, tmp1141, 136);
in232 = _mm512_shuffle_f32x4(tmp1133, tmp1141, 221);
tmp1087 = _mm512_shuffle_f32x4(tmp1135, tmp1143, 136);
tmp1091 = _mm512_shuffle_f32x4(tmp1135, tmp1143, 221);
tmp1088 = _mm512_shuffle_f32x4(tmp1137, tmp1145, 136);
tmp1092 = _mm512_shuffle_f32x4(tmp1137, tmp1145, 221);
in230 = _mm512_shuffle_f32x4(tmp1139, tmp1147, 136);
in238 = _mm512_shuffle_f32x4(tmp1139, tmp1147, 221);
tmp1086 = _mm512_shuffle_f32x4(tmp1134, tmp1142, 136);
tmp1090 = _mm512_shuffle_f32x4(tmp1134, tmp1142, 221);
in226 = _mm512_shuffle_f32x4(tmp1136, tmp1144, 136);
in234 = _mm512_shuffle_f32x4(tmp1136, tmp1144, 221);
in228 = _mm512_shuffle_f32x4(tmp1138, tmp1146, 136);
in236 = _mm512_shuffle_f32x4(tmp1138, tmp1146, 221);
in227 = _mm512_shuffle_f32x4(tmp1140, tmp1148, 136);
in235 = _mm512_shuffle_f32x4(tmp1140, tmp1148, 221);
// Second 1-D transform pass, applied to the transposed data. Two sets are
// processed with interleaved statements: x0..x7 = in224, tmp1087, tmp1088,
// in230, tmp1086, in226, in228, in227 and, in parallel, in232, tmp1091,
// tmp1092, in238, tmp1090, in234, in236, in235. In exact arithmetic:
//   in224   = x0 - x6 + 5.25*(x4 - x2)
//   tmp1095 = (x2 + x6 - 4.25*x4) + (x1 + x5 - 4.25*x3)
//   tmp1096 = (x2 + x6 - 4.25*x4) - (x1 + x5 - 4.25*x3)
//   in228   = (0.25*x2 + x6 - 1.25*x4) + 2*(0.25*x1 + x5 - 1.25*x3)
//   tmp1094 = (0.25*x2 + x6 - 1.25*x4) - 2*(0.25*x1 + x5 - 1.25*x3)
//   tmp1088 = (4*x2 + x6 - 5*x4) + 2*(x1 + 0.25*x5 - 1.25*x3)
//   tmp1086 = (4*x2 + x6 - 5*x4) - 2*(x1 + 0.25*x5 - 1.25*x3)
//   in230   = x7 - x1 + 5.25*(x3 - x5)
// (second set: in232, tmp1099, tmp1100, in236, tmp1098, tmp1092, tmp1090, in238).
// Inputs are overwritten in place, so statement order matters.
__m512 tmp1093 = _mm512_add_ps(tmp1087, in226);
__m512 tmp1097 = _mm512_add_ps(tmp1091, in234);
__m512 tmp1094 = _mm512_sub_ps(tmp1086, tmp1088);
__m512 tmp1098 = _mm512_sub_ps(tmp1090, tmp1092);
__m512 tmp1095 = _mm512_add_ps(tmp1088, in228);
__m512 tmp1099 = _mm512_add_ps(tmp1092, in236);
in224 = _mm512_sub_ps(in224, in228);
in232 = _mm512_sub_ps(in232, in236);
tmp1093 = _mm512_fmadd_ps(in230, _mm512_set1_ps(-4.25e+00f), tmp1093);
tmp1097 = _mm512_fmadd_ps(in238, _mm512_set1_ps(-4.25e+00f), tmp1097);
tmp1095 = _mm512_fmadd_ps(tmp1086, _mm512_set1_ps(-4.25e+00f), tmp1095);
tmp1099 = _mm512_fmadd_ps(tmp1090, _mm512_set1_ps(-4.25e+00f), tmp1099);
in224 = _mm512_fmadd_ps(tmp1094, _mm512_set1_ps(5.25e+00f), in224);
in232 = _mm512_fmadd_ps(tmp1098, _mm512_set1_ps(5.25e+00f), in232);
tmp1094 = _mm512_fmadd_ps(tmp1088, _mm512_set1_ps(2.5e-01f), in228);
tmp1098 = _mm512_fmadd_ps(tmp1092, _mm512_set1_ps(2.5e-01f), in236);
tmp1088 = _mm512_fmadd_ps(tmp1088, _mm512_set1_ps(4e+00f), in228);
tmp1092 = _mm512_fmadd_ps(tmp1092, _mm512_set1_ps(4e+00f), in236);
__m512 tmp1096 = _mm512_sub_ps(tmp1095, tmp1093);
__m512 tmp1100 = _mm512_sub_ps(tmp1099, tmp1097);
tmp1095 = _mm512_add_ps(tmp1093, tmp1095);
tmp1099 = _mm512_add_ps(tmp1097, tmp1099);
tmp1093 = _mm512_fmadd_ps(tmp1087, _mm512_set1_ps(2.5e-01f), in226);
tmp1097 = _mm512_fmadd_ps(tmp1091, _mm512_set1_ps(2.5e-01f), in234);
tmp1094 = _mm512_fmadd_ps(tmp1086, _mm512_set1_ps(-1.25e+00f), tmp1094);
tmp1098 = _mm512_fmadd_ps(tmp1090, _mm512_set1_ps(-1.25e+00f), tmp1098);
tmp1086 = _mm512_fmadd_ps(tmp1086, _mm512_set1_ps(-5e+00f), tmp1088);
tmp1090 = _mm512_fmadd_ps(tmp1090, _mm512_set1_ps(-5e+00f), tmp1092);
tmp1093 = _mm512_fmadd_ps(in230, _mm512_set1_ps(-1.25e+00f), tmp1093);
tmp1097 = _mm512_fmadd_ps(in238, _mm512_set1_ps(-1.25e+00f), tmp1097);
in228 = _mm512_fmadd_ps(tmp1093, _mm512_set1_ps(2e+00f), tmp1094);
in236 = _mm512_fmadd_ps(tmp1097, _mm512_set1_ps(2e+00f), tmp1098);
tmp1094 = _mm512_fnmadd_ps(tmp1093, _mm512_set1_ps(2e+00f), tmp1094);
tmp1098 = _mm512_fnmadd_ps(tmp1097, _mm512_set1_ps(2e+00f), tmp1098);
tmp1093 = _mm512_fmadd_ps(in226, _mm512_set1_ps(2.5e-01f), tmp1087);
tmp1097 = _mm512_fmadd_ps(in234, _mm512_set1_ps(2.5e-01f), tmp1091);
tmp1087 = _mm512_sub_ps(in227, tmp1087);
tmp1091 = _mm512_sub_ps(in235, tmp1091);
tmp1093 = _mm512_fmadd_ps(in230, _mm512_set1_ps(-1.25e+00f), tmp1093);
tmp1097 = _mm512_fmadd_ps(in238, _mm512_set1_ps(-1.25e+00f), tmp1097);
in230 = _mm512_sub_ps(in230, in226);
in238 = _mm512_sub_ps(in238, in234);
in230 = _mm512_fmadd_ps(in230, _mm512_set1_ps(5.25e+00f), tmp1087);
in238 = _mm512_fmadd_ps(in238, _mm512_set1_ps(5.25e+00f), tmp1091);
tmp1088 = _mm512_fmadd_ps(tmp1093, _mm512_set1_ps(2e+00f), tmp1086);
tmp1092 = _mm512_fmadd_ps(tmp1097, _mm512_set1_ps(2e+00f), tmp1090);
tmp1086 = _mm512_fnmadd_ps(tmp1093, _mm512_set1_ps(2e+00f), tmp1086);
tmp1090 = _mm512_fnmadd_ps(tmp1097, _mm512_set1_ps(2e+00f), tmp1090);
// Pack the transformed vectors in pairs: immediate 68 concatenates the low
// 256 bits of both operands, 238 concatenates their high 256 bits.
__m512 out277 = _mm512_shuffle_f32x4(in224, tmp1095, 68);
__m512 out285 = _mm512_shuffle_f32x4(in224, tmp1095, 238);
__m512 out278 = _mm512_shuffle_f32x4(tmp1096, in228, 68);
__m512 out286 = _mm512_shuffle_f32x4(tmp1096, in228, 238);
__m512 out279 = _mm512_shuffle_f32x4(tmp1094, tmp1088, 68);
__m512 out287 = _mm512_shuffle_f32x4(tmp1094, tmp1088, 238);
__m512 out280 = _mm512_shuffle_f32x4(tmp1086, in230, 68);
__m512 out288 = _mm512_shuffle_f32x4(tmp1086, in230, 238);
__m512 out281 = _mm512_shuffle_f32x4(in232, tmp1099, 68);
__m512 out289 = _mm512_shuffle_f32x4(in232, tmp1099, 238);
__m512 out282 = _mm512_shuffle_f32x4(tmp1100, in236, 68);
__m512 out290 = _mm512_shuffle_f32x4(tmp1100, in236, 238);
__m512 out283 = _mm512_shuffle_f32x4(tmp1098, tmp1092, 68);
__m512 out291 = _mm512_shuffle_f32x4(tmp1098, tmp1092, 238);
__m512 out284 = _mm512_shuffle_f32x4(tmp1090, in238, 68);
__m512 out292 = _mm512_shuffle_f32x4(tmp1090, in238, 238);
// Store the 16 packed vectors (full, unmasked 16-float stores) into four
// regions of dfPtr2 that start 714240 apart, at offsets 256, 320, 384 and 448
// within each region.
// NOTE(review): the 64-unit spacing between 16-float stores suggests dfPtr2 is
// a byte pointer -- confirm at its declaration.
_mm512_storeu_ps(dfPtr2+256+2856960*i8+178560*j4+44544*ss1+768*k8, out277);
_mm512_storeu_ps(dfPtr2+384+2856960*i8+178560*j4+44544*ss1+768*k8, out285);
_mm512_storeu_ps(dfPtr2+320+2856960*i8+178560*j4+44544*ss1+768*k8, out281);
_mm512_storeu_ps(dfPtr2+448+2856960*i8+178560*j4+44544*ss1+768*k8, out289);
_mm512_storeu_ps(dfPtr2+714496+2856960*i8+178560*j4+44544*ss1+768*k8, out278);
_mm512_storeu_ps(dfPtr2+714624+2856960*i8+178560*j4+44544*ss1+768*k8, out286);
_mm512_storeu_ps(dfPtr2+714560+2856960*i8+178560*j4+44544*ss1+768*k8, out282);
_mm512_storeu_ps(dfPtr2+714688+2856960*i8+178560*j4+44544*ss1+768*k8, out290);
_mm512_storeu_ps(dfPtr2+1428736+2856960*i8+178560*j4+44544*ss1+768*k8, out279);
_mm512_storeu_ps(dfPtr2+1428864+2856960*i8+178560*j4+44544*ss1+768*k8, out287);
_mm512_storeu_ps(dfPtr2+1428800+2856960*i8+178560*j4+44544*ss1+768*k8, out283);
_mm512_storeu_ps(dfPtr2+1428928+2856960*i8+178560*j4+44544*ss1+768*k8, out291);
_mm512_storeu_ps(dfPtr2+2142976+2856960*i8+178560*j4+44544*ss1+768*k8, out280);
_mm512_storeu_ps(dfPtr2+2143104+2856960*i8+178560*j4+44544*ss1+768*k8, out288);
_mm512_storeu_ps(dfPtr2+2143040+2856960*i8+178560*j4+44544*ss1+768*k8, out284);
_mm512_storeu_ps(dfPtr2+2143168+2856960*i8+178560*j4+44544*ss1+768*k8, out292);
// Load two sets of eight vectors from datPtr2; successive vectors within a set
// are 92 apart.
// First set (in240..in247), starting at offset 3544: mask 2047 (0x7FF) loads
// only 11 floats. pm33 copies elements 0..7 to lanes 0..7 and elements 6..10
// to lanes 8..12; lanes 13..15 read element 15, which the masked load zeroed.
// Second set (in248..in255), starting at offset 4048: mask 16383 (0x3FFF) loads
// 14 floats. pm34 copies elements 0..7 to lanes 0..7 and elements 6..13 to
// lanes 8..15.
// NOTE(review): 3544 = 3496 + 48, i.e. 12 floats into a row if offsets are
// bytes, leaving 11 of the 23 columns given by Width=23 in the file header --
// presumably this is the right-edge tile pair, zero-padded; confirm.
__m512 dat225 = _mm512_maskz_loadu_ps(2047, datPtr2+3544+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat226 = _mm512_maskz_loadu_ps(16383, datPtr2+4048+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512i pm33 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in240 = _mm512_permutexvar_ps(pm33, dat225);
__m512i pm34 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in248 = _mm512_permutexvar_ps(pm34, dat226);
__m512 dat227 = _mm512_maskz_loadu_ps(2047, datPtr2+3636+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat228 = _mm512_maskz_loadu_ps(16383, datPtr2+4140+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in241 = _mm512_permutexvar_ps(pm33, dat227);
__m512 in249 = _mm512_permutexvar_ps(pm34, dat228);
__m512 dat229 = _mm512_maskz_loadu_ps(2047, datPtr2+3728+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat230 = _mm512_maskz_loadu_ps(16383, datPtr2+4232+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in242 = _mm512_permutexvar_ps(pm33, dat229);
__m512 in250 = _mm512_permutexvar_ps(pm34, dat230);
__m512 dat231 = _mm512_maskz_loadu_ps(2047, datPtr2+3820+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat232 = _mm512_maskz_loadu_ps(16383, datPtr2+4324+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in243 = _mm512_permutexvar_ps(pm33, dat231);
__m512 in251 = _mm512_permutexvar_ps(pm34, dat232);
__m512 dat233 = _mm512_maskz_loadu_ps(2047, datPtr2+3912+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat234 = _mm512_maskz_loadu_ps(16383, datPtr2+4416+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in244 = _mm512_permutexvar_ps(pm33, dat233);
__m512 in252 = _mm512_permutexvar_ps(pm34, dat234);
__m512 dat235 = _mm512_maskz_loadu_ps(2047, datPtr2+4004+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat236 = _mm512_maskz_loadu_ps(16383, datPtr2+4508+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in245 = _mm512_permutexvar_ps(pm33, dat235);
__m512 in253 = _mm512_permutexvar_ps(pm34, dat236);
__m512 dat237 = _mm512_maskz_loadu_ps(2047, datPtr2+4096+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat238 = _mm512_maskz_loadu_ps(16383, datPtr2+4600+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in246 = _mm512_permutexvar_ps(pm33, dat237);
__m512 in254 = _mm512_permutexvar_ps(pm34, dat238);
__m512 dat239 = _mm512_maskz_loadu_ps(2047, datPtr2+4188+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat240 = _mm512_maskz_loadu_ps(16383, datPtr2+4692+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in247 = _mm512_permutexvar_ps(pm33, dat239);
__m512 in255 = _mm512_permutexvar_ps(pm34, dat240);
// First 1-D transform pass. Two independent 8-vector sets are processed with
// interleaved statements: x0..x7 = in240..in247 and, in parallel,
// in248..in255. All operations are lane-wise, so each of the 16 lanes is
// transformed independently. In exact arithmetic the results are:
//   in240   = x0 - x6 + 5.25*(x4 - x2)
//   tmp1151 = (x2 + x6 - 4.25*x4) + (x1 + x5 - 4.25*x3)
//   tmp1152 = (x2 + x6 - 4.25*x4) - (x1 + x5 - 4.25*x3)
//   in246   = (0.25*x2 + x6 - 1.25*x4) + 2*(0.25*x1 + x5 - 1.25*x3)
//   tmp1150 = (0.25*x2 + x6 - 1.25*x4) - 2*(0.25*x1 + x5 - 1.25*x3)
//   in242   = (4*x2 + x6 - 5*x4) + 2*(x1 + 0.25*x5 - 1.25*x3)
//   in244   = (4*x2 + x6 - 5*x4) - 2*(x1 + 0.25*x5 - 1.25*x3)
//   in243   = x7 - x1 + 5.25*(x3 - x5)
// (second set: in248, tmp1155, tmp1156, in254, tmp1154, in250, in252, in251).
// Inputs are overwritten in place, so statement order matters.
// NOTE(review): the coefficients match the 8x8 input-transform matrix of
// Winograd F(6x6, 3x3) convolution -- confirm against the generator.
__m512 tmp1149 = _mm512_add_ps(in241, in245);
__m512 tmp1153 = _mm512_add_ps(in249, in253);
__m512 tmp1150 = _mm512_sub_ps(in244, in242);
__m512 tmp1154 = _mm512_sub_ps(in252, in250);
__m512 tmp1151 = _mm512_add_ps(in242, in246);
__m512 tmp1155 = _mm512_add_ps(in250, in254);
in240 = _mm512_sub_ps(in240, in246);
in248 = _mm512_sub_ps(in248, in254);
tmp1149 = _mm512_fmadd_ps(in243, _mm512_set1_ps(-4.25e+00f), tmp1149);
tmp1153 = _mm512_fmadd_ps(in251, _mm512_set1_ps(-4.25e+00f), tmp1153);
tmp1151 = _mm512_fmadd_ps(in244, _mm512_set1_ps(-4.25e+00f), tmp1151);
tmp1155 = _mm512_fmadd_ps(in252, _mm512_set1_ps(-4.25e+00f), tmp1155);
in240 = _mm512_fmadd_ps(tmp1150, _mm512_set1_ps(5.25e+00f), in240);
in248 = _mm512_fmadd_ps(tmp1154, _mm512_set1_ps(5.25e+00f), in248);
tmp1150 = _mm512_fmadd_ps(in242, _mm512_set1_ps(2.5e-01f), in246);
tmp1154 = _mm512_fmadd_ps(in250, _mm512_set1_ps(2.5e-01f), in254);
in242 = _mm512_fmadd_ps(in242, _mm512_set1_ps(4e+00f), in246);
in250 = _mm512_fmadd_ps(in250, _mm512_set1_ps(4e+00f), in254);
__m512 tmp1152 = _mm512_sub_ps(tmp1151, tmp1149);
__m512 tmp1156 = _mm512_sub_ps(tmp1155, tmp1153);
tmp1151 = _mm512_add_ps(tmp1149, tmp1151);
tmp1155 = _mm512_add_ps(tmp1153, tmp1155);
tmp1149 = _mm512_fmadd_ps(in241, _mm512_set1_ps(2.5e-01f), in245);
tmp1153 = _mm512_fmadd_ps(in249, _mm512_set1_ps(2.5e-01f), in253);
tmp1150 = _mm512_fmadd_ps(in244, _mm512_set1_ps(-1.25e+00f), tmp1150);
tmp1154 = _mm512_fmadd_ps(in252, _mm512_set1_ps(-1.25e+00f), tmp1154);
in244 = _mm512_fmadd_ps(in244, _mm512_set1_ps(-5e+00f), in242);
in252 = _mm512_fmadd_ps(in252, _mm512_set1_ps(-5e+00f), in250);
tmp1149 = _mm512_fmadd_ps(in243, _mm512_set1_ps(-1.25e+00f), tmp1149);
tmp1153 = _mm512_fmadd_ps(in251, _mm512_set1_ps(-1.25e+00f), tmp1153);
in246 = _mm512_fmadd_ps(tmp1149, _mm512_set1_ps(2e+00f), tmp1150);
in254 = _mm512_fmadd_ps(tmp1153, _mm512_set1_ps(2e+00f), tmp1154);
tmp1150 = _mm512_fnmadd_ps(tmp1149, _mm512_set1_ps(2e+00f), tmp1150);
tmp1154 = _mm512_fnmadd_ps(tmp1153, _mm512_set1_ps(2e+00f), tmp1154);
tmp1149 = _mm512_fmadd_ps(in245, _mm512_set1_ps(2.5e-01f), in241);
tmp1153 = _mm512_fmadd_ps(in253, _mm512_set1_ps(2.5e-01f), in249);
in241 = _mm512_sub_ps(in247, in241);
in249 = _mm512_sub_ps(in255, in249);
tmp1149 = _mm512_fmadd_ps(in243, _mm512_set1_ps(-1.25e+00f), tmp1149);
tmp1153 = _mm512_fmadd_ps(in251, _mm512_set1_ps(-1.25e+00f), tmp1153);
in243 = _mm512_sub_ps(in243, in245);
in251 = _mm512_sub_ps(in251, in253);
in243 = _mm512_fmadd_ps(in243, _mm512_set1_ps(5.25e+00f), in241);
in251 = _mm512_fmadd_ps(in251, _mm512_set1_ps(5.25e+00f), in249);
in242 = _mm512_fmadd_ps(tmp1149, _mm512_set1_ps(2e+00f), in244);
in250 = _mm512_fmadd_ps(tmp1153, _mm512_set1_ps(2e+00f), in252);
in244 = _mm512_fnmadd_ps(tmp1149, _mm512_set1_ps(2e+00f), in244);
in252 = _mm512_fnmadd_ps(tmp1153, _mm512_set1_ps(2e+00f), in252);
__m512 tmp1165 = _mm512_unpacklo_ps(in240, tmp1151);
__m512 tmp1166 = _mm512_unpackhi_ps(in240, tmp1151);
__m512 tmp1167 = _mm512_unpacklo_ps(tmp1152, in246);
__m512 tmp1168 = _mm512_unpackhi_ps(tmp1152, in246);
__m512 tmp1169 = _mm512_unpacklo_ps(tmp1150, in242);
__m512 tmp1170 = _mm512_unpackhi_ps(tmp1150, in242);
__m512 tmp1171 = _mm512_unpacklo_ps(in244, in243);
__m512 tmp1172 = _mm512_unpackhi_ps(in244, in243);
__m512 tmp1173 = _mm512_unpacklo_ps(in248, tmp1155);
__m512 tmp1174 = _mm512_unpackhi_ps(in248, tmp1155);
__m512 tmp1175 = _mm512_unpacklo_ps(tmp1156, in254);
__m512 tmp1176 = _mm512_unpackhi_ps(tmp1156, in254);
__m512 tmp1177 = _mm512_unpacklo_ps(tmp1154, in250);
__m512 tmp1178 = _mm512_unpackhi_ps(tmp1154, in250);
__m512 tmp1179 = _mm512_unpacklo_ps(in252, in251);
__m512 tmp1180 = _mm512_unpackhi_ps(in252, in251);
__m512 tmp1181 = _mm512_shuffle_ps(tmp1165, tmp1167, 68);
__m512 tmp1182 = _mm512_shuffle_ps(tmp1165, tmp1167, 238);
__m512 tmp1183 = _mm512_shuffle_ps(tmp1166, tmp1168, 68);
__m512 tmp1184 = _mm512_shuffle_ps(tmp1166, tmp1168, 238);
__m512 tmp1185 = _mm512_shuffle_ps(tmp1169, tmp1171, 68);
__m512 tmp1186 = _mm512_shuffle_ps(tmp1169, tmp1171, 238);
__m512 tmp1187 = _mm512_shuffle_ps(tmp1170, tmp1172, 68);
__m512 tmp1188 = _mm512_shuffle_ps(tmp1170, tmp1172, 238);
__m512 tmp1189 = _mm512_shuffle_ps(tmp1173, tmp1175, 68);
__m512 tmp1190 = _mm512_shuffle_ps(tmp1173, tmp1175, 238);
__m512 tmp1191 = _mm512_shuffle_ps(tmp1174, tmp1176, 68);
__m512 tmp1192 = _mm512_shuffle_ps(tmp1174, tmp1176, 238);
__m512 tmp1193 = _mm512_shuffle_ps(tmp1177, tmp1179, 68);
__m512 tmp1194 = _mm512_shuffle_ps(tmp1177, tmp1179, 238);
__m512 tmp1195 = _mm512_shuffle_ps(tmp1178, tmp1180, 68);
__m512 tmp1196 = _mm512_shuffle_ps(tmp1178, tmp1180, 238);
__m512 tmp1197 = _mm512_shuffle_f32x4(tmp1181, tmp1185, 136);
__m512 tmp1198 = _mm512_shuffle_f32x4(tmp1181, tmp1185, 221);
__m512 tmp1199 = _mm512_shuffle_f32x4(tmp1182, tmp1186, 136);
__m512 tmp1200 = _mm512_shuffle_f32x4(tmp1182, tmp1186, 221);
__m512 tmp1201 = _mm512_shuffle_f32x4(tmp1183, tmp1187, 136);
__m512 tmp1202 = _mm512_shuffle_f32x4(tmp1183, tmp1187, 221);
__m512 tmp1203 = _mm512_shuffle_f32x4(tmp1184, tmp1188, 136);
__m512 tmp1204 = _mm512_shuffle_f32x4(tmp1184, tmp1188, 221);
__m512 tmp1205 = _mm512_shuffle_f32x4(tmp1189, tmp1193, 136);
__m512 tmp1206 = _mm512_shuffle_f32x4(tmp1189, tmp1193, 221);
__m512 tmp1207 = _mm512_shuffle_f32x4(tmp1190, tmp1194, 136);
__m512 tmp1208 = _mm512_shuffle_f32x4(tmp1190, tmp1194, 221);
__m512 tmp1209 = _mm512_shuffle_f32x4(tmp1191, tmp1195, 136);
__m512 tmp1210 = _mm512_shuffle_f32x4(tmp1191, tmp1195, 221);
__m512 tmp1211 = _mm512_shuffle_f32x4(tmp1192, tmp1196, 136);
__m512 tmp1212 = _mm512_shuffle_f32x4(tmp1192, tmp1196, 221);
in240 = _mm512_shuffle_f32x4(tmp1197, tmp1205, 136);
in248 = _mm512_shuffle_f32x4(tmp1197, tmp1205, 221);
tmp1151 = _mm512_shuffle_f32x4(tmp1199, tmp1207, 136);
tmp1155 = _mm512_shuffle_f32x4(tmp1199, tmp1207, 221);
tmp1152 = _mm512_shuffle_f32x4(tmp1201, tmp1209, 136);
tmp1156 = _mm512_shuffle_f32x4(tmp1201, tmp1209, 221);
in246 = _mm512_shuffle_f32x4(tmp1203, tmp1211, 136);
in254 = _mm512_shuffle_f32x4(tmp1203, tmp1211, 221);
tmp1150 = _mm512_shuffle_f32x4(tmp1198, tmp1206, 136);
tmp1154 = _mm512_shuffle_f32x4(tmp1198, tmp1206, 221);
in242 = _mm512_shuffle_f32x4(tmp1200, tmp1208, 136);
in250 = _mm512_shuffle_f32x4(tmp1200, tmp1208, 221);
in244 = _mm512_shuffle_f32x4(tmp1202, tmp1210, 136);
in252 = _mm512_shuffle_f32x4(tmp1202, tmp1210, 221);
in243 = _mm512_shuffle_f32x4(tmp1204, tmp1212, 136);
in251 = _mm512_shuffle_f32x4(tmp1204, tmp1212, 221);
__m512 tmp1157 = _mm512_add_ps(tmp1151, in242);
__m512 tmp1161 = _mm512_add_ps(tmp1155, in250);
__m512 tmp1158 = _mm512_sub_ps(tmp1150, tmp1152);
__m512 tmp1162 = _mm512_sub_ps(tmp1154, tmp1156);
__m512 tmp1159 = _mm512_add_ps(tmp1152, in244);
__m512 tmp1163 = _mm512_add_ps(tmp1156, in252);
in240 = _mm512_sub_ps(in240, in244);
in248 = _mm512_sub_ps(in248, in252);
tmp1157 = _mm512_fmadd_ps(in246, _mm512_set1_ps(-4.25e+00f), tmp1157);
tmp1161 = _mm512_fmadd_ps(in254, _mm512_set1_ps(-4.25e+00f), tmp1161);
tmp1159 = _mm512_fmadd_ps(tmp1150, _mm512_set1_ps(-4.25e+00f), tmp1159);
tmp1163 = _mm512_fmadd_ps(tmp1154, _mm512_set1_ps(-4.25e+00f), tmp1163);
in240 = _mm512_fmadd_ps(tmp1158, _mm512_set1_ps(5.25e+00f), in240);
in248 = _mm512_fmadd_ps(tmp1162, _mm512_set1_ps(5.25e+00f), in248);
tmp1158 = _mm512_fmadd_ps(tmp1152, _mm512_set1_ps(2.5e-01f), in244);
tmp1162 = _mm512_fmadd_ps(tmp1156, _mm512_set1_ps(2.5e-01f), in252);
tmp1152 = _mm512_fmadd_ps(tmp1152, _mm512_set1_ps(4e+00f), in244);
tmp1156 = _mm512_fmadd_ps(tmp1156, _mm512_set1_ps(4e+00f), in252);
__m512 tmp1160 = _mm512_sub_ps(tmp1159, tmp1157);
__m512 tmp1164 = _mm512_sub_ps(tmp1163, tmp1161);
tmp1159 = _mm512_add_ps(tmp1157, tmp1159);
tmp1163 = _mm512_add_ps(tmp1161, tmp1163);
tmp1157 = _mm512_fmadd_ps(tmp1151, _mm512_set1_ps(2.5e-01f), in242);
tmp1161 = _mm512_fmadd_ps(tmp1155, _mm512_set1_ps(2.5e-01f), in250);
tmp1158 = _mm512_fmadd_ps(tmp1150, _mm512_set1_ps(-1.25e+00f), tmp1158);
tmp1162 = _mm512_fmadd_ps(tmp1154, _mm512_set1_ps(-1.25e+00f), tmp1162);
tmp1150 = _mm512_fmadd_ps(tmp1150, _mm512_set1_ps(-5e+00f), tmp1152);
tmp1154 = _mm512_fmadd_ps(tmp1154, _mm512_set1_ps(-5e+00f), tmp1156);
tmp1157 = _mm512_fmadd_ps(in246, _mm512_set1_ps(-1.25e+00f), tmp1157);
tmp1161 = _mm512_fmadd_ps(in254, _mm512_set1_ps(-1.25e+00f), tmp1161);
in244 = _mm512_fmadd_ps(tmp1157, _mm512_set1_ps(2e+00f), tmp1158);
in252 = _mm512_fmadd_ps(tmp1161, _mm512_set1_ps(2e+00f), tmp1162);
tmp1158 = _mm512_fnmadd_ps(tmp1157, _mm512_set1_ps(2e+00f), tmp1158);
tmp1162 = _mm512_fnmadd_ps(tmp1161, _mm512_set1_ps(2e+00f), tmp1162);
tmp1157 = _mm512_fmadd_ps(in242, _mm512_set1_ps(2.5e-01f), tmp1151);
tmp1161 = _mm512_fmadd_ps(in250, _mm512_set1_ps(2.5e-01f), tmp1155);
tmp1151 = _mm512_sub_ps(in243, tmp1151);
tmp1155 = _mm512_sub_ps(in251, tmp1155);
tmp1157 = _mm512_fmadd_ps(in246, _mm512_set1_ps(-1.25e+00f), tmp1157);
tmp1161 = _mm512_fmadd_ps(in254, _mm512_set1_ps(-1.25e+00f), tmp1161);
in246 = _mm512_sub_ps(in246, in242);
in254 = _mm512_sub_ps(in254, in250);
in246 = _mm512_fmadd_ps(in246, _mm512_set1_ps(5.25e+00f), tmp1151);
in254 = _mm512_fmadd_ps(in254, _mm512_set1_ps(5.25e+00f), tmp1155);
tmp1152 = _mm512_fmadd_ps(tmp1157, _mm512_set1_ps(2e+00f), tmp1150);
tmp1156 = _mm512_fmadd_ps(tmp1161, _mm512_set1_ps(2e+00f), tmp1154);
tmp1150 = _mm512_fnmadd_ps(tmp1157, _mm512_set1_ps(2e+00f), tmp1150);
tmp1154 = _mm512_fnmadd_ps(tmp1161, _mm512_set1_ps(2e+00f), tmp1154);
__m512 out293 = _mm512_shuffle_f32x4(in240, tmp1159, 68);
__m512 out301 = _mm512_shuffle_f32x4(in240, tmp1159, 238);
__m512 out294 = _mm512_shuffle_f32x4(tmp1160, in244, 68);
__m512 out302 = _mm512_shuffle_f32x4(tmp1160, in244, 238);
__m512 out295 = _mm512_shuffle_f32x4(tmp1158, tmp1152, 68);
__m512 out303 = _mm512_shuffle_f32x4(tmp1158, tmp1152, 238);
__m512 out296 = _mm512_shuffle_f32x4(tmp1150, in246, 68);
__m512 out304 = _mm512_shuffle_f32x4(tmp1150, in246, 238);
__m512 out297 = _mm512_shuffle_f32x4(in248, tmp1163, 68);
__m512 out305 = _mm512_shuffle_f32x4(in248, tmp1163, 238);
__m512 out298 = _mm512_shuffle_f32x4(tmp1164, in252, 68);
__m512 out306 = _mm512_shuffle_f32x4(tmp1164, in252, 238);
__m512 out299 = _mm512_shuffle_f32x4(tmp1162, tmp1156, 68);
__m512 out307 = _mm512_shuffle_f32x4(tmp1162, tmp1156, 238);
__m512 out300 = _mm512_shuffle_f32x4(tmp1154, in254, 68);
__m512 out308 = _mm512_shuffle_f32x4(tmp1154, in254, 238);
_mm512_storeu_ps(dfPtr2+512+2856960*i8+178560*j4+44544*ss1+768*k8, out293);
_mm512_storeu_ps(dfPtr2+640+2856960*i8+178560*j4+44544*ss1+768*k8, out301);
_mm512_storeu_ps(dfPtr2+576+2856960*i8+178560*j4+44544*ss1+768*k8, out297);
_mm512_storeu_ps(dfPtr2+704+2856960*i8+178560*j4+44544*ss1+768*k8, out305);
_mm512_storeu_ps(dfPtr2+714752+2856960*i8+178560*j4+44544*ss1+768*k8, out294);
_mm512_storeu_ps(dfPtr2+714880+2856960*i8+178560*j4+44544*ss1+768*k8, out302);
_mm512_storeu_ps(dfPtr2+714816+2856960*i8+178560*j4+44544*ss1+768*k8, out298);
_mm512_storeu_ps(dfPtr2+714944+2856960*i8+178560*j4+44544*ss1+768*k8, out306);
_mm512_storeu_ps(dfPtr2+1428992+2856960*i8+178560*j4+44544*ss1+768*k8, out295);
_mm512_storeu_ps(dfPtr2+1429120+2856960*i8+178560*j4+44544*ss1+768*k8, out303);
_mm512_storeu_ps(dfPtr2+1429056+2856960*i8+178560*j4+44544*ss1+768*k8, out299);
_mm512_storeu_ps(dfPtr2+1429184+2856960*i8+178560*j4+44544*ss1+768*k8, out307);
_mm512_storeu_ps(dfPtr2+2143232+2856960*i8+178560*j4+44544*ss1+768*k8, out296);
_mm512_storeu_ps(dfPtr2+2143360+2856960*i8+178560*j4+44544*ss1+768*k8, out304);
_mm512_storeu_ps(dfPtr2+2143296+2856960*i8+178560*j4+44544*ss1+768*k8, out300);
_mm512_storeu_ps(dfPtr2+2143424+2856960*i8+178560*j4+44544*ss1+768*k8, out308);
}
// ---- Tile: load 8 rows, two-pass transform, store 16 vectors at dfPtr2+0.. ----
// Load eight consecutive rows of the input tile from datPtr2 (row offsets
// 0, 92, ..., 644). Each row is fetched twice: a masked load of the low 14
// lanes (mask 16383) at the row start, and a masked load of the low 11 lanes
// (mask 2047) starting 48 further on. maskz zeroes every lane outside the mask.
// NOTE(review): 92 == 23*4, i.e. one Width=23 row of floats if datPtr2 is a
// byte pointer -- confirm against its declaration.
__m512 dat241 = _mm512_maskz_loadu_ps(16383, datPtr2+0+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat242 = _mm512_maskz_loadu_ps(2047, datPtr2+48+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
// pm35 (arguments are listed from lane 15 down to lane 0): lanes 0-7 take
// source elements 0-7 and lanes 8-15 take elements 6-13, so each 256-bit half
// holds an 8-element window and the two windows overlap by two elements.
__m512i pm35 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in256 = _mm512_permutexvar_ps(pm35, dat241);
// pm36: like pm35, except lanes 13-15 read element 15, which the 11-lane
// mask has zeroed; the upper window is therefore elements 6-10 followed by
// three zeros.
__m512i pm36 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in264 = _mm512_permutexvar_ps(pm36, dat242);
__m512 dat243 = _mm512_maskz_loadu_ps(16383, datPtr2+92+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat244 = _mm512_maskz_loadu_ps(2047, datPtr2+140+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in257 = _mm512_permutexvar_ps(pm35, dat243);
__m512 in265 = _mm512_permutexvar_ps(pm36, dat244);
__m512 dat245 = _mm512_maskz_loadu_ps(16383, datPtr2+184+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat246 = _mm512_maskz_loadu_ps(2047, datPtr2+232+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in258 = _mm512_permutexvar_ps(pm35, dat245);
__m512 in266 = _mm512_permutexvar_ps(pm36, dat246);
__m512 dat247 = _mm512_maskz_loadu_ps(16383, datPtr2+276+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat248 = _mm512_maskz_loadu_ps(2047, datPtr2+324+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in259 = _mm512_permutexvar_ps(pm35, dat247);
__m512 in267 = _mm512_permutexvar_ps(pm36, dat248);
__m512 dat249 = _mm512_maskz_loadu_ps(16383, datPtr2+368+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat250 = _mm512_maskz_loadu_ps(2047, datPtr2+416+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in260 = _mm512_permutexvar_ps(pm35, dat249);
__m512 in268 = _mm512_permutexvar_ps(pm36, dat250);
__m512 dat251 = _mm512_maskz_loadu_ps(16383, datPtr2+460+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat252 = _mm512_maskz_loadu_ps(2047, datPtr2+508+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in261 = _mm512_permutexvar_ps(pm35, dat251);
__m512 in269 = _mm512_permutexvar_ps(pm36, dat252);
__m512 dat253 = _mm512_maskz_loadu_ps(16383, datPtr2+552+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat254 = _mm512_maskz_loadu_ps(2047, datPtr2+600+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in262 = _mm512_permutexvar_ps(pm35, dat253);
__m512 in270 = _mm512_permutexvar_ps(pm36, dat254);
__m512 dat255 = _mm512_maskz_loadu_ps(16383, datPtr2+644+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 dat256 = _mm512_maskz_loadu_ps(2047, datPtr2+692+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in263 = _mm512_permutexvar_ps(pm35, dat255);
__m512 in271 = _mm512_permutexvar_ps(pm36, dat256);
// First pass: combine the eight row vectors lane-wise (in256..in263 and, in
// lockstep, in264..in271) into eight result vectors per set using the fixed
// coefficients 5.25, -4.25, 0.25, 4, -1.25, -5 and 2. Statements alternate
// between the two sets; registers are reused, so statement order matters.
// Results end up in in256, tmp1215, tmp1216, in262, tmp1214, in258, in260,
// in259 (and in264, tmp1219, tmp1220, in270, tmp1218, in266, in268, in267).
// NOTE(review): the coefficients look like a Winograd-style input transform
// for the 3x3 convolution -- confirm against the generator.
__m512 tmp1213 = _mm512_add_ps(in257, in261);
__m512 tmp1217 = _mm512_add_ps(in265, in269);
__m512 tmp1214 = _mm512_sub_ps(in260, in258);
__m512 tmp1218 = _mm512_sub_ps(in268, in266);
__m512 tmp1215 = _mm512_add_ps(in258, in262);
__m512 tmp1219 = _mm512_add_ps(in266, in270);
in256 = _mm512_sub_ps(in256, in262);
in264 = _mm512_sub_ps(in264, in270);
tmp1213 = _mm512_fmadd_ps(in259, _mm512_set1_ps(-4.25e+00f), tmp1213);
tmp1217 = _mm512_fmadd_ps(in267, _mm512_set1_ps(-4.25e+00f), tmp1217);
tmp1215 = _mm512_fmadd_ps(in260, _mm512_set1_ps(-4.25e+00f), tmp1215);
tmp1219 = _mm512_fmadd_ps(in268, _mm512_set1_ps(-4.25e+00f), tmp1219);
in256 = _mm512_fmadd_ps(tmp1214, _mm512_set1_ps(5.25e+00f), in256);
in264 = _mm512_fmadd_ps(tmp1218, _mm512_set1_ps(5.25e+00f), in264);
tmp1214 = _mm512_fmadd_ps(in258, _mm512_set1_ps(2.5e-01f), in262);
tmp1218 = _mm512_fmadd_ps(in266, _mm512_set1_ps(2.5e-01f), in270);
in258 = _mm512_fmadd_ps(in258, _mm512_set1_ps(4e+00f), in262);
in266 = _mm512_fmadd_ps(in266, _mm512_set1_ps(4e+00f), in270);
__m512 tmp1216 = _mm512_sub_ps(tmp1215, tmp1213);
__m512 tmp1220 = _mm512_sub_ps(tmp1219, tmp1217);
tmp1215 = _mm512_add_ps(tmp1213, tmp1215);
tmp1219 = _mm512_add_ps(tmp1217, tmp1219);
tmp1213 = _mm512_fmadd_ps(in257, _mm512_set1_ps(2.5e-01f), in261);
tmp1217 = _mm512_fmadd_ps(in265, _mm512_set1_ps(2.5e-01f), in269);
tmp1214 = _mm512_fmadd_ps(in260, _mm512_set1_ps(-1.25e+00f), tmp1214);
tmp1218 = _mm512_fmadd_ps(in268, _mm512_set1_ps(-1.25e+00f), tmp1218);
in260 = _mm512_fmadd_ps(in260, _mm512_set1_ps(-5e+00f), in258);
in268 = _mm512_fmadd_ps(in268, _mm512_set1_ps(-5e+00f), in266);
tmp1213 = _mm512_fmadd_ps(in259, _mm512_set1_ps(-1.25e+00f), tmp1213);
tmp1217 = _mm512_fmadd_ps(in267, _mm512_set1_ps(-1.25e+00f), tmp1217);
in262 = _mm512_fmadd_ps(tmp1213, _mm512_set1_ps(2e+00f), tmp1214);
in270 = _mm512_fmadd_ps(tmp1217, _mm512_set1_ps(2e+00f), tmp1218);
tmp1214 = _mm512_fnmadd_ps(tmp1213, _mm512_set1_ps(2e+00f), tmp1214);
tmp1218 = _mm512_fnmadd_ps(tmp1217, _mm512_set1_ps(2e+00f), tmp1218);
tmp1213 = _mm512_fmadd_ps(in261, _mm512_set1_ps(2.5e-01f), in257);
tmp1217 = _mm512_fmadd_ps(in269, _mm512_set1_ps(2.5e-01f), in265);
in257 = _mm512_sub_ps(in263, in257);
in265 = _mm512_sub_ps(in271, in265);
tmp1213 = _mm512_fmadd_ps(in259, _mm512_set1_ps(-1.25e+00f), tmp1213);
tmp1217 = _mm512_fmadd_ps(in267, _mm512_set1_ps(-1.25e+00f), tmp1217);
in259 = _mm512_sub_ps(in259, in261);
in267 = _mm512_sub_ps(in267, in269);
in259 = _mm512_fmadd_ps(in259, _mm512_set1_ps(5.25e+00f), in257);
in267 = _mm512_fmadd_ps(in267, _mm512_set1_ps(5.25e+00f), in265);
in258 = _mm512_fmadd_ps(tmp1213, _mm512_set1_ps(2e+00f), in260);
in266 = _mm512_fmadd_ps(tmp1217, _mm512_set1_ps(2e+00f), in268);
in260 = _mm512_fnmadd_ps(tmp1213, _mm512_set1_ps(2e+00f), in260);
in268 = _mm512_fnmadd_ps(tmp1217, _mm512_set1_ps(2e+00f), in268);
// Rearrange elements across the sixteen first-pass results.
// Step 1: interleave 32-bit elements of register pairs (unpacklo/unpackhi).
// NOTE(review): steps 1-4 together appear to form a block transpose so that
// the second pass runs along the other tile axis -- confirm.
__m512 tmp1229 = _mm512_unpacklo_ps(in256, tmp1215);
__m512 tmp1230 = _mm512_unpackhi_ps(in256, tmp1215);
__m512 tmp1231 = _mm512_unpacklo_ps(tmp1216, in262);
__m512 tmp1232 = _mm512_unpackhi_ps(tmp1216, in262);
__m512 tmp1233 = _mm512_unpacklo_ps(tmp1214, in258);
__m512 tmp1234 = _mm512_unpackhi_ps(tmp1214, in258);
__m512 tmp1235 = _mm512_unpacklo_ps(in260, in259);
__m512 tmp1236 = _mm512_unpackhi_ps(in260, in259);
__m512 tmp1237 = _mm512_unpacklo_ps(in264, tmp1219);
__m512 tmp1238 = _mm512_unpackhi_ps(in264, tmp1219);
__m512 tmp1239 = _mm512_unpacklo_ps(tmp1220, in270);
__m512 tmp1240 = _mm512_unpackhi_ps(tmp1220, in270);
__m512 tmp1241 = _mm512_unpacklo_ps(tmp1218, in266);
__m512 tmp1242 = _mm512_unpackhi_ps(tmp1218, in266);
__m512 tmp1243 = _mm512_unpacklo_ps(in268, in267);
__m512 tmp1244 = _mm512_unpackhi_ps(in268, in267);
// Step 2: within each 128-bit lane, immediate 68 keeps the low element pair
// of both operands and 238 keeps the high element pair of both operands.
__m512 tmp1245 = _mm512_shuffle_ps(tmp1229, tmp1231, 68);
__m512 tmp1246 = _mm512_shuffle_ps(tmp1229, tmp1231, 238);
__m512 tmp1247 = _mm512_shuffle_ps(tmp1230, tmp1232, 68);
__m512 tmp1248 = _mm512_shuffle_ps(tmp1230, tmp1232, 238);
__m512 tmp1249 = _mm512_shuffle_ps(tmp1233, tmp1235, 68);
__m512 tmp1250 = _mm512_shuffle_ps(tmp1233, tmp1235, 238);
__m512 tmp1251 = _mm512_shuffle_ps(tmp1234, tmp1236, 68);
__m512 tmp1252 = _mm512_shuffle_ps(tmp1234, tmp1236, 238);
__m512 tmp1253 = _mm512_shuffle_ps(tmp1237, tmp1239, 68);
__m512 tmp1254 = _mm512_shuffle_ps(tmp1237, tmp1239, 238);
__m512 tmp1255 = _mm512_shuffle_ps(tmp1238, tmp1240, 68);
__m512 tmp1256 = _mm512_shuffle_ps(tmp1238, tmp1240, 238);
__m512 tmp1257 = _mm512_shuffle_ps(tmp1241, tmp1243, 68);
__m512 tmp1258 = _mm512_shuffle_ps(tmp1241, tmp1243, 238);
__m512 tmp1259 = _mm512_shuffle_ps(tmp1242, tmp1244, 68);
__m512 tmp1260 = _mm512_shuffle_ps(tmp1242, tmp1244, 238);
// Step 3: 128-bit lane selection. Immediate 136 gathers lanes 0 and 2 of
// each operand; 221 gathers lanes 1 and 3 of each operand.
__m512 tmp1261 = _mm512_shuffle_f32x4(tmp1245, tmp1249, 136);
__m512 tmp1262 = _mm512_shuffle_f32x4(tmp1245, tmp1249, 221);
__m512 tmp1263 = _mm512_shuffle_f32x4(tmp1246, tmp1250, 136);
__m512 tmp1264 = _mm512_shuffle_f32x4(tmp1246, tmp1250, 221);
__m512 tmp1265 = _mm512_shuffle_f32x4(tmp1247, tmp1251, 136);
__m512 tmp1266 = _mm512_shuffle_f32x4(tmp1247, tmp1251, 221);
__m512 tmp1267 = _mm512_shuffle_f32x4(tmp1248, tmp1252, 136);
__m512 tmp1268 = _mm512_shuffle_f32x4(tmp1248, tmp1252, 221);
__m512 tmp1269 = _mm512_shuffle_f32x4(tmp1253, tmp1257, 136);
__m512 tmp1270 = _mm512_shuffle_f32x4(tmp1253, tmp1257, 221);
__m512 tmp1271 = _mm512_shuffle_f32x4(tmp1254, tmp1258, 136);
__m512 tmp1272 = _mm512_shuffle_f32x4(tmp1254, tmp1258, 221);
__m512 tmp1273 = _mm512_shuffle_f32x4(tmp1255, tmp1259, 136);
__m512 tmp1274 = _mm512_shuffle_f32x4(tmp1255, tmp1259, 221);
__m512 tmp1275 = _mm512_shuffle_f32x4(tmp1256, tmp1260, 136);
__m512 tmp1276 = _mm512_shuffle_f32x4(tmp1256, tmp1260, 221);
// Step 4: a second round of the same lane selection, merging the two sets
// and writing the rearranged data back into the first-pass register names.
in256 = _mm512_shuffle_f32x4(tmp1261, tmp1269, 136);
in264 = _mm512_shuffle_f32x4(tmp1261, tmp1269, 221);
tmp1215 = _mm512_shuffle_f32x4(tmp1263, tmp1271, 136);
tmp1219 = _mm512_shuffle_f32x4(tmp1263, tmp1271, 221);
tmp1216 = _mm512_shuffle_f32x4(tmp1265, tmp1273, 136);
tmp1220 = _mm512_shuffle_f32x4(tmp1265, tmp1273, 221);
in262 = _mm512_shuffle_f32x4(tmp1267, tmp1275, 136);
in270 = _mm512_shuffle_f32x4(tmp1267, tmp1275, 221);
tmp1214 = _mm512_shuffle_f32x4(tmp1262, tmp1270, 136);
tmp1218 = _mm512_shuffle_f32x4(tmp1262, tmp1270, 221);
in258 = _mm512_shuffle_f32x4(tmp1264, tmp1272, 136);
in266 = _mm512_shuffle_f32x4(tmp1264, tmp1272, 221);
in260 = _mm512_shuffle_f32x4(tmp1266, tmp1274, 136);
in268 = _mm512_shuffle_f32x4(tmp1266, tmp1274, 221);
in259 = _mm512_shuffle_f32x4(tmp1268, tmp1276, 136);
in267 = _mm512_shuffle_f32x4(tmp1268, tmp1276, 221);
// Second pass: the same sequence of add/sub/fmadd/fnmadd operations and
// coefficients as the first pass, now applied to the rearranged vectors.
__m512 tmp1221 = _mm512_add_ps(tmp1215, in258);
__m512 tmp1225 = _mm512_add_ps(tmp1219, in266);
__m512 tmp1222 = _mm512_sub_ps(tmp1214, tmp1216);
__m512 tmp1226 = _mm512_sub_ps(tmp1218, tmp1220);
__m512 tmp1223 = _mm512_add_ps(tmp1216, in260);
__m512 tmp1227 = _mm512_add_ps(tmp1220, in268);
in256 = _mm512_sub_ps(in256, in260);
in264 = _mm512_sub_ps(in264, in268);
tmp1221 = _mm512_fmadd_ps(in262, _mm512_set1_ps(-4.25e+00f), tmp1221);
tmp1225 = _mm512_fmadd_ps(in270, _mm512_set1_ps(-4.25e+00f), tmp1225);
tmp1223 = _mm512_fmadd_ps(tmp1214, _mm512_set1_ps(-4.25e+00f), tmp1223);
tmp1227 = _mm512_fmadd_ps(tmp1218, _mm512_set1_ps(-4.25e+00f), tmp1227);
in256 = _mm512_fmadd_ps(tmp1222, _mm512_set1_ps(5.25e+00f), in256);
in264 = _mm512_fmadd_ps(tmp1226, _mm512_set1_ps(5.25e+00f), in264);
tmp1222 = _mm512_fmadd_ps(tmp1216, _mm512_set1_ps(2.5e-01f), in260);
tmp1226 = _mm512_fmadd_ps(tmp1220, _mm512_set1_ps(2.5e-01f), in268);
tmp1216 = _mm512_fmadd_ps(tmp1216, _mm512_set1_ps(4e+00f), in260);
tmp1220 = _mm512_fmadd_ps(tmp1220, _mm512_set1_ps(4e+00f), in268);
__m512 tmp1224 = _mm512_sub_ps(tmp1223, tmp1221);
__m512 tmp1228 = _mm512_sub_ps(tmp1227, tmp1225);
tmp1223 = _mm512_add_ps(tmp1221, tmp1223);
tmp1227 = _mm512_add_ps(tmp1225, tmp1227);
tmp1221 = _mm512_fmadd_ps(tmp1215, _mm512_set1_ps(2.5e-01f), in258);
tmp1225 = _mm512_fmadd_ps(tmp1219, _mm512_set1_ps(2.5e-01f), in266);
tmp1222 = _mm512_fmadd_ps(tmp1214, _mm512_set1_ps(-1.25e+00f), tmp1222);
tmp1226 = _mm512_fmadd_ps(tmp1218, _mm512_set1_ps(-1.25e+00f), tmp1226);
tmp1214 = _mm512_fmadd_ps(tmp1214, _mm512_set1_ps(-5e+00f), tmp1216);
tmp1218 = _mm512_fmadd_ps(tmp1218, _mm512_set1_ps(-5e+00f), tmp1220);
tmp1221 = _mm512_fmadd_ps(in262, _mm512_set1_ps(-1.25e+00f), tmp1221);
tmp1225 = _mm512_fmadd_ps(in270, _mm512_set1_ps(-1.25e+00f), tmp1225);
in260 = _mm512_fmadd_ps(tmp1221, _mm512_set1_ps(2e+00f), tmp1222);
in268 = _mm512_fmadd_ps(tmp1225, _mm512_set1_ps(2e+00f), tmp1226);
tmp1222 = _mm512_fnmadd_ps(tmp1221, _mm512_set1_ps(2e+00f), tmp1222);
tmp1226 = _mm512_fnmadd_ps(tmp1225, _mm512_set1_ps(2e+00f), tmp1226);
tmp1221 = _mm512_fmadd_ps(in258, _mm512_set1_ps(2.5e-01f), tmp1215);
tmp1225 = _mm512_fmadd_ps(in266, _mm512_set1_ps(2.5e-01f), tmp1219);
tmp1215 = _mm512_sub_ps(in259, tmp1215);
tmp1219 = _mm512_sub_ps(in267, tmp1219);
tmp1221 = _mm512_fmadd_ps(in262, _mm512_set1_ps(-1.25e+00f), tmp1221);
tmp1225 = _mm512_fmadd_ps(in270, _mm512_set1_ps(-1.25e+00f), tmp1225);
in262 = _mm512_sub_ps(in262, in258);
in270 = _mm512_sub_ps(in270, in266);
in262 = _mm512_fmadd_ps(in262, _mm512_set1_ps(5.25e+00f), tmp1215);
in270 = _mm512_fmadd_ps(in270, _mm512_set1_ps(5.25e+00f), tmp1219);
tmp1216 = _mm512_fmadd_ps(tmp1221, _mm512_set1_ps(2e+00f), tmp1214);
tmp1220 = _mm512_fmadd_ps(tmp1225, _mm512_set1_ps(2e+00f), tmp1218);
tmp1214 = _mm512_fnmadd_ps(tmp1221, _mm512_set1_ps(2e+00f), tmp1214);
tmp1218 = _mm512_fnmadd_ps(tmp1225, _mm512_set1_ps(2e+00f), tmp1218);
// Pair up second-pass results into the output vectors: immediate 68
// concatenates the low 256 bits of both operands, 238 the high 256 bits.
__m512 out309 = _mm512_shuffle_f32x4(in256, tmp1223, 68);
__m512 out317 = _mm512_shuffle_f32x4(in256, tmp1223, 238);
__m512 out310 = _mm512_shuffle_f32x4(tmp1224, in260, 68);
__m512 out318 = _mm512_shuffle_f32x4(tmp1224, in260, 238);
__m512 out311 = _mm512_shuffle_f32x4(tmp1222, tmp1216, 68);
__m512 out319 = _mm512_shuffle_f32x4(tmp1222, tmp1216, 238);
__m512 out312 = _mm512_shuffle_f32x4(tmp1214, in262, 68);
__m512 out320 = _mm512_shuffle_f32x4(tmp1214, in262, 238);
__m512 out313 = _mm512_shuffle_f32x4(in264, tmp1227, 68);
__m512 out321 = _mm512_shuffle_f32x4(in264, tmp1227, 238);
__m512 out314 = _mm512_shuffle_f32x4(tmp1228, in268, 68);
__m512 out322 = _mm512_shuffle_f32x4(tmp1228, in268, 238);
__m512 out315 = _mm512_shuffle_f32x4(tmp1226, tmp1220, 68);
__m512 out323 = _mm512_shuffle_f32x4(tmp1226, tmp1220, 238);
__m512 out316 = _mm512_shuffle_f32x4(tmp1218, in270, 68);
__m512 out324 = _mm512_shuffle_f32x4(tmp1218, in270, 238);
// Store the sixteen output vectors (unaligned stores) into dfPtr2 in four
// groups whose base offsets are 714240 apart (0, 714240, 1428480, 2142720);
// within each group the vectors land at +0, +64, +128 and +192.
// NOTE(review): the 64-unit spacing matches sizeof(__m512), so dfPtr2 is
// presumably a byte pointer -- confirm against its declaration.
_mm512_storeu_ps(dfPtr2+0+2856960*i8+178560*j4+44544*ss1+768*k8, out309);
_mm512_storeu_ps(dfPtr2+128+2856960*i8+178560*j4+44544*ss1+768*k8, out317);
_mm512_storeu_ps(dfPtr2+64+2856960*i8+178560*j4+44544*ss1+768*k8, out313);
_mm512_storeu_ps(dfPtr2+192+2856960*i8+178560*j4+44544*ss1+768*k8, out321);
_mm512_storeu_ps(dfPtr2+714240+2856960*i8+178560*j4+44544*ss1+768*k8, out310);
_mm512_storeu_ps(dfPtr2+714368+2856960*i8+178560*j4+44544*ss1+768*k8, out318);
_mm512_storeu_ps(dfPtr2+714304+2856960*i8+178560*j4+44544*ss1+768*k8, out314);
_mm512_storeu_ps(dfPtr2+714432+2856960*i8+178560*j4+44544*ss1+768*k8, out322);
_mm512_storeu_ps(dfPtr2+1428480+2856960*i8+178560*j4+44544*ss1+768*k8, out311);
_mm512_storeu_ps(dfPtr2+1428608+2856960*i8+178560*j4+44544*ss1+768*k8, out319);
_mm512_storeu_ps(dfPtr2+1428544+2856960*i8+178560*j4+44544*ss1+768*k8, out315);
_mm512_storeu_ps(dfPtr2+1428672+2856960*i8+178560*j4+44544*ss1+768*k8, out323);
_mm512_storeu_ps(dfPtr2+2142720+2856960*i8+178560*j4+44544*ss1+768*k8, out312);
_mm512_storeu_ps(dfPtr2+2142848+2856960*i8+178560*j4+44544*ss1+768*k8, out320);
_mm512_storeu_ps(dfPtr2+2142784+2856960*i8+178560*j4+44544*ss1+768*k8, out316);
_mm512_storeu_ps(dfPtr2+2142912+2856960*i8+178560*j4+44544*ss1+768*k8, out324);
// ---- Tile: load 8 rows (one load each), two-pass transform, store 8 vectors at dfPtr2+256.. ----
// Load eight rows at row offsets 552, 644, ..., 1196 (stride 92) with a
// masked load of the low 14 lanes (mask 16383); maskz zeroes lanes 14-15.
// pm37 (arguments are listed from lane 15 down to lane 0) then places source
// elements 0-7 in lanes 0-7 and elements 6-13 in lanes 8-15, giving two
// 8-element windows per row that overlap by two elements.
__m512 dat257 = _mm512_maskz_loadu_ps(16383, datPtr2+552+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512i pm37 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in272 = _mm512_permutexvar_ps(pm37, dat257);
__m512 dat258 = _mm512_maskz_loadu_ps(16383, datPtr2+644+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in273 = _mm512_permutexvar_ps(pm37, dat258);
__m512 dat259 = _mm512_maskz_loadu_ps(16383, datPtr2+736+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in274 = _mm512_permutexvar_ps(pm37, dat259);
__m512 dat260 = _mm512_maskz_loadu_ps(16383, datPtr2+828+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in275 = _mm512_permutexvar_ps(pm37, dat260);
__m512 dat261 = _mm512_maskz_loadu_ps(16383, datPtr2+920+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in276 = _mm512_permutexvar_ps(pm37, dat261);
__m512 dat262 = _mm512_maskz_loadu_ps(16383, datPtr2+1012+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in277 = _mm512_permutexvar_ps(pm37, dat262);
__m512 dat263 = _mm512_maskz_loadu_ps(16383, datPtr2+1104+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in278 = _mm512_permutexvar_ps(pm37, dat263);
__m512 dat264 = _mm512_maskz_loadu_ps(16383, datPtr2+1196+7163304*i8+92*h3+4*w3+405536*ss1+6992*k8);
__m512 in279 = _mm512_permutexvar_ps(pm37, dat264);
// First pass: combine the eight row vectors in272..in279 lane-wise into
// eight result vectors using the fixed coefficients 5.25, -4.25, 0.25, 4,
// -1.25, -5 and 2. Registers are reused, so statement order matters.
// Results end up in in272, tmp1279, tmp1280, in278, tmp1278, in274, in276,
// in275.
// NOTE(review): the coefficients look like a Winograd-style input transform
// for the 3x3 convolution -- confirm against the generator.
__m512 tmp1277 = _mm512_add_ps(in273, in277);
__m512 tmp1278 = _mm512_sub_ps(in276, in274);
__m512 tmp1279 = _mm512_add_ps(in274, in278);
in272 = _mm512_sub_ps(in272, in278);
tmp1277 = _mm512_fmadd_ps(in275, _mm512_set1_ps(-4.25e+00f), tmp1277);
tmp1279 = _mm512_fmadd_ps(in276, _mm512_set1_ps(-4.25e+00f), tmp1279);
in272 = _mm512_fmadd_ps(tmp1278, _mm512_set1_ps(5.25e+00f), in272);
tmp1278 = _mm512_fmadd_ps(in274, _mm512_set1_ps(2.5e-01f), in278);
in274 = _mm512_fmadd_ps(in274, _mm512_set1_ps(4e+00f), in278);
__m512 tmp1280 = _mm512_sub_ps(tmp1279, tmp1277);
tmp1279 = _mm512_add_ps(tmp1277, tmp1279);
tmp1277 = _mm512_fmadd_ps(in273, _mm512_set1_ps(2.5e-01f), in277);
tmp1278 = _mm512_fmadd_ps(in276, _mm512_set1_ps(-1.25e+00f), tmp1278);
in276 = _mm512_fmadd_ps(in276, _mm512_set1_ps(-5e+00f), in274);
tmp1277 = _mm512_fmadd_ps(in275, _mm512_set1_ps(-1.25e+00f), tmp1277);
in278 = _mm512_fmadd_ps(tmp1277, _mm512_set1_ps(2e+00f), tmp1278);
tmp1278 = _mm512_fnmadd_ps(tmp1277, _mm512_set1_ps(2e+00f), tmp1278);
tmp1277 = _mm512_fmadd_ps(in277, _mm512_set1_ps(2.5e-01f), in273);
in273 = _mm512_sub_ps(in279, in273);
tmp1277 = _mm512_fmadd_ps(in275, _mm512_set1_ps(-1.25e+00f), tmp1277);
in275 = _mm512_sub_ps(in275, in277);
in275 = _mm512_fmadd_ps(in275, _mm512_set1_ps(5.25e+00f), in273);
in274 = _mm512_fmadd_ps(tmp1277, _mm512_set1_ps(2e+00f), in276);
in276 = _mm512_fnmadd_ps(tmp1277, _mm512_set1_ps(2e+00f), in276);
// Rearrange elements across the eight first-pass results.
// Step 1: interleave 32-bit elements of register pairs (unpacklo/unpackhi).
// NOTE(review): steps 1-4 together appear to form a block transpose so that
// the second pass runs along the other tile axis -- confirm.
__m512 tmp1297 = _mm512_unpacklo_ps(in272, tmp1279);
__m512 tmp1298 = _mm512_unpackhi_ps(in272, tmp1279);
__m512 tmp1299 = _mm512_unpacklo_ps(tmp1280, in278);
__m512 tmp1300 = _mm512_unpackhi_ps(tmp1280, in278);
__m512 tmp1301 = _mm512_unpacklo_ps(tmp1278, in274);
__m512 tmp1302 = _mm512_unpackhi_ps(tmp1278, in274);
__m512 tmp1303 = _mm512_unpacklo_ps(in276, in275);
__m512 tmp1304 = _mm512_unpackhi_ps(in276, in275);
// Step 2: within each 128-bit lane, immediate 68 keeps the low element pair
// of both operands and 238 keeps the high element pair of both operands.
__m512 tmp1305 = _mm512_shuffle_ps(tmp1297, tmp1299, 68);
__m512 tmp1306 = _mm512_shuffle_ps(tmp1297, tmp1299, 238);
__m512 tmp1307 = _mm512_shuffle_ps(tmp1298, tmp1300, 68);
__m512 tmp1308 = _mm512_shuffle_ps(tmp1298, tmp1300, 238);
__m512 tmp1309 = _mm512_shuffle_ps(tmp1301, tmp1303, 68);
__m512 tmp1310 = _mm512_shuffle_ps(tmp1301, tmp1303, 238);
__m512 tmp1311 = _mm512_shuffle_ps(tmp1302, tmp1304, 68);
__m512 tmp1312 = _mm512_shuffle_ps(tmp1302, tmp1304, 238);
// Step 3: 128-bit lane selection. Immediate 136 gathers lanes 0 and 2 of
// each operand; 221 gathers lanes 1 and 3 of each operand.
__m512 tmp1313 = _mm512_shuffle_f32x4(tmp1305, tmp1309, 136);
__m512 tmp1314 = _mm512_shuffle_f32x4(tmp1305, tmp1309, 221);
__m512 tmp1315 = _mm512_shuffle_f32x4(tmp1306, tmp1310, 136);
__m512 tmp1316 = _mm512_shuffle_f32x4(tmp1306, tmp1310, 221);
__m512 tmp1317 = _mm512_shuffle_f32x4(tmp1307, tmp1311, 136);
__m512 tmp1318 = _mm512_shuffle_f32x4(tmp1307, tmp1311, 221);
__m512 tmp1319 = _mm512_shuffle_f32x4(tmp1308, tmp1312, 136);
__m512 tmp1320 = _mm512_shuffle_f32x4(tmp1308, tmp1312, 221);
// Step 4: the same lane selection with one register as both operands, which
// splits each vector into an even-lane copy (lanes 0,2,0,2) and an odd-lane
// copy (lanes 1,3,1,3). This yields two sets of eight vectors: the reused
// first-pass names and the new tmp1281..tmp1288.
in272 = _mm512_shuffle_f32x4(tmp1313, tmp1313, 136);
__m512 tmp1281 = _mm512_shuffle_f32x4(tmp1313, tmp1313, 221);
tmp1279 = _mm512_shuffle_f32x4(tmp1315, tmp1315, 136);
__m512 tmp1282 = _mm512_shuffle_f32x4(tmp1315, tmp1315, 221);
tmp1280 = _mm512_shuffle_f32x4(tmp1317, tmp1317, 136);
__m512 tmp1283 = _mm512_shuffle_f32x4(tmp1317, tmp1317, 221);
in278 = _mm512_shuffle_f32x4(tmp1319, tmp1319, 136);
__m512 tmp1284 = _mm512_shuffle_f32x4(tmp1319, tmp1319, 221);
tmp1278 = _mm512_shuffle_f32x4(tmp1314, tmp1314, 136);
__m512 tmp1285 = _mm512_shuffle_f32x4(tmp1314, tmp1314, 221);
in274 = _mm512_shuffle_f32x4(tmp1316, tmp1316, 136);
__m512 tmp1286 = _mm512_shuffle_f32x4(tmp1316, tmp1316, 221);
in276 = _mm512_shuffle_f32x4(tmp1318, tmp1318, 136);
__m512 tmp1287 = _mm512_shuffle_f32x4(tmp1318, tmp1318, 221);
in275 = _mm512_shuffle_f32x4(tmp1320, tmp1320, 136);
__m512 tmp1288 = _mm512_shuffle_f32x4(tmp1320, tmp1320, 221);
// Second pass: the same operation sequence and coefficients as the first
// pass, applied in lockstep to the two rearranged sets.
__m512 tmp1289 = _mm512_add_ps(tmp1279, in274);
__m512 tmp1293 = _mm512_add_ps(tmp1282, tmp1286);
__m512 tmp1290 = _mm512_sub_ps(tmp1278, tmp1280);
__m512 tmp1294 = _mm512_sub_ps(tmp1285, tmp1283);
__m512 tmp1291 = _mm512_add_ps(tmp1280, in276);
__m512 tmp1295 = _mm512_add_ps(tmp1283, tmp1287);
in272 = _mm512_sub_ps(in272, in276);
tmp1281 = _mm512_sub_ps(tmp1281, tmp1287);
tmp1289 = _mm512_fmadd_ps(in278, _mm512_set1_ps(-4.25e+00f), tmp1289);
tmp1293 = _mm512_fmadd_ps(tmp1284, _mm512_set1_ps(-4.25e+00f), tmp1293);
tmp1291 = _mm512_fmadd_ps(tmp1278, _mm512_set1_ps(-4.25e+00f), tmp1291);
tmp1295 = _mm512_fmadd_ps(tmp1285, _mm512_set1_ps(-4.25e+00f), tmp1295);
in272 = _mm512_fmadd_ps(tmp1290, _mm512_set1_ps(5.25e+00f), in272);
tmp1281 = _mm512_fmadd_ps(tmp1294, _mm512_set1_ps(5.25e+00f), tmp1281);
tmp1290 = _mm512_fmadd_ps(tmp1280, _mm512_set1_ps(2.5e-01f), in276);
tmp1294 = _mm512_fmadd_ps(tmp1283, _mm512_set1_ps(2.5e-01f), tmp1287);
tmp1280 = _mm512_fmadd_ps(tmp1280, _mm512_set1_ps(4e+00f), in276);
tmp1283 = _mm512_fmadd_ps(tmp1283, _mm512_set1_ps(4e+00f), tmp1287);
__m512 tmp1292 = _mm512_sub_ps(tmp1291, tmp1289);
__m512 tmp1296 = _mm512_sub_ps(tmp1295, tmp1293);
tmp1291 = _mm512_add_ps(tmp1289, tmp1291);
tmp1295 = _mm512_add_ps(tmp1293, tmp1295);
tmp1289 = _mm512_fmadd_ps(tmp1279, _mm512_set1_ps(2.5e-01f), in274);
tmp1293 = _mm512_fmadd_ps(tmp1282, _mm512_set1_ps(2.5e-01f), tmp1286);
tmp1290 = _mm512_fmadd_ps(tmp1278, _mm512_set1_ps(-1.25e+00f), tmp1290);
tmp1294 = _mm512_fmadd_ps(tmp1285, _mm512_set1_ps(-1.25e+00f), tmp1294);
tmp1278 = _mm512_fmadd_ps(tmp1278, _mm512_set1_ps(-5e+00f), tmp1280);
tmp1285 = _mm512_fmadd_ps(tmp1285, _mm512_set1_ps(-5e+00f), tmp1283);
tmp1289 = _mm512_fmadd_ps(in278, _mm512_set1_ps(-1.25e+00f), tmp1289);
tmp1293 = _mm512_fmadd_ps(tmp1284, _mm512_set1_ps(-1.25e+00f), tmp1293);
in276 = _mm512_fmadd_ps(tmp1289, _mm512_set1_ps(2e+00f), tmp1290);
tmp1287 = _mm512_fmadd_ps(tmp1293, _mm512_set1_ps(2e+00f), tmp1294);
tmp1290 = _mm512_fnmadd_ps(tmp1289, _mm512_set1_ps(2e+00f), tmp1290);
tmp1294 = _mm512_fnmadd_ps(tmp1293, _mm512_set1_ps(2e+00f), tmp1294);
tmp1289 = _mm512_fmadd_ps(in274, _mm512_set1_ps(2.5e-01f), tmp1279);
tmp1293 = _mm512_fmadd_ps(tmp1286, _mm512_set1_ps(2.5e-01f), tmp1282);
tmp1279 = _mm512_sub_ps(in275, tmp1279);
tmp1282 = _mm512_sub_ps(tmp1288, tmp1282);
tmp1289 = _mm512_fmadd_ps(in278, _mm512_set1_ps(-1.25e+00f), tmp1289);
tmp1293 = _mm512_fmadd_ps(tmp1284, _mm512_set1_ps(-1.25e+00f), tmp1293);
in278 = _mm512_sub_ps(in278, in274);
tmp1284 = _mm512_sub_ps(tmp1284, tmp1286);
in278 = _mm512_fmadd_ps(in278, _mm512_set1_ps(5.25e+00f), tmp1279);
tmp1284 = _mm512_fmadd_ps(tmp1284, _mm512_set1_ps(5.25e+00f), tmp1282);
tmp1280 = _mm512_fmadd_ps(tmp1289, _mm512_set1_ps(2e+00f), tmp1278);
tmp1283 = _mm512_fmadd_ps(tmp1293, _mm512_set1_ps(2e+00f), tmp1285);
tmp1278 = _mm512_fnmadd_ps(tmp1289, _mm512_set1_ps(2e+00f), tmp1278);
tmp1285 = _mm512_fnmadd_ps(tmp1293, _mm512_set1_ps(2e+00f), tmp1285);
// Pair up second-pass results: immediate 68 concatenates the low 256 bits of
// both operands. Only this low-half combination is produced for this tile.
__m512 out325 = _mm512_shuffle_f32x4(in272, tmp1291, 68);
__m512 out326 = _mm512_shuffle_f32x4(tmp1292, in276, 68);
__m512 out327 = _mm512_shuffle_f32x4(tmp1290, tmp1280, 68);
__m512 out328 = _mm512_shuffle_f32x4(tmp1278, in278, 68);
__m512 out329 = _mm512_shuffle_f32x4(tmp1281, tmp1295, 68);
__m512 out330 = _mm512_shuffle_f32x4(tmp1296, tmp1287, 68);
__m512 out331 = _mm512_shuffle_f32x4(tmp1294, tmp1283, 68);
__m512 out332 = _mm512_shuffle_f32x4(tmp1285, tmp1284, 68);
// Store the eight output vectors (unaligned stores) into dfPtr2 at +256 and
// +320 within each of four groups whose base offsets are 714240 apart.
// NOTE(review): the 64-unit spacing matches sizeof(__m512), so dfPtr2 is
// presumably a byte pointer -- confirm against its declaration.
_mm512_storeu_ps(dfPtr2+256+2856960*i8+178560*j4+44544*ss1+768*k8, out325);
_mm512_storeu_ps(dfPtr2+320+2856960*i8+178560*j4+44544*ss1+768*k8, out329);
_mm512_storeu_ps(dfPtr2+714496+2856960*i8+178560*j4+44544*ss1+768*k8, out326);
_mm512_storeu_ps(dfPtr2+714560+2856960*i8+178560*j4+44544*ss1+768*k8, out330);
_mm512_storeu_ps(dfPtr2+1428736+2856960*i8+178560*j4+44544*ss1+768*k8, out327);
_mm512_storeu_ps(dfPtr2+1428800+2856960*i8+178560*j4+44544*ss1+768*k8, out331);
_mm512_storeu_ps(dfPtr2+2142976+2856960*i8+178560*j4+44544*ss1+768*k8, out328);
_mm512_storeu_ps(dfPtr2+2143040+2856960*i8+178560*j4+44544*ss1+768*k8, out332);
}
// Return from the enclosing function once j4 has reached last2; otherwise
// advance j4 and set rel2 to 1 before continuing.
if (j4 >= last2) return;
++j4;
rel2 = 1;
}
ptrdiff_t h4 = base2+6;
ptrdiff_t w4 = 12;
if (s5 < 3) {
ptrdiff_t k9 = 0;
for (; k9 != 58; ++k9) {
__m512 dat265 = _mm512_maskz_loadu_ps(2047, datPtr2+0+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat266 = _mm512_maskz_loadu_ps(16383, datPtr2+504+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512i pm38 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in280 = _mm512_permutexvar_ps(pm38, dat265);
__m512i pm39 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in288 = _mm512_permutexvar_ps(pm39, dat266);
__m512 dat267 = _mm512_maskz_loadu_ps(2047, datPtr2+92+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat268 = _mm512_maskz_loadu_ps(16383, datPtr2+596+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in281 = _mm512_permutexvar_ps(pm38, dat267);
__m512 in289 = _mm512_permutexvar_ps(pm39, dat268);
__m512 dat269 = _mm512_maskz_loadu_ps(2047, datPtr2+184+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat270 = _mm512_maskz_loadu_ps(16383, datPtr2+688+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in282 = _mm512_permutexvar_ps(pm38, dat269);
__m512 in290 = _mm512_permutexvar_ps(pm39, dat270);
__m512 dat271 = _mm512_maskz_loadu_ps(2047, datPtr2+276+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat272 = _mm512_maskz_loadu_ps(16383, datPtr2+780+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in283 = _mm512_permutexvar_ps(pm38, dat271);
__m512 in291 = _mm512_permutexvar_ps(pm39, dat272);
__m512 dat273 = _mm512_maskz_loadu_ps(2047, datPtr2+368+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat274 = _mm512_maskz_loadu_ps(16383, datPtr2+872+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in284 = _mm512_permutexvar_ps(pm38, dat273);
__m512 in292 = _mm512_permutexvar_ps(pm39, dat274);
__m512 dat275 = _mm512_maskz_loadu_ps(2047, datPtr2+460+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat276 = _mm512_maskz_loadu_ps(16383, datPtr2+964+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in285 = _mm512_permutexvar_ps(pm38, dat275);
__m512 in293 = _mm512_permutexvar_ps(pm39, dat276);
__m512 dat277 = _mm512_maskz_loadu_ps(2047, datPtr2+552+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat278 = _mm512_maskz_loadu_ps(16383, datPtr2+1056+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in286 = _mm512_permutexvar_ps(pm38, dat277);
__m512 in294 = _mm512_permutexvar_ps(pm39, dat278);
__m512 dat279 = _mm512_maskz_loadu_ps(2047, datPtr2+644+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat280 = _mm512_maskz_loadu_ps(16383, datPtr2+1148+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in287 = _mm512_permutexvar_ps(pm38, dat279);
__m512 in295 = _mm512_permutexvar_ps(pm39, dat280);
__m512 tmp1321 = _mm512_add_ps(in281, in285);
__m512 tmp1325 = _mm512_add_ps(in289, in293);
__m512 tmp1322 = _mm512_sub_ps(in284, in282);
__m512 tmp1326 = _mm512_sub_ps(in292, in290);
__m512 tmp1323 = _mm512_add_ps(in282, in286);
__m512 tmp1327 = _mm512_add_ps(in290, in294);
in280 = _mm512_sub_ps(in280, in286);
in288 = _mm512_sub_ps(in288, in294);
tmp1321 = _mm512_fmadd_ps(in283, _mm512_set1_ps(-4.25e+00f), tmp1321);
tmp1325 = _mm512_fmadd_ps(in291, _mm512_set1_ps(-4.25e+00f), tmp1325);
tmp1323 = _mm512_fmadd_ps(in284, _mm512_set1_ps(-4.25e+00f), tmp1323);
tmp1327 = _mm512_fmadd_ps(in292, _mm512_set1_ps(-4.25e+00f), tmp1327);
in280 = _mm512_fmadd_ps(tmp1322, _mm512_set1_ps(5.25e+00f), in280);
in288 = _mm512_fmadd_ps(tmp1326, _mm512_set1_ps(5.25e+00f), in288);
tmp1322 = _mm512_fmadd_ps(in282, _mm512_set1_ps(2.5e-01f), in286);
tmp1326 = _mm512_fmadd_ps(in290, _mm512_set1_ps(2.5e-01f), in294);
in282 = _mm512_fmadd_ps(in282, _mm512_set1_ps(4e+00f), in286);
in290 = _mm512_fmadd_ps(in290, _mm512_set1_ps(4e+00f), in294);
__m512 tmp1324 = _mm512_sub_ps(tmp1323, tmp1321);
__m512 tmp1328 = _mm512_sub_ps(tmp1327, tmp1325);
tmp1323 = _mm512_add_ps(tmp1321, tmp1323);
tmp1327 = _mm512_add_ps(tmp1325, tmp1327);
tmp1321 = _mm512_fmadd_ps(in281, _mm512_set1_ps(2.5e-01f), in285);
tmp1325 = _mm512_fmadd_ps(in289, _mm512_set1_ps(2.5e-01f), in293);
tmp1322 = _mm512_fmadd_ps(in284, _mm512_set1_ps(-1.25e+00f), tmp1322);
tmp1326 = _mm512_fmadd_ps(in292, _mm512_set1_ps(-1.25e+00f), tmp1326);
in284 = _mm512_fmadd_ps(in284, _mm512_set1_ps(-5e+00f), in282);
in292 = _mm512_fmadd_ps(in292, _mm512_set1_ps(-5e+00f), in290);
tmp1321 = _mm512_fmadd_ps(in283, _mm512_set1_ps(-1.25e+00f), tmp1321);
tmp1325 = _mm512_fmadd_ps(in291, _mm512_set1_ps(-1.25e+00f), tmp1325);
in286 = _mm512_fmadd_ps(tmp1321, _mm512_set1_ps(2e+00f), tmp1322);
in294 = _mm512_fmadd_ps(tmp1325, _mm512_set1_ps(2e+00f), tmp1326);
tmp1322 = _mm512_fnmadd_ps(tmp1321, _mm512_set1_ps(2e+00f), tmp1322);
tmp1326 = _mm512_fnmadd_ps(tmp1325, _mm512_set1_ps(2e+00f), tmp1326);
tmp1321 = _mm512_fmadd_ps(in285, _mm512_set1_ps(2.5e-01f), in281);
tmp1325 = _mm512_fmadd_ps(in293, _mm512_set1_ps(2.5e-01f), in289);
in281 = _mm512_sub_ps(in287, in281);
in289 = _mm512_sub_ps(in295, in289);
tmp1321 = _mm512_fmadd_ps(in283, _mm512_set1_ps(-1.25e+00f), tmp1321);
tmp1325 = _mm512_fmadd_ps(in291, _mm512_set1_ps(-1.25e+00f), tmp1325);
in283 = _mm512_sub_ps(in283, in285);
in291 = _mm512_sub_ps(in291, in293);
in283 = _mm512_fmadd_ps(in283, _mm512_set1_ps(5.25e+00f), in281);
in291 = _mm512_fmadd_ps(in291, _mm512_set1_ps(5.25e+00f), in289);
in282 = _mm512_fmadd_ps(tmp1321, _mm512_set1_ps(2e+00f), in284);
in290 = _mm512_fmadd_ps(tmp1325, _mm512_set1_ps(2e+00f), in292);
in284 = _mm512_fnmadd_ps(tmp1321, _mm512_set1_ps(2e+00f), in284);
in292 = _mm512_fnmadd_ps(tmp1325, _mm512_set1_ps(2e+00f), in292);
__m512 tmp1337 = _mm512_unpacklo_ps(in280, tmp1323);
__m512 tmp1338 = _mm512_unpackhi_ps(in280, tmp1323);
__m512 tmp1339 = _mm512_unpacklo_ps(tmp1324, in286);
__m512 tmp1340 = _mm512_unpackhi_ps(tmp1324, in286);
__m512 tmp1341 = _mm512_unpacklo_ps(tmp1322, in282);
__m512 tmp1342 = _mm512_unpackhi_ps(tmp1322, in282);
__m512 tmp1343 = _mm512_unpacklo_ps(in284, in283);
__m512 tmp1344 = _mm512_unpackhi_ps(in284, in283);
__m512 tmp1345 = _mm512_unpacklo_ps(in288, tmp1327);
__m512 tmp1346 = _mm512_unpackhi_ps(in288, tmp1327);
__m512 tmp1347 = _mm512_unpacklo_ps(tmp1328, in294);
__m512 tmp1348 = _mm512_unpackhi_ps(tmp1328, in294);
__m512 tmp1349 = _mm512_unpacklo_ps(tmp1326, in290);
__m512 tmp1350 = _mm512_unpackhi_ps(tmp1326, in290);
__m512 tmp1351 = _mm512_unpacklo_ps(in292, in291);
__m512 tmp1352 = _mm512_unpackhi_ps(in292, in291);
__m512 tmp1353 = _mm512_shuffle_ps(tmp1337, tmp1339, 68);
__m512 tmp1354 = _mm512_shuffle_ps(tmp1337, tmp1339, 238);
__m512 tmp1355 = _mm512_shuffle_ps(tmp1338, tmp1340, 68);
__m512 tmp1356 = _mm512_shuffle_ps(tmp1338, tmp1340, 238);
__m512 tmp1357 = _mm512_shuffle_ps(tmp1341, tmp1343, 68);
__m512 tmp1358 = _mm512_shuffle_ps(tmp1341, tmp1343, 238);
__m512 tmp1359 = _mm512_shuffle_ps(tmp1342, tmp1344, 68);
__m512 tmp1360 = _mm512_shuffle_ps(tmp1342, tmp1344, 238);
__m512 tmp1361 = _mm512_shuffle_ps(tmp1345, tmp1347, 68);
__m512 tmp1362 = _mm512_shuffle_ps(tmp1345, tmp1347, 238);
__m512 tmp1363 = _mm512_shuffle_ps(tmp1346, tmp1348, 68);
__m512 tmp1364 = _mm512_shuffle_ps(tmp1346, tmp1348, 238);
__m512 tmp1365 = _mm512_shuffle_ps(tmp1349, tmp1351, 68);
__m512 tmp1366 = _mm512_shuffle_ps(tmp1349, tmp1351, 238);
__m512 tmp1367 = _mm512_shuffle_ps(tmp1350, tmp1352, 68);
__m512 tmp1368 = _mm512_shuffle_ps(tmp1350, tmp1352, 238);
__m512 tmp1369 = _mm512_shuffle_f32x4(tmp1353, tmp1357, 136);
__m512 tmp1370 = _mm512_shuffle_f32x4(tmp1353, tmp1357, 221);
__m512 tmp1371 = _mm512_shuffle_f32x4(tmp1354, tmp1358, 136);
__m512 tmp1372 = _mm512_shuffle_f32x4(tmp1354, tmp1358, 221);
__m512 tmp1373 = _mm512_shuffle_f32x4(tmp1355, tmp1359, 136);
__m512 tmp1374 = _mm512_shuffle_f32x4(tmp1355, tmp1359, 221);
__m512 tmp1375 = _mm512_shuffle_f32x4(tmp1356, tmp1360, 136);
__m512 tmp1376 = _mm512_shuffle_f32x4(tmp1356, tmp1360, 221);
__m512 tmp1377 = _mm512_shuffle_f32x4(tmp1361, tmp1365, 136);
__m512 tmp1378 = _mm512_shuffle_f32x4(tmp1361, tmp1365, 221);
__m512 tmp1379 = _mm512_shuffle_f32x4(tmp1362, tmp1366, 136);
__m512 tmp1380 = _mm512_shuffle_f32x4(tmp1362, tmp1366, 221);
__m512 tmp1381 = _mm512_shuffle_f32x4(tmp1363, tmp1367, 136);
__m512 tmp1382 = _mm512_shuffle_f32x4(tmp1363, tmp1367, 221);
__m512 tmp1383 = _mm512_shuffle_f32x4(tmp1364, tmp1368, 136);
__m512 tmp1384 = _mm512_shuffle_f32x4(tmp1364, tmp1368, 221);
in280 = _mm512_shuffle_f32x4(tmp1369, tmp1377, 136);
in288 = _mm512_shuffle_f32x4(tmp1369, tmp1377, 221);
tmp1323 = _mm512_shuffle_f32x4(tmp1371, tmp1379, 136);
tmp1327 = _mm512_shuffle_f32x4(tmp1371, tmp1379, 221);
tmp1324 = _mm512_shuffle_f32x4(tmp1373, tmp1381, 136);
tmp1328 = _mm512_shuffle_f32x4(tmp1373, tmp1381, 221);
in286 = _mm512_shuffle_f32x4(tmp1375, tmp1383, 136);
in294 = _mm512_shuffle_f32x4(tmp1375, tmp1383, 221);
tmp1322 = _mm512_shuffle_f32x4(tmp1370, tmp1378, 136);
tmp1326 = _mm512_shuffle_f32x4(tmp1370, tmp1378, 221);
in282 = _mm512_shuffle_f32x4(tmp1372, tmp1380, 136);
in290 = _mm512_shuffle_f32x4(tmp1372, tmp1380, 221);
in284 = _mm512_shuffle_f32x4(tmp1374, tmp1382, 136);
in292 = _mm512_shuffle_f32x4(tmp1374, tmp1382, 221);
in283 = _mm512_shuffle_f32x4(tmp1376, tmp1384, 136);
in291 = _mm512_shuffle_f32x4(tmp1376, tmp1384, 221);
__m512 tmp1329 = _mm512_add_ps(tmp1323, in282);
__m512 tmp1333 = _mm512_add_ps(tmp1327, in290);
__m512 tmp1330 = _mm512_sub_ps(tmp1322, tmp1324);
__m512 tmp1334 = _mm512_sub_ps(tmp1326, tmp1328);
__m512 tmp1331 = _mm512_add_ps(tmp1324, in284);
__m512 tmp1335 = _mm512_add_ps(tmp1328, in292);
in280 = _mm512_sub_ps(in280, in284);
in288 = _mm512_sub_ps(in288, in292);
tmp1329 = _mm512_fmadd_ps(in286, _mm512_set1_ps(-4.25e+00f), tmp1329);
tmp1333 = _mm512_fmadd_ps(in294, _mm512_set1_ps(-4.25e+00f), tmp1333);
tmp1331 = _mm512_fmadd_ps(tmp1322, _mm512_set1_ps(-4.25e+00f), tmp1331);
tmp1335 = _mm512_fmadd_ps(tmp1326, _mm512_set1_ps(-4.25e+00f), tmp1335);
in280 = _mm512_fmadd_ps(tmp1330, _mm512_set1_ps(5.25e+00f), in280);
in288 = _mm512_fmadd_ps(tmp1334, _mm512_set1_ps(5.25e+00f), in288);
tmp1330 = _mm512_fmadd_ps(tmp1324, _mm512_set1_ps(2.5e-01f), in284);
tmp1334 = _mm512_fmadd_ps(tmp1328, _mm512_set1_ps(2.5e-01f), in292);
tmp1324 = _mm512_fmadd_ps(tmp1324, _mm512_set1_ps(4e+00f), in284);
tmp1328 = _mm512_fmadd_ps(tmp1328, _mm512_set1_ps(4e+00f), in292);
__m512 tmp1332 = _mm512_sub_ps(tmp1331, tmp1329);
__m512 tmp1336 = _mm512_sub_ps(tmp1335, tmp1333);
tmp1331 = _mm512_add_ps(tmp1329, tmp1331);
tmp1335 = _mm512_add_ps(tmp1333, tmp1335);
tmp1329 = _mm512_fmadd_ps(tmp1323, _mm512_set1_ps(2.5e-01f), in282);
tmp1333 = _mm512_fmadd_ps(tmp1327, _mm512_set1_ps(2.5e-01f), in290);
tmp1330 = _mm512_fmadd_ps(tmp1322, _mm512_set1_ps(-1.25e+00f), tmp1330);
tmp1334 = _mm512_fmadd_ps(tmp1326, _mm512_set1_ps(-1.25e+00f), tmp1334);
tmp1322 = _mm512_fmadd_ps(tmp1322, _mm512_set1_ps(-5e+00f), tmp1324);
tmp1326 = _mm512_fmadd_ps(tmp1326, _mm512_set1_ps(-5e+00f), tmp1328);
tmp1329 = _mm512_fmadd_ps(in286, _mm512_set1_ps(-1.25e+00f), tmp1329);
tmp1333 = _mm512_fmadd_ps(in294, _mm512_set1_ps(-1.25e+00f), tmp1333);
in284 = _mm512_fmadd_ps(tmp1329, _mm512_set1_ps(2e+00f), tmp1330);
in292 = _mm512_fmadd_ps(tmp1333, _mm512_set1_ps(2e+00f), tmp1334);
tmp1330 = _mm512_fnmadd_ps(tmp1329, _mm512_set1_ps(2e+00f), tmp1330);
tmp1334 = _mm512_fnmadd_ps(tmp1333, _mm512_set1_ps(2e+00f), tmp1334);
tmp1329 = _mm512_fmadd_ps(in282, _mm512_set1_ps(2.5e-01f), tmp1323);
tmp1333 = _mm512_fmadd_ps(in290, _mm512_set1_ps(2.5e-01f), tmp1327);
tmp1323 = _mm512_sub_ps(in283, tmp1323);
tmp1327 = _mm512_sub_ps(in291, tmp1327);
tmp1329 = _mm512_fmadd_ps(in286, _mm512_set1_ps(-1.25e+00f), tmp1329);
tmp1333 = _mm512_fmadd_ps(in294, _mm512_set1_ps(-1.25e+00f), tmp1333);
in286 = _mm512_sub_ps(in286, in282);
in294 = _mm512_sub_ps(in294, in290);
in286 = _mm512_fmadd_ps(in286, _mm512_set1_ps(5.25e+00f), tmp1323);
in294 = _mm512_fmadd_ps(in294, _mm512_set1_ps(5.25e+00f), tmp1327);
tmp1324 = _mm512_fmadd_ps(tmp1329, _mm512_set1_ps(2e+00f), tmp1322);
tmp1328 = _mm512_fmadd_ps(tmp1333, _mm512_set1_ps(2e+00f), tmp1326);
tmp1322 = _mm512_fnmadd_ps(tmp1329, _mm512_set1_ps(2e+00f), tmp1322);
tmp1326 = _mm512_fnmadd_ps(tmp1333, _mm512_set1_ps(2e+00f), tmp1326);
__m512 out333 = _mm512_shuffle_f32x4(in280, tmp1331, 68);
__m512 out341 = _mm512_shuffle_f32x4(in280, tmp1331, 238);
__m512 out334 = _mm512_shuffle_f32x4(tmp1332, in284, 68);
__m512 out342 = _mm512_shuffle_f32x4(tmp1332, in284, 238);
__m512 out335 = _mm512_shuffle_f32x4(tmp1330, tmp1324, 68);
__m512 out343 = _mm512_shuffle_f32x4(tmp1330, tmp1324, 238);
__m512 out336 = _mm512_shuffle_f32x4(tmp1322, in286, 68);
__m512 out344 = _mm512_shuffle_f32x4(tmp1322, in286, 238);
__m512 out337 = _mm512_shuffle_f32x4(in288, tmp1335, 68);
__m512 out345 = _mm512_shuffle_f32x4(in288, tmp1335, 238);
__m512 out338 = _mm512_shuffle_f32x4(tmp1336, in292, 68);
__m512 out346 = _mm512_shuffle_f32x4(tmp1336, in292, 238);
__m512 out339 = _mm512_shuffle_f32x4(tmp1334, tmp1328, 68);
__m512 out347 = _mm512_shuffle_f32x4(tmp1334, tmp1328, 238);
__m512 out340 = _mm512_shuffle_f32x4(tmp1326, in294, 68);
__m512 out348 = _mm512_shuffle_f32x4(tmp1326, in294, 238);
_mm512_storeu_ps(dfPtr2+0+2856960*i8+178560*j4+44544*s5+768*k9, out333);
_mm512_storeu_ps(dfPtr2+128+2856960*i8+178560*j4+44544*s5+768*k9, out341);
_mm512_storeu_ps(dfPtr2+64+2856960*i8+178560*j4+44544*s5+768*k9, out337);
_mm512_storeu_ps(dfPtr2+192+2856960*i8+178560*j4+44544*s5+768*k9, out345);
_mm512_storeu_ps(dfPtr2+714240+2856960*i8+178560*j4+44544*s5+768*k9, out334);
_mm512_storeu_ps(dfPtr2+714368+2856960*i8+178560*j4+44544*s5+768*k9, out342);
_mm512_storeu_ps(dfPtr2+714304+2856960*i8+178560*j4+44544*s5+768*k9, out338);
_mm512_storeu_ps(dfPtr2+714432+2856960*i8+178560*j4+44544*s5+768*k9, out346);
_mm512_storeu_ps(dfPtr2+1428480+2856960*i8+178560*j4+44544*s5+768*k9, out335);
_mm512_storeu_ps(dfPtr2+1428608+2856960*i8+178560*j4+44544*s5+768*k9, out343);
_mm512_storeu_ps(dfPtr2+1428544+2856960*i8+178560*j4+44544*s5+768*k9, out339);
_mm512_storeu_ps(dfPtr2+1428672+2856960*i8+178560*j4+44544*s5+768*k9, out347);
_mm512_storeu_ps(dfPtr2+2142720+2856960*i8+178560*j4+44544*s5+768*k9, out336);
_mm512_storeu_ps(dfPtr2+2142848+2856960*i8+178560*j4+44544*s5+768*k9, out344);
_mm512_storeu_ps(dfPtr2+2142784+2856960*i8+178560*j4+44544*s5+768*k9, out340);
_mm512_storeu_ps(dfPtr2+2142912+2856960*i8+178560*j4+44544*s5+768*k9, out348);
__m512 dat281 = _mm512_maskz_loadu_ps(2047, datPtr2+552+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat282 = _mm512_maskz_loadu_ps(2047, datPtr2+3496+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512i pm40 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in296 = _mm512_permutexvar_ps(pm40, dat281);
__m512 in304 = _mm512_permutexvar_ps(pm40, dat282);
__m512 dat283 = _mm512_maskz_loadu_ps(2047, datPtr2+644+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat284 = _mm512_maskz_loadu_ps(2047, datPtr2+3588+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in297 = _mm512_permutexvar_ps(pm40, dat283);
__m512 in305 = _mm512_permutexvar_ps(pm40, dat284);
__m512 dat285 = _mm512_maskz_loadu_ps(2047, datPtr2+736+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat286 = _mm512_maskz_loadu_ps(2047, datPtr2+3680+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in298 = _mm512_permutexvar_ps(pm40, dat285);
__m512 in306 = _mm512_permutexvar_ps(pm40, dat286);
__m512 dat287 = _mm512_maskz_loadu_ps(2047, datPtr2+828+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat288 = _mm512_maskz_loadu_ps(2047, datPtr2+3772+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in299 = _mm512_permutexvar_ps(pm40, dat287);
__m512 in307 = _mm512_permutexvar_ps(pm40, dat288);
__m512 dat289 = _mm512_maskz_loadu_ps(2047, datPtr2+920+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat290 = _mm512_maskz_loadu_ps(2047, datPtr2+3864+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in300 = _mm512_permutexvar_ps(pm40, dat289);
__m512 in308 = _mm512_permutexvar_ps(pm40, dat290);
__m512 dat291 = _mm512_maskz_loadu_ps(2047, datPtr2+1012+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat292 = _mm512_maskz_loadu_ps(2047, datPtr2+3956+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in301 = _mm512_permutexvar_ps(pm40, dat291);
__m512 in309 = _mm512_permutexvar_ps(pm40, dat292);
__m512 dat293 = _mm512_maskz_loadu_ps(2047, datPtr2+1104+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat294 = _mm512_maskz_loadu_ps(2047, datPtr2+4048+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in302 = _mm512_permutexvar_ps(pm40, dat293);
__m512 in310 = _mm512_permutexvar_ps(pm40, dat294);
__m512 dat295 = _mm512_maskz_loadu_ps(2047, datPtr2+1196+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat296 = _mm512_maskz_loadu_ps(2047, datPtr2+4140+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in303 = _mm512_permutexvar_ps(pm40, dat295);
__m512 in311 = _mm512_permutexvar_ps(pm40, dat296);
__m512 tmp1385 = _mm512_add_ps(in297, in301);
__m512 tmp1389 = _mm512_add_ps(in305, in309);
__m512 tmp1386 = _mm512_sub_ps(in300, in298);
__m512 tmp1390 = _mm512_sub_ps(in308, in306);
__m512 tmp1387 = _mm512_add_ps(in298, in302);
__m512 tmp1391 = _mm512_add_ps(in306, in310);
in296 = _mm512_sub_ps(in296, in302);
in304 = _mm512_sub_ps(in304, in310);
tmp1385 = _mm512_fmadd_ps(in299, _mm512_set1_ps(-4.25e+00f), tmp1385);
tmp1389 = _mm512_fmadd_ps(in307, _mm512_set1_ps(-4.25e+00f), tmp1389);
tmp1387 = _mm512_fmadd_ps(in300, _mm512_set1_ps(-4.25e+00f), tmp1387);
tmp1391 = _mm512_fmadd_ps(in308, _mm512_set1_ps(-4.25e+00f), tmp1391);
in296 = _mm512_fmadd_ps(tmp1386, _mm512_set1_ps(5.25e+00f), in296);
in304 = _mm512_fmadd_ps(tmp1390, _mm512_set1_ps(5.25e+00f), in304);
tmp1386 = _mm512_fmadd_ps(in298, _mm512_set1_ps(2.5e-01f), in302);
tmp1390 = _mm512_fmadd_ps(in306, _mm512_set1_ps(2.5e-01f), in310);
in298 = _mm512_fmadd_ps(in298, _mm512_set1_ps(4e+00f), in302);
in306 = _mm512_fmadd_ps(in306, _mm512_set1_ps(4e+00f), in310);
__m512 tmp1388 = _mm512_sub_ps(tmp1387, tmp1385);
__m512 tmp1392 = _mm512_sub_ps(tmp1391, tmp1389);
tmp1387 = _mm512_add_ps(tmp1385, tmp1387);
tmp1391 = _mm512_add_ps(tmp1389, tmp1391);
tmp1385 = _mm512_fmadd_ps(in297, _mm512_set1_ps(2.5e-01f), in301);
tmp1389 = _mm512_fmadd_ps(in305, _mm512_set1_ps(2.5e-01f), in309);
tmp1386 = _mm512_fmadd_ps(in300, _mm512_set1_ps(-1.25e+00f), tmp1386);
tmp1390 = _mm512_fmadd_ps(in308, _mm512_set1_ps(-1.25e+00f), tmp1390);
in300 = _mm512_fmadd_ps(in300, _mm512_set1_ps(-5e+00f), in298);
in308 = _mm512_fmadd_ps(in308, _mm512_set1_ps(-5e+00f), in306);
tmp1385 = _mm512_fmadd_ps(in299, _mm512_set1_ps(-1.25e+00f), tmp1385);
tmp1389 = _mm512_fmadd_ps(in307, _mm512_set1_ps(-1.25e+00f), tmp1389);
in302 = _mm512_fmadd_ps(tmp1385, _mm512_set1_ps(2e+00f), tmp1386);
in310 = _mm512_fmadd_ps(tmp1389, _mm512_set1_ps(2e+00f), tmp1390);
tmp1386 = _mm512_fnmadd_ps(tmp1385, _mm512_set1_ps(2e+00f), tmp1386);
tmp1390 = _mm512_fnmadd_ps(tmp1389, _mm512_set1_ps(2e+00f), tmp1390);
tmp1385 = _mm512_fmadd_ps(in301, _mm512_set1_ps(2.5e-01f), in297);
tmp1389 = _mm512_fmadd_ps(in309, _mm512_set1_ps(2.5e-01f), in305);
in297 = _mm512_sub_ps(in303, in297);
in305 = _mm512_sub_ps(in311, in305);
tmp1385 = _mm512_fmadd_ps(in299, _mm512_set1_ps(-1.25e+00f), tmp1385);
tmp1389 = _mm512_fmadd_ps(in307, _mm512_set1_ps(-1.25e+00f), tmp1389);
in299 = _mm512_sub_ps(in299, in301);
in307 = _mm512_sub_ps(in307, in309);
in299 = _mm512_fmadd_ps(in299, _mm512_set1_ps(5.25e+00f), in297);
in307 = _mm512_fmadd_ps(in307, _mm512_set1_ps(5.25e+00f), in305);
in298 = _mm512_fmadd_ps(tmp1385, _mm512_set1_ps(2e+00f), in300);
in306 = _mm512_fmadd_ps(tmp1389, _mm512_set1_ps(2e+00f), in308);
in300 = _mm512_fnmadd_ps(tmp1385, _mm512_set1_ps(2e+00f), in300);
in308 = _mm512_fnmadd_ps(tmp1389, _mm512_set1_ps(2e+00f), in308);
__m512 tmp1402 = _mm512_unpacklo_ps(in296, tmp1387);
__m512 tmp1403 = _mm512_unpackhi_ps(in296, tmp1387);
__m512 tmp1404 = _mm512_unpacklo_ps(tmp1388, in302);
__m512 tmp1405 = _mm512_unpackhi_ps(tmp1388, in302);
__m512 tmp1406 = _mm512_unpacklo_ps(tmp1386, in298);
__m512 tmp1407 = _mm512_unpackhi_ps(tmp1386, in298);
__m512 tmp1408 = _mm512_unpacklo_ps(in300, in299);
__m512 tmp1409 = _mm512_unpackhi_ps(in300, in299);
__m512 tmp1410 = _mm512_unpacklo_ps(in304, tmp1391);
__m512 tmp1411 = _mm512_unpackhi_ps(in304, tmp1391);
__m512 tmp1412 = _mm512_unpacklo_ps(tmp1392, in310);
__m512 tmp1413 = _mm512_unpackhi_ps(tmp1392, in310);
__m512 tmp1414 = _mm512_unpacklo_ps(tmp1390, in306);
__m512 tmp1415 = _mm512_unpackhi_ps(tmp1390, in306);
__m512 tmp1416 = _mm512_unpacklo_ps(in308, in307);
__m512 tmp1417 = _mm512_unpackhi_ps(in308, in307);
__m512 tmp1418 = _mm512_shuffle_ps(tmp1402, tmp1404, 68);
__m512 tmp1419 = _mm512_shuffle_ps(tmp1402, tmp1404, 238);
__m512 tmp1420 = _mm512_shuffle_ps(tmp1403, tmp1405, 68);
__m512 tmp1421 = _mm512_shuffle_ps(tmp1403, tmp1405, 238);
__m512 tmp1422 = _mm512_shuffle_ps(tmp1406, tmp1408, 68);
__m512 tmp1423 = _mm512_shuffle_ps(tmp1406, tmp1408, 238);
__m512 tmp1424 = _mm512_shuffle_ps(tmp1407, tmp1409, 68);
__m512 tmp1425 = _mm512_shuffle_ps(tmp1407, tmp1409, 238);
__m512 tmp1426 = _mm512_shuffle_ps(tmp1410, tmp1412, 68);
__m512 tmp1427 = _mm512_shuffle_ps(tmp1410, tmp1412, 238);
__m512 tmp1428 = _mm512_shuffle_ps(tmp1411, tmp1413, 68);
__m512 tmp1429 = _mm512_shuffle_ps(tmp1411, tmp1413, 238);
__m512 tmp1430 = _mm512_shuffle_ps(tmp1414, tmp1416, 68);
__m512 tmp1431 = _mm512_shuffle_ps(tmp1414, tmp1416, 238);
__m512 tmp1432 = _mm512_shuffle_ps(tmp1415, tmp1417, 68);
__m512 tmp1433 = _mm512_shuffle_ps(tmp1415, tmp1417, 238);
__m512 tmp1434 = _mm512_shuffle_f32x4(tmp1418, tmp1422, 136);
__m512 tmp1435 = _mm512_shuffle_f32x4(tmp1418, tmp1422, 221);
__m512 tmp1436 = _mm512_shuffle_f32x4(tmp1419, tmp1423, 136);
__m512 tmp1437 = _mm512_shuffle_f32x4(tmp1419, tmp1423, 221);
__m512 tmp1438 = _mm512_shuffle_f32x4(tmp1420, tmp1424, 136);
__m512 tmp1439 = _mm512_shuffle_f32x4(tmp1420, tmp1424, 221);
__m512 tmp1440 = _mm512_shuffle_f32x4(tmp1421, tmp1425, 136);
__m512 tmp1441 = _mm512_shuffle_f32x4(tmp1421, tmp1425, 221);
__m512 tmp1442 = _mm512_shuffle_f32x4(tmp1426, tmp1430, 136);
__m512 tmp1443 = _mm512_shuffle_f32x4(tmp1426, tmp1430, 221);
__m512 tmp1444 = _mm512_shuffle_f32x4(tmp1427, tmp1431, 136);
__m512 tmp1445 = _mm512_shuffle_f32x4(tmp1427, tmp1431, 221);
__m512 tmp1446 = _mm512_shuffle_f32x4(tmp1428, tmp1432, 136);
__m512 tmp1447 = _mm512_shuffle_f32x4(tmp1428, tmp1432, 221);
__m512 tmp1448 = _mm512_shuffle_f32x4(tmp1429, tmp1433, 136);
__m512 tmp1449 = _mm512_shuffle_f32x4(tmp1429, tmp1433, 221);
in296 = _mm512_shuffle_f32x4(tmp1434, tmp1442, 136);
in304 = _mm512_shuffle_f32x4(tmp1434, tmp1442, 221);
tmp1387 = _mm512_shuffle_f32x4(tmp1436, tmp1444, 136);
tmp1391 = _mm512_shuffle_f32x4(tmp1436, tmp1444, 221);
tmp1388 = _mm512_shuffle_f32x4(tmp1438, tmp1446, 136);
tmp1392 = _mm512_shuffle_f32x4(tmp1438, tmp1446, 221);
in302 = _mm512_shuffle_f32x4(tmp1440, tmp1448, 136);
in310 = _mm512_shuffle_f32x4(tmp1440, tmp1448, 221);
tmp1386 = _mm512_shuffle_f32x4(tmp1435, tmp1443, 136);
tmp1390 = _mm512_shuffle_f32x4(tmp1435, tmp1443, 221);
in298 = _mm512_shuffle_f32x4(tmp1437, tmp1445, 136);
in300 = _mm512_shuffle_f32x4(tmp1439, tmp1447, 136);
in299 = _mm512_shuffle_f32x4(tmp1441, tmp1449, 136);
__m512 tmp1393 = _mm512_add_ps(tmp1387, in298);
__m512 tmp1397 = tmp1391;
__m512 tmp1394 = _mm512_sub_ps(tmp1386, tmp1388);
__m512 tmp1398 = _mm512_sub_ps(tmp1390, tmp1392);
__m512 tmp1395 = _mm512_add_ps(tmp1388, in300);
__m512 tmp1399 = tmp1392;
in296 = _mm512_sub_ps(in296, in300);
in304 = in304;
tmp1393 = _mm512_fmadd_ps(in302, _mm512_set1_ps(-4.25e+00f), tmp1393);
tmp1397 = _mm512_fmadd_ps(in310, _mm512_set1_ps(-4.25e+00f), tmp1397);
tmp1395 = _mm512_fmadd_ps(tmp1386, _mm512_set1_ps(-4.25e+00f), tmp1395);
tmp1399 = _mm512_fmadd_ps(tmp1390, _mm512_set1_ps(-4.25e+00f), tmp1399);
in296 = _mm512_fmadd_ps(tmp1394, _mm512_set1_ps(5.25e+00f), in296);
in304 = _mm512_fmadd_ps(tmp1398, _mm512_set1_ps(5.25e+00f), in304);
tmp1394 = _mm512_fmadd_ps(tmp1388, _mm512_set1_ps(2.5e-01f), in300);
tmp1398 = _mm512_mul_ps(tmp1392, _mm512_set1_ps(2.5e-01f));
tmp1388 = _mm512_fmadd_ps(tmp1388, _mm512_set1_ps(4e+00f), in300);
tmp1392 = _mm512_mul_ps(tmp1392, _mm512_set1_ps(4e+00f));
__m512 tmp1396 = _mm512_sub_ps(tmp1395, tmp1393);
__m512 tmp1400 = _mm512_sub_ps(tmp1399, tmp1397);
tmp1395 = _mm512_add_ps(tmp1393, tmp1395);
tmp1399 = _mm512_add_ps(tmp1397, tmp1399);
tmp1393 = _mm512_fmadd_ps(tmp1387, _mm512_set1_ps(2.5e-01f), in298);
tmp1397 = _mm512_mul_ps(tmp1391, _mm512_set1_ps(2.5e-01f));
tmp1394 = _mm512_fmadd_ps(tmp1386, _mm512_set1_ps(-1.25e+00f), tmp1394);
tmp1398 = _mm512_fmadd_ps(tmp1390, _mm512_set1_ps(-1.25e+00f), tmp1398);
tmp1386 = _mm512_fmadd_ps(tmp1386, _mm512_set1_ps(-5e+00f), tmp1388);
tmp1390 = _mm512_fmadd_ps(tmp1390, _mm512_set1_ps(-5e+00f), tmp1392);
tmp1393 = _mm512_fmadd_ps(in302, _mm512_set1_ps(-1.25e+00f), tmp1393);
tmp1397 = _mm512_fmadd_ps(in310, _mm512_set1_ps(-1.25e+00f), tmp1397);
in300 = _mm512_fmadd_ps(tmp1393, _mm512_set1_ps(2e+00f), tmp1394);
__m512 tmp1401 = _mm512_fmadd_ps(tmp1397, _mm512_set1_ps(2e+00f), tmp1398);
tmp1394 = _mm512_fnmadd_ps(tmp1393, _mm512_set1_ps(2e+00f), tmp1394);
tmp1398 = _mm512_fnmadd_ps(tmp1397, _mm512_set1_ps(2e+00f), tmp1398);
tmp1393 = _mm512_fmadd_ps(in298, _mm512_set1_ps(2.5e-01f), tmp1387);
tmp1397 = tmp1391;
tmp1387 = _mm512_sub_ps(in299, tmp1387);
tmp1391 = _mm512_sub_ps(_mm512_setzero_ps(), tmp1391);
tmp1393 = _mm512_fmadd_ps(in302, _mm512_set1_ps(-1.25e+00f), tmp1393);
tmp1397 = _mm512_fmadd_ps(in310, _mm512_set1_ps(-1.25e+00f), tmp1397);
in302 = _mm512_sub_ps(in302, in298);
in310 = in310;
in302 = _mm512_fmadd_ps(in302, _mm512_set1_ps(5.25e+00f), tmp1387);
in310 = _mm512_fmadd_ps(in310, _mm512_set1_ps(5.25e+00f), tmp1391);
tmp1388 = _mm512_fmadd_ps(tmp1393, _mm512_set1_ps(2e+00f), tmp1386);
tmp1392 = _mm512_fmadd_ps(tmp1397, _mm512_set1_ps(2e+00f), tmp1390);
tmp1386 = _mm512_fnmadd_ps(tmp1393, _mm512_set1_ps(2e+00f), tmp1386);
tmp1390 = _mm512_fnmadd_ps(tmp1397, _mm512_set1_ps(2e+00f), tmp1390);
__m512 out349 = _mm512_shuffle_f32x4(in296, tmp1395, 68);
__m512 out357 = _mm512_shuffle_f32x4(in296, tmp1395, 238);
__m512 out350 = _mm512_shuffle_f32x4(tmp1396, in300, 68);
__m512 out358 = _mm512_shuffle_f32x4(tmp1396, in300, 238);
__m512 out351 = _mm512_shuffle_f32x4(tmp1394, tmp1388, 68);
__m512 out359 = _mm512_shuffle_f32x4(tmp1394, tmp1388, 238);
__m512 out352 = _mm512_shuffle_f32x4(tmp1386, in302, 68);
__m512 out360 = _mm512_shuffle_f32x4(tmp1386, in302, 238);
__m512 out353 = _mm512_shuffle_f32x4(in304, tmp1399, 68);
__m512 out361 = _mm512_shuffle_f32x4(in304, tmp1399, 238);
__m512 out354 = _mm512_shuffle_f32x4(tmp1400, tmp1401, 68);
__m512 out362 = _mm512_shuffle_f32x4(tmp1400, tmp1401, 238);
__m512 out355 = _mm512_shuffle_f32x4(tmp1398, tmp1392, 68);
__m512 out363 = _mm512_shuffle_f32x4(tmp1398, tmp1392, 238);
__m512 out356 = _mm512_shuffle_f32x4(tmp1390, in310, 68);
__m512 out364 = _mm512_shuffle_f32x4(tmp1390, in310, 238);
_mm512_storeu_ps(dfPtr2+256+2856960*i8+178560*j4+44544*s5+768*k9, out349);
_mm512_storeu_ps(dfPtr2+384+2856960*i8+178560*j4+44544*s5+768*k9, out357);
_mm512_storeu_ps(dfPtr2+320+2856960*i8+178560*j4+44544*s5+768*k9, out353);
_mm512_storeu_ps(dfPtr2+448+2856960*i8+178560*j4+44544*s5+768*k9, out361);
_mm512_storeu_ps(dfPtr2+714496+2856960*i8+178560*j4+44544*s5+768*k9, out350);
_mm512_storeu_ps(dfPtr2+714624+2856960*i8+178560*j4+44544*s5+768*k9, out358);
_mm512_storeu_ps(dfPtr2+714560+2856960*i8+178560*j4+44544*s5+768*k9, out354);
_mm512_storeu_ps(dfPtr2+714688+2856960*i8+178560*j4+44544*s5+768*k9, out362);
_mm512_storeu_ps(dfPtr2+1428736+2856960*i8+178560*j4+44544*s5+768*k9, out351);
_mm512_storeu_ps(dfPtr2+1428864+2856960*i8+178560*j4+44544*s5+768*k9, out359);
_mm512_storeu_ps(dfPtr2+1428800+2856960*i8+178560*j4+44544*s5+768*k9, out355);
_mm512_storeu_ps(dfPtr2+1428928+2856960*i8+178560*j4+44544*s5+768*k9, out363);
_mm512_storeu_ps(dfPtr2+2142976+2856960*i8+178560*j4+44544*s5+768*k9, out352);
_mm512_storeu_ps(dfPtr2+2143104+2856960*i8+178560*j4+44544*s5+768*k9, out360);
_mm512_storeu_ps(dfPtr2+2143040+2856960*i8+178560*j4+44544*s5+768*k9, out356);
_mm512_storeu_ps(dfPtr2+2143168+2856960*i8+178560*j4+44544*s5+768*k9, out364);
__m512 dat297 = _mm512_maskz_loadu_ps(16383, datPtr2+4000+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat298 = _mm512_maskz_loadu_ps(2047, datPtr2+4048+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512i pm41 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in312 = _mm512_permutexvar_ps(pm41, dat297);
__m512i pm42 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in320 = _mm512_permutexvar_ps(pm42, dat298);
__m512 dat299 = _mm512_maskz_loadu_ps(16383, datPtr2+4092+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat300 = _mm512_maskz_loadu_ps(2047, datPtr2+4140+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in313 = _mm512_permutexvar_ps(pm41, dat299);
__m512 in321 = _mm512_permutexvar_ps(pm42, dat300);
__m512 dat301 = _mm512_maskz_loadu_ps(16383, datPtr2+4184+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat302 = _mm512_maskz_loadu_ps(2047, datPtr2+4232+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in314 = _mm512_permutexvar_ps(pm41, dat301);
__m512 in322 = _mm512_permutexvar_ps(pm42, dat302);
__m512 dat303 = _mm512_maskz_loadu_ps(16383, datPtr2+4276+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat304 = _mm512_maskz_loadu_ps(2047, datPtr2+4324+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in315 = _mm512_permutexvar_ps(pm41, dat303);
__m512 in323 = _mm512_permutexvar_ps(pm42, dat304);
__m512 dat305 = _mm512_maskz_loadu_ps(16383, datPtr2+4368+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat306 = _mm512_maskz_loadu_ps(2047, datPtr2+4416+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in316 = _mm512_permutexvar_ps(pm41, dat305);
__m512 in324 = _mm512_permutexvar_ps(pm42, dat306);
__m512 dat307 = _mm512_maskz_loadu_ps(16383, datPtr2+4460+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat308 = _mm512_maskz_loadu_ps(2047, datPtr2+4508+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in317 = _mm512_permutexvar_ps(pm41, dat307);
__m512 in325 = _mm512_permutexvar_ps(pm42, dat308);
__m512 dat309 = _mm512_maskz_loadu_ps(16383, datPtr2+4552+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat310 = _mm512_maskz_loadu_ps(2047, datPtr2+4600+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in318 = _mm512_permutexvar_ps(pm41, dat309);
__m512 in326 = _mm512_permutexvar_ps(pm42, dat310);
__m512 dat311 = _mm512_maskz_loadu_ps(16383, datPtr2+4644+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 dat312 = _mm512_maskz_loadu_ps(2047, datPtr2+4692+7163304*i8+92*h4+4*w4+405536*s5+6992*k9);
__m512 in319 = _mm512_permutexvar_ps(pm41, dat311);
__m512 in327 = _mm512_permutexvar_ps(pm42, dat312);
__m512 tmp1450 = _mm512_add_ps(in313, in317);
__m512 tmp1454 = _mm512_add_ps(in321, in325);
__m512 tmp1451 = _mm512_sub_ps(in316, in314);
__m512 tmp1455 = _mm512_sub_ps(in324, in322);
__m512 tmp1452 = _mm512_add_ps(in314, in318);
__m512 tmp1456 = _mm512_add_ps(in322, in326);
in312 = _mm512_sub_ps(in312, in318);
in320 = _mm512_sub_ps(in320, in326);
tmp1450 = _mm512_fmadd_ps(in315, _mm512_set1_ps(-4.25e+00f), tmp1450);
tmp1454 = _mm512_fmadd_ps(in323, _mm512_set1_ps(-4.25e+00f), tmp1454);
tmp1452 = _mm512_fmadd_ps(in316, _mm512_set1_ps(-4.25e+00f), tmp1452);
tmp1456 = _mm512_fmadd_ps(in324, _mm512_set1_ps(-4.25e+00f), tmp1456);
in312 = _mm512_fmadd_ps(tmp1451, _mm512_set1_ps(5.25e+00f), in312);
in320 = _mm512_fmadd_ps(tmp1455, _mm512_set1_ps(5.25e+00f), in320);
tmp1451 = _mm512_fmadd_ps(in314, _mm512_set1_ps(2.5e-01f), in318);
tmp1455 = _mm512_fmadd_ps(in322, _mm512_set1_ps(2.5e-01f), in326);
in314 = _mm512_fmadd_ps(in314, _mm512_set1_ps(4e+00f), in318);
in322 = _mm512_fmadd_ps(in322, _mm512_set1_ps(4e+00f), in326);
__m512 tmp1453 = _mm512_sub_ps(tmp1452, tmp1450);
__m512 tmp1457 = _mm512_sub_ps(tmp1456, tmp1454);
tmp1452 = _mm512_add_ps(tmp1450, tmp1452);
tmp1456 = _mm512_add_ps(tmp1454, tmp1456);
tmp1450 = _mm512_fmadd_ps(in313, _mm512_set1_ps(2.5e-01f), in317);
tmp1454 = _mm512_fmadd_ps(in321, _mm512_set1_ps(2.5e-01f), in325);
tmp1451 = _mm512_fmadd_ps(in316, _mm512_set1_ps(-1.25e+00f), tmp1451);
tmp1455 = _mm512_fmadd_ps(in324, _mm512_set1_ps(-1.25e+00f), tmp1455);
in316 = _mm512_fmadd_ps(in316, _mm512_set1_ps(-5e+00f), in314);
in324 = _mm512_fmadd_ps(in324, _mm512_set1_ps(-5e+00f), in322);
tmp1450 = _mm512_fmadd_ps(in315, _mm512_set1_ps(-1.25e+00f), tmp1450);
tmp1454 = _mm512_fmadd_ps(in323, _mm512_set1_ps(-1.25e+00f), tmp1454);
in318 = _mm512_fmadd_ps(tmp1450, _mm512_set1_ps(2e+00f), tmp1451);
in326 = _mm512_fmadd_ps(tmp1454, _mm512_set1_ps(2e+00f), tmp1455);
tmp1451 = _mm512_fnmadd_ps(tmp1450, _mm512_set1_ps(2e+00f), tmp1451);
tmp1455 = _mm512_fnmadd_ps(tmp1454, _mm512_set1_ps(2e+00f), tmp1455);
tmp1450 = _mm512_fmadd_ps(in317, _mm512_set1_ps(2.5e-01f), in313);
tmp1454 = _mm512_fmadd_ps(in325, _mm512_set1_ps(2.5e-01f), in321);
in313 = _mm512_sub_ps(in319, in313);
in321 = _mm512_sub_ps(in327, in321);
tmp1450 = _mm512_fmadd_ps(in315, _mm512_set1_ps(-1.25e+00f), tmp1450);
tmp1454 = _mm512_fmadd_ps(in323, _mm512_set1_ps(-1.25e+00f), tmp1454);
in315 = _mm512_sub_ps(in315, in317);
in323 = _mm512_sub_ps(in323, in325);
in315 = _mm512_fmadd_ps(in315, _mm512_set1_ps(5.25e+00f), in313);
in323 = _mm512_fmadd_ps(in323, _mm512_set1_ps(5.25e+00f), in321);
in314 = _mm512_fmadd_ps(tmp1450, _mm512_set1_ps(2e+00f), in316);
in322 = _mm512_fmadd_ps(tmp1454, _mm512_set1_ps(2e+00f), in324);
in316 = _mm512_fnmadd_ps(tmp1450, _mm512_set1_ps(2e+00f), in316);
in324 = _mm512_fnmadd_ps(tmp1454, _mm512_set1_ps(2e+00f), in324);
__m512 tmp1466 = _mm512_unpacklo_ps(in312, tmp1452);
__m512 tmp1467 = _mm512_unpackhi_ps(in312, tmp1452);
__m512 tmp1468 = _mm512_unpacklo_ps(tmp1453, in318);
__m512 tmp1469 = _mm512_unpackhi_ps(tmp1453, in318);
__m512 tmp1470 = _mm512_unpacklo_ps(tmp1451, in314);
__m512 tmp1471 = _mm512_unpackhi_ps(tmp1451, in314);
__m512 tmp1472 = _mm512_unpacklo_ps(in316, in315);
__m512 tmp1473 = _mm512_unpackhi_ps(in316, in315);
__m512 tmp1474 = _mm512_unpacklo_ps(in320, tmp1456);
__m512 tmp1475 = _mm512_unpackhi_ps(in320, tmp1456);
__m512 tmp1476 = _mm512_unpacklo_ps(tmp1457, in326);
__m512 tmp1477 = _mm512_unpackhi_ps(tmp1457, in326);
__m512 tmp1478 = _mm512_unpacklo_ps(tmp1455, in322);
__m512 tmp1479 = _mm512_unpackhi_ps(tmp1455, in322);
__m512 tmp1480 = _mm512_unpacklo_ps(in324, in323);
__m512 tmp1481 = _mm512_unpackhi_ps(in324, in323);
__m512 tmp1482 = _mm512_shuffle_ps(tmp1466, tmp1468, 68);
__m512 tmp1483 = _mm512_shuffle_ps(tmp1466, tmp1468, 238);
__m512 tmp1484 = _mm512_shuffle_ps(tmp1467, tmp1469, 68);
__m512 tmp1485 = _mm512_shuffle_ps(tmp1467, tmp1469, 238);
__m512 tmp1486 = _mm512_shuffle_ps(tmp1470, tmp1472, 68);
__m512 tmp1487 = _mm512_shuffle_ps(tmp1470, tmp1472, 238);
__m512 tmp1488 = _mm512_shuffle_ps(tmp1471, tmp1473, 68);
__m512 tmp1489 = _mm512_shuffle_ps(tmp1471, tmp1473, 238);
__m512 tmp1490 = _mm512_shuffle_ps(tmp1474, tmp1476, 68);
__m512 tmp1491 = _mm512_shuffle_ps(tmp1474, tmp1476, 238);
__m512 tmp1492 = _mm512_shuffle_ps(tmp1475, tmp1477, 68);
__m512 tmp1493 = _mm512_shuffle_ps(tmp1475, tmp1477, 238);
__m512 tmp1494 = _mm512_shuffle_ps(tmp1478, tmp1480, 68);
__m512 tmp1495 = _mm512_shuffle_ps(tmp1478, tmp1480, 238);
__m512 tmp1496 = _mm512_shuffle_ps(tmp1479, tmp1481, 68);
__m512 tmp1497 = _mm512_shuffle_ps(tmp1479, tmp1481, 238);
__m512 tmp1498 = _mm512_shuffle_f32x4(tmp1482, tmp1486, 136);
__m512 tmp1499 = _mm512_shuffle_f32x4(tmp1482, tmp1486, 221);
__m512 tmp1500 = _mm512_shuffle_f32x4(tmp1483, tmp1487, 136);
__m512 tmp1501 = _mm512_shuffle_f32x4(tmp1483, tmp1487, 221);
__m512 tmp1502 = _mm512_shuffle_f32x4(tmp1484, tmp1488, 136);
__m512 tmp1503 = _mm512_shuffle_f32x4(tmp1484, tmp1488, 221);
__m512 tmp1504 = _mm512_shuffle_f32x4(tmp1485, tmp1489, 136);
__m512 tmp1505 = _mm512_shuffle_f32x4(tmp1485, tmp1489, 221);
__m512 tmp1506 = _mm512_shuffle_f32x4(tmp1490, tmp1494, 136);
__m512 tmp1507 = _mm512_shuffle_f32x4(tmp1490, tmp1494, 221);
__m512 tmp1508 = _mm512_shuffle_f32x4(tmp1491, tmp1495, 136);
__m512 tmp1509 = _mm512_shuffle_f32x4(tmp1491, tmp1495, 221);
__m512 tmp1510 = _mm512_shuffle_f32x4(tmp1492, tmp1496, 136);
__m512 tmp1511 = _mm512_shuffle_f32x4(tmp1492, tmp1496, 221);
__m512 tmp1512 = _mm512_shuffle_f32x4(tmp1493, tmp1497, 136);
__m512 tmp1513 = _mm512_shuffle_f32x4(tmp1493, tmp1497, 221);
in312 = _mm512_shuffle_f32x4(tmp1498, tmp1506, 136);
in320 = _mm512_shuffle_f32x4(tmp1498, tmp1506, 221);
tmp1452 = _mm512_shuffle_f32x4(tmp1500, tmp1508, 136);
tmp1456 = _mm512_shuffle_f32x4(tmp1500, tmp1508, 221);
tmp1453 = _mm512_shuffle_f32x4(tmp1502, tmp1510, 136);
tmp1457 = _mm512_shuffle_f32x4(tmp1502, tmp1510, 221);
in318 = _mm512_shuffle_f32x4(tmp1504, tmp1512, 136);
in326 = _mm512_shuffle_f32x4(tmp1504, tmp1512, 221);
tmp1451 = _mm512_shuffle_f32x4(tmp1499, tmp1507, 136);
tmp1455 = _mm512_shuffle_f32x4(tmp1499, tmp1507, 221);
in314 = _mm512_shuffle_f32x4(tmp1501, tmp1509, 136);
in322 = _mm512_shuffle_f32x4(tmp1501, tmp1509, 221);
in316 = _mm512_shuffle_f32x4(tmp1503, tmp1511, 136);
in324 = _mm512_shuffle_f32x4(tmp1503, tmp1511, 221);
in315 = _mm512_shuffle_f32x4(tmp1505, tmp1513, 136);
in323 = _mm512_shuffle_f32x4(tmp1505, tmp1513, 221);
__m512 tmp1458 = _mm512_add_ps(tmp1452, in314);
__m512 tmp1462 = _mm512_add_ps(tmp1456, in322);
__m512 tmp1459 = _mm512_sub_ps(tmp1451, tmp1453);
__m512 tmp1463 = _mm512_sub_ps(tmp1455, tmp1457);
__m512 tmp1460 = _mm512_add_ps(tmp1453, in316);
__m512 tmp1464 = _mm512_add_ps(tmp1457, in324);
in312 = _mm512_sub_ps(in312, in316);
in320 = _mm512_sub_ps(in320, in324);
tmp1458 = _mm512_fmadd_ps(in318, _mm512_set1_ps(-4.25e+00f), tmp1458);
tmp1462 = _mm512_fmadd_ps(in326, _mm512_set1_ps(-4.25e+00f), tmp1462);
tmp1460 = _mm512_fmadd_ps(tmp1451, _mm512_set1_ps(-4.25e+00f), tmp1460);
tmp1464 = _mm512_fmadd_ps(tmp1455, _mm512_set1_ps(-4.25e+00f), tmp1464);
in312 = _mm512_fmadd_ps(tmp1459, _mm512_set1_ps(5.25e+00f), in312);
in320 = _mm512_fmadd_ps(tmp1463, _mm512_set1_ps(5.25e+00f), in320);
tmp1459 = _mm512_fmadd_ps(tmp1453, _mm512_set1_ps(2.5e-01f), in316);
tmp1463 = _mm512_fmadd_ps(tmp1457, _mm512_set1_ps(2.5e-01f), in324);
tmp1453 = _mm512_fmadd_ps(tmp1453, _mm512_set1_ps(4e+00f), in316);
tmp1457 = _mm512_fmadd_ps(tmp1457, _mm512_set1_ps(4e+00f), in324);
__m512 tmp1461 = _mm512_sub_ps(tmp1460, tmp1458);
__m512 tmp1465 = _mm512_sub_ps(tmp1464, tmp1462);
tmp1460 = _mm512_add_ps(tmp1458, tmp1460);
tmp1464 = _mm512_add_ps(tmp1462, tmp1464);
tmp1458 = _mm512_fmadd_ps(tmp1452, _mm512_set1_ps(2.5e-01f), in314);
tmp1462 = _mm512_fmadd_ps(tmp1456, _mm512_set1_ps(2.5e-01f), in322);
tmp1459 = _mm512_fmadd_ps(tmp1451, _mm512_set1_ps(-1.25e+00f), tmp1459);
tmp1463 = _mm512_fmadd_ps(tmp1455, _mm512_set1_ps(-1.25e+00f), tmp1463);
tmp1451 = _mm512_fmadd_ps(tmp1451, _mm512_set1_ps(-5e+00f), tmp1453);
tmp1455 = _mm512_fmadd_ps(tmp1455, _mm512_set1_ps(-5e+00f), tmp1457);
tmp1458 = _mm512_fmadd_ps(in318, _mm512_set1_ps(-1.25e+00f), tmp1458);
tmp1462 = _mm512_fmadd_ps(in326, _mm512_set1_ps(-1.25e+00f), tmp1462);
in316 = _mm512_fmadd_ps(tmp1458, _mm512_set1_ps(2e+00f), tmp1459);
in324 = _mm512_fmadd_ps(tmp1462, _mm512_set1_ps(2e+00f), tmp1463);
tmp1459 = _mm512_fnmadd_ps(tmp1458, _mm512_set1_ps(2e+00f), tmp1459);
tmp1463 = _mm512_fnmadd_ps(tmp1462, _mm512_set1_ps(2e+00f), tmp1463);
tmp1458 = _mm512_fmadd_ps(in314, _mm512_set1_ps(2.5e-01f), tmp1452);
tmp1462 = _mm512_fmadd_ps(in322, _mm512_set1_ps(2.5e-01f), tmp1456);
tmp1452 = _mm512_sub_ps(in315, tmp1452);
tmp1456 = _mm512_sub_ps(in323, tmp1456);
tmp1458 = _mm512_fmadd_ps(in318, _mm512_set1_ps(-1.25e+00f), tmp1458);
tmp1462 = _mm512_fmadd_ps(in326, _mm512_set1_ps(-1.25e+00f), tmp1462);
in318 = _mm512_sub_ps(in318, in314);
in326 = _mm512_sub_ps(in326, in322);
in318 = _mm512_fmadd_ps(in318, _mm512_set1_ps(5.25e+00f), tmp1452);
in326 = _mm512_fmadd_ps(in326, _mm512_set1_ps(5.25e+00f), tmp1456);
tmp1453 = _mm512_fmadd_ps(tmp1458, _mm512_set1_ps(2e+00f), tmp1451);
tmp1457 = _mm512_fmadd_ps(tmp1462, _mm512_set1_ps(2e+00f), tmp1455);
tmp1451 = _mm512_fnmadd_ps(tmp1458, _mm512_set1_ps(2e+00f), tmp1451);
tmp1455 = _mm512_fnmadd_ps(tmp1462, _mm512_set1_ps(2e+00f), tmp1455);
__m512 out365 = _mm512_shuffle_f32x4(in312, tmp1460, 68);
__m512 out373 = _mm512_shuffle_f32x4(in312, tmp1460, 238);
__m512 out366 = _mm512_shuffle_f32x4(tmp1461, in316, 68);
__m512 out374 = _mm512_shuffle_f32x4(tmp1461, in316, 238);
__m512 out367 = _mm512_shuffle_f32x4(tmp1459, tmp1453, 68);
__m512 out375 = _mm512_shuffle_f32x4(tmp1459, tmp1453, 238);
__m512 out368 = _mm512_shuffle_f32x4(tmp1451, in318, 68);
__m512 out376 = _mm512_shuffle_f32x4(tmp1451, in318, 238);
__m512 out369 = _mm512_shuffle_f32x4(in320, tmp1464, 68);
__m512 out377 = _mm512_shuffle_f32x4(in320, tmp1464, 238);
__m512 out370 = _mm512_shuffle_f32x4(tmp1465, in324, 68);
__m512 out378 = _mm512_shuffle_f32x4(tmp1465, in324, 238);
__m512 out371 = _mm512_shuffle_f32x4(tmp1463, tmp1457, 68);
__m512 out379 = _mm512_shuffle_f32x4(tmp1463, tmp1457, 238);
__m512 out372 = _mm512_shuffle_f32x4(tmp1455, in326, 68);
__m512 out380 = _mm512_shuffle_f32x4(tmp1455, in326, 238);
_mm512_storeu_ps(dfPtr2+512+2856960*i8+178560*j4+44544*s5+768*k9, out365);
_mm512_storeu_ps(dfPtr2+640+2856960*i8+178560*j4+44544*s5+768*k9, out373);
_mm512_storeu_ps(dfPtr2+576+2856960*i8+178560*j4+44544*s5+768*k9, out369);
_mm512_storeu_ps(dfPtr2+704+2856960*i8+178560*j4+44544*s5+768*k9, out377);
_mm512_storeu_ps(dfPtr2+714752+2856960*i8+178560*j4+44544*s5+768*k9, out366);
_mm512_storeu_ps(dfPtr2+714880+2856960*i8+178560*j4+44544*s5+768*k9, out374);
_mm512_storeu_ps(dfPtr2+714816+2856960*i8+178560*j4+44544*s5+768*k9, out370);
_mm512_storeu_ps(dfPtr2+714944+2856960*i8+178560*j4+44544*s5+768*k9, out378);
_mm512_storeu_ps(dfPtr2+1428992+2856960*i8+178560*j4+44544*s5+768*k9, out367);
_mm512_storeu_ps(dfPtr2+1429120+2856960*i8+178560*j4+44544*s5+768*k9, out375);
_mm512_storeu_ps(dfPtr2+1429056+2856960*i8+178560*j4+44544*s5+768*k9, out371);
_mm512_storeu_ps(dfPtr2+1429184+2856960*i8+178560*j4+44544*s5+768*k9, out379);
_mm512_storeu_ps(dfPtr2+2143232+2856960*i8+178560*j4+44544*s5+768*k9, out368);
_mm512_storeu_ps(dfPtr2+2143360+2856960*i8+178560*j4+44544*s5+768*k9, out376);
_mm512_storeu_ps(dfPtr2+2143296+2856960*i8+178560*j4+44544*s5+768*k9, out372);
_mm512_storeu_ps(dfPtr2+2143424+2856960*i8+178560*j4+44544*s5+768*k9, out380);
}
} else {
// Else branch: ss2 is initialised to 3 and is only read in the visible part of
// this loop, where it scales the 405536*ss2 load offset and the 44544*ss2
// store offset.
ptrdiff_t ss2 = 3;
ptrdiff_t k10 = 0;
// 58 iterations; each k10 step moves the load address by 6992 and the store
// address by 768 (in whatever units datPtr2 / dfPtr2 pointer arithmetic uses).
for (; k10 != 58; ++k10) {
// Load eight vectors for each of two register groups (in328..in335 and
// in336..in343). Successive loads within a group are 92 apart; the second
// group starts 504 past the first.
// NOTE(review): 92 equals 23 floats * 4 bytes (Width=23 in the graph config),
// which suggests datPtr2 is a byte pointer and each load is one image row --
// confirm against the declaration of datPtr2.
// Mask 2047 (0x7FF) loads the low 11 floats and zeroes lanes 11..15;
// mask 16383 (0x3FFF) loads the low 14 floats and zeroes lanes 14..15.
__m512 dat313 = _mm512_maskz_loadu_ps(2047, datPtr2+0+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat314 = _mm512_maskz_loadu_ps(16383, datPtr2+504+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
// pm43: lanes 0..7 <- source lanes 0..7, lanes 8..12 <- source lanes 6..10,
// lanes 13..15 <- source lane 15 (already zeroed by the 11-float mask).
__m512i pm43 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in328 = _mm512_permutexvar_ps(pm43, dat313);
// pm44: lanes 0..7 <- source lanes 0..7, lanes 8..15 <- source lanes 6..13.
// With either permutation the result holds two 8-float windows whose starting
// source lanes are 6 apart (so they share two source lanes).
__m512i pm44 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in336 = _mm512_permutexvar_ps(pm44, dat314);
// The remaining seven load pairs repeat the pattern, reusing pm43 / pm44.
__m512 dat315 = _mm512_maskz_loadu_ps(2047, datPtr2+92+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat316 = _mm512_maskz_loadu_ps(16383, datPtr2+596+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in329 = _mm512_permutexvar_ps(pm43, dat315);
__m512 in337 = _mm512_permutexvar_ps(pm44, dat316);
__m512 dat317 = _mm512_maskz_loadu_ps(2047, datPtr2+184+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat318 = _mm512_maskz_loadu_ps(16383, datPtr2+688+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in330 = _mm512_permutexvar_ps(pm43, dat317);
__m512 in338 = _mm512_permutexvar_ps(pm44, dat318);
__m512 dat319 = _mm512_maskz_loadu_ps(2047, datPtr2+276+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat320 = _mm512_maskz_loadu_ps(16383, datPtr2+780+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in331 = _mm512_permutexvar_ps(pm43, dat319);
__m512 in339 = _mm512_permutexvar_ps(pm44, dat320);
__m512 dat321 = _mm512_maskz_loadu_ps(2047, datPtr2+368+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat322 = _mm512_maskz_loadu_ps(16383, datPtr2+872+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in332 = _mm512_permutexvar_ps(pm43, dat321);
__m512 in340 = _mm512_permutexvar_ps(pm44, dat322);
__m512 dat323 = _mm512_maskz_loadu_ps(2047, datPtr2+460+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat324 = _mm512_maskz_loadu_ps(16383, datPtr2+964+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in333 = _mm512_permutexvar_ps(pm43, dat323);
__m512 in341 = _mm512_permutexvar_ps(pm44, dat324);
__m512 dat325 = _mm512_maskz_loadu_ps(2047, datPtr2+552+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat326 = _mm512_maskz_loadu_ps(16383, datPtr2+1056+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in334 = _mm512_permutexvar_ps(pm43, dat325);
__m512 in342 = _mm512_permutexvar_ps(pm44, dat326);
__m512 dat327 = _mm512_maskz_loadu_ps(2047, datPtr2+644+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat328 = _mm512_maskz_loadu_ps(16383, datPtr2+1148+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in335 = _mm512_permutexvar_ps(pm43, dat327);
__m512 in343 = _mm512_permutexvar_ps(pm44, dat328);
// First pass: element-wise linear combinations of the eight vectors in each
// group. The two groups are processed in lock-step on alternating lines.
// With d0..d7 = in328..in335 on entry, the values left behind are:
//   in328   = d0 - d6 + 5.25*(d4 - d2)
//   tmp1516 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   tmp1517 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   in334   = (0.25*d2 - 1.25*d4 + d6) + 2*(0.25*d1 - 1.25*d3 + d5)
//   tmp1515 = (0.25*d2 - 1.25*d4 + d6) - 2*(0.25*d1 - 1.25*d3 + d5)
//   in330   = (4*d2 - 5*d4 + d6) + 2*(d1 - 1.25*d3 + 0.25*d5)
//   in332   = (4*d2 - 5*d4 + d6) - 2*(d1 - 1.25*d3 + 0.25*d5)
//   in331   = (d7 - d1) + 5.25*(d3 - d5)
// The second group (d0..d7 = in336..in343) leaves the same expressions in
// in336, tmp1520, tmp1521, in342, tmp1519, in338, in340, in339.
// NOTE(review): these coefficients match the 8-point input transform of
// Winograd F(6x6, 3x3) -- confirm against the generator.
// Statement order matters: tmp1514/tmp1515 are reused for different partial
// sums, and in328..in334 are overwritten in place once their original value
// is no longer needed.
__m512 tmp1514 = _mm512_add_ps(in329, in333);
__m512 tmp1518 = _mm512_add_ps(in337, in341);
__m512 tmp1515 = _mm512_sub_ps(in332, in330);
__m512 tmp1519 = _mm512_sub_ps(in340, in338);
__m512 tmp1516 = _mm512_add_ps(in330, in334);
__m512 tmp1520 = _mm512_add_ps(in338, in342);
in328 = _mm512_sub_ps(in328, in334);
in336 = _mm512_sub_ps(in336, in342);
tmp1514 = _mm512_fmadd_ps(in331, _mm512_set1_ps(-4.25e+00f), tmp1514);
tmp1518 = _mm512_fmadd_ps(in339, _mm512_set1_ps(-4.25e+00f), tmp1518);
tmp1516 = _mm512_fmadd_ps(in332, _mm512_set1_ps(-4.25e+00f), tmp1516);
tmp1520 = _mm512_fmadd_ps(in340, _mm512_set1_ps(-4.25e+00f), tmp1520);
in328 = _mm512_fmadd_ps(tmp1515, _mm512_set1_ps(5.25e+00f), in328);
in336 = _mm512_fmadd_ps(tmp1519, _mm512_set1_ps(5.25e+00f), in336);
// tmp1515/tmp1519 are recycled here for the 0.25*d2 + d6 partial sum.
tmp1515 = _mm512_fmadd_ps(in330, _mm512_set1_ps(2.5e-01f), in334);
tmp1519 = _mm512_fmadd_ps(in338, _mm512_set1_ps(2.5e-01f), in342);
in330 = _mm512_fmadd_ps(in330, _mm512_set1_ps(4e+00f), in334);
in338 = _mm512_fmadd_ps(in338, _mm512_set1_ps(4e+00f), in342);
__m512 tmp1517 = _mm512_sub_ps(tmp1516, tmp1514);
__m512 tmp1521 = _mm512_sub_ps(tmp1520, tmp1518);
tmp1516 = _mm512_add_ps(tmp1514, tmp1516);
tmp1520 = _mm512_add_ps(tmp1518, tmp1520);
// tmp1514/tmp1518 are recycled for the odd-index partial sum 0.25*d1 + d5 - 1.25*d3.
tmp1514 = _mm512_fmadd_ps(in329, _mm512_set1_ps(2.5e-01f), in333);
tmp1518 = _mm512_fmadd_ps(in337, _mm512_set1_ps(2.5e-01f), in341);
tmp1515 = _mm512_fmadd_ps(in332, _mm512_set1_ps(-1.25e+00f), tmp1515);
tmp1519 = _mm512_fmadd_ps(in340, _mm512_set1_ps(-1.25e+00f), tmp1519);
in332 = _mm512_fmadd_ps(in332, _mm512_set1_ps(-5e+00f), in330);
in340 = _mm512_fmadd_ps(in340, _mm512_set1_ps(-5e+00f), in338);
tmp1514 = _mm512_fmadd_ps(in331, _mm512_set1_ps(-1.25e+00f), tmp1514);
tmp1518 = _mm512_fmadd_ps(in339, _mm512_set1_ps(-1.25e+00f), tmp1518);
// fmadd / fnmadd pair yields the "+2*odd" and "-2*odd" variants of one even sum.
in334 = _mm512_fmadd_ps(tmp1514, _mm512_set1_ps(2e+00f), tmp1515);
in342 = _mm512_fmadd_ps(tmp1518, _mm512_set1_ps(2e+00f), tmp1519);
tmp1515 = _mm512_fnmadd_ps(tmp1514, _mm512_set1_ps(2e+00f), tmp1515);
tmp1519 = _mm512_fnmadd_ps(tmp1518, _mm512_set1_ps(2e+00f), tmp1519);
// tmp1514/tmp1518 recycled again, now for d1 - 1.25*d3 + 0.25*d5.
tmp1514 = _mm512_fmadd_ps(in333, _mm512_set1_ps(2.5e-01f), in329);
tmp1518 = _mm512_fmadd_ps(in341, _mm512_set1_ps(2.5e-01f), in337);
in329 = _mm512_sub_ps(in335, in329);
in337 = _mm512_sub_ps(in343, in337);
tmp1514 = _mm512_fmadd_ps(in331, _mm512_set1_ps(-1.25e+00f), tmp1514);
tmp1518 = _mm512_fmadd_ps(in339, _mm512_set1_ps(-1.25e+00f), tmp1518);
in331 = _mm512_sub_ps(in331, in333);
in339 = _mm512_sub_ps(in339, in341);
in331 = _mm512_fmadd_ps(in331, _mm512_set1_ps(5.25e+00f), in329);
in339 = _mm512_fmadd_ps(in339, _mm512_set1_ps(5.25e+00f), in337);
in330 = _mm512_fmadd_ps(tmp1514, _mm512_set1_ps(2e+00f), in332);
in338 = _mm512_fmadd_ps(tmp1518, _mm512_set1_ps(2e+00f), in340);
in332 = _mm512_fnmadd_ps(tmp1514, _mm512_set1_ps(2e+00f), in332);
in340 = _mm512_fnmadd_ps(tmp1518, _mm512_set1_ps(2e+00f), in340);
// Transpose step over the 16 first-pass results (8 per group), taken in the
// order in328, tmp1516, tmp1517, in334, tmp1515, in330, in332, in331 for the
// first group and in336, tmp1520, tmp1521, in342, tmp1519, in338, in340,
// in339 for the second.
//
// Stage 1: unpacklo/unpackhi interleave adjacent result pairs within each
// 128-bit lane.
__m512 tmp1530 = _mm512_unpacklo_ps(in328, tmp1516);
__m512 tmp1531 = _mm512_unpackhi_ps(in328, tmp1516);
__m512 tmp1532 = _mm512_unpacklo_ps(tmp1517, in334);
__m512 tmp1533 = _mm512_unpackhi_ps(tmp1517, in334);
__m512 tmp1534 = _mm512_unpacklo_ps(tmp1515, in330);
__m512 tmp1535 = _mm512_unpackhi_ps(tmp1515, in330);
__m512 tmp1536 = _mm512_unpacklo_ps(in332, in331);
__m512 tmp1537 = _mm512_unpackhi_ps(in332, in331);
__m512 tmp1538 = _mm512_unpacklo_ps(in336, tmp1520);
__m512 tmp1539 = _mm512_unpackhi_ps(in336, tmp1520);
__m512 tmp1540 = _mm512_unpacklo_ps(tmp1521, in342);
__m512 tmp1541 = _mm512_unpackhi_ps(tmp1521, in342);
__m512 tmp1542 = _mm512_unpacklo_ps(tmp1519, in338);
__m512 tmp1543 = _mm512_unpackhi_ps(tmp1519, in338);
__m512 tmp1544 = _mm512_unpacklo_ps(in340, in339);
__m512 tmp1545 = _mm512_unpackhi_ps(in340, in339);
// Stage 2: shuffle_ps with 68 (0x44) takes the low 64 bits of each 128-bit
// lane from both sources, 238 (0xEE) takes the high 64 bits. Afterwards every
// 128-bit lane holds one lane position of four consecutive results
// (results 0..3 or 4..7 of a group).
__m512 tmp1546 = _mm512_shuffle_ps(tmp1530, tmp1532, 68);
__m512 tmp1547 = _mm512_shuffle_ps(tmp1530, tmp1532, 238);
__m512 tmp1548 = _mm512_shuffle_ps(tmp1531, tmp1533, 68);
__m512 tmp1549 = _mm512_shuffle_ps(tmp1531, tmp1533, 238);
__m512 tmp1550 = _mm512_shuffle_ps(tmp1534, tmp1536, 68);
__m512 tmp1551 = _mm512_shuffle_ps(tmp1534, tmp1536, 238);
__m512 tmp1552 = _mm512_shuffle_ps(tmp1535, tmp1537, 68);
__m512 tmp1553 = _mm512_shuffle_ps(tmp1535, tmp1537, 238);
__m512 tmp1554 = _mm512_shuffle_ps(tmp1538, tmp1540, 68);
__m512 tmp1555 = _mm512_shuffle_ps(tmp1538, tmp1540, 238);
__m512 tmp1556 = _mm512_shuffle_ps(tmp1539, tmp1541, 68);
__m512 tmp1557 = _mm512_shuffle_ps(tmp1539, tmp1541, 238);
__m512 tmp1558 = _mm512_shuffle_ps(tmp1542, tmp1544, 68);
__m512 tmp1559 = _mm512_shuffle_ps(tmp1542, tmp1544, 238);
__m512 tmp1560 = _mm512_shuffle_ps(tmp1543, tmp1545, 68);
__m512 tmp1561 = _mm512_shuffle_ps(tmp1543, tmp1545, 238);
// Stage 3: shuffle_f32x4 with 136 (0x88) picks 128-bit lanes 0 and 2 of each
// source, 221 (0xDD) picks lanes 1 and 3. This joins results 0..3 with
// results 4..7 of the same group.
__m512 tmp1562 = _mm512_shuffle_f32x4(tmp1546, tmp1550, 136);
__m512 tmp1563 = _mm512_shuffle_f32x4(tmp1546, tmp1550, 221);
__m512 tmp1564 = _mm512_shuffle_f32x4(tmp1547, tmp1551, 136);
__m512 tmp1565 = _mm512_shuffle_f32x4(tmp1547, tmp1551, 221);
__m512 tmp1566 = _mm512_shuffle_f32x4(tmp1548, tmp1552, 136);
__m512 tmp1567 = _mm512_shuffle_f32x4(tmp1548, tmp1552, 221);
__m512 tmp1568 = _mm512_shuffle_f32x4(tmp1549, tmp1553, 136);
__m512 tmp1569 = _mm512_shuffle_f32x4(tmp1549, tmp1553, 221);
__m512 tmp1570 = _mm512_shuffle_f32x4(tmp1554, tmp1558, 136);
__m512 tmp1571 = _mm512_shuffle_f32x4(tmp1554, tmp1558, 221);
__m512 tmp1572 = _mm512_shuffle_f32x4(tmp1555, tmp1559, 136);
__m512 tmp1573 = _mm512_shuffle_f32x4(tmp1555, tmp1559, 221);
__m512 tmp1574 = _mm512_shuffle_f32x4(tmp1556, tmp1560, 136);
__m512 tmp1575 = _mm512_shuffle_f32x4(tmp1556, tmp1560, 221);
__m512 tmp1576 = _mm512_shuffle_f32x4(tmp1557, tmp1561, 136);
__m512 tmp1577 = _mm512_shuffle_f32x4(tmp1557, tmp1561, 221);
// Stage 4: the same 136 / 221 selection merges the two groups. The names are
// reused for the transposed data: in328, tmp1516, tmp1517, in334, tmp1515,
// in330, in332, in331 now hold lane positions 0..7 respectively, with the
// first group's eight results in lanes 0..7 and the second group's in lanes
// 8..15. in336, tmp1520, tmp1521, in342, tmp1519, in338, in340, in339 hold
// lane positions 8..15 in the same layout.
in328 = _mm512_shuffle_f32x4(tmp1562, tmp1570, 136);
in336 = _mm512_shuffle_f32x4(tmp1562, tmp1570, 221);
tmp1516 = _mm512_shuffle_f32x4(tmp1564, tmp1572, 136);
tmp1520 = _mm512_shuffle_f32x4(tmp1564, tmp1572, 221);
tmp1517 = _mm512_shuffle_f32x4(tmp1566, tmp1574, 136);
tmp1521 = _mm512_shuffle_f32x4(tmp1566, tmp1574, 221);
in334 = _mm512_shuffle_f32x4(tmp1568, tmp1576, 136);
in342 = _mm512_shuffle_f32x4(tmp1568, tmp1576, 221);
tmp1515 = _mm512_shuffle_f32x4(tmp1563, tmp1571, 136);
tmp1519 = _mm512_shuffle_f32x4(tmp1563, tmp1571, 221);
in330 = _mm512_shuffle_f32x4(tmp1565, tmp1573, 136);
in338 = _mm512_shuffle_f32x4(tmp1565, tmp1573, 221);
in332 = _mm512_shuffle_f32x4(tmp1567, tmp1575, 136);
in340 = _mm512_shuffle_f32x4(tmp1567, tmp1575, 221);
in331 = _mm512_shuffle_f32x4(tmp1569, tmp1577, 136);
in339 = _mm512_shuffle_f32x4(tmp1569, tmp1577, 221);
// Second pass: the same eight linear combinations as the first pass, now
// applied to the transposed vectors. With
//   d0..d7 = in328, tmp1516, tmp1517, in334, tmp1515, in330, in332, in331
// on entry, the values left behind are:
//   in328   = d0 - d6 + 5.25*(d4 - d2)
//   tmp1524 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   tmp1525 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   in332   = (0.25*d2 - 1.25*d4 + d6) + 2*(0.25*d1 - 1.25*d3 + d5)
//   tmp1523 = (0.25*d2 - 1.25*d4 + d6) - 2*(0.25*d1 - 1.25*d3 + d5)
//   tmp1517 = (4*d2 - 5*d4 + d6) + 2*(d1 - 1.25*d3 + 0.25*d5)
//   tmp1515 = (4*d2 - 5*d4 + d6) - 2*(d1 - 1.25*d3 + 0.25*d5)
//   in334   = (d7 - d1) + 5.25*(d3 - d5)
// The interleaved lines do the same for
//   d0..d7 = in336, tmp1520, tmp1521, in342, tmp1519, in338, in340, in339
// leaving results in in336, tmp1528, tmp1529, in340, tmp1527, tmp1521,
// tmp1519, in342.
// As in the first pass, temporaries are recycled and inputs are overwritten in
// place, so the statement order must be preserved.
__m512 tmp1522 = _mm512_add_ps(tmp1516, in330);
__m512 tmp1526 = _mm512_add_ps(tmp1520, in338);
__m512 tmp1523 = _mm512_sub_ps(tmp1515, tmp1517);
__m512 tmp1527 = _mm512_sub_ps(tmp1519, tmp1521);
__m512 tmp1524 = _mm512_add_ps(tmp1517, in332);
__m512 tmp1528 = _mm512_add_ps(tmp1521, in340);
in328 = _mm512_sub_ps(in328, in332);
in336 = _mm512_sub_ps(in336, in340);
tmp1522 = _mm512_fmadd_ps(in334, _mm512_set1_ps(-4.25e+00f), tmp1522);
tmp1526 = _mm512_fmadd_ps(in342, _mm512_set1_ps(-4.25e+00f), tmp1526);
tmp1524 = _mm512_fmadd_ps(tmp1515, _mm512_set1_ps(-4.25e+00f), tmp1524);
tmp1528 = _mm512_fmadd_ps(tmp1519, _mm512_set1_ps(-4.25e+00f), tmp1528);
in328 = _mm512_fmadd_ps(tmp1523, _mm512_set1_ps(5.25e+00f), in328);
in336 = _mm512_fmadd_ps(tmp1527, _mm512_set1_ps(5.25e+00f), in336);
// tmp1523/tmp1527 are recycled here for the 0.25*d2 + d6 partial sum.
tmp1523 = _mm512_fmadd_ps(tmp1517, _mm512_set1_ps(2.5e-01f), in332);
tmp1527 = _mm512_fmadd_ps(tmp1521, _mm512_set1_ps(2.5e-01f), in340);
tmp1517 = _mm512_fmadd_ps(tmp1517, _mm512_set1_ps(4e+00f), in332);
tmp1521 = _mm512_fmadd_ps(tmp1521, _mm512_set1_ps(4e+00f), in340);
__m512 tmp1525 = _mm512_sub_ps(tmp1524, tmp1522);
__m512 tmp1529 = _mm512_sub_ps(tmp1528, tmp1526);
tmp1524 = _mm512_add_ps(tmp1522, tmp1524);
tmp1528 = _mm512_add_ps(tmp1526, tmp1528);
// tmp1522/tmp1526 are recycled for 0.25*d1 + d5 - 1.25*d3.
tmp1522 = _mm512_fmadd_ps(tmp1516, _mm512_set1_ps(2.5e-01f), in330);
tmp1526 = _mm512_fmadd_ps(tmp1520, _mm512_set1_ps(2.5e-01f), in338);
tmp1523 = _mm512_fmadd_ps(tmp1515, _mm512_set1_ps(-1.25e+00f), tmp1523);
tmp1527 = _mm512_fmadd_ps(tmp1519, _mm512_set1_ps(-1.25e+00f), tmp1527);
tmp1515 = _mm512_fmadd_ps(tmp1515, _mm512_set1_ps(-5e+00f), tmp1517);
tmp1519 = _mm512_fmadd_ps(tmp1519, _mm512_set1_ps(-5e+00f), tmp1521);
tmp1522 = _mm512_fmadd_ps(in334, _mm512_set1_ps(-1.25e+00f), tmp1522);
tmp1526 = _mm512_fmadd_ps(in342, _mm512_set1_ps(-1.25e+00f), tmp1526);
in332 = _mm512_fmadd_ps(tmp1522, _mm512_set1_ps(2e+00f), tmp1523);
in340 = _mm512_fmadd_ps(tmp1526, _mm512_set1_ps(2e+00f), tmp1527);
tmp1523 = _mm512_fnmadd_ps(tmp1522, _mm512_set1_ps(2e+00f), tmp1523);
tmp1527 = _mm512_fnmadd_ps(tmp1526, _mm512_set1_ps(2e+00f), tmp1527);
// tmp1522/tmp1526 recycled again, now for d1 - 1.25*d3 + 0.25*d5.
tmp1522 = _mm512_fmadd_ps(in330, _mm512_set1_ps(2.5e-01f), tmp1516);
tmp1526 = _mm512_fmadd_ps(in338, _mm512_set1_ps(2.5e-01f), tmp1520);
tmp1516 = _mm512_sub_ps(in331, tmp1516);
tmp1520 = _mm512_sub_ps(in339, tmp1520);
tmp1522 = _mm512_fmadd_ps(in334, _mm512_set1_ps(-1.25e+00f), tmp1522);
tmp1526 = _mm512_fmadd_ps(in342, _mm512_set1_ps(-1.25e+00f), tmp1526);
in334 = _mm512_sub_ps(in334, in330);
in342 = _mm512_sub_ps(in342, in338);
in334 = _mm512_fmadd_ps(in334, _mm512_set1_ps(5.25e+00f), tmp1516);
in342 = _mm512_fmadd_ps(in342, _mm512_set1_ps(5.25e+00f), tmp1520);
tmp1517 = _mm512_fmadd_ps(tmp1522, _mm512_set1_ps(2e+00f), tmp1515);
tmp1521 = _mm512_fmadd_ps(tmp1526, _mm512_set1_ps(2e+00f), tmp1519);
tmp1515 = _mm512_fnmadd_ps(tmp1522, _mm512_set1_ps(2e+00f), tmp1515);
tmp1519 = _mm512_fnmadd_ps(tmp1526, _mm512_set1_ps(2e+00f), tmp1519);
// Regroup the 16 second-pass results into 256-bit halves before storing.
// shuffle_f32x4 with 68 (0x44) yields [low 256 bits of a, low 256 bits of b];
// 238 (0xEE) yields [high 256 bits of a, high 256 bits of b]. Each outNNN
// therefore pairs the same half of two consecutive results.
__m512 out381 = _mm512_shuffle_f32x4(in328, tmp1524, 68);
__m512 out389 = _mm512_shuffle_f32x4(in328, tmp1524, 238);
__m512 out382 = _mm512_shuffle_f32x4(tmp1525, in332, 68);
__m512 out390 = _mm512_shuffle_f32x4(tmp1525, in332, 238);
__m512 out383 = _mm512_shuffle_f32x4(tmp1523, tmp1517, 68);
__m512 out391 = _mm512_shuffle_f32x4(tmp1523, tmp1517, 238);
__m512 out384 = _mm512_shuffle_f32x4(tmp1515, in334, 68);
__m512 out392 = _mm512_shuffle_f32x4(tmp1515, in334, 238);
__m512 out385 = _mm512_shuffle_f32x4(in336, tmp1528, 68);
__m512 out393 = _mm512_shuffle_f32x4(in336, tmp1528, 238);
__m512 out386 = _mm512_shuffle_f32x4(tmp1529, in340, 68);
__m512 out394 = _mm512_shuffle_f32x4(tmp1529, in340, 238);
__m512 out387 = _mm512_shuffle_f32x4(tmp1527, tmp1521, 68);
__m512 out395 = _mm512_shuffle_f32x4(tmp1527, tmp1521, 238);
__m512 out388 = _mm512_shuffle_f32x4(tmp1519, in342, 68);
__m512 out396 = _mm512_shuffle_f32x4(tmp1519, in342, 238);
// Unaligned stores into dfPtr2. There are four destination regions whose base
// offsets are 714240 apart (0, 714240, 1428480, 2142720); each receives four
// vectors at +0, +64, +128 and +192. Within a region the +0/+128 slots take
// the low/high halves from the in328-derived set and the +64/+192 slots take
// those from the in336-derived set.
// NOTE(review): 64 equals sizeof(__m512), so the four stores would tile one
// contiguous 256-byte span if dfPtr2 is a byte pointer -- confirm against the
// declaration of dfPtr2.
_mm512_storeu_ps(dfPtr2+0+2856960*i8+178560*j4+44544*ss2+768*k10, out381);
_mm512_storeu_ps(dfPtr2+128+2856960*i8+178560*j4+44544*ss2+768*k10, out389);
_mm512_storeu_ps(dfPtr2+64+2856960*i8+178560*j4+44544*ss2+768*k10, out385);
_mm512_storeu_ps(dfPtr2+192+2856960*i8+178560*j4+44544*ss2+768*k10, out393);
_mm512_storeu_ps(dfPtr2+714240+2856960*i8+178560*j4+44544*ss2+768*k10, out382);
_mm512_storeu_ps(dfPtr2+714368+2856960*i8+178560*j4+44544*ss2+768*k10, out390);
_mm512_storeu_ps(dfPtr2+714304+2856960*i8+178560*j4+44544*ss2+768*k10, out386);
_mm512_storeu_ps(dfPtr2+714432+2856960*i8+178560*j4+44544*ss2+768*k10, out394);
_mm512_storeu_ps(dfPtr2+1428480+2856960*i8+178560*j4+44544*ss2+768*k10, out383);
_mm512_storeu_ps(dfPtr2+1428608+2856960*i8+178560*j4+44544*ss2+768*k10, out391);
_mm512_storeu_ps(dfPtr2+1428544+2856960*i8+178560*j4+44544*ss2+768*k10, out387);
_mm512_storeu_ps(dfPtr2+1428672+2856960*i8+178560*j4+44544*ss2+768*k10, out395);
_mm512_storeu_ps(dfPtr2+2142720+2856960*i8+178560*j4+44544*ss2+768*k10, out384);
_mm512_storeu_ps(dfPtr2+2142848+2856960*i8+178560*j4+44544*ss2+768*k10, out392);
_mm512_storeu_ps(dfPtr2+2142784+2856960*i8+178560*j4+44544*ss2+768*k10, out388);
_mm512_storeu_ps(dfPtr2+2142912+2856960*i8+178560*j4+44544*ss2+768*k10, out396);
__m512 dat329 = _mm512_maskz_loadu_ps(2047, datPtr2+552+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat330 = _mm512_maskz_loadu_ps(2047, datPtr2+3496+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512i pm45 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in344 = _mm512_permutexvar_ps(pm45, dat329);
__m512 in352 = _mm512_permutexvar_ps(pm45, dat330);
__m512 dat331 = _mm512_maskz_loadu_ps(2047, datPtr2+644+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat332 = _mm512_maskz_loadu_ps(2047, datPtr2+3588+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in345 = _mm512_permutexvar_ps(pm45, dat331);
__m512 in353 = _mm512_permutexvar_ps(pm45, dat332);
__m512 dat333 = _mm512_maskz_loadu_ps(2047, datPtr2+736+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat334 = _mm512_maskz_loadu_ps(2047, datPtr2+3680+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in346 = _mm512_permutexvar_ps(pm45, dat333);
__m512 in354 = _mm512_permutexvar_ps(pm45, dat334);
__m512 dat335 = _mm512_maskz_loadu_ps(2047, datPtr2+828+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat336 = _mm512_maskz_loadu_ps(2047, datPtr2+3772+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in347 = _mm512_permutexvar_ps(pm45, dat335);
__m512 in355 = _mm512_permutexvar_ps(pm45, dat336);
__m512 dat337 = _mm512_maskz_loadu_ps(2047, datPtr2+920+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat338 = _mm512_maskz_loadu_ps(2047, datPtr2+3864+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in348 = _mm512_permutexvar_ps(pm45, dat337);
__m512 in356 = _mm512_permutexvar_ps(pm45, dat338);
__m512 dat339 = _mm512_maskz_loadu_ps(2047, datPtr2+1012+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat340 = _mm512_maskz_loadu_ps(2047, datPtr2+3956+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in349 = _mm512_permutexvar_ps(pm45, dat339);
__m512 in357 = _mm512_permutexvar_ps(pm45, dat340);
__m512 dat341 = _mm512_maskz_loadu_ps(2047, datPtr2+1104+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat342 = _mm512_maskz_loadu_ps(2047, datPtr2+4048+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in350 = _mm512_permutexvar_ps(pm45, dat341);
__m512 in358 = _mm512_permutexvar_ps(pm45, dat342);
__m512 dat343 = _mm512_maskz_loadu_ps(2047, datPtr2+1196+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat344 = _mm512_maskz_loadu_ps(2047, datPtr2+4140+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in351 = _mm512_permutexvar_ps(pm45, dat343);
__m512 in359 = _mm512_permutexvar_ps(pm45, dat344);
__m512 tmp1578 = _mm512_add_ps(in345, in349);
__m512 tmp1582 = _mm512_add_ps(in353, in357);
__m512 tmp1579 = _mm512_sub_ps(in348, in346);
__m512 tmp1583 = _mm512_sub_ps(in356, in354);
__m512 tmp1580 = _mm512_add_ps(in346, in350);
__m512 tmp1584 = _mm512_add_ps(in354, in358);
in344 = _mm512_sub_ps(in344, in350);
in352 = _mm512_sub_ps(in352, in358);
tmp1578 = _mm512_fmadd_ps(in347, _mm512_set1_ps(-4.25e+00f), tmp1578);
tmp1582 = _mm512_fmadd_ps(in355, _mm512_set1_ps(-4.25e+00f), tmp1582);
tmp1580 = _mm512_fmadd_ps(in348, _mm512_set1_ps(-4.25e+00f), tmp1580);
tmp1584 = _mm512_fmadd_ps(in356, _mm512_set1_ps(-4.25e+00f), tmp1584);
in344 = _mm512_fmadd_ps(tmp1579, _mm512_set1_ps(5.25e+00f), in344);
in352 = _mm512_fmadd_ps(tmp1583, _mm512_set1_ps(5.25e+00f), in352);
tmp1579 = _mm512_fmadd_ps(in346, _mm512_set1_ps(2.5e-01f), in350);
tmp1583 = _mm512_fmadd_ps(in354, _mm512_set1_ps(2.5e-01f), in358);
in346 = _mm512_fmadd_ps(in346, _mm512_set1_ps(4e+00f), in350);
in354 = _mm512_fmadd_ps(in354, _mm512_set1_ps(4e+00f), in358);
__m512 tmp1581 = _mm512_sub_ps(tmp1580, tmp1578);
__m512 tmp1585 = _mm512_sub_ps(tmp1584, tmp1582);
tmp1580 = _mm512_add_ps(tmp1578, tmp1580);
tmp1584 = _mm512_add_ps(tmp1582, tmp1584);
tmp1578 = _mm512_fmadd_ps(in345, _mm512_set1_ps(2.5e-01f), in349);
tmp1582 = _mm512_fmadd_ps(in353, _mm512_set1_ps(2.5e-01f), in357);
tmp1579 = _mm512_fmadd_ps(in348, _mm512_set1_ps(-1.25e+00f), tmp1579);
tmp1583 = _mm512_fmadd_ps(in356, _mm512_set1_ps(-1.25e+00f), tmp1583);
in348 = _mm512_fmadd_ps(in348, _mm512_set1_ps(-5e+00f), in346);
in356 = _mm512_fmadd_ps(in356, _mm512_set1_ps(-5e+00f), in354);
tmp1578 = _mm512_fmadd_ps(in347, _mm512_set1_ps(-1.25e+00f), tmp1578);
tmp1582 = _mm512_fmadd_ps(in355, _mm512_set1_ps(-1.25e+00f), tmp1582);
in350 = _mm512_fmadd_ps(tmp1578, _mm512_set1_ps(2e+00f), tmp1579);
in358 = _mm512_fmadd_ps(tmp1582, _mm512_set1_ps(2e+00f), tmp1583);
tmp1579 = _mm512_fnmadd_ps(tmp1578, _mm512_set1_ps(2e+00f), tmp1579);
tmp1583 = _mm512_fnmadd_ps(tmp1582, _mm512_set1_ps(2e+00f), tmp1583);
tmp1578 = _mm512_fmadd_ps(in349, _mm512_set1_ps(2.5e-01f), in345);
tmp1582 = _mm512_fmadd_ps(in357, _mm512_set1_ps(2.5e-01f), in353);
in345 = _mm512_sub_ps(in351, in345);
in353 = _mm512_sub_ps(in359, in353);
tmp1578 = _mm512_fmadd_ps(in347, _mm512_set1_ps(-1.25e+00f), tmp1578);
tmp1582 = _mm512_fmadd_ps(in355, _mm512_set1_ps(-1.25e+00f), tmp1582);
in347 = _mm512_sub_ps(in347, in349);
in355 = _mm512_sub_ps(in355, in357);
in347 = _mm512_fmadd_ps(in347, _mm512_set1_ps(5.25e+00f), in345);
in355 = _mm512_fmadd_ps(in355, _mm512_set1_ps(5.25e+00f), in353);
in346 = _mm512_fmadd_ps(tmp1578, _mm512_set1_ps(2e+00f), in348);
in354 = _mm512_fmadd_ps(tmp1582, _mm512_set1_ps(2e+00f), in356);
in348 = _mm512_fnmadd_ps(tmp1578, _mm512_set1_ps(2e+00f), in348);
in356 = _mm512_fnmadd_ps(tmp1582, _mm512_set1_ps(2e+00f), in356);
__m512 tmp1595 = _mm512_unpacklo_ps(in344, tmp1580);
__m512 tmp1596 = _mm512_unpackhi_ps(in344, tmp1580);
__m512 tmp1597 = _mm512_unpacklo_ps(tmp1581, in350);
__m512 tmp1598 = _mm512_unpackhi_ps(tmp1581, in350);
__m512 tmp1599 = _mm512_unpacklo_ps(tmp1579, in346);
__m512 tmp1600 = _mm512_unpackhi_ps(tmp1579, in346);
__m512 tmp1601 = _mm512_unpacklo_ps(in348, in347);
__m512 tmp1602 = _mm512_unpackhi_ps(in348, in347);
__m512 tmp1603 = _mm512_unpacklo_ps(in352, tmp1584);
__m512 tmp1604 = _mm512_unpackhi_ps(in352, tmp1584);
__m512 tmp1605 = _mm512_unpacklo_ps(tmp1585, in358);
__m512 tmp1606 = _mm512_unpackhi_ps(tmp1585, in358);
__m512 tmp1607 = _mm512_unpacklo_ps(tmp1583, in354);
__m512 tmp1608 = _mm512_unpackhi_ps(tmp1583, in354);
__m512 tmp1609 = _mm512_unpacklo_ps(in356, in355);
__m512 tmp1610 = _mm512_unpackhi_ps(in356, in355);
__m512 tmp1611 = _mm512_shuffle_ps(tmp1595, tmp1597, 68);
__m512 tmp1612 = _mm512_shuffle_ps(tmp1595, tmp1597, 238);
__m512 tmp1613 = _mm512_shuffle_ps(tmp1596, tmp1598, 68);
__m512 tmp1614 = _mm512_shuffle_ps(tmp1596, tmp1598, 238);
__m512 tmp1615 = _mm512_shuffle_ps(tmp1599, tmp1601, 68);
__m512 tmp1616 = _mm512_shuffle_ps(tmp1599, tmp1601, 238);
__m512 tmp1617 = _mm512_shuffle_ps(tmp1600, tmp1602, 68);
__m512 tmp1618 = _mm512_shuffle_ps(tmp1600, tmp1602, 238);
__m512 tmp1619 = _mm512_shuffle_ps(tmp1603, tmp1605, 68);
__m512 tmp1620 = _mm512_shuffle_ps(tmp1603, tmp1605, 238);
__m512 tmp1621 = _mm512_shuffle_ps(tmp1604, tmp1606, 68);
__m512 tmp1622 = _mm512_shuffle_ps(tmp1604, tmp1606, 238);
__m512 tmp1623 = _mm512_shuffle_ps(tmp1607, tmp1609, 68);
__m512 tmp1624 = _mm512_shuffle_ps(tmp1607, tmp1609, 238);
__m512 tmp1625 = _mm512_shuffle_ps(tmp1608, tmp1610, 68);
__m512 tmp1626 = _mm512_shuffle_ps(tmp1608, tmp1610, 238);
__m512 tmp1627 = _mm512_shuffle_f32x4(tmp1611, tmp1615, 136);
__m512 tmp1628 = _mm512_shuffle_f32x4(tmp1611, tmp1615, 221);
__m512 tmp1629 = _mm512_shuffle_f32x4(tmp1612, tmp1616, 136);
__m512 tmp1630 = _mm512_shuffle_f32x4(tmp1612, tmp1616, 221);
__m512 tmp1631 = _mm512_shuffle_f32x4(tmp1613, tmp1617, 136);
__m512 tmp1632 = _mm512_shuffle_f32x4(tmp1613, tmp1617, 221);
__m512 tmp1633 = _mm512_shuffle_f32x4(tmp1614, tmp1618, 136);
__m512 tmp1634 = _mm512_shuffle_f32x4(tmp1614, tmp1618, 221);
__m512 tmp1635 = _mm512_shuffle_f32x4(tmp1619, tmp1623, 136);
__m512 tmp1636 = _mm512_shuffle_f32x4(tmp1619, tmp1623, 221);
__m512 tmp1637 = _mm512_shuffle_f32x4(tmp1620, tmp1624, 136);
__m512 tmp1638 = _mm512_shuffle_f32x4(tmp1620, tmp1624, 221);
__m512 tmp1639 = _mm512_shuffle_f32x4(tmp1621, tmp1625, 136);
__m512 tmp1640 = _mm512_shuffle_f32x4(tmp1621, tmp1625, 221);
__m512 tmp1641 = _mm512_shuffle_f32x4(tmp1622, tmp1626, 136);
__m512 tmp1642 = _mm512_shuffle_f32x4(tmp1622, tmp1626, 221);
in344 = _mm512_shuffle_f32x4(tmp1627, tmp1635, 136);
in352 = _mm512_shuffle_f32x4(tmp1627, tmp1635, 221);
tmp1580 = _mm512_shuffle_f32x4(tmp1629, tmp1637, 136);
tmp1584 = _mm512_shuffle_f32x4(tmp1629, tmp1637, 221);
tmp1581 = _mm512_shuffle_f32x4(tmp1631, tmp1639, 136);
tmp1585 = _mm512_shuffle_f32x4(tmp1631, tmp1639, 221);
in350 = _mm512_shuffle_f32x4(tmp1633, tmp1641, 136);
in358 = _mm512_shuffle_f32x4(tmp1633, tmp1641, 221);
tmp1579 = _mm512_shuffle_f32x4(tmp1628, tmp1636, 136);
tmp1583 = _mm512_shuffle_f32x4(tmp1628, tmp1636, 221);
in346 = _mm512_shuffle_f32x4(tmp1630, tmp1638, 136);
in348 = _mm512_shuffle_f32x4(tmp1632, tmp1640, 136);
in347 = _mm512_shuffle_f32x4(tmp1634, tmp1642, 136);
// Arithmetic pass over the registers produced by the shuffle network above.
// Two chains are interleaved line by line:
//   low  chain inputs: in344, tmp1580, tmp1581, in350, tmp1579, in346, in348, in347
//   high chain inputs: in352, tmp1584, tmp1585, in358, tmp1583
// The high chain has no counterparts for in346, in348 and in347 (only their
// "136" selections are formed above), so the matching terms are left out:
// a plain copy replaces an add, mul replaces fmadd, and 0 - x replaces a
// subtraction.
// NOTE(review): presumably those three inputs are known to be zero for this
// tile position, and this is the second (other-axis) pass of a separable
// transform -- both inferred from the code shape; confirm against the generator.
// Results are left in in344, tmp1588, tmp1589, in348, tmp1587, tmp1581,
// tmp1579, in350 (low) and in352, tmp1592, tmp1593, tmp1594, tmp1591,
// tmp1585, tmp1583, in358 (high).
__m512 tmp1586 = _mm512_add_ps(tmp1580, in346);
__m512 tmp1590 = tmp1584;
__m512 tmp1587 = _mm512_sub_ps(tmp1579, tmp1581);
__m512 tmp1591 = _mm512_sub_ps(tmp1583, tmp1585);
__m512 tmp1588 = _mm512_add_ps(tmp1581, in348);
__m512 tmp1592 = tmp1585;
in344 = _mm512_sub_ps(in344, in348);
// No-op self-assignment: the high chain has nothing to subtract here.
in352 = in352;
tmp1586 = _mm512_fmadd_ps(in350, _mm512_set1_ps(-4.25e+00f), tmp1586);
tmp1590 = _mm512_fmadd_ps(in358, _mm512_set1_ps(-4.25e+00f), tmp1590);
tmp1588 = _mm512_fmadd_ps(tmp1579, _mm512_set1_ps(-4.25e+00f), tmp1588);
tmp1592 = _mm512_fmadd_ps(tmp1583, _mm512_set1_ps(-4.25e+00f), tmp1592);
in344 = _mm512_fmadd_ps(tmp1587, _mm512_set1_ps(5.25e+00f), in344);
in352 = _mm512_fmadd_ps(tmp1591, _mm512_set1_ps(5.25e+00f), in352);
tmp1587 = _mm512_fmadd_ps(tmp1581, _mm512_set1_ps(2.5e-01f), in348);
tmp1591 = _mm512_mul_ps(tmp1585, _mm512_set1_ps(2.5e-01f));
tmp1581 = _mm512_fmadd_ps(tmp1581, _mm512_set1_ps(4e+00f), in348);
tmp1585 = _mm512_mul_ps(tmp1585, _mm512_set1_ps(4e+00f));
__m512 tmp1589 = _mm512_sub_ps(tmp1588, tmp1586);
__m512 tmp1593 = _mm512_sub_ps(tmp1592, tmp1590);
tmp1588 = _mm512_add_ps(tmp1586, tmp1588);
tmp1592 = _mm512_add_ps(tmp1590, tmp1592);
tmp1586 = _mm512_fmadd_ps(tmp1580, _mm512_set1_ps(2.5e-01f), in346);
tmp1590 = _mm512_mul_ps(tmp1584, _mm512_set1_ps(2.5e-01f));
tmp1587 = _mm512_fmadd_ps(tmp1579, _mm512_set1_ps(-1.25e+00f), tmp1587);
tmp1591 = _mm512_fmadd_ps(tmp1583, _mm512_set1_ps(-1.25e+00f), tmp1591);
tmp1579 = _mm512_fmadd_ps(tmp1579, _mm512_set1_ps(-5e+00f), tmp1581);
tmp1583 = _mm512_fmadd_ps(tmp1583, _mm512_set1_ps(-5e+00f), tmp1585);
tmp1586 = _mm512_fmadd_ps(in350, _mm512_set1_ps(-1.25e+00f), tmp1586);
tmp1590 = _mm512_fmadd_ps(in358, _mm512_set1_ps(-1.25e+00f), tmp1590);
// fmadd/fnmadd pairs below produce the sum and the difference of the same two terms.
in348 = _mm512_fmadd_ps(tmp1586, _mm512_set1_ps(2e+00f), tmp1587);
// The high chain needs a fresh variable here because in348 has no high counterpart.
__m512 tmp1594 = _mm512_fmadd_ps(tmp1590, _mm512_set1_ps(2e+00f), tmp1591);
tmp1587 = _mm512_fnmadd_ps(tmp1586, _mm512_set1_ps(2e+00f), tmp1587);
tmp1591 = _mm512_fnmadd_ps(tmp1590, _mm512_set1_ps(2e+00f), tmp1591);
tmp1586 = _mm512_fmadd_ps(in346, _mm512_set1_ps(2.5e-01f), tmp1580);
tmp1590 = tmp1584;
tmp1580 = _mm512_sub_ps(in347, tmp1580);
// High chain: 0 - tmp1584 stands in for the subtraction on the low-chain line above.
tmp1584 = _mm512_sub_ps(_mm512_setzero_ps(), tmp1584);
tmp1586 = _mm512_fmadd_ps(in350, _mm512_set1_ps(-1.25e+00f), tmp1586);
tmp1590 = _mm512_fmadd_ps(in358, _mm512_set1_ps(-1.25e+00f), tmp1590);
in350 = _mm512_sub_ps(in350, in346);
// No-op self-assignment: the high chain has nothing to subtract here.
in358 = in358;
in350 = _mm512_fmadd_ps(in350, _mm512_set1_ps(5.25e+00f), tmp1580);
in358 = _mm512_fmadd_ps(in358, _mm512_set1_ps(5.25e+00f), tmp1584);
tmp1581 = _mm512_fmadd_ps(tmp1586, _mm512_set1_ps(2e+00f), tmp1579);
tmp1585 = _mm512_fmadd_ps(tmp1590, _mm512_set1_ps(2e+00f), tmp1583);
tmp1579 = _mm512_fnmadd_ps(tmp1586, _mm512_set1_ps(2e+00f), tmp1579);
tmp1583 = _mm512_fnmadd_ps(tmp1590, _mm512_set1_ps(2e+00f), tmp1583);
// Pack the 16 results pairwise into 16 output vectors. With selector 68 the
// result is the low 256 bits of the first operand followed by the low 256
// bits of the second; with selector 238 it is the two high 256-bit halves.
__m512 out397 = _mm512_shuffle_f32x4(in344, tmp1588, 68);
__m512 out405 = _mm512_shuffle_f32x4(in344, tmp1588, 238);
__m512 out398 = _mm512_shuffle_f32x4(tmp1589, in348, 68);
__m512 out406 = _mm512_shuffle_f32x4(tmp1589, in348, 238);
__m512 out399 = _mm512_shuffle_f32x4(tmp1587, tmp1581, 68);
__m512 out407 = _mm512_shuffle_f32x4(tmp1587, tmp1581, 238);
__m512 out400 = _mm512_shuffle_f32x4(tmp1579, in350, 68);
__m512 out408 = _mm512_shuffle_f32x4(tmp1579, in350, 238);
__m512 out401 = _mm512_shuffle_f32x4(in352, tmp1592, 68);
__m512 out409 = _mm512_shuffle_f32x4(in352, tmp1592, 238);
__m512 out402 = _mm512_shuffle_f32x4(tmp1593, tmp1594, 68);
__m512 out410 = _mm512_shuffle_f32x4(tmp1593, tmp1594, 238);
__m512 out403 = _mm512_shuffle_f32x4(tmp1591, tmp1585, 68);
__m512 out411 = _mm512_shuffle_f32x4(tmp1591, tmp1585, 238);
__m512 out404 = _mm512_shuffle_f32x4(tmp1583, in358, 68);
__m512 out412 = _mm512_shuffle_f32x4(tmp1583, in358, 238);
// Unaligned stores through dfPtr2. All share the index term
// 2856960*i8+178560*j4+44544*ss2+768*k10. The constant parts form four
// groups starting at 256, 714496, 1428736 and 2142976 (714240 apart); inside
// a group the stores are issued at +0, +128, +64, +192.
// NOTE(review): the 64-unit spacing suggests byte offsets (one 512-bit
// vector = 64 bytes) -- confirm that dfPtr2 is a byte pointer.
_mm512_storeu_ps(dfPtr2+256+2856960*i8+178560*j4+44544*ss2+768*k10, out397);
_mm512_storeu_ps(dfPtr2+384+2856960*i8+178560*j4+44544*ss2+768*k10, out405);
_mm512_storeu_ps(dfPtr2+320+2856960*i8+178560*j4+44544*ss2+768*k10, out401);
_mm512_storeu_ps(dfPtr2+448+2856960*i8+178560*j4+44544*ss2+768*k10, out409);
_mm512_storeu_ps(dfPtr2+714496+2856960*i8+178560*j4+44544*ss2+768*k10, out398);
_mm512_storeu_ps(dfPtr2+714624+2856960*i8+178560*j4+44544*ss2+768*k10, out406);
_mm512_storeu_ps(dfPtr2+714560+2856960*i8+178560*j4+44544*ss2+768*k10, out402);
_mm512_storeu_ps(dfPtr2+714688+2856960*i8+178560*j4+44544*ss2+768*k10, out410);
_mm512_storeu_ps(dfPtr2+1428736+2856960*i8+178560*j4+44544*ss2+768*k10, out399);
_mm512_storeu_ps(dfPtr2+1428864+2856960*i8+178560*j4+44544*ss2+768*k10, out407);
_mm512_storeu_ps(dfPtr2+1428800+2856960*i8+178560*j4+44544*ss2+768*k10, out403);
_mm512_storeu_ps(dfPtr2+1428928+2856960*i8+178560*j4+44544*ss2+768*k10, out411);
_mm512_storeu_ps(dfPtr2+2142976+2856960*i8+178560*j4+44544*ss2+768*k10, out400);
_mm512_storeu_ps(dfPtr2+2143104+2856960*i8+178560*j4+44544*ss2+768*k10, out408);
_mm512_storeu_ps(dfPtr2+2143040+2856960*i8+178560*j4+44544*ss2+768*k10, out404);
_mm512_storeu_ps(dfPtr2+2143168+2856960*i8+178560*j4+44544*ss2+768*k10, out412);
// Load stage: eight pairs of zero-masked loads from datPtr2, each followed by
// a lane permutation. Successive pairs are 92 apart (4000, 4092, ..., 4644)
// and the second load of a pair is 48 past the first.
// Mask 16383 keeps lanes 0..13 and mask 2047 keeps lanes 0..10; all other
// lanes are zeroed by the maskz load.
// NOTE(review): 92 = 23*4 and 48 = 12*4, so these look like byte offsets of
// consecutive rows of a Width=23 float image, with the second load covering
// the last 11 columns -- confirm datPtr2's type and the layout.
__m512 dat345 = _mm512_maskz_loadu_ps(16383, datPtr2+4000+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat346 = _mm512_maskz_loadu_ps(2047, datPtr2+4048+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
// pm46: lanes 0..7 take elements 0..7 and lanes 8..15 take elements 6..13,
// so the upper half repeats elements 6 and 7 of the lower half.
__m512i pm46 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in360 = _mm512_permutexvar_ps(pm46, dat345);
// pm47: like pm46 for lanes 0..12; lanes 13..15 take element 15, which the
// 2047 mask has already zeroed in the vectors this permutation is applied to.
__m512i pm47 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in368 = _mm512_permutexvar_ps(pm47, dat346);
__m512 dat347 = _mm512_maskz_loadu_ps(16383, datPtr2+4092+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat348 = _mm512_maskz_loadu_ps(2047, datPtr2+4140+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in361 = _mm512_permutexvar_ps(pm46, dat347);
__m512 in369 = _mm512_permutexvar_ps(pm47, dat348);
__m512 dat349 = _mm512_maskz_loadu_ps(16383, datPtr2+4184+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat350 = _mm512_maskz_loadu_ps(2047, datPtr2+4232+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in362 = _mm512_permutexvar_ps(pm46, dat349);
__m512 in370 = _mm512_permutexvar_ps(pm47, dat350);
__m512 dat351 = _mm512_maskz_loadu_ps(16383, datPtr2+4276+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat352 = _mm512_maskz_loadu_ps(2047, datPtr2+4324+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in363 = _mm512_permutexvar_ps(pm46, dat351);
__m512 in371 = _mm512_permutexvar_ps(pm47, dat352);
__m512 dat353 = _mm512_maskz_loadu_ps(16383, datPtr2+4368+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat354 = _mm512_maskz_loadu_ps(2047, datPtr2+4416+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in364 = _mm512_permutexvar_ps(pm46, dat353);
__m512 in372 = _mm512_permutexvar_ps(pm47, dat354);
__m512 dat355 = _mm512_maskz_loadu_ps(16383, datPtr2+4460+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat356 = _mm512_maskz_loadu_ps(2047, datPtr2+4508+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in365 = _mm512_permutexvar_ps(pm46, dat355);
__m512 in373 = _mm512_permutexvar_ps(pm47, dat356);
__m512 dat357 = _mm512_maskz_loadu_ps(16383, datPtr2+4552+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat358 = _mm512_maskz_loadu_ps(2047, datPtr2+4600+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in366 = _mm512_permutexvar_ps(pm46, dat357);
__m512 in374 = _mm512_permutexvar_ps(pm47, dat358);
__m512 dat359 = _mm512_maskz_loadu_ps(16383, datPtr2+4644+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat360 = _mm512_maskz_loadu_ps(2047, datPtr2+4692+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in367 = _mm512_permutexvar_ps(pm46, dat359);
__m512 in375 = _mm512_permutexvar_ps(pm47, dat360);
// First arithmetic pass: combines the eight loaded vectors in360..in367 and,
// interleaved line by line, the eight vectors in368..in375, using
// add/sub/FMA with the constants -4.25, 5.25, 0.25, 4, -1.25, -5 and 2.
// Each chain leaves eight results, in this order:
//   in360, tmp1645, tmp1646, in366, tmp1644, in362, in364, in363
//   in368, tmp1649, tmp1650, in374, tmp1648, in370, in372, in371
// NOTE(review): the coefficients match the 8-point input transform of
// Winograd F(6,3) convolution -- inferred from the constants only; confirm
// against the generator.
__m512 tmp1643 = _mm512_add_ps(in361, in365);
__m512 tmp1647 = _mm512_add_ps(in369, in373);
__m512 tmp1644 = _mm512_sub_ps(in364, in362);
__m512 tmp1648 = _mm512_sub_ps(in372, in370);
__m512 tmp1645 = _mm512_add_ps(in362, in366);
__m512 tmp1649 = _mm512_add_ps(in370, in374);
in360 = _mm512_sub_ps(in360, in366);
in368 = _mm512_sub_ps(in368, in374);
tmp1643 = _mm512_fmadd_ps(in363, _mm512_set1_ps(-4.25e+00f), tmp1643);
tmp1647 = _mm512_fmadd_ps(in371, _mm512_set1_ps(-4.25e+00f), tmp1647);
tmp1645 = _mm512_fmadd_ps(in364, _mm512_set1_ps(-4.25e+00f), tmp1645);
tmp1649 = _mm512_fmadd_ps(in372, _mm512_set1_ps(-4.25e+00f), tmp1649);
in360 = _mm512_fmadd_ps(tmp1644, _mm512_set1_ps(5.25e+00f), in360);
in368 = _mm512_fmadd_ps(tmp1648, _mm512_set1_ps(5.25e+00f), in368);
tmp1644 = _mm512_fmadd_ps(in362, _mm512_set1_ps(2.5e-01f), in366);
tmp1648 = _mm512_fmadd_ps(in370, _mm512_set1_ps(2.5e-01f), in374);
in362 = _mm512_fmadd_ps(in362, _mm512_set1_ps(4e+00f), in366);
in370 = _mm512_fmadd_ps(in370, _mm512_set1_ps(4e+00f), in374);
// Difference and sum of the two partial sums built above.
__m512 tmp1646 = _mm512_sub_ps(tmp1645, tmp1643);
__m512 tmp1650 = _mm512_sub_ps(tmp1649, tmp1647);
tmp1645 = _mm512_add_ps(tmp1643, tmp1645);
tmp1649 = _mm512_add_ps(tmp1647, tmp1649);
tmp1643 = _mm512_fmadd_ps(in361, _mm512_set1_ps(2.5e-01f), in365);
tmp1647 = _mm512_fmadd_ps(in369, _mm512_set1_ps(2.5e-01f), in373);
tmp1644 = _mm512_fmadd_ps(in364, _mm512_set1_ps(-1.25e+00f), tmp1644);
tmp1648 = _mm512_fmadd_ps(in372, _mm512_set1_ps(-1.25e+00f), tmp1648);
in364 = _mm512_fmadd_ps(in364, _mm512_set1_ps(-5e+00f), in362);
in372 = _mm512_fmadd_ps(in372, _mm512_set1_ps(-5e+00f), in370);
tmp1643 = _mm512_fmadd_ps(in363, _mm512_set1_ps(-1.25e+00f), tmp1643);
tmp1647 = _mm512_fmadd_ps(in371, _mm512_set1_ps(-1.25e+00f), tmp1647);
// fmadd/fnmadd pairs produce c + 2*t and c - 2*t from the same two terms.
in366 = _mm512_fmadd_ps(tmp1643, _mm512_set1_ps(2e+00f), tmp1644);
in374 = _mm512_fmadd_ps(tmp1647, _mm512_set1_ps(2e+00f), tmp1648);
tmp1644 = _mm512_fnmadd_ps(tmp1643, _mm512_set1_ps(2e+00f), tmp1644);
tmp1648 = _mm512_fnmadd_ps(tmp1647, _mm512_set1_ps(2e+00f), tmp1648);
tmp1643 = _mm512_fmadd_ps(in365, _mm512_set1_ps(2.5e-01f), in361);
tmp1647 = _mm512_fmadd_ps(in373, _mm512_set1_ps(2.5e-01f), in369);
in361 = _mm512_sub_ps(in367, in361);
in369 = _mm512_sub_ps(in375, in369);
tmp1643 = _mm512_fmadd_ps(in363, _mm512_set1_ps(-1.25e+00f), tmp1643);
tmp1647 = _mm512_fmadd_ps(in371, _mm512_set1_ps(-1.25e+00f), tmp1647);
in363 = _mm512_sub_ps(in363, in365);
in371 = _mm512_sub_ps(in371, in373);
in363 = _mm512_fmadd_ps(in363, _mm512_set1_ps(5.25e+00f), in361);
in371 = _mm512_fmadd_ps(in371, _mm512_set1_ps(5.25e+00f), in369);
in362 = _mm512_fmadd_ps(tmp1643, _mm512_set1_ps(2e+00f), in364);
in370 = _mm512_fmadd_ps(tmp1647, _mm512_set1_ps(2e+00f), in372);
in364 = _mm512_fnmadd_ps(tmp1643, _mm512_set1_ps(2e+00f), in364);
in372 = _mm512_fnmadd_ps(tmp1647, _mm512_set1_ps(2e+00f), in372);
// 16x16 transpose of the sixteen vectors
//   in360, tmp1645, tmp1646, in366, tmp1644, in362, in364, in363,
//   in368, tmp1649, tmp1650, in374, tmp1648, in370, in372, in371.
// Afterwards the k-th name in that list holds lane k of every input, with
// the inputs appearing in the same list order across its 16 lanes.
// Stage 1: unpacklo/unpackhi interleave pairs of vectors inside each 128-bit lane.
__m512 tmp1659 = _mm512_unpacklo_ps(in360, tmp1645);
__m512 tmp1660 = _mm512_unpackhi_ps(in360, tmp1645);
__m512 tmp1661 = _mm512_unpacklo_ps(tmp1646, in366);
__m512 tmp1662 = _mm512_unpackhi_ps(tmp1646, in366);
__m512 tmp1663 = _mm512_unpacklo_ps(tmp1644, in362);
__m512 tmp1664 = _mm512_unpackhi_ps(tmp1644, in362);
__m512 tmp1665 = _mm512_unpacklo_ps(in364, in363);
__m512 tmp1666 = _mm512_unpackhi_ps(in364, in363);
__m512 tmp1667 = _mm512_unpacklo_ps(in368, tmp1649);
__m512 tmp1668 = _mm512_unpackhi_ps(in368, tmp1649);
__m512 tmp1669 = _mm512_unpacklo_ps(tmp1650, in374);
__m512 tmp1670 = _mm512_unpackhi_ps(tmp1650, in374);
__m512 tmp1671 = _mm512_unpacklo_ps(tmp1648, in370);
__m512 tmp1672 = _mm512_unpackhi_ps(tmp1648, in370);
__m512 tmp1673 = _mm512_unpacklo_ps(in372, in371);
__m512 tmp1674 = _mm512_unpackhi_ps(in372, in371);
// Stage 2: shuffle_ps with 68 / 238 takes the lower / upper element pair of
// both operands per 128-bit lane, completing a 4x4 transpose in every lane.
__m512 tmp1675 = _mm512_shuffle_ps(tmp1659, tmp1661, 68);
__m512 tmp1676 = _mm512_shuffle_ps(tmp1659, tmp1661, 238);
__m512 tmp1677 = _mm512_shuffle_ps(tmp1660, tmp1662, 68);
__m512 tmp1678 = _mm512_shuffle_ps(tmp1660, tmp1662, 238);
__m512 tmp1679 = _mm512_shuffle_ps(tmp1663, tmp1665, 68);
__m512 tmp1680 = _mm512_shuffle_ps(tmp1663, tmp1665, 238);
__m512 tmp1681 = _mm512_shuffle_ps(tmp1664, tmp1666, 68);
__m512 tmp1682 = _mm512_shuffle_ps(tmp1664, tmp1666, 238);
__m512 tmp1683 = _mm512_shuffle_ps(tmp1667, tmp1669, 68);
__m512 tmp1684 = _mm512_shuffle_ps(tmp1667, tmp1669, 238);
__m512 tmp1685 = _mm512_shuffle_ps(tmp1668, tmp1670, 68);
__m512 tmp1686 = _mm512_shuffle_ps(tmp1668, tmp1670, 238);
__m512 tmp1687 = _mm512_shuffle_ps(tmp1671, tmp1673, 68);
__m512 tmp1688 = _mm512_shuffle_ps(tmp1671, tmp1673, 238);
__m512 tmp1689 = _mm512_shuffle_ps(tmp1672, tmp1674, 68);
__m512 tmp1690 = _mm512_shuffle_ps(tmp1672, tmp1674, 238);
// Stage 3: shuffle_f32x4 with 136 / 221 gathers the even / odd 128-bit
// lanes of both operands.
__m512 tmp1691 = _mm512_shuffle_f32x4(tmp1675, tmp1679, 136);
__m512 tmp1692 = _mm512_shuffle_f32x4(tmp1675, tmp1679, 221);
__m512 tmp1693 = _mm512_shuffle_f32x4(tmp1676, tmp1680, 136);
__m512 tmp1694 = _mm512_shuffle_f32x4(tmp1676, tmp1680, 221);
__m512 tmp1695 = _mm512_shuffle_f32x4(tmp1677, tmp1681, 136);
__m512 tmp1696 = _mm512_shuffle_f32x4(tmp1677, tmp1681, 221);
__m512 tmp1697 = _mm512_shuffle_f32x4(tmp1678, tmp1682, 136);
__m512 tmp1698 = _mm512_shuffle_f32x4(tmp1678, tmp1682, 221);
__m512 tmp1699 = _mm512_shuffle_f32x4(tmp1683, tmp1687, 136);
__m512 tmp1700 = _mm512_shuffle_f32x4(tmp1683, tmp1687, 221);
__m512 tmp1701 = _mm512_shuffle_f32x4(tmp1684, tmp1688, 136);
__m512 tmp1702 = _mm512_shuffle_f32x4(tmp1684, tmp1688, 221);
__m512 tmp1703 = _mm512_shuffle_f32x4(tmp1685, tmp1689, 136);
__m512 tmp1704 = _mm512_shuffle_f32x4(tmp1685, tmp1689, 221);
__m512 tmp1705 = _mm512_shuffle_f32x4(tmp1686, tmp1690, 136);
__m512 tmp1706 = _mm512_shuffle_f32x4(tmp1686, tmp1690, 221);
// Stage 4: the same 136 / 221 selection merges the first-eight and
// last-eight groups; the results overwrite the input variables.
in360 = _mm512_shuffle_f32x4(tmp1691, tmp1699, 136);
in368 = _mm512_shuffle_f32x4(tmp1691, tmp1699, 221);
tmp1645 = _mm512_shuffle_f32x4(tmp1693, tmp1701, 136);
tmp1649 = _mm512_shuffle_f32x4(tmp1693, tmp1701, 221);
tmp1646 = _mm512_shuffle_f32x4(tmp1695, tmp1703, 136);
tmp1650 = _mm512_shuffle_f32x4(tmp1695, tmp1703, 221);
in366 = _mm512_shuffle_f32x4(tmp1697, tmp1705, 136);
in374 = _mm512_shuffle_f32x4(tmp1697, tmp1705, 221);
tmp1644 = _mm512_shuffle_f32x4(tmp1692, tmp1700, 136);
tmp1648 = _mm512_shuffle_f32x4(tmp1692, tmp1700, 221);
in362 = _mm512_shuffle_f32x4(tmp1694, tmp1702, 136);
in370 = _mm512_shuffle_f32x4(tmp1694, tmp1702, 221);
in364 = _mm512_shuffle_f32x4(tmp1696, tmp1704, 136);
in372 = _mm512_shuffle_f32x4(tmp1696, tmp1704, 221);
in363 = _mm512_shuffle_f32x4(tmp1698, tmp1706, 136);
in371 = _mm512_shuffle_f32x4(tmp1698, tmp1706, 221);
// Second arithmetic pass: the same add/sub/FMA combination (constants
// -4.25, 5.25, 0.25, 4, -1.25, -5, 2) applied to the transposed vectors, so
// it runs along the other axis of the data. Two chains are interleaved:
//   inputs : in360, tmp1645, tmp1646, in366, tmp1644, in362, in364, in363
//            in368, tmp1649, tmp1650, in374, tmp1648, in370, in372, in371
//   results: in360, tmp1653, tmp1654, in364, tmp1652, tmp1646, tmp1644, in366
//            in368, tmp1657, tmp1658, in372, tmp1656, tmp1650, tmp1648, in374
__m512 tmp1651 = _mm512_add_ps(tmp1645, in362);
__m512 tmp1655 = _mm512_add_ps(tmp1649, in370);
__m512 tmp1652 = _mm512_sub_ps(tmp1644, tmp1646);
__m512 tmp1656 = _mm512_sub_ps(tmp1648, tmp1650);
__m512 tmp1653 = _mm512_add_ps(tmp1646, in364);
__m512 tmp1657 = _mm512_add_ps(tmp1650, in372);
in360 = _mm512_sub_ps(in360, in364);
in368 = _mm512_sub_ps(in368, in372);
tmp1651 = _mm512_fmadd_ps(in366, _mm512_set1_ps(-4.25e+00f), tmp1651);
tmp1655 = _mm512_fmadd_ps(in374, _mm512_set1_ps(-4.25e+00f), tmp1655);
tmp1653 = _mm512_fmadd_ps(tmp1644, _mm512_set1_ps(-4.25e+00f), tmp1653);
tmp1657 = _mm512_fmadd_ps(tmp1648, _mm512_set1_ps(-4.25e+00f), tmp1657);
in360 = _mm512_fmadd_ps(tmp1652, _mm512_set1_ps(5.25e+00f), in360);
in368 = _mm512_fmadd_ps(tmp1656, _mm512_set1_ps(5.25e+00f), in368);
tmp1652 = _mm512_fmadd_ps(tmp1646, _mm512_set1_ps(2.5e-01f), in364);
tmp1656 = _mm512_fmadd_ps(tmp1650, _mm512_set1_ps(2.5e-01f), in372);
tmp1646 = _mm512_fmadd_ps(tmp1646, _mm512_set1_ps(4e+00f), in364);
tmp1650 = _mm512_fmadd_ps(tmp1650, _mm512_set1_ps(4e+00f), in372);
// Difference and sum of the two partial sums built above.
__m512 tmp1654 = _mm512_sub_ps(tmp1653, tmp1651);
__m512 tmp1658 = _mm512_sub_ps(tmp1657, tmp1655);
tmp1653 = _mm512_add_ps(tmp1651, tmp1653);
tmp1657 = _mm512_add_ps(tmp1655, tmp1657);
tmp1651 = _mm512_fmadd_ps(tmp1645, _mm512_set1_ps(2.5e-01f), in362);
tmp1655 = _mm512_fmadd_ps(tmp1649, _mm512_set1_ps(2.5e-01f), in370);
tmp1652 = _mm512_fmadd_ps(tmp1644, _mm512_set1_ps(-1.25e+00f), tmp1652);
tmp1656 = _mm512_fmadd_ps(tmp1648, _mm512_set1_ps(-1.25e+00f), tmp1656);
tmp1644 = _mm512_fmadd_ps(tmp1644, _mm512_set1_ps(-5e+00f), tmp1646);
tmp1648 = _mm512_fmadd_ps(tmp1648, _mm512_set1_ps(-5e+00f), tmp1650);
tmp1651 = _mm512_fmadd_ps(in366, _mm512_set1_ps(-1.25e+00f), tmp1651);
tmp1655 = _mm512_fmadd_ps(in374, _mm512_set1_ps(-1.25e+00f), tmp1655);
// fmadd/fnmadd pairs produce c + 2*t and c - 2*t from the same two terms.
in364 = _mm512_fmadd_ps(tmp1651, _mm512_set1_ps(2e+00f), tmp1652);
in372 = _mm512_fmadd_ps(tmp1655, _mm512_set1_ps(2e+00f), tmp1656);
tmp1652 = _mm512_fnmadd_ps(tmp1651, _mm512_set1_ps(2e+00f), tmp1652);
tmp1656 = _mm512_fnmadd_ps(tmp1655, _mm512_set1_ps(2e+00f), tmp1656);
tmp1651 = _mm512_fmadd_ps(in362, _mm512_set1_ps(2.5e-01f), tmp1645);
tmp1655 = _mm512_fmadd_ps(in370, _mm512_set1_ps(2.5e-01f), tmp1649);
tmp1645 = _mm512_sub_ps(in363, tmp1645);
tmp1649 = _mm512_sub_ps(in371, tmp1649);
tmp1651 = _mm512_fmadd_ps(in366, _mm512_set1_ps(-1.25e+00f), tmp1651);
tmp1655 = _mm512_fmadd_ps(in374, _mm512_set1_ps(-1.25e+00f), tmp1655);
in366 = _mm512_sub_ps(in366, in362);
in374 = _mm512_sub_ps(in374, in370);
in366 = _mm512_fmadd_ps(in366, _mm512_set1_ps(5.25e+00f), tmp1645);
in374 = _mm512_fmadd_ps(in374, _mm512_set1_ps(5.25e+00f), tmp1649);
tmp1646 = _mm512_fmadd_ps(tmp1651, _mm512_set1_ps(2e+00f), tmp1644);
tmp1650 = _mm512_fmadd_ps(tmp1655, _mm512_set1_ps(2e+00f), tmp1648);
tmp1644 = _mm512_fnmadd_ps(tmp1651, _mm512_set1_ps(2e+00f), tmp1644);
tmp1648 = _mm512_fnmadd_ps(tmp1655, _mm512_set1_ps(2e+00f), tmp1648);
// Pack the 16 second-pass results pairwise into 16 output vectors. With
// selector 68 the result is the low 256 bits of the first operand followed
// by the low 256 bits of the second; with 238 it is the two high halves.
__m512 out413 = _mm512_shuffle_f32x4(in360, tmp1653, 68);
__m512 out421 = _mm512_shuffle_f32x4(in360, tmp1653, 238);
__m512 out414 = _mm512_shuffle_f32x4(tmp1654, in364, 68);
__m512 out422 = _mm512_shuffle_f32x4(tmp1654, in364, 238);
__m512 out415 = _mm512_shuffle_f32x4(tmp1652, tmp1646, 68);
__m512 out423 = _mm512_shuffle_f32x4(tmp1652, tmp1646, 238);
__m512 out416 = _mm512_shuffle_f32x4(tmp1644, in366, 68);
__m512 out424 = _mm512_shuffle_f32x4(tmp1644, in366, 238);
__m512 out417 = _mm512_shuffle_f32x4(in368, tmp1657, 68);
__m512 out425 = _mm512_shuffle_f32x4(in368, tmp1657, 238);
__m512 out418 = _mm512_shuffle_f32x4(tmp1658, in372, 68);
__m512 out426 = _mm512_shuffle_f32x4(tmp1658, in372, 238);
__m512 out419 = _mm512_shuffle_f32x4(tmp1656, tmp1650, 68);
__m512 out427 = _mm512_shuffle_f32x4(tmp1656, tmp1650, 238);
__m512 out420 = _mm512_shuffle_f32x4(tmp1648, in374, 68);
__m512 out428 = _mm512_shuffle_f32x4(tmp1648, in374, 238);
// Unaligned stores through dfPtr2. All share the index term
// 2856960*i8+178560*j4+44544*ss2+768*k10. The constant parts form four
// groups starting at 512, 714752, 1428992 and 2143232 (714240 apart); inside
// a group the stores are issued at +0, +128, +64, +192.
// NOTE(review): the 64-unit spacing suggests byte offsets (one 512-bit
// vector = 64 bytes) -- confirm that dfPtr2 is a byte pointer.
_mm512_storeu_ps(dfPtr2+512+2856960*i8+178560*j4+44544*ss2+768*k10, out413);
_mm512_storeu_ps(dfPtr2+640+2856960*i8+178560*j4+44544*ss2+768*k10, out421);
_mm512_storeu_ps(dfPtr2+576+2856960*i8+178560*j4+44544*ss2+768*k10, out417);
_mm512_storeu_ps(dfPtr2+704+2856960*i8+178560*j4+44544*ss2+768*k10, out425);
_mm512_storeu_ps(dfPtr2+714752+2856960*i8+178560*j4+44544*ss2+768*k10, out414);
_mm512_storeu_ps(dfPtr2+714880+2856960*i8+178560*j4+44544*ss2+768*k10, out422);
_mm512_storeu_ps(dfPtr2+714816+2856960*i8+178560*j4+44544*ss2+768*k10, out418);
_mm512_storeu_ps(dfPtr2+714944+2856960*i8+178560*j4+44544*ss2+768*k10, out426);
_mm512_storeu_ps(dfPtr2+1428992+2856960*i8+178560*j4+44544*ss2+768*k10, out415);
_mm512_storeu_ps(dfPtr2+1429120+2856960*i8+178560*j4+44544*ss2+768*k10, out423);
_mm512_storeu_ps(dfPtr2+1429056+2856960*i8+178560*j4+44544*ss2+768*k10, out419);
_mm512_storeu_ps(dfPtr2+1429184+2856960*i8+178560*j4+44544*ss2+768*k10, out427);
_mm512_storeu_ps(dfPtr2+2143232+2856960*i8+178560*j4+44544*ss2+768*k10, out416);
_mm512_storeu_ps(dfPtr2+2143360+2856960*i8+178560*j4+44544*ss2+768*k10, out424);
_mm512_storeu_ps(dfPtr2+2143296+2856960*i8+178560*j4+44544*ss2+768*k10, out420);
_mm512_storeu_ps(dfPtr2+2143424+2856960*i8+178560*j4+44544*ss2+768*k10, out428);
}
// Load stage: eight pairs of zero-masked loads from datPtr2, each followed by
// a lane permutation. Successive pairs are 92 apart (0, 92, ..., 644) and
// the second load of a pair is 504 past the first.
// Here the first load of each pair uses mask 2047 (lanes 0..10) and the
// second uses mask 16383 (lanes 0..13); all other lanes are zeroed.
// NOTE(review): 92 = 23*4 suggests byte offsets of consecutive rows of a
// Width=23 float image -- confirm datPtr2's type and the layout.
__m512 dat361 = _mm512_maskz_loadu_ps(2047, datPtr2+0+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat362 = _mm512_maskz_loadu_ps(16383, datPtr2+504+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
// pm48: lanes 0..7 take elements 0..7, lanes 8..12 take elements 6..10, and
// lanes 13..15 take element 15, which the 2047 mask has already zeroed.
__m512i pm48 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in376 = _mm512_permutexvar_ps(pm48, dat361);
// pm49: lanes 0..7 take elements 0..7 and lanes 8..15 take elements 6..13,
// so the upper half repeats elements 6 and 7 of the lower half.
__m512i pm49 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in384 = _mm512_permutexvar_ps(pm49, dat362);
__m512 dat363 = _mm512_maskz_loadu_ps(2047, datPtr2+92+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat364 = _mm512_maskz_loadu_ps(16383, datPtr2+596+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in377 = _mm512_permutexvar_ps(pm48, dat363);
__m512 in385 = _mm512_permutexvar_ps(pm49, dat364);
__m512 dat365 = _mm512_maskz_loadu_ps(2047, datPtr2+184+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat366 = _mm512_maskz_loadu_ps(16383, datPtr2+688+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in378 = _mm512_permutexvar_ps(pm48, dat365);
__m512 in386 = _mm512_permutexvar_ps(pm49, dat366);
__m512 dat367 = _mm512_maskz_loadu_ps(2047, datPtr2+276+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat368 = _mm512_maskz_loadu_ps(16383, datPtr2+780+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in379 = _mm512_permutexvar_ps(pm48, dat367);
__m512 in387 = _mm512_permutexvar_ps(pm49, dat368);
__m512 dat369 = _mm512_maskz_loadu_ps(2047, datPtr2+368+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat370 = _mm512_maskz_loadu_ps(16383, datPtr2+872+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in380 = _mm512_permutexvar_ps(pm48, dat369);
__m512 in388 = _mm512_permutexvar_ps(pm49, dat370);
__m512 dat371 = _mm512_maskz_loadu_ps(2047, datPtr2+460+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat372 = _mm512_maskz_loadu_ps(16383, datPtr2+964+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in381 = _mm512_permutexvar_ps(pm48, dat371);
__m512 in389 = _mm512_permutexvar_ps(pm49, dat372);
__m512 dat373 = _mm512_maskz_loadu_ps(2047, datPtr2+552+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat374 = _mm512_maskz_loadu_ps(16383, datPtr2+1056+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in382 = _mm512_permutexvar_ps(pm48, dat373);
__m512 in390 = _mm512_permutexvar_ps(pm49, dat374);
__m512 dat375 = _mm512_maskz_loadu_ps(2047, datPtr2+644+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 dat376 = _mm512_maskz_loadu_ps(16383, datPtr2+1148+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in383 = _mm512_permutexvar_ps(pm48, dat375);
__m512 in391 = _mm512_permutexvar_ps(pm49, dat376);
// First arithmetic pass: combines the eight loaded vectors in376..in383 and,
// interleaved line by line, the eight vectors in384..in391, using
// add/sub/FMA with the constants -4.25, 5.25, 0.25, 4, -1.25, -5 and 2.
// Each chain leaves eight results, in this order:
//   in376, tmp1709, tmp1710, in382, tmp1708, in378, in380, in379
//   in384, tmp1713, tmp1714, in390, tmp1712, in386, in388, in387
// NOTE(review): the coefficients match the 8-point input transform of
// Winograd F(6,3) convolution -- inferred from the constants only; confirm
// against the generator.
__m512 tmp1707 = _mm512_add_ps(in377, in381);
__m512 tmp1711 = _mm512_add_ps(in385, in389);
__m512 tmp1708 = _mm512_sub_ps(in380, in378);
__m512 tmp1712 = _mm512_sub_ps(in388, in386);
__m512 tmp1709 = _mm512_add_ps(in378, in382);
__m512 tmp1713 = _mm512_add_ps(in386, in390);
in376 = _mm512_sub_ps(in376, in382);
in384 = _mm512_sub_ps(in384, in390);
tmp1707 = _mm512_fmadd_ps(in379, _mm512_set1_ps(-4.25e+00f), tmp1707);
tmp1711 = _mm512_fmadd_ps(in387, _mm512_set1_ps(-4.25e+00f), tmp1711);
tmp1709 = _mm512_fmadd_ps(in380, _mm512_set1_ps(-4.25e+00f), tmp1709);
tmp1713 = _mm512_fmadd_ps(in388, _mm512_set1_ps(-4.25e+00f), tmp1713);
in376 = _mm512_fmadd_ps(tmp1708, _mm512_set1_ps(5.25e+00f), in376);
in384 = _mm512_fmadd_ps(tmp1712, _mm512_set1_ps(5.25e+00f), in384);
tmp1708 = _mm512_fmadd_ps(in378, _mm512_set1_ps(2.5e-01f), in382);
tmp1712 = _mm512_fmadd_ps(in386, _mm512_set1_ps(2.5e-01f), in390);
in378 = _mm512_fmadd_ps(in378, _mm512_set1_ps(4e+00f), in382);
in386 = _mm512_fmadd_ps(in386, _mm512_set1_ps(4e+00f), in390);
// Difference and sum of the two partial sums built above.
__m512 tmp1710 = _mm512_sub_ps(tmp1709, tmp1707);
__m512 tmp1714 = _mm512_sub_ps(tmp1713, tmp1711);
tmp1709 = _mm512_add_ps(tmp1707, tmp1709);
tmp1713 = _mm512_add_ps(tmp1711, tmp1713);
tmp1707 = _mm512_fmadd_ps(in377, _mm512_set1_ps(2.5e-01f), in381);
tmp1711 = _mm512_fmadd_ps(in385, _mm512_set1_ps(2.5e-01f), in389);
tmp1708 = _mm512_fmadd_ps(in380, _mm512_set1_ps(-1.25e+00f), tmp1708);
tmp1712 = _mm512_fmadd_ps(in388, _mm512_set1_ps(-1.25e+00f), tmp1712);
in380 = _mm512_fmadd_ps(in380, _mm512_set1_ps(-5e+00f), in378);
in388 = _mm512_fmadd_ps(in388, _mm512_set1_ps(-5e+00f), in386);
tmp1707 = _mm512_fmadd_ps(in379, _mm512_set1_ps(-1.25e+00f), tmp1707);
tmp1711 = _mm512_fmadd_ps(in387, _mm512_set1_ps(-1.25e+00f), tmp1711);
// fmadd/fnmadd pairs produce c + 2*t and c - 2*t from the same two terms.
in382 = _mm512_fmadd_ps(tmp1707, _mm512_set1_ps(2e+00f), tmp1708);
in390 = _mm512_fmadd_ps(tmp1711, _mm512_set1_ps(2e+00f), tmp1712);
tmp1708 = _mm512_fnmadd_ps(tmp1707, _mm512_set1_ps(2e+00f), tmp1708);
tmp1712 = _mm512_fnmadd_ps(tmp1711, _mm512_set1_ps(2e+00f), tmp1712);
tmp1707 = _mm512_fmadd_ps(in381, _mm512_set1_ps(2.5e-01f), in377);
tmp1711 = _mm512_fmadd_ps(in389, _mm512_set1_ps(2.5e-01f), in385);
in377 = _mm512_sub_ps(in383, in377);
in385 = _mm512_sub_ps(in391, in385);
tmp1707 = _mm512_fmadd_ps(in379, _mm512_set1_ps(-1.25e+00f), tmp1707);
tmp1711 = _mm512_fmadd_ps(in387, _mm512_set1_ps(-1.25e+00f), tmp1711);
in379 = _mm512_sub_ps(in379, in381);
in387 = _mm512_sub_ps(in387, in389);
in379 = _mm512_fmadd_ps(in379, _mm512_set1_ps(5.25e+00f), in377);
in387 = _mm512_fmadd_ps(in387, _mm512_set1_ps(5.25e+00f), in385);
in378 = _mm512_fmadd_ps(tmp1707, _mm512_set1_ps(2e+00f), in380);
in386 = _mm512_fmadd_ps(tmp1711, _mm512_set1_ps(2e+00f), in388);
in380 = _mm512_fnmadd_ps(tmp1707, _mm512_set1_ps(2e+00f), in380);
in388 = _mm512_fnmadd_ps(tmp1711, _mm512_set1_ps(2e+00f), in388);
// 16x16 transpose of the sixteen vectors
//   in376, tmp1709, tmp1710, in382, tmp1708, in378, in380, in379,
//   in384, tmp1713, tmp1714, in390, tmp1712, in386, in388, in387.
// Afterwards the k-th name in that list holds lane k of every input, with
// the inputs appearing in the same list order across its 16 lanes.
// Stage 1: unpacklo/unpackhi interleave pairs of vectors inside each 128-bit lane.
__m512 tmp1723 = _mm512_unpacklo_ps(in376, tmp1709);
__m512 tmp1724 = _mm512_unpackhi_ps(in376, tmp1709);
__m512 tmp1725 = _mm512_unpacklo_ps(tmp1710, in382);
__m512 tmp1726 = _mm512_unpackhi_ps(tmp1710, in382);
__m512 tmp1727 = _mm512_unpacklo_ps(tmp1708, in378);
__m512 tmp1728 = _mm512_unpackhi_ps(tmp1708, in378);
__m512 tmp1729 = _mm512_unpacklo_ps(in380, in379);
__m512 tmp1730 = _mm512_unpackhi_ps(in380, in379);
__m512 tmp1731 = _mm512_unpacklo_ps(in384, tmp1713);
__m512 tmp1732 = _mm512_unpackhi_ps(in384, tmp1713);
__m512 tmp1733 = _mm512_unpacklo_ps(tmp1714, in390);
__m512 tmp1734 = _mm512_unpackhi_ps(tmp1714, in390);
__m512 tmp1735 = _mm512_unpacklo_ps(tmp1712, in386);
__m512 tmp1736 = _mm512_unpackhi_ps(tmp1712, in386);
__m512 tmp1737 = _mm512_unpacklo_ps(in388, in387);
__m512 tmp1738 = _mm512_unpackhi_ps(in388, in387);
// Stage 2: shuffle_ps with 68 / 238 takes the lower / upper element pair of
// both operands per 128-bit lane, completing a 4x4 transpose in every lane.
__m512 tmp1739 = _mm512_shuffle_ps(tmp1723, tmp1725, 68);
__m512 tmp1740 = _mm512_shuffle_ps(tmp1723, tmp1725, 238);
__m512 tmp1741 = _mm512_shuffle_ps(tmp1724, tmp1726, 68);
__m512 tmp1742 = _mm512_shuffle_ps(tmp1724, tmp1726, 238);
__m512 tmp1743 = _mm512_shuffle_ps(tmp1727, tmp1729, 68);
__m512 tmp1744 = _mm512_shuffle_ps(tmp1727, tmp1729, 238);
__m512 tmp1745 = _mm512_shuffle_ps(tmp1728, tmp1730, 68);
__m512 tmp1746 = _mm512_shuffle_ps(tmp1728, tmp1730, 238);
__m512 tmp1747 = _mm512_shuffle_ps(tmp1731, tmp1733, 68);
__m512 tmp1748 = _mm512_shuffle_ps(tmp1731, tmp1733, 238);
__m512 tmp1749 = _mm512_shuffle_ps(tmp1732, tmp1734, 68);
__m512 tmp1750 = _mm512_shuffle_ps(tmp1732, tmp1734, 238);
__m512 tmp1751 = _mm512_shuffle_ps(tmp1735, tmp1737, 68);
__m512 tmp1752 = _mm512_shuffle_ps(tmp1735, tmp1737, 238);
__m512 tmp1753 = _mm512_shuffle_ps(tmp1736, tmp1738, 68);
__m512 tmp1754 = _mm512_shuffle_ps(tmp1736, tmp1738, 238);
// Stage 3: shuffle_f32x4 with 136 / 221 gathers the even / odd 128-bit
// lanes of both operands.
__m512 tmp1755 = _mm512_shuffle_f32x4(tmp1739, tmp1743, 136);
__m512 tmp1756 = _mm512_shuffle_f32x4(tmp1739, tmp1743, 221);
__m512 tmp1757 = _mm512_shuffle_f32x4(tmp1740, tmp1744, 136);
__m512 tmp1758 = _mm512_shuffle_f32x4(tmp1740, tmp1744, 221);
__m512 tmp1759 = _mm512_shuffle_f32x4(tmp1741, tmp1745, 136);
__m512 tmp1760 = _mm512_shuffle_f32x4(tmp1741, tmp1745, 221);
__m512 tmp1761 = _mm512_shuffle_f32x4(tmp1742, tmp1746, 136);
__m512 tmp1762 = _mm512_shuffle_f32x4(tmp1742, tmp1746, 221);
__m512 tmp1763 = _mm512_shuffle_f32x4(tmp1747, tmp1751, 136);
__m512 tmp1764 = _mm512_shuffle_f32x4(tmp1747, tmp1751, 221);
__m512 tmp1765 = _mm512_shuffle_f32x4(tmp1748, tmp1752, 136);
__m512 tmp1766 = _mm512_shuffle_f32x4(tmp1748, tmp1752, 221);
__m512 tmp1767 = _mm512_shuffle_f32x4(tmp1749, tmp1753, 136);
__m512 tmp1768 = _mm512_shuffle_f32x4(tmp1749, tmp1753, 221);
__m512 tmp1769 = _mm512_shuffle_f32x4(tmp1750, tmp1754, 136);
__m512 tmp1770 = _mm512_shuffle_f32x4(tmp1750, tmp1754, 221);
// Stage 4: the same 136 / 221 selection merges the first-eight and
// last-eight groups; the results overwrite the input variables.
in376 = _mm512_shuffle_f32x4(tmp1755, tmp1763, 136);
in384 = _mm512_shuffle_f32x4(tmp1755, tmp1763, 221);
tmp1709 = _mm512_shuffle_f32x4(tmp1757, tmp1765, 136);
tmp1713 = _mm512_shuffle_f32x4(tmp1757, tmp1765, 221);
tmp1710 = _mm512_shuffle_f32x4(tmp1759, tmp1767, 136);
tmp1714 = _mm512_shuffle_f32x4(tmp1759, tmp1767, 221);
in382 = _mm512_shuffle_f32x4(tmp1761, tmp1769, 136);
in390 = _mm512_shuffle_f32x4(tmp1761, tmp1769, 221);
tmp1708 = _mm512_shuffle_f32x4(tmp1756, tmp1764, 136);
tmp1712 = _mm512_shuffle_f32x4(tmp1756, tmp1764, 221);
in378 = _mm512_shuffle_f32x4(tmp1758, tmp1766, 136);
in386 = _mm512_shuffle_f32x4(tmp1758, tmp1766, 221);
in380 = _mm512_shuffle_f32x4(tmp1760, tmp1768, 136);
in388 = _mm512_shuffle_f32x4(tmp1760, tmp1768, 221);
in379 = _mm512_shuffle_f32x4(tmp1762, tmp1770, 136);
in387 = _mm512_shuffle_f32x4(tmp1762, tmp1770, 221);
// Second arithmetic pass: the same add/sub/FMA combination (constants
// -4.25, 5.25, 0.25, 4, -1.25, -5, 2) applied to the transposed vectors, so
// it runs along the other axis of the data. Two chains are interleaved:
//   inputs : in376, tmp1709, tmp1710, in382, tmp1708, in378, in380, in379
//            in384, tmp1713, tmp1714, in390, tmp1712, in386, in388, in387
//   results: in376, tmp1717, tmp1718, in380, tmp1716, tmp1710, tmp1708, in382
//            in384, tmp1721, tmp1722, in388, tmp1720, tmp1714, tmp1712, in390
__m512 tmp1715 = _mm512_add_ps(tmp1709, in378);
__m512 tmp1719 = _mm512_add_ps(tmp1713, in386);
__m512 tmp1716 = _mm512_sub_ps(tmp1708, tmp1710);
__m512 tmp1720 = _mm512_sub_ps(tmp1712, tmp1714);
__m512 tmp1717 = _mm512_add_ps(tmp1710, in380);
__m512 tmp1721 = _mm512_add_ps(tmp1714, in388);
in376 = _mm512_sub_ps(in376, in380);
in384 = _mm512_sub_ps(in384, in388);
tmp1715 = _mm512_fmadd_ps(in382, _mm512_set1_ps(-4.25e+00f), tmp1715);
tmp1719 = _mm512_fmadd_ps(in390, _mm512_set1_ps(-4.25e+00f), tmp1719);
tmp1717 = _mm512_fmadd_ps(tmp1708, _mm512_set1_ps(-4.25e+00f), tmp1717);
tmp1721 = _mm512_fmadd_ps(tmp1712, _mm512_set1_ps(-4.25e+00f), tmp1721);
in376 = _mm512_fmadd_ps(tmp1716, _mm512_set1_ps(5.25e+00f), in376);
in384 = _mm512_fmadd_ps(tmp1720, _mm512_set1_ps(5.25e+00f), in384);
tmp1716 = _mm512_fmadd_ps(tmp1710, _mm512_set1_ps(2.5e-01f), in380);
tmp1720 = _mm512_fmadd_ps(tmp1714, _mm512_set1_ps(2.5e-01f), in388);
tmp1710 = _mm512_fmadd_ps(tmp1710, _mm512_set1_ps(4e+00f), in380);
tmp1714 = _mm512_fmadd_ps(tmp1714, _mm512_set1_ps(4e+00f), in388);
// Difference and sum of the two partial sums built above.
__m512 tmp1718 = _mm512_sub_ps(tmp1717, tmp1715);
__m512 tmp1722 = _mm512_sub_ps(tmp1721, tmp1719);
tmp1717 = _mm512_add_ps(tmp1715, tmp1717);
tmp1721 = _mm512_add_ps(tmp1719, tmp1721);
tmp1715 = _mm512_fmadd_ps(tmp1709, _mm512_set1_ps(2.5e-01f), in378);
tmp1719 = _mm512_fmadd_ps(tmp1713, _mm512_set1_ps(2.5e-01f), in386);
tmp1716 = _mm512_fmadd_ps(tmp1708, _mm512_set1_ps(-1.25e+00f), tmp1716);
tmp1720 = _mm512_fmadd_ps(tmp1712, _mm512_set1_ps(-1.25e+00f), tmp1720);
tmp1708 = _mm512_fmadd_ps(tmp1708, _mm512_set1_ps(-5e+00f), tmp1710);
tmp1712 = _mm512_fmadd_ps(tmp1712, _mm512_set1_ps(-5e+00f), tmp1714);
tmp1715 = _mm512_fmadd_ps(in382, _mm512_set1_ps(-1.25e+00f), tmp1715);
tmp1719 = _mm512_fmadd_ps(in390, _mm512_set1_ps(-1.25e+00f), tmp1719);
// fmadd/fnmadd pairs produce c + 2*t and c - 2*t from the same two terms.
in380 = _mm512_fmadd_ps(tmp1715, _mm512_set1_ps(2e+00f), tmp1716);
in388 = _mm512_fmadd_ps(tmp1719, _mm512_set1_ps(2e+00f), tmp1720);
tmp1716 = _mm512_fnmadd_ps(tmp1715, _mm512_set1_ps(2e+00f), tmp1716);
tmp1720 = _mm512_fnmadd_ps(tmp1719, _mm512_set1_ps(2e+00f), tmp1720);
tmp1715 = _mm512_fmadd_ps(in378, _mm512_set1_ps(2.5e-01f), tmp1709);
tmp1719 = _mm512_fmadd_ps(in386, _mm512_set1_ps(2.5e-01f), tmp1713);
tmp1709 = _mm512_sub_ps(in379, tmp1709);
tmp1713 = _mm512_sub_ps(in387, tmp1713);
tmp1715 = _mm512_fmadd_ps(in382, _mm512_set1_ps(-1.25e+00f), tmp1715);
tmp1719 = _mm512_fmadd_ps(in390, _mm512_set1_ps(-1.25e+00f), tmp1719);
in382 = _mm512_sub_ps(in382, in378);
in390 = _mm512_sub_ps(in390, in386);
in382 = _mm512_fmadd_ps(in382, _mm512_set1_ps(5.25e+00f), tmp1709);
in390 = _mm512_fmadd_ps(in390, _mm512_set1_ps(5.25e+00f), tmp1713);
tmp1710 = _mm512_fmadd_ps(tmp1715, _mm512_set1_ps(2e+00f), tmp1708);
tmp1714 = _mm512_fmadd_ps(tmp1719, _mm512_set1_ps(2e+00f), tmp1712);
tmp1708 = _mm512_fnmadd_ps(tmp1715, _mm512_set1_ps(2e+00f), tmp1708);
tmp1712 = _mm512_fnmadd_ps(tmp1719, _mm512_set1_ps(2e+00f), tmp1712);
// Pack the 16 second-pass results pairwise into 16 output vectors. With
// selector 68 the result is the low 256 bits of the first operand followed
// by the low 256 bits of the second; with 238 it is the two high halves.
__m512 out429 = _mm512_shuffle_f32x4(in376, tmp1717, 68);
__m512 out437 = _mm512_shuffle_f32x4(in376, tmp1717, 238);
__m512 out430 = _mm512_shuffle_f32x4(tmp1718, in380, 68);
__m512 out438 = _mm512_shuffle_f32x4(tmp1718, in380, 238);
__m512 out431 = _mm512_shuffle_f32x4(tmp1716, tmp1710, 68);
__m512 out439 = _mm512_shuffle_f32x4(tmp1716, tmp1710, 238);
__m512 out432 = _mm512_shuffle_f32x4(tmp1708, in382, 68);
__m512 out440 = _mm512_shuffle_f32x4(tmp1708, in382, 238);
__m512 out433 = _mm512_shuffle_f32x4(in384, tmp1721, 68);
__m512 out441 = _mm512_shuffle_f32x4(in384, tmp1721, 238);
__m512 out434 = _mm512_shuffle_f32x4(tmp1722, in388, 68);
__m512 out442 = _mm512_shuffle_f32x4(tmp1722, in388, 238);
__m512 out435 = _mm512_shuffle_f32x4(tmp1720, tmp1714, 68);
__m512 out443 = _mm512_shuffle_f32x4(tmp1720, tmp1714, 238);
__m512 out436 = _mm512_shuffle_f32x4(tmp1712, in390, 68);
__m512 out444 = _mm512_shuffle_f32x4(tmp1712, in390, 238);
// Unaligned stores through dfPtr2. All share the index term
// 2856960*i8+178560*j4+44544*ss2+768*k10. The constant parts form four
// groups starting at 0, 714240, 1428480 and 2142720 (714240 apart); inside
// a group the stores are issued at +0, +128, +64, +192.
// NOTE(review): the 64-unit spacing suggests byte offsets (one 512-bit
// vector = 64 bytes) -- confirm that dfPtr2 is a byte pointer.
_mm512_storeu_ps(dfPtr2+0+2856960*i8+178560*j4+44544*ss2+768*k10, out429);
_mm512_storeu_ps(dfPtr2+128+2856960*i8+178560*j4+44544*ss2+768*k10, out437);
_mm512_storeu_ps(dfPtr2+64+2856960*i8+178560*j4+44544*ss2+768*k10, out433);
_mm512_storeu_ps(dfPtr2+192+2856960*i8+178560*j4+44544*ss2+768*k10, out441);
_mm512_storeu_ps(dfPtr2+714240+2856960*i8+178560*j4+44544*ss2+768*k10, out430);
_mm512_storeu_ps(dfPtr2+714368+2856960*i8+178560*j4+44544*ss2+768*k10, out438);
_mm512_storeu_ps(dfPtr2+714304+2856960*i8+178560*j4+44544*ss2+768*k10, out434);
_mm512_storeu_ps(dfPtr2+714432+2856960*i8+178560*j4+44544*ss2+768*k10, out442);
_mm512_storeu_ps(dfPtr2+1428480+2856960*i8+178560*j4+44544*ss2+768*k10, out431);
_mm512_storeu_ps(dfPtr2+1428608+2856960*i8+178560*j4+44544*ss2+768*k10, out439);
_mm512_storeu_ps(dfPtr2+1428544+2856960*i8+178560*j4+44544*ss2+768*k10, out435);
_mm512_storeu_ps(dfPtr2+1428672+2856960*i8+178560*j4+44544*ss2+768*k10, out443);
_mm512_storeu_ps(dfPtr2+2142720+2856960*i8+178560*j4+44544*ss2+768*k10, out432);
_mm512_storeu_ps(dfPtr2+2142848+2856960*i8+178560*j4+44544*ss2+768*k10, out440);
_mm512_storeu_ps(dfPtr2+2142784+2856960*i8+178560*j4+44544*ss2+768*k10, out436);
_mm512_storeu_ps(dfPtr2+2142912+2856960*i8+178560*j4+44544*ss2+768*k10, out444);
__m512 dat377 = _mm512_maskz_loadu_ps(2047, datPtr2+552+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512i pm50 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in392 = _mm512_permutexvar_ps(pm50, dat377);
__m512 dat378 = _mm512_maskz_loadu_ps(2047, datPtr2+644+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in393 = _mm512_permutexvar_ps(pm50, dat378);
__m512 dat379 = _mm512_maskz_loadu_ps(2047, datPtr2+736+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in394 = _mm512_permutexvar_ps(pm50, dat379);
__m512 dat380 = _mm512_maskz_loadu_ps(2047, datPtr2+828+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in395 = _mm512_permutexvar_ps(pm50, dat380);
__m512 dat381 = _mm512_maskz_loadu_ps(2047, datPtr2+920+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in396 = _mm512_permutexvar_ps(pm50, dat381);
__m512 dat382 = _mm512_maskz_loadu_ps(2047, datPtr2+1012+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in397 = _mm512_permutexvar_ps(pm50, dat382);
__m512 dat383 = _mm512_maskz_loadu_ps(2047, datPtr2+1104+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in398 = _mm512_permutexvar_ps(pm50, dat383);
__m512 dat384 = _mm512_maskz_loadu_ps(2047, datPtr2+1196+7163304*i8+92*h4+4*w4+405536*ss2+6992*k10);
__m512 in399 = _mm512_permutexvar_ps(pm50, dat384);
__m512 tmp1771 = _mm512_add_ps(in393, in397);
__m512 tmp1772 = _mm512_sub_ps(in396, in394);
__m512 tmp1773 = _mm512_add_ps(in394, in398);
in392 = _mm512_sub_ps(in392, in398);
tmp1771 = _mm512_fmadd_ps(in395, _mm512_set1_ps(-4.25e+00f), tmp1771);
tmp1773 = _mm512_fmadd_ps(in396, _mm512_set1_ps(-4.25e+00f), tmp1773);
in392 = _mm512_fmadd_ps(tmp1772, _mm512_set1_ps(5.25e+00f), in392);
tmp1772 = _mm512_fmadd_ps(in394, _mm512_set1_ps(2.5e-01f), in398);
in394 = _mm512_fmadd_ps(in394, _mm512_set1_ps(4e+00f), in398);
__m512 tmp1774 = _mm512_sub_ps(tmp1773, tmp1771);
tmp1773 = _mm512_add_ps(tmp1771, tmp1773);
tmp1771 = _mm512_fmadd_ps(in393, _mm512_set1_ps(2.5e-01f), in397);
tmp1772 = _mm512_fmadd_ps(in396, _mm512_set1_ps(-1.25e+00f), tmp1772);
in396 = _mm512_fmadd_ps(in396, _mm512_set1_ps(-5e+00f), in394);
tmp1771 = _mm512_fmadd_ps(in395, _mm512_set1_ps(-1.25e+00f), tmp1771);
in398 = _mm512_fmadd_ps(tmp1771, _mm512_set1_ps(2e+00f), tmp1772);
tmp1772 = _mm512_fnmadd_ps(tmp1771, _mm512_set1_ps(2e+00f), tmp1772);
tmp1771 = _mm512_fmadd_ps(in397, _mm512_set1_ps(2.5e-01f), in393);
in393 = _mm512_sub_ps(in399, in393);
tmp1771 = _mm512_fmadd_ps(in395, _mm512_set1_ps(-1.25e+00f), tmp1771);
in395 = _mm512_sub_ps(in395, in397);
in395 = _mm512_fmadd_ps(in395, _mm512_set1_ps(5.25e+00f), in393);
in394 = _mm512_fmadd_ps(tmp1771, _mm512_set1_ps(2e+00f), in396);
in396 = _mm512_fnmadd_ps(tmp1771, _mm512_set1_ps(2e+00f), in396);
__m512 tmp1789 = _mm512_unpacklo_ps(in392, tmp1773);
__m512 tmp1790 = _mm512_unpackhi_ps(in392, tmp1773);
__m512 tmp1791 = _mm512_unpacklo_ps(tmp1774, in398);
__m512 tmp1792 = _mm512_unpackhi_ps(tmp1774, in398);
__m512 tmp1793 = _mm512_unpacklo_ps(tmp1772, in394);
__m512 tmp1794 = _mm512_unpackhi_ps(tmp1772, in394);
__m512 tmp1795 = _mm512_unpacklo_ps(in396, in395);
__m512 tmp1796 = _mm512_unpackhi_ps(in396, in395);
__m512 tmp1797 = _mm512_shuffle_ps(tmp1789, tmp1791, 68);
__m512 tmp1798 = _mm512_shuffle_ps(tmp1789, tmp1791, 238);
__m512 tmp1799 = _mm512_shuffle_ps(tmp1790, tmp1792, 68);
__m512 tmp1800 = _mm512_shuffle_ps(tmp1790, tmp1792, 238);
__m512 tmp1801 = _mm512_shuffle_ps(tmp1793, tmp1795, 68);
__m512 tmp1802 = _mm512_shuffle_ps(tmp1793, tmp1795, 238);
__m512 tmp1803 = _mm512_shuffle_ps(tmp1794, tmp1796, 68);
__m512 tmp1804 = _mm512_shuffle_ps(tmp1794, tmp1796, 238);
__m512 tmp1805 = _mm512_shuffle_f32x4(tmp1797, tmp1801, 136);
__m512 tmp1806 = _mm512_shuffle_f32x4(tmp1797, tmp1801, 221);
__m512 tmp1807 = _mm512_shuffle_f32x4(tmp1798, tmp1802, 136);
__m512 tmp1808 = _mm512_shuffle_f32x4(tmp1798, tmp1802, 221);
__m512 tmp1809 = _mm512_shuffle_f32x4(tmp1799, tmp1803, 136);
__m512 tmp1810 = _mm512_shuffle_f32x4(tmp1799, tmp1803, 221);
__m512 tmp1811 = _mm512_shuffle_f32x4(tmp1800, tmp1804, 136);
__m512 tmp1812 = _mm512_shuffle_f32x4(tmp1800, tmp1804, 221);
in392 = _mm512_shuffle_f32x4(tmp1805, tmp1805, 136);
__m512 tmp1775 = _mm512_shuffle_f32x4(tmp1805, tmp1805, 221);
tmp1773 = _mm512_shuffle_f32x4(tmp1807, tmp1807, 136);
__m512 tmp1776 = _mm512_shuffle_f32x4(tmp1807, tmp1807, 221);
tmp1774 = _mm512_shuffle_f32x4(tmp1809, tmp1809, 136);
__m512 tmp1777 = _mm512_shuffle_f32x4(tmp1809, tmp1809, 221);
in398 = _mm512_shuffle_f32x4(tmp1811, tmp1811, 136);
__m512 tmp1778 = _mm512_shuffle_f32x4(tmp1811, tmp1811, 221);
tmp1772 = _mm512_shuffle_f32x4(tmp1806, tmp1806, 136);
__m512 tmp1779 = _mm512_shuffle_f32x4(tmp1806, tmp1806, 221);
in394 = _mm512_shuffle_f32x4(tmp1808, tmp1808, 136);
in396 = _mm512_shuffle_f32x4(tmp1810, tmp1810, 136);
in395 = _mm512_shuffle_f32x4(tmp1812, tmp1812, 136);
__m512 tmp1780 = _mm512_add_ps(tmp1773, in394);
__m512 tmp1784 = tmp1776;
__m512 tmp1781 = _mm512_sub_ps(tmp1772, tmp1774);
__m512 tmp1785 = _mm512_sub_ps(tmp1779, tmp1777);
__m512 tmp1782 = _mm512_add_ps(tmp1774, in396);
__m512 tmp1786 = tmp1777;
in392 = _mm512_sub_ps(in392, in396);
tmp1775 = tmp1775;
tmp1780 = _mm512_fmadd_ps(in398, _mm512_set1_ps(-4.25e+00f), tmp1780);
tmp1784 = _mm512_fmadd_ps(tmp1778, _mm512_set1_ps(-4.25e+00f), tmp1784);
tmp1782 = _mm512_fmadd_ps(tmp1772, _mm512_set1_ps(-4.25e+00f), tmp1782);
tmp1786 = _mm512_fmadd_ps(tmp1779, _mm512_set1_ps(-4.25e+00f), tmp1786);
in392 = _mm512_fmadd_ps(tmp1781, _mm512_set1_ps(5.25e+00f), in392);
tmp1775 = _mm512_fmadd_ps(tmp1785, _mm512_set1_ps(5.25e+00f), tmp1775);
tmp1781 = _mm512_fmadd_ps(tmp1774, _mm512_set1_ps(2.5e-01f), in396);
tmp1785 = _mm512_mul_ps(tmp1777, _mm512_set1_ps(2.5e-01f));
tmp1774 = _mm512_fmadd_ps(tmp1774, _mm512_set1_ps(4e+00f), in396);
tmp1777 = _mm512_mul_ps(tmp1777, _mm512_set1_ps(4e+00f));
__m512 tmp1783 = _mm512_sub_ps(tmp1782, tmp1780);
__m512 tmp1787 = _mm512_sub_ps(tmp1786, tmp1784);
tmp1782 = _mm512_add_ps(tmp1780, tmp1782);
tmp1786 = _mm512_add_ps(tmp1784, tmp1786);
tmp1780 = _mm512_fmadd_ps(tmp1773, _mm512_set1_ps(2.5e-01f), in394);
tmp1784 = _mm512_mul_ps(tmp1776, _mm512_set1_ps(2.5e-01f));
tmp1781 = _mm512_fmadd_ps(tmp1772, _mm512_set1_ps(-1.25e+00f), tmp1781);
tmp1785 = _mm512_fmadd_ps(tmp1779, _mm512_set1_ps(-1.25e+00f), tmp1785);
tmp1772 = _mm512_fmadd_ps(tmp1772, _mm512_set1_ps(-5e+00f), tmp1774);
tmp1779 = _mm512_fmadd_ps(tmp1779, _mm512_set1_ps(-5e+00f), tmp1777);
tmp1780 = _mm512_fmadd_ps(in398, _mm512_set1_ps(-1.25e+00f), tmp1780);
tmp1784 = _mm512_fmadd_ps(tmp1778, _mm512_set1_ps(-1.25e+00f), tmp1784);
in396 = _mm512_fmadd_ps(tmp1780, _mm512_set1_ps(2e+00f), tmp1781);
__m512 tmp1788 = _mm512_fmadd_ps(tmp1784, _mm512_set1_ps(2e+00f), tmp1785);
tmp1781 = _mm512_fnmadd_ps(tmp1780, _mm512_set1_ps(2e+00f), tmp1781);
tmp1785 = _mm512_fnmadd_ps(tmp1784, _mm512_set1_ps(2e+00f), tmp1785);
tmp1780 = _mm512_fmadd_ps(in394, _mm512_set1_ps(2.5e-01f), tmp1773);
tmp1784 = tmp1776;
tmp1773 = _mm512_sub_ps(in395, tmp1773);
tmp1776 = _mm512_sub_ps(_mm512_setzero_ps(), tmp1776);
tmp1780 = _mm512_fmadd_ps(in398, _mm512_set1_ps(-1.25e+00f), tmp1780);
tmp1784 = _mm512_fmadd_ps(tmp1778, _mm512_set1_ps(-1.25e+00f), tmp1784);
in398 = _mm512_sub_ps(in398, in394);
tmp1778 = tmp1778;
in398 = _mm512_fmadd_ps(in398, _mm512_set1_ps(5.25e+00f), tmp1773);
tmp1778 = _mm512_fmadd_ps(tmp1778, _mm512_set1_ps(5.25e+00f), tmp1776);
tmp1774 = _mm512_fmadd_ps(tmp1780, _mm512_set1_ps(2e+00f), tmp1772);
tmp1777 = _mm512_fmadd_ps(tmp1784, _mm512_set1_ps(2e+00f), tmp1779);
tmp1772 = _mm512_fnmadd_ps(tmp1780, _mm512_set1_ps(2e+00f), tmp1772);
tmp1779 = _mm512_fnmadd_ps(tmp1784, _mm512_set1_ps(2e+00f), tmp1779);
__m512 out445 = _mm512_shuffle_f32x4(in392, tmp1782, 68);
__m512 out446 = _mm512_shuffle_f32x4(tmp1783, in396, 68);
__m512 out447 = _mm512_shuffle_f32x4(tmp1781, tmp1774, 68);
__m512 out448 = _mm512_shuffle_f32x4(tmp1772, in398, 68);
__m512 out449 = _mm512_shuffle_f32x4(tmp1775, tmp1786, 68);
__m512 out450 = _mm512_shuffle_f32x4(tmp1787, tmp1788, 68);
__m512 out451 = _mm512_shuffle_f32x4(tmp1785, tmp1777, 68);
__m512 out452 = _mm512_shuffle_f32x4(tmp1779, tmp1778, 68);
_mm512_storeu_ps(dfPtr2+256+2856960*i8+178560*j4+44544*ss2+768*k10, out445);
_mm512_storeu_ps(dfPtr2+320+2856960*i8+178560*j4+44544*ss2+768*k10, out449);
_mm512_storeu_ps(dfPtr2+714496+2856960*i8+178560*j4+44544*ss2+768*k10, out446);
_mm512_storeu_ps(dfPtr2+714560+2856960*i8+178560*j4+44544*ss2+768*k10, out450);
_mm512_storeu_ps(dfPtr2+1428736+2856960*i8+178560*j4+44544*ss2+768*k10, out447);
_mm512_storeu_ps(dfPtr2+1428800+2856960*i8+178560*j4+44544*ss2+768*k10, out451);
_mm512_storeu_ps(dfPtr2+2142976+2856960*i8+178560*j4+44544*ss2+768*k10, out448);
_mm512_storeu_ps(dfPtr2+2143040+2856960*i8+178560*j4+44544*ss2+768*k10, out452);
}
if (j4 >= last2) return;
++j4;
if (j4 >= 4) break;
}
}

// Builds a 4-dimensional task around Example13ThreeArrangeDats1Callee1 and
// submits it to the given team through Example13ThreaderDo1.
//
// team15   - team handle forwarded unchanged to Example13ThreaderDo1.
// tensors3 - tensor table stored in the task's any1 slot for the callee.
static void Example13ThreeArrangeDats1(Example13ThreaderTeam1* team15, char** tensors3) {
	// Extent of the task hull along each of its four axes (axis 0 first).
	static const int extent[4] = {4, 4, 6, 5};
	Example13ThreaderTask1 job;
	job.callee1 = Example13ThreeArrangeDats1Callee1;
	job.any1 = tensors3;
	job.nd1 = 4;
	for (int axis = 0; axis < 4; ++axis) {
		job.hull1[axis] = extent[axis];
	}
	Example13ThreaderDo1(team15, &job);
}

// Task callee for the "produce sums" stage with the slice index fixed at 0.
//
// task8->any1 is read as an array of pointers; its first entry is the tensor
// table. Table entry [0] supplies the float biases and, from byte 153280 on,
// the half-precision weights; entry [1] supplies the float data; entry [2]
// receives the float sums.
//
// pt9[0] selects a pair of column blocks (2*pt9[0] and 2*pt9[0]+1); pt9[1],
// pt9[2] and pt9[3] are used as row, part and group offsets into the buffers.
//
// For a full column block, 396 steps multiply four converted weight vectors
// by six data vectors, producing 24 accumulators stored as 1536 contiguous
// bytes. When the column walk reaches block 319, a narrower tail is computed
// instead: 198 steps, two weight vectors per step folded into six
// accumulators (384 bytes). When part is zero the accumulators start with the
// bias placed in lane 9 only; otherwise they start at zero.
//
// The early returns end the call after the second column block, or after the
// tail of the starting row, so the loop bounds act only as range limits.
static void Example13ThreeProduceSums1Callee1(Example13ThreaderTask1* task8, int64_t* pt9) {
	void** args = task8->any1;
	char** tensorTab = args[0];
	const ptrdiff_t slice = 0;
	const ptrdiff_t group = 1*pt9[3];
	const ptrdiff_t part = 1*pt9[2];
	ptrdiff_t row = 1*pt9[1];
	const ptrdiff_t tile = pt9[0];
	const char*restrict biasBase = tensorTab[0]+30648*slice;
	const char*restrict wgtBase = tensorTab[0]+153280+388371456*slice;
	const char*restrict datBase = tensorTab[1]+14598144*slice;
	char*restrict sumBase = tensorTab[2];
	// Only lane 9 of each accumulator is seeded with the bias value.
	const __mmask16 biasLane = 1u << 9;
	const ptrdiff_t firstRow = row+0;
	for (; row != 4; ++row) {
		const char* datRow = datBase+2433024*group+608256*part+152064*row;
		char* sumRow = sumBase+7845888*group+1961472*part+490368*row;
		ptrdiff_t col = 2*tile;
		const ptrdiff_t lastCol = col+1;
		for (; col != 319; ++col) {
			// aRC: R = weight vector (0..3), C = data vector (0..5).
			__m512 a00;
			__m512 a10;
			__m512 a20;
			__m512 a30;
			if (__builtin_expect(!part, 0)) {
				const char* biasAt = biasBase+5108*group+16*col;
				a00 = _mm512_mask_mov_ps(_mm512_setzero_ps(), biasLane, _mm512_set1_ps(*(const float*)(biasAt+0)));
				a10 = _mm512_mask_mov_ps(_mm512_setzero_ps(), biasLane, _mm512_set1_ps(*(const float*)(biasAt+4)));
				a20 = _mm512_mask_mov_ps(_mm512_setzero_ps(), biasLane, _mm512_set1_ps(*(const float*)(biasAt+8)));
				a30 = _mm512_mask_mov_ps(_mm512_setzero_ps(), biasLane, _mm512_set1_ps(*(const float*)(biasAt+12)));
			} else {
				a00 = _mm512_setzero_ps();
				a10 = _mm512_setzero_ps();
				a20 = _mm512_setzero_ps();
				a30 = _mm512_setzero_ps();
			}
			__m512 a01 = a00, a02 = a00, a03 = a00, a04 = a00, a05 = a00;
			__m512 a11 = a10, a12 = a10, a13 = a10, a14 = a10, a15 = a10;
			__m512 a21 = a20, a22 = a20, a23 = a20, a24 = a20, a25 = a20;
			__m512 a31 = a30, a32 = a30, a33 = a30, a34 = a30, a35 = a30;
			const char* wgtCol = wgtBase+64728576*group+16182144*part+50688*col;
			for (ptrdiff_t step = 0; step != 396; ++step) {
				const char* wgtAt = wgtCol+128*step;
				const char* datAt = datRow+384*step;
				// Each 64-byte weight load carries two vectors of 16 halves.
				const __m512i halves01 = _mm512_maskz_loadu_epi32(65535, wgtAt);
				const __m512 w0 = _mm512_cvtph_ps(_mm512_castsi512_si256(halves01));
				const __m512 d0 = _mm512_loadu_ps(datAt+0);
				a00 = _mm512_fmadd_ps(w0, d0, a00);
				const __m512 d1 = _mm512_loadu_ps(datAt+64);
				a01 = _mm512_fmadd_ps(w0, d1, a01);
				const __m512 d2 = _mm512_loadu_ps(datAt+128);
				a02 = _mm512_fmadd_ps(w0, d2, a02);
				const __m512 d3 = _mm512_loadu_ps(datAt+192);
				a03 = _mm512_fmadd_ps(w0, d3, a03);
				const __m512 d4 = _mm512_loadu_ps(datAt+256);
				a04 = _mm512_fmadd_ps(w0, d4, a04);
				const __m512 d5 = _mm512_loadu_ps(datAt+320);
				a05 = _mm512_fmadd_ps(w0, d5, a05);
				const __m512 w1 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(halves01, 1));
				a10 = _mm512_fmadd_ps(w1, d0, a10);
				a11 = _mm512_fmadd_ps(w1, d1, a11);
				a12 = _mm512_fmadd_ps(w1, d2, a12);
				a13 = _mm512_fmadd_ps(w1, d3, a13);
				a14 = _mm512_fmadd_ps(w1, d4, a14);
				a15 = _mm512_fmadd_ps(w1, d5, a15);
				const __m512i halves23 = _mm512_maskz_loadu_epi32(65535, wgtAt+64);
				const __m512 w2 = _mm512_cvtph_ps(_mm512_castsi512_si256(halves23));
				a20 = _mm512_fmadd_ps(w2, d0, a20);
				a21 = _mm512_fmadd_ps(w2, d1, a21);
				a22 = _mm512_fmadd_ps(w2, d2, a22);
				a23 = _mm512_fmadd_ps(w2, d3, a23);
				a24 = _mm512_fmadd_ps(w2, d4, a24);
				a25 = _mm512_fmadd_ps(w2, d5, a25);
				const __m512 w3 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(halves23, 1));
				a30 = _mm512_fmadd_ps(w3, d0, a30);
				a31 = _mm512_fmadd_ps(w3, d1, a31);
				a32 = _mm512_fmadd_ps(w3, d2, a32);
				a33 = _mm512_fmadd_ps(w3, d3, a33);
				a34 = _mm512_fmadd_ps(w3, d4, a34);
				a35 = _mm512_fmadd_ps(w3, d5, a35);
			}
			char* sumAt = sumRow+1536*col;
			_mm512_storeu_ps(sumAt+0, a00);
			_mm512_storeu_ps(sumAt+64, a01);
			_mm512_storeu_ps(sumAt+128, a02);
			_mm512_storeu_ps(sumAt+192, a03);
			_mm512_storeu_ps(sumAt+256, a04);
			_mm512_storeu_ps(sumAt+320, a05);
			_mm512_storeu_ps(sumAt+384, a10);
			_mm512_storeu_ps(sumAt+448, a11);
			_mm512_storeu_ps(sumAt+512, a12);
			_mm512_storeu_ps(sumAt+576, a13);
			_mm512_storeu_ps(sumAt+640, a14);
			_mm512_storeu_ps(sumAt+704, a15);
			_mm512_storeu_ps(sumAt+768, a20);
			_mm512_storeu_ps(sumAt+832, a21);
			_mm512_storeu_ps(sumAt+896, a22);
			_mm512_storeu_ps(sumAt+960, a23);
			_mm512_storeu_ps(sumAt+1024, a24);
			_mm512_storeu_ps(sumAt+1088, a25);
			_mm512_storeu_ps(sumAt+1152, a30);
			_mm512_storeu_ps(sumAt+1216, a31);
			_mm512_storeu_ps(sumAt+1280, a32);
			_mm512_storeu_ps(sumAt+1344, a33);
			_mm512_storeu_ps(sumAt+1408, a34);
			_mm512_storeu_ps(sumAt+1472, a35);
			// Stop once the second column block of this work item is done.
			if (col >= lastCol) return;
		}
		// Tail block: reached only when the column walk arrives at 319.
		__m512 t0;
		if (__builtin_expect(!part, 0)) {
			t0 = _mm512_mask_mov_ps(_mm512_setzero_ps(), biasLane, _mm512_set1_ps(*(const float*)(biasBase+0+5108*group+16*col)));
		} else {
			t0 = _mm512_setzero_ps();
		}
		__m512 t1 = t0, t2 = t0, t3 = t0, t4 = t0, t5 = t0;
		const char* wgtTail = wgtBase+64728576*group+16182144*part+50688*col;
		for (ptrdiff_t step = 0; step != 198; ++step) {
			const char* datAt = datRow+768*step;
			const __m512i halves = _mm512_maskz_loadu_epi32(65535, wgtTail+64*step);
			// Low 16 halves pair with the first six data vectors ...
			const __m512 wLo = _mm512_cvtph_ps(_mm512_castsi512_si256(halves));
			t0 = _mm512_fmadd_ps(wLo, _mm512_loadu_ps(datAt+0), t0);
			t1 = _mm512_fmadd_ps(wLo, _mm512_loadu_ps(datAt+64), t1);
			t2 = _mm512_fmadd_ps(wLo, _mm512_loadu_ps(datAt+128), t2);
			t3 = _mm512_fmadd_ps(wLo, _mm512_loadu_ps(datAt+192), t3);
			t4 = _mm512_fmadd_ps(wLo, _mm512_loadu_ps(datAt+256), t4);
			t5 = _mm512_fmadd_ps(wLo, _mm512_loadu_ps(datAt+320), t5);
			// ... and the high 16 halves with the following six.
			const __m512 wHi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(halves, 1));
			t0 = _mm512_fmadd_ps(wHi, _mm512_loadu_ps(datAt+384), t0);
			t1 = _mm512_fmadd_ps(wHi, _mm512_loadu_ps(datAt+448), t1);
			t2 = _mm512_fmadd_ps(wHi, _mm512_loadu_ps(datAt+512), t2);
			t3 = _mm512_fmadd_ps(wHi, _mm512_loadu_ps(datAt+576), t3);
			t4 = _mm512_fmadd_ps(wHi, _mm512_loadu_ps(datAt+640), t4);
			t5 = _mm512_fmadd_ps(wHi, _mm512_loadu_ps(datAt+704), t5);
		}
		char* tailAt = sumRow+1536*col;
		_mm512_storeu_ps(tailAt+0, t0);
		_mm512_storeu_ps(tailAt+64, t1);
		_mm512_storeu_ps(tailAt+128, t2);
		_mm512_storeu_ps(tailAt+192, t3);
		_mm512_storeu_ps(tailAt+256, t4);
		_mm512_storeu_ps(tailAt+320, t5);
		// Only the starting row is handled per call.
		if (row >= firstRow) return;
	}
}

// Task callee for the "produce sums" stage that adds one more slice into
// sums that are already stored.
//
// task9->any1 is read as an array of pointers: entry [0] is the tensor table
// and entry [1] is the slice index carried as a pointer-sized integer. Table
// entry [0] supplies the half-precision weights (from byte 153280 on), entry
// [1] supplies the float data, and entry [2] holds the float sums that are
// read, incremented and written back. No bias is applied here.
//
// pt10[0] selects a pair of column blocks (2*pt10[0] and 2*pt10[0]+1);
// pt10[1], pt10[2] and pt10[3] are used as row, part and group offsets.
//
// For a full column block, 396 steps multiply four converted weight vectors
// by six data vectors into 24 zero-initialised accumulators, which are then
// added to the 1536 bytes already in the sums buffer. When the column walk
// reaches block 319, a narrower tail is computed instead: 198 steps, two
// weight vectors per step folded into six accumulators (384 bytes).
//
// The early returns end the call after the second column block, or after the
// tail of the starting row, so the loop bounds act only as range limits.
static void Example13ThreeProduceSums1Callee2(Example13ThreaderTask1* task9, int64_t* pt10) {
	void** args = task9->any1;
	char** tensorTab = args[0];
	const ptrdiff_t slice = (ptrdiff_t)args[1];
	const ptrdiff_t group = 1*pt10[3];
	const ptrdiff_t part = 1*pt10[2];
	ptrdiff_t row = 1*pt10[1];
	const ptrdiff_t tile = pt10[0];
	const char*restrict biasBase = tensorTab[0]+30648*slice;
	const char*restrict wgtBase = tensorTab[0]+153280+388371456*slice;
	const char*restrict datBase = tensorTab[1]+14598144*slice;
	char*restrict sumBase = tensorTab[2];
	// The bias pointer is formed but never read in this variant.
	(void)biasBase;
	const ptrdiff_t firstRow = row+0;
	for (; row != 4; ++row) {
		const char* datRow = datBase+2433024*group+608256*part+152064*row;
		char* sumRow = sumBase+7845888*group+1961472*part+490368*row;
		ptrdiff_t col = 2*tile;
		const ptrdiff_t lastCol = col+1;
		for (; col != 319; ++col) {
			// aRC: R = weight vector (0..3), C = data vector (0..5).
			__m512 a00 = _mm512_setzero_ps();
			__m512 a10 = _mm512_setzero_ps();
			__m512 a20 = _mm512_setzero_ps();
			__m512 a30 = _mm512_setzero_ps();
			__m512 a01 = a00, a02 = a00, a03 = a00, a04 = a00, a05 = a00;
			__m512 a11 = a10, a12 = a10, a13 = a10, a14 = a10, a15 = a10;
			__m512 a21 = a20, a22 = a20, a23 = a20, a24 = a20, a25 = a20;
			__m512 a31 = a30, a32 = a30, a33 = a30, a34 = a30, a35 = a30;
			const char* wgtCol = wgtBase+64728576*group+16182144*part+50688*col;
			for (ptrdiff_t step = 0; step != 396; ++step) {
				const char* wgtAt = wgtCol+128*step;
				const char* datAt = datRow+384*step;
				// Each 64-byte weight load carries two vectors of 16 halves.
				const __m512i halves01 = _mm512_maskz_loadu_epi32(65535, wgtAt);
				const __m512 w0 = _mm512_cvtph_ps(_mm512_castsi512_si256(halves01));
				const __m512 d0 = _mm512_loadu_ps(datAt+0);
				a00 = _mm512_fmadd_ps(w0, d0, a00);
				const __m512 d1 = _mm512_loadu_ps(datAt+64);
				a01 = _mm512_fmadd_ps(w0, d1, a01);
				const __m512 d2 = _mm512_loadu_ps(datAt+128);
				a02 = _mm512_fmadd_ps(w0, d2, a02);
				const __m512 d3 = _mm512_loadu_ps(datAt+192);
				a03 = _mm512_fmadd_ps(w0, d3, a03);
				const __m512 d4 = _mm512_loadu_ps(datAt+256);
				a04 = _mm512_fmadd_ps(w0, d4, a04);
				const __m512 d5 = _mm512_loadu_ps(datAt+320);
				a05 = _mm512_fmadd_ps(w0, d5, a05);
				const __m512 w1 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(halves01, 1));
				a10 = _mm512_fmadd_ps(w1, d0, a10);
				a11 = _mm512_fmadd_ps(w1, d1, a11);
				a12 = _mm512_fmadd_ps(w1, d2, a12);
				a13 = _mm512_fmadd_ps(w1, d3, a13);
				a14 = _mm512_fmadd_ps(w1, d4, a14);
				a15 = _mm512_fmadd_ps(w1, d5, a15);
				const __m512i halves23 = _mm512_maskz_loadu_epi32(65535, wgtAt+64);
				const __m512 w2 = _mm512_cvtph_ps(_mm512_castsi512_si256(halves23));
				a20 = _mm512_fmadd_ps(w2, d0, a20);
				a21 = _mm512_fmadd_ps(w2, d1, a21);
				a22 = _mm512_fmadd_ps(w2, d2, a22);
				a23 = _mm512_fmadd_ps(w2, d3, a23);
				a24 = _mm512_fmadd_ps(w2, d4, a24);
				a25 = _mm512_fmadd_ps(w2, d5, a25);
				const __m512 w3 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(halves23, 1));
				a30 = _mm512_fmadd_ps(w3, d0, a30);
				a31 = _mm512_fmadd_ps(w3, d1, a31);
				a32 = _mm512_fmadd_ps(w3, d2, a32);
				a33 = _mm512_fmadd_ps(w3, d3, a33);
				a34 = _mm512_fmadd_ps(w3, d4, a34);
				a35 = _mm512_fmadd_ps(w3, d5, a35);
			}
			// Fold the new partial sums into what is already stored. Each
			// accumulator owns its own 64-byte slot, so the read-add-write
			// can be done slot by slot.
			char* sumAt = sumRow+1536*col;
			_mm512_storeu_ps(sumAt+0, _mm512_add_ps(a00, _mm512_loadu_ps(sumAt+0)));
			_mm512_storeu_ps(sumAt+64, _mm512_add_ps(a01, _mm512_loadu_ps(sumAt+64)));
			_mm512_storeu_ps(sumAt+128, _mm512_add_ps(a02, _mm512_loadu_ps(sumAt+128)));
			_mm512_storeu_ps(sumAt+192, _mm512_add_ps(a03, _mm512_loadu_ps(sumAt+192)));
			_mm512_storeu_ps(sumAt+256, _mm512_add_ps(a04, _mm512_loadu_ps(sumAt+256)));
			_mm512_storeu_ps(sumAt+320, _mm512_add_ps(a05, _mm512_loadu_ps(sumAt+320)));
			_mm512_storeu_ps(sumAt+384, _mm512_add_ps(a10, _mm512_loadu_ps(sumAt+384)));
			_mm512_storeu_ps(sumAt+448, _mm512_add_ps(a11, _mm512_loadu_ps(sumAt+448)));
			_mm512_storeu_ps(sumAt+512, _mm512_add_ps(a12, _mm512_loadu_ps(sumAt+512)));
			_mm512_storeu_ps(sumAt+576, _mm512_add_ps(a13, _mm512_loadu_ps(sumAt+576)));
			_mm512_storeu_ps(sumAt+640, _mm512_add_ps(a14, _mm512_loadu_ps(sumAt+640)));
			_mm512_storeu_ps(sumAt+704, _mm512_add_ps(a15, _mm512_loadu_ps(sumAt+704)));
			_mm512_storeu_ps(sumAt+768, _mm512_add_ps(a20, _mm512_loadu_ps(sumAt+768)));
			_mm512_storeu_ps(sumAt+832, _mm512_add_ps(a21, _mm512_loadu_ps(sumAt+832)));
			_mm512_storeu_ps(sumAt+896, _mm512_add_ps(a22, _mm512_loadu_ps(sumAt+896)));
			_mm512_storeu_ps(sumAt+960, _mm512_add_ps(a23, _mm512_loadu_ps(sumAt+960)));
			_mm512_storeu_ps(sumAt+1024, _mm512_add_ps(a24, _mm512_loadu_ps(sumAt+1024)));
			_mm512_storeu_ps(sumAt+1088, _mm512_add_ps(a25, _mm512_loadu_ps(sumAt+1088)));
			_mm512_storeu_ps(sumAt+1152, _mm512_add_ps(a30, _mm512_loadu_ps(sumAt+1152)));
			_mm512_storeu_ps(sumAt+1216, _mm512_add_ps(a31, _mm512_loadu_ps(sumAt+1216)));
			_mm512_storeu_ps(sumAt+1280, _mm512_add_ps(a32, _mm512_loadu_ps(sumAt+1280)));
			_mm512_storeu_ps(sumAt+1344, _mm512_add_ps(a33, _mm512_loadu_ps(sumAt+1344)));
			_mm512_storeu_ps(sumAt+1408, _mm512_add_ps(a34, _mm512_loadu_ps(sumAt+1408)));
			_mm512_storeu_ps(sumAt+1472, _mm512_add_ps(a35, _mm512_loadu_ps(sumAt+1472)));
			// Stop once the second column block of this work item is done.
			if (col >= lastCol) return;
		}
		// Tail block: reached only when the column walk arrives at 319.
		__m512 t0 = _mm512_setzero_ps();
		__m512 t1 = t0, t2 = t0, t3 = t0, t4 = t0, t5 = t0;
		const char* wgtTail = wgtBase+64728576*group+16182144*part+50688*col;
		for (ptrdiff_t step = 0; step != 198; ++step) {
			const char* datAt = datRow+768*step;
			const __m512i halves = _mm512_maskz_loadu_epi32(65535, wgtTail+64*step);
			// Low 16 halves pair with the first six data vectors ...
			const __m512 wLo = _mm512_cvtph_ps(_mm512_castsi512_si256(halves));
			t0 = _mm512_fmadd_ps(wLo, _mm512_loadu_ps(datAt+0), t0);
			t1 = _mm512_fmadd_ps(wLo, _mm512_loadu_ps(datAt+64), t1);
			t2 = _mm512_fmadd_ps(wLo, _mm512_loadu_ps(datAt+128), t2);
			t3 = _mm512_fmadd_ps(wLo, _mm512_loadu_ps(datAt+192), t3);
			t4 = _mm512_fmadd_ps(wLo, _mm512_loadu_ps(datAt+256), t4);
			t5 = _mm512_fmadd_ps(wLo, _mm512_loadu_ps(datAt+320), t5);
			// ... and the high 16 halves with the following six.
			const __m512 wHi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(halves, 1));
			t0 = _mm512_fmadd_ps(wHi, _mm512_loadu_ps(datAt+384), t0);
			t1 = _mm512_fmadd_ps(wHi, _mm512_loadu_ps(datAt+448), t1);
			t2 = _mm512_fmadd_ps(wHi, _mm512_loadu_ps(datAt+512), t2);
			t3 = _mm512_fmadd_ps(wHi, _mm512_loadu_ps(datAt+576), t3);
			t4 = _mm512_fmadd_ps(wHi, _mm512_loadu_ps(datAt+640), t4);
			t5 = _mm512_fmadd_ps(wHi, _mm512_loadu_ps(datAt+704), t5);
		}
		char* tailAt = sumRow+1536*col;
		_mm512_storeu_ps(tailAt+0, _mm512_add_ps(t0, _mm512_loadu_ps(tailAt+0)));
		_mm512_storeu_ps(tailAt+64, _mm512_add_ps(t1, _mm512_loadu_ps(tailAt+64)));
		_mm512_storeu_ps(tailAt+128, _mm512_add_ps(t2, _mm512_loadu_ps(tailAt+128)));
		_mm512_storeu_ps(tailAt+192, _mm512_add_ps(t3, _mm512_loadu_ps(tailAt+192)));
		_mm512_storeu_ps(tailAt+256, _mm512_add_ps(t4, _mm512_loadu_ps(tailAt+256)));
		_mm512_storeu_ps(tailAt+320, _mm512_add_ps(t5, _mm512_loadu_ps(tailAt+320)));
		// Only the starting row is handled per call.
		if (row >= firstRow) return;
	}
}

// Worker for the final pass of the sums-production step (chunk index fixed
// at 4). Example13ThreeProduceSums1 hands it to Example13ThreaderDo1.
//
// task10->any1 is read as an array whose slot 0 is the tensor pointer table.
// pt11[0..3] are the coordinates of the point to process.
//
// For that point the function widens half-precision weights (tensor 0) to
// float32, multiplies them with float32 data (tensor 1), and adds the
// accumulated products onto the float32 sums already stored in tensor 2
// (load, add, store back). Nothing is returned.
static void Example13ThreeProduceSums1Callee3(Example13ThreaderTask1* task10, int64_t* pt11) {
	void** anyPair = task10->any1;
	char** tensorTab = anyPair[0];
	const ptrdiff_t chunk = 4;
	const ptrdiff_t ptW = pt11[0];
	const ptrdiff_t ptD = pt11[1];
	const ptrdiff_t ptF = pt11[2];
	const ptrdiff_t ptG = pt11[3];
	char*restrict biasBase = tensorTab[0]+30648*chunk;
	char*restrict wtBase = tensorTab[0]+153280+388371456*chunk;
	char*restrict datBase = tensorTab[1]+14598144*chunk;
	char*restrict sumBase = tensorTab[2];
	// The bias pointer is formed but never dereferenced in this pass.
	(void)biasBase;
	ptrdiff_t row = ptD;
	const ptrdiff_t rowStop = row;
	while (row != 4) {
		// Address parts that stay fixed for the whole row.
		char* wtPoint = wtBase+76007168*ptG+19001792*ptF;
		char* datRow = datBase+2856960*ptG+714240*ptF+178560*row;
		char* sumRow = sumBase+7845888*ptG+1961472*ptF+490368*row;
		ptrdiff_t col = 2*ptW;
		const ptrdiff_t colStop = col+1;
		// Full-width columns: 4 weight vectors x 6 data vectors = 24 accumulators.
		for (; col != 319; ++col) {
			char* wtCol = wtPoint+59520*col;
			char* sumCol = sumRow+1536*col;
			__m512 s00 = _mm512_setzero_ps();
			__m512 s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00;
			__m512 s10 = _mm512_setzero_ps();
			__m512 s11 = s10, s12 = s10, s13 = s10, s14 = s10, s15 = s10;
			__m512 s20 = _mm512_setzero_ps();
			__m512 s21 = s20, s22 = s20, s23 = s20, s24 = s20, s25 = s20;
			__m512 s30 = _mm512_setzero_ps();
			__m512 s31 = s30, s32 = s30, s33 = s30, s34 = s30, s35 = s30;
			for (ptrdiff_t step = 0; step != 465; ++step) {
				// Each 64-byte weight load carries two 16-lane half-precision vectors.
				__m512i wPairA = _mm512_maskz_loadu_epi32(65535, wtCol+128*step);
				__m512i wPairB = _mm512_maskz_loadu_epi32(65535, wtCol+64+128*step);
				char* datStep = datRow+384*step;
				__m512 d0 = _mm512_loadu_ps(datStep);
				__m512 d1 = _mm512_loadu_ps(datStep+64);
				__m512 d2 = _mm512_loadu_ps(datStep+128);
				__m512 d3 = _mm512_loadu_ps(datStep+192);
				__m512 d4 = _mm512_loadu_ps(datStep+256);
				__m512 d5 = _mm512_loadu_ps(datStep+320);
				__m512 w0 = _mm512_cvtph_ps(_mm512_castsi512_si256(wPairA));
				s00 = _mm512_fmadd_ps(w0, d0, s00);
				s01 = _mm512_fmadd_ps(w0, d1, s01);
				s02 = _mm512_fmadd_ps(w0, d2, s02);
				s03 = _mm512_fmadd_ps(w0, d3, s03);
				s04 = _mm512_fmadd_ps(w0, d4, s04);
				s05 = _mm512_fmadd_ps(w0, d5, s05);
				__m512 w1 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wPairA, 1));
				s10 = _mm512_fmadd_ps(w1, d0, s10);
				s11 = _mm512_fmadd_ps(w1, d1, s11);
				s12 = _mm512_fmadd_ps(w1, d2, s12);
				s13 = _mm512_fmadd_ps(w1, d3, s13);
				s14 = _mm512_fmadd_ps(w1, d4, s14);
				s15 = _mm512_fmadd_ps(w1, d5, s15);
				__m512 w2 = _mm512_cvtph_ps(_mm512_castsi512_si256(wPairB));
				s20 = _mm512_fmadd_ps(w2, d0, s20);
				s21 = _mm512_fmadd_ps(w2, d1, s21);
				s22 = _mm512_fmadd_ps(w2, d2, s22);
				s23 = _mm512_fmadd_ps(w2, d3, s23);
				s24 = _mm512_fmadd_ps(w2, d4, s24);
				s25 = _mm512_fmadd_ps(w2, d5, s25);
				__m512 w3 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wPairB, 1));
				s30 = _mm512_fmadd_ps(w3, d0, s30);
				s31 = _mm512_fmadd_ps(w3, d1, s31);
				s32 = _mm512_fmadd_ps(w3, d2, s32);
				s33 = _mm512_fmadd_ps(w3, d3, s33);
				s34 = _mm512_fmadd_ps(w3, d4, s34);
				s35 = _mm512_fmadd_ps(w3, d5, s35);
			}
			// Fold in the sums already stored for this column (all loads first)...
			s00 = _mm512_add_ps(s00, _mm512_loadu_ps(sumCol));
			s01 = _mm512_add_ps(s01, _mm512_loadu_ps(sumCol+64));
			s02 = _mm512_add_ps(s02, _mm512_loadu_ps(sumCol+128));
			s03 = _mm512_add_ps(s03, _mm512_loadu_ps(sumCol+192));
			s04 = _mm512_add_ps(s04, _mm512_loadu_ps(sumCol+256));
			s05 = _mm512_add_ps(s05, _mm512_loadu_ps(sumCol+320));
			s10 = _mm512_add_ps(s10, _mm512_loadu_ps(sumCol+384));
			s11 = _mm512_add_ps(s11, _mm512_loadu_ps(sumCol+448));
			s12 = _mm512_add_ps(s12, _mm512_loadu_ps(sumCol+512));
			s13 = _mm512_add_ps(s13, _mm512_loadu_ps(sumCol+576));
			s14 = _mm512_add_ps(s14, _mm512_loadu_ps(sumCol+640));
			s15 = _mm512_add_ps(s15, _mm512_loadu_ps(sumCol+704));
			s20 = _mm512_add_ps(s20, _mm512_loadu_ps(sumCol+768));
			s21 = _mm512_add_ps(s21, _mm512_loadu_ps(sumCol+832));
			s22 = _mm512_add_ps(s22, _mm512_loadu_ps(sumCol+896));
			s23 = _mm512_add_ps(s23, _mm512_loadu_ps(sumCol+960));
			s24 = _mm512_add_ps(s24, _mm512_loadu_ps(sumCol+1024));
			s25 = _mm512_add_ps(s25, _mm512_loadu_ps(sumCol+1088));
			s30 = _mm512_add_ps(s30, _mm512_loadu_ps(sumCol+1152));
			s31 = _mm512_add_ps(s31, _mm512_loadu_ps(sumCol+1216));
			s32 = _mm512_add_ps(s32, _mm512_loadu_ps(sumCol+1280));
			s33 = _mm512_add_ps(s33, _mm512_loadu_ps(sumCol+1344));
			s34 = _mm512_add_ps(s34, _mm512_loadu_ps(sumCol+1408));
			s35 = _mm512_add_ps(s35, _mm512_loadu_ps(sumCol+1472));
			// ...then write every updated sum back.
			_mm512_storeu_ps(sumCol, s00);
			_mm512_storeu_ps(sumCol+64, s01);
			_mm512_storeu_ps(sumCol+128, s02);
			_mm512_storeu_ps(sumCol+192, s03);
			_mm512_storeu_ps(sumCol+256, s04);
			_mm512_storeu_ps(sumCol+320, s05);
			_mm512_storeu_ps(sumCol+384, s10);
			_mm512_storeu_ps(sumCol+448, s11);
			_mm512_storeu_ps(sumCol+512, s12);
			_mm512_storeu_ps(sumCol+576, s13);
			_mm512_storeu_ps(sumCol+640, s14);
			_mm512_storeu_ps(sumCol+704, s15);
			_mm512_storeu_ps(sumCol+768, s20);
			_mm512_storeu_ps(sumCol+832, s21);
			_mm512_storeu_ps(sumCol+896, s22);
			_mm512_storeu_ps(sumCol+960, s23);
			_mm512_storeu_ps(sumCol+1024, s24);
			_mm512_storeu_ps(sumCol+1088, s25);
			_mm512_storeu_ps(sumCol+1152, s30);
			_mm512_storeu_ps(sumCol+1216, s31);
			_mm512_storeu_ps(sumCol+1280, s32);
			_mm512_storeu_ps(sumCol+1344, s33);
			_mm512_storeu_ps(sumCol+1408, s34);
			_mm512_storeu_ps(sumCol+1472, s35);
			// A point covers two consecutive columns; stop after the second.
			if (col >= colStop) return;
		}
		// Reached only when the loop ran off its end (col == 319): the last
		// column is narrower and uses six accumulators. Each weight load now
		// covers two consecutive data steps of 384 bytes.
		char* wtTail = wtPoint+59520*col;
		char* sumTail = sumRow+1536*col;
		__m512 t0 = _mm512_setzero_ps();
		__m512 t1 = t0, t2 = t0, t3 = t0, t4 = t0, t5 = t0;
		ptrdiff_t pair = 0;
		for (; pair != 232; ++pair) {
			__m512i wBoth = _mm512_maskz_loadu_epi32(65535, wtTail+64*pair);
			char* datPair = datRow+768*pair;
			__m512 wEven = _mm512_cvtph_ps(_mm512_castsi512_si256(wBoth));
			t0 = _mm512_fmadd_ps(wEven, _mm512_loadu_ps(datPair), t0);
			t1 = _mm512_fmadd_ps(wEven, _mm512_loadu_ps(datPair+64), t1);
			t2 = _mm512_fmadd_ps(wEven, _mm512_loadu_ps(datPair+128), t2);
			t3 = _mm512_fmadd_ps(wEven, _mm512_loadu_ps(datPair+192), t3);
			t4 = _mm512_fmadd_ps(wEven, _mm512_loadu_ps(datPair+256), t4);
			t5 = _mm512_fmadd_ps(wEven, _mm512_loadu_ps(datPair+320), t5);
			__m512 wOdd = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wBoth, 1));
			t0 = _mm512_fmadd_ps(wOdd, _mm512_loadu_ps(datPair+384), t0);
			t1 = _mm512_fmadd_ps(wOdd, _mm512_loadu_ps(datPair+448), t1);
			t2 = _mm512_fmadd_ps(wOdd, _mm512_loadu_ps(datPair+512), t2);
			t3 = _mm512_fmadd_ps(wOdd, _mm512_loadu_ps(datPair+576), t3);
			t4 = _mm512_fmadd_ps(wOdd, _mm512_loadu_ps(datPair+640), t4);
			t5 = _mm512_fmadd_ps(wOdd, _mm512_loadu_ps(datPair+704), t5);
		}
		// One unpaired step is left (pair == 232): mask 255 loads only the low
		// eight dwords, i.e. a single 16-lane half-precision weight vector.
		__m512i wLast = _mm512_maskz_loadu_epi32(255, wtTail+64*pair);
		__m512 wEnd = _mm512_cvtph_ps(_mm512_castsi512_si256(wLast));
		char* datEnd = datRow+768*pair;
		t0 = _mm512_fmadd_ps(wEnd, _mm512_loadu_ps(datEnd), t0);
		t1 = _mm512_fmadd_ps(wEnd, _mm512_loadu_ps(datEnd+64), t1);
		t2 = _mm512_fmadd_ps(wEnd, _mm512_loadu_ps(datEnd+128), t2);
		t3 = _mm512_fmadd_ps(wEnd, _mm512_loadu_ps(datEnd+192), t3);
		t4 = _mm512_fmadd_ps(wEnd, _mm512_loadu_ps(datEnd+256), t4);
		t5 = _mm512_fmadd_ps(wEnd, _mm512_loadu_ps(datEnd+320), t5);
		t0 = _mm512_add_ps(t0, _mm512_loadu_ps(sumTail));
		t1 = _mm512_add_ps(t1, _mm512_loadu_ps(sumTail+64));
		t2 = _mm512_add_ps(t2, _mm512_loadu_ps(sumTail+128));
		t3 = _mm512_add_ps(t3, _mm512_loadu_ps(sumTail+192));
		t4 = _mm512_add_ps(t4, _mm512_loadu_ps(sumTail+256));
		t5 = _mm512_add_ps(t5, _mm512_loadu_ps(sumTail+320));
		_mm512_storeu_ps(sumTail, t0);
		_mm512_storeu_ps(sumTail+64, t1);
		_mm512_storeu_ps(sumTail+128, t2);
		_mm512_storeu_ps(sumTail+192, t3);
		_mm512_storeu_ps(sumTail+256, t4);
		_mm512_storeu_ps(sumTail+320, t5);
		// Only the starting row belongs to this point.
		if (row >= rowStop) return;
		++row;
	}
}

// Runs the sums-production step as five consecutive threaded passes over the
// same 4-dimensional hull {160, 4, 4, 6}: one pass with Callee1, three with
// Callee2, and one with Callee3. Each pass is dispatched through
// Example13ThreaderDo1 before the next is set up.
//
// team16:   thread team passed through to Example13ThreaderDo1.
// tensors5: tensor pointer table, shared with every callee via slot 0 of the
//           two-slot array placed in each task's any1 field.
static void Example13ThreeProduceSums1(Example13ThreaderTeam1* team16, char** tensors5) {
	// Slot 0: tensor table. Slot 1: pass counter, rewritten before each
	// Callee2 dispatch (presumably read there as the chunk index; Callee2's
	// prologue is not shown here). Callee3 reads only slot 0.
	void* shared[] = {tensors5, 0};

	Example13ThreaderTask1 firstPass = {
		.callee1 = Example13ThreeProduceSums1Callee1,
		.any1 = shared,
		.nd1 = 4,
		.hull1 = {160, 4, 4, 6},
	};
	Example13ThreaderDo1(team16, &firstPass);

	ptrdiff_t pass = 1;
	while (pass < 4) {
		shared[1] = (void*)pass;
		Example13ThreaderTask1 middlePass = {
			.callee1 = Example13ThreeProduceSums1Callee2,
			.any1 = shared,
			.nd1 = 4,
			.hull1 = {160, 4, 4, 6},
		};
		Example13ThreaderDo1(team16, &middlePass);
		++pass;
	}

	Example13ThreaderTask1 lastPass = {
		.callee1 = Example13ThreeProduceSums1Callee3,
		.any1 = shared,
		.nd1 = 4,
		.hull1 = {160, 4, 4, 6},
	};
	Example13ThreaderDo1(team16, &lastPass);
}

static void Example13ThreeConsumeSums1Callee1(Example13ThreaderTask1* task14, int64_t* pt12) {
char** tensors10 = task14->any1;
ptrdiff_t w8 = pt12[0];
ptrdiff_t d4 = pt12[1];
ptrdiff_t g7 = pt12[2];
char*restrict sfPtr4 = tensors10[0];
char*restrict datPtr3 = tensors10[1];
ptrdiff_t i12 = 1*g7;
ptrdiff_t j8 = 1*d4;
ptrdiff_t last3 = j8+0;
ptrdiff_t rel3 = (size_t)(j8-0)%2;
ptrdiff_t base3 = 0+(size_t)(j8-0)/2*18;
for (; ; rel3 = 0, base3 += 18) {
if (rel3 < 1) {
ptrdiff_t toH1 = base3+0;
ptrdiff_t toW1 = 0;
ptrdiff_t k14 = 22*w8;
ptrdiff_t kk4 = k14+(w8 < 13 ? 21 : 33);
for (; k14 != 319; ++k14) {
ptrdiff_t l4 = 0;
for (; l4 != 2; ++l4) {
__m512 sf1 = _mm512_loadu_ps(sfPtr4+0+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf2 = _mm512_loadu_ps(sfPtr4+128+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in400 = _mm512_shuffle_f32x4(sf1, sf2, 68);
__m512 in401 = _mm512_shuffle_f32x4(sf1, sf2, 238);
__m512 sf3 = _mm512_loadu_ps(sfPtr4+64+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf4 = _mm512_loadu_ps(sfPtr4+192+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in408 = _mm512_shuffle_f32x4(sf3, sf4, 68);
__m512 in409 = _mm512_shuffle_f32x4(sf3, sf4, 238);
__m512 sf5 = _mm512_loadu_ps(sfPtr4+1961472+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf6 = _mm512_loadu_ps(sfPtr4+1961600+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in402 = _mm512_shuffle_f32x4(sf5, sf6, 68);
__m512 in403 = _mm512_shuffle_f32x4(sf5, sf6, 238);
__m512 sf7 = _mm512_loadu_ps(sfPtr4+1961536+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf8 = _mm512_loadu_ps(sfPtr4+1961664+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in410 = _mm512_shuffle_f32x4(sf7, sf8, 68);
__m512 in411 = _mm512_shuffle_f32x4(sf7, sf8, 238);
__m512 sf9 = _mm512_loadu_ps(sfPtr4+3922944+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf10 = _mm512_loadu_ps(sfPtr4+3923072+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in404 = _mm512_shuffle_f32x4(sf9, sf10, 68);
__m512 in405 = _mm512_shuffle_f32x4(sf9, sf10, 238);
__m512 sf11 = _mm512_loadu_ps(sfPtr4+3923008+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf12 = _mm512_loadu_ps(sfPtr4+3923136+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in412 = _mm512_shuffle_f32x4(sf11, sf12, 68);
__m512 in413 = _mm512_shuffle_f32x4(sf11, sf12, 238);
__m512 sf13 = _mm512_loadu_ps(sfPtr4+5884416+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf14 = _mm512_loadu_ps(sfPtr4+5884544+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in406 = _mm512_shuffle_f32x4(sf13, sf14, 68);
__m512 in407 = _mm512_shuffle_f32x4(sf13, sf14, 238);
__m512 sf15 = _mm512_loadu_ps(sfPtr4+5884480+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf16 = _mm512_loadu_ps(sfPtr4+5884608+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in414 = _mm512_shuffle_f32x4(sf15, sf16, 68);
__m512 in415 = _mm512_shuffle_f32x4(sf15, sf16, 238);
__m512 tmp1829 = _mm512_add_ps(in401, in402);
__m512 tmp1849 = _mm512_add_ps(in409, in410);
__m512 tmp1828 = _mm512_add_ps(in403, in404);
__m512 tmp1848 = _mm512_add_ps(in411, in412);
__m512 tmp1834 = _mm512_sub_ps(in403, in404);
__m512 tmp1854 = _mm512_sub_ps(in411, in412);
__m512 tmp1833 = _mm512_sub_ps(in401, in402);
__m512 tmp1853 = _mm512_sub_ps(in409, in410);
__m512 tmp1830 = _mm512_add_ps(in405, in406);
__m512 tmp1850 = _mm512_add_ps(in413, in414);
__m512 tmp1835 = _mm512_sub_ps(in405, in406);
__m512 tmp1855 = _mm512_sub_ps(in413, in414);
__m512 tmp1832 = _mm512_fmadd_ps(tmp1834, _mm512_set1_ps(2e+00f), tmp1833);
__m512 tmp1852 = _mm512_fmadd_ps(tmp1854, _mm512_set1_ps(2e+00f), tmp1853);
__m512 tmp1839 = _mm512_fmadd_ps(tmp1834, _mm512_set1_ps(8e+00f), tmp1833);
__m512 tmp1859 = _mm512_fmadd_ps(tmp1854, _mm512_set1_ps(8e+00f), tmp1853);
__m512 tmp1827 = _mm512_add_ps(tmp1828, tmp1829);
__m512 tmp1847 = _mm512_add_ps(tmp1848, tmp1849);
__m512 tmp1831 = _mm512_fmadd_ps(tmp1835, _mm512_set1_ps(1.6e+01f), tmp1832);
__m512 tmp1851 = _mm512_fmadd_ps(tmp1855, _mm512_set1_ps(1.6e+01f), tmp1852);
__m512 tmp1838 = _mm512_fmadd_ps(tmp1835, _mm512_set1_ps(4e+00f), tmp1839);
__m512 tmp1858 = _mm512_fmadd_ps(tmp1855, _mm512_set1_ps(4e+00f), tmp1859);
__m512 tmp1844 = _mm512_add_ps(tmp1835, tmp1833);
__m512 tmp1864 = _mm512_add_ps(tmp1855, tmp1853);
__m512 tmp1837 = _mm512_fmadd_ps(tmp1828, _mm512_set1_ps(4e+00f), tmp1829);
__m512 tmp1857 = _mm512_fmadd_ps(tmp1848, _mm512_set1_ps(4e+00f), tmp1849);
__m512 tmp1841 = _mm512_fmadd_ps(tmp1828, _mm512_set1_ps(1.6e+01f), tmp1829);
__m512 tmp1861 = _mm512_fmadd_ps(tmp1848, _mm512_set1_ps(1.6e+01f), tmp1849);
__m512 tmp1826 = _mm512_add_ps(tmp1827, in400);
__m512 tmp1846 = _mm512_add_ps(tmp1847, in408);
__m512 tmp1843 = _mm512_add_ps(tmp1844, in407);
__m512 tmp1863 = _mm512_add_ps(tmp1864, in415);
__m512 tmp1825 = _mm512_fmadd_ps(tmp1830, _mm512_set1_ps(3.2e+01f), tmp1826);
__m512 tmp1845 = _mm512_fmadd_ps(tmp1850, _mm512_set1_ps(3.2e+01f), tmp1846);
__m512 tmp1836 = _mm512_fmadd_ps(tmp1830, _mm512_set1_ps(8e+00f), tmp1837);
__m512 tmp1856 = _mm512_fmadd_ps(tmp1850, _mm512_set1_ps(8e+00f), tmp1857);
__m512 tmp1842 = _mm512_fmadd_ps(tmp1834, _mm512_set1_ps(3.2e+01f), tmp1843);
__m512 tmp1862 = _mm512_fmadd_ps(tmp1854, _mm512_set1_ps(3.2e+01f), tmp1863);
__m512 tmp1840 = _mm512_fmadd_ps(tmp1830, _mm512_set1_ps(2e+00f), tmp1841);
__m512 tmp1860 = _mm512_fmadd_ps(tmp1850, _mm512_set1_ps(2e+00f), tmp1861);
__m512 tmp1813 = tmp1825;
__m512 tmp1819 = tmp1845;
__m512 tmp1814 = tmp1831;
__m512 tmp1820 = tmp1851;
__m512 tmp1815 = tmp1836;
__m512 tmp1821 = tmp1856;
__m512 tmp1816 = tmp1838;
__m512 tmp1822 = tmp1858;
__m512 tmp1817 = tmp1840;
__m512 tmp1823 = tmp1860;
__m512 tmp1818 = tmp1842;
__m512 tmp1824 = tmp1862;
__m512 tmp1909 = _mm512_unpacklo_ps(tmp1813, tmp1814);
__m512 tmp1910 = _mm512_unpackhi_ps(tmp1813, tmp1814);
__m512 tmp1911 = _mm512_unpacklo_ps(tmp1815, tmp1816);
__m512 tmp1912 = _mm512_unpackhi_ps(tmp1815, tmp1816);
__m512 tmp1913 = _mm512_unpacklo_ps(tmp1817, tmp1818);
__m512 tmp1914 = _mm512_unpackhi_ps(tmp1817, tmp1818);
__m512 tmp1915 = _mm512_unpacklo_ps(tmp1819, tmp1820);
__m512 tmp1916 = _mm512_unpackhi_ps(tmp1819, tmp1820);
__m512 tmp1917 = _mm512_unpacklo_ps(tmp1821, tmp1822);
__m512 tmp1918 = _mm512_unpackhi_ps(tmp1821, tmp1822);
__m512 tmp1919 = _mm512_unpacklo_ps(tmp1823, tmp1824);
__m512 tmp1920 = _mm512_unpackhi_ps(tmp1823, tmp1824);
__m512 tmp1921 = _mm512_shuffle_ps(tmp1909, tmp1911, 68);
__m512 tmp1922 = _mm512_shuffle_ps(tmp1909, tmp1911, 238);
__m512 tmp1923 = _mm512_shuffle_ps(tmp1910, tmp1912, 68);
__m512 tmp1924 = _mm512_shuffle_ps(tmp1910, tmp1912, 238);
__m512 tmp1925 = _mm512_shuffle_ps(tmp1913, tmp1915, 68);
__m512 tmp1926 = _mm512_shuffle_ps(tmp1913, tmp1915, 238);
__m512 tmp1927 = _mm512_shuffle_ps(tmp1914, tmp1916, 68);
__m512 tmp1928 = _mm512_shuffle_ps(tmp1914, tmp1916, 238);
__m512 tmp1929 = _mm512_shuffle_ps(tmp1917, tmp1919, 68);
__m512 tmp1930 = _mm512_shuffle_ps(tmp1917, tmp1919, 238);
__m512 tmp1931 = _mm512_shuffle_ps(tmp1918, tmp1920, 68);
__m512 tmp1932 = _mm512_shuffle_ps(tmp1918, tmp1920, 238);
__m512 tmp1933 = _mm512_shuffle_f32x4(tmp1921, tmp1925, 136);
__m512 tmp1934 = _mm512_shuffle_f32x4(tmp1921, tmp1925, 221);
__m512 tmp1935 = _mm512_shuffle_f32x4(tmp1922, tmp1926, 136);
__m512 tmp1936 = _mm512_shuffle_f32x4(tmp1922, tmp1926, 221);
__m512 tmp1937 = _mm512_shuffle_f32x4(tmp1923, tmp1927, 136);
__m512 tmp1938 = _mm512_shuffle_f32x4(tmp1923, tmp1927, 221);
__m512 tmp1939 = _mm512_shuffle_f32x4(tmp1924, tmp1928, 136);
__m512 tmp1940 = _mm512_shuffle_f32x4(tmp1924, tmp1928, 221);
__m512 tmp1941 = _mm512_shuffle_f32x4(tmp1929, tmp1929, 136);
__m512 tmp1942 = _mm512_shuffle_f32x4(tmp1929, tmp1929, 221);
__m512 tmp1943 = _mm512_shuffle_f32x4(tmp1930, tmp1930, 136);
__m512 tmp1944 = _mm512_shuffle_f32x4(tmp1930, tmp1930, 221);
__m512 tmp1945 = _mm512_shuffle_f32x4(tmp1931, tmp1931, 136);
__m512 tmp1946 = _mm512_shuffle_f32x4(tmp1931, tmp1931, 221);
__m512 tmp1947 = _mm512_shuffle_f32x4(tmp1932, tmp1932, 136);
__m512 tmp1948 = _mm512_shuffle_f32x4(tmp1932, tmp1932, 221);
tmp1813 = _mm512_shuffle_f32x4(tmp1933, tmp1941, 136);
tmp1821 = _mm512_shuffle_f32x4(tmp1933, tmp1941, 221);
tmp1814 = _mm512_shuffle_f32x4(tmp1935, tmp1943, 136);
tmp1822 = _mm512_shuffle_f32x4(tmp1935, tmp1943, 221);
tmp1815 = _mm512_shuffle_f32x4(tmp1937, tmp1945, 136);
tmp1823 = _mm512_shuffle_f32x4(tmp1937, tmp1945, 221);
tmp1816 = _mm512_shuffle_f32x4(tmp1939, tmp1947, 136);
tmp1824 = _mm512_shuffle_f32x4(tmp1939, tmp1947, 221);
tmp1817 = _mm512_shuffle_f32x4(tmp1934, tmp1942, 136);
__m512 tmp1865 = _mm512_shuffle_f32x4(tmp1934, tmp1942, 221);
tmp1818 = _mm512_shuffle_f32x4(tmp1936, tmp1944, 136);
__m512 tmp1866 = _mm512_shuffle_f32x4(tmp1936, tmp1944, 221);
tmp1819 = _mm512_shuffle_f32x4(tmp1938, tmp1946, 136);
__m512 tmp1867 = _mm512_shuffle_f32x4(tmp1938, tmp1946, 221);
tmp1820 = _mm512_shuffle_f32x4(tmp1940, tmp1948, 136);
__m512 tmp1868 = _mm512_shuffle_f32x4(tmp1940, tmp1948, 221);
__m512 tmp1873 = _mm512_add_ps(tmp1814, tmp1815);
__m512 tmp1893 = _mm512_add_ps(tmp1822, tmp1823);
__m512 tmp1872 = _mm512_add_ps(tmp1816, tmp1817);
__m512 tmp1892 = _mm512_add_ps(tmp1824, tmp1865);
__m512 tmp1878 = _mm512_sub_ps(tmp1816, tmp1817);
__m512 tmp1898 = _mm512_sub_ps(tmp1824, tmp1865);
__m512 tmp1877 = _mm512_sub_ps(tmp1814, tmp1815);
__m512 tmp1897 = _mm512_sub_ps(tmp1822, tmp1823);
__m512 tmp1874 = _mm512_add_ps(tmp1818, tmp1819);
__m512 tmp1894 = _mm512_add_ps(tmp1866, tmp1867);
__m512 tmp1879 = _mm512_sub_ps(tmp1818, tmp1819);
__m512 tmp1899 = _mm512_sub_ps(tmp1866, tmp1867);
__m512 tmp1876 = _mm512_fmadd_ps(tmp1878, _mm512_set1_ps(2e+00f), tmp1877);
__m512 tmp1896 = _mm512_fmadd_ps(tmp1898, _mm512_set1_ps(2e+00f), tmp1897);
__m512 tmp1883 = _mm512_fmadd_ps(tmp1878, _mm512_set1_ps(8e+00f), tmp1877);
__m512 tmp1903 = _mm512_fmadd_ps(tmp1898, _mm512_set1_ps(8e+00f), tmp1897);
__m512 tmp1871 = _mm512_add_ps(tmp1872, tmp1873);
__m512 tmp1891 = _mm512_add_ps(tmp1892, tmp1893);
__m512 tmp1875 = _mm512_fmadd_ps(tmp1879, _mm512_set1_ps(1.6e+01f), tmp1876);
__m512 tmp1895 = _mm512_fmadd_ps(tmp1899, _mm512_set1_ps(1.6e+01f), tmp1896);
__m512 tmp1882 = _mm512_fmadd_ps(tmp1879, _mm512_set1_ps(4e+00f), tmp1883);
__m512 tmp1902 = _mm512_fmadd_ps(tmp1899, _mm512_set1_ps(4e+00f), tmp1903);
__m512 tmp1888 = _mm512_add_ps(tmp1879, tmp1877);
__m512 tmp1908 = _mm512_add_ps(tmp1899, tmp1897);
__m512 tmp1881 = _mm512_fmadd_ps(tmp1872, _mm512_set1_ps(4e+00f), tmp1873);
__m512 tmp1901 = _mm512_fmadd_ps(tmp1892, _mm512_set1_ps(4e+00f), tmp1893);
__m512 tmp1885 = _mm512_fmadd_ps(tmp1872, _mm512_set1_ps(1.6e+01f), tmp1873);
__m512 tmp1905 = _mm512_fmadd_ps(tmp1892, _mm512_set1_ps(1.6e+01f), tmp1893);
__m512 tmp1870 = _mm512_add_ps(tmp1871, tmp1813);
__m512 tmp1890 = _mm512_add_ps(tmp1891, tmp1821);
__m512 tmp1887 = _mm512_add_ps(tmp1888, tmp1820);
__m512 tmp1907 = _mm512_add_ps(tmp1908, tmp1868);
__m512 tmp1869 = _mm512_fmadd_ps(tmp1874, _mm512_set1_ps(3.2e+01f), tmp1870);
__m512 tmp1889 = _mm512_fmadd_ps(tmp1894, _mm512_set1_ps(3.2e+01f), tmp1890);
__m512 tmp1880 = _mm512_fmadd_ps(tmp1874, _mm512_set1_ps(8e+00f), tmp1881);
__m512 tmp1900 = _mm512_fmadd_ps(tmp1894, _mm512_set1_ps(8e+00f), tmp1901);
__m512 tmp1886 = _mm512_fmadd_ps(tmp1878, _mm512_set1_ps(3.2e+01f), tmp1887);
__m512 tmp1906 = _mm512_fmadd_ps(tmp1898, _mm512_set1_ps(3.2e+01f), tmp1907);
__m512 tmp1884 = _mm512_fmadd_ps(tmp1874, _mm512_set1_ps(2e+00f), tmp1885);
__m512 tmp1904 = _mm512_fmadd_ps(tmp1894, _mm512_set1_ps(2e+00f), tmp1905);
__m512 out453 = tmp1869;
__m512 out459 = tmp1889;
__m512 out454 = tmp1875;
__m512 out460 = tmp1895;
__m512 out455 = tmp1880;
__m512 out461 = tmp1900;
__m512 out456 = tmp1882;
__m512 out462 = tmp1902;
__m512 out457 = tmp1884;
__m512 out463 = tmp1904;
__m512 out458 = tmp1886;
__m512 out464 = tmp1906;
_mm512_mask_storeu_ps(datPtr3+0+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out453);
_mm512_mask_storeu_ps(datPtr3+48+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 511, out459);
_mm512_mask_storeu_ps(datPtr3+84+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out454);
_mm512_mask_storeu_ps(datPtr3+132+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 511, out460);
_mm512_mask_storeu_ps(datPtr3+168+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out455);
_mm512_mask_storeu_ps(datPtr3+216+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 511, out461);
_mm512_mask_storeu_ps(datPtr3+252+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out456);
_mm512_mask_storeu_ps(datPtr3+300+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 511, out462);
_mm512_mask_storeu_ps(datPtr3+336+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out457);
_mm512_mask_storeu_ps(datPtr3+384+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 511, out463);
_mm512_mask_storeu_ps(datPtr3+420+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out458);
_mm512_mask_storeu_ps(datPtr3+468+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 511, out464);
__m512 sf17 = _mm512_loadu_ps(sfPtr4+256+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf18 = _mm512_loadu_ps(sfPtr4+384+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in416 = _mm512_shuffle_f32x4(sf17, sf18, 68);
__m512 in417 = _mm512_shuffle_f32x4(sf17, sf18, 238);
__m512 sf19 = _mm512_loadu_ps(sfPtr4+320+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf20 = _mm512_loadu_ps(sfPtr4+448+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in424 = _mm512_shuffle_f32x4(sf19, sf20, 68);
__m512 in425 = _mm512_shuffle_f32x4(sf19, sf20, 238);
__m512 sf21 = _mm512_loadu_ps(sfPtr4+1961728+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf22 = _mm512_loadu_ps(sfPtr4+1961856+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in418 = _mm512_shuffle_f32x4(sf21, sf22, 68);
__m512 in419 = _mm512_shuffle_f32x4(sf21, sf22, 238);
__m512 sf23 = _mm512_loadu_ps(sfPtr4+1961792+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf24 = _mm512_loadu_ps(sfPtr4+1961920+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in426 = _mm512_shuffle_f32x4(sf23, sf24, 68);
__m512 in427 = _mm512_shuffle_f32x4(sf23, sf24, 238);
__m512 sf25 = _mm512_loadu_ps(sfPtr4+3923200+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf26 = _mm512_loadu_ps(sfPtr4+3923328+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in420 = _mm512_shuffle_f32x4(sf25, sf26, 68);
__m512 in421 = _mm512_shuffle_f32x4(sf25, sf26, 238);
__m512 sf27 = _mm512_loadu_ps(sfPtr4+3923264+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf28 = _mm512_loadu_ps(sfPtr4+3923392+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in428 = _mm512_shuffle_f32x4(sf27, sf28, 68);
__m512 in429 = _mm512_shuffle_f32x4(sf27, sf28, 238);
__m512 sf29 = _mm512_loadu_ps(sfPtr4+5884672+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf30 = _mm512_loadu_ps(sfPtr4+5884800+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in422 = _mm512_shuffle_f32x4(sf29, sf30, 68);
__m512 in423 = _mm512_shuffle_f32x4(sf29, sf30, 238);
__m512 sf31 = _mm512_loadu_ps(sfPtr4+5884736+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf32 = _mm512_loadu_ps(sfPtr4+5884864+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in430 = _mm512_shuffle_f32x4(sf31, sf32, 68);
__m512 in431 = _mm512_shuffle_f32x4(sf31, sf32, 238);
__m512 tmp1965 = _mm512_add_ps(in417, in418);
__m512 tmp1985 = _mm512_add_ps(in425, in426);
__m512 tmp1964 = _mm512_add_ps(in419, in420);
__m512 tmp1984 = _mm512_add_ps(in427, in428);
__m512 tmp1970 = _mm512_sub_ps(in419, in420);
__m512 tmp1990 = _mm512_sub_ps(in427, in428);
__m512 tmp1969 = _mm512_sub_ps(in417, in418);
__m512 tmp1989 = _mm512_sub_ps(in425, in426);
__m512 tmp1966 = _mm512_add_ps(in421, in422);
__m512 tmp1986 = _mm512_add_ps(in429, in430);
__m512 tmp1971 = _mm512_sub_ps(in421, in422);
__m512 tmp1991 = _mm512_sub_ps(in429, in430);
__m512 tmp1968 = _mm512_fmadd_ps(tmp1970, _mm512_set1_ps(2e+00f), tmp1969);
__m512 tmp1988 = _mm512_fmadd_ps(tmp1990, _mm512_set1_ps(2e+00f), tmp1989);
__m512 tmp1975 = _mm512_fmadd_ps(tmp1970, _mm512_set1_ps(8e+00f), tmp1969);
__m512 tmp1995 = _mm512_fmadd_ps(tmp1990, _mm512_set1_ps(8e+00f), tmp1989);
__m512 tmp1963 = _mm512_add_ps(tmp1964, tmp1965);
__m512 tmp1983 = _mm512_add_ps(tmp1984, tmp1985);
__m512 tmp1967 = _mm512_fmadd_ps(tmp1971, _mm512_set1_ps(1.6e+01f), tmp1968);
__m512 tmp1987 = _mm512_fmadd_ps(tmp1991, _mm512_set1_ps(1.6e+01f), tmp1988);
__m512 tmp1974 = _mm512_fmadd_ps(tmp1971, _mm512_set1_ps(4e+00f), tmp1975);
__m512 tmp1994 = _mm512_fmadd_ps(tmp1991, _mm512_set1_ps(4e+00f), tmp1995);
__m512 tmp1980 = _mm512_add_ps(tmp1971, tmp1969);
__m512 tmp2000 = _mm512_add_ps(tmp1991, tmp1989);
__m512 tmp1973 = _mm512_fmadd_ps(tmp1964, _mm512_set1_ps(4e+00f), tmp1965);
__m512 tmp1993 = _mm512_fmadd_ps(tmp1984, _mm512_set1_ps(4e+00f), tmp1985);
__m512 tmp1977 = _mm512_fmadd_ps(tmp1964, _mm512_set1_ps(1.6e+01f), tmp1965);
__m512 tmp1997 = _mm512_fmadd_ps(tmp1984, _mm512_set1_ps(1.6e+01f), tmp1985);
__m512 tmp1962 = _mm512_add_ps(tmp1963, in416);
__m512 tmp1982 = _mm512_add_ps(tmp1983, in424);
__m512 tmp1979 = _mm512_add_ps(tmp1980, in423);
__m512 tmp1999 = _mm512_add_ps(tmp2000, in431);
__m512 tmp1961 = _mm512_fmadd_ps(tmp1966, _mm512_set1_ps(3.2e+01f), tmp1962);
__m512 tmp1981 = _mm512_fmadd_ps(tmp1986, _mm512_set1_ps(3.2e+01f), tmp1982);
__m512 tmp1972 = _mm512_fmadd_ps(tmp1966, _mm512_set1_ps(8e+00f), tmp1973);
__m512 tmp1992 = _mm512_fmadd_ps(tmp1986, _mm512_set1_ps(8e+00f), tmp1993);
__m512 tmp1978 = _mm512_fmadd_ps(tmp1970, _mm512_set1_ps(3.2e+01f), tmp1979);
__m512 tmp1998 = _mm512_fmadd_ps(tmp1990, _mm512_set1_ps(3.2e+01f), tmp1999);
__m512 tmp1976 = _mm512_fmadd_ps(tmp1966, _mm512_set1_ps(2e+00f), tmp1977);
__m512 tmp1996 = _mm512_fmadd_ps(tmp1986, _mm512_set1_ps(2e+00f), tmp1997);
__m512 tmp1949 = tmp1961;
__m512 tmp1955 = tmp1981;
__m512 tmp1950 = tmp1967;
__m512 tmp1956 = tmp1987;
__m512 tmp1951 = tmp1972;
__m512 tmp1957 = tmp1992;
__m512 tmp1952 = tmp1974;
__m512 tmp1958 = tmp1994;
__m512 tmp1953 = tmp1976;
__m512 tmp1959 = tmp1996;
__m512 tmp1954 = tmp1978;
__m512 tmp1960 = tmp1998;
__m512 tmp2045 = _mm512_unpacklo_ps(tmp1949, tmp1950);
__m512 tmp2046 = _mm512_unpackhi_ps(tmp1949, tmp1950);
__m512 tmp2047 = _mm512_unpacklo_ps(tmp1951, tmp1952);
__m512 tmp2048 = _mm512_unpackhi_ps(tmp1951, tmp1952);
__m512 tmp2049 = _mm512_unpacklo_ps(tmp1953, tmp1954);
__m512 tmp2050 = _mm512_unpackhi_ps(tmp1953, tmp1954);
__m512 tmp2051 = _mm512_unpacklo_ps(tmp1955, tmp1956);
__m512 tmp2052 = _mm512_unpackhi_ps(tmp1955, tmp1956);
__m512 tmp2053 = _mm512_unpacklo_ps(tmp1957, tmp1958);
__m512 tmp2054 = _mm512_unpackhi_ps(tmp1957, tmp1958);
__m512 tmp2055 = _mm512_unpacklo_ps(tmp1959, tmp1960);
__m512 tmp2056 = _mm512_unpackhi_ps(tmp1959, tmp1960);
__m512 tmp2057 = _mm512_shuffle_ps(tmp2045, tmp2047, 68);
__m512 tmp2058 = _mm512_shuffle_ps(tmp2045, tmp2047, 238);
__m512 tmp2059 = _mm512_shuffle_ps(tmp2046, tmp2048, 68);
__m512 tmp2060 = _mm512_shuffle_ps(tmp2046, tmp2048, 238);
__m512 tmp2061 = _mm512_shuffle_ps(tmp2049, tmp2051, 68);
__m512 tmp2062 = _mm512_shuffle_ps(tmp2049, tmp2051, 238);
__m512 tmp2063 = _mm512_shuffle_ps(tmp2050, tmp2052, 68);
__m512 tmp2064 = _mm512_shuffle_ps(tmp2050, tmp2052, 238);
__m512 tmp2065 = _mm512_shuffle_ps(tmp2053, tmp2055, 68);
__m512 tmp2066 = _mm512_shuffle_ps(tmp2053, tmp2055, 238);
__m512 tmp2067 = _mm512_shuffle_ps(tmp2054, tmp2056, 68);
__m512 tmp2068 = _mm512_shuffle_ps(tmp2054, tmp2056, 238);
__m512 tmp2069 = _mm512_shuffle_f32x4(tmp2057, tmp2061, 136);
__m512 tmp2070 = _mm512_shuffle_f32x4(tmp2057, tmp2061, 221);
__m512 tmp2071 = _mm512_shuffle_f32x4(tmp2058, tmp2062, 136);
__m512 tmp2072 = _mm512_shuffle_f32x4(tmp2058, tmp2062, 221);
__m512 tmp2073 = _mm512_shuffle_f32x4(tmp2059, tmp2063, 136);
__m512 tmp2074 = _mm512_shuffle_f32x4(tmp2059, tmp2063, 221);
__m512 tmp2075 = _mm512_shuffle_f32x4(tmp2060, tmp2064, 136);
__m512 tmp2076 = _mm512_shuffle_f32x4(tmp2060, tmp2064, 221);
__m512 tmp2077 = _mm512_shuffle_f32x4(tmp2065, tmp2065, 136);
__m512 tmp2078 = _mm512_shuffle_f32x4(tmp2065, tmp2065, 221);
__m512 tmp2079 = _mm512_shuffle_f32x4(tmp2066, tmp2066, 136);
__m512 tmp2080 = _mm512_shuffle_f32x4(tmp2066, tmp2066, 221);
__m512 tmp2081 = _mm512_shuffle_f32x4(tmp2067, tmp2067, 136);
__m512 tmp2082 = _mm512_shuffle_f32x4(tmp2067, tmp2067, 221);
__m512 tmp2083 = _mm512_shuffle_f32x4(tmp2068, tmp2068, 136);
__m512 tmp2084 = _mm512_shuffle_f32x4(tmp2068, tmp2068, 221);
tmp1949 = _mm512_shuffle_f32x4(tmp2069, tmp2077, 136);
tmp1957 = _mm512_shuffle_f32x4(tmp2069, tmp2077, 221);
tmp1950 = _mm512_shuffle_f32x4(tmp2071, tmp2079, 136);
tmp1958 = _mm512_shuffle_f32x4(tmp2071, tmp2079, 221);
tmp1951 = _mm512_shuffle_f32x4(tmp2073, tmp2081, 136);
tmp1959 = _mm512_shuffle_f32x4(tmp2073, tmp2081, 221);
tmp1952 = _mm512_shuffle_f32x4(tmp2075, tmp2083, 136);
tmp1960 = _mm512_shuffle_f32x4(tmp2075, tmp2083, 221);
tmp1953 = _mm512_shuffle_f32x4(tmp2070, tmp2078, 136);
__m512 tmp2001 = _mm512_shuffle_f32x4(tmp2070, tmp2078, 221);
tmp1954 = _mm512_shuffle_f32x4(tmp2072, tmp2080, 136);
__m512 tmp2002 = _mm512_shuffle_f32x4(tmp2072, tmp2080, 221);
tmp1955 = _mm512_shuffle_f32x4(tmp2074, tmp2082, 136);
__m512 tmp2003 = _mm512_shuffle_f32x4(tmp2074, tmp2082, 221);
tmp1956 = _mm512_shuffle_f32x4(tmp2076, tmp2084, 136);
__m512 tmp2004 = _mm512_shuffle_f32x4(tmp2076, tmp2084, 221);
__m512 tmp2009 = _mm512_add_ps(tmp1950, tmp1951);
__m512 tmp2029 = _mm512_add_ps(tmp1958, tmp1959);
__m512 tmp2008 = _mm512_add_ps(tmp1952, tmp1953);
__m512 tmp2028 = _mm512_add_ps(tmp1960, tmp2001);
__m512 tmp2014 = _mm512_sub_ps(tmp1952, tmp1953);
__m512 tmp2034 = _mm512_sub_ps(tmp1960, tmp2001);
__m512 tmp2013 = _mm512_sub_ps(tmp1950, tmp1951);
__m512 tmp2033 = _mm512_sub_ps(tmp1958, tmp1959);
__m512 tmp2010 = _mm512_add_ps(tmp1954, tmp1955);
__m512 tmp2030 = _mm512_add_ps(tmp2002, tmp2003);
__m512 tmp2015 = _mm512_sub_ps(tmp1954, tmp1955);
__m512 tmp2035 = _mm512_sub_ps(tmp2002, tmp2003);
__m512 tmp2012 = _mm512_fmadd_ps(tmp2014, _mm512_set1_ps(2e+00f), tmp2013);
__m512 tmp2032 = _mm512_fmadd_ps(tmp2034, _mm512_set1_ps(2e+00f), tmp2033);
__m512 tmp2019 = _mm512_fmadd_ps(tmp2014, _mm512_set1_ps(8e+00f), tmp2013);
__m512 tmp2039 = _mm512_fmadd_ps(tmp2034, _mm512_set1_ps(8e+00f), tmp2033);
__m512 tmp2007 = _mm512_add_ps(tmp2008, tmp2009);
__m512 tmp2027 = _mm512_add_ps(tmp2028, tmp2029);
__m512 tmp2011 = _mm512_fmadd_ps(tmp2015, _mm512_set1_ps(1.6e+01f), tmp2012);
__m512 tmp2031 = _mm512_fmadd_ps(tmp2035, _mm512_set1_ps(1.6e+01f), tmp2032);
__m512 tmp2018 = _mm512_fmadd_ps(tmp2015, _mm512_set1_ps(4e+00f), tmp2019);
__m512 tmp2038 = _mm512_fmadd_ps(tmp2035, _mm512_set1_ps(4e+00f), tmp2039);
__m512 tmp2024 = _mm512_add_ps(tmp2015, tmp2013);
__m512 tmp2044 = _mm512_add_ps(tmp2035, tmp2033);
__m512 tmp2017 = _mm512_fmadd_ps(tmp2008, _mm512_set1_ps(4e+00f), tmp2009);
__m512 tmp2037 = _mm512_fmadd_ps(tmp2028, _mm512_set1_ps(4e+00f), tmp2029);
__m512 tmp2021 = _mm512_fmadd_ps(tmp2008, _mm512_set1_ps(1.6e+01f), tmp2009);
__m512 tmp2041 = _mm512_fmadd_ps(tmp2028, _mm512_set1_ps(1.6e+01f), tmp2029);
__m512 tmp2006 = _mm512_add_ps(tmp2007, tmp1949);
__m512 tmp2026 = _mm512_add_ps(tmp2027, tmp1957);
__m512 tmp2023 = _mm512_add_ps(tmp2024, tmp1956);
__m512 tmp2043 = _mm512_add_ps(tmp2044, tmp2004);
__m512 tmp2005 = _mm512_fmadd_ps(tmp2010, _mm512_set1_ps(3.2e+01f), tmp2006);
__m512 tmp2025 = _mm512_fmadd_ps(tmp2030, _mm512_set1_ps(3.2e+01f), tmp2026);
__m512 tmp2016 = _mm512_fmadd_ps(tmp2010, _mm512_set1_ps(8e+00f), tmp2017);
__m512 tmp2036 = _mm512_fmadd_ps(tmp2030, _mm512_set1_ps(8e+00f), tmp2037);
__m512 tmp2022 = _mm512_fmadd_ps(tmp2014, _mm512_set1_ps(3.2e+01f), tmp2023);
__m512 tmp2042 = _mm512_fmadd_ps(tmp2034, _mm512_set1_ps(3.2e+01f), tmp2043);
__m512 tmp2020 = _mm512_fmadd_ps(tmp2010, _mm512_set1_ps(2e+00f), tmp2021);
__m512 tmp2040 = _mm512_fmadd_ps(tmp2030, _mm512_set1_ps(2e+00f), tmp2041);
__m512 out465 = tmp2005;
__m512 out471 = tmp2025;
__m512 out466 = tmp2011;
__m512 out472 = tmp2031;
__m512 out467 = tmp2016;
__m512 out473 = tmp2036;
__m512 out468 = tmp2018;
__m512 out474 = tmp2038;
__m512 out469 = tmp2020;
__m512 out475 = tmp2040;
__m512 out470 = tmp2022;
__m512 out476 = tmp2042;
_mm512_mask_storeu_ps(datPtr3+504+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out465);
_mm512_mask_storeu_ps(datPtr3+3024+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out471);
_mm512_mask_storeu_ps(datPtr3+588+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out466);
_mm512_mask_storeu_ps(datPtr3+3108+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out472);
_mm512_mask_storeu_ps(datPtr3+672+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out467);
_mm512_mask_storeu_ps(datPtr3+3192+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out473);
_mm512_mask_storeu_ps(datPtr3+756+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out468);
_mm512_mask_storeu_ps(datPtr3+3276+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out474);
_mm512_mask_storeu_ps(datPtr3+840+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out469);
_mm512_mask_storeu_ps(datPtr3+3360+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out475);
_mm512_mask_storeu_ps(datPtr3+924+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out470);
_mm512_mask_storeu_ps(datPtr3+3444+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out476);
__m512 sf33 = _mm512_loadu_ps(sfPtr4+512+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf34 = _mm512_loadu_ps(sfPtr4+640+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in432 = _mm512_shuffle_f32x4(sf33, sf34, 68);
__m512 in433 = _mm512_shuffle_f32x4(sf33, sf34, 238);
__m512 sf35 = _mm512_loadu_ps(sfPtr4+576+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf36 = _mm512_loadu_ps(sfPtr4+704+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in440 = _mm512_shuffle_f32x4(sf35, sf36, 68);
__m512 in441 = _mm512_shuffle_f32x4(sf35, sf36, 238);
__m512 sf37 = _mm512_loadu_ps(sfPtr4+1961984+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf38 = _mm512_loadu_ps(sfPtr4+1962112+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in434 = _mm512_shuffle_f32x4(sf37, sf38, 68);
__m512 in435 = _mm512_shuffle_f32x4(sf37, sf38, 238);
__m512 sf39 = _mm512_loadu_ps(sfPtr4+1962048+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf40 = _mm512_loadu_ps(sfPtr4+1962176+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in442 = _mm512_shuffle_f32x4(sf39, sf40, 68);
__m512 in443 = _mm512_shuffle_f32x4(sf39, sf40, 238);
__m512 sf41 = _mm512_loadu_ps(sfPtr4+3923456+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf42 = _mm512_loadu_ps(sfPtr4+3923584+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in436 = _mm512_shuffle_f32x4(sf41, sf42, 68);
__m512 in437 = _mm512_shuffle_f32x4(sf41, sf42, 238);
__m512 sf43 = _mm512_loadu_ps(sfPtr4+3923520+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf44 = _mm512_loadu_ps(sfPtr4+3923648+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in444 = _mm512_shuffle_f32x4(sf43, sf44, 68);
__m512 in445 = _mm512_shuffle_f32x4(sf43, sf44, 238);
__m512 sf45 = _mm512_loadu_ps(sfPtr4+5884928+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf46 = _mm512_loadu_ps(sfPtr4+5885056+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in438 = _mm512_shuffle_f32x4(sf45, sf46, 68);
__m512 in439 = _mm512_shuffle_f32x4(sf45, sf46, 238);
__m512 sf47 = _mm512_loadu_ps(sfPtr4+5884992+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 sf48 = _mm512_loadu_ps(sfPtr4+5885120+7845888*i12+490368*j8+1536*k14+768*l4);
__m512 in446 = _mm512_shuffle_f32x4(sf47, sf48, 68);
__m512 in447 = _mm512_shuffle_f32x4(sf47, sf48, 238);
__m512 tmp2101 = _mm512_add_ps(in433, in434);
__m512 tmp2121 = _mm512_add_ps(in441, in442);
__m512 tmp2100 = _mm512_add_ps(in435, in436);
__m512 tmp2120 = _mm512_add_ps(in443, in444);
__m512 tmp2106 = _mm512_sub_ps(in435, in436);
__m512 tmp2126 = _mm512_sub_ps(in443, in444);
__m512 tmp2105 = _mm512_sub_ps(in433, in434);
__m512 tmp2125 = _mm512_sub_ps(in441, in442);
__m512 tmp2102 = _mm512_add_ps(in437, in438);
__m512 tmp2122 = _mm512_add_ps(in445, in446);
__m512 tmp2107 = _mm512_sub_ps(in437, in438);
__m512 tmp2127 = _mm512_sub_ps(in445, in446);
__m512 tmp2104 = _mm512_fmadd_ps(tmp2106, _mm512_set1_ps(2e+00f), tmp2105);
__m512 tmp2124 = _mm512_fmadd_ps(tmp2126, _mm512_set1_ps(2e+00f), tmp2125);
__m512 tmp2111 = _mm512_fmadd_ps(tmp2106, _mm512_set1_ps(8e+00f), tmp2105);
__m512 tmp2131 = _mm512_fmadd_ps(tmp2126, _mm512_set1_ps(8e+00f), tmp2125);
__m512 tmp2099 = _mm512_add_ps(tmp2100, tmp2101);
__m512 tmp2119 = _mm512_add_ps(tmp2120, tmp2121);
__m512 tmp2103 = _mm512_fmadd_ps(tmp2107, _mm512_set1_ps(1.6e+01f), tmp2104);
__m512 tmp2123 = _mm512_fmadd_ps(tmp2127, _mm512_set1_ps(1.6e+01f), tmp2124);
__m512 tmp2110 = _mm512_fmadd_ps(tmp2107, _mm512_set1_ps(4e+00f), tmp2111);
__m512 tmp2130 = _mm512_fmadd_ps(tmp2127, _mm512_set1_ps(4e+00f), tmp2131);
__m512 tmp2116 = _mm512_add_ps(tmp2107, tmp2105);
__m512 tmp2136 = _mm512_add_ps(tmp2127, tmp2125);
__m512 tmp2109 = _mm512_fmadd_ps(tmp2100, _mm512_set1_ps(4e+00f), tmp2101);
__m512 tmp2129 = _mm512_fmadd_ps(tmp2120, _mm512_set1_ps(4e+00f), tmp2121);
__m512 tmp2113 = _mm512_fmadd_ps(tmp2100, _mm512_set1_ps(1.6e+01f), tmp2101);
__m512 tmp2133 = _mm512_fmadd_ps(tmp2120, _mm512_set1_ps(1.6e+01f), tmp2121);
__m512 tmp2098 = _mm512_add_ps(tmp2099, in432);
__m512 tmp2118 = _mm512_add_ps(tmp2119, in440);
__m512 tmp2115 = _mm512_add_ps(tmp2116, in439);
__m512 tmp2135 = _mm512_add_ps(tmp2136, in447);
__m512 tmp2097 = _mm512_fmadd_ps(tmp2102, _mm512_set1_ps(3.2e+01f), tmp2098);
__m512 tmp2117 = _mm512_fmadd_ps(tmp2122, _mm512_set1_ps(3.2e+01f), tmp2118);
__m512 tmp2108 = _mm512_fmadd_ps(tmp2102, _mm512_set1_ps(8e+00f), tmp2109);
__m512 tmp2128 = _mm512_fmadd_ps(tmp2122, _mm512_set1_ps(8e+00f), tmp2129);
__m512 tmp2114 = _mm512_fmadd_ps(tmp2106, _mm512_set1_ps(3.2e+01f), tmp2115);
__m512 tmp2134 = _mm512_fmadd_ps(tmp2126, _mm512_set1_ps(3.2e+01f), tmp2135);
__m512 tmp2112 = _mm512_fmadd_ps(tmp2102, _mm512_set1_ps(2e+00f), tmp2113);
__m512 tmp2132 = _mm512_fmadd_ps(tmp2122, _mm512_set1_ps(2e+00f), tmp2133);
__m512 tmp2085 = tmp2097;
__m512 tmp2091 = tmp2117;
__m512 tmp2086 = tmp2103;
__m512 tmp2092 = tmp2123;
__m512 tmp2087 = tmp2108;
__m512 tmp2093 = tmp2128;
__m512 tmp2088 = tmp2110;
__m512 tmp2094 = tmp2130;
__m512 tmp2089 = tmp2112;
__m512 tmp2095 = tmp2132;
__m512 tmp2090 = tmp2114;
__m512 tmp2096 = tmp2134;
__m512 tmp2181 = _mm512_unpacklo_ps(tmp2085, tmp2086);
__m512 tmp2182 = _mm512_unpackhi_ps(tmp2085, tmp2086);
__m512 tmp2183 = _mm512_unpacklo_ps(tmp2087, tmp2088);
__m512 tmp2184 = _mm512_unpackhi_ps(tmp2087, tmp2088);
__m512 tmp2185 = _mm512_unpacklo_ps(tmp2089, tmp2090);
__m512 tmp2186 = _mm512_unpackhi_ps(tmp2089, tmp2090);
__m512 tmp2187 = _mm512_unpacklo_ps(tmp2091, tmp2092);
__m512 tmp2188 = _mm512_unpackhi_ps(tmp2091, tmp2092);
__m512 tmp2189 = _mm512_unpacklo_ps(tmp2093, tmp2094);
__m512 tmp2190 = _mm512_unpackhi_ps(tmp2093, tmp2094);
__m512 tmp2191 = _mm512_unpacklo_ps(tmp2095, tmp2096);
__m512 tmp2192 = _mm512_unpackhi_ps(tmp2095, tmp2096);
__m512 tmp2193 = _mm512_shuffle_ps(tmp2181, tmp2183, 68);
__m512 tmp2194 = _mm512_shuffle_ps(tmp2181, tmp2183, 238);
__m512 tmp2195 = _mm512_shuffle_ps(tmp2182, tmp2184, 68);
__m512 tmp2196 = _mm512_shuffle_ps(tmp2182, tmp2184, 238);
__m512 tmp2197 = _mm512_shuffle_ps(tmp2185, tmp2187, 68);
__m512 tmp2198 = _mm512_shuffle_ps(tmp2185, tmp2187, 238);
__m512 tmp2199 = _mm512_shuffle_ps(tmp2186, tmp2188, 68);
__m512 tmp2200 = _mm512_shuffle_ps(tmp2186, tmp2188, 238);
__m512 tmp2201 = _mm512_shuffle_ps(tmp2189, tmp2191, 68);
__m512 tmp2202 = _mm512_shuffle_ps(tmp2189, tmp2191, 238);
__m512 tmp2203 = _mm512_shuffle_ps(tmp2190, tmp2192, 68);
__m512 tmp2204 = _mm512_shuffle_ps(tmp2190, tmp2192, 238);
__m512 tmp2205 = _mm512_shuffle_f32x4(tmp2193, tmp2197, 136);
__m512 tmp2206 = _mm512_shuffle_f32x4(tmp2193, tmp2197, 221);
__m512 tmp2207 = _mm512_shuffle_f32x4(tmp2194, tmp2198, 136);
__m512 tmp2208 = _mm512_shuffle_f32x4(tmp2194, tmp2198, 221);
__m512 tmp2209 = _mm512_shuffle_f32x4(tmp2195, tmp2199, 136);
__m512 tmp2210 = _mm512_shuffle_f32x4(tmp2195, tmp2199, 221);
__m512 tmp2211 = _mm512_shuffle_f32x4(tmp2196, tmp2200, 136);
__m512 tmp2212 = _mm512_shuffle_f32x4(tmp2196, tmp2200, 221);
__m512 tmp2213 = _mm512_shuffle_f32x4(tmp2201, tmp2201, 136);
__m512 tmp2214 = _mm512_shuffle_f32x4(tmp2201, tmp2201, 221);
__m512 tmp2215 = _mm512_shuffle_f32x4(tmp2202, tmp2202, 136);
__m512 tmp2216 = _mm512_shuffle_f32x4(tmp2202, tmp2202, 221);
__m512 tmp2217 = _mm512_shuffle_f32x4(tmp2203, tmp2203, 136);
__m512 tmp2218 = _mm512_shuffle_f32x4(tmp2203, tmp2203, 221);
__m512 tmp2219 = _mm512_shuffle_f32x4(tmp2204, tmp2204, 136);
__m512 tmp2220 = _mm512_shuffle_f32x4(tmp2204, tmp2204, 221);
tmp2085 = _mm512_shuffle_f32x4(tmp2205, tmp2213, 136);
tmp2093 = _mm512_shuffle_f32x4(tmp2205, tmp2213, 221);
tmp2086 = _mm512_shuffle_f32x4(tmp2207, tmp2215, 136);
tmp2094 = _mm512_shuffle_f32x4(tmp2207, tmp2215, 221);
tmp2087 = _mm512_shuffle_f32x4(tmp2209, tmp2217, 136);
tmp2095 = _mm512_shuffle_f32x4(tmp2209, tmp2217, 221);
tmp2088 = _mm512_shuffle_f32x4(tmp2211, tmp2219, 136);
tmp2096 = _mm512_shuffle_f32x4(tmp2211, tmp2219, 221);
tmp2089 = _mm512_shuffle_f32x4(tmp2206, tmp2214, 136);
__m512 tmp2137 = _mm512_shuffle_f32x4(tmp2206, tmp2214, 221);
tmp2090 = _mm512_shuffle_f32x4(tmp2208, tmp2216, 136);
__m512 tmp2138 = _mm512_shuffle_f32x4(tmp2208, tmp2216, 221);
tmp2091 = _mm512_shuffle_f32x4(tmp2210, tmp2218, 136);
__m512 tmp2139 = _mm512_shuffle_f32x4(tmp2210, tmp2218, 221);
tmp2092 = _mm512_shuffle_f32x4(tmp2212, tmp2220, 136);
__m512 tmp2140 = _mm512_shuffle_f32x4(tmp2212, tmp2220, 221);
__m512 tmp2145 = _mm512_add_ps(tmp2086, tmp2087);
__m512 tmp2165 = _mm512_add_ps(tmp2094, tmp2095);
__m512 tmp2144 = _mm512_add_ps(tmp2088, tmp2089);
__m512 tmp2164 = _mm512_add_ps(tmp2096, tmp2137);
__m512 tmp2150 = _mm512_sub_ps(tmp2088, tmp2089);
__m512 tmp2170 = _mm512_sub_ps(tmp2096, tmp2137);
__m512 tmp2149 = _mm512_sub_ps(tmp2086, tmp2087);
__m512 tmp2169 = _mm512_sub_ps(tmp2094, tmp2095);
__m512 tmp2146 = _mm512_add_ps(tmp2090, tmp2091);
__m512 tmp2166 = _mm512_add_ps(tmp2138, tmp2139);
__m512 tmp2151 = _mm512_sub_ps(tmp2090, tmp2091);
__m512 tmp2171 = _mm512_sub_ps(tmp2138, tmp2139);
__m512 tmp2148 = _mm512_fmadd_ps(tmp2150, _mm512_set1_ps(2e+00f), tmp2149);
__m512 tmp2168 = _mm512_fmadd_ps(tmp2170, _mm512_set1_ps(2e+00f), tmp2169);
__m512 tmp2155 = _mm512_fmadd_ps(tmp2150, _mm512_set1_ps(8e+00f), tmp2149);
__m512 tmp2175 = _mm512_fmadd_ps(tmp2170, _mm512_set1_ps(8e+00f), tmp2169);
__m512 tmp2143 = _mm512_add_ps(tmp2144, tmp2145);
__m512 tmp2163 = _mm512_add_ps(tmp2164, tmp2165);
__m512 tmp2147 = _mm512_fmadd_ps(tmp2151, _mm512_set1_ps(1.6e+01f), tmp2148);
__m512 tmp2167 = _mm512_fmadd_ps(tmp2171, _mm512_set1_ps(1.6e+01f), tmp2168);
__m512 tmp2154 = _mm512_fmadd_ps(tmp2151, _mm512_set1_ps(4e+00f), tmp2155);
__m512 tmp2174 = _mm512_fmadd_ps(tmp2171, _mm512_set1_ps(4e+00f), tmp2175);
__m512 tmp2160 = _mm512_add_ps(tmp2151, tmp2149);
__m512 tmp2180 = _mm512_add_ps(tmp2171, tmp2169);
__m512 tmp2153 = _mm512_fmadd_ps(tmp2144, _mm512_set1_ps(4e+00f), tmp2145);
__m512 tmp2173 = _mm512_fmadd_ps(tmp2164, _mm512_set1_ps(4e+00f), tmp2165);
__m512 tmp2157 = _mm512_fmadd_ps(tmp2144, _mm512_set1_ps(1.6e+01f), tmp2145);
__m512 tmp2177 = _mm512_fmadd_ps(tmp2164, _mm512_set1_ps(1.6e+01f), tmp2165);
__m512 tmp2142 = _mm512_add_ps(tmp2143, tmp2085);
__m512 tmp2162 = _mm512_add_ps(tmp2163, tmp2093);
__m512 tmp2159 = _mm512_add_ps(tmp2160, tmp2092);
__m512 tmp2179 = _mm512_add_ps(tmp2180, tmp2140);
__m512 tmp2141 = _mm512_fmadd_ps(tmp2146, _mm512_set1_ps(3.2e+01f), tmp2142);
__m512 tmp2161 = _mm512_fmadd_ps(tmp2166, _mm512_set1_ps(3.2e+01f), tmp2162);
__m512 tmp2152 = _mm512_fmadd_ps(tmp2146, _mm512_set1_ps(8e+00f), tmp2153);
__m512 tmp2172 = _mm512_fmadd_ps(tmp2166, _mm512_set1_ps(8e+00f), tmp2173);
__m512 tmp2158 = _mm512_fmadd_ps(tmp2150, _mm512_set1_ps(3.2e+01f), tmp2159);
__m512 tmp2178 = _mm512_fmadd_ps(tmp2170, _mm512_set1_ps(3.2e+01f), tmp2179);
__m512 tmp2156 = _mm512_fmadd_ps(tmp2146, _mm512_set1_ps(2e+00f), tmp2157);
__m512 tmp2176 = _mm512_fmadd_ps(tmp2166, _mm512_set1_ps(2e+00f), tmp2177);
__m512 out477 = tmp2141;
__m512 out483 = tmp2161;
__m512 out478 = tmp2147;
__m512 out484 = tmp2167;
__m512 out479 = tmp2152;
__m512 out485 = tmp2172;
__m512 out480 = tmp2154;
__m512 out486 = tmp2174;
__m512 out481 = tmp2156;
__m512 out487 = tmp2176;
__m512 out482 = tmp2158;
__m512 out488 = tmp2178;
_mm512_mask_storeu_ps(datPtr3+3072+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 511, out477);
_mm512_mask_storeu_ps(datPtr3+3528+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out483);
_mm512_mask_storeu_ps(datPtr3+3156+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 511, out478);
_mm512_mask_storeu_ps(datPtr3+3612+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out484);
_mm512_mask_storeu_ps(datPtr3+3240+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 511, out479);
_mm512_mask_storeu_ps(datPtr3+3696+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out485);
_mm512_mask_storeu_ps(datPtr3+3324+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 511, out480);
_mm512_mask_storeu_ps(datPtr3+3780+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out486);
_mm512_mask_storeu_ps(datPtr3+3408+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 511, out481);
_mm512_mask_storeu_ps(datPtr3+3864+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out487);
_mm512_mask_storeu_ps(datPtr3+3492+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 511, out482);
_mm512_mask_storeu_ps(datPtr3+3948+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l4, 4095, out488);
}
if (k14 >= kk4) return;
}
ptrdiff_t l5 = 0;
__m512 sf49 = _mm512_loadu_ps(sfPtr4+0+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 sf50 = _mm512_loadu_ps(sfPtr4+128+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 in448 = _mm512_shuffle_f32x4(sf49, sf50, 68);
__m512 in449 = _mm512_shuffle_f32x4(sf49, sf50, 238);
__m512 sf51 = _mm512_loadu_ps(sfPtr4+64+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 sf52 = _mm512_loadu_ps(sfPtr4+192+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 in456 = _mm512_shuffle_f32x4(sf51, sf52, 68);
__m512 in457 = _mm512_shuffle_f32x4(sf51, sf52, 238);
__m512 sf53 = _mm512_loadu_ps(sfPtr4+1961472+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 sf54 = _mm512_loadu_ps(sfPtr4+1961600+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 in450 = _mm512_shuffle_f32x4(sf53, sf54, 68);
__m512 in451 = _mm512_shuffle_f32x4(sf53, sf54, 238);
__m512 sf55 = _mm512_loadu_ps(sfPtr4+1961536+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 sf56 = _mm512_loadu_ps(sfPtr4+1961664+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 in458 = _mm512_shuffle_f32x4(sf55, sf56, 68);
__m512 in459 = _mm512_shuffle_f32x4(sf55, sf56, 238);
__m512 sf57 = _mm512_loadu_ps(sfPtr4+3922944+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 sf58 = _mm512_loadu_ps(sfPtr4+3923072+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 in452 = _mm512_shuffle_f32x4(sf57, sf58, 68);
__m512 in453 = _mm512_shuffle_f32x4(sf57, sf58, 238);
__m512 sf59 = _mm512_loadu_ps(sfPtr4+3923008+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 sf60 = _mm512_loadu_ps(sfPtr4+3923136+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 in460 = _mm512_shuffle_f32x4(sf59, sf60, 68);
__m512 in461 = _mm512_shuffle_f32x4(sf59, sf60, 238);
__m512 sf61 = _mm512_loadu_ps(sfPtr4+5884416+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 sf62 = _mm512_loadu_ps(sfPtr4+5884544+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 in454 = _mm512_shuffle_f32x4(sf61, sf62, 68);
__m512 in455 = _mm512_shuffle_f32x4(sf61, sf62, 238);
__m512 sf63 = _mm512_loadu_ps(sfPtr4+5884480+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 sf64 = _mm512_loadu_ps(sfPtr4+5884608+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 in462 = _mm512_shuffle_f32x4(sf63, sf64, 68);
__m512 in463 = _mm512_shuffle_f32x4(sf63, sf64, 238);
__m512 tmp2237 = _mm512_add_ps(in449, in450);
__m512 tmp2257 = _mm512_add_ps(in457, in458);
__m512 tmp2236 = _mm512_add_ps(in451, in452);
__m512 tmp2256 = _mm512_add_ps(in459, in460);
__m512 tmp2242 = _mm512_sub_ps(in451, in452);
__m512 tmp2262 = _mm512_sub_ps(in459, in460);
__m512 tmp2241 = _mm512_sub_ps(in449, in450);
__m512 tmp2261 = _mm512_sub_ps(in457, in458);
__m512 tmp2238 = _mm512_add_ps(in453, in454);
__m512 tmp2258 = _mm512_add_ps(in461, in462);
__m512 tmp2243 = _mm512_sub_ps(in453, in454);
__m512 tmp2263 = _mm512_sub_ps(in461, in462);
__m512 tmp2240 = _mm512_fmadd_ps(tmp2242, _mm512_set1_ps(2e+00f), tmp2241);
__m512 tmp2260 = _mm512_fmadd_ps(tmp2262, _mm512_set1_ps(2e+00f), tmp2261);
__m512 tmp2247 = _mm512_fmadd_ps(tmp2242, _mm512_set1_ps(8e+00f), tmp2241);
__m512 tmp2267 = _mm512_fmadd_ps(tmp2262, _mm512_set1_ps(8e+00f), tmp2261);
__m512 tmp2235 = _mm512_add_ps(tmp2236, tmp2237);
__m512 tmp2255 = _mm512_add_ps(tmp2256, tmp2257);
__m512 tmp2239 = _mm512_fmadd_ps(tmp2243, _mm512_set1_ps(1.6e+01f), tmp2240);
__m512 tmp2259 = _mm512_fmadd_ps(tmp2263, _mm512_set1_ps(1.6e+01f), tmp2260);
__m512 tmp2246 = _mm512_fmadd_ps(tmp2243, _mm512_set1_ps(4e+00f), tmp2247);
__m512 tmp2266 = _mm512_fmadd_ps(tmp2263, _mm512_set1_ps(4e+00f), tmp2267);
__m512 tmp2252 = _mm512_add_ps(tmp2243, tmp2241);
__m512 tmp2272 = _mm512_add_ps(tmp2263, tmp2261);
__m512 tmp2245 = _mm512_fmadd_ps(tmp2236, _mm512_set1_ps(4e+00f), tmp2237);
__m512 tmp2265 = _mm512_fmadd_ps(tmp2256, _mm512_set1_ps(4e+00f), tmp2257);
__m512 tmp2249 = _mm512_fmadd_ps(tmp2236, _mm512_set1_ps(1.6e+01f), tmp2237);
__m512 tmp2269 = _mm512_fmadd_ps(tmp2256, _mm512_set1_ps(1.6e+01f), tmp2257);
__m512 tmp2234 = _mm512_add_ps(tmp2235, in448);
__m512 tmp2254 = _mm512_add_ps(tmp2255, in456);
__m512 tmp2251 = _mm512_add_ps(tmp2252, in455);
__m512 tmp2271 = _mm512_add_ps(tmp2272, in463);
__m512 tmp2233 = _mm512_fmadd_ps(tmp2238, _mm512_set1_ps(3.2e+01f), tmp2234);
__m512 tmp2253 = _mm512_fmadd_ps(tmp2258, _mm512_set1_ps(3.2e+01f), tmp2254);
__m512 tmp2244 = _mm512_fmadd_ps(tmp2238, _mm512_set1_ps(8e+00f), tmp2245);
__m512 tmp2264 = _mm512_fmadd_ps(tmp2258, _mm512_set1_ps(8e+00f), tmp2265);
__m512 tmp2250 = _mm512_fmadd_ps(tmp2242, _mm512_set1_ps(3.2e+01f), tmp2251);
__m512 tmp2270 = _mm512_fmadd_ps(tmp2262, _mm512_set1_ps(3.2e+01f), tmp2271);
__m512 tmp2248 = _mm512_fmadd_ps(tmp2238, _mm512_set1_ps(2e+00f), tmp2249);
__m512 tmp2268 = _mm512_fmadd_ps(tmp2258, _mm512_set1_ps(2e+00f), tmp2269);
__m512 tmp2221 = tmp2233;
__m512 tmp2227 = tmp2253;
__m512 tmp2222 = tmp2239;
__m512 tmp2228 = tmp2259;
__m512 tmp2223 = tmp2244;
__m512 tmp2229 = tmp2264;
__m512 tmp2224 = tmp2246;
__m512 tmp2230 = tmp2266;
__m512 tmp2225 = tmp2248;
__m512 tmp2231 = tmp2268;
__m512 tmp2226 = tmp2250;
__m512 tmp2232 = tmp2270;
__m512 tmp2317 = _mm512_unpacklo_ps(tmp2221, tmp2222);
__m512 tmp2318 = _mm512_unpackhi_ps(tmp2221, tmp2222);
__m512 tmp2319 = _mm512_unpacklo_ps(tmp2223, tmp2224);
__m512 tmp2320 = _mm512_unpackhi_ps(tmp2223, tmp2224);
__m512 tmp2321 = _mm512_unpacklo_ps(tmp2225, tmp2226);
__m512 tmp2322 = _mm512_unpackhi_ps(tmp2225, tmp2226);
__m512 tmp2323 = _mm512_unpacklo_ps(tmp2227, tmp2228);
__m512 tmp2324 = _mm512_unpackhi_ps(tmp2227, tmp2228);
__m512 tmp2325 = _mm512_unpacklo_ps(tmp2229, tmp2230);
__m512 tmp2326 = _mm512_unpackhi_ps(tmp2229, tmp2230);
__m512 tmp2327 = _mm512_unpacklo_ps(tmp2231, tmp2232);
__m512 tmp2328 = _mm512_unpackhi_ps(tmp2231, tmp2232);
__m512 tmp2329 = _mm512_shuffle_ps(tmp2317, tmp2319, 68);
__m512 tmp2330 = _mm512_shuffle_ps(tmp2317, tmp2319, 238);
__m512 tmp2331 = _mm512_shuffle_ps(tmp2318, tmp2320, 68);
__m512 tmp2332 = _mm512_shuffle_ps(tmp2318, tmp2320, 238);
__m512 tmp2333 = _mm512_shuffle_ps(tmp2321, tmp2323, 68);
__m512 tmp2334 = _mm512_shuffle_ps(tmp2321, tmp2323, 238);
__m512 tmp2335 = _mm512_shuffle_ps(tmp2322, tmp2324, 68);
__m512 tmp2336 = _mm512_shuffle_ps(tmp2322, tmp2324, 238);
__m512 tmp2337 = _mm512_shuffle_ps(tmp2325, tmp2327, 68);
__m512 tmp2338 = _mm512_shuffle_ps(tmp2325, tmp2327, 238);
__m512 tmp2339 = _mm512_shuffle_ps(tmp2326, tmp2328, 68);
__m512 tmp2340 = _mm512_shuffle_ps(tmp2326, tmp2328, 238);
__m512 tmp2341 = _mm512_shuffle_f32x4(tmp2329, tmp2333, 136);
__m512 tmp2342 = _mm512_shuffle_f32x4(tmp2329, tmp2333, 221);
__m512 tmp2343 = _mm512_shuffle_f32x4(tmp2330, tmp2334, 136);
__m512 tmp2344 = _mm512_shuffle_f32x4(tmp2330, tmp2334, 221);
__m512 tmp2345 = _mm512_shuffle_f32x4(tmp2331, tmp2335, 136);
__m512 tmp2346 = _mm512_shuffle_f32x4(tmp2331, tmp2335, 221);
__m512 tmp2347 = _mm512_shuffle_f32x4(tmp2332, tmp2336, 136);
__m512 tmp2348 = _mm512_shuffle_f32x4(tmp2332, tmp2336, 221);
__m512 tmp2349 = _mm512_shuffle_f32x4(tmp2337, tmp2337, 136);
__m512 tmp2350 = _mm512_shuffle_f32x4(tmp2337, tmp2337, 221);
__m512 tmp2351 = _mm512_shuffle_f32x4(tmp2338, tmp2338, 136);
__m512 tmp2352 = _mm512_shuffle_f32x4(tmp2338, tmp2338, 221);
__m512 tmp2353 = _mm512_shuffle_f32x4(tmp2339, tmp2339, 136);
__m512 tmp2354 = _mm512_shuffle_f32x4(tmp2339, tmp2339, 221);
__m512 tmp2355 = _mm512_shuffle_f32x4(tmp2340, tmp2340, 136);
__m512 tmp2356 = _mm512_shuffle_f32x4(tmp2340, tmp2340, 221);
tmp2221 = _mm512_shuffle_f32x4(tmp2341, tmp2349, 136);
tmp2229 = _mm512_shuffle_f32x4(tmp2341, tmp2349, 221);
tmp2222 = _mm512_shuffle_f32x4(tmp2343, tmp2351, 136);
tmp2230 = _mm512_shuffle_f32x4(tmp2343, tmp2351, 221);
tmp2223 = _mm512_shuffle_f32x4(tmp2345, tmp2353, 136);
tmp2231 = _mm512_shuffle_f32x4(tmp2345, tmp2353, 221);
tmp2224 = _mm512_shuffle_f32x4(tmp2347, tmp2355, 136);
tmp2232 = _mm512_shuffle_f32x4(tmp2347, tmp2355, 221);
tmp2225 = _mm512_shuffle_f32x4(tmp2342, tmp2350, 136);
__m512 tmp2273 = _mm512_shuffle_f32x4(tmp2342, tmp2350, 221);
tmp2226 = _mm512_shuffle_f32x4(tmp2344, tmp2352, 136);
__m512 tmp2274 = _mm512_shuffle_f32x4(tmp2344, tmp2352, 221);
tmp2227 = _mm512_shuffle_f32x4(tmp2346, tmp2354, 136);
__m512 tmp2275 = _mm512_shuffle_f32x4(tmp2346, tmp2354, 221);
tmp2228 = _mm512_shuffle_f32x4(tmp2348, tmp2356, 136);
__m512 tmp2276 = _mm512_shuffle_f32x4(tmp2348, tmp2356, 221);
__m512 tmp2281 = _mm512_add_ps(tmp2222, tmp2223);
__m512 tmp2301 = _mm512_add_ps(tmp2230, tmp2231);
__m512 tmp2280 = _mm512_add_ps(tmp2224, tmp2225);
__m512 tmp2300 = _mm512_add_ps(tmp2232, tmp2273);
__m512 tmp2286 = _mm512_sub_ps(tmp2224, tmp2225);
__m512 tmp2306 = _mm512_sub_ps(tmp2232, tmp2273);
__m512 tmp2285 = _mm512_sub_ps(tmp2222, tmp2223);
__m512 tmp2305 = _mm512_sub_ps(tmp2230, tmp2231);
__m512 tmp2282 = _mm512_add_ps(tmp2226, tmp2227);
__m512 tmp2302 = _mm512_add_ps(tmp2274, tmp2275);
__m512 tmp2287 = _mm512_sub_ps(tmp2226, tmp2227);
__m512 tmp2307 = _mm512_sub_ps(tmp2274, tmp2275);
__m512 tmp2284 = _mm512_fmadd_ps(tmp2286, _mm512_set1_ps(2e+00f), tmp2285);
__m512 tmp2304 = _mm512_fmadd_ps(tmp2306, _mm512_set1_ps(2e+00f), tmp2305);
__m512 tmp2291 = _mm512_fmadd_ps(tmp2286, _mm512_set1_ps(8e+00f), tmp2285);
__m512 tmp2311 = _mm512_fmadd_ps(tmp2306, _mm512_set1_ps(8e+00f), tmp2305);
__m512 tmp2279 = _mm512_add_ps(tmp2280, tmp2281);
__m512 tmp2299 = _mm512_add_ps(tmp2300, tmp2301);
__m512 tmp2283 = _mm512_fmadd_ps(tmp2287, _mm512_set1_ps(1.6e+01f), tmp2284);
__m512 tmp2303 = _mm512_fmadd_ps(tmp2307, _mm512_set1_ps(1.6e+01f), tmp2304);
__m512 tmp2290 = _mm512_fmadd_ps(tmp2287, _mm512_set1_ps(4e+00f), tmp2291);
__m512 tmp2310 = _mm512_fmadd_ps(tmp2307, _mm512_set1_ps(4e+00f), tmp2311);
__m512 tmp2296 = _mm512_add_ps(tmp2287, tmp2285);
__m512 tmp2316 = _mm512_add_ps(tmp2307, tmp2305);
__m512 tmp2289 = _mm512_fmadd_ps(tmp2280, _mm512_set1_ps(4e+00f), tmp2281);
__m512 tmp2309 = _mm512_fmadd_ps(tmp2300, _mm512_set1_ps(4e+00f), tmp2301);
__m512 tmp2293 = _mm512_fmadd_ps(tmp2280, _mm512_set1_ps(1.6e+01f), tmp2281);
__m512 tmp2313 = _mm512_fmadd_ps(tmp2300, _mm512_set1_ps(1.6e+01f), tmp2301);
__m512 tmp2278 = _mm512_add_ps(tmp2279, tmp2221);
__m512 tmp2298 = _mm512_add_ps(tmp2299, tmp2229);
__m512 tmp2295 = _mm512_add_ps(tmp2296, tmp2228);
__m512 tmp2315 = _mm512_add_ps(tmp2316, tmp2276);
__m512 tmp2277 = _mm512_fmadd_ps(tmp2282, _mm512_set1_ps(3.2e+01f), tmp2278);
__m512 tmp2297 = _mm512_fmadd_ps(tmp2302, _mm512_set1_ps(3.2e+01f), tmp2298);
__m512 tmp2288 = _mm512_fmadd_ps(tmp2282, _mm512_set1_ps(8e+00f), tmp2289);
__m512 tmp2308 = _mm512_fmadd_ps(tmp2302, _mm512_set1_ps(8e+00f), tmp2309);
__m512 tmp2294 = _mm512_fmadd_ps(tmp2286, _mm512_set1_ps(3.2e+01f), tmp2295);
__m512 tmp2314 = _mm512_fmadd_ps(tmp2306, _mm512_set1_ps(3.2e+01f), tmp2315);
__m512 tmp2292 = _mm512_fmadd_ps(tmp2282, _mm512_set1_ps(2e+00f), tmp2293);
__m512 tmp2312 = _mm512_fmadd_ps(tmp2302, _mm512_set1_ps(2e+00f), tmp2313);
__m512 out489 = tmp2277;
__m512 out495 = tmp2297;
__m512 out490 = tmp2283;
__m512 out496 = tmp2303;
__m512 out491 = tmp2288;
__m512 out497 = tmp2308;
__m512 out492 = tmp2290;
__m512 out498 = tmp2310;
__m512 out493 = tmp2292;
__m512 out499 = tmp2312;
__m512 out494 = tmp2294;
__m512 out500 = tmp2314;
_mm512_mask_storeu_ps(datPtr3+0+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 4095, out489);
_mm512_mask_storeu_ps(datPtr3+48+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 511, out495);
_mm512_mask_storeu_ps(datPtr3+84+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 4095, out490);
_mm512_mask_storeu_ps(datPtr3+132+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 511, out496);
_mm512_mask_storeu_ps(datPtr3+168+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 4095, out491);
_mm512_mask_storeu_ps(datPtr3+216+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 511, out497);
_mm512_mask_storeu_ps(datPtr3+252+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 4095, out492);
_mm512_mask_storeu_ps(datPtr3+300+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 511, out498);
_mm512_mask_storeu_ps(datPtr3+336+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 4095, out493);
_mm512_mask_storeu_ps(datPtr3+384+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 511, out499);
_mm512_mask_storeu_ps(datPtr3+420+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 4095, out494);
_mm512_mask_storeu_ps(datPtr3+468+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 511, out500);
__m512 sf65 = _mm512_loadu_ps(sfPtr4+256+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 sf66 = _mm512_loadu_ps(sfPtr4+320+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 in464 = _mm512_shuffle_f32x4(sf65, sf66, 68);
__m512 in465 = _mm512_shuffle_f32x4(sf65, sf66, 238);
__m512 sf67 = _mm512_loadu_ps(sfPtr4+1961728+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 sf68 = _mm512_loadu_ps(sfPtr4+1961792+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 in466 = _mm512_shuffle_f32x4(sf67, sf68, 68);
__m512 in467 = _mm512_shuffle_f32x4(sf67, sf68, 238);
__m512 sf69 = _mm512_loadu_ps(sfPtr4+3923200+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 sf70 = _mm512_loadu_ps(sfPtr4+3923264+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 in468 = _mm512_shuffle_f32x4(sf69, sf70, 68);
__m512 in469 = _mm512_shuffle_f32x4(sf69, sf70, 238);
__m512 sf71 = _mm512_loadu_ps(sfPtr4+5884672+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 sf72 = _mm512_loadu_ps(sfPtr4+5884736+7845888*i12+490368*j8+1536*k14+768*l5);
__m512 in470 = _mm512_shuffle_f32x4(sf71, sf72, 68);
__m512 in471 = _mm512_shuffle_f32x4(sf71, sf72, 238);
__m512 tmp2367 = _mm512_add_ps(in465, in466);
__m512 tmp2366 = _mm512_add_ps(in467, in468);
__m512 tmp2372 = _mm512_sub_ps(in467, in468);
__m512 tmp2371 = _mm512_sub_ps(in465, in466);
__m512 tmp2368 = _mm512_add_ps(in469, in470);
__m512 tmp2373 = _mm512_sub_ps(in469, in470);
__m512 tmp2370 = _mm512_fmadd_ps(tmp2372, _mm512_set1_ps(2e+00f), tmp2371);
__m512 tmp2377 = _mm512_fmadd_ps(tmp2372, _mm512_set1_ps(8e+00f), tmp2371);
__m512 tmp2365 = _mm512_add_ps(tmp2366, tmp2367);
__m512 tmp2369 = _mm512_fmadd_ps(tmp2373, _mm512_set1_ps(1.6e+01f), tmp2370);
__m512 tmp2376 = _mm512_fmadd_ps(tmp2373, _mm512_set1_ps(4e+00f), tmp2377);
__m512 tmp2382 = _mm512_add_ps(tmp2373, tmp2371);
__m512 tmp2375 = _mm512_fmadd_ps(tmp2366, _mm512_set1_ps(4e+00f), tmp2367);
__m512 tmp2379 = _mm512_fmadd_ps(tmp2366, _mm512_set1_ps(1.6e+01f), tmp2367);
__m512 tmp2364 = _mm512_add_ps(tmp2365, in464);
__m512 tmp2381 = _mm512_add_ps(tmp2382, in471);
__m512 tmp2363 = _mm512_fmadd_ps(tmp2368, _mm512_set1_ps(3.2e+01f), tmp2364);
__m512 tmp2374 = _mm512_fmadd_ps(tmp2368, _mm512_set1_ps(8e+00f), tmp2375);
__m512 tmp2380 = _mm512_fmadd_ps(tmp2372, _mm512_set1_ps(3.2e+01f), tmp2381);
__m512 tmp2378 = _mm512_fmadd_ps(tmp2368, _mm512_set1_ps(2e+00f), tmp2379);
__m512 tmp2357 = tmp2363;
__m512 tmp2358 = tmp2369;
__m512 tmp2359 = tmp2374;
__m512 tmp2360 = tmp2376;
__m512 tmp2361 = tmp2378;
__m512 tmp2362 = tmp2380;
__m512 tmp2433 = _mm512_unpacklo_ps(tmp2357, tmp2358);
__m512 tmp2434 = _mm512_unpackhi_ps(tmp2357, tmp2358);
__m512 tmp2435 = _mm512_unpacklo_ps(tmp2359, tmp2360);
__m512 tmp2436 = _mm512_unpackhi_ps(tmp2359, tmp2360);
__m512 tmp2437 = _mm512_unpacklo_ps(tmp2361, tmp2362);
__m512 tmp2438 = _mm512_unpackhi_ps(tmp2361, tmp2362);
__m512 tmp2439 = _mm512_shuffle_ps(tmp2433, tmp2435, 68);
__m512 tmp2440 = _mm512_shuffle_ps(tmp2433, tmp2435, 238);
__m512 tmp2441 = _mm512_shuffle_ps(tmp2434, tmp2436, 68);
__m512 tmp2442 = _mm512_shuffle_ps(tmp2434, tmp2436, 238);
__m512 tmp2443 = _mm512_shuffle_ps(tmp2437, tmp2437, 238);
__m512 tmp2444 = _mm512_shuffle_ps(tmp2438, tmp2438, 238);
__m512 tmp2445 = _mm512_shuffle_f32x4(tmp2439, tmp2437, 136);
__m512 tmp2446 = _mm512_shuffle_f32x4(tmp2439, tmp2437, 221);
__m512 tmp2447 = _mm512_shuffle_f32x4(tmp2440, tmp2443, 136);
__m512 tmp2448 = _mm512_shuffle_f32x4(tmp2440, tmp2443, 221);
__m512 tmp2449 = _mm512_shuffle_f32x4(tmp2441, tmp2438, 136);
__m512 tmp2450 = _mm512_shuffle_f32x4(tmp2441, tmp2438, 221);
__m512 tmp2451 = _mm512_shuffle_f32x4(tmp2442, tmp2444, 136);
__m512 tmp2452 = _mm512_shuffle_f32x4(tmp2442, tmp2444, 221);
tmp2357 = _mm512_shuffle_f32x4(tmp2445, tmp2445, 136);
__m512 tmp2385 = _mm512_shuffle_f32x4(tmp2445, tmp2445, 221);
tmp2358 = _mm512_shuffle_f32x4(tmp2447, tmp2447, 136);
__m512 tmp2386 = _mm512_shuffle_f32x4(tmp2447, tmp2447, 221);
tmp2359 = _mm512_shuffle_f32x4(tmp2449, tmp2449, 136);
__m512 tmp2387 = _mm512_shuffle_f32x4(tmp2449, tmp2449, 221);
tmp2360 = _mm512_shuffle_f32x4(tmp2451, tmp2451, 136);
__m512 tmp2388 = _mm512_shuffle_f32x4(tmp2451, tmp2451, 221);
tmp2361 = _mm512_shuffle_f32x4(tmp2446, tmp2446, 136);
__m512 tmp2389 = _mm512_shuffle_f32x4(tmp2446, tmp2446, 221);
tmp2362 = _mm512_shuffle_f32x4(tmp2448, tmp2448, 136);
__m512 tmp2390 = _mm512_shuffle_f32x4(tmp2448, tmp2448, 221);
__m512 tmp2383 = _mm512_shuffle_f32x4(tmp2450, tmp2450, 136);
__m512 tmp2391 = _mm512_shuffle_f32x4(tmp2450, tmp2450, 221);
__m512 tmp2384 = _mm512_shuffle_f32x4(tmp2452, tmp2452, 136);
__m512 tmp2392 = _mm512_shuffle_f32x4(tmp2452, tmp2452, 221);
__m512 tmp2397 = _mm512_add_ps(tmp2358, tmp2359);
__m512 tmp2417 = _mm512_add_ps(tmp2386, tmp2387);
__m512 tmp2396 = _mm512_add_ps(tmp2360, tmp2361);
__m512 tmp2416 = _mm512_add_ps(tmp2388, tmp2389);
__m512 tmp2402 = _mm512_sub_ps(tmp2360, tmp2361);
__m512 tmp2422 = _mm512_sub_ps(tmp2388, tmp2389);
__m512 tmp2401 = _mm512_sub_ps(tmp2358, tmp2359);
__m512 tmp2421 = _mm512_sub_ps(tmp2386, tmp2387);
__m512 tmp2398 = _mm512_add_ps(tmp2362, tmp2383);
__m512 tmp2418 = _mm512_add_ps(tmp2390, tmp2391);
__m512 tmp2403 = _mm512_sub_ps(tmp2362, tmp2383);
__m512 tmp2423 = _mm512_sub_ps(tmp2390, tmp2391);
__m512 tmp2400 = _mm512_fmadd_ps(tmp2402, _mm512_set1_ps(2e+00f), tmp2401);
__m512 tmp2420 = _mm512_fmadd_ps(tmp2422, _mm512_set1_ps(2e+00f), tmp2421);
__m512 tmp2407 = _mm512_fmadd_ps(tmp2402, _mm512_set1_ps(8e+00f), tmp2401);
__m512 tmp2427 = _mm512_fmadd_ps(tmp2422, _mm512_set1_ps(8e+00f), tmp2421);
__m512 tmp2395 = _mm512_add_ps(tmp2396, tmp2397);
__m512 tmp2415 = _mm512_add_ps(tmp2416, tmp2417);
__m512 tmp2399 = _mm512_fmadd_ps(tmp2403, _mm512_set1_ps(1.6e+01f), tmp2400);
__m512 tmp2419 = _mm512_fmadd_ps(tmp2423, _mm512_set1_ps(1.6e+01f), tmp2420);
__m512 tmp2406 = _mm512_fmadd_ps(tmp2403, _mm512_set1_ps(4e+00f), tmp2407);
__m512 tmp2426 = _mm512_fmadd_ps(tmp2423, _mm512_set1_ps(4e+00f), tmp2427);
__m512 tmp2412 = _mm512_add_ps(tmp2403, tmp2401);
__m512 tmp2432 = _mm512_add_ps(tmp2423, tmp2421);
__m512 tmp2405 = _mm512_fmadd_ps(tmp2396, _mm512_set1_ps(4e+00f), tmp2397);
__m512 tmp2425 = _mm512_fmadd_ps(tmp2416, _mm512_set1_ps(4e+00f), tmp2417);
__m512 tmp2409 = _mm512_fmadd_ps(tmp2396, _mm512_set1_ps(1.6e+01f), tmp2397);
__m512 tmp2429 = _mm512_fmadd_ps(tmp2416, _mm512_set1_ps(1.6e+01f), tmp2417);
__m512 tmp2394 = _mm512_add_ps(tmp2395, tmp2357);
__m512 tmp2414 = _mm512_add_ps(tmp2415, tmp2385);
__m512 tmp2411 = _mm512_add_ps(tmp2412, tmp2384);
__m512 tmp2431 = _mm512_add_ps(tmp2432, tmp2392);
__m512 tmp2393 = _mm512_fmadd_ps(tmp2398, _mm512_set1_ps(3.2e+01f), tmp2394);
__m512 tmp2413 = _mm512_fmadd_ps(tmp2418, _mm512_set1_ps(3.2e+01f), tmp2414);
__m512 tmp2404 = _mm512_fmadd_ps(tmp2398, _mm512_set1_ps(8e+00f), tmp2405);
__m512 tmp2424 = _mm512_fmadd_ps(tmp2418, _mm512_set1_ps(8e+00f), tmp2425);
__m512 tmp2410 = _mm512_fmadd_ps(tmp2402, _mm512_set1_ps(3.2e+01f), tmp2411);
__m512 tmp2430 = _mm512_fmadd_ps(tmp2422, _mm512_set1_ps(3.2e+01f), tmp2431);
__m512 tmp2408 = _mm512_fmadd_ps(tmp2398, _mm512_set1_ps(2e+00f), tmp2409);
__m512 tmp2428 = _mm512_fmadd_ps(tmp2418, _mm512_set1_ps(2e+00f), tmp2429);
__m512 out501 = tmp2393;
__m512 out507 = tmp2413;
__m512 out502 = tmp2399;
__m512 out508 = tmp2419;
__m512 out503 = tmp2404;
__m512 out509 = tmp2424;
__m512 out504 = tmp2406;
__m512 out510 = tmp2426;
__m512 out505 = tmp2408;
__m512 out511 = tmp2428;
__m512 out506 = tmp2410;
__m512 out512 = tmp2430;
_mm512_mask_storeu_ps(datPtr3+504+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 63, out501);
_mm512_mask_storeu_ps(datPtr3+528+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 63, out507);
_mm512_mask_storeu_ps(datPtr3+588+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 63, out502);
_mm512_mask_storeu_ps(datPtr3+612+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 63, out508);
_mm512_mask_storeu_ps(datPtr3+672+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 63, out503);
_mm512_mask_storeu_ps(datPtr3+696+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 63, out509);
_mm512_mask_storeu_ps(datPtr3+756+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 63, out504);
_mm512_mask_storeu_ps(datPtr3+780+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 63, out510);
_mm512_mask_storeu_ps(datPtr3+840+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 63, out505);
_mm512_mask_storeu_ps(datPtr3+864+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 63, out511);
_mm512_mask_storeu_ps(datPtr3+924+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 63, out506);
_mm512_mask_storeu_ps(datPtr3+948+3861648*i12+84*toH1+4*toW1+12096*k14+6048*l5, 63, out512);
if (j8 >= last3) return;
++j8;
rel3 = 1;
}
ptrdiff_t toH2 = base3+6;
ptrdiff_t toW2 = 12;
ptrdiff_t k15 = 22*w8;
ptrdiff_t kk5 = k15+(w8 < 13 ? 21 : 33);
for (; k15 != 319; ++k15) {
ptrdiff_t l6 = 0;
for (; l6 != 2; ++l6) {
__m512 sf73 = _mm512_loadu_ps(sfPtr4+0+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf74 = _mm512_loadu_ps(sfPtr4+128+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in472 = _mm512_shuffle_f32x4(sf73, sf74, 68);
__m512 in473 = _mm512_shuffle_f32x4(sf73, sf74, 238);
__m512 sf75 = _mm512_loadu_ps(sfPtr4+64+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf76 = _mm512_loadu_ps(sfPtr4+192+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in480 = _mm512_shuffle_f32x4(sf75, sf76, 68);
__m512 in481 = _mm512_shuffle_f32x4(sf75, sf76, 238);
__m512 sf77 = _mm512_loadu_ps(sfPtr4+1961472+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf78 = _mm512_loadu_ps(sfPtr4+1961600+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in474 = _mm512_shuffle_f32x4(sf77, sf78, 68);
__m512 in475 = _mm512_shuffle_f32x4(sf77, sf78, 238);
__m512 sf79 = _mm512_loadu_ps(sfPtr4+1961536+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf80 = _mm512_loadu_ps(sfPtr4+1961664+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in482 = _mm512_shuffle_f32x4(sf79, sf80, 68);
__m512 in483 = _mm512_shuffle_f32x4(sf79, sf80, 238);
__m512 sf81 = _mm512_loadu_ps(sfPtr4+3922944+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf82 = _mm512_loadu_ps(sfPtr4+3923072+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in476 = _mm512_shuffle_f32x4(sf81, sf82, 68);
__m512 in477 = _mm512_shuffle_f32x4(sf81, sf82, 238);
__m512 sf83 = _mm512_loadu_ps(sfPtr4+3923008+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf84 = _mm512_loadu_ps(sfPtr4+3923136+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in484 = _mm512_shuffle_f32x4(sf83, sf84, 68);
__m512 in485 = _mm512_shuffle_f32x4(sf83, sf84, 238);
__m512 sf85 = _mm512_loadu_ps(sfPtr4+5884416+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf86 = _mm512_loadu_ps(sfPtr4+5884544+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in478 = _mm512_shuffle_f32x4(sf85, sf86, 68);
__m512 in479 = _mm512_shuffle_f32x4(sf85, sf86, 238);
__m512 sf87 = _mm512_loadu_ps(sfPtr4+5884480+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf88 = _mm512_loadu_ps(sfPtr4+5884608+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in486 = _mm512_shuffle_f32x4(sf87, sf88, 68);
__m512 in487 = _mm512_shuffle_f32x4(sf87, sf88, 238);
__m512 tmp2469 = _mm512_add_ps(in473, in474);
__m512 tmp2489 = _mm512_add_ps(in481, in482);
__m512 tmp2468 = _mm512_add_ps(in475, in476);
__m512 tmp2488 = _mm512_add_ps(in483, in484);
__m512 tmp2474 = _mm512_sub_ps(in475, in476);
__m512 tmp2494 = _mm512_sub_ps(in483, in484);
__m512 tmp2473 = _mm512_sub_ps(in473, in474);
__m512 tmp2493 = _mm512_sub_ps(in481, in482);
__m512 tmp2470 = _mm512_add_ps(in477, in478);
__m512 tmp2490 = _mm512_add_ps(in485, in486);
__m512 tmp2475 = _mm512_sub_ps(in477, in478);
__m512 tmp2495 = _mm512_sub_ps(in485, in486);
__m512 tmp2472 = _mm512_fmadd_ps(tmp2474, _mm512_set1_ps(2e+00f), tmp2473);
__m512 tmp2492 = _mm512_fmadd_ps(tmp2494, _mm512_set1_ps(2e+00f), tmp2493);
__m512 tmp2479 = _mm512_fmadd_ps(tmp2474, _mm512_set1_ps(8e+00f), tmp2473);
__m512 tmp2499 = _mm512_fmadd_ps(tmp2494, _mm512_set1_ps(8e+00f), tmp2493);
__m512 tmp2467 = _mm512_add_ps(tmp2468, tmp2469);
__m512 tmp2487 = _mm512_add_ps(tmp2488, tmp2489);
__m512 tmp2471 = _mm512_fmadd_ps(tmp2475, _mm512_set1_ps(1.6e+01f), tmp2472);
__m512 tmp2491 = _mm512_fmadd_ps(tmp2495, _mm512_set1_ps(1.6e+01f), tmp2492);
__m512 tmp2478 = _mm512_fmadd_ps(tmp2475, _mm512_set1_ps(4e+00f), tmp2479);
__m512 tmp2498 = _mm512_fmadd_ps(tmp2495, _mm512_set1_ps(4e+00f), tmp2499);
__m512 tmp2484 = _mm512_add_ps(tmp2475, tmp2473);
__m512 tmp2504 = _mm512_add_ps(tmp2495, tmp2493);
__m512 tmp2477 = _mm512_fmadd_ps(tmp2468, _mm512_set1_ps(4e+00f), tmp2469);
__m512 tmp2497 = _mm512_fmadd_ps(tmp2488, _mm512_set1_ps(4e+00f), tmp2489);
__m512 tmp2481 = _mm512_fmadd_ps(tmp2468, _mm512_set1_ps(1.6e+01f), tmp2469);
__m512 tmp2501 = _mm512_fmadd_ps(tmp2488, _mm512_set1_ps(1.6e+01f), tmp2489);
__m512 tmp2466 = _mm512_add_ps(tmp2467, in472);
__m512 tmp2486 = _mm512_add_ps(tmp2487, in480);
__m512 tmp2483 = _mm512_add_ps(tmp2484, in479);
__m512 tmp2503 = _mm512_add_ps(tmp2504, in487);
__m512 tmp2465 = _mm512_fmadd_ps(tmp2470, _mm512_set1_ps(3.2e+01f), tmp2466);
__m512 tmp2485 = _mm512_fmadd_ps(tmp2490, _mm512_set1_ps(3.2e+01f), tmp2486);
__m512 tmp2476 = _mm512_fmadd_ps(tmp2470, _mm512_set1_ps(8e+00f), tmp2477);
__m512 tmp2496 = _mm512_fmadd_ps(tmp2490, _mm512_set1_ps(8e+00f), tmp2497);
__m512 tmp2482 = _mm512_fmadd_ps(tmp2474, _mm512_set1_ps(3.2e+01f), tmp2483);
__m512 tmp2502 = _mm512_fmadd_ps(tmp2494, _mm512_set1_ps(3.2e+01f), tmp2503);
__m512 tmp2480 = _mm512_fmadd_ps(tmp2470, _mm512_set1_ps(2e+00f), tmp2481);
__m512 tmp2500 = _mm512_fmadd_ps(tmp2490, _mm512_set1_ps(2e+00f), tmp2501);
__m512 tmp2453 = tmp2465;
__m512 tmp2459 = tmp2485;
__m512 tmp2454 = tmp2471;
__m512 tmp2460 = tmp2491;
__m512 tmp2455 = tmp2476;
__m512 tmp2461 = tmp2496;
__m512 tmp2456 = tmp2478;
__m512 tmp2462 = tmp2498;
__m512 tmp2457 = tmp2480;
__m512 tmp2463 = tmp2500;
__m512 tmp2458 = tmp2482;
__m512 tmp2464 = tmp2502;
__m512 tmp2549 = _mm512_unpacklo_ps(tmp2453, tmp2454);
__m512 tmp2550 = _mm512_unpackhi_ps(tmp2453, tmp2454);
__m512 tmp2551 = _mm512_unpacklo_ps(tmp2455, tmp2456);
__m512 tmp2552 = _mm512_unpackhi_ps(tmp2455, tmp2456);
__m512 tmp2553 = _mm512_unpacklo_ps(tmp2457, tmp2458);
__m512 tmp2554 = _mm512_unpackhi_ps(tmp2457, tmp2458);
__m512 tmp2555 = _mm512_unpacklo_ps(tmp2459, tmp2460);
__m512 tmp2556 = _mm512_unpackhi_ps(tmp2459, tmp2460);
__m512 tmp2557 = _mm512_unpacklo_ps(tmp2461, tmp2462);
__m512 tmp2558 = _mm512_unpackhi_ps(tmp2461, tmp2462);
__m512 tmp2559 = _mm512_unpacklo_ps(tmp2463, tmp2464);
__m512 tmp2560 = _mm512_unpackhi_ps(tmp2463, tmp2464);
__m512 tmp2561 = _mm512_shuffle_ps(tmp2549, tmp2551, 68);
__m512 tmp2562 = _mm512_shuffle_ps(tmp2549, tmp2551, 238);
__m512 tmp2563 = _mm512_shuffle_ps(tmp2550, tmp2552, 68);
__m512 tmp2564 = _mm512_shuffle_ps(tmp2550, tmp2552, 238);
__m512 tmp2565 = _mm512_shuffle_ps(tmp2553, tmp2555, 68);
__m512 tmp2566 = _mm512_shuffle_ps(tmp2553, tmp2555, 238);
__m512 tmp2567 = _mm512_shuffle_ps(tmp2554, tmp2556, 68);
__m512 tmp2568 = _mm512_shuffle_ps(tmp2554, tmp2556, 238);
__m512 tmp2569 = _mm512_shuffle_ps(tmp2557, tmp2559, 68);
__m512 tmp2570 = _mm512_shuffle_ps(tmp2557, tmp2559, 238);
__m512 tmp2571 = _mm512_shuffle_ps(tmp2558, tmp2560, 68);
__m512 tmp2572 = _mm512_shuffle_ps(tmp2558, tmp2560, 238);
__m512 tmp2573 = _mm512_shuffle_f32x4(tmp2561, tmp2565, 136);
__m512 tmp2574 = _mm512_shuffle_f32x4(tmp2561, tmp2565, 221);
__m512 tmp2575 = _mm512_shuffle_f32x4(tmp2562, tmp2566, 136);
__m512 tmp2576 = _mm512_shuffle_f32x4(tmp2562, tmp2566, 221);
__m512 tmp2577 = _mm512_shuffle_f32x4(tmp2563, tmp2567, 136);
__m512 tmp2578 = _mm512_shuffle_f32x4(tmp2563, tmp2567, 221);
__m512 tmp2579 = _mm512_shuffle_f32x4(tmp2564, tmp2568, 136);
__m512 tmp2580 = _mm512_shuffle_f32x4(tmp2564, tmp2568, 221);
__m512 tmp2581 = _mm512_shuffle_f32x4(tmp2569, tmp2569, 136);
__m512 tmp2582 = _mm512_shuffle_f32x4(tmp2569, tmp2569, 221);
__m512 tmp2583 = _mm512_shuffle_f32x4(tmp2570, tmp2570, 136);
__m512 tmp2584 = _mm512_shuffle_f32x4(tmp2570, tmp2570, 221);
__m512 tmp2585 = _mm512_shuffle_f32x4(tmp2571, tmp2571, 136);
__m512 tmp2586 = _mm512_shuffle_f32x4(tmp2571, tmp2571, 221);
__m512 tmp2587 = _mm512_shuffle_f32x4(tmp2572, tmp2572, 136);
__m512 tmp2588 = _mm512_shuffle_f32x4(tmp2572, tmp2572, 221);
tmp2453 = _mm512_shuffle_f32x4(tmp2573, tmp2581, 136);
tmp2461 = _mm512_shuffle_f32x4(tmp2573, tmp2581, 221);
tmp2454 = _mm512_shuffle_f32x4(tmp2575, tmp2583, 136);
tmp2462 = _mm512_shuffle_f32x4(tmp2575, tmp2583, 221);
tmp2455 = _mm512_shuffle_f32x4(tmp2577, tmp2585, 136);
tmp2463 = _mm512_shuffle_f32x4(tmp2577, tmp2585, 221);
tmp2456 = _mm512_shuffle_f32x4(tmp2579, tmp2587, 136);
tmp2464 = _mm512_shuffle_f32x4(tmp2579, tmp2587, 221);
tmp2457 = _mm512_shuffle_f32x4(tmp2574, tmp2582, 136);
__m512 tmp2505 = _mm512_shuffle_f32x4(tmp2574, tmp2582, 221);
tmp2458 = _mm512_shuffle_f32x4(tmp2576, tmp2584, 136);
__m512 tmp2506 = _mm512_shuffle_f32x4(tmp2576, tmp2584, 221);
tmp2459 = _mm512_shuffle_f32x4(tmp2578, tmp2586, 136);
__m512 tmp2507 = _mm512_shuffle_f32x4(tmp2578, tmp2586, 221);
tmp2460 = _mm512_shuffle_f32x4(tmp2580, tmp2588, 136);
__m512 tmp2508 = _mm512_shuffle_f32x4(tmp2580, tmp2588, 221);
__m512 tmp2513 = _mm512_add_ps(tmp2454, tmp2455);
__m512 tmp2533 = _mm512_add_ps(tmp2462, tmp2463);
__m512 tmp2512 = _mm512_add_ps(tmp2456, tmp2457);
__m512 tmp2532 = _mm512_add_ps(tmp2464, tmp2505);
__m512 tmp2518 = _mm512_sub_ps(tmp2456, tmp2457);
__m512 tmp2538 = _mm512_sub_ps(tmp2464, tmp2505);
__m512 tmp2517 = _mm512_sub_ps(tmp2454, tmp2455);
__m512 tmp2537 = _mm512_sub_ps(tmp2462, tmp2463);
__m512 tmp2514 = _mm512_add_ps(tmp2458, tmp2459);
__m512 tmp2534 = _mm512_add_ps(tmp2506, tmp2507);
__m512 tmp2519 = _mm512_sub_ps(tmp2458, tmp2459);
__m512 tmp2539 = _mm512_sub_ps(tmp2506, tmp2507);
__m512 tmp2516 = _mm512_fmadd_ps(tmp2518, _mm512_set1_ps(2e+00f), tmp2517);
__m512 tmp2536 = _mm512_fmadd_ps(tmp2538, _mm512_set1_ps(2e+00f), tmp2537);
__m512 tmp2523 = _mm512_fmadd_ps(tmp2518, _mm512_set1_ps(8e+00f), tmp2517);
__m512 tmp2543 = _mm512_fmadd_ps(tmp2538, _mm512_set1_ps(8e+00f), tmp2537);
__m512 tmp2511 = _mm512_add_ps(tmp2512, tmp2513);
__m512 tmp2531 = _mm512_add_ps(tmp2532, tmp2533);
__m512 tmp2515 = _mm512_fmadd_ps(tmp2519, _mm512_set1_ps(1.6e+01f), tmp2516);
__m512 tmp2535 = _mm512_fmadd_ps(tmp2539, _mm512_set1_ps(1.6e+01f), tmp2536);
__m512 tmp2522 = _mm512_fmadd_ps(tmp2519, _mm512_set1_ps(4e+00f), tmp2523);
__m512 tmp2542 = _mm512_fmadd_ps(tmp2539, _mm512_set1_ps(4e+00f), tmp2543);
__m512 tmp2528 = _mm512_add_ps(tmp2519, tmp2517);
__m512 tmp2548 = _mm512_add_ps(tmp2539, tmp2537);
__m512 tmp2521 = _mm512_fmadd_ps(tmp2512, _mm512_set1_ps(4e+00f), tmp2513);
__m512 tmp2541 = _mm512_fmadd_ps(tmp2532, _mm512_set1_ps(4e+00f), tmp2533);
__m512 tmp2525 = _mm512_fmadd_ps(tmp2512, _mm512_set1_ps(1.6e+01f), tmp2513);
__m512 tmp2545 = _mm512_fmadd_ps(tmp2532, _mm512_set1_ps(1.6e+01f), tmp2533);
__m512 tmp2510 = _mm512_add_ps(tmp2511, tmp2453);
__m512 tmp2530 = _mm512_add_ps(tmp2531, tmp2461);
__m512 tmp2527 = _mm512_add_ps(tmp2528, tmp2460);
__m512 tmp2547 = _mm512_add_ps(tmp2548, tmp2508);
__m512 tmp2509 = _mm512_fmadd_ps(tmp2514, _mm512_set1_ps(3.2e+01f), tmp2510);
__m512 tmp2529 = _mm512_fmadd_ps(tmp2534, _mm512_set1_ps(3.2e+01f), tmp2530);
__m512 tmp2520 = _mm512_fmadd_ps(tmp2514, _mm512_set1_ps(8e+00f), tmp2521);
__m512 tmp2540 = _mm512_fmadd_ps(tmp2534, _mm512_set1_ps(8e+00f), tmp2541);
__m512 tmp2526 = _mm512_fmadd_ps(tmp2518, _mm512_set1_ps(3.2e+01f), tmp2527);
__m512 tmp2546 = _mm512_fmadd_ps(tmp2538, _mm512_set1_ps(3.2e+01f), tmp2547);
__m512 tmp2524 = _mm512_fmadd_ps(tmp2514, _mm512_set1_ps(2e+00f), tmp2525);
__m512 tmp2544 = _mm512_fmadd_ps(tmp2534, _mm512_set1_ps(2e+00f), tmp2545);
__m512 out513 = tmp2509;
__m512 out519 = tmp2529;
__m512 out514 = tmp2515;
__m512 out520 = tmp2535;
__m512 out515 = tmp2520;
__m512 out521 = tmp2540;
__m512 out516 = tmp2522;
__m512 out522 = tmp2542;
__m512 out517 = tmp2524;
__m512 out523 = tmp2544;
__m512 out518 = tmp2526;
__m512 out524 = tmp2546;
_mm512_mask_storeu_ps(datPtr3+0+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out513);
_mm512_mask_storeu_ps(datPtr3+456+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 4095, out519);
_mm512_mask_storeu_ps(datPtr3+84+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out514);
_mm512_mask_storeu_ps(datPtr3+540+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 4095, out520);
_mm512_mask_storeu_ps(datPtr3+168+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out515);
_mm512_mask_storeu_ps(datPtr3+624+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 4095, out521);
_mm512_mask_storeu_ps(datPtr3+252+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out516);
_mm512_mask_storeu_ps(datPtr3+708+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 4095, out522);
_mm512_mask_storeu_ps(datPtr3+336+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out517);
_mm512_mask_storeu_ps(datPtr3+792+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 4095, out523);
_mm512_mask_storeu_ps(datPtr3+420+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out518);
_mm512_mask_storeu_ps(datPtr3+876+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 4095, out524);
__m512 sf89 = _mm512_loadu_ps(sfPtr4+256+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf90 = _mm512_loadu_ps(sfPtr4+384+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in488 = _mm512_shuffle_f32x4(sf89, sf90, 68);
__m512 in489 = _mm512_shuffle_f32x4(sf89, sf90, 238);
__m512 sf91 = _mm512_loadu_ps(sfPtr4+320+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf92 = _mm512_loadu_ps(sfPtr4+448+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in496 = _mm512_shuffle_f32x4(sf91, sf92, 68);
__m512 in497 = _mm512_shuffle_f32x4(sf91, sf92, 238);
__m512 sf93 = _mm512_loadu_ps(sfPtr4+1961728+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf94 = _mm512_loadu_ps(sfPtr4+1961856+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in490 = _mm512_shuffle_f32x4(sf93, sf94, 68);
__m512 in491 = _mm512_shuffle_f32x4(sf93, sf94, 238);
__m512 sf95 = _mm512_loadu_ps(sfPtr4+1961792+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf96 = _mm512_loadu_ps(sfPtr4+1961920+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in498 = _mm512_shuffle_f32x4(sf95, sf96, 68);
__m512 in499 = _mm512_shuffle_f32x4(sf95, sf96, 238);
__m512 sf97 = _mm512_loadu_ps(sfPtr4+3923200+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf98 = _mm512_loadu_ps(sfPtr4+3923328+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in492 = _mm512_shuffle_f32x4(sf97, sf98, 68);
__m512 in493 = _mm512_shuffle_f32x4(sf97, sf98, 238);
__m512 sf99 = _mm512_loadu_ps(sfPtr4+3923264+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf100 = _mm512_loadu_ps(sfPtr4+3923392+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in500 = _mm512_shuffle_f32x4(sf99, sf100, 68);
__m512 in501 = _mm512_shuffle_f32x4(sf99, sf100, 238);
__m512 sf101 = _mm512_loadu_ps(sfPtr4+5884672+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf102 = _mm512_loadu_ps(sfPtr4+5884800+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in494 = _mm512_shuffle_f32x4(sf101, sf102, 68);
__m512 in495 = _mm512_shuffle_f32x4(sf101, sf102, 238);
__m512 sf103 = _mm512_loadu_ps(sfPtr4+5884736+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf104 = _mm512_loadu_ps(sfPtr4+5884864+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in502 = _mm512_shuffle_f32x4(sf103, sf104, 68);
__m512 in503 = _mm512_shuffle_f32x4(sf103, sf104, 238);
(void)in503;
__m512 tmp2602 = _mm512_add_ps(in489, in490);
__m512 tmp2622 = _mm512_add_ps(in497, in498);
__m512 tmp2601 = _mm512_add_ps(in491, in492);
__m512 tmp2621 = _mm512_add_ps(in499, in500);
__m512 tmp2607 = _mm512_sub_ps(in491, in492);
__m512 tmp2627 = _mm512_sub_ps(in499, in500);
__m512 tmp2606 = _mm512_sub_ps(in489, in490);
__m512 tmp2626 = _mm512_sub_ps(in497, in498);
__m512 tmp2603 = _mm512_add_ps(in493, in494);
__m512 tmp2623 = _mm512_add_ps(in501, in502);
__m512 tmp2608 = _mm512_sub_ps(in493, in494);
__m512 tmp2628 = _mm512_sub_ps(in501, in502);
__m512 tmp2605 = _mm512_fmadd_ps(tmp2607, _mm512_set1_ps(2e+00f), tmp2606);
__m512 tmp2625 = _mm512_fmadd_ps(tmp2627, _mm512_set1_ps(2e+00f), tmp2626);
__m512 tmp2612 = _mm512_fmadd_ps(tmp2607, _mm512_set1_ps(8e+00f), tmp2606);
__m512 tmp2600 = _mm512_add_ps(tmp2601, tmp2602);
__m512 tmp2620 = _mm512_add_ps(tmp2621, tmp2622);
__m512 tmp2604 = _mm512_fmadd_ps(tmp2608, _mm512_set1_ps(1.6e+01f), tmp2605);
__m512 tmp2624 = _mm512_fmadd_ps(tmp2628, _mm512_set1_ps(1.6e+01f), tmp2625);
__m512 tmp2611 = _mm512_fmadd_ps(tmp2608, _mm512_set1_ps(4e+00f), tmp2612);
__m512 tmp2617 = _mm512_add_ps(tmp2608, tmp2606);
__m512 tmp2610 = _mm512_fmadd_ps(tmp2601, _mm512_set1_ps(4e+00f), tmp2602);
__m512 tmp2630 = _mm512_fmadd_ps(tmp2621, _mm512_set1_ps(4e+00f), tmp2622);
__m512 tmp2614 = _mm512_fmadd_ps(tmp2601, _mm512_set1_ps(1.6e+01f), tmp2602);
__m512 tmp2599 = _mm512_add_ps(tmp2600, in488);
__m512 tmp2619 = _mm512_add_ps(tmp2620, in496);
__m512 tmp2616 = _mm512_add_ps(tmp2617, in495);
__m512 tmp2598 = _mm512_fmadd_ps(tmp2603, _mm512_set1_ps(3.2e+01f), tmp2599);
__m512 tmp2618 = _mm512_fmadd_ps(tmp2623, _mm512_set1_ps(3.2e+01f), tmp2619);
__m512 tmp2609 = _mm512_fmadd_ps(tmp2603, _mm512_set1_ps(8e+00f), tmp2610);
__m512 tmp2629 = _mm512_fmadd_ps(tmp2623, _mm512_set1_ps(8e+00f), tmp2630);
__m512 tmp2615 = _mm512_fmadd_ps(tmp2607, _mm512_set1_ps(3.2e+01f), tmp2616);
__m512 tmp2613 = _mm512_fmadd_ps(tmp2603, _mm512_set1_ps(2e+00f), tmp2614);
__m512 tmp2589 = tmp2598;
__m512 tmp2595 = tmp2618;
__m512 tmp2590 = tmp2604;
__m512 tmp2596 = tmp2624;
__m512 tmp2591 = tmp2609;
__m512 tmp2597 = tmp2629;
__m512 tmp2592 = tmp2611;
__m512 tmp2593 = tmp2613;
__m512 tmp2594 = tmp2615;
__m512 tmp2678 = _mm512_unpacklo_ps(tmp2589, tmp2590);
__m512 tmp2679 = _mm512_unpackhi_ps(tmp2589, tmp2590);
__m512 tmp2680 = _mm512_unpacklo_ps(tmp2591, tmp2592);
__m512 tmp2681 = _mm512_unpackhi_ps(tmp2591, tmp2592);
__m512 tmp2682 = _mm512_unpacklo_ps(tmp2593, tmp2594);
__m512 tmp2683 = _mm512_unpackhi_ps(tmp2593, tmp2594);
__m512 tmp2684 = _mm512_unpacklo_ps(tmp2595, tmp2596);
__m512 tmp2685 = _mm512_unpackhi_ps(tmp2595, tmp2596);
__m512 tmp2686 = _mm512_unpacklo_ps(tmp2597, tmp2597);
__m512 tmp2687 = _mm512_unpackhi_ps(tmp2597, tmp2597);
__m512 tmp2688 = _mm512_shuffle_ps(tmp2678, tmp2680, 68);
__m512 tmp2689 = _mm512_shuffle_ps(tmp2678, tmp2680, 238);
__m512 tmp2690 = _mm512_shuffle_ps(tmp2679, tmp2681, 68);
__m512 tmp2691 = _mm512_shuffle_ps(tmp2679, tmp2681, 238);
__m512 tmp2692 = _mm512_shuffle_ps(tmp2682, tmp2684, 68);
__m512 tmp2693 = _mm512_shuffle_ps(tmp2682, tmp2684, 238);
__m512 tmp2694 = _mm512_shuffle_ps(tmp2683, tmp2685, 68);
__m512 tmp2695 = _mm512_shuffle_ps(tmp2683, tmp2685, 238);
__m512 tmp2696 = _mm512_shuffle_ps(tmp2686, tmp2686, 238);
__m512 tmp2697 = _mm512_shuffle_ps(tmp2687, tmp2687, 238);
__m512 tmp2698 = _mm512_shuffle_f32x4(tmp2688, tmp2692, 136);
__m512 tmp2699 = _mm512_shuffle_f32x4(tmp2688, tmp2692, 221);
__m512 tmp2700 = _mm512_shuffle_f32x4(tmp2689, tmp2693, 136);
__m512 tmp2701 = _mm512_shuffle_f32x4(tmp2689, tmp2693, 221);
__m512 tmp2702 = _mm512_shuffle_f32x4(tmp2690, tmp2694, 136);
__m512 tmp2703 = _mm512_shuffle_f32x4(tmp2690, tmp2694, 221);
__m512 tmp2704 = _mm512_shuffle_f32x4(tmp2691, tmp2695, 136);
__m512 tmp2705 = _mm512_shuffle_f32x4(tmp2691, tmp2695, 221);
__m512 tmp2706 = _mm512_shuffle_f32x4(tmp2686, tmp2686, 136);
__m512 tmp2707 = _mm512_shuffle_f32x4(tmp2686, tmp2686, 221);
__m512 tmp2708 = _mm512_shuffle_f32x4(tmp2696, tmp2696, 136);
__m512 tmp2709 = _mm512_shuffle_f32x4(tmp2696, tmp2696, 221);
__m512 tmp2710 = _mm512_shuffle_f32x4(tmp2687, tmp2687, 136);
__m512 tmp2711 = _mm512_shuffle_f32x4(tmp2687, tmp2687, 221);
__m512 tmp2712 = _mm512_shuffle_f32x4(tmp2697, tmp2697, 136);
__m512 tmp2713 = _mm512_shuffle_f32x4(tmp2697, tmp2697, 221);
tmp2589 = _mm512_shuffle_f32x4(tmp2698, tmp2706, 136);
tmp2597 = _mm512_shuffle_f32x4(tmp2698, tmp2706, 221);
tmp2590 = _mm512_shuffle_f32x4(tmp2700, tmp2708, 136);
__m512 tmp2631 = _mm512_shuffle_f32x4(tmp2700, tmp2708, 221);
tmp2591 = _mm512_shuffle_f32x4(tmp2702, tmp2710, 136);
__m512 tmp2632 = _mm512_shuffle_f32x4(tmp2702, tmp2710, 221);
tmp2592 = _mm512_shuffle_f32x4(tmp2704, tmp2712, 136);
__m512 tmp2633 = _mm512_shuffle_f32x4(tmp2704, tmp2712, 221);
tmp2593 = _mm512_shuffle_f32x4(tmp2699, tmp2707, 136);
__m512 tmp2634 = _mm512_shuffle_f32x4(tmp2699, tmp2707, 221);
tmp2594 = _mm512_shuffle_f32x4(tmp2701, tmp2709, 136);
__m512 tmp2635 = _mm512_shuffle_f32x4(tmp2701, tmp2709, 221);
tmp2595 = _mm512_shuffle_f32x4(tmp2703, tmp2711, 136);
__m512 tmp2636 = _mm512_shuffle_f32x4(tmp2703, tmp2711, 221);
tmp2596 = _mm512_shuffle_f32x4(tmp2705, tmp2713, 136);
__m512 tmp2637 = _mm512_shuffle_f32x4(tmp2705, tmp2713, 221);
__m512 tmp2642 = _mm512_add_ps(tmp2590, tmp2591);
__m512 tmp2662 = _mm512_add_ps(tmp2631, tmp2632);
__m512 tmp2641 = _mm512_add_ps(tmp2592, tmp2593);
__m512 tmp2661 = _mm512_add_ps(tmp2633, tmp2634);
__m512 tmp2647 = _mm512_sub_ps(tmp2592, tmp2593);
__m512 tmp2667 = _mm512_sub_ps(tmp2633, tmp2634);
__m512 tmp2646 = _mm512_sub_ps(tmp2590, tmp2591);
__m512 tmp2666 = _mm512_sub_ps(tmp2631, tmp2632);
__m512 tmp2643 = _mm512_add_ps(tmp2594, tmp2595);
__m512 tmp2663 = _mm512_add_ps(tmp2635, tmp2636);
__m512 tmp2648 = _mm512_sub_ps(tmp2594, tmp2595);
__m512 tmp2668 = _mm512_sub_ps(tmp2635, tmp2636);
__m512 tmp2645 = _mm512_fmadd_ps(tmp2647, _mm512_set1_ps(2e+00f), tmp2646);
__m512 tmp2665 = _mm512_fmadd_ps(tmp2667, _mm512_set1_ps(2e+00f), tmp2666);
__m512 tmp2652 = _mm512_fmadd_ps(tmp2647, _mm512_set1_ps(8e+00f), tmp2646);
__m512 tmp2672 = _mm512_fmadd_ps(tmp2667, _mm512_set1_ps(8e+00f), tmp2666);
__m512 tmp2640 = _mm512_add_ps(tmp2641, tmp2642);
__m512 tmp2660 = _mm512_add_ps(tmp2661, tmp2662);
__m512 tmp2644 = _mm512_fmadd_ps(tmp2648, _mm512_set1_ps(1.6e+01f), tmp2645);
__m512 tmp2664 = _mm512_fmadd_ps(tmp2668, _mm512_set1_ps(1.6e+01f), tmp2665);
__m512 tmp2651 = _mm512_fmadd_ps(tmp2648, _mm512_set1_ps(4e+00f), tmp2652);
__m512 tmp2671 = _mm512_fmadd_ps(tmp2668, _mm512_set1_ps(4e+00f), tmp2672);
__m512 tmp2657 = _mm512_add_ps(tmp2648, tmp2646);
__m512 tmp2677 = _mm512_add_ps(tmp2668, tmp2666);
__m512 tmp2650 = _mm512_fmadd_ps(tmp2641, _mm512_set1_ps(4e+00f), tmp2642);
__m512 tmp2670 = _mm512_fmadd_ps(tmp2661, _mm512_set1_ps(4e+00f), tmp2662);
__m512 tmp2654 = _mm512_fmadd_ps(tmp2641, _mm512_set1_ps(1.6e+01f), tmp2642);
__m512 tmp2674 = _mm512_fmadd_ps(tmp2661, _mm512_set1_ps(1.6e+01f), tmp2662);
__m512 tmp2639 = _mm512_add_ps(tmp2640, tmp2589);
__m512 tmp2659 = _mm512_add_ps(tmp2660, tmp2597);
__m512 tmp2656 = _mm512_add_ps(tmp2657, tmp2596);
__m512 tmp2676 = _mm512_add_ps(tmp2677, tmp2637);
__m512 tmp2638 = _mm512_fmadd_ps(tmp2643, _mm512_set1_ps(3.2e+01f), tmp2639);
__m512 tmp2658 = _mm512_fmadd_ps(tmp2663, _mm512_set1_ps(3.2e+01f), tmp2659);
__m512 tmp2649 = _mm512_fmadd_ps(tmp2643, _mm512_set1_ps(8e+00f), tmp2650);
__m512 tmp2669 = _mm512_fmadd_ps(tmp2663, _mm512_set1_ps(8e+00f), tmp2670);
__m512 tmp2655 = _mm512_fmadd_ps(tmp2647, _mm512_set1_ps(3.2e+01f), tmp2656);
__m512 tmp2675 = _mm512_fmadd_ps(tmp2667, _mm512_set1_ps(3.2e+01f), tmp2676);
__m512 tmp2653 = _mm512_fmadd_ps(tmp2643, _mm512_set1_ps(2e+00f), tmp2654);
__m512 tmp2673 = _mm512_fmadd_ps(tmp2663, _mm512_set1_ps(2e+00f), tmp2674);
__m512 out525 = tmp2638;
__m512 out531 = tmp2658;
__m512 out526 = tmp2644;
__m512 out532 = tmp2664;
__m512 out527 = tmp2649;
__m512 out533 = tmp2669;
__m512 out528 = tmp2651;
__m512 out534 = tmp2671;
__m512 out529 = tmp2653;
__m512 out535 = tmp2673;
__m512 out530 = tmp2655;
__m512 out536 = tmp2675;
_mm512_mask_storeu_ps(datPtr3+504+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out525);
_mm512_mask_storeu_ps(datPtr3+3024+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out531);
_mm512_mask_storeu_ps(datPtr3+588+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out526);
_mm512_mask_storeu_ps(datPtr3+3108+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out532);
_mm512_mask_storeu_ps(datPtr3+672+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out527);
_mm512_mask_storeu_ps(datPtr3+3192+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out533);
_mm512_mask_storeu_ps(datPtr3+756+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out528);
_mm512_mask_storeu_ps(datPtr3+3276+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out534);
_mm512_mask_storeu_ps(datPtr3+840+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out529);
_mm512_mask_storeu_ps(datPtr3+3360+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out535);
_mm512_mask_storeu_ps(datPtr3+924+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out530);
_mm512_mask_storeu_ps(datPtr3+3444+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out536);
__m512 sf105 = _mm512_loadu_ps(sfPtr4+512+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf106 = _mm512_loadu_ps(sfPtr4+640+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in504 = _mm512_shuffle_f32x4(sf105, sf106, 68);
__m512 in505 = _mm512_shuffle_f32x4(sf105, sf106, 238);
__m512 sf107 = _mm512_loadu_ps(sfPtr4+576+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf108 = _mm512_loadu_ps(sfPtr4+704+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in512 = _mm512_shuffle_f32x4(sf107, sf108, 68);
__m512 in513 = _mm512_shuffle_f32x4(sf107, sf108, 238);
__m512 sf109 = _mm512_loadu_ps(sfPtr4+1961984+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf110 = _mm512_loadu_ps(sfPtr4+1962112+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in506 = _mm512_shuffle_f32x4(sf109, sf110, 68);
__m512 in507 = _mm512_shuffle_f32x4(sf109, sf110, 238);
__m512 sf111 = _mm512_loadu_ps(sfPtr4+1962048+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf112 = _mm512_loadu_ps(sfPtr4+1962176+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in514 = _mm512_shuffle_f32x4(sf111, sf112, 68);
__m512 in515 = _mm512_shuffle_f32x4(sf111, sf112, 238);
__m512 sf113 = _mm512_loadu_ps(sfPtr4+3923456+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf114 = _mm512_loadu_ps(sfPtr4+3923584+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in508 = _mm512_shuffle_f32x4(sf113, sf114, 68);
__m512 in509 = _mm512_shuffle_f32x4(sf113, sf114, 238);
__m512 sf115 = _mm512_loadu_ps(sfPtr4+3923520+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf116 = _mm512_loadu_ps(sfPtr4+3923648+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in516 = _mm512_shuffle_f32x4(sf115, sf116, 68);
__m512 in517 = _mm512_shuffle_f32x4(sf115, sf116, 238);
__m512 sf117 = _mm512_loadu_ps(sfPtr4+5884928+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf118 = _mm512_loadu_ps(sfPtr4+5885056+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in510 = _mm512_shuffle_f32x4(sf117, sf118, 68);
__m512 in511 = _mm512_shuffle_f32x4(sf117, sf118, 238);
__m512 sf119 = _mm512_loadu_ps(sfPtr4+5884992+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 sf120 = _mm512_loadu_ps(sfPtr4+5885120+7845888*i12+490368*j8+1536*k15+768*l6);
__m512 in518 = _mm512_shuffle_f32x4(sf119, sf120, 68);
__m512 in519 = _mm512_shuffle_f32x4(sf119, sf120, 238);
__m512 tmp2730 = _mm512_add_ps(in505, in506);
__m512 tmp2750 = _mm512_add_ps(in513, in514);
__m512 tmp2729 = _mm512_add_ps(in507, in508);
__m512 tmp2749 = _mm512_add_ps(in515, in516);
__m512 tmp2735 = _mm512_sub_ps(in507, in508);
__m512 tmp2755 = _mm512_sub_ps(in515, in516);
__m512 tmp2734 = _mm512_sub_ps(in505, in506);
__m512 tmp2754 = _mm512_sub_ps(in513, in514);
__m512 tmp2731 = _mm512_add_ps(in509, in510);
__m512 tmp2751 = _mm512_add_ps(in517, in518);
__m512 tmp2736 = _mm512_sub_ps(in509, in510);
__m512 tmp2756 = _mm512_sub_ps(in517, in518);
__m512 tmp2733 = _mm512_fmadd_ps(tmp2735, _mm512_set1_ps(2e+00f), tmp2734);
__m512 tmp2753 = _mm512_fmadd_ps(tmp2755, _mm512_set1_ps(2e+00f), tmp2754);
__m512 tmp2740 = _mm512_fmadd_ps(tmp2735, _mm512_set1_ps(8e+00f), tmp2734);
__m512 tmp2760 = _mm512_fmadd_ps(tmp2755, _mm512_set1_ps(8e+00f), tmp2754);
__m512 tmp2728 = _mm512_add_ps(tmp2729, tmp2730);
__m512 tmp2748 = _mm512_add_ps(tmp2749, tmp2750);
__m512 tmp2732 = _mm512_fmadd_ps(tmp2736, _mm512_set1_ps(1.6e+01f), tmp2733);
__m512 tmp2752 = _mm512_fmadd_ps(tmp2756, _mm512_set1_ps(1.6e+01f), tmp2753);
__m512 tmp2739 = _mm512_fmadd_ps(tmp2736, _mm512_set1_ps(4e+00f), tmp2740);
__m512 tmp2759 = _mm512_fmadd_ps(tmp2756, _mm512_set1_ps(4e+00f), tmp2760);
__m512 tmp2745 = _mm512_add_ps(tmp2736, tmp2734);
__m512 tmp2765 = _mm512_add_ps(tmp2756, tmp2754);
__m512 tmp2738 = _mm512_fmadd_ps(tmp2729, _mm512_set1_ps(4e+00f), tmp2730);
__m512 tmp2758 = _mm512_fmadd_ps(tmp2749, _mm512_set1_ps(4e+00f), tmp2750);
__m512 tmp2742 = _mm512_fmadd_ps(tmp2729, _mm512_set1_ps(1.6e+01f), tmp2730);
__m512 tmp2762 = _mm512_fmadd_ps(tmp2749, _mm512_set1_ps(1.6e+01f), tmp2750);
__m512 tmp2727 = _mm512_add_ps(tmp2728, in504);
__m512 tmp2747 = _mm512_add_ps(tmp2748, in512);
__m512 tmp2744 = _mm512_add_ps(tmp2745, in511);
__m512 tmp2764 = _mm512_add_ps(tmp2765, in519);
__m512 tmp2726 = _mm512_fmadd_ps(tmp2731, _mm512_set1_ps(3.2e+01f), tmp2727);
__m512 tmp2746 = _mm512_fmadd_ps(tmp2751, _mm512_set1_ps(3.2e+01f), tmp2747);
__m512 tmp2737 = _mm512_fmadd_ps(tmp2731, _mm512_set1_ps(8e+00f), tmp2738);
__m512 tmp2757 = _mm512_fmadd_ps(tmp2751, _mm512_set1_ps(8e+00f), tmp2758);
__m512 tmp2743 = _mm512_fmadd_ps(tmp2735, _mm512_set1_ps(3.2e+01f), tmp2744);
__m512 tmp2763 = _mm512_fmadd_ps(tmp2755, _mm512_set1_ps(3.2e+01f), tmp2764);
__m512 tmp2741 = _mm512_fmadd_ps(tmp2731, _mm512_set1_ps(2e+00f), tmp2742);
__m512 tmp2761 = _mm512_fmadd_ps(tmp2751, _mm512_set1_ps(2e+00f), tmp2762);
__m512 tmp2714 = tmp2726;
__m512 tmp2720 = tmp2746;
__m512 tmp2715 = tmp2732;
__m512 tmp2721 = tmp2752;
__m512 tmp2716 = tmp2737;
__m512 tmp2722 = tmp2757;
__m512 tmp2717 = tmp2739;
__m512 tmp2723 = tmp2759;
__m512 tmp2718 = tmp2741;
__m512 tmp2724 = tmp2761;
__m512 tmp2719 = tmp2743;
__m512 tmp2725 = tmp2763;
__m512 tmp2810 = _mm512_unpacklo_ps(tmp2714, tmp2715);
__m512 tmp2811 = _mm512_unpackhi_ps(tmp2714, tmp2715);
__m512 tmp2812 = _mm512_unpacklo_ps(tmp2716, tmp2717);
__m512 tmp2813 = _mm512_unpackhi_ps(tmp2716, tmp2717);
__m512 tmp2814 = _mm512_unpacklo_ps(tmp2718, tmp2719);
__m512 tmp2815 = _mm512_unpackhi_ps(tmp2718, tmp2719);
__m512 tmp2816 = _mm512_unpacklo_ps(tmp2720, tmp2721);
__m512 tmp2817 = _mm512_unpackhi_ps(tmp2720, tmp2721);
__m512 tmp2818 = _mm512_unpacklo_ps(tmp2722, tmp2723);
__m512 tmp2819 = _mm512_unpackhi_ps(tmp2722, tmp2723);
__m512 tmp2820 = _mm512_unpacklo_ps(tmp2724, tmp2725);
__m512 tmp2821 = _mm512_unpackhi_ps(tmp2724, tmp2725);
__m512 tmp2822 = _mm512_shuffle_ps(tmp2810, tmp2812, 68);
__m512 tmp2823 = _mm512_shuffle_ps(tmp2810, tmp2812, 238);
__m512 tmp2824 = _mm512_shuffle_ps(tmp2811, tmp2813, 68);
__m512 tmp2825 = _mm512_shuffle_ps(tmp2811, tmp2813, 238);
__m512 tmp2826 = _mm512_shuffle_ps(tmp2814, tmp2816, 68);
__m512 tmp2827 = _mm512_shuffle_ps(tmp2814, tmp2816, 238);
__m512 tmp2828 = _mm512_shuffle_ps(tmp2815, tmp2817, 68);
__m512 tmp2829 = _mm512_shuffle_ps(tmp2815, tmp2817, 238);
__m512 tmp2830 = _mm512_shuffle_ps(tmp2818, tmp2820, 68);
__m512 tmp2831 = _mm512_shuffle_ps(tmp2818, tmp2820, 238);
__m512 tmp2832 = _mm512_shuffle_ps(tmp2819, tmp2821, 68);
__m512 tmp2833 = _mm512_shuffle_ps(tmp2819, tmp2821, 238);
__m512 tmp2834 = _mm512_shuffle_f32x4(tmp2822, tmp2826, 136);
__m512 tmp2835 = _mm512_shuffle_f32x4(tmp2822, tmp2826, 221);
__m512 tmp2836 = _mm512_shuffle_f32x4(tmp2823, tmp2827, 136);
__m512 tmp2837 = _mm512_shuffle_f32x4(tmp2823, tmp2827, 221);
__m512 tmp2838 = _mm512_shuffle_f32x4(tmp2824, tmp2828, 136);
__m512 tmp2839 = _mm512_shuffle_f32x4(tmp2824, tmp2828, 221);
__m512 tmp2840 = _mm512_shuffle_f32x4(tmp2825, tmp2829, 136);
__m512 tmp2841 = _mm512_shuffle_f32x4(tmp2825, tmp2829, 221);
__m512 tmp2842 = _mm512_shuffle_f32x4(tmp2830, tmp2830, 136);
__m512 tmp2843 = _mm512_shuffle_f32x4(tmp2830, tmp2830, 221);
__m512 tmp2844 = _mm512_shuffle_f32x4(tmp2831, tmp2831, 136);
__m512 tmp2845 = _mm512_shuffle_f32x4(tmp2831, tmp2831, 221);
__m512 tmp2846 = _mm512_shuffle_f32x4(tmp2832, tmp2832, 136);
__m512 tmp2847 = _mm512_shuffle_f32x4(tmp2832, tmp2832, 221);
__m512 tmp2848 = _mm512_shuffle_f32x4(tmp2833, tmp2833, 136);
__m512 tmp2849 = _mm512_shuffle_f32x4(tmp2833, tmp2833, 221);
tmp2714 = _mm512_shuffle_f32x4(tmp2834, tmp2842, 136);
tmp2722 = _mm512_shuffle_f32x4(tmp2834, tmp2842, 221);
tmp2715 = _mm512_shuffle_f32x4(tmp2836, tmp2844, 136);
tmp2723 = _mm512_shuffle_f32x4(tmp2836, tmp2844, 221);
tmp2716 = _mm512_shuffle_f32x4(tmp2838, tmp2846, 136);
tmp2724 = _mm512_shuffle_f32x4(tmp2838, tmp2846, 221);
tmp2717 = _mm512_shuffle_f32x4(tmp2840, tmp2848, 136);
tmp2725 = _mm512_shuffle_f32x4(tmp2840, tmp2848, 221);
tmp2718 = _mm512_shuffle_f32x4(tmp2835, tmp2843, 136);
__m512 tmp2766 = _mm512_shuffle_f32x4(tmp2835, tmp2843, 221);
tmp2719 = _mm512_shuffle_f32x4(tmp2837, tmp2845, 136);
__m512 tmp2767 = _mm512_shuffle_f32x4(tmp2837, tmp2845, 221);
tmp2720 = _mm512_shuffle_f32x4(tmp2839, tmp2847, 136);
__m512 tmp2768 = _mm512_shuffle_f32x4(tmp2839, tmp2847, 221);
tmp2721 = _mm512_shuffle_f32x4(tmp2841, tmp2849, 136);
__m512 tmp2769 = _mm512_shuffle_f32x4(tmp2841, tmp2849, 221);
__m512 tmp2774 = _mm512_add_ps(tmp2715, tmp2716);
__m512 tmp2794 = _mm512_add_ps(tmp2723, tmp2724);
__m512 tmp2773 = _mm512_add_ps(tmp2717, tmp2718);
__m512 tmp2793 = _mm512_add_ps(tmp2725, tmp2766);
__m512 tmp2779 = _mm512_sub_ps(tmp2717, tmp2718);
__m512 tmp2799 = _mm512_sub_ps(tmp2725, tmp2766);
__m512 tmp2778 = _mm512_sub_ps(tmp2715, tmp2716);
__m512 tmp2798 = _mm512_sub_ps(tmp2723, tmp2724);
__m512 tmp2775 = _mm512_add_ps(tmp2719, tmp2720);
__m512 tmp2795 = _mm512_add_ps(tmp2767, tmp2768);
__m512 tmp2780 = _mm512_sub_ps(tmp2719, tmp2720);
__m512 tmp2800 = _mm512_sub_ps(tmp2767, tmp2768);
__m512 tmp2777 = _mm512_fmadd_ps(tmp2779, _mm512_set1_ps(2e+00f), tmp2778);
__m512 tmp2797 = _mm512_fmadd_ps(tmp2799, _mm512_set1_ps(2e+00f), tmp2798);
__m512 tmp2784 = _mm512_fmadd_ps(tmp2779, _mm512_set1_ps(8e+00f), tmp2778);
__m512 tmp2804 = _mm512_fmadd_ps(tmp2799, _mm512_set1_ps(8e+00f), tmp2798);
__m512 tmp2772 = _mm512_add_ps(tmp2773, tmp2774);
__m512 tmp2792 = _mm512_add_ps(tmp2793, tmp2794);
__m512 tmp2776 = _mm512_fmadd_ps(tmp2780, _mm512_set1_ps(1.6e+01f), tmp2777);
__m512 tmp2796 = _mm512_fmadd_ps(tmp2800, _mm512_set1_ps(1.6e+01f), tmp2797);
__m512 tmp2783 = _mm512_fmadd_ps(tmp2780, _mm512_set1_ps(4e+00f), tmp2784);
__m512 tmp2803 = _mm512_fmadd_ps(tmp2800, _mm512_set1_ps(4e+00f), tmp2804);
__m512 tmp2789 = _mm512_add_ps(tmp2780, tmp2778);
__m512 tmp2809 = _mm512_add_ps(tmp2800, tmp2798);
__m512 tmp2782 = _mm512_fmadd_ps(tmp2773, _mm512_set1_ps(4e+00f), tmp2774);
__m512 tmp2802 = _mm512_fmadd_ps(tmp2793, _mm512_set1_ps(4e+00f), tmp2794);
__m512 tmp2786 = _mm512_fmadd_ps(tmp2773, _mm512_set1_ps(1.6e+01f), tmp2774);
__m512 tmp2806 = _mm512_fmadd_ps(tmp2793, _mm512_set1_ps(1.6e+01f), tmp2794);
__m512 tmp2771 = _mm512_add_ps(tmp2772, tmp2714);
__m512 tmp2791 = _mm512_add_ps(tmp2792, tmp2722);
__m512 tmp2788 = _mm512_add_ps(tmp2789, tmp2721);
__m512 tmp2808 = _mm512_add_ps(tmp2809, tmp2769);
__m512 tmp2770 = _mm512_fmadd_ps(tmp2775, _mm512_set1_ps(3.2e+01f), tmp2771);
__m512 tmp2790 = _mm512_fmadd_ps(tmp2795, _mm512_set1_ps(3.2e+01f), tmp2791);
__m512 tmp2781 = _mm512_fmadd_ps(tmp2775, _mm512_set1_ps(8e+00f), tmp2782);
__m512 tmp2801 = _mm512_fmadd_ps(tmp2795, _mm512_set1_ps(8e+00f), tmp2802);
__m512 tmp2787 = _mm512_fmadd_ps(tmp2779, _mm512_set1_ps(3.2e+01f), tmp2788);
__m512 tmp2807 = _mm512_fmadd_ps(tmp2799, _mm512_set1_ps(3.2e+01f), tmp2808);
__m512 tmp2785 = _mm512_fmadd_ps(tmp2775, _mm512_set1_ps(2e+00f), tmp2786);
__m512 tmp2805 = _mm512_fmadd_ps(tmp2795, _mm512_set1_ps(2e+00f), tmp2806);
__m512 out537 = tmp2770;
__m512 out543 = tmp2790;
__m512 out538 = tmp2776;
__m512 out544 = tmp2796;
__m512 out539 = tmp2781;
__m512 out545 = tmp2801;
__m512 out540 = tmp2783;
__m512 out546 = tmp2803;
__m512 out541 = tmp2785;
__m512 out547 = tmp2805;
__m512 out542 = tmp2787;
__m512 out548 = tmp2807;
_mm512_mask_storeu_ps(datPtr3+3480+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 4095, out537);
_mm512_mask_storeu_ps(datPtr3+3528+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out543);
_mm512_mask_storeu_ps(datPtr3+3564+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 4095, out538);
_mm512_mask_storeu_ps(datPtr3+3612+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out544);
_mm512_mask_storeu_ps(datPtr3+3648+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 4095, out539);
_mm512_mask_storeu_ps(datPtr3+3696+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out545);
_mm512_mask_storeu_ps(datPtr3+3732+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 4095, out540);
_mm512_mask_storeu_ps(datPtr3+3780+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out546);
_mm512_mask_storeu_ps(datPtr3+3816+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 4095, out541);
_mm512_mask_storeu_ps(datPtr3+3864+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out547);
_mm512_mask_storeu_ps(datPtr3+3900+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 4095, out542);
_mm512_mask_storeu_ps(datPtr3+3948+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l6, 511, out548);
}
if (k15 >= kk5) return;
}
ptrdiff_t l7 = 0;
__m512 sf121 = _mm512_loadu_ps(sfPtr4+0+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 sf122 = _mm512_loadu_ps(sfPtr4+128+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 in520 = _mm512_shuffle_f32x4(sf121, sf122, 68);
__m512 in521 = _mm512_shuffle_f32x4(sf121, sf122, 238);
__m512 sf123 = _mm512_loadu_ps(sfPtr4+64+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 sf124 = _mm512_loadu_ps(sfPtr4+192+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 in528 = _mm512_shuffle_f32x4(sf123, sf124, 68);
__m512 in529 = _mm512_shuffle_f32x4(sf123, sf124, 238);
__m512 sf125 = _mm512_loadu_ps(sfPtr4+1961472+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 sf126 = _mm512_loadu_ps(sfPtr4+1961600+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 in522 = _mm512_shuffle_f32x4(sf125, sf126, 68);
__m512 in523 = _mm512_shuffle_f32x4(sf125, sf126, 238);
__m512 sf127 = _mm512_loadu_ps(sfPtr4+1961536+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 sf128 = _mm512_loadu_ps(sfPtr4+1961664+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 in530 = _mm512_shuffle_f32x4(sf127, sf128, 68);
__m512 in531 = _mm512_shuffle_f32x4(sf127, sf128, 238);
__m512 sf129 = _mm512_loadu_ps(sfPtr4+3922944+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 sf130 = _mm512_loadu_ps(sfPtr4+3923072+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 in524 = _mm512_shuffle_f32x4(sf129, sf130, 68);
__m512 in525 = _mm512_shuffle_f32x4(sf129, sf130, 238);
__m512 sf131 = _mm512_loadu_ps(sfPtr4+3923008+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 sf132 = _mm512_loadu_ps(sfPtr4+3923136+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 in532 = _mm512_shuffle_f32x4(sf131, sf132, 68);
__m512 in533 = _mm512_shuffle_f32x4(sf131, sf132, 238);
__m512 sf133 = _mm512_loadu_ps(sfPtr4+5884416+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 sf134 = _mm512_loadu_ps(sfPtr4+5884544+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 in526 = _mm512_shuffle_f32x4(sf133, sf134, 68);
__m512 in527 = _mm512_shuffle_f32x4(sf133, sf134, 238);
__m512 sf135 = _mm512_loadu_ps(sfPtr4+5884480+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 sf136 = _mm512_loadu_ps(sfPtr4+5884608+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 in534 = _mm512_shuffle_f32x4(sf135, sf136, 68);
__m512 in535 = _mm512_shuffle_f32x4(sf135, sf136, 238);
__m512 tmp2866 = _mm512_add_ps(in521, in522);
__m512 tmp2886 = _mm512_add_ps(in529, in530);
__m512 tmp2865 = _mm512_add_ps(in523, in524);
__m512 tmp2885 = _mm512_add_ps(in531, in532);
__m512 tmp2871 = _mm512_sub_ps(in523, in524);
__m512 tmp2891 = _mm512_sub_ps(in531, in532);
__m512 tmp2870 = _mm512_sub_ps(in521, in522);
__m512 tmp2890 = _mm512_sub_ps(in529, in530);
__m512 tmp2867 = _mm512_add_ps(in525, in526);
__m512 tmp2887 = _mm512_add_ps(in533, in534);
__m512 tmp2872 = _mm512_sub_ps(in525, in526);
__m512 tmp2892 = _mm512_sub_ps(in533, in534);
__m512 tmp2869 = _mm512_fmadd_ps(tmp2871, _mm512_set1_ps(2e+00f), tmp2870);
__m512 tmp2889 = _mm512_fmadd_ps(tmp2891, _mm512_set1_ps(2e+00f), tmp2890);
__m512 tmp2876 = _mm512_fmadd_ps(tmp2871, _mm512_set1_ps(8e+00f), tmp2870);
__m512 tmp2896 = _mm512_fmadd_ps(tmp2891, _mm512_set1_ps(8e+00f), tmp2890);
__m512 tmp2864 = _mm512_add_ps(tmp2865, tmp2866);
__m512 tmp2884 = _mm512_add_ps(tmp2885, tmp2886);
__m512 tmp2868 = _mm512_fmadd_ps(tmp2872, _mm512_set1_ps(1.6e+01f), tmp2869);
__m512 tmp2888 = _mm512_fmadd_ps(tmp2892, _mm512_set1_ps(1.6e+01f), tmp2889);
__m512 tmp2875 = _mm512_fmadd_ps(tmp2872, _mm512_set1_ps(4e+00f), tmp2876);
__m512 tmp2895 = _mm512_fmadd_ps(tmp2892, _mm512_set1_ps(4e+00f), tmp2896);
__m512 tmp2881 = _mm512_add_ps(tmp2872, tmp2870);
__m512 tmp2901 = _mm512_add_ps(tmp2892, tmp2890);
__m512 tmp2874 = _mm512_fmadd_ps(tmp2865, _mm512_set1_ps(4e+00f), tmp2866);
__m512 tmp2894 = _mm512_fmadd_ps(tmp2885, _mm512_set1_ps(4e+00f), tmp2886);
__m512 tmp2878 = _mm512_fmadd_ps(tmp2865, _mm512_set1_ps(1.6e+01f), tmp2866);
__m512 tmp2898 = _mm512_fmadd_ps(tmp2885, _mm512_set1_ps(1.6e+01f), tmp2886);
__m512 tmp2863 = _mm512_add_ps(tmp2864, in520);
__m512 tmp2883 = _mm512_add_ps(tmp2884, in528);
__m512 tmp2880 = _mm512_add_ps(tmp2881, in527);
__m512 tmp2900 = _mm512_add_ps(tmp2901, in535);
__m512 tmp2862 = _mm512_fmadd_ps(tmp2867, _mm512_set1_ps(3.2e+01f), tmp2863);
__m512 tmp2882 = _mm512_fmadd_ps(tmp2887, _mm512_set1_ps(3.2e+01f), tmp2883);
__m512 tmp2873 = _mm512_fmadd_ps(tmp2867, _mm512_set1_ps(8e+00f), tmp2874);
__m512 tmp2893 = _mm512_fmadd_ps(tmp2887, _mm512_set1_ps(8e+00f), tmp2894);
__m512 tmp2879 = _mm512_fmadd_ps(tmp2871, _mm512_set1_ps(3.2e+01f), tmp2880);
__m512 tmp2899 = _mm512_fmadd_ps(tmp2891, _mm512_set1_ps(3.2e+01f), tmp2900);
__m512 tmp2877 = _mm512_fmadd_ps(tmp2867, _mm512_set1_ps(2e+00f), tmp2878);
__m512 tmp2897 = _mm512_fmadd_ps(tmp2887, _mm512_set1_ps(2e+00f), tmp2898);
__m512 tmp2850 = tmp2862;
__m512 tmp2856 = tmp2882;
__m512 tmp2851 = tmp2868;
__m512 tmp2857 = tmp2888;
__m512 tmp2852 = tmp2873;
__m512 tmp2858 = tmp2893;
__m512 tmp2853 = tmp2875;
__m512 tmp2859 = tmp2895;
__m512 tmp2854 = tmp2877;
__m512 tmp2860 = tmp2897;
__m512 tmp2855 = tmp2879;
__m512 tmp2861 = tmp2899;
__m512 tmp2946 = _mm512_unpacklo_ps(tmp2850, tmp2851);
__m512 tmp2947 = _mm512_unpackhi_ps(tmp2850, tmp2851);
__m512 tmp2948 = _mm512_unpacklo_ps(tmp2852, tmp2853);
__m512 tmp2949 = _mm512_unpackhi_ps(tmp2852, tmp2853);
__m512 tmp2950 = _mm512_unpacklo_ps(tmp2854, tmp2855);
__m512 tmp2951 = _mm512_unpackhi_ps(tmp2854, tmp2855);
__m512 tmp2952 = _mm512_unpacklo_ps(tmp2856, tmp2857);
__m512 tmp2953 = _mm512_unpackhi_ps(tmp2856, tmp2857);
__m512 tmp2954 = _mm512_unpacklo_ps(tmp2858, tmp2859);
__m512 tmp2955 = _mm512_unpackhi_ps(tmp2858, tmp2859);
__m512 tmp2956 = _mm512_unpacklo_ps(tmp2860, tmp2861);
__m512 tmp2957 = _mm512_unpackhi_ps(tmp2860, tmp2861);
__m512 tmp2958 = _mm512_shuffle_ps(tmp2946, tmp2948, 68);
__m512 tmp2959 = _mm512_shuffle_ps(tmp2946, tmp2948, 238);
__m512 tmp2960 = _mm512_shuffle_ps(tmp2947, tmp2949, 68);
__m512 tmp2961 = _mm512_shuffle_ps(tmp2947, tmp2949, 238);
__m512 tmp2962 = _mm512_shuffle_ps(tmp2950, tmp2952, 68);
__m512 tmp2963 = _mm512_shuffle_ps(tmp2950, tmp2952, 238);
__m512 tmp2964 = _mm512_shuffle_ps(tmp2951, tmp2953, 68);
__m512 tmp2965 = _mm512_shuffle_ps(tmp2951, tmp2953, 238);
__m512 tmp2966 = _mm512_shuffle_ps(tmp2954, tmp2956, 68);
__m512 tmp2967 = _mm512_shuffle_ps(tmp2954, tmp2956, 238);
__m512 tmp2968 = _mm512_shuffle_ps(tmp2955, tmp2957, 68);
__m512 tmp2969 = _mm512_shuffle_ps(tmp2955, tmp2957, 238);
__m512 tmp2970 = _mm512_shuffle_f32x4(tmp2958, tmp2962, 136);
__m512 tmp2971 = _mm512_shuffle_f32x4(tmp2958, tmp2962, 221);
__m512 tmp2972 = _mm512_shuffle_f32x4(tmp2959, tmp2963, 136);
__m512 tmp2973 = _mm512_shuffle_f32x4(tmp2959, tmp2963, 221);
__m512 tmp2974 = _mm512_shuffle_f32x4(tmp2960, tmp2964, 136);
__m512 tmp2975 = _mm512_shuffle_f32x4(tmp2960, tmp2964, 221);
__m512 tmp2976 = _mm512_shuffle_f32x4(tmp2961, tmp2965, 136);
__m512 tmp2977 = _mm512_shuffle_f32x4(tmp2961, tmp2965, 221);
__m512 tmp2978 = _mm512_shuffle_f32x4(tmp2966, tmp2966, 136);
__m512 tmp2979 = _mm512_shuffle_f32x4(tmp2966, tmp2966, 221);
__m512 tmp2980 = _mm512_shuffle_f32x4(tmp2967, tmp2967, 136);
__m512 tmp2981 = _mm512_shuffle_f32x4(tmp2967, tmp2967, 221);
__m512 tmp2982 = _mm512_shuffle_f32x4(tmp2968, tmp2968, 136);
__m512 tmp2983 = _mm512_shuffle_f32x4(tmp2968, tmp2968, 221);
__m512 tmp2984 = _mm512_shuffle_f32x4(tmp2969, tmp2969, 136);
__m512 tmp2985 = _mm512_shuffle_f32x4(tmp2969, tmp2969, 221);
tmp2850 = _mm512_shuffle_f32x4(tmp2970, tmp2978, 136);
tmp2858 = _mm512_shuffle_f32x4(tmp2970, tmp2978, 221);
tmp2851 = _mm512_shuffle_f32x4(tmp2972, tmp2980, 136);
tmp2859 = _mm512_shuffle_f32x4(tmp2972, tmp2980, 221);
tmp2852 = _mm512_shuffle_f32x4(tmp2974, tmp2982, 136);
tmp2860 = _mm512_shuffle_f32x4(tmp2974, tmp2982, 221);
tmp2853 = _mm512_shuffle_f32x4(tmp2976, tmp2984, 136);
tmp2861 = _mm512_shuffle_f32x4(tmp2976, tmp2984, 221);
tmp2854 = _mm512_shuffle_f32x4(tmp2971, tmp2979, 136);
__m512 tmp2902 = _mm512_shuffle_f32x4(tmp2971, tmp2979, 221);
tmp2855 = _mm512_shuffle_f32x4(tmp2973, tmp2981, 136);
__m512 tmp2903 = _mm512_shuffle_f32x4(tmp2973, tmp2981, 221);
tmp2856 = _mm512_shuffle_f32x4(tmp2975, tmp2983, 136);
__m512 tmp2904 = _mm512_shuffle_f32x4(tmp2975, tmp2983, 221);
tmp2857 = _mm512_shuffle_f32x4(tmp2977, tmp2985, 136);
__m512 tmp2905 = _mm512_shuffle_f32x4(tmp2977, tmp2985, 221);
__m512 tmp2910 = _mm512_add_ps(tmp2851, tmp2852);
__m512 tmp2930 = _mm512_add_ps(tmp2859, tmp2860);
__m512 tmp2909 = _mm512_add_ps(tmp2853, tmp2854);
__m512 tmp2929 = _mm512_add_ps(tmp2861, tmp2902);
__m512 tmp2915 = _mm512_sub_ps(tmp2853, tmp2854);
__m512 tmp2935 = _mm512_sub_ps(tmp2861, tmp2902);
__m512 tmp2914 = _mm512_sub_ps(tmp2851, tmp2852);
__m512 tmp2934 = _mm512_sub_ps(tmp2859, tmp2860);
__m512 tmp2911 = _mm512_add_ps(tmp2855, tmp2856);
__m512 tmp2931 = _mm512_add_ps(tmp2903, tmp2904);
__m512 tmp2916 = _mm512_sub_ps(tmp2855, tmp2856);
__m512 tmp2936 = _mm512_sub_ps(tmp2903, tmp2904);
__m512 tmp2913 = _mm512_fmadd_ps(tmp2915, _mm512_set1_ps(2e+00f), tmp2914);
__m512 tmp2933 = _mm512_fmadd_ps(tmp2935, _mm512_set1_ps(2e+00f), tmp2934);
__m512 tmp2920 = _mm512_fmadd_ps(tmp2915, _mm512_set1_ps(8e+00f), tmp2914);
__m512 tmp2940 = _mm512_fmadd_ps(tmp2935, _mm512_set1_ps(8e+00f), tmp2934);
__m512 tmp2908 = _mm512_add_ps(tmp2909, tmp2910);
__m512 tmp2928 = _mm512_add_ps(tmp2929, tmp2930);
__m512 tmp2912 = _mm512_fmadd_ps(tmp2916, _mm512_set1_ps(1.6e+01f), tmp2913);
__m512 tmp2932 = _mm512_fmadd_ps(tmp2936, _mm512_set1_ps(1.6e+01f), tmp2933);
__m512 tmp2919 = _mm512_fmadd_ps(tmp2916, _mm512_set1_ps(4e+00f), tmp2920);
__m512 tmp2939 = _mm512_fmadd_ps(tmp2936, _mm512_set1_ps(4e+00f), tmp2940);
__m512 tmp2925 = _mm512_add_ps(tmp2916, tmp2914);
__m512 tmp2945 = _mm512_add_ps(tmp2936, tmp2934);
__m512 tmp2918 = _mm512_fmadd_ps(tmp2909, _mm512_set1_ps(4e+00f), tmp2910);
__m512 tmp2938 = _mm512_fmadd_ps(tmp2929, _mm512_set1_ps(4e+00f), tmp2930);
__m512 tmp2922 = _mm512_fmadd_ps(tmp2909, _mm512_set1_ps(1.6e+01f), tmp2910);
__m512 tmp2942 = _mm512_fmadd_ps(tmp2929, _mm512_set1_ps(1.6e+01f), tmp2930);
__m512 tmp2907 = _mm512_add_ps(tmp2908, tmp2850);
__m512 tmp2927 = _mm512_add_ps(tmp2928, tmp2858);
__m512 tmp2924 = _mm512_add_ps(tmp2925, tmp2857);
__m512 tmp2944 = _mm512_add_ps(tmp2945, tmp2905);
__m512 tmp2906 = _mm512_fmadd_ps(tmp2911, _mm512_set1_ps(3.2e+01f), tmp2907);
__m512 tmp2926 = _mm512_fmadd_ps(tmp2931, _mm512_set1_ps(3.2e+01f), tmp2927);
__m512 tmp2917 = _mm512_fmadd_ps(tmp2911, _mm512_set1_ps(8e+00f), tmp2918);
__m512 tmp2937 = _mm512_fmadd_ps(tmp2931, _mm512_set1_ps(8e+00f), tmp2938);
__m512 tmp2923 = _mm512_fmadd_ps(tmp2915, _mm512_set1_ps(3.2e+01f), tmp2924);
__m512 tmp2943 = _mm512_fmadd_ps(tmp2935, _mm512_set1_ps(3.2e+01f), tmp2944);
__m512 tmp2921 = _mm512_fmadd_ps(tmp2911, _mm512_set1_ps(2e+00f), tmp2922);
__m512 tmp2941 = _mm512_fmadd_ps(tmp2931, _mm512_set1_ps(2e+00f), tmp2942);
__m512 out549 = tmp2906;
__m512 out555 = tmp2926;
__m512 out550 = tmp2912;
__m512 out556 = tmp2932;
__m512 out551 = tmp2917;
__m512 out557 = tmp2937;
__m512 out552 = tmp2919;
__m512 out558 = tmp2939;
__m512 out553 = tmp2921;
__m512 out559 = tmp2941;
__m512 out554 = tmp2923;
__m512 out560 = tmp2943;
_mm512_mask_storeu_ps(datPtr3+0+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 511, out549);
_mm512_mask_storeu_ps(datPtr3+456+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 4095, out555);
_mm512_mask_storeu_ps(datPtr3+84+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 511, out550);
_mm512_mask_storeu_ps(datPtr3+540+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 4095, out556);
_mm512_mask_storeu_ps(datPtr3+168+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 511, out551);
_mm512_mask_storeu_ps(datPtr3+624+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 4095, out557);
_mm512_mask_storeu_ps(datPtr3+252+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 511, out552);
_mm512_mask_storeu_ps(datPtr3+708+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 4095, out558);
_mm512_mask_storeu_ps(datPtr3+336+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 511, out553);
_mm512_mask_storeu_ps(datPtr3+792+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 4095, out559);
_mm512_mask_storeu_ps(datPtr3+420+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 511, out554);
_mm512_mask_storeu_ps(datPtr3+876+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 4095, out560);
__m512 sf137 = _mm512_loadu_ps(sfPtr4+256+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 sf138 = _mm512_loadu_ps(sfPtr4+320+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 in536 = _mm512_shuffle_f32x4(sf137, sf138, 68);
__m512 in537 = _mm512_shuffle_f32x4(sf137, sf138, 238);
__m512 sf139 = _mm512_loadu_ps(sfPtr4+1961728+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 sf140 = _mm512_loadu_ps(sfPtr4+1961792+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 in538 = _mm512_shuffle_f32x4(sf139, sf140, 68);
__m512 in539 = _mm512_shuffle_f32x4(sf139, sf140, 238);
__m512 sf141 = _mm512_loadu_ps(sfPtr4+3923200+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 sf142 = _mm512_loadu_ps(sfPtr4+3923264+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 in540 = _mm512_shuffle_f32x4(sf141, sf142, 68);
__m512 in541 = _mm512_shuffle_f32x4(sf141, sf142, 238);
__m512 sf143 = _mm512_loadu_ps(sfPtr4+5884672+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 sf144 = _mm512_loadu_ps(sfPtr4+5884736+7845888*i12+490368*j8+1536*k15+768*l7);
__m512 in542 = _mm512_shuffle_f32x4(sf143, sf144, 68);
__m512 in543 = _mm512_shuffle_f32x4(sf143, sf144, 238);
__m512 tmp2996 = _mm512_add_ps(in537, in538);
__m512 tmp2995 = _mm512_add_ps(in539, in540);
__m512 tmp3001 = _mm512_sub_ps(in539, in540);
__m512 tmp3000 = _mm512_sub_ps(in537, in538);
__m512 tmp2997 = _mm512_add_ps(in541, in542);
__m512 tmp3002 = _mm512_sub_ps(in541, in542);
__m512 tmp2999 = _mm512_fmadd_ps(tmp3001, _mm512_set1_ps(2e+00f), tmp3000);
__m512 tmp3006 = _mm512_fmadd_ps(tmp3001, _mm512_set1_ps(8e+00f), tmp3000);
__m512 tmp2994 = _mm512_add_ps(tmp2995, tmp2996);
__m512 tmp2998 = _mm512_fmadd_ps(tmp3002, _mm512_set1_ps(1.6e+01f), tmp2999);
__m512 tmp3005 = _mm512_fmadd_ps(tmp3002, _mm512_set1_ps(4e+00f), tmp3006);
__m512 tmp3011 = _mm512_add_ps(tmp3002, tmp3000);
__m512 tmp3004 = _mm512_fmadd_ps(tmp2995, _mm512_set1_ps(4e+00f), tmp2996);
__m512 tmp3008 = _mm512_fmadd_ps(tmp2995, _mm512_set1_ps(1.6e+01f), tmp2996);
__m512 tmp2993 = _mm512_add_ps(tmp2994, in536);
__m512 tmp3010 = _mm512_add_ps(tmp3011, in543);
__m512 tmp2992 = _mm512_fmadd_ps(tmp2997, _mm512_set1_ps(3.2e+01f), tmp2993);
__m512 tmp3003 = _mm512_fmadd_ps(tmp2997, _mm512_set1_ps(8e+00f), tmp3004);
__m512 tmp3009 = _mm512_fmadd_ps(tmp3001, _mm512_set1_ps(3.2e+01f), tmp3010);
__m512 tmp3007 = _mm512_fmadd_ps(tmp2997, _mm512_set1_ps(2e+00f), tmp3008);
__m512 tmp2986 = tmp2992;
__m512 tmp2987 = tmp2998;
__m512 tmp2988 = tmp3003;
__m512 tmp2989 = tmp3005;
__m512 tmp2990 = tmp3007;
__m512 tmp2991 = tmp3009;
__m512 tmp3062 = _mm512_unpacklo_ps(tmp2986, tmp2987);
__m512 tmp3063 = _mm512_unpackhi_ps(tmp2986, tmp2987);
__m512 tmp3064 = _mm512_unpacklo_ps(tmp2988, tmp2989);
__m512 tmp3065 = _mm512_unpackhi_ps(tmp2988, tmp2989);
__m512 tmp3066 = _mm512_unpacklo_ps(tmp2990, tmp2991);
__m512 tmp3067 = _mm512_unpackhi_ps(tmp2990, tmp2991);
__m512 tmp3068 = _mm512_shuffle_ps(tmp3062, tmp3064, 68);
__m512 tmp3069 = _mm512_shuffle_ps(tmp3062, tmp3064, 238);
__m512 tmp3070 = _mm512_shuffle_ps(tmp3063, tmp3065, 68);
__m512 tmp3071 = _mm512_shuffle_ps(tmp3063, tmp3065, 238);
__m512 tmp3072 = _mm512_shuffle_ps(tmp3066, tmp3066, 238);
__m512 tmp3073 = _mm512_shuffle_ps(tmp3067, tmp3067, 238);
__m512 tmp3074 = _mm512_shuffle_f32x4(tmp3068, tmp3066, 136);
__m512 tmp3075 = _mm512_shuffle_f32x4(tmp3068, tmp3066, 221);
__m512 tmp3076 = _mm512_shuffle_f32x4(tmp3069, tmp3072, 136);
__m512 tmp3077 = _mm512_shuffle_f32x4(tmp3069, tmp3072, 221);
__m512 tmp3078 = _mm512_shuffle_f32x4(tmp3070, tmp3067, 136);
__m512 tmp3079 = _mm512_shuffle_f32x4(tmp3070, tmp3067, 221);
__m512 tmp3080 = _mm512_shuffle_f32x4(tmp3071, tmp3073, 136);
__m512 tmp3081 = _mm512_shuffle_f32x4(tmp3071, tmp3073, 221);
tmp2986 = _mm512_shuffle_f32x4(tmp3074, tmp3074, 136);
__m512 tmp3014 = _mm512_shuffle_f32x4(tmp3074, tmp3074, 221);
tmp2987 = _mm512_shuffle_f32x4(tmp3076, tmp3076, 136);
__m512 tmp3015 = _mm512_shuffle_f32x4(tmp3076, tmp3076, 221);
tmp2988 = _mm512_shuffle_f32x4(tmp3078, tmp3078, 136);
__m512 tmp3016 = _mm512_shuffle_f32x4(tmp3078, tmp3078, 221);
tmp2989 = _mm512_shuffle_f32x4(tmp3080, tmp3080, 136);
__m512 tmp3017 = _mm512_shuffle_f32x4(tmp3080, tmp3080, 221);
tmp2990 = _mm512_shuffle_f32x4(tmp3075, tmp3075, 136);
__m512 tmp3018 = _mm512_shuffle_f32x4(tmp3075, tmp3075, 221);
tmp2991 = _mm512_shuffle_f32x4(tmp3077, tmp3077, 136);
__m512 tmp3019 = _mm512_shuffle_f32x4(tmp3077, tmp3077, 221);
__m512 tmp3012 = _mm512_shuffle_f32x4(tmp3079, tmp3079, 136);
__m512 tmp3020 = _mm512_shuffle_f32x4(tmp3079, tmp3079, 221);
__m512 tmp3013 = _mm512_shuffle_f32x4(tmp3081, tmp3081, 136);
__m512 tmp3021 = _mm512_shuffle_f32x4(tmp3081, tmp3081, 221);
__m512 tmp3026 = _mm512_add_ps(tmp2987, tmp2988);
__m512 tmp3046 = _mm512_add_ps(tmp3015, tmp3016);
__m512 tmp3025 = _mm512_add_ps(tmp2989, tmp2990);
__m512 tmp3045 = _mm512_add_ps(tmp3017, tmp3018);
__m512 tmp3031 = _mm512_sub_ps(tmp2989, tmp2990);
__m512 tmp3051 = _mm512_sub_ps(tmp3017, tmp3018);
__m512 tmp3030 = _mm512_sub_ps(tmp2987, tmp2988);
__m512 tmp3050 = _mm512_sub_ps(tmp3015, tmp3016);
__m512 tmp3027 = _mm512_add_ps(tmp2991, tmp3012);
__m512 tmp3047 = _mm512_add_ps(tmp3019, tmp3020);
__m512 tmp3032 = _mm512_sub_ps(tmp2991, tmp3012);
__m512 tmp3052 = _mm512_sub_ps(tmp3019, tmp3020);
__m512 tmp3029 = _mm512_fmadd_ps(tmp3031, _mm512_set1_ps(2e+00f), tmp3030);
__m512 tmp3049 = _mm512_fmadd_ps(tmp3051, _mm512_set1_ps(2e+00f), tmp3050);
__m512 tmp3036 = _mm512_fmadd_ps(tmp3031, _mm512_set1_ps(8e+00f), tmp3030);
__m512 tmp3056 = _mm512_fmadd_ps(tmp3051, _mm512_set1_ps(8e+00f), tmp3050);
__m512 tmp3024 = _mm512_add_ps(tmp3025, tmp3026);
__m512 tmp3044 = _mm512_add_ps(tmp3045, tmp3046);
__m512 tmp3028 = _mm512_fmadd_ps(tmp3032, _mm512_set1_ps(1.6e+01f), tmp3029);
__m512 tmp3048 = _mm512_fmadd_ps(tmp3052, _mm512_set1_ps(1.6e+01f), tmp3049);
__m512 tmp3035 = _mm512_fmadd_ps(tmp3032, _mm512_set1_ps(4e+00f), tmp3036);
__m512 tmp3055 = _mm512_fmadd_ps(tmp3052, _mm512_set1_ps(4e+00f), tmp3056);
__m512 tmp3041 = _mm512_add_ps(tmp3032, tmp3030);
__m512 tmp3061 = _mm512_add_ps(tmp3052, tmp3050);
__m512 tmp3034 = _mm512_fmadd_ps(tmp3025, _mm512_set1_ps(4e+00f), tmp3026);
__m512 tmp3054 = _mm512_fmadd_ps(tmp3045, _mm512_set1_ps(4e+00f), tmp3046);
__m512 tmp3038 = _mm512_fmadd_ps(tmp3025, _mm512_set1_ps(1.6e+01f), tmp3026);
__m512 tmp3058 = _mm512_fmadd_ps(tmp3045, _mm512_set1_ps(1.6e+01f), tmp3046);
__m512 tmp3023 = _mm512_add_ps(tmp3024, tmp2986);
__m512 tmp3043 = _mm512_add_ps(tmp3044, tmp3014);
__m512 tmp3040 = _mm512_add_ps(tmp3041, tmp3013);
__m512 tmp3060 = _mm512_add_ps(tmp3061, tmp3021);
__m512 tmp3022 = _mm512_fmadd_ps(tmp3027, _mm512_set1_ps(3.2e+01f), tmp3023);
__m512 tmp3042 = _mm512_fmadd_ps(tmp3047, _mm512_set1_ps(3.2e+01f), tmp3043);
__m512 tmp3033 = _mm512_fmadd_ps(tmp3027, _mm512_set1_ps(8e+00f), tmp3034);
__m512 tmp3053 = _mm512_fmadd_ps(tmp3047, _mm512_set1_ps(8e+00f), tmp3054);
__m512 tmp3039 = _mm512_fmadd_ps(tmp3031, _mm512_set1_ps(3.2e+01f), tmp3040);
__m512 tmp3059 = _mm512_fmadd_ps(tmp3051, _mm512_set1_ps(3.2e+01f), tmp3060);
__m512 tmp3037 = _mm512_fmadd_ps(tmp3027, _mm512_set1_ps(2e+00f), tmp3038);
__m512 tmp3057 = _mm512_fmadd_ps(tmp3047, _mm512_set1_ps(2e+00f), tmp3058);
__m512 out561 = tmp3022;
__m512 out567 = tmp3042;
__m512 out562 = tmp3028;
__m512 out568 = tmp3048;
__m512 out563 = tmp3033;
__m512 out569 = tmp3053;
__m512 out564 = tmp3035;
__m512 out570 = tmp3055;
__m512 out565 = tmp3037;
__m512 out571 = tmp3057;
__m512 out566 = tmp3039;
__m512 out572 = tmp3059;
_mm512_mask_storeu_ps(datPtr3+504+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 63, out561);
_mm512_mask_storeu_ps(datPtr3+528+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 7, out567);
_mm512_mask_storeu_ps(datPtr3+588+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 63, out562);
_mm512_mask_storeu_ps(datPtr3+612+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 7, out568);
_mm512_mask_storeu_ps(datPtr3+672+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 63, out563);
_mm512_mask_storeu_ps(datPtr3+696+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 7, out569);
_mm512_mask_storeu_ps(datPtr3+756+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 63, out564);
_mm512_mask_storeu_ps(datPtr3+780+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 7, out570);
_mm512_mask_storeu_ps(datPtr3+840+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 63, out565);
_mm512_mask_storeu_ps(datPtr3+864+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 7, out571);
_mm512_mask_storeu_ps(datPtr3+924+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 63, out566);
_mm512_mask_storeu_ps(datPtr3+948+3861648*i12+84*toH2+4*toW2+12096*k15+6048*l7, 7, out572);
if (j8 >= last3) return;
++j8;
if (j8 >= 4) break;
}
}

// Hands Example13ThreeConsumeSums1Callee1 to the thread team.
//
// team17:   team passed to Example13ThreaderDo1.
// tensors9: pointer array stored in the task's any1 field; this function
//           does not read or modify it.
//
// The task is described with nd1 = 3 and hull1 = {14, 4, 6}
// (presumably the extents of a 3-D work grid -- confirm against the
// threader implementation).
static void Example13ThreeConsumeSums1(Example13ThreaderTeam1* team17, char** tensors9) {
    static const int extents[3] = {14, 4, 6};
    Example13ThreaderTask1 job;
    job.nd1 = 3;
    for (int axis = 0; axis < 3; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    job.any1 = tensors9;
    job.callee1 = Example13ThreeConsumeSums1Callee1;
    Example13ThreaderDo1(team17, &job);
}

// State behind the Example13Net handle. Example13NetCreate fills both
// fields; Example13NetDestroy frees alloc1 and then the struct itself.
struct Example13Net {
// Pointer exactly as returned by malloc; this is the one passed to free.
char* alloc1;
// alloc1 rounded up to the next multiple of 64 bytes. Example13NetCreate
// passes this address to Example13ThreeArrangeFilts1, and
// Example13EngineInference reads from it.
char* align1;
};

// Releases a net obtained from Example13NetCreate: frees the net's
// malloc'd buffer (alloc1) and then the Example13Net struct itself.
//
// A null net2 is accepted and ignored, mirroring free(). This matters
// because Example13NetCreate leaves *net1 unwritten on failure, so a
// caller's cleanup path may legitimately reach here with a handle that
// was initialised to null and never assigned.
void Example13NetDestroy(Example13Net* net2) {
    if (!net2) {
        return;
    }
    free(net2->alloc1);
    free(net2);
}

// Builds an Example13Net.
//
// Steps, in order:
//   1. Refuse to run if the CPU does not report AVX512F.
//   2. malloc a 2009682175-byte buffer and compute a 64-byte-aligned
//      address inside it.
//   3. Create a temporary threader team with threads1, run
//      Example13ThreeArrangeFilts1 over {outWeights, outBiases, aligned
//      buffer}, and destroy the team again.
//   4. malloc the Example13Net struct and record both buffer pointers.
//
// Returns 0 on success, with the new net stored in *net1. On failure,
// returns the message produced by Example13Errmsg1 (or the error handed
// back by Example13ThreaderCreate1); *net1 is left untouched and
// everything allocated so far has been freed.
char* Example13NetCreate(
    Example13Net** net1,
    Example13Params* params1,
    ptrdiff_t threads1
) {
    if (__builtin_expect(!__builtin_cpu_supports("avx512f"), 0)) {
        return Example13Errmsg1(__LINE__, "CPU does not support AVX512F");
    }

    char* rawBuf = malloc(2009682175);
    if (__builtin_expect(rawBuf == 0, 0)) {
        return Example13Errmsg1(__LINE__, "errno %d", errno);
    }
    // Round up to a 64-byte boundary; the allocation size above already
    // includes 63 bytes of slack for this.
    char* alignedBuf = (char*)(((size_t)rawBuf+63)&~(size_t)63);

    Example13ThreaderTeam1* workers = 0;
    char* teamErr = Example13ThreaderCreate1(&workers, threads1);
    if (__builtin_expect(teamErr != 0, 0)) {
        free(rawBuf);
        return teamErr;
    }

    char* filtArgs[] = {
        (char*)params1->outWeights,
        (char*)params1->outBiases,
        alignedBuf
    };
    Example13ThreeArrangeFilts1(workers, filtArgs);
    // The team is only needed for the arrangement pass above.
    Example13ThreaderDestroy1(workers);

    Example13Net* handle = malloc(sizeof(Example13Net));
    if (__builtin_expect(handle != 0, 1)) {
        handle->alloc1 = rawBuf;
        handle->align1 = alignedBuf;
        *net1 = handle;
        return 0;
    }

    // Format the message before free() so errno is read first.
    char* oomMsg = Example13Errmsg1(__LINE__, "errno %d", errno);
    free(rawBuf);
    return oomMsg;
}

// State behind the Example13Engine handle. Example13EngineCreate fills
// every field; Example13EngineDestroy tears down team11, frees alloc2 and
// then the struct itself (net3 is not freed there).
struct Example13Engine {
// Net passed to Example13EngineCreate; stored as-is.
Example13Net* net3;
// Threader team created by Example13ThreaderCreate1 in Example13EngineCreate.
Example13ThreaderTeam1* team11;
// Pointer exactly as returned by malloc; this is the one passed to free.
char* alloc2;
// alloc2 rounded up to the next multiple of 64 bytes; base address that
// Example13EngineInference offsets into.
char* align2;
};

// Forwards to Example13ThreaderPthreadT1 with the engine's threader team,
// passing idx2 and the to1 output pointer straight through, and returns
// whatever that call returns.
// NOTE(review): presumably writes the pthread_t of team thread idx2 into
// *to1 and returns 0 / an error message -- confirm against the threader.
char* Example13EnginePthreadT(
    Example13Engine* eng2,
    ptrdiff_t idx2,
    pthread_t* to1
) {
    Example13ThreaderTeam1* workers = eng2->team11;
    char* result = Example13ThreaderPthreadT1(to1, workers, idx2);
    return result;
}

// Releases an engine obtained from Example13EngineCreate: destroys the
// engine's threader team, frees its malloc'd scratch buffer (alloc2) and
// then the Example13Engine struct itself. The net the engine refers to
// (net3) is not freed here.
//
// A null eng3 is accepted and ignored, mirroring free(). This matters
// because Example13EngineCreate leaves *eng4 unwritten on failure, so a
// caller's cleanup path may legitimately reach here with a handle that
// was initialised to null and never assigned.
void Example13EngineDestroy(Example13Engine* eng3) {
    if (!eng3) {
        return;
    }
    Example13ThreaderDestroy1(eng3->team11);
    free(eng3->alloc2);
    free(eng3);
}

// Builds an Example13Engine bound to net4.
//
// Allocates the engine struct, then a 122609727-byte scratch buffer (with
// a 64-byte-aligned address computed inside it), then a threader team
// created with threads2.
//
// Returns 0 on success, with the new engine stored in *eng4. On failure,
// returns the message produced by Example13Errmsg1 (or the error handed
// back by Example13ThreaderCreate1); *eng4 is left untouched and
// everything allocated so far has been freed.
char* Example13EngineCreate(
    Example13Engine** eng4,
    Example13Net* net4,
    ptrdiff_t threads2
) {
    Example13Engine* fresh = malloc(sizeof(Example13Engine));
    if (__builtin_expect(fresh == 0, 0)) {
        return Example13Errmsg1(__LINE__, "errno %d", errno);
    }

    char* scratch = malloc(122609727);
    if (__builtin_expect(scratch == 0, 0)) {
        // Format the message before free() so errno is read first.
        char* oomMsg = Example13Errmsg1(__LINE__, "errno %d", errno);
        free(fresh);
        return oomMsg;
    }
    fresh->alloc2 = scratch;
    // Round up to a 64-byte boundary; the allocation size above already
    // includes 63 bytes of slack for this.
    fresh->align2 = (char*)(((size_t)scratch+63)&~(size_t)63);

    char* teamErr = Example13ThreaderCreate1(&fresh->team11, threads2);
    if (__builtin_expect(teamErr != 0, 0)) {
        free(fresh);
        free(scratch);
        return teamErr;
    }

    fresh->net3 = net4;
    *eng4 = fresh;
    return 0;
}

// Runs one inference pass on the engine's threader team.
//
// Three stages are issued in sequence, each receiving an array of tensor
// pointers:
//   1. Example13ThreeArrangeDats1: {inData, scratch base}
//   2. Example13ThreeProduceSums1: {net aligned buffer, scratch base,
//                                   scratch base + 75534336}
//   3. Example13ThreeConsumeSums1: {scratch base + 75534336, outData}
// where "scratch base" is the engine's 64-byte-aligned buffer (align2).
// NOTE(review): judging by the helper names, stage 1 writes rearranged
// input at the scratch base, stage 2 writes partial sums at the
// 75534336-byte offset, and stage 3 writes the final result to outData --
// confirm against the stage implementations.
void Example13EngineInference(
    Example13Engine* eng1,
    float* inData,
    float* outData
) {
    Example13ThreaderTeam1* workers = eng1->team11;
    char* filts = eng1->net3->align1;
    char* dats = eng1->align2;
    // Region shared between the produce and consume stages.
    char* sums = dats+75534336;

    char* arrangeArgs[] = {
        (char*)inData,
        dats
    };
    Example13ThreeArrangeDats1(workers, arrangeArgs);

    char* produceArgs[] = {
        filts,
        dats,
        sums
    };
    Example13ThreeProduceSums1(workers, produceArgs);

    char* consumeArgs[] = {
        sums,
        (char*)outData
    };
    Example13ThreeConsumeSums1(workers, consumeArgs);
}

// End of file.

Top