NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=Example12 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=in Channels=540 Height=8 Width=27
Conv FromTensor=in ToTensor=out ToChannels=774 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=0 DilationH=1 DilationW=1 Groups=1
Output FromTensor=out

Top || Output Example12.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(Example12Params);
// Example12Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "rb");
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

// Forward declaration; the struct body is defined near the end of this header.
typedef struct Example12Params Example12Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// Example12Params* params = malloc(sizeof(Example12Params));
//
// ... Load params (read from a file, perhaps) ...
//
// Example12Net* net; // For example, 4 threads:
// char* err = Example12NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// Example12NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

// Handle to a Net; the struct body is not defined in this header.
typedef struct Example12Net Example12Net;

// Builds a Net from the parameter struct using `threads` temporary threads
// and stores the result through the first argument. Returns 0 on success.
// On failure returns an error string that the caller must free(), and the
// output pointer is left unmodified.
char* Example12NetCreate(
Example12Net**,
Example12Params*,
ptrdiff_t threads
);

// Frees a Net. Call once all inference that depends on the Net is complete.
void Example12NetDestroy(Example12Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// Example12Net* net;
//
// ... Create net ...
//
// Example12Engine* engine; // For example, 4 inference threads:
// char* err = Example12EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// Example12EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = Example12EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* inData = malloc(sizeof(float)*540*8*27);
// float* outData = malloc(sizeof(float)*774*8*25);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// Example12EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// inData, // The tensor arguments are sorted by name.
// outData
// );
//
// ... Read the output floats ...
//
// }
//
// free(inData);
// free(outData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

// Handle to an Engine; the struct body is not defined in this header.
typedef struct Example12Engine Example12Engine;

// Creates an Engine with `threads` inference threads that uses the given Net,
// and stores it through the first argument. Returns 0 on success. On failure
// returns an error string that the caller must free(), and the output pointer
// is left unmodified.
char* Example12EngineCreate(
Example12Engine**,
Example12Net*,
ptrdiff_t threads
);

// Writes the pthread_t of the inference thread with index threadIdx to `to`.
// Returns 0 on success, or an error string (to be freed by the caller) when
// the thread index is invalid.
char* Example12EnginePthreadT(
Example12Engine*,
ptrdiff_t threadIdx,
pthread_t* to
);

// Runs inference: reads 540*8*27 floats from inData and writes 774*8*25
// floats to outData. Both tensors are caller-owned, packed CHW float arrays.
// This function cannot fail.
void Example12EngineInference(
Example12Engine*,
float* inData,
float* outData
);

// Terminates the Engine's threads and frees its memory. The Net is not
// destroyed by this call.
void Example12EngineDestroy(Example12Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

// Trained parameters for the single convolution "out": one bias per output
// channel and one 3x3 filter per (output channel, input channel) pair, i.e.
// 774*540*3*3 = 3761640 weights. The packed attribute keeps the struct free
// of padding so that its bytes are exactly the two float arrays back to back.
struct Example12Params {
float outBiases[774]; // 1x774x1x1
float outWeights[3761640]; // 774x540x3x3
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output Example12.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f Example12.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "Example12.h"

// Formats an error message into a freshly malloc'd buffer and returns it.
// The text is "Example12: line <lineNum1>: " followed by the printf-style
// expansion of format1 with the variadic arguments; the expansion is
// truncated so that the whole message, including the terminating NUL, fits
// in the buffer. The caller owns the returned string and releases it with
// free().
// NOTE(review): the malloc result is used without a null check.
static char* Example12Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
    enum { capacity = 277 };
    char* text = malloc(capacity);
    // Length of the fixed prefix; the formatted tail is appended after it.
    int used = sprintf(text, "Example12: line %td: ", lineNum1);
    va_list args;
    va_start(args, format1);
    vsnprintf(text+used, capacity-used, format1, args);
    va_end(args);
    return text;
}

// Forward declarations for the thread-team ("threader") types defined below.
typedef struct Example12ThreaderTask1 Example12ThreaderTask1;
// Work function: called once per point with the task and a pointer to the
// point's coordinates (an array of int64_t; the workers pass a 4-element
// array of which the first nd1 entries are maintained by the threader).
typedef void (*Example12ThreaderCallee1)(Example12ThreaderTask1*, int64_t*);
typedef struct Example12ThreaderHub1 Example12ThreaderHub1;
typedef struct Example12ThreaderNode1 Example12ThreaderNode1;
typedef struct Example12ThreaderUnwind1 Example12ThreaderUnwind1;
typedef struct Example12ThreaderTeam1 Example12ThreaderTeam1;

// One unit of parallel work: a callee to run at every point of an
// nd1-dimensional grid whose extents are given by hull1.
struct Example12ThreaderTask1 {
Example12ThreaderCallee1 callee1; // Function invoked once per grid point.
void* any1; // Caller-supplied pointer for the callee; not touched by the threader code visible here.
ptrdiff_t nd1; // Number of leading hull1 entries in use.
int64_t hull1[4]; // Extent of each dimension; a coordinate wraps to 0 on reaching its extent.
};

// State shared between the workers of a team and the thread that submits a
// task to them.
struct Example12ThreaderHub1 {
pthread_mutex_t mut1; // Held whenever the fields below are read or written.
pthread_cond_t cond1; // Signalled by the worker that brings pending1 to zero.
ptrdiff_t pending1; // Number of workers that have not yet finished the current task.
ptrdiff_t offset1; // Index of the status1 word at which the next scan for a busy node starts.
long mask1; // Bits of that word still considered by the scan.
long status1[]; // Bitmap with one bit per node; a node's bit is cleared once its points are exhausted. Bit nt is never cleared and ends the scan.
};

// Per-worker state: the share of points assigned to one thread, plus that
// thread's synchronization objects. Every node starts on a 64-byte boundary
// (presumably to keep different workers' nodes on separate cache lines).
struct Example12ThreaderNode1 {
pthread_mutex_t mut2; // Held whenever np1, pt1 or task1 is read or written.
int64_t np1; // Points remaining in this node's share; a negative value asks the thread to exit.
int64_t pt1[4]; // Coordinates of the next point to run from this share.
Example12ThreaderTask1* task1; // Task waiting to be picked up by the worker, or null when none is posted.
pthread_cond_t cond2; // Signalled after task1 has been set.
Example12ThreaderTeam1* team1; // Team this node belongs to.
pthread_t thr1; // Thread created for this node.
} __attribute__((aligned(64)));

// Record of how far team construction got, so that teardown releases exactly
// the resources that were actually created.
struct Example12ThreaderUnwind1 {
ptrdiff_t join1; // Number of leading nodes whose threads were started and must be joined.
ptrdiff_t nodeConds1; // Number of leading nodes whose condition variable was initialized.
ptrdiff_t nodeMuts1; // Number of leading nodes whose mutex was initialized.
ptrdiff_t hubCond1; // Nonzero once the hub's condition variable is initialized.
ptrdiff_t hubMut1; // Nonzero once the hub's mutex is initialized.
void* nodes1; // Unaligned pointer returned by malloc for the node array (passed to free).
void* hub1; // Unaligned pointer returned by malloc for the hub (passed to free).
};

// A team of worker threads together with its shared hub.
struct Example12ThreaderTeam1 {
ptrdiff_t nt1; // Number of worker threads (and nodes).
Example12ThreaderHub1* hub2; // 64-byte-aligned hub inside the unwind1.hub1 allocation.
Example12ThreaderNode1* nodes2; // 64-byte-aligned array of nt1 nodes inside the unwind1.nodes1 allocation.
Example12ThreaderUnwind1 unwind1; // Teardown bookkeeping filled in during construction.
};

// Advances a multi-dimensional point by one step, odometer style.
// Dimension 0 moves fastest: it is incremented, and whenever a coordinate
// reaches its extent in hull2 it is reset to 0 and the next dimension is
// incremented instead. When every one of the nd2 coordinates wraps, the
// point ends up at all zeros. Entries beyond nd2 are left untouched.
static void Example12ThreaderInc1(
ptrdiff_t nd2,
int64_t*restrict hull2,
int64_t*restrict pt2
) {
    ptrdiff_t dim = 0;
    while (dim < nd2) {
        int64_t next = pt2[dim]+1;
        if (next != hull2[dim]) {
            // No wrap in this dimension: store and stop carrying.
            pt2[dim] = next;
            return;
        }
        pt2[dim] = 0;
        ++dim;
    }
}

// Converts a linear index into mixed-radix coordinates.
// val1 is decomposed with hull3[0] as the fastest-varying radix: each
// coordinate receives the remainder of dividing by its extent and the
// quotient carries into the next dimension. Once the quotient reaches zero
// the remaining coordinates (up to nd3) are set to 0. Any quotient left
// after the last dimension is discarded.
static void Example12ThreaderPut1(
ptrdiff_t nd3,
int64_t*restrict hull3,
int64_t*restrict pt3,
int64_t val1
) {
    ptrdiff_t dim = 0;
    while (dim < nd3 && val1 != 0) {
        int64_t radix = hull3[dim];
        int64_t quot = val1/radix;
        pt3[dim] = val1-quot*radix;
        val1 = quot;
        ++dim;
    }
    while (dim < nd3) {
        pt3[dim] = 0;
        ++dim;
    }
}

// Adds the mixed-radix offset plus1 (and an initial carry2) to the point pt4
// in place. Each dimension is summed with the incoming carry; when the sum
// reaches the dimension's extent in hull4, the extent is subtracted once and
// a carry of 1 is passed to the next dimension. A carry out of the last
// dimension is dropped.
static void Example12ThreaderAdd1(
ptrdiff_t nd4,
int64_t*restrict hull4,
int64_t*restrict pt4,
int64_t*restrict plus1,
int64_t carry2
) {
    for (ptrdiff_t dim = 0; dim < nd4; ++dim) {
        int64_t radix = hull4[dim];
        int64_t total = pt4[dim]+plus1[dim]+carry2;
        carry2 = total >= radix;
        pt4[dim] = carry2 ? total-radix : total;
    }
}

// Entry point run by every worker thread of a team; arg1 is the thread's own
// Example12ThreaderNode1. The worker sleeps on its node's condition variable
// until a task is posted, runs the points assigned to its own node, then
// takes points from other nodes that are still marked busy in the hub's
// status bitmap, and finally decrements the hub's pending count (signalling
// the hub's condition variable when the count reaches zero). A negative np1
// on the node makes the thread return 0.
// Every pthread call is wrapped in a loop that retries while the call
// returns nonzero.
static void* Example12ThreaderMain1(void* arg1) {
Example12ThreaderNode1* node1 = arg1;
Example12ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
Example12ThreaderHub1* hub3 = team2->hub2;
Example12ThreaderNode1* nodes3 = team2->nodes2;
// Index of this worker within the team's node array.
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
// The node mutex is held at the top of every iteration of this loop.
for (; ; ) {
Example12ThreaderTask1* task2 = node1->task1;
if (!task2) {
// Nothing posted yet: sleep until the node is signalled, then re-check.
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
// A negative point count is the request to shut down.
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
// Consume the posted task so the next wake-up waits for a new one.
node1->task1 = 0;
Example12ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Phase 1: run this node's own points. Each point is copied out and the
// node advanced while the mutex is held; the mutex is released around the
// callee call, so other workers can take points from this node meanwhile.
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
Example12ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
// Phase 2: this node is drained. Clear its bit in the status bitmap, then
// scan the bitmap for other nodes that may still hold points.
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
// Set once the scan has restarted from word 0 without finding a node.
ptrdiff_t wrapped1 = 0;
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
// No candidate bit left in this word: move on to the next word.
++offset2;
mask2 = -1;
continue;
}
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
// Bit nt2 does not belong to any node; reaching it means the scan ran past
// the last node. The first time, restart from word 0; if that happens
// again without a node in between, no busy node remains.
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
// Keep only the lowest set bit: the node chosen to be helped.
hand1 &= -hand1;
// Publish the scan position with the chosen bit removed from the mask, so
// the next worker that scans does not start with the same node.
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
Example12ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
// Drain the chosen node exactly as in phase 1, but on its state.
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
Example12ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// The helped node has no points left: clear its bit and resume scanning
// from the position currently published in the hub.
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// Report completion; the worker that brings pending1 to zero signals the
// hub's condition variable after releasing the hub mutex.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
// Re-take the node mutex before looping back to wait for the next task.
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Shuts down and frees a thread team; a null team is ignored.
// Only the resources recorded in the team's unwind record are released, so
// this also works on a team whose construction failed part-way:
//   1. every started thread is told to exit (np1 = -1 with a non-null task
//      pointer, then a signal on its condition variable) and is joined;
//   2. the initialized node condition variables and mutexes are destroyed;
//   3. the hub's condition variable and mutex are destroyed if they exist;
//   4. the node array, the hub and the team itself are freed.
// Every pthread call is retried while it returns nonzero.
static void Example12ThreaderDestroy1(Example12ThreaderTeam1* team3) {
    if (team3 == 0) return;
    Example12ThreaderUnwind1* undo = &team3->unwind1;
    Example12ThreaderNode1* first = team3->nodes2;
    Example12ThreaderNode1* cur;
    Example12ThreaderNode1* end = first+undo->join1;
    for (cur = first; cur != end; ++cur) {
        while (__builtin_expect(pthread_mutex_lock(&cur->mut2), 0)) {}
        cur->np1 = -1;
        // Any non-null task pointer makes the worker look at np1.
        cur->task1 = (Example12ThreaderTask1*)1;
        while (__builtin_expect(pthread_mutex_unlock(&cur->mut2), 0)) {}
        while (__builtin_expect(pthread_cond_signal(&cur->cond2), 0)) {}
    }
    for (cur = first; cur != end; ++cur) {
        while (__builtin_expect(pthread_join(cur->thr1, 0), 0)) {}
    }
    end = first+undo->nodeConds1;
    for (cur = first; cur != end; ++cur) {
        while (__builtin_expect(pthread_cond_destroy(&cur->cond2), 0)) {}
    }
    end = first+undo->nodeMuts1;
    for (cur = first; cur != end; ++cur) {
        while (__builtin_expect(pthread_mutex_destroy(&cur->mut2), 0)) {}
    }
    Example12ThreaderHub1* shared = team3->hub2;
    if (undo->hubCond1) {
        while (__builtin_expect(pthread_cond_destroy(&shared->cond1), 0)) {}
    }
    if (undo->hubMut1) {
        while (__builtin_expect(pthread_mutex_destroy(&shared->mut1), 0)) {}
    }
    free(undo->nodes1);
    free(undo->hub1);
    free(team3);
}

// Final construction step: initializes every node and starts its thread.
// For each of the nt7 nodes, in order: create the node mutex, clear the
// pending task, create the node condition variable, record the owning team,
// and launch a thread running Example12ThreaderMain1 on the node.
// Whether the loop completes or stops at the first failure, the team's
// unwind record receives the number of node mutexes, node condition
// variables and threads that now exist, so teardown releases exactly those.
// Returns 0 on success, or a malloc'd error string carrying the pthread
// error code on failure.
static char* Example12ThreaderCreate1Up4(Example12ThreaderTeam1* team8, ptrdiff_t nt7) {
    Example12ThreaderNode1* base = team8->nodes2;
    ptrdiff_t mutsMade = nt7;
    ptrdiff_t condsMade = nt7;
    ptrdiff_t thrsMade = nt7;
    char* failure = 0;
    for (ptrdiff_t idx = 0; idx != nt7; ++idx) {
        Example12ThreaderNode1* cur = base+idx;
        int rc = pthread_mutex_init(&cur->mut2, 0);
        if (__builtin_expect(rc, 0)) {
            failure = Example12Errmsg1(__LINE__, "errno %d", rc);
            mutsMade = idx;
            condsMade = idx;
            thrsMade = idx;
            break;
        }
        cur->task1 = 0;
        rc = pthread_cond_init(&cur->cond2, 0);
        if (__builtin_expect(rc, 0)) {
            failure = Example12Errmsg1(__LINE__, "errno %d", rc);
            // This node's mutex exists, its condition variable does not.
            mutsMade = idx+1;
            condsMade = idx;
            thrsMade = idx;
            break;
        }
        cur->team1 = team8;
        rc = pthread_create(&cur->thr1, 0, Example12ThreaderMain1, cur);
        if (__builtin_expect(rc, 0)) {
            failure = Example12Errmsg1(__LINE__, "errno %d", rc);
            // Mutex and condition variable exist, but no thread to join.
            mutsMade = idx+1;
            condsMade = idx+1;
            thrsMade = idx;
            break;
        }
    }
    team8->unwind1.nodeMuts1 = mutsMade;
    team8->unwind1.nodeConds1 = condsMade;
    team8->unwind1.join1 = thrsMade;
    return failure;
}

// Construction step 3: initializes the hub's mutex and condition variable,
// flagging each in the unwind record as soon as it exists, then continues
// with node setup. Returns 0 on success or a malloc'd error string.
static char* Example12ThreaderCreate1Up3(Example12ThreaderTeam1* team7, ptrdiff_t nt6) {
    Example12ThreaderHub1* shared = team7->hub2;
    int rc = pthread_mutex_init(&shared->mut1, 0);
    if (__builtin_expect(rc != 0, 0)) {
        return Example12Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubMut1 = 1;
    rc = pthread_cond_init(&shared->cond1, 0);
    if (__builtin_expect(rc != 0, 0)) {
        return Example12Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubCond1 = 1;
    return Example12ThreaderCreate1Up4(team7, nt6);
}

// Construction step 2: allocates the node array for nt5 workers.
// The byte count is checked for multiplication overflow ("too many
// threads"). 63 spare bytes are requested so that the array can start on a
// 64-byte boundary; the raw pointer is kept in the unwind record for free()
// and the aligned pointer becomes the team's node array. Continues with hub
// initialization. Returns 0 on success or a malloc'd error string.
static char* Example12ThreaderCreate1Up2(Example12ThreaderTeam1* team6, ptrdiff_t nt5) {
    const size_t count = (size_t)nt5;
    const size_t each = sizeof(Example12ThreaderNode1);
    const size_t bytes = count*each;
    if (__builtin_expect(bytes/each != count, 0)) {
        return Example12Errmsg1(__LINE__, "too many threads");
    }
    void* raw = malloc(bytes+63);
    if (__builtin_expect(raw == 0, 0)) {
        return Example12Errmsg1(__LINE__, "errno %d", errno);
    }
    team6->unwind1.nodes1 = raw;
    // Round the address up to the next multiple of 64.
    team6->nodes2 = (void*)(((size_t)raw+63)&~(size_t)63);
    return Example12ThreaderCreate1Up3(team6, nt5);
}

// Construction step 1: records the thread count and allocates the hub.
// The hub is followed by a status bitmap of nt4/(bits per long)+1 words,
// which always leaves room for bit index nt4 itself. The size is rounded up
// to a multiple of 64 and 63 spare bytes are requested so the hub can start
// on a 64-byte boundary; the raw pointer is kept in the unwind record for
// free(). Continues with node allocation. Returns 0 on success or a malloc'd
// error string.
static char* Example12ThreaderCreate1Up1(Example12ThreaderTeam1* team5, ptrdiff_t nt4) {
    team5->nt1 = nt4;
    const size_t wordBits = sizeof(long)*8;
    const size_t words = (size_t)nt4/wordBits+1;
    size_t bytes = sizeof(Example12ThreaderHub1)+sizeof(long)*words;
    bytes = (bytes+63)&~(size_t)63;
    void* raw = malloc(bytes+63);
    if (__builtin_expect(raw == 0, 0)) {
        return Example12Errmsg1(__LINE__, "errno %d", errno);
    }
    team5->unwind1.hub1 = raw;
    // Round the address up to the next multiple of 64.
    team5->hub2 = (void*)(((size_t)raw+63)&~(size_t)63);
    return Example12ThreaderCreate1Up2(team5, nt4);
}

// Creates a team of nt3 worker threads and stores it in *team4.
// Fails with "too few threads" when nt3 is below 1. The team struct is
// zero-initialized so that, if a later construction step fails, teardown
// sees an accurate (empty) unwind record for everything not yet created.
// Returns 0 on success. On failure returns a malloc'd error string, destroys
// whatever was built, and leaves *team4 untouched.
static char* Example12ThreaderCreate1(Example12ThreaderTeam1** team4, ptrdiff_t nt3) {
    if (__builtin_expect(nt3 < 1, 0)) {
        return Example12Errmsg1(__LINE__, "too few threads");
    }
    Example12ThreaderTeam1* fresh = calloc(1, sizeof(Example12ThreaderTeam1));
    if (__builtin_expect(fresh == 0, 0)) {
        return Example12Errmsg1(__LINE__, "errno %d", errno);
    }
    char* failure = Example12ThreaderCreate1Up1(fresh, nt3);
    if (__builtin_expect(failure == 0, 1)) {
        *team4 = fresh;
        return 0;
    }
    Example12ThreaderDestroy1(fresh);
    return failure;
}

// Copies the pthread_t of worker idx1 of the team into *thr2.
// Returns 0 on success, or a malloc'd "bad thread idx" error string when
// idx1 is negative or not below the team's thread count (in which case
// *thr2 is not written).
static char* Example12ThreaderPthreadT1(
pthread_t* thr2,
Example12ThreaderTeam1* team9,
ptrdiff_t idx1
) {
    int inRange = idx1 >= 0 && idx1 < team9->nt1;
    if (__builtin_expect(!inRange, 0)) {
        return Example12Errmsg1(__LINE__, "bad thread idx");
    }
    Example12ThreaderNode1* chosen = team9->nodes2+idx1;
    *thr2 = chosen->thr1;
    return 0;
}

// Runs task3 on the team and returns once every worker has reported that it
// is finished. The task's grid (the product of the first nd1 extents of
// hull1) is split into one contiguous run of points per worker; a task with
// nd1 < 1 does nothing. Every pthread call is retried while it returns
// nonzero.
static void Example12ThreaderDo1(Example12ThreaderTeam1* team10, Example12ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return;
// Total number of points in the grid.
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
// Every worker gets each1 points; the first more1 workers get one extra.
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
// each1 expressed as grid coordinates: the step between consecutive shares.
int64_t plus2[4];
Example12ThreaderPut1(nd6, task3->hull1, plus2, each1);
// Starting point of the share currently being handed out.
int64_t pt6[4] = {0};
Example12ThreaderHub1* hub6 = team10->hub2;
// The hub mutex is held from here until the wait below, so a worker that
// finishes early blocks on it until the hub state has been reset.
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
Example12ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
// 1 when this worker receives one of the leftover points.
int64_t carry3 = i4 < more1;
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
// Wake the worker now that its share and task are in place.
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
// Advance the start point by this worker's share (each1 plus the extra).
Example12ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Reset the scan position and mark every node as busy. All bits of words
// 0..nt8/(bits per long) are set, which includes bit index nt8.
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
// Wait until all nt8 workers have decremented pending1 to zero; the wait
// releases the hub mutex while sleeping.
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

// Approximates exp(x) for each of the 16 float lanes of x1.
// Steps:
//   1. clamp x to [-87.33654, 88.72284];
//   2. t = x * 1.442695 (about log2(e)) and r = t rounded to nearest integer;
//   3. f = x - r*ln(2), with ln(2) applied as two constants
//      (0.69314575 + 1.4286068e-06) in two fused multiply-adds;
//   4. evaluate a degree-4 polynomial in f by Horner's rule;
//   5. scale by 2^r by adding the integer, shifted into the float exponent
//      field (bit 23), to the bit pattern of the polynomial value.
// NOTE(review): the integer in step 5 comes from converting t, not r; the
// two agree only if that conversion also rounds to nearest. Confirm it does.
static __m512 Example12Exp1(__m512 x1) {
    const __m512 floorX = _mm512_set1_ps(-8.733654e+01f);
    const __m512 ceilX = _mm512_set1_ps(8.872284e+01f);
    __m512 x = _mm512_min_ps(_mm512_max_ps(x1, floorX), ceilX);
    __m512 scaled = _mm512_mul_ps(x, _mm512_set1_ps(1.442695e+00f));
    __m512 whole = _mm512_roundscale_ps(scaled, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
    __m512 frac = _mm512_fmadd_ps(whole, _mm512_set1_ps(-6.9314575e-01f), x);
    frac = _mm512_fmadd_ps(whole, _mm512_set1_ps(-1.4286068e-06f), frac);
    __m512 poly = _mm512_set1_ps(4.194439e-02f);
    poly = _mm512_fmadd_ps(poly, frac, _mm512_set1_ps(1.6800667e-01f));
    poly = _mm512_fmadd_ps(poly, frac, _mm512_set1_ps(4.9999994e-01f));
    poly = _mm512_fmadd_ps(poly, frac, _mm512_set1_ps(9.999569e-01f));
    poly = _mm512_fmadd_ps(poly, frac, _mm512_set1_ps(9.9999964e-01f));
    __m512i expoBits = _mm512_slli_epi32(_mm512_cvtps_epi32(scaled), 23);
    __m512i polyBits = _mm512_castps_si512(poly);
    return _mm512_castsi512_ps(_mm512_add_epi32(expoBits, polyBits));
}

// Approximates 1/sqrt(x) for each of the 16 float lanes of x2: starts from
// the hardware estimate y = rsqrt14(x) and applies one refinement step,
// returning (0.5*y) * (3 - y*(x*y)).
static __m512 Example12Rsqrt1(__m512 x2) {
    __m512 est = _mm512_rsqrt14_ps(x2);
    __m512 halfEst = _mm512_mul_ps(est, _mm512_set1_ps(5e-01f));
    __m512 xTimesEst = _mm512_mul_ps(x2, est);
    // 3 - est*(x*est), computed as one fused negate-multiply-add.
    __m512 correction = _mm512_fnmadd_ps(est, xTimesEst, _mm512_set1_ps(3e+00f));
    return _mm512_mul_ps(halfEst, correction);
}

static void Example12ThreeArrangeFilts1Callee1(Example12ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
ptrdiff_t b2 = pt7[0];
ptrdiff_t g2 = 0;
ptrdiff_t e1 = pt7[2];
if (e1 < 1) {
e1 = 0;
char*restrict bfPtr1 = tensors2[2]+3096*e1;
char*restrict wfPtr1 = tensors2[2]+6208+39232512*e1;
char*restrict wtPtr1 = tensors2[0]+14256*e1;
char*restrict biasPtr1 = tensors2[1];
ptrdiff_t i5 = 1*g2;
ptrdiff_t j1 = 1*b2;
ptrdiff_t jj1 = j1+0;
if (j1 < 193) {
for (; j1 != 193; ++j1) {
ptrdiff_t k1 = 0+1*j1;
ptrdiff_t cut1 = 0;
ptrdiff_t s1 = 0;
for (; s1 != 396; ++s1) {
__m512 wt1 = _mm512_maskz_loadu_ps(511, wtPtr1+0+15046560*i5+77760*j1+36*s1);
__m512 wt2 = _mm512_maskz_loadu_ps(511, wtPtr1+19440+15046560*i5+77760*j1+36*s1);
__m512 wt3 = _mm512_maskz_loadu_ps(511, wtPtr1+38880+15046560*i5+77760*j1+36*s1);
__m512 wt4 = _mm512_maskz_loadu_ps(511, wtPtr1+58320+15046560*i5+77760*j1+36*s1);
__m512i pm1 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm2 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp1 = _mm512_permutex2var_ps(wt1, pm1, wt3);
__m512 tmp2 = _mm512_permutex2var_ps(wt2, pm1, wt4);
__m512 tmp3 = _mm512_permutex2var_ps(wt1, pm2, wt3);
__m512 tmp4 = _mm512_permutex2var_ps(wt2, pm2, wt4);
__m512 in1 = _mm512_permutex2var_ps(tmp1, pm1, tmp2);
__m512 in2 = _mm512_permutex2var_ps(tmp1, pm2, tmp2);
__m512 in3 = _mm512_permutex2var_ps(tmp3, pm1, tmp4);
__m512 tmp17 = _mm512_fmadd_ps(in1, _mm512_set1_ps(4e+00f), in3);
__m512 tmp18 = _mm512_add_ps(in1, in3);
__m512 tmp19 = _mm512_fmadd_ps(in3, _mm512_set1_ps(4e+00f), in1);
__m512 tmp20 = _mm512_add_ps(in2, tmp18);
__m512 tmp21 = _mm512_fmadd_ps(in2, _mm512_set1_ps(2e+00f), tmp19);
tmp19 = _mm512_fnmadd_ps(in2, _mm512_set1_ps(2e+00f), tmp19);
__m512 tmp22 = _mm512_fnmadd_ps(in2, _mm512_set1_ps(2e+00f), tmp17);
tmp17 = _mm512_fmadd_ps(in2, _mm512_set1_ps(2e+00f), tmp17);
tmp18 = _mm512_sub_ps(tmp18, in2);
__m512 tmp39 = _mm512_unpacklo_ps(in1, tmp20);
__m512 tmp40 = _mm512_unpackhi_ps(in1, tmp20);
__m512 tmp41 = _mm512_unpacklo_ps(tmp18, tmp21);
__m512 tmp42 = _mm512_unpackhi_ps(tmp18, tmp21);
__m512 tmp43 = _mm512_unpacklo_ps(tmp19, tmp17);
__m512 tmp44 = _mm512_unpackhi_ps(tmp19, tmp17);
__m512 tmp45 = _mm512_unpacklo_ps(tmp22, in3);
__m512 tmp46 = _mm512_unpackhi_ps(tmp22, in3);
__m512 tmp47 = _mm512_shuffle_ps(tmp39, tmp41, 68);
__m512 tmp48 = _mm512_shuffle_ps(tmp39, tmp41, 238);
__m512 tmp49 = _mm512_shuffle_ps(tmp40, tmp42, 68);
__m512 tmp50 = _mm512_shuffle_ps(tmp40, tmp42, 238);
__m512 tmp51 = _mm512_shuffle_ps(tmp43, tmp45, 68);
__m512 tmp52 = _mm512_shuffle_ps(tmp43, tmp45, 238);
__m512 tmp53 = _mm512_shuffle_ps(tmp44, tmp46, 68);
__m512 tmp54 = _mm512_shuffle_ps(tmp44, tmp46, 238);
__m512 tmp55 = _mm512_shuffle_f32x4(tmp47, tmp51, 136);
__m512 tmp56 = _mm512_shuffle_f32x4(tmp47, tmp51, 221);
__m512 tmp57 = _mm512_shuffle_f32x4(tmp48, tmp52, 136);
__m512 tmp58 = _mm512_shuffle_f32x4(tmp48, tmp52, 221);
__m512 tmp59 = _mm512_shuffle_f32x4(tmp49, tmp53, 136);
__m512 tmp60 = _mm512_shuffle_f32x4(tmp49, tmp53, 221);
__m512 tmp61 = _mm512_shuffle_f32x4(tmp50, tmp54, 136);
__m512 tmp62 = _mm512_shuffle_f32x4(tmp50, tmp54, 221);
in1 = _mm512_shuffle_f32x4(tmp55, tmp55, 136);
__m512 tmp23 = _mm512_shuffle_f32x4(tmp55, tmp55, 221);
tmp20 = _mm512_shuffle_f32x4(tmp57, tmp57, 136);
__m512 tmp24 = _mm512_shuffle_f32x4(tmp57, tmp57, 221);
tmp18 = _mm512_shuffle_f32x4(tmp59, tmp59, 136);
__m512 tmp25 = _mm512_shuffle_f32x4(tmp59, tmp59, 221);
tmp21 = _mm512_shuffle_f32x4(tmp61, tmp61, 136);
__m512 tmp26 = _mm512_shuffle_f32x4(tmp61, tmp61, 221);
tmp19 = _mm512_shuffle_f32x4(tmp56, tmp56, 136);
tmp17 = _mm512_shuffle_f32x4(tmp58, tmp58, 136);
tmp22 = _mm512_shuffle_f32x4(tmp60, tmp60, 136);
in3 = _mm512_shuffle_f32x4(tmp62, tmp62, 136);
in1 = _mm512_shuffle_f32x4(in1, tmp21, 68);
tmp20 = _mm512_shuffle_f32x4(tmp20, tmp19, 68);
tmp18 = _mm512_shuffle_f32x4(tmp18, tmp17, 68);
tmp22 = _mm512_shuffle_f32x4(tmp22, tmp24, 68);
in3 = _mm512_shuffle_f32x4(in3, tmp25, 68);
tmp23 = _mm512_shuffle_f32x4(tmp23, tmp26, 68);
__m512 tmp27 = _mm512_fmadd_ps(in1, _mm512_set1_ps(4e+00f), tmp18);
__m512 tmp33 = _mm512_fmadd_ps(tmp22, _mm512_set1_ps(4e+00f), tmp23);
__m512 tmp28 = _mm512_add_ps(in1, tmp18);
__m512 tmp34 = _mm512_add_ps(tmp22, tmp23);
__m512 tmp29 = _mm512_fmadd_ps(tmp18, _mm512_set1_ps(4e+00f), in1);
__m512 tmp35 = _mm512_fmadd_ps(tmp23, _mm512_set1_ps(4e+00f), tmp22);
__m512 tmp30 = _mm512_add_ps(tmp20, tmp28);
__m512 tmp36 = _mm512_add_ps(in3, tmp34);
__m512 tmp31 = _mm512_fmadd_ps(tmp20, _mm512_set1_ps(2e+00f), tmp29);
__m512 tmp37 = _mm512_fmadd_ps(in3, _mm512_set1_ps(2e+00f), tmp35);
tmp29 = _mm512_fnmadd_ps(tmp20, _mm512_set1_ps(2e+00f), tmp29);
tmp35 = _mm512_fnmadd_ps(in3, _mm512_set1_ps(2e+00f), tmp35);
__m512 tmp32 = _mm512_fnmadd_ps(tmp20, _mm512_set1_ps(2e+00f), tmp27);
__m512 tmp38 = _mm512_fnmadd_ps(in3, _mm512_set1_ps(2e+00f), tmp33);
tmp27 = _mm512_fmadd_ps(tmp20, _mm512_set1_ps(2e+00f), tmp27);
tmp33 = _mm512_fmadd_ps(in3, _mm512_set1_ps(2e+00f), tmp33);
tmp28 = _mm512_sub_ps(tmp28, tmp20);
tmp34 = _mm512_sub_ps(tmp34, in3);
in1 = _mm512_mul_ps(in1, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp30 = _mm512_mul_ps(tmp30, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp28 = _mm512_mul_ps(tmp28, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp31 = _mm512_mul_ps(tmp31, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp29 = _mm512_mul_ps(tmp29, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp27 = _mm512_mul_ps(tmp27, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp32 = _mm512_mul_ps(tmp32, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp18 = _mm512_mul_ps(tmp18, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp22 = _mm512_mul_ps(tmp22, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp36 = _mm512_mul_ps(tmp36, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp34 = _mm512_mul_ps(tmp34, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp37 = _mm512_mul_ps(tmp37, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp35 = _mm512_mul_ps(tmp35, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp33 = _mm512_mul_ps(tmp33, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp38 = _mm512_mul_ps(tmp38, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp23 = _mm512_mul_ps(tmp23, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
__m512 out1 = _mm512_shuffle_f32x4(in1, tmp30, 68);
__m512 out5 = _mm512_shuffle_f32x4(in1, tmp30, 238);
__m512 out2 = _mm512_shuffle_f32x4(tmp28, tmp31, 68);
__m512 out6 = _mm512_shuffle_f32x4(tmp28, tmp31, 238);
__m512 out3 = _mm512_shuffle_f32x4(tmp29, tmp27, 68);
__m512 out7 = _mm512_shuffle_f32x4(tmp29, tmp27, 238);
__m512 out4 = _mm512_shuffle_f32x4(tmp32, tmp18, 68);
__m512 out8 = _mm512_shuffle_f32x4(tmp32, tmp18, 238);
__m512 out9 = _mm512_shuffle_f32x4(tmp22, tmp36, 68);
__m512 out13 = _mm512_shuffle_f32x4(tmp22, tmp36, 238);
__m512 out10 = _mm512_shuffle_f32x4(tmp34, tmp37, 68);
__m512 out14 = _mm512_shuffle_f32x4(tmp34, tmp37, 238);
__m512 out11 = _mm512_shuffle_f32x4(tmp35, tmp33, 68);
__m512 out15 = _mm512_shuffle_f32x4(tmp35, tmp33, 238);
__m512 out12 = _mm512_shuffle_f32x4(tmp38, tmp23, 68);
__m512 out16 = _mm512_shuffle_f32x4(tmp38, tmp23, 238);
ptrdiff_t off1 = 32*cut1;
ptrdiff_t off2 = (size_t)(cut1+1)/4*50688+(size_t)(cut1+1)%4*32;
ptrdiff_t off3 = (size_t)(cut1+2)/4*50688+(size_t)(cut1+2)%4*32;
ptrdiff_t off4 = (size_t)(cut1+3)/4*50688+(size_t)(cut1+3)%4*32;
__m512i wf1 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf2 = _mm512_castsi256_si512(_mm512_cvtps_ph(out5, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf3 = _mm512_castsi256_si512(_mm512_cvtps_ph(out9, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf4 = _mm512_castsi256_si512(_mm512_cvtps_ph(out13, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf5 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf6 = _mm512_castsi256_si512(_mm512_cvtps_ph(out6, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf7 = _mm512_castsi256_si512(_mm512_cvtps_ph(out10, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf8 = _mm512_castsi256_si512(_mm512_cvtps_ph(out14, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf9 = _mm512_castsi256_si512(_mm512_cvtps_ph(out3, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf10 = _mm512_castsi256_si512(_mm512_cvtps_ph(out7, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf11 = _mm512_castsi256_si512(_mm512_cvtps_ph(out11, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf12 = _mm512_castsi256_si512(_mm512_cvtps_ph(out15, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf13 = _mm512_castsi256_si512(_mm512_cvtps_ph(out4, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf14 = _mm512_castsi256_si512(_mm512_cvtps_ph(out8, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf15 = _mm512_castsi256_si512(_mm512_cvtps_ph(out12, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf16 = _mm512_castsi256_si512(_mm512_cvtps_ph(out16, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
_mm512_mask_storeu_epi32(wfPtr1+0+39232512*i5+50688*k1+off1+128*s1, 255, wf1);
_mm512_mask_storeu_epi32(wfPtr1+0+39232512*i5+50688*k1+off2+128*s1, 255, wf2);
_mm512_mask_storeu_epi32(wfPtr1+0+39232512*i5+50688*k1+off3+128*s1, 255, wf3);
_mm512_mask_storeu_epi32(wfPtr1+0+39232512*i5+50688*k1+off4+128*s1, 255, wf4);
_mm512_mask_storeu_epi32(wfPtr1+9808128+39232512*i5+50688*k1+off1+128*s1, 255, wf5);
_mm512_mask_storeu_epi32(wfPtr1+9808128+39232512*i5+50688*k1+off2+128*s1, 255, wf6);
_mm512_mask_storeu_epi32(wfPtr1+9808128+39232512*i5+50688*k1+off3+128*s1, 255, wf7);
_mm512_mask_storeu_epi32(wfPtr1+9808128+39232512*i5+50688*k1+off4+128*s1, 255, wf8);
_mm512_mask_storeu_epi32(wfPtr1+19616256+39232512*i5+50688*k1+off1+128*s1, 255, wf9);
_mm512_mask_storeu_epi32(wfPtr1+19616256+39232512*i5+50688*k1+off2+128*s1, 255, wf10);
_mm512_mask_storeu_epi32(wfPtr1+19616256+39232512*i5+50688*k1+off3+128*s1, 255, wf11);
_mm512_mask_storeu_epi32(wfPtr1+19616256+39232512*i5+50688*k1+off4+128*s1, 255, wf12);
_mm512_mask_storeu_epi32(wfPtr1+29424384+39232512*i5+50688*k1+off1+128*s1, 255, wf13);
_mm512_mask_storeu_epi32(wfPtr1+29424384+39232512*i5+50688*k1+off2+128*s1, 255, wf14);
_mm512_mask_storeu_epi32(wfPtr1+29424384+39232512*i5+50688*k1+off3+128*s1, 255, wf15);
_mm512_mask_storeu_epi32(wfPtr1+29424384+39232512*i5+50688*k1+off4+128*s1, 255, wf16);
}
__m512 bias1 = _mm512_setzero_ps();
if (!e1) {
bias1 = _mm512_maskz_loadu_ps(15, biasPtr1-0+3096*i5+16*j1);
}
_mm512_mask_storeu_ps(bfPtr1-0+3096*i5+16*j1, 15, bias1);
if (j1 >= jj1) return;
}
}
if (j1 == 193) {
ptrdiff_t k2 = 0+1*j1;
ptrdiff_t cut2 = 0;
ptrdiff_t s2 = 0;
for (; s2 != 198; ++s2) {
__m512 wt5 = _mm512_maskz_loadu_ps(511, wtPtr1+0+15046560*i5+77760*j1+72*s2);
__m512 wt6 = _mm512_maskz_loadu_ps(511, wtPtr1+36+15046560*i5+77760*j1+72*s2);
__m512 wt7 = _mm512_maskz_loadu_ps(511, wtPtr1+19440+15046560*i5+77760*j1+72*s2);
__m512 wt8 = _mm512_maskz_loadu_ps(511, wtPtr1+19476+15046560*i5+77760*j1+72*s2);
__m512i pm3 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm4 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp5 = _mm512_permutex2var_ps(wt5, pm3, wt7);
__m512 tmp6 = _mm512_permutex2var_ps(wt6, pm3, wt8);
__m512 tmp7 = _mm512_permutex2var_ps(wt5, pm4, wt7);
__m512 tmp8 = _mm512_permutex2var_ps(wt6, pm4, wt8);
__m512 in4 = _mm512_permutex2var_ps(tmp5, pm3, tmp6);
__m512 in5 = _mm512_permutex2var_ps(tmp5, pm4, tmp6);
__m512 in6 = _mm512_permutex2var_ps(tmp7, pm3, tmp8);
__m512 tmp63 = _mm512_fmadd_ps(in4, _mm512_set1_ps(4e+00f), in6);
__m512 tmp64 = _mm512_add_ps(in4, in6);
__m512 tmp65 = _mm512_fmadd_ps(in6, _mm512_set1_ps(4e+00f), in4);
__m512 tmp66 = _mm512_add_ps(in5, tmp64);
__m512 tmp67 = _mm512_fmadd_ps(in5, _mm512_set1_ps(2e+00f), tmp65);
tmp65 = _mm512_fnmadd_ps(in5, _mm512_set1_ps(2e+00f), tmp65);
__m512 tmp68 = _mm512_fnmadd_ps(in5, _mm512_set1_ps(2e+00f), tmp63);
tmp63 = _mm512_fmadd_ps(in5, _mm512_set1_ps(2e+00f), tmp63);
tmp64 = _mm512_sub_ps(tmp64, in5);
__m512 tmp85 = _mm512_unpacklo_ps(in4, tmp66);
__m512 tmp86 = _mm512_unpackhi_ps(in4, tmp66);
__m512 tmp87 = _mm512_unpacklo_ps(tmp64, tmp67);
__m512 tmp88 = _mm512_unpackhi_ps(tmp64, tmp67);
__m512 tmp89 = _mm512_unpacklo_ps(tmp65, tmp63);
__m512 tmp90 = _mm512_unpackhi_ps(tmp65, tmp63);
__m512 tmp91 = _mm512_unpacklo_ps(tmp68, in6);
__m512 tmp92 = _mm512_unpackhi_ps(tmp68, in6);
__m512 tmp93 = _mm512_shuffle_ps(tmp85, tmp87, 68);
__m512 tmp94 = _mm512_shuffle_ps(tmp85, tmp87, 238);
__m512 tmp95 = _mm512_shuffle_ps(tmp86, tmp88, 68);
__m512 tmp96 = _mm512_shuffle_ps(tmp86, tmp88, 238);
__m512 tmp97 = _mm512_shuffle_ps(tmp89, tmp91, 68);
__m512 tmp98 = _mm512_shuffle_ps(tmp89, tmp91, 238);
__m512 tmp99 = _mm512_shuffle_ps(tmp90, tmp92, 68);
__m512 tmp100 = _mm512_shuffle_ps(tmp90, tmp92, 238);
__m512 tmp101 = _mm512_shuffle_f32x4(tmp93, tmp97, 136);
__m512 tmp102 = _mm512_shuffle_f32x4(tmp93, tmp97, 221);
__m512 tmp103 = _mm512_shuffle_f32x4(tmp94, tmp98, 136);
__m512 tmp104 = _mm512_shuffle_f32x4(tmp94, tmp98, 221);
__m512 tmp105 = _mm512_shuffle_f32x4(tmp95, tmp99, 136);
__m512 tmp106 = _mm512_shuffle_f32x4(tmp95, tmp99, 221);
__m512 tmp107 = _mm512_shuffle_f32x4(tmp96, tmp100, 136);
__m512 tmp108 = _mm512_shuffle_f32x4(tmp96, tmp100, 221);
in4 = _mm512_shuffle_f32x4(tmp101, tmp101, 136);
__m512 tmp69 = _mm512_shuffle_f32x4(tmp101, tmp101, 221);
tmp66 = _mm512_shuffle_f32x4(tmp103, tmp103, 136);
__m512 tmp70 = _mm512_shuffle_f32x4(tmp103, tmp103, 221);
tmp64 = _mm512_shuffle_f32x4(tmp105, tmp105, 136);
__m512 tmp71 = _mm512_shuffle_f32x4(tmp105, tmp105, 221);
tmp67 = _mm512_shuffle_f32x4(tmp107, tmp107, 136);
__m512 tmp72 = _mm512_shuffle_f32x4(tmp107, tmp107, 221);
tmp65 = _mm512_shuffle_f32x4(tmp102, tmp102, 136);
tmp63 = _mm512_shuffle_f32x4(tmp104, tmp104, 136);
tmp68 = _mm512_shuffle_f32x4(tmp106, tmp106, 136);
in6 = _mm512_shuffle_f32x4(tmp108, tmp108, 136);
in4 = _mm512_shuffle_f32x4(in4, tmp67, 68);
tmp66 = _mm512_shuffle_f32x4(tmp66, tmp65, 68);
tmp64 = _mm512_shuffle_f32x4(tmp64, tmp63, 68);
tmp68 = _mm512_shuffle_f32x4(tmp68, tmp70, 68);
in6 = _mm512_shuffle_f32x4(in6, tmp71, 68);
tmp69 = _mm512_shuffle_f32x4(tmp69, tmp72, 68);
__m512 tmp73 = _mm512_fmadd_ps(in4, _mm512_set1_ps(4e+00f), tmp64);
__m512 tmp79 = _mm512_fmadd_ps(tmp68, _mm512_set1_ps(4e+00f), tmp69);
__m512 tmp74 = _mm512_add_ps(in4, tmp64);
__m512 tmp80 = _mm512_add_ps(tmp68, tmp69);
__m512 tmp75 = _mm512_fmadd_ps(tmp64, _mm512_set1_ps(4e+00f), in4);
__m512 tmp81 = _mm512_fmadd_ps(tmp69, _mm512_set1_ps(4e+00f), tmp68);
__m512 tmp76 = _mm512_add_ps(tmp66, tmp74);
__m512 tmp82 = _mm512_add_ps(in6, tmp80);
__m512 tmp77 = _mm512_fmadd_ps(tmp66, _mm512_set1_ps(2e+00f), tmp75);
__m512 tmp83 = _mm512_fmadd_ps(in6, _mm512_set1_ps(2e+00f), tmp81);
tmp75 = _mm512_fnmadd_ps(tmp66, _mm512_set1_ps(2e+00f), tmp75);
tmp81 = _mm512_fnmadd_ps(in6, _mm512_set1_ps(2e+00f), tmp81);
__m512 tmp78 = _mm512_fnmadd_ps(tmp66, _mm512_set1_ps(2e+00f), tmp73);
__m512 tmp84 = _mm512_fnmadd_ps(in6, _mm512_set1_ps(2e+00f), tmp79);
tmp73 = _mm512_fmadd_ps(tmp66, _mm512_set1_ps(2e+00f), tmp73);
tmp79 = _mm512_fmadd_ps(in6, _mm512_set1_ps(2e+00f), tmp79);
tmp74 = _mm512_sub_ps(tmp74, tmp66);
tmp80 = _mm512_sub_ps(tmp80, in6);
in4 = _mm512_mul_ps(in4, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp76 = _mm512_mul_ps(tmp76, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp74 = _mm512_mul_ps(tmp74, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp77 = _mm512_mul_ps(tmp77, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp75 = _mm512_mul_ps(tmp75, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp73 = _mm512_mul_ps(tmp73, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp78 = _mm512_mul_ps(tmp78, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp64 = _mm512_mul_ps(tmp64, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp68 = _mm512_mul_ps(tmp68, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp82 = _mm512_mul_ps(tmp82, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp80 = _mm512_mul_ps(tmp80, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp83 = _mm512_mul_ps(tmp83, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp81 = _mm512_mul_ps(tmp81, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp79 = _mm512_mul_ps(tmp79, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp84 = _mm512_mul_ps(tmp84, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp69 = _mm512_mul_ps(tmp69, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
__m512 out17 = _mm512_shuffle_f32x4(in4, tmp76, 68);
__m512 out21 = _mm512_shuffle_f32x4(in4, tmp76, 238);
__m512 out18 = _mm512_shuffle_f32x4(tmp74, tmp77, 68);
__m512 out22 = _mm512_shuffle_f32x4(tmp74, tmp77, 238);
__m512 out19 = _mm512_shuffle_f32x4(tmp75, tmp73, 68);
__m512 out23 = _mm512_shuffle_f32x4(tmp75, tmp73, 238);
__m512 out20 = _mm512_shuffle_f32x4(tmp78, tmp64, 68);
__m512 out24 = _mm512_shuffle_f32x4(tmp78, tmp64, 238);
__m512 out25 = _mm512_shuffle_f32x4(tmp68, tmp82, 68);
__m512 out29 = _mm512_shuffle_f32x4(tmp68, tmp82, 238);
__m512 out26 = _mm512_shuffle_f32x4(tmp80, tmp83, 68);
__m512 out30 = _mm512_shuffle_f32x4(tmp80, tmp83, 238);
__m512 out27 = _mm512_shuffle_f32x4(tmp81, tmp79, 68);
__m512 out31 = _mm512_shuffle_f32x4(tmp81, tmp79, 238);
__m512 out28 = _mm512_shuffle_f32x4(tmp84, tmp69, 68);
__m512 out32 = _mm512_shuffle_f32x4(tmp84, tmp69, 238);
ptrdiff_t off5 = 32*cut2;
ptrdiff_t off6 = (size_t)(cut2+1)/4*50688+(size_t)(cut2+1)%4*32;
__m512i wf17 = _mm512_castsi256_si512(_mm512_cvtps_ph(out17, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf18 = _mm512_castsi256_si512(_mm512_cvtps_ph(out21, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf19 = _mm512_castsi256_si512(_mm512_cvtps_ph(out25, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf20 = _mm512_castsi256_si512(_mm512_cvtps_ph(out29, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf21 = _mm512_castsi256_si512(_mm512_cvtps_ph(out18, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf22 = _mm512_castsi256_si512(_mm512_cvtps_ph(out22, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf23 = _mm512_castsi256_si512(_mm512_cvtps_ph(out26, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf24 = _mm512_castsi256_si512(_mm512_cvtps_ph(out30, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf25 = _mm512_castsi256_si512(_mm512_cvtps_ph(out19, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf26 = _mm512_castsi256_si512(_mm512_cvtps_ph(out23, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf27 = _mm512_castsi256_si512(_mm512_cvtps_ph(out27, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf28 = _mm512_castsi256_si512(_mm512_cvtps_ph(out31, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf29 = _mm512_castsi256_si512(_mm512_cvtps_ph(out20, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf30 = _mm512_castsi256_si512(_mm512_cvtps_ph(out24, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf31 = _mm512_castsi256_si512(_mm512_cvtps_ph(out28, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf32 = _mm512_castsi256_si512(_mm512_cvtps_ph(out32, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
_mm512_mask_storeu_epi32(wfPtr1+0+39232512*i5+50688*k2+off5+128*s2, 255, wf17);
_mm512_mask_storeu_epi32(wfPtr1+64+39232512*i5+50688*k2+off5+128*s2, 255, wf18);
_mm512_mask_storeu_epi32(wfPtr1+0+39232512*i5+50688*k2+off6+128*s2, 255, wf19);
_mm512_mask_storeu_epi32(wfPtr1+64+39232512*i5+50688*k2+off6+128*s2, 255, wf20);
_mm512_mask_storeu_epi32(wfPtr1+9808128+39232512*i5+50688*k2+off5+128*s2, 255, wf21);
_mm512_mask_storeu_epi32(wfPtr1+9808192+39232512*i5+50688*k2+off5+128*s2, 255, wf22);
_mm512_mask_storeu_epi32(wfPtr1+9808128+39232512*i5+50688*k2+off6+128*s2, 255, wf23);
_mm512_mask_storeu_epi32(wfPtr1+9808192+39232512*i5+50688*k2+off6+128*s2, 255, wf24);
_mm512_mask_storeu_epi32(wfPtr1+19616256+39232512*i5+50688*k2+off5+128*s2, 255, wf25);
_mm512_mask_storeu_epi32(wfPtr1+19616320+39232512*i5+50688*k2+off5+128*s2, 255, wf26);
_mm512_mask_storeu_epi32(wfPtr1+19616256+39232512*i5+50688*k2+off6+128*s2, 255, wf27);
_mm512_mask_storeu_epi32(wfPtr1+19616320+39232512*i5+50688*k2+off6+128*s2, 255, wf28);
_mm512_mask_storeu_epi32(wfPtr1+29424384+39232512*i5+50688*k2+off5+128*s2, 255, wf29);
_mm512_mask_storeu_epi32(wfPtr1+29424448+39232512*i5+50688*k2+off5+128*s2, 255, wf30);
_mm512_mask_storeu_epi32(wfPtr1+29424384+39232512*i5+50688*k2+off6+128*s2, 255, wf31);
_mm512_mask_storeu_epi32(wfPtr1+29424448+39232512*i5+50688*k2+off6+128*s2, 255, wf32);
}
__m512 bias2 = _mm512_setzero_ps();
if (!e1) {
bias2 = _mm512_maskz_loadu_ps(3, biasPtr1-0+3096*i5+16*j1);
}
_mm512_mask_storeu_ps(bfPtr1-0+3096*i5+16*j1, 3, bias2);
if (j1 >= jj1) return;
j1 = 194;
}
return;
}
e1 = 1;
char*restrict bfPtr2 = tensors2[2]+3096*e1;
char*restrict wfPtr2 = tensors2[2]+6208+39232512*e1;
char*restrict wtPtr2 = tensors2[0]+14256*e1;
ptrdiff_t i6 = 1*g2;
ptrdiff_t j2 = 1*b2;
ptrdiff_t jj2 = j2+0;
if (j2 < 193) {
for (; j2 != 193; ++j2) {
ptrdiff_t k3 = 0+1*j2;
ptrdiff_t cut3 = 0;
ptrdiff_t s3 = 0;
for (; s3 != 144; ++s3) {
__m512 wt9 = _mm512_maskz_loadu_ps(511, wtPtr2+0+15046560*i6+77760*j2+36*s3);
__m512 wt10 = _mm512_maskz_loadu_ps(511, wtPtr2+19440+15046560*i6+77760*j2+36*s3);
__m512 wt11 = _mm512_maskz_loadu_ps(511, wtPtr2+38880+15046560*i6+77760*j2+36*s3);
__m512 wt12 = _mm512_maskz_loadu_ps(511, wtPtr2+58320+15046560*i6+77760*j2+36*s3);
__m512i pm5 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm6 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp9 = _mm512_permutex2var_ps(wt9, pm5, wt11);
__m512 tmp10 = _mm512_permutex2var_ps(wt10, pm5, wt12);
__m512 tmp11 = _mm512_permutex2var_ps(wt9, pm6, wt11);
__m512 tmp12 = _mm512_permutex2var_ps(wt10, pm6, wt12);
__m512 in7 = _mm512_permutex2var_ps(tmp9, pm5, tmp10);
__m512 in8 = _mm512_permutex2var_ps(tmp9, pm6, tmp10);
__m512 in9 = _mm512_permutex2var_ps(tmp11, pm5, tmp12);
__m512 tmp109 = _mm512_fmadd_ps(in7, _mm512_set1_ps(4e+00f), in9);
__m512 tmp110 = _mm512_add_ps(in7, in9);
__m512 tmp111 = _mm512_fmadd_ps(in9, _mm512_set1_ps(4e+00f), in7);
__m512 tmp112 = _mm512_add_ps(in8, tmp110);
__m512 tmp113 = _mm512_fmadd_ps(in8, _mm512_set1_ps(2e+00f), tmp111);
tmp111 = _mm512_fnmadd_ps(in8, _mm512_set1_ps(2e+00f), tmp111);
__m512 tmp114 = _mm512_fnmadd_ps(in8, _mm512_set1_ps(2e+00f), tmp109);
tmp109 = _mm512_fmadd_ps(in8, _mm512_set1_ps(2e+00f), tmp109);
tmp110 = _mm512_sub_ps(tmp110, in8);
__m512 tmp131 = _mm512_unpacklo_ps(in7, tmp112);
__m512 tmp132 = _mm512_unpackhi_ps(in7, tmp112);
__m512 tmp133 = _mm512_unpacklo_ps(tmp110, tmp113);
__m512 tmp134 = _mm512_unpackhi_ps(tmp110, tmp113);
__m512 tmp135 = _mm512_unpacklo_ps(tmp111, tmp109);
__m512 tmp136 = _mm512_unpackhi_ps(tmp111, tmp109);
__m512 tmp137 = _mm512_unpacklo_ps(tmp114, in9);
__m512 tmp138 = _mm512_unpackhi_ps(tmp114, in9);
__m512 tmp139 = _mm512_shuffle_ps(tmp131, tmp133, 68);
__m512 tmp140 = _mm512_shuffle_ps(tmp131, tmp133, 238);
__m512 tmp141 = _mm512_shuffle_ps(tmp132, tmp134, 68);
__m512 tmp142 = _mm512_shuffle_ps(tmp132, tmp134, 238);
__m512 tmp143 = _mm512_shuffle_ps(tmp135, tmp137, 68);
__m512 tmp144 = _mm512_shuffle_ps(tmp135, tmp137, 238);
__m512 tmp145 = _mm512_shuffle_ps(tmp136, tmp138, 68);
__m512 tmp146 = _mm512_shuffle_ps(tmp136, tmp138, 238);
__m512 tmp147 = _mm512_shuffle_f32x4(tmp139, tmp143, 136);
__m512 tmp148 = _mm512_shuffle_f32x4(tmp139, tmp143, 221);
__m512 tmp149 = _mm512_shuffle_f32x4(tmp140, tmp144, 136);
__m512 tmp150 = _mm512_shuffle_f32x4(tmp140, tmp144, 221);
__m512 tmp151 = _mm512_shuffle_f32x4(tmp141, tmp145, 136);
__m512 tmp152 = _mm512_shuffle_f32x4(tmp141, tmp145, 221);
__m512 tmp153 = _mm512_shuffle_f32x4(tmp142, tmp146, 136);
__m512 tmp154 = _mm512_shuffle_f32x4(tmp142, tmp146, 221);
in7 = _mm512_shuffle_f32x4(tmp147, tmp147, 136);
__m512 tmp115 = _mm512_shuffle_f32x4(tmp147, tmp147, 221);
tmp112 = _mm512_shuffle_f32x4(tmp149, tmp149, 136);
__m512 tmp116 = _mm512_shuffle_f32x4(tmp149, tmp149, 221);
tmp110 = _mm512_shuffle_f32x4(tmp151, tmp151, 136);
__m512 tmp117 = _mm512_shuffle_f32x4(tmp151, tmp151, 221);
tmp113 = _mm512_shuffle_f32x4(tmp153, tmp153, 136);
__m512 tmp118 = _mm512_shuffle_f32x4(tmp153, tmp153, 221);
tmp111 = _mm512_shuffle_f32x4(tmp148, tmp148, 136);
tmp109 = _mm512_shuffle_f32x4(tmp150, tmp150, 136);
tmp114 = _mm512_shuffle_f32x4(tmp152, tmp152, 136);
in9 = _mm512_shuffle_f32x4(tmp154, tmp154, 136);
in7 = _mm512_shuffle_f32x4(in7, tmp113, 68);
tmp112 = _mm512_shuffle_f32x4(tmp112, tmp111, 68);
tmp110 = _mm512_shuffle_f32x4(tmp110, tmp109, 68);
tmp114 = _mm512_shuffle_f32x4(tmp114, tmp116, 68);
in9 = _mm512_shuffle_f32x4(in9, tmp117, 68);
tmp115 = _mm512_shuffle_f32x4(tmp115, tmp118, 68);
__m512 tmp119 = _mm512_fmadd_ps(in7, _mm512_set1_ps(4e+00f), tmp110);
__m512 tmp125 = _mm512_fmadd_ps(tmp114, _mm512_set1_ps(4e+00f), tmp115);
__m512 tmp120 = _mm512_add_ps(in7, tmp110);
__m512 tmp126 = _mm512_add_ps(tmp114, tmp115);
__m512 tmp121 = _mm512_fmadd_ps(tmp110, _mm512_set1_ps(4e+00f), in7);
__m512 tmp127 = _mm512_fmadd_ps(tmp115, _mm512_set1_ps(4e+00f), tmp114);
__m512 tmp122 = _mm512_add_ps(tmp112, tmp120);
__m512 tmp128 = _mm512_add_ps(in9, tmp126);
__m512 tmp123 = _mm512_fmadd_ps(tmp112, _mm512_set1_ps(2e+00f), tmp121);
__m512 tmp129 = _mm512_fmadd_ps(in9, _mm512_set1_ps(2e+00f), tmp127);
tmp121 = _mm512_fnmadd_ps(tmp112, _mm512_set1_ps(2e+00f), tmp121);
tmp127 = _mm512_fnmadd_ps(in9, _mm512_set1_ps(2e+00f), tmp127);
__m512 tmp124 = _mm512_fnmadd_ps(tmp112, _mm512_set1_ps(2e+00f), tmp119);
__m512 tmp130 = _mm512_fnmadd_ps(in9, _mm512_set1_ps(2e+00f), tmp125);
tmp119 = _mm512_fmadd_ps(tmp112, _mm512_set1_ps(2e+00f), tmp119);
tmp125 = _mm512_fmadd_ps(in9, _mm512_set1_ps(2e+00f), tmp125);
tmp120 = _mm512_sub_ps(tmp120, tmp112);
tmp126 = _mm512_sub_ps(tmp126, in9);
in7 = _mm512_mul_ps(in7, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp122 = _mm512_mul_ps(tmp122, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp120 = _mm512_mul_ps(tmp120, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp123 = _mm512_mul_ps(tmp123, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp121 = _mm512_mul_ps(tmp121, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp119 = _mm512_mul_ps(tmp119, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp124 = _mm512_mul_ps(tmp124, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp110 = _mm512_mul_ps(tmp110, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp114 = _mm512_mul_ps(tmp114, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp128 = _mm512_mul_ps(tmp128, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp126 = _mm512_mul_ps(tmp126, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp129 = _mm512_mul_ps(tmp129, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp127 = _mm512_mul_ps(tmp127, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp125 = _mm512_mul_ps(tmp125, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp130 = _mm512_mul_ps(tmp130, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp115 = _mm512_mul_ps(tmp115, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
__m512 out33 = _mm512_shuffle_f32x4(in7, tmp122, 68);
__m512 out37 = _mm512_shuffle_f32x4(in7, tmp122, 238);
__m512 out34 = _mm512_shuffle_f32x4(tmp120, tmp123, 68);
__m512 out38 = _mm512_shuffle_f32x4(tmp120, tmp123, 238);
__m512 out35 = _mm512_shuffle_f32x4(tmp121, tmp119, 68);
__m512 out39 = _mm512_shuffle_f32x4(tmp121, tmp119, 238);
__m512 out36 = _mm512_shuffle_f32x4(tmp124, tmp110, 68);
__m512 out40 = _mm512_shuffle_f32x4(tmp124, tmp110, 238);
__m512 out41 = _mm512_shuffle_f32x4(tmp114, tmp128, 68);
__m512 out45 = _mm512_shuffle_f32x4(tmp114, tmp128, 238);
__m512 out42 = _mm512_shuffle_f32x4(tmp126, tmp129, 68);
__m512 out46 = _mm512_shuffle_f32x4(tmp126, tmp129, 238);
__m512 out43 = _mm512_shuffle_f32x4(tmp127, tmp125, 68);
__m512 out47 = _mm512_shuffle_f32x4(tmp127, tmp125, 238);
__m512 out44 = _mm512_shuffle_f32x4(tmp130, tmp115, 68);
__m512 out48 = _mm512_shuffle_f32x4(tmp130, tmp115, 238);
ptrdiff_t off7 = 32*cut3;
ptrdiff_t off8 = (size_t)(cut3+1)/4*18432+(size_t)(cut3+1)%4*32;
ptrdiff_t off9 = (size_t)(cut3+2)/4*18432+(size_t)(cut3+2)%4*32;
ptrdiff_t off10 = (size_t)(cut3+3)/4*18432+(size_t)(cut3+3)%4*32;
__m512i wf33 = _mm512_castsi256_si512(_mm512_cvtps_ph(out33, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf34 = _mm512_castsi256_si512(_mm512_cvtps_ph(out37, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf35 = _mm512_castsi256_si512(_mm512_cvtps_ph(out41, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf36 = _mm512_castsi256_si512(_mm512_cvtps_ph(out45, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf37 = _mm512_castsi256_si512(_mm512_cvtps_ph(out34, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf38 = _mm512_castsi256_si512(_mm512_cvtps_ph(out38, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf39 = _mm512_castsi256_si512(_mm512_cvtps_ph(out42, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf40 = _mm512_castsi256_si512(_mm512_cvtps_ph(out46, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf41 = _mm512_castsi256_si512(_mm512_cvtps_ph(out35, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf42 = _mm512_castsi256_si512(_mm512_cvtps_ph(out39, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf43 = _mm512_castsi256_si512(_mm512_cvtps_ph(out43, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf44 = _mm512_castsi256_si512(_mm512_cvtps_ph(out47, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf45 = _mm512_castsi256_si512(_mm512_cvtps_ph(out36, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf46 = _mm512_castsi256_si512(_mm512_cvtps_ph(out40, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf47 = _mm512_castsi256_si512(_mm512_cvtps_ph(out44, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf48 = _mm512_castsi256_si512(_mm512_cvtps_ph(out48, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
_mm512_mask_storeu_epi32(wfPtr2+0+14266368*i6+18432*k3+off7+128*s3, 255, wf33);
_mm512_mask_storeu_epi32(wfPtr2+0+14266368*i6+18432*k3+off8+128*s3, 255, wf34);
_mm512_mask_storeu_epi32(wfPtr2+0+14266368*i6+18432*k3+off9+128*s3, 255, wf35);
_mm512_mask_storeu_epi32(wfPtr2+0+14266368*i6+18432*k3+off10+128*s3, 255, wf36);
_mm512_mask_storeu_epi32(wfPtr2+3566592+14266368*i6+18432*k3+off7+128*s3, 255, wf37);
_mm512_mask_storeu_epi32(wfPtr2+3566592+14266368*i6+18432*k3+off8+128*s3, 255, wf38);
_mm512_mask_storeu_epi32(wfPtr2+3566592+14266368*i6+18432*k3+off9+128*s3, 255, wf39);
_mm512_mask_storeu_epi32(wfPtr2+3566592+14266368*i6+18432*k3+off10+128*s3, 255, wf40);
_mm512_mask_storeu_epi32(wfPtr2+7133184+14266368*i6+18432*k3+off7+128*s3, 255, wf41);
_mm512_mask_storeu_epi32(wfPtr2+7133184+14266368*i6+18432*k3+off8+128*s3, 255, wf42);
_mm512_mask_storeu_epi32(wfPtr2+7133184+14266368*i6+18432*k3+off9+128*s3, 255, wf43);
_mm512_mask_storeu_epi32(wfPtr2+7133184+14266368*i6+18432*k3+off10+128*s3, 255, wf44);
_mm512_mask_storeu_epi32(wfPtr2+10699776+14266368*i6+18432*k3+off7+128*s3, 255, wf45);
_mm512_mask_storeu_epi32(wfPtr2+10699776+14266368*i6+18432*k3+off8+128*s3, 255, wf46);
_mm512_mask_storeu_epi32(wfPtr2+10699776+14266368*i6+18432*k3+off9+128*s3, 255, wf47);
_mm512_mask_storeu_epi32(wfPtr2+10699776+14266368*i6+18432*k3+off10+128*s3, 255, wf48);
}
_mm512_mask_storeu_ps(bfPtr2-0+3096*i6+16*j2, 15, _mm512_setzero_ps());
if (j2 >= jj2) return;
}
}
if (j2 == 193) {
ptrdiff_t k4 = 0+1*j2;
ptrdiff_t cut4 = 0;
ptrdiff_t s4 = 0;
for (; s4 != 72; ++s4) {
__m512 wt13 = _mm512_maskz_loadu_ps(511, wtPtr2+0+15046560*i6+77760*j2+72*s4);
__m512 wt14 = _mm512_maskz_loadu_ps(511, wtPtr2+36+15046560*i6+77760*j2+72*s4);
__m512 wt15 = _mm512_maskz_loadu_ps(511, wtPtr2+19440+15046560*i6+77760*j2+72*s4);
__m512 wt16 = _mm512_maskz_loadu_ps(511, wtPtr2+19476+15046560*i6+77760*j2+72*s4);
__m512i pm7 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm8 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp13 = _mm512_permutex2var_ps(wt13, pm7, wt15);
__m512 tmp14 = _mm512_permutex2var_ps(wt14, pm7, wt16);
__m512 tmp15 = _mm512_permutex2var_ps(wt13, pm8, wt15);
__m512 tmp16 = _mm512_permutex2var_ps(wt14, pm8, wt16);
__m512 in10 = _mm512_permutex2var_ps(tmp13, pm7, tmp14);
__m512 in11 = _mm512_permutex2var_ps(tmp13, pm8, tmp14);
__m512 in12 = _mm512_permutex2var_ps(tmp15, pm7, tmp16);
__m512 tmp155 = _mm512_fmadd_ps(in10, _mm512_set1_ps(4e+00f), in12);
__m512 tmp156 = _mm512_add_ps(in10, in12);
__m512 tmp157 = _mm512_fmadd_ps(in12, _mm512_set1_ps(4e+00f), in10);
__m512 tmp158 = _mm512_add_ps(in11, tmp156);
__m512 tmp159 = _mm512_fmadd_ps(in11, _mm512_set1_ps(2e+00f), tmp157);
tmp157 = _mm512_fnmadd_ps(in11, _mm512_set1_ps(2e+00f), tmp157);
__m512 tmp160 = _mm512_fnmadd_ps(in11, _mm512_set1_ps(2e+00f), tmp155);
tmp155 = _mm512_fmadd_ps(in11, _mm512_set1_ps(2e+00f), tmp155);
tmp156 = _mm512_sub_ps(tmp156, in11);
__m512 tmp177 = _mm512_unpacklo_ps(in10, tmp158);
__m512 tmp178 = _mm512_unpackhi_ps(in10, tmp158);
__m512 tmp179 = _mm512_unpacklo_ps(tmp156, tmp159);
__m512 tmp180 = _mm512_unpackhi_ps(tmp156, tmp159);
__m512 tmp181 = _mm512_unpacklo_ps(tmp157, tmp155);
__m512 tmp182 = _mm512_unpackhi_ps(tmp157, tmp155);
__m512 tmp183 = _mm512_unpacklo_ps(tmp160, in12);
__m512 tmp184 = _mm512_unpackhi_ps(tmp160, in12);
__m512 tmp185 = _mm512_shuffle_ps(tmp177, tmp179, 68);
__m512 tmp186 = _mm512_shuffle_ps(tmp177, tmp179, 238);
__m512 tmp187 = _mm512_shuffle_ps(tmp178, tmp180, 68);
__m512 tmp188 = _mm512_shuffle_ps(tmp178, tmp180, 238);
__m512 tmp189 = _mm512_shuffle_ps(tmp181, tmp183, 68);
__m512 tmp190 = _mm512_shuffle_ps(tmp181, tmp183, 238);
__m512 tmp191 = _mm512_shuffle_ps(tmp182, tmp184, 68);
__m512 tmp192 = _mm512_shuffle_ps(tmp182, tmp184, 238);
__m512 tmp193 = _mm512_shuffle_f32x4(tmp185, tmp189, 136);
__m512 tmp194 = _mm512_shuffle_f32x4(tmp185, tmp189, 221);
__m512 tmp195 = _mm512_shuffle_f32x4(tmp186, tmp190, 136);
__m512 tmp196 = _mm512_shuffle_f32x4(tmp186, tmp190, 221);
__m512 tmp197 = _mm512_shuffle_f32x4(tmp187, tmp191, 136);
__m512 tmp198 = _mm512_shuffle_f32x4(tmp187, tmp191, 221);
__m512 tmp199 = _mm512_shuffle_f32x4(tmp188, tmp192, 136);
__m512 tmp200 = _mm512_shuffle_f32x4(tmp188, tmp192, 221);
in10 = _mm512_shuffle_f32x4(tmp193, tmp193, 136);
__m512 tmp161 = _mm512_shuffle_f32x4(tmp193, tmp193, 221);
tmp158 = _mm512_shuffle_f32x4(tmp195, tmp195, 136);
__m512 tmp162 = _mm512_shuffle_f32x4(tmp195, tmp195, 221);
tmp156 = _mm512_shuffle_f32x4(tmp197, tmp197, 136);
__m512 tmp163 = _mm512_shuffle_f32x4(tmp197, tmp197, 221);
tmp159 = _mm512_shuffle_f32x4(tmp199, tmp199, 136);
__m512 tmp164 = _mm512_shuffle_f32x4(tmp199, tmp199, 221);
tmp157 = _mm512_shuffle_f32x4(tmp194, tmp194, 136);
tmp155 = _mm512_shuffle_f32x4(tmp196, tmp196, 136);
tmp160 = _mm512_shuffle_f32x4(tmp198, tmp198, 136);
in12 = _mm512_shuffle_f32x4(tmp200, tmp200, 136);
in10 = _mm512_shuffle_f32x4(in10, tmp159, 68);
tmp158 = _mm512_shuffle_f32x4(tmp158, tmp157, 68);
tmp156 = _mm512_shuffle_f32x4(tmp156, tmp155, 68);
tmp160 = _mm512_shuffle_f32x4(tmp160, tmp162, 68);
in12 = _mm512_shuffle_f32x4(in12, tmp163, 68);
tmp161 = _mm512_shuffle_f32x4(tmp161, tmp164, 68);
__m512 tmp165 = _mm512_fmadd_ps(in10, _mm512_set1_ps(4e+00f), tmp156);
__m512 tmp171 = _mm512_fmadd_ps(tmp160, _mm512_set1_ps(4e+00f), tmp161);
__m512 tmp166 = _mm512_add_ps(in10, tmp156);
__m512 tmp172 = _mm512_add_ps(tmp160, tmp161);
__m512 tmp167 = _mm512_fmadd_ps(tmp156, _mm512_set1_ps(4e+00f), in10);
__m512 tmp173 = _mm512_fmadd_ps(tmp161, _mm512_set1_ps(4e+00f), tmp160);
__m512 tmp168 = _mm512_add_ps(tmp158, tmp166);
__m512 tmp174 = _mm512_add_ps(in12, tmp172);
__m512 tmp169 = _mm512_fmadd_ps(tmp158, _mm512_set1_ps(2e+00f), tmp167);
__m512 tmp175 = _mm512_fmadd_ps(in12, _mm512_set1_ps(2e+00f), tmp173);
tmp167 = _mm512_fnmadd_ps(tmp158, _mm512_set1_ps(2e+00f), tmp167);
tmp173 = _mm512_fnmadd_ps(in12, _mm512_set1_ps(2e+00f), tmp173);
__m512 tmp170 = _mm512_fnmadd_ps(tmp158, _mm512_set1_ps(2e+00f), tmp165);
__m512 tmp176 = _mm512_fnmadd_ps(in12, _mm512_set1_ps(2e+00f), tmp171);
tmp165 = _mm512_fmadd_ps(tmp158, _mm512_set1_ps(2e+00f), tmp165);
tmp171 = _mm512_fmadd_ps(in12, _mm512_set1_ps(2e+00f), tmp171);
tmp166 = _mm512_sub_ps(tmp166, tmp158);
tmp172 = _mm512_sub_ps(tmp172, in12);
in10 = _mm512_mul_ps(in10, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp168 = _mm512_mul_ps(tmp168, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp166 = _mm512_mul_ps(tmp166, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp169 = _mm512_mul_ps(tmp169, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp167 = _mm512_mul_ps(tmp167, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp165 = _mm512_mul_ps(tmp165, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp170 = _mm512_mul_ps(tmp170, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp156 = _mm512_mul_ps(tmp156, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp160 = _mm512_mul_ps(tmp160, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp174 = _mm512_mul_ps(tmp174, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp172 = _mm512_mul_ps(tmp172, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp175 = _mm512_mul_ps(tmp175, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp173 = _mm512_mul_ps(tmp173, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp171 = _mm512_mul_ps(tmp171, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp176 = _mm512_mul_ps(tmp176, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp161 = _mm512_mul_ps(tmp161, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
__m512 out49 = _mm512_shuffle_f32x4(in10, tmp168, 68);
__m512 out53 = _mm512_shuffle_f32x4(in10, tmp168, 238);
__m512 out50 = _mm512_shuffle_f32x4(tmp166, tmp169, 68);
__m512 out54 = _mm512_shuffle_f32x4(tmp166, tmp169, 238);
__m512 out51 = _mm512_shuffle_f32x4(tmp167, tmp165, 68);
__m512 out55 = _mm512_shuffle_f32x4(tmp167, tmp165, 238);
__m512 out52 = _mm512_shuffle_f32x4(tmp170, tmp156, 68);
__m512 out56 = _mm512_shuffle_f32x4(tmp170, tmp156, 238);
__m512 out57 = _mm512_shuffle_f32x4(tmp160, tmp174, 68);
__m512 out61 = _mm512_shuffle_f32x4(tmp160, tmp174, 238);
__m512 out58 = _mm512_shuffle_f32x4(tmp172, tmp175, 68);
__m512 out62 = _mm512_shuffle_f32x4(tmp172, tmp175, 238);
__m512 out59 = _mm512_shuffle_f32x4(tmp173, tmp171, 68);
__m512 out63 = _mm512_shuffle_f32x4(tmp173, tmp171, 238);
__m512 out60 = _mm512_shuffle_f32x4(tmp176, tmp161, 68);
__m512 out64 = _mm512_shuffle_f32x4(tmp176, tmp161, 238);
ptrdiff_t off11 = 32*cut4;
ptrdiff_t off12 = (size_t)(cut4+1)/4*18432+(size_t)(cut4+1)%4*32;
__m512i wf49 = _mm512_castsi256_si512(_mm512_cvtps_ph(out49, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf50 = _mm512_castsi256_si512(_mm512_cvtps_ph(out53, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf51 = _mm512_castsi256_si512(_mm512_cvtps_ph(out57, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf52 = _mm512_castsi256_si512(_mm512_cvtps_ph(out61, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf53 = _mm512_castsi256_si512(_mm512_cvtps_ph(out50, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf54 = _mm512_castsi256_si512(_mm512_cvtps_ph(out54, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf55 = _mm512_castsi256_si512(_mm512_cvtps_ph(out58, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf56 = _mm512_castsi256_si512(_mm512_cvtps_ph(out62, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf57 = _mm512_castsi256_si512(_mm512_cvtps_ph(out51, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf58 = _mm512_castsi256_si512(_mm512_cvtps_ph(out55, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf59 = _mm512_castsi256_si512(_mm512_cvtps_ph(out59, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf60 = _mm512_castsi256_si512(_mm512_cvtps_ph(out63, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf61 = _mm512_castsi256_si512(_mm512_cvtps_ph(out52, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf62 = _mm512_castsi256_si512(_mm512_cvtps_ph(out56, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf63 = _mm512_castsi256_si512(_mm512_cvtps_ph(out60, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf64 = _mm512_castsi256_si512(_mm512_cvtps_ph(out64, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
_mm512_mask_storeu_epi32(wfPtr2+0+14266368*i6+18432*k4+off11+128*s4, 255, wf49);
_mm512_mask_storeu_epi32(wfPtr2+64+14266368*i6+18432*k4+off11+128*s4, 255, wf50);
_mm512_mask_storeu_epi32(wfPtr2+0+14266368*i6+18432*k4+off12+128*s4, 255, wf51);
_mm512_mask_storeu_epi32(wfPtr2+64+14266368*i6+18432*k4+off12+128*s4, 255, wf52);
_mm512_mask_storeu_epi32(wfPtr2+3566592+14266368*i6+18432*k4+off11+128*s4, 255, wf53);
_mm512_mask_storeu_epi32(wfPtr2+3566656+14266368*i6+18432*k4+off11+128*s4, 255, wf54);
_mm512_mask_storeu_epi32(wfPtr2+3566592+14266368*i6+18432*k4+off12+128*s4, 255, wf55);
_mm512_mask_storeu_epi32(wfPtr2+3566656+14266368*i6+18432*k4+off12+128*s4, 255, wf56);
_mm512_mask_storeu_epi32(wfPtr2+7133184+14266368*i6+18432*k4+off11+128*s4, 255, wf57);
_mm512_mask_storeu_epi32(wfPtr2+7133248+14266368*i6+18432*k4+off11+128*s4, 255, wf58);
_mm512_mask_storeu_epi32(wfPtr2+7133184+14266368*i6+18432*k4+off12+128*s4, 255, wf59);
_mm512_mask_storeu_epi32(wfPtr2+7133248+14266368*i6+18432*k4+off12+128*s4, 255, wf60);
_mm512_mask_storeu_epi32(wfPtr2+10699776+14266368*i6+18432*k4+off11+128*s4, 255, wf61);
_mm512_mask_storeu_epi32(wfPtr2+10699840+14266368*i6+18432*k4+off11+128*s4, 255, wf62);
_mm512_mask_storeu_epi32(wfPtr2+10699776+14266368*i6+18432*k4+off12+128*s4, 255, wf63);
_mm512_mask_storeu_epi32(wfPtr2+10699840+14266368*i6+18432*k4+off12+128*s4, 255, wf64);
}
_mm512_mask_storeu_ps(bfPtr2-0+3096*i6+16*j2, 3, _mm512_setzero_ps());
if (j2 >= jj2) return;
j2 = 194;
}
}

// Dispatches the filter-arrangement work onto a thread team.
//
// Builds a task descriptor whose callee is
// Example12ThreeArrangeFilts1Callee1, attaches the tensor pointer array
// as the callee's opaque argument, and hands the descriptor to
// Example12ThreaderDo1.
//
// team13:   thread team passed through unchanged to Example12ThreaderDo1.
// tensors1: array of tensor base pointers, stored in the task's any1 field.
//
// NOTE(review): hull1[] presumably holds the per-dimension extents of the
// 3-D iteration space (194 x 1 x 2) that the threader walks -- confirm
// against the Example12ThreaderDo1 definition.
static void Example12ThreeArrangeFilts1(Example12ThreaderTeam1* team13, char** tensors1) {
    Example12ThreaderTask1 filtsTask;

    // Iteration hull: three dimensions are populated.
    filtsTask.nd1 = 3;
    filtsTask.hull1[2] = 2;
    filtsTask.hull1[1] = 1;
    filtsTask.hull1[0] = 194;

    // Worker entry point and its opaque payload.
    filtsTask.any1 = tensors1;
    filtsTask.callee1 = Example12ThreeArrangeFilts1Callee1;

    Example12ThreaderDo1(team13, &filtsTask);
}

static void Example12ThreeArrangeDats1Callee1(Example12ThreaderTask1* task6, int64_t* pt8) {
char** tensors4 = task6->any1;
ptrdiff_t s5 = 0;
ptrdiff_t c1 = pt8[1];
ptrdiff_t g3 = 0;
ptrdiff_t e2 = pt8[3];
if (e2 < 1) {
e2 = 0;
char*restrict datPtr1 = tensors4[0]-108+342144*e2;
char*restrict dfPtr1 = tensors4[1]+1013760*e2;
ptrdiff_t i7 = 1*g3;
ptrdiff_t j3 = 1*c1;
ptrdiff_t last1 = j3+0;
ptrdiff_t rel1 = j3-0;
ptrdiff_t base1 = 0;
if (rel1 < 1) {
ptrdiff_t h1 = base1+0;
ptrdiff_t w1 = 0;
ptrdiff_t k5 = 0;
for (; k5 != 198; ++k5) {
__m512 dat1 = _mm512_maskz_loadu_ps(16383, datPtr1+108+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat2 = _mm512_maskz_loadu_ps(16383, datPtr1+156+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512i pm9 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in13 = _mm512_permutexvar_ps(pm9, dat1);
__m512 in20 = _mm512_permutexvar_ps(pm9, dat2);
__m512 dat3 = _mm512_maskz_loadu_ps(16383, datPtr1+216+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat4 = _mm512_maskz_loadu_ps(16383, datPtr1+264+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in14 = _mm512_permutexvar_ps(pm9, dat3);
__m512 in21 = _mm512_permutexvar_ps(pm9, dat4);
__m512 dat5 = _mm512_maskz_loadu_ps(16383, datPtr1+324+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat6 = _mm512_maskz_loadu_ps(16383, datPtr1+372+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in15 = _mm512_permutexvar_ps(pm9, dat5);
__m512 in22 = _mm512_permutexvar_ps(pm9, dat6);
__m512 dat7 = _mm512_maskz_loadu_ps(16383, datPtr1+432+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat8 = _mm512_maskz_loadu_ps(16383, datPtr1+480+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in16 = _mm512_permutexvar_ps(pm9, dat7);
__m512 in23 = _mm512_permutexvar_ps(pm9, dat8);
__m512 dat9 = _mm512_maskz_loadu_ps(16383, datPtr1+540+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat10 = _mm512_maskz_loadu_ps(16383, datPtr1+588+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in17 = _mm512_permutexvar_ps(pm9, dat9);
__m512 in24 = _mm512_permutexvar_ps(pm9, dat10);
__m512 dat11 = _mm512_maskz_loadu_ps(16383, datPtr1+648+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat12 = _mm512_maskz_loadu_ps(16383, datPtr1+696+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in18 = _mm512_permutexvar_ps(pm9, dat11);
__m512 in25 = _mm512_permutexvar_ps(pm9, dat12);
__m512 dat13 = _mm512_maskz_loadu_ps(16383, datPtr1+756+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat14 = _mm512_maskz_loadu_ps(16383, datPtr1+804+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in19 = _mm512_permutexvar_ps(pm9, dat13);
__m512 in26 = _mm512_permutexvar_ps(pm9, dat14);
__m512 tmp201 = _mm512_add_ps(in13, in17);
__m512 tmp206 = _mm512_add_ps(in20, in24);
__m512 tmp202 = _mm512_sub_ps(in16, in14);
__m512 tmp207 = _mm512_sub_ps(in23, in21);
__m512 tmp203 = _mm512_add_ps(in14, in18);
__m512 tmp208 = _mm512_add_ps(in21, in25);
__m512 tmp204 = _mm512_sub_ps(_mm512_setzero_ps(), in18);
__m512 tmp209 = _mm512_sub_ps(_mm512_setzero_ps(), in25);
tmp201 = _mm512_fmadd_ps(in15, _mm512_set1_ps(-4.25e+00f), tmp201);
tmp206 = _mm512_fmadd_ps(in22, _mm512_set1_ps(-4.25e+00f), tmp206);
tmp203 = _mm512_fmadd_ps(in16, _mm512_set1_ps(-4.25e+00f), tmp203);
tmp208 = _mm512_fmadd_ps(in23, _mm512_set1_ps(-4.25e+00f), tmp208);
tmp204 = _mm512_fmadd_ps(tmp202, _mm512_set1_ps(5.25e+00f), tmp204);
tmp209 = _mm512_fmadd_ps(tmp207, _mm512_set1_ps(5.25e+00f), tmp209);
tmp202 = _mm512_fmadd_ps(in14, _mm512_set1_ps(2.5e-01f), in18);
tmp207 = _mm512_fmadd_ps(in21, _mm512_set1_ps(2.5e-01f), in25);
in14 = _mm512_fmadd_ps(in14, _mm512_set1_ps(4e+00f), in18);
in21 = _mm512_fmadd_ps(in21, _mm512_set1_ps(4e+00f), in25);
__m512 tmp205 = _mm512_sub_ps(tmp203, tmp201);
__m512 tmp210 = _mm512_sub_ps(tmp208, tmp206);
tmp203 = _mm512_add_ps(tmp201, tmp203);
tmp208 = _mm512_add_ps(tmp206, tmp208);
tmp201 = _mm512_fmadd_ps(in13, _mm512_set1_ps(2.5e-01f), in17);
tmp206 = _mm512_fmadd_ps(in20, _mm512_set1_ps(2.5e-01f), in24);
tmp202 = _mm512_fmadd_ps(in16, _mm512_set1_ps(-1.25e+00f), tmp202);
tmp207 = _mm512_fmadd_ps(in23, _mm512_set1_ps(-1.25e+00f), tmp207);
in16 = _mm512_fmadd_ps(in16, _mm512_set1_ps(-5e+00f), in14);
in23 = _mm512_fmadd_ps(in23, _mm512_set1_ps(-5e+00f), in21);
tmp201 = _mm512_fmadd_ps(in15, _mm512_set1_ps(-1.25e+00f), tmp201);
tmp206 = _mm512_fmadd_ps(in22, _mm512_set1_ps(-1.25e+00f), tmp206);
in18 = _mm512_fmadd_ps(tmp201, _mm512_set1_ps(2e+00f), tmp202);
in25 = _mm512_fmadd_ps(tmp206, _mm512_set1_ps(2e+00f), tmp207);
tmp202 = _mm512_fnmadd_ps(tmp201, _mm512_set1_ps(2e+00f), tmp202);
tmp207 = _mm512_fnmadd_ps(tmp206, _mm512_set1_ps(2e+00f), tmp207);
tmp201 = _mm512_fmadd_ps(in17, _mm512_set1_ps(2.5e-01f), in13);
tmp206 = _mm512_fmadd_ps(in24, _mm512_set1_ps(2.5e-01f), in20);
in13 = _mm512_sub_ps(in19, in13);
in20 = _mm512_sub_ps(in26, in20);
tmp201 = _mm512_fmadd_ps(in15, _mm512_set1_ps(-1.25e+00f), tmp201);
tmp206 = _mm512_fmadd_ps(in22, _mm512_set1_ps(-1.25e+00f), tmp206);
in15 = _mm512_sub_ps(in15, in17);
in22 = _mm512_sub_ps(in22, in24);
in15 = _mm512_fmadd_ps(in15, _mm512_set1_ps(5.25e+00f), in13);
in22 = _mm512_fmadd_ps(in22, _mm512_set1_ps(5.25e+00f), in20);
in14 = _mm512_fmadd_ps(tmp201, _mm512_set1_ps(2e+00f), in16);
in21 = _mm512_fmadd_ps(tmp206, _mm512_set1_ps(2e+00f), in23);
in16 = _mm512_fnmadd_ps(tmp201, _mm512_set1_ps(2e+00f), in16);
in23 = _mm512_fnmadd_ps(tmp206, _mm512_set1_ps(2e+00f), in23);
__m512 tmp219 = _mm512_unpacklo_ps(tmp204, tmp203);
__m512 tmp220 = _mm512_unpackhi_ps(tmp204, tmp203);
__m512 tmp221 = _mm512_unpacklo_ps(tmp205, in18);
__m512 tmp222 = _mm512_unpackhi_ps(tmp205, in18);
__m512 tmp223 = _mm512_unpacklo_ps(tmp202, in14);
__m512 tmp224 = _mm512_unpackhi_ps(tmp202, in14);
__m512 tmp225 = _mm512_unpacklo_ps(in16, in15);
__m512 tmp226 = _mm512_unpackhi_ps(in16, in15);
__m512 tmp227 = _mm512_unpacklo_ps(tmp209, tmp208);
__m512 tmp228 = _mm512_unpackhi_ps(tmp209, tmp208);
__m512 tmp229 = _mm512_unpacklo_ps(tmp210, in25);
__m512 tmp230 = _mm512_unpackhi_ps(tmp210, in25);
__m512 tmp231 = _mm512_unpacklo_ps(tmp207, in21);
__m512 tmp232 = _mm512_unpackhi_ps(tmp207, in21);
__m512 tmp233 = _mm512_unpacklo_ps(in23, in22);
__m512 tmp234 = _mm512_unpackhi_ps(in23, in22);
__m512 tmp235 = _mm512_shuffle_ps(tmp219, tmp221, 68);
__m512 tmp236 = _mm512_shuffle_ps(tmp219, tmp221, 238);
__m512 tmp237 = _mm512_shuffle_ps(tmp220, tmp222, 68);
__m512 tmp238 = _mm512_shuffle_ps(tmp220, tmp222, 238);
__m512 tmp239 = _mm512_shuffle_ps(tmp223, tmp225, 68);
__m512 tmp240 = _mm512_shuffle_ps(tmp223, tmp225, 238);
__m512 tmp241 = _mm512_shuffle_ps(tmp224, tmp226, 68);
__m512 tmp242 = _mm512_shuffle_ps(tmp224, tmp226, 238);
__m512 tmp243 = _mm512_shuffle_ps(tmp227, tmp229, 68);
__m512 tmp244 = _mm512_shuffle_ps(tmp227, tmp229, 238);
__m512 tmp245 = _mm512_shuffle_ps(tmp228, tmp230, 68);
__m512 tmp246 = _mm512_shuffle_ps(tmp228, tmp230, 238);
__m512 tmp247 = _mm512_shuffle_ps(tmp231, tmp233, 68);
__m512 tmp248 = _mm512_shuffle_ps(tmp231, tmp233, 238);
__m512 tmp249 = _mm512_shuffle_ps(tmp232, tmp234, 68);
__m512 tmp250 = _mm512_shuffle_ps(tmp232, tmp234, 238);
__m512 tmp251 = _mm512_shuffle_f32x4(tmp235, tmp239, 136);
__m512 tmp252 = _mm512_shuffle_f32x4(tmp235, tmp239, 221);
__m512 tmp253 = _mm512_shuffle_f32x4(tmp236, tmp240, 136);
__m512 tmp254 = _mm512_shuffle_f32x4(tmp236, tmp240, 221);
__m512 tmp255 = _mm512_shuffle_f32x4(tmp237, tmp241, 136);
__m512 tmp256 = _mm512_shuffle_f32x4(tmp237, tmp241, 221);
__m512 tmp257 = _mm512_shuffle_f32x4(tmp238, tmp242, 136);
__m512 tmp258 = _mm512_shuffle_f32x4(tmp238, tmp242, 221);
__m512 tmp259 = _mm512_shuffle_f32x4(tmp243, tmp247, 136);
__m512 tmp260 = _mm512_shuffle_f32x4(tmp243, tmp247, 221);
__m512 tmp261 = _mm512_shuffle_f32x4(tmp244, tmp248, 136);
__m512 tmp262 = _mm512_shuffle_f32x4(tmp244, tmp248, 221);
__m512 tmp263 = _mm512_shuffle_f32x4(tmp245, tmp249, 136);
__m512 tmp264 = _mm512_shuffle_f32x4(tmp245, tmp249, 221);
__m512 tmp265 = _mm512_shuffle_f32x4(tmp246, tmp250, 136);
__m512 tmp266 = _mm512_shuffle_f32x4(tmp246, tmp250, 221);
tmp204 = _mm512_shuffle_f32x4(tmp251, tmp259, 136);
tmp209 = _mm512_shuffle_f32x4(tmp251, tmp259, 221);
tmp203 = _mm512_shuffle_f32x4(tmp253, tmp261, 136);
tmp208 = _mm512_shuffle_f32x4(tmp253, tmp261, 221);
tmp205 = _mm512_shuffle_f32x4(tmp255, tmp263, 136);
tmp210 = _mm512_shuffle_f32x4(tmp255, tmp263, 221);
in18 = _mm512_shuffle_f32x4(tmp257, tmp265, 136);
in25 = _mm512_shuffle_f32x4(tmp257, tmp265, 221);
tmp202 = _mm512_shuffle_f32x4(tmp252, tmp260, 136);
tmp207 = _mm512_shuffle_f32x4(tmp252, tmp260, 221);
in14 = _mm512_shuffle_f32x4(tmp254, tmp262, 136);
in21 = _mm512_shuffle_f32x4(tmp254, tmp262, 221);
in16 = _mm512_shuffle_f32x4(tmp256, tmp264, 136);
in23 = _mm512_shuffle_f32x4(tmp256, tmp264, 221);
in15 = _mm512_shuffle_f32x4(tmp258, tmp266, 136);
in22 = _mm512_shuffle_f32x4(tmp258, tmp266, 221);
__m512 tmp211 = _mm512_add_ps(tmp203, in14);
__m512 tmp215 = _mm512_add_ps(tmp208, in21);
__m512 tmp212 = _mm512_sub_ps(tmp202, tmp205);
__m512 tmp216 = _mm512_sub_ps(tmp207, tmp210);
__m512 tmp213 = _mm512_add_ps(tmp205, in16);
__m512 tmp217 = _mm512_add_ps(tmp210, in23);
tmp204 = _mm512_sub_ps(tmp204, in16);
tmp209 = _mm512_sub_ps(tmp209, in23);
tmp211 = _mm512_fmadd_ps(in18, _mm512_set1_ps(-4.25e+00f), tmp211);
tmp215 = _mm512_fmadd_ps(in25, _mm512_set1_ps(-4.25e+00f), tmp215);
tmp213 = _mm512_fmadd_ps(tmp202, _mm512_set1_ps(-4.25e+00f), tmp213);
tmp217 = _mm512_fmadd_ps(tmp207, _mm512_set1_ps(-4.25e+00f), tmp217);
tmp204 = _mm512_fmadd_ps(tmp212, _mm512_set1_ps(5.25e+00f), tmp204);
tmp209 = _mm512_fmadd_ps(tmp216, _mm512_set1_ps(5.25e+00f), tmp209);
tmp212 = _mm512_fmadd_ps(tmp205, _mm512_set1_ps(2.5e-01f), in16);
tmp216 = _mm512_fmadd_ps(tmp210, _mm512_set1_ps(2.5e-01f), in23);
tmp205 = _mm512_fmadd_ps(tmp205, _mm512_set1_ps(4e+00f), in16);
tmp210 = _mm512_fmadd_ps(tmp210, _mm512_set1_ps(4e+00f), in23);
__m512 tmp214 = _mm512_sub_ps(tmp213, tmp211);
__m512 tmp218 = _mm512_sub_ps(tmp217, tmp215);
tmp213 = _mm512_add_ps(tmp211, tmp213);
tmp217 = _mm512_add_ps(tmp215, tmp217);
tmp211 = _mm512_fmadd_ps(tmp203, _mm512_set1_ps(2.5e-01f), in14);
tmp215 = _mm512_fmadd_ps(tmp208, _mm512_set1_ps(2.5e-01f), in21);
tmp212 = _mm512_fmadd_ps(tmp202, _mm512_set1_ps(-1.25e+00f), tmp212);
tmp216 = _mm512_fmadd_ps(tmp207, _mm512_set1_ps(-1.25e+00f), tmp216);
tmp202 = _mm512_fmadd_ps(tmp202, _mm512_set1_ps(-5e+00f), tmp205);
tmp207 = _mm512_fmadd_ps(tmp207, _mm512_set1_ps(-5e+00f), tmp210);
tmp211 = _mm512_fmadd_ps(in18, _mm512_set1_ps(-1.25e+00f), tmp211);
tmp215 = _mm512_fmadd_ps(in25, _mm512_set1_ps(-1.25e+00f), tmp215);
in16 = _mm512_fmadd_ps(tmp211, _mm512_set1_ps(2e+00f), tmp212);
in23 = _mm512_fmadd_ps(tmp215, _mm512_set1_ps(2e+00f), tmp216);
tmp212 = _mm512_fnmadd_ps(tmp211, _mm512_set1_ps(2e+00f), tmp212);
tmp216 = _mm512_fnmadd_ps(tmp215, _mm512_set1_ps(2e+00f), tmp216);
tmp211 = _mm512_fmadd_ps(in14, _mm512_set1_ps(2.5e-01f), tmp203);
tmp215 = _mm512_fmadd_ps(in21, _mm512_set1_ps(2.5e-01f), tmp208);
tmp203 = _mm512_sub_ps(in15, tmp203);
tmp208 = _mm512_sub_ps(in22, tmp208);
tmp211 = _mm512_fmadd_ps(in18, _mm512_set1_ps(-1.25e+00f), tmp211);
tmp215 = _mm512_fmadd_ps(in25, _mm512_set1_ps(-1.25e+00f), tmp215);
in18 = _mm512_sub_ps(in18, in14);
in25 = _mm512_sub_ps(in25, in21);
in18 = _mm512_fmadd_ps(in18, _mm512_set1_ps(5.25e+00f), tmp203);
in25 = _mm512_fmadd_ps(in25, _mm512_set1_ps(5.25e+00f), tmp208);
tmp205 = _mm512_fmadd_ps(tmp211, _mm512_set1_ps(2e+00f), tmp202);
tmp210 = _mm512_fmadd_ps(tmp215, _mm512_set1_ps(2e+00f), tmp207);
tmp202 = _mm512_fnmadd_ps(tmp211, _mm512_set1_ps(2e+00f), tmp202);
tmp207 = _mm512_fnmadd_ps(tmp215, _mm512_set1_ps(2e+00f), tmp207);
__m512 out65 = _mm512_shuffle_f32x4(tmp204, tmp213, 68);
__m512 out73 = _mm512_shuffle_f32x4(tmp204, tmp213, 238);
__m512 out66 = _mm512_shuffle_f32x4(tmp214, in16, 68);
__m512 out74 = _mm512_shuffle_f32x4(tmp214, in16, 238);
__m512 out67 = _mm512_shuffle_f32x4(tmp212, tmp205, 68);
__m512 out75 = _mm512_shuffle_f32x4(tmp212, tmp205, 238);
__m512 out68 = _mm512_shuffle_f32x4(tmp202, in18, 68);
__m512 out76 = _mm512_shuffle_f32x4(tmp202, in18, 238);
__m512 out69 = _mm512_shuffle_f32x4(tmp209, tmp217, 68);
__m512 out77 = _mm512_shuffle_f32x4(tmp209, tmp217, 238);
__m512 out70 = _mm512_shuffle_f32x4(tmp218, in23, 68);
__m512 out78 = _mm512_shuffle_f32x4(tmp218, in23, 238);
__m512 out71 = _mm512_shuffle_f32x4(tmp216, tmp210, 68);
__m512 out79 = _mm512_shuffle_f32x4(tmp216, tmp210, 238);
__m512 out72 = _mm512_shuffle_f32x4(tmp207, in25, 68);
__m512 out80 = _mm512_shuffle_f32x4(tmp207, in25, 238);
_mm512_storeu_ps(dfPtr1+0+1013760*i7+152064*j3+152064*s5+768*k5, out65);
_mm512_storeu_ps(dfPtr1+128+1013760*i7+152064*j3+152064*s5+768*k5, out73);
_mm512_storeu_ps(dfPtr1+64+1013760*i7+152064*j3+152064*s5+768*k5, out69);
_mm512_storeu_ps(dfPtr1+192+1013760*i7+152064*j3+152064*s5+768*k5, out77);
_mm512_storeu_ps(dfPtr1+253440+1013760*i7+152064*j3+152064*s5+768*k5, out66);
_mm512_storeu_ps(dfPtr1+253568+1013760*i7+152064*j3+152064*s5+768*k5, out74);
_mm512_storeu_ps(dfPtr1+253504+1013760*i7+152064*j3+152064*s5+768*k5, out70);
_mm512_storeu_ps(dfPtr1+253632+1013760*i7+152064*j3+152064*s5+768*k5, out78);
_mm512_storeu_ps(dfPtr1+506880+1013760*i7+152064*j3+152064*s5+768*k5, out67);
_mm512_storeu_ps(dfPtr1+507008+1013760*i7+152064*j3+152064*s5+768*k5, out75);
_mm512_storeu_ps(dfPtr1+506944+1013760*i7+152064*j3+152064*s5+768*k5, out71);
_mm512_storeu_ps(dfPtr1+507072+1013760*i7+152064*j3+152064*s5+768*k5, out79);
_mm512_storeu_ps(dfPtr1+760320+1013760*i7+152064*j3+152064*s5+768*k5, out68);
_mm512_storeu_ps(dfPtr1+760448+1013760*i7+152064*j3+152064*s5+768*k5, out76);
_mm512_storeu_ps(dfPtr1+760384+1013760*i7+152064*j3+152064*s5+768*k5, out72);
_mm512_storeu_ps(dfPtr1+760512+1013760*i7+152064*j3+152064*s5+768*k5, out80);
__m512 dat15 = _mm512_maskz_loadu_ps(255, datPtr1+648+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512i pm10 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15);
__m512 in27 = _mm512_permutexvar_ps(pm10, dat15);
__m512 dat16 = _mm512_maskz_loadu_ps(7, datPtr1+204+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat17 = _mm512_maskz_loadu_ps(255, datPtr1+756+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat18 = _mm512_maskz_loadu_ps(16383, datPtr1+972+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512i pm11 = _mm512_set_epi32(23, 22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 15, 2, 1, 0);
__m512 in28 = _mm512_permutex2var_ps(dat16, pm11, dat17);
__m512i pm12 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in35 = _mm512_permutexvar_ps(pm12, dat18);
__m512 dat19 = _mm512_maskz_loadu_ps(7, datPtr1+312+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat20 = _mm512_maskz_loadu_ps(255, datPtr1+864+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat21 = _mm512_maskz_loadu_ps(16383, datPtr1+1080+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in29 = _mm512_permutex2var_ps(dat19, pm11, dat20);
__m512 in36 = _mm512_permutexvar_ps(pm12, dat21);
__m512 dat22 = _mm512_maskz_loadu_ps(7, datPtr1+420+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat23 = _mm512_maskz_loadu_ps(16383, datPtr1+1188+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512i pm13 = _mm512_set_epi32(15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 2, 1, 0);
__m512 in30 = _mm512_permutexvar_ps(pm13, dat22);
__m512 in37 = _mm512_permutexvar_ps(pm12, dat23);
__m512 dat24 = _mm512_maskz_loadu_ps(7, datPtr1+528+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat25 = _mm512_maskz_loadu_ps(16383, datPtr1+1296+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in31 = _mm512_permutexvar_ps(pm13, dat24);
__m512 in38 = _mm512_permutexvar_ps(pm12, dat25);
__m512 dat26 = _mm512_maskz_loadu_ps(7, datPtr1+636+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat27 = _mm512_maskz_loadu_ps(16383, datPtr1+1404+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in32 = _mm512_permutexvar_ps(pm13, dat26);
__m512 in39 = _mm512_permutexvar_ps(pm12, dat27);
__m512 dat28 = _mm512_maskz_loadu_ps(7, datPtr1+744+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat29 = _mm512_maskz_loadu_ps(16383, datPtr1+1512+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in33 = _mm512_permutexvar_ps(pm13, dat28);
__m512 in40 = _mm512_permutexvar_ps(pm12, dat29);
__m512 dat30 = _mm512_maskz_loadu_ps(7, datPtr1+852+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat31 = _mm512_maskz_loadu_ps(16383, datPtr1+1620+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in34 = _mm512_permutexvar_ps(pm13, dat30);
__m512 in41 = _mm512_permutexvar_ps(pm12, dat31);
__m512 tmp267 = _mm512_add_ps(in28, in32);
__m512 tmp271 = _mm512_add_ps(in35, in39);
__m512 tmp268 = _mm512_sub_ps(in31, in29);
__m512 tmp272 = _mm512_sub_ps(in38, in36);
__m512 tmp269 = _mm512_add_ps(in29, in33);
__m512 tmp273 = _mm512_add_ps(in36, in40);
in27 = _mm512_sub_ps(in27, in33);
__m512 tmp274 = _mm512_sub_ps(_mm512_setzero_ps(), in40);
tmp267 = _mm512_fmadd_ps(in30, _mm512_set1_ps(-4.25e+00f), tmp267);
tmp271 = _mm512_fmadd_ps(in37, _mm512_set1_ps(-4.25e+00f), tmp271);
tmp269 = _mm512_fmadd_ps(in31, _mm512_set1_ps(-4.25e+00f), tmp269);
tmp273 = _mm512_fmadd_ps(in38, _mm512_set1_ps(-4.25e+00f), tmp273);
in27 = _mm512_fmadd_ps(tmp268, _mm512_set1_ps(5.25e+00f), in27);
tmp274 = _mm512_fmadd_ps(tmp272, _mm512_set1_ps(5.25e+00f), tmp274);
tmp268 = _mm512_fmadd_ps(in29, _mm512_set1_ps(2.5e-01f), in33);
tmp272 = _mm512_fmadd_ps(in36, _mm512_set1_ps(2.5e-01f), in40);
in29 = _mm512_fmadd_ps(in29, _mm512_set1_ps(4e+00f), in33);
in36 = _mm512_fmadd_ps(in36, _mm512_set1_ps(4e+00f), in40);
__m512 tmp270 = _mm512_sub_ps(tmp269, tmp267);
__m512 tmp275 = _mm512_sub_ps(tmp273, tmp271);
tmp269 = _mm512_add_ps(tmp267, tmp269);
tmp273 = _mm512_add_ps(tmp271, tmp273);
tmp267 = _mm512_fmadd_ps(in28, _mm512_set1_ps(2.5e-01f), in32);
tmp271 = _mm512_fmadd_ps(in35, _mm512_set1_ps(2.5e-01f), in39);
tmp268 = _mm512_fmadd_ps(in31, _mm512_set1_ps(-1.25e+00f), tmp268);
tmp272 = _mm512_fmadd_ps(in38, _mm512_set1_ps(-1.25e+00f), tmp272);
in31 = _mm512_fmadd_ps(in31, _mm512_set1_ps(-5e+00f), in29);
in38 = _mm512_fmadd_ps(in38, _mm512_set1_ps(-5e+00f), in36);
tmp267 = _mm512_fmadd_ps(in30, _mm512_set1_ps(-1.25e+00f), tmp267);
tmp271 = _mm512_fmadd_ps(in37, _mm512_set1_ps(-1.25e+00f), tmp271);
in33 = _mm512_fmadd_ps(tmp267, _mm512_set1_ps(2e+00f), tmp268);
in40 = _mm512_fmadd_ps(tmp271, _mm512_set1_ps(2e+00f), tmp272);
tmp268 = _mm512_fnmadd_ps(tmp267, _mm512_set1_ps(2e+00f), tmp268);
tmp272 = _mm512_fnmadd_ps(tmp271, _mm512_set1_ps(2e+00f), tmp272);
tmp267 = _mm512_fmadd_ps(in32, _mm512_set1_ps(2.5e-01f), in28);
tmp271 = _mm512_fmadd_ps(in39, _mm512_set1_ps(2.5e-01f), in35);
in28 = _mm512_sub_ps(in34, in28);
in35 = _mm512_sub_ps(in41, in35);
tmp267 = _mm512_fmadd_ps(in30, _mm512_set1_ps(-1.25e+00f), tmp267);
tmp271 = _mm512_fmadd_ps(in37, _mm512_set1_ps(-1.25e+00f), tmp271);
in30 = _mm512_sub_ps(in30, in32);
in37 = _mm512_sub_ps(in37, in39);
in30 = _mm512_fmadd_ps(in30, _mm512_set1_ps(5.25e+00f), in28);
in37 = _mm512_fmadd_ps(in37, _mm512_set1_ps(5.25e+00f), in35);
in29 = _mm512_fmadd_ps(tmp267, _mm512_set1_ps(2e+00f), in31);
in36 = _mm512_fmadd_ps(tmp271, _mm512_set1_ps(2e+00f), in38);
in31 = _mm512_fnmadd_ps(tmp267, _mm512_set1_ps(2e+00f), in31);
in38 = _mm512_fnmadd_ps(tmp271, _mm512_set1_ps(2e+00f), in38);
__m512 tmp284 = _mm512_unpacklo_ps(in27, tmp269);
__m512 tmp285 = _mm512_unpackhi_ps(in27, tmp269);
__m512 tmp286 = _mm512_unpacklo_ps(tmp270, in33);
__m512 tmp287 = _mm512_unpackhi_ps(tmp270, in33);
__m512 tmp288 = _mm512_unpacklo_ps(tmp268, in29);
__m512 tmp289 = _mm512_unpackhi_ps(tmp268, in29);
__m512 tmp290 = _mm512_unpacklo_ps(in31, in30);
__m512 tmp291 = _mm512_unpackhi_ps(in31, in30);
__m512 tmp292 = _mm512_unpacklo_ps(tmp274, tmp273);
__m512 tmp293 = _mm512_unpackhi_ps(tmp274, tmp273);
__m512 tmp294 = _mm512_unpacklo_ps(tmp275, in40);
__m512 tmp295 = _mm512_unpackhi_ps(tmp275, in40);
__m512 tmp296 = _mm512_unpacklo_ps(tmp272, in36);
__m512 tmp297 = _mm512_unpackhi_ps(tmp272, in36);
__m512 tmp298 = _mm512_unpacklo_ps(in38, in37);
__m512 tmp299 = _mm512_unpackhi_ps(in38, in37);
__m512 tmp300 = _mm512_shuffle_ps(tmp284, tmp286, 68);
__m512 tmp301 = _mm512_shuffle_ps(tmp284, tmp286, 238);
__m512 tmp302 = _mm512_shuffle_ps(tmp285, tmp287, 68);
__m512 tmp303 = _mm512_shuffle_ps(tmp285, tmp287, 238);
__m512 tmp304 = _mm512_shuffle_ps(tmp288, tmp290, 68);
__m512 tmp305 = _mm512_shuffle_ps(tmp288, tmp290, 238);
__m512 tmp306 = _mm512_shuffle_ps(tmp289, tmp291, 68);
__m512 tmp307 = _mm512_shuffle_ps(tmp289, tmp291, 238);
__m512 tmp308 = _mm512_shuffle_ps(tmp292, tmp294, 68);
__m512 tmp309 = _mm512_shuffle_ps(tmp292, tmp294, 238);
__m512 tmp310 = _mm512_shuffle_ps(tmp293, tmp295, 68);
__m512 tmp311 = _mm512_shuffle_ps(tmp293, tmp295, 238);
__m512 tmp312 = _mm512_shuffle_ps(tmp296, tmp298, 68);
__m512 tmp313 = _mm512_shuffle_ps(tmp296, tmp298, 238);
__m512 tmp314 = _mm512_shuffle_ps(tmp297, tmp299, 68);
__m512 tmp315 = _mm512_shuffle_ps(tmp297, tmp299, 238);
__m512 tmp316 = _mm512_shuffle_f32x4(tmp300, tmp304, 136);
__m512 tmp317 = _mm512_shuffle_f32x4(tmp300, tmp304, 221);
__m512 tmp318 = _mm512_shuffle_f32x4(tmp301, tmp305, 136);
__m512 tmp319 = _mm512_shuffle_f32x4(tmp301, tmp305, 221);
__m512 tmp320 = _mm512_shuffle_f32x4(tmp302, tmp306, 136);
__m512 tmp321 = _mm512_shuffle_f32x4(tmp302, tmp306, 221);
__m512 tmp322 = _mm512_shuffle_f32x4(tmp303, tmp307, 136);
__m512 tmp323 = _mm512_shuffle_f32x4(tmp303, tmp307, 221);
__m512 tmp324 = _mm512_shuffle_f32x4(tmp308, tmp312, 136);
__m512 tmp325 = _mm512_shuffle_f32x4(tmp308, tmp312, 221);
__m512 tmp326 = _mm512_shuffle_f32x4(tmp309, tmp313, 136);
__m512 tmp327 = _mm512_shuffle_f32x4(tmp309, tmp313, 221);
__m512 tmp328 = _mm512_shuffle_f32x4(tmp310, tmp314, 136);
__m512 tmp329 = _mm512_shuffle_f32x4(tmp310, tmp314, 221);
__m512 tmp330 = _mm512_shuffle_f32x4(tmp311, tmp315, 136);
__m512 tmp331 = _mm512_shuffle_f32x4(tmp311, tmp315, 221);
in27 = _mm512_shuffle_f32x4(tmp316, tmp324, 136);
tmp274 = _mm512_shuffle_f32x4(tmp316, tmp324, 221);
tmp269 = _mm512_shuffle_f32x4(tmp318, tmp326, 136);
tmp273 = _mm512_shuffle_f32x4(tmp318, tmp326, 221);
tmp270 = _mm512_shuffle_f32x4(tmp320, tmp328, 136);
tmp275 = _mm512_shuffle_f32x4(tmp320, tmp328, 221);
in33 = _mm512_shuffle_f32x4(tmp322, tmp330, 136);
in40 = _mm512_shuffle_f32x4(tmp322, tmp330, 221);
tmp268 = _mm512_shuffle_f32x4(tmp317, tmp325, 136);
tmp272 = _mm512_shuffle_f32x4(tmp317, tmp325, 221);
in29 = _mm512_shuffle_f32x4(tmp319, tmp327, 136);
in36 = _mm512_shuffle_f32x4(tmp319, tmp327, 221);
in31 = _mm512_shuffle_f32x4(tmp321, tmp329, 136);
in38 = _mm512_shuffle_f32x4(tmp321, tmp329, 221);
in30 = _mm512_shuffle_f32x4(tmp323, tmp331, 136);
in37 = _mm512_shuffle_f32x4(tmp323, tmp331, 221);
__m512 tmp276 = _mm512_add_ps(tmp269, in29);
__m512 tmp280 = _mm512_add_ps(tmp273, in36);
__m512 tmp277 = _mm512_sub_ps(tmp268, tmp270);
__m512 tmp281 = _mm512_sub_ps(tmp272, tmp275);
__m512 tmp278 = _mm512_add_ps(tmp270, in31);
__m512 tmp282 = _mm512_add_ps(tmp275, in38);
in27 = _mm512_sub_ps(in27, in31);
tmp274 = _mm512_sub_ps(tmp274, in38);
tmp276 = _mm512_fmadd_ps(in33, _mm512_set1_ps(-4.25e+00f), tmp276);
tmp280 = _mm512_fmadd_ps(in40, _mm512_set1_ps(-4.25e+00f), tmp280);
tmp278 = _mm512_fmadd_ps(tmp268, _mm512_set1_ps(-4.25e+00f), tmp278);
tmp282 = _mm512_fmadd_ps(tmp272, _mm512_set1_ps(-4.25e+00f), tmp282);
in27 = _mm512_fmadd_ps(tmp277, _mm512_set1_ps(5.25e+00f), in27);
tmp274 = _mm512_fmadd_ps(tmp281, _mm512_set1_ps(5.25e+00f), tmp274);
tmp277 = _mm512_fmadd_ps(tmp270, _mm512_set1_ps(2.5e-01f), in31);
tmp281 = _mm512_fmadd_ps(tmp275, _mm512_set1_ps(2.5e-01f), in38);
tmp270 = _mm512_fmadd_ps(tmp270, _mm512_set1_ps(4e+00f), in31);
tmp275 = _mm512_fmadd_ps(tmp275, _mm512_set1_ps(4e+00f), in38);
__m512 tmp279 = _mm512_sub_ps(tmp278, tmp276);
__m512 tmp283 = _mm512_sub_ps(tmp282, tmp280);
tmp278 = _mm512_add_ps(tmp276, tmp278);
tmp282 = _mm512_add_ps(tmp280, tmp282);
tmp276 = _mm512_fmadd_ps(tmp269, _mm512_set1_ps(2.5e-01f), in29);
tmp280 = _mm512_fmadd_ps(tmp273, _mm512_set1_ps(2.5e-01f), in36);
tmp277 = _mm512_fmadd_ps(tmp268, _mm512_set1_ps(-1.25e+00f), tmp277);
tmp281 = _mm512_fmadd_ps(tmp272, _mm512_set1_ps(-1.25e+00f), tmp281);
tmp268 = _mm512_fmadd_ps(tmp268, _mm512_set1_ps(-5e+00f), tmp270);
tmp272 = _mm512_fmadd_ps(tmp272, _mm512_set1_ps(-5e+00f), tmp275);
tmp276 = _mm512_fmadd_ps(in33, _mm512_set1_ps(-1.25e+00f), tmp276);
tmp280 = _mm512_fmadd_ps(in40, _mm512_set1_ps(-1.25e+00f), tmp280);
in31 = _mm512_fmadd_ps(tmp276, _mm512_set1_ps(2e+00f), tmp277);
in38 = _mm512_fmadd_ps(tmp280, _mm512_set1_ps(2e+00f), tmp281);
tmp277 = _mm512_fnmadd_ps(tmp276, _mm512_set1_ps(2e+00f), tmp277);
tmp281 = _mm512_fnmadd_ps(tmp280, _mm512_set1_ps(2e+00f), tmp281);
tmp276 = _mm512_fmadd_ps(in29, _mm512_set1_ps(2.5e-01f), tmp269);
tmp280 = _mm512_fmadd_ps(in36, _mm512_set1_ps(2.5e-01f), tmp273);
tmp269 = _mm512_sub_ps(in30, tmp269);
tmp273 = _mm512_sub_ps(in37, tmp273);
tmp276 = _mm512_fmadd_ps(in33, _mm512_set1_ps(-1.25e+00f), tmp276);
tmp280 = _mm512_fmadd_ps(in40, _mm512_set1_ps(-1.25e+00f), tmp280);
in33 = _mm512_sub_ps(in33, in29);
in40 = _mm512_sub_ps(in40, in36);
in33 = _mm512_fmadd_ps(in33, _mm512_set1_ps(5.25e+00f), tmp269);
in40 = _mm512_fmadd_ps(in40, _mm512_set1_ps(5.25e+00f), tmp273);
tmp270 = _mm512_fmadd_ps(tmp276, _mm512_set1_ps(2e+00f), tmp268);
tmp275 = _mm512_fmadd_ps(tmp280, _mm512_set1_ps(2e+00f), tmp272);
tmp268 = _mm512_fnmadd_ps(tmp276, _mm512_set1_ps(2e+00f), tmp268);
tmp272 = _mm512_fnmadd_ps(tmp280, _mm512_set1_ps(2e+00f), tmp272);
__m512 out81 = _mm512_shuffle_f32x4(in27, tmp278, 68);
__m512 out89 = _mm512_shuffle_f32x4(in27, tmp278, 238);
__m512 out82 = _mm512_shuffle_f32x4(tmp279, in31, 68);
__m512 out90 = _mm512_shuffle_f32x4(tmp279, in31, 238);
__m512 out83 = _mm512_shuffle_f32x4(tmp277, tmp270, 68);
__m512 out91 = _mm512_shuffle_f32x4(tmp277, tmp270, 238);
__m512 out84 = _mm512_shuffle_f32x4(tmp268, in33, 68);
__m512 out92 = _mm512_shuffle_f32x4(tmp268, in33, 238);
__m512 out85 = _mm512_shuffle_f32x4(tmp274, tmp282, 68);
__m512 out93 = _mm512_shuffle_f32x4(tmp274, tmp282, 238);
__m512 out86 = _mm512_shuffle_f32x4(tmp283, in38, 68);
__m512 out94 = _mm512_shuffle_f32x4(tmp283, in38, 238);
__m512 out87 = _mm512_shuffle_f32x4(tmp281, tmp275, 68);
__m512 out95 = _mm512_shuffle_f32x4(tmp281, tmp275, 238);
__m512 out88 = _mm512_shuffle_f32x4(tmp272, in40, 68);
__m512 out96 = _mm512_shuffle_f32x4(tmp272, in40, 238);
_mm512_storeu_ps(dfPtr1+256+1013760*i7+152064*j3+152064*s5+768*k5, out81);
_mm512_storeu_ps(dfPtr1+384+1013760*i7+152064*j3+152064*s5+768*k5, out89);
_mm512_storeu_ps(dfPtr1+320+1013760*i7+152064*j3+152064*s5+768*k5, out85);
_mm512_storeu_ps(dfPtr1+448+1013760*i7+152064*j3+152064*s5+768*k5, out93);
_mm512_storeu_ps(dfPtr1+253696+1013760*i7+152064*j3+152064*s5+768*k5, out82);
_mm512_storeu_ps(dfPtr1+253824+1013760*i7+152064*j3+152064*s5+768*k5, out90);
_mm512_storeu_ps(dfPtr1+253760+1013760*i7+152064*j3+152064*s5+768*k5, out86);
_mm512_storeu_ps(dfPtr1+253888+1013760*i7+152064*j3+152064*s5+768*k5, out94);
_mm512_storeu_ps(dfPtr1+507136+1013760*i7+152064*j3+152064*s5+768*k5, out83);
_mm512_storeu_ps(dfPtr1+507264+1013760*i7+152064*j3+152064*s5+768*k5, out91);
_mm512_storeu_ps(dfPtr1+507200+1013760*i7+152064*j3+152064*s5+768*k5, out87);
_mm512_storeu_ps(dfPtr1+507328+1013760*i7+152064*j3+152064*s5+768*k5, out95);
_mm512_storeu_ps(dfPtr1+760576+1013760*i7+152064*j3+152064*s5+768*k5, out84);
_mm512_storeu_ps(dfPtr1+760704+1013760*i7+152064*j3+152064*s5+768*k5, out92);
_mm512_storeu_ps(dfPtr1+760640+1013760*i7+152064*j3+152064*s5+768*k5, out88);
_mm512_storeu_ps(dfPtr1+760768+1013760*i7+152064*j3+152064*s5+768*k5, out96);
__m512 dat32 = _mm512_maskz_loadu_ps(255, datPtr1+1512+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512i pm14 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15);
__m512 in49 = _mm512_permutexvar_ps(pm14, dat32);
__m512 dat33 = _mm512_maskz_loadu_ps(32767, datPtr1+1020+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat34 = _mm512_maskz_loadu_ps(255, datPtr1+1620+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512i pm15 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in42 = _mm512_permutexvar_ps(pm15, dat33);
__m512i pm16 = _mm512_set_epi32(23, 22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 15, 14, 13, 12);
__m512 in50 = _mm512_permutex2var_ps(dat33, pm16, dat34);
__m512 dat35 = _mm512_maskz_loadu_ps(32767, datPtr1+1128+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 dat36 = _mm512_maskz_loadu_ps(255, datPtr1+1728+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in43 = _mm512_permutexvar_ps(pm15, dat35);
__m512 in51 = _mm512_permutex2var_ps(dat35, pm16, dat36);
__m512 dat37 = _mm512_maskz_loadu_ps(32767, datPtr1+1236+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in44 = _mm512_permutexvar_ps(pm15, dat37);
__m512i pm17 = _mm512_set_epi32(15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 13, 12);
__m512 in52 = _mm512_permutexvar_ps(pm17, dat37);
__m512 dat38 = _mm512_maskz_loadu_ps(32767, datPtr1+1344+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in45 = _mm512_permutexvar_ps(pm15, dat38);
__m512 in53 = _mm512_permutexvar_ps(pm17, dat38);
__m512 dat39 = _mm512_maskz_loadu_ps(32767, datPtr1+1452+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in46 = _mm512_permutexvar_ps(pm15, dat39);
__m512 in54 = _mm512_permutexvar_ps(pm17, dat39);
__m512 dat40 = _mm512_maskz_loadu_ps(32767, datPtr1+1560+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in47 = _mm512_permutexvar_ps(pm15, dat40);
__m512 in55 = _mm512_permutexvar_ps(pm17, dat40);
__m512 dat41 = _mm512_maskz_loadu_ps(32767, datPtr1+1668+466560*i7+108*h1+4*w1+342144*s5+1728*k5);
__m512 in48 = _mm512_permutexvar_ps(pm15, dat41);
__m512 in56 = _mm512_permutexvar_ps(pm17, dat41);
__m512 tmp332 = _mm512_add_ps(in42, in46);
__m512 tmp337 = _mm512_add_ps(in50, in54);
__m512 tmp333 = _mm512_sub_ps(in45, in43);
__m512 tmp338 = _mm512_sub_ps(in53, in51);
__m512 tmp334 = _mm512_add_ps(in43, in47);
__m512 tmp339 = _mm512_add_ps(in51, in55);
__m512 tmp335 = _mm512_sub_ps(_mm512_setzero_ps(), in47);
in49 = _mm512_sub_ps(in49, in55);
tmp332 = _mm512_fmadd_ps(in44, _mm512_set1_ps(-4.25e+00f), tmp332);
tmp337 = _mm512_fmadd_ps(in52, _mm512_set1_ps(-4.25e+00f), tmp337);
tmp334 = _mm512_fmadd_ps(in45, _mm512_set1_ps(-4.25e+00f), tmp334);
tmp339 = _mm512_fmadd_ps(in53, _mm512_set1_ps(-4.25e+00f), tmp339);
tmp335 = _mm512_fmadd_ps(tmp333, _mm512_set1_ps(5.25e+00f), tmp335);
in49 = _mm512_fmadd_ps(tmp338, _mm512_set1_ps(5.25e+00f), in49);
tmp333 = _mm512_fmadd_ps(in43, _mm512_set1_ps(2.5e-01f), in47);
tmp338 = _mm512_fmadd_ps(in51, _mm512_set1_ps(2.5e-01f), in55);
in43 = _mm512_fmadd_ps(in43, _mm512_set1_ps(4e+00f), in47);
in51 = _mm512_fmadd_ps(in51, _mm512_set1_ps(4e+00f), in55);
__m512 tmp336 = _mm512_sub_ps(tmp334, tmp332);
__m512 tmp340 = _mm512_sub_ps(tmp339, tmp337);
tmp334 = _mm512_add_ps(tmp332, tmp334);
tmp339 = _mm512_add_ps(tmp337, tmp339);
tmp332 = _mm512_fmadd_ps(in42, _mm512_set1_ps(2.5e-01f), in46);
tmp337 = _mm512_fmadd_ps(in50, _mm512_set1_ps(2.5e-01f), in54);
tmp333 = _mm512_fmadd_ps(in45, _mm512_set1_ps(-1.25e+00f), tmp333);
tmp338 = _mm512_fmadd_ps(in53, _mm512_set1_ps(-1.25e+00f), tmp338);
in45 = _mm512_fmadd_ps(in45, _mm512_set1_ps(-5e+00f), in43);
in53 = _mm512_fmadd_ps(in53, _mm512_set1_ps(-5e+00f), in51);
tmp332 = _mm512_fmadd_ps(in44, _mm512_set1_ps(-1.25e+00f), tmp332);
tmp337 = _mm512_fmadd_ps(in52, _mm512_set1_ps(-1.25e+00f), tmp337);
in47 = _mm512_fmadd_ps(tmp332, _mm512_set1_ps(2e+00f), tmp333);
in55 = _mm512_fmadd_ps(tmp337, _mm512_set1_ps(2e+00f), tmp338);
tmp333 = _mm512_fnmadd_ps(tmp332, _mm512_set1_ps(2e+00f), tmp333);
tmp338 = _mm512_fnmadd_ps(tmp337, _mm512_set1_ps(2e+00f), tmp338);
tmp332 = _mm512_fmadd_ps(in46, _mm512_set1_ps(2.5e-01f), in42);
tmp337 = _mm512_fmadd_ps(in54, _mm512_set1_ps(2.5e-01f), in50);
in42 = _mm512_sub_ps(in48, in42);
in50 = _mm512_sub_ps(in56, in50);
tmp332 = _mm512_fmadd_ps(in44, _mm512_set1_ps(-1.25e+00f), tmp332);
tmp337 = _mm512_fmadd_ps(in52, _mm512_set1_ps(-1.25e+00f), tmp337);
in44 = _mm512_sub_ps(in44, in46);
in52 = _mm512_sub_ps(in52, in54);
in44 = _mm512_fmadd_ps(in44, _mm512_set1_ps(5.25e+00f), in42);
in52 = _mm512_fmadd_ps(in52, _mm512_set1_ps(5.25e+00f), in50);
in43 = _mm512_fmadd_ps(tmp332, _mm512_set1_ps(2e+00f), in45);
in51 = _mm512_fmadd_ps(tmp337, _mm512_set1_ps(2e+00f), in53);
in45 = _mm512_fnmadd_ps(tmp332, _mm512_set1_ps(2e+00f), in45);
in53 = _mm512_fnmadd_ps(tmp337, _mm512_set1_ps(2e+00f), in53);
__m512 tmp349 = _mm512_unpacklo_ps(tmp335, tmp334);
__m512 tmp350 = _mm512_unpackhi_ps(tmp335, tmp334);
__m512 tmp351 = _mm512_unpacklo_ps(tmp336, in47);
__m512 tmp352 = _mm512_unpackhi_ps(tmp336, in47);
__m512 tmp353 = _mm512_unpacklo_ps(tmp333, in43);
__m512 tmp354 = _mm512_unpackhi_ps(tmp333, in43);
__m512 tmp355 = _mm512_unpacklo_ps(in45, in44);
__m512 tmp356 = _mm512_unpackhi_ps(in45, in44);
__m512 tmp357 = _mm512_unpacklo_ps(in49, tmp339);
__m512 tmp358 = _mm512_unpackhi_ps(in49, tmp339);
__m512 tmp359 = _mm512_unpacklo_ps(tmp340, in55);
__m512 tmp360 = _mm512_unpackhi_ps(tmp340, in55);
__m512 tmp361 = _mm512_unpacklo_ps(tmp338, in51);
__m512 tmp362 = _mm512_unpackhi_ps(tmp338, in51);
__m512 tmp363 = _mm512_unpacklo_ps(in53, in52);
__m512 tmp364 = _mm512_unpackhi_ps(in53, in52);
__m512 tmp365 = _mm512_shuffle_ps(tmp349, tmp351, 68);
__m512 tmp366 = _mm512_shuffle_ps(tmp349, tmp351, 238);
__m512 tmp367 = _mm512_shuffle_ps(tmp350, tmp352, 68);
__m512 tmp368 = _mm512_shuffle_ps(tmp350, tmp352, 238);
__m512 tmp369 = _mm512_shuffle_ps(tmp353, tmp355, 68);
__m512 tmp370 = _mm512_shuffle_ps(tmp353, tmp355, 238);
__m512 tmp371 = _mm512_shuffle_ps(tmp354, tmp356, 68);
__m512 tmp372 = _mm512_shuffle_ps(tmp354, tmp356, 238);
__m512 tmp373 = _mm512_shuffle_ps(tmp357, tmp359, 68);
__m512 tmp374 = _mm512_shuffle_ps(tmp357, tmp359, 238);
__m512 tmp375 = _mm512_shuffle_ps(tmp358, tmp360, 68);
__m512 tmp376 = _mm512_shuffle_ps(tmp358, tmp360, 238);
__m512 tmp377 = _mm512_shuffle_ps(tmp361, tmp363, 68);
__m512 tmp378 = _mm512_shuffle_ps(tmp361, tmp363, 238);
__m512 tmp379 = _mm512_shuffle_ps(tmp362, tmp364, 68);
__m512 tmp380 = _mm512_shuffle_ps(tmp362, tmp364, 238);
__m512 tmp381 = _mm512_shuffle_f32x4(tmp365, tmp369, 136);
__m512 tmp382 = _mm512_shuffle_f32x4(tmp365, tmp369, 221);
__m512 tmp383 = _mm512_shuffle_f32x4(tmp366, tmp370, 136);
__m512 tmp384 = _mm512_shuffle_f32x4(tmp366, tmp370, 221);
__m512 tmp385 = _mm512_shuffle_f32x4(tmp367, tmp371, 136);
__m512 tmp386 = _mm512_shuffle_f32x4(tmp367, tmp371, 221);
__m512 tmp387 = _mm512_shuffle_f32x4(tmp368, tmp372, 136);
__m512 tmp388 = _mm512_shuffle_f32x4(tmp368, tmp372, 221);
__m512 tmp389 = _mm512_shuffle_f32x4(tmp373, tmp377, 136);
__m512 tmp390 = _mm512_shuffle_f32x4(tmp373, tmp377, 221);
__m512 tmp391 = _mm512_shuffle_f32x4(tmp374, tmp378, 136);
__m512 tmp392 = _mm512_shuffle_f32x4(tmp374, tmp378, 221);
__m512 tmp393 = _mm512_shuffle_f32x4(tmp375, tmp379, 136);
__m512 tmp394 = _mm512_shuffle_f32x4(tmp375, tmp379, 221);
__m512 tmp395 = _mm512_shuffle_f32x4(tmp376, tmp380, 136);
__m512 tmp396 = _mm512_shuffle_f32x4(tmp376, tmp380, 221);
tmp335 = _mm512_shuffle_f32x4(tmp381, tmp389, 136);
in49 = _mm512_shuffle_f32x4(tmp381, tmp389, 221);
tmp334 = _mm512_shuffle_f32x4(tmp383, tmp391, 136);
tmp339 = _mm512_shuffle_f32x4(tmp383, tmp391, 221);
tmp336 = _mm512_shuffle_f32x4(tmp385, tmp393, 136);
tmp340 = _mm512_shuffle_f32x4(tmp385, tmp393, 221);
in47 = _mm512_shuffle_f32x4(tmp387, tmp395, 136);
in55 = _mm512_shuffle_f32x4(tmp387, tmp395, 221);
tmp333 = _mm512_shuffle_f32x4(tmp382, tmp390, 136);
tmp338 = _mm512_shuffle_f32x4(tmp382, tmp390, 221);
in43 = _mm512_shuffle_f32x4(tmp384, tmp392, 136);
in51 = _mm512_shuffle_f32x4(tmp384, tmp392, 221);
in45 = _mm512_shuffle_f32x4(tmp386, tmp394, 136);
in53 = _mm512_shuffle_f32x4(tmp386, tmp394, 221);
in44 = _mm512_shuffle_f32x4(tmp388, tmp396, 136);
in52 = _mm512_shuffle_f32x4(tmp388, tmp396, 221);
__m512 tmp341 = _mm512_add_ps(tmp334, in43);
__m512 tmp345 = _mm512_add_ps(tmp339, in51);
__m512 tmp342 = _mm512_sub_ps(tmp333, tmp336);
__m512 tmp346 = _mm512_sub_ps(tmp338, tmp340);
__m512 tmp343 = _mm512_add_ps(tmp336, in45);
__m512 tmp347 = _mm512_add_ps(tmp340, in53);
tmp335 = _mm512_sub_ps(tmp335, in45);
in49 = _mm512_sub_ps(in49, in53);
tmp341 = _mm512_fmadd_ps(in47, _mm512_set1_ps(-4.25e+00f), tmp341);
tmp345 = _mm512_fmadd_ps(in55, _mm512_set1_ps(-4.25e+00f), tmp345);
tmp343 = _mm512_fmadd_ps(tmp333, _mm512_set1_ps(-4.25e+00f), tmp343);
tmp347 = _mm512_fmadd_ps(tmp338, _mm512_set1_ps(-4.25e+00f), tmp347);
tmp335 = _mm512_fmadd_ps(tmp342, _mm512_set1_ps(5.25e+00f), tmp335);
in49 = _mm512_fmadd_ps(tmp346, _mm512_set1_ps(5.25e+00f), in49);
tmp342 = _mm512_fmadd_ps(tmp336, _mm512_set1_ps(2.5e-01f), in45);
tmp346 = _mm512_fmadd_ps(tmp340, _mm512_set1_ps(2.5e-01f), in53);
tmp336 = _mm512_fmadd_ps(tmp336, _mm512_set1_ps(4e+00f), in45);
tmp340 = _mm512_fmadd_ps(tmp340, _mm512_set1_ps(4e+00f), in53);
__m512 tmp344 = _mm512_sub_ps(tmp343, tmp341);
__m512 tmp348 = _mm512_sub_ps(tmp347, tmp345);
tmp343 = _mm512_add_ps(tmp341, tmp343);
tmp347 = _mm512_add_ps(tmp345, tmp347);
tmp341 = _mm512_fmadd_ps(tmp334, _mm512_set1_ps(2.5e-01f), in43);
tmp345 = _mm512_fmadd_ps(tmp339, _mm512_set1_ps(2.5e-01f), in51);
tmp342 = _mm512_fmadd_ps(tmp333, _mm512_set1_ps(-1.25e+00f), tmp342);
tmp346 = _mm512_fmadd_ps(tmp338, _mm512_set1_ps(-1.25e+00f), tmp346);
tmp333 = _mm512_fmadd_ps(tmp333, _mm512_set1_ps(-5e+00f), tmp336);
tmp338 = _mm512_fmadd_ps(tmp338, _mm512_set1_ps(-5e+00f), tmp340);
tmp341 = _mm512_fmadd_ps(in47, _mm512_set1_ps(-1.25e+00f), tmp341);
tmp345 = _mm512_fmadd_ps(in55, _mm512_set1_ps(-1.25e+00f), tmp345);
in45 = _mm512_fmadd_ps(tmp341, _mm512_set1_ps(2e+00f), tmp342);
in53 = _mm512_fmadd_ps(tmp345, _mm512_set1_ps(2e+00f), tmp346);
tmp342 = _mm512_fnmadd_ps(tmp341, _mm512_set1_ps(2e+00f), tmp342);
tmp346 = _mm512_fnmadd_ps(tmp345, _mm512_set1_ps(2e+00f), tmp346);
tmp341 = _mm512_fmadd_ps(in43, _mm512_set1_ps(2.5e-01f), tmp334);
tmp345 = _mm512_fmadd_ps(in51, _mm512_set1_ps(2.5e-01f), tmp339);
tmp334 = _mm512_sub_ps(in44, tmp334);
tmp339 = _mm512_sub_ps(in52, tmp339);
tmp341 = _mm512_fmadd_ps(in47, _mm512_set1_ps(-1.25e+00f), tmp341);
tmp345 = _mm512_fmadd_ps(in55, _mm512_set1_ps(-1.25e+00f), tmp345);
in47 = _mm512_sub_ps(in47, in43);
in55 = _mm512_sub_ps(in55, in51);
in47 = _mm512_fmadd_ps(in47, _mm512_set1_ps(5.25e+00f), tmp334);
in55 = _mm512_fmadd_ps(in55, _mm512_set1_ps(5.25e+00f), tmp339);
tmp336 = _mm512_fmadd_ps(tmp341, _mm512_set1_ps(2e+00f), tmp333);
tmp340 = _mm512_fmadd_ps(tmp345, _mm512_set1_ps(2e+00f), tmp338);
tmp333 = _mm512_fnmadd_ps(tmp341, _mm512_set1_ps(2e+00f), tmp333);
tmp338 = _mm512_fnmadd_ps(tmp345, _mm512_set1_ps(2e+00f), tmp338);
__m512 out97 = _mm512_shuffle_f32x4(tmp335, tmp343, 68);
__m512 out105 = _mm512_shuffle_f32x4(tmp335, tmp343, 238);
__m512 out98 = _mm512_shuffle_f32x4(tmp344, in45, 68);
__m512 out106 = _mm512_shuffle_f32x4(tmp344, in45, 238);
__m512 out99 = _mm512_shuffle_f32x4(tmp342, tmp336, 68);
__m512 out107 = _mm512_shuffle_f32x4(tmp342, tmp336, 238);
__m512 out100 = _mm512_shuffle_f32x4(tmp333, in47, 68);
__m512 out108 = _mm512_shuffle_f32x4(tmp333, in47, 238);
__m512 out101 = _mm512_shuffle_f32x4(in49, tmp347, 68);
__m512 out109 = _mm512_shuffle_f32x4(in49, tmp347, 238);
__m512 out102 = _mm512_shuffle_f32x4(tmp348, in53, 68);
__m512 out110 = _mm512_shuffle_f32x4(tmp348, in53, 238);
__m512 out103 = _mm512_shuffle_f32x4(tmp346, tmp340, 68);
__m512 out111 = _mm512_shuffle_f32x4(tmp346, tmp340, 238);
__m512 out104 = _mm512_shuffle_f32x4(tmp338, in55, 68);
__m512 out112 = _mm512_shuffle_f32x4(tmp338, in55, 238);
_mm512_storeu_ps(dfPtr1+512+1013760*i7+152064*j3+152064*s5+768*k5, out97);
_mm512_storeu_ps(dfPtr1+640+1013760*i7+152064*j3+152064*s5+768*k5, out105);
_mm512_storeu_ps(dfPtr1+576+1013760*i7+152064*j3+152064*s5+768*k5, out101);
_mm512_storeu_ps(dfPtr1+704+1013760*i7+152064*j3+152064*s5+768*k5, out109);
_mm512_storeu_ps(dfPtr1+253952+1013760*i7+152064*j3+152064*s5+768*k5, out98);
_mm512_storeu_ps(dfPtr1+254080+1013760*i7+152064*j3+152064*s5+768*k5, out106);
_mm512_storeu_ps(dfPtr1+254016+1013760*i7+152064*j3+152064*s5+768*k5, out102);
_mm512_storeu_ps(dfPtr1+254144+1013760*i7+152064*j3+152064*s5+768*k5, out110);
_mm512_storeu_ps(dfPtr1+507392+1013760*i7+152064*j3+152064*s5+768*k5, out99);
_mm512_storeu_ps(dfPtr1+507520+1013760*i7+152064*j3+152064*s5+768*k5, out107);
_mm512_storeu_ps(dfPtr1+507456+1013760*i7+152064*j3+152064*s5+768*k5, out103);
_mm512_storeu_ps(dfPtr1+507584+1013760*i7+152064*j3+152064*s5+768*k5, out111);
_mm512_storeu_ps(dfPtr1+760832+1013760*i7+152064*j3+152064*s5+768*k5, out100);
_mm512_storeu_ps(dfPtr1+760960+1013760*i7+152064*j3+152064*s5+768*k5, out108);
_mm512_storeu_ps(dfPtr1+760896+1013760*i7+152064*j3+152064*s5+768*k5, out104);
_mm512_storeu_ps(dfPtr1+761024+1013760*i7+152064*j3+152064*s5+768*k5, out112);
}
if (j3 >= last1) return;
++j3;
rel1 = 1;
}
ptrdiff_t h2 = base1+6;
ptrdiff_t w2 = 6;
ptrdiff_t k6 = 0;
for (; k6 != 396; ++k6) {
__m512 dat42 = _mm512_maskz_loadu_ps(16383, datPtr1+0+466560*i7+108*h2+4*w2+342144*s5+864*k6);
__m512 dat43 = _mm512_maskz_loadu_ps(511, datPtr1+48+466560*i7+108*h2+4*w2+342144*s5+864*k6);
__m512i pm18 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in57 = _mm512_permutexvar_ps(pm18, dat42);
__m512i pm19 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in60 = _mm512_permutexvar_ps(pm19, dat43);
__m512 dat44 = _mm512_maskz_loadu_ps(16383, datPtr1+108+466560*i7+108*h2+4*w2+342144*s5+864*k6);
__m512 dat45 = _mm512_maskz_loadu_ps(511, datPtr1+156+466560*i7+108*h2+4*w2+342144*s5+864*k6);
__m512 in58 = _mm512_permutexvar_ps(pm18, dat44);
__m512 in61 = _mm512_permutexvar_ps(pm19, dat45);
__m512 dat46 = _mm512_maskz_loadu_ps(16383, datPtr1+216+466560*i7+108*h2+4*w2+342144*s5+864*k6);
__m512 dat47 = _mm512_maskz_loadu_ps(511, datPtr1+264+466560*i7+108*h2+4*w2+342144*s5+864*k6);
__m512 in59 = _mm512_permutexvar_ps(pm18, dat46);
__m512 in62 = _mm512_permutexvar_ps(pm19, dat47);
__m512 tmp397 = in58;
__m512 tmp404 = in61;
__m512 tmp398 = _mm512_sub_ps(_mm512_setzero_ps(), in59);
__m512 tmp405 = _mm512_sub_ps(_mm512_setzero_ps(), in62);
__m512 tmp399 = in59;
__m512 tmp406 = in62;
in57 = in57;
in60 = in60;
tmp397 = tmp397;
tmp404 = tmp404;
tmp399 = tmp399;
tmp406 = tmp406;
in57 = _mm512_fmadd_ps(tmp398, _mm512_set1_ps(5.25e+00f), in57);
in60 = _mm512_fmadd_ps(tmp405, _mm512_set1_ps(5.25e+00f), in60);
tmp398 = _mm512_mul_ps(in59, _mm512_set1_ps(2.5e-01f));
tmp405 = _mm512_mul_ps(in62, _mm512_set1_ps(2.5e-01f));
in59 = _mm512_mul_ps(in59, _mm512_set1_ps(4e+00f));
in62 = _mm512_mul_ps(in62, _mm512_set1_ps(4e+00f));
__m512 tmp400 = _mm512_sub_ps(tmp399, tmp397);
__m512 tmp407 = _mm512_sub_ps(tmp406, tmp404);
tmp399 = _mm512_add_ps(tmp397, tmp399);
tmp406 = _mm512_add_ps(tmp404, tmp406);
tmp397 = _mm512_mul_ps(in58, _mm512_set1_ps(2.5e-01f));
tmp404 = _mm512_mul_ps(in61, _mm512_set1_ps(2.5e-01f));
tmp398 = tmp398;
tmp405 = tmp405;
__m512 tmp401 = in59;
__m512 tmp408 = in62;
tmp397 = tmp397;
tmp404 = tmp404;
__m512 tmp402 = _mm512_fmadd_ps(tmp397, _mm512_set1_ps(2e+00f), tmp398);
__m512 tmp409 = _mm512_fmadd_ps(tmp404, _mm512_set1_ps(2e+00f), tmp405);
tmp398 = _mm512_fnmadd_ps(tmp397, _mm512_set1_ps(2e+00f), tmp398);
tmp405 = _mm512_fnmadd_ps(tmp404, _mm512_set1_ps(2e+00f), tmp405);
tmp397 = in58;
tmp404 = in61;
in58 = _mm512_sub_ps(_mm512_setzero_ps(), in58);
in61 = _mm512_sub_ps(_mm512_setzero_ps(), in61);
tmp397 = tmp397;
tmp404 = tmp404;
__m512 tmp403 = in58;
__m512 tmp410 = in61;
in59 = _mm512_fmadd_ps(tmp397, _mm512_set1_ps(2e+00f), tmp401);
in62 = _mm512_fmadd_ps(tmp404, _mm512_set1_ps(2e+00f), tmp408);
tmp401 = _mm512_fnmadd_ps(tmp397, _mm512_set1_ps(2e+00f), tmp401);
tmp408 = _mm512_fnmadd_ps(tmp404, _mm512_set1_ps(2e+00f), tmp408);
__m512 tmp419 = _mm512_unpacklo_ps(in57, tmp399);
__m512 tmp420 = _mm512_unpackhi_ps(in57, tmp399);
__m512 tmp421 = _mm512_unpacklo_ps(tmp400, tmp402);
__m512 tmp422 = _mm512_unpackhi_ps(tmp400, tmp402);
__m512 tmp423 = _mm512_unpacklo_ps(tmp398, in59);
__m512 tmp424 = _mm512_unpackhi_ps(tmp398, in59);
__m512 tmp425 = _mm512_unpacklo_ps(tmp401, tmp403);
__m512 tmp426 = _mm512_unpackhi_ps(tmp401, tmp403);
__m512 tmp427 = _mm512_unpacklo_ps(in60, tmp406);
__m512 tmp428 = _mm512_unpackhi_ps(in60, tmp406);
__m512 tmp429 = _mm512_unpacklo_ps(tmp407, tmp409);
__m512 tmp430 = _mm512_unpackhi_ps(tmp407, tmp409);
__m512 tmp431 = _mm512_unpacklo_ps(tmp405, in62);
__m512 tmp432 = _mm512_unpackhi_ps(tmp405, in62);
__m512 tmp433 = _mm512_unpacklo_ps(tmp408, tmp410);
__m512 tmp434 = _mm512_unpackhi_ps(tmp408, tmp410);
__m512 tmp435 = _mm512_shuffle_ps(tmp419, tmp421, 68);
__m512 tmp436 = _mm512_shuffle_ps(tmp419, tmp421, 238);
__m512 tmp437 = _mm512_shuffle_ps(tmp420, tmp422, 68);
__m512 tmp438 = _mm512_shuffle_ps(tmp420, tmp422, 238);
__m512 tmp439 = _mm512_shuffle_ps(tmp423, tmp425, 68);
__m512 tmp440 = _mm512_shuffle_ps(tmp423, tmp425, 238);
__m512 tmp441 = _mm512_shuffle_ps(tmp424, tmp426, 68);
__m512 tmp442 = _mm512_shuffle_ps(tmp424, tmp426, 238);
__m512 tmp443 = _mm512_shuffle_ps(tmp427, tmp429, 68);
__m512 tmp444 = _mm512_shuffle_ps(tmp427, tmp429, 238);
__m512 tmp445 = _mm512_shuffle_ps(tmp428, tmp430, 68);
__m512 tmp446 = _mm512_shuffle_ps(tmp428, tmp430, 238);
__m512 tmp447 = _mm512_shuffle_ps(tmp431, tmp433, 68);
__m512 tmp448 = _mm512_shuffle_ps(tmp431, tmp433, 238);
__m512 tmp449 = _mm512_shuffle_ps(tmp432, tmp434, 68);
__m512 tmp450 = _mm512_shuffle_ps(tmp432, tmp434, 238);
__m512 tmp451 = _mm512_shuffle_f32x4(tmp435, tmp439, 136);
__m512 tmp452 = _mm512_shuffle_f32x4(tmp435, tmp439, 221);
__m512 tmp453 = _mm512_shuffle_f32x4(tmp436, tmp440, 136);
__m512 tmp454 = _mm512_shuffle_f32x4(tmp436, tmp440, 221);
__m512 tmp455 = _mm512_shuffle_f32x4(tmp437, tmp441, 136);
__m512 tmp456 = _mm512_shuffle_f32x4(tmp437, tmp441, 221);
__m512 tmp457 = _mm512_shuffle_f32x4(tmp438, tmp442, 136);
__m512 tmp458 = _mm512_shuffle_f32x4(tmp438, tmp442, 221);
__m512 tmp459 = _mm512_shuffle_f32x4(tmp443, tmp447, 136);
__m512 tmp460 = _mm512_shuffle_f32x4(tmp443, tmp447, 221);
__m512 tmp461 = _mm512_shuffle_f32x4(tmp444, tmp448, 136);
__m512 tmp462 = _mm512_shuffle_f32x4(tmp444, tmp448, 221);
__m512 tmp463 = _mm512_shuffle_f32x4(tmp445, tmp449, 136);
__m512 tmp464 = _mm512_shuffle_f32x4(tmp445, tmp449, 221);
__m512 tmp465 = _mm512_shuffle_f32x4(tmp446, tmp450, 136);
__m512 tmp466 = _mm512_shuffle_f32x4(tmp446, tmp450, 221);
in57 = _mm512_shuffle_f32x4(tmp451, tmp459, 136);
in60 = _mm512_shuffle_f32x4(tmp451, tmp459, 221);
tmp399 = _mm512_shuffle_f32x4(tmp453, tmp461, 136);
tmp406 = _mm512_shuffle_f32x4(tmp453, tmp461, 221);
tmp400 = _mm512_shuffle_f32x4(tmp455, tmp463, 136);
tmp407 = _mm512_shuffle_f32x4(tmp455, tmp463, 221);
tmp402 = _mm512_shuffle_f32x4(tmp457, tmp465, 136);
tmp409 = _mm512_shuffle_f32x4(tmp457, tmp465, 221);
tmp398 = _mm512_shuffle_f32x4(tmp452, tmp460, 136);
tmp405 = _mm512_shuffle_f32x4(tmp452, tmp460, 221);
in59 = _mm512_shuffle_f32x4(tmp454, tmp462, 136);
in62 = _mm512_shuffle_f32x4(tmp454, tmp462, 221);
tmp401 = _mm512_shuffle_f32x4(tmp456, tmp464, 136);
tmp408 = _mm512_shuffle_f32x4(tmp456, tmp464, 221);
tmp403 = _mm512_shuffle_f32x4(tmp458, tmp466, 136);
tmp410 = _mm512_shuffle_f32x4(tmp458, tmp466, 221);
__m512 tmp411 = _mm512_add_ps(tmp399, in59);
__m512 tmp415 = _mm512_add_ps(tmp406, in62);
__m512 tmp412 = _mm512_sub_ps(tmp398, tmp400);
__m512 tmp416 = _mm512_sub_ps(tmp405, tmp407);
__m512 tmp413 = _mm512_add_ps(tmp400, tmp401);
__m512 tmp417 = _mm512_add_ps(tmp407, tmp408);
in57 = _mm512_sub_ps(in57, tmp401);
in60 = _mm512_sub_ps(in60, tmp408);
tmp411 = _mm512_fmadd_ps(tmp402, _mm512_set1_ps(-4.25e+00f), tmp411);
tmp415 = _mm512_fmadd_ps(tmp409, _mm512_set1_ps(-4.25e+00f), tmp415);
tmp413 = _mm512_fmadd_ps(tmp398, _mm512_set1_ps(-4.25e+00f), tmp413);
tmp417 = _mm512_fmadd_ps(tmp405, _mm512_set1_ps(-4.25e+00f), tmp417);
in57 = _mm512_fmadd_ps(tmp412, _mm512_set1_ps(5.25e+00f), in57);
in60 = _mm512_fmadd_ps(tmp416, _mm512_set1_ps(5.25e+00f), in60);
tmp412 = _mm512_fmadd_ps(tmp400, _mm512_set1_ps(2.5e-01f), tmp401);
tmp416 = _mm512_fmadd_ps(tmp407, _mm512_set1_ps(2.5e-01f), tmp408);
tmp400 = _mm512_fmadd_ps(tmp400, _mm512_set1_ps(4e+00f), tmp401);
tmp407 = _mm512_fmadd_ps(tmp407, _mm512_set1_ps(4e+00f), tmp408);
__m512 tmp414 = _mm512_sub_ps(tmp413, tmp411);
__m512 tmp418 = _mm512_sub_ps(tmp417, tmp415);
tmp413 = _mm512_add_ps(tmp411, tmp413);
tmp417 = _mm512_add_ps(tmp415, tmp417);
tmp411 = _mm512_fmadd_ps(tmp399, _mm512_set1_ps(2.5e-01f), in59);
tmp415 = _mm512_fmadd_ps(tmp406, _mm512_set1_ps(2.5e-01f), in62);
tmp412 = _mm512_fmadd_ps(tmp398, _mm512_set1_ps(-1.25e+00f), tmp412);
tmp416 = _mm512_fmadd_ps(tmp405, _mm512_set1_ps(-1.25e+00f), tmp416);
tmp398 = _mm512_fmadd_ps(tmp398, _mm512_set1_ps(-5e+00f), tmp400);
tmp405 = _mm512_fmadd_ps(tmp405, _mm512_set1_ps(-5e+00f), tmp407);
tmp411 = _mm512_fmadd_ps(tmp402, _mm512_set1_ps(-1.25e+00f), tmp411);
tmp415 = _mm512_fmadd_ps(tmp409, _mm512_set1_ps(-1.25e+00f), tmp415);
tmp401 = _mm512_fmadd_ps(tmp411, _mm512_set1_ps(2e+00f), tmp412);
tmp408 = _mm512_fmadd_ps(tmp415, _mm512_set1_ps(2e+00f), tmp416);
tmp412 = _mm512_fnmadd_ps(tmp411, _mm512_set1_ps(2e+00f), tmp412);
tmp416 = _mm512_fnmadd_ps(tmp415, _mm512_set1_ps(2e+00f), tmp416);
tmp411 = _mm512_fmadd_ps(in59, _mm512_set1_ps(2.5e-01f), tmp399);
tmp415 = _mm512_fmadd_ps(in62, _mm512_set1_ps(2.5e-01f), tmp406);
tmp399 = _mm512_sub_ps(tmp403, tmp399);
tmp406 = _mm512_sub_ps(tmp410, tmp406);
tmp411 = _mm512_fmadd_ps(tmp402, _mm512_set1_ps(-1.25e+00f), tmp411);
tmp415 = _mm512_fmadd_ps(tmp409, _mm512_set1_ps(-1.25e+00f), tmp415);
tmp402 = _mm512_sub_ps(tmp402, in59);
tmp409 = _mm512_sub_ps(tmp409, in62);
tmp402 = _mm512_fmadd_ps(tmp402, _mm512_set1_ps(5.25e+00f), tmp399);
tmp409 = _mm512_fmadd_ps(tmp409, _mm512_set1_ps(5.25e+00f), tmp406);
tmp400 = _mm512_fmadd_ps(tmp411, _mm512_set1_ps(2e+00f), tmp398);
tmp407 = _mm512_fmadd_ps(tmp415, _mm512_set1_ps(2e+00f), tmp405);
tmp398 = _mm512_fnmadd_ps(tmp411, _mm512_set1_ps(2e+00f), tmp398);
tmp405 = _mm512_fnmadd_ps(tmp415, _mm512_set1_ps(2e+00f), tmp405);
__m512 out113 = _mm512_shuffle_f32x4(in57, tmp413, 68);
__m512 out121 = _mm512_shuffle_f32x4(in57, tmp413, 238);
__m512 out114 = _mm512_shuffle_f32x4(tmp414, tmp401, 68);
__m512 out122 = _mm512_shuffle_f32x4(tmp414, tmp401, 238);
__m512 out115 = _mm512_shuffle_f32x4(tmp412, tmp400, 68);
__m512 out123 = _mm512_shuffle_f32x4(tmp412, tmp400, 238);
__m512 out116 = _mm512_shuffle_f32x4(tmp398, tmp402, 68);
__m512 out124 = _mm512_shuffle_f32x4(tmp398, tmp402, 238);
__m512 out117 = _mm512_shuffle_f32x4(in60, tmp417, 68);
__m512 out125 = _mm512_shuffle_f32x4(in60, tmp417, 238);
__m512 out118 = _mm512_shuffle_f32x4(tmp418, tmp408, 68);
__m512 out126 = _mm512_shuffle_f32x4(tmp418, tmp408, 238);
__m512 out119 = _mm512_shuffle_f32x4(tmp416, tmp407, 68);
__m512 out127 = _mm512_shuffle_f32x4(tmp416, tmp407, 238);
__m512 out120 = _mm512_shuffle_f32x4(tmp405, tmp409, 68);
__m512 out128 = _mm512_shuffle_f32x4(tmp405, tmp409, 238);
_mm512_storeu_ps(dfPtr1+0+1013760*i7+152064*j3+101376*s5+256*k6, out113);
_mm512_storeu_ps(dfPtr1+128+1013760*i7+152064*j3+101376*s5+256*k6, out121);
_mm512_storeu_ps(dfPtr1+64+1013760*i7+152064*j3+101376*s5+256*k6, out117);
_mm512_storeu_ps(dfPtr1+192+1013760*i7+152064*j3+101376*s5+256*k6, out125);
_mm512_storeu_ps(dfPtr1+253440+1013760*i7+152064*j3+101376*s5+256*k6, out114);
_mm512_storeu_ps(dfPtr1+253568+1013760*i7+152064*j3+101376*s5+256*k6, out122);
_mm512_storeu_ps(dfPtr1+253504+1013760*i7+152064*j3+101376*s5+256*k6, out118);
_mm512_storeu_ps(dfPtr1+253632+1013760*i7+152064*j3+101376*s5+256*k6, out126);
_mm512_storeu_ps(dfPtr1+506880+1013760*i7+152064*j3+101376*s5+256*k6, out115);
_mm512_storeu_ps(dfPtr1+507008+1013760*i7+152064*j3+101376*s5+256*k6, out123);
_mm512_storeu_ps(dfPtr1+506944+1013760*i7+152064*j3+101376*s5+256*k6, out119);
_mm512_storeu_ps(dfPtr1+507072+1013760*i7+152064*j3+101376*s5+256*k6, out127);
_mm512_storeu_ps(dfPtr1+760320+1013760*i7+152064*j3+101376*s5+256*k6, out116);
_mm512_storeu_ps(dfPtr1+760448+1013760*i7+152064*j3+101376*s5+256*k6, out124);
_mm512_storeu_ps(dfPtr1+760384+1013760*i7+152064*j3+101376*s5+256*k6, out120);
_mm512_storeu_ps(dfPtr1+760512+1013760*i7+152064*j3+101376*s5+256*k6, out128);
}
if (j3 >= last1) return;
++j3;
return;
}
e2 = 1;
char*restrict datPtr2 = tensors4[0]-108+342144*e2;
char*restrict dfPtr2 = tensors4[1]+1013760*e2;
ptrdiff_t i8 = 1*g3;
ptrdiff_t j4 = 1*c1;
ptrdiff_t last2 = j4+0;
ptrdiff_t rel2 = j4-0;
ptrdiff_t base2 = 0;
if (rel2 < 1) {
ptrdiff_t h3 = base2+0;
ptrdiff_t w3 = 0;
ptrdiff_t k7 = 0;
for (; k7 != 72; ++k7) {
__m512 dat48 = _mm512_maskz_loadu_ps(16383, datPtr2+108+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat49 = _mm512_maskz_loadu_ps(16383, datPtr2+156+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512i pm20 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in63 = _mm512_permutexvar_ps(pm20, dat48);
__m512 in70 = _mm512_permutexvar_ps(pm20, dat49);
__m512 dat50 = _mm512_maskz_loadu_ps(16383, datPtr2+216+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat51 = _mm512_maskz_loadu_ps(16383, datPtr2+264+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in64 = _mm512_permutexvar_ps(pm20, dat50);
__m512 in71 = _mm512_permutexvar_ps(pm20, dat51);
__m512 dat52 = _mm512_maskz_loadu_ps(16383, datPtr2+324+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat53 = _mm512_maskz_loadu_ps(16383, datPtr2+372+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in65 = _mm512_permutexvar_ps(pm20, dat52);
__m512 in72 = _mm512_permutexvar_ps(pm20, dat53);
__m512 dat54 = _mm512_maskz_loadu_ps(16383, datPtr2+432+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat55 = _mm512_maskz_loadu_ps(16383, datPtr2+480+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in66 = _mm512_permutexvar_ps(pm20, dat54);
__m512 in73 = _mm512_permutexvar_ps(pm20, dat55);
__m512 dat56 = _mm512_maskz_loadu_ps(16383, datPtr2+540+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat57 = _mm512_maskz_loadu_ps(16383, datPtr2+588+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in67 = _mm512_permutexvar_ps(pm20, dat56);
__m512 in74 = _mm512_permutexvar_ps(pm20, dat57);
__m512 dat58 = _mm512_maskz_loadu_ps(16383, datPtr2+648+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat59 = _mm512_maskz_loadu_ps(16383, datPtr2+696+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in68 = _mm512_permutexvar_ps(pm20, dat58);
__m512 in75 = _mm512_permutexvar_ps(pm20, dat59);
__m512 dat60 = _mm512_maskz_loadu_ps(16383, datPtr2+756+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat61 = _mm512_maskz_loadu_ps(16383, datPtr2+804+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in69 = _mm512_permutexvar_ps(pm20, dat60);
__m512 in76 = _mm512_permutexvar_ps(pm20, dat61);
__m512 tmp467 = _mm512_add_ps(in63, in67);
__m512 tmp472 = _mm512_add_ps(in70, in74);
__m512 tmp468 = _mm512_sub_ps(in66, in64);
__m512 tmp473 = _mm512_sub_ps(in73, in71);
__m512 tmp469 = _mm512_add_ps(in64, in68);
__m512 tmp474 = _mm512_add_ps(in71, in75);
__m512 tmp470 = _mm512_sub_ps(_mm512_setzero_ps(), in68);
__m512 tmp475 = _mm512_sub_ps(_mm512_setzero_ps(), in75);
tmp467 = _mm512_fmadd_ps(in65, _mm512_set1_ps(-4.25e+00f), tmp467);
tmp472 = _mm512_fmadd_ps(in72, _mm512_set1_ps(-4.25e+00f), tmp472);
tmp469 = _mm512_fmadd_ps(in66, _mm512_set1_ps(-4.25e+00f), tmp469);
tmp474 = _mm512_fmadd_ps(in73, _mm512_set1_ps(-4.25e+00f), tmp474);
tmp470 = _mm512_fmadd_ps(tmp468, _mm512_set1_ps(5.25e+00f), tmp470);
tmp475 = _mm512_fmadd_ps(tmp473, _mm512_set1_ps(5.25e+00f), tmp475);
tmp468 = _mm512_fmadd_ps(in64, _mm512_set1_ps(2.5e-01f), in68);
tmp473 = _mm512_fmadd_ps(in71, _mm512_set1_ps(2.5e-01f), in75);
in64 = _mm512_fmadd_ps(in64, _mm512_set1_ps(4e+00f), in68);
in71 = _mm512_fmadd_ps(in71, _mm512_set1_ps(4e+00f), in75);
__m512 tmp471 = _mm512_sub_ps(tmp469, tmp467);
__m512 tmp476 = _mm512_sub_ps(tmp474, tmp472);
tmp469 = _mm512_add_ps(tmp467, tmp469);
tmp474 = _mm512_add_ps(tmp472, tmp474);
tmp467 = _mm512_fmadd_ps(in63, _mm512_set1_ps(2.5e-01f), in67);
tmp472 = _mm512_fmadd_ps(in70, _mm512_set1_ps(2.5e-01f), in74);
tmp468 = _mm512_fmadd_ps(in66, _mm512_set1_ps(-1.25e+00f), tmp468);
tmp473 = _mm512_fmadd_ps(in73, _mm512_set1_ps(-1.25e+00f), tmp473);
in66 = _mm512_fmadd_ps(in66, _mm512_set1_ps(-5e+00f), in64);
in73 = _mm512_fmadd_ps(in73, _mm512_set1_ps(-5e+00f), in71);
tmp467 = _mm512_fmadd_ps(in65, _mm512_set1_ps(-1.25e+00f), tmp467);
tmp472 = _mm512_fmadd_ps(in72, _mm512_set1_ps(-1.25e+00f), tmp472);
in68 = _mm512_fmadd_ps(tmp467, _mm512_set1_ps(2e+00f), tmp468);
in75 = _mm512_fmadd_ps(tmp472, _mm512_set1_ps(2e+00f), tmp473);
tmp468 = _mm512_fnmadd_ps(tmp467, _mm512_set1_ps(2e+00f), tmp468);
tmp473 = _mm512_fnmadd_ps(tmp472, _mm512_set1_ps(2e+00f), tmp473);
tmp467 = _mm512_fmadd_ps(in67, _mm512_set1_ps(2.5e-01f), in63);
tmp472 = _mm512_fmadd_ps(in74, _mm512_set1_ps(2.5e-01f), in70);
in63 = _mm512_sub_ps(in69, in63);
in70 = _mm512_sub_ps(in76, in70);
tmp467 = _mm512_fmadd_ps(in65, _mm512_set1_ps(-1.25e+00f), tmp467);
tmp472 = _mm512_fmadd_ps(in72, _mm512_set1_ps(-1.25e+00f), tmp472);
in65 = _mm512_sub_ps(in65, in67);
in72 = _mm512_sub_ps(in72, in74);
in65 = _mm512_fmadd_ps(in65, _mm512_set1_ps(5.25e+00f), in63);
in72 = _mm512_fmadd_ps(in72, _mm512_set1_ps(5.25e+00f), in70);
in64 = _mm512_fmadd_ps(tmp467, _mm512_set1_ps(2e+00f), in66);
in71 = _mm512_fmadd_ps(tmp472, _mm512_set1_ps(2e+00f), in73);
in66 = _mm512_fnmadd_ps(tmp467, _mm512_set1_ps(2e+00f), in66);
in73 = _mm512_fnmadd_ps(tmp472, _mm512_set1_ps(2e+00f), in73);
__m512 tmp485 = _mm512_unpacklo_ps(tmp470, tmp469);
__m512 tmp486 = _mm512_unpackhi_ps(tmp470, tmp469);
__m512 tmp487 = _mm512_unpacklo_ps(tmp471, in68);
__m512 tmp488 = _mm512_unpackhi_ps(tmp471, in68);
__m512 tmp489 = _mm512_unpacklo_ps(tmp468, in64);
__m512 tmp490 = _mm512_unpackhi_ps(tmp468, in64);
__m512 tmp491 = _mm512_unpacklo_ps(in66, in65);
__m512 tmp492 = _mm512_unpackhi_ps(in66, in65);
__m512 tmp493 = _mm512_unpacklo_ps(tmp475, tmp474);
__m512 tmp494 = _mm512_unpackhi_ps(tmp475, tmp474);
__m512 tmp495 = _mm512_unpacklo_ps(tmp476, in75);
__m512 tmp496 = _mm512_unpackhi_ps(tmp476, in75);
__m512 tmp497 = _mm512_unpacklo_ps(tmp473, in71);
__m512 tmp498 = _mm512_unpackhi_ps(tmp473, in71);
__m512 tmp499 = _mm512_unpacklo_ps(in73, in72);
__m512 tmp500 = _mm512_unpackhi_ps(in73, in72);
__m512 tmp501 = _mm512_shuffle_ps(tmp485, tmp487, 68);
__m512 tmp502 = _mm512_shuffle_ps(tmp485, tmp487, 238);
__m512 tmp503 = _mm512_shuffle_ps(tmp486, tmp488, 68);
__m512 tmp504 = _mm512_shuffle_ps(tmp486, tmp488, 238);
__m512 tmp505 = _mm512_shuffle_ps(tmp489, tmp491, 68);
__m512 tmp506 = _mm512_shuffle_ps(tmp489, tmp491, 238);
__m512 tmp507 = _mm512_shuffle_ps(tmp490, tmp492, 68);
__m512 tmp508 = _mm512_shuffle_ps(tmp490, tmp492, 238);
__m512 tmp509 = _mm512_shuffle_ps(tmp493, tmp495, 68);
__m512 tmp510 = _mm512_shuffle_ps(tmp493, tmp495, 238);
__m512 tmp511 = _mm512_shuffle_ps(tmp494, tmp496, 68);
__m512 tmp512 = _mm512_shuffle_ps(tmp494, tmp496, 238);
__m512 tmp513 = _mm512_shuffle_ps(tmp497, tmp499, 68);
__m512 tmp514 = _mm512_shuffle_ps(tmp497, tmp499, 238);
__m512 tmp515 = _mm512_shuffle_ps(tmp498, tmp500, 68);
__m512 tmp516 = _mm512_shuffle_ps(tmp498, tmp500, 238);
__m512 tmp517 = _mm512_shuffle_f32x4(tmp501, tmp505, 136);
__m512 tmp518 = _mm512_shuffle_f32x4(tmp501, tmp505, 221);
__m512 tmp519 = _mm512_shuffle_f32x4(tmp502, tmp506, 136);
__m512 tmp520 = _mm512_shuffle_f32x4(tmp502, tmp506, 221);
__m512 tmp521 = _mm512_shuffle_f32x4(tmp503, tmp507, 136);
__m512 tmp522 = _mm512_shuffle_f32x4(tmp503, tmp507, 221);
__m512 tmp523 = _mm512_shuffle_f32x4(tmp504, tmp508, 136);
__m512 tmp524 = _mm512_shuffle_f32x4(tmp504, tmp508, 221);
__m512 tmp525 = _mm512_shuffle_f32x4(tmp509, tmp513, 136);
__m512 tmp526 = _mm512_shuffle_f32x4(tmp509, tmp513, 221);
__m512 tmp527 = _mm512_shuffle_f32x4(tmp510, tmp514, 136);
__m512 tmp528 = _mm512_shuffle_f32x4(tmp510, tmp514, 221);
__m512 tmp529 = _mm512_shuffle_f32x4(tmp511, tmp515, 136);
__m512 tmp530 = _mm512_shuffle_f32x4(tmp511, tmp515, 221);
__m512 tmp531 = _mm512_shuffle_f32x4(tmp512, tmp516, 136);
__m512 tmp532 = _mm512_shuffle_f32x4(tmp512, tmp516, 221);
tmp470 = _mm512_shuffle_f32x4(tmp517, tmp525, 136);
tmp475 = _mm512_shuffle_f32x4(tmp517, tmp525, 221);
tmp469 = _mm512_shuffle_f32x4(tmp519, tmp527, 136);
tmp474 = _mm512_shuffle_f32x4(tmp519, tmp527, 221);
tmp471 = _mm512_shuffle_f32x4(tmp521, tmp529, 136);
tmp476 = _mm512_shuffle_f32x4(tmp521, tmp529, 221);
in68 = _mm512_shuffle_f32x4(tmp523, tmp531, 136);
in75 = _mm512_shuffle_f32x4(tmp523, tmp531, 221);
tmp468 = _mm512_shuffle_f32x4(tmp518, tmp526, 136);
tmp473 = _mm512_shuffle_f32x4(tmp518, tmp526, 221);
in64 = _mm512_shuffle_f32x4(tmp520, tmp528, 136);
in71 = _mm512_shuffle_f32x4(tmp520, tmp528, 221);
in66 = _mm512_shuffle_f32x4(tmp522, tmp530, 136);
in73 = _mm512_shuffle_f32x4(tmp522, tmp530, 221);
in65 = _mm512_shuffle_f32x4(tmp524, tmp532, 136);
in72 = _mm512_shuffle_f32x4(tmp524, tmp532, 221);
__m512 tmp477 = _mm512_add_ps(tmp469, in64);
__m512 tmp481 = _mm512_add_ps(tmp474, in71);
__m512 tmp478 = _mm512_sub_ps(tmp468, tmp471);
__m512 tmp482 = _mm512_sub_ps(tmp473, tmp476);
__m512 tmp479 = _mm512_add_ps(tmp471, in66);
__m512 tmp483 = _mm512_add_ps(tmp476, in73);
tmp470 = _mm512_sub_ps(tmp470, in66);
tmp475 = _mm512_sub_ps(tmp475, in73);
tmp477 = _mm512_fmadd_ps(in68, _mm512_set1_ps(-4.25e+00f), tmp477);
tmp481 = _mm512_fmadd_ps(in75, _mm512_set1_ps(-4.25e+00f), tmp481);
tmp479 = _mm512_fmadd_ps(tmp468, _mm512_set1_ps(-4.25e+00f), tmp479);
tmp483 = _mm512_fmadd_ps(tmp473, _mm512_set1_ps(-4.25e+00f), tmp483);
tmp470 = _mm512_fmadd_ps(tmp478, _mm512_set1_ps(5.25e+00f), tmp470);
tmp475 = _mm512_fmadd_ps(tmp482, _mm512_set1_ps(5.25e+00f), tmp475);
tmp478 = _mm512_fmadd_ps(tmp471, _mm512_set1_ps(2.5e-01f), in66);
tmp482 = _mm512_fmadd_ps(tmp476, _mm512_set1_ps(2.5e-01f), in73);
tmp471 = _mm512_fmadd_ps(tmp471, _mm512_set1_ps(4e+00f), in66);
tmp476 = _mm512_fmadd_ps(tmp476, _mm512_set1_ps(4e+00f), in73);
__m512 tmp480 = _mm512_sub_ps(tmp479, tmp477);
__m512 tmp484 = _mm512_sub_ps(tmp483, tmp481);
tmp479 = _mm512_add_ps(tmp477, tmp479);
tmp483 = _mm512_add_ps(tmp481, tmp483);
tmp477 = _mm512_fmadd_ps(tmp469, _mm512_set1_ps(2.5e-01f), in64);
tmp481 = _mm512_fmadd_ps(tmp474, _mm512_set1_ps(2.5e-01f), in71);
tmp478 = _mm512_fmadd_ps(tmp468, _mm512_set1_ps(-1.25e+00f), tmp478);
tmp482 = _mm512_fmadd_ps(tmp473, _mm512_set1_ps(-1.25e+00f), tmp482);
tmp468 = _mm512_fmadd_ps(tmp468, _mm512_set1_ps(-5e+00f), tmp471);
tmp473 = _mm512_fmadd_ps(tmp473, _mm512_set1_ps(-5e+00f), tmp476);
tmp477 = _mm512_fmadd_ps(in68, _mm512_set1_ps(-1.25e+00f), tmp477);
tmp481 = _mm512_fmadd_ps(in75, _mm512_set1_ps(-1.25e+00f), tmp481);
in66 = _mm512_fmadd_ps(tmp477, _mm512_set1_ps(2e+00f), tmp478);
in73 = _mm512_fmadd_ps(tmp481, _mm512_set1_ps(2e+00f), tmp482);
tmp478 = _mm512_fnmadd_ps(tmp477, _mm512_set1_ps(2e+00f), tmp478);
tmp482 = _mm512_fnmadd_ps(tmp481, _mm512_set1_ps(2e+00f), tmp482);
tmp477 = _mm512_fmadd_ps(in64, _mm512_set1_ps(2.5e-01f), tmp469);
tmp481 = _mm512_fmadd_ps(in71, _mm512_set1_ps(2.5e-01f), tmp474);
tmp469 = _mm512_sub_ps(in65, tmp469);
tmp474 = _mm512_sub_ps(in72, tmp474);
tmp477 = _mm512_fmadd_ps(in68, _mm512_set1_ps(-1.25e+00f), tmp477);
tmp481 = _mm512_fmadd_ps(in75, _mm512_set1_ps(-1.25e+00f), tmp481);
in68 = _mm512_sub_ps(in68, in64);
in75 = _mm512_sub_ps(in75, in71);
in68 = _mm512_fmadd_ps(in68, _mm512_set1_ps(5.25e+00f), tmp469);
in75 = _mm512_fmadd_ps(in75, _mm512_set1_ps(5.25e+00f), tmp474);
tmp471 = _mm512_fmadd_ps(tmp477, _mm512_set1_ps(2e+00f), tmp468);
tmp476 = _mm512_fmadd_ps(tmp481, _mm512_set1_ps(2e+00f), tmp473);
tmp468 = _mm512_fnmadd_ps(tmp477, _mm512_set1_ps(2e+00f), tmp468);
tmp473 = _mm512_fnmadd_ps(tmp481, _mm512_set1_ps(2e+00f), tmp473);
__m512 out129 = _mm512_shuffle_f32x4(tmp470, tmp479, 68);
__m512 out137 = _mm512_shuffle_f32x4(tmp470, tmp479, 238);
__m512 out130 = _mm512_shuffle_f32x4(tmp480, in66, 68);
__m512 out138 = _mm512_shuffle_f32x4(tmp480, in66, 238);
__m512 out131 = _mm512_shuffle_f32x4(tmp478, tmp471, 68);
__m512 out139 = _mm512_shuffle_f32x4(tmp478, tmp471, 238);
__m512 out132 = _mm512_shuffle_f32x4(tmp468, in68, 68);
__m512 out140 = _mm512_shuffle_f32x4(tmp468, in68, 238);
__m512 out133 = _mm512_shuffle_f32x4(tmp475, tmp483, 68);
__m512 out141 = _mm512_shuffle_f32x4(tmp475, tmp483, 238);
__m512 out134 = _mm512_shuffle_f32x4(tmp484, in73, 68);
__m512 out142 = _mm512_shuffle_f32x4(tmp484, in73, 238);
__m512 out135 = _mm512_shuffle_f32x4(tmp482, tmp476, 68);
__m512 out143 = _mm512_shuffle_f32x4(tmp482, tmp476, 238);
__m512 out136 = _mm512_shuffle_f32x4(tmp473, in75, 68);
__m512 out144 = _mm512_shuffle_f32x4(tmp473, in75, 238);
_mm512_storeu_ps(dfPtr2+0+368640*i8+55296*j4+55296*s5+768*k7, out129);
_mm512_storeu_ps(dfPtr2+128+368640*i8+55296*j4+55296*s5+768*k7, out137);
_mm512_storeu_ps(dfPtr2+64+368640*i8+55296*j4+55296*s5+768*k7, out133);
_mm512_storeu_ps(dfPtr2+192+368640*i8+55296*j4+55296*s5+768*k7, out141);
_mm512_storeu_ps(dfPtr2+92160+368640*i8+55296*j4+55296*s5+768*k7, out130);
_mm512_storeu_ps(dfPtr2+92288+368640*i8+55296*j4+55296*s5+768*k7, out138);
_mm512_storeu_ps(dfPtr2+92224+368640*i8+55296*j4+55296*s5+768*k7, out134);
_mm512_storeu_ps(dfPtr2+92352+368640*i8+55296*j4+55296*s5+768*k7, out142);
_mm512_storeu_ps(dfPtr2+184320+368640*i8+55296*j4+55296*s5+768*k7, out131);
_mm512_storeu_ps(dfPtr2+184448+368640*i8+55296*j4+55296*s5+768*k7, out139);
_mm512_storeu_ps(dfPtr2+184384+368640*i8+55296*j4+55296*s5+768*k7, out135);
_mm512_storeu_ps(dfPtr2+184512+368640*i8+55296*j4+55296*s5+768*k7, out143);
_mm512_storeu_ps(dfPtr2+276480+368640*i8+55296*j4+55296*s5+768*k7, out132);
_mm512_storeu_ps(dfPtr2+276608+368640*i8+55296*j4+55296*s5+768*k7, out140);
_mm512_storeu_ps(dfPtr2+276544+368640*i8+55296*j4+55296*s5+768*k7, out136);
_mm512_storeu_ps(dfPtr2+276672+368640*i8+55296*j4+55296*s5+768*k7, out144);
__m512 dat62 = _mm512_maskz_loadu_ps(255, datPtr2+648+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512i pm21 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15);
__m512 in77 = _mm512_permutexvar_ps(pm21, dat62);
__m512 dat63 = _mm512_maskz_loadu_ps(7, datPtr2+204+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat64 = _mm512_maskz_loadu_ps(255, datPtr2+756+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat65 = _mm512_maskz_loadu_ps(16383, datPtr2+972+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512i pm22 = _mm512_set_epi32(23, 22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 15, 2, 1, 0);
__m512 in78 = _mm512_permutex2var_ps(dat63, pm22, dat64);
__m512i pm23 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in85 = _mm512_permutexvar_ps(pm23, dat65);
__m512 dat66 = _mm512_maskz_loadu_ps(7, datPtr2+312+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat67 = _mm512_maskz_loadu_ps(255, datPtr2+864+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat68 = _mm512_maskz_loadu_ps(16383, datPtr2+1080+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in79 = _mm512_permutex2var_ps(dat66, pm22, dat67);
__m512 in86 = _mm512_permutexvar_ps(pm23, dat68);
__m512 dat69 = _mm512_maskz_loadu_ps(7, datPtr2+420+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat70 = _mm512_maskz_loadu_ps(16383, datPtr2+1188+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512i pm24 = _mm512_set_epi32(15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 2, 1, 0);
__m512 in80 = _mm512_permutexvar_ps(pm24, dat69);
__m512 in87 = _mm512_permutexvar_ps(pm23, dat70);
__m512 dat71 = _mm512_maskz_loadu_ps(7, datPtr2+528+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat72 = _mm512_maskz_loadu_ps(16383, datPtr2+1296+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in81 = _mm512_permutexvar_ps(pm24, dat71);
__m512 in88 = _mm512_permutexvar_ps(pm23, dat72);
__m512 dat73 = _mm512_maskz_loadu_ps(7, datPtr2+636+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat74 = _mm512_maskz_loadu_ps(16383, datPtr2+1404+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in82 = _mm512_permutexvar_ps(pm24, dat73);
__m512 in89 = _mm512_permutexvar_ps(pm23, dat74);
__m512 dat75 = _mm512_maskz_loadu_ps(7, datPtr2+744+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat76 = _mm512_maskz_loadu_ps(16383, datPtr2+1512+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in83 = _mm512_permutexvar_ps(pm24, dat75);
__m512 in90 = _mm512_permutexvar_ps(pm23, dat76);
__m512 dat77 = _mm512_maskz_loadu_ps(7, datPtr2+852+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat78 = _mm512_maskz_loadu_ps(16383, datPtr2+1620+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in84 = _mm512_permutexvar_ps(pm24, dat77);
__m512 in91 = _mm512_permutexvar_ps(pm23, dat78);
__m512 tmp533 = _mm512_add_ps(in78, in82);
__m512 tmp537 = _mm512_add_ps(in85, in89);
__m512 tmp534 = _mm512_sub_ps(in81, in79);
__m512 tmp538 = _mm512_sub_ps(in88, in86);
__m512 tmp535 = _mm512_add_ps(in79, in83);
__m512 tmp539 = _mm512_add_ps(in86, in90);
in77 = _mm512_sub_ps(in77, in83);
__m512 tmp540 = _mm512_sub_ps(_mm512_setzero_ps(), in90);
tmp533 = _mm512_fmadd_ps(in80, _mm512_set1_ps(-4.25e+00f), tmp533);
tmp537 = _mm512_fmadd_ps(in87, _mm512_set1_ps(-4.25e+00f), tmp537);
tmp535 = _mm512_fmadd_ps(in81, _mm512_set1_ps(-4.25e+00f), tmp535);
tmp539 = _mm512_fmadd_ps(in88, _mm512_set1_ps(-4.25e+00f), tmp539);
in77 = _mm512_fmadd_ps(tmp534, _mm512_set1_ps(5.25e+00f), in77);
tmp540 = _mm512_fmadd_ps(tmp538, _mm512_set1_ps(5.25e+00f), tmp540);
tmp534 = _mm512_fmadd_ps(in79, _mm512_set1_ps(2.5e-01f), in83);
tmp538 = _mm512_fmadd_ps(in86, _mm512_set1_ps(2.5e-01f), in90);
in79 = _mm512_fmadd_ps(in79, _mm512_set1_ps(4e+00f), in83);
in86 = _mm512_fmadd_ps(in86, _mm512_set1_ps(4e+00f), in90);
__m512 tmp536 = _mm512_sub_ps(tmp535, tmp533);
__m512 tmp541 = _mm512_sub_ps(tmp539, tmp537);
tmp535 = _mm512_add_ps(tmp533, tmp535);
tmp539 = _mm512_add_ps(tmp537, tmp539);
tmp533 = _mm512_fmadd_ps(in78, _mm512_set1_ps(2.5e-01f), in82);
tmp537 = _mm512_fmadd_ps(in85, _mm512_set1_ps(2.5e-01f), in89);
tmp534 = _mm512_fmadd_ps(in81, _mm512_set1_ps(-1.25e+00f), tmp534);
tmp538 = _mm512_fmadd_ps(in88, _mm512_set1_ps(-1.25e+00f), tmp538);
in81 = _mm512_fmadd_ps(in81, _mm512_set1_ps(-5e+00f), in79);
in88 = _mm512_fmadd_ps(in88, _mm512_set1_ps(-5e+00f), in86);
tmp533 = _mm512_fmadd_ps(in80, _mm512_set1_ps(-1.25e+00f), tmp533);
tmp537 = _mm512_fmadd_ps(in87, _mm512_set1_ps(-1.25e+00f), tmp537);
in83 = _mm512_fmadd_ps(tmp533, _mm512_set1_ps(2e+00f), tmp534);
in90 = _mm512_fmadd_ps(tmp537, _mm512_set1_ps(2e+00f), tmp538);
tmp534 = _mm512_fnmadd_ps(tmp533, _mm512_set1_ps(2e+00f), tmp534);
tmp538 = _mm512_fnmadd_ps(tmp537, _mm512_set1_ps(2e+00f), tmp538);
tmp533 = _mm512_fmadd_ps(in82, _mm512_set1_ps(2.5e-01f), in78);
tmp537 = _mm512_fmadd_ps(in89, _mm512_set1_ps(2.5e-01f), in85);
in78 = _mm512_sub_ps(in84, in78);
in85 = _mm512_sub_ps(in91, in85);
tmp533 = _mm512_fmadd_ps(in80, _mm512_set1_ps(-1.25e+00f), tmp533);
tmp537 = _mm512_fmadd_ps(in87, _mm512_set1_ps(-1.25e+00f), tmp537);
in80 = _mm512_sub_ps(in80, in82);
in87 = _mm512_sub_ps(in87, in89);
in80 = _mm512_fmadd_ps(in80, _mm512_set1_ps(5.25e+00f), in78);
in87 = _mm512_fmadd_ps(in87, _mm512_set1_ps(5.25e+00f), in85);
in79 = _mm512_fmadd_ps(tmp533, _mm512_set1_ps(2e+00f), in81);
in86 = _mm512_fmadd_ps(tmp537, _mm512_set1_ps(2e+00f), in88);
in81 = _mm512_fnmadd_ps(tmp533, _mm512_set1_ps(2e+00f), in81);
in88 = _mm512_fnmadd_ps(tmp537, _mm512_set1_ps(2e+00f), in88);
__m512 tmp550 = _mm512_unpacklo_ps(in77, tmp535);
__m512 tmp551 = _mm512_unpackhi_ps(in77, tmp535);
__m512 tmp552 = _mm512_unpacklo_ps(tmp536, in83);
__m512 tmp553 = _mm512_unpackhi_ps(tmp536, in83);
__m512 tmp554 = _mm512_unpacklo_ps(tmp534, in79);
__m512 tmp555 = _mm512_unpackhi_ps(tmp534, in79);
__m512 tmp556 = _mm512_unpacklo_ps(in81, in80);
__m512 tmp557 = _mm512_unpackhi_ps(in81, in80);
__m512 tmp558 = _mm512_unpacklo_ps(tmp540, tmp539);
__m512 tmp559 = _mm512_unpackhi_ps(tmp540, tmp539);
__m512 tmp560 = _mm512_unpacklo_ps(tmp541, in90);
__m512 tmp561 = _mm512_unpackhi_ps(tmp541, in90);
__m512 tmp562 = _mm512_unpacklo_ps(tmp538, in86);
__m512 tmp563 = _mm512_unpackhi_ps(tmp538, in86);
__m512 tmp564 = _mm512_unpacklo_ps(in88, in87);
__m512 tmp565 = _mm512_unpackhi_ps(in88, in87);
__m512 tmp566 = _mm512_shuffle_ps(tmp550, tmp552, 68);
__m512 tmp567 = _mm512_shuffle_ps(tmp550, tmp552, 238);
__m512 tmp568 = _mm512_shuffle_ps(tmp551, tmp553, 68);
__m512 tmp569 = _mm512_shuffle_ps(tmp551, tmp553, 238);
__m512 tmp570 = _mm512_shuffle_ps(tmp554, tmp556, 68);
__m512 tmp571 = _mm512_shuffle_ps(tmp554, tmp556, 238);
__m512 tmp572 = _mm512_shuffle_ps(tmp555, tmp557, 68);
__m512 tmp573 = _mm512_shuffle_ps(tmp555, tmp557, 238);
__m512 tmp574 = _mm512_shuffle_ps(tmp558, tmp560, 68);
__m512 tmp575 = _mm512_shuffle_ps(tmp558, tmp560, 238);
__m512 tmp576 = _mm512_shuffle_ps(tmp559, tmp561, 68);
__m512 tmp577 = _mm512_shuffle_ps(tmp559, tmp561, 238);
__m512 tmp578 = _mm512_shuffle_ps(tmp562, tmp564, 68);
__m512 tmp579 = _mm512_shuffle_ps(tmp562, tmp564, 238);
__m512 tmp580 = _mm512_shuffle_ps(tmp563, tmp565, 68);
__m512 tmp581 = _mm512_shuffle_ps(tmp563, tmp565, 238);
__m512 tmp582 = _mm512_shuffle_f32x4(tmp566, tmp570, 136);
__m512 tmp583 = _mm512_shuffle_f32x4(tmp566, tmp570, 221);
__m512 tmp584 = _mm512_shuffle_f32x4(tmp567, tmp571, 136);
__m512 tmp585 = _mm512_shuffle_f32x4(tmp567, tmp571, 221);
__m512 tmp586 = _mm512_shuffle_f32x4(tmp568, tmp572, 136);
__m512 tmp587 = _mm512_shuffle_f32x4(tmp568, tmp572, 221);
__m512 tmp588 = _mm512_shuffle_f32x4(tmp569, tmp573, 136);
__m512 tmp589 = _mm512_shuffle_f32x4(tmp569, tmp573, 221);
__m512 tmp590 = _mm512_shuffle_f32x4(tmp574, tmp578, 136);
__m512 tmp591 = _mm512_shuffle_f32x4(tmp574, tmp578, 221);
__m512 tmp592 = _mm512_shuffle_f32x4(tmp575, tmp579, 136);
__m512 tmp593 = _mm512_shuffle_f32x4(tmp575, tmp579, 221);
__m512 tmp594 = _mm512_shuffle_f32x4(tmp576, tmp580, 136);
__m512 tmp595 = _mm512_shuffle_f32x4(tmp576, tmp580, 221);
__m512 tmp596 = _mm512_shuffle_f32x4(tmp577, tmp581, 136);
__m512 tmp597 = _mm512_shuffle_f32x4(tmp577, tmp581, 221);
in77 = _mm512_shuffle_f32x4(tmp582, tmp590, 136);
tmp540 = _mm512_shuffle_f32x4(tmp582, tmp590, 221);
tmp535 = _mm512_shuffle_f32x4(tmp584, tmp592, 136);
tmp539 = _mm512_shuffle_f32x4(tmp584, tmp592, 221);
tmp536 = _mm512_shuffle_f32x4(tmp586, tmp594, 136);
tmp541 = _mm512_shuffle_f32x4(tmp586, tmp594, 221);
in83 = _mm512_shuffle_f32x4(tmp588, tmp596, 136);
in90 = _mm512_shuffle_f32x4(tmp588, tmp596, 221);
tmp534 = _mm512_shuffle_f32x4(tmp583, tmp591, 136);
tmp538 = _mm512_shuffle_f32x4(tmp583, tmp591, 221);
in79 = _mm512_shuffle_f32x4(tmp585, tmp593, 136);
in86 = _mm512_shuffle_f32x4(tmp585, tmp593, 221);
in81 = _mm512_shuffle_f32x4(tmp587, tmp595, 136);
in88 = _mm512_shuffle_f32x4(tmp587, tmp595, 221);
in80 = _mm512_shuffle_f32x4(tmp589, tmp597, 136);
in87 = _mm512_shuffle_f32x4(tmp589, tmp597, 221);
__m512 tmp542 = _mm512_add_ps(tmp535, in79);
__m512 tmp546 = _mm512_add_ps(tmp539, in86);
__m512 tmp543 = _mm512_sub_ps(tmp534, tmp536);
__m512 tmp547 = _mm512_sub_ps(tmp538, tmp541);
__m512 tmp544 = _mm512_add_ps(tmp536, in81);
__m512 tmp548 = _mm512_add_ps(tmp541, in88);
in77 = _mm512_sub_ps(in77, in81);
tmp540 = _mm512_sub_ps(tmp540, in88);
tmp542 = _mm512_fmadd_ps(in83, _mm512_set1_ps(-4.25e+00f), tmp542);
tmp546 = _mm512_fmadd_ps(in90, _mm512_set1_ps(-4.25e+00f), tmp546);
tmp544 = _mm512_fmadd_ps(tmp534, _mm512_set1_ps(-4.25e+00f), tmp544);
tmp548 = _mm512_fmadd_ps(tmp538, _mm512_set1_ps(-4.25e+00f), tmp548);
in77 = _mm512_fmadd_ps(tmp543, _mm512_set1_ps(5.25e+00f), in77);
tmp540 = _mm512_fmadd_ps(tmp547, _mm512_set1_ps(5.25e+00f), tmp540);
tmp543 = _mm512_fmadd_ps(tmp536, _mm512_set1_ps(2.5e-01f), in81);
tmp547 = _mm512_fmadd_ps(tmp541, _mm512_set1_ps(2.5e-01f), in88);
tmp536 = _mm512_fmadd_ps(tmp536, _mm512_set1_ps(4e+00f), in81);
tmp541 = _mm512_fmadd_ps(tmp541, _mm512_set1_ps(4e+00f), in88);
__m512 tmp545 = _mm512_sub_ps(tmp544, tmp542);
__m512 tmp549 = _mm512_sub_ps(tmp548, tmp546);
tmp544 = _mm512_add_ps(tmp542, tmp544);
tmp548 = _mm512_add_ps(tmp546, tmp548);
tmp542 = _mm512_fmadd_ps(tmp535, _mm512_set1_ps(2.5e-01f), in79);
tmp546 = _mm512_fmadd_ps(tmp539, _mm512_set1_ps(2.5e-01f), in86);
tmp543 = _mm512_fmadd_ps(tmp534, _mm512_set1_ps(-1.25e+00f), tmp543);
tmp547 = _mm512_fmadd_ps(tmp538, _mm512_set1_ps(-1.25e+00f), tmp547);
tmp534 = _mm512_fmadd_ps(tmp534, _mm512_set1_ps(-5e+00f), tmp536);
tmp538 = _mm512_fmadd_ps(tmp538, _mm512_set1_ps(-5e+00f), tmp541);
tmp542 = _mm512_fmadd_ps(in83, _mm512_set1_ps(-1.25e+00f), tmp542);
tmp546 = _mm512_fmadd_ps(in90, _mm512_set1_ps(-1.25e+00f), tmp546);
in81 = _mm512_fmadd_ps(tmp542, _mm512_set1_ps(2e+00f), tmp543);
in88 = _mm512_fmadd_ps(tmp546, _mm512_set1_ps(2e+00f), tmp547);
tmp543 = _mm512_fnmadd_ps(tmp542, _mm512_set1_ps(2e+00f), tmp543);
tmp547 = _mm512_fnmadd_ps(tmp546, _mm512_set1_ps(2e+00f), tmp547);
tmp542 = _mm512_fmadd_ps(in79, _mm512_set1_ps(2.5e-01f), tmp535);
tmp546 = _mm512_fmadd_ps(in86, _mm512_set1_ps(2.5e-01f), tmp539);
tmp535 = _mm512_sub_ps(in80, tmp535);
tmp539 = _mm512_sub_ps(in87, tmp539);
tmp542 = _mm512_fmadd_ps(in83, _mm512_set1_ps(-1.25e+00f), tmp542);
tmp546 = _mm512_fmadd_ps(in90, _mm512_set1_ps(-1.25e+00f), tmp546);
in83 = _mm512_sub_ps(in83, in79);
in90 = _mm512_sub_ps(in90, in86);
in83 = _mm512_fmadd_ps(in83, _mm512_set1_ps(5.25e+00f), tmp535);
in90 = _mm512_fmadd_ps(in90, _mm512_set1_ps(5.25e+00f), tmp539);
tmp536 = _mm512_fmadd_ps(tmp542, _mm512_set1_ps(2e+00f), tmp534);
tmp541 = _mm512_fmadd_ps(tmp546, _mm512_set1_ps(2e+00f), tmp538);
tmp534 = _mm512_fnmadd_ps(tmp542, _mm512_set1_ps(2e+00f), tmp534);
tmp538 = _mm512_fnmadd_ps(tmp546, _mm512_set1_ps(2e+00f), tmp538);
__m512 out145 = _mm512_shuffle_f32x4(in77, tmp544, 68);
__m512 out153 = _mm512_shuffle_f32x4(in77, tmp544, 238);
__m512 out146 = _mm512_shuffle_f32x4(tmp545, in81, 68);
__m512 out154 = _mm512_shuffle_f32x4(tmp545, in81, 238);
__m512 out147 = _mm512_shuffle_f32x4(tmp543, tmp536, 68);
__m512 out155 = _mm512_shuffle_f32x4(tmp543, tmp536, 238);
__m512 out148 = _mm512_shuffle_f32x4(tmp534, in83, 68);
__m512 out156 = _mm512_shuffle_f32x4(tmp534, in83, 238);
__m512 out149 = _mm512_shuffle_f32x4(tmp540, tmp548, 68);
__m512 out157 = _mm512_shuffle_f32x4(tmp540, tmp548, 238);
__m512 out150 = _mm512_shuffle_f32x4(tmp549, in88, 68);
__m512 out158 = _mm512_shuffle_f32x4(tmp549, in88, 238);
__m512 out151 = _mm512_shuffle_f32x4(tmp547, tmp541, 68);
__m512 out159 = _mm512_shuffle_f32x4(tmp547, tmp541, 238);
__m512 out152 = _mm512_shuffle_f32x4(tmp538, in90, 68);
__m512 out160 = _mm512_shuffle_f32x4(tmp538, in90, 238);
_mm512_storeu_ps(dfPtr2+256+368640*i8+55296*j4+55296*s5+768*k7, out145);
_mm512_storeu_ps(dfPtr2+384+368640*i8+55296*j4+55296*s5+768*k7, out153);
_mm512_storeu_ps(dfPtr2+320+368640*i8+55296*j4+55296*s5+768*k7, out149);
_mm512_storeu_ps(dfPtr2+448+368640*i8+55296*j4+55296*s5+768*k7, out157);
_mm512_storeu_ps(dfPtr2+92416+368640*i8+55296*j4+55296*s5+768*k7, out146);
_mm512_storeu_ps(dfPtr2+92544+368640*i8+55296*j4+55296*s5+768*k7, out154);
_mm512_storeu_ps(dfPtr2+92480+368640*i8+55296*j4+55296*s5+768*k7, out150);
_mm512_storeu_ps(dfPtr2+92608+368640*i8+55296*j4+55296*s5+768*k7, out158);
_mm512_storeu_ps(dfPtr2+184576+368640*i8+55296*j4+55296*s5+768*k7, out147);
_mm512_storeu_ps(dfPtr2+184704+368640*i8+55296*j4+55296*s5+768*k7, out155);
_mm512_storeu_ps(dfPtr2+184640+368640*i8+55296*j4+55296*s5+768*k7, out151);
_mm512_storeu_ps(dfPtr2+184768+368640*i8+55296*j4+55296*s5+768*k7, out159);
_mm512_storeu_ps(dfPtr2+276736+368640*i8+55296*j4+55296*s5+768*k7, out148);
_mm512_storeu_ps(dfPtr2+276864+368640*i8+55296*j4+55296*s5+768*k7, out156);
_mm512_storeu_ps(dfPtr2+276800+368640*i8+55296*j4+55296*s5+768*k7, out152);
_mm512_storeu_ps(dfPtr2+276928+368640*i8+55296*j4+55296*s5+768*k7, out160);
__m512 dat79 = _mm512_maskz_loadu_ps(255, datPtr2+1512+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512i pm25 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15);
__m512 in99 = _mm512_permutexvar_ps(pm25, dat79);
__m512 dat80 = _mm512_maskz_loadu_ps(32767, datPtr2+1020+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat81 = _mm512_maskz_loadu_ps(255, datPtr2+1620+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512i pm26 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in92 = _mm512_permutexvar_ps(pm26, dat80);
__m512i pm27 = _mm512_set_epi32(23, 22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 15, 14, 13, 12);
__m512 in100 = _mm512_permutex2var_ps(dat80, pm27, dat81);
__m512 dat82 = _mm512_maskz_loadu_ps(32767, datPtr2+1128+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 dat83 = _mm512_maskz_loadu_ps(255, datPtr2+1728+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in93 = _mm512_permutexvar_ps(pm26, dat82);
__m512 in101 = _mm512_permutex2var_ps(dat82, pm27, dat83);
__m512 dat84 = _mm512_maskz_loadu_ps(32767, datPtr2+1236+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in94 = _mm512_permutexvar_ps(pm26, dat84);
__m512i pm28 = _mm512_set_epi32(15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 13, 12);
__m512 in102 = _mm512_permutexvar_ps(pm28, dat84);
__m512 dat85 = _mm512_maskz_loadu_ps(32767, datPtr2+1344+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in95 = _mm512_permutexvar_ps(pm26, dat85);
__m512 in103 = _mm512_permutexvar_ps(pm28, dat85);
__m512 dat86 = _mm512_maskz_loadu_ps(32767, datPtr2+1452+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in96 = _mm512_permutexvar_ps(pm26, dat86);
__m512 in104 = _mm512_permutexvar_ps(pm28, dat86);
__m512 dat87 = _mm512_maskz_loadu_ps(32767, datPtr2+1560+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in97 = _mm512_permutexvar_ps(pm26, dat87);
__m512 in105 = _mm512_permutexvar_ps(pm28, dat87);
__m512 dat88 = _mm512_maskz_loadu_ps(32767, datPtr2+1668+466560*i8+108*h3+4*w3+124416*s5+1728*k7);
__m512 in98 = _mm512_permutexvar_ps(pm26, dat88);
__m512 in106 = _mm512_permutexvar_ps(pm28, dat88);
__m512 tmp598 = _mm512_add_ps(in92, in96);
__m512 tmp603 = _mm512_add_ps(in100, in104);
__m512 tmp599 = _mm512_sub_ps(in95, in93);
__m512 tmp604 = _mm512_sub_ps(in103, in101);
__m512 tmp600 = _mm512_add_ps(in93, in97);
__m512 tmp605 = _mm512_add_ps(in101, in105);
__m512 tmp601 = _mm512_sub_ps(_mm512_setzero_ps(), in97);
in99 = _mm512_sub_ps(in99, in105);
tmp598 = _mm512_fmadd_ps(in94, _mm512_set1_ps(-4.25e+00f), tmp598);
tmp603 = _mm512_fmadd_ps(in102, _mm512_set1_ps(-4.25e+00f), tmp603);
tmp600 = _mm512_fmadd_ps(in95, _mm512_set1_ps(-4.25e+00f), tmp600);
tmp605 = _mm512_fmadd_ps(in103, _mm512_set1_ps(-4.25e+00f), tmp605);
tmp601 = _mm512_fmadd_ps(tmp599, _mm512_set1_ps(5.25e+00f), tmp601);
in99 = _mm512_fmadd_ps(tmp604, _mm512_set1_ps(5.25e+00f), in99);
tmp599 = _mm512_fmadd_ps(in93, _mm512_set1_ps(2.5e-01f), in97);
tmp604 = _mm512_fmadd_ps(in101, _mm512_set1_ps(2.5e-01f), in105);
in93 = _mm512_fmadd_ps(in93, _mm512_set1_ps(4e+00f), in97);
in101 = _mm512_fmadd_ps(in101, _mm512_set1_ps(4e+00f), in105);
__m512 tmp602 = _mm512_sub_ps(tmp600, tmp598);
__m512 tmp606 = _mm512_sub_ps(tmp605, tmp603);
tmp600 = _mm512_add_ps(tmp598, tmp600);
tmp605 = _mm512_add_ps(tmp603, tmp605);
tmp598 = _mm512_fmadd_ps(in92, _mm512_set1_ps(2.5e-01f), in96);
tmp603 = _mm512_fmadd_ps(in100, _mm512_set1_ps(2.5e-01f), in104);
tmp599 = _mm512_fmadd_ps(in95, _mm512_set1_ps(-1.25e+00f), tmp599);
tmp604 = _mm512_fmadd_ps(in103, _mm512_set1_ps(-1.25e+00f), tmp604);
in95 = _mm512_fmadd_ps(in95, _mm512_set1_ps(-5e+00f), in93);
in103 = _mm512_fmadd_ps(in103, _mm512_set1_ps(-5e+00f), in101);
tmp598 = _mm512_fmadd_ps(in94, _mm512_set1_ps(-1.25e+00f), tmp598);
tmp603 = _mm512_fmadd_ps(in102, _mm512_set1_ps(-1.25e+00f), tmp603);
in97 = _mm512_fmadd_ps(tmp598, _mm512_set1_ps(2e+00f), tmp599);
in105 = _mm512_fmadd_ps(tmp603, _mm512_set1_ps(2e+00f), tmp604);
tmp599 = _mm512_fnmadd_ps(tmp598, _mm512_set1_ps(2e+00f), tmp599);
tmp604 = _mm512_fnmadd_ps(tmp603, _mm512_set1_ps(2e+00f), tmp604);
tmp598 = _mm512_fmadd_ps(in96, _mm512_set1_ps(2.5e-01f), in92);
tmp603 = _mm512_fmadd_ps(in104, _mm512_set1_ps(2.5e-01f), in100);
in92 = _mm512_sub_ps(in98, in92);
in100 = _mm512_sub_ps(in106, in100);
tmp598 = _mm512_fmadd_ps(in94, _mm512_set1_ps(-1.25e+00f), tmp598);
tmp603 = _mm512_fmadd_ps(in102, _mm512_set1_ps(-1.25e+00f), tmp603);
in94 = _mm512_sub_ps(in94, in96);
in102 = _mm512_sub_ps(in102, in104);
in94 = _mm512_fmadd_ps(in94, _mm512_set1_ps(5.25e+00f), in92);
in102 = _mm512_fmadd_ps(in102, _mm512_set1_ps(5.25e+00f), in100);
in93 = _mm512_fmadd_ps(tmp598, _mm512_set1_ps(2e+00f), in95);
in101 = _mm512_fmadd_ps(tmp603, _mm512_set1_ps(2e+00f), in103);
in95 = _mm512_fnmadd_ps(tmp598, _mm512_set1_ps(2e+00f), in95);
in103 = _mm512_fnmadd_ps(tmp603, _mm512_set1_ps(2e+00f), in103);
__m512 tmp615 = _mm512_unpacklo_ps(tmp601, tmp600);
__m512 tmp616 = _mm512_unpackhi_ps(tmp601, tmp600);
__m512 tmp617 = _mm512_unpacklo_ps(tmp602, in97);
__m512 tmp618 = _mm512_unpackhi_ps(tmp602, in97);
__m512 tmp619 = _mm512_unpacklo_ps(tmp599, in93);
__m512 tmp620 = _mm512_unpackhi_ps(tmp599, in93);
__m512 tmp621 = _mm512_unpacklo_ps(in95, in94);
__m512 tmp622 = _mm512_unpackhi_ps(in95, in94);
__m512 tmp623 = _mm512_unpacklo_ps(in99, tmp605);
__m512 tmp624 = _mm512_unpackhi_ps(in99, tmp605);
__m512 tmp625 = _mm512_unpacklo_ps(tmp606, in105);
__m512 tmp626 = _mm512_unpackhi_ps(tmp606, in105);
__m512 tmp627 = _mm512_unpacklo_ps(tmp604, in101);
__m512 tmp628 = _mm512_unpackhi_ps(tmp604, in101);
__m512 tmp629 = _mm512_unpacklo_ps(in103, in102);
__m512 tmp630 = _mm512_unpackhi_ps(in103, in102);
__m512 tmp631 = _mm512_shuffle_ps(tmp615, tmp617, 68);
__m512 tmp632 = _mm512_shuffle_ps(tmp615, tmp617, 238);
__m512 tmp633 = _mm512_shuffle_ps(tmp616, tmp618, 68);
__m512 tmp634 = _mm512_shuffle_ps(tmp616, tmp618, 238);
__m512 tmp635 = _mm512_shuffle_ps(tmp619, tmp621, 68);
__m512 tmp636 = _mm512_shuffle_ps(tmp619, tmp621, 238);
__m512 tmp637 = _mm512_shuffle_ps(tmp620, tmp622, 68);
__m512 tmp638 = _mm512_shuffle_ps(tmp620, tmp622, 238);
__m512 tmp639 = _mm512_shuffle_ps(tmp623, tmp625, 68);
__m512 tmp640 = _mm512_shuffle_ps(tmp623, tmp625, 238);
__m512 tmp641 = _mm512_shuffle_ps(tmp624, tmp626, 68);
__m512 tmp642 = _mm512_shuffle_ps(tmp624, tmp626, 238);
__m512 tmp643 = _mm512_shuffle_ps(tmp627, tmp629, 68);
__m512 tmp644 = _mm512_shuffle_ps(tmp627, tmp629, 238);
__m512 tmp645 = _mm512_shuffle_ps(tmp628, tmp630, 68);
__m512 tmp646 = _mm512_shuffle_ps(tmp628, tmp630, 238);
__m512 tmp647 = _mm512_shuffle_f32x4(tmp631, tmp635, 136);
__m512 tmp648 = _mm512_shuffle_f32x4(tmp631, tmp635, 221);
__m512 tmp649 = _mm512_shuffle_f32x4(tmp632, tmp636, 136);
__m512 tmp650 = _mm512_shuffle_f32x4(tmp632, tmp636, 221);
__m512 tmp651 = _mm512_shuffle_f32x4(tmp633, tmp637, 136);
__m512 tmp652 = _mm512_shuffle_f32x4(tmp633, tmp637, 221);
__m512 tmp653 = _mm512_shuffle_f32x4(tmp634, tmp638, 136);
__m512 tmp654 = _mm512_shuffle_f32x4(tmp634, tmp638, 221);
__m512 tmp655 = _mm512_shuffle_f32x4(tmp639, tmp643, 136);
__m512 tmp656 = _mm512_shuffle_f32x4(tmp639, tmp643, 221);
__m512 tmp657 = _mm512_shuffle_f32x4(tmp640, tmp644, 136);
__m512 tmp658 = _mm512_shuffle_f32x4(tmp640, tmp644, 221);
__m512 tmp659 = _mm512_shuffle_f32x4(tmp641, tmp645, 136);
__m512 tmp660 = _mm512_shuffle_f32x4(tmp641, tmp645, 221);
__m512 tmp661 = _mm512_shuffle_f32x4(tmp642, tmp646, 136);
__m512 tmp662 = _mm512_shuffle_f32x4(tmp642, tmp646, 221);
tmp601 = _mm512_shuffle_f32x4(tmp647, tmp655, 136);
in99 = _mm512_shuffle_f32x4(tmp647, tmp655, 221);
tmp600 = _mm512_shuffle_f32x4(tmp649, tmp657, 136);
tmp605 = _mm512_shuffle_f32x4(tmp649, tmp657, 221);
tmp602 = _mm512_shuffle_f32x4(tmp651, tmp659, 136);
tmp606 = _mm512_shuffle_f32x4(tmp651, tmp659, 221);
in97 = _mm512_shuffle_f32x4(tmp653, tmp661, 136);
in105 = _mm512_shuffle_f32x4(tmp653, tmp661, 221);
tmp599 = _mm512_shuffle_f32x4(tmp648, tmp656, 136);
tmp604 = _mm512_shuffle_f32x4(tmp648, tmp656, 221);
in93 = _mm512_shuffle_f32x4(tmp650, tmp658, 136);
in101 = _mm512_shuffle_f32x4(tmp650, tmp658, 221);
in95 = _mm512_shuffle_f32x4(tmp652, tmp660, 136);
in103 = _mm512_shuffle_f32x4(tmp652, tmp660, 221);
in94 = _mm512_shuffle_f32x4(tmp654, tmp662, 136);
in102 = _mm512_shuffle_f32x4(tmp654, tmp662, 221);
__m512 tmp607 = _mm512_add_ps(tmp600, in93);
__m512 tmp611 = _mm512_add_ps(tmp605, in101);
__m512 tmp608 = _mm512_sub_ps(tmp599, tmp602);
__m512 tmp612 = _mm512_sub_ps(tmp604, tmp606);
__m512 tmp609 = _mm512_add_ps(tmp602, in95);
__m512 tmp613 = _mm512_add_ps(tmp606, in103);
tmp601 = _mm512_sub_ps(tmp601, in95);
in99 = _mm512_sub_ps(in99, in103);
tmp607 = _mm512_fmadd_ps(in97, _mm512_set1_ps(-4.25e+00f), tmp607);
tmp611 = _mm512_fmadd_ps(in105, _mm512_set1_ps(-4.25e+00f), tmp611);
tmp609 = _mm512_fmadd_ps(tmp599, _mm512_set1_ps(-4.25e+00f), tmp609);
tmp613 = _mm512_fmadd_ps(tmp604, _mm512_set1_ps(-4.25e+00f), tmp613);
tmp601 = _mm512_fmadd_ps(tmp608, _mm512_set1_ps(5.25e+00f), tmp601);
in99 = _mm512_fmadd_ps(tmp612, _mm512_set1_ps(5.25e+00f), in99);
tmp608 = _mm512_fmadd_ps(tmp602, _mm512_set1_ps(2.5e-01f), in95);
tmp612 = _mm512_fmadd_ps(tmp606, _mm512_set1_ps(2.5e-01f), in103);
tmp602 = _mm512_fmadd_ps(tmp602, _mm512_set1_ps(4e+00f), in95);
tmp606 = _mm512_fmadd_ps(tmp606, _mm512_set1_ps(4e+00f), in103);
__m512 tmp610 = _mm512_sub_ps(tmp609, tmp607);
__m512 tmp614 = _mm512_sub_ps(tmp613, tmp611);
tmp609 = _mm512_add_ps(tmp607, tmp609);
tmp613 = _mm512_add_ps(tmp611, tmp613);
tmp607 = _mm512_fmadd_ps(tmp600, _mm512_set1_ps(2.5e-01f), in93);
tmp611 = _mm512_fmadd_ps(tmp605, _mm512_set1_ps(2.5e-01f), in101);
tmp608 = _mm512_fmadd_ps(tmp599, _mm512_set1_ps(-1.25e+00f), tmp608);
tmp612 = _mm512_fmadd_ps(tmp604, _mm512_set1_ps(-1.25e+00f), tmp612);
tmp599 = _mm512_fmadd_ps(tmp599, _mm512_set1_ps(-5e+00f), tmp602);
tmp604 = _mm512_fmadd_ps(tmp604, _mm512_set1_ps(-5e+00f), tmp606);
tmp607 = _mm512_fmadd_ps(in97, _mm512_set1_ps(-1.25e+00f), tmp607);
tmp611 = _mm512_fmadd_ps(in105, _mm512_set1_ps(-1.25e+00f), tmp611);
in95 = _mm512_fmadd_ps(tmp607, _mm512_set1_ps(2e+00f), tmp608);
in103 = _mm512_fmadd_ps(tmp611, _mm512_set1_ps(2e+00f), tmp612);
tmp608 = _mm512_fnmadd_ps(tmp607, _mm512_set1_ps(2e+00f), tmp608);
tmp612 = _mm512_fnmadd_ps(tmp611, _mm512_set1_ps(2e+00f), tmp612);
tmp607 = _mm512_fmadd_ps(in93, _mm512_set1_ps(2.5e-01f), tmp600);
tmp611 = _mm512_fmadd_ps(in101, _mm512_set1_ps(2.5e-01f), tmp605);
tmp600 = _mm512_sub_ps(in94, tmp600);
tmp605 = _mm512_sub_ps(in102, tmp605);
tmp607 = _mm512_fmadd_ps(in97, _mm512_set1_ps(-1.25e+00f), tmp607);
tmp611 = _mm512_fmadd_ps(in105, _mm512_set1_ps(-1.25e+00f), tmp611);
in97 = _mm512_sub_ps(in97, in93);
in105 = _mm512_sub_ps(in105, in101);
in97 = _mm512_fmadd_ps(in97, _mm512_set1_ps(5.25e+00f), tmp600);
in105 = _mm512_fmadd_ps(in105, _mm512_set1_ps(5.25e+00f), tmp605);
tmp602 = _mm512_fmadd_ps(tmp607, _mm512_set1_ps(2e+00f), tmp599);
tmp606 = _mm512_fmadd_ps(tmp611, _mm512_set1_ps(2e+00f), tmp604);
tmp599 = _mm512_fnmadd_ps(tmp607, _mm512_set1_ps(2e+00f), tmp599);
tmp604 = _mm512_fnmadd_ps(tmp611, _mm512_set1_ps(2e+00f), tmp604);
__m512 out161 = _mm512_shuffle_f32x4(tmp601, tmp609, 68);
__m512 out169 = _mm512_shuffle_f32x4(tmp601, tmp609, 238);
__m512 out162 = _mm512_shuffle_f32x4(tmp610, in95, 68);
__m512 out170 = _mm512_shuffle_f32x4(tmp610, in95, 238);
__m512 out163 = _mm512_shuffle_f32x4(tmp608, tmp602, 68);
__m512 out171 = _mm512_shuffle_f32x4(tmp608, tmp602, 238);
__m512 out164 = _mm512_shuffle_f32x4(tmp599, in97, 68);
__m512 out172 = _mm512_shuffle_f32x4(tmp599, in97, 238);
__m512 out165 = _mm512_shuffle_f32x4(in99, tmp613, 68);
__m512 out173 = _mm512_shuffle_f32x4(in99, tmp613, 238);
__m512 out166 = _mm512_shuffle_f32x4(tmp614, in103, 68);
__m512 out174 = _mm512_shuffle_f32x4(tmp614, in103, 238);
__m512 out167 = _mm512_shuffle_f32x4(tmp612, tmp606, 68);
__m512 out175 = _mm512_shuffle_f32x4(tmp612, tmp606, 238);
__m512 out168 = _mm512_shuffle_f32x4(tmp604, in105, 68);
__m512 out176 = _mm512_shuffle_f32x4(tmp604, in105, 238);
_mm512_storeu_ps(dfPtr2+512+368640*i8+55296*j4+55296*s5+768*k7, out161);
_mm512_storeu_ps(dfPtr2+640+368640*i8+55296*j4+55296*s5+768*k7, out169);
_mm512_storeu_ps(dfPtr2+576+368640*i8+55296*j4+55296*s5+768*k7, out165);
_mm512_storeu_ps(dfPtr2+704+368640*i8+55296*j4+55296*s5+768*k7, out173);
_mm512_storeu_ps(dfPtr2+92672+368640*i8+55296*j4+55296*s5+768*k7, out162);
_mm512_storeu_ps(dfPtr2+92800+368640*i8+55296*j4+55296*s5+768*k7, out170);
_mm512_storeu_ps(dfPtr2+92736+368640*i8+55296*j4+55296*s5+768*k7, out166);
_mm512_storeu_ps(dfPtr2+92864+368640*i8+55296*j4+55296*s5+768*k7, out174);
_mm512_storeu_ps(dfPtr2+184832+368640*i8+55296*j4+55296*s5+768*k7, out163);
_mm512_storeu_ps(dfPtr2+184960+368640*i8+55296*j4+55296*s5+768*k7, out171);
_mm512_storeu_ps(dfPtr2+184896+368640*i8+55296*j4+55296*s5+768*k7, out167);
_mm512_storeu_ps(dfPtr2+185024+368640*i8+55296*j4+55296*s5+768*k7, out175);
_mm512_storeu_ps(dfPtr2+276992+368640*i8+55296*j4+55296*s5+768*k7, out164);
_mm512_storeu_ps(dfPtr2+277120+368640*i8+55296*j4+55296*s5+768*k7, out172);
_mm512_storeu_ps(dfPtr2+277056+368640*i8+55296*j4+55296*s5+768*k7, out168);
_mm512_storeu_ps(dfPtr2+277184+368640*i8+55296*j4+55296*s5+768*k7, out176);
}
if (j4 >= last2) return;
++j4;
rel2 = 1;
}
ptrdiff_t h4 = base2+6;
ptrdiff_t w4 = 6;
ptrdiff_t k8 = 0;
for (; k8 != 144; ++k8) {
__m512 dat89 = _mm512_maskz_loadu_ps(16383, datPtr2+0+466560*i8+108*h4+4*w4+124416*s5+864*k8);
__m512 dat90 = _mm512_maskz_loadu_ps(511, datPtr2+48+466560*i8+108*h4+4*w4+124416*s5+864*k8);
__m512i pm29 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in107 = _mm512_permutexvar_ps(pm29, dat89);
__m512i pm30 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in110 = _mm512_permutexvar_ps(pm30, dat90);
__m512 dat91 = _mm512_maskz_loadu_ps(16383, datPtr2+108+466560*i8+108*h4+4*w4+124416*s5+864*k8);
__m512 dat92 = _mm512_maskz_loadu_ps(511, datPtr2+156+466560*i8+108*h4+4*w4+124416*s5+864*k8);
__m512 in108 = _mm512_permutexvar_ps(pm29, dat91);
__m512 in111 = _mm512_permutexvar_ps(pm30, dat92);
__m512 dat93 = _mm512_maskz_loadu_ps(16383, datPtr2+216+466560*i8+108*h4+4*w4+124416*s5+864*k8);
__m512 dat94 = _mm512_maskz_loadu_ps(511, datPtr2+264+466560*i8+108*h4+4*w4+124416*s5+864*k8);
__m512 in109 = _mm512_permutexvar_ps(pm29, dat93);
__m512 in112 = _mm512_permutexvar_ps(pm30, dat94);
__m512 tmp663 = in108;
__m512 tmp670 = in111;
__m512 tmp664 = _mm512_sub_ps(_mm512_setzero_ps(), in109);
__m512 tmp671 = _mm512_sub_ps(_mm512_setzero_ps(), in112);
__m512 tmp665 = in109;
__m512 tmp672 = in112;
in107 = in107;
in110 = in110;
tmp663 = tmp663;
tmp670 = tmp670;
tmp665 = tmp665;
tmp672 = tmp672;
in107 = _mm512_fmadd_ps(tmp664, _mm512_set1_ps(5.25e+00f), in107);
in110 = _mm512_fmadd_ps(tmp671, _mm512_set1_ps(5.25e+00f), in110);
tmp664 = _mm512_mul_ps(in109, _mm512_set1_ps(2.5e-01f));
tmp671 = _mm512_mul_ps(in112, _mm512_set1_ps(2.5e-01f));
in109 = _mm512_mul_ps(in109, _mm512_set1_ps(4e+00f));
in112 = _mm512_mul_ps(in112, _mm512_set1_ps(4e+00f));
__m512 tmp666 = _mm512_sub_ps(tmp665, tmp663);
__m512 tmp673 = _mm512_sub_ps(tmp672, tmp670);
tmp665 = _mm512_add_ps(tmp663, tmp665);
tmp672 = _mm512_add_ps(tmp670, tmp672);
tmp663 = _mm512_mul_ps(in108, _mm512_set1_ps(2.5e-01f));
tmp670 = _mm512_mul_ps(in111, _mm512_set1_ps(2.5e-01f));
tmp664 = tmp664;
tmp671 = tmp671;
__m512 tmp667 = in109;
__m512 tmp674 = in112;
tmp663 = tmp663;
tmp670 = tmp670;
__m512 tmp668 = _mm512_fmadd_ps(tmp663, _mm512_set1_ps(2e+00f), tmp664);
__m512 tmp675 = _mm512_fmadd_ps(tmp670, _mm512_set1_ps(2e+00f), tmp671);
tmp664 = _mm512_fnmadd_ps(tmp663, _mm512_set1_ps(2e+00f), tmp664);
tmp671 = _mm512_fnmadd_ps(tmp670, _mm512_set1_ps(2e+00f), tmp671);
tmp663 = in108;
tmp670 = in111;
in108 = _mm512_sub_ps(_mm512_setzero_ps(), in108);
in111 = _mm512_sub_ps(_mm512_setzero_ps(), in111);
tmp663 = tmp663;
tmp670 = tmp670;
__m512 tmp669 = in108;
__m512 tmp676 = in111;
in109 = _mm512_fmadd_ps(tmp663, _mm512_set1_ps(2e+00f), tmp667);
in112 = _mm512_fmadd_ps(tmp670, _mm512_set1_ps(2e+00f), tmp674);
tmp667 = _mm512_fnmadd_ps(tmp663, _mm512_set1_ps(2e+00f), tmp667);
tmp674 = _mm512_fnmadd_ps(tmp670, _mm512_set1_ps(2e+00f), tmp674);
__m512 tmp685 = _mm512_unpacklo_ps(in107, tmp665);
__m512 tmp686 = _mm512_unpackhi_ps(in107, tmp665);
__m512 tmp687 = _mm512_unpacklo_ps(tmp666, tmp668);
__m512 tmp688 = _mm512_unpackhi_ps(tmp666, tmp668);
__m512 tmp689 = _mm512_unpacklo_ps(tmp664, in109);
__m512 tmp690 = _mm512_unpackhi_ps(tmp664, in109);
__m512 tmp691 = _mm512_unpacklo_ps(tmp667, tmp669);
__m512 tmp692 = _mm512_unpackhi_ps(tmp667, tmp669);
__m512 tmp693 = _mm512_unpacklo_ps(in110, tmp672);
__m512 tmp694 = _mm512_unpackhi_ps(in110, tmp672);
__m512 tmp695 = _mm512_unpacklo_ps(tmp673, tmp675);
__m512 tmp696 = _mm512_unpackhi_ps(tmp673, tmp675);
__m512 tmp697 = _mm512_unpacklo_ps(tmp671, in112);
__m512 tmp698 = _mm512_unpackhi_ps(tmp671, in112);
__m512 tmp699 = _mm512_unpacklo_ps(tmp674, tmp676);
__m512 tmp700 = _mm512_unpackhi_ps(tmp674, tmp676);
__m512 tmp701 = _mm512_shuffle_ps(tmp685, tmp687, 68);
__m512 tmp702 = _mm512_shuffle_ps(tmp685, tmp687, 238);
__m512 tmp703 = _mm512_shuffle_ps(tmp686, tmp688, 68);
__m512 tmp704 = _mm512_shuffle_ps(tmp686, tmp688, 238);
__m512 tmp705 = _mm512_shuffle_ps(tmp689, tmp691, 68);
__m512 tmp706 = _mm512_shuffle_ps(tmp689, tmp691, 238);
__m512 tmp707 = _mm512_shuffle_ps(tmp690, tmp692, 68);
__m512 tmp708 = _mm512_shuffle_ps(tmp690, tmp692, 238);
__m512 tmp709 = _mm512_shuffle_ps(tmp693, tmp695, 68);
__m512 tmp710 = _mm512_shuffle_ps(tmp693, tmp695, 238);
__m512 tmp711 = _mm512_shuffle_ps(tmp694, tmp696, 68);
__m512 tmp712 = _mm512_shuffle_ps(tmp694, tmp696, 238);
__m512 tmp713 = _mm512_shuffle_ps(tmp697, tmp699, 68);
__m512 tmp714 = _mm512_shuffle_ps(tmp697, tmp699, 238);
__m512 tmp715 = _mm512_shuffle_ps(tmp698, tmp700, 68);
__m512 tmp716 = _mm512_shuffle_ps(tmp698, tmp700, 238);
__m512 tmp717 = _mm512_shuffle_f32x4(tmp701, tmp705, 136);
__m512 tmp718 = _mm512_shuffle_f32x4(tmp701, tmp705, 221);
__m512 tmp719 = _mm512_shuffle_f32x4(tmp702, tmp706, 136);
__m512 tmp720 = _mm512_shuffle_f32x4(tmp702, tmp706, 221);
__m512 tmp721 = _mm512_shuffle_f32x4(tmp703, tmp707, 136);
__m512 tmp722 = _mm512_shuffle_f32x4(tmp703, tmp707, 221);
__m512 tmp723 = _mm512_shuffle_f32x4(tmp704, tmp708, 136);
__m512 tmp724 = _mm512_shuffle_f32x4(tmp704, tmp708, 221);
__m512 tmp725 = _mm512_shuffle_f32x4(tmp709, tmp713, 136);
__m512 tmp726 = _mm512_shuffle_f32x4(tmp709, tmp713, 221);
__m512 tmp727 = _mm512_shuffle_f32x4(tmp710, tmp714, 136);
__m512 tmp728 = _mm512_shuffle_f32x4(tmp710, tmp714, 221);
__m512 tmp729 = _mm512_shuffle_f32x4(tmp711, tmp715, 136);
__m512 tmp730 = _mm512_shuffle_f32x4(tmp711, tmp715, 221);
__m512 tmp731 = _mm512_shuffle_f32x4(tmp712, tmp716, 136);
__m512 tmp732 = _mm512_shuffle_f32x4(tmp712, tmp716, 221);
in107 = _mm512_shuffle_f32x4(tmp717, tmp725, 136);
in110 = _mm512_shuffle_f32x4(tmp717, tmp725, 221);
tmp665 = _mm512_shuffle_f32x4(tmp719, tmp727, 136);
tmp672 = _mm512_shuffle_f32x4(tmp719, tmp727, 221);
tmp666 = _mm512_shuffle_f32x4(tmp721, tmp729, 136);
tmp673 = _mm512_shuffle_f32x4(tmp721, tmp729, 221);
tmp668 = _mm512_shuffle_f32x4(tmp723, tmp731, 136);
tmp675 = _mm512_shuffle_f32x4(tmp723, tmp731, 221);
tmp664 = _mm512_shuffle_f32x4(tmp718, tmp726, 136);
tmp671 = _mm512_shuffle_f32x4(tmp718, tmp726, 221);
in109 = _mm512_shuffle_f32x4(tmp720, tmp728, 136);
in112 = _mm512_shuffle_f32x4(tmp720, tmp728, 221);
tmp667 = _mm512_shuffle_f32x4(tmp722, tmp730, 136);
tmp674 = _mm512_shuffle_f32x4(tmp722, tmp730, 221);
tmp669 = _mm512_shuffle_f32x4(tmp724, tmp732, 136);
tmp676 = _mm512_shuffle_f32x4(tmp724, tmp732, 221);
__m512 tmp677 = _mm512_add_ps(tmp665, in109);
__m512 tmp681 = _mm512_add_ps(tmp672, in112);
__m512 tmp678 = _mm512_sub_ps(tmp664, tmp666);
__m512 tmp682 = _mm512_sub_ps(tmp671, tmp673);
__m512 tmp679 = _mm512_add_ps(tmp666, tmp667);
__m512 tmp683 = _mm512_add_ps(tmp673, tmp674);
in107 = _mm512_sub_ps(in107, tmp667);
in110 = _mm512_sub_ps(in110, tmp674);
tmp677 = _mm512_fmadd_ps(tmp668, _mm512_set1_ps(-4.25e+00f), tmp677);
tmp681 = _mm512_fmadd_ps(tmp675, _mm512_set1_ps(-4.25e+00f), tmp681);
tmp679 = _mm512_fmadd_ps(tmp664, _mm512_set1_ps(-4.25e+00f), tmp679);
tmp683 = _mm512_fmadd_ps(tmp671, _mm512_set1_ps(-4.25e+00f), tmp683);
in107 = _mm512_fmadd_ps(tmp678, _mm512_set1_ps(5.25e+00f), in107);
in110 = _mm512_fmadd_ps(tmp682, _mm512_set1_ps(5.25e+00f), in110);
tmp678 = _mm512_fmadd_ps(tmp666, _mm512_set1_ps(2.5e-01f), tmp667);
tmp682 = _mm512_fmadd_ps(tmp673, _mm512_set1_ps(2.5e-01f), tmp674);
tmp666 = _mm512_fmadd_ps(tmp666, _mm512_set1_ps(4e+00f), tmp667);
tmp673 = _mm512_fmadd_ps(tmp673, _mm512_set1_ps(4e+00f), tmp674);
__m512 tmp680 = _mm512_sub_ps(tmp679, tmp677);
__m512 tmp684 = _mm512_sub_ps(tmp683, tmp681);
tmp679 = _mm512_add_ps(tmp677, tmp679);
tmp683 = _mm512_add_ps(tmp681, tmp683);
tmp677 = _mm512_fmadd_ps(tmp665, _mm512_set1_ps(2.5e-01f), in109);
tmp681 = _mm512_fmadd_ps(tmp672, _mm512_set1_ps(2.5e-01f), in112);
tmp678 = _mm512_fmadd_ps(tmp664, _mm512_set1_ps(-1.25e+00f), tmp678);
tmp682 = _mm512_fmadd_ps(tmp671, _mm512_set1_ps(-1.25e+00f), tmp682);
tmp664 = _mm512_fmadd_ps(tmp664, _mm512_set1_ps(-5e+00f), tmp666);
tmp671 = _mm512_fmadd_ps(tmp671, _mm512_set1_ps(-5e+00f), tmp673);
tmp677 = _mm512_fmadd_ps(tmp668, _mm512_set1_ps(-1.25e+00f), tmp677);
tmp681 = _mm512_fmadd_ps(tmp675, _mm512_set1_ps(-1.25e+00f), tmp681);
tmp667 = _mm512_fmadd_ps(tmp677, _mm512_set1_ps(2e+00f), tmp678);
tmp674 = _mm512_fmadd_ps(tmp681, _mm512_set1_ps(2e+00f), tmp682);
tmp678 = _mm512_fnmadd_ps(tmp677, _mm512_set1_ps(2e+00f), tmp678);
tmp682 = _mm512_fnmadd_ps(tmp681, _mm512_set1_ps(2e+00f), tmp682);
tmp677 = _mm512_fmadd_ps(in109, _mm512_set1_ps(2.5e-01f), tmp665);
tmp681 = _mm512_fmadd_ps(in112, _mm512_set1_ps(2.5e-01f), tmp672);
tmp665 = _mm512_sub_ps(tmp669, tmp665);
tmp672 = _mm512_sub_ps(tmp676, tmp672);
tmp677 = _mm512_fmadd_ps(tmp668, _mm512_set1_ps(-1.25e+00f), tmp677);
tmp681 = _mm512_fmadd_ps(tmp675, _mm512_set1_ps(-1.25e+00f), tmp681);
tmp668 = _mm512_sub_ps(tmp668, in109);
tmp675 = _mm512_sub_ps(tmp675, in112);
tmp668 = _mm512_fmadd_ps(tmp668, _mm512_set1_ps(5.25e+00f), tmp665);
tmp675 = _mm512_fmadd_ps(tmp675, _mm512_set1_ps(5.25e+00f), tmp672);
tmp666 = _mm512_fmadd_ps(tmp677, _mm512_set1_ps(2e+00f), tmp664);
tmp673 = _mm512_fmadd_ps(tmp681, _mm512_set1_ps(2e+00f), tmp671);
tmp664 = _mm512_fnmadd_ps(tmp677, _mm512_set1_ps(2e+00f), tmp664);
tmp671 = _mm512_fnmadd_ps(tmp681, _mm512_set1_ps(2e+00f), tmp671);
__m512 out177 = _mm512_shuffle_f32x4(in107, tmp679, 68);
__m512 out185 = _mm512_shuffle_f32x4(in107, tmp679, 238);
__m512 out178 = _mm512_shuffle_f32x4(tmp680, tmp667, 68);
__m512 out186 = _mm512_shuffle_f32x4(tmp680, tmp667, 238);
__m512 out179 = _mm512_shuffle_f32x4(tmp678, tmp666, 68);
__m512 out187 = _mm512_shuffle_f32x4(tmp678, tmp666, 238);
__m512 out180 = _mm512_shuffle_f32x4(tmp664, tmp668, 68);
__m512 out188 = _mm512_shuffle_f32x4(tmp664, tmp668, 238);
__m512 out181 = _mm512_shuffle_f32x4(in110, tmp683, 68);
__m512 out189 = _mm512_shuffle_f32x4(in110, tmp683, 238);
__m512 out182 = _mm512_shuffle_f32x4(tmp684, tmp674, 68);
__m512 out190 = _mm512_shuffle_f32x4(tmp684, tmp674, 238);
__m512 out183 = _mm512_shuffle_f32x4(tmp682, tmp673, 68);
__m512 out191 = _mm512_shuffle_f32x4(tmp682, tmp673, 238);
__m512 out184 = _mm512_shuffle_f32x4(tmp671, tmp675, 68);
__m512 out192 = _mm512_shuffle_f32x4(tmp671, tmp675, 238);
_mm512_storeu_ps(dfPtr2+0+368640*i8+55296*j4+36864*s5+256*k8, out177);
_mm512_storeu_ps(dfPtr2+128+368640*i8+55296*j4+36864*s5+256*k8, out185);
_mm512_storeu_ps(dfPtr2+64+368640*i8+55296*j4+36864*s5+256*k8, out181);
_mm512_storeu_ps(dfPtr2+192+368640*i8+55296*j4+36864*s5+256*k8, out189);
_mm512_storeu_ps(dfPtr2+92160+368640*i8+55296*j4+36864*s5+256*k8, out178);
_mm512_storeu_ps(dfPtr2+92288+368640*i8+55296*j4+36864*s5+256*k8, out186);
_mm512_storeu_ps(dfPtr2+92224+368640*i8+55296*j4+36864*s5+256*k8, out182);
_mm512_storeu_ps(dfPtr2+92352+368640*i8+55296*j4+36864*s5+256*k8, out190);
_mm512_storeu_ps(dfPtr2+184320+368640*i8+55296*j4+36864*s5+256*k8, out179);
_mm512_storeu_ps(dfPtr2+184448+368640*i8+55296*j4+36864*s5+256*k8, out187);
_mm512_storeu_ps(dfPtr2+184384+368640*i8+55296*j4+36864*s5+256*k8, out183);
_mm512_storeu_ps(dfPtr2+184512+368640*i8+55296*j4+36864*s5+256*k8, out191);
_mm512_storeu_ps(dfPtr2+276480+368640*i8+55296*j4+36864*s5+256*k8, out180);
_mm512_storeu_ps(dfPtr2+276608+368640*i8+55296*j4+36864*s5+256*k8, out188);
_mm512_storeu_ps(dfPtr2+276544+368640*i8+55296*j4+36864*s5+256*k8, out184);
_mm512_storeu_ps(dfPtr2+276672+368640*i8+55296*j4+36864*s5+256*k8, out192);
}
if (j4 >= last2) return;
++j4;
}

// Packages the dats-arrangement work as a threader task and submits it.
//
// team15:   threader team handed straight through to Example12ThreaderDo1.
// tensors3: tensor pointer array, stored as the task's opaque payload (any1)
//           for Example12ThreeArrangeDats1Callee1 to pick up.
//
// The task is declared with 4 dimensions whose hull extents are
// {1, 2, 1, 2}. NOTE(review): how the threader splits that hull across
// threads is defined by Example12ThreaderDo1, which is outside this view.
static void Example12ThreeArrangeDats1(Example12ThreaderTeam1* team15, char** tensors3) {
    // Per-dimension extents of the iteration hull, in index order.
    static const int hullExtents[4] = {1, 2, 1, 2};

    Example12ThreaderTask1 job;
    job.nd1 = 4;
    for (int dim = 0; dim != 4; ++dim) {
        job.hull1[dim] = hullExtents[dim];
    }
    job.any1 = tensors3;
    job.callee1 = Example12ThreeArrangeDats1Callee1;

    Example12ThreaderDo1(team15, &job);
}

static void Example12ThreeProduceSums1Callee1(Example12ThreaderTask1* task8, int64_t* pt9) {
void** pair2 = task8->any1;
char** tensors6 = pair2[0];
ptrdiff_t e3 = 0;
ptrdiff_t g4 = 0;
ptrdiff_t f2 = pt9[2];
ptrdiff_t d1 = pt9[1];
ptrdiff_t w5 = pt9[0];
char*restrict bfPtr3 = tensors6[0]+3096*e3;
char*restrict wfPtr3 = tensors6[0]+6208+39232512*e3;
char*restrict dfPtr3 = tensors6[1]+1013760*e3;
char*restrict sfPtr1 = tensors6[2];
ptrdiff_t i9 = 1*g4;
ptrdiff_t j5 = 1*f2;
ptrdiff_t k9 = 1*d1;
ptrdiff_t kk1 = k9+0;
for (; k9 != 1; ++k9) {
ptrdiff_t l1 = 2*w5;
ptrdiff_t ll1 = l1+1;
for (; l1 != 193; ++l1) {
__m512 sum2;
__m512 sum8;
__m512 sum14;
__m512 sum20;
if (__builtin_expect(!j5, 0)) {
sum2 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr3+0+3096*i9+16*l1)));
sum8 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr3+4+3096*i9+16*l1)));
sum14 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr3+8+3096*i9+16*l1)));
sum20 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr3+12+3096*i9+16*l1)));
} else {
sum2 = _mm512_setzero_ps();
sum8 = _mm512_setzero_ps();
sum14 = _mm512_setzero_ps();
sum20 = _mm512_setzero_ps();
}
__m512 sum3 = sum2;
__m512 sum4 = sum2;
__m512 sum5 = sum2;
__m512 sum6 = sum2;
__m512 sum7 = sum2;
__m512 sum9 = sum8;
__m512 sum10 = sum8;
__m512 sum11 = sum8;
__m512 sum12 = sum8;
__m512 sum13 = sum8;
__m512 sum15 = sum14;
__m512 sum16 = sum14;
__m512 sum17 = sum14;
__m512 sum18 = sum14;
__m512 sum19 = sum14;
__m512 sum21 = sum20;
__m512 sum22 = sum20;
__m512 sum23 = sum20;
__m512 sum24 = sum20;
__m512 sum25 = sum20;
ptrdiff_t b3 = 0;
for (; b3 != 396; ++b3) {
__m512i wfs1 = _mm512_maskz_loadu_epi32(65535, wfPtr3+0+39232512*i9+9808128*j5+50688*l1+128*b3);
__m512 wf65 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs1));
__m512 df1 = _mm512_loadu_ps(dfPtr3+0+1013760*i9+253440*j5+152064*k9+384*b3);
sum2 = _mm512_fmadd_ps(wf65, df1, sum2);
__m512 df2 = _mm512_loadu_ps(dfPtr3+64+1013760*i9+253440*j5+152064*k9+384*b3);
sum3 = _mm512_fmadd_ps(wf65, df2, sum3);
__m512 df3 = _mm512_loadu_ps(dfPtr3+128+1013760*i9+253440*j5+152064*k9+384*b3);
sum4 = _mm512_fmadd_ps(wf65, df3, sum4);
__m512 df4 = _mm512_loadu_ps(dfPtr3+192+1013760*i9+253440*j5+152064*k9+384*b3);
sum5 = _mm512_fmadd_ps(wf65, df4, sum5);
__m512 df5 = _mm512_loadu_ps(dfPtr3+256+1013760*i9+253440*j5+152064*k9+384*b3);
sum6 = _mm512_fmadd_ps(wf65, df5, sum6);
__m512 df6 = _mm512_loadu_ps(dfPtr3+320+1013760*i9+253440*j5+152064*k9+384*b3);
sum7 = _mm512_fmadd_ps(wf65, df6, sum7);
__m512 wf66 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs1, 1));
sum8 = _mm512_fmadd_ps(wf66, df1, sum8);
sum9 = _mm512_fmadd_ps(wf66, df2, sum9);
sum10 = _mm512_fmadd_ps(wf66, df3, sum10);
sum11 = _mm512_fmadd_ps(wf66, df4, sum11);
sum12 = _mm512_fmadd_ps(wf66, df5, sum12);
sum13 = _mm512_fmadd_ps(wf66, df6, sum13);
__m512i wfs2 = _mm512_maskz_loadu_epi32(65535, wfPtr3+64+39232512*i9+9808128*j5+50688*l1+128*b3);
__m512 wf67 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs2));
sum14 = _mm512_fmadd_ps(wf67, df1, sum14);
sum15 = _mm512_fmadd_ps(wf67, df2, sum15);
sum16 = _mm512_fmadd_ps(wf67, df3, sum16);
sum17 = _mm512_fmadd_ps(wf67, df4, sum17);
sum18 = _mm512_fmadd_ps(wf67, df5, sum18);
sum19 = _mm512_fmadd_ps(wf67, df6, sum19);
__m512 wf68 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs2, 1));
sum20 = _mm512_fmadd_ps(wf68, df1, sum20);
sum21 = _mm512_fmadd_ps(wf68, df2, sum21);
sum22 = _mm512_fmadd_ps(wf68, df3, sum22);
sum23 = _mm512_fmadd_ps(wf68, df4, sum23);
sum24 = _mm512_fmadd_ps(wf68, df5, sum24);
sum25 = _mm512_fmadd_ps(wf68, df6, sum25);
}
_mm512_storeu_ps(sfPtr1+0+1981440*i9+495360*j5+297216*k9+1536*l1, sum2);
_mm512_storeu_ps(sfPtr1+64+1981440*i9+495360*j5+297216*k9+1536*l1, sum3);
_mm512_storeu_ps(sfPtr1+128+1981440*i9+495360*j5+297216*k9+1536*l1, sum4);
_mm512_storeu_ps(sfPtr1+192+1981440*i9+495360*j5+297216*k9+1536*l1, sum5);
_mm512_storeu_ps(sfPtr1+256+1981440*i9+495360*j5+297216*k9+1536*l1, sum6);
_mm512_storeu_ps(sfPtr1+320+1981440*i9+495360*j5+297216*k9+1536*l1, sum7);
_mm512_storeu_ps(sfPtr1+384+1981440*i9+495360*j5+297216*k9+1536*l1, sum8);
_mm512_storeu_ps(sfPtr1+448+1981440*i9+495360*j5+297216*k9+1536*l1, sum9);
_mm512_storeu_ps(sfPtr1+512+1981440*i9+495360*j5+297216*k9+1536*l1, sum10);
_mm512_storeu_ps(sfPtr1+576+1981440*i9+495360*j5+297216*k9+1536*l1, sum11);
_mm512_storeu_ps(sfPtr1+640+1981440*i9+495360*j5+297216*k9+1536*l1, sum12);
_mm512_storeu_ps(sfPtr1+704+1981440*i9+495360*j5+297216*k9+1536*l1, sum13);
_mm512_storeu_ps(sfPtr1+768+1981440*i9+495360*j5+297216*k9+1536*l1, sum14);
_mm512_storeu_ps(sfPtr1+832+1981440*i9+495360*j5+297216*k9+1536*l1, sum15);
_mm512_storeu_ps(sfPtr1+896+1981440*i9+495360*j5+297216*k9+1536*l1, sum16);
_mm512_storeu_ps(sfPtr1+960+1981440*i9+495360*j5+297216*k9+1536*l1, sum17);
_mm512_storeu_ps(sfPtr1+1024+1981440*i9+495360*j5+297216*k9+1536*l1, sum18);
_mm512_storeu_ps(sfPtr1+1088+1981440*i9+495360*j5+297216*k9+1536*l1, sum19);
_mm512_storeu_ps(sfPtr1+1152+1981440*i9+495360*j5+297216*k9+1536*l1, sum20);
_mm512_storeu_ps(sfPtr1+1216+1981440*i9+495360*j5+297216*k9+1536*l1, sum21);
_mm512_storeu_ps(sfPtr1+1280+1981440*i9+495360*j5+297216*k9+1536*l1, sum22);
_mm512_storeu_ps(sfPtr1+1344+1981440*i9+495360*j5+297216*k9+1536*l1, sum23);
_mm512_storeu_ps(sfPtr1+1408+1981440*i9+495360*j5+297216*k9+1536*l1, sum24);
_mm512_storeu_ps(sfPtr1+1472+1981440*i9+495360*j5+297216*k9+1536*l1, sum25);
if (l1 >= ll1) return;
}
__m512 sum26;
__m512 sum32;
if (__builtin_expect(!j5, 0)) {
sum26 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr3+0+3096*i9+16*l1)));
sum32 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr3+4+3096*i9+16*l1)));
} else {
sum26 = _mm512_setzero_ps();
sum32 = _mm512_setzero_ps();
}
__m512 sum27 = sum26;
__m512 sum28 = sum26;
__m512 sum29 = sum26;
__m512 sum30 = sum26;
__m512 sum31 = sum26;
__m512 sum33 = sum32;
__m512 sum34 = sum32;
__m512 sum35 = sum32;
__m512 sum36 = sum32;
__m512 sum37 = sum32;
ptrdiff_t b4 = 0;
for (; b4 != 396; ++b4) {
__m512i wfs3 = _mm512_maskz_loadu_epi32(65535, wfPtr3+0+39232512*i9+9808128*j5+50688*l1+64*b4);
__m512 wf69 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs3));
__m512 df7 = _mm512_loadu_ps(dfPtr3+0+1013760*i9+253440*j5+152064*k9+384*b4);
sum26 = _mm512_fmadd_ps(wf69, df7, sum26);
__m512 df8 = _mm512_loadu_ps(dfPtr3+64+1013760*i9+253440*j5+152064*k9+384*b4);
sum27 = _mm512_fmadd_ps(wf69, df8, sum27);
__m512 df9 = _mm512_loadu_ps(dfPtr3+128+1013760*i9+253440*j5+152064*k9+384*b4);
sum28 = _mm512_fmadd_ps(wf69, df9, sum28);
__m512 df10 = _mm512_loadu_ps(dfPtr3+192+1013760*i9+253440*j5+152064*k9+384*b4);
sum29 = _mm512_fmadd_ps(wf69, df10, sum29);
__m512 df11 = _mm512_loadu_ps(dfPtr3+256+1013760*i9+253440*j5+152064*k9+384*b4);
sum30 = _mm512_fmadd_ps(wf69, df11, sum30);
__m512 df12 = _mm512_loadu_ps(dfPtr3+320+1013760*i9+253440*j5+152064*k9+384*b4);
sum31 = _mm512_fmadd_ps(wf69, df12, sum31);
__m512 wf70 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs3, 1));
sum32 = _mm512_fmadd_ps(wf70, df7, sum32);
sum33 = _mm512_fmadd_ps(wf70, df8, sum33);
sum34 = _mm512_fmadd_ps(wf70, df9, sum34);
sum35 = _mm512_fmadd_ps(wf70, df10, sum35);
sum36 = _mm512_fmadd_ps(wf70, df11, sum36);
sum37 = _mm512_fmadd_ps(wf70, df12, sum37);
}
_mm512_storeu_ps(sfPtr1+0+1981440*i9+495360*j5+297216*k9+1536*l1, sum26);
_mm512_storeu_ps(sfPtr1+64+1981440*i9+495360*j5+297216*k9+1536*l1, sum27);
_mm512_storeu_ps(sfPtr1+128+1981440*i9+495360*j5+297216*k9+1536*l1, sum28);
_mm512_storeu_ps(sfPtr1+192+1981440*i9+495360*j5+297216*k9+1536*l1, sum29);
_mm512_storeu_ps(sfPtr1+256+1981440*i9+495360*j5+297216*k9+1536*l1, sum30);
_mm512_storeu_ps(sfPtr1+320+1981440*i9+495360*j5+297216*k9+1536*l1, sum31);
_mm512_storeu_ps(sfPtr1+384+1981440*i9+495360*j5+297216*k9+1536*l1, sum32);
_mm512_storeu_ps(sfPtr1+448+1981440*i9+495360*j5+297216*k9+1536*l1, sum33);
_mm512_storeu_ps(sfPtr1+512+1981440*i9+495360*j5+297216*k9+1536*l1, sum34);
_mm512_storeu_ps(sfPtr1+576+1981440*i9+495360*j5+297216*k9+1536*l1, sum35);
_mm512_storeu_ps(sfPtr1+640+1981440*i9+495360*j5+297216*k9+1536*l1, sum36);
_mm512_storeu_ps(sfPtr1+704+1981440*i9+495360*j5+297216*k9+1536*l1, sum37);
if (k9 >= kk1) return;
}
ptrdiff_t l2 = 2*w5;
ptrdiff_t ll2 = l2+1;
for (; l2 != 193; ++l2) {
__m512 sum38;
__m512 sum42;
__m512 sum46;
__m512 sum50;
if (__builtin_expect(!j5, 0)) {
sum38 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr3+0+3096*i9+16*l2)));
sum42 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr3+4+3096*i9+16*l2)));
sum46 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr3+8+3096*i9+16*l2)));
sum50 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr3+12+3096*i9+16*l2)));
} else {
sum38 = _mm512_setzero_ps();
sum42 = _mm512_setzero_ps();
sum46 = _mm512_setzero_ps();
sum50 = _mm512_setzero_ps();
}
__m512 sum39 = sum38;
__m512 sum40 = sum38;
__m512 sum41 = sum38;
__m512 sum43 = sum42;
__m512 sum44 = sum42;
__m512 sum45 = sum42;
__m512 sum47 = sum46;
__m512 sum48 = sum46;
__m512 sum49 = sum46;
__m512 sum51 = sum50;
__m512 sum52 = sum50;
__m512 sum53 = sum50;
ptrdiff_t b5 = 0;
for (; b5 != 396; ++b5) {
__m512i wfs4 = _mm512_maskz_loadu_epi32(65535, wfPtr3+0+39232512*i9+9808128*j5+50688*l2+128*b5);
__m512 wf71 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs4));
__m512 df13 = _mm512_loadu_ps(dfPtr3+0+1013760*i9+253440*j5+152064*k9+256*b5);
sum38 = _mm512_fmadd_ps(wf71, df13, sum38);
__m512 df14 = _mm512_loadu_ps(dfPtr3+64+1013760*i9+253440*j5+152064*k9+256*b5);
sum39 = _mm512_fmadd_ps(wf71, df14, sum39);
__m512 df15 = _mm512_loadu_ps(dfPtr3+128+1013760*i9+253440*j5+152064*k9+256*b5);
sum40 = _mm512_fmadd_ps(wf71, df15, sum40);
__m512 df16 = _mm512_loadu_ps(dfPtr3+192+1013760*i9+253440*j5+152064*k9+256*b5);
sum41 = _mm512_fmadd_ps(wf71, df16, sum41);
__m512 wf72 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs4, 1));
sum42 = _mm512_fmadd_ps(wf72, df13, sum42);
sum43 = _mm512_fmadd_ps(wf72, df14, sum43);
sum44 = _mm512_fmadd_ps(wf72, df15, sum44);
sum45 = _mm512_fmadd_ps(wf72, df16, sum45);
__m512i wfs5 = _mm512_maskz_loadu_epi32(65535, wfPtr3+64+39232512*i9+9808128*j5+50688*l2+128*b5);
__m512 wf73 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs5));
sum46 = _mm512_fmadd_ps(wf73, df13, sum46);
sum47 = _mm512_fmadd_ps(wf73, df14, sum47);
sum48 = _mm512_fmadd_ps(wf73, df15, sum48);
sum49 = _mm512_fmadd_ps(wf73, df16, sum49);
__m512 wf74 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs5, 1));
sum50 = _mm512_fmadd_ps(wf74, df13, sum50);
sum51 = _mm512_fmadd_ps(wf74, df14, sum51);
sum52 = _mm512_fmadd_ps(wf74, df15, sum52);
sum53 = _mm512_fmadd_ps(wf74, df16, sum53);
}
_mm512_storeu_ps(sfPtr1+0+1981440*i9+495360*j5+297216*k9+1024*l2, sum38);
_mm512_storeu_ps(sfPtr1+64+1981440*i9+495360*j5+297216*k9+1024*l2, sum39);
_mm512_storeu_ps(sfPtr1+128+1981440*i9+495360*j5+297216*k9+1024*l2, sum40);
_mm512_storeu_ps(sfPtr1+192+1981440*i9+495360*j5+297216*k9+1024*l2, sum41);
_mm512_storeu_ps(sfPtr1+256+1981440*i9+495360*j5+297216*k9+1024*l2, sum42);
_mm512_storeu_ps(sfPtr1+320+1981440*i9+495360*j5+297216*k9+1024*l2, sum43);
_mm512_storeu_ps(sfPtr1+384+1981440*i9+495360*j5+297216*k9+1024*l2, sum44);
_mm512_storeu_ps(sfPtr1+448+1981440*i9+495360*j5+297216*k9+1024*l2, sum45);
_mm512_storeu_ps(sfPtr1+512+1981440*i9+495360*j5+297216*k9+1024*l2, sum46);
_mm512_storeu_ps(sfPtr1+576+1981440*i9+495360*j5+297216*k9+1024*l2, sum47);
_mm512_storeu_ps(sfPtr1+640+1981440*i9+495360*j5+297216*k9+1024*l2, sum48);
_mm512_storeu_ps(sfPtr1+704+1981440*i9+495360*j5+297216*k9+1024*l2, sum49);
_mm512_storeu_ps(sfPtr1+768+1981440*i9+495360*j5+297216*k9+1024*l2, sum50);
_mm512_storeu_ps(sfPtr1+832+1981440*i9+495360*j5+297216*k9+1024*l2, sum51);
_mm512_storeu_ps(sfPtr1+896+1981440*i9+495360*j5+297216*k9+1024*l2, sum52);
_mm512_storeu_ps(sfPtr1+960+1981440*i9+495360*j5+297216*k9+1024*l2, sum53);
if (l2 >= ll2) return;
}
__m512 sum54;
__m512 sum58;
if (__builtin_expect(!j5, 0)) {
sum54 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr3+0+3096*i9+16*l2)));
sum58 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr3+4+3096*i9+16*l2)));
} else {
sum54 = _mm512_setzero_ps();
sum58 = _mm512_setzero_ps();
}
__m512 sum55 = sum54;
__m512 sum56 = sum54;
__m512 sum57 = sum54;
__m512 sum59 = sum58;
__m512 sum60 = sum58;
__m512 sum61 = sum58;
ptrdiff_t b6 = 0;
for (; b6 != 396; ++b6) {
__m512i wfs6 = _mm512_maskz_loadu_epi32(65535, wfPtr3+0+39232512*i9+9808128*j5+50688*l2+64*b6);
__m512 wf75 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs6));
__m512 df17 = _mm512_loadu_ps(dfPtr3+0+1013760*i9+253440*j5+152064*k9+256*b6);
sum54 = _mm512_fmadd_ps(wf75, df17, sum54);
__m512 df18 = _mm512_loadu_ps(dfPtr3+64+1013760*i9+253440*j5+152064*k9+256*b6);
sum55 = _mm512_fmadd_ps(wf75, df18, sum55);
__m512 df19 = _mm512_loadu_ps(dfPtr3+128+1013760*i9+253440*j5+152064*k9+256*b6);
sum56 = _mm512_fmadd_ps(wf75, df19, sum56);
__m512 df20 = _mm512_loadu_ps(dfPtr3+192+1013760*i9+253440*j5+152064*k9+256*b6);
sum57 = _mm512_fmadd_ps(wf75, df20, sum57);
__m512 wf76 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs6, 1));
sum58 = _mm512_fmadd_ps(wf76, df17, sum58);
sum59 = _mm512_fmadd_ps(wf76, df18, sum59);
sum60 = _mm512_fmadd_ps(wf76, df19, sum60);
sum61 = _mm512_fmadd_ps(wf76, df20, sum61);
}
_mm512_storeu_ps(sfPtr1+0+1981440*i9+495360*j5+297216*k9+1024*l2, sum54);
_mm512_storeu_ps(sfPtr1+64+1981440*i9+495360*j5+297216*k9+1024*l2, sum55);
_mm512_storeu_ps(sfPtr1+128+1981440*i9+495360*j5+297216*k9+1024*l2, sum56);
_mm512_storeu_ps(sfPtr1+192+1981440*i9+495360*j5+297216*k9+1024*l2, sum57);
_mm512_storeu_ps(sfPtr1+256+1981440*i9+495360*j5+297216*k9+1024*l2, sum58);
_mm512_storeu_ps(sfPtr1+320+1981440*i9+495360*j5+297216*k9+1024*l2, sum59);
_mm512_storeu_ps(sfPtr1+384+1981440*i9+495360*j5+297216*k9+1024*l2, sum60);
_mm512_storeu_ps(sfPtr1+448+1981440*i9+495360*j5+297216*k9+1024*l2, sum61);
}

// Computes one tile of weight*data product sums and adds it into the sums
// buffer at `sm`.
//
//   wt    - packed weights; every 64-byte load is split into two 16-lane
//           half-precision vectors that are widened to fp32. Each reduction
//           step consumes 64*pairs bytes.
//   dt    - fp32 data; each reduction step consumes `cols` 64-byte vectors.
//   sm    - fp32 sums, read-modify-written as 2*pairs groups of `cols`
//           consecutive 64-byte vectors.
//   pairs - number of 64-byte weight loads per step (1 or 2 at the call sites).
//   cols  - number of data vectors per step (4 or 6 at the call sites).
//
// Every call site passes literal `pairs`/`cols`, so after inlining all loop
// bounds below are compile-time constants.
static inline void Example12ThreeProduceSums1Callee2Tile(
    const char*restrict wt, const char*restrict dt, char*restrict sm,
    ptrdiff_t pairs, ptrdiff_t cols) {
    __m512 acc[4][6];
    for (ptrdiff_t r = 0; r != 2*pairs; ++r) {
        for (ptrdiff_t c = 0; c != cols; ++c) {
            acc[r][c] = _mm512_setzero_ps();
        }
    }
    // 144 reduction steps; accumulators start at zero and are only combined
    // with the existing sums after the reduction is finished.
    for (ptrdiff_t step = 0; step != 144; ++step) {
        __m512 vec[6];
        for (ptrdiff_t c = 0; c != cols; ++c) {
            vec[c] = _mm512_loadu_ps(dt+64*cols*step+64*c);
        }
        for (ptrdiff_t p = 0; p != pairs; ++p) {
            // Mask 65535 enables all 16 dword lanes: a full 64-byte load.
            const __m512i packed = _mm512_maskz_loadu_epi32(65535, wt+64*pairs*step+64*p);
            const __m512 lo = _mm512_cvtph_ps(_mm512_castsi512_si256(packed));
            const __m512 hi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packed, 1));
            for (ptrdiff_t c = 0; c != cols; ++c) {
                acc[2*p][c] = _mm512_fmadd_ps(lo, vec[c], acc[2*p][c]);
            }
            for (ptrdiff_t c = 0; c != cols; ++c) {
                acc[2*p+1][c] = _mm512_fmadd_ps(hi, vec[c], acc[2*p+1][c]);
            }
        }
    }
    // Fold in what is already stored, then write everything back.
    for (ptrdiff_t r = 0; r != 2*pairs; ++r) {
        for (ptrdiff_t c = 0; c != cols; ++c) {
            acc[r][c] = _mm512_add_ps(acc[r][c], _mm512_loadu_ps(sm+64*(cols*r+c)));
        }
    }
    for (ptrdiff_t r = 0; r != 2*pairs; ++r) {
        for (ptrdiff_t c = 0; c != cols; ++c) {
            _mm512_storeu_ps(sm+64*(cols*r+c), acc[r][c]);
        }
    }
}

// Threader callee: adds product sums for one task point into the sums buffer
// (tensors[2]), reading weights from tensors[0] and data from tensors[1] at
// the fixed offsets for slice index 1.
//
//   task9 - task whose `any1` is a pair; pair[0] is the tensors array.
//   pt10  - task point: pt10[0] selects a run of weight rows, pt10[1] selects
//           the 6-vector (!= 1) or 4-vector (== 1) data layout, pt10[2]
//           selects the tile.
//
// Unlike the bias-initialising stores visible in the preceding callee, this
// one never reads the bias region; it only accumulates onto existing sums.
static void Example12ThreeProduceSums1Callee2(Example12ThreaderTask1* task9, int64_t* pt10) {
    void** shared = task9->any1;
    char** bufs = shared[0];
    const ptrdiff_t slice = 1;
    const ptrdiff_t chunk = pt10[0];
    const ptrdiff_t half = pt10[1];
    const ptrdiff_t tile = pt10[2];
    char*restrict biasPtr = bufs[0]+3096*slice;
    char*restrict wtBase = bufs[0]+6208+39232512*slice;
    char*restrict dtBase = bufs[1]+1013760*slice;
    char*restrict smBase = bufs[2];
    (void)biasPtr;

    char* wts = wtBase+3566592*tile;
    char* dat = dtBase+92160*tile+55296*half;
    char* sums = smBase+495360*tile+297216*half;

    // Each task point covers rows [4*chunk, stop]; the run that reaches row
    // 193 additionally handles that final, half-height row.
    ptrdiff_t row = 4*chunk;
    const ptrdiff_t stop = row+(chunk < 47 ? 3 : 5);

    if (half != 1) {
        // Six data vectors per step; sums rows are 1536 bytes apart.
        for (; row != 193; ++row) {
            Example12ThreeProduceSums1Callee2Tile(wts+18432*row, dat, sums+1536*row, 2, 6);
            if (row >= stop) {
                return;
            }
        }
        Example12ThreeProduceSums1Callee2Tile(wts+18432*row, dat, sums+1536*row, 1, 6);
        return;
    }

    // Four data vectors per step; sums rows are 1024 bytes apart.
    for (; row != 193; ++row) {
        Example12ThreeProduceSums1Callee2Tile(wts+18432*row, dat, sums+1024*row, 2, 4);
        if (row >= stop) {
            return;
        }
    }
    Example12ThreeProduceSums1Callee2Tile(wts+18432*row, dat, sums+1024*row, 1, 4);
}

// Runs the two sum-producing passes on the given team.
//
//   team16   - threader team handed to Example12ThreaderDo1.
//   tensors5 - tensors array, passed to both callees as element 0 of a
//              two-element pair (element 1 is null).
//
// Pass one dispatches Example12ThreeProduceSums1Callee1 with hull
// {97, 2, 4, 1}; pass two dispatches Example12ThreeProduceSums1Callee2 with
// hull {48, 2, 4, 1}. The second callee adds onto sums that the first one
// stores, so this presumably relies on Example12ThreaderDo1 returning only
// after the task has finished -- confirm against the threader.
static void Example12ThreeProduceSums1(Example12ThreaderTeam1* team16, char** tensors5) {
    static const int extents[2][4] = {{97, 2, 4, 1}, {48, 2, 4, 1}};
    void* shared[2];
    shared[0] = tensors5;
    shared[1] = 0;

    Example12ThreaderTask1 firstPass;
    firstPass.callee1 = Example12ThreeProduceSums1Callee1;
    firstPass.any1 = shared;
    firstPass.nd1 = 4;
    for (int dim = 0; dim != 4; ++dim) {
        firstPass.hull1[dim] = extents[0][dim];
    }
    Example12ThreaderDo1(team16, &firstPass);

    Example12ThreaderTask1 secondPass;
    secondPass.callee1 = Example12ThreeProduceSums1Callee2;
    secondPass.any1 = shared;
    secondPass.nd1 = 4;
    for (int dim = 0; dim != 4; ++dim) {
        secondPass.hull1[dim] = extents[1][dim];
    }
    Example12ThreaderDo1(team16, &secondPass);
}

static void Example12ThreeConsumeSums1Callee1(Example12ThreaderTask1* task12, int64_t* pt11) {
char** tensors9 = task12->any1;
ptrdiff_t w7 = pt11[0];
ptrdiff_t d3 = pt11[1];
ptrdiff_t g6 = 0;
char*restrict sfPtr3 = tensors9[0];
char*restrict datPtr3 = tensors9[1];
ptrdiff_t i11 = 1*g6;
ptrdiff_t j7 = 1*d3;
ptrdiff_t last3 = j7+0;
ptrdiff_t rel3 = j7-0;
ptrdiff_t base3 = 0;
if (rel3 < 1) {
ptrdiff_t toH1 = base3+0;
ptrdiff_t toW1 = 0;
ptrdiff_t k11 = 27*w7;
ptrdiff_t kk3 = k11+(w7 < 6 ? 26 : 31);
for (; k11 != 193; ++k11) {
ptrdiff_t l5 = 0;
for (; l5 != 2; ++l5) {
__m512 sf1 = _mm512_loadu_ps(sfPtr3+0+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf2 = _mm512_loadu_ps(sfPtr3+128+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in113 = _mm512_shuffle_f32x4(sf1, sf2, 68);
__m512 in114 = _mm512_shuffle_f32x4(sf1, sf2, 238);
__m512 sf3 = _mm512_loadu_ps(sfPtr3+64+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf4 = _mm512_loadu_ps(sfPtr3+192+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in121 = _mm512_shuffle_f32x4(sf3, sf4, 68);
__m512 in122 = _mm512_shuffle_f32x4(sf3, sf4, 238);
__m512 sf5 = _mm512_loadu_ps(sfPtr3+495360+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf6 = _mm512_loadu_ps(sfPtr3+495488+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in115 = _mm512_shuffle_f32x4(sf5, sf6, 68);
__m512 in116 = _mm512_shuffle_f32x4(sf5, sf6, 238);
__m512 sf7 = _mm512_loadu_ps(sfPtr3+495424+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf8 = _mm512_loadu_ps(sfPtr3+495552+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in123 = _mm512_shuffle_f32x4(sf7, sf8, 68);
__m512 in124 = _mm512_shuffle_f32x4(sf7, sf8, 238);
__m512 sf9 = _mm512_loadu_ps(sfPtr3+990720+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf10 = _mm512_loadu_ps(sfPtr3+990848+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in117 = _mm512_shuffle_f32x4(sf9, sf10, 68);
__m512 in118 = _mm512_shuffle_f32x4(sf9, sf10, 238);
__m512 sf11 = _mm512_loadu_ps(sfPtr3+990784+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf12 = _mm512_loadu_ps(sfPtr3+990912+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in125 = _mm512_shuffle_f32x4(sf11, sf12, 68);
__m512 in126 = _mm512_shuffle_f32x4(sf11, sf12, 238);
__m512 sf13 = _mm512_loadu_ps(sfPtr3+1486080+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf14 = _mm512_loadu_ps(sfPtr3+1486208+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in119 = _mm512_shuffle_f32x4(sf13, sf14, 68);
__m512 in120 = _mm512_shuffle_f32x4(sf13, sf14, 238);
__m512 sf15 = _mm512_loadu_ps(sfPtr3+1486144+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf16 = _mm512_loadu_ps(sfPtr3+1486272+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in127 = _mm512_shuffle_f32x4(sf15, sf16, 68);
__m512 in128 = _mm512_shuffle_f32x4(sf15, sf16, 238);
__m512 tmp749 = _mm512_add_ps(in114, in115);
__m512 tmp769 = _mm512_add_ps(in122, in123);
__m512 tmp748 = _mm512_add_ps(in116, in117);
__m512 tmp768 = _mm512_add_ps(in124, in125);
__m512 tmp754 = _mm512_sub_ps(in116, in117);
__m512 tmp774 = _mm512_sub_ps(in124, in125);
__m512 tmp753 = _mm512_sub_ps(in114, in115);
__m512 tmp773 = _mm512_sub_ps(in122, in123);
__m512 tmp750 = _mm512_add_ps(in118, in119);
__m512 tmp770 = _mm512_add_ps(in126, in127);
__m512 tmp755 = _mm512_sub_ps(in118, in119);
__m512 tmp775 = _mm512_sub_ps(in126, in127);
__m512 tmp752 = _mm512_fmadd_ps(tmp754, _mm512_set1_ps(2e+00f), tmp753);
__m512 tmp772 = _mm512_fmadd_ps(tmp774, _mm512_set1_ps(2e+00f), tmp773);
__m512 tmp759 = _mm512_fmadd_ps(tmp754, _mm512_set1_ps(8e+00f), tmp753);
__m512 tmp779 = _mm512_fmadd_ps(tmp774, _mm512_set1_ps(8e+00f), tmp773);
__m512 tmp747 = _mm512_add_ps(tmp748, tmp749);
__m512 tmp767 = _mm512_add_ps(tmp768, tmp769);
__m512 tmp751 = _mm512_fmadd_ps(tmp755, _mm512_set1_ps(1.6e+01f), tmp752);
__m512 tmp771 = _mm512_fmadd_ps(tmp775, _mm512_set1_ps(1.6e+01f), tmp772);
__m512 tmp758 = _mm512_fmadd_ps(tmp755, _mm512_set1_ps(4e+00f), tmp759);
__m512 tmp778 = _mm512_fmadd_ps(tmp775, _mm512_set1_ps(4e+00f), tmp779);
__m512 tmp764 = _mm512_add_ps(tmp755, tmp753);
__m512 tmp784 = _mm512_add_ps(tmp775, tmp773);
__m512 tmp757 = _mm512_fmadd_ps(tmp748, _mm512_set1_ps(4e+00f), tmp749);
__m512 tmp777 = _mm512_fmadd_ps(tmp768, _mm512_set1_ps(4e+00f), tmp769);
__m512 tmp761 = _mm512_fmadd_ps(tmp748, _mm512_set1_ps(1.6e+01f), tmp749);
__m512 tmp781 = _mm512_fmadd_ps(tmp768, _mm512_set1_ps(1.6e+01f), tmp769);
__m512 tmp746 = _mm512_add_ps(tmp747, in113);
__m512 tmp766 = _mm512_add_ps(tmp767, in121);
__m512 tmp763 = _mm512_add_ps(tmp764, in120);
__m512 tmp783 = _mm512_add_ps(tmp784, in128);
__m512 tmp745 = _mm512_fmadd_ps(tmp750, _mm512_set1_ps(3.2e+01f), tmp746);
__m512 tmp765 = _mm512_fmadd_ps(tmp770, _mm512_set1_ps(3.2e+01f), tmp766);
__m512 tmp756 = _mm512_fmadd_ps(tmp750, _mm512_set1_ps(8e+00f), tmp757);
__m512 tmp776 = _mm512_fmadd_ps(tmp770, _mm512_set1_ps(8e+00f), tmp777);
__m512 tmp762 = _mm512_fmadd_ps(tmp754, _mm512_set1_ps(3.2e+01f), tmp763);
__m512 tmp782 = _mm512_fmadd_ps(tmp774, _mm512_set1_ps(3.2e+01f), tmp783);
__m512 tmp760 = _mm512_fmadd_ps(tmp750, _mm512_set1_ps(2e+00f), tmp761);
__m512 tmp780 = _mm512_fmadd_ps(tmp770, _mm512_set1_ps(2e+00f), tmp781);
__m512 tmp733 = tmp745;
__m512 tmp739 = tmp765;
__m512 tmp734 = tmp751;
__m512 tmp740 = tmp771;
__m512 tmp735 = tmp756;
__m512 tmp741 = tmp776;
__m512 tmp736 = tmp758;
__m512 tmp742 = tmp778;
__m512 tmp737 = tmp760;
__m512 tmp743 = tmp780;
__m512 tmp738 = tmp762;
__m512 tmp744 = tmp782;
__m512 tmp829 = _mm512_unpacklo_ps(tmp733, tmp734);
__m512 tmp830 = _mm512_unpackhi_ps(tmp733, tmp734);
__m512 tmp831 = _mm512_unpacklo_ps(tmp735, tmp736);
__m512 tmp832 = _mm512_unpackhi_ps(tmp735, tmp736);
__m512 tmp833 = _mm512_unpacklo_ps(tmp737, tmp738);
__m512 tmp834 = _mm512_unpackhi_ps(tmp737, tmp738);
__m512 tmp835 = _mm512_unpacklo_ps(tmp739, tmp740);
__m512 tmp836 = _mm512_unpackhi_ps(tmp739, tmp740);
__m512 tmp837 = _mm512_unpacklo_ps(tmp741, tmp742);
__m512 tmp838 = _mm512_unpackhi_ps(tmp741, tmp742);
__m512 tmp839 = _mm512_unpacklo_ps(tmp743, tmp744);
__m512 tmp840 = _mm512_unpackhi_ps(tmp743, tmp744);
__m512 tmp841 = _mm512_shuffle_ps(tmp829, tmp831, 68);
__m512 tmp842 = _mm512_shuffle_ps(tmp829, tmp831, 238);
__m512 tmp843 = _mm512_shuffle_ps(tmp830, tmp832, 68);
__m512 tmp844 = _mm512_shuffle_ps(tmp830, tmp832, 238);
__m512 tmp845 = _mm512_shuffle_ps(tmp833, tmp835, 68);
__m512 tmp846 = _mm512_shuffle_ps(tmp833, tmp835, 238);
__m512 tmp847 = _mm512_shuffle_ps(tmp834, tmp836, 68);
__m512 tmp848 = _mm512_shuffle_ps(tmp834, tmp836, 238);
__m512 tmp849 = _mm512_shuffle_ps(tmp837, tmp839, 68);
__m512 tmp850 = _mm512_shuffle_ps(tmp837, tmp839, 238);
__m512 tmp851 = _mm512_shuffle_ps(tmp838, tmp840, 68);
__m512 tmp852 = _mm512_shuffle_ps(tmp838, tmp840, 238);
__m512 tmp853 = _mm512_shuffle_f32x4(tmp841, tmp845, 136);
__m512 tmp854 = _mm512_shuffle_f32x4(tmp841, tmp845, 221);
__m512 tmp855 = _mm512_shuffle_f32x4(tmp842, tmp846, 136);
__m512 tmp856 = _mm512_shuffle_f32x4(tmp842, tmp846, 221);
__m512 tmp857 = _mm512_shuffle_f32x4(tmp843, tmp847, 136);
__m512 tmp858 = _mm512_shuffle_f32x4(tmp843, tmp847, 221);
__m512 tmp859 = _mm512_shuffle_f32x4(tmp844, tmp848, 136);
__m512 tmp860 = _mm512_shuffle_f32x4(tmp844, tmp848, 221);
__m512 tmp861 = _mm512_shuffle_f32x4(tmp849, tmp849, 136);
__m512 tmp862 = _mm512_shuffle_f32x4(tmp849, tmp849, 221);
__m512 tmp863 = _mm512_shuffle_f32x4(tmp850, tmp850, 136);
__m512 tmp864 = _mm512_shuffle_f32x4(tmp850, tmp850, 221);
__m512 tmp865 = _mm512_shuffle_f32x4(tmp851, tmp851, 136);
__m512 tmp866 = _mm512_shuffle_f32x4(tmp851, tmp851, 221);
__m512 tmp867 = _mm512_shuffle_f32x4(tmp852, tmp852, 136);
__m512 tmp868 = _mm512_shuffle_f32x4(tmp852, tmp852, 221);
tmp733 = _mm512_shuffle_f32x4(tmp853, tmp861, 136);
tmp741 = _mm512_shuffle_f32x4(tmp853, tmp861, 221);
tmp734 = _mm512_shuffle_f32x4(tmp855, tmp863, 136);
tmp742 = _mm512_shuffle_f32x4(tmp855, tmp863, 221);
tmp735 = _mm512_shuffle_f32x4(tmp857, tmp865, 136);
tmp743 = _mm512_shuffle_f32x4(tmp857, tmp865, 221);
tmp736 = _mm512_shuffle_f32x4(tmp859, tmp867, 136);
tmp744 = _mm512_shuffle_f32x4(tmp859, tmp867, 221);
tmp737 = _mm512_shuffle_f32x4(tmp854, tmp862, 136);
__m512 tmp785 = _mm512_shuffle_f32x4(tmp854, tmp862, 221);
tmp738 = _mm512_shuffle_f32x4(tmp856, tmp864, 136);
__m512 tmp786 = _mm512_shuffle_f32x4(tmp856, tmp864, 221);
tmp739 = _mm512_shuffle_f32x4(tmp858, tmp866, 136);
__m512 tmp787 = _mm512_shuffle_f32x4(tmp858, tmp866, 221);
tmp740 = _mm512_shuffle_f32x4(tmp860, tmp868, 136);
__m512 tmp788 = _mm512_shuffle_f32x4(tmp860, tmp868, 221);
__m512 tmp793 = _mm512_add_ps(tmp734, tmp735);
__m512 tmp813 = _mm512_add_ps(tmp742, tmp743);
__m512 tmp792 = _mm512_add_ps(tmp736, tmp737);
__m512 tmp812 = _mm512_add_ps(tmp744, tmp785);
__m512 tmp798 = _mm512_sub_ps(tmp736, tmp737);
__m512 tmp818 = _mm512_sub_ps(tmp744, tmp785);
__m512 tmp797 = _mm512_sub_ps(tmp734, tmp735);
__m512 tmp817 = _mm512_sub_ps(tmp742, tmp743);
__m512 tmp794 = _mm512_add_ps(tmp738, tmp739);
__m512 tmp814 = _mm512_add_ps(tmp786, tmp787);
__m512 tmp799 = _mm512_sub_ps(tmp738, tmp739);
__m512 tmp819 = _mm512_sub_ps(tmp786, tmp787);
__m512 tmp796 = _mm512_fmadd_ps(tmp798, _mm512_set1_ps(2e+00f), tmp797);
__m512 tmp816 = _mm512_fmadd_ps(tmp818, _mm512_set1_ps(2e+00f), tmp817);
__m512 tmp803 = _mm512_fmadd_ps(tmp798, _mm512_set1_ps(8e+00f), tmp797);
__m512 tmp823 = _mm512_fmadd_ps(tmp818, _mm512_set1_ps(8e+00f), tmp817);
__m512 tmp791 = _mm512_add_ps(tmp792, tmp793);
__m512 tmp811 = _mm512_add_ps(tmp812, tmp813);
__m512 tmp795 = _mm512_fmadd_ps(tmp799, _mm512_set1_ps(1.6e+01f), tmp796);
__m512 tmp815 = _mm512_fmadd_ps(tmp819, _mm512_set1_ps(1.6e+01f), tmp816);
__m512 tmp802 = _mm512_fmadd_ps(tmp799, _mm512_set1_ps(4e+00f), tmp803);
__m512 tmp822 = _mm512_fmadd_ps(tmp819, _mm512_set1_ps(4e+00f), tmp823);
__m512 tmp808 = _mm512_add_ps(tmp799, tmp797);
__m512 tmp828 = _mm512_add_ps(tmp819, tmp817);
__m512 tmp801 = _mm512_fmadd_ps(tmp792, _mm512_set1_ps(4e+00f), tmp793);
__m512 tmp821 = _mm512_fmadd_ps(tmp812, _mm512_set1_ps(4e+00f), tmp813);
__m512 tmp805 = _mm512_fmadd_ps(tmp792, _mm512_set1_ps(1.6e+01f), tmp793);
__m512 tmp825 = _mm512_fmadd_ps(tmp812, _mm512_set1_ps(1.6e+01f), tmp813);
__m512 tmp790 = _mm512_add_ps(tmp791, tmp733);
__m512 tmp810 = _mm512_add_ps(tmp811, tmp741);
__m512 tmp807 = _mm512_add_ps(tmp808, tmp740);
__m512 tmp827 = _mm512_add_ps(tmp828, tmp788);
__m512 tmp789 = _mm512_fmadd_ps(tmp794, _mm512_set1_ps(3.2e+01f), tmp790);
__m512 tmp809 = _mm512_fmadd_ps(tmp814, _mm512_set1_ps(3.2e+01f), tmp810);
__m512 tmp800 = _mm512_fmadd_ps(tmp794, _mm512_set1_ps(8e+00f), tmp801);
__m512 tmp820 = _mm512_fmadd_ps(tmp814, _mm512_set1_ps(8e+00f), tmp821);
__m512 tmp806 = _mm512_fmadd_ps(tmp798, _mm512_set1_ps(3.2e+01f), tmp807);
__m512 tmp826 = _mm512_fmadd_ps(tmp818, _mm512_set1_ps(3.2e+01f), tmp827);
__m512 tmp804 = _mm512_fmadd_ps(tmp794, _mm512_set1_ps(2e+00f), tmp805);
__m512 tmp824 = _mm512_fmadd_ps(tmp814, _mm512_set1_ps(2e+00f), tmp825);
__m512 out193 = tmp789;
__m512 out199 = tmp809;
__m512 out194 = tmp795;
__m512 out200 = tmp815;
__m512 out195 = tmp800;
__m512 out201 = tmp820;
__m512 out196 = tmp802;
__m512 out202 = tmp822;
__m512 out197 = tmp804;
__m512 out203 = tmp824;
__m512 out198 = tmp806;
__m512 out204 = tmp826;
_mm512_mask_storeu_ps(datPtr3+0+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out193);
_mm512_mask_storeu_ps(datPtr3+48+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out199);
_mm512_mask_storeu_ps(datPtr3+100+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out194);
_mm512_mask_storeu_ps(datPtr3+148+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out200);
_mm512_mask_storeu_ps(datPtr3+200+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out195);
_mm512_mask_storeu_ps(datPtr3+248+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out201);
_mm512_mask_storeu_ps(datPtr3+300+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out196);
_mm512_mask_storeu_ps(datPtr3+348+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out202);
_mm512_mask_storeu_ps(datPtr3+400+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out197);
_mm512_mask_storeu_ps(datPtr3+448+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out203);
_mm512_mask_storeu_ps(datPtr3+500+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out198);
_mm512_mask_storeu_ps(datPtr3+548+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out204);
// Stage sf17..sf32 -> out205..out216.
// Reads sixteen 512-bit vectors from sfPtr3, reduces each group of eight
// inputs to six values (first pass), transposes, applies the same reduction
// again (second pass), and writes the twelve results to datPtr3 with masked
// stores.
//
// Loads: the offsets 256/384 and 320/448 are repeated at +495360, +990720 and
// +1486080. Each pair of loads is recombined with _mm512_shuffle_f32x4:
// immediate 68 takes the low 256 bits of both operands, 238 the high 256 bits.
// Operand order here is (sf18, sf17), (sf20, sf19), ...: the vector loaded
// from the larger offset supplies lanes 0-7 and the other one lanes 8-15.
// in129..in136 form the first group, in137..in144 the second group.
__m512 sf17 = _mm512_loadu_ps(sfPtr3+256+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf18 = _mm512_loadu_ps(sfPtr3+384+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in129 = _mm512_shuffle_f32x4(sf18, sf17, 68);
__m512 in130 = _mm512_shuffle_f32x4(sf18, sf17, 238);
__m512 sf19 = _mm512_loadu_ps(sfPtr3+320+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf20 = _mm512_loadu_ps(sfPtr3+448+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in137 = _mm512_shuffle_f32x4(sf20, sf19, 68);
__m512 in138 = _mm512_shuffle_f32x4(sf20, sf19, 238);
__m512 sf21 = _mm512_loadu_ps(sfPtr3+495616+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf22 = _mm512_loadu_ps(sfPtr3+495744+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in131 = _mm512_shuffle_f32x4(sf22, sf21, 68);
__m512 in132 = _mm512_shuffle_f32x4(sf22, sf21, 238);
__m512 sf23 = _mm512_loadu_ps(sfPtr3+495680+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf24 = _mm512_loadu_ps(sfPtr3+495808+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in139 = _mm512_shuffle_f32x4(sf24, sf23, 68);
__m512 in140 = _mm512_shuffle_f32x4(sf24, sf23, 238);
__m512 sf25 = _mm512_loadu_ps(sfPtr3+990976+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf26 = _mm512_loadu_ps(sfPtr3+991104+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in133 = _mm512_shuffle_f32x4(sf26, sf25, 68);
__m512 in134 = _mm512_shuffle_f32x4(sf26, sf25, 238);
__m512 sf27 = _mm512_loadu_ps(sfPtr3+991040+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf28 = _mm512_loadu_ps(sfPtr3+991168+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in141 = _mm512_shuffle_f32x4(sf28, sf27, 68);
__m512 in142 = _mm512_shuffle_f32x4(sf28, sf27, 238);
__m512 sf29 = _mm512_loadu_ps(sfPtr3+1486336+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf30 = _mm512_loadu_ps(sfPtr3+1486464+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in135 = _mm512_shuffle_f32x4(sf30, sf29, 68);
__m512 in136 = _mm512_shuffle_f32x4(sf30, sf29, 238);
__m512 sf31 = _mm512_loadu_ps(sfPtr3+1486400+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf32 = _mm512_loadu_ps(sfPtr3+1486528+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in143 = _mm512_shuffle_f32x4(sf32, sf31, 68);
__m512 in144 = _mm512_shuffle_f32x4(sf32, sf31, 238);
// First pass, interleaved for the two groups (in129..in136 and in137..in144).
// With x0..x7 the eight inputs of a group, the six results are, up to
// floating-point evaluation order:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): the coefficients look like a Winograd output transform
// (8 points -> 6 points) -- confirm against the generator.
__m512 tmp885 = _mm512_add_ps(in130, in131);
__m512 tmp905 = _mm512_add_ps(in138, in139);
__m512 tmp884 = _mm512_add_ps(in132, in133);
__m512 tmp904 = _mm512_add_ps(in140, in141);
__m512 tmp890 = _mm512_sub_ps(in132, in133);
__m512 tmp910 = _mm512_sub_ps(in140, in141);
__m512 tmp889 = _mm512_sub_ps(in130, in131);
__m512 tmp909 = _mm512_sub_ps(in138, in139);
__m512 tmp886 = _mm512_add_ps(in134, in135);
__m512 tmp906 = _mm512_add_ps(in142, in143);
__m512 tmp891 = _mm512_sub_ps(in134, in135);
__m512 tmp911 = _mm512_sub_ps(in142, in143);
__m512 tmp888 = _mm512_fmadd_ps(tmp890, _mm512_set1_ps(2e+00f), tmp889);
__m512 tmp908 = _mm512_fmadd_ps(tmp910, _mm512_set1_ps(2e+00f), tmp909);
__m512 tmp895 = _mm512_fmadd_ps(tmp890, _mm512_set1_ps(8e+00f), tmp889);
__m512 tmp915 = _mm512_fmadd_ps(tmp910, _mm512_set1_ps(8e+00f), tmp909);
__m512 tmp883 = _mm512_add_ps(tmp884, tmp885);
__m512 tmp903 = _mm512_add_ps(tmp904, tmp905);
__m512 tmp887 = _mm512_fmadd_ps(tmp891, _mm512_set1_ps(1.6e+01f), tmp888);
__m512 tmp907 = _mm512_fmadd_ps(tmp911, _mm512_set1_ps(1.6e+01f), tmp908);
__m512 tmp894 = _mm512_fmadd_ps(tmp891, _mm512_set1_ps(4e+00f), tmp895);
__m512 tmp914 = _mm512_fmadd_ps(tmp911, _mm512_set1_ps(4e+00f), tmp915);
__m512 tmp900 = _mm512_add_ps(tmp891, tmp889);
__m512 tmp920 = _mm512_add_ps(tmp911, tmp909);
__m512 tmp893 = _mm512_fmadd_ps(tmp884, _mm512_set1_ps(4e+00f), tmp885);
__m512 tmp913 = _mm512_fmadd_ps(tmp904, _mm512_set1_ps(4e+00f), tmp905);
__m512 tmp897 = _mm512_fmadd_ps(tmp884, _mm512_set1_ps(1.6e+01f), tmp885);
__m512 tmp917 = _mm512_fmadd_ps(tmp904, _mm512_set1_ps(1.6e+01f), tmp905);
__m512 tmp882 = _mm512_add_ps(tmp883, in129);
__m512 tmp902 = _mm512_add_ps(tmp903, in137);
__m512 tmp899 = _mm512_add_ps(tmp900, in136);
__m512 tmp919 = _mm512_add_ps(tmp920, in144);
__m512 tmp881 = _mm512_fmadd_ps(tmp886, _mm512_set1_ps(3.2e+01f), tmp882);
__m512 tmp901 = _mm512_fmadd_ps(tmp906, _mm512_set1_ps(3.2e+01f), tmp902);
__m512 tmp892 = _mm512_fmadd_ps(tmp886, _mm512_set1_ps(8e+00f), tmp893);
__m512 tmp912 = _mm512_fmadd_ps(tmp906, _mm512_set1_ps(8e+00f), tmp913);
__m512 tmp898 = _mm512_fmadd_ps(tmp890, _mm512_set1_ps(3.2e+01f), tmp899);
__m512 tmp918 = _mm512_fmadd_ps(tmp910, _mm512_set1_ps(3.2e+01f), tmp919);
__m512 tmp896 = _mm512_fmadd_ps(tmp886, _mm512_set1_ps(2e+00f), tmp897);
__m512 tmp916 = _mm512_fmadd_ps(tmp906, _mm512_set1_ps(2e+00f), tmp917);
// Collect the first-pass results as twelve rows for the transpose:
// tmp869..tmp874 = y0..y5 of the first group, tmp875..tmp880 = y0..y5 of the
// second group.
__m512 tmp869 = tmp881;
__m512 tmp875 = tmp901;
__m512 tmp870 = tmp887;
__m512 tmp876 = tmp907;
__m512 tmp871 = tmp892;
__m512 tmp877 = tmp912;
__m512 tmp872 = tmp894;
__m512 tmp878 = tmp914;
__m512 tmp873 = tmp896;
__m512 tmp879 = tmp916;
__m512 tmp874 = tmp898;
__m512 tmp880 = tmp918;
// Transpose step 1: unpacklo/unpackhi followed by shuffle_ps (68 = low pairs,
// 238 = high pairs) transposes 4x4 blocks inside every 128-bit lane.
__m512 tmp965 = _mm512_unpacklo_ps(tmp869, tmp870);
__m512 tmp966 = _mm512_unpackhi_ps(tmp869, tmp870);
__m512 tmp967 = _mm512_unpacklo_ps(tmp871, tmp872);
__m512 tmp968 = _mm512_unpackhi_ps(tmp871, tmp872);
__m512 tmp969 = _mm512_unpacklo_ps(tmp873, tmp874);
__m512 tmp970 = _mm512_unpackhi_ps(tmp873, tmp874);
__m512 tmp971 = _mm512_unpacklo_ps(tmp875, tmp876);
__m512 tmp972 = _mm512_unpackhi_ps(tmp875, tmp876);
__m512 tmp973 = _mm512_unpacklo_ps(tmp877, tmp878);
__m512 tmp974 = _mm512_unpackhi_ps(tmp877, tmp878);
__m512 tmp975 = _mm512_unpacklo_ps(tmp879, tmp880);
__m512 tmp976 = _mm512_unpackhi_ps(tmp879, tmp880);
__m512 tmp977 = _mm512_shuffle_ps(tmp965, tmp967, 68);
__m512 tmp978 = _mm512_shuffle_ps(tmp965, tmp967, 238);
__m512 tmp979 = _mm512_shuffle_ps(tmp966, tmp968, 68);
__m512 tmp980 = _mm512_shuffle_ps(tmp966, tmp968, 238);
__m512 tmp981 = _mm512_shuffle_ps(tmp969, tmp971, 68);
__m512 tmp982 = _mm512_shuffle_ps(tmp969, tmp971, 238);
__m512 tmp983 = _mm512_shuffle_ps(tmp970, tmp972, 68);
__m512 tmp984 = _mm512_shuffle_ps(tmp970, tmp972, 238);
__m512 tmp985 = _mm512_shuffle_ps(tmp973, tmp975, 68);
__m512 tmp986 = _mm512_shuffle_ps(tmp973, tmp975, 238);
__m512 tmp987 = _mm512_shuffle_ps(tmp974, tmp976, 68);
__m512 tmp988 = _mm512_shuffle_ps(tmp974, tmp976, 238);
// Transpose step 2: regroup whole 128-bit lanes. Immediate 136 picks lanes
// 0 and 2 of each operand, 221 picks lanes 1 and 3. The last four rows are
// shuffled with themselves because there are only twelve rows, not sixteen.
__m512 tmp989 = _mm512_shuffle_f32x4(tmp977, tmp981, 136);
__m512 tmp990 = _mm512_shuffle_f32x4(tmp977, tmp981, 221);
__m512 tmp991 = _mm512_shuffle_f32x4(tmp978, tmp982, 136);
__m512 tmp992 = _mm512_shuffle_f32x4(tmp978, tmp982, 221);
__m512 tmp993 = _mm512_shuffle_f32x4(tmp979, tmp983, 136);
__m512 tmp994 = _mm512_shuffle_f32x4(tmp979, tmp983, 221);
__m512 tmp995 = _mm512_shuffle_f32x4(tmp980, tmp984, 136);
__m512 tmp996 = _mm512_shuffle_f32x4(tmp980, tmp984, 221);
__m512 tmp997 = _mm512_shuffle_f32x4(tmp985, tmp985, 136);
__m512 tmp998 = _mm512_shuffle_f32x4(tmp985, tmp985, 221);
__m512 tmp999 = _mm512_shuffle_f32x4(tmp986, tmp986, 136);
__m512 tmp1000 = _mm512_shuffle_f32x4(tmp986, tmp986, 221);
__m512 tmp1001 = _mm512_shuffle_f32x4(tmp987, tmp987, 136);
__m512 tmp1002 = _mm512_shuffle_f32x4(tmp987, tmp987, 221);
__m512 tmp1003 = _mm512_shuffle_f32x4(tmp988, tmp988, 136);
__m512 tmp1004 = _mm512_shuffle_f32x4(tmp988, tmp988, 221);
// Transposed result: tmp869..tmp876 hold what was in lanes 0-7 of the rows,
// tmp877..tmp880 and tmp921..tmp924 hold what was in lanes 8-15. In each of
// them lanes 0-5 come from the first group and lanes 6-11 from the second;
// lanes 12-15 repeat lanes 8-11 and are never stored below.
tmp869 = _mm512_shuffle_f32x4(tmp989, tmp997, 136);
tmp877 = _mm512_shuffle_f32x4(tmp989, tmp997, 221);
tmp870 = _mm512_shuffle_f32x4(tmp991, tmp999, 136);
tmp878 = _mm512_shuffle_f32x4(tmp991, tmp999, 221);
tmp871 = _mm512_shuffle_f32x4(tmp993, tmp1001, 136);
tmp879 = _mm512_shuffle_f32x4(tmp993, tmp1001, 221);
tmp872 = _mm512_shuffle_f32x4(tmp995, tmp1003, 136);
tmp880 = _mm512_shuffle_f32x4(tmp995, tmp1003, 221);
tmp873 = _mm512_shuffle_f32x4(tmp990, tmp998, 136);
__m512 tmp921 = _mm512_shuffle_f32x4(tmp990, tmp998, 221);
tmp874 = _mm512_shuffle_f32x4(tmp992, tmp1000, 136);
__m512 tmp922 = _mm512_shuffle_f32x4(tmp992, tmp1000, 221);
tmp875 = _mm512_shuffle_f32x4(tmp994, tmp1002, 136);
__m512 tmp923 = _mm512_shuffle_f32x4(tmp994, tmp1002, 221);
tmp876 = _mm512_shuffle_f32x4(tmp996, tmp1004, 136);
__m512 tmp924 = _mm512_shuffle_f32x4(tmp996, tmp1004, 221);
// Second pass: the same 8 -> 6 combination as the first pass, applied to
// tmp869..tmp876 (giving out211..out216) and, interleaved, to
// tmp877..tmp880 + tmp921..tmp924 (giving out205..out210).
__m512 tmp929 = _mm512_add_ps(tmp870, tmp871);
__m512 tmp949 = _mm512_add_ps(tmp878, tmp879);
__m512 tmp928 = _mm512_add_ps(tmp872, tmp873);
__m512 tmp948 = _mm512_add_ps(tmp880, tmp921);
__m512 tmp934 = _mm512_sub_ps(tmp872, tmp873);
__m512 tmp954 = _mm512_sub_ps(tmp880, tmp921);
__m512 tmp933 = _mm512_sub_ps(tmp870, tmp871);
__m512 tmp953 = _mm512_sub_ps(tmp878, tmp879);
__m512 tmp930 = _mm512_add_ps(tmp874, tmp875);
__m512 tmp950 = _mm512_add_ps(tmp922, tmp923);
__m512 tmp935 = _mm512_sub_ps(tmp874, tmp875);
__m512 tmp955 = _mm512_sub_ps(tmp922, tmp923);
__m512 tmp932 = _mm512_fmadd_ps(tmp934, _mm512_set1_ps(2e+00f), tmp933);
__m512 tmp952 = _mm512_fmadd_ps(tmp954, _mm512_set1_ps(2e+00f), tmp953);
__m512 tmp939 = _mm512_fmadd_ps(tmp934, _mm512_set1_ps(8e+00f), tmp933);
__m512 tmp959 = _mm512_fmadd_ps(tmp954, _mm512_set1_ps(8e+00f), tmp953);
__m512 tmp927 = _mm512_add_ps(tmp928, tmp929);
__m512 tmp947 = _mm512_add_ps(tmp948, tmp949);
__m512 tmp931 = _mm512_fmadd_ps(tmp935, _mm512_set1_ps(1.6e+01f), tmp932);
__m512 tmp951 = _mm512_fmadd_ps(tmp955, _mm512_set1_ps(1.6e+01f), tmp952);
__m512 tmp938 = _mm512_fmadd_ps(tmp935, _mm512_set1_ps(4e+00f), tmp939);
__m512 tmp958 = _mm512_fmadd_ps(tmp955, _mm512_set1_ps(4e+00f), tmp959);
__m512 tmp944 = _mm512_add_ps(tmp935, tmp933);
__m512 tmp964 = _mm512_add_ps(tmp955, tmp953);
__m512 tmp937 = _mm512_fmadd_ps(tmp928, _mm512_set1_ps(4e+00f), tmp929);
__m512 tmp957 = _mm512_fmadd_ps(tmp948, _mm512_set1_ps(4e+00f), tmp949);
__m512 tmp941 = _mm512_fmadd_ps(tmp928, _mm512_set1_ps(1.6e+01f), tmp929);
__m512 tmp961 = _mm512_fmadd_ps(tmp948, _mm512_set1_ps(1.6e+01f), tmp949);
__m512 tmp926 = _mm512_add_ps(tmp927, tmp869);
__m512 tmp946 = _mm512_add_ps(tmp947, tmp877);
__m512 tmp943 = _mm512_add_ps(tmp944, tmp876);
__m512 tmp963 = _mm512_add_ps(tmp964, tmp924);
__m512 tmp925 = _mm512_fmadd_ps(tmp930, _mm512_set1_ps(3.2e+01f), tmp926);
__m512 tmp945 = _mm512_fmadd_ps(tmp950, _mm512_set1_ps(3.2e+01f), tmp946);
__m512 tmp936 = _mm512_fmadd_ps(tmp930, _mm512_set1_ps(8e+00f), tmp937);
__m512 tmp956 = _mm512_fmadd_ps(tmp950, _mm512_set1_ps(8e+00f), tmp957);
__m512 tmp942 = _mm512_fmadd_ps(tmp934, _mm512_set1_ps(3.2e+01f), tmp943);
__m512 tmp962 = _mm512_fmadd_ps(tmp954, _mm512_set1_ps(3.2e+01f), tmp963);
__m512 tmp940 = _mm512_fmadd_ps(tmp930, _mm512_set1_ps(2e+00f), tmp941);
__m512 tmp960 = _mm512_fmadd_ps(tmp950, _mm512_set1_ps(2e+00f), tmp961);
__m512 out211 = tmp925;
__m512 out205 = tmp945;
__m512 out212 = tmp931;
__m512 out206 = tmp951;
__m512 out213 = tmp936;
__m512 out207 = tmp956;
__m512 out214 = tmp938;
__m512 out208 = tmp958;
__m512 out215 = tmp940;
__m512 out209 = tmp960;
__m512 out216 = tmp942;
__m512 out210 = tmp962;
// Stores. Mask 4095 writes lanes 0-11, mask 1 writes lane 0 only, mask 4032
// writes lanes 6-11 only. out211..out216 are written whole at +800, +900,
// ... +1300. out205 and out206 are each written in two pieces (lane 0 at
// +96/+196, lanes 6-11 via +576/+676); out207..out210 contribute lane 0 only.
// NOTE(review): offsets are in units of datPtr3's pointee type, which is
// declared outside this excerpt -- confirm before changing any constant.
_mm512_mask_storeu_ps(datPtr3+800+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out211);
_mm512_mask_storeu_ps(datPtr3+96+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 1, out205);
_mm512_mask_storeu_ps(datPtr3+576+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4032, out205);
_mm512_mask_storeu_ps(datPtr3+900+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out212);
_mm512_mask_storeu_ps(datPtr3+196+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 1, out206);
_mm512_mask_storeu_ps(datPtr3+676+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4032, out206);
_mm512_mask_storeu_ps(datPtr3+1000+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out213);
_mm512_mask_storeu_ps(datPtr3+296+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 1, out207);
_mm512_mask_storeu_ps(datPtr3+1100+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out214);
_mm512_mask_storeu_ps(datPtr3+396+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 1, out208);
_mm512_mask_storeu_ps(datPtr3+1200+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out215);
_mm512_mask_storeu_ps(datPtr3+496+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 1, out209);
_mm512_mask_storeu_ps(datPtr3+1300+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out216);
_mm512_mask_storeu_ps(datPtr3+596+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 1, out210);
// Stage sf33..sf48 -> out217..out228.
// Reads sixteen 512-bit vectors from sfPtr3, reduces each group of eight
// inputs to six values (first pass), transposes, applies the same reduction
// again (second pass), and writes the twelve results to datPtr3 with masked
// stores.
//
// Loads: the offsets 512/640 and 576/704 are repeated at +495360, +990720 and
// +1486080. Each pair of loads is recombined with _mm512_shuffle_f32x4:
// immediate 68 takes the low 256 bits of both operands, 238 the high 256 bits.
// Operand order here is (sf33, sf34), (sf35, sf36), ...: the vector loaded
// from the smaller offset supplies lanes 0-7 and the other one lanes 8-15.
// in145..in152 form the first group, in153..in160 the second group.
__m512 sf33 = _mm512_loadu_ps(sfPtr3+512+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf34 = _mm512_loadu_ps(sfPtr3+640+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in145 = _mm512_shuffle_f32x4(sf33, sf34, 68);
__m512 in146 = _mm512_shuffle_f32x4(sf33, sf34, 238);
__m512 sf35 = _mm512_loadu_ps(sfPtr3+576+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf36 = _mm512_loadu_ps(sfPtr3+704+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in153 = _mm512_shuffle_f32x4(sf35, sf36, 68);
__m512 in154 = _mm512_shuffle_f32x4(sf35, sf36, 238);
__m512 sf37 = _mm512_loadu_ps(sfPtr3+495872+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf38 = _mm512_loadu_ps(sfPtr3+496000+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in147 = _mm512_shuffle_f32x4(sf37, sf38, 68);
__m512 in148 = _mm512_shuffle_f32x4(sf37, sf38, 238);
__m512 sf39 = _mm512_loadu_ps(sfPtr3+495936+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf40 = _mm512_loadu_ps(sfPtr3+496064+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in155 = _mm512_shuffle_f32x4(sf39, sf40, 68);
__m512 in156 = _mm512_shuffle_f32x4(sf39, sf40, 238);
__m512 sf41 = _mm512_loadu_ps(sfPtr3+991232+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf42 = _mm512_loadu_ps(sfPtr3+991360+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in149 = _mm512_shuffle_f32x4(sf41, sf42, 68);
__m512 in150 = _mm512_shuffle_f32x4(sf41, sf42, 238);
__m512 sf43 = _mm512_loadu_ps(sfPtr3+991296+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf44 = _mm512_loadu_ps(sfPtr3+991424+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in157 = _mm512_shuffle_f32x4(sf43, sf44, 68);
__m512 in158 = _mm512_shuffle_f32x4(sf43, sf44, 238);
__m512 sf45 = _mm512_loadu_ps(sfPtr3+1486592+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf46 = _mm512_loadu_ps(sfPtr3+1486720+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in151 = _mm512_shuffle_f32x4(sf45, sf46, 68);
__m512 in152 = _mm512_shuffle_f32x4(sf45, sf46, 238);
__m512 sf47 = _mm512_loadu_ps(sfPtr3+1486656+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 sf48 = _mm512_loadu_ps(sfPtr3+1486784+1981440*i11+297216*j7+1536*k11+768*l5);
__m512 in159 = _mm512_shuffle_f32x4(sf47, sf48, 68);
__m512 in160 = _mm512_shuffle_f32x4(sf47, sf48, 238);
// First pass, interleaved for the two groups (in145..in152 and in153..in160).
// With x0..x7 the eight inputs of a group, the six results are, up to
// floating-point evaluation order:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): the coefficients look like a Winograd output transform
// (8 points -> 6 points) -- confirm against the generator.
__m512 tmp1021 = _mm512_add_ps(in146, in147);
__m512 tmp1041 = _mm512_add_ps(in154, in155);
__m512 tmp1020 = _mm512_add_ps(in148, in149);
__m512 tmp1040 = _mm512_add_ps(in156, in157);
__m512 tmp1026 = _mm512_sub_ps(in148, in149);
__m512 tmp1046 = _mm512_sub_ps(in156, in157);
__m512 tmp1025 = _mm512_sub_ps(in146, in147);
__m512 tmp1045 = _mm512_sub_ps(in154, in155);
__m512 tmp1022 = _mm512_add_ps(in150, in151);
__m512 tmp1042 = _mm512_add_ps(in158, in159);
__m512 tmp1027 = _mm512_sub_ps(in150, in151);
__m512 tmp1047 = _mm512_sub_ps(in158, in159);
__m512 tmp1024 = _mm512_fmadd_ps(tmp1026, _mm512_set1_ps(2e+00f), tmp1025);
__m512 tmp1044 = _mm512_fmadd_ps(tmp1046, _mm512_set1_ps(2e+00f), tmp1045);
__m512 tmp1031 = _mm512_fmadd_ps(tmp1026, _mm512_set1_ps(8e+00f), tmp1025);
__m512 tmp1051 = _mm512_fmadd_ps(tmp1046, _mm512_set1_ps(8e+00f), tmp1045);
__m512 tmp1019 = _mm512_add_ps(tmp1020, tmp1021);
__m512 tmp1039 = _mm512_add_ps(tmp1040, tmp1041);
__m512 tmp1023 = _mm512_fmadd_ps(tmp1027, _mm512_set1_ps(1.6e+01f), tmp1024);
__m512 tmp1043 = _mm512_fmadd_ps(tmp1047, _mm512_set1_ps(1.6e+01f), tmp1044);
__m512 tmp1030 = _mm512_fmadd_ps(tmp1027, _mm512_set1_ps(4e+00f), tmp1031);
__m512 tmp1050 = _mm512_fmadd_ps(tmp1047, _mm512_set1_ps(4e+00f), tmp1051);
__m512 tmp1036 = _mm512_add_ps(tmp1027, tmp1025);
__m512 tmp1056 = _mm512_add_ps(tmp1047, tmp1045);
__m512 tmp1029 = _mm512_fmadd_ps(tmp1020, _mm512_set1_ps(4e+00f), tmp1021);
__m512 tmp1049 = _mm512_fmadd_ps(tmp1040, _mm512_set1_ps(4e+00f), tmp1041);
__m512 tmp1033 = _mm512_fmadd_ps(tmp1020, _mm512_set1_ps(1.6e+01f), tmp1021);
__m512 tmp1053 = _mm512_fmadd_ps(tmp1040, _mm512_set1_ps(1.6e+01f), tmp1041);
__m512 tmp1018 = _mm512_add_ps(tmp1019, in145);
__m512 tmp1038 = _mm512_add_ps(tmp1039, in153);
__m512 tmp1035 = _mm512_add_ps(tmp1036, in152);
__m512 tmp1055 = _mm512_add_ps(tmp1056, in160);
__m512 tmp1017 = _mm512_fmadd_ps(tmp1022, _mm512_set1_ps(3.2e+01f), tmp1018);
__m512 tmp1037 = _mm512_fmadd_ps(tmp1042, _mm512_set1_ps(3.2e+01f), tmp1038);
__m512 tmp1028 = _mm512_fmadd_ps(tmp1022, _mm512_set1_ps(8e+00f), tmp1029);
__m512 tmp1048 = _mm512_fmadd_ps(tmp1042, _mm512_set1_ps(8e+00f), tmp1049);
__m512 tmp1034 = _mm512_fmadd_ps(tmp1026, _mm512_set1_ps(3.2e+01f), tmp1035);
__m512 tmp1054 = _mm512_fmadd_ps(tmp1046, _mm512_set1_ps(3.2e+01f), tmp1055);
__m512 tmp1032 = _mm512_fmadd_ps(tmp1022, _mm512_set1_ps(2e+00f), tmp1033);
__m512 tmp1052 = _mm512_fmadd_ps(tmp1042, _mm512_set1_ps(2e+00f), tmp1053);
// Collect the first-pass results as twelve rows for the transpose:
// tmp1005..tmp1010 = y0..y5 of the first group, tmp1011..tmp1016 = y0..y5 of
// the second group.
__m512 tmp1005 = tmp1017;
__m512 tmp1011 = tmp1037;
__m512 tmp1006 = tmp1023;
__m512 tmp1012 = tmp1043;
__m512 tmp1007 = tmp1028;
__m512 tmp1013 = tmp1048;
__m512 tmp1008 = tmp1030;
__m512 tmp1014 = tmp1050;
__m512 tmp1009 = tmp1032;
__m512 tmp1015 = tmp1052;
__m512 tmp1010 = tmp1034;
__m512 tmp1016 = tmp1054;
// Transpose step 1: unpacklo/unpackhi followed by shuffle_ps (68 = low pairs,
// 238 = high pairs) transposes 4x4 blocks inside every 128-bit lane.
__m512 tmp1101 = _mm512_unpacklo_ps(tmp1005, tmp1006);
__m512 tmp1102 = _mm512_unpackhi_ps(tmp1005, tmp1006);
__m512 tmp1103 = _mm512_unpacklo_ps(tmp1007, tmp1008);
__m512 tmp1104 = _mm512_unpackhi_ps(tmp1007, tmp1008);
__m512 tmp1105 = _mm512_unpacklo_ps(tmp1009, tmp1010);
__m512 tmp1106 = _mm512_unpackhi_ps(tmp1009, tmp1010);
__m512 tmp1107 = _mm512_unpacklo_ps(tmp1011, tmp1012);
__m512 tmp1108 = _mm512_unpackhi_ps(tmp1011, tmp1012);
__m512 tmp1109 = _mm512_unpacklo_ps(tmp1013, tmp1014);
__m512 tmp1110 = _mm512_unpackhi_ps(tmp1013, tmp1014);
__m512 tmp1111 = _mm512_unpacklo_ps(tmp1015, tmp1016);
__m512 tmp1112 = _mm512_unpackhi_ps(tmp1015, tmp1016);
__m512 tmp1113 = _mm512_shuffle_ps(tmp1101, tmp1103, 68);
__m512 tmp1114 = _mm512_shuffle_ps(tmp1101, tmp1103, 238);
__m512 tmp1115 = _mm512_shuffle_ps(tmp1102, tmp1104, 68);
__m512 tmp1116 = _mm512_shuffle_ps(tmp1102, tmp1104, 238);
__m512 tmp1117 = _mm512_shuffle_ps(tmp1105, tmp1107, 68);
__m512 tmp1118 = _mm512_shuffle_ps(tmp1105, tmp1107, 238);
__m512 tmp1119 = _mm512_shuffle_ps(tmp1106, tmp1108, 68);
__m512 tmp1120 = _mm512_shuffle_ps(tmp1106, tmp1108, 238);
__m512 tmp1121 = _mm512_shuffle_ps(tmp1109, tmp1111, 68);
__m512 tmp1122 = _mm512_shuffle_ps(tmp1109, tmp1111, 238);
__m512 tmp1123 = _mm512_shuffle_ps(tmp1110, tmp1112, 68);
__m512 tmp1124 = _mm512_shuffle_ps(tmp1110, tmp1112, 238);
// Transpose step 2: regroup whole 128-bit lanes. Immediate 136 picks lanes
// 0 and 2 of each operand, 221 picks lanes 1 and 3. The last four rows are
// shuffled with themselves because there are only twelve rows, not sixteen.
__m512 tmp1125 = _mm512_shuffle_f32x4(tmp1113, tmp1117, 136);
__m512 tmp1126 = _mm512_shuffle_f32x4(tmp1113, tmp1117, 221);
__m512 tmp1127 = _mm512_shuffle_f32x4(tmp1114, tmp1118, 136);
__m512 tmp1128 = _mm512_shuffle_f32x4(tmp1114, tmp1118, 221);
__m512 tmp1129 = _mm512_shuffle_f32x4(tmp1115, tmp1119, 136);
__m512 tmp1130 = _mm512_shuffle_f32x4(tmp1115, tmp1119, 221);
__m512 tmp1131 = _mm512_shuffle_f32x4(tmp1116, tmp1120, 136);
__m512 tmp1132 = _mm512_shuffle_f32x4(tmp1116, tmp1120, 221);
__m512 tmp1133 = _mm512_shuffle_f32x4(tmp1121, tmp1121, 136);
__m512 tmp1134 = _mm512_shuffle_f32x4(tmp1121, tmp1121, 221);
__m512 tmp1135 = _mm512_shuffle_f32x4(tmp1122, tmp1122, 136);
__m512 tmp1136 = _mm512_shuffle_f32x4(tmp1122, tmp1122, 221);
__m512 tmp1137 = _mm512_shuffle_f32x4(tmp1123, tmp1123, 136);
__m512 tmp1138 = _mm512_shuffle_f32x4(tmp1123, tmp1123, 221);
__m512 tmp1139 = _mm512_shuffle_f32x4(tmp1124, tmp1124, 136);
__m512 tmp1140 = _mm512_shuffle_f32x4(tmp1124, tmp1124, 221);
// Transposed result: tmp1005..tmp1012 hold what was in lanes 0-7 of the rows,
// tmp1013..tmp1016 and tmp1057..tmp1060 hold what was in lanes 8-15. In each
// of them lanes 0-5 come from the first group and lanes 6-11 from the second;
// lanes 12-15 repeat lanes 8-11 and are never stored below.
tmp1005 = _mm512_shuffle_f32x4(tmp1125, tmp1133, 136);
tmp1013 = _mm512_shuffle_f32x4(tmp1125, tmp1133, 221);
tmp1006 = _mm512_shuffle_f32x4(tmp1127, tmp1135, 136);
tmp1014 = _mm512_shuffle_f32x4(tmp1127, tmp1135, 221);
tmp1007 = _mm512_shuffle_f32x4(tmp1129, tmp1137, 136);
tmp1015 = _mm512_shuffle_f32x4(tmp1129, tmp1137, 221);
tmp1008 = _mm512_shuffle_f32x4(tmp1131, tmp1139, 136);
tmp1016 = _mm512_shuffle_f32x4(tmp1131, tmp1139, 221);
tmp1009 = _mm512_shuffle_f32x4(tmp1126, tmp1134, 136);
__m512 tmp1057 = _mm512_shuffle_f32x4(tmp1126, tmp1134, 221);
tmp1010 = _mm512_shuffle_f32x4(tmp1128, tmp1136, 136);
__m512 tmp1058 = _mm512_shuffle_f32x4(tmp1128, tmp1136, 221);
tmp1011 = _mm512_shuffle_f32x4(tmp1130, tmp1138, 136);
__m512 tmp1059 = _mm512_shuffle_f32x4(tmp1130, tmp1138, 221);
tmp1012 = _mm512_shuffle_f32x4(tmp1132, tmp1140, 136);
__m512 tmp1060 = _mm512_shuffle_f32x4(tmp1132, tmp1140, 221);
// Second pass: the same 8 -> 6 combination as the first pass, applied to
// tmp1005..tmp1012 (giving out217..out222) and, interleaved, to
// tmp1013..tmp1016 + tmp1057..tmp1060 (giving out223..out228).
__m512 tmp1065 = _mm512_add_ps(tmp1006, tmp1007);
__m512 tmp1085 = _mm512_add_ps(tmp1014, tmp1015);
__m512 tmp1064 = _mm512_add_ps(tmp1008, tmp1009);
__m512 tmp1084 = _mm512_add_ps(tmp1016, tmp1057);
__m512 tmp1070 = _mm512_sub_ps(tmp1008, tmp1009);
__m512 tmp1090 = _mm512_sub_ps(tmp1016, tmp1057);
__m512 tmp1069 = _mm512_sub_ps(tmp1006, tmp1007);
__m512 tmp1089 = _mm512_sub_ps(tmp1014, tmp1015);
__m512 tmp1066 = _mm512_add_ps(tmp1010, tmp1011);
__m512 tmp1086 = _mm512_add_ps(tmp1058, tmp1059);
__m512 tmp1071 = _mm512_sub_ps(tmp1010, tmp1011);
__m512 tmp1091 = _mm512_sub_ps(tmp1058, tmp1059);
__m512 tmp1068 = _mm512_fmadd_ps(tmp1070, _mm512_set1_ps(2e+00f), tmp1069);
__m512 tmp1088 = _mm512_fmadd_ps(tmp1090, _mm512_set1_ps(2e+00f), tmp1089);
__m512 tmp1075 = _mm512_fmadd_ps(tmp1070, _mm512_set1_ps(8e+00f), tmp1069);
__m512 tmp1095 = _mm512_fmadd_ps(tmp1090, _mm512_set1_ps(8e+00f), tmp1089);
__m512 tmp1063 = _mm512_add_ps(tmp1064, tmp1065);
__m512 tmp1083 = _mm512_add_ps(tmp1084, tmp1085);
__m512 tmp1067 = _mm512_fmadd_ps(tmp1071, _mm512_set1_ps(1.6e+01f), tmp1068);
__m512 tmp1087 = _mm512_fmadd_ps(tmp1091, _mm512_set1_ps(1.6e+01f), tmp1088);
__m512 tmp1074 = _mm512_fmadd_ps(tmp1071, _mm512_set1_ps(4e+00f), tmp1075);
__m512 tmp1094 = _mm512_fmadd_ps(tmp1091, _mm512_set1_ps(4e+00f), tmp1095);
__m512 tmp1080 = _mm512_add_ps(tmp1071, tmp1069);
__m512 tmp1100 = _mm512_add_ps(tmp1091, tmp1089);
__m512 tmp1073 = _mm512_fmadd_ps(tmp1064, _mm512_set1_ps(4e+00f), tmp1065);
__m512 tmp1093 = _mm512_fmadd_ps(tmp1084, _mm512_set1_ps(4e+00f), tmp1085);
__m512 tmp1077 = _mm512_fmadd_ps(tmp1064, _mm512_set1_ps(1.6e+01f), tmp1065);
__m512 tmp1097 = _mm512_fmadd_ps(tmp1084, _mm512_set1_ps(1.6e+01f), tmp1085);
__m512 tmp1062 = _mm512_add_ps(tmp1063, tmp1005);
__m512 tmp1082 = _mm512_add_ps(tmp1083, tmp1013);
__m512 tmp1079 = _mm512_add_ps(tmp1080, tmp1012);
__m512 tmp1099 = _mm512_add_ps(tmp1100, tmp1060);
__m512 tmp1061 = _mm512_fmadd_ps(tmp1066, _mm512_set1_ps(3.2e+01f), tmp1062);
__m512 tmp1081 = _mm512_fmadd_ps(tmp1086, _mm512_set1_ps(3.2e+01f), tmp1082);
__m512 tmp1072 = _mm512_fmadd_ps(tmp1066, _mm512_set1_ps(8e+00f), tmp1073);
__m512 tmp1092 = _mm512_fmadd_ps(tmp1086, _mm512_set1_ps(8e+00f), tmp1093);
__m512 tmp1078 = _mm512_fmadd_ps(tmp1070, _mm512_set1_ps(3.2e+01f), tmp1079);
__m512 tmp1098 = _mm512_fmadd_ps(tmp1090, _mm512_set1_ps(3.2e+01f), tmp1099);
__m512 tmp1076 = _mm512_fmadd_ps(tmp1066, _mm512_set1_ps(2e+00f), tmp1077);
__m512 tmp1096 = _mm512_fmadd_ps(tmp1086, _mm512_set1_ps(2e+00f), tmp1097);
__m512 out217 = tmp1061;
__m512 out223 = tmp1081;
__m512 out218 = tmp1067;
__m512 out224 = tmp1087;
__m512 out219 = tmp1072;
__m512 out225 = tmp1092;
__m512 out220 = tmp1074;
__m512 out226 = tmp1094;
__m512 out221 = tmp1076;
__m512 out227 = tmp1096;
__m512 out222 = tmp1078;
__m512 out228 = tmp1098;
// Stores. Mask 4095 writes lanes 0-11, mask 1 writes lane 0 only, mask 4032
// writes lanes 6-11 only. out217..out222 are written whole at +848, +948,
// ... +1348. out223 and out224 are each written in two pieces (lane 0 at
// +896/+996, lanes 6-11 via +1376/+1476); out225..out228 contribute lane 0
// only.
// NOTE(review): offsets are in units of datPtr3's pointee type, which is
// declared outside this excerpt -- confirm before changing any constant.
_mm512_mask_storeu_ps(datPtr3+848+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out217);
_mm512_mask_storeu_ps(datPtr3+896+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 1, out223);
_mm512_mask_storeu_ps(datPtr3+1376+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4032, out223);
_mm512_mask_storeu_ps(datPtr3+948+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out218);
_mm512_mask_storeu_ps(datPtr3+996+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 1, out224);
_mm512_mask_storeu_ps(datPtr3+1476+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4032, out224);
_mm512_mask_storeu_ps(datPtr3+1048+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out219);
_mm512_mask_storeu_ps(datPtr3+1096+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 1, out225);
_mm512_mask_storeu_ps(datPtr3+1148+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out220);
_mm512_mask_storeu_ps(datPtr3+1196+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 1, out226);
_mm512_mask_storeu_ps(datPtr3+1248+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out221);
_mm512_mask_storeu_ps(datPtr3+1296+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 1, out227);
_mm512_mask_storeu_ps(datPtr3+1348+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 4095, out222);
_mm512_mask_storeu_ps(datPtr3+1396+619200*i11+100*toH1+4*toW1+3200*k11+1600*l5, 1, out228);
}
if (k11 >= kk3) return;
}
ptrdiff_t l6 = 0;
for (; l6 != 1; ++l6) {
__m512 sf49 = _mm512_loadu_ps(sfPtr3+0+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf50 = _mm512_loadu_ps(sfPtr3+128+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in161 = _mm512_shuffle_f32x4(sf49, sf50, 68);
__m512 in162 = _mm512_shuffle_f32x4(sf49, sf50, 238);
__m512 sf51 = _mm512_loadu_ps(sfPtr3+64+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf52 = _mm512_loadu_ps(sfPtr3+192+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in169 = _mm512_shuffle_f32x4(sf51, sf52, 68);
__m512 in170 = _mm512_shuffle_f32x4(sf51, sf52, 238);
__m512 sf53 = _mm512_loadu_ps(sfPtr3+495360+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf54 = _mm512_loadu_ps(sfPtr3+495488+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in163 = _mm512_shuffle_f32x4(sf53, sf54, 68);
__m512 in164 = _mm512_shuffle_f32x4(sf53, sf54, 238);
__m512 sf55 = _mm512_loadu_ps(sfPtr3+495424+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf56 = _mm512_loadu_ps(sfPtr3+495552+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in171 = _mm512_shuffle_f32x4(sf55, sf56, 68);
__m512 in172 = _mm512_shuffle_f32x4(sf55, sf56, 238);
__m512 sf57 = _mm512_loadu_ps(sfPtr3+990720+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf58 = _mm512_loadu_ps(sfPtr3+990848+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in165 = _mm512_shuffle_f32x4(sf57, sf58, 68);
__m512 in166 = _mm512_shuffle_f32x4(sf57, sf58, 238);
__m512 sf59 = _mm512_loadu_ps(sfPtr3+990784+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf60 = _mm512_loadu_ps(sfPtr3+990912+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in173 = _mm512_shuffle_f32x4(sf59, sf60, 68);
__m512 in174 = _mm512_shuffle_f32x4(sf59, sf60, 238);
__m512 sf61 = _mm512_loadu_ps(sfPtr3+1486080+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf62 = _mm512_loadu_ps(sfPtr3+1486208+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in167 = _mm512_shuffle_f32x4(sf61, sf62, 68);
__m512 in168 = _mm512_shuffle_f32x4(sf61, sf62, 238);
__m512 sf63 = _mm512_loadu_ps(sfPtr3+1486144+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf64 = _mm512_loadu_ps(sfPtr3+1486272+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in175 = _mm512_shuffle_f32x4(sf63, sf64, 68);
__m512 in176 = _mm512_shuffle_f32x4(sf63, sf64, 238);
__m512 tmp1157 = _mm512_add_ps(in162, in163);
__m512 tmp1177 = _mm512_add_ps(in170, in171);
__m512 tmp1156 = _mm512_add_ps(in164, in165);
__m512 tmp1176 = _mm512_add_ps(in172, in173);
__m512 tmp1162 = _mm512_sub_ps(in164, in165);
__m512 tmp1182 = _mm512_sub_ps(in172, in173);
__m512 tmp1161 = _mm512_sub_ps(in162, in163);
__m512 tmp1181 = _mm512_sub_ps(in170, in171);
__m512 tmp1158 = _mm512_add_ps(in166, in167);
__m512 tmp1178 = _mm512_add_ps(in174, in175);
__m512 tmp1163 = _mm512_sub_ps(in166, in167);
__m512 tmp1183 = _mm512_sub_ps(in174, in175);
__m512 tmp1160 = _mm512_fmadd_ps(tmp1162, _mm512_set1_ps(2e+00f), tmp1161);
__m512 tmp1180 = _mm512_fmadd_ps(tmp1182, _mm512_set1_ps(2e+00f), tmp1181);
__m512 tmp1167 = _mm512_fmadd_ps(tmp1162, _mm512_set1_ps(8e+00f), tmp1161);
__m512 tmp1187 = _mm512_fmadd_ps(tmp1182, _mm512_set1_ps(8e+00f), tmp1181);
__m512 tmp1155 = _mm512_add_ps(tmp1156, tmp1157);
__m512 tmp1175 = _mm512_add_ps(tmp1176, tmp1177);
__m512 tmp1159 = _mm512_fmadd_ps(tmp1163, _mm512_set1_ps(1.6e+01f), tmp1160);
__m512 tmp1179 = _mm512_fmadd_ps(tmp1183, _mm512_set1_ps(1.6e+01f), tmp1180);
__m512 tmp1166 = _mm512_fmadd_ps(tmp1163, _mm512_set1_ps(4e+00f), tmp1167);
__m512 tmp1186 = _mm512_fmadd_ps(tmp1183, _mm512_set1_ps(4e+00f), tmp1187);
__m512 tmp1172 = _mm512_add_ps(tmp1163, tmp1161);
__m512 tmp1192 = _mm512_add_ps(tmp1183, tmp1181);
__m512 tmp1165 = _mm512_fmadd_ps(tmp1156, _mm512_set1_ps(4e+00f), tmp1157);
__m512 tmp1185 = _mm512_fmadd_ps(tmp1176, _mm512_set1_ps(4e+00f), tmp1177);
__m512 tmp1169 = _mm512_fmadd_ps(tmp1156, _mm512_set1_ps(1.6e+01f), tmp1157);
__m512 tmp1189 = _mm512_fmadd_ps(tmp1176, _mm512_set1_ps(1.6e+01f), tmp1177);
__m512 tmp1154 = _mm512_add_ps(tmp1155, in161);
__m512 tmp1174 = _mm512_add_ps(tmp1175, in169);
__m512 tmp1171 = _mm512_add_ps(tmp1172, in168);
__m512 tmp1191 = _mm512_add_ps(tmp1192, in176);
__m512 tmp1153 = _mm512_fmadd_ps(tmp1158, _mm512_set1_ps(3.2e+01f), tmp1154);
__m512 tmp1173 = _mm512_fmadd_ps(tmp1178, _mm512_set1_ps(3.2e+01f), tmp1174);
__m512 tmp1164 = _mm512_fmadd_ps(tmp1158, _mm512_set1_ps(8e+00f), tmp1165);
__m512 tmp1184 = _mm512_fmadd_ps(tmp1178, _mm512_set1_ps(8e+00f), tmp1185);
__m512 tmp1170 = _mm512_fmadd_ps(tmp1162, _mm512_set1_ps(3.2e+01f), tmp1171);
__m512 tmp1190 = _mm512_fmadd_ps(tmp1182, _mm512_set1_ps(3.2e+01f), tmp1191);
__m512 tmp1168 = _mm512_fmadd_ps(tmp1158, _mm512_set1_ps(2e+00f), tmp1169);
__m512 tmp1188 = _mm512_fmadd_ps(tmp1178, _mm512_set1_ps(2e+00f), tmp1189);
__m512 tmp1141 = tmp1153;
__m512 tmp1147 = tmp1173;
__m512 tmp1142 = tmp1159;
__m512 tmp1148 = tmp1179;
__m512 tmp1143 = tmp1164;
__m512 tmp1149 = tmp1184;
__m512 tmp1144 = tmp1166;
__m512 tmp1150 = tmp1186;
__m512 tmp1145 = tmp1168;
__m512 tmp1151 = tmp1188;
__m512 tmp1146 = tmp1170;
__m512 tmp1152 = tmp1190;
__m512 tmp1237 = _mm512_unpacklo_ps(tmp1141, tmp1142);
__m512 tmp1238 = _mm512_unpackhi_ps(tmp1141, tmp1142);
__m512 tmp1239 = _mm512_unpacklo_ps(tmp1143, tmp1144);
__m512 tmp1240 = _mm512_unpackhi_ps(tmp1143, tmp1144);
__m512 tmp1241 = _mm512_unpacklo_ps(tmp1145, tmp1146);
__m512 tmp1242 = _mm512_unpackhi_ps(tmp1145, tmp1146);
__m512 tmp1243 = _mm512_unpacklo_ps(tmp1147, tmp1148);
__m512 tmp1244 = _mm512_unpackhi_ps(tmp1147, tmp1148);
__m512 tmp1245 = _mm512_unpacklo_ps(tmp1149, tmp1150);
__m512 tmp1246 = _mm512_unpackhi_ps(tmp1149, tmp1150);
__m512 tmp1247 = _mm512_unpacklo_ps(tmp1151, tmp1152);
__m512 tmp1248 = _mm512_unpackhi_ps(tmp1151, tmp1152);
__m512 tmp1249 = _mm512_shuffle_ps(tmp1237, tmp1239, 68);
__m512 tmp1250 = _mm512_shuffle_ps(tmp1237, tmp1239, 238);
__m512 tmp1251 = _mm512_shuffle_ps(tmp1238, tmp1240, 68);
__m512 tmp1252 = _mm512_shuffle_ps(tmp1238, tmp1240, 238);
__m512 tmp1253 = _mm512_shuffle_ps(tmp1241, tmp1243, 68);
__m512 tmp1254 = _mm512_shuffle_ps(tmp1241, tmp1243, 238);
__m512 tmp1255 = _mm512_shuffle_ps(tmp1242, tmp1244, 68);
__m512 tmp1256 = _mm512_shuffle_ps(tmp1242, tmp1244, 238);
__m512 tmp1257 = _mm512_shuffle_ps(tmp1245, tmp1247, 68);
__m512 tmp1258 = _mm512_shuffle_ps(tmp1245, tmp1247, 238);
__m512 tmp1259 = _mm512_shuffle_ps(tmp1246, tmp1248, 68);
__m512 tmp1260 = _mm512_shuffle_ps(tmp1246, tmp1248, 238);
__m512 tmp1261 = _mm512_shuffle_f32x4(tmp1249, tmp1253, 136);
__m512 tmp1262 = _mm512_shuffle_f32x4(tmp1249, tmp1253, 221);
__m512 tmp1263 = _mm512_shuffle_f32x4(tmp1250, tmp1254, 136);
__m512 tmp1264 = _mm512_shuffle_f32x4(tmp1250, tmp1254, 221);
__m512 tmp1265 = _mm512_shuffle_f32x4(tmp1251, tmp1255, 136);
__m512 tmp1266 = _mm512_shuffle_f32x4(tmp1251, tmp1255, 221);
__m512 tmp1267 = _mm512_shuffle_f32x4(tmp1252, tmp1256, 136);
__m512 tmp1268 = _mm512_shuffle_f32x4(tmp1252, tmp1256, 221);
__m512 tmp1269 = _mm512_shuffle_f32x4(tmp1257, tmp1257, 136);
__m512 tmp1270 = _mm512_shuffle_f32x4(tmp1257, tmp1257, 221);
__m512 tmp1271 = _mm512_shuffle_f32x4(tmp1258, tmp1258, 136);
__m512 tmp1272 = _mm512_shuffle_f32x4(tmp1258, tmp1258, 221);
__m512 tmp1273 = _mm512_shuffle_f32x4(tmp1259, tmp1259, 136);
__m512 tmp1274 = _mm512_shuffle_f32x4(tmp1259, tmp1259, 221);
__m512 tmp1275 = _mm512_shuffle_f32x4(tmp1260, tmp1260, 136);
__m512 tmp1276 = _mm512_shuffle_f32x4(tmp1260, tmp1260, 221);
tmp1141 = _mm512_shuffle_f32x4(tmp1261, tmp1269, 136);
tmp1149 = _mm512_shuffle_f32x4(tmp1261, tmp1269, 221);
tmp1142 = _mm512_shuffle_f32x4(tmp1263, tmp1271, 136);
tmp1150 = _mm512_shuffle_f32x4(tmp1263, tmp1271, 221);
tmp1143 = _mm512_shuffle_f32x4(tmp1265, tmp1273, 136);
tmp1151 = _mm512_shuffle_f32x4(tmp1265, tmp1273, 221);
tmp1144 = _mm512_shuffle_f32x4(tmp1267, tmp1275, 136);
tmp1152 = _mm512_shuffle_f32x4(tmp1267, tmp1275, 221);
tmp1145 = _mm512_shuffle_f32x4(tmp1262, tmp1270, 136);
__m512 tmp1193 = _mm512_shuffle_f32x4(tmp1262, tmp1270, 221);
tmp1146 = _mm512_shuffle_f32x4(tmp1264, tmp1272, 136);
__m512 tmp1194 = _mm512_shuffle_f32x4(tmp1264, tmp1272, 221);
tmp1147 = _mm512_shuffle_f32x4(tmp1266, tmp1274, 136);
__m512 tmp1195 = _mm512_shuffle_f32x4(tmp1266, tmp1274, 221);
tmp1148 = _mm512_shuffle_f32x4(tmp1268, tmp1276, 136);
__m512 tmp1196 = _mm512_shuffle_f32x4(tmp1268, tmp1276, 221);
__m512 tmp1201 = _mm512_add_ps(tmp1142, tmp1143);
__m512 tmp1221 = _mm512_add_ps(tmp1150, tmp1151);
__m512 tmp1200 = _mm512_add_ps(tmp1144, tmp1145);
__m512 tmp1220 = _mm512_add_ps(tmp1152, tmp1193);
__m512 tmp1206 = _mm512_sub_ps(tmp1144, tmp1145);
__m512 tmp1226 = _mm512_sub_ps(tmp1152, tmp1193);
__m512 tmp1205 = _mm512_sub_ps(tmp1142, tmp1143);
__m512 tmp1225 = _mm512_sub_ps(tmp1150, tmp1151);
__m512 tmp1202 = _mm512_add_ps(tmp1146, tmp1147);
__m512 tmp1222 = _mm512_add_ps(tmp1194, tmp1195);
__m512 tmp1207 = _mm512_sub_ps(tmp1146, tmp1147);
__m512 tmp1227 = _mm512_sub_ps(tmp1194, tmp1195);
__m512 tmp1204 = _mm512_fmadd_ps(tmp1206, _mm512_set1_ps(2e+00f), tmp1205);
__m512 tmp1224 = _mm512_fmadd_ps(tmp1226, _mm512_set1_ps(2e+00f), tmp1225);
__m512 tmp1211 = _mm512_fmadd_ps(tmp1206, _mm512_set1_ps(8e+00f), tmp1205);
__m512 tmp1231 = _mm512_fmadd_ps(tmp1226, _mm512_set1_ps(8e+00f), tmp1225);
__m512 tmp1199 = _mm512_add_ps(tmp1200, tmp1201);
__m512 tmp1219 = _mm512_add_ps(tmp1220, tmp1221);
__m512 tmp1203 = _mm512_fmadd_ps(tmp1207, _mm512_set1_ps(1.6e+01f), tmp1204);
__m512 tmp1223 = _mm512_fmadd_ps(tmp1227, _mm512_set1_ps(1.6e+01f), tmp1224);
__m512 tmp1210 = _mm512_fmadd_ps(tmp1207, _mm512_set1_ps(4e+00f), tmp1211);
__m512 tmp1230 = _mm512_fmadd_ps(tmp1227, _mm512_set1_ps(4e+00f), tmp1231);
__m512 tmp1216 = _mm512_add_ps(tmp1207, tmp1205);
__m512 tmp1236 = _mm512_add_ps(tmp1227, tmp1225);
__m512 tmp1209 = _mm512_fmadd_ps(tmp1200, _mm512_set1_ps(4e+00f), tmp1201);
__m512 tmp1229 = _mm512_fmadd_ps(tmp1220, _mm512_set1_ps(4e+00f), tmp1221);
__m512 tmp1213 = _mm512_fmadd_ps(tmp1200, _mm512_set1_ps(1.6e+01f), tmp1201);
__m512 tmp1233 = _mm512_fmadd_ps(tmp1220, _mm512_set1_ps(1.6e+01f), tmp1221);
__m512 tmp1198 = _mm512_add_ps(tmp1199, tmp1141);
__m512 tmp1218 = _mm512_add_ps(tmp1219, tmp1149);
__m512 tmp1215 = _mm512_add_ps(tmp1216, tmp1148);
__m512 tmp1235 = _mm512_add_ps(tmp1236, tmp1196);
__m512 tmp1197 = _mm512_fmadd_ps(tmp1202, _mm512_set1_ps(3.2e+01f), tmp1198);
__m512 tmp1217 = _mm512_fmadd_ps(tmp1222, _mm512_set1_ps(3.2e+01f), tmp1218);
__m512 tmp1208 = _mm512_fmadd_ps(tmp1202, _mm512_set1_ps(8e+00f), tmp1209);
__m512 tmp1228 = _mm512_fmadd_ps(tmp1222, _mm512_set1_ps(8e+00f), tmp1229);
__m512 tmp1214 = _mm512_fmadd_ps(tmp1206, _mm512_set1_ps(3.2e+01f), tmp1215);
__m512 tmp1234 = _mm512_fmadd_ps(tmp1226, _mm512_set1_ps(3.2e+01f), tmp1235);
__m512 tmp1212 = _mm512_fmadd_ps(tmp1202, _mm512_set1_ps(2e+00f), tmp1213);
__m512 tmp1232 = _mm512_fmadd_ps(tmp1222, _mm512_set1_ps(2e+00f), tmp1233);
__m512 out229 = tmp1197;
__m512 out235 = tmp1217;
__m512 out230 = tmp1203;
__m512 out236 = tmp1223;
__m512 out231 = tmp1208;
__m512 out237 = tmp1228;
__m512 out232 = tmp1210;
__m512 out238 = tmp1230;
__m512 out233 = tmp1212;
__m512 out239 = tmp1232;
__m512 out234 = tmp1214;
__m512 out240 = tmp1234;
_mm512_mask_storeu_ps(datPtr3+0+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out229);
_mm512_mask_storeu_ps(datPtr3+48+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out235);
_mm512_mask_storeu_ps(datPtr3+100+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out230);
_mm512_mask_storeu_ps(datPtr3+148+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out236);
_mm512_mask_storeu_ps(datPtr3+200+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out231);
_mm512_mask_storeu_ps(datPtr3+248+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out237);
_mm512_mask_storeu_ps(datPtr3+300+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out232);
_mm512_mask_storeu_ps(datPtr3+348+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out238);
_mm512_mask_storeu_ps(datPtr3+400+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out233);
_mm512_mask_storeu_ps(datPtr3+448+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out239);
_mm512_mask_storeu_ps(datPtr3+500+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out234);
_mm512_mask_storeu_ps(datPtr3+548+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out240);
// Tile group: loads sixteen vectors from sfPtr3, runs two passes of an
// 8-input/6-output linear combination with a lane rearrangement in between,
// and writes twelve result vectors to datPtr3 with masked stores.
// NOTE(review): the coefficient pattern (1, 2, 4, 8, 16, 32) looks like a
// Winograd output transform -- confirm against the generator.
//
// Loads: four groups at base offsets 256, 495616, 990976 and 1486336
// (spaced 495360 apart), four vectors per group at +0, +64, +128, +192.
// Each loaded pair is recombined by 256-bit halves: immediate 68 takes the
// low halves of both arguments, 238 takes the high halves. In this group the
// vector loaded from the higher offset is passed as the FIRST argument.
__m512 sf65 = _mm512_loadu_ps(sfPtr3+256+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf66 = _mm512_loadu_ps(sfPtr3+384+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in177 = _mm512_shuffle_f32x4(sf66, sf65, 68);
__m512 in178 = _mm512_shuffle_f32x4(sf66, sf65, 238);
__m512 sf67 = _mm512_loadu_ps(sfPtr3+320+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf68 = _mm512_loadu_ps(sfPtr3+448+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in185 = _mm512_shuffle_f32x4(sf68, sf67, 68);
__m512 in186 = _mm512_shuffle_f32x4(sf68, sf67, 238);
__m512 sf69 = _mm512_loadu_ps(sfPtr3+495616+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf70 = _mm512_loadu_ps(sfPtr3+495744+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in179 = _mm512_shuffle_f32x4(sf70, sf69, 68);
__m512 in180 = _mm512_shuffle_f32x4(sf70, sf69, 238);
__m512 sf71 = _mm512_loadu_ps(sfPtr3+495680+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf72 = _mm512_loadu_ps(sfPtr3+495808+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in187 = _mm512_shuffle_f32x4(sf72, sf71, 68);
__m512 in188 = _mm512_shuffle_f32x4(sf72, sf71, 238);
__m512 sf73 = _mm512_loadu_ps(sfPtr3+990976+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf74 = _mm512_loadu_ps(sfPtr3+991104+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in181 = _mm512_shuffle_f32x4(sf74, sf73, 68);
__m512 in182 = _mm512_shuffle_f32x4(sf74, sf73, 238);
__m512 sf75 = _mm512_loadu_ps(sfPtr3+991040+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf76 = _mm512_loadu_ps(sfPtr3+991168+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in189 = _mm512_shuffle_f32x4(sf76, sf75, 68);
__m512 in190 = _mm512_shuffle_f32x4(sf76, sf75, 238);
__m512 sf77 = _mm512_loadu_ps(sfPtr3+1486336+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf78 = _mm512_loadu_ps(sfPtr3+1486464+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in183 = _mm512_shuffle_f32x4(sf78, sf77, 68);
__m512 in184 = _mm512_shuffle_f32x4(sf78, sf77, 238);
__m512 sf79 = _mm512_loadu_ps(sfPtr3+1486400+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf80 = _mm512_loadu_ps(sfPtr3+1486528+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in191 = _mm512_shuffle_f32x4(sf80, sf79, 68);
__m512 in192 = _mm512_shuffle_f32x4(sf80, sf79, 238);
// Pass 1. Two independent, interleaved chains: in177..in184 and in185..in192.
// With x0..x7 denoting the eight inputs of a chain, the six results are:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
__m512 tmp1293 = _mm512_add_ps(in178, in179);
__m512 tmp1313 = _mm512_add_ps(in186, in187);
__m512 tmp1292 = _mm512_add_ps(in180, in181);
__m512 tmp1312 = _mm512_add_ps(in188, in189);
__m512 tmp1298 = _mm512_sub_ps(in180, in181);
__m512 tmp1318 = _mm512_sub_ps(in188, in189);
__m512 tmp1297 = _mm512_sub_ps(in178, in179);
__m512 tmp1317 = _mm512_sub_ps(in186, in187);
__m512 tmp1294 = _mm512_add_ps(in182, in183);
__m512 tmp1314 = _mm512_add_ps(in190, in191);
__m512 tmp1299 = _mm512_sub_ps(in182, in183);
__m512 tmp1319 = _mm512_sub_ps(in190, in191);
__m512 tmp1296 = _mm512_fmadd_ps(tmp1298, _mm512_set1_ps(2e+00f), tmp1297);
__m512 tmp1316 = _mm512_fmadd_ps(tmp1318, _mm512_set1_ps(2e+00f), tmp1317);
__m512 tmp1303 = _mm512_fmadd_ps(tmp1298, _mm512_set1_ps(8e+00f), tmp1297);
__m512 tmp1323 = _mm512_fmadd_ps(tmp1318, _mm512_set1_ps(8e+00f), tmp1317);
__m512 tmp1291 = _mm512_add_ps(tmp1292, tmp1293);
__m512 tmp1311 = _mm512_add_ps(tmp1312, tmp1313);
__m512 tmp1295 = _mm512_fmadd_ps(tmp1299, _mm512_set1_ps(1.6e+01f), tmp1296);
__m512 tmp1315 = _mm512_fmadd_ps(tmp1319, _mm512_set1_ps(1.6e+01f), tmp1316);
__m512 tmp1302 = _mm512_fmadd_ps(tmp1299, _mm512_set1_ps(4e+00f), tmp1303);
__m512 tmp1322 = _mm512_fmadd_ps(tmp1319, _mm512_set1_ps(4e+00f), tmp1323);
__m512 tmp1308 = _mm512_add_ps(tmp1299, tmp1297);
__m512 tmp1328 = _mm512_add_ps(tmp1319, tmp1317);
__m512 tmp1301 = _mm512_fmadd_ps(tmp1292, _mm512_set1_ps(4e+00f), tmp1293);
__m512 tmp1321 = _mm512_fmadd_ps(tmp1312, _mm512_set1_ps(4e+00f), tmp1313);
__m512 tmp1305 = _mm512_fmadd_ps(tmp1292, _mm512_set1_ps(1.6e+01f), tmp1293);
__m512 tmp1325 = _mm512_fmadd_ps(tmp1312, _mm512_set1_ps(1.6e+01f), tmp1313);
__m512 tmp1290 = _mm512_add_ps(tmp1291, in177);
__m512 tmp1310 = _mm512_add_ps(tmp1311, in185);
__m512 tmp1307 = _mm512_add_ps(tmp1308, in184);
__m512 tmp1327 = _mm512_add_ps(tmp1328, in192);
__m512 tmp1289 = _mm512_fmadd_ps(tmp1294, _mm512_set1_ps(3.2e+01f), tmp1290);
__m512 tmp1309 = _mm512_fmadd_ps(tmp1314, _mm512_set1_ps(3.2e+01f), tmp1310);
__m512 tmp1300 = _mm512_fmadd_ps(tmp1294, _mm512_set1_ps(8e+00f), tmp1301);
__m512 tmp1320 = _mm512_fmadd_ps(tmp1314, _mm512_set1_ps(8e+00f), tmp1321);
__m512 tmp1306 = _mm512_fmadd_ps(tmp1298, _mm512_set1_ps(3.2e+01f), tmp1307);
__m512 tmp1326 = _mm512_fmadd_ps(tmp1318, _mm512_set1_ps(3.2e+01f), tmp1327);
__m512 tmp1304 = _mm512_fmadd_ps(tmp1294, _mm512_set1_ps(2e+00f), tmp1305);
__m512 tmp1324 = _mm512_fmadd_ps(tmp1314, _mm512_set1_ps(2e+00f), tmp1325);
// Collect the pass-1 results: tmp1277..tmp1282 hold y0..y5 of the first
// chain, tmp1283..tmp1288 hold y0..y5 of the second chain.
__m512 tmp1277 = tmp1289;
__m512 tmp1283 = tmp1309;
__m512 tmp1278 = tmp1295;
__m512 tmp1284 = tmp1315;
__m512 tmp1279 = tmp1300;
__m512 tmp1285 = tmp1320;
__m512 tmp1280 = tmp1302;
__m512 tmp1286 = tmp1322;
__m512 tmp1281 = tmp1304;
__m512 tmp1287 = tmp1324;
__m512 tmp1282 = tmp1306;
__m512 tmp1288 = tmp1326;
// Rearrangement network (unpack -> shuffle_ps -> shuffle_f32x4) that turns
// the twelve pass-1 vectors into the sixteen pass-2 inputs.
// NOTE(review): presumably a transpose so that pass 2 runs along the other
// tile axis -- confirm.
__m512 tmp1373 = _mm512_unpacklo_ps(tmp1277, tmp1278);
__m512 tmp1374 = _mm512_unpackhi_ps(tmp1277, tmp1278);
__m512 tmp1375 = _mm512_unpacklo_ps(tmp1279, tmp1280);
__m512 tmp1376 = _mm512_unpackhi_ps(tmp1279, tmp1280);
__m512 tmp1377 = _mm512_unpacklo_ps(tmp1281, tmp1282);
__m512 tmp1378 = _mm512_unpackhi_ps(tmp1281, tmp1282);
__m512 tmp1379 = _mm512_unpacklo_ps(tmp1283, tmp1284);
__m512 tmp1380 = _mm512_unpackhi_ps(tmp1283, tmp1284);
__m512 tmp1381 = _mm512_unpacklo_ps(tmp1285, tmp1286);
__m512 tmp1382 = _mm512_unpackhi_ps(tmp1285, tmp1286);
__m512 tmp1383 = _mm512_unpacklo_ps(tmp1287, tmp1288);
__m512 tmp1384 = _mm512_unpackhi_ps(tmp1287, tmp1288);
__m512 tmp1385 = _mm512_shuffle_ps(tmp1373, tmp1375, 68);
__m512 tmp1386 = _mm512_shuffle_ps(tmp1373, tmp1375, 238);
__m512 tmp1387 = _mm512_shuffle_ps(tmp1374, tmp1376, 68);
__m512 tmp1388 = _mm512_shuffle_ps(tmp1374, tmp1376, 238);
__m512 tmp1389 = _mm512_shuffle_ps(tmp1377, tmp1379, 68);
__m512 tmp1390 = _mm512_shuffle_ps(tmp1377, tmp1379, 238);
__m512 tmp1391 = _mm512_shuffle_ps(tmp1378, tmp1380, 68);
__m512 tmp1392 = _mm512_shuffle_ps(tmp1378, tmp1380, 238);
__m512 tmp1393 = _mm512_shuffle_ps(tmp1381, tmp1383, 68);
__m512 tmp1394 = _mm512_shuffle_ps(tmp1381, tmp1383, 238);
__m512 tmp1395 = _mm512_shuffle_ps(tmp1382, tmp1384, 68);
__m512 tmp1396 = _mm512_shuffle_ps(tmp1382, tmp1384, 238);
__m512 tmp1397 = _mm512_shuffle_f32x4(tmp1385, tmp1389, 136);
__m512 tmp1398 = _mm512_shuffle_f32x4(tmp1385, tmp1389, 221);
__m512 tmp1399 = _mm512_shuffle_f32x4(tmp1386, tmp1390, 136);
__m512 tmp1400 = _mm512_shuffle_f32x4(tmp1386, tmp1390, 221);
__m512 tmp1401 = _mm512_shuffle_f32x4(tmp1387, tmp1391, 136);
__m512 tmp1402 = _mm512_shuffle_f32x4(tmp1387, tmp1391, 221);
__m512 tmp1403 = _mm512_shuffle_f32x4(tmp1388, tmp1392, 136);
__m512 tmp1404 = _mm512_shuffle_f32x4(tmp1388, tmp1392, 221);
// tmp1393..tmp1396 are each shuffled against themselves (same vector passed
// as both arguments).
__m512 tmp1405 = _mm512_shuffle_f32x4(tmp1393, tmp1393, 136);
__m512 tmp1406 = _mm512_shuffle_f32x4(tmp1393, tmp1393, 221);
__m512 tmp1407 = _mm512_shuffle_f32x4(tmp1394, tmp1394, 136);
__m512 tmp1408 = _mm512_shuffle_f32x4(tmp1394, tmp1394, 221);
__m512 tmp1409 = _mm512_shuffle_f32x4(tmp1395, tmp1395, 136);
__m512 tmp1410 = _mm512_shuffle_f32x4(tmp1395, tmp1395, 221);
__m512 tmp1411 = _mm512_shuffle_f32x4(tmp1396, tmp1396, 136);
__m512 tmp1412 = _mm512_shuffle_f32x4(tmp1396, tmp1396, 221);
// Final stage of the network: tmp1277..tmp1288 are overwritten in place and
// four additional vectors tmp1329..tmp1332 are introduced.
tmp1277 = _mm512_shuffle_f32x4(tmp1397, tmp1405, 136);
tmp1285 = _mm512_shuffle_f32x4(tmp1397, tmp1405, 221);
tmp1278 = _mm512_shuffle_f32x4(tmp1399, tmp1407, 136);
tmp1286 = _mm512_shuffle_f32x4(tmp1399, tmp1407, 221);
tmp1279 = _mm512_shuffle_f32x4(tmp1401, tmp1409, 136);
tmp1287 = _mm512_shuffle_f32x4(tmp1401, tmp1409, 221);
tmp1280 = _mm512_shuffle_f32x4(tmp1403, tmp1411, 136);
tmp1288 = _mm512_shuffle_f32x4(tmp1403, tmp1411, 221);
tmp1281 = _mm512_shuffle_f32x4(tmp1398, tmp1406, 136);
__m512 tmp1329 = _mm512_shuffle_f32x4(tmp1398, tmp1406, 221);
tmp1282 = _mm512_shuffle_f32x4(tmp1400, tmp1408, 136);
__m512 tmp1330 = _mm512_shuffle_f32x4(tmp1400, tmp1408, 221);
tmp1283 = _mm512_shuffle_f32x4(tmp1402, tmp1410, 136);
__m512 tmp1331 = _mm512_shuffle_f32x4(tmp1402, tmp1410, 221);
tmp1284 = _mm512_shuffle_f32x4(tmp1404, tmp1412, 136);
__m512 tmp1332 = _mm512_shuffle_f32x4(tmp1404, tmp1412, 221);
// Pass 2: the same linear combination as pass 1, again as two interleaved
// chains. Chain one consumes tmp1277..tmp1284 (x0..x7); chain two consumes
// tmp1285..tmp1288 followed by tmp1329..tmp1332.
__m512 tmp1337 = _mm512_add_ps(tmp1278, tmp1279);
__m512 tmp1357 = _mm512_add_ps(tmp1286, tmp1287);
__m512 tmp1336 = _mm512_add_ps(tmp1280, tmp1281);
__m512 tmp1356 = _mm512_add_ps(tmp1288, tmp1329);
__m512 tmp1342 = _mm512_sub_ps(tmp1280, tmp1281);
__m512 tmp1362 = _mm512_sub_ps(tmp1288, tmp1329);
__m512 tmp1341 = _mm512_sub_ps(tmp1278, tmp1279);
__m512 tmp1361 = _mm512_sub_ps(tmp1286, tmp1287);
__m512 tmp1338 = _mm512_add_ps(tmp1282, tmp1283);
__m512 tmp1358 = _mm512_add_ps(tmp1330, tmp1331);
__m512 tmp1343 = _mm512_sub_ps(tmp1282, tmp1283);
__m512 tmp1363 = _mm512_sub_ps(tmp1330, tmp1331);
__m512 tmp1340 = _mm512_fmadd_ps(tmp1342, _mm512_set1_ps(2e+00f), tmp1341);
__m512 tmp1360 = _mm512_fmadd_ps(tmp1362, _mm512_set1_ps(2e+00f), tmp1361);
__m512 tmp1347 = _mm512_fmadd_ps(tmp1342, _mm512_set1_ps(8e+00f), tmp1341);
__m512 tmp1367 = _mm512_fmadd_ps(tmp1362, _mm512_set1_ps(8e+00f), tmp1361);
__m512 tmp1335 = _mm512_add_ps(tmp1336, tmp1337);
__m512 tmp1355 = _mm512_add_ps(tmp1356, tmp1357);
__m512 tmp1339 = _mm512_fmadd_ps(tmp1343, _mm512_set1_ps(1.6e+01f), tmp1340);
__m512 tmp1359 = _mm512_fmadd_ps(tmp1363, _mm512_set1_ps(1.6e+01f), tmp1360);
__m512 tmp1346 = _mm512_fmadd_ps(tmp1343, _mm512_set1_ps(4e+00f), tmp1347);
__m512 tmp1366 = _mm512_fmadd_ps(tmp1363, _mm512_set1_ps(4e+00f), tmp1367);
__m512 tmp1352 = _mm512_add_ps(tmp1343, tmp1341);
__m512 tmp1372 = _mm512_add_ps(tmp1363, tmp1361);
__m512 tmp1345 = _mm512_fmadd_ps(tmp1336, _mm512_set1_ps(4e+00f), tmp1337);
__m512 tmp1365 = _mm512_fmadd_ps(tmp1356, _mm512_set1_ps(4e+00f), tmp1357);
__m512 tmp1349 = _mm512_fmadd_ps(tmp1336, _mm512_set1_ps(1.6e+01f), tmp1337);
__m512 tmp1369 = _mm512_fmadd_ps(tmp1356, _mm512_set1_ps(1.6e+01f), tmp1357);
__m512 tmp1334 = _mm512_add_ps(tmp1335, tmp1277);
__m512 tmp1354 = _mm512_add_ps(tmp1355, tmp1285);
__m512 tmp1351 = _mm512_add_ps(tmp1352, tmp1284);
__m512 tmp1371 = _mm512_add_ps(tmp1372, tmp1332);
__m512 tmp1333 = _mm512_fmadd_ps(tmp1338, _mm512_set1_ps(3.2e+01f), tmp1334);
__m512 tmp1353 = _mm512_fmadd_ps(tmp1358, _mm512_set1_ps(3.2e+01f), tmp1354);
__m512 tmp1344 = _mm512_fmadd_ps(tmp1338, _mm512_set1_ps(8e+00f), tmp1345);
__m512 tmp1364 = _mm512_fmadd_ps(tmp1358, _mm512_set1_ps(8e+00f), tmp1365);
__m512 tmp1350 = _mm512_fmadd_ps(tmp1342, _mm512_set1_ps(3.2e+01f), tmp1351);
__m512 tmp1370 = _mm512_fmadd_ps(tmp1362, _mm512_set1_ps(3.2e+01f), tmp1371);
__m512 tmp1348 = _mm512_fmadd_ps(tmp1338, _mm512_set1_ps(2e+00f), tmp1349);
__m512 tmp1368 = _mm512_fmadd_ps(tmp1358, _mm512_set1_ps(2e+00f), tmp1369);
// Name the pass-2 results: out247..out252 come from chain one and
// out241..out246 from chain two.
__m512 out247 = tmp1333;
__m512 out241 = tmp1353;
__m512 out248 = tmp1339;
__m512 out242 = tmp1359;
__m512 out249 = tmp1344;
__m512 out243 = tmp1364;
__m512 out250 = tmp1346;
__m512 out244 = tmp1366;
__m512 out251 = tmp1348;
__m512 out245 = tmp1368;
__m512 out252 = tmp1350;
__m512 out246 = tmp1370;
// Stores. Mask 4095 writes lanes 0..11, mask 1 writes lane 0 only, and mask
// 4032 writes lanes 6..11 (so the first float written lies six floats past
// the given address).
//   out247..out252: lanes 0..11 at offsets 800, 900, ..., 1300.
//   out241..out246: lane 0 at offsets 96, 196, ..., 596.
//   out241, out242: additionally lanes 6..11 relative to offsets 576 and 676.
// NOTE(review): the step of 100 and the partial masks look like a row stride
// plus right/bottom edge handling -- confirm against the output tensor layout.
_mm512_mask_storeu_ps(datPtr3+800+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out247);
_mm512_mask_storeu_ps(datPtr3+96+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 1, out241);
_mm512_mask_storeu_ps(datPtr3+576+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4032, out241);
_mm512_mask_storeu_ps(datPtr3+900+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out248);
_mm512_mask_storeu_ps(datPtr3+196+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 1, out242);
_mm512_mask_storeu_ps(datPtr3+676+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4032, out242);
_mm512_mask_storeu_ps(datPtr3+1000+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out249);
_mm512_mask_storeu_ps(datPtr3+296+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 1, out243);
_mm512_mask_storeu_ps(datPtr3+1100+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out250);
_mm512_mask_storeu_ps(datPtr3+396+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 1, out244);
_mm512_mask_storeu_ps(datPtr3+1200+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out251);
_mm512_mask_storeu_ps(datPtr3+496+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 1, out245);
_mm512_mask_storeu_ps(datPtr3+1300+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out252);
_mm512_mask_storeu_ps(datPtr3+596+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 1, out246);
__m512 sf81 = _mm512_loadu_ps(sfPtr3+512+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf82 = _mm512_loadu_ps(sfPtr3+640+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in193 = _mm512_shuffle_f32x4(sf81, sf82, 68);
__m512 in194 = _mm512_shuffle_f32x4(sf81, sf82, 238);
__m512 sf83 = _mm512_loadu_ps(sfPtr3+576+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf84 = _mm512_loadu_ps(sfPtr3+704+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in201 = _mm512_shuffle_f32x4(sf83, sf84, 68);
__m512 in202 = _mm512_shuffle_f32x4(sf83, sf84, 238);
__m512 sf85 = _mm512_loadu_ps(sfPtr3+495872+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf86 = _mm512_loadu_ps(sfPtr3+496000+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in195 = _mm512_shuffle_f32x4(sf85, sf86, 68);
__m512 in196 = _mm512_shuffle_f32x4(sf85, sf86, 238);
__m512 sf87 = _mm512_loadu_ps(sfPtr3+495936+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf88 = _mm512_loadu_ps(sfPtr3+496064+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in203 = _mm512_shuffle_f32x4(sf87, sf88, 68);
__m512 in204 = _mm512_shuffle_f32x4(sf87, sf88, 238);
__m512 sf89 = _mm512_loadu_ps(sfPtr3+991232+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf90 = _mm512_loadu_ps(sfPtr3+991360+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in197 = _mm512_shuffle_f32x4(sf89, sf90, 68);
__m512 in198 = _mm512_shuffle_f32x4(sf89, sf90, 238);
__m512 sf91 = _mm512_loadu_ps(sfPtr3+991296+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf92 = _mm512_loadu_ps(sfPtr3+991424+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in205 = _mm512_shuffle_f32x4(sf91, sf92, 68);
__m512 in206 = _mm512_shuffle_f32x4(sf91, sf92, 238);
__m512 sf93 = _mm512_loadu_ps(sfPtr3+1486592+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf94 = _mm512_loadu_ps(sfPtr3+1486720+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in199 = _mm512_shuffle_f32x4(sf93, sf94, 68);
__m512 in200 = _mm512_shuffle_f32x4(sf93, sf94, 238);
__m512 sf95 = _mm512_loadu_ps(sfPtr3+1486656+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 sf96 = _mm512_loadu_ps(sfPtr3+1486784+1981440*i11+297216*j7+1536*k11+768*l6);
__m512 in207 = _mm512_shuffle_f32x4(sf95, sf96, 68);
__m512 in208 = _mm512_shuffle_f32x4(sf95, sf96, 238);
__m512 tmp1429 = _mm512_add_ps(in194, in195);
__m512 tmp1449 = _mm512_add_ps(in202, in203);
__m512 tmp1428 = _mm512_add_ps(in196, in197);
__m512 tmp1448 = _mm512_add_ps(in204, in205);
__m512 tmp1434 = _mm512_sub_ps(in196, in197);
__m512 tmp1454 = _mm512_sub_ps(in204, in205);
__m512 tmp1433 = _mm512_sub_ps(in194, in195);
__m512 tmp1453 = _mm512_sub_ps(in202, in203);
__m512 tmp1430 = _mm512_add_ps(in198, in199);
__m512 tmp1450 = _mm512_add_ps(in206, in207);
__m512 tmp1435 = _mm512_sub_ps(in198, in199);
__m512 tmp1455 = _mm512_sub_ps(in206, in207);
__m512 tmp1432 = _mm512_fmadd_ps(tmp1434, _mm512_set1_ps(2e+00f), tmp1433);
__m512 tmp1452 = _mm512_fmadd_ps(tmp1454, _mm512_set1_ps(2e+00f), tmp1453);
__m512 tmp1439 = _mm512_fmadd_ps(tmp1434, _mm512_set1_ps(8e+00f), tmp1433);
__m512 tmp1459 = _mm512_fmadd_ps(tmp1454, _mm512_set1_ps(8e+00f), tmp1453);
__m512 tmp1427 = _mm512_add_ps(tmp1428, tmp1429);
__m512 tmp1447 = _mm512_add_ps(tmp1448, tmp1449);
__m512 tmp1431 = _mm512_fmadd_ps(tmp1435, _mm512_set1_ps(1.6e+01f), tmp1432);
__m512 tmp1451 = _mm512_fmadd_ps(tmp1455, _mm512_set1_ps(1.6e+01f), tmp1452);
__m512 tmp1438 = _mm512_fmadd_ps(tmp1435, _mm512_set1_ps(4e+00f), tmp1439);
__m512 tmp1458 = _mm512_fmadd_ps(tmp1455, _mm512_set1_ps(4e+00f), tmp1459);
__m512 tmp1444 = _mm512_add_ps(tmp1435, tmp1433);
__m512 tmp1464 = _mm512_add_ps(tmp1455, tmp1453);
__m512 tmp1437 = _mm512_fmadd_ps(tmp1428, _mm512_set1_ps(4e+00f), tmp1429);
__m512 tmp1457 = _mm512_fmadd_ps(tmp1448, _mm512_set1_ps(4e+00f), tmp1449);
__m512 tmp1441 = _mm512_fmadd_ps(tmp1428, _mm512_set1_ps(1.6e+01f), tmp1429);
__m512 tmp1461 = _mm512_fmadd_ps(tmp1448, _mm512_set1_ps(1.6e+01f), tmp1449);
__m512 tmp1426 = _mm512_add_ps(tmp1427, in193);
__m512 tmp1446 = _mm512_add_ps(tmp1447, in201);
__m512 tmp1443 = _mm512_add_ps(tmp1444, in200);
__m512 tmp1463 = _mm512_add_ps(tmp1464, in208);
__m512 tmp1425 = _mm512_fmadd_ps(tmp1430, _mm512_set1_ps(3.2e+01f), tmp1426);
__m512 tmp1445 = _mm512_fmadd_ps(tmp1450, _mm512_set1_ps(3.2e+01f), tmp1446);
__m512 tmp1436 = _mm512_fmadd_ps(tmp1430, _mm512_set1_ps(8e+00f), tmp1437);
__m512 tmp1456 = _mm512_fmadd_ps(tmp1450, _mm512_set1_ps(8e+00f), tmp1457);
__m512 tmp1442 = _mm512_fmadd_ps(tmp1434, _mm512_set1_ps(3.2e+01f), tmp1443);
__m512 tmp1462 = _mm512_fmadd_ps(tmp1454, _mm512_set1_ps(3.2e+01f), tmp1463);
__m512 tmp1440 = _mm512_fmadd_ps(tmp1430, _mm512_set1_ps(2e+00f), tmp1441);
__m512 tmp1460 = _mm512_fmadd_ps(tmp1450, _mm512_set1_ps(2e+00f), tmp1461);
__m512 tmp1413 = tmp1425;
__m512 tmp1419 = tmp1445;
__m512 tmp1414 = tmp1431;
__m512 tmp1420 = tmp1451;
__m512 tmp1415 = tmp1436;
__m512 tmp1421 = tmp1456;
__m512 tmp1416 = tmp1438;
__m512 tmp1422 = tmp1458;
__m512 tmp1417 = tmp1440;
__m512 tmp1423 = tmp1460;
__m512 tmp1418 = tmp1442;
__m512 tmp1424 = tmp1462;
__m512 tmp1509 = _mm512_unpacklo_ps(tmp1413, tmp1414);
__m512 tmp1510 = _mm512_unpackhi_ps(tmp1413, tmp1414);
__m512 tmp1511 = _mm512_unpacklo_ps(tmp1415, tmp1416);
__m512 tmp1512 = _mm512_unpackhi_ps(tmp1415, tmp1416);
__m512 tmp1513 = _mm512_unpacklo_ps(tmp1417, tmp1418);
__m512 tmp1514 = _mm512_unpackhi_ps(tmp1417, tmp1418);
__m512 tmp1515 = _mm512_unpacklo_ps(tmp1419, tmp1420);
__m512 tmp1516 = _mm512_unpackhi_ps(tmp1419, tmp1420);
__m512 tmp1517 = _mm512_unpacklo_ps(tmp1421, tmp1422);
__m512 tmp1518 = _mm512_unpackhi_ps(tmp1421, tmp1422);
__m512 tmp1519 = _mm512_unpacklo_ps(tmp1423, tmp1424);
__m512 tmp1520 = _mm512_unpackhi_ps(tmp1423, tmp1424);
__m512 tmp1521 = _mm512_shuffle_ps(tmp1509, tmp1511, 68);
__m512 tmp1522 = _mm512_shuffle_ps(tmp1509, tmp1511, 238);
__m512 tmp1523 = _mm512_shuffle_ps(tmp1510, tmp1512, 68);
__m512 tmp1524 = _mm512_shuffle_ps(tmp1510, tmp1512, 238);
__m512 tmp1525 = _mm512_shuffle_ps(tmp1513, tmp1515, 68);
__m512 tmp1526 = _mm512_shuffle_ps(tmp1513, tmp1515, 238);
__m512 tmp1527 = _mm512_shuffle_ps(tmp1514, tmp1516, 68);
__m512 tmp1528 = _mm512_shuffle_ps(tmp1514, tmp1516, 238);
__m512 tmp1529 = _mm512_shuffle_ps(tmp1517, tmp1519, 68);
__m512 tmp1530 = _mm512_shuffle_ps(tmp1517, tmp1519, 238);
__m512 tmp1531 = _mm512_shuffle_ps(tmp1518, tmp1520, 68);
__m512 tmp1532 = _mm512_shuffle_ps(tmp1518, tmp1520, 238);
__m512 tmp1533 = _mm512_shuffle_f32x4(tmp1521, tmp1525, 136);
__m512 tmp1534 = _mm512_shuffle_f32x4(tmp1521, tmp1525, 221);
__m512 tmp1535 = _mm512_shuffle_f32x4(tmp1522, tmp1526, 136);
__m512 tmp1536 = _mm512_shuffle_f32x4(tmp1522, tmp1526, 221);
__m512 tmp1537 = _mm512_shuffle_f32x4(tmp1523, tmp1527, 136);
__m512 tmp1538 = _mm512_shuffle_f32x4(tmp1523, tmp1527, 221);
__m512 tmp1539 = _mm512_shuffle_f32x4(tmp1524, tmp1528, 136);
__m512 tmp1540 = _mm512_shuffle_f32x4(tmp1524, tmp1528, 221);
__m512 tmp1541 = _mm512_shuffle_f32x4(tmp1529, tmp1529, 136);
__m512 tmp1542 = _mm512_shuffle_f32x4(tmp1529, tmp1529, 221);
__m512 tmp1543 = _mm512_shuffle_f32x4(tmp1530, tmp1530, 136);
__m512 tmp1544 = _mm512_shuffle_f32x4(tmp1530, tmp1530, 221);
__m512 tmp1545 = _mm512_shuffle_f32x4(tmp1531, tmp1531, 136);
__m512 tmp1546 = _mm512_shuffle_f32x4(tmp1531, tmp1531, 221);
__m512 tmp1547 = _mm512_shuffle_f32x4(tmp1532, tmp1532, 136);
__m512 tmp1548 = _mm512_shuffle_f32x4(tmp1532, tmp1532, 221);
tmp1413 = _mm512_shuffle_f32x4(tmp1533, tmp1541, 136);
tmp1421 = _mm512_shuffle_f32x4(tmp1533, tmp1541, 221);
tmp1414 = _mm512_shuffle_f32x4(tmp1535, tmp1543, 136);
tmp1422 = _mm512_shuffle_f32x4(tmp1535, tmp1543, 221);
tmp1415 = _mm512_shuffle_f32x4(tmp1537, tmp1545, 136);
tmp1423 = _mm512_shuffle_f32x4(tmp1537, tmp1545, 221);
tmp1416 = _mm512_shuffle_f32x4(tmp1539, tmp1547, 136);
tmp1424 = _mm512_shuffle_f32x4(tmp1539, tmp1547, 221);
tmp1417 = _mm512_shuffle_f32x4(tmp1534, tmp1542, 136);
__m512 tmp1465 = _mm512_shuffle_f32x4(tmp1534, tmp1542, 221);
tmp1418 = _mm512_shuffle_f32x4(tmp1536, tmp1544, 136);
__m512 tmp1466 = _mm512_shuffle_f32x4(tmp1536, tmp1544, 221);
tmp1419 = _mm512_shuffle_f32x4(tmp1538, tmp1546, 136);
__m512 tmp1467 = _mm512_shuffle_f32x4(tmp1538, tmp1546, 221);
tmp1420 = _mm512_shuffle_f32x4(tmp1540, tmp1548, 136);
__m512 tmp1468 = _mm512_shuffle_f32x4(tmp1540, tmp1548, 221);
__m512 tmp1473 = _mm512_add_ps(tmp1414, tmp1415);
__m512 tmp1493 = _mm512_add_ps(tmp1422, tmp1423);
__m512 tmp1472 = _mm512_add_ps(tmp1416, tmp1417);
__m512 tmp1492 = _mm512_add_ps(tmp1424, tmp1465);
__m512 tmp1478 = _mm512_sub_ps(tmp1416, tmp1417);
__m512 tmp1498 = _mm512_sub_ps(tmp1424, tmp1465);
__m512 tmp1477 = _mm512_sub_ps(tmp1414, tmp1415);
__m512 tmp1497 = _mm512_sub_ps(tmp1422, tmp1423);
__m512 tmp1474 = _mm512_add_ps(tmp1418, tmp1419);
__m512 tmp1494 = _mm512_add_ps(tmp1466, tmp1467);
__m512 tmp1479 = _mm512_sub_ps(tmp1418, tmp1419);
__m512 tmp1499 = _mm512_sub_ps(tmp1466, tmp1467);
__m512 tmp1476 = _mm512_fmadd_ps(tmp1478, _mm512_set1_ps(2e+00f), tmp1477);
__m512 tmp1496 = _mm512_fmadd_ps(tmp1498, _mm512_set1_ps(2e+00f), tmp1497);
__m512 tmp1483 = _mm512_fmadd_ps(tmp1478, _mm512_set1_ps(8e+00f), tmp1477);
__m512 tmp1503 = _mm512_fmadd_ps(tmp1498, _mm512_set1_ps(8e+00f), tmp1497);
__m512 tmp1471 = _mm512_add_ps(tmp1472, tmp1473);
__m512 tmp1491 = _mm512_add_ps(tmp1492, tmp1493);
__m512 tmp1475 = _mm512_fmadd_ps(tmp1479, _mm512_set1_ps(1.6e+01f), tmp1476);
__m512 tmp1495 = _mm512_fmadd_ps(tmp1499, _mm512_set1_ps(1.6e+01f), tmp1496);
__m512 tmp1482 = _mm512_fmadd_ps(tmp1479, _mm512_set1_ps(4e+00f), tmp1483);
__m512 tmp1502 = _mm512_fmadd_ps(tmp1499, _mm512_set1_ps(4e+00f), tmp1503);
__m512 tmp1488 = _mm512_add_ps(tmp1479, tmp1477);
__m512 tmp1508 = _mm512_add_ps(tmp1499, tmp1497);
__m512 tmp1481 = _mm512_fmadd_ps(tmp1472, _mm512_set1_ps(4e+00f), tmp1473);
__m512 tmp1501 = _mm512_fmadd_ps(tmp1492, _mm512_set1_ps(4e+00f), tmp1493);
__m512 tmp1485 = _mm512_fmadd_ps(tmp1472, _mm512_set1_ps(1.6e+01f), tmp1473);
__m512 tmp1505 = _mm512_fmadd_ps(tmp1492, _mm512_set1_ps(1.6e+01f), tmp1493);
__m512 tmp1470 = _mm512_add_ps(tmp1471, tmp1413);
__m512 tmp1490 = _mm512_add_ps(tmp1491, tmp1421);
__m512 tmp1487 = _mm512_add_ps(tmp1488, tmp1420);
__m512 tmp1507 = _mm512_add_ps(tmp1508, tmp1468);
__m512 tmp1469 = _mm512_fmadd_ps(tmp1474, _mm512_set1_ps(3.2e+01f), tmp1470);
__m512 tmp1489 = _mm512_fmadd_ps(tmp1494, _mm512_set1_ps(3.2e+01f), tmp1490);
__m512 tmp1480 = _mm512_fmadd_ps(tmp1474, _mm512_set1_ps(8e+00f), tmp1481);
__m512 tmp1500 = _mm512_fmadd_ps(tmp1494, _mm512_set1_ps(8e+00f), tmp1501);
__m512 tmp1486 = _mm512_fmadd_ps(tmp1478, _mm512_set1_ps(3.2e+01f), tmp1487);
__m512 tmp1506 = _mm512_fmadd_ps(tmp1498, _mm512_set1_ps(3.2e+01f), tmp1507);
__m512 tmp1484 = _mm512_fmadd_ps(tmp1474, _mm512_set1_ps(2e+00f), tmp1485);
__m512 tmp1504 = _mm512_fmadd_ps(tmp1494, _mm512_set1_ps(2e+00f), tmp1505);
__m512 out253 = tmp1469;
__m512 out259 = tmp1489;
__m512 out254 = tmp1475;
__m512 out260 = tmp1495;
__m512 out255 = tmp1480;
__m512 out261 = tmp1500;
__m512 out256 = tmp1482;
__m512 out262 = tmp1502;
__m512 out257 = tmp1484;
__m512 out263 = tmp1504;
__m512 out258 = tmp1486;
__m512 out264 = tmp1506;
_mm512_mask_storeu_ps(datPtr3+848+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out253);
_mm512_mask_storeu_ps(datPtr3+896+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 1, out259);
_mm512_mask_storeu_ps(datPtr3+1376+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4032, out259);
_mm512_mask_storeu_ps(datPtr3+948+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out254);
_mm512_mask_storeu_ps(datPtr3+996+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 1, out260);
_mm512_mask_storeu_ps(datPtr3+1476+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4032, out260);
_mm512_mask_storeu_ps(datPtr3+1048+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out255);
_mm512_mask_storeu_ps(datPtr3+1096+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 1, out261);
_mm512_mask_storeu_ps(datPtr3+1148+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out256);
_mm512_mask_storeu_ps(datPtr3+1196+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 1, out262);
_mm512_mask_storeu_ps(datPtr3+1248+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out257);
_mm512_mask_storeu_ps(datPtr3+1296+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 1, out263);
_mm512_mask_storeu_ps(datPtr3+1348+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 4095, out258);
_mm512_mask_storeu_ps(datPtr3+1396+619200*i11+100*toH1+4*toW1+3200*k11+1600*l6, 1, out264);
}
if (j7 >= last3) return;
++j7;
rel3 = 1;
}
ptrdiff_t toH2 = base3+6;
ptrdiff_t toW2 = 6;
ptrdiff_t k12 = 27*w7;
ptrdiff_t kk4 = k12+(w7 < 6 ? 26 : 31);
for (; k12 != 193; ++k12) {
ptrdiff_t l7 = 0;
for (; l7 != 4; ++l7) {
__m512 sf97 = _mm512_loadu_ps(sfPtr3+0+1981440*i11+297216*j7+1024*k12+256*l7);
__m512 sf98 = _mm512_loadu_ps(sfPtr3+128+1981440*i11+297216*j7+1024*k12+256*l7);
__m512 in209 = _mm512_shuffle_f32x4(sf97, sf98, 68);
__m512 in210 = _mm512_shuffle_f32x4(sf97, sf98, 238);
__m512 sf99 = _mm512_loadu_ps(sfPtr3+64+1981440*i11+297216*j7+1024*k12+256*l7);
__m512 sf100 = _mm512_loadu_ps(sfPtr3+192+1981440*i11+297216*j7+1024*k12+256*l7);
__m512 in217 = _mm512_shuffle_f32x4(sf99, sf100, 68);
__m512 in218 = _mm512_shuffle_f32x4(sf99, sf100, 238);
__m512 sf101 = _mm512_loadu_ps(sfPtr3+495360+1981440*i11+297216*j7+1024*k12+256*l7);
__m512 sf102 = _mm512_loadu_ps(sfPtr3+495488+1981440*i11+297216*j7+1024*k12+256*l7);
__m512 in211 = _mm512_shuffle_f32x4(sf101, sf102, 68);
__m512 in212 = _mm512_shuffle_f32x4(sf101, sf102, 238);
__m512 sf103 = _mm512_loadu_ps(sfPtr3+495424+1981440*i11+297216*j7+1024*k12+256*l7);
__m512 sf104 = _mm512_loadu_ps(sfPtr3+495552+1981440*i11+297216*j7+1024*k12+256*l7);
__m512 in219 = _mm512_shuffle_f32x4(sf103, sf104, 68);
__m512 in220 = _mm512_shuffle_f32x4(sf103, sf104, 238);
__m512 sf105 = _mm512_loadu_ps(sfPtr3+990720+1981440*i11+297216*j7+1024*k12+256*l7);
__m512 sf106 = _mm512_loadu_ps(sfPtr3+990848+1981440*i11+297216*j7+1024*k12+256*l7);
__m512 in213 = _mm512_shuffle_f32x4(sf105, sf106, 68);
__m512 in214 = _mm512_shuffle_f32x4(sf105, sf106, 238);
__m512 sf107 = _mm512_loadu_ps(sfPtr3+990784+1981440*i11+297216*j7+1024*k12+256*l7);
__m512 sf108 = _mm512_loadu_ps(sfPtr3+990912+1981440*i11+297216*j7+1024*k12+256*l7);
__m512 in221 = _mm512_shuffle_f32x4(sf107, sf108, 68);
__m512 in222 = _mm512_shuffle_f32x4(sf107, sf108, 238);
__m512 sf109 = _mm512_loadu_ps(sfPtr3+1486080+1981440*i11+297216*j7+1024*k12+256*l7);
__m512 sf110 = _mm512_loadu_ps(sfPtr3+1486208+1981440*i11+297216*j7+1024*k12+256*l7);
__m512 in215 = _mm512_shuffle_f32x4(sf109, sf110, 68);
__m512 in216 = _mm512_shuffle_f32x4(sf109, sf110, 238);
__m512 sf111 = _mm512_loadu_ps(sfPtr3+1486144+1981440*i11+297216*j7+1024*k12+256*l7);
__m512 sf112 = _mm512_loadu_ps(sfPtr3+1486272+1981440*i11+297216*j7+1024*k12+256*l7);
__m512 in223 = _mm512_shuffle_f32x4(sf111, sf112, 68);
__m512 in224 = _mm512_shuffle_f32x4(sf111, sf112, 238);
__m512 tmp1565 = _mm512_add_ps(in210, in211);
__m512 tmp1585 = _mm512_add_ps(in218, in219);
__m512 tmp1564 = _mm512_add_ps(in212, in213);
__m512 tmp1584 = _mm512_add_ps(in220, in221);
__m512 tmp1570 = _mm512_sub_ps(in212, in213);
__m512 tmp1590 = _mm512_sub_ps(in220, in221);
__m512 tmp1569 = _mm512_sub_ps(in210, in211);
__m512 tmp1589 = _mm512_sub_ps(in218, in219);
__m512 tmp1566 = _mm512_add_ps(in214, in215);
__m512 tmp1586 = _mm512_add_ps(in222, in223);
__m512 tmp1571 = _mm512_sub_ps(in214, in215);
__m512 tmp1591 = _mm512_sub_ps(in222, in223);
__m512 tmp1568 = _mm512_fmadd_ps(tmp1570, _mm512_set1_ps(2e+00f), tmp1569);
__m512 tmp1588 = _mm512_fmadd_ps(tmp1590, _mm512_set1_ps(2e+00f), tmp1589);
__m512 tmp1575 = _mm512_fmadd_ps(tmp1570, _mm512_set1_ps(8e+00f), tmp1569);
__m512 tmp1595 = _mm512_fmadd_ps(tmp1590, _mm512_set1_ps(8e+00f), tmp1589);
__m512 tmp1563 = _mm512_add_ps(tmp1564, tmp1565);
__m512 tmp1583 = _mm512_add_ps(tmp1584, tmp1585);
__m512 tmp1567 = _mm512_fmadd_ps(tmp1571, _mm512_set1_ps(1.6e+01f), tmp1568);
__m512 tmp1587 = _mm512_fmadd_ps(tmp1591, _mm512_set1_ps(1.6e+01f), tmp1588);
__m512 tmp1574 = _mm512_fmadd_ps(tmp1571, _mm512_set1_ps(4e+00f), tmp1575);
__m512 tmp1594 = _mm512_fmadd_ps(tmp1591, _mm512_set1_ps(4e+00f), tmp1595);
__m512 tmp1580 = _mm512_add_ps(tmp1571, tmp1569);
__m512 tmp1600 = _mm512_add_ps(tmp1591, tmp1589);
__m512 tmp1573 = _mm512_fmadd_ps(tmp1564, _mm512_set1_ps(4e+00f), tmp1565);
__m512 tmp1593 = _mm512_fmadd_ps(tmp1584, _mm512_set1_ps(4e+00f), tmp1585);
__m512 tmp1577 = _mm512_fmadd_ps(tmp1564, _mm512_set1_ps(1.6e+01f), tmp1565);
__m512 tmp1597 = _mm512_fmadd_ps(tmp1584, _mm512_set1_ps(1.6e+01f), tmp1585);
__m512 tmp1562 = _mm512_add_ps(tmp1563, in209);
__m512 tmp1582 = _mm512_add_ps(tmp1583, in217);
__m512 tmp1579 = _mm512_add_ps(tmp1580, in216);
__m512 tmp1599 = _mm512_add_ps(tmp1600, in224);
__m512 tmp1561 = _mm512_fmadd_ps(tmp1566, _mm512_set1_ps(3.2e+01f), tmp1562);
__m512 tmp1581 = _mm512_fmadd_ps(tmp1586, _mm512_set1_ps(3.2e+01f), tmp1582);
__m512 tmp1572 = _mm512_fmadd_ps(tmp1566, _mm512_set1_ps(8e+00f), tmp1573);
__m512 tmp1592 = _mm512_fmadd_ps(tmp1586, _mm512_set1_ps(8e+00f), tmp1593);
__m512 tmp1578 = _mm512_fmadd_ps(tmp1570, _mm512_set1_ps(3.2e+01f), tmp1579);
__m512 tmp1598 = _mm512_fmadd_ps(tmp1590, _mm512_set1_ps(3.2e+01f), tmp1599);
__m512 tmp1576 = _mm512_fmadd_ps(tmp1566, _mm512_set1_ps(2e+00f), tmp1577);
__m512 tmp1596 = _mm512_fmadd_ps(tmp1586, _mm512_set1_ps(2e+00f), tmp1597);
__m512 tmp1549 = tmp1561;
__m512 tmp1555 = tmp1581;
__m512 tmp1550 = tmp1567;
__m512 tmp1556 = tmp1587;
__m512 tmp1551 = tmp1572;
__m512 tmp1557 = tmp1592;
__m512 tmp1552 = tmp1574;
__m512 tmp1558 = tmp1594;
__m512 tmp1553 = tmp1576;
__m512 tmp1559 = tmp1596;
__m512 tmp1554 = tmp1578;
__m512 tmp1560 = tmp1598;
__m512 tmp1627 = _mm512_unpacklo_ps(tmp1549, tmp1550);
__m512 tmp1628 = _mm512_unpackhi_ps(tmp1549, tmp1550);
__m512 tmp1629 = _mm512_unpacklo_ps(tmp1551, tmp1552);
__m512 tmp1630 = _mm512_unpackhi_ps(tmp1551, tmp1552);
__m512 tmp1631 = _mm512_unpacklo_ps(tmp1553, tmp1554);
__m512 tmp1632 = _mm512_unpackhi_ps(tmp1553, tmp1554);
__m512 tmp1633 = _mm512_unpacklo_ps(tmp1555, tmp1556);
__m512 tmp1634 = _mm512_unpackhi_ps(tmp1555, tmp1556);
__m512 tmp1635 = _mm512_unpacklo_ps(tmp1557, tmp1558);
__m512 tmp1636 = _mm512_unpackhi_ps(tmp1557, tmp1558);
__m512 tmp1637 = _mm512_unpacklo_ps(tmp1559, tmp1560);
__m512 tmp1638 = _mm512_unpackhi_ps(tmp1559, tmp1560);
__m512 tmp1639 = _mm512_shuffle_ps(tmp1627, tmp1629, 68);
__m512 tmp1640 = _mm512_shuffle_ps(tmp1627, tmp1629, 238);
__m512 tmp1641 = _mm512_shuffle_ps(tmp1628, tmp1630, 68);
__m512 tmp1642 = _mm512_shuffle_ps(tmp1628, tmp1630, 238);
__m512 tmp1643 = _mm512_shuffle_ps(tmp1631, tmp1633, 68);
__m512 tmp1644 = _mm512_shuffle_ps(tmp1631, tmp1633, 238);
__m512 tmp1645 = _mm512_shuffle_ps(tmp1632, tmp1634, 68);
__m512 tmp1646 = _mm512_shuffle_ps(tmp1632, tmp1634, 238);
__m512 tmp1647 = _mm512_shuffle_ps(tmp1635, tmp1637, 68);
__m512 tmp1648 = _mm512_shuffle_ps(tmp1635, tmp1637, 238);
__m512 tmp1649 = _mm512_shuffle_ps(tmp1636, tmp1638, 68);
__m512 tmp1650 = _mm512_shuffle_ps(tmp1636, tmp1638, 238);
__m512 tmp1651 = _mm512_shuffle_f32x4(tmp1639, tmp1643, 136);
__m512 tmp1652 = _mm512_shuffle_f32x4(tmp1639, tmp1643, 221);
__m512 tmp1653 = _mm512_shuffle_f32x4(tmp1640, tmp1644, 136);
__m512 tmp1654 = _mm512_shuffle_f32x4(tmp1640, tmp1644, 221);
__m512 tmp1655 = _mm512_shuffle_f32x4(tmp1641, tmp1645, 136);
__m512 tmp1656 = _mm512_shuffle_f32x4(tmp1641, tmp1645, 221);
__m512 tmp1657 = _mm512_shuffle_f32x4(tmp1642, tmp1646, 136);
__m512 tmp1658 = _mm512_shuffle_f32x4(tmp1642, tmp1646, 221);
__m512 tmp1659 = _mm512_shuffle_f32x4(tmp1647, tmp1647, 136);
__m512 tmp1660 = _mm512_shuffle_f32x4(tmp1647, tmp1647, 221);
__m512 tmp1661 = _mm512_shuffle_f32x4(tmp1648, tmp1648, 136);
__m512 tmp1662 = _mm512_shuffle_f32x4(tmp1648, tmp1648, 221);
__m512 tmp1663 = _mm512_shuffle_f32x4(tmp1649, tmp1649, 136);
__m512 tmp1664 = _mm512_shuffle_f32x4(tmp1649, tmp1649, 221);
__m512 tmp1665 = _mm512_shuffle_f32x4(tmp1650, tmp1650, 136);
__m512 tmp1666 = _mm512_shuffle_f32x4(tmp1650, tmp1650, 221);
tmp1549 = _mm512_shuffle_f32x4(tmp1651, tmp1659, 136);
tmp1557 = _mm512_shuffle_f32x4(tmp1651, tmp1659, 221);
tmp1550 = _mm512_shuffle_f32x4(tmp1653, tmp1661, 136);
tmp1558 = _mm512_shuffle_f32x4(tmp1653, tmp1661, 221);
tmp1551 = _mm512_shuffle_f32x4(tmp1655, tmp1663, 136);
tmp1559 = _mm512_shuffle_f32x4(tmp1655, tmp1663, 221);
tmp1552 = _mm512_shuffle_f32x4(tmp1657, tmp1665, 136);
tmp1560 = _mm512_shuffle_f32x4(tmp1657, tmp1665, 221);
tmp1553 = _mm512_shuffle_f32x4(tmp1652, tmp1660, 136);
__m512 tmp1601 = _mm512_shuffle_f32x4(tmp1652, tmp1660, 221);
tmp1554 = _mm512_shuffle_f32x4(tmp1654, tmp1662, 136);
__m512 tmp1602 = _mm512_shuffle_f32x4(tmp1654, tmp1662, 221);
tmp1555 = _mm512_shuffle_f32x4(tmp1656, tmp1664, 136);
__m512 tmp1603 = _mm512_shuffle_f32x4(tmp1656, tmp1664, 221);
tmp1556 = _mm512_shuffle_f32x4(tmp1658, tmp1666, 136);
__m512 tmp1604 = _mm512_shuffle_f32x4(tmp1658, tmp1666, 221);
(void)tmp1556;
(void)tmp1604;
__m512 tmp1609 = _mm512_add_ps(tmp1550, tmp1551);
__m512 tmp1620 = _mm512_add_ps(tmp1558, tmp1559);
__m512 tmp1608 = _mm512_add_ps(tmp1552, tmp1553);
__m512 tmp1619 = _mm512_add_ps(tmp1560, tmp1601);
__m512 tmp1614 = _mm512_sub_ps(tmp1552, tmp1553);
__m512 tmp1625 = _mm512_sub_ps(tmp1560, tmp1601);
__m512 tmp1613 = _mm512_sub_ps(tmp1550, tmp1551);
__m512 tmp1624 = _mm512_sub_ps(tmp1558, tmp1559);
__m512 tmp1610 = _mm512_add_ps(tmp1554, tmp1555);
__m512 tmp1621 = _mm512_add_ps(tmp1602, tmp1603);
__m512 tmp1615 = _mm512_sub_ps(tmp1554, tmp1555);
__m512 tmp1626 = _mm512_sub_ps(tmp1602, tmp1603);
__m512 tmp1612 = _mm512_fmadd_ps(tmp1614, _mm512_set1_ps(2e+00f), tmp1613);
__m512 tmp1623 = _mm512_fmadd_ps(tmp1625, _mm512_set1_ps(2e+00f), tmp1624);
__m512 tmp1607 = _mm512_add_ps(tmp1608, tmp1609);
__m512 tmp1618 = _mm512_add_ps(tmp1619, tmp1620);
__m512 tmp1611 = _mm512_fmadd_ps(tmp1615, _mm512_set1_ps(1.6e+01f), tmp1612);
__m512 tmp1622 = _mm512_fmadd_ps(tmp1626, _mm512_set1_ps(1.6e+01f), tmp1623);
__m512 tmp1606 = _mm512_add_ps(tmp1607, tmp1549);
__m512 tmp1617 = _mm512_add_ps(tmp1618, tmp1557);
__m512 tmp1605 = _mm512_fmadd_ps(tmp1610, _mm512_set1_ps(3.2e+01f), tmp1606);
__m512 tmp1616 = _mm512_fmadd_ps(tmp1621, _mm512_set1_ps(3.2e+01f), tmp1617);
__m512 out265 = tmp1605;
__m512 out267 = tmp1616;
__m512 out266 = tmp1611;
__m512 out268 = tmp1622;
_mm512_mask_storeu_ps(datPtr3+0+619200*i11+100*toH2+4*toW2+3200*k12+800*l7, 4095, out265);
_mm512_mask_storeu_ps(datPtr3+48+619200*i11+100*toH2+4*toW2+3200*k12+800*l7, 127, out267);
_mm512_mask_storeu_ps(datPtr3+100+619200*i11+100*toH2+4*toW2+3200*k12+800*l7, 4095, out266);
_mm512_mask_storeu_ps(datPtr3+148+619200*i11+100*toH2+4*toW2+3200*k12+800*l7, 127, out268);
}
if (k12 >= kk4) return;
}
ptrdiff_t l8 = 0;
for (; l8 != 2; ++l8) {
__m512 sf113 = _mm512_loadu_ps(sfPtr3+0+1981440*i11+297216*j7+1024*k12+256*l8);
__m512 sf114 = _mm512_loadu_ps(sfPtr3+128+1981440*i11+297216*j7+1024*k12+256*l8);
__m512 in225 = _mm512_shuffle_f32x4(sf113, sf114, 68);
__m512 in226 = _mm512_shuffle_f32x4(sf113, sf114, 238);
__m512 sf115 = _mm512_loadu_ps(sfPtr3+64+1981440*i11+297216*j7+1024*k12+256*l8);
__m512 sf116 = _mm512_loadu_ps(sfPtr3+192+1981440*i11+297216*j7+1024*k12+256*l8);
__m512 in233 = _mm512_shuffle_f32x4(sf115, sf116, 68);
__m512 in234 = _mm512_shuffle_f32x4(sf115, sf116, 238);
__m512 sf117 = _mm512_loadu_ps(sfPtr3+495360+1981440*i11+297216*j7+1024*k12+256*l8);
__m512 sf118 = _mm512_loadu_ps(sfPtr3+495488+1981440*i11+297216*j7+1024*k12+256*l8);
__m512 in227 = _mm512_shuffle_f32x4(sf117, sf118, 68);
__m512 in228 = _mm512_shuffle_f32x4(sf117, sf118, 238);
__m512 sf119 = _mm512_loadu_ps(sfPtr3+495424+1981440*i11+297216*j7+1024*k12+256*l8);
__m512 sf120 = _mm512_loadu_ps(sfPtr3+495552+1981440*i11+297216*j7+1024*k12+256*l8);
__m512 in235 = _mm512_shuffle_f32x4(sf119, sf120, 68);
__m512 in236 = _mm512_shuffle_f32x4(sf119, sf120, 238);
__m512 sf121 = _mm512_loadu_ps(sfPtr3+990720+1981440*i11+297216*j7+1024*k12+256*l8);
__m512 sf122 = _mm512_loadu_ps(sfPtr3+990848+1981440*i11+297216*j7+1024*k12+256*l8);
__m512 in229 = _mm512_shuffle_f32x4(sf121, sf122, 68);
__m512 in230 = _mm512_shuffle_f32x4(sf121, sf122, 238);
__m512 sf123 = _mm512_loadu_ps(sfPtr3+990784+1981440*i11+297216*j7+1024*k12+256*l8);
__m512 sf124 = _mm512_loadu_ps(sfPtr3+990912+1981440*i11+297216*j7+1024*k12+256*l8);
__m512 in237 = _mm512_shuffle_f32x4(sf123, sf124, 68);
__m512 in238 = _mm512_shuffle_f32x4(sf123, sf124, 238);
__m512 sf125 = _mm512_loadu_ps(sfPtr3+1486080+1981440*i11+297216*j7+1024*k12+256*l8);
__m512 sf126 = _mm512_loadu_ps(sfPtr3+1486208+1981440*i11+297216*j7+1024*k12+256*l8);
__m512 in231 = _mm512_shuffle_f32x4(sf125, sf126, 68);
__m512 in232 = _mm512_shuffle_f32x4(sf125, sf126, 238);
__m512 sf127 = _mm512_loadu_ps(sfPtr3+1486144+1981440*i11+297216*j7+1024*k12+256*l8);
__m512 sf128 = _mm512_loadu_ps(sfPtr3+1486272+1981440*i11+297216*j7+1024*k12+256*l8);
__m512 in239 = _mm512_shuffle_f32x4(sf127, sf128, 68);
__m512 in240 = _mm512_shuffle_f32x4(sf127, sf128, 238);
__m512 tmp1683 = _mm512_add_ps(in226, in227);
__m512 tmp1703 = _mm512_add_ps(in234, in235);
__m512 tmp1682 = _mm512_add_ps(in228, in229);
__m512 tmp1702 = _mm512_add_ps(in236, in237);
__m512 tmp1688 = _mm512_sub_ps(in228, in229);
__m512 tmp1708 = _mm512_sub_ps(in236, in237);
__m512 tmp1687 = _mm512_sub_ps(in226, in227);
__m512 tmp1707 = _mm512_sub_ps(in234, in235);
__m512 tmp1684 = _mm512_add_ps(in230, in231);
__m512 tmp1704 = _mm512_add_ps(in238, in239);
__m512 tmp1689 = _mm512_sub_ps(in230, in231);
__m512 tmp1709 = _mm512_sub_ps(in238, in239);
__m512 tmp1686 = _mm512_fmadd_ps(tmp1688, _mm512_set1_ps(2e+00f), tmp1687);
__m512 tmp1706 = _mm512_fmadd_ps(tmp1708, _mm512_set1_ps(2e+00f), tmp1707);
__m512 tmp1693 = _mm512_fmadd_ps(tmp1688, _mm512_set1_ps(8e+00f), tmp1687);
__m512 tmp1713 = _mm512_fmadd_ps(tmp1708, _mm512_set1_ps(8e+00f), tmp1707);
__m512 tmp1681 = _mm512_add_ps(tmp1682, tmp1683);
__m512 tmp1701 = _mm512_add_ps(tmp1702, tmp1703);
__m512 tmp1685 = _mm512_fmadd_ps(tmp1689, _mm512_set1_ps(1.6e+01f), tmp1686);
__m512 tmp1705 = _mm512_fmadd_ps(tmp1709, _mm512_set1_ps(1.6e+01f), tmp1706);
__m512 tmp1692 = _mm512_fmadd_ps(tmp1689, _mm512_set1_ps(4e+00f), tmp1693);
__m512 tmp1712 = _mm512_fmadd_ps(tmp1709, _mm512_set1_ps(4e+00f), tmp1713);
__m512 tmp1698 = _mm512_add_ps(tmp1689, tmp1687);
__m512 tmp1718 = _mm512_add_ps(tmp1709, tmp1707);
__m512 tmp1691 = _mm512_fmadd_ps(tmp1682, _mm512_set1_ps(4e+00f), tmp1683);
__m512 tmp1711 = _mm512_fmadd_ps(tmp1702, _mm512_set1_ps(4e+00f), tmp1703);
__m512 tmp1695 = _mm512_fmadd_ps(tmp1682, _mm512_set1_ps(1.6e+01f), tmp1683);
__m512 tmp1715 = _mm512_fmadd_ps(tmp1702, _mm512_set1_ps(1.6e+01f), tmp1703);
__m512 tmp1680 = _mm512_add_ps(tmp1681, in225);
__m512 tmp1700 = _mm512_add_ps(tmp1701, in233);
__m512 tmp1697 = _mm512_add_ps(tmp1698, in232);
__m512 tmp1717 = _mm512_add_ps(tmp1718, in240);
__m512 tmp1679 = _mm512_fmadd_ps(tmp1684, _mm512_set1_ps(3.2e+01f), tmp1680);
__m512 tmp1699 = _mm512_fmadd_ps(tmp1704, _mm512_set1_ps(3.2e+01f), tmp1700);
__m512 tmp1690 = _mm512_fmadd_ps(tmp1684, _mm512_set1_ps(8e+00f), tmp1691);
__m512 tmp1710 = _mm512_fmadd_ps(tmp1704, _mm512_set1_ps(8e+00f), tmp1711);
__m512 tmp1696 = _mm512_fmadd_ps(tmp1688, _mm512_set1_ps(3.2e+01f), tmp1697);
__m512 tmp1716 = _mm512_fmadd_ps(tmp1708, _mm512_set1_ps(3.2e+01f), tmp1717);
__m512 tmp1694 = _mm512_fmadd_ps(tmp1684, _mm512_set1_ps(2e+00f), tmp1695);
__m512 tmp1714 = _mm512_fmadd_ps(tmp1704, _mm512_set1_ps(2e+00f), tmp1715);
__m512 tmp1667 = tmp1679;
__m512 tmp1673 = tmp1699;
__m512 tmp1668 = tmp1685;
__m512 tmp1674 = tmp1705;
__m512 tmp1669 = tmp1690;
__m512 tmp1675 = tmp1710;
__m512 tmp1670 = tmp1692;
__m512 tmp1676 = tmp1712;
__m512 tmp1671 = tmp1694;
__m512 tmp1677 = tmp1714;
__m512 tmp1672 = tmp1696;
__m512 tmp1678 = tmp1716;
__m512 tmp1745 = _mm512_unpacklo_ps(tmp1667, tmp1668);
__m512 tmp1746 = _mm512_unpackhi_ps(tmp1667, tmp1668);
__m512 tmp1747 = _mm512_unpacklo_ps(tmp1669, tmp1670);
__m512 tmp1748 = _mm512_unpackhi_ps(tmp1669, tmp1670);
__m512 tmp1749 = _mm512_unpacklo_ps(tmp1671, tmp1672);
__m512 tmp1750 = _mm512_unpackhi_ps(tmp1671, tmp1672);
__m512 tmp1751 = _mm512_unpacklo_ps(tmp1673, tmp1674);
__m512 tmp1752 = _mm512_unpackhi_ps(tmp1673, tmp1674);
__m512 tmp1753 = _mm512_unpacklo_ps(tmp1675, tmp1676);
__m512 tmp1754 = _mm512_unpackhi_ps(tmp1675, tmp1676);
__m512 tmp1755 = _mm512_unpacklo_ps(tmp1677, tmp1678);
__m512 tmp1756 = _mm512_unpackhi_ps(tmp1677, tmp1678);
__m512 tmp1757 = _mm512_shuffle_ps(tmp1745, tmp1747, 68);
__m512 tmp1758 = _mm512_shuffle_ps(tmp1745, tmp1747, 238);
__m512 tmp1759 = _mm512_shuffle_ps(tmp1746, tmp1748, 68);
__m512 tmp1760 = _mm512_shuffle_ps(tmp1746, tmp1748, 238);
__m512 tmp1761 = _mm512_shuffle_ps(tmp1749, tmp1751, 68);
__m512 tmp1762 = _mm512_shuffle_ps(tmp1749, tmp1751, 238);
__m512 tmp1763 = _mm512_shuffle_ps(tmp1750, tmp1752, 68);
__m512 tmp1764 = _mm512_shuffle_ps(tmp1750, tmp1752, 238);
__m512 tmp1765 = _mm512_shuffle_ps(tmp1753, tmp1755, 68);
__m512 tmp1766 = _mm512_shuffle_ps(tmp1753, tmp1755, 238);
__m512 tmp1767 = _mm512_shuffle_ps(tmp1754, tmp1756, 68);
__m512 tmp1768 = _mm512_shuffle_ps(tmp1754, tmp1756, 238);
__m512 tmp1769 = _mm512_shuffle_f32x4(tmp1757, tmp1761, 136);
__m512 tmp1770 = _mm512_shuffle_f32x4(tmp1757, tmp1761, 221);
__m512 tmp1771 = _mm512_shuffle_f32x4(tmp1758, tmp1762, 136);
__m512 tmp1772 = _mm512_shuffle_f32x4(tmp1758, tmp1762, 221);
__m512 tmp1773 = _mm512_shuffle_f32x4(tmp1759, tmp1763, 136);
__m512 tmp1774 = _mm512_shuffle_f32x4(tmp1759, tmp1763, 221);
__m512 tmp1775 = _mm512_shuffle_f32x4(tmp1760, tmp1764, 136);
__m512 tmp1776 = _mm512_shuffle_f32x4(tmp1760, tmp1764, 221);
__m512 tmp1777 = _mm512_shuffle_f32x4(tmp1765, tmp1765, 136);
__m512 tmp1778 = _mm512_shuffle_f32x4(tmp1765, tmp1765, 221);
__m512 tmp1779 = _mm512_shuffle_f32x4(tmp1766, tmp1766, 136);
__m512 tmp1780 = _mm512_shuffle_f32x4(tmp1766, tmp1766, 221);
__m512 tmp1781 = _mm512_shuffle_f32x4(tmp1767, tmp1767, 136);
__m512 tmp1782 = _mm512_shuffle_f32x4(tmp1767, tmp1767, 221);
__m512 tmp1783 = _mm512_shuffle_f32x4(tmp1768, tmp1768, 136);
__m512 tmp1784 = _mm512_shuffle_f32x4(tmp1768, tmp1768, 221);
tmp1667 = _mm512_shuffle_f32x4(tmp1769, tmp1777, 136);
tmp1675 = _mm512_shuffle_f32x4(tmp1769, tmp1777, 221);
tmp1668 = _mm512_shuffle_f32x4(tmp1771, tmp1779, 136);
tmp1676 = _mm512_shuffle_f32x4(tmp1771, tmp1779, 221);
tmp1669 = _mm512_shuffle_f32x4(tmp1773, tmp1781, 136);
tmp1677 = _mm512_shuffle_f32x4(tmp1773, tmp1781, 221);
tmp1670 = _mm512_shuffle_f32x4(tmp1775, tmp1783, 136);
tmp1678 = _mm512_shuffle_f32x4(tmp1775, tmp1783, 221);
tmp1671 = _mm512_shuffle_f32x4(tmp1770, tmp1778, 136);
__m512 tmp1719 = _mm512_shuffle_f32x4(tmp1770, tmp1778, 221);
tmp1672 = _mm512_shuffle_f32x4(tmp1772, tmp1780, 136);
__m512 tmp1720 = _mm512_shuffle_f32x4(tmp1772, tmp1780, 221);
tmp1673 = _mm512_shuffle_f32x4(tmp1774, tmp1782, 136);
__m512 tmp1721 = _mm512_shuffle_f32x4(tmp1774, tmp1782, 221);
tmp1674 = _mm512_shuffle_f32x4(tmp1776, tmp1784, 136);
__m512 tmp1722 = _mm512_shuffle_f32x4(tmp1776, tmp1784, 221);
(void)tmp1674;
(void)tmp1722;
__m512 tmp1727 = _mm512_add_ps(tmp1668, tmp1669);
__m512 tmp1738 = _mm512_add_ps(tmp1676, tmp1677);
__m512 tmp1726 = _mm512_add_ps(tmp1670, tmp1671);
__m512 tmp1737 = _mm512_add_ps(tmp1678, tmp1719);
__m512 tmp1732 = _mm512_sub_ps(tmp1670, tmp1671);
__m512 tmp1743 = _mm512_sub_ps(tmp1678, tmp1719);
__m512 tmp1731 = _mm512_sub_ps(tmp1668, tmp1669);
__m512 tmp1742 = _mm512_sub_ps(tmp1676, tmp1677);
__m512 tmp1728 = _mm512_add_ps(tmp1672, tmp1673);
__m512 tmp1739 = _mm512_add_ps(tmp1720, tmp1721);
__m512 tmp1733 = _mm512_sub_ps(tmp1672, tmp1673);
__m512 tmp1744 = _mm512_sub_ps(tmp1720, tmp1721);
__m512 tmp1730 = _mm512_fmadd_ps(tmp1732, _mm512_set1_ps(2e+00f), tmp1731);
__m512 tmp1741 = _mm512_fmadd_ps(tmp1743, _mm512_set1_ps(2e+00f), tmp1742);
__m512 tmp1725 = _mm512_add_ps(tmp1726, tmp1727);
__m512 tmp1736 = _mm512_add_ps(tmp1737, tmp1738);
__m512 tmp1729 = _mm512_fmadd_ps(tmp1733, _mm512_set1_ps(1.6e+01f), tmp1730);
__m512 tmp1740 = _mm512_fmadd_ps(tmp1744, _mm512_set1_ps(1.6e+01f), tmp1741);
__m512 tmp1724 = _mm512_add_ps(tmp1725, tmp1667);
__m512 tmp1735 = _mm512_add_ps(tmp1736, tmp1675);
__m512 tmp1723 = _mm512_fmadd_ps(tmp1728, _mm512_set1_ps(3.2e+01f), tmp1724);
__m512 tmp1734 = _mm512_fmadd_ps(tmp1739, _mm512_set1_ps(3.2e+01f), tmp1735);
__m512 out269 = tmp1723;
__m512 out271 = tmp1734;
__m512 out270 = tmp1729;
__m512 out272 = tmp1740;
_mm512_mask_storeu_ps(datPtr3+0+619200*i11+100*toH2+4*toW2+3200*k12+800*l8, 4095, out269);
_mm512_mask_storeu_ps(datPtr3+48+619200*i11+100*toH2+4*toW2+3200*k12+800*l8, 127, out271);
_mm512_mask_storeu_ps(datPtr3+100+619200*i11+100*toH2+4*toW2+3200*k12+800*l8, 4095, out270);
_mm512_mask_storeu_ps(datPtr3+148+619200*i11+100*toH2+4*toW2+3200*k12+800*l8, 127, out272);
}
if (j7 >= last3) return;
++j7;
}

// Runs Example12ThreeConsumeSums1Callee1 on the given thread team.
// The task is described as 3-dimensional (nd1 = 3) with hull1 = {7, 2, 1};
// tensors8 is handed to the callee untouched through the task's any1 field.
static void Example12ThreeConsumeSums1(Example12ThreaderTeam1* team17, char** tensors8) {
    Example12ThreaderTask1 job;
    job.nd1 = 3;
    job.hull1[0] = 7;
    job.hull1[1] = 2;
    job.hull1[2] = 1;
    job.any1 = tensors8;
    job.callee1 = Example12ThreeConsumeSums1Callee1;
    Example12ThreaderDo1(team17, &job);
}

// Network handle produced by Example12NetCreate and released by
// Example12NetDestroy.
struct Example12Net {
// Pointer exactly as returned by malloc; this is what gets passed to free().
char* alloc1;
// alloc1 rounded up to the next 64-byte boundary; Example12NetCreate hands
// this address to Example12ThreeArrangeFilts1 and Example12EngineInference
// reads from it.
char* align1;
};

// Releases a net obtained from Example12NetCreate: first the backing
// buffer (alloc1), then the handle itself.
//
// A null net2 is accepted and ignored, mirroring free(NULL); previously a
// null handle was dereferenced.
void Example12NetDestroy(Example12Net* net2) {
    if (!net2) {
        return;
    }
    free(net2->alloc1);
    free(net2);
}

// Builds a net from the caller's parameters.
//
// net1     receives the new handle on success (left untouched on failure).
// params1  supplies outWeights and outBiases, which are passed to
//          Example12ThreeArrangeFilts1 together with the aligned buffer.
// threads1 is forwarded to Example12ThreaderCreate1 for the temporary team
//          used only during this call.
//
// Returns 0 on success, otherwise an error message string produced by
// Example12Errmsg1 or Example12ThreaderCreate1.
char* Example12NetCreate(
    Example12Net** net1,
    Example12Params* params1,
    ptrdiff_t threads1
) {
    if (__builtin_expect(!__builtin_cpu_supports("avx512f"), 0)) {
        return Example12Errmsg1(__LINE__, "CPU does not support AVX512F");
    }
    char* rawBuf = malloc(53505151);
    if (__builtin_expect(rawBuf == 0, 0)) {
        return Example12Errmsg1(__LINE__, "errno %d", errno);
    }
    // Round the buffer start up to the next multiple of 64 bytes.
    char* alignedBuf = (void*)(((size_t)rawBuf+63)&-64);
    Example12ThreaderTeam1* helpers = 0;
    char* teamErr = Example12ThreaderCreate1(&helpers, threads1);
    if (__builtin_expect(teamErr != 0, 0)) {
        free(rawBuf);
        return teamErr;
    }
    char* filtTensors[3];
    filtTensors[0] = (char*)params1->outWeights;
    filtTensors[1] = (char*)params1->outBiases;
    filtTensors[2] = alignedBuf+0;
    Example12ThreeArrangeFilts1(helpers, filtTensors);
    // The team is only needed for the arrangement step above.
    Example12ThreaderDestroy1(helpers);
    Example12Net* created = malloc(sizeof(Example12Net));
    if (__builtin_expect(created == 0, 0)) {
        // Build the message first so errno is read before free() runs.
        char* oomMsg = Example12Errmsg1(__LINE__, "errno %d", errno);
        free(rawBuf);
        return oomMsg;
    }
    created->alloc1 = rawBuf;
    created->align1 = alignedBuf;
    *net1 = created;
    return 0;
}

// Inference engine handle produced by Example12EngineCreate and released by
// Example12EngineDestroy.
struct Example12Engine {
// Net passed to Example12EngineCreate; the pointer is stored as given and
// is not freed by Example12EngineDestroy.
Example12Net* net3;
// Thread team created in Example12EngineCreate and destroyed in
// Example12EngineDestroy.
Example12ThreaderTeam1* team11;
// Pointer exactly as returned by malloc; this is what gets passed to free().
char* alloc2;
// alloc2 rounded up to the next 64-byte boundary; used as the working
// buffer in Example12EngineInference.
char* align2;
};

// Forwards to Example12ThreaderPthreadT1 for the engine's thread team,
// passing idx2 and the output location to1 through unchanged, and returns
// whatever that call returns.
char* Example12EnginePthreadT(
    Example12Engine* eng2,
    ptrdiff_t idx2,
    pthread_t* to1
) {
    Example12ThreaderTeam1* engineTeam = eng2->team11;
    return Example12ThreaderPthreadT1(to1, engineTeam, idx2);
}

// Releases an engine obtained from Example12EngineCreate: destroys its
// thread team, frees its working buffer (alloc2), then frees the handle.
// The net the engine refers to is not touched.
//
// A null eng3 is accepted and ignored, mirroring free(NULL); previously a
// null handle was dereferenced.
void Example12EngineDestroy(Example12Engine* eng3) {
    if (!eng3) {
        return;
    }
    Example12ThreaderDestroy1(eng3->team11);
    free(eng3->alloc2);
    free(eng3);
}

// Builds an engine bound to an existing net.
//
// eng4     receives the new handle on success (left untouched on failure).
// net4     is stored in the engine as-is.
// threads2 is forwarded to Example12ThreaderCreate1 for the engine's team.
//
// Returns 0 on success, otherwise an error message string produced by
// Example12Errmsg1 or Example12ThreaderCreate1.
char* Example12EngineCreate(
    Example12Engine** eng4,
    Example12Net* net4,
    ptrdiff_t threads2
) {
    Example12Engine* created = malloc(sizeof(Example12Engine));
    if (__builtin_expect(created == 0, 0)) {
        return Example12Errmsg1(__LINE__, "errno %d", errno);
    }
    char* rawBuf = malloc(3363903);
    if (__builtin_expect(rawBuf == 0, 0)) {
        // Build the message first so errno is read before free() runs.
        char* oomMsg = Example12Errmsg1(__LINE__, "errno %d", errno);
        free(created);
        return oomMsg;
    }
    created->net3 = net4;
    created->alloc2 = rawBuf;
    // Round the buffer start up to the next multiple of 64 bytes.
    created->align2 = (void*)(((size_t)rawBuf+63)&-64);
    char* teamErr = Example12ThreaderCreate1(&created->team11, threads2);
    if (__builtin_expect(teamErr != 0, 0)) {
        free(created);
        free(rawBuf);
        return teamErr;
    }
    *eng4 = created;
    return 0;
}

// Runs one inference pass on the engine's thread team in three steps:
//   1. Example12ThreeArrangeDats1: inData -> engine buffer at offset 0
//   2. Example12ThreeProduceSums1: net buffer + engine buffer at offset 0
//                                  -> engine buffer at offset 1382400
//   3. Example12ThreeConsumeSums1: engine buffer at offset 1382400 -> outData
// The arrows follow the tensor order passed to each stage.
void Example12EngineInference(
    Example12Engine* eng1,
    float* inData,
    float* outData
) {
    Example12ThreaderTeam1* workers = eng1->team11;
    char* netBuf = eng1->net3->align1;
    char* engBuf = eng1->align2;
    char* firstRegion = engBuf+0;
    char* secondRegion = engBuf+1382400;

    char* arrangeArgs[2];
    arrangeArgs[0] = (char*)inData;
    arrangeArgs[1] = firstRegion;
    Example12ThreeArrangeDats1(workers, arrangeArgs);

    char* produceArgs[3];
    produceArgs[0] = netBuf+0;
    produceArgs[1] = firstRegion;
    produceArgs[2] = secondRegion;
    Example12ThreeProduceSums1(workers, produceArgs);

    char* consumeArgs[2];
    consumeArgs[0] = secondRegion;
    consumeArgs[1] = (char*)outData;
    Example12ThreeConsumeSums1(workers, consumeArgs);
}

// End of file.

Top