NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=Example11 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=in Channels=25 Height=11 Width=4
Conv FromTensor=in ToTensor=out ToChannels=219 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
Output FromTensor=out

Top || Output Example11.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(Example11Params);
// Example11Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "r");
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

typedef struct Example11Params Example11Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// Example11Params* params = malloc(sizeof(Example11Params));
//
// ... Load params (read from a file, perhaps) ...
//
// Example11Net* net; // For example, 4 threads:
// char* err = Example11NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// Example11NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

typedef struct Example11Net Example11Net;

// Creates a Net from the trained parameters and stores it through the
// first argument. The Params struct is not modified and is no longer
// needed once this returns. `threads` is the number of temporary
// threads used for creation (they are not used for inference).
// Returns 0 on success. On failure returns an error string that the
// caller must free(), and the Net pointer is left unmodified.
char* Example11NetCreate(
Example11Net**,
Example11Params*,
ptrdiff_t threads
);

// Frees the memory held by a Net. Call it only after all inference
// that depends on the Net is complete.
void Example11NetDestroy(Example11Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// Example11Net* net;
//
// ... Create net ...
//
// Example11Engine* engine; // For example, 4 inference threads:
// char* err = Example11EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// Example11EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = Example11EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* inData = malloc(sizeof(float)*25*11*4);
// float* outData = malloc(sizeof(float)*219*9*2);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// Example11EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// inData, // The tensor arguments are sorted by name.
// outData
// );
//
// ... Read the output floats ...
//
// }
//
// free(inData);
// free(outData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

typedef struct Example11Engine Example11Engine;

// Creates an Engine that performs inference against the given Net and
// stores it through the first argument. `threads` is the number of
// inference threads the Engine owns.
// Returns 0 on success. On failure returns an error string that the
// caller must free(), and the Engine pointer is left unmodified.
char* Example11EngineCreate(
Example11Engine**,
Example11Net*,
ptrdiff_t threads
);

// Writes to `*to` the pthread_t of the Engine's inference thread with
// index `threadIdx` (threads are indexed 0, 1, ..., N-1).
// Returns 0 on success; it can only fail if the thread index is
// invalid, in which case an error string is returned
// (presumably to be released with free(), like the other error
// strings in this API -- confirm against the implementation).
char* Example11EnginePthreadT(
Example11Engine*,
ptrdiff_t threadIdx,
pthread_t* to
);

// Runs one inference. Reads the input tensor `inData` (25x11x4 floats)
// and writes the output tensor `outData` (219x9x2 floats). Both are
// caller-owned, CHW, 32-bit float, fully packed. Tensor parameters are
// ordered by name. This function cannot fail.
void Example11EngineInference(
Example11Engine*,
float* inData,
float* outData
);

// Terminates the Engine's inference threads and frees its memory.
// The Net it points to is not destroyed here; destroy it separately.
void Example11EngineDestroy(Example11Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

// Trained parameters for the single Conv layer writing tensor "out"
// (25 input channels, 219 output channels, 3x3 filter, one group).
// The struct is declared packed so its bytes are exactly the two
// arrays back to back, which is what allows loading it with a single
// fread of sizeof(Example11Params) bytes.
struct Example11Params {
// One bias per output channel.
float outBiases[219]; // 1x219x1x1
// 219*25*3*3 = 49275 filter weights, output channel outermost.
float outWeights[49275]; // 219x25x3x3
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output Example11.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f Example11.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "Example11.h"

// Builds a heap-allocated error message of the form
//   "Example11: line <lineNum1>: <formatted text>"
//
// lineNum1: source line number to embed (callers pass __LINE__).
// format1:  printf-style format for the rest of the message, followed
//           by its arguments.
//
// Returns a NUL-terminated string of at most 276 characters; longer
// text is truncated. Ownership passes to the caller, who must free()
// it. The result is never null: callers in this file treat a null
// char* as "no error", so if the buffer cannot be allocated the
// process is aborted rather than returning null.
static char* Example11Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
    enum { cap1 = 277 };
    char* msg1 = malloc(cap1);
    if (!msg1) {
        // Previously the null pointer went straight into sprintf
        // (undefined behavior). Fail deterministically instead.
        abort();
    }
    int step1 = snprintf(msg1, cap1, "Example11: line %td: ", lineNum1);
    // Keep the write offset inside the buffer even if the prefix
    // failed to format or was (unexpectedly) truncated.
    if (step1 < 0) {
        step1 = 0;
    } else if (step1 > cap1-1) {
        step1 = cap1-1;
    }
    va_list ap1;
    va_start(ap1, format1);
    vsnprintf(msg1+step1, cap1-step1, format1, ap1);
    va_end(ap1);
    return msg1;
}

// Forward declarations for the thread-team ("threader") types used below.
// Task:   a unit of parallel work (callee + iteration space).
typedef struct Example11ThreaderTask1 Example11ThreaderTask1;
// Callee: function run once per point; receives the task and a pointer
//         to the point's int64_t coordinates.
typedef void (*Example11ThreaderCallee1)(Example11ThreaderTask1*, int64_t*);
// Hub:    state shared by the whole team (completion count, status bitmap).
typedef struct Example11ThreaderHub1 Example11ThreaderHub1;
// Node:   per-thread state.
typedef struct Example11ThreaderNode1 Example11ThreaderNode1;
// Unwind: bookkeeping of what has been initialized, for teardown.
typedef struct Example11ThreaderUnwind1 Example11ThreaderUnwind1;
// Team:   the thread pool itself.
typedef struct Example11ThreaderTeam1 Example11ThreaderTeam1;

// Describes one parallel job: a callee that is invoked once for every
// point of an nd1-dimensional grid whose extents are hull1[0..nd1-1].
struct Example11ThreaderTask1 {
// Function invoked for each point.
Example11ThreaderCallee1 callee1;
// Opaque pointer handed to the callee through the task; the threader
// itself never dereferences it.
void* any1;
// Number of dimensions of hull1 that are in use (at most 4).
ptrdiff_t nd1;
// Extent of each dimension; the total number of points is the product
// of the first nd1 entries.
int64_t hull1[4];
};

// Team-wide shared state. All fields after mut1/cond1 are accessed
// with mut1 held.
struct Example11ThreaderHub1 {
// Protects the fields below.
pthread_mutex_t mut1;
// Signaled by the worker that brings pending1 to zero; the thread that
// submitted the task waits on it.
pthread_cond_t cond1;
// Number of workers that have not yet finished the current task.
ptrdiff_t pending1;
// Scan cursor shared by workers looking for a node to help:
// offset1 is an index into status1, mask1 selects the bits of that
// word that are still to be considered.
ptrdiff_t offset1;
long mask1;
// Bitmap with one bit per node index. Every word is set to all ones
// when a task is submitted, and a node's bit is cleared once its
// points are exhausted.
long status1[];
};

// Per-thread state. np1, pt1 and task1 are accessed with mut2 held.
// The struct is aligned to 64 bytes (presumably to keep each node on
// its own cache line -- the code only shows the alignment attribute).
struct Example11ThreaderNode1 {
// Protects np1, pt1 and task1.
pthread_mutex_t mut2;
// Number of points still assigned to this node. A negative value,
// together with a non-null task1, tells the thread to exit.
int64_t np1;
// Coordinates of the next point this node will hand out.
int64_t pt1[4];
// Task posted to this node, or 0 when nothing is posted.
Example11ThreaderTask1* task1;
// Signaled after task1 has been set.
pthread_cond_t cond2;
// Back pointer to the owning team.
Example11ThreaderTeam1* team1;
// Identifier of the thread serving this node.
pthread_t thr1;
} __attribute__((aligned(64)));

// Records how far team construction got, so that teardown releases
// exactly what was initialized (also after a partial failure).
struct Example11ThreaderUnwind1 {
// Number of leading nodes whose thread was created and must be joined.
ptrdiff_t join1;
// Number of leading nodes whose condition variable was initialized.
ptrdiff_t nodeConds1;
// Number of leading nodes whose mutex was initialized.
ptrdiff_t nodeMuts1;
// Nonzero once the hub's condition variable has been initialized.
ptrdiff_t hubCond1;
// Nonzero once the hub's mutex has been initialized.
ptrdiff_t hubMut1;
// Unaligned pointers as returned by malloc, to be passed to free().
void* nodes1;
void* hub1;
};

// A pool of nt1 worker threads plus their shared hub.
struct Example11ThreaderTeam1 {
// Number of threads (and nodes) in the team.
ptrdiff_t nt1;
// 64-byte-aligned pointer into the hub allocation.
Example11ThreaderHub1* hub2;
// 64-byte-aligned pointer to the array of nt1 nodes.
Example11ThreaderNode1* nodes2;
// Teardown bookkeeping (see Example11ThreaderDestroy1).
Example11ThreaderUnwind1 unwind1;
};

// Advances the mixed-radix coordinate pt2 by one step.
//
// nd2:   number of dimensions in use.
// hull2: extent of each dimension (the radix of each digit).
// pt2:   coordinate to advance in place; index 0 is the fastest digit.
//
// A digit that reaches its extent wraps to 0 and the increment carries
// into the next digit. If every digit wraps, pt2 ends up all zeros.
static void Example11ThreaderInc1(
    ptrdiff_t nd2,
    int64_t*restrict hull2,
    int64_t*restrict pt2
) {
    ptrdiff_t dim = 0;
    while (dim < nd2) {
        int64_t bumped = pt2[dim]+1;
        if (bumped != hull2[dim]) {
            // No wrap in this digit: store it and stop carrying.
            pt2[dim] = bumped;
            return;
        }
        pt2[dim] = 0;
        ++dim;
    }
}

// Writes the linear index val1 into pt3 as a mixed-radix coordinate.
//
// nd3:   number of dimensions in use.
// hull3: extent of each dimension (the radix of each digit).
// pt3:   output coordinate; index 0 receives the fastest digit.
// val1:  linear index to decompose.
//
// Digits beyond the point where the remaining value becomes zero are
// set to 0. Any quotient left over after nd3 digits is discarded.
static void Example11ThreaderPut1(
    ptrdiff_t nd3,
    int64_t*restrict hull3,
    int64_t*restrict pt3,
    int64_t val1
) {
    ptrdiff_t dim = 0;
    while (dim < nd3 && val1 != 0) {
        int64_t radix = hull3[dim];
        int64_t quotient = val1/radix;
        pt3[dim] = val1-quotient*radix;
        val1 = quotient;
        ++dim;
    }
    // Remaining (higher) digits are zero.
    while (dim < nd3) {
        pt3[dim] = 0;
        ++dim;
    }
}

// Adds the mixed-radix coordinate plus1 (and an initial carry) to pt4,
// in place.
//
// nd4:    number of dimensions in use.
// hull4:  extent of each dimension (the radix of each digit).
// pt4:    coordinate that is updated.
// plus1:  coordinate to add.
// carry2: carry into digit 0 (callers pass 0 or 1).
//
// Each digit is reduced by at most one subtraction of its extent, and
// the carry out of the last digit is dropped.
static void Example11ThreaderAdd1(
    ptrdiff_t nd4,
    int64_t*restrict hull4,
    int64_t*restrict pt4,
    int64_t*restrict plus1,
    int64_t carry2
) {
    ptrdiff_t dim = 0;
    while (dim < nd4) {
        int64_t radix = hull4[dim];
        int64_t digit = pt4[dim]+plus1[dim]+carry2;
        carry2 = !(digit < radix);
        if (carry2) {
            digit -= radix;
        }
        pt4[dim] = digit;
        ++dim;
    }
}

// Thread entry point for one worker (passed to pthread_create).
//
// arg1 is the worker's own Example11ThreaderNode1. The worker sleeps on
// its node's condition variable until a task is posted, runs the points
// assigned to its own node, then scans the hub's status bitmap for
// other nodes that still have points and helps drain them. When no such
// node is left it decrements the hub's pending count (signaling the
// submitter when it reaches zero) and goes back to waiting.
//
// Returns 0 when it finds a posted task together with a negative point
// count, which is how Example11ThreaderDestroy1 requests shutdown.
//
// Every pthread call is wrapped in a loop that retries until the call
// returns 0.
static void* Example11ThreaderMain1(void* arg1) {
Example11ThreaderNode1* node1 = arg1;
Example11ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
Example11ThreaderHub1* hub3 = team2->hub2;
Example11ThreaderNode1* nodes3 = team2->nodes2;
// Index of this node within the team's node array.
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
// Invariant at the top of each iteration: node1->mut2 is held.
for (; ; ) {
Example11ThreaderTask1* task2 = node1->task1;
if (!task2) {
// Nothing posted: sleep, then re-check (covers spurious wakeups).
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
// Shutdown request. This is checked before task2 is dereferenced,
// because the shutdown path posts a dummy non-null task pointer.
if (np2 < 0) {
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
// Consume the posting so the next iteration waits for a new task.
node1->task1 = 0;
Example11ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Drain this node's own points one at a time. The count is re-read
// under the lock each time because other workers may take points
// from this node as well.
for (; np2; np2 = node1->np1) {
// Claim the current point, then advance the node to the next one.
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
Example11ThreaderInc1(nd5, task2->hull1, node1->pt1);
// The callee runs without the node lock held.
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// Own points are exhausted: clear this node's bit in the bitmap.
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
// Resume the shared scan from the hub's cursor.
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
// No candidate bits left in this word; move to the next word.
++offset2;
mask2 = -1;
continue;
}
// Node index of the lowest candidate bit.
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
// Bit index nt2 belongs to no worker (roles are below nt2), so it is
// never cleared and serves as the end-of-bitmap marker. On reaching it,
// restart from word 0 once; reaching it a second time without finding
// a node to help ends the scan.
if (target1 == nt2) {
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
// Isolate the lowest set bit.
hand1 &= -hand1;
// Publish the cursor with the chosen bit removed from the mask, so the
// next scanner starts after this target.
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
Example11ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
// Help the target node: same claim/advance/run loop as above, but on
// node2's point counter and coordinates.
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
Example11ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// The target has no points left: clear its bit, then continue the scan
// from wherever the shared cursor is now.
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// This worker is done with the task. The worker that brings the count
// to zero wakes the submitter waiting on the hub's condition variable.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
// Re-acquire the node lock to restore the loop invariant.
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Tears down a team, fully or partially constructed.
//
// team3: team to destroy; a null pointer is accepted and ignored.
//
// The counts and flags in team3->unwind1 say how much was initialized,
// so only those threads are stopped and joined and only those
// mutexes/condition variables are destroyed. Threads are asked to exit
// by posting a dummy non-null task with a negative point count.
// Finally the node array, the hub and the team itself are freed.
// Each pthread call is retried until it returns 0.
static void Example11ThreaderDestroy1(Example11ThreaderTeam1* team3) {
    if (team3 == 0) {
        return;
    }
    Example11ThreaderNode1* workers = team3->nodes2;

    // Ask every running thread to exit, then wait for all of them.
    ptrdiff_t running = team3->unwind1.join1;
    for (ptrdiff_t i = 0; i != running; ++i) {
        Example11ThreaderNode1* w = workers+i;
        while (__builtin_expect(pthread_mutex_lock(&w->mut2), 0)) {}
        w->np1 = -1;
        w->task1 = (Example11ThreaderTask1*)1;
        while (__builtin_expect(pthread_mutex_unlock(&w->mut2), 0)) {}
        while (__builtin_expect(pthread_cond_signal(&w->cond2), 0)) {}
    }
    for (ptrdiff_t i = 0; i != running; ++i) {
        while (__builtin_expect(pthread_join(workers[i].thr1, 0), 0)) {}
    }

    // Per-node synchronization objects.
    ptrdiff_t conds = team3->unwind1.nodeConds1;
    for (ptrdiff_t i = 0; i != conds; ++i) {
        while (__builtin_expect(pthread_cond_destroy(&workers[i].cond2), 0)) {}
    }
    ptrdiff_t muts = team3->unwind1.nodeMuts1;
    for (ptrdiff_t i = 0; i != muts; ++i) {
        while (__builtin_expect(pthread_mutex_destroy(&workers[i].mut2), 0)) {}
    }

    // Hub synchronization objects.
    Example11ThreaderHub1* shared = team3->hub2;
    if (team3->unwind1.hubCond1) {
        while (__builtin_expect(pthread_cond_destroy(&shared->cond1), 0)) {}
    }
    if (team3->unwind1.hubMut1) {
        while (__builtin_expect(pthread_mutex_destroy(&shared->mut1), 0)) {}
    }

    // Free the original (unaligned) allocations, then the team.
    free(team3->unwind1.nodes1);
    free(team3->unwind1.hub1);
    free(team3);
}

// Last construction stage: initializes each node's mutex and condition
// variable and starts its thread.
//
// team8: team whose node array is already allocated.
// nt7:   number of nodes/threads to set up.
//
// Returns 0 on success, otherwise an error string from
// Example11Errmsg1 carrying the pthread error code. In both cases the
// unwind counters are set to describe exactly what was initialized, so
// Example11ThreaderDestroy1 can undo it.
static char* Example11ThreaderCreate1Up4(Example11ThreaderTeam1* team8, ptrdiff_t nt7) {
    Example11ThreaderNode1* base = team8->nodes2;
    for (ptrdiff_t done = 0; done != nt7; ++done) {
        Example11ThreaderNode1* cur = base+done;

        int rc = pthread_mutex_init(&cur->mut2, 0);
        if (__builtin_expect(rc, 0)) {
            // Nothing of this node exists yet.
            team8->unwind1.nodeMuts1 = done;
            team8->unwind1.nodeConds1 = done;
            team8->unwind1.join1 = done;
            return Example11Errmsg1(__LINE__, "errno %d", rc);
        }
        cur->task1 = 0;

        rc = pthread_cond_init(&cur->cond2, 0);
        if (__builtin_expect(rc, 0)) {
            // This node has a mutex but no condition variable.
            team8->unwind1.nodeMuts1 = done+1;
            team8->unwind1.nodeConds1 = done;
            team8->unwind1.join1 = done;
            return Example11Errmsg1(__LINE__, "errno %d", rc);
        }
        cur->team1 = team8;

        rc = pthread_create(&cur->thr1, 0, Example11ThreaderMain1, cur);
        if (__builtin_expect(rc, 0)) {
            // This node has both objects but no thread to join.
            team8->unwind1.nodeMuts1 = done+1;
            team8->unwind1.nodeConds1 = done+1;
            team8->unwind1.join1 = done;
            return Example11Errmsg1(__LINE__, "errno %d", rc);
        }
    }
    team8->unwind1.nodeMuts1 = nt7;
    team8->unwind1.nodeConds1 = nt7;
    team8->unwind1.join1 = nt7;
    return 0;
}

// Third construction stage: initializes the hub's mutex and condition
// variable, recording each success in the unwind flags, then continues
// with per-node setup.
//
// Returns 0 on success or an error string from Example11Errmsg1
// carrying the pthread error code.
static char* Example11ThreaderCreate1Up3(Example11ThreaderTeam1* team7, ptrdiff_t nt6) {
    Example11ThreaderHub1* shared = team7->hub2;

    int rc = pthread_mutex_init(&shared->mut1, 0);
    if (__builtin_expect(rc != 0, 0)) {
        return Example11Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubMut1 = 1;

    rc = pthread_cond_init(&shared->cond1, 0);
    if (__builtin_expect(rc != 0, 0)) {
        return Example11Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubCond1 = 1;

    return Example11ThreaderCreate1Up4(team7, nt6);
}

// Second construction stage: allocates the node array.
//
// The byte count is checked for multiplication overflow. 63 extra
// bytes are requested so the array can start on a 64-byte boundary;
// the raw pointer is kept in unwind1.nodes1 for free(), the aligned
// pointer in nodes2.
//
// Returns 0 on success or an error string from Example11Errmsg1.
static char* Example11ThreaderCreate1Up2(Example11ThreaderTeam1* team6, ptrdiff_t nt5) {
    size_t count = (size_t)nt5;
    size_t bytes = count*sizeof(Example11ThreaderNode1);
    if (__builtin_expect(bytes/sizeof(Example11ThreaderNode1) != count, 0)) {
        return Example11Errmsg1(__LINE__, "too many threads");
    }
    void* raw = malloc(bytes+63);
    if (__builtin_expect(raw == 0, 0)) {
        return Example11Errmsg1(__LINE__, "errno %d", errno);
    }
    team6->unwind1.nodes1 = raw;
    // Round the address up to the next multiple of 64.
    team6->nodes2 = (void*)(((size_t)raw+63)&~(size_t)63);
    return Example11ThreaderCreate1Up3(team6, nt5);
}

// First construction stage: records the thread count and allocates
// the hub.
//
// The hub is followed by a status bitmap of nt4/(bits per long)+1
// words, so there is always room for bit index nt4 itself. The size is
// rounded up to a multiple of 64 and 63 extra bytes are requested so
// the hub can start on a 64-byte boundary; the raw pointer is kept in
// unwind1.hub1 for free(), the aligned pointer in hub2.
//
// Returns 0 on success or an error string from Example11Errmsg1.
static char* Example11ThreaderCreate1Up1(Example11ThreaderTeam1* team5, ptrdiff_t nt4) {
    team5->nt1 = nt4;
    size_t bitsPerWord = sizeof(long)*8;
    size_t words = (size_t)nt4/bitsPerWord+1;
    size_t bytes = sizeof(Example11ThreaderHub1)+sizeof(long)*words;
    bytes = (bytes+63)&~(size_t)63;
    void* raw = malloc(bytes+63);
    if (__builtin_expect(raw == 0, 0)) {
        return Example11Errmsg1(__LINE__, "errno %d", errno);
    }
    team5->unwind1.hub1 = raw;
    // Round the address up to the next multiple of 64.
    team5->hub2 = (void*)(((size_t)raw+63)&~(size_t)63);
    return Example11ThreaderCreate1Up2(team5, nt4);
}

// Creates a team of nt3 worker threads.
//
// team4: receives the new team on success; left untouched on failure.
// nt3:   number of threads; must be at least 1.
//
// Returns 0 on success, otherwise an error string from
// Example11Errmsg1. On failure everything built so far is released
// through Example11ThreaderDestroy1.
static char* Example11ThreaderCreate1(Example11ThreaderTeam1** team4, ptrdiff_t nt3) {
    if (__builtin_expect(nt3 < 1, 0)) {
        return Example11Errmsg1(__LINE__, "too few threads");
    }
    // Zero-filled so the unwind bookkeeping starts out as "nothing built".
    void* fresh = calloc(1, sizeof(Example11ThreaderTeam1));
    if (__builtin_expect(fresh == 0, 0)) {
        return Example11Errmsg1(__LINE__, "errno %d", errno);
    }
    char* failure = Example11ThreaderCreate1Up1(fresh, nt3);
    if (__builtin_expect(failure != 0, 0)) {
        Example11ThreaderDestroy1(fresh);
        return failure;
    }
    *team4 = fresh;
    return failure;
}

// Looks up the pthread_t of one worker thread.
//
// thr2:  receives the thread identifier on success.
// team9: team to query.
// idx1:  thread index, valid in the range 0 .. nt1-1.
//
// Returns 0 on success, or an error string from Example11Errmsg1 when
// idx1 is out of range (in which case *thr2 is not written).
static char* Example11ThreaderPthreadT1(
    pthread_t* thr2,
    Example11ThreaderTeam1* team9,
    ptrdiff_t idx1
) {
    int inRange = idx1 >= 0 && idx1 < team9->nt1;
    if (__builtin_expect(!inRange, 0)) {
        return Example11Errmsg1(__LINE__, "bad thread idx");
    }
    Example11ThreaderNode1* picked = team9->nodes2+idx1;
    *thr2 = picked->thr1;
    return 0;
}

// Runs a task on the team and blocks until every point has been
// processed.
//
// team10: team that executes the task.
// task3:  task to run; it is only read here and must stay valid until
//         this function returns, because the workers keep a pointer
//         to it.
//
// The grid's points are split into contiguous ranges, one per node:
// each node gets tot/nt points and the first tot%nt nodes get one
// more. Workers that finish early help other nodes (see
// Example11ThreaderMain1). Returns immediately if the task has fewer
// than one dimension.
static void Example11ThreaderDo1(Example11ThreaderTeam1* team10, Example11ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return;
// Total number of points: the product of the extents in use.
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
// Base share per node, and how many nodes receive one extra point.
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
// The base share expressed as a coordinate step.
int64_t plus2[4];
Example11ThreaderPut1(nd6, task3->hull1, plus2, each1);
// Starting coordinate of the node currently being set up.
int64_t pt6[4] = {0};
Example11ThreaderHub1* hub6 = team10->hub2;
// The hub lock is held for the whole posting phase. Workers need it
// before they touch the hub, so they cannot read the status bitmap or
// pending count until the wait below releases the lock.
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
Example11ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
// 1 for the first more1 nodes, 0 for the rest.
int64_t carry3 = i4 < more1;
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
// Wake the node's thread.
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
// Advance the start coordinate by this node's share (each1 + carry3).
Example11ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Reset the shared scan cursor and mark every bitmap bit as set,
// including bit index nt8, which workers use as the end marker.
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
// Wait until every worker has reported completion. The loop re-checks
// the count after each wakeup.
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

// Per-lane approximation of e^x for 16 packed floats.
//
// Steps, as coded below:
//   1. Clamp x to [-87.33654, 88.72284].
//   2. scaled = x * 1.442695 (the constant is approximately 1/ln 2);
//      n = scaled rounded to the nearest integer.
//   3. frac = x - n*ln 2, with ln 2 applied as two constants
//      (0.69314575 and 1.4286068e-06) in two fused multiply-adds.
//   4. Evaluate a degree-4 polynomial in frac by Horner's rule.
//   5. Convert scaled to an integer, shift it left by 23 and add it to
//      the raw bits of the polynomial value. Bit 23 is where the
//      exponent field of a 32-bit float starts, so this scales the
//      value by a power of two.
//      NOTE(review): the conversion in step 5 uses the current rounding
//      mode; it presumably matches the round-to-nearest of step 2.
static __m512 Example11Exp1(__m512 x1) {
    const __m512 lowest = _mm512_set1_ps(-8.733654e+01f);
    const __m512 highest = _mm512_set1_ps(8.872284e+01f);
    __m512 clamped = _mm512_min_ps(_mm512_max_ps(x1, lowest), highest);

    __m512 scaled = _mm512_mul_ps(clamped, _mm512_set1_ps(1.442695e+00f));
    __m512 whole = _mm512_roundscale_ps(scaled, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);

    __m512 frac = _mm512_fmadd_ps(whole, _mm512_set1_ps(-6.9314575e-01f), clamped);
    frac = _mm512_fmadd_ps(whole, _mm512_set1_ps(-1.4286068e-06f), frac);

    __m512 poly = _mm512_set1_ps(4.194439e-02f);
    poly = _mm512_fmadd_ps(poly, frac, _mm512_set1_ps(1.6800667e-01f));
    poly = _mm512_fmadd_ps(poly, frac, _mm512_set1_ps(4.9999994e-01f));
    poly = _mm512_fmadd_ps(poly, frac, _mm512_set1_ps(9.999569e-01f));
    poly = _mm512_fmadd_ps(poly, frac, _mm512_set1_ps(9.9999964e-01f));

    __m512i exponentBits = _mm512_slli_epi32(_mm512_cvtps_epi32(scaled), 23);
    __m512i resultBits = _mm512_add_epi32(exponentBits, _mm512_castps_si512(poly));
    return _mm512_castsi512_ps(resultBits);
}

// Per-lane reciprocal square root of 16 packed floats.
//
// Starts from the hardware estimate y = _mm512_rsqrt14_ps(x) and
// refines it once as (y*0.5) * (3 - y*(x*y)), which has the form of a
// Newton-Raphson step for 1/sqrt(x).
static __m512 Example11Rsqrt1(__m512 x2) {
    __m512 estimate = _mm512_rsqrt14_ps(x2);
    __m512 xTimesEstimate = _mm512_mul_ps(x2, estimate);
    __m512 halfEstimate = _mm512_mul_ps(estimate, _mm512_set1_ps(5e-01f));
    // 3 - estimate*(x*estimate)
    __m512 correction = _mm512_fnmadd_ps(estimate, xTimesEstimate, _mm512_set1_ps(3e+00f));
    return _mm512_mul_ps(halfEstimate, correction);
}

// Task callee that repacks one slice of filter weights and biases.
//
// task4->any1 is read as an array of char pointers:
//   [0] source weights, read nine floats at a time (wtPtr1)
//   [1] source biases (biasPtr1)
//   [2] destination buffer: biases are written at its start (bfPtr1) and
//       half-precision transformed weights from byte offset 896 (wfPtr1)
// pt7[0] selects which slice of j indices (0..54) this call handles.
//
// For every j and every s (25 per j) the nine-float filters are run through
// two passes of an add/fmadd transform separated by a shuffle stage, scaled
// by per-lane constants, converted to half precision and stored in 32-byte
// pieces into four output planes 175232 bytes apart.
// NOTE(review): the arithmetic looks like a Winograd-style 3x3 filter
// transform, and 25 / 4*54+3 = 219 presumably correspond to the input and
// output channel counts of the graph — confirm against the generator.
static void Example11ThreeArrangeFilts1Callee1(Example11ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
// Slice index of this call; the dispatcher below uses a hull of 9 on this axis.
ptrdiff_t b2 = pt7[0];
ptrdiff_t g2 = 0;
ptrdiff_t e1 = 0;
// g2 and e1 are fixed at 0 here, so every e1/i5 term below contributes nothing.
char*restrict bfPtr1 = tensors2[2]+876*e1;
char*restrict wfPtr1 = tensors2[2]+896+18529280*e1;
char*restrict wtPtr1 = tensors2[0]+23796*e1;
char*restrict biasPtr1 = tensors2[1];
ptrdiff_t i5 = 1*g2;
// Each slice starts at j1 = 6*b2. Slices 0..7 stop after j1+5 (six groups);
// the last slice sets jj1 one higher so it also reaches the j1 == 54 tail.
ptrdiff_t j1 = 6*b2;
ptrdiff_t jj1 = j1+(b2 < 8 ? 5 : 6);
// Full groups (j1 < 54): four filters per group, 900 bytes apart in the source.
if (j1 < 54) {
for (; j1 != 54; ++j1) {
ptrdiff_t k1 = 0+1*j1;
ptrdiff_t cut1 = 0;
ptrdiff_t s1 = 0;
for (; s1 != 25; ++s1) {
// Mask 511 loads nine floats per filter; the remaining lanes are zeroed.
__m512 wt1 = _mm512_maskz_loadu_ps(511, wtPtr1+0+197100*i5+3600*j1+36*s1);
__m512 wt2 = _mm512_maskz_loadu_ps(511, wtPtr1+900+197100*i5+3600*j1+36*s1);
__m512 wt3 = _mm512_maskz_loadu_ps(511, wtPtr1+1800+197100*i5+3600*j1+36*s1);
__m512 wt4 = _mm512_maskz_loadu_ps(511, wtPtr1+2700+197100*i5+3600*j1+36*s1);
// permutex2var index vectors: values below 16 pick a lane of the first
// operand, values 16..31 pick a lane of the second operand.
__m512i pm1 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm2 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
// Gather the four filters into three vectors in1..in3.
// NOTE(review): presumably one vector per filter row — confirm.
__m512 tmp1 = _mm512_permutex2var_ps(wt1, pm1, wt3);
__m512 tmp2 = _mm512_permutex2var_ps(wt2, pm1, wt4);
__m512 tmp3 = _mm512_permutex2var_ps(wt1, pm2, wt3);
__m512 tmp4 = _mm512_permutex2var_ps(wt2, pm2, wt4);
__m512 in1 = _mm512_permutex2var_ps(tmp1, pm1, tmp2);
__m512 in2 = _mm512_permutex2var_ps(tmp1, pm2, tmp2);
__m512 in3 = _mm512_permutex2var_ps(tmp3, pm1, tmp4);
// First transform pass: combinations of in1, in2, in3 with factors 1, 2 and 4.
__m512 tmp9 = _mm512_fmadd_ps(in1, _mm512_set1_ps(4e+00f), in3);
__m512 tmp10 = _mm512_add_ps(in1, in3);
__m512 tmp11 = _mm512_fmadd_ps(in3, _mm512_set1_ps(4e+00f), in1);
__m512 tmp12 = _mm512_add_ps(in2, tmp10);
__m512 tmp13 = _mm512_fmadd_ps(in2, _mm512_set1_ps(2e+00f), tmp11);
tmp11 = _mm512_fnmadd_ps(in2, _mm512_set1_ps(2e+00f), tmp11);
__m512 tmp14 = _mm512_fnmadd_ps(in2, _mm512_set1_ps(2e+00f), tmp9);
tmp9 = _mm512_fmadd_ps(in2, _mm512_set1_ps(2e+00f), tmp9);
tmp10 = _mm512_sub_ps(tmp10, in2);
// Shuffle stage between the two passes (unpack, shuffle_ps, shuffle_f32x4).
// NOTE(review): looks like a transpose so the second pass works along the
// other filter axis — confirm.
__m512 tmp31 = _mm512_unpacklo_ps(in1, tmp12);
__m512 tmp32 = _mm512_unpackhi_ps(in1, tmp12);
__m512 tmp33 = _mm512_unpacklo_ps(tmp10, tmp13);
__m512 tmp34 = _mm512_unpackhi_ps(tmp10, tmp13);
__m512 tmp35 = _mm512_unpacklo_ps(tmp11, tmp9);
__m512 tmp36 = _mm512_unpackhi_ps(tmp11, tmp9);
__m512 tmp37 = _mm512_unpacklo_ps(tmp14, in3);
__m512 tmp38 = _mm512_unpackhi_ps(tmp14, in3);
__m512 tmp39 = _mm512_shuffle_ps(tmp31, tmp33, 68);
__m512 tmp40 = _mm512_shuffle_ps(tmp31, tmp33, 238);
__m512 tmp41 = _mm512_shuffle_ps(tmp32, tmp34, 68);
__m512 tmp42 = _mm512_shuffle_ps(tmp32, tmp34, 238);
__m512 tmp43 = _mm512_shuffle_ps(tmp35, tmp37, 68);
__m512 tmp44 = _mm512_shuffle_ps(tmp35, tmp37, 238);
__m512 tmp45 = _mm512_shuffle_ps(tmp36, tmp38, 68);
__m512 tmp46 = _mm512_shuffle_ps(tmp36, tmp38, 238);
__m512 tmp47 = _mm512_shuffle_f32x4(tmp39, tmp43, 136);
__m512 tmp48 = _mm512_shuffle_f32x4(tmp39, tmp43, 221);
__m512 tmp49 = _mm512_shuffle_f32x4(tmp40, tmp44, 136);
__m512 tmp50 = _mm512_shuffle_f32x4(tmp40, tmp44, 221);
__m512 tmp51 = _mm512_shuffle_f32x4(tmp41, tmp45, 136);
__m512 tmp52 = _mm512_shuffle_f32x4(tmp41, tmp45, 221);
__m512 tmp53 = _mm512_shuffle_f32x4(tmp42, tmp46, 136);
__m512 tmp54 = _mm512_shuffle_f32x4(tmp42, tmp46, 221);
in1 = _mm512_shuffle_f32x4(tmp47, tmp47, 136);
__m512 tmp15 = _mm512_shuffle_f32x4(tmp47, tmp47, 221);
tmp12 = _mm512_shuffle_f32x4(tmp49, tmp49, 136);
__m512 tmp16 = _mm512_shuffle_f32x4(tmp49, tmp49, 221);
tmp10 = _mm512_shuffle_f32x4(tmp51, tmp51, 136);
__m512 tmp17 = _mm512_shuffle_f32x4(tmp51, tmp51, 221);
tmp13 = _mm512_shuffle_f32x4(tmp53, tmp53, 136);
__m512 tmp18 = _mm512_shuffle_f32x4(tmp53, tmp53, 221);
tmp11 = _mm512_shuffle_f32x4(tmp48, tmp48, 136);
tmp9 = _mm512_shuffle_f32x4(tmp50, tmp50, 136);
tmp14 = _mm512_shuffle_f32x4(tmp52, tmp52, 136);
in3 = _mm512_shuffle_f32x4(tmp54, tmp54, 136);
// Pair up the shuffled vectors: the low 256 bits of each operand side by side.
in1 = _mm512_shuffle_f32x4(in1, tmp13, 68);
tmp12 = _mm512_shuffle_f32x4(tmp12, tmp11, 68);
tmp10 = _mm512_shuffle_f32x4(tmp10, tmp9, 68);
tmp14 = _mm512_shuffle_f32x4(tmp14, tmp16, 68);
in3 = _mm512_shuffle_f32x4(in3, tmp17, 68);
tmp15 = _mm512_shuffle_f32x4(tmp15, tmp18, 68);
// Second transform pass: the same add/fmadd pattern as the first pass, applied
// to the triples (in1, tmp12, tmp10) and (tmp14, in3, tmp15).
__m512 tmp19 = _mm512_fmadd_ps(in1, _mm512_set1_ps(4e+00f), tmp10);
__m512 tmp25 = _mm512_fmadd_ps(tmp14, _mm512_set1_ps(4e+00f), tmp15);
__m512 tmp20 = _mm512_add_ps(in1, tmp10);
__m512 tmp26 = _mm512_add_ps(tmp14, tmp15);
__m512 tmp21 = _mm512_fmadd_ps(tmp10, _mm512_set1_ps(4e+00f), in1);
__m512 tmp27 = _mm512_fmadd_ps(tmp15, _mm512_set1_ps(4e+00f), tmp14);
__m512 tmp22 = _mm512_add_ps(tmp12, tmp20);
__m512 tmp28 = _mm512_add_ps(in3, tmp26);
__m512 tmp23 = _mm512_fmadd_ps(tmp12, _mm512_set1_ps(2e+00f), tmp21);
__m512 tmp29 = _mm512_fmadd_ps(in3, _mm512_set1_ps(2e+00f), tmp27);
tmp21 = _mm512_fnmadd_ps(tmp12, _mm512_set1_ps(2e+00f), tmp21);
tmp27 = _mm512_fnmadd_ps(in3, _mm512_set1_ps(2e+00f), tmp27);
__m512 tmp24 = _mm512_fnmadd_ps(tmp12, _mm512_set1_ps(2e+00f), tmp19);
__m512 tmp30 = _mm512_fnmadd_ps(in3, _mm512_set1_ps(2e+00f), tmp25);
tmp19 = _mm512_fmadd_ps(tmp12, _mm512_set1_ps(2e+00f), tmp19);
tmp25 = _mm512_fmadd_ps(in3, _mm512_set1_ps(2e+00f), tmp25);
tmp20 = _mm512_sub_ps(tmp20, tmp12);
tmp26 = _mm512_sub_ps(tmp26, in3);
// Per-lane scaling of every transformed vector by constant factors.
in1 = _mm512_mul_ps(in1, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp22 = _mm512_mul_ps(tmp22, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp20 = _mm512_mul_ps(tmp20, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp23 = _mm512_mul_ps(tmp23, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp21 = _mm512_mul_ps(tmp21, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp19 = _mm512_mul_ps(tmp19, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp24 = _mm512_mul_ps(tmp24, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp10 = _mm512_mul_ps(tmp10, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp14 = _mm512_mul_ps(tmp14, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp28 = _mm512_mul_ps(tmp28, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp26 = _mm512_mul_ps(tmp26, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp29 = _mm512_mul_ps(tmp29, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp27 = _mm512_mul_ps(tmp27, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp25 = _mm512_mul_ps(tmp25, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp30 = _mm512_mul_ps(tmp30, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp15 = _mm512_mul_ps(tmp15, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
// Regroup pairs of results: immediate 68 joins the low 256-bit halves of both
// operands, 238 joins their high 256-bit halves.
__m512 out1 = _mm512_shuffle_f32x4(in1, tmp22, 68);
__m512 out5 = _mm512_shuffle_f32x4(in1, tmp22, 238);
__m512 out2 = _mm512_shuffle_f32x4(tmp20, tmp23, 68);
__m512 out6 = _mm512_shuffle_f32x4(tmp20, tmp23, 238);
__m512 out3 = _mm512_shuffle_f32x4(tmp21, tmp19, 68);
__m512 out7 = _mm512_shuffle_f32x4(tmp21, tmp19, 238);
__m512 out4 = _mm512_shuffle_f32x4(tmp24, tmp10, 68);
__m512 out8 = _mm512_shuffle_f32x4(tmp24, tmp10, 238);
__m512 out9 = _mm512_shuffle_f32x4(tmp14, tmp28, 68);
__m512 out13 = _mm512_shuffle_f32x4(tmp14, tmp28, 238);
__m512 out10 = _mm512_shuffle_f32x4(tmp26, tmp29, 68);
__m512 out14 = _mm512_shuffle_f32x4(tmp26, tmp29, 238);
__m512 out11 = _mm512_shuffle_f32x4(tmp27, tmp25, 68);
__m512 out15 = _mm512_shuffle_f32x4(tmp27, tmp25, 238);
__m512 out12 = _mm512_shuffle_f32x4(tmp30, tmp15, 68);
__m512 out16 = _mm512_shuffle_f32x4(tmp30, tmp15, 238);
// Byte offsets of the four 32-byte pieces inside one 128-byte s1 slot.
// With cut1 == 0 these evaluate to 0, 32, 64 and 96.
ptrdiff_t off1 = 32*cut1;
ptrdiff_t off2 = (size_t)(cut1+1)/4*3200+(size_t)(cut1+1)%4*32;
ptrdiff_t off3 = (size_t)(cut1+2)/4*3200+(size_t)(cut1+2)%4*32;
ptrdiff_t off4 = (size_t)(cut1+3)/4*3200+(size_t)(cut1+3)%4*32;
// Convert each result to sixteen half-precision values; only the low 256 bits
// of every wf vector carry data (the cast leaves the upper half undefined).
__m512i wf1 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf2 = _mm512_castsi256_si512(_mm512_cvtps_ph(out5, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf3 = _mm512_castsi256_si512(_mm512_cvtps_ph(out9, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf4 = _mm512_castsi256_si512(_mm512_cvtps_ph(out13, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf5 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf6 = _mm512_castsi256_si512(_mm512_cvtps_ph(out6, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf7 = _mm512_castsi256_si512(_mm512_cvtps_ph(out10, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf8 = _mm512_castsi256_si512(_mm512_cvtps_ph(out14, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf9 = _mm512_castsi256_si512(_mm512_cvtps_ph(out3, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf10 = _mm512_castsi256_si512(_mm512_cvtps_ph(out7, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf11 = _mm512_castsi256_si512(_mm512_cvtps_ph(out11, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf12 = _mm512_castsi256_si512(_mm512_cvtps_ph(out15, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf13 = _mm512_castsi256_si512(_mm512_cvtps_ph(out4, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf14 = _mm512_castsi256_si512(_mm512_cvtps_ph(out8, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf15 = _mm512_castsi256_si512(_mm512_cvtps_ph(out12, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf16 = _mm512_castsi256_si512(_mm512_cvtps_ph(out16, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
// Mask 255 stores eight 32-bit lanes (32 bytes) per call. The four groups of
// stores target planes at byte offsets 0, 175232, 350464 and 525696.
_mm512_mask_storeu_epi32(wfPtr1+0+700928*i5+3200*k1+off1+128*s1, 255, wf1);
_mm512_mask_storeu_epi32(wfPtr1+0+700928*i5+3200*k1+off2+128*s1, 255, wf2);
_mm512_mask_storeu_epi32(wfPtr1+0+700928*i5+3200*k1+off3+128*s1, 255, wf3);
_mm512_mask_storeu_epi32(wfPtr1+0+700928*i5+3200*k1+off4+128*s1, 255, wf4);
_mm512_mask_storeu_epi32(wfPtr1+175232+700928*i5+3200*k1+off1+128*s1, 255, wf5);
_mm512_mask_storeu_epi32(wfPtr1+175232+700928*i5+3200*k1+off2+128*s1, 255, wf6);
_mm512_mask_storeu_epi32(wfPtr1+175232+700928*i5+3200*k1+off3+128*s1, 255, wf7);
_mm512_mask_storeu_epi32(wfPtr1+175232+700928*i5+3200*k1+off4+128*s1, 255, wf8);
_mm512_mask_storeu_epi32(wfPtr1+350464+700928*i5+3200*k1+off1+128*s1, 255, wf9);
_mm512_mask_storeu_epi32(wfPtr1+350464+700928*i5+3200*k1+off2+128*s1, 255, wf10);
_mm512_mask_storeu_epi32(wfPtr1+350464+700928*i5+3200*k1+off3+128*s1, 255, wf11);
_mm512_mask_storeu_epi32(wfPtr1+350464+700928*i5+3200*k1+off4+128*s1, 255, wf12);
_mm512_mask_storeu_epi32(wfPtr1+525696+700928*i5+3200*k1+off1+128*s1, 255, wf13);
_mm512_mask_storeu_epi32(wfPtr1+525696+700928*i5+3200*k1+off2+128*s1, 255, wf14);
_mm512_mask_storeu_epi32(wfPtr1+525696+700928*i5+3200*k1+off3+128*s1, 255, wf15);
_mm512_mask_storeu_epi32(wfPtr1+525696+700928*i5+3200*k1+off4+128*s1, 255, wf16);
}
// Copy this group's four bias floats (mask 15) unchanged. e1 is 0 in this
// function, so the load always happens and the zero default is never stored.
__m512 bias1 = _mm512_setzero_ps();
if (!e1) {
bias1 = _mm512_maskz_loadu_ps(15, biasPtr1-0+876*i5+16*j1);
}
_mm512_mask_storeu_ps(bfPtr1-0+876*i5+16*j1, 15, bias1);
// Stop once the last group of this slice has been written.
if (j1 >= jj1) return;
}
}
// Tail group (j1 == 54): only three filters remain, so there are three loads,
// 96-byte s2 slots, three stores per plane and a three-float bias.
if (j1 == 54) {
ptrdiff_t k2 = 0+1*j1;
ptrdiff_t cut2 = 0;
ptrdiff_t s2 = 0;
for (; s2 != 25; ++s2) {
__m512 wt5 = _mm512_maskz_loadu_ps(511, wtPtr1+0+197100*i5+3600*j1+36*s2);
__m512 wt6 = _mm512_maskz_loadu_ps(511, wtPtr1+900+197100*i5+3600*j1+36*s2);
__m512 wt7 = _mm512_maskz_loadu_ps(511, wtPtr1+1800+197100*i5+3600*j1+36*s2);
__m512i pm3 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm4 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp5 = _mm512_permutex2var_ps(wt5, pm3, wt7);
// wt6 is passed as both operands here, in the position where the four-filter
// path uses the fourth filter; the lanes it fills are not stored below.
__m512 tmp6 = _mm512_permutex2var_ps(wt6, pm3, wt6);
__m512 tmp7 = _mm512_permutex2var_ps(wt5, pm4, wt7);
__m512 tmp8 = _mm512_permutex2var_ps(wt6, pm4, wt6);
__m512 in4 = _mm512_permutex2var_ps(tmp5, pm3, tmp6);
__m512 in5 = _mm512_permutex2var_ps(tmp5, pm4, tmp6);
__m512 in6 = _mm512_permutex2var_ps(tmp7, pm3, tmp8);
// First transform pass (same pattern as the four-filter path).
__m512 tmp55 = _mm512_fmadd_ps(in4, _mm512_set1_ps(4e+00f), in6);
__m512 tmp56 = _mm512_add_ps(in4, in6);
__m512 tmp57 = _mm512_fmadd_ps(in6, _mm512_set1_ps(4e+00f), in4);
__m512 tmp58 = _mm512_add_ps(in5, tmp56);
__m512 tmp59 = _mm512_fmadd_ps(in5, _mm512_set1_ps(2e+00f), tmp57);
tmp57 = _mm512_fnmadd_ps(in5, _mm512_set1_ps(2e+00f), tmp57);
__m512 tmp60 = _mm512_fnmadd_ps(in5, _mm512_set1_ps(2e+00f), tmp55);
tmp55 = _mm512_fmadd_ps(in5, _mm512_set1_ps(2e+00f), tmp55);
tmp56 = _mm512_sub_ps(tmp56, in5);
// Shuffle stage between the two passes.
__m512 tmp74 = _mm512_unpacklo_ps(in4, tmp58);
__m512 tmp75 = _mm512_unpackhi_ps(in4, tmp58);
__m512 tmp76 = _mm512_unpacklo_ps(tmp56, tmp59);
__m512 tmp77 = _mm512_unpackhi_ps(tmp56, tmp59);
__m512 tmp78 = _mm512_unpacklo_ps(tmp57, tmp55);
__m512 tmp79 = _mm512_unpackhi_ps(tmp57, tmp55);
__m512 tmp80 = _mm512_unpacklo_ps(tmp60, in6);
__m512 tmp81 = _mm512_unpackhi_ps(tmp60, in6);
__m512 tmp82 = _mm512_shuffle_ps(tmp74, tmp76, 68);
__m512 tmp83 = _mm512_shuffle_ps(tmp74, tmp76, 238);
__m512 tmp84 = _mm512_shuffle_ps(tmp75, tmp77, 68);
__m512 tmp85 = _mm512_shuffle_ps(tmp75, tmp77, 238);
__m512 tmp86 = _mm512_shuffle_ps(tmp78, tmp80, 68);
__m512 tmp87 = _mm512_shuffle_ps(tmp78, tmp80, 238);
__m512 tmp88 = _mm512_shuffle_ps(tmp79, tmp81, 68);
__m512 tmp89 = _mm512_shuffle_ps(tmp79, tmp81, 238);
__m512 tmp90 = _mm512_shuffle_f32x4(tmp82, tmp86, 136);
__m512 tmp91 = _mm512_shuffle_f32x4(tmp82, tmp86, 221);
__m512 tmp92 = _mm512_shuffle_f32x4(tmp83, tmp87, 136);
__m512 tmp93 = _mm512_shuffle_f32x4(tmp83, tmp87, 221);
__m512 tmp94 = _mm512_shuffle_f32x4(tmp84, tmp88, 136);
__m512 tmp95 = _mm512_shuffle_f32x4(tmp84, tmp88, 221);
__m512 tmp96 = _mm512_shuffle_f32x4(tmp85, tmp89, 136);
__m512 tmp97 = _mm512_shuffle_f32x4(tmp85, tmp89, 221);
in4 = _mm512_shuffle_f32x4(tmp90, tmp90, 136);
__m512 tmp61 = _mm512_shuffle_f32x4(tmp90, tmp90, 221);
tmp58 = _mm512_shuffle_f32x4(tmp92, tmp92, 136);
tmp56 = _mm512_shuffle_f32x4(tmp94, tmp94, 136);
tmp59 = _mm512_shuffle_f32x4(tmp96, tmp96, 136);
tmp57 = _mm512_shuffle_f32x4(tmp91, tmp91, 136);
tmp55 = _mm512_shuffle_f32x4(tmp93, tmp93, 136);
tmp60 = _mm512_shuffle_f32x4(tmp95, tmp95, 136);
in6 = _mm512_shuffle_f32x4(tmp97, tmp97, 136);
// Only the first triple is paired up here; tmp60, in6 and tmp61 are used as-is.
in4 = _mm512_shuffle_f32x4(in4, tmp59, 68);
tmp58 = _mm512_shuffle_f32x4(tmp58, tmp57, 68);
tmp56 = _mm512_shuffle_f32x4(tmp56, tmp55, 68);
// Second transform pass on (in4, tmp58, tmp56) and (tmp60, in6, tmp61).
__m512 tmp62 = _mm512_fmadd_ps(in4, _mm512_set1_ps(4e+00f), tmp56);
__m512 tmp68 = _mm512_fmadd_ps(tmp60, _mm512_set1_ps(4e+00f), tmp61);
__m512 tmp63 = _mm512_add_ps(in4, tmp56);
__m512 tmp69 = _mm512_add_ps(tmp60, tmp61);
__m512 tmp64 = _mm512_fmadd_ps(tmp56, _mm512_set1_ps(4e+00f), in4);
__m512 tmp70 = _mm512_fmadd_ps(tmp61, _mm512_set1_ps(4e+00f), tmp60);
__m512 tmp65 = _mm512_add_ps(tmp58, tmp63);
__m512 tmp71 = _mm512_add_ps(in6, tmp69);
__m512 tmp66 = _mm512_fmadd_ps(tmp58, _mm512_set1_ps(2e+00f), tmp64);
__m512 tmp72 = _mm512_fmadd_ps(in6, _mm512_set1_ps(2e+00f), tmp70);
tmp64 = _mm512_fnmadd_ps(tmp58, _mm512_set1_ps(2e+00f), tmp64);
tmp70 = _mm512_fnmadd_ps(in6, _mm512_set1_ps(2e+00f), tmp70);
__m512 tmp67 = _mm512_fnmadd_ps(tmp58, _mm512_set1_ps(2e+00f), tmp62);
__m512 tmp73 = _mm512_fnmadd_ps(in6, _mm512_set1_ps(2e+00f), tmp68);
tmp62 = _mm512_fmadd_ps(tmp58, _mm512_set1_ps(2e+00f), tmp62);
tmp68 = _mm512_fmadd_ps(in6, _mm512_set1_ps(2e+00f), tmp68);
tmp63 = _mm512_sub_ps(tmp63, tmp58);
tmp69 = _mm512_sub_ps(tmp69, in6);
// Per-lane scaling, using the same constant vectors as the four-filter path.
in4 = _mm512_mul_ps(in4, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp65 = _mm512_mul_ps(tmp65, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp63 = _mm512_mul_ps(tmp63, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp66 = _mm512_mul_ps(tmp66, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp64 = _mm512_mul_ps(tmp64, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp62 = _mm512_mul_ps(tmp62, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp67 = _mm512_mul_ps(tmp67, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp56 = _mm512_mul_ps(tmp56, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp60 = _mm512_mul_ps(tmp60, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp71 = _mm512_mul_ps(tmp71, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp69 = _mm512_mul_ps(tmp69, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp72 = _mm512_mul_ps(tmp72, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp70 = _mm512_mul_ps(tmp70, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp68 = _mm512_mul_ps(tmp68, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp73 = _mm512_mul_ps(tmp73, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp61 = _mm512_mul_ps(tmp61, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
// Regroup results; the second triple only needs its low halves (immediate 68).
__m512 out17 = _mm512_shuffle_f32x4(in4, tmp65, 68);
__m512 out21 = _mm512_shuffle_f32x4(in4, tmp65, 238);
__m512 out18 = _mm512_shuffle_f32x4(tmp63, tmp66, 68);
__m512 out22 = _mm512_shuffle_f32x4(tmp63, tmp66, 238);
__m512 out19 = _mm512_shuffle_f32x4(tmp64, tmp62, 68);
__m512 out23 = _mm512_shuffle_f32x4(tmp64, tmp62, 238);
__m512 out20 = _mm512_shuffle_f32x4(tmp67, tmp56, 68);
__m512 out24 = _mm512_shuffle_f32x4(tmp67, tmp56, 238);
__m512 out25 = _mm512_shuffle_f32x4(tmp60, tmp71, 68);
__m512 out26 = _mm512_shuffle_f32x4(tmp69, tmp72, 68);
__m512 out27 = _mm512_shuffle_f32x4(tmp70, tmp68, 68);
__m512 out28 = _mm512_shuffle_f32x4(tmp73, tmp61, 68);
// With cut2 == 0 the three piece offsets evaluate to 0, 32 and 64.
ptrdiff_t off5 = 32*cut2;
ptrdiff_t off6 = (size_t)(cut2+1)/4*3200+(size_t)(cut2+1)%4*32;
ptrdiff_t off7 = (size_t)(cut2+2)/4*3200+(size_t)(cut2+2)%4*32;
__m512i wf17 = _mm512_castsi256_si512(_mm512_cvtps_ph(out17, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf18 = _mm512_castsi256_si512(_mm512_cvtps_ph(out21, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf19 = _mm512_castsi256_si512(_mm512_cvtps_ph(out25, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf20 = _mm512_castsi256_si512(_mm512_cvtps_ph(out18, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf21 = _mm512_castsi256_si512(_mm512_cvtps_ph(out22, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf22 = _mm512_castsi256_si512(_mm512_cvtps_ph(out26, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf23 = _mm512_castsi256_si512(_mm512_cvtps_ph(out19, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf24 = _mm512_castsi256_si512(_mm512_cvtps_ph(out23, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf25 = _mm512_castsi256_si512(_mm512_cvtps_ph(out27, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf26 = _mm512_castsi256_si512(_mm512_cvtps_ph(out20, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf27 = _mm512_castsi256_si512(_mm512_cvtps_ph(out24, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf28 = _mm512_castsi256_si512(_mm512_cvtps_ph(out28, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
// Same four planes as above, but s2 slots are 96 bytes wide (three pieces).
_mm512_mask_storeu_epi32(wfPtr1+0+700928*i5+3200*k2+off5+96*s2, 255, wf17);
_mm512_mask_storeu_epi32(wfPtr1+0+700928*i5+3200*k2+off6+96*s2, 255, wf18);
_mm512_mask_storeu_epi32(wfPtr1+0+700928*i5+3200*k2+off7+96*s2, 255, wf19);
_mm512_mask_storeu_epi32(wfPtr1+175232+700928*i5+3200*k2+off5+96*s2, 255, wf20);
_mm512_mask_storeu_epi32(wfPtr1+175232+700928*i5+3200*k2+off6+96*s2, 255, wf21);
_mm512_mask_storeu_epi32(wfPtr1+175232+700928*i5+3200*k2+off7+96*s2, 255, wf22);
_mm512_mask_storeu_epi32(wfPtr1+350464+700928*i5+3200*k2+off5+96*s2, 255, wf23);
_mm512_mask_storeu_epi32(wfPtr1+350464+700928*i5+3200*k2+off6+96*s2, 255, wf24);
_mm512_mask_storeu_epi32(wfPtr1+350464+700928*i5+3200*k2+off7+96*s2, 255, wf25);
_mm512_mask_storeu_epi32(wfPtr1+525696+700928*i5+3200*k2+off5+96*s2, 255, wf26);
_mm512_mask_storeu_epi32(wfPtr1+525696+700928*i5+3200*k2+off6+96*s2, 255, wf27);
_mm512_mask_storeu_epi32(wfPtr1+525696+700928*i5+3200*k2+off7+96*s2, 255, wf28);
}
// Three bias floats (mask 7) for the tail group.
__m512 bias2 = _mm512_setzero_ps();
if (!e1) {
bias2 = _mm512_maskz_loadu_ps(7, biasPtr1-0+876*i5+16*j1);
}
_mm512_mask_storeu_ps(bfPtr1-0+876*i5+16*j1, 7, bias2);
if (j1 >= jj1) return;
// j1 is not read again after this assignment.
j1 = 55;
}
}

// Builds the task descriptor for the filter-arrangement callee and hands it
// to Example11ThreaderDo1 together with the thread team.
//
// The task has three dimensions with a hull of 9 x 1 x 1; `tensors1` is
// passed through to the callee via the task's `any1` field.
static void Example11ThreeArrangeFilts1(Example11ThreaderTeam1* team13, char** tensors1) {
	// Members not named here are zero-filled by the initializer.
	Example11ThreaderTask1 arrangeTask = {
		.callee1 = Example11ThreeArrangeFilts1Callee1,
		.any1 = tensors1,
		.nd1 = 3,
		.hull1 = {9, 1, 1},
	};
	Example11ThreaderDo1(team13, &arrangeTask);
}

// Task callee that rearranges input data ahead of the multiply-accumulate
// stage: it reads floats from tensors4[0], applies an 8-point linear
// transform along one axis, transposes the result in registers, applies the
// transform along the other axis, and stores floats into tensors4[1].
//
// The multipliers used below (5.25, -4.25, 0.25, 4, -1.25, -5, 2) combine
// into the rows of the Winograd F(6x6, 3x3) input-transform matrix.
//
// task6->any1 is the char** tensor table. pt8 (the task-grid coordinate) is
// ignored, so every coordinate-derived index below is fixed at zero.
static void Example11ThreeArrangeDats1Callee1(Example11ThreaderTask1* task6, int64_t* pt8) {
char** tensors4 = task6->any1;
ptrdiff_t s3 = 0;
ptrdiff_t c1 = 0;
ptrdiff_t g3 = 0;
ptrdiff_t e2 = 0;
(void)pt8;
// Source (datPtr1) and destination (dfPtr1) base pointers; e2 == 0 so the
// per-e2 strides contribute nothing here.
char*restrict datPtr1 = tensors4[0]-0+116336*e2;
char*restrict dfPtr1 = tensors4[1]+338432*e2;
ptrdiff_t i6 = 1*g3;
ptrdiff_t j2 = 1*c1;
// NOTE(review): rel1 is not read anywhere in this function.
ptrdiff_t rel1 = j2-0;
ptrdiff_t base1 = 0;
ptrdiff_t h1 = base1+0;
ptrdiff_t w1 = 0;
ptrdiff_t k3 = 0;
// Main loop: 12 iterations, each consuming two 176-byte slabs of the source
// (the second slab's loads are the first slab's offsets plus 176; the loop
// advances by 352 bytes).
// NOTE(review): presumably one slab is one channel of Height=11 x Width=4
// floats (11*4*4 = 176 bytes) per the graph config — confirm.
for (; k3 != 12; ++k3) {
// Every load uses mask 15, i.e. reads 4 floats and zeroes lanes 4..15.
// Row offsets advance in 16-byte steps (4 floats).
__m512 dat1 = _mm512_maskz_loadu_ps(15, datPtr1+0+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat2 = _mm512_maskz_loadu_ps(15, datPtr1+96+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat3 = _mm512_maskz_loadu_ps(15, datPtr1+176+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat4 = _mm512_maskz_loadu_ps(15, datPtr1+272+4400*i6+16*h1+4*w1+4400*s3+352*k3);
// pm5 packs two 4-float rows into one vector: lanes 0..3 come from the first
// operand, lanes 8..11 from the second operand (indices 16..19), and all
// other lanes take lane 15 of the first operand, which the masked load
// zeroed. The second operand is the row 96 bytes (six 16-byte rows) further.
__m512i pm5 = _mm512_set_epi32(15, 15, 15, 15, 19, 18, 17, 16, 15, 15, 15, 15, 3, 2, 1, 0);
// in7..in14 are the eight transform inputs for the first slab,
// in15..in22 the same for the second slab.
__m512 in7 = _mm512_permutex2var_ps(dat1, pm5, dat2);
__m512 in15 = _mm512_permutex2var_ps(dat3, pm5, dat4);
__m512 dat5 = _mm512_maskz_loadu_ps(15, datPtr1+16+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat6 = _mm512_maskz_loadu_ps(15, datPtr1+112+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat7 = _mm512_maskz_loadu_ps(15, datPtr1+192+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat8 = _mm512_maskz_loadu_ps(15, datPtr1+288+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 in8 = _mm512_permutex2var_ps(dat5, pm5, dat6);
__m512 in16 = _mm512_permutex2var_ps(dat7, pm5, dat8);
__m512 dat9 = _mm512_maskz_loadu_ps(15, datPtr1+32+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat10 = _mm512_maskz_loadu_ps(15, datPtr1+128+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat11 = _mm512_maskz_loadu_ps(15, datPtr1+208+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat12 = _mm512_maskz_loadu_ps(15, datPtr1+304+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 in9 = _mm512_permutex2var_ps(dat9, pm5, dat10);
__m512 in17 = _mm512_permutex2var_ps(dat11, pm5, dat12);
__m512 dat13 = _mm512_maskz_loadu_ps(15, datPtr1+48+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat14 = _mm512_maskz_loadu_ps(15, datPtr1+144+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat15 = _mm512_maskz_loadu_ps(15, datPtr1+224+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat16 = _mm512_maskz_loadu_ps(15, datPtr1+320+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 in10 = _mm512_permutex2var_ps(dat13, pm5, dat14);
__m512 in18 = _mm512_permutex2var_ps(dat15, pm5, dat16);
__m512 dat17 = _mm512_maskz_loadu_ps(15, datPtr1+64+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat18 = _mm512_maskz_loadu_ps(15, datPtr1+160+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat19 = _mm512_maskz_loadu_ps(15, datPtr1+240+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat20 = _mm512_maskz_loadu_ps(15, datPtr1+336+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 in11 = _mm512_permutex2var_ps(dat17, pm5, dat18);
__m512 in19 = _mm512_permutex2var_ps(dat19, pm5, dat20);
// The last three inputs of each slab have no partner row: pm6 keeps lanes
// 0..3 and fills every other lane from (zeroed) lane 15.
__m512 dat21 = _mm512_maskz_loadu_ps(15, datPtr1+80+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat22 = _mm512_maskz_loadu_ps(15, datPtr1+256+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512i pm6 = _mm512_set_epi32(15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 3, 2, 1, 0);
__m512 in12 = _mm512_permutexvar_ps(pm6, dat21);
__m512 in20 = _mm512_permutexvar_ps(pm6, dat22);
// Offsets 96/272 and 112/288 were already read above as the upper halves of
// in7/in15 and in8/in16; here they are read again as stand-alone rows.
__m512 dat23 = _mm512_maskz_loadu_ps(15, datPtr1+96+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat24 = _mm512_maskz_loadu_ps(15, datPtr1+272+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 in13 = _mm512_permutexvar_ps(pm6, dat23);
__m512 in21 = _mm512_permutexvar_ps(pm6, dat24);
__m512 dat25 = _mm512_maskz_loadu_ps(15, datPtr1+112+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat26 = _mm512_maskz_loadu_ps(15, datPtr1+288+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 in14 = _mm512_permutexvar_ps(pm6, dat25);
__m512 in22 = _mm512_permutexvar_ps(pm6, dat26);
// First transform pass. Statements come in pairs: the first of each pair
// works on slab one (in7..in14, tmp98..tmp101), the second on slab two
// (in15..in22, tmp102..tmp105). Inputs are overwritten in place, so the
// statement order matters. The eight results per slab end up in
// in7, tmp100, tmp101, in13, tmp99, in9, in11, in10 (slab one) and
// in15, tmp104, tmp105, in21, tmp103, in17, in19, in18 (slab two).
__m512 tmp98 = _mm512_add_ps(in8, in12);
__m512 tmp102 = _mm512_add_ps(in16, in20);
__m512 tmp99 = _mm512_sub_ps(in11, in9);
__m512 tmp103 = _mm512_sub_ps(in19, in17);
__m512 tmp100 = _mm512_add_ps(in9, in13);
__m512 tmp104 = _mm512_add_ps(in17, in21);
in7 = _mm512_sub_ps(in7, in13);
in15 = _mm512_sub_ps(in15, in21);
tmp98 = _mm512_fmadd_ps(in10, _mm512_set1_ps(-4.25e+00f), tmp98);
tmp102 = _mm512_fmadd_ps(in18, _mm512_set1_ps(-4.25e+00f), tmp102);
tmp100 = _mm512_fmadd_ps(in11, _mm512_set1_ps(-4.25e+00f), tmp100);
tmp104 = _mm512_fmadd_ps(in19, _mm512_set1_ps(-4.25e+00f), tmp104);
in7 = _mm512_fmadd_ps(tmp99, _mm512_set1_ps(5.25e+00f), in7);
in15 = _mm512_fmadd_ps(tmp103, _mm512_set1_ps(5.25e+00f), in15);
tmp99 = _mm512_fmadd_ps(in9, _mm512_set1_ps(2.5e-01f), in13);
tmp103 = _mm512_fmadd_ps(in17, _mm512_set1_ps(2.5e-01f), in21);
in9 = _mm512_fmadd_ps(in9, _mm512_set1_ps(4e+00f), in13);
in17 = _mm512_fmadd_ps(in17, _mm512_set1_ps(4e+00f), in21);
__m512 tmp101 = _mm512_sub_ps(tmp100, tmp98);
__m512 tmp105 = _mm512_sub_ps(tmp104, tmp102);
tmp100 = _mm512_add_ps(tmp98, tmp100);
tmp104 = _mm512_add_ps(tmp102, tmp104);
tmp98 = _mm512_fmadd_ps(in8, _mm512_set1_ps(2.5e-01f), in12);
tmp102 = _mm512_fmadd_ps(in16, _mm512_set1_ps(2.5e-01f), in20);
tmp99 = _mm512_fmadd_ps(in11, _mm512_set1_ps(-1.25e+00f), tmp99);
tmp103 = _mm512_fmadd_ps(in19, _mm512_set1_ps(-1.25e+00f), tmp103);
in11 = _mm512_fmadd_ps(in11, _mm512_set1_ps(-5e+00f), in9);
in19 = _mm512_fmadd_ps(in19, _mm512_set1_ps(-5e+00f), in17);
tmp98 = _mm512_fmadd_ps(in10, _mm512_set1_ps(-1.25e+00f), tmp98);
tmp102 = _mm512_fmadd_ps(in18, _mm512_set1_ps(-1.25e+00f), tmp102);
in13 = _mm512_fmadd_ps(tmp98, _mm512_set1_ps(2e+00f), tmp99);
in21 = _mm512_fmadd_ps(tmp102, _mm512_set1_ps(2e+00f), tmp103);
tmp99 = _mm512_fnmadd_ps(tmp98, _mm512_set1_ps(2e+00f), tmp99);
tmp103 = _mm512_fnmadd_ps(tmp102, _mm512_set1_ps(2e+00f), tmp103);
tmp98 = _mm512_fmadd_ps(in12, _mm512_set1_ps(2.5e-01f), in8);
tmp102 = _mm512_fmadd_ps(in20, _mm512_set1_ps(2.5e-01f), in16);
in8 = _mm512_sub_ps(in14, in8);
in16 = _mm512_sub_ps(in22, in16);
tmp98 = _mm512_fmadd_ps(in10, _mm512_set1_ps(-1.25e+00f), tmp98);
tmp102 = _mm512_fmadd_ps(in18, _mm512_set1_ps(-1.25e+00f), tmp102);
in10 = _mm512_sub_ps(in10, in12);
in18 = _mm512_sub_ps(in18, in20);
in10 = _mm512_fmadd_ps(in10, _mm512_set1_ps(5.25e+00f), in8);
in18 = _mm512_fmadd_ps(in18, _mm512_set1_ps(5.25e+00f), in16);
in9 = _mm512_fmadd_ps(tmp98, _mm512_set1_ps(2e+00f), in11);
in17 = _mm512_fmadd_ps(tmp102, _mm512_set1_ps(2e+00f), in19);
in11 = _mm512_fnmadd_ps(tmp98, _mm512_set1_ps(2e+00f), in11);
in19 = _mm512_fnmadd_ps(tmp102, _mm512_set1_ps(2e+00f), in19);
// Register transpose, stage 1: interleave 32-bit elements of adjacent result
// rows (unpacklo/unpackhi) for both slabs.
__m512 tmp118 = _mm512_unpacklo_ps(in7, tmp100);
__m512 tmp119 = _mm512_unpackhi_ps(in7, tmp100);
__m512 tmp120 = _mm512_unpacklo_ps(tmp101, in13);
__m512 tmp121 = _mm512_unpackhi_ps(tmp101, in13);
__m512 tmp122 = _mm512_unpacklo_ps(tmp99, in9);
__m512 tmp123 = _mm512_unpackhi_ps(tmp99, in9);
__m512 tmp124 = _mm512_unpacklo_ps(in11, in10);
__m512 tmp125 = _mm512_unpackhi_ps(in11, in10);
__m512 tmp126 = _mm512_unpacklo_ps(in15, tmp104);
__m512 tmp127 = _mm512_unpackhi_ps(in15, tmp104);
__m512 tmp128 = _mm512_unpacklo_ps(tmp105, in21);
__m512 tmp129 = _mm512_unpackhi_ps(tmp105, in21);
__m512 tmp130 = _mm512_unpacklo_ps(tmp103, in17);
__m512 tmp131 = _mm512_unpackhi_ps(tmp103, in17);
__m512 tmp132 = _mm512_unpacklo_ps(in19, in18);
__m512 tmp133 = _mm512_unpackhi_ps(in19, in18);
// Stage 2: combine 64-bit pairs within each 128-bit lane
// (immediate 68 = low pairs, 238 = high pairs).
__m512 tmp134 = _mm512_shuffle_ps(tmp118, tmp120, 68);
__m512 tmp135 = _mm512_shuffle_ps(tmp118, tmp120, 238);
__m512 tmp136 = _mm512_shuffle_ps(tmp119, tmp121, 68);
__m512 tmp137 = _mm512_shuffle_ps(tmp119, tmp121, 238);
__m512 tmp138 = _mm512_shuffle_ps(tmp122, tmp124, 68);
__m512 tmp139 = _mm512_shuffle_ps(tmp122, tmp124, 238);
__m512 tmp140 = _mm512_shuffle_ps(tmp123, tmp125, 68);
__m512 tmp141 = _mm512_shuffle_ps(tmp123, tmp125, 238);
__m512 tmp142 = _mm512_shuffle_ps(tmp126, tmp128, 68);
__m512 tmp143 = _mm512_shuffle_ps(tmp126, tmp128, 238);
__m512 tmp144 = _mm512_shuffle_ps(tmp127, tmp129, 68);
__m512 tmp145 = _mm512_shuffle_ps(tmp127, tmp129, 238);
__m512 tmp146 = _mm512_shuffle_ps(tmp130, tmp132, 68);
__m512 tmp147 = _mm512_shuffle_ps(tmp130, tmp132, 238);
__m512 tmp148 = _mm512_shuffle_ps(tmp131, tmp133, 68);
__m512 tmp149 = _mm512_shuffle_ps(tmp131, tmp133, 238);
// Stage 3: gather 128-bit lanes (136 = even lanes of both operands,
// 221 = odd lanes of both operands).
__m512 tmp150 = _mm512_shuffle_f32x4(tmp134, tmp138, 136);
__m512 tmp151 = _mm512_shuffle_f32x4(tmp134, tmp138, 221);
__m512 tmp152 = _mm512_shuffle_f32x4(tmp135, tmp139, 136);
__m512 tmp153 = _mm512_shuffle_f32x4(tmp135, tmp139, 221);
__m512 tmp154 = _mm512_shuffle_f32x4(tmp136, tmp140, 136);
__m512 tmp155 = _mm512_shuffle_f32x4(tmp136, tmp140, 221);
__m512 tmp156 = _mm512_shuffle_f32x4(tmp137, tmp141, 136);
__m512 tmp157 = _mm512_shuffle_f32x4(tmp137, tmp141, 221);
__m512 tmp158 = _mm512_shuffle_f32x4(tmp142, tmp146, 136);
__m512 tmp159 = _mm512_shuffle_f32x4(tmp142, tmp146, 221);
__m512 tmp160 = _mm512_shuffle_f32x4(tmp143, tmp147, 136);
__m512 tmp161 = _mm512_shuffle_f32x4(tmp143, tmp147, 221);
__m512 tmp162 = _mm512_shuffle_f32x4(tmp144, tmp148, 136);
__m512 tmp163 = _mm512_shuffle_f32x4(tmp144, tmp148, 221);
__m512 tmp164 = _mm512_shuffle_f32x4(tmp145, tmp149, 136);
__m512 tmp165 = _mm512_shuffle_f32x4(tmp145, tmp149, 221);
// Stage 4: merge the two slabs' lanes into the transposed vectors that feed
// the second pass.
in7 = _mm512_shuffle_f32x4(tmp150, tmp158, 136);
in15 = _mm512_shuffle_f32x4(tmp150, tmp158, 221);
tmp100 = _mm512_shuffle_f32x4(tmp152, tmp160, 136);
tmp104 = _mm512_shuffle_f32x4(tmp152, tmp160, 221);
tmp101 = _mm512_shuffle_f32x4(tmp154, tmp162, 136);
tmp105 = _mm512_shuffle_f32x4(tmp154, tmp162, 221);
in13 = _mm512_shuffle_f32x4(tmp156, tmp164, 136);
in21 = _mm512_shuffle_f32x4(tmp156, tmp164, 221);
tmp99 = _mm512_shuffle_f32x4(tmp151, tmp159, 136);
in9 = _mm512_shuffle_f32x4(tmp153, tmp161, 136);
in11 = _mm512_shuffle_f32x4(tmp155, tmp163, 136);
in10 = _mm512_shuffle_f32x4(tmp157, tmp165, 136);
// These four transposed vectors are computed but never read by the second
// pass; the casts mark them as intentionally unused.
(void)tmp99;
(void)in9;
(void)in11;
(void)in10;
// Second transform pass. Only four inputs per set take part
// (in7, tmp100, tmp101, in13 and in15, tmp104, tmp105, in21), so the same
// transform collapses to the shorter sequence below. Self-assignments such as
// "in7 = in7;" are no-ops.
__m512 tmp106 = tmp100;
__m512 tmp112 = tmp104;
__m512 tmp107 = _mm512_sub_ps(_mm512_setzero_ps(), tmp101);
__m512 tmp113 = _mm512_sub_ps(_mm512_setzero_ps(), tmp105);
__m512 tmp108 = tmp101;
__m512 tmp114 = tmp105;
in7 = in7;
in15 = in15;
tmp106 = _mm512_fmadd_ps(in13, _mm512_set1_ps(-4.25e+00f), tmp106);
tmp112 = _mm512_fmadd_ps(in21, _mm512_set1_ps(-4.25e+00f), tmp112);
tmp108 = tmp108;
tmp114 = tmp114;
in7 = _mm512_fmadd_ps(tmp107, _mm512_set1_ps(5.25e+00f), in7);
in15 = _mm512_fmadd_ps(tmp113, _mm512_set1_ps(5.25e+00f), in15);
tmp107 = _mm512_mul_ps(tmp101, _mm512_set1_ps(2.5e-01f));
tmp113 = _mm512_mul_ps(tmp105, _mm512_set1_ps(2.5e-01f));
tmp101 = _mm512_mul_ps(tmp101, _mm512_set1_ps(4e+00f));
tmp105 = _mm512_mul_ps(tmp105, _mm512_set1_ps(4e+00f));
__m512 tmp109 = _mm512_sub_ps(tmp108, tmp106);
__m512 tmp115 = _mm512_sub_ps(tmp114, tmp112);
tmp108 = _mm512_add_ps(tmp106, tmp108);
tmp114 = _mm512_add_ps(tmp112, tmp114);
tmp106 = _mm512_mul_ps(tmp100, _mm512_set1_ps(2.5e-01f));
tmp112 = _mm512_mul_ps(tmp104, _mm512_set1_ps(2.5e-01f));
tmp107 = tmp107;
tmp113 = tmp113;
__m512 tmp110 = tmp101;
__m512 tmp116 = tmp105;
tmp106 = _mm512_fmadd_ps(in13, _mm512_set1_ps(-1.25e+00f), tmp106);
tmp112 = _mm512_fmadd_ps(in21, _mm512_set1_ps(-1.25e+00f), tmp112);
__m512 tmp111 = _mm512_fmadd_ps(tmp106, _mm512_set1_ps(2e+00f), tmp107);
__m512 tmp117 = _mm512_fmadd_ps(tmp112, _mm512_set1_ps(2e+00f), tmp113);
tmp107 = _mm512_fnmadd_ps(tmp106, _mm512_set1_ps(2e+00f), tmp107);
tmp113 = _mm512_fnmadd_ps(tmp112, _mm512_set1_ps(2e+00f), tmp113);
tmp106 = tmp100;
tmp112 = tmp104;
tmp100 = _mm512_sub_ps(_mm512_setzero_ps(), tmp100);
tmp104 = _mm512_sub_ps(_mm512_setzero_ps(), tmp104);
tmp106 = _mm512_fmadd_ps(in13, _mm512_set1_ps(-1.25e+00f), tmp106);
tmp112 = _mm512_fmadd_ps(in21, _mm512_set1_ps(-1.25e+00f), tmp112);
in13 = in13;
in21 = in21;
in13 = _mm512_fmadd_ps(in13, _mm512_set1_ps(5.25e+00f), tmp100);
in21 = _mm512_fmadd_ps(in21, _mm512_set1_ps(5.25e+00f), tmp104);
tmp101 = _mm512_fmadd_ps(tmp106, _mm512_set1_ps(2e+00f), tmp110);
tmp105 = _mm512_fmadd_ps(tmp112, _mm512_set1_ps(2e+00f), tmp116);
tmp110 = _mm512_fnmadd_ps(tmp106, _mm512_set1_ps(2e+00f), tmp110);
tmp116 = _mm512_fnmadd_ps(tmp112, _mm512_set1_ps(2e+00f), tmp116);
// Pair up result rows: immediate 68 joins the low 256 bits of both operands,
// 238 joins their high 256 bits.
__m512 out29 = _mm512_shuffle_f32x4(in7, tmp108, 68);
__m512 out37 = _mm512_shuffle_f32x4(in7, tmp108, 238);
__m512 out30 = _mm512_shuffle_f32x4(tmp109, tmp111, 68);
__m512 out38 = _mm512_shuffle_f32x4(tmp109, tmp111, 238);
__m512 out31 = _mm512_shuffle_f32x4(tmp107, tmp101, 68);
__m512 out39 = _mm512_shuffle_f32x4(tmp107, tmp101, 238);
__m512 out32 = _mm512_shuffle_f32x4(tmp110, in13, 68);
__m512 out40 = _mm512_shuffle_f32x4(tmp110, in13, 238);
__m512 out33 = _mm512_shuffle_f32x4(in15, tmp114, 68);
__m512 out41 = _mm512_shuffle_f32x4(in15, tmp114, 238);
__m512 out34 = _mm512_shuffle_f32x4(tmp115, tmp117, 68);
__m512 out42 = _mm512_shuffle_f32x4(tmp115, tmp117, 238);
__m512 out35 = _mm512_shuffle_f32x4(tmp113, tmp105, 68);
__m512 out43 = _mm512_shuffle_f32x4(tmp113, tmp105, 238);
__m512 out36 = _mm512_shuffle_f32x4(tmp116, in21, 68);
__m512 out44 = _mm512_shuffle_f32x4(tmp116, in21, 238);
// Store 16 vectors: four 64-byte vectors (256 bytes per k3) into each of four
// destination regions spaced 3200 bytes apart (bases 0, 3200, 6400, 9600).
_mm512_storeu_ps(dfPtr1+0+12800*i6+9600*j2+3200*s3+256*k3, out29);
_mm512_storeu_ps(dfPtr1+128+12800*i6+9600*j2+3200*s3+256*k3, out37);
_mm512_storeu_ps(dfPtr1+64+12800*i6+9600*j2+3200*s3+256*k3, out33);
_mm512_storeu_ps(dfPtr1+192+12800*i6+9600*j2+3200*s3+256*k3, out41);
_mm512_storeu_ps(dfPtr1+3200+12800*i6+9600*j2+3200*s3+256*k3, out30);
_mm512_storeu_ps(dfPtr1+3328+12800*i6+9600*j2+3200*s3+256*k3, out38);
_mm512_storeu_ps(dfPtr1+3264+12800*i6+9600*j2+3200*s3+256*k3, out34);
_mm512_storeu_ps(dfPtr1+3392+12800*i6+9600*j2+3200*s3+256*k3, out42);
_mm512_storeu_ps(dfPtr1+6400+12800*i6+9600*j2+3200*s3+256*k3, out31);
_mm512_storeu_ps(dfPtr1+6528+12800*i6+9600*j2+3200*s3+256*k3, out39);
_mm512_storeu_ps(dfPtr1+6464+12800*i6+9600*j2+3200*s3+256*k3, out35);
_mm512_storeu_ps(dfPtr1+6592+12800*i6+9600*j2+3200*s3+256*k3, out43);
_mm512_storeu_ps(dfPtr1+9600+12800*i6+9600*j2+3200*s3+256*k3, out32);
_mm512_storeu_ps(dfPtr1+9728+12800*i6+9600*j2+3200*s3+256*k3, out40);
_mm512_storeu_ps(dfPtr1+9664+12800*i6+9600*j2+3200*s3+256*k3, out36);
_mm512_storeu_ps(dfPtr1+9792+12800*i6+9600*j2+3200*s3+256*k3, out44);
}
// Remainder: k3 == 12 here. One more 176-byte slab is processed with the
// same pipeline as the loop body, minus the paired second slab, so only two
// vectors (128 bytes) are written per destination region.
__m512 dat27 = _mm512_maskz_loadu_ps(15, datPtr1+0+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat28 = _mm512_maskz_loadu_ps(15, datPtr1+96+4400*i6+16*h1+4*w1+4400*s3+352*k3);
// Same lane layout as pm5 in the loop.
__m512i pm7 = _mm512_set_epi32(15, 15, 15, 15, 19, 18, 17, 16, 15, 15, 15, 15, 3, 2, 1, 0);
__m512 in23 = _mm512_permutex2var_ps(dat27, pm7, dat28);
__m512 dat29 = _mm512_maskz_loadu_ps(15, datPtr1+16+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat30 = _mm512_maskz_loadu_ps(15, datPtr1+112+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 in24 = _mm512_permutex2var_ps(dat29, pm7, dat30);
__m512 dat31 = _mm512_maskz_loadu_ps(15, datPtr1+32+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat32 = _mm512_maskz_loadu_ps(15, datPtr1+128+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 in25 = _mm512_permutex2var_ps(dat31, pm7, dat32);
__m512 dat33 = _mm512_maskz_loadu_ps(15, datPtr1+48+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat34 = _mm512_maskz_loadu_ps(15, datPtr1+144+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 in26 = _mm512_permutex2var_ps(dat33, pm7, dat34);
__m512 dat35 = _mm512_maskz_loadu_ps(15, datPtr1+64+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 dat36 = _mm512_maskz_loadu_ps(15, datPtr1+160+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 in27 = _mm512_permutex2var_ps(dat35, pm7, dat36);
__m512 dat37 = _mm512_maskz_loadu_ps(15, datPtr1+80+4400*i6+16*h1+4*w1+4400*s3+352*k3);
// Same lane layout as pm6 in the loop.
__m512i pm8 = _mm512_set_epi32(15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 3, 2, 1, 0);
__m512 in28 = _mm512_permutexvar_ps(pm8, dat37);
__m512 dat38 = _mm512_maskz_loadu_ps(15, datPtr1+96+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 in29 = _mm512_permutexvar_ps(pm8, dat38);
__m512 dat39 = _mm512_maskz_loadu_ps(15, datPtr1+112+4400*i6+16*h1+4*w1+4400*s3+352*k3);
__m512 in30 = _mm512_permutexvar_ps(pm8, dat39);
// First transform pass on in23..in30 (same operation sequence as the loop).
__m512 tmp166 = _mm512_add_ps(in24, in28);
__m512 tmp167 = _mm512_sub_ps(in27, in25);
__m512 tmp168 = _mm512_add_ps(in25, in29);
in23 = _mm512_sub_ps(in23, in29);
tmp166 = _mm512_fmadd_ps(in26, _mm512_set1_ps(-4.25e+00f), tmp166);
tmp168 = _mm512_fmadd_ps(in27, _mm512_set1_ps(-4.25e+00f), tmp168);
in23 = _mm512_fmadd_ps(tmp167, _mm512_set1_ps(5.25e+00f), in23);
tmp167 = _mm512_fmadd_ps(in25, _mm512_set1_ps(2.5e-01f), in29);
in25 = _mm512_fmadd_ps(in25, _mm512_set1_ps(4e+00f), in29);
__m512 tmp169 = _mm512_sub_ps(tmp168, tmp166);
tmp168 = _mm512_add_ps(tmp166, tmp168);
tmp166 = _mm512_fmadd_ps(in24, _mm512_set1_ps(2.5e-01f), in28);
tmp167 = _mm512_fmadd_ps(in27, _mm512_set1_ps(-1.25e+00f), tmp167);
in27 = _mm512_fmadd_ps(in27, _mm512_set1_ps(-5e+00f), in25);
tmp166 = _mm512_fmadd_ps(in26, _mm512_set1_ps(-1.25e+00f), tmp166);
in29 = _mm512_fmadd_ps(tmp166, _mm512_set1_ps(2e+00f), tmp167);
tmp167 = _mm512_fnmadd_ps(tmp166, _mm512_set1_ps(2e+00f), tmp167);
tmp166 = _mm512_fmadd_ps(in28, _mm512_set1_ps(2.5e-01f), in24);
in24 = _mm512_sub_ps(in30, in24);
tmp166 = _mm512_fmadd_ps(in26, _mm512_set1_ps(-1.25e+00f), tmp166);
in26 = _mm512_sub_ps(in26, in28);
in26 = _mm512_fmadd_ps(in26, _mm512_set1_ps(5.25e+00f), in24);
in25 = _mm512_fmadd_ps(tmp166, _mm512_set1_ps(2e+00f), in27);
in27 = _mm512_fnmadd_ps(tmp166, _mm512_set1_ps(2e+00f), in27);
// Register transpose (same unpack / shuffle_ps / shuffle_f32x4 network as the
// loop, with a single slab).
__m512 tmp186 = _mm512_unpacklo_ps(in23, tmp168);
__m512 tmp187 = _mm512_unpackhi_ps(in23, tmp168);
__m512 tmp188 = _mm512_unpacklo_ps(tmp169, in29);
__m512 tmp189 = _mm512_unpackhi_ps(tmp169, in29);
__m512 tmp190 = _mm512_unpacklo_ps(tmp167, in25);
__m512 tmp191 = _mm512_unpackhi_ps(tmp167, in25);
__m512 tmp192 = _mm512_unpacklo_ps(in27, in26);
__m512 tmp193 = _mm512_unpackhi_ps(in27, in26);
__m512 tmp194 = _mm512_shuffle_ps(tmp186, tmp188, 68);
__m512 tmp195 = _mm512_shuffle_ps(tmp186, tmp188, 238);
__m512 tmp196 = _mm512_shuffle_ps(tmp187, tmp189, 68);
__m512 tmp197 = _mm512_shuffle_ps(tmp187, tmp189, 238);
__m512 tmp198 = _mm512_shuffle_ps(tmp190, tmp192, 68);
__m512 tmp199 = _mm512_shuffle_ps(tmp190, tmp192, 238);
__m512 tmp200 = _mm512_shuffle_ps(tmp191, tmp193, 68);
__m512 tmp201 = _mm512_shuffle_ps(tmp191, tmp193, 238);
__m512 tmp202 = _mm512_shuffle_f32x4(tmp194, tmp198, 136);
__m512 tmp203 = _mm512_shuffle_f32x4(tmp194, tmp198, 221);
__m512 tmp204 = _mm512_shuffle_f32x4(tmp195, tmp199, 136);
__m512 tmp205 = _mm512_shuffle_f32x4(tmp195, tmp199, 221);
__m512 tmp206 = _mm512_shuffle_f32x4(tmp196, tmp200, 136);
__m512 tmp207 = _mm512_shuffle_f32x4(tmp196, tmp200, 221);
__m512 tmp208 = _mm512_shuffle_f32x4(tmp197, tmp201, 136);
__m512 tmp209 = _mm512_shuffle_f32x4(tmp197, tmp201, 221);
// With only one slab, each vector is shuffled against itself: 136 picks its
// even 128-bit lanes, 221 its odd ones.
in23 = _mm512_shuffle_f32x4(tmp202, tmp202, 136);
__m512 tmp170 = _mm512_shuffle_f32x4(tmp202, tmp202, 221);
tmp168 = _mm512_shuffle_f32x4(tmp204, tmp204, 136);
__m512 tmp171 = _mm512_shuffle_f32x4(tmp204, tmp204, 221);
tmp169 = _mm512_shuffle_f32x4(tmp206, tmp206, 136);
__m512 tmp172 = _mm512_shuffle_f32x4(tmp206, tmp206, 221);
in29 = _mm512_shuffle_f32x4(tmp208, tmp208, 136);
__m512 tmp173 = _mm512_shuffle_f32x4(tmp208, tmp208, 221);
tmp167 = _mm512_shuffle_f32x4(tmp203, tmp203, 136);
in25 = _mm512_shuffle_f32x4(tmp205, tmp205, 136);
in27 = _mm512_shuffle_f32x4(tmp207, tmp207, 136);
in26 = _mm512_shuffle_f32x4(tmp209, tmp209, 136);
// Not read by the second pass; casts mark them as intentionally unused.
(void)tmp167;
(void)in25;
(void)in27;
(void)in26;
// Second transform pass on (in23, tmp168, tmp169, in29) and
// (tmp170, tmp171, tmp172, tmp173); same operation sequence as the loop.
__m512 tmp174 = tmp168;
__m512 tmp180 = tmp171;
__m512 tmp175 = _mm512_sub_ps(_mm512_setzero_ps(), tmp169);
__m512 tmp181 = _mm512_sub_ps(_mm512_setzero_ps(), tmp172);
__m512 tmp176 = tmp169;
__m512 tmp182 = tmp172;
in23 = in23;
tmp170 = tmp170;
tmp174 = _mm512_fmadd_ps(in29, _mm512_set1_ps(-4.25e+00f), tmp174);
tmp180 = _mm512_fmadd_ps(tmp173, _mm512_set1_ps(-4.25e+00f), tmp180);
tmp176 = tmp176;
tmp182 = tmp182;
in23 = _mm512_fmadd_ps(tmp175, _mm512_set1_ps(5.25e+00f), in23);
tmp170 = _mm512_fmadd_ps(tmp181, _mm512_set1_ps(5.25e+00f), tmp170);
tmp175 = _mm512_mul_ps(tmp169, _mm512_set1_ps(2.5e-01f));
tmp181 = _mm512_mul_ps(tmp172, _mm512_set1_ps(2.5e-01f));
tmp169 = _mm512_mul_ps(tmp169, _mm512_set1_ps(4e+00f));
tmp172 = _mm512_mul_ps(tmp172, _mm512_set1_ps(4e+00f));
__m512 tmp177 = _mm512_sub_ps(tmp176, tmp174);
__m512 tmp183 = _mm512_sub_ps(tmp182, tmp180);
tmp176 = _mm512_add_ps(tmp174, tmp176);
tmp182 = _mm512_add_ps(tmp180, tmp182);
tmp174 = _mm512_mul_ps(tmp168, _mm512_set1_ps(2.5e-01f));
tmp180 = _mm512_mul_ps(tmp171, _mm512_set1_ps(2.5e-01f));
tmp175 = tmp175;
tmp181 = tmp181;
__m512 tmp178 = tmp169;
__m512 tmp184 = tmp172;
tmp174 = _mm512_fmadd_ps(in29, _mm512_set1_ps(-1.25e+00f), tmp174);
tmp180 = _mm512_fmadd_ps(tmp173, _mm512_set1_ps(-1.25e+00f), tmp180);
__m512 tmp179 = _mm512_fmadd_ps(tmp174, _mm512_set1_ps(2e+00f), tmp175);
__m512 tmp185 = _mm512_fmadd_ps(tmp180, _mm512_set1_ps(2e+00f), tmp181);
tmp175 = _mm512_fnmadd_ps(tmp174, _mm512_set1_ps(2e+00f), tmp175);
tmp181 = _mm512_fnmadd_ps(tmp180, _mm512_set1_ps(2e+00f), tmp181);
tmp174 = tmp168;
tmp180 = tmp171;
tmp168 = _mm512_sub_ps(_mm512_setzero_ps(), tmp168);
tmp171 = _mm512_sub_ps(_mm512_setzero_ps(), tmp171);
tmp174 = _mm512_fmadd_ps(in29, _mm512_set1_ps(-1.25e+00f), tmp174);
tmp180 = _mm512_fmadd_ps(tmp173, _mm512_set1_ps(-1.25e+00f), tmp180);
in29 = in29;
tmp173 = tmp173;
in29 = _mm512_fmadd_ps(in29, _mm512_set1_ps(5.25e+00f), tmp168);
tmp173 = _mm512_fmadd_ps(tmp173, _mm512_set1_ps(5.25e+00f), tmp171);
tmp169 = _mm512_fmadd_ps(tmp174, _mm512_set1_ps(2e+00f), tmp178);
tmp172 = _mm512_fmadd_ps(tmp180, _mm512_set1_ps(2e+00f), tmp184);
tmp178 = _mm512_fnmadd_ps(tmp174, _mm512_set1_ps(2e+00f), tmp178);
tmp184 = _mm512_fnmadd_ps(tmp180, _mm512_set1_ps(2e+00f), tmp184);
// Only the low-256-bit pairing (immediate 68) is needed for a single slab.
__m512 out45 = _mm512_shuffle_f32x4(in23, tmp176, 68);
__m512 out46 = _mm512_shuffle_f32x4(tmp177, tmp179, 68);
__m512 out47 = _mm512_shuffle_f32x4(tmp175, tmp169, 68);
__m512 out48 = _mm512_shuffle_f32x4(tmp178, in29, 68);
__m512 out49 = _mm512_shuffle_f32x4(tmp170, tmp182, 68);
__m512 out50 = _mm512_shuffle_f32x4(tmp183, tmp185, 68);
__m512 out51 = _mm512_shuffle_f32x4(tmp181, tmp172, 68);
__m512 out52 = _mm512_shuffle_f32x4(tmp184, tmp173, 68);
// Two vectors per destination region; with k3 == 12 the last store ends at
// byte 3200 of each region (256*12 + 128), i.e. exactly at the region stride.
_mm512_storeu_ps(dfPtr1+0+12800*i6+9600*j2+3200*s3+256*k3, out45);
_mm512_storeu_ps(dfPtr1+64+12800*i6+9600*j2+3200*s3+256*k3, out49);
_mm512_storeu_ps(dfPtr1+3200+12800*i6+9600*j2+3200*s3+256*k3, out46);
_mm512_storeu_ps(dfPtr1+3264+12800*i6+9600*j2+3200*s3+256*k3, out50);
_mm512_storeu_ps(dfPtr1+6400+12800*i6+9600*j2+3200*s3+256*k3, out47);
_mm512_storeu_ps(dfPtr1+6464+12800*i6+9600*j2+3200*s3+256*k3, out51);
_mm512_storeu_ps(dfPtr1+9600+12800*i6+9600*j2+3200*s3+256*k3, out48);
_mm512_storeu_ps(dfPtr1+9664+12800*i6+9600*j2+3200*s3+256*k3, out52);
// NOTE(review): j2 is not read after this increment.
++j2;
}

// Builds a task descriptor for Example11ThreeArrangeDats1Callee1 and hands it
// to Example11ThreaderDo1 on the given team. The task grid has four
// dimensions, each of extent 1; `tensors3` travels to the callee unchanged
// through the descriptor's `any1` field.
static void Example11ThreeArrangeDats1(Example11ThreaderTeam1* team15, char** tensors3) {
    Example11ThreaderTask1 datJob;

    // Grid shape: a single point in four dimensions.
    datJob.nd1 = 4;
    datJob.hull1[3] = 1;
    datJob.hull1[2] = 1;
    datJob.hull1[1] = 1;
    datJob.hull1[0] = 1;

    // Payload and entry point.
    datJob.any1 = tensors3;
    datJob.callee1 = Example11ThreeArrangeDats1Callee1;

    Example11ThreaderDo1(team15, &datJob);
}

// Task callee for the multiply-accumulate stage. For every index l1 in this
// task's range it multiplies half-precision weights (widened to float) by
// float data vectors and accumulates over an inner loop, then stores the
// float sums.
//
// task8->any1 points to a pair whose first entry is the char** tensor table:
//   tensors6[0] holds bias floats, followed at byte offset 896 by the
//               half-precision weights;
//   tensors6[1] holds the float data vectors read here;
//   tensors6[2] receives the float sums.
// pt9[0] (w2) selects the l1 range and pt9[2] (f2 -> j3) selects the 3200-byte
// data region, the 175232-byte weight region and the 28032-byte sum region.
static void Example11ThreeProduceSums1Callee1(Example11ThreaderTask1* task8, int64_t* pt9) {
void** pair2 = task8->any1;
char** tensors6 = pair2[0];
ptrdiff_t e3 = 0;
ptrdiff_t g4 = 0;
ptrdiff_t f2 = pt9[2];
ptrdiff_t d1 = 0;
ptrdiff_t w2 = pt9[0];
char*restrict bfPtr2 = tensors6[0]+876*e3;
char*restrict wfPtr2 = tensors6[0]+896+18529280*e3;
char*restrict dfPtr2 = tensors6[1]+338432*e3;
char*restrict sfPtr1 = tensors6[2];
ptrdiff_t i7 = 1*g4;
ptrdiff_t j3 = 1*f2;
ptrdiff_t k4 = 1*d1;
// l1 starts at 27*w2. ll1 is the last index this task handles inside the
// loop: for w2 == 0 that is 26 (the loop returns after storing l1 == 26);
// for w2 == 1 it is 54, which the loop never reaches because it stops at
// l1 == 54 and falls through to the narrower remainder code below.
ptrdiff_t l1 = 27*w2;
ptrdiff_t ll1 = l1+(w2 < 1 ? 26 : 27);
// Full-width groups: each l1 produces four accumulator pairs
// (sum2/3, sum4/5, sum6/7, sum8/9), 512 bytes of output in total.
for (; l1 != 54; ++l1) {
__m512 sum2;
__m512 sum4;
__m512 sum6;
__m512 sum8;
// Only when j3 == 0: seed lane 9 (mask 512) of each accumulator with one
// float read from the bias area (four consecutive floats per l1); every other
// lane, and every accumulator for j3 != 0, starts at zero.
if (__builtin_expect(!j3, 0)) {
sum2 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr2+0+876*i7+16*l1)));
sum4 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr2+4+876*i7+16*l1)));
sum6 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr2+8+876*i7+16*l1)));
sum8 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr2+12+876*i7+16*l1)));
} else {
sum2 = _mm512_setzero_ps();
sum4 = _mm512_setzero_ps();
sum6 = _mm512_setzero_ps();
sum8 = _mm512_setzero_ps();
}
// The odd-numbered accumulators start from the same values as their partners.
__m512 sum3 = sum2;
__m512 sum5 = sum4;
__m512 sum7 = sum6;
__m512 sum9 = sum8;
ptrdiff_t b3 = 0;
// 25 accumulation steps. Each step reads 128 bytes of weights (two 64-byte
// loads, each split into two groups of 16 half floats) and two 64-byte data
// vectors (df1, df2); every weight group multiplies both data vectors.
for (; b3 != 25; ++b3) {
__m512i wfs1 = _mm512_maskz_loadu_epi32(65535, wfPtr2+0+700928*i7+175232*j3+3200*l1+128*b3);
// Low 256 bits -> 16 floats.
__m512 wf29 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs1));
__m512 df1 = _mm512_loadu_ps(dfPtr2+0+12800*i7+3200*j3+9600*k4+128*b3);
sum2 = _mm512_fmadd_ps(wf29, df1, sum2);
__m512 df2 = _mm512_loadu_ps(dfPtr2+64+12800*i7+3200*j3+9600*k4+128*b3);
sum3 = _mm512_fmadd_ps(wf29, df2, sum3);
// High 256 bits -> 16 floats.
__m512 wf30 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs1, 1));
sum4 = _mm512_fmadd_ps(wf30, df1, sum4);
sum5 = _mm512_fmadd_ps(wf30, df2, sum5);
__m512i wfs2 = _mm512_maskz_loadu_epi32(65535, wfPtr2+64+700928*i7+175232*j3+3200*l1+128*b3);
__m512 wf31 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs2));
sum6 = _mm512_fmadd_ps(wf31, df1, sum6);
sum7 = _mm512_fmadd_ps(wf31, df2, sum7);
__m512 wf32 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs2, 1));
sum8 = _mm512_fmadd_ps(wf32, df1, sum8);
sum9 = _mm512_fmadd_ps(wf32, df2, sum9);
}
_mm512_storeu_ps(sfPtr1+0+112128*i7+28032*j3+84096*k4+512*l1, sum2);
_mm512_storeu_ps(sfPtr1+64+112128*i7+28032*j3+84096*k4+512*l1, sum3);
_mm512_storeu_ps(sfPtr1+128+112128*i7+28032*j3+84096*k4+512*l1, sum4);
_mm512_storeu_ps(sfPtr1+192+112128*i7+28032*j3+84096*k4+512*l1, sum5);
_mm512_storeu_ps(sfPtr1+256+112128*i7+28032*j3+84096*k4+512*l1, sum6);
_mm512_storeu_ps(sfPtr1+320+112128*i7+28032*j3+84096*k4+512*l1, sum7);
_mm512_storeu_ps(sfPtr1+384+112128*i7+28032*j3+84096*k4+512*l1, sum8);
_mm512_storeu_ps(sfPtr1+448+112128*i7+28032*j3+84096*k4+512*l1, sum9);
// Checked after the store, so index ll1 itself is still processed.
if (l1 >= ll1) return;
}
// Remainder group (reached with l1 == 54): only three accumulator pairs
// (sum10/11, sum12/13, sum14/15), so 384 bytes of output and 96 bytes of
// weights per accumulation step instead of 128.
__m512 sum10;
__m512 sum12;
__m512 sum14;
if (__builtin_expect(!j3, 0)) {
sum10 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr2+0+876*i7+16*l1)));
sum12 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr2+4+876*i7+16*l1)));
sum14 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr2+8+876*i7+16*l1)));
} else {
sum10 = _mm512_setzero_ps();
sum12 = _mm512_setzero_ps();
sum14 = _mm512_setzero_ps();
}
__m512 sum11 = sum10;
__m512 sum13 = sum12;
__m512 sum15 = sum14;
ptrdiff_t b4 = 0;
// Each iteration covers two accumulation steps: 192 bytes of weights (six
// groups of 16 half floats) and 256 bytes of data. The first three weight
// groups pair with df3/df4, the last three with df5/df6.
for (; b4 != 12; ++b4) {
__m512i wfs3 = _mm512_maskz_loadu_epi32(65535, wfPtr2+0+700928*i7+175232*j3+3200*l1+192*b4);
__m512 wf33 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs3));
__m512 df3 = _mm512_loadu_ps(dfPtr2+0+12800*i7+3200*j3+9600*k4+256*b4);
sum10 = _mm512_fmadd_ps(wf33, df3, sum10);
__m512 df4 = _mm512_loadu_ps(dfPtr2+64+12800*i7+3200*j3+9600*k4+256*b4);
sum11 = _mm512_fmadd_ps(wf33, df4, sum11);
__m512 wf34 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs3, 1));
sum12 = _mm512_fmadd_ps(wf34, df3, sum12);
sum13 = _mm512_fmadd_ps(wf34, df4, sum13);
__m512i wfs4 = _mm512_maskz_loadu_epi32(65535, wfPtr2+64+700928*i7+175232*j3+3200*l1+192*b4);
__m512 wf35 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs4));
sum14 = _mm512_fmadd_ps(wf35, df3, sum14);
sum15 = _mm512_fmadd_ps(wf35, df4, sum15);
// The upper half of wfs4 already belongs to the second step of this pair.
__m512 wf36 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs4, 1));
__m512 df5 = _mm512_loadu_ps(dfPtr2+128+12800*i7+3200*j3+9600*k4+256*b4);
sum10 = _mm512_fmadd_ps(wf36, df5, sum10);
__m512 df6 = _mm512_loadu_ps(dfPtr2+192+12800*i7+3200*j3+9600*k4+256*b4);
sum11 = _mm512_fmadd_ps(wf36, df6, sum11);
__m512i wfs5 = _mm512_maskz_loadu_epi32(65535, wfPtr2+128+700928*i7+175232*j3+3200*l1+192*b4);
__m512 wf37 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs5));
sum12 = _mm512_fmadd_ps(wf37, df5, sum12);
sum13 = _mm512_fmadd_ps(wf37, df6, sum13);
__m512 wf38 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs5, 1));
sum14 = _mm512_fmadd_ps(wf38, df5, sum14);
sum15 = _mm512_fmadd_ps(wf38, df6, sum15);
}
// Final single accumulation step (b4 == 12), bringing the total to 25 steps
// as in the main loop.
__m512i wfs6 = _mm512_maskz_loadu_epi32(65535, wfPtr2+0+700928*i7+175232*j3+3200*l1+192*b4);
__m512 wf39 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs6));
__m512 df7 = _mm512_loadu_ps(dfPtr2+0+12800*i7+3200*j3+9600*k4+256*b4);
sum10 = _mm512_fmadd_ps(wf39, df7, sum10);
__m512 df8 = _mm512_loadu_ps(dfPtr2+64+12800*i7+3200*j3+9600*k4+256*b4);
sum11 = _mm512_fmadd_ps(wf39, df8, sum11);
__m512 wf40 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs6, 1));
sum12 = _mm512_fmadd_ps(wf40, df7, sum12);
sum13 = _mm512_fmadd_ps(wf40, df8, sum13);
// Mask 255 loads only the low eight dwords (32 bytes = 16 half floats): just
// the third weight group, without touching the 32 bytes after it.
__m512i wfs7 = _mm512_maskz_loadu_epi32(255, wfPtr2+64+700928*i7+175232*j3+3200*l1+192*b4);
__m512 wf41 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs7));
sum14 = _mm512_fmadd_ps(wf41, df7, sum14);
sum15 = _mm512_fmadd_ps(wf41, df8, sum15);
_mm512_storeu_ps(sfPtr1+0+112128*i7+28032*j3+84096*k4+512*l1, sum10);
_mm512_storeu_ps(sfPtr1+64+112128*i7+28032*j3+84096*k4+512*l1, sum11);
_mm512_storeu_ps(sfPtr1+128+112128*i7+28032*j3+84096*k4+512*l1, sum12);
_mm512_storeu_ps(sfPtr1+192+112128*i7+28032*j3+84096*k4+512*l1, sum13);
_mm512_storeu_ps(sfPtr1+256+112128*i7+28032*j3+84096*k4+512*l1, sum14);
_mm512_storeu_ps(sfPtr1+320+112128*i7+28032*j3+84096*k4+512*l1, sum15);
}

// Builds a task descriptor for Example11ThreeProduceSums1Callee1 and hands it
// to Example11ThreaderDo1 on the given team. The task grid has four
// dimensions with extents 2 x 1 x 4 x 1. The callee receives a two-slot
// pointer array through `any1`: slot 0 is `tensors5`, slot 1 is null.
static void Example11ThreeProduceSums1(Example11ThreaderTeam1* team16, char** tensors5) {
    void* sumArgs[] = {tensors5, 0};
    Example11ThreaderTask1 sumJob;

    // Grid shape: dimensions 0 and 2 are the ones actually split.
    sumJob.nd1 = 4;
    sumJob.hull1[3] = 1;
    sumJob.hull1[2] = 4;
    sumJob.hull1[1] = 1;
    sumJob.hull1[0] = 2;

    // Payload and entry point.
    sumJob.any1 = sumArgs;
    sumJob.callee1 = Example11ThreeProduceSums1Callee1;

    Example11ThreaderDo1(team16, &sumJob);
}

// Threader callee that turns the sums stored in tensors8[0] into values
// written to tensors8[1].
//
// task10: task descriptor; only task10->any1 (the tensor pointer table) is read.
// pt10:   task coordinates supplied by the threader; ignored here.
//
// The body loads 64-byte vectors from four regions of the sums buffer spaced
// 28032 bytes apart, forms fixed linear combinations of them (weights 1, 2, 4,
// 8, 16, 32), rearranges lanes with unpack/shuffle, applies a second round of
// the same kind of combinations, and writes the results with masked stores.
// NOTE(review): this looks like the output transform of a Winograd-style 3x3
// convolution - confirm against the generator before relying on that reading.
//
// Destination layout as used below: 15768 bytes per i8 step, 288 bytes per k5
// step, 144 bytes per l2/l3 step. 15768 = 219*72, which is consistent with the
// graph in the file header (219 output channels, 9x2 floats each) - presumably
// 72 bytes is one output channel.
static void Example11ThreeConsumeSums1Callee1(Example11ThreaderTask1* task10, int64_t* pt10) {
char** tensors8 = task10->any1;
// The three task coordinates are fixed at zero (the wrapper's hull is 1x1x1),
// so pt10 is never consulted.
ptrdiff_t w3 = 0;
ptrdiff_t d2 = 0;
ptrdiff_t g5 = 0;
(void)pt10;
// sfPtr2 is only loaded from and datPtr2 is only stored to in this function.
char*restrict sfPtr2 = tensors8[0];
char*restrict datPtr2 = tensors8[1];
ptrdiff_t i8 = 1*g5;
ptrdiff_t j4 = 1*d2;
ptrdiff_t rel2 = j4-0;
ptrdiff_t base2 = 0;
ptrdiff_t toH1 = base2+0;
ptrdiff_t toW1 = 0;
ptrdiff_t k5 = 55*w3;
// Main loop: k5 = 0..53. Each k5 step covers 512 bytes per source region and
// 288 destination bytes, split into two l2 halves (256 source / 144
// destination bytes each).
for (; k5 != 54; ++k5) {
ptrdiff_t l2 = 0;
for (; l2 != 2; ++l2) {
// Load four consecutive vectors from each of the four source regions
// (region bases +0, +28032, +56064, +84096). shuffle_f32x4 with 68 keeps the
// low 256 bits of both operands, with 238 the high 256 bits of both.
__m512 sf1 = _mm512_loadu_ps(sfPtr2+0+112128*i8+84096*j4+512*k5+256*l2);
__m512 sf2 = _mm512_loadu_ps(sfPtr2+64+112128*i8+84096*j4+512*k5+256*l2);
__m512 in31 = _mm512_shuffle_f32x4(sf1, sf2, 68);
__m512 in32 = _mm512_shuffle_f32x4(sf1, sf2, 238);
__m512 sf3 = _mm512_loadu_ps(sfPtr2+128+112128*i8+84096*j4+512*k5+256*l2);
__m512 sf4 = _mm512_loadu_ps(sfPtr2+192+112128*i8+84096*j4+512*k5+256*l2);
__m512 in39 = _mm512_shuffle_f32x4(sf3, sf4, 68);
__m512 in40 = _mm512_shuffle_f32x4(sf3, sf4, 238);
__m512 sf5 = _mm512_loadu_ps(sfPtr2+28032+112128*i8+84096*j4+512*k5+256*l2);
__m512 sf6 = _mm512_loadu_ps(sfPtr2+28096+112128*i8+84096*j4+512*k5+256*l2);
__m512 in33 = _mm512_shuffle_f32x4(sf5, sf6, 68);
__m512 in34 = _mm512_shuffle_f32x4(sf5, sf6, 238);
__m512 sf7 = _mm512_loadu_ps(sfPtr2+28160+112128*i8+84096*j4+512*k5+256*l2);
__m512 sf8 = _mm512_loadu_ps(sfPtr2+28224+112128*i8+84096*j4+512*k5+256*l2);
__m512 in41 = _mm512_shuffle_f32x4(sf7, sf8, 68);
__m512 in42 = _mm512_shuffle_f32x4(sf7, sf8, 238);
__m512 sf9 = _mm512_loadu_ps(sfPtr2+56064+112128*i8+84096*j4+512*k5+256*l2);
__m512 sf10 = _mm512_loadu_ps(sfPtr2+56128+112128*i8+84096*j4+512*k5+256*l2);
__m512 in35 = _mm512_shuffle_f32x4(sf9, sf10, 68);
__m512 in36 = _mm512_shuffle_f32x4(sf9, sf10, 238);
__m512 sf11 = _mm512_loadu_ps(sfPtr2+56192+112128*i8+84096*j4+512*k5+256*l2);
__m512 sf12 = _mm512_loadu_ps(sfPtr2+56256+112128*i8+84096*j4+512*k5+256*l2);
__m512 in43 = _mm512_shuffle_f32x4(sf11, sf12, 68);
__m512 in44 = _mm512_shuffle_f32x4(sf11, sf12, 238);
__m512 sf13 = _mm512_loadu_ps(sfPtr2+84096+112128*i8+84096*j4+512*k5+256*l2);
__m512 sf14 = _mm512_loadu_ps(sfPtr2+84160+112128*i8+84096*j4+512*k5+256*l2);
__m512 in37 = _mm512_shuffle_f32x4(sf13, sf14, 68);
__m512 in38 = _mm512_shuffle_f32x4(sf13, sf14, 238);
__m512 sf15 = _mm512_loadu_ps(sfPtr2+84224+112128*i8+84096*j4+512*k5+256*l2);
__m512 sf16 = _mm512_loadu_ps(sfPtr2+84288+112128*i8+84096*j4+512*k5+256*l2);
__m512 in45 = _mm512_shuffle_f32x4(sf15, sf16, 68);
__m512 in46 = _mm512_shuffle_f32x4(sf15, sf16, 238);
// The last high halves are not used by the combinations below.
(void)in38;
(void)in46;
// First pass, done twice in parallel (in31..in37 and in39..in45):
//   tmp214 = in31 + (in32+in33) + (in34+in35) + 32*(in36+in37)
//   tmp220 =        (in32-in33) + 2*(in34-in35) + 16*(in36-in37)
// tmp225 / tmp231 are the same formulas applied to in39..in45.
__m512 tmp218 = _mm512_add_ps(in32, in33);
__m512 tmp229 = _mm512_add_ps(in40, in41);
__m512 tmp217 = _mm512_add_ps(in34, in35);
__m512 tmp228 = _mm512_add_ps(in42, in43);
__m512 tmp223 = _mm512_sub_ps(in34, in35);
__m512 tmp234 = _mm512_sub_ps(in42, in43);
__m512 tmp222 = _mm512_sub_ps(in32, in33);
__m512 tmp233 = _mm512_sub_ps(in40, in41);
__m512 tmp219 = _mm512_add_ps(in36, in37);
__m512 tmp230 = _mm512_add_ps(in44, in45);
__m512 tmp224 = _mm512_sub_ps(in36, in37);
__m512 tmp235 = _mm512_sub_ps(in44, in45);
__m512 tmp221 = _mm512_fmadd_ps(tmp223, _mm512_set1_ps(2e+00f), tmp222);
__m512 tmp232 = _mm512_fmadd_ps(tmp234, _mm512_set1_ps(2e+00f), tmp233);
__m512 tmp216 = _mm512_add_ps(tmp217, tmp218);
__m512 tmp227 = _mm512_add_ps(tmp228, tmp229);
__m512 tmp220 = _mm512_fmadd_ps(tmp224, _mm512_set1_ps(1.6e+01f), tmp221);
__m512 tmp231 = _mm512_fmadd_ps(tmp235, _mm512_set1_ps(1.6e+01f), tmp232);
__m512 tmp215 = _mm512_add_ps(tmp216, in31);
__m512 tmp226 = _mm512_add_ps(tmp227, in39);
__m512 tmp214 = _mm512_fmadd_ps(tmp219, _mm512_set1_ps(3.2e+01f), tmp215);
__m512 tmp225 = _mm512_fmadd_ps(tmp230, _mm512_set1_ps(3.2e+01f), tmp226);
__m512 tmp210 = tmp214;
__m512 tmp212 = tmp225;
__m512 tmp211 = tmp220;
__m512 tmp213 = tmp231;
// Lane rearrangement of the four first-pass results together with four
// all-zero vectors (unpack -> shuffle_ps -> shuffle_f32x4). The results are
// written back into tmp210..tmp213 / tmp236..tmp239 plus tmp240..tmp247.
// NOTE(review): presumably a transpose so the second pass runs along the
// other axis - confirm.
__m512 tmp236 = _mm512_setzero_ps();
__m512 tmp237 = _mm512_setzero_ps();
__m512 tmp238 = _mm512_setzero_ps();
__m512 tmp239 = _mm512_setzero_ps();
__m512 tmp281 = _mm512_unpacklo_ps(tmp210, tmp211);
__m512 tmp282 = _mm512_unpackhi_ps(tmp210, tmp211);
__m512 tmp283 = _mm512_unpacklo_ps(tmp236, tmp237);
__m512 tmp284 = _mm512_unpackhi_ps(tmp236, tmp237);
__m512 tmp285 = _mm512_unpacklo_ps(tmp238, tmp239);
__m512 tmp286 = _mm512_unpackhi_ps(tmp238, tmp239);
__m512 tmp287 = _mm512_unpacklo_ps(tmp212, tmp213);
__m512 tmp288 = _mm512_unpackhi_ps(tmp212, tmp213);
__m512 tmp289 = _mm512_shuffle_ps(tmp281, tmp283, 68);
__m512 tmp290 = _mm512_shuffle_ps(tmp281, tmp283, 238);
__m512 tmp291 = _mm512_shuffle_ps(tmp282, tmp284, 68);
__m512 tmp292 = _mm512_shuffle_ps(tmp282, tmp284, 238);
__m512 tmp293 = _mm512_shuffle_ps(tmp285, tmp287, 68);
__m512 tmp294 = _mm512_shuffle_ps(tmp285, tmp287, 238);
__m512 tmp295 = _mm512_shuffle_ps(tmp286, tmp288, 68);
__m512 tmp296 = _mm512_shuffle_ps(tmp286, tmp288, 238);
__m512 tmp297 = _mm512_shuffle_f32x4(tmp289, tmp293, 136);
__m512 tmp298 = _mm512_shuffle_f32x4(tmp289, tmp293, 221);
__m512 tmp299 = _mm512_shuffle_f32x4(tmp290, tmp294, 136);
__m512 tmp300 = _mm512_shuffle_f32x4(tmp290, tmp294, 221);
__m512 tmp301 = _mm512_shuffle_f32x4(tmp291, tmp295, 136);
__m512 tmp302 = _mm512_shuffle_f32x4(tmp291, tmp295, 221);
__m512 tmp303 = _mm512_shuffle_f32x4(tmp292, tmp296, 136);
__m512 tmp304 = _mm512_shuffle_f32x4(tmp292, tmp296, 221);
tmp210 = _mm512_shuffle_f32x4(tmp297, tmp297, 136);
__m512 tmp240 = _mm512_shuffle_f32x4(tmp297, tmp297, 221);
tmp211 = _mm512_shuffle_f32x4(tmp299, tmp299, 136);
__m512 tmp241 = _mm512_shuffle_f32x4(tmp299, tmp299, 221);
tmp236 = _mm512_shuffle_f32x4(tmp301, tmp301, 136);
__m512 tmp242 = _mm512_shuffle_f32x4(tmp301, tmp301, 221);
tmp237 = _mm512_shuffle_f32x4(tmp303, tmp303, 136);
__m512 tmp243 = _mm512_shuffle_f32x4(tmp303, tmp303, 221);
tmp238 = _mm512_shuffle_f32x4(tmp298, tmp298, 136);
__m512 tmp244 = _mm512_shuffle_f32x4(tmp298, tmp298, 221);
tmp239 = _mm512_shuffle_f32x4(tmp300, tmp300, 136);
__m512 tmp245 = _mm512_shuffle_f32x4(tmp300, tmp300, 221);
tmp212 = _mm512_shuffle_f32x4(tmp302, tmp302, 136);
__m512 tmp246 = _mm512_shuffle_f32x4(tmp302, tmp302, 221);
tmp213 = _mm512_shuffle_f32x4(tmp304, tmp304, 136);
__m512 tmp247 = _mm512_shuffle_f32x4(tmp304, tmp304, 221);
(void)tmp247;
// Second pass over the rearranged vectors. The first group (tmp210, tmp211,
// tmp236..tmp239, tmp212, tmp213) yields six results (out53..out58); the
// second group (tmp240..tmp246) yields three (out59..out61).
__m512 tmp252 = _mm512_add_ps(tmp211, tmp236);
__m512 tmp272 = _mm512_add_ps(tmp241, tmp242);
__m512 tmp251 = _mm512_add_ps(tmp237, tmp238);
__m512 tmp271 = _mm512_add_ps(tmp243, tmp244);
__m512 tmp257 = _mm512_sub_ps(tmp237, tmp238);
__m512 tmp277 = _mm512_sub_ps(tmp243, tmp244);
__m512 tmp256 = _mm512_sub_ps(tmp211, tmp236);
__m512 tmp276 = _mm512_sub_ps(tmp241, tmp242);
__m512 tmp253 = _mm512_add_ps(tmp239, tmp212);
__m512 tmp273 = _mm512_add_ps(tmp245, tmp246);
__m512 tmp258 = _mm512_sub_ps(tmp239, tmp212);
__m512 tmp278 = _mm512_sub_ps(tmp245, tmp246);
__m512 tmp255 = _mm512_fmadd_ps(tmp257, _mm512_set1_ps(2e+00f), tmp256);
__m512 tmp275 = _mm512_fmadd_ps(tmp277, _mm512_set1_ps(2e+00f), tmp276);
__m512 tmp262 = _mm512_fmadd_ps(tmp257, _mm512_set1_ps(8e+00f), tmp256);
__m512 tmp250 = _mm512_add_ps(tmp251, tmp252);
__m512 tmp270 = _mm512_add_ps(tmp271, tmp272);
__m512 tmp254 = _mm512_fmadd_ps(tmp258, _mm512_set1_ps(1.6e+01f), tmp255);
__m512 tmp274 = _mm512_fmadd_ps(tmp278, _mm512_set1_ps(1.6e+01f), tmp275);
__m512 tmp261 = _mm512_fmadd_ps(tmp258, _mm512_set1_ps(4e+00f), tmp262);
__m512 tmp267 = _mm512_add_ps(tmp258, tmp256);
__m512 tmp260 = _mm512_fmadd_ps(tmp251, _mm512_set1_ps(4e+00f), tmp252);
__m512 tmp280 = _mm512_fmadd_ps(tmp271, _mm512_set1_ps(4e+00f), tmp272);
__m512 tmp264 = _mm512_fmadd_ps(tmp251, _mm512_set1_ps(1.6e+01f), tmp252);
__m512 tmp249 = _mm512_add_ps(tmp250, tmp210);
__m512 tmp269 = _mm512_add_ps(tmp270, tmp240);
__m512 tmp266 = _mm512_add_ps(tmp267, tmp213);
__m512 tmp248 = _mm512_fmadd_ps(tmp253, _mm512_set1_ps(3.2e+01f), tmp249);
__m512 tmp268 = _mm512_fmadd_ps(tmp273, _mm512_set1_ps(3.2e+01f), tmp269);
__m512 tmp259 = _mm512_fmadd_ps(tmp253, _mm512_set1_ps(8e+00f), tmp260);
__m512 tmp279 = _mm512_fmadd_ps(tmp273, _mm512_set1_ps(8e+00f), tmp280);
__m512 tmp265 = _mm512_fmadd_ps(tmp257, _mm512_set1_ps(3.2e+01f), tmp266);
__m512 tmp263 = _mm512_fmadd_ps(tmp253, _mm512_set1_ps(2e+00f), tmp264);
__m512 out53 = tmp248;
__m512 out59 = tmp268;
__m512 out54 = tmp254;
__m512 out60 = tmp274;
__m512 out55 = tmp259;
__m512 out61 = tmp279;
__m512 out56 = tmp261;
__m512 out57 = tmp263;
__m512 out58 = tmp265;
// Masked stores: mask 3 writes lanes 0-1 (bytes +0..+7 from the given
// address); mask 192 writes lanes 6-7 (bytes +24..+31). So each result vector
// contributes one 8-byte pair at its own offset and a second pair 72 bytes
// further on (e.g. +48 with lanes 6-7 lands at +72). Together the 18 stores
// fill bytes 0..143 of this l2 half.
_mm512_mask_storeu_ps(datPtr2+0+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 3, out53);
_mm512_mask_storeu_ps(datPtr2+48+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 192, out53);
_mm512_mask_storeu_ps(datPtr2+48+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 3, out59);
_mm512_mask_storeu_ps(datPtr2+96+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 192, out59);
_mm512_mask_storeu_ps(datPtr2+8+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 3, out54);
_mm512_mask_storeu_ps(datPtr2+56+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 192, out54);
_mm512_mask_storeu_ps(datPtr2+56+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 3, out60);
_mm512_mask_storeu_ps(datPtr2+104+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 192, out60);
_mm512_mask_storeu_ps(datPtr2+16+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 3, out55);
_mm512_mask_storeu_ps(datPtr2+64+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 192, out55);
_mm512_mask_storeu_ps(datPtr2+64+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 3, out61);
_mm512_mask_storeu_ps(datPtr2+112+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 192, out61);
_mm512_mask_storeu_ps(datPtr2+24+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 3, out56);
_mm512_mask_storeu_ps(datPtr2+72+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 192, out56);
_mm512_mask_storeu_ps(datPtr2+32+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 3, out57);
_mm512_mask_storeu_ps(datPtr2+80+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 192, out57);
_mm512_mask_storeu_ps(datPtr2+40+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 3, out58);
_mm512_mask_storeu_ps(datPtr2+88+15768*i8+8*toH1+4*toW1+288*k5+144*l2, 192, out58);
}
}
// Remainder at k5 == 54: one full half (this loop runs exactly once, with
// l3 == 0), processed exactly like an l2 iteration above.
ptrdiff_t l3 = 0;
for (; l3 != 1; ++l3) {
__m512 sf17 = _mm512_loadu_ps(sfPtr2+0+112128*i8+84096*j4+512*k5+256*l3);
__m512 sf18 = _mm512_loadu_ps(sfPtr2+64+112128*i8+84096*j4+512*k5+256*l3);
__m512 in47 = _mm512_shuffle_f32x4(sf17, sf18, 68);
__m512 in48 = _mm512_shuffle_f32x4(sf17, sf18, 238);
__m512 sf19 = _mm512_loadu_ps(sfPtr2+128+112128*i8+84096*j4+512*k5+256*l3);
__m512 sf20 = _mm512_loadu_ps(sfPtr2+192+112128*i8+84096*j4+512*k5+256*l3);
__m512 in55 = _mm512_shuffle_f32x4(sf19, sf20, 68);
__m512 in56 = _mm512_shuffle_f32x4(sf19, sf20, 238);
__m512 sf21 = _mm512_loadu_ps(sfPtr2+28032+112128*i8+84096*j4+512*k5+256*l3);
__m512 sf22 = _mm512_loadu_ps(sfPtr2+28096+112128*i8+84096*j4+512*k5+256*l3);
__m512 in49 = _mm512_shuffle_f32x4(sf21, sf22, 68);
__m512 in50 = _mm512_shuffle_f32x4(sf21, sf22, 238);
__m512 sf23 = _mm512_loadu_ps(sfPtr2+28160+112128*i8+84096*j4+512*k5+256*l3);
__m512 sf24 = _mm512_loadu_ps(sfPtr2+28224+112128*i8+84096*j4+512*k5+256*l3);
__m512 in57 = _mm512_shuffle_f32x4(sf23, sf24, 68);
__m512 in58 = _mm512_shuffle_f32x4(sf23, sf24, 238);
__m512 sf25 = _mm512_loadu_ps(sfPtr2+56064+112128*i8+84096*j4+512*k5+256*l3);
__m512 sf26 = _mm512_loadu_ps(sfPtr2+56128+112128*i8+84096*j4+512*k5+256*l3);
__m512 in51 = _mm512_shuffle_f32x4(sf25, sf26, 68);
__m512 in52 = _mm512_shuffle_f32x4(sf25, sf26, 238);
__m512 sf27 = _mm512_loadu_ps(sfPtr2+56192+112128*i8+84096*j4+512*k5+256*l3);
__m512 sf28 = _mm512_loadu_ps(sfPtr2+56256+112128*i8+84096*j4+512*k5+256*l3);
__m512 in59 = _mm512_shuffle_f32x4(sf27, sf28, 68);
__m512 in60 = _mm512_shuffle_f32x4(sf27, sf28, 238);
__m512 sf29 = _mm512_loadu_ps(sfPtr2+84096+112128*i8+84096*j4+512*k5+256*l3);
__m512 sf30 = _mm512_loadu_ps(sfPtr2+84160+112128*i8+84096*j4+512*k5+256*l3);
__m512 in53 = _mm512_shuffle_f32x4(sf29, sf30, 68);
__m512 in54 = _mm512_shuffle_f32x4(sf29, sf30, 238);
__m512 sf31 = _mm512_loadu_ps(sfPtr2+84224+112128*i8+84096*j4+512*k5+256*l3);
__m512 sf32 = _mm512_loadu_ps(sfPtr2+84288+112128*i8+84096*j4+512*k5+256*l3);
__m512 in61 = _mm512_shuffle_f32x4(sf31, sf32, 68);
__m512 in62 = _mm512_shuffle_f32x4(sf31, sf32, 238);
(void)in54;
(void)in62;
// First pass (same formulas as in the main loop).
__m512 tmp313 = _mm512_add_ps(in48, in49);
__m512 tmp324 = _mm512_add_ps(in56, in57);
__m512 tmp312 = _mm512_add_ps(in50, in51);
__m512 tmp323 = _mm512_add_ps(in58, in59);
__m512 tmp318 = _mm512_sub_ps(in50, in51);
__m512 tmp329 = _mm512_sub_ps(in58, in59);
__m512 tmp317 = _mm512_sub_ps(in48, in49);
__m512 tmp328 = _mm512_sub_ps(in56, in57);
__m512 tmp314 = _mm512_add_ps(in52, in53);
__m512 tmp325 = _mm512_add_ps(in60, in61);
__m512 tmp319 = _mm512_sub_ps(in52, in53);
__m512 tmp330 = _mm512_sub_ps(in60, in61);
__m512 tmp316 = _mm512_fmadd_ps(tmp318, _mm512_set1_ps(2e+00f), tmp317);
__m512 tmp327 = _mm512_fmadd_ps(tmp329, _mm512_set1_ps(2e+00f), tmp328);
__m512 tmp311 = _mm512_add_ps(tmp312, tmp313);
__m512 tmp322 = _mm512_add_ps(tmp323, tmp324);
__m512 tmp315 = _mm512_fmadd_ps(tmp319, _mm512_set1_ps(1.6e+01f), tmp316);
__m512 tmp326 = _mm512_fmadd_ps(tmp330, _mm512_set1_ps(1.6e+01f), tmp327);
__m512 tmp310 = _mm512_add_ps(tmp311, in47);
__m512 tmp321 = _mm512_add_ps(tmp322, in55);
__m512 tmp309 = _mm512_fmadd_ps(tmp314, _mm512_set1_ps(3.2e+01f), tmp310);
__m512 tmp320 = _mm512_fmadd_ps(tmp325, _mm512_set1_ps(3.2e+01f), tmp321);
__m512 tmp305 = tmp309;
__m512 tmp307 = tmp320;
__m512 tmp306 = tmp315;
__m512 tmp308 = tmp326;
// Lane rearrangement with four zero vectors, as in the main loop.
__m512 tmp331 = _mm512_setzero_ps();
__m512 tmp332 = _mm512_setzero_ps();
__m512 tmp333 = _mm512_setzero_ps();
__m512 tmp334 = _mm512_setzero_ps();
__m512 tmp376 = _mm512_unpacklo_ps(tmp305, tmp306);
__m512 tmp377 = _mm512_unpackhi_ps(tmp305, tmp306);
__m512 tmp378 = _mm512_unpacklo_ps(tmp331, tmp332);
__m512 tmp379 = _mm512_unpackhi_ps(tmp331, tmp332);
__m512 tmp380 = _mm512_unpacklo_ps(tmp333, tmp334);
__m512 tmp381 = _mm512_unpackhi_ps(tmp333, tmp334);
__m512 tmp382 = _mm512_unpacklo_ps(tmp307, tmp308);
__m512 tmp383 = _mm512_unpackhi_ps(tmp307, tmp308);
__m512 tmp384 = _mm512_shuffle_ps(tmp376, tmp378, 68);
__m512 tmp385 = _mm512_shuffle_ps(tmp376, tmp378, 238);
__m512 tmp386 = _mm512_shuffle_ps(tmp377, tmp379, 68);
__m512 tmp387 = _mm512_shuffle_ps(tmp377, tmp379, 238);
__m512 tmp388 = _mm512_shuffle_ps(tmp380, tmp382, 68);
__m512 tmp389 = _mm512_shuffle_ps(tmp380, tmp382, 238);
__m512 tmp390 = _mm512_shuffle_ps(tmp381, tmp383, 68);
__m512 tmp391 = _mm512_shuffle_ps(tmp381, tmp383, 238);
__m512 tmp392 = _mm512_shuffle_f32x4(tmp384, tmp388, 136);
__m512 tmp393 = _mm512_shuffle_f32x4(tmp384, tmp388, 221);
__m512 tmp394 = _mm512_shuffle_f32x4(tmp385, tmp389, 136);
__m512 tmp395 = _mm512_shuffle_f32x4(tmp385, tmp389, 221);
__m512 tmp396 = _mm512_shuffle_f32x4(tmp386, tmp390, 136);
__m512 tmp397 = _mm512_shuffle_f32x4(tmp386, tmp390, 221);
__m512 tmp398 = _mm512_shuffle_f32x4(tmp387, tmp391, 136);
__m512 tmp399 = _mm512_shuffle_f32x4(tmp387, tmp391, 221);
tmp305 = _mm512_shuffle_f32x4(tmp392, tmp392, 136);
__m512 tmp335 = _mm512_shuffle_f32x4(tmp392, tmp392, 221);
tmp306 = _mm512_shuffle_f32x4(tmp394, tmp394, 136);
__m512 tmp336 = _mm512_shuffle_f32x4(tmp394, tmp394, 221);
tmp331 = _mm512_shuffle_f32x4(tmp396, tmp396, 136);
__m512 tmp337 = _mm512_shuffle_f32x4(tmp396, tmp396, 221);
tmp332 = _mm512_shuffle_f32x4(tmp398, tmp398, 136);
__m512 tmp338 = _mm512_shuffle_f32x4(tmp398, tmp398, 221);
tmp333 = _mm512_shuffle_f32x4(tmp393, tmp393, 136);
__m512 tmp339 = _mm512_shuffle_f32x4(tmp393, tmp393, 221);
tmp334 = _mm512_shuffle_f32x4(tmp395, tmp395, 136);
__m512 tmp340 = _mm512_shuffle_f32x4(tmp395, tmp395, 221);
tmp307 = _mm512_shuffle_f32x4(tmp397, tmp397, 136);
__m512 tmp341 = _mm512_shuffle_f32x4(tmp397, tmp397, 221);
tmp308 = _mm512_shuffle_f32x4(tmp399, tmp399, 136);
__m512 tmp342 = _mm512_shuffle_f32x4(tmp399, tmp399, 221);
(void)tmp342;
// Second pass (same formulas as in the main loop).
__m512 tmp347 = _mm512_add_ps(tmp306, tmp331);
__m512 tmp367 = _mm512_add_ps(tmp336, tmp337);
__m512 tmp346 = _mm512_add_ps(tmp332, tmp333);
__m512 tmp366 = _mm512_add_ps(tmp338, tmp339);
__m512 tmp352 = _mm512_sub_ps(tmp332, tmp333);
__m512 tmp372 = _mm512_sub_ps(tmp338, tmp339);
__m512 tmp351 = _mm512_sub_ps(tmp306, tmp331);
__m512 tmp371 = _mm512_sub_ps(tmp336, tmp337);
__m512 tmp348 = _mm512_add_ps(tmp334, tmp307);
__m512 tmp368 = _mm512_add_ps(tmp340, tmp341);
__m512 tmp353 = _mm512_sub_ps(tmp334, tmp307);
__m512 tmp373 = _mm512_sub_ps(tmp340, tmp341);
__m512 tmp350 = _mm512_fmadd_ps(tmp352, _mm512_set1_ps(2e+00f), tmp351);
__m512 tmp370 = _mm512_fmadd_ps(tmp372, _mm512_set1_ps(2e+00f), tmp371);
__m512 tmp357 = _mm512_fmadd_ps(tmp352, _mm512_set1_ps(8e+00f), tmp351);
__m512 tmp345 = _mm512_add_ps(tmp346, tmp347);
__m512 tmp365 = _mm512_add_ps(tmp366, tmp367);
__m512 tmp349 = _mm512_fmadd_ps(tmp353, _mm512_set1_ps(1.6e+01f), tmp350);
__m512 tmp369 = _mm512_fmadd_ps(tmp373, _mm512_set1_ps(1.6e+01f), tmp370);
__m512 tmp356 = _mm512_fmadd_ps(tmp353, _mm512_set1_ps(4e+00f), tmp357);
__m512 tmp362 = _mm512_add_ps(tmp353, tmp351);
__m512 tmp355 = _mm512_fmadd_ps(tmp346, _mm512_set1_ps(4e+00f), tmp347);
__m512 tmp375 = _mm512_fmadd_ps(tmp366, _mm512_set1_ps(4e+00f), tmp367);
__m512 tmp359 = _mm512_fmadd_ps(tmp346, _mm512_set1_ps(1.6e+01f), tmp347);
__m512 tmp344 = _mm512_add_ps(tmp345, tmp305);
__m512 tmp364 = _mm512_add_ps(tmp365, tmp335);
__m512 tmp361 = _mm512_add_ps(tmp362, tmp308);
__m512 tmp343 = _mm512_fmadd_ps(tmp348, _mm512_set1_ps(3.2e+01f), tmp344);
__m512 tmp363 = _mm512_fmadd_ps(tmp368, _mm512_set1_ps(3.2e+01f), tmp364);
__m512 tmp354 = _mm512_fmadd_ps(tmp348, _mm512_set1_ps(8e+00f), tmp355);
__m512 tmp374 = _mm512_fmadd_ps(tmp368, _mm512_set1_ps(8e+00f), tmp375);
__m512 tmp360 = _mm512_fmadd_ps(tmp352, _mm512_set1_ps(3.2e+01f), tmp361);
__m512 tmp358 = _mm512_fmadd_ps(tmp348, _mm512_set1_ps(2e+00f), tmp359);
__m512 out62 = tmp343;
__m512 out68 = tmp363;
__m512 out63 = tmp349;
__m512 out69 = tmp369;
__m512 out64 = tmp354;
__m512 out70 = tmp374;
__m512 out65 = tmp356;
__m512 out66 = tmp358;
__m512 out67 = tmp360;
// Same two-pair store pattern as in the main loop (lanes 0-1 and lanes 6-7).
_mm512_mask_storeu_ps(datPtr2+0+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out62);
_mm512_mask_storeu_ps(datPtr2+48+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 192, out62);
_mm512_mask_storeu_ps(datPtr2+48+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out68);
_mm512_mask_storeu_ps(datPtr2+96+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 192, out68);
_mm512_mask_storeu_ps(datPtr2+8+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out63);
_mm512_mask_storeu_ps(datPtr2+56+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 192, out63);
_mm512_mask_storeu_ps(datPtr2+56+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out69);
_mm512_mask_storeu_ps(datPtr2+104+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 192, out69);
_mm512_mask_storeu_ps(datPtr2+16+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out64);
_mm512_mask_storeu_ps(datPtr2+64+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 192, out64);
_mm512_mask_storeu_ps(datPtr2+64+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out70);
_mm512_mask_storeu_ps(datPtr2+112+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 192, out70);
_mm512_mask_storeu_ps(datPtr2+24+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out65);
_mm512_mask_storeu_ps(datPtr2+72+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 192, out65);
_mm512_mask_storeu_ps(datPtr2+32+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out66);
_mm512_mask_storeu_ps(datPtr2+80+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 192, out66);
_mm512_mask_storeu_ps(datPtr2+40+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out67);
_mm512_mask_storeu_ps(datPtr2+88+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 192, out67);
}
// Final partial half: here k5 == 54 and l3 == 1. Only the first two vectors of
// each source region are loaded, and only lanes 0-1 of each result are stored,
// so 72 destination bytes are written. The last store ends at byte
// 288*54 + 144 + 72 = 15768 of the current i8 slab.
__m512 sf33 = _mm512_loadu_ps(sfPtr2+0+112128*i8+84096*j4+512*k5+256*l3);
__m512 sf34 = _mm512_loadu_ps(sfPtr2+64+112128*i8+84096*j4+512*k5+256*l3);
__m512 in63 = _mm512_shuffle_f32x4(sf33, sf34, 68);
__m512 in64 = _mm512_shuffle_f32x4(sf33, sf34, 238);
__m512 sf35 = _mm512_loadu_ps(sfPtr2+28032+112128*i8+84096*j4+512*k5+256*l3);
__m512 sf36 = _mm512_loadu_ps(sfPtr2+28096+112128*i8+84096*j4+512*k5+256*l3);
__m512 in65 = _mm512_shuffle_f32x4(sf35, sf36, 68);
__m512 in66 = _mm512_shuffle_f32x4(sf35, sf36, 238);
__m512 sf37 = _mm512_loadu_ps(sfPtr2+56064+112128*i8+84096*j4+512*k5+256*l3);
__m512 sf38 = _mm512_loadu_ps(sfPtr2+56128+112128*i8+84096*j4+512*k5+256*l3);
__m512 in67 = _mm512_shuffle_f32x4(sf37, sf38, 68);
__m512 in68 = _mm512_shuffle_f32x4(sf37, sf38, 238);
__m512 sf39 = _mm512_loadu_ps(sfPtr2+84096+112128*i8+84096*j4+512*k5+256*l3);
__m512 sf40 = _mm512_loadu_ps(sfPtr2+84160+112128*i8+84096*j4+512*k5+256*l3);
__m512 in69 = _mm512_shuffle_f32x4(sf39, sf40, 68);
__m512 in70 = _mm512_shuffle_f32x4(sf39, sf40, 238);
(void)in70;
// First pass for the single remaining group:
//   tmp402 = in63 + (in64+in65) + (in66+in67) + 32*(in68+in69)
//   tmp408 =        (in64-in65) + 2*(in66-in67) + 16*(in68-in69)
__m512 tmp406 = _mm512_add_ps(in64, in65);
__m512 tmp405 = _mm512_add_ps(in66, in67);
__m512 tmp411 = _mm512_sub_ps(in66, in67);
__m512 tmp410 = _mm512_sub_ps(in64, in65);
__m512 tmp407 = _mm512_add_ps(in68, in69);
__m512 tmp412 = _mm512_sub_ps(in68, in69);
__m512 tmp409 = _mm512_fmadd_ps(tmp411, _mm512_set1_ps(2e+00f), tmp410);
__m512 tmp404 = _mm512_add_ps(tmp405, tmp406);
__m512 tmp408 = _mm512_fmadd_ps(tmp412, _mm512_set1_ps(1.6e+01f), tmp409);
__m512 tmp403 = _mm512_add_ps(tmp404, in63);
__m512 tmp402 = _mm512_fmadd_ps(tmp407, _mm512_set1_ps(3.2e+01f), tmp403);
__m512 tmp400 = tmp402;
__m512 tmp401 = tmp408;
// Lane rearrangement for two vectors only (no zero padding vectors here).
__m512 tmp460 = _mm512_unpacklo_ps(tmp400, tmp401);
__m512 tmp461 = _mm512_unpackhi_ps(tmp400, tmp401);
__m512 tmp462 = _mm512_shuffle_ps(tmp460, tmp460, 238);
__m512 tmp463 = _mm512_shuffle_ps(tmp461, tmp461, 238);
__m512 tmp464 = _mm512_shuffle_f32x4(tmp460, tmp460, 136);
__m512 tmp465 = _mm512_shuffle_f32x4(tmp460, tmp460, 221);
__m512 tmp466 = _mm512_shuffle_f32x4(tmp462, tmp462, 136);
__m512 tmp467 = _mm512_shuffle_f32x4(tmp462, tmp462, 221);
__m512 tmp468 = _mm512_shuffle_f32x4(tmp461, tmp461, 136);
__m512 tmp469 = _mm512_shuffle_f32x4(tmp461, tmp461, 221);
__m512 tmp470 = _mm512_shuffle_f32x4(tmp463, tmp463, 136);
__m512 tmp471 = _mm512_shuffle_f32x4(tmp463, tmp463, 221);
tmp400 = tmp464;
__m512 tmp419 = _mm512_shuffle_f32x4(tmp464, tmp464, 221);
tmp401 = tmp466;
__m512 tmp420 = _mm512_shuffle_f32x4(tmp466, tmp466, 221);
__m512 tmp413 = tmp468;
__m512 tmp421 = _mm512_shuffle_f32x4(tmp468, tmp468, 221);
__m512 tmp414 = tmp470;
__m512 tmp422 = _mm512_shuffle_f32x4(tmp470, tmp470, 221);
__m512 tmp415 = tmp465;
__m512 tmp423 = _mm512_shuffle_f32x4(tmp465, tmp465, 221);
__m512 tmp416 = tmp467;
__m512 tmp424 = _mm512_shuffle_f32x4(tmp467, tmp467, 221);
__m512 tmp417 = tmp469;
__m512 tmp425 = _mm512_shuffle_f32x4(tmp469, tmp469, 221);
__m512 tmp418 = tmp471;
__m512 tmp426 = _mm512_shuffle_f32x4(tmp471, tmp471, 221);
(void)tmp426;
// Second pass (same formulas as in the main loop): six results from
// tmp400..tmp418 and three from tmp419..tmp425.
__m512 tmp431 = _mm512_add_ps(tmp401, tmp413);
__m512 tmp451 = _mm512_add_ps(tmp420, tmp421);
__m512 tmp430 = _mm512_add_ps(tmp414, tmp415);
__m512 tmp450 = _mm512_add_ps(tmp422, tmp423);
__m512 tmp436 = _mm512_sub_ps(tmp414, tmp415);
__m512 tmp456 = _mm512_sub_ps(tmp422, tmp423);
__m512 tmp435 = _mm512_sub_ps(tmp401, tmp413);
__m512 tmp455 = _mm512_sub_ps(tmp420, tmp421);
__m512 tmp432 = _mm512_add_ps(tmp416, tmp417);
__m512 tmp452 = _mm512_add_ps(tmp424, tmp425);
__m512 tmp437 = _mm512_sub_ps(tmp416, tmp417);
__m512 tmp457 = _mm512_sub_ps(tmp424, tmp425);
__m512 tmp434 = _mm512_fmadd_ps(tmp436, _mm512_set1_ps(2e+00f), tmp435);
__m512 tmp454 = _mm512_fmadd_ps(tmp456, _mm512_set1_ps(2e+00f), tmp455);
__m512 tmp441 = _mm512_fmadd_ps(tmp436, _mm512_set1_ps(8e+00f), tmp435);
__m512 tmp429 = _mm512_add_ps(tmp430, tmp431);
__m512 tmp449 = _mm512_add_ps(tmp450, tmp451);
__m512 tmp433 = _mm512_fmadd_ps(tmp437, _mm512_set1_ps(1.6e+01f), tmp434);
__m512 tmp453 = _mm512_fmadd_ps(tmp457, _mm512_set1_ps(1.6e+01f), tmp454);
__m512 tmp440 = _mm512_fmadd_ps(tmp437, _mm512_set1_ps(4e+00f), tmp441);
__m512 tmp446 = _mm512_add_ps(tmp437, tmp435);
__m512 tmp439 = _mm512_fmadd_ps(tmp430, _mm512_set1_ps(4e+00f), tmp431);
__m512 tmp459 = _mm512_fmadd_ps(tmp450, _mm512_set1_ps(4e+00f), tmp451);
__m512 tmp443 = _mm512_fmadd_ps(tmp430, _mm512_set1_ps(1.6e+01f), tmp431);
__m512 tmp428 = _mm512_add_ps(tmp429, tmp400);
__m512 tmp448 = _mm512_add_ps(tmp449, tmp419);
__m512 tmp445 = _mm512_add_ps(tmp446, tmp418);
__m512 tmp427 = _mm512_fmadd_ps(tmp432, _mm512_set1_ps(3.2e+01f), tmp428);
__m512 tmp447 = _mm512_fmadd_ps(tmp452, _mm512_set1_ps(3.2e+01f), tmp448);
__m512 tmp438 = _mm512_fmadd_ps(tmp432, _mm512_set1_ps(8e+00f), tmp439);
__m512 tmp458 = _mm512_fmadd_ps(tmp452, _mm512_set1_ps(8e+00f), tmp459);
__m512 tmp444 = _mm512_fmadd_ps(tmp436, _mm512_set1_ps(3.2e+01f), tmp445);
__m512 tmp442 = _mm512_fmadd_ps(tmp432, _mm512_set1_ps(2e+00f), tmp443);
__m512 out71 = tmp427;
__m512 out77 = tmp447;
__m512 out72 = tmp433;
__m512 out78 = tmp453;
__m512 out73 = tmp438;
__m512 out79 = tmp458;
__m512 out74 = tmp440;
__m512 out75 = tmp442;
__m512 out76 = tmp444;
// Only lanes 0-1 are stored for each result: nine 8-byte pairs = 72 bytes.
_mm512_mask_storeu_ps(datPtr2+0+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out71);
_mm512_mask_storeu_ps(datPtr2+48+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out77);
_mm512_mask_storeu_ps(datPtr2+8+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out72);
_mm512_mask_storeu_ps(datPtr2+56+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out78);
_mm512_mask_storeu_ps(datPtr2+16+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out73);
_mm512_mask_storeu_ps(datPtr2+64+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out79);
_mm512_mask_storeu_ps(datPtr2+24+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out74);
_mm512_mask_storeu_ps(datPtr2+32+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out75);
_mm512_mask_storeu_ps(datPtr2+40+15768*i8+8*toH1+4*toW1+288*k5+144*l3, 3, out76);
// j4 is not read again after this increment.
++j4;
}

// Hands Example11ThreeConsumeSums1Callee1 to the threader as a 3-dimensional
// task whose hull is 1 in every dimension.
//
// team17:   worker team forwarded to Example11ThreaderDo1.
// tensors7: tensor pointer table, passed to the callee unchanged via any1.
static void Example11ThreeConsumeSums1(Example11ThreaderTeam1* team17, char** tensors7) {
	enum { axes = 3 };
	Example11ThreaderTask1 job;
	job.callee1 = Example11ThreeConsumeSums1Callee1;
	job.any1 = tensors7;
	job.nd1 = axes;
	for (int axis = 0; axis < axes; ++axis) {
		job.hull1[axis] = 1;
	}
	Example11ThreaderDo1(team17, &job);
}

// State owned by a network object (filled in by Example11NetCreate and
// released by Example11NetDestroy).
struct Example11Net {
	// Pointer exactly as returned by malloc; this is what gets freed.
	char *alloc1;
	// alloc1 rounded up to the next 64-byte boundary.
	char *align1;
};

// Releases a network object: first the buffer it owns, then the object itself.
// net2 must be a valid pointer; it is dereferenced unconditionally.
void Example11NetDestroy(Example11Net* net2) {
	char* owned = net2->alloc1;
	free(owned);
	free(net2);
}

// Builds a network object from the supplied parameters.
//
// net1:     out-parameter; receives the new object on success only.
// params1:  source of the outWeights and outBiases arrays.
// threads1: forwarded to Example11ThreaderCreate1 for the temporary team used
//           while arranging the filters.
//
// Returns 0 on success, otherwise an error message obtained from
// Example11Errmsg1 or Example11ThreaderCreate1. On every failure path the
// buffer allocated here is freed and *net1 is left untouched.
char* Example11NetCreate(
	Example11Net** net1,
	Example11Params* params1,
	ptrdiff_t threads1
) {
	if (__builtin_expect(!__builtin_cpu_supports("avx512f"), 0)) {
		return Example11Errmsg1(__LINE__, "CPU does not support AVX512F");
	}
	char* raw = malloc(701887);
	if (__builtin_expect(raw == 0, 0)) {
		return Example11Errmsg1(__LINE__, "errno %d", errno);
	}
	// Round the allocation up to a 64-byte boundary.
	char* aligned = (void*)(((size_t)raw+63)&~(size_t)63);
	Example11ThreaderTeam1* helpers = 0;
	char* failure = Example11ThreaderCreate1(&helpers, threads1);
	if (__builtin_expect(failure != 0, 0)) {
		free(raw);
		return failure;
	}
	// Weights and biases in, aligned buffer as the third tensor.
	char* filtArgs[3];
	filtArgs[0] = (char*)params1->outWeights;
	filtArgs[1] = (char*)params1->outBiases;
	filtArgs[2] = aligned+0;
	Example11ThreeArrangeFilts1(helpers, filtArgs);
	// The team is only needed for the arrangement step.
	Example11ThreaderDestroy1(helpers);
	Example11Net* created = malloc(sizeof(Example11Net));
	if (__builtin_expect(created == 0, 0)) {
		// Format the message before free() so errno is read first.
		failure = Example11Errmsg1(__LINE__, "errno %d", errno);
		free(raw);
		return failure;
	}
	created->alloc1 = raw;
	created->align1 = aligned;
	*net1 = created;
	return 0;
}

// State owned by an inference engine (filled in by Example11EngineCreate).
struct Example11Engine {
	// Network this engine runs; stored as given, not freed by the engine.
	Example11Net *net3;
	// Worker team created for this engine and destroyed with it.
	Example11ThreaderTeam1 *team11;
	// Scratch buffer exactly as returned by malloc; this is what gets freed.
	char *alloc2;
	// alloc2 rounded up to the next 64-byte boundary.
	char *align2;
};

// Thin forwarder to Example11ThreaderPthreadT1 for the engine's worker team.
//
// eng2: engine whose team is queried.
// idx2: index passed through unchanged.
// to1:  destination pthread_t passed through unchanged.
//
// Returns whatever Example11ThreaderPthreadT1 returns.
char* Example11EnginePthreadT(
	Example11Engine* eng2,
	ptrdiff_t idx2,
	pthread_t* to1
) {
	Example11ThreaderTeam1* workers = eng2->team11;
	char* outcome = Example11ThreaderPthreadT1(to1, workers, idx2);
	return outcome;
}

// Tears down an engine: destroys its worker team, frees its scratch buffer,
// then frees the engine object. The network it refers to is not touched.
// eng3 must be a valid pointer; it is dereferenced unconditionally.
void Example11EngineDestroy(Example11Engine* eng3) {
	Example11ThreaderTeam1* workers = eng3->team11;
	Example11ThreaderDestroy1(workers);
	char* scratch = eng3->alloc2;
	free(scratch);
	free(eng3);
}

// Creates an inference engine bound to an existing network.
//
// eng4:     out-parameter; receives the new engine on success only.
// net4:     network to run; the pointer is stored as-is.
// threads2: forwarded to Example11ThreaderCreate1 for the engine's team.
//
// Returns 0 on success, otherwise an error message obtained from
// Example11Errmsg1 or Example11ThreaderCreate1. On failure everything
// allocated here is freed and *eng4 is left untouched.
char* Example11EngineCreate(
	Example11Engine** eng4,
	Example11Net* net4,
	ptrdiff_t threads2
) {
	Example11Engine* made = malloc(sizeof(Example11Engine));
	if (__builtin_expect(made == 0, 0)) {
		return Example11Errmsg1(__LINE__, "errno %d", errno);
	}
	// 124991 = 12800 + 112128 + 63: the two regions used by
	// Example11EngineInference plus slack for 64-byte alignment.
	char* raw = malloc(124991);
	if (__builtin_expect(raw == 0, 0)) {
		// Format the message before free() so errno is read first.
		char* oom = Example11Errmsg1(__LINE__, "errno %d", errno);
		free(made);
		return oom;
	}
	made->alloc2 = raw;
	made->align2 = (void*)(((size_t)raw+63)&~(size_t)63);
	char* failure = Example11ThreaderCreate1(&made->team11, threads2);
	if (__builtin_expect(failure != 0, 0)) {
		free(raw);
		free(made);
		return failure;
	}
	made->net3 = net4;
	*eng4 = made;
	return 0;
}

// Runs the three pipeline stages, in order, on the engine's worker team:
//   1. Example11ThreeArrangeDats1 with {inData, scratch+0}
//   2. Example11ThreeProduceSums1 with {net buffer, scratch+0, scratch+12800}
//   3. Example11ThreeConsumeSums1 with {scratch+12800, outData}
//
// eng1:    engine supplying the team, the scratch buffer and the network.
// inData:  input tensor handed to the first stage.
// outData: output tensor handed to the last stage.
//
// NOTE(review): each stage presumably completes before the next one starts,
// since later stages are given buffers that earlier ones were given - confirm
// against the threader implementation.
void Example11EngineInference(
	Example11Engine* eng1,
	float* inData,
	float* outData
) {
	Example11ThreaderTeam1* workers = eng1->team11;
	char* netBuf = eng1->net3->align1;
	char* scratch = eng1->align2;
	// The scratch buffer is used as two regions.
	char* regionA = scratch+0;
	char* regionB = scratch+12800;

	char* arrangeArgs[2];
	arrangeArgs[0] = (char*)inData;
	arrangeArgs[1] = regionA;
	Example11ThreeArrangeDats1(workers, arrangeArgs);

	char* produceArgs[3];
	produceArgs[0] = netBuf+0;
	produceArgs[1] = regionA;
	produceArgs[2] = regionB;
	Example11ThreeProduceSums1(workers, produceArgs);

	char* consumeArgs[2];
	consumeArgs[0] = regionB;
	consumeArgs[1] = (char*)outData;
	Example11ThreeConsumeSums1(workers, consumeArgs);
}

// End of file.

Top