NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=Example5 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=in Channels=1272 Height=111 Width=108
Conv FromTensor=in ToTensor=out ToChannels=3888 FilterH=7 FilterW=7 StrideH=4 StrideW=3 PaddingH=3 PaddingW=1 DilationH=2 DilationW=1 Groups=4
Output FromTensor=out

Top || Output Example5.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(Example5Params);
// Example5Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "rb");
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

typedef struct Example5Params Example5Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// Example5Params* params = malloc(sizeof(Example5Params));
//
// ... Load params (read from a file, perhaps) ...
//
// Example5Net* net; // For example, 4 threads:
// char* err = Example5NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// Example5NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

typedef struct Example5Net Example5Net;

// Creates a Net from the trained parameters, using the requested number
// of temporary threads. Returns 0 on success. On failure it returns an
// error string that the caller must free, and the Net pointer passed as
// the first argument is left unmodified. The Params struct is not
// modified and is no longer needed once this call returns.
char* Example5NetCreate(
Example5Net**,
Example5Params*,
ptrdiff_t threads
);

// Destroys a Net to free its memory. Call this once all inference that
// depends on the Net is complete.
void Example5NetDestroy(Example5Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// Example5Net* net;
//
// ... Create net ...
//
// Example5Engine* engine; // For example, 4 inference threads:
// char* err = Example5EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// Example5EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = Example5EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* inData = malloc(sizeof(float)*1272*111*108);
// float* outData = malloc(sizeof(float)*3888*27*35);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// Example5EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// inData, // The tensor arguments are sorted by name.
// outData
// );
//
// ... Read the output floats ...
//
// }
//
// free(inData);
// free(outData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

typedef struct Example5Engine Example5Engine;

// Creates an Engine with the requested number of inference threads that
// performs inference using the given Net. Returns 0 on success. On
// failure it returns an error string that the caller must free, and the
// Engine pointer passed as the first argument is left unmodified.
char* Example5EngineCreate(
Example5Engine**,
Example5Net*,
ptrdiff_t threads
);

// Writes the pthread_t identifier of the inference thread with index
// threadIdx (0, 1, ..., N-1) to *to. Returns 0 on success, or an error
// string (to be freed by the caller) if the thread index is invalid.
char* Example5EnginePthreadT(
Example5Engine*,
ptrdiff_t threadIdx,
pthread_t* to
);

// Runs inference. This function cannot fail. Both tensors are owned by
// the caller, in CHW format, 32-bit floating point, fully packed:
// inData holds 1272*111*108 floats, outData receives 3888*27*35 floats.
void Example5EngineInference(
Example5Engine*,
float* inData,
float* outData
);

// Terminates the Engine's threads and frees its memory. The Net is
// destroyed separately.
void Example5EngineDestroy(Example5Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

// Trained parameters of the single Conv layer that writes tensor "out".
// The struct is packed so that its in-memory layout is exactly the
// concatenation of the two float arrays, biases first.
struct Example5Params {
// One bias per output channel (ToChannels=3888).
float outBiases[3888]; // 1x3888x1x1
// Filters: 3888 output channels, each spanning the 318 input channels
// of its group (1272 channels / 4 groups) with a 7x7 window.
float outWeights[60582816]; // 3888x318x7x7
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output Example5.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f Example5.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "Example5.h"

// Builds a heap-allocated error string of the form
// "Example5: line <lineNum1>: <formatted text>".
//
// lineNum1: source line to report (callers pass __LINE__).
// format1:  printf-style format for the remainder of the message,
//           followed by its arguments.
//
// Returns the message; the caller owns it and must free it. The
// formatted text is truncated so the whole message fits the buffer.
// NOTE: the allocation result is not checked before it is written to.
static char* Example5Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
    enum { capacity = 276 };
    char* text = malloc(capacity);
    // The prefix is short enough to always fit; remember where it ends.
    int prefixLen = sprintf(text, "Example5: line %td: ", lineNum1);
    va_list args;
    va_start(args, format1);
    vsnprintf(text+prefixLen, capacity-prefixLen, format1, args);
    va_end(args);
    return text;
}

// Forward declarations for the thread-team machinery defined below.
typedef struct Example5ThreaderTask1 Example5ThreaderTask1;
// Work function: receives the task and the coordinates of one point.
typedef void (*Example5ThreaderCallee1)(Example5ThreaderTask1*, int64_t*);
typedef struct Example5ThreaderHub1 Example5ThreaderHub1;
typedef struct Example5ThreaderNode1 Example5ThreaderNode1;
typedef struct Example5ThreaderUnwind1 Example5ThreaderUnwind1;
typedef struct Example5ThreaderTeam1 Example5ThreaderTeam1;

// A unit of parallel work: a callee invoked once for every point of an
// nd1-dimensional index space whose per-dimension extents are in hull1.
struct Example5ThreaderTask1 {
// Function called for each point.
Example5ThreaderCallee1 callee1;
// Opaque pointer made available to the callee through the task.
void* any1;
// Number of leading entries of hull1 that are in use (at most 4).
ptrdiff_t nd1;
// Extent of each dimension; index 0 varies fastest.
int64_t hull1[4];
};

// State shared by all threads of a team while a task is running.
struct Example5ThreaderHub1 {
// Guards every field below.
pthread_mutex_t mut1;
// Signaled by the worker that brings pending1 down to zero.
pthread_cond_t cond1;
// Number of workers that have not yet finished the current task.
ptrdiff_t pending1;
// Scan cursor into status1: word index at which the next search starts.
ptrdiff_t offset1;
// Scan cursor into status1: bits of word offset1 still to be considered.
long mask1;
// Bitmap with one bit per worker; a worker's bit is cleared once its
// share of points has been drained. Allocated with nt/(bits per long)+1
// words and reset to all ones for each task, so bits at index nt and
// above stay set.
long status1[];
};

// Per-worker state. The struct is aligned to 64 bytes.
struct Example5ThreaderNode1 {
// Guards np1, pt1 and task1.
pthread_mutex_t mut2;
// Number of points still to be processed from this node's share;
// a negative value tells the worker thread to exit.
int64_t np1;
// Coordinates of the next point to process.
int64_t pt1[4];
// Task posted to this worker, or null while it is idle.
Example5ThreaderTask1* task1;
// Signaled when a task (or the exit request) has been posted.
pthread_cond_t cond2;
// Back pointer to the owning team.
Example5ThreaderTeam1* team1;
// Identifier of the worker thread.
pthread_t thr1;
} __attribute__((aligned(64)));

// Records how far team construction got, so that destruction releases
// exactly what was acquired.
struct Example5ThreaderUnwind1 {
// Number of leading nodes whose threads were created and must be joined.
ptrdiff_t join1;
// Number of leading nodes whose condition variable was initialized.
ptrdiff_t nodeConds1;
// Number of leading nodes whose mutex was initialized.
ptrdiff_t nodeMuts1;
// Nonzero once the hub's condition variable has been initialized.
ptrdiff_t hubCond1;
// Nonzero once the hub's mutex has been initialized.
ptrdiff_t hubMut1;
// Unaligned malloc result backing the node array (passed to free).
void* nodes1;
// Unaligned malloc result backing the hub (passed to free).
void* hub1;
};

// A team of worker threads together with their shared hub.
struct Example5ThreaderTeam1 {
// Number of worker threads.
ptrdiff_t nt1;
// Shared hub, at a 64-byte aligned address inside unwind1.hub1.
Example5ThreaderHub1* hub2;
// Array of nt1 nodes, at a 64-byte aligned address inside unwind1.nodes1.
Example5ThreaderNode1* nodes2;
// Bookkeeping used when tearing the team down.
Example5ThreaderUnwind1 unwind1;
};

// Advances a mixed-radix coordinate by one point.
//
// nd2:   number of dimensions.
// hull2: extent of each dimension (index 0 varies fastest).
// pt2:   coordinate to advance in place.
//
// A digit that reaches its extent resets to zero and the carry moves on
// to the next dimension; advancing the last point wraps to all zeros.
static void Example5ThreaderInc1(
    ptrdiff_t nd2,
    int64_t*restrict hull2,
    int64_t*restrict pt2
) {
    ptrdiff_t dim = 0;
    while (dim < nd2) {
        int64_t bumped = pt2[dim]+1;
        if (bumped != hull2[dim]) {
            // No carry out of this digit: done.
            pt2[dim] = bumped;
            return;
        }
        pt2[dim] = 0;
        ++dim;
    }
}

// Converts a linear point index into mixed-radix coordinates.
//
// nd3:   number of dimensions.
// hull3: extent of each dimension (index 0 varies fastest).
// pt3:   receives the nd3 coordinates.
// val1:  linear index to decompose.
//
// Whatever is left of val1 after the last dimension is discarded.
static void Example5ThreaderPut1(
    ptrdiff_t nd3,
    int64_t*restrict hull3,
    int64_t*restrict pt3,
    int64_t val1
) {
    ptrdiff_t dim = 0;
    // Peel off one digit per dimension while something remains.
    while (dim < nd3 && val1) {
        int64_t extent = hull3[dim];
        int64_t quotient = val1/extent;
        pt3[dim] = val1-quotient*extent;
        val1 = quotient;
        ++dim;
    }
    // Higher dimensions that were not reached are zero.
    while (dim < nd3) {
        pt3[dim] = 0;
        ++dim;
    }
}

// Adds one mixed-radix coordinate (plus an incoming carry) to another.
//
// nd4:    number of dimensions.
// hull4:  extent of each dimension (index 0 varies fastest).
// pt4:    coordinate updated in place with the sum.
// plus1:  coordinate to add.
// carry2: carry into dimension 0 (callers pass 0 or 1).
//
// A carry out of the last dimension is dropped.
static void Example5ThreaderAdd1(
    ptrdiff_t nd4,
    int64_t*restrict hull4,
    int64_t*restrict pt4,
    int64_t*restrict plus1,
    int64_t carry2
) {
    for (ptrdiff_t dim = 0; dim < nd4; ++dim) {
        int64_t extent = hull4[dim];
        int64_t total = pt4[dim]+plus1[dim]+carry2;
        // A digit sum that reaches the extent wraps and carries one.
        carry2 = total >= extent;
        pt4[dim] = carry2 ? total-extent : total;
    }
}

static void* Example5ThreaderMain1(void* arg1) {
Example5ThreaderNode1* node1 = arg1;
Example5ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
Example5ThreaderHub1* hub3 = team2->hub2;
Example5ThreaderNode1* nodes3 = team2->nodes2;
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
for (; ; ) {
Example5ThreaderTask1* task2 = node1->task1;
if (!task2) {
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
node1->task1 = 0;
Example5ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
Example5ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
++offset2;
mask2 = -1;
continue;
}
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
hand1 &= -hand1;
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
Example5ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
Example5ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Tears down a team, fully or partially constructed, releasing exactly
// what the unwind record says was acquired. A null team is ignored.
//
// Order: post the exit request to every started worker, join them,
// destroy the node condition variables and mutexes, destroy the hub's
// condition variable and mutex, then free all memory. Every pthread
// call is retried until it reports success.
static void Example5ThreaderDestroy1(Example5ThreaderTeam1* team3) {
    if (!team3) return;
    Example5ThreaderNode1* const first = team3->nodes2;
    const Example5ThreaderUnwind1* const undo = &team3->unwind1;
    // Ask every running worker to exit: negative np1 plus a non-null task.
    for (ptrdiff_t k = 0; k != undo->join1; ++k) {
        Example5ThreaderNode1* n = first+k;
        while (__builtin_expect(pthread_mutex_lock(&n->mut2), 0)) {}
        n->np1 = -1;
        n->task1 = (Example5ThreaderTask1*)1;
        while (__builtin_expect(pthread_mutex_unlock(&n->mut2), 0)) {}
        while (__builtin_expect(pthread_cond_signal(&n->cond2), 0)) {}
    }
    for (ptrdiff_t k = 0; k != undo->join1; ++k) {
        while (__builtin_expect(pthread_join(first[k].thr1, 0), 0)) {}
    }
    for (ptrdiff_t k = 0; k != undo->nodeConds1; ++k) {
        while (__builtin_expect(pthread_cond_destroy(&first[k].cond2), 0)) {}
    }
    for (ptrdiff_t k = 0; k != undo->nodeMuts1; ++k) {
        while (__builtin_expect(pthread_mutex_destroy(&first[k].mut2), 0)) {}
    }
    Example5ThreaderHub1* const hub = team3->hub2;
    if (undo->hubCond1) {
        while (__builtin_expect(pthread_cond_destroy(&hub->cond1), 0)) {}
    }
    if (undo->hubMut1) {
        while (__builtin_expect(pthread_mutex_destroy(&hub->mut1), 0)) {}
    }
    // Free the original (unaligned) allocations, then the team itself.
    free(undo->nodes1);
    free(undo->hub1);
    free(team3);
}

// Final construction step: initializes each node's mutex and condition
// variable and starts its worker thread.
//
// team8: team whose node array has already been allocated.
// nt7:   number of nodes/threads to set up.
//
// Returns 0 on success, or an error string (owned by the caller) on the
// first failure. In both cases the unwind record is updated with how
// many node mutexes, node condition variables and threads now exist.
static char* Example5ThreaderCreate1Up4(Example5ThreaderTeam1* team8, ptrdiff_t nt7) {
    Example5ThreaderNode1* const base = team8->nodes2;
    Example5ThreaderUnwind1* const undo = &team8->unwind1;
    for (ptrdiff_t k = 0; k != nt7; ++k) {
        Example5ThreaderNode1* const cur = base+k;
        int rc = pthread_mutex_init(&cur->mut2, 0);
        if (__builtin_expect(rc, 0)) {
            // Nothing of node k exists yet.
            char* text = Example5Errmsg1(__LINE__, "errno %d", rc);
            undo->nodeMuts1 = k;
            undo->nodeConds1 = k;
            undo->join1 = k;
            return text;
        }
        cur->task1 = 0;
        rc = pthread_cond_init(&cur->cond2, 0);
        if (__builtin_expect(rc, 0)) {
            // Node k has a mutex only.
            char* text = Example5Errmsg1(__LINE__, "errno %d", rc);
            undo->nodeMuts1 = k+1;
            undo->nodeConds1 = k;
            undo->join1 = k;
            return text;
        }
        cur->team1 = team8;
        rc = pthread_create(&cur->thr1, 0, Example5ThreaderMain1, cur);
        if (__builtin_expect(rc, 0)) {
            // Node k has a mutex and a condition variable but no thread.
            char* text = Example5Errmsg1(__LINE__, "errno %d", rc);
            undo->nodeMuts1 = k+1;
            undo->nodeConds1 = k+1;
            undo->join1 = k;
            return text;
        }
    }
    undo->nodeMuts1 = nt7;
    undo->nodeConds1 = nt7;
    undo->join1 = nt7;
    return 0;
}

// Construction step 3: initializes the hub's mutex and condition
// variable, recording each success in the unwind record, then continues
// with the per-node setup.
//
// Returns 0 on success or an error string owned by the caller.
static char* Example5ThreaderCreate1Up3(Example5ThreaderTeam1* team7, ptrdiff_t nt6) {
    Example5ThreaderHub1* const hub = team7->hub2;
    int rc = pthread_mutex_init(&hub->mut1, 0);
    if (__builtin_expect(rc, 0)) {
        return Example5Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubMut1 = 1;
    rc = pthread_cond_init(&hub->cond1, 0);
    if (__builtin_expect(rc, 0)) {
        return Example5Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubCond1 = 1;
    return Example5ThreaderCreate1Up4(team7, nt6);
}

// Construction step 2: allocates the node array with 63 spare bytes so
// that it can be placed on a 64-byte boundary, then continues with the
// hub setup. The raw allocation is kept in the unwind record so it can
// be freed later.
//
// Returns 0 on success or an error string owned by the caller.
static char* Example5ThreaderCreate1Up2(Example5ThreaderTeam1* team6, ptrdiff_t nt5) {
    const size_t nodeSize = sizeof(Example5ThreaderNode1);
    size_t bytes = nt5*nodeSize;
    // Dividing back detects a multiplication that wrapped around.
    if (__builtin_expect(bytes/nodeSize != (size_t)nt5, 0)) {
        return Example5Errmsg1(__LINE__, "too many threads");
    }
    void* raw = malloc(bytes+63);
    if (__builtin_expect(!raw, 0)) {
        return Example5Errmsg1(__LINE__, "errno %d", errno);
    }
    team6->unwind1.nodes1 = raw;
    // Round the address up to the next multiple of 64.
    team6->nodes2 = (void*)(((size_t)raw+63)&-64);
    return Example5ThreaderCreate1Up3(team6, nt5);
}

// Construction step 1: records the thread count and allocates the hub,
// including its trailing status bitmap of nt4/(bits per long)+1 words.
// The size is rounded up to a multiple of 64 and the hub is placed on a
// 64-byte boundary inside an allocation with 63 spare bytes. The raw
// allocation is kept in the unwind record so it can be freed later.
//
// Returns 0 on success or an error string owned by the caller.
static char* Example5ThreaderCreate1Up1(Example5ThreaderTeam1* team5, ptrdiff_t nt4) {
    team5->nt1 = nt4;
    const size_t wordBits = sizeof(long)*8;
    size_t words = (size_t)nt4/wordBits+1;
    size_t bytes = sizeof(Example5ThreaderHub1)+sizeof(long)*words;
    bytes = (bytes+63)&-64;
    void* raw = malloc(bytes+63);
    if (__builtin_expect(!raw, 0)) {
        return Example5Errmsg1(__LINE__, "errno %d", errno);
    }
    team5->unwind1.hub1 = raw;
    team5->hub2 = (void*)(((size_t)raw+63)&-64);
    return Example5ThreaderCreate1Up2(team5, nt4);
}

// Creates a team of nt3 worker threads.
//
// team4: receives the new team on success; left untouched on failure.
// nt3:   number of threads; must be at least 1.
//
// Returns 0 on success or an error string owned by the caller. On
// failure everything that was partially built is destroyed again.
static char* Example5ThreaderCreate1(Example5ThreaderTeam1** team4, ptrdiff_t nt3) {
    if (__builtin_expect(nt3 < 1, 0)) {
        return Example5Errmsg1(__LINE__, "too few threads");
    }
    // Zeroed so that the unwind record starts out empty.
    Example5ThreaderTeam1* fresh = calloc(1, sizeof(Example5ThreaderTeam1));
    if (__builtin_expect(!fresh, 0)) {
        return Example5Errmsg1(__LINE__, "errno %d", errno);
    }
    char* failure = Example5ThreaderCreate1Up1(fresh, nt3);
    if (__builtin_expect(!failure, 1)) {
        *team4 = fresh;
        return 0;
    }
    Example5ThreaderDestroy1(fresh);
    return failure;
}

// Looks up the pthread_t of one worker thread.
//
// thr2:  receives the identifier on success.
// team9: team to query.
// idx1:  worker index; valid values are 0 .. nt1-1.
//
// Returns 0 on success, or an error string owned by the caller when the
// index is out of range (thr2 is then left untouched).
static char* Example5ThreaderPthreadT1(
    pthread_t* thr2,
    Example5ThreaderTeam1* team9,
    ptrdiff_t idx1
) {
    int inRange = 0 <= idx1 && idx1 < team9->nt1;
    if (__builtin_expect(!inRange, 0)) {
        return Example5Errmsg1(__LINE__, "bad thread idx");
    }
    *thr2 = team9->nodes2[idx1].thr1;
    return 0;
}

// Runs a task on a team and blocks until every worker has finished it.
//
// team10: team that executes the task.
// task3:  task to run; its callee is invoked once per point of the
//         index space described by nd1 and hull1.
//
// Returns immediately when the task has fewer than one dimension. Every
// pthread call is retried until it reports success.
static void Example5ThreaderDo1(Example5ThreaderTeam1* team10, Example5ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return;
// Total number of points: the product of the extents in use.
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
// Every worker gets each1 points; the first more1 workers get one extra.
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
// each1 expressed as coordinates, used to step from one share to the next.
int64_t plus2[4];
Example5ThreaderPut1(nd6, task3->hull1, plus2, each1);
// Starting coordinates of the share currently being handed out.
int64_t pt6[4] = {0};
Example5ThreaderHub1* hub6 = team10->hub2;
// The hub mutex is held from here until the wait below, so a worker that
// drains its share early blocks on the hub until the hub has been reset.
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
Example5ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
int64_t carry3 = i4 < more1;
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
// Wake the worker now that its share has been posted.
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
// Advance the start by the size of the share just handed out.
Example5ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Reset the scan cursor and set every status bit, including the bits at
// index nt8 and above.
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
// Wait until every worker has decremented pending1.
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

// Approximates e^x for 16 packed single-precision floats.
//
// The input is clamped to [-87.33654, 88.72284]. It is then scaled by
// 1.442695 (about log2 e) and rounded to the nearest integer n. The
// remainder f = x - n*ln2 is formed with ln2 split across two constants,
// a degree-4 polynomial in f is evaluated by Horner's rule, and n is
// added into the exponent bits of the polynomial's value.
static __m512 Example5Exp1(__m512 x1) {
    const __m512 lowest = _mm512_set1_ps(-8.733654e+01f);
    const __m512 highest = _mm512_set1_ps(8.872284e+01f);
    const __m512 clamped = _mm512_min_ps(_mm512_max_ps(x1, lowest), highest);
    const __m512 scaled = _mm512_mul_ps(clamped, _mm512_set1_ps(1.442695e+00f));
    const __m512 whole = _mm512_roundscale_ps(scaled, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
    // Remainder: subtract n*ln2 in two steps (high part, then low part).
    __m512 rest = _mm512_fmadd_ps(whole, _mm512_set1_ps(-6.9314575e-01f), clamped);
    rest = _mm512_fmadd_ps(whole, _mm512_set1_ps(-1.4286068e-06f), rest);
    // Horner evaluation, highest coefficient first.
    __m512 poly = _mm512_set1_ps(4.194439e-02f);
    poly = _mm512_fmadd_ps(poly, rest, _mm512_set1_ps(1.6800667e-01f));
    poly = _mm512_fmadd_ps(poly, rest, _mm512_set1_ps(4.9999994e-01f));
    poly = _mm512_fmadd_ps(poly, rest, _mm512_set1_ps(9.999569e-01f));
    poly = _mm512_fmadd_ps(poly, rest, _mm512_set1_ps(9.9999964e-01f));
    // Shift n into the exponent field and add it to the polynomial's bits.
    const __m512i exponent = _mm512_slli_epi32(_mm512_cvtps_epi32(scaled), 23);
    return _mm512_castsi512_ps(_mm512_add_epi32(exponent, _mm512_castps_si512(poly)));
}

// Approximates 1/sqrt(x) for 16 packed single-precision floats: takes
// the hardware estimate y and refines it as (0.5*y) * (3 - y*(x*y)).
static __m512 Example5Rsqrt1(__m512 x2) {
    const __m512 estimate = _mm512_rsqrt14_ps(x2);
    const __m512 halfEstimate = _mm512_mul_ps(estimate, _mm512_set1_ps(5e-01f));
    const __m512 xTimesEstimate = _mm512_mul_ps(x2, estimate);
    const __m512 correction = _mm512_fnmadd_ps(estimate, xTimesEstimate, _mm512_set1_ps(3e+00f));
    return _mm512_mul_ps(halfEstimate, correction);
}

static void Example5LoomArrangeFilts1Callee1(Example5ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
ptrdiff_t b2 = pt7[0];
ptrdiff_t g2 = pt7[1];
ptrdiff_t e1 = 0;
char*restrict arrangedB1 = tensors2[2]+15552*e1;
char*restrict arrangedW1 = tensors2[2]+15552+636310080*e1;
char*restrict wtPtr1 = tensors2[0]+163660*e1;
char*restrict biasPtr1 = tensors2[1];
ptrdiff_t i5 = 1*g2;
ptrdiff_t j1 = 1*b2;
ptrdiff_t jj1 = j1+0;
if (j1 < 60) {
for (; j1 != 60; ++j1) {
__m512 bias1 = _mm512_setzero_ps();
if (!e1) {
bias1 = _mm512_maskz_loadu_ps(65535, biasPtr1-0+3888*i5+64*j1);
}
_mm512_mask_storeu_ps(arrangedB1-0+3888*i5+64*j1, 65535, bias1);
ptrdiff_t c1 = (size_t)(0+16*j1)/6;
switch ((size_t)(0+16*j1)%6) {
case 0: {
ptrdiff_t k1 = 0;
for (; k1 != 318; ++k1) {
__m512 wt1 = _mm512_maskz_loadu_ps(65535, wtPtr1+0+60582816*i5+997248*j1+196*k1);
__m512 wt2 = _mm512_maskz_loadu_ps(65535, wtPtr1+62328+60582816*i5+997248*j1+196*k1);
__m512 wt3 = _mm512_maskz_loadu_ps(65535, wtPtr1+124656+60582816*i5+997248*j1+196*k1);
__m512 wt4 = _mm512_maskz_loadu_ps(65535, wtPtr1+186984+60582816*i5+997248*j1+196*k1);
__m512 wt5 = _mm512_maskz_loadu_ps(65535, wtPtr1+249312+60582816*i5+997248*j1+196*k1);
__m512 wt6 = _mm512_maskz_loadu_ps(65535, wtPtr1+311640+60582816*i5+997248*j1+196*k1);
__m512 wt7 = _mm512_maskz_loadu_ps(65535, wtPtr1+373968+60582816*i5+997248*j1+196*k1);
__m512 wt8 = _mm512_maskz_loadu_ps(65535, wtPtr1+436296+60582816*i5+997248*j1+196*k1);
__m512 wt9 = _mm512_maskz_loadu_ps(65535, wtPtr1+498624+60582816*i5+997248*j1+196*k1);
__m512 wt10 = _mm512_maskz_loadu_ps(65535, wtPtr1+560952+60582816*i5+997248*j1+196*k1);
__m512 wt11 = _mm512_maskz_loadu_ps(65535, wtPtr1+623280+60582816*i5+997248*j1+196*k1);
__m512 wt12 = _mm512_maskz_loadu_ps(65535, wtPtr1+685608+60582816*i5+997248*j1+196*k1);
__m512 wt13 = _mm512_maskz_loadu_ps(65535, wtPtr1+747936+60582816*i5+997248*j1+196*k1);
__m512 wt14 = _mm512_maskz_loadu_ps(65535, wtPtr1+810264+60582816*i5+997248*j1+196*k1);
__m512 wt15 = _mm512_maskz_loadu_ps(65535, wtPtr1+872592+60582816*i5+997248*j1+196*k1);
__m512 wt16 = _mm512_maskz_loadu_ps(65535, wtPtr1+934920+60582816*i5+997248*j1+196*k1);
__m512 tmp1 = _mm512_unpacklo_ps(wt1, wt2);
__m512 tmp2 = _mm512_unpackhi_ps(wt1, wt2);
__m512 tmp3 = _mm512_unpacklo_ps(wt3, wt4);
__m512 tmp4 = _mm512_unpackhi_ps(wt3, wt4);
__m512 tmp5 = _mm512_unpacklo_ps(wt5, wt6);
__m512 tmp6 = _mm512_unpackhi_ps(wt5, wt6);
__m512 tmp7 = _mm512_unpacklo_ps(wt7, wt8);
__m512 tmp8 = _mm512_unpackhi_ps(wt7, wt8);
__m512 tmp9 = _mm512_unpacklo_ps(wt9, wt10);
__m512 tmp10 = _mm512_unpackhi_ps(wt9, wt10);
__m512 tmp11 = _mm512_unpacklo_ps(wt11, wt12);
__m512 tmp12 = _mm512_unpackhi_ps(wt11, wt12);
__m512 tmp13 = _mm512_unpacklo_ps(wt13, wt14);
__m512 tmp14 = _mm512_unpackhi_ps(wt13, wt14);
__m512 tmp15 = _mm512_unpacklo_ps(wt15, wt16);
__m512 tmp16 = _mm512_unpackhi_ps(wt15, wt16);
__m512 tmp17 = _mm512_shuffle_ps(tmp1, tmp3, 68);
__m512 tmp18 = _mm512_shuffle_ps(tmp1, tmp3, 238);
__m512 tmp19 = _mm512_shuffle_ps(tmp2, tmp4, 68);
__m512 tmp20 = _mm512_shuffle_ps(tmp2, tmp4, 238);
__m512 tmp21 = _mm512_shuffle_ps(tmp5, tmp7, 68);
__m512 tmp22 = _mm512_shuffle_ps(tmp5, tmp7, 238);
__m512 tmp23 = _mm512_shuffle_ps(tmp6, tmp8, 68);
__m512 tmp24 = _mm512_shuffle_ps(tmp6, tmp8, 238);
__m512 tmp25 = _mm512_shuffle_ps(tmp9, tmp11, 68);
__m512 tmp26 = _mm512_shuffle_ps(tmp9, tmp11, 238);
__m512 tmp27 = _mm512_shuffle_ps(tmp10, tmp12, 68);
__m512 tmp28 = _mm512_shuffle_ps(tmp10, tmp12, 238);
__m512 tmp29 = _mm512_shuffle_ps(tmp13, tmp15, 68);
__m512 tmp30 = _mm512_shuffle_ps(tmp13, tmp15, 238);
__m512 tmp31 = _mm512_shuffle_ps(tmp14, tmp16, 68);
__m512 tmp32 = _mm512_shuffle_ps(tmp14, tmp16, 238);
__m512 tmp33 = _mm512_shuffle_f32x4(tmp17, tmp21, 136);
__m512 tmp34 = _mm512_shuffle_f32x4(tmp17, tmp21, 221);
__m512 tmp35 = _mm512_shuffle_f32x4(tmp18, tmp22, 136);
__m512 tmp36 = _mm512_shuffle_f32x4(tmp18, tmp22, 221);
__m512 tmp37 = _mm512_shuffle_f32x4(tmp19, tmp23, 136);
__m512 tmp38 = _mm512_shuffle_f32x4(tmp19, tmp23, 221);
__m512 tmp39 = _mm512_shuffle_f32x4(tmp20, tmp24, 136);
__m512 tmp40 = _mm512_shuffle_f32x4(tmp20, tmp24, 221);
__m512 tmp41 = _mm512_shuffle_f32x4(tmp25, tmp29, 136);
__m512 tmp42 = _mm512_shuffle_f32x4(tmp25, tmp29, 221);
__m512 tmp43 = _mm512_shuffle_f32x4(tmp26, tmp30, 136);
__m512 tmp44 = _mm512_shuffle_f32x4(tmp26, tmp30, 221);
__m512 tmp45 = _mm512_shuffle_f32x4(tmp27, tmp31, 136);
__m512 tmp46 = _mm512_shuffle_f32x4(tmp27, tmp31, 221);
__m512 tmp47 = _mm512_shuffle_f32x4(tmp28, tmp32, 136);
__m512 tmp48 = _mm512_shuffle_f32x4(tmp28, tmp32, 221);
wt1 = _mm512_shuffle_f32x4(tmp33, tmp41, 136);
wt9 = _mm512_shuffle_f32x4(tmp33, tmp41, 221);
wt2 = _mm512_shuffle_f32x4(tmp35, tmp43, 136);
wt10 = _mm512_shuffle_f32x4(tmp35, tmp43, 221);
wt3 = _mm512_shuffle_f32x4(tmp37, tmp45, 136);
wt11 = _mm512_shuffle_f32x4(tmp37, tmp45, 221);
wt4 = _mm512_shuffle_f32x4(tmp39, tmp47, 136);
wt12 = _mm512_shuffle_f32x4(tmp39, tmp47, 221);
wt5 = _mm512_shuffle_f32x4(tmp34, tmp42, 136);
wt13 = _mm512_shuffle_f32x4(tmp34, tmp42, 221);
wt6 = _mm512_shuffle_f32x4(tmp36, tmp44, 136);
wt14 = _mm512_shuffle_f32x4(tmp36, tmp44, 221);
wt7 = _mm512_shuffle_f32x4(tmp38, tmp46, 136);
wt15 = _mm512_shuffle_f32x4(tmp38, tmp46, 221);
wt8 = _mm512_shuffle_f32x4(tmp40, tmp48, 136);
wt16 = _mm512_shuffle_f32x4(tmp40, tmp48, 221);
_mm512_mask_storeu_ps(arrangedW1+0+60582816*i5+7632*c1+24*k1, 63, wt1);
_mm512_mask_storeu_ps(arrangedW1+7608+60582816*i5+7632*c1+24*k1, 4032, wt1);
_mm512_mask_storeu_ps(arrangedW1+15216+60582816*i5+7632*c1+24*k1, 61440, wt1);
_mm512_mask_storeu_ps(arrangedW1+14836608+60582816*i5+7632*c1+24*k1, 63, wt2);
_mm512_mask_storeu_ps(arrangedW1+14844216+60582816*i5+7632*c1+24*k1, 4032, wt2);
_mm512_mask_storeu_ps(arrangedW1+14851824+60582816*i5+7632*c1+24*k1, 61440, wt2);
_mm512_mask_storeu_ps(arrangedW1+24727680+60582816*i5+7632*c1+24*k1, 63, wt3);
_mm512_mask_storeu_ps(arrangedW1+24735288+60582816*i5+7632*c1+24*k1, 4032, wt3);
_mm512_mask_storeu_ps(arrangedW1+24742896+60582816*i5+7632*c1+24*k1, 61440, wt3);
_mm512_mask_storeu_ps(arrangedW1+1236384+60582816*i5+7632*c1+24*k1, 63, wt4);
_mm512_mask_storeu_ps(arrangedW1+1243992+60582816*i5+7632*c1+24*k1, 4032, wt4);
_mm512_mask_storeu_ps(arrangedW1+1251600+60582816*i5+7632*c1+24*k1, 61440, wt4);
_mm512_mask_storeu_ps(arrangedW1+16072992+60582816*i5+7632*c1+24*k1, 63, wt5);
_mm512_mask_storeu_ps(arrangedW1+16080600+60582816*i5+7632*c1+24*k1, 4032, wt5);
_mm512_mask_storeu_ps(arrangedW1+16088208+60582816*i5+7632*c1+24*k1, 61440, wt5);
_mm512_mask_storeu_ps(arrangedW1+25964064+60582816*i5+7632*c1+24*k1, 63, wt6);
_mm512_mask_storeu_ps(arrangedW1+25971672+60582816*i5+7632*c1+24*k1, 4032, wt6);
_mm512_mask_storeu_ps(arrangedW1+25979280+60582816*i5+7632*c1+24*k1, 61440, wt6);
_mm512_mask_storeu_ps(arrangedW1+2472768+60582816*i5+7632*c1+24*k1, 63, wt7);
_mm512_mask_storeu_ps(arrangedW1+2480376+60582816*i5+7632*c1+24*k1, 4032, wt7);
_mm512_mask_storeu_ps(arrangedW1+2487984+60582816*i5+7632*c1+24*k1, 61440, wt7);
_mm512_mask_storeu_ps(arrangedW1+34618752+60582816*i5+7632*c1+24*k1, 63, wt8);
_mm512_mask_storeu_ps(arrangedW1+34626360+60582816*i5+7632*c1+24*k1, 4032, wt8);
_mm512_mask_storeu_ps(arrangedW1+34633968+60582816*i5+7632*c1+24*k1, 61440, wt8);
_mm512_mask_storeu_ps(arrangedW1+45746208+60582816*i5+7632*c1+24*k1, 63, wt9);
_mm512_mask_storeu_ps(arrangedW1+45753816+60582816*i5+7632*c1+24*k1, 4032, wt9);
_mm512_mask_storeu_ps(arrangedW1+45761424+60582816*i5+7632*c1+24*k1, 61440, wt9);
_mm512_mask_storeu_ps(arrangedW1+53164512+60582816*i5+7632*c1+24*k1, 63, wt10);
_mm512_mask_storeu_ps(arrangedW1+53172120+60582816*i5+7632*c1+24*k1, 4032, wt10);
_mm512_mask_storeu_ps(arrangedW1+53179728+60582816*i5+7632*c1+24*k1, 61440, wt10);
_mm512_mask_storeu_ps(arrangedW1+35855136+60582816*i5+7632*c1+24*k1, 63, wt11);
_mm512_mask_storeu_ps(arrangedW1+35862744+60582816*i5+7632*c1+24*k1, 4032, wt11);
_mm512_mask_storeu_ps(arrangedW1+35870352+60582816*i5+7632*c1+24*k1, 61440, wt11);
_mm512_mask_storeu_ps(arrangedW1+46982592+60582816*i5+7632*c1+24*k1, 63, wt12);
_mm512_mask_storeu_ps(arrangedW1+46990200+60582816*i5+7632*c1+24*k1, 4032, wt12);
_mm512_mask_storeu_ps(arrangedW1+46997808+60582816*i5+7632*c1+24*k1, 61440, wt12);
_mm512_mask_storeu_ps(arrangedW1+54400896+60582816*i5+7632*c1+24*k1, 63, wt13);
_mm512_mask_storeu_ps(arrangedW1+54408504+60582816*i5+7632*c1+24*k1, 4032, wt13);
_mm512_mask_storeu_ps(arrangedW1+54416112+60582816*i5+7632*c1+24*k1, 61440, wt13);
_mm512_mask_storeu_ps(arrangedW1+37091520+60582816*i5+7632*c1+24*k1, 63, wt14);
_mm512_mask_storeu_ps(arrangedW1+37099128+60582816*i5+7632*c1+24*k1, 4032, wt14);
_mm512_mask_storeu_ps(arrangedW1+37106736+60582816*i5+7632*c1+24*k1, 61440, wt14);
_mm512_mask_storeu_ps(arrangedW1+3709152+60582816*i5+7632*c1+24*k1, 63, wt15);
_mm512_mask_storeu_ps(arrangedW1+3716760+60582816*i5+7632*c1+24*k1, 4032, wt15);
_mm512_mask_storeu_ps(arrangedW1+3724368+60582816*i5+7632*c1+24*k1, 61440, wt15);
_mm512_mask_storeu_ps(arrangedW1+17309376+60582816*i5+7632*c1+24*k1, 63, wt16);
_mm512_mask_storeu_ps(arrangedW1+17316984+60582816*i5+7632*c1+24*k1, 4032, wt16);
_mm512_mask_storeu_ps(arrangedW1+17324592+60582816*i5+7632*c1+24*k1, 61440, wt16);
__m512 wt17 = _mm512_maskz_loadu_ps(65535, wtPtr1+64+60582816*i5+997248*j1+196*k1);
__m512 wt18 = _mm512_maskz_loadu_ps(65535, wtPtr1+62392+60582816*i5+997248*j1+196*k1);
__m512 wt19 = _mm512_maskz_loadu_ps(65535, wtPtr1+124720+60582816*i5+997248*j1+196*k1);
__m512 wt20 = _mm512_maskz_loadu_ps(65535, wtPtr1+187048+60582816*i5+997248*j1+196*k1);
__m512 wt21 = _mm512_maskz_loadu_ps(65535, wtPtr1+249376+60582816*i5+997248*j1+196*k1);
__m512 wt22 = _mm512_maskz_loadu_ps(65535, wtPtr1+311704+60582816*i5+997248*j1+196*k1);
__m512 wt23 = _mm512_maskz_loadu_ps(65535, wtPtr1+374032+60582816*i5+997248*j1+196*k1);
__m512 wt24 = _mm512_maskz_loadu_ps(65535, wtPtr1+436360+60582816*i5+997248*j1+196*k1);
__m512 wt25 = _mm512_maskz_loadu_ps(65535, wtPtr1+498688+60582816*i5+997248*j1+196*k1);
__m512 wt26 = _mm512_maskz_loadu_ps(65535, wtPtr1+561016+60582816*i5+997248*j1+196*k1);
__m512 wt27 = _mm512_maskz_loadu_ps(65535, wtPtr1+623344+60582816*i5+997248*j1+196*k1);
__m512 wt28 = _mm512_maskz_loadu_ps(65535, wtPtr1+685672+60582816*i5+997248*j1+196*k1);
__m512 wt29 = _mm512_maskz_loadu_ps(65535, wtPtr1+748000+60582816*i5+997248*j1+196*k1);
__m512 wt30 = _mm512_maskz_loadu_ps(65535, wtPtr1+810328+60582816*i5+997248*j1+196*k1);
__m512 wt31 = _mm512_maskz_loadu_ps(65535, wtPtr1+872656+60582816*i5+997248*j1+196*k1);
__m512 wt32 = _mm512_maskz_loadu_ps(65535, wtPtr1+934984+60582816*i5+997248*j1+196*k1);
__m512 tmp49 = _mm512_unpacklo_ps(wt17, wt18);
__m512 tmp50 = _mm512_unpackhi_ps(wt17, wt18);
__m512 tmp51 = _mm512_unpacklo_ps(wt19, wt20);
__m512 tmp52 = _mm512_unpackhi_ps(wt19, wt20);
__m512 tmp53 = _mm512_unpacklo_ps(wt21, wt22);
__m512 tmp54 = _mm512_unpackhi_ps(wt21, wt22);
__m512 tmp55 = _mm512_unpacklo_ps(wt23, wt24);
__m512 tmp56 = _mm512_unpackhi_ps(wt23, wt24);
__m512 tmp57 = _mm512_unpacklo_ps(wt25, wt26);
__m512 tmp58 = _mm512_unpackhi_ps(wt25, wt26);
__m512 tmp59 = _mm512_unpacklo_ps(wt27, wt28);
__m512 tmp60 = _mm512_unpackhi_ps(wt27, wt28);
__m512 tmp61 = _mm512_unpacklo_ps(wt29, wt30);
__m512 tmp62 = _mm512_unpackhi_ps(wt29, wt30);
__m512 tmp63 = _mm512_unpacklo_ps(wt31, wt32);
__m512 tmp64 = _mm512_unpackhi_ps(wt31, wt32);
__m512 tmp65 = _mm512_shuffle_ps(tmp49, tmp51, 68);
__m512 tmp66 = _mm512_shuffle_ps(tmp49, tmp51, 238);
__m512 tmp67 = _mm512_shuffle_ps(tmp50, tmp52, 68);
__m512 tmp68 = _mm512_shuffle_ps(tmp50, tmp52, 238);
__m512 tmp69 = _mm512_shuffle_ps(tmp53, tmp55, 68);
__m512 tmp70 = _mm512_shuffle_ps(tmp53, tmp55, 238);
__m512 tmp71 = _mm512_shuffle_ps(tmp54, tmp56, 68);
__m512 tmp72 = _mm512_shuffle_ps(tmp54, tmp56, 238);
__m512 tmp73 = _mm512_shuffle_ps(tmp57, tmp59, 68);
__m512 tmp74 = _mm512_shuffle_ps(tmp57, tmp59, 238);
__m512 tmp75 = _mm512_shuffle_ps(tmp58, tmp60, 68);
__m512 tmp76 = _mm512_shuffle_ps(tmp58, tmp60, 238);
__m512 tmp77 = _mm512_shuffle_ps(tmp61, tmp63, 68);
__m512 tmp78 = _mm512_shuffle_ps(tmp61, tmp63, 238);
__m512 tmp79 = _mm512_shuffle_ps(tmp62, tmp64, 68);
__m512 tmp80 = _mm512_shuffle_ps(tmp62, tmp64, 238);
__m512 tmp81 = _mm512_shuffle_f32x4(tmp65, tmp69, 136);
__m512 tmp82 = _mm512_shuffle_f32x4(tmp65, tmp69, 221);
__m512 tmp83 = _mm512_shuffle_f32x4(tmp66, tmp70, 136);
__m512 tmp84 = _mm512_shuffle_f32x4(tmp66, tmp70, 221);
__m512 tmp85 = _mm512_shuffle_f32x4(tmp67, tmp71, 136);
__m512 tmp86 = _mm512_shuffle_f32x4(tmp67, tmp71, 221);
__m512 tmp87 = _mm512_shuffle_f32x4(tmp68, tmp72, 136);
__m512 tmp88 = _mm512_shuffle_f32x4(tmp68, tmp72, 221);
__m512 tmp89 = _mm512_shuffle_f32x4(tmp73, tmp77, 136);
__m512 tmp90 = _mm512_shuffle_f32x4(tmp73, tmp77, 221);
__m512 tmp91 = _mm512_shuffle_f32x4(tmp74, tmp78, 136);
__m512 tmp92 = _mm512_shuffle_f32x4(tmp74, tmp78, 221);
__m512 tmp93 = _mm512_shuffle_f32x4(tmp75, tmp79, 136);
__m512 tmp94 = _mm512_shuffle_f32x4(tmp75, tmp79, 221);
__m512 tmp95 = _mm512_shuffle_f32x4(tmp76, tmp80, 136);
__m512 tmp96 = _mm512_shuffle_f32x4(tmp76, tmp80, 221);
wt17 = _mm512_shuffle_f32x4(tmp81, tmp89, 136);
wt25 = _mm512_shuffle_f32x4(tmp81, tmp89, 221);
wt18 = _mm512_shuffle_f32x4(tmp83, tmp91, 136);
wt26 = _mm512_shuffle_f32x4(tmp83, tmp91, 221);
wt19 = _mm512_shuffle_f32x4(tmp85, tmp93, 136);
wt27 = _mm512_shuffle_f32x4(tmp85, tmp93, 221);
wt20 = _mm512_shuffle_f32x4(tmp87, tmp95, 136);
wt28 = _mm512_shuffle_f32x4(tmp87, tmp95, 221);
wt21 = _mm512_shuffle_f32x4(tmp82, tmp90, 136);
wt29 = _mm512_shuffle_f32x4(tmp82, tmp90, 221);
wt22 = _mm512_shuffle_f32x4(tmp84, tmp92, 136);
wt30 = _mm512_shuffle_f32x4(tmp84, tmp92, 221);
wt23 = _mm512_shuffle_f32x4(tmp86, tmp94, 136);
wt31 = _mm512_shuffle_f32x4(tmp86, tmp94, 221);
wt24 = _mm512_shuffle_f32x4(tmp88, tmp96, 136);
wt32 = _mm512_shuffle_f32x4(tmp88, tmp96, 221);
_mm512_mask_storeu_ps(arrangedW1+27200448+60582816*i5+7632*c1+24*k1, 63, wt17);
_mm512_mask_storeu_ps(arrangedW1+27208056+60582816*i5+7632*c1+24*k1, 4032, wt17);
_mm512_mask_storeu_ps(arrangedW1+27215664+60582816*i5+7632*c1+24*k1, 61440, wt17);
_mm512_mask_storeu_ps(arrangedW1+4945536+60582816*i5+7632*c1+24*k1, 63, wt18);
_mm512_mask_storeu_ps(arrangedW1+4953144+60582816*i5+7632*c1+24*k1, 4032, wt18);
_mm512_mask_storeu_ps(arrangedW1+4960752+60582816*i5+7632*c1+24*k1, 61440, wt18);
_mm512_mask_storeu_ps(arrangedW1+18545760+60582816*i5+7632*c1+24*k1, 63, wt19);
_mm512_mask_storeu_ps(arrangedW1+18553368+60582816*i5+7632*c1+24*k1, 4032, wt19);
_mm512_mask_storeu_ps(arrangedW1+18560976+60582816*i5+7632*c1+24*k1, 61440, wt19);
_mm512_mask_storeu_ps(arrangedW1+28436832+60582816*i5+7632*c1+24*k1, 63, wt20);
_mm512_mask_storeu_ps(arrangedW1+28444440+60582816*i5+7632*c1+24*k1, 4032, wt20);
_mm512_mask_storeu_ps(arrangedW1+28452048+60582816*i5+7632*c1+24*k1, 61440, wt20);
_mm512_mask_storeu_ps(arrangedW1+6181920+60582816*i5+7632*c1+24*k1, 63, wt21);
_mm512_mask_storeu_ps(arrangedW1+6189528+60582816*i5+7632*c1+24*k1, 4032, wt21);
_mm512_mask_storeu_ps(arrangedW1+6197136+60582816*i5+7632*c1+24*k1, 61440, wt21);
_mm512_mask_storeu_ps(arrangedW1+38327904+60582816*i5+7632*c1+24*k1, 63, wt22);
_mm512_mask_storeu_ps(arrangedW1+38335512+60582816*i5+7632*c1+24*k1, 4032, wt22);
_mm512_mask_storeu_ps(arrangedW1+38343120+60582816*i5+7632*c1+24*k1, 61440, wt22);
_mm512_mask_storeu_ps(arrangedW1+48218976+60582816*i5+7632*c1+24*k1, 63, wt23);
_mm512_mask_storeu_ps(arrangedW1+48226584+60582816*i5+7632*c1+24*k1, 4032, wt23);
_mm512_mask_storeu_ps(arrangedW1+48234192+60582816*i5+7632*c1+24*k1, 61440, wt23);
_mm512_mask_storeu_ps(arrangedW1+55637280+60582816*i5+7632*c1+24*k1, 63, wt24);
_mm512_mask_storeu_ps(arrangedW1+55644888+60582816*i5+7632*c1+24*k1, 4032, wt24);
_mm512_mask_storeu_ps(arrangedW1+55652496+60582816*i5+7632*c1+24*k1, 61440, wt24);
_mm512_mask_storeu_ps(arrangedW1+39564288+60582816*i5+7632*c1+24*k1, 63, wt25);
_mm512_mask_storeu_ps(arrangedW1+39571896+60582816*i5+7632*c1+24*k1, 4032, wt25);
_mm512_mask_storeu_ps(arrangedW1+39579504+60582816*i5+7632*c1+24*k1, 61440, wt25);
_mm512_mask_storeu_ps(arrangedW1+49455360+60582816*i5+7632*c1+24*k1, 63, wt26);
_mm512_mask_storeu_ps(arrangedW1+49462968+60582816*i5+7632*c1+24*k1, 4032, wt26);
_mm512_mask_storeu_ps(arrangedW1+49470576+60582816*i5+7632*c1+24*k1, 61440, wt26);
_mm512_mask_storeu_ps(arrangedW1+56873664+60582816*i5+7632*c1+24*k1, 63, wt27);
_mm512_mask_storeu_ps(arrangedW1+56881272+60582816*i5+7632*c1+24*k1, 4032, wt27);
_mm512_mask_storeu_ps(arrangedW1+56888880+60582816*i5+7632*c1+24*k1, 61440, wt27);
_mm512_mask_storeu_ps(arrangedW1+40800672+60582816*i5+7632*c1+24*k1, 63, wt28);
_mm512_mask_storeu_ps(arrangedW1+40808280+60582816*i5+7632*c1+24*k1, 4032, wt28);
_mm512_mask_storeu_ps(arrangedW1+40815888+60582816*i5+7632*c1+24*k1, 61440, wt28);
_mm512_mask_storeu_ps(arrangedW1+7418304+60582816*i5+7632*c1+24*k1, 63, wt29);
_mm512_mask_storeu_ps(arrangedW1+7425912+60582816*i5+7632*c1+24*k1, 4032, wt29);
_mm512_mask_storeu_ps(arrangedW1+7433520+60582816*i5+7632*c1+24*k1, 61440, wt29);
_mm512_mask_storeu_ps(arrangedW1+19782144+60582816*i5+7632*c1+24*k1, 63, wt30);
_mm512_mask_storeu_ps(arrangedW1+19789752+60582816*i5+7632*c1+24*k1, 4032, wt30);
_mm512_mask_storeu_ps(arrangedW1+19797360+60582816*i5+7632*c1+24*k1, 61440, wt30);
_mm512_mask_storeu_ps(arrangedW1+29673216+60582816*i5+7632*c1+24*k1, 63, wt31);
_mm512_mask_storeu_ps(arrangedW1+29680824+60582816*i5+7632*c1+24*k1, 4032, wt31);
_mm512_mask_storeu_ps(arrangedW1+29688432+60582816*i5+7632*c1+24*k1, 61440, wt31);
_mm512_mask_storeu_ps(arrangedW1+8654688+60582816*i5+7632*c1+24*k1, 63, wt32);
_mm512_mask_storeu_ps(arrangedW1+8662296+60582816*i5+7632*c1+24*k1, 4032, wt32);
_mm512_mask_storeu_ps(arrangedW1+8669904+60582816*i5+7632*c1+24*k1, 61440, wt32);
__m512 wt33 = _mm512_maskz_loadu_ps(65535, wtPtr1+128+60582816*i5+997248*j1+196*k1);
__m512 wt34 = _mm512_maskz_loadu_ps(65535, wtPtr1+62456+60582816*i5+997248*j1+196*k1);
__m512 wt35 = _mm512_maskz_loadu_ps(65535, wtPtr1+124784+60582816*i5+997248*j1+196*k1);
__m512 wt36 = _mm512_maskz_loadu_ps(65535, wtPtr1+187112+60582816*i5+997248*j1+196*k1);
__m512 wt37 = _mm512_maskz_loadu_ps(65535, wtPtr1+249440+60582816*i5+997248*j1+196*k1);
__m512 wt38 = _mm512_maskz_loadu_ps(65535, wtPtr1+311768+60582816*i5+997248*j1+196*k1);
__m512 wt39 = _mm512_maskz_loadu_ps(65535, wtPtr1+374096+60582816*i5+997248*j1+196*k1);
__m512 wt40 = _mm512_maskz_loadu_ps(65535, wtPtr1+436424+60582816*i5+997248*j1+196*k1);
__m512 wt41 = _mm512_maskz_loadu_ps(65535, wtPtr1+498752+60582816*i5+997248*j1+196*k1);
__m512 wt42 = _mm512_maskz_loadu_ps(65535, wtPtr1+561080+60582816*i5+997248*j1+196*k1);
__m512 wt43 = _mm512_maskz_loadu_ps(65535, wtPtr1+623408+60582816*i5+997248*j1+196*k1);
__m512 wt44 = _mm512_maskz_loadu_ps(65535, wtPtr1+685736+60582816*i5+997248*j1+196*k1);
__m512 wt45 = _mm512_maskz_loadu_ps(65535, wtPtr1+748064+60582816*i5+997248*j1+196*k1);
__m512 wt46 = _mm512_maskz_loadu_ps(65535, wtPtr1+810392+60582816*i5+997248*j1+196*k1);
__m512 wt47 = _mm512_maskz_loadu_ps(65535, wtPtr1+872720+60582816*i5+997248*j1+196*k1);
__m512 wt48 = _mm512_maskz_loadu_ps(65535, wtPtr1+935048+60582816*i5+997248*j1+196*k1);
__m512 tmp97 = _mm512_unpacklo_ps(wt33, wt34);
__m512 tmp98 = _mm512_unpackhi_ps(wt33, wt34);
__m512 tmp99 = _mm512_unpacklo_ps(wt35, wt36);
__m512 tmp100 = _mm512_unpackhi_ps(wt35, wt36);
__m512 tmp101 = _mm512_unpacklo_ps(wt37, wt38);
__m512 tmp102 = _mm512_unpackhi_ps(wt37, wt38);
__m512 tmp103 = _mm512_unpacklo_ps(wt39, wt40);
__m512 tmp104 = _mm512_unpackhi_ps(wt39, wt40);
__m512 tmp105 = _mm512_unpacklo_ps(wt41, wt42);
__m512 tmp106 = _mm512_unpackhi_ps(wt41, wt42);
__m512 tmp107 = _mm512_unpacklo_ps(wt43, wt44);
__m512 tmp108 = _mm512_unpackhi_ps(wt43, wt44);
__m512 tmp109 = _mm512_unpacklo_ps(wt45, wt46);
__m512 tmp110 = _mm512_unpackhi_ps(wt45, wt46);
__m512 tmp111 = _mm512_unpacklo_ps(wt47, wt48);
__m512 tmp112 = _mm512_unpackhi_ps(wt47, wt48);
__m512 tmp113 = _mm512_shuffle_ps(tmp97, tmp99, 68);
__m512 tmp114 = _mm512_shuffle_ps(tmp97, tmp99, 238);
__m512 tmp115 = _mm512_shuffle_ps(tmp98, tmp100, 68);
__m512 tmp116 = _mm512_shuffle_ps(tmp98, tmp100, 238);
__m512 tmp117 = _mm512_shuffle_ps(tmp101, tmp103, 68);
__m512 tmp118 = _mm512_shuffle_ps(tmp101, tmp103, 238);
__m512 tmp119 = _mm512_shuffle_ps(tmp102, tmp104, 68);
__m512 tmp120 = _mm512_shuffle_ps(tmp102, tmp104, 238);
__m512 tmp121 = _mm512_shuffle_ps(tmp105, tmp107, 68);
__m512 tmp122 = _mm512_shuffle_ps(tmp105, tmp107, 238);
__m512 tmp123 = _mm512_shuffle_ps(tmp106, tmp108, 68);
__m512 tmp124 = _mm512_shuffle_ps(tmp106, tmp108, 238);
__m512 tmp125 = _mm512_shuffle_ps(tmp109, tmp111, 68);
__m512 tmp126 = _mm512_shuffle_ps(tmp109, tmp111, 238);
__m512 tmp127 = _mm512_shuffle_ps(tmp110, tmp112, 68);
__m512 tmp128 = _mm512_shuffle_ps(tmp110, tmp112, 238);
__m512 tmp129 = _mm512_shuffle_f32x4(tmp113, tmp117, 136);
__m512 tmp130 = _mm512_shuffle_f32x4(tmp113, tmp117, 221);
__m512 tmp131 = _mm512_shuffle_f32x4(tmp114, tmp118, 136);
__m512 tmp132 = _mm512_shuffle_f32x4(tmp114, tmp118, 221);
__m512 tmp133 = _mm512_shuffle_f32x4(tmp115, tmp119, 136);
__m512 tmp134 = _mm512_shuffle_f32x4(tmp115, tmp119, 221);
__m512 tmp135 = _mm512_shuffle_f32x4(tmp116, tmp120, 136);
__m512 tmp136 = _mm512_shuffle_f32x4(tmp116, tmp120, 221);
__m512 tmp137 = _mm512_shuffle_f32x4(tmp121, tmp125, 136);
__m512 tmp138 = _mm512_shuffle_f32x4(tmp121, tmp125, 221);
__m512 tmp139 = _mm512_shuffle_f32x4(tmp122, tmp126, 136);
__m512 tmp140 = _mm512_shuffle_f32x4(tmp122, tmp126, 221);
__m512 tmp141 = _mm512_shuffle_f32x4(tmp123, tmp127, 136);
__m512 tmp142 = _mm512_shuffle_f32x4(tmp123, tmp127, 221);
__m512 tmp143 = _mm512_shuffle_f32x4(tmp124, tmp128, 136);
__m512 tmp144 = _mm512_shuffle_f32x4(tmp124, tmp128, 221);
wt33 = _mm512_shuffle_f32x4(tmp129, tmp137, 136);
wt41 = _mm512_shuffle_f32x4(tmp129, tmp137, 221);
wt34 = _mm512_shuffle_f32x4(tmp131, tmp139, 136);
wt42 = _mm512_shuffle_f32x4(tmp131, tmp139, 221);
wt35 = _mm512_shuffle_f32x4(tmp133, tmp141, 136);
wt43 = _mm512_shuffle_f32x4(tmp133, tmp141, 221);
wt36 = _mm512_shuffle_f32x4(tmp135, tmp143, 136);
wt44 = _mm512_shuffle_f32x4(tmp135, tmp143, 221);
wt37 = _mm512_shuffle_f32x4(tmp130, tmp138, 136);
wt45 = _mm512_shuffle_f32x4(tmp130, tmp138, 221);
wt38 = _mm512_shuffle_f32x4(tmp132, tmp140, 136);
wt46 = _mm512_shuffle_f32x4(tmp132, tmp140, 221);
wt39 = _mm512_shuffle_f32x4(tmp134, tmp142, 136);
wt47 = _mm512_shuffle_f32x4(tmp134, tmp142, 221);
wt40 = _mm512_shuffle_f32x4(tmp136, tmp144, 136);
wt48 = _mm512_shuffle_f32x4(tmp136, tmp144, 221);
_mm512_mask_storeu_ps(arrangedW1+21018528+60582816*i5+7632*c1+24*k1, 63, wt33);
_mm512_mask_storeu_ps(arrangedW1+21026136+60582816*i5+7632*c1+24*k1, 4032, wt33);
_mm512_mask_storeu_ps(arrangedW1+21033744+60582816*i5+7632*c1+24*k1, 61440, wt33);
_mm512_mask_storeu_ps(arrangedW1+30909600+60582816*i5+7632*c1+24*k1, 63, wt34);
_mm512_mask_storeu_ps(arrangedW1+30917208+60582816*i5+7632*c1+24*k1, 4032, wt34);
_mm512_mask_storeu_ps(arrangedW1+30924816+60582816*i5+7632*c1+24*k1, 61440, wt34);
_mm512_mask_storeu_ps(arrangedW1+9891072+60582816*i5+7632*c1+24*k1, 63, wt35);
_mm512_mask_storeu_ps(arrangedW1+9898680+60582816*i5+7632*c1+24*k1, 4032, wt35);
_mm512_mask_storeu_ps(arrangedW1+9906288+60582816*i5+7632*c1+24*k1, 61440, wt35);
_mm512_mask_storeu_ps(arrangedW1+42037056+60582816*i5+7632*c1+24*k1, 63, wt36);
_mm512_mask_storeu_ps(arrangedW1+42044664+60582816*i5+7632*c1+24*k1, 4032, wt36);
_mm512_mask_storeu_ps(arrangedW1+42052272+60582816*i5+7632*c1+24*k1, 61440, wt36);
_mm512_mask_storeu_ps(arrangedW1+50691744+60582816*i5+7632*c1+24*k1, 63, wt37);
_mm512_mask_storeu_ps(arrangedW1+50699352+60582816*i5+7632*c1+24*k1, 4032, wt37);
_mm512_mask_storeu_ps(arrangedW1+50706960+60582816*i5+7632*c1+24*k1, 61440, wt37);
_mm512_mask_storeu_ps(arrangedW1+58110048+60582816*i5+7632*c1+24*k1, 63, wt38);
_mm512_mask_storeu_ps(arrangedW1+58117656+60582816*i5+7632*c1+24*k1, 4032, wt38);
_mm512_mask_storeu_ps(arrangedW1+58125264+60582816*i5+7632*c1+24*k1, 61440, wt38);
_mm512_mask_storeu_ps(arrangedW1+43273440+60582816*i5+7632*c1+24*k1, 63, wt39);
_mm512_mask_storeu_ps(arrangedW1+43281048+60582816*i5+7632*c1+24*k1, 4032, wt39);
_mm512_mask_storeu_ps(arrangedW1+43288656+60582816*i5+7632*c1+24*k1, 61440, wt39);
_mm512_mask_storeu_ps(arrangedW1+51928128+60582816*i5+7632*c1+24*k1, 63, wt40);
_mm512_mask_storeu_ps(arrangedW1+51935736+60582816*i5+7632*c1+24*k1, 4032, wt40);
_mm512_mask_storeu_ps(arrangedW1+51943344+60582816*i5+7632*c1+24*k1, 61440, wt40);
_mm512_mask_storeu_ps(arrangedW1+59346432+60582816*i5+7632*c1+24*k1, 63, wt41);
_mm512_mask_storeu_ps(arrangedW1+59354040+60582816*i5+7632*c1+24*k1, 4032, wt41);
_mm512_mask_storeu_ps(arrangedW1+59361648+60582816*i5+7632*c1+24*k1, 61440, wt41);
_mm512_mask_storeu_ps(arrangedW1+44509824+60582816*i5+7632*c1+24*k1, 63, wt42);
_mm512_mask_storeu_ps(arrangedW1+44517432+60582816*i5+7632*c1+24*k1, 4032, wt42);
_mm512_mask_storeu_ps(arrangedW1+44525040+60582816*i5+7632*c1+24*k1, 61440, wt42);
_mm512_mask_storeu_ps(arrangedW1+11127456+60582816*i5+7632*c1+24*k1, 63, wt43);
_mm512_mask_storeu_ps(arrangedW1+11135064+60582816*i5+7632*c1+24*k1, 4032, wt43);
_mm512_mask_storeu_ps(arrangedW1+11142672+60582816*i5+7632*c1+24*k1, 61440, wt43);
_mm512_mask_storeu_ps(arrangedW1+22254912+60582816*i5+7632*c1+24*k1, 63, wt44);
_mm512_mask_storeu_ps(arrangedW1+22262520+60582816*i5+7632*c1+24*k1, 4032, wt44);
_mm512_mask_storeu_ps(arrangedW1+22270128+60582816*i5+7632*c1+24*k1, 61440, wt44);
_mm512_mask_storeu_ps(arrangedW1+32145984+60582816*i5+7632*c1+24*k1, 63, wt45);
_mm512_mask_storeu_ps(arrangedW1+32153592+60582816*i5+7632*c1+24*k1, 4032, wt45);
_mm512_mask_storeu_ps(arrangedW1+32161200+60582816*i5+7632*c1+24*k1, 61440, wt45);
_mm512_mask_storeu_ps(arrangedW1+12363840+60582816*i5+7632*c1+24*k1, 63, wt46);
_mm512_mask_storeu_ps(arrangedW1+12371448+60582816*i5+7632*c1+24*k1, 4032, wt46);
_mm512_mask_storeu_ps(arrangedW1+12379056+60582816*i5+7632*c1+24*k1, 61440, wt46);
_mm512_mask_storeu_ps(arrangedW1+23491296+60582816*i5+7632*c1+24*k1, 63, wt47);
_mm512_mask_storeu_ps(arrangedW1+23498904+60582816*i5+7632*c1+24*k1, 4032, wt47);
_mm512_mask_storeu_ps(arrangedW1+23506512+60582816*i5+7632*c1+24*k1, 61440, wt47);
_mm512_mask_storeu_ps(arrangedW1+33382368+60582816*i5+7632*c1+24*k1, 63, wt48);
_mm512_mask_storeu_ps(arrangedW1+33389976+60582816*i5+7632*c1+24*k1, 4032, wt48);
_mm512_mask_storeu_ps(arrangedW1+33397584+60582816*i5+7632*c1+24*k1, 61440, wt48);
__m512 wt49 = _mm512_maskz_loadu_ps(1, wtPtr1+192+60582816*i5+997248*j1+196*k1);
__m512 wt50 = _mm512_maskz_loadu_ps(1, wtPtr1+62520+60582816*i5+997248*j1+196*k1);
__m512 wt51 = _mm512_maskz_loadu_ps(1, wtPtr1+124848+60582816*i5+997248*j1+196*k1);
__m512 wt52 = _mm512_maskz_loadu_ps(1, wtPtr1+187176+60582816*i5+997248*j1+196*k1);
__m512 wt53 = _mm512_maskz_loadu_ps(1, wtPtr1+249504+60582816*i5+997248*j1+196*k1);
__m512 wt54 = _mm512_maskz_loadu_ps(1, wtPtr1+311832+60582816*i5+997248*j1+196*k1);
__m512 wt55 = _mm512_maskz_loadu_ps(1, wtPtr1+374160+60582816*i5+997248*j1+196*k1);
__m512 wt56 = _mm512_maskz_loadu_ps(1, wtPtr1+436488+60582816*i5+997248*j1+196*k1);
__m512 wt57 = _mm512_maskz_loadu_ps(1, wtPtr1+498816+60582816*i5+997248*j1+196*k1);
__m512 wt58 = _mm512_maskz_loadu_ps(1, wtPtr1+561144+60582816*i5+997248*j1+196*k1);
__m512 wt59 = _mm512_maskz_loadu_ps(1, wtPtr1+623472+60582816*i5+997248*j1+196*k1);
__m512 wt60 = _mm512_maskz_loadu_ps(1, wtPtr1+685800+60582816*i5+997248*j1+196*k1);
__m512 wt61 = _mm512_maskz_loadu_ps(1, wtPtr1+748128+60582816*i5+997248*j1+196*k1);
__m512 wt62 = _mm512_maskz_loadu_ps(1, wtPtr1+810456+60582816*i5+997248*j1+196*k1);
__m512 wt63 = _mm512_maskz_loadu_ps(1, wtPtr1+872784+60582816*i5+997248*j1+196*k1);
__m512 wt64 = _mm512_maskz_loadu_ps(1, wtPtr1+935112+60582816*i5+997248*j1+196*k1);
__m512 tmp145 = _mm512_unpacklo_ps(wt49, wt50);
__m512 tmp146 = _mm512_unpacklo_ps(wt51, wt52);
__m512 tmp147 = _mm512_unpacklo_ps(wt53, wt54);
__m512 tmp148 = _mm512_unpacklo_ps(wt55, wt56);
__m512 tmp149 = _mm512_unpacklo_ps(wt57, wt58);
__m512 tmp150 = _mm512_unpacklo_ps(wt59, wt60);
__m512 tmp151 = _mm512_unpacklo_ps(wt61, wt62);
__m512 tmp152 = _mm512_unpacklo_ps(wt63, wt64);
__m512 tmp153 = _mm512_shuffle_ps(tmp145, tmp146, 68);
__m512 tmp154 = _mm512_shuffle_ps(tmp147, tmp148, 68);
__m512 tmp155 = _mm512_shuffle_ps(tmp149, tmp150, 68);
__m512 tmp156 = _mm512_shuffle_ps(tmp151, tmp152, 68);
__m512 tmp157 = _mm512_shuffle_f32x4(tmp153, tmp154, 136);
__m512 tmp158 = _mm512_shuffle_f32x4(tmp155, tmp156, 136);
wt49 = _mm512_shuffle_f32x4(tmp157, tmp158, 136);
_mm512_mask_storeu_ps(arrangedW1+13600224+60582816*i5+7632*c1+24*k1, 63, wt49);
_mm512_mask_storeu_ps(arrangedW1+13607832+60582816*i5+7632*c1+24*k1, 4032, wt49);
_mm512_mask_storeu_ps(arrangedW1+13615440+60582816*i5+7632*c1+24*k1, 61440, wt49);
}
break;
}
case 2: {
ptrdiff_t k2 = 0;
for (; k2 != 318; ++k2) {
__m512 wt65 = _mm512_maskz_loadu_ps(65535, wtPtr1+0+60582816*i5+997248*j1+196*k2);
__m512 wt66 = _mm512_maskz_loadu_ps(65535, wtPtr1+62328+60582816*i5+997248*j1+196*k2);
__m512 wt67 = _mm512_maskz_loadu_ps(65535, wtPtr1+124656+60582816*i5+997248*j1+196*k2);
__m512 wt68 = _mm512_maskz_loadu_ps(65535, wtPtr1+186984+60582816*i5+997248*j1+196*k2);
__m512 wt69 = _mm512_maskz_loadu_ps(65535, wtPtr1+249312+60582816*i5+997248*j1+196*k2);
__m512 wt70 = _mm512_maskz_loadu_ps(65535, wtPtr1+311640+60582816*i5+997248*j1+196*k2);
__m512 wt71 = _mm512_maskz_loadu_ps(65535, wtPtr1+373968+60582816*i5+997248*j1+196*k2);
__m512 wt72 = _mm512_maskz_loadu_ps(65535, wtPtr1+436296+60582816*i5+997248*j1+196*k2);
__m512 wt73 = _mm512_maskz_loadu_ps(65535, wtPtr1+498624+60582816*i5+997248*j1+196*k2);
__m512 wt74 = _mm512_maskz_loadu_ps(65535, wtPtr1+560952+60582816*i5+997248*j1+196*k2);
__m512 wt75 = _mm512_maskz_loadu_ps(65535, wtPtr1+623280+60582816*i5+997248*j1+196*k2);
__m512 wt76 = _mm512_maskz_loadu_ps(65535, wtPtr1+685608+60582816*i5+997248*j1+196*k2);
__m512 wt77 = _mm512_maskz_loadu_ps(65535, wtPtr1+747936+60582816*i5+997248*j1+196*k2);
__m512 wt78 = _mm512_maskz_loadu_ps(65535, wtPtr1+810264+60582816*i5+997248*j1+196*k2);
__m512 wt79 = _mm512_maskz_loadu_ps(65535, wtPtr1+872592+60582816*i5+997248*j1+196*k2);
__m512 wt80 = _mm512_maskz_loadu_ps(65535, wtPtr1+934920+60582816*i5+997248*j1+196*k2);
__m512 tmp159 = _mm512_unpacklo_ps(wt65, wt66);
__m512 tmp160 = _mm512_unpackhi_ps(wt65, wt66);
__m512 tmp161 = _mm512_unpacklo_ps(wt67, wt68);
__m512 tmp162 = _mm512_unpackhi_ps(wt67, wt68);
__m512 tmp163 = _mm512_unpacklo_ps(wt69, wt70);
__m512 tmp164 = _mm512_unpackhi_ps(wt69, wt70);
__m512 tmp165 = _mm512_unpacklo_ps(wt71, wt72);
__m512 tmp166 = _mm512_unpackhi_ps(wt71, wt72);
__m512 tmp167 = _mm512_unpacklo_ps(wt73, wt74);
__m512 tmp168 = _mm512_unpackhi_ps(wt73, wt74);
__m512 tmp169 = _mm512_unpacklo_ps(wt75, wt76);
__m512 tmp170 = _mm512_unpackhi_ps(wt75, wt76);
__m512 tmp171 = _mm512_unpacklo_ps(wt77, wt78);
__m512 tmp172 = _mm512_unpackhi_ps(wt77, wt78);
__m512 tmp173 = _mm512_unpacklo_ps(wt79, wt80);
__m512 tmp174 = _mm512_unpackhi_ps(wt79, wt80);
__m512 tmp175 = _mm512_shuffle_ps(tmp159, tmp161, 68);
__m512 tmp176 = _mm512_shuffle_ps(tmp159, tmp161, 238);
__m512 tmp177 = _mm512_shuffle_ps(tmp160, tmp162, 68);
__m512 tmp178 = _mm512_shuffle_ps(tmp160, tmp162, 238);
__m512 tmp179 = _mm512_shuffle_ps(tmp163, tmp165, 68);
__m512 tmp180 = _mm512_shuffle_ps(tmp163, tmp165, 238);
__m512 tmp181 = _mm512_shuffle_ps(tmp164, tmp166, 68);
__m512 tmp182 = _mm512_shuffle_ps(tmp164, tmp166, 238);
__m512 tmp183 = _mm512_shuffle_ps(tmp167, tmp169, 68);
__m512 tmp184 = _mm512_shuffle_ps(tmp167, tmp169, 238);
__m512 tmp185 = _mm512_shuffle_ps(tmp168, tmp170, 68);
__m512 tmp186 = _mm512_shuffle_ps(tmp168, tmp170, 238);
__m512 tmp187 = _mm512_shuffle_ps(tmp171, tmp173, 68);
__m512 tmp188 = _mm512_shuffle_ps(tmp171, tmp173, 238);
__m512 tmp189 = _mm512_shuffle_ps(tmp172, tmp174, 68);
__m512 tmp190 = _mm512_shuffle_ps(tmp172, tmp174, 238);
__m512 tmp191 = _mm512_shuffle_f32x4(tmp175, tmp179, 136);
__m512 tmp192 = _mm512_shuffle_f32x4(tmp175, tmp179, 221);
__m512 tmp193 = _mm512_shuffle_f32x4(tmp176, tmp180, 136);
__m512 tmp194 = _mm512_shuffle_f32x4(tmp176, tmp180, 221);
__m512 tmp195 = _mm512_shuffle_f32x4(tmp177, tmp181, 136);
__m512 tmp196 = _mm512_shuffle_f32x4(tmp177, tmp181, 221);
__m512 tmp197 = _mm512_shuffle_f32x4(tmp178, tmp182, 136);
__m512 tmp198 = _mm512_shuffle_f32x4(tmp178, tmp182, 221);
__m512 tmp199 = _mm512_shuffle_f32x4(tmp183, tmp187, 136);
__m512 tmp200 = _mm512_shuffle_f32x4(tmp183, tmp187, 221);
__m512 tmp201 = _mm512_shuffle_f32x4(tmp184, tmp188, 136);
__m512 tmp202 = _mm512_shuffle_f32x4(tmp184, tmp188, 221);
__m512 tmp203 = _mm512_shuffle_f32x4(tmp185, tmp189, 136);
__m512 tmp204 = _mm512_shuffle_f32x4(tmp185, tmp189, 221);
__m512 tmp205 = _mm512_shuffle_f32x4(tmp186, tmp190, 136);
__m512 tmp206 = _mm512_shuffle_f32x4(tmp186, tmp190, 221);
wt65 = _mm512_shuffle_f32x4(tmp191, tmp199, 136);
wt73 = _mm512_shuffle_f32x4(tmp191, tmp199, 221);
wt66 = _mm512_shuffle_f32x4(tmp193, tmp201, 136);
wt74 = _mm512_shuffle_f32x4(tmp193, tmp201, 221);
wt67 = _mm512_shuffle_f32x4(tmp195, tmp203, 136);
wt75 = _mm512_shuffle_f32x4(tmp195, tmp203, 221);
wt68 = _mm512_shuffle_f32x4(tmp197, tmp205, 136);
wt76 = _mm512_shuffle_f32x4(tmp197, tmp205, 221);
wt69 = _mm512_shuffle_f32x4(tmp192, tmp200, 136);
wt77 = _mm512_shuffle_f32x4(tmp192, tmp200, 221);
wt70 = _mm512_shuffle_f32x4(tmp194, tmp202, 136);
wt78 = _mm512_shuffle_f32x4(tmp194, tmp202, 221);
wt71 = _mm512_shuffle_f32x4(tmp196, tmp204, 136);
wt79 = _mm512_shuffle_f32x4(tmp196, tmp204, 221);
wt72 = _mm512_shuffle_f32x4(tmp198, tmp206, 136);
wt80 = _mm512_shuffle_f32x4(tmp198, tmp206, 221);
_mm512_mask_storeu_ps(arrangedW1+8+60582816*i5+7632*c1+24*k2, 15, wt65);
_mm512_mask_storeu_ps(arrangedW1+7616+60582816*i5+7632*c1+24*k2, 1008, wt65);
_mm512_mask_storeu_ps(arrangedW1+15224+60582816*i5+7632*c1+24*k2, 64512, wt65);
_mm512_mask_storeu_ps(arrangedW1+14836616+60582816*i5+7632*c1+24*k2, 15, wt66);
_mm512_mask_storeu_ps(arrangedW1+14844224+60582816*i5+7632*c1+24*k2, 1008, wt66);
_mm512_mask_storeu_ps(arrangedW1+14851832+60582816*i5+7632*c1+24*k2, 64512, wt66);
_mm512_mask_storeu_ps(arrangedW1+24727688+60582816*i5+7632*c1+24*k2, 15, wt67);
_mm512_mask_storeu_ps(arrangedW1+24735296+60582816*i5+7632*c1+24*k2, 1008, wt67);
_mm512_mask_storeu_ps(arrangedW1+24742904+60582816*i5+7632*c1+24*k2, 64512, wt67);
_mm512_mask_storeu_ps(arrangedW1+1236392+60582816*i5+7632*c1+24*k2, 15, wt68);
_mm512_mask_storeu_ps(arrangedW1+1244000+60582816*i5+7632*c1+24*k2, 1008, wt68);
_mm512_mask_storeu_ps(arrangedW1+1251608+60582816*i5+7632*c1+24*k2, 64512, wt68);
_mm512_mask_storeu_ps(arrangedW1+16073000+60582816*i5+7632*c1+24*k2, 15, wt69);
_mm512_mask_storeu_ps(arrangedW1+16080608+60582816*i5+7632*c1+24*k2, 1008, wt69);
_mm512_mask_storeu_ps(arrangedW1+16088216+60582816*i5+7632*c1+24*k2, 64512, wt69);
_mm512_mask_storeu_ps(arrangedW1+25964072+60582816*i5+7632*c1+24*k2, 15, wt70);
_mm512_mask_storeu_ps(arrangedW1+25971680+60582816*i5+7632*c1+24*k2, 1008, wt70);
_mm512_mask_storeu_ps(arrangedW1+25979288+60582816*i5+7632*c1+24*k2, 64512, wt70);
_mm512_mask_storeu_ps(arrangedW1+2472776+60582816*i5+7632*c1+24*k2, 15, wt71);
_mm512_mask_storeu_ps(arrangedW1+2480384+60582816*i5+7632*c1+24*k2, 1008, wt71);
_mm512_mask_storeu_ps(arrangedW1+2487992+60582816*i5+7632*c1+24*k2, 64512, wt71);
_mm512_mask_storeu_ps(arrangedW1+34618760+60582816*i5+7632*c1+24*k2, 15, wt72);
_mm512_mask_storeu_ps(arrangedW1+34626368+60582816*i5+7632*c1+24*k2, 1008, wt72);
_mm512_mask_storeu_ps(arrangedW1+34633976+60582816*i5+7632*c1+24*k2, 64512, wt72);
_mm512_mask_storeu_ps(arrangedW1+45746216+60582816*i5+7632*c1+24*k2, 15, wt73);
_mm512_mask_storeu_ps(arrangedW1+45753824+60582816*i5+7632*c1+24*k2, 1008, wt73);
_mm512_mask_storeu_ps(arrangedW1+45761432+60582816*i5+7632*c1+24*k2, 64512, wt73);
_mm512_mask_storeu_ps(arrangedW1+53164520+60582816*i5+7632*c1+24*k2, 15, wt74);
_mm512_mask_storeu_ps(arrangedW1+53172128+60582816*i5+7632*c1+24*k2, 1008, wt74);
_mm512_mask_storeu_ps(arrangedW1+53179736+60582816*i5+7632*c1+24*k2, 64512, wt74);
_mm512_mask_storeu_ps(arrangedW1+35855144+60582816*i5+7632*c1+24*k2, 15, wt75);
_mm512_mask_storeu_ps(arrangedW1+35862752+60582816*i5+7632*c1+24*k2, 1008, wt75);
_mm512_mask_storeu_ps(arrangedW1+35870360+60582816*i5+7632*c1+24*k2, 64512, wt75);
_mm512_mask_storeu_ps(arrangedW1+46982600+60582816*i5+7632*c1+24*k2, 15, wt76);
_mm512_mask_storeu_ps(arrangedW1+46990208+60582816*i5+7632*c1+24*k2, 1008, wt76);
_mm512_mask_storeu_ps(arrangedW1+46997816+60582816*i5+7632*c1+24*k2, 64512, wt76);
_mm512_mask_storeu_ps(arrangedW1+54400904+60582816*i5+7632*c1+24*k2, 15, wt77);
_mm512_mask_storeu_ps(arrangedW1+54408512+60582816*i5+7632*c1+24*k2, 1008, wt77);
_mm512_mask_storeu_ps(arrangedW1+54416120+60582816*i5+7632*c1+24*k2, 64512, wt77);
_mm512_mask_storeu_ps(arrangedW1+37091528+60582816*i5+7632*c1+24*k2, 15, wt78);
_mm512_mask_storeu_ps(arrangedW1+37099136+60582816*i5+7632*c1+24*k2, 1008, wt78);
_mm512_mask_storeu_ps(arrangedW1+37106744+60582816*i5+7632*c1+24*k2, 64512, wt78);
_mm512_mask_storeu_ps(arrangedW1+3709160+60582816*i5+7632*c1+24*k2, 15, wt79);
_mm512_mask_storeu_ps(arrangedW1+3716768+60582816*i5+7632*c1+24*k2, 1008, wt79);
_mm512_mask_storeu_ps(arrangedW1+3724376+60582816*i5+7632*c1+24*k2, 64512, wt79);
_mm512_mask_storeu_ps(arrangedW1+17309384+60582816*i5+7632*c1+24*k2, 15, wt80);
_mm512_mask_storeu_ps(arrangedW1+17316992+60582816*i5+7632*c1+24*k2, 1008, wt80);
_mm512_mask_storeu_ps(arrangedW1+17324600+60582816*i5+7632*c1+24*k2, 64512, wt80);
__m512 wt81 = _mm512_maskz_loadu_ps(65535, wtPtr1+64+60582816*i5+997248*j1+196*k2);
__m512 wt82 = _mm512_maskz_loadu_ps(65535, wtPtr1+62392+60582816*i5+997248*j1+196*k2);
__m512 wt83 = _mm512_maskz_loadu_ps(65535, wtPtr1+124720+60582816*i5+997248*j1+196*k2);
__m512 wt84 = _mm512_maskz_loadu_ps(65535, wtPtr1+187048+60582816*i5+997248*j1+196*k2);
__m512 wt85 = _mm512_maskz_loadu_ps(65535, wtPtr1+249376+60582816*i5+997248*j1+196*k2);
__m512 wt86 = _mm512_maskz_loadu_ps(65535, wtPtr1+311704+60582816*i5+997248*j1+196*k2);
__m512 wt87 = _mm512_maskz_loadu_ps(65535, wtPtr1+374032+60582816*i5+997248*j1+196*k2);
__m512 wt88 = _mm512_maskz_loadu_ps(65535, wtPtr1+436360+60582816*i5+997248*j1+196*k2);
__m512 wt89 = _mm512_maskz_loadu_ps(65535, wtPtr1+498688+60582816*i5+997248*j1+196*k2);
__m512 wt90 = _mm512_maskz_loadu_ps(65535, wtPtr1+561016+60582816*i5+997248*j1+196*k2);
__m512 wt91 = _mm512_maskz_loadu_ps(65535, wtPtr1+623344+60582816*i5+997248*j1+196*k2);
__m512 wt92 = _mm512_maskz_loadu_ps(65535, wtPtr1+685672+60582816*i5+997248*j1+196*k2);
__m512 wt93 = _mm512_maskz_loadu_ps(65535, wtPtr1+748000+60582816*i5+997248*j1+196*k2);
__m512 wt94 = _mm512_maskz_loadu_ps(65535, wtPtr1+810328+60582816*i5+997248*j1+196*k2);
__m512 wt95 = _mm512_maskz_loadu_ps(65535, wtPtr1+872656+60582816*i5+997248*j1+196*k2);
__m512 wt96 = _mm512_maskz_loadu_ps(65535, wtPtr1+934984+60582816*i5+997248*j1+196*k2);
__m512 tmp207 = _mm512_unpacklo_ps(wt81, wt82);
__m512 tmp208 = _mm512_unpackhi_ps(wt81, wt82);
__m512 tmp209 = _mm512_unpacklo_ps(wt83, wt84);
__m512 tmp210 = _mm512_unpackhi_ps(wt83, wt84);
__m512 tmp211 = _mm512_unpacklo_ps(wt85, wt86);
__m512 tmp212 = _mm512_unpackhi_ps(wt85, wt86);
__m512 tmp213 = _mm512_unpacklo_ps(wt87, wt88);
__m512 tmp214 = _mm512_unpackhi_ps(wt87, wt88);
__m512 tmp215 = _mm512_unpacklo_ps(wt89, wt90);
__m512 tmp216 = _mm512_unpackhi_ps(wt89, wt90);
__m512 tmp217 = _mm512_unpacklo_ps(wt91, wt92);
__m512 tmp218 = _mm512_unpackhi_ps(wt91, wt92);
__m512 tmp219 = _mm512_unpacklo_ps(wt93, wt94);
__m512 tmp220 = _mm512_unpackhi_ps(wt93, wt94);
__m512 tmp221 = _mm512_unpacklo_ps(wt95, wt96);
__m512 tmp222 = _mm512_unpackhi_ps(wt95, wt96);
__m512 tmp223 = _mm512_shuffle_ps(tmp207, tmp209, 68);
__m512 tmp224 = _mm512_shuffle_ps(tmp207, tmp209, 238);
__m512 tmp225 = _mm512_shuffle_ps(tmp208, tmp210, 68);
__m512 tmp226 = _mm512_shuffle_ps(tmp208, tmp210, 238);
__m512 tmp227 = _mm512_shuffle_ps(tmp211, tmp213, 68);
__m512 tmp228 = _mm512_shuffle_ps(tmp211, tmp213, 238);
__m512 tmp229 = _mm512_shuffle_ps(tmp212, tmp214, 68);
__m512 tmp230 = _mm512_shuffle_ps(tmp212, tmp214, 238);
__m512 tmp231 = _mm512_shuffle_ps(tmp215, tmp217, 68);
__m512 tmp232 = _mm512_shuffle_ps(tmp215, tmp217, 238);
__m512 tmp233 = _mm512_shuffle_ps(tmp216, tmp218, 68);
__m512 tmp234 = _mm512_shuffle_ps(tmp216, tmp218, 238);
__m512 tmp235 = _mm512_shuffle_ps(tmp219, tmp221, 68);
__m512 tmp236 = _mm512_shuffle_ps(tmp219, tmp221, 238);
__m512 tmp237 = _mm512_shuffle_ps(tmp220, tmp222, 68);
__m512 tmp238 = _mm512_shuffle_ps(tmp220, tmp222, 238);
__m512 tmp239 = _mm512_shuffle_f32x4(tmp223, tmp227, 136);
__m512 tmp240 = _mm512_shuffle_f32x4(tmp223, tmp227, 221);
__m512 tmp241 = _mm512_shuffle_f32x4(tmp224, tmp228, 136);
__m512 tmp242 = _mm512_shuffle_f32x4(tmp224, tmp228, 221);
__m512 tmp243 = _mm512_shuffle_f32x4(tmp225, tmp229, 136);
__m512 tmp244 = _mm512_shuffle_f32x4(tmp225, tmp229, 221);
__m512 tmp245 = _mm512_shuffle_f32x4(tmp226, tmp230, 136);
__m512 tmp246 = _mm512_shuffle_f32x4(tmp226, tmp230, 221);
__m512 tmp247 = _mm512_shuffle_f32x4(tmp231, tmp235, 136);
__m512 tmp248 = _mm512_shuffle_f32x4(tmp231, tmp235, 221);
__m512 tmp249 = _mm512_shuffle_f32x4(tmp232, tmp236, 136);
__m512 tmp250 = _mm512_shuffle_f32x4(tmp232, tmp236, 221);
__m512 tmp251 = _mm512_shuffle_f32x4(tmp233, tmp237, 136);
__m512 tmp252 = _mm512_shuffle_f32x4(tmp233, tmp237, 221);
__m512 tmp253 = _mm512_shuffle_f32x4(tmp234, tmp238, 136);
__m512 tmp254 = _mm512_shuffle_f32x4(tmp234, tmp238, 221);
wt81 = _mm512_shuffle_f32x4(tmp239, tmp247, 136);
wt89 = _mm512_shuffle_f32x4(tmp239, tmp247, 221);
wt82 = _mm512_shuffle_f32x4(tmp241, tmp249, 136);
wt90 = _mm512_shuffle_f32x4(tmp241, tmp249, 221);
wt83 = _mm512_shuffle_f32x4(tmp243, tmp251, 136);
wt91 = _mm512_shuffle_f32x4(tmp243, tmp251, 221);
wt84 = _mm512_shuffle_f32x4(tmp245, tmp253, 136);
wt92 = _mm512_shuffle_f32x4(tmp245, tmp253, 221);
wt85 = _mm512_shuffle_f32x4(tmp240, tmp248, 136);
wt93 = _mm512_shuffle_f32x4(tmp240, tmp248, 221);
wt86 = _mm512_shuffle_f32x4(tmp242, tmp250, 136);
wt94 = _mm512_shuffle_f32x4(tmp242, tmp250, 221);
wt87 = _mm512_shuffle_f32x4(tmp244, tmp252, 136);
wt95 = _mm512_shuffle_f32x4(tmp244, tmp252, 221);
wt88 = _mm512_shuffle_f32x4(tmp246, tmp254, 136);
wt96 = _mm512_shuffle_f32x4(tmp246, tmp254, 221);
_mm512_mask_storeu_ps(arrangedW1+27200456+60582816*i5+7632*c1+24*k2, 15, wt81);
_mm512_mask_storeu_ps(arrangedW1+27208064+60582816*i5+7632*c1+24*k2, 1008, wt81);
_mm512_mask_storeu_ps(arrangedW1+27215672+60582816*i5+7632*c1+24*k2, 64512, wt81);
_mm512_mask_storeu_ps(arrangedW1+4945544+60582816*i5+7632*c1+24*k2, 15, wt82);
_mm512_mask_storeu_ps(arrangedW1+4953152+60582816*i5+7632*c1+24*k2, 1008, wt82);
_mm512_mask_storeu_ps(arrangedW1+4960760+60582816*i5+7632*c1+24*k2, 64512, wt82);
_mm512_mask_storeu_ps(arrangedW1+18545768+60582816*i5+7632*c1+24*k2, 15, wt83);
_mm512_mask_storeu_ps(arrangedW1+18553376+60582816*i5+7632*c1+24*k2, 1008, wt83);
_mm512_mask_storeu_ps(arrangedW1+18560984+60582816*i5+7632*c1+24*k2, 64512, wt83);
_mm512_mask_storeu_ps(arrangedW1+28436840+60582816*i5+7632*c1+24*k2, 15, wt84);
_mm512_mask_storeu_ps(arrangedW1+28444448+60582816*i5+7632*c1+24*k2, 1008, wt84);
_mm512_mask_storeu_ps(arrangedW1+28452056+60582816*i5+7632*c1+24*k2, 64512, wt84);
_mm512_mask_storeu_ps(arrangedW1+6181928+60582816*i5+7632*c1+24*k2, 15, wt85);
_mm512_mask_storeu_ps(arrangedW1+6189536+60582816*i5+7632*c1+24*k2, 1008, wt85);
_mm512_mask_storeu_ps(arrangedW1+6197144+60582816*i5+7632*c1+24*k2, 64512, wt85);
_mm512_mask_storeu_ps(arrangedW1+38327912+60582816*i5+7632*c1+24*k2, 15, wt86);
_mm512_mask_storeu_ps(arrangedW1+38335520+60582816*i5+7632*c1+24*k2, 1008, wt86);
_mm512_mask_storeu_ps(arrangedW1+38343128+60582816*i5+7632*c1+24*k2, 64512, wt86);
_mm512_mask_storeu_ps(arrangedW1+48218984+60582816*i5+7632*c1+24*k2, 15, wt87);
_mm512_mask_storeu_ps(arrangedW1+48226592+60582816*i5+7632*c1+24*k2, 1008, wt87);
_mm512_mask_storeu_ps(arrangedW1+48234200+60582816*i5+7632*c1+24*k2, 64512, wt87);
_mm512_mask_storeu_ps(arrangedW1+55637288+60582816*i5+7632*c1+24*k2, 15, wt88);
_mm512_mask_storeu_ps(arrangedW1+55644896+60582816*i5+7632*c1+24*k2, 1008, wt88);
_mm512_mask_storeu_ps(arrangedW1+55652504+60582816*i5+7632*c1+24*k2, 64512, wt88);
_mm512_mask_storeu_ps(arrangedW1+39564296+60582816*i5+7632*c1+24*k2, 15, wt89);
_mm512_mask_storeu_ps(arrangedW1+39571904+60582816*i5+7632*c1+24*k2, 1008, wt89);
_mm512_mask_storeu_ps(arrangedW1+39579512+60582816*i5+7632*c1+24*k2, 64512, wt89);
_mm512_mask_storeu_ps(arrangedW1+49455368+60582816*i5+7632*c1+24*k2, 15, wt90);
_mm512_mask_storeu_ps(arrangedW1+49462976+60582816*i5+7632*c1+24*k2, 1008, wt90);
_mm512_mask_storeu_ps(arrangedW1+49470584+60582816*i5+7632*c1+24*k2, 64512, wt90);
_mm512_mask_storeu_ps(arrangedW1+56873672+60582816*i5+7632*c1+24*k2, 15, wt91);
_mm512_mask_storeu_ps(arrangedW1+56881280+60582816*i5+7632*c1+24*k2, 1008, wt91);
_mm512_mask_storeu_ps(arrangedW1+56888888+60582816*i5+7632*c1+24*k2, 64512, wt91);
_mm512_mask_storeu_ps(arrangedW1+40800680+60582816*i5+7632*c1+24*k2, 15, wt92);
_mm512_mask_storeu_ps(arrangedW1+40808288+60582816*i5+7632*c1+24*k2, 1008, wt92);
_mm512_mask_storeu_ps(arrangedW1+40815896+60582816*i5+7632*c1+24*k2, 64512, wt92);
_mm512_mask_storeu_ps(arrangedW1+7418312+60582816*i5+7632*c1+24*k2, 15, wt93);
_mm512_mask_storeu_ps(arrangedW1+7425920+60582816*i5+7632*c1+24*k2, 1008, wt93);
_mm512_mask_storeu_ps(arrangedW1+7433528+60582816*i5+7632*c1+24*k2, 64512, wt93);
_mm512_mask_storeu_ps(arrangedW1+19782152+60582816*i5+7632*c1+24*k2, 15, wt94);
_mm512_mask_storeu_ps(arrangedW1+19789760+60582816*i5+7632*c1+24*k2, 1008, wt94);
_mm512_mask_storeu_ps(arrangedW1+19797368+60582816*i5+7632*c1+24*k2, 64512, wt94);
_mm512_mask_storeu_ps(arrangedW1+29673224+60582816*i5+7632*c1+24*k2, 15, wt95);
_mm512_mask_storeu_ps(arrangedW1+29680832+60582816*i5+7632*c1+24*k2, 1008, wt95);
_mm512_mask_storeu_ps(arrangedW1+29688440+60582816*i5+7632*c1+24*k2, 64512, wt95);
_mm512_mask_storeu_ps(arrangedW1+8654696+60582816*i5+7632*c1+24*k2, 15, wt96);
_mm512_mask_storeu_ps(arrangedW1+8662304+60582816*i5+7632*c1+24*k2, 1008, wt96);
_mm512_mask_storeu_ps(arrangedW1+8669912+60582816*i5+7632*c1+24*k2, 64512, wt96);
// Gather step: full-width masked loads (mask 65535 = all 16 lanes) from 16
// source locations spaced 62328 apart, starting at offset 128 from the
// position selected by i5, j1 and k2 inside wtPtr1.
// NOTE(review): wtPtr1 and arrangedW1 are declared outside this chunk. The
// offsets look like byte offsets (196 = 7*7 floats * 4 bytes, matching
// FilterH=7/FilterW=7 in the graph config) -- confirm against the declarations.
__m512 wt97 = _mm512_maskz_loadu_ps(65535, wtPtr1+128+60582816*i5+997248*j1+196*k2);
__m512 wt98 = _mm512_maskz_loadu_ps(65535, wtPtr1+62456+60582816*i5+997248*j1+196*k2);
__m512 wt99 = _mm512_maskz_loadu_ps(65535, wtPtr1+124784+60582816*i5+997248*j1+196*k2);
__m512 wt100 = _mm512_maskz_loadu_ps(65535, wtPtr1+187112+60582816*i5+997248*j1+196*k2);
__m512 wt101 = _mm512_maskz_loadu_ps(65535, wtPtr1+249440+60582816*i5+997248*j1+196*k2);
__m512 wt102 = _mm512_maskz_loadu_ps(65535, wtPtr1+311768+60582816*i5+997248*j1+196*k2);
__m512 wt103 = _mm512_maskz_loadu_ps(65535, wtPtr1+374096+60582816*i5+997248*j1+196*k2);
__m512 wt104 = _mm512_maskz_loadu_ps(65535, wtPtr1+436424+60582816*i5+997248*j1+196*k2);
__m512 wt105 = _mm512_maskz_loadu_ps(65535, wtPtr1+498752+60582816*i5+997248*j1+196*k2);
__m512 wt106 = _mm512_maskz_loadu_ps(65535, wtPtr1+561080+60582816*i5+997248*j1+196*k2);
__m512 wt107 = _mm512_maskz_loadu_ps(65535, wtPtr1+623408+60582816*i5+997248*j1+196*k2);
__m512 wt108 = _mm512_maskz_loadu_ps(65535, wtPtr1+685736+60582816*i5+997248*j1+196*k2);
__m512 wt109 = _mm512_maskz_loadu_ps(65535, wtPtr1+748064+60582816*i5+997248*j1+196*k2);
__m512 wt110 = _mm512_maskz_loadu_ps(65535, wtPtr1+810392+60582816*i5+997248*j1+196*k2);
__m512 wt111 = _mm512_maskz_loadu_ps(65535, wtPtr1+872720+60582816*i5+997248*j1+196*k2);
__m512 wt112 = _mm512_maskz_loadu_ps(65535, wtPtr1+935048+60582816*i5+997248*j1+196*k2);
// 16x16 transpose, stage 1: interleave each adjacent pair of loaded vectors
// within every 128-bit lane (lo = elements 0,1 of the lane; hi = elements 2,3).
__m512 tmp255 = _mm512_unpacklo_ps(wt97, wt98);
__m512 tmp256 = _mm512_unpackhi_ps(wt97, wt98);
__m512 tmp257 = _mm512_unpacklo_ps(wt99, wt100);
__m512 tmp258 = _mm512_unpackhi_ps(wt99, wt100);
__m512 tmp259 = _mm512_unpacklo_ps(wt101, wt102);
__m512 tmp260 = _mm512_unpackhi_ps(wt101, wt102);
__m512 tmp261 = _mm512_unpacklo_ps(wt103, wt104);
__m512 tmp262 = _mm512_unpackhi_ps(wt103, wt104);
__m512 tmp263 = _mm512_unpacklo_ps(wt105, wt106);
__m512 tmp264 = _mm512_unpackhi_ps(wt105, wt106);
__m512 tmp265 = _mm512_unpacklo_ps(wt107, wt108);
__m512 tmp266 = _mm512_unpackhi_ps(wt107, wt108);
__m512 tmp267 = _mm512_unpacklo_ps(wt109, wt110);
__m512 tmp268 = _mm512_unpackhi_ps(wt109, wt110);
__m512 tmp269 = _mm512_unpacklo_ps(wt111, wt112);
__m512 tmp270 = _mm512_unpackhi_ps(wt111, wt112);
// Stage 2: merge two pair-interleaves so each 128-bit lane carries the same
// element index from four consecutive sources. Immediate 68 takes the low
// 64 bits of both operands, 238 the high 64 bits.
__m512 tmp271 = _mm512_shuffle_ps(tmp255, tmp257, 68);
__m512 tmp272 = _mm512_shuffle_ps(tmp255, tmp257, 238);
__m512 tmp273 = _mm512_shuffle_ps(tmp256, tmp258, 68);
__m512 tmp274 = _mm512_shuffle_ps(tmp256, tmp258, 238);
__m512 tmp275 = _mm512_shuffle_ps(tmp259, tmp261, 68);
__m512 tmp276 = _mm512_shuffle_ps(tmp259, tmp261, 238);
__m512 tmp277 = _mm512_shuffle_ps(tmp260, tmp262, 68);
__m512 tmp278 = _mm512_shuffle_ps(tmp260, tmp262, 238);
__m512 tmp279 = _mm512_shuffle_ps(tmp263, tmp265, 68);
__m512 tmp280 = _mm512_shuffle_ps(tmp263, tmp265, 238);
__m512 tmp281 = _mm512_shuffle_ps(tmp264, tmp266, 68);
__m512 tmp282 = _mm512_shuffle_ps(tmp264, tmp266, 238);
__m512 tmp283 = _mm512_shuffle_ps(tmp267, tmp269, 68);
__m512 tmp284 = _mm512_shuffle_ps(tmp267, tmp269, 238);
__m512 tmp285 = _mm512_shuffle_ps(tmp268, tmp270, 68);
__m512 tmp286 = _mm512_shuffle_ps(tmp268, tmp270, 238);
// Stage 3: regroup whole 128-bit lanes across eight sources. Immediate 136
// picks lanes 0 and 2 of each operand, 221 picks lanes 1 and 3.
__m512 tmp287 = _mm512_shuffle_f32x4(tmp271, tmp275, 136);
__m512 tmp288 = _mm512_shuffle_f32x4(tmp271, tmp275, 221);
__m512 tmp289 = _mm512_shuffle_f32x4(tmp272, tmp276, 136);
__m512 tmp290 = _mm512_shuffle_f32x4(tmp272, tmp276, 221);
__m512 tmp291 = _mm512_shuffle_f32x4(tmp273, tmp277, 136);
__m512 tmp292 = _mm512_shuffle_f32x4(tmp273, tmp277, 221);
__m512 tmp293 = _mm512_shuffle_f32x4(tmp274, tmp278, 136);
__m512 tmp294 = _mm512_shuffle_f32x4(tmp274, tmp278, 221);
__m512 tmp295 = _mm512_shuffle_f32x4(tmp279, tmp283, 136);
__m512 tmp296 = _mm512_shuffle_f32x4(tmp279, tmp283, 221);
__m512 tmp297 = _mm512_shuffle_f32x4(tmp280, tmp284, 136);
__m512 tmp298 = _mm512_shuffle_f32x4(tmp280, tmp284, 221);
__m512 tmp299 = _mm512_shuffle_f32x4(tmp281, tmp285, 136);
__m512 tmp300 = _mm512_shuffle_f32x4(tmp281, tmp285, 221);
__m512 tmp301 = _mm512_shuffle_f32x4(tmp282, tmp286, 136);
__m512 tmp302 = _mm512_shuffle_f32x4(tmp282, tmp286, 221);
// Stage 4: final lane regroup across all sixteen sources. The wt variables
// are reused for the result: afterwards wt97..wt112 hold elements 0..15
// (respectively) of the 16 loaded vectors, with lane j coming from source j.
wt97 = _mm512_shuffle_f32x4(tmp287, tmp295, 136);
wt105 = _mm512_shuffle_f32x4(tmp287, tmp295, 221);
wt98 = _mm512_shuffle_f32x4(tmp289, tmp297, 136);
wt106 = _mm512_shuffle_f32x4(tmp289, tmp297, 221);
wt99 = _mm512_shuffle_f32x4(tmp291, tmp299, 136);
wt107 = _mm512_shuffle_f32x4(tmp291, tmp299, 221);
wt100 = _mm512_shuffle_f32x4(tmp293, tmp301, 136);
wt108 = _mm512_shuffle_f32x4(tmp293, tmp301, 221);
wt101 = _mm512_shuffle_f32x4(tmp288, tmp296, 136);
wt109 = _mm512_shuffle_f32x4(tmp288, tmp296, 221);
wt102 = _mm512_shuffle_f32x4(tmp290, tmp298, 136);
wt110 = _mm512_shuffle_f32x4(tmp290, tmp298, 221);
wt103 = _mm512_shuffle_f32x4(tmp292, tmp300, 136);
wt111 = _mm512_shuffle_f32x4(tmp292, tmp300, 221);
wt104 = _mm512_shuffle_f32x4(tmp294, tmp302, 136);
wt112 = _mm512_shuffle_f32x4(tmp294, tmp302, 221);
// Scatter: each transposed vector goes to its own region of arrangedW1 and is
// written in three masked pieces -- lanes 0-3 (mask 15), lanes 4-9 (mask 1008)
// and lanes 10-15 (mask 64512) -- whose base offsets are 7608 apart.
// NOTE(review): if offsets are bytes, the pieces presumably land in three
// consecutive rows of the 7632-strided (c1) layout as a 4/6/6 split -- confirm.
_mm512_mask_storeu_ps(arrangedW1+21018536+60582816*i5+7632*c1+24*k2, 15, wt97);
_mm512_mask_storeu_ps(arrangedW1+21026144+60582816*i5+7632*c1+24*k2, 1008, wt97);
_mm512_mask_storeu_ps(arrangedW1+21033752+60582816*i5+7632*c1+24*k2, 64512, wt97);
_mm512_mask_storeu_ps(arrangedW1+30909608+60582816*i5+7632*c1+24*k2, 15, wt98);
_mm512_mask_storeu_ps(arrangedW1+30917216+60582816*i5+7632*c1+24*k2, 1008, wt98);
_mm512_mask_storeu_ps(arrangedW1+30924824+60582816*i5+7632*c1+24*k2, 64512, wt98);
_mm512_mask_storeu_ps(arrangedW1+9891080+60582816*i5+7632*c1+24*k2, 15, wt99);
_mm512_mask_storeu_ps(arrangedW1+9898688+60582816*i5+7632*c1+24*k2, 1008, wt99);
_mm512_mask_storeu_ps(arrangedW1+9906296+60582816*i5+7632*c1+24*k2, 64512, wt99);
_mm512_mask_storeu_ps(arrangedW1+42037064+60582816*i5+7632*c1+24*k2, 15, wt100);
_mm512_mask_storeu_ps(arrangedW1+42044672+60582816*i5+7632*c1+24*k2, 1008, wt100);
_mm512_mask_storeu_ps(arrangedW1+42052280+60582816*i5+7632*c1+24*k2, 64512, wt100);
_mm512_mask_storeu_ps(arrangedW1+50691752+60582816*i5+7632*c1+24*k2, 15, wt101);
_mm512_mask_storeu_ps(arrangedW1+50699360+60582816*i5+7632*c1+24*k2, 1008, wt101);
_mm512_mask_storeu_ps(arrangedW1+50706968+60582816*i5+7632*c1+24*k2, 64512, wt101);
_mm512_mask_storeu_ps(arrangedW1+58110056+60582816*i5+7632*c1+24*k2, 15, wt102);
_mm512_mask_storeu_ps(arrangedW1+58117664+60582816*i5+7632*c1+24*k2, 1008, wt102);
_mm512_mask_storeu_ps(arrangedW1+58125272+60582816*i5+7632*c1+24*k2, 64512, wt102);
_mm512_mask_storeu_ps(arrangedW1+43273448+60582816*i5+7632*c1+24*k2, 15, wt103);
_mm512_mask_storeu_ps(arrangedW1+43281056+60582816*i5+7632*c1+24*k2, 1008, wt103);
_mm512_mask_storeu_ps(arrangedW1+43288664+60582816*i5+7632*c1+24*k2, 64512, wt103);
_mm512_mask_storeu_ps(arrangedW1+51928136+60582816*i5+7632*c1+24*k2, 15, wt104);
_mm512_mask_storeu_ps(arrangedW1+51935744+60582816*i5+7632*c1+24*k2, 1008, wt104);
_mm512_mask_storeu_ps(arrangedW1+51943352+60582816*i5+7632*c1+24*k2, 64512, wt104);
_mm512_mask_storeu_ps(arrangedW1+59346440+60582816*i5+7632*c1+24*k2, 15, wt105);
_mm512_mask_storeu_ps(arrangedW1+59354048+60582816*i5+7632*c1+24*k2, 1008, wt105);
_mm512_mask_storeu_ps(arrangedW1+59361656+60582816*i5+7632*c1+24*k2, 64512, wt105);
_mm512_mask_storeu_ps(arrangedW1+44509832+60582816*i5+7632*c1+24*k2, 15, wt106);
_mm512_mask_storeu_ps(arrangedW1+44517440+60582816*i5+7632*c1+24*k2, 1008, wt106);
_mm512_mask_storeu_ps(arrangedW1+44525048+60582816*i5+7632*c1+24*k2, 64512, wt106);
_mm512_mask_storeu_ps(arrangedW1+11127464+60582816*i5+7632*c1+24*k2, 15, wt107);
_mm512_mask_storeu_ps(arrangedW1+11135072+60582816*i5+7632*c1+24*k2, 1008, wt107);
_mm512_mask_storeu_ps(arrangedW1+11142680+60582816*i5+7632*c1+24*k2, 64512, wt107);
_mm512_mask_storeu_ps(arrangedW1+22254920+60582816*i5+7632*c1+24*k2, 15, wt108);
_mm512_mask_storeu_ps(arrangedW1+22262528+60582816*i5+7632*c1+24*k2, 1008, wt108);
_mm512_mask_storeu_ps(arrangedW1+22270136+60582816*i5+7632*c1+24*k2, 64512, wt108);
_mm512_mask_storeu_ps(arrangedW1+32145992+60582816*i5+7632*c1+24*k2, 15, wt109);
_mm512_mask_storeu_ps(arrangedW1+32153600+60582816*i5+7632*c1+24*k2, 1008, wt109);
_mm512_mask_storeu_ps(arrangedW1+32161208+60582816*i5+7632*c1+24*k2, 64512, wt109);
_mm512_mask_storeu_ps(arrangedW1+12363848+60582816*i5+7632*c1+24*k2, 15, wt110);
_mm512_mask_storeu_ps(arrangedW1+12371456+60582816*i5+7632*c1+24*k2, 1008, wt110);
_mm512_mask_storeu_ps(arrangedW1+12379064+60582816*i5+7632*c1+24*k2, 64512, wt110);
_mm512_mask_storeu_ps(arrangedW1+23491304+60582816*i5+7632*c1+24*k2, 15, wt111);
_mm512_mask_storeu_ps(arrangedW1+23498912+60582816*i5+7632*c1+24*k2, 1008, wt111);
_mm512_mask_storeu_ps(arrangedW1+23506520+60582816*i5+7632*c1+24*k2, 64512, wt111);
_mm512_mask_storeu_ps(arrangedW1+33382376+60582816*i5+7632*c1+24*k2, 15, wt112);
_mm512_mask_storeu_ps(arrangedW1+33389984+60582816*i5+7632*c1+24*k2, 1008, wt112);
_mm512_mask_storeu_ps(arrangedW1+33397592+60582816*i5+7632*c1+24*k2, 64512, wt112);
// Remainder step: mask 1 loads only lane 0 of each of the 16 sources (maskz
// zeroes the other 15 lanes). Sources are spaced 62328 apart, starting at
// offset 192 from the position selected by i5, j1 and k2 inside wtPtr1.
// NOTE(review): wtPtr1 is declared outside this chunk. If offsets are bytes,
// 192 is float index 48, i.e. the last of 7*7 = 49 filter values -- confirm.
__m512 wt113 = _mm512_maskz_loadu_ps(1, wtPtr1+192+60582816*i5+997248*j1+196*k2);
__m512 wt114 = _mm512_maskz_loadu_ps(1, wtPtr1+62520+60582816*i5+997248*j1+196*k2);
__m512 wt115 = _mm512_maskz_loadu_ps(1, wtPtr1+124848+60582816*i5+997248*j1+196*k2);
__m512 wt116 = _mm512_maskz_loadu_ps(1, wtPtr1+187176+60582816*i5+997248*j1+196*k2);
__m512 wt117 = _mm512_maskz_loadu_ps(1, wtPtr1+249504+60582816*i5+997248*j1+196*k2);
__m512 wt118 = _mm512_maskz_loadu_ps(1, wtPtr1+311832+60582816*i5+997248*j1+196*k2);
__m512 wt119 = _mm512_maskz_loadu_ps(1, wtPtr1+374160+60582816*i5+997248*j1+196*k2);
__m512 wt120 = _mm512_maskz_loadu_ps(1, wtPtr1+436488+60582816*i5+997248*j1+196*k2);
__m512 wt121 = _mm512_maskz_loadu_ps(1, wtPtr1+498816+60582816*i5+997248*j1+196*k2);
__m512 wt122 = _mm512_maskz_loadu_ps(1, wtPtr1+561144+60582816*i5+997248*j1+196*k2);
__m512 wt123 = _mm512_maskz_loadu_ps(1, wtPtr1+623472+60582816*i5+997248*j1+196*k2);
__m512 wt124 = _mm512_maskz_loadu_ps(1, wtPtr1+685800+60582816*i5+997248*j1+196*k2);
__m512 wt125 = _mm512_maskz_loadu_ps(1, wtPtr1+748128+60582816*i5+997248*j1+196*k2);
__m512 wt126 = _mm512_maskz_loadu_ps(1, wtPtr1+810456+60582816*i5+997248*j1+196*k2);
__m512 wt127 = _mm512_maskz_loadu_ps(1, wtPtr1+872784+60582816*i5+997248*j1+196*k2);
__m512 wt128 = _mm512_maskz_loadu_ps(1, wtPtr1+935112+60582816*i5+997248*j1+196*k2);
// Reduced transpose: because only lane 0 of every input is populated, only the
// "low" variant of each stage is needed (unpacklo, shuffle_ps 68,
// shuffle_f32x4 136). Pairs are interleaved first...
__m512 tmp303 = _mm512_unpacklo_ps(wt113, wt114);
__m512 tmp304 = _mm512_unpacklo_ps(wt115, wt116);
__m512 tmp305 = _mm512_unpacklo_ps(wt117, wt118);
__m512 tmp306 = _mm512_unpacklo_ps(wt119, wt120);
__m512 tmp307 = _mm512_unpacklo_ps(wt121, wt122);
__m512 tmp308 = _mm512_unpacklo_ps(wt123, wt124);
__m512 tmp309 = _mm512_unpacklo_ps(wt125, wt126);
__m512 tmp310 = _mm512_unpacklo_ps(wt127, wt128);
// ...then groups of four sources are packed into the first 128-bit lane...
__m512 tmp311 = _mm512_shuffle_ps(tmp303, tmp304, 68);
__m512 tmp312 = _mm512_shuffle_ps(tmp305, tmp306, 68);
__m512 tmp313 = _mm512_shuffle_ps(tmp307, tmp308, 68);
__m512 tmp314 = _mm512_shuffle_ps(tmp309, tmp310, 68);
// ...and the four first lanes are concatenated, so that lane j of wt113 ends
// up holding lane 0 of the j-th loaded vector.
__m512 tmp315 = _mm512_shuffle_f32x4(tmp311, tmp312, 136);
__m512 tmp316 = _mm512_shuffle_f32x4(tmp313, tmp314, 136);
wt113 = _mm512_shuffle_f32x4(tmp315, tmp316, 136);
// Write the single result vector in three masked pieces: lanes 0-3 (mask 15),
// lanes 4-9 (mask 1008) and lanes 10-15 (mask 64512), at base offsets 7608
// apart inside arrangedW1.
// NOTE(review): if offsets are bytes, this is presumably a 4/6/6 split over
// three consecutive rows of the 7632-strided (c1) layout -- confirm.
_mm512_mask_storeu_ps(arrangedW1+13600232+60582816*i5+7632*c1+24*k2, 15, wt113);
_mm512_mask_storeu_ps(arrangedW1+13607840+60582816*i5+7632*c1+24*k2, 1008, wt113);
_mm512_mask_storeu_ps(arrangedW1+13615448+60582816*i5+7632*c1+24*k2, 64512, wt113);
}
break;
}
default: {
ptrdiff_t k3 = 0;
for (; k3 != 318; ++k3) {
// Gather step: full-width masked loads (mask 65535 = all 16 lanes) from 16
// source locations spaced 62328 apart, starting at offset 0 from the position
// selected by i5, j1 and k3 inside wtPtr1.
// NOTE(review): wtPtr1 and arrangedW1 are declared outside this chunk. The
// offsets look like byte offsets (196 = 7*7 floats * 4 bytes, matching
// FilterH=7/FilterW=7 in the graph config; 62328 = 318 * 196 with 318 being
// the k3 loop bound) -- confirm against the declarations.
__m512 wt129 = _mm512_maskz_loadu_ps(65535, wtPtr1+0+60582816*i5+997248*j1+196*k3);
__m512 wt130 = _mm512_maskz_loadu_ps(65535, wtPtr1+62328+60582816*i5+997248*j1+196*k3);
__m512 wt131 = _mm512_maskz_loadu_ps(65535, wtPtr1+124656+60582816*i5+997248*j1+196*k3);
__m512 wt132 = _mm512_maskz_loadu_ps(65535, wtPtr1+186984+60582816*i5+997248*j1+196*k3);
__m512 wt133 = _mm512_maskz_loadu_ps(65535, wtPtr1+249312+60582816*i5+997248*j1+196*k3);
__m512 wt134 = _mm512_maskz_loadu_ps(65535, wtPtr1+311640+60582816*i5+997248*j1+196*k3);
__m512 wt135 = _mm512_maskz_loadu_ps(65535, wtPtr1+373968+60582816*i5+997248*j1+196*k3);
__m512 wt136 = _mm512_maskz_loadu_ps(65535, wtPtr1+436296+60582816*i5+997248*j1+196*k3);
__m512 wt137 = _mm512_maskz_loadu_ps(65535, wtPtr1+498624+60582816*i5+997248*j1+196*k3);
__m512 wt138 = _mm512_maskz_loadu_ps(65535, wtPtr1+560952+60582816*i5+997248*j1+196*k3);
__m512 wt139 = _mm512_maskz_loadu_ps(65535, wtPtr1+623280+60582816*i5+997248*j1+196*k3);
__m512 wt140 = _mm512_maskz_loadu_ps(65535, wtPtr1+685608+60582816*i5+997248*j1+196*k3);
__m512 wt141 = _mm512_maskz_loadu_ps(65535, wtPtr1+747936+60582816*i5+997248*j1+196*k3);
__m512 wt142 = _mm512_maskz_loadu_ps(65535, wtPtr1+810264+60582816*i5+997248*j1+196*k3);
__m512 wt143 = _mm512_maskz_loadu_ps(65535, wtPtr1+872592+60582816*i5+997248*j1+196*k3);
__m512 wt144 = _mm512_maskz_loadu_ps(65535, wtPtr1+934920+60582816*i5+997248*j1+196*k3);
// 16x16 transpose, stage 1: interleave each adjacent pair of loaded vectors
// within every 128-bit lane (lo = elements 0,1 of the lane; hi = elements 2,3).
__m512 tmp317 = _mm512_unpacklo_ps(wt129, wt130);
__m512 tmp318 = _mm512_unpackhi_ps(wt129, wt130);
__m512 tmp319 = _mm512_unpacklo_ps(wt131, wt132);
__m512 tmp320 = _mm512_unpackhi_ps(wt131, wt132);
__m512 tmp321 = _mm512_unpacklo_ps(wt133, wt134);
__m512 tmp322 = _mm512_unpackhi_ps(wt133, wt134);
__m512 tmp323 = _mm512_unpacklo_ps(wt135, wt136);
__m512 tmp324 = _mm512_unpackhi_ps(wt135, wt136);
__m512 tmp325 = _mm512_unpacklo_ps(wt137, wt138);
__m512 tmp326 = _mm512_unpackhi_ps(wt137, wt138);
__m512 tmp327 = _mm512_unpacklo_ps(wt139, wt140);
__m512 tmp328 = _mm512_unpackhi_ps(wt139, wt140);
__m512 tmp329 = _mm512_unpacklo_ps(wt141, wt142);
__m512 tmp330 = _mm512_unpackhi_ps(wt141, wt142);
__m512 tmp331 = _mm512_unpacklo_ps(wt143, wt144);
__m512 tmp332 = _mm512_unpackhi_ps(wt143, wt144);
// Stage 2: merge two pair-interleaves so each 128-bit lane carries the same
// element index from four consecutive sources. Immediate 68 takes the low
// 64 bits of both operands, 238 the high 64 bits.
__m512 tmp333 = _mm512_shuffle_ps(tmp317, tmp319, 68);
__m512 tmp334 = _mm512_shuffle_ps(tmp317, tmp319, 238);
__m512 tmp335 = _mm512_shuffle_ps(tmp318, tmp320, 68);
__m512 tmp336 = _mm512_shuffle_ps(tmp318, tmp320, 238);
__m512 tmp337 = _mm512_shuffle_ps(tmp321, tmp323, 68);
__m512 tmp338 = _mm512_shuffle_ps(tmp321, tmp323, 238);
__m512 tmp339 = _mm512_shuffle_ps(tmp322, tmp324, 68);
__m512 tmp340 = _mm512_shuffle_ps(tmp322, tmp324, 238);
__m512 tmp341 = _mm512_shuffle_ps(tmp325, tmp327, 68);
__m512 tmp342 = _mm512_shuffle_ps(tmp325, tmp327, 238);
__m512 tmp343 = _mm512_shuffle_ps(tmp326, tmp328, 68);
__m512 tmp344 = _mm512_shuffle_ps(tmp326, tmp328, 238);
__m512 tmp345 = _mm512_shuffle_ps(tmp329, tmp331, 68);
__m512 tmp346 = _mm512_shuffle_ps(tmp329, tmp331, 238);
__m512 tmp347 = _mm512_shuffle_ps(tmp330, tmp332, 68);
__m512 tmp348 = _mm512_shuffle_ps(tmp330, tmp332, 238);
// Stage 3: regroup whole 128-bit lanes across eight sources. Immediate 136
// picks lanes 0 and 2 of each operand, 221 picks lanes 1 and 3.
__m512 tmp349 = _mm512_shuffle_f32x4(tmp333, tmp337, 136);
__m512 tmp350 = _mm512_shuffle_f32x4(tmp333, tmp337, 221);
__m512 tmp351 = _mm512_shuffle_f32x4(tmp334, tmp338, 136);
__m512 tmp352 = _mm512_shuffle_f32x4(tmp334, tmp338, 221);
__m512 tmp353 = _mm512_shuffle_f32x4(tmp335, tmp339, 136);
__m512 tmp354 = _mm512_shuffle_f32x4(tmp335, tmp339, 221);
__m512 tmp355 = _mm512_shuffle_f32x4(tmp336, tmp340, 136);
__m512 tmp356 = _mm512_shuffle_f32x4(tmp336, tmp340, 221);
__m512 tmp357 = _mm512_shuffle_f32x4(tmp341, tmp345, 136);
__m512 tmp358 = _mm512_shuffle_f32x4(tmp341, tmp345, 221);
__m512 tmp359 = _mm512_shuffle_f32x4(tmp342, tmp346, 136);
__m512 tmp360 = _mm512_shuffle_f32x4(tmp342, tmp346, 221);
__m512 tmp361 = _mm512_shuffle_f32x4(tmp343, tmp347, 136);
__m512 tmp362 = _mm512_shuffle_f32x4(tmp343, tmp347, 221);
__m512 tmp363 = _mm512_shuffle_f32x4(tmp344, tmp348, 136);
__m512 tmp364 = _mm512_shuffle_f32x4(tmp344, tmp348, 221);
// Stage 4: final lane regroup across all sixteen sources. The wt variables
// are reused for the result: afterwards wt129..wt144 hold elements 0..15
// (respectively) of the 16 loaded vectors, with lane j coming from source j.
wt129 = _mm512_shuffle_f32x4(tmp349, tmp357, 136);
wt137 = _mm512_shuffle_f32x4(tmp349, tmp357, 221);
wt130 = _mm512_shuffle_f32x4(tmp351, tmp359, 136);
wt138 = _mm512_shuffle_f32x4(tmp351, tmp359, 221);
wt131 = _mm512_shuffle_f32x4(tmp353, tmp361, 136);
wt139 = _mm512_shuffle_f32x4(tmp353, tmp361, 221);
wt132 = _mm512_shuffle_f32x4(tmp355, tmp363, 136);
wt140 = _mm512_shuffle_f32x4(tmp355, tmp363, 221);
wt133 = _mm512_shuffle_f32x4(tmp350, tmp358, 136);
wt141 = _mm512_shuffle_f32x4(tmp350, tmp358, 221);
wt134 = _mm512_shuffle_f32x4(tmp352, tmp360, 136);
wt142 = _mm512_shuffle_f32x4(tmp352, tmp360, 221);
wt135 = _mm512_shuffle_f32x4(tmp354, tmp362, 136);
wt143 = _mm512_shuffle_f32x4(tmp354, tmp362, 221);
wt136 = _mm512_shuffle_f32x4(tmp356, tmp364, 136);
wt144 = _mm512_shuffle_f32x4(tmp356, tmp364, 221);
// Scatter: each transposed vector goes to its own region of arrangedW1 and is
// written in four masked pieces -- lanes 0-1 (mask 3), lanes 2-7 (mask 252),
// lanes 8-13 (mask 16128) and lanes 14-15 (mask 49152) -- whose base offsets
// are 7608 apart.
// NOTE(review): if offsets are bytes, the pieces presumably land in four
// consecutive rows of the 7632-strided (c1) layout as a 2/6/6/2 split -- confirm.
_mm512_mask_storeu_ps(arrangedW1+16+60582816*i5+7632*c1+24*k3, 3, wt129);
_mm512_mask_storeu_ps(arrangedW1+7624+60582816*i5+7632*c1+24*k3, 252, wt129);
_mm512_mask_storeu_ps(arrangedW1+15232+60582816*i5+7632*c1+24*k3, 16128, wt129);
_mm512_mask_storeu_ps(arrangedW1+22840+60582816*i5+7632*c1+24*k3, 49152, wt129);
_mm512_mask_storeu_ps(arrangedW1+14836624+60582816*i5+7632*c1+24*k3, 3, wt130);
_mm512_mask_storeu_ps(arrangedW1+14844232+60582816*i5+7632*c1+24*k3, 252, wt130);
_mm512_mask_storeu_ps(arrangedW1+14851840+60582816*i5+7632*c1+24*k3, 16128, wt130);
_mm512_mask_storeu_ps(arrangedW1+14859448+60582816*i5+7632*c1+24*k3, 49152, wt130);
_mm512_mask_storeu_ps(arrangedW1+24727696+60582816*i5+7632*c1+24*k3, 3, wt131);
_mm512_mask_storeu_ps(arrangedW1+24735304+60582816*i5+7632*c1+24*k3, 252, wt131);
_mm512_mask_storeu_ps(arrangedW1+24742912+60582816*i5+7632*c1+24*k3, 16128, wt131);
_mm512_mask_storeu_ps(arrangedW1+24750520+60582816*i5+7632*c1+24*k3, 49152, wt131);
_mm512_mask_storeu_ps(arrangedW1+1236400+60582816*i5+7632*c1+24*k3, 3, wt132);
_mm512_mask_storeu_ps(arrangedW1+1244008+60582816*i5+7632*c1+24*k3, 252, wt132);
_mm512_mask_storeu_ps(arrangedW1+1251616+60582816*i5+7632*c1+24*k3, 16128, wt132);
_mm512_mask_storeu_ps(arrangedW1+1259224+60582816*i5+7632*c1+24*k3, 49152, wt132);
_mm512_mask_storeu_ps(arrangedW1+16073008+60582816*i5+7632*c1+24*k3, 3, wt133);
_mm512_mask_storeu_ps(arrangedW1+16080616+60582816*i5+7632*c1+24*k3, 252, wt133);
_mm512_mask_storeu_ps(arrangedW1+16088224+60582816*i5+7632*c1+24*k3, 16128, wt133);
_mm512_mask_storeu_ps(arrangedW1+16095832+60582816*i5+7632*c1+24*k3, 49152, wt133);
_mm512_mask_storeu_ps(arrangedW1+25964080+60582816*i5+7632*c1+24*k3, 3, wt134);
_mm512_mask_storeu_ps(arrangedW1+25971688+60582816*i5+7632*c1+24*k3, 252, wt134);
_mm512_mask_storeu_ps(arrangedW1+25979296+60582816*i5+7632*c1+24*k3, 16128, wt134);
_mm512_mask_storeu_ps(arrangedW1+25986904+60582816*i5+7632*c1+24*k3, 49152, wt134);
_mm512_mask_storeu_ps(arrangedW1+2472784+60582816*i5+7632*c1+24*k3, 3, wt135);
_mm512_mask_storeu_ps(arrangedW1+2480392+60582816*i5+7632*c1+24*k3, 252, wt135);
_mm512_mask_storeu_ps(arrangedW1+2488000+60582816*i5+7632*c1+24*k3, 16128, wt135);
_mm512_mask_storeu_ps(arrangedW1+2495608+60582816*i5+7632*c1+24*k3, 49152, wt135);
_mm512_mask_storeu_ps(arrangedW1+34618768+60582816*i5+7632*c1+24*k3, 3, wt136);
_mm512_mask_storeu_ps(arrangedW1+34626376+60582816*i5+7632*c1+24*k3, 252, wt136);
_mm512_mask_storeu_ps(arrangedW1+34633984+60582816*i5+7632*c1+24*k3, 16128, wt136);
_mm512_mask_storeu_ps(arrangedW1+34641592+60582816*i5+7632*c1+24*k3, 49152, wt136);
_mm512_mask_storeu_ps(arrangedW1+45746224+60582816*i5+7632*c1+24*k3, 3, wt137);
_mm512_mask_storeu_ps(arrangedW1+45753832+60582816*i5+7632*c1+24*k3, 252, wt137);
_mm512_mask_storeu_ps(arrangedW1+45761440+60582816*i5+7632*c1+24*k3, 16128, wt137);
_mm512_mask_storeu_ps(arrangedW1+45769048+60582816*i5+7632*c1+24*k3, 49152, wt137);
_mm512_mask_storeu_ps(arrangedW1+53164528+60582816*i5+7632*c1+24*k3, 3, wt138);
_mm512_mask_storeu_ps(arrangedW1+53172136+60582816*i5+7632*c1+24*k3, 252, wt138);
_mm512_mask_storeu_ps(arrangedW1+53179744+60582816*i5+7632*c1+24*k3, 16128, wt138);
_mm512_mask_storeu_ps(arrangedW1+53187352+60582816*i5+7632*c1+24*k3, 49152, wt138);
_mm512_mask_storeu_ps(arrangedW1+35855152+60582816*i5+7632*c1+24*k3, 3, wt139);
_mm512_mask_storeu_ps(arrangedW1+35862760+60582816*i5+7632*c1+24*k3, 252, wt139);
_mm512_mask_storeu_ps(arrangedW1+35870368+60582816*i5+7632*c1+24*k3, 16128, wt139);
_mm512_mask_storeu_ps(arrangedW1+35877976+60582816*i5+7632*c1+24*k3, 49152, wt139);
_mm512_mask_storeu_ps(arrangedW1+46982608+60582816*i5+7632*c1+24*k3, 3, wt140);
_mm512_mask_storeu_ps(arrangedW1+46990216+60582816*i5+7632*c1+24*k3, 252, wt140);
_mm512_mask_storeu_ps(arrangedW1+46997824+60582816*i5+7632*c1+24*k3, 16128, wt140);
_mm512_mask_storeu_ps(arrangedW1+47005432+60582816*i5+7632*c1+24*k3, 49152, wt140);
_mm512_mask_storeu_ps(arrangedW1+54400912+60582816*i5+7632*c1+24*k3, 3, wt141);
_mm512_mask_storeu_ps(arrangedW1+54408520+60582816*i5+7632*c1+24*k3, 252, wt141);
_mm512_mask_storeu_ps(arrangedW1+54416128+60582816*i5+7632*c1+24*k3, 16128, wt141);
_mm512_mask_storeu_ps(arrangedW1+54423736+60582816*i5+7632*c1+24*k3, 49152, wt141);
_mm512_mask_storeu_ps(arrangedW1+37091536+60582816*i5+7632*c1+24*k3, 3, wt142);
_mm512_mask_storeu_ps(arrangedW1+37099144+60582816*i5+7632*c1+24*k3, 252, wt142);
_mm512_mask_storeu_ps(arrangedW1+37106752+60582816*i5+7632*c1+24*k3, 16128, wt142);
_mm512_mask_storeu_ps(arrangedW1+37114360+60582816*i5+7632*c1+24*k3, 49152, wt142);
_mm512_mask_storeu_ps(arrangedW1+3709168+60582816*i5+7632*c1+24*k3, 3, wt143);
_mm512_mask_storeu_ps(arrangedW1+3716776+60582816*i5+7632*c1+24*k3, 252, wt143);
_mm512_mask_storeu_ps(arrangedW1+3724384+60582816*i5+7632*c1+24*k3, 16128, wt143);
_mm512_mask_storeu_ps(arrangedW1+3731992+60582816*i5+7632*c1+24*k3, 49152, wt143);
_mm512_mask_storeu_ps(arrangedW1+17309392+60582816*i5+7632*c1+24*k3, 3, wt144);
_mm512_mask_storeu_ps(arrangedW1+17317000+60582816*i5+7632*c1+24*k3, 252, wt144);
_mm512_mask_storeu_ps(arrangedW1+17324608+60582816*i5+7632*c1+24*k3, 16128, wt144);
_mm512_mask_storeu_ps(arrangedW1+17332216+60582816*i5+7632*c1+24*k3, 49152, wt144);
__m512 wt145 = _mm512_maskz_loadu_ps(65535, wtPtr1+64+60582816*i5+997248*j1+196*k3);
__m512 wt146 = _mm512_maskz_loadu_ps(65535, wtPtr1+62392+60582816*i5+997248*j1+196*k3);
__m512 wt147 = _mm512_maskz_loadu_ps(65535, wtPtr1+124720+60582816*i5+997248*j1+196*k3);
__m512 wt148 = _mm512_maskz_loadu_ps(65535, wtPtr1+187048+60582816*i5+997248*j1+196*k3);
__m512 wt149 = _mm512_maskz_loadu_ps(65535, wtPtr1+249376+60582816*i5+997248*j1+196*k3);
__m512 wt150 = _mm512_maskz_loadu_ps(65535, wtPtr1+311704+60582816*i5+997248*j1+196*k3);
__m512 wt151 = _mm512_maskz_loadu_ps(65535, wtPtr1+374032+60582816*i5+997248*j1+196*k3);
__m512 wt152 = _mm512_maskz_loadu_ps(65535, wtPtr1+436360+60582816*i5+997248*j1+196*k3);
__m512 wt153 = _mm512_maskz_loadu_ps(65535, wtPtr1+498688+60582816*i5+997248*j1+196*k3);
__m512 wt154 = _mm512_maskz_loadu_ps(65535, wtPtr1+561016+60582816*i5+997248*j1+196*k3);
__m512 wt155 = _mm512_maskz_loadu_ps(65535, wtPtr1+623344+60582816*i5+997248*j1+196*k3);
__m512 wt156 = _mm512_maskz_loadu_ps(65535, wtPtr1+685672+60582816*i5+997248*j1+196*k3);
__m512 wt157 = _mm512_maskz_loadu_ps(65535, wtPtr1+748000+60582816*i5+997248*j1+196*k3);
__m512 wt158 = _mm512_maskz_loadu_ps(65535, wtPtr1+810328+60582816*i5+997248*j1+196*k3);
__m512 wt159 = _mm512_maskz_loadu_ps(65535, wtPtr1+872656+60582816*i5+997248*j1+196*k3);
__m512 wt160 = _mm512_maskz_loadu_ps(65535, wtPtr1+934984+60582816*i5+997248*j1+196*k3);
__m512 tmp365 = _mm512_unpacklo_ps(wt145, wt146);
__m512 tmp366 = _mm512_unpackhi_ps(wt145, wt146);
__m512 tmp367 = _mm512_unpacklo_ps(wt147, wt148);
__m512 tmp368 = _mm512_unpackhi_ps(wt147, wt148);
__m512 tmp369 = _mm512_unpacklo_ps(wt149, wt150);
__m512 tmp370 = _mm512_unpackhi_ps(wt149, wt150);
__m512 tmp371 = _mm512_unpacklo_ps(wt151, wt152);
__m512 tmp372 = _mm512_unpackhi_ps(wt151, wt152);
__m512 tmp373 = _mm512_unpacklo_ps(wt153, wt154);
__m512 tmp374 = _mm512_unpackhi_ps(wt153, wt154);
__m512 tmp375 = _mm512_unpacklo_ps(wt155, wt156);
__m512 tmp376 = _mm512_unpackhi_ps(wt155, wt156);
__m512 tmp377 = _mm512_unpacklo_ps(wt157, wt158);
__m512 tmp378 = _mm512_unpackhi_ps(wt157, wt158);
__m512 tmp379 = _mm512_unpacklo_ps(wt159, wt160);
__m512 tmp380 = _mm512_unpackhi_ps(wt159, wt160);
__m512 tmp381 = _mm512_shuffle_ps(tmp365, tmp367, 68);
__m512 tmp382 = _mm512_shuffle_ps(tmp365, tmp367, 238);
__m512 tmp383 = _mm512_shuffle_ps(tmp366, tmp368, 68);
__m512 tmp384 = _mm512_shuffle_ps(tmp366, tmp368, 238);
__m512 tmp385 = _mm512_shuffle_ps(tmp369, tmp371, 68);
__m512 tmp386 = _mm512_shuffle_ps(tmp369, tmp371, 238);
__m512 tmp387 = _mm512_shuffle_ps(tmp370, tmp372, 68);
__m512 tmp388 = _mm512_shuffle_ps(tmp370, tmp372, 238);
__m512 tmp389 = _mm512_shuffle_ps(tmp373, tmp375, 68);
__m512 tmp390 = _mm512_shuffle_ps(tmp373, tmp375, 238);
__m512 tmp391 = _mm512_shuffle_ps(tmp374, tmp376, 68);
__m512 tmp392 = _mm512_shuffle_ps(tmp374, tmp376, 238);
__m512 tmp393 = _mm512_shuffle_ps(tmp377, tmp379, 68);
__m512 tmp394 = _mm512_shuffle_ps(tmp377, tmp379, 238);
__m512 tmp395 = _mm512_shuffle_ps(tmp378, tmp380, 68);
__m512 tmp396 = _mm512_shuffle_ps(tmp378, tmp380, 238);
__m512 tmp397 = _mm512_shuffle_f32x4(tmp381, tmp385, 136);
__m512 tmp398 = _mm512_shuffle_f32x4(tmp381, tmp385, 221);
__m512 tmp399 = _mm512_shuffle_f32x4(tmp382, tmp386, 136);
__m512 tmp400 = _mm512_shuffle_f32x4(tmp382, tmp386, 221);
__m512 tmp401 = _mm512_shuffle_f32x4(tmp383, tmp387, 136);
__m512 tmp402 = _mm512_shuffle_f32x4(tmp383, tmp387, 221);
__m512 tmp403 = _mm512_shuffle_f32x4(tmp384, tmp388, 136);
__m512 tmp404 = _mm512_shuffle_f32x4(tmp384, tmp388, 221);
__m512 tmp405 = _mm512_shuffle_f32x4(tmp389, tmp393, 136);
__m512 tmp406 = _mm512_shuffle_f32x4(tmp389, tmp393, 221);
__m512 tmp407 = _mm512_shuffle_f32x4(tmp390, tmp394, 136);
__m512 tmp408 = _mm512_shuffle_f32x4(tmp390, tmp394, 221);
__m512 tmp409 = _mm512_shuffle_f32x4(tmp391, tmp395, 136);
__m512 tmp410 = _mm512_shuffle_f32x4(tmp391, tmp395, 221);
__m512 tmp411 = _mm512_shuffle_f32x4(tmp392, tmp396, 136);
__m512 tmp412 = _mm512_shuffle_f32x4(tmp392, tmp396, 221);
wt145 = _mm512_shuffle_f32x4(tmp397, tmp405, 136);
wt153 = _mm512_shuffle_f32x4(tmp397, tmp405, 221);
wt146 = _mm512_shuffle_f32x4(tmp399, tmp407, 136);
wt154 = _mm512_shuffle_f32x4(tmp399, tmp407, 221);
wt147 = _mm512_shuffle_f32x4(tmp401, tmp409, 136);
wt155 = _mm512_shuffle_f32x4(tmp401, tmp409, 221);
wt148 = _mm512_shuffle_f32x4(tmp403, tmp411, 136);
wt156 = _mm512_shuffle_f32x4(tmp403, tmp411, 221);
wt149 = _mm512_shuffle_f32x4(tmp398, tmp406, 136);
wt157 = _mm512_shuffle_f32x4(tmp398, tmp406, 221);
wt150 = _mm512_shuffle_f32x4(tmp400, tmp408, 136);
wt158 = _mm512_shuffle_f32x4(tmp400, tmp408, 221);
wt151 = _mm512_shuffle_f32x4(tmp402, tmp410, 136);
wt159 = _mm512_shuffle_f32x4(tmp402, tmp410, 221);
wt152 = _mm512_shuffle_f32x4(tmp404, tmp412, 136);
wt160 = _mm512_shuffle_f32x4(tmp404, tmp412, 221);
_mm512_mask_storeu_ps(arrangedW1+27200464+60582816*i5+7632*c1+24*k3, 3, wt145);
_mm512_mask_storeu_ps(arrangedW1+27208072+60582816*i5+7632*c1+24*k3, 252, wt145);
_mm512_mask_storeu_ps(arrangedW1+27215680+60582816*i5+7632*c1+24*k3, 16128, wt145);
_mm512_mask_storeu_ps(arrangedW1+27223288+60582816*i5+7632*c1+24*k3, 49152, wt145);
_mm512_mask_storeu_ps(arrangedW1+4945552+60582816*i5+7632*c1+24*k3, 3, wt146);
_mm512_mask_storeu_ps(arrangedW1+4953160+60582816*i5+7632*c1+24*k3, 252, wt146);
_mm512_mask_storeu_ps(arrangedW1+4960768+60582816*i5+7632*c1+24*k3, 16128, wt146);
_mm512_mask_storeu_ps(arrangedW1+4968376+60582816*i5+7632*c1+24*k3, 49152, wt146);
_mm512_mask_storeu_ps(arrangedW1+18545776+60582816*i5+7632*c1+24*k3, 3, wt147);
_mm512_mask_storeu_ps(arrangedW1+18553384+60582816*i5+7632*c1+24*k3, 252, wt147);
_mm512_mask_storeu_ps(arrangedW1+18560992+60582816*i5+7632*c1+24*k3, 16128, wt147);
_mm512_mask_storeu_ps(arrangedW1+18568600+60582816*i5+7632*c1+24*k3, 49152, wt147);
_mm512_mask_storeu_ps(arrangedW1+28436848+60582816*i5+7632*c1+24*k3, 3, wt148);
_mm512_mask_storeu_ps(arrangedW1+28444456+60582816*i5+7632*c1+24*k3, 252, wt148);
_mm512_mask_storeu_ps(arrangedW1+28452064+60582816*i5+7632*c1+24*k3, 16128, wt148);
_mm512_mask_storeu_ps(arrangedW1+28459672+60582816*i5+7632*c1+24*k3, 49152, wt148);
_mm512_mask_storeu_ps(arrangedW1+6181936+60582816*i5+7632*c1+24*k3, 3, wt149);
_mm512_mask_storeu_ps(arrangedW1+6189544+60582816*i5+7632*c1+24*k3, 252, wt149);
_mm512_mask_storeu_ps(arrangedW1+6197152+60582816*i5+7632*c1+24*k3, 16128, wt149);
_mm512_mask_storeu_ps(arrangedW1+6204760+60582816*i5+7632*c1+24*k3, 49152, wt149);
_mm512_mask_storeu_ps(arrangedW1+38327920+60582816*i5+7632*c1+24*k3, 3, wt150);
_mm512_mask_storeu_ps(arrangedW1+38335528+60582816*i5+7632*c1+24*k3, 252, wt150);
_mm512_mask_storeu_ps(arrangedW1+38343136+60582816*i5+7632*c1+24*k3, 16128, wt150);
_mm512_mask_storeu_ps(arrangedW1+38350744+60582816*i5+7632*c1+24*k3, 49152, wt150);
_mm512_mask_storeu_ps(arrangedW1+48218992+60582816*i5+7632*c1+24*k3, 3, wt151);
_mm512_mask_storeu_ps(arrangedW1+48226600+60582816*i5+7632*c1+24*k3, 252, wt151);
_mm512_mask_storeu_ps(arrangedW1+48234208+60582816*i5+7632*c1+24*k3, 16128, wt151);
_mm512_mask_storeu_ps(arrangedW1+48241816+60582816*i5+7632*c1+24*k3, 49152, wt151);
_mm512_mask_storeu_ps(arrangedW1+55637296+60582816*i5+7632*c1+24*k3, 3, wt152);
_mm512_mask_storeu_ps(arrangedW1+55644904+60582816*i5+7632*c1+24*k3, 252, wt152);
_mm512_mask_storeu_ps(arrangedW1+55652512+60582816*i5+7632*c1+24*k3, 16128, wt152);
_mm512_mask_storeu_ps(arrangedW1+55660120+60582816*i5+7632*c1+24*k3, 49152, wt152);
_mm512_mask_storeu_ps(arrangedW1+39564304+60582816*i5+7632*c1+24*k3, 3, wt153);
_mm512_mask_storeu_ps(arrangedW1+39571912+60582816*i5+7632*c1+24*k3, 252, wt153);
_mm512_mask_storeu_ps(arrangedW1+39579520+60582816*i5+7632*c1+24*k3, 16128, wt153);
_mm512_mask_storeu_ps(arrangedW1+39587128+60582816*i5+7632*c1+24*k3, 49152, wt153);
_mm512_mask_storeu_ps(arrangedW1+49455376+60582816*i5+7632*c1+24*k3, 3, wt154);
_mm512_mask_storeu_ps(arrangedW1+49462984+60582816*i5+7632*c1+24*k3, 252, wt154);
_mm512_mask_storeu_ps(arrangedW1+49470592+60582816*i5+7632*c1+24*k3, 16128, wt154);
_mm512_mask_storeu_ps(arrangedW1+49478200+60582816*i5+7632*c1+24*k3, 49152, wt154);
_mm512_mask_storeu_ps(arrangedW1+56873680+60582816*i5+7632*c1+24*k3, 3, wt155);
_mm512_mask_storeu_ps(arrangedW1+56881288+60582816*i5+7632*c1+24*k3, 252, wt155);
_mm512_mask_storeu_ps(arrangedW1+56888896+60582816*i5+7632*c1+24*k3, 16128, wt155);
_mm512_mask_storeu_ps(arrangedW1+56896504+60582816*i5+7632*c1+24*k3, 49152, wt155);
_mm512_mask_storeu_ps(arrangedW1+40800688+60582816*i5+7632*c1+24*k3, 3, wt156);
_mm512_mask_storeu_ps(arrangedW1+40808296+60582816*i5+7632*c1+24*k3, 252, wt156);
_mm512_mask_storeu_ps(arrangedW1+40815904+60582816*i5+7632*c1+24*k3, 16128, wt156);
_mm512_mask_storeu_ps(arrangedW1+40823512+60582816*i5+7632*c1+24*k3, 49152, wt156);
_mm512_mask_storeu_ps(arrangedW1+7418320+60582816*i5+7632*c1+24*k3, 3, wt157);
_mm512_mask_storeu_ps(arrangedW1+7425928+60582816*i5+7632*c1+24*k3, 252, wt157);
_mm512_mask_storeu_ps(arrangedW1+7433536+60582816*i5+7632*c1+24*k3, 16128, wt157);
_mm512_mask_storeu_ps(arrangedW1+7441144+60582816*i5+7632*c1+24*k3, 49152, wt157);
_mm512_mask_storeu_ps(arrangedW1+19782160+60582816*i5+7632*c1+24*k3, 3, wt158);
_mm512_mask_storeu_ps(arrangedW1+19789768+60582816*i5+7632*c1+24*k3, 252, wt158);
_mm512_mask_storeu_ps(arrangedW1+19797376+60582816*i5+7632*c1+24*k3, 16128, wt158);
_mm512_mask_storeu_ps(arrangedW1+19804984+60582816*i5+7632*c1+24*k3, 49152, wt158);
_mm512_mask_storeu_ps(arrangedW1+29673232+60582816*i5+7632*c1+24*k3, 3, wt159);
_mm512_mask_storeu_ps(arrangedW1+29680840+60582816*i5+7632*c1+24*k3, 252, wt159);
_mm512_mask_storeu_ps(arrangedW1+29688448+60582816*i5+7632*c1+24*k3, 16128, wt159);
_mm512_mask_storeu_ps(arrangedW1+29696056+60582816*i5+7632*c1+24*k3, 49152, wt159);
_mm512_mask_storeu_ps(arrangedW1+8654704+60582816*i5+7632*c1+24*k3, 3, wt160);
_mm512_mask_storeu_ps(arrangedW1+8662312+60582816*i5+7632*c1+24*k3, 252, wt160);
_mm512_mask_storeu_ps(arrangedW1+8669920+60582816*i5+7632*c1+24*k3, 16128, wt160);
_mm512_mask_storeu_ps(arrangedW1+8677528+60582816*i5+7632*c1+24*k3, 49152, wt160);
// Load 16 vectors of 16 floats each (mask 65535 enables every lane). The
// loads start at offset +128 and successive loads are 62328 apart.
// NOTE(review): wtPtr1 is declared outside this chunk; the offsets look like
// byte offsets (196 = 49 floats * 4, matching the 7x7 filter) -- confirm.
__m512 wt161 = _mm512_maskz_loadu_ps(65535, wtPtr1+128+60582816*i5+997248*j1+196*k3);
__m512 wt162 = _mm512_maskz_loadu_ps(65535, wtPtr1+62456+60582816*i5+997248*j1+196*k3);
__m512 wt163 = _mm512_maskz_loadu_ps(65535, wtPtr1+124784+60582816*i5+997248*j1+196*k3);
__m512 wt164 = _mm512_maskz_loadu_ps(65535, wtPtr1+187112+60582816*i5+997248*j1+196*k3);
__m512 wt165 = _mm512_maskz_loadu_ps(65535, wtPtr1+249440+60582816*i5+997248*j1+196*k3);
__m512 wt166 = _mm512_maskz_loadu_ps(65535, wtPtr1+311768+60582816*i5+997248*j1+196*k3);
__m512 wt167 = _mm512_maskz_loadu_ps(65535, wtPtr1+374096+60582816*i5+997248*j1+196*k3);
__m512 wt168 = _mm512_maskz_loadu_ps(65535, wtPtr1+436424+60582816*i5+997248*j1+196*k3);
__m512 wt169 = _mm512_maskz_loadu_ps(65535, wtPtr1+498752+60582816*i5+997248*j1+196*k3);
__m512 wt170 = _mm512_maskz_loadu_ps(65535, wtPtr1+561080+60582816*i5+997248*j1+196*k3);
__m512 wt171 = _mm512_maskz_loadu_ps(65535, wtPtr1+623408+60582816*i5+997248*j1+196*k3);
__m512 wt172 = _mm512_maskz_loadu_ps(65535, wtPtr1+685736+60582816*i5+997248*j1+196*k3);
__m512 wt173 = _mm512_maskz_loadu_ps(65535, wtPtr1+748064+60582816*i5+997248*j1+196*k3);
__m512 wt174 = _mm512_maskz_loadu_ps(65535, wtPtr1+810392+60582816*i5+997248*j1+196*k3);
__m512 wt175 = _mm512_maskz_loadu_ps(65535, wtPtr1+872720+60582816*i5+997248*j1+196*k3);
__m512 wt176 = _mm512_maskz_loadu_ps(65535, wtPtr1+935048+60582816*i5+997248*j1+196*k3);
// 16x16 transpose, stage 1: interleave each adjacent pair of loaded vectors
// within every 128-bit lane (low halves via unpacklo, high halves via unpackhi).
__m512 tmp413 = _mm512_unpacklo_ps(wt161, wt162);
__m512 tmp414 = _mm512_unpackhi_ps(wt161, wt162);
__m512 tmp415 = _mm512_unpacklo_ps(wt163, wt164);
__m512 tmp416 = _mm512_unpackhi_ps(wt163, wt164);
__m512 tmp417 = _mm512_unpacklo_ps(wt165, wt166);
__m512 tmp418 = _mm512_unpackhi_ps(wt165, wt166);
__m512 tmp419 = _mm512_unpacklo_ps(wt167, wt168);
__m512 tmp420 = _mm512_unpackhi_ps(wt167, wt168);
__m512 tmp421 = _mm512_unpacklo_ps(wt169, wt170);
__m512 tmp422 = _mm512_unpackhi_ps(wt169, wt170);
__m512 tmp423 = _mm512_unpacklo_ps(wt171, wt172);
__m512 tmp424 = _mm512_unpackhi_ps(wt171, wt172);
__m512 tmp425 = _mm512_unpacklo_ps(wt173, wt174);
__m512 tmp426 = _mm512_unpackhi_ps(wt173, wt174);
__m512 tmp427 = _mm512_unpacklo_ps(wt175, wt176);
__m512 tmp428 = _mm512_unpackhi_ps(wt175, wt176);
// Stage 2: merge pairs into groups of four loads. Per 128-bit lane, 68 takes
// the two low elements of each operand and 238 takes the two high elements.
__m512 tmp429 = _mm512_shuffle_ps(tmp413, tmp415, 68);
__m512 tmp430 = _mm512_shuffle_ps(tmp413, tmp415, 238);
__m512 tmp431 = _mm512_shuffle_ps(tmp414, tmp416, 68);
__m512 tmp432 = _mm512_shuffle_ps(tmp414, tmp416, 238);
__m512 tmp433 = _mm512_shuffle_ps(tmp417, tmp419, 68);
__m512 tmp434 = _mm512_shuffle_ps(tmp417, tmp419, 238);
__m512 tmp435 = _mm512_shuffle_ps(tmp418, tmp420, 68);
__m512 tmp436 = _mm512_shuffle_ps(tmp418, tmp420, 238);
__m512 tmp437 = _mm512_shuffle_ps(tmp421, tmp423, 68);
__m512 tmp438 = _mm512_shuffle_ps(tmp421, tmp423, 238);
__m512 tmp439 = _mm512_shuffle_ps(tmp422, tmp424, 68);
__m512 tmp440 = _mm512_shuffle_ps(tmp422, tmp424, 238);
__m512 tmp441 = _mm512_shuffle_ps(tmp425, tmp427, 68);
__m512 tmp442 = _mm512_shuffle_ps(tmp425, tmp427, 238);
__m512 tmp443 = _mm512_shuffle_ps(tmp426, tmp428, 68);
__m512 tmp444 = _mm512_shuffle_ps(tmp426, tmp428, 238);
// Stage 3: regroup whole 128-bit lanes. 136 selects lanes 0 and 2 of each
// operand, 221 selects lanes 1 and 3.
__m512 tmp445 = _mm512_shuffle_f32x4(tmp429, tmp433, 136);
__m512 tmp446 = _mm512_shuffle_f32x4(tmp429, tmp433, 221);
__m512 tmp447 = _mm512_shuffle_f32x4(tmp430, tmp434, 136);
__m512 tmp448 = _mm512_shuffle_f32x4(tmp430, tmp434, 221);
__m512 tmp449 = _mm512_shuffle_f32x4(tmp431, tmp435, 136);
__m512 tmp450 = _mm512_shuffle_f32x4(tmp431, tmp435, 221);
__m512 tmp451 = _mm512_shuffle_f32x4(tmp432, tmp436, 136);
__m512 tmp452 = _mm512_shuffle_f32x4(tmp432, tmp436, 221);
__m512 tmp453 = _mm512_shuffle_f32x4(tmp437, tmp441, 136);
__m512 tmp454 = _mm512_shuffle_f32x4(tmp437, tmp441, 221);
__m512 tmp455 = _mm512_shuffle_f32x4(tmp438, tmp442, 136);
__m512 tmp456 = _mm512_shuffle_f32x4(tmp438, tmp442, 221);
__m512 tmp457 = _mm512_shuffle_f32x4(tmp439, tmp443, 136);
__m512 tmp458 = _mm512_shuffle_f32x4(tmp439, tmp443, 221);
__m512 tmp459 = _mm512_shuffle_f32x4(tmp440, tmp444, 136);
__m512 tmp460 = _mm512_shuffle_f32x4(tmp440, tmp444, 221);
// Stage 4: final lane regroup. Afterwards wt161..wt176 hold elements 0..15,
// respectively, of the 16 loaded vectors; lane r comes from the r-th load.
wt161 = _mm512_shuffle_f32x4(tmp445, tmp453, 136);
wt169 = _mm512_shuffle_f32x4(tmp445, tmp453, 221);
wt162 = _mm512_shuffle_f32x4(tmp447, tmp455, 136);
wt170 = _mm512_shuffle_f32x4(tmp447, tmp455, 221);
wt163 = _mm512_shuffle_f32x4(tmp449, tmp457, 136);
wt171 = _mm512_shuffle_f32x4(tmp449, tmp457, 221);
wt164 = _mm512_shuffle_f32x4(tmp451, tmp459, 136);
wt172 = _mm512_shuffle_f32x4(tmp451, tmp459, 221);
wt165 = _mm512_shuffle_f32x4(tmp446, tmp454, 136);
wt173 = _mm512_shuffle_f32x4(tmp446, tmp454, 221);
wt166 = _mm512_shuffle_f32x4(tmp448, tmp456, 136);
wt174 = _mm512_shuffle_f32x4(tmp448, tmp456, 221);
wt167 = _mm512_shuffle_f32x4(tmp450, tmp458, 136);
wt175 = _mm512_shuffle_f32x4(tmp450, tmp458, 221);
wt168 = _mm512_shuffle_f32x4(tmp452, tmp460, 136);
wt176 = _mm512_shuffle_f32x4(tmp452, tmp460, 221);
// Write each transposed vector to arrangedW1 in four masked pieces:
// mask 3 = lanes 0-1, 252 = lanes 2-7, 16128 = lanes 8-13, 49152 = lanes 14-15.
// The four store bases of one vector are 7608 apart; masked-off lanes are not
// written, so each piece lands only where its enabled lanes point.
// NOTE(review): if arrangedW1 is a byte pointer, the enabled lanes of the
// four pieces begin at +0, +7616, +15248 and +22880 from the first base, i.e.
// a 2-float piece, then 6-float pieces 7632 apart, then a 2-float piece --
// confirm against the declarations of arrangedW1 and c1 (outside this chunk).
_mm512_mask_storeu_ps(arrangedW1+21018544+60582816*i5+7632*c1+24*k3, 3, wt161);
_mm512_mask_storeu_ps(arrangedW1+21026152+60582816*i5+7632*c1+24*k3, 252, wt161);
_mm512_mask_storeu_ps(arrangedW1+21033760+60582816*i5+7632*c1+24*k3, 16128, wt161);
_mm512_mask_storeu_ps(arrangedW1+21041368+60582816*i5+7632*c1+24*k3, 49152, wt161);
_mm512_mask_storeu_ps(arrangedW1+30909616+60582816*i5+7632*c1+24*k3, 3, wt162);
_mm512_mask_storeu_ps(arrangedW1+30917224+60582816*i5+7632*c1+24*k3, 252, wt162);
_mm512_mask_storeu_ps(arrangedW1+30924832+60582816*i5+7632*c1+24*k3, 16128, wt162);
_mm512_mask_storeu_ps(arrangedW1+30932440+60582816*i5+7632*c1+24*k3, 49152, wt162);
_mm512_mask_storeu_ps(arrangedW1+9891088+60582816*i5+7632*c1+24*k3, 3, wt163);
_mm512_mask_storeu_ps(arrangedW1+9898696+60582816*i5+7632*c1+24*k3, 252, wt163);
_mm512_mask_storeu_ps(arrangedW1+9906304+60582816*i5+7632*c1+24*k3, 16128, wt163);
_mm512_mask_storeu_ps(arrangedW1+9913912+60582816*i5+7632*c1+24*k3, 49152, wt163);
_mm512_mask_storeu_ps(arrangedW1+42037072+60582816*i5+7632*c1+24*k3, 3, wt164);
_mm512_mask_storeu_ps(arrangedW1+42044680+60582816*i5+7632*c1+24*k3, 252, wt164);
_mm512_mask_storeu_ps(arrangedW1+42052288+60582816*i5+7632*c1+24*k3, 16128, wt164);
_mm512_mask_storeu_ps(arrangedW1+42059896+60582816*i5+7632*c1+24*k3, 49152, wt164);
_mm512_mask_storeu_ps(arrangedW1+50691760+60582816*i5+7632*c1+24*k3, 3, wt165);
_mm512_mask_storeu_ps(arrangedW1+50699368+60582816*i5+7632*c1+24*k3, 252, wt165);
_mm512_mask_storeu_ps(arrangedW1+50706976+60582816*i5+7632*c1+24*k3, 16128, wt165);
_mm512_mask_storeu_ps(arrangedW1+50714584+60582816*i5+7632*c1+24*k3, 49152, wt165);
_mm512_mask_storeu_ps(arrangedW1+58110064+60582816*i5+7632*c1+24*k3, 3, wt166);
_mm512_mask_storeu_ps(arrangedW1+58117672+60582816*i5+7632*c1+24*k3, 252, wt166);
_mm512_mask_storeu_ps(arrangedW1+58125280+60582816*i5+7632*c1+24*k3, 16128, wt166);
_mm512_mask_storeu_ps(arrangedW1+58132888+60582816*i5+7632*c1+24*k3, 49152, wt166);
_mm512_mask_storeu_ps(arrangedW1+43273456+60582816*i5+7632*c1+24*k3, 3, wt167);
_mm512_mask_storeu_ps(arrangedW1+43281064+60582816*i5+7632*c1+24*k3, 252, wt167);
_mm512_mask_storeu_ps(arrangedW1+43288672+60582816*i5+7632*c1+24*k3, 16128, wt167);
_mm512_mask_storeu_ps(arrangedW1+43296280+60582816*i5+7632*c1+24*k3, 49152, wt167);
_mm512_mask_storeu_ps(arrangedW1+51928144+60582816*i5+7632*c1+24*k3, 3, wt168);
_mm512_mask_storeu_ps(arrangedW1+51935752+60582816*i5+7632*c1+24*k3, 252, wt168);
_mm512_mask_storeu_ps(arrangedW1+51943360+60582816*i5+7632*c1+24*k3, 16128, wt168);
_mm512_mask_storeu_ps(arrangedW1+51950968+60582816*i5+7632*c1+24*k3, 49152, wt168);
_mm512_mask_storeu_ps(arrangedW1+59346448+60582816*i5+7632*c1+24*k3, 3, wt169);
_mm512_mask_storeu_ps(arrangedW1+59354056+60582816*i5+7632*c1+24*k3, 252, wt169);
_mm512_mask_storeu_ps(arrangedW1+59361664+60582816*i5+7632*c1+24*k3, 16128, wt169);
_mm512_mask_storeu_ps(arrangedW1+59369272+60582816*i5+7632*c1+24*k3, 49152, wt169);
_mm512_mask_storeu_ps(arrangedW1+44509840+60582816*i5+7632*c1+24*k3, 3, wt170);
_mm512_mask_storeu_ps(arrangedW1+44517448+60582816*i5+7632*c1+24*k3, 252, wt170);
_mm512_mask_storeu_ps(arrangedW1+44525056+60582816*i5+7632*c1+24*k3, 16128, wt170);
_mm512_mask_storeu_ps(arrangedW1+44532664+60582816*i5+7632*c1+24*k3, 49152, wt170);
_mm512_mask_storeu_ps(arrangedW1+11127472+60582816*i5+7632*c1+24*k3, 3, wt171);
_mm512_mask_storeu_ps(arrangedW1+11135080+60582816*i5+7632*c1+24*k3, 252, wt171);
_mm512_mask_storeu_ps(arrangedW1+11142688+60582816*i5+7632*c1+24*k3, 16128, wt171);
_mm512_mask_storeu_ps(arrangedW1+11150296+60582816*i5+7632*c1+24*k3, 49152, wt171);
_mm512_mask_storeu_ps(arrangedW1+22254928+60582816*i5+7632*c1+24*k3, 3, wt172);
_mm512_mask_storeu_ps(arrangedW1+22262536+60582816*i5+7632*c1+24*k3, 252, wt172);
_mm512_mask_storeu_ps(arrangedW1+22270144+60582816*i5+7632*c1+24*k3, 16128, wt172);
_mm512_mask_storeu_ps(arrangedW1+22277752+60582816*i5+7632*c1+24*k3, 49152, wt172);
_mm512_mask_storeu_ps(arrangedW1+32146000+60582816*i5+7632*c1+24*k3, 3, wt173);
_mm512_mask_storeu_ps(arrangedW1+32153608+60582816*i5+7632*c1+24*k3, 252, wt173);
_mm512_mask_storeu_ps(arrangedW1+32161216+60582816*i5+7632*c1+24*k3, 16128, wt173);
_mm512_mask_storeu_ps(arrangedW1+32168824+60582816*i5+7632*c1+24*k3, 49152, wt173);
_mm512_mask_storeu_ps(arrangedW1+12363856+60582816*i5+7632*c1+24*k3, 3, wt174);
_mm512_mask_storeu_ps(arrangedW1+12371464+60582816*i5+7632*c1+24*k3, 252, wt174);
_mm512_mask_storeu_ps(arrangedW1+12379072+60582816*i5+7632*c1+24*k3, 16128, wt174);
_mm512_mask_storeu_ps(arrangedW1+12386680+60582816*i5+7632*c1+24*k3, 49152, wt174);
_mm512_mask_storeu_ps(arrangedW1+23491312+60582816*i5+7632*c1+24*k3, 3, wt175);
_mm512_mask_storeu_ps(arrangedW1+23498920+60582816*i5+7632*c1+24*k3, 252, wt175);
_mm512_mask_storeu_ps(arrangedW1+23506528+60582816*i5+7632*c1+24*k3, 16128, wt175);
_mm512_mask_storeu_ps(arrangedW1+23514136+60582816*i5+7632*c1+24*k3, 49152, wt175);
_mm512_mask_storeu_ps(arrangedW1+33382384+60582816*i5+7632*c1+24*k3, 3, wt176);
_mm512_mask_storeu_ps(arrangedW1+33389992+60582816*i5+7632*c1+24*k3, 252, wt176);
_mm512_mask_storeu_ps(arrangedW1+33397600+60582816*i5+7632*c1+24*k3, 16128, wt176);
_mm512_mask_storeu_ps(arrangedW1+33405208+60582816*i5+7632*c1+24*k3, 49152, wt176);
// Load a single float (mask 1 = lane 0 only, all other lanes zeroed) at
// offset +192 from each of 16 positions that are 62328 apart.
// NOTE(review): +192 would be float index 48, the last tap of a 7x7 filter,
// if wtPtr1 is a byte pointer -- its declaration is outside this chunk.
__m512 wt177 = _mm512_maskz_loadu_ps(1, wtPtr1+192+60582816*i5+997248*j1+196*k3);
__m512 wt178 = _mm512_maskz_loadu_ps(1, wtPtr1+62520+60582816*i5+997248*j1+196*k3);
__m512 wt179 = _mm512_maskz_loadu_ps(1, wtPtr1+124848+60582816*i5+997248*j1+196*k3);
__m512 wt180 = _mm512_maskz_loadu_ps(1, wtPtr1+187176+60582816*i5+997248*j1+196*k3);
__m512 wt181 = _mm512_maskz_loadu_ps(1, wtPtr1+249504+60582816*i5+997248*j1+196*k3);
__m512 wt182 = _mm512_maskz_loadu_ps(1, wtPtr1+311832+60582816*i5+997248*j1+196*k3);
__m512 wt183 = _mm512_maskz_loadu_ps(1, wtPtr1+374160+60582816*i5+997248*j1+196*k3);
__m512 wt184 = _mm512_maskz_loadu_ps(1, wtPtr1+436488+60582816*i5+997248*j1+196*k3);
__m512 wt185 = _mm512_maskz_loadu_ps(1, wtPtr1+498816+60582816*i5+997248*j1+196*k3);
__m512 wt186 = _mm512_maskz_loadu_ps(1, wtPtr1+561144+60582816*i5+997248*j1+196*k3);
__m512 wt187 = _mm512_maskz_loadu_ps(1, wtPtr1+623472+60582816*i5+997248*j1+196*k3);
__m512 wt188 = _mm512_maskz_loadu_ps(1, wtPtr1+685800+60582816*i5+997248*j1+196*k3);
__m512 wt189 = _mm512_maskz_loadu_ps(1, wtPtr1+748128+60582816*i5+997248*j1+196*k3);
__m512 wt190 = _mm512_maskz_loadu_ps(1, wtPtr1+810456+60582816*i5+997248*j1+196*k3);
__m512 wt191 = _mm512_maskz_loadu_ps(1, wtPtr1+872784+60582816*i5+997248*j1+196*k3);
__m512 wt192 = _mm512_maskz_loadu_ps(1, wtPtr1+935112+60582816*i5+997248*j1+196*k3);
// Only element 0 of each load is populated, so just the "low" half of the
// transpose network is needed: pair up the loads ...
__m512 tmp461 = _mm512_unpacklo_ps(wt177, wt178);
__m512 tmp462 = _mm512_unpacklo_ps(wt179, wt180);
__m512 tmp463 = _mm512_unpacklo_ps(wt181, wt182);
__m512 tmp464 = _mm512_unpacklo_ps(wt183, wt184);
__m512 tmp465 = _mm512_unpacklo_ps(wt185, wt186);
__m512 tmp466 = _mm512_unpacklo_ps(wt187, wt188);
__m512 tmp467 = _mm512_unpacklo_ps(wt189, wt190);
__m512 tmp468 = _mm512_unpacklo_ps(wt191, wt192);
// ... merge pairs into groups of four (68 = two low elements of each operand) ...
__m512 tmp469 = _mm512_shuffle_ps(tmp461, tmp462, 68);
__m512 tmp470 = _mm512_shuffle_ps(tmp463, tmp464, 68);
__m512 tmp471 = _mm512_shuffle_ps(tmp465, tmp466, 68);
__m512 tmp472 = _mm512_shuffle_ps(tmp467, tmp468, 68);
// ... and pack the populated 128-bit lanes together (136 = lanes 0 and 2 of
// each operand). wt177 ends up with lane r holding element 0 of the r-th load.
__m512 tmp473 = _mm512_shuffle_f32x4(tmp469, tmp470, 136);
__m512 tmp474 = _mm512_shuffle_f32x4(tmp471, tmp472, 136);
wt177 = _mm512_shuffle_f32x4(tmp473, tmp474, 136);
// Write the gathered vector in four masked pieces: lanes 0-1 (mask 3),
// lanes 2-7 (252), lanes 8-13 (16128) and lanes 14-15 (49152). The store
// bases are 7608 apart; masked-off lanes are not written.
_mm512_mask_storeu_ps(arrangedW1+13600240+60582816*i5+7632*c1+24*k3, 3, wt177);
_mm512_mask_storeu_ps(arrangedW1+13607848+60582816*i5+7632*c1+24*k3, 252, wt177);
_mm512_mask_storeu_ps(arrangedW1+13615456+60582816*i5+7632*c1+24*k3, 16128, wt177);
_mm512_mask_storeu_ps(arrangedW1+13623064+60582816*i5+7632*c1+24*k3, 49152, wt177);
}
break;
}
}
if (j1 >= jj1) return;
}
}
// Special case for j1 == 60: only 12 values are handled here instead of 16
// (bias mask 4095 = lanes 0-11; 12 weight loads per group in the loop below).
// NOTE(review): consistent with 3888/4 = 972 output channels per group in the
// Conv config, 972 = 60*16 + 12 -- confirm against the generator.
if (j1 == 60) {
// Bias defaults to zero and is read from biasPtr1 only when e1 is zero.
__m512 bias2 = _mm512_setzero_ps();
if (!e1) {
bias2 = _mm512_maskz_loadu_ps(4095, biasPtr1-0+3888*i5+64*j1);
}
// Copy the 12 enabled lanes to the same offset in arrangedB1.
_mm512_mask_storeu_ps(arrangedB1-0+3888*i5+64*j1, 4095, bias2);
// c2 = (16*j1)/6, computed as an unsigned division; with j1 == 60 this is 160.
ptrdiff_t c2 = (size_t)(0+16*j1)/6;
ptrdiff_t k4 = 0;
// 318 iterations; k4 advances the source by 196 and the destination by 24
// per step in the address expressions below.
for (; k4 != 318; ++k4) {
// Load 12 vectors of 16 floats each (mask 65535 enables every lane), starting
// at offset +0 with successive loads 62328 apart.
// NOTE(review): offsets look like bytes (196 = 49 floats * 4); wtPtr1 is
// declared outside this chunk -- confirm.
__m512 wt193 = _mm512_maskz_loadu_ps(65535, wtPtr1+0+60582816*i5+997248*j1+196*k4);
__m512 wt194 = _mm512_maskz_loadu_ps(65535, wtPtr1+62328+60582816*i5+997248*j1+196*k4);
__m512 wt195 = _mm512_maskz_loadu_ps(65535, wtPtr1+124656+60582816*i5+997248*j1+196*k4);
__m512 wt196 = _mm512_maskz_loadu_ps(65535, wtPtr1+186984+60582816*i5+997248*j1+196*k4);
__m512 wt197 = _mm512_maskz_loadu_ps(65535, wtPtr1+249312+60582816*i5+997248*j1+196*k4);
__m512 wt198 = _mm512_maskz_loadu_ps(65535, wtPtr1+311640+60582816*i5+997248*j1+196*k4);
__m512 wt199 = _mm512_maskz_loadu_ps(65535, wtPtr1+373968+60582816*i5+997248*j1+196*k4);
__m512 wt200 = _mm512_maskz_loadu_ps(65535, wtPtr1+436296+60582816*i5+997248*j1+196*k4);
__m512 wt201 = _mm512_maskz_loadu_ps(65535, wtPtr1+498624+60582816*i5+997248*j1+196*k4);
__m512 wt202 = _mm512_maskz_loadu_ps(65535, wtPtr1+560952+60582816*i5+997248*j1+196*k4);
__m512 wt203 = _mm512_maskz_loadu_ps(65535, wtPtr1+623280+60582816*i5+997248*j1+196*k4);
__m512 wt204 = _mm512_maskz_loadu_ps(65535, wtPtr1+685608+60582816*i5+997248*j1+196*k4);
// Transpose stage 1: interleave each adjacent pair of loaded vectors within
// every 128-bit lane.
__m512 tmp475 = _mm512_unpacklo_ps(wt193, wt194);
__m512 tmp476 = _mm512_unpackhi_ps(wt193, wt194);
__m512 tmp477 = _mm512_unpacklo_ps(wt195, wt196);
__m512 tmp478 = _mm512_unpackhi_ps(wt195, wt196);
__m512 tmp479 = _mm512_unpacklo_ps(wt197, wt198);
__m512 tmp480 = _mm512_unpackhi_ps(wt197, wt198);
__m512 tmp481 = _mm512_unpacklo_ps(wt199, wt200);
__m512 tmp482 = _mm512_unpackhi_ps(wt199, wt200);
__m512 tmp483 = _mm512_unpacklo_ps(wt201, wt202);
__m512 tmp484 = _mm512_unpackhi_ps(wt201, wt202);
__m512 tmp485 = _mm512_unpacklo_ps(wt203, wt204);
__m512 tmp486 = _mm512_unpackhi_ps(wt203, wt204);
// Stage 2: merge pairs into groups of four loads (68 = two low elements of
// each operand per 128-bit lane, 238 = two high elements).
__m512 tmp487 = _mm512_shuffle_ps(tmp475, tmp477, 68);
__m512 tmp488 = _mm512_shuffle_ps(tmp475, tmp477, 238);
__m512 tmp489 = _mm512_shuffle_ps(tmp476, tmp478, 68);
__m512 tmp490 = _mm512_shuffle_ps(tmp476, tmp478, 238);
__m512 tmp491 = _mm512_shuffle_ps(tmp479, tmp481, 68);
__m512 tmp492 = _mm512_shuffle_ps(tmp479, tmp481, 238);
__m512 tmp493 = _mm512_shuffle_ps(tmp480, tmp482, 68);
__m512 tmp494 = _mm512_shuffle_ps(tmp480, tmp482, 238);
__m512 tmp495 = _mm512_shuffle_ps(tmp483, tmp485, 68);
__m512 tmp496 = _mm512_shuffle_ps(tmp483, tmp485, 238);
__m512 tmp497 = _mm512_shuffle_ps(tmp484, tmp486, 68);
__m512 tmp498 = _mm512_shuffle_ps(tmp484, tmp486, 238);
// Stage 3: regroup 128-bit lanes (136 = lanes 0 and 2 of each operand,
// 221 = lanes 1 and 3). There is no fourth group of four loads, so
// tmp495..tmp498 are paired with themselves.
__m512 tmp499 = _mm512_shuffle_f32x4(tmp487, tmp491, 136);
__m512 tmp500 = _mm512_shuffle_f32x4(tmp487, tmp491, 221);
__m512 tmp501 = _mm512_shuffle_f32x4(tmp488, tmp492, 136);
__m512 tmp502 = _mm512_shuffle_f32x4(tmp488, tmp492, 221);
__m512 tmp503 = _mm512_shuffle_f32x4(tmp489, tmp493, 136);
__m512 tmp504 = _mm512_shuffle_f32x4(tmp489, tmp493, 221);
__m512 tmp505 = _mm512_shuffle_f32x4(tmp490, tmp494, 136);
__m512 tmp506 = _mm512_shuffle_f32x4(tmp490, tmp494, 221);
__m512 tmp507 = _mm512_shuffle_f32x4(tmp495, tmp495, 136);
__m512 tmp508 = _mm512_shuffle_f32x4(tmp495, tmp495, 221);
__m512 tmp509 = _mm512_shuffle_f32x4(tmp496, tmp496, 136);
__m512 tmp510 = _mm512_shuffle_f32x4(tmp496, tmp496, 221);
__m512 tmp511 = _mm512_shuffle_f32x4(tmp497, tmp497, 136);
__m512 tmp512 = _mm512_shuffle_f32x4(tmp497, tmp497, 221);
__m512 tmp513 = _mm512_shuffle_f32x4(tmp498, tmp498, 136);
__m512 tmp514 = _mm512_shuffle_f32x4(tmp498, tmp498, 221);
// Stage 4: final lane regroup. wt193..wt208 hold elements 0..15,
// respectively, of the 12 loaded vectors in lanes 0-11; lanes 12-15 repeat
// lanes 8-11 and are never stored. Four extra vectors (wt205..wt208) are
// declared here because only 12 input names exist to reuse.
wt193 = _mm512_shuffle_f32x4(tmp499, tmp507, 136);
wt201 = _mm512_shuffle_f32x4(tmp499, tmp507, 221);
wt194 = _mm512_shuffle_f32x4(tmp501, tmp509, 136);
wt202 = _mm512_shuffle_f32x4(tmp501, tmp509, 221);
wt195 = _mm512_shuffle_f32x4(tmp503, tmp511, 136);
wt203 = _mm512_shuffle_f32x4(tmp503, tmp511, 221);
wt196 = _mm512_shuffle_f32x4(tmp505, tmp513, 136);
wt204 = _mm512_shuffle_f32x4(tmp505, tmp513, 221);
wt197 = _mm512_shuffle_f32x4(tmp500, tmp508, 136);
__m512 wt205 = _mm512_shuffle_f32x4(tmp500, tmp508, 221);
wt198 = _mm512_shuffle_f32x4(tmp502, tmp510, 136);
__m512 wt206 = _mm512_shuffle_f32x4(tmp502, tmp510, 221);
wt199 = _mm512_shuffle_f32x4(tmp504, tmp512, 136);
__m512 wt207 = _mm512_shuffle_f32x4(tmp504, tmp512, 221);
wt200 = _mm512_shuffle_f32x4(tmp506, tmp514, 136);
__m512 wt208 = _mm512_shuffle_f32x4(tmp506, tmp514, 221);
// Write each transposed vector in two masked pieces: lanes 0-5 (mask 63) and
// lanes 6-11 (mask 4032). The second store base is 7608 past the first.
// NOTE(review): if arrangedW1 is a byte pointer, lanes 6-11 begin 24 bytes
// into the vector, so the second piece lands exactly 7632 after the first --
// confirm against the declaration of arrangedW1.
_mm512_mask_storeu_ps(arrangedW1+0+60582816*i5+7632*c2+24*k4, 63, wt193);
_mm512_mask_storeu_ps(arrangedW1+7608+60582816*i5+7632*c2+24*k4, 4032, wt193);
_mm512_mask_storeu_ps(arrangedW1+14836608+60582816*i5+7632*c2+24*k4, 63, wt194);
_mm512_mask_storeu_ps(arrangedW1+14844216+60582816*i5+7632*c2+24*k4, 4032, wt194);
_mm512_mask_storeu_ps(arrangedW1+24727680+60582816*i5+7632*c2+24*k4, 63, wt195);
_mm512_mask_storeu_ps(arrangedW1+24735288+60582816*i5+7632*c2+24*k4, 4032, wt195);
_mm512_mask_storeu_ps(arrangedW1+1236384+60582816*i5+7632*c2+24*k4, 63, wt196);
_mm512_mask_storeu_ps(arrangedW1+1243992+60582816*i5+7632*c2+24*k4, 4032, wt196);
_mm512_mask_storeu_ps(arrangedW1+16072992+60582816*i5+7632*c2+24*k4, 63, wt197);
_mm512_mask_storeu_ps(arrangedW1+16080600+60582816*i5+7632*c2+24*k4, 4032, wt197);
_mm512_mask_storeu_ps(arrangedW1+25964064+60582816*i5+7632*c2+24*k4, 63, wt198);
_mm512_mask_storeu_ps(arrangedW1+25971672+60582816*i5+7632*c2+24*k4, 4032, wt198);
_mm512_mask_storeu_ps(arrangedW1+2472768+60582816*i5+7632*c2+24*k4, 63, wt199);
_mm512_mask_storeu_ps(arrangedW1+2480376+60582816*i5+7632*c2+24*k4, 4032, wt199);
_mm512_mask_storeu_ps(arrangedW1+34618752+60582816*i5+7632*c2+24*k4, 63, wt200);
_mm512_mask_storeu_ps(arrangedW1+34626360+60582816*i5+7632*c2+24*k4, 4032, wt200);
_mm512_mask_storeu_ps(arrangedW1+45746208+60582816*i5+7632*c2+24*k4, 63, wt201);
_mm512_mask_storeu_ps(arrangedW1+45753816+60582816*i5+7632*c2+24*k4, 4032, wt201);
_mm512_mask_storeu_ps(arrangedW1+53164512+60582816*i5+7632*c2+24*k4, 63, wt202);
_mm512_mask_storeu_ps(arrangedW1+53172120+60582816*i5+7632*c2+24*k4, 4032, wt202);
_mm512_mask_storeu_ps(arrangedW1+35855136+60582816*i5+7632*c2+24*k4, 63, wt203);
_mm512_mask_storeu_ps(arrangedW1+35862744+60582816*i5+7632*c2+24*k4, 4032, wt203);
_mm512_mask_storeu_ps(arrangedW1+46982592+60582816*i5+7632*c2+24*k4, 63, wt204);
_mm512_mask_storeu_ps(arrangedW1+46990200+60582816*i5+7632*c2+24*k4, 4032, wt204);
_mm512_mask_storeu_ps(arrangedW1+54400896+60582816*i5+7632*c2+24*k4, 63, wt205);
_mm512_mask_storeu_ps(arrangedW1+54408504+60582816*i5+7632*c2+24*k4, 4032, wt205);
_mm512_mask_storeu_ps(arrangedW1+37091520+60582816*i5+7632*c2+24*k4, 63, wt206);
_mm512_mask_storeu_ps(arrangedW1+37099128+60582816*i5+7632*c2+24*k4, 4032, wt206);
_mm512_mask_storeu_ps(arrangedW1+3709152+60582816*i5+7632*c2+24*k4, 63, wt207);
_mm512_mask_storeu_ps(arrangedW1+3716760+60582816*i5+7632*c2+24*k4, 4032, wt207);
_mm512_mask_storeu_ps(arrangedW1+17309376+60582816*i5+7632*c2+24*k4, 63, wt208);
_mm512_mask_storeu_ps(arrangedW1+17316984+60582816*i5+7632*c2+24*k4, 4032, wt208);
// Load 12 vectors of 16 floats each (mask 65535 enables every lane), starting
// at offset +64 with successive loads 62328 apart.
// NOTE(review): +64 would be the next 16 floats after the +0 loads if wtPtr1
// is a byte pointer; its declaration is outside this chunk -- confirm.
__m512 wt209 = _mm512_maskz_loadu_ps(65535, wtPtr1+64+60582816*i5+997248*j1+196*k4);
__m512 wt210 = _mm512_maskz_loadu_ps(65535, wtPtr1+62392+60582816*i5+997248*j1+196*k4);
__m512 wt211 = _mm512_maskz_loadu_ps(65535, wtPtr1+124720+60582816*i5+997248*j1+196*k4);
__m512 wt212 = _mm512_maskz_loadu_ps(65535, wtPtr1+187048+60582816*i5+997248*j1+196*k4);
__m512 wt213 = _mm512_maskz_loadu_ps(65535, wtPtr1+249376+60582816*i5+997248*j1+196*k4);
__m512 wt214 = _mm512_maskz_loadu_ps(65535, wtPtr1+311704+60582816*i5+997248*j1+196*k4);
__m512 wt215 = _mm512_maskz_loadu_ps(65535, wtPtr1+374032+60582816*i5+997248*j1+196*k4);
__m512 wt216 = _mm512_maskz_loadu_ps(65535, wtPtr1+436360+60582816*i5+997248*j1+196*k4);
__m512 wt217 = _mm512_maskz_loadu_ps(65535, wtPtr1+498688+60582816*i5+997248*j1+196*k4);
__m512 wt218 = _mm512_maskz_loadu_ps(65535, wtPtr1+561016+60582816*i5+997248*j1+196*k4);
__m512 wt219 = _mm512_maskz_loadu_ps(65535, wtPtr1+623344+60582816*i5+997248*j1+196*k4);
__m512 wt220 = _mm512_maskz_loadu_ps(65535, wtPtr1+685672+60582816*i5+997248*j1+196*k4);
// Transpose stage 1: interleave each adjacent pair of loaded vectors within
// every 128-bit lane.
__m512 tmp515 = _mm512_unpacklo_ps(wt209, wt210);
__m512 tmp516 = _mm512_unpackhi_ps(wt209, wt210);
__m512 tmp517 = _mm512_unpacklo_ps(wt211, wt212);
__m512 tmp518 = _mm512_unpackhi_ps(wt211, wt212);
__m512 tmp519 = _mm512_unpacklo_ps(wt213, wt214);
__m512 tmp520 = _mm512_unpackhi_ps(wt213, wt214);
__m512 tmp521 = _mm512_unpacklo_ps(wt215, wt216);
__m512 tmp522 = _mm512_unpackhi_ps(wt215, wt216);
__m512 tmp523 = _mm512_unpacklo_ps(wt217, wt218);
__m512 tmp524 = _mm512_unpackhi_ps(wt217, wt218);
__m512 tmp525 = _mm512_unpacklo_ps(wt219, wt220);
__m512 tmp526 = _mm512_unpackhi_ps(wt219, wt220);
// Stage 2: merge pairs into groups of four loads (68 = two low elements of
// each operand per 128-bit lane, 238 = two high elements).
__m512 tmp527 = _mm512_shuffle_ps(tmp515, tmp517, 68);
__m512 tmp528 = _mm512_shuffle_ps(tmp515, tmp517, 238);
__m512 tmp529 = _mm512_shuffle_ps(tmp516, tmp518, 68);
__m512 tmp530 = _mm512_shuffle_ps(tmp516, tmp518, 238);
__m512 tmp531 = _mm512_shuffle_ps(tmp519, tmp521, 68);
__m512 tmp532 = _mm512_shuffle_ps(tmp519, tmp521, 238);
__m512 tmp533 = _mm512_shuffle_ps(tmp520, tmp522, 68);
__m512 tmp534 = _mm512_shuffle_ps(tmp520, tmp522, 238);
__m512 tmp535 = _mm512_shuffle_ps(tmp523, tmp525, 68);
__m512 tmp536 = _mm512_shuffle_ps(tmp523, tmp525, 238);
__m512 tmp537 = _mm512_shuffle_ps(tmp524, tmp526, 68);
__m512 tmp538 = _mm512_shuffle_ps(tmp524, tmp526, 238);
// Stage 3: regroup 128-bit lanes (136 = lanes 0 and 2 of each operand,
// 221 = lanes 1 and 3). There is no fourth group of four loads, so
// tmp535..tmp538 are paired with themselves.
__m512 tmp539 = _mm512_shuffle_f32x4(tmp527, tmp531, 136);
__m512 tmp540 = _mm512_shuffle_f32x4(tmp527, tmp531, 221);
__m512 tmp541 = _mm512_shuffle_f32x4(tmp528, tmp532, 136);
__m512 tmp542 = _mm512_shuffle_f32x4(tmp528, tmp532, 221);
__m512 tmp543 = _mm512_shuffle_f32x4(tmp529, tmp533, 136);
__m512 tmp544 = _mm512_shuffle_f32x4(tmp529, tmp533, 221);
__m512 tmp545 = _mm512_shuffle_f32x4(tmp530, tmp534, 136);
__m512 tmp546 = _mm512_shuffle_f32x4(tmp530, tmp534, 221);
__m512 tmp547 = _mm512_shuffle_f32x4(tmp535, tmp535, 136);
__m512 tmp548 = _mm512_shuffle_f32x4(tmp535, tmp535, 221);
__m512 tmp549 = _mm512_shuffle_f32x4(tmp536, tmp536, 136);
__m512 tmp550 = _mm512_shuffle_f32x4(tmp536, tmp536, 221);
__m512 tmp551 = _mm512_shuffle_f32x4(tmp537, tmp537, 136);
__m512 tmp552 = _mm512_shuffle_f32x4(tmp537, tmp537, 221);
__m512 tmp553 = _mm512_shuffle_f32x4(tmp538, tmp538, 136);
__m512 tmp554 = _mm512_shuffle_f32x4(tmp538, tmp538, 221);
// Stage 4: final lane regroup. wt209..wt224 hold elements 0..15,
// respectively, of the 12 loaded vectors in lanes 0-11; lanes 12-15 repeat
// lanes 8-11 and are never stored.
wt209 = _mm512_shuffle_f32x4(tmp539, tmp547, 136);
wt217 = _mm512_shuffle_f32x4(tmp539, tmp547, 221);
wt210 = _mm512_shuffle_f32x4(tmp541, tmp549, 136);
wt218 = _mm512_shuffle_f32x4(tmp541, tmp549, 221);
wt211 = _mm512_shuffle_f32x4(tmp543, tmp551, 136);
wt219 = _mm512_shuffle_f32x4(tmp543, tmp551, 221);
wt212 = _mm512_shuffle_f32x4(tmp545, tmp553, 136);
wt220 = _mm512_shuffle_f32x4(tmp545, tmp553, 221);
wt213 = _mm512_shuffle_f32x4(tmp540, tmp548, 136);
__m512 wt221 = _mm512_shuffle_f32x4(tmp540, tmp548, 221);
wt214 = _mm512_shuffle_f32x4(tmp542, tmp550, 136);
__m512 wt222 = _mm512_shuffle_f32x4(tmp542, tmp550, 221);
wt215 = _mm512_shuffle_f32x4(tmp544, tmp552, 136);
__m512 wt223 = _mm512_shuffle_f32x4(tmp544, tmp552, 221);
wt216 = _mm512_shuffle_f32x4(tmp546, tmp554, 136);
__m512 wt224 = _mm512_shuffle_f32x4(tmp546, tmp554, 221);
// Write each transposed vector in two masked pieces: lanes 0-5 (mask 63) and
// lanes 6-11 (mask 4032). The second store base is 7608 past the first;
// masked-off lanes are not written.
_mm512_mask_storeu_ps(arrangedW1+27200448+60582816*i5+7632*c2+24*k4, 63, wt209);
_mm512_mask_storeu_ps(arrangedW1+27208056+60582816*i5+7632*c2+24*k4, 4032, wt209);
_mm512_mask_storeu_ps(arrangedW1+4945536+60582816*i5+7632*c2+24*k4, 63, wt210);
_mm512_mask_storeu_ps(arrangedW1+4953144+60582816*i5+7632*c2+24*k4, 4032, wt210);
_mm512_mask_storeu_ps(arrangedW1+18545760+60582816*i5+7632*c2+24*k4, 63, wt211);
_mm512_mask_storeu_ps(arrangedW1+18553368+60582816*i5+7632*c2+24*k4, 4032, wt211);
_mm512_mask_storeu_ps(arrangedW1+28436832+60582816*i5+7632*c2+24*k4, 63, wt212);
_mm512_mask_storeu_ps(arrangedW1+28444440+60582816*i5+7632*c2+24*k4, 4032, wt212);
_mm512_mask_storeu_ps(arrangedW1+6181920+60582816*i5+7632*c2+24*k4, 63, wt213);
_mm512_mask_storeu_ps(arrangedW1+6189528+60582816*i5+7632*c2+24*k4, 4032, wt213);
_mm512_mask_storeu_ps(arrangedW1+38327904+60582816*i5+7632*c2+24*k4, 63, wt214);
_mm512_mask_storeu_ps(arrangedW1+38335512+60582816*i5+7632*c2+24*k4, 4032, wt214);
_mm512_mask_storeu_ps(arrangedW1+48218976+60582816*i5+7632*c2+24*k4, 63, wt215);
_mm512_mask_storeu_ps(arrangedW1+48226584+60582816*i5+7632*c2+24*k4, 4032, wt215);
_mm512_mask_storeu_ps(arrangedW1+55637280+60582816*i5+7632*c2+24*k4, 63, wt216);
_mm512_mask_storeu_ps(arrangedW1+55644888+60582816*i5+7632*c2+24*k4, 4032, wt216);
_mm512_mask_storeu_ps(arrangedW1+39564288+60582816*i5+7632*c2+24*k4, 63, wt217);
_mm512_mask_storeu_ps(arrangedW1+39571896+60582816*i5+7632*c2+24*k4, 4032, wt217);
_mm512_mask_storeu_ps(arrangedW1+49455360+60582816*i5+7632*c2+24*k4, 63, wt218);
_mm512_mask_storeu_ps(arrangedW1+49462968+60582816*i5+7632*c2+24*k4, 4032, wt218);
_mm512_mask_storeu_ps(arrangedW1+56873664+60582816*i5+7632*c2+24*k4, 63, wt219);
_mm512_mask_storeu_ps(arrangedW1+56881272+60582816*i5+7632*c2+24*k4, 4032, wt219);
_mm512_mask_storeu_ps(arrangedW1+40800672+60582816*i5+7632*c2+24*k4, 63, wt220);
_mm512_mask_storeu_ps(arrangedW1+40808280+60582816*i5+7632*c2+24*k4, 4032, wt220);
_mm512_mask_storeu_ps(arrangedW1+7418304+60582816*i5+7632*c2+24*k4, 63, wt221);
_mm512_mask_storeu_ps(arrangedW1+7425912+60582816*i5+7632*c2+24*k4, 4032, wt221);
_mm512_mask_storeu_ps(arrangedW1+19782144+60582816*i5+7632*c2+24*k4, 63, wt222);
_mm512_mask_storeu_ps(arrangedW1+19789752+60582816*i5+7632*c2+24*k4, 4032, wt222);
_mm512_mask_storeu_ps(arrangedW1+29673216+60582816*i5+7632*c2+24*k4, 63, wt223);
_mm512_mask_storeu_ps(arrangedW1+29680824+60582816*i5+7632*c2+24*k4, 4032, wt223);
_mm512_mask_storeu_ps(arrangedW1+8654688+60582816*i5+7632*c2+24*k4, 63, wt224);
_mm512_mask_storeu_ps(arrangedW1+8662296+60582816*i5+7632*c2+24*k4, 4032, wt224);
__m512 wt225 = _mm512_maskz_loadu_ps(65535, wtPtr1+128+60582816*i5+997248*j1+196*k4);
__m512 wt226 = _mm512_maskz_loadu_ps(65535, wtPtr1+62456+60582816*i5+997248*j1+196*k4);
__m512 wt227 = _mm512_maskz_loadu_ps(65535, wtPtr1+124784+60582816*i5+997248*j1+196*k4);
__m512 wt228 = _mm512_maskz_loadu_ps(65535, wtPtr1+187112+60582816*i5+997248*j1+196*k4);
__m512 wt229 = _mm512_maskz_loadu_ps(65535, wtPtr1+249440+60582816*i5+997248*j1+196*k4);
__m512 wt230 = _mm512_maskz_loadu_ps(65535, wtPtr1+311768+60582816*i5+997248*j1+196*k4);
__m512 wt231 = _mm512_maskz_loadu_ps(65535, wtPtr1+374096+60582816*i5+997248*j1+196*k4);
__m512 wt232 = _mm512_maskz_loadu_ps(65535, wtPtr1+436424+60582816*i5+997248*j1+196*k4);
__m512 wt233 = _mm512_maskz_loadu_ps(65535, wtPtr1+498752+60582816*i5+997248*j1+196*k4);
__m512 wt234 = _mm512_maskz_loadu_ps(65535, wtPtr1+561080+60582816*i5+997248*j1+196*k4);
__m512 wt235 = _mm512_maskz_loadu_ps(65535, wtPtr1+623408+60582816*i5+997248*j1+196*k4);
__m512 wt236 = _mm512_maskz_loadu_ps(65535, wtPtr1+685736+60582816*i5+997248*j1+196*k4);
__m512 tmp555 = _mm512_unpacklo_ps(wt225, wt226);
__m512 tmp556 = _mm512_unpackhi_ps(wt225, wt226);
__m512 tmp557 = _mm512_unpacklo_ps(wt227, wt228);
__m512 tmp558 = _mm512_unpackhi_ps(wt227, wt228);
__m512 tmp559 = _mm512_unpacklo_ps(wt229, wt230);
__m512 tmp560 = _mm512_unpackhi_ps(wt229, wt230);
__m512 tmp561 = _mm512_unpacklo_ps(wt231, wt232);
__m512 tmp562 = _mm512_unpackhi_ps(wt231, wt232);
__m512 tmp563 = _mm512_unpacklo_ps(wt233, wt234);
__m512 tmp564 = _mm512_unpackhi_ps(wt233, wt234);
__m512 tmp565 = _mm512_unpacklo_ps(wt235, wt236);
__m512 tmp566 = _mm512_unpackhi_ps(wt235, wt236);
__m512 tmp567 = _mm512_shuffle_ps(tmp555, tmp557, 68);
__m512 tmp568 = _mm512_shuffle_ps(tmp555, tmp557, 238);
__m512 tmp569 = _mm512_shuffle_ps(tmp556, tmp558, 68);
__m512 tmp570 = _mm512_shuffle_ps(tmp556, tmp558, 238);
__m512 tmp571 = _mm512_shuffle_ps(tmp559, tmp561, 68);
__m512 tmp572 = _mm512_shuffle_ps(tmp559, tmp561, 238);
__m512 tmp573 = _mm512_shuffle_ps(tmp560, tmp562, 68);
__m512 tmp574 = _mm512_shuffle_ps(tmp560, tmp562, 238);
__m512 tmp575 = _mm512_shuffle_ps(tmp563, tmp565, 68);
__m512 tmp576 = _mm512_shuffle_ps(tmp563, tmp565, 238);
__m512 tmp577 = _mm512_shuffle_ps(tmp564, tmp566, 68);
__m512 tmp578 = _mm512_shuffle_ps(tmp564, tmp566, 238);
__m512 tmp579 = _mm512_shuffle_f32x4(tmp567, tmp571, 136);
__m512 tmp580 = _mm512_shuffle_f32x4(tmp567, tmp571, 221);
__m512 tmp581 = _mm512_shuffle_f32x4(tmp568, tmp572, 136);
__m512 tmp582 = _mm512_shuffle_f32x4(tmp568, tmp572, 221);
__m512 tmp583 = _mm512_shuffle_f32x4(tmp569, tmp573, 136);
__m512 tmp584 = _mm512_shuffle_f32x4(tmp569, tmp573, 221);
__m512 tmp585 = _mm512_shuffle_f32x4(tmp570, tmp574, 136);
__m512 tmp586 = _mm512_shuffle_f32x4(tmp570, tmp574, 221);
__m512 tmp587 = _mm512_shuffle_f32x4(tmp575, tmp575, 136);
__m512 tmp588 = _mm512_shuffle_f32x4(tmp575, tmp575, 221);
__m512 tmp589 = _mm512_shuffle_f32x4(tmp576, tmp576, 136);
__m512 tmp590 = _mm512_shuffle_f32x4(tmp576, tmp576, 221);
__m512 tmp591 = _mm512_shuffle_f32x4(tmp577, tmp577, 136);
__m512 tmp592 = _mm512_shuffle_f32x4(tmp577, tmp577, 221);
__m512 tmp593 = _mm512_shuffle_f32x4(tmp578, tmp578, 136);
__m512 tmp594 = _mm512_shuffle_f32x4(tmp578, tmp578, 221);
wt225 = _mm512_shuffle_f32x4(tmp579, tmp587, 136);
wt233 = _mm512_shuffle_f32x4(tmp579, tmp587, 221);
wt226 = _mm512_shuffle_f32x4(tmp581, tmp589, 136);
wt234 = _mm512_shuffle_f32x4(tmp581, tmp589, 221);
wt227 = _mm512_shuffle_f32x4(tmp583, tmp591, 136);
wt235 = _mm512_shuffle_f32x4(tmp583, tmp591, 221);
wt228 = _mm512_shuffle_f32x4(tmp585, tmp593, 136);
wt236 = _mm512_shuffle_f32x4(tmp585, tmp593, 221);
wt229 = _mm512_shuffle_f32x4(tmp580, tmp588, 136);
__m512 wt237 = _mm512_shuffle_f32x4(tmp580, tmp588, 221);
wt230 = _mm512_shuffle_f32x4(tmp582, tmp590, 136);
__m512 wt238 = _mm512_shuffle_f32x4(tmp582, tmp590, 221);
wt231 = _mm512_shuffle_f32x4(tmp584, tmp592, 136);
__m512 wt239 = _mm512_shuffle_f32x4(tmp584, tmp592, 221);
wt232 = _mm512_shuffle_f32x4(tmp586, tmp594, 136);
__m512 wt240 = _mm512_shuffle_f32x4(tmp586, tmp594, 221);
_mm512_mask_storeu_ps(arrangedW1+21018528+60582816*i5+7632*c2+24*k4, 63, wt225);
_mm512_mask_storeu_ps(arrangedW1+21026136+60582816*i5+7632*c2+24*k4, 4032, wt225);
_mm512_mask_storeu_ps(arrangedW1+30909600+60582816*i5+7632*c2+24*k4, 63, wt226);
_mm512_mask_storeu_ps(arrangedW1+30917208+60582816*i5+7632*c2+24*k4, 4032, wt226);
_mm512_mask_storeu_ps(arrangedW1+9891072+60582816*i5+7632*c2+24*k4, 63, wt227);
_mm512_mask_storeu_ps(arrangedW1+9898680+60582816*i5+7632*c2+24*k4, 4032, wt227);
_mm512_mask_storeu_ps(arrangedW1+42037056+60582816*i5+7632*c2+24*k4, 63, wt228);
_mm512_mask_storeu_ps(arrangedW1+42044664+60582816*i5+7632*c2+24*k4, 4032, wt228);
_mm512_mask_storeu_ps(arrangedW1+50691744+60582816*i5+7632*c2+24*k4, 63, wt229);
_mm512_mask_storeu_ps(arrangedW1+50699352+60582816*i5+7632*c2+24*k4, 4032, wt229);
_mm512_mask_storeu_ps(arrangedW1+58110048+60582816*i5+7632*c2+24*k4, 63, wt230);
_mm512_mask_storeu_ps(arrangedW1+58117656+60582816*i5+7632*c2+24*k4, 4032, wt230);
_mm512_mask_storeu_ps(arrangedW1+43273440+60582816*i5+7632*c2+24*k4, 63, wt231);
_mm512_mask_storeu_ps(arrangedW1+43281048+60582816*i5+7632*c2+24*k4, 4032, wt231);
_mm512_mask_storeu_ps(arrangedW1+51928128+60582816*i5+7632*c2+24*k4, 63, wt232);
_mm512_mask_storeu_ps(arrangedW1+51935736+60582816*i5+7632*c2+24*k4, 4032, wt232);
_mm512_mask_storeu_ps(arrangedW1+59346432+60582816*i5+7632*c2+24*k4, 63, wt233);
_mm512_mask_storeu_ps(arrangedW1+59354040+60582816*i5+7632*c2+24*k4, 4032, wt233);
_mm512_mask_storeu_ps(arrangedW1+44509824+60582816*i5+7632*c2+24*k4, 63, wt234);
_mm512_mask_storeu_ps(arrangedW1+44517432+60582816*i5+7632*c2+24*k4, 4032, wt234);
_mm512_mask_storeu_ps(arrangedW1+11127456+60582816*i5+7632*c2+24*k4, 63, wt235);
_mm512_mask_storeu_ps(arrangedW1+11135064+60582816*i5+7632*c2+24*k4, 4032, wt235);
_mm512_mask_storeu_ps(arrangedW1+22254912+60582816*i5+7632*c2+24*k4, 63, wt236);
_mm512_mask_storeu_ps(arrangedW1+22262520+60582816*i5+7632*c2+24*k4, 4032, wt236);
_mm512_mask_storeu_ps(arrangedW1+32145984+60582816*i5+7632*c2+24*k4, 63, wt237);
_mm512_mask_storeu_ps(arrangedW1+32153592+60582816*i5+7632*c2+24*k4, 4032, wt237);
_mm512_mask_storeu_ps(arrangedW1+12363840+60582816*i5+7632*c2+24*k4, 63, wt238);
_mm512_mask_storeu_ps(arrangedW1+12371448+60582816*i5+7632*c2+24*k4, 4032, wt238);
_mm512_mask_storeu_ps(arrangedW1+23491296+60582816*i5+7632*c2+24*k4, 63, wt239);
_mm512_mask_storeu_ps(arrangedW1+23498904+60582816*i5+7632*c2+24*k4, 4032, wt239);
_mm512_mask_storeu_ps(arrangedW1+33382368+60582816*i5+7632*c2+24*k4, 63, wt240);
_mm512_mask_storeu_ps(arrangedW1+33389976+60582816*i5+7632*c2+24*k4, 4032, wt240);
__m512 wt241 = _mm512_maskz_loadu_ps(1, wtPtr1+192+60582816*i5+997248*j1+196*k4);
__m512 wt242 = _mm512_maskz_loadu_ps(1, wtPtr1+62520+60582816*i5+997248*j1+196*k4);
__m512 wt243 = _mm512_maskz_loadu_ps(1, wtPtr1+124848+60582816*i5+997248*j1+196*k4);
__m512 wt244 = _mm512_maskz_loadu_ps(1, wtPtr1+187176+60582816*i5+997248*j1+196*k4);
__m512 wt245 = _mm512_maskz_loadu_ps(1, wtPtr1+249504+60582816*i5+997248*j1+196*k4);
__m512 wt246 = _mm512_maskz_loadu_ps(1, wtPtr1+311832+60582816*i5+997248*j1+196*k4);
__m512 wt247 = _mm512_maskz_loadu_ps(1, wtPtr1+374160+60582816*i5+997248*j1+196*k4);
__m512 wt248 = _mm512_maskz_loadu_ps(1, wtPtr1+436488+60582816*i5+997248*j1+196*k4);
__m512 wt249 = _mm512_maskz_loadu_ps(1, wtPtr1+498816+60582816*i5+997248*j1+196*k4);
__m512 wt250 = _mm512_maskz_loadu_ps(1, wtPtr1+561144+60582816*i5+997248*j1+196*k4);
__m512 wt251 = _mm512_maskz_loadu_ps(1, wtPtr1+623472+60582816*i5+997248*j1+196*k4);
__m512 wt252 = _mm512_maskz_loadu_ps(1, wtPtr1+685800+60582816*i5+997248*j1+196*k4);
__m512 tmp595 = _mm512_unpacklo_ps(wt241, wt242);
__m512 tmp596 = _mm512_unpacklo_ps(wt243, wt244);
__m512 tmp597 = _mm512_unpacklo_ps(wt245, wt246);
__m512 tmp598 = _mm512_unpacklo_ps(wt247, wt248);
__m512 tmp599 = _mm512_unpacklo_ps(wt249, wt250);
__m512 tmp600 = _mm512_unpacklo_ps(wt251, wt252);
__m512 tmp601 = _mm512_shuffle_ps(tmp595, tmp596, 68);
__m512 tmp602 = _mm512_shuffle_ps(tmp597, tmp598, 68);
__m512 tmp603 = _mm512_shuffle_ps(tmp599, tmp600, 68);
__m512 tmp604 = _mm512_shuffle_f32x4(tmp601, tmp602, 136);
wt241 = _mm512_shuffle_f32x4(tmp604, tmp603, 136);
_mm512_mask_storeu_ps(arrangedW1+13600224+60582816*i5+7632*c2+24*k4, 63, wt241);
_mm512_mask_storeu_ps(arrangedW1+13607832+60582816*i5+7632*c2+24*k4, 4032, wt241);
}
if (j1 >= jj1) return;
j1 = 61;
}
}

// Packages the filter-arrangement job into a threader task and runs it.
//
// team13:   team handle, forwarded unchanged to Example5ThreaderDo1.
// tensors1: pointer array stored in the task's any1 slot so the callee
//           can retrieve it.
//           NOTE(review): presumably the source and arranged weight
//           buffers -- confirm against Example5LoomArrangeFilts1Callee1.
//
// The task is declared with nd1 = 3 and hull entries 61, 4, 1.
// NOTE(review): these look like the per-dimension extents of the index
// space handed to the callee -- confirm against Example5ThreaderDo1.
static void Example5LoomArrangeFilts1(Example5ThreaderTeam1* team13, char** tensors1) {
    // Left without an initializer on purpose: only the fields assigned
    // below are written.
    Example5ThreaderTask1 arrangeJob;

    // Shape of the job: dimension count first, then one hull entry each.
    arrangeJob.nd1 = 3;
    arrangeJob.hull1[2] = 1;
    arrangeJob.hull1[1] = 4;
    arrangeJob.hull1[0] = 61;

    // Worker entry point and the opaque argument it receives.
    arrangeJob.any1 = tensors1;
    arrangeJob.callee1 = Example5LoomArrangeFilts1Callee1;

    Example5ThreaderDo1(team13, &arrangeJob);
}

static void Example5LoomArrangeDats1Callee1(Example5ThreaderTask1* task6, int64_t* pt8) {
char** tensors4 = task6->any1;
ptrdiff_t s1 = pt8[0];
ptrdiff_t c3 = pt8[1];
ptrdiff_t g3 = pt8[2];
ptrdiff_t e2 = 0;
char*restrict datPtr1 = tensors4[0]-1300+40039920*e2;
char*restrict arranged1 = tensors4[1]+123125760*e2;
ptrdiff_t i6 = 1*g3;
ptrdiff_t j2 = 1*c3;
ptrdiff_t last1 = j2+0;
if (j2 < 3) {
ptrdiff_t rel1 = j2-0;
ptrdiff_t h1 = 0;
if (rel1 < 1) {
ptrdiff_t w1 = 0;
ptrdiff_t k5 = 22*s1;
ptrdiff_t kk1 = k5+(s1 < 13 ? 21 : 31);
for (; k5 <= kk1; ++k5) {
_mm512_mask_storeu_ps(arranged1+0+1953792*i6+81408*j2+256*k5, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+7815168+1953792*i6+81408*j2+256*k5, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15630336+1953792*i6+81408*j2+256*k5, 65535, _mm512_setzero_ps());
__m512 dat1 = _mm512_maskz_loadu_ps(65534, datPtr1+1728+15248736*i6+47952*k5+432*h1+4*w1);
__m512 dat2 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+15248736*i6+47952*k5+432*h1+4*w1);
__m512 dat3 = _mm512_maskz_loadu_ps(65535, datPtr1+1856+15248736*i6+47952*k5+432*h1+4*w1);
__m512i pm1 = _mm512_set_epi32(31, 31, 31, 31, 31, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
_mm512_mask_storeu_ps(arranged1+64+1953792*i6+81408*j2+256*k5, 2047, _mm512_permutex2var_ps(dat1, pm1, dat2));
__m512i pm2 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1);
_mm512_mask_storeu_ps(arranged1+108+1953792*i6+81408*j2+256*k5, 31, _mm512_permutexvar_ps(pm2, dat3));
_mm512_mask_storeu_ps(arranged1+7815232+1953792*i6+81408*j2+256*k5, 2047, _mm512_permutex2var_ps(dat1, pm2, dat2));
__m512i pm3 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2);
_mm512_mask_storeu_ps(arranged1+7815276+1953792*i6+81408*j2+256*k5, 31, _mm512_permutexvar_ps(pm3, dat3));
_mm512_mask_storeu_ps(arranged1+15630400+1953792*i6+81408*j2+256*k5, 1023, _mm512_permutex2var_ps(dat1, pm3, dat2));
_mm512_mask_storeu_ps(arranged1+15630440+1953792*i6+81408*j2+256*k5, 63, _mm512_permutexvar_ps(pm1, dat3));
__m512 dat4 = _mm512_maskz_loadu_ps(65534, datPtr1+3456+15248736*i6+47952*k5+432*h1+4*w1);
__m512 dat5 = _mm512_maskz_loadu_ps(65535, datPtr1+3520+15248736*i6+47952*k5+432*h1+4*w1);
__m512 dat6 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+15248736*i6+47952*k5+432*h1+4*w1);
_mm512_mask_storeu_ps(arranged1+128+1953792*i6+81408*j2+256*k5, 2047, _mm512_permutex2var_ps(dat4, pm1, dat5));
_mm512_mask_storeu_ps(arranged1+172+1953792*i6+81408*j2+256*k5, 31, _mm512_permutexvar_ps(pm2, dat6));
_mm512_mask_storeu_ps(arranged1+7815296+1953792*i6+81408*j2+256*k5, 2047, _mm512_permutex2var_ps(dat4, pm2, dat5));
_mm512_mask_storeu_ps(arranged1+7815340+1953792*i6+81408*j2+256*k5, 31, _mm512_permutexvar_ps(pm3, dat6));
_mm512_mask_storeu_ps(arranged1+15630464+1953792*i6+81408*j2+256*k5, 1023, _mm512_permutex2var_ps(dat4, pm3, dat5));
_mm512_mask_storeu_ps(arranged1+15630504+1953792*i6+81408*j2+256*k5, 63, _mm512_permutexvar_ps(pm1, dat6));
__m512 dat7 = _mm512_maskz_loadu_ps(65534, datPtr1+5184+15248736*i6+47952*k5+432*h1+4*w1);
__m512 dat8 = _mm512_maskz_loadu_ps(65535, datPtr1+5248+15248736*i6+47952*k5+432*h1+4*w1);
__m512 dat9 = _mm512_maskz_loadu_ps(65535, datPtr1+5312+15248736*i6+47952*k5+432*h1+4*w1);
_mm512_mask_storeu_ps(arranged1+192+1953792*i6+81408*j2+256*k5, 2047, _mm512_permutex2var_ps(dat7, pm1, dat8));
_mm512_mask_storeu_ps(arranged1+236+1953792*i6+81408*j2+256*k5, 31, _mm512_permutexvar_ps(pm2, dat9));
_mm512_mask_storeu_ps(arranged1+7815360+1953792*i6+81408*j2+256*k5, 2047, _mm512_permutex2var_ps(dat7, pm2, dat8));
_mm512_mask_storeu_ps(arranged1+7815404+1953792*i6+81408*j2+256*k5, 31, _mm512_permutexvar_ps(pm3, dat9));
_mm512_mask_storeu_ps(arranged1+15630528+1953792*i6+81408*j2+256*k5, 1023, _mm512_permutex2var_ps(dat7, pm3, dat8));
_mm512_mask_storeu_ps(arranged1+15630568+1953792*i6+81408*j2+256*k5, 63, _mm512_permutexvar_ps(pm1, dat9));
_mm512_mask_storeu_ps(arranged1+23445504+1953792*i6+81408*j2+256*k5, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31260672+1953792*i6+81408*j2+256*k5, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+39075840+1953792*i6+81408*j2+256*k5, 65535, _mm512_setzero_ps());
__m512 dat10 = _mm512_maskz_loadu_ps(65534, datPtr1+2592+15248736*i6+47952*k5+432*h1+4*w1);
__m512 dat11 = _mm512_maskz_loadu_ps(65535, datPtr1+2656+15248736*i6+47952*k5+432*h1+4*w1);
__m512 dat12 = _mm512_maskz_loadu_ps(65535, datPtr1+2720+15248736*i6+47952*k5+432*h1+4*w1);
_mm512_mask_storeu_ps(arranged1+23445568+1953792*i6+81408*j2+256*k5, 2047, _mm512_permutex2var_ps(dat10, pm1, dat11));
_mm512_mask_storeu_ps(arranged1+23445612+1953792*i6+81408*j2+256*k5, 31, _mm512_permutexvar_ps(pm2, dat12));
_mm512_mask_storeu_ps(arranged1+31260736+1953792*i6+81408*j2+256*k5, 2047, _mm512_permutex2var_ps(dat10, pm2, dat11));
_mm512_mask_storeu_ps(arranged1+31260780+1953792*i6+81408*j2+256*k5, 31, _mm512_permutexvar_ps(pm3, dat12));
_mm512_mask_storeu_ps(arranged1+39075904+1953792*i6+81408*j2+256*k5, 1023, _mm512_permutex2var_ps(dat10, pm3, dat11));
_mm512_mask_storeu_ps(arranged1+39075944+1953792*i6+81408*j2+256*k5, 63, _mm512_permutexvar_ps(pm1, dat12));
__m512 dat13 = _mm512_maskz_loadu_ps(65534, datPtr1+4320+15248736*i6+47952*k5+432*h1+4*w1);
__m512 dat14 = _mm512_maskz_loadu_ps(65535, datPtr1+4384+15248736*i6+47952*k5+432*h1+4*w1);
__m512 dat15 = _mm512_maskz_loadu_ps(65535, datPtr1+4448+15248736*i6+47952*k5+432*h1+4*w1);
_mm512_mask_storeu_ps(arranged1+23445632+1953792*i6+81408*j2+256*k5, 2047, _mm512_permutex2var_ps(dat13, pm1, dat14));
_mm512_mask_storeu_ps(arranged1+23445676+1953792*i6+81408*j2+256*k5, 31, _mm512_permutexvar_ps(pm2, dat15));
_mm512_mask_storeu_ps(arranged1+31260800+1953792*i6+81408*j2+256*k5, 2047, _mm512_permutex2var_ps(dat13, pm2, dat14));
_mm512_mask_storeu_ps(arranged1+31260844+1953792*i6+81408*j2+256*k5, 31, _mm512_permutexvar_ps(pm3, dat15));
_mm512_mask_storeu_ps(arranged1+39075968+1953792*i6+81408*j2+256*k5, 1023, _mm512_permutex2var_ps(dat13, pm3, dat14));
_mm512_mask_storeu_ps(arranged1+39076008+1953792*i6+81408*j2+256*k5, 63, _mm512_permutexvar_ps(pm1, dat15));
__m512 dat16 = _mm512_maskz_loadu_ps(65534, datPtr1+6048+15248736*i6+47952*k5+432*h1+4*w1);
__m512 dat17 = _mm512_maskz_loadu_ps(65535, datPtr1+6112+15248736*i6+47952*k5+432*h1+4*w1);
__m512 dat18 = _mm512_maskz_loadu_ps(65535, datPtr1+6176+15248736*i6+47952*k5+432*h1+4*w1);
_mm512_mask_storeu_ps(arranged1+23445696+1953792*i6+81408*j2+256*k5, 2047, _mm512_permutex2var_ps(dat16, pm1, dat17));
_mm512_mask_storeu_ps(arranged1+23445740+1953792*i6+81408*j2+256*k5, 31, _mm512_permutexvar_ps(pm2, dat18));
_mm512_mask_storeu_ps(arranged1+31260864+1953792*i6+81408*j2+256*k5, 2047, _mm512_permutex2var_ps(dat16, pm2, dat17));
_mm512_mask_storeu_ps(arranged1+31260908+1953792*i6+81408*j2+256*k5, 31, _mm512_permutexvar_ps(pm3, dat18));
_mm512_mask_storeu_ps(arranged1+39076032+1953792*i6+81408*j2+256*k5, 1023, _mm512_permutex2var_ps(dat16, pm3, dat17));
_mm512_mask_storeu_ps(arranged1+39076072+1953792*i6+81408*j2+256*k5, 63, _mm512_permutexvar_ps(pm1, dat18));
}
if (j2 >= last1) return;
++j2;
rel1 = 1;
}
if (rel1 < 2) {
ptrdiff_t w2 = 48;
ptrdiff_t k6 = 22*s1;
ptrdiff_t kk2 = k6+(s1 < 13 ? 21 : 31);
for (; k6 <= kk2; ++k6) {
_mm512_mask_storeu_ps(arranged1+0+1953792*i6+81408*j2+256*k6, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+7815168+1953792*i6+81408*j2+256*k6, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15630336+1953792*i6+81408*j2+256*k6, 65535, _mm512_setzero_ps());
__m512 dat19 = _mm512_maskz_loadu_ps(65535, datPtr1+1728+15248736*i6+47952*k6+432*h1+4*w2);
__m512 dat20 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+15248736*i6+47952*k6+432*h1+4*w2);
__m512 dat21 = _mm512_maskz_loadu_ps(65535, datPtr1+1856+15248736*i6+47952*k6+432*h1+4*w2);
__m512i pm4 = _mm512_set_epi32(31, 31, 31, 31, 31, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
_mm512_mask_storeu_ps(arranged1+64+1953792*i6+81408*j2+256*k6, 2047, _mm512_permutex2var_ps(dat19, pm4, dat20));
__m512i pm5 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1);
_mm512_mask_storeu_ps(arranged1+108+1953792*i6+81408*j2+256*k6, 31, _mm512_permutexvar_ps(pm5, dat21));
_mm512_mask_storeu_ps(arranged1+7815232+1953792*i6+81408*j2+256*k6, 2047, _mm512_permutex2var_ps(dat19, pm5, dat20));
__m512i pm6 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2);
_mm512_mask_storeu_ps(arranged1+7815276+1953792*i6+81408*j2+256*k6, 31, _mm512_permutexvar_ps(pm6, dat21));
_mm512_mask_storeu_ps(arranged1+15630400+1953792*i6+81408*j2+256*k6, 1023, _mm512_permutex2var_ps(dat19, pm6, dat20));
_mm512_mask_storeu_ps(arranged1+15630440+1953792*i6+81408*j2+256*k6, 63, _mm512_permutexvar_ps(pm4, dat21));
__m512 dat22 = _mm512_maskz_loadu_ps(65535, datPtr1+3456+15248736*i6+47952*k6+432*h1+4*w2);
__m512 dat23 = _mm512_maskz_loadu_ps(65535, datPtr1+3520+15248736*i6+47952*k6+432*h1+4*w2);
__m512 dat24 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+15248736*i6+47952*k6+432*h1+4*w2);
_mm512_mask_storeu_ps(arranged1+128+1953792*i6+81408*j2+256*k6, 2047, _mm512_permutex2var_ps(dat22, pm4, dat23));
_mm512_mask_storeu_ps(arranged1+172+1953792*i6+81408*j2+256*k6, 31, _mm512_permutexvar_ps(pm5, dat24));
_mm512_mask_storeu_ps(arranged1+7815296+1953792*i6+81408*j2+256*k6, 2047, _mm512_permutex2var_ps(dat22, pm5, dat23));
_mm512_mask_storeu_ps(arranged1+7815340+1953792*i6+81408*j2+256*k6, 31, _mm512_permutexvar_ps(pm6, dat24));
_mm512_mask_storeu_ps(arranged1+15630464+1953792*i6+81408*j2+256*k6, 1023, _mm512_permutex2var_ps(dat22, pm6, dat23));
_mm512_mask_storeu_ps(arranged1+15630504+1953792*i6+81408*j2+256*k6, 63, _mm512_permutexvar_ps(pm4, dat24));
__m512 dat25 = _mm512_maskz_loadu_ps(65535, datPtr1+5184+15248736*i6+47952*k6+432*h1+4*w2);
__m512 dat26 = _mm512_maskz_loadu_ps(65535, datPtr1+5248+15248736*i6+47952*k6+432*h1+4*w2);
__m512 dat27 = _mm512_maskz_loadu_ps(65535, datPtr1+5312+15248736*i6+47952*k6+432*h1+4*w2);
_mm512_mask_storeu_ps(arranged1+192+1953792*i6+81408*j2+256*k6, 2047, _mm512_permutex2var_ps(dat25, pm4, dat26));
_mm512_mask_storeu_ps(arranged1+236+1953792*i6+81408*j2+256*k6, 31, _mm512_permutexvar_ps(pm5, dat27));
_mm512_mask_storeu_ps(arranged1+7815360+1953792*i6+81408*j2+256*k6, 2047, _mm512_permutex2var_ps(dat25, pm5, dat26));
_mm512_mask_storeu_ps(arranged1+7815404+1953792*i6+81408*j2+256*k6, 31, _mm512_permutexvar_ps(pm6, dat27));
_mm512_mask_storeu_ps(arranged1+15630528+1953792*i6+81408*j2+256*k6, 1023, _mm512_permutex2var_ps(dat25, pm6, dat26));
_mm512_mask_storeu_ps(arranged1+15630568+1953792*i6+81408*j2+256*k6, 63, _mm512_permutexvar_ps(pm4, dat27));
_mm512_mask_storeu_ps(arranged1+23445504+1953792*i6+81408*j2+256*k6, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31260672+1953792*i6+81408*j2+256*k6, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+39075840+1953792*i6+81408*j2+256*k6, 65535, _mm512_setzero_ps());
__m512 dat28 = _mm512_maskz_loadu_ps(65535, datPtr1+2592+15248736*i6+47952*k6+432*h1+4*w2);
__m512 dat29 = _mm512_maskz_loadu_ps(65535, datPtr1+2656+15248736*i6+47952*k6+432*h1+4*w2);
__m512 dat30 = _mm512_maskz_loadu_ps(65535, datPtr1+2720+15248736*i6+47952*k6+432*h1+4*w2);
_mm512_mask_storeu_ps(arranged1+23445568+1953792*i6+81408*j2+256*k6, 2047, _mm512_permutex2var_ps(dat28, pm4, dat29));
_mm512_mask_storeu_ps(arranged1+23445612+1953792*i6+81408*j2+256*k6, 31, _mm512_permutexvar_ps(pm5, dat30));
_mm512_mask_storeu_ps(arranged1+31260736+1953792*i6+81408*j2+256*k6, 2047, _mm512_permutex2var_ps(dat28, pm5, dat29));
_mm512_mask_storeu_ps(arranged1+31260780+1953792*i6+81408*j2+256*k6, 31, _mm512_permutexvar_ps(pm6, dat30));
_mm512_mask_storeu_ps(arranged1+39075904+1953792*i6+81408*j2+256*k6, 1023, _mm512_permutex2var_ps(dat28, pm6, dat29));
_mm512_mask_storeu_ps(arranged1+39075944+1953792*i6+81408*j2+256*k6, 63, _mm512_permutexvar_ps(pm4, dat30));
__m512 dat31 = _mm512_maskz_loadu_ps(65535, datPtr1+4320+15248736*i6+47952*k6+432*h1+4*w2);
__m512 dat32 = _mm512_maskz_loadu_ps(65535, datPtr1+4384+15248736*i6+47952*k6+432*h1+4*w2);
__m512 dat33 = _mm512_maskz_loadu_ps(65535, datPtr1+4448+15248736*i6+47952*k6+432*h1+4*w2);
_mm512_mask_storeu_ps(arranged1+23445632+1953792*i6+81408*j2+256*k6, 2047, _mm512_permutex2var_ps(dat31, pm4, dat32));
_mm512_mask_storeu_ps(arranged1+23445676+1953792*i6+81408*j2+256*k6, 31, _mm512_permutexvar_ps(pm5, dat33));
_mm512_mask_storeu_ps(arranged1+31260800+1953792*i6+81408*j2+256*k6, 2047, _mm512_permutex2var_ps(dat31, pm5, dat32));
_mm512_mask_storeu_ps(arranged1+31260844+1953792*i6+81408*j2+256*k6, 31, _mm512_permutexvar_ps(pm6, dat33));
_mm512_mask_storeu_ps(arranged1+39075968+1953792*i6+81408*j2+256*k6, 1023, _mm512_permutex2var_ps(dat31, pm6, dat32));
_mm512_mask_storeu_ps(arranged1+39076008+1953792*i6+81408*j2+256*k6, 63, _mm512_permutexvar_ps(pm4, dat33));
__m512 dat34 = _mm512_maskz_loadu_ps(65535, datPtr1+6048+15248736*i6+47952*k6+432*h1+4*w2);
__m512 dat35 = _mm512_maskz_loadu_ps(65535, datPtr1+6112+15248736*i6+47952*k6+432*h1+4*w2);
__m512 dat36 = _mm512_maskz_loadu_ps(65535, datPtr1+6176+15248736*i6+47952*k6+432*h1+4*w2);
_mm512_mask_storeu_ps(arranged1+23445696+1953792*i6+81408*j2+256*k6, 2047, _mm512_permutex2var_ps(dat34, pm4, dat35));
_mm512_mask_storeu_ps(arranged1+23445740+1953792*i6+81408*j2+256*k6, 31, _mm512_permutexvar_ps(pm5, dat36));
_mm512_mask_storeu_ps(arranged1+31260864+1953792*i6+81408*j2+256*k6, 2047, _mm512_permutex2var_ps(dat34, pm5, dat35));
_mm512_mask_storeu_ps(arranged1+31260908+1953792*i6+81408*j2+256*k6, 31, _mm512_permutexvar_ps(pm6, dat36));
_mm512_mask_storeu_ps(arranged1+39076032+1953792*i6+81408*j2+256*k6, 1023, _mm512_permutex2var_ps(dat34, pm6, dat35));
_mm512_mask_storeu_ps(arranged1+39076072+1953792*i6+81408*j2+256*k6, 63, _mm512_permutexvar_ps(pm4, dat36));
}
if (j2 >= last1) return;
++j2;
rel1 = 2;
}
ptrdiff_t w3 = 96;
ptrdiff_t k7 = 22*s1;
ptrdiff_t kk3 = k7+(s1 < 13 ? 21 : 31);
for (; k7 <= kk3; ++k7) {
_mm512_mask_storeu_ps(arranged1+0+1953792*i6+81408*j2+256*k7, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+7815168+1953792*i6+81408*j2+256*k7, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15630336+1953792*i6+81408*j2+256*k7, 65535, _mm512_setzero_ps());
__m512 dat37 = _mm512_maskz_loadu_ps(8191, datPtr1+1728+15248736*i6+47952*k7+432*h1+4*w3);
__m512i pm7 = _mm512_set_epi32(31, 31, 31, 31, 31, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
_mm512_mask_storeu_ps(arranged1+64+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat37, pm7, _mm512_setzero_ps()));
__m512i pm8 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1);
_mm512_mask_storeu_ps(arranged1+7815232+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat37, pm8, _mm512_setzero_ps()));
__m512i pm9 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2);
_mm512_mask_storeu_ps(arranged1+15630400+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat37, pm9, _mm512_setzero_ps()));
__m512 dat38 = _mm512_maskz_loadu_ps(8191, datPtr1+3456+15248736*i6+47952*k7+432*h1+4*w3);
_mm512_mask_storeu_ps(arranged1+128+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat38, pm7, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+7815296+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat38, pm8, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+15630464+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat38, pm9, _mm512_setzero_ps()));
__m512 dat39 = _mm512_maskz_loadu_ps(8191, datPtr1+5184+15248736*i6+47952*k7+432*h1+4*w3);
_mm512_mask_storeu_ps(arranged1+192+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat39, pm7, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+7815360+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat39, pm8, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+15630528+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat39, pm9, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+23445504+1953792*i6+81408*j2+256*k7, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31260672+1953792*i6+81408*j2+256*k7, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+39075840+1953792*i6+81408*j2+256*k7, 65535, _mm512_setzero_ps());
__m512 dat40 = _mm512_maskz_loadu_ps(8191, datPtr1+2592+15248736*i6+47952*k7+432*h1+4*w3);
_mm512_mask_storeu_ps(arranged1+23445568+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat40, pm7, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+31260736+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat40, pm8, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+39075904+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat40, pm9, _mm512_setzero_ps()));
__m512 dat41 = _mm512_maskz_loadu_ps(8191, datPtr1+4320+15248736*i6+47952*k7+432*h1+4*w3);
_mm512_mask_storeu_ps(arranged1+23445632+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat41, pm7, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+31260800+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat41, pm8, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+39075968+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat41, pm9, _mm512_setzero_ps()));
__m512 dat42 = _mm512_maskz_loadu_ps(8191, datPtr1+6048+15248736*i6+47952*k7+432*h1+4*w3);
_mm512_mask_storeu_ps(arranged1+23445696+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat42, pm7, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+31260864+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat42, pm8, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+39076032+1953792*i6+81408*j2+256*k7, 65535, _mm512_permutex2var_ps(dat42, pm9, _mm512_setzero_ps()));
}
if (j2 >= last1) return;
++j2;
j2 = 3;
}
if (j2 < 21) {
ptrdiff_t rel2 = (size_t)(j2-3)%3;
ptrdiff_t h2 = 16+(size_t)(j2-3)/3*16;
for (; j2 < 21; rel2 = 0, h2 += 16) {
if (rel2 < 1) {
ptrdiff_t w4 = 0;
ptrdiff_t k8 = 22*s1;
ptrdiff_t kk4 = k8+(s1 < 13 ? 21 : 31);
for (; k8 <= kk4; ++k8) {
__m512 dat43 = _mm512_maskz_loadu_ps(65534, datPtr1+0+15248736*i6+47952*k8+432*h2+4*w4);
__m512 dat44 = _mm512_maskz_loadu_ps(65535, datPtr1+64+15248736*i6+47952*k8+432*h2+4*w4);
__m512 dat45 = _mm512_maskz_loadu_ps(65535, datPtr1+128+15248736*i6+47952*k8+432*h2+4*w4);
__m512i pm10 = _mm512_set_epi32(31, 31, 31, 31, 31, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
_mm512_mask_storeu_ps(arranged1+0+1953792*i6+81408*j2+256*k8, 2047, _mm512_permutex2var_ps(dat43, pm10, dat44));
__m512i pm11 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1);
_mm512_mask_storeu_ps(arranged1+44+1953792*i6+81408*j2+256*k8, 31, _mm512_permutexvar_ps(pm11, dat45));
_mm512_mask_storeu_ps(arranged1+7815168+1953792*i6+81408*j2+256*k8, 2047, _mm512_permutex2var_ps(dat43, pm11, dat44));
__m512i pm12 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2);
_mm512_mask_storeu_ps(arranged1+7815212+1953792*i6+81408*j2+256*k8, 31, _mm512_permutexvar_ps(pm12, dat45));
_mm512_mask_storeu_ps(arranged1+15630336+1953792*i6+81408*j2+256*k8, 1023, _mm512_permutex2var_ps(dat43, pm12, dat44));
_mm512_mask_storeu_ps(arranged1+15630376+1953792*i6+81408*j2+256*k8, 63, _mm512_permutexvar_ps(pm10, dat45));
__m512 dat46 = _mm512_maskz_loadu_ps(65534, datPtr1+1728+15248736*i6+47952*k8+432*h2+4*w4);
__m512 dat47 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+15248736*i6+47952*k8+432*h2+4*w4);
__m512 dat48 = _mm512_maskz_loadu_ps(65535, datPtr1+1856+15248736*i6+47952*k8+432*h2+4*w4);
_mm512_mask_storeu_ps(arranged1+64+1953792*i6+81408*j2+256*k8, 2047, _mm512_permutex2var_ps(dat46, pm10, dat47));
_mm512_mask_storeu_ps(arranged1+108+1953792*i6+81408*j2+256*k8, 31, _mm512_permutexvar_ps(pm11, dat48));
_mm512_mask_storeu_ps(arranged1+7815232+1953792*i6+81408*j2+256*k8, 2047, _mm512_permutex2var_ps(dat46, pm11, dat47));
_mm512_mask_storeu_ps(arranged1+7815276+1953792*i6+81408*j2+256*k8, 31, _mm512_permutexvar_ps(pm12, dat48));
_mm512_mask_storeu_ps(arranged1+15630400+1953792*i6+81408*j2+256*k8, 1023, _mm512_permutex2var_ps(dat46, pm12, dat47));
_mm512_mask_storeu_ps(arranged1+15630440+1953792*i6+81408*j2+256*k8, 63, _mm512_permutexvar_ps(pm10, dat48));
__m512 dat49 = _mm512_maskz_loadu_ps(65534, datPtr1+3456+15248736*i6+47952*k8+432*h2+4*w4);
__m512 dat50 = _mm512_maskz_loadu_ps(65535, datPtr1+3520+15248736*i6+47952*k8+432*h2+4*w4);
__m512 dat51 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+15248736*i6+47952*k8+432*h2+4*w4);
_mm512_mask_storeu_ps(arranged1+128+1953792*i6+81408*j2+256*k8, 2047, _mm512_permutex2var_ps(dat49, pm10, dat50));
_mm512_mask_storeu_ps(arranged1+172+1953792*i6+81408*j2+256*k8, 31, _mm512_permutexvar_ps(pm11, dat51));
_mm512_mask_storeu_ps(arranged1+7815296+1953792*i6+81408*j2+256*k8, 2047, _mm512_permutex2var_ps(dat49, pm11, dat50));
_mm512_mask_storeu_ps(arranged1+7815340+1953792*i6+81408*j2+256*k8, 31, _mm512_permutexvar_ps(pm12, dat51));
_mm512_mask_storeu_ps(arranged1+15630464+1953792*i6+81408*j2+256*k8, 1023, _mm512_permutex2var_ps(dat49, pm12, dat50));
_mm512_mask_storeu_ps(arranged1+15630504+1953792*i6+81408*j2+256*k8, 63, _mm512_permutexvar_ps(pm10, dat51));
__m512 dat52 = _mm512_maskz_loadu_ps(65534, datPtr1+5184+15248736*i6+47952*k8+432*h2+4*w4);
__m512 dat53 = _mm512_maskz_loadu_ps(65535, datPtr1+5248+15248736*i6+47952*k8+432*h2+4*w4);
__m512 dat54 = _mm512_maskz_loadu_ps(65535, datPtr1+5312+15248736*i6+47952*k8+432*h2+4*w4);
_mm512_mask_storeu_ps(arranged1+192+1953792*i6+81408*j2+256*k8, 2047, _mm512_permutex2var_ps(dat52, pm10, dat53));
_mm512_mask_storeu_ps(arranged1+236+1953792*i6+81408*j2+256*k8, 31, _mm512_permutexvar_ps(pm11, dat54));
_mm512_mask_storeu_ps(arranged1+7815360+1953792*i6+81408*j2+256*k8, 2047, _mm512_permutex2var_ps(dat52, pm11, dat53));
_mm512_mask_storeu_ps(arranged1+7815404+1953792*i6+81408*j2+256*k8, 31, _mm512_permutexvar_ps(pm12, dat54));
_mm512_mask_storeu_ps(arranged1+15630528+1953792*i6+81408*j2+256*k8, 1023, _mm512_permutex2var_ps(dat52, pm12, dat53));
_mm512_mask_storeu_ps(arranged1+15630568+1953792*i6+81408*j2+256*k8, 63, _mm512_permutexvar_ps(pm10, dat54));
__m512 dat55 = _mm512_maskz_loadu_ps(65534, datPtr1+864+15248736*i6+47952*k8+432*h2+4*w4);
__m512 dat56 = _mm512_maskz_loadu_ps(65535, datPtr1+928+15248736*i6+47952*k8+432*h2+4*w4);
__m512 dat57 = _mm512_maskz_loadu_ps(65535, datPtr1+992+15248736*i6+47952*k8+432*h2+4*w4);
_mm512_mask_storeu_ps(arranged1+23445504+1953792*i6+81408*j2+256*k8, 2047, _mm512_permutex2var_ps(dat55, pm10, dat56));
_mm512_mask_storeu_ps(arranged1+23445548+1953792*i6+81408*j2+256*k8, 31, _mm512_permutexvar_ps(pm11, dat57));
_mm512_mask_storeu_ps(arranged1+31260672+1953792*i6+81408*j2+256*k8, 2047, _mm512_permutex2var_ps(dat55, pm11, dat56));
_mm512_mask_storeu_ps(arranged1+31260716+1953792*i6+81408*j2+256*k8, 31, _mm512_permutexvar_ps(pm12, dat57));
_mm512_mask_storeu_ps(arranged1+39075840+1953792*i6+81408*j2+256*k8, 1023, _mm512_permutex2var_ps(dat55, pm12, dat56));
_mm512_mask_storeu_ps(arranged1+39075880+1953792*i6+81408*j2+256*k8, 63, _mm512_permutexvar_ps(pm10, dat57));
__m512 dat58 = _mm512_maskz_loadu_ps(65534, datPtr1+2592+15248736*i6+47952*k8+432*h2+4*w4);
__m512 dat59 = _mm512_maskz_loadu_ps(65535, datPtr1+2656+15248736*i6+47952*k8+432*h2+4*w4);
__m512 dat60 = _mm512_maskz_loadu_ps(65535, datPtr1+2720+15248736*i6+47952*k8+432*h2+4*w4);
_mm512_mask_storeu_ps(arranged1+23445568+1953792*i6+81408*j2+256*k8, 2047, _mm512_permutex2var_ps(dat58, pm10, dat59));
_mm512_mask_storeu_ps(arranged1+23445612+1953792*i6+81408*j2+256*k8, 31, _mm512_permutexvar_ps(pm11, dat60));
_mm512_mask_storeu_ps(arranged1+31260736+1953792*i6+81408*j2+256*k8, 2047, _mm512_permutex2var_ps(dat58, pm11, dat59));
_mm512_mask_storeu_ps(arranged1+31260780+1953792*i6+81408*j2+256*k8, 31, _mm512_permutexvar_ps(pm12, dat60));
_mm512_mask_storeu_ps(arranged1+39075904+1953792*i6+81408*j2+256*k8, 1023, _mm512_permutex2var_ps(dat58, pm12, dat59));
_mm512_mask_storeu_ps(arranged1+39075944+1953792*i6+81408*j2+256*k8, 63, _mm512_permutexvar_ps(pm10, dat60));
__m512 dat61 = _mm512_maskz_loadu_ps(65534, datPtr1+4320+15248736*i6+47952*k8+432*h2+4*w4);
__m512 dat62 = _mm512_maskz_loadu_ps(65535, datPtr1+4384+15248736*i6+47952*k8+432*h2+4*w4);
__m512 dat63 = _mm512_maskz_loadu_ps(65535, datPtr1+4448+15248736*i6+47952*k8+432*h2+4*w4);
_mm512_mask_storeu_ps(arranged1+23445632+1953792*i6+81408*j2+256*k8, 2047, _mm512_permutex2var_ps(dat61, pm10, dat62));
_mm512_mask_storeu_ps(arranged1+23445676+1953792*i6+81408*j2+256*k8, 31, _mm512_permutexvar_ps(pm11, dat63));
_mm512_mask_storeu_ps(arranged1+31260800+1953792*i6+81408*j2+256*k8, 2047, _mm512_permutex2var_ps(dat61, pm11, dat62));
_mm512_mask_storeu_ps(arranged1+31260844+1953792*i6+81408*j2+256*k8, 31, _mm512_permutexvar_ps(pm12, dat63));
_mm512_mask_storeu_ps(arranged1+39075968+1953792*i6+81408*j2+256*k8, 1023, _mm512_permutex2var_ps(dat61, pm12, dat62));
_mm512_mask_storeu_ps(arranged1+39076008+1953792*i6+81408*j2+256*k8, 63, _mm512_permutexvar_ps(pm10, dat63));
__m512 dat64 = _mm512_maskz_loadu_ps(65534, datPtr1+6048+15248736*i6+47952*k8+432*h2+4*w4);
__m512 dat65 = _mm512_maskz_loadu_ps(65535, datPtr1+6112+15248736*i6+47952*k8+432*h2+4*w4);
__m512 dat66 = _mm512_maskz_loadu_ps(65535, datPtr1+6176+15248736*i6+47952*k8+432*h2+4*w4);
_mm512_mask_storeu_ps(arranged1+23445696+1953792*i6+81408*j2+256*k8, 2047, _mm512_permutex2var_ps(dat64, pm10, dat65));
_mm512_mask_storeu_ps(arranged1+23445740+1953792*i6+81408*j2+256*k8, 31, _mm512_permutexvar_ps(pm11, dat66));
_mm512_mask_storeu_ps(arranged1+31260864+1953792*i6+81408*j2+256*k8, 2047, _mm512_permutex2var_ps(dat64, pm11, dat65));
_mm512_mask_storeu_ps(arranged1+31260908+1953792*i6+81408*j2+256*k8, 31, _mm512_permutexvar_ps(pm12, dat66));
_mm512_mask_storeu_ps(arranged1+39076032+1953792*i6+81408*j2+256*k8, 1023, _mm512_permutex2var_ps(dat64, pm12, dat65));
_mm512_mask_storeu_ps(arranged1+39076072+1953792*i6+81408*j2+256*k8, 63, _mm512_permutexvar_ps(pm10, dat66));
}
if (j2 >= last1) return;
++j2;
rel2 = 1;
}
if (rel2 < 2) {
ptrdiff_t w5 = 48;
ptrdiff_t k9 = 22*s1;
ptrdiff_t kk5 = k9+(s1 < 13 ? 21 : 31);
for (; k9 <= kk5; ++k9) {
__m512 dat67 = _mm512_maskz_loadu_ps(65535, datPtr1+0+15248736*i6+47952*k9+432*h2+4*w5);
__m512 dat68 = _mm512_maskz_loadu_ps(65535, datPtr1+64+15248736*i6+47952*k9+432*h2+4*w5);
__m512 dat69 = _mm512_maskz_loadu_ps(65535, datPtr1+128+15248736*i6+47952*k9+432*h2+4*w5);
__m512i pm13 = _mm512_set_epi32(31, 31, 31, 31, 31, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
_mm512_mask_storeu_ps(arranged1+0+1953792*i6+81408*j2+256*k9, 2047, _mm512_permutex2var_ps(dat67, pm13, dat68));
__m512i pm14 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1);
_mm512_mask_storeu_ps(arranged1+44+1953792*i6+81408*j2+256*k9, 31, _mm512_permutexvar_ps(pm14, dat69));
_mm512_mask_storeu_ps(arranged1+7815168+1953792*i6+81408*j2+256*k9, 2047, _mm512_permutex2var_ps(dat67, pm14, dat68));
__m512i pm15 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2);
_mm512_mask_storeu_ps(arranged1+7815212+1953792*i6+81408*j2+256*k9, 31, _mm512_permutexvar_ps(pm15, dat69));
_mm512_mask_storeu_ps(arranged1+15630336+1953792*i6+81408*j2+256*k9, 1023, _mm512_permutex2var_ps(dat67, pm15, dat68));
_mm512_mask_storeu_ps(arranged1+15630376+1953792*i6+81408*j2+256*k9, 63, _mm512_permutexvar_ps(pm13, dat69));
__m512 dat70 = _mm512_maskz_loadu_ps(65535, datPtr1+1728+15248736*i6+47952*k9+432*h2+4*w5);
__m512 dat71 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+15248736*i6+47952*k9+432*h2+4*w5);
__m512 dat72 = _mm512_maskz_loadu_ps(65535, datPtr1+1856+15248736*i6+47952*k9+432*h2+4*w5);
_mm512_mask_storeu_ps(arranged1+64+1953792*i6+81408*j2+256*k9, 2047, _mm512_permutex2var_ps(dat70, pm13, dat71));
_mm512_mask_storeu_ps(arranged1+108+1953792*i6+81408*j2+256*k9, 31, _mm512_permutexvar_ps(pm14, dat72));
_mm512_mask_storeu_ps(arranged1+7815232+1953792*i6+81408*j2+256*k9, 2047, _mm512_permutex2var_ps(dat70, pm14, dat71));
_mm512_mask_storeu_ps(arranged1+7815276+1953792*i6+81408*j2+256*k9, 31, _mm512_permutexvar_ps(pm15, dat72));
_mm512_mask_storeu_ps(arranged1+15630400+1953792*i6+81408*j2+256*k9, 1023, _mm512_permutex2var_ps(dat70, pm15, dat71));
_mm512_mask_storeu_ps(arranged1+15630440+1953792*i6+81408*j2+256*k9, 63, _mm512_permutexvar_ps(pm13, dat72));
__m512 dat73 = _mm512_maskz_loadu_ps(65535, datPtr1+3456+15248736*i6+47952*k9+432*h2+4*w5);
__m512 dat74 = _mm512_maskz_loadu_ps(65535, datPtr1+3520+15248736*i6+47952*k9+432*h2+4*w5);
__m512 dat75 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+15248736*i6+47952*k9+432*h2+4*w5);
_mm512_mask_storeu_ps(arranged1+128+1953792*i6+81408*j2+256*k9, 2047, _mm512_permutex2var_ps(dat73, pm13, dat74));
_mm512_mask_storeu_ps(arranged1+172+1953792*i6+81408*j2+256*k9, 31, _mm512_permutexvar_ps(pm14, dat75));
_mm512_mask_storeu_ps(arranged1+7815296+1953792*i6+81408*j2+256*k9, 2047, _mm512_permutex2var_ps(dat73, pm14, dat74));
_mm512_mask_storeu_ps(arranged1+7815340+1953792*i6+81408*j2+256*k9, 31, _mm512_permutexvar_ps(pm15, dat75));
_mm512_mask_storeu_ps(arranged1+15630464+1953792*i6+81408*j2+256*k9, 1023, _mm512_permutex2var_ps(dat73, pm15, dat74));
_mm512_mask_storeu_ps(arranged1+15630504+1953792*i6+81408*j2+256*k9, 63, _mm512_permutexvar_ps(pm13, dat75));
__m512 dat76 = _mm512_maskz_loadu_ps(65535, datPtr1+5184+15248736*i6+47952*k9+432*h2+4*w5);
__m512 dat77 = _mm512_maskz_loadu_ps(65535, datPtr1+5248+15248736*i6+47952*k9+432*h2+4*w5);
__m512 dat78 = _mm512_maskz_loadu_ps(65535, datPtr1+5312+15248736*i6+47952*k9+432*h2+4*w5);
_mm512_mask_storeu_ps(arranged1+192+1953792*i6+81408*j2+256*k9, 2047, _mm512_permutex2var_ps(dat76, pm13, dat77));
_mm512_mask_storeu_ps(arranged1+236+1953792*i6+81408*j2+256*k9, 31, _mm512_permutexvar_ps(pm14, dat78));
_mm512_mask_storeu_ps(arranged1+7815360+1953792*i6+81408*j2+256*k9, 2047, _mm512_permutex2var_ps(dat76, pm14, dat77));
_mm512_mask_storeu_ps(arranged1+7815404+1953792*i6+81408*j2+256*k9, 31, _mm512_permutexvar_ps(pm15, dat78));
_mm512_mask_storeu_ps(arranged1+15630528+1953792*i6+81408*j2+256*k9, 1023, _mm512_permutex2var_ps(dat76, pm15, dat77));
_mm512_mask_storeu_ps(arranged1+15630568+1953792*i6+81408*j2+256*k9, 63, _mm512_permutexvar_ps(pm13, dat78));
__m512 dat79 = _mm512_maskz_loadu_ps(65535, datPtr1+864+15248736*i6+47952*k9+432*h2+4*w5);
__m512 dat80 = _mm512_maskz_loadu_ps(65535, datPtr1+928+15248736*i6+47952*k9+432*h2+4*w5);
__m512 dat81 = _mm512_maskz_loadu_ps(65535, datPtr1+992+15248736*i6+47952*k9+432*h2+4*w5);
_mm512_mask_storeu_ps(arranged1+23445504+1953792*i6+81408*j2+256*k9, 2047, _mm512_permutex2var_ps(dat79, pm13, dat80));
_mm512_mask_storeu_ps(arranged1+23445548+1953792*i6+81408*j2+256*k9, 31, _mm512_permutexvar_ps(pm14, dat81));
_mm512_mask_storeu_ps(arranged1+31260672+1953792*i6+81408*j2+256*k9, 2047, _mm512_permutex2var_ps(dat79, pm14, dat80));
_mm512_mask_storeu_ps(arranged1+31260716+1953792*i6+81408*j2+256*k9, 31, _mm512_permutexvar_ps(pm15, dat81));
_mm512_mask_storeu_ps(arranged1+39075840+1953792*i6+81408*j2+256*k9, 1023, _mm512_permutex2var_ps(dat79, pm15, dat80));
_mm512_mask_storeu_ps(arranged1+39075880+1953792*i6+81408*j2+256*k9, 63, _mm512_permutexvar_ps(pm13, dat81));
__m512 dat82 = _mm512_maskz_loadu_ps(65535, datPtr1+2592+15248736*i6+47952*k9+432*h2+4*w5);
__m512 dat83 = _mm512_maskz_loadu_ps(65535, datPtr1+2656+15248736*i6+47952*k9+432*h2+4*w5);
__m512 dat84 = _mm512_maskz_loadu_ps(65535, datPtr1+2720+15248736*i6+47952*k9+432*h2+4*w5);
_mm512_mask_storeu_ps(arranged1+23445568+1953792*i6+81408*j2+256*k9, 2047, _mm512_permutex2var_ps(dat82, pm13, dat83));
_mm512_mask_storeu_ps(arranged1+23445612+1953792*i6+81408*j2+256*k9, 31, _mm512_permutexvar_ps(pm14, dat84));
_mm512_mask_storeu_ps(arranged1+31260736+1953792*i6+81408*j2+256*k9, 2047, _mm512_permutex2var_ps(dat82, pm14, dat83));
_mm512_mask_storeu_ps(arranged1+31260780+1953792*i6+81408*j2+256*k9, 31, _mm512_permutexvar_ps(pm15, dat84));
_mm512_mask_storeu_ps(arranged1+39075904+1953792*i6+81408*j2+256*k9, 1023, _mm512_permutex2var_ps(dat82, pm15, dat83));
_mm512_mask_storeu_ps(arranged1+39075944+1953792*i6+81408*j2+256*k9, 63, _mm512_permutexvar_ps(pm13, dat84));
__m512 dat85 = _mm512_maskz_loadu_ps(65535, datPtr1+4320+15248736*i6+47952*k9+432*h2+4*w5);
__m512 dat86 = _mm512_maskz_loadu_ps(65535, datPtr1+4384+15248736*i6+47952*k9+432*h2+4*w5);
__m512 dat87 = _mm512_maskz_loadu_ps(65535, datPtr1+4448+15248736*i6+47952*k9+432*h2+4*w5);
_mm512_mask_storeu_ps(arranged1+23445632+1953792*i6+81408*j2+256*k9, 2047, _mm512_permutex2var_ps(dat85, pm13, dat86));
_mm512_mask_storeu_ps(arranged1+23445676+1953792*i6+81408*j2+256*k9, 31, _mm512_permutexvar_ps(pm14, dat87));
_mm512_mask_storeu_ps(arranged1+31260800+1953792*i6+81408*j2+256*k9, 2047, _mm512_permutex2var_ps(dat85, pm14, dat86));
_mm512_mask_storeu_ps(arranged1+31260844+1953792*i6+81408*j2+256*k9, 31, _mm512_permutexvar_ps(pm15, dat87));
_mm512_mask_storeu_ps(arranged1+39075968+1953792*i6+81408*j2+256*k9, 1023, _mm512_permutex2var_ps(dat85, pm15, dat86));
_mm512_mask_storeu_ps(arranged1+39076008+1953792*i6+81408*j2+256*k9, 63, _mm512_permutexvar_ps(pm13, dat87));
__m512 dat88 = _mm512_maskz_loadu_ps(65535, datPtr1+6048+15248736*i6+47952*k9+432*h2+4*w5);
__m512 dat89 = _mm512_maskz_loadu_ps(65535, datPtr1+6112+15248736*i6+47952*k9+432*h2+4*w5);
__m512 dat90 = _mm512_maskz_loadu_ps(65535, datPtr1+6176+15248736*i6+47952*k9+432*h2+4*w5);
_mm512_mask_storeu_ps(arranged1+23445696+1953792*i6+81408*j2+256*k9, 2047, _mm512_permutex2var_ps(dat88, pm13, dat89));
_mm512_mask_storeu_ps(arranged1+23445740+1953792*i6+81408*j2+256*k9, 31, _mm512_permutexvar_ps(pm14, dat90));
_mm512_mask_storeu_ps(arranged1+31260864+1953792*i6+81408*j2+256*k9, 2047, _mm512_permutex2var_ps(dat88, pm14, dat89));
_mm512_mask_storeu_ps(arranged1+31260908+1953792*i6+81408*j2+256*k9, 31, _mm512_permutexvar_ps(pm15, dat90));
_mm512_mask_storeu_ps(arranged1+39076032+1953792*i6+81408*j2+256*k9, 1023, _mm512_permutex2var_ps(dat88, pm15, dat89));
_mm512_mask_storeu_ps(arranged1+39076072+1953792*i6+81408*j2+256*k9, 63, _mm512_permutexvar_ps(pm13, dat90));
}
if (j2 >= last1) return;
++j2;
rel2 = 2;
}
ptrdiff_t w6 = 96;
ptrdiff_t k10 = 22*s1;
ptrdiff_t kk6 = k10+(s1 < 13 ? 21 : 31);
for (; k10 <= kk6; ++k10) {
__m512 dat91 = _mm512_maskz_loadu_ps(8191, datPtr1+0+15248736*i6+47952*k10+432*h2+4*w6);
__m512i pm16 = _mm512_set_epi32(31, 31, 31, 31, 31, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
_mm512_mask_storeu_ps(arranged1+0+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat91, pm16, _mm512_setzero_ps()));
__m512i pm17 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1);
_mm512_mask_storeu_ps(arranged1+7815168+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat91, pm17, _mm512_setzero_ps()));
__m512i pm18 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2);
_mm512_mask_storeu_ps(arranged1+15630336+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat91, pm18, _mm512_setzero_ps()));
__m512 dat92 = _mm512_maskz_loadu_ps(8191, datPtr1+1728+15248736*i6+47952*k10+432*h2+4*w6);
_mm512_mask_storeu_ps(arranged1+64+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat92, pm16, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+7815232+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat92, pm17, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+15630400+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat92, pm18, _mm512_setzero_ps()));
__m512 dat93 = _mm512_maskz_loadu_ps(8191, datPtr1+3456+15248736*i6+47952*k10+432*h2+4*w6);
_mm512_mask_storeu_ps(arranged1+128+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat93, pm16, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+7815296+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat93, pm17, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+15630464+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat93, pm18, _mm512_setzero_ps()));
__m512 dat94 = _mm512_maskz_loadu_ps(8191, datPtr1+5184+15248736*i6+47952*k10+432*h2+4*w6);
_mm512_mask_storeu_ps(arranged1+192+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat94, pm16, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+7815360+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat94, pm17, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+15630528+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat94, pm18, _mm512_setzero_ps()));
__m512 dat95 = _mm512_maskz_loadu_ps(8191, datPtr1+864+15248736*i6+47952*k10+432*h2+4*w6);
_mm512_mask_storeu_ps(arranged1+23445504+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat95, pm16, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+31260672+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat95, pm17, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+39075840+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat95, pm18, _mm512_setzero_ps()));
__m512 dat96 = _mm512_maskz_loadu_ps(8191, datPtr1+2592+15248736*i6+47952*k10+432*h2+4*w6);
_mm512_mask_storeu_ps(arranged1+23445568+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat96, pm16, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+31260736+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat96, pm17, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+39075904+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat96, pm18, _mm512_setzero_ps()));
__m512 dat97 = _mm512_maskz_loadu_ps(8191, datPtr1+4320+15248736*i6+47952*k10+432*h2+4*w6);
_mm512_mask_storeu_ps(arranged1+23445632+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat97, pm16, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+31260800+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat97, pm17, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+39075968+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat97, pm18, _mm512_setzero_ps()));
__m512 dat98 = _mm512_maskz_loadu_ps(8191, datPtr1+6048+15248736*i6+47952*k10+432*h2+4*w6);
_mm512_mask_storeu_ps(arranged1+23445696+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat98, pm16, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+31260864+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat98, pm17, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+39076032+1953792*i6+81408*j2+256*k10, 65535, _mm512_permutex2var_ps(dat98, pm18, _mm512_setzero_ps()));
}
if (j2 >= last1) return;
++j2;
}
j2 = 21;
}
ptrdiff_t rel3 = j2-21;
ptrdiff_t h3 = 112;
if (rel3 < 1) {
ptrdiff_t w7 = 0;
ptrdiff_t k11 = 22*s1;
ptrdiff_t kk7 = k11+(s1 < 13 ? 21 : 31);
for (; k11 <= kk7; ++k11) {
__m512 dat99 = _mm512_maskz_loadu_ps(65534, datPtr1+0+15248736*i6+47952*k11+432*h3+4*w7);
__m512 dat100 = _mm512_maskz_loadu_ps(65535, datPtr1+64+15248736*i6+47952*k11+432*h3+4*w7);
__m512 dat101 = _mm512_maskz_loadu_ps(65535, datPtr1+128+15248736*i6+47952*k11+432*h3+4*w7);
__m512i pm19 = _mm512_set_epi32(31, 31, 31, 31, 31, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
_mm512_mask_storeu_ps(arranged1+0+1953792*i6+81408*j2+256*k11, 2047, _mm512_permutex2var_ps(dat99, pm19, dat100));
__m512i pm20 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1);
_mm512_mask_storeu_ps(arranged1+44+1953792*i6+81408*j2+256*k11, 31, _mm512_permutexvar_ps(pm20, dat101));
_mm512_mask_storeu_ps(arranged1+7815168+1953792*i6+81408*j2+256*k11, 2047, _mm512_permutex2var_ps(dat99, pm20, dat100));
__m512i pm21 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2);
_mm512_mask_storeu_ps(arranged1+7815212+1953792*i6+81408*j2+256*k11, 31, _mm512_permutexvar_ps(pm21, dat101));
_mm512_mask_storeu_ps(arranged1+15630336+1953792*i6+81408*j2+256*k11, 1023, _mm512_permutex2var_ps(dat99, pm21, dat100));
_mm512_mask_storeu_ps(arranged1+15630376+1953792*i6+81408*j2+256*k11, 63, _mm512_permutexvar_ps(pm19, dat101));
_mm512_mask_storeu_ps(arranged1+64+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+7815232+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15630400+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+128+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+7815296+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15630464+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+192+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+7815360+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15630528+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+23445504+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31260672+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+39075840+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+23445568+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31260736+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+39075904+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+23445632+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31260800+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+39075968+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+23445696+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31260864+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+39076032+1953792*i6+81408*j2+256*k11, 65535, _mm512_setzero_ps());
}
if (j2 >= last1) return;
++j2;
rel3 = 1;
}
if (rel3 < 2) {
ptrdiff_t w8 = 48;
ptrdiff_t k12 = 22*s1;
ptrdiff_t kk8 = k12+(s1 < 13 ? 21 : 31);
for (; k12 <= kk8; ++k12) {
__m512 dat102 = _mm512_maskz_loadu_ps(65535, datPtr1+0+15248736*i6+47952*k12+432*h3+4*w8);
__m512 dat103 = _mm512_maskz_loadu_ps(65535, datPtr1+64+15248736*i6+47952*k12+432*h3+4*w8);
__m512 dat104 = _mm512_maskz_loadu_ps(65535, datPtr1+128+15248736*i6+47952*k12+432*h3+4*w8);
__m512i pm22 = _mm512_set_epi32(31, 31, 31, 31, 31, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
_mm512_mask_storeu_ps(arranged1+0+1953792*i6+81408*j2+256*k12, 2047, _mm512_permutex2var_ps(dat102, pm22, dat103));
__m512i pm23 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1);
_mm512_mask_storeu_ps(arranged1+44+1953792*i6+81408*j2+256*k12, 31, _mm512_permutexvar_ps(pm23, dat104));
_mm512_mask_storeu_ps(arranged1+7815168+1953792*i6+81408*j2+256*k12, 2047, _mm512_permutex2var_ps(dat102, pm23, dat103));
__m512i pm24 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2);
_mm512_mask_storeu_ps(arranged1+7815212+1953792*i6+81408*j2+256*k12, 31, _mm512_permutexvar_ps(pm24, dat104));
_mm512_mask_storeu_ps(arranged1+15630336+1953792*i6+81408*j2+256*k12, 1023, _mm512_permutex2var_ps(dat102, pm24, dat103));
_mm512_mask_storeu_ps(arranged1+15630376+1953792*i6+81408*j2+256*k12, 63, _mm512_permutexvar_ps(pm22, dat104));
_mm512_mask_storeu_ps(arranged1+64+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+7815232+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15630400+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+128+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+7815296+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15630464+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+192+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+7815360+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15630528+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+23445504+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31260672+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+39075840+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+23445568+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31260736+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+39075904+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+23445632+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31260800+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+39075968+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+23445696+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31260864+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+39076032+1953792*i6+81408*j2+256*k12, 65535, _mm512_setzero_ps());
}
if (j2 >= last1) return;
++j2;
rel3 = 2;
}
ptrdiff_t w9 = 96;
ptrdiff_t k13 = 22*s1;
ptrdiff_t kk9 = k13+(s1 < 13 ? 21 : 31);
for (; k13 <= kk9; ++k13) {
__m512 dat105 = _mm512_maskz_loadu_ps(8191, datPtr1+0+15248736*i6+47952*k13+432*h3+4*w9);
__m512i pm25 = _mm512_set_epi32(31, 31, 31, 31, 31, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
_mm512_mask_storeu_ps(arranged1+0+1953792*i6+81408*j2+256*k13, 65535, _mm512_permutex2var_ps(dat105, pm25, _mm512_setzero_ps()));
__m512i pm26 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1);
_mm512_mask_storeu_ps(arranged1+7815168+1953792*i6+81408*j2+256*k13, 65535, _mm512_permutex2var_ps(dat105, pm26, _mm512_setzero_ps()));
__m512i pm27 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2);
_mm512_mask_storeu_ps(arranged1+15630336+1953792*i6+81408*j2+256*k13, 65535, _mm512_permutex2var_ps(dat105, pm27, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+64+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+7815232+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15630400+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+128+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+7815296+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15630464+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+192+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+7815360+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15630528+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+23445504+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31260672+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+39075840+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+23445568+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31260736+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+39075904+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+23445632+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31260800+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+39075968+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+23445696+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31260864+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+39076032+1953792*i6+81408*j2+256*k13, 65535, _mm512_setzero_ps());
}
if (j2 >= last1) return;
++j2;
}

// Fills in a threader task that points at Example5LoomArrangeDats1Callee1,
// carries tensors3 as its opaque payload, declares 4 dimensions with hull
// extents 14, 24, 4, 1, and hands it to Example5ThreaderDo1 on team15.
// NOTE(review): presumably the threader invokes the callee across the points
// of that hull -- confirm against Example5ThreaderDo1.
static void Example5LoomArrangeDats1(Example5ThreaderTeam1* team15, char** tensors3) {
	static const int extents[4] = {14, 24, 4, 1};
	Example5ThreaderTask1 job;
	// Only these fields are assigned; any other members are left untouched.
	job.nd1 = 4;
	for (int dim = 0; dim < 4; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	job.any1 = tensors3;
	job.callee1 = Example5LoomArrangeDats1Callee1;
	Example5ThreaderDo1(team15, &job);
}

// Six (first, count) pairs followed by one lone terminal value.
// The "first" values (0, 12, 20, 28, 37, 43) coincide with the row index at
// which each run of rows begins in Example5LoomProduceSums1NodeTbl1, and the
// "count" values (3, 2, 2, 3, 2, 2) coincide with how many distinct
// middle-column values that run uses. The closing 49 equals the number of
// rows in that table.
// NOTE(review): the code that reads this table is not visible here; the
// description above is inferred from the values -- confirm against the reader.
static ptrdiff_t Example5LoomProduceSums1FieldTbl1[] = {
	 0, 3,
	12, 2,
	20, 2,
	28, 3,
	37, 2,
	43, 2,
	49
};

// 49 rows of three values each. Example5LoomProduceSums1Callee1 reads row
// `node` at offsets 0, 1, 2 from 3*node as (lift, pile, base).
// Rows are laid out below in six runs; within a run each source line holds
// all rows sharing one lift value. Only the first three rows have base == 1.
static ptrdiff_t Example5LoomProduceSums1NodeTbl1[] = {
	// rows 0-11
	0, 0, 1,   0, 1, 1,   0, 2, 1,
	1, 0, 0,   1, 1, 0,   1, 2, 0,
	2, 0, 0,   2, 1, 0,   2, 2, 0,
	3, 0, 0,   3, 1, 0,   3, 2, 0,
	// rows 12-19
	0, 0, 0,   0, 1, 0,
	1, 0, 0,   1, 1, 0,
	2, 0, 0,   2, 1, 0,
	3, 0, 0,   3, 1, 0,
	// rows 20-27
	0, 0, 0,   0, 1, 0,
	1, 0, 0,   1, 1, 0,
	2, 0, 0,   2, 1, 0,
	3, 0, 0,   3, 1, 0,
	// rows 28-36
	0, 0, 0,   0, 1, 0,   0, 2, 0,
	1, 0, 0,   1, 1, 0,   1, 2, 0,
	2, 0, 0,   2, 1, 0,   2, 2, 0,
	// rows 37-42
	0, 0, 0,   0, 1, 0,
	1, 0, 0,   1, 1, 0,
	2, 0, 0,   2, 1, 0,
	// rows 43-48
	0, 0, 0,   0, 1, 0,
	1, 0, 0,   1, 1, 0,
	2, 0, 0,   2, 1, 0
};

static void Example5LoomProduceSums1Callee1(Example5ThreaderTask1* task8, int64_t* pt9) {
void** tuple2 = task8->any1;
char** tensors6 = tuple2[0];
ptrdiff_t epoch1 = 0;
ptrdiff_t field1 = (ptrdiff_t)tuple2[2];
ptrdiff_t nodeFirst1 = (ptrdiff_t)tuple2[3];
ptrdiff_t group1 = pt9[3];
ptrdiff_t to2 = pt9[2];
ptrdiff_t nodeOff1 = pt9[1];
ptrdiff_t w10 = pt9[0];
ptrdiff_t node6 = nodeFirst1+nodeOff1;
ptrdiff_t lift1 = Example5LoomProduceSums1NodeTbl1[0+3*node6];
ptrdiff_t pile1 = Example5LoomProduceSums1NodeTbl1[1+3*node6];
ptrdiff_t base1 = Example5LoomProduceSums1NodeTbl1[2+3*node6];
ptrdiff_t from1 = to2+(size_t)lift1/4*3;
if (from1 >= 24) return;
char*restrict biasPtr2 = tensors6[0]+15552*epoch1+3888*group1;
char*restrict wtPtr2 = tensors6[0]+15552+636310080*epoch1+60582816*group1+1236384*node6;
char*restrict datPtr2 = tensors6[1]+123125760*epoch1+7815168*field1+1953792*group1+81408*from1;
char*restrict sumPtr1 = tensors6[2]+17915904*group1+746496*to2+248832*pile1;
switch ((size_t)lift1%4*2+(to2 >= 3)) {
default: {
if (epoch1|node6) {
if (!epoch1 && base1) {
ptrdiff_t i7 = 2*w10;
ptrdiff_t ii1 = i7+1;
for (; i7 != 162; ++i7) {
__m512 sum2 = _mm512_setzero_ps();
__m512 sum6 = _mm512_setzero_ps();
__m512 sum10 = _mm512_setzero_ps();
__m512 sum14 = _mm512_setzero_ps();
__m512 sum18 = _mm512_setzero_ps();
__m512 sum22 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum3 = sum2;
__m512 sum4 = sum2;
__m512 sum5 = sum2;
__m512 sum7 = sum6;
__m512 sum8 = sum6;
__m512 sum9 = sum6;
__m512 sum11 = sum10;
__m512 sum12 = sum10;
__m512 sum13 = sum10;
__m512 sum15 = sum14;
__m512 sum16 = sum14;
__m512 sum17 = sum14;
__m512 sum19 = sum18;
__m512 sum20 = sum18;
__m512 sum21 = sum18;
__m512 sum23 = sum22;
__m512 sum24 = sum22;
__m512 sum25 = sum22;
for (ptrdiff_t j3 = 0; j3 < 318; ++j3) {
__m512 dat106 = _mm512_loadu_ps(datPtr2+0+256*j3);
__m512 dat107 = _mm512_loadu_ps(datPtr2+64+256*j3);
__m512 dat108 = _mm512_loadu_ps(datPtr2+128+256*j3);
__m512 dat109 = _mm512_loadu_ps(datPtr2+192+256*j3);
__m512 wt253 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i7+24*j3));
sum2 = _mm512_fmadd_ps(wt253, dat106, sum2);
sum3 = _mm512_fmadd_ps(wt253, dat107, sum3);
sum4 = _mm512_fmadd_ps(wt253, dat108, sum4);
sum5 = _mm512_fmadd_ps(wt253, dat109, sum5);
__m512 wt254 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i7+24*j3));
sum6 = _mm512_fmadd_ps(wt254, dat106, sum6);
sum7 = _mm512_fmadd_ps(wt254, dat107, sum7);
sum8 = _mm512_fmadd_ps(wt254, dat108, sum8);
sum9 = _mm512_fmadd_ps(wt254, dat109, sum9);
__m512 wt255 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i7+24*j3));
sum10 = _mm512_fmadd_ps(wt255, dat106, sum10);
sum11 = _mm512_fmadd_ps(wt255, dat107, sum11);
sum12 = _mm512_fmadd_ps(wt255, dat108, sum12);
sum13 = _mm512_fmadd_ps(wt255, dat109, sum13);
__m512 wt256 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i7+24*j3));
sum14 = _mm512_fmadd_ps(wt256, dat106, sum14);
sum15 = _mm512_fmadd_ps(wt256, dat107, sum15);
sum16 = _mm512_fmadd_ps(wt256, dat108, sum16);
sum17 = _mm512_fmadd_ps(wt256, dat109, sum17);
__m512 wt257 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i7+24*j3));
sum18 = _mm512_fmadd_ps(wt257, dat106, sum18);
sum19 = _mm512_fmadd_ps(wt257, dat107, sum19);
sum20 = _mm512_fmadd_ps(wt257, dat108, sum20);
sum21 = _mm512_fmadd_ps(wt257, dat109, sum21);
__m512 wt258 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i7+24*j3));
sum22 = _mm512_fmadd_ps(wt258, dat106, sum22);
sum23 = _mm512_fmadd_ps(wt258, dat107, sum23);
sum24 = _mm512_fmadd_ps(wt258, dat108, sum24);
sum25 = _mm512_fmadd_ps(wt258, dat109, sum25);
}
_mm512_storeu_ps(sumPtr1+0+1536*i7, sum2);
_mm512_storeu_ps(sumPtr1+64+1536*i7, sum3);
_mm512_storeu_ps(sumPtr1+128+1536*i7, sum4);
_mm512_storeu_ps(sumPtr1+192+1536*i7, sum5);
_mm512_storeu_ps(sumPtr1+256+1536*i7, sum6);
_mm512_storeu_ps(sumPtr1+320+1536*i7, sum7);
_mm512_storeu_ps(sumPtr1+384+1536*i7, sum8);
_mm512_storeu_ps(sumPtr1+448+1536*i7, sum9);
_mm512_storeu_ps(sumPtr1+512+1536*i7, sum10);
_mm512_storeu_ps(sumPtr1+576+1536*i7, sum11);
_mm512_storeu_ps(sumPtr1+640+1536*i7, sum12);
_mm512_storeu_ps(sumPtr1+704+1536*i7, sum13);
_mm512_storeu_ps(sumPtr1+768+1536*i7, sum14);
_mm512_storeu_ps(sumPtr1+832+1536*i7, sum15);
_mm512_storeu_ps(sumPtr1+896+1536*i7, sum16);
_mm512_storeu_ps(sumPtr1+960+1536*i7, sum17);
_mm512_storeu_ps(sumPtr1+1024+1536*i7, sum18);
_mm512_storeu_ps(sumPtr1+1088+1536*i7, sum19);
_mm512_storeu_ps(sumPtr1+1152+1536*i7, sum20);
_mm512_storeu_ps(sumPtr1+1216+1536*i7, sum21);
_mm512_storeu_ps(sumPtr1+1280+1536*i7, sum22);
_mm512_storeu_ps(sumPtr1+1344+1536*i7, sum23);
_mm512_storeu_ps(sumPtr1+1408+1536*i7, sum24);
_mm512_storeu_ps(sumPtr1+1472+1536*i7, sum25);
if (i7 >= ii1) return;
}
return;
}
ptrdiff_t i8 = 2*w10;
ptrdiff_t ii2 = i8+1;
for (; i8 != 162; ++i8) {
__m512 sum26 = _mm512_setzero_ps();
__m512 sum30 = _mm512_setzero_ps();
__m512 sum34 = _mm512_setzero_ps();
__m512 sum38 = _mm512_setzero_ps();
__m512 sum42 = _mm512_setzero_ps();
__m512 sum46 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum27 = sum26;
__m512 sum28 = sum26;
__m512 sum29 = sum26;
__m512 sum31 = sum30;
__m512 sum32 = sum30;
__m512 sum33 = sum30;
__m512 sum35 = sum34;
__m512 sum36 = sum34;
__m512 sum37 = sum34;
__m512 sum39 = sum38;
__m512 sum40 = sum38;
__m512 sum41 = sum38;
__m512 sum43 = sum42;
__m512 sum44 = sum42;
__m512 sum45 = sum42;
__m512 sum47 = sum46;
__m512 sum48 = sum46;
__m512 sum49 = sum46;
for (ptrdiff_t j4 = 0; j4 < 318; ++j4) {
__m512 dat110 = _mm512_loadu_ps(datPtr2+0+256*j4);
__m512 dat111 = _mm512_loadu_ps(datPtr2+64+256*j4);
__m512 dat112 = _mm512_loadu_ps(datPtr2+128+256*j4);
__m512 dat113 = _mm512_loadu_ps(datPtr2+192+256*j4);
__m512 wt259 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i8+24*j4));
sum26 = _mm512_fmadd_ps(wt259, dat110, sum26);
sum27 = _mm512_fmadd_ps(wt259, dat111, sum27);
sum28 = _mm512_fmadd_ps(wt259, dat112, sum28);
sum29 = _mm512_fmadd_ps(wt259, dat113, sum29);
__m512 wt260 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i8+24*j4));
sum30 = _mm512_fmadd_ps(wt260, dat110, sum30);
sum31 = _mm512_fmadd_ps(wt260, dat111, sum31);
sum32 = _mm512_fmadd_ps(wt260, dat112, sum32);
sum33 = _mm512_fmadd_ps(wt260, dat113, sum33);
__m512 wt261 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i8+24*j4));
sum34 = _mm512_fmadd_ps(wt261, dat110, sum34);
sum35 = _mm512_fmadd_ps(wt261, dat111, sum35);
sum36 = _mm512_fmadd_ps(wt261, dat112, sum36);
sum37 = _mm512_fmadd_ps(wt261, dat113, sum37);
__m512 wt262 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i8+24*j4));
sum38 = _mm512_fmadd_ps(wt262, dat110, sum38);
sum39 = _mm512_fmadd_ps(wt262, dat111, sum39);
sum40 = _mm512_fmadd_ps(wt262, dat112, sum40);
sum41 = _mm512_fmadd_ps(wt262, dat113, sum41);
__m512 wt263 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i8+24*j4));
sum42 = _mm512_fmadd_ps(wt263, dat110, sum42);
sum43 = _mm512_fmadd_ps(wt263, dat111, sum43);
sum44 = _mm512_fmadd_ps(wt263, dat112, sum44);
sum45 = _mm512_fmadd_ps(wt263, dat113, sum45);
__m512 wt264 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i8+24*j4));
sum46 = _mm512_fmadd_ps(wt264, dat110, sum46);
sum47 = _mm512_fmadd_ps(wt264, dat111, sum47);
sum48 = _mm512_fmadd_ps(wt264, dat112, sum48);
sum49 = _mm512_fmadd_ps(wt264, dat113, sum49);
}
_mm512_storeu_ps(sumPtr1+0+1536*i8, _mm512_add_ps(sum26, _mm512_loadu_ps(sumPtr1+0+1536*i8)));
_mm512_storeu_ps(sumPtr1+64+1536*i8, _mm512_add_ps(sum27, _mm512_loadu_ps(sumPtr1+64+1536*i8)));
_mm512_storeu_ps(sumPtr1+128+1536*i8, _mm512_add_ps(sum28, _mm512_loadu_ps(sumPtr1+128+1536*i8)));
_mm512_storeu_ps(sumPtr1+192+1536*i8, _mm512_add_ps(sum29, _mm512_loadu_ps(sumPtr1+192+1536*i8)));
_mm512_storeu_ps(sumPtr1+256+1536*i8, _mm512_add_ps(sum30, _mm512_loadu_ps(sumPtr1+256+1536*i8)));
_mm512_storeu_ps(sumPtr1+320+1536*i8, _mm512_add_ps(sum31, _mm512_loadu_ps(sumPtr1+320+1536*i8)));
_mm512_storeu_ps(sumPtr1+384+1536*i8, _mm512_add_ps(sum32, _mm512_loadu_ps(sumPtr1+384+1536*i8)));
_mm512_storeu_ps(sumPtr1+448+1536*i8, _mm512_add_ps(sum33, _mm512_loadu_ps(sumPtr1+448+1536*i8)));
_mm512_storeu_ps(sumPtr1+512+1536*i8, _mm512_add_ps(sum34, _mm512_loadu_ps(sumPtr1+512+1536*i8)));
_mm512_storeu_ps(sumPtr1+576+1536*i8, _mm512_add_ps(sum35, _mm512_loadu_ps(sumPtr1+576+1536*i8)));
_mm512_storeu_ps(sumPtr1+640+1536*i8, _mm512_add_ps(sum36, _mm512_loadu_ps(sumPtr1+640+1536*i8)));
_mm512_storeu_ps(sumPtr1+704+1536*i8, _mm512_add_ps(sum37, _mm512_loadu_ps(sumPtr1+704+1536*i8)));
_mm512_storeu_ps(sumPtr1+768+1536*i8, _mm512_add_ps(sum38, _mm512_loadu_ps(sumPtr1+768+1536*i8)));
_mm512_storeu_ps(sumPtr1+832+1536*i8, _mm512_add_ps(sum39, _mm512_loadu_ps(sumPtr1+832+1536*i8)));
_mm512_storeu_ps(sumPtr1+896+1536*i8, _mm512_add_ps(sum40, _mm512_loadu_ps(sumPtr1+896+1536*i8)));
_mm512_storeu_ps(sumPtr1+960+1536*i8, _mm512_add_ps(sum41, _mm512_loadu_ps(sumPtr1+960+1536*i8)));
_mm512_storeu_ps(sumPtr1+1024+1536*i8, _mm512_add_ps(sum42, _mm512_loadu_ps(sumPtr1+1024+1536*i8)));
_mm512_storeu_ps(sumPtr1+1088+1536*i8, _mm512_add_ps(sum43, _mm512_loadu_ps(sumPtr1+1088+1536*i8)));
_mm512_storeu_ps(sumPtr1+1152+1536*i8, _mm512_add_ps(sum44, _mm512_loadu_ps(sumPtr1+1152+1536*i8)));
_mm512_storeu_ps(sumPtr1+1216+1536*i8, _mm512_add_ps(sum45, _mm512_loadu_ps(sumPtr1+1216+1536*i8)));
_mm512_storeu_ps(sumPtr1+1280+1536*i8, _mm512_add_ps(sum46, _mm512_loadu_ps(sumPtr1+1280+1536*i8)));
_mm512_storeu_ps(sumPtr1+1344+1536*i8, _mm512_add_ps(sum47, _mm512_loadu_ps(sumPtr1+1344+1536*i8)));
_mm512_storeu_ps(sumPtr1+1408+1536*i8, _mm512_add_ps(sum48, _mm512_loadu_ps(sumPtr1+1408+1536*i8)));
_mm512_storeu_ps(sumPtr1+1472+1536*i8, _mm512_add_ps(sum49, _mm512_loadu_ps(sumPtr1+1472+1536*i8)));
if (i8 >= ii2) return;
}
return;
}
(void)base1;
ptrdiff_t i9 = 2*w10;
ptrdiff_t ii3 = i9+1;
for (; i9 != 162; ++i9) {
__m512 sum50 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i9));
__m512 sum54 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i9));
__m512 sum58 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i9));
__m512 sum62 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i9));
__m512 sum66 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i9));
__m512 sum70 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i9));
__m512 sum51 = sum50;
__m512 sum52 = sum50;
__m512 sum53 = sum50;
__m512 sum55 = sum54;
__m512 sum56 = sum54;
__m512 sum57 = sum54;
__m512 sum59 = sum58;
__m512 sum60 = sum58;
__m512 sum61 = sum58;
__m512 sum63 = sum62;
__m512 sum64 = sum62;
__m512 sum65 = sum62;
__m512 sum67 = sum66;
__m512 sum68 = sum66;
__m512 sum69 = sum66;
__m512 sum71 = sum70;
__m512 sum72 = sum70;
__m512 sum73 = sum70;
for (ptrdiff_t j5 = 0; j5 < 318; ++j5) {
__m512 dat114 = _mm512_loadu_ps(datPtr2+0+256*j5);
__m512 dat115 = _mm512_loadu_ps(datPtr2+64+256*j5);
__m512 dat116 = _mm512_loadu_ps(datPtr2+128+256*j5);
__m512 dat117 = _mm512_loadu_ps(datPtr2+192+256*j5);
__m512 wt265 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i9+24*j5));
sum50 = _mm512_fmadd_ps(wt265, dat114, sum50);
sum51 = _mm512_fmadd_ps(wt265, dat115, sum51);
sum52 = _mm512_fmadd_ps(wt265, dat116, sum52);
sum53 = _mm512_fmadd_ps(wt265, dat117, sum53);
__m512 wt266 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i9+24*j5));
sum54 = _mm512_fmadd_ps(wt266, dat114, sum54);
sum55 = _mm512_fmadd_ps(wt266, dat115, sum55);
sum56 = _mm512_fmadd_ps(wt266, dat116, sum56);
sum57 = _mm512_fmadd_ps(wt266, dat117, sum57);
__m512 wt267 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i9+24*j5));
sum58 = _mm512_fmadd_ps(wt267, dat114, sum58);
sum59 = _mm512_fmadd_ps(wt267, dat115, sum59);
sum60 = _mm512_fmadd_ps(wt267, dat116, sum60);
sum61 = _mm512_fmadd_ps(wt267, dat117, sum61);
__m512 wt268 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i9+24*j5));
sum62 = _mm512_fmadd_ps(wt268, dat114, sum62);
sum63 = _mm512_fmadd_ps(wt268, dat115, sum63);
sum64 = _mm512_fmadd_ps(wt268, dat116, sum64);
sum65 = _mm512_fmadd_ps(wt268, dat117, sum65);
__m512 wt269 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i9+24*j5));
sum66 = _mm512_fmadd_ps(wt269, dat114, sum66);
sum67 = _mm512_fmadd_ps(wt269, dat115, sum67);
sum68 = _mm512_fmadd_ps(wt269, dat116, sum68);
sum69 = _mm512_fmadd_ps(wt269, dat117, sum69);
__m512 wt270 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i9+24*j5));
sum70 = _mm512_fmadd_ps(wt270, dat114, sum70);
sum71 = _mm512_fmadd_ps(wt270, dat115, sum71);
sum72 = _mm512_fmadd_ps(wt270, dat116, sum72);
sum73 = _mm512_fmadd_ps(wt270, dat117, sum73);
}
_mm512_storeu_ps(sumPtr1+0+1536*i9, sum50);
_mm512_storeu_ps(sumPtr1+64+1536*i9, sum51);
_mm512_storeu_ps(sumPtr1+128+1536*i9, sum52);
_mm512_storeu_ps(sumPtr1+192+1536*i9, sum53);
_mm512_storeu_ps(sumPtr1+256+1536*i9, sum54);
_mm512_storeu_ps(sumPtr1+320+1536*i9, sum55);
_mm512_storeu_ps(sumPtr1+384+1536*i9, sum56);
_mm512_storeu_ps(sumPtr1+448+1536*i9, sum57);
_mm512_storeu_ps(sumPtr1+512+1536*i9, sum58);
_mm512_storeu_ps(sumPtr1+576+1536*i9, sum59);
_mm512_storeu_ps(sumPtr1+640+1536*i9, sum60);
_mm512_storeu_ps(sumPtr1+704+1536*i9, sum61);
_mm512_storeu_ps(sumPtr1+768+1536*i9, sum62);
_mm512_storeu_ps(sumPtr1+832+1536*i9, sum63);
_mm512_storeu_ps(sumPtr1+896+1536*i9, sum64);
_mm512_storeu_ps(sumPtr1+960+1536*i9, sum65);
_mm512_storeu_ps(sumPtr1+1024+1536*i9, sum66);
_mm512_storeu_ps(sumPtr1+1088+1536*i9, sum67);
_mm512_storeu_ps(sumPtr1+1152+1536*i9, sum68);
_mm512_storeu_ps(sumPtr1+1216+1536*i9, sum69);
_mm512_storeu_ps(sumPtr1+1280+1536*i9, sum70);
_mm512_storeu_ps(sumPtr1+1344+1536*i9, sum71);
_mm512_storeu_ps(sumPtr1+1408+1536*i9, sum72);
_mm512_storeu_ps(sumPtr1+1472+1536*i9, sum73);
if (i9 >= ii3) return;
}
break;
}
case 2: {
if (epoch1|node6) {
if (!epoch1 && base1) {
ptrdiff_t i10 = 2*w10;
ptrdiff_t ii4 = i10+1;
for (; i10 != 162; ++i10) {
__m512 sum74 = _mm512_setzero_ps();
__m512 sum77 = _mm512_setzero_ps();
__m512 sum80 = _mm512_setzero_ps();
__m512 sum83 = _mm512_setzero_ps();
__m512 sum86 = _mm512_setzero_ps();
__m512 sum89 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum75 = sum74;
__m512 sum76 = sum74;
__m512 sum78 = sum77;
__m512 sum79 = sum77;
__m512 sum81 = sum80;
__m512 sum82 = sum80;
__m512 sum84 = sum83;
__m512 sum85 = sum83;
__m512 sum87 = sum86;
__m512 sum88 = sum86;
__m512 sum90 = sum89;
__m512 sum91 = sum89;
for (ptrdiff_t j6 = 0; j6 < 318; ++j6) {
__m512 dat118 = _mm512_loadu_ps(datPtr2+64+256*j6);
__m512 dat119 = _mm512_loadu_ps(datPtr2+128+256*j6);
__m512 dat120 = _mm512_loadu_ps(datPtr2+192+256*j6);
__m512 wt271 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i10+24*j6));
sum74 = _mm512_fmadd_ps(wt271, dat118, sum74);
sum75 = _mm512_fmadd_ps(wt271, dat119, sum75);
sum76 = _mm512_fmadd_ps(wt271, dat120, sum76);
__m512 wt272 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i10+24*j6));
sum77 = _mm512_fmadd_ps(wt272, dat118, sum77);
sum78 = _mm512_fmadd_ps(wt272, dat119, sum78);
sum79 = _mm512_fmadd_ps(wt272, dat120, sum79);
__m512 wt273 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i10+24*j6));
sum80 = _mm512_fmadd_ps(wt273, dat118, sum80);
sum81 = _mm512_fmadd_ps(wt273, dat119, sum81);
sum82 = _mm512_fmadd_ps(wt273, dat120, sum82);
__m512 wt274 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i10+24*j6));
sum83 = _mm512_fmadd_ps(wt274, dat118, sum83);
sum84 = _mm512_fmadd_ps(wt274, dat119, sum84);
sum85 = _mm512_fmadd_ps(wt274, dat120, sum85);
__m512 wt275 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i10+24*j6));
sum86 = _mm512_fmadd_ps(wt275, dat118, sum86);
sum87 = _mm512_fmadd_ps(wt275, dat119, sum87);
sum88 = _mm512_fmadd_ps(wt275, dat120, sum88);
__m512 wt276 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i10+24*j6));
sum89 = _mm512_fmadd_ps(wt276, dat118, sum89);
sum90 = _mm512_fmadd_ps(wt276, dat119, sum90);
sum91 = _mm512_fmadd_ps(wt276, dat120, sum91);
}
_mm512_storeu_ps(sumPtr1+0+1536*i10, sum74);
_mm512_storeu_ps(sumPtr1+64+1536*i10, sum75);
_mm512_storeu_ps(sumPtr1+128+1536*i10, sum76);
_mm512_storeu_ps(sumPtr1+256+1536*i10, sum77);
_mm512_storeu_ps(sumPtr1+320+1536*i10, sum78);
_mm512_storeu_ps(sumPtr1+384+1536*i10, sum79);
_mm512_storeu_ps(sumPtr1+512+1536*i10, sum80);
_mm512_storeu_ps(sumPtr1+576+1536*i10, sum81);
_mm512_storeu_ps(sumPtr1+640+1536*i10, sum82);
_mm512_storeu_ps(sumPtr1+768+1536*i10, sum83);
_mm512_storeu_ps(sumPtr1+832+1536*i10, sum84);
_mm512_storeu_ps(sumPtr1+896+1536*i10, sum85);
_mm512_storeu_ps(sumPtr1+1024+1536*i10, sum86);
_mm512_storeu_ps(sumPtr1+1088+1536*i10, sum87);
_mm512_storeu_ps(sumPtr1+1152+1536*i10, sum88);
_mm512_storeu_ps(sumPtr1+1280+1536*i10, sum89);
_mm512_storeu_ps(sumPtr1+1344+1536*i10, sum90);
_mm512_storeu_ps(sumPtr1+1408+1536*i10, sum91);
if (i10 >= ii4) return;
}
return;
}
ptrdiff_t i11 = 2*w10;
ptrdiff_t ii5 = i11+1;
for (; i11 != 162; ++i11) {
__m512 sum92 = _mm512_setzero_ps();
__m512 sum95 = _mm512_setzero_ps();
__m512 sum98 = _mm512_setzero_ps();
__m512 sum101 = _mm512_setzero_ps();
__m512 sum104 = _mm512_setzero_ps();
__m512 sum107 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum93 = sum92;
__m512 sum94 = sum92;
__m512 sum96 = sum95;
__m512 sum97 = sum95;
__m512 sum99 = sum98;
__m512 sum100 = sum98;
__m512 sum102 = sum101;
__m512 sum103 = sum101;
__m512 sum105 = sum104;
__m512 sum106 = sum104;
__m512 sum108 = sum107;
__m512 sum109 = sum107;
for (ptrdiff_t j7 = 0; j7 < 318; ++j7) {
__m512 dat121 = _mm512_loadu_ps(datPtr2+64+256*j7);
__m512 dat122 = _mm512_loadu_ps(datPtr2+128+256*j7);
__m512 dat123 = _mm512_loadu_ps(datPtr2+192+256*j7);
__m512 wt277 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i11+24*j7));
sum92 = _mm512_fmadd_ps(wt277, dat121, sum92);
sum93 = _mm512_fmadd_ps(wt277, dat122, sum93);
sum94 = _mm512_fmadd_ps(wt277, dat123, sum94);
__m512 wt278 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i11+24*j7));
sum95 = _mm512_fmadd_ps(wt278, dat121, sum95);
sum96 = _mm512_fmadd_ps(wt278, dat122, sum96);
sum97 = _mm512_fmadd_ps(wt278, dat123, sum97);
__m512 wt279 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i11+24*j7));
sum98 = _mm512_fmadd_ps(wt279, dat121, sum98);
sum99 = _mm512_fmadd_ps(wt279, dat122, sum99);
sum100 = _mm512_fmadd_ps(wt279, dat123, sum100);
__m512 wt280 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i11+24*j7));
sum101 = _mm512_fmadd_ps(wt280, dat121, sum101);
sum102 = _mm512_fmadd_ps(wt280, dat122, sum102);
sum103 = _mm512_fmadd_ps(wt280, dat123, sum103);
__m512 wt281 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i11+24*j7));
sum104 = _mm512_fmadd_ps(wt281, dat121, sum104);
sum105 = _mm512_fmadd_ps(wt281, dat122, sum105);
sum106 = _mm512_fmadd_ps(wt281, dat123, sum106);
__m512 wt282 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i11+24*j7));
sum107 = _mm512_fmadd_ps(wt282, dat121, sum107);
sum108 = _mm512_fmadd_ps(wt282, dat122, sum108);
sum109 = _mm512_fmadd_ps(wt282, dat123, sum109);
}
_mm512_storeu_ps(sumPtr1+0+1536*i11, _mm512_add_ps(sum92, _mm512_loadu_ps(sumPtr1+0+1536*i11)));
_mm512_storeu_ps(sumPtr1+64+1536*i11, _mm512_add_ps(sum93, _mm512_loadu_ps(sumPtr1+64+1536*i11)));
_mm512_storeu_ps(sumPtr1+128+1536*i11, _mm512_add_ps(sum94, _mm512_loadu_ps(sumPtr1+128+1536*i11)));
_mm512_storeu_ps(sumPtr1+256+1536*i11, _mm512_add_ps(sum95, _mm512_loadu_ps(sumPtr1+256+1536*i11)));
_mm512_storeu_ps(sumPtr1+320+1536*i11, _mm512_add_ps(sum96, _mm512_loadu_ps(sumPtr1+320+1536*i11)));
_mm512_storeu_ps(sumPtr1+384+1536*i11, _mm512_add_ps(sum97, _mm512_loadu_ps(sumPtr1+384+1536*i11)));
_mm512_storeu_ps(sumPtr1+512+1536*i11, _mm512_add_ps(sum98, _mm512_loadu_ps(sumPtr1+512+1536*i11)));
_mm512_storeu_ps(sumPtr1+576+1536*i11, _mm512_add_ps(sum99, _mm512_loadu_ps(sumPtr1+576+1536*i11)));
_mm512_storeu_ps(sumPtr1+640+1536*i11, _mm512_add_ps(sum100, _mm512_loadu_ps(sumPtr1+640+1536*i11)));
_mm512_storeu_ps(sumPtr1+768+1536*i11, _mm512_add_ps(sum101, _mm512_loadu_ps(sumPtr1+768+1536*i11)));
_mm512_storeu_ps(sumPtr1+832+1536*i11, _mm512_add_ps(sum102, _mm512_loadu_ps(sumPtr1+832+1536*i11)));
_mm512_storeu_ps(sumPtr1+896+1536*i11, _mm512_add_ps(sum103, _mm512_loadu_ps(sumPtr1+896+1536*i11)));
_mm512_storeu_ps(sumPtr1+1024+1536*i11, _mm512_add_ps(sum104, _mm512_loadu_ps(sumPtr1+1024+1536*i11)));
_mm512_storeu_ps(sumPtr1+1088+1536*i11, _mm512_add_ps(sum105, _mm512_loadu_ps(sumPtr1+1088+1536*i11)));
_mm512_storeu_ps(sumPtr1+1152+1536*i11, _mm512_add_ps(sum106, _mm512_loadu_ps(sumPtr1+1152+1536*i11)));
_mm512_storeu_ps(sumPtr1+1280+1536*i11, _mm512_add_ps(sum107, _mm512_loadu_ps(sumPtr1+1280+1536*i11)));
_mm512_storeu_ps(sumPtr1+1344+1536*i11, _mm512_add_ps(sum108, _mm512_loadu_ps(sumPtr1+1344+1536*i11)));
_mm512_storeu_ps(sumPtr1+1408+1536*i11, _mm512_add_ps(sum109, _mm512_loadu_ps(sumPtr1+1408+1536*i11)));
if (i11 >= ii5) return;
}
return;
}
(void)base1;
ptrdiff_t i12 = 2*w10;
ptrdiff_t ii6 = i12+1;
for (; i12 != 162; ++i12) {
__m512 sum110 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i12));
__m512 sum113 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i12));
__m512 sum116 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i12));
__m512 sum119 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i12));
__m512 sum122 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i12));
__m512 sum125 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i12));
__m512 sum111 = sum110;
__m512 sum112 = sum110;
__m512 sum114 = sum113;
__m512 sum115 = sum113;
__m512 sum117 = sum116;
__m512 sum118 = sum116;
__m512 sum120 = sum119;
__m512 sum121 = sum119;
__m512 sum123 = sum122;
__m512 sum124 = sum122;
__m512 sum126 = sum125;
__m512 sum127 = sum125;
for (ptrdiff_t j8 = 0; j8 < 318; ++j8) {
__m512 dat124 = _mm512_loadu_ps(datPtr2+64+256*j8);
__m512 dat125 = _mm512_loadu_ps(datPtr2+128+256*j8);
__m512 dat126 = _mm512_loadu_ps(datPtr2+192+256*j8);
__m512 wt283 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i12+24*j8));
sum110 = _mm512_fmadd_ps(wt283, dat124, sum110);
sum111 = _mm512_fmadd_ps(wt283, dat125, sum111);
sum112 = _mm512_fmadd_ps(wt283, dat126, sum112);
__m512 wt284 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i12+24*j8));
sum113 = _mm512_fmadd_ps(wt284, dat124, sum113);
sum114 = _mm512_fmadd_ps(wt284, dat125, sum114);
sum115 = _mm512_fmadd_ps(wt284, dat126, sum115);
__m512 wt285 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i12+24*j8));
sum116 = _mm512_fmadd_ps(wt285, dat124, sum116);
sum117 = _mm512_fmadd_ps(wt285, dat125, sum117);
sum118 = _mm512_fmadd_ps(wt285, dat126, sum118);
__m512 wt286 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i12+24*j8));
sum119 = _mm512_fmadd_ps(wt286, dat124, sum119);
sum120 = _mm512_fmadd_ps(wt286, dat125, sum120);
sum121 = _mm512_fmadd_ps(wt286, dat126, sum121);
__m512 wt287 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i12+24*j8));
sum122 = _mm512_fmadd_ps(wt287, dat124, sum122);
sum123 = _mm512_fmadd_ps(wt287, dat125, sum123);
sum124 = _mm512_fmadd_ps(wt287, dat126, sum124);
__m512 wt288 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i12+24*j8));
sum125 = _mm512_fmadd_ps(wt288, dat124, sum125);
sum126 = _mm512_fmadd_ps(wt288, dat125, sum126);
sum127 = _mm512_fmadd_ps(wt288, dat126, sum127);
}
_mm512_storeu_ps(sumPtr1+0+1536*i12, sum110);
_mm512_storeu_ps(sumPtr1+64+1536*i12, sum111);
_mm512_storeu_ps(sumPtr1+128+1536*i12, sum112);
_mm512_storeu_ps(sumPtr1+256+1536*i12, sum113);
_mm512_storeu_ps(sumPtr1+320+1536*i12, sum114);
_mm512_storeu_ps(sumPtr1+384+1536*i12, sum115);
_mm512_storeu_ps(sumPtr1+512+1536*i12, sum116);
_mm512_storeu_ps(sumPtr1+576+1536*i12, sum117);
_mm512_storeu_ps(sumPtr1+640+1536*i12, sum118);
_mm512_storeu_ps(sumPtr1+768+1536*i12, sum119);
_mm512_storeu_ps(sumPtr1+832+1536*i12, sum120);
_mm512_storeu_ps(sumPtr1+896+1536*i12, sum121);
_mm512_storeu_ps(sumPtr1+1024+1536*i12, sum122);
_mm512_storeu_ps(sumPtr1+1088+1536*i12, sum123);
_mm512_storeu_ps(sumPtr1+1152+1536*i12, sum124);
_mm512_storeu_ps(sumPtr1+1280+1536*i12, sum125);
_mm512_storeu_ps(sumPtr1+1344+1536*i12, sum126);
_mm512_storeu_ps(sumPtr1+1408+1536*i12, sum127);
if (i12 >= ii6) return;
}
break;
}
case 3: {
if (epoch1|node6) {
if (!epoch1 && base1) {
ptrdiff_t i13 = 2*w10;
ptrdiff_t ii7 = i13+1;
for (; i13 != 162; ++i13) {
__m512 sum128 = _mm512_setzero_ps();
__m512 sum132 = _mm512_setzero_ps();
__m512 sum136 = _mm512_setzero_ps();
__m512 sum140 = _mm512_setzero_ps();
__m512 sum144 = _mm512_setzero_ps();
__m512 sum148 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum129 = sum128;
__m512 sum130 = sum128;
__m512 sum131 = sum128;
__m512 sum133 = sum132;
__m512 sum134 = sum132;
__m512 sum135 = sum132;
__m512 sum137 = sum136;
__m512 sum138 = sum136;
__m512 sum139 = sum136;
__m512 sum141 = sum140;
__m512 sum142 = sum140;
__m512 sum143 = sum140;
__m512 sum145 = sum144;
__m512 sum146 = sum144;
__m512 sum147 = sum144;
__m512 sum149 = sum148;
__m512 sum150 = sum148;
__m512 sum151 = sum148;
for (ptrdiff_t j9 = 0; j9 < 318; ++j9) {
__m512 dat127 = _mm512_loadu_ps(datPtr2+0+256*j9);
__m512 dat128 = _mm512_loadu_ps(datPtr2+64+256*j9);
__m512 dat129 = _mm512_loadu_ps(datPtr2+128+256*j9);
__m512 dat130 = _mm512_loadu_ps(datPtr2+192+256*j9);
__m512 wt289 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i13+24*j9));
sum128 = _mm512_fmadd_ps(wt289, dat127, sum128);
sum129 = _mm512_fmadd_ps(wt289, dat128, sum129);
sum130 = _mm512_fmadd_ps(wt289, dat129, sum130);
sum131 = _mm512_fmadd_ps(wt289, dat130, sum131);
__m512 wt290 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i13+24*j9));
sum132 = _mm512_fmadd_ps(wt290, dat127, sum132);
sum133 = _mm512_fmadd_ps(wt290, dat128, sum133);
sum134 = _mm512_fmadd_ps(wt290, dat129, sum134);
sum135 = _mm512_fmadd_ps(wt290, dat130, sum135);
__m512 wt291 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i13+24*j9));
sum136 = _mm512_fmadd_ps(wt291, dat127, sum136);
sum137 = _mm512_fmadd_ps(wt291, dat128, sum137);
sum138 = _mm512_fmadd_ps(wt291, dat129, sum138);
sum139 = _mm512_fmadd_ps(wt291, dat130, sum139);
__m512 wt292 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i13+24*j9));
sum140 = _mm512_fmadd_ps(wt292, dat127, sum140);
sum141 = _mm512_fmadd_ps(wt292, dat128, sum141);
sum142 = _mm512_fmadd_ps(wt292, dat129, sum142);
sum143 = _mm512_fmadd_ps(wt292, dat130, sum143);
__m512 wt293 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i13+24*j9));
sum144 = _mm512_fmadd_ps(wt293, dat127, sum144);
sum145 = _mm512_fmadd_ps(wt293, dat128, sum145);
sum146 = _mm512_fmadd_ps(wt293, dat129, sum146);
sum147 = _mm512_fmadd_ps(wt293, dat130, sum147);
__m512 wt294 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i13+24*j9));
sum148 = _mm512_fmadd_ps(wt294, dat127, sum148);
sum149 = _mm512_fmadd_ps(wt294, dat128, sum149);
sum150 = _mm512_fmadd_ps(wt294, dat129, sum150);
sum151 = _mm512_fmadd_ps(wt294, dat130, sum151);
}
_mm512_storeu_ps(sumPtr1+-2239296+1536*i13, sum128);
_mm512_storeu_ps(sumPtr1+0+1536*i13, sum129);
_mm512_storeu_ps(sumPtr1+64+1536*i13, sum130);
_mm512_storeu_ps(sumPtr1+128+1536*i13, sum131);
_mm512_storeu_ps(sumPtr1+-2239040+1536*i13, sum132);
_mm512_storeu_ps(sumPtr1+256+1536*i13, sum133);
_mm512_storeu_ps(sumPtr1+320+1536*i13, sum134);
_mm512_storeu_ps(sumPtr1+384+1536*i13, sum135);
_mm512_storeu_ps(sumPtr1+-2238784+1536*i13, sum136);
_mm512_storeu_ps(sumPtr1+512+1536*i13, sum137);
_mm512_storeu_ps(sumPtr1+576+1536*i13, sum138);
_mm512_storeu_ps(sumPtr1+640+1536*i13, sum139);
_mm512_storeu_ps(sumPtr1+-2238528+1536*i13, sum140);
_mm512_storeu_ps(sumPtr1+768+1536*i13, sum141);
_mm512_storeu_ps(sumPtr1+832+1536*i13, sum142);
_mm512_storeu_ps(sumPtr1+896+1536*i13, sum143);
_mm512_storeu_ps(sumPtr1+-2238272+1536*i13, sum144);
_mm512_storeu_ps(sumPtr1+1024+1536*i13, sum145);
_mm512_storeu_ps(sumPtr1+1088+1536*i13, sum146);
_mm512_storeu_ps(sumPtr1+1152+1536*i13, sum147);
_mm512_storeu_ps(sumPtr1+-2238016+1536*i13, sum148);
_mm512_storeu_ps(sumPtr1+1280+1536*i13, sum149);
_mm512_storeu_ps(sumPtr1+1344+1536*i13, sum150);
_mm512_storeu_ps(sumPtr1+1408+1536*i13, sum151);
if (i13 >= ii7) return;
}
return;
}
ptrdiff_t i14 = 2*w10;
ptrdiff_t ii8 = i14+1;
for (; i14 != 162; ++i14) {
__m512 sum152 = _mm512_setzero_ps();
__m512 sum156 = _mm512_setzero_ps();
__m512 sum160 = _mm512_setzero_ps();
__m512 sum164 = _mm512_setzero_ps();
__m512 sum168 = _mm512_setzero_ps();
__m512 sum172 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum153 = sum152;
__m512 sum154 = sum152;
__m512 sum155 = sum152;
__m512 sum157 = sum156;
__m512 sum158 = sum156;
__m512 sum159 = sum156;
__m512 sum161 = sum160;
__m512 sum162 = sum160;
__m512 sum163 = sum160;
__m512 sum165 = sum164;
__m512 sum166 = sum164;
__m512 sum167 = sum164;
__m512 sum169 = sum168;
__m512 sum170 = sum168;
__m512 sum171 = sum168;
__m512 sum173 = sum172;
__m512 sum174 = sum172;
__m512 sum175 = sum172;
for (ptrdiff_t j10 = 0; j10 < 318; ++j10) {
__m512 dat131 = _mm512_loadu_ps(datPtr2+0+256*j10);
__m512 dat132 = _mm512_loadu_ps(datPtr2+64+256*j10);
__m512 dat133 = _mm512_loadu_ps(datPtr2+128+256*j10);
__m512 dat134 = _mm512_loadu_ps(datPtr2+192+256*j10);
__m512 wt295 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i14+24*j10));
sum152 = _mm512_fmadd_ps(wt295, dat131, sum152);
sum153 = _mm512_fmadd_ps(wt295, dat132, sum153);
sum154 = _mm512_fmadd_ps(wt295, dat133, sum154);
sum155 = _mm512_fmadd_ps(wt295, dat134, sum155);
__m512 wt296 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i14+24*j10));
sum156 = _mm512_fmadd_ps(wt296, dat131, sum156);
sum157 = _mm512_fmadd_ps(wt296, dat132, sum157);
sum158 = _mm512_fmadd_ps(wt296, dat133, sum158);
sum159 = _mm512_fmadd_ps(wt296, dat134, sum159);
__m512 wt297 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i14+24*j10));
sum160 = _mm512_fmadd_ps(wt297, dat131, sum160);
sum161 = _mm512_fmadd_ps(wt297, dat132, sum161);
sum162 = _mm512_fmadd_ps(wt297, dat133, sum162);
sum163 = _mm512_fmadd_ps(wt297, dat134, sum163);
__m512 wt298 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i14+24*j10));
sum164 = _mm512_fmadd_ps(wt298, dat131, sum164);
sum165 = _mm512_fmadd_ps(wt298, dat132, sum165);
sum166 = _mm512_fmadd_ps(wt298, dat133, sum166);
sum167 = _mm512_fmadd_ps(wt298, dat134, sum167);
__m512 wt299 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i14+24*j10));
sum168 = _mm512_fmadd_ps(wt299, dat131, sum168);
sum169 = _mm512_fmadd_ps(wt299, dat132, sum169);
sum170 = _mm512_fmadd_ps(wt299, dat133, sum170);
sum171 = _mm512_fmadd_ps(wt299, dat134, sum171);
__m512 wt300 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i14+24*j10));
sum172 = _mm512_fmadd_ps(wt300, dat131, sum172);
sum173 = _mm512_fmadd_ps(wt300, dat132, sum173);
sum174 = _mm512_fmadd_ps(wt300, dat133, sum174);
sum175 = _mm512_fmadd_ps(wt300, dat134, sum175);
}
_mm512_storeu_ps(sumPtr1+-2239296+1536*i14, _mm512_add_ps(sum152, _mm512_loadu_ps(sumPtr1+-2239296+1536*i14)));
_mm512_storeu_ps(sumPtr1+0+1536*i14, _mm512_add_ps(sum153, _mm512_loadu_ps(sumPtr1+0+1536*i14)));
_mm512_storeu_ps(sumPtr1+64+1536*i14, _mm512_add_ps(sum154, _mm512_loadu_ps(sumPtr1+64+1536*i14)));
_mm512_storeu_ps(sumPtr1+128+1536*i14, _mm512_add_ps(sum155, _mm512_loadu_ps(sumPtr1+128+1536*i14)));
_mm512_storeu_ps(sumPtr1+-2239040+1536*i14, _mm512_add_ps(sum156, _mm512_loadu_ps(sumPtr1+-2239040+1536*i14)));
_mm512_storeu_ps(sumPtr1+256+1536*i14, _mm512_add_ps(sum157, _mm512_loadu_ps(sumPtr1+256+1536*i14)));
_mm512_storeu_ps(sumPtr1+320+1536*i14, _mm512_add_ps(sum158, _mm512_loadu_ps(sumPtr1+320+1536*i14)));
_mm512_storeu_ps(sumPtr1+384+1536*i14, _mm512_add_ps(sum159, _mm512_loadu_ps(sumPtr1+384+1536*i14)));
_mm512_storeu_ps(sumPtr1+-2238784+1536*i14, _mm512_add_ps(sum160, _mm512_loadu_ps(sumPtr1+-2238784+1536*i14)));
_mm512_storeu_ps(sumPtr1+512+1536*i14, _mm512_add_ps(sum161, _mm512_loadu_ps(sumPtr1+512+1536*i14)));
_mm512_storeu_ps(sumPtr1+576+1536*i14, _mm512_add_ps(sum162, _mm512_loadu_ps(sumPtr1+576+1536*i14)));
_mm512_storeu_ps(sumPtr1+640+1536*i14, _mm512_add_ps(sum163, _mm512_loadu_ps(sumPtr1+640+1536*i14)));
_mm512_storeu_ps(sumPtr1+-2238528+1536*i14, _mm512_add_ps(sum164, _mm512_loadu_ps(sumPtr1+-2238528+1536*i14)));
_mm512_storeu_ps(sumPtr1+768+1536*i14, _mm512_add_ps(sum165, _mm512_loadu_ps(sumPtr1+768+1536*i14)));
_mm512_storeu_ps(sumPtr1+832+1536*i14, _mm512_add_ps(sum166, _mm512_loadu_ps(sumPtr1+832+1536*i14)));
_mm512_storeu_ps(sumPtr1+896+1536*i14, _mm512_add_ps(sum167, _mm512_loadu_ps(sumPtr1+896+1536*i14)));
_mm512_storeu_ps(sumPtr1+-2238272+1536*i14, _mm512_add_ps(sum168, _mm512_loadu_ps(sumPtr1+-2238272+1536*i14)));
_mm512_storeu_ps(sumPtr1+1024+1536*i14, _mm512_add_ps(sum169, _mm512_loadu_ps(sumPtr1+1024+1536*i14)));
_mm512_storeu_ps(sumPtr1+1088+1536*i14, _mm512_add_ps(sum170, _mm512_loadu_ps(sumPtr1+1088+1536*i14)));
_mm512_storeu_ps(sumPtr1+1152+1536*i14, _mm512_add_ps(sum171, _mm512_loadu_ps(sumPtr1+1152+1536*i14)));
_mm512_storeu_ps(sumPtr1+-2238016+1536*i14, _mm512_add_ps(sum172, _mm512_loadu_ps(sumPtr1+-2238016+1536*i14)));
_mm512_storeu_ps(sumPtr1+1280+1536*i14, _mm512_add_ps(sum173, _mm512_loadu_ps(sumPtr1+1280+1536*i14)));
_mm512_storeu_ps(sumPtr1+1344+1536*i14, _mm512_add_ps(sum174, _mm512_loadu_ps(sumPtr1+1344+1536*i14)));
_mm512_storeu_ps(sumPtr1+1408+1536*i14, _mm512_add_ps(sum175, _mm512_loadu_ps(sumPtr1+1408+1536*i14)));
if (i14 >= ii8) return;
}
return;
}
(void)base1;
ptrdiff_t i15 = 2*w10;
ptrdiff_t ii9 = i15+1;
for (; i15 != 162; ++i15) {
__m512 sum176 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i15));
__m512 sum180 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i15));
__m512 sum184 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i15));
__m512 sum188 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i15));
__m512 sum192 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i15));
__m512 sum196 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i15));
__m512 sum177 = sum176;
__m512 sum178 = sum176;
__m512 sum179 = sum176;
__m512 sum181 = sum180;
__m512 sum182 = sum180;
__m512 sum183 = sum180;
__m512 sum185 = sum184;
__m512 sum186 = sum184;
__m512 sum187 = sum184;
__m512 sum189 = sum188;
__m512 sum190 = sum188;
__m512 sum191 = sum188;
__m512 sum193 = sum192;
__m512 sum194 = sum192;
__m512 sum195 = sum192;
__m512 sum197 = sum196;
__m512 sum198 = sum196;
__m512 sum199 = sum196;
for (ptrdiff_t j11 = 0; j11 < 318; ++j11) {
__m512 dat135 = _mm512_loadu_ps(datPtr2+0+256*j11);
__m512 dat136 = _mm512_loadu_ps(datPtr2+64+256*j11);
__m512 dat137 = _mm512_loadu_ps(datPtr2+128+256*j11);
__m512 dat138 = _mm512_loadu_ps(datPtr2+192+256*j11);
__m512 wt301 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i15+24*j11));
sum176 = _mm512_fmadd_ps(wt301, dat135, sum176);
sum177 = _mm512_fmadd_ps(wt301, dat136, sum177);
sum178 = _mm512_fmadd_ps(wt301, dat137, sum178);
sum179 = _mm512_fmadd_ps(wt301, dat138, sum179);
__m512 wt302 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i15+24*j11));
sum180 = _mm512_fmadd_ps(wt302, dat135, sum180);
sum181 = _mm512_fmadd_ps(wt302, dat136, sum181);
sum182 = _mm512_fmadd_ps(wt302, dat137, sum182);
sum183 = _mm512_fmadd_ps(wt302, dat138, sum183);
__m512 wt303 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i15+24*j11));
sum184 = _mm512_fmadd_ps(wt303, dat135, sum184);
sum185 = _mm512_fmadd_ps(wt303, dat136, sum185);
sum186 = _mm512_fmadd_ps(wt303, dat137, sum186);
sum187 = _mm512_fmadd_ps(wt303, dat138, sum187);
__m512 wt304 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i15+24*j11));
sum188 = _mm512_fmadd_ps(wt304, dat135, sum188);
sum189 = _mm512_fmadd_ps(wt304, dat136, sum189);
sum190 = _mm512_fmadd_ps(wt304, dat137, sum190);
sum191 = _mm512_fmadd_ps(wt304, dat138, sum191);
__m512 wt305 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i15+24*j11));
sum192 = _mm512_fmadd_ps(wt305, dat135, sum192);
sum193 = _mm512_fmadd_ps(wt305, dat136, sum193);
sum194 = _mm512_fmadd_ps(wt305, dat137, sum194);
sum195 = _mm512_fmadd_ps(wt305, dat138, sum195);
__m512 wt306 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i15+24*j11));
sum196 = _mm512_fmadd_ps(wt306, dat135, sum196);
sum197 = _mm512_fmadd_ps(wt306, dat136, sum197);
sum198 = _mm512_fmadd_ps(wt306, dat137, sum198);
sum199 = _mm512_fmadd_ps(wt306, dat138, sum199);
}
_mm512_storeu_ps(sumPtr1+-2239296+1536*i15, sum176);
_mm512_storeu_ps(sumPtr1+0+1536*i15, sum177);
_mm512_storeu_ps(sumPtr1+64+1536*i15, sum178);
_mm512_storeu_ps(sumPtr1+128+1536*i15, sum179);
_mm512_storeu_ps(sumPtr1+-2239040+1536*i15, sum180);
_mm512_storeu_ps(sumPtr1+256+1536*i15, sum181);
_mm512_storeu_ps(sumPtr1+320+1536*i15, sum182);
_mm512_storeu_ps(sumPtr1+384+1536*i15, sum183);
_mm512_storeu_ps(sumPtr1+-2238784+1536*i15, sum184);
_mm512_storeu_ps(sumPtr1+512+1536*i15, sum185);
_mm512_storeu_ps(sumPtr1+576+1536*i15, sum186);
_mm512_storeu_ps(sumPtr1+640+1536*i15, sum187);
_mm512_storeu_ps(sumPtr1+-2238528+1536*i15, sum188);
_mm512_storeu_ps(sumPtr1+768+1536*i15, sum189);
_mm512_storeu_ps(sumPtr1+832+1536*i15, sum190);
_mm512_storeu_ps(sumPtr1+896+1536*i15, sum191);
_mm512_storeu_ps(sumPtr1+-2238272+1536*i15, sum192);
_mm512_storeu_ps(sumPtr1+1024+1536*i15, sum193);
_mm512_storeu_ps(sumPtr1+1088+1536*i15, sum194);
_mm512_storeu_ps(sumPtr1+1152+1536*i15, sum195);
_mm512_storeu_ps(sumPtr1+-2238016+1536*i15, sum196);
_mm512_storeu_ps(sumPtr1+1280+1536*i15, sum197);
_mm512_storeu_ps(sumPtr1+1344+1536*i15, sum198);
_mm512_storeu_ps(sumPtr1+1408+1536*i15, sum199);
if (i15 >= ii9) return;
}
break;
}
case 4: {
if (epoch1|node6) {
if (!epoch1 && base1) {
ptrdiff_t i16 = 2*w10;
ptrdiff_t ii10 = i16+1;
for (; i16 != 162; ++i16) {
__m512 sum200 = _mm512_setzero_ps();
__m512 sum202 = _mm512_setzero_ps();
__m512 sum204 = _mm512_setzero_ps();
__m512 sum206 = _mm512_setzero_ps();
__m512 sum208 = _mm512_setzero_ps();
__m512 sum210 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum201 = sum200;
__m512 sum203 = sum202;
__m512 sum205 = sum204;
__m512 sum207 = sum206;
__m512 sum209 = sum208;
__m512 sum211 = sum210;
for (ptrdiff_t j12 = 0; j12 < 318; ++j12) {
__m512 dat139 = _mm512_loadu_ps(datPtr2+128+256*j12);
__m512 dat140 = _mm512_loadu_ps(datPtr2+192+256*j12);
__m512 wt307 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i16+24*j12));
sum200 = _mm512_fmadd_ps(wt307, dat139, sum200);
sum201 = _mm512_fmadd_ps(wt307, dat140, sum201);
__m512 wt308 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i16+24*j12));
sum202 = _mm512_fmadd_ps(wt308, dat139, sum202);
sum203 = _mm512_fmadd_ps(wt308, dat140, sum203);
__m512 wt309 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i16+24*j12));
sum204 = _mm512_fmadd_ps(wt309, dat139, sum204);
sum205 = _mm512_fmadd_ps(wt309, dat140, sum205);
__m512 wt310 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i16+24*j12));
sum206 = _mm512_fmadd_ps(wt310, dat139, sum206);
sum207 = _mm512_fmadd_ps(wt310, dat140, sum207);
__m512 wt311 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i16+24*j12));
sum208 = _mm512_fmadd_ps(wt311, dat139, sum208);
sum209 = _mm512_fmadd_ps(wt311, dat140, sum209);
__m512 wt312 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i16+24*j12));
sum210 = _mm512_fmadd_ps(wt312, dat139, sum210);
sum211 = _mm512_fmadd_ps(wt312, dat140, sum211);
}
_mm512_storeu_ps(sumPtr1+0+1536*i16, sum200);
_mm512_storeu_ps(sumPtr1+64+1536*i16, sum201);
_mm512_storeu_ps(sumPtr1+256+1536*i16, sum202);
_mm512_storeu_ps(sumPtr1+320+1536*i16, sum203);
_mm512_storeu_ps(sumPtr1+512+1536*i16, sum204);
_mm512_storeu_ps(sumPtr1+576+1536*i16, sum205);
_mm512_storeu_ps(sumPtr1+768+1536*i16, sum206);
_mm512_storeu_ps(sumPtr1+832+1536*i16, sum207);
_mm512_storeu_ps(sumPtr1+1024+1536*i16, sum208);
_mm512_storeu_ps(sumPtr1+1088+1536*i16, sum209);
_mm512_storeu_ps(sumPtr1+1280+1536*i16, sum210);
_mm512_storeu_ps(sumPtr1+1344+1536*i16, sum211);
if (i16 >= ii10) return;
}
return;
}
ptrdiff_t i17 = 2*w10;
ptrdiff_t ii11 = i17+1;
for (; i17 != 162; ++i17) {
__m512 sum212 = _mm512_setzero_ps();
__m512 sum214 = _mm512_setzero_ps();
__m512 sum216 = _mm512_setzero_ps();
__m512 sum218 = _mm512_setzero_ps();
__m512 sum220 = _mm512_setzero_ps();
__m512 sum222 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum213 = sum212;
__m512 sum215 = sum214;
__m512 sum217 = sum216;
__m512 sum219 = sum218;
__m512 sum221 = sum220;
__m512 sum223 = sum222;
for (ptrdiff_t j13 = 0; j13 < 318; ++j13) {
__m512 dat141 = _mm512_loadu_ps(datPtr2+128+256*j13);
__m512 dat142 = _mm512_loadu_ps(datPtr2+192+256*j13);
__m512 wt313 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i17+24*j13));
sum212 = _mm512_fmadd_ps(wt313, dat141, sum212);
sum213 = _mm512_fmadd_ps(wt313, dat142, sum213);
__m512 wt314 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i17+24*j13));
sum214 = _mm512_fmadd_ps(wt314, dat141, sum214);
sum215 = _mm512_fmadd_ps(wt314, dat142, sum215);
__m512 wt315 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i17+24*j13));
sum216 = _mm512_fmadd_ps(wt315, dat141, sum216);
sum217 = _mm512_fmadd_ps(wt315, dat142, sum217);
__m512 wt316 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i17+24*j13));
sum218 = _mm512_fmadd_ps(wt316, dat141, sum218);
sum219 = _mm512_fmadd_ps(wt316, dat142, sum219);
__m512 wt317 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i17+24*j13));
sum220 = _mm512_fmadd_ps(wt317, dat141, sum220);
sum221 = _mm512_fmadd_ps(wt317, dat142, sum221);
__m512 wt318 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i17+24*j13));
sum222 = _mm512_fmadd_ps(wt318, dat141, sum222);
sum223 = _mm512_fmadd_ps(wt318, dat142, sum223);
}
_mm512_storeu_ps(sumPtr1+0+1536*i17, _mm512_add_ps(sum212, _mm512_loadu_ps(sumPtr1+0+1536*i17)));
_mm512_storeu_ps(sumPtr1+64+1536*i17, _mm512_add_ps(sum213, _mm512_loadu_ps(sumPtr1+64+1536*i17)));
_mm512_storeu_ps(sumPtr1+256+1536*i17, _mm512_add_ps(sum214, _mm512_loadu_ps(sumPtr1+256+1536*i17)));
_mm512_storeu_ps(sumPtr1+320+1536*i17, _mm512_add_ps(sum215, _mm512_loadu_ps(sumPtr1+320+1536*i17)));
_mm512_storeu_ps(sumPtr1+512+1536*i17, _mm512_add_ps(sum216, _mm512_loadu_ps(sumPtr1+512+1536*i17)));
_mm512_storeu_ps(sumPtr1+576+1536*i17, _mm512_add_ps(sum217, _mm512_loadu_ps(sumPtr1+576+1536*i17)));
_mm512_storeu_ps(sumPtr1+768+1536*i17, _mm512_add_ps(sum218, _mm512_loadu_ps(sumPtr1+768+1536*i17)));
_mm512_storeu_ps(sumPtr1+832+1536*i17, _mm512_add_ps(sum219, _mm512_loadu_ps(sumPtr1+832+1536*i17)));
_mm512_storeu_ps(sumPtr1+1024+1536*i17, _mm512_add_ps(sum220, _mm512_loadu_ps(sumPtr1+1024+1536*i17)));
_mm512_storeu_ps(sumPtr1+1088+1536*i17, _mm512_add_ps(sum221, _mm512_loadu_ps(sumPtr1+1088+1536*i17)));
_mm512_storeu_ps(sumPtr1+1280+1536*i17, _mm512_add_ps(sum222, _mm512_loadu_ps(sumPtr1+1280+1536*i17)));
_mm512_storeu_ps(sumPtr1+1344+1536*i17, _mm512_add_ps(sum223, _mm512_loadu_ps(sumPtr1+1344+1536*i17)));
if (i17 >= ii11) return;
}
return;
}
(void)base1;
ptrdiff_t i18 = 2*w10;
ptrdiff_t ii12 = i18+1;
for (; i18 != 162; ++i18) {
__m512 sum224 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i18));
__m512 sum226 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i18));
__m512 sum228 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i18));
__m512 sum230 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i18));
__m512 sum232 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i18));
__m512 sum234 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i18));
__m512 sum225 = sum224;
__m512 sum227 = sum226;
__m512 sum229 = sum228;
__m512 sum231 = sum230;
__m512 sum233 = sum232;
__m512 sum235 = sum234;
for (ptrdiff_t j14 = 0; j14 < 318; ++j14) {
__m512 dat143 = _mm512_loadu_ps(datPtr2+128+256*j14);
__m512 dat144 = _mm512_loadu_ps(datPtr2+192+256*j14);
__m512 wt319 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i18+24*j14));
sum224 = _mm512_fmadd_ps(wt319, dat143, sum224);
sum225 = _mm512_fmadd_ps(wt319, dat144, sum225);
__m512 wt320 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i18+24*j14));
sum226 = _mm512_fmadd_ps(wt320, dat143, sum226);
sum227 = _mm512_fmadd_ps(wt320, dat144, sum227);
__m512 wt321 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i18+24*j14));
sum228 = _mm512_fmadd_ps(wt321, dat143, sum228);
sum229 = _mm512_fmadd_ps(wt321, dat144, sum229);
__m512 wt322 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i18+24*j14));
sum230 = _mm512_fmadd_ps(wt322, dat143, sum230);
sum231 = _mm512_fmadd_ps(wt322, dat144, sum231);
__m512 wt323 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i18+24*j14));
sum232 = _mm512_fmadd_ps(wt323, dat143, sum232);
sum233 = _mm512_fmadd_ps(wt323, dat144, sum233);
__m512 wt324 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i18+24*j14));
sum234 = _mm512_fmadd_ps(wt324, dat143, sum234);
sum235 = _mm512_fmadd_ps(wt324, dat144, sum235);
}
_mm512_storeu_ps(sumPtr1+0+1536*i18, sum224);
_mm512_storeu_ps(sumPtr1+64+1536*i18, sum225);
_mm512_storeu_ps(sumPtr1+256+1536*i18, sum226);
_mm512_storeu_ps(sumPtr1+320+1536*i18, sum227);
_mm512_storeu_ps(sumPtr1+512+1536*i18, sum228);
_mm512_storeu_ps(sumPtr1+576+1536*i18, sum229);
_mm512_storeu_ps(sumPtr1+768+1536*i18, sum230);
_mm512_storeu_ps(sumPtr1+832+1536*i18, sum231);
_mm512_storeu_ps(sumPtr1+1024+1536*i18, sum232);
_mm512_storeu_ps(sumPtr1+1088+1536*i18, sum233);
_mm512_storeu_ps(sumPtr1+1280+1536*i18, sum234);
_mm512_storeu_ps(sumPtr1+1344+1536*i18, sum235);
if (i18 >= ii12) return;
}
break;
}
case 5: {
if (epoch1|node6) {
if (!epoch1 && base1) {
ptrdiff_t i19 = 2*w10;
ptrdiff_t ii13 = i19+1;
for (; i19 != 162; ++i19) {
__m512 sum236 = _mm512_setzero_ps();
__m512 sum240 = _mm512_setzero_ps();
__m512 sum244 = _mm512_setzero_ps();
__m512 sum248 = _mm512_setzero_ps();
__m512 sum252 = _mm512_setzero_ps();
__m512 sum256 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum237 = sum236;
__m512 sum238 = sum236;
__m512 sum239 = sum236;
__m512 sum241 = sum240;
__m512 sum242 = sum240;
__m512 sum243 = sum240;
__m512 sum245 = sum244;
__m512 sum246 = sum244;
__m512 sum247 = sum244;
__m512 sum249 = sum248;
__m512 sum250 = sum248;
__m512 sum251 = sum248;
__m512 sum253 = sum252;
__m512 sum254 = sum252;
__m512 sum255 = sum252;
__m512 sum257 = sum256;
__m512 sum258 = sum256;
__m512 sum259 = sum256;
for (ptrdiff_t j15 = 0; j15 < 318; ++j15) {
__m512 dat145 = _mm512_loadu_ps(datPtr2+0+256*j15);
__m512 dat146 = _mm512_loadu_ps(datPtr2+64+256*j15);
__m512 dat147 = _mm512_loadu_ps(datPtr2+128+256*j15);
__m512 dat148 = _mm512_loadu_ps(datPtr2+192+256*j15);
__m512 wt325 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i19+24*j15));
sum236 = _mm512_fmadd_ps(wt325, dat145, sum236);
sum237 = _mm512_fmadd_ps(wt325, dat146, sum237);
sum238 = _mm512_fmadd_ps(wt325, dat147, sum238);
sum239 = _mm512_fmadd_ps(wt325, dat148, sum239);
__m512 wt326 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i19+24*j15));
sum240 = _mm512_fmadd_ps(wt326, dat145, sum240);
sum241 = _mm512_fmadd_ps(wt326, dat146, sum241);
sum242 = _mm512_fmadd_ps(wt326, dat147, sum242);
sum243 = _mm512_fmadd_ps(wt326, dat148, sum243);
__m512 wt327 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i19+24*j15));
sum244 = _mm512_fmadd_ps(wt327, dat145, sum244);
sum245 = _mm512_fmadd_ps(wt327, dat146, sum245);
sum246 = _mm512_fmadd_ps(wt327, dat147, sum246);
sum247 = _mm512_fmadd_ps(wt327, dat148, sum247);
__m512 wt328 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i19+24*j15));
sum248 = _mm512_fmadd_ps(wt328, dat145, sum248);
sum249 = _mm512_fmadd_ps(wt328, dat146, sum249);
sum250 = _mm512_fmadd_ps(wt328, dat147, sum250);
sum251 = _mm512_fmadd_ps(wt328, dat148, sum251);
__m512 wt329 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i19+24*j15));
sum252 = _mm512_fmadd_ps(wt329, dat145, sum252);
sum253 = _mm512_fmadd_ps(wt329, dat146, sum253);
sum254 = _mm512_fmadd_ps(wt329, dat147, sum254);
sum255 = _mm512_fmadd_ps(wt329, dat148, sum255);
__m512 wt330 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i19+24*j15));
sum256 = _mm512_fmadd_ps(wt330, dat145, sum256);
sum257 = _mm512_fmadd_ps(wt330, dat146, sum257);
sum258 = _mm512_fmadd_ps(wt330, dat147, sum258);
sum259 = _mm512_fmadd_ps(wt330, dat148, sum259);
}
_mm512_storeu_ps(sumPtr1+-2239360+1536*i19, sum236);
_mm512_storeu_ps(sumPtr1+-2239296+1536*i19, sum237);
_mm512_storeu_ps(sumPtr1+0+1536*i19, sum238);
_mm512_storeu_ps(sumPtr1+64+1536*i19, sum239);
_mm512_storeu_ps(sumPtr1+-2239104+1536*i19, sum240);
_mm512_storeu_ps(sumPtr1+-2239040+1536*i19, sum241);
_mm512_storeu_ps(sumPtr1+256+1536*i19, sum242);
_mm512_storeu_ps(sumPtr1+320+1536*i19, sum243);
_mm512_storeu_ps(sumPtr1+-2238848+1536*i19, sum244);
_mm512_storeu_ps(sumPtr1+-2238784+1536*i19, sum245);
_mm512_storeu_ps(sumPtr1+512+1536*i19, sum246);
_mm512_storeu_ps(sumPtr1+576+1536*i19, sum247);
_mm512_storeu_ps(sumPtr1+-2238592+1536*i19, sum248);
_mm512_storeu_ps(sumPtr1+-2238528+1536*i19, sum249);
_mm512_storeu_ps(sumPtr1+768+1536*i19, sum250);
_mm512_storeu_ps(sumPtr1+832+1536*i19, sum251);
_mm512_storeu_ps(sumPtr1+-2238336+1536*i19, sum252);
_mm512_storeu_ps(sumPtr1+-2238272+1536*i19, sum253);
_mm512_storeu_ps(sumPtr1+1024+1536*i19, sum254);
_mm512_storeu_ps(sumPtr1+1088+1536*i19, sum255);
_mm512_storeu_ps(sumPtr1+-2238080+1536*i19, sum256);
_mm512_storeu_ps(sumPtr1+-2238016+1536*i19, sum257);
_mm512_storeu_ps(sumPtr1+1280+1536*i19, sum258);
_mm512_storeu_ps(sumPtr1+1344+1536*i19, sum259);
if (i19 >= ii13) return;
}
return;
}
ptrdiff_t i20 = 2*w10;
ptrdiff_t ii14 = i20+1;
for (; i20 != 162; ++i20) {
__m512 sum260 = _mm512_setzero_ps();
__m512 sum264 = _mm512_setzero_ps();
__m512 sum268 = _mm512_setzero_ps();
__m512 sum272 = _mm512_setzero_ps();
__m512 sum276 = _mm512_setzero_ps();
__m512 sum280 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum261 = sum260;
__m512 sum262 = sum260;
__m512 sum263 = sum260;
__m512 sum265 = sum264;
__m512 sum266 = sum264;
__m512 sum267 = sum264;
__m512 sum269 = sum268;
__m512 sum270 = sum268;
__m512 sum271 = sum268;
__m512 sum273 = sum272;
__m512 sum274 = sum272;
__m512 sum275 = sum272;
__m512 sum277 = sum276;
__m512 sum278 = sum276;
__m512 sum279 = sum276;
__m512 sum281 = sum280;
__m512 sum282 = sum280;
__m512 sum283 = sum280;
for (ptrdiff_t j16 = 0; j16 < 318; ++j16) {
__m512 dat149 = _mm512_loadu_ps(datPtr2+0+256*j16);
__m512 dat150 = _mm512_loadu_ps(datPtr2+64+256*j16);
__m512 dat151 = _mm512_loadu_ps(datPtr2+128+256*j16);
__m512 dat152 = _mm512_loadu_ps(datPtr2+192+256*j16);
__m512 wt331 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i20+24*j16));
sum260 = _mm512_fmadd_ps(wt331, dat149, sum260);
sum261 = _mm512_fmadd_ps(wt331, dat150, sum261);
sum262 = _mm512_fmadd_ps(wt331, dat151, sum262);
sum263 = _mm512_fmadd_ps(wt331, dat152, sum263);
__m512 wt332 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i20+24*j16));
sum264 = _mm512_fmadd_ps(wt332, dat149, sum264);
sum265 = _mm512_fmadd_ps(wt332, dat150, sum265);
sum266 = _mm512_fmadd_ps(wt332, dat151, sum266);
sum267 = _mm512_fmadd_ps(wt332, dat152, sum267);
__m512 wt333 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i20+24*j16));
sum268 = _mm512_fmadd_ps(wt333, dat149, sum268);
sum269 = _mm512_fmadd_ps(wt333, dat150, sum269);
sum270 = _mm512_fmadd_ps(wt333, dat151, sum270);
sum271 = _mm512_fmadd_ps(wt333, dat152, sum271);
__m512 wt334 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i20+24*j16));
sum272 = _mm512_fmadd_ps(wt334, dat149, sum272);
sum273 = _mm512_fmadd_ps(wt334, dat150, sum273);
sum274 = _mm512_fmadd_ps(wt334, dat151, sum274);
sum275 = _mm512_fmadd_ps(wt334, dat152, sum275);
__m512 wt335 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i20+24*j16));
sum276 = _mm512_fmadd_ps(wt335, dat149, sum276);
sum277 = _mm512_fmadd_ps(wt335, dat150, sum277);
sum278 = _mm512_fmadd_ps(wt335, dat151, sum278);
sum279 = _mm512_fmadd_ps(wt335, dat152, sum279);
__m512 wt336 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i20+24*j16));
sum280 = _mm512_fmadd_ps(wt336, dat149, sum280);
sum281 = _mm512_fmadd_ps(wt336, dat150, sum281);
sum282 = _mm512_fmadd_ps(wt336, dat151, sum282);
sum283 = _mm512_fmadd_ps(wt336, dat152, sum283);
}
_mm512_storeu_ps(sumPtr1+-2239360+1536*i20, _mm512_add_ps(sum260, _mm512_loadu_ps(sumPtr1+-2239360+1536*i20)));
_mm512_storeu_ps(sumPtr1+-2239296+1536*i20, _mm512_add_ps(sum261, _mm512_loadu_ps(sumPtr1+-2239296+1536*i20)));
_mm512_storeu_ps(sumPtr1+0+1536*i20, _mm512_add_ps(sum262, _mm512_loadu_ps(sumPtr1+0+1536*i20)));
_mm512_storeu_ps(sumPtr1+64+1536*i20, _mm512_add_ps(sum263, _mm512_loadu_ps(sumPtr1+64+1536*i20)));
_mm512_storeu_ps(sumPtr1+-2239104+1536*i20, _mm512_add_ps(sum264, _mm512_loadu_ps(sumPtr1+-2239104+1536*i20)));
_mm512_storeu_ps(sumPtr1+-2239040+1536*i20, _mm512_add_ps(sum265, _mm512_loadu_ps(sumPtr1+-2239040+1536*i20)));
_mm512_storeu_ps(sumPtr1+256+1536*i20, _mm512_add_ps(sum266, _mm512_loadu_ps(sumPtr1+256+1536*i20)));
_mm512_storeu_ps(sumPtr1+320+1536*i20, _mm512_add_ps(sum267, _mm512_loadu_ps(sumPtr1+320+1536*i20)));
_mm512_storeu_ps(sumPtr1+-2238848+1536*i20, _mm512_add_ps(sum268, _mm512_loadu_ps(sumPtr1+-2238848+1536*i20)));
_mm512_storeu_ps(sumPtr1+-2238784+1536*i20, _mm512_add_ps(sum269, _mm512_loadu_ps(sumPtr1+-2238784+1536*i20)));
_mm512_storeu_ps(sumPtr1+512+1536*i20, _mm512_add_ps(sum270, _mm512_loadu_ps(sumPtr1+512+1536*i20)));
_mm512_storeu_ps(sumPtr1+576+1536*i20, _mm512_add_ps(sum271, _mm512_loadu_ps(sumPtr1+576+1536*i20)));
_mm512_storeu_ps(sumPtr1+-2238592+1536*i20, _mm512_add_ps(sum272, _mm512_loadu_ps(sumPtr1+-2238592+1536*i20)));
_mm512_storeu_ps(sumPtr1+-2238528+1536*i20, _mm512_add_ps(sum273, _mm512_loadu_ps(sumPtr1+-2238528+1536*i20)));
_mm512_storeu_ps(sumPtr1+768+1536*i20, _mm512_add_ps(sum274, _mm512_loadu_ps(sumPtr1+768+1536*i20)));
_mm512_storeu_ps(sumPtr1+832+1536*i20, _mm512_add_ps(sum275, _mm512_loadu_ps(sumPtr1+832+1536*i20)));
_mm512_storeu_ps(sumPtr1+-2238336+1536*i20, _mm512_add_ps(sum276, _mm512_loadu_ps(sumPtr1+-2238336+1536*i20)));
_mm512_storeu_ps(sumPtr1+-2238272+1536*i20, _mm512_add_ps(sum277, _mm512_loadu_ps(sumPtr1+-2238272+1536*i20)));
_mm512_storeu_ps(sumPtr1+1024+1536*i20, _mm512_add_ps(sum278, _mm512_loadu_ps(sumPtr1+1024+1536*i20)));
_mm512_storeu_ps(sumPtr1+1088+1536*i20, _mm512_add_ps(sum279, _mm512_loadu_ps(sumPtr1+1088+1536*i20)));
_mm512_storeu_ps(sumPtr1+-2238080+1536*i20, _mm512_add_ps(sum280, _mm512_loadu_ps(sumPtr1+-2238080+1536*i20)));
_mm512_storeu_ps(sumPtr1+-2238016+1536*i20, _mm512_add_ps(sum281, _mm512_loadu_ps(sumPtr1+-2238016+1536*i20)));
_mm512_storeu_ps(sumPtr1+1280+1536*i20, _mm512_add_ps(sum282, _mm512_loadu_ps(sumPtr1+1280+1536*i20)));
_mm512_storeu_ps(sumPtr1+1344+1536*i20, _mm512_add_ps(sum283, _mm512_loadu_ps(sumPtr1+1344+1536*i20)));
if (i20 >= ii14) return;
}
return;
}
(void)base1;
ptrdiff_t i21 = 2*w10;
ptrdiff_t ii15 = i21+1;
for (; i21 != 162; ++i21) {
__m512 sum284 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i21));
__m512 sum288 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i21));
__m512 sum292 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i21));
__m512 sum296 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i21));
__m512 sum300 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i21));
__m512 sum304 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i21));
__m512 sum285 = sum284;
__m512 sum286 = sum284;
__m512 sum287 = sum284;
__m512 sum289 = sum288;
__m512 sum290 = sum288;
__m512 sum291 = sum288;
__m512 sum293 = sum292;
__m512 sum294 = sum292;
__m512 sum295 = sum292;
__m512 sum297 = sum296;
__m512 sum298 = sum296;
__m512 sum299 = sum296;
__m512 sum301 = sum300;
__m512 sum302 = sum300;
__m512 sum303 = sum300;
__m512 sum305 = sum304;
__m512 sum306 = sum304;
__m512 sum307 = sum304;
for (ptrdiff_t j17 = 0; j17 < 318; ++j17) {
__m512 dat153 = _mm512_loadu_ps(datPtr2+0+256*j17);
__m512 dat154 = _mm512_loadu_ps(datPtr2+64+256*j17);
__m512 dat155 = _mm512_loadu_ps(datPtr2+128+256*j17);
__m512 dat156 = _mm512_loadu_ps(datPtr2+192+256*j17);
__m512 wt337 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i21+24*j17));
sum284 = _mm512_fmadd_ps(wt337, dat153, sum284);
sum285 = _mm512_fmadd_ps(wt337, dat154, sum285);
sum286 = _mm512_fmadd_ps(wt337, dat155, sum286);
sum287 = _mm512_fmadd_ps(wt337, dat156, sum287);
__m512 wt338 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i21+24*j17));
sum288 = _mm512_fmadd_ps(wt338, dat153, sum288);
sum289 = _mm512_fmadd_ps(wt338, dat154, sum289);
sum290 = _mm512_fmadd_ps(wt338, dat155, sum290);
sum291 = _mm512_fmadd_ps(wt338, dat156, sum291);
__m512 wt339 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i21+24*j17));
sum292 = _mm512_fmadd_ps(wt339, dat153, sum292);
sum293 = _mm512_fmadd_ps(wt339, dat154, sum293);
sum294 = _mm512_fmadd_ps(wt339, dat155, sum294);
sum295 = _mm512_fmadd_ps(wt339, dat156, sum295);
__m512 wt340 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i21+24*j17));
sum296 = _mm512_fmadd_ps(wt340, dat153, sum296);
sum297 = _mm512_fmadd_ps(wt340, dat154, sum297);
sum298 = _mm512_fmadd_ps(wt340, dat155, sum298);
sum299 = _mm512_fmadd_ps(wt340, dat156, sum299);
__m512 wt341 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i21+24*j17));
sum300 = _mm512_fmadd_ps(wt341, dat153, sum300);
sum301 = _mm512_fmadd_ps(wt341, dat154, sum301);
sum302 = _mm512_fmadd_ps(wt341, dat155, sum302);
sum303 = _mm512_fmadd_ps(wt341, dat156, sum303);
__m512 wt342 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i21+24*j17));
sum304 = _mm512_fmadd_ps(wt342, dat153, sum304);
sum305 = _mm512_fmadd_ps(wt342, dat154, sum305);
sum306 = _mm512_fmadd_ps(wt342, dat155, sum306);
sum307 = _mm512_fmadd_ps(wt342, dat156, sum307);
}
_mm512_storeu_ps(sumPtr1+-2239360+1536*i21, sum284);
_mm512_storeu_ps(sumPtr1+-2239296+1536*i21, sum285);
_mm512_storeu_ps(sumPtr1+0+1536*i21, sum286);
_mm512_storeu_ps(sumPtr1+64+1536*i21, sum287);
_mm512_storeu_ps(sumPtr1+-2239104+1536*i21, sum288);
_mm512_storeu_ps(sumPtr1+-2239040+1536*i21, sum289);
_mm512_storeu_ps(sumPtr1+256+1536*i21, sum290);
_mm512_storeu_ps(sumPtr1+320+1536*i21, sum291);
_mm512_storeu_ps(sumPtr1+-2238848+1536*i21, sum292);
_mm512_storeu_ps(sumPtr1+-2238784+1536*i21, sum293);
_mm512_storeu_ps(sumPtr1+512+1536*i21, sum294);
_mm512_storeu_ps(sumPtr1+576+1536*i21, sum295);
_mm512_storeu_ps(sumPtr1+-2238592+1536*i21, sum296);
_mm512_storeu_ps(sumPtr1+-2238528+1536*i21, sum297);
_mm512_storeu_ps(sumPtr1+768+1536*i21, sum298);
_mm512_storeu_ps(sumPtr1+832+1536*i21, sum299);
_mm512_storeu_ps(sumPtr1+-2238336+1536*i21, sum300);
_mm512_storeu_ps(sumPtr1+-2238272+1536*i21, sum301);
_mm512_storeu_ps(sumPtr1+1024+1536*i21, sum302);
_mm512_storeu_ps(sumPtr1+1088+1536*i21, sum303);
_mm512_storeu_ps(sumPtr1+-2238080+1536*i21, sum304);
_mm512_storeu_ps(sumPtr1+-2238016+1536*i21, sum305);
_mm512_storeu_ps(sumPtr1+1280+1536*i21, sum306);
_mm512_storeu_ps(sumPtr1+1344+1536*i21, sum307);
if (i21 >= ii15) return;
}
break;
}
case 6: {
if (epoch1|node6) {
if (!epoch1 && base1) {
ptrdiff_t i22 = 2*w10;
ptrdiff_t ii16 = i22+1;
for (; i22 != 162; ++i22) {
__m512 sum308 = _mm512_setzero_ps();
__m512 sum309 = _mm512_setzero_ps();
__m512 sum310 = _mm512_setzero_ps();
__m512 sum311 = _mm512_setzero_ps();
__m512 sum312 = _mm512_setzero_ps();
__m512 sum313 = _mm512_setzero_ps();
(void)biasPtr2;
for (ptrdiff_t j18 = 0; j18 < 318; ++j18) {
__m512 dat157 = _mm512_loadu_ps(datPtr2+192+256*j18);
__m512 wt343 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i22+24*j18));
sum308 = _mm512_fmadd_ps(wt343, dat157, sum308);
__m512 wt344 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i22+24*j18));
sum309 = _mm512_fmadd_ps(wt344, dat157, sum309);
__m512 wt345 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i22+24*j18));
sum310 = _mm512_fmadd_ps(wt345, dat157, sum310);
__m512 wt346 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i22+24*j18));
sum311 = _mm512_fmadd_ps(wt346, dat157, sum311);
__m512 wt347 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i22+24*j18));
sum312 = _mm512_fmadd_ps(wt347, dat157, sum312);
__m512 wt348 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i22+24*j18));
sum313 = _mm512_fmadd_ps(wt348, dat157, sum313);
}
_mm512_storeu_ps(sumPtr1+0+1536*i22, sum308);
_mm512_storeu_ps(sumPtr1+256+1536*i22, sum309);
_mm512_storeu_ps(sumPtr1+512+1536*i22, sum310);
_mm512_storeu_ps(sumPtr1+768+1536*i22, sum311);
_mm512_storeu_ps(sumPtr1+1024+1536*i22, sum312);
_mm512_storeu_ps(sumPtr1+1280+1536*i22, sum313);
if (i22 >= ii16) return;
}
return;
}
ptrdiff_t i23 = 2*w10;
ptrdiff_t ii17 = i23+1;
for (; i23 != 162; ++i23) {
__m512 sum314 = _mm512_setzero_ps();
__m512 sum315 = _mm512_setzero_ps();
__m512 sum316 = _mm512_setzero_ps();
__m512 sum317 = _mm512_setzero_ps();
__m512 sum318 = _mm512_setzero_ps();
__m512 sum319 = _mm512_setzero_ps();
(void)biasPtr2;
for (ptrdiff_t j19 = 0; j19 < 318; ++j19) {
__m512 dat158 = _mm512_loadu_ps(datPtr2+192+256*j19);
__m512 wt349 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i23+24*j19));
sum314 = _mm512_fmadd_ps(wt349, dat158, sum314);
__m512 wt350 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i23+24*j19));
sum315 = _mm512_fmadd_ps(wt350, dat158, sum315);
__m512 wt351 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i23+24*j19));
sum316 = _mm512_fmadd_ps(wt351, dat158, sum316);
__m512 wt352 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i23+24*j19));
sum317 = _mm512_fmadd_ps(wt352, dat158, sum317);
__m512 wt353 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i23+24*j19));
sum318 = _mm512_fmadd_ps(wt353, dat158, sum318);
__m512 wt354 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i23+24*j19));
sum319 = _mm512_fmadd_ps(wt354, dat158, sum319);
}
_mm512_storeu_ps(sumPtr1+0+1536*i23, _mm512_add_ps(sum314, _mm512_loadu_ps(sumPtr1+0+1536*i23)));
_mm512_storeu_ps(sumPtr1+256+1536*i23, _mm512_add_ps(sum315, _mm512_loadu_ps(sumPtr1+256+1536*i23)));
_mm512_storeu_ps(sumPtr1+512+1536*i23, _mm512_add_ps(sum316, _mm512_loadu_ps(sumPtr1+512+1536*i23)));
_mm512_storeu_ps(sumPtr1+768+1536*i23, _mm512_add_ps(sum317, _mm512_loadu_ps(sumPtr1+768+1536*i23)));
_mm512_storeu_ps(sumPtr1+1024+1536*i23, _mm512_add_ps(sum318, _mm512_loadu_ps(sumPtr1+1024+1536*i23)));
_mm512_storeu_ps(sumPtr1+1280+1536*i23, _mm512_add_ps(sum319, _mm512_loadu_ps(sumPtr1+1280+1536*i23)));
if (i23 >= ii17) return;
}
return;
}
(void)base1;
ptrdiff_t i24 = 2*w10;
ptrdiff_t ii18 = i24+1;
for (; i24 != 162; ++i24) {
__m512 sum320 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i24));
__m512 sum321 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i24));
__m512 sum322 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i24));
__m512 sum323 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i24));
__m512 sum324 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i24));
__m512 sum325 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i24));
for (ptrdiff_t j20 = 0; j20 < 318; ++j20) {
__m512 dat159 = _mm512_loadu_ps(datPtr2+192+256*j20);
__m512 wt355 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i24+24*j20));
sum320 = _mm512_fmadd_ps(wt355, dat159, sum320);
__m512 wt356 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i24+24*j20));
sum321 = _mm512_fmadd_ps(wt356, dat159, sum321);
__m512 wt357 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i24+24*j20));
sum322 = _mm512_fmadd_ps(wt357, dat159, sum322);
__m512 wt358 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i24+24*j20));
sum323 = _mm512_fmadd_ps(wt358, dat159, sum323);
__m512 wt359 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i24+24*j20));
sum324 = _mm512_fmadd_ps(wt359, dat159, sum324);
__m512 wt360 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i24+24*j20));
sum325 = _mm512_fmadd_ps(wt360, dat159, sum325);
}
_mm512_storeu_ps(sumPtr1+0+1536*i24, sum320);
_mm512_storeu_ps(sumPtr1+256+1536*i24, sum321);
_mm512_storeu_ps(sumPtr1+512+1536*i24, sum322);
_mm512_storeu_ps(sumPtr1+768+1536*i24, sum323);
_mm512_storeu_ps(sumPtr1+1024+1536*i24, sum324);
_mm512_storeu_ps(sumPtr1+1280+1536*i24, sum325);
if (i24 >= ii18) return;
}
break;
}
case 7: {
if (epoch1|node6) {
if (!epoch1 && base1) {
ptrdiff_t i25 = 2*w10;
ptrdiff_t ii19 = i25+1;
for (; i25 != 162; ++i25) {
__m512 sum326 = _mm512_setzero_ps();
__m512 sum330 = _mm512_setzero_ps();
__m512 sum334 = _mm512_setzero_ps();
__m512 sum338 = _mm512_setzero_ps();
__m512 sum342 = _mm512_setzero_ps();
__m512 sum346 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum327 = sum326;
__m512 sum328 = sum326;
__m512 sum329 = sum326;
__m512 sum331 = sum330;
__m512 sum332 = sum330;
__m512 sum333 = sum330;
__m512 sum335 = sum334;
__m512 sum336 = sum334;
__m512 sum337 = sum334;
__m512 sum339 = sum338;
__m512 sum340 = sum338;
__m512 sum341 = sum338;
__m512 sum343 = sum342;
__m512 sum344 = sum342;
__m512 sum345 = sum342;
__m512 sum347 = sum346;
__m512 sum348 = sum346;
__m512 sum349 = sum346;
for (ptrdiff_t j21 = 0; j21 < 318; ++j21) {
__m512 dat160 = _mm512_loadu_ps(datPtr2+0+256*j21);
__m512 dat161 = _mm512_loadu_ps(datPtr2+64+256*j21);
__m512 dat162 = _mm512_loadu_ps(datPtr2+128+256*j21);
__m512 dat163 = _mm512_loadu_ps(datPtr2+192+256*j21);
__m512 wt361 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i25+24*j21));
sum326 = _mm512_fmadd_ps(wt361, dat160, sum326);
sum327 = _mm512_fmadd_ps(wt361, dat161, sum327);
sum328 = _mm512_fmadd_ps(wt361, dat162, sum328);
sum329 = _mm512_fmadd_ps(wt361, dat163, sum329);
__m512 wt362 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i25+24*j21));
sum330 = _mm512_fmadd_ps(wt362, dat160, sum330);
sum331 = _mm512_fmadd_ps(wt362, dat161, sum331);
sum332 = _mm512_fmadd_ps(wt362, dat162, sum332);
sum333 = _mm512_fmadd_ps(wt362, dat163, sum333);
__m512 wt363 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i25+24*j21));
sum334 = _mm512_fmadd_ps(wt363, dat160, sum334);
sum335 = _mm512_fmadd_ps(wt363, dat161, sum335);
sum336 = _mm512_fmadd_ps(wt363, dat162, sum336);
sum337 = _mm512_fmadd_ps(wt363, dat163, sum337);
__m512 wt364 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i25+24*j21));
sum338 = _mm512_fmadd_ps(wt364, dat160, sum338);
sum339 = _mm512_fmadd_ps(wt364, dat161, sum339);
sum340 = _mm512_fmadd_ps(wt364, dat162, sum340);
sum341 = _mm512_fmadd_ps(wt364, dat163, sum341);
__m512 wt365 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i25+24*j21));
sum342 = _mm512_fmadd_ps(wt365, dat160, sum342);
sum343 = _mm512_fmadd_ps(wt365, dat161, sum343);
sum344 = _mm512_fmadd_ps(wt365, dat162, sum344);
sum345 = _mm512_fmadd_ps(wt365, dat163, sum345);
__m512 wt366 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i25+24*j21));
sum346 = _mm512_fmadd_ps(wt366, dat160, sum346);
sum347 = _mm512_fmadd_ps(wt366, dat161, sum347);
sum348 = _mm512_fmadd_ps(wt366, dat162, sum348);
sum349 = _mm512_fmadd_ps(wt366, dat163, sum349);
}
_mm512_storeu_ps(sumPtr1+-2239424+1536*i25, sum326);
_mm512_storeu_ps(sumPtr1+-2239360+1536*i25, sum327);
_mm512_storeu_ps(sumPtr1+-2239296+1536*i25, sum328);
_mm512_storeu_ps(sumPtr1+0+1536*i25, sum329);
_mm512_storeu_ps(sumPtr1+-2239168+1536*i25, sum330);
_mm512_storeu_ps(sumPtr1+-2239104+1536*i25, sum331);
_mm512_storeu_ps(sumPtr1+-2239040+1536*i25, sum332);
_mm512_storeu_ps(sumPtr1+256+1536*i25, sum333);
_mm512_storeu_ps(sumPtr1+-2238912+1536*i25, sum334);
_mm512_storeu_ps(sumPtr1+-2238848+1536*i25, sum335);
_mm512_storeu_ps(sumPtr1+-2238784+1536*i25, sum336);
_mm512_storeu_ps(sumPtr1+512+1536*i25, sum337);
_mm512_storeu_ps(sumPtr1+-2238656+1536*i25, sum338);
_mm512_storeu_ps(sumPtr1+-2238592+1536*i25, sum339);
_mm512_storeu_ps(sumPtr1+-2238528+1536*i25, sum340);
_mm512_storeu_ps(sumPtr1+768+1536*i25, sum341);
_mm512_storeu_ps(sumPtr1+-2238400+1536*i25, sum342);
_mm512_storeu_ps(sumPtr1+-2238336+1536*i25, sum343);
_mm512_storeu_ps(sumPtr1+-2238272+1536*i25, sum344);
_mm512_storeu_ps(sumPtr1+1024+1536*i25, sum345);
_mm512_storeu_ps(sumPtr1+-2238144+1536*i25, sum346);
_mm512_storeu_ps(sumPtr1+-2238080+1536*i25, sum347);
_mm512_storeu_ps(sumPtr1+-2238016+1536*i25, sum348);
_mm512_storeu_ps(sumPtr1+1280+1536*i25, sum349);
if (i25 >= ii19) return;
}
return;
}
ptrdiff_t i26 = 2*w10;
ptrdiff_t ii20 = i26+1;
for (; i26 != 162; ++i26) {
__m512 sum350 = _mm512_setzero_ps();
__m512 sum354 = _mm512_setzero_ps();
__m512 sum358 = _mm512_setzero_ps();
__m512 sum362 = _mm512_setzero_ps();
__m512 sum366 = _mm512_setzero_ps();
__m512 sum370 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum351 = sum350;
__m512 sum352 = sum350;
__m512 sum353 = sum350;
__m512 sum355 = sum354;
__m512 sum356 = sum354;
__m512 sum357 = sum354;
__m512 sum359 = sum358;
__m512 sum360 = sum358;
__m512 sum361 = sum358;
__m512 sum363 = sum362;
__m512 sum364 = sum362;
__m512 sum365 = sum362;
__m512 sum367 = sum366;
__m512 sum368 = sum366;
__m512 sum369 = sum366;
__m512 sum371 = sum370;
__m512 sum372 = sum370;
__m512 sum373 = sum370;
for (ptrdiff_t j22 = 0; j22 < 318; ++j22) {
__m512 dat164 = _mm512_loadu_ps(datPtr2+0+256*j22);
__m512 dat165 = _mm512_loadu_ps(datPtr2+64+256*j22);
__m512 dat166 = _mm512_loadu_ps(datPtr2+128+256*j22);
__m512 dat167 = _mm512_loadu_ps(datPtr2+192+256*j22);
__m512 wt367 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i26+24*j22));
sum350 = _mm512_fmadd_ps(wt367, dat164, sum350);
sum351 = _mm512_fmadd_ps(wt367, dat165, sum351);
sum352 = _mm512_fmadd_ps(wt367, dat166, sum352);
sum353 = _mm512_fmadd_ps(wt367, dat167, sum353);
__m512 wt368 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i26+24*j22));
sum354 = _mm512_fmadd_ps(wt368, dat164, sum354);
sum355 = _mm512_fmadd_ps(wt368, dat165, sum355);
sum356 = _mm512_fmadd_ps(wt368, dat166, sum356);
sum357 = _mm512_fmadd_ps(wt368, dat167, sum357);
__m512 wt369 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i26+24*j22));
sum358 = _mm512_fmadd_ps(wt369, dat164, sum358);
sum359 = _mm512_fmadd_ps(wt369, dat165, sum359);
sum360 = _mm512_fmadd_ps(wt369, dat166, sum360);
sum361 = _mm512_fmadd_ps(wt369, dat167, sum361);
__m512 wt370 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i26+24*j22));
sum362 = _mm512_fmadd_ps(wt370, dat164, sum362);
sum363 = _mm512_fmadd_ps(wt370, dat165, sum363);
sum364 = _mm512_fmadd_ps(wt370, dat166, sum364);
sum365 = _mm512_fmadd_ps(wt370, dat167, sum365);
__m512 wt371 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i26+24*j22));
sum366 = _mm512_fmadd_ps(wt371, dat164, sum366);
sum367 = _mm512_fmadd_ps(wt371, dat165, sum367);
sum368 = _mm512_fmadd_ps(wt371, dat166, sum368);
sum369 = _mm512_fmadd_ps(wt371, dat167, sum369);
__m512 wt372 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i26+24*j22));
sum370 = _mm512_fmadd_ps(wt372, dat164, sum370);
sum371 = _mm512_fmadd_ps(wt372, dat165, sum371);
sum372 = _mm512_fmadd_ps(wt372, dat166, sum372);
sum373 = _mm512_fmadd_ps(wt372, dat167, sum373);
}
_mm512_storeu_ps(sumPtr1+-2239424+1536*i26, _mm512_add_ps(sum350, _mm512_loadu_ps(sumPtr1+-2239424+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2239360+1536*i26, _mm512_add_ps(sum351, _mm512_loadu_ps(sumPtr1+-2239360+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2239296+1536*i26, _mm512_add_ps(sum352, _mm512_loadu_ps(sumPtr1+-2239296+1536*i26)));
_mm512_storeu_ps(sumPtr1+0+1536*i26, _mm512_add_ps(sum353, _mm512_loadu_ps(sumPtr1+0+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2239168+1536*i26, _mm512_add_ps(sum354, _mm512_loadu_ps(sumPtr1+-2239168+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2239104+1536*i26, _mm512_add_ps(sum355, _mm512_loadu_ps(sumPtr1+-2239104+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2239040+1536*i26, _mm512_add_ps(sum356, _mm512_loadu_ps(sumPtr1+-2239040+1536*i26)));
_mm512_storeu_ps(sumPtr1+256+1536*i26, _mm512_add_ps(sum357, _mm512_loadu_ps(sumPtr1+256+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2238912+1536*i26, _mm512_add_ps(sum358, _mm512_loadu_ps(sumPtr1+-2238912+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2238848+1536*i26, _mm512_add_ps(sum359, _mm512_loadu_ps(sumPtr1+-2238848+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2238784+1536*i26, _mm512_add_ps(sum360, _mm512_loadu_ps(sumPtr1+-2238784+1536*i26)));
_mm512_storeu_ps(sumPtr1+512+1536*i26, _mm512_add_ps(sum361, _mm512_loadu_ps(sumPtr1+512+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2238656+1536*i26, _mm512_add_ps(sum362, _mm512_loadu_ps(sumPtr1+-2238656+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2238592+1536*i26, _mm512_add_ps(sum363, _mm512_loadu_ps(sumPtr1+-2238592+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2238528+1536*i26, _mm512_add_ps(sum364, _mm512_loadu_ps(sumPtr1+-2238528+1536*i26)));
_mm512_storeu_ps(sumPtr1+768+1536*i26, _mm512_add_ps(sum365, _mm512_loadu_ps(sumPtr1+768+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2238400+1536*i26, _mm512_add_ps(sum366, _mm512_loadu_ps(sumPtr1+-2238400+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2238336+1536*i26, _mm512_add_ps(sum367, _mm512_loadu_ps(sumPtr1+-2238336+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2238272+1536*i26, _mm512_add_ps(sum368, _mm512_loadu_ps(sumPtr1+-2238272+1536*i26)));
_mm512_storeu_ps(sumPtr1+1024+1536*i26, _mm512_add_ps(sum369, _mm512_loadu_ps(sumPtr1+1024+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2238144+1536*i26, _mm512_add_ps(sum370, _mm512_loadu_ps(sumPtr1+-2238144+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2238080+1536*i26, _mm512_add_ps(sum371, _mm512_loadu_ps(sumPtr1+-2238080+1536*i26)));
_mm512_storeu_ps(sumPtr1+-2238016+1536*i26, _mm512_add_ps(sum372, _mm512_loadu_ps(sumPtr1+-2238016+1536*i26)));
_mm512_storeu_ps(sumPtr1+1280+1536*i26, _mm512_add_ps(sum373, _mm512_loadu_ps(sumPtr1+1280+1536*i26)));
if (i26 >= ii20) return;
}
return;
}
(void)base1;
ptrdiff_t i27 = 2*w10;
ptrdiff_t ii21 = i27+1;
for (; i27 != 162; ++i27) {
__m512 sum374 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i27));
__m512 sum378 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i27));
__m512 sum382 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i27));
__m512 sum386 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i27));
__m512 sum390 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i27));
__m512 sum394 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i27));
__m512 sum375 = sum374;
__m512 sum376 = sum374;
__m512 sum377 = sum374;
__m512 sum379 = sum378;
__m512 sum380 = sum378;
__m512 sum381 = sum378;
__m512 sum383 = sum382;
__m512 sum384 = sum382;
__m512 sum385 = sum382;
__m512 sum387 = sum386;
__m512 sum388 = sum386;
__m512 sum389 = sum386;
__m512 sum391 = sum390;
__m512 sum392 = sum390;
__m512 sum393 = sum390;
__m512 sum395 = sum394;
__m512 sum396 = sum394;
__m512 sum397 = sum394;
for (ptrdiff_t j23 = 0; j23 < 318; ++j23) {
__m512 dat168 = _mm512_loadu_ps(datPtr2+0+256*j23);
__m512 dat169 = _mm512_loadu_ps(datPtr2+64+256*j23);
__m512 dat170 = _mm512_loadu_ps(datPtr2+128+256*j23);
__m512 dat171 = _mm512_loadu_ps(datPtr2+192+256*j23);
__m512 wt373 = _mm512_set1_ps(*(float*)(wtPtr2+0+7632*i27+24*j23));
sum374 = _mm512_fmadd_ps(wt373, dat168, sum374);
sum375 = _mm512_fmadd_ps(wt373, dat169, sum375);
sum376 = _mm512_fmadd_ps(wt373, dat170, sum376);
sum377 = _mm512_fmadd_ps(wt373, dat171, sum377);
__m512 wt374 = _mm512_set1_ps(*(float*)(wtPtr2+4+7632*i27+24*j23));
sum378 = _mm512_fmadd_ps(wt374, dat168, sum378);
sum379 = _mm512_fmadd_ps(wt374, dat169, sum379);
sum380 = _mm512_fmadd_ps(wt374, dat170, sum380);
sum381 = _mm512_fmadd_ps(wt374, dat171, sum381);
__m512 wt375 = _mm512_set1_ps(*(float*)(wtPtr2+8+7632*i27+24*j23));
sum382 = _mm512_fmadd_ps(wt375, dat168, sum382);
sum383 = _mm512_fmadd_ps(wt375, dat169, sum383);
sum384 = _mm512_fmadd_ps(wt375, dat170, sum384);
sum385 = _mm512_fmadd_ps(wt375, dat171, sum385);
__m512 wt376 = _mm512_set1_ps(*(float*)(wtPtr2+12+7632*i27+24*j23));
sum386 = _mm512_fmadd_ps(wt376, dat168, sum386);
sum387 = _mm512_fmadd_ps(wt376, dat169, sum387);
sum388 = _mm512_fmadd_ps(wt376, dat170, sum388);
sum389 = _mm512_fmadd_ps(wt376, dat171, sum389);
__m512 wt377 = _mm512_set1_ps(*(float*)(wtPtr2+16+7632*i27+24*j23));
sum390 = _mm512_fmadd_ps(wt377, dat168, sum390);
sum391 = _mm512_fmadd_ps(wt377, dat169, sum391);
sum392 = _mm512_fmadd_ps(wt377, dat170, sum392);
sum393 = _mm512_fmadd_ps(wt377, dat171, sum393);
__m512 wt378 = _mm512_set1_ps(*(float*)(wtPtr2+20+7632*i27+24*j23));
sum394 = _mm512_fmadd_ps(wt378, dat168, sum394);
sum395 = _mm512_fmadd_ps(wt378, dat169, sum395);
sum396 = _mm512_fmadd_ps(wt378, dat170, sum396);
sum397 = _mm512_fmadd_ps(wt378, dat171, sum397);
}
_mm512_storeu_ps(sumPtr1+-2239424+1536*i27, sum374);
_mm512_storeu_ps(sumPtr1+-2239360+1536*i27, sum375);
_mm512_storeu_ps(sumPtr1+-2239296+1536*i27, sum376);
_mm512_storeu_ps(sumPtr1+0+1536*i27, sum377);
_mm512_storeu_ps(sumPtr1+-2239168+1536*i27, sum378);
_mm512_storeu_ps(sumPtr1+-2239104+1536*i27, sum379);
_mm512_storeu_ps(sumPtr1+-2239040+1536*i27, sum380);
_mm512_storeu_ps(sumPtr1+256+1536*i27, sum381);
_mm512_storeu_ps(sumPtr1+-2238912+1536*i27, sum382);
_mm512_storeu_ps(sumPtr1+-2238848+1536*i27, sum383);
_mm512_storeu_ps(sumPtr1+-2238784+1536*i27, sum384);
_mm512_storeu_ps(sumPtr1+512+1536*i27, sum385);
_mm512_storeu_ps(sumPtr1+-2238656+1536*i27, sum386);
_mm512_storeu_ps(sumPtr1+-2238592+1536*i27, sum387);
_mm512_storeu_ps(sumPtr1+-2238528+1536*i27, sum388);
_mm512_storeu_ps(sumPtr1+768+1536*i27, sum389);
_mm512_storeu_ps(sumPtr1+-2238400+1536*i27, sum390);
_mm512_storeu_ps(sumPtr1+-2238336+1536*i27, sum391);
_mm512_storeu_ps(sumPtr1+-2238272+1536*i27, sum392);
_mm512_storeu_ps(sumPtr1+1024+1536*i27, sum393);
_mm512_storeu_ps(sumPtr1+-2238144+1536*i27, sum394);
_mm512_storeu_ps(sumPtr1+-2238080+1536*i27, sum395);
_mm512_storeu_ps(sumPtr1+-2238016+1536*i27, sum396);
_mm512_storeu_ps(sumPtr1+1280+1536*i27, sum397);
if (i27 >= ii21) return;
}
break;
}
}
}

static void Example5LoomProduceSums1(Example5ThreaderTeam1* team16, char** tensors5) {
    // Argument bundle handed to every task through task.any1:
    //   slot 0 = tensor pointer array, slot 1 = epoch, slot 2 = field, slot 3 = node.
    // The integer slots are stored by value, cast to void*.
    void* args[4];
    args[0] = tensors5;
    ptrdiff_t epoch = 0;
    while (epoch < 1) {
        args[1] = (void*)epoch;
        ptrdiff_t field = 0;
        while (field < 6) {
            args[2] = (void*)field;
            // Each field reads three consecutive table entries starting at
            // 2*field: first node, node step, and the (exclusive) end node.
            ptrdiff_t base = 2*field;
            ptrdiff_t first = Example5LoomProduceSums1FieldTbl1[base];
            ptrdiff_t stride = Example5LoomProduceSums1FieldTbl1[base+1];
            ptrdiff_t limit = Example5LoomProduceSums1FieldTbl1[base+2];
            ptrdiff_t node = first;
            while (node < limit) {
                args[3] = (void*)node;
                // One 4-D task per node; the second hull extent is the node step.
                Example5ThreaderTask1 job;
                job.callee1 = Example5LoomProduceSums1Callee1;
                job.any1 = args;
                job.nd1 = 4;
                job.hull1[0] = 81;
                job.hull1[1] = stride;
                job.hull1[2] = 24;
                job.hull1[3] = 4;
                Example5ThreaderDo1(team16, &job);
                node += stride;
            }
            ++field;
        }
        ++epoch;
    }
}

// Task callee for Example5LoomConsumeSums1.
//
// Reads partial-sum vectors from tensors8[0] (sumPtr2), adds three of them
// together per output vector -- the second shifted down by one float lane and
// the third by two lanes, using _mm512_alignr_epi32 -- and writes the totals
// to tensors8[1] (datPtr3).
//
// pt10[2] selects a run of nine consecutive j24 values (9*chan1 .. 9*chan1+8)
// and pt10[3] selects i28. pt10[0] and pt10[1] are not read; cell1 and strip1
// are fixed at 0, so the k14 and l1/l2 loops always start from 0.
//
// NOTE(review): the names suggest j24 is an output channel and i28 a
// convolution group, and the store strides (140 bytes per step, 3780 per j24,
// 3674160 per i28) are consistent with 27x35 float planes and 972 channels per
// group -- confirm against the generator before relying on this.
static void Example5LoomConsumeSums1Callee1(Example5ThreaderTask1* task10, int64_t* pt10) {
char** tensors8 = task10->any1;
ptrdiff_t cell1 = 0;
ptrdiff_t strip1 = 0;
ptrdiff_t chan1 = pt10[2];
ptrdiff_t group2 = pt10[3];
char*restrict sumPtr2 = tensors8[0];
char*restrict datPtr3 = tensors8[1];
ptrdiff_t i28 = 1*group2;
ptrdiff_t j24 = 9*chan1;
ptrdiff_t jj2 = j24+8;
for (; j24 <= jj2; ++j24) {
// k14 = 0..5: each iteration produces four output vectors per l1 value
// (stores at +0, +140, +280, +420). k14 == 6 is handled after this loop
// with only three.
ptrdiff_t k14 = 7*strip1;
for (; k14 != 6; ++k14) {
// l1 = 0..1: full 16-lane stores. l1 == 2 falls out of the loop and is
// handled below with a 3-lane store mask.
ptrdiff_t l1 = 3*cell1;
for (; l1 != 2; ++l1) {
// First partial sum: four vectors at byte offsets 0/64/128/192.
__m512 load1 = _mm512_loadu_ps(sumPtr2+0+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load6 = _mm512_loadu_ps(sumPtr2+64+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load11 = _mm512_loadu_ps(sumPtr2+128+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load16 = _mm512_loadu_ps(sumPtr2+192+17915904*i28+2239488*k14+746496*l1+256*j24);
// Second partial sum (+248832) ...
__m512 load2 = _mm512_loadu_ps(sumPtr2+248832+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load7 = _mm512_loadu_ps(sumPtr2+248896+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load12 = _mm512_loadu_ps(sumPtr2+248960+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load17 = _mm512_loadu_ps(sumPtr2+249024+17915904*i28+2239488*k14+746496*l1+256*j24);
// ... and the same partial sum one l1 step further on
// (995328 = 248832 + 746496), which supplies the lane shifted in at the top.
__m512 load3 = _mm512_loadu_ps(sumPtr2+995328+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load8 = _mm512_loadu_ps(sumPtr2+995392+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load13 = _mm512_loadu_ps(sumPtr2+995456+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load18 = _mm512_loadu_ps(sumPtr2+995520+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512i cast1 = _mm512_castps_si512(load2);
__m512i cast5 = _mm512_castps_si512(load7);
__m512i cast9 = _mm512_castps_si512(load12);
__m512i cast13 = _mm512_castps_si512(load17);
__m512i cast2 = _mm512_castps_si512(load3);
__m512i cast6 = _mm512_castps_si512(load8);
__m512i cast10 = _mm512_castps_si512(load13);
__m512i cast14 = _mm512_castps_si512(load18);
// Shift down by one lane: result lanes 0..14 come from lanes 1..15 of the
// current vector, lane 15 from lane 0 of the next l1 step's vector.
__m512 join2 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast2, cast1, 1));
__m512 join4 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast6, cast5, 1));
__m512 join6 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast10, cast9, 1));
__m512 join8 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast14, cast13, 1));
__m512 add1 = _mm512_add_ps(load1, join2);
__m512 add3 = _mm512_add_ps(load6, join4);
__m512 add5 = _mm512_add_ps(load11, join6);
__m512 add7 = _mm512_add_ps(load16, join8);
// Third partial sum (+497664) and its next-l1-step counterpart
// (1244160 = 497664 + 746496).
__m512 load4 = _mm512_loadu_ps(sumPtr2+497664+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load9 = _mm512_loadu_ps(sumPtr2+497728+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load14 = _mm512_loadu_ps(sumPtr2+497792+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load19 = _mm512_loadu_ps(sumPtr2+497856+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load5 = _mm512_loadu_ps(sumPtr2+1244160+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load10 = _mm512_loadu_ps(sumPtr2+1244224+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load15 = _mm512_loadu_ps(sumPtr2+1244288+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load20 = _mm512_loadu_ps(sumPtr2+1244352+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512i cast3 = _mm512_castps_si512(load4);
__m512i cast7 = _mm512_castps_si512(load9);
__m512i cast11 = _mm512_castps_si512(load14);
__m512i cast15 = _mm512_castps_si512(load19);
__m512i cast4 = _mm512_castps_si512(load5);
__m512i cast8 = _mm512_castps_si512(load10);
__m512i cast12 = _mm512_castps_si512(load15);
__m512i cast16 = _mm512_castps_si512(load20);
// Shift down by two lanes; the top two lanes come from the next l1 step.
__m512 join3 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast4, cast3, 2));
__m512 join5 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast8, cast7, 2));
__m512 join7 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast12, cast11, 2));
__m512 join9 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast16, cast15, 2));
__m512 add2 = _mm512_add_ps(add1, join3);
__m512 add4 = _mm512_add_ps(add3, join5);
__m512 add6 = _mm512_add_ps(add5, join7);
__m512 add8 = _mm512_add_ps(add7, join9);
// Mask 65535 enables all 16 lanes. The four destinations are 140 bytes
// (35 floats) apart.
_mm512_mask_storeu_ps(datPtr3+0+3674160*i28+3780*j24+560*k14+64*l1, 65535, add2);
_mm512_mask_storeu_ps(datPtr3+140+3674160*i28+3780*j24+560*k14+64*l1, 65535, add4);
_mm512_mask_storeu_ps(datPtr3+280+3674160*i28+3780*j24+560*k14+64*l1, 65535, add6);
_mm512_mask_storeu_ps(datPtr3+420+3674160*i28+3780*j24+560*k14+64*l1, 65535, add8);
}
// l1 == 2 here. There is no following l1 step to borrow lanes from, so each
// shifted operand is the vector rotated against itself; the wrapped-around
// lanes end up in lanes 14..15, which the 3-lane store mask (7) discards.
__m512 load21 = _mm512_loadu_ps(sumPtr2+0+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load24 = _mm512_loadu_ps(sumPtr2+64+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load27 = _mm512_loadu_ps(sumPtr2+128+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load30 = _mm512_loadu_ps(sumPtr2+192+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load22 = _mm512_loadu_ps(sumPtr2+248832+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load25 = _mm512_loadu_ps(sumPtr2+248896+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load28 = _mm512_loadu_ps(sumPtr2+248960+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load31 = _mm512_loadu_ps(sumPtr2+249024+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512i cast17 = _mm512_castps_si512(load22);
__m512i cast19 = _mm512_castps_si512(load25);
__m512i cast21 = _mm512_castps_si512(load28);
__m512i cast23 = _mm512_castps_si512(load31);
__m512 join10 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast17, cast17, 1));
__m512 join12 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast19, cast19, 1));
__m512 join14 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast21, cast21, 1));
__m512 join16 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast23, cast23, 1));
__m512 add9 = _mm512_add_ps(load21, join10);
__m512 add11 = _mm512_add_ps(load24, join12);
__m512 add13 = _mm512_add_ps(load27, join14);
__m512 add15 = _mm512_add_ps(load30, join16);
__m512 load23 = _mm512_loadu_ps(sumPtr2+497664+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load26 = _mm512_loadu_ps(sumPtr2+497728+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load29 = _mm512_loadu_ps(sumPtr2+497792+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512 load32 = _mm512_loadu_ps(sumPtr2+497856+17915904*i28+2239488*k14+746496*l1+256*j24);
__m512i cast18 = _mm512_castps_si512(load23);
__m512i cast20 = _mm512_castps_si512(load26);
__m512i cast22 = _mm512_castps_si512(load29);
__m512i cast24 = _mm512_castps_si512(load32);
__m512 join11 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast18, cast18, 2));
__m512 join13 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast20, cast20, 2));
__m512 join15 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast22, cast22, 2));
__m512 join17 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast24, cast24, 2));
__m512 add10 = _mm512_add_ps(add9, join11);
__m512 add12 = _mm512_add_ps(add11, join13);
__m512 add14 = _mm512_add_ps(add13, join15);
__m512 add16 = _mm512_add_ps(add15, join17);
// Mask 7 writes only lanes 0..2.
_mm512_mask_storeu_ps(datPtr3+0+3674160*i28+3780*j24+560*k14+64*l1, 7, add10);
_mm512_mask_storeu_ps(datPtr3+140+3674160*i28+3780*j24+560*k14+64*l1, 7, add12);
_mm512_mask_storeu_ps(datPtr3+280+3674160*i28+3780*j24+560*k14+64*l1, 7, add14);
_mm512_mask_storeu_ps(datPtr3+420+3674160*i28+3780*j24+560*k14+64*l1, 7, add16);
}
// k14 == 6 here (the loop above only exits at 6). Same computation as above
// but with three vectors per partial sum (offsets 0/64/128) and three stores
// (+0, +140, +280) instead of four.
ptrdiff_t l2 = 3*cell1;
for (; l2 != 2; ++l2) {
__m512 load33 = _mm512_loadu_ps(sumPtr2+0+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load38 = _mm512_loadu_ps(sumPtr2+64+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load43 = _mm512_loadu_ps(sumPtr2+128+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load34 = _mm512_loadu_ps(sumPtr2+248832+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load39 = _mm512_loadu_ps(sumPtr2+248896+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load44 = _mm512_loadu_ps(sumPtr2+248960+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load35 = _mm512_loadu_ps(sumPtr2+995328+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load40 = _mm512_loadu_ps(sumPtr2+995392+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load45 = _mm512_loadu_ps(sumPtr2+995456+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512i cast25 = _mm512_castps_si512(load34);
__m512i cast29 = _mm512_castps_si512(load39);
__m512i cast33 = _mm512_castps_si512(load44);
__m512i cast26 = _mm512_castps_si512(load35);
__m512i cast30 = _mm512_castps_si512(load40);
__m512i cast34 = _mm512_castps_si512(load45);
// One-lane shift, top lane taken from the next l2 step.
__m512 join18 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast26, cast25, 1));
__m512 join20 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast30, cast29, 1));
__m512 join22 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast34, cast33, 1));
__m512 add17 = _mm512_add_ps(load33, join18);
__m512 add19 = _mm512_add_ps(load38, join20);
__m512 add21 = _mm512_add_ps(load43, join22);
__m512 load36 = _mm512_loadu_ps(sumPtr2+497664+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load41 = _mm512_loadu_ps(sumPtr2+497728+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load46 = _mm512_loadu_ps(sumPtr2+497792+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load37 = _mm512_loadu_ps(sumPtr2+1244160+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load42 = _mm512_loadu_ps(sumPtr2+1244224+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load47 = _mm512_loadu_ps(sumPtr2+1244288+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512i cast27 = _mm512_castps_si512(load36);
__m512i cast31 = _mm512_castps_si512(load41);
__m512i cast35 = _mm512_castps_si512(load46);
__m512i cast28 = _mm512_castps_si512(load37);
__m512i cast32 = _mm512_castps_si512(load42);
__m512i cast36 = _mm512_castps_si512(load47);
// Two-lane shift, top two lanes taken from the next l2 step.
__m512 join19 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast28, cast27, 2));
__m512 join21 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast32, cast31, 2));
__m512 join23 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast36, cast35, 2));
__m512 add18 = _mm512_add_ps(add17, join19);
__m512 add20 = _mm512_add_ps(add19, join21);
__m512 add22 = _mm512_add_ps(add21, join23);
_mm512_mask_storeu_ps(datPtr3+0+3674160*i28+3780*j24+560*k14+64*l2, 65535, add18);
_mm512_mask_storeu_ps(datPtr3+140+3674160*i28+3780*j24+560*k14+64*l2, 65535, add20);
_mm512_mask_storeu_ps(datPtr3+280+3674160*i28+3780*j24+560*k14+64*l2, 65535, add22);
}
// k14 == 6 and l2 == 2: three vectors, self-rotated shifts, 3-lane stores.
__m512 load48 = _mm512_loadu_ps(sumPtr2+0+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load51 = _mm512_loadu_ps(sumPtr2+64+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load54 = _mm512_loadu_ps(sumPtr2+128+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load49 = _mm512_loadu_ps(sumPtr2+248832+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load52 = _mm512_loadu_ps(sumPtr2+248896+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load55 = _mm512_loadu_ps(sumPtr2+248960+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512i cast37 = _mm512_castps_si512(load49);
__m512i cast39 = _mm512_castps_si512(load52);
__m512i cast41 = _mm512_castps_si512(load55);
__m512 join24 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast37, cast37, 1));
__m512 join26 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast39, cast39, 1));
__m512 join28 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast41, cast41, 1));
__m512 add23 = _mm512_add_ps(load48, join24);
__m512 add25 = _mm512_add_ps(load51, join26);
__m512 add27 = _mm512_add_ps(load54, join28);
__m512 load50 = _mm512_loadu_ps(sumPtr2+497664+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load53 = _mm512_loadu_ps(sumPtr2+497728+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512 load56 = _mm512_loadu_ps(sumPtr2+497792+17915904*i28+2239488*k14+746496*l2+256*j24);
__m512i cast38 = _mm512_castps_si512(load50);
__m512i cast40 = _mm512_castps_si512(load53);
__m512i cast42 = _mm512_castps_si512(load56);
__m512 join25 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast38, cast38, 2));
__m512 join27 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast40, cast40, 2));
__m512 join29 = _mm512_castsi512_ps(_mm512_alignr_epi32(cast42, cast42, 2));
__m512 add24 = _mm512_add_ps(add23, join25);
__m512 add26 = _mm512_add_ps(add25, join27);
__m512 add28 = _mm512_add_ps(add27, join29);
_mm512_mask_storeu_ps(datPtr3+0+3674160*i28+3780*j24+560*k14+64*l2, 7, add24);
_mm512_mask_storeu_ps(datPtr3+140+3674160*i28+3780*j24+560*k14+64*l2, 7, add26);
_mm512_mask_storeu_ps(datPtr3+280+3674160*i28+3780*j24+560*k14+64*l2, 7, add28);
}
}

static void Example5LoomConsumeSums1(Example5ThreaderTeam1* team17, char** tensors7) {
    // Single 4-D task over a 1 x 1 x 108 x 4 hull. The callee only reads
    // coordinates 2 and 3 of each point, and receives the tensor pointer
    // array directly as its opaque argument.
    Example5ThreaderTask1 job;
    job.nd1 = 4;
    job.hull1[0] = 1;
    job.hull1[1] = 1;
    job.hull1[2] = 108;
    job.hull1[3] = 4;
    job.any1 = tensors7;
    job.callee1 = Example5LoomConsumeSums1Callee1;
    Example5ThreaderDo1(team17, &job);
}

// Network object filled in by Example5NetCreate and released by
// Example5NetDestroy.
struct Example5Net {
// Pointer exactly as returned by malloc; this is the one passed to free.
char* alloc1;
// alloc1 rounded up to a 64-byte boundary. Example5NetCreate passes it to
// Example5LoomArrangeFilts1 as the last tensor, and Example5EngineInference
// passes it to Example5LoomProduceSums1 as the first tensor.
char* align1;
};

void Example5NetDestroy(Example5Net* net2) {
    // Release the buffer through the raw malloc pointer (not the aligned
    // view), then the handle that owned it.
    char* rawBlock = net2->alloc1;
    free(rawBlock);
    free(net2);
}

// Builds a network object from caller-supplied parameters.
//
// net1     receives the new object on success; it is not written on failure.
// params1  supplies outWeights and outBiases, which are handed to
//          Example5LoomArrangeFilts1 together with the new aligned buffer.
// threads1 is forwarded to Example5ThreaderCreate1 for a temporary team that
//          is destroyed again before this function returns.
//
// Returns 0 on success, otherwise the char* produced by Example5Errmsg1 or
// Example5ThreaderCreate1.
char* Example5NetCreate(
Example5Net** net1,
Example5Params* params1,
ptrdiff_t threads1
) {
    if (__builtin_expect(__builtin_cpu_supports("avx512f") == 0, 0)) {
        return Example5Errmsg1(__LINE__, "CPU does not support AVX512F");
    }

    // The request is 63 bytes more than a multiple of 64, which leaves slack
    // for rounding the start address up to a 64-byte boundary.
    char* rawBlock = malloc(242346879);
    if (__builtin_expect(rawBlock == 0, 0)) {
        return Example5Errmsg1(__LINE__, "errno %d", errno);
    }
    char* alignedBlock = (void*)(((size_t)rawBlock+63)&-64);

    Example5ThreaderTeam1* packTeam = 0;
    char* teamErr = Example5ThreaderCreate1(&packTeam, threads1);
    if (__builtin_expect(teamErr != 0, 0)) {
        free(rawBlock);
        return teamErr;
    }

    char* packArgs[3];
    packArgs[0] = (char*)params1->outWeights;
    packArgs[1] = (char*)params1->outBiases;
    packArgs[2] = alignedBlock+0;
    Example5LoomArrangeFilts1(packTeam, packArgs);
    Example5ThreaderDestroy1(packTeam);

    Example5Net* handle = malloc(sizeof(Example5Net));
    if (__builtin_expect(handle == 0, 0)) {
        // Format the message before free() so errno is read first.
        char* oomMsg = Example5Errmsg1(__LINE__, "errno %d", errno);
        free(rawBlock);
        return oomMsg;
    }
    handle->alloc1 = rawBlock;
    handle->align1 = alignedBlock;
    *net1 = handle;
    return 0;
}

// Inference engine filled in by Example5EngineCreate and released by
// Example5EngineDestroy.
struct Example5Engine {
// Network passed to Example5EngineCreate. Example5EngineDestroy does not
// free it.
Example5Net* net3;
// Thread team created by Example5EngineCreate and used for every stage of
// Example5EngineInference.
Example5ThreaderTeam1* team11;
// Buffer pointer exactly as returned by malloc; the one passed to free.
char* alloc2;
// alloc2 rounded up to a 64-byte boundary; Example5EngineInference uses it
// as the base address for the intermediate tensors.
char* align2;
};

// Forwards to Example5ThreaderPthreadT1 using the engine's thread team and
// returns its result unchanged. Note the argument order differs: the
// pthread_t output pointer goes first in the forwarded call.
char* Example5EnginePthreadT(
Example5Engine* eng2,
ptrdiff_t idx2,
pthread_t* to1
) {
    Example5ThreaderTeam1* team = eng2->team11;
    char* status = Example5ThreaderPthreadT1(to1, team, idx2);
    return status;
}

void Example5EngineDestroy(Example5Engine* eng3) {
    // Tear down in order: thread team first, then the buffer (via the raw
    // malloc pointer), then the engine object itself. The network the engine
    // refers to is left alone.
    Example5ThreaderTeam1* team = eng3->team11;
    char* rawBlock = eng3->alloc2;
    Example5ThreaderDestroy1(team);
    free(rawBlock);
    free(eng3);
}

// Creates an inference engine bound to an existing network.
//
// eng4     receives the new engine on success; it is not written on failure.
// net4     is stored in the engine as-is.
// threads2 is forwarded to Example5ThreaderCreate1.
//
// Returns 0 on success, otherwise the char* produced by Example5Errmsg1 or
// Example5ThreaderCreate1.
char* Example5EngineCreate(
Example5Engine** eng4,
Example5Net* net4,
ptrdiff_t threads2
) {
    Example5Engine* engine = malloc(sizeof(Example5Engine));
    if (__builtin_expect(engine == 0, 0)) {
        return Example5Errmsg1(__LINE__, "errno %d", errno);
    }

    // The request is 63 bytes more than a multiple of 64, which leaves slack
    // for rounding the start address up to a 64-byte boundary.
    char* rawBlock = malloc(118554687);
    if (__builtin_expect(rawBlock == 0, 0)) {
        // Format the message before free() so errno is read first.
        char* oomMsg = Example5Errmsg1(__LINE__, "errno %d", errno);
        free(engine);
        return oomMsg;
    }
    engine->alloc2 = rawBlock;
    engine->align2 = (void*)(((size_t)rawBlock+63)&-64);

    char* teamErr = Example5ThreaderCreate1(&engine->team11, threads2);
    if (__builtin_expect(teamErr != 0, 0)) {
        free(engine);
        free(rawBlock);
        return teamErr;
    }

    engine->net3 = net4;
    *eng4 = engine;
    return 0;
}

// Runs one inference pass as three stages on the engine's thread team:
//   1. Example5LoomArrangeDats1:  inData          -> buffer at offset 0
//   2. Example5LoomProduceSums1:  network buffer + offset 0 -> offset 46891008
//   3. Example5LoomConsumeSums1:  offset 46891008 -> outData
// Offsets are in bytes from the engine's 64-byte-aligned buffer.
void Example5EngineInference(
Example5Engine* eng1,
float* inData,
float* outData
) {
    Example5ThreaderTeam1* workers = eng1->team11;
    char* netBase = eng1->net3->align1;
    char* scratch = eng1->align2;
    char* arranged = scratch+0;
    char* partialSums = scratch+46891008;

    char* arrangeArgs[] = {
        (char*)inData,
        arranged
    };
    Example5LoomArrangeDats1(workers, arrangeArgs);

    char* produceArgs[] = {
        netBase+0,
        arranged,
        partialSums
    };
    Example5LoomProduceSums1(workers, produceArgs);

    char* consumeArgs[] = {
        partialSums,
        (char*)outData
    };
    Example5LoomConsumeSums1(workers, consumeArgs);
}

// End of file.

Top