NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=Example15 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=in Channels=32704 Height=242 Width=168
Conv FromTensor=in ToTensor=out ToChannels=19816 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=8
Output FromTensor=out

Top || Output Example15.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(Example15Params);
// Example15Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "r");
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

typedef struct Example15Params Example15Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// Example15Params* params = malloc(sizeof(Example15Params));
//
// ... Load params (read from a file, perhaps) ...
//
// Example15Net* net; // For example, 4 threads:
// char* err = Example15NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// Example15NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

typedef struct Example15Net Example15Net;

// Creates a Net from the given Params using `threads` temporary threads.
// Returns 0 on success. On failure returns a nonzero error string that
// the caller must free, and the Net pointer is left unmodified.
char* Example15NetCreate(
Example15Net**,
Example15Params*,
ptrdiff_t threads
);

// Destroys a Net (freeing its memory). Call only after all inference
// that depends on the Net is complete.
void Example15NetDestroy(Example15Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// Example15Net* net;
//
// ... Create net ...
//
// Example15Engine* engine; // For example, 4 inference threads:
// char* err = Example15EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// Example15EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = Example15EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* inData = malloc(sizeof(float)*32704*242*168);
// float* outData = malloc(sizeof(float)*19816*121*84);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// Example15EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// inData, // The tensor arguments are sorted by name.
// outData
// );
//
// ... Read the output floats ...
//
// }
//
// free(inData);
// free(outData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

typedef struct Example15Engine Example15Engine;

// Creates an Engine with `threads` inference threads that uses the
// given Net. Returns 0 on success. On failure returns a nonzero error
// string that the caller must free, and the Engine pointer is left
// unmodified.
char* Example15EngineCreate(
Example15Engine**,
Example15Net*,
ptrdiff_t threads
);

// Writes the pthread_t of the inference thread with index `threadIdx`
// (0 .. N-1) to `*to`. Returns 0 on success; can only fail (returning an
// error string the caller must free) if the thread index is invalid.
char* Example15EnginePthreadT(
Example15Engine*,
ptrdiff_t threadIdx,
pthread_t* to
);

// Runs inference: reads the caller-owned input tensor `inData` and
// writes the caller-owned output tensor `outData` (both fully packed
// CHW float32). This function cannot fail. Tensor arguments are ordered
// by name.
void Example15EngineInference(
Example15Engine*,
float* inData,
float* outData
);

// Terminates the Engine's threads and frees its memory. The Net that the
// Engine points to is destroyed separately.
void Example15EngineDestroy(Example15Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

// Trained parameters for the single Conv layer that writes tensor "out".
// The struct is packed so that its in-memory layout is exactly the
// concatenation of the tensors, which is what the fread-based loading
// example near the top of this header relies on.
struct Example15Params {
// One bias per output channel.
float outBiases[19816]; // 1x19816x1x1
// 19816 output channels, each with 4088 input-channel weights
// (19816*4088 == 81007808; 4088 matches 32704 input channels / 8 groups
// from the input graph).
float outWeights[81007808]; // 19816x4088x1x1
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output Example15.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f Example15.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "Example15.h"

// Builds a heap-allocated error message of the form
//
//     "Example15: line <lineNum1>: <formatted text>"
//
// where the formatted text comes from the printf-style `format1` and the
// variadic arguments that follow it. The whole message is limited to the
// fixed buffer capacity (the formatted tail is truncated if necessary).
// The caller owns the returned string and must free() it.
static char* Example15Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
    enum { capacity = 277 };
    char* text = malloc(capacity);
    // The prefix is short and bounded, so it always fits in the buffer.
    int prefixLen = sprintf(text, "Example15: line %td: ", lineNum1);
    char* tail = text+prefixLen;
    va_list args;
    va_start(args, format1);
    // Append the caller's message, bounded by the space that remains.
    vsnprintf(tail, capacity-prefixLen, format1, args);
    va_end(args);
    return text;
}

// Forward declarations for the thread-team ("Threader") types defined
// below, plus the callback signature used to process one point of a
// task: the callee receives the task and a pointer to the point's
// coordinates.
typedef struct Example15ThreaderTask1 Example15ThreaderTask1;
typedef void (*Example15ThreaderCallee1)(Example15ThreaderTask1*, int64_t*);
typedef struct Example15ThreaderHub1 Example15ThreaderHub1;
typedef struct Example15ThreaderNode1 Example15ThreaderNode1;
typedef struct Example15ThreaderUnwind1 Example15ThreaderUnwind1;
typedef struct Example15ThreaderTeam1 Example15ThreaderTeam1;

// A unit of parallel work: a callback to be invoked once for every point
// of a multi-dimensional index space.
struct Example15ThreaderTask1 {
// Function called once per point with (task, coordinates).
Example15ThreaderCallee1 callee1;
// Opaque pointer for the callee's own use; the threader never reads it.
void* any1;
// Number of leading entries of hull1 that are in use.
ptrdiff_t nd1;
// Extent of each dimension; coordinates in dimension i run from 0 up to
// hull1[i]-1, with dimension 0 varying fastest.
int64_t hull1[4];
};

// State shared by the whole team while a task is running.
struct Example15ThreaderHub1 {
// Mutex guarding the fields below, and the condition variable that the
// dispatching thread waits on until pending1 reaches zero.
pthread_mutex_t mut1;
pthread_cond_t cond1;
// Number of workers that have not yet finished the current task.
ptrdiff_t pending1;
// Shared scan position within status1: the word index and the mask of
// bits in that word that are still candidates.
ptrdiff_t offset1;
long mask1;
// Bitmap with one bit per node (flexible array member; storage is
// allocated as nt/(bits per long)+1 words). A worker clears a node's
// bit once that node's points have been drained.
long status1[];
};

// Per-thread state. Aligned to 64 bytes (presumably one cache line, so
// that nodes of different threads do not share a line).
struct Example15ThreaderNode1 {
// Mutex under which np1, pt1 and task1 are read and written.
pthread_mutex_t mut2;
// Number of points still to be run from this node's share; a negative
// value is the request for the thread to exit.
int64_t np1;
// Coordinates of the next point to run from this node's share.
int64_t pt1[4];
// Task posted to this node, or 0 when there is nothing to do.
Example15ThreaderTask1* task1;
// Signalled after task1 has been posted (or shutdown requested).
pthread_cond_t cond2;
// Back pointer to the owning team.
Example15ThreaderTeam1* team1;
// Handle of the worker thread that runs on this node.
pthread_t thr1;
} __attribute__((aligned(64)));

// Records how far team construction got, so that the destroy function
// tears down exactly what was successfully set up.
struct Example15ThreaderUnwind1 {
// Number of leading nodes whose thread was created and must be joined.
ptrdiff_t join1;
// Number of leading nodes whose condition variable was initialized.
ptrdiff_t nodeConds1;
// Number of leading nodes whose mutex was initialized.
ptrdiff_t nodeMuts1;
// Nonzero once the hub's condition variable / mutex was initialized.
ptrdiff_t hubCond1;
ptrdiff_t hubMut1;
// Raw (unaligned) malloc results backing the node array and the hub;
// these are the pointers that get passed to free().
void* nodes1;
void* hub1;
};

// A team of worker threads.
struct Example15ThreaderTeam1 {
// Number of threads (and nodes) in the team.
ptrdiff_t nt1;
// 64-byte-aligned pointers into the allocations recorded in unwind1.
Example15ThreaderHub1* hub2;
Example15ThreaderNode1* nodes2;
// Teardown bookkeeping (see Example15ThreaderUnwind1).
Example15ThreaderUnwind1 unwind1;
};

// Advances the mixed-radix coordinate `pt2` by one step. Dimension 0 is
// the fastest-varying digit and `hull2[i]` is the extent of dimension i.
// A digit that reaches its extent wraps to zero and carries into the
// next dimension; a carry out of the last dimension is discarded.
static void Example15ThreaderInc1(
    ptrdiff_t nd2,
    int64_t*restrict hull2,
    int64_t*restrict pt2
) {
    ptrdiff_t dim = 0;
    while (dim < nd2) {
        int64_t next = pt2[dim]+1;
        if (next != hull2[dim]) {
            // No wrap in this dimension: store the digit and stop.
            pt2[dim] = next;
            return;
        }
        // Wrapped: reset this digit and carry into the next dimension.
        pt2[dim] = 0;
        ++dim;
    }
}

// Writes the linear offset `val1` into `pt3` as a mixed-radix coordinate
// whose digit i has radix `hull3[i]` (dimension 0 is least significant).
// Digits above the point where the remaining value becomes zero are set
// to zero; any value left over after the last dimension is discarded.
static void Example15ThreaderPut1(
    ptrdiff_t nd3,
    int64_t*restrict hull3,
    int64_t*restrict pt3,
    int64_t val1
) {
    for (ptrdiff_t dim = 0; dim < nd3; ++dim) {
        if (val1 == 0) {
            // Nothing left to distribute: remaining digits are zero.
            pt3[dim] = 0;
            continue;
        }
        int64_t extent = hull3[dim];
        int64_t quotient = val1/extent;
        pt3[dim] = val1-quotient*extent;
        val1 = quotient;
    }
}

// Adds the mixed-radix coordinate `plus1` and an incoming carry `carry2`
// to the coordinate `pt4`, in place. `hull4[i]` is the radix of digit i
// and dimension 0 is least significant. Each digit sum is reduced by at
// most one multiple of its radix, and the carry out of the final
// dimension is discarded.
static void Example15ThreaderAdd1(
    ptrdiff_t nd4,
    int64_t*restrict hull4,
    int64_t*restrict pt4,
    int64_t*restrict plus1,
    int64_t carry2
) {
    for (ptrdiff_t dim = 0; dim < nd4; ++dim) {
        int64_t extent = hull4[dim];
        int64_t total = pt4[dim]+plus1[dim]+carry2;
        carry2 = total >= extent;
        pt4[dim] = carry2 ? total-extent : total;
    }
}

// Worker thread entry point; `arg1` is the thread's own node.
//
// The thread loops forever: it sleeps on its node's condition variable
// until a task pointer has been posted, runs the points of its own share
// one at a time, then scans the hub's status bitmap for other nodes and
// helps run whatever points those nodes still have, and finally
// decrements the hub's pending count (signalling the hub's condition
// variable when the count reaches zero). The thread returns 0 when it
// is woken with a negative point count, which is the shutdown request.
//
// Every pthread call is wrapped in a loop that retries until the call
// returns zero.
static void* Example15ThreaderMain1(void* arg1) {
Example15ThreaderNode1* node1 = arg1;
Example15ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
Example15ThreaderHub1* hub3 = team2->hub2;
Example15ThreaderNode1* nodes3 = team2->nodes2;
// Index of this node within the team's node array.
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
for (; ; ) {
// The node mutex is held at the top of every iteration.
Example15ThreaderTask1* task2 = node1->task1;
if (!task2) {
// Nothing posted yet: wait, then re-check the task slot.
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
// Negative point count: shutdown was requested.
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
// Clear the slot so the next outer iteration waits for a new task.
node1->task1 = 0;
Example15ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Run this node's own points. For each point: take a private copy of
// the coordinates, decrement the remaining count and advance the stored
// coordinates while holding the mutex, then invoke the callee with the
// mutex released (so other threads can take points from this node).
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
Example15ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// This node has no points left: clear its bit in the status bitmap.
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
// Resume the shared scan from the position stored in the hub.
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
// No candidate bits left in this word: move on to the next word.
++offset2;
mask2 = -1;
continue;
}
// Node index of the lowest candidate bit.
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
// Bit number nt2 does not belong to any node; the dispatcher sets every
// bit of every status word, so this bit acts as an end marker. Restart
// the scan from the beginning once, and stop if the end marker is
// reached again without having found a node in between.
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
// Isolate the lowest candidate bit.
hand1 &= -hand1;
// Publish the scan position with this bit removed from the mask, so
// that the next thread to scan starts with a different node.
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
Example15ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
// Help the target node: run its remaining points using the same
// copy/decrement/advance/call sequence as for this thread's own node.
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
Example15ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// The target has no points left: clear its status bit, then pick up
// the shared scan position again.
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// This worker is done with the task; the last worker to get here wakes
// the thread waiting on the hub's condition variable.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
// Re-acquire the node mutex before looping back to wait for more work.
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Tears down a team (a null pointer is accepted and ignored). Only the
// resources recorded in the team's unwind bookkeeping are released, so
// this also works on a team whose construction failed part-way.
//
// Order: ask every started thread to exit, join them, destroy the node
// condition variables, destroy the node mutexes, destroy the hub's
// condition variable and mutex, then free the three allocations.
// Every pthread call is retried until it returns zero.
static void Example15ThreaderDestroy1(Example15ThreaderTeam1* team3) {
    if (!team3) return;
    Example15ThreaderNode1* nodes = team3->nodes2;
    Example15ThreaderUnwind1* undo = &team3->unwind1;

    // Post the shutdown request: a negative point count together with a
    // non-null task pointer (the pointer is never dereferenced).
    ptrdiff_t started = undo->join1;
    for (ptrdiff_t i = 0; i != started; ++i) {
        Example15ThreaderNode1* node = nodes+i;
        while (__builtin_expect(pthread_mutex_lock(&node->mut2), 0)) {}
        node->np1 = -1;
        node->task1 = (Example15ThreaderTask1*)1;
        while (__builtin_expect(pthread_mutex_unlock(&node->mut2), 0)) {}
        while (__builtin_expect(pthread_cond_signal(&node->cond2), 0)) {}
    }
    for (ptrdiff_t i = 0; i != started; ++i) {
        while (__builtin_expect(pthread_join(nodes[i].thr1, 0), 0)) {}
    }

    ptrdiff_t conds = undo->nodeConds1;
    for (ptrdiff_t i = 0; i != conds; ++i) {
        while (__builtin_expect(pthread_cond_destroy(&nodes[i].cond2), 0)) {}
    }
    ptrdiff_t muts = undo->nodeMuts1;
    for (ptrdiff_t i = 0; i != muts; ++i) {
        while (__builtin_expect(pthread_mutex_destroy(&nodes[i].mut2), 0)) {}
    }

    Example15ThreaderHub1* hub = team3->hub2;
    if (undo->hubCond1) {
        while (__builtin_expect(pthread_cond_destroy(&hub->cond1), 0)) {}
    }
    if (undo->hubMut1) {
        while (__builtin_expect(pthread_mutex_destroy(&hub->mut1), 0)) {}
    }

    // Free the raw (unaligned) allocations, then the team itself.
    free(undo->nodes1);
    free(undo->hub1);
    free(team3);
}

// Final construction step: for each of the `nt7` nodes, initializes the
// node's mutex and condition variable and starts its worker thread.
//
// Returns 0 on success. On failure returns an error string (owned by the
// caller) and records in the team's unwind bookkeeping exactly how many
// node mutexes, node condition variables and threads were successfully
// set up, so that the destroy function releases only those.
static char* Example15ThreaderCreate1Up4(Example15ThreaderTeam1* team8, ptrdiff_t nt7) {
    Example15ThreaderNode1* nodes = team8->nodes2;
    Example15ThreaderUnwind1* undo = &team8->unwind1;
    for (ptrdiff_t idx = 0; idx != nt7; ++idx) {
        Example15ThreaderNode1* node = nodes+idx;
        int rc = pthread_mutex_init(&node->mut2, 0);
        if (__builtin_expect(rc, 0)) {
            // Nothing belonging to this node was set up.
            char* msg = Example15Errmsg1(__LINE__, "errno %d", rc);
            undo->nodeMuts1 = idx;
            undo->nodeConds1 = idx;
            undo->join1 = idx;
            return msg;
        }
        node->task1 = 0;
        rc = pthread_cond_init(&node->cond2, 0);
        if (__builtin_expect(rc, 0)) {
            // This node's mutex exists, but not its condition variable.
            char* msg = Example15Errmsg1(__LINE__, "errno %d", rc);
            undo->nodeMuts1 = idx+1;
            undo->nodeConds1 = idx;
            undo->join1 = idx;
            return msg;
        }
        node->team1 = team8;
        rc = pthread_create(&node->thr1, 0, Example15ThreaderMain1, node);
        if (__builtin_expect(rc, 0)) {
            // Mutex and condition variable exist, but no thread to join.
            char* msg = Example15Errmsg1(__LINE__, "errno %d", rc);
            undo->nodeMuts1 = idx+1;
            undo->nodeConds1 = idx+1;
            undo->join1 = idx;
            return msg;
        }
    }
    undo->nodeMuts1 = nt7;
    undo->nodeConds1 = nt7;
    undo->join1 = nt7;
    return 0;
}

// Construction step 3: initializes the hub's mutex and condition
// variable, flagging each one in the unwind bookkeeping as soon as it
// exists, then continues with the per-node setup.
//
// Returns 0 on success, or an error string owned by the caller.
static char* Example15ThreaderCreate1Up3(Example15ThreaderTeam1* team7, ptrdiff_t nt6) {
    Example15ThreaderHub1* hub = team7->hub2;
    Example15ThreaderUnwind1* undo = &team7->unwind1;

    int rc = pthread_mutex_init(&hub->mut1, 0);
    if (__builtin_expect(rc, 0)) {
        return Example15Errmsg1(__LINE__, "errno %d", rc);
    }
    undo->hubMut1 = 1;

    rc = pthread_cond_init(&hub->cond1, 0);
    if (__builtin_expect(rc, 0)) {
        return Example15Errmsg1(__LINE__, "errno %d", rc);
    }
    undo->hubCond1 = 1;

    return Example15ThreaderCreate1Up4(team7, nt6);
}

// Construction step 2: allocates the array of `nt5` nodes.
//
// The raw allocation is over-sized by 63 bytes and recorded in the
// unwind bookkeeping (for free()); the team's node pointer is that
// address rounded up to a 64-byte boundary. Returns 0 on success, or an
// error string owned by the caller.
static char* Example15ThreaderCreate1Up2(Example15ThreaderTeam1* team6, ptrdiff_t nt5) {
    size_t count = (size_t)nt5;
    size_t bytes = count*sizeof(Example15ThreaderNode1);
    // Dividing back detects wrap-around of the multiplication above.
    if (__builtin_expect(bytes/sizeof(Example15ThreaderNode1) != count, 0)) {
        return Example15Errmsg1(__LINE__, "too many threads");
    }
    void* raw = malloc(bytes+63);
    if (__builtin_expect(!raw, 0)) {
        return Example15Errmsg1(__LINE__, "errno %d", errno);
    }
    team6->unwind1.nodes1 = raw;
    size_t aligned = ((size_t)raw+63)&~(size_t)63;
    team6->nodes2 = (void*)aligned;
    return Example15ThreaderCreate1Up3(team6, nt5);
}

// Construction step 1: stores the thread count and allocates the hub.
//
// The hub is followed by its status bitmap of nt4/(bits per long)+1
// words; the total is rounded up to a multiple of 64 bytes. The raw
// allocation is over-sized by 63 bytes and recorded in the unwind
// bookkeeping (for free()); the team's hub pointer is that address
// rounded up to a 64-byte boundary. Returns 0 on success, or an error
// string owned by the caller.
static char* Example15ThreaderCreate1Up1(Example15ThreaderTeam1* team5, ptrdiff_t nt4) {
    team5->nt1 = nt4;
    size_t words = (size_t)nt4/(sizeof(long)*8)+1;
    size_t bytes = sizeof(Example15ThreaderHub1)+sizeof(long)*words;
    bytes = (bytes+63)&~(size_t)63;
    void* raw = malloc(bytes+63);
    if (__builtin_expect(!raw, 0)) {
        return Example15Errmsg1(__LINE__, "errno %d", errno);
    }
    team5->unwind1.hub1 = raw;
    size_t aligned = ((size_t)raw+63)&~(size_t)63;
    team5->hub2 = (void*)aligned;
    return Example15ThreaderCreate1Up2(team5, nt4);
}

// Creates a team of `nt3` worker threads and stores it in `*team4`.
//
// Returns 0 on success. On failure returns an error string owned by the
// caller, releases whatever had been set up, and leaves `*team4`
// untouched. Fewer than one thread is rejected.
static char* Example15ThreaderCreate1(Example15ThreaderTeam1** team4, ptrdiff_t nt3) {
    if (__builtin_expect(nt3 < 1, 0)) {
        return Example15Errmsg1(__LINE__, "too few threads");
    }
    // Zero-filled so the unwind bookkeeping starts out as "nothing to
    // release".
    Example15ThreaderTeam1* team = calloc(1, sizeof(Example15ThreaderTeam1));
    if (__builtin_expect(!team, 0)) {
        return Example15Errmsg1(__LINE__, "errno %d", errno);
    }
    char* failure = Example15ThreaderCreate1Up1(team, nt3);
    if (__builtin_expect(!!failure, 0)) {
        Example15ThreaderDestroy1(team);
        return failure;
    }
    *team4 = team;
    return 0;
}

// Copies the pthread_t of the team's thread number `idx1` into `*thr2`.
//
// Returns 0 on success. If `idx1` is outside 0 .. nt-1, returns an error
// string owned by the caller and leaves `*thr2` untouched.
static char* Example15ThreaderPthreadT1(
    pthread_t* thr2,
    Example15ThreaderTeam1* team9,
    ptrdiff_t idx1
) {
    int inRange = 0 <= idx1 && idx1 < team9->nt1;
    if (__builtin_expect(!inRange, 0)) {
        return Example15Errmsg1(__LINE__, "bad thread idx");
    }
    Example15ThreaderNode1* node = team9->nodes2+idx1;
    *thr2 = node->thr1;
    return 0;
}

// Runs `task3` on the team and returns once every worker has finished.
//
// The task's index space (the product of the first nd1 hull extents) is
// split into one contiguous range of points per thread: every thread
// gets tot/nt points and the first tot%nt threads get one more. Each
// node receives its point count and the coordinates of its first point,
// and is then signalled. A task with fewer than one dimension is a
// no-op. Every pthread call is retried until it returns zero.
static void Example15ThreaderDo1(Example15ThreaderTeam1* team10, Example15ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return;
// Total number of points in the index space.
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
// The base share size expressed as a coordinate step; only the first
// nd6 entries of plus2 are written and read.
int64_t plus2[4];
Example15ThreaderPut1(nd6, task3->hull1, plus2, each1);
// Starting coordinates of the current node's range.
int64_t pt6[4] = {0};
Example15ThreaderHub1* hub6 = team10->hub2;
// The hub mutex is held from here until the wait at the end, so a
// worker that finishes its share early blocks on the hub mutex until
// this thread reaches pthread_cond_wait.
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
Example15ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
// The first more1 nodes take one extra point.
int64_t carry3 = i4 < more1;
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
// Advance the start coordinates past the range just handed out.
Example15ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Reset the shared scan position and set every bit of every status
// word (including bits at and beyond index nt8).
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
// Wait until every worker has decremented the pending count.
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

// Element-wise approximation of exp(x) on 16 packed floats.
//
// The input is clamped, an integer r near x*1.442695 is chosen, a
// reduced argument f = x - r*c is formed (c being the sum of the two
// constants used below), a degree-4 polynomial in f is evaluated, and
// the result is scaled by adding an integer, shifted left by 23 bits,
// to the polynomial's bit pattern.
// NOTE(review): 1.442695 looks like log2(e) and the two subtracted
// constants sum to approximately ln(2); accuracy is not stated here.
static __m512 Example15Exp1(__m512 x1) {
// Clamp the input to [-87.33654, 88.72284].
x1 = _mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f));
x1 = _mm512_min_ps(x1, _mm512_set1_ps(8.872284e+01f));
__m512 t1 = _mm512_mul_ps(x1, _mm512_set1_ps(1.442695e+00f));
// r1 = t1 rounded to the nearest integer (still in float form).
__m512 r1 = _mm512_roundscale_ps(t1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
// f1 = x1 - r1*0.69314575 - r1*1.4286068e-06, subtracted in two steps.
__m512 f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-6.9314575e-01f), x1);
f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-1.4286068e-06f), f1);
// Polynomial in f1, evaluated from the highest-order coefficient down.
__m512 g1 = _mm512_set1_ps(4.194439e-02f);
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(1.6800667e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(4.9999994e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.999569e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.9999964e-01f));
// Convert t1 to a 32-bit integer and shift it left by 23 bits, then add
// it to g1's bit pattern as integers.
// NOTE(review): the conversion presumably rounds to the same integer as
// r1 under the default rounding mode, and bit 23 is presumably the low
// bit of the float exponent field (making this a multiply by 2^r1) --
// confirm.
__m512i y1 = _mm512_slli_epi32(_mm512_cvtps_epi32(t1), 23);
return _mm512_castsi512_ps(_mm512_add_epi32(y1, _mm512_castps_si512(g1)));
}

// Element-wise approximation of 1/sqrt(x) on 16 packed floats: starts
// from the hardware estimate y and returns (0.5*y) * (3 - y*(x*y)).
// NOTE(review): this has the form of a single Newton-Raphson refinement
// step for the reciprocal square root; resulting accuracy is not stated
// here.
static __m512 Example15Rsqrt1(__m512 x2) {
// Initial approximate reciprocal square root.
__m512 y2 = _mm512_rsqrt14_ps(x2);
__m512 z1 = _mm512_mul_ps(x2, y2);
__m512 a1 = _mm512_mul_ps(y2, _mm512_set1_ps(5e-01f));
// b1 = 3 - y2*z1 (fnmadd computes -(a*b)+c).
__m512 b1 = _mm512_fnmadd_ps(y2, z1, _mm512_set1_ps(3e+00f));
return _mm512_mul_ps(a1, b1);
}

static void Example15OneArrangeWts1Callee1(Example15ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
ptrdiff_t b2 = pt7[0];
ptrdiff_t g2 = pt7[1];
ptrdiff_t e1 = pt7[2];
if (e1 < 4) {
char*restrict wtPtr1 = tensors2[0]+(ptrdiff_t)3340*e1+(ptrdiff_t)40503904*1*g2;
char*restrict biasPtr1 = tensors2[1]+(ptrdiff_t)9908*1*g2;
char*restrict arranged1 = tensors2[2]+(ptrdiff_t)66264704*e1+(ptrdiff_t)8283088*1*g2;
ptrdiff_t ii1 = 1;
for (ptrdiff_t i5 = 0; i5 < ii1; ++i5) {
ptrdiff_t j1 = 1*b2;
ptrdiff_t jj1 = j1+1;
for (; j1 < jj1; ++j1) {
if (j1 < 154) {
ptrdiff_t k2 = 0+16*(j1-0);
ptrdiff_t l2 = (size_t)(0+k2)/6;
ptrdiff_t cut2 = (size_t)(0+k2)%6;
switch (cut2) {
case 0:;
case 2: {
__m512 sum3;
if (!e1) {
sum3 = _mm512_maskz_loadu_ps(65535, biasPtr1+9908*i5+4*k2);
} else {
sum3 = _mm512_setzero_ps();
}
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*0+(ptrdiff_t)0, 63>>cut2, sum3);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*0+(ptrdiff_t)20040, 4032>>cut2, sum3);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*0+(ptrdiff_t)40080, 65535-(4095>>cut2), sum3);
ptrdiff_t c2 = 0;
for (; c2 != 52; ++c2) {
__m512 wt30 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)0);
__m512 wt31 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)16352);
__m512 wt32 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)32704);
__m512 wt33 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)49056);
__m512 wt34 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)65408);
__m512 wt35 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)81760);
__m512 wt36 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)98112);
__m512 wt37 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)114464);
__m512 wt38 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)130816);
__m512 wt39 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)147168);
__m512 wt40 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)163520);
__m512 wt41 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)179872);
__m512 wt42 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)196224);
__m512 wt43 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)212576);
__m512 wt44 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)228928);
__m512 wt45 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)245280);
__m512 tmp1 = _mm512_unpacklo_ps(wt30, wt31);
__m512 tmp2 = _mm512_unpackhi_ps(wt30, wt31);
__m512 tmp3 = _mm512_unpacklo_ps(wt32, wt33);
__m512 tmp4 = _mm512_unpackhi_ps(wt32, wt33);
__m512 tmp5 = _mm512_unpacklo_ps(wt34, wt35);
__m512 tmp6 = _mm512_unpackhi_ps(wt34, wt35);
__m512 tmp7 = _mm512_unpacklo_ps(wt36, wt37);
__m512 tmp8 = _mm512_unpackhi_ps(wt36, wt37);
__m512 tmp9 = _mm512_unpacklo_ps(wt38, wt39);
__m512 tmp10 = _mm512_unpackhi_ps(wt38, wt39);
__m512 tmp11 = _mm512_unpacklo_ps(wt40, wt41);
__m512 tmp12 = _mm512_unpackhi_ps(wt40, wt41);
__m512 tmp13 = _mm512_unpacklo_ps(wt42, wt43);
__m512 tmp14 = _mm512_unpackhi_ps(wt42, wt43);
__m512 tmp15 = _mm512_unpacklo_ps(wt44, wt45);
__m512 tmp16 = _mm512_unpackhi_ps(wt44, wt45);
__m512 tmp17 = _mm512_shuffle_ps(tmp1, tmp3, 68);
__m512 tmp18 = _mm512_shuffle_ps(tmp1, tmp3, 238);
__m512 tmp19 = _mm512_shuffle_ps(tmp2, tmp4, 68);
__m512 tmp20 = _mm512_shuffle_ps(tmp2, tmp4, 238);
__m512 tmp21 = _mm512_shuffle_ps(tmp5, tmp7, 68);
__m512 tmp22 = _mm512_shuffle_ps(tmp5, tmp7, 238);
__m512 tmp23 = _mm512_shuffle_ps(tmp6, tmp8, 68);
__m512 tmp24 = _mm512_shuffle_ps(tmp6, tmp8, 238);
__m512 tmp25 = _mm512_shuffle_ps(tmp9, tmp11, 68);
__m512 tmp26 = _mm512_shuffle_ps(tmp9, tmp11, 238);
__m512 tmp27 = _mm512_shuffle_ps(tmp10, tmp12, 68);
__m512 tmp28 = _mm512_shuffle_ps(tmp10, tmp12, 238);
__m512 tmp29 = _mm512_shuffle_ps(tmp13, tmp15, 68);
__m512 tmp30 = _mm512_shuffle_ps(tmp13, tmp15, 238);
__m512 tmp31 = _mm512_shuffle_ps(tmp14, tmp16, 68);
__m512 tmp32 = _mm512_shuffle_ps(tmp14, tmp16, 238);
__m512 tmp33 = _mm512_shuffle_f32x4(tmp17, tmp21, 136);
__m512 tmp34 = _mm512_shuffle_f32x4(tmp17, tmp21, 221);
__m512 tmp35 = _mm512_shuffle_f32x4(tmp18, tmp22, 136);
__m512 tmp36 = _mm512_shuffle_f32x4(tmp18, tmp22, 221);
__m512 tmp37 = _mm512_shuffle_f32x4(tmp19, tmp23, 136);
__m512 tmp38 = _mm512_shuffle_f32x4(tmp19, tmp23, 221);
__m512 tmp39 = _mm512_shuffle_f32x4(tmp20, tmp24, 136);
__m512 tmp40 = _mm512_shuffle_f32x4(tmp20, tmp24, 221);
__m512 tmp41 = _mm512_shuffle_f32x4(tmp25, tmp29, 136);
__m512 tmp42 = _mm512_shuffle_f32x4(tmp25, tmp29, 221);
__m512 tmp43 = _mm512_shuffle_f32x4(tmp26, tmp30, 136);
__m512 tmp44 = _mm512_shuffle_f32x4(tmp26, tmp30, 221);
__m512 tmp45 = _mm512_shuffle_f32x4(tmp27, tmp31, 136);
__m512 tmp46 = _mm512_shuffle_f32x4(tmp27, tmp31, 221);
__m512 tmp47 = _mm512_shuffle_f32x4(tmp28, tmp32, 136);
__m512 tmp48 = _mm512_shuffle_f32x4(tmp28, tmp32, 221);
wt30 = _mm512_shuffle_f32x4(tmp33, tmp41, 136);
wt38 = _mm512_shuffle_f32x4(tmp33, tmp41, 221);
wt31 = _mm512_shuffle_f32x4(tmp35, tmp43, 136);
wt39 = _mm512_shuffle_f32x4(tmp35, tmp43, 221);
wt32 = _mm512_shuffle_f32x4(tmp37, tmp45, 136);
wt40 = _mm512_shuffle_f32x4(tmp37, tmp45, 221);
wt33 = _mm512_shuffle_f32x4(tmp39, tmp47, 136);
wt41 = _mm512_shuffle_f32x4(tmp39, tmp47, 221);
wt34 = _mm512_shuffle_f32x4(tmp34, tmp42, 136);
wt42 = _mm512_shuffle_f32x4(tmp34, tmp42, 221);
wt35 = _mm512_shuffle_f32x4(tmp36, tmp44, 136);
wt43 = _mm512_shuffle_f32x4(tmp36, tmp44, 221);
wt36 = _mm512_shuffle_f32x4(tmp38, tmp46, 136);
wt44 = _mm512_shuffle_f32x4(tmp38, tmp46, 221);
wt37 = _mm512_shuffle_f32x4(tmp40, tmp48, 136);
wt45 = _mm512_shuffle_f32x4(tmp40, tmp48, 221);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)0, 63>>cut2, wt30);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)0, 63>>cut2, wt31);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)0, 63>>cut2, wt32);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(4+16*c2)+(ptrdiff_t)0, 63>>cut2, wt33);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(5+16*c2)+(ptrdiff_t)0, 63>>cut2, wt34);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(6+16*c2)+(ptrdiff_t)0, 63>>cut2, wt35);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(7+16*c2)+(ptrdiff_t)0, 63>>cut2, wt36);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(8+16*c2)+(ptrdiff_t)0, 63>>cut2, wt37);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(9+16*c2)+(ptrdiff_t)0, 63>>cut2, wt38);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(10+16*c2)+(ptrdiff_t)0, 63>>cut2, wt39);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(11+16*c2)+(ptrdiff_t)0, 63>>cut2, wt40);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(12+16*c2)+(ptrdiff_t)0, 63>>cut2, wt41);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(13+16*c2)+(ptrdiff_t)0, 63>>cut2, wt42);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(14+16*c2)+(ptrdiff_t)0, 63>>cut2, wt43);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(15+16*c2)+(ptrdiff_t)0, 63>>cut2, wt44);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(16+16*c2)+(ptrdiff_t)0, 63>>cut2, wt45);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt30);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt31);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt32);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(4+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt33);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(5+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt34);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(6+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt35);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(7+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt36);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(8+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt37);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(9+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt38);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(10+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt39);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(11+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt40);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(12+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt41);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(13+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt42);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(14+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt43);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(15+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt44);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(16+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt45);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt30);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt31);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt32);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(4+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt33);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(5+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt34);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(6+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt35);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(7+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt36);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(8+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt37);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(9+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt38);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(10+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt39);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(11+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt40);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(12+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt41);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(13+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt42);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(14+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt43);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(15+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt44);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(16+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt45);
}
__m512 wt46 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)0);
__m512 wt47 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)16352);
__m512 wt48 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)32704);
__m512 wt49 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)49056);
__m512 wt50 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)65408);
__m512 wt51 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)81760);
__m512 wt52 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)98112);
__m512 wt53 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)114464);
__m512 wt54 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)130816);
__m512 wt55 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)147168);
__m512 wt56 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)163520);
__m512 wt57 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)179872);
__m512 wt58 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)196224);
__m512 wt59 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)212576);
__m512 wt60 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)228928);
__m512 wt61 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c2+(ptrdiff_t)245280);
__m512 tmp49 = _mm512_unpacklo_ps(wt46, wt47);
__m512 tmp50 = _mm512_unpackhi_ps(wt46, wt47);
__m512 tmp51 = _mm512_unpacklo_ps(wt48, wt49);
__m512 tmp52 = _mm512_unpackhi_ps(wt48, wt49);
__m512 tmp53 = _mm512_unpacklo_ps(wt50, wt51);
__m512 tmp54 = _mm512_unpackhi_ps(wt50, wt51);
__m512 tmp55 = _mm512_unpacklo_ps(wt52, wt53);
__m512 tmp56 = _mm512_unpackhi_ps(wt52, wt53);
__m512 tmp57 = _mm512_unpacklo_ps(wt54, wt55);
__m512 tmp58 = _mm512_unpackhi_ps(wt54, wt55);
__m512 tmp59 = _mm512_unpacklo_ps(wt56, wt57);
__m512 tmp60 = _mm512_unpackhi_ps(wt56, wt57);
__m512 tmp61 = _mm512_unpacklo_ps(wt58, wt59);
__m512 tmp62 = _mm512_unpackhi_ps(wt58, wt59);
__m512 tmp63 = _mm512_unpacklo_ps(wt60, wt61);
__m512 tmp64 = _mm512_unpackhi_ps(wt60, wt61);
__m512 tmp65 = _mm512_shuffle_ps(tmp49, tmp51, 68);
__m512 tmp66 = _mm512_shuffle_ps(tmp49, tmp51, 238);
__m512 tmp67 = _mm512_shuffle_ps(tmp50, tmp52, 68);
__m512 tmp68 = _mm512_shuffle_ps(tmp53, tmp55, 68);
__m512 tmp69 = _mm512_shuffle_ps(tmp53, tmp55, 238);
__m512 tmp70 = _mm512_shuffle_ps(tmp54, tmp56, 68);
__m512 tmp71 = _mm512_shuffle_ps(tmp57, tmp59, 68);
__m512 tmp72 = _mm512_shuffle_ps(tmp57, tmp59, 238);
__m512 tmp73 = _mm512_shuffle_ps(tmp58, tmp60, 68);
__m512 tmp74 = _mm512_shuffle_ps(tmp61, tmp63, 68);
__m512 tmp75 = _mm512_shuffle_ps(tmp61, tmp63, 238);
__m512 tmp76 = _mm512_shuffle_ps(tmp62, tmp64, 68);
__m512 tmp77 = _mm512_shuffle_f32x4(tmp65, tmp68, 136);
__m512 tmp78 = _mm512_shuffle_f32x4(tmp66, tmp69, 136);
__m512 tmp79 = _mm512_shuffle_f32x4(tmp67, tmp70, 136);
__m512 tmp80 = _mm512_shuffle_f32x4(tmp71, tmp74, 136);
__m512 tmp81 = _mm512_shuffle_f32x4(tmp72, tmp75, 136);
__m512 tmp82 = _mm512_shuffle_f32x4(tmp73, tmp76, 136);
wt46 = _mm512_shuffle_f32x4(tmp77, tmp80, 136);
wt47 = _mm512_shuffle_f32x4(tmp78, tmp81, 136);
wt48 = _mm512_shuffle_f32x4(tmp79, tmp82, 136);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)0, 63>>cut2, wt46);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)0, 63>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)0, 63>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt46);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt46);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt47);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt48);
break;
}
default: {
// Switch arm that fixes cut2 at 4 and then repacks one bias vector plus
// 16 weight rows (row pitch 16352) into arranged1.
//
// NOTE(review): offsets below read as byte counts (64 per 16 floats, 24 per
// 6 floats); this assumes wtPtr1/biasPtr1/arranged1 are char-typed pointers,
// which are declared outside this view -- confirm.
//
// Every 16-lane vector is scattered by four masked stores. With cut2 == 4:
//   63>>cut2              -> lanes 0-1   (offset 0)
//   4032>>cut2            -> lanes 2-7   (offset 20040)
//   258048>>cut2          -> lanes 8-13  (offset 40080)
//   65535-(262143>>cut2)  -> lanes 14-15 (offset 60120)
// 20040 is 20064-24, so under the byte-offset assumption store group k lands
// lane j in column cut2+j-6*k of the 24-byte row belonging to block l2+k.
cut2 = 4;
__m512 sum4;
// Row 0 of each destination block: 16 floats from biasPtr1 when e1 is zero,
// otherwise zeros.
if (!e1) {
sum4 = _mm512_maskz_loadu_ps(65535, biasPtr1+9908*i5+4*k2);
} else {
sum4 = _mm512_setzero_ps();
}
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*0+(ptrdiff_t)0, 63>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*0+(ptrdiff_t)20040, 4032>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*0+(ptrdiff_t)40080, 258048>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*0+(ptrdiff_t)60120, 65535-(262143>>cut2), sum4);
ptrdiff_t c3 = 0;
// Main loop: 52 iterations, each consuming 16 floats from each of the 16
// source rows.
for (; c3 != 52; ++c3) {
// Load a 16x16 tile: 16 source rows spaced 16352 apart, 16 floats each.
__m512 wt62 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)0);
__m512 wt63 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)16352);
__m512 wt64 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)32704);
__m512 wt65 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)49056);
__m512 wt66 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)65408);
__m512 wt67 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)81760);
__m512 wt68 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)98112);
__m512 wt69 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)114464);
__m512 wt70 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)130816);
__m512 wt71 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)147168);
__m512 wt72 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)163520);
__m512 wt73 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)179872);
__m512 wt74 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)196224);
__m512 wt75 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)212576);
__m512 wt76 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)228928);
__m512 wt77 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)245280);
// 16x16 transpose, stage 1: interleave adjacent row pairs within each
// 128-bit lane.
__m512 tmp83 = _mm512_unpacklo_ps(wt62, wt63);
__m512 tmp84 = _mm512_unpackhi_ps(wt62, wt63);
__m512 tmp85 = _mm512_unpacklo_ps(wt64, wt65);
__m512 tmp86 = _mm512_unpackhi_ps(wt64, wt65);
__m512 tmp87 = _mm512_unpacklo_ps(wt66, wt67);
__m512 tmp88 = _mm512_unpackhi_ps(wt66, wt67);
__m512 tmp89 = _mm512_unpacklo_ps(wt68, wt69);
__m512 tmp90 = _mm512_unpackhi_ps(wt68, wt69);
__m512 tmp91 = _mm512_unpacklo_ps(wt70, wt71);
__m512 tmp92 = _mm512_unpackhi_ps(wt70, wt71);
__m512 tmp93 = _mm512_unpacklo_ps(wt72, wt73);
__m512 tmp94 = _mm512_unpackhi_ps(wt72, wt73);
__m512 tmp95 = _mm512_unpacklo_ps(wt74, wt75);
__m512 tmp96 = _mm512_unpackhi_ps(wt74, wt75);
__m512 tmp97 = _mm512_unpacklo_ps(wt76, wt77);
__m512 tmp98 = _mm512_unpackhi_ps(wt76, wt77);
// Stage 2: combine pairs into groups of four rows (68 takes the low halves,
// 238 the high halves), giving one source column per 128-bit lane.
__m512 tmp99 = _mm512_shuffle_ps(tmp83, tmp85, 68);
__m512 tmp100 = _mm512_shuffle_ps(tmp83, tmp85, 238);
__m512 tmp101 = _mm512_shuffle_ps(tmp84, tmp86, 68);
__m512 tmp102 = _mm512_shuffle_ps(tmp84, tmp86, 238);
__m512 tmp103 = _mm512_shuffle_ps(tmp87, tmp89, 68);
__m512 tmp104 = _mm512_shuffle_ps(tmp87, tmp89, 238);
__m512 tmp105 = _mm512_shuffle_ps(tmp88, tmp90, 68);
__m512 tmp106 = _mm512_shuffle_ps(tmp88, tmp90, 238);
__m512 tmp107 = _mm512_shuffle_ps(tmp91, tmp93, 68);
__m512 tmp108 = _mm512_shuffle_ps(tmp91, tmp93, 238);
__m512 tmp109 = _mm512_shuffle_ps(tmp92, tmp94, 68);
__m512 tmp110 = _mm512_shuffle_ps(tmp92, tmp94, 238);
__m512 tmp111 = _mm512_shuffle_ps(tmp95, tmp97, 68);
__m512 tmp112 = _mm512_shuffle_ps(tmp95, tmp97, 238);
__m512 tmp113 = _mm512_shuffle_ps(tmp96, tmp98, 68);
__m512 tmp114 = _mm512_shuffle_ps(tmp96, tmp98, 238);
// Stage 3: regroup 128-bit lanes across row groups (136 picks lanes 0 and 2
// of each operand, 221 picks lanes 1 and 3).
__m512 tmp115 = _mm512_shuffle_f32x4(tmp99, tmp103, 136);
__m512 tmp116 = _mm512_shuffle_f32x4(tmp99, tmp103, 221);
__m512 tmp117 = _mm512_shuffle_f32x4(tmp100, tmp104, 136);
__m512 tmp118 = _mm512_shuffle_f32x4(tmp100, tmp104, 221);
__m512 tmp119 = _mm512_shuffle_f32x4(tmp101, tmp105, 136);
__m512 tmp120 = _mm512_shuffle_f32x4(tmp101, tmp105, 221);
__m512 tmp121 = _mm512_shuffle_f32x4(tmp102, tmp106, 136);
__m512 tmp122 = _mm512_shuffle_f32x4(tmp102, tmp106, 221);
__m512 tmp123 = _mm512_shuffle_f32x4(tmp107, tmp111, 136);
__m512 tmp124 = _mm512_shuffle_f32x4(tmp107, tmp111, 221);
__m512 tmp125 = _mm512_shuffle_f32x4(tmp108, tmp112, 136);
__m512 tmp126 = _mm512_shuffle_f32x4(tmp108, tmp112, 221);
__m512 tmp127 = _mm512_shuffle_f32x4(tmp109, tmp113, 136);
__m512 tmp128 = _mm512_shuffle_f32x4(tmp109, tmp113, 221);
__m512 tmp129 = _mm512_shuffle_f32x4(tmp110, tmp114, 136);
__m512 tmp130 = _mm512_shuffle_f32x4(tmp110, tmp114, 221);
// Stage 4: after these shuffles wt62..wt77 hold tile columns 0..15 in order,
// each with one float from every one of the 16 source rows.
wt62 = _mm512_shuffle_f32x4(tmp115, tmp123, 136);
wt70 = _mm512_shuffle_f32x4(tmp115, tmp123, 221);
wt63 = _mm512_shuffle_f32x4(tmp117, tmp125, 136);
wt71 = _mm512_shuffle_f32x4(tmp117, tmp125, 221);
wt64 = _mm512_shuffle_f32x4(tmp119, tmp127, 136);
wt72 = _mm512_shuffle_f32x4(tmp119, tmp127, 221);
wt65 = _mm512_shuffle_f32x4(tmp121, tmp129, 136);
wt73 = _mm512_shuffle_f32x4(tmp121, tmp129, 221);
wt66 = _mm512_shuffle_f32x4(tmp116, tmp124, 136);
wt74 = _mm512_shuffle_f32x4(tmp116, tmp124, 221);
wt67 = _mm512_shuffle_f32x4(tmp118, tmp126, 136);
wt75 = _mm512_shuffle_f32x4(tmp118, tmp126, 221);
wt68 = _mm512_shuffle_f32x4(tmp120, tmp128, 136);
wt76 = _mm512_shuffle_f32x4(tmp120, tmp128, 221);
wt69 = _mm512_shuffle_f32x4(tmp122, tmp130, 136);
wt77 = _mm512_shuffle_f32x4(tmp122, tmp130, 221);
// Scatter: tile column j goes to destination row 1+j+16*c3 (row 0 holds the
// bias). First store group: lanes 0-1.
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(1+16*c3)+(ptrdiff_t)0, 63>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(2+16*c3)+(ptrdiff_t)0, 63>>cut2, wt63);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(3+16*c3)+(ptrdiff_t)0, 63>>cut2, wt64);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(4+16*c3)+(ptrdiff_t)0, 63>>cut2, wt65);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(5+16*c3)+(ptrdiff_t)0, 63>>cut2, wt66);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(6+16*c3)+(ptrdiff_t)0, 63>>cut2, wt67);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(7+16*c3)+(ptrdiff_t)0, 63>>cut2, wt68);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(8+16*c3)+(ptrdiff_t)0, 63>>cut2, wt69);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(9+16*c3)+(ptrdiff_t)0, 63>>cut2, wt70);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(10+16*c3)+(ptrdiff_t)0, 63>>cut2, wt71);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(11+16*c3)+(ptrdiff_t)0, 63>>cut2, wt72);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(12+16*c3)+(ptrdiff_t)0, 63>>cut2, wt73);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(13+16*c3)+(ptrdiff_t)0, 63>>cut2, wt74);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(14+16*c3)+(ptrdiff_t)0, 63>>cut2, wt75);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(15+16*c3)+(ptrdiff_t)0, 63>>cut2, wt76);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(16+16*c3)+(ptrdiff_t)0, 63>>cut2, wt77);
// Second store group: lanes 2-7.
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(1+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(2+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt63);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(3+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt64);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(4+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt65);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(5+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt66);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(6+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt67);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(7+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt68);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(8+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt69);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(9+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt70);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(10+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt71);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(11+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt72);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(12+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt73);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(13+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt74);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(14+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt75);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(15+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt76);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(16+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt77);
// Third store group: lanes 8-13.
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(1+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(2+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt63);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(3+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt64);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(4+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt65);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(5+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt66);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(6+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt67);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(7+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt68);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(8+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt69);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(9+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt70);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(10+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt71);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(11+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt72);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(12+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt73);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(13+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt74);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(14+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt75);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(15+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt76);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(16+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt77);
// Fourth store group: lanes 14-15.
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(1+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt62);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(2+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt63);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(3+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt64);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(4+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt65);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(5+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt66);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(6+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt67);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(7+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt68);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(8+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt69);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(9+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt70);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(10+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt71);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(11+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt72);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(12+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt73);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(13+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt74);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(14+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt75);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(15+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt76);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(16+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt77);
}
// Tail: c3 is 52 here. Mask 7 loads only the 3 remaining floats of each of
// the 16 source rows; the other lanes are zeroed.
__m512 wt78 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)0);
__m512 wt79 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)16352);
__m512 wt80 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)32704);
__m512 wt81 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)49056);
__m512 wt82 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)65408);
__m512 wt83 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)81760);
__m512 wt84 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)98112);
__m512 wt85 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)114464);
__m512 wt86 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)130816);
__m512 wt87 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)147168);
__m512 wt88 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)163520);
__m512 wt89 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)179872);
__m512 wt90 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)196224);
__m512 wt91 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)212576);
__m512 wt92 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)228928);
__m512 wt93 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k2+64*c3+(ptrdiff_t)245280);
// Same transpose network as the main loop, pruned to the shuffles that feed
// output columns 0-2.
__m512 tmp131 = _mm512_unpacklo_ps(wt78, wt79);
__m512 tmp132 = _mm512_unpackhi_ps(wt78, wt79);
__m512 tmp133 = _mm512_unpacklo_ps(wt80, wt81);
__m512 tmp134 = _mm512_unpackhi_ps(wt80, wt81);
__m512 tmp135 = _mm512_unpacklo_ps(wt82, wt83);
__m512 tmp136 = _mm512_unpackhi_ps(wt82, wt83);
__m512 tmp137 = _mm512_unpacklo_ps(wt84, wt85);
__m512 tmp138 = _mm512_unpackhi_ps(wt84, wt85);
__m512 tmp139 = _mm512_unpacklo_ps(wt86, wt87);
__m512 tmp140 = _mm512_unpackhi_ps(wt86, wt87);
__m512 tmp141 = _mm512_unpacklo_ps(wt88, wt89);
__m512 tmp142 = _mm512_unpackhi_ps(wt88, wt89);
__m512 tmp143 = _mm512_unpacklo_ps(wt90, wt91);
__m512 tmp144 = _mm512_unpackhi_ps(wt90, wt91);
__m512 tmp145 = _mm512_unpacklo_ps(wt92, wt93);
__m512 tmp146 = _mm512_unpackhi_ps(wt92, wt93);
__m512 tmp147 = _mm512_shuffle_ps(tmp131, tmp133, 68);
__m512 tmp148 = _mm512_shuffle_ps(tmp131, tmp133, 238);
__m512 tmp149 = _mm512_shuffle_ps(tmp132, tmp134, 68);
__m512 tmp150 = _mm512_shuffle_ps(tmp135, tmp137, 68);
__m512 tmp151 = _mm512_shuffle_ps(tmp135, tmp137, 238);
__m512 tmp152 = _mm512_shuffle_ps(tmp136, tmp138, 68);
__m512 tmp153 = _mm512_shuffle_ps(tmp139, tmp141, 68);
__m512 tmp154 = _mm512_shuffle_ps(tmp139, tmp141, 238);
__m512 tmp155 = _mm512_shuffle_ps(tmp140, tmp142, 68);
__m512 tmp156 = _mm512_shuffle_ps(tmp143, tmp145, 68);
__m512 tmp157 = _mm512_shuffle_ps(tmp143, tmp145, 238);
__m512 tmp158 = _mm512_shuffle_ps(tmp144, tmp146, 68);
__m512 tmp159 = _mm512_shuffle_f32x4(tmp147, tmp150, 136);
__m512 tmp160 = _mm512_shuffle_f32x4(tmp148, tmp151, 136);
__m512 tmp161 = _mm512_shuffle_f32x4(tmp149, tmp152, 136);
__m512 tmp162 = _mm512_shuffle_f32x4(tmp153, tmp156, 136);
__m512 tmp163 = _mm512_shuffle_f32x4(tmp154, tmp157, 136);
__m512 tmp164 = _mm512_shuffle_f32x4(tmp155, tmp158, 136);
wt78 = _mm512_shuffle_f32x4(tmp159, tmp162, 136);
wt79 = _mm512_shuffle_f32x4(tmp160, tmp163, 136);
wt80 = _mm512_shuffle_f32x4(tmp161, tmp164, 136);
// Scatter the three tail columns to rows 833-835 (1..3 + 16*52) with the same
// four lane groups. Together with the bias row that is 836 rows of 24, i.e.
// 20064, which equals the l2 stride used in every address above.
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(1+16*c3)+(ptrdiff_t)0, 63>>cut2, wt78);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(2+16*c3)+(ptrdiff_t)0, 63>>cut2, wt79);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(3+16*c3)+(ptrdiff_t)0, 63>>cut2, wt80);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(1+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt78);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(2+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt79);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(3+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt80);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(1+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt78);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(2+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt79);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(3+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt80);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(1+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt78);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(2+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt79);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l2+4*cut2+24*(3+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt80);
}
}
} else {
ptrdiff_t k1 = 2464;
ptrdiff_t l1 = (size_t)(0+k1)/6;
ptrdiff_t cut1 = (size_t)(0+k1)%6;
__m512 sum2;
if (!e1) {
sum2 = _mm512_maskz_loadu_ps(8191, biasPtr1+9908*i5+4*k1);
} else {
sum2 = _mm512_setzero_ps();
}
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*0+(ptrdiff_t)0, 63>>cut1, sum2);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*0+(ptrdiff_t)20040, 4032>>cut1, sum2);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*0+(ptrdiff_t)40080, 8191-(4095>>cut1), sum2);
ptrdiff_t c1 = 0;
// Main loop of the k1 = 2464 branch: 52 iterations, each loading 16 floats
// from each of 13 source rows (row pitch 16352), transposing them, and
// scattering the 16 resulting columns into arranged1.
// Store masks use cut1, set before this loop as k1 % 6 (= 4):
//   63>>cut1            -> lanes 0-1  (offset 0,     row pitch 24)
//   4032>>cut1          -> lanes 2-7  (offset 20040, row pitch 24)
//   8191-(4095>>cut1)   -> lanes 8-12 (offset 40080, row pitch 20)
for (; c1 != 52; ++c1) {
__m512 wt1 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)0);
__m512 wt2 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)16352);
__m512 wt3 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)32704);
__m512 wt4 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)49056);
__m512 wt5 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)65408);
__m512 wt6 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)81760);
__m512 wt7 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)98112);
__m512 wt8 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)114464);
__m512 wt9 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)130816);
__m512 wt10 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)147168);
__m512 wt11 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)163520);
__m512 wt12 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)179872);
__m512 wt13 = _mm512_maskz_loadu_ps(65535, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)196224);
// Transpose stage 1: interleave row pairs. The 13th row has no partner, so
// it is paired with itself.
__m512 tmp165 = _mm512_unpacklo_ps(wt1, wt2);
__m512 tmp166 = _mm512_unpackhi_ps(wt1, wt2);
__m512 tmp167 = _mm512_unpacklo_ps(wt3, wt4);
__m512 tmp168 = _mm512_unpackhi_ps(wt3, wt4);
__m512 tmp169 = _mm512_unpacklo_ps(wt5, wt6);
__m512 tmp170 = _mm512_unpackhi_ps(wt5, wt6);
__m512 tmp171 = _mm512_unpacklo_ps(wt7, wt8);
__m512 tmp172 = _mm512_unpackhi_ps(wt7, wt8);
__m512 tmp173 = _mm512_unpacklo_ps(wt9, wt10);
__m512 tmp174 = _mm512_unpackhi_ps(wt9, wt10);
__m512 tmp175 = _mm512_unpacklo_ps(wt11, wt12);
__m512 tmp176 = _mm512_unpackhi_ps(wt11, wt12);
__m512 tmp177 = _mm512_unpacklo_ps(wt13, wt13);
__m512 tmp178 = _mm512_unpackhi_ps(wt13, wt13);
// Stage 2: groups of four rows. For the lone 13th row only the 238 (high
// half) variants are computed; tmp177/tmp178 are reused directly below where
// the low-half variant would otherwise appear.
__m512 tmp179 = _mm512_shuffle_ps(tmp165, tmp167, 68);
__m512 tmp180 = _mm512_shuffle_ps(tmp165, tmp167, 238);
__m512 tmp181 = _mm512_shuffle_ps(tmp166, tmp168, 68);
__m512 tmp182 = _mm512_shuffle_ps(tmp166, tmp168, 238);
__m512 tmp183 = _mm512_shuffle_ps(tmp169, tmp171, 68);
__m512 tmp184 = _mm512_shuffle_ps(tmp169, tmp171, 238);
__m512 tmp185 = _mm512_shuffle_ps(tmp170, tmp172, 68);
__m512 tmp186 = _mm512_shuffle_ps(tmp170, tmp172, 238);
__m512 tmp187 = _mm512_shuffle_ps(tmp173, tmp175, 68);
__m512 tmp188 = _mm512_shuffle_ps(tmp173, tmp175, 238);
__m512 tmp189 = _mm512_shuffle_ps(tmp174, tmp176, 68);
__m512 tmp190 = _mm512_shuffle_ps(tmp174, tmp176, 238);
__m512 tmp191 = _mm512_shuffle_ps(tmp177, tmp177, 238);
__m512 tmp192 = _mm512_shuffle_ps(tmp178, tmp178, 238);
// Stage 3: regroup 128-bit lanes (136 picks lanes 0 and 2 of each operand,
// 221 picks lanes 1 and 3).
__m512 tmp193 = _mm512_shuffle_f32x4(tmp179, tmp183, 136);
__m512 tmp194 = _mm512_shuffle_f32x4(tmp179, tmp183, 221);
__m512 tmp195 = _mm512_shuffle_f32x4(tmp180, tmp184, 136);
__m512 tmp196 = _mm512_shuffle_f32x4(tmp180, tmp184, 221);
__m512 tmp197 = _mm512_shuffle_f32x4(tmp181, tmp185, 136);
__m512 tmp198 = _mm512_shuffle_f32x4(tmp181, tmp185, 221);
__m512 tmp199 = _mm512_shuffle_f32x4(tmp182, tmp186, 136);
__m512 tmp200 = _mm512_shuffle_f32x4(tmp182, tmp186, 221);
__m512 tmp201 = _mm512_shuffle_f32x4(tmp187, tmp177, 136);
__m512 tmp202 = _mm512_shuffle_f32x4(tmp187, tmp177, 221);
__m512 tmp203 = _mm512_shuffle_f32x4(tmp188, tmp191, 136);
__m512 tmp204 = _mm512_shuffle_f32x4(tmp188, tmp191, 221);
__m512 tmp205 = _mm512_shuffle_f32x4(tmp189, tmp178, 136);
__m512 tmp206 = _mm512_shuffle_f32x4(tmp189, tmp178, 221);
__m512 tmp207 = _mm512_shuffle_f32x4(tmp190, tmp192, 136);
__m512 tmp208 = _mm512_shuffle_f32x4(tmp190, tmp192, 221);
// Stage 4: wt1..wt16 now hold tile columns 0..15 in order. Lanes 0-12 carry
// the 13 source rows; lanes 13-15 are filler and are never written, since
// every store mask below covers lanes 0-12 only. wt14..wt16 are declared
// here because only 13 input vectors existed.
wt1 = _mm512_shuffle_f32x4(tmp193, tmp201, 136);
wt9 = _mm512_shuffle_f32x4(tmp193, tmp201, 221);
wt2 = _mm512_shuffle_f32x4(tmp195, tmp203, 136);
wt10 = _mm512_shuffle_f32x4(tmp195, tmp203, 221);
wt3 = _mm512_shuffle_f32x4(tmp197, tmp205, 136);
wt11 = _mm512_shuffle_f32x4(tmp197, tmp205, 221);
wt4 = _mm512_shuffle_f32x4(tmp199, tmp207, 136);
wt12 = _mm512_shuffle_f32x4(tmp199, tmp207, 221);
wt5 = _mm512_shuffle_f32x4(tmp194, tmp202, 136);
wt13 = _mm512_shuffle_f32x4(tmp194, tmp202, 221);
wt6 = _mm512_shuffle_f32x4(tmp196, tmp204, 136);
__m512 wt14 = _mm512_shuffle_f32x4(tmp196, tmp204, 221);
wt7 = _mm512_shuffle_f32x4(tmp198, tmp206, 136);
__m512 wt15 = _mm512_shuffle_f32x4(tmp198, tmp206, 221);
wt8 = _mm512_shuffle_f32x4(tmp200, tmp208, 136);
__m512 wt16 = _mm512_shuffle_f32x4(tmp200, tmp208, 221);
// First store group: lanes 0-1, tile column j to row 1+j+16*c1.
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(1+16*c1)+(ptrdiff_t)0, 63>>cut1, wt1);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(2+16*c1)+(ptrdiff_t)0, 63>>cut1, wt2);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(3+16*c1)+(ptrdiff_t)0, 63>>cut1, wt3);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(4+16*c1)+(ptrdiff_t)0, 63>>cut1, wt4);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(5+16*c1)+(ptrdiff_t)0, 63>>cut1, wt5);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(6+16*c1)+(ptrdiff_t)0, 63>>cut1, wt6);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(7+16*c1)+(ptrdiff_t)0, 63>>cut1, wt7);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(8+16*c1)+(ptrdiff_t)0, 63>>cut1, wt8);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(9+16*c1)+(ptrdiff_t)0, 63>>cut1, wt9);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(10+16*c1)+(ptrdiff_t)0, 63>>cut1, wt10);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(11+16*c1)+(ptrdiff_t)0, 63>>cut1, wt11);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(12+16*c1)+(ptrdiff_t)0, 63>>cut1, wt12);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(13+16*c1)+(ptrdiff_t)0, 63>>cut1, wt13);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(14+16*c1)+(ptrdiff_t)0, 63>>cut1, wt14);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(15+16*c1)+(ptrdiff_t)0, 63>>cut1, wt15);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(16+16*c1)+(ptrdiff_t)0, 63>>cut1, wt16);
// Second store group: lanes 2-7.
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(1+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt1);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(2+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt2);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(3+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt3);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(4+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt4);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(5+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt5);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(6+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt6);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(7+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt7);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(8+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt8);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(9+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt9);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(10+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt10);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(11+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt11);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(12+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt12);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(13+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt13);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(14+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt14);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(15+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt15);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(16+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt16);
// Third store group: lanes 8-12, row pitch 20 instead of 24.
// NOTE(review): a pitch of 20 matches the 5 lanes this mask selects, so the
// last destination block is presumably only 5 columns wide -- confirm
// against the code that consumes arranged1 before "fixing" this to 24.
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(1+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt1);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(2+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt2);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(3+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt3);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(4+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt4);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(5+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt5);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(6+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt6);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(7+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt7);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(8+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt8);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(9+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt9);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(10+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt10);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(11+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt11);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(12+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt12);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(13+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt13);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(14+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt14);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(15+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt15);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(16+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt16);
}
__m512 wt17 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)0);
__m512 wt18 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)16352);
__m512 wt19 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)32704);
__m512 wt20 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)49056);
__m512 wt21 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)65408);
__m512 wt22 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)81760);
__m512 wt23 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)98112);
__m512 wt24 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)114464);
__m512 wt25 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)130816);
__m512 wt26 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)147168);
__m512 wt27 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)163520);
__m512 wt28 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)179872);
__m512 wt29 = _mm512_maskz_loadu_ps(7, wtPtr1+40503904*i5+16352*k1+64*c1+(ptrdiff_t)196224);
__m512 tmp209 = _mm512_unpacklo_ps(wt17, wt18);
__m512 tmp210 = _mm512_unpackhi_ps(wt17, wt18);
__m512 tmp211 = _mm512_unpacklo_ps(wt19, wt20);
__m512 tmp212 = _mm512_unpackhi_ps(wt19, wt20);
__m512 tmp213 = _mm512_unpacklo_ps(wt21, wt22);
__m512 tmp214 = _mm512_unpackhi_ps(wt21, wt22);
__m512 tmp215 = _mm512_unpacklo_ps(wt23, wt24);
__m512 tmp216 = _mm512_unpackhi_ps(wt23, wt24);
__m512 tmp217 = _mm512_unpacklo_ps(wt25, wt26);
__m512 tmp218 = _mm512_unpackhi_ps(wt25, wt26);
__m512 tmp219 = _mm512_unpacklo_ps(wt27, wt28);
__m512 tmp220 = _mm512_unpackhi_ps(wt27, wt28);
__m512 tmp221 = _mm512_unpacklo_ps(wt29, wt29);
__m512 tmp222 = _mm512_unpackhi_ps(wt29, wt29);
__m512 tmp223 = _mm512_shuffle_ps(tmp209, tmp211, 68);
__m512 tmp224 = _mm512_shuffle_ps(tmp209, tmp211, 238);
__m512 tmp225 = _mm512_shuffle_ps(tmp210, tmp212, 68);
__m512 tmp226 = _mm512_shuffle_ps(tmp213, tmp215, 68);
__m512 tmp227 = _mm512_shuffle_ps(tmp213, tmp215, 238);
__m512 tmp228 = _mm512_shuffle_ps(tmp214, tmp216, 68);
__m512 tmp229 = _mm512_shuffle_ps(tmp217, tmp219, 68);
__m512 tmp230 = _mm512_shuffle_ps(tmp217, tmp219, 238);
__m512 tmp231 = _mm512_shuffle_ps(tmp218, tmp220, 68);
__m512 tmp232 = _mm512_shuffle_ps(tmp221, tmp221, 238);
__m512 tmp233 = _mm512_shuffle_f32x4(tmp223, tmp226, 136);
__m512 tmp234 = _mm512_shuffle_f32x4(tmp224, tmp227, 136);
__m512 tmp235 = _mm512_shuffle_f32x4(tmp225, tmp228, 136);
__m512 tmp236 = _mm512_shuffle_f32x4(tmp229, tmp221, 136);
__m512 tmp237 = _mm512_shuffle_f32x4(tmp230, tmp232, 136);
__m512 tmp238 = _mm512_shuffle_f32x4(tmp231, tmp222, 136);
wt17 = _mm512_shuffle_f32x4(tmp233, tmp236, 136);
wt18 = _mm512_shuffle_f32x4(tmp234, tmp237, 136);
wt19 = _mm512_shuffle_f32x4(tmp235, tmp238, 136);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(1+16*c1)+(ptrdiff_t)0, 63>>cut1, wt17);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(2+16*c1)+(ptrdiff_t)0, 63>>cut1, wt18);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(3+16*c1)+(ptrdiff_t)0, 63>>cut1, wt19);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(1+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt17);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(2+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt18);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+24*(3+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt19);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(1+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt17);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(2+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt18);
_mm512_mask_storeu_ps(arranged1+8283088*i5+20064*l1+4*cut1+20*(3+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt19);
}
}
}
return;
}
char*restrict wtPtr2 = tensors2[0]+(ptrdiff_t)3340*4+(ptrdiff_t)40503904*1*g2;
char*restrict arranged2 = tensors2[2]+(ptrdiff_t)66264704*4+(ptrdiff_t)7421092*1*g2;
ptrdiff_t ii2 = 1;
for (ptrdiff_t i6 = 0; i6 < ii2; ++i6) {
ptrdiff_t j2 = 1*b2;
ptrdiff_t jj2 = j2+1;
for (; j2 < jj2; ++j2) {
if (j2 < 154) {
ptrdiff_t k4 = 0+16*(j2-0);
ptrdiff_t l4 = (size_t)(0+k4)/6;
ptrdiff_t cut4 = (size_t)(0+k4)%6;
switch (cut4) {
case 0:;
case 2: {
__m512 sum6 = _mm512_setzero_ps();
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*0+(ptrdiff_t)0, 63>>cut4, sum6);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*0+(ptrdiff_t)17952, 4032>>cut4, sum6);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*0+(ptrdiff_t)35904, 65535-(4095>>cut4), sum6);
ptrdiff_t c5 = 0;
for (; c5 != 46; ++c5) {
__m512 wt123 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)0);
__m512 wt124 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)16352);
__m512 wt125 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)32704);
__m512 wt126 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)49056);
__m512 wt127 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)65408);
__m512 wt128 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)81760);
__m512 wt129 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)98112);
__m512 wt130 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)114464);
__m512 wt131 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)130816);
__m512 wt132 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)147168);
__m512 wt133 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)163520);
__m512 wt134 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)179872);
__m512 wt135 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)196224);
__m512 wt136 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)212576);
__m512 wt137 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)228928);
__m512 wt138 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)245280);
__m512 tmp239 = _mm512_unpacklo_ps(wt123, wt124);
__m512 tmp240 = _mm512_unpackhi_ps(wt123, wt124);
__m512 tmp241 = _mm512_unpacklo_ps(wt125, wt126);
__m512 tmp242 = _mm512_unpackhi_ps(wt125, wt126);
__m512 tmp243 = _mm512_unpacklo_ps(wt127, wt128);
__m512 tmp244 = _mm512_unpackhi_ps(wt127, wt128);
__m512 tmp245 = _mm512_unpacklo_ps(wt129, wt130);
__m512 tmp246 = _mm512_unpackhi_ps(wt129, wt130);
__m512 tmp247 = _mm512_unpacklo_ps(wt131, wt132);
__m512 tmp248 = _mm512_unpackhi_ps(wt131, wt132);
__m512 tmp249 = _mm512_unpacklo_ps(wt133, wt134);
__m512 tmp250 = _mm512_unpackhi_ps(wt133, wt134);
__m512 tmp251 = _mm512_unpacklo_ps(wt135, wt136);
__m512 tmp252 = _mm512_unpackhi_ps(wt135, wt136);
__m512 tmp253 = _mm512_unpacklo_ps(wt137, wt138);
__m512 tmp254 = _mm512_unpackhi_ps(wt137, wt138);
__m512 tmp255 = _mm512_shuffle_ps(tmp239, tmp241, 68);
__m512 tmp256 = _mm512_shuffle_ps(tmp239, tmp241, 238);
__m512 tmp257 = _mm512_shuffle_ps(tmp240, tmp242, 68);
__m512 tmp258 = _mm512_shuffle_ps(tmp240, tmp242, 238);
__m512 tmp259 = _mm512_shuffle_ps(tmp243, tmp245, 68);
__m512 tmp260 = _mm512_shuffle_ps(tmp243, tmp245, 238);
__m512 tmp261 = _mm512_shuffle_ps(tmp244, tmp246, 68);
__m512 tmp262 = _mm512_shuffle_ps(tmp244, tmp246, 238);
__m512 tmp263 = _mm512_shuffle_ps(tmp247, tmp249, 68);
__m512 tmp264 = _mm512_shuffle_ps(tmp247, tmp249, 238);
__m512 tmp265 = _mm512_shuffle_ps(tmp248, tmp250, 68);
__m512 tmp266 = _mm512_shuffle_ps(tmp248, tmp250, 238);
__m512 tmp267 = _mm512_shuffle_ps(tmp251, tmp253, 68);
__m512 tmp268 = _mm512_shuffle_ps(tmp251, tmp253, 238);
__m512 tmp269 = _mm512_shuffle_ps(tmp252, tmp254, 68);
__m512 tmp270 = _mm512_shuffle_ps(tmp252, tmp254, 238);
__m512 tmp271 = _mm512_shuffle_f32x4(tmp255, tmp259, 136);
__m512 tmp272 = _mm512_shuffle_f32x4(tmp255, tmp259, 221);
__m512 tmp273 = _mm512_shuffle_f32x4(tmp256, tmp260, 136);
__m512 tmp274 = _mm512_shuffle_f32x4(tmp256, tmp260, 221);
__m512 tmp275 = _mm512_shuffle_f32x4(tmp257, tmp261, 136);
__m512 tmp276 = _mm512_shuffle_f32x4(tmp257, tmp261, 221);
__m512 tmp277 = _mm512_shuffle_f32x4(tmp258, tmp262, 136);
__m512 tmp278 = _mm512_shuffle_f32x4(tmp258, tmp262, 221);
__m512 tmp279 = _mm512_shuffle_f32x4(tmp263, tmp267, 136);
__m512 tmp280 = _mm512_shuffle_f32x4(tmp263, tmp267, 221);
__m512 tmp281 = _mm512_shuffle_f32x4(tmp264, tmp268, 136);
__m512 tmp282 = _mm512_shuffle_f32x4(tmp264, tmp268, 221);
__m512 tmp283 = _mm512_shuffle_f32x4(tmp265, tmp269, 136);
__m512 tmp284 = _mm512_shuffle_f32x4(tmp265, tmp269, 221);
__m512 tmp285 = _mm512_shuffle_f32x4(tmp266, tmp270, 136);
__m512 tmp286 = _mm512_shuffle_f32x4(tmp266, tmp270, 221);
wt123 = _mm512_shuffle_f32x4(tmp271, tmp279, 136);
wt131 = _mm512_shuffle_f32x4(tmp271, tmp279, 221);
wt124 = _mm512_shuffle_f32x4(tmp273, tmp281, 136);
wt132 = _mm512_shuffle_f32x4(tmp273, tmp281, 221);
wt125 = _mm512_shuffle_f32x4(tmp275, tmp283, 136);
wt133 = _mm512_shuffle_f32x4(tmp275, tmp283, 221);
wt126 = _mm512_shuffle_f32x4(tmp277, tmp285, 136);
wt134 = _mm512_shuffle_f32x4(tmp277, tmp285, 221);
wt127 = _mm512_shuffle_f32x4(tmp272, tmp280, 136);
wt135 = _mm512_shuffle_f32x4(tmp272, tmp280, 221);
wt128 = _mm512_shuffle_f32x4(tmp274, tmp282, 136);
wt136 = _mm512_shuffle_f32x4(tmp274, tmp282, 221);
wt129 = _mm512_shuffle_f32x4(tmp276, tmp284, 136);
wt137 = _mm512_shuffle_f32x4(tmp276, tmp284, 221);
wt130 = _mm512_shuffle_f32x4(tmp278, tmp286, 136);
wt138 = _mm512_shuffle_f32x4(tmp278, tmp286, 221);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(1+16*c5)+(ptrdiff_t)0, 63>>cut4, wt123);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(2+16*c5)+(ptrdiff_t)0, 63>>cut4, wt124);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(3+16*c5)+(ptrdiff_t)0, 63>>cut4, wt125);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(4+16*c5)+(ptrdiff_t)0, 63>>cut4, wt126);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(5+16*c5)+(ptrdiff_t)0, 63>>cut4, wt127);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(6+16*c5)+(ptrdiff_t)0, 63>>cut4, wt128);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(7+16*c5)+(ptrdiff_t)0, 63>>cut4, wt129);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(8+16*c5)+(ptrdiff_t)0, 63>>cut4, wt130);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(9+16*c5)+(ptrdiff_t)0, 63>>cut4, wt131);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(10+16*c5)+(ptrdiff_t)0, 63>>cut4, wt132);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(11+16*c5)+(ptrdiff_t)0, 63>>cut4, wt133);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(12+16*c5)+(ptrdiff_t)0, 63>>cut4, wt134);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(13+16*c5)+(ptrdiff_t)0, 63>>cut4, wt135);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(14+16*c5)+(ptrdiff_t)0, 63>>cut4, wt136);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(15+16*c5)+(ptrdiff_t)0, 63>>cut4, wt137);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(16+16*c5)+(ptrdiff_t)0, 63>>cut4, wt138);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(1+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt123);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(2+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt124);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(3+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt125);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(4+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt126);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(5+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt127);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(6+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt128);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(7+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt129);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(8+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt130);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(9+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt131);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(10+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt132);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(11+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt133);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(12+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt134);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(13+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt135);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(14+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt136);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(15+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt137);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(16+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt138);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(1+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt123);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(2+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt124);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(3+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt125);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(4+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt126);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(5+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt127);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(6+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt128);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(7+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt129);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(8+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt130);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(9+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt131);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(10+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt132);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(11+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt133);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(12+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt134);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(13+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt135);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(14+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt136);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(15+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt137);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(16+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt138);
}
__m512 wt139 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)0);
__m512 wt140 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)16352);
__m512 wt141 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)32704);
__m512 wt142 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)49056);
__m512 wt143 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)65408);
__m512 wt144 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)81760);
__m512 wt145 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)98112);
__m512 wt146 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)114464);
__m512 wt147 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)130816);
__m512 wt148 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)147168);
__m512 wt149 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)163520);
__m512 wt150 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)179872);
__m512 wt151 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)196224);
__m512 wt152 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)212576);
__m512 wt153 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)228928);
__m512 wt154 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c5+(ptrdiff_t)245280);
__m512 tmp287 = _mm512_unpacklo_ps(wt139, wt140);
__m512 tmp288 = _mm512_unpackhi_ps(wt139, wt140);
__m512 tmp289 = _mm512_unpacklo_ps(wt141, wt142);
__m512 tmp290 = _mm512_unpackhi_ps(wt141, wt142);
__m512 tmp291 = _mm512_unpacklo_ps(wt143, wt144);
__m512 tmp292 = _mm512_unpackhi_ps(wt143, wt144);
__m512 tmp293 = _mm512_unpacklo_ps(wt145, wt146);
__m512 tmp294 = _mm512_unpackhi_ps(wt145, wt146);
__m512 tmp295 = _mm512_unpacklo_ps(wt147, wt148);
__m512 tmp296 = _mm512_unpackhi_ps(wt147, wt148);
__m512 tmp297 = _mm512_unpacklo_ps(wt149, wt150);
__m512 tmp298 = _mm512_unpackhi_ps(wt149, wt150);
__m512 tmp299 = _mm512_unpacklo_ps(wt151, wt152);
__m512 tmp300 = _mm512_unpackhi_ps(wt151, wt152);
__m512 tmp301 = _mm512_unpacklo_ps(wt153, wt154);
__m512 tmp302 = _mm512_unpackhi_ps(wt153, wt154);
__m512 tmp303 = _mm512_shuffle_ps(tmp287, tmp289, 68);
__m512 tmp304 = _mm512_shuffle_ps(tmp287, tmp289, 238);
__m512 tmp305 = _mm512_shuffle_ps(tmp288, tmp290, 68);
__m512 tmp306 = _mm512_shuffle_ps(tmp288, tmp290, 238);
__m512 tmp307 = _mm512_shuffle_ps(tmp291, tmp293, 68);
__m512 tmp308 = _mm512_shuffle_ps(tmp291, tmp293, 238);
__m512 tmp309 = _mm512_shuffle_ps(tmp292, tmp294, 68);
__m512 tmp310 = _mm512_shuffle_ps(tmp292, tmp294, 238);
__m512 tmp311 = _mm512_shuffle_ps(tmp295, tmp297, 68);
__m512 tmp312 = _mm512_shuffle_ps(tmp295, tmp297, 238);
__m512 tmp313 = _mm512_shuffle_ps(tmp296, tmp298, 68);
__m512 tmp314 = _mm512_shuffle_ps(tmp296, tmp298, 238);
__m512 tmp315 = _mm512_shuffle_ps(tmp299, tmp301, 68);
__m512 tmp316 = _mm512_shuffle_ps(tmp299, tmp301, 238);
__m512 tmp317 = _mm512_shuffle_ps(tmp300, tmp302, 68);
__m512 tmp318 = _mm512_shuffle_ps(tmp300, tmp302, 238);
__m512 tmp319 = _mm512_shuffle_f32x4(tmp303, tmp307, 136);
__m512 tmp320 = _mm512_shuffle_f32x4(tmp303, tmp307, 221);
__m512 tmp321 = _mm512_shuffle_f32x4(tmp304, tmp308, 136);
__m512 tmp322 = _mm512_shuffle_f32x4(tmp304, tmp308, 221);
__m512 tmp323 = _mm512_shuffle_f32x4(tmp305, tmp309, 136);
__m512 tmp324 = _mm512_shuffle_f32x4(tmp305, tmp309, 221);
__m512 tmp325 = _mm512_shuffle_f32x4(tmp306, tmp310, 136);
__m512 tmp326 = _mm512_shuffle_f32x4(tmp306, tmp310, 221);
__m512 tmp327 = _mm512_shuffle_f32x4(tmp311, tmp315, 136);
__m512 tmp328 = _mm512_shuffle_f32x4(tmp311, tmp315, 221);
__m512 tmp329 = _mm512_shuffle_f32x4(tmp312, tmp316, 136);
__m512 tmp330 = _mm512_shuffle_f32x4(tmp312, tmp316, 221);
__m512 tmp331 = _mm512_shuffle_f32x4(tmp313, tmp317, 136);
__m512 tmp332 = _mm512_shuffle_f32x4(tmp313, tmp317, 221);
__m512 tmp333 = _mm512_shuffle_f32x4(tmp314, tmp318, 136);
__m512 tmp334 = _mm512_shuffle_f32x4(tmp314, tmp318, 221);
wt139 = _mm512_shuffle_f32x4(tmp319, tmp327, 136);
wt147 = _mm512_shuffle_f32x4(tmp319, tmp327, 221);
wt140 = _mm512_shuffle_f32x4(tmp321, tmp329, 136);
wt148 = _mm512_shuffle_f32x4(tmp321, tmp329, 221);
wt141 = _mm512_shuffle_f32x4(tmp323, tmp331, 136);
wt149 = _mm512_shuffle_f32x4(tmp323, tmp331, 221);
wt142 = _mm512_shuffle_f32x4(tmp325, tmp333, 136);
wt150 = _mm512_shuffle_f32x4(tmp325, tmp333, 221);
wt143 = _mm512_shuffle_f32x4(tmp320, tmp328, 136);
wt144 = _mm512_shuffle_f32x4(tmp322, tmp330, 136);
wt145 = _mm512_shuffle_f32x4(tmp324, tmp332, 136);
wt146 = _mm512_shuffle_f32x4(tmp326, tmp334, 136);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(1+16*c5)+(ptrdiff_t)0, 63>>cut4, wt139);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(2+16*c5)+(ptrdiff_t)0, 63>>cut4, wt140);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(3+16*c5)+(ptrdiff_t)0, 63>>cut4, wt141);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(4+16*c5)+(ptrdiff_t)0, 63>>cut4, wt142);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(5+16*c5)+(ptrdiff_t)0, 63>>cut4, wt143);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(6+16*c5)+(ptrdiff_t)0, 63>>cut4, wt144);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(7+16*c5)+(ptrdiff_t)0, 63>>cut4, wt145);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(8+16*c5)+(ptrdiff_t)0, 63>>cut4, wt146);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(9+16*c5)+(ptrdiff_t)0, 63>>cut4, wt147);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(10+16*c5)+(ptrdiff_t)0, 63>>cut4, wt148);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(11+16*c5)+(ptrdiff_t)0, 63>>cut4, wt149);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(12+16*c5)+(ptrdiff_t)0, 63>>cut4, wt150);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(1+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt139);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(2+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt140);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(3+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt141);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(4+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt142);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(5+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt143);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(6+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt144);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(7+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt145);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(8+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt146);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(9+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt147);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(10+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt148);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(11+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt149);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(12+16*c5)+(ptrdiff_t)17952, 4032>>cut4, wt150);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(1+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt139);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(2+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt140);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(3+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt141);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(4+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt142);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(5+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt143);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(6+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt144);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(7+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt145);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(8+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt146);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(9+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt147);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(10+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt148);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(11+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt149);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(12+16*c5)+(ptrdiff_t)35904, 65535-(4095>>cut4), wt150);
break;
}
default: {
cut4 = 4;
__m512 sum7 = _mm512_setzero_ps();
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*0+(ptrdiff_t)0, 63>>cut4, sum7);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*0+(ptrdiff_t)17952, 4032>>cut4, sum7);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*0+(ptrdiff_t)35904, 258048>>cut4, sum7);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*0+(ptrdiff_t)53856, 65535-(262143>>cut4), sum7);
ptrdiff_t c6 = 0;
for (; c6 != 46; ++c6) {
__m512 wt155 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)0);
__m512 wt156 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)16352);
__m512 wt157 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)32704);
__m512 wt158 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)49056);
__m512 wt159 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)65408);
__m512 wt160 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)81760);
__m512 wt161 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)98112);
__m512 wt162 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)114464);
__m512 wt163 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)130816);
__m512 wt164 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)147168);
__m512 wt165 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)163520);
__m512 wt166 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)179872);
__m512 wt167 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)196224);
__m512 wt168 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)212576);
__m512 wt169 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)228928);
__m512 wt170 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)245280);
__m512 tmp335 = _mm512_unpacklo_ps(wt155, wt156);
__m512 tmp336 = _mm512_unpackhi_ps(wt155, wt156);
__m512 tmp337 = _mm512_unpacklo_ps(wt157, wt158);
__m512 tmp338 = _mm512_unpackhi_ps(wt157, wt158);
__m512 tmp339 = _mm512_unpacklo_ps(wt159, wt160);
__m512 tmp340 = _mm512_unpackhi_ps(wt159, wt160);
__m512 tmp341 = _mm512_unpacklo_ps(wt161, wt162);
__m512 tmp342 = _mm512_unpackhi_ps(wt161, wt162);
__m512 tmp343 = _mm512_unpacklo_ps(wt163, wt164);
__m512 tmp344 = _mm512_unpackhi_ps(wt163, wt164);
__m512 tmp345 = _mm512_unpacklo_ps(wt165, wt166);
__m512 tmp346 = _mm512_unpackhi_ps(wt165, wt166);
__m512 tmp347 = _mm512_unpacklo_ps(wt167, wt168);
__m512 tmp348 = _mm512_unpackhi_ps(wt167, wt168);
__m512 tmp349 = _mm512_unpacklo_ps(wt169, wt170);
__m512 tmp350 = _mm512_unpackhi_ps(wt169, wt170);
__m512 tmp351 = _mm512_shuffle_ps(tmp335, tmp337, 68);
__m512 tmp352 = _mm512_shuffle_ps(tmp335, tmp337, 238);
__m512 tmp353 = _mm512_shuffle_ps(tmp336, tmp338, 68);
__m512 tmp354 = _mm512_shuffle_ps(tmp336, tmp338, 238);
__m512 tmp355 = _mm512_shuffle_ps(tmp339, tmp341, 68);
__m512 tmp356 = _mm512_shuffle_ps(tmp339, tmp341, 238);
__m512 tmp357 = _mm512_shuffle_ps(tmp340, tmp342, 68);
__m512 tmp358 = _mm512_shuffle_ps(tmp340, tmp342, 238);
__m512 tmp359 = _mm512_shuffle_ps(tmp343, tmp345, 68);
__m512 tmp360 = _mm512_shuffle_ps(tmp343, tmp345, 238);
__m512 tmp361 = _mm512_shuffle_ps(tmp344, tmp346, 68);
__m512 tmp362 = _mm512_shuffle_ps(tmp344, tmp346, 238);
__m512 tmp363 = _mm512_shuffle_ps(tmp347, tmp349, 68);
__m512 tmp364 = _mm512_shuffle_ps(tmp347, tmp349, 238);
__m512 tmp365 = _mm512_shuffle_ps(tmp348, tmp350, 68);
__m512 tmp366 = _mm512_shuffle_ps(tmp348, tmp350, 238);
__m512 tmp367 = _mm512_shuffle_f32x4(tmp351, tmp355, 136);
__m512 tmp368 = _mm512_shuffle_f32x4(tmp351, tmp355, 221);
__m512 tmp369 = _mm512_shuffle_f32x4(tmp352, tmp356, 136);
__m512 tmp370 = _mm512_shuffle_f32x4(tmp352, tmp356, 221);
__m512 tmp371 = _mm512_shuffle_f32x4(tmp353, tmp357, 136);
__m512 tmp372 = _mm512_shuffle_f32x4(tmp353, tmp357, 221);
__m512 tmp373 = _mm512_shuffle_f32x4(tmp354, tmp358, 136);
__m512 tmp374 = _mm512_shuffle_f32x4(tmp354, tmp358, 221);
__m512 tmp375 = _mm512_shuffle_f32x4(tmp359, tmp363, 136);
__m512 tmp376 = _mm512_shuffle_f32x4(tmp359, tmp363, 221);
__m512 tmp377 = _mm512_shuffle_f32x4(tmp360, tmp364, 136);
__m512 tmp378 = _mm512_shuffle_f32x4(tmp360, tmp364, 221);
__m512 tmp379 = _mm512_shuffle_f32x4(tmp361, tmp365, 136);
__m512 tmp380 = _mm512_shuffle_f32x4(tmp361, tmp365, 221);
__m512 tmp381 = _mm512_shuffle_f32x4(tmp362, tmp366, 136);
__m512 tmp382 = _mm512_shuffle_f32x4(tmp362, tmp366, 221);
wt155 = _mm512_shuffle_f32x4(tmp367, tmp375, 136);
wt163 = _mm512_shuffle_f32x4(tmp367, tmp375, 221);
wt156 = _mm512_shuffle_f32x4(tmp369, tmp377, 136);
wt164 = _mm512_shuffle_f32x4(tmp369, tmp377, 221);
wt157 = _mm512_shuffle_f32x4(tmp371, tmp379, 136);
wt165 = _mm512_shuffle_f32x4(tmp371, tmp379, 221);
wt158 = _mm512_shuffle_f32x4(tmp373, tmp381, 136);
wt166 = _mm512_shuffle_f32x4(tmp373, tmp381, 221);
wt159 = _mm512_shuffle_f32x4(tmp368, tmp376, 136);
wt167 = _mm512_shuffle_f32x4(tmp368, tmp376, 221);
wt160 = _mm512_shuffle_f32x4(tmp370, tmp378, 136);
wt168 = _mm512_shuffle_f32x4(tmp370, tmp378, 221);
wt161 = _mm512_shuffle_f32x4(tmp372, tmp380, 136);
wt169 = _mm512_shuffle_f32x4(tmp372, tmp380, 221);
wt162 = _mm512_shuffle_f32x4(tmp374, tmp382, 136);
wt170 = _mm512_shuffle_f32x4(tmp374, tmp382, 221);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(1+16*c6)+(ptrdiff_t)0, 63>>cut4, wt155);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(2+16*c6)+(ptrdiff_t)0, 63>>cut4, wt156);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(3+16*c6)+(ptrdiff_t)0, 63>>cut4, wt157);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(4+16*c6)+(ptrdiff_t)0, 63>>cut4, wt158);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(5+16*c6)+(ptrdiff_t)0, 63>>cut4, wt159);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(6+16*c6)+(ptrdiff_t)0, 63>>cut4, wt160);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(7+16*c6)+(ptrdiff_t)0, 63>>cut4, wt161);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(8+16*c6)+(ptrdiff_t)0, 63>>cut4, wt162);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(9+16*c6)+(ptrdiff_t)0, 63>>cut4, wt163);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(10+16*c6)+(ptrdiff_t)0, 63>>cut4, wt164);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(11+16*c6)+(ptrdiff_t)0, 63>>cut4, wt165);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(12+16*c6)+(ptrdiff_t)0, 63>>cut4, wt166);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(13+16*c6)+(ptrdiff_t)0, 63>>cut4, wt167);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(14+16*c6)+(ptrdiff_t)0, 63>>cut4, wt168);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(15+16*c6)+(ptrdiff_t)0, 63>>cut4, wt169);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(16+16*c6)+(ptrdiff_t)0, 63>>cut4, wt170);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(1+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt155);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(2+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt156);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(3+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt157);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(4+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt158);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(5+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt159);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(6+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt160);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(7+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt161);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(8+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt162);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(9+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt163);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(10+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt164);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(11+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt165);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(12+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt166);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(13+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt167);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(14+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt168);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(15+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt169);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(16+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt170);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(1+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt155);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(2+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt156);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(3+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt157);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(4+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt158);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(5+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt159);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(6+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt160);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(7+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt161);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(8+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt162);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(9+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt163);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(10+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt164);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(11+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt165);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(12+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt166);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(13+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt167);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(14+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt168);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(15+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt169);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(16+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt170);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(1+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt155);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(2+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt156);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(3+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt157);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(4+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt158);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(5+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt159);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(6+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt160);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(7+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt161);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(8+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt162);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(9+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt163);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(10+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt164);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(11+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt165);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(12+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt166);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(13+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt167);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(14+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt168);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(15+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt169);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(16+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt170);
}
__m512 wt171 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)0);
__m512 wt172 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)16352);
__m512 wt173 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)32704);
__m512 wt174 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)49056);
__m512 wt175 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)65408);
__m512 wt176 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)81760);
__m512 wt177 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)98112);
__m512 wt178 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)114464);
__m512 wt179 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)130816);
__m512 wt180 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)147168);
__m512 wt181 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)163520);
__m512 wt182 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)179872);
__m512 wt183 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)196224);
__m512 wt184 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)212576);
__m512 wt185 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)228928);
__m512 wt186 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k4+64*c6+(ptrdiff_t)245280);
__m512 tmp383 = _mm512_unpacklo_ps(wt171, wt172);
__m512 tmp384 = _mm512_unpackhi_ps(wt171, wt172);
__m512 tmp385 = _mm512_unpacklo_ps(wt173, wt174);
__m512 tmp386 = _mm512_unpackhi_ps(wt173, wt174);
__m512 tmp387 = _mm512_unpacklo_ps(wt175, wt176);
__m512 tmp388 = _mm512_unpackhi_ps(wt175, wt176);
__m512 tmp389 = _mm512_unpacklo_ps(wt177, wt178);
__m512 tmp390 = _mm512_unpackhi_ps(wt177, wt178);
__m512 tmp391 = _mm512_unpacklo_ps(wt179, wt180);
__m512 tmp392 = _mm512_unpackhi_ps(wt179, wt180);
__m512 tmp393 = _mm512_unpacklo_ps(wt181, wt182);
__m512 tmp394 = _mm512_unpackhi_ps(wt181, wt182);
__m512 tmp395 = _mm512_unpacklo_ps(wt183, wt184);
__m512 tmp396 = _mm512_unpackhi_ps(wt183, wt184);
__m512 tmp397 = _mm512_unpacklo_ps(wt185, wt186);
__m512 tmp398 = _mm512_unpackhi_ps(wt185, wt186);
__m512 tmp399 = _mm512_shuffle_ps(tmp383, tmp385, 68);
__m512 tmp400 = _mm512_shuffle_ps(tmp383, tmp385, 238);
__m512 tmp401 = _mm512_shuffle_ps(tmp384, tmp386, 68);
__m512 tmp402 = _mm512_shuffle_ps(tmp384, tmp386, 238);
__m512 tmp403 = _mm512_shuffle_ps(tmp387, tmp389, 68);
__m512 tmp404 = _mm512_shuffle_ps(tmp387, tmp389, 238);
__m512 tmp405 = _mm512_shuffle_ps(tmp388, tmp390, 68);
__m512 tmp406 = _mm512_shuffle_ps(tmp388, tmp390, 238);
__m512 tmp407 = _mm512_shuffle_ps(tmp391, tmp393, 68);
__m512 tmp408 = _mm512_shuffle_ps(tmp391, tmp393, 238);
__m512 tmp409 = _mm512_shuffle_ps(tmp392, tmp394, 68);
__m512 tmp410 = _mm512_shuffle_ps(tmp392, tmp394, 238);
__m512 tmp411 = _mm512_shuffle_ps(tmp395, tmp397, 68);
__m512 tmp412 = _mm512_shuffle_ps(tmp395, tmp397, 238);
__m512 tmp413 = _mm512_shuffle_ps(tmp396, tmp398, 68);
__m512 tmp414 = _mm512_shuffle_ps(tmp396, tmp398, 238);
__m512 tmp415 = _mm512_shuffle_f32x4(tmp399, tmp403, 136);
__m512 tmp416 = _mm512_shuffle_f32x4(tmp399, tmp403, 221);
__m512 tmp417 = _mm512_shuffle_f32x4(tmp400, tmp404, 136);
__m512 tmp418 = _mm512_shuffle_f32x4(tmp400, tmp404, 221);
__m512 tmp419 = _mm512_shuffle_f32x4(tmp401, tmp405, 136);
__m512 tmp420 = _mm512_shuffle_f32x4(tmp401, tmp405, 221);
__m512 tmp421 = _mm512_shuffle_f32x4(tmp402, tmp406, 136);
__m512 tmp422 = _mm512_shuffle_f32x4(tmp402, tmp406, 221);
__m512 tmp423 = _mm512_shuffle_f32x4(tmp407, tmp411, 136);
__m512 tmp424 = _mm512_shuffle_f32x4(tmp407, tmp411, 221);
__m512 tmp425 = _mm512_shuffle_f32x4(tmp408, tmp412, 136);
__m512 tmp426 = _mm512_shuffle_f32x4(tmp408, tmp412, 221);
__m512 tmp427 = _mm512_shuffle_f32x4(tmp409, tmp413, 136);
__m512 tmp428 = _mm512_shuffle_f32x4(tmp409, tmp413, 221);
__m512 tmp429 = _mm512_shuffle_f32x4(tmp410, tmp414, 136);
__m512 tmp430 = _mm512_shuffle_f32x4(tmp410, tmp414, 221);
wt171 = _mm512_shuffle_f32x4(tmp415, tmp423, 136);
wt179 = _mm512_shuffle_f32x4(tmp415, tmp423, 221);
wt172 = _mm512_shuffle_f32x4(tmp417, tmp425, 136);
wt180 = _mm512_shuffle_f32x4(tmp417, tmp425, 221);
wt173 = _mm512_shuffle_f32x4(tmp419, tmp427, 136);
wt181 = _mm512_shuffle_f32x4(tmp419, tmp427, 221);
wt174 = _mm512_shuffle_f32x4(tmp421, tmp429, 136);
wt182 = _mm512_shuffle_f32x4(tmp421, tmp429, 221);
wt175 = _mm512_shuffle_f32x4(tmp416, tmp424, 136);
wt176 = _mm512_shuffle_f32x4(tmp418, tmp426, 136);
wt177 = _mm512_shuffle_f32x4(tmp420, tmp428, 136);
wt178 = _mm512_shuffle_f32x4(tmp422, tmp430, 136);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(1+16*c6)+(ptrdiff_t)0, 63>>cut4, wt171);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(2+16*c6)+(ptrdiff_t)0, 63>>cut4, wt172);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(3+16*c6)+(ptrdiff_t)0, 63>>cut4, wt173);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(4+16*c6)+(ptrdiff_t)0, 63>>cut4, wt174);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(5+16*c6)+(ptrdiff_t)0, 63>>cut4, wt175);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(6+16*c6)+(ptrdiff_t)0, 63>>cut4, wt176);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(7+16*c6)+(ptrdiff_t)0, 63>>cut4, wt177);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(8+16*c6)+(ptrdiff_t)0, 63>>cut4, wt178);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(9+16*c6)+(ptrdiff_t)0, 63>>cut4, wt179);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(10+16*c6)+(ptrdiff_t)0, 63>>cut4, wt180);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(11+16*c6)+(ptrdiff_t)0, 63>>cut4, wt181);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(12+16*c6)+(ptrdiff_t)0, 63>>cut4, wt182);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(1+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt171);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(2+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt172);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(3+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt173);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(4+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt174);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(5+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt175);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(6+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt176);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(7+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt177);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(8+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt178);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(9+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt179);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(10+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt180);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(11+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt181);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(12+16*c6)+(ptrdiff_t)17952, 4032>>cut4, wt182);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(1+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt171);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(2+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt172);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(3+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt173);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(4+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt174);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(5+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt175);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(6+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt176);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(7+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt177);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(8+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt178);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(9+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt179);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(10+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt180);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(11+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt181);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(12+16*c6)+(ptrdiff_t)35904, 258048>>cut4, wt182);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(1+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt171);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(2+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt172);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(3+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt173);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(4+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt174);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(5+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt175);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(6+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt176);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(7+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt177);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(8+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt178);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(9+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt179);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(10+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt180);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(11+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt181);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l4+4*cut4+24*(12+16*c6)+(ptrdiff_t)53856, 65535-(262143>>cut4), wt182);
}
}
} else {
ptrdiff_t k3 = 2464;
ptrdiff_t l3 = (size_t)(0+k3)/6;
ptrdiff_t cut3 = (size_t)(0+k3)%6;
__m512 sum5 = _mm512_setzero_ps();
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*0+(ptrdiff_t)0, 63>>cut3, sum5);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*0+(ptrdiff_t)17952, 4032>>cut3, sum5);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*0+(ptrdiff_t)35904, 8191-(4095>>cut3), sum5);
ptrdiff_t c4 = 0;
for (; c4 != 46; ++c4) {
__m512 wt94 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)0);
__m512 wt95 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)16352);
__m512 wt96 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)32704);
__m512 wt97 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)49056);
__m512 wt98 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)65408);
__m512 wt99 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)81760);
__m512 wt100 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)98112);
__m512 wt101 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)114464);
__m512 wt102 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)130816);
__m512 wt103 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)147168);
__m512 wt104 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)163520);
__m512 wt105 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)179872);
__m512 wt106 = _mm512_maskz_loadu_ps(65535, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)196224);
__m512 tmp431 = _mm512_unpacklo_ps(wt94, wt95);
__m512 tmp432 = _mm512_unpackhi_ps(wt94, wt95);
__m512 tmp433 = _mm512_unpacklo_ps(wt96, wt97);
__m512 tmp434 = _mm512_unpackhi_ps(wt96, wt97);
__m512 tmp435 = _mm512_unpacklo_ps(wt98, wt99);
__m512 tmp436 = _mm512_unpackhi_ps(wt98, wt99);
__m512 tmp437 = _mm512_unpacklo_ps(wt100, wt101);
__m512 tmp438 = _mm512_unpackhi_ps(wt100, wt101);
__m512 tmp439 = _mm512_unpacklo_ps(wt102, wt103);
__m512 tmp440 = _mm512_unpackhi_ps(wt102, wt103);
__m512 tmp441 = _mm512_unpacklo_ps(wt104, wt105);
__m512 tmp442 = _mm512_unpackhi_ps(wt104, wt105);
__m512 tmp443 = _mm512_unpacklo_ps(wt106, wt106);
__m512 tmp444 = _mm512_unpackhi_ps(wt106, wt106);
__m512 tmp445 = _mm512_shuffle_ps(tmp431, tmp433, 68);
__m512 tmp446 = _mm512_shuffle_ps(tmp431, tmp433, 238);
__m512 tmp447 = _mm512_shuffle_ps(tmp432, tmp434, 68);
__m512 tmp448 = _mm512_shuffle_ps(tmp432, tmp434, 238);
__m512 tmp449 = _mm512_shuffle_ps(tmp435, tmp437, 68);
__m512 tmp450 = _mm512_shuffle_ps(tmp435, tmp437, 238);
__m512 tmp451 = _mm512_shuffle_ps(tmp436, tmp438, 68);
__m512 tmp452 = _mm512_shuffle_ps(tmp436, tmp438, 238);
__m512 tmp453 = _mm512_shuffle_ps(tmp439, tmp441, 68);
__m512 tmp454 = _mm512_shuffle_ps(tmp439, tmp441, 238);
__m512 tmp455 = _mm512_shuffle_ps(tmp440, tmp442, 68);
__m512 tmp456 = _mm512_shuffle_ps(tmp440, tmp442, 238);
__m512 tmp457 = _mm512_shuffle_ps(tmp443, tmp443, 238);
__m512 tmp458 = _mm512_shuffle_ps(tmp444, tmp444, 238);
__m512 tmp459 = _mm512_shuffle_f32x4(tmp445, tmp449, 136);
__m512 tmp460 = _mm512_shuffle_f32x4(tmp445, tmp449, 221);
__m512 tmp461 = _mm512_shuffle_f32x4(tmp446, tmp450, 136);
__m512 tmp462 = _mm512_shuffle_f32x4(tmp446, tmp450, 221);
__m512 tmp463 = _mm512_shuffle_f32x4(tmp447, tmp451, 136);
__m512 tmp464 = _mm512_shuffle_f32x4(tmp447, tmp451, 221);
__m512 tmp465 = _mm512_shuffle_f32x4(tmp448, tmp452, 136);
__m512 tmp466 = _mm512_shuffle_f32x4(tmp448, tmp452, 221);
__m512 tmp467 = _mm512_shuffle_f32x4(tmp453, tmp443, 136);
__m512 tmp468 = _mm512_shuffle_f32x4(tmp453, tmp443, 221);
__m512 tmp469 = _mm512_shuffle_f32x4(tmp454, tmp457, 136);
__m512 tmp470 = _mm512_shuffle_f32x4(tmp454, tmp457, 221);
__m512 tmp471 = _mm512_shuffle_f32x4(tmp455, tmp444, 136);
__m512 tmp472 = _mm512_shuffle_f32x4(tmp455, tmp444, 221);
__m512 tmp473 = _mm512_shuffle_f32x4(tmp456, tmp458, 136);
__m512 tmp474 = _mm512_shuffle_f32x4(tmp456, tmp458, 221);
wt94 = _mm512_shuffle_f32x4(tmp459, tmp467, 136);
wt102 = _mm512_shuffle_f32x4(tmp459, tmp467, 221);
wt95 = _mm512_shuffle_f32x4(tmp461, tmp469, 136);
wt103 = _mm512_shuffle_f32x4(tmp461, tmp469, 221);
wt96 = _mm512_shuffle_f32x4(tmp463, tmp471, 136);
wt104 = _mm512_shuffle_f32x4(tmp463, tmp471, 221);
wt97 = _mm512_shuffle_f32x4(tmp465, tmp473, 136);
wt105 = _mm512_shuffle_f32x4(tmp465, tmp473, 221);
wt98 = _mm512_shuffle_f32x4(tmp460, tmp468, 136);
wt106 = _mm512_shuffle_f32x4(tmp460, tmp468, 221);
wt99 = _mm512_shuffle_f32x4(tmp462, tmp470, 136);
__m512 wt107 = _mm512_shuffle_f32x4(tmp462, tmp470, 221);
wt100 = _mm512_shuffle_f32x4(tmp464, tmp472, 136);
__m512 wt108 = _mm512_shuffle_f32x4(tmp464, tmp472, 221);
wt101 = _mm512_shuffle_f32x4(tmp466, tmp474, 136);
__m512 wt109 = _mm512_shuffle_f32x4(tmp466, tmp474, 221);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(1+16*c4)+(ptrdiff_t)0, 63>>cut3, wt94);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(2+16*c4)+(ptrdiff_t)0, 63>>cut3, wt95);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(3+16*c4)+(ptrdiff_t)0, 63>>cut3, wt96);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(4+16*c4)+(ptrdiff_t)0, 63>>cut3, wt97);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(5+16*c4)+(ptrdiff_t)0, 63>>cut3, wt98);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(6+16*c4)+(ptrdiff_t)0, 63>>cut3, wt99);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(7+16*c4)+(ptrdiff_t)0, 63>>cut3, wt100);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(8+16*c4)+(ptrdiff_t)0, 63>>cut3, wt101);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(9+16*c4)+(ptrdiff_t)0, 63>>cut3, wt102);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(10+16*c4)+(ptrdiff_t)0, 63>>cut3, wt103);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(11+16*c4)+(ptrdiff_t)0, 63>>cut3, wt104);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(12+16*c4)+(ptrdiff_t)0, 63>>cut3, wt105);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(13+16*c4)+(ptrdiff_t)0, 63>>cut3, wt106);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(14+16*c4)+(ptrdiff_t)0, 63>>cut3, wt107);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(15+16*c4)+(ptrdiff_t)0, 63>>cut3, wt108);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(16+16*c4)+(ptrdiff_t)0, 63>>cut3, wt109);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(1+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt94);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(2+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt95);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(3+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt96);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(4+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt97);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(5+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt98);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(6+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt99);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(7+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt100);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(8+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt101);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(9+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt102);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(10+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt103);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(11+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt104);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(12+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt105);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(13+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt106);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(14+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt107);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(15+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt108);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(16+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt109);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(1+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt94);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(2+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt95);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(3+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt96);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(4+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt97);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(5+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt98);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(6+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt99);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(7+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt100);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(8+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt101);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(9+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt102);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(10+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt103);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(11+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt104);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(12+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt105);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(13+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt106);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(14+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt107);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(15+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt108);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(16+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt109);
}
__m512 wt110 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)0);
__m512 wt111 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)16352);
__m512 wt112 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)32704);
__m512 wt113 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)49056);
__m512 wt114 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)65408);
__m512 wt115 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)81760);
__m512 wt116 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)98112);
__m512 wt117 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)114464);
__m512 wt118 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)130816);
__m512 wt119 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)147168);
__m512 wt120 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)163520);
__m512 wt121 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)179872);
__m512 wt122 = _mm512_maskz_loadu_ps(4095, wtPtr2+40503904*i6+16352*k3+64*c4+(ptrdiff_t)196224);
__m512 tmp475 = _mm512_unpacklo_ps(wt110, wt111);
__m512 tmp476 = _mm512_unpackhi_ps(wt110, wt111);
__m512 tmp477 = _mm512_unpacklo_ps(wt112, wt113);
__m512 tmp478 = _mm512_unpackhi_ps(wt112, wt113);
__m512 tmp479 = _mm512_unpacklo_ps(wt114, wt115);
__m512 tmp480 = _mm512_unpackhi_ps(wt114, wt115);
__m512 tmp481 = _mm512_unpacklo_ps(wt116, wt117);
__m512 tmp482 = _mm512_unpackhi_ps(wt116, wt117);
__m512 tmp483 = _mm512_unpacklo_ps(wt118, wt119);
__m512 tmp484 = _mm512_unpackhi_ps(wt118, wt119);
__m512 tmp485 = _mm512_unpacklo_ps(wt120, wt121);
__m512 tmp486 = _mm512_unpackhi_ps(wt120, wt121);
__m512 tmp487 = _mm512_unpacklo_ps(wt122, wt122);
__m512 tmp488 = _mm512_unpackhi_ps(wt122, wt122);
__m512 tmp489 = _mm512_shuffle_ps(tmp475, tmp477, 68);
__m512 tmp490 = _mm512_shuffle_ps(tmp475, tmp477, 238);
__m512 tmp491 = _mm512_shuffle_ps(tmp476, tmp478, 68);
__m512 tmp492 = _mm512_shuffle_ps(tmp476, tmp478, 238);
__m512 tmp493 = _mm512_shuffle_ps(tmp479, tmp481, 68);
__m512 tmp494 = _mm512_shuffle_ps(tmp479, tmp481, 238);
__m512 tmp495 = _mm512_shuffle_ps(tmp480, tmp482, 68);
__m512 tmp496 = _mm512_shuffle_ps(tmp480, tmp482, 238);
__m512 tmp497 = _mm512_shuffle_ps(tmp483, tmp485, 68);
__m512 tmp498 = _mm512_shuffle_ps(tmp483, tmp485, 238);
__m512 tmp499 = _mm512_shuffle_ps(tmp484, tmp486, 68);
__m512 tmp500 = _mm512_shuffle_ps(tmp484, tmp486, 238);
__m512 tmp501 = _mm512_shuffle_ps(tmp487, tmp487, 238);
__m512 tmp502 = _mm512_shuffle_ps(tmp488, tmp488, 238);
__m512 tmp503 = _mm512_shuffle_f32x4(tmp489, tmp493, 136);
__m512 tmp504 = _mm512_shuffle_f32x4(tmp489, tmp493, 221);
__m512 tmp505 = _mm512_shuffle_f32x4(tmp490, tmp494, 136);
__m512 tmp506 = _mm512_shuffle_f32x4(tmp490, tmp494, 221);
__m512 tmp507 = _mm512_shuffle_f32x4(tmp491, tmp495, 136);
__m512 tmp508 = _mm512_shuffle_f32x4(tmp491, tmp495, 221);
__m512 tmp509 = _mm512_shuffle_f32x4(tmp492, tmp496, 136);
__m512 tmp510 = _mm512_shuffle_f32x4(tmp492, tmp496, 221);
__m512 tmp511 = _mm512_shuffle_f32x4(tmp497, tmp487, 136);
__m512 tmp512 = _mm512_shuffle_f32x4(tmp497, tmp487, 221);
__m512 tmp513 = _mm512_shuffle_f32x4(tmp498, tmp501, 136);
__m512 tmp514 = _mm512_shuffle_f32x4(tmp498, tmp501, 221);
__m512 tmp515 = _mm512_shuffle_f32x4(tmp499, tmp488, 136);
__m512 tmp516 = _mm512_shuffle_f32x4(tmp499, tmp488, 221);
__m512 tmp517 = _mm512_shuffle_f32x4(tmp500, tmp502, 136);
__m512 tmp518 = _mm512_shuffle_f32x4(tmp500, tmp502, 221);
wt110 = _mm512_shuffle_f32x4(tmp503, tmp511, 136);
wt118 = _mm512_shuffle_f32x4(tmp503, tmp511, 221);
wt111 = _mm512_shuffle_f32x4(tmp505, tmp513, 136);
wt119 = _mm512_shuffle_f32x4(tmp505, tmp513, 221);
wt112 = _mm512_shuffle_f32x4(tmp507, tmp515, 136);
wt120 = _mm512_shuffle_f32x4(tmp507, tmp515, 221);
wt113 = _mm512_shuffle_f32x4(tmp509, tmp517, 136);
wt121 = _mm512_shuffle_f32x4(tmp509, tmp517, 221);
wt114 = _mm512_shuffle_f32x4(tmp504, tmp512, 136);
wt115 = _mm512_shuffle_f32x4(tmp506, tmp514, 136);
wt116 = _mm512_shuffle_f32x4(tmp508, tmp516, 136);
wt117 = _mm512_shuffle_f32x4(tmp510, tmp518, 136);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(1+16*c4)+(ptrdiff_t)0, 63>>cut3, wt110);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(2+16*c4)+(ptrdiff_t)0, 63>>cut3, wt111);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(3+16*c4)+(ptrdiff_t)0, 63>>cut3, wt112);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(4+16*c4)+(ptrdiff_t)0, 63>>cut3, wt113);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(5+16*c4)+(ptrdiff_t)0, 63>>cut3, wt114);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(6+16*c4)+(ptrdiff_t)0, 63>>cut3, wt115);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(7+16*c4)+(ptrdiff_t)0, 63>>cut3, wt116);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(8+16*c4)+(ptrdiff_t)0, 63>>cut3, wt117);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(9+16*c4)+(ptrdiff_t)0, 63>>cut3, wt118);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(10+16*c4)+(ptrdiff_t)0, 63>>cut3, wt119);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(11+16*c4)+(ptrdiff_t)0, 63>>cut3, wt120);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(12+16*c4)+(ptrdiff_t)0, 63>>cut3, wt121);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(1+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt110);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(2+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt111);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(3+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt112);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(4+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt113);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(5+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt114);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(6+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt115);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(7+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt116);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(8+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt117);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(9+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt118);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(10+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt119);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(11+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt120);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+24*(12+16*c4)+(ptrdiff_t)17952, 4032>>cut3, wt121);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(1+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt110);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(2+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt111);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(3+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt112);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(4+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt113);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(5+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt114);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(6+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt115);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(7+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt116);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(8+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt117);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(9+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt118);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(10+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt119);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(11+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt120);
_mm512_mask_storeu_ps(arranged2+7421092*i6+17976*l3+4*cut3+20*(12+16*c4)+(ptrdiff_t)35904, 8191-(4095>>cut3), wt121);
}
}
}
}

// Builds a 3-dimensional task whose callee is Example15OneArrangeWts1Callee1
// and hands it to Example15ThreaderDo1 on the given team.
//
// team13:   team passed through to Example15ThreaderDo1.
// tensors1: stored in the task's any1 field for the callee to pick up.
static void Example15OneArrangeWts1(Example15ThreaderTeam1* team13, char** tensors1) {
    // Per-dimension hull values; presumably the extents the threader iterates
    // over -- confirm against Example15ThreaderDo1.
    static const int extents[3] = {155, 8, 5};
    Example15ThreaderTask1 job;
    job.nd1 = 3;
    for (int d = 0; d < 3; ++d) {
        job.hull1[d] = extents[d];
    }
    job.any1 = tensors1;
    job.callee1 = Example15OneArrangeWts1Callee1;
    Example15ThreaderDo1(team13, &job);
}

// Reads 31 floats starting at src (two 15-lane masked loads, 64 bytes apart)
// and writes the 16 even-indexed ones (src[0], src[2], ..., src[30]) to dst.
static inline void Example15OneArrangeDats1EvenPair(char* dst, const char* src) {
    const __m512i evens = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    const __m512 lo = _mm512_maskz_loadu_ps(32767, src);
    const __m512 hi = _mm512_maskz_loadu_ps(32767, src+64);
    _mm512_storeu_ps(dst, _mm512_permutex2var_ps(lo, evens, hi));
}

// Reads 7 floats starting at src (remaining lanes zeroed by the mask), applies
// the same even-index permutation within that single vector, and writes all 16
// resulting lanes to dst. Only src[0], src[2], src[4] and src[6] are real data.
static inline void Example15OneArrangeDats1EvenTail(char* dst, const char* src) {
    const __m512i evens = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    const __m512 row = _mm512_maskz_loadu_ps(127, src);
    _mm512_storeu_ps(dst, _mm512_permutexvar_ps(evens, row));
}

// Threader callee that copies every other float from tensors[0] into the
// packed layout of tensors[1] for one task point.
//
// task6: task whose any1 field holds the char** tensor table.
// pt8:   four indices:
//          pt8[0]  which run of consecutive k values to process,
//          pt8[1]  which output strip (0..180 regular, >= 181 the short tail),
//          pt8[2]  multiplier for the per-group byte offsets,
//          pt8[3]  multiplier for the per-block byte offsets; values >= 4 all
//                  select the final block, which has different sizes.
//
// The source strides 162624 and 672 equal 242*168*4 and 168*4, i.e. presumably
// one channel and one row of the 242x168 float input -- confirm against the
// graph configuration.
static void Example15OneArrangeDats1Callee1(Example15ThreaderTask1* task6, int64_t* pt8) {
    char** tensors4 = task6->any1;
    const ptrdiff_t part = pt8[0];
    const ptrdiff_t strip = pt8[1];
    const ptrdiff_t group = pt8[2];
    const ptrdiff_t block = pt8[3];

    // The final block (index 4) uses smaller destination strides and an uneven
    // split of k across the parts; every other block shares one set of sizes.
    const int finalBlock = !(block < 4);
    const ptrdiff_t blockIdx = finalBlock ? 4 : block;
    const ptrdiff_t dstGroupBytes = finalBlock ? 34755072 : 38797440;
    const ptrdiff_t dstStripBytes = finalBlock ? 191488 : 213760;
    const ptrdiff_t kBegin = (finalBlock ? 149 : 167)*part;
    const ptrdiff_t kEnd = kBegin+(finalBlock ? (part < 4 ? 149 : 152) : 167);

    const char*restrict src = tensors4[0]+(ptrdiff_t)135791040*blockIdx+(ptrdiff_t)664806912*group;
    char*restrict dst = tensors4[1]+(ptrdiff_t)310379520*blockIdx+dstGroupBytes*group;

    if (strip >= 181) {
        // Short tail strip: fixed source row 240, two vectors per k, and a
        // 128-byte (not 256-byte) destination step per k.
        const char* in = src+672*(ptrdiff_t)240;
        char* out = dst+dstStripBytes*181;
        for (ptrdiff_t k = kBegin; k < kEnd; ++k) {
            const char* from = in+162624*k;
            char* to = out+128*k;
            Example15OneArrangeDats1EvenPair(to, from+512);
            Example15OneArrangeDats1EvenTail(to+64, from+640);
        }
        return;
    }

    // Regular strips: every three consecutive strips share one base row, which
    // advances by four rows per triple. The strip's position within the triple
    // selects which byte offsets from that base row are gathered.
    const size_t ustrip = (size_t)strip;
    const ptrdiff_t baseRow = ustrip/3*4;
    const char* in = src+672*baseRow;
    char* out = dst+dstStripBytes*strip;

    switch (ustrip%3) {
    case 0:
        for (ptrdiff_t k = kBegin; k < kEnd; ++k) {
            const char* from = in+162624*k;
            char* to = out+256*k;
            Example15OneArrangeDats1EvenPair(to, from);
            Example15OneArrangeDats1EvenPair(to+64, from+128);
            Example15OneArrangeDats1EvenPair(to+128, from+256);
            Example15OneArrangeDats1EvenPair(to+192, from+384);
        }
        break;
    case 1:
        for (ptrdiff_t k = kBegin; k < kEnd; ++k) {
            const char* from = in+162624*k;
            char* to = out+256*k;
            Example15OneArrangeDats1EvenPair(to, from+512);
            Example15OneArrangeDats1EvenTail(to+64, from+640);
            // Offsets from 1344 (= 2*672) onward lie two rows below the base row.
            Example15OneArrangeDats1EvenPair(to+128, from+1344);
            Example15OneArrangeDats1EvenPair(to+192, from+1472);
        }
        break;
    default:
        for (ptrdiff_t k = kBegin; k < kEnd; ++k) {
            const char* from = in+162624*k;
            char* to = out+256*k;
            Example15OneArrangeDats1EvenPair(to, from+1600);
            Example15OneArrangeDats1EvenPair(to+64, from+1728);
            Example15OneArrangeDats1EvenPair(to+128, from+1856);
            Example15OneArrangeDats1EvenTail(to+192, from+1984);
        }
        break;
    }
}

// Builds a 4-dimensional task whose callee is Example15OneArrangeDats1Callee1
// and hands it to Example15ThreaderDo1 on the given team.
//
// team15:   team passed through to Example15ThreaderDo1.
// tensors3: stored in the task's any1 field for the callee to pick up.
static void Example15OneArrangeDats1(Example15ThreaderTeam1* team15, char** tensors3) {
    // Per-dimension hull values; presumably the extents the threader iterates
    // over -- confirm against Example15ThreaderDo1.
    static const int extents[4] = {5, 182, 8, 5};
    Example15ThreaderTask1 job;
    job.nd1 = 4;
    for (int d = 0; d < 4; ++d) {
        job.hull1[d] = extents[d];
    }
    job.any1 = tensors3;
    job.callee1 = Example15OneArrangeDats1Callee1;
    Example15ThreaderDo1(team15, &job);
}

static void Example15OneApply1Callee1(Example15ThreaderTask1* task8, int64_t* pt9) {
void** pair2 = task8->any1;
char** tensors6 = pair2[0];
ptrdiff_t e3 = 0;
ptrdiff_t g4 = pt9[2];
ptrdiff_t d1 = pt9[1];
ptrdiff_t w1 = pt9[0];
char*restrict arrangedWts1 = tensors6[0]+66264704*e3+(ptrdiff_t)8283088*1*g4;
char*restrict arrangedDats1 = tensors6[1]+310379520*e3+(ptrdiff_t)38797440*1*g4;
char*restrict datPtr3 = tensors6[2]+(ptrdiff_t)100704912*1*g4;
ptrdiff_t ii5 = 1;
for (ptrdiff_t i9 = 0; i9 < ii5; ++i9) {
ptrdiff_t j5 = 1*d1;
ptrdiff_t jj5 = j5+0;
if (j5 < 181) {
ptrdiff_t h3 = 0+((size_t)j5-0)/3*2;
switch (((size_t)j5-0)%3) {
case 0: {
wrap5:;
ptrdiff_t k13 = 1*w1;
ptrdiff_t kk9 = k13+0;
for (; k13 != 412; ++k13) {
ptrdiff_t s2 = -1;
__m512 sum8 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+24*s2+(ptrdiff_t)24));
__m512 sum12 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+24*s2+(ptrdiff_t)28));
__m512 sum16 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+24*s2+(ptrdiff_t)32));
__m512 sum20 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+24*s2+(ptrdiff_t)36));
__m512 sum24 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+24*s2+(ptrdiff_t)40));
__m512 sum28 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+24*s2+(ptrdiff_t)44));
__m512 sum9 = sum8;
__m512 sum10 = sum8;
__m512 sum11 = sum8;
__m512 sum13 = sum12;
__m512 sum14 = sum12;
__m512 sum15 = sum12;
__m512 sum17 = sum16;
__m512 sum18 = sum16;
__m512 sum19 = sum16;
__m512 sum21 = sum20;
__m512 sum22 = sum20;
__m512 sum23 = sum20;
__m512 sum25 = sum24;
__m512 sum26 = sum24;
__m512 sum27 = sum24;
__m512 sum29 = sum28;
__m512 sum30 = sum28;
__m512 sum31 = sum28;
for (s2 = 0; s2 < 835; ++s2) {
__m512 dat79 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s2+(ptrdiff_t)0);
__m512 dat80 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s2+(ptrdiff_t)64);
__m512 dat81 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s2+(ptrdiff_t)128);
__m512 dat82 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s2+(ptrdiff_t)192);
__m512 wt187 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+24*s2+(ptrdiff_t)24));
sum8 = _mm512_fmadd_ps(wt187, dat79, sum8);
sum9 = _mm512_fmadd_ps(wt187, dat80, sum9);
sum10 = _mm512_fmadd_ps(wt187, dat81, sum10);
sum11 = _mm512_fmadd_ps(wt187, dat82, sum11);
__m512 wt188 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+24*s2+(ptrdiff_t)28));
sum12 = _mm512_fmadd_ps(wt188, dat79, sum12);
sum13 = _mm512_fmadd_ps(wt188, dat80, sum13);
sum14 = _mm512_fmadd_ps(wt188, dat81, sum14);
sum15 = _mm512_fmadd_ps(wt188, dat82, sum15);
__m512 wt189 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+24*s2+(ptrdiff_t)32));
sum16 = _mm512_fmadd_ps(wt189, dat79, sum16);
sum17 = _mm512_fmadd_ps(wt189, dat80, sum17);
sum18 = _mm512_fmadd_ps(wt189, dat81, sum18);
sum19 = _mm512_fmadd_ps(wt189, dat82, sum19);
__m512 wt190 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+24*s2+(ptrdiff_t)36));
sum20 = _mm512_fmadd_ps(wt190, dat79, sum20);
sum21 = _mm512_fmadd_ps(wt190, dat80, sum21);
sum22 = _mm512_fmadd_ps(wt190, dat81, sum22);
sum23 = _mm512_fmadd_ps(wt190, dat82, sum23);
__m512 wt191 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+24*s2+(ptrdiff_t)40));
sum24 = _mm512_fmadd_ps(wt191, dat79, sum24);
sum25 = _mm512_fmadd_ps(wt191, dat80, sum25);
sum26 = _mm512_fmadd_ps(wt191, dat81, sum26);
sum27 = _mm512_fmadd_ps(wt191, dat82, sum27);
__m512 wt192 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+24*s2+(ptrdiff_t)44));
sum28 = _mm512_fmadd_ps(wt192, dat79, sum28);
sum29 = _mm512_fmadd_ps(wt192, dat80, sum29);
sum30 = _mm512_fmadd_ps(wt192, dat81, sum30);
sum31 = _mm512_fmadd_ps(wt192, dat82, sum31);
}
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)0, 65535, sum8);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)64, 65535, sum9);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)128, 65535, sum10);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)192, 65535, sum11);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)40656, 65535, sum12);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)40720, 65535, sum13);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)40784, 65535, sum14);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)40848, 65535, sum15);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)81312, 65535, sum16);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)81376, 65535, sum17);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)81440, 65535, sum18);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)81504, 65535, sum19);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)121968, 65535, sum20);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)122032, 65535, sum21);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)122096, 65535, sum22);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)122160, 65535, sum23);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)162624, 65535, sum24);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)162688, 65535, sum25);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)162752, 65535, sum26);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)162816, 65535, sum27);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)203280, 65535, sum28);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)203344, 65535, sum29);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)203408, 65535, sum30);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)203472, 65535, sum31);
if (k13 >= kk9) return;
}
ptrdiff_t s3 = -1;
__m512 sum32 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+20*s3+(ptrdiff_t)20));
__m512 sum36 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+20*s3+(ptrdiff_t)24));
__m512 sum40 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+20*s3+(ptrdiff_t)28));
__m512 sum44 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+20*s3+(ptrdiff_t)32));
__m512 sum48 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+20*s3+(ptrdiff_t)36));
__m512 sum33 = sum32;
__m512 sum34 = sum32;
__m512 sum35 = sum32;
__m512 sum37 = sum36;
__m512 sum38 = sum36;
__m512 sum39 = sum36;
__m512 sum41 = sum40;
__m512 sum42 = sum40;
__m512 sum43 = sum40;
__m512 sum45 = sum44;
__m512 sum46 = sum44;
__m512 sum47 = sum44;
__m512 sum49 = sum48;
__m512 sum50 = sum48;
__m512 sum51 = sum48;
for (s3 = 0; s3 < 835; ++s3) {
__m512 dat83 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s3+(ptrdiff_t)0);
__m512 dat84 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s3+(ptrdiff_t)64);
__m512 dat85 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s3+(ptrdiff_t)128);
__m512 dat86 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s3+(ptrdiff_t)192);
__m512 wt193 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+20*s3+(ptrdiff_t)20));
sum32 = _mm512_fmadd_ps(wt193, dat83, sum32);
sum33 = _mm512_fmadd_ps(wt193, dat84, sum33);
sum34 = _mm512_fmadd_ps(wt193, dat85, sum34);
sum35 = _mm512_fmadd_ps(wt193, dat86, sum35);
__m512 wt194 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+20*s3+(ptrdiff_t)24));
sum36 = _mm512_fmadd_ps(wt194, dat83, sum36);
sum37 = _mm512_fmadd_ps(wt194, dat84, sum37);
sum38 = _mm512_fmadd_ps(wt194, dat85, sum38);
sum39 = _mm512_fmadd_ps(wt194, dat86, sum39);
__m512 wt195 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+20*s3+(ptrdiff_t)28));
sum40 = _mm512_fmadd_ps(wt195, dat83, sum40);
sum41 = _mm512_fmadd_ps(wt195, dat84, sum41);
sum42 = _mm512_fmadd_ps(wt195, dat85, sum42);
sum43 = _mm512_fmadd_ps(wt195, dat86, sum43);
__m512 wt196 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+20*s3+(ptrdiff_t)32));
sum44 = _mm512_fmadd_ps(wt196, dat83, sum44);
sum45 = _mm512_fmadd_ps(wt196, dat84, sum45);
sum46 = _mm512_fmadd_ps(wt196, dat85, sum46);
sum47 = _mm512_fmadd_ps(wt196, dat86, sum47);
__m512 wt197 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k13+20*s3+(ptrdiff_t)36));
sum48 = _mm512_fmadd_ps(wt197, dat83, sum48);
sum49 = _mm512_fmadd_ps(wt197, dat84, sum49);
sum50 = _mm512_fmadd_ps(wt197, dat85, sum50);
sum51 = _mm512_fmadd_ps(wt197, dat86, sum51);
}
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)0, 65535, sum32);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)64, 65535, sum33);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)128, 65535, sum34);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)192, 65535, sum35);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)40656, 65535, sum36);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)40720, 65535, sum37);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)40784, 65535, sum38);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)40848, 65535, sum39);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)81312, 65535, sum40);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)81376, 65535, sum41);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)81440, 65535, sum42);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)81504, 65535, sum43);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)121968, 65535, sum44);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)122032, 65535, sum45);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)122096, 65535, sum46);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)122160, 65535, sum47);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)162624, 65535, sum48);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)162688, 65535, sum49);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)162752, 65535, sum50);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k13+(ptrdiff_t)162816, 65535, sum51);
if (j5 >= jj5) return;
if (j5 >= 180) break;
++j5;
}
case 1: {
ptrdiff_t k14 = 1*w1;
ptrdiff_t kk10 = k14+0;
for (; k14 != 412; ++k14) {
ptrdiff_t s4 = -1;
__m512 sum52 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+24*s4+(ptrdiff_t)24));
__m512 sum56 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+24*s4+(ptrdiff_t)28));
__m512 sum60 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+24*s4+(ptrdiff_t)32));
__m512 sum64 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+24*s4+(ptrdiff_t)36));
__m512 sum68 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+24*s4+(ptrdiff_t)40));
__m512 sum72 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+24*s4+(ptrdiff_t)44));
__m512 sum53 = sum52;
__m512 sum54 = sum52;
__m512 sum55 = sum52;
__m512 sum57 = sum56;
__m512 sum58 = sum56;
__m512 sum59 = sum56;
__m512 sum61 = sum60;
__m512 sum62 = sum60;
__m512 sum63 = sum60;
__m512 sum65 = sum64;
__m512 sum66 = sum64;
__m512 sum67 = sum64;
__m512 sum69 = sum68;
__m512 sum70 = sum68;
__m512 sum71 = sum68;
__m512 sum73 = sum72;
__m512 sum74 = sum72;
__m512 sum75 = sum72;
for (s4 = 0; s4 < 835; ++s4) {
__m512 dat87 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s4+(ptrdiff_t)0);
__m512 dat88 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s4+(ptrdiff_t)64);
__m512 dat89 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s4+(ptrdiff_t)128);
__m512 dat90 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s4+(ptrdiff_t)192);
__m512 wt198 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+24*s4+(ptrdiff_t)24));
sum52 = _mm512_fmadd_ps(wt198, dat87, sum52);
sum53 = _mm512_fmadd_ps(wt198, dat88, sum53);
sum54 = _mm512_fmadd_ps(wt198, dat89, sum54);
sum55 = _mm512_fmadd_ps(wt198, dat90, sum55);
__m512 wt199 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+24*s4+(ptrdiff_t)28));
sum56 = _mm512_fmadd_ps(wt199, dat87, sum56);
sum57 = _mm512_fmadd_ps(wt199, dat88, sum57);
sum58 = _mm512_fmadd_ps(wt199, dat89, sum58);
sum59 = _mm512_fmadd_ps(wt199, dat90, sum59);
__m512 wt200 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+24*s4+(ptrdiff_t)32));
sum60 = _mm512_fmadd_ps(wt200, dat87, sum60);
sum61 = _mm512_fmadd_ps(wt200, dat88, sum61);
sum62 = _mm512_fmadd_ps(wt200, dat89, sum62);
sum63 = _mm512_fmadd_ps(wt200, dat90, sum63);
__m512 wt201 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+24*s4+(ptrdiff_t)36));
sum64 = _mm512_fmadd_ps(wt201, dat87, sum64);
sum65 = _mm512_fmadd_ps(wt201, dat88, sum65);
sum66 = _mm512_fmadd_ps(wt201, dat89, sum66);
sum67 = _mm512_fmadd_ps(wt201, dat90, sum67);
__m512 wt202 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+24*s4+(ptrdiff_t)40));
sum68 = _mm512_fmadd_ps(wt202, dat87, sum68);
sum69 = _mm512_fmadd_ps(wt202, dat88, sum69);
sum70 = _mm512_fmadd_ps(wt202, dat89, sum70);
sum71 = _mm512_fmadd_ps(wt202, dat90, sum71);
__m512 wt203 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+24*s4+(ptrdiff_t)44));
sum72 = _mm512_fmadd_ps(wt203, dat87, sum72);
sum73 = _mm512_fmadd_ps(wt203, dat88, sum73);
sum74 = _mm512_fmadd_ps(wt203, dat89, sum74);
sum75 = _mm512_fmadd_ps(wt203, dat90, sum75);
}
__m512 dat91 = sum53;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)256, 65535, sum52);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)320, 15, dat91);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)336, 65535, sum54);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)400, 65535, sum55);
__m512 dat92 = sum57;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)40912, 65535, sum56);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)40976, 15, dat92);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)40992, 65535, sum58);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)41056, 65535, sum59);
__m512 dat93 = sum61;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)81568, 65535, sum60);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)81632, 15, dat93);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)81648, 65535, sum62);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)81712, 65535, sum63);
__m512 dat94 = sum65;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)122224, 65535, sum64);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)122288, 15, dat94);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)122304, 65535, sum66);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)122368, 65535, sum67);
__m512 dat95 = sum69;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)162880, 65535, sum68);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)162944, 15, dat95);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)162960, 65535, sum70);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)163024, 65535, sum71);
__m512 dat96 = sum73;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)203536, 65535, sum72);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)203600, 15, dat96);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)203616, 65535, sum74);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)203680, 65535, sum75);
if (k14 >= kk10) return;
}
ptrdiff_t s5 = -1;
__m512 sum76 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+20*s5+(ptrdiff_t)20));
__m512 sum80 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+20*s5+(ptrdiff_t)24));
__m512 sum84 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+20*s5+(ptrdiff_t)28));
__m512 sum88 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+20*s5+(ptrdiff_t)32));
__m512 sum92 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+20*s5+(ptrdiff_t)36));
__m512 sum77 = sum76;
__m512 sum78 = sum76;
__m512 sum79 = sum76;
__m512 sum81 = sum80;
__m512 sum82 = sum80;
__m512 sum83 = sum80;
__m512 sum85 = sum84;
__m512 sum86 = sum84;
__m512 sum87 = sum84;
__m512 sum89 = sum88;
__m512 sum90 = sum88;
__m512 sum91 = sum88;
__m512 sum93 = sum92;
__m512 sum94 = sum92;
__m512 sum95 = sum92;
for (s5 = 0; s5 < 835; ++s5) {
__m512 dat97 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s5+(ptrdiff_t)0);
__m512 dat98 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s5+(ptrdiff_t)64);
__m512 dat99 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s5+(ptrdiff_t)128);
__m512 dat100 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s5+(ptrdiff_t)192);
__m512 wt204 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+20*s5+(ptrdiff_t)20));
sum76 = _mm512_fmadd_ps(wt204, dat97, sum76);
sum77 = _mm512_fmadd_ps(wt204, dat98, sum77);
sum78 = _mm512_fmadd_ps(wt204, dat99, sum78);
sum79 = _mm512_fmadd_ps(wt204, dat100, sum79);
__m512 wt205 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+20*s5+(ptrdiff_t)24));
sum80 = _mm512_fmadd_ps(wt205, dat97, sum80);
sum81 = _mm512_fmadd_ps(wt205, dat98, sum81);
sum82 = _mm512_fmadd_ps(wt205, dat99, sum82);
sum83 = _mm512_fmadd_ps(wt205, dat100, sum83);
__m512 wt206 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+20*s5+(ptrdiff_t)28));
sum84 = _mm512_fmadd_ps(wt206, dat97, sum84);
sum85 = _mm512_fmadd_ps(wt206, dat98, sum85);
sum86 = _mm512_fmadd_ps(wt206, dat99, sum86);
sum87 = _mm512_fmadd_ps(wt206, dat100, sum87);
__m512 wt207 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+20*s5+(ptrdiff_t)32));
sum88 = _mm512_fmadd_ps(wt207, dat97, sum88);
sum89 = _mm512_fmadd_ps(wt207, dat98, sum89);
sum90 = _mm512_fmadd_ps(wt207, dat99, sum90);
sum91 = _mm512_fmadd_ps(wt207, dat100, sum91);
__m512 wt208 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k14+20*s5+(ptrdiff_t)36));
sum92 = _mm512_fmadd_ps(wt208, dat97, sum92);
sum93 = _mm512_fmadd_ps(wt208, dat98, sum93);
sum94 = _mm512_fmadd_ps(wt208, dat99, sum94);
sum95 = _mm512_fmadd_ps(wt208, dat100, sum95);
}
__m512 dat101 = sum77;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)256, 65535, sum76);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)320, 15, dat101);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)336, 65535, sum78);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)400, 65535, sum79);
__m512 dat102 = sum81;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)40912, 65535, sum80);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)40976, 15, dat102);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)40992, 65535, sum82);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)41056, 65535, sum83);
__m512 dat103 = sum85;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)81568, 65535, sum84);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)81632, 15, dat103);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)81648, 65535, sum86);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)81712, 65535, sum87);
__m512 dat104 = sum89;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)122224, 65535, sum88);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)122288, 15, dat104);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)122304, 65535, sum90);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)122368, 65535, sum91);
__m512 dat105 = sum93;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)162880, 65535, sum92);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)162944, 15, dat105);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)162960, 65535, sum94);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k14+(ptrdiff_t)163024, 65535, sum95);
if (j5 >= jj5) return;
++j5;
}
default: {
ptrdiff_t k15 = 1*w1;
ptrdiff_t kk11 = k15+0;
for (; k15 != 412; ++k15) {
ptrdiff_t s6 = -1;
__m512 sum96 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+24*s6+(ptrdiff_t)24));
__m512 sum100 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+24*s6+(ptrdiff_t)28));
__m512 sum104 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+24*s6+(ptrdiff_t)32));
__m512 sum108 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+24*s6+(ptrdiff_t)36));
__m512 sum112 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+24*s6+(ptrdiff_t)40));
__m512 sum116 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+24*s6+(ptrdiff_t)44));
__m512 sum97 = sum96;
__m512 sum98 = sum96;
__m512 sum99 = sum96;
__m512 sum101 = sum100;
__m512 sum102 = sum100;
__m512 sum103 = sum100;
__m512 sum105 = sum104;
__m512 sum106 = sum104;
__m512 sum107 = sum104;
__m512 sum109 = sum108;
__m512 sum110 = sum108;
__m512 sum111 = sum108;
__m512 sum113 = sum112;
__m512 sum114 = sum112;
__m512 sum115 = sum112;
__m512 sum117 = sum116;
__m512 sum118 = sum116;
__m512 sum119 = sum116;
for (s6 = 0; s6 < 835; ++s6) {
__m512 dat106 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s6+(ptrdiff_t)0);
__m512 dat107 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s6+(ptrdiff_t)64);
__m512 dat108 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s6+(ptrdiff_t)128);
__m512 dat109 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s6+(ptrdiff_t)192);
__m512 wt209 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+24*s6+(ptrdiff_t)24));
sum96 = _mm512_fmadd_ps(wt209, dat106, sum96);
sum97 = _mm512_fmadd_ps(wt209, dat107, sum97);
sum98 = _mm512_fmadd_ps(wt209, dat108, sum98);
sum99 = _mm512_fmadd_ps(wt209, dat109, sum99);
__m512 wt210 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+24*s6+(ptrdiff_t)28));
sum100 = _mm512_fmadd_ps(wt210, dat106, sum100);
sum101 = _mm512_fmadd_ps(wt210, dat107, sum101);
sum102 = _mm512_fmadd_ps(wt210, dat108, sum102);
sum103 = _mm512_fmadd_ps(wt210, dat109, sum103);
__m512 wt211 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+24*s6+(ptrdiff_t)32));
sum104 = _mm512_fmadd_ps(wt211, dat106, sum104);
sum105 = _mm512_fmadd_ps(wt211, dat107, sum105);
sum106 = _mm512_fmadd_ps(wt211, dat108, sum106);
sum107 = _mm512_fmadd_ps(wt211, dat109, sum107);
__m512 wt212 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+24*s6+(ptrdiff_t)36));
sum108 = _mm512_fmadd_ps(wt212, dat106, sum108);
sum109 = _mm512_fmadd_ps(wt212, dat107, sum109);
sum110 = _mm512_fmadd_ps(wt212, dat108, sum110);
sum111 = _mm512_fmadd_ps(wt212, dat109, sum111);
__m512 wt213 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+24*s6+(ptrdiff_t)40));
sum112 = _mm512_fmadd_ps(wt213, dat106, sum112);
sum113 = _mm512_fmadd_ps(wt213, dat107, sum113);
sum114 = _mm512_fmadd_ps(wt213, dat108, sum114);
sum115 = _mm512_fmadd_ps(wt213, dat109, sum115);
__m512 wt214 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+24*s6+(ptrdiff_t)44));
sum116 = _mm512_fmadd_ps(wt214, dat106, sum116);
sum117 = _mm512_fmadd_ps(wt214, dat107, sum117);
sum118 = _mm512_fmadd_ps(wt214, dat108, sum118);
sum119 = _mm512_fmadd_ps(wt214, dat109, sum119);
}
__m512 dat110 = sum99;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)464, 65535, sum96);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)528, 65535, sum97);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)592, 65535, sum98);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)656, 15, dat110);
__m512 dat111 = sum103;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)41120, 65535, sum100);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)41184, 65535, sum101);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)41248, 65535, sum102);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)41312, 15, dat111);
__m512 dat112 = sum107;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)81776, 65535, sum104);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)81840, 65535, sum105);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)81904, 65535, sum106);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)81968, 15, dat112);
__m512 dat113 = sum111;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)122432, 65535, sum108);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)122496, 65535, sum109);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)122560, 65535, sum110);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)122624, 15, dat113);
__m512 dat114 = sum115;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)163088, 65535, sum112);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)163152, 65535, sum113);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)163216, 65535, sum114);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)163280, 15, dat114);
__m512 dat115 = sum119;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)203744, 65535, sum116);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)203808, 65535, sum117);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)203872, 65535, sum118);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)203936, 15, dat115);
if (k15 >= kk11) return;
}
ptrdiff_t s7 = -1;
__m512 sum120 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+20*s7+(ptrdiff_t)20));
__m512 sum124 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+20*s7+(ptrdiff_t)24));
__m512 sum128 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+20*s7+(ptrdiff_t)28));
__m512 sum132 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+20*s7+(ptrdiff_t)32));
__m512 sum136 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+20*s7+(ptrdiff_t)36));
__m512 sum121 = sum120;
__m512 sum122 = sum120;
__m512 sum123 = sum120;
__m512 sum125 = sum124;
__m512 sum126 = sum124;
__m512 sum127 = sum124;
__m512 sum129 = sum128;
__m512 sum130 = sum128;
__m512 sum131 = sum128;
__m512 sum133 = sum132;
__m512 sum134 = sum132;
__m512 sum135 = sum132;
__m512 sum137 = sum136;
__m512 sum138 = sum136;
__m512 sum139 = sum136;
for (s7 = 0; s7 < 835; ++s7) {
__m512 dat116 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s7+(ptrdiff_t)0);
__m512 dat117 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s7+(ptrdiff_t)64);
__m512 dat118 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s7+(ptrdiff_t)128);
__m512 dat119 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+256*s7+(ptrdiff_t)192);
__m512 wt215 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+20*s7+(ptrdiff_t)20));
sum120 = _mm512_fmadd_ps(wt215, dat116, sum120);
sum121 = _mm512_fmadd_ps(wt215, dat117, sum121);
sum122 = _mm512_fmadd_ps(wt215, dat118, sum122);
sum123 = _mm512_fmadd_ps(wt215, dat119, sum123);
__m512 wt216 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+20*s7+(ptrdiff_t)24));
sum124 = _mm512_fmadd_ps(wt216, dat116, sum124);
sum125 = _mm512_fmadd_ps(wt216, dat117, sum125);
sum126 = _mm512_fmadd_ps(wt216, dat118, sum126);
sum127 = _mm512_fmadd_ps(wt216, dat119, sum127);
__m512 wt217 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+20*s7+(ptrdiff_t)28));
sum128 = _mm512_fmadd_ps(wt217, dat116, sum128);
sum129 = _mm512_fmadd_ps(wt217, dat117, sum129);
sum130 = _mm512_fmadd_ps(wt217, dat118, sum130);
sum131 = _mm512_fmadd_ps(wt217, dat119, sum131);
__m512 wt218 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+20*s7+(ptrdiff_t)32));
sum132 = _mm512_fmadd_ps(wt218, dat116, sum132);
sum133 = _mm512_fmadd_ps(wt218, dat117, sum133);
sum134 = _mm512_fmadd_ps(wt218, dat118, sum134);
sum135 = _mm512_fmadd_ps(wt218, dat119, sum135);
__m512 wt219 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k15+20*s7+(ptrdiff_t)36));
sum136 = _mm512_fmadd_ps(wt219, dat116, sum136);
sum137 = _mm512_fmadd_ps(wt219, dat117, sum137);
sum138 = _mm512_fmadd_ps(wt219, dat118, sum138);
sum139 = _mm512_fmadd_ps(wt219, dat119, sum139);
}
__m512 dat120 = sum123;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)464, 65535, sum120);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)528, 65535, sum121);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)592, 65535, sum122);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)656, 15, dat120);
__m512 dat121 = sum127;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)41120, 65535, sum124);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)41184, 65535, sum125);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)41248, 65535, sum126);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)41312, 15, dat121);
__m512 dat122 = sum131;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)81776, 65535, sum128);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)81840, 65535, sum129);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)81904, 65535, sum130);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)81968, 15, dat122);
__m512 dat123 = sum135;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)122432, 65535, sum132);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)122496, 65535, sum133);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)122560, 65535, sum134);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)122624, 15, dat123);
__m512 dat124 = sum139;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)163088, 65535, sum136);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)163152, 65535, sum137);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)163216, 65535, sum138);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h3+243936*k15+(ptrdiff_t)163280, 15, dat124);
if (j5 >= jj5) return;
++j5;
h3 += 2;
goto wrap5;
}
}
j5 = 181;
}
ptrdiff_t h4 = 120;
switch (j5) {
default: {
j5 = 181;
ptrdiff_t k16 = 1*w1;
ptrdiff_t kk12 = k16+0;
for (; k16 != 412; ++k16) {
ptrdiff_t s8 = -1;
__m512 sum140 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+24*s8+(ptrdiff_t)24));
__m512 sum142 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+24*s8+(ptrdiff_t)28));
__m512 sum144 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+24*s8+(ptrdiff_t)32));
__m512 sum146 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+24*s8+(ptrdiff_t)36));
__m512 sum148 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+24*s8+(ptrdiff_t)40));
__m512 sum150 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+24*s8+(ptrdiff_t)44));
__m512 sum141 = sum140;
__m512 sum143 = sum142;
__m512 sum145 = sum144;
__m512 sum147 = sum146;
__m512 sum149 = sum148;
__m512 sum151 = sum150;
for (s8 = 0; s8 < 835; ++s8) {
__m512 dat125 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+128*s8+(ptrdiff_t)0);
__m512 dat126 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+128*s8+(ptrdiff_t)64);
__m512 wt220 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+24*s8+(ptrdiff_t)24));
sum140 = _mm512_fmadd_ps(wt220, dat125, sum140);
sum141 = _mm512_fmadd_ps(wt220, dat126, sum141);
__m512 wt221 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+24*s8+(ptrdiff_t)28));
sum142 = _mm512_fmadd_ps(wt221, dat125, sum142);
sum143 = _mm512_fmadd_ps(wt221, dat126, sum143);
__m512 wt222 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+24*s8+(ptrdiff_t)32));
sum144 = _mm512_fmadd_ps(wt222, dat125, sum144);
sum145 = _mm512_fmadd_ps(wt222, dat126, sum145);
__m512 wt223 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+24*s8+(ptrdiff_t)36));
sum146 = _mm512_fmadd_ps(wt223, dat125, sum146);
sum147 = _mm512_fmadd_ps(wt223, dat126, sum147);
__m512 wt224 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+24*s8+(ptrdiff_t)40));
sum148 = _mm512_fmadd_ps(wt224, dat125, sum148);
sum149 = _mm512_fmadd_ps(wt224, dat126, sum149);
__m512 wt225 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+24*s8+(ptrdiff_t)44));
sum150 = _mm512_fmadd_ps(wt225, dat125, sum150);
sum151 = _mm512_fmadd_ps(wt225, dat126, sum151);
}
__m512 dat127 = sum141;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)256, 65535, sum140);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)320, 15, dat127);
__m512 dat128 = sum143;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)40912, 65535, sum142);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)40976, 15, dat128);
__m512 dat129 = sum145;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)81568, 65535, sum144);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)81632, 15, dat129);
__m512 dat130 = sum147;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)122224, 65535, sum146);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)122288, 15, dat130);
__m512 dat131 = sum149;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)162880, 65535, sum148);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)162944, 15, dat131);
__m512 dat132 = sum151;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)203536, 65535, sum150);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)203600, 15, dat132);
if (k16 >= kk12) return;
}
ptrdiff_t s9 = -1;
__m512 sum152 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+20*s9+(ptrdiff_t)20));
__m512 sum154 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+20*s9+(ptrdiff_t)24));
__m512 sum156 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+20*s9+(ptrdiff_t)28));
__m512 sum158 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+20*s9+(ptrdiff_t)32));
__m512 sum160 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+20*s9+(ptrdiff_t)36));
__m512 sum153 = sum152;
__m512 sum155 = sum154;
__m512 sum157 = sum156;
__m512 sum159 = sum158;
__m512 sum161 = sum160;
for (s9 = 0; s9 < 835; ++s9) {
__m512 dat133 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+128*s9+(ptrdiff_t)0);
__m512 dat134 = _mm512_loadu_ps(arrangedDats1+38797440*i9+213760*j5+128*s9+(ptrdiff_t)64);
__m512 wt226 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+20*s9+(ptrdiff_t)20));
sum152 = _mm512_fmadd_ps(wt226, dat133, sum152);
sum153 = _mm512_fmadd_ps(wt226, dat134, sum153);
__m512 wt227 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+20*s9+(ptrdiff_t)24));
sum154 = _mm512_fmadd_ps(wt227, dat133, sum154);
sum155 = _mm512_fmadd_ps(wt227, dat134, sum155);
__m512 wt228 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+20*s9+(ptrdiff_t)28));
sum156 = _mm512_fmadd_ps(wt228, dat133, sum156);
sum157 = _mm512_fmadd_ps(wt228, dat134, sum157);
__m512 wt229 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+20*s9+(ptrdiff_t)32));
sum158 = _mm512_fmadd_ps(wt229, dat133, sum158);
sum159 = _mm512_fmadd_ps(wt229, dat134, sum159);
__m512 wt230 = _mm512_set1_ps(*(float*)(arrangedWts1+8283088*i9+20064*k16+20*s9+(ptrdiff_t)36));
sum160 = _mm512_fmadd_ps(wt230, dat133, sum160);
sum161 = _mm512_fmadd_ps(wt230, dat134, sum161);
}
__m512 dat135 = sum153;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)256, 65535, sum152);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)320, 15, dat135);
__m512 dat136 = sum155;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)40912, 65535, sum154);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)40976, 15, dat136);
__m512 dat137 = sum157;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)81568, 65535, sum156);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)81632, 15, dat137);
__m512 dat138 = sum159;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)122224, 65535, sum158);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)122288, 15, dat138);
__m512 dat139 = sum161;
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)162880, 65535, sum160);
_mm512_mask_storeu_ps(datPtr3+100704912*i9+336*h4+243936*k16+(ptrdiff_t)162944, 15, dat139);
if (j5 >= jj5) return;
}
}
j5 = 182;
}
}

static void Example15OneApply1Callee2(Example15ThreaderTask1* task9, int64_t* pt10) {
void** pair3 = task9->any1;
char** tensors7 = pair3[0];
ptrdiff_t e4 = (ptrdiff_t)pair3[1];
ptrdiff_t g5 = pt10[2];
ptrdiff_t d2 = pt10[1];
ptrdiff_t w2 = pt10[0];
char*restrict arrangedWts2 = tensors7[0]+66264704*e4+(ptrdiff_t)8283088*1*g5;
char*restrict arrangedDats2 = tensors7[1]+310379520*e4+(ptrdiff_t)38797440*1*g5;
char*restrict datPtr4 = tensors7[2]+(ptrdiff_t)100704912*1*g5;
ptrdiff_t ii6 = 1;
for (ptrdiff_t i10 = 0; i10 < ii6; ++i10) {
ptrdiff_t j6 = 1*d2;
ptrdiff_t jj6 = j6+0;
if (j6 < 181) {
ptrdiff_t h5 = 0+((size_t)j6-0)/3*2;
switch (((size_t)j6-0)%3) {
case 0: {
wrap6:;
ptrdiff_t k17 = 1*w2;
ptrdiff_t kk13 = k17+0;
for (; k17 != 412; ++k17) {
ptrdiff_t s10 = -1;
__m512 sum162 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+24*s10+(ptrdiff_t)24));
__m512 sum166 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+24*s10+(ptrdiff_t)28));
__m512 sum170 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+24*s10+(ptrdiff_t)32));
__m512 sum174 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+24*s10+(ptrdiff_t)36));
__m512 sum178 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+24*s10+(ptrdiff_t)40));
__m512 sum182 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+24*s10+(ptrdiff_t)44));
__m512 sum163 = sum162;
__m512 sum164 = sum162;
__m512 sum165 = sum162;
__m512 sum167 = sum166;
__m512 sum168 = sum166;
__m512 sum169 = sum166;
__m512 sum171 = sum170;
__m512 sum172 = sum170;
__m512 sum173 = sum170;
__m512 sum175 = sum174;
__m512 sum176 = sum174;
__m512 sum177 = sum174;
__m512 sum179 = sum178;
__m512 sum180 = sum178;
__m512 sum181 = sum178;
__m512 sum183 = sum182;
__m512 sum184 = sum182;
__m512 sum185 = sum182;
for (s10 = 0; s10 < 835; ++s10) {
__m512 dat140 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s10+(ptrdiff_t)0);
__m512 dat141 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s10+(ptrdiff_t)64);
__m512 dat142 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s10+(ptrdiff_t)128);
__m512 dat143 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s10+(ptrdiff_t)192);
__m512 wt231 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+24*s10+(ptrdiff_t)24));
sum162 = _mm512_fmadd_ps(wt231, dat140, sum162);
sum163 = _mm512_fmadd_ps(wt231, dat141, sum163);
sum164 = _mm512_fmadd_ps(wt231, dat142, sum164);
sum165 = _mm512_fmadd_ps(wt231, dat143, sum165);
__m512 wt232 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+24*s10+(ptrdiff_t)28));
sum166 = _mm512_fmadd_ps(wt232, dat140, sum166);
sum167 = _mm512_fmadd_ps(wt232, dat141, sum167);
sum168 = _mm512_fmadd_ps(wt232, dat142, sum168);
sum169 = _mm512_fmadd_ps(wt232, dat143, sum169);
__m512 wt233 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+24*s10+(ptrdiff_t)32));
sum170 = _mm512_fmadd_ps(wt233, dat140, sum170);
sum171 = _mm512_fmadd_ps(wt233, dat141, sum171);
sum172 = _mm512_fmadd_ps(wt233, dat142, sum172);
sum173 = _mm512_fmadd_ps(wt233, dat143, sum173);
__m512 wt234 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+24*s10+(ptrdiff_t)36));
sum174 = _mm512_fmadd_ps(wt234, dat140, sum174);
sum175 = _mm512_fmadd_ps(wt234, dat141, sum175);
sum176 = _mm512_fmadd_ps(wt234, dat142, sum176);
sum177 = _mm512_fmadd_ps(wt234, dat143, sum177);
__m512 wt235 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+24*s10+(ptrdiff_t)40));
sum178 = _mm512_fmadd_ps(wt235, dat140, sum178);
sum179 = _mm512_fmadd_ps(wt235, dat141, sum179);
sum180 = _mm512_fmadd_ps(wt235, dat142, sum180);
sum181 = _mm512_fmadd_ps(wt235, dat143, sum181);
__m512 wt236 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+24*s10+(ptrdiff_t)44));
sum182 = _mm512_fmadd_ps(wt236, dat140, sum182);
sum183 = _mm512_fmadd_ps(wt236, dat141, sum183);
sum184 = _mm512_fmadd_ps(wt236, dat142, sum184);
sum185 = _mm512_fmadd_ps(wt236, dat143, sum185);
}
sum162 = _mm512_add_ps(sum162, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)0));
sum163 = _mm512_add_ps(sum163, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)64));
sum164 = _mm512_add_ps(sum164, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)128));
sum165 = _mm512_add_ps(sum165, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)192));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)0, 65535, sum162);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)64, 65535, sum163);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)128, 65535, sum164);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)192, 65535, sum165);
sum166 = _mm512_add_ps(sum166, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)40656));
sum167 = _mm512_add_ps(sum167, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)40720));
sum168 = _mm512_add_ps(sum168, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)40784));
sum169 = _mm512_add_ps(sum169, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)40848));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)40656, 65535, sum166);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)40720, 65535, sum167);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)40784, 65535, sum168);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)40848, 65535, sum169);
sum170 = _mm512_add_ps(sum170, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)81312));
sum171 = _mm512_add_ps(sum171, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)81376));
sum172 = _mm512_add_ps(sum172, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)81440));
sum173 = _mm512_add_ps(sum173, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)81504));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)81312, 65535, sum170);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)81376, 65535, sum171);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)81440, 65535, sum172);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)81504, 65535, sum173);
sum174 = _mm512_add_ps(sum174, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)121968));
sum175 = _mm512_add_ps(sum175, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)122032));
sum176 = _mm512_add_ps(sum176, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)122096));
sum177 = _mm512_add_ps(sum177, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)122160));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)121968, 65535, sum174);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)122032, 65535, sum175);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)122096, 65535, sum176);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)122160, 65535, sum177);
sum178 = _mm512_add_ps(sum178, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)162624));
sum179 = _mm512_add_ps(sum179, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)162688));
sum180 = _mm512_add_ps(sum180, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)162752));
sum181 = _mm512_add_ps(sum181, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)162816));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)162624, 65535, sum178);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)162688, 65535, sum179);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)162752, 65535, sum180);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)162816, 65535, sum181);
sum182 = _mm512_add_ps(sum182, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)203280));
sum183 = _mm512_add_ps(sum183, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)203344));
sum184 = _mm512_add_ps(sum184, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)203408));
sum185 = _mm512_add_ps(sum185, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)203472));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)203280, 65535, sum182);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)203344, 65535, sum183);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)203408, 65535, sum184);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)203472, 65535, sum185);
if (k17 >= kk13) return;
}
ptrdiff_t s11 = -1;
__m512 sum186 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+20*s11+(ptrdiff_t)20));
__m512 sum190 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+20*s11+(ptrdiff_t)24));
__m512 sum194 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+20*s11+(ptrdiff_t)28));
__m512 sum198 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+20*s11+(ptrdiff_t)32));
__m512 sum202 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+20*s11+(ptrdiff_t)36));
__m512 sum187 = sum186;
__m512 sum188 = sum186;
__m512 sum189 = sum186;
__m512 sum191 = sum190;
__m512 sum192 = sum190;
__m512 sum193 = sum190;
__m512 sum195 = sum194;
__m512 sum196 = sum194;
__m512 sum197 = sum194;
__m512 sum199 = sum198;
__m512 sum200 = sum198;
__m512 sum201 = sum198;
__m512 sum203 = sum202;
__m512 sum204 = sum202;
__m512 sum205 = sum202;
for (s11 = 0; s11 < 835; ++s11) {
__m512 dat144 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s11+(ptrdiff_t)0);
__m512 dat145 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s11+(ptrdiff_t)64);
__m512 dat146 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s11+(ptrdiff_t)128);
__m512 dat147 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s11+(ptrdiff_t)192);
__m512 wt237 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+20*s11+(ptrdiff_t)20));
sum186 = _mm512_fmadd_ps(wt237, dat144, sum186);
sum187 = _mm512_fmadd_ps(wt237, dat145, sum187);
sum188 = _mm512_fmadd_ps(wt237, dat146, sum188);
sum189 = _mm512_fmadd_ps(wt237, dat147, sum189);
__m512 wt238 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+20*s11+(ptrdiff_t)24));
sum190 = _mm512_fmadd_ps(wt238, dat144, sum190);
sum191 = _mm512_fmadd_ps(wt238, dat145, sum191);
sum192 = _mm512_fmadd_ps(wt238, dat146, sum192);
sum193 = _mm512_fmadd_ps(wt238, dat147, sum193);
__m512 wt239 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+20*s11+(ptrdiff_t)28));
sum194 = _mm512_fmadd_ps(wt239, dat144, sum194);
sum195 = _mm512_fmadd_ps(wt239, dat145, sum195);
sum196 = _mm512_fmadd_ps(wt239, dat146, sum196);
sum197 = _mm512_fmadd_ps(wt239, dat147, sum197);
__m512 wt240 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+20*s11+(ptrdiff_t)32));
sum198 = _mm512_fmadd_ps(wt240, dat144, sum198);
sum199 = _mm512_fmadd_ps(wt240, dat145, sum199);
sum200 = _mm512_fmadd_ps(wt240, dat146, sum200);
sum201 = _mm512_fmadd_ps(wt240, dat147, sum201);
__m512 wt241 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k17+20*s11+(ptrdiff_t)36));
sum202 = _mm512_fmadd_ps(wt241, dat144, sum202);
sum203 = _mm512_fmadd_ps(wt241, dat145, sum203);
sum204 = _mm512_fmadd_ps(wt241, dat146, sum204);
sum205 = _mm512_fmadd_ps(wt241, dat147, sum205);
}
sum186 = _mm512_add_ps(sum186, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)0));
sum187 = _mm512_add_ps(sum187, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)64));
sum188 = _mm512_add_ps(sum188, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)128));
sum189 = _mm512_add_ps(sum189, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)192));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)0, 65535, sum186);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)64, 65535, sum187);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)128, 65535, sum188);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)192, 65535, sum189);
sum190 = _mm512_add_ps(sum190, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)40656));
sum191 = _mm512_add_ps(sum191, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)40720));
sum192 = _mm512_add_ps(sum192, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)40784));
sum193 = _mm512_add_ps(sum193, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)40848));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)40656, 65535, sum190);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)40720, 65535, sum191);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)40784, 65535, sum192);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)40848, 65535, sum193);
sum194 = _mm512_add_ps(sum194, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)81312));
sum195 = _mm512_add_ps(sum195, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)81376));
sum196 = _mm512_add_ps(sum196, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)81440));
sum197 = _mm512_add_ps(sum197, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)81504));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)81312, 65535, sum194);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)81376, 65535, sum195);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)81440, 65535, sum196);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)81504, 65535, sum197);
sum198 = _mm512_add_ps(sum198, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)121968));
sum199 = _mm512_add_ps(sum199, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)122032));
sum200 = _mm512_add_ps(sum200, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)122096));
sum201 = _mm512_add_ps(sum201, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)122160));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)121968, 65535, sum198);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)122032, 65535, sum199);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)122096, 65535, sum200);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)122160, 65535, sum201);
sum202 = _mm512_add_ps(sum202, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)162624));
sum203 = _mm512_add_ps(sum203, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)162688));
sum204 = _mm512_add_ps(sum204, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)162752));
sum205 = _mm512_add_ps(sum205, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)162816));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)162624, 65535, sum202);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)162688, 65535, sum203);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)162752, 65535, sum204);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k17+(ptrdiff_t)162816, 65535, sum205);
if (j6 >= jj6) return;
if (j6 >= 180) break;
++j6;
}
case 1: {
// Multiply-accumulate step for one j6 tile: combines the packed data at
// arrangedDats2 with the packed weights at arrangedWts2 and adds the result
// into the values already stored at datPtr4.
// NOTE(review): all offsets look like byte offsets (the float* cast is applied
// after the pointer arithmetic) -- confirm against the pointer declarations,
// which are outside this excerpt.
ptrdiff_t k18 = 1*w2;
ptrdiff_t kk14 = k18+0;
// kk14 equals the starting k18, so the `k18 >= kk14` test at the bottom of this
// loop is true on the first pass and the function returns there. The loop body
// therefore runs at most once; the code after the loop is reached only when
// k18 is already 412 on entry.
for (; k18 != 412; ++k18) {
// With s12 == -1 the 24*s12 term cancels the +24, so these six broadcasts read
// the first six floats of the k18 weight record (offsets 0..20). They seed the
// accumulators -- presumably per-output biases; verify against the packing code.
ptrdiff_t s12 = -1;
__m512 sum206 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+24*s12+(ptrdiff_t)24));
__m512 sum210 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+24*s12+(ptrdiff_t)28));
__m512 sum214 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+24*s12+(ptrdiff_t)32));
__m512 sum218 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+24*s12+(ptrdiff_t)36));
__m512 sum222 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+24*s12+(ptrdiff_t)40));
__m512 sum226 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+24*s12+(ptrdiff_t)44));
// Each seed is replicated into the other three accumulators of its group,
// giving a 6 x 4 grid of accumulators (6 weights x 4 data vectors).
__m512 sum207 = sum206;
__m512 sum208 = sum206;
__m512 sum209 = sum206;
__m512 sum211 = sum210;
__m512 sum212 = sum210;
__m512 sum213 = sum210;
__m512 sum215 = sum214;
__m512 sum216 = sum214;
__m512 sum217 = sum214;
__m512 sum219 = sum218;
__m512 sum220 = sum218;
__m512 sum221 = sum218;
__m512 sum223 = sum222;
__m512 sum224 = sum222;
__m512 sum225 = sum222;
__m512 sum227 = sum226;
__m512 sum228 = sum226;
__m512 sum229 = sum226;
// 835 reduction steps. Each step loads four 16-float data vectors (256 bytes
// per step) and six scalar weights (24 bytes per step), broadcasting each
// weight and fusing it into its group of four accumulators.
for (s12 = 0; s12 < 835; ++s12) {
__m512 dat148 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s12+(ptrdiff_t)0);
__m512 dat149 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s12+(ptrdiff_t)64);
__m512 dat150 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s12+(ptrdiff_t)128);
__m512 dat151 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s12+(ptrdiff_t)192);
__m512 wt242 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+24*s12+(ptrdiff_t)24));
sum206 = _mm512_fmadd_ps(wt242, dat148, sum206);
sum207 = _mm512_fmadd_ps(wt242, dat149, sum207);
sum208 = _mm512_fmadd_ps(wt242, dat150, sum208);
sum209 = _mm512_fmadd_ps(wt242, dat151, sum209);
__m512 wt243 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+24*s12+(ptrdiff_t)28));
sum210 = _mm512_fmadd_ps(wt243, dat148, sum210);
sum211 = _mm512_fmadd_ps(wt243, dat149, sum211);
sum212 = _mm512_fmadd_ps(wt243, dat150, sum212);
sum213 = _mm512_fmadd_ps(wt243, dat151, sum213);
__m512 wt244 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+24*s12+(ptrdiff_t)32));
sum214 = _mm512_fmadd_ps(wt244, dat148, sum214);
sum215 = _mm512_fmadd_ps(wt244, dat149, sum215);
sum216 = _mm512_fmadd_ps(wt244, dat150, sum216);
sum217 = _mm512_fmadd_ps(wt244, dat151, sum217);
__m512 wt245 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+24*s12+(ptrdiff_t)36));
sum218 = _mm512_fmadd_ps(wt245, dat148, sum218);
sum219 = _mm512_fmadd_ps(wt245, dat149, sum219);
sum220 = _mm512_fmadd_ps(wt245, dat150, sum220);
sum221 = _mm512_fmadd_ps(wt245, dat151, sum221);
__m512 wt246 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+24*s12+(ptrdiff_t)40));
sum222 = _mm512_fmadd_ps(wt246, dat148, sum222);
sum223 = _mm512_fmadd_ps(wt246, dat149, sum223);
sum224 = _mm512_fmadd_ps(wt246, dat150, sum224);
sum225 = _mm512_fmadd_ps(wt246, dat151, sum225);
__m512 wt247 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+24*s12+(ptrdiff_t)44));
sum226 = _mm512_fmadd_ps(wt247, dat148, sum226);
sum227 = _mm512_fmadd_ps(wt247, dat149, sum227);
sum228 = _mm512_fmadd_ps(wt247, dat150, sum228);
sum229 = _mm512_fmadd_ps(wt247, dat151, sum229);
}
// Write-back: each accumulator is added to what datPtr4 already holds and the
// sum is stored to the same location. Mask 65535 selects all 16 lanes; mask 15
// selects only the low 4 lanes, so the second vector of each group contributes
// just 4 floats at +320 and the third vector starts right after it at +336.
// Consecutive accumulator groups are 40656 apart in the destination.
sum206 = _mm512_add_ps(sum206, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)256));
__m512 dat152 = sum207;
dat152 = _mm512_add_ps(dat152, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)320));
sum208 = _mm512_add_ps(sum208, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)336));
sum209 = _mm512_add_ps(sum209, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)400));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)256, 65535, sum206);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)320, 15, dat152);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)336, 65535, sum208);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)400, 65535, sum209);
sum210 = _mm512_add_ps(sum210, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)40912));
__m512 dat153 = sum211;
dat153 = _mm512_add_ps(dat153, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)40976));
sum212 = _mm512_add_ps(sum212, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)40992));
sum213 = _mm512_add_ps(sum213, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)41056));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)40912, 65535, sum210);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)40976, 15, dat153);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)40992, 65535, sum212);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)41056, 65535, sum213);
sum214 = _mm512_add_ps(sum214, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)81568));
__m512 dat154 = sum215;
dat154 = _mm512_add_ps(dat154, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)81632));
sum216 = _mm512_add_ps(sum216, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)81648));
sum217 = _mm512_add_ps(sum217, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)81712));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)81568, 65535, sum214);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)81632, 15, dat154);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)81648, 65535, sum216);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)81712, 65535, sum217);
sum218 = _mm512_add_ps(sum218, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)122224));
__m512 dat155 = sum219;
dat155 = _mm512_add_ps(dat155, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)122288));
sum220 = _mm512_add_ps(sum220, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)122304));
sum221 = _mm512_add_ps(sum221, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)122368));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)122224, 65535, sum218);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)122288, 15, dat155);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)122304, 65535, sum220);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)122368, 65535, sum221);
sum222 = _mm512_add_ps(sum222, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)162880));
__m512 dat156 = sum223;
dat156 = _mm512_add_ps(dat156, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)162944));
sum224 = _mm512_add_ps(sum224, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)162960));
sum225 = _mm512_add_ps(sum225, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)163024));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)162880, 65535, sum222);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)162944, 15, dat156);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)162960, 65535, sum224);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)163024, 65535, sum225);
sum226 = _mm512_add_ps(sum226, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)203536));
__m512 dat157 = sum227;
dat157 = _mm512_add_ps(dat157, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)203600));
sum228 = _mm512_add_ps(sum228, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)203616));
sum229 = _mm512_add_ps(sum229, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)203680));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)203536, 65535, sum226);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)203600, 15, dat157);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)203616, 65535, sum228);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)203680, 65535, sum229);
// Always true on the first pass (k18 still equals kk14): the function ends here.
if (k18 >= kk14) return;
}
// Tail, reached only with k18 == 412: the last weight record holds five
// weights per step instead of six (20-byte stride), so five accumulator
// groups are used. As above, s13 == -1 makes the seed reads start at offset 0
// of the record.
ptrdiff_t s13 = -1;
__m512 sum230 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+20*s13+(ptrdiff_t)20));
__m512 sum234 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+20*s13+(ptrdiff_t)24));
__m512 sum238 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+20*s13+(ptrdiff_t)28));
__m512 sum242 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+20*s13+(ptrdiff_t)32));
__m512 sum246 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+20*s13+(ptrdiff_t)36));
__m512 sum231 = sum230;
__m512 sum232 = sum230;
__m512 sum233 = sum230;
__m512 sum235 = sum234;
__m512 sum236 = sum234;
__m512 sum237 = sum234;
__m512 sum239 = sum238;
__m512 sum240 = sum238;
__m512 sum241 = sum238;
__m512 sum243 = sum242;
__m512 sum244 = sum242;
__m512 sum245 = sum242;
__m512 sum247 = sum246;
__m512 sum248 = sum246;
__m512 sum249 = sum246;
// Same 835-step reduction over the same four data vectors, with five weights
// per step.
for (s13 = 0; s13 < 835; ++s13) {
__m512 dat158 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s13+(ptrdiff_t)0);
__m512 dat159 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s13+(ptrdiff_t)64);
__m512 dat160 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s13+(ptrdiff_t)128);
__m512 dat161 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s13+(ptrdiff_t)192);
__m512 wt248 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+20*s13+(ptrdiff_t)20));
sum230 = _mm512_fmadd_ps(wt248, dat158, sum230);
sum231 = _mm512_fmadd_ps(wt248, dat159, sum231);
sum232 = _mm512_fmadd_ps(wt248, dat160, sum232);
sum233 = _mm512_fmadd_ps(wt248, dat161, sum233);
__m512 wt249 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+20*s13+(ptrdiff_t)24));
sum234 = _mm512_fmadd_ps(wt249, dat158, sum234);
sum235 = _mm512_fmadd_ps(wt249, dat159, sum235);
sum236 = _mm512_fmadd_ps(wt249, dat160, sum236);
sum237 = _mm512_fmadd_ps(wt249, dat161, sum237);
__m512 wt250 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+20*s13+(ptrdiff_t)28));
sum238 = _mm512_fmadd_ps(wt250, dat158, sum238);
sum239 = _mm512_fmadd_ps(wt250, dat159, sum239);
sum240 = _mm512_fmadd_ps(wt250, dat160, sum240);
sum241 = _mm512_fmadd_ps(wt250, dat161, sum241);
__m512 wt251 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+20*s13+(ptrdiff_t)32));
sum242 = _mm512_fmadd_ps(wt251, dat158, sum242);
sum243 = _mm512_fmadd_ps(wt251, dat159, sum243);
sum244 = _mm512_fmadd_ps(wt251, dat160, sum244);
sum245 = _mm512_fmadd_ps(wt251, dat161, sum245);
__m512 wt252 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k18+20*s13+(ptrdiff_t)36));
sum246 = _mm512_fmadd_ps(wt252, dat158, sum246);
sum247 = _mm512_fmadd_ps(wt252, dat159, sum247);
sum248 = _mm512_fmadd_ps(wt252, dat160, sum248);
sum249 = _mm512_fmadd_ps(wt252, dat161, sum249);
}
// Write-back for the five tail groups, using the same destination offsets and
// lane masks as the six-group path (the sixth group's offsets are not touched).
sum230 = _mm512_add_ps(sum230, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)256));
__m512 dat162 = sum231;
dat162 = _mm512_add_ps(dat162, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)320));
sum232 = _mm512_add_ps(sum232, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)336));
sum233 = _mm512_add_ps(sum233, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)400));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)256, 65535, sum230);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)320, 15, dat162);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)336, 65535, sum232);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)400, 65535, sum233);
sum234 = _mm512_add_ps(sum234, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)40912));
__m512 dat163 = sum235;
dat163 = _mm512_add_ps(dat163, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)40976));
sum236 = _mm512_add_ps(sum236, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)40992));
sum237 = _mm512_add_ps(sum237, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)41056));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)40912, 65535, sum234);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)40976, 15, dat163);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)40992, 65535, sum236);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)41056, 65535, sum237);
sum238 = _mm512_add_ps(sum238, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)81568));
__m512 dat164 = sum239;
dat164 = _mm512_add_ps(dat164, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)81632));
sum240 = _mm512_add_ps(sum240, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)81648));
sum241 = _mm512_add_ps(sum241, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)81712));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)81568, 65535, sum238);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)81632, 15, dat164);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)81648, 65535, sum240);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)81712, 65535, sum241);
sum242 = _mm512_add_ps(sum242, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)122224));
__m512 dat165 = sum243;
dat165 = _mm512_add_ps(dat165, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)122288));
sum244 = _mm512_add_ps(sum244, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)122304));
sum245 = _mm512_add_ps(sum245, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)122368));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)122224, 65535, sum242);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)122288, 15, dat165);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)122304, 65535, sum244);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)122368, 65535, sum245);
sum246 = _mm512_add_ps(sum246, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)162880));
__m512 dat166 = sum247;
dat166 = _mm512_add_ps(dat166, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)162944));
sum248 = _mm512_add_ps(sum248, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)162960));
sum249 = _mm512_add_ps(sum249, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)163024));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)162880, 65535, sum246);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)162944, 15, dat166);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)162960, 65535, sum248);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k18+(ptrdiff_t)163024, 65535, sum249);
// Stop once the last requested j6 tile is done; otherwise advance j6 and fall
// through into the next label (there is deliberately no break here).
if (j6 >= jj6) return;
++j6;
}
default: {
ptrdiff_t k19 = 1*w2;
ptrdiff_t kk15 = k19+0;
for (; k19 != 412; ++k19) {
ptrdiff_t s14 = -1;
__m512 sum250 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+24*s14+(ptrdiff_t)24));
__m512 sum254 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+24*s14+(ptrdiff_t)28));
__m512 sum258 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+24*s14+(ptrdiff_t)32));
__m512 sum262 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+24*s14+(ptrdiff_t)36));
__m512 sum266 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+24*s14+(ptrdiff_t)40));
__m512 sum270 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+24*s14+(ptrdiff_t)44));
__m512 sum251 = sum250;
__m512 sum252 = sum250;
__m512 sum253 = sum250;
__m512 sum255 = sum254;
__m512 sum256 = sum254;
__m512 sum257 = sum254;
__m512 sum259 = sum258;
__m512 sum260 = sum258;
__m512 sum261 = sum258;
__m512 sum263 = sum262;
__m512 sum264 = sum262;
__m512 sum265 = sum262;
__m512 sum267 = sum266;
__m512 sum268 = sum266;
__m512 sum269 = sum266;
__m512 sum271 = sum270;
__m512 sum272 = sum270;
__m512 sum273 = sum270;
for (s14 = 0; s14 < 835; ++s14) {
__m512 dat167 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s14+(ptrdiff_t)0);
__m512 dat168 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s14+(ptrdiff_t)64);
__m512 dat169 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s14+(ptrdiff_t)128);
__m512 dat170 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s14+(ptrdiff_t)192);
__m512 wt253 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+24*s14+(ptrdiff_t)24));
sum250 = _mm512_fmadd_ps(wt253, dat167, sum250);
sum251 = _mm512_fmadd_ps(wt253, dat168, sum251);
sum252 = _mm512_fmadd_ps(wt253, dat169, sum252);
sum253 = _mm512_fmadd_ps(wt253, dat170, sum253);
__m512 wt254 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+24*s14+(ptrdiff_t)28));
sum254 = _mm512_fmadd_ps(wt254, dat167, sum254);
sum255 = _mm512_fmadd_ps(wt254, dat168, sum255);
sum256 = _mm512_fmadd_ps(wt254, dat169, sum256);
sum257 = _mm512_fmadd_ps(wt254, dat170, sum257);
__m512 wt255 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+24*s14+(ptrdiff_t)32));
sum258 = _mm512_fmadd_ps(wt255, dat167, sum258);
sum259 = _mm512_fmadd_ps(wt255, dat168, sum259);
sum260 = _mm512_fmadd_ps(wt255, dat169, sum260);
sum261 = _mm512_fmadd_ps(wt255, dat170, sum261);
__m512 wt256 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+24*s14+(ptrdiff_t)36));
sum262 = _mm512_fmadd_ps(wt256, dat167, sum262);
sum263 = _mm512_fmadd_ps(wt256, dat168, sum263);
sum264 = _mm512_fmadd_ps(wt256, dat169, sum264);
sum265 = _mm512_fmadd_ps(wt256, dat170, sum265);
__m512 wt257 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+24*s14+(ptrdiff_t)40));
sum266 = _mm512_fmadd_ps(wt257, dat167, sum266);
sum267 = _mm512_fmadd_ps(wt257, dat168, sum267);
sum268 = _mm512_fmadd_ps(wt257, dat169, sum268);
sum269 = _mm512_fmadd_ps(wt257, dat170, sum269);
__m512 wt258 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+24*s14+(ptrdiff_t)44));
sum270 = _mm512_fmadd_ps(wt258, dat167, sum270);
sum271 = _mm512_fmadd_ps(wt258, dat168, sum271);
sum272 = _mm512_fmadd_ps(wt258, dat169, sum272);
sum273 = _mm512_fmadd_ps(wt258, dat170, sum273);
}
sum250 = _mm512_add_ps(sum250, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)464));
sum251 = _mm512_add_ps(sum251, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)528));
sum252 = _mm512_add_ps(sum252, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)592));
__m512 dat171 = sum253;
dat171 = _mm512_add_ps(dat171, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)656));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)464, 65535, sum250);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)528, 65535, sum251);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)592, 65535, sum252);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)656, 15, dat171);
sum254 = _mm512_add_ps(sum254, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)41120));
sum255 = _mm512_add_ps(sum255, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)41184));
sum256 = _mm512_add_ps(sum256, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)41248));
__m512 dat172 = sum257;
dat172 = _mm512_add_ps(dat172, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)41312));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)41120, 65535, sum254);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)41184, 65535, sum255);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)41248, 65535, sum256);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)41312, 15, dat172);
sum258 = _mm512_add_ps(sum258, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)81776));
sum259 = _mm512_add_ps(sum259, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)81840));
sum260 = _mm512_add_ps(sum260, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)81904));
__m512 dat173 = sum261;
dat173 = _mm512_add_ps(dat173, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)81968));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)81776, 65535, sum258);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)81840, 65535, sum259);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)81904, 65535, sum260);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)81968, 15, dat173);
sum262 = _mm512_add_ps(sum262, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)122432));
sum263 = _mm512_add_ps(sum263, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)122496));
sum264 = _mm512_add_ps(sum264, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)122560));
__m512 dat174 = sum265;
dat174 = _mm512_add_ps(dat174, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)122624));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)122432, 65535, sum262);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)122496, 65535, sum263);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)122560, 65535, sum264);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)122624, 15, dat174);
sum266 = _mm512_add_ps(sum266, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)163088));
sum267 = _mm512_add_ps(sum267, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)163152));
sum268 = _mm512_add_ps(sum268, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)163216));
__m512 dat175 = sum269;
dat175 = _mm512_add_ps(dat175, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)163280));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)163088, 65535, sum266);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)163152, 65535, sum267);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)163216, 65535, sum268);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)163280, 15, dat175);
sum270 = _mm512_add_ps(sum270, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)203744));
sum271 = _mm512_add_ps(sum271, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)203808));
sum272 = _mm512_add_ps(sum272, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)203872));
__m512 dat176 = sum273;
dat176 = _mm512_add_ps(dat176, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)203936));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)203744, 65535, sum270);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)203808, 65535, sum271);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)203872, 65535, sum272);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)203936, 15, dat176);
if (k19 >= kk15) return;
}
ptrdiff_t s15 = -1;
__m512 sum274 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+20*s15+(ptrdiff_t)20));
__m512 sum278 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+20*s15+(ptrdiff_t)24));
__m512 sum282 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+20*s15+(ptrdiff_t)28));
__m512 sum286 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+20*s15+(ptrdiff_t)32));
__m512 sum290 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+20*s15+(ptrdiff_t)36));
__m512 sum275 = sum274;
__m512 sum276 = sum274;
__m512 sum277 = sum274;
__m512 sum279 = sum278;
__m512 sum280 = sum278;
__m512 sum281 = sum278;
__m512 sum283 = sum282;
__m512 sum284 = sum282;
__m512 sum285 = sum282;
__m512 sum287 = sum286;
__m512 sum288 = sum286;
__m512 sum289 = sum286;
__m512 sum291 = sum290;
__m512 sum292 = sum290;
__m512 sum293 = sum290;
for (s15 = 0; s15 < 835; ++s15) {
__m512 dat177 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s15+(ptrdiff_t)0);
__m512 dat178 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s15+(ptrdiff_t)64);
__m512 dat179 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s15+(ptrdiff_t)128);
__m512 dat180 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+256*s15+(ptrdiff_t)192);
__m512 wt259 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+20*s15+(ptrdiff_t)20));
sum274 = _mm512_fmadd_ps(wt259, dat177, sum274);
sum275 = _mm512_fmadd_ps(wt259, dat178, sum275);
sum276 = _mm512_fmadd_ps(wt259, dat179, sum276);
sum277 = _mm512_fmadd_ps(wt259, dat180, sum277);
__m512 wt260 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+20*s15+(ptrdiff_t)24));
sum278 = _mm512_fmadd_ps(wt260, dat177, sum278);
sum279 = _mm512_fmadd_ps(wt260, dat178, sum279);
sum280 = _mm512_fmadd_ps(wt260, dat179, sum280);
sum281 = _mm512_fmadd_ps(wt260, dat180, sum281);
__m512 wt261 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+20*s15+(ptrdiff_t)28));
sum282 = _mm512_fmadd_ps(wt261, dat177, sum282);
sum283 = _mm512_fmadd_ps(wt261, dat178, sum283);
sum284 = _mm512_fmadd_ps(wt261, dat179, sum284);
sum285 = _mm512_fmadd_ps(wt261, dat180, sum285);
__m512 wt262 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+20*s15+(ptrdiff_t)32));
sum286 = _mm512_fmadd_ps(wt262, dat177, sum286);
sum287 = _mm512_fmadd_ps(wt262, dat178, sum287);
sum288 = _mm512_fmadd_ps(wt262, dat179, sum288);
sum289 = _mm512_fmadd_ps(wt262, dat180, sum289);
__m512 wt263 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k19+20*s15+(ptrdiff_t)36));
sum290 = _mm512_fmadd_ps(wt263, dat177, sum290);
sum291 = _mm512_fmadd_ps(wt263, dat178, sum291);
sum292 = _mm512_fmadd_ps(wt263, dat179, sum292);
sum293 = _mm512_fmadd_ps(wt263, dat180, sum293);
}
sum274 = _mm512_add_ps(sum274, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)464));
sum275 = _mm512_add_ps(sum275, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)528));
sum276 = _mm512_add_ps(sum276, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)592));
__m512 dat181 = sum277;
dat181 = _mm512_add_ps(dat181, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)656));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)464, 65535, sum274);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)528, 65535, sum275);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)592, 65535, sum276);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)656, 15, dat181);
sum278 = _mm512_add_ps(sum278, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)41120));
sum279 = _mm512_add_ps(sum279, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)41184));
sum280 = _mm512_add_ps(sum280, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)41248));
__m512 dat182 = sum281;
dat182 = _mm512_add_ps(dat182, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)41312));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)41120, 65535, sum278);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)41184, 65535, sum279);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)41248, 65535, sum280);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)41312, 15, dat182);
sum282 = _mm512_add_ps(sum282, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)81776));
sum283 = _mm512_add_ps(sum283, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)81840));
sum284 = _mm512_add_ps(sum284, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)81904));
__m512 dat183 = sum285;
dat183 = _mm512_add_ps(dat183, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)81968));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)81776, 65535, sum282);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)81840, 65535, sum283);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)81904, 65535, sum284);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)81968, 15, dat183);
sum286 = _mm512_add_ps(sum286, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)122432));
sum287 = _mm512_add_ps(sum287, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)122496));
sum288 = _mm512_add_ps(sum288, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)122560));
__m512 dat184 = sum289;
dat184 = _mm512_add_ps(dat184, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)122624));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)122432, 65535, sum286);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)122496, 65535, sum287);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)122560, 65535, sum288);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)122624, 15, dat184);
sum290 = _mm512_add_ps(sum290, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)163088));
sum291 = _mm512_add_ps(sum291, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)163152));
sum292 = _mm512_add_ps(sum292, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)163216));
__m512 dat185 = sum293;
dat185 = _mm512_add_ps(dat185, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)163280));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)163088, 65535, sum290);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)163152, 65535, sum291);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)163216, 65535, sum292);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h5+243936*k19+(ptrdiff_t)163280, 15, dat185);
if (j6 >= jj6) return;
++j6;
h5 += 2;
goto wrap6;
}
}
j6 = 181;
}
ptrdiff_t h6 = 120;
switch (j6) {
default: {
j6 = 181;
ptrdiff_t k20 = 1*w2;
ptrdiff_t kk16 = k20+0;
for (; k20 != 412; ++k20) {
ptrdiff_t s16 = -1;
__m512 sum294 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+24*s16+(ptrdiff_t)24));
__m512 sum296 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+24*s16+(ptrdiff_t)28));
__m512 sum298 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+24*s16+(ptrdiff_t)32));
__m512 sum300 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+24*s16+(ptrdiff_t)36));
__m512 sum302 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+24*s16+(ptrdiff_t)40));
__m512 sum304 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+24*s16+(ptrdiff_t)44));
__m512 sum295 = sum294;
__m512 sum297 = sum296;
__m512 sum299 = sum298;
__m512 sum301 = sum300;
__m512 sum303 = sum302;
__m512 sum305 = sum304;
for (s16 = 0; s16 < 835; ++s16) {
__m512 dat186 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+128*s16+(ptrdiff_t)0);
__m512 dat187 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+128*s16+(ptrdiff_t)64);
__m512 wt264 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+24*s16+(ptrdiff_t)24));
sum294 = _mm512_fmadd_ps(wt264, dat186, sum294);
sum295 = _mm512_fmadd_ps(wt264, dat187, sum295);
__m512 wt265 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+24*s16+(ptrdiff_t)28));
sum296 = _mm512_fmadd_ps(wt265, dat186, sum296);
sum297 = _mm512_fmadd_ps(wt265, dat187, sum297);
__m512 wt266 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+24*s16+(ptrdiff_t)32));
sum298 = _mm512_fmadd_ps(wt266, dat186, sum298);
sum299 = _mm512_fmadd_ps(wt266, dat187, sum299);
__m512 wt267 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+24*s16+(ptrdiff_t)36));
sum300 = _mm512_fmadd_ps(wt267, dat186, sum300);
sum301 = _mm512_fmadd_ps(wt267, dat187, sum301);
__m512 wt268 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+24*s16+(ptrdiff_t)40));
sum302 = _mm512_fmadd_ps(wt268, dat186, sum302);
sum303 = _mm512_fmadd_ps(wt268, dat187, sum303);
__m512 wt269 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+24*s16+(ptrdiff_t)44));
sum304 = _mm512_fmadd_ps(wt269, dat186, sum304);
sum305 = _mm512_fmadd_ps(wt269, dat187, sum305);
}
sum294 = _mm512_add_ps(sum294, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)256));
__m512 dat188 = sum295;
dat188 = _mm512_add_ps(dat188, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)320));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)256, 65535, sum294);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)320, 15, dat188);
sum296 = _mm512_add_ps(sum296, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)40912));
__m512 dat189 = sum297;
dat189 = _mm512_add_ps(dat189, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)40976));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)40912, 65535, sum296);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)40976, 15, dat189);
sum298 = _mm512_add_ps(sum298, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)81568));
__m512 dat190 = sum299;
dat190 = _mm512_add_ps(dat190, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)81632));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)81568, 65535, sum298);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)81632, 15, dat190);
sum300 = _mm512_add_ps(sum300, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)122224));
__m512 dat191 = sum301;
dat191 = _mm512_add_ps(dat191, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)122288));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)122224, 65535, sum300);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)122288, 15, dat191);
sum302 = _mm512_add_ps(sum302, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)162880));
__m512 dat192 = sum303;
dat192 = _mm512_add_ps(dat192, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)162944));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)162880, 65535, sum302);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)162944, 15, dat192);
sum304 = _mm512_add_ps(sum304, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)203536));
__m512 dat193 = sum305;
dat193 = _mm512_add_ps(dat193, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)203600));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)203536, 65535, sum304);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)203600, 15, dat193);
if (k20 >= kk16) return;
}
ptrdiff_t s17 = -1;
__m512 sum306 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+20*s17+(ptrdiff_t)20));
__m512 sum308 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+20*s17+(ptrdiff_t)24));
__m512 sum310 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+20*s17+(ptrdiff_t)28));
__m512 sum312 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+20*s17+(ptrdiff_t)32));
__m512 sum314 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+20*s17+(ptrdiff_t)36));
__m512 sum307 = sum306;
__m512 sum309 = sum308;
__m512 sum311 = sum310;
__m512 sum313 = sum312;
__m512 sum315 = sum314;
for (s17 = 0; s17 < 835; ++s17) {
__m512 dat194 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+128*s17+(ptrdiff_t)0);
__m512 dat195 = _mm512_loadu_ps(arrangedDats2+38797440*i10+213760*j6+128*s17+(ptrdiff_t)64);
__m512 wt270 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+20*s17+(ptrdiff_t)20));
sum306 = _mm512_fmadd_ps(wt270, dat194, sum306);
sum307 = _mm512_fmadd_ps(wt270, dat195, sum307);
__m512 wt271 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+20*s17+(ptrdiff_t)24));
sum308 = _mm512_fmadd_ps(wt271, dat194, sum308);
sum309 = _mm512_fmadd_ps(wt271, dat195, sum309);
__m512 wt272 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+20*s17+(ptrdiff_t)28));
sum310 = _mm512_fmadd_ps(wt272, dat194, sum310);
sum311 = _mm512_fmadd_ps(wt272, dat195, sum311);
__m512 wt273 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+20*s17+(ptrdiff_t)32));
sum312 = _mm512_fmadd_ps(wt273, dat194, sum312);
sum313 = _mm512_fmadd_ps(wt273, dat195, sum313);
__m512 wt274 = _mm512_set1_ps(*(float*)(arrangedWts2+8283088*i10+20064*k20+20*s17+(ptrdiff_t)36));
sum314 = _mm512_fmadd_ps(wt274, dat194, sum314);
sum315 = _mm512_fmadd_ps(wt274, dat195, sum315);
}
sum306 = _mm512_add_ps(sum306, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)256));
__m512 dat196 = sum307;
dat196 = _mm512_add_ps(dat196, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)320));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)256, 65535, sum306);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)320, 15, dat196);
sum308 = _mm512_add_ps(sum308, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)40912));
__m512 dat197 = sum309;
dat197 = _mm512_add_ps(dat197, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)40976));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)40912, 65535, sum308);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)40976, 15, dat197);
sum310 = _mm512_add_ps(sum310, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)81568));
__m512 dat198 = sum311;
dat198 = _mm512_add_ps(dat198, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)81632));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)81568, 65535, sum310);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)81632, 15, dat198);
sum312 = _mm512_add_ps(sum312, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)122224));
__m512 dat199 = sum313;
dat199 = _mm512_add_ps(dat199, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)122288));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)122224, 65535, sum312);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)122288, 15, dat199);
sum314 = _mm512_add_ps(sum314, _mm512_maskz_loadu_ps(65535, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)162880));
__m512 dat200 = sum315;
dat200 = _mm512_add_ps(dat200, _mm512_maskz_loadu_ps(15, datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)162944));
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)162880, 65535, sum314);
_mm512_mask_storeu_ps(datPtr4+100704912*i10+336*h6+243936*k20+(ptrdiff_t)162944, 15, dat200);
if (j6 >= jj6) return;
}
}
j6 = 182;
}
}

static void Example15OneApply1Callee3(Example15ThreaderTask1* task10, int64_t* pt11) {
void** pair4 = task10->any1;
char** tensors8 = pair4[0];
ptrdiff_t e6 = 4;
ptrdiff_t g6 = pt11[2];
ptrdiff_t d3 = pt11[1];
ptrdiff_t w3 = pt11[0];
char*restrict arrangedWts3 = tensors8[0]+66264704*e6+(ptrdiff_t)7421092*1*g6;
char*restrict arrangedDats3 = tensors8[1]+310379520*e6+(ptrdiff_t)34755072*1*g6;
char*restrict datPtr5 = tensors8[2]+(ptrdiff_t)100704912*1*g6;
ptrdiff_t ii7 = 1;
for (ptrdiff_t i11 = 0; i11 < ii7; ++i11) {
ptrdiff_t j7 = 1*d3;
ptrdiff_t jj7 = j7+0;
if (j7 < 181) {
ptrdiff_t h7 = 0+((size_t)j7-0)/3*2;
switch (((size_t)j7-0)%3) {
case 0: {
wrap7:;
ptrdiff_t k21 = 1*w3;
ptrdiff_t kk17 = k21+0;
for (; k21 != 412; ++k21) {
ptrdiff_t s18 = -1;
__m512 sum316 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+24*s18+(ptrdiff_t)24));
__m512 sum320 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+24*s18+(ptrdiff_t)28));
__m512 sum324 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+24*s18+(ptrdiff_t)32));
__m512 sum328 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+24*s18+(ptrdiff_t)36));
__m512 sum332 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+24*s18+(ptrdiff_t)40));
__m512 sum336 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+24*s18+(ptrdiff_t)44));
__m512 sum317 = sum316;
__m512 sum318 = sum316;
__m512 sum319 = sum316;
__m512 sum321 = sum320;
__m512 sum322 = sum320;
__m512 sum323 = sum320;
__m512 sum325 = sum324;
__m512 sum326 = sum324;
__m512 sum327 = sum324;
__m512 sum329 = sum328;
__m512 sum330 = sum328;
__m512 sum331 = sum328;
__m512 sum333 = sum332;
__m512 sum334 = sum332;
__m512 sum335 = sum332;
__m512 sum337 = sum336;
__m512 sum338 = sum336;
__m512 sum339 = sum336;
for (s18 = 0; s18 < 748; ++s18) {
__m512 dat201 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s18+(ptrdiff_t)0);
__m512 dat202 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s18+(ptrdiff_t)64);
__m512 dat203 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s18+(ptrdiff_t)128);
__m512 dat204 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s18+(ptrdiff_t)192);
__m512 wt275 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+24*s18+(ptrdiff_t)24));
sum316 = _mm512_fmadd_ps(wt275, dat201, sum316);
sum317 = _mm512_fmadd_ps(wt275, dat202, sum317);
sum318 = _mm512_fmadd_ps(wt275, dat203, sum318);
sum319 = _mm512_fmadd_ps(wt275, dat204, sum319);
__m512 wt276 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+24*s18+(ptrdiff_t)28));
sum320 = _mm512_fmadd_ps(wt276, dat201, sum320);
sum321 = _mm512_fmadd_ps(wt276, dat202, sum321);
sum322 = _mm512_fmadd_ps(wt276, dat203, sum322);
sum323 = _mm512_fmadd_ps(wt276, dat204, sum323);
__m512 wt277 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+24*s18+(ptrdiff_t)32));
sum324 = _mm512_fmadd_ps(wt277, dat201, sum324);
sum325 = _mm512_fmadd_ps(wt277, dat202, sum325);
sum326 = _mm512_fmadd_ps(wt277, dat203, sum326);
sum327 = _mm512_fmadd_ps(wt277, dat204, sum327);
__m512 wt278 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+24*s18+(ptrdiff_t)36));
sum328 = _mm512_fmadd_ps(wt278, dat201, sum328);
sum329 = _mm512_fmadd_ps(wt278, dat202, sum329);
sum330 = _mm512_fmadd_ps(wt278, dat203, sum330);
sum331 = _mm512_fmadd_ps(wt278, dat204, sum331);
__m512 wt279 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+24*s18+(ptrdiff_t)40));
sum332 = _mm512_fmadd_ps(wt279, dat201, sum332);
sum333 = _mm512_fmadd_ps(wt279, dat202, sum333);
sum334 = _mm512_fmadd_ps(wt279, dat203, sum334);
sum335 = _mm512_fmadd_ps(wt279, dat204, sum335);
__m512 wt280 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+24*s18+(ptrdiff_t)44));
sum336 = _mm512_fmadd_ps(wt280, dat201, sum336);
sum337 = _mm512_fmadd_ps(wt280, dat202, sum337);
sum338 = _mm512_fmadd_ps(wt280, dat203, sum338);
sum339 = _mm512_fmadd_ps(wt280, dat204, sum339);
}
sum316 = _mm512_add_ps(sum316, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)0));
sum317 = _mm512_add_ps(sum317, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)64));
sum318 = _mm512_add_ps(sum318, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)128));
sum319 = _mm512_add_ps(sum319, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)192));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)0, 65535, sum316);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)64, 65535, sum317);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)128, 65535, sum318);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)192, 65535, sum319);
sum320 = _mm512_add_ps(sum320, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)40656));
sum321 = _mm512_add_ps(sum321, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)40720));
sum322 = _mm512_add_ps(sum322, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)40784));
sum323 = _mm512_add_ps(sum323, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)40848));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)40656, 65535, sum320);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)40720, 65535, sum321);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)40784, 65535, sum322);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)40848, 65535, sum323);
sum324 = _mm512_add_ps(sum324, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)81312));
sum325 = _mm512_add_ps(sum325, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)81376));
sum326 = _mm512_add_ps(sum326, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)81440));
sum327 = _mm512_add_ps(sum327, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)81504));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)81312, 65535, sum324);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)81376, 65535, sum325);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)81440, 65535, sum326);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)81504, 65535, sum327);
sum328 = _mm512_add_ps(sum328, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)121968));
sum329 = _mm512_add_ps(sum329, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)122032));
sum330 = _mm512_add_ps(sum330, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)122096));
sum331 = _mm512_add_ps(sum331, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)122160));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)121968, 65535, sum328);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)122032, 65535, sum329);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)122096, 65535, sum330);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)122160, 65535, sum331);
sum332 = _mm512_add_ps(sum332, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)162624));
sum333 = _mm512_add_ps(sum333, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)162688));
sum334 = _mm512_add_ps(sum334, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)162752));
sum335 = _mm512_add_ps(sum335, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)162816));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)162624, 65535, sum332);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)162688, 65535, sum333);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)162752, 65535, sum334);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)162816, 65535, sum335);
sum336 = _mm512_add_ps(sum336, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)203280));
sum337 = _mm512_add_ps(sum337, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)203344));
sum338 = _mm512_add_ps(sum338, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)203408));
sum339 = _mm512_add_ps(sum339, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)203472));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)203280, 65535, sum336);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)203344, 65535, sum337);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)203408, 65535, sum338);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)203472, 65535, sum339);
if (k21 >= kk17) return;
}
ptrdiff_t s19 = -1;
__m512 sum340 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+20*s19+(ptrdiff_t)20));
__m512 sum344 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+20*s19+(ptrdiff_t)24));
__m512 sum348 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+20*s19+(ptrdiff_t)28));
__m512 sum352 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+20*s19+(ptrdiff_t)32));
__m512 sum356 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+20*s19+(ptrdiff_t)36));
__m512 sum341 = sum340;
__m512 sum342 = sum340;
__m512 sum343 = sum340;
__m512 sum345 = sum344;
__m512 sum346 = sum344;
__m512 sum347 = sum344;
__m512 sum349 = sum348;
__m512 sum350 = sum348;
__m512 sum351 = sum348;
__m512 sum353 = sum352;
__m512 sum354 = sum352;
__m512 sum355 = sum352;
__m512 sum357 = sum356;
__m512 sum358 = sum356;
__m512 sum359 = sum356;
for (s19 = 0; s19 < 748; ++s19) {
__m512 dat205 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s19+(ptrdiff_t)0);
__m512 dat206 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s19+(ptrdiff_t)64);
__m512 dat207 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s19+(ptrdiff_t)128);
__m512 dat208 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s19+(ptrdiff_t)192);
__m512 wt281 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+20*s19+(ptrdiff_t)20));
sum340 = _mm512_fmadd_ps(wt281, dat205, sum340);
sum341 = _mm512_fmadd_ps(wt281, dat206, sum341);
sum342 = _mm512_fmadd_ps(wt281, dat207, sum342);
sum343 = _mm512_fmadd_ps(wt281, dat208, sum343);
__m512 wt282 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+20*s19+(ptrdiff_t)24));
sum344 = _mm512_fmadd_ps(wt282, dat205, sum344);
sum345 = _mm512_fmadd_ps(wt282, dat206, sum345);
sum346 = _mm512_fmadd_ps(wt282, dat207, sum346);
sum347 = _mm512_fmadd_ps(wt282, dat208, sum347);
__m512 wt283 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+20*s19+(ptrdiff_t)28));
sum348 = _mm512_fmadd_ps(wt283, dat205, sum348);
sum349 = _mm512_fmadd_ps(wt283, dat206, sum349);
sum350 = _mm512_fmadd_ps(wt283, dat207, sum350);
sum351 = _mm512_fmadd_ps(wt283, dat208, sum351);
__m512 wt284 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+20*s19+(ptrdiff_t)32));
sum352 = _mm512_fmadd_ps(wt284, dat205, sum352);
sum353 = _mm512_fmadd_ps(wt284, dat206, sum353);
sum354 = _mm512_fmadd_ps(wt284, dat207, sum354);
sum355 = _mm512_fmadd_ps(wt284, dat208, sum355);
__m512 wt285 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k21+20*s19+(ptrdiff_t)36));
sum356 = _mm512_fmadd_ps(wt285, dat205, sum356);
sum357 = _mm512_fmadd_ps(wt285, dat206, sum357);
sum358 = _mm512_fmadd_ps(wt285, dat207, sum358);
sum359 = _mm512_fmadd_ps(wt285, dat208, sum359);
}
sum340 = _mm512_add_ps(sum340, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)0));
sum341 = _mm512_add_ps(sum341, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)64));
sum342 = _mm512_add_ps(sum342, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)128));
sum343 = _mm512_add_ps(sum343, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)192));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)0, 65535, sum340);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)64, 65535, sum341);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)128, 65535, sum342);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)192, 65535, sum343);
sum344 = _mm512_add_ps(sum344, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)40656));
sum345 = _mm512_add_ps(sum345, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)40720));
sum346 = _mm512_add_ps(sum346, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)40784));
sum347 = _mm512_add_ps(sum347, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)40848));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)40656, 65535, sum344);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)40720, 65535, sum345);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)40784, 65535, sum346);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)40848, 65535, sum347);
sum348 = _mm512_add_ps(sum348, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)81312));
sum349 = _mm512_add_ps(sum349, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)81376));
sum350 = _mm512_add_ps(sum350, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)81440));
sum351 = _mm512_add_ps(sum351, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)81504));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)81312, 65535, sum348);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)81376, 65535, sum349);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)81440, 65535, sum350);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)81504, 65535, sum351);
sum352 = _mm512_add_ps(sum352, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)121968));
sum353 = _mm512_add_ps(sum353, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)122032));
sum354 = _mm512_add_ps(sum354, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)122096));
sum355 = _mm512_add_ps(sum355, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)122160));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)121968, 65535, sum352);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)122032, 65535, sum353);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)122096, 65535, sum354);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)122160, 65535, sum355);
sum356 = _mm512_add_ps(sum356, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)162624));
sum357 = _mm512_add_ps(sum357, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)162688));
sum358 = _mm512_add_ps(sum358, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)162752));
sum359 = _mm512_add_ps(sum359, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)162816));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)162624, 65535, sum356);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)162688, 65535, sum357);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)162752, 65535, sum358);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k21+(ptrdiff_t)162816, 65535, sum359);
if (j7 >= jj7) return;
if (j7 >= 180) break;
++j7;
}
case 1: {
ptrdiff_t k22 = 1*w3;
ptrdiff_t kk18 = k22+0;
for (; k22 != 412; ++k22) {
ptrdiff_t s20 = -1;
__m512 sum360 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+24*s20+(ptrdiff_t)24));
__m512 sum364 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+24*s20+(ptrdiff_t)28));
__m512 sum368 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+24*s20+(ptrdiff_t)32));
__m512 sum372 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+24*s20+(ptrdiff_t)36));
__m512 sum376 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+24*s20+(ptrdiff_t)40));
__m512 sum380 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+24*s20+(ptrdiff_t)44));
__m512 sum361 = sum360;
__m512 sum362 = sum360;
__m512 sum363 = sum360;
__m512 sum365 = sum364;
__m512 sum366 = sum364;
__m512 sum367 = sum364;
__m512 sum369 = sum368;
__m512 sum370 = sum368;
__m512 sum371 = sum368;
__m512 sum373 = sum372;
__m512 sum374 = sum372;
__m512 sum375 = sum372;
__m512 sum377 = sum376;
__m512 sum378 = sum376;
__m512 sum379 = sum376;
__m512 sum381 = sum380;
__m512 sum382 = sum380;
__m512 sum383 = sum380;
// Reduction loop over s20 (748 steps). Each step multiplies six broadcast
// weights by four data vectors and accumulates into the 24 running sums
// sum360..sum383 (declared before this loop), one fused multiply-add each.
// Per step the data offset advances by 256 and the weight offset by 24.
// NOTE(review): the offsets look like byte offsets (four 64-apart vector loads,
// six 4-apart float reads) — confirm against the declarations of
// arrangedDats3 / arrangedWts3, which are outside this excerpt.
for (s20 = 0; s20 < 748; ++s20) {
// Four unaligned 512-bit loads of the data for this step.
__m512 dat209 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s20+(ptrdiff_t)0);
__m512 dat210 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s20+(ptrdiff_t)64);
__m512 dat211 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s20+(ptrdiff_t)128);
__m512 dat212 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s20+(ptrdiff_t)192);
// Weight 1 of 6: broadcast one float to all lanes, then update sum360..sum363.
__m512 wt286 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+24*s20+(ptrdiff_t)24));
sum360 = _mm512_fmadd_ps(wt286, dat209, sum360);
sum361 = _mm512_fmadd_ps(wt286, dat210, sum361);
sum362 = _mm512_fmadd_ps(wt286, dat211, sum362);
sum363 = _mm512_fmadd_ps(wt286, dat212, sum363);
// Weight 2 of 6 -> sum364..sum367.
__m512 wt287 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+24*s20+(ptrdiff_t)28));
sum364 = _mm512_fmadd_ps(wt287, dat209, sum364);
sum365 = _mm512_fmadd_ps(wt287, dat210, sum365);
sum366 = _mm512_fmadd_ps(wt287, dat211, sum366);
sum367 = _mm512_fmadd_ps(wt287, dat212, sum367);
// Weight 3 of 6 -> sum368..sum371.
__m512 wt288 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+24*s20+(ptrdiff_t)32));
sum368 = _mm512_fmadd_ps(wt288, dat209, sum368);
sum369 = _mm512_fmadd_ps(wt288, dat210, sum369);
sum370 = _mm512_fmadd_ps(wt288, dat211, sum370);
sum371 = _mm512_fmadd_ps(wt288, dat212, sum371);
// Weight 4 of 6 -> sum372..sum375.
__m512 wt289 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+24*s20+(ptrdiff_t)36));
sum372 = _mm512_fmadd_ps(wt289, dat209, sum372);
sum373 = _mm512_fmadd_ps(wt289, dat210, sum373);
sum374 = _mm512_fmadd_ps(wt289, dat211, sum374);
sum375 = _mm512_fmadd_ps(wt289, dat212, sum375);
// Weight 5 of 6 -> sum376..sum379.
__m512 wt290 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+24*s20+(ptrdiff_t)40));
sum376 = _mm512_fmadd_ps(wt290, dat209, sum376);
sum377 = _mm512_fmadd_ps(wt290, dat210, sum377);
sum378 = _mm512_fmadd_ps(wt290, dat211, sum378);
sum379 = _mm512_fmadd_ps(wt290, dat212, sum379);
// Weight 6 of 6 -> sum380..sum383.
__m512 wt291 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+24*s20+(ptrdiff_t)44));
sum380 = _mm512_fmadd_ps(wt291, dat209, sum380);
sum381 = _mm512_fmadd_ps(wt291, dat210, sum381);
sum382 = _mm512_fmadd_ps(wt291, dat211, sum382);
sum383 = _mm512_fmadd_ps(wt291, dat212, sum383);
}
sum360 = _mm512_add_ps(sum360, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)256));
__m512 dat213 = sum361;
dat213 = _mm512_add_ps(dat213, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)320));
sum362 = _mm512_add_ps(sum362, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)336));
sum363 = _mm512_add_ps(sum363, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)400));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)256, 65535, sum360);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)320, 15, dat213);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)336, 65535, sum362);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)400, 65535, sum363);
sum364 = _mm512_add_ps(sum364, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)40912));
__m512 dat214 = sum365;
dat214 = _mm512_add_ps(dat214, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)40976));
sum366 = _mm512_add_ps(sum366, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)40992));
sum367 = _mm512_add_ps(sum367, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)41056));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)40912, 65535, sum364);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)40976, 15, dat214);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)40992, 65535, sum366);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)41056, 65535, sum367);
sum368 = _mm512_add_ps(sum368, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)81568));
__m512 dat215 = sum369;
dat215 = _mm512_add_ps(dat215, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)81632));
sum370 = _mm512_add_ps(sum370, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)81648));
sum371 = _mm512_add_ps(sum371, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)81712));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)81568, 65535, sum368);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)81632, 15, dat215);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)81648, 65535, sum370);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)81712, 65535, sum371);
sum372 = _mm512_add_ps(sum372, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)122224));
__m512 dat216 = sum373;
dat216 = _mm512_add_ps(dat216, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)122288));
sum374 = _mm512_add_ps(sum374, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)122304));
sum375 = _mm512_add_ps(sum375, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)122368));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)122224, 65535, sum372);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)122288, 15, dat216);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)122304, 65535, sum374);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)122368, 65535, sum375);
sum376 = _mm512_add_ps(sum376, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)162880));
__m512 dat217 = sum377;
dat217 = _mm512_add_ps(dat217, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)162944));
sum378 = _mm512_add_ps(sum378, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)162960));
sum379 = _mm512_add_ps(sum379, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)163024));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)162880, 65535, sum376);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)162944, 15, dat217);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)162960, 65535, sum378);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)163024, 65535, sum379);
sum380 = _mm512_add_ps(sum380, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)203536));
__m512 dat218 = sum381;
dat218 = _mm512_add_ps(dat218, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)203600));
sum382 = _mm512_add_ps(sum382, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)203616));
sum383 = _mm512_add_ps(sum383, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)203680));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)203536, 65535, sum380);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)203600, 15, dat218);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)203616, 65535, sum382);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)203680, 65535, sum383);
if (k22 >= kk18) return;
}
ptrdiff_t s21 = -1;
__m512 sum384 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+20*s21+(ptrdiff_t)20));
__m512 sum388 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+20*s21+(ptrdiff_t)24));
__m512 sum392 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+20*s21+(ptrdiff_t)28));
__m512 sum396 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+20*s21+(ptrdiff_t)32));
__m512 sum400 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+20*s21+(ptrdiff_t)36));
__m512 sum385 = sum384;
__m512 sum386 = sum384;
__m512 sum387 = sum384;
__m512 sum389 = sum388;
__m512 sum390 = sum388;
__m512 sum391 = sum388;
__m512 sum393 = sum392;
__m512 sum394 = sum392;
__m512 sum395 = sum392;
__m512 sum397 = sum396;
__m512 sum398 = sum396;
__m512 sum399 = sum396;
__m512 sum401 = sum400;
__m512 sum402 = sum400;
__m512 sum403 = sum400;
// Reduction loop over s21 (748 steps) for the five-weight group that follows
// the six-weight k22 loop. Each step multiplies five broadcast weights by four
// data vectors and accumulates into the 20 running sums sum384..sum403.
// Those sums were seeded just above with s21 == -1, i.e. from the five floats
// located immediately before the first per-step weight group
// (presumably bias terms — confirm against the weight-packing code).
// Per step the data offset advances by 256 and the weight offset by 20.
for (s21 = 0; s21 < 748; ++s21) {
// Four unaligned 512-bit loads of the data for this step.
__m512 dat219 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s21+(ptrdiff_t)0);
__m512 dat220 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s21+(ptrdiff_t)64);
__m512 dat221 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s21+(ptrdiff_t)128);
__m512 dat222 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s21+(ptrdiff_t)192);
// Weight 1 of 5: broadcast one float to all lanes, then update sum384..sum387.
__m512 wt292 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+20*s21+(ptrdiff_t)20));
sum384 = _mm512_fmadd_ps(wt292, dat219, sum384);
sum385 = _mm512_fmadd_ps(wt292, dat220, sum385);
sum386 = _mm512_fmadd_ps(wt292, dat221, sum386);
sum387 = _mm512_fmadd_ps(wt292, dat222, sum387);
// Weight 2 of 5 -> sum388..sum391.
__m512 wt293 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+20*s21+(ptrdiff_t)24));
sum388 = _mm512_fmadd_ps(wt293, dat219, sum388);
sum389 = _mm512_fmadd_ps(wt293, dat220, sum389);
sum390 = _mm512_fmadd_ps(wt293, dat221, sum390);
sum391 = _mm512_fmadd_ps(wt293, dat222, sum391);
// Weight 3 of 5 -> sum392..sum395.
__m512 wt294 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+20*s21+(ptrdiff_t)28));
sum392 = _mm512_fmadd_ps(wt294, dat219, sum392);
sum393 = _mm512_fmadd_ps(wt294, dat220, sum393);
sum394 = _mm512_fmadd_ps(wt294, dat221, sum394);
sum395 = _mm512_fmadd_ps(wt294, dat222, sum395);
// Weight 4 of 5 -> sum396..sum399.
__m512 wt295 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+20*s21+(ptrdiff_t)32));
sum396 = _mm512_fmadd_ps(wt295, dat219, sum396);
sum397 = _mm512_fmadd_ps(wt295, dat220, sum397);
sum398 = _mm512_fmadd_ps(wt295, dat221, sum398);
sum399 = _mm512_fmadd_ps(wt295, dat222, sum399);
// Weight 5 of 5 -> sum400..sum403.
__m512 wt296 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k22+20*s21+(ptrdiff_t)36));
sum400 = _mm512_fmadd_ps(wt296, dat219, sum400);
sum401 = _mm512_fmadd_ps(wt296, dat220, sum401);
sum402 = _mm512_fmadd_ps(wt296, dat221, sum402);
sum403 = _mm512_fmadd_ps(wt296, dat222, sum403);
}
sum384 = _mm512_add_ps(sum384, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)256));
__m512 dat223 = sum385;
dat223 = _mm512_add_ps(dat223, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)320));
sum386 = _mm512_add_ps(sum386, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)336));
sum387 = _mm512_add_ps(sum387, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)400));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)256, 65535, sum384);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)320, 15, dat223);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)336, 65535, sum386);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)400, 65535, sum387);
sum388 = _mm512_add_ps(sum388, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)40912));
__m512 dat224 = sum389;
dat224 = _mm512_add_ps(dat224, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)40976));
sum390 = _mm512_add_ps(sum390, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)40992));
sum391 = _mm512_add_ps(sum391, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)41056));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)40912, 65535, sum388);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)40976, 15, dat224);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)40992, 65535, sum390);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)41056, 65535, sum391);
sum392 = _mm512_add_ps(sum392, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)81568));
__m512 dat225 = sum393;
dat225 = _mm512_add_ps(dat225, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)81632));
sum394 = _mm512_add_ps(sum394, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)81648));
sum395 = _mm512_add_ps(sum395, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)81712));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)81568, 65535, sum392);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)81632, 15, dat225);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)81648, 65535, sum394);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)81712, 65535, sum395);
sum396 = _mm512_add_ps(sum396, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)122224));
__m512 dat226 = sum397;
dat226 = _mm512_add_ps(dat226, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)122288));
sum398 = _mm512_add_ps(sum398, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)122304));
sum399 = _mm512_add_ps(sum399, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)122368));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)122224, 65535, sum396);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)122288, 15, dat226);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)122304, 65535, sum398);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)122368, 65535, sum399);
sum400 = _mm512_add_ps(sum400, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)162880));
__m512 dat227 = sum401;
dat227 = _mm512_add_ps(dat227, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)162944));
sum402 = _mm512_add_ps(sum402, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)162960));
sum403 = _mm512_add_ps(sum403, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)163024));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)162880, 65535, sum400);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)162944, 15, dat227);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)162960, 65535, sum402);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k22+(ptrdiff_t)163024, 65535, sum403);
if (j7 >= jj7) return;
++j7;
}
default: {
ptrdiff_t k23 = 1*w3;
ptrdiff_t kk19 = k23+0;
for (; k23 != 412; ++k23) {
ptrdiff_t s22 = -1;
__m512 sum404 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+24*s22+(ptrdiff_t)24));
__m512 sum408 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+24*s22+(ptrdiff_t)28));
__m512 sum412 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+24*s22+(ptrdiff_t)32));
__m512 sum416 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+24*s22+(ptrdiff_t)36));
__m512 sum420 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+24*s22+(ptrdiff_t)40));
__m512 sum424 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+24*s22+(ptrdiff_t)44));
__m512 sum405 = sum404;
__m512 sum406 = sum404;
__m512 sum407 = sum404;
__m512 sum409 = sum408;
__m512 sum410 = sum408;
__m512 sum411 = sum408;
__m512 sum413 = sum412;
__m512 sum414 = sum412;
__m512 sum415 = sum412;
__m512 sum417 = sum416;
__m512 sum418 = sum416;
__m512 sum419 = sum416;
__m512 sum421 = sum420;
__m512 sum422 = sum420;
__m512 sum423 = sum420;
__m512 sum425 = sum424;
__m512 sum426 = sum424;
__m512 sum427 = sum424;
// Reduction loop over s22 (748 steps) for the current k23. Each step
// multiplies six broadcast weights by four data vectors and accumulates into
// the 24 running sums sum404..sum427.
// Those sums were seeded just above with s22 == -1, i.e. from the six floats
// located immediately before the first per-step weight group
// (presumably bias terms — confirm against the weight-packing code).
// Per step the data offset advances by 256 and the weight offset by 24.
for (s22 = 0; s22 < 748; ++s22) {
// Four unaligned 512-bit loads of the data for this step.
__m512 dat228 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s22+(ptrdiff_t)0);
__m512 dat229 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s22+(ptrdiff_t)64);
__m512 dat230 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s22+(ptrdiff_t)128);
__m512 dat231 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s22+(ptrdiff_t)192);
// Weight 1 of 6: broadcast one float to all lanes, then update sum404..sum407.
__m512 wt297 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+24*s22+(ptrdiff_t)24));
sum404 = _mm512_fmadd_ps(wt297, dat228, sum404);
sum405 = _mm512_fmadd_ps(wt297, dat229, sum405);
sum406 = _mm512_fmadd_ps(wt297, dat230, sum406);
sum407 = _mm512_fmadd_ps(wt297, dat231, sum407);
// Weight 2 of 6 -> sum408..sum411.
__m512 wt298 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+24*s22+(ptrdiff_t)28));
sum408 = _mm512_fmadd_ps(wt298, dat228, sum408);
sum409 = _mm512_fmadd_ps(wt298, dat229, sum409);
sum410 = _mm512_fmadd_ps(wt298, dat230, sum410);
sum411 = _mm512_fmadd_ps(wt298, dat231, sum411);
// Weight 3 of 6 -> sum412..sum415.
__m512 wt299 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+24*s22+(ptrdiff_t)32));
sum412 = _mm512_fmadd_ps(wt299, dat228, sum412);
sum413 = _mm512_fmadd_ps(wt299, dat229, sum413);
sum414 = _mm512_fmadd_ps(wt299, dat230, sum414);
sum415 = _mm512_fmadd_ps(wt299, dat231, sum415);
// Weight 4 of 6 -> sum416..sum419.
__m512 wt300 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+24*s22+(ptrdiff_t)36));
sum416 = _mm512_fmadd_ps(wt300, dat228, sum416);
sum417 = _mm512_fmadd_ps(wt300, dat229, sum417);
sum418 = _mm512_fmadd_ps(wt300, dat230, sum418);
sum419 = _mm512_fmadd_ps(wt300, dat231, sum419);
// Weight 5 of 6 -> sum420..sum423.
__m512 wt301 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+24*s22+(ptrdiff_t)40));
sum420 = _mm512_fmadd_ps(wt301, dat228, sum420);
sum421 = _mm512_fmadd_ps(wt301, dat229, sum421);
sum422 = _mm512_fmadd_ps(wt301, dat230, sum422);
sum423 = _mm512_fmadd_ps(wt301, dat231, sum423);
// Weight 6 of 6 -> sum424..sum427.
__m512 wt302 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+24*s22+(ptrdiff_t)44));
sum424 = _mm512_fmadd_ps(wt302, dat228, sum424);
sum425 = _mm512_fmadd_ps(wt302, dat229, sum425);
sum426 = _mm512_fmadd_ps(wt302, dat230, sum426);
sum427 = _mm512_fmadd_ps(wt302, dat231, sum427);
}
sum404 = _mm512_add_ps(sum404, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)464));
sum405 = _mm512_add_ps(sum405, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)528));
sum406 = _mm512_add_ps(sum406, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)592));
__m512 dat232 = sum407;
dat232 = _mm512_add_ps(dat232, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)656));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)464, 65535, sum404);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)528, 65535, sum405);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)592, 65535, sum406);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)656, 15, dat232);
sum408 = _mm512_add_ps(sum408, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)41120));
sum409 = _mm512_add_ps(sum409, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)41184));
sum410 = _mm512_add_ps(sum410, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)41248));
__m512 dat233 = sum411;
dat233 = _mm512_add_ps(dat233, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)41312));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)41120, 65535, sum408);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)41184, 65535, sum409);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)41248, 65535, sum410);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)41312, 15, dat233);
sum412 = _mm512_add_ps(sum412, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)81776));
sum413 = _mm512_add_ps(sum413, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)81840));
sum414 = _mm512_add_ps(sum414, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)81904));
__m512 dat234 = sum415;
dat234 = _mm512_add_ps(dat234, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)81968));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)81776, 65535, sum412);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)81840, 65535, sum413);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)81904, 65535, sum414);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)81968, 15, dat234);
sum416 = _mm512_add_ps(sum416, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)122432));
sum417 = _mm512_add_ps(sum417, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)122496));
sum418 = _mm512_add_ps(sum418, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)122560));
__m512 dat235 = sum419;
dat235 = _mm512_add_ps(dat235, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)122624));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)122432, 65535, sum416);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)122496, 65535, sum417);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)122560, 65535, sum418);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)122624, 15, dat235);
sum420 = _mm512_add_ps(sum420, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)163088));
sum421 = _mm512_add_ps(sum421, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)163152));
sum422 = _mm512_add_ps(sum422, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)163216));
__m512 dat236 = sum423;
dat236 = _mm512_add_ps(dat236, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)163280));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)163088, 65535, sum420);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)163152, 65535, sum421);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)163216, 65535, sum422);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)163280, 15, dat236);
sum424 = _mm512_add_ps(sum424, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)203744));
sum425 = _mm512_add_ps(sum425, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)203808));
sum426 = _mm512_add_ps(sum426, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)203872));
__m512 dat237 = sum427;
dat237 = _mm512_add_ps(dat237, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)203936));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)203744, 65535, sum424);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)203808, 65535, sum425);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)203872, 65535, sum426);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)203936, 15, dat237);
if (k23 >= kk19) return;
}
ptrdiff_t s23 = -1;
__m512 sum428 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+20*s23+(ptrdiff_t)20));
__m512 sum432 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+20*s23+(ptrdiff_t)24));
__m512 sum436 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+20*s23+(ptrdiff_t)28));
__m512 sum440 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+20*s23+(ptrdiff_t)32));
__m512 sum444 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+20*s23+(ptrdiff_t)36));
__m512 sum429 = sum428;
__m512 sum430 = sum428;
__m512 sum431 = sum428;
__m512 sum433 = sum432;
__m512 sum434 = sum432;
__m512 sum435 = sum432;
__m512 sum437 = sum436;
__m512 sum438 = sum436;
__m512 sum439 = sum436;
__m512 sum441 = sum440;
__m512 sum442 = sum440;
__m512 sum443 = sum440;
__m512 sum445 = sum444;
__m512 sum446 = sum444;
__m512 sum447 = sum444;
// Reduction loop over s23 (748 steps) for the five-weight group that follows
// the six-weight k23 loop. Each step multiplies five broadcast weights by four
// data vectors and accumulates into the 20 running sums sum428..sum447.
// Those sums were seeded just above with s23 == -1, i.e. from the five floats
// located immediately before the first per-step weight group
// (presumably bias terms — confirm against the weight-packing code).
// Per step the data offset advances by 256 and the weight offset by 20.
for (s23 = 0; s23 < 748; ++s23) {
// Four unaligned 512-bit loads of the data for this step.
__m512 dat238 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s23+(ptrdiff_t)0);
__m512 dat239 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s23+(ptrdiff_t)64);
__m512 dat240 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s23+(ptrdiff_t)128);
__m512 dat241 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+256*s23+(ptrdiff_t)192);
// Weight 1 of 5: broadcast one float to all lanes, then update sum428..sum431.
__m512 wt303 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+20*s23+(ptrdiff_t)20));
sum428 = _mm512_fmadd_ps(wt303, dat238, sum428);
sum429 = _mm512_fmadd_ps(wt303, dat239, sum429);
sum430 = _mm512_fmadd_ps(wt303, dat240, sum430);
sum431 = _mm512_fmadd_ps(wt303, dat241, sum431);
// Weight 2 of 5 -> sum432..sum435.
__m512 wt304 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+20*s23+(ptrdiff_t)24));
sum432 = _mm512_fmadd_ps(wt304, dat238, sum432);
sum433 = _mm512_fmadd_ps(wt304, dat239, sum433);
sum434 = _mm512_fmadd_ps(wt304, dat240, sum434);
sum435 = _mm512_fmadd_ps(wt304, dat241, sum435);
// Weight 3 of 5 -> sum436..sum439.
__m512 wt305 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+20*s23+(ptrdiff_t)28));
sum436 = _mm512_fmadd_ps(wt305, dat238, sum436);
sum437 = _mm512_fmadd_ps(wt305, dat239, sum437);
sum438 = _mm512_fmadd_ps(wt305, dat240, sum438);
sum439 = _mm512_fmadd_ps(wt305, dat241, sum439);
// Weight 4 of 5 -> sum440..sum443.
__m512 wt306 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+20*s23+(ptrdiff_t)32));
sum440 = _mm512_fmadd_ps(wt306, dat238, sum440);
sum441 = _mm512_fmadd_ps(wt306, dat239, sum441);
sum442 = _mm512_fmadd_ps(wt306, dat240, sum442);
sum443 = _mm512_fmadd_ps(wt306, dat241, sum443);
// Weight 5 of 5 -> sum444..sum447.
__m512 wt307 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k23+20*s23+(ptrdiff_t)36));
sum444 = _mm512_fmadd_ps(wt307, dat238, sum444);
sum445 = _mm512_fmadd_ps(wt307, dat239, sum445);
sum446 = _mm512_fmadd_ps(wt307, dat240, sum446);
sum447 = _mm512_fmadd_ps(wt307, dat241, sum447);
}
sum428 = _mm512_add_ps(sum428, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)464));
sum429 = _mm512_add_ps(sum429, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)528));
sum430 = _mm512_add_ps(sum430, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)592));
__m512 dat242 = sum431;
dat242 = _mm512_add_ps(dat242, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)656));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)464, 65535, sum428);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)528, 65535, sum429);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)592, 65535, sum430);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)656, 15, dat242);
sum432 = _mm512_add_ps(sum432, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)41120));
sum433 = _mm512_add_ps(sum433, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)41184));
sum434 = _mm512_add_ps(sum434, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)41248));
__m512 dat243 = sum435;
dat243 = _mm512_add_ps(dat243, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)41312));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)41120, 65535, sum432);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)41184, 65535, sum433);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)41248, 65535, sum434);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)41312, 15, dat243);
sum436 = _mm512_add_ps(sum436, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)81776));
sum437 = _mm512_add_ps(sum437, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)81840));
sum438 = _mm512_add_ps(sum438, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)81904));
__m512 dat244 = sum439;
dat244 = _mm512_add_ps(dat244, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)81968));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)81776, 65535, sum436);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)81840, 65535, sum437);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)81904, 65535, sum438);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)81968, 15, dat244);
sum440 = _mm512_add_ps(sum440, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)122432));
sum441 = _mm512_add_ps(sum441, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)122496));
sum442 = _mm512_add_ps(sum442, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)122560));
__m512 dat245 = sum443;
dat245 = _mm512_add_ps(dat245, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)122624));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)122432, 65535, sum440);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)122496, 65535, sum441);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)122560, 65535, sum442);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)122624, 15, dat245);
sum444 = _mm512_add_ps(sum444, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)163088));
sum445 = _mm512_add_ps(sum445, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)163152));
sum446 = _mm512_add_ps(sum446, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)163216));
__m512 dat246 = sum447;
dat246 = _mm512_add_ps(dat246, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)163280));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)163088, 65535, sum444);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)163152, 65535, sum445);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)163216, 65535, sum446);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h7+243936*k23+(ptrdiff_t)163280, 15, dat246);
if (j7 >= jj7) return;
++j7;
h7 += 2;
goto wrap7;
}
}
j7 = 181;
}
ptrdiff_t h8 = 120;
switch (j7) {
default: {
j7 = 181;
ptrdiff_t k24 = 1*w3;
ptrdiff_t kk20 = k24+0;
for (; k24 != 412; ++k24) {
ptrdiff_t s24 = -1;
__m512 sum448 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+24*s24+(ptrdiff_t)24));
__m512 sum450 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+24*s24+(ptrdiff_t)28));
__m512 sum452 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+24*s24+(ptrdiff_t)32));
__m512 sum454 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+24*s24+(ptrdiff_t)36));
__m512 sum456 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+24*s24+(ptrdiff_t)40));
__m512 sum458 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+24*s24+(ptrdiff_t)44));
__m512 sum449 = sum448;
__m512 sum451 = sum450;
__m512 sum453 = sum452;
__m512 sum455 = sum454;
__m512 sum457 = sum456;
__m512 sum459 = sum458;
// Reduction loop over s24 (748 steps) for the current k24. Unlike the loops
// earlier in this function, only two data vectors are loaded per step and the
// data offset advances by 128 (not 256); the weight offset still advances by 24.
// Each step multiplies six broadcast weights by the two data vectors and
// accumulates into the 12 running sums sum448..sum459.
// Those sums were seeded just above with s24 == -1, i.e. from the six floats
// located immediately before the first per-step weight group
// (presumably bias terms — confirm against the weight-packing code).
for (s24 = 0; s24 < 748; ++s24) {
// Two unaligned 512-bit loads of the data for this step.
__m512 dat247 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+128*s24+(ptrdiff_t)0);
__m512 dat248 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+128*s24+(ptrdiff_t)64);
// Weight 1 of 6: broadcast one float to all lanes, then update sum448/sum449.
__m512 wt308 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+24*s24+(ptrdiff_t)24));
sum448 = _mm512_fmadd_ps(wt308, dat247, sum448);
sum449 = _mm512_fmadd_ps(wt308, dat248, sum449);
// Weight 2 of 6 -> sum450/sum451.
__m512 wt309 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+24*s24+(ptrdiff_t)28));
sum450 = _mm512_fmadd_ps(wt309, dat247, sum450);
sum451 = _mm512_fmadd_ps(wt309, dat248, sum451);
// Weight 3 of 6 -> sum452/sum453.
__m512 wt310 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+24*s24+(ptrdiff_t)32));
sum452 = _mm512_fmadd_ps(wt310, dat247, sum452);
sum453 = _mm512_fmadd_ps(wt310, dat248, sum453);
// Weight 4 of 6 -> sum454/sum455.
__m512 wt311 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+24*s24+(ptrdiff_t)36));
sum454 = _mm512_fmadd_ps(wt311, dat247, sum454);
sum455 = _mm512_fmadd_ps(wt311, dat248, sum455);
// Weight 5 of 6 -> sum456/sum457.
__m512 wt312 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+24*s24+(ptrdiff_t)40));
sum456 = _mm512_fmadd_ps(wt312, dat247, sum456);
sum457 = _mm512_fmadd_ps(wt312, dat248, sum457);
// Weight 6 of 6 -> sum458/sum459.
__m512 wt313 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+24*s24+(ptrdiff_t)44));
sum458 = _mm512_fmadd_ps(wt313, dat247, sum458);
sum459 = _mm512_fmadd_ps(wt313, dat248, sum459);
}
sum448 = _mm512_add_ps(sum448, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)256));
__m512 dat249 = sum449;
dat249 = _mm512_add_ps(dat249, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)320));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)256, 65535, sum448);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)320, 15, dat249);
sum450 = _mm512_add_ps(sum450, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)40912));
__m512 dat250 = sum451;
dat250 = _mm512_add_ps(dat250, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)40976));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)40912, 65535, sum450);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)40976, 15, dat250);
sum452 = _mm512_add_ps(sum452, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)81568));
__m512 dat251 = sum453;
dat251 = _mm512_add_ps(dat251, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)81632));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)81568, 65535, sum452);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)81632, 15, dat251);
sum454 = _mm512_add_ps(sum454, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)122224));
__m512 dat252 = sum455;
dat252 = _mm512_add_ps(dat252, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)122288));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)122224, 65535, sum454);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)122288, 15, dat252);
sum456 = _mm512_add_ps(sum456, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)162880));
__m512 dat253 = sum457;
dat253 = _mm512_add_ps(dat253, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)162944));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)162880, 65535, sum456);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)162944, 15, dat253);
sum458 = _mm512_add_ps(sum458, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)203536));
__m512 dat254 = sum459;
dat254 = _mm512_add_ps(dat254, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)203600));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)203536, 65535, sum458);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)203600, 15, dat254);
if (k24 >= kk20) return;
}
ptrdiff_t s25 = -1;
__m512 sum460 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+20*s25+(ptrdiff_t)20));
__m512 sum462 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+20*s25+(ptrdiff_t)24));
__m512 sum464 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+20*s25+(ptrdiff_t)28));
__m512 sum466 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+20*s25+(ptrdiff_t)32));
__m512 sum468 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+20*s25+(ptrdiff_t)36));
__m512 sum461 = sum460;
__m512 sum463 = sum462;
__m512 sum465 = sum464;
__m512 sum467 = sum466;
__m512 sum469 = sum468;
for (s25 = 0; s25 < 748; ++s25) {
__m512 dat255 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+128*s25+(ptrdiff_t)0);
__m512 dat256 = _mm512_loadu_ps(arrangedDats3+34755072*i11+191488*j7+128*s25+(ptrdiff_t)64);
__m512 wt314 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+20*s25+(ptrdiff_t)20));
sum460 = _mm512_fmadd_ps(wt314, dat255, sum460);
sum461 = _mm512_fmadd_ps(wt314, dat256, sum461);
__m512 wt315 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+20*s25+(ptrdiff_t)24));
sum462 = _mm512_fmadd_ps(wt315, dat255, sum462);
sum463 = _mm512_fmadd_ps(wt315, dat256, sum463);
__m512 wt316 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+20*s25+(ptrdiff_t)28));
sum464 = _mm512_fmadd_ps(wt316, dat255, sum464);
sum465 = _mm512_fmadd_ps(wt316, dat256, sum465);
__m512 wt317 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+20*s25+(ptrdiff_t)32));
sum466 = _mm512_fmadd_ps(wt317, dat255, sum466);
sum467 = _mm512_fmadd_ps(wt317, dat256, sum467);
__m512 wt318 = _mm512_set1_ps(*(float*)(arrangedWts3+7421092*i11+17976*k24+20*s25+(ptrdiff_t)36));
sum468 = _mm512_fmadd_ps(wt318, dat255, sum468);
sum469 = _mm512_fmadd_ps(wt318, dat256, sum469);
}
sum460 = _mm512_add_ps(sum460, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)256));
__m512 dat257 = sum461;
dat257 = _mm512_add_ps(dat257, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)320));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)256, 65535, sum460);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)320, 15, dat257);
sum462 = _mm512_add_ps(sum462, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)40912));
__m512 dat258 = sum463;
dat258 = _mm512_add_ps(dat258, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)40976));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)40912, 65535, sum462);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)40976, 15, dat258);
sum464 = _mm512_add_ps(sum464, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)81568));
__m512 dat259 = sum465;
dat259 = _mm512_add_ps(dat259, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)81632));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)81568, 65535, sum464);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)81632, 15, dat259);
sum466 = _mm512_add_ps(sum466, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)122224));
__m512 dat260 = sum467;
dat260 = _mm512_add_ps(dat260, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)122288));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)122224, 65535, sum466);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)122288, 15, dat260);
sum468 = _mm512_add_ps(sum468, _mm512_maskz_loadu_ps(65535, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)162880));
__m512 dat261 = sum469;
dat261 = _mm512_add_ps(dat261, _mm512_maskz_loadu_ps(15, datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)162944));
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)162880, 65535, sum468);
_mm512_mask_storeu_ps(datPtr5+100704912*i11+336*h8+243936*k24+(ptrdiff_t)162944, 15, dat261);
if (j7 >= jj7) return;
}
}
j7 = 182;
}
}

// Drives the apply stage as five consecutive threaded passes over the same
// 3-D hull (413 x 182 x 8):
//   pass 0      -> Example15OneApply1Callee1, second context slot = 0
//   passes 1..3 -> Example15OneApply1Callee2, second context slot = pass number
//   pass 4      -> Example15OneApply1Callee3, second context slot = 4
// Every pass hands the callee a two-slot context: slot 0 is tensors5, slot 1
// is a small integer carried inside a pointer value.
// NOTE(review): slot 1 is presumably a pass/epoch index decoded by the
// callees -- confirm against the callee bodies.
static void Example15OneApply1(Example15ThreaderTeam1* team16, char** tensors5) {
    void* ctx[2];
    ctx[0] = tensors5;
    ctx[1] = 0;

    // Opening pass.
    {
        Example15ThreaderTask1 head;
        head.nd1 = 3;
        head.hull1[0] = 413;
        head.hull1[1] = 182;
        head.hull1[2] = 8;
        head.any1 = ctx;
        head.callee1 = Example15OneApply1Callee1;
        Example15ThreaderDo1(team16, &head);
    }

    // Middle passes; the pass number is published through ctx[1] before each run.
    for (ptrdiff_t pass = 1; pass <= 3; pass++) {
        ctx[1] = (void*)pass;
        Example15ThreaderTask1 mid;
        mid.nd1 = 3;
        mid.hull1[0] = 413;
        mid.hull1[1] = 182;
        mid.hull1[2] = 8;
        mid.any1 = ctx;
        mid.callee1 = Example15OneApply1Callee2;
        Example15ThreaderDo1(team16, &mid);
    }

    // Closing pass.
    ctx[1] = (void*)4;
    {
        Example15ThreaderTask1 tail;
        tail.nd1 = 3;
        tail.hull1[0] = 413;
        tail.hull1[1] = 182;
        tail.hull1[2] = 8;
        tail.any1 = ctx;
        tail.callee1 = Example15OneApply1Callee3;
        Example15ThreaderDo1(team16, &tail);
    }
}

// State held by a net object. Both fields are filled in by
// Example15NetCreate.
struct Example15Net {
// Raw pointer returned by malloc; this is the pointer that
// Example15NetDestroy passes to free.
char* alloc1;
// alloc1 rounded up to the next 64-byte boundary. Example15NetCreate hands
// it to Example15OneArrangeWts1 and Example15EngineInference hands it to
// Example15OneApply1 (presumably the arranged weights -- confirm).
char* align1;
};

// Releases a net: first the heap buffer it owns (alloc1), then the net
// object itself. net2 is dereferenced unconditionally, so it must be a
// valid pointer.
void Example15NetDestroy(Example15Net* net2) {
    char* owned = net2->alloc1;
    free(owned);
    free(net2);
}

// Builds a net object from the caller's parameters.
//
// Steps: verify AVX512F support, allocate the net's buffer, spin up a
// temporary threader team, run Example15OneArrangeWts1 over
// {outWeights, outBiases, aligned buffer}, tear the team down again, and
// finally allocate the Example15Net that takes ownership of the buffer.
//
// net1     - receives the new net on success; left untouched on failure.
// params1  - source of the outWeights and outBiases pointers.
// threads1 - forwarded unchanged to Example15ThreaderCreate1.
//
// Returns 0 on success, otherwise an error message produced by
// Example15Errmsg1 or passed through from Example15ThreaderCreate1. On
// every failure path the buffer allocated here is freed before returning.
char* Example15NetCreate(
    Example15Net** net1,
    Example15Params* params1,
    ptrdiff_t threads1
) {
    if (__builtin_expect(!__builtin_cpu_supports("avx512f"), 0)) {
        return Example15Errmsg1(__LINE__, "CPU does not support AVX512F");
    }

    char* rawBuf = malloc(324427615);
    if (__builtin_expect(rawBuf == 0, 0)) {
        return Example15Errmsg1(__LINE__, "errno %d", errno);
    }
    // Round the raw pointer up to the next multiple of 64.
    char* alignedBuf = (void*)(((size_t)rawBuf+63)&-64);

    Example15ThreaderTeam1* workers = 0;
    char* teamErr = Example15ThreaderCreate1(&workers, threads1);
    if (__builtin_expect(teamErr != 0, 0)) {
        free(rawBuf);
        return teamErr;
    }

    char* wtTensors[3];
    wtTensors[0] = (char*)params1->outWeights;
    wtTensors[1] = (char*)params1->outBiases;
    wtTensors[2] = alignedBuf+0;
    Example15OneArrangeWts1(workers, wtTensors);

    // The team is only needed for the arrangement step.
    Example15ThreaderDestroy1(workers);

    Example15Net* fresh = malloc(sizeof(Example15Net));
    if (__builtin_expect(fresh == 0, 0)) {
        // Capture the message (and errno) before free gets a chance to run.
        char* oomMsg = Example15Errmsg1(__LINE__, "errno %d", errno);
        free(rawBuf);
        return oomMsg;
    }
    fresh->align1 = alignedBuf;
    fresh->alloc1 = rawBuf;
    *net1 = fresh;
    return 0;
}

// State held by an engine object. All fields are filled in by
// Example15EngineCreate.
struct Example15Engine {
// Net supplied to Example15EngineCreate. Example15EngineDestroy does not
// free it.
Example15Net* net3;
// Threader team created by Example15EngineCreate and destroyed by
// Example15EngineDestroy.
Example15ThreaderTeam1* team11;
// Raw pointer returned by malloc; this is the pointer that
// Example15EngineDestroy passes to free.
char* alloc2;
// alloc2 rounded up to the next 64-byte boundary. Example15EngineInference
// passes it to both Example15OneArrangeDats1 and Example15OneApply1
// (presumably scratch space for the arranged input -- confirm).
char* align2;
};

// Thin forwarder: calls Example15ThreaderPthreadT1 with the engine's
// threader team, passing to1 and idx2 through unchanged, and returns
// whatever that call returns.
// NOTE(review): presumably stores the pthread_t of worker idx2 into *to1
// and returns 0 or an error message -- confirm against the threader.
char* Example15EnginePthreadT(
    Example15Engine* eng2,
    ptrdiff_t idx2,
    pthread_t* to1
) {
    Example15ThreaderTeam1* team = eng2->team11;
    return Example15ThreaderPthreadT1(to1, team, idx2);
}

// Tears an engine down: destroys its threader team, frees the engine's
// heap buffer, then frees the engine object. The net referenced by the
// engine is not released here. eng3 is dereferenced unconditionally, so it
// must be a valid pointer.
void Example15EngineDestroy(Example15Engine* eng3) {
    Example15ThreaderDestroy1(eng3->team11);
    char* owned = eng3->alloc2;
    free(owned);
    free(eng3);
}

// Builds an engine bound to an existing net.
//
// Allocates the engine object and its buffer, records a 64-byte-aligned
// pointer into that buffer, and creates the engine's threader team.
//
// eng4     - receives the new engine on success; left untouched on failure.
// net4     - stored in the engine as-is; no copy is made.
// threads2 - forwarded unchanged to Example15ThreaderCreate1.
//
// Returns 0 on success, otherwise an error message produced by
// Example15Errmsg1 or passed through from Example15ThreaderCreate1. Both
// allocations made here are freed on every failure path.
char* Example15EngineCreate(
    Example15Engine** eng4,
    Example15Net* net4,
    ptrdiff_t threads2
) {
    Example15Engine* fresh = malloc(sizeof(Example15Engine));
    if (__builtin_expect(fresh == 0, 0)) {
        return Example15Errmsg1(__LINE__, "errno %d", errno);
    }

    char* rawBuf = malloc(1519558719);
    if (__builtin_expect(rawBuf == 0, 0)) {
        // Capture the message (and errno) before free gets a chance to run.
        char* oomMsg = Example15Errmsg1(__LINE__, "errno %d", errno);
        free(fresh);
        return oomMsg;
    }
    fresh->alloc2 = rawBuf;
    // Round the raw pointer up to the next multiple of 64.
    fresh->align2 = (void*)(((size_t)rawBuf+63)&-64);

    char* teamErr = Example15ThreaderCreate1(&fresh->team11, threads2);
    if (__builtin_expect(teamErr != 0, 0)) {
        free(fresh);
        free(rawBuf);
        return teamErr;
    }

    fresh->net3 = net4;
    *eng4 = fresh;
    return 0;
}

// Runs one inference on the engine's threader team in two stages:
//   1. Example15OneArrangeDats1 over {inData, engine aligned buffer}
//   2. Example15OneApply1 over {net aligned buffer, engine aligned buffer,
//      outData}
// NOTE(review): stage 1 presumably repacks inData into the engine buffer
// and stage 2 writes the result to outData -- confirm against those
// functions.
void Example15EngineInference(
    Example15Engine* eng1,
    float* inData,
    float* outData
) {
    Example15ThreaderTeam1* workers = eng1->team11;
    char* engineBuf = eng1->align2;
    char* netBuf = eng1->net3->align1;

    char* arrangeArgs[2];
    arrangeArgs[0] = (char*)inData;
    arrangeArgs[1] = engineBuf+0;
    Example15OneArrangeDats1(workers, arrangeArgs);

    char* applyArgs[3];
    applyArgs[0] = netBuf+0;
    applyArgs[1] = engineBuf+0;
    applyArgs[2] = (char*)outData;
    Example15OneApply1(workers, applyArgs);
}

// End of file.

Top