NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=Example17 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=in Channels=3493 Height=126 Width=86
Conv FromTensor=in ToTensor=out ToChannels=2696 FilterH=1 FilterW=1 StrideH=4 StrideW=4 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
Output FromTensor=out

Top || Output Example17.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(Example17Params);
// Example17Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "rb");
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

// Forward declaration of the trained-parameter struct; the full definition
// (one float array per parameter tensor) appears later in this header.
typedef struct Example17Params Example17Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// Example17Params* params = malloc(sizeof(Example17Params));
//
// ... Load params (read from a file, perhaps) ...
//
// Example17Net* net; // For example, 4 threads:
// char* err = Example17NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// Example17NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

// Opaque handle to the inference-ready form of the trained parameters.
typedef struct Example17Net Example17Net;

// Builds a Net from a Params struct.
//
// First argument:  receives the new Net on success; left unmodified on failure.
// Second argument: trained parameters; read only, may be freed after the call.
// threads:         number of temporary threads used while building the Net.
//
// Returns null on success. On failure returns a heap-allocated message
// (without a trailing newline) that the caller must free().
char* Example17NetCreate(
Example17Net**,
Example17Params*,
ptrdiff_t threads
);

// Frees a Net. Call only after all inference that depends on it is complete.
void Example17NetDestroy(Example17Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// Example17Net* net;
//
// ... Create net ...
//
// Example17Engine* engine; // For example, 4 inference threads:
// char* err = Example17EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// Example17EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = Example17EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* inData = malloc(sizeof(float)*3493*126*86);
// float* outData = malloc(sizeof(float)*2696*32*22);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// Example17EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// inData, // The tensor arguments are sorted by name.
// outData
// );
//
// ... Read the output floats ...
//
// }
//
// free(inData);
// free(outData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

// Opaque handle to an inference engine (threads, scratch memory, Net pointer).
typedef struct Example17Engine Example17Engine;

// Creates an Engine that performs inference against the given Net.
//
// First argument:  receives the new Engine on success; left unmodified on failure.
// Second argument: the Net to use; it must outlive the Engine.
// threads:         number of inference threads to start.
//
// Returns null on success. On failure returns a heap-allocated message
// that the caller must free().
char* Example17EngineCreate(
Example17Engine**,
Example17Net*,
ptrdiff_t threads
);

// Retrieves the pthread_t of one inference thread.
//
// threadIdx: index of the thread, from 0 to N-1 for an Engine with N threads.
// to:        receives the thread identifier.
//
// Returns null on success, or a heap-allocated message (caller must free())
// if the thread index is invalid.
char* Example17EnginePthreadT(
Example17Engine*,
ptrdiff_t threadIdx,
pthread_t* to
);

// Runs one inference pass. This function cannot fail.
//
// inData:  caller-owned input tensor, packed CHW floats (3493*126*86 values).
// outData: caller-owned output tensor, packed CHW floats (2696*32*22 values).
void Example17EngineInference(
Example17Engine*,
float* inData,
float* outData
);

// Terminates the Engine's threads and frees its memory. The Net is not
// destroyed; destroy it separately afterwards.
void Example17EngineDestroy(Example17Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

// Trained parameters for the single convolution in this network. The struct
// is packed so that it can be filled by reading the raw tensors back to back.
struct Example17Params {
// One bias per output channel.
float outBiases[2696]; // 1x2696x1x1
// One weight per (output channel, input channel) pair: 2696*3493 values.
float outWeights[9417128]; // 2696x3493x1x1
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output Example17.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f Example17.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "Example17.h"

// Builds a heap-allocated error message of the form
// "Example17: line <lineNum1>: <formatted text>".
//
// lineNum1: source line to report (callers pass __LINE__).
// format1:  printf-style format string consumed together with the
//           variadic arguments that follow it.
//
// Returns a NUL-terminated string of at most 276 characters that the caller
// must free(). The result is never null: every caller treats a null char* as
// "no error", so if the buffer cannot be allocated the process is aborted
// rather than writing through a null pointer or reporting a false success.
static char* Example17Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
    enum { cap1 = 277 };
    char* msg1 = malloc(cap1);
    if (!msg1) {
        abort();
    }
    // Bounded write of the prefix; the original unbounded sprintf relied on
    // the prefix always fitting.
    int step1 = snprintf(msg1, cap1, "Example17: line %td: ", lineNum1);
    if (step1 < 0) {
        // Output error: fall back to an empty prefix so the detail text
        // is still reported.
        step1 = 0;
        msg1[0] = '\0';
    } else if (step1 >= cap1) {
        // Prefix was truncated; leave room for the terminator only.
        step1 = cap1-1;
    }
    va_list ap1;
    va_start(ap1, format1);
    vsnprintf(msg1+step1, (size_t)(cap1-step1), format1, ap1);
    va_end(ap1);
    return msg1;
}

// Forward declarations for the thread-team ("threader") machinery.
typedef struct Example17ThreaderTask1 Example17ThreaderTask1;
// Signature of a work function: receives the task and the coordinates of
// the point (up to 4 dimensions) that it should process.
typedef void (*Example17ThreaderCallee1)(Example17ThreaderTask1*, int64_t*);
typedef struct Example17ThreaderHub1 Example17ThreaderHub1;
typedef struct Example17ThreaderNode1 Example17ThreaderNode1;
typedef struct Example17ThreaderUnwind1 Example17ThreaderUnwind1;
typedef struct Example17ThreaderTeam1 Example17ThreaderTeam1;

// Describes one parallel job: a callee that is run once for every point of
// an nd1-dimensional index space whose extents are given by hull1.
struct Example17ThreaderTask1 {
// Function invoked for each point with this task and the point's coordinates.
Example17ThreaderCallee1 callee1;
// User data for the callee; the threader functions in this file section
// only carry it along and never read through it.
void* any1;
// Number of leading entries of hull1 that are in use (the array holds 4).
ptrdiff_t nd1;
// Extent of each dimension; dimension 0 is the one that advances first.
int64_t hull1[4];
};

// State shared by every thread of a team while a task is in flight.
struct Example17ThreaderHub1 {
// Held whenever the fields below are read or written.
pthread_mutex_t mut1;
// Signaled by the worker that drops pending1 to zero; the dispatching
// thread waits on it.
pthread_cond_t cond1;
// Number of workers that have not yet finished the current task.
ptrdiff_t pending1;
// Index of the status1 word at which the next scan for unfinished nodes starts.
ptrdiff_t offset1;
// Mask applied to status1[offset1] by the next scan; bits of nodes that
// have already been handed to a helper are cleared in it.
long mask1;
// Bitmap with one bit per node plus a trailing sentinel bit at index nt.
// A set bit means the node's share of points may not be exhausted yet.
long status1[];
};

// Per-thread state: the slice of the index space assigned to one worker.
// NOTE(review): the 64-byte alignment presumably keeps each node on its own
// cache line(s) — confirm.
struct Example17ThreaderNode1 {
// Held while np1, pt1, and task1 are read or written.
pthread_mutex_t mut2;
// Number of points still to be processed from this node's slice; a
// negative value tells the worker thread to exit.
int64_t np1;
// Coordinates of the next unprocessed point of this node's slice.
int64_t pt1[4];
// Task to run, or null while the worker is idle.
Example17ThreaderTask1* task1;
// Signaled after task1 has been set so that the worker wakes up.
pthread_cond_t cond2;
// Team that owns this node.
Example17ThreaderTeam1* team1;
// Handle of the worker thread running on this node.
pthread_t thr1;
} __attribute__((aligned(64)));

// Records how far team construction got, so that teardown releases exactly
// the resources that were actually acquired.
struct Example17ThreaderUnwind1 {
// Number of worker threads that were started and must be stopped and joined.
ptrdiff_t join1;
// Number of nodes whose condition variable was initialized.
ptrdiff_t nodeConds1;
// Number of nodes whose mutex was initialized.
ptrdiff_t nodeMuts1;
// Nonzero once the hub's condition variable has been initialized.
ptrdiff_t hubCond1;
// Nonzero once the hub's mutex has been initialized.
ptrdiff_t hubMut1;
// Unaligned malloc result backing the node array (passed to free).
void* nodes1;
// Unaligned malloc result backing the hub (passed to free).
void* hub1;
};

// A team of worker threads together with their shared hub.
struct Example17ThreaderTeam1 {
// Number of worker threads (and nodes) in the team.
ptrdiff_t nt1;
// 64-byte aligned pointer to the shared hub.
Example17ThreaderHub1* hub2;
// 64-byte aligned pointer to the array of nt1 nodes.
Example17ThreaderNode1* nodes2;
// Bookkeeping used by the destroy function.
Example17ThreaderUnwind1 unwind1;
};

// Advances a multi-dimensional point by one step, odometer style.
//
// nd2:   number of dimensions in use.
// hull2: extent of each dimension.
// pt2:   point to advance in place; dimension 0 moves first and a
//        dimension that reaches its extent wraps to zero and carries
//        into the next one. A carry out of the last dimension is dropped.
static void Example17ThreaderInc1(
    ptrdiff_t nd2,
    int64_t*restrict hull2,
    int64_t*restrict pt2
) {
    ptrdiff_t dim = 0;
    while (dim < nd2) {
        const int64_t bumped = pt2[dim]+1;
        if (bumped != hull2[dim]) {
            // No wrap in this dimension: store and stop carrying.
            pt2[dim] = bumped;
            return;
        }
        pt2[dim] = 0;
        ++dim;
    }
}

// Converts a linear offset into multi-dimensional coordinates.
//
// nd3:   number of dimensions in use.
// hull3: extent of each dimension (dimension 0 is least significant).
// pt3:   receives the coordinates; exactly nd3 entries are written.
// val1:  linear offset to decompose. Any part of it that does not fit in
//        nd3 dimensions is discarded.
static void Example17ThreaderPut1(
    ptrdiff_t nd3,
    int64_t*restrict hull3,
    int64_t*restrict pt3,
    int64_t val1
) {
    for (ptrdiff_t dim = 0; dim < nd3; ++dim) {
        if (!val1) {
            // Nothing left to distribute: the remaining digits are zero
            // and the extents are no longer consulted.
            pt3[dim] = 0;
            continue;
        }
        const int64_t radix = hull3[dim];
        pt3[dim] = val1%radix;
        val1 /= radix;
    }
}

// Adds one multi-dimensional offset to a point, with carry propagation.
//
// nd4:    number of dimensions in use.
// hull4:  extent of each dimension.
// pt4:    point updated in place.
// plus1:  per-dimension amounts to add.
// carry2: carry fed into dimension 0.
//
// In each dimension at most one extent is subtracted from the sum, and
// doing so carries 1 into the next dimension. A carry out of the last
// dimension is dropped.
static void Example17ThreaderAdd1(
    ptrdiff_t nd4,
    int64_t*restrict hull4,
    int64_t*restrict pt4,
    int64_t*restrict plus1,
    int64_t carry2
) {
    for (ptrdiff_t dim = 0; dim < nd4; ++dim) {
        const int64_t limit = hull4[dim];
        const int64_t total = pt4[dim]+plus1[dim]+carry2;
        carry2 = total >= limit;
        pt4[dim] = carry2 ? total-limit : total;
    }
}

// Entry point of every worker thread of a team.
//
// arg1 is the thread's own node. The thread sleeps until a task is posted
// on its node, processes its own share of points, then helps other nodes
// that still have points left, and finally reports completion to the hub.
// It returns 0 once its node is marked for shutdown (np1 < 0).
//
// Every pthread call is wrapped in a loop that retries until the call
// returns zero.
static void* Example17ThreaderMain1(void* arg1) {
Example17ThreaderNode1* node1 = arg1;
Example17ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
Example17ThreaderHub1* hub3 = team2->hub2;
Example17ThreaderNode1* nodes3 = team2->nodes2;
// This thread's index within the team, derived from its node's position.
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
// Invariant: node1->mut2 is held at the top of every iteration.
for (; ; ) {
Example17ThreaderTask1* task2 = node1->task1;
if (!task2) {
// Idle: sleep until a task (or a shutdown request) is posted.
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
// Shutdown request: task2 is only a non-null marker here and is not used.
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
// Mark the node idle again so the same task is not picked up twice.
node1->task1 = 0;
Example17ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Phase 1: drain this node's own points. Each point is claimed under the
// node mutex and processed with the mutex released, so that other threads
// can take points from this node in the meantime.
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
Example17ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
// Phase 2: help other nodes. Clear this node's bit in the hub bitmap
// first, since its own share is exhausted.
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
for (; ; ) {
// Find the lowest set bit at or after the shared scan position.
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
// Nothing left in this word; continue with the whole next word.
++offset2;
mask2 = -1;
continue;
}
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
// Reached the sentinel bit that follows the last node. Restart from word 0
// once; reaching the sentinel a second time means no node is left to help.
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
// Isolate the lowest set bit and publish a scan position that skips it,
// so the next scanning thread starts with a different node.
hand1 &= -hand1;
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
Example17ThreaderNode1* node2 = nodes3+target1;
// Take points from the target node exactly as in phase 1.
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
Example17ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// The target has no points left; clear its bit and resume scanning from
// the position currently published in the hub.
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// Phase 3: report completion; the last thread to finish wakes the
// thread that is waiting on the hub condition variable.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
// Re-establish the loop invariant before waiting for the next task.
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Stops a team's worker threads and releases everything the team owns.
//
// team3 may be null (no-op) or a partially constructed team: only the
// resources recorded in team3->unwind1 are released. Every pthread call is
// retried until it returns zero.
static void Example17ThreaderDestroy1(Example17ThreaderTeam1* team3) {
    if (!team3) return;
    Example17ThreaderNode1* const first = team3->nodes2;
    const Example17ThreaderUnwind1* const undo = &team3->unwind1;

    // Ask every started worker to exit: a negative point count together
    // with a non-null task marker makes the worker return.
    for (ptrdiff_t i = 0; i != undo->join1; ++i) {
        Example17ThreaderNode1* const cur = first+i;
        while (__builtin_expect(pthread_mutex_lock(&cur->mut2), 0)) {}
        cur->np1 = -1;
        cur->task1 = (Example17ThreaderTask1*)1;
        while (__builtin_expect(pthread_mutex_unlock(&cur->mut2), 0)) {}
        while (__builtin_expect(pthread_cond_signal(&cur->cond2), 0)) {}
    }
    // Wait for all of them before touching their synchronization objects.
    for (ptrdiff_t i = 0; i != undo->join1; ++i) {
        while (__builtin_expect(pthread_join(first[i].thr1, 0), 0)) {}
    }

    for (ptrdiff_t i = 0; i != undo->nodeConds1; ++i) {
        while (__builtin_expect(pthread_cond_destroy(&first[i].cond2), 0)) {}
    }
    for (ptrdiff_t i = 0; i != undo->nodeMuts1; ++i) {
        while (__builtin_expect(pthread_mutex_destroy(&first[i].mut2), 0)) {}
    }

    Example17ThreaderHub1* const shared = team3->hub2;
    if (undo->hubCond1) {
        while (__builtin_expect(pthread_cond_destroy(&shared->cond1), 0)) {}
    }
    if (undo->hubMut1) {
        while (__builtin_expect(pthread_mutex_destroy(&shared->mut1), 0)) {}
    }

    // Free the unaligned allocations, then the team itself.
    free(undo->nodes1);
    free(undo->hub1);
    free(team3);
}

// Final construction step: initializes each node's mutex and condition
// variable and starts its worker thread.
//
// team8: team whose node array has already been allocated.
// nt7:   number of nodes/threads to set up.
//
// Returns null on success, or a heap-allocated error message. In both
// cases team8->unwind1 records how many mutexes, condition variables, and
// threads exist so that the destroy function can release exactly those.
static char* Example17ThreaderCreate1Up4(Example17ThreaderTeam1* team8, ptrdiff_t nt7) {
    Example17ThreaderNode1* const first = team8->nodes2;
    Example17ThreaderUnwind1* const undo = &team8->unwind1;
    for (ptrdiff_t ready = 0; ready != nt7; ++ready) {
        Example17ThreaderNode1* const cur = first+ready;

        int rc = pthread_mutex_init(&cur->mut2, 0);
        if (__builtin_expect(rc, 0)) {
            char* why = Example17Errmsg1(__LINE__, "errno %d", rc);
            undo->nodeMuts1 = ready;
            undo->nodeConds1 = ready;
            undo->join1 = ready;
            return why;
        }

        // The node starts idle.
        cur->task1 = 0;
        rc = pthread_cond_init(&cur->cond2, 0);
        if (__builtin_expect(rc, 0)) {
            char* why = Example17Errmsg1(__LINE__, "errno %d", rc);
            // This node's mutex exists, its condition variable does not.
            undo->nodeMuts1 = ready+1;
            undo->nodeConds1 = ready;
            undo->join1 = ready;
            return why;
        }

        cur->team1 = team8;
        rc = pthread_create(&cur->thr1, 0, Example17ThreaderMain1, cur);
        if (__builtin_expect(rc, 0)) {
            char* why = Example17Errmsg1(__LINE__, "errno %d", rc);
            // Mutex and condition variable exist, the thread does not.
            undo->nodeMuts1 = ready+1;
            undo->nodeConds1 = ready+1;
            undo->join1 = ready;
            return why;
        }
    }
    undo->nodeMuts1 = nt7;
    undo->nodeConds1 = nt7;
    undo->join1 = nt7;
    return 0;
}

// Construction step 3: initializes the hub's mutex and condition variable,
// recording each success in team7->unwind1, then continues with the
// per-node setup.
//
// Returns null on success or a heap-allocated error message.
static char* Example17ThreaderCreate1Up3(Example17ThreaderTeam1* team7, ptrdiff_t nt6) {
    Example17ThreaderHub1* const shared = team7->hub2;

    int rc = pthread_mutex_init(&shared->mut1, 0);
    if (__builtin_expect(rc, 0)) {
        return Example17Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubMut1 = 1;

    rc = pthread_cond_init(&shared->cond1, 0);
    if (__builtin_expect(rc, 0)) {
        return Example17Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubCond1 = 1;

    return Example17ThreaderCreate1Up4(team7, nt6);
}

// Construction step 2: allocates the node array with room to align it to
// 64 bytes, stores the raw pointer for later free() and the aligned
// pointer for use, then continues with hub initialization.
//
// Returns null on success or a heap-allocated error message ("too many
// threads" if the array size overflows size_t).
static char* Example17ThreaderCreate1Up2(Example17ThreaderTeam1* team6, ptrdiff_t nt5) {
    const size_t nodeBytes = sizeof(Example17ThreaderNode1);
    const size_t arrayBytes = nt5*nodeBytes;
    // Dividing back detects wrap-around in the multiplication above.
    if (__builtin_expect(arrayBytes/nodeBytes != (size_t)nt5, 0)) {
        return Example17Errmsg1(__LINE__, "too many threads");
    }
    void* raw = malloc(arrayBytes+63);
    if (__builtin_expect(!raw, 0)) {
        return Example17Errmsg1(__LINE__, "errno %d", errno);
    }
    team6->unwind1.nodes1 = raw;
    // Round the address up to the next multiple of 64.
    team6->nodes2 = (void*)(((size_t)raw+63)&~(size_t)63);
    return Example17ThreaderCreate1Up3(team6, nt5);
}

// Construction step 1: records the thread count and allocates the hub,
// including its trailing status bitmap, with room to align it to 64 bytes.
//
// The bitmap holds nt4/(bits per long)+1 words, which always leaves at
// least one bit beyond the last thread's bit.
//
// Returns null on success or a heap-allocated error message.
static char* Example17ThreaderCreate1Up1(Example17ThreaderTeam1* team5, ptrdiff_t nt4) {
    team5->nt1 = nt4;
    const size_t bitsPerWord = sizeof(long)*8;
    const size_t words = (size_t)nt4/bitsPerWord+1;
    size_t hubBytes = sizeof(Example17ThreaderHub1)+sizeof(long)*words;
    // Round the size up to a multiple of 64.
    hubBytes = (hubBytes+63)&~(size_t)63;
    void* raw = malloc(hubBytes+63);
    if (__builtin_expect(!raw, 0)) {
        return Example17Errmsg1(__LINE__, "errno %d", errno);
    }
    team5->unwind1.hub1 = raw;
    // Round the address up to the next multiple of 64.
    team5->hub2 = (void*)(((size_t)raw+63)&~(size_t)63);
    return Example17ThreaderCreate1Up2(team5, nt4);
}

// Creates a team of nt3 worker threads.
//
// team4: receives the new team on success; left untouched on failure.
// nt3:   number of threads; must be at least 1.
//
// Returns null on success or a heap-allocated error message that the
// caller must free(). On failure everything built so far is torn down.
static char* Example17ThreaderCreate1(Example17ThreaderTeam1** team4, ptrdiff_t nt3) {
    if (__builtin_expect(nt3 < 1, 0)) {
        return Example17Errmsg1(__LINE__, "too few threads");
    }
    // Zero-filled so that the unwind record starts out empty.
    Example17ThreaderTeam1* fresh = calloc(1, sizeof(Example17ThreaderTeam1));
    if (__builtin_expect(!fresh, 0)) {
        return Example17Errmsg1(__LINE__, "errno %d", errno);
    }
    char* failure = Example17ThreaderCreate1Up1(fresh, nt3);
    if (__builtin_expect(!!failure, 0)) {
        Example17ThreaderDestroy1(fresh);
        return failure;
    }
    *team4 = fresh;
    return 0;
}

// Looks up the pthread_t of one worker thread.
//
// thr2:  receives the thread handle on success.
// team9: team to query.
// idx1:  thread index; valid values are 0 through team9->nt1-1.
//
// Returns null on success, or a heap-allocated error message if idx1 is
// out of range (in which case *thr2 is not written).
static char* Example17ThreaderPthreadT1(
    pthread_t* thr2,
    Example17ThreaderTeam1* team9,
    ptrdiff_t idx1
) {
    const int inRange = idx1 >= 0 && idx1 < team9->nt1;
    if (__builtin_expect(!inRange, 0)) {
        return Example17Errmsg1(__LINE__, "bad thread idx");
    }
    *thr2 = team9->nodes2[idx1].thr1;
    return 0;
}

// Runs a task on the team and blocks until every point has been processed.
//
// team10: team whose worker threads execute the task.
// task3:  task to run; if task3->nd1 is less than 1 nothing is done.
//
// The index space is split into nt consecutive runs of points. Each node
// gets tot/nt points and the first tot%nt nodes get one more.
static void Example17ThreaderDo1(Example17ThreaderTeam1* team10, Example17ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return;
// Total number of points: the product of the extents in use.
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
// The per-node stride (each1 points) expressed as coordinates.
int64_t plus2[4];
Example17ThreaderPut1(nd6, task3->hull1, plus2, each1);
// Starting point of the run assigned to the current node.
int64_t pt6[4] = {0};
Example17ThreaderHub1* hub6 = team10->hub2;
// The hub mutex is held from here until the wait below, so a worker that
// finishes early cannot touch the hub before it has been reset.
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
Example17ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
// The first more1 nodes take one extra point each.
int64_t carry3 = i4 < more1;
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
// Advance the starting point past the run that was just handed out.
Example17ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Reset the scan position and mark every node (and the sentinel bits
// after the last node) as possibly having points left.
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
// Sleep until every worker has reported completion.
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

// Approximates e^x for each of the 16 packed floats in x1.
//
// The input is clamped, split as x = r*ln(2) + f with r an integer, a
// degree-4 polynomial in f is evaluated, and the result is scaled by 2^r by
// adding r to the exponent bits of the polynomial value.
static __m512 Example17Exp1(__m512 x1) {
// Clamp the argument.
// NOTE(review): the bounds presumably keep the final exponent within the
// finite float range — confirm.
x1 = _mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f));
x1 = _mm512_min_ps(x1, _mm512_set1_ps(8.872284e+01f));
// t = x * log2(e); r = t rounded to the nearest integer.
__m512 t1 = _mm512_mul_ps(x1, _mm512_set1_ps(1.442695e+00f));
__m512 r1 = _mm512_roundscale_ps(t1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
// f = x - r*ln(2), with ln(2) split into a high and a low part that are
// subtracted in two fused steps.
__m512 f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-6.9314575e-01f), x1);
f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-1.4286068e-06f), f1);
// Horner evaluation of the degree-4 polynomial in f.
__m512 g1 = _mm512_set1_ps(4.194439e-02f);
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(1.6800667e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(4.9999994e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.999569e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.9999964e-01f));
// Convert t to an integer and shift it into the float exponent field
// (bit 23 upward); adding that to the bit pattern of g multiplies g by a
// power of two.
__m512i y1 = _mm512_slli_epi32(_mm512_cvtps_epi32(t1), 23);
return _mm512_castsi512_ps(_mm512_add_epi32(y1, _mm512_castps_si512(g1)));
}

// Approximates 1/sqrt(x) for each of the 16 packed floats in x2.
//
// Starts from the hardware estimate y and applies one Newton-Raphson
// refinement step: result = 0.5*y * (3 - x*y*y).
static __m512 Example17Rsqrt1(__m512 x2) {
// Initial estimate y.
__m512 y2 = _mm512_rsqrt14_ps(x2);
// z = x*y
__m512 z1 = _mm512_mul_ps(x2, y2);
// a = 0.5*y
__m512 a1 = _mm512_mul_ps(y2, _mm512_set1_ps(5e-01f));
// b = 3 - y*z = 3 - x*y*y
__m512 b1 = _mm512_fnmadd_ps(y2, z1, _mm512_set1_ps(3e+00f));
return _mm512_mul_ps(a1, b1);
}

static void Example17OneArrangeWts1Callee1(Example17ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
ptrdiff_t b2 = pt7[0];
ptrdiff_t e1 = pt7[2];
if (e1 < 3) {
char*restrict wtPtr1 = tensors2[0]+(ptrdiff_t)3340*e1+(ptrdiff_t)37668512*0;
char*restrict biasPtr1 = tensors2[1]+(ptrdiff_t)10784*0;
char*restrict arranged1 = tensors2[2]+(ptrdiff_t)9015424*e1+(ptrdiff_t)9015424*0;
ptrdiff_t ii1 = 1;
for (ptrdiff_t i5 = 0; i5 < ii1; ++i5) {
ptrdiff_t j1 = 1*b2;
ptrdiff_t jj1 = j1+1;
for (; j1 < jj1; ++j1) {
if (j1 < 168) {
ptrdiff_t k2 = 0+16*(j1-0);
ptrdiff_t l2 = (size_t)(0+k2)/6;
ptrdiff_t cut2 = (size_t)(0+k2)%6;
switch (cut2) {
case 0:;
case 2: {
__m512 sum3;
if (!e1) {
sum3 = _mm512_maskz_loadu_ps(65535, biasPtr1+10784*i5+4*k2);
} else {
sum3 = _mm512_setzero_ps();
}
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*0+(ptrdiff_t)0, 63>>cut2, sum3);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*0+(ptrdiff_t)20040, 4032>>cut2, sum3);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*0+(ptrdiff_t)40080, 65535-(4095>>cut2), sum3);
ptrdiff_t c2 = 0;
for (; c2 != 52; ++c2) {
__m512 wt25 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)0);
__m512 wt26 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)13972);
__m512 wt27 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)27944);
__m512 wt28 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)41916);
__m512 wt29 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)55888);
__m512 wt30 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)69860);
__m512 wt31 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)83832);
__m512 wt32 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)97804);
__m512 wt33 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)111776);
__m512 wt34 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)125748);
__m512 wt35 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)139720);
__m512 wt36 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)153692);
__m512 wt37 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)167664);
__m512 wt38 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)181636);
__m512 wt39 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)195608);
__m512 wt40 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)209580);
__m512 tmp1 = _mm512_unpacklo_ps(wt25, wt26);
__m512 tmp2 = _mm512_unpackhi_ps(wt25, wt26);
__m512 tmp3 = _mm512_unpacklo_ps(wt27, wt28);
__m512 tmp4 = _mm512_unpackhi_ps(wt27, wt28);
__m512 tmp5 = _mm512_unpacklo_ps(wt29, wt30);
__m512 tmp6 = _mm512_unpackhi_ps(wt29, wt30);
__m512 tmp7 = _mm512_unpacklo_ps(wt31, wt32);
__m512 tmp8 = _mm512_unpackhi_ps(wt31, wt32);
__m512 tmp9 = _mm512_unpacklo_ps(wt33, wt34);
__m512 tmp10 = _mm512_unpackhi_ps(wt33, wt34);
__m512 tmp11 = _mm512_unpacklo_ps(wt35, wt36);
__m512 tmp12 = _mm512_unpackhi_ps(wt35, wt36);
__m512 tmp13 = _mm512_unpacklo_ps(wt37, wt38);
__m512 tmp14 = _mm512_unpackhi_ps(wt37, wt38);
__m512 tmp15 = _mm512_unpacklo_ps(wt39, wt40);
__m512 tmp16 = _mm512_unpackhi_ps(wt39, wt40);
__m512 tmp17 = _mm512_shuffle_ps(tmp1, tmp3, 68);
__m512 tmp18 = _mm512_shuffle_ps(tmp1, tmp3, 238);
__m512 tmp19 = _mm512_shuffle_ps(tmp2, tmp4, 68);
__m512 tmp20 = _mm512_shuffle_ps(tmp2, tmp4, 238);
__m512 tmp21 = _mm512_shuffle_ps(tmp5, tmp7, 68);
__m512 tmp22 = _mm512_shuffle_ps(tmp5, tmp7, 238);
__m512 tmp23 = _mm512_shuffle_ps(tmp6, tmp8, 68);
__m512 tmp24 = _mm512_shuffle_ps(tmp6, tmp8, 238);
__m512 tmp25 = _mm512_shuffle_ps(tmp9, tmp11, 68);
__m512 tmp26 = _mm512_shuffle_ps(tmp9, tmp11, 238);
__m512 tmp27 = _mm512_shuffle_ps(tmp10, tmp12, 68);
__m512 tmp28 = _mm512_shuffle_ps(tmp10, tmp12, 238);
__m512 tmp29 = _mm512_shuffle_ps(tmp13, tmp15, 68);
__m512 tmp30 = _mm512_shuffle_ps(tmp13, tmp15, 238);
__m512 tmp31 = _mm512_shuffle_ps(tmp14, tmp16, 68);
__m512 tmp32 = _mm512_shuffle_ps(tmp14, tmp16, 238);
__m512 tmp33 = _mm512_shuffle_f32x4(tmp17, tmp21, 136);
__m512 tmp34 = _mm512_shuffle_f32x4(tmp17, tmp21, 221);
__m512 tmp35 = _mm512_shuffle_f32x4(tmp18, tmp22, 136);
__m512 tmp36 = _mm512_shuffle_f32x4(tmp18, tmp22, 221);
__m512 tmp37 = _mm512_shuffle_f32x4(tmp19, tmp23, 136);
__m512 tmp38 = _mm512_shuffle_f32x4(tmp19, tmp23, 221);
__m512 tmp39 = _mm512_shuffle_f32x4(tmp20, tmp24, 136);
__m512 tmp40 = _mm512_shuffle_f32x4(tmp20, tmp24, 221);
__m512 tmp41 = _mm512_shuffle_f32x4(tmp25, tmp29, 136);
__m512 tmp42 = _mm512_shuffle_f32x4(tmp25, tmp29, 221);
__m512 tmp43 = _mm512_shuffle_f32x4(tmp26, tmp30, 136);
__m512 tmp44 = _mm512_shuffle_f32x4(tmp26, tmp30, 221);
__m512 tmp45 = _mm512_shuffle_f32x4(tmp27, tmp31, 136);
__m512 tmp46 = _mm512_shuffle_f32x4(tmp27, tmp31, 221);
__m512 tmp47 = _mm512_shuffle_f32x4(tmp28, tmp32, 136);
__m512 tmp48 = _mm512_shuffle_f32x4(tmp28, tmp32, 221);
wt25 = _mm512_shuffle_f32x4(tmp33, tmp41, 136);
wt33 = _mm512_shuffle_f32x4(tmp33, tmp41, 221);
wt26 = _mm512_shuffle_f32x4(tmp35, tmp43, 136);
wt34 = _mm512_shuffle_f32x4(tmp35, tmp43, 221);
wt27 = _mm512_shuffle_f32x4(tmp37, tmp45, 136);
wt35 = _mm512_shuffle_f32x4(tmp37, tmp45, 221);
wt28 = _mm512_shuffle_f32x4(tmp39, tmp47, 136);
wt36 = _mm512_shuffle_f32x4(tmp39, tmp47, 221);
wt29 = _mm512_shuffle_f32x4(tmp34, tmp42, 136);
wt37 = _mm512_shuffle_f32x4(tmp34, tmp42, 221);
wt30 = _mm512_shuffle_f32x4(tmp36, tmp44, 136);
wt38 = _mm512_shuffle_f32x4(tmp36, tmp44, 221);
wt31 = _mm512_shuffle_f32x4(tmp38, tmp46, 136);
wt39 = _mm512_shuffle_f32x4(tmp38, tmp46, 221);
wt32 = _mm512_shuffle_f32x4(tmp40, tmp48, 136);
wt40 = _mm512_shuffle_f32x4(tmp40, tmp48, 221);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)0, 63>>cut2, wt25);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)0, 63>>cut2, wt26);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)0, 63>>cut2, wt27);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(4+16*c2)+(ptrdiff_t)0, 63>>cut2, wt28);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(5+16*c2)+(ptrdiff_t)0, 63>>cut2, wt29);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(6+16*c2)+(ptrdiff_t)0, 63>>cut2, wt30);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(7+16*c2)+(ptrdiff_t)0, 63>>cut2, wt31);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(8+16*c2)+(ptrdiff_t)0, 63>>cut2, wt32);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(9+16*c2)+(ptrdiff_t)0, 63>>cut2, wt33);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(10+16*c2)+(ptrdiff_t)0, 63>>cut2, wt34);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(11+16*c2)+(ptrdiff_t)0, 63>>cut2, wt35);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(12+16*c2)+(ptrdiff_t)0, 63>>cut2, wt36);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(13+16*c2)+(ptrdiff_t)0, 63>>cut2, wt37);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(14+16*c2)+(ptrdiff_t)0, 63>>cut2, wt38);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(15+16*c2)+(ptrdiff_t)0, 63>>cut2, wt39);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(16+16*c2)+(ptrdiff_t)0, 63>>cut2, wt40);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt25);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt26);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt27);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(4+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt28);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(5+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt29);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(6+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt30);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(7+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt31);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(8+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt32);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(9+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt33);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(10+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt34);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(11+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt35);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(12+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt36);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(13+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt37);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(14+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt38);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(15+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt39);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(16+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt40);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt25);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt26);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt27);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(4+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt28);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(5+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt29);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(6+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt30);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(7+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt31);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(8+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt32);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(9+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt33);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(10+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt34);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(11+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt35);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(12+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt36);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(13+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt37);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(14+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt38);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(15+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt39);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(16+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt40);
}
__m512 wt41 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)0);
__m512 wt42 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)13972);
__m512 wt43 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)27944);
__m512 wt44 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)41916);
__m512 wt45 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)55888);
__m512 wt46 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)69860);
__m512 wt47 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)83832);
__m512 wt48 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)97804);
__m512 wt49 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)111776);
__m512 wt50 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)125748);
__m512 wt51 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)139720);
__m512 wt52 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)153692);
__m512 wt53 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)167664);
__m512 wt54 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)181636);
__m512 wt55 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)195608);
__m512 wt56 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c2+(ptrdiff_t)209580);
__m512 tmp49 = _mm512_unpacklo_ps(wt41, wt42);
__m512 tmp50 = _mm512_unpackhi_ps(wt41, wt42);
__m512 tmp51 = _mm512_unpacklo_ps(wt43, wt44);
__m512 tmp52 = _mm512_unpackhi_ps(wt43, wt44);
__m512 tmp53 = _mm512_unpacklo_ps(wt45, wt46);
__m512 tmp54 = _mm512_unpackhi_ps(wt45, wt46);
__m512 tmp55 = _mm512_unpacklo_ps(wt47, wt48);
__m512 tmp56 = _mm512_unpackhi_ps(wt47, wt48);
__m512 tmp57 = _mm512_unpacklo_ps(wt49, wt50);
__m512 tmp58 = _mm512_unpackhi_ps(wt49, wt50);
__m512 tmp59 = _mm512_unpacklo_ps(wt51, wt52);
__m512 tmp60 = _mm512_unpackhi_ps(wt51, wt52);
__m512 tmp61 = _mm512_unpacklo_ps(wt53, wt54);
__m512 tmp62 = _mm512_unpackhi_ps(wt53, wt54);
__m512 tmp63 = _mm512_unpacklo_ps(wt55, wt56);
__m512 tmp64 = _mm512_unpackhi_ps(wt55, wt56);
__m512 tmp65 = _mm512_shuffle_ps(tmp49, tmp51, 68);
__m512 tmp66 = _mm512_shuffle_ps(tmp49, tmp51, 238);
__m512 tmp67 = _mm512_shuffle_ps(tmp50, tmp52, 68);
__m512 tmp68 = _mm512_shuffle_ps(tmp53, tmp55, 68);
__m512 tmp69 = _mm512_shuffle_ps(tmp53, tmp55, 238);
__m512 tmp70 = _mm512_shuffle_ps(tmp54, tmp56, 68);
__m512 tmp71 = _mm512_shuffle_ps(tmp57, tmp59, 68);
__m512 tmp72 = _mm512_shuffle_ps(tmp57, tmp59, 238);
__m512 tmp73 = _mm512_shuffle_ps(tmp58, tmp60, 68);
__m512 tmp74 = _mm512_shuffle_ps(tmp61, tmp63, 68);
__m512 tmp75 = _mm512_shuffle_ps(tmp61, tmp63, 238);
__m512 tmp76 = _mm512_shuffle_ps(tmp62, tmp64, 68);
__m512 tmp77 = _mm512_shuffle_f32x4(tmp65, tmp68, 136);
__m512 tmp78 = _mm512_shuffle_f32x4(tmp66, tmp69, 136);
__m512 tmp79 = _mm512_shuffle_f32x4(tmp67, tmp70, 136);
__m512 tmp80 = _mm512_shuffle_f32x4(tmp71, tmp74, 136);
__m512 tmp81 = _mm512_shuffle_f32x4(tmp72, tmp75, 136);
__m512 tmp82 = _mm512_shuffle_f32x4(tmp73, tmp76, 136);
wt41 = _mm512_shuffle_f32x4(tmp77, tmp80, 136);
wt42 = _mm512_shuffle_f32x4(tmp78, tmp81, 136);
wt43 = _mm512_shuffle_f32x4(tmp79, tmp82, 136);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)0, 63>>cut2, wt41);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)0, 63>>cut2, wt42);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)0, 63>>cut2, wt43);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt41);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt42);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt43);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt41);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt42);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt43);
break;
}
default: {
// Alignment case with cut2 fixed at 4: the 16 lanes of every vector stored
// below start 4 floats into a 24-byte (6-float) record, so they are split
// across four consecutive 20064-byte groups of arranged1.
// With cut2 == 4 the four store masks evaluate to:
//   63>>cut2              = 0x0003 -> lanes 0-1   (group l2,   floats 4-5)
//   4032>>cut2            = 0x00FC -> lanes 2-7   (group l2+1, floats 0-5)
//   258048>>cut2          = 0x3F00 -> lanes 8-13  (group l2+2, floats 0-5)
//   65535-(262143>>cut2)  = 0xC000 -> lanes 14-15 (group l2+3, floats 0-1)
// The +20040/+40080/+60120 offsets are k*20064-24*k, which together with the
// 4*cut2 base shift and the 4-byte lane index land each lane range at the
// start of the matching record in the next group.
// NOTE(review): k2, l2, e1, i5, biasPtr1, wtPtr1 and arranged1 are defined
// before this excerpt; l2/cut2 are presumably k2/6 and k2%6 as in the
// k1 branch further down -- confirm against the enclosing code.
cut2 = 4;
// Record 0 of each group receives 16 bias values when e1 == 0, zeros otherwise.
__m512 sum4;
if (!e1) {
sum4 = _mm512_maskz_loadu_ps(65535, biasPtr1+10784*i5+4*k2);
} else {
sum4 = _mm512_setzero_ps();
}
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*0+(ptrdiff_t)0, 63>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*0+(ptrdiff_t)20040, 4032>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*0+(ptrdiff_t)40080, 258048>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*0+(ptrdiff_t)60120, 65535-(262143>>cut2), sum4);
// Main loop: 52 tiles of 16 floats from each of 16 source rows.
// Source rows are 13972 bytes apart (3493 floats, matching Channels=3493 in
// the graph config, so each row is presumably one output channel's filter).
ptrdiff_t c3 = 0;
for (; c3 != 52; ++c3) {
// Load a 16x16 float tile: row r comes from wtPtr1 + 13972*(k2+r), columns
// are the 16 floats at byte offset 64*c3.
__m512 wt57 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)0);
__m512 wt58 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)13972);
__m512 wt59 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)27944);
__m512 wt60 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)41916);
__m512 wt61 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)55888);
__m512 wt62 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)69860);
__m512 wt63 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)83832);
__m512 wt64 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)97804);
__m512 wt65 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)111776);
__m512 wt66 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)125748);
__m512 wt67 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)139720);
__m512 wt68 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)153692);
__m512 wt69 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)167664);
__m512 wt70 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)181636);
__m512 wt71 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)195608);
__m512 wt72 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)209580);
// Transpose stage 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp83 = _mm512_unpacklo_ps(wt57, wt58);
__m512 tmp84 = _mm512_unpackhi_ps(wt57, wt58);
__m512 tmp85 = _mm512_unpacklo_ps(wt59, wt60);
__m512 tmp86 = _mm512_unpackhi_ps(wt59, wt60);
__m512 tmp87 = _mm512_unpacklo_ps(wt61, wt62);
__m512 tmp88 = _mm512_unpackhi_ps(wt61, wt62);
__m512 tmp89 = _mm512_unpacklo_ps(wt63, wt64);
__m512 tmp90 = _mm512_unpackhi_ps(wt63, wt64);
__m512 tmp91 = _mm512_unpacklo_ps(wt65, wt66);
__m512 tmp92 = _mm512_unpackhi_ps(wt65, wt66);
__m512 tmp93 = _mm512_unpacklo_ps(wt67, wt68);
__m512 tmp94 = _mm512_unpackhi_ps(wt67, wt68);
__m512 tmp95 = _mm512_unpacklo_ps(wt69, wt70);
__m512 tmp96 = _mm512_unpackhi_ps(wt69, wt70);
__m512 tmp97 = _mm512_unpacklo_ps(wt71, wt72);
__m512 tmp98 = _mm512_unpackhi_ps(wt71, wt72);
// Transpose stage 2: combine pairs into groups of four rows; immediate 68
// keeps the low two floats of each operand, 238 keeps the high two.
__m512 tmp99 = _mm512_shuffle_ps(tmp83, tmp85, 68);
__m512 tmp100 = _mm512_shuffle_ps(tmp83, tmp85, 238);
__m512 tmp101 = _mm512_shuffle_ps(tmp84, tmp86, 68);
__m512 tmp102 = _mm512_shuffle_ps(tmp84, tmp86, 238);
__m512 tmp103 = _mm512_shuffle_ps(tmp87, tmp89, 68);
__m512 tmp104 = _mm512_shuffle_ps(tmp87, tmp89, 238);
__m512 tmp105 = _mm512_shuffle_ps(tmp88, tmp90, 68);
__m512 tmp106 = _mm512_shuffle_ps(tmp88, tmp90, 238);
__m512 tmp107 = _mm512_shuffle_ps(tmp91, tmp93, 68);
__m512 tmp108 = _mm512_shuffle_ps(tmp91, tmp93, 238);
__m512 tmp109 = _mm512_shuffle_ps(tmp92, tmp94, 68);
__m512 tmp110 = _mm512_shuffle_ps(tmp92, tmp94, 238);
__m512 tmp111 = _mm512_shuffle_ps(tmp95, tmp97, 68);
__m512 tmp112 = _mm512_shuffle_ps(tmp95, tmp97, 238);
__m512 tmp113 = _mm512_shuffle_ps(tmp96, tmp98, 68);
__m512 tmp114 = _mm512_shuffle_ps(tmp96, tmp98, 238);
// Transpose stage 3: regroup whole 128-bit lanes; immediate 136 picks lanes
// 0 and 2 of each operand, 221 picks lanes 1 and 3.
__m512 tmp115 = _mm512_shuffle_f32x4(tmp99, tmp103, 136);
__m512 tmp116 = _mm512_shuffle_f32x4(tmp99, tmp103, 221);
__m512 tmp117 = _mm512_shuffle_f32x4(tmp100, tmp104, 136);
__m512 tmp118 = _mm512_shuffle_f32x4(tmp100, tmp104, 221);
__m512 tmp119 = _mm512_shuffle_f32x4(tmp101, tmp105, 136);
__m512 tmp120 = _mm512_shuffle_f32x4(tmp101, tmp105, 221);
__m512 tmp121 = _mm512_shuffle_f32x4(tmp102, tmp106, 136);
__m512 tmp122 = _mm512_shuffle_f32x4(tmp102, tmp106, 221);
__m512 tmp123 = _mm512_shuffle_f32x4(tmp107, tmp111, 136);
__m512 tmp124 = _mm512_shuffle_f32x4(tmp107, tmp111, 221);
__m512 tmp125 = _mm512_shuffle_f32x4(tmp108, tmp112, 136);
__m512 tmp126 = _mm512_shuffle_f32x4(tmp108, tmp112, 221);
__m512 tmp127 = _mm512_shuffle_f32x4(tmp109, tmp113, 136);
__m512 tmp128 = _mm512_shuffle_f32x4(tmp109, tmp113, 221);
__m512 tmp129 = _mm512_shuffle_f32x4(tmp110, tmp114, 136);
__m512 tmp130 = _mm512_shuffle_f32x4(tmp110, tmp114, 221);
// Transpose stage 4: after these shuffles wt57..wt72 hold columns 0..15 of
// the tile, i.e. wt(57+j) lane r is float j of source row r.
wt57 = _mm512_shuffle_f32x4(tmp115, tmp123, 136);
wt65 = _mm512_shuffle_f32x4(tmp115, tmp123, 221);
wt58 = _mm512_shuffle_f32x4(tmp117, tmp125, 136);
wt66 = _mm512_shuffle_f32x4(tmp117, tmp125, 221);
wt59 = _mm512_shuffle_f32x4(tmp119, tmp127, 136);
wt67 = _mm512_shuffle_f32x4(tmp119, tmp127, 221);
wt60 = _mm512_shuffle_f32x4(tmp121, tmp129, 136);
wt68 = _mm512_shuffle_f32x4(tmp121, tmp129, 221);
wt61 = _mm512_shuffle_f32x4(tmp116, tmp124, 136);
wt69 = _mm512_shuffle_f32x4(tmp116, tmp124, 221);
wt62 = _mm512_shuffle_f32x4(tmp118, tmp126, 136);
wt70 = _mm512_shuffle_f32x4(tmp118, tmp126, 221);
wt63 = _mm512_shuffle_f32x4(tmp120, tmp128, 136);
wt71 = _mm512_shuffle_f32x4(tmp120, tmp128, 221);
wt64 = _mm512_shuffle_f32x4(tmp122, tmp130, 136);
wt72 = _mm512_shuffle_f32x4(tmp122, tmp130, 221);
// Scatter the 16 columns to records 1+16*c3 .. 16+16*c3 (record 0 is the
// bias/zero record). First pass: lanes 0-1 into group l2.
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(1+16*c3)+(ptrdiff_t)0, 63>>cut2, wt57);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(2+16*c3)+(ptrdiff_t)0, 63>>cut2, wt58);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(3+16*c3)+(ptrdiff_t)0, 63>>cut2, wt59);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(4+16*c3)+(ptrdiff_t)0, 63>>cut2, wt60);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(5+16*c3)+(ptrdiff_t)0, 63>>cut2, wt61);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(6+16*c3)+(ptrdiff_t)0, 63>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(7+16*c3)+(ptrdiff_t)0, 63>>cut2, wt63);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(8+16*c3)+(ptrdiff_t)0, 63>>cut2, wt64);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(9+16*c3)+(ptrdiff_t)0, 63>>cut2, wt65);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(10+16*c3)+(ptrdiff_t)0, 63>>cut2, wt66);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(11+16*c3)+(ptrdiff_t)0, 63>>cut2, wt67);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(12+16*c3)+(ptrdiff_t)0, 63>>cut2, wt68);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(13+16*c3)+(ptrdiff_t)0, 63>>cut2, wt69);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(14+16*c3)+(ptrdiff_t)0, 63>>cut2, wt70);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(15+16*c3)+(ptrdiff_t)0, 63>>cut2, wt71);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(16+16*c3)+(ptrdiff_t)0, 63>>cut2, wt72);
// Second pass: lanes 2-7 into the same records of group l2+1.
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(1+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt57);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(2+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt58);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(3+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt59);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(4+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt60);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(5+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt61);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(6+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(7+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt63);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(8+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt64);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(9+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt65);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(10+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt66);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(11+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt67);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(12+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt68);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(13+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt69);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(14+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt70);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(15+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt71);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(16+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt72);
// Third pass: lanes 8-13 into the same records of group l2+2.
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(1+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt57);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(2+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt58);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(3+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt59);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(4+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt60);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(5+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt61);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(6+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(7+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt63);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(8+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt64);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(9+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt65);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(10+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt66);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(11+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt67);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(12+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt68);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(13+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt69);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(14+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt70);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(15+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt71);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(16+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt72);
// Fourth pass: lanes 14-15 into the same records of group l2+3.
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(1+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt57);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(2+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt58);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(3+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt59);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(4+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt60);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(5+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt61);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(6+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt62);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(7+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt63);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(8+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt64);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(9+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt65);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(10+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt66);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(11+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt67);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(12+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt68);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(13+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt69);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(14+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt70);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(15+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt71);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(16+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt72);
}
// Tail: c3 == 52 here. Load mask 7 reads only the 3 floats that remain in
// each row after the 52 full tiles (52*16+3 = 835 floats per row in total);
// the other lanes are zeroed by the maskz load.
__m512 wt73 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)0);
__m512 wt74 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)13972);
__m512 wt75 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)27944);
__m512 wt76 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)41916);
__m512 wt77 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)55888);
__m512 wt78 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)69860);
__m512 wt79 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)83832);
__m512 wt80 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)97804);
__m512 wt81 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)111776);
__m512 wt82 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)125748);
__m512 wt83 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)139720);
__m512 wt84 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)153692);
__m512 wt85 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)167664);
__m512 wt86 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)181636);
__m512 wt87 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)195608);
__m512 wt88 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k2+64*c3+(ptrdiff_t)209580);
// Same transpose network as in the loop, reduced to the shuffles needed to
// produce columns 0, 1 and 2 only.
__m512 tmp131 = _mm512_unpacklo_ps(wt73, wt74);
__m512 tmp132 = _mm512_unpackhi_ps(wt73, wt74);
__m512 tmp133 = _mm512_unpacklo_ps(wt75, wt76);
__m512 tmp134 = _mm512_unpackhi_ps(wt75, wt76);
__m512 tmp135 = _mm512_unpacklo_ps(wt77, wt78);
__m512 tmp136 = _mm512_unpackhi_ps(wt77, wt78);
__m512 tmp137 = _mm512_unpacklo_ps(wt79, wt80);
__m512 tmp138 = _mm512_unpackhi_ps(wt79, wt80);
__m512 tmp139 = _mm512_unpacklo_ps(wt81, wt82);
__m512 tmp140 = _mm512_unpackhi_ps(wt81, wt82);
__m512 tmp141 = _mm512_unpacklo_ps(wt83, wt84);
__m512 tmp142 = _mm512_unpackhi_ps(wt83, wt84);
__m512 tmp143 = _mm512_unpacklo_ps(wt85, wt86);
__m512 tmp144 = _mm512_unpackhi_ps(wt85, wt86);
__m512 tmp145 = _mm512_unpacklo_ps(wt87, wt88);
__m512 tmp146 = _mm512_unpackhi_ps(wt87, wt88);
__m512 tmp147 = _mm512_shuffle_ps(tmp131, tmp133, 68);
__m512 tmp148 = _mm512_shuffle_ps(tmp131, tmp133, 238);
__m512 tmp149 = _mm512_shuffle_ps(tmp132, tmp134, 68);
__m512 tmp150 = _mm512_shuffle_ps(tmp135, tmp137, 68);
__m512 tmp151 = _mm512_shuffle_ps(tmp135, tmp137, 238);
__m512 tmp152 = _mm512_shuffle_ps(tmp136, tmp138, 68);
__m512 tmp153 = _mm512_shuffle_ps(tmp139, tmp141, 68);
__m512 tmp154 = _mm512_shuffle_ps(tmp139, tmp141, 238);
__m512 tmp155 = _mm512_shuffle_ps(tmp140, tmp142, 68);
__m512 tmp156 = _mm512_shuffle_ps(tmp143, tmp145, 68);
__m512 tmp157 = _mm512_shuffle_ps(tmp143, tmp145, 238);
__m512 tmp158 = _mm512_shuffle_ps(tmp144, tmp146, 68);
__m512 tmp159 = _mm512_shuffle_f32x4(tmp147, tmp150, 136);
__m512 tmp160 = _mm512_shuffle_f32x4(tmp148, tmp151, 136);
__m512 tmp161 = _mm512_shuffle_f32x4(tmp149, tmp152, 136);
__m512 tmp162 = _mm512_shuffle_f32x4(tmp153, tmp156, 136);
__m512 tmp163 = _mm512_shuffle_f32x4(tmp154, tmp157, 136);
__m512 tmp164 = _mm512_shuffle_f32x4(tmp155, tmp158, 136);
// wt73, wt74, wt75 now hold columns 0, 1, 2 of the 16x3 tail tile.
wt73 = _mm512_shuffle_f32x4(tmp159, tmp162, 136);
wt74 = _mm512_shuffle_f32x4(tmp160, tmp163, 136);
wt75 = _mm512_shuffle_f32x4(tmp161, tmp164, 136);
// Store the three tail columns to records 833..835 (1+16*52 .. 3+16*52),
// using the same four-way lane split across groups l2 .. l2+3.
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(1+16*c3)+(ptrdiff_t)0, 63>>cut2, wt73);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(2+16*c3)+(ptrdiff_t)0, 63>>cut2, wt74);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(3+16*c3)+(ptrdiff_t)0, 63>>cut2, wt75);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(1+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt73);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(2+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt74);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(3+16*c3)+(ptrdiff_t)20040, 4032>>cut2, wt75);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(1+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt73);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(2+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt74);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(3+16*c3)+(ptrdiff_t)40080, 258048>>cut2, wt75);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(1+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt73);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(2+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt74);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l2+4*cut2+24*(3+16*c3)+(ptrdiff_t)60120, 65535-(262143>>cut2), wt75);
}
}
} else {
// Final partial block: only 8 source rows are handled here, starting at row
// k1 = 2688 (2688 + 8 = 2696, matching ToChannels=2696 in the graph config).
// l1 = 2688/6 = 448 and cut1 = 2688%6 = 0, so the store masks evaluate to:
//   63>>cut1        = 0x3F -> lanes 0-5 (group l1, all six floats of a record)
//   255-(63>>cut1)  = 0xC0 -> lanes 6-7 (next group, floats 0-1)
// Records of the next group are addressed with an 8-byte stride (8*record),
// i.e. two floats per record instead of six; the +20040 offset plus the
// 4-byte index of lane 6 lands on byte 20064, the start of that group.
// NOTE(review): e1, i5, biasPtr1, wtPtr1 and arranged1 are defined before
// this excerpt.
ptrdiff_t k1 = 2688;
ptrdiff_t l1 = (size_t)(0+k1)/6;
ptrdiff_t cut1 = (size_t)(0+k1)%6;
// Record 0 receives 8 bias values when e1 == 0, zeros otherwise.
__m512 sum2;
if (!e1) {
sum2 = _mm512_maskz_loadu_ps(255, biasPtr1+10784*i5+4*k1);
} else {
sum2 = _mm512_setzero_ps();
}
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*0+(ptrdiff_t)0, 63>>cut1, sum2);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*0+(ptrdiff_t)20040, 255-(63>>cut1), sum2);
// Main loop: 52 tiles of 16 floats from each of the 8 source rows
// (rows are 13972 bytes apart).
ptrdiff_t c1 = 0;
for (; c1 != 52; ++c1) {
__m512 wt1 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k1+64*c1+(ptrdiff_t)0);
__m512 wt2 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k1+64*c1+(ptrdiff_t)13972);
__m512 wt3 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k1+64*c1+(ptrdiff_t)27944);
__m512 wt4 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k1+64*c1+(ptrdiff_t)41916);
__m512 wt5 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k1+64*c1+(ptrdiff_t)55888);
__m512 wt6 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k1+64*c1+(ptrdiff_t)69860);
__m512 wt7 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k1+64*c1+(ptrdiff_t)83832);
__m512 wt8 = _mm512_maskz_loadu_ps(65535, wtPtr1+37668512*i5+13972*k1+64*c1+(ptrdiff_t)97804);
// Transpose stage 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp165 = _mm512_unpacklo_ps(wt1, wt2);
__m512 tmp166 = _mm512_unpackhi_ps(wt1, wt2);
__m512 tmp167 = _mm512_unpacklo_ps(wt3, wt4);
__m512 tmp168 = _mm512_unpackhi_ps(wt3, wt4);
__m512 tmp169 = _mm512_unpacklo_ps(wt5, wt6);
__m512 tmp170 = _mm512_unpackhi_ps(wt5, wt6);
__m512 tmp171 = _mm512_unpacklo_ps(wt7, wt8);
__m512 tmp172 = _mm512_unpackhi_ps(wt7, wt8);
// Transpose stage 2: combine pairs into groups of four rows (68 = low two
// floats of each operand, 238 = high two).
__m512 tmp173 = _mm512_shuffle_ps(tmp165, tmp167, 68);
__m512 tmp174 = _mm512_shuffle_ps(tmp165, tmp167, 238);
__m512 tmp175 = _mm512_shuffle_ps(tmp166, tmp168, 68);
__m512 tmp176 = _mm512_shuffle_ps(tmp166, tmp168, 238);
__m512 tmp177 = _mm512_shuffle_ps(tmp169, tmp171, 68);
__m512 tmp178 = _mm512_shuffle_ps(tmp169, tmp171, 238);
__m512 tmp179 = _mm512_shuffle_ps(tmp170, tmp172, 68);
__m512 tmp180 = _mm512_shuffle_ps(tmp170, tmp172, 238);
// Transpose stage 3: regroup 128-bit lanes (136 = lanes 0,2; 221 = lanes 1,3).
__m512 tmp181 = _mm512_shuffle_f32x4(tmp173, tmp177, 136);
__m512 tmp182 = _mm512_shuffle_f32x4(tmp173, tmp177, 221);
__m512 tmp183 = _mm512_shuffle_f32x4(tmp174, tmp178, 136);
__m512 tmp184 = _mm512_shuffle_f32x4(tmp174, tmp178, 221);
__m512 tmp185 = _mm512_shuffle_f32x4(tmp175, tmp179, 136);
__m512 tmp186 = _mm512_shuffle_f32x4(tmp175, tmp179, 221);
__m512 tmp187 = _mm512_shuffle_f32x4(tmp176, tmp180, 136);
__m512 tmp188 = _mm512_shuffle_f32x4(tmp176, tmp180, 221);
// Transpose stage 4: each vector is shuffled with itself because there are
// only 8 source rows. The low 8 lanes of wt1..wt16 hold columns 0..15 of the
// 8x16 tile; the high 8 lanes duplicate them and are never stored.
wt1 = _mm512_shuffle_f32x4(tmp181, tmp181, 136);
__m512 wt9 = _mm512_shuffle_f32x4(tmp181, tmp181, 221);
wt2 = _mm512_shuffle_f32x4(tmp183, tmp183, 136);
__m512 wt10 = _mm512_shuffle_f32x4(tmp183, tmp183, 221);
wt3 = _mm512_shuffle_f32x4(tmp185, tmp185, 136);
__m512 wt11 = _mm512_shuffle_f32x4(tmp185, tmp185, 221);
wt4 = _mm512_shuffle_f32x4(tmp187, tmp187, 136);
__m512 wt12 = _mm512_shuffle_f32x4(tmp187, tmp187, 221);
wt5 = _mm512_shuffle_f32x4(tmp182, tmp182, 136);
__m512 wt13 = _mm512_shuffle_f32x4(tmp182, tmp182, 221);
wt6 = _mm512_shuffle_f32x4(tmp184, tmp184, 136);
__m512 wt14 = _mm512_shuffle_f32x4(tmp184, tmp184, 221);
wt7 = _mm512_shuffle_f32x4(tmp186, tmp186, 136);
__m512 wt15 = _mm512_shuffle_f32x4(tmp186, tmp186, 221);
wt8 = _mm512_shuffle_f32x4(tmp188, tmp188, 136);
__m512 wt16 = _mm512_shuffle_f32x4(tmp188, tmp188, 221);
// First pass: lanes 0-5 of each column into records 1+16*c1 .. 16+16*c1 of
// group l1 (24-byte records).
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(1+16*c1)+(ptrdiff_t)0, 63>>cut1, wt1);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(2+16*c1)+(ptrdiff_t)0, 63>>cut1, wt2);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(3+16*c1)+(ptrdiff_t)0, 63>>cut1, wt3);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(4+16*c1)+(ptrdiff_t)0, 63>>cut1, wt4);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(5+16*c1)+(ptrdiff_t)0, 63>>cut1, wt5);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(6+16*c1)+(ptrdiff_t)0, 63>>cut1, wt6);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(7+16*c1)+(ptrdiff_t)0, 63>>cut1, wt7);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(8+16*c1)+(ptrdiff_t)0, 63>>cut1, wt8);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(9+16*c1)+(ptrdiff_t)0, 63>>cut1, wt9);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(10+16*c1)+(ptrdiff_t)0, 63>>cut1, wt10);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(11+16*c1)+(ptrdiff_t)0, 63>>cut1, wt11);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(12+16*c1)+(ptrdiff_t)0, 63>>cut1, wt12);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(13+16*c1)+(ptrdiff_t)0, 63>>cut1, wt13);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(14+16*c1)+(ptrdiff_t)0, 63>>cut1, wt14);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(15+16*c1)+(ptrdiff_t)0, 63>>cut1, wt15);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(16+16*c1)+(ptrdiff_t)0, 63>>cut1, wt16);
// Second pass: lanes 6-7 of each column into the same record numbers of the
// next group, which uses 8-byte records.
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(1+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt1);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(2+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt2);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(3+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt3);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(4+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt4);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(5+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt5);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(6+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt6);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(7+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt7);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(8+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt8);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(9+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt9);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(10+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt10);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(11+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt11);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(12+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt12);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(13+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt13);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(14+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt14);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(15+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt15);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(16+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt16);
}
// Tail: c1 == 52 here. Load mask 7 reads the 3 floats left in each of the
// 8 rows after the 52 full tiles; the other lanes are zeroed.
__m512 wt17 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k1+64*c1+(ptrdiff_t)0);
__m512 wt18 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k1+64*c1+(ptrdiff_t)13972);
__m512 wt19 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k1+64*c1+(ptrdiff_t)27944);
__m512 wt20 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k1+64*c1+(ptrdiff_t)41916);
__m512 wt21 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k1+64*c1+(ptrdiff_t)55888);
__m512 wt22 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k1+64*c1+(ptrdiff_t)69860);
__m512 wt23 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k1+64*c1+(ptrdiff_t)83832);
__m512 wt24 = _mm512_maskz_loadu_ps(7, wtPtr1+37668512*i5+13972*k1+64*c1+(ptrdiff_t)97804);
// Reduced transpose network producing only columns 0, 1 and 2.
__m512 tmp189 = _mm512_unpacklo_ps(wt17, wt18);
__m512 tmp190 = _mm512_unpackhi_ps(wt17, wt18);
__m512 tmp191 = _mm512_unpacklo_ps(wt19, wt20);
__m512 tmp192 = _mm512_unpackhi_ps(wt19, wt20);
__m512 tmp193 = _mm512_unpacklo_ps(wt21, wt22);
__m512 tmp194 = _mm512_unpackhi_ps(wt21, wt22);
__m512 tmp195 = _mm512_unpacklo_ps(wt23, wt24);
__m512 tmp196 = _mm512_unpackhi_ps(wt23, wt24);
__m512 tmp197 = _mm512_shuffle_ps(tmp189, tmp191, 68);
__m512 tmp198 = _mm512_shuffle_ps(tmp189, tmp191, 238);
__m512 tmp199 = _mm512_shuffle_ps(tmp190, tmp192, 68);
__m512 tmp200 = _mm512_shuffle_ps(tmp193, tmp195, 68);
__m512 tmp201 = _mm512_shuffle_ps(tmp193, tmp195, 238);
__m512 tmp202 = _mm512_shuffle_ps(tmp194, tmp196, 68);
__m512 tmp203 = _mm512_shuffle_f32x4(tmp197, tmp200, 136);
__m512 tmp204 = _mm512_shuffle_f32x4(tmp198, tmp201, 136);
__m512 tmp205 = _mm512_shuffle_f32x4(tmp199, tmp202, 136);
// The low 8 lanes of wt17, wt18, wt19 now hold columns 0, 1, 2 of the tail.
wt17 = _mm512_shuffle_f32x4(tmp203, tmp203, 136);
wt18 = _mm512_shuffle_f32x4(tmp204, tmp204, 136);
wt19 = _mm512_shuffle_f32x4(tmp205, tmp205, 136);
// Store the three tail columns to records 833..835 (1+16*52 .. 3+16*52),
// split between group l1 (lanes 0-5) and the next group (lanes 6-7).
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(1+16*c1)+(ptrdiff_t)0, 63>>cut1, wt17);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(2+16*c1)+(ptrdiff_t)0, 63>>cut1, wt18);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+24*(3+16*c1)+(ptrdiff_t)0, 63>>cut1, wt19);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(1+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt17);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(2+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt18);
_mm512_mask_storeu_ps(arranged1+9015424*i5+20064*l1+4*cut1+8*(3+16*c1)+(ptrdiff_t)20040, 255-(63>>cut1), wt19);
}
}
}
return;
}
char*restrict wtPtr2 = tensors2[0]+(ptrdiff_t)3340*3+(ptrdiff_t)37668512*0;
char*restrict arranged2 = tensors2[2]+(ptrdiff_t)9015424*3+(ptrdiff_t)10665376*0;
ptrdiff_t ii2 = 1;
for (ptrdiff_t i6 = 0; i6 < ii2; ++i6) {
ptrdiff_t j2 = 1*b2;
ptrdiff_t jj2 = j2+1;
for (; j2 < jj2; ++j2) {
if (j2 < 168) {
ptrdiff_t k4 = 0+16*(j2-0);
ptrdiff_t l4 = (size_t)(0+k4)/6;
ptrdiff_t cut4 = (size_t)(0+k4)%6;
switch (cut4) {
case 0:;
case 2: {
__m512 sum6 = _mm512_setzero_ps();
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*0+(ptrdiff_t)0, 63>>cut4, sum6);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*0+(ptrdiff_t)23712, 4032>>cut4, sum6);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*0+(ptrdiff_t)47424, 65535-(4095>>cut4), sum6);
ptrdiff_t c5 = 0;
for (; c5 != 61; ++c5) {
__m512 wt117 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)0);
__m512 wt118 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)13972);
__m512 wt119 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)27944);
__m512 wt120 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)41916);
__m512 wt121 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)55888);
__m512 wt122 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)69860);
__m512 wt123 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)83832);
__m512 wt124 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)97804);
__m512 wt125 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)111776);
__m512 wt126 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)125748);
__m512 wt127 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)139720);
__m512 wt128 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)153692);
__m512 wt129 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)167664);
__m512 wt130 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)181636);
__m512 wt131 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)195608);
__m512 wt132 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)209580);
__m512 tmp206 = _mm512_unpacklo_ps(wt117, wt118);
__m512 tmp207 = _mm512_unpackhi_ps(wt117, wt118);
__m512 tmp208 = _mm512_unpacklo_ps(wt119, wt120);
__m512 tmp209 = _mm512_unpackhi_ps(wt119, wt120);
__m512 tmp210 = _mm512_unpacklo_ps(wt121, wt122);
__m512 tmp211 = _mm512_unpackhi_ps(wt121, wt122);
__m512 tmp212 = _mm512_unpacklo_ps(wt123, wt124);
__m512 tmp213 = _mm512_unpackhi_ps(wt123, wt124);
__m512 tmp214 = _mm512_unpacklo_ps(wt125, wt126);
__m512 tmp215 = _mm512_unpackhi_ps(wt125, wt126);
__m512 tmp216 = _mm512_unpacklo_ps(wt127, wt128);
__m512 tmp217 = _mm512_unpackhi_ps(wt127, wt128);
__m512 tmp218 = _mm512_unpacklo_ps(wt129, wt130);
__m512 tmp219 = _mm512_unpackhi_ps(wt129, wt130);
__m512 tmp220 = _mm512_unpacklo_ps(wt131, wt132);
__m512 tmp221 = _mm512_unpackhi_ps(wt131, wt132);
__m512 tmp222 = _mm512_shuffle_ps(tmp206, tmp208, 68);
__m512 tmp223 = _mm512_shuffle_ps(tmp206, tmp208, 238);
__m512 tmp224 = _mm512_shuffle_ps(tmp207, tmp209, 68);
__m512 tmp225 = _mm512_shuffle_ps(tmp207, tmp209, 238);
__m512 tmp226 = _mm512_shuffle_ps(tmp210, tmp212, 68);
__m512 tmp227 = _mm512_shuffle_ps(tmp210, tmp212, 238);
__m512 tmp228 = _mm512_shuffle_ps(tmp211, tmp213, 68);
__m512 tmp229 = _mm512_shuffle_ps(tmp211, tmp213, 238);
__m512 tmp230 = _mm512_shuffle_ps(tmp214, tmp216, 68);
__m512 tmp231 = _mm512_shuffle_ps(tmp214, tmp216, 238);
__m512 tmp232 = _mm512_shuffle_ps(tmp215, tmp217, 68);
__m512 tmp233 = _mm512_shuffle_ps(tmp215, tmp217, 238);
__m512 tmp234 = _mm512_shuffle_ps(tmp218, tmp220, 68);
__m512 tmp235 = _mm512_shuffle_ps(tmp218, tmp220, 238);
__m512 tmp236 = _mm512_shuffle_ps(tmp219, tmp221, 68);
__m512 tmp237 = _mm512_shuffle_ps(tmp219, tmp221, 238);
__m512 tmp238 = _mm512_shuffle_f32x4(tmp222, tmp226, 136);
__m512 tmp239 = _mm512_shuffle_f32x4(tmp222, tmp226, 221);
__m512 tmp240 = _mm512_shuffle_f32x4(tmp223, tmp227, 136);
__m512 tmp241 = _mm512_shuffle_f32x4(tmp223, tmp227, 221);
__m512 tmp242 = _mm512_shuffle_f32x4(tmp224, tmp228, 136);
__m512 tmp243 = _mm512_shuffle_f32x4(tmp224, tmp228, 221);
__m512 tmp244 = _mm512_shuffle_f32x4(tmp225, tmp229, 136);
__m512 tmp245 = _mm512_shuffle_f32x4(tmp225, tmp229, 221);
__m512 tmp246 = _mm512_shuffle_f32x4(tmp230, tmp234, 136);
__m512 tmp247 = _mm512_shuffle_f32x4(tmp230, tmp234, 221);
__m512 tmp248 = _mm512_shuffle_f32x4(tmp231, tmp235, 136);
__m512 tmp249 = _mm512_shuffle_f32x4(tmp231, tmp235, 221);
__m512 tmp250 = _mm512_shuffle_f32x4(tmp232, tmp236, 136);
__m512 tmp251 = _mm512_shuffle_f32x4(tmp232, tmp236, 221);
__m512 tmp252 = _mm512_shuffle_f32x4(tmp233, tmp237, 136);
__m512 tmp253 = _mm512_shuffle_f32x4(tmp233, tmp237, 221);
wt117 = _mm512_shuffle_f32x4(tmp238, tmp246, 136);
wt125 = _mm512_shuffle_f32x4(tmp238, tmp246, 221);
wt118 = _mm512_shuffle_f32x4(tmp240, tmp248, 136);
wt126 = _mm512_shuffle_f32x4(tmp240, tmp248, 221);
wt119 = _mm512_shuffle_f32x4(tmp242, tmp250, 136);
wt127 = _mm512_shuffle_f32x4(tmp242, tmp250, 221);
wt120 = _mm512_shuffle_f32x4(tmp244, tmp252, 136);
wt128 = _mm512_shuffle_f32x4(tmp244, tmp252, 221);
wt121 = _mm512_shuffle_f32x4(tmp239, tmp247, 136);
wt129 = _mm512_shuffle_f32x4(tmp239, tmp247, 221);
wt122 = _mm512_shuffle_f32x4(tmp241, tmp249, 136);
wt130 = _mm512_shuffle_f32x4(tmp241, tmp249, 221);
wt123 = _mm512_shuffle_f32x4(tmp243, tmp251, 136);
wt131 = _mm512_shuffle_f32x4(tmp243, tmp251, 221);
wt124 = _mm512_shuffle_f32x4(tmp245, tmp253, 136);
wt132 = _mm512_shuffle_f32x4(tmp245, tmp253, 221);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(1+16*c5)+(ptrdiff_t)0, 63>>cut4, wt117);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(2+16*c5)+(ptrdiff_t)0, 63>>cut4, wt118);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(3+16*c5)+(ptrdiff_t)0, 63>>cut4, wt119);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(4+16*c5)+(ptrdiff_t)0, 63>>cut4, wt120);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(5+16*c5)+(ptrdiff_t)0, 63>>cut4, wt121);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(6+16*c5)+(ptrdiff_t)0, 63>>cut4, wt122);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(7+16*c5)+(ptrdiff_t)0, 63>>cut4, wt123);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(8+16*c5)+(ptrdiff_t)0, 63>>cut4, wt124);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(9+16*c5)+(ptrdiff_t)0, 63>>cut4, wt125);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(10+16*c5)+(ptrdiff_t)0, 63>>cut4, wt126);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(11+16*c5)+(ptrdiff_t)0, 63>>cut4, wt127);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(12+16*c5)+(ptrdiff_t)0, 63>>cut4, wt128);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(13+16*c5)+(ptrdiff_t)0, 63>>cut4, wt129);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(14+16*c5)+(ptrdiff_t)0, 63>>cut4, wt130);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(15+16*c5)+(ptrdiff_t)0, 63>>cut4, wt131);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(16+16*c5)+(ptrdiff_t)0, 63>>cut4, wt132);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(1+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt117);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(2+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt118);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(3+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt119);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(4+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt120);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(5+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt121);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(6+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt122);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(7+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt123);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(8+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt124);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(9+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt125);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(10+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt126);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(11+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt127);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(12+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt128);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(13+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt129);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(14+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt130);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(15+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt131);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(16+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt132);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(1+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt117);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(2+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt118);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(3+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt119);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(4+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt120);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(5+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt121);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(6+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt122);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(7+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt123);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(8+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt124);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(9+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt125);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(10+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt126);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(11+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt127);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(12+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt128);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(13+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt129);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(14+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt130);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(15+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt131);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(16+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt132);
}
__m512 wt133 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)0);
__m512 wt134 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)13972);
__m512 wt135 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)27944);
__m512 wt136 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)41916);
__m512 wt137 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)55888);
__m512 wt138 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)69860);
__m512 wt139 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)83832);
__m512 wt140 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)97804);
__m512 wt141 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)111776);
__m512 wt142 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)125748);
__m512 wt143 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)139720);
__m512 wt144 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)153692);
__m512 wt145 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)167664);
__m512 wt146 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)181636);
__m512 wt147 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)195608);
__m512 wt148 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c5+(ptrdiff_t)209580);
__m512 tmp254 = _mm512_unpacklo_ps(wt133, wt134);
__m512 tmp255 = _mm512_unpackhi_ps(wt133, wt134);
__m512 tmp256 = _mm512_unpacklo_ps(wt135, wt136);
__m512 tmp257 = _mm512_unpackhi_ps(wt135, wt136);
__m512 tmp258 = _mm512_unpacklo_ps(wt137, wt138);
__m512 tmp259 = _mm512_unpackhi_ps(wt137, wt138);
__m512 tmp260 = _mm512_unpacklo_ps(wt139, wt140);
__m512 tmp261 = _mm512_unpackhi_ps(wt139, wt140);
__m512 tmp262 = _mm512_unpacklo_ps(wt141, wt142);
__m512 tmp263 = _mm512_unpackhi_ps(wt141, wt142);
__m512 tmp264 = _mm512_unpacklo_ps(wt143, wt144);
__m512 tmp265 = _mm512_unpackhi_ps(wt143, wt144);
__m512 tmp266 = _mm512_unpacklo_ps(wt145, wt146);
__m512 tmp267 = _mm512_unpackhi_ps(wt145, wt146);
__m512 tmp268 = _mm512_unpacklo_ps(wt147, wt148);
__m512 tmp269 = _mm512_unpackhi_ps(wt147, wt148);
__m512 tmp270 = _mm512_shuffle_ps(tmp254, tmp256, 68);
__m512 tmp271 = _mm512_shuffle_ps(tmp254, tmp256, 238);
__m512 tmp272 = _mm512_shuffle_ps(tmp255, tmp257, 68);
__m512 tmp273 = _mm512_shuffle_ps(tmp255, tmp257, 238);
__m512 tmp274 = _mm512_shuffle_ps(tmp258, tmp260, 68);
__m512 tmp275 = _mm512_shuffle_ps(tmp258, tmp260, 238);
__m512 tmp276 = _mm512_shuffle_ps(tmp259, tmp261, 68);
__m512 tmp277 = _mm512_shuffle_ps(tmp259, tmp261, 238);
__m512 tmp278 = _mm512_shuffle_ps(tmp262, tmp264, 68);
__m512 tmp279 = _mm512_shuffle_ps(tmp262, tmp264, 238);
__m512 tmp280 = _mm512_shuffle_ps(tmp263, tmp265, 68);
__m512 tmp281 = _mm512_shuffle_ps(tmp263, tmp265, 238);
__m512 tmp282 = _mm512_shuffle_ps(tmp266, tmp268, 68);
__m512 tmp283 = _mm512_shuffle_ps(tmp266, tmp268, 238);
__m512 tmp284 = _mm512_shuffle_ps(tmp267, tmp269, 68);
__m512 tmp285 = _mm512_shuffle_ps(tmp267, tmp269, 238);
__m512 tmp286 = _mm512_shuffle_f32x4(tmp270, tmp274, 136);
__m512 tmp287 = _mm512_shuffle_f32x4(tmp270, tmp274, 221);
__m512 tmp288 = _mm512_shuffle_f32x4(tmp271, tmp275, 136);
__m512 tmp289 = _mm512_shuffle_f32x4(tmp271, tmp275, 221);
__m512 tmp290 = _mm512_shuffle_f32x4(tmp272, tmp276, 136);
__m512 tmp291 = _mm512_shuffle_f32x4(tmp272, tmp276, 221);
__m512 tmp292 = _mm512_shuffle_f32x4(tmp273, tmp277, 136);
__m512 tmp293 = _mm512_shuffle_f32x4(tmp273, tmp277, 221);
__m512 tmp294 = _mm512_shuffle_f32x4(tmp278, tmp282, 136);
__m512 tmp295 = _mm512_shuffle_f32x4(tmp278, tmp282, 221);
__m512 tmp296 = _mm512_shuffle_f32x4(tmp279, tmp283, 136);
__m512 tmp297 = _mm512_shuffle_f32x4(tmp279, tmp283, 221);
__m512 tmp298 = _mm512_shuffle_f32x4(tmp280, tmp284, 136);
__m512 tmp299 = _mm512_shuffle_f32x4(tmp280, tmp284, 221);
__m512 tmp300 = _mm512_shuffle_f32x4(tmp281, tmp285, 136);
__m512 tmp301 = _mm512_shuffle_f32x4(tmp281, tmp285, 221);
wt133 = _mm512_shuffle_f32x4(tmp286, tmp294, 136);
wt141 = _mm512_shuffle_f32x4(tmp286, tmp294, 221);
wt134 = _mm512_shuffle_f32x4(tmp288, tmp296, 136);
wt142 = _mm512_shuffle_f32x4(tmp288, tmp296, 221);
wt135 = _mm512_shuffle_f32x4(tmp290, tmp298, 136);
wt143 = _mm512_shuffle_f32x4(tmp290, tmp298, 221);
wt136 = _mm512_shuffle_f32x4(tmp292, tmp300, 136);
wt144 = _mm512_shuffle_f32x4(tmp292, tmp300, 221);
wt137 = _mm512_shuffle_f32x4(tmp287, tmp295, 136);
wt138 = _mm512_shuffle_f32x4(tmp289, tmp297, 136);
wt139 = _mm512_shuffle_f32x4(tmp291, tmp299, 136);
wt140 = _mm512_shuffle_f32x4(tmp293, tmp301, 136);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(1+16*c5)+(ptrdiff_t)0, 63>>cut4, wt133);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(2+16*c5)+(ptrdiff_t)0, 63>>cut4, wt134);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(3+16*c5)+(ptrdiff_t)0, 63>>cut4, wt135);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(4+16*c5)+(ptrdiff_t)0, 63>>cut4, wt136);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(5+16*c5)+(ptrdiff_t)0, 63>>cut4, wt137);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(6+16*c5)+(ptrdiff_t)0, 63>>cut4, wt138);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(7+16*c5)+(ptrdiff_t)0, 63>>cut4, wt139);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(8+16*c5)+(ptrdiff_t)0, 63>>cut4, wt140);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(9+16*c5)+(ptrdiff_t)0, 63>>cut4, wt141);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(10+16*c5)+(ptrdiff_t)0, 63>>cut4, wt142);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(11+16*c5)+(ptrdiff_t)0, 63>>cut4, wt143);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(12+16*c5)+(ptrdiff_t)0, 63>>cut4, wt144);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(1+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt133);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(2+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt134);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(3+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt135);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(4+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt136);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(5+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt137);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(6+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt138);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(7+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt139);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(8+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt140);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(9+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt141);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(10+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt142);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(11+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt143);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(12+16*c5)+(ptrdiff_t)23712, 4032>>cut4, wt144);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(1+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt133);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(2+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt134);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(3+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt135);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(4+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt136);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(5+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt137);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(6+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt138);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(7+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt139);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(8+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt140);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(9+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt141);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(10+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt142);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(11+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt143);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(12+16*c5)+(ptrdiff_t)47424, 65535-(4095>>cut4), wt144);
break;
}
default: {
cut4 = 4;
__m512 sum7 = _mm512_setzero_ps();
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*0+(ptrdiff_t)0, 63>>cut4, sum7);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*0+(ptrdiff_t)23712, 4032>>cut4, sum7);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*0+(ptrdiff_t)47424, 258048>>cut4, sum7);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*0+(ptrdiff_t)71136, 65535-(262143>>cut4), sum7);
ptrdiff_t c6 = 0;
for (; c6 != 61; ++c6) {
__m512 wt149 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)0);
__m512 wt150 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)13972);
__m512 wt151 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)27944);
__m512 wt152 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)41916);
__m512 wt153 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)55888);
__m512 wt154 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)69860);
__m512 wt155 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)83832);
__m512 wt156 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)97804);
__m512 wt157 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)111776);
__m512 wt158 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)125748);
__m512 wt159 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)139720);
__m512 wt160 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)153692);
__m512 wt161 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)167664);
__m512 wt162 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)181636);
__m512 wt163 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)195608);
__m512 wt164 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)209580);
__m512 tmp302 = _mm512_unpacklo_ps(wt149, wt150);
__m512 tmp303 = _mm512_unpackhi_ps(wt149, wt150);
__m512 tmp304 = _mm512_unpacklo_ps(wt151, wt152);
__m512 tmp305 = _mm512_unpackhi_ps(wt151, wt152);
__m512 tmp306 = _mm512_unpacklo_ps(wt153, wt154);
__m512 tmp307 = _mm512_unpackhi_ps(wt153, wt154);
__m512 tmp308 = _mm512_unpacklo_ps(wt155, wt156);
__m512 tmp309 = _mm512_unpackhi_ps(wt155, wt156);
__m512 tmp310 = _mm512_unpacklo_ps(wt157, wt158);
__m512 tmp311 = _mm512_unpackhi_ps(wt157, wt158);
__m512 tmp312 = _mm512_unpacklo_ps(wt159, wt160);
__m512 tmp313 = _mm512_unpackhi_ps(wt159, wt160);
__m512 tmp314 = _mm512_unpacklo_ps(wt161, wt162);
__m512 tmp315 = _mm512_unpackhi_ps(wt161, wt162);
__m512 tmp316 = _mm512_unpacklo_ps(wt163, wt164);
__m512 tmp317 = _mm512_unpackhi_ps(wt163, wt164);
__m512 tmp318 = _mm512_shuffle_ps(tmp302, tmp304, 68);
__m512 tmp319 = _mm512_shuffle_ps(tmp302, tmp304, 238);
__m512 tmp320 = _mm512_shuffle_ps(tmp303, tmp305, 68);
__m512 tmp321 = _mm512_shuffle_ps(tmp303, tmp305, 238);
__m512 tmp322 = _mm512_shuffle_ps(tmp306, tmp308, 68);
__m512 tmp323 = _mm512_shuffle_ps(tmp306, tmp308, 238);
__m512 tmp324 = _mm512_shuffle_ps(tmp307, tmp309, 68);
__m512 tmp325 = _mm512_shuffle_ps(tmp307, tmp309, 238);
__m512 tmp326 = _mm512_shuffle_ps(tmp310, tmp312, 68);
__m512 tmp327 = _mm512_shuffle_ps(tmp310, tmp312, 238);
__m512 tmp328 = _mm512_shuffle_ps(tmp311, tmp313, 68);
__m512 tmp329 = _mm512_shuffle_ps(tmp311, tmp313, 238);
__m512 tmp330 = _mm512_shuffle_ps(tmp314, tmp316, 68);
__m512 tmp331 = _mm512_shuffle_ps(tmp314, tmp316, 238);
__m512 tmp332 = _mm512_shuffle_ps(tmp315, tmp317, 68);
__m512 tmp333 = _mm512_shuffle_ps(tmp315, tmp317, 238);
__m512 tmp334 = _mm512_shuffle_f32x4(tmp318, tmp322, 136);
__m512 tmp335 = _mm512_shuffle_f32x4(tmp318, tmp322, 221);
__m512 tmp336 = _mm512_shuffle_f32x4(tmp319, tmp323, 136);
__m512 tmp337 = _mm512_shuffle_f32x4(tmp319, tmp323, 221);
__m512 tmp338 = _mm512_shuffle_f32x4(tmp320, tmp324, 136);
__m512 tmp339 = _mm512_shuffle_f32x4(tmp320, tmp324, 221);
__m512 tmp340 = _mm512_shuffle_f32x4(tmp321, tmp325, 136);
__m512 tmp341 = _mm512_shuffle_f32x4(tmp321, tmp325, 221);
__m512 tmp342 = _mm512_shuffle_f32x4(tmp326, tmp330, 136);
__m512 tmp343 = _mm512_shuffle_f32x4(tmp326, tmp330, 221);
__m512 tmp344 = _mm512_shuffle_f32x4(tmp327, tmp331, 136);
__m512 tmp345 = _mm512_shuffle_f32x4(tmp327, tmp331, 221);
__m512 tmp346 = _mm512_shuffle_f32x4(tmp328, tmp332, 136);
__m512 tmp347 = _mm512_shuffle_f32x4(tmp328, tmp332, 221);
__m512 tmp348 = _mm512_shuffle_f32x4(tmp329, tmp333, 136);
__m512 tmp349 = _mm512_shuffle_f32x4(tmp329, tmp333, 221);
wt149 = _mm512_shuffle_f32x4(tmp334, tmp342, 136);
wt157 = _mm512_shuffle_f32x4(tmp334, tmp342, 221);
wt150 = _mm512_shuffle_f32x4(tmp336, tmp344, 136);
wt158 = _mm512_shuffle_f32x4(tmp336, tmp344, 221);
wt151 = _mm512_shuffle_f32x4(tmp338, tmp346, 136);
wt159 = _mm512_shuffle_f32x4(tmp338, tmp346, 221);
wt152 = _mm512_shuffle_f32x4(tmp340, tmp348, 136);
wt160 = _mm512_shuffle_f32x4(tmp340, tmp348, 221);
wt153 = _mm512_shuffle_f32x4(tmp335, tmp343, 136);
wt161 = _mm512_shuffle_f32x4(tmp335, tmp343, 221);
wt154 = _mm512_shuffle_f32x4(tmp337, tmp345, 136);
wt162 = _mm512_shuffle_f32x4(tmp337, tmp345, 221);
wt155 = _mm512_shuffle_f32x4(tmp339, tmp347, 136);
wt163 = _mm512_shuffle_f32x4(tmp339, tmp347, 221);
wt156 = _mm512_shuffle_f32x4(tmp341, tmp349, 136);
wt164 = _mm512_shuffle_f32x4(tmp341, tmp349, 221);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(1+16*c6)+(ptrdiff_t)0, 63>>cut4, wt149);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(2+16*c6)+(ptrdiff_t)0, 63>>cut4, wt150);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(3+16*c6)+(ptrdiff_t)0, 63>>cut4, wt151);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(4+16*c6)+(ptrdiff_t)0, 63>>cut4, wt152);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(5+16*c6)+(ptrdiff_t)0, 63>>cut4, wt153);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(6+16*c6)+(ptrdiff_t)0, 63>>cut4, wt154);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(7+16*c6)+(ptrdiff_t)0, 63>>cut4, wt155);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(8+16*c6)+(ptrdiff_t)0, 63>>cut4, wt156);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(9+16*c6)+(ptrdiff_t)0, 63>>cut4, wt157);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(10+16*c6)+(ptrdiff_t)0, 63>>cut4, wt158);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(11+16*c6)+(ptrdiff_t)0, 63>>cut4, wt159);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(12+16*c6)+(ptrdiff_t)0, 63>>cut4, wt160);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(13+16*c6)+(ptrdiff_t)0, 63>>cut4, wt161);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(14+16*c6)+(ptrdiff_t)0, 63>>cut4, wt162);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(15+16*c6)+(ptrdiff_t)0, 63>>cut4, wt163);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(16+16*c6)+(ptrdiff_t)0, 63>>cut4, wt164);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(1+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt149);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(2+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt150);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(3+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt151);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(4+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt152);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(5+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt153);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(6+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt154);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(7+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt155);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(8+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt156);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(9+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt157);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(10+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt158);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(11+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt159);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(12+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt160);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(13+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt161);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(14+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt162);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(15+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt163);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(16+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt164);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(1+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt149);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(2+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt150);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(3+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt151);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(4+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt152);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(5+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt153);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(6+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt154);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(7+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt155);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(8+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt156);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(9+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt157);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(10+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt158);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(11+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt159);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(12+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt160);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(13+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt161);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(14+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt162);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(15+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt163);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(16+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt164);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(1+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt149);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(2+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt150);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(3+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt151);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(4+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt152);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(5+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt153);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(6+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt154);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(7+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt155);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(8+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt156);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(9+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt157);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(10+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt158);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(11+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt159);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(12+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt160);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(13+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt161);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(14+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt162);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(15+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt163);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(16+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt164);
}
__m512 wt165 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)0);
__m512 wt166 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)13972);
__m512 wt167 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)27944);
__m512 wt168 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)41916);
__m512 wt169 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)55888);
__m512 wt170 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)69860);
__m512 wt171 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)83832);
__m512 wt172 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)97804);
__m512 wt173 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)111776);
__m512 wt174 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)125748);
__m512 wt175 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)139720);
__m512 wt176 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)153692);
__m512 wt177 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)167664);
__m512 wt178 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)181636);
__m512 wt179 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)195608);
__m512 wt180 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k4+64*c6+(ptrdiff_t)209580);
__m512 tmp350 = _mm512_unpacklo_ps(wt165, wt166);
__m512 tmp351 = _mm512_unpackhi_ps(wt165, wt166);
__m512 tmp352 = _mm512_unpacklo_ps(wt167, wt168);
__m512 tmp353 = _mm512_unpackhi_ps(wt167, wt168);
__m512 tmp354 = _mm512_unpacklo_ps(wt169, wt170);
__m512 tmp355 = _mm512_unpackhi_ps(wt169, wt170);
__m512 tmp356 = _mm512_unpacklo_ps(wt171, wt172);
__m512 tmp357 = _mm512_unpackhi_ps(wt171, wt172);
__m512 tmp358 = _mm512_unpacklo_ps(wt173, wt174);
__m512 tmp359 = _mm512_unpackhi_ps(wt173, wt174);
__m512 tmp360 = _mm512_unpacklo_ps(wt175, wt176);
__m512 tmp361 = _mm512_unpackhi_ps(wt175, wt176);
__m512 tmp362 = _mm512_unpacklo_ps(wt177, wt178);
__m512 tmp363 = _mm512_unpackhi_ps(wt177, wt178);
__m512 tmp364 = _mm512_unpacklo_ps(wt179, wt180);
__m512 tmp365 = _mm512_unpackhi_ps(wt179, wt180);
__m512 tmp366 = _mm512_shuffle_ps(tmp350, tmp352, 68);
__m512 tmp367 = _mm512_shuffle_ps(tmp350, tmp352, 238);
__m512 tmp368 = _mm512_shuffle_ps(tmp351, tmp353, 68);
__m512 tmp369 = _mm512_shuffle_ps(tmp351, tmp353, 238);
__m512 tmp370 = _mm512_shuffle_ps(tmp354, tmp356, 68);
__m512 tmp371 = _mm512_shuffle_ps(tmp354, tmp356, 238);
__m512 tmp372 = _mm512_shuffle_ps(tmp355, tmp357, 68);
__m512 tmp373 = _mm512_shuffle_ps(tmp355, tmp357, 238);
__m512 tmp374 = _mm512_shuffle_ps(tmp358, tmp360, 68);
__m512 tmp375 = _mm512_shuffle_ps(tmp358, tmp360, 238);
__m512 tmp376 = _mm512_shuffle_ps(tmp359, tmp361, 68);
__m512 tmp377 = _mm512_shuffle_ps(tmp359, tmp361, 238);
__m512 tmp378 = _mm512_shuffle_ps(tmp362, tmp364, 68);
__m512 tmp379 = _mm512_shuffle_ps(tmp362, tmp364, 238);
__m512 tmp380 = _mm512_shuffle_ps(tmp363, tmp365, 68);
__m512 tmp381 = _mm512_shuffle_ps(tmp363, tmp365, 238);
__m512 tmp382 = _mm512_shuffle_f32x4(tmp366, tmp370, 136);
__m512 tmp383 = _mm512_shuffle_f32x4(tmp366, tmp370, 221);
__m512 tmp384 = _mm512_shuffle_f32x4(tmp367, tmp371, 136);
__m512 tmp385 = _mm512_shuffle_f32x4(tmp367, tmp371, 221);
__m512 tmp386 = _mm512_shuffle_f32x4(tmp368, tmp372, 136);
__m512 tmp387 = _mm512_shuffle_f32x4(tmp368, tmp372, 221);
__m512 tmp388 = _mm512_shuffle_f32x4(tmp369, tmp373, 136);
__m512 tmp389 = _mm512_shuffle_f32x4(tmp369, tmp373, 221);
__m512 tmp390 = _mm512_shuffle_f32x4(tmp374, tmp378, 136);
__m512 tmp391 = _mm512_shuffle_f32x4(tmp374, tmp378, 221);
__m512 tmp392 = _mm512_shuffle_f32x4(tmp375, tmp379, 136);
__m512 tmp393 = _mm512_shuffle_f32x4(tmp375, tmp379, 221);
__m512 tmp394 = _mm512_shuffle_f32x4(tmp376, tmp380, 136);
__m512 tmp395 = _mm512_shuffle_f32x4(tmp376, tmp380, 221);
__m512 tmp396 = _mm512_shuffle_f32x4(tmp377, tmp381, 136);
__m512 tmp397 = _mm512_shuffle_f32x4(tmp377, tmp381, 221);
wt165 = _mm512_shuffle_f32x4(tmp382, tmp390, 136);
wt173 = _mm512_shuffle_f32x4(tmp382, tmp390, 221);
wt166 = _mm512_shuffle_f32x4(tmp384, tmp392, 136);
wt174 = _mm512_shuffle_f32x4(tmp384, tmp392, 221);
wt167 = _mm512_shuffle_f32x4(tmp386, tmp394, 136);
wt175 = _mm512_shuffle_f32x4(tmp386, tmp394, 221);
wt168 = _mm512_shuffle_f32x4(tmp388, tmp396, 136);
wt176 = _mm512_shuffle_f32x4(tmp388, tmp396, 221);
wt169 = _mm512_shuffle_f32x4(tmp383, tmp391, 136);
wt170 = _mm512_shuffle_f32x4(tmp385, tmp393, 136);
wt171 = _mm512_shuffle_f32x4(tmp387, tmp395, 136);
wt172 = _mm512_shuffle_f32x4(tmp389, tmp397, 136);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(1+16*c6)+(ptrdiff_t)0, 63>>cut4, wt165);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(2+16*c6)+(ptrdiff_t)0, 63>>cut4, wt166);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(3+16*c6)+(ptrdiff_t)0, 63>>cut4, wt167);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(4+16*c6)+(ptrdiff_t)0, 63>>cut4, wt168);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(5+16*c6)+(ptrdiff_t)0, 63>>cut4, wt169);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(6+16*c6)+(ptrdiff_t)0, 63>>cut4, wt170);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(7+16*c6)+(ptrdiff_t)0, 63>>cut4, wt171);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(8+16*c6)+(ptrdiff_t)0, 63>>cut4, wt172);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(9+16*c6)+(ptrdiff_t)0, 63>>cut4, wt173);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(10+16*c6)+(ptrdiff_t)0, 63>>cut4, wt174);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(11+16*c6)+(ptrdiff_t)0, 63>>cut4, wt175);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(12+16*c6)+(ptrdiff_t)0, 63>>cut4, wt176);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(1+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt165);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(2+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt166);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(3+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt167);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(4+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt168);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(5+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt169);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(6+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt170);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(7+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt171);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(8+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt172);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(9+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt173);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(10+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt174);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(11+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt175);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(12+16*c6)+(ptrdiff_t)23712, 4032>>cut4, wt176);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(1+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt165);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(2+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt166);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(3+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt167);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(4+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt168);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(5+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt169);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(6+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt170);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(7+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt171);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(8+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt172);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(9+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt173);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(10+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt174);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(11+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt175);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(12+16*c6)+(ptrdiff_t)47424, 258048>>cut4, wt176);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(1+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt165);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(2+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt166);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(3+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt167);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(4+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt168);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(5+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt169);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(6+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt170);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(7+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt171);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(8+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt172);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(9+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt173);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(10+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt174);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(11+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt175);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l4+4*cut4+24*(12+16*c6)+(ptrdiff_t)71136, 65535-(262143>>cut4), wt176);
}
}
} else {
ptrdiff_t k3 = 2688;
ptrdiff_t l3 = (size_t)(0+k3)/6;
ptrdiff_t cut3 = (size_t)(0+k3)%6;
__m512 sum5 = _mm512_setzero_ps();
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*0+(ptrdiff_t)0, 63>>cut3, sum5);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*0+(ptrdiff_t)23712, 255-(63>>cut3), sum5);
ptrdiff_t c4 = 0;
for (; c4 != 61; ++c4) {
__m512 wt89 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k3+64*c4+(ptrdiff_t)0);
__m512 wt90 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k3+64*c4+(ptrdiff_t)13972);
__m512 wt91 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k3+64*c4+(ptrdiff_t)27944);
__m512 wt92 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k3+64*c4+(ptrdiff_t)41916);
__m512 wt93 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k3+64*c4+(ptrdiff_t)55888);
__m512 wt94 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k3+64*c4+(ptrdiff_t)69860);
__m512 wt95 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k3+64*c4+(ptrdiff_t)83832);
__m512 wt96 = _mm512_maskz_loadu_ps(65535, wtPtr2+37668512*i6+13972*k3+64*c4+(ptrdiff_t)97804);
__m512 tmp398 = _mm512_unpacklo_ps(wt89, wt90);
__m512 tmp399 = _mm512_unpackhi_ps(wt89, wt90);
__m512 tmp400 = _mm512_unpacklo_ps(wt91, wt92);
__m512 tmp401 = _mm512_unpackhi_ps(wt91, wt92);
__m512 tmp402 = _mm512_unpacklo_ps(wt93, wt94);
__m512 tmp403 = _mm512_unpackhi_ps(wt93, wt94);
__m512 tmp404 = _mm512_unpacklo_ps(wt95, wt96);
__m512 tmp405 = _mm512_unpackhi_ps(wt95, wt96);
__m512 tmp406 = _mm512_shuffle_ps(tmp398, tmp400, 68);
__m512 tmp407 = _mm512_shuffle_ps(tmp398, tmp400, 238);
__m512 tmp408 = _mm512_shuffle_ps(tmp399, tmp401, 68);
__m512 tmp409 = _mm512_shuffle_ps(tmp399, tmp401, 238);
__m512 tmp410 = _mm512_shuffle_ps(tmp402, tmp404, 68);
__m512 tmp411 = _mm512_shuffle_ps(tmp402, tmp404, 238);
__m512 tmp412 = _mm512_shuffle_ps(tmp403, tmp405, 68);
__m512 tmp413 = _mm512_shuffle_ps(tmp403, tmp405, 238);
__m512 tmp414 = _mm512_shuffle_f32x4(tmp406, tmp410, 136);
__m512 tmp415 = _mm512_shuffle_f32x4(tmp406, tmp410, 221);
__m512 tmp416 = _mm512_shuffle_f32x4(tmp407, tmp411, 136);
__m512 tmp417 = _mm512_shuffle_f32x4(tmp407, tmp411, 221);
__m512 tmp418 = _mm512_shuffle_f32x4(tmp408, tmp412, 136);
__m512 tmp419 = _mm512_shuffle_f32x4(tmp408, tmp412, 221);
__m512 tmp420 = _mm512_shuffle_f32x4(tmp409, tmp413, 136);
__m512 tmp421 = _mm512_shuffle_f32x4(tmp409, tmp413, 221);
wt89 = _mm512_shuffle_f32x4(tmp414, tmp414, 136);
__m512 wt97 = _mm512_shuffle_f32x4(tmp414, tmp414, 221);
wt90 = _mm512_shuffle_f32x4(tmp416, tmp416, 136);
__m512 wt98 = _mm512_shuffle_f32x4(tmp416, tmp416, 221);
wt91 = _mm512_shuffle_f32x4(tmp418, tmp418, 136);
__m512 wt99 = _mm512_shuffle_f32x4(tmp418, tmp418, 221);
wt92 = _mm512_shuffle_f32x4(tmp420, tmp420, 136);
__m512 wt100 = _mm512_shuffle_f32x4(tmp420, tmp420, 221);
wt93 = _mm512_shuffle_f32x4(tmp415, tmp415, 136);
__m512 wt101 = _mm512_shuffle_f32x4(tmp415, tmp415, 221);
wt94 = _mm512_shuffle_f32x4(tmp417, tmp417, 136);
__m512 wt102 = _mm512_shuffle_f32x4(tmp417, tmp417, 221);
wt95 = _mm512_shuffle_f32x4(tmp419, tmp419, 136);
__m512 wt103 = _mm512_shuffle_f32x4(tmp419, tmp419, 221);
wt96 = _mm512_shuffle_f32x4(tmp421, tmp421, 136);
__m512 wt104 = _mm512_shuffle_f32x4(tmp421, tmp421, 221);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(1+16*c4)+(ptrdiff_t)0, 63>>cut3, wt89);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(2+16*c4)+(ptrdiff_t)0, 63>>cut3, wt90);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(3+16*c4)+(ptrdiff_t)0, 63>>cut3, wt91);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(4+16*c4)+(ptrdiff_t)0, 63>>cut3, wt92);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(5+16*c4)+(ptrdiff_t)0, 63>>cut3, wt93);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(6+16*c4)+(ptrdiff_t)0, 63>>cut3, wt94);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(7+16*c4)+(ptrdiff_t)0, 63>>cut3, wt95);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(8+16*c4)+(ptrdiff_t)0, 63>>cut3, wt96);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(9+16*c4)+(ptrdiff_t)0, 63>>cut3, wt97);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(10+16*c4)+(ptrdiff_t)0, 63>>cut3, wt98);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(11+16*c4)+(ptrdiff_t)0, 63>>cut3, wt99);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(12+16*c4)+(ptrdiff_t)0, 63>>cut3, wt100);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(13+16*c4)+(ptrdiff_t)0, 63>>cut3, wt101);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(14+16*c4)+(ptrdiff_t)0, 63>>cut3, wt102);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(15+16*c4)+(ptrdiff_t)0, 63>>cut3, wt103);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(16+16*c4)+(ptrdiff_t)0, 63>>cut3, wt104);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(1+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt89);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(2+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt90);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(3+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt91);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(4+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt92);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(5+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt93);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(6+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt94);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(7+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt95);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(8+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt96);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(9+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt97);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(10+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt98);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(11+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt99);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(12+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt100);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(13+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt101);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(14+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt102);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(15+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt103);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(16+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt104);
}
__m512 wt105 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k3+64*c4+(ptrdiff_t)0);
__m512 wt106 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k3+64*c4+(ptrdiff_t)13972);
__m512 wt107 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k3+64*c4+(ptrdiff_t)27944);
__m512 wt108 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k3+64*c4+(ptrdiff_t)41916);
__m512 wt109 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k3+64*c4+(ptrdiff_t)55888);
__m512 wt110 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k3+64*c4+(ptrdiff_t)69860);
__m512 wt111 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k3+64*c4+(ptrdiff_t)83832);
__m512 wt112 = _mm512_maskz_loadu_ps(4095, wtPtr2+37668512*i6+13972*k3+64*c4+(ptrdiff_t)97804);
__m512 tmp422 = _mm512_unpacklo_ps(wt105, wt106);
__m512 tmp423 = _mm512_unpackhi_ps(wt105, wt106);
__m512 tmp424 = _mm512_unpacklo_ps(wt107, wt108);
__m512 tmp425 = _mm512_unpackhi_ps(wt107, wt108);
__m512 tmp426 = _mm512_unpacklo_ps(wt109, wt110);
__m512 tmp427 = _mm512_unpackhi_ps(wt109, wt110);
__m512 tmp428 = _mm512_unpacklo_ps(wt111, wt112);
__m512 tmp429 = _mm512_unpackhi_ps(wt111, wt112);
__m512 tmp430 = _mm512_shuffle_ps(tmp422, tmp424, 68);
__m512 tmp431 = _mm512_shuffle_ps(tmp422, tmp424, 238);
__m512 tmp432 = _mm512_shuffle_ps(tmp423, tmp425, 68);
__m512 tmp433 = _mm512_shuffle_ps(tmp423, tmp425, 238);
__m512 tmp434 = _mm512_shuffle_ps(tmp426, tmp428, 68);
__m512 tmp435 = _mm512_shuffle_ps(tmp426, tmp428, 238);
__m512 tmp436 = _mm512_shuffle_ps(tmp427, tmp429, 68);
__m512 tmp437 = _mm512_shuffle_ps(tmp427, tmp429, 238);
__m512 tmp438 = _mm512_shuffle_f32x4(tmp430, tmp434, 136);
__m512 tmp439 = _mm512_shuffle_f32x4(tmp430, tmp434, 221);
__m512 tmp440 = _mm512_shuffle_f32x4(tmp431, tmp435, 136);
__m512 tmp441 = _mm512_shuffle_f32x4(tmp431, tmp435, 221);
__m512 tmp442 = _mm512_shuffle_f32x4(tmp432, tmp436, 136);
__m512 tmp443 = _mm512_shuffle_f32x4(tmp432, tmp436, 221);
__m512 tmp444 = _mm512_shuffle_f32x4(tmp433, tmp437, 136);
__m512 tmp445 = _mm512_shuffle_f32x4(tmp433, tmp437, 221);
wt105 = _mm512_shuffle_f32x4(tmp438, tmp438, 136);
__m512 wt113 = _mm512_shuffle_f32x4(tmp438, tmp438, 221);
wt106 = _mm512_shuffle_f32x4(tmp440, tmp440, 136);
__m512 wt114 = _mm512_shuffle_f32x4(tmp440, tmp440, 221);
wt107 = _mm512_shuffle_f32x4(tmp442, tmp442, 136);
__m512 wt115 = _mm512_shuffle_f32x4(tmp442, tmp442, 221);
wt108 = _mm512_shuffle_f32x4(tmp444, tmp444, 136);
__m512 wt116 = _mm512_shuffle_f32x4(tmp444, tmp444, 221);
wt109 = _mm512_shuffle_f32x4(tmp439, tmp439, 136);
wt110 = _mm512_shuffle_f32x4(tmp441, tmp441, 136);
wt111 = _mm512_shuffle_f32x4(tmp443, tmp443, 136);
wt112 = _mm512_shuffle_f32x4(tmp445, tmp445, 136);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(1+16*c4)+(ptrdiff_t)0, 63>>cut3, wt105);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(2+16*c4)+(ptrdiff_t)0, 63>>cut3, wt106);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(3+16*c4)+(ptrdiff_t)0, 63>>cut3, wt107);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(4+16*c4)+(ptrdiff_t)0, 63>>cut3, wt108);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(5+16*c4)+(ptrdiff_t)0, 63>>cut3, wt109);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(6+16*c4)+(ptrdiff_t)0, 63>>cut3, wt110);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(7+16*c4)+(ptrdiff_t)0, 63>>cut3, wt111);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(8+16*c4)+(ptrdiff_t)0, 63>>cut3, wt112);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(9+16*c4)+(ptrdiff_t)0, 63>>cut3, wt113);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(10+16*c4)+(ptrdiff_t)0, 63>>cut3, wt114);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(11+16*c4)+(ptrdiff_t)0, 63>>cut3, wt115);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+24*(12+16*c4)+(ptrdiff_t)0, 63>>cut3, wt116);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(1+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt105);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(2+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt106);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(3+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt107);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(4+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt108);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(5+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt109);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(6+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt110);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(7+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt111);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(8+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt112);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(9+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt113);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(10+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt114);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(11+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt115);
_mm512_mask_storeu_ps(arranged2+10665376*i6+23736*l3+4*cut3+8*(12+16*c4)+(ptrdiff_t)23712, 255-(63>>cut3), wt116);
}
}
}
}

// Hands the Example17 weight-arrangement work to a threader team.
//
// team13:   passed through unchanged as the first argument of
//           Example17ThreaderDo1.
// tensors1: stored in the task's any1 field.
//
// The task is declared with 3 dimensions whose extents are 169, 1 and 4,
// and its callee is Example17OneArrangeWts1Callee1.
// NOTE(review): presumably the threader invokes the callee once per point
// of that 169 x 1 x 4 grid -- confirm against Example17ThreaderDo1.
static void Example17OneArrangeWts1(Example17ThreaderTeam1* team13, char** tensors1) {
    Example17ThreaderTask1 wtsTask;

    // Extents of the iteration space, one entry per dimension.
    wtsTask.hull1[0] = 169;
    wtsTask.hull1[1] = 1;
    wtsTask.hull1[2] = 4;
    wtsTask.nd1 = 3;

    // Work function and the argument it is given via the task.
    wtsTask.any1 = tensors1;
    wtsTask.callee1 = Example17OneArrangeWts1Callee1;

    Example17ThreaderDo1(team13, &wtsTask);
}

static void Example17OneArrangeDats1Callee1(Example17ThreaderTask1* task6, int64_t* pt8) {
char** tensors4 = task6->any1;
ptrdiff_t s1 = pt8[0];
ptrdiff_t c7 = pt8[1];
ptrdiff_t e2 = pt8[3];
if (e2 < 3) {
char*restrict datPtr1 = tensors4[0]+(ptrdiff_t)0+(ptrdiff_t)36192240*e2+(ptrdiff_t)151400592*0;
char*restrict arranged3 = tensors4[1]+(ptrdiff_t)2725440*e2+(ptrdiff_t)2725440*0;
ptrdiff_t ii3 = 1;
for (ptrdiff_t i7 = 0; i7 < ii3; ++i7) {
ptrdiff_t j3 = 1*c7;
ptrdiff_t jj3 = j3+0;
if (j3 < 12) {
ptrdiff_t h1 = 0+((size_t)j3-0)/2*20;
switch (((size_t)j3-0)%2) {
case 0: {
wrap3:;
ptrdiff_t k5 = 139*s1;
ptrdiff_t kk1 = k5+(s1 < 5 ? 139 : 140);
for (; k5 < kk1; ++k5) {
__m512 dat1 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k5+(ptrdiff_t)0);
__m512 dat2 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k5+(ptrdiff_t)64);
__m512i pm1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat3 = _mm512_permutex2var_ps(dat1, pm1, dat2);
__m512 dat4 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k5+(ptrdiff_t)128);
__m512 dat5 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k5+(ptrdiff_t)192);
__m512i pm2 = _mm512_set_epi32(28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 dat6 = _mm512_permutex2var_ps(dat4, pm2, dat5);
dat3 = _mm512_mask_mov_ps(dat3, 65280, dat6);
__m512 dat7 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k5+(ptrdiff_t)256);
__m512 dat8 = _mm512_maskz_loadu_ps(31, datPtr1+151400592*i7+344*h1+43344*k5+(ptrdiff_t)320);
__m512i pm3 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat9 = _mm512_permutex2var_ps(dat7, pm3, dat8);
__m512 dat10 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k5+(ptrdiff_t)1376);
__m512 dat11 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k5+(ptrdiff_t)1440);
__m512i pm4 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat12 = _mm512_permutex2var_ps(dat10, pm4, dat11);
__m512 dat13 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k5+(ptrdiff_t)1504);
__m512 dat14 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k5+(ptrdiff_t)1568);
__m512i pm5 = _mm512_set_epi32(28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 dat15 = _mm512_permutex2var_ps(dat13, pm5, dat14);
dat12 = _mm512_mask_mov_ps(dat12, 65280, dat15);
__m512 dat16 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k5+(ptrdiff_t)1632);
__m512 dat17 = _mm512_maskz_loadu_ps(31, datPtr1+151400592*i7+344*h1+43344*k5+(ptrdiff_t)1696);
__m512i pm6 = _mm512_set_epi32(0, 0, 28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0);
__m512 dat18 = _mm512_permutex2var_ps(dat16, pm6, dat17);
dat9 = _mm512_mask_mov_ps(dat9, 4032, dat18);
__m512 dat19 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k5+(ptrdiff_t)2752);
__m512 dat20 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k5+(ptrdiff_t)2816);
__m512i pm7 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat21 = _mm512_permutex2var_ps(dat19, pm7, dat20);
__m512 dat22 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k5+(ptrdiff_t)2880);
__m512 dat23 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k5+(ptrdiff_t)2944);
__m512i pm8 = _mm512_set_epi32(28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 dat24 = _mm512_permutex2var_ps(dat22, pm8, dat23);
dat21 = _mm512_mask_mov_ps(dat21, 65280, dat24);
_mm512_storeu_ps(arranged3+2725440*i7+213760*j3+256*k5+(ptrdiff_t)0, dat3);
_mm512_storeu_ps(arranged3+2725440*i7+213760*j3+256*k5+(ptrdiff_t)64, dat9);
_mm512_storeu_ps(arranged3+2725440*i7+213760*j3+256*k5+(ptrdiff_t)128, dat12);
_mm512_storeu_ps(arranged3+2725440*i7+213760*j3+256*k5+(ptrdiff_t)192, dat21);
}
if (j3 >= jj3) goto next1;
++j3;
}
default: {
ptrdiff_t k6 = 139*s1;
ptrdiff_t kk2 = k6+(s1 < 5 ? 139 : 140);
for (; k6 < kk2; ++k6) {
__m512 dat25 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k6+(ptrdiff_t)3008);
__m512 dat26 = _mm512_maskz_loadu_ps(31, datPtr1+151400592*i7+344*h1+43344*k6+(ptrdiff_t)3072);
__m512i pm9 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat27 = _mm512_permutex2var_ps(dat25, pm9, dat26);
__m512 dat28 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k6+(ptrdiff_t)4128);
__m512 dat29 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k6+(ptrdiff_t)4192);
__m512i pm10 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat30 = _mm512_permutex2var_ps(dat28, pm10, dat29);
__m512 dat31 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k6+(ptrdiff_t)4256);
__m512 dat32 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k6+(ptrdiff_t)4320);
__m512i pm11 = _mm512_set_epi32(28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 dat33 = _mm512_permutex2var_ps(dat31, pm11, dat32);
dat30 = _mm512_mask_mov_ps(dat30, 65280, dat33);
__m512 dat34 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k6+(ptrdiff_t)4384);
__m512 dat35 = _mm512_maskz_loadu_ps(31, datPtr1+151400592*i7+344*h1+43344*k6+(ptrdiff_t)4448);
__m512i pm12 = _mm512_set_epi32(0, 0, 28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0);
__m512 dat36 = _mm512_permutex2var_ps(dat34, pm12, dat35);
dat27 = _mm512_mask_mov_ps(dat27, 4032, dat36);
__m512 dat37 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k6+(ptrdiff_t)5504);
__m512 dat38 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k6+(ptrdiff_t)5568);
__m512i pm13 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat39 = _mm512_permutex2var_ps(dat37, pm13, dat38);
__m512 dat40 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k6+(ptrdiff_t)5632);
__m512 dat41 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k6+(ptrdiff_t)5696);
__m512i pm14 = _mm512_set_epi32(28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 dat42 = _mm512_permutex2var_ps(dat40, pm14, dat41);
dat39 = _mm512_mask_mov_ps(dat39, 65280, dat42);
__m512 dat43 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*h1+43344*k6+(ptrdiff_t)5760);
__m512 dat44 = _mm512_maskz_loadu_ps(31, datPtr1+151400592*i7+344*h1+43344*k6+(ptrdiff_t)5824);
__m512i pm15 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat45 = _mm512_permutex2var_ps(dat43, pm15, dat44);
_mm512_storeu_ps(arranged3+2725440*i7+213760*j3+256*k6+(ptrdiff_t)0, dat27);
_mm512_storeu_ps(arranged3+2725440*i7+213760*j3+256*k6+(ptrdiff_t)64, dat30);
_mm512_storeu_ps(arranged3+2725440*i7+213760*j3+256*k6+(ptrdiff_t)128, dat39);
_mm512_storeu_ps(arranged3+2725440*i7+213760*j3+256*k6+(ptrdiff_t)192, dat45);
}
if (j3 >= jj3) goto next1;
if (j3 >= 11) break;
++j3;
h1 += 20;
goto wrap3;
}
}
j3 = 12;
}
switch ((size_t)j3-12) {
default: {
j3 = 12;
ptrdiff_t k7 = 139*s1;
ptrdiff_t kk3 = k7+(s1 < 5 ? 139 : 140);
for (; k7 < kk3; ++k7) {
__m512 dat46 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*(ptrdiff_t)120+43344*k7+(ptrdiff_t)0);
__m512 dat47 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*(ptrdiff_t)120+43344*k7+(ptrdiff_t)64);
__m512i pm16 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat48 = _mm512_permutex2var_ps(dat46, pm16, dat47);
__m512 dat49 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*(ptrdiff_t)120+43344*k7+(ptrdiff_t)128);
__m512 dat50 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*(ptrdiff_t)120+43344*k7+(ptrdiff_t)192);
__m512i pm17 = _mm512_set_epi32(28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 dat51 = _mm512_permutex2var_ps(dat49, pm17, dat50);
dat48 = _mm512_mask_mov_ps(dat48, 65280, dat51);
__m512 dat52 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*(ptrdiff_t)120+43344*k7+(ptrdiff_t)256);
__m512 dat53 = _mm512_maskz_loadu_ps(31, datPtr1+151400592*i7+344*(ptrdiff_t)120+43344*k7+(ptrdiff_t)320);
__m512i pm18 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat54 = _mm512_permutex2var_ps(dat52, pm18, dat53);
__m512 dat55 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*(ptrdiff_t)120+43344*k7+(ptrdiff_t)1376);
__m512 dat56 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*(ptrdiff_t)120+43344*k7+(ptrdiff_t)1440);
__m512i pm19 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat57 = _mm512_permutex2var_ps(dat55, pm19, dat56);
__m512 dat58 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*(ptrdiff_t)120+43344*k7+(ptrdiff_t)1504);
__m512 dat59 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*(ptrdiff_t)120+43344*k7+(ptrdiff_t)1568);
__m512i pm20 = _mm512_set_epi32(28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 dat60 = _mm512_permutex2var_ps(dat58, pm20, dat59);
dat57 = _mm512_mask_mov_ps(dat57, 65280, dat60);
__m512 dat61 = _mm512_maskz_loadu_ps(8191, datPtr1+151400592*i7+344*(ptrdiff_t)120+43344*k7+(ptrdiff_t)1632);
__m512 dat62 = _mm512_maskz_loadu_ps(31, datPtr1+151400592*i7+344*(ptrdiff_t)120+43344*k7+(ptrdiff_t)1696);
__m512i pm21 = _mm512_set_epi32(0, 0, 28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0);
__m512 dat63 = _mm512_permutex2var_ps(dat61, pm21, dat62);
dat54 = _mm512_mask_mov_ps(dat54, 4032, dat63);
_mm512_storeu_ps(arranged3+2725440*i7+213760*j3+192*k7+(ptrdiff_t)0, dat48);
_mm512_storeu_ps(arranged3+2725440*i7+213760*j3+192*k7+(ptrdiff_t)64, dat54);
_mm512_storeu_ps(arranged3+2725440*i7+213760*j3+192*k7+(ptrdiff_t)128, dat57);
}
if (j3 >= jj3) goto next1;
}
}
j3 = 13;
next1:;
}
return;
}
char*restrict datPtr2 = tensors4[0]+(ptrdiff_t)0+(ptrdiff_t)36192240*3+(ptrdiff_t)151400592*0;
char*restrict arranged4 = tensors4[1]+(ptrdiff_t)2725440*3+(ptrdiff_t)3224832*0;
ptrdiff_t ii4 = 1;
for (ptrdiff_t i8 = 0; i8 < ii4; ++i8) {
ptrdiff_t j4 = 1*c7;
ptrdiff_t jj4 = j4+0;
if (j4 < 12) {
ptrdiff_t h2 = 0+((size_t)j4-0)/2*20;
switch (((size_t)j4-0)%2) {
case 0: {
wrap4:;
ptrdiff_t k8 = 164*s1;
ptrdiff_t kk4 = k8+(s1 < 5 ? 164 : 168);
for (; k8 < kk4; ++k8) {
__m512 dat64 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k8+(ptrdiff_t)0);
__m512 dat65 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k8+(ptrdiff_t)64);
__m512i pm22 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat66 = _mm512_permutex2var_ps(dat64, pm22, dat65);
__m512 dat67 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k8+(ptrdiff_t)128);
__m512 dat68 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k8+(ptrdiff_t)192);
__m512i pm23 = _mm512_set_epi32(28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 dat69 = _mm512_permutex2var_ps(dat67, pm23, dat68);
dat66 = _mm512_mask_mov_ps(dat66, 65280, dat69);
__m512 dat70 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k8+(ptrdiff_t)256);
__m512 dat71 = _mm512_maskz_loadu_ps(31, datPtr2+151400592*i8+344*h2+43344*k8+(ptrdiff_t)320);
__m512i pm24 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat72 = _mm512_permutex2var_ps(dat70, pm24, dat71);
__m512 dat73 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k8+(ptrdiff_t)1376);
__m512 dat74 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k8+(ptrdiff_t)1440);
__m512i pm25 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat75 = _mm512_permutex2var_ps(dat73, pm25, dat74);
__m512 dat76 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k8+(ptrdiff_t)1504);
__m512 dat77 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k8+(ptrdiff_t)1568);
__m512i pm26 = _mm512_set_epi32(28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 dat78 = _mm512_permutex2var_ps(dat76, pm26, dat77);
dat75 = _mm512_mask_mov_ps(dat75, 65280, dat78);
__m512 dat79 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k8+(ptrdiff_t)1632);
__m512 dat80 = _mm512_maskz_loadu_ps(31, datPtr2+151400592*i8+344*h2+43344*k8+(ptrdiff_t)1696);
__m512i pm27 = _mm512_set_epi32(0, 0, 28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0);
__m512 dat81 = _mm512_permutex2var_ps(dat79, pm27, dat80);
dat72 = _mm512_mask_mov_ps(dat72, 4032, dat81);
__m512 dat82 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k8+(ptrdiff_t)2752);
__m512 dat83 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k8+(ptrdiff_t)2816);
__m512i pm28 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat84 = _mm512_permutex2var_ps(dat82, pm28, dat83);
__m512 dat85 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k8+(ptrdiff_t)2880);
__m512 dat86 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k8+(ptrdiff_t)2944);
__m512i pm29 = _mm512_set_epi32(28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 dat87 = _mm512_permutex2var_ps(dat85, pm29, dat86);
dat84 = _mm512_mask_mov_ps(dat84, 65280, dat87);
_mm512_storeu_ps(arranged4+3224832*i8+252928*j4+256*k8+(ptrdiff_t)0, dat66);
_mm512_storeu_ps(arranged4+3224832*i8+252928*j4+256*k8+(ptrdiff_t)64, dat72);
_mm512_storeu_ps(arranged4+3224832*i8+252928*j4+256*k8+(ptrdiff_t)128, dat75);
_mm512_storeu_ps(arranged4+3224832*i8+252928*j4+256*k8+(ptrdiff_t)192, dat84);
}
if (j4 >= jj4) goto next2;
++j4;
}
default: {
ptrdiff_t k9 = 164*s1;
ptrdiff_t kk5 = k9+(s1 < 5 ? 164 : 168);
for (; k9 < kk5; ++k9) {
__m512 dat88 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k9+(ptrdiff_t)3008);
__m512 dat89 = _mm512_maskz_loadu_ps(31, datPtr2+151400592*i8+344*h2+43344*k9+(ptrdiff_t)3072);
__m512i pm30 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat90 = _mm512_permutex2var_ps(dat88, pm30, dat89);
__m512 dat91 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k9+(ptrdiff_t)4128);
__m512 dat92 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k9+(ptrdiff_t)4192);
__m512i pm31 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat93 = _mm512_permutex2var_ps(dat91, pm31, dat92);
__m512 dat94 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k9+(ptrdiff_t)4256);
__m512 dat95 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k9+(ptrdiff_t)4320);
__m512i pm32 = _mm512_set_epi32(28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 dat96 = _mm512_permutex2var_ps(dat94, pm32, dat95);
dat93 = _mm512_mask_mov_ps(dat93, 65280, dat96);
__m512 dat97 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k9+(ptrdiff_t)4384);
__m512 dat98 = _mm512_maskz_loadu_ps(31, datPtr2+151400592*i8+344*h2+43344*k9+(ptrdiff_t)4448);
__m512i pm33 = _mm512_set_epi32(0, 0, 28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0);
__m512 dat99 = _mm512_permutex2var_ps(dat97, pm33, dat98);
dat90 = _mm512_mask_mov_ps(dat90, 4032, dat99);
__m512 dat100 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k9+(ptrdiff_t)5504);
__m512 dat101 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k9+(ptrdiff_t)5568);
__m512i pm34 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat102 = _mm512_permutex2var_ps(dat100, pm34, dat101);
__m512 dat103 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k9+(ptrdiff_t)5632);
__m512 dat104 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k9+(ptrdiff_t)5696);
__m512i pm35 = _mm512_set_epi32(28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 dat105 = _mm512_permutex2var_ps(dat103, pm35, dat104);
dat102 = _mm512_mask_mov_ps(dat102, 65280, dat105);
__m512 dat106 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*h2+43344*k9+(ptrdiff_t)5760);
__m512 dat107 = _mm512_maskz_loadu_ps(31, datPtr2+151400592*i8+344*h2+43344*k9+(ptrdiff_t)5824);
__m512i pm36 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat108 = _mm512_permutex2var_ps(dat106, pm36, dat107);
_mm512_storeu_ps(arranged4+3224832*i8+252928*j4+256*k9+(ptrdiff_t)0, dat90);
_mm512_storeu_ps(arranged4+3224832*i8+252928*j4+256*k9+(ptrdiff_t)64, dat93);
_mm512_storeu_ps(arranged4+3224832*i8+252928*j4+256*k9+(ptrdiff_t)128, dat102);
_mm512_storeu_ps(arranged4+3224832*i8+252928*j4+256*k9+(ptrdiff_t)192, dat108);
}
if (j4 >= jj4) goto next2;
if (j4 >= 11) break;
++j4;
h2 += 20;
goto wrap4;
}
}
j4 = 12;
}
switch ((size_t)j4-12) {
default: {
j4 = 12;
ptrdiff_t k10 = 164*s1;
ptrdiff_t kk6 = k10+(s1 < 5 ? 164 : 168);
for (; k10 < kk6; ++k10) {
__m512 dat109 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*(ptrdiff_t)120+43344*k10+(ptrdiff_t)0);
__m512 dat110 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*(ptrdiff_t)120+43344*k10+(ptrdiff_t)64);
__m512i pm37 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat111 = _mm512_permutex2var_ps(dat109, pm37, dat110);
__m512 dat112 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*(ptrdiff_t)120+43344*k10+(ptrdiff_t)128);
__m512 dat113 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*(ptrdiff_t)120+43344*k10+(ptrdiff_t)192);
__m512i pm38 = _mm512_set_epi32(28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 dat114 = _mm512_permutex2var_ps(dat112, pm38, dat113);
dat111 = _mm512_mask_mov_ps(dat111, 65280, dat114);
__m512 dat115 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*(ptrdiff_t)120+43344*k10+(ptrdiff_t)256);
__m512 dat116 = _mm512_maskz_loadu_ps(31, datPtr2+151400592*i8+344*(ptrdiff_t)120+43344*k10+(ptrdiff_t)320);
__m512i pm39 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat117 = _mm512_permutex2var_ps(dat115, pm39, dat116);
__m512 dat118 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*(ptrdiff_t)120+43344*k10+(ptrdiff_t)1376);
__m512 dat119 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*(ptrdiff_t)120+43344*k10+(ptrdiff_t)1440);
__m512i pm40 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 28, 24, 20, 16, 12, 8, 4, 0);
__m512 dat120 = _mm512_permutex2var_ps(dat118, pm40, dat119);
__m512 dat121 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*(ptrdiff_t)120+43344*k10+(ptrdiff_t)1504);
__m512 dat122 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*(ptrdiff_t)120+43344*k10+(ptrdiff_t)1568);
__m512i pm41 = _mm512_set_epi32(28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 dat123 = _mm512_permutex2var_ps(dat121, pm41, dat122);
dat120 = _mm512_mask_mov_ps(dat120, 65280, dat123);
__m512 dat124 = _mm512_maskz_loadu_ps(8191, datPtr2+151400592*i8+344*(ptrdiff_t)120+43344*k10+(ptrdiff_t)1632);
__m512 dat125 = _mm512_maskz_loadu_ps(31, datPtr2+151400592*i8+344*(ptrdiff_t)120+43344*k10+(ptrdiff_t)1696);
__m512i pm42 = _mm512_set_epi32(0, 0, 28, 24, 20, 16, 12, 8, 4, 0, 0, 0, 0, 0, 0, 0);
__m512 dat126 = _mm512_permutex2var_ps(dat124, pm42, dat125);
dat117 = _mm512_mask_mov_ps(dat117, 4032, dat126);
_mm512_storeu_ps(arranged4+3224832*i8+252928*j4+192*k10+(ptrdiff_t)0, dat111);
_mm512_storeu_ps(arranged4+3224832*i8+252928*j4+192*k10+(ptrdiff_t)64, dat117);
_mm512_storeu_ps(arranged4+3224832*i8+252928*j4+192*k10+(ptrdiff_t)128, dat120);
}
if (j4 >= jj4) goto next2;
}
}
j4 = 13;
next2:;
}
}

/*
 * Example17OneArrangeDats1 -- submit the data-arrangement callee to the threader.
 *
 * Builds a task descriptor that names Example17OneArrangeDats1Callee1 as the
 * work function, stores tensors3 unchanged in the descriptor's any1 field,
 * and declares a 4-dimensional hull with extents 6, 13, 1 and 4. The
 * descriptor is then handed to Example17ThreaderDo1 together with team15.
 *
 * team15:   forwarded as-is to Example17ThreaderDo1.
 * tensors3: char** value carried through to the callee via any1.
 *
 * NOTE(review): presumably the threader invokes the callee once per point of
 * the 6 x 13 x 1 x 4 hull -- confirm against Example17ThreaderDo1.
 */
static void Example17OneArrangeDats1(Example17ThreaderTeam1* team15, char** tensors3) {
    /* Hull extents, one entry per task dimension. */
    static const int extents[4] = {6, 13, 1, 4};
    Example17ThreaderTask1 job;
    job.nd1 = 4;
    for (int axis = 0; axis < 4; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    job.any1 = tensors3;
    job.callee1 = Example17OneArrangeDats1Callee1;
    Example17ThreaderDo1(team15, &job);
}

static void Example17OneApply1Callee1(Example17ThreaderTask1* task8, int64_t* pt9) {
void** pair2 = task8->any1;
char** tensors6 = pair2[0];
ptrdiff_t e3 = 0;
ptrdiff_t g2 = 0;
ptrdiff_t d1 = pt9[1];
ptrdiff_t w1 = pt9[0];
char*restrict arrangedWts1 = tensors6[0]+9015424*e3+(ptrdiff_t)9015424*1*g2;
char*restrict arrangedDats1 = tensors6[1]+2725440*e3+(ptrdiff_t)2725440*1*g2;
char*restrict datPtr3 = tensors6[2]+(ptrdiff_t)7591936*1*g2;
ptrdiff_t ii5 = 1;
for (ptrdiff_t i9 = 0; i9 < ii5; ++i9) {
ptrdiff_t j5 = 1*d1;
ptrdiff_t jj5 = j5+0;
if (j5 < 12) {
ptrdiff_t h3 = 0+((size_t)j5-0)/2*5;
switch (((size_t)j5-0)%2) {
case 0: {
wrap5:;
ptrdiff_t k11 = 1*w1;
ptrdiff_t kk7 = k11+0;
for (; k11 != 449; ++k11) {
ptrdiff_t s2 = -1;
__m512 sum8 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k11+24*s2+(ptrdiff_t)24));
__m512 sum12 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k11+24*s2+(ptrdiff_t)28));
__m512 sum16 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k11+24*s2+(ptrdiff_t)32));
__m512 sum20 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k11+24*s2+(ptrdiff_t)36));
__m512 sum24 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k11+24*s2+(ptrdiff_t)40));
__m512 sum28 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k11+24*s2+(ptrdiff_t)44));
__m512 sum9 = sum8;
__m512 sum10 = sum8;
__m512 sum11 = sum8;
__m512 sum13 = sum12;
__m512 sum14 = sum12;
__m512 sum15 = sum12;
__m512 sum17 = sum16;
__m512 sum18 = sum16;
__m512 sum19 = sum16;
__m512 sum21 = sum20;
__m512 sum22 = sum20;
__m512 sum23 = sum20;
__m512 sum25 = sum24;
__m512 sum26 = sum24;
__m512 sum27 = sum24;
__m512 sum29 = sum28;
__m512 sum30 = sum28;
__m512 sum31 = sum28;
for (s2 = 0; s2 < 835; ++s2) {
__m512 dat127 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+256*s2+(ptrdiff_t)0);
__m512 dat128 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+256*s2+(ptrdiff_t)64);
__m512 dat129 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+256*s2+(ptrdiff_t)128);
__m512 dat130 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+256*s2+(ptrdiff_t)192);
__m512 wt181 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k11+24*s2+(ptrdiff_t)24));
sum8 = _mm512_fmadd_ps(wt181, dat127, sum8);
sum9 = _mm512_fmadd_ps(wt181, dat128, sum9);
sum10 = _mm512_fmadd_ps(wt181, dat129, sum10);
sum11 = _mm512_fmadd_ps(wt181, dat130, sum11);
__m512 wt182 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k11+24*s2+(ptrdiff_t)28));
sum12 = _mm512_fmadd_ps(wt182, dat127, sum12);
sum13 = _mm512_fmadd_ps(wt182, dat128, sum13);
sum14 = _mm512_fmadd_ps(wt182, dat129, sum14);
sum15 = _mm512_fmadd_ps(wt182, dat130, sum15);
__m512 wt183 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k11+24*s2+(ptrdiff_t)32));
sum16 = _mm512_fmadd_ps(wt183, dat127, sum16);
sum17 = _mm512_fmadd_ps(wt183, dat128, sum17);
sum18 = _mm512_fmadd_ps(wt183, dat129, sum18);
sum19 = _mm512_fmadd_ps(wt183, dat130, sum19);
__m512 wt184 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k11+24*s2+(ptrdiff_t)36));
sum20 = _mm512_fmadd_ps(wt184, dat127, sum20);
sum21 = _mm512_fmadd_ps(wt184, dat128, sum21);
sum22 = _mm512_fmadd_ps(wt184, dat129, sum22);
sum23 = _mm512_fmadd_ps(wt184, dat130, sum23);
__m512 wt185 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k11+24*s2+(ptrdiff_t)40));
sum24 = _mm512_fmadd_ps(wt185, dat127, sum24);
sum25 = _mm512_fmadd_ps(wt185, dat128, sum25);
sum26 = _mm512_fmadd_ps(wt185, dat129, sum26);
sum27 = _mm512_fmadd_ps(wt185, dat130, sum27);
__m512 wt186 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k11+24*s2+(ptrdiff_t)44));
sum28 = _mm512_fmadd_ps(wt186, dat127, sum28);
sum29 = _mm512_fmadd_ps(wt186, dat128, sum29);
sum30 = _mm512_fmadd_ps(wt186, dat129, sum30);
sum31 = _mm512_fmadd_ps(wt186, dat130, sum31);
}
__m512 dat131 = sum9;
__m512i via1 = _mm512_castps_si512(sum9);
via1 = _mm512_alignr_epi32(via1, via1, 6);
__m512 dat132 = _mm512_castsi512_ps(via1);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)0, 65535, sum8);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)64, 63, dat131);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)88, 65535, sum10);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)152, 63, dat132);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)176, 65535, sum11);
__m512 dat133 = sum13;
__m512i via2 = _mm512_castps_si512(sum13);
via2 = _mm512_alignr_epi32(via2, via2, 6);
__m512 dat134 = _mm512_castsi512_ps(via2);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)2816, 65535, sum12);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)2880, 63, dat133);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)2904, 65535, sum14);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)2968, 63, dat134);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)2992, 65535, sum15);
__m512 dat135 = sum17;
__m512i via3 = _mm512_castps_si512(sum17);
via3 = _mm512_alignr_epi32(via3, via3, 6);
__m512 dat136 = _mm512_castsi512_ps(via3);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)5632, 65535, sum16);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)5696, 63, dat135);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)5720, 65535, sum18);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)5784, 63, dat136);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)5808, 65535, sum19);
__m512 dat137 = sum21;
__m512i via4 = _mm512_castps_si512(sum21);
via4 = _mm512_alignr_epi32(via4, via4, 6);
__m512 dat138 = _mm512_castsi512_ps(via4);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)8448, 65535, sum20);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)8512, 63, dat137);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)8536, 65535, sum22);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)8600, 63, dat138);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)8624, 65535, sum23);
__m512 dat139 = sum25;
__m512i via5 = _mm512_castps_si512(sum25);
via5 = _mm512_alignr_epi32(via5, via5, 6);
__m512 dat140 = _mm512_castsi512_ps(via5);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)11264, 65535, sum24);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)11328, 63, dat139);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)11352, 65535, sum26);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)11416, 63, dat140);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)11440, 65535, sum27);
__m512 dat141 = sum29;
__m512i via6 = _mm512_castps_si512(sum29);
via6 = _mm512_alignr_epi32(via6, via6, 6);
__m512 dat142 = _mm512_castsi512_ps(via6);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)14080, 65535, sum28);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)14144, 63, dat141);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)14168, 65535, sum30);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)14232, 63, dat142);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)14256, 65535, sum31);
if (k11 >= kk7) return;
}
ptrdiff_t s3 = -1;
__m512 sum32 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k11+8*s3+(ptrdiff_t)8));
__m512 sum36 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k11+8*s3+(ptrdiff_t)12));
__m512 sum33 = sum32;
__m512 sum34 = sum32;
__m512 sum35 = sum32;
__m512 sum37 = sum36;
__m512 sum38 = sum36;
__m512 sum39 = sum36;
for (s3 = 0; s3 < 835; ++s3) {
__m512 dat143 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+256*s3+(ptrdiff_t)0);
__m512 dat144 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+256*s3+(ptrdiff_t)64);
__m512 dat145 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+256*s3+(ptrdiff_t)128);
__m512 dat146 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+256*s3+(ptrdiff_t)192);
__m512 wt187 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k11+8*s3+(ptrdiff_t)8));
sum32 = _mm512_fmadd_ps(wt187, dat143, sum32);
sum33 = _mm512_fmadd_ps(wt187, dat144, sum33);
sum34 = _mm512_fmadd_ps(wt187, dat145, sum34);
sum35 = _mm512_fmadd_ps(wt187, dat146, sum35);
__m512 wt188 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k11+8*s3+(ptrdiff_t)12));
sum36 = _mm512_fmadd_ps(wt188, dat143, sum36);
sum37 = _mm512_fmadd_ps(wt188, dat144, sum37);
sum38 = _mm512_fmadd_ps(wt188, dat145, sum38);
sum39 = _mm512_fmadd_ps(wt188, dat146, sum39);
}
__m512 dat147 = sum33;
__m512i via7 = _mm512_castps_si512(sum33);
via7 = _mm512_alignr_epi32(via7, via7, 6);
__m512 dat148 = _mm512_castsi512_ps(via7);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)0, 65535, sum32);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)64, 63, dat147);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)88, 65535, sum34);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)152, 63, dat148);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)176, 65535, sum35);
__m512 dat149 = sum37;
__m512i via8 = _mm512_castps_si512(sum37);
via8 = _mm512_alignr_epi32(via8, via8, 6);
__m512 dat150 = _mm512_castsi512_ps(via8);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)2816, 65535, sum36);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)2880, 63, dat149);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)2904, 65535, sum38);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)2968, 63, dat150);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k11+(ptrdiff_t)2992, 65535, sum39);
if (j5 >= jj5) return;
++j5;
}
default: {
ptrdiff_t k12 = 1*w1;
ptrdiff_t kk8 = k12+0;
for (; k12 != 449; ++k12) {
ptrdiff_t s4 = -1;
__m512 sum40 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k12+24*s4+(ptrdiff_t)24));
__m512 sum44 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k12+24*s4+(ptrdiff_t)28));
__m512 sum48 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k12+24*s4+(ptrdiff_t)32));
__m512 sum52 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k12+24*s4+(ptrdiff_t)36));
__m512 sum56 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k12+24*s4+(ptrdiff_t)40));
__m512 sum60 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k12+24*s4+(ptrdiff_t)44));
__m512 sum41 = sum40;
__m512 sum42 = sum40;
__m512 sum43 = sum40;
__m512 sum45 = sum44;
__m512 sum46 = sum44;
__m512 sum47 = sum44;
__m512 sum49 = sum48;
__m512 sum50 = sum48;
__m512 sum51 = sum48;
__m512 sum53 = sum52;
__m512 sum54 = sum52;
__m512 sum55 = sum52;
__m512 sum57 = sum56;
__m512 sum58 = sum56;
__m512 sum59 = sum56;
__m512 sum61 = sum60;
__m512 sum62 = sum60;
__m512 sum63 = sum60;
for (s4 = 0; s4 < 835; ++s4) {
__m512 dat151 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+256*s4+(ptrdiff_t)0);
__m512 dat152 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+256*s4+(ptrdiff_t)64);
__m512 dat153 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+256*s4+(ptrdiff_t)128);
__m512 dat154 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+256*s4+(ptrdiff_t)192);
__m512 wt189 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k12+24*s4+(ptrdiff_t)24));
sum40 = _mm512_fmadd_ps(wt189, dat151, sum40);
sum41 = _mm512_fmadd_ps(wt189, dat152, sum41);
sum42 = _mm512_fmadd_ps(wt189, dat153, sum42);
sum43 = _mm512_fmadd_ps(wt189, dat154, sum43);
__m512 wt190 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k12+24*s4+(ptrdiff_t)28));
sum44 = _mm512_fmadd_ps(wt190, dat151, sum44);
sum45 = _mm512_fmadd_ps(wt190, dat152, sum45);
sum46 = _mm512_fmadd_ps(wt190, dat153, sum46);
sum47 = _mm512_fmadd_ps(wt190, dat154, sum47);
__m512 wt191 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k12+24*s4+(ptrdiff_t)32));
sum48 = _mm512_fmadd_ps(wt191, dat151, sum48);
sum49 = _mm512_fmadd_ps(wt191, dat152, sum49);
sum50 = _mm512_fmadd_ps(wt191, dat153, sum50);
sum51 = _mm512_fmadd_ps(wt191, dat154, sum51);
__m512 wt192 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k12+24*s4+(ptrdiff_t)36));
sum52 = _mm512_fmadd_ps(wt192, dat151, sum52);
sum53 = _mm512_fmadd_ps(wt192, dat152, sum53);
sum54 = _mm512_fmadd_ps(wt192, dat153, sum54);
sum55 = _mm512_fmadd_ps(wt192, dat154, sum55);
__m512 wt193 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k12+24*s4+(ptrdiff_t)40));
sum56 = _mm512_fmadd_ps(wt193, dat151, sum56);
sum57 = _mm512_fmadd_ps(wt193, dat152, sum57);
sum58 = _mm512_fmadd_ps(wt193, dat153, sum58);
sum59 = _mm512_fmadd_ps(wt193, dat154, sum59);
__m512 wt194 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k12+24*s4+(ptrdiff_t)44));
sum60 = _mm512_fmadd_ps(wt194, dat151, sum60);
sum61 = _mm512_fmadd_ps(wt194, dat152, sum61);
sum62 = _mm512_fmadd_ps(wt194, dat153, sum62);
sum63 = _mm512_fmadd_ps(wt194, dat154, sum63);
}
__m512 dat155 = sum40;
__m512i via9 = _mm512_castps_si512(sum40);
via9 = _mm512_alignr_epi32(via9, via9, 6);
__m512 dat156 = _mm512_castsi512_ps(via9);
__m512 dat157 = sum43;
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)240, 63, dat155);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)264, 65535, sum41);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)328, 63, dat156);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)352, 65535, sum42);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)416, 63, dat157);
__m512 dat158 = sum44;
__m512i via10 = _mm512_castps_si512(sum44);
via10 = _mm512_alignr_epi32(via10, via10, 6);
__m512 dat159 = _mm512_castsi512_ps(via10);
__m512 dat160 = sum47;
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)3056, 63, dat158);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)3080, 65535, sum45);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)3144, 63, dat159);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)3168, 65535, sum46);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)3232, 63, dat160);
__m512 dat161 = sum48;
__m512i via11 = _mm512_castps_si512(sum48);
via11 = _mm512_alignr_epi32(via11, via11, 6);
__m512 dat162 = _mm512_castsi512_ps(via11);
__m512 dat163 = sum51;
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)5872, 63, dat161);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)5896, 65535, sum49);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)5960, 63, dat162);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)5984, 65535, sum50);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)6048, 63, dat163);
__m512 dat164 = sum52;
__m512i via12 = _mm512_castps_si512(sum52);
via12 = _mm512_alignr_epi32(via12, via12, 6);
__m512 dat165 = _mm512_castsi512_ps(via12);
__m512 dat166 = sum55;
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)8688, 63, dat164);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)8712, 65535, sum53);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)8776, 63, dat165);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)8800, 65535, sum54);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)8864, 63, dat166);
__m512 dat167 = sum56;
__m512i via13 = _mm512_castps_si512(sum56);
via13 = _mm512_alignr_epi32(via13, via13, 6);
__m512 dat168 = _mm512_castsi512_ps(via13);
__m512 dat169 = sum59;
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)11504, 63, dat167);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)11528, 65535, sum57);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)11592, 63, dat168);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)11616, 65535, sum58);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)11680, 63, dat169);
__m512 dat170 = sum60;
__m512i via14 = _mm512_castps_si512(sum60);
via14 = _mm512_alignr_epi32(via14, via14, 6);
__m512 dat171 = _mm512_castsi512_ps(via14);
__m512 dat172 = sum63;
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)14320, 63, dat170);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)14344, 65535, sum61);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)14408, 63, dat171);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)14432, 65535, sum62);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)14496, 63, dat172);
if (k12 >= kk8) return;
}
ptrdiff_t s5 = -1;
__m512 sum64 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k12+8*s5+(ptrdiff_t)8));
__m512 sum68 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k12+8*s5+(ptrdiff_t)12));
__m512 sum65 = sum64;
__m512 sum66 = sum64;
__m512 sum67 = sum64;
__m512 sum69 = sum68;
__m512 sum70 = sum68;
__m512 sum71 = sum68;
for (s5 = 0; s5 < 835; ++s5) {
__m512 dat173 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+256*s5+(ptrdiff_t)0);
__m512 dat174 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+256*s5+(ptrdiff_t)64);
__m512 dat175 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+256*s5+(ptrdiff_t)128);
__m512 dat176 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+256*s5+(ptrdiff_t)192);
__m512 wt195 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k12+8*s5+(ptrdiff_t)8));
sum64 = _mm512_fmadd_ps(wt195, dat173, sum64);
sum65 = _mm512_fmadd_ps(wt195, dat174, sum65);
sum66 = _mm512_fmadd_ps(wt195, dat175, sum66);
sum67 = _mm512_fmadd_ps(wt195, dat176, sum67);
__m512 wt196 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k12+8*s5+(ptrdiff_t)12));
sum68 = _mm512_fmadd_ps(wt196, dat173, sum68);
sum69 = _mm512_fmadd_ps(wt196, dat174, sum69);
sum70 = _mm512_fmadd_ps(wt196, dat175, sum70);
sum71 = _mm512_fmadd_ps(wt196, dat176, sum71);
}
__m512 dat177 = sum64;
__m512i via15 = _mm512_castps_si512(sum64);
via15 = _mm512_alignr_epi32(via15, via15, 6);
__m512 dat178 = _mm512_castsi512_ps(via15);
__m512 dat179 = sum67;
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)240, 63, dat177);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)264, 65535, sum65);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)328, 63, dat178);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)352, 65535, sum66);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)416, 63, dat179);
__m512 dat180 = sum68;
__m512i via16 = _mm512_castps_si512(sum68);
via16 = _mm512_alignr_epi32(via16, via16, 6);
__m512 dat181 = _mm512_castsi512_ps(via16);
__m512 dat182 = sum71;
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)3056, 63, dat180);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)3080, 65535, sum69);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)3144, 63, dat181);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)3168, 65535, sum70);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h3+16896*k12+(ptrdiff_t)3232, 63, dat182);
if (j5 >= jj5) return;
if (j5 >= 11) break;
++j5;
h3 += 5;
goto wrap5;
}
}
j5 = 12;
}
ptrdiff_t h4 = 30;
switch (j5) {
default: {
j5 = 12;
ptrdiff_t k13 = 1*w1;
ptrdiff_t kk9 = k13+0;
for (; k13 != 449; ++k13) {
ptrdiff_t s6 = -1;
__m512 sum72 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k13+24*s6+(ptrdiff_t)24));
__m512 sum75 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k13+24*s6+(ptrdiff_t)28));
__m512 sum78 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k13+24*s6+(ptrdiff_t)32));
__m512 sum81 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k13+24*s6+(ptrdiff_t)36));
__m512 sum84 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k13+24*s6+(ptrdiff_t)40));
__m512 sum87 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k13+24*s6+(ptrdiff_t)44));
__m512 sum73 = sum72;
__m512 sum74 = sum72;
__m512 sum76 = sum75;
__m512 sum77 = sum75;
__m512 sum79 = sum78;
__m512 sum80 = sum78;
__m512 sum82 = sum81;
__m512 sum83 = sum81;
__m512 sum85 = sum84;
__m512 sum86 = sum84;
__m512 sum88 = sum87;
__m512 sum89 = sum87;
for (s6 = 0; s6 < 835; ++s6) {
__m512 dat183 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+192*s6+(ptrdiff_t)0);
__m512 dat184 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+192*s6+(ptrdiff_t)64);
__m512 dat185 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+192*s6+(ptrdiff_t)128);
__m512 wt197 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k13+24*s6+(ptrdiff_t)24));
sum72 = _mm512_fmadd_ps(wt197, dat183, sum72);
sum73 = _mm512_fmadd_ps(wt197, dat184, sum73);
sum74 = _mm512_fmadd_ps(wt197, dat185, sum74);
__m512 wt198 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k13+24*s6+(ptrdiff_t)28));
sum75 = _mm512_fmadd_ps(wt198, dat183, sum75);
sum76 = _mm512_fmadd_ps(wt198, dat184, sum76);
sum77 = _mm512_fmadd_ps(wt198, dat185, sum77);
__m512 wt199 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k13+24*s6+(ptrdiff_t)32));
sum78 = _mm512_fmadd_ps(wt199, dat183, sum78);
sum79 = _mm512_fmadd_ps(wt199, dat184, sum79);
sum80 = _mm512_fmadd_ps(wt199, dat185, sum80);
__m512 wt200 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k13+24*s6+(ptrdiff_t)36));
sum81 = _mm512_fmadd_ps(wt200, dat183, sum81);
sum82 = _mm512_fmadd_ps(wt200, dat184, sum82);
sum83 = _mm512_fmadd_ps(wt200, dat185, sum83);
__m512 wt201 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k13+24*s6+(ptrdiff_t)40));
sum84 = _mm512_fmadd_ps(wt201, dat183, sum84);
sum85 = _mm512_fmadd_ps(wt201, dat184, sum85);
sum86 = _mm512_fmadd_ps(wt201, dat185, sum86);
__m512 wt202 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k13+24*s6+(ptrdiff_t)44));
sum87 = _mm512_fmadd_ps(wt202, dat183, sum87);
sum88 = _mm512_fmadd_ps(wt202, dat184, sum88);
sum89 = _mm512_fmadd_ps(wt202, dat185, sum89);
}
__m512 dat186 = sum73;
__m512i via17 = _mm512_castps_si512(sum73);
via17 = _mm512_alignr_epi32(via17, via17, 6);
__m512 dat187 = _mm512_castsi512_ps(via17);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)0, 65535, sum72);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)64, 63, dat186);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)88, 65535, sum74);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)152, 63, dat187);
__m512 dat188 = sum76;
__m512i via18 = _mm512_castps_si512(sum76);
via18 = _mm512_alignr_epi32(via18, via18, 6);
__m512 dat189 = _mm512_castsi512_ps(via18);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)2816, 65535, sum75);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)2880, 63, dat188);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)2904, 65535, sum77);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)2968, 63, dat189);
__m512 dat190 = sum79;
__m512i via19 = _mm512_castps_si512(sum79);
via19 = _mm512_alignr_epi32(via19, via19, 6);
__m512 dat191 = _mm512_castsi512_ps(via19);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)5632, 65535, sum78);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)5696, 63, dat190);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)5720, 65535, sum80);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)5784, 63, dat191);
__m512 dat192 = sum82;
__m512i via20 = _mm512_castps_si512(sum82);
via20 = _mm512_alignr_epi32(via20, via20, 6);
__m512 dat193 = _mm512_castsi512_ps(via20);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)8448, 65535, sum81);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)8512, 63, dat192);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)8536, 65535, sum83);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)8600, 63, dat193);
__m512 dat194 = sum85;
__m512i via21 = _mm512_castps_si512(sum85);
via21 = _mm512_alignr_epi32(via21, via21, 6);
__m512 dat195 = _mm512_castsi512_ps(via21);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)11264, 65535, sum84);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)11328, 63, dat194);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)11352, 65535, sum86);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)11416, 63, dat195);
__m512 dat196 = sum88;
__m512i via22 = _mm512_castps_si512(sum88);
via22 = _mm512_alignr_epi32(via22, via22, 6);
__m512 dat197 = _mm512_castsi512_ps(via22);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)14080, 65535, sum87);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)14144, 63, dat196);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)14168, 65535, sum89);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)14232, 63, dat197);
if (k13 >= kk9) return;
}
ptrdiff_t s7 = -1;
__m512 sum90 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k13+8*s7+(ptrdiff_t)8));
__m512 sum93 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k13+8*s7+(ptrdiff_t)12));
__m512 sum91 = sum90;
__m512 sum92 = sum90;
__m512 sum94 = sum93;
__m512 sum95 = sum93;
for (s7 = 0; s7 < 835; ++s7) {
__m512 dat198 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+192*s7+(ptrdiff_t)0);
__m512 dat199 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+192*s7+(ptrdiff_t)64);
__m512 dat200 = _mm512_loadu_ps(arrangedDats1+2725440*i9+213760*j5+192*s7+(ptrdiff_t)128);
__m512 wt203 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k13+8*s7+(ptrdiff_t)8));
sum90 = _mm512_fmadd_ps(wt203, dat198, sum90);
sum91 = _mm512_fmadd_ps(wt203, dat199, sum91);
sum92 = _mm512_fmadd_ps(wt203, dat200, sum92);
__m512 wt204 = _mm512_set1_ps(*(float*)(arrangedWts1+9015424*i9+20064*k13+8*s7+(ptrdiff_t)12));
sum93 = _mm512_fmadd_ps(wt204, dat198, sum93);
sum94 = _mm512_fmadd_ps(wt204, dat199, sum94);
sum95 = _mm512_fmadd_ps(wt204, dat200, sum95);
}
__m512 dat201 = sum91;
__m512i via23 = _mm512_castps_si512(sum91);
via23 = _mm512_alignr_epi32(via23, via23, 6);
__m512 dat202 = _mm512_castsi512_ps(via23);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)0, 65535, sum90);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)64, 63, dat201);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)88, 65535, sum92);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)152, 63, dat202);
__m512 dat203 = sum94;
__m512i via24 = _mm512_castps_si512(sum94);
via24 = _mm512_alignr_epi32(via24, via24, 6);
__m512 dat204 = _mm512_castsi512_ps(via24);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)2816, 65535, sum93);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)2880, 63, dat203);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)2904, 65535, sum95);
_mm512_mask_storeu_ps(datPtr3+7591936*i9+88*h4+16896*k13+(ptrdiff_t)2968, 63, dat204);
if (j5 >= jj5) return;
}
}
j5 = 13;
}
}

static void Example17OneApply1Callee2(Example17ThreaderTask1* task9, int64_t* pt10) {
void** pair3 = task9->any1;
char** tensors7 = pair3[0];
ptrdiff_t e4 = (ptrdiff_t)pair3[1];
ptrdiff_t g3 = 0;
ptrdiff_t d2 = pt10[1];
ptrdiff_t w2 = pt10[0];
char*restrict arrangedWts2 = tensors7[0]+9015424*e4+(ptrdiff_t)9015424*1*g3;
char*restrict arrangedDats2 = tensors7[1]+2725440*e4+(ptrdiff_t)2725440*1*g3;
char*restrict datPtr4 = tensors7[2]+(ptrdiff_t)7591936*1*g3;
ptrdiff_t ii6 = 1;
for (ptrdiff_t i10 = 0; i10 < ii6; ++i10) {
ptrdiff_t j6 = 1*d2;
ptrdiff_t jj6 = j6+0;
if (j6 < 12) {
ptrdiff_t h5 = 0+((size_t)j6-0)/2*5;
switch (((size_t)j6-0)%2) {
case 0: {
wrap6:;
ptrdiff_t k14 = 1*w2;
ptrdiff_t kk10 = k14+0;
for (; k14 != 449; ++k14) {
ptrdiff_t s8 = -1;
__m512 sum96 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k14+24*s8+(ptrdiff_t)24));
__m512 sum100 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k14+24*s8+(ptrdiff_t)28));
__m512 sum104 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k14+24*s8+(ptrdiff_t)32));
__m512 sum108 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k14+24*s8+(ptrdiff_t)36));
__m512 sum112 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k14+24*s8+(ptrdiff_t)40));
__m512 sum116 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k14+24*s8+(ptrdiff_t)44));
__m512 sum97 = sum96;
__m512 sum98 = sum96;
__m512 sum99 = sum96;
__m512 sum101 = sum100;
__m512 sum102 = sum100;
__m512 sum103 = sum100;
__m512 sum105 = sum104;
__m512 sum106 = sum104;
__m512 sum107 = sum104;
__m512 sum109 = sum108;
__m512 sum110 = sum108;
__m512 sum111 = sum108;
__m512 sum113 = sum112;
__m512 sum114 = sum112;
__m512 sum115 = sum112;
__m512 sum117 = sum116;
__m512 sum118 = sum116;
__m512 sum119 = sum116;
for (s8 = 0; s8 < 835; ++s8) {
__m512 dat205 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+256*s8+(ptrdiff_t)0);
__m512 dat206 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+256*s8+(ptrdiff_t)64);
__m512 dat207 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+256*s8+(ptrdiff_t)128);
__m512 dat208 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+256*s8+(ptrdiff_t)192);
__m512 wt205 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k14+24*s8+(ptrdiff_t)24));
sum96 = _mm512_fmadd_ps(wt205, dat205, sum96);
sum97 = _mm512_fmadd_ps(wt205, dat206, sum97);
sum98 = _mm512_fmadd_ps(wt205, dat207, sum98);
sum99 = _mm512_fmadd_ps(wt205, dat208, sum99);
__m512 wt206 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k14+24*s8+(ptrdiff_t)28));
sum100 = _mm512_fmadd_ps(wt206, dat205, sum100);
sum101 = _mm512_fmadd_ps(wt206, dat206, sum101);
sum102 = _mm512_fmadd_ps(wt206, dat207, sum102);
sum103 = _mm512_fmadd_ps(wt206, dat208, sum103);
__m512 wt207 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k14+24*s8+(ptrdiff_t)32));
sum104 = _mm512_fmadd_ps(wt207, dat205, sum104);
sum105 = _mm512_fmadd_ps(wt207, dat206, sum105);
sum106 = _mm512_fmadd_ps(wt207, dat207, sum106);
sum107 = _mm512_fmadd_ps(wt207, dat208, sum107);
__m512 wt208 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k14+24*s8+(ptrdiff_t)36));
sum108 = _mm512_fmadd_ps(wt208, dat205, sum108);
sum109 = _mm512_fmadd_ps(wt208, dat206, sum109);
sum110 = _mm512_fmadd_ps(wt208, dat207, sum110);
sum111 = _mm512_fmadd_ps(wt208, dat208, sum111);
__m512 wt209 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k14+24*s8+(ptrdiff_t)40));
sum112 = _mm512_fmadd_ps(wt209, dat205, sum112);
sum113 = _mm512_fmadd_ps(wt209, dat206, sum113);
sum114 = _mm512_fmadd_ps(wt209, dat207, sum114);
sum115 = _mm512_fmadd_ps(wt209, dat208, sum115);
__m512 wt210 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k14+24*s8+(ptrdiff_t)44));
sum116 = _mm512_fmadd_ps(wt210, dat205, sum116);
sum117 = _mm512_fmadd_ps(wt210, dat206, sum117);
sum118 = _mm512_fmadd_ps(wt210, dat207, sum118);
sum119 = _mm512_fmadd_ps(wt210, dat208, sum119);
}
sum96 = _mm512_add_ps(sum96, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)0));
__m512 dat209 = sum97;
dat209 = _mm512_add_ps(dat209, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)64));
sum98 = _mm512_add_ps(sum98, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)88));
__m512i via25 = _mm512_castps_si512(sum97);
via25 = _mm512_alignr_epi32(via25, via25, 6);
__m512 dat210 = _mm512_castsi512_ps(via25);
dat210 = _mm512_add_ps(dat210, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)152));
sum99 = _mm512_add_ps(sum99, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)176));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)0, 65535, sum96);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)64, 63, dat209);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)88, 65535, sum98);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)152, 63, dat210);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)176, 65535, sum99);
sum100 = _mm512_add_ps(sum100, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2816));
__m512 dat211 = sum101;
dat211 = _mm512_add_ps(dat211, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2880));
sum102 = _mm512_add_ps(sum102, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2904));
__m512i via26 = _mm512_castps_si512(sum101);
via26 = _mm512_alignr_epi32(via26, via26, 6);
__m512 dat212 = _mm512_castsi512_ps(via26);
dat212 = _mm512_add_ps(dat212, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2968));
sum103 = _mm512_add_ps(sum103, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2992));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2816, 65535, sum100);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2880, 63, dat211);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2904, 65535, sum102);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2968, 63, dat212);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2992, 65535, sum103);
sum104 = _mm512_add_ps(sum104, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)5632));
__m512 dat213 = sum105;
dat213 = _mm512_add_ps(dat213, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)5696));
sum106 = _mm512_add_ps(sum106, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)5720));
__m512i via27 = _mm512_castps_si512(sum105);
via27 = _mm512_alignr_epi32(via27, via27, 6);
__m512 dat214 = _mm512_castsi512_ps(via27);
dat214 = _mm512_add_ps(dat214, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)5784));
sum107 = _mm512_add_ps(sum107, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)5808));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)5632, 65535, sum104);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)5696, 63, dat213);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)5720, 65535, sum106);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)5784, 63, dat214);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)5808, 65535, sum107);
sum108 = _mm512_add_ps(sum108, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)8448));
__m512 dat215 = sum109;
dat215 = _mm512_add_ps(dat215, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)8512));
sum110 = _mm512_add_ps(sum110, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)8536));
__m512i via28 = _mm512_castps_si512(sum109);
via28 = _mm512_alignr_epi32(via28, via28, 6);
__m512 dat216 = _mm512_castsi512_ps(via28);
dat216 = _mm512_add_ps(dat216, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)8600));
sum111 = _mm512_add_ps(sum111, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)8624));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)8448, 65535, sum108);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)8512, 63, dat215);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)8536, 65535, sum110);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)8600, 63, dat216);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)8624, 65535, sum111);
sum112 = _mm512_add_ps(sum112, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)11264));
__m512 dat217 = sum113;
dat217 = _mm512_add_ps(dat217, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)11328));
sum114 = _mm512_add_ps(sum114, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)11352));
__m512i via29 = _mm512_castps_si512(sum113);
via29 = _mm512_alignr_epi32(via29, via29, 6);
__m512 dat218 = _mm512_castsi512_ps(via29);
dat218 = _mm512_add_ps(dat218, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)11416));
sum115 = _mm512_add_ps(sum115, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)11440));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)11264, 65535, sum112);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)11328, 63, dat217);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)11352, 65535, sum114);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)11416, 63, dat218);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)11440, 65535, sum115);
sum116 = _mm512_add_ps(sum116, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)14080));
__m512 dat219 = sum117;
dat219 = _mm512_add_ps(dat219, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)14144));
sum118 = _mm512_add_ps(sum118, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)14168));
__m512i via30 = _mm512_castps_si512(sum117);
via30 = _mm512_alignr_epi32(via30, via30, 6);
__m512 dat220 = _mm512_castsi512_ps(via30);
dat220 = _mm512_add_ps(dat220, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)14232));
sum119 = _mm512_add_ps(sum119, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)14256));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)14080, 65535, sum116);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)14144, 63, dat219);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)14168, 65535, sum118);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)14232, 63, dat220);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)14256, 65535, sum119);
if (k14 >= kk10) return;
}
ptrdiff_t s9 = -1;
__m512 sum120 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k14+8*s9+(ptrdiff_t)8));
__m512 sum124 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k14+8*s9+(ptrdiff_t)12));
__m512 sum121 = sum120;
__m512 sum122 = sum120;
__m512 sum123 = sum120;
__m512 sum125 = sum124;
__m512 sum126 = sum124;
__m512 sum127 = sum124;
for (s9 = 0; s9 < 835; ++s9) {
__m512 dat221 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+256*s9+(ptrdiff_t)0);
__m512 dat222 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+256*s9+(ptrdiff_t)64);
__m512 dat223 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+256*s9+(ptrdiff_t)128);
__m512 dat224 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+256*s9+(ptrdiff_t)192);
__m512 wt211 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k14+8*s9+(ptrdiff_t)8));
sum120 = _mm512_fmadd_ps(wt211, dat221, sum120);
sum121 = _mm512_fmadd_ps(wt211, dat222, sum121);
sum122 = _mm512_fmadd_ps(wt211, dat223, sum122);
sum123 = _mm512_fmadd_ps(wt211, dat224, sum123);
__m512 wt212 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k14+8*s9+(ptrdiff_t)12));
sum124 = _mm512_fmadd_ps(wt212, dat221, sum124);
sum125 = _mm512_fmadd_ps(wt212, dat222, sum125);
sum126 = _mm512_fmadd_ps(wt212, dat223, sum126);
sum127 = _mm512_fmadd_ps(wt212, dat224, sum127);
}
sum120 = _mm512_add_ps(sum120, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)0));
__m512 dat225 = sum121;
dat225 = _mm512_add_ps(dat225, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)64));
sum122 = _mm512_add_ps(sum122, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)88));
__m512i via31 = _mm512_castps_si512(sum121);
via31 = _mm512_alignr_epi32(via31, via31, 6);
__m512 dat226 = _mm512_castsi512_ps(via31);
dat226 = _mm512_add_ps(dat226, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)152));
sum123 = _mm512_add_ps(sum123, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)176));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)0, 65535, sum120);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)64, 63, dat225);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)88, 65535, sum122);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)152, 63, dat226);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)176, 65535, sum123);
sum124 = _mm512_add_ps(sum124, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2816));
__m512 dat227 = sum125;
dat227 = _mm512_add_ps(dat227, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2880));
sum126 = _mm512_add_ps(sum126, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2904));
__m512i via32 = _mm512_castps_si512(sum125);
via32 = _mm512_alignr_epi32(via32, via32, 6);
__m512 dat228 = _mm512_castsi512_ps(via32);
dat228 = _mm512_add_ps(dat228, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2968));
sum127 = _mm512_add_ps(sum127, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2992));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2816, 65535, sum124);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2880, 63, dat227);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2904, 65535, sum126);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2968, 63, dat228);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k14+(ptrdiff_t)2992, 65535, sum127);
if (j6 >= jj6) return;
++j6;
}
default: {
ptrdiff_t k15 = 1*w2;
ptrdiff_t kk11 = k15+0;
for (; k15 != 449; ++k15) {
ptrdiff_t s10 = -1;
__m512 sum128 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k15+24*s10+(ptrdiff_t)24));
__m512 sum132 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k15+24*s10+(ptrdiff_t)28));
__m512 sum136 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k15+24*s10+(ptrdiff_t)32));
__m512 sum140 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k15+24*s10+(ptrdiff_t)36));
__m512 sum144 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k15+24*s10+(ptrdiff_t)40));
__m512 sum148 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k15+24*s10+(ptrdiff_t)44));
__m512 sum129 = sum128;
__m512 sum130 = sum128;
__m512 sum131 = sum128;
__m512 sum133 = sum132;
__m512 sum134 = sum132;
__m512 sum135 = sum132;
__m512 sum137 = sum136;
__m512 sum138 = sum136;
__m512 sum139 = sum136;
__m512 sum141 = sum140;
__m512 sum142 = sum140;
__m512 sum143 = sum140;
__m512 sum145 = sum144;
__m512 sum146 = sum144;
__m512 sum147 = sum144;
__m512 sum149 = sum148;
__m512 sum150 = sum148;
__m512 sum151 = sum148;
for (s10 = 0; s10 < 835; ++s10) {
__m512 dat229 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+256*s10+(ptrdiff_t)0);
__m512 dat230 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+256*s10+(ptrdiff_t)64);
__m512 dat231 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+256*s10+(ptrdiff_t)128);
__m512 dat232 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+256*s10+(ptrdiff_t)192);
__m512 wt213 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k15+24*s10+(ptrdiff_t)24));
sum128 = _mm512_fmadd_ps(wt213, dat229, sum128);
sum129 = _mm512_fmadd_ps(wt213, dat230, sum129);
sum130 = _mm512_fmadd_ps(wt213, dat231, sum130);
sum131 = _mm512_fmadd_ps(wt213, dat232, sum131);
__m512 wt214 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k15+24*s10+(ptrdiff_t)28));
sum132 = _mm512_fmadd_ps(wt214, dat229, sum132);
sum133 = _mm512_fmadd_ps(wt214, dat230, sum133);
sum134 = _mm512_fmadd_ps(wt214, dat231, sum134);
sum135 = _mm512_fmadd_ps(wt214, dat232, sum135);
__m512 wt215 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k15+24*s10+(ptrdiff_t)32));
sum136 = _mm512_fmadd_ps(wt215, dat229, sum136);
sum137 = _mm512_fmadd_ps(wt215, dat230, sum137);
sum138 = _mm512_fmadd_ps(wt215, dat231, sum138);
sum139 = _mm512_fmadd_ps(wt215, dat232, sum139);
__m512 wt216 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k15+24*s10+(ptrdiff_t)36));
sum140 = _mm512_fmadd_ps(wt216, dat229, sum140);
sum141 = _mm512_fmadd_ps(wt216, dat230, sum141);
sum142 = _mm512_fmadd_ps(wt216, dat231, sum142);
sum143 = _mm512_fmadd_ps(wt216, dat232, sum143);
__m512 wt217 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k15+24*s10+(ptrdiff_t)40));
sum144 = _mm512_fmadd_ps(wt217, dat229, sum144);
sum145 = _mm512_fmadd_ps(wt217, dat230, sum145);
sum146 = _mm512_fmadd_ps(wt217, dat231, sum146);
sum147 = _mm512_fmadd_ps(wt217, dat232, sum147);
__m512 wt218 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k15+24*s10+(ptrdiff_t)44));
sum148 = _mm512_fmadd_ps(wt218, dat229, sum148);
sum149 = _mm512_fmadd_ps(wt218, dat230, sum149);
sum150 = _mm512_fmadd_ps(wt218, dat231, sum150);
sum151 = _mm512_fmadd_ps(wt218, dat232, sum151);
}
__m512 dat233 = sum128;
dat233 = _mm512_add_ps(dat233, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)240));
sum129 = _mm512_add_ps(sum129, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)264));
__m512i via33 = _mm512_castps_si512(sum128);
via33 = _mm512_alignr_epi32(via33, via33, 6);
__m512 dat234 = _mm512_castsi512_ps(via33);
dat234 = _mm512_add_ps(dat234, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)328));
sum130 = _mm512_add_ps(sum130, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)352));
__m512 dat235 = sum131;
dat235 = _mm512_add_ps(dat235, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)416));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)240, 63, dat233);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)264, 65535, sum129);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)328, 63, dat234);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)352, 65535, sum130);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)416, 63, dat235);
__m512 dat236 = sum132;
dat236 = _mm512_add_ps(dat236, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3056));
sum133 = _mm512_add_ps(sum133, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3080));
__m512i via34 = _mm512_castps_si512(sum132);
via34 = _mm512_alignr_epi32(via34, via34, 6);
__m512 dat237 = _mm512_castsi512_ps(via34);
dat237 = _mm512_add_ps(dat237, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3144));
sum134 = _mm512_add_ps(sum134, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3168));
__m512 dat238 = sum135;
dat238 = _mm512_add_ps(dat238, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3232));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3056, 63, dat236);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3080, 65535, sum133);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3144, 63, dat237);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3168, 65535, sum134);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3232, 63, dat238);
__m512 dat239 = sum136;
dat239 = _mm512_add_ps(dat239, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)5872));
sum137 = _mm512_add_ps(sum137, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)5896));
__m512i via35 = _mm512_castps_si512(sum136);
via35 = _mm512_alignr_epi32(via35, via35, 6);
__m512 dat240 = _mm512_castsi512_ps(via35);
dat240 = _mm512_add_ps(dat240, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)5960));
sum138 = _mm512_add_ps(sum138, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)5984));
__m512 dat241 = sum139;
dat241 = _mm512_add_ps(dat241, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)6048));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)5872, 63, dat239);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)5896, 65535, sum137);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)5960, 63, dat240);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)5984, 65535, sum138);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)6048, 63, dat241);
__m512 dat242 = sum140;
dat242 = _mm512_add_ps(dat242, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)8688));
sum141 = _mm512_add_ps(sum141, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)8712));
__m512i via36 = _mm512_castps_si512(sum140);
via36 = _mm512_alignr_epi32(via36, via36, 6);
__m512 dat243 = _mm512_castsi512_ps(via36);
dat243 = _mm512_add_ps(dat243, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)8776));
sum142 = _mm512_add_ps(sum142, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)8800));
__m512 dat244 = sum143;
dat244 = _mm512_add_ps(dat244, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)8864));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)8688, 63, dat242);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)8712, 65535, sum141);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)8776, 63, dat243);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)8800, 65535, sum142);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)8864, 63, dat244);
__m512 dat245 = sum144;
dat245 = _mm512_add_ps(dat245, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)11504));
sum145 = _mm512_add_ps(sum145, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)11528));
__m512i via37 = _mm512_castps_si512(sum144);
via37 = _mm512_alignr_epi32(via37, via37, 6);
__m512 dat246 = _mm512_castsi512_ps(via37);
dat246 = _mm512_add_ps(dat246, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)11592));
sum146 = _mm512_add_ps(sum146, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)11616));
__m512 dat247 = sum147;
dat247 = _mm512_add_ps(dat247, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)11680));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)11504, 63, dat245);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)11528, 65535, sum145);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)11592, 63, dat246);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)11616, 65535, sum146);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)11680, 63, dat247);
__m512 dat248 = sum148;
dat248 = _mm512_add_ps(dat248, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)14320));
sum149 = _mm512_add_ps(sum149, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)14344));
__m512i via38 = _mm512_castps_si512(sum148);
via38 = _mm512_alignr_epi32(via38, via38, 6);
__m512 dat249 = _mm512_castsi512_ps(via38);
dat249 = _mm512_add_ps(dat249, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)14408));
sum150 = _mm512_add_ps(sum150, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)14432));
__m512 dat250 = sum151;
dat250 = _mm512_add_ps(dat250, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)14496));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)14320, 63, dat248);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)14344, 65535, sum149);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)14408, 63, dat249);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)14432, 65535, sum150);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)14496, 63, dat250);
if (k15 >= kk11) return;
}
ptrdiff_t s11 = -1;
__m512 sum152 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k15+8*s11+(ptrdiff_t)8));
__m512 sum156 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k15+8*s11+(ptrdiff_t)12));
__m512 sum153 = sum152;
__m512 sum154 = sum152;
__m512 sum155 = sum152;
__m512 sum157 = sum156;
__m512 sum158 = sum156;
__m512 sum159 = sum156;
for (s11 = 0; s11 < 835; ++s11) {
__m512 dat251 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+256*s11+(ptrdiff_t)0);
__m512 dat252 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+256*s11+(ptrdiff_t)64);
__m512 dat253 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+256*s11+(ptrdiff_t)128);
__m512 dat254 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+256*s11+(ptrdiff_t)192);
__m512 wt219 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k15+8*s11+(ptrdiff_t)8));
sum152 = _mm512_fmadd_ps(wt219, dat251, sum152);
sum153 = _mm512_fmadd_ps(wt219, dat252, sum153);
sum154 = _mm512_fmadd_ps(wt219, dat253, sum154);
sum155 = _mm512_fmadd_ps(wt219, dat254, sum155);
__m512 wt220 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k15+8*s11+(ptrdiff_t)12));
sum156 = _mm512_fmadd_ps(wt220, dat251, sum156);
sum157 = _mm512_fmadd_ps(wt220, dat252, sum157);
sum158 = _mm512_fmadd_ps(wt220, dat253, sum158);
sum159 = _mm512_fmadd_ps(wt220, dat254, sum159);
}
__m512 dat255 = sum152;
dat255 = _mm512_add_ps(dat255, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)240));
sum153 = _mm512_add_ps(sum153, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)264));
__m512i via39 = _mm512_castps_si512(sum152);
via39 = _mm512_alignr_epi32(via39, via39, 6);
__m512 dat256 = _mm512_castsi512_ps(via39);
dat256 = _mm512_add_ps(dat256, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)328));
sum154 = _mm512_add_ps(sum154, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)352));
__m512 dat257 = sum155;
dat257 = _mm512_add_ps(dat257, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)416));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)240, 63, dat255);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)264, 65535, sum153);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)328, 63, dat256);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)352, 65535, sum154);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)416, 63, dat257);
__m512 dat258 = sum156;
dat258 = _mm512_add_ps(dat258, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3056));
sum157 = _mm512_add_ps(sum157, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3080));
__m512i via40 = _mm512_castps_si512(sum156);
via40 = _mm512_alignr_epi32(via40, via40, 6);
__m512 dat259 = _mm512_castsi512_ps(via40);
dat259 = _mm512_add_ps(dat259, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3144));
sum158 = _mm512_add_ps(sum158, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3168));
__m512 dat260 = sum159;
dat260 = _mm512_add_ps(dat260, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3232));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3056, 63, dat258);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3080, 65535, sum157);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3144, 63, dat259);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3168, 65535, sum158);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h5+16896*k15+(ptrdiff_t)3232, 63, dat260);
if (j6 >= jj6) return;
if (j6 >= 11) break;
++j6;
h5 += 5;
goto wrap6;
}
}
j6 = 12;
}
ptrdiff_t h6 = 30;
switch (j6) {
default: {
j6 = 12;
ptrdiff_t k16 = 1*w2;
ptrdiff_t kk12 = k16+0;
for (; k16 != 449; ++k16) {
ptrdiff_t s12 = -1;
__m512 sum160 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k16+24*s12+(ptrdiff_t)24));
__m512 sum163 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k16+24*s12+(ptrdiff_t)28));
__m512 sum166 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k16+24*s12+(ptrdiff_t)32));
__m512 sum169 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k16+24*s12+(ptrdiff_t)36));
__m512 sum172 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k16+24*s12+(ptrdiff_t)40));
__m512 sum175 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k16+24*s12+(ptrdiff_t)44));
__m512 sum161 = sum160;
__m512 sum162 = sum160;
__m512 sum164 = sum163;
__m512 sum165 = sum163;
__m512 sum167 = sum166;
__m512 sum168 = sum166;
__m512 sum170 = sum169;
__m512 sum171 = sum169;
__m512 sum173 = sum172;
__m512 sum174 = sum172;
__m512 sum176 = sum175;
__m512 sum177 = sum175;
for (s12 = 0; s12 < 835; ++s12) {
__m512 dat261 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+192*s12+(ptrdiff_t)0);
__m512 dat262 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+192*s12+(ptrdiff_t)64);
__m512 dat263 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+192*s12+(ptrdiff_t)128);
__m512 wt221 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k16+24*s12+(ptrdiff_t)24));
sum160 = _mm512_fmadd_ps(wt221, dat261, sum160);
sum161 = _mm512_fmadd_ps(wt221, dat262, sum161);
sum162 = _mm512_fmadd_ps(wt221, dat263, sum162);
__m512 wt222 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k16+24*s12+(ptrdiff_t)28));
sum163 = _mm512_fmadd_ps(wt222, dat261, sum163);
sum164 = _mm512_fmadd_ps(wt222, dat262, sum164);
sum165 = _mm512_fmadd_ps(wt222, dat263, sum165);
__m512 wt223 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k16+24*s12+(ptrdiff_t)32));
sum166 = _mm512_fmadd_ps(wt223, dat261, sum166);
sum167 = _mm512_fmadd_ps(wt223, dat262, sum167);
sum168 = _mm512_fmadd_ps(wt223, dat263, sum168);
__m512 wt224 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k16+24*s12+(ptrdiff_t)36));
sum169 = _mm512_fmadd_ps(wt224, dat261, sum169);
sum170 = _mm512_fmadd_ps(wt224, dat262, sum170);
sum171 = _mm512_fmadd_ps(wt224, dat263, sum171);
__m512 wt225 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k16+24*s12+(ptrdiff_t)40));
sum172 = _mm512_fmadd_ps(wt225, dat261, sum172);
sum173 = _mm512_fmadd_ps(wt225, dat262, sum173);
sum174 = _mm512_fmadd_ps(wt225, dat263, sum174);
__m512 wt226 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k16+24*s12+(ptrdiff_t)44));
sum175 = _mm512_fmadd_ps(wt226, dat261, sum175);
sum176 = _mm512_fmadd_ps(wt226, dat262, sum176);
sum177 = _mm512_fmadd_ps(wt226, dat263, sum177);
}
sum160 = _mm512_add_ps(sum160, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)0));
__m512 dat264 = sum161;
dat264 = _mm512_add_ps(dat264, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)64));
sum162 = _mm512_add_ps(sum162, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)88));
__m512i via41 = _mm512_castps_si512(sum161);
via41 = _mm512_alignr_epi32(via41, via41, 6);
__m512 dat265 = _mm512_castsi512_ps(via41);
dat265 = _mm512_add_ps(dat265, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)152));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)0, 65535, sum160);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)64, 63, dat264);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)88, 65535, sum162);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)152, 63, dat265);
sum163 = _mm512_add_ps(sum163, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)2816));
__m512 dat266 = sum164;
dat266 = _mm512_add_ps(dat266, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)2880));
sum165 = _mm512_add_ps(sum165, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)2904));
__m512i via42 = _mm512_castps_si512(sum164);
via42 = _mm512_alignr_epi32(via42, via42, 6);
__m512 dat267 = _mm512_castsi512_ps(via42);
dat267 = _mm512_add_ps(dat267, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)2968));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)2816, 65535, sum163);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)2880, 63, dat266);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)2904, 65535, sum165);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)2968, 63, dat267);
sum166 = _mm512_add_ps(sum166, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)5632));
__m512 dat268 = sum167;
dat268 = _mm512_add_ps(dat268, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)5696));
sum168 = _mm512_add_ps(sum168, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)5720));
__m512i via43 = _mm512_castps_si512(sum167);
via43 = _mm512_alignr_epi32(via43, via43, 6);
__m512 dat269 = _mm512_castsi512_ps(via43);
dat269 = _mm512_add_ps(dat269, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)5784));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)5632, 65535, sum166);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)5696, 63, dat268);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)5720, 65535, sum168);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)5784, 63, dat269);
sum169 = _mm512_add_ps(sum169, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)8448));
__m512 dat270 = sum170;
dat270 = _mm512_add_ps(dat270, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)8512));
sum171 = _mm512_add_ps(sum171, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)8536));
__m512i via44 = _mm512_castps_si512(sum170);
via44 = _mm512_alignr_epi32(via44, via44, 6);
__m512 dat271 = _mm512_castsi512_ps(via44);
dat271 = _mm512_add_ps(dat271, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)8600));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)8448, 65535, sum169);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)8512, 63, dat270);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)8536, 65535, sum171);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)8600, 63, dat271);
sum172 = _mm512_add_ps(sum172, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)11264));
__m512 dat272 = sum173;
dat272 = _mm512_add_ps(dat272, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)11328));
sum174 = _mm512_add_ps(sum174, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)11352));
__m512i via45 = _mm512_castps_si512(sum173);
via45 = _mm512_alignr_epi32(via45, via45, 6);
__m512 dat273 = _mm512_castsi512_ps(via45);
dat273 = _mm512_add_ps(dat273, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)11416));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)11264, 65535, sum172);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)11328, 63, dat272);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)11352, 65535, sum174);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)11416, 63, dat273);
sum175 = _mm512_add_ps(sum175, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)14080));
__m512 dat274 = sum176;
dat274 = _mm512_add_ps(dat274, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)14144));
sum177 = _mm512_add_ps(sum177, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)14168));
__m512i via46 = _mm512_castps_si512(sum176);
via46 = _mm512_alignr_epi32(via46, via46, 6);
__m512 dat275 = _mm512_castsi512_ps(via46);
dat275 = _mm512_add_ps(dat275, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)14232));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)14080, 65535, sum175);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)14144, 63, dat274);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)14168, 65535, sum177);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)14232, 63, dat275);
if (k16 >= kk12) return;
}
ptrdiff_t s13 = -1;
__m512 sum178 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k16+8*s13+(ptrdiff_t)8));
__m512 sum181 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k16+8*s13+(ptrdiff_t)12));
__m512 sum179 = sum178;
__m512 sum180 = sum178;
__m512 sum182 = sum181;
__m512 sum183 = sum181;
for (s13 = 0; s13 < 835; ++s13) {
__m512 dat276 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+192*s13+(ptrdiff_t)0);
__m512 dat277 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+192*s13+(ptrdiff_t)64);
__m512 dat278 = _mm512_loadu_ps(arrangedDats2+2725440*i10+213760*j6+192*s13+(ptrdiff_t)128);
__m512 wt227 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k16+8*s13+(ptrdiff_t)8));
sum178 = _mm512_fmadd_ps(wt227, dat276, sum178);
sum179 = _mm512_fmadd_ps(wt227, dat277, sum179);
sum180 = _mm512_fmadd_ps(wt227, dat278, sum180);
__m512 wt228 = _mm512_set1_ps(*(float*)(arrangedWts2+9015424*i10+20064*k16+8*s13+(ptrdiff_t)12));
sum181 = _mm512_fmadd_ps(wt228, dat276, sum181);
sum182 = _mm512_fmadd_ps(wt228, dat277, sum182);
sum183 = _mm512_fmadd_ps(wt228, dat278, sum183);
}
sum178 = _mm512_add_ps(sum178, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)0));
__m512 dat279 = sum179;
dat279 = _mm512_add_ps(dat279, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)64));
sum180 = _mm512_add_ps(sum180, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)88));
__m512i via47 = _mm512_castps_si512(sum179);
via47 = _mm512_alignr_epi32(via47, via47, 6);
__m512 dat280 = _mm512_castsi512_ps(via47);
dat280 = _mm512_add_ps(dat280, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)152));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)0, 65535, sum178);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)64, 63, dat279);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)88, 65535, sum180);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)152, 63, dat280);
sum181 = _mm512_add_ps(sum181, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)2816));
__m512 dat281 = sum182;
dat281 = _mm512_add_ps(dat281, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)2880));
sum183 = _mm512_add_ps(sum183, _mm512_maskz_loadu_ps(65535, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)2904));
__m512i via48 = _mm512_castps_si512(sum182);
via48 = _mm512_alignr_epi32(via48, via48, 6);
__m512 dat282 = _mm512_castsi512_ps(via48);
dat282 = _mm512_add_ps(dat282, _mm512_maskz_loadu_ps(63, datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)2968));
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)2816, 65535, sum181);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)2880, 63, dat281);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)2904, 65535, sum183);
_mm512_mask_storeu_ps(datPtr4+7591936*i10+88*h6+16896*k16+(ptrdiff_t)2968, 63, dat282);
if (j6 >= jj6) return;
}
}
j6 = 13;
}
}

static void Example17OneApply1Callee3(Example17ThreaderTask1* task10, int64_t* pt11) {
void** pair4 = task10->any1;
char** tensors8 = pair4[0];
ptrdiff_t e6 = 3;
ptrdiff_t g4 = 0;
ptrdiff_t d3 = pt11[1];
ptrdiff_t w3 = pt11[0];
char*restrict arrangedWts3 = tensors8[0]+9015424*e6+(ptrdiff_t)10665376*1*g4;
char*restrict arrangedDats3 = tensors8[1]+2725440*e6+(ptrdiff_t)3224832*1*g4;
char*restrict datPtr5 = tensors8[2]+(ptrdiff_t)7591936*1*g4;
ptrdiff_t ii7 = 1;
for (ptrdiff_t i11 = 0; i11 < ii7; ++i11) {
ptrdiff_t j7 = 1*d3;
ptrdiff_t jj7 = j7+0;
if (j7 < 12) {
ptrdiff_t h7 = 0+((size_t)j7-0)/2*5;
switch (((size_t)j7-0)%2) {
case 0: {
wrap7:;
ptrdiff_t k17 = 1*w3;
ptrdiff_t kk13 = k17+0;
for (; k17 != 449; ++k17) {
ptrdiff_t s14 = -1;
__m512 sum184 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k17+24*s14+(ptrdiff_t)24));
__m512 sum188 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k17+24*s14+(ptrdiff_t)28));
__m512 sum192 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k17+24*s14+(ptrdiff_t)32));
__m512 sum196 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k17+24*s14+(ptrdiff_t)36));
__m512 sum200 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k17+24*s14+(ptrdiff_t)40));
__m512 sum204 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k17+24*s14+(ptrdiff_t)44));
__m512 sum185 = sum184;
__m512 sum186 = sum184;
__m512 sum187 = sum184;
__m512 sum189 = sum188;
__m512 sum190 = sum188;
__m512 sum191 = sum188;
__m512 sum193 = sum192;
__m512 sum194 = sum192;
__m512 sum195 = sum192;
__m512 sum197 = sum196;
__m512 sum198 = sum196;
__m512 sum199 = sum196;
__m512 sum201 = sum200;
__m512 sum202 = sum200;
__m512 sum203 = sum200;
__m512 sum205 = sum204;
__m512 sum206 = sum204;
__m512 sum207 = sum204;
for (s14 = 0; s14 < 988; ++s14) {
__m512 dat283 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+256*s14+(ptrdiff_t)0);
__m512 dat284 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+256*s14+(ptrdiff_t)64);
__m512 dat285 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+256*s14+(ptrdiff_t)128);
__m512 dat286 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+256*s14+(ptrdiff_t)192);
__m512 wt229 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k17+24*s14+(ptrdiff_t)24));
sum184 = _mm512_fmadd_ps(wt229, dat283, sum184);
sum185 = _mm512_fmadd_ps(wt229, dat284, sum185);
sum186 = _mm512_fmadd_ps(wt229, dat285, sum186);
sum187 = _mm512_fmadd_ps(wt229, dat286, sum187);
__m512 wt230 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k17+24*s14+(ptrdiff_t)28));
sum188 = _mm512_fmadd_ps(wt230, dat283, sum188);
sum189 = _mm512_fmadd_ps(wt230, dat284, sum189);
sum190 = _mm512_fmadd_ps(wt230, dat285, sum190);
sum191 = _mm512_fmadd_ps(wt230, dat286, sum191);
__m512 wt231 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k17+24*s14+(ptrdiff_t)32));
sum192 = _mm512_fmadd_ps(wt231, dat283, sum192);
sum193 = _mm512_fmadd_ps(wt231, dat284, sum193);
sum194 = _mm512_fmadd_ps(wt231, dat285, sum194);
sum195 = _mm512_fmadd_ps(wt231, dat286, sum195);
__m512 wt232 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k17+24*s14+(ptrdiff_t)36));
sum196 = _mm512_fmadd_ps(wt232, dat283, sum196);
sum197 = _mm512_fmadd_ps(wt232, dat284, sum197);
sum198 = _mm512_fmadd_ps(wt232, dat285, sum198);
sum199 = _mm512_fmadd_ps(wt232, dat286, sum199);
__m512 wt233 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k17+24*s14+(ptrdiff_t)40));
sum200 = _mm512_fmadd_ps(wt233, dat283, sum200);
sum201 = _mm512_fmadd_ps(wt233, dat284, sum201);
sum202 = _mm512_fmadd_ps(wt233, dat285, sum202);
sum203 = _mm512_fmadd_ps(wt233, dat286, sum203);
__m512 wt234 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k17+24*s14+(ptrdiff_t)44));
sum204 = _mm512_fmadd_ps(wt234, dat283, sum204);
sum205 = _mm512_fmadd_ps(wt234, dat284, sum205);
sum206 = _mm512_fmadd_ps(wt234, dat285, sum206);
sum207 = _mm512_fmadd_ps(wt234, dat286, sum207);
}
sum184 = _mm512_add_ps(sum184, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)0));
__m512 dat287 = sum185;
dat287 = _mm512_add_ps(dat287, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)64));
sum186 = _mm512_add_ps(sum186, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)88));
__m512i via49 = _mm512_castps_si512(sum185);
via49 = _mm512_alignr_epi32(via49, via49, 6);
__m512 dat288 = _mm512_castsi512_ps(via49);
dat288 = _mm512_add_ps(dat288, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)152));
sum187 = _mm512_add_ps(sum187, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)176));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)0, 65535, sum184);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)64, 63, dat287);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)88, 65535, sum186);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)152, 63, dat288);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)176, 65535, sum187);
sum188 = _mm512_add_ps(sum188, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2816));
__m512 dat289 = sum189;
dat289 = _mm512_add_ps(dat289, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2880));
sum190 = _mm512_add_ps(sum190, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2904));
__m512i via50 = _mm512_castps_si512(sum189);
via50 = _mm512_alignr_epi32(via50, via50, 6);
__m512 dat290 = _mm512_castsi512_ps(via50);
dat290 = _mm512_add_ps(dat290, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2968));
sum191 = _mm512_add_ps(sum191, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2992));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2816, 65535, sum188);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2880, 63, dat289);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2904, 65535, sum190);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2968, 63, dat290);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2992, 65535, sum191);
sum192 = _mm512_add_ps(sum192, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)5632));
__m512 dat291 = sum193;
dat291 = _mm512_add_ps(dat291, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)5696));
sum194 = _mm512_add_ps(sum194, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)5720));
__m512i via51 = _mm512_castps_si512(sum193);
via51 = _mm512_alignr_epi32(via51, via51, 6);
__m512 dat292 = _mm512_castsi512_ps(via51);
dat292 = _mm512_add_ps(dat292, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)5784));
sum195 = _mm512_add_ps(sum195, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)5808));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)5632, 65535, sum192);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)5696, 63, dat291);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)5720, 65535, sum194);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)5784, 63, dat292);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)5808, 65535, sum195);
sum196 = _mm512_add_ps(sum196, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)8448));
__m512 dat293 = sum197;
dat293 = _mm512_add_ps(dat293, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)8512));
sum198 = _mm512_add_ps(sum198, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)8536));
__m512i via52 = _mm512_castps_si512(sum197);
via52 = _mm512_alignr_epi32(via52, via52, 6);
__m512 dat294 = _mm512_castsi512_ps(via52);
dat294 = _mm512_add_ps(dat294, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)8600));
sum199 = _mm512_add_ps(sum199, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)8624));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)8448, 65535, sum196);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)8512, 63, dat293);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)8536, 65535, sum198);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)8600, 63, dat294);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)8624, 65535, sum199);
sum200 = _mm512_add_ps(sum200, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)11264));
__m512 dat295 = sum201;
dat295 = _mm512_add_ps(dat295, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)11328));
sum202 = _mm512_add_ps(sum202, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)11352));
__m512i via53 = _mm512_castps_si512(sum201);
via53 = _mm512_alignr_epi32(via53, via53, 6);
__m512 dat296 = _mm512_castsi512_ps(via53);
dat296 = _mm512_add_ps(dat296, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)11416));
sum203 = _mm512_add_ps(sum203, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)11440));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)11264, 65535, sum200);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)11328, 63, dat295);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)11352, 65535, sum202);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)11416, 63, dat296);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)11440, 65535, sum203);
sum204 = _mm512_add_ps(sum204, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)14080));
__m512 dat297 = sum205;
dat297 = _mm512_add_ps(dat297, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)14144));
sum206 = _mm512_add_ps(sum206, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)14168));
__m512i via54 = _mm512_castps_si512(sum205);
via54 = _mm512_alignr_epi32(via54, via54, 6);
__m512 dat298 = _mm512_castsi512_ps(via54);
dat298 = _mm512_add_ps(dat298, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)14232));
sum207 = _mm512_add_ps(sum207, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)14256));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)14080, 65535, sum204);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)14144, 63, dat297);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)14168, 65535, sum206);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)14232, 63, dat298);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)14256, 65535, sum207);
if (k17 >= kk13) return;
}
ptrdiff_t s15 = -1;
__m512 sum208 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k17+8*s15+(ptrdiff_t)8));
__m512 sum212 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k17+8*s15+(ptrdiff_t)12));
__m512 sum209 = sum208;
__m512 sum210 = sum208;
__m512 sum211 = sum208;
__m512 sum213 = sum212;
__m512 sum214 = sum212;
__m512 sum215 = sum212;
for (s15 = 0; s15 < 988; ++s15) {
__m512 dat299 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+256*s15+(ptrdiff_t)0);
__m512 dat300 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+256*s15+(ptrdiff_t)64);
__m512 dat301 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+256*s15+(ptrdiff_t)128);
__m512 dat302 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+256*s15+(ptrdiff_t)192);
__m512 wt235 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k17+8*s15+(ptrdiff_t)8));
sum208 = _mm512_fmadd_ps(wt235, dat299, sum208);
sum209 = _mm512_fmadd_ps(wt235, dat300, sum209);
sum210 = _mm512_fmadd_ps(wt235, dat301, sum210);
sum211 = _mm512_fmadd_ps(wt235, dat302, sum211);
__m512 wt236 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k17+8*s15+(ptrdiff_t)12));
sum212 = _mm512_fmadd_ps(wt236, dat299, sum212);
sum213 = _mm512_fmadd_ps(wt236, dat300, sum213);
sum214 = _mm512_fmadd_ps(wt236, dat301, sum214);
sum215 = _mm512_fmadd_ps(wt236, dat302, sum215);
}
sum208 = _mm512_add_ps(sum208, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)0));
__m512 dat303 = sum209;
dat303 = _mm512_add_ps(dat303, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)64));
sum210 = _mm512_add_ps(sum210, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)88));
__m512i via55 = _mm512_castps_si512(sum209);
via55 = _mm512_alignr_epi32(via55, via55, 6);
__m512 dat304 = _mm512_castsi512_ps(via55);
dat304 = _mm512_add_ps(dat304, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)152));
sum211 = _mm512_add_ps(sum211, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)176));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)0, 65535, sum208);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)64, 63, dat303);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)88, 65535, sum210);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)152, 63, dat304);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)176, 65535, sum211);
sum212 = _mm512_add_ps(sum212, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2816));
__m512 dat305 = sum213;
dat305 = _mm512_add_ps(dat305, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2880));
sum214 = _mm512_add_ps(sum214, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2904));
__m512i via56 = _mm512_castps_si512(sum213);
via56 = _mm512_alignr_epi32(via56, via56, 6);
__m512 dat306 = _mm512_castsi512_ps(via56);
dat306 = _mm512_add_ps(dat306, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2968));
sum215 = _mm512_add_ps(sum215, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2992));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2816, 65535, sum212);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2880, 63, dat305);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2904, 65535, sum214);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2968, 63, dat306);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k17+(ptrdiff_t)2992, 65535, sum215);
if (j7 >= jj7) return;
++j7;
}
default: {
ptrdiff_t k18 = 1*w3;
ptrdiff_t kk14 = k18+0;
for (; k18 != 449; ++k18) {
ptrdiff_t s16 = -1;
__m512 sum216 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k18+24*s16+(ptrdiff_t)24));
__m512 sum220 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k18+24*s16+(ptrdiff_t)28));
__m512 sum224 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k18+24*s16+(ptrdiff_t)32));
__m512 sum228 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k18+24*s16+(ptrdiff_t)36));
__m512 sum232 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k18+24*s16+(ptrdiff_t)40));
__m512 sum236 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k18+24*s16+(ptrdiff_t)44));
__m512 sum217 = sum216;
__m512 sum218 = sum216;
__m512 sum219 = sum216;
__m512 sum221 = sum220;
__m512 sum222 = sum220;
__m512 sum223 = sum220;
__m512 sum225 = sum224;
__m512 sum226 = sum224;
__m512 sum227 = sum224;
__m512 sum229 = sum228;
__m512 sum230 = sum228;
__m512 sum231 = sum228;
__m512 sum233 = sum232;
__m512 sum234 = sum232;
__m512 sum235 = sum232;
__m512 sum237 = sum236;
__m512 sum238 = sum236;
__m512 sum239 = sum236;
for (s16 = 0; s16 < 988; ++s16) {
__m512 dat307 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+256*s16+(ptrdiff_t)0);
__m512 dat308 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+256*s16+(ptrdiff_t)64);
__m512 dat309 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+256*s16+(ptrdiff_t)128);
__m512 dat310 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+256*s16+(ptrdiff_t)192);
__m512 wt237 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k18+24*s16+(ptrdiff_t)24));
sum216 = _mm512_fmadd_ps(wt237, dat307, sum216);
sum217 = _mm512_fmadd_ps(wt237, dat308, sum217);
sum218 = _mm512_fmadd_ps(wt237, dat309, sum218);
sum219 = _mm512_fmadd_ps(wt237, dat310, sum219);
__m512 wt238 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k18+24*s16+(ptrdiff_t)28));
sum220 = _mm512_fmadd_ps(wt238, dat307, sum220);
sum221 = _mm512_fmadd_ps(wt238, dat308, sum221);
sum222 = _mm512_fmadd_ps(wt238, dat309, sum222);
sum223 = _mm512_fmadd_ps(wt238, dat310, sum223);
__m512 wt239 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k18+24*s16+(ptrdiff_t)32));
sum224 = _mm512_fmadd_ps(wt239, dat307, sum224);
sum225 = _mm512_fmadd_ps(wt239, dat308, sum225);
sum226 = _mm512_fmadd_ps(wt239, dat309, sum226);
sum227 = _mm512_fmadd_ps(wt239, dat310, sum227);
__m512 wt240 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k18+24*s16+(ptrdiff_t)36));
sum228 = _mm512_fmadd_ps(wt240, dat307, sum228);
sum229 = _mm512_fmadd_ps(wt240, dat308, sum229);
sum230 = _mm512_fmadd_ps(wt240, dat309, sum230);
sum231 = _mm512_fmadd_ps(wt240, dat310, sum231);
__m512 wt241 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k18+24*s16+(ptrdiff_t)40));
sum232 = _mm512_fmadd_ps(wt241, dat307, sum232);
sum233 = _mm512_fmadd_ps(wt241, dat308, sum233);
sum234 = _mm512_fmadd_ps(wt241, dat309, sum234);
sum235 = _mm512_fmadd_ps(wt241, dat310, sum235);
__m512 wt242 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k18+24*s16+(ptrdiff_t)44));
sum236 = _mm512_fmadd_ps(wt242, dat307, sum236);
sum237 = _mm512_fmadd_ps(wt242, dat308, sum237);
sum238 = _mm512_fmadd_ps(wt242, dat309, sum238);
sum239 = _mm512_fmadd_ps(wt242, dat310, sum239);
}
__m512 dat311 = sum216;
dat311 = _mm512_add_ps(dat311, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)240));
sum217 = _mm512_add_ps(sum217, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)264));
__m512i via57 = _mm512_castps_si512(sum216);
via57 = _mm512_alignr_epi32(via57, via57, 6);
__m512 dat312 = _mm512_castsi512_ps(via57);
dat312 = _mm512_add_ps(dat312, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)328));
sum218 = _mm512_add_ps(sum218, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)352));
__m512 dat313 = sum219;
dat313 = _mm512_add_ps(dat313, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)416));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)240, 63, dat311);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)264, 65535, sum217);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)328, 63, dat312);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)352, 65535, sum218);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)416, 63, dat313);
__m512 dat314 = sum220;
dat314 = _mm512_add_ps(dat314, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3056));
sum221 = _mm512_add_ps(sum221, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3080));
__m512i via58 = _mm512_castps_si512(sum220);
via58 = _mm512_alignr_epi32(via58, via58, 6);
__m512 dat315 = _mm512_castsi512_ps(via58);
dat315 = _mm512_add_ps(dat315, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3144));
sum222 = _mm512_add_ps(sum222, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3168));
__m512 dat316 = sum223;
dat316 = _mm512_add_ps(dat316, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3232));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3056, 63, dat314);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3080, 65535, sum221);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3144, 63, dat315);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3168, 65535, sum222);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3232, 63, dat316);
__m512 dat317 = sum224;
dat317 = _mm512_add_ps(dat317, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)5872));
sum225 = _mm512_add_ps(sum225, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)5896));
__m512i via59 = _mm512_castps_si512(sum224);
via59 = _mm512_alignr_epi32(via59, via59, 6);
__m512 dat318 = _mm512_castsi512_ps(via59);
dat318 = _mm512_add_ps(dat318, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)5960));
sum226 = _mm512_add_ps(sum226, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)5984));
__m512 dat319 = sum227;
dat319 = _mm512_add_ps(dat319, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)6048));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)5872, 63, dat317);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)5896, 65535, sum225);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)5960, 63, dat318);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)5984, 65535, sum226);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)6048, 63, dat319);
__m512 dat320 = sum228;
dat320 = _mm512_add_ps(dat320, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)8688));
sum229 = _mm512_add_ps(sum229, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)8712));
__m512i via60 = _mm512_castps_si512(sum228);
via60 = _mm512_alignr_epi32(via60, via60, 6);
__m512 dat321 = _mm512_castsi512_ps(via60);
dat321 = _mm512_add_ps(dat321, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)8776));
sum230 = _mm512_add_ps(sum230, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)8800));
__m512 dat322 = sum231;
dat322 = _mm512_add_ps(dat322, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)8864));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)8688, 63, dat320);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)8712, 65535, sum229);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)8776, 63, dat321);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)8800, 65535, sum230);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)8864, 63, dat322);
__m512 dat323 = sum232;
dat323 = _mm512_add_ps(dat323, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)11504));
sum233 = _mm512_add_ps(sum233, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)11528));
__m512i via61 = _mm512_castps_si512(sum232);
via61 = _mm512_alignr_epi32(via61, via61, 6);
__m512 dat324 = _mm512_castsi512_ps(via61);
dat324 = _mm512_add_ps(dat324, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)11592));
sum234 = _mm512_add_ps(sum234, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)11616));
__m512 dat325 = sum235;
dat325 = _mm512_add_ps(dat325, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)11680));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)11504, 63, dat323);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)11528, 65535, sum233);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)11592, 63, dat324);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)11616, 65535, sum234);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)11680, 63, dat325);
__m512 dat326 = sum236;
dat326 = _mm512_add_ps(dat326, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)14320));
sum237 = _mm512_add_ps(sum237, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)14344));
__m512i via62 = _mm512_castps_si512(sum236);
via62 = _mm512_alignr_epi32(via62, via62, 6);
__m512 dat327 = _mm512_castsi512_ps(via62);
dat327 = _mm512_add_ps(dat327, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)14408));
sum238 = _mm512_add_ps(sum238, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)14432));
__m512 dat328 = sum239;
dat328 = _mm512_add_ps(dat328, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)14496));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)14320, 63, dat326);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)14344, 65535, sum237);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)14408, 63, dat327);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)14432, 65535, sum238);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)14496, 63, dat328);
if (k18 >= kk14) return;
}
ptrdiff_t s17 = -1;
__m512 sum240 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k18+8*s17+(ptrdiff_t)8));
__m512 sum244 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k18+8*s17+(ptrdiff_t)12));
__m512 sum241 = sum240;
__m512 sum242 = sum240;
__m512 sum243 = sum240;
__m512 sum245 = sum244;
__m512 sum246 = sum244;
__m512 sum247 = sum244;
for (s17 = 0; s17 < 988; ++s17) {
__m512 dat329 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+256*s17+(ptrdiff_t)0);
__m512 dat330 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+256*s17+(ptrdiff_t)64);
__m512 dat331 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+256*s17+(ptrdiff_t)128);
__m512 dat332 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+256*s17+(ptrdiff_t)192);
__m512 wt243 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k18+8*s17+(ptrdiff_t)8));
sum240 = _mm512_fmadd_ps(wt243, dat329, sum240);
sum241 = _mm512_fmadd_ps(wt243, dat330, sum241);
sum242 = _mm512_fmadd_ps(wt243, dat331, sum242);
sum243 = _mm512_fmadd_ps(wt243, dat332, sum243);
__m512 wt244 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k18+8*s17+(ptrdiff_t)12));
sum244 = _mm512_fmadd_ps(wt244, dat329, sum244);
sum245 = _mm512_fmadd_ps(wt244, dat330, sum245);
sum246 = _mm512_fmadd_ps(wt244, dat331, sum246);
sum247 = _mm512_fmadd_ps(wt244, dat332, sum247);
}
__m512 dat333 = sum240;
dat333 = _mm512_add_ps(dat333, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)240));
sum241 = _mm512_add_ps(sum241, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)264));
__m512i via63 = _mm512_castps_si512(sum240);
via63 = _mm512_alignr_epi32(via63, via63, 6);
__m512 dat334 = _mm512_castsi512_ps(via63);
dat334 = _mm512_add_ps(dat334, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)328));
sum242 = _mm512_add_ps(sum242, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)352));
__m512 dat335 = sum243;
dat335 = _mm512_add_ps(dat335, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)416));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)240, 63, dat333);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)264, 65535, sum241);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)328, 63, dat334);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)352, 65535, sum242);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)416, 63, dat335);
__m512 dat336 = sum244;
dat336 = _mm512_add_ps(dat336, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3056));
sum245 = _mm512_add_ps(sum245, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3080));
__m512i via64 = _mm512_castps_si512(sum244);
via64 = _mm512_alignr_epi32(via64, via64, 6);
__m512 dat337 = _mm512_castsi512_ps(via64);
dat337 = _mm512_add_ps(dat337, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3144));
sum246 = _mm512_add_ps(sum246, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3168));
__m512 dat338 = sum247;
dat338 = _mm512_add_ps(dat338, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3232));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3056, 63, dat336);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3080, 65535, sum245);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3144, 63, dat337);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3168, 65535, sum246);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h7+16896*k18+(ptrdiff_t)3232, 63, dat338);
if (j7 >= jj7) return;
if (j7 >= 11) break;
++j7;
h7 += 5;
goto wrap7;
}
}
j7 = 12;
}
ptrdiff_t h8 = 30;
switch (j7) {
default: {
j7 = 12;
ptrdiff_t k19 = 1*w3;
ptrdiff_t kk15 = k19+0;
for (; k19 != 449; ++k19) {
ptrdiff_t s18 = -1;
__m512 sum248 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k19+24*s18+(ptrdiff_t)24));
__m512 sum251 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k19+24*s18+(ptrdiff_t)28));
__m512 sum254 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k19+24*s18+(ptrdiff_t)32));
__m512 sum257 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k19+24*s18+(ptrdiff_t)36));
__m512 sum260 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k19+24*s18+(ptrdiff_t)40));
__m512 sum263 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k19+24*s18+(ptrdiff_t)44));
__m512 sum249 = sum248;
__m512 sum250 = sum248;
__m512 sum252 = sum251;
__m512 sum253 = sum251;
__m512 sum255 = sum254;
__m512 sum256 = sum254;
__m512 sum258 = sum257;
__m512 sum259 = sum257;
__m512 sum261 = sum260;
__m512 sum262 = sum260;
__m512 sum264 = sum263;
__m512 sum265 = sum263;
for (s18 = 0; s18 < 988; ++s18) {
__m512 dat339 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+192*s18+(ptrdiff_t)0);
__m512 dat340 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+192*s18+(ptrdiff_t)64);
__m512 dat341 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+192*s18+(ptrdiff_t)128);
__m512 wt245 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k19+24*s18+(ptrdiff_t)24));
sum248 = _mm512_fmadd_ps(wt245, dat339, sum248);
sum249 = _mm512_fmadd_ps(wt245, dat340, sum249);
sum250 = _mm512_fmadd_ps(wt245, dat341, sum250);
__m512 wt246 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k19+24*s18+(ptrdiff_t)28));
sum251 = _mm512_fmadd_ps(wt246, dat339, sum251);
sum252 = _mm512_fmadd_ps(wt246, dat340, sum252);
sum253 = _mm512_fmadd_ps(wt246, dat341, sum253);
__m512 wt247 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k19+24*s18+(ptrdiff_t)32));
sum254 = _mm512_fmadd_ps(wt247, dat339, sum254);
sum255 = _mm512_fmadd_ps(wt247, dat340, sum255);
sum256 = _mm512_fmadd_ps(wt247, dat341, sum256);
__m512 wt248 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k19+24*s18+(ptrdiff_t)36));
sum257 = _mm512_fmadd_ps(wt248, dat339, sum257);
sum258 = _mm512_fmadd_ps(wt248, dat340, sum258);
sum259 = _mm512_fmadd_ps(wt248, dat341, sum259);
__m512 wt249 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k19+24*s18+(ptrdiff_t)40));
sum260 = _mm512_fmadd_ps(wt249, dat339, sum260);
sum261 = _mm512_fmadd_ps(wt249, dat340, sum261);
sum262 = _mm512_fmadd_ps(wt249, dat341, sum262);
__m512 wt250 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k19+24*s18+(ptrdiff_t)44));
sum263 = _mm512_fmadd_ps(wt250, dat339, sum263);
sum264 = _mm512_fmadd_ps(wt250, dat340, sum264);
sum265 = _mm512_fmadd_ps(wt250, dat341, sum265);
}
sum248 = _mm512_add_ps(sum248, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)0));
__m512 dat342 = sum249;
dat342 = _mm512_add_ps(dat342, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)64));
sum250 = _mm512_add_ps(sum250, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)88));
__m512i via65 = _mm512_castps_si512(sum249);
via65 = _mm512_alignr_epi32(via65, via65, 6);
__m512 dat343 = _mm512_castsi512_ps(via65);
dat343 = _mm512_add_ps(dat343, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)152));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)0, 65535, sum248);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)64, 63, dat342);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)88, 65535, sum250);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)152, 63, dat343);
sum251 = _mm512_add_ps(sum251, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)2816));
__m512 dat344 = sum252;
dat344 = _mm512_add_ps(dat344, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)2880));
sum253 = _mm512_add_ps(sum253, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)2904));
__m512i via66 = _mm512_castps_si512(sum252);
via66 = _mm512_alignr_epi32(via66, via66, 6);
__m512 dat345 = _mm512_castsi512_ps(via66);
dat345 = _mm512_add_ps(dat345, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)2968));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)2816, 65535, sum251);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)2880, 63, dat344);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)2904, 65535, sum253);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)2968, 63, dat345);
sum254 = _mm512_add_ps(sum254, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)5632));
__m512 dat346 = sum255;
dat346 = _mm512_add_ps(dat346, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)5696));
sum256 = _mm512_add_ps(sum256, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)5720));
__m512i via67 = _mm512_castps_si512(sum255);
via67 = _mm512_alignr_epi32(via67, via67, 6);
__m512 dat347 = _mm512_castsi512_ps(via67);
dat347 = _mm512_add_ps(dat347, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)5784));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)5632, 65535, sum254);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)5696, 63, dat346);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)5720, 65535, sum256);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)5784, 63, dat347);
sum257 = _mm512_add_ps(sum257, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)8448));
__m512 dat348 = sum258;
dat348 = _mm512_add_ps(dat348, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)8512));
sum259 = _mm512_add_ps(sum259, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)8536));
__m512i via68 = _mm512_castps_si512(sum258);
via68 = _mm512_alignr_epi32(via68, via68, 6);
__m512 dat349 = _mm512_castsi512_ps(via68);
dat349 = _mm512_add_ps(dat349, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)8600));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)8448, 65535, sum257);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)8512, 63, dat348);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)8536, 65535, sum259);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)8600, 63, dat349);
sum260 = _mm512_add_ps(sum260, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)11264));
__m512 dat350 = sum261;
dat350 = _mm512_add_ps(dat350, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)11328));
sum262 = _mm512_add_ps(sum262, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)11352));
__m512i via69 = _mm512_castps_si512(sum261);
via69 = _mm512_alignr_epi32(via69, via69, 6);
__m512 dat351 = _mm512_castsi512_ps(via69);
dat351 = _mm512_add_ps(dat351, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)11416));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)11264, 65535, sum260);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)11328, 63, dat350);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)11352, 65535, sum262);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)11416, 63, dat351);
sum263 = _mm512_add_ps(sum263, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)14080));
__m512 dat352 = sum264;
dat352 = _mm512_add_ps(dat352, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)14144));
sum265 = _mm512_add_ps(sum265, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)14168));
__m512i via70 = _mm512_castps_si512(sum264);
via70 = _mm512_alignr_epi32(via70, via70, 6);
__m512 dat353 = _mm512_castsi512_ps(via70);
dat353 = _mm512_add_ps(dat353, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)14232));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)14080, 65535, sum263);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)14144, 63, dat352);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)14168, 65535, sum265);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)14232, 63, dat353);
if (k19 >= kk15) return;
}
ptrdiff_t s19 = -1;
__m512 sum266 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k19+8*s19+(ptrdiff_t)8));
__m512 sum269 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k19+8*s19+(ptrdiff_t)12));
__m512 sum267 = sum266;
__m512 sum268 = sum266;
__m512 sum270 = sum269;
__m512 sum271 = sum269;
for (s19 = 0; s19 < 988; ++s19) {
__m512 dat354 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+192*s19+(ptrdiff_t)0);
__m512 dat355 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+192*s19+(ptrdiff_t)64);
__m512 dat356 = _mm512_loadu_ps(arrangedDats3+3224832*i11+252928*j7+192*s19+(ptrdiff_t)128);
__m512 wt251 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k19+8*s19+(ptrdiff_t)8));
sum266 = _mm512_fmadd_ps(wt251, dat354, sum266);
sum267 = _mm512_fmadd_ps(wt251, dat355, sum267);
sum268 = _mm512_fmadd_ps(wt251, dat356, sum268);
__m512 wt252 = _mm512_set1_ps(*(float*)(arrangedWts3+10665376*i11+23736*k19+8*s19+(ptrdiff_t)12));
sum269 = _mm512_fmadd_ps(wt252, dat354, sum269);
sum270 = _mm512_fmadd_ps(wt252, dat355, sum270);
sum271 = _mm512_fmadd_ps(wt252, dat356, sum271);
}
sum266 = _mm512_add_ps(sum266, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)0));
__m512 dat357 = sum267;
dat357 = _mm512_add_ps(dat357, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)64));
sum268 = _mm512_add_ps(sum268, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)88));
__m512i via71 = _mm512_castps_si512(sum267);
via71 = _mm512_alignr_epi32(via71, via71, 6);
__m512 dat358 = _mm512_castsi512_ps(via71);
dat358 = _mm512_add_ps(dat358, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)152));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)0, 65535, sum266);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)64, 63, dat357);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)88, 65535, sum268);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)152, 63, dat358);
sum269 = _mm512_add_ps(sum269, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)2816));
__m512 dat359 = sum270;
dat359 = _mm512_add_ps(dat359, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)2880));
sum271 = _mm512_add_ps(sum271, _mm512_maskz_loadu_ps(65535, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)2904));
__m512i via72 = _mm512_castps_si512(sum270);
via72 = _mm512_alignr_epi32(via72, via72, 6);
__m512 dat360 = _mm512_castsi512_ps(via72);
dat360 = _mm512_add_ps(dat360, _mm512_maskz_loadu_ps(63, datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)2968));
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)2816, 65535, sum269);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)2880, 63, dat359);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)2904, 65535, sum271);
_mm512_mask_storeu_ps(datPtr5+7591936*i11+88*h8+16896*k19+(ptrdiff_t)2968, 63, dat360);
if (j7 >= jj7) return;
}
}
j7 = 13;
}
}

// Runs the apply step as four consecutive threaded dispatches, each over
// the same 3-dimensional hull (450 x 13 x 1).
//
// Dispatch 0 uses Example17OneApply1Callee1, dispatches 1 and 2 use
// Example17OneApply1Callee2, and dispatch 3 uses Example17OneApply1Callee3.
// Every callee receives the same two-slot argument array through any1:
// slot 0 is the tensor pointer table, slot 1 is the dispatch index cast
// to a pointer (0 for the first dispatch).
static void Example17OneApply1(Example17ThreaderTeam1* team16, char** tensors5) {
    void* args[2];
    args[0] = tensors5;
    for (ptrdiff_t step = 0; step <= 3; ++step) {
        // The argument array is shared across dispatches, so slot 1 is
        // rewritten before each one is started.
        args[1] = (void*)step;
        Example17ThreaderTask1 job;
        switch (step) {
        case 0:
            job.callee1 = Example17OneApply1Callee1;
            break;
        case 3:
            job.callee1 = Example17OneApply1Callee3;
            break;
        default:
            job.callee1 = Example17OneApply1Callee2;
            break;
        }
        job.nd1 = 3;
        job.hull1[2] = 1;
        job.hull1[1] = 13;
        job.hull1[0] = 450;
        job.any1 = args;
        Example17ThreaderDo1(team16, &job);
    }
}

// Network state built by Example17NetCreate and released by
// Example17NetDestroy.
struct Example17Net {
// Pointer returned by malloc in Example17NetCreate; this is the address
// that Example17NetDestroy passes to free.
char* alloc1;
// alloc1 rounded up to the next 64-byte boundary. Example17NetCreate hands
// it to Example17OneArrangeWts1, and Example17EngineInference passes it on
// to Example17OneApply1.
char* align1;
};

// Releases a network obtained from Example17NetCreate: frees the buffer
// held in alloc1, then the Example17Net record itself.
//
// A null net2 is accepted and ignored (same contract as free), so callers
// can run unconditional cleanup even when creation failed and their
// pointer was never assigned.
void Example17NetDestroy(Example17Net* net2) {
    if (!net2) {
        return;
    }
    free(net2->alloc1);
    free(net2);
}

// Creates a network object and publishes it through *net1.
//
// Steps, in order:
//   1. Fail early if the CPU does not report AVX512F support.
//   2. Allocate a 37711711-byte buffer and compute its 64-byte-aligned
//      start address.
//   3. Create a temporary thread team with threads1 threads.
//   4. Call Example17OneArrangeWts1 with params1->outWeights,
//      params1->outBiases and the aligned buffer, then destroy the team.
//   5. Allocate the Example17Net record and store both buffer pointers.
//
// Returns 0 on success. On failure returns the message produced by
// Example17Errmsg1 or Example17ThreaderCreate1, frees whatever was
// allocated here, and leaves *net1 untouched.
char* Example17NetCreate(
    Example17Net** net1,
    Example17Params* params1,
    ptrdiff_t threads1
) {
    if (__builtin_expect(!__builtin_cpu_supports("avx512f"), 0)) {
        return Example17Errmsg1(__LINE__, "CPU does not support AVX512F");
    }
    char* buf = malloc(37711711);
    if (__builtin_expect(buf == 0, 0)) {
        return Example17Errmsg1(__LINE__, "errno %d", errno);
    }
    // Round up to a 64-byte boundary.
    char* bufAligned = (void*)(((size_t)buf+63)&-64);
    Example17ThreaderTeam1* workers = 0;
    char* teamErr = Example17ThreaderCreate1(&workers, threads1);
    if (__builtin_expect(teamErr != 0, 0)) {
        free(buf);
        return teamErr;
    }
    char* wtTensors[3];
    wtTensors[0] = (char*)params1->outWeights;
    wtTensors[1] = (char*)params1->outBiases;
    wtTensors[2] = bufAligned+0;
    Example17OneArrangeWts1(workers, wtTensors);
    // The team is only needed for the arrangement pass above.
    Example17ThreaderDestroy1(workers);
    Example17Net* created = malloc(sizeof(Example17Net));
    if (__builtin_expect(created == 0, 0)) {
        // Build the message before free() so errno still reflects malloc.
        char* oomMsg = Example17Errmsg1(__LINE__, "errno %d", errno);
        free(buf);
        return oomMsg;
    }
    created->align1 = bufAligned;
    created->alloc1 = buf;
    *net1 = created;
    return 0;
}

// Inference engine state built by Example17EngineCreate and released by
// Example17EngineDestroy.
struct Example17Engine {
// Network pointer supplied to Example17EngineCreate. Example17EngineDestroy
// does not free it.
Example17Net* net3;
// Thread team created by Example17ThreaderCreate1 in Example17EngineCreate
// and destroyed in Example17EngineDestroy.
Example17ThreaderTeam1* team11;
// Pointer returned by malloc in Example17EngineCreate; freed in
// Example17EngineDestroy.
char* alloc2;
// alloc2 rounded up to the next 64-byte boundary. Example17EngineInference
// passes it to Example17OneArrangeDats1 and Example17OneApply1.
char* align2;
};

// Forwards to Example17ThreaderPthreadT1 for the engine's thread team,
// passing the output slot to1 and the index idx2, and returns whatever
// that call returns.
char* Example17EnginePthreadT(
    Example17Engine* eng2,
    ptrdiff_t idx2,
    pthread_t* to1
) {
    Example17ThreaderTeam1* workers = eng2->team11;
    char* result = Example17ThreaderPthreadT1(to1, workers, idx2);
    return result;
}

// Releases an engine obtained from Example17EngineCreate: destroys its
// thread team, frees the buffer held in alloc2, then frees the engine
// record. The network referenced by net3 is not touched.
//
// A null eng3 is accepted and ignored (same contract as free), so callers
// can run unconditional cleanup even when creation failed and their
// pointer was never assigned.
void Example17EngineDestroy(Example17Engine* eng3) {
    if (!eng3) {
        return;
    }
    Example17ThreaderDestroy1(eng3->team11);
    free(eng3->alloc2);
    free(eng3);
}

// Creates an inference engine bound to net4 and publishes it through *eng4.
//
// Allocates the engine record, allocates an 11401215-byte buffer and
// records its 64-byte-aligned start, then creates a thread team with
// threads2 threads. The network pointer is stored as given.
//
// Returns 0 on success. On failure returns the message produced by
// Example17Errmsg1 or Example17ThreaderCreate1, frees whatever was
// allocated here, and leaves *eng4 untouched.
char* Example17EngineCreate(
    Example17Engine** eng4,
    Example17Net* net4,
    ptrdiff_t threads2
) {
    Example17Engine* created = malloc(sizeof(Example17Engine));
    if (__builtin_expect(created == 0, 0)) {
        return Example17Errmsg1(__LINE__, "errno %d", errno);
    }
    char* buf = malloc(11401215);
    if (__builtin_expect(buf == 0, 0)) {
        // Build the message before free() so errno still reflects malloc.
        char* oomMsg = Example17Errmsg1(__LINE__, "errno %d", errno);
        free(created);
        return oomMsg;
    }
    // Round up to a 64-byte boundary.
    created->align2 = (void*)(((size_t)buf+63)&-64);
    created->alloc2 = buf;
    char* teamErr = Example17ThreaderCreate1(&created->team11, threads2);
    if (__builtin_expect(teamErr != 0, 0)) {
        free(buf);
        free(created);
        return teamErr;
    }
    created->net3 = net4;
    *eng4 = created;
    return 0;
}

// Runs one inference on the engine's thread team.
//
// First Example17OneArrangeDats1 is called with inData and the engine's
// aligned buffer; then Example17OneApply1 is called with the network's
// aligned buffer, the engine's aligned buffer, and outData.
void Example17EngineInference(
    Example17Engine* eng1,
    float* inData,
    float* outData
) {
    char* netBuf = eng1->net3->align1;
    Example17ThreaderTeam1* workers = eng1->team11;
    char* engBuf = eng1->align2;

    char* arrangeArgs[2];
    arrangeArgs[0] = (char*)inData;
    arrangeArgs[1] = engBuf+0;
    Example17OneArrangeDats1(workers, arrangeArgs);

    char* applyArgs[3];
    applyArgs[0] = netBuf+0;
    applyArgs[1] = engBuf+0;
    applyArgs[2] = (char*)outData;
    Example17OneApply1(workers, applyArgs);
}

// End of file.

Top