NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=Example27 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=in1 Channels=1709 Height=21 Width=19
Input ToTensor=in2 Channels=1709 Height=21 Width=19
Input ToTensor=in3 Channels=29 Height=21 Width=19
BatchNorm FromTensor=in1 ToTensor=bn1 Epsilon=0.00001
Activation FromTensor=bn1 ToTensor=act1 Kind=ReLU Param=0
Add FromTensor1=act1 FromTensor2=in2 ToTensor=add1
BatchNorm FromTensor=add1 ToTensor=bn2 Epsilon=0.00001
Conv FromTensor=bn2 ToTensor=conv ToChannels=29 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=conv ToTensor=bn3 Epsilon=0.00001
Activation FromTensor=bn3 ToTensor=act2 Kind=ReLU Param=0.3125
Add FromTensor1=act2 FromTensor2=in3 ToTensor=add2
BatchNorm FromTensor=add2 ToTensor=bn4 Epsilon=0.00001
Output FromTensor=bn4

Top || Output Example27.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(Example27Params);
// Example27Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "r");
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

// Forward declaration only; the struct body (one float array per
// parameter tensor) is defined near the end of this header.
typedef struct Example27Params Example27Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// Example27Params* params = malloc(sizeof(Example27Params));
//
// ... Load params (read from a file, perhaps) ...
//
// Example27Net* net; // For example, 4 threads:
// char* err = Example27NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// Example27NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

// Incomplete type: this header never defines the struct body, so
// callers only ever hold an Example27Net through a pointer.
typedef struct Example27Net Example27Net;

// Creates a Net from the given Params, using the given number of
// temporary threads, and stores it through the first argument.
// Returns 0 on success. On failure returns an error string that the
// caller must free(), and the Net pointer is left unmodified.
char* Example27NetCreate(
Example27Net**,
Example27Params*,
ptrdiff_t threads
);

// Frees the Net. Call it only after all inference that depends on the
// Net has completed.
void Example27NetDestroy(Example27Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// Example27Net* net;
//
// ... Create net ...
//
// Example27Engine* engine; // For example, 4 inference threads:
// char* err = Example27EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// Example27EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = Example27EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* bn4Data = malloc(sizeof(float)*29*21*19);
// float* in1Data = malloc(sizeof(float)*1709*21*19);
// float* in2Data = malloc(sizeof(float)*1709*21*19);
// float* in3Data = malloc(sizeof(float)*29*21*19);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// Example27EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// bn4Data, // The tensor arguments are sorted by name.
// in1Data,
// in2Data,
// in3Data
// );
//
// ... Read the output floats ...
//
// }
//
// free(bn4Data);
// free(in1Data);
// free(in2Data);
// free(in3Data);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

// Incomplete type: this header never defines the struct body, so
// callers only ever hold an Example27Engine through a pointer.
typedef struct Example27Engine Example27Engine;

// Creates an Engine that performs inference on the given Net with the
// given number of inference threads, and stores it through the first
// argument. Returns 0 on success. On failure returns an error string
// that the caller must free(), and the Engine pointer is left
// unmodified.
char* Example27EngineCreate(
Example27Engine**,
Example27Net*,
ptrdiff_t threads
);

// Writes to *to the pthread_t of the Engine's inference thread with
// index threadIdx (threads are indexed 0, 1, ..., N-1). Returns 0 on
// success, or an error string (to be freed by the caller) when the
// thread index is invalid.
char* Example27EnginePthreadT(
Example27Engine*,
ptrdiff_t threadIdx,
pthread_t* to
);

// Runs one inference pass. This function cannot fail. The tensor
// arguments are sorted by name: bn4Data is the output (29x21x19
// floats); in1Data and in2Data (1709x21x19 floats each) and in3Data
// (29x21x19 floats) are the inputs. All tensors are caller-owned,
// fully packed CHW float arrays.
void Example27EngineInference(
Example27Engine*,
float* bn4Data,
float* in1Data,
float* in2Data,
float* in3Data
);

// Terminates the Engine's inference threads and frees its memory. The
// Net that the Engine used is not destroyed by this call; destroy it
// separately.
void Example27EngineDestroy(Example27Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

// Trained parameters of the network, one float array per tensor.
// bn1/bn2 normalize 1709-channel tensors and bn3/bn4 normalize
// 29-channel tensors, so each of their arrays holds one value per
// channel. convWeights holds 29*1709 = 49561 weights of the 1x1
// convolution and convBiases one bias per output channel.
// The packed attribute asks the compiler to insert no padding, so the
// struct matches a flat file of floats concatenated in field order.
struct Example27Params {
float bn1Means[1709]; // 1x1709x1x1
float bn1Scales[1709]; // 1x1709x1x1
float bn1Shifts[1709]; // 1x1709x1x1
float bn1Variances[1709]; // 1x1709x1x1
float bn2Means[1709]; // 1x1709x1x1
float bn2Scales[1709]; // 1x1709x1x1
float bn2Shifts[1709]; // 1x1709x1x1
float bn2Variances[1709]; // 1x1709x1x1
float bn3Means[29]; // 1x29x1x1
float bn3Scales[29]; // 1x29x1x1
float bn3Shifts[29]; // 1x29x1x1
float bn3Variances[29]; // 1x29x1x1
float bn4Means[29]; // 1x29x1x1
float bn4Scales[29]; // 1x29x1x1
float bn4Shifts[29]; // 1x29x1x1
float bn4Variances[29]; // 1x29x1x1
float convBiases[29]; // 1x29x1x1
float convWeights[49561]; // 29x1709x1x1
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output Example27.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f Example27.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "Example27.h"

// Builds a heap-allocated diagnostic string of the form
//   "Example27: line <lineNum1>: <format1 expanded with the varargs>"
// The buffer is 277 bytes; the formatted tail is truncated by vsnprintf
// if it does not fit. The caller owns the result and must free() it.
static char* Example27Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
	enum { capacity = 277 };
	char* const text = malloc(capacity);
	// The fixed prefix comes first; sprintf reports how many chars it wrote.
	const int prefixLen = sprintf(text, "Example27: line %td: ", lineNum1);
	char* const tail = text+prefixLen;
	va_list args;
	va_start(args, format1);
	vsnprintf(tail, capacity-prefixLen, format1, args);
	va_end(args);
	return text;
}

// Forward declarations for the thread-team ("threader") types that are
// defined below, so the structs can refer to each other by pointer.
typedef struct Example27ThreaderTask1 Example27ThreaderTask1;
// Work function: called with the task and the coordinates of one point.
typedef void (*Example27ThreaderCallee1)(Example27ThreaderTask1*, int64_t*);
typedef struct Example27ThreaderHub1 Example27ThreaderHub1;
typedef struct Example27ThreaderNode1 Example27ThreaderNode1;
typedef struct Example27ThreaderUnwind1 Example27ThreaderUnwind1;
typedef struct Example27ThreaderTeam1 Example27ThreaderTeam1;

// Describes one parallel job: a callee to run once for every point of
// a grid with up to 4 dimensions.
struct Example27ThreaderTask1 {
// Function invoked for each point, with this task and the point.
Example27ThreaderCallee1 callee1;
// Opaque pointer for the callee; the threader itself never reads it.
void* any1;
// Number of entries of hull1 that are in use.
ptrdiff_t nd1;
// Extent of each dimension; the product of the first nd1 entries is
// the total number of points.
int64_t hull1[4];
};

// State shared by all threads of a team while a task is running.
struct Example27ThreaderHub1 {
// Held while the fields below are read or written.
pthread_mutex_t mut1;
// Signaled by the thread that brings pending1 down to zero.
pthread_cond_t cond1;
// Number of threads that have not yet finished the current task.
ptrdiff_t pending1;
// Shared search cursor into status1: a word index (offset1) and a mask
// of the bits of that word that are still candidates (mask1).
ptrdiff_t offset1;
long mask1;
// Bitmap with one bit per node. All bits are set when a task is
// dispatched; a node's bit is cleared once its points are used up.
long status1[];
};

// Per-thread state. The struct is aligned to 64 bytes.
struct Example27ThreaderNode1 {
// Held while np1, pt1 and task1 are read or written by running threads.
pthread_mutex_t mut2;
// Number of points this node still has to hand out. A negative value
// (set during teardown) tells the thread to exit.
int64_t np1;
// Coordinates of the next point to hand out.
int64_t pt1[4];
// Task waiting to be picked up by the thread; 0 when there is none.
Example27ThreaderTask1* task1;
// Signaled after task1 has been set.
pthread_cond_t cond2;
// The team this node belongs to.
Example27ThreaderTeam1* team1;
// Identifier filled in by pthread_create.
pthread_t thr1;
} __attribute__((aligned(64)));

// Records how far team construction got, so that teardown releases
// exactly what was successfully acquired.
struct Example27ThreaderUnwind1 {
// Number of threads that were created and must be joined.
ptrdiff_t join1;
// Number of node condition variables that were initialized.
ptrdiff_t nodeConds1;
// Number of node mutexes that were initialized.
ptrdiff_t nodeMuts1;
// Nonzero once the hub's condition variable has been initialized.
ptrdiff_t hubCond1;
// Nonzero once the hub's mutex has been initialized.
ptrdiff_t hubMut1;
// Pointer returned by malloc for the node array (before alignment).
void* nodes1;
// Pointer returned by malloc for the hub (before alignment).
void* hub1;
};

// A team of worker threads plus the state they share.
struct Example27ThreaderTeam1 {
// Number of threads (and nodes) in the team.
ptrdiff_t nt1;
// 64-byte aligned pointer into the allocation kept in unwind1.hub1.
Example27ThreaderHub1* hub2;
// 64-byte aligned pointer into the allocation kept in unwind1.nodes1.
Example27ThreaderNode1* nodes2;
// Bookkeeping used when the team is destroyed.
Example27ThreaderUnwind1 unwind1;
};

// Advances the point pt2 by one step inside the grid whose extents are
// given by hull2. Dimension 0 is the fastest: a coordinate that would
// reach its extent is reset to 0 and the next dimension is bumped
// instead. If every dimension wraps, the point ends up at all zeros.
static void Example27ThreaderInc1(
	ptrdiff_t nd2,
	int64_t*restrict hull2,
	int64_t*restrict pt2
) {
	ptrdiff_t dim = 0;
	while (dim < nd2) {
		const int64_t next = pt2[dim]+1;
		if (next != hull2[dim]) {
			// No wrap in this dimension: store and stop.
			pt2[dim] = next;
			return;
		}
		// Wrapped: reset and carry into the next dimension.
		pt2[dim++] = 0;
	}
}

// Converts the linear offset val1 into grid coordinates and writes them
// to pt3. Dimension 0 is the fastest: each coordinate is the remainder
// of dividing by that dimension's extent in hull3, and the quotient
// carries on to the next dimension. Once nothing is left to distribute
// the remaining coordinates are set to 0 (and hull3 is no longer read).
static void Example27ThreaderPut1(
	ptrdiff_t nd3,
	int64_t*restrict hull3,
	int64_t*restrict pt3,
	int64_t val1
) {
	for (ptrdiff_t dim = 0; dim < nd3; ++dim) {
		if (!val1) {
			pt3[dim] = 0;
			continue;
		}
		const int64_t extent = hull3[dim];
		const int64_t quotient = val1/extent;
		pt3[dim] = val1-quotient*extent;
		val1 = quotient;
	}
}

// Adds the coordinate vector plus1 (and an initial carry2) to the point
// pt4, in place, with dimension 0 as the fastest. Whenever a sum reaches
// the extent of its dimension, the extent is subtracted once and a carry
// of 1 is passed to the next dimension. A carry out of the last
// dimension is dropped.
static void Example27ThreaderAdd1(
	ptrdiff_t nd4,
	int64_t*restrict hull4,
	int64_t*restrict pt4,
	int64_t*restrict plus1,
	int64_t carry2
) {
	ptrdiff_t dim = 0;
	while (dim < nd4) {
		const int64_t extent = hull4[dim];
		int64_t total = pt4[dim]+plus1[dim]+carry2;
		carry2 = total >= extent;
		if (carry2) total -= extent;
		pt4[dim++] = total;
	}
}

// Start routine of every worker thread; arg1 is the thread's own node.
// The thread sleeps until a task is placed in its node, runs the task's
// callee on every point assigned to that node, then helps other nodes
// that still have points left, and finally reports completion to the
// hub. It returns 0 when it finds a negative point count in its node.
// Each "for (; call; );" below repeats the pthread call until it
// returns 0.
static void* Example27ThreaderMain1(void* arg1) {
Example27ThreaderNode1* node1 = arg1;
Example27ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
Example27ThreaderHub1* hub3 = team2->hub2;
Example27ThreaderNode1* nodes3 = team2->nodes2;
// Index of this thread's node within the team's node array.
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
for (; ; ) {
// The node mutex is held at the top of every iteration.
Example27ThreaderTask1* task2 = node1->task1;
if (!task2) {
// Nothing to do yet: sleep until signaled, then look again.
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
// Negative count is the exit request; task2 is not dereferenced.
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
// Mark the task as picked up.
node1->task1 = 0;
Example27ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Drain this node's own points. For each one: take a private copy of
// the point, decrement the count and advance the node's point while
// holding the lock, then run the callee with the lock released.
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
Example27ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// Own node is exhausted: clear its bit in the hub's status bitmap.
// NOTE(review): when role1%(bits in long) selects the top bit, this
// left-shifts a signed long into its sign bit - confirm the target
// compiler defines that.
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
// Resume the search for a node with work from the shared cursor.
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
for (; ; ) {
// Candidate bits of the current bitmap word.
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
// No candidates here: move on to the next word, all bits eligible.
++offset2;
mask2 = -1;
continue;
}
// Node index of the lowest candidate bit.
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
// Bit nt2 lies just past the last node; reaching it means the
// scan hit the end. Restart once from word 0, and stop helping
// if the restarted scan reaches the end as well.
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
// Keep only the lowest set bit, then publish a cursor that excludes it
// so that the next searching thread starts at a later node.
// NOTE(review): if that bit is the top bit of long, -hand1 negates
// the most negative long - confirm this is defined on the target.
hand1 &= -hand1;
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
Example27ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
// Drain the other node's remaining points in the same way as above.
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
Example27ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// That node is exhausted too: clear its bit and reload the cursor,
// which other threads may have moved in the meantime.
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// This thread is done with the task. The thread that brings the
// pending count to zero signals the hub's condition variable.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
// Re-acquire the own node's lock before waiting for the next task.
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Tears down a team, releasing exactly what the unwind record says was
// acquired: asks the created threads to exit and joins them, destroys
// the initialized node condition variables and mutexes, destroys the
// hub's condition variable and mutex if they were initialized, and
// frees the node, hub and team allocations. A null team is ignored.
// Every pthread call is repeated until it returns 0.
static void Example27ThreaderDestroy1(Example27ThreaderTeam1* team3) {
	if (!team3) return;
	Example27ThreaderNode1* const base = team3->nodes2;

	// Exit request: a negative point count together with a non-null
	// task pointer, which wakes the thread and makes it return.
	const ptrdiff_t started = team3->unwind1.join1;
	for (ptrdiff_t i = 0; i != started; ++i) {
		Example27ThreaderNode1* const node = base+i;
		while (__builtin_expect(pthread_mutex_lock(&node->mut2), 0));
		node->np1 = -1;
		node->task1 = (Example27ThreaderTask1*)1;
		while (__builtin_expect(pthread_mutex_unlock(&node->mut2), 0));
		while (__builtin_expect(pthread_cond_signal(&node->cond2), 0));
	}
	for (ptrdiff_t i = 0; i != started; ++i) {
		while (__builtin_expect(pthread_join(base[i].thr1, 0), 0));
	}

	const ptrdiff_t conds = team3->unwind1.nodeConds1;
	for (ptrdiff_t i = 0; i != conds; ++i) {
		while (__builtin_expect(pthread_cond_destroy(&base[i].cond2), 0));
	}
	const ptrdiff_t muts = team3->unwind1.nodeMuts1;
	for (ptrdiff_t i = 0; i != muts; ++i) {
		while (__builtin_expect(pthread_mutex_destroy(&base[i].mut2), 0));
	}

	Example27ThreaderHub1* const hub = team3->hub2;
	if (team3->unwind1.hubCond1) {
		while (__builtin_expect(pthread_cond_destroy(&hub->cond1), 0));
	}
	if (team3->unwind1.hubMut1) {
		while (__builtin_expect(pthread_mutex_destroy(&hub->mut1), 0));
	}

	// Free the raw (unaligned) allocations, then the team itself.
	free(team3->unwind1.nodes1);
	free(team3->unwind1.hub1);
	free(team3);
}

// Last construction step: for each of the nt7 nodes, initializes its
// mutex and condition variable and starts its thread. On the first
// failure the unwind counters are set to what has been initialized so
// far and an error string is returned; on success all three counters
// equal nt7 and 0 is returned.
static char* Example27ThreaderCreate1Up4(Example27ThreaderTeam1* team8, ptrdiff_t nt7) {
	Example27ThreaderNode1* const base = team8->nodes2;
	Example27ThreaderUnwind1* const undo = &team8->unwind1;
	for (ptrdiff_t idx = 0; idx != nt7; ++idx) {
		Example27ThreaderNode1* const node = base+idx;
		const int mutErr = pthread_mutex_init(&node->mut2, 0);
		if (__builtin_expect(mutErr, 0)) {
			// Nothing of this node was initialized.
			undo->nodeMuts1 = idx;
			undo->nodeConds1 = idx;
			undo->join1 = idx;
			return Example27Errmsg1(__LINE__, "errno %d", mutErr);
		}
		// The new thread must find "no task" when it first looks.
		node->task1 = 0;
		const int condErr = pthread_cond_init(&node->cond2, 0);
		if (__builtin_expect(condErr, 0)) {
			// This node's mutex exists, its condition variable does not.
			undo->nodeMuts1 = idx+1;
			undo->nodeConds1 = idx;
			undo->join1 = idx;
			return Example27Errmsg1(__LINE__, "errno %d", condErr);
		}
		node->team1 = team8;
		const int thrErr = pthread_create(&node->thr1, 0, Example27ThreaderMain1, node);
		if (__builtin_expect(thrErr, 0)) {
			// Mutex and condition variable exist, but there is no thread to join.
			undo->nodeMuts1 = idx+1;
			undo->nodeConds1 = idx+1;
			undo->join1 = idx;
			return Example27Errmsg1(__LINE__, "errno %d", thrErr);
		}
	}
	undo->nodeMuts1 = nt7;
	undo->nodeConds1 = nt7;
	undo->join1 = nt7;
	return 0;
}

// Third construction step: initializes the hub's mutex and condition
// variable, recording each success in the unwind record, and then
// continues with the per-node setup. Returns 0 on success or an error
// string on failure.
static char* Example27ThreaderCreate1Up3(Example27ThreaderTeam1* team7, ptrdiff_t nt6) {
	Example27ThreaderHub1* const hub = team7->hub2;
	Example27ThreaderUnwind1* const undo = &team7->unwind1;

	const int mutErr = pthread_mutex_init(&hub->mut1, 0);
	if (__builtin_expect(mutErr, 0)) {
		return Example27Errmsg1(__LINE__, "errno %d", mutErr);
	}
	undo->hubMut1 = 1;

	const int condErr = pthread_cond_init(&hub->cond1, 0);
	if (__builtin_expect(condErr, 0)) {
		return Example27Errmsg1(__LINE__, "errno %d", condErr);
	}
	undo->hubCond1 = 1;

	return Example27ThreaderCreate1Up4(team7, nt6);
}

// Second construction step: allocates the array of nt5 nodes with 63
// spare bytes so that the array start can be rounded up to a 64-byte
// boundary. The raw pointer is kept in the unwind record for free();
// the aligned pointer becomes the team's node array. Returns 0 on
// success or an error string on failure.
static char* Example27ThreaderCreate1Up2(Example27ThreaderTeam1* team6, ptrdiff_t nt5) {
	const size_t nodeSize = sizeof(Example27ThreaderNode1);
	const size_t bytes = nt5*nodeSize;
	// Dividing back detects a wrapped multiplication.
	if (__builtin_expect(bytes/nodeSize != (size_t)nt5, 0)) {
		return Example27Errmsg1(__LINE__, "too many threads");
	}
	void* const raw = malloc(bytes+63);
	if (__builtin_expect(!raw, 0)) {
		return Example27Errmsg1(__LINE__, "errno %d", errno);
	}
	team6->unwind1.nodes1 = raw;
	team6->nodes2 = (void*)(((size_t)raw+63)&~(size_t)63);
	return Example27ThreaderCreate1Up3(team6, nt5);
}

// First construction step: records the thread count and allocates the
// hub together with its trailing status bitmap (one long for every
// sizeof(long)*8 threads, plus one). The size is rounded up to a
// multiple of 64 and 63 spare bytes are added so the hub can start on a
// 64-byte boundary. The raw pointer is kept in the unwind record for
// free(). Returns 0 on success or an error string on failure.
static char* Example27ThreaderCreate1Up1(Example27ThreaderTeam1* team5, ptrdiff_t nt4) {
	team5->nt1 = nt4;
	const size_t bitsPerWord = sizeof(long)*8;
	const size_t words = (size_t)nt4/bitsPerWord+1;
	const size_t bytes = sizeof(Example27ThreaderHub1)+sizeof(long)*words;
	const size_t rounded = (bytes+63)&~(size_t)63;
	void* const raw = malloc(rounded+63);
	if (__builtin_expect(!raw, 0)) {
		return Example27Errmsg1(__LINE__, "errno %d", errno);
	}
	team5->unwind1.hub1 = raw;
	team5->hub2 = (void*)(((size_t)raw+63)&~(size_t)63);
	return Example27ThreaderCreate1Up2(team5, nt4);
}

// Creates a team of nt3 worker threads. On success stores the team in
// *team4 and returns 0. On failure everything that was partially built
// is destroyed, *team4 is left untouched, and an error string (to be
// freed by the caller) is returned. nt3 must be at least 1.
static char* Example27ThreaderCreate1(Example27ThreaderTeam1** team4, ptrdiff_t nt3) {
	if (__builtin_expect(nt3 < 1, 0)) {
		return Example27Errmsg1(__LINE__, "too few threads");
	}
	// Zero-filled so that the unwind record starts out empty.
	Example27ThreaderTeam1* const team = calloc(1, sizeof(Example27ThreaderTeam1));
	if (__builtin_expect(!team, 0)) {
		return Example27Errmsg1(__LINE__, "errno %d", errno);
	}
	char* const failure = Example27ThreaderCreate1Up1(team, nt3);
	if (__builtin_expect(!failure, 1)) {
		*team4 = team;
		return failure;
	}
	Example27ThreaderDestroy1(team);
	return failure;
}

// Copies the pthread_t of the team's thread number idx1 into *thr2.
// Returns 0 on success, or an error string (to be freed by the caller)
// when idx1 is not in the range 0 .. nt1-1; *thr2 is then untouched.
static char* Example27ThreaderPthreadT1(
	pthread_t* thr2,
	Example27ThreaderTeam1* team9,
	ptrdiff_t idx1
) {
	const int inRange = idx1 >= 0 && idx1 < team9->nt1;
	if (__builtin_expect(!inRange, 0)) {
		return Example27Errmsg1(__LINE__, "bad thread idx");
	}
	const Example27ThreaderNode1* const node = team9->nodes2+idx1;
	*thr2 = node->thr1;
	return 0;
}

// Runs task3 on team10 and returns once every thread of the team has
// finished it. The points of the task's grid are split into nt8
// contiguous runs, one per node; the threads are woken up, and the
// caller then waits on the hub until the pending count reaches zero.
// A task with fewer than one dimension is ignored.
// Each "for (; call; );" below repeats the pthread call until it
// returns 0.
static void Example27ThreaderDo1(Example27ThreaderTeam1* team10, Example27ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return;
// Total number of points: the product of the extents in use.
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
// Every node gets each1 points; the first more1 nodes get one extra.
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
// each1 expressed as grid coordinates, used as the stride between
// consecutive nodes' starting points.
int64_t plus2[4];
Example27ThreaderPut1(nd6, task3->hull1, plus2, each1);
// Starting point of the current node; the first node starts at zero.
int64_t pt6[4] = {0};
Example27ThreaderHub1* hub6 = team10->hub2;
// The hub mutex is held from here until the wait below, so the hub
// fields are set up before any worker can lock the hub.
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
Example27ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
// 1 for the nodes that receive an extra point, else 0.
int64_t carry3 = i4 < more1;
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
// Wake the node's thread now that its task is in place.
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
// Advance the starting point by this node's share (each1 + carry3).
Example27ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Reset the shared search cursor and set every bit of the status
// bitmap, from the word that contains bit nt8 down to word 0.
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
// Wait until all nt8 threads have reported completion.
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

// Vectorized approximation of exp(x) for 16 floats.
// The input is clamped to [-87.33654, 88.72284], multiplied by
// 1.442695 (about log2(e)) and rounded to the nearest integer n. The
// remainder f = x - n*ln(2) is formed with two fused multiply-adds
// (ln(2) is split into a high and a low constant). A degree-4
// polynomial in f is evaluated by Horner's rule, and n is finally added
// to the exponent field of the polynomial's float bit pattern (shift
// left by 23), which multiplies the result by 2^n.
static __m512 Example27Exp1(__m512 x1) {
	const __m512 clamped = _mm512_min_ps(
		_mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f)),
		_mm512_set1_ps(8.872284e+01f)
	);
	const __m512 scaled = _mm512_mul_ps(clamped, _mm512_set1_ps(1.442695e+00f));
	const __m512 whole = _mm512_roundscale_ps(scaled, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
	__m512 frac = _mm512_fmadd_ps(whole, _mm512_set1_ps(-6.9314575e-01f), clamped);
	frac = _mm512_fmadd_ps(whole, _mm512_set1_ps(-1.4286068e-06f), frac);
	// Horner evaluation; the leading coefficient seeds the accumulator.
	static const float lowerCoeffs[4] = {
		1.6800667e-01f,
		4.9999994e-01f,
		9.999569e-01f,
		9.9999964e-01f
	};
	__m512 poly = _mm512_set1_ps(4.194439e-02f);
	for (int k = 0; k < 4; ++k) {
		poly = _mm512_fmadd_ps(poly, frac, _mm512_set1_ps(lowerCoeffs[k]));
	}
	const __m512i exponent = _mm512_slli_epi32(_mm512_cvtps_epi32(scaled), 23);
	const __m512i bits = _mm512_add_epi32(exponent, _mm512_castps_si512(poly));
	return _mm512_castsi512_ps(bits);
}

// Reciprocal square root of 16 floats: starts from the hardware
// estimate y = rsqrt14(x) and refines it with one step of the form
// (0.5*y) * (3 - y*(x*y)).
static __m512 Example27Rsqrt1(__m512 x2) {
	const __m512 estimate = _mm512_rsqrt14_ps(x2);
	const __m512 halfEstimate = _mm512_mul_ps(estimate, _mm512_set1_ps(5e-01f));
	const __m512 xTimesEstimate = _mm512_mul_ps(x2, estimate);
	const __m512 correction = _mm512_fnmadd_ps(estimate, xTimesEstimate, _mm512_set1_ps(3e+00f));
	return _mm512_mul_ps(halfEstimate, correction);
}

static void Example27BnSimplify1(
float*restrict means1,
float*restrict variances1,
float*restrict scales1,
float*restrict shifts1,
char*restrict mas1
) {
__m512 eps1 = _mm512_set1_ps(1e-05f);
__m512i xlo1 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi1 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i5 = 0; i5 < 21; ++i5) {
__m512 va1 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*0+(ptrdiff_t)80*i5);
__m512 va2 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*1+(ptrdiff_t)80*i5);
__m512 va3 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*2+(ptrdiff_t)80*i5);
__m512 va4 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*3+(ptrdiff_t)80*i5);
__m512 va5 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*4+(ptrdiff_t)80*i5);
__m512 rcp1 = Example27Rsqrt1(_mm512_add_ps(eps1, va1));
__m512 rcp2 = Example27Rsqrt1(_mm512_add_ps(eps1, va2));
__m512 rcp3 = Example27Rsqrt1(_mm512_add_ps(eps1, va3));
__m512 rcp4 = Example27Rsqrt1(_mm512_add_ps(eps1, va4));
__m512 rcp5 = Example27Rsqrt1(_mm512_add_ps(eps1, va5));
__m512 sc1 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*0+(ptrdiff_t)80*i5);
__m512 sc2 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*1+(ptrdiff_t)80*i5);
__m512 sc3 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*2+(ptrdiff_t)80*i5);
__m512 sc4 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*3+(ptrdiff_t)80*i5);
__m512 sc5 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*4+(ptrdiff_t)80*i5);
__m512 mul1 = _mm512_mul_ps(rcp1, sc1);
__m512 mul2 = _mm512_mul_ps(rcp2, sc2);
__m512 mul3 = _mm512_mul_ps(rcp3, sc3);
__m512 mul4 = _mm512_mul_ps(rcp4, sc4);
__m512 mul5 = _mm512_mul_ps(rcp5, sc5);
__m512 me1 = _mm512_loadu_ps(means1+(ptrdiff_t)16*0+(ptrdiff_t)80*i5);
__m512 me2 = _mm512_loadu_ps(means1+(ptrdiff_t)16*1+(ptrdiff_t)80*i5);
__m512 me3 = _mm512_loadu_ps(means1+(ptrdiff_t)16*2+(ptrdiff_t)80*i5);
__m512 me4 = _mm512_loadu_ps(means1+(ptrdiff_t)16*3+(ptrdiff_t)80*i5);
__m512 me5 = _mm512_loadu_ps(means1+(ptrdiff_t)16*4+(ptrdiff_t)80*i5);
__m512 sh1 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*0+(ptrdiff_t)80*i5);
__m512 sh2 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*1+(ptrdiff_t)80*i5);
__m512 sh3 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*2+(ptrdiff_t)80*i5);
__m512 sh4 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*3+(ptrdiff_t)80*i5);
__m512 sh5 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*4+(ptrdiff_t)80*i5);
__m512 add1 = _mm512_fnmadd_ps(me1, mul1, sh1);
__m512 add2 = _mm512_fnmadd_ps(me2, mul2, sh2);
__m512 add3 = _mm512_fnmadd_ps(me3, mul3, sh3);
__m512 add4 = _mm512_fnmadd_ps(me4, mul4, sh4);
__m512 add5 = _mm512_fnmadd_ps(me5, mul5, sh5);
__m512 lo1 = _mm512_permutex2var_ps(mul1, xlo1, add1);
__m512 lo2 = _mm512_permutex2var_ps(mul2, xlo1, add2);
__m512 lo3 = _mm512_permutex2var_ps(mul3, xlo1, add3);
__m512 lo4 = _mm512_permutex2var_ps(mul4, xlo1, add4);
__m512 lo5 = _mm512_permutex2var_ps(mul5, xlo1, add5);
__m512 hi1 = _mm512_permutex2var_ps(mul1, xhi1, add1);
__m512 hi2 = _mm512_permutex2var_ps(mul2, xhi1, add2);
__m512 hi3 = _mm512_permutex2var_ps(mul3, xhi1, add3);
__m512 hi4 = _mm512_permutex2var_ps(mul4, xhi1, add4);
__m512 hi5 = _mm512_permutex2var_ps(mul5, xhi1, add5);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*0+(ptrdiff_t)640*i5, lo1);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*1+(ptrdiff_t)640*i5, hi1);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*2+(ptrdiff_t)640*i5, lo2);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*3+(ptrdiff_t)640*i5, hi2);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*4+(ptrdiff_t)640*i5, lo3);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*5+(ptrdiff_t)640*i5, hi3);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*6+(ptrdiff_t)640*i5, lo4);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*7+(ptrdiff_t)640*i5, hi4);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*8+(ptrdiff_t)640*i5, lo5);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*9+(ptrdiff_t)640*i5, hi5);
}
__m512 va6 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*0+(ptrdiff_t)80*21);
__m512 va7 = _mm512_maskz_loadu_ps(8191, variances1+(ptrdiff_t)16*1+(ptrdiff_t)80*21);
__m512 rcp6 = Example27Rsqrt1(_mm512_add_ps(eps1, va6));
__m512 rcp7 = Example27Rsqrt1(_mm512_add_ps(eps1, va7));
__m512 sc6 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*0+(ptrdiff_t)80*21);
__m512 sc7 = _mm512_maskz_loadu_ps(8191, scales1+(ptrdiff_t)16*1+(ptrdiff_t)80*21);
__m512 mul6 = _mm512_mul_ps(rcp6, sc6);
__m512 mul7 = _mm512_mul_ps(rcp7, sc7);
__m512 me6 = _mm512_loadu_ps(means1+(ptrdiff_t)16*0+(ptrdiff_t)80*21);
__m512 me7 = _mm512_maskz_loadu_ps(8191, means1+(ptrdiff_t)16*1+(ptrdiff_t)80*21);
__m512 sh6 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*0+(ptrdiff_t)80*21);
__m512 sh7 = _mm512_maskz_loadu_ps(8191, shifts1+(ptrdiff_t)16*1+(ptrdiff_t)80*21);
__m512 add6 = _mm512_fnmadd_ps(me6, mul6, sh6);
__m512 add7 = _mm512_fnmadd_ps(me7, mul7, sh7);
__m512 lo6 = _mm512_permutex2var_ps(mul6, xlo1, add6);
__m512 lo7 = _mm512_permutex2var_ps(mul7, xlo1, add7);
__m512 hi6 = _mm512_permutex2var_ps(mul6, xhi1, add6);
__m512 hi7 = _mm512_permutex2var_ps(mul7, xhi1, add7);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*0+(ptrdiff_t)640*21, lo6);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*1+(ptrdiff_t)640*21, hi6);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*2+(ptrdiff_t)640*21, lo7);
_mm512_mask_storeu_ps(mas1+(ptrdiff_t)64*3+(ptrdiff_t)640*21, 1023, hi7);
}

static void Example27BnSimplify2(
float*restrict means2,
float*restrict variances2,
float*restrict scales2,
float*restrict shifts2,
char*restrict mas2
) {
__m512 eps2 = _mm512_set1_ps(1e-05f);
__m512i xlo2 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi2 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
__m512 va8 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*0);
__m512 va9 = _mm512_maskz_loadu_ps(8191, variances2+(ptrdiff_t)16*1);
__m512 rcp8 = Example27Rsqrt1(_mm512_add_ps(eps2, va8));
__m512 rcp9 = Example27Rsqrt1(_mm512_add_ps(eps2, va9));
__m512 sc8 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*0);
__m512 sc9 = _mm512_maskz_loadu_ps(8191, scales2+(ptrdiff_t)16*1);
__m512 mul8 = _mm512_mul_ps(rcp8, sc8);
__m512 mul9 = _mm512_mul_ps(rcp9, sc9);
__m512 me8 = _mm512_loadu_ps(means2+(ptrdiff_t)16*0);
__m512 me9 = _mm512_maskz_loadu_ps(8191, means2+(ptrdiff_t)16*1);
__m512 sh8 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*0);
__m512 sh9 = _mm512_maskz_loadu_ps(8191, shifts2+(ptrdiff_t)16*1);
__m512 add8 = _mm512_fnmadd_ps(me8, mul8, sh8);
__m512 add9 = _mm512_fnmadd_ps(me9, mul9, sh9);
__m512 lo8 = _mm512_permutex2var_ps(mul8, xlo2, add8);
__m512 lo9 = _mm512_permutex2var_ps(mul9, xlo2, add9);
__m512 hi8 = _mm512_permutex2var_ps(mul8, xhi2, add8);
__m512 hi9 = _mm512_permutex2var_ps(mul9, xhi2, add9);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*0, lo8);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*1, hi8);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*2, lo9);
_mm512_mask_storeu_ps(mas2+(ptrdiff_t)64*3, 1023, hi9);
}

static void Example27OneArrangeWts1Callee1(Example27ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
ptrdiff_t b2 = pt7[0];
ptrdiff_t e1 = pt7[2];
if (e1 < 1) {
char*restrict wtPtr1 = tensors2[0]+(ptrdiff_t)3340*0+(ptrdiff_t)198244*0;
char*restrict biasPtr1 = tensors2[1]+(ptrdiff_t)116*0;
char*restrict bnPtr1 = tensors2[2]+(ptrdiff_t)8*((ptrdiff_t)835*0+(ptrdiff_t)1709*0);
char*restrict bnPtr2 = tensors2[3]+(ptrdiff_t)8*29*0;
char*restrict arranged1 = tensors2[4]+(ptrdiff_t)96976*0+(ptrdiff_t)96976*0;
ptrdiff_t ii1 = 1;
for (ptrdiff_t i6 = 0; i6 < ii1; ++i6) {
ptrdiff_t j1 = 1*b2;
ptrdiff_t jj1 = j1+1;
for (; j1 < jj1; ++j1) {
if (j1 < 1) {
ptrdiff_t k2 = 0;
ptrdiff_t l2 = (size_t)(0+k2)/6;
ptrdiff_t cut2 = (size_t)(0+k2)%6;
__m512 sum3 = _mm512_maskz_loadu_ps(65535, biasPtr1+116*i6+4*k2);
__m512i pmMul1 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd1 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo1 = _mm512_loadu_ps(bnPtr2+(ptrdiff_t)8*(k2+29*i6));
__m512 masHi1 = _mm512_maskz_loadu_ps(65535, bnPtr2+(ptrdiff_t)8*(k2+29*i6)+(ptrdiff_t)64);
__m512 postMul2 = _mm512_permutex2var_ps(masLo1, pmMul1, masHi1);
__m512 postAdd2 = _mm512_permutex2var_ps(masLo1, pmAdd1, masHi1);
sum3 = _mm512_fmadd_ps(sum3, postMul2, postAdd2);
ptrdiff_t c2 = 0;
for (; c2 != 52; ++c2) {
__m512 wt30 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)0);
__m512 wt31 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)6836);
__m512 wt32 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)13672);
__m512 wt33 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)20508);
__m512 wt34 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)27344);
__m512 wt35 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)34180);
__m512 wt36 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)41016);
__m512 wt37 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)47852);
__m512 wt38 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)54688);
__m512 wt39 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)61524);
__m512 wt40 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)68360);
__m512 wt41 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)75196);
__m512 wt42 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)82032);
__m512 wt43 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)88868);
__m512 wt44 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)95704);
__m512 wt45 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)102540);
__m512 tmp1 = _mm512_unpacklo_ps(wt30, wt31);
__m512 tmp2 = _mm512_unpackhi_ps(wt30, wt31);
__m512 tmp3 = _mm512_unpacklo_ps(wt32, wt33);
__m512 tmp4 = _mm512_unpackhi_ps(wt32, wt33);
__m512 tmp5 = _mm512_unpacklo_ps(wt34, wt35);
__m512 tmp6 = _mm512_unpackhi_ps(wt34, wt35);
__m512 tmp7 = _mm512_unpacklo_ps(wt36, wt37);
__m512 tmp8 = _mm512_unpackhi_ps(wt36, wt37);
__m512 tmp9 = _mm512_unpacklo_ps(wt38, wt39);
__m512 tmp10 = _mm512_unpackhi_ps(wt38, wt39);
__m512 tmp11 = _mm512_unpacklo_ps(wt40, wt41);
__m512 tmp12 = _mm512_unpackhi_ps(wt40, wt41);
__m512 tmp13 = _mm512_unpacklo_ps(wt42, wt43);
__m512 tmp14 = _mm512_unpackhi_ps(wt42, wt43);
__m512 tmp15 = _mm512_unpacklo_ps(wt44, wt45);
__m512 tmp16 = _mm512_unpackhi_ps(wt44, wt45);
__m512 tmp17 = _mm512_shuffle_ps(tmp1, tmp3, 68);
__m512 tmp18 = _mm512_shuffle_ps(tmp1, tmp3, 238);
__m512 tmp19 = _mm512_shuffle_ps(tmp2, tmp4, 68);
__m512 tmp20 = _mm512_shuffle_ps(tmp2, tmp4, 238);
__m512 tmp21 = _mm512_shuffle_ps(tmp5, tmp7, 68);
__m512 tmp22 = _mm512_shuffle_ps(tmp5, tmp7, 238);
__m512 tmp23 = _mm512_shuffle_ps(tmp6, tmp8, 68);
__m512 tmp24 = _mm512_shuffle_ps(tmp6, tmp8, 238);
__m512 tmp25 = _mm512_shuffle_ps(tmp9, tmp11, 68);
__m512 tmp26 = _mm512_shuffle_ps(tmp9, tmp11, 238);
__m512 tmp27 = _mm512_shuffle_ps(tmp10, tmp12, 68);
__m512 tmp28 = _mm512_shuffle_ps(tmp10, tmp12, 238);
__m512 tmp29 = _mm512_shuffle_ps(tmp13, tmp15, 68);
__m512 tmp30 = _mm512_shuffle_ps(tmp13, tmp15, 238);
__m512 tmp31 = _mm512_shuffle_ps(tmp14, tmp16, 68);
__m512 tmp32 = _mm512_shuffle_ps(tmp14, tmp16, 238);
__m512 tmp33 = _mm512_shuffle_f32x4(tmp17, tmp21, 136);
__m512 tmp34 = _mm512_shuffle_f32x4(tmp17, tmp21, 221);
__m512 tmp35 = _mm512_shuffle_f32x4(tmp18, tmp22, 136);
__m512 tmp36 = _mm512_shuffle_f32x4(tmp18, tmp22, 221);
__m512 tmp37 = _mm512_shuffle_f32x4(tmp19, tmp23, 136);
__m512 tmp38 = _mm512_shuffle_f32x4(tmp19, tmp23, 221);
__m512 tmp39 = _mm512_shuffle_f32x4(tmp20, tmp24, 136);
__m512 tmp40 = _mm512_shuffle_f32x4(tmp20, tmp24, 221);
__m512 tmp41 = _mm512_shuffle_f32x4(tmp25, tmp29, 136);
__m512 tmp42 = _mm512_shuffle_f32x4(tmp25, tmp29, 221);
__m512 tmp43 = _mm512_shuffle_f32x4(tmp26, tmp30, 136);
__m512 tmp44 = _mm512_shuffle_f32x4(tmp26, tmp30, 221);
__m512 tmp45 = _mm512_shuffle_f32x4(tmp27, tmp31, 136);
__m512 tmp46 = _mm512_shuffle_f32x4(tmp27, tmp31, 221);
__m512 tmp47 = _mm512_shuffle_f32x4(tmp28, tmp32, 136);
__m512 tmp48 = _mm512_shuffle_f32x4(tmp28, tmp32, 221);
wt30 = _mm512_shuffle_f32x4(tmp33, tmp41, 136);
wt38 = _mm512_shuffle_f32x4(tmp33, tmp41, 221);
wt31 = _mm512_shuffle_f32x4(tmp35, tmp43, 136);
wt39 = _mm512_shuffle_f32x4(tmp35, tmp43, 221);
wt32 = _mm512_shuffle_f32x4(tmp37, tmp45, 136);
wt40 = _mm512_shuffle_f32x4(tmp37, tmp45, 221);
wt33 = _mm512_shuffle_f32x4(tmp39, tmp47, 136);
wt41 = _mm512_shuffle_f32x4(tmp39, tmp47, 221);
wt34 = _mm512_shuffle_f32x4(tmp34, tmp42, 136);
wt42 = _mm512_shuffle_f32x4(tmp34, tmp42, 221);
wt35 = _mm512_shuffle_f32x4(tmp36, tmp44, 136);
wt43 = _mm512_shuffle_f32x4(tmp36, tmp44, 221);
wt36 = _mm512_shuffle_f32x4(tmp38, tmp46, 136);
wt44 = _mm512_shuffle_f32x4(tmp38, tmp46, 221);
wt37 = _mm512_shuffle_f32x4(tmp40, tmp48, 136);
wt45 = _mm512_shuffle_f32x4(tmp40, tmp48, 221);
wt30 = _mm512_mul_ps(wt30, postMul2);
wt31 = _mm512_mul_ps(wt31, postMul2);
wt32 = _mm512_mul_ps(wt32, postMul2);
wt33 = _mm512_mul_ps(wt33, postMul2);
wt34 = _mm512_mul_ps(wt34, postMul2);
wt35 = _mm512_mul_ps(wt35, postMul2);
wt36 = _mm512_mul_ps(wt36, postMul2);
wt37 = _mm512_mul_ps(wt37, postMul2);
wt38 = _mm512_mul_ps(wt38, postMul2);
wt39 = _mm512_mul_ps(wt39, postMul2);
wt40 = _mm512_mul_ps(wt40, postMul2);
wt41 = _mm512_mul_ps(wt41, postMul2);
wt42 = _mm512_mul_ps(wt42, postMul2);
wt43 = _mm512_mul_ps(wt43, postMul2);
wt44 = _mm512_mul_ps(wt44, postMul2);
wt45 = _mm512_mul_ps(wt45, postMul2);
__m512 preMul20 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+16*c2+1709*i6))[0]);
__m512 preAdd20 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt30, preAdd20, sum3);
wt30 = _mm512_mul_ps(wt30, preMul20);
__m512 preMul21 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(1+16*c2+1709*i6))[0]);
__m512 preAdd21 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(1+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt31, preAdd21, sum3);
wt31 = _mm512_mul_ps(wt31, preMul21);
__m512 preMul22 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(2+16*c2+1709*i6))[0]);
__m512 preAdd22 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(2+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt32, preAdd22, sum3);
wt32 = _mm512_mul_ps(wt32, preMul22);
__m512 preMul23 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(3+16*c2+1709*i6))[0]);
__m512 preAdd23 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(3+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt33, preAdd23, sum3);
wt33 = _mm512_mul_ps(wt33, preMul23);
__m512 preMul24 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(4+16*c2+1709*i6))[0]);
__m512 preAdd24 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(4+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt34, preAdd24, sum3);
wt34 = _mm512_mul_ps(wt34, preMul24);
__m512 preMul25 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(5+16*c2+1709*i6))[0]);
__m512 preAdd25 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(5+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt35, preAdd25, sum3);
wt35 = _mm512_mul_ps(wt35, preMul25);
__m512 preMul26 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(6+16*c2+1709*i6))[0]);
__m512 preAdd26 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(6+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt36, preAdd26, sum3);
wt36 = _mm512_mul_ps(wt36, preMul26);
__m512 preMul27 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(7+16*c2+1709*i6))[0]);
__m512 preAdd27 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(7+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt37, preAdd27, sum3);
wt37 = _mm512_mul_ps(wt37, preMul27);
__m512 preMul28 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(8+16*c2+1709*i6))[0]);
__m512 preAdd28 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(8+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt38, preAdd28, sum3);
wt38 = _mm512_mul_ps(wt38, preMul28);
__m512 preMul29 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(9+16*c2+1709*i6))[0]);
__m512 preAdd29 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(9+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt39, preAdd29, sum3);
wt39 = _mm512_mul_ps(wt39, preMul29);
__m512 preMul30 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(10+16*c2+1709*i6))[0]);
__m512 preAdd30 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(10+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt40, preAdd30, sum3);
wt40 = _mm512_mul_ps(wt40, preMul30);
__m512 preMul31 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(11+16*c2+1709*i6))[0]);
__m512 preAdd31 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(11+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt41, preAdd31, sum3);
wt41 = _mm512_mul_ps(wt41, preMul31);
__m512 preMul32 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(12+16*c2+1709*i6))[0]);
__m512 preAdd32 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(12+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt42, preAdd32, sum3);
wt42 = _mm512_mul_ps(wt42, preMul32);
__m512 preMul33 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(13+16*c2+1709*i6))[0]);
__m512 preAdd33 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(13+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt43, preAdd33, sum3);
wt43 = _mm512_mul_ps(wt43, preMul33);
__m512 preMul34 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(14+16*c2+1709*i6))[0]);
__m512 preAdd34 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(14+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt44, preAdd34, sum3);
wt44 = _mm512_mul_ps(wt44, preMul34);
__m512 preMul35 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(15+16*c2+1709*i6))[0]);
__m512 preAdd35 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(15+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt45, preAdd35, sum3);
wt45 = _mm512_mul_ps(wt45, preMul35);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)0, 63>>cut2, wt30);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)0, 63>>cut2, wt31);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)0, 63>>cut2, wt32);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(4+16*c2)+(ptrdiff_t)0, 63>>cut2, wt33);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(5+16*c2)+(ptrdiff_t)0, 63>>cut2, wt34);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(6+16*c2)+(ptrdiff_t)0, 63>>cut2, wt35);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(7+16*c2)+(ptrdiff_t)0, 63>>cut2, wt36);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(8+16*c2)+(ptrdiff_t)0, 63>>cut2, wt37);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(9+16*c2)+(ptrdiff_t)0, 63>>cut2, wt38);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(10+16*c2)+(ptrdiff_t)0, 63>>cut2, wt39);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(11+16*c2)+(ptrdiff_t)0, 63>>cut2, wt40);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(12+16*c2)+(ptrdiff_t)0, 63>>cut2, wt41);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(13+16*c2)+(ptrdiff_t)0, 63>>cut2, wt42);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(14+16*c2)+(ptrdiff_t)0, 63>>cut2, wt43);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(15+16*c2)+(ptrdiff_t)0, 63>>cut2, wt44);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(16+16*c2)+(ptrdiff_t)0, 63>>cut2, wt45);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt30);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt31);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt32);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(4+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt33);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(5+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt34);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(6+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt35);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(7+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt36);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(8+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt37);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(9+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt38);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(10+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt39);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(11+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt40);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(12+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt41);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(13+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt42);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(14+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt43);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(15+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt44);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(16+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt45);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt30);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt31);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt32);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(4+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt33);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(5+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt34);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(6+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt35);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(7+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt36);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(8+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt37);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(9+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt38);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(10+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt39);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(11+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt40);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(12+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt41);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(13+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt42);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(14+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt43);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(15+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt44);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(16+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt45);
}
__m512 wt46 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)0);
__m512 wt47 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)6836);
__m512 wt48 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)13672);
__m512 wt49 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)20508);
__m512 wt50 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)27344);
__m512 wt51 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)34180);
__m512 wt52 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)41016);
__m512 wt53 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)47852);
__m512 wt54 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)54688);
__m512 wt55 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)61524);
__m512 wt56 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)68360);
__m512 wt57 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)75196);
__m512 wt58 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)82032);
__m512 wt59 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)88868);
__m512 wt60 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)95704);
__m512 wt61 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k2+64*c2+(ptrdiff_t)102540);
__m512 tmp49 = _mm512_unpacklo_ps(wt46, wt47);
__m512 tmp50 = _mm512_unpackhi_ps(wt46, wt47);
__m512 tmp51 = _mm512_unpacklo_ps(wt48, wt49);
__m512 tmp52 = _mm512_unpackhi_ps(wt48, wt49);
__m512 tmp53 = _mm512_unpacklo_ps(wt50, wt51);
__m512 tmp54 = _mm512_unpackhi_ps(wt50, wt51);
__m512 tmp55 = _mm512_unpacklo_ps(wt52, wt53);
__m512 tmp56 = _mm512_unpackhi_ps(wt52, wt53);
__m512 tmp57 = _mm512_unpacklo_ps(wt54, wt55);
__m512 tmp58 = _mm512_unpackhi_ps(wt54, wt55);
__m512 tmp59 = _mm512_unpacklo_ps(wt56, wt57);
__m512 tmp60 = _mm512_unpackhi_ps(wt56, wt57);
__m512 tmp61 = _mm512_unpacklo_ps(wt58, wt59);
__m512 tmp62 = _mm512_unpackhi_ps(wt58, wt59);
__m512 tmp63 = _mm512_unpacklo_ps(wt60, wt61);
__m512 tmp64 = _mm512_unpackhi_ps(wt60, wt61);
__m512 tmp65 = _mm512_shuffle_ps(tmp49, tmp51, 68);
__m512 tmp66 = _mm512_shuffle_ps(tmp49, tmp51, 238);
__m512 tmp67 = _mm512_shuffle_ps(tmp50, tmp52, 68);
__m512 tmp68 = _mm512_shuffle_ps(tmp53, tmp55, 68);
__m512 tmp69 = _mm512_shuffle_ps(tmp53, tmp55, 238);
__m512 tmp70 = _mm512_shuffle_ps(tmp54, tmp56, 68);
__m512 tmp71 = _mm512_shuffle_ps(tmp57, tmp59, 68);
__m512 tmp72 = _mm512_shuffle_ps(tmp57, tmp59, 238);
__m512 tmp73 = _mm512_shuffle_ps(tmp58, tmp60, 68);
__m512 tmp74 = _mm512_shuffle_ps(tmp61, tmp63, 68);
__m512 tmp75 = _mm512_shuffle_ps(tmp61, tmp63, 238);
__m512 tmp76 = _mm512_shuffle_ps(tmp62, tmp64, 68);
__m512 tmp77 = _mm512_shuffle_f32x4(tmp65, tmp68, 136);
__m512 tmp78 = _mm512_shuffle_f32x4(tmp66, tmp69, 136);
__m512 tmp79 = _mm512_shuffle_f32x4(tmp67, tmp70, 136);
__m512 tmp80 = _mm512_shuffle_f32x4(tmp71, tmp74, 136);
__m512 tmp81 = _mm512_shuffle_f32x4(tmp72, tmp75, 136);
__m512 tmp82 = _mm512_shuffle_f32x4(tmp73, tmp76, 136);
wt46 = _mm512_shuffle_f32x4(tmp77, tmp80, 136);
wt47 = _mm512_shuffle_f32x4(tmp78, tmp81, 136);
wt48 = _mm512_shuffle_f32x4(tmp79, tmp82, 136);
wt46 = _mm512_mul_ps(wt46, postMul2);
wt47 = _mm512_mul_ps(wt47, postMul2);
wt48 = _mm512_mul_ps(wt48, postMul2);
__m512 preMul36 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+16*c2+1709*i6))[0]);
__m512 preAdd36 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt46, preAdd36, sum3);
wt46 = _mm512_mul_ps(wt46, preMul36);
__m512 preMul37 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(1+16*c2+1709*i6))[0]);
__m512 preAdd37 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(1+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt47, preAdd37, sum3);
wt47 = _mm512_mul_ps(wt47, preMul37);
__m512 preMul38 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(2+16*c2+1709*i6))[0]);
__m512 preAdd38 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(2+16*c2+1709*i6))[1]);
sum3 = _mm512_fmadd_ps(wt48, preAdd38, sum3);
wt48 = _mm512_mul_ps(wt48, preMul38);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)0, 63>>cut2, wt46);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)0, 63>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)0, 63>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt46);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)20040, 4032>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(1+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt46);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(2+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt47);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*(3+16*c2)+(ptrdiff_t)40080, 65535-(4095>>cut2), wt48);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*0+(ptrdiff_t)0, 63>>cut2, sum3);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*0+(ptrdiff_t)20040, 4032>>cut2, sum3);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l2+4*cut2+24*0+(ptrdiff_t)40080, 65535-(4095>>cut2), sum3);
} else {
ptrdiff_t k1 = 16;
ptrdiff_t l1 = (size_t)(0+k1)/6;
ptrdiff_t cut1 = (size_t)(0+k1)%6;
__m512 sum2 = _mm512_maskz_loadu_ps(8191, biasPtr1+116*i6+4*k1);
__m512i pmMul2 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd2 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo2 = _mm512_loadu_ps(bnPtr2+(ptrdiff_t)8*(k1+29*i6));
__m512 masHi2 = _mm512_maskz_loadu_ps(1023, bnPtr2+(ptrdiff_t)8*(k1+29*i6)+(ptrdiff_t)64);
__m512 postMul1 = _mm512_permutex2var_ps(masLo2, pmMul2, masHi2);
__m512 postAdd1 = _mm512_permutex2var_ps(masLo2, pmAdd2, masHi2);
sum2 = _mm512_fmadd_ps(sum2, postMul1, postAdd1);
ptrdiff_t c1 = 0;
for (; c1 != 52; ++c1) {
__m512 wt1 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)0);
__m512 wt2 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)6836);
__m512 wt3 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)13672);
__m512 wt4 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)20508);
__m512 wt5 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)27344);
__m512 wt6 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)34180);
__m512 wt7 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)41016);
__m512 wt8 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)47852);
__m512 wt9 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)54688);
__m512 wt10 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)61524);
__m512 wt11 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)68360);
__m512 wt12 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)75196);
__m512 wt13 = _mm512_maskz_loadu_ps(65535, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)82032);
__m512 tmp83 = _mm512_unpacklo_ps(wt1, wt2);
__m512 tmp84 = _mm512_unpackhi_ps(wt1, wt2);
__m512 tmp85 = _mm512_unpacklo_ps(wt3, wt4);
__m512 tmp86 = _mm512_unpackhi_ps(wt3, wt4);
__m512 tmp87 = _mm512_unpacklo_ps(wt5, wt6);
__m512 tmp88 = _mm512_unpackhi_ps(wt5, wt6);
__m512 tmp89 = _mm512_unpacklo_ps(wt7, wt8);
__m512 tmp90 = _mm512_unpackhi_ps(wt7, wt8);
__m512 tmp91 = _mm512_unpacklo_ps(wt9, wt10);
__m512 tmp92 = _mm512_unpackhi_ps(wt9, wt10);
__m512 tmp93 = _mm512_unpacklo_ps(wt11, wt12);
__m512 tmp94 = _mm512_unpackhi_ps(wt11, wt12);
__m512 tmp95 = _mm512_unpacklo_ps(wt13, wt13);
__m512 tmp96 = _mm512_unpackhi_ps(wt13, wt13);
__m512 tmp97 = _mm512_shuffle_ps(tmp83, tmp85, 68);
__m512 tmp98 = _mm512_shuffle_ps(tmp83, tmp85, 238);
__m512 tmp99 = _mm512_shuffle_ps(tmp84, tmp86, 68);
__m512 tmp100 = _mm512_shuffle_ps(tmp84, tmp86, 238);
__m512 tmp101 = _mm512_shuffle_ps(tmp87, tmp89, 68);
__m512 tmp102 = _mm512_shuffle_ps(tmp87, tmp89, 238);
__m512 tmp103 = _mm512_shuffle_ps(tmp88, tmp90, 68);
__m512 tmp104 = _mm512_shuffle_ps(tmp88, tmp90, 238);
__m512 tmp105 = _mm512_shuffle_ps(tmp91, tmp93, 68);
__m512 tmp106 = _mm512_shuffle_ps(tmp91, tmp93, 238);
__m512 tmp107 = _mm512_shuffle_ps(tmp92, tmp94, 68);
__m512 tmp108 = _mm512_shuffle_ps(tmp92, tmp94, 238);
__m512 tmp109 = _mm512_shuffle_ps(tmp95, tmp95, 238);
__m512 tmp110 = _mm512_shuffle_ps(tmp96, tmp96, 238);
__m512 tmp111 = _mm512_shuffle_f32x4(tmp97, tmp101, 136);
__m512 tmp112 = _mm512_shuffle_f32x4(tmp97, tmp101, 221);
__m512 tmp113 = _mm512_shuffle_f32x4(tmp98, tmp102, 136);
__m512 tmp114 = _mm512_shuffle_f32x4(tmp98, tmp102, 221);
__m512 tmp115 = _mm512_shuffle_f32x4(tmp99, tmp103, 136);
__m512 tmp116 = _mm512_shuffle_f32x4(tmp99, tmp103, 221);
__m512 tmp117 = _mm512_shuffle_f32x4(tmp100, tmp104, 136);
__m512 tmp118 = _mm512_shuffle_f32x4(tmp100, tmp104, 221);
__m512 tmp119 = _mm512_shuffle_f32x4(tmp105, tmp95, 136);
__m512 tmp120 = _mm512_shuffle_f32x4(tmp105, tmp95, 221);
__m512 tmp121 = _mm512_shuffle_f32x4(tmp106, tmp109, 136);
__m512 tmp122 = _mm512_shuffle_f32x4(tmp106, tmp109, 221);
__m512 tmp123 = _mm512_shuffle_f32x4(tmp107, tmp96, 136);
__m512 tmp124 = _mm512_shuffle_f32x4(tmp107, tmp96, 221);
__m512 tmp125 = _mm512_shuffle_f32x4(tmp108, tmp110, 136);
__m512 tmp126 = _mm512_shuffle_f32x4(tmp108, tmp110, 221);
wt1 = _mm512_shuffle_f32x4(tmp111, tmp119, 136);
wt9 = _mm512_shuffle_f32x4(tmp111, tmp119, 221);
wt2 = _mm512_shuffle_f32x4(tmp113, tmp121, 136);
wt10 = _mm512_shuffle_f32x4(tmp113, tmp121, 221);
wt3 = _mm512_shuffle_f32x4(tmp115, tmp123, 136);
wt11 = _mm512_shuffle_f32x4(tmp115, tmp123, 221);
wt4 = _mm512_shuffle_f32x4(tmp117, tmp125, 136);
wt12 = _mm512_shuffle_f32x4(tmp117, tmp125, 221);
wt5 = _mm512_shuffle_f32x4(tmp112, tmp120, 136);
wt13 = _mm512_shuffle_f32x4(tmp112, tmp120, 221);
wt6 = _mm512_shuffle_f32x4(tmp114, tmp122, 136);
__m512 wt14 = _mm512_shuffle_f32x4(tmp114, tmp122, 221);
wt7 = _mm512_shuffle_f32x4(tmp116, tmp124, 136);
__m512 wt15 = _mm512_shuffle_f32x4(tmp116, tmp124, 221);
wt8 = _mm512_shuffle_f32x4(tmp118, tmp126, 136);
__m512 wt16 = _mm512_shuffle_f32x4(tmp118, tmp126, 221);
wt1 = _mm512_mul_ps(wt1, postMul1);
wt2 = _mm512_mul_ps(wt2, postMul1);
wt3 = _mm512_mul_ps(wt3, postMul1);
wt4 = _mm512_mul_ps(wt4, postMul1);
wt5 = _mm512_mul_ps(wt5, postMul1);
wt6 = _mm512_mul_ps(wt6, postMul1);
wt7 = _mm512_mul_ps(wt7, postMul1);
wt8 = _mm512_mul_ps(wt8, postMul1);
wt9 = _mm512_mul_ps(wt9, postMul1);
wt10 = _mm512_mul_ps(wt10, postMul1);
wt11 = _mm512_mul_ps(wt11, postMul1);
wt12 = _mm512_mul_ps(wt12, postMul1);
wt13 = _mm512_mul_ps(wt13, postMul1);
wt14 = _mm512_mul_ps(wt14, postMul1);
wt15 = _mm512_mul_ps(wt15, postMul1);
wt16 = _mm512_mul_ps(wt16, postMul1);
__m512 preMul1 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+16*c1+1709*i6))[0]);
__m512 preAdd1 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt1, preAdd1, sum2);
wt1 = _mm512_mul_ps(wt1, preMul1);
__m512 preMul2 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(1+16*c1+1709*i6))[0]);
__m512 preAdd2 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(1+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt2, preAdd2, sum2);
wt2 = _mm512_mul_ps(wt2, preMul2);
__m512 preMul3 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(2+16*c1+1709*i6))[0]);
__m512 preAdd3 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(2+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt3, preAdd3, sum2);
wt3 = _mm512_mul_ps(wt3, preMul3);
__m512 preMul4 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(3+16*c1+1709*i6))[0]);
__m512 preAdd4 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(3+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt4, preAdd4, sum2);
wt4 = _mm512_mul_ps(wt4, preMul4);
__m512 preMul5 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(4+16*c1+1709*i6))[0]);
__m512 preAdd5 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(4+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt5, preAdd5, sum2);
wt5 = _mm512_mul_ps(wt5, preMul5);
__m512 preMul6 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(5+16*c1+1709*i6))[0]);
__m512 preAdd6 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(5+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt6, preAdd6, sum2);
wt6 = _mm512_mul_ps(wt6, preMul6);
__m512 preMul7 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(6+16*c1+1709*i6))[0]);
__m512 preAdd7 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(6+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt7, preAdd7, sum2);
wt7 = _mm512_mul_ps(wt7, preMul7);
__m512 preMul8 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(7+16*c1+1709*i6))[0]);
__m512 preAdd8 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(7+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt8, preAdd8, sum2);
wt8 = _mm512_mul_ps(wt8, preMul8);
__m512 preMul9 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(8+16*c1+1709*i6))[0]);
__m512 preAdd9 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(8+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt9, preAdd9, sum2);
wt9 = _mm512_mul_ps(wt9, preMul9);
__m512 preMul10 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(9+16*c1+1709*i6))[0]);
__m512 preAdd10 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(9+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt10, preAdd10, sum2);
wt10 = _mm512_mul_ps(wt10, preMul10);
__m512 preMul11 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(10+16*c1+1709*i6))[0]);
__m512 preAdd11 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(10+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt11, preAdd11, sum2);
wt11 = _mm512_mul_ps(wt11, preMul11);
__m512 preMul12 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(11+16*c1+1709*i6))[0]);
__m512 preAdd12 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(11+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt12, preAdd12, sum2);
wt12 = _mm512_mul_ps(wt12, preMul12);
__m512 preMul13 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(12+16*c1+1709*i6))[0]);
__m512 preAdd13 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(12+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt13, preAdd13, sum2);
wt13 = _mm512_mul_ps(wt13, preMul13);
__m512 preMul14 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(13+16*c1+1709*i6))[0]);
__m512 preAdd14 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(13+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt14, preAdd14, sum2);
wt14 = _mm512_mul_ps(wt14, preMul14);
__m512 preMul15 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(14+16*c1+1709*i6))[0]);
__m512 preAdd15 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(14+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt15, preAdd15, sum2);
wt15 = _mm512_mul_ps(wt15, preMul15);
__m512 preMul16 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(15+16*c1+1709*i6))[0]);
__m512 preAdd16 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(15+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt16, preAdd16, sum2);
wt16 = _mm512_mul_ps(wt16, preMul16);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(1+16*c1)+(ptrdiff_t)0, 63>>cut1, wt1);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(2+16*c1)+(ptrdiff_t)0, 63>>cut1, wt2);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(3+16*c1)+(ptrdiff_t)0, 63>>cut1, wt3);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(4+16*c1)+(ptrdiff_t)0, 63>>cut1, wt4);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(5+16*c1)+(ptrdiff_t)0, 63>>cut1, wt5);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(6+16*c1)+(ptrdiff_t)0, 63>>cut1, wt6);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(7+16*c1)+(ptrdiff_t)0, 63>>cut1, wt7);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(8+16*c1)+(ptrdiff_t)0, 63>>cut1, wt8);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(9+16*c1)+(ptrdiff_t)0, 63>>cut1, wt9);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(10+16*c1)+(ptrdiff_t)0, 63>>cut1, wt10);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(11+16*c1)+(ptrdiff_t)0, 63>>cut1, wt11);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(12+16*c1)+(ptrdiff_t)0, 63>>cut1, wt12);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(13+16*c1)+(ptrdiff_t)0, 63>>cut1, wt13);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(14+16*c1)+(ptrdiff_t)0, 63>>cut1, wt14);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(15+16*c1)+(ptrdiff_t)0, 63>>cut1, wt15);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(16+16*c1)+(ptrdiff_t)0, 63>>cut1, wt16);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(1+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt1);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(2+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt2);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(3+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt3);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(4+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt4);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(5+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt5);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(6+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt6);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(7+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt7);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(8+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt8);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(9+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt9);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(10+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt10);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(11+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt11);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(12+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt12);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(13+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt13);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(14+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt14);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(15+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt15);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(16+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt16);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(1+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt1);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(2+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt2);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(3+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt3);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(4+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt4);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(5+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt5);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(6+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt6);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(7+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt7);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(8+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt8);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(9+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt9);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(10+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt10);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(11+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt11);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(12+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt12);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(13+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt13);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(14+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt14);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(15+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt15);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(16+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt16);
}
__m512 wt17 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)0);
__m512 wt18 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)6836);
__m512 wt19 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)13672);
__m512 wt20 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)20508);
__m512 wt21 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)27344);
__m512 wt22 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)34180);
__m512 wt23 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)41016);
__m512 wt24 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)47852);
__m512 wt25 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)54688);
__m512 wt26 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)61524);
__m512 wt27 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)68360);
__m512 wt28 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)75196);
__m512 wt29 = _mm512_maskz_loadu_ps(7, wtPtr1+198244*i6+6836*k1+64*c1+(ptrdiff_t)82032);
__m512 tmp127 = _mm512_unpacklo_ps(wt17, wt18);
__m512 tmp128 = _mm512_unpackhi_ps(wt17, wt18);
__m512 tmp129 = _mm512_unpacklo_ps(wt19, wt20);
__m512 tmp130 = _mm512_unpackhi_ps(wt19, wt20);
__m512 tmp131 = _mm512_unpacklo_ps(wt21, wt22);
__m512 tmp132 = _mm512_unpackhi_ps(wt21, wt22);
__m512 tmp133 = _mm512_unpacklo_ps(wt23, wt24);
__m512 tmp134 = _mm512_unpackhi_ps(wt23, wt24);
__m512 tmp135 = _mm512_unpacklo_ps(wt25, wt26);
__m512 tmp136 = _mm512_unpackhi_ps(wt25, wt26);
__m512 tmp137 = _mm512_unpacklo_ps(wt27, wt28);
__m512 tmp138 = _mm512_unpackhi_ps(wt27, wt28);
__m512 tmp139 = _mm512_unpacklo_ps(wt29, wt29);
__m512 tmp140 = _mm512_unpackhi_ps(wt29, wt29);
__m512 tmp141 = _mm512_shuffle_ps(tmp127, tmp129, 68);
__m512 tmp142 = _mm512_shuffle_ps(tmp127, tmp129, 238);
__m512 tmp143 = _mm512_shuffle_ps(tmp128, tmp130, 68);
__m512 tmp144 = _mm512_shuffle_ps(tmp131, tmp133, 68);
__m512 tmp145 = _mm512_shuffle_ps(tmp131, tmp133, 238);
__m512 tmp146 = _mm512_shuffle_ps(tmp132, tmp134, 68);
__m512 tmp147 = _mm512_shuffle_ps(tmp135, tmp137, 68);
__m512 tmp148 = _mm512_shuffle_ps(tmp135, tmp137, 238);
__m512 tmp149 = _mm512_shuffle_ps(tmp136, tmp138, 68);
__m512 tmp150 = _mm512_shuffle_ps(tmp139, tmp139, 238);
__m512 tmp151 = _mm512_shuffle_f32x4(tmp141, tmp144, 136);
__m512 tmp152 = _mm512_shuffle_f32x4(tmp142, tmp145, 136);
__m512 tmp153 = _mm512_shuffle_f32x4(tmp143, tmp146, 136);
__m512 tmp154 = _mm512_shuffle_f32x4(tmp147, tmp139, 136);
__m512 tmp155 = _mm512_shuffle_f32x4(tmp148, tmp150, 136);
__m512 tmp156 = _mm512_shuffle_f32x4(tmp149, tmp140, 136);
wt17 = _mm512_shuffle_f32x4(tmp151, tmp154, 136);
wt18 = _mm512_shuffle_f32x4(tmp152, tmp155, 136);
wt19 = _mm512_shuffle_f32x4(tmp153, tmp156, 136);
wt17 = _mm512_mul_ps(wt17, postMul1);
wt18 = _mm512_mul_ps(wt18, postMul1);
wt19 = _mm512_mul_ps(wt19, postMul1);
__m512 preMul17 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+16*c1+1709*i6))[0]);
__m512 preAdd17 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt17, preAdd17, sum2);
wt17 = _mm512_mul_ps(wt17, preMul17);
__m512 preMul18 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(1+16*c1+1709*i6))[0]);
__m512 preAdd18 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(1+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt18, preAdd18, sum2);
wt18 = _mm512_mul_ps(wt18, preMul18);
__m512 preMul19 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(2+16*c1+1709*i6))[0]);
__m512 preAdd19 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(2+16*c1+1709*i6))[1]);
sum2 = _mm512_fmadd_ps(wt19, preAdd19, sum2);
wt19 = _mm512_mul_ps(wt19, preMul19);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(1+16*c1)+(ptrdiff_t)0, 63>>cut1, wt17);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(2+16*c1)+(ptrdiff_t)0, 63>>cut1, wt18);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(3+16*c1)+(ptrdiff_t)0, 63>>cut1, wt19);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(1+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt17);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(2+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt18);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*(3+16*c1)+(ptrdiff_t)20040, 4032>>cut1, wt19);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(1+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt17);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(2+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt18);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*(3+16*c1)+(ptrdiff_t)40080, 8191-(4095>>cut1), wt19);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*0+(ptrdiff_t)0, 63>>cut1, sum2);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+24*0+(ptrdiff_t)20040, 4032>>cut1, sum2);
_mm512_mask_storeu_ps(arranged1+96976*i6+20064*l1+4*cut1+20*0+(ptrdiff_t)40080, 8191-(4095>>cut1), sum2);
}
}
}
return;
}
char*restrict wtPtr2 = tensors2[0]+(ptrdiff_t)3340*1+(ptrdiff_t)198244*0;
char*restrict bnPtr3 = tensors2[2]+(ptrdiff_t)8*((ptrdiff_t)835*1+(ptrdiff_t)1709*0);
char*restrict bnPtr4 = tensors2[3]+(ptrdiff_t)8*29*0;
char*restrict arranged2 = tensors2[4]+(ptrdiff_t)96976*1+(ptrdiff_t)101500*0;
ptrdiff_t ii2 = 1;
for (ptrdiff_t i7 = 0; i7 < ii2; ++i7) {
ptrdiff_t j2 = 1*b2;
ptrdiff_t jj2 = j2+1;
for (; j2 < jj2; ++j2) {
if (j2 < 1) {
ptrdiff_t k4 = 0;
ptrdiff_t l4 = (size_t)(0+k4)/6;
ptrdiff_t cut4 = (size_t)(0+k4)%6;
__m512 sum5 = _mm512_setzero_ps();
__m512i pmMul3 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd3 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo3 = _mm512_loadu_ps(bnPtr4+(ptrdiff_t)8*(k4+29*i7));
__m512 masHi3 = _mm512_maskz_loadu_ps(65535, bnPtr4+(ptrdiff_t)8*(k4+29*i7)+(ptrdiff_t)64);
__m512 postMul4 = _mm512_permutex2var_ps(masLo3, pmMul3, masHi3);
__m512 postAdd4 = _mm512_permutex2var_ps(masLo3, pmAdd3, masHi3);
(void)postAdd4;
ptrdiff_t c4 = 0;
for (; c4 != 54; ++c4) {
__m512 wt91 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)0);
__m512 wt92 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)6836);
__m512 wt93 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)13672);
__m512 wt94 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)20508);
__m512 wt95 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)27344);
__m512 wt96 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)34180);
__m512 wt97 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)41016);
__m512 wt98 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)47852);
__m512 wt99 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)54688);
__m512 wt100 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)61524);
__m512 wt101 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)68360);
__m512 wt102 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)75196);
__m512 wt103 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)82032);
__m512 wt104 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)88868);
__m512 wt105 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)95704);
__m512 wt106 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)102540);
__m512 tmp157 = _mm512_unpacklo_ps(wt91, wt92);
__m512 tmp158 = _mm512_unpackhi_ps(wt91, wt92);
__m512 tmp159 = _mm512_unpacklo_ps(wt93, wt94);
__m512 tmp160 = _mm512_unpackhi_ps(wt93, wt94);
__m512 tmp161 = _mm512_unpacklo_ps(wt95, wt96);
__m512 tmp162 = _mm512_unpackhi_ps(wt95, wt96);
__m512 tmp163 = _mm512_unpacklo_ps(wt97, wt98);
__m512 tmp164 = _mm512_unpackhi_ps(wt97, wt98);
__m512 tmp165 = _mm512_unpacklo_ps(wt99, wt100);
__m512 tmp166 = _mm512_unpackhi_ps(wt99, wt100);
__m512 tmp167 = _mm512_unpacklo_ps(wt101, wt102);
__m512 tmp168 = _mm512_unpackhi_ps(wt101, wt102);
__m512 tmp169 = _mm512_unpacklo_ps(wt103, wt104);
__m512 tmp170 = _mm512_unpackhi_ps(wt103, wt104);
__m512 tmp171 = _mm512_unpacklo_ps(wt105, wt106);
__m512 tmp172 = _mm512_unpackhi_ps(wt105, wt106);
__m512 tmp173 = _mm512_shuffle_ps(tmp157, tmp159, 68);
__m512 tmp174 = _mm512_shuffle_ps(tmp157, tmp159, 238);
__m512 tmp175 = _mm512_shuffle_ps(tmp158, tmp160, 68);
__m512 tmp176 = _mm512_shuffle_ps(tmp158, tmp160, 238);
__m512 tmp177 = _mm512_shuffle_ps(tmp161, tmp163, 68);
__m512 tmp178 = _mm512_shuffle_ps(tmp161, tmp163, 238);
__m512 tmp179 = _mm512_shuffle_ps(tmp162, tmp164, 68);
__m512 tmp180 = _mm512_shuffle_ps(tmp162, tmp164, 238);
__m512 tmp181 = _mm512_shuffle_ps(tmp165, tmp167, 68);
__m512 tmp182 = _mm512_shuffle_ps(tmp165, tmp167, 238);
__m512 tmp183 = _mm512_shuffle_ps(tmp166, tmp168, 68);
__m512 tmp184 = _mm512_shuffle_ps(tmp166, tmp168, 238);
__m512 tmp185 = _mm512_shuffle_ps(tmp169, tmp171, 68);
__m512 tmp186 = _mm512_shuffle_ps(tmp169, tmp171, 238);
__m512 tmp187 = _mm512_shuffle_ps(tmp170, tmp172, 68);
__m512 tmp188 = _mm512_shuffle_ps(tmp170, tmp172, 238);
__m512 tmp189 = _mm512_shuffle_f32x4(tmp173, tmp177, 136);
__m512 tmp190 = _mm512_shuffle_f32x4(tmp173, tmp177, 221);
__m512 tmp191 = _mm512_shuffle_f32x4(tmp174, tmp178, 136);
__m512 tmp192 = _mm512_shuffle_f32x4(tmp174, tmp178, 221);
__m512 tmp193 = _mm512_shuffle_f32x4(tmp175, tmp179, 136);
__m512 tmp194 = _mm512_shuffle_f32x4(tmp175, tmp179, 221);
__m512 tmp195 = _mm512_shuffle_f32x4(tmp176, tmp180, 136);
__m512 tmp196 = _mm512_shuffle_f32x4(tmp176, tmp180, 221);
__m512 tmp197 = _mm512_shuffle_f32x4(tmp181, tmp185, 136);
__m512 tmp198 = _mm512_shuffle_f32x4(tmp181, tmp185, 221);
__m512 tmp199 = _mm512_shuffle_f32x4(tmp182, tmp186, 136);
__m512 tmp200 = _mm512_shuffle_f32x4(tmp182, tmp186, 221);
__m512 tmp201 = _mm512_shuffle_f32x4(tmp183, tmp187, 136);
__m512 tmp202 = _mm512_shuffle_f32x4(tmp183, tmp187, 221);
__m512 tmp203 = _mm512_shuffle_f32x4(tmp184, tmp188, 136);
__m512 tmp204 = _mm512_shuffle_f32x4(tmp184, tmp188, 221);
wt91 = _mm512_shuffle_f32x4(tmp189, tmp197, 136);
wt99 = _mm512_shuffle_f32x4(tmp189, tmp197, 221);
wt92 = _mm512_shuffle_f32x4(tmp191, tmp199, 136);
wt100 = _mm512_shuffle_f32x4(tmp191, tmp199, 221);
wt93 = _mm512_shuffle_f32x4(tmp193, tmp201, 136);
wt101 = _mm512_shuffle_f32x4(tmp193, tmp201, 221);
wt94 = _mm512_shuffle_f32x4(tmp195, tmp203, 136);
wt102 = _mm512_shuffle_f32x4(tmp195, tmp203, 221);
wt95 = _mm512_shuffle_f32x4(tmp190, tmp198, 136);
wt103 = _mm512_shuffle_f32x4(tmp190, tmp198, 221);
wt96 = _mm512_shuffle_f32x4(tmp192, tmp200, 136);
wt104 = _mm512_shuffle_f32x4(tmp192, tmp200, 221);
wt97 = _mm512_shuffle_f32x4(tmp194, tmp202, 136);
wt105 = _mm512_shuffle_f32x4(tmp194, tmp202, 221);
wt98 = _mm512_shuffle_f32x4(tmp196, tmp204, 136);
wt106 = _mm512_shuffle_f32x4(tmp196, tmp204, 221);
wt91 = _mm512_mul_ps(wt91, postMul4);
wt92 = _mm512_mul_ps(wt92, postMul4);
wt93 = _mm512_mul_ps(wt93, postMul4);
wt94 = _mm512_mul_ps(wt94, postMul4);
wt95 = _mm512_mul_ps(wt95, postMul4);
wt96 = _mm512_mul_ps(wt96, postMul4);
wt97 = _mm512_mul_ps(wt97, postMul4);
wt98 = _mm512_mul_ps(wt98, postMul4);
wt99 = _mm512_mul_ps(wt99, postMul4);
wt100 = _mm512_mul_ps(wt100, postMul4);
wt101 = _mm512_mul_ps(wt101, postMul4);
wt102 = _mm512_mul_ps(wt102, postMul4);
wt103 = _mm512_mul_ps(wt103, postMul4);
wt104 = _mm512_mul_ps(wt104, postMul4);
wt105 = _mm512_mul_ps(wt105, postMul4);
wt106 = _mm512_mul_ps(wt106, postMul4);
__m512 preMul65 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(0+16*c4+1709*i7))[0]);
__m512 preAdd65 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(0+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt91, preAdd65, sum5);
wt91 = _mm512_mul_ps(wt91, preMul65);
__m512 preMul66 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(1+16*c4+1709*i7))[0]);
__m512 preAdd66 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(1+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt92, preAdd66, sum5);
wt92 = _mm512_mul_ps(wt92, preMul66);
__m512 preMul67 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(2+16*c4+1709*i7))[0]);
__m512 preAdd67 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(2+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt93, preAdd67, sum5);
wt93 = _mm512_mul_ps(wt93, preMul67);
__m512 preMul68 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(3+16*c4+1709*i7))[0]);
__m512 preAdd68 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(3+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt94, preAdd68, sum5);
wt94 = _mm512_mul_ps(wt94, preMul68);
__m512 preMul69 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(4+16*c4+1709*i7))[0]);
__m512 preAdd69 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(4+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt95, preAdd69, sum5);
wt95 = _mm512_mul_ps(wt95, preMul69);
__m512 preMul70 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(5+16*c4+1709*i7))[0]);
__m512 preAdd70 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(5+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt96, preAdd70, sum5);
wt96 = _mm512_mul_ps(wt96, preMul70);
__m512 preMul71 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(6+16*c4+1709*i7))[0]);
__m512 preAdd71 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(6+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt97, preAdd71, sum5);
wt97 = _mm512_mul_ps(wt97, preMul71);
__m512 preMul72 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(7+16*c4+1709*i7))[0]);
__m512 preAdd72 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(7+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt98, preAdd72, sum5);
wt98 = _mm512_mul_ps(wt98, preMul72);
__m512 preMul73 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(8+16*c4+1709*i7))[0]);
__m512 preAdd73 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(8+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt99, preAdd73, sum5);
wt99 = _mm512_mul_ps(wt99, preMul73);
__m512 preMul74 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(9+16*c4+1709*i7))[0]);
__m512 preAdd74 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(9+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt100, preAdd74, sum5);
wt100 = _mm512_mul_ps(wt100, preMul74);
__m512 preMul75 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(10+16*c4+1709*i7))[0]);
__m512 preAdd75 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(10+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt101, preAdd75, sum5);
wt101 = _mm512_mul_ps(wt101, preMul75);
__m512 preMul76 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(11+16*c4+1709*i7))[0]);
__m512 preAdd76 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(11+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt102, preAdd76, sum5);
wt102 = _mm512_mul_ps(wt102, preMul76);
__m512 preMul77 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(12+16*c4+1709*i7))[0]);
__m512 preAdd77 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(12+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt103, preAdd77, sum5);
wt103 = _mm512_mul_ps(wt103, preMul77);
__m512 preMul78 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(13+16*c4+1709*i7))[0]);
__m512 preAdd78 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(13+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt104, preAdd78, sum5);
wt104 = _mm512_mul_ps(wt104, preMul78);
__m512 preMul79 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(14+16*c4+1709*i7))[0]);
__m512 preAdd79 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(14+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt105, preAdd79, sum5);
wt105 = _mm512_mul_ps(wt105, preMul79);
__m512 preMul80 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(15+16*c4+1709*i7))[0]);
__m512 preAdd80 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(15+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt106, preAdd80, sum5);
wt106 = _mm512_mul_ps(wt106, preMul80);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(1+16*c4)+(ptrdiff_t)0, 63>>cut4, wt91);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(2+16*c4)+(ptrdiff_t)0, 63>>cut4, wt92);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(3+16*c4)+(ptrdiff_t)0, 63>>cut4, wt93);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(4+16*c4)+(ptrdiff_t)0, 63>>cut4, wt94);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(5+16*c4)+(ptrdiff_t)0, 63>>cut4, wt95);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(6+16*c4)+(ptrdiff_t)0, 63>>cut4, wt96);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(7+16*c4)+(ptrdiff_t)0, 63>>cut4, wt97);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(8+16*c4)+(ptrdiff_t)0, 63>>cut4, wt98);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(9+16*c4)+(ptrdiff_t)0, 63>>cut4, wt99);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(10+16*c4)+(ptrdiff_t)0, 63>>cut4, wt100);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(11+16*c4)+(ptrdiff_t)0, 63>>cut4, wt101);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(12+16*c4)+(ptrdiff_t)0, 63>>cut4, wt102);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(13+16*c4)+(ptrdiff_t)0, 63>>cut4, wt103);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(14+16*c4)+(ptrdiff_t)0, 63>>cut4, wt104);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(15+16*c4)+(ptrdiff_t)0, 63>>cut4, wt105);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(16+16*c4)+(ptrdiff_t)0, 63>>cut4, wt106);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(1+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt91);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(2+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt92);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(3+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt93);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(4+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt94);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(5+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt95);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(6+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt96);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(7+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt97);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(8+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt98);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(9+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt99);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(10+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt100);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(11+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt101);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(12+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt102);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(13+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt103);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(14+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt104);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(15+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt105);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(16+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt106);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(1+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt91);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(2+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt92);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(3+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt93);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(4+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt94);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(5+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt95);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(6+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt96);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(7+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt97);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(8+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt98);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(9+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt99);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(10+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt100);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(11+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt101);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(12+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt102);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(13+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt103);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(14+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt104);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(15+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt105);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(16+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt106);
}
__m512 wt107 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)0);
__m512 wt108 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)6836);
__m512 wt109 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)13672);
__m512 wt110 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)20508);
__m512 wt111 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)27344);
__m512 wt112 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)34180);
__m512 wt113 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)41016);
__m512 wt114 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)47852);
__m512 wt115 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)54688);
__m512 wt116 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)61524);
__m512 wt117 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)68360);
__m512 wt118 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)75196);
__m512 wt119 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)82032);
__m512 wt120 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)88868);
__m512 wt121 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)95704);
__m512 wt122 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k4+64*c4+(ptrdiff_t)102540);
__m512 tmp205 = _mm512_unpacklo_ps(wt107, wt108);
__m512 tmp206 = _mm512_unpackhi_ps(wt107, wt108);
__m512 tmp207 = _mm512_unpacklo_ps(wt109, wt110);
__m512 tmp208 = _mm512_unpackhi_ps(wt109, wt110);
__m512 tmp209 = _mm512_unpacklo_ps(wt111, wt112);
__m512 tmp210 = _mm512_unpackhi_ps(wt111, wt112);
__m512 tmp211 = _mm512_unpacklo_ps(wt113, wt114);
__m512 tmp212 = _mm512_unpackhi_ps(wt113, wt114);
__m512 tmp213 = _mm512_unpacklo_ps(wt115, wt116);
__m512 tmp214 = _mm512_unpackhi_ps(wt115, wt116);
__m512 tmp215 = _mm512_unpacklo_ps(wt117, wt118);
__m512 tmp216 = _mm512_unpackhi_ps(wt117, wt118);
__m512 tmp217 = _mm512_unpacklo_ps(wt119, wt120);
__m512 tmp218 = _mm512_unpackhi_ps(wt119, wt120);
__m512 tmp219 = _mm512_unpacklo_ps(wt121, wt122);
__m512 tmp220 = _mm512_unpackhi_ps(wt121, wt122);
__m512 tmp221 = _mm512_shuffle_ps(tmp205, tmp207, 68);
__m512 tmp222 = _mm512_shuffle_ps(tmp205, tmp207, 238);
__m512 tmp223 = _mm512_shuffle_ps(tmp206, tmp208, 68);
__m512 tmp224 = _mm512_shuffle_ps(tmp206, tmp208, 238);
__m512 tmp225 = _mm512_shuffle_ps(tmp209, tmp211, 68);
__m512 tmp226 = _mm512_shuffle_ps(tmp209, tmp211, 238);
__m512 tmp227 = _mm512_shuffle_ps(tmp210, tmp212, 68);
__m512 tmp228 = _mm512_shuffle_ps(tmp210, tmp212, 238);
__m512 tmp229 = _mm512_shuffle_ps(tmp213, tmp215, 68);
__m512 tmp230 = _mm512_shuffle_ps(tmp213, tmp215, 238);
__m512 tmp231 = _mm512_shuffle_ps(tmp214, tmp216, 68);
__m512 tmp232 = _mm512_shuffle_ps(tmp214, tmp216, 238);
__m512 tmp233 = _mm512_shuffle_ps(tmp217, tmp219, 68);
__m512 tmp234 = _mm512_shuffle_ps(tmp217, tmp219, 238);
__m512 tmp235 = _mm512_shuffle_ps(tmp218, tmp220, 68);
__m512 tmp236 = _mm512_shuffle_ps(tmp218, tmp220, 238);
__m512 tmp237 = _mm512_shuffle_f32x4(tmp221, tmp225, 136);
__m512 tmp238 = _mm512_shuffle_f32x4(tmp221, tmp225, 221);
__m512 tmp239 = _mm512_shuffle_f32x4(tmp222, tmp226, 136);
__m512 tmp240 = _mm512_shuffle_f32x4(tmp222, tmp226, 221);
__m512 tmp241 = _mm512_shuffle_f32x4(tmp223, tmp227, 136);
__m512 tmp242 = _mm512_shuffle_f32x4(tmp223, tmp227, 221);
__m512 tmp243 = _mm512_shuffle_f32x4(tmp224, tmp228, 136);
__m512 tmp244 = _mm512_shuffle_f32x4(tmp224, tmp228, 221);
__m512 tmp245 = _mm512_shuffle_f32x4(tmp229, tmp233, 136);
__m512 tmp246 = _mm512_shuffle_f32x4(tmp229, tmp233, 221);
__m512 tmp247 = _mm512_shuffle_f32x4(tmp230, tmp234, 136);
__m512 tmp248 = _mm512_shuffle_f32x4(tmp230, tmp234, 221);
__m512 tmp249 = _mm512_shuffle_f32x4(tmp231, tmp235, 136);
__m512 tmp250 = _mm512_shuffle_f32x4(tmp231, tmp235, 221);
__m512 tmp251 = _mm512_shuffle_f32x4(tmp232, tmp236, 136);
__m512 tmp252 = _mm512_shuffle_f32x4(tmp232, tmp236, 221);
wt107 = _mm512_shuffle_f32x4(tmp237, tmp245, 136);
wt115 = _mm512_shuffle_f32x4(tmp237, tmp245, 221);
wt108 = _mm512_shuffle_f32x4(tmp239, tmp247, 136);
wt116 = _mm512_shuffle_f32x4(tmp239, tmp247, 221);
wt109 = _mm512_shuffle_f32x4(tmp241, tmp249, 136);
wt110 = _mm512_shuffle_f32x4(tmp243, tmp251, 136);
wt111 = _mm512_shuffle_f32x4(tmp238, tmp246, 136);
wt112 = _mm512_shuffle_f32x4(tmp240, tmp248, 136);
wt113 = _mm512_shuffle_f32x4(tmp242, tmp250, 136);
wt114 = _mm512_shuffle_f32x4(tmp244, tmp252, 136);
wt107 = _mm512_mul_ps(wt107, postMul4);
wt108 = _mm512_mul_ps(wt108, postMul4);
wt109 = _mm512_mul_ps(wt109, postMul4);
wt110 = _mm512_mul_ps(wt110, postMul4);
wt111 = _mm512_mul_ps(wt111, postMul4);
wt112 = _mm512_mul_ps(wt112, postMul4);
wt113 = _mm512_mul_ps(wt113, postMul4);
wt114 = _mm512_mul_ps(wt114, postMul4);
wt115 = _mm512_mul_ps(wt115, postMul4);
wt116 = _mm512_mul_ps(wt116, postMul4);
__m512 preMul81 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(0+16*c4+1709*i7))[0]);
__m512 preAdd81 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(0+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt107, preAdd81, sum5);
wt107 = _mm512_mul_ps(wt107, preMul81);
__m512 preMul82 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(1+16*c4+1709*i7))[0]);
__m512 preAdd82 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(1+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt108, preAdd82, sum5);
wt108 = _mm512_mul_ps(wt108, preMul82);
__m512 preMul83 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(2+16*c4+1709*i7))[0]);
__m512 preAdd83 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(2+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt109, preAdd83, sum5);
wt109 = _mm512_mul_ps(wt109, preMul83);
__m512 preMul84 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(3+16*c4+1709*i7))[0]);
__m512 preAdd84 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(3+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt110, preAdd84, sum5);
wt110 = _mm512_mul_ps(wt110, preMul84);
__m512 preMul85 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(4+16*c4+1709*i7))[0]);
__m512 preAdd85 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(4+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt111, preAdd85, sum5);
wt111 = _mm512_mul_ps(wt111, preMul85);
__m512 preMul86 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(5+16*c4+1709*i7))[0]);
__m512 preAdd86 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(5+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt112, preAdd86, sum5);
wt112 = _mm512_mul_ps(wt112, preMul86);
__m512 preMul87 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(6+16*c4+1709*i7))[0]);
__m512 preAdd87 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(6+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt113, preAdd87, sum5);
wt113 = _mm512_mul_ps(wt113, preMul87);
__m512 preMul88 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(7+16*c4+1709*i7))[0]);
__m512 preAdd88 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(7+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt114, preAdd88, sum5);
wt114 = _mm512_mul_ps(wt114, preMul88);
__m512 preMul89 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(8+16*c4+1709*i7))[0]);
__m512 preAdd89 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(8+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt115, preAdd89, sum5);
wt115 = _mm512_mul_ps(wt115, preMul89);
__m512 preMul90 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(9+16*c4+1709*i7))[0]);
__m512 preAdd90 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(9+16*c4+1709*i7))[1]);
sum5 = _mm512_fmadd_ps(wt116, preAdd90, sum5);
wt116 = _mm512_mul_ps(wt116, preMul90);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(1+16*c4)+(ptrdiff_t)0, 63>>cut4, wt107);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(2+16*c4)+(ptrdiff_t)0, 63>>cut4, wt108);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(3+16*c4)+(ptrdiff_t)0, 63>>cut4, wt109);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(4+16*c4)+(ptrdiff_t)0, 63>>cut4, wt110);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(5+16*c4)+(ptrdiff_t)0, 63>>cut4, wt111);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(6+16*c4)+(ptrdiff_t)0, 63>>cut4, wt112);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(7+16*c4)+(ptrdiff_t)0, 63>>cut4, wt113);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(8+16*c4)+(ptrdiff_t)0, 63>>cut4, wt114);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(9+16*c4)+(ptrdiff_t)0, 63>>cut4, wt115);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(10+16*c4)+(ptrdiff_t)0, 63>>cut4, wt116);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(1+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt107);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(2+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt108);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(3+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt109);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(4+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt110);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(5+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt111);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(6+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt112);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(7+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt113);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(8+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt114);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(9+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt115);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(10+16*c4)+(ptrdiff_t)20976, 4032>>cut4, wt116);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(1+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt107);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(2+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt108);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(3+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt109);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(4+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt110);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(5+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt111);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(6+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt112);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(7+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt113);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(8+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt114);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(9+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt115);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*(10+16*c4)+(ptrdiff_t)41952, 65535-(4095>>cut4), wt116);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*0+(ptrdiff_t)0, 63>>cut4, sum5);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*0+(ptrdiff_t)20976, 4032>>cut4, sum5);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l4+4*cut4+24*0+(ptrdiff_t)41952, 65535-(4095>>cut4), sum5);
} else {
ptrdiff_t k3 = 16;
ptrdiff_t l3 = (size_t)(0+k3)/6;
ptrdiff_t cut3 = (size_t)(0+k3)%6;
__m512 sum4 = _mm512_setzero_ps();
__m512i pmMul4 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd4 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo4 = _mm512_loadu_ps(bnPtr4+(ptrdiff_t)8*(k3+29*i7));
__m512 masHi4 = _mm512_maskz_loadu_ps(1023, bnPtr4+(ptrdiff_t)8*(k3+29*i7)+(ptrdiff_t)64);
__m512 postMul3 = _mm512_permutex2var_ps(masLo4, pmMul4, masHi4);
__m512 postAdd3 = _mm512_permutex2var_ps(masLo4, pmAdd4, masHi4);
(void)postAdd3;
ptrdiff_t c3 = 0;
for (; c3 != 54; ++c3) {
__m512 wt62 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)0);
__m512 wt63 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)6836);
__m512 wt64 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)13672);
__m512 wt65 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)20508);
__m512 wt66 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)27344);
__m512 wt67 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)34180);
__m512 wt68 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)41016);
__m512 wt69 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)47852);
__m512 wt70 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)54688);
__m512 wt71 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)61524);
__m512 wt72 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)68360);
__m512 wt73 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)75196);
__m512 wt74 = _mm512_maskz_loadu_ps(65535, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)82032);
__m512 tmp253 = _mm512_unpacklo_ps(wt62, wt63);
__m512 tmp254 = _mm512_unpackhi_ps(wt62, wt63);
__m512 tmp255 = _mm512_unpacklo_ps(wt64, wt65);
__m512 tmp256 = _mm512_unpackhi_ps(wt64, wt65);
__m512 tmp257 = _mm512_unpacklo_ps(wt66, wt67);
__m512 tmp258 = _mm512_unpackhi_ps(wt66, wt67);
__m512 tmp259 = _mm512_unpacklo_ps(wt68, wt69);
__m512 tmp260 = _mm512_unpackhi_ps(wt68, wt69);
__m512 tmp261 = _mm512_unpacklo_ps(wt70, wt71);
__m512 tmp262 = _mm512_unpackhi_ps(wt70, wt71);
__m512 tmp263 = _mm512_unpacklo_ps(wt72, wt73);
__m512 tmp264 = _mm512_unpackhi_ps(wt72, wt73);
__m512 tmp265 = _mm512_unpacklo_ps(wt74, wt74);
__m512 tmp266 = _mm512_unpackhi_ps(wt74, wt74);
__m512 tmp267 = _mm512_shuffle_ps(tmp253, tmp255, 68);
__m512 tmp268 = _mm512_shuffle_ps(tmp253, tmp255, 238);
__m512 tmp269 = _mm512_shuffle_ps(tmp254, tmp256, 68);
__m512 tmp270 = _mm512_shuffle_ps(tmp254, tmp256, 238);
__m512 tmp271 = _mm512_shuffle_ps(tmp257, tmp259, 68);
__m512 tmp272 = _mm512_shuffle_ps(tmp257, tmp259, 238);
__m512 tmp273 = _mm512_shuffle_ps(tmp258, tmp260, 68);
__m512 tmp274 = _mm512_shuffle_ps(tmp258, tmp260, 238);
__m512 tmp275 = _mm512_shuffle_ps(tmp261, tmp263, 68);
__m512 tmp276 = _mm512_shuffle_ps(tmp261, tmp263, 238);
__m512 tmp277 = _mm512_shuffle_ps(tmp262, tmp264, 68);
__m512 tmp278 = _mm512_shuffle_ps(tmp262, tmp264, 238);
__m512 tmp279 = _mm512_shuffle_ps(tmp265, tmp265, 238);
__m512 tmp280 = _mm512_shuffle_ps(tmp266, tmp266, 238);
__m512 tmp281 = _mm512_shuffle_f32x4(tmp267, tmp271, 136);
__m512 tmp282 = _mm512_shuffle_f32x4(tmp267, tmp271, 221);
__m512 tmp283 = _mm512_shuffle_f32x4(tmp268, tmp272, 136);
__m512 tmp284 = _mm512_shuffle_f32x4(tmp268, tmp272, 221);
__m512 tmp285 = _mm512_shuffle_f32x4(tmp269, tmp273, 136);
__m512 tmp286 = _mm512_shuffle_f32x4(tmp269, tmp273, 221);
__m512 tmp287 = _mm512_shuffle_f32x4(tmp270, tmp274, 136);
__m512 tmp288 = _mm512_shuffle_f32x4(tmp270, tmp274, 221);
__m512 tmp289 = _mm512_shuffle_f32x4(tmp275, tmp265, 136);
__m512 tmp290 = _mm512_shuffle_f32x4(tmp275, tmp265, 221);
__m512 tmp291 = _mm512_shuffle_f32x4(tmp276, tmp279, 136);
__m512 tmp292 = _mm512_shuffle_f32x4(tmp276, tmp279, 221);
__m512 tmp293 = _mm512_shuffle_f32x4(tmp277, tmp266, 136);
__m512 tmp294 = _mm512_shuffle_f32x4(tmp277, tmp266, 221);
__m512 tmp295 = _mm512_shuffle_f32x4(tmp278, tmp280, 136);
__m512 tmp296 = _mm512_shuffle_f32x4(tmp278, tmp280, 221);
wt62 = _mm512_shuffle_f32x4(tmp281, tmp289, 136);
wt70 = _mm512_shuffle_f32x4(tmp281, tmp289, 221);
wt63 = _mm512_shuffle_f32x4(tmp283, tmp291, 136);
wt71 = _mm512_shuffle_f32x4(tmp283, tmp291, 221);
wt64 = _mm512_shuffle_f32x4(tmp285, tmp293, 136);
wt72 = _mm512_shuffle_f32x4(tmp285, tmp293, 221);
wt65 = _mm512_shuffle_f32x4(tmp287, tmp295, 136);
wt73 = _mm512_shuffle_f32x4(tmp287, tmp295, 221);
wt66 = _mm512_shuffle_f32x4(tmp282, tmp290, 136);
wt74 = _mm512_shuffle_f32x4(tmp282, tmp290, 221);
wt67 = _mm512_shuffle_f32x4(tmp284, tmp292, 136);
__m512 wt75 = _mm512_shuffle_f32x4(tmp284, tmp292, 221);
wt68 = _mm512_shuffle_f32x4(tmp286, tmp294, 136);
__m512 wt76 = _mm512_shuffle_f32x4(tmp286, tmp294, 221);
wt69 = _mm512_shuffle_f32x4(tmp288, tmp296, 136);
__m512 wt77 = _mm512_shuffle_f32x4(tmp288, tmp296, 221);
wt62 = _mm512_mul_ps(wt62, postMul3);
wt63 = _mm512_mul_ps(wt63, postMul3);
wt64 = _mm512_mul_ps(wt64, postMul3);
wt65 = _mm512_mul_ps(wt65, postMul3);
wt66 = _mm512_mul_ps(wt66, postMul3);
wt67 = _mm512_mul_ps(wt67, postMul3);
wt68 = _mm512_mul_ps(wt68, postMul3);
wt69 = _mm512_mul_ps(wt69, postMul3);
wt70 = _mm512_mul_ps(wt70, postMul3);
wt71 = _mm512_mul_ps(wt71, postMul3);
wt72 = _mm512_mul_ps(wt72, postMul3);
wt73 = _mm512_mul_ps(wt73, postMul3);
wt74 = _mm512_mul_ps(wt74, postMul3);
wt75 = _mm512_mul_ps(wt75, postMul3);
wt76 = _mm512_mul_ps(wt76, postMul3);
wt77 = _mm512_mul_ps(wt77, postMul3);
__m512 preMul39 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(0+16*c3+1709*i7))[0]);
__m512 preAdd39 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(0+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt62, preAdd39, sum4);
wt62 = _mm512_mul_ps(wt62, preMul39);
__m512 preMul40 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(1+16*c3+1709*i7))[0]);
__m512 preAdd40 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(1+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt63, preAdd40, sum4);
wt63 = _mm512_mul_ps(wt63, preMul40);
__m512 preMul41 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(2+16*c3+1709*i7))[0]);
__m512 preAdd41 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(2+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt64, preAdd41, sum4);
wt64 = _mm512_mul_ps(wt64, preMul41);
__m512 preMul42 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(3+16*c3+1709*i7))[0]);
__m512 preAdd42 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(3+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt65, preAdd42, sum4);
wt65 = _mm512_mul_ps(wt65, preMul42);
__m512 preMul43 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(4+16*c3+1709*i7))[0]);
__m512 preAdd43 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(4+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt66, preAdd43, sum4);
wt66 = _mm512_mul_ps(wt66, preMul43);
__m512 preMul44 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(5+16*c3+1709*i7))[0]);
__m512 preAdd44 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(5+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt67, preAdd44, sum4);
wt67 = _mm512_mul_ps(wt67, preMul44);
__m512 preMul45 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(6+16*c3+1709*i7))[0]);
__m512 preAdd45 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(6+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt68, preAdd45, sum4);
wt68 = _mm512_mul_ps(wt68, preMul45);
__m512 preMul46 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(7+16*c3+1709*i7))[0]);
__m512 preAdd46 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(7+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt69, preAdd46, sum4);
wt69 = _mm512_mul_ps(wt69, preMul46);
__m512 preMul47 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(8+16*c3+1709*i7))[0]);
__m512 preAdd47 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(8+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt70, preAdd47, sum4);
wt70 = _mm512_mul_ps(wt70, preMul47);
__m512 preMul48 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(9+16*c3+1709*i7))[0]);
__m512 preAdd48 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(9+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt71, preAdd48, sum4);
wt71 = _mm512_mul_ps(wt71, preMul48);
__m512 preMul49 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(10+16*c3+1709*i7))[0]);
__m512 preAdd49 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(10+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt72, preAdd49, sum4);
wt72 = _mm512_mul_ps(wt72, preMul49);
__m512 preMul50 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(11+16*c3+1709*i7))[0]);
__m512 preAdd50 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(11+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt73, preAdd50, sum4);
wt73 = _mm512_mul_ps(wt73, preMul50);
__m512 preMul51 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(12+16*c3+1709*i7))[0]);
__m512 preAdd51 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(12+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt74, preAdd51, sum4);
wt74 = _mm512_mul_ps(wt74, preMul51);
__m512 preMul52 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(13+16*c3+1709*i7))[0]);
__m512 preAdd52 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(13+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt75, preAdd52, sum4);
wt75 = _mm512_mul_ps(wt75, preMul52);
__m512 preMul53 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(14+16*c3+1709*i7))[0]);
__m512 preAdd53 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(14+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt76, preAdd53, sum4);
wt76 = _mm512_mul_ps(wt76, preMul53);
__m512 preMul54 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(15+16*c3+1709*i7))[0]);
__m512 preAdd54 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(15+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt77, preAdd54, sum4);
wt77 = _mm512_mul_ps(wt77, preMul54);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(1+16*c3)+(ptrdiff_t)0, 63>>cut3, wt62);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(2+16*c3)+(ptrdiff_t)0, 63>>cut3, wt63);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(3+16*c3)+(ptrdiff_t)0, 63>>cut3, wt64);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(4+16*c3)+(ptrdiff_t)0, 63>>cut3, wt65);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(5+16*c3)+(ptrdiff_t)0, 63>>cut3, wt66);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(6+16*c3)+(ptrdiff_t)0, 63>>cut3, wt67);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(7+16*c3)+(ptrdiff_t)0, 63>>cut3, wt68);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(8+16*c3)+(ptrdiff_t)0, 63>>cut3, wt69);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(9+16*c3)+(ptrdiff_t)0, 63>>cut3, wt70);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(10+16*c3)+(ptrdiff_t)0, 63>>cut3, wt71);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(11+16*c3)+(ptrdiff_t)0, 63>>cut3, wt72);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(12+16*c3)+(ptrdiff_t)0, 63>>cut3, wt73);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(13+16*c3)+(ptrdiff_t)0, 63>>cut3, wt74);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(14+16*c3)+(ptrdiff_t)0, 63>>cut3, wt75);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(15+16*c3)+(ptrdiff_t)0, 63>>cut3, wt76);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(16+16*c3)+(ptrdiff_t)0, 63>>cut3, wt77);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(1+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt62);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(2+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt63);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(3+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt64);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(4+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt65);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(5+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt66);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(6+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt67);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(7+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt68);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(8+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt69);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(9+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt70);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(10+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt71);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(11+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt72);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(12+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt73);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(13+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt74);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(14+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt75);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(15+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt76);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(16+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt77);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(1+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt62);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(2+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt63);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(3+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt64);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(4+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt65);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(5+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt66);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(6+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt67);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(7+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt68);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(8+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt69);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(9+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt70);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(10+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt71);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(11+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt72);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(12+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt73);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(13+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt74);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(14+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt75);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(15+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt76);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(16+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt77);
}
__m512 wt78 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)0);
__m512 wt79 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)6836);
__m512 wt80 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)13672);
__m512 wt81 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)20508);
__m512 wt82 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)27344);
__m512 wt83 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)34180);
__m512 wt84 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)41016);
__m512 wt85 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)47852);
__m512 wt86 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)54688);
__m512 wt87 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)61524);
__m512 wt88 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)68360);
__m512 wt89 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)75196);
__m512 wt90 = _mm512_maskz_loadu_ps(1023, wtPtr2+198244*i7+6836*k3+64*c3+(ptrdiff_t)82032);
__m512 tmp297 = _mm512_unpacklo_ps(wt78, wt79);
__m512 tmp298 = _mm512_unpackhi_ps(wt78, wt79);
__m512 tmp299 = _mm512_unpacklo_ps(wt80, wt81);
__m512 tmp300 = _mm512_unpackhi_ps(wt80, wt81);
__m512 tmp301 = _mm512_unpacklo_ps(wt82, wt83);
__m512 tmp302 = _mm512_unpackhi_ps(wt82, wt83);
__m512 tmp303 = _mm512_unpacklo_ps(wt84, wt85);
__m512 tmp304 = _mm512_unpackhi_ps(wt84, wt85);
__m512 tmp305 = _mm512_unpacklo_ps(wt86, wt87);
__m512 tmp306 = _mm512_unpackhi_ps(wt86, wt87);
__m512 tmp307 = _mm512_unpacklo_ps(wt88, wt89);
__m512 tmp308 = _mm512_unpackhi_ps(wt88, wt89);
__m512 tmp309 = _mm512_unpacklo_ps(wt90, wt90);
__m512 tmp310 = _mm512_unpackhi_ps(wt90, wt90);
__m512 tmp311 = _mm512_shuffle_ps(tmp297, tmp299, 68);
__m512 tmp312 = _mm512_shuffle_ps(tmp297, tmp299, 238);
__m512 tmp313 = _mm512_shuffle_ps(tmp298, tmp300, 68);
__m512 tmp314 = _mm512_shuffle_ps(tmp298, tmp300, 238);
__m512 tmp315 = _mm512_shuffle_ps(tmp301, tmp303, 68);
__m512 tmp316 = _mm512_shuffle_ps(tmp301, tmp303, 238);
__m512 tmp317 = _mm512_shuffle_ps(tmp302, tmp304, 68);
__m512 tmp318 = _mm512_shuffle_ps(tmp302, tmp304, 238);
__m512 tmp319 = _mm512_shuffle_ps(tmp305, tmp307, 68);
__m512 tmp320 = _mm512_shuffle_ps(tmp305, tmp307, 238);
__m512 tmp321 = _mm512_shuffle_ps(tmp306, tmp308, 68);
__m512 tmp322 = _mm512_shuffle_ps(tmp306, tmp308, 238);
__m512 tmp323 = _mm512_shuffle_ps(tmp309, tmp309, 238);
__m512 tmp324 = _mm512_shuffle_ps(tmp310, tmp310, 238);
__m512 tmp325 = _mm512_shuffle_f32x4(tmp311, tmp315, 136);
__m512 tmp326 = _mm512_shuffle_f32x4(tmp311, tmp315, 221);
__m512 tmp327 = _mm512_shuffle_f32x4(tmp312, tmp316, 136);
__m512 tmp328 = _mm512_shuffle_f32x4(tmp312, tmp316, 221);
__m512 tmp329 = _mm512_shuffle_f32x4(tmp313, tmp317, 136);
__m512 tmp330 = _mm512_shuffle_f32x4(tmp313, tmp317, 221);
__m512 tmp331 = _mm512_shuffle_f32x4(tmp314, tmp318, 136);
__m512 tmp332 = _mm512_shuffle_f32x4(tmp314, tmp318, 221);
__m512 tmp333 = _mm512_shuffle_f32x4(tmp319, tmp309, 136);
__m512 tmp334 = _mm512_shuffle_f32x4(tmp319, tmp309, 221);
__m512 tmp335 = _mm512_shuffle_f32x4(tmp320, tmp323, 136);
__m512 tmp336 = _mm512_shuffle_f32x4(tmp320, tmp323, 221);
__m512 tmp337 = _mm512_shuffle_f32x4(tmp321, tmp310, 136);
__m512 tmp338 = _mm512_shuffle_f32x4(tmp321, tmp310, 221);
__m512 tmp339 = _mm512_shuffle_f32x4(tmp322, tmp324, 136);
__m512 tmp340 = _mm512_shuffle_f32x4(tmp322, tmp324, 221);
wt78 = _mm512_shuffle_f32x4(tmp325, tmp333, 136);
wt86 = _mm512_shuffle_f32x4(tmp325, tmp333, 221);
wt79 = _mm512_shuffle_f32x4(tmp327, tmp335, 136);
wt87 = _mm512_shuffle_f32x4(tmp327, tmp335, 221);
wt80 = _mm512_shuffle_f32x4(tmp329, tmp337, 136);
wt81 = _mm512_shuffle_f32x4(tmp331, tmp339, 136);
wt82 = _mm512_shuffle_f32x4(tmp326, tmp334, 136);
wt83 = _mm512_shuffle_f32x4(tmp328, tmp336, 136);
wt84 = _mm512_shuffle_f32x4(tmp330, tmp338, 136);
wt85 = _mm512_shuffle_f32x4(tmp332, tmp340, 136);
wt78 = _mm512_mul_ps(wt78, postMul3);
wt79 = _mm512_mul_ps(wt79, postMul3);
wt80 = _mm512_mul_ps(wt80, postMul3);
wt81 = _mm512_mul_ps(wt81, postMul3);
wt82 = _mm512_mul_ps(wt82, postMul3);
wt83 = _mm512_mul_ps(wt83, postMul3);
wt84 = _mm512_mul_ps(wt84, postMul3);
wt85 = _mm512_mul_ps(wt85, postMul3);
wt86 = _mm512_mul_ps(wt86, postMul3);
wt87 = _mm512_mul_ps(wt87, postMul3);
__m512 preMul55 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(0+16*c3+1709*i7))[0]);
__m512 preAdd55 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(0+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt78, preAdd55, sum4);
wt78 = _mm512_mul_ps(wt78, preMul55);
__m512 preMul56 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(1+16*c3+1709*i7))[0]);
__m512 preAdd56 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(1+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt79, preAdd56, sum4);
wt79 = _mm512_mul_ps(wt79, preMul56);
__m512 preMul57 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(2+16*c3+1709*i7))[0]);
__m512 preAdd57 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(2+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt80, preAdd57, sum4);
wt80 = _mm512_mul_ps(wt80, preMul57);
__m512 preMul58 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(3+16*c3+1709*i7))[0]);
__m512 preAdd58 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(3+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt81, preAdd58, sum4);
wt81 = _mm512_mul_ps(wt81, preMul58);
__m512 preMul59 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(4+16*c3+1709*i7))[0]);
__m512 preAdd59 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(4+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt82, preAdd59, sum4);
wt82 = _mm512_mul_ps(wt82, preMul59);
__m512 preMul60 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(5+16*c3+1709*i7))[0]);
__m512 preAdd60 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(5+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt83, preAdd60, sum4);
wt83 = _mm512_mul_ps(wt83, preMul60);
__m512 preMul61 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(6+16*c3+1709*i7))[0]);
__m512 preAdd61 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(6+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt84, preAdd61, sum4);
wt84 = _mm512_mul_ps(wt84, preMul61);
__m512 preMul62 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(7+16*c3+1709*i7))[0]);
__m512 preAdd62 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(7+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt85, preAdd62, sum4);
wt85 = _mm512_mul_ps(wt85, preMul62);
__m512 preMul63 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(8+16*c3+1709*i7))[0]);
__m512 preAdd63 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(8+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt86, preAdd63, sum4);
wt86 = _mm512_mul_ps(wt86, preMul63);
__m512 preMul64 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(9+16*c3+1709*i7))[0]);
__m512 preAdd64 = _mm512_set1_ps(((float*)bnPtr3+(ptrdiff_t)2*(9+16*c3+1709*i7))[1]);
sum4 = _mm512_fmadd_ps(wt87, preAdd64, sum4);
wt87 = _mm512_mul_ps(wt87, preMul64);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(1+16*c3)+(ptrdiff_t)0, 63>>cut3, wt78);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(2+16*c3)+(ptrdiff_t)0, 63>>cut3, wt79);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(3+16*c3)+(ptrdiff_t)0, 63>>cut3, wt80);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(4+16*c3)+(ptrdiff_t)0, 63>>cut3, wt81);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(5+16*c3)+(ptrdiff_t)0, 63>>cut3, wt82);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(6+16*c3)+(ptrdiff_t)0, 63>>cut3, wt83);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(7+16*c3)+(ptrdiff_t)0, 63>>cut3, wt84);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(8+16*c3)+(ptrdiff_t)0, 63>>cut3, wt85);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(9+16*c3)+(ptrdiff_t)0, 63>>cut3, wt86);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(10+16*c3)+(ptrdiff_t)0, 63>>cut3, wt87);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(1+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt78);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(2+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt79);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(3+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt80);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(4+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt81);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(5+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt82);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(6+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt83);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(7+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt84);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(8+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt85);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(9+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt86);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*(10+16*c3)+(ptrdiff_t)20976, 4032>>cut3, wt87);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(1+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt78);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(2+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt79);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(3+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt80);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(4+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt81);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(5+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt82);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(6+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt83);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(7+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt84);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(8+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt85);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(9+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt86);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*(10+16*c3)+(ptrdiff_t)41952, 8191-(4095>>cut3), wt87);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*0+(ptrdiff_t)0, 63>>cut3, sum4);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+24*0+(ptrdiff_t)20976, 4032>>cut3, sum4);
_mm512_mask_storeu_ps(arranged2+101500*i7+21000*l3+4*cut3+20*0+(ptrdiff_t)41952, 8191-(4095>>cut3), sum4);
}
}
}
}

// Dispatches the weight-arranging callee across the thread team.
//
// Builds a task descriptor that points at Example27OneArrangeWts1Callee1,
// passes `tensors1` through as the opaque task payload, declares a
// 3-dimensional iteration space with extents {2, 1, 2}, and hands the task
// to Example27ThreaderDo1 on `team13`.
//
// NOTE(review): the meaning of each hull dimension is decided by the callee;
// presumably (output-channel block, group, input-channel split) -- confirm
// against Example27OneArrangeWts1Callee1.
static void Example27OneArrangeWts1(Example27ThreaderTeam1* team13, char** tensors1) {
	// Per-dimension extents of the iteration space, in hull order.
	static const int64_t extents[3] = {2, 1, 2};
	Example27ThreaderTask1 job;
	job.nd1 = 3;
	for (ptrdiff_t dim = 0; dim < 3; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	job.any1 = tensors1;
	job.callee1 = Example27OneArrangeWts1Callee1;
	Example27ThreaderDo1(team13, &job);
}

// Worker for the data-arrangement pass. One call handles one
// (channel slice, column tile, channel half) combination:
//
//   pt8[0]  channel slice inside the selected half
//   pt8[1]  column tile: any value other than 6 is a full tile of four
//           16-float vectors; 6 is the 15-float tail (6*64+15 = 399 floats,
//           i.e. one 1596-byte source row)
//   pt8[3]  channel half: values below 1 select the first 835 channels,
//           anything else selects the remaining 874
//
// task6->any1 is read as an array of buffers:
//   [0] first source tensor      [1] table of (multiplier, addend) float pairs
//   [2] second source tensor     [3] arranged destination
//
// For every channel in the slice the code computes
//   max(0, a*mul + add) + b
// and writes it to the destination, where channels of one tile are packed
// back to back (256 bytes apart for full tiles, 64 bytes apart for the tail).
// NOTE(review): this looks like the graph's bn1 -> act1 (ReLU) -> add1 chain
// fused into the repack; confirm against the generator.
static void Example27OneArrangeDats1Callee1(Example27ThreaderTask1* task6, int64_t* pt8) {
    char** tensors4 = task6->any1;
    const ptrdiff_t slice = pt8[0];
    const ptrdiff_t tile = pt8[1];
    const ptrdiff_t half = pt8[3];

    // Per-half constants. The second half starts 835 channels in, uses a
    // different slice size, and has its own destination region and stride.
    const int lower = half < 1;
    const ptrdiff_t chanSkip = lower ? 0 : 835;
    const ptrdiff_t perSlice = lower ? 139 : 145;
    const ptrdiff_t lastSlice = lower ? 140 : 149;
    const ptrdiff_t tileBytes = lower ? 213760 : 223744;
    const ptrdiff_t dstSkip = lower ? 0 : 1336000;

    char*restrict srcA = tensors4[0]+1596*chanSkip;
    char*restrict bnTab = tensors4[1]+8*chanSkip;
    char*restrict srcB = tensors4[2]+1596*chanSkip;
    char*restrict dst = tensors4[3]+dstSkip;

    // Slices 0..4 take `perSlice` channels; any later slice takes `lastSlice`.
    const ptrdiff_t chanBegin = perSlice*slice;
    const ptrdiff_t chanEnd = chanBegin+(slice < 5 ? perSlice : lastSlice);
    const __m512 zero = _mm512_setzero_ps();

    if (tile != 6) {
        // Full tile: four complete vectors per channel.
        for (ptrdiff_t ch = chanBegin; ch < chanEnd; ++ch) {
            const char* inA = srcA+256*tile+1596*ch;
            const char* inB = srcB+256*tile+1596*ch;
            char* out = dst+tileBytes*tile+256*ch;
            const float* bnPair = (const float*)bnTab+2*ch;

            __m512 v0 = _mm512_maskz_loadu_ps(65535, inA);
            __m512 v1 = _mm512_maskz_loadu_ps(65535, inA+64);
            __m512 v2 = _mm512_maskz_loadu_ps(65535, inA+128);
            __m512 v3 = _mm512_maskz_loadu_ps(65535, inA+192);

            const __m512 scale = _mm512_set1_ps(bnPair[0]);
            const __m512 shift = _mm512_set1_ps(bnPair[1]);

            // Zero stays the FIRST operand of max so NaN/signed-zero results
            // match the original instruction operand order.
            v0 = _mm512_max_ps(zero, _mm512_fmadd_ps(v0, scale, shift));
            v1 = _mm512_max_ps(zero, _mm512_fmadd_ps(v1, scale, shift));
            v2 = _mm512_max_ps(zero, _mm512_fmadd_ps(v2, scale, shift));
            v3 = _mm512_max_ps(zero, _mm512_fmadd_ps(v3, scale, shift));

            v0 = _mm512_add_ps(v0, _mm512_maskz_loadu_ps(65535, inB));
            v1 = _mm512_add_ps(v1, _mm512_maskz_loadu_ps(65535, inB+64));
            v2 = _mm512_add_ps(v2, _mm512_maskz_loadu_ps(65535, inB+128));
            v3 = _mm512_add_ps(v3, _mm512_maskz_loadu_ps(65535, inB+192));

            _mm512_mask_storeu_ps(out, 65535, v0);
            _mm512_mask_storeu_ps(out+64, 65535, v1);
            _mm512_mask_storeu_ps(out+128, 65535, v2);
            _mm512_mask_storeu_ps(out+192, 65535, v3);
        }
        return;
    }

    // Tail tile: a single vector with only the low 15 lanes active.
    for (ptrdiff_t ch = chanBegin; ch < chanEnd; ++ch) {
        const float* bnPair = (const float*)bnTab+2*ch;
        __m512 v = _mm512_maskz_loadu_ps(32767, srcA+256*tile+1596*ch);
        const __m512 scale = _mm512_set1_ps(bnPair[0]);
        const __m512 shift = _mm512_set1_ps(bnPair[1]);
        v = _mm512_fmadd_ps(v, scale, shift);
        v = _mm512_max_ps(zero, v);
        v = _mm512_add_ps(v, _mm512_maskz_loadu_ps(32767, srcB+256*tile+1596*ch));
        _mm512_mask_storeu_ps(dst+tileBytes*tile+64*ch, 32767, v);
    }
}

// Dispatches the data-arrangement pass.
//
// Fills a task descriptor that points at Example27OneArrangeDats1Callee1,
// carries `tensors3` as the opaque payload, and declares a 4-dimensional
// iteration hull of extents 6 x 7 x 1 x 2, then hands it to
// Example27ThreaderDo1 together with `team15`. The callee reads coordinates
// 0, 1 and 3 of each point; coordinate 2 has extent 1 and is not read there.
static void Example27OneArrangeDats1(Example27ThreaderTeam1* team15, char** tensors3) {
    static const int extents[4] = {6, 7, 1, 2};
    Example27ThreaderTask1 work;
    work.any1 = tensors3;
    work.callee1 = Example27OneArrangeDats1Callee1;
    work.nd1 = 4;
    for (int axis = 0; axis < 4; ++axis) {
        work.hull1[axis] = extents[axis];
    }
    Example27ThreaderDo1(team15, &work);
}

// Multiply-accumulate worker for the first channel partition (e3 == 0).
//
// Each call produces exactly ONE output tile, chosen by the task point:
//   pt9[0] (w1)  output-row group: 0..3 -> six rows, 4 -> the last five rows
//   pt9[1] (d1)  column tile:      0..5 -> 64 floats (four vectors),
//                                  6    -> the 15-float tail (mask 32767)
// Every loop below is entered with its end marker equal to its start value
// (jj5/kk5/kk6), so the `if (x >= xx) return;` checks leave the function
// right after the first tile has been stored.
//
// Buffers, taken from ((void**)task8->any1)[0] viewed as char**:
//   [0] arranged weights   [1] arranged data   [4] destination
//
// Weight layout as used here: one block of 20064 bytes per six-row group.
// The block starts with one float per output row (read via s == -1, which
// makes the byte offset 24*s+24 equal 0), followed by 835 groups of six
// floats. The five-row group uses a 20-byte step instead of 24.
// NOTE(review): the leading per-row floats look like biases -- confirm
// against the weight-arrangement code.
//
// Output rows are 1596 bytes apart (21*19 floats, the Height*Width from the
// graph config); the accumulated sums are stored as-is, with no activation
// or normalization applied in this function.
static void Example27OneApply1Callee1(Example27ThreaderTask1* task8, int64_t* pt9) {
void** pair2 = task8->any1;
char** tensors6 = pair2[0];
// Partition and group indices are constants in this callee.
ptrdiff_t e3 = 0;
ptrdiff_t g2 = 0;
ptrdiff_t d1 = pt9[1];
ptrdiff_t w1 = pt9[0];
char*restrict arrangedWts1 = tensors6[0]+96976*e3+(ptrdiff_t)96976*1*g2;
char*restrict arrangedDats1 = tensors6[1]+1336000*e3+(ptrdiff_t)1336000*1*g2;
char*restrict datPtr5 = tensors6[4]+(ptrdiff_t)46284*1*g2;
ptrdiff_t ii5 = 1;
for (ptrdiff_t i10 = 0; i10 < ii5; ++i10) {
ptrdiff_t j5 = 1*d1;
ptrdiff_t jj5 = j5+0;
// ---- Full-width column tiles (d1 != 6): four vectors per output row ----
for (; j5 != 6; ++j5) {
ptrdiff_t k9 = 1*w1;
ptrdiff_t kk5 = k9+0;
// -- Six-row groups (w1 != 4): 6 rows x 4 vectors = 24 accumulators --
for (; k9 != 4; ++k9) {
// s2 == -1 addresses the start of the weight block: one seed value per row.
ptrdiff_t s2 = -1;
__m512 sum6 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+24*s2+(ptrdiff_t)24));
__m512 sum10 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+24*s2+(ptrdiff_t)28));
__m512 sum14 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+24*s2+(ptrdiff_t)32));
__m512 sum18 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+24*s2+(ptrdiff_t)36));
__m512 sum22 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+24*s2+(ptrdiff_t)40));
__m512 sum26 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+24*s2+(ptrdiff_t)44));
// The other three column vectors of each row start from the same seed.
__m512 sum7 = sum6;
__m512 sum8 = sum6;
__m512 sum9 = sum6;
__m512 sum11 = sum10;
__m512 sum12 = sum10;
__m512 sum13 = sum10;
__m512 sum15 = sum14;
__m512 sum16 = sum14;
__m512 sum17 = sum14;
__m512 sum19 = sum18;
__m512 sum20 = sum18;
__m512 sum21 = sum18;
__m512 sum23 = sum22;
__m512 sum24 = sum22;
__m512 sum25 = sum22;
__m512 sum27 = sum26;
__m512 sum28 = sum26;
__m512 sum29 = sum26;
// Reduction over the 835 entries of this partition: each step loads four
// data vectors once and reuses them against six broadcast weights.
for (s2 = 0; s2 < 835; ++s2) {
__m512 dat21 = _mm512_loadu_ps(arrangedDats1+1336000*i10+213760*j5+256*s2+(ptrdiff_t)0);
__m512 dat22 = _mm512_loadu_ps(arrangedDats1+1336000*i10+213760*j5+256*s2+(ptrdiff_t)64);
__m512 dat23 = _mm512_loadu_ps(arrangedDats1+1336000*i10+213760*j5+256*s2+(ptrdiff_t)128);
__m512 dat24 = _mm512_loadu_ps(arrangedDats1+1336000*i10+213760*j5+256*s2+(ptrdiff_t)192);
__m512 wt123 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+24*s2+(ptrdiff_t)24));
sum6 = _mm512_fmadd_ps(wt123, dat21, sum6);
sum7 = _mm512_fmadd_ps(wt123, dat22, sum7);
sum8 = _mm512_fmadd_ps(wt123, dat23, sum8);
sum9 = _mm512_fmadd_ps(wt123, dat24, sum9);
__m512 wt124 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+24*s2+(ptrdiff_t)28));
sum10 = _mm512_fmadd_ps(wt124, dat21, sum10);
sum11 = _mm512_fmadd_ps(wt124, dat22, sum11);
sum12 = _mm512_fmadd_ps(wt124, dat23, sum12);
sum13 = _mm512_fmadd_ps(wt124, dat24, sum13);
__m512 wt125 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+24*s2+(ptrdiff_t)32));
sum14 = _mm512_fmadd_ps(wt125, dat21, sum14);
sum15 = _mm512_fmadd_ps(wt125, dat22, sum15);
sum16 = _mm512_fmadd_ps(wt125, dat23, sum16);
sum17 = _mm512_fmadd_ps(wt125, dat24, sum17);
__m512 wt126 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+24*s2+(ptrdiff_t)36));
sum18 = _mm512_fmadd_ps(wt126, dat21, sum18);
sum19 = _mm512_fmadd_ps(wt126, dat22, sum19);
sum20 = _mm512_fmadd_ps(wt126, dat23, sum20);
sum21 = _mm512_fmadd_ps(wt126, dat24, sum21);
__m512 wt127 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+24*s2+(ptrdiff_t)40));
sum22 = _mm512_fmadd_ps(wt127, dat21, sum22);
sum23 = _mm512_fmadd_ps(wt127, dat22, sum23);
sum24 = _mm512_fmadd_ps(wt127, dat23, sum24);
sum25 = _mm512_fmadd_ps(wt127, dat24, sum25);
__m512 wt128 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+24*s2+(ptrdiff_t)44));
sum26 = _mm512_fmadd_ps(wt128, dat21, sum26);
sum27 = _mm512_fmadd_ps(wt128, dat22, sum27);
sum28 = _mm512_fmadd_ps(wt128, dat23, sum28);
sum29 = _mm512_fmadd_ps(wt128, dat24, sum29);
}
// Store the 6x4 tile: rows 1596 bytes apart, vectors 64 bytes apart, and
// 9576 bytes (six rows) between consecutive row groups.
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)0, 65535, sum6);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)64, 65535, sum7);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)128, 65535, sum8);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)192, 65535, sum9);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)1596, 65535, sum10);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)1660, 65535, sum11);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)1724, 65535, sum12);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)1788, 65535, sum13);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)3192, 65535, sum14);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)3256, 65535, sum15);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)3320, 65535, sum16);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)3384, 65535, sum17);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)4788, 65535, sum18);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)4852, 65535, sum19);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)4916, 65535, sum20);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)4980, 65535, sum21);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)6384, 65535, sum22);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)6448, 65535, sum23);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)6512, 65535, sum24);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)6576, 65535, sum25);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)7980, 65535, sum26);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)8044, 65535, sum27);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)8108, 65535, sum28);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)8172, 65535, sum29);
// kk5 equals the starting k9, so this always fires: one tile per call.
if (k9 >= kk5) return;
}
// -- Final five-row group (reached only when w1 == 4, so k9 == 4 here) --
// Same scheme with five weights per step (20-byte stride) and 20 accumulators.
ptrdiff_t s3 = -1;
__m512 sum30 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+20*s3+(ptrdiff_t)20));
__m512 sum34 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+20*s3+(ptrdiff_t)24));
__m512 sum38 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+20*s3+(ptrdiff_t)28));
__m512 sum42 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+20*s3+(ptrdiff_t)32));
__m512 sum46 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+20*s3+(ptrdiff_t)36));
__m512 sum31 = sum30;
__m512 sum32 = sum30;
__m512 sum33 = sum30;
__m512 sum35 = sum34;
__m512 sum36 = sum34;
__m512 sum37 = sum34;
__m512 sum39 = sum38;
__m512 sum40 = sum38;
__m512 sum41 = sum38;
__m512 sum43 = sum42;
__m512 sum44 = sum42;
__m512 sum45 = sum42;
__m512 sum47 = sum46;
__m512 sum48 = sum46;
__m512 sum49 = sum46;
for (s3 = 0; s3 < 835; ++s3) {
__m512 dat25 = _mm512_loadu_ps(arrangedDats1+1336000*i10+213760*j5+256*s3+(ptrdiff_t)0);
__m512 dat26 = _mm512_loadu_ps(arrangedDats1+1336000*i10+213760*j5+256*s3+(ptrdiff_t)64);
__m512 dat27 = _mm512_loadu_ps(arrangedDats1+1336000*i10+213760*j5+256*s3+(ptrdiff_t)128);
__m512 dat28 = _mm512_loadu_ps(arrangedDats1+1336000*i10+213760*j5+256*s3+(ptrdiff_t)192);
__m512 wt129 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+20*s3+(ptrdiff_t)20));
sum30 = _mm512_fmadd_ps(wt129, dat25, sum30);
sum31 = _mm512_fmadd_ps(wt129, dat26, sum31);
sum32 = _mm512_fmadd_ps(wt129, dat27, sum32);
sum33 = _mm512_fmadd_ps(wt129, dat28, sum33);
__m512 wt130 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+20*s3+(ptrdiff_t)24));
sum34 = _mm512_fmadd_ps(wt130, dat25, sum34);
sum35 = _mm512_fmadd_ps(wt130, dat26, sum35);
sum36 = _mm512_fmadd_ps(wt130, dat27, sum36);
sum37 = _mm512_fmadd_ps(wt130, dat28, sum37);
__m512 wt131 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+20*s3+(ptrdiff_t)28));
sum38 = _mm512_fmadd_ps(wt131, dat25, sum38);
sum39 = _mm512_fmadd_ps(wt131, dat26, sum39);
sum40 = _mm512_fmadd_ps(wt131, dat27, sum40);
sum41 = _mm512_fmadd_ps(wt131, dat28, sum41);
__m512 wt132 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+20*s3+(ptrdiff_t)32));
sum42 = _mm512_fmadd_ps(wt132, dat25, sum42);
sum43 = _mm512_fmadd_ps(wt132, dat26, sum43);
sum44 = _mm512_fmadd_ps(wt132, dat27, sum44);
sum45 = _mm512_fmadd_ps(wt132, dat28, sum45);
__m512 wt133 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k9+20*s3+(ptrdiff_t)36));
sum46 = _mm512_fmadd_ps(wt133, dat25, sum46);
sum47 = _mm512_fmadd_ps(wt133, dat26, sum47);
sum48 = _mm512_fmadd_ps(wt133, dat27, sum48);
sum49 = _mm512_fmadd_ps(wt133, dat28, sum49);
}
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)0, 65535, sum30);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)64, 65535, sum31);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)128, 65535, sum32);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)192, 65535, sum33);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)1596, 65535, sum34);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)1660, 65535, sum35);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)1724, 65535, sum36);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)1788, 65535, sum37);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)3192, 65535, sum38);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)3256, 65535, sum39);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)3320, 65535, sum40);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)3384, 65535, sum41);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)4788, 65535, sum42);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)4852, 65535, sum43);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)4916, 65535, sum44);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)4980, 65535, sum45);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)6384, 65535, sum46);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)6448, 65535, sum47);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)6512, 65535, sum48);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k9+(ptrdiff_t)6576, 65535, sum49);
// jj5 equals the starting j5, so this always fires as well.
if (j5 >= jj5) return;
}
// ---- Tail column tile (reached only when d1 == 6, so j5 == 6 here) ----
// One vector per row; the data is packed 64 bytes per reduction step and
// only the low 15 lanes are written back (mask 32767). The loads are
// unmasked, so lane 15 is computed and then dropped by the masked store.
ptrdiff_t k10 = 1*w1;
ptrdiff_t kk6 = k10+0;
for (; k10 != 4; ++k10) {
// Six-row group, single vector per row.
ptrdiff_t s4 = -1;
__m512 sum50 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+24*s4+(ptrdiff_t)24));
__m512 sum51 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+24*s4+(ptrdiff_t)28));
__m512 sum52 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+24*s4+(ptrdiff_t)32));
__m512 sum53 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+24*s4+(ptrdiff_t)36));
__m512 sum54 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+24*s4+(ptrdiff_t)40));
__m512 sum55 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+24*s4+(ptrdiff_t)44));
for (s4 = 0; s4 < 835; ++s4) {
__m512 dat29 = _mm512_loadu_ps(arrangedDats1+1336000*i10+213760*j5+64*s4+(ptrdiff_t)0);
__m512 wt134 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+24*s4+(ptrdiff_t)24));
sum50 = _mm512_fmadd_ps(wt134, dat29, sum50);
__m512 wt135 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+24*s4+(ptrdiff_t)28));
sum51 = _mm512_fmadd_ps(wt135, dat29, sum51);
__m512 wt136 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+24*s4+(ptrdiff_t)32));
sum52 = _mm512_fmadd_ps(wt136, dat29, sum52);
__m512 wt137 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+24*s4+(ptrdiff_t)36));
sum53 = _mm512_fmadd_ps(wt137, dat29, sum53);
__m512 wt138 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+24*s4+(ptrdiff_t)40));
sum54 = _mm512_fmadd_ps(wt138, dat29, sum54);
__m512 wt139 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+24*s4+(ptrdiff_t)44));
sum55 = _mm512_fmadd_ps(wt139, dat29, sum55);
}
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k10+(ptrdiff_t)0, 32767, sum50);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k10+(ptrdiff_t)1596, 32767, sum51);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k10+(ptrdiff_t)3192, 32767, sum52);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k10+(ptrdiff_t)4788, 32767, sum53);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k10+(ptrdiff_t)6384, 32767, sum54);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k10+(ptrdiff_t)7980, 32767, sum55);
// kk6 equals the starting k10: one tile per call.
if (k10 >= kk6) return;
}
// Corner tile: final five rows of the tail column tile (w1 == 4 and d1 == 6).
ptrdiff_t s5 = -1;
__m512 sum56 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+20*s5+(ptrdiff_t)20));
__m512 sum57 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+20*s5+(ptrdiff_t)24));
__m512 sum58 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+20*s5+(ptrdiff_t)28));
__m512 sum59 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+20*s5+(ptrdiff_t)32));
__m512 sum60 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+20*s5+(ptrdiff_t)36));
for (s5 = 0; s5 < 835; ++s5) {
__m512 dat30 = _mm512_loadu_ps(arrangedDats1+1336000*i10+213760*j5+64*s5+(ptrdiff_t)0);
__m512 wt140 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+20*s5+(ptrdiff_t)20));
sum56 = _mm512_fmadd_ps(wt140, dat30, sum56);
__m512 wt141 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+20*s5+(ptrdiff_t)24));
sum57 = _mm512_fmadd_ps(wt141, dat30, sum57);
__m512 wt142 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+20*s5+(ptrdiff_t)28));
sum58 = _mm512_fmadd_ps(wt142, dat30, sum58);
__m512 wt143 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+20*s5+(ptrdiff_t)32));
sum59 = _mm512_fmadd_ps(wt143, dat30, sum59);
__m512 wt144 = _mm512_set1_ps(*(float*)(arrangedWts1+96976*i10+20064*k10+20*s5+(ptrdiff_t)36));
sum60 = _mm512_fmadd_ps(wt144, dat30, sum60);
}
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k10+(ptrdiff_t)0, 32767, sum56);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k10+(ptrdiff_t)1596, 32767, sum57);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k10+(ptrdiff_t)3192, 32767, sum58);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k10+(ptrdiff_t)4788, 32767, sum59);
_mm512_mask_storeu_ps(datPtr5+46284*i10+256*j5+9576*k10+(ptrdiff_t)6384, 32767, sum60);
}
}

static void Example27OneApply1Callee2(Example27ThreaderTask1* task9, int64_t* pt10) {
void** pair3 = task9->any1;
char** tensors7 = pair3[0];
ptrdiff_t e4 = 1;
ptrdiff_t g3 = 0;
ptrdiff_t d2 = pt10[1];
ptrdiff_t w2 = pt10[0];
char*restrict arrangedWts2 = tensors7[0]+96976*e4+(ptrdiff_t)101500*1*g3;
char*restrict arrangedDats2 = tensors7[1]+1336000*e4+(ptrdiff_t)1398400*1*g3;
char*restrict datPtr6 = tensors7[2]+(ptrdiff_t)46284*1*g3;
char*restrict bnPtr7 = tensors7[3]+(ptrdiff_t)8*29*1*g3;
char*restrict datPtr7 = tensors7[4]+(ptrdiff_t)46284*1*g3;
ptrdiff_t ii6 = 1;
for (ptrdiff_t i11 = 0; i11 < ii6; ++i11) {
ptrdiff_t j6 = 1*d2;
ptrdiff_t jj6 = j6+0;
for (; j6 != 6; ++j6) {
ptrdiff_t k11 = 1*w2;
ptrdiff_t kk7 = k11+0;
for (; k11 != 4; ++k11) {
ptrdiff_t s6 = -1;
__m512 sum61 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+24*s6+(ptrdiff_t)24));
__m512 sum65 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+24*s6+(ptrdiff_t)28));
__m512 sum69 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+24*s6+(ptrdiff_t)32));
__m512 sum73 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+24*s6+(ptrdiff_t)36));
__m512 sum77 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+24*s6+(ptrdiff_t)40));
__m512 sum81 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+24*s6+(ptrdiff_t)44));
__m512 sum62 = sum61;
__m512 sum63 = sum61;
__m512 sum64 = sum61;
__m512 sum66 = sum65;
__m512 sum67 = sum65;
__m512 sum68 = sum65;
__m512 sum70 = sum69;
__m512 sum71 = sum69;
__m512 sum72 = sum69;
__m512 sum74 = sum73;
__m512 sum75 = sum73;
__m512 sum76 = sum73;
__m512 sum78 = sum77;
__m512 sum79 = sum77;
__m512 sum80 = sum77;
__m512 sum82 = sum81;
__m512 sum83 = sum81;
__m512 sum84 = sum81;
for (s6 = 0; s6 < 874; ++s6) {
__m512 dat31 = _mm512_loadu_ps(arrangedDats2+1398400*i11+223744*j6+256*s6+(ptrdiff_t)0);
__m512 dat32 = _mm512_loadu_ps(arrangedDats2+1398400*i11+223744*j6+256*s6+(ptrdiff_t)64);
__m512 dat33 = _mm512_loadu_ps(arrangedDats2+1398400*i11+223744*j6+256*s6+(ptrdiff_t)128);
__m512 dat34 = _mm512_loadu_ps(arrangedDats2+1398400*i11+223744*j6+256*s6+(ptrdiff_t)192);
__m512 wt145 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+24*s6+(ptrdiff_t)24));
sum61 = _mm512_fmadd_ps(wt145, dat31, sum61);
sum62 = _mm512_fmadd_ps(wt145, dat32, sum62);
sum63 = _mm512_fmadd_ps(wt145, dat33, sum63);
sum64 = _mm512_fmadd_ps(wt145, dat34, sum64);
__m512 wt146 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+24*s6+(ptrdiff_t)28));
sum65 = _mm512_fmadd_ps(wt146, dat31, sum65);
sum66 = _mm512_fmadd_ps(wt146, dat32, sum66);
sum67 = _mm512_fmadd_ps(wt146, dat33, sum67);
sum68 = _mm512_fmadd_ps(wt146, dat34, sum68);
__m512 wt147 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+24*s6+(ptrdiff_t)32));
sum69 = _mm512_fmadd_ps(wt147, dat31, sum69);
sum70 = _mm512_fmadd_ps(wt147, dat32, sum70);
sum71 = _mm512_fmadd_ps(wt147, dat33, sum71);
sum72 = _mm512_fmadd_ps(wt147, dat34, sum72);
__m512 wt148 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+24*s6+(ptrdiff_t)36));
sum73 = _mm512_fmadd_ps(wt148, dat31, sum73);
sum74 = _mm512_fmadd_ps(wt148, dat32, sum74);
sum75 = _mm512_fmadd_ps(wt148, dat33, sum75);
sum76 = _mm512_fmadd_ps(wt148, dat34, sum76);
__m512 wt149 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+24*s6+(ptrdiff_t)40));
sum77 = _mm512_fmadd_ps(wt149, dat31, sum77);
sum78 = _mm512_fmadd_ps(wt149, dat32, sum78);
sum79 = _mm512_fmadd_ps(wt149, dat33, sum79);
sum80 = _mm512_fmadd_ps(wt149, dat34, sum80);
__m512 wt150 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+24*s6+(ptrdiff_t)44));
sum81 = _mm512_fmadd_ps(wt150, dat31, sum81);
sum82 = _mm512_fmadd_ps(wt150, dat32, sum82);
sum83 = _mm512_fmadd_ps(wt150, dat33, sum83);
sum84 = _mm512_fmadd_ps(wt150, dat34, sum84);
}
sum61 = _mm512_add_ps(sum61, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)0));
sum62 = _mm512_add_ps(sum62, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)64));
sum63 = _mm512_add_ps(sum63, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)128));
sum64 = _mm512_add_ps(sum64, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)192));
__mmask16 mask3 = _mm512_cmp_ps_mask(sum61, _mm512_setzero_ps(), _CMP_LT_OQ);
sum61 = _mm512_mask_mul_ps(sum61, mask3, sum61, _mm512_set1_ps(3.125e-01f));
__mmask16 mask4 = _mm512_cmp_ps_mask(sum62, _mm512_setzero_ps(), _CMP_LT_OQ);
sum62 = _mm512_mask_mul_ps(sum62, mask4, sum62, _mm512_set1_ps(3.125e-01f));
__mmask16 mask5 = _mm512_cmp_ps_mask(sum63, _mm512_setzero_ps(), _CMP_LT_OQ);
sum63 = _mm512_mask_mul_ps(sum63, mask5, sum63, _mm512_set1_ps(3.125e-01f));
__mmask16 mask6 = _mm512_cmp_ps_mask(sum64, _mm512_setzero_ps(), _CMP_LT_OQ);
sum64 = _mm512_mask_mul_ps(sum64, mask6, sum64, _mm512_set1_ps(3.125e-01f));
sum61 = _mm512_add_ps(sum61, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)0));
sum62 = _mm512_add_ps(sum62, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)64));
sum63 = _mm512_add_ps(sum63, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)128));
sum64 = _mm512_add_ps(sum64, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)192));
__m512 bnMul5 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(0+6*k11+29*i11))[0]);
__m512 bnAdd5 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(0+6*k11+29*i11))[1]);
sum61 = _mm512_fmadd_ps(sum61, bnMul5, bnAdd5);
sum62 = _mm512_fmadd_ps(sum62, bnMul5, bnAdd5);
sum63 = _mm512_fmadd_ps(sum63, bnMul5, bnAdd5);
sum64 = _mm512_fmadd_ps(sum64, bnMul5, bnAdd5);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)0, 65535, sum61);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)64, 65535, sum62);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)128, 65535, sum63);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)192, 65535, sum64);
sum65 = _mm512_add_ps(sum65, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)1596));
sum66 = _mm512_add_ps(sum66, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)1660));
sum67 = _mm512_add_ps(sum67, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)1724));
sum68 = _mm512_add_ps(sum68, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)1788));
__mmask16 mask7 = _mm512_cmp_ps_mask(sum65, _mm512_setzero_ps(), _CMP_LT_OQ);
sum65 = _mm512_mask_mul_ps(sum65, mask7, sum65, _mm512_set1_ps(3.125e-01f));
__mmask16 mask8 = _mm512_cmp_ps_mask(sum66, _mm512_setzero_ps(), _CMP_LT_OQ);
sum66 = _mm512_mask_mul_ps(sum66, mask8, sum66, _mm512_set1_ps(3.125e-01f));
__mmask16 mask9 = _mm512_cmp_ps_mask(sum67, _mm512_setzero_ps(), _CMP_LT_OQ);
sum67 = _mm512_mask_mul_ps(sum67, mask9, sum67, _mm512_set1_ps(3.125e-01f));
__mmask16 mask10 = _mm512_cmp_ps_mask(sum68, _mm512_setzero_ps(), _CMP_LT_OQ);
sum68 = _mm512_mask_mul_ps(sum68, mask10, sum68, _mm512_set1_ps(3.125e-01f));
sum65 = _mm512_add_ps(sum65, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)1596));
sum66 = _mm512_add_ps(sum66, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)1660));
sum67 = _mm512_add_ps(sum67, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)1724));
sum68 = _mm512_add_ps(sum68, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)1788));
__m512 bnMul6 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(1+6*k11+29*i11))[0]);
__m512 bnAdd6 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(1+6*k11+29*i11))[1]);
sum65 = _mm512_fmadd_ps(sum65, bnMul6, bnAdd6);
sum66 = _mm512_fmadd_ps(sum66, bnMul6, bnAdd6);
sum67 = _mm512_fmadd_ps(sum67, bnMul6, bnAdd6);
sum68 = _mm512_fmadd_ps(sum68, bnMul6, bnAdd6);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)1596, 65535, sum65);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)1660, 65535, sum66);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)1724, 65535, sum67);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)1788, 65535, sum68);
sum69 = _mm512_add_ps(sum69, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)3192));
sum70 = _mm512_add_ps(sum70, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)3256));
sum71 = _mm512_add_ps(sum71, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)3320));
sum72 = _mm512_add_ps(sum72, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)3384));
__mmask16 mask11 = _mm512_cmp_ps_mask(sum69, _mm512_setzero_ps(), _CMP_LT_OQ);
sum69 = _mm512_mask_mul_ps(sum69, mask11, sum69, _mm512_set1_ps(3.125e-01f));
__mmask16 mask12 = _mm512_cmp_ps_mask(sum70, _mm512_setzero_ps(), _CMP_LT_OQ);
sum70 = _mm512_mask_mul_ps(sum70, mask12, sum70, _mm512_set1_ps(3.125e-01f));
__mmask16 mask13 = _mm512_cmp_ps_mask(sum71, _mm512_setzero_ps(), _CMP_LT_OQ);
sum71 = _mm512_mask_mul_ps(sum71, mask13, sum71, _mm512_set1_ps(3.125e-01f));
__mmask16 mask14 = _mm512_cmp_ps_mask(sum72, _mm512_setzero_ps(), _CMP_LT_OQ);
sum72 = _mm512_mask_mul_ps(sum72, mask14, sum72, _mm512_set1_ps(3.125e-01f));
sum69 = _mm512_add_ps(sum69, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)3192));
sum70 = _mm512_add_ps(sum70, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)3256));
sum71 = _mm512_add_ps(sum71, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)3320));
sum72 = _mm512_add_ps(sum72, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)3384));
__m512 bnMul7 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(2+6*k11+29*i11))[0]);
__m512 bnAdd7 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(2+6*k11+29*i11))[1]);
sum69 = _mm512_fmadd_ps(sum69, bnMul7, bnAdd7);
sum70 = _mm512_fmadd_ps(sum70, bnMul7, bnAdd7);
sum71 = _mm512_fmadd_ps(sum71, bnMul7, bnAdd7);
sum72 = _mm512_fmadd_ps(sum72, bnMul7, bnAdd7);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)3192, 65535, sum69);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)3256, 65535, sum70);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)3320, 65535, sum71);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)3384, 65535, sum72);
sum73 = _mm512_add_ps(sum73, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)4788));
sum74 = _mm512_add_ps(sum74, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)4852));
sum75 = _mm512_add_ps(sum75, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)4916));
sum76 = _mm512_add_ps(sum76, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)4980));
__mmask16 mask15 = _mm512_cmp_ps_mask(sum73, _mm512_setzero_ps(), _CMP_LT_OQ);
sum73 = _mm512_mask_mul_ps(sum73, mask15, sum73, _mm512_set1_ps(3.125e-01f));
__mmask16 mask16 = _mm512_cmp_ps_mask(sum74, _mm512_setzero_ps(), _CMP_LT_OQ);
sum74 = _mm512_mask_mul_ps(sum74, mask16, sum74, _mm512_set1_ps(3.125e-01f));
__mmask16 mask17 = _mm512_cmp_ps_mask(sum75, _mm512_setzero_ps(), _CMP_LT_OQ);
sum75 = _mm512_mask_mul_ps(sum75, mask17, sum75, _mm512_set1_ps(3.125e-01f));
__mmask16 mask18 = _mm512_cmp_ps_mask(sum76, _mm512_setzero_ps(), _CMP_LT_OQ);
sum76 = _mm512_mask_mul_ps(sum76, mask18, sum76, _mm512_set1_ps(3.125e-01f));
sum73 = _mm512_add_ps(sum73, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)4788));
sum74 = _mm512_add_ps(sum74, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)4852));
sum75 = _mm512_add_ps(sum75, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)4916));
sum76 = _mm512_add_ps(sum76, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)4980));
__m512 bnMul8 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(3+6*k11+29*i11))[0]);
__m512 bnAdd8 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(3+6*k11+29*i11))[1]);
sum73 = _mm512_fmadd_ps(sum73, bnMul8, bnAdd8);
sum74 = _mm512_fmadd_ps(sum74, bnMul8, bnAdd8);
sum75 = _mm512_fmadd_ps(sum75, bnMul8, bnAdd8);
sum76 = _mm512_fmadd_ps(sum76, bnMul8, bnAdd8);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)4788, 65535, sum73);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)4852, 65535, sum74);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)4916, 65535, sum75);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)4980, 65535, sum76);
sum77 = _mm512_add_ps(sum77, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)6384));
sum78 = _mm512_add_ps(sum78, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)6448));
sum79 = _mm512_add_ps(sum79, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)6512));
sum80 = _mm512_add_ps(sum80, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)6576));
__mmask16 mask19 = _mm512_cmp_ps_mask(sum77, _mm512_setzero_ps(), _CMP_LT_OQ);
sum77 = _mm512_mask_mul_ps(sum77, mask19, sum77, _mm512_set1_ps(3.125e-01f));
__mmask16 mask20 = _mm512_cmp_ps_mask(sum78, _mm512_setzero_ps(), _CMP_LT_OQ);
sum78 = _mm512_mask_mul_ps(sum78, mask20, sum78, _mm512_set1_ps(3.125e-01f));
__mmask16 mask21 = _mm512_cmp_ps_mask(sum79, _mm512_setzero_ps(), _CMP_LT_OQ);
sum79 = _mm512_mask_mul_ps(sum79, mask21, sum79, _mm512_set1_ps(3.125e-01f));
__mmask16 mask22 = _mm512_cmp_ps_mask(sum80, _mm512_setzero_ps(), _CMP_LT_OQ);
sum80 = _mm512_mask_mul_ps(sum80, mask22, sum80, _mm512_set1_ps(3.125e-01f));
sum77 = _mm512_add_ps(sum77, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)6384));
sum78 = _mm512_add_ps(sum78, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)6448));
sum79 = _mm512_add_ps(sum79, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)6512));
sum80 = _mm512_add_ps(sum80, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)6576));
__m512 bnMul9 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(4+6*k11+29*i11))[0]);
__m512 bnAdd9 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(4+6*k11+29*i11))[1]);
sum77 = _mm512_fmadd_ps(sum77, bnMul9, bnAdd9);
sum78 = _mm512_fmadd_ps(sum78, bnMul9, bnAdd9);
sum79 = _mm512_fmadd_ps(sum79, bnMul9, bnAdd9);
sum80 = _mm512_fmadd_ps(sum80, bnMul9, bnAdd9);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)6384, 65535, sum77);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)6448, 65535, sum78);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)6512, 65535, sum79);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)6576, 65535, sum80);
sum81 = _mm512_add_ps(sum81, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)7980));
sum82 = _mm512_add_ps(sum82, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)8044));
sum83 = _mm512_add_ps(sum83, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)8108));
sum84 = _mm512_add_ps(sum84, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)8172));
__mmask16 mask23 = _mm512_cmp_ps_mask(sum81, _mm512_setzero_ps(), _CMP_LT_OQ);
sum81 = _mm512_mask_mul_ps(sum81, mask23, sum81, _mm512_set1_ps(3.125e-01f));
__mmask16 mask24 = _mm512_cmp_ps_mask(sum82, _mm512_setzero_ps(), _CMP_LT_OQ);
sum82 = _mm512_mask_mul_ps(sum82, mask24, sum82, _mm512_set1_ps(3.125e-01f));
__mmask16 mask25 = _mm512_cmp_ps_mask(sum83, _mm512_setzero_ps(), _CMP_LT_OQ);
sum83 = _mm512_mask_mul_ps(sum83, mask25, sum83, _mm512_set1_ps(3.125e-01f));
__mmask16 mask26 = _mm512_cmp_ps_mask(sum84, _mm512_setzero_ps(), _CMP_LT_OQ);
sum84 = _mm512_mask_mul_ps(sum84, mask26, sum84, _mm512_set1_ps(3.125e-01f));
sum81 = _mm512_add_ps(sum81, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)7980));
sum82 = _mm512_add_ps(sum82, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)8044));
sum83 = _mm512_add_ps(sum83, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)8108));
sum84 = _mm512_add_ps(sum84, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)8172));
__m512 bnMul10 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(5+6*k11+29*i11))[0]);
__m512 bnAdd10 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(5+6*k11+29*i11))[1]);
sum81 = _mm512_fmadd_ps(sum81, bnMul10, bnAdd10);
sum82 = _mm512_fmadd_ps(sum82, bnMul10, bnAdd10);
sum83 = _mm512_fmadd_ps(sum83, bnMul10, bnAdd10);
sum84 = _mm512_fmadd_ps(sum84, bnMul10, bnAdd10);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)7980, 65535, sum81);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)8044, 65535, sum82);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)8108, 65535, sum83);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)8172, 65535, sum84);
if (k11 >= kk7) return;
}
ptrdiff_t s7 = -1;
__m512 sum85 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+20*s7+(ptrdiff_t)20));
__m512 sum89 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+20*s7+(ptrdiff_t)24));
__m512 sum93 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+20*s7+(ptrdiff_t)28));
__m512 sum97 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+20*s7+(ptrdiff_t)32));
__m512 sum101 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+20*s7+(ptrdiff_t)36));
__m512 sum86 = sum85;
__m512 sum87 = sum85;
__m512 sum88 = sum85;
__m512 sum90 = sum89;
__m512 sum91 = sum89;
__m512 sum92 = sum89;
__m512 sum94 = sum93;
__m512 sum95 = sum93;
__m512 sum96 = sum93;
__m512 sum98 = sum97;
__m512 sum99 = sum97;
__m512 sum100 = sum97;
__m512 sum102 = sum101;
__m512 sum103 = sum101;
__m512 sum104 = sum101;
for (s7 = 0; s7 < 874; ++s7) {
__m512 dat35 = _mm512_loadu_ps(arrangedDats2+1398400*i11+223744*j6+256*s7+(ptrdiff_t)0);
__m512 dat36 = _mm512_loadu_ps(arrangedDats2+1398400*i11+223744*j6+256*s7+(ptrdiff_t)64);
__m512 dat37 = _mm512_loadu_ps(arrangedDats2+1398400*i11+223744*j6+256*s7+(ptrdiff_t)128);
__m512 dat38 = _mm512_loadu_ps(arrangedDats2+1398400*i11+223744*j6+256*s7+(ptrdiff_t)192);
__m512 wt151 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+20*s7+(ptrdiff_t)20));
sum85 = _mm512_fmadd_ps(wt151, dat35, sum85);
sum86 = _mm512_fmadd_ps(wt151, dat36, sum86);
sum87 = _mm512_fmadd_ps(wt151, dat37, sum87);
sum88 = _mm512_fmadd_ps(wt151, dat38, sum88);
__m512 wt152 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+20*s7+(ptrdiff_t)24));
sum89 = _mm512_fmadd_ps(wt152, dat35, sum89);
sum90 = _mm512_fmadd_ps(wt152, dat36, sum90);
sum91 = _mm512_fmadd_ps(wt152, dat37, sum91);
sum92 = _mm512_fmadd_ps(wt152, dat38, sum92);
__m512 wt153 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+20*s7+(ptrdiff_t)28));
sum93 = _mm512_fmadd_ps(wt153, dat35, sum93);
sum94 = _mm512_fmadd_ps(wt153, dat36, sum94);
sum95 = _mm512_fmadd_ps(wt153, dat37, sum95);
sum96 = _mm512_fmadd_ps(wt153, dat38, sum96);
__m512 wt154 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+20*s7+(ptrdiff_t)32));
sum97 = _mm512_fmadd_ps(wt154, dat35, sum97);
sum98 = _mm512_fmadd_ps(wt154, dat36, sum98);
sum99 = _mm512_fmadd_ps(wt154, dat37, sum99);
sum100 = _mm512_fmadd_ps(wt154, dat38, sum100);
__m512 wt155 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k11+20*s7+(ptrdiff_t)36));
sum101 = _mm512_fmadd_ps(wt155, dat35, sum101);
sum102 = _mm512_fmadd_ps(wt155, dat36, sum102);
sum103 = _mm512_fmadd_ps(wt155, dat37, sum103);
sum104 = _mm512_fmadd_ps(wt155, dat38, sum104);
}
sum85 = _mm512_add_ps(sum85, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)0));
sum86 = _mm512_add_ps(sum86, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)64));
sum87 = _mm512_add_ps(sum87, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)128));
sum88 = _mm512_add_ps(sum88, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)192));
__mmask16 mask27 = _mm512_cmp_ps_mask(sum85, _mm512_setzero_ps(), _CMP_LT_OQ);
sum85 = _mm512_mask_mul_ps(sum85, mask27, sum85, _mm512_set1_ps(3.125e-01f));
__mmask16 mask28 = _mm512_cmp_ps_mask(sum86, _mm512_setzero_ps(), _CMP_LT_OQ);
sum86 = _mm512_mask_mul_ps(sum86, mask28, sum86, _mm512_set1_ps(3.125e-01f));
__mmask16 mask29 = _mm512_cmp_ps_mask(sum87, _mm512_setzero_ps(), _CMP_LT_OQ);
sum87 = _mm512_mask_mul_ps(sum87, mask29, sum87, _mm512_set1_ps(3.125e-01f));
__mmask16 mask30 = _mm512_cmp_ps_mask(sum88, _mm512_setzero_ps(), _CMP_LT_OQ);
sum88 = _mm512_mask_mul_ps(sum88, mask30, sum88, _mm512_set1_ps(3.125e-01f));
sum85 = _mm512_add_ps(sum85, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)0));
sum86 = _mm512_add_ps(sum86, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)64));
sum87 = _mm512_add_ps(sum87, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)128));
sum88 = _mm512_add_ps(sum88, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)192));
__m512 bnMul11 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(0+6*k11+29*i11))[0]);
__m512 bnAdd11 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(0+6*k11+29*i11))[1]);
sum85 = _mm512_fmadd_ps(sum85, bnMul11, bnAdd11);
sum86 = _mm512_fmadd_ps(sum86, bnMul11, bnAdd11);
sum87 = _mm512_fmadd_ps(sum87, bnMul11, bnAdd11);
sum88 = _mm512_fmadd_ps(sum88, bnMul11, bnAdd11);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)0, 65535, sum85);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)64, 65535, sum86);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)128, 65535, sum87);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)192, 65535, sum88);
sum89 = _mm512_add_ps(sum89, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)1596));
sum90 = _mm512_add_ps(sum90, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)1660));
sum91 = _mm512_add_ps(sum91, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)1724));
sum92 = _mm512_add_ps(sum92, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)1788));
__mmask16 mask31 = _mm512_cmp_ps_mask(sum89, _mm512_setzero_ps(), _CMP_LT_OQ);
sum89 = _mm512_mask_mul_ps(sum89, mask31, sum89, _mm512_set1_ps(3.125e-01f));
__mmask16 mask32 = _mm512_cmp_ps_mask(sum90, _mm512_setzero_ps(), _CMP_LT_OQ);
sum90 = _mm512_mask_mul_ps(sum90, mask32, sum90, _mm512_set1_ps(3.125e-01f));
__mmask16 mask33 = _mm512_cmp_ps_mask(sum91, _mm512_setzero_ps(), _CMP_LT_OQ);
sum91 = _mm512_mask_mul_ps(sum91, mask33, sum91, _mm512_set1_ps(3.125e-01f));
__mmask16 mask34 = _mm512_cmp_ps_mask(sum92, _mm512_setzero_ps(), _CMP_LT_OQ);
sum92 = _mm512_mask_mul_ps(sum92, mask34, sum92, _mm512_set1_ps(3.125e-01f));
sum89 = _mm512_add_ps(sum89, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)1596));
sum90 = _mm512_add_ps(sum90, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)1660));
sum91 = _mm512_add_ps(sum91, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)1724));
sum92 = _mm512_add_ps(sum92, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)1788));
__m512 bnMul12 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(1+6*k11+29*i11))[0]);
__m512 bnAdd12 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(1+6*k11+29*i11))[1]);
sum89 = _mm512_fmadd_ps(sum89, bnMul12, bnAdd12);
sum90 = _mm512_fmadd_ps(sum90, bnMul12, bnAdd12);
sum91 = _mm512_fmadd_ps(sum91, bnMul12, bnAdd12);
sum92 = _mm512_fmadd_ps(sum92, bnMul12, bnAdd12);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)1596, 65535, sum89);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)1660, 65535, sum90);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)1724, 65535, sum91);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)1788, 65535, sum92);
sum93 = _mm512_add_ps(sum93, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)3192));
sum94 = _mm512_add_ps(sum94, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)3256));
sum95 = _mm512_add_ps(sum95, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)3320));
sum96 = _mm512_add_ps(sum96, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)3384));
__mmask16 mask35 = _mm512_cmp_ps_mask(sum93, _mm512_setzero_ps(), _CMP_LT_OQ);
sum93 = _mm512_mask_mul_ps(sum93, mask35, sum93, _mm512_set1_ps(3.125e-01f));
__mmask16 mask36 = _mm512_cmp_ps_mask(sum94, _mm512_setzero_ps(), _CMP_LT_OQ);
sum94 = _mm512_mask_mul_ps(sum94, mask36, sum94, _mm512_set1_ps(3.125e-01f));
__mmask16 mask37 = _mm512_cmp_ps_mask(sum95, _mm512_setzero_ps(), _CMP_LT_OQ);
sum95 = _mm512_mask_mul_ps(sum95, mask37, sum95, _mm512_set1_ps(3.125e-01f));
__mmask16 mask38 = _mm512_cmp_ps_mask(sum96, _mm512_setzero_ps(), _CMP_LT_OQ);
sum96 = _mm512_mask_mul_ps(sum96, mask38, sum96, _mm512_set1_ps(3.125e-01f));
sum93 = _mm512_add_ps(sum93, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)3192));
sum94 = _mm512_add_ps(sum94, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)3256));
sum95 = _mm512_add_ps(sum95, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)3320));
sum96 = _mm512_add_ps(sum96, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)3384));
__m512 bnMul13 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(2+6*k11+29*i11))[0]);
__m512 bnAdd13 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(2+6*k11+29*i11))[1]);
sum93 = _mm512_fmadd_ps(sum93, bnMul13, bnAdd13);
sum94 = _mm512_fmadd_ps(sum94, bnMul13, bnAdd13);
sum95 = _mm512_fmadd_ps(sum95, bnMul13, bnAdd13);
sum96 = _mm512_fmadd_ps(sum96, bnMul13, bnAdd13);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)3192, 65535, sum93);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)3256, 65535, sum94);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)3320, 65535, sum95);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)3384, 65535, sum96);
sum97 = _mm512_add_ps(sum97, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)4788));
sum98 = _mm512_add_ps(sum98, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)4852));
sum99 = _mm512_add_ps(sum99, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)4916));
sum100 = _mm512_add_ps(sum100, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)4980));
__mmask16 mask39 = _mm512_cmp_ps_mask(sum97, _mm512_setzero_ps(), _CMP_LT_OQ);
sum97 = _mm512_mask_mul_ps(sum97, mask39, sum97, _mm512_set1_ps(3.125e-01f));
__mmask16 mask40 = _mm512_cmp_ps_mask(sum98, _mm512_setzero_ps(), _CMP_LT_OQ);
sum98 = _mm512_mask_mul_ps(sum98, mask40, sum98, _mm512_set1_ps(3.125e-01f));
__mmask16 mask41 = _mm512_cmp_ps_mask(sum99, _mm512_setzero_ps(), _CMP_LT_OQ);
sum99 = _mm512_mask_mul_ps(sum99, mask41, sum99, _mm512_set1_ps(3.125e-01f));
__mmask16 mask42 = _mm512_cmp_ps_mask(sum100, _mm512_setzero_ps(), _CMP_LT_OQ);
sum100 = _mm512_mask_mul_ps(sum100, mask42, sum100, _mm512_set1_ps(3.125e-01f));
sum97 = _mm512_add_ps(sum97, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)4788));
sum98 = _mm512_add_ps(sum98, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)4852));
sum99 = _mm512_add_ps(sum99, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)4916));
sum100 = _mm512_add_ps(sum100, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)4980));
__m512 bnMul14 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(3+6*k11+29*i11))[0]);
__m512 bnAdd14 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(3+6*k11+29*i11))[1]);
sum97 = _mm512_fmadd_ps(sum97, bnMul14, bnAdd14);
sum98 = _mm512_fmadd_ps(sum98, bnMul14, bnAdd14);
sum99 = _mm512_fmadd_ps(sum99, bnMul14, bnAdd14);
sum100 = _mm512_fmadd_ps(sum100, bnMul14, bnAdd14);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)4788, 65535, sum97);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)4852, 65535, sum98);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)4916, 65535, sum99);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)4980, 65535, sum100);
sum101 = _mm512_add_ps(sum101, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)6384));
sum102 = _mm512_add_ps(sum102, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)6448));
sum103 = _mm512_add_ps(sum103, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)6512));
sum104 = _mm512_add_ps(sum104, _mm512_maskz_loadu_ps(65535, datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)6576));
__mmask16 mask43 = _mm512_cmp_ps_mask(sum101, _mm512_setzero_ps(), _CMP_LT_OQ);
sum101 = _mm512_mask_mul_ps(sum101, mask43, sum101, _mm512_set1_ps(3.125e-01f));
__mmask16 mask44 = _mm512_cmp_ps_mask(sum102, _mm512_setzero_ps(), _CMP_LT_OQ);
sum102 = _mm512_mask_mul_ps(sum102, mask44, sum102, _mm512_set1_ps(3.125e-01f));
__mmask16 mask45 = _mm512_cmp_ps_mask(sum103, _mm512_setzero_ps(), _CMP_LT_OQ);
sum103 = _mm512_mask_mul_ps(sum103, mask45, sum103, _mm512_set1_ps(3.125e-01f));
__mmask16 mask46 = _mm512_cmp_ps_mask(sum104, _mm512_setzero_ps(), _CMP_LT_OQ);
sum104 = _mm512_mask_mul_ps(sum104, mask46, sum104, _mm512_set1_ps(3.125e-01f));
sum101 = _mm512_add_ps(sum101, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)6384));
sum102 = _mm512_add_ps(sum102, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)6448));
sum103 = _mm512_add_ps(sum103, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)6512));
sum104 = _mm512_add_ps(sum104, _mm512_maskz_loadu_ps(65535, datPtr6+46284*i11+256*j6+9576*k11+(ptrdiff_t)6576));
__m512 bnMul15 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(4+6*k11+29*i11))[0]);
__m512 bnAdd15 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(4+6*k11+29*i11))[1]);
sum101 = _mm512_fmadd_ps(sum101, bnMul15, bnAdd15);
sum102 = _mm512_fmadd_ps(sum102, bnMul15, bnAdd15);
sum103 = _mm512_fmadd_ps(sum103, bnMul15, bnAdd15);
sum104 = _mm512_fmadd_ps(sum104, bnMul15, bnAdd15);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)6384, 65535, sum101);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)6448, 65535, sum102);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)6512, 65535, sum103);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k11+(ptrdiff_t)6576, 65535, sum104);
if (j6 >= jj6) return;
}
ptrdiff_t k12 = 1*w2;
ptrdiff_t kk8 = k12+0;
for (; k12 != 4; ++k12) {
ptrdiff_t s8 = -1;
__m512 sum105 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+24*s8+(ptrdiff_t)24));
__m512 sum106 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+24*s8+(ptrdiff_t)28));
__m512 sum107 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+24*s8+(ptrdiff_t)32));
__m512 sum108 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+24*s8+(ptrdiff_t)36));
__m512 sum109 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+24*s8+(ptrdiff_t)40));
__m512 sum110 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+24*s8+(ptrdiff_t)44));
for (s8 = 0; s8 < 874; ++s8) {
__m512 dat39 = _mm512_loadu_ps(arrangedDats2+1398400*i11+223744*j6+64*s8+(ptrdiff_t)0);
__m512 wt156 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+24*s8+(ptrdiff_t)24));
sum105 = _mm512_fmadd_ps(wt156, dat39, sum105);
__m512 wt157 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+24*s8+(ptrdiff_t)28));
sum106 = _mm512_fmadd_ps(wt157, dat39, sum106);
__m512 wt158 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+24*s8+(ptrdiff_t)32));
sum107 = _mm512_fmadd_ps(wt158, dat39, sum107);
__m512 wt159 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+24*s8+(ptrdiff_t)36));
sum108 = _mm512_fmadd_ps(wt159, dat39, sum108);
__m512 wt160 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+24*s8+(ptrdiff_t)40));
sum109 = _mm512_fmadd_ps(wt160, dat39, sum109);
__m512 wt161 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+24*s8+(ptrdiff_t)44));
sum110 = _mm512_fmadd_ps(wt161, dat39, sum110);
}
sum105 = _mm512_add_ps(sum105, _mm512_maskz_loadu_ps(32767, datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)0));
__mmask16 mask47 = _mm512_cmp_ps_mask(sum105, _mm512_setzero_ps(), _CMP_LT_OQ);
sum105 = _mm512_mask_mul_ps(sum105, mask47, sum105, _mm512_set1_ps(3.125e-01f));
sum105 = _mm512_add_ps(sum105, _mm512_maskz_loadu_ps(32767, datPtr6+46284*i11+256*j6+9576*k12+(ptrdiff_t)0));
__m512 bnMul16 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(0+6*k12+29*i11))[0]);
__m512 bnAdd16 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(0+6*k12+29*i11))[1]);
sum105 = _mm512_fmadd_ps(sum105, bnMul16, bnAdd16);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)0, 32767, sum105);
sum106 = _mm512_add_ps(sum106, _mm512_maskz_loadu_ps(32767, datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)1596));
__mmask16 mask48 = _mm512_cmp_ps_mask(sum106, _mm512_setzero_ps(), _CMP_LT_OQ);
sum106 = _mm512_mask_mul_ps(sum106, mask48, sum106, _mm512_set1_ps(3.125e-01f));
sum106 = _mm512_add_ps(sum106, _mm512_maskz_loadu_ps(32767, datPtr6+46284*i11+256*j6+9576*k12+(ptrdiff_t)1596));
__m512 bnMul17 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(1+6*k12+29*i11))[0]);
__m512 bnAdd17 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(1+6*k12+29*i11))[1]);
sum106 = _mm512_fmadd_ps(sum106, bnMul17, bnAdd17);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)1596, 32767, sum106);
sum107 = _mm512_add_ps(sum107, _mm512_maskz_loadu_ps(32767, datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)3192));
__mmask16 mask49 = _mm512_cmp_ps_mask(sum107, _mm512_setzero_ps(), _CMP_LT_OQ);
sum107 = _mm512_mask_mul_ps(sum107, mask49, sum107, _mm512_set1_ps(3.125e-01f));
sum107 = _mm512_add_ps(sum107, _mm512_maskz_loadu_ps(32767, datPtr6+46284*i11+256*j6+9576*k12+(ptrdiff_t)3192));
__m512 bnMul18 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(2+6*k12+29*i11))[0]);
__m512 bnAdd18 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(2+6*k12+29*i11))[1]);
sum107 = _mm512_fmadd_ps(sum107, bnMul18, bnAdd18);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)3192, 32767, sum107);
sum108 = _mm512_add_ps(sum108, _mm512_maskz_loadu_ps(32767, datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)4788));
__mmask16 mask50 = _mm512_cmp_ps_mask(sum108, _mm512_setzero_ps(), _CMP_LT_OQ);
sum108 = _mm512_mask_mul_ps(sum108, mask50, sum108, _mm512_set1_ps(3.125e-01f));
sum108 = _mm512_add_ps(sum108, _mm512_maskz_loadu_ps(32767, datPtr6+46284*i11+256*j6+9576*k12+(ptrdiff_t)4788));
__m512 bnMul19 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(3+6*k12+29*i11))[0]);
__m512 bnAdd19 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(3+6*k12+29*i11))[1]);
sum108 = _mm512_fmadd_ps(sum108, bnMul19, bnAdd19);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)4788, 32767, sum108);
sum109 = _mm512_add_ps(sum109, _mm512_maskz_loadu_ps(32767, datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)6384));
__mmask16 mask51 = _mm512_cmp_ps_mask(sum109, _mm512_setzero_ps(), _CMP_LT_OQ);
sum109 = _mm512_mask_mul_ps(sum109, mask51, sum109, _mm512_set1_ps(3.125e-01f));
sum109 = _mm512_add_ps(sum109, _mm512_maskz_loadu_ps(32767, datPtr6+46284*i11+256*j6+9576*k12+(ptrdiff_t)6384));
__m512 bnMul20 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(4+6*k12+29*i11))[0]);
__m512 bnAdd20 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(4+6*k12+29*i11))[1]);
sum109 = _mm512_fmadd_ps(sum109, bnMul20, bnAdd20);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)6384, 32767, sum109);
sum110 = _mm512_add_ps(sum110, _mm512_maskz_loadu_ps(32767, datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)7980));
__mmask16 mask52 = _mm512_cmp_ps_mask(sum110, _mm512_setzero_ps(), _CMP_LT_OQ);
sum110 = _mm512_mask_mul_ps(sum110, mask52, sum110, _mm512_set1_ps(3.125e-01f));
sum110 = _mm512_add_ps(sum110, _mm512_maskz_loadu_ps(32767, datPtr6+46284*i11+256*j6+9576*k12+(ptrdiff_t)7980));
__m512 bnMul21 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(5+6*k12+29*i11))[0]);
__m512 bnAdd21 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(5+6*k12+29*i11))[1]);
sum110 = _mm512_fmadd_ps(sum110, bnMul21, bnAdd21);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)7980, 32767, sum110);
if (k12 >= kk8) return;
}
ptrdiff_t s9 = -1;
__m512 sum111 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+20*s9+(ptrdiff_t)20));
__m512 sum112 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+20*s9+(ptrdiff_t)24));
__m512 sum113 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+20*s9+(ptrdiff_t)28));
__m512 sum114 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+20*s9+(ptrdiff_t)32));
__m512 sum115 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+20*s9+(ptrdiff_t)36));
for (s9 = 0; s9 < 874; ++s9) {
__m512 dat40 = _mm512_loadu_ps(arrangedDats2+1398400*i11+223744*j6+64*s9+(ptrdiff_t)0);
__m512 wt162 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+20*s9+(ptrdiff_t)20));
sum111 = _mm512_fmadd_ps(wt162, dat40, sum111);
__m512 wt163 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+20*s9+(ptrdiff_t)24));
sum112 = _mm512_fmadd_ps(wt163, dat40, sum112);
__m512 wt164 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+20*s9+(ptrdiff_t)28));
sum113 = _mm512_fmadd_ps(wt164, dat40, sum113);
__m512 wt165 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+20*s9+(ptrdiff_t)32));
sum114 = _mm512_fmadd_ps(wt165, dat40, sum114);
__m512 wt166 = _mm512_set1_ps(*(float*)(arrangedWts2+101500*i11+21000*k12+20*s9+(ptrdiff_t)36));
sum115 = _mm512_fmadd_ps(wt166, dat40, sum115);
}
sum111 = _mm512_add_ps(sum111, _mm512_maskz_loadu_ps(32767, datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)0));
__mmask16 mask53 = _mm512_cmp_ps_mask(sum111, _mm512_setzero_ps(), _CMP_LT_OQ);
sum111 = _mm512_mask_mul_ps(sum111, mask53, sum111, _mm512_set1_ps(3.125e-01f));
sum111 = _mm512_add_ps(sum111, _mm512_maskz_loadu_ps(32767, datPtr6+46284*i11+256*j6+9576*k12+(ptrdiff_t)0));
__m512 bnMul22 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(0+6*k12+29*i11))[0]);
__m512 bnAdd22 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(0+6*k12+29*i11))[1]);
sum111 = _mm512_fmadd_ps(sum111, bnMul22, bnAdd22);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)0, 32767, sum111);
sum112 = _mm512_add_ps(sum112, _mm512_maskz_loadu_ps(32767, datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)1596));
__mmask16 mask54 = _mm512_cmp_ps_mask(sum112, _mm512_setzero_ps(), _CMP_LT_OQ);
sum112 = _mm512_mask_mul_ps(sum112, mask54, sum112, _mm512_set1_ps(3.125e-01f));
sum112 = _mm512_add_ps(sum112, _mm512_maskz_loadu_ps(32767, datPtr6+46284*i11+256*j6+9576*k12+(ptrdiff_t)1596));
__m512 bnMul23 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(1+6*k12+29*i11))[0]);
__m512 bnAdd23 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(1+6*k12+29*i11))[1]);
sum112 = _mm512_fmadd_ps(sum112, bnMul23, bnAdd23);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)1596, 32767, sum112);
sum113 = _mm512_add_ps(sum113, _mm512_maskz_loadu_ps(32767, datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)3192));
__mmask16 mask55 = _mm512_cmp_ps_mask(sum113, _mm512_setzero_ps(), _CMP_LT_OQ);
sum113 = _mm512_mask_mul_ps(sum113, mask55, sum113, _mm512_set1_ps(3.125e-01f));
sum113 = _mm512_add_ps(sum113, _mm512_maskz_loadu_ps(32767, datPtr6+46284*i11+256*j6+9576*k12+(ptrdiff_t)3192));
__m512 bnMul24 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(2+6*k12+29*i11))[0]);
__m512 bnAdd24 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(2+6*k12+29*i11))[1]);
sum113 = _mm512_fmadd_ps(sum113, bnMul24, bnAdd24);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)3192, 32767, sum113);
sum114 = _mm512_add_ps(sum114, _mm512_maskz_loadu_ps(32767, datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)4788));
__mmask16 mask56 = _mm512_cmp_ps_mask(sum114, _mm512_setzero_ps(), _CMP_LT_OQ);
sum114 = _mm512_mask_mul_ps(sum114, mask56, sum114, _mm512_set1_ps(3.125e-01f));
sum114 = _mm512_add_ps(sum114, _mm512_maskz_loadu_ps(32767, datPtr6+46284*i11+256*j6+9576*k12+(ptrdiff_t)4788));
__m512 bnMul25 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(3+6*k12+29*i11))[0]);
__m512 bnAdd25 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(3+6*k12+29*i11))[1]);
sum114 = _mm512_fmadd_ps(sum114, bnMul25, bnAdd25);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)4788, 32767, sum114);
sum115 = _mm512_add_ps(sum115, _mm512_maskz_loadu_ps(32767, datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)6384));
__mmask16 mask57 = _mm512_cmp_ps_mask(sum115, _mm512_setzero_ps(), _CMP_LT_OQ);
sum115 = _mm512_mask_mul_ps(sum115, mask57, sum115, _mm512_set1_ps(3.125e-01f));
sum115 = _mm512_add_ps(sum115, _mm512_maskz_loadu_ps(32767, datPtr6+46284*i11+256*j6+9576*k12+(ptrdiff_t)6384));
__m512 bnMul26 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(4+6*k12+29*i11))[0]);
__m512 bnAdd26 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(4+6*k12+29*i11))[1]);
sum115 = _mm512_fmadd_ps(sum115, bnMul26, bnAdd26);
_mm512_mask_storeu_ps(datPtr7+46284*i11+256*j6+9576*k12+(ptrdiff_t)6384, 32767, sum115);
}
}

// Runs the two stages of this apply step on the given worker team, in order:
// first Example27OneApply1Callee1, then Example27OneApply1Callee2.
//
// team16:   worker team that executes both tasks.
// tensors5: tensor pointer table handed to the callees through slot 0 of the
//           shared context array.
//
// Both tasks use the same 3-dimensional hull (5, 7, 1). Slot 1 of the context
// is 0 for the first stage and 1 for the second.
// NOTE(review): how the callees interpret slot 1 is not visible in this
// function -- confirm against the callee bodies.
static void Example27OneApply1(Example27ThreaderTeam1* team16, char** tensors5) {
	void* ctx[2];
	ctx[0] = tensors5;
	ctx[1] = 0;

	// First stage.
	Example27ThreaderTask1 stageOne;
	stageOne.nd1 = 3;
	stageOne.hull1[0] = 5;
	stageOne.hull1[1] = 7;
	stageOne.hull1[2] = 1;
	stageOne.any1 = ctx;
	stageOne.callee1 = Example27OneApply1Callee1;
	Example27ThreaderDo1(team16, &stageOne);

	// The context is re-tagged only after the first dispatch has returned.
	ctx[1] = (void*)1;

	// Second stage: same hull, different callee.
	Example27ThreaderTask1 stageTwo;
	stageTwo.nd1 = 3;
	stageTwo.hull1[0] = 5;
	stageTwo.hull1[1] = 7;
	stageTwo.hull1[2] = 1;
	stageTwo.any1 = ctx;
	stageTwo.callee1 = Example27OneApply1Callee2;
	Example27ThreaderDo1(team16, &stageTwo);
}

// Owns the prepared parameter buffer built by Example27NetCreate.
struct Example27Net {
// Pointer exactly as returned by malloc in Example27NetCreate; this is the
// pointer Example27NetDestroy passes to free.
char* alloc1;
// alloc1 rounded up to the next 64-byte boundary. It points into the same
// allocation and must not be freed on its own.
char* align1;
};

// Releases a net: frees its raw data buffer (alloc1) and then the net
// object itself. A null net2 is accepted and ignored, mirroring free(),
// so cleanup paths can call this unconditionally.
void Example27NetDestroy(Example27Net* net2) {
    if (!net2) {
        return;
    }
    free(net2->alloc1);
    free(net2);
}

// Builds an Example27Net from the caller's parameters.
//
// net1:     receives the new net on success; left untouched on failure.
// params1:  source of the batch-norm statistics and the convolution
//           weights/biases read below.
// threads1: forwarded to Example27ThreaderCreate1 for the temporary thread
//           team used only while the net is being built.
//
// Returns 0 on success. On failure returns the message produced by
// Example27Errmsg1 or Example27ThreaderCreate1, after releasing everything
// allocated here.
char* Example27NetCreate(
    Example27Net** net1,
    Example27Params* params1,
    ptrdiff_t threads1
) {
    if (__builtin_expect(!__builtin_cpu_supports("avx512f"), 0)) {
        return Example27Errmsg1(__LINE__, "CPU does not support AVX512F");
    }

    // Long-lived buffer that ends up owned by the net.
    char* netRaw = malloc(212491);
    if (__builtin_expect(netRaw == 0, 0)) {
        return Example27Errmsg1(__LINE__, "errno %d", errno);
    }
    char* netBase = (void*)(((size_t)netRaw+63)&-64);

    // Scratch buffer that lives only for the duration of this call.
    char* scratchRaw = malloc(13991);
    if (__builtin_expect(scratchRaw == 0, 0)) {
        char* oomMsg = Example27Errmsg1(__LINE__, "errno %d", errno);
        free(netRaw);
        return oomMsg;
    }
    char* scratchBase = (void*)(((size_t)scratchRaw+63)&-64);

    Example27ThreaderTeam1* workers = 0;
    char* teamErr = Example27ThreaderCreate1(&workers, threads1);
    if (__builtin_expect(teamErr != 0, 0)) {
        free(scratchRaw);
        free(netRaw);
        return teamErr;
    }

    // Byte offsets of the regions handed to the helpers below. Both
    // buffers are addressed from their 64-byte aligned base pointers.
    char* bn1Region = netBase+0;
    char* bn4Region = netBase+13696;
    char* wtsRegion = netBase+13952;
    char* bn2Region = scratchBase+0;
    char* bn3Region = scratchBase+13696;

    // bn1 and bn4 go into the net buffer; bn2 and bn3 go into scratch.
    // NOTE(review): the BnSimplify helpers presumably fold each batch-norm's
    // statistics into a compact form at the destination -- confirm there.
    Example27BnSimplify1(
        params1->bn1Means,
        params1->bn1Variances,
        params1->bn1Scales,
        params1->bn1Shifts,
        bn1Region
    );
    Example27BnSimplify2(
        params1->bn4Means,
        params1->bn4Variances,
        params1->bn4Scales,
        params1->bn4Shifts,
        bn4Region
    );
    Example27BnSimplify1(
        params1->bn2Means,
        params1->bn2Variances,
        params1->bn2Scales,
        params1->bn2Shifts,
        bn2Region
    );
    Example27BnSimplify2(
        params1->bn3Means,
        params1->bn3Variances,
        params1->bn3Scales,
        params1->bn3Shifts,
        bn3Region
    );

    // Tensor table for the weight-arrangement pass: raw conv weights and
    // biases, the two scratch batch-norm regions, and a region of the net
    // buffer (presumably where the arranged weights are written).
    char* wtsTensors[] = {
        (char*)params1->convWeights,
        (char*)params1->convBiases,
        bn2Region,
        bn3Region,
        wtsRegion
    };
    Example27OneArrangeWts1(workers, wtsTensors);

    // The team and the scratch buffer are not needed past this point.
    Example27ThreaderDestroy1(workers);
    free(scratchRaw);

    Example27Net* result = malloc(sizeof *result);
    if (__builtin_expect(result == 0, 0)) {
        char* oomMsg = Example27Errmsg1(__LINE__, "errno %d", errno);
        free(netRaw);
        return oomMsg;
    }
    result->alloc1 = netRaw;
    result->align1 = netBase;
    *net1 = result;
    return 0;
}

// State held by an inference engine.
struct Example27Engine {
    // Net passed to Example27EngineCreate; Example27EngineDestroy does not
    // free it.
    Example27Net* net3;
    // Thread team created for this engine and destroyed along with it.
    Example27ThreaderTeam1* team11;
    // Working buffer exactly as returned by malloc; this is what gets freed.
    char* alloc2;
    // alloc2 rounded up to the next 64-byte boundary.
    char* align2;
};

// Looks up a pthread_t for one of the engine's threads by delegating to
// Example27ThreaderPthreadT1 with the engine's team and the index idx2.
// The handle is written through to1; the delegate's return value is
// passed back unchanged.
char* Example27EnginePthreadT(
    Example27Engine* eng2,
    ptrdiff_t idx2,
    pthread_t* to1
) {
    Example27ThreaderTeam1* workers = eng2->team11;
    char* outcome = Example27ThreaderPthreadT1(to1, workers, idx2);
    return outcome;
}

// Tears down an engine: destroys its thread team, frees its working
// buffer (alloc2), then frees the engine object. The net the engine
// refers to is not freed here. A null eng3 is accepted and ignored,
// mirroring free(), so cleanup paths can call this unconditionally.
void Example27EngineDestroy(Example27Engine* eng3) {
    if (!eng3) {
        return;
    }
    Example27ThreaderDestroy1(eng3->team11);
    free(eng3->alloc2);
    free(eng3);
}

// Creates an inference engine bound to an existing net.
//
// eng4:     receives the new engine on success; left untouched on failure.
// net4:     stored in the engine as-is (the pointer is kept, not copied).
// threads2: forwarded to Example27ThreaderCreate1 for the engine's team.
//
// Returns 0 on success. On failure returns the message produced by
// Example27Errmsg1 or Example27ThreaderCreate1, after freeing whatever
// was allocated here.
char* Example27EngineCreate(
    Example27Engine** eng4,
    Example27Net* net4,
    ptrdiff_t threads2
) {
    Example27Engine* engine = malloc(sizeof *engine);
    if (__builtin_expect(engine == 0, 0)) {
        return Example27Errmsg1(__LINE__, "errno %d", errno);
    }

    // Working buffer; the engine keeps both the raw pointer (for free)
    // and a 64-byte aligned view of it.
    char* scratch = malloc(2734463);
    if (__builtin_expect(scratch == 0, 0)) {
        char* oomMsg = Example27Errmsg1(__LINE__, "errno %d", errno);
        free(engine);
        return oomMsg;
    }
    engine->alloc2 = scratch;
    engine->align2 = (void*)(((size_t)scratch+63)&-64);

    char* teamErr = Example27ThreaderCreate1(&engine->team11, threads2);
    if (__builtin_expect(teamErr != 0, 0)) {
        free(engine);
        free(scratch);
        return teamErr;
    }

    engine->net3 = net4;
    *eng4 = engine;
    return 0;
}

// Runs one inference pass on the engine's thread team.
//
// Two steps are executed in order:
//   1. Example27OneArrangeDats1 with in1Data, the net buffer at offset 0,
//      in2Data and the engine's working buffer.
//   2. Example27OneApply1 with the net buffer at offset 13952, the same
//      working buffer, in3Data, the net buffer at offset 13696, and
//      bn4Data.
// NOTE(review): bn4Data is presumably the output tensor and in1Data,
// in2Data, in3Data the inputs, going by the graph's tensor names --
// confirm against the header.
void Example27EngineInference(
    Example27Engine* eng1,
    float* bn4Data,
    float* in1Data,
    float* in2Data,
    float* in3Data
) {
    Example27ThreaderTeam1* workers = eng1->team11;
    char* netBase = eng1->net3->align1;
    char* scratch = eng1->align2;

    // Regions of the net buffer, at the same byte offsets used when the
    // net was created.
    char* netAt0 = netBase+0;
    char* netAt13696 = netBase+13696;
    char* netAt13952 = netBase+13952;

    char* arrangeArgs[] = {
        (char*)in1Data,
        netAt0,
        (char*)in2Data,
        scratch+0
    };
    Example27OneArrangeDats1(workers, arrangeArgs);

    char* applyArgs[] = {
        netAt13952,
        scratch+0,
        (char*)in3Data,
        netAt13696,
        (char*)bn4Data
    };
    Example27OneApply1(workers, applyArgs);
}

// End of file.

Top