NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=Example2 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=in Channels=2373 Height=22 Width=58
Conv FromTensor=in ToTensor=out ToChannels=1830 FilterH=3 FilterW=6 StrideH=3 StrideW=4 PaddingH=1 PaddingW=3 DilationH=3 DilationW=1 Groups=3
Output FromTensor=out

Top || Output Example2.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(Example2Params);
// Example2Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "r");
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

typedef struct Example2Params Example2Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// Example2Params* params = malloc(sizeof(Example2Params));
//
// ... Load params (read from a file, perhaps) ...
//
// Example2Net* net; // For example, 4 threads:
// char* err = Example2NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// Example2NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

// Opaque handle to the inference-ready form of the trained parameters.
typedef struct Example2Net Example2Net;

// Builds a Net from the given Params using `threads` temporary threads.
// On success returns 0 and stores the new Net through the first argument.
// On failure returns a heap-allocated message that the caller must free,
// and leaves the first argument's target unmodified.
char* Example2NetCreate(
Example2Net**,
Example2Params*,
ptrdiff_t threads
);

// Frees a Net. Call only after all inference that depends on it is done.
void Example2NetDestroy(Example2Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// Example2Net* net;
//
// ... Create net ...
//
// Example2Engine* engine; // For example, 4 inference threads:
// char* err = Example2EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// Example2EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = Example2EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* inData = malloc(sizeof(float)*2373*22*58);
// float* outData = malloc(sizeof(float)*1830*6*15);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// Example2EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// inData, // The tensor arguments are sorted by name.
// outData
// );
//
// ... Read the output floats ...
//
// }
//
// free(inData);
// free(outData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

// Opaque handle that owns inference threads, scratch memory, and a
// pointer to a Net.
typedef struct Example2Engine Example2Engine;

// Creates an Engine with `threads` inference threads bound to the given
// Net. On success returns 0 and stores the Engine through the first
// argument. On failure returns a heap-allocated message that the caller
// must free, and leaves the first argument's target unmodified.
char* Example2EngineCreate(
Example2Engine**,
Example2Net*,
ptrdiff_t threads
);

// Copies the pthread_t of inference thread `threadIdx` into `*to`.
// Returns 0 on success, or a heap-allocated message (caller frees) when
// the thread index is invalid.
char* Example2EnginePthreadT(
Example2Engine*,
ptrdiff_t threadIdx,
pthread_t* to
);

// Runs one inference. `inData` is the packed CHW input (2373x22x58
// floats) and `outData` is the packed CHW output (1830x6x15 floats).
// Both are owned by the caller. This function cannot fail.
void Example2EngineInference(
Example2Engine*,
float* inData,
float* outData
);

// Terminates the Engine's threads and frees its memory. The Net is not
// destroyed here; the caller destroys it separately.
void Example2EngineDestroy(Example2Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

// Trained parameters of the single Conv layer that produces tensor "out".
// The struct is packed so that its in-memory layout is exactly the two
// float arrays back to back (1830 + 26055540 floats), which is what
// allows it to be loaded with a single fread as described above.
struct Example2Params {
float outBiases[1830]; // 1x1830x1x1
float outWeights[26055540]; // 1830x791x3x6
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output Example2.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f Example2.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "Example2.h"

// Builds a heap-allocated error message of the form
//
//     "Example2: line <lineNum1>: <text formatted from format1 and varargs>"
//
// The message is truncated so that it always fits (including the
// terminating NUL) in a fixed 276-byte buffer.
//
// lineNum1: source line number to embed (callers pass __LINE__).
// format1:  printf-style format string for the remainder of the message.
//
// Returns a non-null string that the caller must release with free().
//
// Callers throughout this file treat a null return as "no error", so an
// allocation failure here cannot be reported through the return value.
// The process is aborted in that case instead of formatting through a
// null pointer.
static char* Example2Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
    enum { capacity = 276 };
    char* msg1 = malloc(capacity);
    if (!msg1) {
        abort();
    }
    // Bounded write of the prefix; the prefix is far shorter than the
    // buffer, so step1 is the exact number of characters written.
    int step1 = snprintf(msg1, capacity, "Example2: line %td: ", lineNum1);
    if (step1 < 0) {
        // Encoding error: fall back to an empty prefix.
        step1 = 0;
        msg1[0] = '\0';
    }
    va_list ap1;
    va_start(ap1, format1);
    vsnprintf(msg1+step1, capacity-step1, format1, ap1);
    va_end(ap1);
    return msg1;
}

// Forward declarations for the thread-team ("Threader") types below.
typedef struct Example2ThreaderTask1 Example2ThreaderTask1;
// Work function: called once per point with the task and the point's
// coordinates (an array of int64_t).
typedef void (*Example2ThreaderCallee1)(Example2ThreaderTask1*, int64_t*);
typedef struct Example2ThreaderHub1 Example2ThreaderHub1;
typedef struct Example2ThreaderNode1 Example2ThreaderNode1;
typedef struct Example2ThreaderUnwind1 Example2ThreaderUnwind1;
typedef struct Example2ThreaderTeam1 Example2ThreaderTeam1;

// Describes one parallel job: a callee that is invoked for every point
// of an nd1-dimensional index space.
struct Example2ThreaderTask1 {
Example2ThreaderCallee1 callee1; // Function invoked once per point.
void* any1; // Opaque payload; read by the callee, not by the threader.
ptrdiff_t nd1; // Number of dimensions in use (at most 4).
int64_t hull1[4]; // Extent of each dimension; the point count is their product.
};

// Shared coordination state for a team. The fields after cond1 are read
// and written while holding mut1.
struct Example2ThreaderHub1 {
pthread_mutex_t mut1; // Guards the fields below.
pthread_cond_t cond1; // Signaled when pending1 drops to zero.
ptrdiff_t pending1; // Number of worker threads that have not yet reported completion.
ptrdiff_t offset1; // Index of the status1 word where the next scan starts.
long mask1; // Bit mask applied to status1[offset1] when the next scan starts.
long status1[]; // One bit per node, plus the bit at index nt which acts as an end marker.
};

// Per-thread state. np1, pt1 and task1 are accessed while holding mut2.
// The struct is aligned to 64 bytes.
struct Example2ThreaderNode1 {
pthread_mutex_t mut2; // Guards np1, pt1 and task1.
int64_t np1; // Points still to be run from this node; negative requests thread exit.
int64_t pt1[4]; // Coordinates of the next point to run.
Example2ThreaderTask1* task1; // Posted task, or 0 when the thread should keep waiting.
pthread_cond_t cond2; // Signaled after task1 has been set.
Example2ThreaderTeam1* team1; // Back pointer to the owning team.
pthread_t thr1; // Handle of the thread running this node.
} __attribute__((aligned(64)));

// Records how far team construction got, so that the destroy routine
// releases only what was actually created.
struct Example2ThreaderUnwind1 {
ptrdiff_t join1; // Number of leading nodes whose thread was created (and must be joined).
ptrdiff_t nodeConds1; // Number of leading nodes whose condition variable was initialized.
ptrdiff_t nodeMuts1; // Number of leading nodes whose mutex was initialized.
ptrdiff_t hubCond1; // Nonzero once the hub condition variable is initialized.
ptrdiff_t hubMut1; // Nonzero once the hub mutex is initialized.
void* nodes1; // Raw (unaligned) allocation backing the node array, passed to free().
void* hub1; // Raw (unaligned) allocation backing the hub, passed to free().
};

// A team of worker threads together with its shared hub.
struct Example2ThreaderTeam1 {
ptrdiff_t nt1; // Number of threads (nodes) in the team.
Example2ThreaderHub1* hub2; // 64-byte-aligned pointer into the hub allocation.
Example2ThreaderNode1* nodes2; // 64-byte-aligned pointer to the array of nt1 nodes.
Example2ThreaderUnwind1 unwind1; // Teardown bookkeeping.
};

// Advances the mixed-radix coordinate pt2 by one step in place.
//
// nd2:   number of dimensions in use.
// hull2: extent of each dimension.
// pt2:   coordinate to advance; dimension 0 varies fastest.
//
// A dimension that reaches its extent is reset to zero and the increment
// carries into the next dimension. When every dimension wraps, the
// coordinate ends up as all zeros.
static void Example2ThreaderInc1(
    ptrdiff_t nd2,
    int64_t*restrict hull2,
    int64_t*restrict pt2
) {
    ptrdiff_t dim = 0;
    while (dim < nd2) {
        int64_t next = pt2[dim]+1;
        if (next != hull2[dim]) {
            // No wrap in this dimension: store and stop carrying.
            pt2[dim] = next;
            return;
        }
        pt2[dim] = 0;
        ++dim;
    }
}

// Writes the linear index val1 into pt3 as a mixed-radix coordinate.
//
// nd3:   number of dimensions in use.
// hull3: extent (radix) of each dimension; dimension 0 varies fastest.
// pt3:   output coordinate; all nd3 entries are written.
// val1:  linear index to decompose.
//
// Digits are produced only while the remaining value is nonzero. The
// rest of the coordinate is filled with zeros. Any value left over after
// the last dimension is discarded.
static void Example2ThreaderPut1(
    ptrdiff_t nd3,
    int64_t*restrict hull3,
    int64_t*restrict pt3,
    int64_t val1
) {
    ptrdiff_t dim = 0;
    while (dim < nd3 && val1 != 0) {
        int64_t radix = hull3[dim];
        int64_t quotient = val1/radix;
        pt3[dim] = val1%radix;
        val1 = quotient;
        ++dim;
    }
    while (dim < nd3) {
        pt3[dim] = 0;
        ++dim;
    }
}

// Adds the mixed-radix coordinate plus1 (and an initial carry) to pt4 in
// place.
//
// nd4:    number of dimensions in use.
// hull4:  extent (radix) of each dimension; dimension 0 varies fastest.
// pt4:    coordinate that is updated.
// plus1:  coordinate to add.
// carry2: carry into dimension 0.
//
// At most one extent is subtracted per dimension, so the result stays in
// range provided each per-dimension sum is below twice the extent. A
// carry out of the last dimension is dropped.
static void Example2ThreaderAdd1(
    ptrdiff_t nd4,
    int64_t*restrict hull4,
    int64_t*restrict pt4,
    int64_t*restrict plus1,
    int64_t carry2
) {
    int64_t carry = carry2;
    for (ptrdiff_t dim = 0; dim < nd4; ++dim) {
        int64_t radix = hull4[dim];
        int64_t total = pt4[dim]+plus1[dim]+carry;
        carry = total >= radix;
        pt4[dim] = carry ? total-radix : total;
    }
}

// Worker thread entry point. arg1 is the Example2ThreaderNode1 that this
// thread serves. The thread waits on its node's condition variable until
// a task is posted, runs the points assigned to its own node, then scans
// the hub's status bits and helps run points that remain on other nodes.
// Once no node with remaining work is found it decrements the hub's
// pending count, signals the hub when that count reaches zero, and goes
// back to waiting. Returns 0 when woken with a negative point count.
//
// Every pthread call is wrapped in a loop that retries until it returns 0.
static void* Example2ThreaderMain1(void* arg1) {
Example2ThreaderNode1* node1 = arg1;
Example2ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
Example2ThreaderHub1* hub3 = team2->hub2;
Example2ThreaderNode1* nodes3 = team2->nodes2;
// Index of this node within the team's node array.
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
// The node mutex is held at the top of every iteration of this loop.
for (; ; ) {
Example2ThreaderTask1* task2 = node1->task1;
if (!task2) {
// Nothing posted yet: sleep until signaled, then re-check.
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
// Negative point count is the shutdown request.
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
// Consume the posted task so that the next wait blocks again.
node1->task1 = 0;
Example2ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Run this node's own points. Each point is claimed under the node
// mutex (copy the coordinate, decrement the count, advance the
// coordinate) and the callee is invoked with the mutex released.
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
Example2ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// Clear this node's status bit: its own points have all been claimed.
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
// Scan the status bits (hub mutex held) for a node to help.
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
// No candidate in this word: move to the next word, unmasked.
++offset2;
mask2 = -1;
continue;
}
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
// Reached the end-marker bit at index nt2. Restart once from the
// beginning; if the restarted scan also reaches it, stop.
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
// Isolate the lowest set bit, i.e. the bit of the chosen target.
hand1 &= -hand1;
// Publish the scan position with the chosen bit removed from the mask.
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
Example2ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
// Run the target node's remaining points, claiming each one under
// that node's mutex exactly as in the loop above.
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
Example2ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// The target has no points left: clear its status bit, then resume
// scanning from the position currently stored in the hub.
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// Report completion; the thread that brings the count to zero signals
// the hub condition variable after releasing the hub mutex.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
// Re-acquire the node mutex before looping back to wait for more work.
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Tears down a team, including one that was only partially constructed.
//
// team3: team to destroy; a null pointer is accepted and ignored.
//
// The counts in team3->unwind1 say how many node threads, node condition
// variables and node mutexes exist, and whether the hub's primitives were
// initialized. Only those are released. Every pthread call is retried
// until it returns 0.
static void Example2ThreaderDestroy1(Example2ThreaderTeam1* team3) {
    if (team3 == 0) {
        return;
    }
    Example2ThreaderNode1* const workers = team3->nodes2;

    // Ask every running thread to exit: a negative point count together
    // with a non-null task pointer is the shutdown request.
    const ptrdiff_t started = team3->unwind1.join1;
    for (ptrdiff_t k = 0; k < started; ++k) {
        Example2ThreaderNode1* w = workers+k;
        while (__builtin_expect(pthread_mutex_lock(&w->mut2), 0)) {}
        w->np1 = -1;
        w->task1 = (Example2ThreaderTask1*)1;
        while (__builtin_expect(pthread_mutex_unlock(&w->mut2), 0)) {}
        while (__builtin_expect(pthread_cond_signal(&w->cond2), 0)) {}
    }
    for (ptrdiff_t k = 0; k < started; ++k) {
        while (__builtin_expect(pthread_join(workers[k].thr1, 0), 0)) {}
    }

    // Destroy the per-node synchronization objects that were initialized.
    const ptrdiff_t condCount = team3->unwind1.nodeConds1;
    for (ptrdiff_t k = 0; k < condCount; ++k) {
        while (__builtin_expect(pthread_cond_destroy(&workers[k].cond2), 0)) {}
    }
    const ptrdiff_t mutCount = team3->unwind1.nodeMuts1;
    for (ptrdiff_t k = 0; k < mutCount; ++k) {
        while (__builtin_expect(pthread_mutex_destroy(&workers[k].mut2), 0)) {}
    }

    // Destroy the hub's synchronization objects, if they were initialized.
    Example2ThreaderHub1* const hub = team3->hub2;
    if (team3->unwind1.hubCond1) {
        while (__builtin_expect(pthread_cond_destroy(&hub->cond1), 0)) {}
    }
    if (team3->unwind1.hubMut1) {
        while (__builtin_expect(pthread_mutex_destroy(&hub->mut1), 0)) {}
    }

    // Release the raw allocations, then the team itself.
    free(team3->unwind1.nodes1);
    free(team3->unwind1.hub1);
    free(team3);
}

// Final construction step: initializes each node's mutex and condition
// variable and starts its thread.
//
// team8: team whose node array has already been allocated.
// nt7:   number of nodes to set up.
//
// Returns 0 on success, or a heap-allocated error message. In both cases
// team8->unwind1 records how many mutexes, condition variables and
// threads were created, so that the destroy routine can undo exactly
// that much.
static char* Example2ThreaderCreate1Up4(Example2ThreaderTeam1* team8, ptrdiff_t nt7) {
    Example2ThreaderNode1* const base = team8->nodes2;
    for (ptrdiff_t k = 0; k != nt7; ++k) {
        Example2ThreaderNode1* cur = base+k;

        int mutRc = pthread_mutex_init(&cur->mut2, 0);
        if (__builtin_expect(mutRc, 0)) {
            char* why = Example2Errmsg1(__LINE__, "errno %d", mutRc);
            // Nothing was created for node k.
            team8->unwind1.nodeMuts1 = k;
            team8->unwind1.nodeConds1 = k;
            team8->unwind1.join1 = k;
            return why;
        }

        // No task yet: the new thread will wait on the condition variable.
        cur->task1 = 0;

        int condRc = pthread_cond_init(&cur->cond2, 0);
        if (__builtin_expect(condRc, 0)) {
            char* why = Example2Errmsg1(__LINE__, "errno %d", condRc);
            // Node k has a mutex but no condition variable.
            team8->unwind1.nodeMuts1 = k+1;
            team8->unwind1.nodeConds1 = k;
            team8->unwind1.join1 = k;
            return why;
        }

        cur->team1 = team8;

        int thrRc = pthread_create(&cur->thr1, 0, Example2ThreaderMain1, cur);
        if (__builtin_expect(thrRc, 0)) {
            char* why = Example2Errmsg1(__LINE__, "errno %d", thrRc);
            // Node k has a mutex and a condition variable but no thread.
            team8->unwind1.nodeMuts1 = k+1;
            team8->unwind1.nodeConds1 = k+1;
            team8->unwind1.join1 = k;
            return why;
        }
    }
    team8->unwind1.nodeMuts1 = nt7;
    team8->unwind1.nodeConds1 = nt7;
    team8->unwind1.join1 = nt7;
    return 0;
}

// Construction step 3: initializes the hub's mutex and condition variable,
// then continues with per-node setup.
//
// team7: team whose hub has already been allocated.
// nt6:   number of threads, forwarded to the next step.
//
// Returns 0 on success, or a heap-allocated error message. The unwind
// flags are set only after the corresponding init call succeeds.
static char* Example2ThreaderCreate1Up3(Example2ThreaderTeam1* team7, ptrdiff_t nt6) {
    Example2ThreaderHub1* const hub = team7->hub2;

    int mutRc = pthread_mutex_init(&hub->mut1, 0);
    if (__builtin_expect(mutRc, 0)) {
        return Example2Errmsg1(__LINE__, "errno %d", mutRc);
    }
    team7->unwind1.hubMut1 = 1;

    int condRc = pthread_cond_init(&hub->cond1, 0);
    if (__builtin_expect(condRc, 0)) {
        return Example2Errmsg1(__LINE__, "errno %d", condRc);
    }
    team7->unwind1.hubCond1 = 1;

    return Example2ThreaderCreate1Up4(team7, nt6);
}

// Construction step 2: allocates the node array, then continues with hub
// initialization.
//
// team6: team being built.
// nt5:   number of nodes to allocate.
//
// Returns 0 on success, or a heap-allocated error message. The raw
// allocation is recorded in unwind1.nodes1 for later free(); nodes2 gets
// the same address rounded up to a multiple of 64.
static char* Example2ThreaderCreate1Up2(Example2ThreaderTeam1* team6, ptrdiff_t nt5) {
    const size_t nodeBytes = sizeof(Example2ThreaderNode1);
    size_t total = nt5*nodeBytes;
    // Dividing back detects wrap-around of the multiplication.
    if (__builtin_expect(total/nodeBytes != (size_t)nt5, 0)) {
        return Example2Errmsg1(__LINE__, "too many threads");
    }
    // 63 spare bytes leave room to round the start up to 64-byte alignment.
    void* raw = malloc(total+63);
    if (__builtin_expect(raw == 0, 0)) {
        return Example2Errmsg1(__LINE__, "errno %d", errno);
    }
    team6->unwind1.nodes1 = raw;
    size_t aligned = ((size_t)raw+63)&~(size_t)63;
    team6->nodes2 = (void*)aligned;
    return Example2ThreaderCreate1Up3(team6, nt5);
}

// Construction step 1: records the thread count and allocates the hub,
// then continues with node allocation.
//
// team5: zero-initialized team being built.
// nt4:   number of threads.
//
// Returns 0 on success, or a heap-allocated error message. The hub is
// sized for its fixed fields plus nt4/(bits per long)+1 status words,
// rounded up to a multiple of 64 bytes. The raw allocation is recorded in
// unwind1.hub1 for later free(); hub2 gets the address rounded up to a
// multiple of 64.
static char* Example2ThreaderCreate1Up1(Example2ThreaderTeam1* team5, ptrdiff_t nt4) {
    team5->nt1 = nt4;

    const size_t bitsPerWord = sizeof(long)*8;
    const size_t words = (size_t)nt4/bitsPerWord+1;
    size_t bytes = sizeof(Example2ThreaderHub1)+sizeof(long)*words;
    bytes = (bytes+63)&~(size_t)63;

    // 63 spare bytes leave room to round the start up to 64-byte alignment.
    void* raw = malloc(bytes+63);
    if (__builtin_expect(raw == 0, 0)) {
        return Example2Errmsg1(__LINE__, "errno %d", errno);
    }
    team5->unwind1.hub1 = raw;
    size_t aligned = ((size_t)raw+63)&~(size_t)63;
    team5->hub2 = (void*)aligned;
    return Example2ThreaderCreate1Up2(team5, nt4);
}

// Creates a team of nt3 worker threads.
//
// team4: receives the new team on success; left untouched on failure.
// nt3:   number of threads; must be at least 1.
//
// Returns 0 on success, or a heap-allocated error message that the caller
// must free. On failure everything that was partially built is destroyed
// before returning.
static char* Example2ThreaderCreate1(Example2ThreaderTeam1** team4, ptrdiff_t nt3) {
    if (__builtin_expect(nt3 < 1, 0)) {
        return Example2Errmsg1(__LINE__, "too few threads");
    }
    // Zero-initialized so that the unwind bookkeeping starts out empty.
    Example2ThreaderTeam1* fresh = calloc(1, sizeof(Example2ThreaderTeam1));
    if (__builtin_expect(fresh == 0, 0)) {
        return Example2Errmsg1(__LINE__, "errno %d", errno);
    }
    char* failure = Example2ThreaderCreate1Up1(fresh, nt3);
    if (__builtin_expect(failure != 0, 0)) {
        Example2ThreaderDestroy1(fresh);
        return failure;
    }
    *team4 = fresh;
    return 0;
}

// Looks up the pthread_t of one worker thread.
//
// thr2:  receives the thread handle on success; untouched on failure.
// team9: team to query.
// idx1:  thread index; valid values are 0 through team9->nt1-1.
//
// Returns 0 on success, or a heap-allocated error message when idx1 is
// out of range.
static char* Example2ThreaderPthreadT1(
    pthread_t* thr2,
    Example2ThreaderTeam1* team9,
    ptrdiff_t idx1
) {
    int inRange = idx1 >= 0 && idx1 < team9->nt1;
    if (__builtin_expect(inRange, 1)) {
        *thr2 = team9->nodes2[idx1].thr1;
        return 0;
    }
    return Example2Errmsg1(__LINE__, "bad thread idx");
}

// Runs task3 on team10 and returns once every worker thread has reported
// completion. The task's index space (the product of hull1[0..nd1-1]) is
// split into one contiguous range of points per node. The first
// tot1%nt threads get one extra point. Does nothing when nd1 < 1.
//
// Every pthread call is wrapped in a loop that retries until it returns 0.
static void Example2ThreaderDo1(Example2ThreaderTeam1* team10, Example2ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return;
// Total number of points in the index space.
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
// plus2 is the per-node share each1 expressed as a coordinate step.
int64_t plus2[4];
Example2ThreaderPut1(nd6, task3->hull1, plus2, each1);
// pt6 is the starting coordinate of the node currently being set up.
int64_t pt6[4] = {0};
Example2ThreaderHub1* hub6 = team10->hub2;
// The hub mutex is held from here until the wait below, so no worker
// can enter the hub's scan before the hub state has been reset.
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
Example2ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
// The first more1 nodes take one extra point.
int64_t carry3 = i4 < more1;
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
// Advance the start coordinate past this node's range.
Example2ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Reset the scan position and set every status bit, including the
// end-marker bit at index nt8.
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
// Wait until every worker has decremented the pending count.
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

// Approximates exp(x) for 16 packed floats.
//
// The input is clamped to [-87.33654, 88.72284]. It is then scaled by
// 1.442695 (approximately log2(e)) to get t1, which is rounded to the
// nearest integer r1. The remainder f1 = x - r1*ln(2) is formed in two
// fused steps using a two-part constant whose parts sum to approximately
// ln(2). A degree-4 polynomial in f1 is evaluated by Horner's rule, and
// the result is scaled by a power of two by adding an integer, shifted
// into the exponent field, to its bit pattern.
static __m512 Example2Exp1(__m512 x1) {
x1 = _mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f));
x1 = _mm512_min_ps(x1, _mm512_set1_ps(8.872284e+01f));
__m512 t1 = _mm512_mul_ps(x1, _mm512_set1_ps(1.442695e+00f));
__m512 r1 = _mm512_roundscale_ps(t1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
// f1 = x1 - r1*ln(2), subtracting the high part then the low part.
__m512 f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-6.9314575e-01f), x1);
f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-1.4286068e-06f), f1);
// Horner evaluation of the polynomial in f1, highest coefficient first.
__m512 g1 = _mm512_set1_ps(4.194439e-02f);
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(1.6800667e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(4.9999994e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.999569e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.9999964e-01f));
// Convert t1 to integers and shift them to the float exponent position.
// NOTE(review): the conversion uses the current rounding mode; this
// presumably matches r1 under the default round-to-nearest -- confirm.
__m512i y1 = _mm512_slli_epi32(_mm512_cvtps_epi32(t1), 23);
// Integer add on the raw bits adjusts g1's exponent by that integer.
return _mm512_castsi512_ps(_mm512_add_epi32(y1, _mm512_castps_si512(g1)));
}

// Approximates 1/sqrt(x) for 16 packed floats: takes the hardware
// estimate y from _mm512_rsqrt14_ps and refines it with one
// Newton-Raphson step, returning 0.5*y*(3 - x*y*y).
static __m512 Example2Rsqrt1(__m512 x2) {
__m512 y2 = _mm512_rsqrt14_ps(x2);
// z1 = x*y
__m512 z1 = _mm512_mul_ps(x2, y2);
// a1 = 0.5*y
__m512 a1 = _mm512_mul_ps(y2, _mm512_set1_ps(5e-01f));
// b1 = 3 - y*z1 = 3 - x*y*y
__m512 b1 = _mm512_fnmadd_ps(y2, z1, _mm512_set1_ps(3e+00f));
return _mm512_mul_ps(a1, b1);
}

static void Example2LoomArrangeFilts1Callee1(Example2ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
ptrdiff_t b2 = pt7[0];
ptrdiff_t g2 = pt7[1];
ptrdiff_t e1 = 0;
char*restrict arrangedB1 = tensors2[2]+7320*e1;
char*restrict arrangedW1 = tensors2[2]+7320+110019600*e1;
char*restrict wtPtr1 = tensors2[0]+60120*e1;
char*restrict biasPtr1 = tensors2[1];
ptrdiff_t i5 = 1*g2;
ptrdiff_t j1 = 1*b2;
ptrdiff_t jj1 = j1+0;
if (j1 < 37) {
for (; j1 != 37; ++j1) {
__m512 bias1 = _mm512_setzero_ps();
if (!e1) {
bias1 = _mm512_maskz_loadu_ps(65535, biasPtr1-0+2440*i5+64*j1);
}
_mm512_mask_storeu_ps(arrangedB1-0+2440*i5+64*j1, 65535, bias1);
ptrdiff_t c1 = (size_t)(0+16*j1)/6;
switch ((size_t)(0+16*j1)%6) {
case 0: {
ptrdiff_t k1 = 0;
for (; k1 != 791; ++k1) {
__m512 wt1 = _mm512_maskz_loadu_ps(65535, wtPtr1+0+34740720*i5+911232*j1+72*k1);
__m512 wt2 = _mm512_maskz_loadu_ps(65535, wtPtr1+56952+34740720*i5+911232*j1+72*k1);
__m512 wt3 = _mm512_maskz_loadu_ps(65535, wtPtr1+113904+34740720*i5+911232*j1+72*k1);
__m512 wt4 = _mm512_maskz_loadu_ps(65535, wtPtr1+170856+34740720*i5+911232*j1+72*k1);
__m512 wt5 = _mm512_maskz_loadu_ps(65535, wtPtr1+227808+34740720*i5+911232*j1+72*k1);
__m512 wt6 = _mm512_maskz_loadu_ps(65535, wtPtr1+284760+34740720*i5+911232*j1+72*k1);
__m512 wt7 = _mm512_maskz_loadu_ps(65535, wtPtr1+341712+34740720*i5+911232*j1+72*k1);
__m512 wt8 = _mm512_maskz_loadu_ps(65535, wtPtr1+398664+34740720*i5+911232*j1+72*k1);
__m512 wt9 = _mm512_maskz_loadu_ps(65535, wtPtr1+455616+34740720*i5+911232*j1+72*k1);
__m512 wt10 = _mm512_maskz_loadu_ps(65535, wtPtr1+512568+34740720*i5+911232*j1+72*k1);
__m512 wt11 = _mm512_maskz_loadu_ps(65535, wtPtr1+569520+34740720*i5+911232*j1+72*k1);
__m512 wt12 = _mm512_maskz_loadu_ps(65535, wtPtr1+626472+34740720*i5+911232*j1+72*k1);
__m512 wt13 = _mm512_maskz_loadu_ps(65535, wtPtr1+683424+34740720*i5+911232*j1+72*k1);
__m512 wt14 = _mm512_maskz_loadu_ps(65535, wtPtr1+740376+34740720*i5+911232*j1+72*k1);
__m512 wt15 = _mm512_maskz_loadu_ps(65535, wtPtr1+797328+34740720*i5+911232*j1+72*k1);
__m512 wt16 = _mm512_maskz_loadu_ps(65535, wtPtr1+854280+34740720*i5+911232*j1+72*k1);
__m512 tmp1 = _mm512_unpacklo_ps(wt1, wt2);
__m512 tmp2 = _mm512_unpackhi_ps(wt1, wt2);
__m512 tmp3 = _mm512_unpacklo_ps(wt3, wt4);
__m512 tmp4 = _mm512_unpackhi_ps(wt3, wt4);
__m512 tmp5 = _mm512_unpacklo_ps(wt5, wt6);
__m512 tmp6 = _mm512_unpackhi_ps(wt5, wt6);
__m512 tmp7 = _mm512_unpacklo_ps(wt7, wt8);
__m512 tmp8 = _mm512_unpackhi_ps(wt7, wt8);
__m512 tmp9 = _mm512_unpacklo_ps(wt9, wt10);
__m512 tmp10 = _mm512_unpackhi_ps(wt9, wt10);
__m512 tmp11 = _mm512_unpacklo_ps(wt11, wt12);
__m512 tmp12 = _mm512_unpackhi_ps(wt11, wt12);
__m512 tmp13 = _mm512_unpacklo_ps(wt13, wt14);
__m512 tmp14 = _mm512_unpackhi_ps(wt13, wt14);
__m512 tmp15 = _mm512_unpacklo_ps(wt15, wt16);
__m512 tmp16 = _mm512_unpackhi_ps(wt15, wt16);
__m512 tmp17 = _mm512_shuffle_ps(tmp1, tmp3, 68);
__m512 tmp18 = _mm512_shuffle_ps(tmp1, tmp3, 238);
__m512 tmp19 = _mm512_shuffle_ps(tmp2, tmp4, 68);
__m512 tmp20 = _mm512_shuffle_ps(tmp2, tmp4, 238);
__m512 tmp21 = _mm512_shuffle_ps(tmp5, tmp7, 68);
__m512 tmp22 = _mm512_shuffle_ps(tmp5, tmp7, 238);
__m512 tmp23 = _mm512_shuffle_ps(tmp6, tmp8, 68);
__m512 tmp24 = _mm512_shuffle_ps(tmp6, tmp8, 238);
__m512 tmp25 = _mm512_shuffle_ps(tmp9, tmp11, 68);
__m512 tmp26 = _mm512_shuffle_ps(tmp9, tmp11, 238);
__m512 tmp27 = _mm512_shuffle_ps(tmp10, tmp12, 68);
__m512 tmp28 = _mm512_shuffle_ps(tmp10, tmp12, 238);
__m512 tmp29 = _mm512_shuffle_ps(tmp13, tmp15, 68);
__m512 tmp30 = _mm512_shuffle_ps(tmp13, tmp15, 238);
__m512 tmp31 = _mm512_shuffle_ps(tmp14, tmp16, 68);
__m512 tmp32 = _mm512_shuffle_ps(tmp14, tmp16, 238);
__m512 tmp33 = _mm512_shuffle_f32x4(tmp17, tmp21, 136);
__m512 tmp34 = _mm512_shuffle_f32x4(tmp17, tmp21, 221);
__m512 tmp35 = _mm512_shuffle_f32x4(tmp18, tmp22, 136);
__m512 tmp36 = _mm512_shuffle_f32x4(tmp18, tmp22, 221);
__m512 tmp37 = _mm512_shuffle_f32x4(tmp19, tmp23, 136);
__m512 tmp38 = _mm512_shuffle_f32x4(tmp19, tmp23, 221);
__m512 tmp39 = _mm512_shuffle_f32x4(tmp20, tmp24, 136);
__m512 tmp40 = _mm512_shuffle_f32x4(tmp20, tmp24, 221);
__m512 tmp41 = _mm512_shuffle_f32x4(tmp25, tmp29, 136);
__m512 tmp42 = _mm512_shuffle_f32x4(tmp25, tmp29, 221);
__m512 tmp43 = _mm512_shuffle_f32x4(tmp26, tmp30, 136);
__m512 tmp44 = _mm512_shuffle_f32x4(tmp26, tmp30, 221);
__m512 tmp45 = _mm512_shuffle_f32x4(tmp27, tmp31, 136);
__m512 tmp46 = _mm512_shuffle_f32x4(tmp27, tmp31, 221);
__m512 tmp47 = _mm512_shuffle_f32x4(tmp28, tmp32, 136);
__m512 tmp48 = _mm512_shuffle_f32x4(tmp28, tmp32, 221);
wt1 = _mm512_shuffle_f32x4(tmp33, tmp41, 136);
wt9 = _mm512_shuffle_f32x4(tmp33, tmp41, 221);
wt2 = _mm512_shuffle_f32x4(tmp35, tmp43, 136);
wt10 = _mm512_shuffle_f32x4(tmp35, tmp43, 221);
wt3 = _mm512_shuffle_f32x4(tmp37, tmp45, 136);
wt11 = _mm512_shuffle_f32x4(tmp37, tmp45, 221);
wt4 = _mm512_shuffle_f32x4(tmp39, tmp47, 136);
wt12 = _mm512_shuffle_f32x4(tmp39, tmp47, 221);
wt5 = _mm512_shuffle_f32x4(tmp34, tmp42, 136);
wt13 = _mm512_shuffle_f32x4(tmp34, tmp42, 221);
wt6 = _mm512_shuffle_f32x4(tmp36, tmp44, 136);
wt14 = _mm512_shuffle_f32x4(tmp36, tmp44, 221);
wt7 = _mm512_shuffle_f32x4(tmp38, tmp46, 136);
wt15 = _mm512_shuffle_f32x4(tmp38, tmp46, 221);
wt8 = _mm512_shuffle_f32x4(tmp40, tmp48, 136);
wt16 = _mm512_shuffle_f32x4(tmp40, tmp48, 221);
_mm512_mask_storeu_ps(arrangedW1+0+34740720*i5+18984*c1+24*k1, 63, wt1);
_mm512_mask_storeu_ps(arrangedW1+18960+34740720*i5+18984*c1+24*k1, 4032, wt1);
_mm512_mask_storeu_ps(arrangedW1+37920+34740720*i5+18984*c1+24*k1, 61440, wt1);
_mm512_mask_storeu_ps(arrangedW1+11580240+34740720*i5+18984*c1+24*k1, 63, wt2);
_mm512_mask_storeu_ps(arrangedW1+11599200+34740720*i5+18984*c1+24*k1, 4032, wt2);
_mm512_mask_storeu_ps(arrangedW1+11618160+34740720*i5+18984*c1+24*k1, 61440, wt2);
_mm512_mask_storeu_ps(arrangedW1+23160480+34740720*i5+18984*c1+24*k1, 63, wt3);
_mm512_mask_storeu_ps(arrangedW1+23179440+34740720*i5+18984*c1+24*k1, 4032, wt3);
_mm512_mask_storeu_ps(arrangedW1+23198400+34740720*i5+18984*c1+24*k1, 61440, wt3);
_mm512_mask_storeu_ps(arrangedW1+28950600+34740720*i5+18984*c1+24*k1, 63, wt4);
_mm512_mask_storeu_ps(arrangedW1+28969560+34740720*i5+18984*c1+24*k1, 4032, wt4);
_mm512_mask_storeu_ps(arrangedW1+28988520+34740720*i5+18984*c1+24*k1, 61440, wt4);
_mm512_mask_storeu_ps(arrangedW1+1930040+34740720*i5+18984*c1+24*k1, 63, wt5);
_mm512_mask_storeu_ps(arrangedW1+1949000+34740720*i5+18984*c1+24*k1, 4032, wt5);
_mm512_mask_storeu_ps(arrangedW1+1967960+34740720*i5+18984*c1+24*k1, 61440, wt5);
_mm512_mask_storeu_ps(arrangedW1+13510280+34740720*i5+18984*c1+24*k1, 63, wt6);
_mm512_mask_storeu_ps(arrangedW1+13529240+34740720*i5+18984*c1+24*k1, 4032, wt6);
_mm512_mask_storeu_ps(arrangedW1+13548200+34740720*i5+18984*c1+24*k1, 61440, wt6);
_mm512_mask_storeu_ps(arrangedW1+3860080+34740720*i5+18984*c1+24*k1, 63, wt7);
_mm512_mask_storeu_ps(arrangedW1+3879040+34740720*i5+18984*c1+24*k1, 4032, wt7);
_mm512_mask_storeu_ps(arrangedW1+3898000+34740720*i5+18984*c1+24*k1, 61440, wt7);
_mm512_mask_storeu_ps(arrangedW1+15440320+34740720*i5+18984*c1+24*k1, 63, wt8);
_mm512_mask_storeu_ps(arrangedW1+15459280+34740720*i5+18984*c1+24*k1, 4032, wt8);
_mm512_mask_storeu_ps(arrangedW1+15478240+34740720*i5+18984*c1+24*k1, 61440, wt8);
_mm512_mask_storeu_ps(arrangedW1+25090520+34740720*i5+18984*c1+24*k1, 63, wt9);
_mm512_mask_storeu_ps(arrangedW1+25109480+34740720*i5+18984*c1+24*k1, 4032, wt9);
_mm512_mask_storeu_ps(arrangedW1+25128440+34740720*i5+18984*c1+24*k1, 61440, wt9);
_mm512_mask_storeu_ps(arrangedW1+30880640+34740720*i5+18984*c1+24*k1, 63, wt10);
_mm512_mask_storeu_ps(arrangedW1+30899600+34740720*i5+18984*c1+24*k1, 4032, wt10);
_mm512_mask_storeu_ps(arrangedW1+30918560+34740720*i5+18984*c1+24*k1, 61440, wt10);
_mm512_mask_storeu_ps(arrangedW1+5790120+34740720*i5+18984*c1+24*k1, 63, wt11);
_mm512_mask_storeu_ps(arrangedW1+5809080+34740720*i5+18984*c1+24*k1, 4032, wt11);
_mm512_mask_storeu_ps(arrangedW1+5828040+34740720*i5+18984*c1+24*k1, 61440, wt11);
_mm512_mask_storeu_ps(arrangedW1+17370360+34740720*i5+18984*c1+24*k1, 63, wt12);
_mm512_mask_storeu_ps(arrangedW1+17389320+34740720*i5+18984*c1+24*k1, 4032, wt12);
_mm512_mask_storeu_ps(arrangedW1+17408280+34740720*i5+18984*c1+24*k1, 61440, wt12);
_mm512_mask_storeu_ps(arrangedW1+7720160+34740720*i5+18984*c1+24*k1, 63, wt13);
_mm512_mask_storeu_ps(arrangedW1+7739120+34740720*i5+18984*c1+24*k1, 4032, wt13);
_mm512_mask_storeu_ps(arrangedW1+7758080+34740720*i5+18984*c1+24*k1, 61440, wt13);
_mm512_mask_storeu_ps(arrangedW1+19300400+34740720*i5+18984*c1+24*k1, 63, wt14);
_mm512_mask_storeu_ps(arrangedW1+19319360+34740720*i5+18984*c1+24*k1, 4032, wt14);
_mm512_mask_storeu_ps(arrangedW1+19338320+34740720*i5+18984*c1+24*k1, 61440, wt14);
_mm512_mask_storeu_ps(arrangedW1+27020560+34740720*i5+18984*c1+24*k1, 63, wt15);
_mm512_mask_storeu_ps(arrangedW1+27039520+34740720*i5+18984*c1+24*k1, 4032, wt15);
_mm512_mask_storeu_ps(arrangedW1+27058480+34740720*i5+18984*c1+24*k1, 61440, wt15);
_mm512_mask_storeu_ps(arrangedW1+32810680+34740720*i5+18984*c1+24*k1, 63, wt16);
_mm512_mask_storeu_ps(arrangedW1+32829640+34740720*i5+18984*c1+24*k1, 4032, wt16);
_mm512_mask_storeu_ps(arrangedW1+32848600+34740720*i5+18984*c1+24*k1, 61440, wt16);
__m512 wt17 = _mm512_maskz_loadu_ps(3, wtPtr1+64+34740720*i5+911232*j1+72*k1);
__m512 wt18 = _mm512_maskz_loadu_ps(3, wtPtr1+57016+34740720*i5+911232*j1+72*k1);
__m512 wt19 = _mm512_maskz_loadu_ps(3, wtPtr1+113968+34740720*i5+911232*j1+72*k1);
__m512 wt20 = _mm512_maskz_loadu_ps(3, wtPtr1+170920+34740720*i5+911232*j1+72*k1);
__m512 wt21 = _mm512_maskz_loadu_ps(3, wtPtr1+227872+34740720*i5+911232*j1+72*k1);
__m512 wt22 = _mm512_maskz_loadu_ps(3, wtPtr1+284824+34740720*i5+911232*j1+72*k1);
__m512 wt23 = _mm512_maskz_loadu_ps(3, wtPtr1+341776+34740720*i5+911232*j1+72*k1);
__m512 wt24 = _mm512_maskz_loadu_ps(3, wtPtr1+398728+34740720*i5+911232*j1+72*k1);
__m512 wt25 = _mm512_maskz_loadu_ps(3, wtPtr1+455680+34740720*i5+911232*j1+72*k1);
__m512 wt26 = _mm512_maskz_loadu_ps(3, wtPtr1+512632+34740720*i5+911232*j1+72*k1);
__m512 wt27 = _mm512_maskz_loadu_ps(3, wtPtr1+569584+34740720*i5+911232*j1+72*k1);
__m512 wt28 = _mm512_maskz_loadu_ps(3, wtPtr1+626536+34740720*i5+911232*j1+72*k1);
__m512 wt29 = _mm512_maskz_loadu_ps(3, wtPtr1+683488+34740720*i5+911232*j1+72*k1);
__m512 wt30 = _mm512_maskz_loadu_ps(3, wtPtr1+740440+34740720*i5+911232*j1+72*k1);
__m512 wt31 = _mm512_maskz_loadu_ps(3, wtPtr1+797392+34740720*i5+911232*j1+72*k1);
__m512 wt32 = _mm512_maskz_loadu_ps(3, wtPtr1+854344+34740720*i5+911232*j1+72*k1);
__m512 tmp49 = _mm512_unpacklo_ps(wt17, wt18);
__m512 tmp50 = _mm512_unpacklo_ps(wt19, wt20);
__m512 tmp51 = _mm512_unpacklo_ps(wt21, wt22);
__m512 tmp52 = _mm512_unpacklo_ps(wt23, wt24);
__m512 tmp53 = _mm512_unpacklo_ps(wt25, wt26);
__m512 tmp54 = _mm512_unpacklo_ps(wt27, wt28);
__m512 tmp55 = _mm512_unpacklo_ps(wt29, wt30);
__m512 tmp56 = _mm512_unpacklo_ps(wt31, wt32);
__m512 tmp57 = _mm512_shuffle_ps(tmp49, tmp50, 68);
__m512 tmp58 = _mm512_shuffle_ps(tmp49, tmp50, 238);
__m512 tmp59 = _mm512_shuffle_ps(tmp51, tmp52, 68);
__m512 tmp60 = _mm512_shuffle_ps(tmp51, tmp52, 238);
__m512 tmp61 = _mm512_shuffle_ps(tmp53, tmp54, 68);
__m512 tmp62 = _mm512_shuffle_ps(tmp53, tmp54, 238);
__m512 tmp63 = _mm512_shuffle_ps(tmp55, tmp56, 68);
__m512 tmp64 = _mm512_shuffle_ps(tmp55, tmp56, 238);
__m512 tmp65 = _mm512_shuffle_f32x4(tmp57, tmp59, 136);
__m512 tmp66 = _mm512_shuffle_f32x4(tmp58, tmp60, 136);
__m512 tmp67 = _mm512_shuffle_f32x4(tmp61, tmp63, 136);
__m512 tmp68 = _mm512_shuffle_f32x4(tmp62, tmp64, 136);
wt17 = _mm512_shuffle_f32x4(tmp65, tmp67, 136);
wt18 = _mm512_shuffle_f32x4(tmp66, tmp68, 136);
_mm512_mask_storeu_ps(arrangedW1+9650200+34740720*i5+18984*c1+24*k1, 63, wt17);
_mm512_mask_storeu_ps(arrangedW1+9669160+34740720*i5+18984*c1+24*k1, 4032, wt17);
_mm512_mask_storeu_ps(arrangedW1+9688120+34740720*i5+18984*c1+24*k1, 61440, wt17);
_mm512_mask_storeu_ps(arrangedW1+21230440+34740720*i5+18984*c1+24*k1, 63, wt18);
_mm512_mask_storeu_ps(arrangedW1+21249400+34740720*i5+18984*c1+24*k1, 4032, wt18);
_mm512_mask_storeu_ps(arrangedW1+21268360+34740720*i5+18984*c1+24*k1, 61440, wt18);
}
break;
}
case 2: {
// Switch arm that re-packs one 16-row tile of weights from wtPtr1 into
// arrangedW1. In this arm the destination bases start at +8 and every
// transposed vector is split into lane groups of 4, 6 and 6 when stored.
// The value the enclosing switch tests is outside this view.
// NOTE(review): the offsets look like byte offsets (a 16-float load at +0 is
// followed by a 2-float load at +64) — confirm against the declarations of
// wtPtr1/arrangedW1, which are not visible here.
ptrdiff_t k2 = 0;
for (; k2 != 791; ++k2) {
// Full 16-lane loads (mask 65535) from 16 source rows. Consecutive rows are
// 56952 apart; each k2 step advances the source by 72.
__m512 wt33 = _mm512_maskz_loadu_ps(65535, wtPtr1+0+34740720*i5+911232*j1+72*k2);
__m512 wt34 = _mm512_maskz_loadu_ps(65535, wtPtr1+56952+34740720*i5+911232*j1+72*k2);
__m512 wt35 = _mm512_maskz_loadu_ps(65535, wtPtr1+113904+34740720*i5+911232*j1+72*k2);
__m512 wt36 = _mm512_maskz_loadu_ps(65535, wtPtr1+170856+34740720*i5+911232*j1+72*k2);
__m512 wt37 = _mm512_maskz_loadu_ps(65535, wtPtr1+227808+34740720*i5+911232*j1+72*k2);
__m512 wt38 = _mm512_maskz_loadu_ps(65535, wtPtr1+284760+34740720*i5+911232*j1+72*k2);
__m512 wt39 = _mm512_maskz_loadu_ps(65535, wtPtr1+341712+34740720*i5+911232*j1+72*k2);
__m512 wt40 = _mm512_maskz_loadu_ps(65535, wtPtr1+398664+34740720*i5+911232*j1+72*k2);
__m512 wt41 = _mm512_maskz_loadu_ps(65535, wtPtr1+455616+34740720*i5+911232*j1+72*k2);
__m512 wt42 = _mm512_maskz_loadu_ps(65535, wtPtr1+512568+34740720*i5+911232*j1+72*k2);
__m512 wt43 = _mm512_maskz_loadu_ps(65535, wtPtr1+569520+34740720*i5+911232*j1+72*k2);
__m512 wt44 = _mm512_maskz_loadu_ps(65535, wtPtr1+626472+34740720*i5+911232*j1+72*k2);
__m512 wt45 = _mm512_maskz_loadu_ps(65535, wtPtr1+683424+34740720*i5+911232*j1+72*k2);
__m512 wt46 = _mm512_maskz_loadu_ps(65535, wtPtr1+740376+34740720*i5+911232*j1+72*k2);
__m512 wt47 = _mm512_maskz_loadu_ps(65535, wtPtr1+797328+34740720*i5+911232*j1+72*k2);
__m512 wt48 = _mm512_maskz_loadu_ps(65535, wtPtr1+854280+34740720*i5+911232*j1+72*k2);
// 16x16 transpose, stage 1: interleave adjacent row pairs within each
// 128-bit lane.
__m512 tmp69 = _mm512_unpacklo_ps(wt33, wt34);
__m512 tmp70 = _mm512_unpackhi_ps(wt33, wt34);
__m512 tmp71 = _mm512_unpacklo_ps(wt35, wt36);
__m512 tmp72 = _mm512_unpackhi_ps(wt35, wt36);
__m512 tmp73 = _mm512_unpacklo_ps(wt37, wt38);
__m512 tmp74 = _mm512_unpackhi_ps(wt37, wt38);
__m512 tmp75 = _mm512_unpacklo_ps(wt39, wt40);
__m512 tmp76 = _mm512_unpackhi_ps(wt39, wt40);
__m512 tmp77 = _mm512_unpacklo_ps(wt41, wt42);
__m512 tmp78 = _mm512_unpackhi_ps(wt41, wt42);
__m512 tmp79 = _mm512_unpacklo_ps(wt43, wt44);
__m512 tmp80 = _mm512_unpackhi_ps(wt43, wt44);
__m512 tmp81 = _mm512_unpacklo_ps(wt45, wt46);
__m512 tmp82 = _mm512_unpackhi_ps(wt45, wt46);
__m512 tmp83 = _mm512_unpacklo_ps(wt47, wt48);
__m512 tmp84 = _mm512_unpackhi_ps(wt47, wt48);
// Stage 2: combine pairs of interleaved vectors so each 128-bit lane holds
// one element position from four rows (68 takes the low halves, 238 the
// high halves).
__m512 tmp85 = _mm512_shuffle_ps(tmp69, tmp71, 68);
__m512 tmp86 = _mm512_shuffle_ps(tmp69, tmp71, 238);
__m512 tmp87 = _mm512_shuffle_ps(tmp70, tmp72, 68);
__m512 tmp88 = _mm512_shuffle_ps(tmp70, tmp72, 238);
__m512 tmp89 = _mm512_shuffle_ps(tmp73, tmp75, 68);
__m512 tmp90 = _mm512_shuffle_ps(tmp73, tmp75, 238);
__m512 tmp91 = _mm512_shuffle_ps(tmp74, tmp76, 68);
__m512 tmp92 = _mm512_shuffle_ps(tmp74, tmp76, 238);
__m512 tmp93 = _mm512_shuffle_ps(tmp77, tmp79, 68);
__m512 tmp94 = _mm512_shuffle_ps(tmp77, tmp79, 238);
__m512 tmp95 = _mm512_shuffle_ps(tmp78, tmp80, 68);
__m512 tmp96 = _mm512_shuffle_ps(tmp78, tmp80, 238);
__m512 tmp97 = _mm512_shuffle_ps(tmp81, tmp83, 68);
__m512 tmp98 = _mm512_shuffle_ps(tmp81, tmp83, 238);
__m512 tmp99 = _mm512_shuffle_ps(tmp82, tmp84, 68);
__m512 tmp100 = _mm512_shuffle_ps(tmp82, tmp84, 238);
// Stage 3: permute whole 128-bit lanes (136 picks lanes 0 and 2 of each
// operand, 221 picks lanes 1 and 3) to gather eight rows per vector.
__m512 tmp101 = _mm512_shuffle_f32x4(tmp85, tmp89, 136);
__m512 tmp102 = _mm512_shuffle_f32x4(tmp85, tmp89, 221);
__m512 tmp103 = _mm512_shuffle_f32x4(tmp86, tmp90, 136);
__m512 tmp104 = _mm512_shuffle_f32x4(tmp86, tmp90, 221);
__m512 tmp105 = _mm512_shuffle_f32x4(tmp87, tmp91, 136);
__m512 tmp106 = _mm512_shuffle_f32x4(tmp87, tmp91, 221);
__m512 tmp107 = _mm512_shuffle_f32x4(tmp88, tmp92, 136);
__m512 tmp108 = _mm512_shuffle_f32x4(tmp88, tmp92, 221);
__m512 tmp109 = _mm512_shuffle_f32x4(tmp93, tmp97, 136);
__m512 tmp110 = _mm512_shuffle_f32x4(tmp93, tmp97, 221);
__m512 tmp111 = _mm512_shuffle_f32x4(tmp94, tmp98, 136);
__m512 tmp112 = _mm512_shuffle_f32x4(tmp94, tmp98, 221);
__m512 tmp113 = _mm512_shuffle_f32x4(tmp95, tmp99, 136);
__m512 tmp114 = _mm512_shuffle_f32x4(tmp95, tmp99, 221);
__m512 tmp115 = _mm512_shuffle_f32x4(tmp96, tmp100, 136);
__m512 tmp116 = _mm512_shuffle_f32x4(tmp96, tmp100, 221);
// Stage 4: final lane permute. Afterwards wt33..wt48 hold source element
// positions 0..15 respectively, each gathered across the 16 loaded rows.
wt33 = _mm512_shuffle_f32x4(tmp101, tmp109, 136);
wt41 = _mm512_shuffle_f32x4(tmp101, tmp109, 221);
wt34 = _mm512_shuffle_f32x4(tmp103, tmp111, 136);
wt42 = _mm512_shuffle_f32x4(tmp103, tmp111, 221);
wt35 = _mm512_shuffle_f32x4(tmp105, tmp113, 136);
wt43 = _mm512_shuffle_f32x4(tmp105, tmp113, 221);
wt36 = _mm512_shuffle_f32x4(tmp107, tmp115, 136);
wt44 = _mm512_shuffle_f32x4(tmp107, tmp115, 221);
wt37 = _mm512_shuffle_f32x4(tmp102, tmp110, 136);
wt45 = _mm512_shuffle_f32x4(tmp102, tmp110, 221);
wt38 = _mm512_shuffle_f32x4(tmp104, tmp112, 136);
wt46 = _mm512_shuffle_f32x4(tmp104, tmp112, 221);
wt39 = _mm512_shuffle_f32x4(tmp106, tmp114, 136);
wt47 = _mm512_shuffle_f32x4(tmp106, tmp114, 221);
wt40 = _mm512_shuffle_f32x4(tmp108, tmp116, 136);
wt48 = _mm512_shuffle_f32x4(tmp108, tmp116, 221);
// Scatter each transposed vector with three masked stores: mask 15 writes
// lanes 0-3, 1008 writes lanes 4-9, 64512 writes lanes 10-15. The three base
// offsets of a vector are 18960 apart; every address also adds
// 34740720*i5+18984*c1+24*k2.
_mm512_mask_storeu_ps(arrangedW1+8+34740720*i5+18984*c1+24*k2, 15, wt33);
_mm512_mask_storeu_ps(arrangedW1+18968+34740720*i5+18984*c1+24*k2, 1008, wt33);
_mm512_mask_storeu_ps(arrangedW1+37928+34740720*i5+18984*c1+24*k2, 64512, wt33);
_mm512_mask_storeu_ps(arrangedW1+11580248+34740720*i5+18984*c1+24*k2, 15, wt34);
_mm512_mask_storeu_ps(arrangedW1+11599208+34740720*i5+18984*c1+24*k2, 1008, wt34);
_mm512_mask_storeu_ps(arrangedW1+11618168+34740720*i5+18984*c1+24*k2, 64512, wt34);
_mm512_mask_storeu_ps(arrangedW1+23160488+34740720*i5+18984*c1+24*k2, 15, wt35);
_mm512_mask_storeu_ps(arrangedW1+23179448+34740720*i5+18984*c1+24*k2, 1008, wt35);
_mm512_mask_storeu_ps(arrangedW1+23198408+34740720*i5+18984*c1+24*k2, 64512, wt35);
_mm512_mask_storeu_ps(arrangedW1+28950608+34740720*i5+18984*c1+24*k2, 15, wt36);
_mm512_mask_storeu_ps(arrangedW1+28969568+34740720*i5+18984*c1+24*k2, 1008, wt36);
_mm512_mask_storeu_ps(arrangedW1+28988528+34740720*i5+18984*c1+24*k2, 64512, wt36);
_mm512_mask_storeu_ps(arrangedW1+1930048+34740720*i5+18984*c1+24*k2, 15, wt37);
_mm512_mask_storeu_ps(arrangedW1+1949008+34740720*i5+18984*c1+24*k2, 1008, wt37);
_mm512_mask_storeu_ps(arrangedW1+1967968+34740720*i5+18984*c1+24*k2, 64512, wt37);
_mm512_mask_storeu_ps(arrangedW1+13510288+34740720*i5+18984*c1+24*k2, 15, wt38);
_mm512_mask_storeu_ps(arrangedW1+13529248+34740720*i5+18984*c1+24*k2, 1008, wt38);
_mm512_mask_storeu_ps(arrangedW1+13548208+34740720*i5+18984*c1+24*k2, 64512, wt38);
_mm512_mask_storeu_ps(arrangedW1+3860088+34740720*i5+18984*c1+24*k2, 15, wt39);
_mm512_mask_storeu_ps(arrangedW1+3879048+34740720*i5+18984*c1+24*k2, 1008, wt39);
_mm512_mask_storeu_ps(arrangedW1+3898008+34740720*i5+18984*c1+24*k2, 64512, wt39);
_mm512_mask_storeu_ps(arrangedW1+15440328+34740720*i5+18984*c1+24*k2, 15, wt40);
_mm512_mask_storeu_ps(arrangedW1+15459288+34740720*i5+18984*c1+24*k2, 1008, wt40);
_mm512_mask_storeu_ps(arrangedW1+15478248+34740720*i5+18984*c1+24*k2, 64512, wt40);
_mm512_mask_storeu_ps(arrangedW1+25090528+34740720*i5+18984*c1+24*k2, 15, wt41);
_mm512_mask_storeu_ps(arrangedW1+25109488+34740720*i5+18984*c1+24*k2, 1008, wt41);
_mm512_mask_storeu_ps(arrangedW1+25128448+34740720*i5+18984*c1+24*k2, 64512, wt41);
_mm512_mask_storeu_ps(arrangedW1+30880648+34740720*i5+18984*c1+24*k2, 15, wt42);
_mm512_mask_storeu_ps(arrangedW1+30899608+34740720*i5+18984*c1+24*k2, 1008, wt42);
_mm512_mask_storeu_ps(arrangedW1+30918568+34740720*i5+18984*c1+24*k2, 64512, wt42);
_mm512_mask_storeu_ps(arrangedW1+5790128+34740720*i5+18984*c1+24*k2, 15, wt43);
_mm512_mask_storeu_ps(arrangedW1+5809088+34740720*i5+18984*c1+24*k2, 1008, wt43);
_mm512_mask_storeu_ps(arrangedW1+5828048+34740720*i5+18984*c1+24*k2, 64512, wt43);
_mm512_mask_storeu_ps(arrangedW1+17370368+34740720*i5+18984*c1+24*k2, 15, wt44);
_mm512_mask_storeu_ps(arrangedW1+17389328+34740720*i5+18984*c1+24*k2, 1008, wt44);
_mm512_mask_storeu_ps(arrangedW1+17408288+34740720*i5+18984*c1+24*k2, 64512, wt44);
_mm512_mask_storeu_ps(arrangedW1+7720168+34740720*i5+18984*c1+24*k2, 15, wt45);
_mm512_mask_storeu_ps(arrangedW1+7739128+34740720*i5+18984*c1+24*k2, 1008, wt45);
_mm512_mask_storeu_ps(arrangedW1+7758088+34740720*i5+18984*c1+24*k2, 64512, wt45);
_mm512_mask_storeu_ps(arrangedW1+19300408+34740720*i5+18984*c1+24*k2, 15, wt46);
_mm512_mask_storeu_ps(arrangedW1+19319368+34740720*i5+18984*c1+24*k2, 1008, wt46);
_mm512_mask_storeu_ps(arrangedW1+19338328+34740720*i5+18984*c1+24*k2, 64512, wt46);
_mm512_mask_storeu_ps(arrangedW1+27020568+34740720*i5+18984*c1+24*k2, 15, wt47);
_mm512_mask_storeu_ps(arrangedW1+27039528+34740720*i5+18984*c1+24*k2, 1008, wt47);
_mm512_mask_storeu_ps(arrangedW1+27058488+34740720*i5+18984*c1+24*k2, 64512, wt47);
_mm512_mask_storeu_ps(arrangedW1+32810688+34740720*i5+18984*c1+24*k2, 15, wt48);
_mm512_mask_storeu_ps(arrangedW1+32829648+34740720*i5+18984*c1+24*k2, 1008, wt48);
_mm512_mask_storeu_ps(arrangedW1+32848608+34740720*i5+18984*c1+24*k2, 64512, wt48);
// Remainder of each row: mask 3 loads only lanes 0-1 (all other lanes are
// zeroed) starting at +64 within the same 16 rows.
__m512 wt49 = _mm512_maskz_loadu_ps(3, wtPtr1+64+34740720*i5+911232*j1+72*k2);
__m512 wt50 = _mm512_maskz_loadu_ps(3, wtPtr1+57016+34740720*i5+911232*j1+72*k2);
__m512 wt51 = _mm512_maskz_loadu_ps(3, wtPtr1+113968+34740720*i5+911232*j1+72*k2);
__m512 wt52 = _mm512_maskz_loadu_ps(3, wtPtr1+170920+34740720*i5+911232*j1+72*k2);
__m512 wt53 = _mm512_maskz_loadu_ps(3, wtPtr1+227872+34740720*i5+911232*j1+72*k2);
__m512 wt54 = _mm512_maskz_loadu_ps(3, wtPtr1+284824+34740720*i5+911232*j1+72*k2);
__m512 wt55 = _mm512_maskz_loadu_ps(3, wtPtr1+341776+34740720*i5+911232*j1+72*k2);
__m512 wt56 = _mm512_maskz_loadu_ps(3, wtPtr1+398728+34740720*i5+911232*j1+72*k2);
__m512 wt57 = _mm512_maskz_loadu_ps(3, wtPtr1+455680+34740720*i5+911232*j1+72*k2);
__m512 wt58 = _mm512_maskz_loadu_ps(3, wtPtr1+512632+34740720*i5+911232*j1+72*k2);
__m512 wt59 = _mm512_maskz_loadu_ps(3, wtPtr1+569584+34740720*i5+911232*j1+72*k2);
__m512 wt60 = _mm512_maskz_loadu_ps(3, wtPtr1+626536+34740720*i5+911232*j1+72*k2);
__m512 wt61 = _mm512_maskz_loadu_ps(3, wtPtr1+683488+34740720*i5+911232*j1+72*k2);
__m512 wt62 = _mm512_maskz_loadu_ps(3, wtPtr1+740440+34740720*i5+911232*j1+72*k2);
__m512 wt63 = _mm512_maskz_loadu_ps(3, wtPtr1+797392+34740720*i5+911232*j1+72*k2);
__m512 wt64 = _mm512_maskz_loadu_ps(3, wtPtr1+854344+34740720*i5+911232*j1+72*k2);
// Reduced transpose: only the low unpack and the 136 lane permute are
// needed because just two element positions per row are populated.
__m512 tmp117 = _mm512_unpacklo_ps(wt49, wt50);
__m512 tmp118 = _mm512_unpacklo_ps(wt51, wt52);
__m512 tmp119 = _mm512_unpacklo_ps(wt53, wt54);
__m512 tmp120 = _mm512_unpacklo_ps(wt55, wt56);
__m512 tmp121 = _mm512_unpacklo_ps(wt57, wt58);
__m512 tmp122 = _mm512_unpacklo_ps(wt59, wt60);
__m512 tmp123 = _mm512_unpacklo_ps(wt61, wt62);
__m512 tmp124 = _mm512_unpacklo_ps(wt63, wt64);
__m512 tmp125 = _mm512_shuffle_ps(tmp117, tmp118, 68);
__m512 tmp126 = _mm512_shuffle_ps(tmp117, tmp118, 238);
__m512 tmp127 = _mm512_shuffle_ps(tmp119, tmp120, 68);
__m512 tmp128 = _mm512_shuffle_ps(tmp119, tmp120, 238);
__m512 tmp129 = _mm512_shuffle_ps(tmp121, tmp122, 68);
__m512 tmp130 = _mm512_shuffle_ps(tmp121, tmp122, 238);
__m512 tmp131 = _mm512_shuffle_ps(tmp123, tmp124, 68);
__m512 tmp132 = _mm512_shuffle_ps(tmp123, tmp124, 238);
__m512 tmp133 = _mm512_shuffle_f32x4(tmp125, tmp127, 136);
__m512 tmp134 = _mm512_shuffle_f32x4(tmp126, tmp128, 136);
__m512 tmp135 = _mm512_shuffle_f32x4(tmp129, tmp131, 136);
__m512 tmp136 = _mm512_shuffle_f32x4(tmp130, tmp132, 136);
// wt49 / wt50 now hold the first / second loaded element of all 16 rows.
wt49 = _mm512_shuffle_f32x4(tmp133, tmp135, 136);
wt50 = _mm512_shuffle_f32x4(tmp134, tmp136, 136);
// Same three-way masked scatter as above for the two remaining vectors.
_mm512_mask_storeu_ps(arrangedW1+9650208+34740720*i5+18984*c1+24*k2, 15, wt49);
_mm512_mask_storeu_ps(arrangedW1+9669168+34740720*i5+18984*c1+24*k2, 1008, wt49);
_mm512_mask_storeu_ps(arrangedW1+9688128+34740720*i5+18984*c1+24*k2, 64512, wt49);
_mm512_mask_storeu_ps(arrangedW1+21230448+34740720*i5+18984*c1+24*k2, 15, wt50);
_mm512_mask_storeu_ps(arrangedW1+21249408+34740720*i5+18984*c1+24*k2, 1008, wt50);
_mm512_mask_storeu_ps(arrangedW1+21268368+34740720*i5+18984*c1+24*k2, 64512, wt50);
}
break;
}
default: {
// Fallback switch arm that re-packs one 16-row tile of weights from wtPtr1
// into arrangedW1. Here the destination bases start at +16 and every
// transposed vector is split into lane groups of 2, 6, 6 and 2 when stored,
// so each vector needs four masked stores instead of three.
// NOTE(review): the offsets look like byte offsets (a 16-float load at +0 is
// followed by a 2-float load at +64) — confirm against the declarations of
// wtPtr1/arrangedW1, which are not visible here.
ptrdiff_t k3 = 0;
for (; k3 != 791; ++k3) {
// Full 16-lane loads (mask 65535) from 16 source rows. Consecutive rows are
// 56952 apart; each k3 step advances the source by 72.
__m512 wt65 = _mm512_maskz_loadu_ps(65535, wtPtr1+0+34740720*i5+911232*j1+72*k3);
__m512 wt66 = _mm512_maskz_loadu_ps(65535, wtPtr1+56952+34740720*i5+911232*j1+72*k3);
__m512 wt67 = _mm512_maskz_loadu_ps(65535, wtPtr1+113904+34740720*i5+911232*j1+72*k3);
__m512 wt68 = _mm512_maskz_loadu_ps(65535, wtPtr1+170856+34740720*i5+911232*j1+72*k3);
__m512 wt69 = _mm512_maskz_loadu_ps(65535, wtPtr1+227808+34740720*i5+911232*j1+72*k3);
__m512 wt70 = _mm512_maskz_loadu_ps(65535, wtPtr1+284760+34740720*i5+911232*j1+72*k3);
__m512 wt71 = _mm512_maskz_loadu_ps(65535, wtPtr1+341712+34740720*i5+911232*j1+72*k3);
__m512 wt72 = _mm512_maskz_loadu_ps(65535, wtPtr1+398664+34740720*i5+911232*j1+72*k3);
__m512 wt73 = _mm512_maskz_loadu_ps(65535, wtPtr1+455616+34740720*i5+911232*j1+72*k3);
__m512 wt74 = _mm512_maskz_loadu_ps(65535, wtPtr1+512568+34740720*i5+911232*j1+72*k3);
__m512 wt75 = _mm512_maskz_loadu_ps(65535, wtPtr1+569520+34740720*i5+911232*j1+72*k3);
__m512 wt76 = _mm512_maskz_loadu_ps(65535, wtPtr1+626472+34740720*i5+911232*j1+72*k3);
__m512 wt77 = _mm512_maskz_loadu_ps(65535, wtPtr1+683424+34740720*i5+911232*j1+72*k3);
__m512 wt78 = _mm512_maskz_loadu_ps(65535, wtPtr1+740376+34740720*i5+911232*j1+72*k3);
__m512 wt79 = _mm512_maskz_loadu_ps(65535, wtPtr1+797328+34740720*i5+911232*j1+72*k3);
__m512 wt80 = _mm512_maskz_loadu_ps(65535, wtPtr1+854280+34740720*i5+911232*j1+72*k3);
// 16x16 transpose, stage 1: interleave adjacent row pairs within each
// 128-bit lane.
__m512 tmp137 = _mm512_unpacklo_ps(wt65, wt66);
__m512 tmp138 = _mm512_unpackhi_ps(wt65, wt66);
__m512 tmp139 = _mm512_unpacklo_ps(wt67, wt68);
__m512 tmp140 = _mm512_unpackhi_ps(wt67, wt68);
__m512 tmp141 = _mm512_unpacklo_ps(wt69, wt70);
__m512 tmp142 = _mm512_unpackhi_ps(wt69, wt70);
__m512 tmp143 = _mm512_unpacklo_ps(wt71, wt72);
__m512 tmp144 = _mm512_unpackhi_ps(wt71, wt72);
__m512 tmp145 = _mm512_unpacklo_ps(wt73, wt74);
__m512 tmp146 = _mm512_unpackhi_ps(wt73, wt74);
__m512 tmp147 = _mm512_unpacklo_ps(wt75, wt76);
__m512 tmp148 = _mm512_unpackhi_ps(wt75, wt76);
__m512 tmp149 = _mm512_unpacklo_ps(wt77, wt78);
__m512 tmp150 = _mm512_unpackhi_ps(wt77, wt78);
__m512 tmp151 = _mm512_unpacklo_ps(wt79, wt80);
__m512 tmp152 = _mm512_unpackhi_ps(wt79, wt80);
// Stage 2: combine pairs of interleaved vectors so each 128-bit lane holds
// one element position from four rows (68 takes the low halves, 238 the
// high halves).
__m512 tmp153 = _mm512_shuffle_ps(tmp137, tmp139, 68);
__m512 tmp154 = _mm512_shuffle_ps(tmp137, tmp139, 238);
__m512 tmp155 = _mm512_shuffle_ps(tmp138, tmp140, 68);
__m512 tmp156 = _mm512_shuffle_ps(tmp138, tmp140, 238);
__m512 tmp157 = _mm512_shuffle_ps(tmp141, tmp143, 68);
__m512 tmp158 = _mm512_shuffle_ps(tmp141, tmp143, 238);
__m512 tmp159 = _mm512_shuffle_ps(tmp142, tmp144, 68);
__m512 tmp160 = _mm512_shuffle_ps(tmp142, tmp144, 238);
__m512 tmp161 = _mm512_shuffle_ps(tmp145, tmp147, 68);
__m512 tmp162 = _mm512_shuffle_ps(tmp145, tmp147, 238);
__m512 tmp163 = _mm512_shuffle_ps(tmp146, tmp148, 68);
__m512 tmp164 = _mm512_shuffle_ps(tmp146, tmp148, 238);
__m512 tmp165 = _mm512_shuffle_ps(tmp149, tmp151, 68);
__m512 tmp166 = _mm512_shuffle_ps(tmp149, tmp151, 238);
__m512 tmp167 = _mm512_shuffle_ps(tmp150, tmp152, 68);
__m512 tmp168 = _mm512_shuffle_ps(tmp150, tmp152, 238);
// Stage 3: permute whole 128-bit lanes (136 picks lanes 0 and 2 of each
// operand, 221 picks lanes 1 and 3) to gather eight rows per vector.
__m512 tmp169 = _mm512_shuffle_f32x4(tmp153, tmp157, 136);
__m512 tmp170 = _mm512_shuffle_f32x4(tmp153, tmp157, 221);
__m512 tmp171 = _mm512_shuffle_f32x4(tmp154, tmp158, 136);
__m512 tmp172 = _mm512_shuffle_f32x4(tmp154, tmp158, 221);
__m512 tmp173 = _mm512_shuffle_f32x4(tmp155, tmp159, 136);
__m512 tmp174 = _mm512_shuffle_f32x4(tmp155, tmp159, 221);
__m512 tmp175 = _mm512_shuffle_f32x4(tmp156, tmp160, 136);
__m512 tmp176 = _mm512_shuffle_f32x4(tmp156, tmp160, 221);
__m512 tmp177 = _mm512_shuffle_f32x4(tmp161, tmp165, 136);
__m512 tmp178 = _mm512_shuffle_f32x4(tmp161, tmp165, 221);
__m512 tmp179 = _mm512_shuffle_f32x4(tmp162, tmp166, 136);
__m512 tmp180 = _mm512_shuffle_f32x4(tmp162, tmp166, 221);
__m512 tmp181 = _mm512_shuffle_f32x4(tmp163, tmp167, 136);
__m512 tmp182 = _mm512_shuffle_f32x4(tmp163, tmp167, 221);
__m512 tmp183 = _mm512_shuffle_f32x4(tmp164, tmp168, 136);
__m512 tmp184 = _mm512_shuffle_f32x4(tmp164, tmp168, 221);
// Stage 4: final lane permute. Afterwards wt65..wt80 hold source element
// positions 0..15 respectively, each gathered across the 16 loaded rows.
wt65 = _mm512_shuffle_f32x4(tmp169, tmp177, 136);
wt73 = _mm512_shuffle_f32x4(tmp169, tmp177, 221);
wt66 = _mm512_shuffle_f32x4(tmp171, tmp179, 136);
wt74 = _mm512_shuffle_f32x4(tmp171, tmp179, 221);
wt67 = _mm512_shuffle_f32x4(tmp173, tmp181, 136);
wt75 = _mm512_shuffle_f32x4(tmp173, tmp181, 221);
wt68 = _mm512_shuffle_f32x4(tmp175, tmp183, 136);
wt76 = _mm512_shuffle_f32x4(tmp175, tmp183, 221);
wt69 = _mm512_shuffle_f32x4(tmp170, tmp178, 136);
wt77 = _mm512_shuffle_f32x4(tmp170, tmp178, 221);
wt70 = _mm512_shuffle_f32x4(tmp172, tmp180, 136);
wt78 = _mm512_shuffle_f32x4(tmp172, tmp180, 221);
wt71 = _mm512_shuffle_f32x4(tmp174, tmp182, 136);
wt79 = _mm512_shuffle_f32x4(tmp174, tmp182, 221);
wt72 = _mm512_shuffle_f32x4(tmp176, tmp184, 136);
wt80 = _mm512_shuffle_f32x4(tmp176, tmp184, 221);
// Scatter each transposed vector with four masked stores: mask 3 writes
// lanes 0-1, 252 writes lanes 2-7, 16128 writes lanes 8-13, 49152 writes
// lanes 14-15. The four base offsets of a vector are 18960 apart; every
// address also adds 34740720*i5+18984*c1+24*k3.
_mm512_mask_storeu_ps(arrangedW1+16+34740720*i5+18984*c1+24*k3, 3, wt65);
_mm512_mask_storeu_ps(arrangedW1+18976+34740720*i5+18984*c1+24*k3, 252, wt65);
_mm512_mask_storeu_ps(arrangedW1+37936+34740720*i5+18984*c1+24*k3, 16128, wt65);
_mm512_mask_storeu_ps(arrangedW1+56896+34740720*i5+18984*c1+24*k3, 49152, wt65);
_mm512_mask_storeu_ps(arrangedW1+11580256+34740720*i5+18984*c1+24*k3, 3, wt66);
_mm512_mask_storeu_ps(arrangedW1+11599216+34740720*i5+18984*c1+24*k3, 252, wt66);
_mm512_mask_storeu_ps(arrangedW1+11618176+34740720*i5+18984*c1+24*k3, 16128, wt66);
_mm512_mask_storeu_ps(arrangedW1+11637136+34740720*i5+18984*c1+24*k3, 49152, wt66);
_mm512_mask_storeu_ps(arrangedW1+23160496+34740720*i5+18984*c1+24*k3, 3, wt67);
_mm512_mask_storeu_ps(arrangedW1+23179456+34740720*i5+18984*c1+24*k3, 252, wt67);
_mm512_mask_storeu_ps(arrangedW1+23198416+34740720*i5+18984*c1+24*k3, 16128, wt67);
_mm512_mask_storeu_ps(arrangedW1+23217376+34740720*i5+18984*c1+24*k3, 49152, wt67);
_mm512_mask_storeu_ps(arrangedW1+28950616+34740720*i5+18984*c1+24*k3, 3, wt68);
_mm512_mask_storeu_ps(arrangedW1+28969576+34740720*i5+18984*c1+24*k3, 252, wt68);
_mm512_mask_storeu_ps(arrangedW1+28988536+34740720*i5+18984*c1+24*k3, 16128, wt68);
_mm512_mask_storeu_ps(arrangedW1+29007496+34740720*i5+18984*c1+24*k3, 49152, wt68);
_mm512_mask_storeu_ps(arrangedW1+1930056+34740720*i5+18984*c1+24*k3, 3, wt69);
_mm512_mask_storeu_ps(arrangedW1+1949016+34740720*i5+18984*c1+24*k3, 252, wt69);
_mm512_mask_storeu_ps(arrangedW1+1967976+34740720*i5+18984*c1+24*k3, 16128, wt69);
_mm512_mask_storeu_ps(arrangedW1+1986936+34740720*i5+18984*c1+24*k3, 49152, wt69);
_mm512_mask_storeu_ps(arrangedW1+13510296+34740720*i5+18984*c1+24*k3, 3, wt70);
_mm512_mask_storeu_ps(arrangedW1+13529256+34740720*i5+18984*c1+24*k3, 252, wt70);
_mm512_mask_storeu_ps(arrangedW1+13548216+34740720*i5+18984*c1+24*k3, 16128, wt70);
_mm512_mask_storeu_ps(arrangedW1+13567176+34740720*i5+18984*c1+24*k3, 49152, wt70);
_mm512_mask_storeu_ps(arrangedW1+3860096+34740720*i5+18984*c1+24*k3, 3, wt71);
_mm512_mask_storeu_ps(arrangedW1+3879056+34740720*i5+18984*c1+24*k3, 252, wt71);
_mm512_mask_storeu_ps(arrangedW1+3898016+34740720*i5+18984*c1+24*k3, 16128, wt71);
_mm512_mask_storeu_ps(arrangedW1+3916976+34740720*i5+18984*c1+24*k3, 49152, wt71);
_mm512_mask_storeu_ps(arrangedW1+15440336+34740720*i5+18984*c1+24*k3, 3, wt72);
_mm512_mask_storeu_ps(arrangedW1+15459296+34740720*i5+18984*c1+24*k3, 252, wt72);
_mm512_mask_storeu_ps(arrangedW1+15478256+34740720*i5+18984*c1+24*k3, 16128, wt72);
_mm512_mask_storeu_ps(arrangedW1+15497216+34740720*i5+18984*c1+24*k3, 49152, wt72);
_mm512_mask_storeu_ps(arrangedW1+25090536+34740720*i5+18984*c1+24*k3, 3, wt73);
_mm512_mask_storeu_ps(arrangedW1+25109496+34740720*i5+18984*c1+24*k3, 252, wt73);
_mm512_mask_storeu_ps(arrangedW1+25128456+34740720*i5+18984*c1+24*k3, 16128, wt73);
_mm512_mask_storeu_ps(arrangedW1+25147416+34740720*i5+18984*c1+24*k3, 49152, wt73);
_mm512_mask_storeu_ps(arrangedW1+30880656+34740720*i5+18984*c1+24*k3, 3, wt74);
_mm512_mask_storeu_ps(arrangedW1+30899616+34740720*i5+18984*c1+24*k3, 252, wt74);
_mm512_mask_storeu_ps(arrangedW1+30918576+34740720*i5+18984*c1+24*k3, 16128, wt74);
_mm512_mask_storeu_ps(arrangedW1+30937536+34740720*i5+18984*c1+24*k3, 49152, wt74);
_mm512_mask_storeu_ps(arrangedW1+5790136+34740720*i5+18984*c1+24*k3, 3, wt75);
_mm512_mask_storeu_ps(arrangedW1+5809096+34740720*i5+18984*c1+24*k3, 252, wt75);
_mm512_mask_storeu_ps(arrangedW1+5828056+34740720*i5+18984*c1+24*k3, 16128, wt75);
_mm512_mask_storeu_ps(arrangedW1+5847016+34740720*i5+18984*c1+24*k3, 49152, wt75);
_mm512_mask_storeu_ps(arrangedW1+17370376+34740720*i5+18984*c1+24*k3, 3, wt76);
_mm512_mask_storeu_ps(arrangedW1+17389336+34740720*i5+18984*c1+24*k3, 252, wt76);
_mm512_mask_storeu_ps(arrangedW1+17408296+34740720*i5+18984*c1+24*k3, 16128, wt76);
_mm512_mask_storeu_ps(arrangedW1+17427256+34740720*i5+18984*c1+24*k3, 49152, wt76);
_mm512_mask_storeu_ps(arrangedW1+7720176+34740720*i5+18984*c1+24*k3, 3, wt77);
_mm512_mask_storeu_ps(arrangedW1+7739136+34740720*i5+18984*c1+24*k3, 252, wt77);
_mm512_mask_storeu_ps(arrangedW1+7758096+34740720*i5+18984*c1+24*k3, 16128, wt77);
_mm512_mask_storeu_ps(arrangedW1+7777056+34740720*i5+18984*c1+24*k3, 49152, wt77);
_mm512_mask_storeu_ps(arrangedW1+19300416+34740720*i5+18984*c1+24*k3, 3, wt78);
_mm512_mask_storeu_ps(arrangedW1+19319376+34740720*i5+18984*c1+24*k3, 252, wt78);
_mm512_mask_storeu_ps(arrangedW1+19338336+34740720*i5+18984*c1+24*k3, 16128, wt78);
_mm512_mask_storeu_ps(arrangedW1+19357296+34740720*i5+18984*c1+24*k3, 49152, wt78);
_mm512_mask_storeu_ps(arrangedW1+27020576+34740720*i5+18984*c1+24*k3, 3, wt79);
_mm512_mask_storeu_ps(arrangedW1+27039536+34740720*i5+18984*c1+24*k3, 252, wt79);
_mm512_mask_storeu_ps(arrangedW1+27058496+34740720*i5+18984*c1+24*k3, 16128, wt79);
_mm512_mask_storeu_ps(arrangedW1+27077456+34740720*i5+18984*c1+24*k3, 49152, wt79);
_mm512_mask_storeu_ps(arrangedW1+32810696+34740720*i5+18984*c1+24*k3, 3, wt80);
_mm512_mask_storeu_ps(arrangedW1+32829656+34740720*i5+18984*c1+24*k3, 252, wt80);
_mm512_mask_storeu_ps(arrangedW1+32848616+34740720*i5+18984*c1+24*k3, 16128, wt80);
_mm512_mask_storeu_ps(arrangedW1+32867576+34740720*i5+18984*c1+24*k3, 49152, wt80);
// Remainder of each row: mask 3 loads only lanes 0-1 (all other lanes are
// zeroed) starting at +64 within the same 16 rows.
__m512 wt81 = _mm512_maskz_loadu_ps(3, wtPtr1+64+34740720*i5+911232*j1+72*k3);
__m512 wt82 = _mm512_maskz_loadu_ps(3, wtPtr1+57016+34740720*i5+911232*j1+72*k3);
__m512 wt83 = _mm512_maskz_loadu_ps(3, wtPtr1+113968+34740720*i5+911232*j1+72*k3);
__m512 wt84 = _mm512_maskz_loadu_ps(3, wtPtr1+170920+34740720*i5+911232*j1+72*k3);
__m512 wt85 = _mm512_maskz_loadu_ps(3, wtPtr1+227872+34740720*i5+911232*j1+72*k3);
__m512 wt86 = _mm512_maskz_loadu_ps(3, wtPtr1+284824+34740720*i5+911232*j1+72*k3);
__m512 wt87 = _mm512_maskz_loadu_ps(3, wtPtr1+341776+34740720*i5+911232*j1+72*k3);
__m512 wt88 = _mm512_maskz_loadu_ps(3, wtPtr1+398728+34740720*i5+911232*j1+72*k3);
__m512 wt89 = _mm512_maskz_loadu_ps(3, wtPtr1+455680+34740720*i5+911232*j1+72*k3);
__m512 wt90 = _mm512_maskz_loadu_ps(3, wtPtr1+512632+34740720*i5+911232*j1+72*k3);
__m512 wt91 = _mm512_maskz_loadu_ps(3, wtPtr1+569584+34740720*i5+911232*j1+72*k3);
__m512 wt92 = _mm512_maskz_loadu_ps(3, wtPtr1+626536+34740720*i5+911232*j1+72*k3);
__m512 wt93 = _mm512_maskz_loadu_ps(3, wtPtr1+683488+34740720*i5+911232*j1+72*k3);
__m512 wt94 = _mm512_maskz_loadu_ps(3, wtPtr1+740440+34740720*i5+911232*j1+72*k3);
__m512 wt95 = _mm512_maskz_loadu_ps(3, wtPtr1+797392+34740720*i5+911232*j1+72*k3);
__m512 wt96 = _mm512_maskz_loadu_ps(3, wtPtr1+854344+34740720*i5+911232*j1+72*k3);
// Reduced transpose: only the low unpack and the 136 lane permute are
// needed because just two element positions per row are populated.
__m512 tmp185 = _mm512_unpacklo_ps(wt81, wt82);
__m512 tmp186 = _mm512_unpacklo_ps(wt83, wt84);
__m512 tmp187 = _mm512_unpacklo_ps(wt85, wt86);
__m512 tmp188 = _mm512_unpacklo_ps(wt87, wt88);
__m512 tmp189 = _mm512_unpacklo_ps(wt89, wt90);
__m512 tmp190 = _mm512_unpacklo_ps(wt91, wt92);
__m512 tmp191 = _mm512_unpacklo_ps(wt93, wt94);
__m512 tmp192 = _mm512_unpacklo_ps(wt95, wt96);
__m512 tmp193 = _mm512_shuffle_ps(tmp185, tmp186, 68);
__m512 tmp194 = _mm512_shuffle_ps(tmp185, tmp186, 238);
__m512 tmp195 = _mm512_shuffle_ps(tmp187, tmp188, 68);
__m512 tmp196 = _mm512_shuffle_ps(tmp187, tmp188, 238);
__m512 tmp197 = _mm512_shuffle_ps(tmp189, tmp190, 68);
__m512 tmp198 = _mm512_shuffle_ps(tmp189, tmp190, 238);
__m512 tmp199 = _mm512_shuffle_ps(tmp191, tmp192, 68);
__m512 tmp200 = _mm512_shuffle_ps(tmp191, tmp192, 238);
__m512 tmp201 = _mm512_shuffle_f32x4(tmp193, tmp195, 136);
__m512 tmp202 = _mm512_shuffle_f32x4(tmp194, tmp196, 136);
__m512 tmp203 = _mm512_shuffle_f32x4(tmp197, tmp199, 136);
__m512 tmp204 = _mm512_shuffle_f32x4(tmp198, tmp200, 136);
// wt81 / wt82 now hold the first / second loaded element of all 16 rows.
wt81 = _mm512_shuffle_f32x4(tmp201, tmp203, 136);
wt82 = _mm512_shuffle_f32x4(tmp202, tmp204, 136);
// Same four-way masked scatter as above for the two remaining vectors.
_mm512_mask_storeu_ps(arrangedW1+9650216+34740720*i5+18984*c1+24*k3, 3, wt81);
_mm512_mask_storeu_ps(arrangedW1+9669176+34740720*i5+18984*c1+24*k3, 252, wt81);
_mm512_mask_storeu_ps(arrangedW1+9688136+34740720*i5+18984*c1+24*k3, 16128, wt81);
_mm512_mask_storeu_ps(arrangedW1+9707096+34740720*i5+18984*c1+24*k3, 49152, wt81);
_mm512_mask_storeu_ps(arrangedW1+21230456+34740720*i5+18984*c1+24*k3, 3, wt82);
_mm512_mask_storeu_ps(arrangedW1+21249416+34740720*i5+18984*c1+24*k3, 252, wt82);
_mm512_mask_storeu_ps(arrangedW1+21268376+34740720*i5+18984*c1+24*k3, 16128, wt82);
_mm512_mask_storeu_ps(arrangedW1+21287336+34740720*i5+18984*c1+24*k3, 49152, wt82);
}
break;
}
}
if (j1 >= jj1) return;
}
}
if (j1 == 37) {
__m512 bias2 = _mm512_setzero_ps();
if (!e1) {
bias2 = _mm512_maskz_loadu_ps(65535, biasPtr1-0+2440*i5+64*j1);
}
_mm512_mask_storeu_ps(arrangedB1-0+2440*i5+64*j1, 65535, bias2);
ptrdiff_t c2 = (size_t)(0+16*j1)/6;
ptrdiff_t k4 = 0;
for (; k4 != 791; ++k4) {
__m512 wt97 = _mm512_maskz_loadu_ps(65535, wtPtr1+0+34740720*i5+911232*j1+72*k4);
__m512 wt98 = _mm512_maskz_loadu_ps(65535, wtPtr1+56952+34740720*i5+911232*j1+72*k4);
__m512 wt99 = _mm512_maskz_loadu_ps(65535, wtPtr1+113904+34740720*i5+911232*j1+72*k4);
__m512 wt100 = _mm512_maskz_loadu_ps(65535, wtPtr1+170856+34740720*i5+911232*j1+72*k4);
__m512 wt101 = _mm512_maskz_loadu_ps(65535, wtPtr1+227808+34740720*i5+911232*j1+72*k4);
__m512 wt102 = _mm512_maskz_loadu_ps(65535, wtPtr1+284760+34740720*i5+911232*j1+72*k4);
__m512 wt103 = _mm512_maskz_loadu_ps(65535, wtPtr1+341712+34740720*i5+911232*j1+72*k4);
__m512 wt104 = _mm512_maskz_loadu_ps(65535, wtPtr1+398664+34740720*i5+911232*j1+72*k4);
__m512 wt105 = _mm512_maskz_loadu_ps(65535, wtPtr1+455616+34740720*i5+911232*j1+72*k4);
__m512 wt106 = _mm512_maskz_loadu_ps(65535, wtPtr1+512568+34740720*i5+911232*j1+72*k4);
__m512 wt107 = _mm512_maskz_loadu_ps(65535, wtPtr1+569520+34740720*i5+911232*j1+72*k4);
__m512 wt108 = _mm512_maskz_loadu_ps(65535, wtPtr1+626472+34740720*i5+911232*j1+72*k4);
__m512 wt109 = _mm512_maskz_loadu_ps(65535, wtPtr1+683424+34740720*i5+911232*j1+72*k4);
__m512 wt110 = _mm512_maskz_loadu_ps(65535, wtPtr1+740376+34740720*i5+911232*j1+72*k4);
__m512 wt111 = _mm512_maskz_loadu_ps(65535, wtPtr1+797328+34740720*i5+911232*j1+72*k4);
__m512 wt112 = _mm512_maskz_loadu_ps(65535, wtPtr1+854280+34740720*i5+911232*j1+72*k4);
__m512 tmp205 = _mm512_unpacklo_ps(wt97, wt98);
__m512 tmp206 = _mm512_unpackhi_ps(wt97, wt98);
__m512 tmp207 = _mm512_unpacklo_ps(wt99, wt100);
__m512 tmp208 = _mm512_unpackhi_ps(wt99, wt100);
__m512 tmp209 = _mm512_unpacklo_ps(wt101, wt102);
__m512 tmp210 = _mm512_unpackhi_ps(wt101, wt102);
__m512 tmp211 = _mm512_unpacklo_ps(wt103, wt104);
__m512 tmp212 = _mm512_unpackhi_ps(wt103, wt104);
__m512 tmp213 = _mm512_unpacklo_ps(wt105, wt106);
__m512 tmp214 = _mm512_unpackhi_ps(wt105, wt106);
__m512 tmp215 = _mm512_unpacklo_ps(wt107, wt108);
__m512 tmp216 = _mm512_unpackhi_ps(wt107, wt108);
__m512 tmp217 = _mm512_unpacklo_ps(wt109, wt110);
__m512 tmp218 = _mm512_unpackhi_ps(wt109, wt110);
__m512 tmp219 = _mm512_unpacklo_ps(wt111, wt112);
__m512 tmp220 = _mm512_unpackhi_ps(wt111, wt112);
__m512 tmp221 = _mm512_shuffle_ps(tmp205, tmp207, 68);
__m512 tmp222 = _mm512_shuffle_ps(tmp205, tmp207, 238);
__m512 tmp223 = _mm512_shuffle_ps(tmp206, tmp208, 68);
__m512 tmp224 = _mm512_shuffle_ps(tmp206, tmp208, 238);
__m512 tmp225 = _mm512_shuffle_ps(tmp209, tmp211, 68);
__m512 tmp226 = _mm512_shuffle_ps(tmp209, tmp211, 238);
__m512 tmp227 = _mm512_shuffle_ps(tmp210, tmp212, 68);
__m512 tmp228 = _mm512_shuffle_ps(tmp210, tmp212, 238);
__m512 tmp229 = _mm512_shuffle_ps(tmp213, tmp215, 68);
__m512 tmp230 = _mm512_shuffle_ps(tmp213, tmp215, 238);
__m512 tmp231 = _mm512_shuffle_ps(tmp214, tmp216, 68);
__m512 tmp232 = _mm512_shuffle_ps(tmp214, tmp216, 238);
__m512 tmp233 = _mm512_shuffle_ps(tmp217, tmp219, 68);
__m512 tmp234 = _mm512_shuffle_ps(tmp217, tmp219, 238);
__m512 tmp235 = _mm512_shuffle_ps(tmp218, tmp220, 68);
__m512 tmp236 = _mm512_shuffle_ps(tmp218, tmp220, 238);
__m512 tmp237 = _mm512_shuffle_f32x4(tmp221, tmp225, 136);
__m512 tmp238 = _mm512_shuffle_f32x4(tmp221, tmp225, 221);
__m512 tmp239 = _mm512_shuffle_f32x4(tmp222, tmp226, 136);
__m512 tmp240 = _mm512_shuffle_f32x4(tmp222, tmp226, 221);
__m512 tmp241 = _mm512_shuffle_f32x4(tmp223, tmp227, 136);
__m512 tmp242 = _mm512_shuffle_f32x4(tmp223, tmp227, 221);
__m512 tmp243 = _mm512_shuffle_f32x4(tmp224, tmp228, 136);
__m512 tmp244 = _mm512_shuffle_f32x4(tmp224, tmp228, 221);
__m512 tmp245 = _mm512_shuffle_f32x4(tmp229, tmp233, 136);
__m512 tmp246 = _mm512_shuffle_f32x4(tmp229, tmp233, 221);
__m512 tmp247 = _mm512_shuffle_f32x4(tmp230, tmp234, 136);
__m512 tmp248 = _mm512_shuffle_f32x4(tmp230, tmp234, 221);
__m512 tmp249 = _mm512_shuffle_f32x4(tmp231, tmp235, 136);
__m512 tmp250 = _mm512_shuffle_f32x4(tmp231, tmp235, 221);
__m512 tmp251 = _mm512_shuffle_f32x4(tmp232, tmp236, 136);
__m512 tmp252 = _mm512_shuffle_f32x4(tmp232, tmp236, 221);
wt97 = _mm512_shuffle_f32x4(tmp237, tmp245, 136);
wt105 = _mm512_shuffle_f32x4(tmp237, tmp245, 221);
wt98 = _mm512_shuffle_f32x4(tmp239, tmp247, 136);
wt106 = _mm512_shuffle_f32x4(tmp239, tmp247, 221);
wt99 = _mm512_shuffle_f32x4(tmp241, tmp249, 136);
wt107 = _mm512_shuffle_f32x4(tmp241, tmp249, 221);
wt100 = _mm512_shuffle_f32x4(tmp243, tmp251, 136);
wt108 = _mm512_shuffle_f32x4(tmp243, tmp251, 221);
wt101 = _mm512_shuffle_f32x4(tmp238, tmp246, 136);
wt109 = _mm512_shuffle_f32x4(tmp238, tmp246, 221);
wt102 = _mm512_shuffle_f32x4(tmp240, tmp248, 136);
wt110 = _mm512_shuffle_f32x4(tmp240, tmp248, 221);
wt103 = _mm512_shuffle_f32x4(tmp242, tmp250, 136);
wt111 = _mm512_shuffle_f32x4(tmp242, tmp250, 221);
wt104 = _mm512_shuffle_f32x4(tmp244, tmp252, 136);
wt112 = _mm512_shuffle_f32x4(tmp244, tmp252, 221);
_mm512_mask_storeu_ps(arrangedW1+16+34740720*i5+18984*c2+24*k4, 3, wt97);
_mm512_mask_storeu_ps(arrangedW1+18976+34740720*i5+18984*c2+24*k4, 252, wt97);
_mm512_mask_storeu_ps(arrangedW1+37936+34740720*i5+18984*c2+24*k4, 16128, wt97);
_mm512_mask_storeu_ps(arrangedW1+56896+34740720*i5+18984*c2+16*k4, 49152, wt97);
_mm512_mask_storeu_ps(arrangedW1+11580256+34740720*i5+18984*c2+24*k4, 3, wt98);
_mm512_mask_storeu_ps(arrangedW1+11599216+34740720*i5+18984*c2+24*k4, 252, wt98);
_mm512_mask_storeu_ps(arrangedW1+11618176+34740720*i5+18984*c2+24*k4, 16128, wt98);
_mm512_mask_storeu_ps(arrangedW1+11637136+34740720*i5+18984*c2+16*k4, 49152, wt98);
_mm512_mask_storeu_ps(arrangedW1+23160496+34740720*i5+18984*c2+24*k4, 3, wt99);
_mm512_mask_storeu_ps(arrangedW1+23179456+34740720*i5+18984*c2+24*k4, 252, wt99);
_mm512_mask_storeu_ps(arrangedW1+23198416+34740720*i5+18984*c2+24*k4, 16128, wt99);
_mm512_mask_storeu_ps(arrangedW1+23217376+34740720*i5+18984*c2+16*k4, 49152, wt99);
_mm512_mask_storeu_ps(arrangedW1+28950616+34740720*i5+18984*c2+24*k4, 3, wt100);
_mm512_mask_storeu_ps(arrangedW1+28969576+34740720*i5+18984*c2+24*k4, 252, wt100);
_mm512_mask_storeu_ps(arrangedW1+28988536+34740720*i5+18984*c2+24*k4, 16128, wt100);
_mm512_mask_storeu_ps(arrangedW1+29007496+34740720*i5+18984*c2+16*k4, 49152, wt100);
_mm512_mask_storeu_ps(arrangedW1+1930056+34740720*i5+18984*c2+24*k4, 3, wt101);
_mm512_mask_storeu_ps(arrangedW1+1949016+34740720*i5+18984*c2+24*k4, 252, wt101);
_mm512_mask_storeu_ps(arrangedW1+1967976+34740720*i5+18984*c2+24*k4, 16128, wt101);
_mm512_mask_storeu_ps(arrangedW1+1986936+34740720*i5+18984*c2+16*k4, 49152, wt101);
_mm512_mask_storeu_ps(arrangedW1+13510296+34740720*i5+18984*c2+24*k4, 3, wt102);
_mm512_mask_storeu_ps(arrangedW1+13529256+34740720*i5+18984*c2+24*k4, 252, wt102);
_mm512_mask_storeu_ps(arrangedW1+13548216+34740720*i5+18984*c2+24*k4, 16128, wt102);
_mm512_mask_storeu_ps(arrangedW1+13567176+34740720*i5+18984*c2+16*k4, 49152, wt102);
_mm512_mask_storeu_ps(arrangedW1+3860096+34740720*i5+18984*c2+24*k4, 3, wt103);
_mm512_mask_storeu_ps(arrangedW1+3879056+34740720*i5+18984*c2+24*k4, 252, wt103);
_mm512_mask_storeu_ps(arrangedW1+3898016+34740720*i5+18984*c2+24*k4, 16128, wt103);
_mm512_mask_storeu_ps(arrangedW1+3916976+34740720*i5+18984*c2+16*k4, 49152, wt103);
_mm512_mask_storeu_ps(arrangedW1+15440336+34740720*i5+18984*c2+24*k4, 3, wt104);
_mm512_mask_storeu_ps(arrangedW1+15459296+34740720*i5+18984*c2+24*k4, 252, wt104);
_mm512_mask_storeu_ps(arrangedW1+15478256+34740720*i5+18984*c2+24*k4, 16128, wt104);
_mm512_mask_storeu_ps(arrangedW1+15497216+34740720*i5+18984*c2+16*k4, 49152, wt104);
_mm512_mask_storeu_ps(arrangedW1+25090536+34740720*i5+18984*c2+24*k4, 3, wt105);
_mm512_mask_storeu_ps(arrangedW1+25109496+34740720*i5+18984*c2+24*k4, 252, wt105);
_mm512_mask_storeu_ps(arrangedW1+25128456+34740720*i5+18984*c2+24*k4, 16128, wt105);
_mm512_mask_storeu_ps(arrangedW1+25147416+34740720*i5+18984*c2+16*k4, 49152, wt105);
_mm512_mask_storeu_ps(arrangedW1+30880656+34740720*i5+18984*c2+24*k4, 3, wt106);
_mm512_mask_storeu_ps(arrangedW1+30899616+34740720*i5+18984*c2+24*k4, 252, wt106);
_mm512_mask_storeu_ps(arrangedW1+30918576+34740720*i5+18984*c2+24*k4, 16128, wt106);
_mm512_mask_storeu_ps(arrangedW1+30937536+34740720*i5+18984*c2+16*k4, 49152, wt106);
_mm512_mask_storeu_ps(arrangedW1+5790136+34740720*i5+18984*c2+24*k4, 3, wt107);
_mm512_mask_storeu_ps(arrangedW1+5809096+34740720*i5+18984*c2+24*k4, 252, wt107);
_mm512_mask_storeu_ps(arrangedW1+5828056+34740720*i5+18984*c2+24*k4, 16128, wt107);
_mm512_mask_storeu_ps(arrangedW1+5847016+34740720*i5+18984*c2+16*k4, 49152, wt107);
_mm512_mask_storeu_ps(arrangedW1+17370376+34740720*i5+18984*c2+24*k4, 3, wt108);
_mm512_mask_storeu_ps(arrangedW1+17389336+34740720*i5+18984*c2+24*k4, 252, wt108);
_mm512_mask_storeu_ps(arrangedW1+17408296+34740720*i5+18984*c2+24*k4, 16128, wt108);
_mm512_mask_storeu_ps(arrangedW1+17427256+34740720*i5+18984*c2+16*k4, 49152, wt108);
_mm512_mask_storeu_ps(arrangedW1+7720176+34740720*i5+18984*c2+24*k4, 3, wt109);
_mm512_mask_storeu_ps(arrangedW1+7739136+34740720*i5+18984*c2+24*k4, 252, wt109);
_mm512_mask_storeu_ps(arrangedW1+7758096+34740720*i5+18984*c2+24*k4, 16128, wt109);
_mm512_mask_storeu_ps(arrangedW1+7777056+34740720*i5+18984*c2+16*k4, 49152, wt109);
_mm512_mask_storeu_ps(arrangedW1+19300416+34740720*i5+18984*c2+24*k4, 3, wt110);
_mm512_mask_storeu_ps(arrangedW1+19319376+34740720*i5+18984*c2+24*k4, 252, wt110);
_mm512_mask_storeu_ps(arrangedW1+19338336+34740720*i5+18984*c2+24*k4, 16128, wt110);
_mm512_mask_storeu_ps(arrangedW1+19357296+34740720*i5+18984*c2+16*k4, 49152, wt110);
_mm512_mask_storeu_ps(arrangedW1+27020576+34740720*i5+18984*c2+24*k4, 3, wt111);
_mm512_mask_storeu_ps(arrangedW1+27039536+34740720*i5+18984*c2+24*k4, 252, wt111);
_mm512_mask_storeu_ps(arrangedW1+27058496+34740720*i5+18984*c2+24*k4, 16128, wt111);
_mm512_mask_storeu_ps(arrangedW1+27077456+34740720*i5+18984*c2+16*k4, 49152, wt111);
_mm512_mask_storeu_ps(arrangedW1+32810696+34740720*i5+18984*c2+24*k4, 3, wt112);
_mm512_mask_storeu_ps(arrangedW1+32829656+34740720*i5+18984*c2+24*k4, 252, wt112);
_mm512_mask_storeu_ps(arrangedW1+32848616+34740720*i5+18984*c2+24*k4, 16128, wt112);
_mm512_mask_storeu_ps(arrangedW1+32867576+34740720*i5+18984*c2+16*k4, 49152, wt112);
__m512 wt113 = _mm512_maskz_loadu_ps(3, wtPtr1+64+34740720*i5+911232*j1+72*k4);
__m512 wt114 = _mm512_maskz_loadu_ps(3, wtPtr1+57016+34740720*i5+911232*j1+72*k4);
__m512 wt115 = _mm512_maskz_loadu_ps(3, wtPtr1+113968+34740720*i5+911232*j1+72*k4);
__m512 wt116 = _mm512_maskz_loadu_ps(3, wtPtr1+170920+34740720*i5+911232*j1+72*k4);
__m512 wt117 = _mm512_maskz_loadu_ps(3, wtPtr1+227872+34740720*i5+911232*j1+72*k4);
__m512 wt118 = _mm512_maskz_loadu_ps(3, wtPtr1+284824+34740720*i5+911232*j1+72*k4);
__m512 wt119 = _mm512_maskz_loadu_ps(3, wtPtr1+341776+34740720*i5+911232*j1+72*k4);
__m512 wt120 = _mm512_maskz_loadu_ps(3, wtPtr1+398728+34740720*i5+911232*j1+72*k4);
__m512 wt121 = _mm512_maskz_loadu_ps(3, wtPtr1+455680+34740720*i5+911232*j1+72*k4);
__m512 wt122 = _mm512_maskz_loadu_ps(3, wtPtr1+512632+34740720*i5+911232*j1+72*k4);
__m512 wt123 = _mm512_maskz_loadu_ps(3, wtPtr1+569584+34740720*i5+911232*j1+72*k4);
__m512 wt124 = _mm512_maskz_loadu_ps(3, wtPtr1+626536+34740720*i5+911232*j1+72*k4);
__m512 wt125 = _mm512_maskz_loadu_ps(3, wtPtr1+683488+34740720*i5+911232*j1+72*k4);
__m512 wt126 = _mm512_maskz_loadu_ps(3, wtPtr1+740440+34740720*i5+911232*j1+72*k4);
__m512 wt127 = _mm512_maskz_loadu_ps(3, wtPtr1+797392+34740720*i5+911232*j1+72*k4);
__m512 wt128 = _mm512_maskz_loadu_ps(3, wtPtr1+854344+34740720*i5+911232*j1+72*k4);
__m512 tmp253 = _mm512_unpacklo_ps(wt113, wt114);
__m512 tmp254 = _mm512_unpacklo_ps(wt115, wt116);
__m512 tmp255 = _mm512_unpacklo_ps(wt117, wt118);
__m512 tmp256 = _mm512_unpacklo_ps(wt119, wt120);
__m512 tmp257 = _mm512_unpacklo_ps(wt121, wt122);
__m512 tmp258 = _mm512_unpacklo_ps(wt123, wt124);
__m512 tmp259 = _mm512_unpacklo_ps(wt125, wt126);
__m512 tmp260 = _mm512_unpacklo_ps(wt127, wt128);
__m512 tmp261 = _mm512_shuffle_ps(tmp253, tmp254, 68);
__m512 tmp262 = _mm512_shuffle_ps(tmp253, tmp254, 238);
__m512 tmp263 = _mm512_shuffle_ps(tmp255, tmp256, 68);
__m512 tmp264 = _mm512_shuffle_ps(tmp255, tmp256, 238);
__m512 tmp265 = _mm512_shuffle_ps(tmp257, tmp258, 68);
__m512 tmp266 = _mm512_shuffle_ps(tmp257, tmp258, 238);
__m512 tmp267 = _mm512_shuffle_ps(tmp259, tmp260, 68);
__m512 tmp268 = _mm512_shuffle_ps(tmp259, tmp260, 238);
__m512 tmp269 = _mm512_shuffle_f32x4(tmp261, tmp263, 136);
__m512 tmp270 = _mm512_shuffle_f32x4(tmp262, tmp264, 136);
__m512 tmp271 = _mm512_shuffle_f32x4(tmp265, tmp267, 136);
__m512 tmp272 = _mm512_shuffle_f32x4(tmp266, tmp268, 136);
wt113 = _mm512_shuffle_f32x4(tmp269, tmp271, 136);
wt114 = _mm512_shuffle_f32x4(tmp270, tmp272, 136);
_mm512_mask_storeu_ps(arrangedW1+9650216+34740720*i5+18984*c2+24*k4, 3, wt113);
_mm512_mask_storeu_ps(arrangedW1+9669176+34740720*i5+18984*c2+24*k4, 252, wt113);
_mm512_mask_storeu_ps(arrangedW1+9688136+34740720*i5+18984*c2+24*k4, 16128, wt113);
_mm512_mask_storeu_ps(arrangedW1+9707096+34740720*i5+18984*c2+16*k4, 49152, wt113);
_mm512_mask_storeu_ps(arrangedW1+21230456+34740720*i5+18984*c2+24*k4, 3, wt114);
_mm512_mask_storeu_ps(arrangedW1+21249416+34740720*i5+18984*c2+24*k4, 252, wt114);
_mm512_mask_storeu_ps(arrangedW1+21268376+34740720*i5+18984*c2+24*k4, 16128, wt114);
_mm512_mask_storeu_ps(arrangedW1+21287336+34740720*i5+18984*c2+16*k4, 49152, wt114);
}
if (j1 >= jj1) return;
j1 = 38;
}
if (j1 == 38) {
__m512 bias3 = _mm512_setzero_ps();
if (!e1) {
bias3 = _mm512_maskz_loadu_ps(3, biasPtr1-0+2440*i5+64*j1);
}
_mm512_mask_storeu_ps(arrangedB1-0+2440*i5+64*j1, 3, bias3);
ptrdiff_t c3 = (size_t)(0+16*j1)/6;
ptrdiff_t k5 = 0;
for (; k5 != 791; ++k5) {
__m512 wt129 = _mm512_maskz_loadu_ps(65535, wtPtr1+0+34740720*i5+911232*j1+72*k5);
__m512 wt130 = _mm512_maskz_loadu_ps(65535, wtPtr1+56952+34740720*i5+911232*j1+72*k5);
__m512 tmp273 = _mm512_unpacklo_ps(wt129, wt130);
__m512 tmp274 = _mm512_unpackhi_ps(wt129, wt130);
__m512 tmp275 = _mm512_shuffle_ps(tmp273, tmp273, 238);
__m512 tmp276 = _mm512_shuffle_ps(tmp274, tmp274, 238);
__m512 tmp277 = _mm512_shuffle_f32x4(tmp273, tmp273, 136);
__m512 tmp278 = _mm512_shuffle_f32x4(tmp273, tmp273, 221);
__m512 tmp279 = _mm512_shuffle_f32x4(tmp275, tmp275, 136);
__m512 tmp280 = _mm512_shuffle_f32x4(tmp275, tmp275, 221);
__m512 tmp281 = _mm512_shuffle_f32x4(tmp274, tmp274, 136);
__m512 tmp282 = _mm512_shuffle_f32x4(tmp274, tmp274, 221);
__m512 tmp283 = _mm512_shuffle_f32x4(tmp276, tmp276, 136);
__m512 tmp284 = _mm512_shuffle_f32x4(tmp276, tmp276, 221);
wt129 = tmp277;
__m512 wt137 = _mm512_shuffle_f32x4(tmp277, tmp277, 221);
wt130 = tmp279;
__m512 wt138 = _mm512_shuffle_f32x4(tmp279, tmp279, 221);
__m512 wt131 = tmp281;
__m512 wt139 = _mm512_shuffle_f32x4(tmp281, tmp281, 221);
__m512 wt132 = tmp283;
__m512 wt140 = _mm512_shuffle_f32x4(tmp283, tmp283, 221);
__m512 wt133 = tmp278;
__m512 wt141 = _mm512_shuffle_f32x4(tmp278, tmp278, 221);
__m512 wt134 = tmp280;
__m512 wt142 = _mm512_shuffle_f32x4(tmp280, tmp280, 221);
__m512 wt135 = tmp282;
__m512 wt143 = _mm512_shuffle_f32x4(tmp282, tmp282, 221);
__m512 wt136 = tmp284;
__m512 wt144 = _mm512_shuffle_f32x4(tmp284, tmp284, 221);
_mm512_mask_storeu_ps(arrangedW1+8+34740720*i5+18984*c3+16*k5, 3, wt129);
_mm512_mask_storeu_ps(arrangedW1+11580248+34740720*i5+18984*c3+16*k5, 3, wt130);
_mm512_mask_storeu_ps(arrangedW1+23160488+34740720*i5+18984*c3+16*k5, 3, wt131);
_mm512_mask_storeu_ps(arrangedW1+28950608+34740720*i5+18984*c3+16*k5, 3, wt132);
_mm512_mask_storeu_ps(arrangedW1+1930048+34740720*i5+18984*c3+16*k5, 3, wt133);
_mm512_mask_storeu_ps(arrangedW1+13510288+34740720*i5+18984*c3+16*k5, 3, wt134);
_mm512_mask_storeu_ps(arrangedW1+3860088+34740720*i5+18984*c3+16*k5, 3, wt135);
_mm512_mask_storeu_ps(arrangedW1+15440328+34740720*i5+18984*c3+16*k5, 3, wt136);
_mm512_mask_storeu_ps(arrangedW1+25090528+34740720*i5+18984*c3+16*k5, 3, wt137);
_mm512_mask_storeu_ps(arrangedW1+30880648+34740720*i5+18984*c3+16*k5, 3, wt138);
_mm512_mask_storeu_ps(arrangedW1+5790128+34740720*i5+18984*c3+16*k5, 3, wt139);
_mm512_mask_storeu_ps(arrangedW1+17370368+34740720*i5+18984*c3+16*k5, 3, wt140);
_mm512_mask_storeu_ps(arrangedW1+7720168+34740720*i5+18984*c3+16*k5, 3, wt141);
_mm512_mask_storeu_ps(arrangedW1+19300408+34740720*i5+18984*c3+16*k5, 3, wt142);
_mm512_mask_storeu_ps(arrangedW1+27020568+34740720*i5+18984*c3+16*k5, 3, wt143);
_mm512_mask_storeu_ps(arrangedW1+32810688+34740720*i5+18984*c3+16*k5, 3, wt144);
__m512 wt145 = _mm512_maskz_loadu_ps(3, wtPtr1+64+34740720*i5+911232*j1+72*k5);
__m512 wt146 = _mm512_maskz_loadu_ps(3, wtPtr1+57016+34740720*i5+911232*j1+72*k5);
__m512 tmp285 = _mm512_unpacklo_ps(wt145, wt146);
__m512 tmp286 = _mm512_shuffle_ps(tmp285, tmp285, 238);
wt145 = tmp285;
wt146 = tmp286;
_mm512_mask_storeu_ps(arrangedW1+9650208+34740720*i5+18984*c3+16*k5, 3, wt145);
_mm512_mask_storeu_ps(arrangedW1+21230448+34740720*i5+18984*c3+16*k5, 3, wt146);
}
if (j1 >= jj1) return;
j1 = 39;
}
}

// Fills in a task descriptor for the filter-arrangement callee and passes it
// to Example2ThreaderDo1 together with the given team.
//
// team13:   forwarded unchanged to Example2ThreaderDo1.
// tensors1: stored in the task's any1 field for the callee to pick up.
//
// The task is 3-dimensional with extents 39, 3 and 1 (hull1[0..2]).
static void Example2LoomArrangeFilts1(Example2ThreaderTeam1* team13, char** tensors1) {
    const int extents[3] = {39, 3, 1};
    Example2ThreaderTask1 job;
    job.nd1 = 3;
    for (int axis = 0; axis < 3; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    job.any1 = tensors1;
    job.callee1 = Example2LoomArrangeFilts1Callee1;
    Example2ThreaderDo1(team13, &job);
}

// Copies one slab of input channels for one (group, row half) pair into the
// arranged buffer, de-interleaving every source row into four column phases.
//
// task6->any1 is read as a char** : entry 0 is the source data, entry 1 the
// arranged destination.
// pt8[0] selects the channel slab: slabs below 23 cover 32 channels starting
//        at 32*slab, any other slab covers 55 channels.
// pt8[1] selects the row half: values below 1 take the "top" path, all other
//        values the "bottom" path.
// pt8[2] selects the group (4037264 source bytes, 404992 destination bytes
//        per group).
//
// Per channel and phase the destination holds four 64-byte row slots. On the
// top path slot 0 is zero-filled and slots 1..3 are read from source rows at
// byte offsets 696, 1392 and 2088; on the bottom path slots 0..3 are read
// from offsets 0, 696, 1392 and 2088 shifted down by 12 rows of 232 bytes.
// The source base is moved back by 244 bytes, which matches one padded row
// (232 bytes) plus three padded floats for the PaddingH=1/PaddingW=3 graph
// config; the load masks (65528 first, 8191 last) keep lanes 3..60 of each
// 64-float window and zero the rest.
static void Example2LoomArrangeDats1Callee1(Example2ThreaderTask1* task6, int64_t* pt8) {
    char** bufs = task6->any1;
    const ptrdiff_t slab = pt8[0];
    const ptrdiff_t half = pt8[1];
    const ptrdiff_t group = pt8[2];

    const int topHalf = half < 1;
    const ptrdiff_t firstSlot = topHalf ? 1 : 0;
    const ptrdiff_t rowShift = topHalf ? 0 : 232*12;

    char*restrict src = bufs[0]-244+4037264*group+rowShift;
    char*restrict dst = bufs[1]+404992*group+202496*half;

    // pick[p] gathers elements p, p+4, ..., p+28 of the 32-float window made
    // of two consecutive loads; the upper eight lanes are never stored.
    const __m512i pick[4] = {
        _mm512_set_epi32(31, 31, 31, 31, 31, 31, 31, 31, 28, 24, 20, 16, 12, 8, 4, 0),
        _mm512_set_epi32(31, 31, 31, 31, 31, 31, 31, 31, 29, 25, 21, 17, 13, 9, 5, 1),
        _mm512_set_epi32(31, 31, 31, 31, 31, 31, 31, 31, 30, 26, 22, 18, 14, 10, 6, 2),
        _mm512_set_epi32(31, 31, 31, 31, 31, 31, 31, 31, 31, 27, 23, 19, 15, 11, 7, 3)
    };

    const ptrdiff_t chanBegin = 32*slab;
    const ptrdiff_t chanEnd = chanBegin+(slab < 23 ? 32 : 55);
    for (ptrdiff_t chan = chanBegin; chan < chanEnd; ++chan) {
        const char* in = src+5104*chan;
        char* out = dst+256*chan;
        if (topHalf) {
            // Slot 0 of the top half has no source row: write zeros to it
            // in each of the four phase planes.
            for (ptrdiff_t phase = 0; phase < 4; ++phase) {
                _mm512_mask_storeu_ps(out+1214976*phase, 65535, _mm512_setzero_ps());
            }
        }
        for (ptrdiff_t slot = firstSlot; slot < 4; ++slot) {
            const char* row = in+696*slot;
            __m512 q0 = _mm512_maskz_loadu_ps(65528, row);
            __m512 q1 = _mm512_maskz_loadu_ps(65535, row+64);
            __m512 q2 = _mm512_maskz_loadu_ps(65535, row+128);
            __m512 q3 = _mm512_maskz_loadu_ps(8191, row+192);
            for (ptrdiff_t phase = 0; phase < 4; ++phase) {
                char* cell = out+1214976*phase+64*slot;
                _mm512_mask_storeu_ps(cell, 255, _mm512_permutex2var_ps(q0, pick[phase], q1));
                _mm512_mask_storeu_ps(cell+32, 255, _mm512_permutex2var_ps(q2, pick[phase], q3));
            }
        }
    }
}

// Fills in a task descriptor for Example2LoomArrangeDats1Callee1 and passes
// it to Example2ThreaderDo1 together with the given team.
//
// team15:   forwarded unchanged to Example2ThreaderDo1.
// tensors3: stored in the task's any1 field; the callee reads it as a char**
//           (entry 0 source data, entry 1 arranged destination).
//
// The task is 4-dimensional with extents 24, 2, 3 and 1 (hull1[0..3]); the
// callee uses the first three coordinates as channel slab, row half and group.
static void Example2LoomArrangeDats1(Example2ThreaderTeam1* team15, char** tensors3) {
    const int extents[4] = {24, 2, 3, 1};
    Example2ThreaderTask1 job;
    job.nd1 = 4;
    for (int axis = 0; axis < 4; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    job.any1 = tensors3;
    job.callee1 = Example2LoomArrangeDats1Callee1;
    Example2ThreaderDo1(team15, &job);
}

// Field table for the ProduceSums stage: four value pairs followed by one
// trailing value (nine entries in total).
// NOTE(review): the code that reads this table is outside this excerpt. The
// first value of each pair and the trailing value (0, 6, 12, 15, 18) line up
// with row indices of Example2LoomProduceSums1NodeTbl1, which has 18 rows, so
// they look like per-field node ranges -- confirm against the consumer before
// relying on that reading. The meaning of the second value of each pair is
// not established here.
static ptrdiff_t Example2LoomProduceSums1FieldTbl1[] = {
    /* pair 0 */  0, 2,
    /* pair 1 */  6, 2,
    /* pair 2 */ 12, 1,
    /* pair 3 */ 15, 1,
    /* tail   */ 18
};

// Node table for the ProduceSums stage: 18 rows of three values each, stored
// flat. Example2LoomProduceSums1Callee1 reads row `node` as
//   lift = tbl[0+3*node], pile = tbl[1+3*node], base = tbl[2+3*node].
// In the callee, lift/4 is added to the destination index to get the source
// index and lift%4 feeds the switch selector; pile picks one of the
// 156160-byte halves of the sum buffer; base is tested as a flag.
static ptrdiff_t Example2LoomProduceSums1NodeTbl1[] = {
    /* node: lift, pile, base */
    /*  0 */ 0, 0, 1,
    /*  1 */ 0, 1, 1,
    /*  2 */ 1, 0, 0,
    /*  3 */ 1, 1, 0,
    /*  4 */ 2, 0, 0,
    /*  5 */ 2, 1, 0,

    /*  6 */ 0, 0, 0,
    /*  7 */ 0, 1, 0,
    /*  8 */ 1, 0, 0,
    /*  9 */ 1, 1, 0,
    /* 10 */ 2, 0, 0,
    /* 11 */ 2, 1, 0,

    /* 12 */ 0, 0, 0,
    /* 13 */ 1, 0, 0,
    /* 14 */ 2, 0, 0,

    /* 15 */ 0, 0, 0,
    /* 16 */ 1, 0, 0,
    /* 17 */ 2, 0, 0
};

static void Example2LoomProduceSums1Callee1(Example2ThreaderTask1* task8, int64_t* pt9) {
void** tuple2 = task8->any1;
char** tensors6 = tuple2[0];
ptrdiff_t epoch1 = 0;
ptrdiff_t field1 = (ptrdiff_t)tuple2[2];
ptrdiff_t nodeFirst1 = (ptrdiff_t)tuple2[3];
ptrdiff_t group1 = pt9[3];
ptrdiff_t to2 = pt9[2];
ptrdiff_t nodeOff1 = pt9[1];
ptrdiff_t w3 = pt9[0];
ptrdiff_t node6 = nodeFirst1+nodeOff1;
ptrdiff_t lift1 = Example2LoomProduceSums1NodeTbl1[0+3*node6];
ptrdiff_t pile1 = Example2LoomProduceSums1NodeTbl1[1+3*node6];
ptrdiff_t base1 = Example2LoomProduceSums1NodeTbl1[2+3*node6];
ptrdiff_t from1 = to2+(size_t)lift1/4*1;
if (from1 >= 2) return;
char*restrict biasPtr2 = tensors6[0]+7320*epoch1+2440*group1;
char*restrict wtPtr2 = tensors6[0]+7320+110019600*epoch1+34740720*group1+1930040*node6;
char*restrict datPtr2 = tensors6[1]+5130240*epoch1+1214976*field1+404992*group1+202496*from1;
char*restrict sumPtr1 = tensors6[2]+624640*group1+312320*to2+156160*pile1;
switch ((size_t)lift1%4*2+(to2 >= 1)) {
default: {
if (epoch1|node6) {
if (!epoch1 && base1) {
ptrdiff_t i7 = 1*w3;
ptrdiff_t ii1 = i7+0;
for (; i7 != 101; ++i7) {
__m512 sum2 = _mm512_setzero_ps();
__m512 sum6 = _mm512_setzero_ps();
__m512 sum10 = _mm512_setzero_ps();
__m512 sum14 = _mm512_setzero_ps();
__m512 sum18 = _mm512_setzero_ps();
__m512 sum22 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum3 = sum2;
__m512 sum4 = sum2;
__m512 sum5 = sum2;
__m512 sum7 = sum6;
__m512 sum8 = sum6;
__m512 sum9 = sum6;
__m512 sum11 = sum10;
__m512 sum12 = sum10;
__m512 sum13 = sum10;
__m512 sum15 = sum14;
__m512 sum16 = sum14;
__m512 sum17 = sum14;
__m512 sum19 = sum18;
__m512 sum20 = sum18;
__m512 sum21 = sum18;
__m512 sum23 = sum22;
__m512 sum24 = sum22;
__m512 sum25 = sum22;
for (ptrdiff_t j3 = 0; j3 < 791; ++j3) {
__m512 dat29 = _mm512_loadu_ps(datPtr2+0+256*j3);
__m512 dat30 = _mm512_loadu_ps(datPtr2+64+256*j3);
__m512 dat31 = _mm512_loadu_ps(datPtr2+128+256*j3);
__m512 dat32 = _mm512_loadu_ps(datPtr2+192+256*j3);
__m512 wt147 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i7+24*j3));
sum2 = _mm512_fmadd_ps(wt147, dat29, sum2);
sum3 = _mm512_fmadd_ps(wt147, dat30, sum3);
sum4 = _mm512_fmadd_ps(wt147, dat31, sum4);
sum5 = _mm512_fmadd_ps(wt147, dat32, sum5);
__m512 wt148 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i7+24*j3));
sum6 = _mm512_fmadd_ps(wt148, dat29, sum6);
sum7 = _mm512_fmadd_ps(wt148, dat30, sum7);
sum8 = _mm512_fmadd_ps(wt148, dat31, sum8);
sum9 = _mm512_fmadd_ps(wt148, dat32, sum9);
__m512 wt149 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i7+24*j3));
sum10 = _mm512_fmadd_ps(wt149, dat29, sum10);
sum11 = _mm512_fmadd_ps(wt149, dat30, sum11);
sum12 = _mm512_fmadd_ps(wt149, dat31, sum12);
sum13 = _mm512_fmadd_ps(wt149, dat32, sum13);
__m512 wt150 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i7+24*j3));
sum14 = _mm512_fmadd_ps(wt150, dat29, sum14);
sum15 = _mm512_fmadd_ps(wt150, dat30, sum15);
sum16 = _mm512_fmadd_ps(wt150, dat31, sum16);
sum17 = _mm512_fmadd_ps(wt150, dat32, sum17);
__m512 wt151 = _mm512_set1_ps(*(float*)(wtPtr2+16+18984*i7+24*j3));
sum18 = _mm512_fmadd_ps(wt151, dat29, sum18);
sum19 = _mm512_fmadd_ps(wt151, dat30, sum19);
sum20 = _mm512_fmadd_ps(wt151, dat31, sum20);
sum21 = _mm512_fmadd_ps(wt151, dat32, sum21);
__m512 wt152 = _mm512_set1_ps(*(float*)(wtPtr2+20+18984*i7+24*j3));
sum22 = _mm512_fmadd_ps(wt152, dat29, sum22);
sum23 = _mm512_fmadd_ps(wt152, dat30, sum23);
sum24 = _mm512_fmadd_ps(wt152, dat31, sum24);
sum25 = _mm512_fmadd_ps(wt152, dat32, sum25);
}
_mm512_storeu_ps(sumPtr1+0+1536*i7, sum2);
_mm512_storeu_ps(sumPtr1+64+1536*i7, sum3);
_mm512_storeu_ps(sumPtr1+128+1536*i7, sum4);
_mm512_storeu_ps(sumPtr1+192+1536*i7, sum5);
_mm512_storeu_ps(sumPtr1+256+1536*i7, sum6);
_mm512_storeu_ps(sumPtr1+320+1536*i7, sum7);
_mm512_storeu_ps(sumPtr1+384+1536*i7, sum8);
_mm512_storeu_ps(sumPtr1+448+1536*i7, sum9);
_mm512_storeu_ps(sumPtr1+512+1536*i7, sum10);
_mm512_storeu_ps(sumPtr1+576+1536*i7, sum11);
_mm512_storeu_ps(sumPtr1+640+1536*i7, sum12);
_mm512_storeu_ps(sumPtr1+704+1536*i7, sum13);
_mm512_storeu_ps(sumPtr1+768+1536*i7, sum14);
_mm512_storeu_ps(sumPtr1+832+1536*i7, sum15);
_mm512_storeu_ps(sumPtr1+896+1536*i7, sum16);
_mm512_storeu_ps(sumPtr1+960+1536*i7, sum17);
_mm512_storeu_ps(sumPtr1+1024+1536*i7, sum18);
_mm512_storeu_ps(sumPtr1+1088+1536*i7, sum19);
_mm512_storeu_ps(sumPtr1+1152+1536*i7, sum20);
_mm512_storeu_ps(sumPtr1+1216+1536*i7, sum21);
_mm512_storeu_ps(sumPtr1+1280+1536*i7, sum22);
_mm512_storeu_ps(sumPtr1+1344+1536*i7, sum23);
_mm512_storeu_ps(sumPtr1+1408+1536*i7, sum24);
_mm512_storeu_ps(sumPtr1+1472+1536*i7, sum25);
if (i7 >= ii1) return;
}
__m512 sum26 = _mm512_setzero_ps();
__m512 sum30 = _mm512_setzero_ps();
__m512 sum34 = _mm512_setzero_ps();
__m512 sum38 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum27 = sum26;
__m512 sum28 = sum26;
__m512 sum29 = sum26;
__m512 sum31 = sum30;
__m512 sum32 = sum30;
__m512 sum33 = sum30;
__m512 sum35 = sum34;
__m512 sum36 = sum34;
__m512 sum37 = sum34;
__m512 sum39 = sum38;
__m512 sum40 = sum38;
__m512 sum41 = sum38;
for (ptrdiff_t j4 = 0; j4 < 791; ++j4) {
__m512 dat33 = _mm512_loadu_ps(datPtr2+0+256*j4);
__m512 dat34 = _mm512_loadu_ps(datPtr2+64+256*j4);
__m512 dat35 = _mm512_loadu_ps(datPtr2+128+256*j4);
__m512 dat36 = _mm512_loadu_ps(datPtr2+192+256*j4);
__m512 wt153 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i7+16*j4));
sum26 = _mm512_fmadd_ps(wt153, dat33, sum26);
sum27 = _mm512_fmadd_ps(wt153, dat34, sum27);
sum28 = _mm512_fmadd_ps(wt153, dat35, sum28);
sum29 = _mm512_fmadd_ps(wt153, dat36, sum29);
__m512 wt154 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i7+16*j4));
sum30 = _mm512_fmadd_ps(wt154, dat33, sum30);
sum31 = _mm512_fmadd_ps(wt154, dat34, sum31);
sum32 = _mm512_fmadd_ps(wt154, dat35, sum32);
sum33 = _mm512_fmadd_ps(wt154, dat36, sum33);
__m512 wt155 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i7+16*j4));
sum34 = _mm512_fmadd_ps(wt155, dat33, sum34);
sum35 = _mm512_fmadd_ps(wt155, dat34, sum35);
sum36 = _mm512_fmadd_ps(wt155, dat35, sum36);
sum37 = _mm512_fmadd_ps(wt155, dat36, sum37);
__m512 wt156 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i7+16*j4));
sum38 = _mm512_fmadd_ps(wt156, dat33, sum38);
sum39 = _mm512_fmadd_ps(wt156, dat34, sum39);
sum40 = _mm512_fmadd_ps(wt156, dat35, sum40);
sum41 = _mm512_fmadd_ps(wt156, dat36, sum41);
}
_mm512_storeu_ps(sumPtr1+0+1536*i7, sum26);
_mm512_storeu_ps(sumPtr1+64+1536*i7, sum27);
_mm512_storeu_ps(sumPtr1+128+1536*i7, sum28);
_mm512_storeu_ps(sumPtr1+192+1536*i7, sum29);
_mm512_storeu_ps(sumPtr1+256+1536*i7, sum30);
_mm512_storeu_ps(sumPtr1+320+1536*i7, sum31);
_mm512_storeu_ps(sumPtr1+384+1536*i7, sum32);
_mm512_storeu_ps(sumPtr1+448+1536*i7, sum33);
_mm512_storeu_ps(sumPtr1+512+1536*i7, sum34);
_mm512_storeu_ps(sumPtr1+576+1536*i7, sum35);
_mm512_storeu_ps(sumPtr1+640+1536*i7, sum36);
_mm512_storeu_ps(sumPtr1+704+1536*i7, sum37);
_mm512_storeu_ps(sumPtr1+768+1536*i7, sum38);
_mm512_storeu_ps(sumPtr1+832+1536*i7, sum39);
_mm512_storeu_ps(sumPtr1+896+1536*i7, sum40);
_mm512_storeu_ps(sumPtr1+960+1536*i7, sum41);
return;
}
ptrdiff_t i8 = 1*w3;
ptrdiff_t ii2 = i8+0;
for (; i8 != 101; ++i8) {
__m512 sum42 = _mm512_setzero_ps();
__m512 sum46 = _mm512_setzero_ps();
__m512 sum50 = _mm512_setzero_ps();
__m512 sum54 = _mm512_setzero_ps();
__m512 sum58 = _mm512_setzero_ps();
__m512 sum62 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum43 = sum42;
__m512 sum44 = sum42;
__m512 sum45 = sum42;
__m512 sum47 = sum46;
__m512 sum48 = sum46;
__m512 sum49 = sum46;
__m512 sum51 = sum50;
__m512 sum52 = sum50;
__m512 sum53 = sum50;
__m512 sum55 = sum54;
__m512 sum56 = sum54;
__m512 sum57 = sum54;
__m512 sum59 = sum58;
__m512 sum60 = sum58;
__m512 sum61 = sum58;
__m512 sum63 = sum62;
__m512 sum64 = sum62;
__m512 sum65 = sum62;
for (ptrdiff_t j5 = 0; j5 < 791; ++j5) {
__m512 dat37 = _mm512_loadu_ps(datPtr2+0+256*j5);
__m512 dat38 = _mm512_loadu_ps(datPtr2+64+256*j5);
__m512 dat39 = _mm512_loadu_ps(datPtr2+128+256*j5);
__m512 dat40 = _mm512_loadu_ps(datPtr2+192+256*j5);
__m512 wt157 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i8+24*j5));
sum42 = _mm512_fmadd_ps(wt157, dat37, sum42);
sum43 = _mm512_fmadd_ps(wt157, dat38, sum43);
sum44 = _mm512_fmadd_ps(wt157, dat39, sum44);
sum45 = _mm512_fmadd_ps(wt157, dat40, sum45);
__m512 wt158 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i8+24*j5));
sum46 = _mm512_fmadd_ps(wt158, dat37, sum46);
sum47 = _mm512_fmadd_ps(wt158, dat38, sum47);
sum48 = _mm512_fmadd_ps(wt158, dat39, sum48);
sum49 = _mm512_fmadd_ps(wt158, dat40, sum49);
__m512 wt159 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i8+24*j5));
sum50 = _mm512_fmadd_ps(wt159, dat37, sum50);
sum51 = _mm512_fmadd_ps(wt159, dat38, sum51);
sum52 = _mm512_fmadd_ps(wt159, dat39, sum52);
sum53 = _mm512_fmadd_ps(wt159, dat40, sum53);
__m512 wt160 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i8+24*j5));
sum54 = _mm512_fmadd_ps(wt160, dat37, sum54);
sum55 = _mm512_fmadd_ps(wt160, dat38, sum55);
sum56 = _mm512_fmadd_ps(wt160, dat39, sum56);
sum57 = _mm512_fmadd_ps(wt160, dat40, sum57);
__m512 wt161 = _mm512_set1_ps(*(float*)(wtPtr2+16+18984*i8+24*j5));
sum58 = _mm512_fmadd_ps(wt161, dat37, sum58);
sum59 = _mm512_fmadd_ps(wt161, dat38, sum59);
sum60 = _mm512_fmadd_ps(wt161, dat39, sum60);
sum61 = _mm512_fmadd_ps(wt161, dat40, sum61);
__m512 wt162 = _mm512_set1_ps(*(float*)(wtPtr2+20+18984*i8+24*j5));
sum62 = _mm512_fmadd_ps(wt162, dat37, sum62);
sum63 = _mm512_fmadd_ps(wt162, dat38, sum63);
sum64 = _mm512_fmadd_ps(wt162, dat39, sum64);
sum65 = _mm512_fmadd_ps(wt162, dat40, sum65);
}
_mm512_storeu_ps(sumPtr1+0+1536*i8, _mm512_add_ps(sum42, _mm512_loadu_ps(sumPtr1+0+1536*i8)));
_mm512_storeu_ps(sumPtr1+64+1536*i8, _mm512_add_ps(sum43, _mm512_loadu_ps(sumPtr1+64+1536*i8)));
_mm512_storeu_ps(sumPtr1+128+1536*i8, _mm512_add_ps(sum44, _mm512_loadu_ps(sumPtr1+128+1536*i8)));
_mm512_storeu_ps(sumPtr1+192+1536*i8, _mm512_add_ps(sum45, _mm512_loadu_ps(sumPtr1+192+1536*i8)));
_mm512_storeu_ps(sumPtr1+256+1536*i8, _mm512_add_ps(sum46, _mm512_loadu_ps(sumPtr1+256+1536*i8)));
_mm512_storeu_ps(sumPtr1+320+1536*i8, _mm512_add_ps(sum47, _mm512_loadu_ps(sumPtr1+320+1536*i8)));
_mm512_storeu_ps(sumPtr1+384+1536*i8, _mm512_add_ps(sum48, _mm512_loadu_ps(sumPtr1+384+1536*i8)));
_mm512_storeu_ps(sumPtr1+448+1536*i8, _mm512_add_ps(sum49, _mm512_loadu_ps(sumPtr1+448+1536*i8)));
_mm512_storeu_ps(sumPtr1+512+1536*i8, _mm512_add_ps(sum50, _mm512_loadu_ps(sumPtr1+512+1536*i8)));
_mm512_storeu_ps(sumPtr1+576+1536*i8, _mm512_add_ps(sum51, _mm512_loadu_ps(sumPtr1+576+1536*i8)));
_mm512_storeu_ps(sumPtr1+640+1536*i8, _mm512_add_ps(sum52, _mm512_loadu_ps(sumPtr1+640+1536*i8)));
_mm512_storeu_ps(sumPtr1+704+1536*i8, _mm512_add_ps(sum53, _mm512_loadu_ps(sumPtr1+704+1536*i8)));
_mm512_storeu_ps(sumPtr1+768+1536*i8, _mm512_add_ps(sum54, _mm512_loadu_ps(sumPtr1+768+1536*i8)));
_mm512_storeu_ps(sumPtr1+832+1536*i8, _mm512_add_ps(sum55, _mm512_loadu_ps(sumPtr1+832+1536*i8)));
_mm512_storeu_ps(sumPtr1+896+1536*i8, _mm512_add_ps(sum56, _mm512_loadu_ps(sumPtr1+896+1536*i8)));
_mm512_storeu_ps(sumPtr1+960+1536*i8, _mm512_add_ps(sum57, _mm512_loadu_ps(sumPtr1+960+1536*i8)));
_mm512_storeu_ps(sumPtr1+1024+1536*i8, _mm512_add_ps(sum58, _mm512_loadu_ps(sumPtr1+1024+1536*i8)));
_mm512_storeu_ps(sumPtr1+1088+1536*i8, _mm512_add_ps(sum59, _mm512_loadu_ps(sumPtr1+1088+1536*i8)));
_mm512_storeu_ps(sumPtr1+1152+1536*i8, _mm512_add_ps(sum60, _mm512_loadu_ps(sumPtr1+1152+1536*i8)));
_mm512_storeu_ps(sumPtr1+1216+1536*i8, _mm512_add_ps(sum61, _mm512_loadu_ps(sumPtr1+1216+1536*i8)));
_mm512_storeu_ps(sumPtr1+1280+1536*i8, _mm512_add_ps(sum62, _mm512_loadu_ps(sumPtr1+1280+1536*i8)));
_mm512_storeu_ps(sumPtr1+1344+1536*i8, _mm512_add_ps(sum63, _mm512_loadu_ps(sumPtr1+1344+1536*i8)));
_mm512_storeu_ps(sumPtr1+1408+1536*i8, _mm512_add_ps(sum64, _mm512_loadu_ps(sumPtr1+1408+1536*i8)));
_mm512_storeu_ps(sumPtr1+1472+1536*i8, _mm512_add_ps(sum65, _mm512_loadu_ps(sumPtr1+1472+1536*i8)));
if (i8 >= ii2) return;
}
__m512 sum66 = _mm512_setzero_ps();
__m512 sum70 = _mm512_setzero_ps();
__m512 sum74 = _mm512_setzero_ps();
__m512 sum78 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum67 = sum66;
__m512 sum68 = sum66;
__m512 sum69 = sum66;
__m512 sum71 = sum70;
__m512 sum72 = sum70;
__m512 sum73 = sum70;
__m512 sum75 = sum74;
__m512 sum76 = sum74;
__m512 sum77 = sum74;
__m512 sum79 = sum78;
__m512 sum80 = sum78;
__m512 sum81 = sum78;
for (ptrdiff_t j6 = 0; j6 < 791; ++j6) {
__m512 dat41 = _mm512_loadu_ps(datPtr2+0+256*j6);
__m512 dat42 = _mm512_loadu_ps(datPtr2+64+256*j6);
__m512 dat43 = _mm512_loadu_ps(datPtr2+128+256*j6);
__m512 dat44 = _mm512_loadu_ps(datPtr2+192+256*j6);
__m512 wt163 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i8+16*j6));
sum66 = _mm512_fmadd_ps(wt163, dat41, sum66);
sum67 = _mm512_fmadd_ps(wt163, dat42, sum67);
sum68 = _mm512_fmadd_ps(wt163, dat43, sum68);
sum69 = _mm512_fmadd_ps(wt163, dat44, sum69);
__m512 wt164 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i8+16*j6));
sum70 = _mm512_fmadd_ps(wt164, dat41, sum70);
sum71 = _mm512_fmadd_ps(wt164, dat42, sum71);
sum72 = _mm512_fmadd_ps(wt164, dat43, sum72);
sum73 = _mm512_fmadd_ps(wt164, dat44, sum73);
__m512 wt165 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i8+16*j6));
sum74 = _mm512_fmadd_ps(wt165, dat41, sum74);
sum75 = _mm512_fmadd_ps(wt165, dat42, sum75);
sum76 = _mm512_fmadd_ps(wt165, dat43, sum76);
sum77 = _mm512_fmadd_ps(wt165, dat44, sum77);
__m512 wt166 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i8+16*j6));
sum78 = _mm512_fmadd_ps(wt166, dat41, sum78);
sum79 = _mm512_fmadd_ps(wt166, dat42, sum79);
sum80 = _mm512_fmadd_ps(wt166, dat43, sum80);
sum81 = _mm512_fmadd_ps(wt166, dat44, sum81);
}
_mm512_storeu_ps(sumPtr1+0+1536*i8, _mm512_add_ps(sum66, _mm512_loadu_ps(sumPtr1+0+1536*i8)));
_mm512_storeu_ps(sumPtr1+64+1536*i8, _mm512_add_ps(sum67, _mm512_loadu_ps(sumPtr1+64+1536*i8)));
_mm512_storeu_ps(sumPtr1+128+1536*i8, _mm512_add_ps(sum68, _mm512_loadu_ps(sumPtr1+128+1536*i8)));
_mm512_storeu_ps(sumPtr1+192+1536*i8, _mm512_add_ps(sum69, _mm512_loadu_ps(sumPtr1+192+1536*i8)));
_mm512_storeu_ps(sumPtr1+256+1536*i8, _mm512_add_ps(sum70, _mm512_loadu_ps(sumPtr1+256+1536*i8)));
_mm512_storeu_ps(sumPtr1+320+1536*i8, _mm512_add_ps(sum71, _mm512_loadu_ps(sumPtr1+320+1536*i8)));
_mm512_storeu_ps(sumPtr1+384+1536*i8, _mm512_add_ps(sum72, _mm512_loadu_ps(sumPtr1+384+1536*i8)));
_mm512_storeu_ps(sumPtr1+448+1536*i8, _mm512_add_ps(sum73, _mm512_loadu_ps(sumPtr1+448+1536*i8)));
_mm512_storeu_ps(sumPtr1+512+1536*i8, _mm512_add_ps(sum74, _mm512_loadu_ps(sumPtr1+512+1536*i8)));
_mm512_storeu_ps(sumPtr1+576+1536*i8, _mm512_add_ps(sum75, _mm512_loadu_ps(sumPtr1+576+1536*i8)));
_mm512_storeu_ps(sumPtr1+640+1536*i8, _mm512_add_ps(sum76, _mm512_loadu_ps(sumPtr1+640+1536*i8)));
_mm512_storeu_ps(sumPtr1+704+1536*i8, _mm512_add_ps(sum77, _mm512_loadu_ps(sumPtr1+704+1536*i8)));
_mm512_storeu_ps(sumPtr1+768+1536*i8, _mm512_add_ps(sum78, _mm512_loadu_ps(sumPtr1+768+1536*i8)));
_mm512_storeu_ps(sumPtr1+832+1536*i8, _mm512_add_ps(sum79, _mm512_loadu_ps(sumPtr1+832+1536*i8)));
_mm512_storeu_ps(sumPtr1+896+1536*i8, _mm512_add_ps(sum80, _mm512_loadu_ps(sumPtr1+896+1536*i8)));
_mm512_storeu_ps(sumPtr1+960+1536*i8, _mm512_add_ps(sum81, _mm512_loadu_ps(sumPtr1+960+1536*i8)));
return;
}
(void)base1;
ptrdiff_t i9 = 1*w3;
ptrdiff_t ii3 = i9+0;
for (; i9 != 101; ++i9) {
__m512 sum82 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i9));
__m512 sum86 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i9));
__m512 sum90 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i9));
__m512 sum94 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i9));
__m512 sum98 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i9));
__m512 sum102 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i9));
__m512 sum83 = sum82;
__m512 sum84 = sum82;
__m512 sum85 = sum82;
__m512 sum87 = sum86;
__m512 sum88 = sum86;
__m512 sum89 = sum86;
__m512 sum91 = sum90;
__m512 sum92 = sum90;
__m512 sum93 = sum90;
__m512 sum95 = sum94;
__m512 sum96 = sum94;
__m512 sum97 = sum94;
__m512 sum99 = sum98;
__m512 sum100 = sum98;
__m512 sum101 = sum98;
__m512 sum103 = sum102;
__m512 sum104 = sum102;
__m512 sum105 = sum102;
for (ptrdiff_t j7 = 0; j7 < 791; ++j7) {
__m512 dat45 = _mm512_loadu_ps(datPtr2+0+256*j7);
__m512 dat46 = _mm512_loadu_ps(datPtr2+64+256*j7);
__m512 dat47 = _mm512_loadu_ps(datPtr2+128+256*j7);
__m512 dat48 = _mm512_loadu_ps(datPtr2+192+256*j7);
__m512 wt167 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i9+24*j7));
sum82 = _mm512_fmadd_ps(wt167, dat45, sum82);
sum83 = _mm512_fmadd_ps(wt167, dat46, sum83);
sum84 = _mm512_fmadd_ps(wt167, dat47, sum84);
sum85 = _mm512_fmadd_ps(wt167, dat48, sum85);
__m512 wt168 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i9+24*j7));
sum86 = _mm512_fmadd_ps(wt168, dat45, sum86);
sum87 = _mm512_fmadd_ps(wt168, dat46, sum87);
sum88 = _mm512_fmadd_ps(wt168, dat47, sum88);
sum89 = _mm512_fmadd_ps(wt168, dat48, sum89);
__m512 wt169 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i9+24*j7));
sum90 = _mm512_fmadd_ps(wt169, dat45, sum90);
sum91 = _mm512_fmadd_ps(wt169, dat46, sum91);
sum92 = _mm512_fmadd_ps(wt169, dat47, sum92);
sum93 = _mm512_fmadd_ps(wt169, dat48, sum93);
__m512 wt170 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i9+24*j7));
sum94 = _mm512_fmadd_ps(wt170, dat45, sum94);
sum95 = _mm512_fmadd_ps(wt170, dat46, sum95);
sum96 = _mm512_fmadd_ps(wt170, dat47, sum96);
sum97 = _mm512_fmadd_ps(wt170, dat48, sum97);
__m512 wt171 = _mm512_set1_ps(*(float*)(wtPtr2+16+18984*i9+24*j7));
sum98 = _mm512_fmadd_ps(wt171, dat45, sum98);
sum99 = _mm512_fmadd_ps(wt171, dat46, sum99);
sum100 = _mm512_fmadd_ps(wt171, dat47, sum100);
sum101 = _mm512_fmadd_ps(wt171, dat48, sum101);
__m512 wt172 = _mm512_set1_ps(*(float*)(wtPtr2+20+18984*i9+24*j7));
sum102 = _mm512_fmadd_ps(wt172, dat45, sum102);
sum103 = _mm512_fmadd_ps(wt172, dat46, sum103);
sum104 = _mm512_fmadd_ps(wt172, dat47, sum104);
sum105 = _mm512_fmadd_ps(wt172, dat48, sum105);
}
_mm512_storeu_ps(sumPtr1+0+1536*i9, sum82);
_mm512_storeu_ps(sumPtr1+64+1536*i9, sum83);
_mm512_storeu_ps(sumPtr1+128+1536*i9, sum84);
_mm512_storeu_ps(sumPtr1+192+1536*i9, sum85);
_mm512_storeu_ps(sumPtr1+256+1536*i9, sum86);
_mm512_storeu_ps(sumPtr1+320+1536*i9, sum87);
_mm512_storeu_ps(sumPtr1+384+1536*i9, sum88);
_mm512_storeu_ps(sumPtr1+448+1536*i9, sum89);
_mm512_storeu_ps(sumPtr1+512+1536*i9, sum90);
_mm512_storeu_ps(sumPtr1+576+1536*i9, sum91);
_mm512_storeu_ps(sumPtr1+640+1536*i9, sum92);
_mm512_storeu_ps(sumPtr1+704+1536*i9, sum93);
_mm512_storeu_ps(sumPtr1+768+1536*i9, sum94);
_mm512_storeu_ps(sumPtr1+832+1536*i9, sum95);
_mm512_storeu_ps(sumPtr1+896+1536*i9, sum96);
_mm512_storeu_ps(sumPtr1+960+1536*i9, sum97);
_mm512_storeu_ps(sumPtr1+1024+1536*i9, sum98);
_mm512_storeu_ps(sumPtr1+1088+1536*i9, sum99);
_mm512_storeu_ps(sumPtr1+1152+1536*i9, sum100);
_mm512_storeu_ps(sumPtr1+1216+1536*i9, sum101);
_mm512_storeu_ps(sumPtr1+1280+1536*i9, sum102);
_mm512_storeu_ps(sumPtr1+1344+1536*i9, sum103);
_mm512_storeu_ps(sumPtr1+1408+1536*i9, sum104);
_mm512_storeu_ps(sumPtr1+1472+1536*i9, sum105);
if (i9 >= ii3) return;
}
__m512 sum106 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i9));
__m512 sum110 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i9));
__m512 sum114 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i9));
__m512 sum118 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i9));
__m512 sum107 = sum106;
__m512 sum108 = sum106;
__m512 sum109 = sum106;
__m512 sum111 = sum110;
__m512 sum112 = sum110;
__m512 sum113 = sum110;
__m512 sum115 = sum114;
__m512 sum116 = sum114;
__m512 sum117 = sum114;
__m512 sum119 = sum118;
__m512 sum120 = sum118;
__m512 sum121 = sum118;
for (ptrdiff_t j8 = 0; j8 < 791; ++j8) {
__m512 dat49 = _mm512_loadu_ps(datPtr2+0+256*j8);
__m512 dat50 = _mm512_loadu_ps(datPtr2+64+256*j8);
__m512 dat51 = _mm512_loadu_ps(datPtr2+128+256*j8);
__m512 dat52 = _mm512_loadu_ps(datPtr2+192+256*j8);
__m512 wt173 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i9+16*j8));
sum106 = _mm512_fmadd_ps(wt173, dat49, sum106);
sum107 = _mm512_fmadd_ps(wt173, dat50, sum107);
sum108 = _mm512_fmadd_ps(wt173, dat51, sum108);
sum109 = _mm512_fmadd_ps(wt173, dat52, sum109);
__m512 wt174 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i9+16*j8));
sum110 = _mm512_fmadd_ps(wt174, dat49, sum110);
sum111 = _mm512_fmadd_ps(wt174, dat50, sum111);
sum112 = _mm512_fmadd_ps(wt174, dat51, sum112);
sum113 = _mm512_fmadd_ps(wt174, dat52, sum113);
__m512 wt175 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i9+16*j8));
sum114 = _mm512_fmadd_ps(wt175, dat49, sum114);
sum115 = _mm512_fmadd_ps(wt175, dat50, sum115);
sum116 = _mm512_fmadd_ps(wt175, dat51, sum116);
sum117 = _mm512_fmadd_ps(wt175, dat52, sum117);
__m512 wt176 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i9+16*j8));
sum118 = _mm512_fmadd_ps(wt176, dat49, sum118);
sum119 = _mm512_fmadd_ps(wt176, dat50, sum119);
sum120 = _mm512_fmadd_ps(wt176, dat51, sum120);
sum121 = _mm512_fmadd_ps(wt176, dat52, sum121);
}
_mm512_storeu_ps(sumPtr1+0+1536*i9, sum106);
_mm512_storeu_ps(sumPtr1+64+1536*i9, sum107);
_mm512_storeu_ps(sumPtr1+128+1536*i9, sum108);
_mm512_storeu_ps(sumPtr1+192+1536*i9, sum109);
_mm512_storeu_ps(sumPtr1+256+1536*i9, sum110);
_mm512_storeu_ps(sumPtr1+320+1536*i9, sum111);
_mm512_storeu_ps(sumPtr1+384+1536*i9, sum112);
_mm512_storeu_ps(sumPtr1+448+1536*i9, sum113);
_mm512_storeu_ps(sumPtr1+512+1536*i9, sum114);
_mm512_storeu_ps(sumPtr1+576+1536*i9, sum115);
_mm512_storeu_ps(sumPtr1+640+1536*i9, sum116);
_mm512_storeu_ps(sumPtr1+704+1536*i9, sum117);
_mm512_storeu_ps(sumPtr1+768+1536*i9, sum118);
_mm512_storeu_ps(sumPtr1+832+1536*i9, sum119);
_mm512_storeu_ps(sumPtr1+896+1536*i9, sum120);
_mm512_storeu_ps(sumPtr1+960+1536*i9, sum121);
break;
}
case 2: {
if (epoch1|node6) {
if (!epoch1 && base1) {
ptrdiff_t i10 = 1*w3;
ptrdiff_t ii4 = i10+0;
for (; i10 != 101; ++i10) {
__m512 sum122 = _mm512_setzero_ps();
__m512 sum125 = _mm512_setzero_ps();
__m512 sum128 = _mm512_setzero_ps();
__m512 sum131 = _mm512_setzero_ps();
__m512 sum134 = _mm512_setzero_ps();
__m512 sum137 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum123 = sum122;
__m512 sum124 = sum122;
__m512 sum126 = sum125;
__m512 sum127 = sum125;
__m512 sum129 = sum128;
__m512 sum130 = sum128;
__m512 sum132 = sum131;
__m512 sum133 = sum131;
__m512 sum135 = sum134;
__m512 sum136 = sum134;
__m512 sum138 = sum137;
__m512 sum139 = sum137;
for (ptrdiff_t j9 = 0; j9 < 791; ++j9) {
__m512 dat53 = _mm512_loadu_ps(datPtr2+64+256*j9);
__m512 dat54 = _mm512_loadu_ps(datPtr2+128+256*j9);
__m512 dat55 = _mm512_loadu_ps(datPtr2+192+256*j9);
__m512 wt177 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i10+24*j9));
sum122 = _mm512_fmadd_ps(wt177, dat53, sum122);
sum123 = _mm512_fmadd_ps(wt177, dat54, sum123);
sum124 = _mm512_fmadd_ps(wt177, dat55, sum124);
__m512 wt178 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i10+24*j9));
sum125 = _mm512_fmadd_ps(wt178, dat53, sum125);
sum126 = _mm512_fmadd_ps(wt178, dat54, sum126);
sum127 = _mm512_fmadd_ps(wt178, dat55, sum127);
__m512 wt179 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i10+24*j9));
sum128 = _mm512_fmadd_ps(wt179, dat53, sum128);
sum129 = _mm512_fmadd_ps(wt179, dat54, sum129);
sum130 = _mm512_fmadd_ps(wt179, dat55, sum130);
__m512 wt180 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i10+24*j9));
sum131 = _mm512_fmadd_ps(wt180, dat53, sum131);
sum132 = _mm512_fmadd_ps(wt180, dat54, sum132);
sum133 = _mm512_fmadd_ps(wt180, dat55, sum133);
__m512 wt181 = _mm512_set1_ps(*(float*)(wtPtr2+16+18984*i10+24*j9));
sum134 = _mm512_fmadd_ps(wt181, dat53, sum134);
sum135 = _mm512_fmadd_ps(wt181, dat54, sum135);
sum136 = _mm512_fmadd_ps(wt181, dat55, sum136);
__m512 wt182 = _mm512_set1_ps(*(float*)(wtPtr2+20+18984*i10+24*j9));
sum137 = _mm512_fmadd_ps(wt182, dat53, sum137);
sum138 = _mm512_fmadd_ps(wt182, dat54, sum138);
sum139 = _mm512_fmadd_ps(wt182, dat55, sum139);
}
_mm512_storeu_ps(sumPtr1+0+1536*i10, sum122);
_mm512_storeu_ps(sumPtr1+64+1536*i10, sum123);
_mm512_storeu_ps(sumPtr1+128+1536*i10, sum124);
_mm512_storeu_ps(sumPtr1+256+1536*i10, sum125);
_mm512_storeu_ps(sumPtr1+320+1536*i10, sum126);
_mm512_storeu_ps(sumPtr1+384+1536*i10, sum127);
_mm512_storeu_ps(sumPtr1+512+1536*i10, sum128);
_mm512_storeu_ps(sumPtr1+576+1536*i10, sum129);
_mm512_storeu_ps(sumPtr1+640+1536*i10, sum130);
_mm512_storeu_ps(sumPtr1+768+1536*i10, sum131);
_mm512_storeu_ps(sumPtr1+832+1536*i10, sum132);
_mm512_storeu_ps(sumPtr1+896+1536*i10, sum133);
_mm512_storeu_ps(sumPtr1+1024+1536*i10, sum134);
_mm512_storeu_ps(sumPtr1+1088+1536*i10, sum135);
_mm512_storeu_ps(sumPtr1+1152+1536*i10, sum136);
_mm512_storeu_ps(sumPtr1+1280+1536*i10, sum137);
_mm512_storeu_ps(sumPtr1+1344+1536*i10, sum138);
_mm512_storeu_ps(sumPtr1+1408+1536*i10, sum139);
if (i10 >= ii4) return;
}
__m512 sum140 = _mm512_setzero_ps();
__m512 sum143 = _mm512_setzero_ps();
__m512 sum146 = _mm512_setzero_ps();
__m512 sum149 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum141 = sum140;
__m512 sum142 = sum140;
__m512 sum144 = sum143;
__m512 sum145 = sum143;
__m512 sum147 = sum146;
__m512 sum148 = sum146;
__m512 sum150 = sum149;
__m512 sum151 = sum149;
for (ptrdiff_t j10 = 0; j10 < 791; ++j10) {
__m512 dat56 = _mm512_loadu_ps(datPtr2+64+256*j10);
__m512 dat57 = _mm512_loadu_ps(datPtr2+128+256*j10);
__m512 dat58 = _mm512_loadu_ps(datPtr2+192+256*j10);
__m512 wt183 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i10+16*j10));
sum140 = _mm512_fmadd_ps(wt183, dat56, sum140);
sum141 = _mm512_fmadd_ps(wt183, dat57, sum141);
sum142 = _mm512_fmadd_ps(wt183, dat58, sum142);
__m512 wt184 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i10+16*j10));
sum143 = _mm512_fmadd_ps(wt184, dat56, sum143);
sum144 = _mm512_fmadd_ps(wt184, dat57, sum144);
sum145 = _mm512_fmadd_ps(wt184, dat58, sum145);
__m512 wt185 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i10+16*j10));
sum146 = _mm512_fmadd_ps(wt185, dat56, sum146);
sum147 = _mm512_fmadd_ps(wt185, dat57, sum147);
sum148 = _mm512_fmadd_ps(wt185, dat58, sum148);
__m512 wt186 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i10+16*j10));
sum149 = _mm512_fmadd_ps(wt186, dat56, sum149);
sum150 = _mm512_fmadd_ps(wt186, dat57, sum150);
sum151 = _mm512_fmadd_ps(wt186, dat58, sum151);
}
_mm512_storeu_ps(sumPtr1+0+1536*i10, sum140);
_mm512_storeu_ps(sumPtr1+64+1536*i10, sum141);
_mm512_storeu_ps(sumPtr1+128+1536*i10, sum142);
_mm512_storeu_ps(sumPtr1+256+1536*i10, sum143);
_mm512_storeu_ps(sumPtr1+320+1536*i10, sum144);
_mm512_storeu_ps(sumPtr1+384+1536*i10, sum145);
_mm512_storeu_ps(sumPtr1+512+1536*i10, sum146);
_mm512_storeu_ps(sumPtr1+576+1536*i10, sum147);
_mm512_storeu_ps(sumPtr1+640+1536*i10, sum148);
_mm512_storeu_ps(sumPtr1+768+1536*i10, sum149);
_mm512_storeu_ps(sumPtr1+832+1536*i10, sum150);
_mm512_storeu_ps(sumPtr1+896+1536*i10, sum151);
return;
}
ptrdiff_t i11 = 1*w3;
ptrdiff_t ii5 = i11+0;
for (; i11 != 101; ++i11) {
__m512 sum152 = _mm512_setzero_ps();
__m512 sum155 = _mm512_setzero_ps();
__m512 sum158 = _mm512_setzero_ps();
__m512 sum161 = _mm512_setzero_ps();
__m512 sum164 = _mm512_setzero_ps();
__m512 sum167 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum153 = sum152;
__m512 sum154 = sum152;
__m512 sum156 = sum155;
__m512 sum157 = sum155;
__m512 sum159 = sum158;
__m512 sum160 = sum158;
__m512 sum162 = sum161;
__m512 sum163 = sum161;
__m512 sum165 = sum164;
__m512 sum166 = sum164;
__m512 sum168 = sum167;
__m512 sum169 = sum167;
for (ptrdiff_t j11 = 0; j11 < 791; ++j11) {
__m512 dat59 = _mm512_loadu_ps(datPtr2+64+256*j11);
__m512 dat60 = _mm512_loadu_ps(datPtr2+128+256*j11);
__m512 dat61 = _mm512_loadu_ps(datPtr2+192+256*j11);
__m512 wt187 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i11+24*j11));
sum152 = _mm512_fmadd_ps(wt187, dat59, sum152);
sum153 = _mm512_fmadd_ps(wt187, dat60, sum153);
sum154 = _mm512_fmadd_ps(wt187, dat61, sum154);
__m512 wt188 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i11+24*j11));
sum155 = _mm512_fmadd_ps(wt188, dat59, sum155);
sum156 = _mm512_fmadd_ps(wt188, dat60, sum156);
sum157 = _mm512_fmadd_ps(wt188, dat61, sum157);
__m512 wt189 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i11+24*j11));
sum158 = _mm512_fmadd_ps(wt189, dat59, sum158);
sum159 = _mm512_fmadd_ps(wt189, dat60, sum159);
sum160 = _mm512_fmadd_ps(wt189, dat61, sum160);
__m512 wt190 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i11+24*j11));
sum161 = _mm512_fmadd_ps(wt190, dat59, sum161);
sum162 = _mm512_fmadd_ps(wt190, dat60, sum162);
sum163 = _mm512_fmadd_ps(wt190, dat61, sum163);
__m512 wt191 = _mm512_set1_ps(*(float*)(wtPtr2+16+18984*i11+24*j11));
sum164 = _mm512_fmadd_ps(wt191, dat59, sum164);
sum165 = _mm512_fmadd_ps(wt191, dat60, sum165);
sum166 = _mm512_fmadd_ps(wt191, dat61, sum166);
__m512 wt192 = _mm512_set1_ps(*(float*)(wtPtr2+20+18984*i11+24*j11));
sum167 = _mm512_fmadd_ps(wt192, dat59, sum167);
sum168 = _mm512_fmadd_ps(wt192, dat60, sum168);
sum169 = _mm512_fmadd_ps(wt192, dat61, sum169);
}
_mm512_storeu_ps(sumPtr1+0+1536*i11, _mm512_add_ps(sum152, _mm512_loadu_ps(sumPtr1+0+1536*i11)));
_mm512_storeu_ps(sumPtr1+64+1536*i11, _mm512_add_ps(sum153, _mm512_loadu_ps(sumPtr1+64+1536*i11)));
_mm512_storeu_ps(sumPtr1+128+1536*i11, _mm512_add_ps(sum154, _mm512_loadu_ps(sumPtr1+128+1536*i11)));
_mm512_storeu_ps(sumPtr1+256+1536*i11, _mm512_add_ps(sum155, _mm512_loadu_ps(sumPtr1+256+1536*i11)));
_mm512_storeu_ps(sumPtr1+320+1536*i11, _mm512_add_ps(sum156, _mm512_loadu_ps(sumPtr1+320+1536*i11)));
_mm512_storeu_ps(sumPtr1+384+1536*i11, _mm512_add_ps(sum157, _mm512_loadu_ps(sumPtr1+384+1536*i11)));
_mm512_storeu_ps(sumPtr1+512+1536*i11, _mm512_add_ps(sum158, _mm512_loadu_ps(sumPtr1+512+1536*i11)));
_mm512_storeu_ps(sumPtr1+576+1536*i11, _mm512_add_ps(sum159, _mm512_loadu_ps(sumPtr1+576+1536*i11)));
_mm512_storeu_ps(sumPtr1+640+1536*i11, _mm512_add_ps(sum160, _mm512_loadu_ps(sumPtr1+640+1536*i11)));
_mm512_storeu_ps(sumPtr1+768+1536*i11, _mm512_add_ps(sum161, _mm512_loadu_ps(sumPtr1+768+1536*i11)));
_mm512_storeu_ps(sumPtr1+832+1536*i11, _mm512_add_ps(sum162, _mm512_loadu_ps(sumPtr1+832+1536*i11)));
_mm512_storeu_ps(sumPtr1+896+1536*i11, _mm512_add_ps(sum163, _mm512_loadu_ps(sumPtr1+896+1536*i11)));
_mm512_storeu_ps(sumPtr1+1024+1536*i11, _mm512_add_ps(sum164, _mm512_loadu_ps(sumPtr1+1024+1536*i11)));
_mm512_storeu_ps(sumPtr1+1088+1536*i11, _mm512_add_ps(sum165, _mm512_loadu_ps(sumPtr1+1088+1536*i11)));
_mm512_storeu_ps(sumPtr1+1152+1536*i11, _mm512_add_ps(sum166, _mm512_loadu_ps(sumPtr1+1152+1536*i11)));
_mm512_storeu_ps(sumPtr1+1280+1536*i11, _mm512_add_ps(sum167, _mm512_loadu_ps(sumPtr1+1280+1536*i11)));
_mm512_storeu_ps(sumPtr1+1344+1536*i11, _mm512_add_ps(sum168, _mm512_loadu_ps(sumPtr1+1344+1536*i11)));
_mm512_storeu_ps(sumPtr1+1408+1536*i11, _mm512_add_ps(sum169, _mm512_loadu_ps(sumPtr1+1408+1536*i11)));
if (i11 >= ii5) return;
}
__m512 sum170 = _mm512_setzero_ps();
__m512 sum173 = _mm512_setzero_ps();
__m512 sum176 = _mm512_setzero_ps();
__m512 sum179 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum171 = sum170;
__m512 sum172 = sum170;
__m512 sum174 = sum173;
__m512 sum175 = sum173;
__m512 sum177 = sum176;
__m512 sum178 = sum176;
__m512 sum180 = sum179;
__m512 sum181 = sum179;
for (ptrdiff_t j12 = 0; j12 < 791; ++j12) {
__m512 dat62 = _mm512_loadu_ps(datPtr2+64+256*j12);
__m512 dat63 = _mm512_loadu_ps(datPtr2+128+256*j12);
__m512 dat64 = _mm512_loadu_ps(datPtr2+192+256*j12);
__m512 wt193 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i11+16*j12));
sum170 = _mm512_fmadd_ps(wt193, dat62, sum170);
sum171 = _mm512_fmadd_ps(wt193, dat63, sum171);
sum172 = _mm512_fmadd_ps(wt193, dat64, sum172);
__m512 wt194 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i11+16*j12));
sum173 = _mm512_fmadd_ps(wt194, dat62, sum173);
sum174 = _mm512_fmadd_ps(wt194, dat63, sum174);
sum175 = _mm512_fmadd_ps(wt194, dat64, sum175);
__m512 wt195 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i11+16*j12));
sum176 = _mm512_fmadd_ps(wt195, dat62, sum176);
sum177 = _mm512_fmadd_ps(wt195, dat63, sum177);
sum178 = _mm512_fmadd_ps(wt195, dat64, sum178);
__m512 wt196 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i11+16*j12));
sum179 = _mm512_fmadd_ps(wt196, dat62, sum179);
sum180 = _mm512_fmadd_ps(wt196, dat63, sum180);
sum181 = _mm512_fmadd_ps(wt196, dat64, sum181);
}
_mm512_storeu_ps(sumPtr1+0+1536*i11, _mm512_add_ps(sum170, _mm512_loadu_ps(sumPtr1+0+1536*i11)));
_mm512_storeu_ps(sumPtr1+64+1536*i11, _mm512_add_ps(sum171, _mm512_loadu_ps(sumPtr1+64+1536*i11)));
_mm512_storeu_ps(sumPtr1+128+1536*i11, _mm512_add_ps(sum172, _mm512_loadu_ps(sumPtr1+128+1536*i11)));
_mm512_storeu_ps(sumPtr1+256+1536*i11, _mm512_add_ps(sum173, _mm512_loadu_ps(sumPtr1+256+1536*i11)));
_mm512_storeu_ps(sumPtr1+320+1536*i11, _mm512_add_ps(sum174, _mm512_loadu_ps(sumPtr1+320+1536*i11)));
_mm512_storeu_ps(sumPtr1+384+1536*i11, _mm512_add_ps(sum175, _mm512_loadu_ps(sumPtr1+384+1536*i11)));
_mm512_storeu_ps(sumPtr1+512+1536*i11, _mm512_add_ps(sum176, _mm512_loadu_ps(sumPtr1+512+1536*i11)));
_mm512_storeu_ps(sumPtr1+576+1536*i11, _mm512_add_ps(sum177, _mm512_loadu_ps(sumPtr1+576+1536*i11)));
_mm512_storeu_ps(sumPtr1+640+1536*i11, _mm512_add_ps(sum178, _mm512_loadu_ps(sumPtr1+640+1536*i11)));
_mm512_storeu_ps(sumPtr1+768+1536*i11, _mm512_add_ps(sum179, _mm512_loadu_ps(sumPtr1+768+1536*i11)));
_mm512_storeu_ps(sumPtr1+832+1536*i11, _mm512_add_ps(sum180, _mm512_loadu_ps(sumPtr1+832+1536*i11)));
_mm512_storeu_ps(sumPtr1+896+1536*i11, _mm512_add_ps(sum181, _mm512_loadu_ps(sumPtr1+896+1536*i11)));
return;
}
(void)base1;
ptrdiff_t i12 = 1*w3;
ptrdiff_t ii6 = i12+0;
for (; i12 != 101; ++i12) {
__m512 sum182 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i12));
__m512 sum185 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i12));
__m512 sum188 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i12));
__m512 sum191 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i12));
__m512 sum194 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i12));
__m512 sum197 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i12));
__m512 sum183 = sum182;
__m512 sum184 = sum182;
__m512 sum186 = sum185;
__m512 sum187 = sum185;
__m512 sum189 = sum188;
__m512 sum190 = sum188;
__m512 sum192 = sum191;
__m512 sum193 = sum191;
__m512 sum195 = sum194;
__m512 sum196 = sum194;
__m512 sum198 = sum197;
__m512 sum199 = sum197;
for (ptrdiff_t j13 = 0; j13 < 791; ++j13) {
__m512 dat65 = _mm512_loadu_ps(datPtr2+64+256*j13);
__m512 dat66 = _mm512_loadu_ps(datPtr2+128+256*j13);
__m512 dat67 = _mm512_loadu_ps(datPtr2+192+256*j13);
__m512 wt197 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i12+24*j13));
sum182 = _mm512_fmadd_ps(wt197, dat65, sum182);
sum183 = _mm512_fmadd_ps(wt197, dat66, sum183);
sum184 = _mm512_fmadd_ps(wt197, dat67, sum184);
__m512 wt198 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i12+24*j13));
sum185 = _mm512_fmadd_ps(wt198, dat65, sum185);
sum186 = _mm512_fmadd_ps(wt198, dat66, sum186);
sum187 = _mm512_fmadd_ps(wt198, dat67, sum187);
__m512 wt199 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i12+24*j13));
sum188 = _mm512_fmadd_ps(wt199, dat65, sum188);
sum189 = _mm512_fmadd_ps(wt199, dat66, sum189);
sum190 = _mm512_fmadd_ps(wt199, dat67, sum190);
__m512 wt200 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i12+24*j13));
sum191 = _mm512_fmadd_ps(wt200, dat65, sum191);
sum192 = _mm512_fmadd_ps(wt200, dat66, sum192);
sum193 = _mm512_fmadd_ps(wt200, dat67, sum193);
__m512 wt201 = _mm512_set1_ps(*(float*)(wtPtr2+16+18984*i12+24*j13));
sum194 = _mm512_fmadd_ps(wt201, dat65, sum194);
sum195 = _mm512_fmadd_ps(wt201, dat66, sum195);
sum196 = _mm512_fmadd_ps(wt201, dat67, sum196);
__m512 wt202 = _mm512_set1_ps(*(float*)(wtPtr2+20+18984*i12+24*j13));
sum197 = _mm512_fmadd_ps(wt202, dat65, sum197);
sum198 = _mm512_fmadd_ps(wt202, dat66, sum198);
sum199 = _mm512_fmadd_ps(wt202, dat67, sum199);
}
_mm512_storeu_ps(sumPtr1+0+1536*i12, sum182);
_mm512_storeu_ps(sumPtr1+64+1536*i12, sum183);
_mm512_storeu_ps(sumPtr1+128+1536*i12, sum184);
_mm512_storeu_ps(sumPtr1+256+1536*i12, sum185);
_mm512_storeu_ps(sumPtr1+320+1536*i12, sum186);
_mm512_storeu_ps(sumPtr1+384+1536*i12, sum187);
_mm512_storeu_ps(sumPtr1+512+1536*i12, sum188);
_mm512_storeu_ps(sumPtr1+576+1536*i12, sum189);
_mm512_storeu_ps(sumPtr1+640+1536*i12, sum190);
_mm512_storeu_ps(sumPtr1+768+1536*i12, sum191);
_mm512_storeu_ps(sumPtr1+832+1536*i12, sum192);
_mm512_storeu_ps(sumPtr1+896+1536*i12, sum193);
_mm512_storeu_ps(sumPtr1+1024+1536*i12, sum194);
_mm512_storeu_ps(sumPtr1+1088+1536*i12, sum195);
_mm512_storeu_ps(sumPtr1+1152+1536*i12, sum196);
_mm512_storeu_ps(sumPtr1+1280+1536*i12, sum197);
_mm512_storeu_ps(sumPtr1+1344+1536*i12, sum198);
_mm512_storeu_ps(sumPtr1+1408+1536*i12, sum199);
if (i12 >= ii6) return;
}
__m512 sum200 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i12));
__m512 sum203 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i12));
__m512 sum206 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i12));
__m512 sum209 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i12));
__m512 sum201 = sum200;
__m512 sum202 = sum200;
__m512 sum204 = sum203;
__m512 sum205 = sum203;
__m512 sum207 = sum206;
__m512 sum208 = sum206;
__m512 sum210 = sum209;
__m512 sum211 = sum209;
for (ptrdiff_t j14 = 0; j14 < 791; ++j14) {
__m512 dat68 = _mm512_loadu_ps(datPtr2+64+256*j14);
__m512 dat69 = _mm512_loadu_ps(datPtr2+128+256*j14);
__m512 dat70 = _mm512_loadu_ps(datPtr2+192+256*j14);
__m512 wt203 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i12+16*j14));
sum200 = _mm512_fmadd_ps(wt203, dat68, sum200);
sum201 = _mm512_fmadd_ps(wt203, dat69, sum201);
sum202 = _mm512_fmadd_ps(wt203, dat70, sum202);
__m512 wt204 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i12+16*j14));
sum203 = _mm512_fmadd_ps(wt204, dat68, sum203);
sum204 = _mm512_fmadd_ps(wt204, dat69, sum204);
sum205 = _mm512_fmadd_ps(wt204, dat70, sum205);
__m512 wt205 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i12+16*j14));
sum206 = _mm512_fmadd_ps(wt205, dat68, sum206);
sum207 = _mm512_fmadd_ps(wt205, dat69, sum207);
sum208 = _mm512_fmadd_ps(wt205, dat70, sum208);
__m512 wt206 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i12+16*j14));
sum209 = _mm512_fmadd_ps(wt206, dat68, sum209);
sum210 = _mm512_fmadd_ps(wt206, dat69, sum210);
sum211 = _mm512_fmadd_ps(wt206, dat70, sum211);
}
_mm512_storeu_ps(sumPtr1+0+1536*i12, sum200);
_mm512_storeu_ps(sumPtr1+64+1536*i12, sum201);
_mm512_storeu_ps(sumPtr1+128+1536*i12, sum202);
_mm512_storeu_ps(sumPtr1+256+1536*i12, sum203);
_mm512_storeu_ps(sumPtr1+320+1536*i12, sum204);
_mm512_storeu_ps(sumPtr1+384+1536*i12, sum205);
_mm512_storeu_ps(sumPtr1+512+1536*i12, sum206);
_mm512_storeu_ps(sumPtr1+576+1536*i12, sum207);
_mm512_storeu_ps(sumPtr1+640+1536*i12, sum208);
_mm512_storeu_ps(sumPtr1+768+1536*i12, sum209);
_mm512_storeu_ps(sumPtr1+832+1536*i12, sum210);
_mm512_storeu_ps(sumPtr1+896+1536*i12, sum211);
break;
}
case 3: {
// Case 3: fused multiply-add kernel over a tile of four 512-bit data vectors
// (datPtr2 offsets 0/64/128/192, advancing 256 per inner step, 791 steps).
// Each inner step broadcasts consecutive floats from wtPtr2 (one per "row")
// and accumulates row x vector products into dedicated __m512 accumulators.
//
// Three variants follow, selected by epoch1, node6 and base1:
//   1. epoch1 == 0, node6 != 0, base1 != 0: sums start at zero and the
//      results overwrite sumPtr1.
//   2. any other case with (epoch1|node6) != 0: sums start at zero and the
//      results are added to the values already stored at sumPtr1.
//   3. epoch1 == 0 and node6 == 0: sums start from biasPtr2 broadcasts and
//      the results overwrite sumPtr1.
// Each variant has a 6-row body (weight stride 24 per step, row-group stride
// 18984) and a 4-row tail (weight stride 16 per step) that runs only when
// the row-group index equals 101.
//
// Store layout for row r of group i: the vector computed from datPtr2+0 goes
// to sumPtr1-312128+256*r+1536*i; the other three go to
// sumPtr1+256*r+{0,64,128}+1536*i.
// NOTE(review): the offsets look like byte offsets (weights step by 4, vectors
// by 64), and the negative base presumably addresses another region of the
// sum buffer. The pointer declarations and the meaning of epoch1/node6/base1
// are outside this view; confirm against the enclosing function.
if (epoch1|node6) {
if (!epoch1 && base1) {
// Variant 1: zero-initialised sums, plain stores. biasPtr2 is not read here.
// i13 starts at w3 and ii7 remembers that start, so the loop body below runs
// at most once (see the return at its end). If w3 == 101 the loop is skipped
// and the 4-row tail after it runs instead.
ptrdiff_t i13 = 1*w3;
ptrdiff_t ii7 = i13+0;
for (; i13 != 101; ++i13) {
__m512 sum212 = _mm512_setzero_ps();
__m512 sum216 = _mm512_setzero_ps();
__m512 sum220 = _mm512_setzero_ps();
__m512 sum224 = _mm512_setzero_ps();
__m512 sum228 = _mm512_setzero_ps();
__m512 sum232 = _mm512_setzero_ps();
(void)biasPtr2;
// Replicate each row's initial value across its four per-vector accumulators.
__m512 sum213 = sum212;
__m512 sum214 = sum212;
__m512 sum215 = sum212;
__m512 sum217 = sum216;
__m512 sum218 = sum216;
__m512 sum219 = sum216;
__m512 sum221 = sum220;
__m512 sum222 = sum220;
__m512 sum223 = sum220;
__m512 sum225 = sum224;
__m512 sum226 = sum224;
__m512 sum227 = sum224;
__m512 sum229 = sum228;
__m512 sum230 = sum228;
__m512 sum231 = sum228;
__m512 sum233 = sum232;
__m512 sum234 = sum232;
__m512 sum235 = sum232;
// Inner reduction: 4 data vectors x 6 broadcast weights per step.
for (ptrdiff_t j15 = 0; j15 < 791; ++j15) {
__m512 dat71 = _mm512_loadu_ps(datPtr2+0+256*j15);
__m512 dat72 = _mm512_loadu_ps(datPtr2+64+256*j15);
__m512 dat73 = _mm512_loadu_ps(datPtr2+128+256*j15);
__m512 dat74 = _mm512_loadu_ps(datPtr2+192+256*j15);
__m512 wt207 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i13+24*j15));
sum212 = _mm512_fmadd_ps(wt207, dat71, sum212);
sum213 = _mm512_fmadd_ps(wt207, dat72, sum213);
sum214 = _mm512_fmadd_ps(wt207, dat73, sum214);
sum215 = _mm512_fmadd_ps(wt207, dat74, sum215);
__m512 wt208 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i13+24*j15));
sum216 = _mm512_fmadd_ps(wt208, dat71, sum216);
sum217 = _mm512_fmadd_ps(wt208, dat72, sum217);
sum218 = _mm512_fmadd_ps(wt208, dat73, sum218);
sum219 = _mm512_fmadd_ps(wt208, dat74, sum219);
__m512 wt209 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i13+24*j15));
sum220 = _mm512_fmadd_ps(wt209, dat71, sum220);
sum221 = _mm512_fmadd_ps(wt209, dat72, sum221);
sum222 = _mm512_fmadd_ps(wt209, dat73, sum222);
sum223 = _mm512_fmadd_ps(wt209, dat74, sum223);
__m512 wt210 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i13+24*j15));
sum224 = _mm512_fmadd_ps(wt210, dat71, sum224);
sum225 = _mm512_fmadd_ps(wt210, dat72, sum225);
sum226 = _mm512_fmadd_ps(wt210, dat73, sum226);
sum227 = _mm512_fmadd_ps(wt210, dat74, sum227);
__m512 wt211 = _mm512_set1_ps(*(float*)(wtPtr2+16+18984*i13+24*j15));
sum228 = _mm512_fmadd_ps(wt211, dat71, sum228);
sum229 = _mm512_fmadd_ps(wt211, dat72, sum229);
sum230 = _mm512_fmadd_ps(wt211, dat73, sum230);
sum231 = _mm512_fmadd_ps(wt211, dat74, sum231);
__m512 wt212 = _mm512_set1_ps(*(float*)(wtPtr2+20+18984*i13+24*j15));
sum232 = _mm512_fmadd_ps(wt212, dat71, sum232);
sum233 = _mm512_fmadd_ps(wt212, dat72, sum233);
sum234 = _mm512_fmadd_ps(wt212, dat73, sum234);
sum235 = _mm512_fmadd_ps(wt212, dat74, sum235);
}
// Overwrite the destination with the finished sums (rows are 256 apart).
_mm512_storeu_ps(sumPtr1+-312128+1536*i13, sum212);
_mm512_storeu_ps(sumPtr1+0+1536*i13, sum213);
_mm512_storeu_ps(sumPtr1+64+1536*i13, sum214);
_mm512_storeu_ps(sumPtr1+128+1536*i13, sum215);
_mm512_storeu_ps(sumPtr1+-311872+1536*i13, sum216);
_mm512_storeu_ps(sumPtr1+256+1536*i13, sum217);
_mm512_storeu_ps(sumPtr1+320+1536*i13, sum218);
_mm512_storeu_ps(sumPtr1+384+1536*i13, sum219);
_mm512_storeu_ps(sumPtr1+-311616+1536*i13, sum220);
_mm512_storeu_ps(sumPtr1+512+1536*i13, sum221);
_mm512_storeu_ps(sumPtr1+576+1536*i13, sum222);
_mm512_storeu_ps(sumPtr1+640+1536*i13, sum223);
_mm512_storeu_ps(sumPtr1+-311360+1536*i13, sum224);
_mm512_storeu_ps(sumPtr1+768+1536*i13, sum225);
_mm512_storeu_ps(sumPtr1+832+1536*i13, sum226);
_mm512_storeu_ps(sumPtr1+896+1536*i13, sum227);
_mm512_storeu_ps(sumPtr1+-311104+1536*i13, sum228);
_mm512_storeu_ps(sumPtr1+1024+1536*i13, sum229);
_mm512_storeu_ps(sumPtr1+1088+1536*i13, sum230);
_mm512_storeu_ps(sumPtr1+1152+1536*i13, sum231);
_mm512_storeu_ps(sumPtr1+-310848+1536*i13, sum232);
_mm512_storeu_ps(sumPtr1+1280+1536*i13, sum233);
_mm512_storeu_ps(sumPtr1+1344+1536*i13, sum234);
_mm512_storeu_ps(sumPtr1+1408+1536*i13, sum235);
// Always true on the first pass (i13 == ii7): one row group per pass through this case.
if (i13 >= ii7) return;
}
// Tail, reached only when i13 == 101: four rows, weights packed with stride 16.
__m512 sum236 = _mm512_setzero_ps();
__m512 sum240 = _mm512_setzero_ps();
__m512 sum244 = _mm512_setzero_ps();
__m512 sum248 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum237 = sum236;
__m512 sum238 = sum236;
__m512 sum239 = sum236;
__m512 sum241 = sum240;
__m512 sum242 = sum240;
__m512 sum243 = sum240;
__m512 sum245 = sum244;
__m512 sum246 = sum244;
__m512 sum247 = sum244;
__m512 sum249 = sum248;
__m512 sum250 = sum248;
__m512 sum251 = sum248;
for (ptrdiff_t j16 = 0; j16 < 791; ++j16) {
__m512 dat75 = _mm512_loadu_ps(datPtr2+0+256*j16);
__m512 dat76 = _mm512_loadu_ps(datPtr2+64+256*j16);
__m512 dat77 = _mm512_loadu_ps(datPtr2+128+256*j16);
__m512 dat78 = _mm512_loadu_ps(datPtr2+192+256*j16);
__m512 wt213 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i13+16*j16));
sum236 = _mm512_fmadd_ps(wt213, dat75, sum236);
sum237 = _mm512_fmadd_ps(wt213, dat76, sum237);
sum238 = _mm512_fmadd_ps(wt213, dat77, sum238);
sum239 = _mm512_fmadd_ps(wt213, dat78, sum239);
__m512 wt214 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i13+16*j16));
sum240 = _mm512_fmadd_ps(wt214, dat75, sum240);
sum241 = _mm512_fmadd_ps(wt214, dat76, sum241);
sum242 = _mm512_fmadd_ps(wt214, dat77, sum242);
sum243 = _mm512_fmadd_ps(wt214, dat78, sum243);
__m512 wt215 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i13+16*j16));
sum244 = _mm512_fmadd_ps(wt215, dat75, sum244);
sum245 = _mm512_fmadd_ps(wt215, dat76, sum245);
sum246 = _mm512_fmadd_ps(wt215, dat77, sum246);
sum247 = _mm512_fmadd_ps(wt215, dat78, sum247);
__m512 wt216 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i13+16*j16));
sum248 = _mm512_fmadd_ps(wt216, dat75, sum248);
sum249 = _mm512_fmadd_ps(wt216, dat76, sum249);
sum250 = _mm512_fmadd_ps(wt216, dat77, sum250);
sum251 = _mm512_fmadd_ps(wt216, dat78, sum251);
}
_mm512_storeu_ps(sumPtr1+-312128+1536*i13, sum236);
_mm512_storeu_ps(sumPtr1+0+1536*i13, sum237);
_mm512_storeu_ps(sumPtr1+64+1536*i13, sum238);
_mm512_storeu_ps(sumPtr1+128+1536*i13, sum239);
_mm512_storeu_ps(sumPtr1+-311872+1536*i13, sum240);
_mm512_storeu_ps(sumPtr1+256+1536*i13, sum241);
_mm512_storeu_ps(sumPtr1+320+1536*i13, sum242);
_mm512_storeu_ps(sumPtr1+384+1536*i13, sum243);
_mm512_storeu_ps(sumPtr1+-311616+1536*i13, sum244);
_mm512_storeu_ps(sumPtr1+512+1536*i13, sum245);
_mm512_storeu_ps(sumPtr1+576+1536*i13, sum246);
_mm512_storeu_ps(sumPtr1+640+1536*i13, sum247);
_mm512_storeu_ps(sumPtr1+-311360+1536*i13, sum248);
_mm512_storeu_ps(sumPtr1+768+1536*i13, sum249);
_mm512_storeu_ps(sumPtr1+832+1536*i13, sum250);
_mm512_storeu_ps(sumPtr1+896+1536*i13, sum251);
return;
}
// Variant 2: zero-initialised sums that are added to what sumPtr1 already
// holds (load, add, store). biasPtr2 is not read here. Loop/tail structure
// is the same as above: the body runs at most once, the tail only for 101.
ptrdiff_t i14 = 1*w3;
ptrdiff_t ii8 = i14+0;
for (; i14 != 101; ++i14) {
__m512 sum252 = _mm512_setzero_ps();
__m512 sum256 = _mm512_setzero_ps();
__m512 sum260 = _mm512_setzero_ps();
__m512 sum264 = _mm512_setzero_ps();
__m512 sum268 = _mm512_setzero_ps();
__m512 sum272 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum253 = sum252;
__m512 sum254 = sum252;
__m512 sum255 = sum252;
__m512 sum257 = sum256;
__m512 sum258 = sum256;
__m512 sum259 = sum256;
__m512 sum261 = sum260;
__m512 sum262 = sum260;
__m512 sum263 = sum260;
__m512 sum265 = sum264;
__m512 sum266 = sum264;
__m512 sum267 = sum264;
__m512 sum269 = sum268;
__m512 sum270 = sum268;
__m512 sum271 = sum268;
__m512 sum273 = sum272;
__m512 sum274 = sum272;
__m512 sum275 = sum272;
for (ptrdiff_t j17 = 0; j17 < 791; ++j17) {
__m512 dat79 = _mm512_loadu_ps(datPtr2+0+256*j17);
__m512 dat80 = _mm512_loadu_ps(datPtr2+64+256*j17);
__m512 dat81 = _mm512_loadu_ps(datPtr2+128+256*j17);
__m512 dat82 = _mm512_loadu_ps(datPtr2+192+256*j17);
__m512 wt217 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i14+24*j17));
sum252 = _mm512_fmadd_ps(wt217, dat79, sum252);
sum253 = _mm512_fmadd_ps(wt217, dat80, sum253);
sum254 = _mm512_fmadd_ps(wt217, dat81, sum254);
sum255 = _mm512_fmadd_ps(wt217, dat82, sum255);
__m512 wt218 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i14+24*j17));
sum256 = _mm512_fmadd_ps(wt218, dat79, sum256);
sum257 = _mm512_fmadd_ps(wt218, dat80, sum257);
sum258 = _mm512_fmadd_ps(wt218, dat81, sum258);
sum259 = _mm512_fmadd_ps(wt218, dat82, sum259);
__m512 wt219 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i14+24*j17));
sum260 = _mm512_fmadd_ps(wt219, dat79, sum260);
sum261 = _mm512_fmadd_ps(wt219, dat80, sum261);
sum262 = _mm512_fmadd_ps(wt219, dat81, sum262);
sum263 = _mm512_fmadd_ps(wt219, dat82, sum263);
__m512 wt220 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i14+24*j17));
sum264 = _mm512_fmadd_ps(wt220, dat79, sum264);
sum265 = _mm512_fmadd_ps(wt220, dat80, sum265);
sum266 = _mm512_fmadd_ps(wt220, dat81, sum266);
sum267 = _mm512_fmadd_ps(wt220, dat82, sum267);
__m512 wt221 = _mm512_set1_ps(*(float*)(wtPtr2+16+18984*i14+24*j17));
sum268 = _mm512_fmadd_ps(wt221, dat79, sum268);
sum269 = _mm512_fmadd_ps(wt221, dat80, sum269);
sum270 = _mm512_fmadd_ps(wt221, dat81, sum270);
sum271 = _mm512_fmadd_ps(wt221, dat82, sum271);
__m512 wt222 = _mm512_set1_ps(*(float*)(wtPtr2+20+18984*i14+24*j17));
sum272 = _mm512_fmadd_ps(wt222, dat79, sum272);
sum273 = _mm512_fmadd_ps(wt222, dat80, sum273);
sum274 = _mm512_fmadd_ps(wt222, dat81, sum274);
sum275 = _mm512_fmadd_ps(wt222, dat82, sum275);
}
// Read-modify-write: add this pass's sums to the stored values.
_mm512_storeu_ps(sumPtr1+-312128+1536*i14, _mm512_add_ps(sum252, _mm512_loadu_ps(sumPtr1+-312128+1536*i14)));
_mm512_storeu_ps(sumPtr1+0+1536*i14, _mm512_add_ps(sum253, _mm512_loadu_ps(sumPtr1+0+1536*i14)));
_mm512_storeu_ps(sumPtr1+64+1536*i14, _mm512_add_ps(sum254, _mm512_loadu_ps(sumPtr1+64+1536*i14)));
_mm512_storeu_ps(sumPtr1+128+1536*i14, _mm512_add_ps(sum255, _mm512_loadu_ps(sumPtr1+128+1536*i14)));
_mm512_storeu_ps(sumPtr1+-311872+1536*i14, _mm512_add_ps(sum256, _mm512_loadu_ps(sumPtr1+-311872+1536*i14)));
_mm512_storeu_ps(sumPtr1+256+1536*i14, _mm512_add_ps(sum257, _mm512_loadu_ps(sumPtr1+256+1536*i14)));
_mm512_storeu_ps(sumPtr1+320+1536*i14, _mm512_add_ps(sum258, _mm512_loadu_ps(sumPtr1+320+1536*i14)));
_mm512_storeu_ps(sumPtr1+384+1536*i14, _mm512_add_ps(sum259, _mm512_loadu_ps(sumPtr1+384+1536*i14)));
_mm512_storeu_ps(sumPtr1+-311616+1536*i14, _mm512_add_ps(sum260, _mm512_loadu_ps(sumPtr1+-311616+1536*i14)));
_mm512_storeu_ps(sumPtr1+512+1536*i14, _mm512_add_ps(sum261, _mm512_loadu_ps(sumPtr1+512+1536*i14)));
_mm512_storeu_ps(sumPtr1+576+1536*i14, _mm512_add_ps(sum262, _mm512_loadu_ps(sumPtr1+576+1536*i14)));
_mm512_storeu_ps(sumPtr1+640+1536*i14, _mm512_add_ps(sum263, _mm512_loadu_ps(sumPtr1+640+1536*i14)));
_mm512_storeu_ps(sumPtr1+-311360+1536*i14, _mm512_add_ps(sum264, _mm512_loadu_ps(sumPtr1+-311360+1536*i14)));
_mm512_storeu_ps(sumPtr1+768+1536*i14, _mm512_add_ps(sum265, _mm512_loadu_ps(sumPtr1+768+1536*i14)));
_mm512_storeu_ps(sumPtr1+832+1536*i14, _mm512_add_ps(sum266, _mm512_loadu_ps(sumPtr1+832+1536*i14)));
_mm512_storeu_ps(sumPtr1+896+1536*i14, _mm512_add_ps(sum267, _mm512_loadu_ps(sumPtr1+896+1536*i14)));
_mm512_storeu_ps(sumPtr1+-311104+1536*i14, _mm512_add_ps(sum268, _mm512_loadu_ps(sumPtr1+-311104+1536*i14)));
_mm512_storeu_ps(sumPtr1+1024+1536*i14, _mm512_add_ps(sum269, _mm512_loadu_ps(sumPtr1+1024+1536*i14)));
_mm512_storeu_ps(sumPtr1+1088+1536*i14, _mm512_add_ps(sum270, _mm512_loadu_ps(sumPtr1+1088+1536*i14)));
_mm512_storeu_ps(sumPtr1+1152+1536*i14, _mm512_add_ps(sum271, _mm512_loadu_ps(sumPtr1+1152+1536*i14)));
_mm512_storeu_ps(sumPtr1+-310848+1536*i14, _mm512_add_ps(sum272, _mm512_loadu_ps(sumPtr1+-310848+1536*i14)));
_mm512_storeu_ps(sumPtr1+1280+1536*i14, _mm512_add_ps(sum273, _mm512_loadu_ps(sumPtr1+1280+1536*i14)));
_mm512_storeu_ps(sumPtr1+1344+1536*i14, _mm512_add_ps(sum274, _mm512_loadu_ps(sumPtr1+1344+1536*i14)));
_mm512_storeu_ps(sumPtr1+1408+1536*i14, _mm512_add_ps(sum275, _mm512_loadu_ps(sumPtr1+1408+1536*i14)));
// Always true on the first pass (i14 == ii8).
if (i14 >= ii8) return;
}
// Tail, reached only when i14 == 101: four rows, weight stride 16, accumulate into sumPtr1.
__m512 sum276 = _mm512_setzero_ps();
__m512 sum280 = _mm512_setzero_ps();
__m512 sum284 = _mm512_setzero_ps();
__m512 sum288 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum277 = sum276;
__m512 sum278 = sum276;
__m512 sum279 = sum276;
__m512 sum281 = sum280;
__m512 sum282 = sum280;
__m512 sum283 = sum280;
__m512 sum285 = sum284;
__m512 sum286 = sum284;
__m512 sum287 = sum284;
__m512 sum289 = sum288;
__m512 sum290 = sum288;
__m512 sum291 = sum288;
for (ptrdiff_t j18 = 0; j18 < 791; ++j18) {
__m512 dat83 = _mm512_loadu_ps(datPtr2+0+256*j18);
__m512 dat84 = _mm512_loadu_ps(datPtr2+64+256*j18);
__m512 dat85 = _mm512_loadu_ps(datPtr2+128+256*j18);
__m512 dat86 = _mm512_loadu_ps(datPtr2+192+256*j18);
__m512 wt223 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i14+16*j18));
sum276 = _mm512_fmadd_ps(wt223, dat83, sum276);
sum277 = _mm512_fmadd_ps(wt223, dat84, sum277);
sum278 = _mm512_fmadd_ps(wt223, dat85, sum278);
sum279 = _mm512_fmadd_ps(wt223, dat86, sum279);
__m512 wt224 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i14+16*j18));
sum280 = _mm512_fmadd_ps(wt224, dat83, sum280);
sum281 = _mm512_fmadd_ps(wt224, dat84, sum281);
sum282 = _mm512_fmadd_ps(wt224, dat85, sum282);
sum283 = _mm512_fmadd_ps(wt224, dat86, sum283);
__m512 wt225 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i14+16*j18));
sum284 = _mm512_fmadd_ps(wt225, dat83, sum284);
sum285 = _mm512_fmadd_ps(wt225, dat84, sum285);
sum286 = _mm512_fmadd_ps(wt225, dat85, sum286);
sum287 = _mm512_fmadd_ps(wt225, dat86, sum287);
__m512 wt226 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i14+16*j18));
sum288 = _mm512_fmadd_ps(wt226, dat83, sum288);
sum289 = _mm512_fmadd_ps(wt226, dat84, sum289);
sum290 = _mm512_fmadd_ps(wt226, dat85, sum290);
sum291 = _mm512_fmadd_ps(wt226, dat86, sum291);
}
_mm512_storeu_ps(sumPtr1+-312128+1536*i14, _mm512_add_ps(sum276, _mm512_loadu_ps(sumPtr1+-312128+1536*i14)));
_mm512_storeu_ps(sumPtr1+0+1536*i14, _mm512_add_ps(sum277, _mm512_loadu_ps(sumPtr1+0+1536*i14)));
_mm512_storeu_ps(sumPtr1+64+1536*i14, _mm512_add_ps(sum278, _mm512_loadu_ps(sumPtr1+64+1536*i14)));
_mm512_storeu_ps(sumPtr1+128+1536*i14, _mm512_add_ps(sum279, _mm512_loadu_ps(sumPtr1+128+1536*i14)));
_mm512_storeu_ps(sumPtr1+-311872+1536*i14, _mm512_add_ps(sum280, _mm512_loadu_ps(sumPtr1+-311872+1536*i14)));
_mm512_storeu_ps(sumPtr1+256+1536*i14, _mm512_add_ps(sum281, _mm512_loadu_ps(sumPtr1+256+1536*i14)));
_mm512_storeu_ps(sumPtr1+320+1536*i14, _mm512_add_ps(sum282, _mm512_loadu_ps(sumPtr1+320+1536*i14)));
_mm512_storeu_ps(sumPtr1+384+1536*i14, _mm512_add_ps(sum283, _mm512_loadu_ps(sumPtr1+384+1536*i14)));
_mm512_storeu_ps(sumPtr1+-311616+1536*i14, _mm512_add_ps(sum284, _mm512_loadu_ps(sumPtr1+-311616+1536*i14)));
_mm512_storeu_ps(sumPtr1+512+1536*i14, _mm512_add_ps(sum285, _mm512_loadu_ps(sumPtr1+512+1536*i14)));
_mm512_storeu_ps(sumPtr1+576+1536*i14, _mm512_add_ps(sum286, _mm512_loadu_ps(sumPtr1+576+1536*i14)));
_mm512_storeu_ps(sumPtr1+640+1536*i14, _mm512_add_ps(sum287, _mm512_loadu_ps(sumPtr1+640+1536*i14)));
_mm512_storeu_ps(sumPtr1+-311360+1536*i14, _mm512_add_ps(sum288, _mm512_loadu_ps(sumPtr1+-311360+1536*i14)));
_mm512_storeu_ps(sumPtr1+768+1536*i14, _mm512_add_ps(sum289, _mm512_loadu_ps(sumPtr1+768+1536*i14)));
_mm512_storeu_ps(sumPtr1+832+1536*i14, _mm512_add_ps(sum290, _mm512_loadu_ps(sumPtr1+832+1536*i14)));
_mm512_storeu_ps(sumPtr1+896+1536*i14, _mm512_add_ps(sum291, _mm512_loadu_ps(sumPtr1+896+1536*i14)));
return;
}
// Variant 3 (epoch1 == 0 and node6 == 0): each row's accumulators start from
// a broadcast float read from biasPtr2 (stride 24 per row group), and the
// results overwrite sumPtr1. base1 is not consulted on this path.
(void)base1;
ptrdiff_t i15 = 1*w3;
ptrdiff_t ii9 = i15+0;
for (; i15 != 101; ++i15) {
__m512 sum292 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i15));
__m512 sum296 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i15));
__m512 sum300 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i15));
__m512 sum304 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i15));
__m512 sum308 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i15));
__m512 sum312 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i15));
__m512 sum293 = sum292;
__m512 sum294 = sum292;
__m512 sum295 = sum292;
__m512 sum297 = sum296;
__m512 sum298 = sum296;
__m512 sum299 = sum296;
__m512 sum301 = sum300;
__m512 sum302 = sum300;
__m512 sum303 = sum300;
__m512 sum305 = sum304;
__m512 sum306 = sum304;
__m512 sum307 = sum304;
__m512 sum309 = sum308;
__m512 sum310 = sum308;
__m512 sum311 = sum308;
__m512 sum313 = sum312;
__m512 sum314 = sum312;
__m512 sum315 = sum312;
for (ptrdiff_t j19 = 0; j19 < 791; ++j19) {
__m512 dat87 = _mm512_loadu_ps(datPtr2+0+256*j19);
__m512 dat88 = _mm512_loadu_ps(datPtr2+64+256*j19);
__m512 dat89 = _mm512_loadu_ps(datPtr2+128+256*j19);
__m512 dat90 = _mm512_loadu_ps(datPtr2+192+256*j19);
__m512 wt227 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i15+24*j19));
sum292 = _mm512_fmadd_ps(wt227, dat87, sum292);
sum293 = _mm512_fmadd_ps(wt227, dat88, sum293);
sum294 = _mm512_fmadd_ps(wt227, dat89, sum294);
sum295 = _mm512_fmadd_ps(wt227, dat90, sum295);
__m512 wt228 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i15+24*j19));
sum296 = _mm512_fmadd_ps(wt228, dat87, sum296);
sum297 = _mm512_fmadd_ps(wt228, dat88, sum297);
sum298 = _mm512_fmadd_ps(wt228, dat89, sum298);
sum299 = _mm512_fmadd_ps(wt228, dat90, sum299);
__m512 wt229 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i15+24*j19));
sum300 = _mm512_fmadd_ps(wt229, dat87, sum300);
sum301 = _mm512_fmadd_ps(wt229, dat88, sum301);
sum302 = _mm512_fmadd_ps(wt229, dat89, sum302);
sum303 = _mm512_fmadd_ps(wt229, dat90, sum303);
__m512 wt230 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i15+24*j19));
sum304 = _mm512_fmadd_ps(wt230, dat87, sum304);
sum305 = _mm512_fmadd_ps(wt230, dat88, sum305);
sum306 = _mm512_fmadd_ps(wt230, dat89, sum306);
sum307 = _mm512_fmadd_ps(wt230, dat90, sum307);
__m512 wt231 = _mm512_set1_ps(*(float*)(wtPtr2+16+18984*i15+24*j19));
sum308 = _mm512_fmadd_ps(wt231, dat87, sum308);
sum309 = _mm512_fmadd_ps(wt231, dat88, sum309);
sum310 = _mm512_fmadd_ps(wt231, dat89, sum310);
sum311 = _mm512_fmadd_ps(wt231, dat90, sum311);
__m512 wt232 = _mm512_set1_ps(*(float*)(wtPtr2+20+18984*i15+24*j19));
sum312 = _mm512_fmadd_ps(wt232, dat87, sum312);
sum313 = _mm512_fmadd_ps(wt232, dat88, sum313);
sum314 = _mm512_fmadd_ps(wt232, dat89, sum314);
sum315 = _mm512_fmadd_ps(wt232, dat90, sum315);
}
_mm512_storeu_ps(sumPtr1+-312128+1536*i15, sum292);
_mm512_storeu_ps(sumPtr1+0+1536*i15, sum293);
_mm512_storeu_ps(sumPtr1+64+1536*i15, sum294);
_mm512_storeu_ps(sumPtr1+128+1536*i15, sum295);
_mm512_storeu_ps(sumPtr1+-311872+1536*i15, sum296);
_mm512_storeu_ps(sumPtr1+256+1536*i15, sum297);
_mm512_storeu_ps(sumPtr1+320+1536*i15, sum298);
_mm512_storeu_ps(sumPtr1+384+1536*i15, sum299);
_mm512_storeu_ps(sumPtr1+-311616+1536*i15, sum300);
_mm512_storeu_ps(sumPtr1+512+1536*i15, sum301);
_mm512_storeu_ps(sumPtr1+576+1536*i15, sum302);
_mm512_storeu_ps(sumPtr1+640+1536*i15, sum303);
_mm512_storeu_ps(sumPtr1+-311360+1536*i15, sum304);
_mm512_storeu_ps(sumPtr1+768+1536*i15, sum305);
_mm512_storeu_ps(sumPtr1+832+1536*i15, sum306);
_mm512_storeu_ps(sumPtr1+896+1536*i15, sum307);
_mm512_storeu_ps(sumPtr1+-311104+1536*i15, sum308);
_mm512_storeu_ps(sumPtr1+1024+1536*i15, sum309);
_mm512_storeu_ps(sumPtr1+1088+1536*i15, sum310);
_mm512_storeu_ps(sumPtr1+1152+1536*i15, sum311);
_mm512_storeu_ps(sumPtr1+-310848+1536*i15, sum312);
_mm512_storeu_ps(sumPtr1+1280+1536*i15, sum313);
_mm512_storeu_ps(sumPtr1+1344+1536*i15, sum314);
_mm512_storeu_ps(sumPtr1+1408+1536*i15, sum315);
// Always true on the first pass (i15 == ii9).
if (i15 >= ii9) return;
}
// Tail, reached only when i15 == 101: four bias-initialised rows, weight stride 16.
__m512 sum316 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i15));
__m512 sum320 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i15));
__m512 sum324 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i15));
__m512 sum328 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i15));
__m512 sum317 = sum316;
__m512 sum318 = sum316;
__m512 sum319 = sum316;
__m512 sum321 = sum320;
__m512 sum322 = sum320;
__m512 sum323 = sum320;
__m512 sum325 = sum324;
__m512 sum326 = sum324;
__m512 sum327 = sum324;
__m512 sum329 = sum328;
__m512 sum330 = sum328;
__m512 sum331 = sum328;
for (ptrdiff_t j20 = 0; j20 < 791; ++j20) {
__m512 dat91 = _mm512_loadu_ps(datPtr2+0+256*j20);
__m512 dat92 = _mm512_loadu_ps(datPtr2+64+256*j20);
__m512 dat93 = _mm512_loadu_ps(datPtr2+128+256*j20);
__m512 dat94 = _mm512_loadu_ps(datPtr2+192+256*j20);
__m512 wt233 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i15+16*j20));
sum316 = _mm512_fmadd_ps(wt233, dat91, sum316);
sum317 = _mm512_fmadd_ps(wt233, dat92, sum317);
sum318 = _mm512_fmadd_ps(wt233, dat93, sum318);
sum319 = _mm512_fmadd_ps(wt233, dat94, sum319);
__m512 wt234 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i15+16*j20));
sum320 = _mm512_fmadd_ps(wt234, dat91, sum320);
sum321 = _mm512_fmadd_ps(wt234, dat92, sum321);
sum322 = _mm512_fmadd_ps(wt234, dat93, sum322);
sum323 = _mm512_fmadd_ps(wt234, dat94, sum323);
__m512 wt235 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i15+16*j20));
sum324 = _mm512_fmadd_ps(wt235, dat91, sum324);
sum325 = _mm512_fmadd_ps(wt235, dat92, sum325);
sum326 = _mm512_fmadd_ps(wt235, dat93, sum326);
sum327 = _mm512_fmadd_ps(wt235, dat94, sum327);
__m512 wt236 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i15+16*j20));
sum328 = _mm512_fmadd_ps(wt236, dat91, sum328);
sum329 = _mm512_fmadd_ps(wt236, dat92, sum329);
sum330 = _mm512_fmadd_ps(wt236, dat93, sum330);
sum331 = _mm512_fmadd_ps(wt236, dat94, sum331);
}
_mm512_storeu_ps(sumPtr1+-312128+1536*i15, sum316);
_mm512_storeu_ps(sumPtr1+0+1536*i15, sum317);
_mm512_storeu_ps(sumPtr1+64+1536*i15, sum318);
_mm512_storeu_ps(sumPtr1+128+1536*i15, sum319);
_mm512_storeu_ps(sumPtr1+-311872+1536*i15, sum320);
_mm512_storeu_ps(sumPtr1+256+1536*i15, sum321);
_mm512_storeu_ps(sumPtr1+320+1536*i15, sum322);
_mm512_storeu_ps(sumPtr1+384+1536*i15, sum323);
_mm512_storeu_ps(sumPtr1+-311616+1536*i15, sum324);
_mm512_storeu_ps(sumPtr1+512+1536*i15, sum325);
_mm512_storeu_ps(sumPtr1+576+1536*i15, sum326);
_mm512_storeu_ps(sumPtr1+640+1536*i15, sum327);
_mm512_storeu_ps(sumPtr1+-311360+1536*i15, sum328);
_mm512_storeu_ps(sumPtr1+768+1536*i15, sum329);
_mm512_storeu_ps(sumPtr1+832+1536*i15, sum330);
_mm512_storeu_ps(sumPtr1+896+1536*i15, sum331);
// Unlike every other path in this case, the bias tail leaves the switch with
// break instead of returning.
break;
}
case 4: {
// Case 4: fused multiply-add kernel over a tile of two 512-bit data vectors
// (datPtr2 offsets 128 and 192, advancing 256 per inner step, 791 steps).
// Each inner step broadcasts consecutive floats from wtPtr2 (one per "row")
// and accumulates row x vector products into two accumulators per row.
//
// Three variants follow, selected by epoch1, node6 and base1:
//   1. epoch1 == 0, node6 != 0, base1 != 0: sums start at zero and the
//      results overwrite sumPtr1.
//   2. any other case with (epoch1|node6) != 0: sums start at zero and the
//      results are added to the values already stored at sumPtr1.
//   3. epoch1 == 0 and node6 == 0: sums start from biasPtr2 broadcasts and
//      the results overwrite sumPtr1.
// Each variant has a 6-row body (weight stride 24 per step, row-group stride
// 18984) and a 4-row tail (weight stride 16 per step) that runs only when
// the row-group index equals 101.
//
// Store layout for row r of group i: the two vectors go to
// sumPtr1+256*r+{0,64}+1536*i.
// NOTE(review): the offsets look like byte offsets (weights step by 4, vectors
// by 64). The pointer declarations and the meaning of epoch1/node6/base1 are
// outside this view; confirm against the enclosing function.
if (epoch1|node6) {
if (!epoch1 && base1) {
// Variant 1: zero-initialised sums, plain stores. biasPtr2 is not read here.
// i16 starts at w3 and ii10 remembers that start, so the loop body below runs
// at most once (see the return at its end). If w3 == 101 the loop is skipped
// and the 4-row tail after it runs instead.
ptrdiff_t i16 = 1*w3;
ptrdiff_t ii10 = i16+0;
for (; i16 != 101; ++i16) {
__m512 sum332 = _mm512_setzero_ps();
__m512 sum334 = _mm512_setzero_ps();
__m512 sum336 = _mm512_setzero_ps();
__m512 sum338 = _mm512_setzero_ps();
__m512 sum340 = _mm512_setzero_ps();
__m512 sum342 = _mm512_setzero_ps();
(void)biasPtr2;
// Second accumulator of each row starts from the same initial value.
__m512 sum333 = sum332;
__m512 sum335 = sum334;
__m512 sum337 = sum336;
__m512 sum339 = sum338;
__m512 sum341 = sum340;
__m512 sum343 = sum342;
// Inner reduction: 2 data vectors x 6 broadcast weights per step.
for (ptrdiff_t j21 = 0; j21 < 791; ++j21) {
__m512 dat95 = _mm512_loadu_ps(datPtr2+128+256*j21);
__m512 dat96 = _mm512_loadu_ps(datPtr2+192+256*j21);
__m512 wt237 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i16+24*j21));
sum332 = _mm512_fmadd_ps(wt237, dat95, sum332);
sum333 = _mm512_fmadd_ps(wt237, dat96, sum333);
__m512 wt238 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i16+24*j21));
sum334 = _mm512_fmadd_ps(wt238, dat95, sum334);
sum335 = _mm512_fmadd_ps(wt238, dat96, sum335);
__m512 wt239 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i16+24*j21));
sum336 = _mm512_fmadd_ps(wt239, dat95, sum336);
sum337 = _mm512_fmadd_ps(wt239, dat96, sum337);
__m512 wt240 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i16+24*j21));
sum338 = _mm512_fmadd_ps(wt240, dat95, sum338);
sum339 = _mm512_fmadd_ps(wt240, dat96, sum339);
__m512 wt241 = _mm512_set1_ps(*(float*)(wtPtr2+16+18984*i16+24*j21));
sum340 = _mm512_fmadd_ps(wt241, dat95, sum340);
sum341 = _mm512_fmadd_ps(wt241, dat96, sum341);
__m512 wt242 = _mm512_set1_ps(*(float*)(wtPtr2+20+18984*i16+24*j21));
sum342 = _mm512_fmadd_ps(wt242, dat95, sum342);
sum343 = _mm512_fmadd_ps(wt242, dat96, sum343);
}
// Overwrite the destination with the finished sums (rows are 256 apart).
_mm512_storeu_ps(sumPtr1+0+1536*i16, sum332);
_mm512_storeu_ps(sumPtr1+64+1536*i16, sum333);
_mm512_storeu_ps(sumPtr1+256+1536*i16, sum334);
_mm512_storeu_ps(sumPtr1+320+1536*i16, sum335);
_mm512_storeu_ps(sumPtr1+512+1536*i16, sum336);
_mm512_storeu_ps(sumPtr1+576+1536*i16, sum337);
_mm512_storeu_ps(sumPtr1+768+1536*i16, sum338);
_mm512_storeu_ps(sumPtr1+832+1536*i16, sum339);
_mm512_storeu_ps(sumPtr1+1024+1536*i16, sum340);
_mm512_storeu_ps(sumPtr1+1088+1536*i16, sum341);
_mm512_storeu_ps(sumPtr1+1280+1536*i16, sum342);
_mm512_storeu_ps(sumPtr1+1344+1536*i16, sum343);
// Always true on the first pass (i16 == ii10): one row group per pass through this case.
if (i16 >= ii10) return;
}
// Tail, reached only when i16 == 101: four rows, weights packed with stride 16.
__m512 sum344 = _mm512_setzero_ps();
__m512 sum346 = _mm512_setzero_ps();
__m512 sum348 = _mm512_setzero_ps();
__m512 sum350 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum345 = sum344;
__m512 sum347 = sum346;
__m512 sum349 = sum348;
__m512 sum351 = sum350;
for (ptrdiff_t j22 = 0; j22 < 791; ++j22) {
__m512 dat97 = _mm512_loadu_ps(datPtr2+128+256*j22);
__m512 dat98 = _mm512_loadu_ps(datPtr2+192+256*j22);
__m512 wt243 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i16+16*j22));
sum344 = _mm512_fmadd_ps(wt243, dat97, sum344);
sum345 = _mm512_fmadd_ps(wt243, dat98, sum345);
__m512 wt244 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i16+16*j22));
sum346 = _mm512_fmadd_ps(wt244, dat97, sum346);
sum347 = _mm512_fmadd_ps(wt244, dat98, sum347);
__m512 wt245 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i16+16*j22));
sum348 = _mm512_fmadd_ps(wt245, dat97, sum348);
sum349 = _mm512_fmadd_ps(wt245, dat98, sum349);
__m512 wt246 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i16+16*j22));
sum350 = _mm512_fmadd_ps(wt246, dat97, sum350);
sum351 = _mm512_fmadd_ps(wt246, dat98, sum351);
}
_mm512_storeu_ps(sumPtr1+0+1536*i16, sum344);
_mm512_storeu_ps(sumPtr1+64+1536*i16, sum345);
_mm512_storeu_ps(sumPtr1+256+1536*i16, sum346);
_mm512_storeu_ps(sumPtr1+320+1536*i16, sum347);
_mm512_storeu_ps(sumPtr1+512+1536*i16, sum348);
_mm512_storeu_ps(sumPtr1+576+1536*i16, sum349);
_mm512_storeu_ps(sumPtr1+768+1536*i16, sum350);
_mm512_storeu_ps(sumPtr1+832+1536*i16, sum351);
return;
}
// Variant 2: zero-initialised sums that are added to what sumPtr1 already
// holds (load, add, store). biasPtr2 is not read here. Loop/tail structure
// is the same as above: the body runs at most once, the tail only for 101.
ptrdiff_t i17 = 1*w3;
ptrdiff_t ii11 = i17+0;
for (; i17 != 101; ++i17) {
__m512 sum352 = _mm512_setzero_ps();
__m512 sum354 = _mm512_setzero_ps();
__m512 sum356 = _mm512_setzero_ps();
__m512 sum358 = _mm512_setzero_ps();
__m512 sum360 = _mm512_setzero_ps();
__m512 sum362 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum353 = sum352;
__m512 sum355 = sum354;
__m512 sum357 = sum356;
__m512 sum359 = sum358;
__m512 sum361 = sum360;
__m512 sum363 = sum362;
for (ptrdiff_t j23 = 0; j23 < 791; ++j23) {
__m512 dat99 = _mm512_loadu_ps(datPtr2+128+256*j23);
__m512 dat100 = _mm512_loadu_ps(datPtr2+192+256*j23);
__m512 wt247 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i17+24*j23));
sum352 = _mm512_fmadd_ps(wt247, dat99, sum352);
sum353 = _mm512_fmadd_ps(wt247, dat100, sum353);
__m512 wt248 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i17+24*j23));
sum354 = _mm512_fmadd_ps(wt248, dat99, sum354);
sum355 = _mm512_fmadd_ps(wt248, dat100, sum355);
__m512 wt249 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i17+24*j23));
sum356 = _mm512_fmadd_ps(wt249, dat99, sum356);
sum357 = _mm512_fmadd_ps(wt249, dat100, sum357);
__m512 wt250 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i17+24*j23));
sum358 = _mm512_fmadd_ps(wt250, dat99, sum358);
sum359 = _mm512_fmadd_ps(wt250, dat100, sum359);
__m512 wt251 = _mm512_set1_ps(*(float*)(wtPtr2+16+18984*i17+24*j23));
sum360 = _mm512_fmadd_ps(wt251, dat99, sum360);
sum361 = _mm512_fmadd_ps(wt251, dat100, sum361);
__m512 wt252 = _mm512_set1_ps(*(float*)(wtPtr2+20+18984*i17+24*j23));
sum362 = _mm512_fmadd_ps(wt252, dat99, sum362);
sum363 = _mm512_fmadd_ps(wt252, dat100, sum363);
}
// Read-modify-write: add this pass's sums to the stored values.
_mm512_storeu_ps(sumPtr1+0+1536*i17, _mm512_add_ps(sum352, _mm512_loadu_ps(sumPtr1+0+1536*i17)));
_mm512_storeu_ps(sumPtr1+64+1536*i17, _mm512_add_ps(sum353, _mm512_loadu_ps(sumPtr1+64+1536*i17)));
_mm512_storeu_ps(sumPtr1+256+1536*i17, _mm512_add_ps(sum354, _mm512_loadu_ps(sumPtr1+256+1536*i17)));
_mm512_storeu_ps(sumPtr1+320+1536*i17, _mm512_add_ps(sum355, _mm512_loadu_ps(sumPtr1+320+1536*i17)));
_mm512_storeu_ps(sumPtr1+512+1536*i17, _mm512_add_ps(sum356, _mm512_loadu_ps(sumPtr1+512+1536*i17)));
_mm512_storeu_ps(sumPtr1+576+1536*i17, _mm512_add_ps(sum357, _mm512_loadu_ps(sumPtr1+576+1536*i17)));
_mm512_storeu_ps(sumPtr1+768+1536*i17, _mm512_add_ps(sum358, _mm512_loadu_ps(sumPtr1+768+1536*i17)));
_mm512_storeu_ps(sumPtr1+832+1536*i17, _mm512_add_ps(sum359, _mm512_loadu_ps(sumPtr1+832+1536*i17)));
_mm512_storeu_ps(sumPtr1+1024+1536*i17, _mm512_add_ps(sum360, _mm512_loadu_ps(sumPtr1+1024+1536*i17)));
_mm512_storeu_ps(sumPtr1+1088+1536*i17, _mm512_add_ps(sum361, _mm512_loadu_ps(sumPtr1+1088+1536*i17)));
_mm512_storeu_ps(sumPtr1+1280+1536*i17, _mm512_add_ps(sum362, _mm512_loadu_ps(sumPtr1+1280+1536*i17)));
_mm512_storeu_ps(sumPtr1+1344+1536*i17, _mm512_add_ps(sum363, _mm512_loadu_ps(sumPtr1+1344+1536*i17)));
// Always true on the first pass (i17 == ii11).
if (i17 >= ii11) return;
}
// Tail, reached only when i17 == 101: four rows, weight stride 16, accumulate into sumPtr1.
__m512 sum364 = _mm512_setzero_ps();
__m512 sum366 = _mm512_setzero_ps();
__m512 sum368 = _mm512_setzero_ps();
__m512 sum370 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum365 = sum364;
__m512 sum367 = sum366;
__m512 sum369 = sum368;
__m512 sum371 = sum370;
for (ptrdiff_t j24 = 0; j24 < 791; ++j24) {
__m512 dat101 = _mm512_loadu_ps(datPtr2+128+256*j24);
__m512 dat102 = _mm512_loadu_ps(datPtr2+192+256*j24);
__m512 wt253 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i17+16*j24));
sum364 = _mm512_fmadd_ps(wt253, dat101, sum364);
sum365 = _mm512_fmadd_ps(wt253, dat102, sum365);
__m512 wt254 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i17+16*j24));
sum366 = _mm512_fmadd_ps(wt254, dat101, sum366);
sum367 = _mm512_fmadd_ps(wt254, dat102, sum367);
__m512 wt255 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i17+16*j24));
sum368 = _mm512_fmadd_ps(wt255, dat101, sum368);
sum369 = _mm512_fmadd_ps(wt255, dat102, sum369);
__m512 wt256 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i17+16*j24));
sum370 = _mm512_fmadd_ps(wt256, dat101, sum370);
sum371 = _mm512_fmadd_ps(wt256, dat102, sum371);
}
_mm512_storeu_ps(sumPtr1+0+1536*i17, _mm512_add_ps(sum364, _mm512_loadu_ps(sumPtr1+0+1536*i17)));
_mm512_storeu_ps(sumPtr1+64+1536*i17, _mm512_add_ps(sum365, _mm512_loadu_ps(sumPtr1+64+1536*i17)));
_mm512_storeu_ps(sumPtr1+256+1536*i17, _mm512_add_ps(sum366, _mm512_loadu_ps(sumPtr1+256+1536*i17)));
_mm512_storeu_ps(sumPtr1+320+1536*i17, _mm512_add_ps(sum367, _mm512_loadu_ps(sumPtr1+320+1536*i17)));
_mm512_storeu_ps(sumPtr1+512+1536*i17, _mm512_add_ps(sum368, _mm512_loadu_ps(sumPtr1+512+1536*i17)));
_mm512_storeu_ps(sumPtr1+576+1536*i17, _mm512_add_ps(sum369, _mm512_loadu_ps(sumPtr1+576+1536*i17)));
_mm512_storeu_ps(sumPtr1+768+1536*i17, _mm512_add_ps(sum370, _mm512_loadu_ps(sumPtr1+768+1536*i17)));
_mm512_storeu_ps(sumPtr1+832+1536*i17, _mm512_add_ps(sum371, _mm512_loadu_ps(sumPtr1+832+1536*i17)));
return;
}
// Variant 3 (epoch1 == 0 and node6 == 0): each row's accumulators start from
// a broadcast float read from biasPtr2 (stride 24 per row group), and the
// results overwrite sumPtr1. base1 is not consulted on this path.
(void)base1;
ptrdiff_t i18 = 1*w3;
ptrdiff_t ii12 = i18+0;
for (; i18 != 101; ++i18) {
__m512 sum372 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i18));
__m512 sum374 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i18));
__m512 sum376 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i18));
__m512 sum378 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i18));
__m512 sum380 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i18));
__m512 sum382 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i18));
__m512 sum373 = sum372;
__m512 sum375 = sum374;
__m512 sum377 = sum376;
__m512 sum379 = sum378;
__m512 sum381 = sum380;
__m512 sum383 = sum382;
for (ptrdiff_t j25 = 0; j25 < 791; ++j25) {
__m512 dat103 = _mm512_loadu_ps(datPtr2+128+256*j25);
__m512 dat104 = _mm512_loadu_ps(datPtr2+192+256*j25);
__m512 wt257 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i18+24*j25));
sum372 = _mm512_fmadd_ps(wt257, dat103, sum372);
sum373 = _mm512_fmadd_ps(wt257, dat104, sum373);
__m512 wt258 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i18+24*j25));
sum374 = _mm512_fmadd_ps(wt258, dat103, sum374);
sum375 = _mm512_fmadd_ps(wt258, dat104, sum375);
__m512 wt259 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i18+24*j25));
sum376 = _mm512_fmadd_ps(wt259, dat103, sum376);
sum377 = _mm512_fmadd_ps(wt259, dat104, sum377);
__m512 wt260 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i18+24*j25));
sum378 = _mm512_fmadd_ps(wt260, dat103, sum378);
sum379 = _mm512_fmadd_ps(wt260, dat104, sum379);
__m512 wt261 = _mm512_set1_ps(*(float*)(wtPtr2+16+18984*i18+24*j25));
sum380 = _mm512_fmadd_ps(wt261, dat103, sum380);
sum381 = _mm512_fmadd_ps(wt261, dat104, sum381);
__m512 wt262 = _mm512_set1_ps(*(float*)(wtPtr2+20+18984*i18+24*j25));
sum382 = _mm512_fmadd_ps(wt262, dat103, sum382);
sum383 = _mm512_fmadd_ps(wt262, dat104, sum383);
}
_mm512_storeu_ps(sumPtr1+0+1536*i18, sum372);
_mm512_storeu_ps(sumPtr1+64+1536*i18, sum373);
_mm512_storeu_ps(sumPtr1+256+1536*i18, sum374);
_mm512_storeu_ps(sumPtr1+320+1536*i18, sum375);
_mm512_storeu_ps(sumPtr1+512+1536*i18, sum376);
_mm512_storeu_ps(sumPtr1+576+1536*i18, sum377);
_mm512_storeu_ps(sumPtr1+768+1536*i18, sum378);
_mm512_storeu_ps(sumPtr1+832+1536*i18, sum379);
_mm512_storeu_ps(sumPtr1+1024+1536*i18, sum380);
_mm512_storeu_ps(sumPtr1+1088+1536*i18, sum381);
_mm512_storeu_ps(sumPtr1+1280+1536*i18, sum382);
_mm512_storeu_ps(sumPtr1+1344+1536*i18, sum383);
// Always true on the first pass (i18 == ii12).
if (i18 >= ii12) return;
}
// Tail, reached only when i18 == 101: four bias-initialised rows, weight stride 16.
__m512 sum384 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i18));
__m512 sum386 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i18));
__m512 sum388 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i18));
__m512 sum390 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i18));
__m512 sum385 = sum384;
__m512 sum387 = sum386;
__m512 sum389 = sum388;
__m512 sum391 = sum390;
for (ptrdiff_t j26 = 0; j26 < 791; ++j26) {
__m512 dat105 = _mm512_loadu_ps(datPtr2+128+256*j26);
__m512 dat106 = _mm512_loadu_ps(datPtr2+192+256*j26);
__m512 wt263 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i18+16*j26));
sum384 = _mm512_fmadd_ps(wt263, dat105, sum384);
sum385 = _mm512_fmadd_ps(wt263, dat106, sum385);
__m512 wt264 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i18+16*j26));
sum386 = _mm512_fmadd_ps(wt264, dat105, sum386);
sum387 = _mm512_fmadd_ps(wt264, dat106, sum387);
__m512 wt265 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i18+16*j26));
sum388 = _mm512_fmadd_ps(wt265, dat105, sum388);
sum389 = _mm512_fmadd_ps(wt265, dat106, sum389);
__m512 wt266 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i18+16*j26));
sum390 = _mm512_fmadd_ps(wt266, dat105, sum390);
sum391 = _mm512_fmadd_ps(wt266, dat106, sum391);
}
_mm512_storeu_ps(sumPtr1+0+1536*i18, sum384);
_mm512_storeu_ps(sumPtr1+64+1536*i18, sum385);
_mm512_storeu_ps(sumPtr1+256+1536*i18, sum386);
_mm512_storeu_ps(sumPtr1+320+1536*i18, sum387);
_mm512_storeu_ps(sumPtr1+512+1536*i18, sum388);
_mm512_storeu_ps(sumPtr1+576+1536*i18, sum389);
_mm512_storeu_ps(sumPtr1+768+1536*i18, sum390);
_mm512_storeu_ps(sumPtr1+832+1536*i18, sum391);
// Unlike every other path in this case, the bias tail leaves the switch with
// break instead of returning.
break;
}
case 5: {
if (epoch1|node6) {
if (!epoch1 && base1) {
ptrdiff_t i19 = 1*w3;
ptrdiff_t ii13 = i19+0;
for (; i19 != 101; ++i19) {
__m512 sum392 = _mm512_setzero_ps();
__m512 sum396 = _mm512_setzero_ps();
__m512 sum400 = _mm512_setzero_ps();
__m512 sum404 = _mm512_setzero_ps();
__m512 sum408 = _mm512_setzero_ps();
__m512 sum412 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum393 = sum392;
__m512 sum394 = sum392;
__m512 sum395 = sum392;
__m512 sum397 = sum396;
__m512 sum398 = sum396;
__m512 sum399 = sum396;
__m512 sum401 = sum400;
__m512 sum402 = sum400;
__m512 sum403 = sum400;
__m512 sum405 = sum404;
__m512 sum406 = sum404;
__m512 sum407 = sum404;
__m512 sum409 = sum408;
__m512 sum410 = sum408;
__m512 sum411 = sum408;
__m512 sum413 = sum412;
__m512 sum414 = sum412;
__m512 sum415 = sum412;
for (ptrdiff_t j27 = 0; j27 < 791; ++j27) {
__m512 dat107 = _mm512_loadu_ps(datPtr2+0+256*j27);
__m512 dat108 = _mm512_loadu_ps(datPtr2+64+256*j27);
__m512 dat109 = _mm512_loadu_ps(datPtr2+128+256*j27);
__m512 dat110 = _mm512_loadu_ps(datPtr2+192+256*j27);
__m512 wt267 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i19+24*j27));
sum392 = _mm512_fmadd_ps(wt267, dat107, sum392);
sum393 = _mm512_fmadd_ps(wt267, dat108, sum393);
sum394 = _mm512_fmadd_ps(wt267, dat109, sum394);
sum395 = _mm512_fmadd_ps(wt267, dat110, sum395);
__m512 wt268 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i19+24*j27));
sum396 = _mm512_fmadd_ps(wt268, dat107, sum396);
sum397 = _mm512_fmadd_ps(wt268, dat108, sum397);
sum398 = _mm512_fmadd_ps(wt268, dat109, sum398);
sum399 = _mm512_fmadd_ps(wt268, dat110, sum399);
__m512 wt269 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i19+24*j27));
sum400 = _mm512_fmadd_ps(wt269, dat107, sum400);
sum401 = _mm512_fmadd_ps(wt269, dat108, sum401);
sum402 = _mm512_fmadd_ps(wt269, dat109, sum402);
sum403 = _mm512_fmadd_ps(wt269, dat110, sum403);
__m512 wt270 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i19+24*j27));
sum404 = _mm512_fmadd_ps(wt270, dat107, sum404);
sum405 = _mm512_fmadd_ps(wt270, dat108, sum405);
sum406 = _mm512_fmadd_ps(wt270, dat109, sum406);
sum407 = _mm512_fmadd_ps(wt270, dat110, sum407);
__m512 wt271 = _mm512_set1_ps(*(float*)(wtPtr2+16+18984*i19+24*j27));
sum408 = _mm512_fmadd_ps(wt271, dat107, sum408);
sum409 = _mm512_fmadd_ps(wt271, dat108, sum409);
sum410 = _mm512_fmadd_ps(wt271, dat109, sum410);
sum411 = _mm512_fmadd_ps(wt271, dat110, sum411);
__m512 wt272 = _mm512_set1_ps(*(float*)(wtPtr2+20+18984*i19+24*j27));
sum412 = _mm512_fmadd_ps(wt272, dat107, sum412);
sum413 = _mm512_fmadd_ps(wt272, dat108, sum413);
sum414 = _mm512_fmadd_ps(wt272, dat109, sum414);
sum415 = _mm512_fmadd_ps(wt272, dat110, sum415);
}
_mm512_storeu_ps(sumPtr1+-312192+1536*i19, sum392);
_mm512_storeu_ps(sumPtr1+-312128+1536*i19, sum393);
_mm512_storeu_ps(sumPtr1+0+1536*i19, sum394);
_mm512_storeu_ps(sumPtr1+64+1536*i19, sum395);
_mm512_storeu_ps(sumPtr1+-311936+1536*i19, sum396);
_mm512_storeu_ps(sumPtr1+-311872+1536*i19, sum397);
_mm512_storeu_ps(sumPtr1+256+1536*i19, sum398);
_mm512_storeu_ps(sumPtr1+320+1536*i19, sum399);
_mm512_storeu_ps(sumPtr1+-311680+1536*i19, sum400);
_mm512_storeu_ps(sumPtr1+-311616+1536*i19, sum401);
_mm512_storeu_ps(sumPtr1+512+1536*i19, sum402);
_mm512_storeu_ps(sumPtr1+576+1536*i19, sum403);
_mm512_storeu_ps(sumPtr1+-311424+1536*i19, sum404);
_mm512_storeu_ps(sumPtr1+-311360+1536*i19, sum405);
_mm512_storeu_ps(sumPtr1+768+1536*i19, sum406);
_mm512_storeu_ps(sumPtr1+832+1536*i19, sum407);
_mm512_storeu_ps(sumPtr1+-311168+1536*i19, sum408);
_mm512_storeu_ps(sumPtr1+-311104+1536*i19, sum409);
_mm512_storeu_ps(sumPtr1+1024+1536*i19, sum410);
_mm512_storeu_ps(sumPtr1+1088+1536*i19, sum411);
_mm512_storeu_ps(sumPtr1+-310912+1536*i19, sum412);
_mm512_storeu_ps(sumPtr1+-310848+1536*i19, sum413);
_mm512_storeu_ps(sumPtr1+1280+1536*i19, sum414);
_mm512_storeu_ps(sumPtr1+1344+1536*i19, sum415);
if (i19 >= ii13) return;
}
__m512 sum416 = _mm512_setzero_ps();
__m512 sum420 = _mm512_setzero_ps();
__m512 sum424 = _mm512_setzero_ps();
__m512 sum428 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum417 = sum416;
__m512 sum418 = sum416;
__m512 sum419 = sum416;
__m512 sum421 = sum420;
__m512 sum422 = sum420;
__m512 sum423 = sum420;
__m512 sum425 = sum424;
__m512 sum426 = sum424;
__m512 sum427 = sum424;
__m512 sum429 = sum428;
__m512 sum430 = sum428;
__m512 sum431 = sum428;
for (ptrdiff_t j28 = 0; j28 < 791; ++j28) {
__m512 dat111 = _mm512_loadu_ps(datPtr2+0+256*j28);
__m512 dat112 = _mm512_loadu_ps(datPtr2+64+256*j28);
__m512 dat113 = _mm512_loadu_ps(datPtr2+128+256*j28);
__m512 dat114 = _mm512_loadu_ps(datPtr2+192+256*j28);
__m512 wt273 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i19+16*j28));
sum416 = _mm512_fmadd_ps(wt273, dat111, sum416);
sum417 = _mm512_fmadd_ps(wt273, dat112, sum417);
sum418 = _mm512_fmadd_ps(wt273, dat113, sum418);
sum419 = _mm512_fmadd_ps(wt273, dat114, sum419);
__m512 wt274 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i19+16*j28));
sum420 = _mm512_fmadd_ps(wt274, dat111, sum420);
sum421 = _mm512_fmadd_ps(wt274, dat112, sum421);
sum422 = _mm512_fmadd_ps(wt274, dat113, sum422);
sum423 = _mm512_fmadd_ps(wt274, dat114, sum423);
__m512 wt275 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i19+16*j28));
sum424 = _mm512_fmadd_ps(wt275, dat111, sum424);
sum425 = _mm512_fmadd_ps(wt275, dat112, sum425);
sum426 = _mm512_fmadd_ps(wt275, dat113, sum426);
sum427 = _mm512_fmadd_ps(wt275, dat114, sum427);
__m512 wt276 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i19+16*j28));
sum428 = _mm512_fmadd_ps(wt276, dat111, sum428);
sum429 = _mm512_fmadd_ps(wt276, dat112, sum429);
sum430 = _mm512_fmadd_ps(wt276, dat113, sum430);
sum431 = _mm512_fmadd_ps(wt276, dat114, sum431);
}
_mm512_storeu_ps(sumPtr1+-312192+1536*i19, sum416);
_mm512_storeu_ps(sumPtr1+-312128+1536*i19, sum417);
_mm512_storeu_ps(sumPtr1+0+1536*i19, sum418);
_mm512_storeu_ps(sumPtr1+64+1536*i19, sum419);
_mm512_storeu_ps(sumPtr1+-311936+1536*i19, sum420);
_mm512_storeu_ps(sumPtr1+-311872+1536*i19, sum421);
_mm512_storeu_ps(sumPtr1+256+1536*i19, sum422);
_mm512_storeu_ps(sumPtr1+320+1536*i19, sum423);
_mm512_storeu_ps(sumPtr1+-311680+1536*i19, sum424);
_mm512_storeu_ps(sumPtr1+-311616+1536*i19, sum425);
_mm512_storeu_ps(sumPtr1+512+1536*i19, sum426);
_mm512_storeu_ps(sumPtr1+576+1536*i19, sum427);
_mm512_storeu_ps(sumPtr1+-311424+1536*i19, sum428);
_mm512_storeu_ps(sumPtr1+-311360+1536*i19, sum429);
_mm512_storeu_ps(sumPtr1+768+1536*i19, sum430);
_mm512_storeu_ps(sumPtr1+832+1536*i19, sum431);
return;
}
ptrdiff_t i20 = 1*w3;
ptrdiff_t ii14 = i20+0;
for (; i20 != 101; ++i20) {
__m512 sum432 = _mm512_setzero_ps();
__m512 sum436 = _mm512_setzero_ps();
__m512 sum440 = _mm512_setzero_ps();
__m512 sum444 = _mm512_setzero_ps();
__m512 sum448 = _mm512_setzero_ps();
__m512 sum452 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum433 = sum432;
__m512 sum434 = sum432;
__m512 sum435 = sum432;
__m512 sum437 = sum436;
__m512 sum438 = sum436;
__m512 sum439 = sum436;
__m512 sum441 = sum440;
__m512 sum442 = sum440;
__m512 sum443 = sum440;
__m512 sum445 = sum444;
__m512 sum446 = sum444;
__m512 sum447 = sum444;
__m512 sum449 = sum448;
__m512 sum450 = sum448;
__m512 sum451 = sum448;
__m512 sum453 = sum452;
__m512 sum454 = sum452;
__m512 sum455 = sum452;
for (ptrdiff_t j29 = 0; j29 < 791; ++j29) {
__m512 dat115 = _mm512_loadu_ps(datPtr2+0+256*j29);
__m512 dat116 = _mm512_loadu_ps(datPtr2+64+256*j29);
__m512 dat117 = _mm512_loadu_ps(datPtr2+128+256*j29);
__m512 dat118 = _mm512_loadu_ps(datPtr2+192+256*j29);
__m512 wt277 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i20+24*j29));
sum432 = _mm512_fmadd_ps(wt277, dat115, sum432);
sum433 = _mm512_fmadd_ps(wt277, dat116, sum433);
sum434 = _mm512_fmadd_ps(wt277, dat117, sum434);
sum435 = _mm512_fmadd_ps(wt277, dat118, sum435);
__m512 wt278 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i20+24*j29));
sum436 = _mm512_fmadd_ps(wt278, dat115, sum436);
sum437 = _mm512_fmadd_ps(wt278, dat116, sum437);
sum438 = _mm512_fmadd_ps(wt278, dat117, sum438);
sum439 = _mm512_fmadd_ps(wt278, dat118, sum439);
__m512 wt279 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i20+24*j29));
sum440 = _mm512_fmadd_ps(wt279, dat115, sum440);
sum441 = _mm512_fmadd_ps(wt279, dat116, sum441);
sum442 = _mm512_fmadd_ps(wt279, dat117, sum442);
sum443 = _mm512_fmadd_ps(wt279, dat118, sum443);
__m512 wt280 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i20+24*j29));
sum444 = _mm512_fmadd_ps(wt280, dat115, sum444);
sum445 = _mm512_fmadd_ps(wt280, dat116, sum445);
sum446 = _mm512_fmadd_ps(wt280, dat117, sum446);
sum447 = _mm512_fmadd_ps(wt280, dat118, sum447);
__m512 wt281 = _mm512_set1_ps(*(float*)(wtPtr2+16+18984*i20+24*j29));
sum448 = _mm512_fmadd_ps(wt281, dat115, sum448);
sum449 = _mm512_fmadd_ps(wt281, dat116, sum449);
sum450 = _mm512_fmadd_ps(wt281, dat117, sum450);
sum451 = _mm512_fmadd_ps(wt281, dat118, sum451);
__m512 wt282 = _mm512_set1_ps(*(float*)(wtPtr2+20+18984*i20+24*j29));
sum452 = _mm512_fmadd_ps(wt282, dat115, sum452);
sum453 = _mm512_fmadd_ps(wt282, dat116, sum453);
sum454 = _mm512_fmadd_ps(wt282, dat117, sum454);
sum455 = _mm512_fmadd_ps(wt282, dat118, sum455);
}
_mm512_storeu_ps(sumPtr1+-312192+1536*i20, _mm512_add_ps(sum432, _mm512_loadu_ps(sumPtr1+-312192+1536*i20)));
_mm512_storeu_ps(sumPtr1+-312128+1536*i20, _mm512_add_ps(sum433, _mm512_loadu_ps(sumPtr1+-312128+1536*i20)));
_mm512_storeu_ps(sumPtr1+0+1536*i20, _mm512_add_ps(sum434, _mm512_loadu_ps(sumPtr1+0+1536*i20)));
_mm512_storeu_ps(sumPtr1+64+1536*i20, _mm512_add_ps(sum435, _mm512_loadu_ps(sumPtr1+64+1536*i20)));
_mm512_storeu_ps(sumPtr1+-311936+1536*i20, _mm512_add_ps(sum436, _mm512_loadu_ps(sumPtr1+-311936+1536*i20)));
_mm512_storeu_ps(sumPtr1+-311872+1536*i20, _mm512_add_ps(sum437, _mm512_loadu_ps(sumPtr1+-311872+1536*i20)));
_mm512_storeu_ps(sumPtr1+256+1536*i20, _mm512_add_ps(sum438, _mm512_loadu_ps(sumPtr1+256+1536*i20)));
_mm512_storeu_ps(sumPtr1+320+1536*i20, _mm512_add_ps(sum439, _mm512_loadu_ps(sumPtr1+320+1536*i20)));
_mm512_storeu_ps(sumPtr1+-311680+1536*i20, _mm512_add_ps(sum440, _mm512_loadu_ps(sumPtr1+-311680+1536*i20)));
_mm512_storeu_ps(sumPtr1+-311616+1536*i20, _mm512_add_ps(sum441, _mm512_loadu_ps(sumPtr1+-311616+1536*i20)));
_mm512_storeu_ps(sumPtr1+512+1536*i20, _mm512_add_ps(sum442, _mm512_loadu_ps(sumPtr1+512+1536*i20)));
_mm512_storeu_ps(sumPtr1+576+1536*i20, _mm512_add_ps(sum443, _mm512_loadu_ps(sumPtr1+576+1536*i20)));
_mm512_storeu_ps(sumPtr1+-311424+1536*i20, _mm512_add_ps(sum444, _mm512_loadu_ps(sumPtr1+-311424+1536*i20)));
_mm512_storeu_ps(sumPtr1+-311360+1536*i20, _mm512_add_ps(sum445, _mm512_loadu_ps(sumPtr1+-311360+1536*i20)));
_mm512_storeu_ps(sumPtr1+768+1536*i20, _mm512_add_ps(sum446, _mm512_loadu_ps(sumPtr1+768+1536*i20)));
_mm512_storeu_ps(sumPtr1+832+1536*i20, _mm512_add_ps(sum447, _mm512_loadu_ps(sumPtr1+832+1536*i20)));
_mm512_storeu_ps(sumPtr1+-311168+1536*i20, _mm512_add_ps(sum448, _mm512_loadu_ps(sumPtr1+-311168+1536*i20)));
_mm512_storeu_ps(sumPtr1+-311104+1536*i20, _mm512_add_ps(sum449, _mm512_loadu_ps(sumPtr1+-311104+1536*i20)));
_mm512_storeu_ps(sumPtr1+1024+1536*i20, _mm512_add_ps(sum450, _mm512_loadu_ps(sumPtr1+1024+1536*i20)));
_mm512_storeu_ps(sumPtr1+1088+1536*i20, _mm512_add_ps(sum451, _mm512_loadu_ps(sumPtr1+1088+1536*i20)));
_mm512_storeu_ps(sumPtr1+-310912+1536*i20, _mm512_add_ps(sum452, _mm512_loadu_ps(sumPtr1+-310912+1536*i20)));
_mm512_storeu_ps(sumPtr1+-310848+1536*i20, _mm512_add_ps(sum453, _mm512_loadu_ps(sumPtr1+-310848+1536*i20)));
_mm512_storeu_ps(sumPtr1+1280+1536*i20, _mm512_add_ps(sum454, _mm512_loadu_ps(sumPtr1+1280+1536*i20)));
_mm512_storeu_ps(sumPtr1+1344+1536*i20, _mm512_add_ps(sum455, _mm512_loadu_ps(sumPtr1+1344+1536*i20)));
if (i20 >= ii14) return;
}
__m512 sum456 = _mm512_setzero_ps();
__m512 sum460 = _mm512_setzero_ps();
__m512 sum464 = _mm512_setzero_ps();
__m512 sum468 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum457 = sum456;
__m512 sum458 = sum456;
__m512 sum459 = sum456;
__m512 sum461 = sum460;
__m512 sum462 = sum460;
__m512 sum463 = sum460;
__m512 sum465 = sum464;
__m512 sum466 = sum464;
__m512 sum467 = sum464;
__m512 sum469 = sum468;
__m512 sum470 = sum468;
__m512 sum471 = sum468;
for (ptrdiff_t j30 = 0; j30 < 791; ++j30) {
__m512 dat119 = _mm512_loadu_ps(datPtr2+0+256*j30);
__m512 dat120 = _mm512_loadu_ps(datPtr2+64+256*j30);
__m512 dat121 = _mm512_loadu_ps(datPtr2+128+256*j30);
__m512 dat122 = _mm512_loadu_ps(datPtr2+192+256*j30);
__m512 wt283 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i20+16*j30));
sum456 = _mm512_fmadd_ps(wt283, dat119, sum456);
sum457 = _mm512_fmadd_ps(wt283, dat120, sum457);
sum458 = _mm512_fmadd_ps(wt283, dat121, sum458);
sum459 = _mm512_fmadd_ps(wt283, dat122, sum459);
__m512 wt284 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i20+16*j30));
sum460 = _mm512_fmadd_ps(wt284, dat119, sum460);
sum461 = _mm512_fmadd_ps(wt284, dat120, sum461);
sum462 = _mm512_fmadd_ps(wt284, dat121, sum462);
sum463 = _mm512_fmadd_ps(wt284, dat122, sum463);
__m512 wt285 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i20+16*j30));
sum464 = _mm512_fmadd_ps(wt285, dat119, sum464);
sum465 = _mm512_fmadd_ps(wt285, dat120, sum465);
sum466 = _mm512_fmadd_ps(wt285, dat121, sum466);
sum467 = _mm512_fmadd_ps(wt285, dat122, sum467);
__m512 wt286 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i20+16*j30));
sum468 = _mm512_fmadd_ps(wt286, dat119, sum468);
sum469 = _mm512_fmadd_ps(wt286, dat120, sum469);
sum470 = _mm512_fmadd_ps(wt286, dat121, sum470);
sum471 = _mm512_fmadd_ps(wt286, dat122, sum471);
}
_mm512_storeu_ps(sumPtr1+-312192+1536*i20, _mm512_add_ps(sum456, _mm512_loadu_ps(sumPtr1+-312192+1536*i20)));
_mm512_storeu_ps(sumPtr1+-312128+1536*i20, _mm512_add_ps(sum457, _mm512_loadu_ps(sumPtr1+-312128+1536*i20)));
_mm512_storeu_ps(sumPtr1+0+1536*i20, _mm512_add_ps(sum458, _mm512_loadu_ps(sumPtr1+0+1536*i20)));
_mm512_storeu_ps(sumPtr1+64+1536*i20, _mm512_add_ps(sum459, _mm512_loadu_ps(sumPtr1+64+1536*i20)));
_mm512_storeu_ps(sumPtr1+-311936+1536*i20, _mm512_add_ps(sum460, _mm512_loadu_ps(sumPtr1+-311936+1536*i20)));
_mm512_storeu_ps(sumPtr1+-311872+1536*i20, _mm512_add_ps(sum461, _mm512_loadu_ps(sumPtr1+-311872+1536*i20)));
_mm512_storeu_ps(sumPtr1+256+1536*i20, _mm512_add_ps(sum462, _mm512_loadu_ps(sumPtr1+256+1536*i20)));
_mm512_storeu_ps(sumPtr1+320+1536*i20, _mm512_add_ps(sum463, _mm512_loadu_ps(sumPtr1+320+1536*i20)));
_mm512_storeu_ps(sumPtr1+-311680+1536*i20, _mm512_add_ps(sum464, _mm512_loadu_ps(sumPtr1+-311680+1536*i20)));
_mm512_storeu_ps(sumPtr1+-311616+1536*i20, _mm512_add_ps(sum465, _mm512_loadu_ps(sumPtr1+-311616+1536*i20)));
_mm512_storeu_ps(sumPtr1+512+1536*i20, _mm512_add_ps(sum466, _mm512_loadu_ps(sumPtr1+512+1536*i20)));
_mm512_storeu_ps(sumPtr1+576+1536*i20, _mm512_add_ps(sum467, _mm512_loadu_ps(sumPtr1+576+1536*i20)));
_mm512_storeu_ps(sumPtr1+-311424+1536*i20, _mm512_add_ps(sum468, _mm512_loadu_ps(sumPtr1+-311424+1536*i20)));
_mm512_storeu_ps(sumPtr1+-311360+1536*i20, _mm512_add_ps(sum469, _mm512_loadu_ps(sumPtr1+-311360+1536*i20)));
_mm512_storeu_ps(sumPtr1+768+1536*i20, _mm512_add_ps(sum470, _mm512_loadu_ps(sumPtr1+768+1536*i20)));
_mm512_storeu_ps(sumPtr1+832+1536*i20, _mm512_add_ps(sum471, _mm512_loadu_ps(sumPtr1+832+1536*i20)));
return;
}
(void)base1;
ptrdiff_t i21 = 1*w3;
ptrdiff_t ii15 = i21+0;
for (; i21 != 101; ++i21) {
__m512 sum472 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i21));
__m512 sum476 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i21));
__m512 sum480 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i21));
__m512 sum484 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i21));
__m512 sum488 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i21));
__m512 sum492 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i21));
__m512 sum473 = sum472;
__m512 sum474 = sum472;
__m512 sum475 = sum472;
__m512 sum477 = sum476;
__m512 sum478 = sum476;
__m512 sum479 = sum476;
__m512 sum481 = sum480;
__m512 sum482 = sum480;
__m512 sum483 = sum480;
__m512 sum485 = sum484;
__m512 sum486 = sum484;
__m512 sum487 = sum484;
__m512 sum489 = sum488;
__m512 sum490 = sum488;
__m512 sum491 = sum488;
__m512 sum493 = sum492;
__m512 sum494 = sum492;
__m512 sum495 = sum492;
for (ptrdiff_t j31 = 0; j31 < 791; ++j31) {
__m512 dat123 = _mm512_loadu_ps(datPtr2+0+256*j31);
__m512 dat124 = _mm512_loadu_ps(datPtr2+64+256*j31);
__m512 dat125 = _mm512_loadu_ps(datPtr2+128+256*j31);
__m512 dat126 = _mm512_loadu_ps(datPtr2+192+256*j31);
__m512 wt287 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i21+24*j31));
sum472 = _mm512_fmadd_ps(wt287, dat123, sum472);
sum473 = _mm512_fmadd_ps(wt287, dat124, sum473);
sum474 = _mm512_fmadd_ps(wt287, dat125, sum474);
sum475 = _mm512_fmadd_ps(wt287, dat126, sum475);
__m512 wt288 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i21+24*j31));
sum476 = _mm512_fmadd_ps(wt288, dat123, sum476);
sum477 = _mm512_fmadd_ps(wt288, dat124, sum477);
sum478 = _mm512_fmadd_ps(wt288, dat125, sum478);
sum479 = _mm512_fmadd_ps(wt288, dat126, sum479);
__m512 wt289 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i21+24*j31));
sum480 = _mm512_fmadd_ps(wt289, dat123, sum480);
sum481 = _mm512_fmadd_ps(wt289, dat124, sum481);
sum482 = _mm512_fmadd_ps(wt289, dat125, sum482);
sum483 = _mm512_fmadd_ps(wt289, dat126, sum483);
__m512 wt290 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i21+24*j31));
sum484 = _mm512_fmadd_ps(wt290, dat123, sum484);
sum485 = _mm512_fmadd_ps(wt290, dat124, sum485);
sum486 = _mm512_fmadd_ps(wt290, dat125, sum486);
sum487 = _mm512_fmadd_ps(wt290, dat126, sum487);
__m512 wt291 = _mm512_set1_ps(*(float*)(wtPtr2+16+18984*i21+24*j31));
sum488 = _mm512_fmadd_ps(wt291, dat123, sum488);
sum489 = _mm512_fmadd_ps(wt291, dat124, sum489);
sum490 = _mm512_fmadd_ps(wt291, dat125, sum490);
sum491 = _mm512_fmadd_ps(wt291, dat126, sum491);
__m512 wt292 = _mm512_set1_ps(*(float*)(wtPtr2+20+18984*i21+24*j31));
sum492 = _mm512_fmadd_ps(wt292, dat123, sum492);
sum493 = _mm512_fmadd_ps(wt292, dat124, sum493);
sum494 = _mm512_fmadd_ps(wt292, dat125, sum494);
sum495 = _mm512_fmadd_ps(wt292, dat126, sum495);
}
_mm512_storeu_ps(sumPtr1+-312192+1536*i21, sum472);
_mm512_storeu_ps(sumPtr1+-312128+1536*i21, sum473);
_mm512_storeu_ps(sumPtr1+0+1536*i21, sum474);
_mm512_storeu_ps(sumPtr1+64+1536*i21, sum475);
_mm512_storeu_ps(sumPtr1+-311936+1536*i21, sum476);
_mm512_storeu_ps(sumPtr1+-311872+1536*i21, sum477);
_mm512_storeu_ps(sumPtr1+256+1536*i21, sum478);
_mm512_storeu_ps(sumPtr1+320+1536*i21, sum479);
_mm512_storeu_ps(sumPtr1+-311680+1536*i21, sum480);
_mm512_storeu_ps(sumPtr1+-311616+1536*i21, sum481);
_mm512_storeu_ps(sumPtr1+512+1536*i21, sum482);
_mm512_storeu_ps(sumPtr1+576+1536*i21, sum483);
_mm512_storeu_ps(sumPtr1+-311424+1536*i21, sum484);
_mm512_storeu_ps(sumPtr1+-311360+1536*i21, sum485);
_mm512_storeu_ps(sumPtr1+768+1536*i21, sum486);
_mm512_storeu_ps(sumPtr1+832+1536*i21, sum487);
_mm512_storeu_ps(sumPtr1+-311168+1536*i21, sum488);
_mm512_storeu_ps(sumPtr1+-311104+1536*i21, sum489);
_mm512_storeu_ps(sumPtr1+1024+1536*i21, sum490);
_mm512_storeu_ps(sumPtr1+1088+1536*i21, sum491);
_mm512_storeu_ps(sumPtr1+-310912+1536*i21, sum492);
_mm512_storeu_ps(sumPtr1+-310848+1536*i21, sum493);
_mm512_storeu_ps(sumPtr1+1280+1536*i21, sum494);
_mm512_storeu_ps(sumPtr1+1344+1536*i21, sum495);
if (i21 >= ii15) return;
}
__m512 sum496 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i21));
__m512 sum500 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i21));
__m512 sum504 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i21));
__m512 sum508 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i21));
__m512 sum497 = sum496;
__m512 sum498 = sum496;
__m512 sum499 = sum496;
__m512 sum501 = sum500;
__m512 sum502 = sum500;
__m512 sum503 = sum500;
__m512 sum505 = sum504;
__m512 sum506 = sum504;
__m512 sum507 = sum504;
__m512 sum509 = sum508;
__m512 sum510 = sum508;
__m512 sum511 = sum508;
for (ptrdiff_t j32 = 0; j32 < 791; ++j32) {
__m512 dat127 = _mm512_loadu_ps(datPtr2+0+256*j32);
__m512 dat128 = _mm512_loadu_ps(datPtr2+64+256*j32);
__m512 dat129 = _mm512_loadu_ps(datPtr2+128+256*j32);
__m512 dat130 = _mm512_loadu_ps(datPtr2+192+256*j32);
__m512 wt293 = _mm512_set1_ps(*(float*)(wtPtr2+0+18984*i21+16*j32));
sum496 = _mm512_fmadd_ps(wt293, dat127, sum496);
sum497 = _mm512_fmadd_ps(wt293, dat128, sum497);
sum498 = _mm512_fmadd_ps(wt293, dat129, sum498);
sum499 = _mm512_fmadd_ps(wt293, dat130, sum499);
__m512 wt294 = _mm512_set1_ps(*(float*)(wtPtr2+4+18984*i21+16*j32));
sum500 = _mm512_fmadd_ps(wt294, dat127, sum500);
sum501 = _mm512_fmadd_ps(wt294, dat128, sum501);
sum502 = _mm512_fmadd_ps(wt294, dat129, sum502);
sum503 = _mm512_fmadd_ps(wt294, dat130, sum503);
__m512 wt295 = _mm512_set1_ps(*(float*)(wtPtr2+8+18984*i21+16*j32));
sum504 = _mm512_fmadd_ps(wt295, dat127, sum504);
sum505 = _mm512_fmadd_ps(wt295, dat128, sum505);
sum506 = _mm512_fmadd_ps(wt295, dat129, sum506);
sum507 = _mm512_fmadd_ps(wt295, dat130, sum507);
__m512 wt296 = _mm512_set1_ps(*(float*)(wtPtr2+12+18984*i21+16*j32));
sum508 = _mm512_fmadd_ps(wt296, dat127, sum508);
sum509 = _mm512_fmadd_ps(wt296, dat128, sum509);
sum510 = _mm512_fmadd_ps(wt296, dat129, sum510);
sum511 = _mm512_fmadd_ps(wt296, dat130, sum511);
}
_mm512_storeu_ps(sumPtr1+-312192+1536*i21, sum496);
_mm512_storeu_ps(sumPtr1+-312128+1536*i21, sum497);
_mm512_storeu_ps(sumPtr1+0+1536*i21, sum498);
_mm512_storeu_ps(sumPtr1+64+1536*i21, sum499);
_mm512_storeu_ps(sumPtr1+-311936+1536*i21, sum500);
_mm512_storeu_ps(sumPtr1+-311872+1536*i21, sum501);
_mm512_storeu_ps(sumPtr1+256+1536*i21, sum502);
_mm512_storeu_ps(sumPtr1+320+1536*i21, sum503);
_mm512_storeu_ps(sumPtr1+-311680+1536*i21, sum504);
_mm512_storeu_ps(sumPtr1+-311616+1536*i21, sum505);
_mm512_storeu_ps(sumPtr1+512+1536*i21, sum506);
_mm512_storeu_ps(sumPtr1+576+1536*i21, sum507);
_mm512_storeu_ps(sumPtr1+-311424+1536*i21, sum508);
_mm512_storeu_ps(sumPtr1+-311360+1536*i21, sum509);
_mm512_storeu_ps(sumPtr1+768+1536*i21, sum510);
_mm512_storeu_ps(sumPtr1+832+1536*i21, sum511);
break;
}
}
}

// Dispatches the sum-producing callee over every (epoch, field, node)
// combination. The four-slot argument tuple handed to each task holds, in
// order: the tensor pointer array, the epoch index, the field index and the
// node index (the last three are stored as integers cast to void*).
// Node ranges come from Example2LoomProduceSums1FieldTbl1: for field f the
// start is entry 2*f, the stride is entry 2*f+1 and the exclusive end is
// entry 2*f+2.
// NOTE(review): each task reuses the same tuple storage, so this presumably
// relies on Example2ThreaderDo1 not returning until the task is finished.
static void Example2LoomProduceSums1(Example2ThreaderTeam1* team16, char** tensors5) {
	void* args[4];
	args[0] = tensors5;
	ptrdiff_t epoch = 0;
	while (epoch < 1) {
		args[1] = (void*)epoch;
		for (ptrdiff_t field = 0; field != 4; ++field) {
			args[2] = (void*)field;
			const ptrdiff_t stride = Example2LoomProduceSums1FieldTbl1[2*field+1];
			const ptrdiff_t stop = Example2LoomProduceSums1FieldTbl1[2*field+2];
			ptrdiff_t node = Example2LoomProduceSums1FieldTbl1[2*field+0];
			while (node < stop) {
				args[3] = (void*)node;
				// Four-dimensional task; only the second extent varies
				// (it equals the node stride of the current field).
				Example2ThreaderTask1 job;
				job.nd1 = 4;
				job.hull1[0] = 102;
				job.hull1[1] = stride;
				job.hull1[2] = 2;
				job.hull1[3] = 3;
				job.any1 = args;
				job.callee1 = Example2LoomProduceSums1Callee1;
				Example2ThreaderDo1(team16, &job);
				node += stride;
			}
		}
		++epoch;
	}
}

// Task body that folds pairs of partial-sum planes into the output tensor.
//
// task10->any1 is an array of two byte pointers: [0] the partial sums,
// [1] the destination. pt10[2] selects a block of channels and pt10[3]
// selects the group.
//
// Channel blocks start at 152*pt10[2]; blocks 0..2 cover 152 channels and
// block 3 covers 154, so the four blocks together span channels 0..609.
//
// For every channel, six rows are written at a 60-byte pitch (360 bytes per
// channel). Each row is the lane-wise sum of a 16-float vector from one
// plane and the matching vector from the plane 156160 bytes further on,
// the latter rotated down by one lane (lane i takes lane i+1). Only the low
// 15 lanes are stored (mask 32767). Rows 0..3 come from the first plane
// pair, rows 4..5 from the pair 312320 bytes later.
// NOTE(review): the one-lane rotation presumably realigns two column phases
// of the strided convolution - confirm against the producer.
static void Example2LoomConsumeSums1Callee1(Example2ThreaderTask1* task10, int64_t* pt10) {
	char** bufs = task10->any1;
	char*restrict sums = bufs[0];
	char*restrict out = bufs[1];
	const ptrdiff_t cell = 0;
	const ptrdiff_t strip = 0;
	const ptrdiff_t chanBlock = pt10[2];
	const ptrdiff_t group = pt10[3];
	const ptrdiff_t chanFirst = 152*chanBlock;
	const ptrdiff_t chanLast = chanFirst+(chanBlock < 3 ? 151 : 153);
	for (ptrdiff_t chan = chanFirst; chan <= chanLast; ++chan) {
		ptrdiff_t rowSet = 2*strip;
		// Full row sets: four rows each.
		while (rowSet != 1) {
			char* src = sums+624640*group+312320*rowSet+312320*cell+256*chan;
			char* dst = out+219600*group+360*chan+240*rowSet+64*cell;
			for (ptrdiff_t row = 0; row < 4; ++row) {
				__m512 near = _mm512_loadu_ps(src+64*row);
				__m512i far = _mm512_castps_si512(_mm512_loadu_ps(src+156160+64*row));
				__m512 shifted = _mm512_castsi512_ps(_mm512_alignr_epi32(far, far, 1));
				_mm512_mask_storeu_ps(dst+60*row, 32767, _mm512_add_ps(near, shifted));
			}
			++rowSet;
		}
		// Trailing row set (rowSet == 1 here): only two rows remain.
		char* src = sums+624640*group+312320*rowSet+312320*cell+256*chan;
		char* dst = out+219600*group+360*chan+240*rowSet+64*cell;
		for (ptrdiff_t row = 0; row < 2; ++row) {
			__m512 near = _mm512_loadu_ps(src+64*row);
			__m512i far = _mm512_castps_si512(_mm512_loadu_ps(src+156160+64*row));
			__m512 shifted = _mm512_castsi512_ps(_mm512_alignr_epi32(far, far, 1));
			_mm512_mask_storeu_ps(dst+60*row, 32767, _mm512_add_ps(near, shifted));
		}
	}
}

// Runs Example2LoomConsumeSums1Callee1 once over a 1 x 1 x 4 x 3 task hull
// on the given team, passing tensors7 through as the task's user pointer.
// The callee reads only coordinates 2 and 3 of each point.
static void Example2LoomConsumeSums1(Example2ThreaderTeam1* team17, char** tensors7) {
	Example2ThreaderTask1 job;
	job.nd1 = 4;
	job.hull1[0] = 1;
	job.hull1[1] = 1;
	job.hull1[2] = 4;
	job.hull1[3] = 3;
	job.any1 = tensors7;
	job.callee1 = Example2LoomConsumeSums1Callee1;
	Example2ThreaderDo1(team17, &job);
}

// Network object filled in by Example2NetCreate and released by
// Example2NetDestroy.
struct Example2Net {
// Pointer exactly as returned by malloc; this is the value handed to free.
char* alloc1;
// alloc1 rounded up to the next 64-byte boundary; Example2NetCreate has
// Example2LoomArrangeFilts1 write its output starting here.
char* align1;
};

// Releases a network created by Example2NetCreate: frees the buffer held in
// alloc1, then the Example2Net object itself.
// A null net2 is accepted and ignored (same convention as free), so cleanup
// code can call this unconditionally even when creation failed and the
// caller's pointer was left null.
void Example2NetDestroy(Example2Net* net2) {
	if (!net2) {
		return;
	}
	free(net2->alloc1);
	free(net2);
}

// Builds a network object from caller-supplied parameters.
//
// net1:     receives the new Example2Net on success; left untouched on error.
// params1:  source of the outWeights and outBiases arrays.
// threads1: forwarded to Example2ThreaderCreate1 for the temporary team
//           that runs the filter arrangement.
//
// Returns 0 on success. On failure returns the message produced by
// Example2Errmsg1 (no AVX512F support, or an allocation failure reported
// with errno) or by Example2ThreaderCreate1; nothing stays allocated.
char* Example2NetCreate(
	Example2Net** net1,
	Example2Params* params1,
	ptrdiff_t threads1
) {
	if (__builtin_expect(!__builtin_cpu_supports("avx512f"), 0)) {
		return Example2Errmsg1(__LINE__, "CPU does not support AVX512F");
	}
	char* raw = malloc(104229543);
	if (__builtin_expect(raw == 0, 0)) {
		return Example2Errmsg1(__LINE__, "errno %d", errno);
	}
	// Round up to a 64-byte boundary inside the allocation.
	char* base = (char*)(((size_t)raw+63)&~(size_t)63);
	char* failure = 0;
	Example2Net* made = 0;
	Example2ThreaderTeam1* pool = 0;
	failure = Example2ThreaderCreate1(&pool, threads1);
	if (__builtin_expect(failure != 0, 0)) {
		goto discard;
	}
	{
		char* filtArgs[] = {
			(char*)params1->outWeights,
			(char*)params1->outBiases,
			base+0
		};
		Example2LoomArrangeFilts1(pool, filtArgs);
	}
	// The team is only needed for the arrangement step.
	Example2ThreaderDestroy1(pool);
	made = malloc(sizeof *made);
	if (__builtin_expect(made == 0, 0)) {
		// Format the message first so errno is read before free runs.
		failure = Example2Errmsg1(__LINE__, "errno %d", errno);
		goto discard;
	}
	made->align1 = base;
	made->alloc1 = raw;
	*net1 = made;
	return 0;
discard:
	free(raw);
	return failure;
}

// Inference engine state filled in by Example2EngineCreate and released by
// Example2EngineDestroy.
struct Example2Engine {
// Network given to Example2EngineCreate; stored as-is. Example2EngineDestroy
// does not free it.
Example2Net* net3;
// Thread team made by Example2ThreaderCreate1 and torn down in
// Example2EngineDestroy.
Example2ThreaderTeam1* team11;
// Pointer exactly as returned by malloc for the engine's working buffer;
// this is the value handed to free.
char* alloc2;
// alloc2 rounded up to the next 64-byte boundary; Example2EngineInference
// addresses its intermediate buffers relative to this pointer.
char* align2;
};

// Forwards to Example2ThreaderPthreadT1 using the engine's thread team.
// Returns whatever that call returns.
// NOTE(review): presumably stores the pthread_t of team thread idx2 into
// *to1 and returns 0 on success - confirm against the threader.
char* Example2EnginePthreadT(
	Example2Engine* eng2,
	ptrdiff_t idx2,
	pthread_t* to1
) {
	Example2ThreaderTeam1* pool = eng2->team11;
	return Example2ThreaderPthreadT1(to1, pool, idx2);
}

// Releases an engine created by Example2EngineCreate: tears down its thread
// team, frees the working buffer held in alloc2, then frees the engine
// object. The network the engine refers to is not touched.
// A null eng3 is accepted and ignored (same convention as free), so cleanup
// code can call this unconditionally even when creation failed and the
// caller's pointer was left null.
void Example2EngineDestroy(Example2Engine* eng3) {
	if (!eng3) {
		return;
	}
	Example2ThreaderDestroy1(eng3->team11);
	free(eng3->alloc2);
	free(eng3);
}

// Builds an inference engine bound to an existing network.
//
// eng4:     receives the new Example2Engine on success; left untouched on
//           error.
// net4:     network to run; the pointer is stored, not copied.
// threads2: forwarded to Example2ThreaderCreate1 for the engine's team.
//
// Returns 0 on success. On failure returns the message produced by
// Example2Errmsg1 (allocation failure reported with errno) or by
// Example2ThreaderCreate1; nothing stays allocated.
char* Example2EngineCreate(
	Example2Engine** eng4,
	Example2Net* net4,
	ptrdiff_t threads2
) {
	Example2Engine* made = malloc(sizeof *made);
	if (__builtin_expect(made == 0, 0)) {
		return Example2Errmsg1(__LINE__, "errno %d", errno);
	}
	char* failure = 0;
	char* raw = malloc(6733887);
	if (__builtin_expect(raw == 0, 0)) {
		// Format the message first so errno is read before free runs.
		failure = Example2Errmsg1(__LINE__, "errno %d", errno);
		goto dropEngine;
	}
	made->alloc2 = raw;
	// Round up to a 64-byte boundary inside the allocation.
	made->align2 = (char*)(((size_t)raw+63)&~(size_t)63);
	failure = Example2ThreaderCreate1(&made->team11, threads2);
	if (__builtin_expect(failure != 0, 0)) {
		goto dropScratch;
	}
	made->net3 = net4;
	*eng4 = made;
	return 0;
dropScratch:
	free(raw);
dropEngine:
	free(made);
	return failure;
}

// Runs the three pipeline stages on the engine's thread team, in order:
//   1. Example2LoomArrangeDats1:  inData -> start of the engine buffer
//   2. Example2LoomProduceSums1:  network buffer + arranged data -> sums
//      region, which begins 4859904 bytes into the engine buffer
//   3. Example2LoomConsumeSums1:  sums region -> outData
// All offsets are relative to the 64-byte-aligned pointers stored in the
// engine and its network.
void Example2EngineInference(
	Example2Engine* eng1,
	float* inData,
	float* outData
) {
	Example2ThreaderTeam1* pool = eng1->team11;
	char* filts = eng1->net3->align1+0;
	char* dats = eng1->align2+0;
	char* sums = eng1->align2+4859904;

	char* arrangeArgs[2];
	arrangeArgs[0] = (char*)inData;
	arrangeArgs[1] = dats;
	Example2LoomArrangeDats1(pool, arrangeArgs);

	char* produceArgs[3];
	produceArgs[0] = filts;
	produceArgs[1] = dats;
	produceArgs[2] = sums;
	Example2LoomProduceSums1(pool, produceArgs);

	char* consumeArgs[2];
	consumeArgs[0] = sums;
	consumeArgs[1] = (char*)outData;
	Example2LoomConsumeSums1(pool, consumeArgs);
}

// End of file.

Top