NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=Example30 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=in1 Channels=16 Height=7 Width=15
Input ToTensor=in2 Channels=16 Height=7 Width=15
Input ToTensor=in3 Channels=61 Height=4 Width=12
BatchNorm FromTensor=in1 ToTensor=bn1 Epsilon=0.00001
Activation FromTensor=bn1 ToTensor=act1 Kind=ReLU Param=0
Add FromTensor1=act1 FromTensor2=in2 ToTensor=add1
BatchNorm FromTensor=add1 ToTensor=bn2 Epsilon=0.00001
Conv FromTensor=bn2 ToTensor=conv ToChannels=61 FilterH=4 FilterW=4 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=conv ToTensor=bn3 Epsilon=0.00001
Activation FromTensor=bn3 ToTensor=act2 Kind=ReLU Param=0
Add FromTensor1=act2 FromTensor2=in3 ToTensor=add2
BatchNorm FromTensor=add2 ToTensor=bn4 Epsilon=0.00001
Output FromTensor=bn4

Top || Output Example30.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(Example30Params);
// Example30Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "rb");
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

typedef struct Example30Params Example30Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// Example30Params* params = malloc(sizeof(Example30Params));
//
// ... Load params (read from a file, perhaps) ...
//
// Example30Net* net; // For example, 4 threads:
// char* err = Example30NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// Example30NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

typedef struct Example30Net Example30Net;

char* Example30NetCreate(
Example30Net**,
Example30Params*,
ptrdiff_t threads
);

void Example30NetDestroy(Example30Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// Example30Net* net;
//
// ... Create net ...
//
// Example30Engine* engine; // For example, 4 inference threads:
// char* err = Example30EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// Example30EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = Example30EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* bn4Data = malloc(sizeof(float)*61*4*12);
// float* in1Data = malloc(sizeof(float)*16*7*15);
// float* in2Data = malloc(sizeof(float)*16*7*15);
// float* in3Data = malloc(sizeof(float)*61*4*12);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// Example30EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// bn4Data, // The tensor arguments are sorted by name.
// in1Data,
// in2Data,
// in3Data
// );
//
// ... Read the output floats ...
//
// }
//
// free(bn4Data);
// free(in1Data);
// free(in2Data);
// free(in3Data);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

typedef struct Example30Engine Example30Engine;

char* Example30EngineCreate(
Example30Engine**,
Example30Net*,
ptrdiff_t threads
);

char* Example30EnginePthreadT(
Example30Engine*,
ptrdiff_t threadIdx,
pthread_t* to
);

void Example30EngineInference(
Example30Engine*,
float* bn4Data,
float* in1Data,
float* in2Data,
float* in3Data
);

void Example30EngineDestroy(Example30Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

// Trained parameters for the whole network, laid out as consecutive
// float arrays. The struct is declared packed and contains only float
// arrays: 16293 floats in total (65172 bytes when float is 4 bytes).
struct Example30Params {
// Batch-norm parameters for bn1 and bn2 (16 channels each).
float bn1Means[16]; // 1x16x1x1
float bn1Scales[16]; // 1x16x1x1
float bn1Shifts[16]; // 1x16x1x1
float bn1Variances[16]; // 1x16x1x1
float bn2Means[16]; // 1x16x1x1
float bn2Scales[16]; // 1x16x1x1
float bn2Shifts[16]; // 1x16x1x1
float bn2Variances[16]; // 1x16x1x1
// Batch-norm parameters for bn3 and bn4 (61 channels each).
float bn3Means[61]; // 1x61x1x1
float bn3Scales[61]; // 1x61x1x1
float bn3Shifts[61]; // 1x61x1x1
float bn3Variances[61]; // 1x61x1x1
float bn4Means[61]; // 1x61x1x1
float bn4Scales[61]; // 1x61x1x1
float bn4Shifts[61]; // 1x61x1x1
float bn4Variances[61]; // 1x61x1x1
// Convolution parameters: one bias per output channel, then the filters.
float convBiases[61]; // 1x61x1x1
float convWeights[15616]; // 61x16x4x4
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output Example30.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f Example30.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "Example30.h"

// Builds a heap-allocated, NUL-terminated error message of the form
// "Example30: line <lineNum1>: <formatted text>".
//
// lineNum1: source line to report (callers pass __LINE__).
// format1:  printf-style format for the remainder of the message.
// ...:      arguments consumed by format1.
//
// Returns a non-null string that the caller must free(). The message is
// truncated to fit the fixed-size buffer. Because every caller treats a
// null return as "no error", this function never returns null: if the
// buffer cannot be allocated the process is aborted instead of silently
// reporting success (the previous code dereferenced the null pointer).
static char* Example30Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
    enum { Example30ErrmsgCap1 = 277 };
    char* msg1 = malloc(Example30ErrmsgCap1);
    if (!msg1) {
        abort();
    }
    // Bounded write of the prefix; snprintf always NUL-terminates.
    int step1 = snprintf(msg1, Example30ErrmsgCap1, "Example30: line %td: ", lineNum1);
    if (step1 < 0) {
        // Formatting failed: fall back to an empty prefix.
        msg1[0] = '\0';
        step1 = 0;
    }
    if (step1 >= Example30ErrmsgCap1) {
        // Prefix alone filled the buffer; it is already terminated.
        return msg1;
    }
    va_list ap1;
    va_start(ap1, format1);
    vsnprintf(msg1+step1, (size_t)(Example30ErrmsgCap1-step1), format1, ap1);
    va_end(ap1);
    return msg1;
}

// Forward declarations for the internal thread-team ("Threader") types.
// Task: a unit of parallel work described by a callee and an index space.
typedef struct Example30ThreaderTask1 Example30ThreaderTask1;
// Callee: invoked once per point; receives the task and the point's coordinates.
typedef void (*Example30ThreaderCallee1)(Example30ThreaderTask1*, int64_t*);
// Hub: state shared by all threads of a team.
typedef struct Example30ThreaderHub1 Example30ThreaderHub1;
// Node: per-thread state.
typedef struct Example30ThreaderNode1 Example30ThreaderNode1;
// Unwind: bookkeeping of what has been initialized, used for teardown.
typedef struct Example30ThreaderUnwind1 Example30ThreaderUnwind1;
// Team: a set of worker threads plus their hub.
typedef struct Example30ThreaderTeam1 Example30ThreaderTeam1;

// Describes one parallel job: callee1 is called once for every point of
// the index space whose per-dimension extents are hull1[0..nd1-1].
struct Example30ThreaderTask1 {
// Function run for each point.
Example30ThreaderCallee1 callee1;
// Opaque pointer made available to the callee through the task.
void* any1;
// Number of dimensions of hull1 that are in use (at most 4).
ptrdiff_t nd1;
// Extent of each dimension; dimension 0 varies fastest.
int64_t hull1[4];
};

// State shared by every thread of a team while a task is running.
struct Example30ThreaderHub1 {
// Guards all fields below.
pthread_mutex_t mut1;
// Signaled by the last worker to finish; Example30ThreaderDo1 waits on it.
pthread_cond_t cond1;
// Number of workers that have not yet reported completion of the current task.
ptrdiff_t pending1;
// Scan cursor into status1: word index (offset1) and bit mask within that word (mask1).
ptrdiff_t offset1;
long mask1;
// Bitmap with one bit per node, sized at allocation time (flexible array member).
// Example30ThreaderDo1 sets every bit; workers clear a node's bit once that node is drained.
long status1[];
};

// Per-thread state. Each node is aligned to 64 bytes.
struct Example30ThreaderNode1 {
// Guards np1, pt1 and task1.
pthread_mutex_t mut2;
// Number of points still assigned to this node; a negative value asks the thread to exit.
int64_t np1;
// Coordinates of the next point to run from this node's share.
int64_t pt1[4];
// Task posted to this node, or 0 when nothing is pending.
Example30ThreaderTask1* task1;
// Signaled after task1 has been set.
pthread_cond_t cond2;
// Back-pointer to the owning team.
Example30ThreaderTeam1* team1;
// Handle of the thread that services this node.
pthread_t thr1;
} __attribute__((aligned(64)));

// Records how far team construction got, so that Example30ThreaderDestroy1
// releases exactly the resources that were successfully created.
struct Example30ThreaderUnwind1 {
// Number of nodes whose thread was created (and must be joined).
ptrdiff_t join1;
// Number of nodes whose condition variable was initialized.
ptrdiff_t nodeConds1;
// Number of nodes whose mutex was initialized.
ptrdiff_t nodeMuts1;
// Nonzero once the hub's condition variable has been initialized.
ptrdiff_t hubCond1;
// Nonzero once the hub's mutex has been initialized.
ptrdiff_t hubMut1;
// Raw (unaligned) malloc results to pass to free(); 0 if not allocated.
void* nodes1;
void* hub1;
};

// A team of worker threads.
struct Example30ThreaderTeam1 {
// Number of threads (and nodes) in the team.
ptrdiff_t nt1;
// 64-byte-aligned pointer into the hub allocation.
Example30ThreaderHub1* hub2;
// 64-byte-aligned pointer to the array of nt1 nodes.
Example30ThreaderNode1* nodes2;
// Teardown bookkeeping (see Example30ThreaderUnwind1).
Example30ThreaderUnwind1 unwind1;
};

// Advances the mixed-radix counter pt2 by one, in place.
//
// nd2:   number of digits (dimensions) in use.
// hull2: radix of each digit.
// pt2:   the counter; digit 0 is the least significant.
//
// A digit that reaches its radix wraps to zero and carries into the next
// digit. A carry out of the last digit is discarded, so the counter wraps
// around to all zeros.
static void Example30ThreaderInc1(
ptrdiff_t nd2,
int64_t*restrict hull2,
int64_t*restrict pt2
) {
    ptrdiff_t dim = 0;
    while (dim < nd2) {
        int64_t bumped = pt2[dim]+1;
        if (bumped != hull2[dim]) {
            // No wrap: store the digit and stop propagating.
            pt2[dim] = bumped;
            return;
        }
        pt2[dim] = 0;
        ++dim;
    }
}

// Writes val1 into pt3 as a mixed-radix number.
//
// nd3:   number of digits (dimensions) in use.
// hull3: radix of each digit.
// pt3:   output digits; digit 0 is the least significant.
// val1:  the value to decompose.
//
// Any part of val1 that does not fit in nd3 digits is dropped. Digits
// beyond the point where the remaining value reaches zero are set to 0.
static void Example30ThreaderPut1(
ptrdiff_t nd3,
int64_t*restrict hull3,
int64_t*restrict pt3,
int64_t val1
) {
    ptrdiff_t dim = 0;
    while (dim < nd3 && val1 != 0) {
        int64_t radix = hull3[dim];
        int64_t quotient = val1/radix;
        pt3[dim] = val1-quotient*radix;
        val1 = quotient;
        ++dim;
    }
    // Zero-fill the remaining high digits.
    while (dim < nd3) {
        pt3[dim] = 0;
        ++dim;
    }
}

// Adds the mixed-radix number plus1 (and an initial carry) to pt4, in place.
//
// nd4:    number of digits (dimensions) in use.
// hull4:  radix of each digit.
// pt4:    accumulator; digit 0 is the least significant.
// plus1:  addend, in the same radix system.
// carry2: carry into digit 0.
//
// Each digit sum is reduced by at most one radix, so the inputs are
// expected to be valid digits. The carry out of the last digit is dropped.
static void Example30ThreaderAdd1(
ptrdiff_t nd4,
int64_t*restrict hull4,
int64_t*restrict pt4,
int64_t*restrict plus1,
int64_t carry2
) {
    for (ptrdiff_t dim = 0; dim < nd4; ++dim) {
        int64_t radix = hull4[dim];
        int64_t total = pt4[dim]+plus1[dim]+carry2;
        carry2 = total >= radix;
        pt4[dim] = carry2 ? total-radix : total;
    }
}

// Worker thread entry point. arg1 is this thread's Example30ThreaderNode1.
//
// The thread sleeps on its node's condition variable until a task is
// posted, runs the points assigned to its own node, then helps drain the
// points still left on other nodes, and finally reports completion to
// the hub. A negative np1 (posted together with a non-null task1) makes
// the thread exit. Always returns 0.
//
// Every pthread call is wrapped in a loop that retries until the call
// returns 0.
static void* Example30ThreaderMain1(void* arg1) {
Example30ThreaderNode1* node1 = arg1;
Example30ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
Example30ThreaderHub1* hub3 = team2->hub2;
Example30ThreaderNode1* nodes3 = team2->nodes2;
// Index of this node within the team's node array; also its bit position in hub3->status1.
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
// Invariant at the top of each iteration: node1->mut2 is held.
for (; ; ) {
Example30ThreaderTask1* task2 = node1->task1;
if (!task2) {
// Nothing posted yet: wait (releases and re-acquires node1->mut2).
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
// Shutdown request from Example30ThreaderDestroy1.
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
// Consume the posted task so the next outer iteration waits for a new one.
node1->task1 = 0;
Example30ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Run this node's own points. Each point is claimed under the node mutex
// (copy the point, decrement the count, advance the point) and executed
// with the mutex released, so other threads can claim points from this node.
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
Example30ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// Own node is drained: clear its bit in the status bitmap.
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
// Load the shared scan cursor.
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
// Look for other nodes whose status bit is still set and help drain them.
// hub3->mut1 is held at the top of each iteration.
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
// No candidate bits left in this word: move to the next word.
++offset2;
mask2 = -1;
continue;
}
// Lowest set bit at or after the cursor.
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
// Bit nt2 does not correspond to a node (node indices are below nt2) and
// is never cleared, so reaching it means the end of the bitmap. Rescan
// once from the start; if this pass already started at 0, nothing is left.
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
// Isolate the target's bit.
hand1 &= -hand1;
// Publish the cursor with the target's bit removed from the mask.
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
Example30ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
// Drain the target node's remaining points using the same claim/run protocol as above.
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
Example30ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// The target has no points left: clear its status bit and reload the shared cursor.
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// Report completion; the thread that brings pending1 to zero wakes Example30ThreaderDo1.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
// Re-acquire the node mutex to restore the outer loop's invariant.
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Tears down a thread team, including one that was only partially
// constructed. The counts and flags in team3->unwind1 say how many
// threads, condition variables and mutexes actually exist, so only those
// are joined or destroyed. A null team3 is a no-op.
//
// Every pthread call is retried until it returns 0.
static void Example30ThreaderDestroy1(Example30ThreaderTeam1* team3) {
    if (!team3) return;
    Example30ThreaderNode1* const base = team3->nodes2;

    // Ask every running worker to exit: a negative np1 together with a
    // non-null task1 is the shutdown request.
    const ptrdiff_t started = team3->unwind1.join1;
    for (ptrdiff_t i = 0; i != started; ++i) {
        Example30ThreaderNode1* worker = base+i;
        while (__builtin_expect(pthread_mutex_lock(&worker->mut2), 0));
        worker->np1 = -1;
        worker->task1 = (Example30ThreaderTask1*)1;
        while (__builtin_expect(pthread_mutex_unlock(&worker->mut2), 0));
        while (__builtin_expect(pthread_cond_signal(&worker->cond2), 0));
    }

    // Wait for the workers to finish before destroying what they use.
    for (ptrdiff_t i = 0; i != started; ++i) {
        while (__builtin_expect(pthread_join(base[i].thr1, 0), 0));
    }

    // Per-node synchronization objects.
    const ptrdiff_t condCount = team3->unwind1.nodeConds1;
    for (ptrdiff_t i = 0; i != condCount; ++i) {
        while (__builtin_expect(pthread_cond_destroy(&base[i].cond2), 0));
    }
    const ptrdiff_t mutCount = team3->unwind1.nodeMuts1;
    for (ptrdiff_t i = 0; i != mutCount; ++i) {
        while (__builtin_expect(pthread_mutex_destroy(&base[i].mut2), 0));
    }

    // Hub synchronization objects.
    Example30ThreaderHub1* shared = team3->hub2;
    if (team3->unwind1.hubCond1) {
        while (__builtin_expect(pthread_cond_destroy(&shared->cond1), 0));
    }
    if (team3->unwind1.hubMut1) {
        while (__builtin_expect(pthread_mutex_destroy(&shared->mut1), 0));
    }

    // Release the raw allocations, then the team itself.
    free(team3->unwind1.nodes1);
    free(team3->unwind1.hub1);
    free(team3);
}

// Final construction stage: for each of the nt7 nodes, initializes the
// node's mutex and condition variable and starts its worker thread.
//
// On success returns 0 and records nt7 for all three unwind counters.
// On failure returns a heap-allocated error message and records exactly
// how many mutexes, condition variables and threads exist, so that
// Example30ThreaderDestroy1 can release them.
static char* Example30ThreaderCreate1Up4(Example30ThreaderTeam1* team8, ptrdiff_t nt7) {
    Example30ThreaderNode1* const base = team8->nodes2;
    for (ptrdiff_t idx = 0; idx != nt7; ++idx) {
        Example30ThreaderNode1* cur = base+idx;
        // Resources that exist if this iteration fails at the current step.
        ptrdiff_t mutsReady = idx;
        ptrdiff_t condsReady = idx;
        char* failure;

        int rc = pthread_mutex_init(&cur->mut2, 0);
        if (__builtin_expect(rc, 0)) {
            failure = Example30Errmsg1(__LINE__, "errno %d", rc);
            goto failed;
        }
        mutsReady = idx+1;

        // The worker must see "no task" when it first runs.
        cur->task1 = 0;
        rc = pthread_cond_init(&cur->cond2, 0);
        if (__builtin_expect(rc, 0)) {
            failure = Example30Errmsg1(__LINE__, "errno %d", rc);
            goto failed;
        }
        condsReady = idx+1;

        cur->team1 = team8;
        rc = pthread_create(&cur->thr1, 0, Example30ThreaderMain1, cur);
        if (__builtin_expect(rc, 0)) {
            failure = Example30Errmsg1(__LINE__, "errno %d", rc);
            goto failed;
        }
        continue;

    failed:
        team8->unwind1.nodeMuts1 = mutsReady;
        team8->unwind1.nodeConds1 = condsReady;
        team8->unwind1.join1 = idx;
        return failure;
    }
    team8->unwind1.join1 = nt7;
    team8->unwind1.nodeConds1 = nt7;
    team8->unwind1.nodeMuts1 = nt7;
    return 0;
}

// Third construction stage: initializes the hub's mutex and condition
// variable, flagging each in unwind1 as soon as it exists, then continues
// with per-node setup. Returns 0 on success or a heap-allocated error
// message on failure.
static char* Example30ThreaderCreate1Up3(Example30ThreaderTeam1* team7, ptrdiff_t nt6) {
    Example30ThreaderHub1* const shared = team7->hub2;

    int rc = pthread_mutex_init(&shared->mut1, 0);
    if (__builtin_expect(rc, 0)) {
        return Example30Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubMut1 = 1;

    rc = pthread_cond_init(&shared->cond1, 0);
    if (__builtin_expect(rc, 0)) {
        return Example30Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubCond1 = 1;

    return Example30ThreaderCreate1Up4(team7, nt6);
}

// Second construction stage: allocates the array of nt5 nodes with 63
// bytes of slack, stores the raw pointer for free() in unwind1.nodes1 and
// the 64-byte-aligned pointer in nodes2, then continues with hub setup.
// Returns 0 on success or a heap-allocated error message on failure
// (including when nt5 nodes would overflow size_t).
static char* Example30ThreaderCreate1Up2(Example30ThreaderTeam1* team6, ptrdiff_t nt5) {
    const size_t nodeBytes = sizeof(Example30ThreaderNode1);
    const size_t count = (size_t)nt5;
    const size_t total = count*nodeBytes;
    // Dividing back detects wrap-around of the multiplication.
    if (__builtin_expect(total/nodeBytes != count, 0)) {
        return Example30Errmsg1(__LINE__, "too many threads");
    }
    void* raw = malloc(total+63);
    if (__builtin_expect(raw == 0, 0)) {
        return Example30Errmsg1(__LINE__, "errno %d", errno);
    }
    team6->unwind1.nodes1 = raw;
    // Round the address up to the next multiple of 64.
    team6->nodes2 = (void*)(((size_t)raw+63)&~(size_t)63);
    return Example30ThreaderCreate1Up3(team6, nt5);
}

// First construction stage: records the thread count and allocates the
// hub. The hub's trailing status bitmap gets nt4/(bits per long)+1 words.
// The raw pointer is kept in unwind1.hub1 for free() and the
// 64-byte-aligned pointer in hub2. Returns 0 on success or a
// heap-allocated error message on failure.
static char* Example30ThreaderCreate1Up1(Example30ThreaderTeam1* team5, ptrdiff_t nt4) {
    team5->nt1 = nt4;

    const size_t bitsPerWord = sizeof(long)*8;
    const size_t words = (size_t)nt4/bitsPerWord+1;
    size_t bytes = sizeof(Example30ThreaderHub1)+sizeof(long)*words;
    // Round the size up to a multiple of 64.
    bytes = (bytes+63)&~(size_t)63;

    void* raw = malloc(bytes+63);
    if (__builtin_expect(raw == 0, 0)) {
        return Example30Errmsg1(__LINE__, "errno %d", errno);
    }
    team5->unwind1.hub1 = raw;
    // Round the address up to the next multiple of 64.
    team5->hub2 = (void*)(((size_t)raw+63)&~(size_t)63);
    return Example30ThreaderCreate1Up2(team5, nt4);
}

// Creates a team of nt3 worker threads.
//
// team4: receives the new team on success; left untouched on failure.
// nt3:   number of threads; must be at least 1.
//
// Returns 0 on success. On failure returns a heap-allocated error message
// (to be freed by the caller) after releasing everything that was built.
static char* Example30ThreaderCreate1(Example30ThreaderTeam1** team4, ptrdiff_t nt3) {
    if (__builtin_expect(nt3 < 1, 0)) {
        return Example30Errmsg1(__LINE__, "too few threads");
    }
    // Zero-filled so that the unwind bookkeeping starts out empty.
    Example30ThreaderTeam1* fresh = calloc(1, sizeof(Example30ThreaderTeam1));
    if (__builtin_expect(fresh == 0, 0)) {
        return Example30Errmsg1(__LINE__, "errno %d", errno);
    }
    char* failure = Example30ThreaderCreate1Up1(fresh, nt3);
    if (__builtin_expect(failure != 0, 0)) {
        Example30ThreaderDestroy1(fresh);
        return failure;
    }
    *team4 = fresh;
    return failure;
}

// Looks up the pthread_t of the worker with index idx1.
//
// thr2:  receives the thread handle on success; untouched on failure.
// team9: the team to query.
// idx1:  thread index; valid values are 0 through team9->nt1-1.
//
// Returns 0 on success, or a heap-allocated error message if idx1 is out
// of range.
static char* Example30ThreaderPthreadT1(
pthread_t* thr2,
Example30ThreaderTeam1* team9,
ptrdiff_t idx1
) {
    int inRange = 0 <= idx1 && idx1 < team9->nt1;
    if (__builtin_expect(!inRange, 0)) {
        return Example30Errmsg1(__LINE__, "bad thread idx");
    }
    Example30ThreaderNode1* chosen = team9->nodes2+idx1;
    *thr2 = chosen->thr1;
    return 0;
}

// Runs task3 on all threads of team10 and blocks until every thread has
// reported completion.
//
// The task's index space has tot1 = hull1[0]*...*hull1[nd1-1] points. It
// is divided into nt8 consecutive runs: each node gets tot1/nt8 points
// and the first tot1%nt8 nodes get one more. Returns immediately if the
// task has fewer than one dimension.
//
// Every pthread call is retried until it returns 0.
static void Example30ThreaderDo1(Example30ThreaderTeam1* team10, Example30ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return;
// Total number of points in the index space.
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
// plus2 is each1 expressed as coordinates: the stride between consecutive nodes' starting points.
int64_t plus2[4];
Example30ThreaderPut1(nd6, task3->hull1, plus2, each1);
// pt6 is the starting point of the node currently being set up.
int64_t pt6[4] = {0};
Example30ThreaderHub1* hub6 = team10->hub2;
// The hub mutex is taken before any node is woken and is held until the
// wait below, so workers that finish early block on it until the hub
// fields have been initialized.
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
Example30ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
// The first more1 nodes take one extra point.
int64_t carry3 = i4 < more1;
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
// Wake the node's worker thread.
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
// Advance the starting point by this node's share (each1 + carry3).
Example30ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Reset the scan cursor and set every bit of the status bitmap
// (nt8/(bits per long)+1 words, matching the allocation size).
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
// Wait until every worker has decremented pending1; the wait releases hub6->mut1.
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

// Approximates exp(x) for 16 packed floats.
//
// Steps, as coded below: clamp x; compute t = x*1.442695 (approximately
// log2(e)) and r = t rounded to the nearest integer; reduce to
// f = x - r*ln(2); evaluate a degree-4 polynomial in f; scale the result
// by 2^r by adding r to the float's exponent field.
static __m512 Example30Exp1(__m512 x1) {
// Clamp the input to [-87.33654, 88.72284].
x1 = _mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f));
x1 = _mm512_min_ps(x1, _mm512_set1_ps(8.872284e+01f));
__m512 t1 = _mm512_mul_ps(x1, _mm512_set1_ps(1.442695e+00f));
__m512 r1 = _mm512_roundscale_ps(t1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
// f = x - r*ln(2), with ln(2) applied as two constants (0.69314575 + 1.4286068e-06).
__m512 f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-6.9314575e-01f), x1);
f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-1.4286068e-06f), f1);
// Horner evaluation of the polynomial in f, highest-degree coefficient first.
__m512 g1 = _mm512_set1_ps(4.194439e-02f);
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(1.6800667e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(4.9999994e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.999569e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.9999964e-01f));
// Convert t to int32 and shift it to bit 23, where the float exponent field starts.
// NOTE(review): this conversion presumably rounds to nearest so that it agrees with r1 — confirm.
__m512i y1 = _mm512_slli_epi32(_mm512_cvtps_epi32(t1), 23);
// Integer-add the shifted value to the bit pattern of the polynomial result.
return _mm512_castsi512_ps(_mm512_add_epi32(y1, _mm512_castps_si512(g1)));
}

// Approximates 1/sqrt(x) for 16 packed floats: takes the hardware
// estimate y from _mm512_rsqrt14_ps and refines it once by returning
// (0.5*y) * (3 - x*y*y), i.e. one Newton-Raphson step.
static __m512 Example30Rsqrt1(__m512 x2) {
__m512 y2 = _mm512_rsqrt14_ps(x2);
// z = x*y
__m512 z1 = _mm512_mul_ps(x2, y2);
// a = 0.5*y
__m512 a1 = _mm512_mul_ps(y2, _mm512_set1_ps(5e-01f));
// b = 3 - y*z
__m512 b1 = _mm512_fnmadd_ps(y2, z1, _mm512_set1_ps(3e+00f));
return _mm512_mul_ps(a1, b1);
}

// Folds the four batch-norm parameter arrays of a 16-channel layer into
// one (multiplier, addend) pair per channel:
//   mul = scale * rsqrt(variance + 1e-05)
//   add = shift - mean*mul
//
// means1, variances1, scales1, shifts1: inputs, 16 floats each.
// mas1: output, 128 bytes (32 floats) interleaved as
//       mul[0], add[0], mul[1], add[1], ..., mul[15], add[15].
static void Example30BnSimplify1(
float*restrict means1,
float*restrict variances1,
float*restrict scales1,
float*restrict shifts1,
char*restrict mas1
) {
__m512 eps1 = _mm512_set1_ps(1e-05f);
// Permutation indices for interleaving two vectors: indices 0-15 select
// from the first operand (mul), 16-31 from the second (add). xlo1 yields
// the pairs for channels 0-7, xhi1 the pairs for channels 8-15.
__m512i xlo1 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi1 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
__m512 va1 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*0);
// Despite the name, rcp1 holds the reciprocal square root of (variance + epsilon).
__m512 rcp1 = Example30Rsqrt1(_mm512_add_ps(eps1, va1));
__m512 sc1 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*0);
__m512 mul1 = _mm512_mul_ps(rcp1, sc1);
__m512 me1 = _mm512_loadu_ps(means1+(ptrdiff_t)16*0);
__m512 sh1 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*0);
// add = shift - mean*mul
__m512 add1 = _mm512_fnmadd_ps(me1, mul1, sh1);
__m512 lo1 = _mm512_permutex2var_ps(mul1, xlo1, add1);
__m512 hi1 = _mm512_permutex2var_ps(mul1, xhi1, add1);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*0, lo1);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*1, hi1);
}

// Folds the four batch-norm parameter arrays of a 61-channel layer into
// one (multiplier, addend) pair per channel:
//   mul = scale * rsqrt(variance + 1e-05)
//   add = shift - mean*mul
//
// means2, variances2, scales2, shifts2: inputs, 61 floats each.
// mas2: output, 488 bytes (122 floats) interleaved as
//       mul[0], add[0], mul[1], add[1], ..., mul[60], add[60].
//
// The 61 channels are processed as three full 16-lane vectors plus a
// 13-lane tail. Tail loads use mask 8191 (the low 13 bits) and zero the
// unused lanes, so nothing is read past the end of the inputs.
static void Example30BnSimplify2(
float*restrict means2,
float*restrict variances2,
float*restrict scales2,
float*restrict shifts2,
char*restrict mas2
) {
__m512 eps2 = _mm512_set1_ps(1e-05f);
// Permutation indices for interleaving two vectors: indices 0-15 select
// from the first operand (mul), 16-31 from the second (add). xlo2 yields
// the pairs for lanes 0-7, xhi2 the pairs for lanes 8-15.
__m512i xlo2 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi2 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
__m512 va2 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*0);
__m512 va3 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*1);
__m512 va4 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*2);
__m512 va5 = _mm512_maskz_loadu_ps(8191, variances2+(ptrdiff_t)16*3);
// Despite the names, rcp2..rcp5 hold the reciprocal square root of (variance + epsilon).
__m512 rcp2 = Example30Rsqrt1(_mm512_add_ps(eps2, va2));
__m512 rcp3 = Example30Rsqrt1(_mm512_add_ps(eps2, va3));
__m512 rcp4 = Example30Rsqrt1(_mm512_add_ps(eps2, va4));
__m512 rcp5 = Example30Rsqrt1(_mm512_add_ps(eps2, va5));
__m512 sc2 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*0);
__m512 sc3 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*1);
__m512 sc4 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*2);
__m512 sc5 = _mm512_maskz_loadu_ps(8191, scales2+(ptrdiff_t)16*3);
__m512 mul2 = _mm512_mul_ps(rcp2, sc2);
__m512 mul3 = _mm512_mul_ps(rcp3, sc3);
__m512 mul4 = _mm512_mul_ps(rcp4, sc4);
__m512 mul5 = _mm512_mul_ps(rcp5, sc5);
__m512 me2 = _mm512_loadu_ps(means2+(ptrdiff_t)16*0);
__m512 me3 = _mm512_loadu_ps(means2+(ptrdiff_t)16*1);
__m512 me4 = _mm512_loadu_ps(means2+(ptrdiff_t)16*2);
__m512 me5 = _mm512_maskz_loadu_ps(8191, means2+(ptrdiff_t)16*3);
__m512 sh2 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*0);
__m512 sh3 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*1);
__m512 sh4 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*2);
__m512 sh5 = _mm512_maskz_loadu_ps(8191, shifts2+(ptrdiff_t)16*3);
// add = shift - mean*mul
__m512 add2 = _mm512_fnmadd_ps(me2, mul2, sh2);
__m512 add3 = _mm512_fnmadd_ps(me3, mul3, sh3);
__m512 add4 = _mm512_fnmadd_ps(me4, mul4, sh4);
__m512 add5 = _mm512_fnmadd_ps(me5, mul5, sh5);
__m512 lo2 = _mm512_permutex2var_ps(mul2, xlo2, add2);
__m512 lo3 = _mm512_permutex2var_ps(mul3, xlo2, add3);
__m512 lo4 = _mm512_permutex2var_ps(mul4, xlo2, add4);
__m512 lo5 = _mm512_permutex2var_ps(mul5, xlo2, add5);
__m512 hi2 = _mm512_permutex2var_ps(mul2, xhi2, add2);
__m512 hi3 = _mm512_permutex2var_ps(mul3, xhi2, add3);
__m512 hi4 = _mm512_permutex2var_ps(mul4, xhi2, add4);
__m512 hi5 = _mm512_permutex2var_ps(mul5, xhi2, add5);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*0, lo2);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*1, hi2);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*2, lo3);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*3, hi3);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*4, lo4);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*5, hi4);
// Tail: lo5 holds the pairs for channels 48-55 (a full vector); hi5 holds
// channels 56-60, so only its low 10 floats (mask 1023) are stored.
_mm512_storeu_ps(mas2+(ptrdiff_t)64*6, lo5);
_mm512_mask_storeu_ps(mas2+(ptrdiff_t)64*7, 1023, hi5);
}

static void Example30LoomArrangeFilts1Callee1(Example30ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
ptrdiff_t b2 = pt7[0];
ptrdiff_t g2 = 0;
ptrdiff_t e1 = 0;
char*restrict arrangedB1 = tensors2[4]+244*e1;
char*restrict arrangedW1 = tensors2[4]+244+3259840*e1;
char*restrict wtPtr1 = tensors2[0]+53440*e1;
char*restrict biasPtr1 = tensors2[1];
char*restrict bnPtr1 = tensors2[2]+(ptrdiff_t)8*835*e1;
char*restrict bnPtr2 = tensors2[3];
ptrdiff_t i5 = 1*g2;
ptrdiff_t j1 = 2*b2;
ptrdiff_t jj1 = j1+1;
if (j1 < 3) {
for (; j1 != 3; ++j1) {
__m512i pmMul1 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd1 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo1 = _mm512_loadu_ps(bnPtr2+(ptrdiff_t)8*(0+61*i5+16*j1));
__m512 masHi1 = _mm512_maskz_loadu_ps(65535, bnPtr2+(ptrdiff_t)8*(0+61*i5+16*j1)+(ptrdiff_t)64);
__m512 postMul1 = _mm512_permutex2var_ps(masLo1, pmMul1, masHi1);
__m512 postAdd1 = _mm512_permutex2var_ps(masLo1, pmAdd1, masHi1);
__m512 bias1 = _mm512_setzero_ps();
if (!e1) {
bias1 = _mm512_maskz_loadu_ps(65535, biasPtr1-0+244*i5+64*j1);
bias1 = _mm512_fmadd_ps(postMul1, bias1, postAdd1);
}
ptrdiff_t c1 = (size_t)(0+16*j1)/6;
switch ((size_t)(0+16*j1)%6) {
case 0: {
ptrdiff_t k1 = 0;
for (; k1 != 16; ++k1) {
__m512 wt1 = _mm512_maskz_loadu_ps(65535, wtPtr1+0+62464*i5+16384*j1+64*k1);
__m512 wt2 = _mm512_maskz_loadu_ps(65535, wtPtr1+1024+62464*i5+16384*j1+64*k1);
__m512 wt3 = _mm512_maskz_loadu_ps(65535, wtPtr1+2048+62464*i5+16384*j1+64*k1);
__m512 wt4 = _mm512_maskz_loadu_ps(65535, wtPtr1+3072+62464*i5+16384*j1+64*k1);
__m512 wt5 = _mm512_maskz_loadu_ps(65535, wtPtr1+4096+62464*i5+16384*j1+64*k1);
__m512 wt6 = _mm512_maskz_loadu_ps(65535, wtPtr1+5120+62464*i5+16384*j1+64*k1);
__m512 wt7 = _mm512_maskz_loadu_ps(65535, wtPtr1+6144+62464*i5+16384*j1+64*k1);
__m512 wt8 = _mm512_maskz_loadu_ps(65535, wtPtr1+7168+62464*i5+16384*j1+64*k1);
__m512 wt9 = _mm512_maskz_loadu_ps(65535, wtPtr1+8192+62464*i5+16384*j1+64*k1);
__m512 wt10 = _mm512_maskz_loadu_ps(65535, wtPtr1+9216+62464*i5+16384*j1+64*k1);
__m512 wt11 = _mm512_maskz_loadu_ps(65535, wtPtr1+10240+62464*i5+16384*j1+64*k1);
__m512 wt12 = _mm512_maskz_loadu_ps(65535, wtPtr1+11264+62464*i5+16384*j1+64*k1);
__m512 wt13 = _mm512_maskz_loadu_ps(65535, wtPtr1+12288+62464*i5+16384*j1+64*k1);
__m512 wt14 = _mm512_maskz_loadu_ps(65535, wtPtr1+13312+62464*i5+16384*j1+64*k1);
__m512 wt15 = _mm512_maskz_loadu_ps(65535, wtPtr1+14336+62464*i5+16384*j1+64*k1);
__m512 wt16 = _mm512_maskz_loadu_ps(65535, wtPtr1+15360+62464*i5+16384*j1+64*k1);
__m512 tmp1 = _mm512_unpacklo_ps(wt1, wt2);
__m512 tmp2 = _mm512_unpackhi_ps(wt1, wt2);
__m512 tmp3 = _mm512_unpacklo_ps(wt3, wt4);
__m512 tmp4 = _mm512_unpackhi_ps(wt3, wt4);
__m512 tmp5 = _mm512_unpacklo_ps(wt5, wt6);
__m512 tmp6 = _mm512_unpackhi_ps(wt5, wt6);
__m512 tmp7 = _mm512_unpacklo_ps(wt7, wt8);
__m512 tmp8 = _mm512_unpackhi_ps(wt7, wt8);
__m512 tmp9 = _mm512_unpacklo_ps(wt9, wt10);
__m512 tmp10 = _mm512_unpackhi_ps(wt9, wt10);
__m512 tmp11 = _mm512_unpacklo_ps(wt11, wt12);
__m512 tmp12 = _mm512_unpackhi_ps(wt11, wt12);
__m512 tmp13 = _mm512_unpacklo_ps(wt13, wt14);
__m512 tmp14 = _mm512_unpackhi_ps(wt13, wt14);
__m512 tmp15 = _mm512_unpacklo_ps(wt15, wt16);
__m512 tmp16 = _mm512_unpackhi_ps(wt15, wt16);
__m512 tmp17 = _mm512_shuffle_ps(tmp1, tmp3, 68);
__m512 tmp18 = _mm512_shuffle_ps(tmp1, tmp3, 238);
__m512 tmp19 = _mm512_shuffle_ps(tmp2, tmp4, 68);
__m512 tmp20 = _mm512_shuffle_ps(tmp2, tmp4, 238);
__m512 tmp21 = _mm512_shuffle_ps(tmp5, tmp7, 68);
__m512 tmp22 = _mm512_shuffle_ps(tmp5, tmp7, 238);
__m512 tmp23 = _mm512_shuffle_ps(tmp6, tmp8, 68);
__m512 tmp24 = _mm512_shuffle_ps(tmp6, tmp8, 238);
__m512 tmp25 = _mm512_shuffle_ps(tmp9, tmp11, 68);
__m512 tmp26 = _mm512_shuffle_ps(tmp9, tmp11, 238);
__m512 tmp27 = _mm512_shuffle_ps(tmp10, tmp12, 68);
__m512 tmp28 = _mm512_shuffle_ps(tmp10, tmp12, 238);
__m512 tmp29 = _mm512_shuffle_ps(tmp13, tmp15, 68);
__m512 tmp30 = _mm512_shuffle_ps(tmp13, tmp15, 238);
__m512 tmp31 = _mm512_shuffle_ps(tmp14, tmp16, 68);
__m512 tmp32 = _mm512_shuffle_ps(tmp14, tmp16, 238);
__m512 tmp33 = _mm512_shuffle_f32x4(tmp17, tmp21, 136);
__m512 tmp34 = _mm512_shuffle_f32x4(tmp17, tmp21, 221);
__m512 tmp35 = _mm512_shuffle_f32x4(tmp18, tmp22, 136);
__m512 tmp36 = _mm512_shuffle_f32x4(tmp18, tmp22, 221);
__m512 tmp37 = _mm512_shuffle_f32x4(tmp19, tmp23, 136);
__m512 tmp38 = _mm512_shuffle_f32x4(tmp19, tmp23, 221);
__m512 tmp39 = _mm512_shuffle_f32x4(tmp20, tmp24, 136);
__m512 tmp40 = _mm512_shuffle_f32x4(tmp20, tmp24, 221);
__m512 tmp41 = _mm512_shuffle_f32x4(tmp25, tmp29, 136);
__m512 tmp42 = _mm512_shuffle_f32x4(tmp25, tmp29, 221);
__m512 tmp43 = _mm512_shuffle_f32x4(tmp26, tmp30, 136);
__m512 tmp44 = _mm512_shuffle_f32x4(tmp26, tmp30, 221);
__m512 tmp45 = _mm512_shuffle_f32x4(tmp27, tmp31, 136);
__m512 tmp46 = _mm512_shuffle_f32x4(tmp27, tmp31, 221);
__m512 tmp47 = _mm512_shuffle_f32x4(tmp28, tmp32, 136);
__m512 tmp48 = _mm512_shuffle_f32x4(tmp28, tmp32, 221);
wt1 = _mm512_shuffle_f32x4(tmp33, tmp41, 136);
wt9 = _mm512_shuffle_f32x4(tmp33, tmp41, 221);
wt2 = _mm512_shuffle_f32x4(tmp35, tmp43, 136);
wt10 = _mm512_shuffle_f32x4(tmp35, tmp43, 221);
wt3 = _mm512_shuffle_f32x4(tmp37, tmp45, 136);
wt11 = _mm512_shuffle_f32x4(tmp37, tmp45, 221);
wt4 = _mm512_shuffle_f32x4(tmp39, tmp47, 136);
wt12 = _mm512_shuffle_f32x4(tmp39, tmp47, 221);
wt5 = _mm512_shuffle_f32x4(tmp34, tmp42, 136);
wt13 = _mm512_shuffle_f32x4(tmp34, tmp42, 221);
wt6 = _mm512_shuffle_f32x4(tmp36, tmp44, 136);
wt14 = _mm512_shuffle_f32x4(tmp36, tmp44, 221);
wt7 = _mm512_shuffle_f32x4(tmp38, tmp46, 136);
wt15 = _mm512_shuffle_f32x4(tmp38, tmp46, 221);
wt8 = _mm512_shuffle_f32x4(tmp40, tmp48, 136);
wt16 = _mm512_shuffle_f32x4(tmp40, tmp48, 221);
wt1 = _mm512_mul_ps(wt1, postMul1);
__m512 preMul1 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+16*i5+1*k1))[0]);
__m512 preAdd1 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+16*i5+1*k1))[1]);
bias1 = _mm512_fmadd_ps(wt1, preAdd1, bias1);
wt1 = _mm512_mul_ps(wt1, preMul1);
_mm512_mask_storeu_ps(arrangedW1+0+62464*i5+384*c1+24*k1, 63, wt1);
_mm512_mask_storeu_ps(arrangedW1+360+62464*i5+384*c1+24*k1, 4032, wt1);
_mm512_mask_storeu_ps(arrangedW1+720+62464*i5+384*c1+24*k1, 61440, wt1);
wt2 = _mm512_mul_ps(wt2, postMul1);
bias1 = _mm512_fmadd_ps(wt2, preAdd1, bias1);
wt2 = _mm512_mul_ps(wt2, preMul1);
_mm512_mask_storeu_ps(arrangedW1+3904+62464*i5+384*c1+24*k1, 63, wt2);
_mm512_mask_storeu_ps(arrangedW1+4264+62464*i5+384*c1+24*k1, 4032, wt2);
_mm512_mask_storeu_ps(arrangedW1+4624+62464*i5+384*c1+24*k1, 61440, wt2);
wt3 = _mm512_mul_ps(wt3, postMul1);
bias1 = _mm512_fmadd_ps(wt3, preAdd1, bias1);
wt3 = _mm512_mul_ps(wt3, preMul1);
_mm512_mask_storeu_ps(arrangedW1+7808+62464*i5+384*c1+24*k1, 63, wt3);
_mm512_mask_storeu_ps(arrangedW1+8168+62464*i5+384*c1+24*k1, 4032, wt3);
_mm512_mask_storeu_ps(arrangedW1+8528+62464*i5+384*c1+24*k1, 61440, wt3);
wt4 = _mm512_mul_ps(wt4, postMul1);
bias1 = _mm512_fmadd_ps(wt4, preAdd1, bias1);
wt4 = _mm512_mul_ps(wt4, preMul1);
_mm512_mask_storeu_ps(arrangedW1+11712+62464*i5+384*c1+24*k1, 63, wt4);
_mm512_mask_storeu_ps(arrangedW1+12072+62464*i5+384*c1+24*k1, 4032, wt4);
_mm512_mask_storeu_ps(arrangedW1+12432+62464*i5+384*c1+24*k1, 61440, wt4);
wt5 = _mm512_mul_ps(wt5, postMul1);
bias1 = _mm512_fmadd_ps(wt5, preAdd1, bias1);
wt5 = _mm512_mul_ps(wt5, preMul1);
_mm512_mask_storeu_ps(arrangedW1+15616+62464*i5+384*c1+24*k1, 63, wt5);
_mm512_mask_storeu_ps(arrangedW1+15976+62464*i5+384*c1+24*k1, 4032, wt5);
_mm512_mask_storeu_ps(arrangedW1+16336+62464*i5+384*c1+24*k1, 61440, wt5);
wt6 = _mm512_mul_ps(wt6, postMul1);
bias1 = _mm512_fmadd_ps(wt6, preAdd1, bias1);
wt6 = _mm512_mul_ps(wt6, preMul1);
_mm512_mask_storeu_ps(arrangedW1+19520+62464*i5+384*c1+24*k1, 63, wt6);
_mm512_mask_storeu_ps(arrangedW1+19880+62464*i5+384*c1+24*k1, 4032, wt6);
_mm512_mask_storeu_ps(arrangedW1+20240+62464*i5+384*c1+24*k1, 61440, wt6);
wt7 = _mm512_mul_ps(wt7, postMul1);
bias1 = _mm512_fmadd_ps(wt7, preAdd1, bias1);
wt7 = _mm512_mul_ps(wt7, preMul1);
_mm512_mask_storeu_ps(arrangedW1+23424+62464*i5+384*c1+24*k1, 63, wt7);
_mm512_mask_storeu_ps(arrangedW1+23784+62464*i5+384*c1+24*k1, 4032, wt7);
_mm512_mask_storeu_ps(arrangedW1+24144+62464*i5+384*c1+24*k1, 61440, wt7);
wt8 = _mm512_mul_ps(wt8, postMul1);
bias1 = _mm512_fmadd_ps(wt8, preAdd1, bias1);
wt8 = _mm512_mul_ps(wt8, preMul1);
_mm512_mask_storeu_ps(arrangedW1+27328+62464*i5+384*c1+24*k1, 63, wt8);
_mm512_mask_storeu_ps(arrangedW1+27688+62464*i5+384*c1+24*k1, 4032, wt8);
_mm512_mask_storeu_ps(arrangedW1+28048+62464*i5+384*c1+24*k1, 61440, wt8);
wt9 = _mm512_mul_ps(wt9, postMul1);
bias1 = _mm512_fmadd_ps(wt9, preAdd1, bias1);
wt9 = _mm512_mul_ps(wt9, preMul1);
_mm512_mask_storeu_ps(arrangedW1+31232+62464*i5+384*c1+24*k1, 63, wt9);
_mm512_mask_storeu_ps(arrangedW1+31592+62464*i5+384*c1+24*k1, 4032, wt9);
_mm512_mask_storeu_ps(arrangedW1+31952+62464*i5+384*c1+24*k1, 61440, wt9);
wt10 = _mm512_mul_ps(wt10, postMul1);
bias1 = _mm512_fmadd_ps(wt10, preAdd1, bias1);
wt10 = _mm512_mul_ps(wt10, preMul1);
_mm512_mask_storeu_ps(arrangedW1+35136+62464*i5+384*c1+24*k1, 63, wt10);
_mm512_mask_storeu_ps(arrangedW1+35496+62464*i5+384*c1+24*k1, 4032, wt10);
_mm512_mask_storeu_ps(arrangedW1+35856+62464*i5+384*c1+24*k1, 61440, wt10);
wt11 = _mm512_mul_ps(wt11, postMul1);
bias1 = _mm512_fmadd_ps(wt11, preAdd1, bias1);
wt11 = _mm512_mul_ps(wt11, preMul1);
_mm512_mask_storeu_ps(arrangedW1+39040+62464*i5+384*c1+24*k1, 63, wt11);
_mm512_mask_storeu_ps(arrangedW1+39400+62464*i5+384*c1+24*k1, 4032, wt11);
_mm512_mask_storeu_ps(arrangedW1+39760+62464*i5+384*c1+24*k1, 61440, wt11);
wt12 = _mm512_mul_ps(wt12, postMul1);
bias1 = _mm512_fmadd_ps(wt12, preAdd1, bias1);
wt12 = _mm512_mul_ps(wt12, preMul1);
_mm512_mask_storeu_ps(arrangedW1+42944+62464*i5+384*c1+24*k1, 63, wt12);
_mm512_mask_storeu_ps(arrangedW1+43304+62464*i5+384*c1+24*k1, 4032, wt12);
_mm512_mask_storeu_ps(arrangedW1+43664+62464*i5+384*c1+24*k1, 61440, wt12);
wt13 = _mm512_mul_ps(wt13, postMul1);
bias1 = _mm512_fmadd_ps(wt13, preAdd1, bias1);
wt13 = _mm512_mul_ps(wt13, preMul1);
_mm512_mask_storeu_ps(arrangedW1+46848+62464*i5+384*c1+24*k1, 63, wt13);
_mm512_mask_storeu_ps(arrangedW1+47208+62464*i5+384*c1+24*k1, 4032, wt13);
_mm512_mask_storeu_ps(arrangedW1+47568+62464*i5+384*c1+24*k1, 61440, wt13);
wt14 = _mm512_mul_ps(wt14, postMul1);
bias1 = _mm512_fmadd_ps(wt14, preAdd1, bias1);
wt14 = _mm512_mul_ps(wt14, preMul1);
_mm512_mask_storeu_ps(arrangedW1+50752+62464*i5+384*c1+24*k1, 63, wt14);
_mm512_mask_storeu_ps(arrangedW1+51112+62464*i5+384*c1+24*k1, 4032, wt14);
_mm512_mask_storeu_ps(arrangedW1+51472+62464*i5+384*c1+24*k1, 61440, wt14);
wt15 = _mm512_mul_ps(wt15, postMul1);
bias1 = _mm512_fmadd_ps(wt15, preAdd1, bias1);
wt15 = _mm512_mul_ps(wt15, preMul1);
_mm512_mask_storeu_ps(arrangedW1+54656+62464*i5+384*c1+24*k1, 63, wt15);
_mm512_mask_storeu_ps(arrangedW1+55016+62464*i5+384*c1+24*k1, 4032, wt15);
_mm512_mask_storeu_ps(arrangedW1+55376+62464*i5+384*c1+24*k1, 61440, wt15);
wt16 = _mm512_mul_ps(wt16, postMul1);
bias1 = _mm512_fmadd_ps(wt16, preAdd1, bias1);
wt16 = _mm512_mul_ps(wt16, preMul1);
_mm512_mask_storeu_ps(arrangedW1+58560+62464*i5+384*c1+24*k1, 63, wt16);
_mm512_mask_storeu_ps(arrangedW1+58920+62464*i5+384*c1+24*k1, 4032, wt16);
_mm512_mask_storeu_ps(arrangedW1+59280+62464*i5+384*c1+24*k1, 61440, wt16);
}
break;
}
case 2: {
ptrdiff_t k2 = 0;
for (; k2 != 16; ++k2) {
__m512 wt17 = _mm512_maskz_loadu_ps(65535, wtPtr1+0+62464*i5+16384*j1+64*k2);
__m512 wt18 = _mm512_maskz_loadu_ps(65535, wtPtr1+1024+62464*i5+16384*j1+64*k2);
__m512 wt19 = _mm512_maskz_loadu_ps(65535, wtPtr1+2048+62464*i5+16384*j1+64*k2);
__m512 wt20 = _mm512_maskz_loadu_ps(65535, wtPtr1+3072+62464*i5+16384*j1+64*k2);
__m512 wt21 = _mm512_maskz_loadu_ps(65535, wtPtr1+4096+62464*i5+16384*j1+64*k2);
__m512 wt22 = _mm512_maskz_loadu_ps(65535, wtPtr1+5120+62464*i5+16384*j1+64*k2);
__m512 wt23 = _mm512_maskz_loadu_ps(65535, wtPtr1+6144+62464*i5+16384*j1+64*k2);
__m512 wt24 = _mm512_maskz_loadu_ps(65535, wtPtr1+7168+62464*i5+16384*j1+64*k2);
__m512 wt25 = _mm512_maskz_loadu_ps(65535, wtPtr1+8192+62464*i5+16384*j1+64*k2);
__m512 wt26 = _mm512_maskz_loadu_ps(65535, wtPtr1+9216+62464*i5+16384*j1+64*k2);
__m512 wt27 = _mm512_maskz_loadu_ps(65535, wtPtr1+10240+62464*i5+16384*j1+64*k2);
__m512 wt28 = _mm512_maskz_loadu_ps(65535, wtPtr1+11264+62464*i5+16384*j1+64*k2);
__m512 wt29 = _mm512_maskz_loadu_ps(65535, wtPtr1+12288+62464*i5+16384*j1+64*k2);
__m512 wt30 = _mm512_maskz_loadu_ps(65535, wtPtr1+13312+62464*i5+16384*j1+64*k2);
__m512 wt31 = _mm512_maskz_loadu_ps(65535, wtPtr1+14336+62464*i5+16384*j1+64*k2);
__m512 wt32 = _mm512_maskz_loadu_ps(65535, wtPtr1+15360+62464*i5+16384*j1+64*k2);
__m512 tmp49 = _mm512_unpacklo_ps(wt17, wt18);
__m512 tmp50 = _mm512_unpackhi_ps(wt17, wt18);
__m512 tmp51 = _mm512_unpacklo_ps(wt19, wt20);
__m512 tmp52 = _mm512_unpackhi_ps(wt19, wt20);
__m512 tmp53 = _mm512_unpacklo_ps(wt21, wt22);
__m512 tmp54 = _mm512_unpackhi_ps(wt21, wt22);
__m512 tmp55 = _mm512_unpacklo_ps(wt23, wt24);
__m512 tmp56 = _mm512_unpackhi_ps(wt23, wt24);
__m512 tmp57 = _mm512_unpacklo_ps(wt25, wt26);
__m512 tmp58 = _mm512_unpackhi_ps(wt25, wt26);
__m512 tmp59 = _mm512_unpacklo_ps(wt27, wt28);
__m512 tmp60 = _mm512_unpackhi_ps(wt27, wt28);
__m512 tmp61 = _mm512_unpacklo_ps(wt29, wt30);
__m512 tmp62 = _mm512_unpackhi_ps(wt29, wt30);
__m512 tmp63 = _mm512_unpacklo_ps(wt31, wt32);
__m512 tmp64 = _mm512_unpackhi_ps(wt31, wt32);
__m512 tmp65 = _mm512_shuffle_ps(tmp49, tmp51, 68);
__m512 tmp66 = _mm512_shuffle_ps(tmp49, tmp51, 238);
__m512 tmp67 = _mm512_shuffle_ps(tmp50, tmp52, 68);
__m512 tmp68 = _mm512_shuffle_ps(tmp50, tmp52, 238);
__m512 tmp69 = _mm512_shuffle_ps(tmp53, tmp55, 68);
__m512 tmp70 = _mm512_shuffle_ps(tmp53, tmp55, 238);
__m512 tmp71 = _mm512_shuffle_ps(tmp54, tmp56, 68);
__m512 tmp72 = _mm512_shuffle_ps(tmp54, tmp56, 238);
__m512 tmp73 = _mm512_shuffle_ps(tmp57, tmp59, 68);
__m512 tmp74 = _mm512_shuffle_ps(tmp57, tmp59, 238);
__m512 tmp75 = _mm512_shuffle_ps(tmp58, tmp60, 68);
__m512 tmp76 = _mm512_shuffle_ps(tmp58, tmp60, 238);
__m512 tmp77 = _mm512_shuffle_ps(tmp61, tmp63, 68);
__m512 tmp78 = _mm512_shuffle_ps(tmp61, tmp63, 238);
__m512 tmp79 = _mm512_shuffle_ps(tmp62, tmp64, 68);
__m512 tmp80 = _mm512_shuffle_ps(tmp62, tmp64, 238);
__m512 tmp81 = _mm512_shuffle_f32x4(tmp65, tmp69, 136);
__m512 tmp82 = _mm512_shuffle_f32x4(tmp65, tmp69, 221);
__m512 tmp83 = _mm512_shuffle_f32x4(tmp66, tmp70, 136);
__m512 tmp84 = _mm512_shuffle_f32x4(tmp66, tmp70, 221);
__m512 tmp85 = _mm512_shuffle_f32x4(tmp67, tmp71, 136);
__m512 tmp86 = _mm512_shuffle_f32x4(tmp67, tmp71, 221);
__m512 tmp87 = _mm512_shuffle_f32x4(tmp68, tmp72, 136);
__m512 tmp88 = _mm512_shuffle_f32x4(tmp68, tmp72, 221);
__m512 tmp89 = _mm512_shuffle_f32x4(tmp73, tmp77, 136);
__m512 tmp90 = _mm512_shuffle_f32x4(tmp73, tmp77, 221);
__m512 tmp91 = _mm512_shuffle_f32x4(tmp74, tmp78, 136);
__m512 tmp92 = _mm512_shuffle_f32x4(tmp74, tmp78, 221);
__m512 tmp93 = _mm512_shuffle_f32x4(tmp75, tmp79, 136);
__m512 tmp94 = _mm512_shuffle_f32x4(tmp75, tmp79, 221);
__m512 tmp95 = _mm512_shuffle_f32x4(tmp76, tmp80, 136);
__m512 tmp96 = _mm512_shuffle_f32x4(tmp76, tmp80, 221);
wt17 = _mm512_shuffle_f32x4(tmp81, tmp89, 136);
wt25 = _mm512_shuffle_f32x4(tmp81, tmp89, 221);
wt18 = _mm512_shuffle_f32x4(tmp83, tmp91, 136);
wt26 = _mm512_shuffle_f32x4(tmp83, tmp91, 221);
wt19 = _mm512_shuffle_f32x4(tmp85, tmp93, 136);
wt27 = _mm512_shuffle_f32x4(tmp85, tmp93, 221);
wt20 = _mm512_shuffle_f32x4(tmp87, tmp95, 136);
wt28 = _mm512_shuffle_f32x4(tmp87, tmp95, 221);
wt21 = _mm512_shuffle_f32x4(tmp82, tmp90, 136);
wt29 = _mm512_shuffle_f32x4(tmp82, tmp90, 221);
wt22 = _mm512_shuffle_f32x4(tmp84, tmp92, 136);
wt30 = _mm512_shuffle_f32x4(tmp84, tmp92, 221);
wt23 = _mm512_shuffle_f32x4(tmp86, tmp94, 136);
wt31 = _mm512_shuffle_f32x4(tmp86, tmp94, 221);
wt24 = _mm512_shuffle_f32x4(tmp88, tmp96, 136);
wt32 = _mm512_shuffle_f32x4(tmp88, tmp96, 221);
wt17 = _mm512_mul_ps(wt17, postMul1);
__m512 preMul2 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+16*i5+1*k2))[0]);
__m512 preAdd2 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+16*i5+1*k2))[1]);
bias1 = _mm512_fmadd_ps(wt17, preAdd2, bias1);
wt17 = _mm512_mul_ps(wt17, preMul2);
_mm512_mask_storeu_ps(arrangedW1+8+62464*i5+384*c1+24*k2, 15, wt17);
_mm512_mask_storeu_ps(arrangedW1+368+62464*i5+384*c1+24*k2, 1008, wt17);
_mm512_mask_storeu_ps(arrangedW1+728+62464*i5+384*c1+24*k2, 64512, wt17);
wt18 = _mm512_mul_ps(wt18, postMul1);
bias1 = _mm512_fmadd_ps(wt18, preAdd2, bias1);
wt18 = _mm512_mul_ps(wt18, preMul2);
_mm512_mask_storeu_ps(arrangedW1+3912+62464*i5+384*c1+24*k2, 15, wt18);
_mm512_mask_storeu_ps(arrangedW1+4272+62464*i5+384*c1+24*k2, 1008, wt18);
_mm512_mask_storeu_ps(arrangedW1+4632+62464*i5+384*c1+24*k2, 64512, wt18);
wt19 = _mm512_mul_ps(wt19, postMul1);
bias1 = _mm512_fmadd_ps(wt19, preAdd2, bias1);
wt19 = _mm512_mul_ps(wt19, preMul2);
_mm512_mask_storeu_ps(arrangedW1+7816+62464*i5+384*c1+24*k2, 15, wt19);
_mm512_mask_storeu_ps(arrangedW1+8176+62464*i5+384*c1+24*k2, 1008, wt19);
_mm512_mask_storeu_ps(arrangedW1+8536+62464*i5+384*c1+24*k2, 64512, wt19);
wt20 = _mm512_mul_ps(wt20, postMul1);
bias1 = _mm512_fmadd_ps(wt20, preAdd2, bias1);
wt20 = _mm512_mul_ps(wt20, preMul2);
_mm512_mask_storeu_ps(arrangedW1+11720+62464*i5+384*c1+24*k2, 15, wt20);
_mm512_mask_storeu_ps(arrangedW1+12080+62464*i5+384*c1+24*k2, 1008, wt20);
_mm512_mask_storeu_ps(arrangedW1+12440+62464*i5+384*c1+24*k2, 64512, wt20);
wt21 = _mm512_mul_ps(wt21, postMul1);
bias1 = _mm512_fmadd_ps(wt21, preAdd2, bias1);
wt21 = _mm512_mul_ps(wt21, preMul2);
_mm512_mask_storeu_ps(arrangedW1+15624+62464*i5+384*c1+24*k2, 15, wt21);
_mm512_mask_storeu_ps(arrangedW1+15984+62464*i5+384*c1+24*k2, 1008, wt21);
_mm512_mask_storeu_ps(arrangedW1+16344+62464*i5+384*c1+24*k2, 64512, wt21);
wt22 = _mm512_mul_ps(wt22, postMul1);
bias1 = _mm512_fmadd_ps(wt22, preAdd2, bias1);
wt22 = _mm512_mul_ps(wt22, preMul2);
_mm512_mask_storeu_ps(arrangedW1+19528+62464*i5+384*c1+24*k2, 15, wt22);
_mm512_mask_storeu_ps(arrangedW1+19888+62464*i5+384*c1+24*k2, 1008, wt22);
_mm512_mask_storeu_ps(arrangedW1+20248+62464*i5+384*c1+24*k2, 64512, wt22);
wt23 = _mm512_mul_ps(wt23, postMul1);
bias1 = _mm512_fmadd_ps(wt23, preAdd2, bias1);
wt23 = _mm512_mul_ps(wt23, preMul2);
_mm512_mask_storeu_ps(arrangedW1+23432+62464*i5+384*c1+24*k2, 15, wt23);
_mm512_mask_storeu_ps(arrangedW1+23792+62464*i5+384*c1+24*k2, 1008, wt23);
_mm512_mask_storeu_ps(arrangedW1+24152+62464*i5+384*c1+24*k2, 64512, wt23);
wt24 = _mm512_mul_ps(wt24, postMul1);
bias1 = _mm512_fmadd_ps(wt24, preAdd2, bias1);
wt24 = _mm512_mul_ps(wt24, preMul2);
_mm512_mask_storeu_ps(arrangedW1+27336+62464*i5+384*c1+24*k2, 15, wt24);
_mm512_mask_storeu_ps(arrangedW1+27696+62464*i5+384*c1+24*k2, 1008, wt24);
_mm512_mask_storeu_ps(arrangedW1+28056+62464*i5+384*c1+24*k2, 64512, wt24);
wt25 = _mm512_mul_ps(wt25, postMul1);
bias1 = _mm512_fmadd_ps(wt25, preAdd2, bias1);
wt25 = _mm512_mul_ps(wt25, preMul2);
_mm512_mask_storeu_ps(arrangedW1+31240+62464*i5+384*c1+24*k2, 15, wt25);
_mm512_mask_storeu_ps(arrangedW1+31600+62464*i5+384*c1+24*k2, 1008, wt25);
_mm512_mask_storeu_ps(arrangedW1+31960+62464*i5+384*c1+24*k2, 64512, wt25);
wt26 = _mm512_mul_ps(wt26, postMul1);
bias1 = _mm512_fmadd_ps(wt26, preAdd2, bias1);
wt26 = _mm512_mul_ps(wt26, preMul2);
_mm512_mask_storeu_ps(arrangedW1+35144+62464*i5+384*c1+24*k2, 15, wt26);
_mm512_mask_storeu_ps(arrangedW1+35504+62464*i5+384*c1+24*k2, 1008, wt26);
_mm512_mask_storeu_ps(arrangedW1+35864+62464*i5+384*c1+24*k2, 64512, wt26);
wt27 = _mm512_mul_ps(wt27, postMul1);
bias1 = _mm512_fmadd_ps(wt27, preAdd2, bias1);
wt27 = _mm512_mul_ps(wt27, preMul2);
_mm512_mask_storeu_ps(arrangedW1+39048+62464*i5+384*c1+24*k2, 15, wt27);
_mm512_mask_storeu_ps(arrangedW1+39408+62464*i5+384*c1+24*k2, 1008, wt27);
_mm512_mask_storeu_ps(arrangedW1+39768+62464*i5+384*c1+24*k2, 64512, wt27);
wt28 = _mm512_mul_ps(wt28, postMul1);
bias1 = _mm512_fmadd_ps(wt28, preAdd2, bias1);
wt28 = _mm512_mul_ps(wt28, preMul2);
_mm512_mask_storeu_ps(arrangedW1+42952+62464*i5+384*c1+24*k2, 15, wt28);
_mm512_mask_storeu_ps(arrangedW1+43312+62464*i5+384*c1+24*k2, 1008, wt28);
_mm512_mask_storeu_ps(arrangedW1+43672+62464*i5+384*c1+24*k2, 64512, wt28);
wt29 = _mm512_mul_ps(wt29, postMul1);
bias1 = _mm512_fmadd_ps(wt29, preAdd2, bias1);
wt29 = _mm512_mul_ps(wt29, preMul2);
_mm512_mask_storeu_ps(arrangedW1+46856+62464*i5+384*c1+24*k2, 15, wt29);
_mm512_mask_storeu_ps(arrangedW1+47216+62464*i5+384*c1+24*k2, 1008, wt29);
_mm512_mask_storeu_ps(arrangedW1+47576+62464*i5+384*c1+24*k2, 64512, wt29);
wt30 = _mm512_mul_ps(wt30, postMul1);
bias1 = _mm512_fmadd_ps(wt30, preAdd2, bias1);
wt30 = _mm512_mul_ps(wt30, preMul2);
_mm512_mask_storeu_ps(arrangedW1+50760+62464*i5+384*c1+24*k2, 15, wt30);
_mm512_mask_storeu_ps(arrangedW1+51120+62464*i5+384*c1+24*k2, 1008, wt30);
_mm512_mask_storeu_ps(arrangedW1+51480+62464*i5+384*c1+24*k2, 64512, wt30);
wt31 = _mm512_mul_ps(wt31, postMul1);
bias1 = _mm512_fmadd_ps(wt31, preAdd2, bias1);
wt31 = _mm512_mul_ps(wt31, preMul2);
_mm512_mask_storeu_ps(arrangedW1+54664+62464*i5+384*c1+24*k2, 15, wt31);
_mm512_mask_storeu_ps(arrangedW1+55024+62464*i5+384*c1+24*k2, 1008, wt31);
_mm512_mask_storeu_ps(arrangedW1+55384+62464*i5+384*c1+24*k2, 64512, wt31);
wt32 = _mm512_mul_ps(wt32, postMul1);
bias1 = _mm512_fmadd_ps(wt32, preAdd2, bias1);
wt32 = _mm512_mul_ps(wt32, preMul2);
_mm512_mask_storeu_ps(arrangedW1+58568+62464*i5+384*c1+24*k2, 15, wt32);
_mm512_mask_storeu_ps(arrangedW1+58928+62464*i5+384*c1+24*k2, 1008, wt32);
_mm512_mask_storeu_ps(arrangedW1+59288+62464*i5+384*c1+24*k2, 64512, wt32);
}
break;
}
default: {
ptrdiff_t k3 = 0;
for (; k3 != 16; ++k3) {
__m512 wt33 = _mm512_maskz_loadu_ps(65535, wtPtr1+0+62464*i5+16384*j1+64*k3);
__m512 wt34 = _mm512_maskz_loadu_ps(65535, wtPtr1+1024+62464*i5+16384*j1+64*k3);
__m512 wt35 = _mm512_maskz_loadu_ps(65535, wtPtr1+2048+62464*i5+16384*j1+64*k3);
__m512 wt36 = _mm512_maskz_loadu_ps(65535, wtPtr1+3072+62464*i5+16384*j1+64*k3);
__m512 wt37 = _mm512_maskz_loadu_ps(65535, wtPtr1+4096+62464*i5+16384*j1+64*k3);
__m512 wt38 = _mm512_maskz_loadu_ps(65535, wtPtr1+5120+62464*i5+16384*j1+64*k3);
__m512 wt39 = _mm512_maskz_loadu_ps(65535, wtPtr1+6144+62464*i5+16384*j1+64*k3);
__m512 wt40 = _mm512_maskz_loadu_ps(65535, wtPtr1+7168+62464*i5+16384*j1+64*k3);
__m512 wt41 = _mm512_maskz_loadu_ps(65535, wtPtr1+8192+62464*i5+16384*j1+64*k3);
__m512 wt42 = _mm512_maskz_loadu_ps(65535, wtPtr1+9216+62464*i5+16384*j1+64*k3);
__m512 wt43 = _mm512_maskz_loadu_ps(65535, wtPtr1+10240+62464*i5+16384*j1+64*k3);
__m512 wt44 = _mm512_maskz_loadu_ps(65535, wtPtr1+11264+62464*i5+16384*j1+64*k3);
__m512 wt45 = _mm512_maskz_loadu_ps(65535, wtPtr1+12288+62464*i5+16384*j1+64*k3);
__m512 wt46 = _mm512_maskz_loadu_ps(65535, wtPtr1+13312+62464*i5+16384*j1+64*k3);
__m512 wt47 = _mm512_maskz_loadu_ps(65535, wtPtr1+14336+62464*i5+16384*j1+64*k3);
__m512 wt48 = _mm512_maskz_loadu_ps(65535, wtPtr1+15360+62464*i5+16384*j1+64*k3);
__m512 tmp97 = _mm512_unpacklo_ps(wt33, wt34);
__m512 tmp98 = _mm512_unpackhi_ps(wt33, wt34);
__m512 tmp99 = _mm512_unpacklo_ps(wt35, wt36);
__m512 tmp100 = _mm512_unpackhi_ps(wt35, wt36);
__m512 tmp101 = _mm512_unpacklo_ps(wt37, wt38);
__m512 tmp102 = _mm512_unpackhi_ps(wt37, wt38);
__m512 tmp103 = _mm512_unpacklo_ps(wt39, wt40);
__m512 tmp104 = _mm512_unpackhi_ps(wt39, wt40);
__m512 tmp105 = _mm512_unpacklo_ps(wt41, wt42);
__m512 tmp106 = _mm512_unpackhi_ps(wt41, wt42);
__m512 tmp107 = _mm512_unpacklo_ps(wt43, wt44);
__m512 tmp108 = _mm512_unpackhi_ps(wt43, wt44);
__m512 tmp109 = _mm512_unpacklo_ps(wt45, wt46);
__m512 tmp110 = _mm512_unpackhi_ps(wt45, wt46);
__m512 tmp111 = _mm512_unpacklo_ps(wt47, wt48);
__m512 tmp112 = _mm512_unpackhi_ps(wt47, wt48);
__m512 tmp113 = _mm512_shuffle_ps(tmp97, tmp99, 68);
__m512 tmp114 = _mm512_shuffle_ps(tmp97, tmp99, 238);
__m512 tmp115 = _mm512_shuffle_ps(tmp98, tmp100, 68);
__m512 tmp116 = _mm512_shuffle_ps(tmp98, tmp100, 238);
__m512 tmp117 = _mm512_shuffle_ps(tmp101, tmp103, 68);
__m512 tmp118 = _mm512_shuffle_ps(tmp101, tmp103, 238);
__m512 tmp119 = _mm512_shuffle_ps(tmp102, tmp104, 68);
__m512 tmp120 = _mm512_shuffle_ps(tmp102, tmp104, 238);
__m512 tmp121 = _mm512_shuffle_ps(tmp105, tmp107, 68);
__m512 tmp122 = _mm512_shuffle_ps(tmp105, tmp107, 238);
__m512 tmp123 = _mm512_shuffle_ps(tmp106, tmp108, 68);
__m512 tmp124 = _mm512_shuffle_ps(tmp106, tmp108, 238);
__m512 tmp125 = _mm512_shuffle_ps(tmp109, tmp111, 68);
__m512 tmp126 = _mm512_shuffle_ps(tmp109, tmp111, 238);
__m512 tmp127 = _mm512_shuffle_ps(tmp110, tmp112, 68);
__m512 tmp128 = _mm512_shuffle_ps(tmp110, tmp112, 238);
__m512 tmp129 = _mm512_shuffle_f32x4(tmp113, tmp117, 136);
__m512 tmp130 = _mm512_shuffle_f32x4(tmp113, tmp117, 221);
__m512 tmp131 = _mm512_shuffle_f32x4(tmp114, tmp118, 136);
__m512 tmp132 = _mm512_shuffle_f32x4(tmp114, tmp118, 221);
__m512 tmp133 = _mm512_shuffle_f32x4(tmp115, tmp119, 136);
__m512 tmp134 = _mm512_shuffle_f32x4(tmp115, tmp119, 221);
__m512 tmp135 = _mm512_shuffle_f32x4(tmp116, tmp120, 136);
__m512 tmp136 = _mm512_shuffle_f32x4(tmp116, tmp120, 221);
__m512 tmp137 = _mm512_shuffle_f32x4(tmp121, tmp125, 136);
__m512 tmp138 = _mm512_shuffle_f32x4(tmp121, tmp125, 221);
__m512 tmp139 = _mm512_shuffle_f32x4(tmp122, tmp126, 136);
__m512 tmp140 = _mm512_shuffle_f32x4(tmp122, tmp126, 221);
__m512 tmp141 = _mm512_shuffle_f32x4(tmp123, tmp127, 136);
__m512 tmp142 = _mm512_shuffle_f32x4(tmp123, tmp127, 221);
__m512 tmp143 = _mm512_shuffle_f32x4(tmp124, tmp128, 136);
__m512 tmp144 = _mm512_shuffle_f32x4(tmp124, tmp128, 221);
wt33 = _mm512_shuffle_f32x4(tmp129, tmp137, 136);
wt41 = _mm512_shuffle_f32x4(tmp129, tmp137, 221);
wt34 = _mm512_shuffle_f32x4(tmp131, tmp139, 136);
wt42 = _mm512_shuffle_f32x4(tmp131, tmp139, 221);
wt35 = _mm512_shuffle_f32x4(tmp133, tmp141, 136);
wt43 = _mm512_shuffle_f32x4(tmp133, tmp141, 221);
wt36 = _mm512_shuffle_f32x4(tmp135, tmp143, 136);
wt44 = _mm512_shuffle_f32x4(tmp135, tmp143, 221);
wt37 = _mm512_shuffle_f32x4(tmp130, tmp138, 136);
wt45 = _mm512_shuffle_f32x4(tmp130, tmp138, 221);
wt38 = _mm512_shuffle_f32x4(tmp132, tmp140, 136);
wt46 = _mm512_shuffle_f32x4(tmp132, tmp140, 221);
wt39 = _mm512_shuffle_f32x4(tmp134, tmp142, 136);
wt47 = _mm512_shuffle_f32x4(tmp134, tmp142, 221);
wt40 = _mm512_shuffle_f32x4(tmp136, tmp144, 136);
wt48 = _mm512_shuffle_f32x4(tmp136, tmp144, 221);
wt33 = _mm512_mul_ps(wt33, postMul1);
__m512 preMul3 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+16*i5+1*k3))[0]);
__m512 preAdd3 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+16*i5+1*k3))[1]);
bias1 = _mm512_fmadd_ps(wt33, preAdd3, bias1);
wt33 = _mm512_mul_ps(wt33, preMul3);
_mm512_mask_storeu_ps(arrangedW1+16+62464*i5+384*c1+24*k3, 3, wt33);
_mm512_mask_storeu_ps(arrangedW1+376+62464*i5+384*c1+24*k3, 252, wt33);
_mm512_mask_storeu_ps(arrangedW1+736+62464*i5+384*c1+24*k3, 16128, wt33);
_mm512_mask_storeu_ps(arrangedW1+1096+62464*i5+384*c1+24*k3, 49152, wt33);
wt34 = _mm512_mul_ps(wt34, postMul1);
bias1 = _mm512_fmadd_ps(wt34, preAdd3, bias1);
wt34 = _mm512_mul_ps(wt34, preMul3);
_mm512_mask_storeu_ps(arrangedW1+3920+62464*i5+384*c1+24*k3, 3, wt34);
_mm512_mask_storeu_ps(arrangedW1+4280+62464*i5+384*c1+24*k3, 252, wt34);
_mm512_mask_storeu_ps(arrangedW1+4640+62464*i5+384*c1+24*k3, 16128, wt34);
_mm512_mask_storeu_ps(arrangedW1+5000+62464*i5+384*c1+24*k3, 49152, wt34);
wt35 = _mm512_mul_ps(wt35, postMul1);
bias1 = _mm512_fmadd_ps(wt35, preAdd3, bias1);
wt35 = _mm512_mul_ps(wt35, preMul3);
_mm512_mask_storeu_ps(arrangedW1+7824+62464*i5+384*c1+24*k3, 3, wt35);
_mm512_mask_storeu_ps(arrangedW1+8184+62464*i5+384*c1+24*k3, 252, wt35);
_mm512_mask_storeu_ps(arrangedW1+8544+62464*i5+384*c1+24*k3, 16128, wt35);
_mm512_mask_storeu_ps(arrangedW1+8904+62464*i5+384*c1+24*k3, 49152, wt35);
wt36 = _mm512_mul_ps(wt36, postMul1);
bias1 = _mm512_fmadd_ps(wt36, preAdd3, bias1);
wt36 = _mm512_mul_ps(wt36, preMul3);
_mm512_mask_storeu_ps(arrangedW1+11728+62464*i5+384*c1+24*k3, 3, wt36);
_mm512_mask_storeu_ps(arrangedW1+12088+62464*i5+384*c1+24*k3, 252, wt36);
_mm512_mask_storeu_ps(arrangedW1+12448+62464*i5+384*c1+24*k3, 16128, wt36);
_mm512_mask_storeu_ps(arrangedW1+12808+62464*i5+384*c1+24*k3, 49152, wt36);
wt37 = _mm512_mul_ps(wt37, postMul1);
bias1 = _mm512_fmadd_ps(wt37, preAdd3, bias1);
wt37 = _mm512_mul_ps(wt37, preMul3);
_mm512_mask_storeu_ps(arrangedW1+15632+62464*i5+384*c1+24*k3, 3, wt37);
_mm512_mask_storeu_ps(arrangedW1+15992+62464*i5+384*c1+24*k3, 252, wt37);
_mm512_mask_storeu_ps(arrangedW1+16352+62464*i5+384*c1+24*k3, 16128, wt37);
_mm512_mask_storeu_ps(arrangedW1+16712+62464*i5+384*c1+24*k3, 49152, wt37);
wt38 = _mm512_mul_ps(wt38, postMul1);
bias1 = _mm512_fmadd_ps(wt38, preAdd3, bias1);
wt38 = _mm512_mul_ps(wt38, preMul3);
_mm512_mask_storeu_ps(arrangedW1+19536+62464*i5+384*c1+24*k3, 3, wt38);
_mm512_mask_storeu_ps(arrangedW1+19896+62464*i5+384*c1+24*k3, 252, wt38);
_mm512_mask_storeu_ps(arrangedW1+20256+62464*i5+384*c1+24*k3, 16128, wt38);
_mm512_mask_storeu_ps(arrangedW1+20616+62464*i5+384*c1+24*k3, 49152, wt38);
wt39 = _mm512_mul_ps(wt39, postMul1);
bias1 = _mm512_fmadd_ps(wt39, preAdd3, bias1);
wt39 = _mm512_mul_ps(wt39, preMul3);
_mm512_mask_storeu_ps(arrangedW1+23440+62464*i5+384*c1+24*k3, 3, wt39);
_mm512_mask_storeu_ps(arrangedW1+23800+62464*i5+384*c1+24*k3, 252, wt39);
_mm512_mask_storeu_ps(arrangedW1+24160+62464*i5+384*c1+24*k3, 16128, wt39);
_mm512_mask_storeu_ps(arrangedW1+24520+62464*i5+384*c1+24*k3, 49152, wt39);
wt40 = _mm512_mul_ps(wt40, postMul1);
bias1 = _mm512_fmadd_ps(wt40, preAdd3, bias1);
wt40 = _mm512_mul_ps(wt40, preMul3);
_mm512_mask_storeu_ps(arrangedW1+27344+62464*i5+384*c1+24*k3, 3, wt40);
_mm512_mask_storeu_ps(arrangedW1+27704+62464*i5+384*c1+24*k3, 252, wt40);
_mm512_mask_storeu_ps(arrangedW1+28064+62464*i5+384*c1+24*k3, 16128, wt40);
_mm512_mask_storeu_ps(arrangedW1+28424+62464*i5+384*c1+24*k3, 49152, wt40);
wt41 = _mm512_mul_ps(wt41, postMul1);
bias1 = _mm512_fmadd_ps(wt41, preAdd3, bias1);
wt41 = _mm512_mul_ps(wt41, preMul3);
_mm512_mask_storeu_ps(arrangedW1+31248+62464*i5+384*c1+24*k3, 3, wt41);
_mm512_mask_storeu_ps(arrangedW1+31608+62464*i5+384*c1+24*k3, 252, wt41);
_mm512_mask_storeu_ps(arrangedW1+31968+62464*i5+384*c1+24*k3, 16128, wt41);
_mm512_mask_storeu_ps(arrangedW1+32328+62464*i5+384*c1+24*k3, 49152, wt41);
wt42 = _mm512_mul_ps(wt42, postMul1);
bias1 = _mm512_fmadd_ps(wt42, preAdd3, bias1);
wt42 = _mm512_mul_ps(wt42, preMul3);
_mm512_mask_storeu_ps(arrangedW1+35152+62464*i5+384*c1+24*k3, 3, wt42);
_mm512_mask_storeu_ps(arrangedW1+35512+62464*i5+384*c1+24*k3, 252, wt42);
_mm512_mask_storeu_ps(arrangedW1+35872+62464*i5+384*c1+24*k3, 16128, wt42);
_mm512_mask_storeu_ps(arrangedW1+36232+62464*i5+384*c1+24*k3, 49152, wt42);
wt43 = _mm512_mul_ps(wt43, postMul1);
bias1 = _mm512_fmadd_ps(wt43, preAdd3, bias1);
wt43 = _mm512_mul_ps(wt43, preMul3);
_mm512_mask_storeu_ps(arrangedW1+39056+62464*i5+384*c1+24*k3, 3, wt43);
_mm512_mask_storeu_ps(arrangedW1+39416+62464*i5+384*c1+24*k3, 252, wt43);
_mm512_mask_storeu_ps(arrangedW1+39776+62464*i5+384*c1+24*k3, 16128, wt43);
_mm512_mask_storeu_ps(arrangedW1+40136+62464*i5+384*c1+24*k3, 49152, wt43);
wt44 = _mm512_mul_ps(wt44, postMul1);
bias1 = _mm512_fmadd_ps(wt44, preAdd3, bias1);
wt44 = _mm512_mul_ps(wt44, preMul3);
_mm512_mask_storeu_ps(arrangedW1+42960+62464*i5+384*c1+24*k3, 3, wt44);
_mm512_mask_storeu_ps(arrangedW1+43320+62464*i5+384*c1+24*k3, 252, wt44);
_mm512_mask_storeu_ps(arrangedW1+43680+62464*i5+384*c1+24*k3, 16128, wt44);
_mm512_mask_storeu_ps(arrangedW1+44040+62464*i5+384*c1+24*k3, 49152, wt44);
wt45 = _mm512_mul_ps(wt45, postMul1);
bias1 = _mm512_fmadd_ps(wt45, preAdd3, bias1);
wt45 = _mm512_mul_ps(wt45, preMul3);
_mm512_mask_storeu_ps(arrangedW1+46864+62464*i5+384*c1+24*k3, 3, wt45);
_mm512_mask_storeu_ps(arrangedW1+47224+62464*i5+384*c1+24*k3, 252, wt45);
_mm512_mask_storeu_ps(arrangedW1+47584+62464*i5+384*c1+24*k3, 16128, wt45);
_mm512_mask_storeu_ps(arrangedW1+47944+62464*i5+384*c1+24*k3, 49152, wt45);
wt46 = _mm512_mul_ps(wt46, postMul1);
bias1 = _mm512_fmadd_ps(wt46, preAdd3, bias1);
wt46 = _mm512_mul_ps(wt46, preMul3);
_mm512_mask_storeu_ps(arrangedW1+50768+62464*i5+384*c1+24*k3, 3, wt46);
_mm512_mask_storeu_ps(arrangedW1+51128+62464*i5+384*c1+24*k3, 252, wt46);
_mm512_mask_storeu_ps(arrangedW1+51488+62464*i5+384*c1+24*k3, 16128, wt46);
_mm512_mask_storeu_ps(arrangedW1+51848+62464*i5+384*c1+24*k3, 49152, wt46);
wt47 = _mm512_mul_ps(wt47, postMul1);
bias1 = _mm512_fmadd_ps(wt47, preAdd3, bias1);
wt47 = _mm512_mul_ps(wt47, preMul3);
_mm512_mask_storeu_ps(arrangedW1+54672+62464*i5+384*c1+24*k3, 3, wt47);
_mm512_mask_storeu_ps(arrangedW1+55032+62464*i5+384*c1+24*k3, 252, wt47);
_mm512_mask_storeu_ps(arrangedW1+55392+62464*i5+384*c1+24*k3, 16128, wt47);
_mm512_mask_storeu_ps(arrangedW1+55752+62464*i5+384*c1+24*k3, 49152, wt47);
wt48 = _mm512_mul_ps(wt48, postMul1);
bias1 = _mm512_fmadd_ps(wt48, preAdd3, bias1);
wt48 = _mm512_mul_ps(wt48, preMul3);
_mm512_mask_storeu_ps(arrangedW1+58576+62464*i5+384*c1+24*k3, 3, wt48);
_mm512_mask_storeu_ps(arrangedW1+58936+62464*i5+384*c1+24*k3, 252, wt48);
_mm512_mask_storeu_ps(arrangedW1+59296+62464*i5+384*c1+24*k3, 16128, wt48);
_mm512_mask_storeu_ps(arrangedW1+59656+62464*i5+384*c1+24*k3, 49152, wt48);
}
break;
}
}
_mm512_mask_storeu_ps(arrangedB1-0+244*i5+64*j1, 65535, bias1);
if (j1 >= jj1) return;
}
}
if (j1 == 3) {
__m512i pmMul2 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd2 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo2 = _mm512_loadu_ps(bnPtr2+(ptrdiff_t)8*(0+61*i5+16*j1));
__m512 masHi2 = _mm512_maskz_loadu_ps(1023, bnPtr2+(ptrdiff_t)8*(0+61*i5+16*j1)+(ptrdiff_t)64);
__m512 postMul2 = _mm512_permutex2var_ps(masLo2, pmMul2, masHi2);
__m512 postAdd2 = _mm512_permutex2var_ps(masLo2, pmAdd2, masHi2);
__m512 bias2 = _mm512_setzero_ps();
if (!e1) {
bias2 = _mm512_maskz_loadu_ps(8191, biasPtr1-0+244*i5+64*j1);
bias2 = _mm512_fmadd_ps(postMul2, bias2, postAdd2);
}
ptrdiff_t c2 = (size_t)(0+16*j1)/6;
ptrdiff_t k4 = 0;
for (; k4 != 16; ++k4) {
__m512 wt49 = _mm512_maskz_loadu_ps(65535, wtPtr1+0+62464*i5+16384*j1+64*k4);
__m512 wt50 = _mm512_maskz_loadu_ps(65535, wtPtr1+1024+62464*i5+16384*j1+64*k4);
__m512 wt51 = _mm512_maskz_loadu_ps(65535, wtPtr1+2048+62464*i5+16384*j1+64*k4);
__m512 wt52 = _mm512_maskz_loadu_ps(65535, wtPtr1+3072+62464*i5+16384*j1+64*k4);
__m512 wt53 = _mm512_maskz_loadu_ps(65535, wtPtr1+4096+62464*i5+16384*j1+64*k4);
__m512 wt54 = _mm512_maskz_loadu_ps(65535, wtPtr1+5120+62464*i5+16384*j1+64*k4);
__m512 wt55 = _mm512_maskz_loadu_ps(65535, wtPtr1+6144+62464*i5+16384*j1+64*k4);
__m512 wt56 = _mm512_maskz_loadu_ps(65535, wtPtr1+7168+62464*i5+16384*j1+64*k4);
__m512 wt57 = _mm512_maskz_loadu_ps(65535, wtPtr1+8192+62464*i5+16384*j1+64*k4);
__m512 wt58 = _mm512_maskz_loadu_ps(65535, wtPtr1+9216+62464*i5+16384*j1+64*k4);
__m512 wt59 = _mm512_maskz_loadu_ps(65535, wtPtr1+10240+62464*i5+16384*j1+64*k4);
__m512 wt60 = _mm512_maskz_loadu_ps(65535, wtPtr1+11264+62464*i5+16384*j1+64*k4);
__m512 wt61 = _mm512_maskz_loadu_ps(65535, wtPtr1+12288+62464*i5+16384*j1+64*k4);
__m512 tmp145 = _mm512_unpacklo_ps(wt49, wt50);
__m512 tmp146 = _mm512_unpackhi_ps(wt49, wt50);
__m512 tmp147 = _mm512_unpacklo_ps(wt51, wt52);
__m512 tmp148 = _mm512_unpackhi_ps(wt51, wt52);
__m512 tmp149 = _mm512_unpacklo_ps(wt53, wt54);
__m512 tmp150 = _mm512_unpackhi_ps(wt53, wt54);
__m512 tmp151 = _mm512_unpacklo_ps(wt55, wt56);
__m512 tmp152 = _mm512_unpackhi_ps(wt55, wt56);
__m512 tmp153 = _mm512_unpacklo_ps(wt57, wt58);
__m512 tmp154 = _mm512_unpackhi_ps(wt57, wt58);
__m512 tmp155 = _mm512_unpacklo_ps(wt59, wt60);
__m512 tmp156 = _mm512_unpackhi_ps(wt59, wt60);
__m512 tmp157 = _mm512_unpacklo_ps(wt61, wt61);
__m512 tmp158 = _mm512_unpackhi_ps(wt61, wt61);
__m512 tmp159 = _mm512_shuffle_ps(tmp145, tmp147, 68);
__m512 tmp160 = _mm512_shuffle_ps(tmp145, tmp147, 238);
__m512 tmp161 = _mm512_shuffle_ps(tmp146, tmp148, 68);
__m512 tmp162 = _mm512_shuffle_ps(tmp146, tmp148, 238);
__m512 tmp163 = _mm512_shuffle_ps(tmp149, tmp151, 68);
__m512 tmp164 = _mm512_shuffle_ps(tmp149, tmp151, 238);
__m512 tmp165 = _mm512_shuffle_ps(tmp150, tmp152, 68);
__m512 tmp166 = _mm512_shuffle_ps(tmp150, tmp152, 238);
__m512 tmp167 = _mm512_shuffle_ps(tmp153, tmp155, 68);
__m512 tmp168 = _mm512_shuffle_ps(tmp153, tmp155, 238);
__m512 tmp169 = _mm512_shuffle_ps(tmp154, tmp156, 68);
__m512 tmp170 = _mm512_shuffle_ps(tmp154, tmp156, 238);
__m512 tmp171 = _mm512_shuffle_ps(tmp157, tmp157, 238);
__m512 tmp172 = _mm512_shuffle_ps(tmp158, tmp158, 238);
__m512 tmp173 = _mm512_shuffle_f32x4(tmp159, tmp163, 136);
__m512 tmp174 = _mm512_shuffle_f32x4(tmp159, tmp163, 221);
__m512 tmp175 = _mm512_shuffle_f32x4(tmp160, tmp164, 136);
__m512 tmp176 = _mm512_shuffle_f32x4(tmp160, tmp164, 221);
__m512 tmp177 = _mm512_shuffle_f32x4(tmp161, tmp165, 136);
__m512 tmp178 = _mm512_shuffle_f32x4(tmp161, tmp165, 221);
__m512 tmp179 = _mm512_shuffle_f32x4(tmp162, tmp166, 136);
__m512 tmp180 = _mm512_shuffle_f32x4(tmp162, tmp166, 221);
__m512 tmp181 = _mm512_shuffle_f32x4(tmp167, tmp157, 136);
__m512 tmp182 = _mm512_shuffle_f32x4(tmp167, tmp157, 221);
__m512 tmp183 = _mm512_shuffle_f32x4(tmp168, tmp171, 136);
__m512 tmp184 = _mm512_shuffle_f32x4(tmp168, tmp171, 221);
__m512 tmp185 = _mm512_shuffle_f32x4(tmp169, tmp158, 136);
__m512 tmp186 = _mm512_shuffle_f32x4(tmp169, tmp158, 221);
__m512 tmp187 = _mm512_shuffle_f32x4(tmp170, tmp172, 136);
__m512 tmp188 = _mm512_shuffle_f32x4(tmp170, tmp172, 221);
wt49 = _mm512_shuffle_f32x4(tmp173, tmp181, 136);
wt57 = _mm512_shuffle_f32x4(tmp173, tmp181, 221);
wt50 = _mm512_shuffle_f32x4(tmp175, tmp183, 136);
wt58 = _mm512_shuffle_f32x4(tmp175, tmp183, 221);
wt51 = _mm512_shuffle_f32x4(tmp177, tmp185, 136);
wt59 = _mm512_shuffle_f32x4(tmp177, tmp185, 221);
wt52 = _mm512_shuffle_f32x4(tmp179, tmp187, 136);
wt60 = _mm512_shuffle_f32x4(tmp179, tmp187, 221);
wt53 = _mm512_shuffle_f32x4(tmp174, tmp182, 136);
wt61 = _mm512_shuffle_f32x4(tmp174, tmp182, 221);
wt54 = _mm512_shuffle_f32x4(tmp176, tmp184, 136);
__m512 wt62 = _mm512_shuffle_f32x4(tmp176, tmp184, 221);
wt55 = _mm512_shuffle_f32x4(tmp178, tmp186, 136);
__m512 wt63 = _mm512_shuffle_f32x4(tmp178, tmp186, 221);
wt56 = _mm512_shuffle_f32x4(tmp180, tmp188, 136);
__m512 wt64 = _mm512_shuffle_f32x4(tmp180, tmp188, 221);
wt49 = _mm512_mul_ps(wt49, postMul2);
__m512 preMul4 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+16*i5+1*k4))[0]);
__m512 preAdd4 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+16*i5+1*k4))[1]);
bias2 = _mm512_fmadd_ps(wt49, preAdd4, bias2);
wt49 = _mm512_mul_ps(wt49, preMul4);
_mm512_mask_storeu_ps(arrangedW1+0+62464*i5+384*c2+24*k4, 63, wt49);
_mm512_mask_storeu_ps(arrangedW1+360+62464*i5+384*c2+24*k4, 4032, wt49);
_mm512_mask_storeu_ps(arrangedW1+720+62464*i5+384*c2+4*k4, 4096, wt49);
wt50 = _mm512_mul_ps(wt50, postMul2);
bias2 = _mm512_fmadd_ps(wt50, preAdd4, bias2);
wt50 = _mm512_mul_ps(wt50, preMul4);
_mm512_mask_storeu_ps(arrangedW1+3904+62464*i5+384*c2+24*k4, 63, wt50);
_mm512_mask_storeu_ps(arrangedW1+4264+62464*i5+384*c2+24*k4, 4032, wt50);
_mm512_mask_storeu_ps(arrangedW1+4624+62464*i5+384*c2+4*k4, 4096, wt50);
wt51 = _mm512_mul_ps(wt51, postMul2);
bias2 = _mm512_fmadd_ps(wt51, preAdd4, bias2);
wt51 = _mm512_mul_ps(wt51, preMul4);
_mm512_mask_storeu_ps(arrangedW1+7808+62464*i5+384*c2+24*k4, 63, wt51);
_mm512_mask_storeu_ps(arrangedW1+8168+62464*i5+384*c2+24*k4, 4032, wt51);
_mm512_mask_storeu_ps(arrangedW1+8528+62464*i5+384*c2+4*k4, 4096, wt51);
wt52 = _mm512_mul_ps(wt52, postMul2);
bias2 = _mm512_fmadd_ps(wt52, preAdd4, bias2);
wt52 = _mm512_mul_ps(wt52, preMul4);
_mm512_mask_storeu_ps(arrangedW1+11712+62464*i5+384*c2+24*k4, 63, wt52);
_mm512_mask_storeu_ps(arrangedW1+12072+62464*i5+384*c2+24*k4, 4032, wt52);
_mm512_mask_storeu_ps(arrangedW1+12432+62464*i5+384*c2+4*k4, 4096, wt52);
wt53 = _mm512_mul_ps(wt53, postMul2);
bias2 = _mm512_fmadd_ps(wt53, preAdd4, bias2);
wt53 = _mm512_mul_ps(wt53, preMul4);
_mm512_mask_storeu_ps(arrangedW1+15616+62464*i5+384*c2+24*k4, 63, wt53);
_mm512_mask_storeu_ps(arrangedW1+15976+62464*i5+384*c2+24*k4, 4032, wt53);
_mm512_mask_storeu_ps(arrangedW1+16336+62464*i5+384*c2+4*k4, 4096, wt53);
wt54 = _mm512_mul_ps(wt54, postMul2);
bias2 = _mm512_fmadd_ps(wt54, preAdd4, bias2);
wt54 = _mm512_mul_ps(wt54, preMul4);
_mm512_mask_storeu_ps(arrangedW1+19520+62464*i5+384*c2+24*k4, 63, wt54);
_mm512_mask_storeu_ps(arrangedW1+19880+62464*i5+384*c2+24*k4, 4032, wt54);
_mm512_mask_storeu_ps(arrangedW1+20240+62464*i5+384*c2+4*k4, 4096, wt54);
wt55 = _mm512_mul_ps(wt55, postMul2);
bias2 = _mm512_fmadd_ps(wt55, preAdd4, bias2);
wt55 = _mm512_mul_ps(wt55, preMul4);
_mm512_mask_storeu_ps(arrangedW1+23424+62464*i5+384*c2+24*k4, 63, wt55);
_mm512_mask_storeu_ps(arrangedW1+23784+62464*i5+384*c2+24*k4, 4032, wt55);
_mm512_mask_storeu_ps(arrangedW1+24144+62464*i5+384*c2+4*k4, 4096, wt55);
wt56 = _mm512_mul_ps(wt56, postMul2);
bias2 = _mm512_fmadd_ps(wt56, preAdd4, bias2);
wt56 = _mm512_mul_ps(wt56, preMul4);
_mm512_mask_storeu_ps(arrangedW1+27328+62464*i5+384*c2+24*k4, 63, wt56);
_mm512_mask_storeu_ps(arrangedW1+27688+62464*i5+384*c2+24*k4, 4032, wt56);
_mm512_mask_storeu_ps(arrangedW1+28048+62464*i5+384*c2+4*k4, 4096, wt56);
wt57 = _mm512_mul_ps(wt57, postMul2);
bias2 = _mm512_fmadd_ps(wt57, preAdd4, bias2);
wt57 = _mm512_mul_ps(wt57, preMul4);
_mm512_mask_storeu_ps(arrangedW1+31232+62464*i5+384*c2+24*k4, 63, wt57);
_mm512_mask_storeu_ps(arrangedW1+31592+62464*i5+384*c2+24*k4, 4032, wt57);
_mm512_mask_storeu_ps(arrangedW1+31952+62464*i5+384*c2+4*k4, 4096, wt57);
wt58 = _mm512_mul_ps(wt58, postMul2);
bias2 = _mm512_fmadd_ps(wt58, preAdd4, bias2);
wt58 = _mm512_mul_ps(wt58, preMul4);
_mm512_mask_storeu_ps(arrangedW1+35136+62464*i5+384*c2+24*k4, 63, wt58);
_mm512_mask_storeu_ps(arrangedW1+35496+62464*i5+384*c2+24*k4, 4032, wt58);
_mm512_mask_storeu_ps(arrangedW1+35856+62464*i5+384*c2+4*k4, 4096, wt58);
wt59 = _mm512_mul_ps(wt59, postMul2);
bias2 = _mm512_fmadd_ps(wt59, preAdd4, bias2);
wt59 = _mm512_mul_ps(wt59, preMul4);
_mm512_mask_storeu_ps(arrangedW1+39040+62464*i5+384*c2+24*k4, 63, wt59);
_mm512_mask_storeu_ps(arrangedW1+39400+62464*i5+384*c2+24*k4, 4032, wt59);
_mm512_mask_storeu_ps(arrangedW1+39760+62464*i5+384*c2+4*k4, 4096, wt59);
wt60 = _mm512_mul_ps(wt60, postMul2);
bias2 = _mm512_fmadd_ps(wt60, preAdd4, bias2);
wt60 = _mm512_mul_ps(wt60, preMul4);
_mm512_mask_storeu_ps(arrangedW1+42944+62464*i5+384*c2+24*k4, 63, wt60);
_mm512_mask_storeu_ps(arrangedW1+43304+62464*i5+384*c2+24*k4, 4032, wt60);
_mm512_mask_storeu_ps(arrangedW1+43664+62464*i5+384*c2+4*k4, 4096, wt60);
wt61 = _mm512_mul_ps(wt61, postMul2);
bias2 = _mm512_fmadd_ps(wt61, preAdd4, bias2);
wt61 = _mm512_mul_ps(wt61, preMul4);
_mm512_mask_storeu_ps(arrangedW1+46848+62464*i5+384*c2+24*k4, 63, wt61);
_mm512_mask_storeu_ps(arrangedW1+47208+62464*i5+384*c2+24*k4, 4032, wt61);
_mm512_mask_storeu_ps(arrangedW1+47568+62464*i5+384*c2+4*k4, 4096, wt61);
wt62 = _mm512_mul_ps(wt62, postMul2);
bias2 = _mm512_fmadd_ps(wt62, preAdd4, bias2);
wt62 = _mm512_mul_ps(wt62, preMul4);
_mm512_mask_storeu_ps(arrangedW1+50752+62464*i5+384*c2+24*k4, 63, wt62);
_mm512_mask_storeu_ps(arrangedW1+51112+62464*i5+384*c2+24*k4, 4032, wt62);
_mm512_mask_storeu_ps(arrangedW1+51472+62464*i5+384*c2+4*k4, 4096, wt62);
wt63 = _mm512_mul_ps(wt63, postMul2);
bias2 = _mm512_fmadd_ps(wt63, preAdd4, bias2);
wt63 = _mm512_mul_ps(wt63, preMul4);
_mm512_mask_storeu_ps(arrangedW1+54656+62464*i5+384*c2+24*k4, 63, wt63);
_mm512_mask_storeu_ps(arrangedW1+55016+62464*i5+384*c2+24*k4, 4032, wt63);
_mm512_mask_storeu_ps(arrangedW1+55376+62464*i5+384*c2+4*k4, 4096, wt63);
wt64 = _mm512_mul_ps(wt64, postMul2);
bias2 = _mm512_fmadd_ps(wt64, preAdd4, bias2);
wt64 = _mm512_mul_ps(wt64, preMul4);
_mm512_mask_storeu_ps(arrangedW1+58560+62464*i5+384*c2+24*k4, 63, wt64);
_mm512_mask_storeu_ps(arrangedW1+58920+62464*i5+384*c2+24*k4, 4032, wt64);
_mm512_mask_storeu_ps(arrangedW1+59280+62464*i5+384*c2+4*k4, 4096, wt64);
}
_mm512_mask_storeu_ps(arrangedB1-0+244*i5+64*j1, 8191, bias2);
if (j1 >= jj1) return;
j1 = 4;
}
}

// Builds a task descriptor that runs Example30LoomArrangeFilts1Callee1 and
// hands it to Example30ThreaderDo1 on the given team. tensors1 is passed
// through untouched in the task's any1 slot. The task declares 3 dimensions
// with hull entries 2, 1, 1.
// NOTE(review): hull1 presumably holds the per-dimension extent of the task
// index space -- confirm against Example30ThreaderDo1.
static void Example30LoomArrangeFilts1(Example30ThreaderTeam1* team13, char** tensors1) {
	Example30ThreaderTask1 job;
	job.nd1 = 3;
	job.hull1[2] = 1;
	job.hull1[1] = 1;
	job.hull1[0] = 2;
	job.any1 = tensors1;
	job.callee1 = Example30LoomArrangeFilts1Callee1;
	Example30ThreaderDo1(team13, &job);
}

// Task body that rearranges the convolution input data.
//
// task6->any1 is read as an array of four byte pointers:
//   [0] first data tensor   (scaled, offset, then clamped at zero)
//   [1] table of float pairs {multiplier, addend}, one pair per channel
//   [2] second data tensor  (added after the clamp)
//   [3] destination buffer
// pt8 is ignored: every index this function uses is a constant.
//
// Both data tensors are addressed with a 60-byte row stride and a 420-byte
// channel stride, and only the low 15 float lanes of each row are loaded
// (lane 15 stays zero). For each of 16 channels, rows 0..3 go to the first
// 4096-byte part of the destination and rows 4..6 to the second part; every
// channel occupies four 64-byte slots per part, so the second part's unused
// fourth slot is written with zeros.
static void Example30LoomArrangeDats1Callee1(Example30ThreaderTask1* task6, int64_t* pt8) {
	enum { kRowLanes = 32767, kAllLanes = 65535 };
	char** bufs = task6->any1;
	(void)pt8;
	char*restrict srcA = bufs[0];
	char*restrict bnTab = bufs[1];
	char*restrict srcB = bufs[2];
	char*restrict dst = bufs[3];
	for (ptrdiff_t part = 0; part < 2; ++part) {
		// Part 0 starts at row 0 and has four live rows; part 1 starts at row 4 and has three.
		const ptrdiff_t firstRow = 4*part;
		const ptrdiff_t liveRows = part ? 3 : 4;
		for (ptrdiff_t ch = 0; ch < 16; ++ch) {
			const float* pair = (float*)bnTab+(ptrdiff_t)2*ch;
			const __m512 mul = _mm512_set1_ps(pair[0]);
			const __m512 add = _mm512_set1_ps(pair[1]);
			char* out = dst+4096*part+256*ch;
			for (ptrdiff_t r = 0; r < liveRows; ++r) {
				const ptrdiff_t off = 420*ch+60*(firstRow+r);
				__m512 v = _mm512_maskz_loadu_ps(kRowLanes, srcA+off);
				// The masked fmadd leaves lane 15 at its loaded value of zero.
				v = _mm512_mask_fmadd_ps(v, kRowLanes, mul, add);
				// Keep zero as the first operand, as in the generated code.
				v = _mm512_max_ps(_mm512_setzero_ps(), v);
				v = _mm512_add_ps(v, _mm512_maskz_loadu_ps(kRowLanes, srcB+off));
				_mm512_mask_storeu_ps(out+64*r, kAllLanes, v);
			}
			if (liveRows < 4) {
				// No fourth row in this part: fill its slot with zeros.
				_mm512_mask_storeu_ps(out+192, kAllLanes, _mm512_setzero_ps());
			}
		}
	}
}

// Builds a task descriptor that runs Example30LoomArrangeDats1Callee1 and
// hands it to Example30ThreaderDo1 on the given team. tensors3 is passed
// through untouched in the task's any1 slot. The task declares 4 dimensions
// and every hull entry is 1.
// NOTE(review): hull1 presumably holds the per-dimension extent of the task
// index space -- confirm against Example30ThreaderDo1.
static void Example30LoomArrangeDats1(Example30ThreaderTeam1* team15, char** tensors3) {
	Example30ThreaderTask1 job;
	job.nd1 = 4;
	for (int dim = 0; dim < 4; ++dim) {
		job.hull1[dim] = 1;
	}
	job.any1 = tensors3;
	job.callee1 = Example30LoomArrangeDats1Callee1;
	Example30ThreaderDo1(team15, &job);
}

// Three-entry lookup table for the LoomProduceSums1 stage.
// NOTE(review): no use of this table is visible nearby; the meaning of the
// entries (0, 4, 16) should be confirmed against its reader.
static ptrdiff_t Example30LoomProduceSums1FieldTbl1[] = { 0, 4, 16 };

// Per-node lookup table for the LoomProduceSums1 stage: 16 nodes, three
// entries each, stored flat. Example30LoomProduceSums1Callee1 reads entry
// 3*node+0 as "lift", 3*node+1 as "pile" and 3*node+2 as "base".
// Only the first four nodes (lift 0) have base set to 1.
static ptrdiff_t Example30LoomProduceSums1NodeTbl1[] = {
	/* node  0: lift, pile, base */ 0, 0, 1,
	/* node  1 */ 0, 1, 1,
	/* node  2 */ 0, 2, 1,
	/* node  3 */ 0, 3, 1,
	/* node  4 */ 1, 0, 0,
	/* node  5 */ 1, 1, 0,
	/* node  6 */ 1, 2, 0,
	/* node  7 */ 1, 3, 0,
	/* node  8 */ 2, 0, 0,
	/* node  9 */ 2, 1, 0,
	/* node 10 */ 2, 2, 0,
	/* node 11 */ 2, 3, 0,
	/* node 12 */ 3, 0, 0,
	/* node 13 */ 3, 1, 0,
	/* node 14 */ 3, 2, 0,
	/* node 15 */ 3, 3, 0
};

static void Example30LoomProduceSums1Callee1(Example30ThreaderTask1* task8, int64_t* pt9) {
void** tuple2 = task8->any1;
char** tensors6 = tuple2[0];
ptrdiff_t epoch1 = 0;
ptrdiff_t field1 = 0;
ptrdiff_t nodeFirst1 = (ptrdiff_t)tuple2[3];
ptrdiff_t group1 = 0;
ptrdiff_t to2 = pt9[2];
ptrdiff_t nodeOff1 = pt9[1];
ptrdiff_t w3 = 0;
ptrdiff_t node6 = nodeFirst1+nodeOff1;
ptrdiff_t lift1 = Example30LoomProduceSums1NodeTbl1[0+3*node6];
ptrdiff_t pile1 = Example30LoomProduceSums1NodeTbl1[1+3*node6];
ptrdiff_t base1 = Example30LoomProduceSums1NodeTbl1[2+3*node6];
ptrdiff_t from1 = to2+(size_t)lift1/4*1;
if (from1 >= 2) return;
char*restrict biasPtr2 = tensors6[0]+244*epoch1+244*group1;
char*restrict wtPtr2 = tensors6[0]+244+3259840*epoch1+62464*group1+3904*node6;
char*restrict datPtr3 = tensors6[1]+427520*epoch1+8192*field1+8192*group1+4096*from1;
char*restrict sumPtr1 = tensors6[2]+124928*group1+62464*to2+15616*pile1;
switch ((size_t)lift1%4*2+(to2 >= 1)) {
default: {
if (node6) {
if (!epoch1 && base1) {
ptrdiff_t i7 = 11*w3;
for (; i7 != 10; ++i7) {
__m512 sum2 = _mm512_setzero_ps();
__m512 sum6 = _mm512_setzero_ps();
__m512 sum10 = _mm512_setzero_ps();
__m512 sum14 = _mm512_setzero_ps();
__m512 sum18 = _mm512_setzero_ps();
__m512 sum22 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum3 = sum2;
__m512 sum4 = sum2;
__m512 sum5 = sum2;
__m512 sum7 = sum6;
__m512 sum8 = sum6;
__m512 sum9 = sum6;
__m512 sum11 = sum10;
__m512 sum12 = sum10;
__m512 sum13 = sum10;
__m512 sum15 = sum14;
__m512 sum16 = sum14;
__m512 sum17 = sum14;
__m512 sum19 = sum18;
__m512 sum20 = sum18;
__m512 sum21 = sum18;
__m512 sum23 = sum22;
__m512 sum24 = sum22;
__m512 sum25 = sum22;
for (ptrdiff_t j3 = 0; j3 < 16; ++j3) {
__m512 dat8 = _mm512_loadu_ps(datPtr3+0+256*j3);
__m512 dat9 = _mm512_loadu_ps(datPtr3+64+256*j3);
__m512 dat10 = _mm512_loadu_ps(datPtr3+128+256*j3);
__m512 dat11 = _mm512_loadu_ps(datPtr3+192+256*j3);
__m512 wt65 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i7+24*j3));
sum2 = _mm512_fmadd_ps(wt65, dat8, sum2);
sum3 = _mm512_fmadd_ps(wt65, dat9, sum3);
sum4 = _mm512_fmadd_ps(wt65, dat10, sum4);
sum5 = _mm512_fmadd_ps(wt65, dat11, sum5);
__m512 wt66 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i7+24*j3));
sum6 = _mm512_fmadd_ps(wt66, dat8, sum6);
sum7 = _mm512_fmadd_ps(wt66, dat9, sum7);
sum8 = _mm512_fmadd_ps(wt66, dat10, sum8);
sum9 = _mm512_fmadd_ps(wt66, dat11, sum9);
__m512 wt67 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i7+24*j3));
sum10 = _mm512_fmadd_ps(wt67, dat8, sum10);
sum11 = _mm512_fmadd_ps(wt67, dat9, sum11);
sum12 = _mm512_fmadd_ps(wt67, dat10, sum12);
sum13 = _mm512_fmadd_ps(wt67, dat11, sum13);
__m512 wt68 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i7+24*j3));
sum14 = _mm512_fmadd_ps(wt68, dat8, sum14);
sum15 = _mm512_fmadd_ps(wt68, dat9, sum15);
sum16 = _mm512_fmadd_ps(wt68, dat10, sum16);
sum17 = _mm512_fmadd_ps(wt68, dat11, sum17);
__m512 wt69 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i7+24*j3));
sum18 = _mm512_fmadd_ps(wt69, dat8, sum18);
sum19 = _mm512_fmadd_ps(wt69, dat9, sum19);
sum20 = _mm512_fmadd_ps(wt69, dat10, sum20);
sum21 = _mm512_fmadd_ps(wt69, dat11, sum21);
__m512 wt70 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i7+24*j3));
sum22 = _mm512_fmadd_ps(wt70, dat8, sum22);
sum23 = _mm512_fmadd_ps(wt70, dat9, sum23);
sum24 = _mm512_fmadd_ps(wt70, dat10, sum24);
sum25 = _mm512_fmadd_ps(wt70, dat11, sum25);
}
_mm512_storeu_ps(sumPtr1+0+1536*i7, sum2);
_mm512_storeu_ps(sumPtr1+64+1536*i7, sum3);
_mm512_storeu_ps(sumPtr1+128+1536*i7, sum4);
_mm512_storeu_ps(sumPtr1+192+1536*i7, sum5);
_mm512_storeu_ps(sumPtr1+256+1536*i7, sum6);
_mm512_storeu_ps(sumPtr1+320+1536*i7, sum7);
_mm512_storeu_ps(sumPtr1+384+1536*i7, sum8);
_mm512_storeu_ps(sumPtr1+448+1536*i7, sum9);
_mm512_storeu_ps(sumPtr1+512+1536*i7, sum10);
_mm512_storeu_ps(sumPtr1+576+1536*i7, sum11);
_mm512_storeu_ps(sumPtr1+640+1536*i7, sum12);
_mm512_storeu_ps(sumPtr1+704+1536*i7, sum13);
_mm512_storeu_ps(sumPtr1+768+1536*i7, sum14);
_mm512_storeu_ps(sumPtr1+832+1536*i7, sum15);
_mm512_storeu_ps(sumPtr1+896+1536*i7, sum16);
_mm512_storeu_ps(sumPtr1+960+1536*i7, sum17);
_mm512_storeu_ps(sumPtr1+1024+1536*i7, sum18);
_mm512_storeu_ps(sumPtr1+1088+1536*i7, sum19);
_mm512_storeu_ps(sumPtr1+1152+1536*i7, sum20);
_mm512_storeu_ps(sumPtr1+1216+1536*i7, sum21);
_mm512_storeu_ps(sumPtr1+1280+1536*i7, sum22);
_mm512_storeu_ps(sumPtr1+1344+1536*i7, sum23);
_mm512_storeu_ps(sumPtr1+1408+1536*i7, sum24);
_mm512_storeu_ps(sumPtr1+1472+1536*i7, sum25);
}
__m512 sum26 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum27 = sum26;
__m512 sum28 = sum26;
__m512 sum29 = sum26;
for (ptrdiff_t j4 = 0; j4 < 16; ++j4) {
__m512 dat12 = _mm512_loadu_ps(datPtr3+0+256*j4);
__m512 dat13 = _mm512_loadu_ps(datPtr3+64+256*j4);
__m512 dat14 = _mm512_loadu_ps(datPtr3+128+256*j4);
__m512 dat15 = _mm512_loadu_ps(datPtr3+192+256*j4);
__m512 wt71 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i7+4*j4));
sum26 = _mm512_fmadd_ps(wt71, dat12, sum26);
sum27 = _mm512_fmadd_ps(wt71, dat13, sum27);
sum28 = _mm512_fmadd_ps(wt71, dat14, sum28);
sum29 = _mm512_fmadd_ps(wt71, dat15, sum29);
}
_mm512_storeu_ps(sumPtr1+0+1536*i7, sum26);
_mm512_storeu_ps(sumPtr1+64+1536*i7, sum27);
_mm512_storeu_ps(sumPtr1+128+1536*i7, sum28);
_mm512_storeu_ps(sumPtr1+192+1536*i7, sum29);
return;
}
ptrdiff_t i8 = 11*w3;
for (; i8 != 10; ++i8) {
__m512 sum30 = _mm512_setzero_ps();
__m512 sum34 = _mm512_setzero_ps();
__m512 sum38 = _mm512_setzero_ps();
__m512 sum42 = _mm512_setzero_ps();
__m512 sum46 = _mm512_setzero_ps();
__m512 sum50 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum31 = sum30;
__m512 sum32 = sum30;
__m512 sum33 = sum30;
__m512 sum35 = sum34;
__m512 sum36 = sum34;
__m512 sum37 = sum34;
__m512 sum39 = sum38;
__m512 sum40 = sum38;
__m512 sum41 = sum38;
__m512 sum43 = sum42;
__m512 sum44 = sum42;
__m512 sum45 = sum42;
__m512 sum47 = sum46;
__m512 sum48 = sum46;
__m512 sum49 = sum46;
__m512 sum51 = sum50;
__m512 sum52 = sum50;
__m512 sum53 = sum50;
for (ptrdiff_t j5 = 0; j5 < 16; ++j5) {
__m512 dat16 = _mm512_loadu_ps(datPtr3+0+256*j5);
__m512 dat17 = _mm512_loadu_ps(datPtr3+64+256*j5);
__m512 dat18 = _mm512_loadu_ps(datPtr3+128+256*j5);
__m512 dat19 = _mm512_loadu_ps(datPtr3+192+256*j5);
__m512 wt72 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i8+24*j5));
sum30 = _mm512_fmadd_ps(wt72, dat16, sum30);
sum31 = _mm512_fmadd_ps(wt72, dat17, sum31);
sum32 = _mm512_fmadd_ps(wt72, dat18, sum32);
sum33 = _mm512_fmadd_ps(wt72, dat19, sum33);
__m512 wt73 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i8+24*j5));
sum34 = _mm512_fmadd_ps(wt73, dat16, sum34);
sum35 = _mm512_fmadd_ps(wt73, dat17, sum35);
sum36 = _mm512_fmadd_ps(wt73, dat18, sum36);
sum37 = _mm512_fmadd_ps(wt73, dat19, sum37);
__m512 wt74 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i8+24*j5));
sum38 = _mm512_fmadd_ps(wt74, dat16, sum38);
sum39 = _mm512_fmadd_ps(wt74, dat17, sum39);
sum40 = _mm512_fmadd_ps(wt74, dat18, sum40);
sum41 = _mm512_fmadd_ps(wt74, dat19, sum41);
__m512 wt75 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i8+24*j5));
sum42 = _mm512_fmadd_ps(wt75, dat16, sum42);
sum43 = _mm512_fmadd_ps(wt75, dat17, sum43);
sum44 = _mm512_fmadd_ps(wt75, dat18, sum44);
sum45 = _mm512_fmadd_ps(wt75, dat19, sum45);
__m512 wt76 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i8+24*j5));
sum46 = _mm512_fmadd_ps(wt76, dat16, sum46);
sum47 = _mm512_fmadd_ps(wt76, dat17, sum47);
sum48 = _mm512_fmadd_ps(wt76, dat18, sum48);
sum49 = _mm512_fmadd_ps(wt76, dat19, sum49);
__m512 wt77 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i8+24*j5));
sum50 = _mm512_fmadd_ps(wt77, dat16, sum50);
sum51 = _mm512_fmadd_ps(wt77, dat17, sum51);
sum52 = _mm512_fmadd_ps(wt77, dat18, sum52);
sum53 = _mm512_fmadd_ps(wt77, dat19, sum53);
}
_mm512_storeu_ps(sumPtr1+0+1536*i8, _mm512_add_ps(sum30, _mm512_loadu_ps(sumPtr1+0+1536*i8)));
_mm512_storeu_ps(sumPtr1+64+1536*i8, _mm512_add_ps(sum31, _mm512_loadu_ps(sumPtr1+64+1536*i8)));
_mm512_storeu_ps(sumPtr1+128+1536*i8, _mm512_add_ps(sum32, _mm512_loadu_ps(sumPtr1+128+1536*i8)));
_mm512_storeu_ps(sumPtr1+192+1536*i8, _mm512_add_ps(sum33, _mm512_loadu_ps(sumPtr1+192+1536*i8)));
_mm512_storeu_ps(sumPtr1+256+1536*i8, _mm512_add_ps(sum34, _mm512_loadu_ps(sumPtr1+256+1536*i8)));
_mm512_storeu_ps(sumPtr1+320+1536*i8, _mm512_add_ps(sum35, _mm512_loadu_ps(sumPtr1+320+1536*i8)));
_mm512_storeu_ps(sumPtr1+384+1536*i8, _mm512_add_ps(sum36, _mm512_loadu_ps(sumPtr1+384+1536*i8)));
_mm512_storeu_ps(sumPtr1+448+1536*i8, _mm512_add_ps(sum37, _mm512_loadu_ps(sumPtr1+448+1536*i8)));
_mm512_storeu_ps(sumPtr1+512+1536*i8, _mm512_add_ps(sum38, _mm512_loadu_ps(sumPtr1+512+1536*i8)));
_mm512_storeu_ps(sumPtr1+576+1536*i8, _mm512_add_ps(sum39, _mm512_loadu_ps(sumPtr1+576+1536*i8)));
_mm512_storeu_ps(sumPtr1+640+1536*i8, _mm512_add_ps(sum40, _mm512_loadu_ps(sumPtr1+640+1536*i8)));
_mm512_storeu_ps(sumPtr1+704+1536*i8, _mm512_add_ps(sum41, _mm512_loadu_ps(sumPtr1+704+1536*i8)));
_mm512_storeu_ps(sumPtr1+768+1536*i8, _mm512_add_ps(sum42, _mm512_loadu_ps(sumPtr1+768+1536*i8)));
_mm512_storeu_ps(sumPtr1+832+1536*i8, _mm512_add_ps(sum43, _mm512_loadu_ps(sumPtr1+832+1536*i8)));
_mm512_storeu_ps(sumPtr1+896+1536*i8, _mm512_add_ps(sum44, _mm512_loadu_ps(sumPtr1+896+1536*i8)));
_mm512_storeu_ps(sumPtr1+960+1536*i8, _mm512_add_ps(sum45, _mm512_loadu_ps(sumPtr1+960+1536*i8)));
_mm512_storeu_ps(sumPtr1+1024+1536*i8, _mm512_add_ps(sum46, _mm512_loadu_ps(sumPtr1+1024+1536*i8)));
_mm512_storeu_ps(sumPtr1+1088+1536*i8, _mm512_add_ps(sum47, _mm512_loadu_ps(sumPtr1+1088+1536*i8)));
_mm512_storeu_ps(sumPtr1+1152+1536*i8, _mm512_add_ps(sum48, _mm512_loadu_ps(sumPtr1+1152+1536*i8)));
_mm512_storeu_ps(sumPtr1+1216+1536*i8, _mm512_add_ps(sum49, _mm512_loadu_ps(sumPtr1+1216+1536*i8)));
_mm512_storeu_ps(sumPtr1+1280+1536*i8, _mm512_add_ps(sum50, _mm512_loadu_ps(sumPtr1+1280+1536*i8)));
_mm512_storeu_ps(sumPtr1+1344+1536*i8, _mm512_add_ps(sum51, _mm512_loadu_ps(sumPtr1+1344+1536*i8)));
_mm512_storeu_ps(sumPtr1+1408+1536*i8, _mm512_add_ps(sum52, _mm512_loadu_ps(sumPtr1+1408+1536*i8)));
_mm512_storeu_ps(sumPtr1+1472+1536*i8, _mm512_add_ps(sum53, _mm512_loadu_ps(sumPtr1+1472+1536*i8)));
}
__m512 sum54 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum55 = sum54;
__m512 sum56 = sum54;
__m512 sum57 = sum54;
for (ptrdiff_t j6 = 0; j6 < 16; ++j6) {
__m512 dat20 = _mm512_loadu_ps(datPtr3+0+256*j6);
__m512 dat21 = _mm512_loadu_ps(datPtr3+64+256*j6);
__m512 dat22 = _mm512_loadu_ps(datPtr3+128+256*j6);
__m512 dat23 = _mm512_loadu_ps(datPtr3+192+256*j6);
__m512 wt78 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i8+4*j6));
sum54 = _mm512_fmadd_ps(wt78, dat20, sum54);
sum55 = _mm512_fmadd_ps(wt78, dat21, sum55);
sum56 = _mm512_fmadd_ps(wt78, dat22, sum56);
sum57 = _mm512_fmadd_ps(wt78, dat23, sum57);
}
_mm512_storeu_ps(sumPtr1+0+1536*i8, _mm512_add_ps(sum54, _mm512_loadu_ps(sumPtr1+0+1536*i8)));
_mm512_storeu_ps(sumPtr1+64+1536*i8, _mm512_add_ps(sum55, _mm512_loadu_ps(sumPtr1+64+1536*i8)));
_mm512_storeu_ps(sumPtr1+128+1536*i8, _mm512_add_ps(sum56, _mm512_loadu_ps(sumPtr1+128+1536*i8)));
_mm512_storeu_ps(sumPtr1+192+1536*i8, _mm512_add_ps(sum57, _mm512_loadu_ps(sumPtr1+192+1536*i8)));
return;
}
(void)base1;
ptrdiff_t i9 = 11*w3;
for (; i9 != 10; ++i9) {
__m512 sum58 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i9));
__m512 sum62 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i9));
__m512 sum66 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i9));
__m512 sum70 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i9));
__m512 sum74 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i9));
__m512 sum78 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i9));
__m512 sum59 = sum58;
__m512 sum60 = sum58;
__m512 sum61 = sum58;
__m512 sum63 = sum62;
__m512 sum64 = sum62;
__m512 sum65 = sum62;
__m512 sum67 = sum66;
__m512 sum68 = sum66;
__m512 sum69 = sum66;
__m512 sum71 = sum70;
__m512 sum72 = sum70;
__m512 sum73 = sum70;
__m512 sum75 = sum74;
__m512 sum76 = sum74;
__m512 sum77 = sum74;
__m512 sum79 = sum78;
__m512 sum80 = sum78;
__m512 sum81 = sum78;
for (ptrdiff_t j7 = 0; j7 < 16; ++j7) {
__m512 dat24 = _mm512_loadu_ps(datPtr3+0+256*j7);
__m512 dat25 = _mm512_loadu_ps(datPtr3+64+256*j7);
__m512 dat26 = _mm512_loadu_ps(datPtr3+128+256*j7);
__m512 dat27 = _mm512_loadu_ps(datPtr3+192+256*j7);
__m512 wt79 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i9+24*j7));
sum58 = _mm512_fmadd_ps(wt79, dat24, sum58);
sum59 = _mm512_fmadd_ps(wt79, dat25, sum59);
sum60 = _mm512_fmadd_ps(wt79, dat26, sum60);
sum61 = _mm512_fmadd_ps(wt79, dat27, sum61);
__m512 wt80 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i9+24*j7));
sum62 = _mm512_fmadd_ps(wt80, dat24, sum62);
sum63 = _mm512_fmadd_ps(wt80, dat25, sum63);
sum64 = _mm512_fmadd_ps(wt80, dat26, sum64);
sum65 = _mm512_fmadd_ps(wt80, dat27, sum65);
__m512 wt81 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i9+24*j7));
sum66 = _mm512_fmadd_ps(wt81, dat24, sum66);
sum67 = _mm512_fmadd_ps(wt81, dat25, sum67);
sum68 = _mm512_fmadd_ps(wt81, dat26, sum68);
sum69 = _mm512_fmadd_ps(wt81, dat27, sum69);
__m512 wt82 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i9+24*j7));
sum70 = _mm512_fmadd_ps(wt82, dat24, sum70);
sum71 = _mm512_fmadd_ps(wt82, dat25, sum71);
sum72 = _mm512_fmadd_ps(wt82, dat26, sum72);
sum73 = _mm512_fmadd_ps(wt82, dat27, sum73);
__m512 wt83 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i9+24*j7));
sum74 = _mm512_fmadd_ps(wt83, dat24, sum74);
sum75 = _mm512_fmadd_ps(wt83, dat25, sum75);
sum76 = _mm512_fmadd_ps(wt83, dat26, sum76);
sum77 = _mm512_fmadd_ps(wt83, dat27, sum77);
__m512 wt84 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i9+24*j7));
sum78 = _mm512_fmadd_ps(wt84, dat24, sum78);
sum79 = _mm512_fmadd_ps(wt84, dat25, sum79);
sum80 = _mm512_fmadd_ps(wt84, dat26, sum80);
sum81 = _mm512_fmadd_ps(wt84, dat27, sum81);
}
_mm512_storeu_ps(sumPtr1+0+1536*i9, sum58);
_mm512_storeu_ps(sumPtr1+64+1536*i9, sum59);
_mm512_storeu_ps(sumPtr1+128+1536*i9, sum60);
_mm512_storeu_ps(sumPtr1+192+1536*i9, sum61);
_mm512_storeu_ps(sumPtr1+256+1536*i9, sum62);
_mm512_storeu_ps(sumPtr1+320+1536*i9, sum63);
_mm512_storeu_ps(sumPtr1+384+1536*i9, sum64);
_mm512_storeu_ps(sumPtr1+448+1536*i9, sum65);
_mm512_storeu_ps(sumPtr1+512+1536*i9, sum66);
_mm512_storeu_ps(sumPtr1+576+1536*i9, sum67);
_mm512_storeu_ps(sumPtr1+640+1536*i9, sum68);
_mm512_storeu_ps(sumPtr1+704+1536*i9, sum69);
_mm512_storeu_ps(sumPtr1+768+1536*i9, sum70);
_mm512_storeu_ps(sumPtr1+832+1536*i9, sum71);
_mm512_storeu_ps(sumPtr1+896+1536*i9, sum72);
_mm512_storeu_ps(sumPtr1+960+1536*i9, sum73);
_mm512_storeu_ps(sumPtr1+1024+1536*i9, sum74);
_mm512_storeu_ps(sumPtr1+1088+1536*i9, sum75);
_mm512_storeu_ps(sumPtr1+1152+1536*i9, sum76);
_mm512_storeu_ps(sumPtr1+1216+1536*i9, sum77);
_mm512_storeu_ps(sumPtr1+1280+1536*i9, sum78);
_mm512_storeu_ps(sumPtr1+1344+1536*i9, sum79);
_mm512_storeu_ps(sumPtr1+1408+1536*i9, sum80);
_mm512_storeu_ps(sumPtr1+1472+1536*i9, sum81);
}
__m512 sum82 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i9));
__m512 sum83 = sum82;
__m512 sum84 = sum82;
__m512 sum85 = sum82;
for (ptrdiff_t j8 = 0; j8 < 16; ++j8) {
__m512 dat28 = _mm512_loadu_ps(datPtr3+0+256*j8);
__m512 dat29 = _mm512_loadu_ps(datPtr3+64+256*j8);
__m512 dat30 = _mm512_loadu_ps(datPtr3+128+256*j8);
__m512 dat31 = _mm512_loadu_ps(datPtr3+192+256*j8);
__m512 wt85 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i9+4*j8));
sum82 = _mm512_fmadd_ps(wt85, dat28, sum82);
sum83 = _mm512_fmadd_ps(wt85, dat29, sum83);
sum84 = _mm512_fmadd_ps(wt85, dat30, sum84);
sum85 = _mm512_fmadd_ps(wt85, dat31, sum85);
}
_mm512_storeu_ps(sumPtr1+0+1536*i9, sum82);
_mm512_storeu_ps(sumPtr1+64+1536*i9, sum83);
_mm512_storeu_ps(sumPtr1+128+1536*i9, sum84);
_mm512_storeu_ps(sumPtr1+192+1536*i9, sum85);
break;
}
case 2: {
if (node6) {
if (!epoch1 && base1) {
ptrdiff_t i10 = 11*w3;
for (; i10 != 10; ++i10) {
__m512 sum86 = _mm512_setzero_ps();
__m512 sum89 = _mm512_setzero_ps();
__m512 sum92 = _mm512_setzero_ps();
__m512 sum95 = _mm512_setzero_ps();
__m512 sum98 = _mm512_setzero_ps();
__m512 sum101 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum87 = sum86;
__m512 sum88 = sum86;
__m512 sum90 = sum89;
__m512 sum91 = sum89;
__m512 sum93 = sum92;
__m512 sum94 = sum92;
__m512 sum96 = sum95;
__m512 sum97 = sum95;
__m512 sum99 = sum98;
__m512 sum100 = sum98;
__m512 sum102 = sum101;
__m512 sum103 = sum101;
for (ptrdiff_t j9 = 0; j9 < 16; ++j9) {
__m512 dat32 = _mm512_loadu_ps(datPtr3+64+256*j9);
__m512 dat33 = _mm512_loadu_ps(datPtr3+128+256*j9);
__m512 dat34 = _mm512_loadu_ps(datPtr3+192+256*j9);
__m512 wt86 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i10+24*j9));
sum86 = _mm512_fmadd_ps(wt86, dat32, sum86);
sum87 = _mm512_fmadd_ps(wt86, dat33, sum87);
sum88 = _mm512_fmadd_ps(wt86, dat34, sum88);
__m512 wt87 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i10+24*j9));
sum89 = _mm512_fmadd_ps(wt87, dat32, sum89);
sum90 = _mm512_fmadd_ps(wt87, dat33, sum90);
sum91 = _mm512_fmadd_ps(wt87, dat34, sum91);
__m512 wt88 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i10+24*j9));
sum92 = _mm512_fmadd_ps(wt88, dat32, sum92);
sum93 = _mm512_fmadd_ps(wt88, dat33, sum93);
sum94 = _mm512_fmadd_ps(wt88, dat34, sum94);
__m512 wt89 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i10+24*j9));
sum95 = _mm512_fmadd_ps(wt89, dat32, sum95);
sum96 = _mm512_fmadd_ps(wt89, dat33, sum96);
sum97 = _mm512_fmadd_ps(wt89, dat34, sum97);
__m512 wt90 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i10+24*j9));
sum98 = _mm512_fmadd_ps(wt90, dat32, sum98);
sum99 = _mm512_fmadd_ps(wt90, dat33, sum99);
sum100 = _mm512_fmadd_ps(wt90, dat34, sum100);
__m512 wt91 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i10+24*j9));
sum101 = _mm512_fmadd_ps(wt91, dat32, sum101);
sum102 = _mm512_fmadd_ps(wt91, dat33, sum102);
sum103 = _mm512_fmadd_ps(wt91, dat34, sum103);
}
_mm512_storeu_ps(sumPtr1+0+1536*i10, sum86);
_mm512_storeu_ps(sumPtr1+64+1536*i10, sum87);
_mm512_storeu_ps(sumPtr1+128+1536*i10, sum88);
_mm512_storeu_ps(sumPtr1+256+1536*i10, sum89);
_mm512_storeu_ps(sumPtr1+320+1536*i10, sum90);
_mm512_storeu_ps(sumPtr1+384+1536*i10, sum91);
_mm512_storeu_ps(sumPtr1+512+1536*i10, sum92);
_mm512_storeu_ps(sumPtr1+576+1536*i10, sum93);
_mm512_storeu_ps(sumPtr1+640+1536*i10, sum94);
_mm512_storeu_ps(sumPtr1+768+1536*i10, sum95);
_mm512_storeu_ps(sumPtr1+832+1536*i10, sum96);
_mm512_storeu_ps(sumPtr1+896+1536*i10, sum97);
_mm512_storeu_ps(sumPtr1+1024+1536*i10, sum98);
_mm512_storeu_ps(sumPtr1+1088+1536*i10, sum99);
_mm512_storeu_ps(sumPtr1+1152+1536*i10, sum100);
_mm512_storeu_ps(sumPtr1+1280+1536*i10, sum101);
_mm512_storeu_ps(sumPtr1+1344+1536*i10, sum102);
_mm512_storeu_ps(sumPtr1+1408+1536*i10, sum103);
}
__m512 sum104 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum105 = sum104;
__m512 sum106 = sum104;
for (ptrdiff_t j10 = 0; j10 < 16; ++j10) {
__m512 dat35 = _mm512_loadu_ps(datPtr3+64+256*j10);
__m512 dat36 = _mm512_loadu_ps(datPtr3+128+256*j10);
__m512 dat37 = _mm512_loadu_ps(datPtr3+192+256*j10);
__m512 wt92 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i10+4*j10));
sum104 = _mm512_fmadd_ps(wt92, dat35, sum104);
sum105 = _mm512_fmadd_ps(wt92, dat36, sum105);
sum106 = _mm512_fmadd_ps(wt92, dat37, sum106);
}
_mm512_storeu_ps(sumPtr1+0+1536*i10, sum104);
_mm512_storeu_ps(sumPtr1+64+1536*i10, sum105);
_mm512_storeu_ps(sumPtr1+128+1536*i10, sum106);
return;
}
ptrdiff_t i11 = 11*w3;
for (; i11 != 10; ++i11) {
__m512 sum107 = _mm512_setzero_ps();
__m512 sum110 = _mm512_setzero_ps();
__m512 sum113 = _mm512_setzero_ps();
__m512 sum116 = _mm512_setzero_ps();
__m512 sum119 = _mm512_setzero_ps();
__m512 sum122 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum108 = sum107;
__m512 sum109 = sum107;
__m512 sum111 = sum110;
__m512 sum112 = sum110;
__m512 sum114 = sum113;
__m512 sum115 = sum113;
__m512 sum117 = sum116;
__m512 sum118 = sum116;
__m512 sum120 = sum119;
__m512 sum121 = sum119;
__m512 sum123 = sum122;
__m512 sum124 = sum122;
for (ptrdiff_t j11 = 0; j11 < 16; ++j11) {
__m512 dat38 = _mm512_loadu_ps(datPtr3+64+256*j11);
__m512 dat39 = _mm512_loadu_ps(datPtr3+128+256*j11);
__m512 dat40 = _mm512_loadu_ps(datPtr3+192+256*j11);
__m512 wt93 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i11+24*j11));
sum107 = _mm512_fmadd_ps(wt93, dat38, sum107);
sum108 = _mm512_fmadd_ps(wt93, dat39, sum108);
sum109 = _mm512_fmadd_ps(wt93, dat40, sum109);
__m512 wt94 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i11+24*j11));
sum110 = _mm512_fmadd_ps(wt94, dat38, sum110);
sum111 = _mm512_fmadd_ps(wt94, dat39, sum111);
sum112 = _mm512_fmadd_ps(wt94, dat40, sum112);
__m512 wt95 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i11+24*j11));
sum113 = _mm512_fmadd_ps(wt95, dat38, sum113);
sum114 = _mm512_fmadd_ps(wt95, dat39, sum114);
sum115 = _mm512_fmadd_ps(wt95, dat40, sum115);
__m512 wt96 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i11+24*j11));
sum116 = _mm512_fmadd_ps(wt96, dat38, sum116);
sum117 = _mm512_fmadd_ps(wt96, dat39, sum117);
sum118 = _mm512_fmadd_ps(wt96, dat40, sum118);
__m512 wt97 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i11+24*j11));
sum119 = _mm512_fmadd_ps(wt97, dat38, sum119);
sum120 = _mm512_fmadd_ps(wt97, dat39, sum120);
sum121 = _mm512_fmadd_ps(wt97, dat40, sum121);
__m512 wt98 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i11+24*j11));
sum122 = _mm512_fmadd_ps(wt98, dat38, sum122);
sum123 = _mm512_fmadd_ps(wt98, dat39, sum123);
sum124 = _mm512_fmadd_ps(wt98, dat40, sum124);
}
_mm512_storeu_ps(sumPtr1+0+1536*i11, _mm512_add_ps(sum107, _mm512_loadu_ps(sumPtr1+0+1536*i11)));
_mm512_storeu_ps(sumPtr1+64+1536*i11, _mm512_add_ps(sum108, _mm512_loadu_ps(sumPtr1+64+1536*i11)));
_mm512_storeu_ps(sumPtr1+128+1536*i11, _mm512_add_ps(sum109, _mm512_loadu_ps(sumPtr1+128+1536*i11)));
_mm512_storeu_ps(sumPtr1+256+1536*i11, _mm512_add_ps(sum110, _mm512_loadu_ps(sumPtr1+256+1536*i11)));
_mm512_storeu_ps(sumPtr1+320+1536*i11, _mm512_add_ps(sum111, _mm512_loadu_ps(sumPtr1+320+1536*i11)));
_mm512_storeu_ps(sumPtr1+384+1536*i11, _mm512_add_ps(sum112, _mm512_loadu_ps(sumPtr1+384+1536*i11)));
_mm512_storeu_ps(sumPtr1+512+1536*i11, _mm512_add_ps(sum113, _mm512_loadu_ps(sumPtr1+512+1536*i11)));
_mm512_storeu_ps(sumPtr1+576+1536*i11, _mm512_add_ps(sum114, _mm512_loadu_ps(sumPtr1+576+1536*i11)));
_mm512_storeu_ps(sumPtr1+640+1536*i11, _mm512_add_ps(sum115, _mm512_loadu_ps(sumPtr1+640+1536*i11)));
_mm512_storeu_ps(sumPtr1+768+1536*i11, _mm512_add_ps(sum116, _mm512_loadu_ps(sumPtr1+768+1536*i11)));
_mm512_storeu_ps(sumPtr1+832+1536*i11, _mm512_add_ps(sum117, _mm512_loadu_ps(sumPtr1+832+1536*i11)));
_mm512_storeu_ps(sumPtr1+896+1536*i11, _mm512_add_ps(sum118, _mm512_loadu_ps(sumPtr1+896+1536*i11)));
_mm512_storeu_ps(sumPtr1+1024+1536*i11, _mm512_add_ps(sum119, _mm512_loadu_ps(sumPtr1+1024+1536*i11)));
_mm512_storeu_ps(sumPtr1+1088+1536*i11, _mm512_add_ps(sum120, _mm512_loadu_ps(sumPtr1+1088+1536*i11)));
_mm512_storeu_ps(sumPtr1+1152+1536*i11, _mm512_add_ps(sum121, _mm512_loadu_ps(sumPtr1+1152+1536*i11)));
_mm512_storeu_ps(sumPtr1+1280+1536*i11, _mm512_add_ps(sum122, _mm512_loadu_ps(sumPtr1+1280+1536*i11)));
_mm512_storeu_ps(sumPtr1+1344+1536*i11, _mm512_add_ps(sum123, _mm512_loadu_ps(sumPtr1+1344+1536*i11)));
_mm512_storeu_ps(sumPtr1+1408+1536*i11, _mm512_add_ps(sum124, _mm512_loadu_ps(sumPtr1+1408+1536*i11)));
}
__m512 sum125 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum126 = sum125;
__m512 sum127 = sum125;
for (ptrdiff_t j12 = 0; j12 < 16; ++j12) {
__m512 dat41 = _mm512_loadu_ps(datPtr3+64+256*j12);
__m512 dat42 = _mm512_loadu_ps(datPtr3+128+256*j12);
__m512 dat43 = _mm512_loadu_ps(datPtr3+192+256*j12);
__m512 wt99 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i11+4*j12));
sum125 = _mm512_fmadd_ps(wt99, dat41, sum125);
sum126 = _mm512_fmadd_ps(wt99, dat42, sum126);
sum127 = _mm512_fmadd_ps(wt99, dat43, sum127);
}
_mm512_storeu_ps(sumPtr1+0+1536*i11, _mm512_add_ps(sum125, _mm512_loadu_ps(sumPtr1+0+1536*i11)));
_mm512_storeu_ps(sumPtr1+64+1536*i11, _mm512_add_ps(sum126, _mm512_loadu_ps(sumPtr1+64+1536*i11)));
_mm512_storeu_ps(sumPtr1+128+1536*i11, _mm512_add_ps(sum127, _mm512_loadu_ps(sumPtr1+128+1536*i11)));
return;
}
(void)base1;
ptrdiff_t i12 = 11*w3;
for (; i12 != 10; ++i12) {
__m512 sum128 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i12));
__m512 sum131 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i12));
__m512 sum134 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i12));
__m512 sum137 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i12));
__m512 sum140 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i12));
__m512 sum143 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i12));
__m512 sum129 = sum128;
__m512 sum130 = sum128;
__m512 sum132 = sum131;
__m512 sum133 = sum131;
__m512 sum135 = sum134;
__m512 sum136 = sum134;
__m512 sum138 = sum137;
__m512 sum139 = sum137;
__m512 sum141 = sum140;
__m512 sum142 = sum140;
__m512 sum144 = sum143;
__m512 sum145 = sum143;
for (ptrdiff_t j13 = 0; j13 < 16; ++j13) {
__m512 dat44 = _mm512_loadu_ps(datPtr3+64+256*j13);
__m512 dat45 = _mm512_loadu_ps(datPtr3+128+256*j13);
__m512 dat46 = _mm512_loadu_ps(datPtr3+192+256*j13);
__m512 wt100 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i12+24*j13));
sum128 = _mm512_fmadd_ps(wt100, dat44, sum128);
sum129 = _mm512_fmadd_ps(wt100, dat45, sum129);
sum130 = _mm512_fmadd_ps(wt100, dat46, sum130);
__m512 wt101 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i12+24*j13));
sum131 = _mm512_fmadd_ps(wt101, dat44, sum131);
sum132 = _mm512_fmadd_ps(wt101, dat45, sum132);
sum133 = _mm512_fmadd_ps(wt101, dat46, sum133);
__m512 wt102 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i12+24*j13));
sum134 = _mm512_fmadd_ps(wt102, dat44, sum134);
sum135 = _mm512_fmadd_ps(wt102, dat45, sum135);
sum136 = _mm512_fmadd_ps(wt102, dat46, sum136);
__m512 wt103 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i12+24*j13));
sum137 = _mm512_fmadd_ps(wt103, dat44, sum137);
sum138 = _mm512_fmadd_ps(wt103, dat45, sum138);
sum139 = _mm512_fmadd_ps(wt103, dat46, sum139);
__m512 wt104 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i12+24*j13));
sum140 = _mm512_fmadd_ps(wt104, dat44, sum140);
sum141 = _mm512_fmadd_ps(wt104, dat45, sum141);
sum142 = _mm512_fmadd_ps(wt104, dat46, sum142);
__m512 wt105 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i12+24*j13));
sum143 = _mm512_fmadd_ps(wt105, dat44, sum143);
sum144 = _mm512_fmadd_ps(wt105, dat45, sum144);
sum145 = _mm512_fmadd_ps(wt105, dat46, sum145);
}
_mm512_storeu_ps(sumPtr1+0+1536*i12, sum128);
_mm512_storeu_ps(sumPtr1+64+1536*i12, sum129);
_mm512_storeu_ps(sumPtr1+128+1536*i12, sum130);
_mm512_storeu_ps(sumPtr1+256+1536*i12, sum131);
_mm512_storeu_ps(sumPtr1+320+1536*i12, sum132);
_mm512_storeu_ps(sumPtr1+384+1536*i12, sum133);
_mm512_storeu_ps(sumPtr1+512+1536*i12, sum134);
_mm512_storeu_ps(sumPtr1+576+1536*i12, sum135);
_mm512_storeu_ps(sumPtr1+640+1536*i12, sum136);
_mm512_storeu_ps(sumPtr1+768+1536*i12, sum137);
_mm512_storeu_ps(sumPtr1+832+1536*i12, sum138);
_mm512_storeu_ps(sumPtr1+896+1536*i12, sum139);
_mm512_storeu_ps(sumPtr1+1024+1536*i12, sum140);
_mm512_storeu_ps(sumPtr1+1088+1536*i12, sum141);
_mm512_storeu_ps(sumPtr1+1152+1536*i12, sum142);
_mm512_storeu_ps(sumPtr1+1280+1536*i12, sum143);
_mm512_storeu_ps(sumPtr1+1344+1536*i12, sum144);
_mm512_storeu_ps(sumPtr1+1408+1536*i12, sum145);
}
__m512 sum146 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i12));
__m512 sum147 = sum146;
__m512 sum148 = sum146;
for (ptrdiff_t j14 = 0; j14 < 16; ++j14) {
__m512 dat47 = _mm512_loadu_ps(datPtr3+64+256*j14);
__m512 dat48 = _mm512_loadu_ps(datPtr3+128+256*j14);
__m512 dat49 = _mm512_loadu_ps(datPtr3+192+256*j14);
__m512 wt106 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i12+4*j14));
sum146 = _mm512_fmadd_ps(wt106, dat47, sum146);
sum147 = _mm512_fmadd_ps(wt106, dat48, sum147);
sum148 = _mm512_fmadd_ps(wt106, dat49, sum148);
}
_mm512_storeu_ps(sumPtr1+0+1536*i12, sum146);
_mm512_storeu_ps(sumPtr1+64+1536*i12, sum147);
_mm512_storeu_ps(sumPtr1+128+1536*i12, sum148);
break;
}
case 3: {
if (node6) {
if (!epoch1 && base1) {
ptrdiff_t i13 = 11*w3;
for (; i13 != 10; ++i13) {
__m512 sum149 = _mm512_setzero_ps();
__m512 sum153 = _mm512_setzero_ps();
__m512 sum157 = _mm512_setzero_ps();
__m512 sum161 = _mm512_setzero_ps();
__m512 sum165 = _mm512_setzero_ps();
__m512 sum169 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum150 = sum149;
__m512 sum151 = sum149;
__m512 sum152 = sum149;
__m512 sum154 = sum153;
__m512 sum155 = sum153;
__m512 sum156 = sum153;
__m512 sum158 = sum157;
__m512 sum159 = sum157;
__m512 sum160 = sum157;
__m512 sum162 = sum161;
__m512 sum163 = sum161;
__m512 sum164 = sum161;
__m512 sum166 = sum165;
__m512 sum167 = sum165;
__m512 sum168 = sum165;
__m512 sum170 = sum169;
__m512 sum171 = sum169;
__m512 sum172 = sum169;
for (ptrdiff_t j15 = 0; j15 < 16; ++j15) {
__m512 dat50 = _mm512_loadu_ps(datPtr3+0+256*j15);
__m512 dat51 = _mm512_loadu_ps(datPtr3+64+256*j15);
__m512 dat52 = _mm512_loadu_ps(datPtr3+128+256*j15);
__m512 dat53 = _mm512_loadu_ps(datPtr3+192+256*j15);
__m512 wt107 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i13+24*j15));
sum149 = _mm512_fmadd_ps(wt107, dat50, sum149);
sum150 = _mm512_fmadd_ps(wt107, dat51, sum150);
sum151 = _mm512_fmadd_ps(wt107, dat52, sum151);
sum152 = _mm512_fmadd_ps(wt107, dat53, sum152);
__m512 wt108 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i13+24*j15));
sum153 = _mm512_fmadd_ps(wt108, dat50, sum153);
sum154 = _mm512_fmadd_ps(wt108, dat51, sum154);
sum155 = _mm512_fmadd_ps(wt108, dat52, sum155);
sum156 = _mm512_fmadd_ps(wt108, dat53, sum156);
__m512 wt109 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i13+24*j15));
sum157 = _mm512_fmadd_ps(wt109, dat50, sum157);
sum158 = _mm512_fmadd_ps(wt109, dat51, sum158);
sum159 = _mm512_fmadd_ps(wt109, dat52, sum159);
sum160 = _mm512_fmadd_ps(wt109, dat53, sum160);
__m512 wt110 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i13+24*j15));
sum161 = _mm512_fmadd_ps(wt110, dat50, sum161);
sum162 = _mm512_fmadd_ps(wt110, dat51, sum162);
sum163 = _mm512_fmadd_ps(wt110, dat52, sum163);
sum164 = _mm512_fmadd_ps(wt110, dat53, sum164);
__m512 wt111 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i13+24*j15));
sum165 = _mm512_fmadd_ps(wt111, dat50, sum165);
sum166 = _mm512_fmadd_ps(wt111, dat51, sum166);
sum167 = _mm512_fmadd_ps(wt111, dat52, sum167);
sum168 = _mm512_fmadd_ps(wt111, dat53, sum168);
__m512 wt112 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i13+24*j15));
sum169 = _mm512_fmadd_ps(wt112, dat50, sum169);
sum170 = _mm512_fmadd_ps(wt112, dat51, sum170);
sum171 = _mm512_fmadd_ps(wt112, dat52, sum171);
sum172 = _mm512_fmadd_ps(wt112, dat53, sum172);
}
_mm512_storeu_ps(sumPtr1+-62272+1536*i13, sum149);
_mm512_storeu_ps(sumPtr1+0+1536*i13, sum150);
_mm512_storeu_ps(sumPtr1+64+1536*i13, sum151);
_mm512_storeu_ps(sumPtr1+128+1536*i13, sum152);
_mm512_storeu_ps(sumPtr1+-62016+1536*i13, sum153);
_mm512_storeu_ps(sumPtr1+256+1536*i13, sum154);
_mm512_storeu_ps(sumPtr1+320+1536*i13, sum155);
_mm512_storeu_ps(sumPtr1+384+1536*i13, sum156);
_mm512_storeu_ps(sumPtr1+-61760+1536*i13, sum157);
_mm512_storeu_ps(sumPtr1+512+1536*i13, sum158);
_mm512_storeu_ps(sumPtr1+576+1536*i13, sum159);
_mm512_storeu_ps(sumPtr1+640+1536*i13, sum160);
_mm512_storeu_ps(sumPtr1+-61504+1536*i13, sum161);
_mm512_storeu_ps(sumPtr1+768+1536*i13, sum162);
_mm512_storeu_ps(sumPtr1+832+1536*i13, sum163);
_mm512_storeu_ps(sumPtr1+896+1536*i13, sum164);
_mm512_storeu_ps(sumPtr1+-61248+1536*i13, sum165);
_mm512_storeu_ps(sumPtr1+1024+1536*i13, sum166);
_mm512_storeu_ps(sumPtr1+1088+1536*i13, sum167);
_mm512_storeu_ps(sumPtr1+1152+1536*i13, sum168);
_mm512_storeu_ps(sumPtr1+-60992+1536*i13, sum169);
_mm512_storeu_ps(sumPtr1+1280+1536*i13, sum170);
_mm512_storeu_ps(sumPtr1+1344+1536*i13, sum171);
_mm512_storeu_ps(sumPtr1+1408+1536*i13, sum172);
}
__m512 sum173 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum174 = sum173;
__m512 sum175 = sum173;
__m512 sum176 = sum173;
for (ptrdiff_t j16 = 0; j16 < 16; ++j16) {
__m512 dat54 = _mm512_loadu_ps(datPtr3+0+256*j16);
__m512 dat55 = _mm512_loadu_ps(datPtr3+64+256*j16);
__m512 dat56 = _mm512_loadu_ps(datPtr3+128+256*j16);
__m512 dat57 = _mm512_loadu_ps(datPtr3+192+256*j16);
__m512 wt113 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i13+4*j16));
sum173 = _mm512_fmadd_ps(wt113, dat54, sum173);
sum174 = _mm512_fmadd_ps(wt113, dat55, sum174);
sum175 = _mm512_fmadd_ps(wt113, dat56, sum175);
sum176 = _mm512_fmadd_ps(wt113, dat57, sum176);
}
_mm512_storeu_ps(sumPtr1+-62272+1536*i13, sum173);
_mm512_storeu_ps(sumPtr1+0+1536*i13, sum174);
_mm512_storeu_ps(sumPtr1+64+1536*i13, sum175);
_mm512_storeu_ps(sumPtr1+128+1536*i13, sum176);
return;
}
ptrdiff_t i14 = 11*w3;
for (; i14 != 10; ++i14) {
__m512 sum177 = _mm512_setzero_ps();
__m512 sum181 = _mm512_setzero_ps();
__m512 sum185 = _mm512_setzero_ps();
__m512 sum189 = _mm512_setzero_ps();
__m512 sum193 = _mm512_setzero_ps();
__m512 sum197 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum178 = sum177;
__m512 sum179 = sum177;
__m512 sum180 = sum177;
__m512 sum182 = sum181;
__m512 sum183 = sum181;
__m512 sum184 = sum181;
__m512 sum186 = sum185;
__m512 sum187 = sum185;
__m512 sum188 = sum185;
__m512 sum190 = sum189;
__m512 sum191 = sum189;
__m512 sum192 = sum189;
__m512 sum194 = sum193;
__m512 sum195 = sum193;
__m512 sum196 = sum193;
__m512 sum198 = sum197;
__m512 sum199 = sum197;
__m512 sum200 = sum197;
for (ptrdiff_t j17 = 0; j17 < 16; ++j17) {
__m512 dat58 = _mm512_loadu_ps(datPtr3+0+256*j17);
__m512 dat59 = _mm512_loadu_ps(datPtr3+64+256*j17);
__m512 dat60 = _mm512_loadu_ps(datPtr3+128+256*j17);
__m512 dat61 = _mm512_loadu_ps(datPtr3+192+256*j17);
__m512 wt114 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i14+24*j17));
sum177 = _mm512_fmadd_ps(wt114, dat58, sum177);
sum178 = _mm512_fmadd_ps(wt114, dat59, sum178);
sum179 = _mm512_fmadd_ps(wt114, dat60, sum179);
sum180 = _mm512_fmadd_ps(wt114, dat61, sum180);
__m512 wt115 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i14+24*j17));
sum181 = _mm512_fmadd_ps(wt115, dat58, sum181);
sum182 = _mm512_fmadd_ps(wt115, dat59, sum182);
sum183 = _mm512_fmadd_ps(wt115, dat60, sum183);
sum184 = _mm512_fmadd_ps(wt115, dat61, sum184);
__m512 wt116 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i14+24*j17));
sum185 = _mm512_fmadd_ps(wt116, dat58, sum185);
sum186 = _mm512_fmadd_ps(wt116, dat59, sum186);
sum187 = _mm512_fmadd_ps(wt116, dat60, sum187);
sum188 = _mm512_fmadd_ps(wt116, dat61, sum188);
__m512 wt117 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i14+24*j17));
sum189 = _mm512_fmadd_ps(wt117, dat58, sum189);
sum190 = _mm512_fmadd_ps(wt117, dat59, sum190);
sum191 = _mm512_fmadd_ps(wt117, dat60, sum191);
sum192 = _mm512_fmadd_ps(wt117, dat61, sum192);
__m512 wt118 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i14+24*j17));
sum193 = _mm512_fmadd_ps(wt118, dat58, sum193);
sum194 = _mm512_fmadd_ps(wt118, dat59, sum194);
sum195 = _mm512_fmadd_ps(wt118, dat60, sum195);
sum196 = _mm512_fmadd_ps(wt118, dat61, sum196);
__m512 wt119 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i14+24*j17));
sum197 = _mm512_fmadd_ps(wt119, dat58, sum197);
sum198 = _mm512_fmadd_ps(wt119, dat59, sum198);
sum199 = _mm512_fmadd_ps(wt119, dat60, sum199);
sum200 = _mm512_fmadd_ps(wt119, dat61, sum200);
}
_mm512_storeu_ps(sumPtr1+-62272+1536*i14, _mm512_add_ps(sum177, _mm512_loadu_ps(sumPtr1+-62272+1536*i14)));
_mm512_storeu_ps(sumPtr1+0+1536*i14, _mm512_add_ps(sum178, _mm512_loadu_ps(sumPtr1+0+1536*i14)));
_mm512_storeu_ps(sumPtr1+64+1536*i14, _mm512_add_ps(sum179, _mm512_loadu_ps(sumPtr1+64+1536*i14)));
_mm512_storeu_ps(sumPtr1+128+1536*i14, _mm512_add_ps(sum180, _mm512_loadu_ps(sumPtr1+128+1536*i14)));
_mm512_storeu_ps(sumPtr1+-62016+1536*i14, _mm512_add_ps(sum181, _mm512_loadu_ps(sumPtr1+-62016+1536*i14)));
_mm512_storeu_ps(sumPtr1+256+1536*i14, _mm512_add_ps(sum182, _mm512_loadu_ps(sumPtr1+256+1536*i14)));
_mm512_storeu_ps(sumPtr1+320+1536*i14, _mm512_add_ps(sum183, _mm512_loadu_ps(sumPtr1+320+1536*i14)));
_mm512_storeu_ps(sumPtr1+384+1536*i14, _mm512_add_ps(sum184, _mm512_loadu_ps(sumPtr1+384+1536*i14)));
_mm512_storeu_ps(sumPtr1+-61760+1536*i14, _mm512_add_ps(sum185, _mm512_loadu_ps(sumPtr1+-61760+1536*i14)));
_mm512_storeu_ps(sumPtr1+512+1536*i14, _mm512_add_ps(sum186, _mm512_loadu_ps(sumPtr1+512+1536*i14)));
_mm512_storeu_ps(sumPtr1+576+1536*i14, _mm512_add_ps(sum187, _mm512_loadu_ps(sumPtr1+576+1536*i14)));
_mm512_storeu_ps(sumPtr1+640+1536*i14, _mm512_add_ps(sum188, _mm512_loadu_ps(sumPtr1+640+1536*i14)));
_mm512_storeu_ps(sumPtr1+-61504+1536*i14, _mm512_add_ps(sum189, _mm512_loadu_ps(sumPtr1+-61504+1536*i14)));
_mm512_storeu_ps(sumPtr1+768+1536*i14, _mm512_add_ps(sum190, _mm512_loadu_ps(sumPtr1+768+1536*i14)));
_mm512_storeu_ps(sumPtr1+832+1536*i14, _mm512_add_ps(sum191, _mm512_loadu_ps(sumPtr1+832+1536*i14)));
_mm512_storeu_ps(sumPtr1+896+1536*i14, _mm512_add_ps(sum192, _mm512_loadu_ps(sumPtr1+896+1536*i14)));
_mm512_storeu_ps(sumPtr1+-61248+1536*i14, _mm512_add_ps(sum193, _mm512_loadu_ps(sumPtr1+-61248+1536*i14)));
_mm512_storeu_ps(sumPtr1+1024+1536*i14, _mm512_add_ps(sum194, _mm512_loadu_ps(sumPtr1+1024+1536*i14)));
_mm512_storeu_ps(sumPtr1+1088+1536*i14, _mm512_add_ps(sum195, _mm512_loadu_ps(sumPtr1+1088+1536*i14)));
_mm512_storeu_ps(sumPtr1+1152+1536*i14, _mm512_add_ps(sum196, _mm512_loadu_ps(sumPtr1+1152+1536*i14)));
_mm512_storeu_ps(sumPtr1+-60992+1536*i14, _mm512_add_ps(sum197, _mm512_loadu_ps(sumPtr1+-60992+1536*i14)));
_mm512_storeu_ps(sumPtr1+1280+1536*i14, _mm512_add_ps(sum198, _mm512_loadu_ps(sumPtr1+1280+1536*i14)));
_mm512_storeu_ps(sumPtr1+1344+1536*i14, _mm512_add_ps(sum199, _mm512_loadu_ps(sumPtr1+1344+1536*i14)));
_mm512_storeu_ps(sumPtr1+1408+1536*i14, _mm512_add_ps(sum200, _mm512_loadu_ps(sumPtr1+1408+1536*i14)));
}
__m512 sum201 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum202 = sum201;
__m512 sum203 = sum201;
__m512 sum204 = sum201;
for (ptrdiff_t j18 = 0; j18 < 16; ++j18) {
__m512 dat62 = _mm512_loadu_ps(datPtr3+0+256*j18);
__m512 dat63 = _mm512_loadu_ps(datPtr3+64+256*j18);
__m512 dat64 = _mm512_loadu_ps(datPtr3+128+256*j18);
__m512 dat65 = _mm512_loadu_ps(datPtr3+192+256*j18);
__m512 wt120 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i14+4*j18));
sum201 = _mm512_fmadd_ps(wt120, dat62, sum201);
sum202 = _mm512_fmadd_ps(wt120, dat63, sum202);
sum203 = _mm512_fmadd_ps(wt120, dat64, sum203);
sum204 = _mm512_fmadd_ps(wt120, dat65, sum204);
}
_mm512_storeu_ps(sumPtr1+-62272+1536*i14, _mm512_add_ps(sum201, _mm512_loadu_ps(sumPtr1+-62272+1536*i14)));
_mm512_storeu_ps(sumPtr1+0+1536*i14, _mm512_add_ps(sum202, _mm512_loadu_ps(sumPtr1+0+1536*i14)));
_mm512_storeu_ps(sumPtr1+64+1536*i14, _mm512_add_ps(sum203, _mm512_loadu_ps(sumPtr1+64+1536*i14)));
_mm512_storeu_ps(sumPtr1+128+1536*i14, _mm512_add_ps(sum204, _mm512_loadu_ps(sumPtr1+128+1536*i14)));
return;
}
(void)base1;
ptrdiff_t i15 = 11*w3;
for (; i15 != 10; ++i15) {
__m512 sum205 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i15));
__m512 sum209 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i15));
__m512 sum213 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i15));
__m512 sum217 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i15));
__m512 sum221 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i15));
__m512 sum225 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i15));
__m512 sum206 = sum205;
__m512 sum207 = sum205;
__m512 sum208 = sum205;
__m512 sum210 = sum209;
__m512 sum211 = sum209;
__m512 sum212 = sum209;
__m512 sum214 = sum213;
__m512 sum215 = sum213;
__m512 sum216 = sum213;
__m512 sum218 = sum217;
__m512 sum219 = sum217;
__m512 sum220 = sum217;
__m512 sum222 = sum221;
__m512 sum223 = sum221;
__m512 sum224 = sum221;
__m512 sum226 = sum225;
__m512 sum227 = sum225;
__m512 sum228 = sum225;
for (ptrdiff_t j19 = 0; j19 < 16; ++j19) {
__m512 dat66 = _mm512_loadu_ps(datPtr3+0+256*j19);
__m512 dat67 = _mm512_loadu_ps(datPtr3+64+256*j19);
__m512 dat68 = _mm512_loadu_ps(datPtr3+128+256*j19);
__m512 dat69 = _mm512_loadu_ps(datPtr3+192+256*j19);
__m512 wt121 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i15+24*j19));
sum205 = _mm512_fmadd_ps(wt121, dat66, sum205);
sum206 = _mm512_fmadd_ps(wt121, dat67, sum206);
sum207 = _mm512_fmadd_ps(wt121, dat68, sum207);
sum208 = _mm512_fmadd_ps(wt121, dat69, sum208);
__m512 wt122 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i15+24*j19));
sum209 = _mm512_fmadd_ps(wt122, dat66, sum209);
sum210 = _mm512_fmadd_ps(wt122, dat67, sum210);
sum211 = _mm512_fmadd_ps(wt122, dat68, sum211);
sum212 = _mm512_fmadd_ps(wt122, dat69, sum212);
__m512 wt123 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i15+24*j19));
sum213 = _mm512_fmadd_ps(wt123, dat66, sum213);
sum214 = _mm512_fmadd_ps(wt123, dat67, sum214);
sum215 = _mm512_fmadd_ps(wt123, dat68, sum215);
sum216 = _mm512_fmadd_ps(wt123, dat69, sum216);
__m512 wt124 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i15+24*j19));
sum217 = _mm512_fmadd_ps(wt124, dat66, sum217);
sum218 = _mm512_fmadd_ps(wt124, dat67, sum218);
sum219 = _mm512_fmadd_ps(wt124, dat68, sum219);
sum220 = _mm512_fmadd_ps(wt124, dat69, sum220);
__m512 wt125 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i15+24*j19));
sum221 = _mm512_fmadd_ps(wt125, dat66, sum221);
sum222 = _mm512_fmadd_ps(wt125, dat67, sum222);
sum223 = _mm512_fmadd_ps(wt125, dat68, sum223);
sum224 = _mm512_fmadd_ps(wt125, dat69, sum224);
__m512 wt126 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i15+24*j19));
sum225 = _mm512_fmadd_ps(wt126, dat66, sum225);
sum226 = _mm512_fmadd_ps(wt126, dat67, sum226);
sum227 = _mm512_fmadd_ps(wt126, dat68, sum227);
sum228 = _mm512_fmadd_ps(wt126, dat69, sum228);
}
_mm512_storeu_ps(sumPtr1+-62272+1536*i15, sum205);
_mm512_storeu_ps(sumPtr1+0+1536*i15, sum206);
_mm512_storeu_ps(sumPtr1+64+1536*i15, sum207);
_mm512_storeu_ps(sumPtr1+128+1536*i15, sum208);
_mm512_storeu_ps(sumPtr1+-62016+1536*i15, sum209);
_mm512_storeu_ps(sumPtr1+256+1536*i15, sum210);
_mm512_storeu_ps(sumPtr1+320+1536*i15, sum211);
_mm512_storeu_ps(sumPtr1+384+1536*i15, sum212);
_mm512_storeu_ps(sumPtr1+-61760+1536*i15, sum213);
_mm512_storeu_ps(sumPtr1+512+1536*i15, sum214);
_mm512_storeu_ps(sumPtr1+576+1536*i15, sum215);
_mm512_storeu_ps(sumPtr1+640+1536*i15, sum216);
_mm512_storeu_ps(sumPtr1+-61504+1536*i15, sum217);
_mm512_storeu_ps(sumPtr1+768+1536*i15, sum218);
_mm512_storeu_ps(sumPtr1+832+1536*i15, sum219);
_mm512_storeu_ps(sumPtr1+896+1536*i15, sum220);
_mm512_storeu_ps(sumPtr1+-61248+1536*i15, sum221);
_mm512_storeu_ps(sumPtr1+1024+1536*i15, sum222);
_mm512_storeu_ps(sumPtr1+1088+1536*i15, sum223);
_mm512_storeu_ps(sumPtr1+1152+1536*i15, sum224);
_mm512_storeu_ps(sumPtr1+-60992+1536*i15, sum225);
_mm512_storeu_ps(sumPtr1+1280+1536*i15, sum226);
_mm512_storeu_ps(sumPtr1+1344+1536*i15, sum227);
_mm512_storeu_ps(sumPtr1+1408+1536*i15, sum228);
}
__m512 sum229 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i15));
__m512 sum230 = sum229;
__m512 sum231 = sum229;
__m512 sum232 = sum229;
for (ptrdiff_t j20 = 0; j20 < 16; ++j20) {
__m512 dat70 = _mm512_loadu_ps(datPtr3+0+256*j20);
__m512 dat71 = _mm512_loadu_ps(datPtr3+64+256*j20);
__m512 dat72 = _mm512_loadu_ps(datPtr3+128+256*j20);
__m512 dat73 = _mm512_loadu_ps(datPtr3+192+256*j20);
__m512 wt127 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i15+4*j20));
sum229 = _mm512_fmadd_ps(wt127, dat70, sum229);
sum230 = _mm512_fmadd_ps(wt127, dat71, sum230);
sum231 = _mm512_fmadd_ps(wt127, dat72, sum231);
sum232 = _mm512_fmadd_ps(wt127, dat73, sum232);
}
_mm512_storeu_ps(sumPtr1+-62272+1536*i15, sum229);
_mm512_storeu_ps(sumPtr1+0+1536*i15, sum230);
_mm512_storeu_ps(sumPtr1+64+1536*i15, sum231);
_mm512_storeu_ps(sumPtr1+128+1536*i15, sum232);
break;
}
case 4: {
if (node6) {
if (!epoch1 && base1) {
ptrdiff_t i16 = 11*w3;
for (; i16 != 10; ++i16) {
__m512 sum233 = _mm512_setzero_ps();
__m512 sum235 = _mm512_setzero_ps();
__m512 sum237 = _mm512_setzero_ps();
__m512 sum239 = _mm512_setzero_ps();
__m512 sum241 = _mm512_setzero_ps();
__m512 sum243 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum234 = sum233;
__m512 sum236 = sum235;
__m512 sum238 = sum237;
__m512 sum240 = sum239;
__m512 sum242 = sum241;
__m512 sum244 = sum243;
for (ptrdiff_t j21 = 0; j21 < 16; ++j21) {
__m512 dat74 = _mm512_loadu_ps(datPtr3+128+256*j21);
__m512 dat75 = _mm512_loadu_ps(datPtr3+192+256*j21);
__m512 wt128 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i16+24*j21));
sum233 = _mm512_fmadd_ps(wt128, dat74, sum233);
sum234 = _mm512_fmadd_ps(wt128, dat75, sum234);
__m512 wt129 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i16+24*j21));
sum235 = _mm512_fmadd_ps(wt129, dat74, sum235);
sum236 = _mm512_fmadd_ps(wt129, dat75, sum236);
__m512 wt130 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i16+24*j21));
sum237 = _mm512_fmadd_ps(wt130, dat74, sum237);
sum238 = _mm512_fmadd_ps(wt130, dat75, sum238);
__m512 wt131 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i16+24*j21));
sum239 = _mm512_fmadd_ps(wt131, dat74, sum239);
sum240 = _mm512_fmadd_ps(wt131, dat75, sum240);
__m512 wt132 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i16+24*j21));
sum241 = _mm512_fmadd_ps(wt132, dat74, sum241);
sum242 = _mm512_fmadd_ps(wt132, dat75, sum242);
__m512 wt133 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i16+24*j21));
sum243 = _mm512_fmadd_ps(wt133, dat74, sum243);
sum244 = _mm512_fmadd_ps(wt133, dat75, sum244);
}
_mm512_storeu_ps(sumPtr1+0+1536*i16, sum233);
_mm512_storeu_ps(sumPtr1+64+1536*i16, sum234);
_mm512_storeu_ps(sumPtr1+256+1536*i16, sum235);
_mm512_storeu_ps(sumPtr1+320+1536*i16, sum236);
_mm512_storeu_ps(sumPtr1+512+1536*i16, sum237);
_mm512_storeu_ps(sumPtr1+576+1536*i16, sum238);
_mm512_storeu_ps(sumPtr1+768+1536*i16, sum239);
_mm512_storeu_ps(sumPtr1+832+1536*i16, sum240);
_mm512_storeu_ps(sumPtr1+1024+1536*i16, sum241);
_mm512_storeu_ps(sumPtr1+1088+1536*i16, sum242);
_mm512_storeu_ps(sumPtr1+1280+1536*i16, sum243);
_mm512_storeu_ps(sumPtr1+1344+1536*i16, sum244);
}
__m512 sum245 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum246 = sum245;
for (ptrdiff_t j22 = 0; j22 < 16; ++j22) {
__m512 dat76 = _mm512_loadu_ps(datPtr3+128+256*j22);
__m512 dat77 = _mm512_loadu_ps(datPtr3+192+256*j22);
__m512 wt134 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i16+4*j22));
sum245 = _mm512_fmadd_ps(wt134, dat76, sum245);
sum246 = _mm512_fmadd_ps(wt134, dat77, sum246);
}
_mm512_storeu_ps(sumPtr1+0+1536*i16, sum245);
_mm512_storeu_ps(sumPtr1+64+1536*i16, sum246);
return;
}
ptrdiff_t i17 = 11*w3;
for (; i17 != 10; ++i17) {
__m512 sum247 = _mm512_setzero_ps();
__m512 sum249 = _mm512_setzero_ps();
__m512 sum251 = _mm512_setzero_ps();
__m512 sum253 = _mm512_setzero_ps();
__m512 sum255 = _mm512_setzero_ps();
__m512 sum257 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum248 = sum247;
__m512 sum250 = sum249;
__m512 sum252 = sum251;
__m512 sum254 = sum253;
__m512 sum256 = sum255;
__m512 sum258 = sum257;
for (ptrdiff_t j23 = 0; j23 < 16; ++j23) {
__m512 dat78 = _mm512_loadu_ps(datPtr3+128+256*j23);
__m512 dat79 = _mm512_loadu_ps(datPtr3+192+256*j23);
__m512 wt135 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i17+24*j23));
sum247 = _mm512_fmadd_ps(wt135, dat78, sum247);
sum248 = _mm512_fmadd_ps(wt135, dat79, sum248);
__m512 wt136 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i17+24*j23));
sum249 = _mm512_fmadd_ps(wt136, dat78, sum249);
sum250 = _mm512_fmadd_ps(wt136, dat79, sum250);
__m512 wt137 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i17+24*j23));
sum251 = _mm512_fmadd_ps(wt137, dat78, sum251);
sum252 = _mm512_fmadd_ps(wt137, dat79, sum252);
__m512 wt138 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i17+24*j23));
sum253 = _mm512_fmadd_ps(wt138, dat78, sum253);
sum254 = _mm512_fmadd_ps(wt138, dat79, sum254);
__m512 wt139 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i17+24*j23));
sum255 = _mm512_fmadd_ps(wt139, dat78, sum255);
sum256 = _mm512_fmadd_ps(wt139, dat79, sum256);
__m512 wt140 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i17+24*j23));
sum257 = _mm512_fmadd_ps(wt140, dat78, sum257);
sum258 = _mm512_fmadd_ps(wt140, dat79, sum258);
}
_mm512_storeu_ps(sumPtr1+0+1536*i17, _mm512_add_ps(sum247, _mm512_loadu_ps(sumPtr1+0+1536*i17)));
_mm512_storeu_ps(sumPtr1+64+1536*i17, _mm512_add_ps(sum248, _mm512_loadu_ps(sumPtr1+64+1536*i17)));
_mm512_storeu_ps(sumPtr1+256+1536*i17, _mm512_add_ps(sum249, _mm512_loadu_ps(sumPtr1+256+1536*i17)));
_mm512_storeu_ps(sumPtr1+320+1536*i17, _mm512_add_ps(sum250, _mm512_loadu_ps(sumPtr1+320+1536*i17)));
_mm512_storeu_ps(sumPtr1+512+1536*i17, _mm512_add_ps(sum251, _mm512_loadu_ps(sumPtr1+512+1536*i17)));
_mm512_storeu_ps(sumPtr1+576+1536*i17, _mm512_add_ps(sum252, _mm512_loadu_ps(sumPtr1+576+1536*i17)));
_mm512_storeu_ps(sumPtr1+768+1536*i17, _mm512_add_ps(sum253, _mm512_loadu_ps(sumPtr1+768+1536*i17)));
_mm512_storeu_ps(sumPtr1+832+1536*i17, _mm512_add_ps(sum254, _mm512_loadu_ps(sumPtr1+832+1536*i17)));
_mm512_storeu_ps(sumPtr1+1024+1536*i17, _mm512_add_ps(sum255, _mm512_loadu_ps(sumPtr1+1024+1536*i17)));
_mm512_storeu_ps(sumPtr1+1088+1536*i17, _mm512_add_ps(sum256, _mm512_loadu_ps(sumPtr1+1088+1536*i17)));
_mm512_storeu_ps(sumPtr1+1280+1536*i17, _mm512_add_ps(sum257, _mm512_loadu_ps(sumPtr1+1280+1536*i17)));
_mm512_storeu_ps(sumPtr1+1344+1536*i17, _mm512_add_ps(sum258, _mm512_loadu_ps(sumPtr1+1344+1536*i17)));
}
__m512 sum259 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum260 = sum259;
for (ptrdiff_t j24 = 0; j24 < 16; ++j24) {
__m512 dat80 = _mm512_loadu_ps(datPtr3+128+256*j24);
__m512 dat81 = _mm512_loadu_ps(datPtr3+192+256*j24);
__m512 wt141 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i17+4*j24));
sum259 = _mm512_fmadd_ps(wt141, dat80, sum259);
sum260 = _mm512_fmadd_ps(wt141, dat81, sum260);
}
_mm512_storeu_ps(sumPtr1+0+1536*i17, _mm512_add_ps(sum259, _mm512_loadu_ps(sumPtr1+0+1536*i17)));
_mm512_storeu_ps(sumPtr1+64+1536*i17, _mm512_add_ps(sum260, _mm512_loadu_ps(sumPtr1+64+1536*i17)));
return;
}
(void)base1;
ptrdiff_t i18 = 11*w3;
for (; i18 != 10; ++i18) {
__m512 sum261 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i18));
__m512 sum263 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i18));
__m512 sum265 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i18));
__m512 sum267 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i18));
__m512 sum269 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i18));
__m512 sum271 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i18));
__m512 sum262 = sum261;
__m512 sum264 = sum263;
__m512 sum266 = sum265;
__m512 sum268 = sum267;
__m512 sum270 = sum269;
__m512 sum272 = sum271;
for (ptrdiff_t j25 = 0; j25 < 16; ++j25) {
__m512 dat82 = _mm512_loadu_ps(datPtr3+128+256*j25);
__m512 dat83 = _mm512_loadu_ps(datPtr3+192+256*j25);
__m512 wt142 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i18+24*j25));
sum261 = _mm512_fmadd_ps(wt142, dat82, sum261);
sum262 = _mm512_fmadd_ps(wt142, dat83, sum262);
__m512 wt143 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i18+24*j25));
sum263 = _mm512_fmadd_ps(wt143, dat82, sum263);
sum264 = _mm512_fmadd_ps(wt143, dat83, sum264);
__m512 wt144 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i18+24*j25));
sum265 = _mm512_fmadd_ps(wt144, dat82, sum265);
sum266 = _mm512_fmadd_ps(wt144, dat83, sum266);
__m512 wt145 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i18+24*j25));
sum267 = _mm512_fmadd_ps(wt145, dat82, sum267);
sum268 = _mm512_fmadd_ps(wt145, dat83, sum268);
__m512 wt146 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i18+24*j25));
sum269 = _mm512_fmadd_ps(wt146, dat82, sum269);
sum270 = _mm512_fmadd_ps(wt146, dat83, sum270);
__m512 wt147 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i18+24*j25));
sum271 = _mm512_fmadd_ps(wt147, dat82, sum271);
sum272 = _mm512_fmadd_ps(wt147, dat83, sum272);
}
_mm512_storeu_ps(sumPtr1+0+1536*i18, sum261);
_mm512_storeu_ps(sumPtr1+64+1536*i18, sum262);
_mm512_storeu_ps(sumPtr1+256+1536*i18, sum263);
_mm512_storeu_ps(sumPtr1+320+1536*i18, sum264);
_mm512_storeu_ps(sumPtr1+512+1536*i18, sum265);
_mm512_storeu_ps(sumPtr1+576+1536*i18, sum266);
_mm512_storeu_ps(sumPtr1+768+1536*i18, sum267);
_mm512_storeu_ps(sumPtr1+832+1536*i18, sum268);
_mm512_storeu_ps(sumPtr1+1024+1536*i18, sum269);
_mm512_storeu_ps(sumPtr1+1088+1536*i18, sum270);
_mm512_storeu_ps(sumPtr1+1280+1536*i18, sum271);
_mm512_storeu_ps(sumPtr1+1344+1536*i18, sum272);
}
__m512 sum273 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i18));
__m512 sum274 = sum273;
for (ptrdiff_t j26 = 0; j26 < 16; ++j26) {
__m512 dat84 = _mm512_loadu_ps(datPtr3+128+256*j26);
__m512 dat85 = _mm512_loadu_ps(datPtr3+192+256*j26);
__m512 wt148 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i18+4*j26));
sum273 = _mm512_fmadd_ps(wt148, dat84, sum273);
sum274 = _mm512_fmadd_ps(wt148, dat85, sum274);
}
_mm512_storeu_ps(sumPtr1+0+1536*i18, sum273);
_mm512_storeu_ps(sumPtr1+64+1536*i18, sum274);
break;
}
case 5: {
// Case 5: every j step loads four 512-bit vectors from datPtr3 (offsets 0, 64,
// 128 and 192, plus 256*j) and multiplies them by weights broadcast from wtPtr2.
// Three paths follow; they differ in how the accumulators start and in how the
// results reach sumPtr1:
//   node6 && !epoch1 && base1 : start at zero, store (overwrite), return
//   node6 otherwise           : start at zero, add to values read from sumPtr1, return
//   !node6                    : start from biasPtr2 broadcasts, store (overwrite), break
// Each path runs a main loop over i from 11*w3 up to (not including) 10 with six
// weights per j, followed by one trailing pass at the final i with one weight per j.
// NOTE(review): the loops test `i != 10`, so they rely on 11*w3 <= 10 on entry
// -- confirm against the caller.
if (node6) {
if (!epoch1 && base1) {
// Path 1: zero-initialised accumulators, results overwrite sumPtr1.
ptrdiff_t i19 = 11*w3;
for (; i19 != 10; ++i19) {
// One zero per weight row (6 rows); biasPtr2 is not read on this path.
__m512 sum275 = _mm512_setzero_ps();
__m512 sum279 = _mm512_setzero_ps();
__m512 sum283 = _mm512_setzero_ps();
__m512 sum287 = _mm512_setzero_ps();
__m512 sum291 = _mm512_setzero_ps();
__m512 sum295 = _mm512_setzero_ps();
(void)biasPtr2;
// Replicate each row's start value to its other three accumulators
// (4 data vectors per row, 24 accumulators in total).
__m512 sum276 = sum275;
__m512 sum277 = sum275;
__m512 sum278 = sum275;
__m512 sum280 = sum279;
__m512 sum281 = sum279;
__m512 sum282 = sum279;
__m512 sum284 = sum283;
__m512 sum285 = sum283;
__m512 sum286 = sum283;
__m512 sum288 = sum287;
__m512 sum289 = sum287;
__m512 sum290 = sum287;
__m512 sum292 = sum291;
__m512 sum293 = sum291;
__m512 sum294 = sum291;
__m512 sum296 = sum295;
__m512 sum297 = sum295;
__m512 sum298 = sum295;
// 16 steps: load 4 data vectors, then for each of the 6 broadcast weights
// (wtPtr2 offsets 0..20 step 4, plus 384*i + 24*j) do 4 fused multiply-adds.
for (ptrdiff_t j27 = 0; j27 < 16; ++j27) {
__m512 dat86 = _mm512_loadu_ps(datPtr3+0+256*j27);
__m512 dat87 = _mm512_loadu_ps(datPtr3+64+256*j27);
__m512 dat88 = _mm512_loadu_ps(datPtr3+128+256*j27);
__m512 dat89 = _mm512_loadu_ps(datPtr3+192+256*j27);
__m512 wt149 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i19+24*j27));
sum275 = _mm512_fmadd_ps(wt149, dat86, sum275);
sum276 = _mm512_fmadd_ps(wt149, dat87, sum276);
sum277 = _mm512_fmadd_ps(wt149, dat88, sum277);
sum278 = _mm512_fmadd_ps(wt149, dat89, sum278);
__m512 wt150 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i19+24*j27));
sum279 = _mm512_fmadd_ps(wt150, dat86, sum279);
sum280 = _mm512_fmadd_ps(wt150, dat87, sum280);
sum281 = _mm512_fmadd_ps(wt150, dat88, sum281);
sum282 = _mm512_fmadd_ps(wt150, dat89, sum282);
__m512 wt151 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i19+24*j27));
sum283 = _mm512_fmadd_ps(wt151, dat86, sum283);
sum284 = _mm512_fmadd_ps(wt151, dat87, sum284);
sum285 = _mm512_fmadd_ps(wt151, dat88, sum285);
sum286 = _mm512_fmadd_ps(wt151, dat89, sum286);
__m512 wt152 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i19+24*j27));
sum287 = _mm512_fmadd_ps(wt152, dat86, sum287);
sum288 = _mm512_fmadd_ps(wt152, dat87, sum288);
sum289 = _mm512_fmadd_ps(wt152, dat88, sum289);
sum290 = _mm512_fmadd_ps(wt152, dat89, sum290);
__m512 wt153 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i19+24*j27));
sum291 = _mm512_fmadd_ps(wt153, dat86, sum291);
sum292 = _mm512_fmadd_ps(wt153, dat87, sum292);
sum293 = _mm512_fmadd_ps(wt153, dat88, sum293);
sum294 = _mm512_fmadd_ps(wt153, dat89, sum294);
__m512 wt154 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i19+24*j27));
sum295 = _mm512_fmadd_ps(wt154, dat86, sum295);
sum296 = _mm512_fmadd_ps(wt154, dat87, sum296);
sum297 = _mm512_fmadd_ps(wt154, dat88, sum297);
sum298 = _mm512_fmadd_ps(wt154, dat89, sum298);
}
// Store all 24 results. Within each row, the vectors built from datPtr3+0 and
// +64 go to negative offsets from sumPtr1, those from +128 and +192 to
// non-negative offsets; rows are 256 apart and each i advances by 1536.
_mm512_storeu_ps(sumPtr1+-62336+1536*i19, sum275);
_mm512_storeu_ps(sumPtr1+-62272+1536*i19, sum276);
_mm512_storeu_ps(sumPtr1+0+1536*i19, sum277);
_mm512_storeu_ps(sumPtr1+64+1536*i19, sum278);
_mm512_storeu_ps(sumPtr1+-62080+1536*i19, sum279);
_mm512_storeu_ps(sumPtr1+-62016+1536*i19, sum280);
_mm512_storeu_ps(sumPtr1+256+1536*i19, sum281);
_mm512_storeu_ps(sumPtr1+320+1536*i19, sum282);
_mm512_storeu_ps(sumPtr1+-61824+1536*i19, sum283);
_mm512_storeu_ps(sumPtr1+-61760+1536*i19, sum284);
_mm512_storeu_ps(sumPtr1+512+1536*i19, sum285);
_mm512_storeu_ps(sumPtr1+576+1536*i19, sum286);
_mm512_storeu_ps(sumPtr1+-61568+1536*i19, sum287);
_mm512_storeu_ps(sumPtr1+-61504+1536*i19, sum288);
_mm512_storeu_ps(sumPtr1+768+1536*i19, sum289);
_mm512_storeu_ps(sumPtr1+832+1536*i19, sum290);
_mm512_storeu_ps(sumPtr1+-61312+1536*i19, sum291);
_mm512_storeu_ps(sumPtr1+-61248+1536*i19, sum292);
_mm512_storeu_ps(sumPtr1+1024+1536*i19, sum293);
_mm512_storeu_ps(sumPtr1+1088+1536*i19, sum294);
_mm512_storeu_ps(sumPtr1+-61056+1536*i19, sum295);
_mm512_storeu_ps(sumPtr1+-60992+1536*i19, sum296);
_mm512_storeu_ps(sumPtr1+1280+1536*i19, sum297);
_mm512_storeu_ps(sumPtr1+1344+1536*i19, sum298);
}
// Trailing pass: i19 is 10 here. Only one weight per j is read (wtPtr2 stride
// 4*j instead of 24*j), giving a single row of 4 accumulators.
// Presumably this is the one leftover output channel (ToChannels=61 = 10*6 + 1)
// -- TODO confirm.
__m512 sum299 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum300 = sum299;
__m512 sum301 = sum299;
__m512 sum302 = sum299;
for (ptrdiff_t j28 = 0; j28 < 16; ++j28) {
__m512 dat90 = _mm512_loadu_ps(datPtr3+0+256*j28);
__m512 dat91 = _mm512_loadu_ps(datPtr3+64+256*j28);
__m512 dat92 = _mm512_loadu_ps(datPtr3+128+256*j28);
__m512 dat93 = _mm512_loadu_ps(datPtr3+192+256*j28);
__m512 wt155 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i19+4*j28));
sum299 = _mm512_fmadd_ps(wt155, dat90, sum299);
sum300 = _mm512_fmadd_ps(wt155, dat91, sum300);
sum301 = _mm512_fmadd_ps(wt155, dat92, sum301);
sum302 = _mm512_fmadd_ps(wt155, dat93, sum302);
}
_mm512_storeu_ps(sumPtr1+-62336+1536*i19, sum299);
_mm512_storeu_ps(sumPtr1+-62272+1536*i19, sum300);
_mm512_storeu_ps(sumPtr1+0+1536*i19, sum301);
_mm512_storeu_ps(sumPtr1+64+1536*i19, sum302);
return;
}
// Path 2 (node6 set, but not the !epoch1 && base1 case): same arithmetic as
// path 1, except each result is added to the vector already at its sumPtr1 slot.
ptrdiff_t i20 = 11*w3;
for (; i20 != 10; ++i20) {
__m512 sum303 = _mm512_setzero_ps();
__m512 sum307 = _mm512_setzero_ps();
__m512 sum311 = _mm512_setzero_ps();
__m512 sum315 = _mm512_setzero_ps();
__m512 sum319 = _mm512_setzero_ps();
__m512 sum323 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum304 = sum303;
__m512 sum305 = sum303;
__m512 sum306 = sum303;
__m512 sum308 = sum307;
__m512 sum309 = sum307;
__m512 sum310 = sum307;
__m512 sum312 = sum311;
__m512 sum313 = sum311;
__m512 sum314 = sum311;
__m512 sum316 = sum315;
__m512 sum317 = sum315;
__m512 sum318 = sum315;
__m512 sum320 = sum319;
__m512 sum321 = sum319;
__m512 sum322 = sum319;
__m512 sum324 = sum323;
__m512 sum325 = sum323;
__m512 sum326 = sum323;
// 16 steps of 6 weights x 4 data vectors, as in path 1.
for (ptrdiff_t j29 = 0; j29 < 16; ++j29) {
__m512 dat94 = _mm512_loadu_ps(datPtr3+0+256*j29);
__m512 dat95 = _mm512_loadu_ps(datPtr3+64+256*j29);
__m512 dat96 = _mm512_loadu_ps(datPtr3+128+256*j29);
__m512 dat97 = _mm512_loadu_ps(datPtr3+192+256*j29);
__m512 wt156 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i20+24*j29));
sum303 = _mm512_fmadd_ps(wt156, dat94, sum303);
sum304 = _mm512_fmadd_ps(wt156, dat95, sum304);
sum305 = _mm512_fmadd_ps(wt156, dat96, sum305);
sum306 = _mm512_fmadd_ps(wt156, dat97, sum306);
__m512 wt157 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i20+24*j29));
sum307 = _mm512_fmadd_ps(wt157, dat94, sum307);
sum308 = _mm512_fmadd_ps(wt157, dat95, sum308);
sum309 = _mm512_fmadd_ps(wt157, dat96, sum309);
sum310 = _mm512_fmadd_ps(wt157, dat97, sum310);
__m512 wt158 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i20+24*j29));
sum311 = _mm512_fmadd_ps(wt158, dat94, sum311);
sum312 = _mm512_fmadd_ps(wt158, dat95, sum312);
sum313 = _mm512_fmadd_ps(wt158, dat96, sum313);
sum314 = _mm512_fmadd_ps(wt158, dat97, sum314);
__m512 wt159 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i20+24*j29));
sum315 = _mm512_fmadd_ps(wt159, dat94, sum315);
sum316 = _mm512_fmadd_ps(wt159, dat95, sum316);
sum317 = _mm512_fmadd_ps(wt159, dat96, sum317);
sum318 = _mm512_fmadd_ps(wt159, dat97, sum318);
__m512 wt160 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i20+24*j29));
sum319 = _mm512_fmadd_ps(wt160, dat94, sum319);
sum320 = _mm512_fmadd_ps(wt160, dat95, sum320);
sum321 = _mm512_fmadd_ps(wt160, dat96, sum321);
sum322 = _mm512_fmadd_ps(wt160, dat97, sum322);
__m512 wt161 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i20+24*j29));
sum323 = _mm512_fmadd_ps(wt161, dat94, sum323);
sum324 = _mm512_fmadd_ps(wt161, dat95, sum324);
sum325 = _mm512_fmadd_ps(wt161, dat96, sum325);
sum326 = _mm512_fmadd_ps(wt161, dat97, sum326);
}
// Read-modify-write: load the existing vector at each slot, add, store back.
_mm512_storeu_ps(sumPtr1+-62336+1536*i20, _mm512_add_ps(sum303, _mm512_loadu_ps(sumPtr1+-62336+1536*i20)));
_mm512_storeu_ps(sumPtr1+-62272+1536*i20, _mm512_add_ps(sum304, _mm512_loadu_ps(sumPtr1+-62272+1536*i20)));
_mm512_storeu_ps(sumPtr1+0+1536*i20, _mm512_add_ps(sum305, _mm512_loadu_ps(sumPtr1+0+1536*i20)));
_mm512_storeu_ps(sumPtr1+64+1536*i20, _mm512_add_ps(sum306, _mm512_loadu_ps(sumPtr1+64+1536*i20)));
_mm512_storeu_ps(sumPtr1+-62080+1536*i20, _mm512_add_ps(sum307, _mm512_loadu_ps(sumPtr1+-62080+1536*i20)));
_mm512_storeu_ps(sumPtr1+-62016+1536*i20, _mm512_add_ps(sum308, _mm512_loadu_ps(sumPtr1+-62016+1536*i20)));
_mm512_storeu_ps(sumPtr1+256+1536*i20, _mm512_add_ps(sum309, _mm512_loadu_ps(sumPtr1+256+1536*i20)));
_mm512_storeu_ps(sumPtr1+320+1536*i20, _mm512_add_ps(sum310, _mm512_loadu_ps(sumPtr1+320+1536*i20)));
_mm512_storeu_ps(sumPtr1+-61824+1536*i20, _mm512_add_ps(sum311, _mm512_loadu_ps(sumPtr1+-61824+1536*i20)));
_mm512_storeu_ps(sumPtr1+-61760+1536*i20, _mm512_add_ps(sum312, _mm512_loadu_ps(sumPtr1+-61760+1536*i20)));
_mm512_storeu_ps(sumPtr1+512+1536*i20, _mm512_add_ps(sum313, _mm512_loadu_ps(sumPtr1+512+1536*i20)));
_mm512_storeu_ps(sumPtr1+576+1536*i20, _mm512_add_ps(sum314, _mm512_loadu_ps(sumPtr1+576+1536*i20)));
_mm512_storeu_ps(sumPtr1+-61568+1536*i20, _mm512_add_ps(sum315, _mm512_loadu_ps(sumPtr1+-61568+1536*i20)));
_mm512_storeu_ps(sumPtr1+-61504+1536*i20, _mm512_add_ps(sum316, _mm512_loadu_ps(sumPtr1+-61504+1536*i20)));
_mm512_storeu_ps(sumPtr1+768+1536*i20, _mm512_add_ps(sum317, _mm512_loadu_ps(sumPtr1+768+1536*i20)));
_mm512_storeu_ps(sumPtr1+832+1536*i20, _mm512_add_ps(sum318, _mm512_loadu_ps(sumPtr1+832+1536*i20)));
_mm512_storeu_ps(sumPtr1+-61312+1536*i20, _mm512_add_ps(sum319, _mm512_loadu_ps(sumPtr1+-61312+1536*i20)));
_mm512_storeu_ps(sumPtr1+-61248+1536*i20, _mm512_add_ps(sum320, _mm512_loadu_ps(sumPtr1+-61248+1536*i20)));
_mm512_storeu_ps(sumPtr1+1024+1536*i20, _mm512_add_ps(sum321, _mm512_loadu_ps(sumPtr1+1024+1536*i20)));
_mm512_storeu_ps(sumPtr1+1088+1536*i20, _mm512_add_ps(sum322, _mm512_loadu_ps(sumPtr1+1088+1536*i20)));
_mm512_storeu_ps(sumPtr1+-61056+1536*i20, _mm512_add_ps(sum323, _mm512_loadu_ps(sumPtr1+-61056+1536*i20)));
_mm512_storeu_ps(sumPtr1+-60992+1536*i20, _mm512_add_ps(sum324, _mm512_loadu_ps(sumPtr1+-60992+1536*i20)));
_mm512_storeu_ps(sumPtr1+1280+1536*i20, _mm512_add_ps(sum325, _mm512_loadu_ps(sumPtr1+1280+1536*i20)));
_mm512_storeu_ps(sumPtr1+1344+1536*i20, _mm512_add_ps(sum326, _mm512_loadu_ps(sumPtr1+1344+1536*i20)));
}
// Trailing pass for path 2 (i20 is 10): one weight per j, results accumulated
// into the four existing sumPtr1 slots.
__m512 sum327 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum328 = sum327;
__m512 sum329 = sum327;
__m512 sum330 = sum327;
for (ptrdiff_t j30 = 0; j30 < 16; ++j30) {
__m512 dat98 = _mm512_loadu_ps(datPtr3+0+256*j30);
__m512 dat99 = _mm512_loadu_ps(datPtr3+64+256*j30);
__m512 dat100 = _mm512_loadu_ps(datPtr3+128+256*j30);
__m512 dat101 = _mm512_loadu_ps(datPtr3+192+256*j30);
__m512 wt162 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i20+4*j30));
sum327 = _mm512_fmadd_ps(wt162, dat98, sum327);
sum328 = _mm512_fmadd_ps(wt162, dat99, sum328);
sum329 = _mm512_fmadd_ps(wt162, dat100, sum329);
sum330 = _mm512_fmadd_ps(wt162, dat101, sum330);
}
_mm512_storeu_ps(sumPtr1+-62336+1536*i20, _mm512_add_ps(sum327, _mm512_loadu_ps(sumPtr1+-62336+1536*i20)));
_mm512_storeu_ps(sumPtr1+-62272+1536*i20, _mm512_add_ps(sum328, _mm512_loadu_ps(sumPtr1+-62272+1536*i20)));
_mm512_storeu_ps(sumPtr1+0+1536*i20, _mm512_add_ps(sum329, _mm512_loadu_ps(sumPtr1+0+1536*i20)));
_mm512_storeu_ps(sumPtr1+64+1536*i20, _mm512_add_ps(sum330, _mm512_loadu_ps(sumPtr1+64+1536*i20)));
return;
}
// Path 3 (node6 is zero): accumulators start from broadcast biasPtr2 values and
// the results overwrite sumPtr1. base1 is unused here.
(void)base1;
ptrdiff_t i21 = 11*w3;
for (; i21 != 10; ++i21) {
// Six bias floats per i (biasPtr2 offsets 0..20 step 4, plus 24*i), one per row.
__m512 sum331 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i21));
__m512 sum335 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i21));
__m512 sum339 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i21));
__m512 sum343 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i21));
__m512 sum347 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i21));
__m512 sum351 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i21));
__m512 sum332 = sum331;
__m512 sum333 = sum331;
__m512 sum334 = sum331;
__m512 sum336 = sum335;
__m512 sum337 = sum335;
__m512 sum338 = sum335;
__m512 sum340 = sum339;
__m512 sum341 = sum339;
__m512 sum342 = sum339;
__m512 sum344 = sum343;
__m512 sum345 = sum343;
__m512 sum346 = sum343;
__m512 sum348 = sum347;
__m512 sum349 = sum347;
__m512 sum350 = sum347;
__m512 sum352 = sum351;
__m512 sum353 = sum351;
__m512 sum354 = sum351;
// 16 steps of 6 weights x 4 data vectors, as in the other paths.
for (ptrdiff_t j31 = 0; j31 < 16; ++j31) {
__m512 dat102 = _mm512_loadu_ps(datPtr3+0+256*j31);
__m512 dat103 = _mm512_loadu_ps(datPtr3+64+256*j31);
__m512 dat104 = _mm512_loadu_ps(datPtr3+128+256*j31);
__m512 dat105 = _mm512_loadu_ps(datPtr3+192+256*j31);
__m512 wt163 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i21+24*j31));
sum331 = _mm512_fmadd_ps(wt163, dat102, sum331);
sum332 = _mm512_fmadd_ps(wt163, dat103, sum332);
sum333 = _mm512_fmadd_ps(wt163, dat104, sum333);
sum334 = _mm512_fmadd_ps(wt163, dat105, sum334);
__m512 wt164 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i21+24*j31));
sum335 = _mm512_fmadd_ps(wt164, dat102, sum335);
sum336 = _mm512_fmadd_ps(wt164, dat103, sum336);
sum337 = _mm512_fmadd_ps(wt164, dat104, sum337);
sum338 = _mm512_fmadd_ps(wt164, dat105, sum338);
__m512 wt165 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i21+24*j31));
sum339 = _mm512_fmadd_ps(wt165, dat102, sum339);
sum340 = _mm512_fmadd_ps(wt165, dat103, sum340);
sum341 = _mm512_fmadd_ps(wt165, dat104, sum341);
sum342 = _mm512_fmadd_ps(wt165, dat105, sum342);
__m512 wt166 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i21+24*j31));
sum343 = _mm512_fmadd_ps(wt166, dat102, sum343);
sum344 = _mm512_fmadd_ps(wt166, dat103, sum344);
sum345 = _mm512_fmadd_ps(wt166, dat104, sum345);
sum346 = _mm512_fmadd_ps(wt166, dat105, sum346);
__m512 wt167 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i21+24*j31));
sum347 = _mm512_fmadd_ps(wt167, dat102, sum347);
sum348 = _mm512_fmadd_ps(wt167, dat103, sum348);
sum349 = _mm512_fmadd_ps(wt167, dat104, sum349);
sum350 = _mm512_fmadd_ps(wt167, dat105, sum350);
__m512 wt168 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i21+24*j31));
sum351 = _mm512_fmadd_ps(wt168, dat102, sum351);
sum352 = _mm512_fmadd_ps(wt168, dat103, sum352);
sum353 = _mm512_fmadd_ps(wt168, dat104, sum353);
sum354 = _mm512_fmadd_ps(wt168, dat105, sum354);
}
// Plain stores (no load/add): prior sumPtr1 contents are replaced.
_mm512_storeu_ps(sumPtr1+-62336+1536*i21, sum331);
_mm512_storeu_ps(sumPtr1+-62272+1536*i21, sum332);
_mm512_storeu_ps(sumPtr1+0+1536*i21, sum333);
_mm512_storeu_ps(sumPtr1+64+1536*i21, sum334);
_mm512_storeu_ps(sumPtr1+-62080+1536*i21, sum335);
_mm512_storeu_ps(sumPtr1+-62016+1536*i21, sum336);
_mm512_storeu_ps(sumPtr1+256+1536*i21, sum337);
_mm512_storeu_ps(sumPtr1+320+1536*i21, sum338);
_mm512_storeu_ps(sumPtr1+-61824+1536*i21, sum339);
_mm512_storeu_ps(sumPtr1+-61760+1536*i21, sum340);
_mm512_storeu_ps(sumPtr1+512+1536*i21, sum341);
_mm512_storeu_ps(sumPtr1+576+1536*i21, sum342);
_mm512_storeu_ps(sumPtr1+-61568+1536*i21, sum343);
_mm512_storeu_ps(sumPtr1+-61504+1536*i21, sum344);
_mm512_storeu_ps(sumPtr1+768+1536*i21, sum345);
_mm512_storeu_ps(sumPtr1+832+1536*i21, sum346);
_mm512_storeu_ps(sumPtr1+-61312+1536*i21, sum347);
_mm512_storeu_ps(sumPtr1+-61248+1536*i21, sum348);
_mm512_storeu_ps(sumPtr1+1024+1536*i21, sum349);
_mm512_storeu_ps(sumPtr1+1088+1536*i21, sum350);
_mm512_storeu_ps(sumPtr1+-61056+1536*i21, sum351);
_mm512_storeu_ps(sumPtr1+-60992+1536*i21, sum352);
_mm512_storeu_ps(sumPtr1+1280+1536*i21, sum353);
_mm512_storeu_ps(sumPtr1+1344+1536*i21, sum354);
}
// Trailing pass for path 3 (i21 is 10): one bias float and one weight per j.
__m512 sum355 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i21));
__m512 sum356 = sum355;
__m512 sum357 = sum355;
__m512 sum358 = sum355;
for (ptrdiff_t j32 = 0; j32 < 16; ++j32) {
__m512 dat106 = _mm512_loadu_ps(datPtr3+0+256*j32);
__m512 dat107 = _mm512_loadu_ps(datPtr3+64+256*j32);
__m512 dat108 = _mm512_loadu_ps(datPtr3+128+256*j32);
__m512 dat109 = _mm512_loadu_ps(datPtr3+192+256*j32);
__m512 wt169 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i21+4*j32));
sum355 = _mm512_fmadd_ps(wt169, dat106, sum355);
sum356 = _mm512_fmadd_ps(wt169, dat107, sum356);
sum357 = _mm512_fmadd_ps(wt169, dat108, sum357);
sum358 = _mm512_fmadd_ps(wt169, dat109, sum358);
}
_mm512_storeu_ps(sumPtr1+-62336+1536*i21, sum355);
_mm512_storeu_ps(sumPtr1+-62272+1536*i21, sum356);
_mm512_storeu_ps(sumPtr1+0+1536*i21, sum357);
_mm512_storeu_ps(sumPtr1+64+1536*i21, sum358);
break;
}
case 6: {
// Case 6: only one 512-bit data vector is used per j step (datPtr3+192+256*j),
// so each weight row has a single accumulator. The three paths mirror the
// other cases:
//   node6 && !epoch1 && base1 : start at zero, store (overwrite), return
//   node6 otherwise           : start at zero, add to values read from sumPtr1, return
//   !node6                    : start from biasPtr2 broadcasts, store (overwrite), break
// Each path runs a main loop over i from 11*w3 up to (not including) 10 with six
// weights per j, followed by one trailing pass at the final i with one weight per j.
// NOTE(review): the loops test `i != 10`, so they rely on 11*w3 <= 10 on entry
// -- confirm against the caller.
if (node6) {
if (!epoch1 && base1) {
// Path 1: zero-initialised accumulators, results overwrite sumPtr1.
ptrdiff_t i22 = 11*w3;
for (; i22 != 10; ++i22) {
__m512 sum359 = _mm512_setzero_ps();
__m512 sum360 = _mm512_setzero_ps();
__m512 sum361 = _mm512_setzero_ps();
__m512 sum362 = _mm512_setzero_ps();
__m512 sum363 = _mm512_setzero_ps();
__m512 sum364 = _mm512_setzero_ps();
(void)biasPtr2;
// 16 steps: one data vector, six broadcast weights (wtPtr2 offsets 0..20
// step 4, plus 384*i + 24*j), one fused multiply-add per weight.
for (ptrdiff_t j33 = 0; j33 < 16; ++j33) {
__m512 dat110 = _mm512_loadu_ps(datPtr3+192+256*j33);
__m512 wt170 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i22+24*j33));
sum359 = _mm512_fmadd_ps(wt170, dat110, sum359);
__m512 wt171 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i22+24*j33));
sum360 = _mm512_fmadd_ps(wt171, dat110, sum360);
__m512 wt172 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i22+24*j33));
sum361 = _mm512_fmadd_ps(wt172, dat110, sum361);
__m512 wt173 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i22+24*j33));
sum362 = _mm512_fmadd_ps(wt173, dat110, sum362);
__m512 wt174 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i22+24*j33));
sum363 = _mm512_fmadd_ps(wt174, dat110, sum363);
__m512 wt175 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i22+24*j33));
sum364 = _mm512_fmadd_ps(wt175, dat110, sum364);
}
// Six stores, rows 256 apart, each i advancing by 1536.
_mm512_storeu_ps(sumPtr1+0+1536*i22, sum359);
_mm512_storeu_ps(sumPtr1+256+1536*i22, sum360);
_mm512_storeu_ps(sumPtr1+512+1536*i22, sum361);
_mm512_storeu_ps(sumPtr1+768+1536*i22, sum362);
_mm512_storeu_ps(sumPtr1+1024+1536*i22, sum363);
_mm512_storeu_ps(sumPtr1+1280+1536*i22, sum364);
}
// Trailing pass: i22 is 10 here; one weight per j (wtPtr2 stride 4*j) and a
// single accumulator.
// Presumably this is the one leftover output channel (ToChannels=61 = 10*6 + 1)
// -- TODO confirm.
__m512 sum365 = _mm512_setzero_ps();
(void)biasPtr2;
for (ptrdiff_t j34 = 0; j34 < 16; ++j34) {
__m512 dat111 = _mm512_loadu_ps(datPtr3+192+256*j34);
__m512 wt176 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i22+4*j34));
sum365 = _mm512_fmadd_ps(wt176, dat111, sum365);
}
_mm512_storeu_ps(sumPtr1+0+1536*i22, sum365);
return;
}
// Path 2 (node6 set, but not the !epoch1 && base1 case): same arithmetic,
// with each result added to the vector already at its sumPtr1 slot.
ptrdiff_t i23 = 11*w3;
for (; i23 != 10; ++i23) {
__m512 sum366 = _mm512_setzero_ps();
__m512 sum367 = _mm512_setzero_ps();
__m512 sum368 = _mm512_setzero_ps();
__m512 sum369 = _mm512_setzero_ps();
__m512 sum370 = _mm512_setzero_ps();
__m512 sum371 = _mm512_setzero_ps();
(void)biasPtr2;
for (ptrdiff_t j35 = 0; j35 < 16; ++j35) {
__m512 dat112 = _mm512_loadu_ps(datPtr3+192+256*j35);
__m512 wt177 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i23+24*j35));
sum366 = _mm512_fmadd_ps(wt177, dat112, sum366);
__m512 wt178 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i23+24*j35));
sum367 = _mm512_fmadd_ps(wt178, dat112, sum367);
__m512 wt179 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i23+24*j35));
sum368 = _mm512_fmadd_ps(wt179, dat112, sum368);
__m512 wt180 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i23+24*j35));
sum369 = _mm512_fmadd_ps(wt180, dat112, sum369);
__m512 wt181 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i23+24*j35));
sum370 = _mm512_fmadd_ps(wt181, dat112, sum370);
__m512 wt182 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i23+24*j35));
sum371 = _mm512_fmadd_ps(wt182, dat112, sum371);
}
// Read-modify-write: load the existing vector at each slot, add, store back.
_mm512_storeu_ps(sumPtr1+0+1536*i23, _mm512_add_ps(sum366, _mm512_loadu_ps(sumPtr1+0+1536*i23)));
_mm512_storeu_ps(sumPtr1+256+1536*i23, _mm512_add_ps(sum367, _mm512_loadu_ps(sumPtr1+256+1536*i23)));
_mm512_storeu_ps(sumPtr1+512+1536*i23, _mm512_add_ps(sum368, _mm512_loadu_ps(sumPtr1+512+1536*i23)));
_mm512_storeu_ps(sumPtr1+768+1536*i23, _mm512_add_ps(sum369, _mm512_loadu_ps(sumPtr1+768+1536*i23)));
_mm512_storeu_ps(sumPtr1+1024+1536*i23, _mm512_add_ps(sum370, _mm512_loadu_ps(sumPtr1+1024+1536*i23)));
_mm512_storeu_ps(sumPtr1+1280+1536*i23, _mm512_add_ps(sum371, _mm512_loadu_ps(sumPtr1+1280+1536*i23)));
}
// Trailing pass for path 2 (i23 is 10): one weight per j, accumulated into
// the existing sumPtr1 slot.
__m512 sum372 = _mm512_setzero_ps();
(void)biasPtr2;
for (ptrdiff_t j36 = 0; j36 < 16; ++j36) {
__m512 dat113 = _mm512_loadu_ps(datPtr3+192+256*j36);
__m512 wt183 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i23+4*j36));
sum372 = _mm512_fmadd_ps(wt183, dat113, sum372);
}
_mm512_storeu_ps(sumPtr1+0+1536*i23, _mm512_add_ps(sum372, _mm512_loadu_ps(sumPtr1+0+1536*i23)));
return;
}
// Path 3 (node6 is zero): accumulators start from broadcast biasPtr2 values and
// the results overwrite sumPtr1. base1 is unused here.
(void)base1;
ptrdiff_t i24 = 11*w3;
for (; i24 != 10; ++i24) {
// Six bias floats per i (biasPtr2 offsets 0..20 step 4, plus 24*i).
__m512 sum373 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i24));
__m512 sum374 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i24));
__m512 sum375 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i24));
__m512 sum376 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i24));
__m512 sum377 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i24));
__m512 sum378 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i24));
for (ptrdiff_t j37 = 0; j37 < 16; ++j37) {
__m512 dat114 = _mm512_loadu_ps(datPtr3+192+256*j37);
__m512 wt184 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i24+24*j37));
sum373 = _mm512_fmadd_ps(wt184, dat114, sum373);
__m512 wt185 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i24+24*j37));
sum374 = _mm512_fmadd_ps(wt185, dat114, sum374);
__m512 wt186 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i24+24*j37));
sum375 = _mm512_fmadd_ps(wt186, dat114, sum375);
__m512 wt187 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i24+24*j37));
sum376 = _mm512_fmadd_ps(wt187, dat114, sum376);
__m512 wt188 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i24+24*j37));
sum377 = _mm512_fmadd_ps(wt188, dat114, sum377);
__m512 wt189 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i24+24*j37));
sum378 = _mm512_fmadd_ps(wt189, dat114, sum378);
}
// Plain stores (no load/add): prior sumPtr1 contents are replaced.
_mm512_storeu_ps(sumPtr1+0+1536*i24, sum373);
_mm512_storeu_ps(sumPtr1+256+1536*i24, sum374);
_mm512_storeu_ps(sumPtr1+512+1536*i24, sum375);
_mm512_storeu_ps(sumPtr1+768+1536*i24, sum376);
_mm512_storeu_ps(sumPtr1+1024+1536*i24, sum377);
_mm512_storeu_ps(sumPtr1+1280+1536*i24, sum378);
}
// Trailing pass for path 3 (i24 is 10): one bias float and one weight per j.
__m512 sum379 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i24));
for (ptrdiff_t j38 = 0; j38 < 16; ++j38) {
__m512 dat115 = _mm512_loadu_ps(datPtr3+192+256*j38);
__m512 wt190 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i24+4*j38));
sum379 = _mm512_fmadd_ps(wt190, dat115, sum379);
}
_mm512_storeu_ps(sumPtr1+0+1536*i24, sum379);
break;
}
case 7: {
if (node6) {
if (!epoch1 && base1) {
ptrdiff_t i25 = 11*w3;
for (; i25 != 10; ++i25) {
__m512 sum380 = _mm512_setzero_ps();
__m512 sum384 = _mm512_setzero_ps();
__m512 sum388 = _mm512_setzero_ps();
__m512 sum392 = _mm512_setzero_ps();
__m512 sum396 = _mm512_setzero_ps();
__m512 sum400 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum381 = sum380;
__m512 sum382 = sum380;
__m512 sum383 = sum380;
__m512 sum385 = sum384;
__m512 sum386 = sum384;
__m512 sum387 = sum384;
__m512 sum389 = sum388;
__m512 sum390 = sum388;
__m512 sum391 = sum388;
__m512 sum393 = sum392;
__m512 sum394 = sum392;
__m512 sum395 = sum392;
__m512 sum397 = sum396;
__m512 sum398 = sum396;
__m512 sum399 = sum396;
__m512 sum401 = sum400;
__m512 sum402 = sum400;
__m512 sum403 = sum400;
for (ptrdiff_t j39 = 0; j39 < 16; ++j39) {
__m512 dat116 = _mm512_loadu_ps(datPtr3+0+256*j39);
__m512 dat117 = _mm512_loadu_ps(datPtr3+64+256*j39);
__m512 dat118 = _mm512_loadu_ps(datPtr3+128+256*j39);
__m512 dat119 = _mm512_loadu_ps(datPtr3+192+256*j39);
__m512 wt191 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i25+24*j39));
sum380 = _mm512_fmadd_ps(wt191, dat116, sum380);
sum381 = _mm512_fmadd_ps(wt191, dat117, sum381);
sum382 = _mm512_fmadd_ps(wt191, dat118, sum382);
sum383 = _mm512_fmadd_ps(wt191, dat119, sum383);
__m512 wt192 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i25+24*j39));
sum384 = _mm512_fmadd_ps(wt192, dat116, sum384);
sum385 = _mm512_fmadd_ps(wt192, dat117, sum385);
sum386 = _mm512_fmadd_ps(wt192, dat118, sum386);
sum387 = _mm512_fmadd_ps(wt192, dat119, sum387);
__m512 wt193 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i25+24*j39));
sum388 = _mm512_fmadd_ps(wt193, dat116, sum388);
sum389 = _mm512_fmadd_ps(wt193, dat117, sum389);
sum390 = _mm512_fmadd_ps(wt193, dat118, sum390);
sum391 = _mm512_fmadd_ps(wt193, dat119, sum391);
__m512 wt194 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i25+24*j39));
sum392 = _mm512_fmadd_ps(wt194, dat116, sum392);
sum393 = _mm512_fmadd_ps(wt194, dat117, sum393);
sum394 = _mm512_fmadd_ps(wt194, dat118, sum394);
sum395 = _mm512_fmadd_ps(wt194, dat119, sum395);
__m512 wt195 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i25+24*j39));
sum396 = _mm512_fmadd_ps(wt195, dat116, sum396);
sum397 = _mm512_fmadd_ps(wt195, dat117, sum397);
sum398 = _mm512_fmadd_ps(wt195, dat118, sum398);
sum399 = _mm512_fmadd_ps(wt195, dat119, sum399);
__m512 wt196 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i25+24*j39));
sum400 = _mm512_fmadd_ps(wt196, dat116, sum400);
sum401 = _mm512_fmadd_ps(wt196, dat117, sum401);
sum402 = _mm512_fmadd_ps(wt196, dat118, sum402);
sum403 = _mm512_fmadd_ps(wt196, dat119, sum403);
}
_mm512_storeu_ps(sumPtr1+-62400+1536*i25, sum380);
_mm512_storeu_ps(sumPtr1+-62336+1536*i25, sum381);
_mm512_storeu_ps(sumPtr1+-62272+1536*i25, sum382);
_mm512_storeu_ps(sumPtr1+0+1536*i25, sum383);
_mm512_storeu_ps(sumPtr1+-62144+1536*i25, sum384);
_mm512_storeu_ps(sumPtr1+-62080+1536*i25, sum385);
_mm512_storeu_ps(sumPtr1+-62016+1536*i25, sum386);
_mm512_storeu_ps(sumPtr1+256+1536*i25, sum387);
_mm512_storeu_ps(sumPtr1+-61888+1536*i25, sum388);
_mm512_storeu_ps(sumPtr1+-61824+1536*i25, sum389);
_mm512_storeu_ps(sumPtr1+-61760+1536*i25, sum390);
_mm512_storeu_ps(sumPtr1+512+1536*i25, sum391);
_mm512_storeu_ps(sumPtr1+-61632+1536*i25, sum392);
_mm512_storeu_ps(sumPtr1+-61568+1536*i25, sum393);
_mm512_storeu_ps(sumPtr1+-61504+1536*i25, sum394);
_mm512_storeu_ps(sumPtr1+768+1536*i25, sum395);
_mm512_storeu_ps(sumPtr1+-61376+1536*i25, sum396);
_mm512_storeu_ps(sumPtr1+-61312+1536*i25, sum397);
_mm512_storeu_ps(sumPtr1+-61248+1536*i25, sum398);
_mm512_storeu_ps(sumPtr1+1024+1536*i25, sum399);
_mm512_storeu_ps(sumPtr1+-61120+1536*i25, sum400);
_mm512_storeu_ps(sumPtr1+-61056+1536*i25, sum401);
_mm512_storeu_ps(sumPtr1+-60992+1536*i25, sum402);
_mm512_storeu_ps(sumPtr1+1280+1536*i25, sum403);
}
__m512 sum404 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum405 = sum404;
__m512 sum406 = sum404;
__m512 sum407 = sum404;
for (ptrdiff_t j40 = 0; j40 < 16; ++j40) {
__m512 dat120 = _mm512_loadu_ps(datPtr3+0+256*j40);
__m512 dat121 = _mm512_loadu_ps(datPtr3+64+256*j40);
__m512 dat122 = _mm512_loadu_ps(datPtr3+128+256*j40);
__m512 dat123 = _mm512_loadu_ps(datPtr3+192+256*j40);
__m512 wt197 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i25+4*j40));
sum404 = _mm512_fmadd_ps(wt197, dat120, sum404);
sum405 = _mm512_fmadd_ps(wt197, dat121, sum405);
sum406 = _mm512_fmadd_ps(wt197, dat122, sum406);
sum407 = _mm512_fmadd_ps(wt197, dat123, sum407);
}
_mm512_storeu_ps(sumPtr1+-62400+1536*i25, sum404);
_mm512_storeu_ps(sumPtr1+-62336+1536*i25, sum405);
_mm512_storeu_ps(sumPtr1+-62272+1536*i25, sum406);
_mm512_storeu_ps(sumPtr1+0+1536*i25, sum407);
return;
}
ptrdiff_t i26 = 11*w3;
for (; i26 != 10; ++i26) {
__m512 sum408 = _mm512_setzero_ps();
__m512 sum412 = _mm512_setzero_ps();
__m512 sum416 = _mm512_setzero_ps();
__m512 sum420 = _mm512_setzero_ps();
__m512 sum424 = _mm512_setzero_ps();
__m512 sum428 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum409 = sum408;
__m512 sum410 = sum408;
__m512 sum411 = sum408;
__m512 sum413 = sum412;
__m512 sum414 = sum412;
__m512 sum415 = sum412;
__m512 sum417 = sum416;
__m512 sum418 = sum416;
__m512 sum419 = sum416;
__m512 sum421 = sum420;
__m512 sum422 = sum420;
__m512 sum423 = sum420;
__m512 sum425 = sum424;
__m512 sum426 = sum424;
__m512 sum427 = sum424;
__m512 sum429 = sum428;
__m512 sum430 = sum428;
__m512 sum431 = sum428;
for (ptrdiff_t j41 = 0; j41 < 16; ++j41) {
__m512 dat124 = _mm512_loadu_ps(datPtr3+0+256*j41);
__m512 dat125 = _mm512_loadu_ps(datPtr3+64+256*j41);
__m512 dat126 = _mm512_loadu_ps(datPtr3+128+256*j41);
__m512 dat127 = _mm512_loadu_ps(datPtr3+192+256*j41);
__m512 wt198 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i26+24*j41));
sum408 = _mm512_fmadd_ps(wt198, dat124, sum408);
sum409 = _mm512_fmadd_ps(wt198, dat125, sum409);
sum410 = _mm512_fmadd_ps(wt198, dat126, sum410);
sum411 = _mm512_fmadd_ps(wt198, dat127, sum411);
__m512 wt199 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i26+24*j41));
sum412 = _mm512_fmadd_ps(wt199, dat124, sum412);
sum413 = _mm512_fmadd_ps(wt199, dat125, sum413);
sum414 = _mm512_fmadd_ps(wt199, dat126, sum414);
sum415 = _mm512_fmadd_ps(wt199, dat127, sum415);
__m512 wt200 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i26+24*j41));
sum416 = _mm512_fmadd_ps(wt200, dat124, sum416);
sum417 = _mm512_fmadd_ps(wt200, dat125, sum417);
sum418 = _mm512_fmadd_ps(wt200, dat126, sum418);
sum419 = _mm512_fmadd_ps(wt200, dat127, sum419);
__m512 wt201 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i26+24*j41));
sum420 = _mm512_fmadd_ps(wt201, dat124, sum420);
sum421 = _mm512_fmadd_ps(wt201, dat125, sum421);
sum422 = _mm512_fmadd_ps(wt201, dat126, sum422);
sum423 = _mm512_fmadd_ps(wt201, dat127, sum423);
__m512 wt202 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i26+24*j41));
sum424 = _mm512_fmadd_ps(wt202, dat124, sum424);
sum425 = _mm512_fmadd_ps(wt202, dat125, sum425);
sum426 = _mm512_fmadd_ps(wt202, dat126, sum426);
sum427 = _mm512_fmadd_ps(wt202, dat127, sum427);
__m512 wt203 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i26+24*j41));
sum428 = _mm512_fmadd_ps(wt203, dat124, sum428);
sum429 = _mm512_fmadd_ps(wt203, dat125, sum429);
sum430 = _mm512_fmadd_ps(wt203, dat126, sum430);
sum431 = _mm512_fmadd_ps(wt203, dat127, sum431);
}
_mm512_storeu_ps(sumPtr1+-62400+1536*i26, _mm512_add_ps(sum408, _mm512_loadu_ps(sumPtr1+-62400+1536*i26)));
_mm512_storeu_ps(sumPtr1+-62336+1536*i26, _mm512_add_ps(sum409, _mm512_loadu_ps(sumPtr1+-62336+1536*i26)));
_mm512_storeu_ps(sumPtr1+-62272+1536*i26, _mm512_add_ps(sum410, _mm512_loadu_ps(sumPtr1+-62272+1536*i26)));
_mm512_storeu_ps(sumPtr1+0+1536*i26, _mm512_add_ps(sum411, _mm512_loadu_ps(sumPtr1+0+1536*i26)));
_mm512_storeu_ps(sumPtr1+-62144+1536*i26, _mm512_add_ps(sum412, _mm512_loadu_ps(sumPtr1+-62144+1536*i26)));
_mm512_storeu_ps(sumPtr1+-62080+1536*i26, _mm512_add_ps(sum413, _mm512_loadu_ps(sumPtr1+-62080+1536*i26)));
_mm512_storeu_ps(sumPtr1+-62016+1536*i26, _mm512_add_ps(sum414, _mm512_loadu_ps(sumPtr1+-62016+1536*i26)));
_mm512_storeu_ps(sumPtr1+256+1536*i26, _mm512_add_ps(sum415, _mm512_loadu_ps(sumPtr1+256+1536*i26)));
_mm512_storeu_ps(sumPtr1+-61888+1536*i26, _mm512_add_ps(sum416, _mm512_loadu_ps(sumPtr1+-61888+1536*i26)));
_mm512_storeu_ps(sumPtr1+-61824+1536*i26, _mm512_add_ps(sum417, _mm512_loadu_ps(sumPtr1+-61824+1536*i26)));
_mm512_storeu_ps(sumPtr1+-61760+1536*i26, _mm512_add_ps(sum418, _mm512_loadu_ps(sumPtr1+-61760+1536*i26)));
_mm512_storeu_ps(sumPtr1+512+1536*i26, _mm512_add_ps(sum419, _mm512_loadu_ps(sumPtr1+512+1536*i26)));
_mm512_storeu_ps(sumPtr1+-61632+1536*i26, _mm512_add_ps(sum420, _mm512_loadu_ps(sumPtr1+-61632+1536*i26)));
_mm512_storeu_ps(sumPtr1+-61568+1536*i26, _mm512_add_ps(sum421, _mm512_loadu_ps(sumPtr1+-61568+1536*i26)));
_mm512_storeu_ps(sumPtr1+-61504+1536*i26, _mm512_add_ps(sum422, _mm512_loadu_ps(sumPtr1+-61504+1536*i26)));
_mm512_storeu_ps(sumPtr1+768+1536*i26, _mm512_add_ps(sum423, _mm512_loadu_ps(sumPtr1+768+1536*i26)));
_mm512_storeu_ps(sumPtr1+-61376+1536*i26, _mm512_add_ps(sum424, _mm512_loadu_ps(sumPtr1+-61376+1536*i26)));
_mm512_storeu_ps(sumPtr1+-61312+1536*i26, _mm512_add_ps(sum425, _mm512_loadu_ps(sumPtr1+-61312+1536*i26)));
_mm512_storeu_ps(sumPtr1+-61248+1536*i26, _mm512_add_ps(sum426, _mm512_loadu_ps(sumPtr1+-61248+1536*i26)));
_mm512_storeu_ps(sumPtr1+1024+1536*i26, _mm512_add_ps(sum427, _mm512_loadu_ps(sumPtr1+1024+1536*i26)));
_mm512_storeu_ps(sumPtr1+-61120+1536*i26, _mm512_add_ps(sum428, _mm512_loadu_ps(sumPtr1+-61120+1536*i26)));
_mm512_storeu_ps(sumPtr1+-61056+1536*i26, _mm512_add_ps(sum429, _mm512_loadu_ps(sumPtr1+-61056+1536*i26)));
_mm512_storeu_ps(sumPtr1+-60992+1536*i26, _mm512_add_ps(sum430, _mm512_loadu_ps(sumPtr1+-60992+1536*i26)));
_mm512_storeu_ps(sumPtr1+1280+1536*i26, _mm512_add_ps(sum431, _mm512_loadu_ps(sumPtr1+1280+1536*i26)));
}
__m512 sum432 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum433 = sum432;
__m512 sum434 = sum432;
__m512 sum435 = sum432;
for (ptrdiff_t j42 = 0; j42 < 16; ++j42) {
__m512 dat128 = _mm512_loadu_ps(datPtr3+0+256*j42);
__m512 dat129 = _mm512_loadu_ps(datPtr3+64+256*j42);
__m512 dat130 = _mm512_loadu_ps(datPtr3+128+256*j42);
__m512 dat131 = _mm512_loadu_ps(datPtr3+192+256*j42);
__m512 wt204 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i26+4*j42));
sum432 = _mm512_fmadd_ps(wt204, dat128, sum432);
sum433 = _mm512_fmadd_ps(wt204, dat129, sum433);
sum434 = _mm512_fmadd_ps(wt204, dat130, sum434);
sum435 = _mm512_fmadd_ps(wt204, dat131, sum435);
}
_mm512_storeu_ps(sumPtr1+-62400+1536*i26, _mm512_add_ps(sum432, _mm512_loadu_ps(sumPtr1+-62400+1536*i26)));
_mm512_storeu_ps(sumPtr1+-62336+1536*i26, _mm512_add_ps(sum433, _mm512_loadu_ps(sumPtr1+-62336+1536*i26)));
_mm512_storeu_ps(sumPtr1+-62272+1536*i26, _mm512_add_ps(sum434, _mm512_loadu_ps(sumPtr1+-62272+1536*i26)));
_mm512_storeu_ps(sumPtr1+0+1536*i26, _mm512_add_ps(sum435, _mm512_loadu_ps(sumPtr1+0+1536*i26)));
return;
}
(void)base1;
ptrdiff_t i27 = 11*w3;
for (; i27 != 10; ++i27) {
__m512 sum436 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i27));
__m512 sum440 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i27));
__m512 sum444 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i27));
__m512 sum448 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i27));
__m512 sum452 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i27));
__m512 sum456 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i27));
__m512 sum437 = sum436;
__m512 sum438 = sum436;
__m512 sum439 = sum436;
__m512 sum441 = sum440;
__m512 sum442 = sum440;
__m512 sum443 = sum440;
__m512 sum445 = sum444;
__m512 sum446 = sum444;
__m512 sum447 = sum444;
__m512 sum449 = sum448;
__m512 sum450 = sum448;
__m512 sum451 = sum448;
__m512 sum453 = sum452;
__m512 sum454 = sum452;
__m512 sum455 = sum452;
__m512 sum457 = sum456;
__m512 sum458 = sum456;
__m512 sum459 = sum456;
for (ptrdiff_t j43 = 0; j43 < 16; ++j43) {
__m512 dat132 = _mm512_loadu_ps(datPtr3+0+256*j43);
__m512 dat133 = _mm512_loadu_ps(datPtr3+64+256*j43);
__m512 dat134 = _mm512_loadu_ps(datPtr3+128+256*j43);
__m512 dat135 = _mm512_loadu_ps(datPtr3+192+256*j43);
__m512 wt205 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i27+24*j43));
sum436 = _mm512_fmadd_ps(wt205, dat132, sum436);
sum437 = _mm512_fmadd_ps(wt205, dat133, sum437);
sum438 = _mm512_fmadd_ps(wt205, dat134, sum438);
sum439 = _mm512_fmadd_ps(wt205, dat135, sum439);
__m512 wt206 = _mm512_set1_ps(*(float*)(wtPtr2+4+384*i27+24*j43));
sum440 = _mm512_fmadd_ps(wt206, dat132, sum440);
sum441 = _mm512_fmadd_ps(wt206, dat133, sum441);
sum442 = _mm512_fmadd_ps(wt206, dat134, sum442);
sum443 = _mm512_fmadd_ps(wt206, dat135, sum443);
__m512 wt207 = _mm512_set1_ps(*(float*)(wtPtr2+8+384*i27+24*j43));
sum444 = _mm512_fmadd_ps(wt207, dat132, sum444);
sum445 = _mm512_fmadd_ps(wt207, dat133, sum445);
sum446 = _mm512_fmadd_ps(wt207, dat134, sum446);
sum447 = _mm512_fmadd_ps(wt207, dat135, sum447);
__m512 wt208 = _mm512_set1_ps(*(float*)(wtPtr2+12+384*i27+24*j43));
sum448 = _mm512_fmadd_ps(wt208, dat132, sum448);
sum449 = _mm512_fmadd_ps(wt208, dat133, sum449);
sum450 = _mm512_fmadd_ps(wt208, dat134, sum450);
sum451 = _mm512_fmadd_ps(wt208, dat135, sum451);
__m512 wt209 = _mm512_set1_ps(*(float*)(wtPtr2+16+384*i27+24*j43));
sum452 = _mm512_fmadd_ps(wt209, dat132, sum452);
sum453 = _mm512_fmadd_ps(wt209, dat133, sum453);
sum454 = _mm512_fmadd_ps(wt209, dat134, sum454);
sum455 = _mm512_fmadd_ps(wt209, dat135, sum455);
__m512 wt210 = _mm512_set1_ps(*(float*)(wtPtr2+20+384*i27+24*j43));
sum456 = _mm512_fmadd_ps(wt210, dat132, sum456);
sum457 = _mm512_fmadd_ps(wt210, dat133, sum457);
sum458 = _mm512_fmadd_ps(wt210, dat134, sum458);
sum459 = _mm512_fmadd_ps(wt210, dat135, sum459);
}
_mm512_storeu_ps(sumPtr1+-62400+1536*i27, sum436);
_mm512_storeu_ps(sumPtr1+-62336+1536*i27, sum437);
_mm512_storeu_ps(sumPtr1+-62272+1536*i27, sum438);
_mm512_storeu_ps(sumPtr1+0+1536*i27, sum439);
_mm512_storeu_ps(sumPtr1+-62144+1536*i27, sum440);
_mm512_storeu_ps(sumPtr1+-62080+1536*i27, sum441);
_mm512_storeu_ps(sumPtr1+-62016+1536*i27, sum442);
_mm512_storeu_ps(sumPtr1+256+1536*i27, sum443);
_mm512_storeu_ps(sumPtr1+-61888+1536*i27, sum444);
_mm512_storeu_ps(sumPtr1+-61824+1536*i27, sum445);
_mm512_storeu_ps(sumPtr1+-61760+1536*i27, sum446);
_mm512_storeu_ps(sumPtr1+512+1536*i27, sum447);
_mm512_storeu_ps(sumPtr1+-61632+1536*i27, sum448);
_mm512_storeu_ps(sumPtr1+-61568+1536*i27, sum449);
_mm512_storeu_ps(sumPtr1+-61504+1536*i27, sum450);
_mm512_storeu_ps(sumPtr1+768+1536*i27, sum451);
_mm512_storeu_ps(sumPtr1+-61376+1536*i27, sum452);
_mm512_storeu_ps(sumPtr1+-61312+1536*i27, sum453);
_mm512_storeu_ps(sumPtr1+-61248+1536*i27, sum454);
_mm512_storeu_ps(sumPtr1+1024+1536*i27, sum455);
_mm512_storeu_ps(sumPtr1+-61120+1536*i27, sum456);
_mm512_storeu_ps(sumPtr1+-61056+1536*i27, sum457);
_mm512_storeu_ps(sumPtr1+-60992+1536*i27, sum458);
_mm512_storeu_ps(sumPtr1+1280+1536*i27, sum459);
}
__m512 sum460 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i27));
__m512 sum461 = sum460;
__m512 sum462 = sum460;
__m512 sum463 = sum460;
for (ptrdiff_t j44 = 0; j44 < 16; ++j44) {
__m512 dat136 = _mm512_loadu_ps(datPtr3+0+256*j44);
__m512 dat137 = _mm512_loadu_ps(datPtr3+64+256*j44);
__m512 dat138 = _mm512_loadu_ps(datPtr3+128+256*j44);
__m512 dat139 = _mm512_loadu_ps(datPtr3+192+256*j44);
__m512 wt211 = _mm512_set1_ps(*(float*)(wtPtr2+0+384*i27+4*j44));
sum460 = _mm512_fmadd_ps(wt211, dat136, sum460);
sum461 = _mm512_fmadd_ps(wt211, dat137, sum461);
sum462 = _mm512_fmadd_ps(wt211, dat138, sum462);
sum463 = _mm512_fmadd_ps(wt211, dat139, sum463);
}
_mm512_storeu_ps(sumPtr1+-62400+1536*i27, sum460);
_mm512_storeu_ps(sumPtr1+-62336+1536*i27, sum461);
_mm512_storeu_ps(sumPtr1+-62272+1536*i27, sum462);
_mm512_storeu_ps(sumPtr1+0+1536*i27, sum463);
break;
}
}
}

// Drives the sum-producing callee across every epoch, field and node range.
//
// Each task receives a four-slot tuple through any1:
//   [0] tensors5, [1] epoch index, [2] field index, [3] first node of the range.
// For a field, the first node, the stride and the end bound are read from three
// consecutive entries of Example30LoomProduceSums1FieldTbl1 starting at
// 2*field. One task is run through Example30ThreaderDo1 per stride of nodes,
// with the stride also used as the second hull extent.
static void Example30LoomProduceSums1(Example30ThreaderTeam1* team16, char** tensors5) {
    void* args[4];
    args[0] = tensors5;
    ptrdiff_t epoch = 0;
    while (epoch < 1) {
        args[1] = (void*)epoch;
        ptrdiff_t field = 0;
        while (field < 1) {
            args[2] = (void*)field;
            const ptrdiff_t first = Example30LoomProduceSums1FieldTbl1[0+2*field];
            const ptrdiff_t stride = Example30LoomProduceSums1FieldTbl1[1+2*field];
            const ptrdiff_t limit = Example30LoomProduceSums1FieldTbl1[2+2*field];
            for (ptrdiff_t node = first; node < limit; node += stride) {
                args[3] = (void*)node;
                Example30ThreaderTask1 job;
                job.callee1 = Example30LoomProduceSums1Callee1;
                job.any1 = args;
                job.nd1 = 4;
                job.hull1[0] = 1;
                job.hull1[1] = stride;
                job.hull1[2] = 2;
                job.hull1[3] = 1;
                Example30ThreaderDo1(team16, &job);
            }
            ++field;
        }
        ++epoch;
    }
}

// Task callee for the consume stage.
//
// task10->any1 is an array of four buffers:
//   [0] partial sums, [1] data added after the clamp,
//   [2] float pairs {multiplier, addend} indexed by channel, [3] destination.
// pt10 is unused: every index derived from it is fixed at 0 here.
//
// For each of 61 channels and each of 4 rows (64 bytes apart in the sums,
// 48 bytes apart in the data/destination) the code:
//   1. loads four sum vectors spaced 15616 bytes apart,
//   2. rotates the 2nd/3rd/4th down by 1/2/3 lanes and adds all four,
//   3. clamps the result at zero,
//   4. adds the low 12 lanes of the matching data row,
//   5. applies acc*multiplier+addend,
//   6. stores the low 12 lanes to the destination row.
static void Example30LoomConsumeSums1Callee1(Example30ThreaderTask1* task10, int64_t* pt10) {
    (void)pt10;
    char** bufs = task10->any1;
    char*restrict sums = bufs[0];
    char*restrict extra = bufs[1];
    char*restrict bnPairs = bufs[2];
    char*restrict out = bufs[3];
    const ptrdiff_t cell = 0;
    const ptrdiff_t strip = 0;
    const ptrdiff_t chan = 0;
    const ptrdiff_t group = 0;
    const ptrdiff_t g = 1*group;
    const ptrdiff_t chanFirst = 61*chan;
    const ptrdiff_t chanLast = chanFirst+60;
    for (ptrdiff_t c = chanFirst; c <= chanLast; ++c) {
        const float* pair = (float*)bnPairs+(ptrdiff_t)2*(c+61*g);
        const __m512 scale = _mm512_set1_ps(pair[0]);
        const __m512 shift = _mm512_set1_ps(pair[1]);
        for (ptrdiff_t s = 1*strip; s != 1; ++s) {
            const ptrdiff_t e = 1*cell;
            const char* sumBase = sums+124928*g+62464*s+62464*e+256*c;
            const ptrdiff_t rowBase = 11712*g+192*c+192*s+64*e;
            for (ptrdiff_t row = 0; row < 4; ++row) {
                const char* src = sumBase+64*row;
                const __m512 part0 = _mm512_loadu_ps(src+0);
                const __m512i part1 = _mm512_castps_si512(_mm512_loadu_ps(src+15616));
                const __m512i part2 = _mm512_castps_si512(_mm512_loadu_ps(src+31232));
                const __m512i part3 = _mm512_castps_si512(_mm512_loadu_ps(src+46848));
                // alignr of a vector with itself rotates its lanes downward.
                const __m512 rot1 = _mm512_castsi512_ps(_mm512_alignr_epi32(part1, part1, 1));
                const __m512 rot2 = _mm512_castsi512_ps(_mm512_alignr_epi32(part2, part2, 2));
                const __m512 rot3 = _mm512_castsi512_ps(_mm512_alignr_epi32(part3, part3, 3));
                // Pairwise tree keeps the original floating-point summation order.
                const __m512 lo = _mm512_add_ps(part0, rot1);
                const __m512 hi = _mm512_add_ps(rot2, rot3);
                __m512 acc = _mm512_add_ps(lo, hi);
                acc = _mm512_max_ps(_mm512_setzero_ps(), acc);
                // Mask 4095 selects the low 12 lanes (48 bytes) of each row.
                acc = _mm512_add_ps(acc, _mm512_maskz_loadu_ps(4095, extra+48*row+rowBase));
                acc = _mm512_fmadd_ps(acc, scale, shift);
                _mm512_mask_storeu_ps(out+48*row+rowBase, 4095, acc);
            }
        }
    }
}

// Runs the consume-stage callee once through Example30ThreaderDo1.
//
// The task carries tensors7 as its any1 payload and declares four dimensions,
// each with a hull extent of 1.
static void Example30LoomConsumeSums1(Example30ThreaderTeam1* team17, char** tensors7) {
    Example30ThreaderTask1 job;
    job.callee1 = Example30LoomConsumeSums1Callee1;
    job.any1 = tensors7;
    job.nd1 = 4;
    for (ptrdiff_t axis = 0; axis < 4; ++axis) {
        job.hull1[axis] = 1;
    }
    Example30ThreaderDo1(team17, &job);
}

// Holds the buffer that Example30NetCreate fills from the user's parameters.
struct Example30Net {
// Pointer exactly as returned by malloc in Example30NetCreate; this is the
// one handed to free in Example30NetDestroy.
char* alloc1;
// alloc1 rounded up to a 64-byte boundary; Example30EngineInference uses it
// as the base address for the net's data.
char* align1;
};

// Releases a net obtained from Example30NetCreate: first the parameter
// buffer (alloc1), then the Example30Net object itself.
//
// A null net2 is accepted and ignored, mirroring free(NULL), so cleanup
// paths can call this unconditionally.
void Example30NetDestroy(Example30Net* net2) {
    if (!net2) {
        return;
    }
    free(net2->alloc1);
    free(net2);
}

// Builds an Example30Net from the caller's parameters.
//
// Returns 0 on success after storing the new net in *net1. On failure returns
// the message from Example30Errmsg1 or Example30ThreaderCreate1, releases
// whatever was allocated so far, and leaves *net1 untouched.
//
// Two buffers are used, each addressed from a 64-byte-aligned base:
//   kept buffer (63411 bytes, owned by the net):
//     +0    passed to Example30BnSimplify1 for bn1
//     +128  passed to Example30BnSimplify2 for bn4
//     +640  last tensor given to Example30LoomArrangeFilts1
//   scratch buffer (679 bytes, freed before returning):
//     +0    passed to Example30BnSimplify1 for bn2
//     +128  passed to Example30BnSimplify2 for bn3
// NOTE(review): these offsets look like destinations of the helpers - confirm
// against their definitions.
char* Example30NetCreate(
    Example30Net** net1,
    Example30Params* params1,
    ptrdiff_t threads1
) {
    if (__builtin_expect(!__builtin_cpu_supports("avx512f"), 0)) {
        return Example30Errmsg1(__LINE__, "CPU does not support AVX512F");
    }
    char* failure = 0;
    char* scratchRaw = 0;
    Example30ThreaderTeam1* crew = 0;
    Example30Net* made = 0;
    char* keptRaw = malloc(63411);
    if (__builtin_expect(!keptRaw, 0)) {
        return Example30Errmsg1(__LINE__, "errno %d", errno);
    }
    char* kept = (void*)(((size_t)keptRaw+63)&~(size_t)63);
    scratchRaw = malloc(679);
    if (__builtin_expect(!scratchRaw, 0)) {
        // Format the message before any free so errno is still intact.
        failure = Example30Errmsg1(__LINE__, "errno %d", errno);
        goto fail;
    }
    char* scratch = (void*)(((size_t)scratchRaw+63)&~(size_t)63);
    failure = Example30ThreaderCreate1(&crew, threads1);
    if (__builtin_expect(!!failure, 0)) {
        goto fail;
    }
    {
        Example30BnSimplify1(
            params1->bn1Means,
            params1->bn1Variances,
            params1->bn1Scales,
            params1->bn1Shifts,
            kept+0
        );
        Example30BnSimplify2(
            params1->bn4Means,
            params1->bn4Variances,
            params1->bn4Scales,
            params1->bn4Shifts,
            kept+128
        );
        Example30BnSimplify1(
            params1->bn2Means,
            params1->bn2Variances,
            params1->bn2Scales,
            params1->bn2Shifts,
            scratch+0
        );
        Example30BnSimplify2(
            params1->bn3Means,
            params1->bn3Variances,
            params1->bn3Scales,
            params1->bn3Shifts,
            scratch+128
        );
        char* filtTensors[] = {
            (char*)params1->convWeights,
            (char*)params1->convBiases,
            scratch+0,
            scratch+128,
            kept+640
        };
        Example30LoomArrangeFilts1(crew, filtTensors);
    }
    Example30ThreaderDestroy1(crew);
    free(scratchRaw);
    scratchRaw = 0;
    made = malloc(sizeof(Example30Net));
    if (__builtin_expect(!made, 0)) {
        failure = Example30Errmsg1(__LINE__, "errno %d", errno);
        goto fail;
    }
    made->alloc1 = keptRaw;
    made->align1 = kept;
    *net1 = made;
    return 0;
fail:
    // scratchRaw is null unless the scratch buffer is still live.
    free(scratchRaw);
    free(keptRaw);
    return failure;
}

// Per-engine state set up by Example30EngineCreate.
struct Example30Engine {
// Net passed to Example30EngineCreate; stored as-is and not freed by
// Example30EngineDestroy.
Example30Net* net3;
// Thread team made by Example30ThreaderCreate1 and torn down by
// Example30ThreaderDestroy1 in Example30EngineDestroy.
Example30ThreaderTeam1* team11;
// Pointer exactly as returned by malloc; freed in Example30EngineDestroy.
char* alloc2;
// alloc2 rounded up to a 64-byte boundary; base of the buffers used during
// Example30EngineInference.
char* align2;
};

// Forwards to Example30ThreaderPthreadT1 using the engine's thread team.
//
// eng2  engine whose team is queried
// idx2  index passed through unchanged
// to1   pthread_t location passed through unchanged
// Returns whatever Example30ThreaderPthreadT1 returns.
char* Example30EnginePthreadT(
    Example30Engine* eng2,
    ptrdiff_t idx2,
    pthread_t* to1
) {
    Example30ThreaderTeam1* crew = eng2->team11;
    char* result = Example30ThreaderPthreadT1(to1, crew, idx2);
    return result;
}

// Tears down an engine obtained from Example30EngineCreate: destroys its
// thread team, frees its working buffer (alloc2), then frees the engine.
// The net the engine points at is not touched.
//
// A null eng3 is accepted and ignored, mirroring free(NULL), so cleanup
// paths can call this unconditionally.
void Example30EngineDestroy(Example30Engine* eng3) {
    if (!eng3) {
        return;
    }
    Example30ThreaderDestroy1(eng3->team11);
    free(eng3->alloc2);
    free(eng3);
}

// Creates an engine bound to net4 with a thread team sized by threads2.
//
// Returns 0 on success after storing the engine in *eng4. On failure returns
// the message from Example30Errmsg1 or Example30ThreaderCreate1, frees the
// engine object and its buffer, and leaves *eng4 untouched.
//
// The engine owns a 133183-byte buffer; align2 is that buffer's address
// rounded up to a multiple of 64.
char* Example30EngineCreate(
    Example30Engine** eng4,
    Example30Net* net4,
    ptrdiff_t threads2
) {
    char* failure = 0;
    char* raw = 0;
    Example30Engine* made = malloc(sizeof(Example30Engine));
    if (__builtin_expect(!made, 0)) {
        return Example30Errmsg1(__LINE__, "errno %d", errno);
    }
    raw = malloc(133183);
    if (__builtin_expect(!raw, 0)) {
        // Format the message before any free so errno is still intact.
        failure = Example30Errmsg1(__LINE__, "errno %d", errno);
        goto fail;
    }
    made->alloc2 = raw;
    made->align2 = (void*)(((size_t)raw+63)&~(size_t)63);
    failure = Example30ThreaderCreate1(&made->team11, threads2);
    if (__builtin_expect(!!failure, 0)) {
        goto fail;
    }
    made->net3 = net4;
    *eng4 = made;
    return 0;
fail:
    free(made);
    free(raw);
    return failure;
}

// Runs one inference pass as three stages on the engine's thread team, in
// this order: Example30LoomArrangeDats1, Example30LoomProduceSums1,
// Example30LoomConsumeSums1.
//
// eng1     engine supplying the net data, the thread team and the work buffer
// bn4Data  last tensor of the consume stage (the graph's output tensor)
// in1Data  first tensor of the arrange stage
// in2Data  third tensor of the arrange stage
// in3Data  second tensor of the consume stage
//
// Work-buffer regions: +0 is shared by the arrange and produce stages,
// +8192 is shared by the produce and consume stages.
void Example30EngineInference(
    Example30Engine* eng1,
    float* bn4Data,
    float* in1Data,
    float* in2Data,
    float* in3Data
) {
    char* netBase = eng1->net3->align1;
    Example30ThreaderTeam1* crew = eng1->team11;
    char* work = eng1->align2;
    char* arranged = work+0;
    char* sums = work+8192;

    char* arrangeArgs[] = {
        (char*)in1Data,
        netBase+0,
        (char*)in2Data,
        arranged
    };
    char* produceArgs[] = {
        netBase+640,
        arranged,
        sums
    };
    char* consumeArgs[] = {
        sums,
        (char*)in3Data,
        netBase+128,
        (char*)bn4Data
    };

    Example30LoomArrangeDats1(crew, arrangeArgs);
    Example30LoomProduceSums1(crew, produceArgs);
    Example30LoomConsumeSums1(crew, consumeArgs);
}

// End of file.

Top