NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=Example1 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=in Channels=2170 Height=53 Width=63
Conv FromTensor=in ToTensor=out ToChannels=1275 FilterH=1 FilterW=7 StrideH=1 StrideW=4 PaddingH=0 PaddingW=3 DilationH=1 DilationW=1 Groups=5
Output FromTensor=out

Top || Output Example1.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(Example1Params);
// Example1Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "rb"); // Binary mode: raw floats.
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

typedef struct Example1Params Example1Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// Example1Params* params = malloc(sizeof(Example1Params));
//
// ... Load params (read from a file, perhaps) ...
//
// Example1Net* net; // For example, 4 threads:
// char* err = Example1NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// Example1NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

typedef struct Example1Net Example1Net;

// Builds a Net from the trained parameters using the given number of
// temporary threads. Returns 0 on success. On failure returns an error
// string that the caller must free(), and leaves the Net pointer
// unmodified (see the usage example above).
char* Example1NetCreate(
Example1Net**,
Example1Params*,
ptrdiff_t threads
);

// Frees a Net. Call only after all inference that depends on it is done.
void Example1NetDestroy(Example1Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// Example1Net* net;
//
// ... Create net ...
//
// Example1Engine* engine; // For example, 4 inference threads:
// char* err = Example1EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// Example1EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = Example1EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* inData = malloc(sizeof(float)*2170*53*63);
// float* outData = malloc(sizeof(float)*1275*53*16);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// Example1EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// inData, // The tensor arguments are sorted by name.
// outData
// );
//
// ... Read the output floats ...
//
// }
//
// free(inData);
// free(outData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

typedef struct Example1Engine Example1Engine;

// Creates an Engine with the given number of inference threads, bound
// to an existing Net. Returns 0 on success. On failure returns an error
// string that the caller must free(), and leaves the Engine pointer
// unmodified (see the usage example above).
char* Example1EngineCreate(
Example1Engine**,
Example1Net*,
ptrdiff_t threads
);

// Writes the pthread_t of the inference thread with index threadIdx
// (0 .. N-1) to *to. Returns 0 on success; per the usage notes above it
// can only fail if the thread index is invalid, in which case an error
// string is returned.
char* Example1EnginePthreadT(
Example1Engine*,
ptrdiff_t threadIdx,
pthread_t* to
);

// Runs one inference pass. inData and outData are caller-owned, fully
// packed CHW float tensors; the usage example above sizes them as
// 2170*53*63 and 1275*53*16 floats respectively. Tensor parameters are
// ordered by name. Documented above as unable to fail.
void Example1EngineInference(
Example1Engine*,
float* inData,
float* outData
);

// Terminates the Engine's threads and frees its memory. The Net that
// the Engine points at is destroyed separately.
void Example1EngineDestroy(Example1Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

// Trained parameters for the single Conv layer writing tensor "out".
// The struct is declared packed, so the two arrays sit back to back and
// the struct can be filled by one raw read of concatenated floats.
struct Example1Params {
float outBiases[1275]; // 1x1275x1x1
float outWeights[3873450]; // 1275x434x1x7
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output Example1.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f Example1.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "Example1.h"

// Builds a heap-allocated diagnostic of the form
// "Example1: line <lineNum1>: <formatted text>".
//
// lineNum1: source line to report (callers pass __LINE__).
// format1:  printf-style format for the remainder of the message.
//
// Returns a malloc'd, NUL-terminated string owned by the caller, who
// releases it with free(). The formatted part is truncated so that the
// whole message fits in the fixed-size buffer.
static char* Example1Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
    enum { capacity = 276 };
    char* text = malloc(capacity);
    // The prefix is short and bounded, so it always fits.
    int used = sprintf(text, "Example1: line %td: ", lineNum1);
    va_list args;
    va_start(args, format1);
    vsnprintf(text+used, capacity-used, format1, args);
    va_end(args);
    return text;
}

// Forward declarations for the thread-team ("Threader") types, plus the
// signature of the per-point work function: it receives the task being
// run and the coordinates of one point of the task's iteration space.
typedef struct Example1ThreaderTask1 Example1ThreaderTask1;
typedef void (*Example1ThreaderCallee1)(Example1ThreaderTask1*, int64_t*);
typedef struct Example1ThreaderHub1 Example1ThreaderHub1;
typedef struct Example1ThreaderNode1 Example1ThreaderNode1;
typedef struct Example1ThreaderUnwind1 Example1ThreaderUnwind1;
typedef struct Example1ThreaderTeam1 Example1ThreaderTeam1;

// One unit of parallel work: a function applied to every point of a
// small multi-dimensional index space.
struct Example1ThreaderTask1 {
// Function invoked once per point.
Example1ThreaderCallee1 callee1;
// Opaque pointer for the callee's own use; the threader never reads it.
void* any1;
// Number of leading entries of hull1 that are in use.
ptrdiff_t nd1;
// Extent of each dimension; the product of the used extents is the
// total number of points.
int64_t hull1[4];
};

// State shared by all threads of a team while a task is running.
struct Example1ThreaderHub1 {
// Guards every other field of this struct.
pthread_mutex_t mut1;
// Signalled by the worker that brings pending1 down to zero.
pthread_cond_t cond1;
// Number of workers that have not yet finished the current task.
ptrdiff_t pending1;
// Word index into status1 at which the next scan for a busy node starts.
ptrdiff_t offset1;
// Bit mask applied to status1[offset1] at the start of the next scan.
long mask1;
// Bitmap with one bit per node. The bit at index nt (the thread count)
// is set along with the others when a task is posted and is never
// cleared by the workers; scans use it as an end marker.
long status1[];
};

// Per-thread state. Aligned to 64 bytes.
struct Example1ThreaderNode1 {
// Guards np1, pt1 and task1.
pthread_mutex_t mut2;
// Points still to be run from this node's share; a negative value
// tells the thread to exit.
int64_t np1;
// Coordinates of the next point to hand out.
int64_t pt1[4];
// Task posted to this node, or 0 when nothing is waiting.
Example1ThreaderTask1* task1;
// Signalled after task1 has been set.
pthread_cond_t cond2;
// Back pointer to the owning team.
Example1ThreaderTeam1* team1;
// Identifier of the thread running this node.
pthread_t thr1;
} __attribute__((aligned(64)));

// Records how far team construction got, so that teardown releases
// exactly the resources that were actually created.
struct Example1ThreaderUnwind1 {
// Number of threads to stop and join.
ptrdiff_t join1;
// Number of node condition variables to destroy.
ptrdiff_t nodeConds1;
// Number of node mutexes to destroy.
ptrdiff_t nodeMuts1;
// Nonzero once the hub condition variable has been initialised.
ptrdiff_t hubCond1;
// Nonzero once the hub mutex has been initialised.
ptrdiff_t hubMut1;
// Unaligned pointer returned by malloc for the node array.
void* nodes1;
// Unaligned pointer returned by malloc for the hub.
void* hub1;
};

// A group of worker threads together with their shared hub.
struct Example1ThreaderTeam1 {
// Number of threads (and nodes) in the team.
ptrdiff_t nt1;
// 64-byte-aligned hub, carved out of unwind1.hub1.
Example1ThreaderHub1* hub2;
// 64-byte-aligned node array, carved out of unwind1.nodes1.
Example1ThreaderNode1* nodes2;
// Bookkeeping used when tearing the team down.
Example1ThreaderUnwind1 unwind1;
};

// Advances a mixed-radix counter by one.
//
// nd2:   number of digits in use.
// hull2: radix (extent) of each digit.
// pt2:   counter to advance in place; digit 0 is the fastest moving.
//
// A digit that reaches its extent is reset to 0 and the increment is
// carried into the next digit. When every digit wraps, the counter ends
// up as all zeros.
static void Example1ThreaderInc1(
    ptrdiff_t nd2,
    int64_t*restrict hull2,
    int64_t*restrict pt2
) {
    ptrdiff_t dim = 0;
    while (dim < nd2) {
        int64_t next = pt2[dim]+1;
        if (next != hull2[dim]) {
            // No wrap in this digit: store it and stop carrying.
            pt2[dim] = next;
            return;
        }
        pt2[dim++] = 0;
    }
}

// Converts a linear index into mixed-radix coordinates.
//
// nd3:   number of digits to write.
// hull3: radix (extent) of each digit.
// pt3:   output coordinates; digit 0 is the fastest moving.
// val1:  linear index to decompose.
//
// Digits beyond the point where the remaining value becomes zero are
// filled with 0. Any value left over after the last digit is dropped.
static void Example1ThreaderPut1(
    ptrdiff_t nd3,
    int64_t*restrict hull3,
    int64_t*restrict pt3,
    int64_t val1
) {
    ptrdiff_t dim = 0;
    while (dim < nd3 && val1 != 0) {
        int64_t extent = hull3[dim];
        int64_t quotient = val1/extent;
        pt3[dim] = val1-quotient*extent;
        val1 = quotient;
        ++dim;
    }
    while (dim < nd3) {
        pt3[dim] = 0;
        ++dim;
    }
}

// Adds one mixed-radix number to another, in place.
//
// nd4:    number of digits in use.
// hull4:  radix (extent) of each digit.
// pt4:    accumulator, updated in place; digit 0 is the fastest moving.
// plus1:  addend, same layout as pt4.
// carry2: carry fed into digit 0.
//
// Each digit sum is reduced by at most one extent, and the carry out of
// the last digit is discarded.
static void Example1ThreaderAdd1(
    ptrdiff_t nd4,
    int64_t*restrict hull4,
    int64_t*restrict pt4,
    int64_t*restrict plus1,
    int64_t carry2
) {
    for (ptrdiff_t dim = 0; dim < nd4; ++dim) {
        int64_t extent = hull4[dim];
        int64_t total = pt4[dim]+plus1[dim]+carry2;
        int64_t wrapped = total >= extent;
        pt4[dim] = wrapped ? total-extent : total;
        carry2 = wrapped;
    }
}

// Thread entry point for one node of a team. arg1 is the thread's own
// Example1ThreaderNode1. The thread sleeps until a task is posted to
// its node, runs the points assigned to that node, then runs points
// still left on other nodes, and finally reports completion to the hub.
// Returns 0 when the node is told to exit (task posted with np1 < 0).
//
// Every pthread call is wrapped in a loop that retries until the call
// returns 0.
static void* Example1ThreaderMain1(void* arg1) {
Example1ThreaderNode1* node1 = arg1;
Example1ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
Example1ThreaderHub1* hub3 = team2->hub2;
Example1ThreaderNode1* nodes3 = team2->nodes2;
// Index of this node within the team; also its bit index in the hub's
// status bitmap.
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
// The node mutex is held at the top of every iteration of this loop.
for (; ; ) {
Example1ThreaderTask1* task2 = node1->task1;
if (!task2) {
// Nothing posted yet: wait for a signal, then look again.
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
// A negative point count is the shutdown request.
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
// Consume the posted task so it is not picked up twice.
node1->task1 = 0;
Example1ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Run this node's own points. Each point is claimed under the node
// mutex and the mutex is dropped while the callee runs, so other
// threads can claim points from this node in the meantime.
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
Example1ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// This node has no points left: clear its bit in the status bitmap.
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
// Look for other nodes whose status bit is still set and run their
// remaining points. The hub mutex is held while scanning.
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
// No candidate in this word: move to the next one, unmasked.
++offset2;
mask2 = -1;
continue;
}
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
// Bit nt2 is the end marker. On the first hit restart the scan
// from word 0; on a second hit with nothing found in between,
// stop scanning.
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
// Isolate the lowest set bit, which belongs to the chosen node.
hand1 &= -hand1;
// Publish the scan position with the chosen bit removed from the
// mask, for the next thread that scans.
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
Example1ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
// Run the chosen node's remaining points, using the same
// claim/unlock/call/relock pattern as for this node's own points.
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
Example1ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// The chosen node is out of points: clear its status bit and resume
// scanning from the shared position.
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// Report completion; the thread that brings the count to zero signals
// the hub condition variable.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
// Re-take the node mutex before waiting for the next task.
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Tears down a team, including one that was only partly constructed.
// The counters in team3->unwind1 say how many threads, condition
// variables and mutexes exist, and only those are released. A null
// team3 is accepted and ignored. Every pthread call is retried until it
// returns 0.
static void Example1ThreaderDestroy1(Example1ThreaderTeam1* team3) {
    if (team3 == 0) return;
    Example1ThreaderNode1* const base = team3->nodes2;

    // Ask every running thread to exit: a negative point count together
    // with a non-null task pointer makes the thread's main loop return.
    const ptrdiff_t started = team3->unwind1.join1;
    for (ptrdiff_t i = 0; i != started; ++i) {
        Example1ThreaderNode1* cur = base+i;
        while (__builtin_expect(pthread_mutex_lock(&cur->mut2), 0)) {}
        cur->np1 = -1;
        cur->task1 = (Example1ThreaderTask1*)1;
        while (__builtin_expect(pthread_mutex_unlock(&cur->mut2), 0)) {}
        while (__builtin_expect(pthread_cond_signal(&cur->cond2), 0)) {}
    }
    for (ptrdiff_t i = 0; i != started; ++i) {
        while (__builtin_expect(pthread_join(base[i].thr1, 0), 0)) {}
    }

    // Release the per-node synchronisation objects.
    const ptrdiff_t conds = team3->unwind1.nodeConds1;
    for (ptrdiff_t i = 0; i != conds; ++i) {
        while (__builtin_expect(pthread_cond_destroy(&base[i].cond2), 0)) {}
    }
    const ptrdiff_t muts = team3->unwind1.nodeMuts1;
    for (ptrdiff_t i = 0; i != muts; ++i) {
        while (__builtin_expect(pthread_mutex_destroy(&base[i].mut2), 0)) {}
    }

    // Release the hub's synchronisation objects, if they were created.
    Example1ThreaderHub1* hub = team3->hub2;
    if (team3->unwind1.hubCond1) {
        while (__builtin_expect(pthread_cond_destroy(&hub->cond1), 0)) {}
    }
    if (team3->unwind1.hubMut1) {
        while (__builtin_expect(pthread_mutex_destroy(&hub->mut1), 0)) {}
    }

    // Free the raw allocations, then the team itself.
    free(team3->unwind1.nodes1);
    free(team3->unwind1.hub1);
    free(team3);
}

// Final construction step: initialises each node's mutex and condition
// variable and starts its thread, for nt7 nodes.
//
// Returns 0 on success. On failure returns a malloc'd error string and
// records in team8->unwind1 how many mutexes, condition variables and
// threads were created, so that teardown releases exactly those.
static char* Example1ThreaderCreate1Up4(Example1ThreaderTeam1* team8, ptrdiff_t nt7) {
    Example1ThreaderNode1* const base = team8->nodes2;
    Example1ThreaderUnwind1* const undo = &team8->unwind1;
    for (ptrdiff_t made = 0; made != nt7; ++made) {
        Example1ThreaderNode1* cur = base+made;
        int rc = pthread_mutex_init(&cur->mut2, 0);
        if (__builtin_expect(rc, 0)) {
            char* msg = Example1Errmsg1(__LINE__, "errno %d", rc);
            undo->nodeMuts1 = made;
            undo->nodeConds1 = made;
            undo->join1 = made;
            return msg;
        }
        // The thread must see "no task" when it first takes the mutex.
        cur->task1 = 0;
        rc = pthread_cond_init(&cur->cond2, 0);
        if (__builtin_expect(rc, 0)) {
            char* msg = Example1Errmsg1(__LINE__, "errno %d", rc);
            undo->nodeMuts1 = made+1;
            undo->nodeConds1 = made;
            undo->join1 = made;
            return msg;
        }
        // The thread reads the team pointer as soon as it starts.
        cur->team1 = team8;
        rc = pthread_create(&cur->thr1, 0, Example1ThreaderMain1, cur);
        if (__builtin_expect(rc, 0)) {
            char* msg = Example1Errmsg1(__LINE__, "errno %d", rc);
            undo->nodeMuts1 = made+1;
            undo->nodeConds1 = made+1;
            undo->join1 = made;
            return msg;
        }
    }
    undo->nodeMuts1 = nt7;
    undo->nodeConds1 = nt7;
    undo->join1 = nt7;
    return 0;
}

// Construction step 3: initialises the hub's mutex and condition
// variable, flagging each one in team7->unwind1 once it exists, then
// continues with per-node setup.
//
// Returns 0 on success or a malloc'd error string on failure.
static char* Example1ThreaderCreate1Up3(Example1ThreaderTeam1* team7, ptrdiff_t nt6) {
    Example1ThreaderHub1* hub = team7->hub2;
    int rc = pthread_mutex_init(&hub->mut1, 0);
    if (__builtin_expect(rc, 0)) {
        return Example1Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubMut1 = 1;
    rc = pthread_cond_init(&hub->cond1, 0);
    if (__builtin_expect(rc, 0)) {
        return Example1Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubCond1 = 1;
    return Example1ThreaderCreate1Up4(team7, nt6);
}

// Construction step 2: allocates the node array for nt5 threads with
// 63 spare bytes, and stores both the raw pointer (for free) and the
// pointer rounded up to a 64-byte boundary (for use).
//
// Returns 0 on success or a malloc'd error string on failure,
// including when the array size would overflow size_t.
static char* Example1ThreaderCreate1Up2(Example1ThreaderTeam1* team6, ptrdiff_t nt5) {
    const size_t nodeBytes = sizeof(Example1ThreaderNode1);
    size_t total = nt5*nodeBytes;
    // Dividing back detects wrap-around in the multiplication.
    if (__builtin_expect(total/nodeBytes != (size_t)nt5, 0)) {
        return Example1Errmsg1(__LINE__, "too many threads");
    }
    void* raw = malloc(total+63);
    if (__builtin_expect(!raw, 0)) {
        return Example1Errmsg1(__LINE__, "errno %d", errno);
    }
    team6->unwind1.nodes1 = raw;
    team6->nodes2 = (void*)(((size_t)raw+63)&~(size_t)63);
    return Example1ThreaderCreate1Up3(team6, nt5);
}

// Construction step 1: records the thread count and allocates the hub.
// The hub's trailing status bitmap gets nt4/(bits per long)+1 words.
// The size is rounded up to a multiple of 64 and 63 spare bytes are
// added so the hub can start on a 64-byte boundary.
//
// Returns 0 on success or a malloc'd error string on failure.
static char* Example1ThreaderCreate1Up1(Example1ThreaderTeam1* team5, ptrdiff_t nt4) {
    team5->nt1 = nt4;
    const size_t bitsPerWord = sizeof(long)*8;
    const size_t words = (size_t)nt4/bitsPerWord+1;
    size_t bytes = sizeof(Example1ThreaderHub1)+sizeof(long)*words;
    bytes = (bytes+63)&~(size_t)63;
    void* raw = malloc(bytes+63);
    if (__builtin_expect(!raw, 0)) {
        return Example1Errmsg1(__LINE__, "errno %d", errno);
    }
    team5->unwind1.hub1 = raw;
    team5->hub2 = (void*)(((size_t)raw+63)&~(size_t)63);
    return Example1ThreaderCreate1Up2(team5, nt4);
}

// Creates a team of nt3 worker threads.
//
// team4: receives the new team on success; left untouched on failure.
// nt3:   number of threads; must be at least 1.
//
// Returns 0 on success. On failure returns a malloc'd error string
// (owned by the caller) after releasing whatever had been constructed.
static char* Example1ThreaderCreate1(Example1ThreaderTeam1** team4, ptrdiff_t nt3) {
    if (__builtin_expect(nt3 < 1, 0)) {
        return Example1Errmsg1(__LINE__, "too few threads");
    }
    // Zero-filled so that teardown of a half-built team sees zero
    // counters and null pointers for everything not yet created.
    Example1ThreaderTeam1* fresh = calloc(1, sizeof(Example1ThreaderTeam1));
    if (__builtin_expect(!fresh, 0)) {
        return Example1Errmsg1(__LINE__, "errno %d", errno);
    }
    char* failure = Example1ThreaderCreate1Up1(fresh, nt3);
    if (__builtin_expect(!!failure, 0)) {
        Example1ThreaderDestroy1(fresh);
        return failure;
    }
    *team4 = fresh;
    return failure;
}

// Looks up the pthread_t of one team member.
//
// thr2:  receives the thread identifier on success.
// team9: team to query.
// idx1:  thread index, valid in the range 0 .. nt1-1.
//
// Returns 0 on success, or a malloc'd error string if idx1 is out of
// range (in which case *thr2 is not written).
static char* Example1ThreaderPthreadT1(
    pthread_t* thr2,
    Example1ThreaderTeam1* team9,
    ptrdiff_t idx1
) {
    int inRange = idx1 >= 0 && idx1 < team9->nt1;
    if (__builtin_expect(!inRange, 0)) {
        return Example1Errmsg1(__LINE__, "bad thread idx");
    }
    Example1ThreaderNode1* member = team9->nodes2+idx1;
    *thr2 = member->thr1;
    return 0;
}

// Runs a task on the team and blocks until every worker has finished.
//
// The task's points (the product of its used hull extents) are split
// into one contiguous share per thread. The first (total % threads)
// threads get one extra point. Each node is given its starting
// coordinates and point count and is then signalled. A task with fewer
// than one dimension is ignored. Every pthread call is retried until it
// returns 0.
static void Example1ThreaderDo1(Example1ThreaderTeam1* team10, Example1ThreaderTask1* task3) {
    const ptrdiff_t dims = task3->nd1;
    if (dims < 1) return;
    int64_t total = task3->hull1[0];
    for (ptrdiff_t d = 1; d < dims; ++d) {
        total *= task3->hull1[d];
    }
    const ptrdiff_t workers = team10->nt1;
    const int64_t share = total/workers;
    const ptrdiff_t extras = total%workers;
    // Coordinates equivalent to a step of `share` points.
    int64_t stride[4];
    Example1ThreaderPut1(dims, task3->hull1, stride, share);
    int64_t start[4] = {0};
    Example1ThreaderHub1* hub = team10->hub2;
    // The hub mutex is held while posting, so no worker can update the
    // hub before its state has been reset below.
    while (__builtin_expect(pthread_mutex_lock(&hub->mut1), 0)) {}
    Example1ThreaderNode1* node = team10->nodes2;
    ptrdiff_t posted = 0;
    for (;;) {
        while (__builtin_expect(pthread_mutex_lock(&node->mut2), 0)) {}
        int64_t bonus = posted < extras;
        node->np1 = share+bonus;
        memcpy(node->pt1, start, sizeof(start));
        node->task1 = task3;
        while (__builtin_expect(pthread_mutex_unlock(&node->mut2), 0)) {}
        while (__builtin_expect(pthread_cond_signal(&node->cond2), 0)) {}
        if (++posted == workers) break;
        // Move the start past the share just handed out.
        Example1ThreaderAdd1(dims, task3->hull1, start, stride, bonus);
        ++node;
    }
    // Reset the scan cursor and set every bit of the status bitmap,
    // including the end-marker bit at index `workers`.
    hub->offset1 = 0;
    hub->mask1 = -1;
    for (ptrdiff_t word = (size_t)workers/(sizeof(long)*8); word >= 0; --word) {
        hub->status1[word] = -1;
    }
    // Wait until every worker has reported completion.
    hub->pending1 = workers;
    while (hub->pending1) {
        while (__builtin_expect(pthread_cond_wait(&hub->cond1, &hub->mut1), 0)) {}
    }
    while (__builtin_expect(pthread_mutex_unlock(&hub->mut1), 0)) {}
}

static __m512 Example1Exp1(__m512 x1) {
x1 = _mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f));
x1 = _mm512_min_ps(x1, _mm512_set1_ps(8.872284e+01f));
__m512 t1 = _mm512_mul_ps(x1, _mm512_set1_ps(1.442695e+00f));
__m512 r1 = _mm512_roundscale_ps(t1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m512 f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-6.9314575e-01f), x1);
f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-1.4286068e-06f), f1);
__m512 g1 = _mm512_set1_ps(4.194439e-02f);
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(1.6800667e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(4.9999994e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.999569e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.9999964e-01f));
__m512i y1 = _mm512_slli_epi32(_mm512_cvtps_epi32(t1), 23);
return _mm512_castsi512_ps(_mm512_add_epi32(y1, _mm512_castps_si512(g1)));
}

static __m512 Example1Rsqrt1(__m512 x2) {
__m512 y2 = _mm512_rsqrt14_ps(x2);
__m512 z1 = _mm512_mul_ps(x2, y2);
__m512 a1 = _mm512_mul_ps(y2, _mm512_set1_ps(5e-01f));
__m512 b1 = _mm512_fnmadd_ps(y2, z1, _mm512_set1_ps(3e+00f));
return _mm512_mul_ps(a1, b1);
}

static void Example1LoomArrangeFilts1Callee1(Example1ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
ptrdiff_t b2 = pt7[0];
ptrdiff_t g2 = pt7[1];
ptrdiff_t e1 = 0;
char*restrict arrangedB1 = tensors2[2]+5100*e1;
char*restrict arrangedW1 = tensors2[2]+5100+29809500*e1;
char*restrict wtPtr1 = tensors2[0]+23380*e1;
char*restrict biasPtr1 = tensors2[1];
ptrdiff_t i5 = 1*g2;
ptrdiff_t j1 = 1*b2;
ptrdiff_t jj1 = j1+0;
if (j1 < 15) {
for (; j1 != 15; ++j1) {
__m512 bias1 = _mm512_setzero_ps();
if (!e1) {
bias1 = _mm512_maskz_loadu_ps(65535, biasPtr1-0+1020*i5+64*j1);
}
_mm512_mask_storeu_ps(arrangedB1-0+1020*i5+64*j1, 65535, bias1);
ptrdiff_t c1 = (size_t)(0+16*j1)/6;
switch ((size_t)(0+16*j1)%6) {
case 0: {
ptrdiff_t k1 = 0;
for (; k1 != 217; ++k1) {
__m512 wt1 = _mm512_maskz_loadu_ps(16383, wtPtr1+0+3098760*i5+194432*j1+56*k1);
__m512 wt2 = _mm512_maskz_loadu_ps(16383, wtPtr1+12152+3098760*i5+194432*j1+56*k1);
__m512 wt3 = _mm512_maskz_loadu_ps(16383, wtPtr1+24304+3098760*i5+194432*j1+56*k1);
__m512 wt4 = _mm512_maskz_loadu_ps(16383, wtPtr1+36456+3098760*i5+194432*j1+56*k1);
__m512 wt5 = _mm512_maskz_loadu_ps(16383, wtPtr1+48608+3098760*i5+194432*j1+56*k1);
__m512 wt6 = _mm512_maskz_loadu_ps(16383, wtPtr1+60760+3098760*i5+194432*j1+56*k1);
__m512 wt7 = _mm512_maskz_loadu_ps(16383, wtPtr1+72912+3098760*i5+194432*j1+56*k1);
__m512 wt8 = _mm512_maskz_loadu_ps(16383, wtPtr1+85064+3098760*i5+194432*j1+56*k1);
__m512 wt9 = _mm512_maskz_loadu_ps(16383, wtPtr1+97216+3098760*i5+194432*j1+56*k1);
__m512 wt10 = _mm512_maskz_loadu_ps(16383, wtPtr1+109368+3098760*i5+194432*j1+56*k1);
__m512 wt11 = _mm512_maskz_loadu_ps(16383, wtPtr1+121520+3098760*i5+194432*j1+56*k1);
__m512 wt12 = _mm512_maskz_loadu_ps(16383, wtPtr1+133672+3098760*i5+194432*j1+56*k1);
__m512 wt13 = _mm512_maskz_loadu_ps(16383, wtPtr1+145824+3098760*i5+194432*j1+56*k1);
__m512 wt14 = _mm512_maskz_loadu_ps(16383, wtPtr1+157976+3098760*i5+194432*j1+56*k1);
__m512 wt15 = _mm512_maskz_loadu_ps(16383, wtPtr1+170128+3098760*i5+194432*j1+56*k1);
__m512 wt16 = _mm512_maskz_loadu_ps(16383, wtPtr1+182280+3098760*i5+194432*j1+56*k1);
__m512 tmp1 = _mm512_unpacklo_ps(wt1, wt2);
__m512 tmp2 = _mm512_unpackhi_ps(wt1, wt2);
__m512 tmp3 = _mm512_unpacklo_ps(wt3, wt4);
__m512 tmp4 = _mm512_unpackhi_ps(wt3, wt4);
__m512 tmp5 = _mm512_unpacklo_ps(wt5, wt6);
__m512 tmp6 = _mm512_unpackhi_ps(wt5, wt6);
__m512 tmp7 = _mm512_unpacklo_ps(wt7, wt8);
__m512 tmp8 = _mm512_unpackhi_ps(wt7, wt8);
__m512 tmp9 = _mm512_unpacklo_ps(wt9, wt10);
__m512 tmp10 = _mm512_unpackhi_ps(wt9, wt10);
__m512 tmp11 = _mm512_unpacklo_ps(wt11, wt12);
__m512 tmp12 = _mm512_unpackhi_ps(wt11, wt12);
__m512 tmp13 = _mm512_unpacklo_ps(wt13, wt14);
__m512 tmp14 = _mm512_unpackhi_ps(wt13, wt14);
__m512 tmp15 = _mm512_unpacklo_ps(wt15, wt16);
__m512 tmp16 = _mm512_unpackhi_ps(wt15, wt16);
__m512 tmp17 = _mm512_shuffle_ps(tmp1, tmp3, 68);
__m512 tmp18 = _mm512_shuffle_ps(tmp1, tmp3, 238);
__m512 tmp19 = _mm512_shuffle_ps(tmp2, tmp4, 68);
__m512 tmp20 = _mm512_shuffle_ps(tmp2, tmp4, 238);
__m512 tmp21 = _mm512_shuffle_ps(tmp5, tmp7, 68);
__m512 tmp22 = _mm512_shuffle_ps(tmp5, tmp7, 238);
__m512 tmp23 = _mm512_shuffle_ps(tmp6, tmp8, 68);
__m512 tmp24 = _mm512_shuffle_ps(tmp6, tmp8, 238);
__m512 tmp25 = _mm512_shuffle_ps(tmp9, tmp11, 68);
__m512 tmp26 = _mm512_shuffle_ps(tmp9, tmp11, 238);
__m512 tmp27 = _mm512_shuffle_ps(tmp10, tmp12, 68);
__m512 tmp28 = _mm512_shuffle_ps(tmp10, tmp12, 238);
__m512 tmp29 = _mm512_shuffle_ps(tmp13, tmp15, 68);
__m512 tmp30 = _mm512_shuffle_ps(tmp13, tmp15, 238);
__m512 tmp31 = _mm512_shuffle_ps(tmp14, tmp16, 68);
__m512 tmp32 = _mm512_shuffle_ps(tmp14, tmp16, 238);
__m512 tmp33 = _mm512_shuffle_f32x4(tmp17, tmp21, 136);
__m512 tmp34 = _mm512_shuffle_f32x4(tmp17, tmp21, 221);
__m512 tmp35 = _mm512_shuffle_f32x4(tmp18, tmp22, 136);
__m512 tmp36 = _mm512_shuffle_f32x4(tmp18, tmp22, 221);
__m512 tmp37 = _mm512_shuffle_f32x4(tmp19, tmp23, 136);
__m512 tmp38 = _mm512_shuffle_f32x4(tmp19, tmp23, 221);
__m512 tmp39 = _mm512_shuffle_f32x4(tmp20, tmp24, 136);
__m512 tmp40 = _mm512_shuffle_f32x4(tmp20, tmp24, 221);
__m512 tmp41 = _mm512_shuffle_f32x4(tmp25, tmp29, 136);
__m512 tmp42 = _mm512_shuffle_f32x4(tmp25, tmp29, 221);
__m512 tmp43 = _mm512_shuffle_f32x4(tmp26, tmp30, 136);
__m512 tmp44 = _mm512_shuffle_f32x4(tmp26, tmp30, 221);
__m512 tmp45 = _mm512_shuffle_f32x4(tmp27, tmp31, 136);
__m512 tmp46 = _mm512_shuffle_f32x4(tmp27, tmp31, 221);
__m512 tmp47 = _mm512_shuffle_f32x4(tmp28, tmp32, 136);
__m512 tmp48 = _mm512_shuffle_f32x4(tmp28, tmp32, 221);
wt1 = _mm512_shuffle_f32x4(tmp33, tmp41, 136);
wt9 = _mm512_shuffle_f32x4(tmp33, tmp41, 221);
wt2 = _mm512_shuffle_f32x4(tmp35, tmp43, 136);
wt10 = _mm512_shuffle_f32x4(tmp35, tmp43, 221);
wt3 = _mm512_shuffle_f32x4(tmp37, tmp45, 136);
wt11 = _mm512_shuffle_f32x4(tmp37, tmp45, 221);
wt4 = _mm512_shuffle_f32x4(tmp39, tmp47, 136);
wt12 = _mm512_shuffle_f32x4(tmp39, tmp47, 221);
wt5 = _mm512_shuffle_f32x4(tmp34, tmp42, 136);
wt13 = _mm512_shuffle_f32x4(tmp34, tmp42, 221);
wt6 = _mm512_shuffle_f32x4(tmp36, tmp44, 136);
wt14 = _mm512_shuffle_f32x4(tmp36, tmp44, 221);
wt7 = _mm512_shuffle_f32x4(tmp38, tmp46, 136);
wt8 = _mm512_shuffle_f32x4(tmp40, tmp48, 136);
_mm512_mask_storeu_ps(arrangedW1+0+3098760*i5+10416*c1+48*k1, 63, wt1);
_mm512_mask_storeu_ps(arrangedW1+10392+3098760*i5+10416*c1+48*k1, 4032, wt1);
_mm512_mask_storeu_ps(arrangedW1+20784+3098760*i5+10416*c1+48*k1, 61440, wt1);
_mm512_mask_storeu_ps(arrangedW1+885360+3098760*i5+10416*c1+48*k1, 63, wt2);
_mm512_mask_storeu_ps(arrangedW1+895752+3098760*i5+10416*c1+48*k1, 4032, wt2);
_mm512_mask_storeu_ps(arrangedW1+906144+3098760*i5+10416*c1+48*k1, 61440, wt2);
_mm512_mask_storeu_ps(arrangedW1+1770720+3098760*i5+10416*c1+48*k1, 63, wt3);
_mm512_mask_storeu_ps(arrangedW1+1781112+3098760*i5+10416*c1+48*k1, 4032, wt3);
_mm512_mask_storeu_ps(arrangedW1+1791504+3098760*i5+10416*c1+48*k1, 61440, wt3);
_mm512_mask_storeu_ps(arrangedW1+2656080+3098760*i5+10416*c1+48*k1, 63, wt4);
_mm512_mask_storeu_ps(arrangedW1+2666472+3098760*i5+10416*c1+48*k1, 4032, wt4);
_mm512_mask_storeu_ps(arrangedW1+2676864+3098760*i5+10416*c1+48*k1, 61440, wt4);
_mm512_mask_storeu_ps(arrangedW1+442680+3098760*i5+10416*c1+48*k1, 63, wt5);
_mm512_mask_storeu_ps(arrangedW1+453072+3098760*i5+10416*c1+48*k1, 4032, wt5);
_mm512_mask_storeu_ps(arrangedW1+463464+3098760*i5+10416*c1+48*k1, 61440, wt5);
_mm512_mask_storeu_ps(arrangedW1+1328040+3098760*i5+10416*c1+48*k1, 63, wt6);
_mm512_mask_storeu_ps(arrangedW1+1338432+3098760*i5+10416*c1+48*k1, 4032, wt6);
_mm512_mask_storeu_ps(arrangedW1+1348824+3098760*i5+10416*c1+48*k1, 61440, wt6);
_mm512_mask_storeu_ps(arrangedW1+2213400+3098760*i5+10416*c1+48*k1, 63, wt7);
_mm512_mask_storeu_ps(arrangedW1+2223792+3098760*i5+10416*c1+48*k1, 4032, wt7);
_mm512_mask_storeu_ps(arrangedW1+2234184+3098760*i5+10416*c1+48*k1, 61440, wt7);
_mm512_mask_storeu_ps(arrangedW1+24+3098760*i5+10416*c1+48*k1, 63, wt8);
_mm512_mask_storeu_ps(arrangedW1+10416+3098760*i5+10416*c1+48*k1, 4032, wt8);
_mm512_mask_storeu_ps(arrangedW1+20808+3098760*i5+10416*c1+48*k1, 61440, wt8);
_mm512_mask_storeu_ps(arrangedW1+885384+3098760*i5+10416*c1+48*k1, 63, wt9);
_mm512_mask_storeu_ps(arrangedW1+895776+3098760*i5+10416*c1+48*k1, 4032, wt9);
_mm512_mask_storeu_ps(arrangedW1+906168+3098760*i5+10416*c1+48*k1, 61440, wt9);
_mm512_mask_storeu_ps(arrangedW1+1770744+3098760*i5+10416*c1+48*k1, 63, wt10);
_mm512_mask_storeu_ps(arrangedW1+1781136+3098760*i5+10416*c1+48*k1, 4032, wt10);
_mm512_mask_storeu_ps(arrangedW1+1791528+3098760*i5+10416*c1+48*k1, 61440, wt10);
_mm512_mask_storeu_ps(arrangedW1+2656104+3098760*i5+10416*c1+48*k1, 63, wt11);
_mm512_mask_storeu_ps(arrangedW1+2666496+3098760*i5+10416*c1+48*k1, 4032, wt11);
_mm512_mask_storeu_ps(arrangedW1+2676888+3098760*i5+10416*c1+48*k1, 61440, wt11);
_mm512_mask_storeu_ps(arrangedW1+442704+3098760*i5+10416*c1+48*k1, 63, wt12);
_mm512_mask_storeu_ps(arrangedW1+453096+3098760*i5+10416*c1+48*k1, 4032, wt12);
_mm512_mask_storeu_ps(arrangedW1+463488+3098760*i5+10416*c1+48*k1, 61440, wt12);
_mm512_mask_storeu_ps(arrangedW1+1328064+3098760*i5+10416*c1+48*k1, 63, wt13);
_mm512_mask_storeu_ps(arrangedW1+1338456+3098760*i5+10416*c1+48*k1, 4032, wt13);
_mm512_mask_storeu_ps(arrangedW1+1348848+3098760*i5+10416*c1+48*k1, 61440, wt13);
_mm512_mask_storeu_ps(arrangedW1+2213424+3098760*i5+10416*c1+48*k1, 63, wt14);
_mm512_mask_storeu_ps(arrangedW1+2223816+3098760*i5+10416*c1+48*k1, 4032, wt14);
_mm512_mask_storeu_ps(arrangedW1+2234208+3098760*i5+10416*c1+48*k1, 61440, wt14);
}
break;
}
case 2: {
ptrdiff_t k2 = 0;
for (; k2 != 217; ++k2) {
__m512 wt17 = _mm512_maskz_loadu_ps(16383, wtPtr1+0+3098760*i5+194432*j1+56*k2);
__m512 wt18 = _mm512_maskz_loadu_ps(16383, wtPtr1+12152+3098760*i5+194432*j1+56*k2);
__m512 wt19 = _mm512_maskz_loadu_ps(16383, wtPtr1+24304+3098760*i5+194432*j1+56*k2);
__m512 wt20 = _mm512_maskz_loadu_ps(16383, wtPtr1+36456+3098760*i5+194432*j1+56*k2);
__m512 wt21 = _mm512_maskz_loadu_ps(16383, wtPtr1+48608+3098760*i5+194432*j1+56*k2);
__m512 wt22 = _mm512_maskz_loadu_ps(16383, wtPtr1+60760+3098760*i5+194432*j1+56*k2);
__m512 wt23 = _mm512_maskz_loadu_ps(16383, wtPtr1+72912+3098760*i5+194432*j1+56*k2);
__m512 wt24 = _mm512_maskz_loadu_ps(16383, wtPtr1+85064+3098760*i5+194432*j1+56*k2);
__m512 wt25 = _mm512_maskz_loadu_ps(16383, wtPtr1+97216+3098760*i5+194432*j1+56*k2);
__m512 wt26 = _mm512_maskz_loadu_ps(16383, wtPtr1+109368+3098760*i5+194432*j1+56*k2);
__m512 wt27 = _mm512_maskz_loadu_ps(16383, wtPtr1+121520+3098760*i5+194432*j1+56*k2);
__m512 wt28 = _mm512_maskz_loadu_ps(16383, wtPtr1+133672+3098760*i5+194432*j1+56*k2);
__m512 wt29 = _mm512_maskz_loadu_ps(16383, wtPtr1+145824+3098760*i5+194432*j1+56*k2);
__m512 wt30 = _mm512_maskz_loadu_ps(16383, wtPtr1+157976+3098760*i5+194432*j1+56*k2);
__m512 wt31 = _mm512_maskz_loadu_ps(16383, wtPtr1+170128+3098760*i5+194432*j1+56*k2);
__m512 wt32 = _mm512_maskz_loadu_ps(16383, wtPtr1+182280+3098760*i5+194432*j1+56*k2);
__m512 tmp49 = _mm512_unpacklo_ps(wt17, wt18);
__m512 tmp50 = _mm512_unpackhi_ps(wt17, wt18);
__m512 tmp51 = _mm512_unpacklo_ps(wt19, wt20);
__m512 tmp52 = _mm512_unpackhi_ps(wt19, wt20);
__m512 tmp53 = _mm512_unpacklo_ps(wt21, wt22);
__m512 tmp54 = _mm512_unpackhi_ps(wt21, wt22);
__m512 tmp55 = _mm512_unpacklo_ps(wt23, wt24);
__m512 tmp56 = _mm512_unpackhi_ps(wt23, wt24);
__m512 tmp57 = _mm512_unpacklo_ps(wt25, wt26);
__m512 tmp58 = _mm512_unpackhi_ps(wt25, wt26);
__m512 tmp59 = _mm512_unpacklo_ps(wt27, wt28);
__m512 tmp60 = _mm512_unpackhi_ps(wt27, wt28);
__m512 tmp61 = _mm512_unpacklo_ps(wt29, wt30);
__m512 tmp62 = _mm512_unpackhi_ps(wt29, wt30);
__m512 tmp63 = _mm512_unpacklo_ps(wt31, wt32);
__m512 tmp64 = _mm512_unpackhi_ps(wt31, wt32);
__m512 tmp65 = _mm512_shuffle_ps(tmp49, tmp51, 68);
__m512 tmp66 = _mm512_shuffle_ps(tmp49, tmp51, 238);
__m512 tmp67 = _mm512_shuffle_ps(tmp50, tmp52, 68);
__m512 tmp68 = _mm512_shuffle_ps(tmp50, tmp52, 238);
__m512 tmp69 = _mm512_shuffle_ps(tmp53, tmp55, 68);
__m512 tmp70 = _mm512_shuffle_ps(tmp53, tmp55, 238);
__m512 tmp71 = _mm512_shuffle_ps(tmp54, tmp56, 68);
__m512 tmp72 = _mm512_shuffle_ps(tmp54, tmp56, 238);
__m512 tmp73 = _mm512_shuffle_ps(tmp57, tmp59, 68);
__m512 tmp74 = _mm512_shuffle_ps(tmp57, tmp59, 238);
__m512 tmp75 = _mm512_shuffle_ps(tmp58, tmp60, 68);
__m512 tmp76 = _mm512_shuffle_ps(tmp58, tmp60, 238);
__m512 tmp77 = _mm512_shuffle_ps(tmp61, tmp63, 68);
__m512 tmp78 = _mm512_shuffle_ps(tmp61, tmp63, 238);
__m512 tmp79 = _mm512_shuffle_ps(tmp62, tmp64, 68);
__m512 tmp80 = _mm512_shuffle_ps(tmp62, tmp64, 238);
__m512 tmp81 = _mm512_shuffle_f32x4(tmp65, tmp69, 136);
__m512 tmp82 = _mm512_shuffle_f32x4(tmp65, tmp69, 221);
__m512 tmp83 = _mm512_shuffle_f32x4(tmp66, tmp70, 136);
__m512 tmp84 = _mm512_shuffle_f32x4(tmp66, tmp70, 221);
__m512 tmp85 = _mm512_shuffle_f32x4(tmp67, tmp71, 136);
__m512 tmp86 = _mm512_shuffle_f32x4(tmp67, tmp71, 221);
__m512 tmp87 = _mm512_shuffle_f32x4(tmp68, tmp72, 136);
__m512 tmp88 = _mm512_shuffle_f32x4(tmp68, tmp72, 221);
__m512 tmp89 = _mm512_shuffle_f32x4(tmp73, tmp77, 136);
__m512 tmp90 = _mm512_shuffle_f32x4(tmp73, tmp77, 221);
__m512 tmp91 = _mm512_shuffle_f32x4(tmp74, tmp78, 136);
__m512 tmp92 = _mm512_shuffle_f32x4(tmp74, tmp78, 221);
__m512 tmp93 = _mm512_shuffle_f32x4(tmp75, tmp79, 136);
__m512 tmp94 = _mm512_shuffle_f32x4(tmp75, tmp79, 221);
__m512 tmp95 = _mm512_shuffle_f32x4(tmp76, tmp80, 136);
__m512 tmp96 = _mm512_shuffle_f32x4(tmp76, tmp80, 221);
wt17 = _mm512_shuffle_f32x4(tmp81, tmp89, 136);
wt25 = _mm512_shuffle_f32x4(tmp81, tmp89, 221);
wt18 = _mm512_shuffle_f32x4(tmp83, tmp91, 136);
wt26 = _mm512_shuffle_f32x4(tmp83, tmp91, 221);
wt19 = _mm512_shuffle_f32x4(tmp85, tmp93, 136);
wt27 = _mm512_shuffle_f32x4(tmp85, tmp93, 221);
wt20 = _mm512_shuffle_f32x4(tmp87, tmp95, 136);
wt28 = _mm512_shuffle_f32x4(tmp87, tmp95, 221);
wt21 = _mm512_shuffle_f32x4(tmp82, tmp90, 136);
wt29 = _mm512_shuffle_f32x4(tmp82, tmp90, 221);
wt22 = _mm512_shuffle_f32x4(tmp84, tmp92, 136);
wt30 = _mm512_shuffle_f32x4(tmp84, tmp92, 221);
wt23 = _mm512_shuffle_f32x4(tmp86, tmp94, 136);
wt24 = _mm512_shuffle_f32x4(tmp88, tmp96, 136);
_mm512_mask_storeu_ps(arrangedW1+8+3098760*i5+10416*c1+48*k2, 15, wt17);
_mm512_mask_storeu_ps(arrangedW1+10400+3098760*i5+10416*c1+48*k2, 1008, wt17);
_mm512_mask_storeu_ps(arrangedW1+20792+3098760*i5+10416*c1+48*k2, 64512, wt17);
_mm512_mask_storeu_ps(arrangedW1+885368+3098760*i5+10416*c1+48*k2, 15, wt18);
_mm512_mask_storeu_ps(arrangedW1+895760+3098760*i5+10416*c1+48*k2, 1008, wt18);
_mm512_mask_storeu_ps(arrangedW1+906152+3098760*i5+10416*c1+48*k2, 64512, wt18);
_mm512_mask_storeu_ps(arrangedW1+1770728+3098760*i5+10416*c1+48*k2, 15, wt19);
_mm512_mask_storeu_ps(arrangedW1+1781120+3098760*i5+10416*c1+48*k2, 1008, wt19);
_mm512_mask_storeu_ps(arrangedW1+1791512+3098760*i5+10416*c1+48*k2, 64512, wt19);
_mm512_mask_storeu_ps(arrangedW1+2656088+3098760*i5+10416*c1+48*k2, 15, wt20);
_mm512_mask_storeu_ps(arrangedW1+2666480+3098760*i5+10416*c1+48*k2, 1008, wt20);
_mm512_mask_storeu_ps(arrangedW1+2676872+3098760*i5+10416*c1+48*k2, 64512, wt20);
_mm512_mask_storeu_ps(arrangedW1+442688+3098760*i5+10416*c1+48*k2, 15, wt21);
_mm512_mask_storeu_ps(arrangedW1+453080+3098760*i5+10416*c1+48*k2, 1008, wt21);
_mm512_mask_storeu_ps(arrangedW1+463472+3098760*i5+10416*c1+48*k2, 64512, wt21);
_mm512_mask_storeu_ps(arrangedW1+1328048+3098760*i5+10416*c1+48*k2, 15, wt22);
_mm512_mask_storeu_ps(arrangedW1+1338440+3098760*i5+10416*c1+48*k2, 1008, wt22);
_mm512_mask_storeu_ps(arrangedW1+1348832+3098760*i5+10416*c1+48*k2, 64512, wt22);
_mm512_mask_storeu_ps(arrangedW1+2213408+3098760*i5+10416*c1+48*k2, 15, wt23);
_mm512_mask_storeu_ps(arrangedW1+2223800+3098760*i5+10416*c1+48*k2, 1008, wt23);
_mm512_mask_storeu_ps(arrangedW1+2234192+3098760*i5+10416*c1+48*k2, 64512, wt23);
_mm512_mask_storeu_ps(arrangedW1+32+3098760*i5+10416*c1+48*k2, 15, wt24);
_mm512_mask_storeu_ps(arrangedW1+10424+3098760*i5+10416*c1+48*k2, 1008, wt24);
_mm512_mask_storeu_ps(arrangedW1+20816+3098760*i5+10416*c1+48*k2, 64512, wt24);
_mm512_mask_storeu_ps(arrangedW1+885392+3098760*i5+10416*c1+48*k2, 15, wt25);
_mm512_mask_storeu_ps(arrangedW1+895784+3098760*i5+10416*c1+48*k2, 1008, wt25);
_mm512_mask_storeu_ps(arrangedW1+906176+3098760*i5+10416*c1+48*k2, 64512, wt25);
_mm512_mask_storeu_ps(arrangedW1+1770752+3098760*i5+10416*c1+48*k2, 15, wt26);
_mm512_mask_storeu_ps(arrangedW1+1781144+3098760*i5+10416*c1+48*k2, 1008, wt26);
_mm512_mask_storeu_ps(arrangedW1+1791536+3098760*i5+10416*c1+48*k2, 64512, wt26);
_mm512_mask_storeu_ps(arrangedW1+2656112+3098760*i5+10416*c1+48*k2, 15, wt27);
_mm512_mask_storeu_ps(arrangedW1+2666504+3098760*i5+10416*c1+48*k2, 1008, wt27);
_mm512_mask_storeu_ps(arrangedW1+2676896+3098760*i5+10416*c1+48*k2, 64512, wt27);
_mm512_mask_storeu_ps(arrangedW1+442712+3098760*i5+10416*c1+48*k2, 15, wt28);
_mm512_mask_storeu_ps(arrangedW1+453104+3098760*i5+10416*c1+48*k2, 1008, wt28);
_mm512_mask_storeu_ps(arrangedW1+463496+3098760*i5+10416*c1+48*k2, 64512, wt28);
_mm512_mask_storeu_ps(arrangedW1+1328072+3098760*i5+10416*c1+48*k2, 15, wt29);
_mm512_mask_storeu_ps(arrangedW1+1338464+3098760*i5+10416*c1+48*k2, 1008, wt29);
_mm512_mask_storeu_ps(arrangedW1+1348856+3098760*i5+10416*c1+48*k2, 64512, wt29);
_mm512_mask_storeu_ps(arrangedW1+2213432+3098760*i5+10416*c1+48*k2, 15, wt30);
_mm512_mask_storeu_ps(arrangedW1+2223824+3098760*i5+10416*c1+48*k2, 1008, wt30);
_mm512_mask_storeu_ps(arrangedW1+2234216+3098760*i5+10416*c1+48*k2, 64512, wt30);
}
break;
}
default: {
ptrdiff_t k3 = 0;
for (; k3 != 217; ++k3) {
__m512 wt33 = _mm512_maskz_loadu_ps(16383, wtPtr1+0+3098760*i5+194432*j1+56*k3);
__m512 wt34 = _mm512_maskz_loadu_ps(16383, wtPtr1+12152+3098760*i5+194432*j1+56*k3);
__m512 wt35 = _mm512_maskz_loadu_ps(16383, wtPtr1+24304+3098760*i5+194432*j1+56*k3);
__m512 wt36 = _mm512_maskz_loadu_ps(16383, wtPtr1+36456+3098760*i5+194432*j1+56*k3);
__m512 wt37 = _mm512_maskz_loadu_ps(16383, wtPtr1+48608+3098760*i5+194432*j1+56*k3);
__m512 wt38 = _mm512_maskz_loadu_ps(16383, wtPtr1+60760+3098760*i5+194432*j1+56*k3);
__m512 wt39 = _mm512_maskz_loadu_ps(16383, wtPtr1+72912+3098760*i5+194432*j1+56*k3);
__m512 wt40 = _mm512_maskz_loadu_ps(16383, wtPtr1+85064+3098760*i5+194432*j1+56*k3);
__m512 wt41 = _mm512_maskz_loadu_ps(16383, wtPtr1+97216+3098760*i5+194432*j1+56*k3);
__m512 wt42 = _mm512_maskz_loadu_ps(16383, wtPtr1+109368+3098760*i5+194432*j1+56*k3);
__m512 wt43 = _mm512_maskz_loadu_ps(16383, wtPtr1+121520+3098760*i5+194432*j1+56*k3);
__m512 wt44 = _mm512_maskz_loadu_ps(16383, wtPtr1+133672+3098760*i5+194432*j1+56*k3);
__m512 wt45 = _mm512_maskz_loadu_ps(16383, wtPtr1+145824+3098760*i5+194432*j1+56*k3);
__m512 wt46 = _mm512_maskz_loadu_ps(16383, wtPtr1+157976+3098760*i5+194432*j1+56*k3);
__m512 wt47 = _mm512_maskz_loadu_ps(16383, wtPtr1+170128+3098760*i5+194432*j1+56*k3);
__m512 wt48 = _mm512_maskz_loadu_ps(16383, wtPtr1+182280+3098760*i5+194432*j1+56*k3);
__m512 tmp97 = _mm512_unpacklo_ps(wt33, wt34);
__m512 tmp98 = _mm512_unpackhi_ps(wt33, wt34);
__m512 tmp99 = _mm512_unpacklo_ps(wt35, wt36);
__m512 tmp100 = _mm512_unpackhi_ps(wt35, wt36);
__m512 tmp101 = _mm512_unpacklo_ps(wt37, wt38);
__m512 tmp102 = _mm512_unpackhi_ps(wt37, wt38);
__m512 tmp103 = _mm512_unpacklo_ps(wt39, wt40);
__m512 tmp104 = _mm512_unpackhi_ps(wt39, wt40);
__m512 tmp105 = _mm512_unpacklo_ps(wt41, wt42);
__m512 tmp106 = _mm512_unpackhi_ps(wt41, wt42);
__m512 tmp107 = _mm512_unpacklo_ps(wt43, wt44);
__m512 tmp108 = _mm512_unpackhi_ps(wt43, wt44);
__m512 tmp109 = _mm512_unpacklo_ps(wt45, wt46);
__m512 tmp110 = _mm512_unpackhi_ps(wt45, wt46);
__m512 tmp111 = _mm512_unpacklo_ps(wt47, wt48);
__m512 tmp112 = _mm512_unpackhi_ps(wt47, wt48);
__m512 tmp113 = _mm512_shuffle_ps(tmp97, tmp99, 68);
__m512 tmp114 = _mm512_shuffle_ps(tmp97, tmp99, 238);
__m512 tmp115 = _mm512_shuffle_ps(tmp98, tmp100, 68);
__m512 tmp116 = _mm512_shuffle_ps(tmp98, tmp100, 238);
__m512 tmp117 = _mm512_shuffle_ps(tmp101, tmp103, 68);
__m512 tmp118 = _mm512_shuffle_ps(tmp101, tmp103, 238);
__m512 tmp119 = _mm512_shuffle_ps(tmp102, tmp104, 68);
__m512 tmp120 = _mm512_shuffle_ps(tmp102, tmp104, 238);
__m512 tmp121 = _mm512_shuffle_ps(tmp105, tmp107, 68);
__m512 tmp122 = _mm512_shuffle_ps(tmp105, tmp107, 238);
__m512 tmp123 = _mm512_shuffle_ps(tmp106, tmp108, 68);
__m512 tmp124 = _mm512_shuffle_ps(tmp106, tmp108, 238);
__m512 tmp125 = _mm512_shuffle_ps(tmp109, tmp111, 68);
__m512 tmp126 = _mm512_shuffle_ps(tmp109, tmp111, 238);
__m512 tmp127 = _mm512_shuffle_ps(tmp110, tmp112, 68);
__m512 tmp128 = _mm512_shuffle_ps(tmp110, tmp112, 238);
__m512 tmp129 = _mm512_shuffle_f32x4(tmp113, tmp117, 136);
__m512 tmp130 = _mm512_shuffle_f32x4(tmp113, tmp117, 221);
__m512 tmp131 = _mm512_shuffle_f32x4(tmp114, tmp118, 136);
__m512 tmp132 = _mm512_shuffle_f32x4(tmp114, tmp118, 221);
__m512 tmp133 = _mm512_shuffle_f32x4(tmp115, tmp119, 136);
__m512 tmp134 = _mm512_shuffle_f32x4(tmp115, tmp119, 221);
__m512 tmp135 = _mm512_shuffle_f32x4(tmp116, tmp120, 136);
__m512 tmp136 = _mm512_shuffle_f32x4(tmp116, tmp120, 221);
__m512 tmp137 = _mm512_shuffle_f32x4(tmp121, tmp125, 136);
__m512 tmp138 = _mm512_shuffle_f32x4(tmp121, tmp125, 221);
__m512 tmp139 = _mm512_shuffle_f32x4(tmp122, tmp126, 136);
__m512 tmp140 = _mm512_shuffle_f32x4(tmp122, tmp126, 221);
__m512 tmp141 = _mm512_shuffle_f32x4(tmp123, tmp127, 136);
__m512 tmp142 = _mm512_shuffle_f32x4(tmp123, tmp127, 221);
__m512 tmp143 = _mm512_shuffle_f32x4(tmp124, tmp128, 136);
__m512 tmp144 = _mm512_shuffle_f32x4(tmp124, tmp128, 221);
wt33 = _mm512_shuffle_f32x4(tmp129, tmp137, 136);
wt41 = _mm512_shuffle_f32x4(tmp129, tmp137, 221);
wt34 = _mm512_shuffle_f32x4(tmp131, tmp139, 136);
wt42 = _mm512_shuffle_f32x4(tmp131, tmp139, 221);
wt35 = _mm512_shuffle_f32x4(tmp133, tmp141, 136);
wt43 = _mm512_shuffle_f32x4(tmp133, tmp141, 221);
wt36 = _mm512_shuffle_f32x4(tmp135, tmp143, 136);
wt44 = _mm512_shuffle_f32x4(tmp135, tmp143, 221);
wt37 = _mm512_shuffle_f32x4(tmp130, tmp138, 136);
wt45 = _mm512_shuffle_f32x4(tmp130, tmp138, 221);
wt38 = _mm512_shuffle_f32x4(tmp132, tmp140, 136);
wt46 = _mm512_shuffle_f32x4(tmp132, tmp140, 221);
wt39 = _mm512_shuffle_f32x4(tmp134, tmp142, 136);
wt40 = _mm512_shuffle_f32x4(tmp136, tmp144, 136);
_mm512_mask_storeu_ps(arrangedW1+16+3098760*i5+10416*c1+48*k3, 3, wt33);
_mm512_mask_storeu_ps(arrangedW1+10408+3098760*i5+10416*c1+48*k3, 252, wt33);
_mm512_mask_storeu_ps(arrangedW1+20800+3098760*i5+10416*c1+48*k3, 16128, wt33);
_mm512_mask_storeu_ps(arrangedW1+31192+3098760*i5+10416*c1+48*k3, 49152, wt33);
_mm512_mask_storeu_ps(arrangedW1+885376+3098760*i5+10416*c1+48*k3, 3, wt34);
_mm512_mask_storeu_ps(arrangedW1+895768+3098760*i5+10416*c1+48*k3, 252, wt34);
_mm512_mask_storeu_ps(arrangedW1+906160+3098760*i5+10416*c1+48*k3, 16128, wt34);
_mm512_mask_storeu_ps(arrangedW1+916552+3098760*i5+10416*c1+48*k3, 49152, wt34);
_mm512_mask_storeu_ps(arrangedW1+1770736+3098760*i5+10416*c1+48*k3, 3, wt35);
_mm512_mask_storeu_ps(arrangedW1+1781128+3098760*i5+10416*c1+48*k3, 252, wt35);
_mm512_mask_storeu_ps(arrangedW1+1791520+3098760*i5+10416*c1+48*k3, 16128, wt35);
_mm512_mask_storeu_ps(arrangedW1+1801912+3098760*i5+10416*c1+48*k3, 49152, wt35);
_mm512_mask_storeu_ps(arrangedW1+2656096+3098760*i5+10416*c1+48*k3, 3, wt36);
_mm512_mask_storeu_ps(arrangedW1+2666488+3098760*i5+10416*c1+48*k3, 252, wt36);
_mm512_mask_storeu_ps(arrangedW1+2676880+3098760*i5+10416*c1+48*k3, 16128, wt36);
_mm512_mask_storeu_ps(arrangedW1+2687272+3098760*i5+10416*c1+48*k3, 49152, wt36);
_mm512_mask_storeu_ps(arrangedW1+442696+3098760*i5+10416*c1+48*k3, 3, wt37);
_mm512_mask_storeu_ps(arrangedW1+453088+3098760*i5+10416*c1+48*k3, 252, wt37);
_mm512_mask_storeu_ps(arrangedW1+463480+3098760*i5+10416*c1+48*k3, 16128, wt37);
_mm512_mask_storeu_ps(arrangedW1+473872+3098760*i5+10416*c1+48*k3, 49152, wt37);
_mm512_mask_storeu_ps(arrangedW1+1328056+3098760*i5+10416*c1+48*k3, 3, wt38);
_mm512_mask_storeu_ps(arrangedW1+1338448+3098760*i5+10416*c1+48*k3, 252, wt38);
_mm512_mask_storeu_ps(arrangedW1+1348840+3098760*i5+10416*c1+48*k3, 16128, wt38);
_mm512_mask_storeu_ps(arrangedW1+1359232+3098760*i5+10416*c1+48*k3, 49152, wt38);
_mm512_mask_storeu_ps(arrangedW1+2213416+3098760*i5+10416*c1+48*k3, 3, wt39);
_mm512_mask_storeu_ps(arrangedW1+2223808+3098760*i5+10416*c1+48*k3, 252, wt39);
_mm512_mask_storeu_ps(arrangedW1+2234200+3098760*i5+10416*c1+48*k3, 16128, wt39);
_mm512_mask_storeu_ps(arrangedW1+2244592+3098760*i5+10416*c1+48*k3, 49152, wt39);
_mm512_mask_storeu_ps(arrangedW1+40+3098760*i5+10416*c1+48*k3, 3, wt40);
_mm512_mask_storeu_ps(arrangedW1+10432+3098760*i5+10416*c1+48*k3, 252, wt40);
_mm512_mask_storeu_ps(arrangedW1+20824+3098760*i5+10416*c1+48*k3, 16128, wt40);
_mm512_mask_storeu_ps(arrangedW1+31216+3098760*i5+10416*c1+48*k3, 49152, wt40);
_mm512_mask_storeu_ps(arrangedW1+885400+3098760*i5+10416*c1+48*k3, 3, wt41);
_mm512_mask_storeu_ps(arrangedW1+895792+3098760*i5+10416*c1+48*k3, 252, wt41);
_mm512_mask_storeu_ps(arrangedW1+906184+3098760*i5+10416*c1+48*k3, 16128, wt41);
_mm512_mask_storeu_ps(arrangedW1+916576+3098760*i5+10416*c1+48*k3, 49152, wt41);
_mm512_mask_storeu_ps(arrangedW1+1770760+3098760*i5+10416*c1+48*k3, 3, wt42);
_mm512_mask_storeu_ps(arrangedW1+1781152+3098760*i5+10416*c1+48*k3, 252, wt42);
_mm512_mask_storeu_ps(arrangedW1+1791544+3098760*i5+10416*c1+48*k3, 16128, wt42);
_mm512_mask_storeu_ps(arrangedW1+1801936+3098760*i5+10416*c1+48*k3, 49152, wt42);
_mm512_mask_storeu_ps(arrangedW1+2656120+3098760*i5+10416*c1+48*k3, 3, wt43);
_mm512_mask_storeu_ps(arrangedW1+2666512+3098760*i5+10416*c1+48*k3, 252, wt43);
_mm512_mask_storeu_ps(arrangedW1+2676904+3098760*i5+10416*c1+48*k3, 16128, wt43);
_mm512_mask_storeu_ps(arrangedW1+2687296+3098760*i5+10416*c1+48*k3, 49152, wt43);
_mm512_mask_storeu_ps(arrangedW1+442720+3098760*i5+10416*c1+48*k3, 3, wt44);
_mm512_mask_storeu_ps(arrangedW1+453112+3098760*i5+10416*c1+48*k3, 252, wt44);
_mm512_mask_storeu_ps(arrangedW1+463504+3098760*i5+10416*c1+48*k3, 16128, wt44);
_mm512_mask_storeu_ps(arrangedW1+473896+3098760*i5+10416*c1+48*k3, 49152, wt44);
_mm512_mask_storeu_ps(arrangedW1+1328080+3098760*i5+10416*c1+48*k3, 3, wt45);
_mm512_mask_storeu_ps(arrangedW1+1338472+3098760*i5+10416*c1+48*k3, 252, wt45);
_mm512_mask_storeu_ps(arrangedW1+1348864+3098760*i5+10416*c1+48*k3, 16128, wt45);
_mm512_mask_storeu_ps(arrangedW1+1359256+3098760*i5+10416*c1+48*k3, 49152, wt45);
_mm512_mask_storeu_ps(arrangedW1+2213440+3098760*i5+10416*c1+48*k3, 3, wt46);
_mm512_mask_storeu_ps(arrangedW1+2223832+3098760*i5+10416*c1+48*k3, 252, wt46);
_mm512_mask_storeu_ps(arrangedW1+2234224+3098760*i5+10416*c1+48*k3, 16128, wt46);
_mm512_mask_storeu_ps(arrangedW1+2244616+3098760*i5+10416*c1+48*k3, 49152, wt46);
}
break;
}
}
if (j1 >= jj1) return;
}
}
if (j1 == 15) {
__m512 bias2 = _mm512_setzero_ps();
if (!e1) {
bias2 = _mm512_maskz_loadu_ps(32767, biasPtr1-0+1020*i5+64*j1);
}
_mm512_mask_storeu_ps(arrangedB1-0+1020*i5+64*j1, 32767, bias2);
ptrdiff_t c2 = (size_t)(0+16*j1)/6;
ptrdiff_t k4 = 0;
for (; k4 != 217; ++k4) {
__m512 wt49 = _mm512_maskz_loadu_ps(16383, wtPtr1+0+3098760*i5+194432*j1+56*k4);
__m512 wt50 = _mm512_maskz_loadu_ps(16383, wtPtr1+12152+3098760*i5+194432*j1+56*k4);
__m512 wt51 = _mm512_maskz_loadu_ps(16383, wtPtr1+24304+3098760*i5+194432*j1+56*k4);
__m512 wt52 = _mm512_maskz_loadu_ps(16383, wtPtr1+36456+3098760*i5+194432*j1+56*k4);
__m512 wt53 = _mm512_maskz_loadu_ps(16383, wtPtr1+48608+3098760*i5+194432*j1+56*k4);
__m512 wt54 = _mm512_maskz_loadu_ps(16383, wtPtr1+60760+3098760*i5+194432*j1+56*k4);
__m512 wt55 = _mm512_maskz_loadu_ps(16383, wtPtr1+72912+3098760*i5+194432*j1+56*k4);
__m512 wt56 = _mm512_maskz_loadu_ps(16383, wtPtr1+85064+3098760*i5+194432*j1+56*k4);
__m512 wt57 = _mm512_maskz_loadu_ps(16383, wtPtr1+97216+3098760*i5+194432*j1+56*k4);
__m512 wt58 = _mm512_maskz_loadu_ps(16383, wtPtr1+109368+3098760*i5+194432*j1+56*k4);
__m512 wt59 = _mm512_maskz_loadu_ps(16383, wtPtr1+121520+3098760*i5+194432*j1+56*k4);
__m512 wt60 = _mm512_maskz_loadu_ps(16383, wtPtr1+133672+3098760*i5+194432*j1+56*k4);
__m512 wt61 = _mm512_maskz_loadu_ps(16383, wtPtr1+145824+3098760*i5+194432*j1+56*k4);
__m512 wt62 = _mm512_maskz_loadu_ps(16383, wtPtr1+157976+3098760*i5+194432*j1+56*k4);
__m512 wt63 = _mm512_maskz_loadu_ps(16383, wtPtr1+170128+3098760*i5+194432*j1+56*k4);
__m512 tmp145 = _mm512_unpacklo_ps(wt49, wt50);
__m512 tmp146 = _mm512_unpackhi_ps(wt49, wt50);
__m512 tmp147 = _mm512_unpacklo_ps(wt51, wt52);
__m512 tmp148 = _mm512_unpackhi_ps(wt51, wt52);
__m512 tmp149 = _mm512_unpacklo_ps(wt53, wt54);
__m512 tmp150 = _mm512_unpackhi_ps(wt53, wt54);
__m512 tmp151 = _mm512_unpacklo_ps(wt55, wt56);
__m512 tmp152 = _mm512_unpackhi_ps(wt55, wt56);
__m512 tmp153 = _mm512_unpacklo_ps(wt57, wt58);
__m512 tmp154 = _mm512_unpackhi_ps(wt57, wt58);
__m512 tmp155 = _mm512_unpacklo_ps(wt59, wt60);
__m512 tmp156 = _mm512_unpackhi_ps(wt59, wt60);
__m512 tmp157 = _mm512_unpacklo_ps(wt61, wt62);
__m512 tmp158 = _mm512_unpackhi_ps(wt61, wt62);
__m512 tmp159 = _mm512_unpacklo_ps(wt63, wt63);
__m512 tmp160 = _mm512_unpackhi_ps(wt63, wt63);
__m512 tmp161 = _mm512_shuffle_ps(tmp145, tmp147, 68);
__m512 tmp162 = _mm512_shuffle_ps(tmp145, tmp147, 238);
__m512 tmp163 = _mm512_shuffle_ps(tmp146, tmp148, 68);
__m512 tmp164 = _mm512_shuffle_ps(tmp146, tmp148, 238);
__m512 tmp165 = _mm512_shuffle_ps(tmp149, tmp151, 68);
__m512 tmp166 = _mm512_shuffle_ps(tmp149, tmp151, 238);
__m512 tmp167 = _mm512_shuffle_ps(tmp150, tmp152, 68);
__m512 tmp168 = _mm512_shuffle_ps(tmp150, tmp152, 238);
__m512 tmp169 = _mm512_shuffle_ps(tmp153, tmp155, 68);
__m512 tmp170 = _mm512_shuffle_ps(tmp153, tmp155, 238);
__m512 tmp171 = _mm512_shuffle_ps(tmp154, tmp156, 68);
__m512 tmp172 = _mm512_shuffle_ps(tmp154, tmp156, 238);
__m512 tmp173 = _mm512_shuffle_ps(tmp157, tmp159, 68);
__m512 tmp174 = _mm512_shuffle_ps(tmp157, tmp159, 238);
__m512 tmp175 = _mm512_shuffle_ps(tmp158, tmp160, 68);
__m512 tmp176 = _mm512_shuffle_ps(tmp158, tmp160, 238);
__m512 tmp177 = _mm512_shuffle_f32x4(tmp161, tmp165, 136);
__m512 tmp178 = _mm512_shuffle_f32x4(tmp161, tmp165, 221);
__m512 tmp179 = _mm512_shuffle_f32x4(tmp162, tmp166, 136);
__m512 tmp180 = _mm512_shuffle_f32x4(tmp162, tmp166, 221);
__m512 tmp181 = _mm512_shuffle_f32x4(tmp163, tmp167, 136);
__m512 tmp182 = _mm512_shuffle_f32x4(tmp163, tmp167, 221);
__m512 tmp183 = _mm512_shuffle_f32x4(tmp164, tmp168, 136);
__m512 tmp184 = _mm512_shuffle_f32x4(tmp164, tmp168, 221);
__m512 tmp185 = _mm512_shuffle_f32x4(tmp169, tmp173, 136);
__m512 tmp186 = _mm512_shuffle_f32x4(tmp169, tmp173, 221);
__m512 tmp187 = _mm512_shuffle_f32x4(tmp170, tmp174, 136);
__m512 tmp188 = _mm512_shuffle_f32x4(tmp170, tmp174, 221);
__m512 tmp189 = _mm512_shuffle_f32x4(tmp171, tmp175, 136);
__m512 tmp190 = _mm512_shuffle_f32x4(tmp171, tmp175, 221);
__m512 tmp191 = _mm512_shuffle_f32x4(tmp172, tmp176, 136);
__m512 tmp192 = _mm512_shuffle_f32x4(tmp172, tmp176, 221);
wt49 = _mm512_shuffle_f32x4(tmp177, tmp185, 136);
wt57 = _mm512_shuffle_f32x4(tmp177, tmp185, 221);
wt50 = _mm512_shuffle_f32x4(tmp179, tmp187, 136);
wt58 = _mm512_shuffle_f32x4(tmp179, tmp187, 221);
wt51 = _mm512_shuffle_f32x4(tmp181, tmp189, 136);
wt59 = _mm512_shuffle_f32x4(tmp181, tmp189, 221);
wt52 = _mm512_shuffle_f32x4(tmp183, tmp191, 136);
wt60 = _mm512_shuffle_f32x4(tmp183, tmp191, 221);
wt53 = _mm512_shuffle_f32x4(tmp178, tmp186, 136);
wt61 = _mm512_shuffle_f32x4(tmp178, tmp186, 221);
wt54 = _mm512_shuffle_f32x4(tmp180, tmp188, 136);
wt62 = _mm512_shuffle_f32x4(tmp180, tmp188, 221);
wt55 = _mm512_shuffle_f32x4(tmp182, tmp190, 136);
wt56 = _mm512_shuffle_f32x4(tmp184, tmp192, 136);
_mm512_mask_storeu_ps(arrangedW1+0+3098760*i5+10416*c2+48*k4, 63, wt49);
_mm512_mask_storeu_ps(arrangedW1+10392+3098760*i5+10416*c2+48*k4, 4032, wt49);
_mm512_mask_storeu_ps(arrangedW1+20784+3098760*i5+10416*c2+24*k4, 28672, wt49);
_mm512_mask_storeu_ps(arrangedW1+885360+3098760*i5+10416*c2+48*k4, 63, wt50);
_mm512_mask_storeu_ps(arrangedW1+895752+3098760*i5+10416*c2+48*k4, 4032, wt50);
_mm512_mask_storeu_ps(arrangedW1+906144+3098760*i5+10416*c2+24*k4, 28672, wt50);
_mm512_mask_storeu_ps(arrangedW1+1770720+3098760*i5+10416*c2+48*k4, 63, wt51);
_mm512_mask_storeu_ps(arrangedW1+1781112+3098760*i5+10416*c2+48*k4, 4032, wt51);
_mm512_mask_storeu_ps(arrangedW1+1791504+3098760*i5+10416*c2+24*k4, 28672, wt51);
_mm512_mask_storeu_ps(arrangedW1+2656080+3098760*i5+10416*c2+48*k4, 63, wt52);
_mm512_mask_storeu_ps(arrangedW1+2666472+3098760*i5+10416*c2+48*k4, 4032, wt52);
_mm512_mask_storeu_ps(arrangedW1+2676864+3098760*i5+10416*c2+24*k4, 28672, wt52);
_mm512_mask_storeu_ps(arrangedW1+442680+3098760*i5+10416*c2+48*k4, 63, wt53);
_mm512_mask_storeu_ps(arrangedW1+453072+3098760*i5+10416*c2+48*k4, 4032, wt53);
_mm512_mask_storeu_ps(arrangedW1+463464+3098760*i5+10416*c2+24*k4, 28672, wt53);
_mm512_mask_storeu_ps(arrangedW1+1328040+3098760*i5+10416*c2+48*k4, 63, wt54);
_mm512_mask_storeu_ps(arrangedW1+1338432+3098760*i5+10416*c2+48*k4, 4032, wt54);
_mm512_mask_storeu_ps(arrangedW1+1348824+3098760*i5+10416*c2+24*k4, 28672, wt54);
_mm512_mask_storeu_ps(arrangedW1+2213400+3098760*i5+10416*c2+48*k4, 63, wt55);
_mm512_mask_storeu_ps(arrangedW1+2223792+3098760*i5+10416*c2+48*k4, 4032, wt55);
_mm512_mask_storeu_ps(arrangedW1+2234184+3098760*i5+10416*c2+24*k4, 28672, wt55);
_mm512_mask_storeu_ps(arrangedW1+24+3098760*i5+10416*c2+48*k4, 63, wt56);
_mm512_mask_storeu_ps(arrangedW1+10416+3098760*i5+10416*c2+48*k4, 4032, wt56);
_mm512_mask_storeu_ps(arrangedW1+20796+3098760*i5+10416*c2+24*k4, 28672, wt56);
_mm512_mask_storeu_ps(arrangedW1+885384+3098760*i5+10416*c2+48*k4, 63, wt57);
_mm512_mask_storeu_ps(arrangedW1+895776+3098760*i5+10416*c2+48*k4, 4032, wt57);
_mm512_mask_storeu_ps(arrangedW1+906156+3098760*i5+10416*c2+24*k4, 28672, wt57);
_mm512_mask_storeu_ps(arrangedW1+1770744+3098760*i5+10416*c2+48*k4, 63, wt58);
_mm512_mask_storeu_ps(arrangedW1+1781136+3098760*i5+10416*c2+48*k4, 4032, wt58);
_mm512_mask_storeu_ps(arrangedW1+1791516+3098760*i5+10416*c2+24*k4, 28672, wt58);
_mm512_mask_storeu_ps(arrangedW1+2656104+3098760*i5+10416*c2+48*k4, 63, wt59);
_mm512_mask_storeu_ps(arrangedW1+2666496+3098760*i5+10416*c2+48*k4, 4032, wt59);
_mm512_mask_storeu_ps(arrangedW1+2676876+3098760*i5+10416*c2+24*k4, 28672, wt59);
_mm512_mask_storeu_ps(arrangedW1+442704+3098760*i5+10416*c2+48*k4, 63, wt60);
_mm512_mask_storeu_ps(arrangedW1+453096+3098760*i5+10416*c2+48*k4, 4032, wt60);
_mm512_mask_storeu_ps(arrangedW1+463476+3098760*i5+10416*c2+24*k4, 28672, wt60);
_mm512_mask_storeu_ps(arrangedW1+1328064+3098760*i5+10416*c2+48*k4, 63, wt61);
_mm512_mask_storeu_ps(arrangedW1+1338456+3098760*i5+10416*c2+48*k4, 4032, wt61);
_mm512_mask_storeu_ps(arrangedW1+1348836+3098760*i5+10416*c2+24*k4, 28672, wt61);
_mm512_mask_storeu_ps(arrangedW1+2213424+3098760*i5+10416*c2+48*k4, 63, wt62);
_mm512_mask_storeu_ps(arrangedW1+2223816+3098760*i5+10416*c2+48*k4, 4032, wt62);
_mm512_mask_storeu_ps(arrangedW1+2234196+3098760*i5+10416*c2+24*k4, 28672, wt62);
}
if (j1 >= jj1) return;
j1 = 16;
}
}

// Dispatches the filter-arrangement work onto a threader team.
//
// Builds a task descriptor that names Example1LoomArrangeFilts1Callee1
// as the callee, carries `tensors1` through the opaque `any1` slot, and
// declares a 3-dimensional hull of 16 x 5 x 1. The descriptor is then
// handed to Example1ThreaderDo1 together with `team13`.
//
// NOTE(review): presumably the callee is run once per point of the
// 16 x 5 x 1 hull -- confirm against Example1ThreaderDo1.
static void Example1LoomArrangeFilts1(Example1ThreaderTeam1* team13, char** tensors1) {
	Example1ThreaderTask1 job;

	// Hull: number of dimensions, then the extent of each one.
	job.nd1 = 3;
	job.hull1[0] = 16;
	job.hull1[1] = 5;
	job.hull1[2] = 1;

	// Payload and entry point for the task.
	job.any1 = tensors1;
	job.callee1 = Example1LoomArrangeFilts1Callee1;

	Example1ThreaderDo1(team13, &job);
}

static void Example1LoomArrangeDats1Callee1(Example1ThreaderTask1* task6, int64_t* pt8) {
char** tensors4 = task6->any1;
ptrdiff_t s1 = pt8[0];
ptrdiff_t c3 = pt8[1];
ptrdiff_t g3 = pt8[2];
ptrdiff_t e2 = 0;
char*restrict datPtr1 = tensors4[0]-12+11152260*e2;
char*restrict arranged1 = tensors4[1]+119705600*e2;
ptrdiff_t i6 = 1*g3;
ptrdiff_t j2 = 1*c3;
ptrdiff_t last1 = j2+0;
if (j2 < 26) {
ptrdiff_t rel1 = (size_t)(j2-0)%2;
ptrdiff_t h1 = 0+(size_t)(j2-0)/2*4;
for (; j2 < 26; rel1 = 0, h1 += 4) {
if (rel1 < 1) {
ptrdiff_t w1 = 0;
ptrdiff_t k5 = 33*s1;
ptrdiff_t kk1 = k5+(s1 < 12 ? 32 : 37);
for (; k5 <= kk1; ++k5) {
__m512 dat1 = _mm512_maskz_loadu_ps(65528, datPtr1+0+5796504*i6+13356*k5+252*h1+4*w1);
__m512 dat2 = _mm512_maskz_loadu_ps(65535, datPtr1+64+5796504*i6+13356*k5+252*h1+4*w1);
__m512 dat3 = _mm512_maskz_loadu_ps(65535, datPtr1+128+5796504*i6+13356*k5+252*h1+4*w1);
__m512 dat4 = _mm512_maskz_loadu_ps(65535, datPtr1+192+5796504*i6+13356*k5+252*h1+4*w1);
__m512i pm1 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 31, 31, 28, 24, 20, 16, 12, 8, 4, 0);
_mm512_mask_storeu_ps(arranged1+0+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat1, pm1, dat2));
_mm512_mask_storeu_ps(arranged1+32+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat3, pm1, dat4));
__m512i pm2 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 31, 31, 29, 25, 21, 17, 13, 9, 5, 1);
_mm512_mask_storeu_ps(arranged1+15554560+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat1, pm2, dat2));
_mm512_mask_storeu_ps(arranged1+15554592+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat3, pm2, dat4));
__m512i pm3 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 31, 31, 30, 26, 22, 18, 14, 10, 6, 2);
_mm512_mask_storeu_ps(arranged1+31109120+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat1, pm3, dat2));
_mm512_mask_storeu_ps(arranged1+31109152+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat3, pm3, dat4));
__m512i pm4 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 31, 31, 31, 27, 23, 19, 15, 11, 7, 3);
_mm512_mask_storeu_ps(arranged1+46663680+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat1, pm4, dat2));
_mm512_mask_storeu_ps(arranged1+46663712+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat3, pm4, dat4));
__m512 dat5 = _mm512_maskz_loadu_ps(65528, datPtr1+252+5796504*i6+13356*k5+252*h1+4*w1);
__m512 dat6 = _mm512_maskz_loadu_ps(65535, datPtr1+316+5796504*i6+13356*k5+252*h1+4*w1);
__m512 dat7 = _mm512_maskz_loadu_ps(65535, datPtr1+380+5796504*i6+13356*k5+252*h1+4*w1);
__m512 dat8 = _mm512_maskz_loadu_ps(65535, datPtr1+444+5796504*i6+13356*k5+252*h1+4*w1);
_mm512_mask_storeu_ps(arranged1+64+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat5, pm1, dat6));
_mm512_mask_storeu_ps(arranged1+96+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat7, pm1, dat8));
_mm512_mask_storeu_ps(arranged1+15554624+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat5, pm2, dat6));
_mm512_mask_storeu_ps(arranged1+15554656+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat7, pm2, dat8));
_mm512_mask_storeu_ps(arranged1+31109184+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat5, pm3, dat6));
_mm512_mask_storeu_ps(arranged1+31109216+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat7, pm3, dat8));
_mm512_mask_storeu_ps(arranged1+46663744+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat5, pm4, dat6));
_mm512_mask_storeu_ps(arranged1+46663776+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat7, pm4, dat8));
__m512 dat9 = _mm512_maskz_loadu_ps(65528, datPtr1+504+5796504*i6+13356*k5+252*h1+4*w1);
__m512 dat10 = _mm512_maskz_loadu_ps(65535, datPtr1+568+5796504*i6+13356*k5+252*h1+4*w1);
__m512 dat11 = _mm512_maskz_loadu_ps(65535, datPtr1+632+5796504*i6+13356*k5+252*h1+4*w1);
__m512 dat12 = _mm512_maskz_loadu_ps(65535, datPtr1+696+5796504*i6+13356*k5+252*h1+4*w1);
_mm512_mask_storeu_ps(arranged1+128+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat9, pm1, dat10));
_mm512_mask_storeu_ps(arranged1+160+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat11, pm1, dat12));
_mm512_mask_storeu_ps(arranged1+15554688+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat9, pm2, dat10));
_mm512_mask_storeu_ps(arranged1+15554720+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat11, pm2, dat12));
_mm512_mask_storeu_ps(arranged1+31109248+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat9, pm3, dat10));
_mm512_mask_storeu_ps(arranged1+31109280+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat11, pm3, dat12));
_mm512_mask_storeu_ps(arranged1+46663808+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat9, pm4, dat10));
_mm512_mask_storeu_ps(arranged1+46663840+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat11, pm4, dat12));
__m512 dat13 = _mm512_maskz_loadu_ps(65528, datPtr1+756+5796504*i6+13356*k5+252*h1+4*w1);
__m512 dat14 = _mm512_maskz_loadu_ps(65535, datPtr1+820+5796504*i6+13356*k5+252*h1+4*w1);
__m512 dat15 = _mm512_maskz_loadu_ps(65535, datPtr1+884+5796504*i6+13356*k5+252*h1+4*w1);
__m512 dat16 = _mm512_maskz_loadu_ps(65535, datPtr1+948+5796504*i6+13356*k5+252*h1+4*w1);
_mm512_mask_storeu_ps(arranged1+192+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat13, pm1, dat14));
_mm512_mask_storeu_ps(arranged1+224+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat15, pm1, dat16));
_mm512_mask_storeu_ps(arranged1+15554752+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat13, pm2, dat14));
_mm512_mask_storeu_ps(arranged1+15554784+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat15, pm2, dat16));
_mm512_mask_storeu_ps(arranged1+31109312+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat13, pm3, dat14));
_mm512_mask_storeu_ps(arranged1+31109344+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat15, pm3, dat16));
_mm512_mask_storeu_ps(arranged1+46663872+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat13, pm4, dat14));
_mm512_mask_storeu_ps(arranged1+46663904+3110912*i6+111104*j2+256*k5, 255, _mm512_permutex2var_ps(dat15, pm4, dat16));
}
if (j2 >= last1) return;
++j2;
rel1 = 1;
}
ptrdiff_t w2 = 64;
ptrdiff_t k6 = 33*s1;
ptrdiff_t kk2 = k6+(s1 < 12 ? 32 : 37);
for (; k6 <= kk2; ++k6) {
__m512 dat17 = _mm512_maskz_loadu_ps(3, datPtr1+0+5796504*i6+13356*k6+252*h1+4*w2);
__m512i pm5 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 31, 31, 28, 24, 20, 16, 12, 8, 4, 0);
_mm512_mask_storeu_ps(arranged1+0+3110912*i6+111104*j2+256*k6, 65535, _mm512_permutex2var_ps(dat17, pm5, _mm512_setzero_ps()));
__m512i pm6 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 31, 31, 29, 25, 21, 17, 13, 9, 5, 1);
_mm512_mask_storeu_ps(arranged1+15554560+3110912*i6+111104*j2+256*k6, 65535, _mm512_permutex2var_ps(dat17, pm6, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+31109120+3110912*i6+111104*j2+256*k6, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+46663680+3110912*i6+111104*j2+256*k6, 65535, _mm512_setzero_ps());
__m512 dat18 = _mm512_maskz_loadu_ps(3, datPtr1+252+5796504*i6+13356*k6+252*h1+4*w2);
_mm512_mask_storeu_ps(arranged1+64+3110912*i6+111104*j2+256*k6, 65535, _mm512_permutex2var_ps(dat18, pm5, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+15554624+3110912*i6+111104*j2+256*k6, 65535, _mm512_permutex2var_ps(dat18, pm6, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+31109184+3110912*i6+111104*j2+256*k6, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+46663744+3110912*i6+111104*j2+256*k6, 65535, _mm512_setzero_ps());
__m512 dat19 = _mm512_maskz_loadu_ps(3, datPtr1+504+5796504*i6+13356*k6+252*h1+4*w2);
_mm512_mask_storeu_ps(arranged1+128+3110912*i6+111104*j2+256*k6, 65535, _mm512_permutex2var_ps(dat19, pm5, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+15554688+3110912*i6+111104*j2+256*k6, 65535, _mm512_permutex2var_ps(dat19, pm6, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+31109248+3110912*i6+111104*j2+256*k6, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+46663808+3110912*i6+111104*j2+256*k6, 65535, _mm512_setzero_ps());
__m512 dat20 = _mm512_maskz_loadu_ps(3, datPtr1+756+5796504*i6+13356*k6+252*h1+4*w2);
_mm512_mask_storeu_ps(arranged1+192+3110912*i6+111104*j2+256*k6, 65535, _mm512_permutex2var_ps(dat20, pm5, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+15554752+3110912*i6+111104*j2+256*k6, 65535, _mm512_permutex2var_ps(dat20, pm6, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+31109312+3110912*i6+111104*j2+256*k6, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+46663872+3110912*i6+111104*j2+256*k6, 65535, _mm512_setzero_ps());
}
if (j2 >= last1) return;
++j2;
}
j2 = 26;
}
ptrdiff_t rel2 = j2-26;
ptrdiff_t h2 = 52;
if (rel2 < 1) {
ptrdiff_t w3 = 0;
ptrdiff_t k7 = 33*s1;
ptrdiff_t kk3 = k7+(s1 < 12 ? 32 : 37);
for (; k7 <= kk3; ++k7) {
__m512 dat21 = _mm512_maskz_loadu_ps(65528, datPtr1+0+5796504*i6+13356*k7+252*h2+4*w3);
__m512 dat22 = _mm512_maskz_loadu_ps(65535, datPtr1+64+5796504*i6+13356*k7+252*h2+4*w3);
__m512 dat23 = _mm512_maskz_loadu_ps(65535, datPtr1+128+5796504*i6+13356*k7+252*h2+4*w3);
__m512 dat24 = _mm512_maskz_loadu_ps(65535, datPtr1+192+5796504*i6+13356*k7+252*h2+4*w3);
__m512i pm7 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 31, 31, 28, 24, 20, 16, 12, 8, 4, 0);
_mm512_mask_storeu_ps(arranged1+0+3110912*i6+111104*j2+256*k7, 255, _mm512_permutex2var_ps(dat21, pm7, dat22));
_mm512_mask_storeu_ps(arranged1+32+3110912*i6+111104*j2+256*k7, 255, _mm512_permutex2var_ps(dat23, pm7, dat24));
__m512i pm8 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 31, 31, 29, 25, 21, 17, 13, 9, 5, 1);
_mm512_mask_storeu_ps(arranged1+15554560+3110912*i6+111104*j2+256*k7, 255, _mm512_permutex2var_ps(dat21, pm8, dat22));
_mm512_mask_storeu_ps(arranged1+15554592+3110912*i6+111104*j2+256*k7, 255, _mm512_permutex2var_ps(dat23, pm8, dat24));
__m512i pm9 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 31, 31, 30, 26, 22, 18, 14, 10, 6, 2);
_mm512_mask_storeu_ps(arranged1+31109120+3110912*i6+111104*j2+256*k7, 255, _mm512_permutex2var_ps(dat21, pm9, dat22));
_mm512_mask_storeu_ps(arranged1+31109152+3110912*i6+111104*j2+256*k7, 255, _mm512_permutex2var_ps(dat23, pm9, dat24));
__m512i pm10 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 31, 31, 31, 27, 23, 19, 15, 11, 7, 3);
_mm512_mask_storeu_ps(arranged1+46663680+3110912*i6+111104*j2+256*k7, 255, _mm512_permutex2var_ps(dat21, pm10, dat22));
_mm512_mask_storeu_ps(arranged1+46663712+3110912*i6+111104*j2+256*k7, 255, _mm512_permutex2var_ps(dat23, pm10, dat24));
_mm512_mask_storeu_ps(arranged1+64+3110912*i6+111104*j2+256*k7, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15554624+3110912*i6+111104*j2+256*k7, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31109184+3110912*i6+111104*j2+256*k7, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+46663744+3110912*i6+111104*j2+256*k7, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+128+3110912*i6+111104*j2+256*k7, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15554688+3110912*i6+111104*j2+256*k7, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31109248+3110912*i6+111104*j2+256*k7, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+46663808+3110912*i6+111104*j2+256*k7, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+192+3110912*i6+111104*j2+256*k7, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15554752+3110912*i6+111104*j2+256*k7, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31109312+3110912*i6+111104*j2+256*k7, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+46663872+3110912*i6+111104*j2+256*k7, 65535, _mm512_setzero_ps());
}
if (j2 >= last1) return;
++j2;
rel2 = 1;
}
ptrdiff_t w4 = 64;
ptrdiff_t k8 = 33*s1;
ptrdiff_t kk4 = k8+(s1 < 12 ? 32 : 37);
for (; k8 <= kk4; ++k8) {
__m512 dat25 = _mm512_maskz_loadu_ps(3, datPtr1+0+5796504*i6+13356*k8+252*h2+4*w4);
__m512i pm11 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 31, 31, 28, 24, 20, 16, 12, 8, 4, 0);
_mm512_mask_storeu_ps(arranged1+0+3110912*i6+111104*j2+256*k8, 65535, _mm512_permutex2var_ps(dat25, pm11, _mm512_setzero_ps()));
__m512i pm12 = _mm512_set_epi32(31, 31, 31, 31, 31, 31, 31, 31, 29, 25, 21, 17, 13, 9, 5, 1);
_mm512_mask_storeu_ps(arranged1+15554560+3110912*i6+111104*j2+256*k8, 65535, _mm512_permutex2var_ps(dat25, pm12, _mm512_setzero_ps()));
_mm512_mask_storeu_ps(arranged1+31109120+3110912*i6+111104*j2+256*k8, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+46663680+3110912*i6+111104*j2+256*k8, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+64+3110912*i6+111104*j2+256*k8, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15554624+3110912*i6+111104*j2+256*k8, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31109184+3110912*i6+111104*j2+256*k8, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+46663744+3110912*i6+111104*j2+256*k8, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+128+3110912*i6+111104*j2+256*k8, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15554688+3110912*i6+111104*j2+256*k8, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31109248+3110912*i6+111104*j2+256*k8, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+46663808+3110912*i6+111104*j2+256*k8, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+192+3110912*i6+111104*j2+256*k8, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+15554752+3110912*i6+111104*j2+256*k8, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+31109312+3110912*i6+111104*j2+256*k8, 65535, _mm512_setzero_ps());
_mm512_mask_storeu_ps(arranged1+46663872+3110912*i6+111104*j2+256*k8, 65535, _mm512_setzero_ps());
}
if (j2 >= last1) return;
++j2;
}

// Launches Example1LoomArrangeDats1Callee1 on the given team over a
// 4-dimensional task grid with extents 13 x 28 x 5 x 1. The tensor
// pointer list is handed to the callee untouched through task.any1.
static void Example1LoomArrangeDats1(Example1ThreaderTeam1* team15, char** tensors3) {
    static const int extents[4] = {13, 28, 5, 1};
    Example1ThreaderTask1 job;
    job.any1 = tensors3;
    job.callee1 = Example1LoomArrangeDats1Callee1;
    job.nd1 = 4;
    for (int dim = 0; dim < 4; ++dim) {
        job.hull1[dim] = extents[dim];
    }
    Example1ThreaderDo1(team15, &job);
}

// Per-field node ranges consumed by Example1LoomProduceSums1.
// Entry 2*f is the first node of field f and entry 2*f+1 is the node
// step for that field; the "past" (exclusive end) node of field f is
// read from entry 2*f+2, i.e. the first node of the next field. The
// trailing 7 is the end marker for the last field.
// The table is only ever read, so it is declared const.
static const ptrdiff_t Example1LoomProduceSums1FieldTbl1[] = {
    0, 2,
    2, 2,
    4, 2,
    6, 1, 7
};

// Per-node parameters consumed by Example1LoomProduceSums1Callee1,
// three entries per node (7 nodes). Column 1 is the "pile" index used
// to pick the half of the sums slot the node writes to, and column 2
// is the "base" flag: nonzero means the node overwrites its pile
// instead of adding to it. Column 0 is not read by the callee (it is
// zero for every node).
// The table is only ever read, so it is declared const.
static const ptrdiff_t Example1LoomProduceSums1NodeTbl1[] = {
    0, 0, 1,
    0, 1, 1,
    0, 0, 0,
    0, 1, 0,
    0, 0, 0,
    0, 1, 0,
    0, 0, 0
};

// Worker callee for Example1LoomProduceSums1. For one point of the task
// grid it multiplies broadcast weights against four 64-byte data vectors
// over 434 inner iterations and writes (or accumulates) the results into
// the sums buffer.
//
// task8->any1 is a void* tuple: slot 0 is the char** tensor list, slot 2
// the field index and slot 3 the first node of the current task. Slot 1
// is not read here; epoch1 is fixed at 0.
// pt9 is the task-grid coordinate: [0] w5 (weight-row block pair),
// [1] node offset, [2] to2 (sums slot), [3] group.
// tensors6[0] holds biases followed by weights (weights start at byte
// 5100), tensors6[1] is the data read here, tensors6[2] the sums written.
//
// NOTE(review): 434 matches 2170 input channels / 5 groups and the
// 42*6+3 = 255 weight rows match 1275 output channels / 5 groups from
// the graph config -- confirm against the generator.
static void Example1LoomProduceSums1Callee1(Example1ThreaderTask1* task8, int64_t* pt9) {
void** tuple2 = task8->any1;
char** tensors6 = tuple2[0];
ptrdiff_t epoch1 = 0;
ptrdiff_t field1 = (ptrdiff_t)tuple2[2];
ptrdiff_t nodeFirst1 = (ptrdiff_t)tuple2[3];
ptrdiff_t group1 = pt9[3];
ptrdiff_t to2 = pt9[2];
ptrdiff_t nodeOff1 = pt9[1];
ptrdiff_t w5 = pt9[0];
ptrdiff_t node6 = nodeFirst1+nodeOff1;
// lift1 is constant 0 here, so from1 below always equals to2.
ptrdiff_t lift1 = 0;
ptrdiff_t pile1 = Example1LoomProduceSums1NodeTbl1[1+3*node6];
ptrdiff_t base1 = Example1LoomProduceSums1NodeTbl1[2+3*node6];
ptrdiff_t from1 = to2+(size_t)lift1/4*2;
// Nothing to do when the source slot lies past the 28 available slots.
if (from1 >= 28) return;
char*restrict biasPtr2 = tensors6[0]+5100*epoch1+1020*group1;
char*restrict wtPtr2 = tensors6[0]+5100+29809500*epoch1+3098760*group1+442680*node6;
char*restrict datPtr2 = tensors6[1]+119705600*epoch1+15554560*field1+3110912*group1+111104*from1;
char*restrict sumPtr1 = tensors6[2]+3655680*group1+130560*to2+65280*pile1;
// Three variants of the same kernel follow:
//   - node6 != 0 and base1 != 0: start from zero, overwrite the sums;
//   - node6 != 0 and base1 == 0: start from zero, add to existing sums;
//   - node6 == 0: start from the bias values, overwrite the sums.
// Each variant processes blocks of 6 weight rows (i = 2*w5 up to and
// including ii), and if the loop reaches block 42 without returning it
// falls through to a final block of only 3 weight rows.
if (epoch1|node6) {
if (!epoch1 && base1) {
// Variant: zero-initialised accumulators, results overwrite sumPtr1.
ptrdiff_t i7 = 2*w5;
ptrdiff_t ii1 = i7+(w5 < 20 ? 1 : 2);
for (; i7 != 42; ++i7) {
__m512 sum2 = _mm512_setzero_ps();
__m512 sum6 = _mm512_setzero_ps();
__m512 sum10 = _mm512_setzero_ps();
__m512 sum14 = _mm512_setzero_ps();
__m512 sum18 = _mm512_setzero_ps();
__m512 sum22 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum3 = sum2;
__m512 sum4 = sum2;
__m512 sum5 = sum2;
__m512 sum7 = sum6;
__m512 sum8 = sum6;
__m512 sum9 = sum6;
__m512 sum11 = sum10;
__m512 sum12 = sum10;
__m512 sum13 = sum10;
__m512 sum15 = sum14;
__m512 sum16 = sum14;
__m512 sum17 = sum14;
__m512 sum19 = sum18;
__m512 sum20 = sum18;
__m512 sum21 = sum18;
__m512 sum23 = sum22;
__m512 sum24 = sum22;
__m512 sum25 = sum22;
// 6 broadcast weights x 4 data vectors = 24 accumulators per iteration.
for (ptrdiff_t j3 = 0; j3 < 434; ++j3) {
__m512 dat26 = _mm512_loadu_ps(datPtr2+0+256*j3);
__m512 dat27 = _mm512_loadu_ps(datPtr2+64+256*j3);
__m512 dat28 = _mm512_loadu_ps(datPtr2+128+256*j3);
__m512 dat29 = _mm512_loadu_ps(datPtr2+192+256*j3);
__m512 wt64 = _mm512_set1_ps(*(float*)(wtPtr2+0+10416*i7+24*j3));
sum2 = _mm512_fmadd_ps(wt64, dat26, sum2);
sum3 = _mm512_fmadd_ps(wt64, dat27, sum3);
sum4 = _mm512_fmadd_ps(wt64, dat28, sum4);
sum5 = _mm512_fmadd_ps(wt64, dat29, sum5);
__m512 wt65 = _mm512_set1_ps(*(float*)(wtPtr2+4+10416*i7+24*j3));
sum6 = _mm512_fmadd_ps(wt65, dat26, sum6);
sum7 = _mm512_fmadd_ps(wt65, dat27, sum7);
sum8 = _mm512_fmadd_ps(wt65, dat28, sum8);
sum9 = _mm512_fmadd_ps(wt65, dat29, sum9);
__m512 wt66 = _mm512_set1_ps(*(float*)(wtPtr2+8+10416*i7+24*j3));
sum10 = _mm512_fmadd_ps(wt66, dat26, sum10);
sum11 = _mm512_fmadd_ps(wt66, dat27, sum11);
sum12 = _mm512_fmadd_ps(wt66, dat28, sum12);
sum13 = _mm512_fmadd_ps(wt66, dat29, sum13);
__m512 wt67 = _mm512_set1_ps(*(float*)(wtPtr2+12+10416*i7+24*j3));
sum14 = _mm512_fmadd_ps(wt67, dat26, sum14);
sum15 = _mm512_fmadd_ps(wt67, dat27, sum15);
sum16 = _mm512_fmadd_ps(wt67, dat28, sum16);
sum17 = _mm512_fmadd_ps(wt67, dat29, sum17);
__m512 wt68 = _mm512_set1_ps(*(float*)(wtPtr2+16+10416*i7+24*j3));
sum18 = _mm512_fmadd_ps(wt68, dat26, sum18);
sum19 = _mm512_fmadd_ps(wt68, dat27, sum19);
sum20 = _mm512_fmadd_ps(wt68, dat28, sum20);
sum21 = _mm512_fmadd_ps(wt68, dat29, sum21);
__m512 wt69 = _mm512_set1_ps(*(float*)(wtPtr2+20+10416*i7+24*j3));
sum22 = _mm512_fmadd_ps(wt69, dat26, sum22);
sum23 = _mm512_fmadd_ps(wt69, dat27, sum23);
sum24 = _mm512_fmadd_ps(wt69, dat28, sum24);
sum25 = _mm512_fmadd_ps(wt69, dat29, sum25);
}
_mm512_storeu_ps(sumPtr1+0+1536*i7, sum2);
_mm512_storeu_ps(sumPtr1+64+1536*i7, sum3);
_mm512_storeu_ps(sumPtr1+128+1536*i7, sum4);
_mm512_storeu_ps(sumPtr1+192+1536*i7, sum5);
_mm512_storeu_ps(sumPtr1+256+1536*i7, sum6);
_mm512_storeu_ps(sumPtr1+320+1536*i7, sum7);
_mm512_storeu_ps(sumPtr1+384+1536*i7, sum8);
_mm512_storeu_ps(sumPtr1+448+1536*i7, sum9);
_mm512_storeu_ps(sumPtr1+512+1536*i7, sum10);
_mm512_storeu_ps(sumPtr1+576+1536*i7, sum11);
_mm512_storeu_ps(sumPtr1+640+1536*i7, sum12);
_mm512_storeu_ps(sumPtr1+704+1536*i7, sum13);
_mm512_storeu_ps(sumPtr1+768+1536*i7, sum14);
_mm512_storeu_ps(sumPtr1+832+1536*i7, sum15);
_mm512_storeu_ps(sumPtr1+896+1536*i7, sum16);
_mm512_storeu_ps(sumPtr1+960+1536*i7, sum17);
_mm512_storeu_ps(sumPtr1+1024+1536*i7, sum18);
_mm512_storeu_ps(sumPtr1+1088+1536*i7, sum19);
_mm512_storeu_ps(sumPtr1+1152+1536*i7, sum20);
_mm512_storeu_ps(sumPtr1+1216+1536*i7, sum21);
_mm512_storeu_ps(sumPtr1+1280+1536*i7, sum22);
_mm512_storeu_ps(sumPtr1+1344+1536*i7, sum23);
_mm512_storeu_ps(sumPtr1+1408+1536*i7, sum24);
_mm512_storeu_ps(sumPtr1+1472+1536*i7, sum25);
// Done once the last block assigned to this w5 has been stored.
if (i7 >= ii1) return;
}
// Reached only when i7 == 42: final block with 3 weight rows
// (weight stride per iteration is 12 bytes instead of 24).
__m512 sum26 = _mm512_setzero_ps();
__m512 sum30 = _mm512_setzero_ps();
__m512 sum34 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum27 = sum26;
__m512 sum28 = sum26;
__m512 sum29 = sum26;
__m512 sum31 = sum30;
__m512 sum32 = sum30;
__m512 sum33 = sum30;
__m512 sum35 = sum34;
__m512 sum36 = sum34;
__m512 sum37 = sum34;
for (ptrdiff_t j4 = 0; j4 < 434; ++j4) {
__m512 dat30 = _mm512_loadu_ps(datPtr2+0+256*j4);
__m512 dat31 = _mm512_loadu_ps(datPtr2+64+256*j4);
__m512 dat32 = _mm512_loadu_ps(datPtr2+128+256*j4);
__m512 dat33 = _mm512_loadu_ps(datPtr2+192+256*j4);
__m512 wt70 = _mm512_set1_ps(*(float*)(wtPtr2+0+10416*i7+12*j4));
sum26 = _mm512_fmadd_ps(wt70, dat30, sum26);
sum27 = _mm512_fmadd_ps(wt70, dat31, sum27);
sum28 = _mm512_fmadd_ps(wt70, dat32, sum28);
sum29 = _mm512_fmadd_ps(wt70, dat33, sum29);
__m512 wt71 = _mm512_set1_ps(*(float*)(wtPtr2+4+10416*i7+12*j4));
sum30 = _mm512_fmadd_ps(wt71, dat30, sum30);
sum31 = _mm512_fmadd_ps(wt71, dat31, sum31);
sum32 = _mm512_fmadd_ps(wt71, dat32, sum32);
sum33 = _mm512_fmadd_ps(wt71, dat33, sum33);
__m512 wt72 = _mm512_set1_ps(*(float*)(wtPtr2+8+10416*i7+12*j4));
sum34 = _mm512_fmadd_ps(wt72, dat30, sum34);
sum35 = _mm512_fmadd_ps(wt72, dat31, sum35);
sum36 = _mm512_fmadd_ps(wt72, dat32, sum36);
sum37 = _mm512_fmadd_ps(wt72, dat33, sum37);
}
_mm512_storeu_ps(sumPtr1+0+1536*i7, sum26);
_mm512_storeu_ps(sumPtr1+64+1536*i7, sum27);
_mm512_storeu_ps(sumPtr1+128+1536*i7, sum28);
_mm512_storeu_ps(sumPtr1+192+1536*i7, sum29);
_mm512_storeu_ps(sumPtr1+256+1536*i7, sum30);
_mm512_storeu_ps(sumPtr1+320+1536*i7, sum31);
_mm512_storeu_ps(sumPtr1+384+1536*i7, sum32);
_mm512_storeu_ps(sumPtr1+448+1536*i7, sum33);
_mm512_storeu_ps(sumPtr1+512+1536*i7, sum34);
_mm512_storeu_ps(sumPtr1+576+1536*i7, sum35);
_mm512_storeu_ps(sumPtr1+640+1536*i7, sum36);
_mm512_storeu_ps(sumPtr1+704+1536*i7, sum37);
return;
}
// Variant: zero-initialised accumulators, results are added to what is
// already stored at sumPtr1.
ptrdiff_t i8 = 2*w5;
ptrdiff_t ii2 = i8+(w5 < 20 ? 1 : 2);
for (; i8 != 42; ++i8) {
__m512 sum38 = _mm512_setzero_ps();
__m512 sum42 = _mm512_setzero_ps();
__m512 sum46 = _mm512_setzero_ps();
__m512 sum50 = _mm512_setzero_ps();
__m512 sum54 = _mm512_setzero_ps();
__m512 sum58 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum39 = sum38;
__m512 sum40 = sum38;
__m512 sum41 = sum38;
__m512 sum43 = sum42;
__m512 sum44 = sum42;
__m512 sum45 = sum42;
__m512 sum47 = sum46;
__m512 sum48 = sum46;
__m512 sum49 = sum46;
__m512 sum51 = sum50;
__m512 sum52 = sum50;
__m512 sum53 = sum50;
__m512 sum55 = sum54;
__m512 sum56 = sum54;
__m512 sum57 = sum54;
__m512 sum59 = sum58;
__m512 sum60 = sum58;
__m512 sum61 = sum58;
for (ptrdiff_t j5 = 0; j5 < 434; ++j5) {
__m512 dat34 = _mm512_loadu_ps(datPtr2+0+256*j5);
__m512 dat35 = _mm512_loadu_ps(datPtr2+64+256*j5);
__m512 dat36 = _mm512_loadu_ps(datPtr2+128+256*j5);
__m512 dat37 = _mm512_loadu_ps(datPtr2+192+256*j5);
__m512 wt73 = _mm512_set1_ps(*(float*)(wtPtr2+0+10416*i8+24*j5));
sum38 = _mm512_fmadd_ps(wt73, dat34, sum38);
sum39 = _mm512_fmadd_ps(wt73, dat35, sum39);
sum40 = _mm512_fmadd_ps(wt73, dat36, sum40);
sum41 = _mm512_fmadd_ps(wt73, dat37, sum41);
__m512 wt74 = _mm512_set1_ps(*(float*)(wtPtr2+4+10416*i8+24*j5));
sum42 = _mm512_fmadd_ps(wt74, dat34, sum42);
sum43 = _mm512_fmadd_ps(wt74, dat35, sum43);
sum44 = _mm512_fmadd_ps(wt74, dat36, sum44);
sum45 = _mm512_fmadd_ps(wt74, dat37, sum45);
__m512 wt75 = _mm512_set1_ps(*(float*)(wtPtr2+8+10416*i8+24*j5));
sum46 = _mm512_fmadd_ps(wt75, dat34, sum46);
sum47 = _mm512_fmadd_ps(wt75, dat35, sum47);
sum48 = _mm512_fmadd_ps(wt75, dat36, sum48);
sum49 = _mm512_fmadd_ps(wt75, dat37, sum49);
__m512 wt76 = _mm512_set1_ps(*(float*)(wtPtr2+12+10416*i8+24*j5));
sum50 = _mm512_fmadd_ps(wt76, dat34, sum50);
sum51 = _mm512_fmadd_ps(wt76, dat35, sum51);
sum52 = _mm512_fmadd_ps(wt76, dat36, sum52);
sum53 = _mm512_fmadd_ps(wt76, dat37, sum53);
__m512 wt77 = _mm512_set1_ps(*(float*)(wtPtr2+16+10416*i8+24*j5));
sum54 = _mm512_fmadd_ps(wt77, dat34, sum54);
sum55 = _mm512_fmadd_ps(wt77, dat35, sum55);
sum56 = _mm512_fmadd_ps(wt77, dat36, sum56);
sum57 = _mm512_fmadd_ps(wt77, dat37, sum57);
__m512 wt78 = _mm512_set1_ps(*(float*)(wtPtr2+20+10416*i8+24*j5));
sum58 = _mm512_fmadd_ps(wt78, dat34, sum58);
sum59 = _mm512_fmadd_ps(wt78, dat35, sum59);
sum60 = _mm512_fmadd_ps(wt78, dat36, sum60);
sum61 = _mm512_fmadd_ps(wt78, dat37, sum61);
}
// Read-modify-write: add this node's contribution to the stored sums.
_mm512_storeu_ps(sumPtr1+0+1536*i8, _mm512_add_ps(sum38, _mm512_loadu_ps(sumPtr1+0+1536*i8)));
_mm512_storeu_ps(sumPtr1+64+1536*i8, _mm512_add_ps(sum39, _mm512_loadu_ps(sumPtr1+64+1536*i8)));
_mm512_storeu_ps(sumPtr1+128+1536*i8, _mm512_add_ps(sum40, _mm512_loadu_ps(sumPtr1+128+1536*i8)));
_mm512_storeu_ps(sumPtr1+192+1536*i8, _mm512_add_ps(sum41, _mm512_loadu_ps(sumPtr1+192+1536*i8)));
_mm512_storeu_ps(sumPtr1+256+1536*i8, _mm512_add_ps(sum42, _mm512_loadu_ps(sumPtr1+256+1536*i8)));
_mm512_storeu_ps(sumPtr1+320+1536*i8, _mm512_add_ps(sum43, _mm512_loadu_ps(sumPtr1+320+1536*i8)));
_mm512_storeu_ps(sumPtr1+384+1536*i8, _mm512_add_ps(sum44, _mm512_loadu_ps(sumPtr1+384+1536*i8)));
_mm512_storeu_ps(sumPtr1+448+1536*i8, _mm512_add_ps(sum45, _mm512_loadu_ps(sumPtr1+448+1536*i8)));
_mm512_storeu_ps(sumPtr1+512+1536*i8, _mm512_add_ps(sum46, _mm512_loadu_ps(sumPtr1+512+1536*i8)));
_mm512_storeu_ps(sumPtr1+576+1536*i8, _mm512_add_ps(sum47, _mm512_loadu_ps(sumPtr1+576+1536*i8)));
_mm512_storeu_ps(sumPtr1+640+1536*i8, _mm512_add_ps(sum48, _mm512_loadu_ps(sumPtr1+640+1536*i8)));
_mm512_storeu_ps(sumPtr1+704+1536*i8, _mm512_add_ps(sum49, _mm512_loadu_ps(sumPtr1+704+1536*i8)));
_mm512_storeu_ps(sumPtr1+768+1536*i8, _mm512_add_ps(sum50, _mm512_loadu_ps(sumPtr1+768+1536*i8)));
_mm512_storeu_ps(sumPtr1+832+1536*i8, _mm512_add_ps(sum51, _mm512_loadu_ps(sumPtr1+832+1536*i8)));
_mm512_storeu_ps(sumPtr1+896+1536*i8, _mm512_add_ps(sum52, _mm512_loadu_ps(sumPtr1+896+1536*i8)));
_mm512_storeu_ps(sumPtr1+960+1536*i8, _mm512_add_ps(sum53, _mm512_loadu_ps(sumPtr1+960+1536*i8)));
_mm512_storeu_ps(sumPtr1+1024+1536*i8, _mm512_add_ps(sum54, _mm512_loadu_ps(sumPtr1+1024+1536*i8)));
_mm512_storeu_ps(sumPtr1+1088+1536*i8, _mm512_add_ps(sum55, _mm512_loadu_ps(sumPtr1+1088+1536*i8)));
_mm512_storeu_ps(sumPtr1+1152+1536*i8, _mm512_add_ps(sum56, _mm512_loadu_ps(sumPtr1+1152+1536*i8)));
_mm512_storeu_ps(sumPtr1+1216+1536*i8, _mm512_add_ps(sum57, _mm512_loadu_ps(sumPtr1+1216+1536*i8)));
_mm512_storeu_ps(sumPtr1+1280+1536*i8, _mm512_add_ps(sum58, _mm512_loadu_ps(sumPtr1+1280+1536*i8)));
_mm512_storeu_ps(sumPtr1+1344+1536*i8, _mm512_add_ps(sum59, _mm512_loadu_ps(sumPtr1+1344+1536*i8)));
_mm512_storeu_ps(sumPtr1+1408+1536*i8, _mm512_add_ps(sum60, _mm512_loadu_ps(sumPtr1+1408+1536*i8)));
_mm512_storeu_ps(sumPtr1+1472+1536*i8, _mm512_add_ps(sum61, _mm512_loadu_ps(sumPtr1+1472+1536*i8)));
if (i8 >= ii2) return;
}
// Reached only when i8 == 42: final 3-row block, accumulating variant.
__m512 sum62 = _mm512_setzero_ps();
__m512 sum66 = _mm512_setzero_ps();
__m512 sum70 = _mm512_setzero_ps();
(void)biasPtr2;
__m512 sum63 = sum62;
__m512 sum64 = sum62;
__m512 sum65 = sum62;
__m512 sum67 = sum66;
__m512 sum68 = sum66;
__m512 sum69 = sum66;
__m512 sum71 = sum70;
__m512 sum72 = sum70;
__m512 sum73 = sum70;
for (ptrdiff_t j6 = 0; j6 < 434; ++j6) {
__m512 dat38 = _mm512_loadu_ps(datPtr2+0+256*j6);
__m512 dat39 = _mm512_loadu_ps(datPtr2+64+256*j6);
__m512 dat40 = _mm512_loadu_ps(datPtr2+128+256*j6);
__m512 dat41 = _mm512_loadu_ps(datPtr2+192+256*j6);
__m512 wt79 = _mm512_set1_ps(*(float*)(wtPtr2+0+10416*i8+12*j6));
sum62 = _mm512_fmadd_ps(wt79, dat38, sum62);
sum63 = _mm512_fmadd_ps(wt79, dat39, sum63);
sum64 = _mm512_fmadd_ps(wt79, dat40, sum64);
sum65 = _mm512_fmadd_ps(wt79, dat41, sum65);
__m512 wt80 = _mm512_set1_ps(*(float*)(wtPtr2+4+10416*i8+12*j6));
sum66 = _mm512_fmadd_ps(wt80, dat38, sum66);
sum67 = _mm512_fmadd_ps(wt80, dat39, sum67);
sum68 = _mm512_fmadd_ps(wt80, dat40, sum68);
sum69 = _mm512_fmadd_ps(wt80, dat41, sum69);
__m512 wt81 = _mm512_set1_ps(*(float*)(wtPtr2+8+10416*i8+12*j6));
sum70 = _mm512_fmadd_ps(wt81, dat38, sum70);
sum71 = _mm512_fmadd_ps(wt81, dat39, sum71);
sum72 = _mm512_fmadd_ps(wt81, dat40, sum72);
sum73 = _mm512_fmadd_ps(wt81, dat41, sum73);
}
_mm512_storeu_ps(sumPtr1+0+1536*i8, _mm512_add_ps(sum62, _mm512_loadu_ps(sumPtr1+0+1536*i8)));
_mm512_storeu_ps(sumPtr1+64+1536*i8, _mm512_add_ps(sum63, _mm512_loadu_ps(sumPtr1+64+1536*i8)));
_mm512_storeu_ps(sumPtr1+128+1536*i8, _mm512_add_ps(sum64, _mm512_loadu_ps(sumPtr1+128+1536*i8)));
_mm512_storeu_ps(sumPtr1+192+1536*i8, _mm512_add_ps(sum65, _mm512_loadu_ps(sumPtr1+192+1536*i8)));
_mm512_storeu_ps(sumPtr1+256+1536*i8, _mm512_add_ps(sum66, _mm512_loadu_ps(sumPtr1+256+1536*i8)));
_mm512_storeu_ps(sumPtr1+320+1536*i8, _mm512_add_ps(sum67, _mm512_loadu_ps(sumPtr1+320+1536*i8)));
_mm512_storeu_ps(sumPtr1+384+1536*i8, _mm512_add_ps(sum68, _mm512_loadu_ps(sumPtr1+384+1536*i8)));
_mm512_storeu_ps(sumPtr1+448+1536*i8, _mm512_add_ps(sum69, _mm512_loadu_ps(sumPtr1+448+1536*i8)));
_mm512_storeu_ps(sumPtr1+512+1536*i8, _mm512_add_ps(sum70, _mm512_loadu_ps(sumPtr1+512+1536*i8)));
_mm512_storeu_ps(sumPtr1+576+1536*i8, _mm512_add_ps(sum71, _mm512_loadu_ps(sumPtr1+576+1536*i8)));
_mm512_storeu_ps(sumPtr1+640+1536*i8, _mm512_add_ps(sum72, _mm512_loadu_ps(sumPtr1+640+1536*i8)));
_mm512_storeu_ps(sumPtr1+704+1536*i8, _mm512_add_ps(sum73, _mm512_loadu_ps(sumPtr1+704+1536*i8)));
return;
}
// Variant for node 0: accumulators start from the per-row bias values
// and the results overwrite sumPtr1.
(void)base1;
ptrdiff_t i9 = 2*w5;
ptrdiff_t ii3 = i9+(w5 < 20 ? 1 : 2);
for (; i9 != 42; ++i9) {
__m512 sum74 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i9));
__m512 sum78 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i9));
__m512 sum82 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i9));
__m512 sum86 = _mm512_set1_ps(*(float*)(biasPtr2+12+24*i9));
__m512 sum90 = _mm512_set1_ps(*(float*)(biasPtr2+16+24*i9));
__m512 sum94 = _mm512_set1_ps(*(float*)(biasPtr2+20+24*i9));
__m512 sum75 = sum74;
__m512 sum76 = sum74;
__m512 sum77 = sum74;
__m512 sum79 = sum78;
__m512 sum80 = sum78;
__m512 sum81 = sum78;
__m512 sum83 = sum82;
__m512 sum84 = sum82;
__m512 sum85 = sum82;
__m512 sum87 = sum86;
__m512 sum88 = sum86;
__m512 sum89 = sum86;
__m512 sum91 = sum90;
__m512 sum92 = sum90;
__m512 sum93 = sum90;
__m512 sum95 = sum94;
__m512 sum96 = sum94;
__m512 sum97 = sum94;
for (ptrdiff_t j7 = 0; j7 < 434; ++j7) {
__m512 dat42 = _mm512_loadu_ps(datPtr2+0+256*j7);
__m512 dat43 = _mm512_loadu_ps(datPtr2+64+256*j7);
__m512 dat44 = _mm512_loadu_ps(datPtr2+128+256*j7);
__m512 dat45 = _mm512_loadu_ps(datPtr2+192+256*j7);
__m512 wt82 = _mm512_set1_ps(*(float*)(wtPtr2+0+10416*i9+24*j7));
sum74 = _mm512_fmadd_ps(wt82, dat42, sum74);
sum75 = _mm512_fmadd_ps(wt82, dat43, sum75);
sum76 = _mm512_fmadd_ps(wt82, dat44, sum76);
sum77 = _mm512_fmadd_ps(wt82, dat45, sum77);
__m512 wt83 = _mm512_set1_ps(*(float*)(wtPtr2+4+10416*i9+24*j7));
sum78 = _mm512_fmadd_ps(wt83, dat42, sum78);
sum79 = _mm512_fmadd_ps(wt83, dat43, sum79);
sum80 = _mm512_fmadd_ps(wt83, dat44, sum80);
sum81 = _mm512_fmadd_ps(wt83, dat45, sum81);
__m512 wt84 = _mm512_set1_ps(*(float*)(wtPtr2+8+10416*i9+24*j7));
sum82 = _mm512_fmadd_ps(wt84, dat42, sum82);
sum83 = _mm512_fmadd_ps(wt84, dat43, sum83);
sum84 = _mm512_fmadd_ps(wt84, dat44, sum84);
sum85 = _mm512_fmadd_ps(wt84, dat45, sum85);
__m512 wt85 = _mm512_set1_ps(*(float*)(wtPtr2+12+10416*i9+24*j7));
sum86 = _mm512_fmadd_ps(wt85, dat42, sum86);
sum87 = _mm512_fmadd_ps(wt85, dat43, sum87);
sum88 = _mm512_fmadd_ps(wt85, dat44, sum88);
sum89 = _mm512_fmadd_ps(wt85, dat45, sum89);
__m512 wt86 = _mm512_set1_ps(*(float*)(wtPtr2+16+10416*i9+24*j7));
sum90 = _mm512_fmadd_ps(wt86, dat42, sum90);
sum91 = _mm512_fmadd_ps(wt86, dat43, sum91);
sum92 = _mm512_fmadd_ps(wt86, dat44, sum92);
sum93 = _mm512_fmadd_ps(wt86, dat45, sum93);
__m512 wt87 = _mm512_set1_ps(*(float*)(wtPtr2+20+10416*i9+24*j7));
sum94 = _mm512_fmadd_ps(wt87, dat42, sum94);
sum95 = _mm512_fmadd_ps(wt87, dat43, sum95);
sum96 = _mm512_fmadd_ps(wt87, dat44, sum96);
sum97 = _mm512_fmadd_ps(wt87, dat45, sum97);
}
_mm512_storeu_ps(sumPtr1+0+1536*i9, sum74);
_mm512_storeu_ps(sumPtr1+64+1536*i9, sum75);
_mm512_storeu_ps(sumPtr1+128+1536*i9, sum76);
_mm512_storeu_ps(sumPtr1+192+1536*i9, sum77);
_mm512_storeu_ps(sumPtr1+256+1536*i9, sum78);
_mm512_storeu_ps(sumPtr1+320+1536*i9, sum79);
_mm512_storeu_ps(sumPtr1+384+1536*i9, sum80);
_mm512_storeu_ps(sumPtr1+448+1536*i9, sum81);
_mm512_storeu_ps(sumPtr1+512+1536*i9, sum82);
_mm512_storeu_ps(sumPtr1+576+1536*i9, sum83);
_mm512_storeu_ps(sumPtr1+640+1536*i9, sum84);
_mm512_storeu_ps(sumPtr1+704+1536*i9, sum85);
_mm512_storeu_ps(sumPtr1+768+1536*i9, sum86);
_mm512_storeu_ps(sumPtr1+832+1536*i9, sum87);
_mm512_storeu_ps(sumPtr1+896+1536*i9, sum88);
_mm512_storeu_ps(sumPtr1+960+1536*i9, sum89);
_mm512_storeu_ps(sumPtr1+1024+1536*i9, sum90);
_mm512_storeu_ps(sumPtr1+1088+1536*i9, sum91);
_mm512_storeu_ps(sumPtr1+1152+1536*i9, sum92);
_mm512_storeu_ps(sumPtr1+1216+1536*i9, sum93);
_mm512_storeu_ps(sumPtr1+1280+1536*i9, sum94);
_mm512_storeu_ps(sumPtr1+1344+1536*i9, sum95);
_mm512_storeu_ps(sumPtr1+1408+1536*i9, sum96);
_mm512_storeu_ps(sumPtr1+1472+1536*i9, sum97);
if (i9 >= ii3) return;
}
// Reached only when i9 == 42: final 3-row block, bias-initialised variant.
__m512 sum98 = _mm512_set1_ps(*(float*)(biasPtr2+0+24*i9));
__m512 sum102 = _mm512_set1_ps(*(float*)(biasPtr2+4+24*i9));
__m512 sum106 = _mm512_set1_ps(*(float*)(biasPtr2+8+24*i9));
__m512 sum99 = sum98;
__m512 sum100 = sum98;
__m512 sum101 = sum98;
__m512 sum103 = sum102;
__m512 sum104 = sum102;
__m512 sum105 = sum102;
__m512 sum107 = sum106;
__m512 sum108 = sum106;
__m512 sum109 = sum106;
for (ptrdiff_t j8 = 0; j8 < 434; ++j8) {
__m512 dat46 = _mm512_loadu_ps(datPtr2+0+256*j8);
__m512 dat47 = _mm512_loadu_ps(datPtr2+64+256*j8);
__m512 dat48 = _mm512_loadu_ps(datPtr2+128+256*j8);
__m512 dat49 = _mm512_loadu_ps(datPtr2+192+256*j8);
__m512 wt88 = _mm512_set1_ps(*(float*)(wtPtr2+0+10416*i9+12*j8));
sum98 = _mm512_fmadd_ps(wt88, dat46, sum98);
sum99 = _mm512_fmadd_ps(wt88, dat47, sum99);
sum100 = _mm512_fmadd_ps(wt88, dat48, sum100);
sum101 = _mm512_fmadd_ps(wt88, dat49, sum101);
__m512 wt89 = _mm512_set1_ps(*(float*)(wtPtr2+4+10416*i9+12*j8));
sum102 = _mm512_fmadd_ps(wt89, dat46, sum102);
sum103 = _mm512_fmadd_ps(wt89, dat47, sum103);
sum104 = _mm512_fmadd_ps(wt89, dat48, sum104);
sum105 = _mm512_fmadd_ps(wt89, dat49, sum105);
__m512 wt90 = _mm512_set1_ps(*(float*)(wtPtr2+8+10416*i9+12*j8));
sum106 = _mm512_fmadd_ps(wt90, dat46, sum106);
sum107 = _mm512_fmadd_ps(wt90, dat47, sum107);
sum108 = _mm512_fmadd_ps(wt90, dat48, sum108);
sum109 = _mm512_fmadd_ps(wt90, dat49, sum109);
}
_mm512_storeu_ps(sumPtr1+0+1536*i9, sum98);
_mm512_storeu_ps(sumPtr1+64+1536*i9, sum99);
_mm512_storeu_ps(sumPtr1+128+1536*i9, sum100);
_mm512_storeu_ps(sumPtr1+192+1536*i9, sum101);
_mm512_storeu_ps(sumPtr1+256+1536*i9, sum102);
_mm512_storeu_ps(sumPtr1+320+1536*i9, sum103);
_mm512_storeu_ps(sumPtr1+384+1536*i9, sum104);
_mm512_storeu_ps(sumPtr1+448+1536*i9, sum105);
_mm512_storeu_ps(sumPtr1+512+1536*i9, sum106);
_mm512_storeu_ps(sumPtr1+576+1536*i9, sum107);
_mm512_storeu_ps(sumPtr1+640+1536*i9, sum108);
_mm512_storeu_ps(sumPtr1+704+1536*i9, sum109);
}

// Drives Example1LoomProduceSums1Callee1 once per node group of each of
// the 4 fields. The callee receives a 4-slot tuple through task.any1:
// [0] tensor list, [1] epoch, [2] field, [3] first node of the task.
// Node ranges come from Example1LoomProduceSums1FieldTbl1: for field f,
// entry 2f is the first node, entry 2f+1 the step and entry 2f+2 the
// exclusive end.
static void Example1LoomProduceSums1(Example1ThreaderTeam1* team16, char** tensors5) {
    void* args[4];
    args[0] = tensors5;
    for (ptrdiff_t epoch = 0; epoch < 1; ++epoch) {
        args[1] = (void*)epoch;
        for (ptrdiff_t field = 0; field < 4; ++field) {
            const ptrdiff_t* entry = &Example1LoomProduceSums1FieldTbl1[2*field];
            args[2] = (void*)field;
            ptrdiff_t node = entry[0];
            ptrdiff_t stride = entry[1];
            ptrdiff_t end = entry[2];
            while (node < end) {
                args[3] = (void*)node;
                Example1ThreaderTask1 job;
                job.callee1 = Example1LoomProduceSums1Callee1;
                job.any1 = args;
                job.nd1 = 4;
                // Grid: 21 weight-row block pairs x nodes in this step
                // x 28 sums slots x 5 groups.
                job.hull1[0] = 21;
                job.hull1[1] = stride;
                job.hull1[2] = 28;
                job.hull1[3] = 5;
                Example1ThreaderDo1(team16, &job);
                node += stride;
            }
        }
    }
}

// Worker callee for Example1LoomConsumeSums1. Combines the two partial
// sum piles into final output vectors.
//
// task10->any1 is the tensor list: [0] sums buffer (input), [1] output.
// pt10[2] selects a run of channels (19 per task, 27 for task 12) and
// pt10[3] selects the group.
//
// For each channel, every output vector is
//   pile0 + alignr(pile1 of the next slot, pile1 of this slot, 1)
// i.e. the second pile shifted down by one float, with the vacated top
// lane filled from the first float of the following slot's second pile.
// Blocks 0..12 produce four vectors each; block 13 produces a single one.
static void Example1LoomConsumeSums1Callee1(Example1ThreaderTask1* task10, int64_t* pt10) {
    char** bufs = task10->any1;
    char*restrict sums = bufs[0];
    char*restrict out = bufs[1];
    const ptrdiff_t part = pt10[2];
    const ptrdiff_t grp = pt10[3];
    const ptrdiff_t chanFirst = 19*part;
    const ptrdiff_t chanLast = chanFirst+(part < 12 ? 18 : 26);
    for (ptrdiff_t chan = chanFirst; chan <= chanLast; ++chan) {
        const ptrdiff_t srcBase = 3655680*grp+256*chan;
        const ptrdiff_t dstBase = 864960*grp+3392*chan;
        ptrdiff_t blk = 0;
        while (blk != 13) {
            const ptrdiff_t src = srcBase+261120*blk;
            const ptrdiff_t dst = dstBase+256*blk;
            for (ptrdiff_t off = 0; off < 256; off += 64) {
                __m512 direct = _mm512_loadu_ps(sums+src+off);
                __m512i cur = _mm512_castps_si512(_mm512_loadu_ps(sums+src+65280+off));
                __m512i nxt = _mm512_castps_si512(_mm512_loadu_ps(sums+src+195840+off));
                __m512 shifted = _mm512_castsi512_ps(_mm512_alignr_epi32(nxt, cur, 1));
                _mm512_mask_storeu_ps(out+dst+off, 65535, _mm512_add_ps(direct, shifted));
            }
            ++blk;
        }
        // Trailing block (blk == 13): only the first vector exists.
        {
            const ptrdiff_t src = srcBase+261120*blk;
            const ptrdiff_t dst = dstBase+256*blk;
            __m512 direct = _mm512_loadu_ps(sums+src);
            __m512i cur = _mm512_castps_si512(_mm512_loadu_ps(sums+src+65280));
            __m512i nxt = _mm512_castps_si512(_mm512_loadu_ps(sums+src+195840));
            __m512 shifted = _mm512_castsi512_ps(_mm512_alignr_epi32(nxt, cur, 1));
            _mm512_mask_storeu_ps(out+dst, 65535, _mm512_add_ps(direct, shifted));
        }
    }
}

// Launches Example1LoomConsumeSums1Callee1 on the given team over a
// 4-dimensional task grid with extents 1 x 1 x 13 x 5. The tensor
// pointer list is handed to the callee untouched through task.any1.
static void Example1LoomConsumeSums1(Example1ThreaderTeam1* team17, char** tensors7) {
    static const int extents[4] = {1, 1, 13, 5};
    Example1ThreaderTask1 job;
    job.any1 = tensors7;
    job.callee1 = Example1LoomConsumeSums1Callee1;
    job.nd1 = 4;
    for (int dim = 0; dim < 4; ++dim) {
        job.hull1[dim] = extents[dim];
    }
    Example1ThreaderDo1(team17, &job);
}

// Network state built by Example1NetCreate and released by
// Example1NetDestroy.
struct Example1Net {
// Raw pointer returned by malloc; this is the pointer that gets freed.
char* alloc1;
// alloc1 rounded up to the next 64-byte boundary. Example1NetCreate
// passes it to the filter arrangement step as the output tensor and
// Example1EngineInference reads it back during inference.
char* align1;
};

// Releases a network created by Example1NetCreate: frees the parameter
// buffer and then the Example1Net object itself.
// Passing a null pointer is a no-op (mirrors free), so callers can
// destroy unconditionally on their cleanup paths.
void Example1NetDestroy(Example1Net* net2) {
    if (!net2) {
        return;
    }
    free(net2->alloc1);
    free(net2);
}

// Builds an Example1Net from user-supplied parameters.
//
// Checks for AVX512F support, allocates the parameter buffer, spins up
// a temporary thread team to run Example1LoomArrangeFilts1 on
// params1->outWeights / params1->outBiases into the 64-byte-aligned
// buffer, destroys that team, and finally allocates the Example1Net.
//
// Returns 0 on success with *net1 set; otherwise returns an error
// message string and leaves *net1 untouched.
char* Example1NetCreate(
    Example1Net** net1,
    Example1Params* params1,
    ptrdiff_t threads1
) {
    if (__builtin_expect(!__builtin_cpu_supports("avx512f"), 0)) {
        return Example1Errmsg1(__LINE__, "CPU does not support AVX512F");
    }
    char* raw = malloc(15498963);
    if (__builtin_expect(raw == 0, 0)) {
        return Example1Errmsg1(__LINE__, "errno %d", errno);
    }
    // Round up to a 64-byte boundary; the allocation includes 63 spare bytes.
    char* aligned = (void*)(((size_t)raw+63)&-64);
    Example1ThreaderTeam1* workers = 0;
    char* failure = Example1ThreaderCreate1(&workers, threads1);
    if (__builtin_expect(failure != 0, 0)) {
        goto release;
    }
    char* filtArgs[3];
    filtArgs[0] = (char*)params1->outWeights;
    filtArgs[1] = (char*)params1->outBiases;
    filtArgs[2] = aligned+0;
    Example1LoomArrangeFilts1(workers, filtArgs);
    Example1ThreaderDestroy1(workers);
    Example1Net* created = malloc(sizeof(Example1Net));
    if (__builtin_expect(created == 0, 0)) {
        // Capture errno in the message before free() can disturb it.
        failure = Example1Errmsg1(__LINE__, "errno %d", errno);
        goto release;
    }
    created->alloc1 = raw;
    created->align1 = aligned;
    *net1 = created;
    return 0;
release:
    free(raw);
    return failure;
}

// Inference engine state built by Example1EngineCreate and released by
// Example1EngineDestroy.
struct Example1Engine {
// Network supplied to Example1EngineCreate. Example1EngineDestroy does
// not free it.
Example1Net* net3;
// Thread team created in Example1EngineCreate and destroyed in
// Example1EngineDestroy.
Example1ThreaderTeam1* team11;
// Raw pointer returned by malloc for the scratch buffer; this is the
// pointer that gets freed.
char* alloc2;
// alloc2 rounded up to the next 64-byte boundary; Example1EngineInference
// uses it as the base of its intermediate tensors.
char* align2;
};

// Forwards to Example1ThreaderPthreadT1 for the engine's thread team,
// passing the thread index idx2 and the pthread_t output location to1.
// Returns whatever Example1ThreaderPthreadT1 returns.
char* Example1EnginePthreadT(
    Example1Engine* eng2,
    ptrdiff_t idx2,
    pthread_t* to1
) {
    Example1ThreaderTeam1* workers = eng2->team11;
    char* result = Example1ThreaderPthreadT1(to1, workers, idx2);
    return result;
}

// Releases an engine created by Example1EngineCreate: destroys its
// thread team, frees the scratch buffer and then the engine object.
// The associated Example1Net is not touched.
// Passing a null pointer is a no-op (mirrors free), so callers can
// destroy unconditionally on their cleanup paths.
void Example1EngineDestroy(Example1Engine* eng3) {
    if (!eng3) {
        return;
    }
    Example1ThreaderDestroy1(eng3->team11);
    free(eng3->alloc2);
    free(eng3);
}

// Builds an inference engine bound to net4.
//
// Allocates the engine object and its scratch buffer, then creates a
// thread team with threads2 threads. Returns 0 on success with *eng4
// set; otherwise releases everything allocated so far, returns an
// error message string and leaves *eng4 untouched.
char* Example1EngineCreate(
    Example1Engine** eng4,
    Example1Net* net4,
    ptrdiff_t threads2
) {
    Example1Engine* engine = malloc(sizeof(Example1Engine));
    if (__builtin_expect(engine == 0, 0)) {
        return Example1Errmsg1(__LINE__, "errno %d", errno);
    }
    char* scratch = malloc(80496703);
    if (__builtin_expect(scratch == 0, 0)) {
        // Capture errno in the message before free() can disturb it.
        char* oom = Example1Errmsg1(__LINE__, "errno %d", errno);
        free(engine);
        return oom;
    }
    engine->alloc2 = scratch;
    // Round up to a 64-byte boundary; the allocation includes 63 spare bytes.
    engine->align2 = (void*)(((size_t)scratch+63)&-64);
    char* failure = Example1ThreaderCreate1(&engine->team11, threads2);
    if (__builtin_expect(failure != 0, 0)) {
        free(engine);
        free(scratch);
        return failure;
    }
    engine->net3 = net4;
    *eng4 = engine;
    return 0;
}

// Runs one forward pass: inData -> outData.
//
// Three stages run in order on the engine's thread team:
//   1. Example1LoomArrangeDats1 rearranges inData into the start of the
//      engine's aligned scratch buffer;
//   2. Example1LoomProduceSums1 combines the network's arranged
//      parameters with that data into the sums region, which begins
//      62218240 bytes into the scratch buffer;
//   3. Example1LoomConsumeSums1 turns the sums into outData.
void Example1EngineInference(
    Example1Engine* eng1,
    float* inData,
    float* outData
) {
    Example1ThreaderTeam1* workers = eng1->team11;
    char* params = eng1->net3->align1;
    char* scratch = eng1->align2;
    char* arranged = scratch+0;
    char* sums = scratch+62218240;

    char* arrangeArgs[2];
    arrangeArgs[0] = (char*)inData;
    arrangeArgs[1] = arranged;
    Example1LoomArrangeDats1(workers, arrangeArgs);

    char* produceArgs[3];
    produceArgs[0] = params+0;
    produceArgs[1] = arranged;
    produceArgs[2] = sums;
    Example1LoomProduceSums1(workers, produceArgs);

    char* consumeArgs[2];
    consumeArgs[0] = sums;
    consumeArgs[1] = (char*)outData;
    Example1LoomConsumeSums1(workers, consumeArgs);
}

// End of file.

Top