NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=Example18 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=in Channels=121 Height=255 Width=246
FullyConnected FromTensor=in ToTensor=out ToChannels=709
Output FromTensor=out

Top || Output Example18.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(Example18Params);
// Example18Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "r");
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

typedef struct Example18Params Example18Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// Example18Params* params = malloc(sizeof(Example18Params));
//
// ... Load params (read from a file, perhaps) ...
//
// Example18Net* net; // For example, 4 threads:
// char* err = Example18NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// Example18NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

typedef struct Example18Net Example18Net;

char* Example18NetCreate(
Example18Net**,
Example18Params*,
ptrdiff_t threads
);

void Example18NetDestroy(Example18Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// Example18Net* net;
//
// ... Create net ...
//
// Example18Engine* engine; // For example, 4 inference threads:
// char* err = Example18EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// Example18EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = Example18EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* inData = malloc(sizeof(float)*121*255*246);
// float* outData = malloc(sizeof(float)*709*1*1);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// Example18EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// inData, // The tensor arguments are sorted by name.
// outData
// );
//
// ... Read the output floats ...
//
// }
//
// free(inData);
// free(outData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

typedef struct Example18Engine Example18Engine;

char* Example18EngineCreate(
Example18Engine**,
Example18Net*,
ptrdiff_t threads
);

char* Example18EnginePthreadT(
Example18Engine*,
ptrdiff_t threadIdx,
pthread_t* to
);

void Example18EngineInference(
Example18Engine*,
float* inData,
float* outData
);

void Example18EngineDestroy(Example18Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

// Trained parameters for the single FullyConnected layer declared in
// the input graph: bias vector plus the dense weight matrix
// (5,381,543,970 floats = 709*121*255*246, about 21.5 GB at 4 bytes
// per float).
struct Example18Params {
float outBiases[709]; // 1x709x1x1
float outWeights[5381543970]; // 709x121x255x246
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output Example18.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f Example18.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "Example18.h"

// Build a heap-allocated error message of the form
// "Example18: line N: <formatted detail>". The caller owns the
// returned buffer and must free it.
static char* Example18Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
enum { msgBytes1 = 277 };
char* buf1 = malloc(msgBytes1);
int used1 = sprintf(buf1, "Example18: line %td: ", lineNum1);
va_list args1;
va_start(args1, format1);
vsnprintf(buf1+used1, msgBytes1-used1, format1, args1);
va_end(args1);
return buf1;
}

// Types for the embedded work-distribution ("threader") runtime. A
// Team owns one Hub plus one Node per worker thread; a Task describes
// a multi-dimensional iteration space whose points are dealt out to
// the nodes and executed by a callee function.

typedef struct Example18ThreaderTask1 Example18ThreaderTask1;
typedef void (*Example18ThreaderCallee1)(Example18ThreaderTask1*, int64_t*);
typedef struct Example18ThreaderHub1 Example18ThreaderHub1;
typedef struct Example18ThreaderNode1 Example18ThreaderNode1;
typedef struct Example18ThreaderUnwind1 Example18ThreaderUnwind1;
typedef struct Example18ThreaderTeam1 Example18ThreaderTeam1;

// A parallel task: callee1 is invoked once per point of the iteration
// space whose extents are hull1[0..nd1-1] (nd1 <= 4); any1 is an
// opaque payload pointer for the callee.
struct Example18ThreaderTask1 {
Example18ThreaderCallee1 callee1;
void* any1;
ptrdiff_t nd1;
int64_t hull1[4];
};

// Shared coordination state for a team. status1 is a flexible array
// of bitmask words, one bit per node (a node's bit is set while it
// may still hold unclaimed work); offset1/mask1 record the current
// scan position between steal attempts, and pending1 counts workers
// that have not yet finished the current task.
struct Example18ThreaderHub1 {
pthread_mutex_t mut1;
pthread_cond_t cond1;
ptrdiff_t pending1;
ptrdiff_t offset1;
long mask1;
long status1[];
};

// Per-worker state, guarded by mut2: np1 points remain in this node's
// queue (np1 < 0 requests thread exit), pt1 is the next point to
// claim, task1 is the current task (0 = idle). Aligned to 64 bytes —
// presumably to keep nodes on distinct cache lines and avoid false
// sharing; confirm against the generator.
struct Example18ThreaderNode1 {
pthread_mutex_t mut2;
int64_t np1;
int64_t pt1[4];
Example18ThreaderTask1* task1;
pthread_cond_t cond2;
Example18ThreaderTeam1* team1;
pthread_t thr1;
} __attribute__((aligned(64)));

// Teardown bookkeeping: how many threads were started (join1), how
// many node conds/muts and whether the hub cond/mut were initialized,
// plus the raw (pre-alignment) malloc blocks — so Destroy1 releases
// exactly what was successfully created.
struct Example18ThreaderUnwind1 {
ptrdiff_t join1;
ptrdiff_t nodeConds1;
ptrdiff_t nodeMuts1;
ptrdiff_t hubCond1;
ptrdiff_t hubMut1;
void* nodes1;
void* hub1;
};

// A team: nt1 worker nodes (nodes2) sharing one hub (hub2).
struct Example18ThreaderTeam1 {
ptrdiff_t nt1;
Example18ThreaderHub1* hub2;
Example18ThreaderNode1* nodes2;
Example18ThreaderUnwind1 unwind1;
};

// Advance pt2 to the next point of the mixed-radix space whose digit
// extents are hull2[0..nd2-1]; dimension 0 varies fastest. After the
// final point, pt2 wraps back to the origin.
static void Example18ThreaderInc1(
ptrdiff_t nd2,
int64_t*restrict hull2,
int64_t*restrict pt2
) {
ptrdiff_t dim1 = 0;
while (dim1 < nd2) {
int64_t next1 = pt2[dim1]+1;
if (next1 == hull2[dim1]) {
pt2[dim1] = 0; // digit overflowed: zero it and carry upward
++dim1;
} else {
pt2[dim1] = next1;
return;
}
}
}

// Decompose the scalar val1 into mixed-radix digits pt3[0..nd3-1],
// least significant digit first, using per-digit radices
// hull3[0..nd3-1]. Digits past the last nonzero quotient are zeroed.
static void Example18ThreaderPut1(
ptrdiff_t nd3,
int64_t*restrict hull3,
int64_t*restrict pt3,
int64_t val1
) {
ptrdiff_t dim2 = 0;
while (dim2 < nd3 && val1) {
int64_t radix1 = hull3[dim2];
int64_t quot1 = val1/radix1;
pt3[dim2] = val1-quot1*radix1; // remainder without a second division
val1 = quot1;
++dim2;
}
while (dim2 < nd3) {
pt3[dim2] = 0;
++dim2;
}
}

// Mixed-radix addition: pt4 += plus1 + carry2, with per-digit radices
// hull4[0..nd4-1]. Each per-digit sum is assumed less than twice its
// radix, so one conditional subtraction normalizes the digit.
static void Example18ThreaderAdd1(
ptrdiff_t nd4,
int64_t*restrict hull4,
int64_t*restrict pt4,
int64_t*restrict plus1,
int64_t carry2
) {
for (ptrdiff_t dim3 = 0; dim3 < nd4; ++dim3) {
int64_t radix2 = hull4[dim3];
int64_t digit1 = pt4[dim3]+plus1[dim3]+carry2;
carry2 = digit1 >= radix2;
pt4[dim3] = carry2 ? digit1-radix2 : digit1;
}
}

// Worker thread body. Each worker first drains its own node's share
// of the current task, then scans the hub's status bitmask to steal
// leftover points from other nodes, and finally signals the hub when
// it is the last worker to finish. The bare for-loops around pthread
// calls retry until the call returns 0.
static void* Example18ThreaderMain1(void* arg1) {
Example18ThreaderNode1* node1 = arg1;
Example18ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
Example18ThreaderHub1* hub3 = team2->hub2;
Example18ThreaderNode1* nodes3 = team2->nodes2;
size_t role1 = node1-nodes3; // this worker's index within the team
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
for (; ; ) {
// Sleep until Do1 posts a task (or Destroy1 posts a dummy task).
Example18ThreaderTask1* task2 = node1->task1;
if (!task2) {
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
// np1 < 0 is the shutdown request set by Example18ThreaderDestroy1.
int64_t np2 = node1->np1;
if (np2 < 0) {
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
node1->task1 = 0;
Example18ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Drain this node's own queue: claim the current point, advance the
// cursor under the node lock, then run the callee with it unlocked.
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
Example18ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
// Own queue empty: clear this worker's bit in the hub bitmask, then
// scan for set bits identifying nodes that may still hold work.
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
++offset2;
mask2 = -1;
continue;
}
// Bit nt2 (just past the last node) is set by Do1 and never cleared
// by workers; reaching it twice in a row (wrapped1) means no node
// has work left, so the scan terminates.
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
hand1 &= -hand1; // isolate the lowest set bit = chosen victim
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1; // others resume scanning past this bit
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
// Steal: drain the victim node's queue, same protocol as above.
Example18ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
Example18ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
hub3->status1[offset2] &= ~hand1; // victim is drained: clear its bit
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// Last worker to finish wakes Do1, which waits on hub cond1.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Tear down a team created (possibly only partially) by the Create1
// chain. Safe to call with a NULL team or after any Create1 failure:
// the unwind1 counters bound every loop, so only resources that were
// actually initialized get released.
static void Example18ThreaderDestroy1(Example18ThreaderTeam1* team3) {
if (!team3) return;
Example18ThreaderNode1* nodes4 = team3->nodes2;
Example18ThreaderNode1* stop1 = nodes4+team3->unwind1.join1;
// Ask every started worker to exit: np1 = -1 is the shutdown signal
// and the nonzero dummy task pointer wakes Main1's idle wait.
for (Example18ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_mutex_lock(&node3->mut2), 0); );
node3->np1 = -1;
node3->task1 = (Example18ThreaderTask1*)1;
for (; __builtin_expect(pthread_mutex_unlock(&node3->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node3->cond2), 0); );
}
for (Example18ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_join(node3->thr1, 0), 0); );
}
// Destroy only the node conds/muts that were initialized.
stop1 = nodes4+team3->unwind1.nodeConds1;
for (Example18ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_cond_destroy(&node3->cond2), 0); );
}
stop1 = nodes4+team3->unwind1.nodeMuts1;
for (Example18ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_mutex_destroy(&node3->mut2), 0); );
}
Example18ThreaderHub1* hub4 = team3->hub2;
if (team3->unwind1.hubCond1) {
for (; __builtin_expect(pthread_cond_destroy(&hub4->cond1), 0); );
}
if (team3->unwind1.hubMut1) {
for (; __builtin_expect(pthread_mutex_destroy(&hub4->mut1), 0); );
}
// Free the raw malloc blocks (nodes2/hub2 point into these, aligned).
free(team3->unwind1.nodes1);
free(team3->unwind1.hub1);
free(team3);
}

// Create nt7 worker nodes: for each, initialize its mutex and cond
// and start its thread. On the first failure, record in unwind1 the
// exact counts of muts/conds/threads successfully created so that
// Destroy1 releases only those, and return a heap-allocated message.
static char* Example18ThreaderCreate1Up4(Example18ThreaderTeam1* team8, ptrdiff_t nt7) {
Example18ThreaderNode1* nodes5 = team8->nodes2;
for (Example18ThreaderNode1* node4 = nodes5; node4 != nodes5+nt7; ++node4) {
int err2 = pthread_mutex_init(&node4->mut2, 0);
if (__builtin_expect(err2, 0)) {
char* msg2 = Example18Errmsg1(__LINE__, "errno %d", err2);
// node4-nodes5 nodes were fully built; this node's mutex failed.
team8->unwind1.nodeMuts1 = node4-nodes5;
team8->unwind1.nodeConds1 = node4-nodes5;
team8->unwind1.join1 = node4-nodes5;
return msg2;
}
node4->task1 = 0; // worker starts idle, waiting for a task
int err3 = pthread_cond_init(&node4->cond2, 0);
if (__builtin_expect(err3, 0)) {
char* msg3 = Example18Errmsg1(__LINE__, "errno %d", err3);
// This node's mutex succeeded (hence +1) but its cond failed.
team8->unwind1.nodeMuts1 = node4-nodes5+1;
team8->unwind1.nodeConds1 = node4-nodes5;
team8->unwind1.join1 = node4-nodes5;
return msg3;
}
node4->team1 = team8;
int err4 = pthread_create(&node4->thr1, 0, Example18ThreaderMain1, node4);
if (__builtin_expect(err4, 0)) {
char* msg4 = Example18Errmsg1(__LINE__, "errno %d", err4);
// Mutex and cond succeeded; the thread itself was not started.
team8->unwind1.nodeMuts1 = node4-nodes5+1;
team8->unwind1.nodeConds1 = node4-nodes5+1;
team8->unwind1.join1 = node4-nodes5;
return msg4;
}
}
// Full success: every node has a mutex, a cond, and a joinable thread.
team8->unwind1.nodeMuts1 = nt7;
team8->unwind1.nodeConds1 = nt7;
team8->unwind1.join1 = nt7;
return 0;
}

// Initialize the hub's mutex and condition variable, recording each
// success in the unwind bookkeeping before moving on, then continue
// with node/thread setup. Returns 0 on success, else a heap-allocated
// error message owned by the caller.
static char* Example18ThreaderCreate1Up3(Example18ThreaderTeam1* team7, ptrdiff_t nt6) {
Example18ThreaderHub1* hub5 = team7->hub2;
int rc1 = pthread_mutex_init(&hub5->mut1, 0);
if (__builtin_expect(rc1, 0)) {
return Example18Errmsg1(__LINE__, "errno %d", rc1);
}
team7->unwind1.hubMut1 = 1;
int rc2 = pthread_cond_init(&hub5->cond1, 0);
if (__builtin_expect(rc2, 0)) {
return Example18Errmsg1(__LINE__, "errno %d", rc2);
}
team7->unwind1.hubCond1 = 1;
return Example18ThreaderCreate1Up4(team7, nt6);
}

// Allocate the node array: over-allocate by 63 bytes and align the
// usable pointer to a 64-byte boundary, keeping the raw pointer in
// unwind1 for the eventual free(). Guards against overflow of the
// size multiplication before allocating.
static char* Example18ThreaderCreate1Up2(Example18ThreaderTeam1* team6, ptrdiff_t nt5) {
size_t bytes1 = nt5*sizeof(Example18ThreaderNode1);
if (__builtin_expect(bytes1/sizeof(Example18ThreaderNode1) != (size_t)nt5, 0)) {
return Example18Errmsg1(__LINE__, "too many threads");
}
void* raw1 = malloc(bytes1+63);
if (__builtin_expect(!raw1, 0)) {
return Example18Errmsg1(__LINE__, "errno %d", errno);
}
team6->unwind1.nodes1 = raw1;
team6->nodes2 = (void*)(((size_t)raw1+63)&-64);
return Example18ThreaderCreate1Up3(team6, nt5);
}

// Size and allocate the hub: the fixed header plus enough long words
// to hold one status bit per node (and spare high bits), rounded up
// to a 64-byte multiple and aligned to 64 bytes within a raw block
// that unwind1 keeps for free().
static char* Example18ThreaderCreate1Up1(Example18ThreaderTeam1* team5, ptrdiff_t nt4) {
team5->nt1 = nt4;
size_t bytes2 = sizeof(Example18ThreaderHub1);
bytes2 += sizeof(long)*((size_t)nt4/(sizeof(long)*8)+1);
bytes2 = (bytes2+63)&-64;
void* raw2 = malloc(bytes2+63);
if (__builtin_expect(!raw2, 0)) {
return Example18Errmsg1(__LINE__, "errno %d", errno);
}
team5->unwind1.hub1 = raw2;
team5->hub2 = (void*)(((size_t)raw2+63)&-64);
return Example18ThreaderCreate1Up2(team5, nt4);
}

// Create a team of nt3 worker threads. On success *team4 is set and 0
// is returned; on failure *team4 is untouched, everything partially
// built is torn down, and a heap-allocated error message is returned
// (the caller frees it).
static char* Example18ThreaderCreate1(Example18ThreaderTeam1** team4, ptrdiff_t nt3) {
if (__builtin_expect(nt3 < 1, 0)) {
return Example18Errmsg1(__LINE__, "too few threads");
}
// Zero-fill so the unwind counters start at "nothing created".
Example18ThreaderTeam1* fresh1 = calloc(1, sizeof(Example18ThreaderTeam1));
if (__builtin_expect(!fresh1, 0)) {
return Example18Errmsg1(__LINE__, "errno %d", errno);
}
char* fail1 = Example18ThreaderCreate1Up1(fresh1, nt3);
if (__builtin_expect(!!fail1, 0)) {
Example18ThreaderDestroy1(fresh1);
return fail1;
}
*team4 = fresh1;
return 0;
}

// Fetch the pthread_t of team thread idx1 into *thr2. Fails only when
// the index is outside [0, nt1); the returned message is heap
// allocated and owned by the caller.
static char* Example18ThreaderPthreadT1(
pthread_t* thr2,
Example18ThreaderTeam1* team9,
ptrdiff_t idx1
) {
if (__builtin_expect(idx1 >= 0 && idx1 < team9->nt1, 1)) {
*thr2 = team9->nodes2[idx1].thr1;
return 0;
}
return Example18Errmsg1(__LINE__, "bad thread idx");
}

// Distribute task3 over the team and block until it completes. The
// hull is flattened to tot1 points; each node is seeded with a run of
// each1 consecutive points (the first more1 nodes get one extra),
// starting at the multi-dimensional point pt6. The hub mutex is held
// through the whole distribution, so no finishing worker can touch
// the status bitmask until this thread blocks in pthread_cond_wait
// (which releases the mutex).
static void Example18ThreaderDo1(Example18ThreaderTeam1* team10, Example18ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return; // empty iteration space: nothing to do
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
// plus2 is each1 expressed as mixed-radix digits, used to step pt6
// from one node's starting point to the next.
int64_t plus2[4];
Example18ThreaderPut1(nd6, task3->hull1, plus2, each1);
int64_t pt6[4] = {0};
Example18ThreaderHub1* hub6 = team10->hub2;
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
Example18ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
int64_t carry3 = i4 < more1; // first more1 nodes take one extra point
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
Example18ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Arm the steal bitmask: all bits set, including the bits at and
// above index nt8 that Main1 uses as a scan-termination sentinel.
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
// Block until the last worker decrements pending1 to zero and
// signals cond1.
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

// Vectorized single-precision exp(). Method: clamp x into the range
// where float exp is finite, write x = r*ln(2) + f with r the
// round-to-nearest integer of x*log2(e) (the -ln(2) multiply is split
// into high and low parts for accuracy), approximate exp(f) with a
// degree-4 polynomial in Horner form, then multiply by 2^r by adding
// the integer r directly to the IEEE-754 exponent bits of the result.
static __m512 Example18Exp1(__m512 x1) {
x1 = _mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f));
x1 = _mm512_min_ps(x1, _mm512_set1_ps(8.872284e+01f));
__m512 t1 = _mm512_mul_ps(x1, _mm512_set1_ps(1.442695e+00f)); // x*log2(e)
__m512 r1 = _mm512_roundscale_ps(t1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
// f = x - r*ln(2), with ln(2) applied as hi+lo for extra precision.
__m512 f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-6.9314575e-01f), x1);
f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-1.4286068e-06f), f1);
// Degree-4 polynomial approximation of exp(f), Horner evaluation.
__m512 g1 = _mm512_set1_ps(4.194439e-02f);
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(1.6800667e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(4.9999994e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.999569e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.9999964e-01f));
// Scale by 2^r: shift r into the exponent field and add bitwise.
__m512i y1 = _mm512_slli_epi32(_mm512_cvtps_epi32(t1), 23);
return _mm512_castsi512_ps(_mm512_add_epi32(y1, _mm512_castps_si512(g1)));
}

// Vectorized reciprocal square root: the hardware rsqrt14 estimate
// refined by one Newton-Raphson step, 0.5*y*(3 - x*y*y).
static __m512 Example18Rsqrt1(__m512 x2) {
__m512 y2 = _mm512_rsqrt14_ps(x2); // ~14-bit initial estimate
__m512 z1 = _mm512_mul_ps(x2, y2);
__m512 a1 = _mm512_mul_ps(y2, _mm512_set1_ps(5e-01f));
__m512 b1 = _mm512_fnmadd_ps(y2, z1, _mm512_set1_ps(3e+00f)); // 3 - x*y*y
return _mm512_mul_ps(a1, b1);
}

static void Example18FcArrange1Callee1(Example18ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
ptrdiff_t t2 = pt7[0];
char*restrict weights1 = tensors2[0]+(ptrdiff_t)485781120*t2;
char*restrict biases1 = tensors2[1]+(ptrdiff_t)64*t2;
char*restrict weights2 = tensors2[2]+(ptrdiff_t)242890752*t2;
char*restrict biases2 = tensors2[2]+(ptrdiff_t)10763096448+(ptrdiff_t)64*t2;
if (t2 < 44) {
for (ptrdiff_t i5 = 0; i5 < 1; ++i5) {
for (ptrdiff_t j1 = 0; j1 < 474395; ++j1) {
__m512 wtLo1 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)0+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*j1);
__m512 wtHi1 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)30361320+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*j1);
__m512 wtLo2 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)60722640+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*j1);
__m512 wtHi2 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)91083960+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*j1);
__m512 wtLo3 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)121445280+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*j1);
__m512 wtHi3 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)151806600+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*j1);
__m512 wtLo4 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)182167920+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*j1);
__m512 wtHi4 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)212529240+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*j1);
__m512 wtLo5 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)242890560+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*j1);
__m512 wtHi5 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)273251880+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*j1);
__m512 wtLo6 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)303613200+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*j1);
__m512 wtHi6 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)333974520+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*j1);
__m512 wtLo7 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)364335840+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*j1);
__m512 wtHi7 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)394697160+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*j1);
__m512 wtLo8 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)425058480+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*j1);
__m512 wtHi8 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)455419800+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*j1);
__m256i halfLo1 = _mm512_cvtps_ph(wtLo1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi1 = _mm512_cvtps_ph(wtHi1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo2 = _mm512_cvtps_ph(wtLo2, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi2 = _mm512_cvtps_ph(wtHi2, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo3 = _mm512_cvtps_ph(wtLo3, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi3 = _mm512_cvtps_ph(wtHi3, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo4 = _mm512_cvtps_ph(wtLo4, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi4 = _mm512_cvtps_ph(wtHi4, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo5 = _mm512_cvtps_ph(wtLo5, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi5 = _mm512_cvtps_ph(wtHi5, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo6 = _mm512_cvtps_ph(wtLo6, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi6 = _mm512_cvtps_ph(wtHi6, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo7 = _mm512_cvtps_ph(wtLo7, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi7 = _mm512_cvtps_ph(wtHi7, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo8 = _mm512_cvtps_ph(wtLo8, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi8 = _mm512_cvtps_ph(wtHi8, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m512i yield1 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo1), halfHi1, 1);
__m512i yield2 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo2), halfHi2, 1);
__m512i yield3 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo3), halfHi3, 1);
__m512i yield4 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo4), halfHi4, 1);
__m512i yield5 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo5), halfHi5, 1);
__m512i yield6 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo6), halfHi6, 1);
__m512i yield7 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo7), halfHi7, 1);
__m512i yield8 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo8), halfHi8, 1);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)0+(ptrdiff_t)242890752*i5+(ptrdiff_t)512*j1, 65535, yield1);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)64+(ptrdiff_t)242890752*i5+(ptrdiff_t)512*j1, 65535, yield2);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)128+(ptrdiff_t)242890752*i5+(ptrdiff_t)512*j1, 65535, yield3);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)192+(ptrdiff_t)242890752*i5+(ptrdiff_t)512*j1, 65535, yield4);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)256+(ptrdiff_t)242890752*i5+(ptrdiff_t)512*j1, 65535, yield5);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)320+(ptrdiff_t)242890752*i5+(ptrdiff_t)512*j1, 65535, yield6);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)384+(ptrdiff_t)242890752*i5+(ptrdiff_t)512*j1, 65535, yield7);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)448+(ptrdiff_t)242890752*i5+(ptrdiff_t)512*j1, 65535, yield8);
}
__m512 wtLo9 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)0+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*474395);
__m512 wtHi9 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)30361320+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*474395);
__m512 wtLo10 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)60722640+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*474395);
__m512 wtHi10 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)91083960+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*474395);
__m512 wtLo11 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)121445280+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*474395);
__m512 wtHi11 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)151806600+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*474395);
__m512 wtLo12 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)182167920+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*474395);
__m512 wtHi12 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)212529240+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*474395);
__m512 wtLo13 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)242890560+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*474395);
__m512 wtHi13 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)273251880+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*474395);
__m512 wtLo14 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)303613200+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*474395);
__m512 wtHi14 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)333974520+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*474395);
__m512 wtLo15 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)364335840+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*474395);
__m512 wtHi15 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)394697160+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*474395);
__m512 wtLo16 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)425058480+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*474395);
__m512 wtHi16 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)455419800+(ptrdiff_t)485781120*i5+(ptrdiff_t)64*474395);
__m256i halfLo9 = _mm512_cvtps_ph(wtLo9, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi9 = _mm512_cvtps_ph(wtHi9, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo10 = _mm512_cvtps_ph(wtLo10, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi10 = _mm512_cvtps_ph(wtHi10, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo11 = _mm512_cvtps_ph(wtLo11, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi11 = _mm512_cvtps_ph(wtHi11, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo12 = _mm512_cvtps_ph(wtLo12, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi12 = _mm512_cvtps_ph(wtHi12, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo13 = _mm512_cvtps_ph(wtLo13, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi13 = _mm512_cvtps_ph(wtHi13, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo14 = _mm512_cvtps_ph(wtLo14, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi14 = _mm512_cvtps_ph(wtHi14, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo15 = _mm512_cvtps_ph(wtLo15, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi15 = _mm512_cvtps_ph(wtHi15, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo16 = _mm512_cvtps_ph(wtLo16, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi16 = _mm512_cvtps_ph(wtHi16, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m512i yield9 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo9), halfHi9, 1);
__m512i yield10 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo10), halfHi10, 1);
__m512i yield11 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo11), halfHi11, 1);
__m512i yield12 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo12), halfHi12, 1);
__m512i yield13 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo13), halfHi13, 1);
__m512i yield14 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo14), halfHi14, 1);
__m512i yield15 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo15), halfHi15, 1);
__m512i yield16 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo16), halfHi16, 1);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)0+(ptrdiff_t)242890752*i5+(ptrdiff_t)512*474395, 65535, yield9);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)64+(ptrdiff_t)242890752*i5+(ptrdiff_t)512*474395, 65535, yield10);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)128+(ptrdiff_t)242890752*i5+(ptrdiff_t)512*474395, 65535, yield11);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)192+(ptrdiff_t)242890752*i5+(ptrdiff_t)512*474395, 65535, yield12);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)256+(ptrdiff_t)242890752*i5+(ptrdiff_t)512*474395, 65535, yield13);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)320+(ptrdiff_t)242890752*i5+(ptrdiff_t)512*474395, 65535, yield14);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)384+(ptrdiff_t)242890752*i5+(ptrdiff_t)512*474395, 65535, yield15);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)448+(ptrdiff_t)242890752*i5+(ptrdiff_t)512*474395, 65535, yield16);
__m512 bias1 = _mm512_maskz_loadu_ps(65535, biases1+(ptrdiff_t)0+(ptrdiff_t)64*i5);
_mm512_mask_storeu_ps(biases2+(ptrdiff_t)0+(ptrdiff_t)64*i5, 65535, bias1);
}
return;
}
for (ptrdiff_t i6 = 0; i6 < 1; ++i6) {
for (ptrdiff_t j2 = 0; j2 < 237197; ++j2) {
__m512 wtLo17 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)0+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*j2);
__m512 wtHi17 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)30361320+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*j2);
__m512 wtLo18 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)60722640+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*j2);
__m512 wtHi18 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)91083960+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*j2);
__m512 wtLo19 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)121445280+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*j2);
__m512 wtHi19 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)64+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*j2);
__m512 wtLo20 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)30361384+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*j2);
__m512 wtHi20 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)60722704+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*j2);
__m512 wtLo21 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)91084024+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*j2);
__m512 wtHi21 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)121445344+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*j2);
__m256i halfLo17 = _mm512_cvtps_ph(wtLo17, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi17 = _mm512_cvtps_ph(wtHi17, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo18 = _mm512_cvtps_ph(wtLo18, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi18 = _mm512_cvtps_ph(wtHi18, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo19 = _mm512_cvtps_ph(wtLo19, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi19 = _mm512_cvtps_ph(wtHi19, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo20 = _mm512_cvtps_ph(wtLo20, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi20 = _mm512_cvtps_ph(wtHi20, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo21 = _mm512_cvtps_ph(wtLo21, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi21 = _mm512_cvtps_ph(wtHi21, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m512i yield17 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo17), halfHi17, 1);
__m512i yield18 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo18), halfHi18, 1);
__m512i yield19 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo19), halfHi19, 1);
__m512i yield20 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo20), halfHi20, 1);
__m512i yield21 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo21), halfHi21, 1);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)0+(ptrdiff_t)242890752*i6+(ptrdiff_t)320*j2, 65535, yield17);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)64+(ptrdiff_t)242890752*i6+(ptrdiff_t)320*j2, 65535, yield18);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)128+(ptrdiff_t)242890752*i6+(ptrdiff_t)320*j2, 65535, yield19);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)192+(ptrdiff_t)242890752*i6+(ptrdiff_t)320*j2, 65535, yield20);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)256+(ptrdiff_t)242890752*i6+(ptrdiff_t)320*j2, 65535, yield21);
}
__m512 wtLo22 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)0+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*237197);
__m512 wtHi22 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)30361320+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*237197);
__m512 wtLo23 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)60722640+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*237197);
__m512 wtHi23 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)91083960+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*237197);
__m512 wtLo24 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)121445280+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*237197);
__m512 wtHi24 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)64+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*237197);
__m512 wtLo25 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)30361384+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*237197);
__m512 wtHi25 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)60722704+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*237197);
__m512 wtLo26 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)91084024+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*237197);
__m512 wtHi26 = _mm512_maskz_loadu_ps(1023, weights1+(ptrdiff_t)121445344+(ptrdiff_t)151806600*i6+(ptrdiff_t)128*237197);
__m256i halfLo22 = _mm512_cvtps_ph(wtLo22, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi22 = _mm512_cvtps_ph(wtHi22, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo23 = _mm512_cvtps_ph(wtLo23, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi23 = _mm512_cvtps_ph(wtHi23, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo24 = _mm512_cvtps_ph(wtLo24, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi24 = _mm512_cvtps_ph(wtHi24, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo25 = _mm512_cvtps_ph(wtLo25, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi25 = _mm512_cvtps_ph(wtHi25, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo26 = _mm512_cvtps_ph(wtLo26, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi26 = _mm512_cvtps_ph(wtHi26, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m512i yield22 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo22), halfHi22, 1);
__m512i yield23 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo23), halfHi23, 1);
__m512i yield24 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo24), halfHi24, 1);
__m512i yield25 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo25), halfHi25, 1);
__m512i yield26 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo26), halfHi26, 1);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)0+(ptrdiff_t)242890752*i6+(ptrdiff_t)320*237197, 65535, yield22);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)64+(ptrdiff_t)242890752*i6+(ptrdiff_t)320*237197, 65535, yield23);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)128+(ptrdiff_t)242890752*i6+(ptrdiff_t)320*237197, 65535, yield24);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)192+(ptrdiff_t)242890752*i6+(ptrdiff_t)320*237197, 65535, yield25);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)256+(ptrdiff_t)242890752*i6+(ptrdiff_t)320*237197, 65535, yield26);
__m512 bias2 = _mm512_maskz_loadu_ps(31, biases1+(ptrdiff_t)0+(ptrdiff_t)20*i6);
_mm512_mask_storeu_ps(biases2+(ptrdiff_t)0+(ptrdiff_t)20*i6, 31, bias2);
}
}

// One-time rearrangement of the fully connected layer's parameters
// into the packed runtime layout. The work is dispatched as a
// one-dimensional task space of 45 slices over the thread team.
static void Example18FcArrange1(Example18ThreaderTeam1* team13, char** tensors1) {
  Example18ThreaderTask1 arrange;
  arrange.nd1 = 1;
  arrange.hull1[0] = 45;
  arrange.any1 = tensors1;
  arrange.callee1 = Example18FcArrange1Callee1;
  Example18ThreaderDo1(team13, &arrange);
}

// One task of the fully connected layer.
// tensors4: [0] = arranged weights (f16) followed at byte offset
// 10763096448 by the arranged biases (f32), as produced by
// Example18FcArrange1; [1] = input activations (f32); [2] = output
// activations (f32).
// Tasks 0..43 each produce 16 output channels; task 44 produces the
// final 5 (44*16+5 = 709, matching ToChannels=709 for this config).
static void Example18FcApply1Callee1(Example18ThreaderTask1* task6, int64_t* pt8) {
char** tensors4 = task6->any1;
ptrdiff_t t3 = pt8[0];
// Per-task base pointers: each task owns a 242890752-byte weight slab
// and a 64-byte (16-float) bias/output group.
char*restrict wtPtr1 = tensors4[0]+(ptrdiff_t)242890752*t3;
char*restrict biasPtr1 = tensors4[0]+(ptrdiff_t)10763096448+(ptrdiff_t)64*t3;
char*restrict datPtr1 = tensors4[1];
char*restrict datPtr2 = tensors4[2]+(ptrdiff_t)64*t3;
// Full 16-channel path for tasks 0..43.
if (t3 < 44) {
for (ptrdiff_t i7 = 0; i7 < 1; ++i7) {
// 16 accumulators, one per output channel of this task; each holds
// 16 lane-wise partial sums that are reduced after the loops.
__m512 sum2 = _mm512_setzero_ps();
__m512 sum3 = _mm512_setzero_ps();
__m512 sum4 = _mm512_setzero_ps();
__m512 sum5 = _mm512_setzero_ps();
__m512 sum6 = _mm512_setzero_ps();
__m512 sum7 = _mm512_setzero_ps();
__m512 sum8 = _mm512_setzero_ps();
__m512 sum9 = _mm512_setzero_ps();
__m512 sum10 = _mm512_setzero_ps();
__m512 sum11 = _mm512_setzero_ps();
__m512 sum12 = _mm512_setzero_ps();
__m512 sum13 = _mm512_setzero_ps();
__m512 sum14 = _mm512_setzero_ps();
__m512 sum15 = _mm512_setzero_ps();
__m512 sum16 = _mm512_setzero_ps();
// Main loop: 16 input floats per iteration; with the 10-float
// remainder below this covers 474395*16+10 = 7,590,330 inputs
// (= 121*255*246, the flattened input tensor).
for (ptrdiff_t j3 = 0; j3 < 474395; ++j3) {
// Each 64-byte wts load holds 32 f16 weights, widened via
// _mm512_cvtph_ps into a Lo and a Hi f32 vector (2 channels).
__m512i wts1 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)0+(ptrdiff_t)242890752*i7+(ptrdiff_t)512*j3);
__m512 dat1 = _mm512_maskz_loadu_ps(65535, datPtr1+(ptrdiff_t)0+(ptrdiff_t)64*j3);
__m512i wts2 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)64+(ptrdiff_t)242890752*i7+(ptrdiff_t)512*j3);
__m512i wts3 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)128+(ptrdiff_t)242890752*i7+(ptrdiff_t)512*j3);
__m512i wts4 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)192+(ptrdiff_t)242890752*i7+(ptrdiff_t)512*j3);
__m512 wtLo27 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts1));
__m512 wtHi27 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts1, 1));
__m512 wtLo28 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts2));
__m512 wtHi28 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts2, 1));
__m512 wtLo29 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts3));
__m512 wtHi29 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts3, 1));
__m512 wtLo30 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts4));
__m512 wtHi30 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts4, 1));
sum2 = _mm512_fmadd_ps(wtLo27, dat1, sum2);
sum3 = _mm512_fmadd_ps(wtHi27, dat1, sum3);
sum4 = _mm512_fmadd_ps(wtLo28, dat1, sum4);
sum5 = _mm512_fmadd_ps(wtHi28, dat1, sum5);
sum6 = _mm512_fmadd_ps(wtLo29, dat1, sum6);
sum7 = _mm512_fmadd_ps(wtHi29, dat1, sum7);
sum8 = _mm512_fmadd_ps(wtLo30, dat1, sum8);
sum9 = _mm512_fmadd_ps(wtHi30, dat1, sum9);
// Second half of the 512-byte weight row: channels 8..15.
__m512i wts5 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)256+(ptrdiff_t)242890752*i7+(ptrdiff_t)512*j3);
__m512i wts6 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)320+(ptrdiff_t)242890752*i7+(ptrdiff_t)512*j3);
__m512i wts7 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)384+(ptrdiff_t)242890752*i7+(ptrdiff_t)512*j3);
__m512i wts8 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)448+(ptrdiff_t)242890752*i7+(ptrdiff_t)512*j3);
__m512 wtLo31 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts5));
__m512 wtHi31 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts5, 1));
__m512 wtLo32 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts6));
__m512 wtHi32 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts6, 1));
__m512 wtLo33 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts7));
__m512 wtHi33 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts7, 1));
__m512 wtLo34 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts8));
__m512 wtHi34 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts8, 1));
sum10 = _mm512_fmadd_ps(wtLo31, dat1, sum10);
sum11 = _mm512_fmadd_ps(wtHi31, dat1, sum11);
sum12 = _mm512_fmadd_ps(wtLo32, dat1, sum12);
sum13 = _mm512_fmadd_ps(wtHi32, dat1, sum13);
sum14 = _mm512_fmadd_ps(wtLo33, dat1, sum14);
sum15 = _mm512_fmadd_ps(wtHi33, dat1, sum15);
sum16 = _mm512_fmadd_ps(wtLo34, dat1, sum16);
sum17 = _mm512_fmadd_ps(wtHi34, dat1, sum17);
}
// Remainder: the final 10 input floats (mask 1023 = 10 lanes).
// Weight loads stay full-width; the arranged tail weights are
// zero-padded equivalently by the masked data load.
__m512i wts9 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)0+(ptrdiff_t)242890752*i7+(ptrdiff_t)512*474395);
__m512 dat2 = _mm512_maskz_loadu_ps(1023, datPtr1+(ptrdiff_t)0+(ptrdiff_t)64*474395);
__m512i wts10 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)64+(ptrdiff_t)242890752*i7+(ptrdiff_t)512*474395);
__m512i wts11 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)128+(ptrdiff_t)242890752*i7+(ptrdiff_t)512*474395);
__m512i wts12 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)192+(ptrdiff_t)242890752*i7+(ptrdiff_t)512*474395);
__m512 wtLo35 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts9));
__m512 wtHi35 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts9, 1));
__m512 wtLo36 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts10));
__m512 wtHi36 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts10, 1));
__m512 wtLo37 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts11));
__m512 wtHi37 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts11, 1));
__m512 wtLo38 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts12));
__m512 wtHi38 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts12, 1));
sum2 = _mm512_fmadd_ps(wtLo35, dat2, sum2);
sum3 = _mm512_fmadd_ps(wtHi35, dat2, sum3);
sum4 = _mm512_fmadd_ps(wtLo36, dat2, sum4);
sum5 = _mm512_fmadd_ps(wtHi36, dat2, sum5);
sum6 = _mm512_fmadd_ps(wtLo37, dat2, sum6);
sum7 = _mm512_fmadd_ps(wtHi37, dat2, sum7);
sum8 = _mm512_fmadd_ps(wtLo38, dat2, sum8);
sum9 = _mm512_fmadd_ps(wtHi38, dat2, sum9);
__m512i wts13 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)256+(ptrdiff_t)242890752*i7+(ptrdiff_t)512*474395);
__m512i wts14 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)320+(ptrdiff_t)242890752*i7+(ptrdiff_t)512*474395);
__m512i wts15 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)384+(ptrdiff_t)242890752*i7+(ptrdiff_t)512*474395);
__m512i wts16 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)448+(ptrdiff_t)242890752*i7+(ptrdiff_t)512*474395);
__m512 wtLo39 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts13));
__m512 wtHi39 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts13, 1));
__m512 wtLo40 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts14));
__m512 wtHi40 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts14, 1));
__m512 wtLo41 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts15));
__m512 wtHi41 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts15, 1));
__m512 wtLo42 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts16));
__m512 wtHi42 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts16, 1));
sum10 = _mm512_fmadd_ps(wtLo39, dat2, sum10);
sum11 = _mm512_fmadd_ps(wtHi39, dat2, sum11);
sum12 = _mm512_fmadd_ps(wtLo40, dat2, sum12);
sum13 = _mm512_fmadd_ps(wtHi40, dat2, sum13);
sum14 = _mm512_fmadd_ps(wtLo41, dat2, sum14);
sum15 = _mm512_fmadd_ps(wtHi41, dat2, sum15);
sum16 = _mm512_fmadd_ps(wtLo42, dat2, sum16);
sum17 = _mm512_fmadd_ps(wtHi42, dat2, sum17);
__m512 bias3 = _mm512_maskz_loadu_ps(65535, biasPtr1+(ptrdiff_t)0+(ptrdiff_t)64*i7);
// Horizontal reduction: each of the 16 accumulators holds 16 partial
// sums. The shuffle/permutex2var network below is a transpose-and-add
// tree that leaves one completed dot product per output channel in
// sum2 (channel order fixed by pm1Lo1/pm1Hi1 and pm4Lo1/pm4Hi1).
__m512i pm1Lo1 = _mm512_set_epi32(30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0);
__m512i pm1Hi1 = _mm512_set_epi32(31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1);
__m512i pm4Lo1 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
__m512i pm4Hi1 = _mm512_set_epi32(31, 30, 29, 28, 15, 14, 13, 12, 23, 22, 21, 20, 7, 6, 5, 4);
// Stage 1: combine 256-bit halves across accumulator pairs (imm 238
// selects the upper 128-bit lanes, 68 the lower).
__m512 upper4 = _mm512_shuffle_f32x4(sum2, sum10, 238);
__m512 upper5 = _mm512_shuffle_f32x4(sum6, sum14, 238);
sum2 = _mm512_shuffle_f32x4(sum2, sum10, 68);
sum6 = _mm512_shuffle_f32x4(sum6, sum14, 68);
sum2 = _mm512_add_ps(sum2, upper4);
sum6 = _mm512_add_ps(sum6, upper5);
__m512 upper7 = _mm512_shuffle_f32x4(sum4, sum12, 238);
__m512 upper8 = _mm512_shuffle_f32x4(sum8, sum16, 238);
sum4 = _mm512_shuffle_f32x4(sum4, sum12, 68);
sum8 = _mm512_shuffle_f32x4(sum8, sum16, 68);
sum4 = _mm512_add_ps(sum4, upper7);
sum8 = _mm512_add_ps(sum8, upper8);
// Stage 2: combine 128-bit quarters via the pm4 permutations.
__m512 upper3 = _mm512_permutex2var_ps(sum2, pm4Hi1, sum6);
__m512 upper6 = _mm512_permutex2var_ps(sum4, pm4Hi1, sum8);
sum2 = _mm512_permutex2var_ps(sum2, pm4Lo1, sum6);
sum4 = _mm512_permutex2var_ps(sum4, pm4Lo1, sum8);
sum2 = _mm512_add_ps(sum2, upper3);
sum4 = _mm512_add_ps(sum4, upper6);
// Same two stages for the odd-numbered (Hi) channel accumulators.
__m512 upper11 = _mm512_shuffle_f32x4(sum3, sum11, 238);
__m512 upper12 = _mm512_shuffle_f32x4(sum7, sum15, 238);
sum3 = _mm512_shuffle_f32x4(sum3, sum11, 68);
sum7 = _mm512_shuffle_f32x4(sum7, sum15, 68);
sum3 = _mm512_add_ps(sum3, upper11);
sum7 = _mm512_add_ps(sum7, upper12);
__m512 upper14 = _mm512_shuffle_f32x4(sum5, sum13, 238);
__m512 upper15 = _mm512_shuffle_f32x4(sum9, sum17, 238);
sum5 = _mm512_shuffle_f32x4(sum5, sum13, 68);
sum9 = _mm512_shuffle_f32x4(sum9, sum17, 68);
sum5 = _mm512_add_ps(sum5, upper14);
sum9 = _mm512_add_ps(sum9, upper15);
__m512 upper10 = _mm512_permutex2var_ps(sum3, pm4Hi1, sum7);
__m512 upper13 = _mm512_permutex2var_ps(sum5, pm4Hi1, sum9);
sum3 = _mm512_permutex2var_ps(sum3, pm4Lo1, sum7);
sum5 = _mm512_permutex2var_ps(sum5, pm4Lo1, sum9);
sum3 = _mm512_add_ps(sum3, upper10);
sum5 = _mm512_add_ps(sum5, upper13);
// Stage 3: combine 64-bit pairs, then stage 4 interleaves the final
// even/odd elements so sum2 holds channels 0..15 in order.
__m512 upper2 = _mm512_shuffle_ps(sum2, sum4, 238);
__m512 upper9 = _mm512_shuffle_ps(sum3, sum5, 238);
sum2 = _mm512_shuffle_ps(sum2, sum4, 68);
sum3 = _mm512_shuffle_ps(sum3, sum5, 68);
sum2 = _mm512_add_ps(sum2, upper2);
sum3 = _mm512_add_ps(sum3, upper9);
__m512 upper1 = _mm512_permutex2var_ps(sum2, pm1Hi1, sum3);
sum2 = _mm512_permutex2var_ps(sum2, pm1Lo1, sum3);
sum2 = _mm512_add_ps(sum2, upper1);
// Add biases and store the task's 16 output channels.
sum2 = _mm512_add_ps(sum2, bias3);
_mm512_mask_storeu_ps(datPtr2+(ptrdiff_t)0+(ptrdiff_t)64*i7, 65535, sum2);
}
return;
}
// Tail path for task 44: only 5 output channels remain, arranged as
// 320-byte weight rows, consumed 32 input floats per iteration
// (dat3/dat4 pair). 237197*32 + 26 = 7,590,330 inputs, as above.
for (ptrdiff_t i8 = 0; i8 < 1; ++i8) {
__m512 sum18 = _mm512_setzero_ps();
__m512 sum19 = _mm512_setzero_ps();
__m512 sum20 = _mm512_setzero_ps();
__m512 sum21 = _mm512_setzero_ps();
__m512 sum22 = _mm512_setzero_ps();
for (ptrdiff_t j4 = 0; j4 < 237197; ++j4) {
__m512i wts17 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)0+(ptrdiff_t)242890752*i8+(ptrdiff_t)320*j4);
__m512 dat3 = _mm512_maskz_loadu_ps(65535, datPtr1+(ptrdiff_t)0+(ptrdiff_t)128*j4);
__m512i wts18 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)64+(ptrdiff_t)242890752*i8+(ptrdiff_t)320*j4);
__m512i wts19 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)128+(ptrdiff_t)242890752*i8+(ptrdiff_t)320*j4);
__m512 dat4 = _mm512_maskz_loadu_ps(65535, datPtr1+(ptrdiff_t)64+(ptrdiff_t)128*j4);
__m512i wts20 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)192+(ptrdiff_t)242890752*i8+(ptrdiff_t)320*j4);
__m512 wtLo43 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts17));
__m512 wtHi43 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts17, 1));
__m512 wtLo44 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts18));
__m512 wtHi44 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts18, 1));
__m512 wtLo45 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts19));
__m512 wtHi45 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts19, 1));
__m512 wtLo46 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts20));
__m512 wtHi46 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts20, 1));
// 5 channels x 2 data vectors: the accumulators wrap around, so each
// sum sees both dat3 and dat4 with that channel's weight halves.
sum18 = _mm512_fmadd_ps(wtLo43, dat3, sum18);
sum19 = _mm512_fmadd_ps(wtHi43, dat3, sum19);
sum20 = _mm512_fmadd_ps(wtLo44, dat3, sum20);
sum21 = _mm512_fmadd_ps(wtHi44, dat3, sum21);
sum22 = _mm512_fmadd_ps(wtLo45, dat3, sum22);
sum18 = _mm512_fmadd_ps(wtHi45, dat4, sum18);
sum19 = _mm512_fmadd_ps(wtLo46, dat4, sum19);
sum20 = _mm512_fmadd_ps(wtHi46, dat4, sum20);
__m512i wts21 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)256+(ptrdiff_t)242890752*i8+(ptrdiff_t)320*j4);
__m512 wtLo47 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts21));
__m512 wtHi47 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts21, 1));
sum21 = _mm512_fmadd_ps(wtLo47, dat4, sum21);
sum22 = _mm512_fmadd_ps(wtHi47, dat4, sum22);
}
// Remainder: 16 full floats (dat5) plus 10 masked floats (dat6).
__m512i wts22 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)0+(ptrdiff_t)242890752*i8+(ptrdiff_t)320*237197);
__m512 dat5 = _mm512_maskz_loadu_ps(65535, datPtr1+(ptrdiff_t)0+(ptrdiff_t)128*237197);
__m512i wts23 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)64+(ptrdiff_t)242890752*i8+(ptrdiff_t)320*237197);
__m512i wts24 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)128+(ptrdiff_t)242890752*i8+(ptrdiff_t)320*237197);
__m512 dat6 = _mm512_maskz_loadu_ps(1023, datPtr1+(ptrdiff_t)64+(ptrdiff_t)128*237197);
__m512i wts25 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)192+(ptrdiff_t)242890752*i8+(ptrdiff_t)320*237197);
__m512 wtLo48 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts22));
__m512 wtHi48 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts22, 1));
__m512 wtLo49 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts23));
__m512 wtHi49 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts23, 1));
__m512 wtLo50 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts24));
__m512 wtHi50 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts24, 1));
__m512 wtLo51 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts25));
__m512 wtHi51 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts25, 1));
sum18 = _mm512_fmadd_ps(wtLo48, dat5, sum18);
sum19 = _mm512_fmadd_ps(wtHi48, dat5, sum19);
sum20 = _mm512_fmadd_ps(wtLo49, dat5, sum20);
sum21 = _mm512_fmadd_ps(wtHi49, dat5, sum21);
sum22 = _mm512_fmadd_ps(wtLo50, dat5, sum22);
sum18 = _mm512_fmadd_ps(wtHi50, dat6, sum18);
sum19 = _mm512_fmadd_ps(wtLo51, dat6, sum19);
sum20 = _mm512_fmadd_ps(wtHi51, dat6, sum20);
__m512i wts26 = _mm512_maskz_loadu_epi32(65535, wtPtr1+(ptrdiff_t)256+(ptrdiff_t)242890752*i8+(ptrdiff_t)320*237197);
__m512 wtLo52 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts26));
__m512 wtHi52 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts26, 1));
sum21 = _mm512_fmadd_ps(wtLo52, dat6, sum21);
sum22 = _mm512_fmadd_ps(wtHi52, dat6, sum22);
// 5 biases (mask 31), stored 20 bytes (5 floats) into the bias area.
__m512 bias4 = _mm512_maskz_loadu_ps(31, biasPtr1+(ptrdiff_t)0+(ptrdiff_t)20*i8);
// Reduction network for 5 accumulators, analogous to the 16-channel
// path; pmEven1/pmOdd1 perform the final even/odd interleave-add.
__m512i pmEven1 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmOdd1 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512i pm4Lo2 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
__m512i pm4Hi2 = _mm512_set_epi32(31, 30, 29, 28, 15, 14, 13, 12, 23, 22, 21, 20, 7, 6, 5, 4);
__m512 upper18 = _mm512_shuffle_f32x4(sum18, sum22, 238);
__m512 upper19 = _mm512_shuffle_f32x4(sum20, sum20, 14);
sum18 = _mm512_shuffle_f32x4(sum18, sum22, 68);
sum18 = _mm512_add_ps(sum18, upper18);
sum20 = _mm512_add_ps(sum20, upper19);
__m512 upper21 = _mm512_shuffle_f32x4(sum19, sum19, 14);
__m512 upper22 = _mm512_shuffle_f32x4(sum21, sum21, 14);
sum19 = _mm512_add_ps(sum19, upper21);
sum21 = _mm512_add_ps(sum21, upper22);
__m512 upper17 = _mm512_permutex2var_ps(sum18, pm4Hi2, sum20);
__m512 upper20 = _mm512_permutex2var_ps(sum19, pm4Hi2, sum21);
sum18 = _mm512_permutex2var_ps(sum18, pm4Lo2, sum20);
sum19 = _mm512_permutex2var_ps(sum19, pm4Lo2, sum21);
sum18 = _mm512_add_ps(sum18, upper17);
sum19 = _mm512_add_ps(sum19, upper20);
__m512 upper16 = _mm512_shuffle_ps(sum18, sum19, 238);
sum18 = _mm512_shuffle_ps(sum18, sum19, 68);
sum18 = _mm512_add_ps(sum18, upper16);
__m512 upper23 = _mm512_permutexvar_ps(pmOdd1, sum18);
sum18 = _mm512_permutexvar_ps(pmEven1, sum18);
sum18 = _mm512_add_ps(sum18, upper23);
sum18 = _mm512_add_ps(sum18, bias4);
// Store the final 5 output channels (mask 31 = 5 lanes).
_mm512_mask_storeu_ps(datPtr2+(ptrdiff_t)0+(ptrdiff_t)20*i8, 31, sum18);
}
}

// Run the fully connected layer: dispatch the apply kernel over a
// one-dimensional task space of 45 slices on the given thread team.
static void Example18FcApply1(Example18ThreaderTeam1* team15, char** tensors3) {
  Example18ThreaderTask1 apply;
  apply.nd1 = 1;
  apply.hull1[0] = 45;
  apply.any1 = tensors3;
  apply.callee1 = Example18FcApply1Callee1;
  Example18ThreaderDo1(team15, &apply);
}

// Trained-parameter storage for one network instance; created by
// Example18NetCreate, released by Example18NetDestroy.
struct Example18Net {
char* alloc1; // raw malloc'd backing buffer (owned; freed on destroy)
char* align1; // alloc1 rounded up to a 64-byte boundary; the arranged weights/biases live here
};

// Release a network created by Example18NetCreate. Capture the
// parameter buffer before the struct itself is freed; the relative
// order of the two frees is not observable.
void Example18NetDestroy(Example18Net* net2) {
  char* backing = net2->alloc1;
  free(net2);
  free(backing);
}

// Build an Example18Net: allocate the 64-byte-aligned parameter arena,
// then rearrange the caller's weights/biases into it using a temporary
// thread team. Returns NULL on success (with *net1 set) or an error
// message string; on failure all allocations made here are released.
char* Example18NetCreate(
Example18Net** net1,
Example18Params* params1,
ptrdiff_t threads1
) {
  // The generated kernels require AVX-512 foundation instructions.
  if (__builtin_expect(!__builtin_cpu_supports("avx512f"), 0)) {
    return Example18Errmsg1(__LINE__, "CPU does not support AVX512F");
  }
  char* arena = malloc(10763099347);
  if (__builtin_expect(!arena, 0)) {
    return Example18Errmsg1(__LINE__, "errno %d", errno);
  }
  // Round up to the next 64-byte boundary for aligned vector access.
  char* arenaAligned = (void*)(((size_t)arena+63)&-64);
  Example18ThreaderTeam1* arrangeTeam = 0;
  char* teamErr = Example18ThreaderCreate1(&arrangeTeam, threads1);
  if (__builtin_expect(!!teamErr, 0)) {
    free(arena);
    return teamErr;
  }
  {
    char* tensors6[] = {
      (char*)params1->outWeights,
      (char*)params1->outBiases,
      arenaAligned+0
    };
    Example18FcArrange1(arrangeTeam, tensors6);
  }
  Example18ThreaderDestroy1(arrangeTeam);
  Example18Net* created = malloc(sizeof(Example18Net));
  if (__builtin_expect(!created, 0)) {
    char* oomMsg = Example18Errmsg1(__LINE__, "errno %d", errno);
    free(arena);
    return oomMsg;
  }
  created->alloc1 = arena;
  created->align1 = arenaAligned;
  *net1 = created;
  return 0;
}

// Per-inference-context state: a worker thread team plus scratch
// memory, referencing (not owning) the shared network parameters.
struct Example18Engine {
Example18Net* net3; // shared parameters; NOT freed by Example18EngineDestroy
Example18ThreaderTeam1* team11; // worker team created in Example18EngineCreate
char* alloc2; // raw scratch allocation (63 bytes of alignment slack)
char* align2; // alloc2 rounded up to a 64-byte boundary
};

// Expose the pthread_t of worker idx2 in this engine's thread team
// (forwards to the threader; returns its error string or NULL).
char* Example18EnginePthreadT(
Example18Engine* eng2,
ptrdiff_t idx2,
pthread_t* to1
) {
  Example18ThreaderTeam1* workers = eng2->team11;
  return Example18ThreaderPthreadT1(to1, workers, idx2);
}

// Tear down an engine: stop the thread team first, then release the
// scratch buffer and the struct. The referenced Example18Net is left
// untouched (it is owned by the caller).
void Example18EngineDestroy(Example18Engine* eng3) {
  char* scratch = eng3->alloc2;
  Example18ThreaderDestroy1(eng3->team11);
  free(scratch);
  free(eng3);
}

// Create an inference engine over an existing net: allocate the
// 64-byte-aligned scratch area and spin up a thread team. Returns
// NULL on success (with *eng4 set) or an error message string; on
// failure everything allocated here is released.
char* Example18EngineCreate(
Example18Engine** eng4,
Example18Net* net4,
ptrdiff_t threads2
) {
  Example18Engine* engine = malloc(sizeof(Example18Engine));
  if (__builtin_expect(!engine, 0)) {
    return Example18Errmsg1(__LINE__, "errno %d", errno);
  }
  // 63 bytes of slack guarantees a 64-byte-aligned pointer inside.
  char* scratch = malloc(63);
  if (__builtin_expect(!scratch, 0)) {
    char* oomMsg = Example18Errmsg1(__LINE__, "errno %d", errno);
    free(engine);
    return oomMsg;
  }
  engine->alloc2 = scratch;
  engine->align2 = (void*)(((size_t)scratch+63)&-64);
  char* teamErr = Example18ThreaderCreate1(&engine->team11, threads2);
  if (__builtin_expect(!!teamErr, 0)) {
    free(scratch);
    free(engine);
    return teamErr;
  }
  engine->net3 = net4;
  *eng4 = engine;
  return 0;
}

// Run one forward pass: inData is the flattened input tensor,
// outData receives the fully connected layer's output. The kernel's
// tensor list is [arranged parameters, input, output].
void Example18EngineInference(
Example18Engine* eng1,
float* inData,
float* outData
) {
  char* arrangedParams = eng1->net3->align1;
  char* tensors5[] = {
    arrangedParams+0,
    (char*)inData,
    (char*)outData
  };
  Example18FcApply1(eng1->team11, tensors5);
}

// End of file.

Top