NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=ResNet50 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=image Channels=3 Height=224 Width=224
Conv FromTensor=image ToTensor=sevenDS ToChannels=64 FilterH=7 FilterW=7 StrideH=2 StrideW=2 PaddingH=3 PaddingW=3 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=sevenDS ToTensor=bn1 Epsilon=0.00001
Activation FromTensor=bn1 ToTensor=relu1 Kind=ReLU Param=0
Pooling FromTensor=relu1 ToTensor=pool1 Kind=Max3x3Stride2 PaddingH=1 PaddingW=1
Conv FromTensor=pool1 ToTensor=one1 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one1 ToTensor=bn2 Epsilon=0.00001
Conv FromTensor=pool1 ToTensor=one2 ToChannels=64 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one2 ToTensor=bn3 Epsilon=0.00001
Activation FromTensor=bn3 ToTensor=relu2 Kind=ReLU Param=0
Conv FromTensor=relu2 ToTensor=three1 ToChannels=64 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three1 ToTensor=bn4 Epsilon=0.00001
Activation FromTensor=bn4 ToTensor=relu3 Kind=ReLU Param=0
Conv FromTensor=relu3 ToTensor=one3 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one3 ToTensor=bn5 Epsilon=0.00001
Add FromTensor1=bn2 FromTensor2=bn5 ToTensor=add1
Activation FromTensor=add1 ToTensor=relu4 Kind=ReLU Param=0
Conv FromTensor=relu4 ToTensor=one4 ToChannels=64 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one4 ToTensor=bn6 Epsilon=0.00001
Activation FromTensor=bn6 ToTensor=relu5 Kind=ReLU Param=0
Conv FromTensor=relu5 ToTensor=three2 ToChannels=64 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three2 ToTensor=bn7 Epsilon=0.00001
Activation FromTensor=bn7 ToTensor=relu6 Kind=ReLU Param=0
Conv FromTensor=relu6 ToTensor=one5 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one5 ToTensor=bn8 Epsilon=0.00001
Add FromTensor1=relu4 FromTensor2=bn8 ToTensor=add2
Activation FromTensor=add2 ToTensor=relu7 Kind=ReLU Param=0
Conv FromTensor=relu7 ToTensor=one6 ToChannels=64 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one6 ToTensor=bn9 Epsilon=0.00001
Activation FromTensor=bn9 ToTensor=relu8 Kind=ReLU Param=0
Conv FromTensor=relu8 ToTensor=three3 ToChannels=64 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three3 ToTensor=bn10 Epsilon=0.00001
Activation FromTensor=bn10 ToTensor=relu9 Kind=ReLU Param=0
Conv FromTensor=relu9 ToTensor=one7 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one7 ToTensor=bn11 Epsilon=0.00001
Add FromTensor1=relu7 FromTensor2=bn11 ToTensor=add3
Activation FromTensor=add3 ToTensor=relu10 Kind=ReLU Param=0
Conv FromTensor=relu10 ToTensor=oneDS1 ToChannels=512 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS1 ToTensor=bn12 Epsilon=0.00001
Conv FromTensor=relu10 ToTensor=oneDS2 ToChannels=128 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS2 ToTensor=bn13 Epsilon=0.00001
Activation FromTensor=bn13 ToTensor=relu11 Kind=ReLU Param=0
Conv FromTensor=relu11 ToTensor=three4 ToChannels=128 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three4 ToTensor=bn14 Epsilon=0.00001
Activation FromTensor=bn14 ToTensor=relu12 Kind=ReLU Param=0
Conv FromTensor=relu12 ToTensor=one8 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one8 ToTensor=bn15 Epsilon=0.00001
Add FromTensor1=bn12 FromTensor2=bn15 ToTensor=add4
Activation FromTensor=add4 ToTensor=relu13 Kind=ReLU Param=0
Conv FromTensor=relu13 ToTensor=one9 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one9 ToTensor=bn16 Epsilon=0.00001
Activation FromTensor=bn16 ToTensor=relu14 Kind=ReLU Param=0
Conv FromTensor=relu14 ToTensor=three5 ToChannels=128 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three5 ToTensor=bn17 Epsilon=0.00001
Activation FromTensor=bn17 ToTensor=relu15 Kind=ReLU Param=0
Conv FromTensor=relu15 ToTensor=one10 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one10 ToTensor=bn18 Epsilon=0.00001
Add FromTensor1=relu13 FromTensor2=bn18 ToTensor=add5
Activation FromTensor=add5 ToTensor=relu16 Kind=ReLU Param=0
Conv FromTensor=relu16 ToTensor=one11 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one11 ToTensor=bn19 Epsilon=0.00001
Activation FromTensor=bn19 ToTensor=relu17 Kind=ReLU Param=0
Conv FromTensor=relu17 ToTensor=three6 ToChannels=128 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three6 ToTensor=bn20 Epsilon=0.00001
Activation FromTensor=bn20 ToTensor=relu18 Kind=ReLU Param=0
Conv FromTensor=relu18 ToTensor=one12 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one12 ToTensor=bn21 Epsilon=0.00001
Add FromTensor1=relu16 FromTensor2=bn21 ToTensor=add6
Activation FromTensor=add6 ToTensor=relu19 Kind=ReLU Param=0
Conv FromTensor=relu19 ToTensor=one13 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one13 ToTensor=bn22 Epsilon=0.00001
Activation FromTensor=bn22 ToTensor=relu20 Kind=ReLU Param=0
Conv FromTensor=relu20 ToTensor=three7 ToChannels=128 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three7 ToTensor=bn23 Epsilon=0.00001
Activation FromTensor=bn23 ToTensor=relu21 Kind=ReLU Param=0
Conv FromTensor=relu21 ToTensor=one14 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one14 ToTensor=bn24 Epsilon=0.00001
Add FromTensor1=relu19 FromTensor2=bn24 ToTensor=add7
Activation FromTensor=add7 ToTensor=relu22 Kind=ReLU Param=0
Conv FromTensor=relu22 ToTensor=oneDS3 ToChannels=1024 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS3 ToTensor=bn25 Epsilon=0.00001
Conv FromTensor=relu22 ToTensor=oneDS4 ToChannels=256 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS4 ToTensor=bn26 Epsilon=0.00001
Activation FromTensor=bn26 ToTensor=relu23 Kind=ReLU Param=0
Conv FromTensor=relu23 ToTensor=three8 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three8 ToTensor=bn27 Epsilon=0.00001
Activation FromTensor=bn27 ToTensor=relu24 Kind=ReLU Param=0
Conv FromTensor=relu24 ToTensor=one15 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one15 ToTensor=bn28 Epsilon=0.00001
Add FromTensor1=bn25 FromTensor2=bn28 ToTensor=add8
Activation FromTensor=add8 ToTensor=relu25 Kind=ReLU Param=0
Conv FromTensor=relu25 ToTensor=one16 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one16 ToTensor=bn29 Epsilon=0.00001
Activation FromTensor=bn29 ToTensor=relu26 Kind=ReLU Param=0
Conv FromTensor=relu26 ToTensor=three9 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three9 ToTensor=bn30 Epsilon=0.00001
Activation FromTensor=bn30 ToTensor=relu27 Kind=ReLU Param=0
Conv FromTensor=relu27 ToTensor=one17 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one17 ToTensor=bn31 Epsilon=0.00001
Add FromTensor1=relu25 FromTensor2=bn31 ToTensor=add9
Activation FromTensor=add9 ToTensor=relu28 Kind=ReLU Param=0
Conv FromTensor=relu28 ToTensor=one18 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one18 ToTensor=bn32 Epsilon=0.00001
Activation FromTensor=bn32 ToTensor=relu29 Kind=ReLU Param=0
Conv FromTensor=relu29 ToTensor=three10 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three10 ToTensor=bn33 Epsilon=0.00001
Activation FromTensor=bn33 ToTensor=relu30 Kind=ReLU Param=0
Conv FromTensor=relu30 ToTensor=one19 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one19 ToTensor=bn34 Epsilon=0.00001
Add FromTensor1=relu28 FromTensor2=bn34 ToTensor=add10
Activation FromTensor=add10 ToTensor=relu31 Kind=ReLU Param=0
Conv FromTensor=relu31 ToTensor=one20 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one20 ToTensor=bn35 Epsilon=0.00001
Activation FromTensor=bn35 ToTensor=relu32 Kind=ReLU Param=0
Conv FromTensor=relu32 ToTensor=three11 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three11 ToTensor=bn36 Epsilon=0.00001
Activation FromTensor=bn36 ToTensor=relu33 Kind=ReLU Param=0
Conv FromTensor=relu33 ToTensor=one21 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one21 ToTensor=bn37 Epsilon=0.00001
Add FromTensor1=relu31 FromTensor2=bn37 ToTensor=add11
Activation FromTensor=add11 ToTensor=relu34 Kind=ReLU Param=0
Conv FromTensor=relu34 ToTensor=one22 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one22 ToTensor=bn38 Epsilon=0.00001
Activation FromTensor=bn38 ToTensor=relu35 Kind=ReLU Param=0
Conv FromTensor=relu35 ToTensor=three12 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three12 ToTensor=bn39 Epsilon=0.00001
Activation FromTensor=bn39 ToTensor=relu36 Kind=ReLU Param=0
Conv FromTensor=relu36 ToTensor=one23 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one23 ToTensor=bn40 Epsilon=0.00001
Add FromTensor1=relu34 FromTensor2=bn40 ToTensor=add12
Activation FromTensor=add12 ToTensor=relu37 Kind=ReLU Param=0
Conv FromTensor=relu37 ToTensor=one24 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one24 ToTensor=bn41 Epsilon=0.00001
Activation FromTensor=bn41 ToTensor=relu38 Kind=ReLU Param=0
Conv FromTensor=relu38 ToTensor=three13 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three13 ToTensor=bn42 Epsilon=0.00001
Activation FromTensor=bn42 ToTensor=relu39 Kind=ReLU Param=0
Conv FromTensor=relu39 ToTensor=one25 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one25 ToTensor=bn43 Epsilon=0.00001
Add FromTensor1=relu37 FromTensor2=bn43 ToTensor=add13
Activation FromTensor=add13 ToTensor=relu40 Kind=ReLU Param=0
Conv FromTensor=relu40 ToTensor=oneDS5 ToChannels=2048 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS5 ToTensor=bn44 Epsilon=0.00001
Conv FromTensor=relu40 ToTensor=oneDS6 ToChannels=512 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS6 ToTensor=bn45 Epsilon=0.00001
Activation FromTensor=bn45 ToTensor=relu41 Kind=ReLU Param=0
Conv FromTensor=relu41 ToTensor=three14 ToChannels=512 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three14 ToTensor=bn46 Epsilon=0.00001
Activation FromTensor=bn46 ToTensor=relu42 Kind=ReLU Param=0
Conv FromTensor=relu42 ToTensor=one26 ToChannels=2048 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one26 ToTensor=bn47 Epsilon=0.00001
Add FromTensor1=bn44 FromTensor2=bn47 ToTensor=add14
Activation FromTensor=add14 ToTensor=relu43 Kind=ReLU Param=0
Conv FromTensor=relu43 ToTensor=one27 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one27 ToTensor=bn48 Epsilon=0.00001
Activation FromTensor=bn48 ToTensor=relu44 Kind=ReLU Param=0
Conv FromTensor=relu44 ToTensor=three15 ToChannels=512 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three15 ToTensor=bn49 Epsilon=0.00001
Activation FromTensor=bn49 ToTensor=relu45 Kind=ReLU Param=0
Conv FromTensor=relu45 ToTensor=one28 ToChannels=2048 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one28 ToTensor=bn50 Epsilon=0.00001
Add FromTensor1=relu43 FromTensor2=bn50 ToTensor=add15
Activation FromTensor=add15 ToTensor=relu46 Kind=ReLU Param=0
Conv FromTensor=relu46 ToTensor=one29 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one29 ToTensor=bn51 Epsilon=0.00001
Activation FromTensor=bn51 ToTensor=relu47 Kind=ReLU Param=0
Conv FromTensor=relu47 ToTensor=three16 ToChannels=512 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three16 ToTensor=bn52 Epsilon=0.00001
Activation FromTensor=bn52 ToTensor=relu48 Kind=ReLU Param=0
Conv FromTensor=relu48 ToTensor=one30 ToChannels=2048 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one30 ToTensor=bn53 Epsilon=0.00001
Add FromTensor1=relu46 FromTensor2=bn53 ToTensor=add16
Activation FromTensor=add16 ToTensor=relu49 Kind=ReLU Param=0
Pooling FromTensor=relu49 ToTensor=pool2 Kind=AvgGlobal PaddingH=0 PaddingW=0
FullyConnected FromTensor=pool2 ToTensor=fc ToChannels=1000
Softmax FromTensor=fc ToTensor=prob
Output FromTensor=prob

Top || Output ResNet50.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(ResNet50Params);
// ResNet50Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "rb"); // Binary mode: raw floats.
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

// Forward declaration so the API below can mention the parameter struct
// by pointer; the struct itself is defined later in this header.
typedef struct ResNet50Params ResNet50Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// ResNet50Params* params = malloc(sizeof(ResNet50Params));
//
// ... Load params (read from a file, perhaps) ...
//
// ResNet50Net* net; // For example, 4 threads:
// char* err = ResNet50NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// ResNet50NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

// Forward declaration of the Net type. The functions declared in this
// header only ever pass a Net around by pointer.
typedef struct ResNet50Net ResNet50Net;

// Creates a Net from the trained parameters and stores it through the
// first argument. A nonzero return value is an error string describing
// the failure (the caller frees it with free()); in that case the Net
// pointer is left unmodified. The parameter struct is not modified and
// is no longer needed once the Net has been created.
char* ResNet50NetCreate(
ResNet50Net**, // Out: receives the newly created Net on success.
ResNet50Params*, // In: trained parameters (read only, not retained).
ptrdiff_t threads // Number of temporary threads used to build the Net.
);

// Destroys a Net to free its memory. Call this only after all inference
// that depends on the Net is complete.
void ResNet50NetDestroy(ResNet50Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// ResNet50Net* net;
//
// ... Create net ...
//
// ResNet50Engine* engine; // For example, 4 inference threads:
// char* err = ResNet50EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// ResNet50EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = ResNet50EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* imageData = malloc(sizeof(float)*3*224*224);
// float* probData = malloc(sizeof(float)*1000*1*1);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// ResNet50EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// imageData, // The tensor arguments are sorted by name.
// probData
// );
//
// ... Read the output floats ...
//
// }
//
// free(imageData);
// free(probData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

// Forward declaration of the Engine type. The functions declared in
// this header only ever pass an Engine around by pointer.
typedef struct ResNet50Engine ResNet50Engine;

// Creates an Engine (inference threads, scratch memory, and a pointer
// to the Net) and stores it through the first argument. A nonzero
// return value is an error string describing the failure (the caller
// frees it with free()); in that case the Engine pointer is left
// unmodified. The Net must outlive every Engine that points to it.
char* ResNet50EngineCreate(
ResNet50Engine**, // Out: receives the newly created Engine on success.
ResNet50Net*, // In: the Net to run; it may be shared by many Engines.
ptrdiff_t threads // Number of inference threads (ideally <= CPU cores).
);

// Looks up the pthread_t identifier of one of the Engine's inference
// threads so the caller can adjust that thread with the POSIX threads
// API (for example, to set its CPU affinity mask). Fails (nonzero
// return) only if the thread index is invalid.
// NOTE(review): the returned error string presumably must be freed
// like the ones from the Create functions -- confirm.
char* ResNet50EnginePthreadT(
ResNet50Engine*, // In: the Engine whose thread is being queried.
ptrdiff_t threadIdx, // In: thread index, 0 through N-1 for N threads.
pthread_t* to // Out: receives the identifier of that thread.
);

// Performs inference: reads floats from the input tensor and writes
// floats to the output tensor. Both tensors are owned (allocated and
// freed) by the caller and are in CHW format, 32-bit floating point,
// fully packed with no padding. This function cannot fail. The tensor
// parameters are ordered by name (bytewise lexical sort).
void ResNet50EngineInference(
ResNet50Engine*, // In: the Engine that performs the inference.
float* imageData, // In: input tensor, 3x224x224 floats.
float* probData // Out: output tensor, 1000x1x1 floats.
);

// Destroys an Engine: terminates its inference threads and frees its
// memory. The Net that the Engine points to is destroyed separately
// (see ResNet50NetDestroy).
void ResNet50EngineDestroy(ResNet50Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

struct ResNet50Params {
float bn10Means[64]; // 1x64x1x1
float bn10Scales[64]; // 1x64x1x1
float bn10Shifts[64]; // 1x64x1x1
float bn10Variances[64]; // 1x64x1x1
float bn11Means[256]; // 1x256x1x1
float bn11Scales[256]; // 1x256x1x1
float bn11Shifts[256]; // 1x256x1x1
float bn11Variances[256]; // 1x256x1x1
float bn12Means[512]; // 1x512x1x1
float bn12Scales[512]; // 1x512x1x1
float bn12Shifts[512]; // 1x512x1x1
float bn12Variances[512]; // 1x512x1x1
float bn13Means[128]; // 1x128x1x1
float bn13Scales[128]; // 1x128x1x1
float bn13Shifts[128]; // 1x128x1x1
float bn13Variances[128]; // 1x128x1x1
float bn14Means[128]; // 1x128x1x1
float bn14Scales[128]; // 1x128x1x1
float bn14Shifts[128]; // 1x128x1x1
float bn14Variances[128]; // 1x128x1x1
float bn15Means[512]; // 1x512x1x1
float bn15Scales[512]; // 1x512x1x1
float bn15Shifts[512]; // 1x512x1x1
float bn15Variances[512]; // 1x512x1x1
float bn16Means[128]; // 1x128x1x1
float bn16Scales[128]; // 1x128x1x1
float bn16Shifts[128]; // 1x128x1x1
float bn16Variances[128]; // 1x128x1x1
float bn17Means[128]; // 1x128x1x1
float bn17Scales[128]; // 1x128x1x1
float bn17Shifts[128]; // 1x128x1x1
float bn17Variances[128]; // 1x128x1x1
float bn18Means[512]; // 1x512x1x1
float bn18Scales[512]; // 1x512x1x1
float bn18Shifts[512]; // 1x512x1x1
float bn18Variances[512]; // 1x512x1x1
float bn19Means[128]; // 1x128x1x1
float bn19Scales[128]; // 1x128x1x1
float bn19Shifts[128]; // 1x128x1x1
float bn19Variances[128]; // 1x128x1x1
float bn1Means[64]; // 1x64x1x1
float bn1Scales[64]; // 1x64x1x1
float bn1Shifts[64]; // 1x64x1x1
float bn1Variances[64]; // 1x64x1x1
float bn20Means[128]; // 1x128x1x1
float bn20Scales[128]; // 1x128x1x1
float bn20Shifts[128]; // 1x128x1x1
float bn20Variances[128]; // 1x128x1x1
float bn21Means[512]; // 1x512x1x1
float bn21Scales[512]; // 1x512x1x1
float bn21Shifts[512]; // 1x512x1x1
float bn21Variances[512]; // 1x512x1x1
float bn22Means[128]; // 1x128x1x1
float bn22Scales[128]; // 1x128x1x1
float bn22Shifts[128]; // 1x128x1x1
float bn22Variances[128]; // 1x128x1x1
float bn23Means[128]; // 1x128x1x1
float bn23Scales[128]; // 1x128x1x1
float bn23Shifts[128]; // 1x128x1x1
float bn23Variances[128]; // 1x128x1x1
float bn24Means[512]; // 1x512x1x1
float bn24Scales[512]; // 1x512x1x1
float bn24Shifts[512]; // 1x512x1x1
float bn24Variances[512]; // 1x512x1x1
float bn25Means[1024]; // 1x1024x1x1
float bn25Scales[1024]; // 1x1024x1x1
float bn25Shifts[1024]; // 1x1024x1x1
float bn25Variances[1024]; // 1x1024x1x1
float bn26Means[256]; // 1x256x1x1
float bn26Scales[256]; // 1x256x1x1
float bn26Shifts[256]; // 1x256x1x1
float bn26Variances[256]; // 1x256x1x1
float bn27Means[256]; // 1x256x1x1
float bn27Scales[256]; // 1x256x1x1
float bn27Shifts[256]; // 1x256x1x1
float bn27Variances[256]; // 1x256x1x1
float bn28Means[1024]; // 1x1024x1x1
float bn28Scales[1024]; // 1x1024x1x1
float bn28Shifts[1024]; // 1x1024x1x1
float bn28Variances[1024]; // 1x1024x1x1
float bn29Means[256]; // 1x256x1x1
float bn29Scales[256]; // 1x256x1x1
float bn29Shifts[256]; // 1x256x1x1
float bn29Variances[256]; // 1x256x1x1
float bn2Means[256]; // 1x256x1x1
float bn2Scales[256]; // 1x256x1x1
float bn2Shifts[256]; // 1x256x1x1
float bn2Variances[256]; // 1x256x1x1
float bn30Means[256]; // 1x256x1x1
float bn30Scales[256]; // 1x256x1x1
float bn30Shifts[256]; // 1x256x1x1
float bn30Variances[256]; // 1x256x1x1
float bn31Means[1024]; // 1x1024x1x1
float bn31Scales[1024]; // 1x1024x1x1
float bn31Shifts[1024]; // 1x1024x1x1
float bn31Variances[1024]; // 1x1024x1x1
float bn32Means[256]; // 1x256x1x1
float bn32Scales[256]; // 1x256x1x1
float bn32Shifts[256]; // 1x256x1x1
float bn32Variances[256]; // 1x256x1x1
float bn33Means[256]; // 1x256x1x1
float bn33Scales[256]; // 1x256x1x1
float bn33Shifts[256]; // 1x256x1x1
float bn33Variances[256]; // 1x256x1x1
float bn34Means[1024]; // 1x1024x1x1
float bn34Scales[1024]; // 1x1024x1x1
float bn34Shifts[1024]; // 1x1024x1x1
float bn34Variances[1024]; // 1x1024x1x1
float bn35Means[256]; // 1x256x1x1
float bn35Scales[256]; // 1x256x1x1
float bn35Shifts[256]; // 1x256x1x1
float bn35Variances[256]; // 1x256x1x1
float bn36Means[256]; // 1x256x1x1
float bn36Scales[256]; // 1x256x1x1
float bn36Shifts[256]; // 1x256x1x1
float bn36Variances[256]; // 1x256x1x1
float bn37Means[1024]; // 1x1024x1x1
float bn37Scales[1024]; // 1x1024x1x1
float bn37Shifts[1024]; // 1x1024x1x1
float bn37Variances[1024]; // 1x1024x1x1
float bn38Means[256]; // 1x256x1x1
float bn38Scales[256]; // 1x256x1x1
float bn38Shifts[256]; // 1x256x1x1
float bn38Variances[256]; // 1x256x1x1
float bn39Means[256]; // 1x256x1x1
float bn39Scales[256]; // 1x256x1x1
float bn39Shifts[256]; // 1x256x1x1
float bn39Variances[256]; // 1x256x1x1
float bn3Means[64]; // 1x64x1x1
float bn3Scales[64]; // 1x64x1x1
float bn3Shifts[64]; // 1x64x1x1
float bn3Variances[64]; // 1x64x1x1
float bn40Means[1024]; // 1x1024x1x1
float bn40Scales[1024]; // 1x1024x1x1
float bn40Shifts[1024]; // 1x1024x1x1
float bn40Variances[1024]; // 1x1024x1x1
float bn41Means[256]; // 1x256x1x1
float bn41Scales[256]; // 1x256x1x1
float bn41Shifts[256]; // 1x256x1x1
float bn41Variances[256]; // 1x256x1x1
float bn42Means[256]; // 1x256x1x1
float bn42Scales[256]; // 1x256x1x1
float bn42Shifts[256]; // 1x256x1x1
float bn42Variances[256]; // 1x256x1x1
float bn43Means[1024]; // 1x1024x1x1
float bn43Scales[1024]; // 1x1024x1x1
float bn43Shifts[1024]; // 1x1024x1x1
float bn43Variances[1024]; // 1x1024x1x1
float bn44Means[2048]; // 1x2048x1x1
float bn44Scales[2048]; // 1x2048x1x1
float bn44Shifts[2048]; // 1x2048x1x1
float bn44Variances[2048]; // 1x2048x1x1
float bn45Means[512]; // 1x512x1x1
float bn45Scales[512]; // 1x512x1x1
float bn45Shifts[512]; // 1x512x1x1
float bn45Variances[512]; // 1x512x1x1
float bn46Means[512]; // 1x512x1x1
float bn46Scales[512]; // 1x512x1x1
float bn46Shifts[512]; // 1x512x1x1
float bn46Variances[512]; // 1x512x1x1
float bn47Means[2048]; // 1x2048x1x1
float bn47Scales[2048]; // 1x2048x1x1
float bn47Shifts[2048]; // 1x2048x1x1
float bn47Variances[2048]; // 1x2048x1x1
float bn48Means[512]; // 1x512x1x1
float bn48Scales[512]; // 1x512x1x1
float bn48Shifts[512]; // 1x512x1x1
float bn48Variances[512]; // 1x512x1x1
float bn49Means[512]; // 1x512x1x1
float bn49Scales[512]; // 1x512x1x1
float bn49Shifts[512]; // 1x512x1x1
float bn49Variances[512]; // 1x512x1x1
float bn4Means[64]; // 1x64x1x1
float bn4Scales[64]; // 1x64x1x1
float bn4Shifts[64]; // 1x64x1x1
float bn4Variances[64]; // 1x64x1x1
float bn50Means[2048]; // 1x2048x1x1
float bn50Scales[2048]; // 1x2048x1x1
float bn50Shifts[2048]; // 1x2048x1x1
float bn50Variances[2048]; // 1x2048x1x1
float bn51Means[512]; // 1x512x1x1
float bn51Scales[512]; // 1x512x1x1
float bn51Shifts[512]; // 1x512x1x1
float bn51Variances[512]; // 1x512x1x1
float bn52Means[512]; // 1x512x1x1
float bn52Scales[512]; // 1x512x1x1
float bn52Shifts[512]; // 1x512x1x1
float bn52Variances[512]; // 1x512x1x1
float bn53Means[2048]; // 1x2048x1x1
float bn53Scales[2048]; // 1x2048x1x1
float bn53Shifts[2048]; // 1x2048x1x1
float bn53Variances[2048]; // 1x2048x1x1
float bn5Means[256]; // 1x256x1x1
float bn5Scales[256]; // 1x256x1x1
float bn5Shifts[256]; // 1x256x1x1
float bn5Variances[256]; // 1x256x1x1
float bn6Means[64]; // 1x64x1x1
float bn6Scales[64]; // 1x64x1x1
float bn6Shifts[64]; // 1x64x1x1
float bn6Variances[64]; // 1x64x1x1
float bn7Means[64]; // 1x64x1x1
float bn7Scales[64]; // 1x64x1x1
float bn7Shifts[64]; // 1x64x1x1
float bn7Variances[64]; // 1x64x1x1
float bn8Means[256]; // 1x256x1x1
float bn8Scales[256]; // 1x256x1x1
float bn8Shifts[256]; // 1x256x1x1
float bn8Variances[256]; // 1x256x1x1
float bn9Means[64]; // 1x64x1x1
float bn9Scales[64]; // 1x64x1x1
float bn9Shifts[64]; // 1x64x1x1
float bn9Variances[64]; // 1x64x1x1
float fcBiases[1000]; // 1x1000x1x1
float fcWeights[2048000]; // 1000x2048x1x1
float one10Biases[512]; // 1x512x1x1
float one10Weights[65536]; // 512x128x1x1
float one11Biases[128]; // 1x128x1x1
float one11Weights[65536]; // 128x512x1x1
float one12Biases[512]; // 1x512x1x1
float one12Weights[65536]; // 512x128x1x1
float one13Biases[128]; // 1x128x1x1
float one13Weights[65536]; // 128x512x1x1
float one14Biases[512]; // 1x512x1x1
float one14Weights[65536]; // 512x128x1x1
float one15Biases[1024]; // 1x1024x1x1
float one15Weights[262144]; // 1024x256x1x1
float one16Biases[256]; // 1x256x1x1
float one16Weights[262144]; // 256x1024x1x1
float one17Biases[1024]; // 1x1024x1x1
float one17Weights[262144]; // 1024x256x1x1
float one18Biases[256]; // 1x256x1x1
float one18Weights[262144]; // 256x1024x1x1
float one19Biases[1024]; // 1x1024x1x1
float one19Weights[262144]; // 1024x256x1x1
float one1Biases[256]; // 1x256x1x1
float one1Weights[16384]; // 256x64x1x1
float one20Biases[256]; // 1x256x1x1
float one20Weights[262144]; // 256x1024x1x1
float one21Biases[1024]; // 1x1024x1x1
float one21Weights[262144]; // 1024x256x1x1
float one22Biases[256]; // 1x256x1x1
float one22Weights[262144]; // 256x1024x1x1
float one23Biases[1024]; // 1x1024x1x1
float one23Weights[262144]; // 1024x256x1x1
float one24Biases[256]; // 1x256x1x1
float one24Weights[262144]; // 256x1024x1x1
float one25Biases[1024]; // 1x1024x1x1
float one25Weights[262144]; // 1024x256x1x1
float one26Biases[2048]; // 1x2048x1x1
float one26Weights[1048576]; // 2048x512x1x1
float one27Biases[512]; // 1x512x1x1
float one27Weights[1048576]; // 512x2048x1x1
float one28Biases[2048]; // 1x2048x1x1
float one28Weights[1048576]; // 2048x512x1x1
float one29Biases[512]; // 1x512x1x1
float one29Weights[1048576]; // 512x2048x1x1
float one2Biases[64]; // 1x64x1x1
float one2Weights[4096]; // 64x64x1x1
float one30Biases[2048]; // 1x2048x1x1
float one30Weights[1048576]; // 2048x512x1x1
float one3Biases[256]; // 1x256x1x1
float one3Weights[16384]; // 256x64x1x1
float one4Biases[64]; // 1x64x1x1
float one4Weights[16384]; // 64x256x1x1
float one5Biases[256]; // 1x256x1x1
float one5Weights[16384]; // 256x64x1x1
float one6Biases[64]; // 1x64x1x1
float one6Weights[16384]; // 64x256x1x1
float one7Biases[256]; // 1x256x1x1
float one7Weights[16384]; // 256x64x1x1
float one8Biases[512]; // 1x512x1x1
float one8Weights[65536]; // 512x128x1x1
float one9Biases[128]; // 1x128x1x1
float one9Weights[65536]; // 128x512x1x1
float oneDS1Biases[512]; // 1x512x1x1
float oneDS1Weights[131072]; // 512x256x1x1
float oneDS2Biases[128]; // 1x128x1x1
float oneDS2Weights[32768]; // 128x256x1x1
float oneDS3Biases[1024]; // 1x1024x1x1
float oneDS3Weights[524288]; // 1024x512x1x1
float oneDS4Biases[256]; // 1x256x1x1
float oneDS4Weights[131072]; // 256x512x1x1
float oneDS5Biases[2048]; // 1x2048x1x1
float oneDS5Weights[2097152]; // 2048x1024x1x1
float oneDS6Biases[512]; // 1x512x1x1
float oneDS6Weights[524288]; // 512x1024x1x1
float sevenDSBiases[64]; // 1x64x1x1
float sevenDSWeights[9408]; // 64x3x7x7
float three10Biases[256]; // 1x256x1x1
float three10Weights[589824]; // 256x256x3x3
float three11Biases[256]; // 1x256x1x1
float three11Weights[589824]; // 256x256x3x3
float three12Biases[256]; // 1x256x1x1
float three12Weights[589824]; // 256x256x3x3
float three13Biases[256]; // 1x256x1x1
float three13Weights[589824]; // 256x256x3x3
float three14Biases[512]; // 1x512x1x1
float three14Weights[2359296]; // 512x512x3x3
float three15Biases[512]; // 1x512x1x1
float three15Weights[2359296]; // 512x512x3x3
float three16Biases[512]; // 1x512x1x1
float three16Weights[2359296]; // 512x512x3x3
float three1Biases[64]; // 1x64x1x1
float three1Weights[36864]; // 64x64x3x3
float three2Biases[64]; // 1x64x1x1
float three2Weights[36864]; // 64x64x3x3
float three3Biases[64]; // 1x64x1x1
float three3Weights[36864]; // 64x64x3x3
float three4Biases[128]; // 1x128x1x1
float three4Weights[147456]; // 128x128x3x3
float three5Biases[128]; // 1x128x1x1
float three5Weights[147456]; // 128x128x3x3
float three6Biases[128]; // 1x128x1x1
float three6Weights[147456]; // 128x128x3x3
float three7Biases[128]; // 1x128x1x1
float three7Weights[147456]; // 128x128x3x3
float three8Biases[256]; // 1x256x1x1
float three8Weights[589824]; // 256x256x3x3
float three9Biases[256]; // 1x256x1x1
float three9Weights[589824]; // 256x256x3x3
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output ResNet50.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f ResNet50.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "ResNet50.h"

// Formats an error message into a freshly malloc'd 276-byte buffer.
// The text is "ResNet50: line <lineNum1>: " followed by the printf-style
// expansion of format1 and the variadic arguments, truncated to fit.
// Returns the buffer (allocated with malloc).
static char* ResNet50Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
    enum { capacity = 276 };
    char* text = malloc(capacity);
    // The prefix is short enough that it always fits; its length tells us
    // where the caller-supplied part starts.
    int used = sprintf(text, "ResNet50: line %td: ", lineNum1);
    va_list args;
    va_start(args, format1);
    vsnprintf(text+used, capacity-used, format1, args);
    va_end(args);
    return text;
}

// Forward declarations of the threader types defined below, plus the
// signature of the callback a task runs: it is called with the task itself
// and a pointer to an array of int64_t coordinates.
typedef struct ResNet50ThreaderTask1 ResNet50ThreaderTask1;
typedef void (*ResNet50ThreaderCallee1)(ResNet50ThreaderTask1*, int64_t*);
typedef struct ResNet50ThreaderHub1 ResNet50ThreaderHub1;
typedef struct ResNet50ThreaderNode1 ResNet50ThreaderNode1;
typedef struct ResNet50ThreaderUnwind1 ResNet50ThreaderUnwind1;
typedef struct ResNet50ThreaderTeam1 ResNet50ThreaderTeam1;

// Description of one parallel job handed to the thread team.
struct ResNet50ThreaderTask1 {
ResNet50ThreaderCallee1 callee1; // callback invoked once per point of the iteration space
void* any1; // opaque pointer, never read by the threader itself (presumably callee context -- confirm at call sites)
ptrdiff_t nd1; // number of entries of hull1 that are in use
int64_t hull1[4]; // extent of each dimension; the total point count is their product
};

// State shared by the whole team while a task is running.
// The worker loop and ResNet50ThreaderDo1 lock mut1 before touching the
// other fields.
struct ResNet50ThreaderHub1 {
pthread_mutex_t mut1; // guards every field below
pthread_cond_t cond1; // signaled by the worker that brings pending1 to zero
ptrdiff_t pending1; // workers that have not yet reported completion of the current task
ptrdiff_t offset1; // index of the status1 word where the next scan for a node to help starts
long mask1; // bit mask applied to status1[offset1] during that scan
long status1[]; // bitmap with one bit per node; a bit is cleared once that node's points are exhausted
};

// Per-thread state. Each worker owns one node; other workers may lock it
// to take over some of its remaining points.
struct ResNet50ThreaderNode1 {
pthread_mutex_t mut2; // guards np1, pt1 and task1
int64_t np1; // points still to be executed from this node's share; negative asks the worker to exit
int64_t pt1[4]; // coordinates of the next point to execute
ResNet50ThreaderTask1* task1; // task waiting to be picked up, or null when there is none
pthread_cond_t cond2; // signaled after task1 has been set
ResNet50ThreaderTeam1* team1; // back-pointer to the owning team
pthread_t thr1; // handle of the worker thread
} __attribute__((aligned(64)));

// Records how far team creation progressed so that ResNet50ThreaderDestroy1
// releases exactly what was set up, including after a partial failure.
struct ResNet50ThreaderUnwind1 {
ptrdiff_t join1; // number of leading nodes whose thread was created and must be joined
ptrdiff_t nodeConds1; // number of leading nodes whose cond2 was initialized
ptrdiff_t nodeMuts1; // number of leading nodes whose mut2 was initialized
ptrdiff_t hubCond1; // nonzero once the hub's cond1 was initialized
ptrdiff_t hubMut1; // nonzero once the hub's mut1 was initialized
void* nodes1; // raw (unaligned) malloc result backing the node array, passed to free
void* hub1; // raw (unaligned) malloc result backing the hub, passed to free
};

// A team of worker threads together with its shared hub.
struct ResNet50ThreaderTeam1 {
ptrdiff_t nt1; // number of worker threads / nodes
ResNet50ThreaderHub1* hub2; // 64-byte aligned pointer into unwind1.hub1
ResNet50ThreaderNode1* nodes2; // 64-byte aligned pointer into unwind1.nodes1; array of nt1 nodes
ResNet50ThreaderUnwind1 unwind1; // teardown bookkeeping
};

// Advances a mixed-radix counter by one.
// nd2 is the number of digits, hull2[i] the radix of digit i, and pt2 the
// counter itself with digit 0 least significant. A digit that reaches its
// radix wraps to zero and carries into the next one; a carry out of the
// last digit is dropped.
static void ResNet50ThreaderInc1(
    ptrdiff_t nd2,
    int64_t*restrict hull2,
    int64_t*restrict pt2
) {
    ptrdiff_t axis = 0;
    while (axis < nd2) {
        int64_t next = pt2[axis]+1;
        if (next != hull2[axis]) {
            // No wrap on this digit: store it and stop carrying.
            pt2[axis] = next;
            return;
        }
        pt2[axis] = 0;
        ++axis;
    }
}

// Writes val1 into pt3 as a mixed-radix number.
// nd3 is the number of digits and hull3[i] the radix of digit i (digit 0
// least significant). Digits are produced until the value is used up; all
// remaining digits are set to zero. Any part of val1 that does not fit in
// nd3 digits is discarded.
static void ResNet50ThreaderPut1(
    ptrdiff_t nd3,
    int64_t*restrict hull3,
    int64_t*restrict pt3,
    int64_t val1
) {
    ptrdiff_t axis = 0;
    while (axis < nd3 && val1 != 0) {
        int64_t radix = hull3[axis];
        int64_t quotient = val1/radix;
        pt3[axis] = val1%radix;
        val1 = quotient;
        ++axis;
    }
    // Value exhausted: zero-fill the higher digits.
    while (axis < nd3) {
        pt3[axis] = 0;
        ++axis;
    }
}

// Adds the mixed-radix number plus1 (and an incoming carry) to pt4 in place.
// nd4 is the number of digits and hull4[i] the radix of digit i (digit 0
// least significant). Each digit sum is reduced by at most one radix, and a
// carry out of the last digit is dropped.
static void ResNet50ThreaderAdd1(
    ptrdiff_t nd4,
    int64_t*restrict hull4,
    int64_t*restrict pt4,
    int64_t*restrict plus1,
    int64_t carry2
) {
    for (ptrdiff_t axis = 0; axis < nd4; ++axis) {
        int64_t radix = hull4[axis];
        int64_t total = pt4[axis]+plus1[axis]+carry2;
        int64_t wrapped = total >= radix;
        pt4[axis] = wrapped ? total-radix : total;
        carry2 = wrapped;
    }
}

// Thread entry point for one worker (started by pthread_create); arg1 is the
// worker's own ResNet50ThreaderNode1.
//
// The worker repeats: wait for a task on its node, execute its own share of
// points, then look through the hub's status bitmap for other nodes that
// still have points left and execute those too, and finally report
// completion to the hub. It returns 0 when asked to exit (np1 < 0).
//
// Every pthread call is wrapped in an empty for-loop that repeats the call
// for as long as it returns nonzero.
static void* ResNet50ThreaderMain1(void* arg1) {
ResNet50ThreaderNode1* node1 = arg1;
ResNet50ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
ResNet50ThreaderHub1* hub3 = team2->hub2;
ResNet50ThreaderNode1* nodes3 = team2->nodes2;
// Index of this worker within the node array; doubles as its bit number
// in hub3->status1.
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
for (; ; ) {
// node1->mut2 is held at the top of every iteration.
ResNet50ThreaderTask1* task2 = node1->task1;
if (!task2) {
// Nothing to do yet: sleep until a task (or exit request) is posted.
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
// Exit request (ResNet50ThreaderDestroy1 sets np1 to -1 with a non-null
// placeholder task pointer).
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
// Mark the task as picked up so the next iteration waits for a new one.
node1->task1 = 0;
ResNet50ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Own share: take one point at a time under the node mutex, then run the
// callee on a private copy of the coordinates with the mutex released.
// np1 is re-read each round because other workers may take points too.
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
ResNet50ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// Own points are exhausted: clear this worker's bit in the status bitmap.
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
// Scan the bitmap, starting at the shared cursor (offset1/mask1), for a
// node whose bit is still set and help it. hub3->mut1 is held while
// scanning and released while helping.
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
// No candidate left in this word: move on to the next word.
++offset2;
mask2 = -1;
continue;
}
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
// Bit number nt2 lies just past the last real node and is never
// cleared in this function, so it acts as an end-of-bitmap marker.
// Restart from bit 0 once; reaching the marker a second time
// without having helped anyone ends the scan.
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
// Isolate the lowest set bit and advance the shared cursor past it, so
// the next worker to scan starts at a different node.
hand1 &= -hand1;
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
ResNet50ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
// Same take-one-point / run-unlocked loop as above, on the other node.
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
ResNet50ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// That node has no points left: clear its bit, then resume scanning
// from wherever the shared cursor is now.
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// Report completion; the worker that brings pending1 to zero signals the
// hub condition variable that ResNet50ThreaderDo1 waits on.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
// Re-take the own node mutex before looping back to wait for a task.
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Shuts down and frees a thread team. Accepts a null pointer (no-op) and a
// partially constructed team: the counters in team3->unwind1 say how many
// threads to join and how many mutexes/condition variables to destroy.
// Every pthread call is repeated for as long as it returns nonzero.
static void ResNet50ThreaderDestroy1(ResNet50ThreaderTeam1* team3) {
    if (team3 == 0) {
        return;
    }
    ResNet50ThreaderNode1* const first = team3->nodes2;
    ResNet50ThreaderNode1* worker;
    ResNet50ThreaderNode1* limit;

    // Ask every started worker to exit: a negative point count together
    // with a non-null placeholder task makes the worker loop return.
    limit = first+team3->unwind1.join1;
    worker = first;
    while (worker != limit) {
        while (__builtin_expect(pthread_mutex_lock(&worker->mut2), 0)) {}
        worker->np1 = -1;
        worker->task1 = (ResNet50ThreaderTask1*)1;
        while (__builtin_expect(pthread_mutex_unlock(&worker->mut2), 0)) {}
        while (__builtin_expect(pthread_cond_signal(&worker->cond2), 0)) {}
        ++worker;
    }

    // Wait for those workers to finish.
    worker = first;
    while (worker != limit) {
        while (__builtin_expect(pthread_join(worker->thr1, 0), 0)) {}
        ++worker;
    }

    // Destroy the per-node condition variables that were initialized.
    limit = first+team3->unwind1.nodeConds1;
    worker = first;
    while (worker != limit) {
        while (__builtin_expect(pthread_cond_destroy(&worker->cond2), 0)) {}
        ++worker;
    }

    // Destroy the per-node mutexes that were initialized.
    limit = first+team3->unwind1.nodeMuts1;
    worker = first;
    while (worker != limit) {
        while (__builtin_expect(pthread_mutex_destroy(&worker->mut2), 0)) {}
        ++worker;
    }

    // Hub synchronization objects, if they got as far as being created.
    ResNet50ThreaderHub1* const shared = team3->hub2;
    if (team3->unwind1.hubCond1) {
        while (__builtin_expect(pthread_cond_destroy(&shared->cond1), 0)) {}
    }
    if (team3->unwind1.hubMut1) {
        while (__builtin_expect(pthread_mutex_destroy(&shared->mut1), 0)) {}
    }

    // Release the raw allocations, then the team itself.
    free(team3->unwind1.nodes1);
    free(team3->unwind1.hub1);
    free(team3);
}

// Final creation stage: for each of the nt7 nodes, initialize its mutex and
// condition variable and start its worker thread.
// Returns 0 on success or a malloc'd error message. In both cases the
// unwind counters record exactly how many mutexes, condition variables and
// threads exist, so ResNet50ThreaderDestroy1 can undo a partial setup.
static char* ResNet50ThreaderCreate1Up4(ResNet50ThreaderTeam1* team8, ptrdiff_t nt7) {
    ResNet50ThreaderNode1* const first = team8->nodes2;
    ResNet50ThreaderUnwind1* const undo = &team8->unwind1;
    for (ptrdiff_t idx = 0; idx != nt7; ++idx) {
        ResNet50ThreaderNode1* cur = first+idx;
        int rc = pthread_mutex_init(&cur->mut2, 0);
        if (__builtin_expect(rc, 0)) {
            // Nothing of this node exists yet.
            char* text = ResNet50Errmsg1(__LINE__, "errno %d", rc);
            undo->nodeMuts1 = idx;
            undo->nodeConds1 = idx;
            undo->join1 = idx;
            return text;
        }
        // The worker treats a null task as "nothing to do yet".
        cur->task1 = 0;
        rc = pthread_cond_init(&cur->cond2, 0);
        if (__builtin_expect(rc, 0)) {
            // This node has a mutex but no condition variable.
            char* text = ResNet50Errmsg1(__LINE__, "errno %d", rc);
            undo->nodeMuts1 = idx+1;
            undo->nodeConds1 = idx;
            undo->join1 = idx;
            return text;
        }
        cur->team1 = team8;
        rc = pthread_create(&cur->thr1, 0, ResNet50ThreaderMain1, cur);
        if (__builtin_expect(rc, 0)) {
            // This node has both sync objects but no thread to join.
            char* text = ResNet50Errmsg1(__LINE__, "errno %d", rc);
            undo->nodeMuts1 = idx+1;
            undo->nodeConds1 = idx+1;
            undo->join1 = idx;
            return text;
        }
    }
    undo->nodeMuts1 = nt7;
    undo->nodeConds1 = nt7;
    undo->join1 = nt7;
    return 0;
}

// Third creation stage: initialize the hub's mutex and condition variable,
// flagging each in the unwind record once it exists, then continue with the
// per-node setup. Returns 0 on success or a malloc'd error message.
static char* ResNet50ThreaderCreate1Up3(ResNet50ThreaderTeam1* team7, ptrdiff_t nt6) {
    ResNet50ThreaderHub1* const shared = team7->hub2;
    int rc;

    rc = pthread_mutex_init(&shared->mut1, 0);
    if (__builtin_expect(rc, 0)) {
        return ResNet50Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubMut1 = 1;

    rc = pthread_cond_init(&shared->cond1, 0);
    if (__builtin_expect(rc, 0)) {
        return ResNet50Errmsg1(__LINE__, "errno %d", rc);
    }
    team7->unwind1.hubCond1 = 1;

    return ResNet50ThreaderCreate1Up4(team7, nt6);
}

// Second creation stage: allocate the node array for nt5 workers.
// The raw allocation (kept in unwind1.nodes1 for free) has 63 bytes of
// slack so the array itself can start on a 64-byte boundary.
// Returns 0 on success or a malloc'd error message.
static char* ResNet50ThreaderCreate1Up2(ResNet50ThreaderTeam1* team6, ptrdiff_t nt5) {
    const size_t perNode = sizeof(ResNet50ThreaderNode1);
    size_t bytes = nt5*perNode;
    // Dividing back detects wrap-around in the multiplication.
    if (__builtin_expect(bytes/perNode != (size_t)nt5, 0)) {
        return ResNet50Errmsg1(__LINE__, "too many threads");
    }
    void* raw = malloc(bytes+63);
    if (__builtin_expect(raw == 0, 0)) {
        return ResNet50Errmsg1(__LINE__, "errno %d", errno);
    }
    team6->unwind1.nodes1 = raw;
    size_t rounded = ((size_t)raw+63)&-64;
    team6->nodes2 = (void*)rounded;
    return ResNet50ThreaderCreate1Up3(team6, nt5);
}

// First creation stage: record the thread count and allocate the hub.
// The hub's trailing status bitmap gets nt4/(bits per long)+1 words, i.e.
// at least one bit more than there are nodes. The raw allocation is kept in
// unwind1.hub1 for free; hub2 points at the first 64-byte boundary in it.
// Returns 0 on success or a malloc'd error message.
static char* ResNet50ThreaderCreate1Up1(ResNet50ThreaderTeam1* team5, ptrdiff_t nt4) {
    team5->nt1 = nt4;
    const size_t bitsPerWord = sizeof(long)*8;
    size_t words = (size_t)nt4/bitsPerWord+1;
    size_t bytes = sizeof(ResNet50ThreaderHub1);
    bytes += sizeof(long)*words;
    // Round the size up to a whole number of 64-byte lines.
    bytes = (bytes+63)&-64;
    void* raw = malloc(bytes+63);
    if (__builtin_expect(raw == 0, 0)) {
        return ResNet50Errmsg1(__LINE__, "errno %d", errno);
    }
    team5->unwind1.hub1 = raw;
    size_t rounded = ((size_t)raw+63)&-64;
    team5->hub2 = (void*)rounded;
    return ResNet50ThreaderCreate1Up2(team5, nt4);
}

// Creates a team of nt3 worker threads.
// On success stores the new team in *team4 and returns 0. On failure
// returns a malloc'd error message, releases whatever was set up, and
// leaves *team4 untouched.
static char* ResNet50ThreaderCreate1(ResNet50ThreaderTeam1** team4, ptrdiff_t nt3) {
    if (__builtin_expect(nt3 < 1, 0)) {
        return ResNet50Errmsg1(__LINE__, "too few threads");
    }
    // Zero-filled so the unwind counters start at "nothing to undo".
    void* fresh = calloc(1, sizeof(ResNet50ThreaderTeam1));
    if (__builtin_expect(fresh == 0, 0)) {
        return ResNet50Errmsg1(__LINE__, "errno %d", errno);
    }
    char* problem = ResNet50ThreaderCreate1Up1(fresh, nt3);
    if (__builtin_expect(problem != 0, 0)) {
        ResNet50ThreaderDestroy1(fresh);
        return problem;
    }
    *team4 = fresh;
    return problem;
}

// Copies the pthread_t handle of worker number idx1 into *thr2.
// Returns 0 on success, or a malloc'd error message when idx1 is not in
// [0, team9->nt1); *thr2 is left untouched in that case.
static char* ResNet50ThreaderPthreadT1(
    pthread_t* thr2,
    ResNet50ThreaderTeam1* team9,
    ptrdiff_t idx1
) {
    int inRange = 0 <= idx1 && idx1 < team9->nt1;
    if (__builtin_expect(!inRange, 0)) {
        return ResNet50Errmsg1(__LINE__, "bad thread idx");
    }
    ResNet50ThreaderNode1* picked = team9->nodes2+idx1;
    *thr2 = picked->thr1;
    return 0;
}

// Runs task3 on the team and returns once every worker has reported done.
//
// The iteration space has tot1 points (the product of the first nd1 hull
// extents). Each node is given tot1/nt points as a contiguous range in
// mixed-radix counting order, and the first tot1%nt nodes get one extra.
// A task with nd1 < 1 is a no-op.
//
// Every pthread call is wrapped in an empty for-loop that repeats the call
// for as long as it returns nonzero.
static void ResNet50ThreaderDo1(ResNet50ThreaderTeam1* team10, ResNet50ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return;
// Total number of points.
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
// plus2 is the per-node stride (each1) expressed as coordinates.
int64_t plus2[4];
ResNet50ThreaderPut1(nd6, task3->hull1, plus2, each1);
// pt6 tracks the starting coordinates of the node being set up.
int64_t pt6[4] = {0};
ResNet50ThreaderHub1* hub6 = team10->hub2;
// The hub mutex is held for the whole hand-out, so workers that finish
// early block on it until the hub state below has been written.
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
ResNet50ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
// 1 for the first more1 nodes (they take one extra point), else 0.
int64_t carry3 = i4 < more1;
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
// Advance the start by this node's share (each1 plus the extra point).
ResNet50ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Reset the scan cursor and set every status bit, including the bits past
// the last node in the final word.
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
// Sleep until the last worker has decremented pending1 to zero; the loop
// re-checks the count after every wake-up.
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

static __m512 ResNet50Exp1(__m512 x1) {
x1 = _mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f));
x1 = _mm512_min_ps(x1, _mm512_set1_ps(8.872284e+01f));
__m512 t1 = _mm512_mul_ps(x1, _mm512_set1_ps(1.442695e+00f));
__m512 r1 = _mm512_roundscale_ps(t1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m512 f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-6.9314575e-01f), x1);
f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-1.4286068e-06f), f1);
__m512 g1 = _mm512_set1_ps(4.194439e-02f);
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(1.6800667e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(4.9999994e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.999569e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.9999964e-01f));
__m512i y1 = _mm512_slli_epi32(_mm512_cvtps_epi32(t1), 23);
return _mm512_castsi512_ps(_mm512_add_epi32(y1, _mm512_castps_si512(g1)));
}

// Softmax over 1000 consecutive floats, fully unrolled for AVX-512.
//
// tensors155[0] is read as the input and tensors155[1] is written as the
// output. The data is handled as 62 full vectors of 16 floats plus a final
// half vector of 8 floats (lane mask 255). team90 is unused: the whole
// computation runs on the calling thread.
//
// Three passes:
//   1. find the maximum of all inputs;
//   2. write exp(x - max) to the output and accumulate the sum;
//   3. multiply every output by 1/sum.
static void ResNet50Softmax1(ResNet50ThreaderTeam1* team90, char** tensors155) {
(void)team90;
char*restrict ptr5 = tensors155[0];
char*restrict ptr6 = tensors155[1];
// Pass 1: sixteen independent running maxima, seeded with vectors 0..15.
__m512 max1 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*0);
__m512 max2 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*1);
__m512 max3 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*2);
__m512 max4 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*3);
__m512 max5 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*4);
__m512 max6 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*5);
__m512 max7 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*6);
__m512 max8 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*7);
__m512 max9 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*8);
__m512 max10 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*9);
__m512 max11 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*10);
__m512 max12 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*11);
__m512 max13 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*12);
__m512 max14 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*13);
__m512 max15 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*14);
__m512 max16 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*15);
// Fold in vectors 16..47, sixteen per iteration.
for (ptrdiff_t i93 = 1; i93 <= 2; ++i93) {
__m512 dat2557 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*0+(ptrdiff_t)64*16*i93);
__m512 dat2558 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*1+(ptrdiff_t)64*16*i93);
__m512 dat2559 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*2+(ptrdiff_t)64*16*i93);
__m512 dat2560 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*3+(ptrdiff_t)64*16*i93);
__m512 dat2561 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*4+(ptrdiff_t)64*16*i93);
__m512 dat2562 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*5+(ptrdiff_t)64*16*i93);
__m512 dat2563 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*6+(ptrdiff_t)64*16*i93);
__m512 dat2564 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*7+(ptrdiff_t)64*16*i93);
__m512 dat2565 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*8+(ptrdiff_t)64*16*i93);
__m512 dat2566 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*9+(ptrdiff_t)64*16*i93);
__m512 dat2567 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*10+(ptrdiff_t)64*16*i93);
__m512 dat2568 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*11+(ptrdiff_t)64*16*i93);
__m512 dat2569 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*12+(ptrdiff_t)64*16*i93);
__m512 dat2570 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*13+(ptrdiff_t)64*16*i93);
__m512 dat2571 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*14+(ptrdiff_t)64*16*i93);
__m512 dat2572 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*15+(ptrdiff_t)64*16*i93);
max1 = _mm512_max_ps(max1, dat2557);
max2 = _mm512_max_ps(max2, dat2558);
max3 = _mm512_max_ps(max3, dat2559);
max4 = _mm512_max_ps(max4, dat2560);
max5 = _mm512_max_ps(max5, dat2561);
max6 = _mm512_max_ps(max6, dat2562);
max7 = _mm512_max_ps(max7, dat2563);
max8 = _mm512_max_ps(max8, dat2564);
max9 = _mm512_max_ps(max9, dat2565);
max10 = _mm512_max_ps(max10, dat2566);
max11 = _mm512_max_ps(max11, dat2567);
max12 = _mm512_max_ps(max12, dat2568);
max13 = _mm512_max_ps(max13, dat2569);
max14 = _mm512_max_ps(max14, dat2570);
max15 = _mm512_max_ps(max15, dat2571);
max16 = _mm512_max_ps(max16, dat2572);
}
// Tail: the 14 remaining full vectors (48..61).
__m512 dat2573 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*0+(ptrdiff_t)64*16*3);
__m512 dat2574 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*1+(ptrdiff_t)64*16*3);
__m512 dat2575 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*2+(ptrdiff_t)64*16*3);
__m512 dat2576 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*3+(ptrdiff_t)64*16*3);
__m512 dat2577 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*4+(ptrdiff_t)64*16*3);
__m512 dat2578 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*5+(ptrdiff_t)64*16*3);
__m512 dat2579 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*6+(ptrdiff_t)64*16*3);
__m512 dat2580 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*7+(ptrdiff_t)64*16*3);
__m512 dat2581 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*8+(ptrdiff_t)64*16*3);
__m512 dat2582 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*9+(ptrdiff_t)64*16*3);
__m512 dat2583 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*10+(ptrdiff_t)64*16*3);
__m512 dat2584 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*11+(ptrdiff_t)64*16*3);
__m512 dat2585 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*12+(ptrdiff_t)64*16*3);
__m512 dat2586 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*13+(ptrdiff_t)64*16*3);
max1 = _mm512_max_ps(max1, dat2573);
max2 = _mm512_max_ps(max2, dat2574);
max3 = _mm512_max_ps(max3, dat2575);
max4 = _mm512_max_ps(max4, dat2576);
max5 = _mm512_max_ps(max5, dat2577);
max6 = _mm512_max_ps(max6, dat2578);
max7 = _mm512_max_ps(max7, dat2579);
max8 = _mm512_max_ps(max8, dat2580);
max9 = _mm512_max_ps(max9, dat2581);
max10 = _mm512_max_ps(max10, dat2582);
max11 = _mm512_max_ps(max11, dat2583);
max12 = _mm512_max_ps(max12, dat2584);
max13 = _mm512_max_ps(max13, dat2585);
max14 = _mm512_max_ps(max14, dat2586);
// Final half vector (8 floats): only the low 8 lanes take part in the max,
// the upper lanes of max16 keep their previous values.
__m512 dat2587 = _mm512_maskz_loadu_ps(255, ptr5+(ptrdiff_t)64*62);
max16 = _mm512_mask_max_ps(max16, 255, max16, dat2587);
// Tree-reduce the sixteen accumulators into max1.
max1 = _mm512_max_ps(max1, max9);
max2 = _mm512_max_ps(max2, max10);
max3 = _mm512_max_ps(max3, max11);
max4 = _mm512_max_ps(max4, max12);
max5 = _mm512_max_ps(max5, max13);
max6 = _mm512_max_ps(max6, max14);
max7 = _mm512_max_ps(max7, max15);
max8 = _mm512_max_ps(max8, max16);
max1 = _mm512_max_ps(max1, max5);
max2 = _mm512_max_ps(max2, max6);
max3 = _mm512_max_ps(max3, max7);
max4 = _mm512_max_ps(max4, max8);
max1 = _mm512_max_ps(max1, max3);
max2 = _mm512_max_ps(max2, max4);
max1 = _mm512_max_ps(max1, max2);
// Horizontal reduction within max1: fold the upper half of the active
// lanes onto the lower half (16 -> 8 -> 4 -> 2 -> 1), then broadcast
// lane 0 to all lanes.
__m512i p2 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8);
max1 = _mm512_mask_max_ps(max1, 255, max1, _mm512_permutexvar_ps(p2, max1));
__m512i p3 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4);
max1 = _mm512_mask_max_ps(max1, 15, max1, _mm512_permutexvar_ps(p3, max1));
__m512i p4 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2);
max1 = _mm512_mask_max_ps(max1, 3, max1, _mm512_permutexvar_ps(p4, max1));
__m512i p5 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
max1 = _mm512_mask_max_ps(max1, 1, max1, _mm512_permutexvar_ps(p5, max1));
__m512i p6 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
max1 = _mm512_permutexvar_ps(p6, max1);
// Pass 2: exponentiate (x - max) and accumulate the sum. neg1 holds -max in
// every lane. The data is walked from the end towards the start.
__m512 sum861 = _mm512_setzero_ps();
__m512 neg1 = _mm512_sub_ps(sum861, max1);
// Last block first: the half vector (62) and full vectors 61..48.
__m512 dat2618 = _mm512_maskz_loadu_ps(255, ptr5+(ptrdiff_t)64*14+(ptrdiff_t)64*16*3);
__m512 dat2617 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*13+(ptrdiff_t)64*16*3);
__m512 dat2616 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*12+(ptrdiff_t)64*16*3);
__m512 dat2615 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*11+(ptrdiff_t)64*16*3);
__m512 dat2614 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*10+(ptrdiff_t)64*16*3);
__m512 dat2613 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*9+(ptrdiff_t)64*16*3);
__m512 dat2612 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*8+(ptrdiff_t)64*16*3);
__m512 dat2611 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*7+(ptrdiff_t)64*16*3);
__m512 dat2610 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*6+(ptrdiff_t)64*16*3);
__m512 dat2609 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*5+(ptrdiff_t)64*16*3);
__m512 dat2608 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*4+(ptrdiff_t)64*16*3);
__m512 dat2607 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*3+(ptrdiff_t)64*16*3);
__m512 dat2606 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*2+(ptrdiff_t)64*16*3);
__m512 dat2605 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*1+(ptrdiff_t)64*16*3);
__m512 dat2604 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*0+(ptrdiff_t)64*16*3);
dat2618 = ResNet50Exp1(_mm512_add_ps(neg1, dat2618));
// Masked add: the upper 8 lanes of the half vector hold no data and must
// not contribute to the sum.
sum861 = _mm512_mask_add_ps(sum861, 255, sum861, dat2618);
dat2617 = ResNet50Exp1(_mm512_add_ps(neg1, dat2617));
sum861 = _mm512_add_ps(sum861, dat2617);
dat2616 = ResNet50Exp1(_mm512_add_ps(neg1, dat2616));
sum861 = _mm512_add_ps(sum861, dat2616);
dat2615 = ResNet50Exp1(_mm512_add_ps(neg1, dat2615));
sum861 = _mm512_add_ps(sum861, dat2615);
dat2614 = ResNet50Exp1(_mm512_add_ps(neg1, dat2614));
sum861 = _mm512_add_ps(sum861, dat2614);
dat2613 = ResNet50Exp1(_mm512_add_ps(neg1, dat2613));
sum861 = _mm512_add_ps(sum861, dat2613);
dat2612 = ResNet50Exp1(_mm512_add_ps(neg1, dat2612));
sum861 = _mm512_add_ps(sum861, dat2612);
dat2611 = ResNet50Exp1(_mm512_add_ps(neg1, dat2611));
sum861 = _mm512_add_ps(sum861, dat2611);
dat2610 = ResNet50Exp1(_mm512_add_ps(neg1, dat2610));
sum861 = _mm512_add_ps(sum861, dat2610);
dat2609 = ResNet50Exp1(_mm512_add_ps(neg1, dat2609));
sum861 = _mm512_add_ps(sum861, dat2609);
dat2608 = ResNet50Exp1(_mm512_add_ps(neg1, dat2608));
sum861 = _mm512_add_ps(sum861, dat2608);
dat2607 = ResNet50Exp1(_mm512_add_ps(neg1, dat2607));
sum861 = _mm512_add_ps(sum861, dat2607);
dat2606 = ResNet50Exp1(_mm512_add_ps(neg1, dat2606));
sum861 = _mm512_add_ps(sum861, dat2606);
dat2605 = ResNet50Exp1(_mm512_add_ps(neg1, dat2605));
sum861 = _mm512_add_ps(sum861, dat2605);
dat2604 = ResNet50Exp1(_mm512_add_ps(neg1, dat2604));
sum861 = _mm512_add_ps(sum861, dat2604);
// Store the unnormalized exponentials; they are rescaled in pass 3.
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*14+(ptrdiff_t)64*16*3, 255, dat2618);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*13+(ptrdiff_t)64*16*3, 65535, dat2617);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*12+(ptrdiff_t)64*16*3, 65535, dat2616);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*11+(ptrdiff_t)64*16*3, 65535, dat2615);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*10+(ptrdiff_t)64*16*3, 65535, dat2614);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*9+(ptrdiff_t)64*16*3, 65535, dat2613);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*8+(ptrdiff_t)64*16*3, 65535, dat2612);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*7+(ptrdiff_t)64*16*3, 65535, dat2611);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*6+(ptrdiff_t)64*16*3, 65535, dat2610);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*5+(ptrdiff_t)64*16*3, 65535, dat2609);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*4+(ptrdiff_t)64*16*3, 65535, dat2608);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*3+(ptrdiff_t)64*16*3, 65535, dat2607);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*2+(ptrdiff_t)64*16*3, 65535, dat2606);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*1+(ptrdiff_t)64*16*3, 65535, dat2605);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*0+(ptrdiff_t)64*16*3, 65535, dat2604);
// Remaining three blocks of sixteen full vectors (47..0), again back to front.
for (ptrdiff_t i94 = 2; i94 >= 0; --i94) {
__m512 dat2603 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*15+(ptrdiff_t)64*16*i94);
__m512 dat2602 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*14+(ptrdiff_t)64*16*i94);
__m512 dat2601 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*13+(ptrdiff_t)64*16*i94);
__m512 dat2600 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*12+(ptrdiff_t)64*16*i94);
__m512 dat2599 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*11+(ptrdiff_t)64*16*i94);
__m512 dat2598 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*10+(ptrdiff_t)64*16*i94);
__m512 dat2597 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*9+(ptrdiff_t)64*16*i94);
__m512 dat2596 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*8+(ptrdiff_t)64*16*i94);
__m512 dat2595 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*7+(ptrdiff_t)64*16*i94);
__m512 dat2594 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*6+(ptrdiff_t)64*16*i94);
__m512 dat2593 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*5+(ptrdiff_t)64*16*i94);
__m512 dat2592 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*4+(ptrdiff_t)64*16*i94);
__m512 dat2591 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*3+(ptrdiff_t)64*16*i94);
__m512 dat2590 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*2+(ptrdiff_t)64*16*i94);
__m512 dat2589 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*1+(ptrdiff_t)64*16*i94);
__m512 dat2588 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*0+(ptrdiff_t)64*16*i94);
dat2603 = ResNet50Exp1(_mm512_add_ps(neg1, dat2603));
sum861 = _mm512_add_ps(sum861, dat2603);
dat2602 = ResNet50Exp1(_mm512_add_ps(neg1, dat2602));
sum861 = _mm512_add_ps(sum861, dat2602);
dat2601 = ResNet50Exp1(_mm512_add_ps(neg1, dat2601));
sum861 = _mm512_add_ps(sum861, dat2601);
dat2600 = ResNet50Exp1(_mm512_add_ps(neg1, dat2600));
sum861 = _mm512_add_ps(sum861, dat2600);
dat2599 = ResNet50Exp1(_mm512_add_ps(neg1, dat2599));
sum861 = _mm512_add_ps(sum861, dat2599);
dat2598 = ResNet50Exp1(_mm512_add_ps(neg1, dat2598));
sum861 = _mm512_add_ps(sum861, dat2598);
dat2597 = ResNet50Exp1(_mm512_add_ps(neg1, dat2597));
sum861 = _mm512_add_ps(sum861, dat2597);
dat2596 = ResNet50Exp1(_mm512_add_ps(neg1, dat2596));
sum861 = _mm512_add_ps(sum861, dat2596);
dat2595 = ResNet50Exp1(_mm512_add_ps(neg1, dat2595));
sum861 = _mm512_add_ps(sum861, dat2595);
dat2594 = ResNet50Exp1(_mm512_add_ps(neg1, dat2594));
sum861 = _mm512_add_ps(sum861, dat2594);
dat2593 = ResNet50Exp1(_mm512_add_ps(neg1, dat2593));
sum861 = _mm512_add_ps(sum861, dat2593);
dat2592 = ResNet50Exp1(_mm512_add_ps(neg1, dat2592));
sum861 = _mm512_add_ps(sum861, dat2592);
dat2591 = ResNet50Exp1(_mm512_add_ps(neg1, dat2591));
sum861 = _mm512_add_ps(sum861, dat2591);
dat2590 = ResNet50Exp1(_mm512_add_ps(neg1, dat2590));
sum861 = _mm512_add_ps(sum861, dat2590);
dat2589 = ResNet50Exp1(_mm512_add_ps(neg1, dat2589));
sum861 = _mm512_add_ps(sum861, dat2589);
dat2588 = ResNet50Exp1(_mm512_add_ps(neg1, dat2588));
sum861 = _mm512_add_ps(sum861, dat2588);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*15+(ptrdiff_t)64*16*i94, 65535, dat2603);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*14+(ptrdiff_t)64*16*i94, 65535, dat2602);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*13+(ptrdiff_t)64*16*i94, 65535, dat2601);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*12+(ptrdiff_t)64*16*i94, 65535, dat2600);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*11+(ptrdiff_t)64*16*i94, 65535, dat2599);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*10+(ptrdiff_t)64*16*i94, 65535, dat2598);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*9+(ptrdiff_t)64*16*i94, 65535, dat2597);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*8+(ptrdiff_t)64*16*i94, 65535, dat2596);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*7+(ptrdiff_t)64*16*i94, 65535, dat2595);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*6+(ptrdiff_t)64*16*i94, 65535, dat2594);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*5+(ptrdiff_t)64*16*i94, 65535, dat2593);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*4+(ptrdiff_t)64*16*i94, 65535, dat2592);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*3+(ptrdiff_t)64*16*i94, 65535, dat2591);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*2+(ptrdiff_t)64*16*i94, 65535, dat2590);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*1+(ptrdiff_t)64*16*i94, 65535, dat2589);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*0+(ptrdiff_t)64*16*i94, 65535, dat2588);
}
// Horizontal sum of the 16 lanes (same folding scheme as for the max),
// then broadcast the total to all lanes.
__m512i p7 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8);
sum861 = _mm512_mask_add_ps(sum861, 255, sum861, _mm512_permutexvar_ps(p7, sum861));
__m512i p8 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4);
sum861 = _mm512_mask_add_ps(sum861, 15, sum861, _mm512_permutexvar_ps(p8, sum861));
__m512i p9 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2);
sum861 = _mm512_mask_add_ps(sum861, 3, sum861, _mm512_permutexvar_ps(p9, sum861));
__m512i p10 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
sum861 = _mm512_mask_add_ps(sum861, 1, sum861, _mm512_permutexvar_ps(p10, sum861));
__m512i p11 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
sum861 = _mm512_permutexvar_ps(p11, sum861);
// Pass 3: scale every stored exponential by 1/sum, in place in the output.
__m512 rcp43 = _mm512_div_ps(_mm512_set1_ps(1e+00f), sum861);
for (ptrdiff_t i95 = 0; i95 < 62; ++i95) {
__m512 dat2619 = _mm512_maskz_loadu_ps(65535, ptr6+(ptrdiff_t)64*i95);
dat2619 = _mm512_mul_ps(rcp43, dat2619);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*i95, 65535, dat2619);
}
// Final half vector (8 floats).
__m512 dat2620 = _mm512_maskz_loadu_ps(255, ptr6+(ptrdiff_t)64*62);
dat2620 = _mm512_mul_ps(rcp43, dat2620);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*62, 255, dat2620);
}

static __m512 ResNet50Rsqrt1(__m512 x2) {
__m512 y2 = _mm512_rsqrt14_ps(x2);
__m512 z1 = _mm512_mul_ps(x2, y2);
__m512 a1 = _mm512_mul_ps(y2, _mm512_set1_ps(5e-01f));
__m512 b1 = _mm512_fnmadd_ps(y2, z1, _mm512_set1_ps(3e+00f));
return _mm512_mul_ps(a1, b1);
}

static void ResNet50BnSimplify1(
float*restrict means1,
float*restrict variances1,
float*restrict scales1,
float*restrict shifts1,
char*restrict mas1
) {
__m512 eps1 = _mm512_set1_ps(1e-05f);
__m512i xlo1 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi1 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
__m512 va1 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*0);
__m512 va2 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*1);
__m512 va3 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*2);
__m512 va4 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*3);
__m512 rcp1 = ResNet50Rsqrt1(_mm512_add_ps(eps1, va1));
__m512 rcp2 = ResNet50Rsqrt1(_mm512_add_ps(eps1, va2));
__m512 rcp3 = ResNet50Rsqrt1(_mm512_add_ps(eps1, va3));
__m512 rcp4 = ResNet50Rsqrt1(_mm512_add_ps(eps1, va4));
__m512 sc1 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*0);
__m512 sc2 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*1);
__m512 sc3 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*2);
__m512 sc4 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*3);
__m512 mul1 = _mm512_mul_ps(rcp1, sc1);
__m512 mul2 = _mm512_mul_ps(rcp2, sc2);
__m512 mul3 = _mm512_mul_ps(rcp3, sc3);
__m512 mul4 = _mm512_mul_ps(rcp4, sc4);
__m512 me1 = _mm512_loadu_ps(means1+(ptrdiff_t)16*0);
__m512 me2 = _mm512_loadu_ps(means1+(ptrdiff_t)16*1);
__m512 me3 = _mm512_loadu_ps(means1+(ptrdiff_t)16*2);
__m512 me4 = _mm512_loadu_ps(means1+(ptrdiff_t)16*3);
__m512 sh1 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*0);
__m512 sh2 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*1);
__m512 sh3 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*2);
__m512 sh4 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*3);
__m512 add1 = _mm512_fnmadd_ps(me1, mul1, sh1);
__m512 add2 = _mm512_fnmadd_ps(me2, mul2, sh2);
__m512 add3 = _mm512_fnmadd_ps(me3, mul3, sh3);
__m512 add4 = _mm512_fnmadd_ps(me4, mul4, sh4);
__m512 lo1 = _mm512_permutex2var_ps(mul1, xlo1, add1);
__m512 lo2 = _mm512_permutex2var_ps(mul2, xlo1, add2);
__m512 lo3 = _mm512_permutex2var_ps(mul3, xlo1, add3);
__m512 lo4 = _mm512_permutex2var_ps(mul4, xlo1, add4);
__m512 hi1 = _mm512_permutex2var_ps(mul1, xhi1, add1);
__m512 hi2 = _mm512_permutex2var_ps(mul2, xhi1, add2);
__m512 hi3 = _mm512_permutex2var_ps(mul3, xhi1, add3);
__m512 hi4 = _mm512_permutex2var_ps(mul4, xhi1, add4);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*0, lo1);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*1, hi1);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*2, lo2);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*3, hi2);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*4, lo3);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*5, hi3);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*6, lo4);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*7, hi4);
}

static void ResNet50BnSimplify2(
float*restrict means2,
float*restrict variances2,
float*restrict scales2,
float*restrict shifts2,
char*restrict mas3
) {
__m512 eps2 = _mm512_set1_ps(1e-05f);
__m512i xlo2 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi2 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i11 = 0; i11 < 3; ++i11) {
__m512 va5 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*0+(ptrdiff_t)80*i11);
__m512 va6 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*1+(ptrdiff_t)80*i11);
__m512 va7 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*2+(ptrdiff_t)80*i11);
__m512 va8 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*3+(ptrdiff_t)80*i11);
__m512 va9 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*4+(ptrdiff_t)80*i11);
__m512 rcp5 = ResNet50Rsqrt1(_mm512_add_ps(eps2, va5));
__m512 rcp6 = ResNet50Rsqrt1(_mm512_add_ps(eps2, va6));
__m512 rcp7 = ResNet50Rsqrt1(_mm512_add_ps(eps2, va7));
__m512 rcp8 = ResNet50Rsqrt1(_mm512_add_ps(eps2, va8));
__m512 rcp9 = ResNet50Rsqrt1(_mm512_add_ps(eps2, va9));
__m512 sc5 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*0+(ptrdiff_t)80*i11);
__m512 sc6 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*1+(ptrdiff_t)80*i11);
__m512 sc7 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*2+(ptrdiff_t)80*i11);
__m512 sc8 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*3+(ptrdiff_t)80*i11);
__m512 sc9 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*4+(ptrdiff_t)80*i11);
__m512 mul5 = _mm512_mul_ps(rcp5, sc5);
__m512 mul6 = _mm512_mul_ps(rcp6, sc6);
__m512 mul7 = _mm512_mul_ps(rcp7, sc7);
__m512 mul8 = _mm512_mul_ps(rcp8, sc8);
__m512 mul9 = _mm512_mul_ps(rcp9, sc9);
__m512 me5 = _mm512_loadu_ps(means2+(ptrdiff_t)16*0+(ptrdiff_t)80*i11);
__m512 me6 = _mm512_loadu_ps(means2+(ptrdiff_t)16*1+(ptrdiff_t)80*i11);
__m512 me7 = _mm512_loadu_ps(means2+(ptrdiff_t)16*2+(ptrdiff_t)80*i11);
__m512 me8 = _mm512_loadu_ps(means2+(ptrdiff_t)16*3+(ptrdiff_t)80*i11);
__m512 me9 = _mm512_loadu_ps(means2+(ptrdiff_t)16*4+(ptrdiff_t)80*i11);
__m512 sh5 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*0+(ptrdiff_t)80*i11);
__m512 sh6 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*1+(ptrdiff_t)80*i11);
__m512 sh7 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*2+(ptrdiff_t)80*i11);
__m512 sh8 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*3+(ptrdiff_t)80*i11);
__m512 sh9 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*4+(ptrdiff_t)80*i11);
__m512 add5 = _mm512_fnmadd_ps(me5, mul5, sh5);
__m512 add6 = _mm512_fnmadd_ps(me6, mul6, sh6);
__m512 add7 = _mm512_fnmadd_ps(me7, mul7, sh7);
__m512 add8 = _mm512_fnmadd_ps(me8, mul8, sh8);
__m512 add9 = _mm512_fnmadd_ps(me9, mul9, sh9);
__m512 lo5 = _mm512_permutex2var_ps(mul5, xlo2, add5);
__m512 lo6 = _mm512_permutex2var_ps(mul6, xlo2, add6);
__m512 lo7 = _mm512_permutex2var_ps(mul7, xlo2, add7);
__m512 lo8 = _mm512_permutex2var_ps(mul8, xlo2, add8);
__m512 lo9 = _mm512_permutex2var_ps(mul9, xlo2, add9);
__m512 hi5 = _mm512_permutex2var_ps(mul5, xhi2, add5);
__m512 hi6 = _mm512_permutex2var_ps(mul6, xhi2, add6);
__m512 hi7 = _mm512_permutex2var_ps(mul7, xhi2, add7);
__m512 hi8 = _mm512_permutex2var_ps(mul8, xhi2, add8);
__m512 hi9 = _mm512_permutex2var_ps(mul9, xhi2, add9);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*0+(ptrdiff_t)640*i11, lo5);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*1+(ptrdiff_t)640*i11, hi5);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*2+(ptrdiff_t)640*i11, lo6);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*3+(ptrdiff_t)640*i11, hi6);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*4+(ptrdiff_t)640*i11, lo7);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*5+(ptrdiff_t)640*i11, hi7);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*6+(ptrdiff_t)640*i11, lo8);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*7+(ptrdiff_t)640*i11, hi8);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*8+(ptrdiff_t)640*i11, lo9);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*9+(ptrdiff_t)640*i11, hi9);
}
__m512 va10 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*0+(ptrdiff_t)80*3);
__m512 rcp10 = ResNet50Rsqrt1(_mm512_add_ps(eps2, va10));
__m512 sc10 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*0+(ptrdiff_t)80*3);
__m512 mul10 = _mm512_mul_ps(rcp10, sc10);
__m512 me10 = _mm512_loadu_ps(means2+(ptrdiff_t)16*0+(ptrdiff_t)80*3);
__m512 sh10 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*0+(ptrdiff_t)80*3);
__m512 add10 = _mm512_fnmadd_ps(me10, mul10, sh10);
__m512 lo10 = _mm512_permutex2var_ps(mul10, xlo2, add10);
__m512 hi10 = _mm512_permutex2var_ps(mul10, xhi2, add10);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*0+(ptrdiff_t)640*3, lo10);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*1+(ptrdiff_t)640*3, hi10);
}

static void ResNet50BnSimplify3(
float*restrict means3,
float*restrict variances3,
float*restrict scales3,
float*restrict shifts3,
char*restrict mas6
) {
__m512 eps3 = _mm512_set1_ps(1e-05f);
__m512i xlo3 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi3 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i29 = 0; i29 < 6; ++i29) {
__m512 va11 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*0+(ptrdiff_t)80*i29);
__m512 va12 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*1+(ptrdiff_t)80*i29);
__m512 va13 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*2+(ptrdiff_t)80*i29);
__m512 va14 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*3+(ptrdiff_t)80*i29);
__m512 va15 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*4+(ptrdiff_t)80*i29);
__m512 rcp11 = ResNet50Rsqrt1(_mm512_add_ps(eps3, va11));
__m512 rcp12 = ResNet50Rsqrt1(_mm512_add_ps(eps3, va12));
__m512 rcp13 = ResNet50Rsqrt1(_mm512_add_ps(eps3, va13));
__m512 rcp14 = ResNet50Rsqrt1(_mm512_add_ps(eps3, va14));
__m512 rcp15 = ResNet50Rsqrt1(_mm512_add_ps(eps3, va15));
__m512 sc11 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*0+(ptrdiff_t)80*i29);
__m512 sc12 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*1+(ptrdiff_t)80*i29);
__m512 sc13 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*2+(ptrdiff_t)80*i29);
__m512 sc14 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*3+(ptrdiff_t)80*i29);
__m512 sc15 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*4+(ptrdiff_t)80*i29);
__m512 mul11 = _mm512_mul_ps(rcp11, sc11);
__m512 mul12 = _mm512_mul_ps(rcp12, sc12);
__m512 mul13 = _mm512_mul_ps(rcp13, sc13);
__m512 mul14 = _mm512_mul_ps(rcp14, sc14);
__m512 mul15 = _mm512_mul_ps(rcp15, sc15);
__m512 me11 = _mm512_loadu_ps(means3+(ptrdiff_t)16*0+(ptrdiff_t)80*i29);
__m512 me12 = _mm512_loadu_ps(means3+(ptrdiff_t)16*1+(ptrdiff_t)80*i29);
__m512 me13 = _mm512_loadu_ps(means3+(ptrdiff_t)16*2+(ptrdiff_t)80*i29);
__m512 me14 = _mm512_loadu_ps(means3+(ptrdiff_t)16*3+(ptrdiff_t)80*i29);
__m512 me15 = _mm512_loadu_ps(means3+(ptrdiff_t)16*4+(ptrdiff_t)80*i29);
__m512 sh11 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*0+(ptrdiff_t)80*i29);
__m512 sh12 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*1+(ptrdiff_t)80*i29);
__m512 sh13 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*2+(ptrdiff_t)80*i29);
__m512 sh14 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*3+(ptrdiff_t)80*i29);
__m512 sh15 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*4+(ptrdiff_t)80*i29);
__m512 add11 = _mm512_fnmadd_ps(me11, mul11, sh11);
__m512 add12 = _mm512_fnmadd_ps(me12, mul12, sh12);
__m512 add13 = _mm512_fnmadd_ps(me13, mul13, sh13);
__m512 add14 = _mm512_fnmadd_ps(me14, mul14, sh14);
__m512 add15 = _mm512_fnmadd_ps(me15, mul15, sh15);
__m512 lo11 = _mm512_permutex2var_ps(mul11, xlo3, add11);
__m512 lo12 = _mm512_permutex2var_ps(mul12, xlo3, add12);
__m512 lo13 = _mm512_permutex2var_ps(mul13, xlo3, add13);
__m512 lo14 = _mm512_permutex2var_ps(mul14, xlo3, add14);
__m512 lo15 = _mm512_permutex2var_ps(mul15, xlo3, add15);
__m512 hi11 = _mm512_permutex2var_ps(mul11, xhi3, add11);
__m512 hi12 = _mm512_permutex2var_ps(mul12, xhi3, add12);
__m512 hi13 = _mm512_permutex2var_ps(mul13, xhi3, add13);
__m512 hi14 = _mm512_permutex2var_ps(mul14, xhi3, add14);
__m512 hi15 = _mm512_permutex2var_ps(mul15, xhi3, add15);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*0+(ptrdiff_t)640*i29, lo11);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*1+(ptrdiff_t)640*i29, hi11);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*2+(ptrdiff_t)640*i29, lo12);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*3+(ptrdiff_t)640*i29, hi12);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*4+(ptrdiff_t)640*i29, lo13);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*5+(ptrdiff_t)640*i29, hi13);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*6+(ptrdiff_t)640*i29, lo14);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*7+(ptrdiff_t)640*i29, hi14);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*8+(ptrdiff_t)640*i29, lo15);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*9+(ptrdiff_t)640*i29, hi15);
}
__m512 va16 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*0+(ptrdiff_t)80*6);
__m512 va17 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*1+(ptrdiff_t)80*6);
__m512 rcp16 = ResNet50Rsqrt1(_mm512_add_ps(eps3, va16));
__m512 rcp17 = ResNet50Rsqrt1(_mm512_add_ps(eps3, va17));
__m512 sc16 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*0+(ptrdiff_t)80*6);
__m512 sc17 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*1+(ptrdiff_t)80*6);
__m512 mul16 = _mm512_mul_ps(rcp16, sc16);
__m512 mul17 = _mm512_mul_ps(rcp17, sc17);
__m512 me16 = _mm512_loadu_ps(means3+(ptrdiff_t)16*0+(ptrdiff_t)80*6);
__m512 me17 = _mm512_loadu_ps(means3+(ptrdiff_t)16*1+(ptrdiff_t)80*6);
__m512 sh16 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*0+(ptrdiff_t)80*6);
__m512 sh17 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*1+(ptrdiff_t)80*6);
__m512 add16 = _mm512_fnmadd_ps(me16, mul16, sh16);
__m512 add17 = _mm512_fnmadd_ps(me17, mul17, sh17);
__m512 lo16 = _mm512_permutex2var_ps(mul16, xlo3, add16);
__m512 lo17 = _mm512_permutex2var_ps(mul17, xlo3, add17);
__m512 hi16 = _mm512_permutex2var_ps(mul16, xhi3, add16);
__m512 hi17 = _mm512_permutex2var_ps(mul17, xhi3, add17);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*0+(ptrdiff_t)640*6, lo16);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*1+(ptrdiff_t)640*6, hi16);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*2+(ptrdiff_t)640*6, lo17);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*3+(ptrdiff_t)640*6, hi17);
}

static void ResNet50BnSimplify4(
float*restrict means4,
float*restrict variances4,
float*restrict scales4,
float*restrict shifts4,
char*restrict mas7
) {
__m512 eps4 = _mm512_set1_ps(1e-05f);
__m512i xlo4 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi4 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i30 = 0; i30 < 1; ++i30) {
__m512 va18 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*0+(ptrdiff_t)80*i30);
__m512 va19 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*1+(ptrdiff_t)80*i30);
__m512 va20 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*2+(ptrdiff_t)80*i30);
__m512 va21 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*3+(ptrdiff_t)80*i30);
__m512 va22 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*4+(ptrdiff_t)80*i30);
__m512 rcp18 = ResNet50Rsqrt1(_mm512_add_ps(eps4, va18));
__m512 rcp19 = ResNet50Rsqrt1(_mm512_add_ps(eps4, va19));
__m512 rcp20 = ResNet50Rsqrt1(_mm512_add_ps(eps4, va20));
__m512 rcp21 = ResNet50Rsqrt1(_mm512_add_ps(eps4, va21));
__m512 rcp22 = ResNet50Rsqrt1(_mm512_add_ps(eps4, va22));
__m512 sc18 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*0+(ptrdiff_t)80*i30);
__m512 sc19 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*1+(ptrdiff_t)80*i30);
__m512 sc20 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*2+(ptrdiff_t)80*i30);
__m512 sc21 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*3+(ptrdiff_t)80*i30);
__m512 sc22 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*4+(ptrdiff_t)80*i30);
__m512 mul18 = _mm512_mul_ps(rcp18, sc18);
__m512 mul19 = _mm512_mul_ps(rcp19, sc19);
__m512 mul20 = _mm512_mul_ps(rcp20, sc20);
__m512 mul21 = _mm512_mul_ps(rcp21, sc21);
__m512 mul22 = _mm512_mul_ps(rcp22, sc22);
__m512 me18 = _mm512_loadu_ps(means4+(ptrdiff_t)16*0+(ptrdiff_t)80*i30);
__m512 me19 = _mm512_loadu_ps(means4+(ptrdiff_t)16*1+(ptrdiff_t)80*i30);
__m512 me20 = _mm512_loadu_ps(means4+(ptrdiff_t)16*2+(ptrdiff_t)80*i30);
__m512 me21 = _mm512_loadu_ps(means4+(ptrdiff_t)16*3+(ptrdiff_t)80*i30);
__m512 me22 = _mm512_loadu_ps(means4+(ptrdiff_t)16*4+(ptrdiff_t)80*i30);
__m512 sh18 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*0+(ptrdiff_t)80*i30);
__m512 sh19 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*1+(ptrdiff_t)80*i30);
__m512 sh20 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*2+(ptrdiff_t)80*i30);
__m512 sh21 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*3+(ptrdiff_t)80*i30);
__m512 sh22 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*4+(ptrdiff_t)80*i30);
__m512 add18 = _mm512_fnmadd_ps(me18, mul18, sh18);
__m512 add19 = _mm512_fnmadd_ps(me19, mul19, sh19);
__m512 add20 = _mm512_fnmadd_ps(me20, mul20, sh20);
__m512 add21 = _mm512_fnmadd_ps(me21, mul21, sh21);
__m512 add22 = _mm512_fnmadd_ps(me22, mul22, sh22);
__m512 lo18 = _mm512_permutex2var_ps(mul18, xlo4, add18);
__m512 lo19 = _mm512_permutex2var_ps(mul19, xlo4, add19);
__m512 lo20 = _mm512_permutex2var_ps(mul20, xlo4, add20);
__m512 lo21 = _mm512_permutex2var_ps(mul21, xlo4, add21);
__m512 lo22 = _mm512_permutex2var_ps(mul22, xlo4, add22);
__m512 hi18 = _mm512_permutex2var_ps(mul18, xhi4, add18);
__m512 hi19 = _mm512_permutex2var_ps(mul19, xhi4, add19);
__m512 hi20 = _mm512_permutex2var_ps(mul20, xhi4, add20);
__m512 hi21 = _mm512_permutex2var_ps(mul21, xhi4, add21);
__m512 hi22 = _mm512_permutex2var_ps(mul22, xhi4, add22);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*0+(ptrdiff_t)640*i30, lo18);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*1+(ptrdiff_t)640*i30, hi18);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*2+(ptrdiff_t)640*i30, lo19);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*3+(ptrdiff_t)640*i30, hi19);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*4+(ptrdiff_t)640*i30, lo20);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*5+(ptrdiff_t)640*i30, hi20);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*6+(ptrdiff_t)640*i30, lo21);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*7+(ptrdiff_t)640*i30, hi21);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*8+(ptrdiff_t)640*i30, lo22);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*9+(ptrdiff_t)640*i30, hi22);
}
__m512 va23 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 va24 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*1+(ptrdiff_t)80*1);
__m512 va25 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*2+(ptrdiff_t)80*1);
__m512 rcp23 = ResNet50Rsqrt1(_mm512_add_ps(eps4, va23));
__m512 rcp24 = ResNet50Rsqrt1(_mm512_add_ps(eps4, va24));
__m512 rcp25 = ResNet50Rsqrt1(_mm512_add_ps(eps4, va25));
__m512 sc23 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 sc24 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*1+(ptrdiff_t)80*1);
__m512 sc25 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*2+(ptrdiff_t)80*1);
__m512 mul23 = _mm512_mul_ps(rcp23, sc23);
__m512 mul24 = _mm512_mul_ps(rcp24, sc24);
__m512 mul25 = _mm512_mul_ps(rcp25, sc25);
__m512 me23 = _mm512_loadu_ps(means4+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 me24 = _mm512_loadu_ps(means4+(ptrdiff_t)16*1+(ptrdiff_t)80*1);
__m512 me25 = _mm512_loadu_ps(means4+(ptrdiff_t)16*2+(ptrdiff_t)80*1);
__m512 sh23 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 sh24 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*1+(ptrdiff_t)80*1);
__m512 sh25 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*2+(ptrdiff_t)80*1);
__m512 add23 = _mm512_fnmadd_ps(me23, mul23, sh23);
__m512 add24 = _mm512_fnmadd_ps(me24, mul24, sh24);
__m512 add25 = _mm512_fnmadd_ps(me25, mul25, sh25);
__m512 lo23 = _mm512_permutex2var_ps(mul23, xlo4, add23);
__m512 lo24 = _mm512_permutex2var_ps(mul24, xlo4, add24);
__m512 lo25 = _mm512_permutex2var_ps(mul25, xlo4, add25);
__m512 hi23 = _mm512_permutex2var_ps(mul23, xhi4, add23);
__m512 hi24 = _mm512_permutex2var_ps(mul24, xhi4, add24);
__m512 hi25 = _mm512_permutex2var_ps(mul25, xhi4, add25);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*0+(ptrdiff_t)640*1, lo23);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*1+(ptrdiff_t)640*1, hi23);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*2+(ptrdiff_t)640*1, lo24);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*3+(ptrdiff_t)640*1, hi24);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*4+(ptrdiff_t)640*1, lo25);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*5+(ptrdiff_t)640*1, hi25);
}

static void ResNet50BnSimplify5(
float*restrict means5,
float*restrict variances5,
float*restrict scales5,
float*restrict shifts5,
char*restrict mas10
) {
__m512 eps5 = _mm512_set1_ps(1e-05f);
__m512i xlo5 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi5 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i48 = 0; i48 < 12; ++i48) {
__m512 va26 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*0+(ptrdiff_t)80*i48);
__m512 va27 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*1+(ptrdiff_t)80*i48);
__m512 va28 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*2+(ptrdiff_t)80*i48);
__m512 va29 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*3+(ptrdiff_t)80*i48);
__m512 va30 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*4+(ptrdiff_t)80*i48);
__m512 rcp26 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va26));
__m512 rcp27 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va27));
__m512 rcp28 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va28));
__m512 rcp29 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va29));
__m512 rcp30 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va30));
__m512 sc26 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*0+(ptrdiff_t)80*i48);
__m512 sc27 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*1+(ptrdiff_t)80*i48);
__m512 sc28 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*2+(ptrdiff_t)80*i48);
__m512 sc29 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*3+(ptrdiff_t)80*i48);
__m512 sc30 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*4+(ptrdiff_t)80*i48);
__m512 mul26 = _mm512_mul_ps(rcp26, sc26);
__m512 mul27 = _mm512_mul_ps(rcp27, sc27);
__m512 mul28 = _mm512_mul_ps(rcp28, sc28);
__m512 mul29 = _mm512_mul_ps(rcp29, sc29);
__m512 mul30 = _mm512_mul_ps(rcp30, sc30);
__m512 me26 = _mm512_loadu_ps(means5+(ptrdiff_t)16*0+(ptrdiff_t)80*i48);
__m512 me27 = _mm512_loadu_ps(means5+(ptrdiff_t)16*1+(ptrdiff_t)80*i48);
__m512 me28 = _mm512_loadu_ps(means5+(ptrdiff_t)16*2+(ptrdiff_t)80*i48);
__m512 me29 = _mm512_loadu_ps(means5+(ptrdiff_t)16*3+(ptrdiff_t)80*i48);
__m512 me30 = _mm512_loadu_ps(means5+(ptrdiff_t)16*4+(ptrdiff_t)80*i48);
__m512 sh26 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*0+(ptrdiff_t)80*i48);
__m512 sh27 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*1+(ptrdiff_t)80*i48);
__m512 sh28 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*2+(ptrdiff_t)80*i48);
__m512 sh29 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*3+(ptrdiff_t)80*i48);
__m512 sh30 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*4+(ptrdiff_t)80*i48);
__m512 add26 = _mm512_fnmadd_ps(me26, mul26, sh26);
__m512 add27 = _mm512_fnmadd_ps(me27, mul27, sh27);
__m512 add28 = _mm512_fnmadd_ps(me28, mul28, sh28);
__m512 add29 = _mm512_fnmadd_ps(me29, mul29, sh29);
__m512 add30 = _mm512_fnmadd_ps(me30, mul30, sh30);
__m512 lo26 = _mm512_permutex2var_ps(mul26, xlo5, add26);
__m512 lo27 = _mm512_permutex2var_ps(mul27, xlo5, add27);
__m512 lo28 = _mm512_permutex2var_ps(mul28, xlo5, add28);
__m512 lo29 = _mm512_permutex2var_ps(mul29, xlo5, add29);
__m512 lo30 = _mm512_permutex2var_ps(mul30, xlo5, add30);
__m512 hi26 = _mm512_permutex2var_ps(mul26, xhi5, add26);
__m512 hi27 = _mm512_permutex2var_ps(mul27, xhi5, add27);
__m512 hi28 = _mm512_permutex2var_ps(mul28, xhi5, add28);
__m512 hi29 = _mm512_permutex2var_ps(mul29, xhi5, add29);
__m512 hi30 = _mm512_permutex2var_ps(mul30, xhi5, add30);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*0+(ptrdiff_t)640*i48, lo26);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*1+(ptrdiff_t)640*i48, hi26);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*2+(ptrdiff_t)640*i48, lo27);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*3+(ptrdiff_t)640*i48, hi27);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*4+(ptrdiff_t)640*i48, lo28);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*5+(ptrdiff_t)640*i48, hi28);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*6+(ptrdiff_t)640*i48, lo29);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*7+(ptrdiff_t)640*i48, hi29);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*8+(ptrdiff_t)640*i48, lo30);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*9+(ptrdiff_t)640*i48, hi30);
}
__m512 va31 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*0+(ptrdiff_t)80*12);
__m512 va32 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*1+(ptrdiff_t)80*12);
__m512 va33 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*2+(ptrdiff_t)80*12);
__m512 va34 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*3+(ptrdiff_t)80*12);
__m512 rcp31 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va31));
__m512 rcp32 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va32));
__m512 rcp33 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va33));
__m512 rcp34 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va34));
__m512 sc31 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*0+(ptrdiff_t)80*12);
__m512 sc32 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*1+(ptrdiff_t)80*12);
__m512 sc33 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*2+(ptrdiff_t)80*12);
__m512 sc34 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*3+(ptrdiff_t)80*12);
__m512 mul31 = _mm512_mul_ps(rcp31, sc31);
__m512 mul32 = _mm512_mul_ps(rcp32, sc32);
__m512 mul33 = _mm512_mul_ps(rcp33, sc33);
__m512 mul34 = _mm512_mul_ps(rcp34, sc34);
__m512 me31 = _mm512_loadu_ps(means5+(ptrdiff_t)16*0+(ptrdiff_t)80*12);
__m512 me32 = _mm512_loadu_ps(means5+(ptrdiff_t)16*1+(ptrdiff_t)80*12);
__m512 me33 = _mm512_loadu_ps(means5+(ptrdiff_t)16*2+(ptrdiff_t)80*12);
__m512 me34 = _mm512_loadu_ps(means5+(ptrdiff_t)16*3+(ptrdiff_t)80*12);
__m512 sh31 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*0+(ptrdiff_t)80*12);
__m512 sh32 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*1+(ptrdiff_t)80*12);
__m512 sh33 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*2+(ptrdiff_t)80*12);
__m512 sh34 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*3+(ptrdiff_t)80*12);
__m512 add31 = _mm512_fnmadd_ps(me31, mul31, sh31);
__m512 add32 = _mm512_fnmadd_ps(me32, mul32, sh32);
__m512 add33 = _mm512_fnmadd_ps(me33, mul33, sh33);
__m512 add34 = _mm512_fnmadd_ps(me34, mul34, sh34);
__m512 lo31 = _mm512_permutex2var_ps(mul31, xlo5, add31);
__m512 lo32 = _mm512_permutex2var_ps(mul32, xlo5, add32);
__m512 lo33 = _mm512_permutex2var_ps(mul33, xlo5, add33);
__m512 lo34 = _mm512_permutex2var_ps(mul34, xlo5, add34);
__m512 hi31 = _mm512_permutex2var_ps(mul31, xhi5, add31);
__m512 hi32 = _mm512_permutex2var_ps(mul32, xhi5, add32);
__m512 hi33 = _mm512_permutex2var_ps(mul33, xhi5, add33);
__m512 hi34 = _mm512_permutex2var_ps(mul34, xhi5, add34);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*0+(ptrdiff_t)640*12, lo31);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*1+(ptrdiff_t)640*12, hi31);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*2+(ptrdiff_t)640*12, lo32);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*3+(ptrdiff_t)640*12, hi32);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*4+(ptrdiff_t)640*12, lo33);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*5+(ptrdiff_t)640*12, hi33);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*6+(ptrdiff_t)640*12, lo34);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*7+(ptrdiff_t)640*12, hi34);
}

static void ResNet50BnSimplify6(
float*restrict means6,
float*restrict variances6,
float*restrict scales6,
float*restrict shifts6,
char*restrict mas13
) {
__m512 eps6 = _mm512_set1_ps(1e-05f);
__m512i xlo6 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi6 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i66 = 0; i66 < 25; ++i66) {
__m512 va35 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*0+(ptrdiff_t)80*i66);
__m512 va36 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*1+(ptrdiff_t)80*i66);
__m512 va37 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*2+(ptrdiff_t)80*i66);
__m512 va38 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*3+(ptrdiff_t)80*i66);
__m512 va39 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*4+(ptrdiff_t)80*i66);
__m512 rcp35 = ResNet50Rsqrt1(_mm512_add_ps(eps6, va35));
__m512 rcp36 = ResNet50Rsqrt1(_mm512_add_ps(eps6, va36));
__m512 rcp37 = ResNet50Rsqrt1(_mm512_add_ps(eps6, va37));
__m512 rcp38 = ResNet50Rsqrt1(_mm512_add_ps(eps6, va38));
__m512 rcp39 = ResNet50Rsqrt1(_mm512_add_ps(eps6, va39));
__m512 sc35 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*0+(ptrdiff_t)80*i66);
__m512 sc36 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*1+(ptrdiff_t)80*i66);
__m512 sc37 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*2+(ptrdiff_t)80*i66);
__m512 sc38 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*3+(ptrdiff_t)80*i66);
__m512 sc39 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*4+(ptrdiff_t)80*i66);
__m512 mul35 = _mm512_mul_ps(rcp35, sc35);
__m512 mul36 = _mm512_mul_ps(rcp36, sc36);
__m512 mul37 = _mm512_mul_ps(rcp37, sc37);
__m512 mul38 = _mm512_mul_ps(rcp38, sc38);
__m512 mul39 = _mm512_mul_ps(rcp39, sc39);
__m512 me35 = _mm512_loadu_ps(means6+(ptrdiff_t)16*0+(ptrdiff_t)80*i66);
__m512 me36 = _mm512_loadu_ps(means6+(ptrdiff_t)16*1+(ptrdiff_t)80*i66);
__m512 me37 = _mm512_loadu_ps(means6+(ptrdiff_t)16*2+(ptrdiff_t)80*i66);
__m512 me38 = _mm512_loadu_ps(means6+(ptrdiff_t)16*3+(ptrdiff_t)80*i66);
__m512 me39 = _mm512_loadu_ps(means6+(ptrdiff_t)16*4+(ptrdiff_t)80*i66);
__m512 sh35 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*0+(ptrdiff_t)80*i66);
__m512 sh36 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*1+(ptrdiff_t)80*i66);
__m512 sh37 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*2+(ptrdiff_t)80*i66);
__m512 sh38 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*3+(ptrdiff_t)80*i66);
__m512 sh39 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*4+(ptrdiff_t)80*i66);
__m512 add35 = _mm512_fnmadd_ps(me35, mul35, sh35);
__m512 add36 = _mm512_fnmadd_ps(me36, mul36, sh36);
__m512 add37 = _mm512_fnmadd_ps(me37, mul37, sh37);
__m512 add38 = _mm512_fnmadd_ps(me38, mul38, sh38);
__m512 add39 = _mm512_fnmadd_ps(me39, mul39, sh39);
__m512 lo35 = _mm512_permutex2var_ps(mul35, xlo6, add35);
__m512 lo36 = _mm512_permutex2var_ps(mul36, xlo6, add36);
__m512 lo37 = _mm512_permutex2var_ps(mul37, xlo6, add37);
__m512 lo38 = _mm512_permutex2var_ps(mul38, xlo6, add38);
__m512 lo39 = _mm512_permutex2var_ps(mul39, xlo6, add39);
__m512 hi35 = _mm512_permutex2var_ps(mul35, xhi6, add35);
__m512 hi36 = _mm512_permutex2var_ps(mul36, xhi6, add36);
__m512 hi37 = _mm512_permutex2var_ps(mul37, xhi6, add37);
__m512 hi38 = _mm512_permutex2var_ps(mul38, xhi6, add38);
__m512 hi39 = _mm512_permutex2var_ps(mul39, xhi6, add39);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*0+(ptrdiff_t)640*i66, lo35);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*1+(ptrdiff_t)640*i66, hi35);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*2+(ptrdiff_t)640*i66, lo36);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*3+(ptrdiff_t)640*i66, hi36);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*4+(ptrdiff_t)640*i66, lo37);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*5+(ptrdiff_t)640*i66, hi37);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*6+(ptrdiff_t)640*i66, lo38);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*7+(ptrdiff_t)640*i66, hi38);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*8+(ptrdiff_t)640*i66, lo39);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*9+(ptrdiff_t)640*i66, hi39);
}
__m512 va40 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*0+(ptrdiff_t)80*25);
__m512 va41 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*1+(ptrdiff_t)80*25);
__m512 va42 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*2+(ptrdiff_t)80*25);
__m512 rcp40 = ResNet50Rsqrt1(_mm512_add_ps(eps6, va40));
__m512 rcp41 = ResNet50Rsqrt1(_mm512_add_ps(eps6, va41));
__m512 rcp42 = ResNet50Rsqrt1(_mm512_add_ps(eps6, va42));
__m512 sc40 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*0+(ptrdiff_t)80*25);
__m512 sc41 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*1+(ptrdiff_t)80*25);
__m512 sc42 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*2+(ptrdiff_t)80*25);
__m512 mul40 = _mm512_mul_ps(rcp40, sc40);
__m512 mul41 = _mm512_mul_ps(rcp41, sc41);
__m512 mul42 = _mm512_mul_ps(rcp42, sc42);
__m512 me40 = _mm512_loadu_ps(means6+(ptrdiff_t)16*0+(ptrdiff_t)80*25);
__m512 me41 = _mm512_loadu_ps(means6+(ptrdiff_t)16*1+(ptrdiff_t)80*25);
__m512 me42 = _mm512_loadu_ps(means6+(ptrdiff_t)16*2+(ptrdiff_t)80*25);
__m512 sh40 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*0+(ptrdiff_t)80*25);
__m512 sh41 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*1+(ptrdiff_t)80*25);
__m512 sh42 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*2+(ptrdiff_t)80*25);
__m512 add40 = _mm512_fnmadd_ps(me40, mul40, sh40);
__m512 add41 = _mm512_fnmadd_ps(me41, mul41, sh41);
__m512 add42 = _mm512_fnmadd_ps(me42, mul42, sh42);
__m512 lo40 = _mm512_permutex2var_ps(mul40, xlo6, add40);
__m512 lo41 = _mm512_permutex2var_ps(mul41, xlo6, add41);
__m512 lo42 = _mm512_permutex2var_ps(mul42, xlo6, add42);
__m512 hi40 = _mm512_permutex2var_ps(mul40, xhi6, add40);
__m512 hi41 = _mm512_permutex2var_ps(mul41, xhi6, add41);
__m512 hi42 = _mm512_permutex2var_ps(mul42, xhi6, add42);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*0+(ptrdiff_t)640*25, lo40);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*1+(ptrdiff_t)640*25, hi40);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*2+(ptrdiff_t)640*25, lo41);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*3+(ptrdiff_t)640*25, hi41);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*4+(ptrdiff_t)640*25, lo42);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*5+(ptrdiff_t)640*25, hi42);
}

/*
 * Pooling worker for one task index (pt82[0]).
 *
 * Each task reads a 40960-byte slice of tensors[0] and writes a 512-byte
 * slice of tensors[1]. The input slice is walked in 64 steps of 640 bytes;
 * every step covers two planes that start 320 bytes apart, and from each
 * plane 49 floats (16+16+16+1) are summed. Sums are scaled by
 * 2.0408163e-02f (about 1/49), so every output float is the mean of 49
 * inputs -- presumably a global average pool over a 7x7 plane (confirm
 * against the graph).
 *
 * Results are staged two lanes per step in a 16-lane register and written
 * out once every 8 steps.
 */
static void ResNet50Glopl1Callee1(ResNet50ThreaderTask1* task154, int64_t* pt82) {
    char** bufs = task154->any1;
    const ptrdiff_t slab = pt82[0];
    char*restrict src = bufs[0]+(ptrdiff_t)40960*slab;
    char*restrict dst = bufs[1]+(ptrdiff_t)512*slab;

    /* Loop-invariant constants. */
    const __m512i pickLane0 = _mm512_set_epi32(16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0);
    const __m512i pickLane1 = _mm512_set_epi32(17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1);
    const __m512 scale = _mm512_set1_ps(2.0408163e-02f);

    __m512 staged = _mm512_setzero_ps();
    /* Lanes of `staged` that have not yet received their final value. */
    __mmask16 open = 65535;

    for (ptrdiff_t step = 0; step < 64; ++step) {
        char* planeA = src+(ptrdiff_t)640*step;
        char* planeB = planeA+(ptrdiff_t)320;

        /* 49 floats per plane: three full vectors plus a single trailing lane. */
        __m512 sumA = _mm512_maskz_loadu_ps(65535, planeA+(ptrdiff_t)0);
        __m512 midA = _mm512_maskz_loadu_ps(65535, planeA+(ptrdiff_t)64);
        __m512 topA = _mm512_maskz_loadu_ps(65535, planeA+(ptrdiff_t)128);
        __m512 endA = _mm512_maskz_loadu_ps(1, planeA+(ptrdiff_t)192);
        __m512 sumB = _mm512_maskz_loadu_ps(65535, planeB+(ptrdiff_t)0);
        __m512 midB = _mm512_maskz_loadu_ps(65535, planeB+(ptrdiff_t)64);
        __m512 topB = _mm512_maskz_loadu_ps(65535, planeB+(ptrdiff_t)128);
        __m512 endB = _mm512_maskz_loadu_ps(1, planeB+(ptrdiff_t)192);

        /* Collapse the four vectors of each plane into one. */
        sumA = _mm512_mask_add_ps(sumA, 65535, sumA, topA);
        midA = _mm512_mask_add_ps(midA, 1, midA, endA);
        sumA = _mm512_mask_add_ps(sumA, 65535, sumA, midA);
        sumB = _mm512_mask_add_ps(sumB, 65535, sumB, topB);
        midB = _mm512_mask_add_ps(midB, 1, midB, endB);
        sumB = _mm512_mask_add_ps(sumB, 65535, sumB, midB);

        /* Fold 16 lanes -> 8 -> 4 -> 2 for plane A. */
        __m512 foldA = _mm512_shuffle_f32x4(sumA, sumA, 238);
        sumA = _mm512_mask_add_ps(sumA, 255, sumA, foldA);
        foldA = _mm512_shuffle_f32x4(sumA, sumA, 1);
        sumA = _mm512_mask_add_ps(sumA, 15, sumA, foldA);
        foldA = _mm512_shuffle_ps(sumA, sumA, 238);
        sumA = _mm512_mask_add_ps(sumA, 3, sumA, foldA);

        /* Same fold for plane B. */
        __m512 foldB = _mm512_shuffle_f32x4(sumB, sumB, 238);
        sumB = _mm512_mask_add_ps(sumB, 255, sumB, foldB);
        foldB = _mm512_shuffle_f32x4(sumB, sumB, 1);
        sumB = _mm512_mask_add_ps(sumB, 15, sumB, foldB);
        foldB = _mm512_shuffle_ps(sumB, sumB, 238);
        sumB = _mm512_mask_add_ps(sumB, 3, sumB, foldB);

        /* Final 2 -> 1 fold; even lanes get plane A's total, odd lanes plane B's. */
        __m512 lane1Parts = _mm512_permutex2var_ps(sumA, pickLane1, sumB);
        __m512 pairTotals = _mm512_permutex2var_ps(sumA, pickLane0, sumB);
        pairTotals = _mm512_mask_add_ps(pairTotals, 65535, pairTotals, lane1Parts);

        /*
         * Overwrite every still-open lane with the newest pair, then close the
         * two lowest open lanes; lanes 2k and 2k+1 therefore keep the pair
         * produced k steps after the last flush.
         */
        staged = _mm512_mask_mov_ps(staged, open, pairTotals);
        open &= open<<2;
        if (__builtin_expect(open != 0, 1)) {
            continue;
        }

        /* All 16 lanes are final: scale sums to means and emit 16 outputs. */
        staged = _mm512_mul_ps(staged, scale);
        _mm512_mask_storeu_ps(dst+(ptrdiff_t)4*((ptrdiff_t)2*step-14), 65535, staged);
        open = 65535;
    }
}

/*
 * Hands the pooling worker (ResNet50Glopl1Callee1) to the thread team.
 * The task is described as one-dimensional with hull1[0] = 16; the worker
 * reads its index from pt[0] (presumably 0..15 -- confirm against the
 * threader). `tensors149` is passed through untouched as the worker's
 * tensor-pointer array.
 */
static void ResNet50Glopl1(ResNet50ThreaderTeam1* team87, char** tensors149) {
    ResNet50ThreaderTask1 job;
    job.nd1 = 1;
    job.hull1[0] = 16;
    job.any1 = tensors149;
    job.callee1 = ResNet50Glopl1Callee1;
    ResNet50ThreaderDo1(team87, &job);
}

// Pooling worker for one task point (b43, e5, c4) = (pt11[0], pt11[1], pt11[2]).
// Reads float rows that are 448 bytes (112 floats) apart from tensors10[0] and
// writes 56 output rows of 56 floats (224-byte pitch) to tensors10[1].
// Every output element is the maximum over up to three adjacent input rows and
// up to three adjacent input columns, with the window advancing by two rows and
// two columns per output -- consistent with the graph's Max3x3Stride2 pooling
// with padding 1 (NOTE(review): confirm this is the op this kernel serves).
static void ResNet50Thrpl1Callee1(ResNet50ThreaderTask1* task12, int64_t* pt11) {
char** tensors10 = task12->any1;
ptrdiff_t b43 = pt11[0];
ptrdiff_t e5 = pt11[1];
ptrdiff_t c4 = pt11[2];
// ptr1 is biased one row (448 bytes) backwards, so ptr1+0 / +448 / +896 are the
// rows above / at / below a window centre. The first output row below only
// reads +448 and +896, i.e. it never touches the row in front of the plane.
char*restrict ptr1 = tensors10[0]-(ptrdiff_t)448+(ptrdiff_t)50176*b43+(ptrdiff_t)448*e5+(ptrdiff_t)50240*c4;
char*restrict ptr2 = tensors10[1]+(ptrdiff_t)12544*b43+(ptrdiff_t)224*e5+(ptrdiff_t)12608*c4;
// Single-iteration loop: one plane per task point.
for (ptrdiff_t i10 = 0; i10 < 1; ++i10) {
// ---- First output row: vertical max over two input rows only. ----
// First column block: input columns 0..31 -> output columns 0..15.
__m512 in1 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
__m512 in2 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
__m512 dat894 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
__m512 dat895 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
in1 = _mm512_max_ps(in1, dat894);
in2 = _mm512_max_ps(in2, dat895);
// Index tables into the 32-float pair (first operand = 0..15, second = 16..31):
// pm57 = even columns (window centres), pm58 = odd columns (right neighbours),
// pm59 = left neighbours: index 31 in lane 0, then odd columns 1..29.
__m512i pm57 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pm58 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512i pm59 = _mm512_set_epi32(29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, 31);
__m512 out1 = _mm512_permutex2var_ps(in1, pm57, in2);
__m512 pack263 = _mm512_permutex2var_ps(in1, pm58, in2);
__m512 pack264 = _mm512_permutex2var_ps(in1, pm59, in2);
out1 = _mm512_mask_max_ps(out1, 65535, out1, pack263);
// Mask 65534 skips lane 0: the very first column has no left neighbour.
out1 = _mm512_mask_max_ps(out1, 65534, out1, pack264);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*0+(ptrdiff_t)64*0, 65535, out1);
// Middle column blocks (input columns 32..95). in2 carries the previous block's
// upper half so its last lane can serve as this block's left neighbour.
for (ptrdiff_t k44 = 1; k44 < 3; ++k44) {
__m512 in3 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
__m512 in4 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
__m512 dat896 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
__m512 dat897 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
in3 = _mm512_max_ps(in3, dat896);
in4 = _mm512_max_ps(in4, dat897);
// blend1 = in4 with lane 15 taken from the previous block, so that index 31 of
// pm59 fetches the column immediately left of this block.
__m512 blend1 = _mm512_mask_mov_ps(in4, 32768, in2);
__m512 out2 = _mm512_permutex2var_ps(in3, pm57, in4);
__m512 pack265 = _mm512_permutex2var_ps(in3, pm58, in4);
__m512 pack266 = _mm512_permutex2var_ps(in3, pm59, blend1);
out2 = _mm512_mask_max_ps(out2, 65535, out2, pack265);
out2 = _mm512_mask_max_ps(out2, 65535, out2, pack266);
in2 = in4;
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*0+(ptrdiff_t)64*k44, 65535, out2);
}
// Last column block: only 16 input columns remain (96..111) -> 8 outputs.
// The single-source permute uses only the low 4 bits of each index, so index 31
// selects lane 15 of blend2, which holds the previous block's last column.
__m512 in5 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*3);
__m512 dat898 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*3);
in5 = _mm512_max_ps(in5, dat898);
__m512 blend2 = _mm512_mask_mov_ps(in5, 32768, in2);
__m512 out3 = _mm512_permutexvar_ps(pm57, in5);
__m512 pack267 = _mm512_permutexvar_ps(pm58, in5);
__m512 pack268 = _mm512_permutexvar_ps(pm59, blend2);
out3 = _mm512_mask_max_ps(out3, 255, out3, pack267);
out3 = _mm512_mask_max_ps(out3, 255, out3, pack268);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*0+(ptrdiff_t)64*3, 255, out3);
// ---- Remaining 55 output rows: vertical max over three input rows. ----
// Output row j6 reads the rows at ptr1 + 896*j6 + {0, 448, 896}.
for (ptrdiff_t j6 = 1; j6 < 56; ++j6) {
__m512 in6 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)0+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 in7 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)64+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat899 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat901 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat900 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat902 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
in6 = _mm512_max_ps(in6, dat899);
in7 = _mm512_max_ps(in7, dat901);
in6 = _mm512_max_ps(in6, dat900);
in7 = _mm512_max_ps(in7, dat902);
// Same even / odd / left-neighbour index tables as pm57..pm59 above.
__m512i pm60 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pm61 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512i pm62 = _mm512_set_epi32(29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, 31);
__m512 out4 = _mm512_permutex2var_ps(in6, pm60, in7);
__m512 pack269 = _mm512_permutex2var_ps(in6, pm61, in7);
__m512 pack270 = _mm512_permutex2var_ps(in6, pm62, in7);
out4 = _mm512_mask_max_ps(out4, 65535, out4, pack269);
// Lane 0 is again excluded: no column exists left of column 0.
out4 = _mm512_mask_max_ps(out4, 65534, out4, pack270);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*j6+(ptrdiff_t)64*0, 65535, out4);
// Middle column blocks; in7 carries the previous block's upper half.
for (ptrdiff_t k45 = 1; k45 < 3; ++k45) {
__m512 in8 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)0+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 in9 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)64+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat903 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat905 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat904 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat906 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
in8 = _mm512_max_ps(in8, dat903);
in9 = _mm512_max_ps(in9, dat905);
in8 = _mm512_max_ps(in8, dat904);
in9 = _mm512_max_ps(in9, dat906);
__m512 blend3 = _mm512_mask_mov_ps(in9, 32768, in7);
__m512 out5 = _mm512_permutex2var_ps(in8, pm60, in9);
__m512 pack271 = _mm512_permutex2var_ps(in8, pm61, in9);
__m512 pack272 = _mm512_permutex2var_ps(in8, pm62, blend3);
out5 = _mm512_mask_max_ps(out5, 65535, out5, pack271);
out5 = _mm512_mask_max_ps(out5, 65535, out5, pack272);
in7 = in9;
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*j6+(ptrdiff_t)64*k45, 65535, out5);
}
// Last column block of this row: 16 input columns -> 8 outputs (mask 255).
__m512 in10 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)0+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*3);
__m512 dat907 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*3);
__m512 dat908 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*3);
in10 = _mm512_max_ps(in10, dat907);
in10 = _mm512_max_ps(in10, dat908);
__m512 blend4 = _mm512_mask_mov_ps(in10, 32768, in7);
__m512 out6 = _mm512_permutexvar_ps(pm60, in10);
__m512 pack273 = _mm512_permutexvar_ps(pm61, in10);
__m512 pack274 = _mm512_permutexvar_ps(pm62, blend4);
out6 = _mm512_mask_max_ps(out6, 255, out6, pack273);
out6 = _mm512_mask_max_ps(out6, 255, out6, pack274);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*j6+(ptrdiff_t)64*3, 255, out6);
}
}
}

/*
 * Hands the pooling worker (ResNet50Thrpl1Callee1) to the thread team.
 * The task is described as three-dimensional with hull 1 x 1 x 64; the
 * worker reads its coordinates from pt[0..2], so only the third one varies
 * (presumably one task per channel -- confirm against the threader).
 * `tensors9` is forwarded unchanged as the worker's tensor-pointer array.
 */
static void ResNet50Thrpl1(ResNet50ThreaderTeam1* team18, char** tensors9) {
    ResNet50ThreaderTask1 job;
    job.nd1 = 3;
    job.hull1[0] = 1;
    job.hull1[1] = 1;
    job.hull1[2] = 64;
    job.any1 = tensors9;
    job.callee1 = ResNet50Thrpl1Callee1;
    ResNet50ThreaderDo1(team18, &job);
}

/*
 * Weight/bias repacking worker for one task index (pt83[0]).
 *
 * Source:      tensors[0] = float weights, read as rows 8192 bytes
 *              (2048 floats) apart, 131072 bytes per task;
 *              tensors[1] = float biases, 64 bytes per task.
 * Destination: tensors[2] = half-precision weights (65536 bytes per task),
 *              followed at byte offset 4096000 by a copy of the float biases.
 *
 * Tasks below 62 repack 16 weight rows and 16 biases. Any other task
 * (presumably only index 62) repacks 8 rows and 8 biases.
 *
 * Every 64-byte destination vector holds 16 halves of one row in its lower
 * 256 bits and 16 halves of the following row in its upper 256 bits.
 */
static void ResNet50FcArrange1Callee1(ResNet50ThreaderTask1* task156, int64_t* pt83) {
    char** bufs = task156->any1;
    const ptrdiff_t tile = pt83[0];
    char*restrict srcWts = bufs[0]+(ptrdiff_t)131072*tile;
    char*restrict srcBias = bufs[1]+(ptrdiff_t)64*tile;
    char*restrict dstWts = bufs[2]+(ptrdiff_t)65536*tile;
    char*restrict dstBias = bufs[2]+(ptrdiff_t)4096000+(ptrdiff_t)64*tile;

    if (tile < 62) {
        /* Full tile: 16 rows, consumed 16 input columns at a time. */
        for (ptrdiff_t col = 0; col < 128; ++col) {
            __m512i packed[8];
            /* Load and convert every row pair first ... */
            for (ptrdiff_t pair = 0; pair < 8; ++pair) {
                char* rowLo = srcWts+(ptrdiff_t)16384*pair+(ptrdiff_t)64*col;
                char* rowHi = rowLo+(ptrdiff_t)8192;
                __m512 lo = _mm512_maskz_loadu_ps(65535, rowLo);
                __m512 hi = _mm512_maskz_loadu_ps(65535, rowHi);
                __m256i loHalf = _mm512_cvtps_ph(lo, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
                __m256i hiHalf = _mm512_cvtps_ph(hi, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
                packed[pair] = _mm512_inserti64x4(_mm512_castsi256_si512(loHalf), hiHalf, 1);
            }
            /* ... then write the 512-byte group for this column block. */
            for (ptrdiff_t pair = 0; pair < 8; ++pair) {
                _mm512_mask_storeu_epi32(dstWts+(ptrdiff_t)512*col+(ptrdiff_t)64*pair, 65535, packed[pair]);
            }
        }
        __m512 biasFull = _mm512_maskz_loadu_ps(65535, srcBias);
        _mm512_mask_storeu_ps(dstBias, 65535, biasFull);
        return;
    }

    /* Short tile: 8 rows, consumed 32 input columns (two halves of 16) at a time. */
    for (ptrdiff_t blk = 0; blk < 64; ++blk) {
        __m512i packed[8];
        for (ptrdiff_t half = 0; half < 2; ++half) {
            for (ptrdiff_t pair = 0; pair < 4; ++pair) {
                char* rowLo = srcWts+(ptrdiff_t)16384*pair+(ptrdiff_t)64*half+(ptrdiff_t)128*blk;
                char* rowHi = rowLo+(ptrdiff_t)8192;
                __m512 lo = _mm512_maskz_loadu_ps(65535, rowLo);
                __m512 hi = _mm512_maskz_loadu_ps(65535, rowHi);
                __m256i loHalf = _mm512_cvtps_ph(lo, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
                __m256i hiHalf = _mm512_cvtps_ph(hi, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
                packed[4*half+pair] = _mm512_inserti64x4(_mm512_castsi256_si512(loHalf), hiHalf, 1);
            }
        }
        for (ptrdiff_t slot = 0; slot < 8; ++slot) {
            _mm512_mask_storeu_epi32(dstWts+(ptrdiff_t)512*blk+(ptrdiff_t)64*slot, 65535, packed[slot]);
        }
    }
    /* Only 8 biases belong to the short tile. */
    __m512 biasTail = _mm512_maskz_loadu_ps(255, srcBias);
    _mm512_mask_storeu_ps(dstBias, 255, biasTail);
}

/*
 * Hands the weight-repacking worker (ResNet50FcArrange1Callee1) to the
 * thread team as a one-dimensional task with hull1[0] = 63. The worker
 * treats indices below 62 as full 16-row tiles and anything else as the
 * 8-row remainder. `tensors151` is forwarded unchanged.
 */
static void ResNet50FcArrange1(ResNet50ThreaderTeam1* team88, char** tensors151) {
    ResNet50ThreaderTask1 job;
    job.nd1 = 1;
    job.hull1[0] = 63;
    job.any1 = tensors151;
    job.callee1 = ResNet50FcArrange1Callee1;
    ResNet50ThreaderDo1(team88, &job);
}

// Fully-connected worker for one task index t36 = pt84[0].
// tensors154[0]: half-precision weights (65536 bytes per task) followed, at byte
//                offset 4096000, by float biases (64 bytes per task).
// tensors154[1]: float input vector; 128 vectors of 16 floats are read.
// tensors154[2]: float outputs, 64 bytes per task.
// Tasks with t36 < 62 compute 16 outputs; any other task computes 8 outputs.
// Each output is the dot product of one weight row with the input, plus its bias.
static void ResNet50FcApply1Callee1(ResNet50ThreaderTask1* task158, int64_t* pt84) {
char** tensors154 = task158->any1;
ptrdiff_t t36 = pt84[0];
char*restrict wtPtr27 = tensors154[0]+(ptrdiff_t)65536*t36;
char*restrict biasPtr26 = tensors154[0]+(ptrdiff_t)4096000+(ptrdiff_t)64*t36;
char*restrict datPtr50 = tensors154[1];
char*restrict datPtr51 = tensors154[2]+(ptrdiff_t)64*t36;
// ---- Full tile: 16 output rows. ----
if (t36 < 62) {
for (ptrdiff_t i91 = 0; i91 < 1; ++i91) {
// One accumulator per output row: sum837 is row 0, ..., sum852 is row 15.
__m512 sum837 = _mm512_setzero_ps();
__m512 sum838 = _mm512_setzero_ps();
__m512 sum839 = _mm512_setzero_ps();
__m512 sum840 = _mm512_setzero_ps();
__m512 sum841 = _mm512_setzero_ps();
__m512 sum842 = _mm512_setzero_ps();
__m512 sum843 = _mm512_setzero_ps();
__m512 sum844 = _mm512_setzero_ps();
__m512 sum845 = _mm512_setzero_ps();
__m512 sum846 = _mm512_setzero_ps();
__m512 sum847 = _mm512_setzero_ps();
__m512 sum848 = _mm512_setzero_ps();
__m512 sum849 = _mm512_setzero_ps();
__m512 sum850 = _mm512_setzero_ps();
__m512 sum851 = _mm512_setzero_ps();
__m512 sum852 = _mm512_setzero_ps();
// Each iteration consumes 16 input floats and 512 bytes of weights
// (16 rows x 16 halves). Every 64-byte weight vector carries two rows:
// the lower 256 bits feed the *Lo value, the upper 256 bits the *Hi value.
for (ptrdiff_t j81 = 0; j81 < 128; ++j81) {
__m512i wts1 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)0+(ptrdiff_t)65536*i91+(ptrdiff_t)512*j81);
__m512 dat2555 = _mm512_maskz_loadu_ps(65535, datPtr50+(ptrdiff_t)0+(ptrdiff_t)64*j81);
__m512i wts2 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)64+(ptrdiff_t)65536*i91+(ptrdiff_t)512*j81);
__m512i wts3 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)128+(ptrdiff_t)65536*i91+(ptrdiff_t)512*j81);
__m512i wts4 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)192+(ptrdiff_t)65536*i91+(ptrdiff_t)512*j81);
// Widen half-precision weights to float before the multiply-accumulate.
__m512 wtLo17 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts1));
__m512 wtHi17 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts1, 1));
__m512 wtLo18 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts2));
__m512 wtHi18 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts2, 1));
__m512 wtLo19 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts3));
__m512 wtHi19 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts3, 1));
__m512 wtLo20 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts4));
__m512 wtHi20 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts4, 1));
sum837 = _mm512_fmadd_ps(wtLo17, dat2555, sum837);
sum838 = _mm512_fmadd_ps(wtHi17, dat2555, sum838);
sum839 = _mm512_fmadd_ps(wtLo18, dat2555, sum839);
sum840 = _mm512_fmadd_ps(wtHi18, dat2555, sum840);
sum841 = _mm512_fmadd_ps(wtLo19, dat2555, sum841);
sum842 = _mm512_fmadd_ps(wtHi19, dat2555, sum842);
sum843 = _mm512_fmadd_ps(wtLo20, dat2555, sum843);
sum844 = _mm512_fmadd_ps(wtHi20, dat2555, sum844);
// Second half of the 512-byte weight group: rows 8..15.
__m512i wts5 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)256+(ptrdiff_t)65536*i91+(ptrdiff_t)512*j81);
__m512i wts6 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)320+(ptrdiff_t)65536*i91+(ptrdiff_t)512*j81);
__m512i wts7 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)384+(ptrdiff_t)65536*i91+(ptrdiff_t)512*j81);
__m512i wts8 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)448+(ptrdiff_t)65536*i91+(ptrdiff_t)512*j81);
__m512 wtLo21 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts5));
__m512 wtHi21 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts5, 1));
__m512 wtLo22 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts6));
__m512 wtHi22 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts6, 1));
__m512 wtLo23 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts7));
__m512 wtHi23 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts7, 1));
__m512 wtLo24 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts8));
__m512 wtHi24 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts8, 1));
sum845 = _mm512_fmadd_ps(wtLo21, dat2555, sum845);
sum846 = _mm512_fmadd_ps(wtHi21, dat2555, sum846);
sum847 = _mm512_fmadd_ps(wtLo22, dat2555, sum847);
sum848 = _mm512_fmadd_ps(wtHi22, dat2555, sum848);
sum849 = _mm512_fmadd_ps(wtLo23, dat2555, sum849);
sum850 = _mm512_fmadd_ps(wtHi23, dat2555, sum850);
sum851 = _mm512_fmadd_ps(wtLo24, dat2555, sum851);
sum852 = _mm512_fmadd_ps(wtHi24, dat2555, sum852);
}
__m512 bias12 = _mm512_maskz_loadu_ps(65535, biasPtr26+(ptrdiff_t)0+(ptrdiff_t)64*i91);
// Horizontal reduction of the 16 accumulators into a single vector whose
// lane k is the full sum of row k. Each stage pairs registers, keeps the
// lower parts in place, gathers the upper parts into an "upper" temporary
// and adds the two, halving the number of partial sums per row.
// pm1*: interleave single floats of two registers; pm4*: interleave 4-float groups.
__m512i pm1Lo1 = _mm512_set_epi32(30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0);
__m512i pm1Hi1 = _mm512_set_epi32(31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1);
__m512i pm4Lo1 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
__m512i pm4Hi1 = _mm512_set_epi32(31, 30, 29, 28, 15, 14, 13, 12, 23, 22, 21, 20, 7, 6, 5, 4);
// Stage 1 (even rows): fold 512-bit halves; each register now holds two rows (8 floats each).
__m512 upper4 = _mm512_shuffle_f32x4(sum837, sum845, 238);
__m512 upper5 = _mm512_shuffle_f32x4(sum841, sum849, 238);
sum837 = _mm512_shuffle_f32x4(sum837, sum845, 68);
sum841 = _mm512_shuffle_f32x4(sum841, sum849, 68);
sum837 = _mm512_add_ps(sum837, upper4);
sum841 = _mm512_add_ps(sum841, upper5);
__m512 upper7 = _mm512_shuffle_f32x4(sum839, sum847, 238);
__m512 upper8 = _mm512_shuffle_f32x4(sum843, sum851, 238);
sum839 = _mm512_shuffle_f32x4(sum839, sum847, 68);
sum843 = _mm512_shuffle_f32x4(sum843, sum851, 68);
sum839 = _mm512_add_ps(sum839, upper7);
sum843 = _mm512_add_ps(sum843, upper8);
// Stage 2 (even rows): fold 128-bit groups; each register now holds four rows (4 floats each).
__m512 upper3 = _mm512_permutex2var_ps(sum837, pm4Hi1, sum841);
__m512 upper6 = _mm512_permutex2var_ps(sum839, pm4Hi1, sum843);
sum837 = _mm512_permutex2var_ps(sum837, pm4Lo1, sum841);
sum839 = _mm512_permutex2var_ps(sum839, pm4Lo1, sum843);
sum837 = _mm512_add_ps(sum837, upper3);
sum839 = _mm512_add_ps(sum839, upper6);
// Stages 1 and 2 repeated for the odd rows.
__m512 upper11 = _mm512_shuffle_f32x4(sum838, sum846, 238);
__m512 upper12 = _mm512_shuffle_f32x4(sum842, sum850, 238);
sum838 = _mm512_shuffle_f32x4(sum838, sum846, 68);
sum842 = _mm512_shuffle_f32x4(sum842, sum850, 68);
sum838 = _mm512_add_ps(sum838, upper11);
sum842 = _mm512_add_ps(sum842, upper12);
__m512 upper14 = _mm512_shuffle_f32x4(sum840, sum848, 238);
__m512 upper15 = _mm512_shuffle_f32x4(sum844, sum852, 238);
sum840 = _mm512_shuffle_f32x4(sum840, sum848, 68);
sum844 = _mm512_shuffle_f32x4(sum844, sum852, 68);
sum840 = _mm512_add_ps(sum840, upper14);
sum844 = _mm512_add_ps(sum844, upper15);
__m512 upper10 = _mm512_permutex2var_ps(sum838, pm4Hi1, sum842);
__m512 upper13 = _mm512_permutex2var_ps(sum840, pm4Hi1, sum844);
sum838 = _mm512_permutex2var_ps(sum838, pm4Lo1, sum842);
sum840 = _mm512_permutex2var_ps(sum840, pm4Lo1, sum844);
sum838 = _mm512_add_ps(sum838, upper10);
sum840 = _mm512_add_ps(sum840, upper13);
// Stage 3: fold float pairs inside every 128-bit group (2 floats per row remain).
__m512 upper2 = _mm512_shuffle_ps(sum837, sum839, 238);
__m512 upper9 = _mm512_shuffle_ps(sum838, sum840, 238);
sum837 = _mm512_shuffle_ps(sum837, sum839, 68);
sum838 = _mm512_shuffle_ps(sum838, sum840, 68);
sum837 = _mm512_add_ps(sum837, upper2);
sum838 = _mm512_add_ps(sum838, upper9);
// Stage 4: interleave even and odd rows and add the last two partials per row.
__m512 upper1 = _mm512_permutex2var_ps(sum837, pm1Hi1, sum838);
sum837 = _mm512_permutex2var_ps(sum837, pm1Lo1, sum838);
sum837 = _mm512_add_ps(sum837, upper1);
sum837 = _mm512_add_ps(sum837, bias12);
_mm512_mask_storeu_ps(datPtr51+(ptrdiff_t)0+(ptrdiff_t)64*i91, 65535, sum837);
}
return;
}
// ---- Short tile: 8 output rows. ----
for (ptrdiff_t i92 = 0; i92 < 1; ++i92) {
// sum853 is row 0, ..., sum860 is row 7.
__m512 sum853 = _mm512_setzero_ps();
__m512 sum854 = _mm512_setzero_ps();
__m512 sum855 = _mm512_setzero_ps();
__m512 sum856 = _mm512_setzero_ps();
__m512 sum857 = _mm512_setzero_ps();
__m512 sum858 = _mm512_setzero_ps();
__m512 sum859 = _mm512_setzero_ps();
__m512 sum860 = _mm512_setzero_ps();
// Each iteration consumes 16 input floats and 256 bytes of weights (8 rows x 16 halves).
for (ptrdiff_t j82 = 0; j82 < 128; ++j82) {
__m512i wts9 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)0+(ptrdiff_t)65536*i92+(ptrdiff_t)256*j82);
__m512 dat2556 = _mm512_maskz_loadu_ps(65535, datPtr50+(ptrdiff_t)0+(ptrdiff_t)64*j82);
__m512i wts10 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)64+(ptrdiff_t)65536*i92+(ptrdiff_t)256*j82);
__m512i wts11 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)128+(ptrdiff_t)65536*i92+(ptrdiff_t)256*j82);
__m512i wts12 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)192+(ptrdiff_t)65536*i92+(ptrdiff_t)256*j82);
__m512 wtLo25 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts9));
__m512 wtHi25 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts9, 1));
__m512 wtLo26 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts10));
__m512 wtHi26 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts10, 1));
__m512 wtLo27 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts11));
__m512 wtHi27 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts11, 1));
__m512 wtLo28 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts12));
__m512 wtHi28 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts12, 1));
sum853 = _mm512_fmadd_ps(wtLo25, dat2556, sum853);
sum854 = _mm512_fmadd_ps(wtHi25, dat2556, sum854);
sum855 = _mm512_fmadd_ps(wtLo26, dat2556, sum855);
sum856 = _mm512_fmadd_ps(wtHi26, dat2556, sum856);
sum857 = _mm512_fmadd_ps(wtLo27, dat2556, sum857);
sum858 = _mm512_fmadd_ps(wtHi27, dat2556, sum858);
sum859 = _mm512_fmadd_ps(wtLo28, dat2556, sum859);
sum860 = _mm512_fmadd_ps(wtHi28, dat2556, sum860);
}
// Only 8 biases / outputs exist for this tile (mask 255).
__m512 bias13 = _mm512_maskz_loadu_ps(255, biasPtr26+(ptrdiff_t)0+(ptrdiff_t)32*i92);
// Horizontal reduction of 8 accumulators; lanes 0..7 of the result hold rows 0..7.
__m512i pmEven1 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmOdd1 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512i pm4Lo2 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
__m512i pm4Hi2 = _mm512_set_epi32(31, 30, 29, 28, 15, 14, 13, 12, 23, 22, 21, 20, 7, 6, 5, 4);
// Stage 1: fold 512-bit halves (two rows per register afterwards).
__m512 upper18 = _mm512_shuffle_f32x4(sum853, sum857, 238);
__m512 upper19 = _mm512_shuffle_f32x4(sum855, sum859, 238);
sum853 = _mm512_shuffle_f32x4(sum853, sum857, 68);
sum855 = _mm512_shuffle_f32x4(sum855, sum859, 68);
sum853 = _mm512_add_ps(sum853, upper18);
sum855 = _mm512_add_ps(sum855, upper19);
__m512 upper21 = _mm512_shuffle_f32x4(sum854, sum858, 238);
__m512 upper22 = _mm512_shuffle_f32x4(sum856, sum860, 238);
sum854 = _mm512_shuffle_f32x4(sum854, sum858, 68);
sum856 = _mm512_shuffle_f32x4(sum856, sum860, 68);
sum854 = _mm512_add_ps(sum854, upper21);
sum856 = _mm512_add_ps(sum856, upper22);
// Stage 2: fold 128-bit groups (four rows per register afterwards).
__m512 upper17 = _mm512_permutex2var_ps(sum853, pm4Hi2, sum855);
__m512 upper20 = _mm512_permutex2var_ps(sum854, pm4Hi2, sum856);
sum853 = _mm512_permutex2var_ps(sum853, pm4Lo2, sum855);
sum854 = _mm512_permutex2var_ps(sum854, pm4Lo2, sum856);
sum853 = _mm512_add_ps(sum853, upper17);
sum854 = _mm512_add_ps(sum854, upper20);
// Stage 3: fold float pairs; all eight rows now share one register, 2 floats each.
__m512 upper16 = _mm512_shuffle_ps(sum853, sum854, 238);
sum853 = _mm512_shuffle_ps(sum853, sum854, 68);
sum853 = _mm512_add_ps(sum853, upper16);
// Stage 4: add odd to even lanes. The single-source permute uses only the low
// 4 bits of each index, so lanes 8..15 repeat lanes 0..7 and are not stored.
__m512 upper23 = _mm512_permutexvar_ps(pmOdd1, sum853);
sum853 = _mm512_permutexvar_ps(pmEven1, sum853);
sum853 = _mm512_add_ps(sum853, upper23);
sum853 = _mm512_add_ps(sum853, bias13);
_mm512_mask_storeu_ps(datPtr51+(ptrdiff_t)0+(ptrdiff_t)32*i92, 255, sum853);
}
}

/*
 * Hands the fully-connected worker (ResNet50FcApply1Callee1) to the thread
 * team as a one-dimensional task with hull1[0] = 63. The worker computes 16
 * outputs for indices below 62 and 8 outputs otherwise. `tensors153` is
 * forwarded unchanged as the worker's tensor-pointer array.
 */
static void ResNet50FcApply1(ResNet50ThreaderTeam1* team89, char** tensors153) {
    ResNet50ThreaderTask1 job;
    job.nd1 = 1;
    job.hull1[0] = 63;
    job.any1 = tensors153;
    job.callee1 = ResNet50FcApply1Callee1;
    ResNet50ThreaderDo1(team89, &job);
}

static void ResNet50OneArrangeWts1Callee1(ResNet50ThreaderTask1* task14, int64_t* pt12) {
char** tensors12 = task14->any1;
ptrdiff_t b44 = pt12[0];
char*restrict wtPtr2 = tensors12[0]+(ptrdiff_t)3340*0+(ptrdiff_t)81920*0;
char*restrict biasPtr2 = tensors12[1]+(ptrdiff_t)1280*0;
char*restrict bnPtr2 = tensors12[2]+(ptrdiff_t)8*320*0;
char*restrict wtPtr3 = tensors12[3]+(ptrdiff_t)3340*0+(ptrdiff_t)81920*0;
char*restrict biasPtr3 = tensors12[4]+(ptrdiff_t)1280*0;
char*restrict bnPtr3 = tensors12[5]+(ptrdiff_t)8*320*0;
char*restrict arranged1 = tensors12[6]+(ptrdiff_t)1070080*0+(ptrdiff_t)83200*0;
ptrdiff_t ii1 = 1;
for (ptrdiff_t i12 = 0; i12 < ii1; ++i12) {
ptrdiff_t j7 = 10*b44;
ptrdiff_t jj19 = j7+10;
for (; j7 < jj19; ++j7) {
if (j7 < 16) {
ptrdiff_t k46 = 0+16*(j7-0);
ptrdiff_t l9 = (size_t)(0+k46)/6;
ptrdiff_t cut1 = (size_t)(0+k46)%6;
switch (cut1) {
case 0:;
case 2: {
__m512 sum2 = _mm512_maskz_loadu_ps(65535, biasPtr2+1280*i12+4*k46);
__m512i pmMul2 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd2 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo1 = _mm512_loadu_ps(bnPtr2+(ptrdiff_t)8*(k46+320*i12));
__m512 masHi1 = _mm512_maskz_loadu_ps(65535, bnPtr2+(ptrdiff_t)8*(k46+320*i12)+(ptrdiff_t)64);
__m512 postMul4 = _mm512_permutex2var_ps(masLo1, pmMul2, masHi1);
__m512 postAdd2 = _mm512_permutex2var_ps(masLo1, pmAdd2, masHi1);
sum2 = _mm512_fmadd_ps(sum2, postMul4, postAdd2);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)0, 63>>cut1, sum2);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)1536, 4032>>cut1, sum2);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)3072, 65535-(4095>>cut1), sum2);
ptrdiff_t c5 = 0;
for (; c5 != 4; ++c5) {
__m512 wt15 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)0);
__m512 wt16 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)256);
__m512 wt17 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)512);
__m512 wt18 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)768);
__m512 wt19 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)1024);
__m512 wt20 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)1280);
__m512 wt21 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)1536);
__m512 wt22 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)1792);
__m512 wt23 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)2048);
__m512 wt24 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)2304);
__m512 wt25 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)2560);
__m512 wt26 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)2816);
__m512 wt27 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)3072);
__m512 wt28 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)3328);
__m512 wt29 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)3584);
__m512 wt30 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)3840);
__m512 tmp1 = _mm512_unpacklo_ps(wt15, wt16);
__m512 tmp2 = _mm512_unpackhi_ps(wt15, wt16);
__m512 tmp3 = _mm512_unpacklo_ps(wt17, wt18);
__m512 tmp4 = _mm512_unpackhi_ps(wt17, wt18);
__m512 tmp5 = _mm512_unpacklo_ps(wt19, wt20);
__m512 tmp6 = _mm512_unpackhi_ps(wt19, wt20);
__m512 tmp7 = _mm512_unpacklo_ps(wt21, wt22);
__m512 tmp8 = _mm512_unpackhi_ps(wt21, wt22);
__m512 tmp9 = _mm512_unpacklo_ps(wt23, wt24);
__m512 tmp10 = _mm512_unpackhi_ps(wt23, wt24);
__m512 tmp11 = _mm512_unpacklo_ps(wt25, wt26);
__m512 tmp12 = _mm512_unpackhi_ps(wt25, wt26);
__m512 tmp13 = _mm512_unpacklo_ps(wt27, wt28);
__m512 tmp14 = _mm512_unpackhi_ps(wt27, wt28);
__m512 tmp15 = _mm512_unpacklo_ps(wt29, wt30);
__m512 tmp16 = _mm512_unpackhi_ps(wt29, wt30);
__m512 tmp17 = _mm512_shuffle_ps(tmp1, tmp3, 68);
__m512 tmp18 = _mm512_shuffle_ps(tmp1, tmp3, 238);
__m512 tmp19 = _mm512_shuffle_ps(tmp2, tmp4, 68);
__m512 tmp20 = _mm512_shuffle_ps(tmp2, tmp4, 238);
__m512 tmp21 = _mm512_shuffle_ps(tmp5, tmp7, 68);
__m512 tmp22 = _mm512_shuffle_ps(tmp5, tmp7, 238);
__m512 tmp23 = _mm512_shuffle_ps(tmp6, tmp8, 68);
__m512 tmp24 = _mm512_shuffle_ps(tmp6, tmp8, 238);
__m512 tmp25 = _mm512_shuffle_ps(tmp9, tmp11, 68);
__m512 tmp26 = _mm512_shuffle_ps(tmp9, tmp11, 238);
__m512 tmp27 = _mm512_shuffle_ps(tmp10, tmp12, 68);
__m512 tmp28 = _mm512_shuffle_ps(tmp10, tmp12, 238);
__m512 tmp29 = _mm512_shuffle_ps(tmp13, tmp15, 68);
__m512 tmp30 = _mm512_shuffle_ps(tmp13, tmp15, 238);
__m512 tmp31 = _mm512_shuffle_ps(tmp14, tmp16, 68);
__m512 tmp32 = _mm512_shuffle_ps(tmp14, tmp16, 238);
__m512 tmp33 = _mm512_shuffle_f32x4(tmp17, tmp21, 136);
__m512 tmp34 = _mm512_shuffle_f32x4(tmp17, tmp21, 221);
__m512 tmp35 = _mm512_shuffle_f32x4(tmp18, tmp22, 136);
__m512 tmp36 = _mm512_shuffle_f32x4(tmp18, tmp22, 221);
__m512 tmp37 = _mm512_shuffle_f32x4(tmp19, tmp23, 136);
__m512 tmp38 = _mm512_shuffle_f32x4(tmp19, tmp23, 221);
__m512 tmp39 = _mm512_shuffle_f32x4(tmp20, tmp24, 136);
__m512 tmp40 = _mm512_shuffle_f32x4(tmp20, tmp24, 221);
__m512 tmp41 = _mm512_shuffle_f32x4(tmp25, tmp29, 136);
__m512 tmp42 = _mm512_shuffle_f32x4(tmp25, tmp29, 221);
__m512 tmp43 = _mm512_shuffle_f32x4(tmp26, tmp30, 136);
__m512 tmp44 = _mm512_shuffle_f32x4(tmp26, tmp30, 221);
__m512 tmp45 = _mm512_shuffle_f32x4(tmp27, tmp31, 136);
__m512 tmp46 = _mm512_shuffle_f32x4(tmp27, tmp31, 221);
__m512 tmp47 = _mm512_shuffle_f32x4(tmp28, tmp32, 136);
__m512 tmp48 = _mm512_shuffle_f32x4(tmp28, tmp32, 221);
wt15 = _mm512_shuffle_f32x4(tmp33, tmp41, 136);
wt23 = _mm512_shuffle_f32x4(tmp33, tmp41, 221);
wt16 = _mm512_shuffle_f32x4(tmp35, tmp43, 136);
wt24 = _mm512_shuffle_f32x4(tmp35, tmp43, 221);
wt17 = _mm512_shuffle_f32x4(tmp37, tmp45, 136);
wt25 = _mm512_shuffle_f32x4(tmp37, tmp45, 221);
wt18 = _mm512_shuffle_f32x4(tmp39, tmp47, 136);
wt26 = _mm512_shuffle_f32x4(tmp39, tmp47, 221);
wt19 = _mm512_shuffle_f32x4(tmp34, tmp42, 136);
wt27 = _mm512_shuffle_f32x4(tmp34, tmp42, 221);
wt20 = _mm512_shuffle_f32x4(tmp36, tmp44, 136);
wt28 = _mm512_shuffle_f32x4(tmp36, tmp44, 221);
wt21 = _mm512_shuffle_f32x4(tmp38, tmp46, 136);
wt29 = _mm512_shuffle_f32x4(tmp38, tmp46, 221);
wt22 = _mm512_shuffle_f32x4(tmp40, tmp48, 136);
wt30 = _mm512_shuffle_f32x4(tmp40, tmp48, 221);
wt15 = _mm512_mul_ps(wt15, postMul4);
wt16 = _mm512_mul_ps(wt16, postMul4);
wt17 = _mm512_mul_ps(wt17, postMul4);
wt18 = _mm512_mul_ps(wt18, postMul4);
wt19 = _mm512_mul_ps(wt19, postMul4);
wt20 = _mm512_mul_ps(wt20, postMul4);
wt21 = _mm512_mul_ps(wt21, postMul4);
wt22 = _mm512_mul_ps(wt22, postMul4);
wt23 = _mm512_mul_ps(wt23, postMul4);
wt24 = _mm512_mul_ps(wt24, postMul4);
wt25 = _mm512_mul_ps(wt25, postMul4);
wt26 = _mm512_mul_ps(wt26, postMul4);
wt27 = _mm512_mul_ps(wt27, postMul4);
wt28 = _mm512_mul_ps(wt28, postMul4);
wt29 = _mm512_mul_ps(wt29, postMul4);
wt30 = _mm512_mul_ps(wt30, postMul4);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(1+16*c5)+(ptrdiff_t)0, 63>>cut1, wt15);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(2+16*c5)+(ptrdiff_t)0, 63>>cut1, wt16);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(3+16*c5)+(ptrdiff_t)0, 63>>cut1, wt17);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(4+16*c5)+(ptrdiff_t)0, 63>>cut1, wt18);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(5+16*c5)+(ptrdiff_t)0, 63>>cut1, wt19);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(6+16*c5)+(ptrdiff_t)0, 63>>cut1, wt20);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(7+16*c5)+(ptrdiff_t)0, 63>>cut1, wt21);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(8+16*c5)+(ptrdiff_t)0, 63>>cut1, wt22);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(9+16*c5)+(ptrdiff_t)0, 63>>cut1, wt23);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(10+16*c5)+(ptrdiff_t)0, 63>>cut1, wt24);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(11+16*c5)+(ptrdiff_t)0, 63>>cut1, wt25);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(12+16*c5)+(ptrdiff_t)0, 63>>cut1, wt26);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(13+16*c5)+(ptrdiff_t)0, 63>>cut1, wt27);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(14+16*c5)+(ptrdiff_t)0, 63>>cut1, wt28);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(15+16*c5)+(ptrdiff_t)0, 63>>cut1, wt29);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(16+16*c5)+(ptrdiff_t)0, 63>>cut1, wt30);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(1+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt15);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(2+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt16);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(3+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt17);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(4+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt18);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(5+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt19);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(6+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt20);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(7+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt21);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(8+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt22);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(9+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt23);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(10+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt24);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(11+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt25);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(12+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt26);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(13+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt27);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(14+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt28);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(15+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt29);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(16+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt30);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(1+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt15);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(2+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt16);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(3+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt17);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(4+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt18);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(5+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt19);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(6+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt20);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(7+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt21);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(8+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt22);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(9+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt23);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(10+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt24);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(11+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt25);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(12+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt26);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(13+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt27);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(14+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt28);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(15+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt29);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(16+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt30);
}
break;
}
default: {
cut1 = 4;
__m512 sum3 = _mm512_maskz_loadu_ps(65535, biasPtr2+1280*i12+4*k46);
__m512i pmMul3 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd3 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo2 = _mm512_loadu_ps(bnPtr2+(ptrdiff_t)8*(k46+320*i12));
__m512 masHi2 = _mm512_maskz_loadu_ps(65535, bnPtr2+(ptrdiff_t)8*(k46+320*i12)+(ptrdiff_t)64);
__m512 postMul5 = _mm512_permutex2var_ps(masLo2, pmMul3, masHi2);
__m512 postAdd3 = _mm512_permutex2var_ps(masLo2, pmAdd3, masHi2);
sum3 = _mm512_fmadd_ps(sum3, postMul5, postAdd3);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)0, 63>>cut1, sum3);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)1536, 4032>>cut1, sum3);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)3072, 258048>>cut1, sum3);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)4608, 65535-(262143>>cut1), sum3);
ptrdiff_t c6 = 0;
for (; c6 != 4; ++c6) {
__m512 wt31 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)0);
__m512 wt32 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)256);
__m512 wt33 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)512);
__m512 wt34 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)768);
__m512 wt35 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)1024);
__m512 wt36 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)1280);
__m512 wt37 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)1536);
__m512 wt38 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)1792);
__m512 wt39 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)2048);
__m512 wt40 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)2304);
__m512 wt41 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)2560);
__m512 wt42 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)2816);
__m512 wt43 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)3072);
__m512 wt44 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)3328);
__m512 wt45 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)3584);
__m512 wt46 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)3840);
__m512 tmp49 = _mm512_unpacklo_ps(wt31, wt32);
__m512 tmp50 = _mm512_unpackhi_ps(wt31, wt32);
__m512 tmp51 = _mm512_unpacklo_ps(wt33, wt34);
__m512 tmp52 = _mm512_unpackhi_ps(wt33, wt34);
__m512 tmp53 = _mm512_unpacklo_ps(wt35, wt36);
__m512 tmp54 = _mm512_unpackhi_ps(wt35, wt36);
__m512 tmp55 = _mm512_unpacklo_ps(wt37, wt38);
__m512 tmp56 = _mm512_unpackhi_ps(wt37, wt38);
__m512 tmp57 = _mm512_unpacklo_ps(wt39, wt40);
__m512 tmp58 = _mm512_unpackhi_ps(wt39, wt40);
__m512 tmp59 = _mm512_unpacklo_ps(wt41, wt42);
__m512 tmp60 = _mm512_unpackhi_ps(wt41, wt42);
__m512 tmp61 = _mm512_unpacklo_ps(wt43, wt44);
__m512 tmp62 = _mm512_unpackhi_ps(wt43, wt44);
__m512 tmp63 = _mm512_unpacklo_ps(wt45, wt46);
__m512 tmp64 = _mm512_unpackhi_ps(wt45, wt46);
__m512 tmp65 = _mm512_shuffle_ps(tmp49, tmp51, 68);
__m512 tmp66 = _mm512_shuffle_ps(tmp49, tmp51, 238);
__m512 tmp67 = _mm512_shuffle_ps(tmp50, tmp52, 68);
__m512 tmp68 = _mm512_shuffle_ps(tmp50, tmp52, 238);
__m512 tmp69 = _mm512_shuffle_ps(tmp53, tmp55, 68);
__m512 tmp70 = _mm512_shuffle_ps(tmp53, tmp55, 238);
__m512 tmp71 = _mm512_shuffle_ps(tmp54, tmp56, 68);
__m512 tmp72 = _mm512_shuffle_ps(tmp54, tmp56, 238);
__m512 tmp73 = _mm512_shuffle_ps(tmp57, tmp59, 68);
__m512 tmp74 = _mm512_shuffle_ps(tmp57, tmp59, 238);
__m512 tmp75 = _mm512_shuffle_ps(tmp58, tmp60, 68);
__m512 tmp76 = _mm512_shuffle_ps(tmp58, tmp60, 238);
__m512 tmp77 = _mm512_shuffle_ps(tmp61, tmp63, 68);
__m512 tmp78 = _mm512_shuffle_ps(tmp61, tmp63, 238);
__m512 tmp79 = _mm512_shuffle_ps(tmp62, tmp64, 68);
__m512 tmp80 = _mm512_shuffle_ps(tmp62, tmp64, 238);
__m512 tmp81 = _mm512_shuffle_f32x4(tmp65, tmp69, 136);
__m512 tmp82 = _mm512_shuffle_f32x4(tmp65, tmp69, 221);
__m512 tmp83 = _mm512_shuffle_f32x4(tmp66, tmp70, 136);
__m512 tmp84 = _mm512_shuffle_f32x4(tmp66, tmp70, 221);
__m512 tmp85 = _mm512_shuffle_f32x4(tmp67, tmp71, 136);
__m512 tmp86 = _mm512_shuffle_f32x4(tmp67, tmp71, 221);
__m512 tmp87 = _mm512_shuffle_f32x4(tmp68, tmp72, 136);
__m512 tmp88 = _mm512_shuffle_f32x4(tmp68, tmp72, 221);
__m512 tmp89 = _mm512_shuffle_f32x4(tmp73, tmp77, 136);
__m512 tmp90 = _mm512_shuffle_f32x4(tmp73, tmp77, 221);
__m512 tmp91 = _mm512_shuffle_f32x4(tmp74, tmp78, 136);
__m512 tmp92 = _mm512_shuffle_f32x4(tmp74, tmp78, 221);
__m512 tmp93 = _mm512_shuffle_f32x4(tmp75, tmp79, 136);
__m512 tmp94 = _mm512_shuffle_f32x4(tmp75, tmp79, 221);
__m512 tmp95 = _mm512_shuffle_f32x4(tmp76, tmp80, 136);
__m512 tmp96 = _mm512_shuffle_f32x4(tmp76, tmp80, 221);
wt31 = _mm512_shuffle_f32x4(tmp81, tmp89, 136);
wt39 = _mm512_shuffle_f32x4(tmp81, tmp89, 221);
wt32 = _mm512_shuffle_f32x4(tmp83, tmp91, 136);
wt40 = _mm512_shuffle_f32x4(tmp83, tmp91, 221);
wt33 = _mm512_shuffle_f32x4(tmp85, tmp93, 136);
wt41 = _mm512_shuffle_f32x4(tmp85, tmp93, 221);
wt34 = _mm512_shuffle_f32x4(tmp87, tmp95, 136);
wt42 = _mm512_shuffle_f32x4(tmp87, tmp95, 221);
wt35 = _mm512_shuffle_f32x4(tmp82, tmp90, 136);
wt43 = _mm512_shuffle_f32x4(tmp82, tmp90, 221);
wt36 = _mm512_shuffle_f32x4(tmp84, tmp92, 136);
wt44 = _mm512_shuffle_f32x4(tmp84, tmp92, 221);
wt37 = _mm512_shuffle_f32x4(tmp86, tmp94, 136);
wt45 = _mm512_shuffle_f32x4(tmp86, tmp94, 221);
wt38 = _mm512_shuffle_f32x4(tmp88, tmp96, 136);
wt46 = _mm512_shuffle_f32x4(tmp88, tmp96, 221);
wt31 = _mm512_mul_ps(wt31, postMul5);
wt32 = _mm512_mul_ps(wt32, postMul5);
wt33 = _mm512_mul_ps(wt33, postMul5);
wt34 = _mm512_mul_ps(wt34, postMul5);
wt35 = _mm512_mul_ps(wt35, postMul5);
wt36 = _mm512_mul_ps(wt36, postMul5);
wt37 = _mm512_mul_ps(wt37, postMul5);
wt38 = _mm512_mul_ps(wt38, postMul5);
wt39 = _mm512_mul_ps(wt39, postMul5);
wt40 = _mm512_mul_ps(wt40, postMul5);
wt41 = _mm512_mul_ps(wt41, postMul5);
wt42 = _mm512_mul_ps(wt42, postMul5);
wt43 = _mm512_mul_ps(wt43, postMul5);
wt44 = _mm512_mul_ps(wt44, postMul5);
wt45 = _mm512_mul_ps(wt45, postMul5);
wt46 = _mm512_mul_ps(wt46, postMul5);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(1+16*c6)+(ptrdiff_t)0, 63>>cut1, wt31);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(2+16*c6)+(ptrdiff_t)0, 63>>cut1, wt32);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(3+16*c6)+(ptrdiff_t)0, 63>>cut1, wt33);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(4+16*c6)+(ptrdiff_t)0, 63>>cut1, wt34);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(5+16*c6)+(ptrdiff_t)0, 63>>cut1, wt35);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(6+16*c6)+(ptrdiff_t)0, 63>>cut1, wt36);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(7+16*c6)+(ptrdiff_t)0, 63>>cut1, wt37);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(8+16*c6)+(ptrdiff_t)0, 63>>cut1, wt38);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(9+16*c6)+(ptrdiff_t)0, 63>>cut1, wt39);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(10+16*c6)+(ptrdiff_t)0, 63>>cut1, wt40);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(11+16*c6)+(ptrdiff_t)0, 63>>cut1, wt41);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(12+16*c6)+(ptrdiff_t)0, 63>>cut1, wt42);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(13+16*c6)+(ptrdiff_t)0, 63>>cut1, wt43);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(14+16*c6)+(ptrdiff_t)0, 63>>cut1, wt44);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(15+16*c6)+(ptrdiff_t)0, 63>>cut1, wt45);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(16+16*c6)+(ptrdiff_t)0, 63>>cut1, wt46);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(1+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt31);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(2+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt32);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(3+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt33);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(4+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt34);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(5+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt35);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(6+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt36);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(7+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt37);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(8+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt38);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(9+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt39);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(10+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt40);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(11+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt41);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(12+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt42);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(13+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt43);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(14+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt44);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(15+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt45);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(16+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt46);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(1+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt31);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(2+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt32);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(3+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt33);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(4+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt34);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(5+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt35);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(6+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt36);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(7+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt37);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(8+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt38);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(9+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt39);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(10+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt40);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(11+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt41);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(12+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt42);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(13+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt43);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(14+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt44);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(15+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt45);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(16+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt46);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(1+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt31);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(2+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt32);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(3+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt33);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(4+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt34);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(5+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt35);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(6+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt36);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(7+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt37);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(8+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt38);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(9+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt39);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(10+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt40);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(11+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt41);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(12+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt42);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(13+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt43);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(14+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt44);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(15+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt45);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(16+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt46);
}
}
}
} else if (j7 < 19) {
ptrdiff_t k48 = 0+16*(j7-16);
ptrdiff_t l11 = (size_t)(256+k48)/6;
ptrdiff_t cut3 = (size_t)(256+k48)%6;
switch (cut3) {
case 0:;
case 2: {
__m512 sum5 = _mm512_maskz_loadu_ps(65535, biasPtr3+1280*i12+4*k48);
__m512i pmMul4 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd4 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo3 = _mm512_loadu_ps(bnPtr3+(ptrdiff_t)8*(k48+320*i12));
__m512 masHi3 = _mm512_maskz_loadu_ps(65535, bnPtr3+(ptrdiff_t)8*(k48+320*i12)+(ptrdiff_t)64);
__m512 postMul7 = _mm512_permutex2var_ps(masLo3, pmMul4, masHi3);
__m512 postAdd5 = _mm512_permutex2var_ps(masLo3, pmAdd4, masHi3);
sum5 = _mm512_fmadd_ps(sum5, postMul7, postAdd5);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*0+(ptrdiff_t)0, 63>>cut3, sum5);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*0+(ptrdiff_t)1536, 4032>>cut3, sum5);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*0+(ptrdiff_t)3072, 65535-(4095>>cut3), sum5);
ptrdiff_t c8 = 0;
for (; c8 != 4; ++c8) {
__m512 wt63 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)0);
__m512 wt64 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)256);
__m512 wt65 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)512);
__m512 wt66 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)768);
__m512 wt67 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)1024);
__m512 wt68 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)1280);
__m512 wt69 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)1536);
__m512 wt70 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)1792);
__m512 wt71 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)2048);
__m512 wt72 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)2304);
__m512 wt73 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)2560);
__m512 wt74 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)2816);
__m512 wt75 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)3072);
__m512 wt76 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)3328);
__m512 wt77 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)3584);
__m512 wt78 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)3840);
__m512 tmp97 = _mm512_unpacklo_ps(wt63, wt64);
__m512 tmp98 = _mm512_unpackhi_ps(wt63, wt64);
__m512 tmp99 = _mm512_unpacklo_ps(wt65, wt66);
__m512 tmp100 = _mm512_unpackhi_ps(wt65, wt66);
__m512 tmp101 = _mm512_unpacklo_ps(wt67, wt68);
__m512 tmp102 = _mm512_unpackhi_ps(wt67, wt68);
__m512 tmp103 = _mm512_unpacklo_ps(wt69, wt70);
__m512 tmp104 = _mm512_unpackhi_ps(wt69, wt70);
__m512 tmp105 = _mm512_unpacklo_ps(wt71, wt72);
__m512 tmp106 = _mm512_unpackhi_ps(wt71, wt72);
__m512 tmp107 = _mm512_unpacklo_ps(wt73, wt74);
__m512 tmp108 = _mm512_unpackhi_ps(wt73, wt74);
__m512 tmp109 = _mm512_unpacklo_ps(wt75, wt76);
__m512 tmp110 = _mm512_unpackhi_ps(wt75, wt76);
__m512 tmp111 = _mm512_unpacklo_ps(wt77, wt78);
__m512 tmp112 = _mm512_unpackhi_ps(wt77, wt78);
__m512 tmp113 = _mm512_shuffle_ps(tmp97, tmp99, 68);
__m512 tmp114 = _mm512_shuffle_ps(tmp97, tmp99, 238);
__m512 tmp115 = _mm512_shuffle_ps(tmp98, tmp100, 68);
__m512 tmp116 = _mm512_shuffle_ps(tmp98, tmp100, 238);
__m512 tmp117 = _mm512_shuffle_ps(tmp101, tmp103, 68);
__m512 tmp118 = _mm512_shuffle_ps(tmp101, tmp103, 238);
__m512 tmp119 = _mm512_shuffle_ps(tmp102, tmp104, 68);
__m512 tmp120 = _mm512_shuffle_ps(tmp102, tmp104, 238);
__m512 tmp121 = _mm512_shuffle_ps(tmp105, tmp107, 68);
__m512 tmp122 = _mm512_shuffle_ps(tmp105, tmp107, 238);
__m512 tmp123 = _mm512_shuffle_ps(tmp106, tmp108, 68);
__m512 tmp124 = _mm512_shuffle_ps(tmp106, tmp108, 238);
__m512 tmp125 = _mm512_shuffle_ps(tmp109, tmp111, 68);
__m512 tmp126 = _mm512_shuffle_ps(tmp109, tmp111, 238);
__m512 tmp127 = _mm512_shuffle_ps(tmp110, tmp112, 68);
__m512 tmp128 = _mm512_shuffle_ps(tmp110, tmp112, 238);
__m512 tmp129 = _mm512_shuffle_f32x4(tmp113, tmp117, 136);
__m512 tmp130 = _mm512_shuffle_f32x4(tmp113, tmp117, 221);
__m512 tmp131 = _mm512_shuffle_f32x4(tmp114, tmp118, 136);
__m512 tmp132 = _mm512_shuffle_f32x4(tmp114, tmp118, 221);
__m512 tmp133 = _mm512_shuffle_f32x4(tmp115, tmp119, 136);
__m512 tmp134 = _mm512_shuffle_f32x4(tmp115, tmp119, 221);
__m512 tmp135 = _mm512_shuffle_f32x4(tmp116, tmp120, 136);
__m512 tmp136 = _mm512_shuffle_f32x4(tmp116, tmp120, 221);
__m512 tmp137 = _mm512_shuffle_f32x4(tmp121, tmp125, 136);
__m512 tmp138 = _mm512_shuffle_f32x4(tmp121, tmp125, 221);
__m512 tmp139 = _mm512_shuffle_f32x4(tmp122, tmp126, 136);
__m512 tmp140 = _mm512_shuffle_f32x4(tmp122, tmp126, 221);
__m512 tmp141 = _mm512_shuffle_f32x4(tmp123, tmp127, 136);
__m512 tmp142 = _mm512_shuffle_f32x4(tmp123, tmp127, 221);
__m512 tmp143 = _mm512_shuffle_f32x4(tmp124, tmp128, 136);
__m512 tmp144 = _mm512_shuffle_f32x4(tmp124, tmp128, 221);
wt63 = _mm512_shuffle_f32x4(tmp129, tmp137, 136);
wt71 = _mm512_shuffle_f32x4(tmp129, tmp137, 221);
wt64 = _mm512_shuffle_f32x4(tmp131, tmp139, 136);
wt72 = _mm512_shuffle_f32x4(tmp131, tmp139, 221);
wt65 = _mm512_shuffle_f32x4(tmp133, tmp141, 136);
wt73 = _mm512_shuffle_f32x4(tmp133, tmp141, 221);
wt66 = _mm512_shuffle_f32x4(tmp135, tmp143, 136);
wt74 = _mm512_shuffle_f32x4(tmp135, tmp143, 221);
wt67 = _mm512_shuffle_f32x4(tmp130, tmp138, 136);
wt75 = _mm512_shuffle_f32x4(tmp130, tmp138, 221);
wt68 = _mm512_shuffle_f32x4(tmp132, tmp140, 136);
wt76 = _mm512_shuffle_f32x4(tmp132, tmp140, 221);
wt69 = _mm512_shuffle_f32x4(tmp134, tmp142, 136);
wt77 = _mm512_shuffle_f32x4(tmp134, tmp142, 221);
wt70 = _mm512_shuffle_f32x4(tmp136, tmp144, 136);
wt78 = _mm512_shuffle_f32x4(tmp136, tmp144, 221);
wt63 = _mm512_mul_ps(wt63, postMul7);
wt64 = _mm512_mul_ps(wt64, postMul7);
wt65 = _mm512_mul_ps(wt65, postMul7);
wt66 = _mm512_mul_ps(wt66, postMul7);
wt67 = _mm512_mul_ps(wt67, postMul7);
wt68 = _mm512_mul_ps(wt68, postMul7);
wt69 = _mm512_mul_ps(wt69, postMul7);
wt70 = _mm512_mul_ps(wt70, postMul7);
wt71 = _mm512_mul_ps(wt71, postMul7);
wt72 = _mm512_mul_ps(wt72, postMul7);
wt73 = _mm512_mul_ps(wt73, postMul7);
wt74 = _mm512_mul_ps(wt74, postMul7);
wt75 = _mm512_mul_ps(wt75, postMul7);
wt76 = _mm512_mul_ps(wt76, postMul7);
wt77 = _mm512_mul_ps(wt77, postMul7);
wt78 = _mm512_mul_ps(wt78, postMul7);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(1+16*c8)+(ptrdiff_t)0, 63>>cut3, wt63);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(2+16*c8)+(ptrdiff_t)0, 63>>cut3, wt64);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(3+16*c8)+(ptrdiff_t)0, 63>>cut3, wt65);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(4+16*c8)+(ptrdiff_t)0, 63>>cut3, wt66);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(5+16*c8)+(ptrdiff_t)0, 63>>cut3, wt67);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(6+16*c8)+(ptrdiff_t)0, 63>>cut3, wt68);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(7+16*c8)+(ptrdiff_t)0, 63>>cut3, wt69);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(8+16*c8)+(ptrdiff_t)0, 63>>cut3, wt70);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(9+16*c8)+(ptrdiff_t)0, 63>>cut3, wt71);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(10+16*c8)+(ptrdiff_t)0, 63>>cut3, wt72);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(11+16*c8)+(ptrdiff_t)0, 63>>cut3, wt73);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(12+16*c8)+(ptrdiff_t)0, 63>>cut3, wt74);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(13+16*c8)+(ptrdiff_t)0, 63>>cut3, wt75);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(14+16*c8)+(ptrdiff_t)0, 63>>cut3, wt76);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(15+16*c8)+(ptrdiff_t)0, 63>>cut3, wt77);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(16+16*c8)+(ptrdiff_t)0, 63>>cut3, wt78);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(1+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt63);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(2+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt64);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(3+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt65);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(4+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt66);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(5+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt67);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(6+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt68);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(7+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt69);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(8+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt70);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(9+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt71);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(10+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt72);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(11+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt73);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(12+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt74);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(13+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt75);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(14+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt76);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(15+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt77);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(16+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt78);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(1+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt63);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(2+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt64);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(3+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt65);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(4+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt66);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(5+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt67);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(6+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt68);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(7+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt69);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(8+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt70);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(9+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt71);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(10+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt72);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(11+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt73);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(12+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt74);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(13+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt75);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(14+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt76);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(15+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt77);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(16+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt78);
}
break;
}
default: {
cut3 = 4;
__m512 sum6 = _mm512_maskz_loadu_ps(65535, biasPtr3+1280*i12+4*k48);
__m512i pmMul5 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd5 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo4 = _mm512_loadu_ps(bnPtr3+(ptrdiff_t)8*(k48+320*i12));
__m512 masHi4 = _mm512_maskz_loadu_ps(65535, bnPtr3+(ptrdiff_t)8*(k48+320*i12)+(ptrdiff_t)64);
__m512 postMul8 = _mm512_permutex2var_ps(masLo4, pmMul5, masHi4);
__m512 postAdd6 = _mm512_permutex2var_ps(masLo4, pmAdd5, masHi4);
sum6 = _mm512_fmadd_ps(sum6, postMul8, postAdd6);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*0+(ptrdiff_t)0, 63>>cut3, sum6);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*0+(ptrdiff_t)1536, 4032>>cut3, sum6);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*0+(ptrdiff_t)3072, 258048>>cut3, sum6);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*0+(ptrdiff_t)4608, 65535-(262143>>cut3), sum6);
ptrdiff_t c9 = 0;
for (; c9 != 4; ++c9) {
__m512 wt79 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)0);
__m512 wt80 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)256);
__m512 wt81 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)512);
__m512 wt82 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)768);
__m512 wt83 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)1024);
__m512 wt84 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)1280);
__m512 wt85 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)1536);
__m512 wt86 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)1792);
__m512 wt87 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)2048);
__m512 wt88 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)2304);
__m512 wt89 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)2560);
__m512 wt90 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)2816);
__m512 wt91 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)3072);
__m512 wt92 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)3328);
__m512 wt93 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)3584);
__m512 wt94 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)3840);
__m512 tmp145 = _mm512_unpacklo_ps(wt79, wt80);
__m512 tmp146 = _mm512_unpackhi_ps(wt79, wt80);
__m512 tmp147 = _mm512_unpacklo_ps(wt81, wt82);
__m512 tmp148 = _mm512_unpackhi_ps(wt81, wt82);
__m512 tmp149 = _mm512_unpacklo_ps(wt83, wt84);
__m512 tmp150 = _mm512_unpackhi_ps(wt83, wt84);
__m512 tmp151 = _mm512_unpacklo_ps(wt85, wt86);
__m512 tmp152 = _mm512_unpackhi_ps(wt85, wt86);
__m512 tmp153 = _mm512_unpacklo_ps(wt87, wt88);
__m512 tmp154 = _mm512_unpackhi_ps(wt87, wt88);
__m512 tmp155 = _mm512_unpacklo_ps(wt89, wt90);
__m512 tmp156 = _mm512_unpackhi_ps(wt89, wt90);
__m512 tmp157 = _mm512_unpacklo_ps(wt91, wt92);
__m512 tmp158 = _mm512_unpackhi_ps(wt91, wt92);
__m512 tmp159 = _mm512_unpacklo_ps(wt93, wt94);
__m512 tmp160 = _mm512_unpackhi_ps(wt93, wt94);
__m512 tmp161 = _mm512_shuffle_ps(tmp145, tmp147, 68);
__m512 tmp162 = _mm512_shuffle_ps(tmp145, tmp147, 238);
__m512 tmp163 = _mm512_shuffle_ps(tmp146, tmp148, 68);
__m512 tmp164 = _mm512_shuffle_ps(tmp146, tmp148, 238);
__m512 tmp165 = _mm512_shuffle_ps(tmp149, tmp151, 68);
__m512 tmp166 = _mm512_shuffle_ps(tmp149, tmp151, 238);
__m512 tmp167 = _mm512_shuffle_ps(tmp150, tmp152, 68);
__m512 tmp168 = _mm512_shuffle_ps(tmp150, tmp152, 238);
__m512 tmp169 = _mm512_shuffle_ps(tmp153, tmp155, 68);
__m512 tmp170 = _mm512_shuffle_ps(tmp153, tmp155, 238);
__m512 tmp171 = _mm512_shuffle_ps(tmp154, tmp156, 68);
__m512 tmp172 = _mm512_shuffle_ps(tmp154, tmp156, 238);
__m512 tmp173 = _mm512_shuffle_ps(tmp157, tmp159, 68);
__m512 tmp174 = _mm512_shuffle_ps(tmp157, tmp159, 238);
__m512 tmp175 = _mm512_shuffle_ps(tmp158, tmp160, 68);
__m512 tmp176 = _mm512_shuffle_ps(tmp158, tmp160, 238);
__m512 tmp177 = _mm512_shuffle_f32x4(tmp161, tmp165, 136);
__m512 tmp178 = _mm512_shuffle_f32x4(tmp161, tmp165, 221);
__m512 tmp179 = _mm512_shuffle_f32x4(tmp162, tmp166, 136);
__m512 tmp180 = _mm512_shuffle_f32x4(tmp162, tmp166, 221);
__m512 tmp181 = _mm512_shuffle_f32x4(tmp163, tmp167, 136);
__m512 tmp182 = _mm512_shuffle_f32x4(tmp163, tmp167, 221);
__m512 tmp183 = _mm512_shuffle_f32x4(tmp164, tmp168, 136);
__m512 tmp184 = _mm512_shuffle_f32x4(tmp164, tmp168, 221);
__m512 tmp185 = _mm512_shuffle_f32x4(tmp169, tmp173, 136);
__m512 tmp186 = _mm512_shuffle_f32x4(tmp169, tmp173, 221);
__m512 tmp187 = _mm512_shuffle_f32x4(tmp170, tmp174, 136);
__m512 tmp188 = _mm512_shuffle_f32x4(tmp170, tmp174, 221);
__m512 tmp189 = _mm512_shuffle_f32x4(tmp171, tmp175, 136);
__m512 tmp190 = _mm512_shuffle_f32x4(tmp171, tmp175, 221);
__m512 tmp191 = _mm512_shuffle_f32x4(tmp172, tmp176, 136);
__m512 tmp192 = _mm512_shuffle_f32x4(tmp172, tmp176, 221);
wt79 = _mm512_shuffle_f32x4(tmp177, tmp185, 136);
wt87 = _mm512_shuffle_f32x4(tmp177, tmp185, 221);
wt80 = _mm512_shuffle_f32x4(tmp179, tmp187, 136);
wt88 = _mm512_shuffle_f32x4(tmp179, tmp187, 221);
wt81 = _mm512_shuffle_f32x4(tmp181, tmp189, 136);
wt89 = _mm512_shuffle_f32x4(tmp181, tmp189, 221);
wt82 = _mm512_shuffle_f32x4(tmp183, tmp191, 136);
wt90 = _mm512_shuffle_f32x4(tmp183, tmp191, 221);
wt83 = _mm512_shuffle_f32x4(tmp178, tmp186, 136);
wt91 = _mm512_shuffle_f32x4(tmp178, tmp186, 221);
wt84 = _mm512_shuffle_f32x4(tmp180, tmp188, 136);
wt92 = _mm512_shuffle_f32x4(tmp180, tmp188, 221);
wt85 = _mm512_shuffle_f32x4(tmp182, tmp190, 136);
wt93 = _mm512_shuffle_f32x4(tmp182, tmp190, 221);
wt86 = _mm512_shuffle_f32x4(tmp184, tmp192, 136);
wt94 = _mm512_shuffle_f32x4(tmp184, tmp192, 221);
wt79 = _mm512_mul_ps(wt79, postMul8);
wt80 = _mm512_mul_ps(wt80, postMul8);
wt81 = _mm512_mul_ps(wt81, postMul8);
wt82 = _mm512_mul_ps(wt82, postMul8);
wt83 = _mm512_mul_ps(wt83, postMul8);
wt84 = _mm512_mul_ps(wt84, postMul8);
wt85 = _mm512_mul_ps(wt85, postMul8);
wt86 = _mm512_mul_ps(wt86, postMul8);
wt87 = _mm512_mul_ps(wt87, postMul8);
wt88 = _mm512_mul_ps(wt88, postMul8);
wt89 = _mm512_mul_ps(wt89, postMul8);
wt90 = _mm512_mul_ps(wt90, postMul8);
wt91 = _mm512_mul_ps(wt91, postMul8);
wt92 = _mm512_mul_ps(wt92, postMul8);
wt93 = _mm512_mul_ps(wt93, postMul8);
wt94 = _mm512_mul_ps(wt94, postMul8);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(1+16*c9)+(ptrdiff_t)0, 63>>cut3, wt79);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(2+16*c9)+(ptrdiff_t)0, 63>>cut3, wt80);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(3+16*c9)+(ptrdiff_t)0, 63>>cut3, wt81);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(4+16*c9)+(ptrdiff_t)0, 63>>cut3, wt82);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(5+16*c9)+(ptrdiff_t)0, 63>>cut3, wt83);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(6+16*c9)+(ptrdiff_t)0, 63>>cut3, wt84);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(7+16*c9)+(ptrdiff_t)0, 63>>cut3, wt85);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(8+16*c9)+(ptrdiff_t)0, 63>>cut3, wt86);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(9+16*c9)+(ptrdiff_t)0, 63>>cut3, wt87);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(10+16*c9)+(ptrdiff_t)0, 63>>cut3, wt88);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(11+16*c9)+(ptrdiff_t)0, 63>>cut3, wt89);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(12+16*c9)+(ptrdiff_t)0, 63>>cut3, wt90);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(13+16*c9)+(ptrdiff_t)0, 63>>cut3, wt91);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(14+16*c9)+(ptrdiff_t)0, 63>>cut3, wt92);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(15+16*c9)+(ptrdiff_t)0, 63>>cut3, wt93);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(16+16*c9)+(ptrdiff_t)0, 63>>cut3, wt94);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(1+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt79);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(2+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt80);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(3+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt81);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(4+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt82);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(5+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt83);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(6+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt84);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(7+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt85);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(8+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt86);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(9+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt87);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(10+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt88);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(11+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt89);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(12+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt90);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(13+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt91);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(14+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt92);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(15+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt93);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(16+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt94);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(1+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt79);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(2+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt80);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(3+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt81);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(4+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt82);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(5+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt83);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(6+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt84);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(7+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt85);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(8+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt86);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(9+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt87);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(10+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt88);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(11+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt89);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(12+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt90);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(13+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt91);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(14+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt92);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(15+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt93);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(16+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt94);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(1+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt79);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(2+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt80);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(3+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt81);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(4+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt82);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(5+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt83);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(6+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt84);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(7+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt85);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(8+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt86);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(9+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt87);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(10+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt88);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(11+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt89);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(12+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt90);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(13+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt91);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(14+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt92);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(15+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt93);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(16+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt94);
}
}
}
} else {
ptrdiff_t k47 = 48;
ptrdiff_t l10 = (size_t)(256+k47)/6;
ptrdiff_t cut2 = (size_t)(256+k47)%6;
__m512 sum4 = _mm512_maskz_loadu_ps(65535, biasPtr3+1280*i12+4*k47);
__m512i pmMul6 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd6 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo5 = _mm512_loadu_ps(bnPtr3+(ptrdiff_t)8*(k47+320*i12));
__m512 masHi5 = _mm512_maskz_loadu_ps(65535, bnPtr3+(ptrdiff_t)8*(k47+320*i12)+(ptrdiff_t)64);
__m512 postMul6 = _mm512_permutex2var_ps(masLo5, pmMul6, masHi5);
__m512 postAdd4 = _mm512_permutex2var_ps(masLo5, pmAdd6, masHi5);
sum4 = _mm512_fmadd_ps(sum4, postMul6, postAdd4);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*0+(ptrdiff_t)0, 63>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*0+(ptrdiff_t)1536, 4032>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*0+(ptrdiff_t)3072, 258048>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*0+(ptrdiff_t)4608, 65535-(262143>>cut2), sum4);
ptrdiff_t c7 = 0;
for (; c7 != 4; ++c7) {
__m512 wt47 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)0);
__m512 wt48 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)256);
__m512 wt49 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)512);
__m512 wt50 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)768);
__m512 wt51 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)1024);
__m512 wt52 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)1280);
__m512 wt53 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)1536);
__m512 wt54 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)1792);
__m512 wt55 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)2048);
__m512 wt56 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)2304);
__m512 wt57 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)2560);
__m512 wt58 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)2816);
__m512 wt59 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)3072);
__m512 wt60 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)3328);
__m512 wt61 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)3584);
__m512 wt62 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)3840);
__m512 tmp193 = _mm512_unpacklo_ps(wt47, wt48);
__m512 tmp194 = _mm512_unpackhi_ps(wt47, wt48);
__m512 tmp195 = _mm512_unpacklo_ps(wt49, wt50);
__m512 tmp196 = _mm512_unpackhi_ps(wt49, wt50);
__m512 tmp197 = _mm512_unpacklo_ps(wt51, wt52);
__m512 tmp198 = _mm512_unpackhi_ps(wt51, wt52);
__m512 tmp199 = _mm512_unpacklo_ps(wt53, wt54);
__m512 tmp200 = _mm512_unpackhi_ps(wt53, wt54);
__m512 tmp201 = _mm512_unpacklo_ps(wt55, wt56);
__m512 tmp202 = _mm512_unpackhi_ps(wt55, wt56);
__m512 tmp203 = _mm512_unpacklo_ps(wt57, wt58);
__m512 tmp204 = _mm512_unpackhi_ps(wt57, wt58);
__m512 tmp205 = _mm512_unpacklo_ps(wt59, wt60);
__m512 tmp206 = _mm512_unpackhi_ps(wt59, wt60);
__m512 tmp207 = _mm512_unpacklo_ps(wt61, wt62);
__m512 tmp208 = _mm512_unpackhi_ps(wt61, wt62);
__m512 tmp209 = _mm512_shuffle_ps(tmp193, tmp195, 68);
__m512 tmp210 = _mm512_shuffle_ps(tmp193, tmp195, 238);
__m512 tmp211 = _mm512_shuffle_ps(tmp194, tmp196, 68);
__m512 tmp212 = _mm512_shuffle_ps(tmp194, tmp196, 238);
__m512 tmp213 = _mm512_shuffle_ps(tmp197, tmp199, 68);
__m512 tmp214 = _mm512_shuffle_ps(tmp197, tmp199, 238);
__m512 tmp215 = _mm512_shuffle_ps(tmp198, tmp200, 68);
__m512 tmp216 = _mm512_shuffle_ps(tmp198, tmp200, 238);
__m512 tmp217 = _mm512_shuffle_ps(tmp201, tmp203, 68);
__m512 tmp218 = _mm512_shuffle_ps(tmp201, tmp203, 238);
__m512 tmp219 = _mm512_shuffle_ps(tmp202, tmp204, 68);
__m512 tmp220 = _mm512_shuffle_ps(tmp202, tmp204, 238);
__m512 tmp221 = _mm512_shuffle_ps(tmp205, tmp207, 68);
__m512 tmp222 = _mm512_shuffle_ps(tmp205, tmp207, 238);
__m512 tmp223 = _mm512_shuffle_ps(tmp206, tmp208, 68);
__m512 tmp224 = _mm512_shuffle_ps(tmp206, tmp208, 238);
__m512 tmp225 = _mm512_shuffle_f32x4(tmp209, tmp213, 136);
__m512 tmp226 = _mm512_shuffle_f32x4(tmp209, tmp213, 221);
__m512 tmp227 = _mm512_shuffle_f32x4(tmp210, tmp214, 136);
__m512 tmp228 = _mm512_shuffle_f32x4(tmp210, tmp214, 221);
__m512 tmp229 = _mm512_shuffle_f32x4(tmp211, tmp215, 136);
__m512 tmp230 = _mm512_shuffle_f32x4(tmp211, tmp215, 221);
__m512 tmp231 = _mm512_shuffle_f32x4(tmp212, tmp216, 136);
__m512 tmp232 = _mm512_shuffle_f32x4(tmp212, tmp216, 221);
__m512 tmp233 = _mm512_shuffle_f32x4(tmp217, tmp221, 136);
__m512 tmp234 = _mm512_shuffle_f32x4(tmp217, tmp221, 221);
__m512 tmp235 = _mm512_shuffle_f32x4(tmp218, tmp222, 136);
__m512 tmp236 = _mm512_shuffle_f32x4(tmp218, tmp222, 221);
__m512 tmp237 = _mm512_shuffle_f32x4(tmp219, tmp223, 136);
__m512 tmp238 = _mm512_shuffle_f32x4(tmp219, tmp223, 221);
__m512 tmp239 = _mm512_shuffle_f32x4(tmp220, tmp224, 136);
__m512 tmp240 = _mm512_shuffle_f32x4(tmp220, tmp224, 221);
wt47 = _mm512_shuffle_f32x4(tmp225, tmp233, 136);
wt55 = _mm512_shuffle_f32x4(tmp225, tmp233, 221);
wt48 = _mm512_shuffle_f32x4(tmp227, tmp235, 136);
wt56 = _mm512_shuffle_f32x4(tmp227, tmp235, 221);
wt49 = _mm512_shuffle_f32x4(tmp229, tmp237, 136);
wt57 = _mm512_shuffle_f32x4(tmp229, tmp237, 221);
wt50 = _mm512_shuffle_f32x4(tmp231, tmp239, 136);
wt58 = _mm512_shuffle_f32x4(tmp231, tmp239, 221);
wt51 = _mm512_shuffle_f32x4(tmp226, tmp234, 136);
wt59 = _mm512_shuffle_f32x4(tmp226, tmp234, 221);
wt52 = _mm512_shuffle_f32x4(tmp228, tmp236, 136);
wt60 = _mm512_shuffle_f32x4(tmp228, tmp236, 221);
wt53 = _mm512_shuffle_f32x4(tmp230, tmp238, 136);
wt61 = _mm512_shuffle_f32x4(tmp230, tmp238, 221);
wt54 = _mm512_shuffle_f32x4(tmp232, tmp240, 136);
wt62 = _mm512_shuffle_f32x4(tmp232, tmp240, 221);
wt47 = _mm512_mul_ps(wt47, postMul6);
wt48 = _mm512_mul_ps(wt48, postMul6);
wt49 = _mm512_mul_ps(wt49, postMul6);
wt50 = _mm512_mul_ps(wt50, postMul6);
wt51 = _mm512_mul_ps(wt51, postMul6);
wt52 = _mm512_mul_ps(wt52, postMul6);
wt53 = _mm512_mul_ps(wt53, postMul6);
wt54 = _mm512_mul_ps(wt54, postMul6);
wt55 = _mm512_mul_ps(wt55, postMul6);
wt56 = _mm512_mul_ps(wt56, postMul6);
wt57 = _mm512_mul_ps(wt57, postMul6);
wt58 = _mm512_mul_ps(wt58, postMul6);
wt59 = _mm512_mul_ps(wt59, postMul6);
wt60 = _mm512_mul_ps(wt60, postMul6);
wt61 = _mm512_mul_ps(wt61, postMul6);
wt62 = _mm512_mul_ps(wt62, postMul6);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(1+16*c7)+(ptrdiff_t)0, 63>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(2+16*c7)+(ptrdiff_t)0, 63>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(3+16*c7)+(ptrdiff_t)0, 63>>cut2, wt49);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(4+16*c7)+(ptrdiff_t)0, 63>>cut2, wt50);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(5+16*c7)+(ptrdiff_t)0, 63>>cut2, wt51);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(6+16*c7)+(ptrdiff_t)0, 63>>cut2, wt52);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(7+16*c7)+(ptrdiff_t)0, 63>>cut2, wt53);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(8+16*c7)+(ptrdiff_t)0, 63>>cut2, wt54);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(9+16*c7)+(ptrdiff_t)0, 63>>cut2, wt55);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(10+16*c7)+(ptrdiff_t)0, 63>>cut2, wt56);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(11+16*c7)+(ptrdiff_t)0, 63>>cut2, wt57);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(12+16*c7)+(ptrdiff_t)0, 63>>cut2, wt58);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(13+16*c7)+(ptrdiff_t)0, 63>>cut2, wt59);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(14+16*c7)+(ptrdiff_t)0, 63>>cut2, wt60);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(15+16*c7)+(ptrdiff_t)0, 63>>cut2, wt61);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(16+16*c7)+(ptrdiff_t)0, 63>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(1+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(2+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(3+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt49);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(4+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt50);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(5+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt51);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(6+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt52);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(7+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt53);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(8+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt54);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(9+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt55);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(10+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt56);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(11+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt57);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(12+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt58);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(13+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt59);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(14+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt60);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(15+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt61);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(16+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(1+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(2+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(3+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt49);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(4+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt50);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(5+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt51);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(6+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt52);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(7+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt53);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(8+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt54);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(9+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt55);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(10+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt56);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(11+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt57);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(12+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt58);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(13+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt59);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(14+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt60);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(15+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt61);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(16+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(1+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt47);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(2+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt48);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(3+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt49);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(4+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt50);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(5+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt51);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(6+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt52);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(7+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt53);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(8+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt54);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(9+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt55);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(10+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt56);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(11+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt57);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(12+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt58);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(13+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt59);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(14+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt60);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(15+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt61);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(16+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt62);
}
}
}
}
}

// Runs ResNet50OneArrangeWts1Callee1 on team19 over a 3-dimensional
// task grid of extent 2 x 1 x 1. tensors11 is handed to the callee
// untouched through the task's any1 field.
static void ResNet50OneArrangeWts1(ResNet50ThreaderTeam1* team19, char** tensors11) {
	ResNet50ThreaderTask1 job;
	// Grid shape first: three dimensions, only the first has more than one point.
	job.nd1 = 3;
	job.hull1[0] = 2;
	job.hull1[1] = 1;
	job.hull1[2] = 1;
	// Then the work function and its opaque argument.
	job.any1 = tensors11;
	job.callee1 = ResNet50OneArrangeWts1Callee1;
	ResNet50ThreaderDo1(team19, &job);
}

// Worker for ResNet50OneArrangeDats1. Copies 64-float tiles from
// tensors[0] into the tile-major buffer tensors[1].
//
// pt13[1] selects the task slot. Slot s starts at tile 2*s and handles
// two tiles, except slot 23 which handles three; with slots 0..23 this
// covers tiles 0..48 exactly once.
//
// Source addressing:      256 bytes per tile, 12608 bytes per k step.
// Destination addressing: 16384 bytes per tile, 256 bytes per k step.
// (k runs 0..63; presumably the input channel -- confirm against the caller.)
static void ResNet50OneArrangeDats1Callee1(ResNet50ThreaderTask1* task16, int64_t* pt13) {
	char** bufs = task16->any1;
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];
	ptrdiff_t slot = pt13[1];
	ptrdiff_t tile = 2*slot;
	ptrdiff_t lastTile = tile+(slot < 23 ? 1 : 2);
	for (; tile != 49; ++tile) {
		for (ptrdiff_t k = 0; k < 64; ++k) {
			const char* from = src+256*tile+12608*k;
			char* to = dst+16384*tile+256*k;
			// Four full-width (mask 65535 = all 16 lanes) vectors per step:
			// load all of them, then store all of them.
			__m512 v0 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)0);
			__m512 v1 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)64);
			__m512 v2 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)128);
			__m512 v3 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)192);
			_mm512_mask_storeu_ps(to+(ptrdiff_t)0, 65535, v0);
			_mm512_mask_storeu_ps(to+(ptrdiff_t)64, 65535, v1);
			_mm512_mask_storeu_ps(to+(ptrdiff_t)128, 65535, v2);
			_mm512_mask_storeu_ps(to+(ptrdiff_t)192, 65535, v3);
		}
		// Stop once this slot's final tile has been copied.
		if (tile >= lastTile) break;
	}
}

// Runs ResNet50OneArrangeDats1Callee1 on team20 over a 4-dimensional
// task grid of extent 1 x 24 x 1 x 1. The callee reads its slot from
// coordinate 1, so 24 slots are dispatched. tensors13 is forwarded to
// the callee through the task's any1 field.
static void ResNet50OneArrangeDats1(ResNet50ThreaderTeam1* team20, char** tensors13) {
	ResNet50ThreaderTask1 job;
	// Grid shape: four dimensions, only dimension 1 has more than one point.
	job.nd1 = 4;
	job.hull1[0] = 1;
	job.hull1[1] = 24;
	job.hull1[2] = 1;
	job.hull1[3] = 1;
	// Work function and its opaque argument.
	job.any1 = tensors13;
	job.callee1 = ResNet50OneArrangeDats1Callee1;
	ResNet50ThreaderDo1(team20, &job);
}

// Worker for ResNet50OneApply1: multiplies arranged weights by arranged
// data and writes the accumulated sums to the output tensor.
//
// task18->any1 points at a pair of pointers; only element 0 (the tensor
// table) is read here. tensors16[0] = arranged weights, tensors16[1] =
// arranged data, tensors16[2] = output.
//
// pt14[0] (w22) selects a run of weight groups starting at group 9*w22.
// pt14[1] (d3) selects the single data tile j9 that this call processes.
//
// Layouts as used by the addressing below:
//   weights: 1560 bytes per full group = 24-byte header (6 floats that
//            seed the accumulators; presumably the bias -- the code that
//            fills this buffer is outside this block) followed by 64 steps
//            of 6 floats. Group 53 is a short tail record with 2 floats
//            per step (8-byte header + 64 steps of 8 bytes).
//   data:    16384 bytes per tile, 256 bytes (4 vectors of 16 floats) per step.
//   output:  12608 bytes per output row, 75648 (= 6*12608) per weight group,
//            256 bytes per tile.
static void ResNet50OneApply1Callee1(ResNet50ThreaderTask1* task18, int64_t* pt14) {
void** pair2 = task18->any1;
char** tensors16 = pair2[0];
// e6 and g6 are fixed at 0, so the base-pointer offsets below vanish.
ptrdiff_t e6 = 0;
ptrdiff_t g6 = 0;
ptrdiff_t d3 = pt14[1];
ptrdiff_t w22 = pt14[0];
char*restrict arrangedWts1 = tensors16[0]+1070080*e6+(ptrdiff_t)83200*1*g6;
char*restrict arrangedDats1 = tensors16[1]+10474240*e6+(ptrdiff_t)802816*1*g6;
char*restrict datPtr4 = tensors16[2]+(ptrdiff_t)4034560*1*g6;
ptrdiff_t ii3 = 1;
for (ptrdiff_t i14 = 0; i14 < ii3; ++i14) {
// jj21 == j9 on entry, so the check at the bottom of the j9 loop ends the
// call after one tile.
ptrdiff_t j9 = 1*d3;
ptrdiff_t jj21 = j9+0;
for (; j9 != 49; ++j9) {
// Up to 9 full weight groups per call: k50 .. k50+8, capped at group 53.
ptrdiff_t k50 = 9*w22;
ptrdiff_t kk25 = k50+8;
for (; k50 != 53; ++k50) {
// With s10 == -1 the offsets 24*s10+24 .. 24*s10+44 resolve to bytes 0..20
// of the group record, i.e. its 6-float header.
ptrdiff_t s10 = -1;
__m512 sum7 = _mm512_set1_ps(*(float*)(arrangedWts1+83200*i14+1560*k50+24*s10+(ptrdiff_t)24));
__m512 sum11 = _mm512_set1_ps(*(float*)(arrangedWts1+83200*i14+1560*k50+24*s10+(ptrdiff_t)28));
__m512 sum15 = _mm512_set1_ps(*(float*)(arrangedWts1+83200*i14+1560*k50+24*s10+(ptrdiff_t)32));
__m512 sum19 = _mm512_set1_ps(*(float*)(arrangedWts1+83200*i14+1560*k50+24*s10+(ptrdiff_t)36));
__m512 sum23 = _mm512_set1_ps(*(float*)(arrangedWts1+83200*i14+1560*k50+24*s10+(ptrdiff_t)40));
__m512 sum27 = _mm512_set1_ps(*(float*)(arrangedWts1+83200*i14+1560*k50+24*s10+(ptrdiff_t)44));
// Each output row has four accumulators (one per 16-float data vector),
// all starting from the same broadcast header value.
__m512 sum8 = sum7;
__m512 sum9 = sum7;
__m512 sum10 = sum7;
__m512 sum12 = sum11;
__m512 sum13 = sum11;
__m512 sum14 = sum11;
__m512 sum16 = sum15;
__m512 sum17 = sum15;
__m512 sum18 = sum15;
__m512 sum20 = sum19;
__m512 sum21 = sum19;
__m512 sum22 = sum19;
__m512 sum24 = sum23;
__m512 sum25 = sum23;
__m512 sum26 = sum23;
__m512 sum28 = sum27;
__m512 sum29 = sum27;
__m512 sum30 = sum27;
// 64 accumulation steps: load 4 data vectors once, then for each of the
// 6 output rows broadcast one weight and fuse-multiply-add it into that
// row's 4 accumulators.
for (s10 = 0; s10 < 64; ++s10) {
__m512 dat913 = _mm512_loadu_ps(arrangedDats1+802816*i14+16384*j9+256*s10+(ptrdiff_t)0);
__m512 dat914 = _mm512_loadu_ps(arrangedDats1+802816*i14+16384*j9+256*s10+(ptrdiff_t)64);
__m512 dat915 = _mm512_loadu_ps(arrangedDats1+802816*i14+16384*j9+256*s10+(ptrdiff_t)128);
__m512 dat916 = _mm512_loadu_ps(arrangedDats1+802816*i14+16384*j9+256*s10+(ptrdiff_t)192);
__m512 wt95 = _mm512_set1_ps(*(float*)(arrangedWts1+83200*i14+1560*k50+24*s10+(ptrdiff_t)24));
sum7 = _mm512_fmadd_ps(wt95, dat913, sum7);
sum8 = _mm512_fmadd_ps(wt95, dat914, sum8);
sum9 = _mm512_fmadd_ps(wt95, dat915, sum9);
sum10 = _mm512_fmadd_ps(wt95, dat916, sum10);
__m512 wt96 = _mm512_set1_ps(*(float*)(arrangedWts1+83200*i14+1560*k50+24*s10+(ptrdiff_t)28));
sum11 = _mm512_fmadd_ps(wt96, dat913, sum11);
sum12 = _mm512_fmadd_ps(wt96, dat914, sum12);
sum13 = _mm512_fmadd_ps(wt96, dat915, sum13);
sum14 = _mm512_fmadd_ps(wt96, dat916, sum14);
__m512 wt97 = _mm512_set1_ps(*(float*)(arrangedWts1+83200*i14+1560*k50+24*s10+(ptrdiff_t)32));
sum15 = _mm512_fmadd_ps(wt97, dat913, sum15);
sum16 = _mm512_fmadd_ps(wt97, dat914, sum16);
sum17 = _mm512_fmadd_ps(wt97, dat915, sum17);
sum18 = _mm512_fmadd_ps(wt97, dat916, sum18);
__m512 wt98 = _mm512_set1_ps(*(float*)(arrangedWts1+83200*i14+1560*k50+24*s10+(ptrdiff_t)36));
sum19 = _mm512_fmadd_ps(wt98, dat913, sum19);
sum20 = _mm512_fmadd_ps(wt98, dat914, sum20);
sum21 = _mm512_fmadd_ps(wt98, dat915, sum21);
sum22 = _mm512_fmadd_ps(wt98, dat916, sum22);
__m512 wt99 = _mm512_set1_ps(*(float*)(arrangedWts1+83200*i14+1560*k50+24*s10+(ptrdiff_t)40));
sum23 = _mm512_fmadd_ps(wt99, dat913, sum23);
sum24 = _mm512_fmadd_ps(wt99, dat914, sum24);
sum25 = _mm512_fmadd_ps(wt99, dat915, sum25);
sum26 = _mm512_fmadd_ps(wt99, dat916, sum26);
__m512 wt100 = _mm512_set1_ps(*(float*)(arrangedWts1+83200*i14+1560*k50+24*s10+(ptrdiff_t)44));
sum27 = _mm512_fmadd_ps(wt100, dat913, sum27);
sum28 = _mm512_fmadd_ps(wt100, dat914, sum28);
sum29 = _mm512_fmadd_ps(wt100, dat915, sum29);
sum30 = _mm512_fmadd_ps(wt100, dat916, sum30);
}
// Write the 6 output rows (12608 bytes apart), 4 full vectors each
// (mask 65535 = all 16 lanes).
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)0, 65535, sum7);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)64, 65535, sum8);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)128, 65535, sum9);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)192, 65535, sum10);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)12608, 65535, sum11);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)12672, 65535, sum12);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)12736, 65535, sum13);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)12800, 65535, sum14);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)25216, 65535, sum15);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)25280, 65535, sum16);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)25344, 65535, sum17);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)25408, 65535, sum18);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)37824, 65535, sum19);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)37888, 65535, sum20);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)37952, 65535, sum21);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)38016, 65535, sum22);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)50432, 65535, sum23);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)50496, 65535, sum24);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)50560, 65535, sum25);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)50624, 65535, sum26);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)63040, 65535, sum27);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)63104, 65535, sum28);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)63168, 65535, sum29);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)63232, 65535, sum30);
// After the 9th group of this run the whole call is done. A run that
// starts at group 45 reaches 53 first and falls through to the tail below.
if (k50 >= kk25) return;
}
// Tail: only reached when k50 hit 53 without the early return above.
// Group 53 carries 2 output rows, so weights step by 8 bytes and only
// 2x4 accumulators are needed. s11 == -1 again addresses the header.
ptrdiff_t s11 = -1;
__m512 sum31 = _mm512_set1_ps(*(float*)(arrangedWts1+83200*i14+1560*k50+8*s11+(ptrdiff_t)8));
__m512 sum35 = _mm512_set1_ps(*(float*)(arrangedWts1+83200*i14+1560*k50+8*s11+(ptrdiff_t)12));
__m512 sum32 = sum31;
__m512 sum33 = sum31;
__m512 sum34 = sum31;
__m512 sum36 = sum35;
__m512 sum37 = sum35;
__m512 sum38 = sum35;
for (s11 = 0; s11 < 64; ++s11) {
__m512 dat917 = _mm512_loadu_ps(arrangedDats1+802816*i14+16384*j9+256*s11+(ptrdiff_t)0);
__m512 dat918 = _mm512_loadu_ps(arrangedDats1+802816*i14+16384*j9+256*s11+(ptrdiff_t)64);
__m512 dat919 = _mm512_loadu_ps(arrangedDats1+802816*i14+16384*j9+256*s11+(ptrdiff_t)128);
__m512 dat920 = _mm512_loadu_ps(arrangedDats1+802816*i14+16384*j9+256*s11+(ptrdiff_t)192);
__m512 wt101 = _mm512_set1_ps(*(float*)(arrangedWts1+83200*i14+1560*k50+8*s11+(ptrdiff_t)8));
sum31 = _mm512_fmadd_ps(wt101, dat917, sum31);
sum32 = _mm512_fmadd_ps(wt101, dat918, sum32);
sum33 = _mm512_fmadd_ps(wt101, dat919, sum33);
sum34 = _mm512_fmadd_ps(wt101, dat920, sum34);
__m512 wt102 = _mm512_set1_ps(*(float*)(arrangedWts1+83200*i14+1560*k50+8*s11+(ptrdiff_t)12));
sum35 = _mm512_fmadd_ps(wt102, dat917, sum35);
sum36 = _mm512_fmadd_ps(wt102, dat918, sum36);
sum37 = _mm512_fmadd_ps(wt102, dat919, sum37);
sum38 = _mm512_fmadd_ps(wt102, dat920, sum38);
}
// Write the 2 tail output rows.
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)0, 65535, sum31);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)64, 65535, sum32);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)128, 65535, sum33);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)192, 65535, sum34);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)12608, 65535, sum35);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)12672, 65535, sum36);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)12736, 65535, sum37);
_mm512_mask_storeu_ps(datPtr4+4034560*i14+256*j9+75648*k50+(ptrdiff_t)12800, 65535, sum38);
// Always true on the first pass (jj21 equals the starting j9): one tile per call.
if (j9 >= jj21) return;
}
}
}

// Runs ResNet50OneApply1Callee1 on team21 over a 3-dimensional task grid
// of extent 6 x 49 x 1. The callee receives a two-slot pointer array via
// any1: slot 0 is tensors15, slot 1 is null.
static void ResNet50OneApply1(ResNet50ThreaderTeam1* team21, char** tensors15) {
	void* args[] = {tensors15, 0};
	ResNet50ThreaderTask1 job;
	// Grid shape: coordinate 0 has 6 points, coordinate 1 has 49.
	job.nd1 = 3;
	job.hull1[0] = 6;
	job.hull1[1] = 49;
	job.hull1[2] = 1;
	// Work function and its opaque argument.
	job.any1 = args;
	job.callee1 = ResNet50OneApply1Callee1;
	ResNet50ThreaderDo1(team21, &job);
}

static void ResNet50OneArrangeWts2Callee1(ResNet50ThreaderTask1* task28, int64_t* pt19) {
char** tensors26 = task28->any1;
ptrdiff_t b48 = pt19[0];
char*restrict wtPtr5 = tensors26[0]+(ptrdiff_t)3340*0+(ptrdiff_t)65536*0;
char*restrict biasPtr5 = tensors26[1]+(ptrdiff_t)1024*0;
char*restrict bnPtr5 = tensors26[2]+(ptrdiff_t)8*256*0;
char*restrict arranged3 = tensors26[3]+(ptrdiff_t)856064*0+(ptrdiff_t)66560*0;
ptrdiff_t ii4 = 1;
for (ptrdiff_t i19 = 0; i19 < ii4; ++i19) {
ptrdiff_t j14 = 8*b48;
ptrdiff_t jj23 = j14+8;
for (; j14 < jj23; ++j14) {
if (j14 < 15) {
ptrdiff_t k72 = 0+16*(j14-0);
ptrdiff_t l24 = (size_t)(0+k72)/6;
ptrdiff_t cut6 = (size_t)(0+k72)%6;
switch (cut6) {
case 0:;
case 2: {
__m512 sum80 = _mm512_maskz_loadu_ps(65535, biasPtr5+1024*i19+4*k72);
__m512i pmMul8 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd8 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo6 = _mm512_loadu_ps(bnPtr5+(ptrdiff_t)8*(k72+256*i19));
__m512 masHi6 = _mm512_maskz_loadu_ps(65535, bnPtr5+(ptrdiff_t)8*(k72+256*i19)+(ptrdiff_t)64);
__m512 postMul15 = _mm512_permutex2var_ps(masLo6, pmMul8, masHi6);
__m512 postAdd9 = _mm512_permutex2var_ps(masLo6, pmAdd8, masHi6);
sum80 = _mm512_fmadd_ps(sum80, postMul15, postAdd9);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*0+(ptrdiff_t)0, 63>>cut6, sum80);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*0+(ptrdiff_t)1536, 4032>>cut6, sum80);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*0+(ptrdiff_t)3072, 65535-(4095>>cut6), sum80);
ptrdiff_t c13 = 0;
for (; c13 != 4; ++c13) {
__m512 wt123 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)0);
__m512 wt124 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)256);
__m512 wt125 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)512);
__m512 wt126 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)768);
__m512 wt127 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)1024);
__m512 wt128 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)1280);
__m512 wt129 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)1536);
__m512 wt130 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)1792);
__m512 wt131 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)2048);
__m512 wt132 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)2304);
__m512 wt133 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)2560);
__m512 wt134 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)2816);
__m512 wt135 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)3072);
__m512 wt136 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)3328);
__m512 wt137 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)3584);
__m512 wt138 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)3840);
__m512 tmp5253 = _mm512_unpacklo_ps(wt123, wt124);
__m512 tmp5254 = _mm512_unpackhi_ps(wt123, wt124);
__m512 tmp5255 = _mm512_unpacklo_ps(wt125, wt126);
__m512 tmp5256 = _mm512_unpackhi_ps(wt125, wt126);
__m512 tmp5257 = _mm512_unpacklo_ps(wt127, wt128);
__m512 tmp5258 = _mm512_unpackhi_ps(wt127, wt128);
__m512 tmp5259 = _mm512_unpacklo_ps(wt129, wt130);
__m512 tmp5260 = _mm512_unpackhi_ps(wt129, wt130);
__m512 tmp5261 = _mm512_unpacklo_ps(wt131, wt132);
__m512 tmp5262 = _mm512_unpackhi_ps(wt131, wt132);
__m512 tmp5263 = _mm512_unpacklo_ps(wt133, wt134);
__m512 tmp5264 = _mm512_unpackhi_ps(wt133, wt134);
__m512 tmp5265 = _mm512_unpacklo_ps(wt135, wt136);
__m512 tmp5266 = _mm512_unpackhi_ps(wt135, wt136);
__m512 tmp5267 = _mm512_unpacklo_ps(wt137, wt138);
__m512 tmp5268 = _mm512_unpackhi_ps(wt137, wt138);
__m512 tmp5269 = _mm512_shuffle_ps(tmp5253, tmp5255, 68);
__m512 tmp5270 = _mm512_shuffle_ps(tmp5253, tmp5255, 238);
__m512 tmp5271 = _mm512_shuffle_ps(tmp5254, tmp5256, 68);
__m512 tmp5272 = _mm512_shuffle_ps(tmp5254, tmp5256, 238);
__m512 tmp5273 = _mm512_shuffle_ps(tmp5257, tmp5259, 68);
__m512 tmp5274 = _mm512_shuffle_ps(tmp5257, tmp5259, 238);
__m512 tmp5275 = _mm512_shuffle_ps(tmp5258, tmp5260, 68);
__m512 tmp5276 = _mm512_shuffle_ps(tmp5258, tmp5260, 238);
__m512 tmp5277 = _mm512_shuffle_ps(tmp5261, tmp5263, 68);
__m512 tmp5278 = _mm512_shuffle_ps(tmp5261, tmp5263, 238);
__m512 tmp5279 = _mm512_shuffle_ps(tmp5262, tmp5264, 68);
__m512 tmp5280 = _mm512_shuffle_ps(tmp5262, tmp5264, 238);
__m512 tmp5281 = _mm512_shuffle_ps(tmp5265, tmp5267, 68);
__m512 tmp5282 = _mm512_shuffle_ps(tmp5265, tmp5267, 238);
__m512 tmp5283 = _mm512_shuffle_ps(tmp5266, tmp5268, 68);
__m512 tmp5284 = _mm512_shuffle_ps(tmp5266, tmp5268, 238);
__m512 tmp5285 = _mm512_shuffle_f32x4(tmp5269, tmp5273, 136);
__m512 tmp5286 = _mm512_shuffle_f32x4(tmp5269, tmp5273, 221);
__m512 tmp5287 = _mm512_shuffle_f32x4(tmp5270, tmp5274, 136);
__m512 tmp5288 = _mm512_shuffle_f32x4(tmp5270, tmp5274, 221);
__m512 tmp5289 = _mm512_shuffle_f32x4(tmp5271, tmp5275, 136);
__m512 tmp5290 = _mm512_shuffle_f32x4(tmp5271, tmp5275, 221);
__m512 tmp5291 = _mm512_shuffle_f32x4(tmp5272, tmp5276, 136);
__m512 tmp5292 = _mm512_shuffle_f32x4(tmp5272, tmp5276, 221);
__m512 tmp5293 = _mm512_shuffle_f32x4(tmp5277, tmp5281, 136);
__m512 tmp5294 = _mm512_shuffle_f32x4(tmp5277, tmp5281, 221);
__m512 tmp5295 = _mm512_shuffle_f32x4(tmp5278, tmp5282, 136);
__m512 tmp5296 = _mm512_shuffle_f32x4(tmp5278, tmp5282, 221);
__m512 tmp5297 = _mm512_shuffle_f32x4(tmp5279, tmp5283, 136);
__m512 tmp5298 = _mm512_shuffle_f32x4(tmp5279, tmp5283, 221);
__m512 tmp5299 = _mm512_shuffle_f32x4(tmp5280, tmp5284, 136);
__m512 tmp5300 = _mm512_shuffle_f32x4(tmp5280, tmp5284, 221);
wt123 = _mm512_shuffle_f32x4(tmp5285, tmp5293, 136);
wt131 = _mm512_shuffle_f32x4(tmp5285, tmp5293, 221);
wt124 = _mm512_shuffle_f32x4(tmp5287, tmp5295, 136);
wt132 = _mm512_shuffle_f32x4(tmp5287, tmp5295, 221);
wt125 = _mm512_shuffle_f32x4(tmp5289, tmp5297, 136);
wt133 = _mm512_shuffle_f32x4(tmp5289, tmp5297, 221);
wt126 = _mm512_shuffle_f32x4(tmp5291, tmp5299, 136);
wt134 = _mm512_shuffle_f32x4(tmp5291, tmp5299, 221);
wt127 = _mm512_shuffle_f32x4(tmp5286, tmp5294, 136);
wt135 = _mm512_shuffle_f32x4(tmp5286, tmp5294, 221);
wt128 = _mm512_shuffle_f32x4(tmp5288, tmp5296, 136);
wt136 = _mm512_shuffle_f32x4(tmp5288, tmp5296, 221);
wt129 = _mm512_shuffle_f32x4(tmp5290, tmp5298, 136);
wt137 = _mm512_shuffle_f32x4(tmp5290, tmp5298, 221);
wt130 = _mm512_shuffle_f32x4(tmp5292, tmp5300, 136);
wt138 = _mm512_shuffle_f32x4(tmp5292, tmp5300, 221);
wt123 = _mm512_mul_ps(wt123, postMul15);
wt124 = _mm512_mul_ps(wt124, postMul15);
wt125 = _mm512_mul_ps(wt125, postMul15);
wt126 = _mm512_mul_ps(wt126, postMul15);
wt127 = _mm512_mul_ps(wt127, postMul15);
wt128 = _mm512_mul_ps(wt128, postMul15);
wt129 = _mm512_mul_ps(wt129, postMul15);
wt130 = _mm512_mul_ps(wt130, postMul15);
wt131 = _mm512_mul_ps(wt131, postMul15);
wt132 = _mm512_mul_ps(wt132, postMul15);
wt133 = _mm512_mul_ps(wt133, postMul15);
wt134 = _mm512_mul_ps(wt134, postMul15);
wt135 = _mm512_mul_ps(wt135, postMul15);
wt136 = _mm512_mul_ps(wt136, postMul15);
wt137 = _mm512_mul_ps(wt137, postMul15);
wt138 = _mm512_mul_ps(wt138, postMul15);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(1+16*c13)+(ptrdiff_t)0, 63>>cut6, wt123);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(2+16*c13)+(ptrdiff_t)0, 63>>cut6, wt124);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(3+16*c13)+(ptrdiff_t)0, 63>>cut6, wt125);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(4+16*c13)+(ptrdiff_t)0, 63>>cut6, wt126);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(5+16*c13)+(ptrdiff_t)0, 63>>cut6, wt127);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(6+16*c13)+(ptrdiff_t)0, 63>>cut6, wt128);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(7+16*c13)+(ptrdiff_t)0, 63>>cut6, wt129);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(8+16*c13)+(ptrdiff_t)0, 63>>cut6, wt130);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(9+16*c13)+(ptrdiff_t)0, 63>>cut6, wt131);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(10+16*c13)+(ptrdiff_t)0, 63>>cut6, wt132);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(11+16*c13)+(ptrdiff_t)0, 63>>cut6, wt133);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(12+16*c13)+(ptrdiff_t)0, 63>>cut6, wt134);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(13+16*c13)+(ptrdiff_t)0, 63>>cut6, wt135);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(14+16*c13)+(ptrdiff_t)0, 63>>cut6, wt136);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(15+16*c13)+(ptrdiff_t)0, 63>>cut6, wt137);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(16+16*c13)+(ptrdiff_t)0, 63>>cut6, wt138);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(1+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt123);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(2+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt124);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(3+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt125);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(4+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt126);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(5+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt127);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(6+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt128);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(7+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt129);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(8+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt130);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(9+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt131);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(10+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt132);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(11+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt133);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(12+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt134);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(13+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt135);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(14+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt136);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(15+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt137);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(16+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt138);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(1+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt123);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(2+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt124);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(3+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt125);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(4+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt126);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(5+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt127);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(6+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt128);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(7+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt129);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(8+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt130);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(9+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt131);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(10+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt132);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(11+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt133);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(12+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt134);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(13+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt135);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(14+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt136);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(15+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt137);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(16+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt138);
}
break;
}
default: {
cut6 = 4;
__m512 sum81 = _mm512_maskz_loadu_ps(65535, biasPtr5+1024*i19+4*k72);
__m512i pmMul9 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd9 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo7 = _mm512_loadu_ps(bnPtr5+(ptrdiff_t)8*(k72+256*i19));
__m512 masHi7 = _mm512_maskz_loadu_ps(65535, bnPtr5+(ptrdiff_t)8*(k72+256*i19)+(ptrdiff_t)64);
__m512 postMul16 = _mm512_permutex2var_ps(masLo7, pmMul9, masHi7);
__m512 postAdd10 = _mm512_permutex2var_ps(masLo7, pmAdd9, masHi7);
sum81 = _mm512_fmadd_ps(sum81, postMul16, postAdd10);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*0+(ptrdiff_t)0, 63>>cut6, sum81);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*0+(ptrdiff_t)1536, 4032>>cut6, sum81);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*0+(ptrdiff_t)3072, 258048>>cut6, sum81);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*0+(ptrdiff_t)4608, 65535-(262143>>cut6), sum81);
ptrdiff_t c14 = 0;
for (; c14 != 4; ++c14) {
__m512 wt139 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)0);
__m512 wt140 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)256);
__m512 wt141 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)512);
__m512 wt142 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)768);
__m512 wt143 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)1024);
__m512 wt144 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)1280);
__m512 wt145 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)1536);
__m512 wt146 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)1792);
__m512 wt147 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)2048);
__m512 wt148 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)2304);
__m512 wt149 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)2560);
__m512 wt150 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)2816);
__m512 wt151 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)3072);
__m512 wt152 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)3328);
__m512 wt153 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)3584);
__m512 wt154 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)3840);
__m512 tmp5301 = _mm512_unpacklo_ps(wt139, wt140);
__m512 tmp5302 = _mm512_unpackhi_ps(wt139, wt140);
__m512 tmp5303 = _mm512_unpacklo_ps(wt141, wt142);
__m512 tmp5304 = _mm512_unpackhi_ps(wt141, wt142);
__m512 tmp5305 = _mm512_unpacklo_ps(wt143, wt144);
__m512 tmp5306 = _mm512_unpackhi_ps(wt143, wt144);
__m512 tmp5307 = _mm512_unpacklo_ps(wt145, wt146);
__m512 tmp5308 = _mm512_unpackhi_ps(wt145, wt146);
__m512 tmp5309 = _mm512_unpacklo_ps(wt147, wt148);
__m512 tmp5310 = _mm512_unpackhi_ps(wt147, wt148);
__m512 tmp5311 = _mm512_unpacklo_ps(wt149, wt150);
__m512 tmp5312 = _mm512_unpackhi_ps(wt149, wt150);
__m512 tmp5313 = _mm512_unpacklo_ps(wt151, wt152);
__m512 tmp5314 = _mm512_unpackhi_ps(wt151, wt152);
__m512 tmp5315 = _mm512_unpacklo_ps(wt153, wt154);
__m512 tmp5316 = _mm512_unpackhi_ps(wt153, wt154);
__m512 tmp5317 = _mm512_shuffle_ps(tmp5301, tmp5303, 68);
__m512 tmp5318 = _mm512_shuffle_ps(tmp5301, tmp5303, 238);
__m512 tmp5319 = _mm512_shuffle_ps(tmp5302, tmp5304, 68);
__m512 tmp5320 = _mm512_shuffle_ps(tmp5302, tmp5304, 238);
__m512 tmp5321 = _mm512_shuffle_ps(tmp5305, tmp5307, 68);
__m512 tmp5322 = _mm512_shuffle_ps(tmp5305, tmp5307, 238);
__m512 tmp5323 = _mm512_shuffle_ps(tmp5306, tmp5308, 68);
__m512 tmp5324 = _mm512_shuffle_ps(tmp5306, tmp5308, 238);
__m512 tmp5325 = _mm512_shuffle_ps(tmp5309, tmp5311, 68);
__m512 tmp5326 = _mm512_shuffle_ps(tmp5309, tmp5311, 238);
__m512 tmp5327 = _mm512_shuffle_ps(tmp5310, tmp5312, 68);
__m512 tmp5328 = _mm512_shuffle_ps(tmp5310, tmp5312, 238);
__m512 tmp5329 = _mm512_shuffle_ps(tmp5313, tmp5315, 68);
__m512 tmp5330 = _mm512_shuffle_ps(tmp5313, tmp5315, 238);
__m512 tmp5331 = _mm512_shuffle_ps(tmp5314, tmp5316, 68);
__m512 tmp5332 = _mm512_shuffle_ps(tmp5314, tmp5316, 238);
__m512 tmp5333 = _mm512_shuffle_f32x4(tmp5317, tmp5321, 136);
__m512 tmp5334 = _mm512_shuffle_f32x4(tmp5317, tmp5321, 221);
__m512 tmp5335 = _mm512_shuffle_f32x4(tmp5318, tmp5322, 136);
__m512 tmp5336 = _mm512_shuffle_f32x4(tmp5318, tmp5322, 221);
__m512 tmp5337 = _mm512_shuffle_f32x4(tmp5319, tmp5323, 136);
__m512 tmp5338 = _mm512_shuffle_f32x4(tmp5319, tmp5323, 221);
__m512 tmp5339 = _mm512_shuffle_f32x4(tmp5320, tmp5324, 136);
__m512 tmp5340 = _mm512_shuffle_f32x4(tmp5320, tmp5324, 221);
__m512 tmp5341 = _mm512_shuffle_f32x4(tmp5325, tmp5329, 136);
__m512 tmp5342 = _mm512_shuffle_f32x4(tmp5325, tmp5329, 221);
__m512 tmp5343 = _mm512_shuffle_f32x4(tmp5326, tmp5330, 136);
__m512 tmp5344 = _mm512_shuffle_f32x4(tmp5326, tmp5330, 221);
__m512 tmp5345 = _mm512_shuffle_f32x4(tmp5327, tmp5331, 136);
__m512 tmp5346 = _mm512_shuffle_f32x4(tmp5327, tmp5331, 221);
__m512 tmp5347 = _mm512_shuffle_f32x4(tmp5328, tmp5332, 136);
__m512 tmp5348 = _mm512_shuffle_f32x4(tmp5328, tmp5332, 221);
wt139 = _mm512_shuffle_f32x4(tmp5333, tmp5341, 136);
wt147 = _mm512_shuffle_f32x4(tmp5333, tmp5341, 221);
wt140 = _mm512_shuffle_f32x4(tmp5335, tmp5343, 136);
wt148 = _mm512_shuffle_f32x4(tmp5335, tmp5343, 221);
wt141 = _mm512_shuffle_f32x4(tmp5337, tmp5345, 136);
wt149 = _mm512_shuffle_f32x4(tmp5337, tmp5345, 221);
wt142 = _mm512_shuffle_f32x4(tmp5339, tmp5347, 136);
wt150 = _mm512_shuffle_f32x4(tmp5339, tmp5347, 221);
wt143 = _mm512_shuffle_f32x4(tmp5334, tmp5342, 136);
wt151 = _mm512_shuffle_f32x4(tmp5334, tmp5342, 221);
wt144 = _mm512_shuffle_f32x4(tmp5336, tmp5344, 136);
wt152 = _mm512_shuffle_f32x4(tmp5336, tmp5344, 221);
wt145 = _mm512_shuffle_f32x4(tmp5338, tmp5346, 136);
wt153 = _mm512_shuffle_f32x4(tmp5338, tmp5346, 221);
wt146 = _mm512_shuffle_f32x4(tmp5340, tmp5348, 136);
wt154 = _mm512_shuffle_f32x4(tmp5340, tmp5348, 221);
wt139 = _mm512_mul_ps(wt139, postMul16);
wt140 = _mm512_mul_ps(wt140, postMul16);
wt141 = _mm512_mul_ps(wt141, postMul16);
wt142 = _mm512_mul_ps(wt142, postMul16);
wt143 = _mm512_mul_ps(wt143, postMul16);
wt144 = _mm512_mul_ps(wt144, postMul16);
wt145 = _mm512_mul_ps(wt145, postMul16);
wt146 = _mm512_mul_ps(wt146, postMul16);
wt147 = _mm512_mul_ps(wt147, postMul16);
wt148 = _mm512_mul_ps(wt148, postMul16);
wt149 = _mm512_mul_ps(wt149, postMul16);
wt150 = _mm512_mul_ps(wt150, postMul16);
wt151 = _mm512_mul_ps(wt151, postMul16);
wt152 = _mm512_mul_ps(wt152, postMul16);
wt153 = _mm512_mul_ps(wt153, postMul16);
wt154 = _mm512_mul_ps(wt154, postMul16);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(1+16*c14)+(ptrdiff_t)0, 63>>cut6, wt139);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(2+16*c14)+(ptrdiff_t)0, 63>>cut6, wt140);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(3+16*c14)+(ptrdiff_t)0, 63>>cut6, wt141);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(4+16*c14)+(ptrdiff_t)0, 63>>cut6, wt142);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(5+16*c14)+(ptrdiff_t)0, 63>>cut6, wt143);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(6+16*c14)+(ptrdiff_t)0, 63>>cut6, wt144);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(7+16*c14)+(ptrdiff_t)0, 63>>cut6, wt145);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(8+16*c14)+(ptrdiff_t)0, 63>>cut6, wt146);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(9+16*c14)+(ptrdiff_t)0, 63>>cut6, wt147);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(10+16*c14)+(ptrdiff_t)0, 63>>cut6, wt148);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(11+16*c14)+(ptrdiff_t)0, 63>>cut6, wt149);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(12+16*c14)+(ptrdiff_t)0, 63>>cut6, wt150);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(13+16*c14)+(ptrdiff_t)0, 63>>cut6, wt151);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(14+16*c14)+(ptrdiff_t)0, 63>>cut6, wt152);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(15+16*c14)+(ptrdiff_t)0, 63>>cut6, wt153);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(16+16*c14)+(ptrdiff_t)0, 63>>cut6, wt154);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(1+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt139);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(2+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt140);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(3+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt141);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(4+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt142);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(5+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt143);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(6+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt144);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(7+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt145);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(8+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt146);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(9+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt147);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(10+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt148);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(11+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt149);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(12+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt150);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(13+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt151);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(14+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt152);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(15+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt153);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(16+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt154);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(1+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt139);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(2+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt140);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(3+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt141);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(4+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt142);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(5+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt143);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(6+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt144);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(7+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt145);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(8+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt146);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(9+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt147);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(10+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt148);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(11+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt149);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(12+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt150);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(13+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt151);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(14+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt152);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(15+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt153);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(16+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt154);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(1+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt139);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(2+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt140);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(3+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt141);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(4+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt142);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(5+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt143);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(6+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt144);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(7+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt145);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(8+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt146);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(9+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt147);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(10+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt148);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(11+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt149);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(12+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt150);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(13+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt151);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(14+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt152);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(15+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt153);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(16+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt154);
}
}
}
} else {
ptrdiff_t k71 = 240;
ptrdiff_t l23 = (size_t)(0+k71)/6;
ptrdiff_t cut5 = (size_t)(0+k71)%6;
__m512 sum79 = _mm512_maskz_loadu_ps(65535, biasPtr5+1024*i19+4*k71);
__m512i pmMul10 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd10 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo8 = _mm512_loadu_ps(bnPtr5+(ptrdiff_t)8*(k71+256*i19));
__m512 masHi8 = _mm512_maskz_loadu_ps(65535, bnPtr5+(ptrdiff_t)8*(k71+256*i19)+(ptrdiff_t)64);
__m512 postMul14 = _mm512_permutex2var_ps(masLo8, pmMul10, masHi8);
__m512 postAdd8 = _mm512_permutex2var_ps(masLo8, pmAdd10, masHi8);
sum79 = _mm512_fmadd_ps(sum79, postMul14, postAdd8);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*0+(ptrdiff_t)0, 63>>cut5, sum79);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*0+(ptrdiff_t)1536, 4032>>cut5, sum79);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*0+(ptrdiff_t)3072, 65535-(4095>>cut5), sum79);
ptrdiff_t c12 = 0;
for (; c12 != 4; ++c12) {
__m512 wt107 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)0);
__m512 wt108 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)256);
__m512 wt109 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)512);
__m512 wt110 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)768);
__m512 wt111 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)1024);
__m512 wt112 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)1280);
__m512 wt113 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)1536);
__m512 wt114 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)1792);
__m512 wt115 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)2048);
__m512 wt116 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)2304);
__m512 wt117 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)2560);
__m512 wt118 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)2816);
__m512 wt119 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)3072);
__m512 wt120 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)3328);
__m512 wt121 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)3584);
__m512 wt122 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)3840);
__m512 tmp5349 = _mm512_unpacklo_ps(wt107, wt108);
__m512 tmp5350 = _mm512_unpackhi_ps(wt107, wt108);
__m512 tmp5351 = _mm512_unpacklo_ps(wt109, wt110);
__m512 tmp5352 = _mm512_unpackhi_ps(wt109, wt110);
__m512 tmp5353 = _mm512_unpacklo_ps(wt111, wt112);
__m512 tmp5354 = _mm512_unpackhi_ps(wt111, wt112);
__m512 tmp5355 = _mm512_unpacklo_ps(wt113, wt114);
__m512 tmp5356 = _mm512_unpackhi_ps(wt113, wt114);
__m512 tmp5357 = _mm512_unpacklo_ps(wt115, wt116);
__m512 tmp5358 = _mm512_unpackhi_ps(wt115, wt116);
__m512 tmp5359 = _mm512_unpacklo_ps(wt117, wt118);
__m512 tmp5360 = _mm512_unpackhi_ps(wt117, wt118);
__m512 tmp5361 = _mm512_unpacklo_ps(wt119, wt120);
__m512 tmp5362 = _mm512_unpackhi_ps(wt119, wt120);
__m512 tmp5363 = _mm512_unpacklo_ps(wt121, wt122);
__m512 tmp5364 = _mm512_unpackhi_ps(wt121, wt122);
__m512 tmp5365 = _mm512_shuffle_ps(tmp5349, tmp5351, 68);
__m512 tmp5366 = _mm512_shuffle_ps(tmp5349, tmp5351, 238);
__m512 tmp5367 = _mm512_shuffle_ps(tmp5350, tmp5352, 68);
__m512 tmp5368 = _mm512_shuffle_ps(tmp5350, tmp5352, 238);
__m512 tmp5369 = _mm512_shuffle_ps(tmp5353, tmp5355, 68);
__m512 tmp5370 = _mm512_shuffle_ps(tmp5353, tmp5355, 238);
__m512 tmp5371 = _mm512_shuffle_ps(tmp5354, tmp5356, 68);
__m512 tmp5372 = _mm512_shuffle_ps(tmp5354, tmp5356, 238);
__m512 tmp5373 = _mm512_shuffle_ps(tmp5357, tmp5359, 68);
__m512 tmp5374 = _mm512_shuffle_ps(tmp5357, tmp5359, 238);
__m512 tmp5375 = _mm512_shuffle_ps(tmp5358, tmp5360, 68);
__m512 tmp5376 = _mm512_shuffle_ps(tmp5358, tmp5360, 238);
__m512 tmp5377 = _mm512_shuffle_ps(tmp5361, tmp5363, 68);
__m512 tmp5378 = _mm512_shuffle_ps(tmp5361, tmp5363, 238);
__m512 tmp5379 = _mm512_shuffle_ps(tmp5362, tmp5364, 68);
__m512 tmp5380 = _mm512_shuffle_ps(tmp5362, tmp5364, 238);
__m512 tmp5381 = _mm512_shuffle_f32x4(tmp5365, tmp5369, 136);
__m512 tmp5382 = _mm512_shuffle_f32x4(tmp5365, tmp5369, 221);
__m512 tmp5383 = _mm512_shuffle_f32x4(tmp5366, tmp5370, 136);
__m512 tmp5384 = _mm512_shuffle_f32x4(tmp5366, tmp5370, 221);
__m512 tmp5385 = _mm512_shuffle_f32x4(tmp5367, tmp5371, 136);
__m512 tmp5386 = _mm512_shuffle_f32x4(tmp5367, tmp5371, 221);
__m512 tmp5387 = _mm512_shuffle_f32x4(tmp5368, tmp5372, 136);
__m512 tmp5388 = _mm512_shuffle_f32x4(tmp5368, tmp5372, 221);
__m512 tmp5389 = _mm512_shuffle_f32x4(tmp5373, tmp5377, 136);
__m512 tmp5390 = _mm512_shuffle_f32x4(tmp5373, tmp5377, 221);
__m512 tmp5391 = _mm512_shuffle_f32x4(tmp5374, tmp5378, 136);
__m512 tmp5392 = _mm512_shuffle_f32x4(tmp5374, tmp5378, 221);
__m512 tmp5393 = _mm512_shuffle_f32x4(tmp5375, tmp5379, 136);
__m512 tmp5394 = _mm512_shuffle_f32x4(tmp5375, tmp5379, 221);
__m512 tmp5395 = _mm512_shuffle_f32x4(tmp5376, tmp5380, 136);
__m512 tmp5396 = _mm512_shuffle_f32x4(tmp5376, tmp5380, 221);
wt107 = _mm512_shuffle_f32x4(tmp5381, tmp5389, 136);
wt115 = _mm512_shuffle_f32x4(tmp5381, tmp5389, 221);
wt108 = _mm512_shuffle_f32x4(tmp5383, tmp5391, 136);
wt116 = _mm512_shuffle_f32x4(tmp5383, tmp5391, 221);
wt109 = _mm512_shuffle_f32x4(tmp5385, tmp5393, 136);
wt117 = _mm512_shuffle_f32x4(tmp5385, tmp5393, 221);
wt110 = _mm512_shuffle_f32x4(tmp5387, tmp5395, 136);
wt118 = _mm512_shuffle_f32x4(tmp5387, tmp5395, 221);
wt111 = _mm512_shuffle_f32x4(tmp5382, tmp5390, 136);
wt119 = _mm512_shuffle_f32x4(tmp5382, tmp5390, 221);
wt112 = _mm512_shuffle_f32x4(tmp5384, tmp5392, 136);
wt120 = _mm512_shuffle_f32x4(tmp5384, tmp5392, 221);
wt113 = _mm512_shuffle_f32x4(tmp5386, tmp5394, 136);
wt121 = _mm512_shuffle_f32x4(tmp5386, tmp5394, 221);
wt114 = _mm512_shuffle_f32x4(tmp5388, tmp5396, 136);
wt122 = _mm512_shuffle_f32x4(tmp5388, tmp5396, 221);
wt107 = _mm512_mul_ps(wt107, postMul14);
wt108 = _mm512_mul_ps(wt108, postMul14);
wt109 = _mm512_mul_ps(wt109, postMul14);
wt110 = _mm512_mul_ps(wt110, postMul14);
wt111 = _mm512_mul_ps(wt111, postMul14);
wt112 = _mm512_mul_ps(wt112, postMul14);
wt113 = _mm512_mul_ps(wt113, postMul14);
wt114 = _mm512_mul_ps(wt114, postMul14);
wt115 = _mm512_mul_ps(wt115, postMul14);
wt116 = _mm512_mul_ps(wt116, postMul14);
wt117 = _mm512_mul_ps(wt117, postMul14);
wt118 = _mm512_mul_ps(wt118, postMul14);
wt119 = _mm512_mul_ps(wt119, postMul14);
wt120 = _mm512_mul_ps(wt120, postMul14);
wt121 = _mm512_mul_ps(wt121, postMul14);
wt122 = _mm512_mul_ps(wt122, postMul14);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(1+16*c12)+(ptrdiff_t)0, 63>>cut5, wt107);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(2+16*c12)+(ptrdiff_t)0, 63>>cut5, wt108);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(3+16*c12)+(ptrdiff_t)0, 63>>cut5, wt109);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(4+16*c12)+(ptrdiff_t)0, 63>>cut5, wt110);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(5+16*c12)+(ptrdiff_t)0, 63>>cut5, wt111);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(6+16*c12)+(ptrdiff_t)0, 63>>cut5, wt112);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(7+16*c12)+(ptrdiff_t)0, 63>>cut5, wt113);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(8+16*c12)+(ptrdiff_t)0, 63>>cut5, wt114);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(9+16*c12)+(ptrdiff_t)0, 63>>cut5, wt115);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(10+16*c12)+(ptrdiff_t)0, 63>>cut5, wt116);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(11+16*c12)+(ptrdiff_t)0, 63>>cut5, wt117);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(12+16*c12)+(ptrdiff_t)0, 63>>cut5, wt118);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(13+16*c12)+(ptrdiff_t)0, 63>>cut5, wt119);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(14+16*c12)+(ptrdiff_t)0, 63>>cut5, wt120);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(15+16*c12)+(ptrdiff_t)0, 63>>cut5, wt121);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(16+16*c12)+(ptrdiff_t)0, 63>>cut5, wt122);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(1+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt107);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(2+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt108);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(3+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt109);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(4+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt110);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(5+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt111);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(6+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt112);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(7+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt113);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(8+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt114);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(9+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt115);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(10+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt116);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(11+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt117);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(12+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt118);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(13+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt119);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(14+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt120);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(15+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt121);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(16+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt122);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(1+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt107);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(2+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt108);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(3+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt109);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(4+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt110);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(5+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt111);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(6+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt112);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(7+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt113);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(8+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt114);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(9+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt115);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(10+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt116);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(11+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt117);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(12+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt118);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(13+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt119);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(14+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt120);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(15+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt121);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(16+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt122);
}
}
}
}
}

// Launcher for ResNet50OneArrangeWts2Callee1.
// Fills in a task descriptor (callee, opaque tensor-pointer array, and a
// three-dimensional hull of 2 x 1 x 1) and passes it to ResNet50ThreaderDo1
// together with the caller's team.
// NOTE(review): presumably ResNet50ThreaderDo1 invokes the callee once per
// point of the hull -- confirm against the threader implementation.
static void ResNet50OneArrangeWts2(ResNet50ThreaderTeam1* team26, char** tensors25) {
    ResNet50ThreaderTask1 job;
    // Hull extents, one entry per dimension counted by nd1.
    job.nd1 = 3;
    job.hull1[2] = 1;
    job.hull1[1] = 1;
    job.hull1[0] = 2;
    // The tensor array is forwarded untouched through the opaque any1 slot.
    job.any1 = tensors25;
    job.callee1 = ResNet50OneArrangeWts2Callee1;
    ResNet50ThreaderDo1(team26, &job);
}

// Task callee that copies input data into the "arranged" buffer.
//
// task30->any1 is read as an array of byte pointers: entry 0 is the source,
// entry 1 is the destination. pt20[1] selects which tiles this call handles:
// it starts at tile 2*pt20[1] and covers two tiles when pt20[1] < 23,
// otherwise three, never going past tile 48.
//
// For every (tile, channel) pair, with channel in 0..63, it moves 64 floats
// (four 16-float vectors). The source is read with a channel stride of 12608
// bytes and a tile stride of 256 bytes; the destination is written with a
// tile stride of 16384 bytes and a channel stride of 256 bytes, so the 64
// channels of one tile end up contiguous.
static void ResNet50OneArrangeDats2Callee1(ResNet50ThreaderTask1* task30, int64_t* pt20) {
    char** bufs = task30->any1;
    const ptrdiff_t slot = pt20[1];
    // The generator's constant base offsets are all multiplied by zero, so
    // the bases are just the raw tensor pointers.
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];
    const ptrdiff_t batchCount = 1;
    for (ptrdiff_t batch = 0; batch < batchCount; ++batch) {
        const ptrdiff_t firstTile = 2*slot;
        const ptrdiff_t lastTile = firstTile+(slot < 23 ? 1 : 2);
        for (ptrdiff_t tile = firstTile; tile != 49; ++tile) {
            for (ptrdiff_t chan = 0; chan < 64; ++chan) {
                const char* from = src+806912*batch+256*tile+12608*chan;
                char* to = dst+802816*batch+16384*tile+256*chan;
                // Load the whole 64-float row first, then write it out.
                __m512 v0 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)0);
                __m512 v1 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)64);
                __m512 v2 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)128);
                __m512 v3 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)192);
                _mm512_mask_storeu_ps(to+(ptrdiff_t)0, 65535, v0);
                _mm512_mask_storeu_ps(to+(ptrdiff_t)64, 65535, v1);
                _mm512_mask_storeu_ps(to+(ptrdiff_t)128, 65535, v2);
                _mm512_mask_storeu_ps(to+(ptrdiff_t)192, 65535, v3);
            }
            // The last tile assigned to this slot has been copied.
            if (tile >= lastTile) break;
        }
    }
}

// Launcher for ResNet50OneArrangeDats2Callee1.
// Builds a task descriptor with a four-dimensional hull of 1 x 24 x 1 x 1 and
// the caller's tensor array, then submits it to ResNet50ThreaderDo1 on the
// given team. The callee reads coordinate index 1, the dimension of extent 24.
// NOTE(review): presumably the threader calls the callee once per hull
// point -- confirm against the threader implementation.
static void ResNet50OneArrangeDats2(ResNet50ThreaderTeam1* team27, char** tensors27) {
    ResNet50ThreaderTask1 job;
    job.nd1 = 4;
    job.hull1[3] = 1;
    job.hull1[2] = 1;
    job.hull1[1] = 24;
    job.hull1[0] = 1;
    // The tensor array travels through the opaque any1 slot unchanged.
    job.any1 = tensors27;
    job.callee1 = ResNet50OneArrangeDats2Callee1;
    ResNet50ThreaderDo1(team27, &job);
}

// Task callee: produces one block of output for the task point in pt21.
//
// task32->any1 is read as a pointer pair whose first entry is an array of
// byte pointers: [0] arranged weights, [1] arranged data, [2] a tensor that
// is added to the result, [3] the output tensor. pt21[0] (w34) selects a
// range of weight groups and pt21[1] (d6) selects one 64-float data tile.
//
// For each output row the kernel seeds four 16-float accumulators from a
// value stored at the head of the weight group, accumulates weight*data over
// 64 data slices with fused multiply-adds, adds the matching 64 floats from
// tensors[2], clamps at zero with max(0, x), and stores to tensors[3].
// NOTE(review): presumably this is a fused 1x1 convolution + residual Add +
// ReLU with the batch-norm scale/shift already folded into the arranged
// weights -- confirm against the weight-arrangement routine and the graph.
static void ResNet50OneApply2Callee1(ResNet50ThreaderTask1* task32, int64_t* pt21) {
void** pair6 = task32->any1;
char** tensors30 = pair6[0];
// e10 and g11 are fixed at zero, so the base offsets below all vanish.
ptrdiff_t e10 = 0;
ptrdiff_t g11 = 0;
ptrdiff_t d6 = pt21[1];
ptrdiff_t w34 = pt21[0];
char*restrict arrangedWts2 = tensors30[0]+856064*e10+(ptrdiff_t)66560*1*g11;
char*restrict arrangedDats2 = tensors30[1]+10474240*e10+(ptrdiff_t)802816*1*g11;
char*restrict datPtr8 = tensors30[2]+(ptrdiff_t)3227648*1*g11;
char*restrict datPtr9 = tensors30[3]+(ptrdiff_t)3227648*1*g11;
ptrdiff_t ii6 = 1;
for (ptrdiff_t i21 = 0; i21 < ii6; ++i21) {
// jj25 equals the starting j16, so the check at the bottom of this loop
// returns after the first pass: exactly one data tile is handled per call.
ptrdiff_t j16 = 1*d6;
ptrdiff_t jj25 = j16+0;
for (; j16 != 49; ++j16) {
// Each k74 is a group of six output rows (1560 bytes of arranged weights).
// For w34 < 4 the call covers groups 8*w34 .. 8*w34+7 and returns from
// inside the loop. Otherwise the loop runs until k74 reaches 42 and falls
// through to the four-row tail after the loop.
ptrdiff_t k74 = 8*w34;
ptrdiff_t kk28 = k74+(w34 < 4 ? 7 : 10);
for (; k74 != 42; ++k74) {
// With s14 == -1 the offsets 24*s14+24.. resolve to bytes 0..20 of the
// group: six floats that seed the six rows' accumulators.
// NOTE(review): presumably the batch-norm-adjusted bias written by the
// arrange routine -- confirm.
ptrdiff_t s14 = -1;
__m512 sum82 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+24*s14+(ptrdiff_t)24));
__m512 sum86 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+24*s14+(ptrdiff_t)28));
__m512 sum90 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+24*s14+(ptrdiff_t)32));
__m512 sum94 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+24*s14+(ptrdiff_t)36));
__m512 sum98 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+24*s14+(ptrdiff_t)40));
__m512 sum102 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+24*s14+(ptrdiff_t)44));
// Every row gets four accumulators, one per 16-float vector of the tile.
__m512 sum83 = sum82;
__m512 sum84 = sum82;
__m512 sum85 = sum82;
__m512 sum87 = sum86;
__m512 sum88 = sum86;
__m512 sum89 = sum86;
__m512 sum91 = sum90;
__m512 sum92 = sum90;
__m512 sum93 = sum90;
__m512 sum95 = sum94;
__m512 sum96 = sum94;
__m512 sum97 = sum94;
__m512 sum99 = sum98;
__m512 sum100 = sum98;
__m512 sum101 = sum98;
__m512 sum103 = sum102;
__m512 sum104 = sum102;
__m512 sum105 = sum102;
// Reduction over 64 data slices: load the slice's four vectors once, then
// broadcast each of the six per-row weights and fuse-multiply-add.
for (s14 = 0; s14 < 64; ++s14) {
__m512 dat1275 = _mm512_loadu_ps(arrangedDats2+802816*i21+16384*j16+256*s14+(ptrdiff_t)0);
__m512 dat1276 = _mm512_loadu_ps(arrangedDats2+802816*i21+16384*j16+256*s14+(ptrdiff_t)64);
__m512 dat1277 = _mm512_loadu_ps(arrangedDats2+802816*i21+16384*j16+256*s14+(ptrdiff_t)128);
__m512 dat1278 = _mm512_loadu_ps(arrangedDats2+802816*i21+16384*j16+256*s14+(ptrdiff_t)192);
__m512 wt155 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+24*s14+(ptrdiff_t)24));
sum82 = _mm512_fmadd_ps(wt155, dat1275, sum82);
sum83 = _mm512_fmadd_ps(wt155, dat1276, sum83);
sum84 = _mm512_fmadd_ps(wt155, dat1277, sum84);
sum85 = _mm512_fmadd_ps(wt155, dat1278, sum85);
__m512 wt156 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+24*s14+(ptrdiff_t)28));
sum86 = _mm512_fmadd_ps(wt156, dat1275, sum86);
sum87 = _mm512_fmadd_ps(wt156, dat1276, sum87);
sum88 = _mm512_fmadd_ps(wt156, dat1277, sum88);
sum89 = _mm512_fmadd_ps(wt156, dat1278, sum89);
__m512 wt157 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+24*s14+(ptrdiff_t)32));
sum90 = _mm512_fmadd_ps(wt157, dat1275, sum90);
sum91 = _mm512_fmadd_ps(wt157, dat1276, sum91);
sum92 = _mm512_fmadd_ps(wt157, dat1277, sum92);
sum93 = _mm512_fmadd_ps(wt157, dat1278, sum93);
__m512 wt158 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+24*s14+(ptrdiff_t)36));
sum94 = _mm512_fmadd_ps(wt158, dat1275, sum94);
sum95 = _mm512_fmadd_ps(wt158, dat1276, sum95);
sum96 = _mm512_fmadd_ps(wt158, dat1277, sum96);
sum97 = _mm512_fmadd_ps(wt158, dat1278, sum97);
__m512 wt159 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+24*s14+(ptrdiff_t)40));
sum98 = _mm512_fmadd_ps(wt159, dat1275, sum98);
sum99 = _mm512_fmadd_ps(wt159, dat1276, sum99);
sum100 = _mm512_fmadd_ps(wt159, dat1277, sum100);
sum101 = _mm512_fmadd_ps(wt159, dat1278, sum101);
__m512 wt160 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+24*s14+(ptrdiff_t)44));
sum102 = _mm512_fmadd_ps(wt160, dat1275, sum102);
sum103 = _mm512_fmadd_ps(wt160, dat1276, sum103);
sum104 = _mm512_fmadd_ps(wt160, dat1277, sum104);
sum105 = _mm512_fmadd_ps(wt160, dat1278, sum105);
}
// Epilogue, one row at a time (rows are 12608 bytes apart): add the values
// read from datPtr8, clamp at zero, store to the same offset in datPtr9.
// Row 0.
sum82 = _mm512_add_ps(sum82, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)0));
sum83 = _mm512_add_ps(sum83, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)64));
sum84 = _mm512_add_ps(sum84, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)128));
sum85 = _mm512_add_ps(sum85, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)192));
sum82 = _mm512_max_ps(_mm512_setzero_ps(), sum82);
sum83 = _mm512_max_ps(_mm512_setzero_ps(), sum83);
sum84 = _mm512_max_ps(_mm512_setzero_ps(), sum84);
sum85 = _mm512_max_ps(_mm512_setzero_ps(), sum85);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)0, 65535, sum82);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)64, 65535, sum83);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)128, 65535, sum84);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)192, 65535, sum85);
// Row 1.
sum86 = _mm512_add_ps(sum86, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)12608));
sum87 = _mm512_add_ps(sum87, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)12672));
sum88 = _mm512_add_ps(sum88, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)12736));
sum89 = _mm512_add_ps(sum89, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)12800));
sum86 = _mm512_max_ps(_mm512_setzero_ps(), sum86);
sum87 = _mm512_max_ps(_mm512_setzero_ps(), sum87);
sum88 = _mm512_max_ps(_mm512_setzero_ps(), sum88);
sum89 = _mm512_max_ps(_mm512_setzero_ps(), sum89);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)12608, 65535, sum86);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)12672, 65535, sum87);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)12736, 65535, sum88);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)12800, 65535, sum89);
// Row 2.
sum90 = _mm512_add_ps(sum90, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)25216));
sum91 = _mm512_add_ps(sum91, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)25280));
sum92 = _mm512_add_ps(sum92, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)25344));
sum93 = _mm512_add_ps(sum93, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)25408));
sum90 = _mm512_max_ps(_mm512_setzero_ps(), sum90);
sum91 = _mm512_max_ps(_mm512_setzero_ps(), sum91);
sum92 = _mm512_max_ps(_mm512_setzero_ps(), sum92);
sum93 = _mm512_max_ps(_mm512_setzero_ps(), sum93);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)25216, 65535, sum90);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)25280, 65535, sum91);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)25344, 65535, sum92);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)25408, 65535, sum93);
// Row 3.
sum94 = _mm512_add_ps(sum94, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)37824));
sum95 = _mm512_add_ps(sum95, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)37888));
sum96 = _mm512_add_ps(sum96, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)37952));
sum97 = _mm512_add_ps(sum97, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)38016));
sum94 = _mm512_max_ps(_mm512_setzero_ps(), sum94);
sum95 = _mm512_max_ps(_mm512_setzero_ps(), sum95);
sum96 = _mm512_max_ps(_mm512_setzero_ps(), sum96);
sum97 = _mm512_max_ps(_mm512_setzero_ps(), sum97);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)37824, 65535, sum94);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)37888, 65535, sum95);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)37952, 65535, sum96);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)38016, 65535, sum97);
// Row 4.
sum98 = _mm512_add_ps(sum98, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)50432));
sum99 = _mm512_add_ps(sum99, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)50496));
sum100 = _mm512_add_ps(sum100, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)50560));
sum101 = _mm512_add_ps(sum101, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)50624));
sum98 = _mm512_max_ps(_mm512_setzero_ps(), sum98);
sum99 = _mm512_max_ps(_mm512_setzero_ps(), sum99);
sum100 = _mm512_max_ps(_mm512_setzero_ps(), sum100);
sum101 = _mm512_max_ps(_mm512_setzero_ps(), sum101);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)50432, 65535, sum98);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)50496, 65535, sum99);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)50560, 65535, sum100);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)50624, 65535, sum101);
// Row 5.
sum102 = _mm512_add_ps(sum102, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)63040));
sum103 = _mm512_add_ps(sum103, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)63104));
sum104 = _mm512_add_ps(sum104, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)63168));
sum105 = _mm512_add_ps(sum105, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)63232));
sum102 = _mm512_max_ps(_mm512_setzero_ps(), sum102);
sum103 = _mm512_max_ps(_mm512_setzero_ps(), sum103);
sum104 = _mm512_max_ps(_mm512_setzero_ps(), sum104);
sum105 = _mm512_max_ps(_mm512_setzero_ps(), sum105);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)63040, 65535, sum102);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)63104, 65535, sum103);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)63168, 65535, sum104);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)63232, 65535, sum105);
// Last full group assigned to this task point: the whole call is finished.
if (k74 >= kk28) return;
}
// Tail group, reached only when the loop above ran out at k74 == 42. It
// holds four rows instead of six, so the per-slice weight stride is 16
// bytes and only offsets 0..12 seed the accumulators.
ptrdiff_t s15 = -1;
__m512 sum106 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+16*s15+(ptrdiff_t)16));
__m512 sum110 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+16*s15+(ptrdiff_t)20));
__m512 sum114 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+16*s15+(ptrdiff_t)24));
__m512 sum118 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+16*s15+(ptrdiff_t)28));
__m512 sum107 = sum106;
__m512 sum108 = sum106;
__m512 sum109 = sum106;
__m512 sum111 = sum110;
__m512 sum112 = sum110;
__m512 sum113 = sum110;
__m512 sum115 = sum114;
__m512 sum116 = sum114;
__m512 sum117 = sum114;
__m512 sum119 = sum118;
__m512 sum120 = sum118;
__m512 sum121 = sum118;
// Same reduction over 64 data slices, with four weights per slice.
for (s15 = 0; s15 < 64; ++s15) {
__m512 dat1279 = _mm512_loadu_ps(arrangedDats2+802816*i21+16384*j16+256*s15+(ptrdiff_t)0);
__m512 dat1280 = _mm512_loadu_ps(arrangedDats2+802816*i21+16384*j16+256*s15+(ptrdiff_t)64);
__m512 dat1281 = _mm512_loadu_ps(arrangedDats2+802816*i21+16384*j16+256*s15+(ptrdiff_t)128);
__m512 dat1282 = _mm512_loadu_ps(arrangedDats2+802816*i21+16384*j16+256*s15+(ptrdiff_t)192);
__m512 wt161 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+16*s15+(ptrdiff_t)16));
sum106 = _mm512_fmadd_ps(wt161, dat1279, sum106);
sum107 = _mm512_fmadd_ps(wt161, dat1280, sum107);
sum108 = _mm512_fmadd_ps(wt161, dat1281, sum108);
sum109 = _mm512_fmadd_ps(wt161, dat1282, sum109);
__m512 wt162 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+16*s15+(ptrdiff_t)20));
sum110 = _mm512_fmadd_ps(wt162, dat1279, sum110);
sum111 = _mm512_fmadd_ps(wt162, dat1280, sum111);
sum112 = _mm512_fmadd_ps(wt162, dat1281, sum112);
sum113 = _mm512_fmadd_ps(wt162, dat1282, sum113);
__m512 wt163 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+16*s15+(ptrdiff_t)24));
sum114 = _mm512_fmadd_ps(wt163, dat1279, sum114);
sum115 = _mm512_fmadd_ps(wt163, dat1280, sum115);
sum116 = _mm512_fmadd_ps(wt163, dat1281, sum116);
sum117 = _mm512_fmadd_ps(wt163, dat1282, sum117);
__m512 wt164 = _mm512_set1_ps(*(float*)(arrangedWts2+66560*i21+1560*k74+16*s15+(ptrdiff_t)28));
sum118 = _mm512_fmadd_ps(wt164, dat1279, sum118);
sum119 = _mm512_fmadd_ps(wt164, dat1280, sum119);
sum120 = _mm512_fmadd_ps(wt164, dat1281, sum120);
sum121 = _mm512_fmadd_ps(wt164, dat1282, sum121);
}
// Tail epilogue: the same add / clamp / store sequence for the four rows.
// Row 0.
sum106 = _mm512_add_ps(sum106, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)0));
sum107 = _mm512_add_ps(sum107, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)64));
sum108 = _mm512_add_ps(sum108, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)128));
sum109 = _mm512_add_ps(sum109, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)192));
sum106 = _mm512_max_ps(_mm512_setzero_ps(), sum106);
sum107 = _mm512_max_ps(_mm512_setzero_ps(), sum107);
sum108 = _mm512_max_ps(_mm512_setzero_ps(), sum108);
sum109 = _mm512_max_ps(_mm512_setzero_ps(), sum109);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)0, 65535, sum106);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)64, 65535, sum107);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)128, 65535, sum108);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)192, 65535, sum109);
// Row 1.
sum110 = _mm512_add_ps(sum110, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)12608));
sum111 = _mm512_add_ps(sum111, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)12672));
sum112 = _mm512_add_ps(sum112, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)12736));
sum113 = _mm512_add_ps(sum113, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)12800));
sum110 = _mm512_max_ps(_mm512_setzero_ps(), sum110);
sum111 = _mm512_max_ps(_mm512_setzero_ps(), sum111);
sum112 = _mm512_max_ps(_mm512_setzero_ps(), sum112);
sum113 = _mm512_max_ps(_mm512_setzero_ps(), sum113);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)12608, 65535, sum110);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)12672, 65535, sum111);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)12736, 65535, sum112);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)12800, 65535, sum113);
// Row 2.
sum114 = _mm512_add_ps(sum114, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)25216));
sum115 = _mm512_add_ps(sum115, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)25280));
sum116 = _mm512_add_ps(sum116, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)25344));
sum117 = _mm512_add_ps(sum117, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)25408));
sum114 = _mm512_max_ps(_mm512_setzero_ps(), sum114);
sum115 = _mm512_max_ps(_mm512_setzero_ps(), sum115);
sum116 = _mm512_max_ps(_mm512_setzero_ps(), sum116);
sum117 = _mm512_max_ps(_mm512_setzero_ps(), sum117);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)25216, 65535, sum114);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)25280, 65535, sum115);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)25344, 65535, sum116);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)25408, 65535, sum117);
// Row 3.
sum118 = _mm512_add_ps(sum118, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)37824));
sum119 = _mm512_add_ps(sum119, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)37888));
sum120 = _mm512_add_ps(sum120, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)37952));
sum121 = _mm512_add_ps(sum121, _mm512_maskz_loadu_ps(65535, datPtr8+3227648*i21+256*j16+75648*k74+(ptrdiff_t)38016));
sum118 = _mm512_max_ps(_mm512_setzero_ps(), sum118);
sum119 = _mm512_max_ps(_mm512_setzero_ps(), sum119);
sum120 = _mm512_max_ps(_mm512_setzero_ps(), sum120);
sum121 = _mm512_max_ps(_mm512_setzero_ps(), sum121);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)37824, 65535, sum118);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)37888, 65535, sum119);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)37952, 65535, sum120);
_mm512_mask_storeu_ps(datPtr9+3227648*i21+256*j16+75648*k74+(ptrdiff_t)38016, 65535, sum121);
// Always true on the first pass (jj25 == starting j16): one tile per call.
if (j16 >= jj25) return;
}
}
}

/*
 * Dispatch the "OneApply2" stage onto a threader team.
 *
 * Builds a task descriptor whose callee is ResNet50OneApply2Callee1 and whose
 * iteration hull is three-dimensional with extents 5 x 49 x 1, then submits it
 * through ResNet50ThreaderDo1.
 *
 * team28    - threader team that receives the task.
 * tensors29 - tensor pointer table; forwarded to the callee as the first
 *             element of a two-slot context array.
 */
static void ResNet50OneApply2(ResNet50ThreaderTeam1* team28, char** tensors29) {
	/* Extent of each hull axis, in axis order. */
	static const int extents[3] = {5, 49, 1};
	void* ctx[2];
	ResNet50ThreaderTask1 job;
	int axis;

	/* Slot 0 carries the tensor table; slot 1 is a null/zero placeholder. */
	/* NOTE(review): the meaning of slot 1 is defined by the callee - confirm there. */
	ctx[0] = tensors29;
	ctx[1] = 0;

	job.nd1 = 3;
	for (axis = 0; axis < 3; ++axis) {
		job.hull1[axis] = extents[axis];
	}

	/* NOTE(review): ctx and job live on this stack frame, so ResNet50ThreaderDo1
	   presumably finishes using them before it returns - confirm. */
	job.any1 = ctx;
	job.callee1 = ResNet50OneApply2Callee1;
	ResNet50ThreaderDo1(team28, &job);
}

static void ResNet50OneArrangeWts3Callee1(ResNet50ThreaderTask1* task34, int64_t* pt22) {
char** tensors32 = task34->any1;
ptrdiff_t b49 = pt22[0];
char*restrict wtPtr6 = tensors32[0]+(ptrdiff_t)3340*0+(ptrdiff_t)65536*0;
char*restrict biasPtr6 = tensors32[1]+(ptrdiff_t)256*0;
char*restrict bnPtr6 = tensors32[2]+(ptrdiff_t)8*64*0;
char*restrict arranged5 = tensors32[3]+(ptrdiff_t)214016*0+(ptrdiff_t)65792*0;
ptrdiff_t ii7 = 1;
for (ptrdiff_t i22 = 0; i22 < ii7; ++i22) {
ptrdiff_t j17 = 2*b49;
ptrdiff_t jj26 = j17+2;
for (; j17 < jj26; ++j17) {
if (j17 < 3) {
ptrdiff_t k76 = 0+16*(j17-0);
ptrdiff_t l26 = (size_t)(0+k76)/6;
ptrdiff_t cut8 = (size_t)(0+k76)%6;
switch (cut8) {
case 0:;
case 2: {
__m512 sum123 = _mm512_maskz_loadu_ps(65535, biasPtr6+256*i22+4*k76);
__m512i pmMul11 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd11 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo9 = _mm512_loadu_ps(bnPtr6+(ptrdiff_t)8*(k76+64*i22));
__m512 masHi9 = _mm512_maskz_loadu_ps(65535, bnPtr6+(ptrdiff_t)8*(k76+64*i22)+(ptrdiff_t)64);
__m512 postMul18 = _mm512_permutex2var_ps(masLo9, pmMul11, masHi9);
__m512 postAdd12 = _mm512_permutex2var_ps(masLo9, pmAdd11, masHi9);
sum123 = _mm512_fmadd_ps(sum123, postMul18, postAdd12);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*0+(ptrdiff_t)0, 63>>cut8, sum123);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*0+(ptrdiff_t)6144, 4032>>cut8, sum123);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*0+(ptrdiff_t)12288, 65535-(4095>>cut8), sum123);
ptrdiff_t c17 = 0;
for (; c17 != 16; ++c17) {
__m512 wt181 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)0);
__m512 wt182 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)1024);
__m512 wt183 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)2048);
__m512 wt184 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)3072);
__m512 wt185 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)4096);
__m512 wt186 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)5120);
__m512 wt187 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)6144);
__m512 wt188 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)7168);
__m512 wt189 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)8192);
__m512 wt190 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)9216);
__m512 wt191 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)10240);
__m512 wt192 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)11264);
__m512 wt193 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)12288);
__m512 wt194 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)13312);
__m512 wt195 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)14336);
__m512 wt196 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)15360);
__m512 tmp5397 = _mm512_unpacklo_ps(wt181, wt182);
__m512 tmp5398 = _mm512_unpackhi_ps(wt181, wt182);
__m512 tmp5399 = _mm512_unpacklo_ps(wt183, wt184);
__m512 tmp5400 = _mm512_unpackhi_ps(wt183, wt184);
__m512 tmp5401 = _mm512_unpacklo_ps(wt185, wt186);
__m512 tmp5402 = _mm512_unpackhi_ps(wt185, wt186);
__m512 tmp5403 = _mm512_unpacklo_ps(wt187, wt188);
__m512 tmp5404 = _mm512_unpackhi_ps(wt187, wt188);
__m512 tmp5405 = _mm512_unpacklo_ps(wt189, wt190);
__m512 tmp5406 = _mm512_unpackhi_ps(wt189, wt190);
__m512 tmp5407 = _mm512_unpacklo_ps(wt191, wt192);
__m512 tmp5408 = _mm512_unpackhi_ps(wt191, wt192);
__m512 tmp5409 = _mm512_unpacklo_ps(wt193, wt194);
__m512 tmp5410 = _mm512_unpackhi_ps(wt193, wt194);
__m512 tmp5411 = _mm512_unpacklo_ps(wt195, wt196);
__m512 tmp5412 = _mm512_unpackhi_ps(wt195, wt196);
__m512 tmp5413 = _mm512_shuffle_ps(tmp5397, tmp5399, 68);
__m512 tmp5414 = _mm512_shuffle_ps(tmp5397, tmp5399, 238);
__m512 tmp5415 = _mm512_shuffle_ps(tmp5398, tmp5400, 68);
__m512 tmp5416 = _mm512_shuffle_ps(tmp5398, tmp5400, 238);
__m512 tmp5417 = _mm512_shuffle_ps(tmp5401, tmp5403, 68);
__m512 tmp5418 = _mm512_shuffle_ps(tmp5401, tmp5403, 238);
__m512 tmp5419 = _mm512_shuffle_ps(tmp5402, tmp5404, 68);
__m512 tmp5420 = _mm512_shuffle_ps(tmp5402, tmp5404, 238);
__m512 tmp5421 = _mm512_shuffle_ps(tmp5405, tmp5407, 68);
__m512 tmp5422 = _mm512_shuffle_ps(tmp5405, tmp5407, 238);
__m512 tmp5423 = _mm512_shuffle_ps(tmp5406, tmp5408, 68);
__m512 tmp5424 = _mm512_shuffle_ps(tmp5406, tmp5408, 238);
__m512 tmp5425 = _mm512_shuffle_ps(tmp5409, tmp5411, 68);
__m512 tmp5426 = _mm512_shuffle_ps(tmp5409, tmp5411, 238);
__m512 tmp5427 = _mm512_shuffle_ps(tmp5410, tmp5412, 68);
__m512 tmp5428 = _mm512_shuffle_ps(tmp5410, tmp5412, 238);
__m512 tmp5429 = _mm512_shuffle_f32x4(tmp5413, tmp5417, 136);
__m512 tmp5430 = _mm512_shuffle_f32x4(tmp5413, tmp5417, 221);
__m512 tmp5431 = _mm512_shuffle_f32x4(tmp5414, tmp5418, 136);
__m512 tmp5432 = _mm512_shuffle_f32x4(tmp5414, tmp5418, 221);
__m512 tmp5433 = _mm512_shuffle_f32x4(tmp5415, tmp5419, 136);
__m512 tmp5434 = _mm512_shuffle_f32x4(tmp5415, tmp5419, 221);
__m512 tmp5435 = _mm512_shuffle_f32x4(tmp5416, tmp5420, 136);
__m512 tmp5436 = _mm512_shuffle_f32x4(tmp5416, tmp5420, 221);
__m512 tmp5437 = _mm512_shuffle_f32x4(tmp5421, tmp5425, 136);
__m512 tmp5438 = _mm512_shuffle_f32x4(tmp5421, tmp5425, 221);
__m512 tmp5439 = _mm512_shuffle_f32x4(tmp5422, tmp5426, 136);
__m512 tmp5440 = _mm512_shuffle_f32x4(tmp5422, tmp5426, 221);
__m512 tmp5441 = _mm512_shuffle_f32x4(tmp5423, tmp5427, 136);
__m512 tmp5442 = _mm512_shuffle_f32x4(tmp5423, tmp5427, 221);
__m512 tmp5443 = _mm512_shuffle_f32x4(tmp5424, tmp5428, 136);
__m512 tmp5444 = _mm512_shuffle_f32x4(tmp5424, tmp5428, 221);
wt181 = _mm512_shuffle_f32x4(tmp5429, tmp5437, 136);
wt189 = _mm512_shuffle_f32x4(tmp5429, tmp5437, 221);
wt182 = _mm512_shuffle_f32x4(tmp5431, tmp5439, 136);
wt190 = _mm512_shuffle_f32x4(tmp5431, tmp5439, 221);
wt183 = _mm512_shuffle_f32x4(tmp5433, tmp5441, 136);
wt191 = _mm512_shuffle_f32x4(tmp5433, tmp5441, 221);
wt184 = _mm512_shuffle_f32x4(tmp5435, tmp5443, 136);
wt192 = _mm512_shuffle_f32x4(tmp5435, tmp5443, 221);
wt185 = _mm512_shuffle_f32x4(tmp5430, tmp5438, 136);
wt193 = _mm512_shuffle_f32x4(tmp5430, tmp5438, 221);
wt186 = _mm512_shuffle_f32x4(tmp5432, tmp5440, 136);
wt194 = _mm512_shuffle_f32x4(tmp5432, tmp5440, 221);
wt187 = _mm512_shuffle_f32x4(tmp5434, tmp5442, 136);
wt195 = _mm512_shuffle_f32x4(tmp5434, tmp5442, 221);
wt188 = _mm512_shuffle_f32x4(tmp5436, tmp5444, 136);
wt196 = _mm512_shuffle_f32x4(tmp5436, tmp5444, 221);
wt181 = _mm512_mul_ps(wt181, postMul18);
wt182 = _mm512_mul_ps(wt182, postMul18);
wt183 = _mm512_mul_ps(wt183, postMul18);
wt184 = _mm512_mul_ps(wt184, postMul18);
wt185 = _mm512_mul_ps(wt185, postMul18);
wt186 = _mm512_mul_ps(wt186, postMul18);
wt187 = _mm512_mul_ps(wt187, postMul18);
wt188 = _mm512_mul_ps(wt188, postMul18);
wt189 = _mm512_mul_ps(wt189, postMul18);
wt190 = _mm512_mul_ps(wt190, postMul18);
wt191 = _mm512_mul_ps(wt191, postMul18);
wt192 = _mm512_mul_ps(wt192, postMul18);
wt193 = _mm512_mul_ps(wt193, postMul18);
wt194 = _mm512_mul_ps(wt194, postMul18);
wt195 = _mm512_mul_ps(wt195, postMul18);
wt196 = _mm512_mul_ps(wt196, postMul18);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(1+16*c17)+(ptrdiff_t)0, 63>>cut8, wt181);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(2+16*c17)+(ptrdiff_t)0, 63>>cut8, wt182);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(3+16*c17)+(ptrdiff_t)0, 63>>cut8, wt183);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(4+16*c17)+(ptrdiff_t)0, 63>>cut8, wt184);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(5+16*c17)+(ptrdiff_t)0, 63>>cut8, wt185);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(6+16*c17)+(ptrdiff_t)0, 63>>cut8, wt186);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(7+16*c17)+(ptrdiff_t)0, 63>>cut8, wt187);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(8+16*c17)+(ptrdiff_t)0, 63>>cut8, wt188);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(9+16*c17)+(ptrdiff_t)0, 63>>cut8, wt189);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(10+16*c17)+(ptrdiff_t)0, 63>>cut8, wt190);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(11+16*c17)+(ptrdiff_t)0, 63>>cut8, wt191);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(12+16*c17)+(ptrdiff_t)0, 63>>cut8, wt192);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(13+16*c17)+(ptrdiff_t)0, 63>>cut8, wt193);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(14+16*c17)+(ptrdiff_t)0, 63>>cut8, wt194);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(15+16*c17)+(ptrdiff_t)0, 63>>cut8, wt195);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(16+16*c17)+(ptrdiff_t)0, 63>>cut8, wt196);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(1+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt181);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(2+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt182);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(3+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt183);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(4+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt184);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(5+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt185);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(6+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt186);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(7+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt187);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(8+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt188);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(9+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt189);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(10+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt190);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(11+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt191);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(12+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt192);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(13+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt193);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(14+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt194);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(15+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt195);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(16+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt196);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(1+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt181);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(2+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt182);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(3+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt183);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(4+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt184);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(5+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt185);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(6+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt186);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(7+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt187);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(8+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt188);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(9+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt189);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(10+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt190);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(11+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt191);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(12+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt192);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(13+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt193);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(14+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt194);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(15+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt195);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(16+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt196);
}
break;
}
default: {
cut8 = 4;
__m512 sum124 = _mm512_maskz_loadu_ps(65535, biasPtr6+256*i22+4*k76);
__m512i pmMul12 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd12 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo10 = _mm512_loadu_ps(bnPtr6+(ptrdiff_t)8*(k76+64*i22));
__m512 masHi10 = _mm512_maskz_loadu_ps(65535, bnPtr6+(ptrdiff_t)8*(k76+64*i22)+(ptrdiff_t)64);
__m512 postMul19 = _mm512_permutex2var_ps(masLo10, pmMul12, masHi10);
__m512 postAdd13 = _mm512_permutex2var_ps(masLo10, pmAdd12, masHi10);
sum124 = _mm512_fmadd_ps(sum124, postMul19, postAdd13);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*0+(ptrdiff_t)0, 63>>cut8, sum124);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*0+(ptrdiff_t)6144, 4032>>cut8, sum124);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*0+(ptrdiff_t)12288, 258048>>cut8, sum124);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*0+(ptrdiff_t)18432, 65535-(262143>>cut8), sum124);
ptrdiff_t c18 = 0;
for (; c18 != 16; ++c18) {
__m512 wt197 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)0);
__m512 wt198 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)1024);
__m512 wt199 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)2048);
__m512 wt200 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)3072);
__m512 wt201 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)4096);
__m512 wt202 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)5120);
__m512 wt203 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)6144);
__m512 wt204 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)7168);
__m512 wt205 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)8192);
__m512 wt206 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)9216);
__m512 wt207 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)10240);
__m512 wt208 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)11264);
__m512 wt209 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)12288);
__m512 wt210 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)13312);
__m512 wt211 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)14336);
__m512 wt212 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)15360);
__m512 tmp5445 = _mm512_unpacklo_ps(wt197, wt198);
__m512 tmp5446 = _mm512_unpackhi_ps(wt197, wt198);
__m512 tmp5447 = _mm512_unpacklo_ps(wt199, wt200);
__m512 tmp5448 = _mm512_unpackhi_ps(wt199, wt200);
__m512 tmp5449 = _mm512_unpacklo_ps(wt201, wt202);
__m512 tmp5450 = _mm512_unpackhi_ps(wt201, wt202);
__m512 tmp5451 = _mm512_unpacklo_ps(wt203, wt204);
__m512 tmp5452 = _mm512_unpackhi_ps(wt203, wt204);
__m512 tmp5453 = _mm512_unpacklo_ps(wt205, wt206);
__m512 tmp5454 = _mm512_unpackhi_ps(wt205, wt206);
__m512 tmp5455 = _mm512_unpacklo_ps(wt207, wt208);
__m512 tmp5456 = _mm512_unpackhi_ps(wt207, wt208);
__m512 tmp5457 = _mm512_unpacklo_ps(wt209, wt210);
__m512 tmp5458 = _mm512_unpackhi_ps(wt209, wt210);
__m512 tmp5459 = _mm512_unpacklo_ps(wt211, wt212);
__m512 tmp5460 = _mm512_unpackhi_ps(wt211, wt212);
__m512 tmp5461 = _mm512_shuffle_ps(tmp5445, tmp5447, 68);
__m512 tmp5462 = _mm512_shuffle_ps(tmp5445, tmp5447, 238);
__m512 tmp5463 = _mm512_shuffle_ps(tmp5446, tmp5448, 68);
__m512 tmp5464 = _mm512_shuffle_ps(tmp5446, tmp5448, 238);
__m512 tmp5465 = _mm512_shuffle_ps(tmp5449, tmp5451, 68);
__m512 tmp5466 = _mm512_shuffle_ps(tmp5449, tmp5451, 238);
__m512 tmp5467 = _mm512_shuffle_ps(tmp5450, tmp5452, 68);
__m512 tmp5468 = _mm512_shuffle_ps(tmp5450, tmp5452, 238);
__m512 tmp5469 = _mm512_shuffle_ps(tmp5453, tmp5455, 68);
__m512 tmp5470 = _mm512_shuffle_ps(tmp5453, tmp5455, 238);
__m512 tmp5471 = _mm512_shuffle_ps(tmp5454, tmp5456, 68);
__m512 tmp5472 = _mm512_shuffle_ps(tmp5454, tmp5456, 238);
__m512 tmp5473 = _mm512_shuffle_ps(tmp5457, tmp5459, 68);
__m512 tmp5474 = _mm512_shuffle_ps(tmp5457, tmp5459, 238);
__m512 tmp5475 = _mm512_shuffle_ps(tmp5458, tmp5460, 68);
__m512 tmp5476 = _mm512_shuffle_ps(tmp5458, tmp5460, 238);
__m512 tmp5477 = _mm512_shuffle_f32x4(tmp5461, tmp5465, 136);
__m512 tmp5478 = _mm512_shuffle_f32x4(tmp5461, tmp5465, 221);
__m512 tmp5479 = _mm512_shuffle_f32x4(tmp5462, tmp5466, 136);
__m512 tmp5480 = _mm512_shuffle_f32x4(tmp5462, tmp5466, 221);
__m512 tmp5481 = _mm512_shuffle_f32x4(tmp5463, tmp5467, 136);
__m512 tmp5482 = _mm512_shuffle_f32x4(tmp5463, tmp5467, 221);
__m512 tmp5483 = _mm512_shuffle_f32x4(tmp5464, tmp5468, 136);
__m512 tmp5484 = _mm512_shuffle_f32x4(tmp5464, tmp5468, 221);
__m512 tmp5485 = _mm512_shuffle_f32x4(tmp5469, tmp5473, 136);
__m512 tmp5486 = _mm512_shuffle_f32x4(tmp5469, tmp5473, 221);
__m512 tmp5487 = _mm512_shuffle_f32x4(tmp5470, tmp5474, 136);
__m512 tmp5488 = _mm512_shuffle_f32x4(tmp5470, tmp5474, 221);
__m512 tmp5489 = _mm512_shuffle_f32x4(tmp5471, tmp5475, 136);
__m512 tmp5490 = _mm512_shuffle_f32x4(tmp5471, tmp5475, 221);
__m512 tmp5491 = _mm512_shuffle_f32x4(tmp5472, tmp5476, 136);
__m512 tmp5492 = _mm512_shuffle_f32x4(tmp5472, tmp5476, 221);
wt197 = _mm512_shuffle_f32x4(tmp5477, tmp5485, 136);
wt205 = _mm512_shuffle_f32x4(tmp5477, tmp5485, 221);
wt198 = _mm512_shuffle_f32x4(tmp5479, tmp5487, 136);
wt206 = _mm512_shuffle_f32x4(tmp5479, tmp5487, 221);
wt199 = _mm512_shuffle_f32x4(tmp5481, tmp5489, 136);
wt207 = _mm512_shuffle_f32x4(tmp5481, tmp5489, 221);
wt200 = _mm512_shuffle_f32x4(tmp5483, tmp5491, 136);
wt208 = _mm512_shuffle_f32x4(tmp5483, tmp5491, 221);
wt201 = _mm512_shuffle_f32x4(tmp5478, tmp5486, 136);
wt209 = _mm512_shuffle_f32x4(tmp5478, tmp5486, 221);
wt202 = _mm512_shuffle_f32x4(tmp5480, tmp5488, 136);
wt210 = _mm512_shuffle_f32x4(tmp5480, tmp5488, 221);
wt203 = _mm512_shuffle_f32x4(tmp5482, tmp5490, 136);
wt211 = _mm512_shuffle_f32x4(tmp5482, tmp5490, 221);
wt204 = _mm512_shuffle_f32x4(tmp5484, tmp5492, 136);
wt212 = _mm512_shuffle_f32x4(tmp5484, tmp5492, 221);
wt197 = _mm512_mul_ps(wt197, postMul19);
wt198 = _mm512_mul_ps(wt198, postMul19);
wt199 = _mm512_mul_ps(wt199, postMul19);
wt200 = _mm512_mul_ps(wt200, postMul19);
wt201 = _mm512_mul_ps(wt201, postMul19);
wt202 = _mm512_mul_ps(wt202, postMul19);
wt203 = _mm512_mul_ps(wt203, postMul19);
wt204 = _mm512_mul_ps(wt204, postMul19);
wt205 = _mm512_mul_ps(wt205, postMul19);
wt206 = _mm512_mul_ps(wt206, postMul19);
wt207 = _mm512_mul_ps(wt207, postMul19);
wt208 = _mm512_mul_ps(wt208, postMul19);
wt209 = _mm512_mul_ps(wt209, postMul19);
wt210 = _mm512_mul_ps(wt210, postMul19);
wt211 = _mm512_mul_ps(wt211, postMul19);
wt212 = _mm512_mul_ps(wt212, postMul19);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(1+16*c18)+(ptrdiff_t)0, 63>>cut8, wt197);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(2+16*c18)+(ptrdiff_t)0, 63>>cut8, wt198);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(3+16*c18)+(ptrdiff_t)0, 63>>cut8, wt199);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(4+16*c18)+(ptrdiff_t)0, 63>>cut8, wt200);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(5+16*c18)+(ptrdiff_t)0, 63>>cut8, wt201);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(6+16*c18)+(ptrdiff_t)0, 63>>cut8, wt202);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(7+16*c18)+(ptrdiff_t)0, 63>>cut8, wt203);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(8+16*c18)+(ptrdiff_t)0, 63>>cut8, wt204);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(9+16*c18)+(ptrdiff_t)0, 63>>cut8, wt205);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(10+16*c18)+(ptrdiff_t)0, 63>>cut8, wt206);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(11+16*c18)+(ptrdiff_t)0, 63>>cut8, wt207);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(12+16*c18)+(ptrdiff_t)0, 63>>cut8, wt208);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(13+16*c18)+(ptrdiff_t)0, 63>>cut8, wt209);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(14+16*c18)+(ptrdiff_t)0, 63>>cut8, wt210);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(15+16*c18)+(ptrdiff_t)0, 63>>cut8, wt211);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(16+16*c18)+(ptrdiff_t)0, 63>>cut8, wt212);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(1+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt197);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(2+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt198);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(3+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt199);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(4+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt200);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(5+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt201);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(6+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt202);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(7+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt203);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(8+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt204);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(9+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt205);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(10+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt206);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(11+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt207);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(12+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt208);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(13+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt209);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(14+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt210);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(15+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt211);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(16+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt212);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(1+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt197);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(2+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt198);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(3+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt199);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(4+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt200);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(5+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt201);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(6+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt202);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(7+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt203);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(8+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt204);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(9+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt205);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(10+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt206);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(11+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt207);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(12+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt208);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(13+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt209);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(14+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt210);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(15+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt211);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(16+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt212);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(1+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt197);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(2+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt198);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(3+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt199);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(4+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt200);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(5+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt201);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(6+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt202);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(7+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt203);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(8+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt204);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(9+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt205);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(10+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt206);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(11+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt207);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(12+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt208);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(13+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt209);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(14+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt210);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(15+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt211);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(16+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt212);
}
}
}
} else {
ptrdiff_t k75 = 48;
ptrdiff_t l25 = (size_t)(0+k75)/6;
ptrdiff_t cut7 = (size_t)(0+k75)%6;
__m512 sum122 = _mm512_maskz_loadu_ps(65535, biasPtr6+256*i22+4*k75);
__m512i pmMul13 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd13 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo11 = _mm512_loadu_ps(bnPtr6+(ptrdiff_t)8*(k75+64*i22));
__m512 masHi11 = _mm512_maskz_loadu_ps(65535, bnPtr6+(ptrdiff_t)8*(k75+64*i22)+(ptrdiff_t)64);
__m512 postMul17 = _mm512_permutex2var_ps(masLo11, pmMul13, masHi11);
__m512 postAdd11 = _mm512_permutex2var_ps(masLo11, pmAdd13, masHi11);
sum122 = _mm512_fmadd_ps(sum122, postMul17, postAdd11);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*0+(ptrdiff_t)0, 63>>cut7, sum122);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*0+(ptrdiff_t)6144, 4032>>cut7, sum122);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*0+(ptrdiff_t)12288, 65535-(4095>>cut7), sum122);
ptrdiff_t c16 = 0;
for (; c16 != 16; ++c16) {
__m512 wt165 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)0);
__m512 wt166 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)1024);
__m512 wt167 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)2048);
__m512 wt168 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)3072);
__m512 wt169 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)4096);
__m512 wt170 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)5120);
__m512 wt171 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)6144);
__m512 wt172 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)7168);
__m512 wt173 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)8192);
__m512 wt174 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)9216);
__m512 wt175 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)10240);
__m512 wt176 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)11264);
__m512 wt177 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)12288);
__m512 wt178 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)13312);
__m512 wt179 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)14336);
__m512 wt180 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)15360);
__m512 tmp5493 = _mm512_unpacklo_ps(wt165, wt166);
__m512 tmp5494 = _mm512_unpackhi_ps(wt165, wt166);
__m512 tmp5495 = _mm512_unpacklo_ps(wt167, wt168);
__m512 tmp5496 = _mm512_unpackhi_ps(wt167, wt168);
__m512 tmp5497 = _mm512_unpacklo_ps(wt169, wt170);
__m512 tmp5498 = _mm512_unpackhi_ps(wt169, wt170);
__m512 tmp5499 = _mm512_unpacklo_ps(wt171, wt172);
__m512 tmp5500 = _mm512_unpackhi_ps(wt171, wt172);
__m512 tmp5501 = _mm512_unpacklo_ps(wt173, wt174);
__m512 tmp5502 = _mm512_unpackhi_ps(wt173, wt174);
__m512 tmp5503 = _mm512_unpacklo_ps(wt175, wt176);
__m512 tmp5504 = _mm512_unpackhi_ps(wt175, wt176);
__m512 tmp5505 = _mm512_unpacklo_ps(wt177, wt178);
__m512 tmp5506 = _mm512_unpackhi_ps(wt177, wt178);
__m512 tmp5507 = _mm512_unpacklo_ps(wt179, wt180);
__m512 tmp5508 = _mm512_unpackhi_ps(wt179, wt180);
__m512 tmp5509 = _mm512_shuffle_ps(tmp5493, tmp5495, 68);
__m512 tmp5510 = _mm512_shuffle_ps(tmp5493, tmp5495, 238);
__m512 tmp5511 = _mm512_shuffle_ps(tmp5494, tmp5496, 68);
__m512 tmp5512 = _mm512_shuffle_ps(tmp5494, tmp5496, 238);
__m512 tmp5513 = _mm512_shuffle_ps(tmp5497, tmp5499, 68);
__m512 tmp5514 = _mm512_shuffle_ps(tmp5497, tmp5499, 238);
__m512 tmp5515 = _mm512_shuffle_ps(tmp5498, tmp5500, 68);
__m512 tmp5516 = _mm512_shuffle_ps(tmp5498, tmp5500, 238);
__m512 tmp5517 = _mm512_shuffle_ps(tmp5501, tmp5503, 68);
__m512 tmp5518 = _mm512_shuffle_ps(tmp5501, tmp5503, 238);
__m512 tmp5519 = _mm512_shuffle_ps(tmp5502, tmp5504, 68);
__m512 tmp5520 = _mm512_shuffle_ps(tmp5502, tmp5504, 238);
__m512 tmp5521 = _mm512_shuffle_ps(tmp5505, tmp5507, 68);
__m512 tmp5522 = _mm512_shuffle_ps(tmp5505, tmp5507, 238);
__m512 tmp5523 = _mm512_shuffle_ps(tmp5506, tmp5508, 68);
__m512 tmp5524 = _mm512_shuffle_ps(tmp5506, tmp5508, 238);
__m512 tmp5525 = _mm512_shuffle_f32x4(tmp5509, tmp5513, 136);
__m512 tmp5526 = _mm512_shuffle_f32x4(tmp5509, tmp5513, 221);
__m512 tmp5527 = _mm512_shuffle_f32x4(tmp5510, tmp5514, 136);
__m512 tmp5528 = _mm512_shuffle_f32x4(tmp5510, tmp5514, 221);
__m512 tmp5529 = _mm512_shuffle_f32x4(tmp5511, tmp5515, 136);
__m512 tmp5530 = _mm512_shuffle_f32x4(tmp5511, tmp5515, 221);
__m512 tmp5531 = _mm512_shuffle_f32x4(tmp5512, tmp5516, 136);
__m512 tmp5532 = _mm512_shuffle_f32x4(tmp5512, tmp5516, 221);
__m512 tmp5533 = _mm512_shuffle_f32x4(tmp5517, tmp5521, 136);
__m512 tmp5534 = _mm512_shuffle_f32x4(tmp5517, tmp5521, 221);
__m512 tmp5535 = _mm512_shuffle_f32x4(tmp5518, tmp5522, 136);
__m512 tmp5536 = _mm512_shuffle_f32x4(tmp5518, tmp5522, 221);
__m512 tmp5537 = _mm512_shuffle_f32x4(tmp5519, tmp5523, 136);
__m512 tmp5538 = _mm512_shuffle_f32x4(tmp5519, tmp5523, 221);
__m512 tmp5539 = _mm512_shuffle_f32x4(tmp5520, tmp5524, 136);
__m512 tmp5540 = _mm512_shuffle_f32x4(tmp5520, tmp5524, 221);
wt165 = _mm512_shuffle_f32x4(tmp5525, tmp5533, 136);
wt173 = _mm512_shuffle_f32x4(tmp5525, tmp5533, 221);
wt166 = _mm512_shuffle_f32x4(tmp5527, tmp5535, 136);
wt174 = _mm512_shuffle_f32x4(tmp5527, tmp5535, 221);
wt167 = _mm512_shuffle_f32x4(tmp5529, tmp5537, 136);
wt175 = _mm512_shuffle_f32x4(tmp5529, tmp5537, 221);
wt168 = _mm512_shuffle_f32x4(tmp5531, tmp5539, 136);
wt176 = _mm512_shuffle_f32x4(tmp5531, tmp5539, 221);
wt169 = _mm512_shuffle_f32x4(tmp5526, tmp5534, 136);
wt177 = _mm512_shuffle_f32x4(tmp5526, tmp5534, 221);
wt170 = _mm512_shuffle_f32x4(tmp5528, tmp5536, 136);
wt178 = _mm512_shuffle_f32x4(tmp5528, tmp5536, 221);
wt171 = _mm512_shuffle_f32x4(tmp5530, tmp5538, 136);
wt179 = _mm512_shuffle_f32x4(tmp5530, tmp5538, 221);
wt172 = _mm512_shuffle_f32x4(tmp5532, tmp5540, 136);
wt180 = _mm512_shuffle_f32x4(tmp5532, tmp5540, 221);
wt165 = _mm512_mul_ps(wt165, postMul17);
wt166 = _mm512_mul_ps(wt166, postMul17);
wt167 = _mm512_mul_ps(wt167, postMul17);
wt168 = _mm512_mul_ps(wt168, postMul17);
wt169 = _mm512_mul_ps(wt169, postMul17);
wt170 = _mm512_mul_ps(wt170, postMul17);
wt171 = _mm512_mul_ps(wt171, postMul17);
wt172 = _mm512_mul_ps(wt172, postMul17);
wt173 = _mm512_mul_ps(wt173, postMul17);
wt174 = _mm512_mul_ps(wt174, postMul17);
wt175 = _mm512_mul_ps(wt175, postMul17);
wt176 = _mm512_mul_ps(wt176, postMul17);
wt177 = _mm512_mul_ps(wt177, postMul17);
wt178 = _mm512_mul_ps(wt178, postMul17);
wt179 = _mm512_mul_ps(wt179, postMul17);
wt180 = _mm512_mul_ps(wt180, postMul17);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(1+16*c16)+(ptrdiff_t)0, 63>>cut7, wt165);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(2+16*c16)+(ptrdiff_t)0, 63>>cut7, wt166);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(3+16*c16)+(ptrdiff_t)0, 63>>cut7, wt167);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(4+16*c16)+(ptrdiff_t)0, 63>>cut7, wt168);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(5+16*c16)+(ptrdiff_t)0, 63>>cut7, wt169);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(6+16*c16)+(ptrdiff_t)0, 63>>cut7, wt170);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(7+16*c16)+(ptrdiff_t)0, 63>>cut7, wt171);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(8+16*c16)+(ptrdiff_t)0, 63>>cut7, wt172);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(9+16*c16)+(ptrdiff_t)0, 63>>cut7, wt173);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(10+16*c16)+(ptrdiff_t)0, 63>>cut7, wt174);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(11+16*c16)+(ptrdiff_t)0, 63>>cut7, wt175);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(12+16*c16)+(ptrdiff_t)0, 63>>cut7, wt176);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(13+16*c16)+(ptrdiff_t)0, 63>>cut7, wt177);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(14+16*c16)+(ptrdiff_t)0, 63>>cut7, wt178);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(15+16*c16)+(ptrdiff_t)0, 63>>cut7, wt179);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(16+16*c16)+(ptrdiff_t)0, 63>>cut7, wt180);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(1+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt165);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(2+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt166);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(3+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt167);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(4+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt168);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(5+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt169);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(6+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt170);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(7+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt171);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(8+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt172);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(9+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt173);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(10+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt174);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(11+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt175);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(12+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt176);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(13+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt177);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(14+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt178);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(15+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt179);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(16+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt180);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(1+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt165);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(2+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt166);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(3+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt167);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(4+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt168);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(5+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt169);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(6+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt170);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(7+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt171);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(8+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt172);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(9+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt173);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(10+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt174);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(11+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt175);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(12+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt176);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(13+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt177);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(14+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt178);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(15+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt179);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(16+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt180);
}
}
}
}
}

/*
 * Runs ResNet50OneArrangeWts3Callee1 through ResNet50ThreaderDo1 on `team29`.
 * The task describes a 3-dimensional hull of extents 2 x 1 x 1 and carries
 * `tensors31` in its `any1` field.
 */
static void ResNet50OneArrangeWts3(ResNet50ThreaderTeam1* team29, char** tensors31) {
    ResNet50ThreaderTask1 job;

    /* Iteration space: 2 x 1 x 1. */
    job.nd1 = 3;
    job.hull1[0] = 2;
    job.hull1[1] = 1;
    job.hull1[2] = 1;

    /* Work function and the tensor table it is given. */
    job.any1 = tensors31;
    job.callee1 = ResNet50OneArrangeWts3Callee1;

    ResNet50ThreaderDo1(team29, &job);
}

/*
 * Work item of the data-arrangement step.
 *
 * task36->any1 is a table of buffers: entry 0 is read, entry 1 is written.
 * pt23[0] picks a run of 128 consecutive k indices, [128*pt23[0], +128).
 * pt23[1] picks the strip index j; the value 49 is treated as past-the-end
 * and copies nothing.
 *
 * For every k in the run, 64 floats (four 16-float vectors, 256 bytes) are
 * copied from byte offset 256*j + 12608*k of the source to byte offset
 * 65536*j + 256*k of the destination, so strips of consecutive k for a
 * given j are adjacent in the destination.
 */
static void ResNet50OneArrangeDats3Callee1(ResNet50ThreaderTask1* task36, int64_t* pt23) {
    char** bufs = task36->any1;
    const ptrdiff_t half = pt23[0];
    const ptrdiff_t strip = pt23[1];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];

    /* Strip index 49 is the end marker: nothing to do. */
    if (strip == 49) {
        return;
    }

    const ptrdiff_t kFirst = 128*half;
    const ptrdiff_t kLimit = kFirst+128;
    for (ptrdiff_t k = kFirst; k < kLimit; ++k) {
        char* in = src+256*strip+12608*k;
        char* out = dst+65536*strip+256*k;

        /* Load the whole 256-byte strip before writing any of it. */
        __m512 v0 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)0);
        __m512 v1 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)64);
        __m512 v2 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)128);
        __m512 v3 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)192);

        _mm512_mask_storeu_ps(out+(ptrdiff_t)0, 65535, v0);
        _mm512_mask_storeu_ps(out+(ptrdiff_t)64, 65535, v1);
        _mm512_mask_storeu_ps(out+(ptrdiff_t)128, 65535, v2);
        _mm512_mask_storeu_ps(out+(ptrdiff_t)192, 65535, v3);
    }
}

/*
 * Runs ResNet50OneArrangeDats3Callee1 through ResNet50ThreaderDo1 on `team30`.
 * The task describes a 4-dimensional hull of extents 2 x 49 x 1 x 1 and
 * carries `tensors33` in its `any1` field, where the callee reads it back.
 */
static void ResNet50OneArrangeDats3(ResNet50ThreaderTeam1* team30, char** tensors33) {
    ResNet50ThreaderTask1 job;

    /* Iteration space: 2 x 49 x 1 x 1. */
    job.nd1 = 4;
    job.hull1[0] = 2;
    job.hull1[1] = 49;
    job.hull1[2] = 1;
    job.hull1[3] = 1;

    /* Work function and the tensor table it is given. */
    job.any1 = tensors33;
    job.callee1 = ResNet50OneArrangeDats3Callee1;

    ResNet50ThreaderDo1(team30, &job);
}

// Work item of the "apply" step: multiplies arranged weights by arranged data,
// clamps the result below at zero, and stores it to the output buffer.
//
// task38->any1 points at a pair whose first entry is a buffer table:
//   [0] arranged weights (read), [1] arranged data (read), [2] output (written).
// pt24[0] (w35) selects which weight blocks this call handles; pt24[1] (d7)
// selects the 64-float data strip j19. Each call handles exactly one j19.
//
// Weight blocks are 6168 bytes apart. Blocks 0..9 hold 6 floats per s-row
// (24-byte rows); block 10 holds 4 floats per s-row (16-byte rows). The first
// row of each block seeds the accumulators (presumably the bias row produced
// by the weight-arrangement step -- confirm against ResNet50OneArrangeWts3Callee1);
// the following 256 rows are the multiplicands for s = 0..255.
//
// With w35 in 0..4 (ResNet50OneApply3 dispatches hull1[0] = 5):
//   w35 < 4  -> blocks 2*w35 and 2*w35+1, then return from inside the loop;
//   w35 == 4 -> blocks 8 and 9 in the loop, then the 4-row block 10 below it.
static void ResNet50OneApply3Callee1(ResNet50ThreaderTask1* task38, int64_t* pt24) {
void** pair8 = task38->any1;
char** tensors36 = pair8[0];
// e11 and g12 are fixed at zero here, so the offsets scaled by them vanish.
ptrdiff_t e11 = 0;
ptrdiff_t g12 = 0;
ptrdiff_t d7 = pt24[1];
ptrdiff_t w35 = pt24[0];
char*restrict arrangedWts3 = tensors36[0]+214016*e11+(ptrdiff_t)65792*1*g12;
char*restrict arrangedDats3 = tensors36[1]+10474240*e11+(ptrdiff_t)3211264*1*g12;
char*restrict datPtr11 = tensors36[2]+(ptrdiff_t)806912*1*g12;
ptrdiff_t ii9 = 1;
for (ptrdiff_t i24 = 0; i24 < ii9; ++i24) {
ptrdiff_t j19 = 1*d7;
// jj28 == j19, so the `j19 >= jj28` check at the bottom always returns:
// the j19 loop body runs at most once (and not at all when d7 == 49).
ptrdiff_t jj28 = j19+0;
for (; j19 != 49; ++j19) {
ptrdiff_t k78 = 2*w35;
// kk30 is the index at which the in-loop return fires. For w35 >= 4 it is
// k78+2, which the loop never reaches before k78 hits 10 and exits normally.
ptrdiff_t kk30 = k78+(w35 < 4 ? 1 : 2);
for (; k78 != 10; ++k78) {
// s17 = -1 makes 24*s17+24 == 0: the accumulators start from the six floats
// at the very beginning of block k78.
ptrdiff_t s17 = -1;
__m512 sum125 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)24));
__m512 sum129 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)28));
__m512 sum133 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)32));
__m512 sum137 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)36));
__m512 sum141 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)40));
__m512 sum145 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)44));
// Four accumulators per output row (one per 16-float quarter of the strip),
// all seeded with that row's broadcast starting value.
__m512 sum126 = sum125;
__m512 sum127 = sum125;
__m512 sum128 = sum125;
__m512 sum130 = sum129;
__m512 sum131 = sum129;
__m512 sum132 = sum129;
__m512 sum134 = sum133;
__m512 sum135 = sum133;
__m512 sum136 = sum133;
__m512 sum138 = sum137;
__m512 sum139 = sum137;
__m512 sum140 = sum137;
__m512 sum142 = sum141;
__m512 sum143 = sum141;
__m512 sum144 = sum141;
__m512 sum146 = sum145;
__m512 sum147 = sum145;
__m512 sum148 = sum145;
// Reduction over s: load the 64-float data strip for (j19, s) once and
// fuse-multiply-add it with each of the six broadcast weights of row s.
for (s17 = 0; s17 < 256; ++s17) {
__m512 dat1287 = _mm512_loadu_ps(arrangedDats3+3211264*i24+65536*j19+256*s17+(ptrdiff_t)0);
__m512 dat1288 = _mm512_loadu_ps(arrangedDats3+3211264*i24+65536*j19+256*s17+(ptrdiff_t)64);
__m512 dat1289 = _mm512_loadu_ps(arrangedDats3+3211264*i24+65536*j19+256*s17+(ptrdiff_t)128);
__m512 dat1290 = _mm512_loadu_ps(arrangedDats3+3211264*i24+65536*j19+256*s17+(ptrdiff_t)192);
__m512 wt213 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)24));
sum125 = _mm512_fmadd_ps(wt213, dat1287, sum125);
sum126 = _mm512_fmadd_ps(wt213, dat1288, sum126);
sum127 = _mm512_fmadd_ps(wt213, dat1289, sum127);
sum128 = _mm512_fmadd_ps(wt213, dat1290, sum128);
__m512 wt214 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)28));
sum129 = _mm512_fmadd_ps(wt214, dat1287, sum129);
sum130 = _mm512_fmadd_ps(wt214, dat1288, sum130);
sum131 = _mm512_fmadd_ps(wt214, dat1289, sum131);
sum132 = _mm512_fmadd_ps(wt214, dat1290, sum132);
__m512 wt215 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)32));
sum133 = _mm512_fmadd_ps(wt215, dat1287, sum133);
sum134 = _mm512_fmadd_ps(wt215, dat1288, sum134);
sum135 = _mm512_fmadd_ps(wt215, dat1289, sum135);
sum136 = _mm512_fmadd_ps(wt215, dat1290, sum136);
__m512 wt216 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)36));
sum137 = _mm512_fmadd_ps(wt216, dat1287, sum137);
sum138 = _mm512_fmadd_ps(wt216, dat1288, sum138);
sum139 = _mm512_fmadd_ps(wt216, dat1289, sum139);
sum140 = _mm512_fmadd_ps(wt216, dat1290, sum140);
__m512 wt217 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)40));
sum141 = _mm512_fmadd_ps(wt217, dat1287, sum141);
sum142 = _mm512_fmadd_ps(wt217, dat1288, sum142);
sum143 = _mm512_fmadd_ps(wt217, dat1289, sum143);
sum144 = _mm512_fmadd_ps(wt217, dat1290, sum144);
__m512 wt218 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)44));
sum145 = _mm512_fmadd_ps(wt218, dat1287, sum145);
sum146 = _mm512_fmadd_ps(wt218, dat1288, sum146);
sum147 = _mm512_fmadd_ps(wt218, dat1289, sum147);
sum148 = _mm512_fmadd_ps(wt218, dat1290, sum148);
}
// Clamp below at zero (ReLU) and store. Output rows are 12608 bytes apart;
// block k78 starts at 75648*k78 (= 6 rows), strip j19 at 256*j19 within a row.
sum125 = _mm512_max_ps(_mm512_setzero_ps(), sum125);
sum126 = _mm512_max_ps(_mm512_setzero_ps(), sum126);
sum127 = _mm512_max_ps(_mm512_setzero_ps(), sum127);
sum128 = _mm512_max_ps(_mm512_setzero_ps(), sum128);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)0, 65535, sum125);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)64, 65535, sum126);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)128, 65535, sum127);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)192, 65535, sum128);
sum129 = _mm512_max_ps(_mm512_setzero_ps(), sum129);
sum130 = _mm512_max_ps(_mm512_setzero_ps(), sum130);
sum131 = _mm512_max_ps(_mm512_setzero_ps(), sum131);
sum132 = _mm512_max_ps(_mm512_setzero_ps(), sum132);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)12608, 65535, sum129);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)12672, 65535, sum130);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)12736, 65535, sum131);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)12800, 65535, sum132);
sum133 = _mm512_max_ps(_mm512_setzero_ps(), sum133);
sum134 = _mm512_max_ps(_mm512_setzero_ps(), sum134);
sum135 = _mm512_max_ps(_mm512_setzero_ps(), sum135);
sum136 = _mm512_max_ps(_mm512_setzero_ps(), sum136);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)25216, 65535, sum133);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)25280, 65535, sum134);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)25344, 65535, sum135);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)25408, 65535, sum136);
sum137 = _mm512_max_ps(_mm512_setzero_ps(), sum137);
sum138 = _mm512_max_ps(_mm512_setzero_ps(), sum138);
sum139 = _mm512_max_ps(_mm512_setzero_ps(), sum139);
sum140 = _mm512_max_ps(_mm512_setzero_ps(), sum140);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)37824, 65535, sum137);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)37888, 65535, sum138);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)37952, 65535, sum139);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)38016, 65535, sum140);
sum141 = _mm512_max_ps(_mm512_setzero_ps(), sum141);
sum142 = _mm512_max_ps(_mm512_setzero_ps(), sum142);
sum143 = _mm512_max_ps(_mm512_setzero_ps(), sum143);
sum144 = _mm512_max_ps(_mm512_setzero_ps(), sum144);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)50432, 65535, sum141);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)50496, 65535, sum142);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)50560, 65535, sum143);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)50624, 65535, sum144);
sum145 = _mm512_max_ps(_mm512_setzero_ps(), sum145);
sum146 = _mm512_max_ps(_mm512_setzero_ps(), sum146);
sum147 = _mm512_max_ps(_mm512_setzero_ps(), sum147);
sum148 = _mm512_max_ps(_mm512_setzero_ps(), sum148);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)63040, 65535, sum145);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)63104, 65535, sum146);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)63168, 65535, sum147);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)63232, 65535, sum148);
// Finished this call's share of 6-row blocks: leave without touching block 10.
if (k78 >= kk30) return;
}
// Reached only when the loop above ran out at k78 == 10: the final block,
// which has 4 output rows and therefore 16-byte weight rows.
// s18 = -1 makes 16*s18+16 == 0, i.e. the first four floats of the block.
ptrdiff_t s18 = -1;
__m512 sum149 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+16*s18+(ptrdiff_t)16));
__m512 sum153 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+16*s18+(ptrdiff_t)20));
__m512 sum157 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+16*s18+(ptrdiff_t)24));
__m512 sum161 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+16*s18+(ptrdiff_t)28));
__m512 sum150 = sum149;
__m512 sum151 = sum149;
__m512 sum152 = sum149;
__m512 sum154 = sum153;
__m512 sum155 = sum153;
__m512 sum156 = sum153;
__m512 sum158 = sum157;
__m512 sum159 = sum157;
__m512 sum160 = sum157;
__m512 sum162 = sum161;
__m512 sum163 = sum161;
__m512 sum164 = sum161;
// Same reduction as above, with four weights per s instead of six.
for (s18 = 0; s18 < 256; ++s18) {
__m512 dat1291 = _mm512_loadu_ps(arrangedDats3+3211264*i24+65536*j19+256*s18+(ptrdiff_t)0);
__m512 dat1292 = _mm512_loadu_ps(arrangedDats3+3211264*i24+65536*j19+256*s18+(ptrdiff_t)64);
__m512 dat1293 = _mm512_loadu_ps(arrangedDats3+3211264*i24+65536*j19+256*s18+(ptrdiff_t)128);
__m512 dat1294 = _mm512_loadu_ps(arrangedDats3+3211264*i24+65536*j19+256*s18+(ptrdiff_t)192);
__m512 wt219 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+16*s18+(ptrdiff_t)16));
sum149 = _mm512_fmadd_ps(wt219, dat1291, sum149);
sum150 = _mm512_fmadd_ps(wt219, dat1292, sum150);
sum151 = _mm512_fmadd_ps(wt219, dat1293, sum151);
sum152 = _mm512_fmadd_ps(wt219, dat1294, sum152);
__m512 wt220 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+16*s18+(ptrdiff_t)20));
sum153 = _mm512_fmadd_ps(wt220, dat1291, sum153);
sum154 = _mm512_fmadd_ps(wt220, dat1292, sum154);
sum155 = _mm512_fmadd_ps(wt220, dat1293, sum155);
sum156 = _mm512_fmadd_ps(wt220, dat1294, sum156);
__m512 wt221 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+16*s18+(ptrdiff_t)24));
sum157 = _mm512_fmadd_ps(wt221, dat1291, sum157);
sum158 = _mm512_fmadd_ps(wt221, dat1292, sum158);
sum159 = _mm512_fmadd_ps(wt221, dat1293, sum159);
sum160 = _mm512_fmadd_ps(wt221, dat1294, sum160);
__m512 wt222 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+16*s18+(ptrdiff_t)28));
sum161 = _mm512_fmadd_ps(wt222, dat1291, sum161);
sum162 = _mm512_fmadd_ps(wt222, dat1292, sum162);
sum163 = _mm512_fmadd_ps(wt222, dat1293, sum163);
sum164 = _mm512_fmadd_ps(wt222, dat1294, sum164);
}
// Clamp below at zero and store the four rows of block 10.
sum149 = _mm512_max_ps(_mm512_setzero_ps(), sum149);
sum150 = _mm512_max_ps(_mm512_setzero_ps(), sum150);
sum151 = _mm512_max_ps(_mm512_setzero_ps(), sum151);
sum152 = _mm512_max_ps(_mm512_setzero_ps(), sum152);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)0, 65535, sum149);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)64, 65535, sum150);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)128, 65535, sum151);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)192, 65535, sum152);
sum153 = _mm512_max_ps(_mm512_setzero_ps(), sum153);
sum154 = _mm512_max_ps(_mm512_setzero_ps(), sum154);
sum155 = _mm512_max_ps(_mm512_setzero_ps(), sum155);
sum156 = _mm512_max_ps(_mm512_setzero_ps(), sum156);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)12608, 65535, sum153);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)12672, 65535, sum154);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)12736, 65535, sum155);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)12800, 65535, sum156);
sum157 = _mm512_max_ps(_mm512_setzero_ps(), sum157);
sum158 = _mm512_max_ps(_mm512_setzero_ps(), sum158);
sum159 = _mm512_max_ps(_mm512_setzero_ps(), sum159);
sum160 = _mm512_max_ps(_mm512_setzero_ps(), sum160);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)25216, 65535, sum157);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)25280, 65535, sum158);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)25344, 65535, sum159);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)25408, 65535, sum160);
sum161 = _mm512_max_ps(_mm512_setzero_ps(), sum161);
sum162 = _mm512_max_ps(_mm512_setzero_ps(), sum162);
sum163 = _mm512_max_ps(_mm512_setzero_ps(), sum163);
sum164 = _mm512_max_ps(_mm512_setzero_ps(), sum164);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)37824, 65535, sum161);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)37888, 65535, sum162);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)37952, 65535, sum163);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)38016, 65535, sum164);
// Always true (jj28 equals the starting j19): one strip per call.
if (j19 >= jj28) return;
}
}
}

/*
 * Runs ResNet50OneApply3Callee1 through ResNet50ThreaderDo1 on `team31`.
 * The task describes a 3-dimensional hull of extents 5 x 49 x 1. Its `any1`
 * field points at a two-entry pair: the tensor table `tensors35` followed
 * by a null entry. The pair lives on this function's stack for the duration
 * of the ResNet50ThreaderDo1 call.
 */
static void ResNet50OneApply3(ResNet50ThreaderTeam1* team31, char** tensors35) {
    void* payload[] = {tensors35, 0};
    ResNet50ThreaderTask1 job;

    /* Iteration space: 5 x 49 x 1. */
    job.nd1 = 3;
    job.hull1[0] = 5;
    job.hull1[1] = 49;
    job.hull1[2] = 1;

    /* Work function and the pair it unpacks. */
    job.any1 = payload;
    job.callee1 = ResNet50OneApply3Callee1;

    ResNet50ThreaderDo1(team31, &job);
}

static void ResNet50OneArrangeWts4Callee1(ResNet50ThreaderTask1* task48, int64_t* pt29) {
char** tensors46 = task48->any1;
ptrdiff_t b53 = pt29[0];
char*restrict wtPtr8 = tensors46[0]+(ptrdiff_t)3340*0+(ptrdiff_t)655360*0;
char*restrict biasPtr8 = tensors46[1]+(ptrdiff_t)2560*0;
char*restrict bnPtr8 = tensors46[2]+(ptrdiff_t)8*640*0;
char*restrict wtPtr9 = tensors46[3]+(ptrdiff_t)3340*0+(ptrdiff_t)655360*0;
char*restrict biasPtr9 = tensors46[4]+(ptrdiff_t)2560*0;
char*restrict bnPtr9 = tensors46[5]+(ptrdiff_t)8*640*0;
char*restrict arranged7 = tensors46[6]+(ptrdiff_t)2140160*0+(ptrdiff_t)657920*0;
ptrdiff_t ii10 = 1;
for (ptrdiff_t i31 = 0; i31 < ii10; ++i31) {
ptrdiff_t j24 = 2*b53;
ptrdiff_t jj30 = j24+2;
for (; j24 < jj30; ++j24) {
if (j24 < 32) {
ptrdiff_t k99 = 0+16*(j24-0);
ptrdiff_t l38 = (size_t)(0+k99)/6;
ptrdiff_t cut10 = (size_t)(0+k99)%6;
switch (cut10) {
case 0:;
case 2: {
__m512 sum205 = _mm512_maskz_loadu_ps(65535, biasPtr8+2560*i31+4*k99);
__m512i pmMul15 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd15 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo12 = _mm512_loadu_ps(bnPtr8+(ptrdiff_t)8*(k99+640*i31));
__m512 masHi12 = _mm512_maskz_loadu_ps(65535, bnPtr8+(ptrdiff_t)8*(k99+640*i31)+(ptrdiff_t)64);
__m512 postMul25 = _mm512_permutex2var_ps(masLo12, pmMul15, masHi12);
__m512 postAdd15 = _mm512_permutex2var_ps(masLo12, pmAdd15, masHi12);
sum205 = _mm512_fmadd_ps(sum205, postMul25, postAdd15);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)0, 63>>cut10, sum205);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)6144, 4032>>cut10, sum205);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)12288, 65535-(4095>>cut10), sum205);
ptrdiff_t c21 = 0;
for (; c21 != 16; ++c21) {
__m512 wt227 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)0);
__m512 wt228 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)1024);
__m512 wt229 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)2048);
__m512 wt230 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)3072);
__m512 wt231 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)4096);
__m512 wt232 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)5120);
__m512 wt233 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)6144);
__m512 wt234 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)7168);
__m512 wt235 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)8192);
__m512 wt236 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)9216);
__m512 wt237 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)10240);
__m512 wt238 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)11264);
__m512 wt239 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)12288);
__m512 wt240 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)13312);
__m512 wt241 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)14336);
__m512 wt242 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)15360);
__m512 tmp10553 = _mm512_unpacklo_ps(wt227, wt228);
__m512 tmp10554 = _mm512_unpackhi_ps(wt227, wt228);
__m512 tmp10555 = _mm512_unpacklo_ps(wt229, wt230);
__m512 tmp10556 = _mm512_unpackhi_ps(wt229, wt230);
__m512 tmp10557 = _mm512_unpacklo_ps(wt231, wt232);
__m512 tmp10558 = _mm512_unpackhi_ps(wt231, wt232);
__m512 tmp10559 = _mm512_unpacklo_ps(wt233, wt234);
__m512 tmp10560 = _mm512_unpackhi_ps(wt233, wt234);
__m512 tmp10561 = _mm512_unpacklo_ps(wt235, wt236);
__m512 tmp10562 = _mm512_unpackhi_ps(wt235, wt236);
__m512 tmp10563 = _mm512_unpacklo_ps(wt237, wt238);
__m512 tmp10564 = _mm512_unpackhi_ps(wt237, wt238);
__m512 tmp10565 = _mm512_unpacklo_ps(wt239, wt240);
__m512 tmp10566 = _mm512_unpackhi_ps(wt239, wt240);
__m512 tmp10567 = _mm512_unpacklo_ps(wt241, wt242);
__m512 tmp10568 = _mm512_unpackhi_ps(wt241, wt242);
__m512 tmp10569 = _mm512_shuffle_ps(tmp10553, tmp10555, 68);
__m512 tmp10570 = _mm512_shuffle_ps(tmp10553, tmp10555, 238);
__m512 tmp10571 = _mm512_shuffle_ps(tmp10554, tmp10556, 68);
__m512 tmp10572 = _mm512_shuffle_ps(tmp10554, tmp10556, 238);
__m512 tmp10573 = _mm512_shuffle_ps(tmp10557, tmp10559, 68);
__m512 tmp10574 = _mm512_shuffle_ps(tmp10557, tmp10559, 238);
__m512 tmp10575 = _mm512_shuffle_ps(tmp10558, tmp10560, 68);
__m512 tmp10576 = _mm512_shuffle_ps(tmp10558, tmp10560, 238);
__m512 tmp10577 = _mm512_shuffle_ps(tmp10561, tmp10563, 68);
__m512 tmp10578 = _mm512_shuffle_ps(tmp10561, tmp10563, 238);
__m512 tmp10579 = _mm512_shuffle_ps(tmp10562, tmp10564, 68);
__m512 tmp10580 = _mm512_shuffle_ps(tmp10562, tmp10564, 238);
__m512 tmp10581 = _mm512_shuffle_ps(tmp10565, tmp10567, 68);
__m512 tmp10582 = _mm512_shuffle_ps(tmp10565, tmp10567, 238);
__m512 tmp10583 = _mm512_shuffle_ps(tmp10566, tmp10568, 68);
__m512 tmp10584 = _mm512_shuffle_ps(tmp10566, tmp10568, 238);
__m512 tmp10585 = _mm512_shuffle_f32x4(tmp10569, tmp10573, 136);
__m512 tmp10586 = _mm512_shuffle_f32x4(tmp10569, tmp10573, 221);
__m512 tmp10587 = _mm512_shuffle_f32x4(tmp10570, tmp10574, 136);
__m512 tmp10588 = _mm512_shuffle_f32x4(tmp10570, tmp10574, 221);
__m512 tmp10589 = _mm512_shuffle_f32x4(tmp10571, tmp10575, 136);
__m512 tmp10590 = _mm512_shuffle_f32x4(tmp10571, tmp10575, 221);
__m512 tmp10591 = _mm512_shuffle_f32x4(tmp10572, tmp10576, 136);
__m512 tmp10592 = _mm512_shuffle_f32x4(tmp10572, tmp10576, 221);
__m512 tmp10593 = _mm512_shuffle_f32x4(tmp10577, tmp10581, 136);
__m512 tmp10594 = _mm512_shuffle_f32x4(tmp10577, tmp10581, 221);
__m512 tmp10595 = _mm512_shuffle_f32x4(tmp10578, tmp10582, 136);
__m512 tmp10596 = _mm512_shuffle_f32x4(tmp10578, tmp10582, 221);
__m512 tmp10597 = _mm512_shuffle_f32x4(tmp10579, tmp10583, 136);
__m512 tmp10598 = _mm512_shuffle_f32x4(tmp10579, tmp10583, 221);
__m512 tmp10599 = _mm512_shuffle_f32x4(tmp10580, tmp10584, 136);
__m512 tmp10600 = _mm512_shuffle_f32x4(tmp10580, tmp10584, 221);
wt227 = _mm512_shuffle_f32x4(tmp10585, tmp10593, 136);
wt235 = _mm512_shuffle_f32x4(tmp10585, tmp10593, 221);
wt228 = _mm512_shuffle_f32x4(tmp10587, tmp10595, 136);
wt236 = _mm512_shuffle_f32x4(tmp10587, tmp10595, 221);
wt229 = _mm512_shuffle_f32x4(tmp10589, tmp10597, 136);
wt237 = _mm512_shuffle_f32x4(tmp10589, tmp10597, 221);
wt230 = _mm512_shuffle_f32x4(tmp10591, tmp10599, 136);
wt238 = _mm512_shuffle_f32x4(tmp10591, tmp10599, 221);
wt231 = _mm512_shuffle_f32x4(tmp10586, tmp10594, 136);
wt239 = _mm512_shuffle_f32x4(tmp10586, tmp10594, 221);
wt232 = _mm512_shuffle_f32x4(tmp10588, tmp10596, 136);
wt240 = _mm512_shuffle_f32x4(tmp10588, tmp10596, 221);
wt233 = _mm512_shuffle_f32x4(tmp10590, tmp10598, 136);
wt241 = _mm512_shuffle_f32x4(tmp10590, tmp10598, 221);
wt234 = _mm512_shuffle_f32x4(tmp10592, tmp10600, 136);
wt242 = _mm512_shuffle_f32x4(tmp10592, tmp10600, 221);
wt227 = _mm512_mul_ps(wt227, postMul25);
wt228 = _mm512_mul_ps(wt228, postMul25);
wt229 = _mm512_mul_ps(wt229, postMul25);
wt230 = _mm512_mul_ps(wt230, postMul25);
wt231 = _mm512_mul_ps(wt231, postMul25);
wt232 = _mm512_mul_ps(wt232, postMul25);
wt233 = _mm512_mul_ps(wt233, postMul25);
wt234 = _mm512_mul_ps(wt234, postMul25);
wt235 = _mm512_mul_ps(wt235, postMul25);
wt236 = _mm512_mul_ps(wt236, postMul25);
wt237 = _mm512_mul_ps(wt237, postMul25);
wt238 = _mm512_mul_ps(wt238, postMul25);
wt239 = _mm512_mul_ps(wt239, postMul25);
wt240 = _mm512_mul_ps(wt240, postMul25);
wt241 = _mm512_mul_ps(wt241, postMul25);
wt242 = _mm512_mul_ps(wt242, postMul25);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(1+16*c21)+(ptrdiff_t)0, 63>>cut10, wt227);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(2+16*c21)+(ptrdiff_t)0, 63>>cut10, wt228);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(3+16*c21)+(ptrdiff_t)0, 63>>cut10, wt229);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(4+16*c21)+(ptrdiff_t)0, 63>>cut10, wt230);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(5+16*c21)+(ptrdiff_t)0, 63>>cut10, wt231);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(6+16*c21)+(ptrdiff_t)0, 63>>cut10, wt232);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(7+16*c21)+(ptrdiff_t)0, 63>>cut10, wt233);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(8+16*c21)+(ptrdiff_t)0, 63>>cut10, wt234);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(9+16*c21)+(ptrdiff_t)0, 63>>cut10, wt235);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(10+16*c21)+(ptrdiff_t)0, 63>>cut10, wt236);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(11+16*c21)+(ptrdiff_t)0, 63>>cut10, wt237);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(12+16*c21)+(ptrdiff_t)0, 63>>cut10, wt238);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(13+16*c21)+(ptrdiff_t)0, 63>>cut10, wt239);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(14+16*c21)+(ptrdiff_t)0, 63>>cut10, wt240);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(15+16*c21)+(ptrdiff_t)0, 63>>cut10, wt241);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(16+16*c21)+(ptrdiff_t)0, 63>>cut10, wt242);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(1+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt227);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(2+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt228);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(3+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt229);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(4+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt230);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(5+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt231);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(6+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt232);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(7+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt233);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(8+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt234);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(9+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt235);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(10+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt236);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(11+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt237);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(12+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt238);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(13+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt239);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(14+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt240);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(15+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt241);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(16+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt242);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(1+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt227);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(2+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt228);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(3+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt229);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(4+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt230);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(5+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt231);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(6+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt232);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(7+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt233);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(8+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt234);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(9+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt235);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(10+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt236);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(11+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt237);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(12+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt238);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(13+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt239);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(14+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt240);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(15+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt241);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(16+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt242);
}
break;
}
// Default branch: packs 16 consecutive values per row into the arranged7
// buffer for the case where the write position inside a 6-wide group is 4.
// The switch header is outside this view; cut10 is simply forced to the
// constant 4 here. With cut10 == 4 the four store masks used below are
//   63>>4              = 0x0003 (lanes 0-1)
//   4032>>4            = 0x00FC (lanes 2-7)
//   258048>>4          = 0x3F00 (lanes 8-13)
//   65535-(262143>>4)  = 0xC000 (lanes 14-15)
// i.e. the 16 lanes are split 2 + 6 + 6 + 2 over four consecutive groups.
// NOTE(review): the address arithmetic below only makes sense if arranged7,
// wtPtr8, biasPtr8 and bnPtr8 are byte (char*) pointers -- their declarations
// are not visible here; confirm.
default: {
cut10 = 4;
// Load 16 floats from biasPtr8 (presumably the per-output-channel bias).
__m512 sum206 = _mm512_maskz_loadu_ps(65535, biasPtr8+2560*i31+4*k99);
// Index vectors for permutex2var: pmMul16 picks the even elements (0,2,...,30)
// and pmAdd16 the odd elements (1,3,...,31) of the 32 floats formed by
// masLo13 followed by masHi13.
__m512i pmMul16 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd16 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// Two loads from bnPtr8; the second starts 64 units after the first, so
// together they cover 32 consecutive floats if bnPtr8 is a byte pointer.
// Presumably these are interleaved (multiplier, addend) pairs produced from
// the BatchNorm parameters -- TODO confirm against the code that fills bnPtr8.
__m512 masLo13 = _mm512_loadu_ps(bnPtr8+(ptrdiff_t)8*(k99+640*i31));
__m512 masHi13 = _mm512_maskz_loadu_ps(65535, bnPtr8+(ptrdiff_t)8*(k99+640*i31)+(ptrdiff_t)64);
// De-interleave into one vector of 16 multipliers and one of 16 addends.
__m512 postMul26 = _mm512_permutex2var_ps(masLo13, pmMul16, masHi13);
__m512 postAdd16 = _mm512_permutex2var_ps(masLo13, pmAdd16, masHi13);
// sum206 = sum206 * postMul26 + postAdd16 (scale and shift folded into the bias).
sum206 = _mm512_fmadd_ps(sum206, postMul26, postAdd16);
// Write the 16 folded values into row 0 (the "24*0" term) of four consecutive
// groups. Each masked store writes a disjoint subset of lanes; the +6144,
// +12288 and +18432 offsets are multiples of 6144 = 6168 - 24, which (in
// bytes) places lane 2, lane 8 and lane 14 exactly at 6168, 2*6168 and 3*6168
// past arranged7+657920*i31+6168*l38.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)0, 63>>cut10, sum206);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)6144, 4032>>cut10, sum206);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)12288, 258048>>cut10, sum206);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)18432, 65535-(262143>>cut10), sum206);
// Process the weights in 16 tiles of 16x16 floats (c22 = tile index).
ptrdiff_t c22 = 0;
for (; c22 != 16; ++c22) {
// Load 16 vectors whose start addresses are 1024 apart; each vector covers
// 16 consecutive floats starting at offset 64*c22 within its 1024-stride row.
__m512 wt243 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)0);
__m512 wt244 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)1024);
__m512 wt245 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)2048);
__m512 wt246 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)3072);
__m512 wt247 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)4096);
__m512 wt248 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)5120);
__m512 wt249 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)6144);
__m512 wt250 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)7168);
__m512 wt251 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)8192);
__m512 wt252 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)9216);
__m512 wt253 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)10240);
__m512 wt254 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)11264);
__m512 wt255 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)12288);
__m512 wt256 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)13312);
__m512 wt257 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)14336);
__m512 wt258 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)15360);
// 16x16 transpose, stage 1: interleave 32-bit elements of adjacent row pairs
// within each 128-bit lane.
__m512 tmp10601 = _mm512_unpacklo_ps(wt243, wt244);
__m512 tmp10602 = _mm512_unpackhi_ps(wt243, wt244);
__m512 tmp10603 = _mm512_unpacklo_ps(wt245, wt246);
__m512 tmp10604 = _mm512_unpackhi_ps(wt245, wt246);
__m512 tmp10605 = _mm512_unpacklo_ps(wt247, wt248);
__m512 tmp10606 = _mm512_unpackhi_ps(wt247, wt248);
__m512 tmp10607 = _mm512_unpacklo_ps(wt249, wt250);
__m512 tmp10608 = _mm512_unpackhi_ps(wt249, wt250);
__m512 tmp10609 = _mm512_unpacklo_ps(wt251, wt252);
__m512 tmp10610 = _mm512_unpackhi_ps(wt251, wt252);
__m512 tmp10611 = _mm512_unpacklo_ps(wt253, wt254);
__m512 tmp10612 = _mm512_unpackhi_ps(wt253, wt254);
__m512 tmp10613 = _mm512_unpacklo_ps(wt255, wt256);
__m512 tmp10614 = _mm512_unpackhi_ps(wt255, wt256);
__m512 tmp10615 = _mm512_unpacklo_ps(wt257, wt258);
__m512 tmp10616 = _mm512_unpackhi_ps(wt257, wt258);
// Stage 2: combine 64-bit halves (imm 68 = low halves, 238 = high halves) so
// each 128-bit lane holds one column of a 4-row group.
__m512 tmp10617 = _mm512_shuffle_ps(tmp10601, tmp10603, 68);
__m512 tmp10618 = _mm512_shuffle_ps(tmp10601, tmp10603, 238);
__m512 tmp10619 = _mm512_shuffle_ps(tmp10602, tmp10604, 68);
__m512 tmp10620 = _mm512_shuffle_ps(tmp10602, tmp10604, 238);
__m512 tmp10621 = _mm512_shuffle_ps(tmp10605, tmp10607, 68);
__m512 tmp10622 = _mm512_shuffle_ps(tmp10605, tmp10607, 238);
__m512 tmp10623 = _mm512_shuffle_ps(tmp10606, tmp10608, 68);
__m512 tmp10624 = _mm512_shuffle_ps(tmp10606, tmp10608, 238);
__m512 tmp10625 = _mm512_shuffle_ps(tmp10609, tmp10611, 68);
__m512 tmp10626 = _mm512_shuffle_ps(tmp10609, tmp10611, 238);
__m512 tmp10627 = _mm512_shuffle_ps(tmp10610, tmp10612, 68);
__m512 tmp10628 = _mm512_shuffle_ps(tmp10610, tmp10612, 238);
__m512 tmp10629 = _mm512_shuffle_ps(tmp10613, tmp10615, 68);
__m512 tmp10630 = _mm512_shuffle_ps(tmp10613, tmp10615, 238);
__m512 tmp10631 = _mm512_shuffle_ps(tmp10614, tmp10616, 68);
__m512 tmp10632 = _mm512_shuffle_ps(tmp10614, tmp10616, 238);
// Stage 3: shuffle whole 128-bit lanes (imm 136 = lanes 0,2 of each source,
// 221 = lanes 1,3 of each source) to gather rows 0-7 and rows 8-15.
__m512 tmp10633 = _mm512_shuffle_f32x4(tmp10617, tmp10621, 136);
__m512 tmp10634 = _mm512_shuffle_f32x4(tmp10617, tmp10621, 221);
__m512 tmp10635 = _mm512_shuffle_f32x4(tmp10618, tmp10622, 136);
__m512 tmp10636 = _mm512_shuffle_f32x4(tmp10618, tmp10622, 221);
__m512 tmp10637 = _mm512_shuffle_f32x4(tmp10619, tmp10623, 136);
__m512 tmp10638 = _mm512_shuffle_f32x4(tmp10619, tmp10623, 221);
__m512 tmp10639 = _mm512_shuffle_f32x4(tmp10620, tmp10624, 136);
__m512 tmp10640 = _mm512_shuffle_f32x4(tmp10620, tmp10624, 221);
__m512 tmp10641 = _mm512_shuffle_f32x4(tmp10625, tmp10629, 136);
__m512 tmp10642 = _mm512_shuffle_f32x4(tmp10625, tmp10629, 221);
__m512 tmp10643 = _mm512_shuffle_f32x4(tmp10626, tmp10630, 136);
__m512 tmp10644 = _mm512_shuffle_f32x4(tmp10626, tmp10630, 221);
__m512 tmp10645 = _mm512_shuffle_f32x4(tmp10627, tmp10631, 136);
__m512 tmp10646 = _mm512_shuffle_f32x4(tmp10627, tmp10631, 221);
__m512 tmp10647 = _mm512_shuffle_f32x4(tmp10628, tmp10632, 136);
__m512 tmp10648 = _mm512_shuffle_f32x4(tmp10628, tmp10632, 221);
// Stage 4: final lane shuffle. Afterwards wt(243+n) holds element n of each of
// the 16 originally loaded vectors, i.e. lane m comes from the vector loaded
// at offset 1024*m. Note the assignment order is not sequential in n.
wt243 = _mm512_shuffle_f32x4(tmp10633, tmp10641, 136);
wt251 = _mm512_shuffle_f32x4(tmp10633, tmp10641, 221);
wt244 = _mm512_shuffle_f32x4(tmp10635, tmp10643, 136);
wt252 = _mm512_shuffle_f32x4(tmp10635, tmp10643, 221);
wt245 = _mm512_shuffle_f32x4(tmp10637, tmp10645, 136);
wt253 = _mm512_shuffle_f32x4(tmp10637, tmp10645, 221);
wt246 = _mm512_shuffle_f32x4(tmp10639, tmp10647, 136);
wt254 = _mm512_shuffle_f32x4(tmp10639, tmp10647, 221);
wt247 = _mm512_shuffle_f32x4(tmp10634, tmp10642, 136);
wt255 = _mm512_shuffle_f32x4(tmp10634, tmp10642, 221);
wt248 = _mm512_shuffle_f32x4(tmp10636, tmp10644, 136);
wt256 = _mm512_shuffle_f32x4(tmp10636, tmp10644, 221);
wt249 = _mm512_shuffle_f32x4(tmp10638, tmp10646, 136);
wt257 = _mm512_shuffle_f32x4(tmp10638, tmp10646, 221);
wt250 = _mm512_shuffle_f32x4(tmp10640, tmp10648, 136);
wt258 = _mm512_shuffle_f32x4(tmp10640, tmp10648, 221);
// Scale every transposed vector lane-wise by the same multipliers that were
// applied to sum206 above.
wt243 = _mm512_mul_ps(wt243, postMul26);
wt244 = _mm512_mul_ps(wt244, postMul26);
wt245 = _mm512_mul_ps(wt245, postMul26);
wt246 = _mm512_mul_ps(wt246, postMul26);
wt247 = _mm512_mul_ps(wt247, postMul26);
wt248 = _mm512_mul_ps(wt248, postMul26);
wt249 = _mm512_mul_ps(wt249, postMul26);
wt250 = _mm512_mul_ps(wt250, postMul26);
wt251 = _mm512_mul_ps(wt251, postMul26);
wt252 = _mm512_mul_ps(wt252, postMul26);
wt253 = _mm512_mul_ps(wt253, postMul26);
wt254 = _mm512_mul_ps(wt254, postMul26);
wt255 = _mm512_mul_ps(wt255, postMul26);
wt256 = _mm512_mul_ps(wt256, postMul26);
wt257 = _mm512_mul_ps(wt257, postMul26);
wt258 = _mm512_mul_ps(wt258, postMul26);
// Store rows 1+16*c22 .. 16+16*c22 (row 0 holds sum206). First group:
// lanes 0-1 only.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(1+16*c22)+(ptrdiff_t)0, 63>>cut10, wt243);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(2+16*c22)+(ptrdiff_t)0, 63>>cut10, wt244);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(3+16*c22)+(ptrdiff_t)0, 63>>cut10, wt245);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(4+16*c22)+(ptrdiff_t)0, 63>>cut10, wt246);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(5+16*c22)+(ptrdiff_t)0, 63>>cut10, wt247);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(6+16*c22)+(ptrdiff_t)0, 63>>cut10, wt248);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(7+16*c22)+(ptrdiff_t)0, 63>>cut10, wt249);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(8+16*c22)+(ptrdiff_t)0, 63>>cut10, wt250);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(9+16*c22)+(ptrdiff_t)0, 63>>cut10, wt251);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(10+16*c22)+(ptrdiff_t)0, 63>>cut10, wt252);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(11+16*c22)+(ptrdiff_t)0, 63>>cut10, wt253);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(12+16*c22)+(ptrdiff_t)0, 63>>cut10, wt254);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(13+16*c22)+(ptrdiff_t)0, 63>>cut10, wt255);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(14+16*c22)+(ptrdiff_t)0, 63>>cut10, wt256);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(15+16*c22)+(ptrdiff_t)0, 63>>cut10, wt257);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(16+16*c22)+(ptrdiff_t)0, 63>>cut10, wt258);
// Second group (+6144): lanes 2-7.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(1+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt243);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(2+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt244);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(3+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt245);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(4+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt246);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(5+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt247);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(6+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt248);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(7+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt249);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(8+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt250);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(9+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt251);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(10+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt252);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(11+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt253);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(12+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt254);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(13+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt255);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(14+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt256);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(15+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt257);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(16+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt258);
// Third group (+12288): lanes 8-13.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(1+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt243);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(2+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt244);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(3+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt245);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(4+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt246);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(5+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt247);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(6+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt248);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(7+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt249);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(8+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt250);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(9+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt251);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(10+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt252);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(11+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt253);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(12+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt254);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(13+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt255);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(14+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt256);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(15+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt257);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(16+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt258);
// Fourth group (+18432): lanes 14-15.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(1+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt243);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(2+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt244);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(3+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt245);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(4+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt246);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(5+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt247);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(6+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt248);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(7+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt249);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(8+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt250);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(9+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt251);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(10+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt252);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(11+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt253);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(12+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt254);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(13+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt255);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(14+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt256);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(15+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt257);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(16+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt258);
}
}
}
} else if (j24 < 39) {
// k101 = 16*(j24-32): index of the first of the 16 entries handled by this
// iteration, relative to the biasPtr9/bnPtr9/wtPtr9 arrays used below.
ptrdiff_t k101 = 0+16*(j24-32);
// The position 512+k101 is split into a group number (quotient by 6, scaled by
// 6168 in the store addresses below) and a position inside that group
// (remainder by 6, used to select the switch case and to shift the store masks).
// NOTE(review): the constant 512 presumably is the number of entries already
// emitted by the preceding branch -- confirm against the j24 < 32 branch.
ptrdiff_t l40 = (size_t)(512+k101)/6;
ptrdiff_t cut12 = (size_t)(512+k101)%6;
switch (cut12) {
// Cases cut12 == 0 and cut12 == 2 share one body: the 16 lanes of every vector
// are spread over three consecutive 6-wide groups. The store masks evaluate to
//   cut12 == 0: 63 (lanes 0-5), 4032 (lanes 6-11), 65535-4095 (lanes 12-15)
//   cut12 == 2: 15 (lanes 0-3), 1008 (lanes 4-9),  65535-1023 (lanes 10-15)
// NOTE(review): the address arithmetic below only makes sense if arranged7,
// wtPtr9, biasPtr9 and bnPtr9 are byte (char*) pointers -- their declarations
// are not visible here; confirm.
case 0:;
case 2: {
// Load 16 floats from biasPtr9 (presumably the per-output-channel bias).
__m512 sum208 = _mm512_maskz_loadu_ps(65535, biasPtr9+2560*i31+4*k101);
// Index vectors for permutex2var: pmMul17 picks the even elements (0,2,...,30)
// and pmAdd17 the odd elements (1,3,...,31) of the 32 floats formed by
// masLo14 followed by masHi14.
__m512i pmMul17 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd17 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// Two loads from bnPtr9; the second starts 64 units after the first, so
// together they cover 32 consecutive floats if bnPtr9 is a byte pointer.
// Presumably these are interleaved (multiplier, addend) pairs produced from
// the BatchNorm parameters -- TODO confirm against the code that fills bnPtr9.
__m512 masLo14 = _mm512_loadu_ps(bnPtr9+(ptrdiff_t)8*(k101+640*i31));
__m512 masHi14 = _mm512_maskz_loadu_ps(65535, bnPtr9+(ptrdiff_t)8*(k101+640*i31)+(ptrdiff_t)64);
// De-interleave into one vector of 16 multipliers and one of 16 addends.
__m512 postMul28 = _mm512_permutex2var_ps(masLo14, pmMul17, masHi14);
__m512 postAdd18 = _mm512_permutex2var_ps(masLo14, pmAdd17, masHi14);
// sum208 = sum208 * postMul28 + postAdd18 (scale and shift folded into the bias).
sum208 = _mm512_fmadd_ps(sum208, postMul28, postAdd18);
// Write the 16 folded values into row 0 (the "24*0" term) of three consecutive
// groups. The +6144 and +12288 offsets are multiples of 6144 = 6168 - 24,
// which (in bytes) places the first lane of the second and third masks exactly
// at 6168 and 2*6168 past arranged7+657920*i31+6168*l40.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*0+(ptrdiff_t)0, 63>>cut12, sum208);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*0+(ptrdiff_t)6144, 4032>>cut12, sum208);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*0+(ptrdiff_t)12288, 65535-(4095>>cut12), sum208);
// Process the weights in 16 tiles of 16x16 floats (c24 = tile index).
ptrdiff_t c24 = 0;
for (; c24 != 16; ++c24) {
// Load 16 vectors whose start addresses are 1024 apart; each vector covers
// 16 consecutive floats starting at offset 64*c24 within its 1024-stride row.
__m512 wt275 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)0);
__m512 wt276 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)1024);
__m512 wt277 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)2048);
__m512 wt278 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)3072);
__m512 wt279 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)4096);
__m512 wt280 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)5120);
__m512 wt281 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)6144);
__m512 wt282 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)7168);
__m512 wt283 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)8192);
__m512 wt284 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)9216);
__m512 wt285 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)10240);
__m512 wt286 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)11264);
__m512 wt287 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)12288);
__m512 wt288 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)13312);
__m512 wt289 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)14336);
__m512 wt290 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)15360);
// 16x16 transpose, stage 1: interleave 32-bit elements of adjacent row pairs
// within each 128-bit lane.
__m512 tmp10649 = _mm512_unpacklo_ps(wt275, wt276);
__m512 tmp10650 = _mm512_unpackhi_ps(wt275, wt276);
__m512 tmp10651 = _mm512_unpacklo_ps(wt277, wt278);
__m512 tmp10652 = _mm512_unpackhi_ps(wt277, wt278);
__m512 tmp10653 = _mm512_unpacklo_ps(wt279, wt280);
__m512 tmp10654 = _mm512_unpackhi_ps(wt279, wt280);
__m512 tmp10655 = _mm512_unpacklo_ps(wt281, wt282);
__m512 tmp10656 = _mm512_unpackhi_ps(wt281, wt282);
__m512 tmp10657 = _mm512_unpacklo_ps(wt283, wt284);
__m512 tmp10658 = _mm512_unpackhi_ps(wt283, wt284);
__m512 tmp10659 = _mm512_unpacklo_ps(wt285, wt286);
__m512 tmp10660 = _mm512_unpackhi_ps(wt285, wt286);
__m512 tmp10661 = _mm512_unpacklo_ps(wt287, wt288);
__m512 tmp10662 = _mm512_unpackhi_ps(wt287, wt288);
__m512 tmp10663 = _mm512_unpacklo_ps(wt289, wt290);
__m512 tmp10664 = _mm512_unpackhi_ps(wt289, wt290);
// Stage 2: combine 64-bit halves (imm 68 = low halves, 238 = high halves) so
// each 128-bit lane holds one column of a 4-row group.
__m512 tmp10665 = _mm512_shuffle_ps(tmp10649, tmp10651, 68);
__m512 tmp10666 = _mm512_shuffle_ps(tmp10649, tmp10651, 238);
__m512 tmp10667 = _mm512_shuffle_ps(tmp10650, tmp10652, 68);
__m512 tmp10668 = _mm512_shuffle_ps(tmp10650, tmp10652, 238);
__m512 tmp10669 = _mm512_shuffle_ps(tmp10653, tmp10655, 68);
__m512 tmp10670 = _mm512_shuffle_ps(tmp10653, tmp10655, 238);
__m512 tmp10671 = _mm512_shuffle_ps(tmp10654, tmp10656, 68);
__m512 tmp10672 = _mm512_shuffle_ps(tmp10654, tmp10656, 238);
__m512 tmp10673 = _mm512_shuffle_ps(tmp10657, tmp10659, 68);
__m512 tmp10674 = _mm512_shuffle_ps(tmp10657, tmp10659, 238);
__m512 tmp10675 = _mm512_shuffle_ps(tmp10658, tmp10660, 68);
__m512 tmp10676 = _mm512_shuffle_ps(tmp10658, tmp10660, 238);
__m512 tmp10677 = _mm512_shuffle_ps(tmp10661, tmp10663, 68);
__m512 tmp10678 = _mm512_shuffle_ps(tmp10661, tmp10663, 238);
__m512 tmp10679 = _mm512_shuffle_ps(tmp10662, tmp10664, 68);
__m512 tmp10680 = _mm512_shuffle_ps(tmp10662, tmp10664, 238);
// Stage 3: shuffle whole 128-bit lanes (imm 136 = lanes 0,2 of each source,
// 221 = lanes 1,3 of each source) to gather rows 0-7 and rows 8-15.
__m512 tmp10681 = _mm512_shuffle_f32x4(tmp10665, tmp10669, 136);
__m512 tmp10682 = _mm512_shuffle_f32x4(tmp10665, tmp10669, 221);
__m512 tmp10683 = _mm512_shuffle_f32x4(tmp10666, tmp10670, 136);
__m512 tmp10684 = _mm512_shuffle_f32x4(tmp10666, tmp10670, 221);
__m512 tmp10685 = _mm512_shuffle_f32x4(tmp10667, tmp10671, 136);
__m512 tmp10686 = _mm512_shuffle_f32x4(tmp10667, tmp10671, 221);
__m512 tmp10687 = _mm512_shuffle_f32x4(tmp10668, tmp10672, 136);
__m512 tmp10688 = _mm512_shuffle_f32x4(tmp10668, tmp10672, 221);
__m512 tmp10689 = _mm512_shuffle_f32x4(tmp10673, tmp10677, 136);
__m512 tmp10690 = _mm512_shuffle_f32x4(tmp10673, tmp10677, 221);
__m512 tmp10691 = _mm512_shuffle_f32x4(tmp10674, tmp10678, 136);
__m512 tmp10692 = _mm512_shuffle_f32x4(tmp10674, tmp10678, 221);
__m512 tmp10693 = _mm512_shuffle_f32x4(tmp10675, tmp10679, 136);
__m512 tmp10694 = _mm512_shuffle_f32x4(tmp10675, tmp10679, 221);
__m512 tmp10695 = _mm512_shuffle_f32x4(tmp10676, tmp10680, 136);
__m512 tmp10696 = _mm512_shuffle_f32x4(tmp10676, tmp10680, 221);
// Stage 4: final lane shuffle. Afterwards wt(275+n) holds element n of each of
// the 16 originally loaded vectors, i.e. lane m comes from the vector loaded
// at offset 1024*m. Note the assignment order is not sequential in n.
wt275 = _mm512_shuffle_f32x4(tmp10681, tmp10689, 136);
wt283 = _mm512_shuffle_f32x4(tmp10681, tmp10689, 221);
wt276 = _mm512_shuffle_f32x4(tmp10683, tmp10691, 136);
wt284 = _mm512_shuffle_f32x4(tmp10683, tmp10691, 221);
wt277 = _mm512_shuffle_f32x4(tmp10685, tmp10693, 136);
wt285 = _mm512_shuffle_f32x4(tmp10685, tmp10693, 221);
wt278 = _mm512_shuffle_f32x4(tmp10687, tmp10695, 136);
wt286 = _mm512_shuffle_f32x4(tmp10687, tmp10695, 221);
wt279 = _mm512_shuffle_f32x4(tmp10682, tmp10690, 136);
wt287 = _mm512_shuffle_f32x4(tmp10682, tmp10690, 221);
wt280 = _mm512_shuffle_f32x4(tmp10684, tmp10692, 136);
wt288 = _mm512_shuffle_f32x4(tmp10684, tmp10692, 221);
wt281 = _mm512_shuffle_f32x4(tmp10686, tmp10694, 136);
wt289 = _mm512_shuffle_f32x4(tmp10686, tmp10694, 221);
wt282 = _mm512_shuffle_f32x4(tmp10688, tmp10696, 136);
wt290 = _mm512_shuffle_f32x4(tmp10688, tmp10696, 221);
// Scale every transposed vector lane-wise by the same multipliers that were
// applied to sum208 above.
wt275 = _mm512_mul_ps(wt275, postMul28);
wt276 = _mm512_mul_ps(wt276, postMul28);
wt277 = _mm512_mul_ps(wt277, postMul28);
wt278 = _mm512_mul_ps(wt278, postMul28);
wt279 = _mm512_mul_ps(wt279, postMul28);
wt280 = _mm512_mul_ps(wt280, postMul28);
wt281 = _mm512_mul_ps(wt281, postMul28);
wt282 = _mm512_mul_ps(wt282, postMul28);
wt283 = _mm512_mul_ps(wt283, postMul28);
wt284 = _mm512_mul_ps(wt284, postMul28);
wt285 = _mm512_mul_ps(wt285, postMul28);
wt286 = _mm512_mul_ps(wt286, postMul28);
wt287 = _mm512_mul_ps(wt287, postMul28);
wt288 = _mm512_mul_ps(wt288, postMul28);
wt289 = _mm512_mul_ps(wt289, postMul28);
wt290 = _mm512_mul_ps(wt290, postMul28);
// Store rows 1+16*c24 .. 16+16*c24 (row 0 holds sum208). First group: the
// lowest 6-cut12 lanes.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(1+16*c24)+(ptrdiff_t)0, 63>>cut12, wt275);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(2+16*c24)+(ptrdiff_t)0, 63>>cut12, wt276);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(3+16*c24)+(ptrdiff_t)0, 63>>cut12, wt277);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(4+16*c24)+(ptrdiff_t)0, 63>>cut12, wt278);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(5+16*c24)+(ptrdiff_t)0, 63>>cut12, wt279);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(6+16*c24)+(ptrdiff_t)0, 63>>cut12, wt280);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(7+16*c24)+(ptrdiff_t)0, 63>>cut12, wt281);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(8+16*c24)+(ptrdiff_t)0, 63>>cut12, wt282);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(9+16*c24)+(ptrdiff_t)0, 63>>cut12, wt283);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(10+16*c24)+(ptrdiff_t)0, 63>>cut12, wt284);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(11+16*c24)+(ptrdiff_t)0, 63>>cut12, wt285);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(12+16*c24)+(ptrdiff_t)0, 63>>cut12, wt286);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(13+16*c24)+(ptrdiff_t)0, 63>>cut12, wt287);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(14+16*c24)+(ptrdiff_t)0, 63>>cut12, wt288);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(15+16*c24)+(ptrdiff_t)0, 63>>cut12, wt289);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(16+16*c24)+(ptrdiff_t)0, 63>>cut12, wt290);
// Second group (+6144): the next 6 lanes.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(1+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt275);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(2+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt276);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(3+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt277);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(4+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt278);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(5+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt279);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(6+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt280);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(7+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt281);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(8+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt282);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(9+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt283);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(10+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt284);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(11+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt285);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(12+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt286);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(13+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt287);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(14+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt288);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(15+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt289);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(16+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt290);
// Third group (+12288): all remaining high lanes.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(1+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt275);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(2+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt276);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(3+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt277);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(4+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt278);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(5+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt279);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(6+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt280);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(7+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt281);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(8+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt282);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(9+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt283);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(10+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt284);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(11+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt285);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(12+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt286);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(13+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt287);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(14+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt288);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(15+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt289);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(16+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt290);
}
break;
}
default: {
cut12 = 4;
__m512 sum209 = _mm512_maskz_loadu_ps(65535, biasPtr9+2560*i31+4*k101);
__m512i pmMul18 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd18 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo15 = _mm512_loadu_ps(bnPtr9+(ptrdiff_t)8*(k101+640*i31));
__m512 masHi15 = _mm512_maskz_loadu_ps(65535, bnPtr9+(ptrdiff_t)8*(k101+640*i31)+(ptrdiff_t)64);
__m512 postMul29 = _mm512_permutex2var_ps(masLo15, pmMul18, masHi15);
__m512 postAdd19 = _mm512_permutex2var_ps(masLo15, pmAdd18, masHi15);
sum209 = _mm512_fmadd_ps(sum209, postMul29, postAdd19);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*0+(ptrdiff_t)0, 63>>cut12, sum209);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*0+(ptrdiff_t)6144, 4032>>cut12, sum209);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*0+(ptrdiff_t)12288, 258048>>cut12, sum209);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*0+(ptrdiff_t)18432, 65535-(262143>>cut12), sum209);
ptrdiff_t c25 = 0;
for (; c25 != 16; ++c25) {
__m512 wt291 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)0);
__m512 wt292 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)1024);
__m512 wt293 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)2048);
__m512 wt294 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)3072);
__m512 wt295 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)4096);
__m512 wt296 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)5120);
__m512 wt297 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)6144);
__m512 wt298 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)7168);
__m512 wt299 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)8192);
__m512 wt300 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)9216);
__m512 wt301 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)10240);
__m512 wt302 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)11264);
__m512 wt303 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)12288);
__m512 wt304 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)13312);
__m512 wt305 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)14336);
__m512 wt306 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)15360);
__m512 tmp10697 = _mm512_unpacklo_ps(wt291, wt292);
__m512 tmp10698 = _mm512_unpackhi_ps(wt291, wt292);
__m512 tmp10699 = _mm512_unpacklo_ps(wt293, wt294);
__m512 tmp10700 = _mm512_unpackhi_ps(wt293, wt294);
__m512 tmp10701 = _mm512_unpacklo_ps(wt295, wt296);
__m512 tmp10702 = _mm512_unpackhi_ps(wt295, wt296);
__m512 tmp10703 = _mm512_unpacklo_ps(wt297, wt298);
__m512 tmp10704 = _mm512_unpackhi_ps(wt297, wt298);
__m512 tmp10705 = _mm512_unpacklo_ps(wt299, wt300);
__m512 tmp10706 = _mm512_unpackhi_ps(wt299, wt300);
__m512 tmp10707 = _mm512_unpacklo_ps(wt301, wt302);
__m512 tmp10708 = _mm512_unpackhi_ps(wt301, wt302);
__m512 tmp10709 = _mm512_unpacklo_ps(wt303, wt304);
__m512 tmp10710 = _mm512_unpackhi_ps(wt303, wt304);
__m512 tmp10711 = _mm512_unpacklo_ps(wt305, wt306);
__m512 tmp10712 = _mm512_unpackhi_ps(wt305, wt306);
__m512 tmp10713 = _mm512_shuffle_ps(tmp10697, tmp10699, 68);
__m512 tmp10714 = _mm512_shuffle_ps(tmp10697, tmp10699, 238);
__m512 tmp10715 = _mm512_shuffle_ps(tmp10698, tmp10700, 68);
__m512 tmp10716 = _mm512_shuffle_ps(tmp10698, tmp10700, 238);
__m512 tmp10717 = _mm512_shuffle_ps(tmp10701, tmp10703, 68);
__m512 tmp10718 = _mm512_shuffle_ps(tmp10701, tmp10703, 238);
__m512 tmp10719 = _mm512_shuffle_ps(tmp10702, tmp10704, 68);
__m512 tmp10720 = _mm512_shuffle_ps(tmp10702, tmp10704, 238);
__m512 tmp10721 = _mm512_shuffle_ps(tmp10705, tmp10707, 68);
__m512 tmp10722 = _mm512_shuffle_ps(tmp10705, tmp10707, 238);
__m512 tmp10723 = _mm512_shuffle_ps(tmp10706, tmp10708, 68);
__m512 tmp10724 = _mm512_shuffle_ps(tmp10706, tmp10708, 238);
__m512 tmp10725 = _mm512_shuffle_ps(tmp10709, tmp10711, 68);
__m512 tmp10726 = _mm512_shuffle_ps(tmp10709, tmp10711, 238);
__m512 tmp10727 = _mm512_shuffle_ps(tmp10710, tmp10712, 68);
__m512 tmp10728 = _mm512_shuffle_ps(tmp10710, tmp10712, 238);
__m512 tmp10729 = _mm512_shuffle_f32x4(tmp10713, tmp10717, 136);
__m512 tmp10730 = _mm512_shuffle_f32x4(tmp10713, tmp10717, 221);
__m512 tmp10731 = _mm512_shuffle_f32x4(tmp10714, tmp10718, 136);
__m512 tmp10732 = _mm512_shuffle_f32x4(tmp10714, tmp10718, 221);
__m512 tmp10733 = _mm512_shuffle_f32x4(tmp10715, tmp10719, 136);
__m512 tmp10734 = _mm512_shuffle_f32x4(tmp10715, tmp10719, 221);
__m512 tmp10735 = _mm512_shuffle_f32x4(tmp10716, tmp10720, 136);
__m512 tmp10736 = _mm512_shuffle_f32x4(tmp10716, tmp10720, 221);
__m512 tmp10737 = _mm512_shuffle_f32x4(tmp10721, tmp10725, 136);
__m512 tmp10738 = _mm512_shuffle_f32x4(tmp10721, tmp10725, 221);
__m512 tmp10739 = _mm512_shuffle_f32x4(tmp10722, tmp10726, 136);
__m512 tmp10740 = _mm512_shuffle_f32x4(tmp10722, tmp10726, 221);
__m512 tmp10741 = _mm512_shuffle_f32x4(tmp10723, tmp10727, 136);
__m512 tmp10742 = _mm512_shuffle_f32x4(tmp10723, tmp10727, 221);
__m512 tmp10743 = _mm512_shuffle_f32x4(tmp10724, tmp10728, 136);
__m512 tmp10744 = _mm512_shuffle_f32x4(tmp10724, tmp10728, 221);
wt291 = _mm512_shuffle_f32x4(tmp10729, tmp10737, 136);
wt299 = _mm512_shuffle_f32x4(tmp10729, tmp10737, 221);
wt292 = _mm512_shuffle_f32x4(tmp10731, tmp10739, 136);
wt300 = _mm512_shuffle_f32x4(tmp10731, tmp10739, 221);
wt293 = _mm512_shuffle_f32x4(tmp10733, tmp10741, 136);
wt301 = _mm512_shuffle_f32x4(tmp10733, tmp10741, 221);
wt294 = _mm512_shuffle_f32x4(tmp10735, tmp10743, 136);
wt302 = _mm512_shuffle_f32x4(tmp10735, tmp10743, 221);
wt295 = _mm512_shuffle_f32x4(tmp10730, tmp10738, 136);
wt303 = _mm512_shuffle_f32x4(tmp10730, tmp10738, 221);
wt296 = _mm512_shuffle_f32x4(tmp10732, tmp10740, 136);
wt304 = _mm512_shuffle_f32x4(tmp10732, tmp10740, 221);
wt297 = _mm512_shuffle_f32x4(tmp10734, tmp10742, 136);
wt305 = _mm512_shuffle_f32x4(tmp10734, tmp10742, 221);
wt298 = _mm512_shuffle_f32x4(tmp10736, tmp10744, 136);
wt306 = _mm512_shuffle_f32x4(tmp10736, tmp10744, 221);
wt291 = _mm512_mul_ps(wt291, postMul29);
wt292 = _mm512_mul_ps(wt292, postMul29);
wt293 = _mm512_mul_ps(wt293, postMul29);
wt294 = _mm512_mul_ps(wt294, postMul29);
wt295 = _mm512_mul_ps(wt295, postMul29);
wt296 = _mm512_mul_ps(wt296, postMul29);
wt297 = _mm512_mul_ps(wt297, postMul29);
wt298 = _mm512_mul_ps(wt298, postMul29);
wt299 = _mm512_mul_ps(wt299, postMul29);
wt300 = _mm512_mul_ps(wt300, postMul29);
wt301 = _mm512_mul_ps(wt301, postMul29);
wt302 = _mm512_mul_ps(wt302, postMul29);
wt303 = _mm512_mul_ps(wt303, postMul29);
wt304 = _mm512_mul_ps(wt304, postMul29);
wt305 = _mm512_mul_ps(wt305, postMul29);
wt306 = _mm512_mul_ps(wt306, postMul29);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(1+16*c25)+(ptrdiff_t)0, 63>>cut12, wt291);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(2+16*c25)+(ptrdiff_t)0, 63>>cut12, wt292);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(3+16*c25)+(ptrdiff_t)0, 63>>cut12, wt293);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(4+16*c25)+(ptrdiff_t)0, 63>>cut12, wt294);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(5+16*c25)+(ptrdiff_t)0, 63>>cut12, wt295);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(6+16*c25)+(ptrdiff_t)0, 63>>cut12, wt296);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(7+16*c25)+(ptrdiff_t)0, 63>>cut12, wt297);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(8+16*c25)+(ptrdiff_t)0, 63>>cut12, wt298);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(9+16*c25)+(ptrdiff_t)0, 63>>cut12, wt299);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(10+16*c25)+(ptrdiff_t)0, 63>>cut12, wt300);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(11+16*c25)+(ptrdiff_t)0, 63>>cut12, wt301);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(12+16*c25)+(ptrdiff_t)0, 63>>cut12, wt302);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(13+16*c25)+(ptrdiff_t)0, 63>>cut12, wt303);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(14+16*c25)+(ptrdiff_t)0, 63>>cut12, wt304);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(15+16*c25)+(ptrdiff_t)0, 63>>cut12, wt305);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(16+16*c25)+(ptrdiff_t)0, 63>>cut12, wt306);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(1+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt291);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(2+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt292);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(3+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt293);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(4+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt294);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(5+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt295);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(6+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt296);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(7+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt297);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(8+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt298);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(9+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt299);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(10+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt300);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(11+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt301);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(12+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt302);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(13+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt303);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(14+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt304);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(15+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt305);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(16+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt306);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(1+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt291);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(2+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt292);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(3+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt293);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(4+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt294);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(5+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt295);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(6+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt296);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(7+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt297);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(8+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt298);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(9+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt299);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(10+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt300);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(11+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt301);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(12+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt302);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(13+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt303);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(14+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt304);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(15+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt305);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(16+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt306);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(1+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt291);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(2+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt292);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(3+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt293);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(4+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt294);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(5+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt295);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(6+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt296);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(7+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt297);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(8+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt298);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(9+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt299);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(10+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt300);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(11+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt301);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(12+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt302);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(13+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt303);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(14+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt304);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(15+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt305);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(16+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt306);
}
}
}
} else {
ptrdiff_t k100 = 112;
ptrdiff_t l39 = (size_t)(512+k100)/6;
ptrdiff_t cut11 = (size_t)(512+k100)%6;
__m512 sum207 = _mm512_maskz_loadu_ps(65535, biasPtr9+2560*i31+4*k100);
__m512i pmMul19 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd19 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo16 = _mm512_loadu_ps(bnPtr9+(ptrdiff_t)8*(k100+640*i31));
__m512 masHi16 = _mm512_maskz_loadu_ps(65535, bnPtr9+(ptrdiff_t)8*(k100+640*i31)+(ptrdiff_t)64);
__m512 postMul27 = _mm512_permutex2var_ps(masLo16, pmMul19, masHi16);
__m512 postAdd17 = _mm512_permutex2var_ps(masLo16, pmAdd19, masHi16);
sum207 = _mm512_fmadd_ps(sum207, postMul27, postAdd17);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*0+(ptrdiff_t)0, 63>>cut11, sum207);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*0+(ptrdiff_t)6144, 4032>>cut11, sum207);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*0+(ptrdiff_t)12288, 65535-(4095>>cut11), sum207);
ptrdiff_t c23 = 0;
for (; c23 != 16; ++c23) {
__m512 wt259 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)0);
__m512 wt260 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)1024);
__m512 wt261 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)2048);
__m512 wt262 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)3072);
__m512 wt263 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)4096);
__m512 wt264 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)5120);
__m512 wt265 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)6144);
__m512 wt266 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)7168);
__m512 wt267 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)8192);
__m512 wt268 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)9216);
__m512 wt269 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)10240);
__m512 wt270 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)11264);
__m512 wt271 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)12288);
__m512 wt272 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)13312);
__m512 wt273 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)14336);
__m512 wt274 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)15360);
__m512 tmp10745 = _mm512_unpacklo_ps(wt259, wt260);
__m512 tmp10746 = _mm512_unpackhi_ps(wt259, wt260);
__m512 tmp10747 = _mm512_unpacklo_ps(wt261, wt262);
__m512 tmp10748 = _mm512_unpackhi_ps(wt261, wt262);
__m512 tmp10749 = _mm512_unpacklo_ps(wt263, wt264);
__m512 tmp10750 = _mm512_unpackhi_ps(wt263, wt264);
__m512 tmp10751 = _mm512_unpacklo_ps(wt265, wt266);
__m512 tmp10752 = _mm512_unpackhi_ps(wt265, wt266);
__m512 tmp10753 = _mm512_unpacklo_ps(wt267, wt268);
__m512 tmp10754 = _mm512_unpackhi_ps(wt267, wt268);
__m512 tmp10755 = _mm512_unpacklo_ps(wt269, wt270);
__m512 tmp10756 = _mm512_unpackhi_ps(wt269, wt270);
__m512 tmp10757 = _mm512_unpacklo_ps(wt271, wt272);
__m512 tmp10758 = _mm512_unpackhi_ps(wt271, wt272);
__m512 tmp10759 = _mm512_unpacklo_ps(wt273, wt274);
__m512 tmp10760 = _mm512_unpackhi_ps(wt273, wt274);
__m512 tmp10761 = _mm512_shuffle_ps(tmp10745, tmp10747, 68);
__m512 tmp10762 = _mm512_shuffle_ps(tmp10745, tmp10747, 238);
__m512 tmp10763 = _mm512_shuffle_ps(tmp10746, tmp10748, 68);
__m512 tmp10764 = _mm512_shuffle_ps(tmp10746, tmp10748, 238);
__m512 tmp10765 = _mm512_shuffle_ps(tmp10749, tmp10751, 68);
__m512 tmp10766 = _mm512_shuffle_ps(tmp10749, tmp10751, 238);
__m512 tmp10767 = _mm512_shuffle_ps(tmp10750, tmp10752, 68);
__m512 tmp10768 = _mm512_shuffle_ps(tmp10750, tmp10752, 238);
__m512 tmp10769 = _mm512_shuffle_ps(tmp10753, tmp10755, 68);
__m512 tmp10770 = _mm512_shuffle_ps(tmp10753, tmp10755, 238);
__m512 tmp10771 = _mm512_shuffle_ps(tmp10754, tmp10756, 68);
__m512 tmp10772 = _mm512_shuffle_ps(tmp10754, tmp10756, 238);
__m512 tmp10773 = _mm512_shuffle_ps(tmp10757, tmp10759, 68);
__m512 tmp10774 = _mm512_shuffle_ps(tmp10757, tmp10759, 238);
__m512 tmp10775 = _mm512_shuffle_ps(tmp10758, tmp10760, 68);
__m512 tmp10776 = _mm512_shuffle_ps(tmp10758, tmp10760, 238);
__m512 tmp10777 = _mm512_shuffle_f32x4(tmp10761, tmp10765, 136);
__m512 tmp10778 = _mm512_shuffle_f32x4(tmp10761, tmp10765, 221);
__m512 tmp10779 = _mm512_shuffle_f32x4(tmp10762, tmp10766, 136);
__m512 tmp10780 = _mm512_shuffle_f32x4(tmp10762, tmp10766, 221);
__m512 tmp10781 = _mm512_shuffle_f32x4(tmp10763, tmp10767, 136);
__m512 tmp10782 = _mm512_shuffle_f32x4(tmp10763, tmp10767, 221);
__m512 tmp10783 = _mm512_shuffle_f32x4(tmp10764, tmp10768, 136);
__m512 tmp10784 = _mm512_shuffle_f32x4(tmp10764, tmp10768, 221);
__m512 tmp10785 = _mm512_shuffle_f32x4(tmp10769, tmp10773, 136);
__m512 tmp10786 = _mm512_shuffle_f32x4(tmp10769, tmp10773, 221);
__m512 tmp10787 = _mm512_shuffle_f32x4(tmp10770, tmp10774, 136);
__m512 tmp10788 = _mm512_shuffle_f32x4(tmp10770, tmp10774, 221);
__m512 tmp10789 = _mm512_shuffle_f32x4(tmp10771, tmp10775, 136);
__m512 tmp10790 = _mm512_shuffle_f32x4(tmp10771, tmp10775, 221);
__m512 tmp10791 = _mm512_shuffle_f32x4(tmp10772, tmp10776, 136);
__m512 tmp10792 = _mm512_shuffle_f32x4(tmp10772, tmp10776, 221);
wt259 = _mm512_shuffle_f32x4(tmp10777, tmp10785, 136);
wt267 = _mm512_shuffle_f32x4(tmp10777, tmp10785, 221);
wt260 = _mm512_shuffle_f32x4(tmp10779, tmp10787, 136);
wt268 = _mm512_shuffle_f32x4(tmp10779, tmp10787, 221);
wt261 = _mm512_shuffle_f32x4(tmp10781, tmp10789, 136);
wt269 = _mm512_shuffle_f32x4(tmp10781, tmp10789, 221);
wt262 = _mm512_shuffle_f32x4(tmp10783, tmp10791, 136);
wt270 = _mm512_shuffle_f32x4(tmp10783, tmp10791, 221);
wt263 = _mm512_shuffle_f32x4(tmp10778, tmp10786, 136);
wt271 = _mm512_shuffle_f32x4(tmp10778, tmp10786, 221);
wt264 = _mm512_shuffle_f32x4(tmp10780, tmp10788, 136);
wt272 = _mm512_shuffle_f32x4(tmp10780, tmp10788, 221);
wt265 = _mm512_shuffle_f32x4(tmp10782, tmp10790, 136);
wt273 = _mm512_shuffle_f32x4(tmp10782, tmp10790, 221);
wt266 = _mm512_shuffle_f32x4(tmp10784, tmp10792, 136);
wt274 = _mm512_shuffle_f32x4(tmp10784, tmp10792, 221);
wt259 = _mm512_mul_ps(wt259, postMul27);
wt260 = _mm512_mul_ps(wt260, postMul27);
wt261 = _mm512_mul_ps(wt261, postMul27);
wt262 = _mm512_mul_ps(wt262, postMul27);
wt263 = _mm512_mul_ps(wt263, postMul27);
wt264 = _mm512_mul_ps(wt264, postMul27);
wt265 = _mm512_mul_ps(wt265, postMul27);
wt266 = _mm512_mul_ps(wt266, postMul27);
wt267 = _mm512_mul_ps(wt267, postMul27);
wt268 = _mm512_mul_ps(wt268, postMul27);
wt269 = _mm512_mul_ps(wt269, postMul27);
wt270 = _mm512_mul_ps(wt270, postMul27);
wt271 = _mm512_mul_ps(wt271, postMul27);
wt272 = _mm512_mul_ps(wt272, postMul27);
wt273 = _mm512_mul_ps(wt273, postMul27);
wt274 = _mm512_mul_ps(wt274, postMul27);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(1+16*c23)+(ptrdiff_t)0, 63>>cut11, wt259);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(2+16*c23)+(ptrdiff_t)0, 63>>cut11, wt260);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(3+16*c23)+(ptrdiff_t)0, 63>>cut11, wt261);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(4+16*c23)+(ptrdiff_t)0, 63>>cut11, wt262);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(5+16*c23)+(ptrdiff_t)0, 63>>cut11, wt263);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(6+16*c23)+(ptrdiff_t)0, 63>>cut11, wt264);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(7+16*c23)+(ptrdiff_t)0, 63>>cut11, wt265);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(8+16*c23)+(ptrdiff_t)0, 63>>cut11, wt266);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(9+16*c23)+(ptrdiff_t)0, 63>>cut11, wt267);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(10+16*c23)+(ptrdiff_t)0, 63>>cut11, wt268);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(11+16*c23)+(ptrdiff_t)0, 63>>cut11, wt269);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(12+16*c23)+(ptrdiff_t)0, 63>>cut11, wt270);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(13+16*c23)+(ptrdiff_t)0, 63>>cut11, wt271);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(14+16*c23)+(ptrdiff_t)0, 63>>cut11, wt272);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(15+16*c23)+(ptrdiff_t)0, 63>>cut11, wt273);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(16+16*c23)+(ptrdiff_t)0, 63>>cut11, wt274);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(1+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt259);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(2+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt260);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(3+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt261);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(4+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt262);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(5+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt263);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(6+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt264);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(7+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt265);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(8+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt266);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(9+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt267);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(10+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt268);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(11+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt269);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(12+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt270);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(13+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt271);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(14+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt272);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(15+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt273);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(16+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt274);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(1+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt259);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(2+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt260);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(3+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt261);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(4+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt262);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(5+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt263);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(6+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt264);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(7+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt265);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(8+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt266);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(9+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt267);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(10+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt268);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(11+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt269);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(12+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt270);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(13+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt271);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(14+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt272);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(15+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt273);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(16+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt274);
}
}
}
}
}

/*
 * Dispatches ResNet50OneArrangeWts4Callee1 through the threader.
 *
 * team36    - thread team handed unchanged to ResNet50ThreaderDo1.
 * tensors45 - pointer array stored in the task's any1 field; the callee
 *             retrieves it from there.
 *
 * The task describes a 3-dimensional index space with extents 20 x 1 x 1.
 * NOTE(review): presumably the threader invokes the callee once per point
 * of that space -- confirm against ResNet50ThreaderDo1.
 */
static void ResNet50OneArrangeWts4(ResNet50ThreaderTeam1* team36, char** tensors45) {
    ResNet50ThreaderTask1 job;

    /* Shape of the index space. */
    job.nd1 = 3;
    job.hull1[0] = 20;
    job.hull1[1] = 1;
    job.hull1[2] = 1;

    /* Payload and entry point. */
    job.any1 = tensors45;
    job.callee1 = ResNet50OneArrangeWts4Callee1;

    ResNet50ThreaderDo1(team36, &job);
}

/*
 * Threader callee: repacks a slice of the source buffer (tensors[0]) into
 * the arranged buffer (tensors[1]).
 *
 * task50 - task descriptor; any1 holds a char** whose element 0 is the
 *          source buffer and element 1 the destination buffer.
 * pt30   - coordinates of this work item:
 *            pt30[0] selects a run of 128 consecutive k indices
 *                    (k = 128*pt30[0] .. 128*pt30[0]+127),
 *            pt30[1] selects the strip index j.
 *          The launcher in this file uses extents 2 and 14 for these.
 *
 * For every k in the run, two 224-byte spans of the source are read, at
 * byte offsets 0 and 448 from src + 224*(4*j) + 12608*k.  From each span
 * the even-indexed floats are kept, giving two 16-float vectors per span,
 * and the four resulting vectors are written as 256 contiguous bytes at
 * dst + 65536*j + 256*k.
 *
 * NOTE(review): the 224-byte pitch, the +448 second span and the
 * every-other-float selection look like a stride-2 subsampling of 56-float
 * rows -- confirm against the generator / graph definition.
 */
static void ResNet50OneArrangeDats4Callee1(ResNet50ThreaderTask1* task50, int64_t* pt30) {
    char** bufs = task50->any1;
    const ptrdiff_t kBlock = pt30[0];
    const ptrdiff_t strip = pt30[1];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];

    /*
     * Index vector for _mm512_permutex2var_ps: lane n takes element 2*n of
     * the 32-float concatenation (first operand = 0..15, second = 16..31),
     * i.e. it keeps the even-indexed floats.
     */
    const __m512i evenIdx = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);

    /* Source row index of the first span: four rows per strip. */
    const ptrdiff_t row = (ptrdiff_t)((size_t)strip*4);
    const ptrdiff_t kBegin = 128*kBlock;
    const ptrdiff_t kEnd = kBegin+128;

    for (ptrdiff_t k = kBegin; k < kEnd; ++k) {
        const char* in = src+224*row+12608*k;
        char* out = dst+65536*strip+256*k;

        /*
         * Mask 32767 (0x7FFF) loads lanes 0..14; lane 15 is odd-indexed and
         * never selected.  Mask 127 (0x7F) loads lanes 0..6 only, so the
         * last four lanes of each "tail" result are zero.
         */

        /* First span: floats 0..31, then floats 32..54. */
        __m512 s0a = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)0);
        __m512 s0b = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)64);
        __m512 s0Head = _mm512_permutex2var_ps(s0a, evenIdx, s0b);
        __m512 s0c = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)128);
        __m512 s0d = _mm512_maskz_loadu_ps(127, in+(ptrdiff_t)192);
        __m512 s0Tail = _mm512_permutex2var_ps(s0c, evenIdx, s0d);

        /* Second span, 448 bytes further on, handled the same way. */
        __m512 s1a = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)448);
        __m512 s1b = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)512);
        __m512 s1Head = _mm512_permutex2var_ps(s1a, evenIdx, s1b);
        __m512 s1c = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)576);
        __m512 s1d = _mm512_maskz_loadu_ps(127, in+(ptrdiff_t)640);
        __m512 s1Tail = _mm512_permutex2var_ps(s1c, evenIdx, s1d);

        /* Four packed vectors per k, back to back. */
        _mm512_storeu_ps(out+(ptrdiff_t)0, s0Head);
        _mm512_storeu_ps(out+(ptrdiff_t)64, s0Tail);
        _mm512_storeu_ps(out+(ptrdiff_t)128, s1Head);
        _mm512_storeu_ps(out+(ptrdiff_t)192, s1Tail);
    }
}

/*
 * Dispatches ResNet50OneArrangeDats4Callee1 through the threader.
 *
 * team37    - thread team handed unchanged to ResNet50ThreaderDo1.
 * tensors47 - pointer array stored in the task's any1 field; the callee
 *             reads its source buffer from element 0 and its destination
 *             buffer from element 1.
 *
 * The task describes a 4-dimensional index space with extents
 * 2 x 14 x 1 x 1.  The callee consumes coordinate 0 as its 128-wide k block
 * and coordinate 1 as its strip index.
 * NOTE(review): presumably the threader invokes the callee once per point
 * of that space -- confirm against ResNet50ThreaderDo1.
 */
static void ResNet50OneArrangeDats4(ResNet50ThreaderTeam1* team37, char** tensors47) {
    ResNet50ThreaderTask1 job;

    /* Shape of the index space. */
    job.nd1 = 4;
    job.hull1[0] = 2;
    job.hull1[1] = 14;
    job.hull1[2] = 1;
    job.hull1[3] = 1;

    /* Payload and entry point. */
    job.any1 = tensors47;
    job.callee1 = ResNet50OneArrangeDats4Callee1;

    ResNet50ThreaderDo1(team37, &job);
}

// Per-task worker for the OneApply4 stage: for one data slab and a small range
// of weight blocks, accumulates broadcast-weight * data products over 256 steps
// and writes the resulting sums to the destination tensor.
//
// task52->any1 points at a void* array whose element 0 is the char** tensor table:
//   tensors50[0] = arranged weights (read only here)
//   tensors50[1] = arranged data    (read only here)
//   tensors50[2] = destination      (written with masked stores)
// pt31[0] (w47) picks the starting weight block; pt31[1] (d10) picks the data
// slab and the output row pair.
// NOTE(review): ResNet50OneApply4 dispatches this with hull 53 x 14 x 1, so w47 is
// presumably in 0..52 and d10 in 0..13 -- confirm against the threader.
static void ResNet50OneApply4Callee1(ResNet50ThreaderTask1* task52, int64_t* pt31) {
void** pair12 = task52->any1;
char** tensors50 = pair12[0];
// e15 and g17 are fixed at 0, so the e15/g17 terms in the base pointers below add nothing.
ptrdiff_t e15 = 0;
ptrdiff_t g17 = 0;
ptrdiff_t d10 = pt31[1];
ptrdiff_t w47 = pt31[0];
char*restrict arrangedWts4 = tensors50[0]+2140160*e15+(ptrdiff_t)657920*1*g17;
char*restrict arrangedDats4 = tensors50[1]+2992640*e15+(ptrdiff_t)917504*1*g17;
char*restrict datPtr15 = tensors50[2]+(ptrdiff_t)2007040*1*g17;
// ii12 is 1, so the i33 loop body runs once with i33 == 0.
ptrdiff_t ii12 = 1;
for (ptrdiff_t i33 = 0; i33 < ii12; ++i33) {
ptrdiff_t j26 = 1*d10;
ptrdiff_t jj32 = j26+0;
// h39 = 2*j26; it is scaled by 112 bytes (28 floats) in the store addresses below.
ptrdiff_t h39 = 0+((size_t)j26-0)/1*2;
// The selector is a value modulo 1, i.e. always 0, so control always enters the default label.
switch (((size_t)j26-0)%1) {
default: {
wrap4:;
// Start at weight block 2*w47. kk33 is the block index after which the six-wide
// loop returns: start+1 when w47 < 52, start+2 otherwise.
ptrdiff_t k103 = 2*w47;
ptrdiff_t kk33 = k103+(w47 < 52 ? 1 : 2);
// Six-wide blocks: each iteration owns six accumulator groups of four vectors each
// (sum210..213, sum214..217, ..., sum230..233). Weight blocks are 6168 bytes apart.
for (; k103 != 106; ++k103) {
// With s22 == -1 the 24*s22+24 term is 0, so these broadcasts read the six floats
// at the very start of block k103; they seed the six accumulator groups.
ptrdiff_t s22 = -1;
__m512 sum210 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)24));
__m512 sum214 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)28));
__m512 sum218 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)32));
__m512 sum222 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)36));
__m512 sum226 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)40));
__m512 sum230 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)44));
// Every vector in a group starts from the same seed value.
__m512 sum211 = sum210;
__m512 sum212 = sum210;
__m512 sum213 = sum210;
__m512 sum215 = sum214;
__m512 sum216 = sum214;
__m512 sum217 = sum214;
__m512 sum219 = sum218;
__m512 sum220 = sum218;
__m512 sum221 = sum218;
__m512 sum223 = sum222;
__m512 sum224 = sum222;
__m512 sum225 = sum222;
__m512 sum227 = sum226;
__m512 sum228 = sum226;
__m512 sum229 = sum226;
__m512 sum231 = sum230;
__m512 sum232 = sum230;
__m512 sum233 = sum230;
// Step s22 loads four 16-float data vectors (256 bytes per step, slab j26) and six
// weights located 24*(s22+1) bytes into the block; each weight is broadcast and
// fused-multiply-added against all four data vectors into its own group.
for (s22 = 0; s22 < 256; ++s22) {
__m512 dat1657 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s22+(ptrdiff_t)0);
__m512 dat1658 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s22+(ptrdiff_t)64);
__m512 dat1659 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s22+(ptrdiff_t)128);
__m512 dat1660 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s22+(ptrdiff_t)192);
__m512 wt307 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)24));
sum210 = _mm512_fmadd_ps(wt307, dat1657, sum210);
sum211 = _mm512_fmadd_ps(wt307, dat1658, sum211);
sum212 = _mm512_fmadd_ps(wt307, dat1659, sum212);
sum213 = _mm512_fmadd_ps(wt307, dat1660, sum213);
__m512 wt308 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)28));
sum214 = _mm512_fmadd_ps(wt308, dat1657, sum214);
sum215 = _mm512_fmadd_ps(wt308, dat1658, sum215);
sum216 = _mm512_fmadd_ps(wt308, dat1659, sum216);
sum217 = _mm512_fmadd_ps(wt308, dat1660, sum217);
__m512 wt309 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)32));
sum218 = _mm512_fmadd_ps(wt309, dat1657, sum218);
sum219 = _mm512_fmadd_ps(wt309, dat1658, sum219);
sum220 = _mm512_fmadd_ps(wt309, dat1659, sum220);
sum221 = _mm512_fmadd_ps(wt309, dat1660, sum221);
__m512 wt310 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)36));
sum222 = _mm512_fmadd_ps(wt310, dat1657, sum222);
sum223 = _mm512_fmadd_ps(wt310, dat1658, sum223);
sum224 = _mm512_fmadd_ps(wt310, dat1659, sum224);
sum225 = _mm512_fmadd_ps(wt310, dat1660, sum225);
__m512 wt311 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)40));
sum226 = _mm512_fmadd_ps(wt311, dat1657, sum226);
sum227 = _mm512_fmadd_ps(wt311, dat1658, sum227);
sum228 = _mm512_fmadd_ps(wt311, dat1659, sum228);
sum229 = _mm512_fmadd_ps(wt311, dat1660, sum229);
__m512 wt312 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)44));
sum230 = _mm512_fmadd_ps(wt312, dat1657, sum230);
sum231 = _mm512_fmadd_ps(wt312, dat1658, sum231);
sum232 = _mm512_fmadd_ps(wt312, dat1659, sum232);
sum233 = _mm512_fmadd_ps(wt312, dat1660, sum233);
}
// Write-out. Per group: vector 0 (all 16 lanes) and vector 1 (low 12 lanes, mask 4095)
// fill 28 consecutive floats at +0; vectors 2 and 3 fill the next 28 floats at +112.
// Successive groups land 3136 bytes apart; successive k103 blocks 18816 bytes apart.
__m512 dat1661 = sum211;
__m512 dat1662 = sum213;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)0, 65535, sum210);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)64, 4095, dat1661);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)112, 65535, sum212);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)176, 4095, dat1662);
__m512 dat1663 = sum215;
__m512 dat1664 = sum217;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)3136, 65535, sum214);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)3200, 4095, dat1663);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)3248, 65535, sum216);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)3312, 4095, dat1664);
__m512 dat1665 = sum219;
__m512 dat1666 = sum221;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)6272, 65535, sum218);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)6336, 4095, dat1665);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)6384, 65535, sum220);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)6448, 4095, dat1666);
__m512 dat1667 = sum223;
__m512 dat1668 = sum225;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)9408, 65535, sum222);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)9472, 4095, dat1667);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)9520, 65535, sum224);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)9584, 4095, dat1668);
__m512 dat1669 = sum227;
__m512 dat1670 = sum229;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)12544, 65535, sum226);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)12608, 4095, dat1669);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)12656, 65535, sum228);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)12720, 4095, dat1670);
__m512 dat1671 = sum231;
__m512 dat1672 = sum233;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)15680, 65535, sum230);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)15744, 4095, dat1671);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)15792, 65535, sum232);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)15856, 4095, dat1672);
// The task is finished once block kk33 has been written. For w47 == 52 (kk33 == 106)
// this test never fires; the loop instead exits at k103 == 106 into the tail below.
if (k103 >= kk33) return;
}
// Four-wide tail, reached only by loop exit, so k103 == 106 here. Same scheme as
// above but with four accumulator groups and 16-byte weight steps; with s23 == -1
// the 16*s23+16 term is 0, so the seeds are the first four floats of block 106.
ptrdiff_t s23 = -1;
__m512 sum234 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+16*s23+(ptrdiff_t)16));
__m512 sum238 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+16*s23+(ptrdiff_t)20));
__m512 sum242 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+16*s23+(ptrdiff_t)24));
__m512 sum246 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+16*s23+(ptrdiff_t)28));
__m512 sum235 = sum234;
__m512 sum236 = sum234;
__m512 sum237 = sum234;
__m512 sum239 = sum238;
__m512 sum240 = sum238;
__m512 sum241 = sum238;
__m512 sum243 = sum242;
__m512 sum244 = sum242;
__m512 sum245 = sum242;
__m512 sum247 = sum246;
__m512 sum248 = sum246;
__m512 sum249 = sum246;
// Same 256-step accumulation over the same data slab, four weights per step.
for (s23 = 0; s23 < 256; ++s23) {
__m512 dat1673 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s23+(ptrdiff_t)0);
__m512 dat1674 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s23+(ptrdiff_t)64);
__m512 dat1675 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s23+(ptrdiff_t)128);
__m512 dat1676 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s23+(ptrdiff_t)192);
__m512 wt313 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+16*s23+(ptrdiff_t)16));
sum234 = _mm512_fmadd_ps(wt313, dat1673, sum234);
sum235 = _mm512_fmadd_ps(wt313, dat1674, sum235);
sum236 = _mm512_fmadd_ps(wt313, dat1675, sum236);
sum237 = _mm512_fmadd_ps(wt313, dat1676, sum237);
__m512 wt314 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+16*s23+(ptrdiff_t)20));
sum238 = _mm512_fmadd_ps(wt314, dat1673, sum238);
sum239 = _mm512_fmadd_ps(wt314, dat1674, sum239);
sum240 = _mm512_fmadd_ps(wt314, dat1675, sum240);
sum241 = _mm512_fmadd_ps(wt314, dat1676, sum241);
__m512 wt315 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+16*s23+(ptrdiff_t)24));
sum242 = _mm512_fmadd_ps(wt315, dat1673, sum242);
sum243 = _mm512_fmadd_ps(wt315, dat1674, sum243);
sum244 = _mm512_fmadd_ps(wt315, dat1675, sum244);
sum245 = _mm512_fmadd_ps(wt315, dat1676, sum245);
__m512 wt316 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+16*s23+(ptrdiff_t)28));
sum246 = _mm512_fmadd_ps(wt316, dat1673, sum246);
sum247 = _mm512_fmadd_ps(wt316, dat1674, sum247);
sum248 = _mm512_fmadd_ps(wt316, dat1675, sum248);
sum249 = _mm512_fmadd_ps(wt316, dat1676, sum249);
}
// Write-out for the four tail groups, using the same two-rows-of-28-floats layout
// and 3136-byte group spacing as the six-wide path.
__m512 dat1677 = sum235;
__m512 dat1678 = sum237;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)0, 65535, sum234);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)64, 4095, dat1677);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)112, 65535, sum236);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)176, 4095, dat1678);
__m512 dat1679 = sum239;
__m512 dat1680 = sum241;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)3136, 65535, sum238);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)3200, 4095, dat1679);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)3248, 65535, sum240);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)3312, 4095, dat1680);
__m512 dat1681 = sum243;
__m512 dat1682 = sum245;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)6272, 65535, sum242);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)6336, 4095, dat1681);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)6384, 65535, sum244);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)6448, 4095, dat1682);
__m512 dat1683 = sum247;
__m512 dat1684 = sum249;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)9408, 65535, sum246);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)9472, 4095, dat1683);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)9520, 65535, sum248);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)9584, 4095, dat1684);
// jj32 was set to j26 and neither has changed, so this return is always taken;
// the wrap-around statements after it are never executed in this function.
if (j26 >= jj32) return;
if (j26 >= 13) break;
++j26;
h39 += 2;
goto wrap4;
}
}
j26 = 14;
}
}

// Builds a three-dimensional task (extents 53 x 14 x 1) whose callee is
// ResNet50OneApply4Callee1, then submits it to team38 through
// ResNet50ThreaderDo1. The any1 payload is a two-slot pointer array:
// slot 0 holds tensors49 (the callee reads it back as its tensor table),
// slot 1 is null.
static void ResNet50OneApply4(ResNet50ThreaderTeam1* team38, char** tensors49) {
    static const int extents[3] = {53, 14, 1};
    void* bundle[2];
    ResNet50ThreaderTask1 job;
    int axis;
    bundle[0] = tensors49;
    bundle[1] = 0;
    // Fill in the per-axis extents of the task grid.
    for (axis = 0; axis < 3; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    job.nd1 = 3;
    job.any1 = bundle;
    job.callee1 = ResNet50OneApply4Callee1;
    ResNet50ThreaderDo1(team38, &job);
}

static void ResNet50OneArrangeWts5Callee1(ResNet50ThreaderTask1* task62, int64_t* pt36) {
char** tensors60 = task62->any1;
ptrdiff_t b57 = pt36[0];
char*restrict wtPtr11 = tensors60[0]+(ptrdiff_t)3340*0+(ptrdiff_t)262144*0;
char*restrict biasPtr11 = tensors60[1]+(ptrdiff_t)2048*0;
char*restrict bnPtr11 = tensors60[2]+(ptrdiff_t)8*512*0;
char*restrict arranged9 = tensors60[3]+(ptrdiff_t)1712128*0+(ptrdiff_t)264192*0;
ptrdiff_t ii13 = 1;
for (ptrdiff_t i38 = 0; i38 < ii13; ++i38) {
ptrdiff_t j31 = 4*b57;
ptrdiff_t jj34 = j31+4;
for (; j31 < jj34; ++j31) {
if (j31 < 31) {
ptrdiff_t k117 = 0+16*(j31-0);
ptrdiff_t l49 = (size_t)(0+k117)/6;
ptrdiff_t cut15 = (size_t)(0+k117)%6;
switch (cut15) {
case 0:;
case 2: {
__m512 sum279 = _mm512_maskz_loadu_ps(65535, biasPtr11+2048*i38+4*k117);
__m512i pmMul21 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd21 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo17 = _mm512_loadu_ps(bnPtr11+(ptrdiff_t)8*(k117+512*i38));
__m512 masHi17 = _mm512_maskz_loadu_ps(65535, bnPtr11+(ptrdiff_t)8*(k117+512*i38)+(ptrdiff_t)64);
__m512 postMul36 = _mm512_permutex2var_ps(masLo17, pmMul21, masHi17);
__m512 postAdd22 = _mm512_permutex2var_ps(masLo17, pmAdd21, masHi17);
sum279 = _mm512_fmadd_ps(sum279, postMul36, postAdd22);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*0+(ptrdiff_t)0, 63>>cut15, sum279);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*0+(ptrdiff_t)3072, 4032>>cut15, sum279);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*0+(ptrdiff_t)6144, 65535-(4095>>cut15), sum279);
ptrdiff_t c29 = 0;
for (; c29 != 8; ++c29) {
__m512 wt337 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)0);
__m512 wt338 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)512);
__m512 wt339 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)1024);
__m512 wt340 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)1536);
__m512 wt341 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)2048);
__m512 wt342 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)2560);
__m512 wt343 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)3072);
__m512 wt344 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)3584);
__m512 wt345 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)4096);
__m512 wt346 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)4608);
__m512 wt347 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)5120);
__m512 wt348 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)5632);
__m512 wt349 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)6144);
__m512 wt350 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)6656);
__m512 wt351 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)7168);
__m512 wt352 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)7680);
__m512 tmp13411 = _mm512_unpacklo_ps(wt337, wt338);
__m512 tmp13412 = _mm512_unpackhi_ps(wt337, wt338);
__m512 tmp13413 = _mm512_unpacklo_ps(wt339, wt340);
__m512 tmp13414 = _mm512_unpackhi_ps(wt339, wt340);
__m512 tmp13415 = _mm512_unpacklo_ps(wt341, wt342);
__m512 tmp13416 = _mm512_unpackhi_ps(wt341, wt342);
__m512 tmp13417 = _mm512_unpacklo_ps(wt343, wt344);
__m512 tmp13418 = _mm512_unpackhi_ps(wt343, wt344);
__m512 tmp13419 = _mm512_unpacklo_ps(wt345, wt346);
__m512 tmp13420 = _mm512_unpackhi_ps(wt345, wt346);
__m512 tmp13421 = _mm512_unpacklo_ps(wt347, wt348);
__m512 tmp13422 = _mm512_unpackhi_ps(wt347, wt348);
__m512 tmp13423 = _mm512_unpacklo_ps(wt349, wt350);
__m512 tmp13424 = _mm512_unpackhi_ps(wt349, wt350);
__m512 tmp13425 = _mm512_unpacklo_ps(wt351, wt352);
__m512 tmp13426 = _mm512_unpackhi_ps(wt351, wt352);
__m512 tmp13427 = _mm512_shuffle_ps(tmp13411, tmp13413, 68);
__m512 tmp13428 = _mm512_shuffle_ps(tmp13411, tmp13413, 238);
__m512 tmp13429 = _mm512_shuffle_ps(tmp13412, tmp13414, 68);
__m512 tmp13430 = _mm512_shuffle_ps(tmp13412, tmp13414, 238);
__m512 tmp13431 = _mm512_shuffle_ps(tmp13415, tmp13417, 68);
__m512 tmp13432 = _mm512_shuffle_ps(tmp13415, tmp13417, 238);
__m512 tmp13433 = _mm512_shuffle_ps(tmp13416, tmp13418, 68);
__m512 tmp13434 = _mm512_shuffle_ps(tmp13416, tmp13418, 238);
__m512 tmp13435 = _mm512_shuffle_ps(tmp13419, tmp13421, 68);
__m512 tmp13436 = _mm512_shuffle_ps(tmp13419, tmp13421, 238);
__m512 tmp13437 = _mm512_shuffle_ps(tmp13420, tmp13422, 68);
__m512 tmp13438 = _mm512_shuffle_ps(tmp13420, tmp13422, 238);
__m512 tmp13439 = _mm512_shuffle_ps(tmp13423, tmp13425, 68);
__m512 tmp13440 = _mm512_shuffle_ps(tmp13423, tmp13425, 238);
__m512 tmp13441 = _mm512_shuffle_ps(tmp13424, tmp13426, 68);
__m512 tmp13442 = _mm512_shuffle_ps(tmp13424, tmp13426, 238);
__m512 tmp13443 = _mm512_shuffle_f32x4(tmp13427, tmp13431, 136);
__m512 tmp13444 = _mm512_shuffle_f32x4(tmp13427, tmp13431, 221);
__m512 tmp13445 = _mm512_shuffle_f32x4(tmp13428, tmp13432, 136);
__m512 tmp13446 = _mm512_shuffle_f32x4(tmp13428, tmp13432, 221);
__m512 tmp13447 = _mm512_shuffle_f32x4(tmp13429, tmp13433, 136);
__m512 tmp13448 = _mm512_shuffle_f32x4(tmp13429, tmp13433, 221);
__m512 tmp13449 = _mm512_shuffle_f32x4(tmp13430, tmp13434, 136);
__m512 tmp13450 = _mm512_shuffle_f32x4(tmp13430, tmp13434, 221);
__m512 tmp13451 = _mm512_shuffle_f32x4(tmp13435, tmp13439, 136);
__m512 tmp13452 = _mm512_shuffle_f32x4(tmp13435, tmp13439, 221);
__m512 tmp13453 = _mm512_shuffle_f32x4(tmp13436, tmp13440, 136);
__m512 tmp13454 = _mm512_shuffle_f32x4(tmp13436, tmp13440, 221);
__m512 tmp13455 = _mm512_shuffle_f32x4(tmp13437, tmp13441, 136);
__m512 tmp13456 = _mm512_shuffle_f32x4(tmp13437, tmp13441, 221);
__m512 tmp13457 = _mm512_shuffle_f32x4(tmp13438, tmp13442, 136);
__m512 tmp13458 = _mm512_shuffle_f32x4(tmp13438, tmp13442, 221);
wt337 = _mm512_shuffle_f32x4(tmp13443, tmp13451, 136);
wt345 = _mm512_shuffle_f32x4(tmp13443, tmp13451, 221);
wt338 = _mm512_shuffle_f32x4(tmp13445, tmp13453, 136);
wt346 = _mm512_shuffle_f32x4(tmp13445, tmp13453, 221);
wt339 = _mm512_shuffle_f32x4(tmp13447, tmp13455, 136);
wt347 = _mm512_shuffle_f32x4(tmp13447, tmp13455, 221);
wt340 = _mm512_shuffle_f32x4(tmp13449, tmp13457, 136);
wt348 = _mm512_shuffle_f32x4(tmp13449, tmp13457, 221);
wt341 = _mm512_shuffle_f32x4(tmp13444, tmp13452, 136);
wt349 = _mm512_shuffle_f32x4(tmp13444, tmp13452, 221);
wt342 = _mm512_shuffle_f32x4(tmp13446, tmp13454, 136);
wt350 = _mm512_shuffle_f32x4(tmp13446, tmp13454, 221);
wt343 = _mm512_shuffle_f32x4(tmp13448, tmp13456, 136);
wt351 = _mm512_shuffle_f32x4(tmp13448, tmp13456, 221);
wt344 = _mm512_shuffle_f32x4(tmp13450, tmp13458, 136);
wt352 = _mm512_shuffle_f32x4(tmp13450, tmp13458, 221);
wt337 = _mm512_mul_ps(wt337, postMul36);
wt338 = _mm512_mul_ps(wt338, postMul36);
wt339 = _mm512_mul_ps(wt339, postMul36);
wt340 = _mm512_mul_ps(wt340, postMul36);
wt341 = _mm512_mul_ps(wt341, postMul36);
wt342 = _mm512_mul_ps(wt342, postMul36);
wt343 = _mm512_mul_ps(wt343, postMul36);
wt344 = _mm512_mul_ps(wt344, postMul36);
wt345 = _mm512_mul_ps(wt345, postMul36);
wt346 = _mm512_mul_ps(wt346, postMul36);
wt347 = _mm512_mul_ps(wt347, postMul36);
wt348 = _mm512_mul_ps(wt348, postMul36);
wt349 = _mm512_mul_ps(wt349, postMul36);
wt350 = _mm512_mul_ps(wt350, postMul36);
wt351 = _mm512_mul_ps(wt351, postMul36);
wt352 = _mm512_mul_ps(wt352, postMul36);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(1+16*c29)+(ptrdiff_t)0, 63>>cut15, wt337);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(2+16*c29)+(ptrdiff_t)0, 63>>cut15, wt338);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(3+16*c29)+(ptrdiff_t)0, 63>>cut15, wt339);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(4+16*c29)+(ptrdiff_t)0, 63>>cut15, wt340);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(5+16*c29)+(ptrdiff_t)0, 63>>cut15, wt341);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(6+16*c29)+(ptrdiff_t)0, 63>>cut15, wt342);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(7+16*c29)+(ptrdiff_t)0, 63>>cut15, wt343);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(8+16*c29)+(ptrdiff_t)0, 63>>cut15, wt344);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(9+16*c29)+(ptrdiff_t)0, 63>>cut15, wt345);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(10+16*c29)+(ptrdiff_t)0, 63>>cut15, wt346);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(11+16*c29)+(ptrdiff_t)0, 63>>cut15, wt347);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(12+16*c29)+(ptrdiff_t)0, 63>>cut15, wt348);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(13+16*c29)+(ptrdiff_t)0, 63>>cut15, wt349);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(14+16*c29)+(ptrdiff_t)0, 63>>cut15, wt350);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(15+16*c29)+(ptrdiff_t)0, 63>>cut15, wt351);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(16+16*c29)+(ptrdiff_t)0, 63>>cut15, wt352);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(1+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt337);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(2+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt338);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(3+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt339);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(4+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt340);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(5+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt341);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(6+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt342);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(7+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt343);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(8+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt344);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(9+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt345);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(10+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt346);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(11+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt347);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(12+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt348);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(13+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt349);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(14+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt350);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(15+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt351);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(16+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt352);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(1+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt337);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(2+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt338);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(3+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt339);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(4+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt340);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(5+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt341);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(6+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt342);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(7+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt343);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(8+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt344);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(9+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt345);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(10+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt346);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(11+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt347);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(12+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt348);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(13+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt349);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(14+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt350);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(15+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt351);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(16+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt352);
}
break;
}
default: {
cut15 = 4;
__m512 sum280 = _mm512_maskz_loadu_ps(65535, biasPtr11+2048*i38+4*k117);
__m512i pmMul22 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd22 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo18 = _mm512_loadu_ps(bnPtr11+(ptrdiff_t)8*(k117+512*i38));
__m512 masHi18 = _mm512_maskz_loadu_ps(65535, bnPtr11+(ptrdiff_t)8*(k117+512*i38)+(ptrdiff_t)64);
__m512 postMul37 = _mm512_permutex2var_ps(masLo18, pmMul22, masHi18);
__m512 postAdd23 = _mm512_permutex2var_ps(masLo18, pmAdd22, masHi18);
sum280 = _mm512_fmadd_ps(sum280, postMul37, postAdd23);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*0+(ptrdiff_t)0, 63>>cut15, sum280);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*0+(ptrdiff_t)3072, 4032>>cut15, sum280);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*0+(ptrdiff_t)6144, 258048>>cut15, sum280);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*0+(ptrdiff_t)9216, 65535-(262143>>cut15), sum280);
ptrdiff_t c30 = 0;
for (; c30 != 8; ++c30) {
__m512 wt353 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)0);
__m512 wt354 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)512);
__m512 wt355 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)1024);
__m512 wt356 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)1536);
__m512 wt357 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)2048);
__m512 wt358 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)2560);
__m512 wt359 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)3072);
__m512 wt360 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)3584);
__m512 wt361 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)4096);
__m512 wt362 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)4608);
__m512 wt363 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)5120);
__m512 wt364 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)5632);
__m512 wt365 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)6144);
__m512 wt366 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)6656);
__m512 wt367 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)7168);
__m512 wt368 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)7680);
__m512 tmp13459 = _mm512_unpacklo_ps(wt353, wt354);
__m512 tmp13460 = _mm512_unpackhi_ps(wt353, wt354);
__m512 tmp13461 = _mm512_unpacklo_ps(wt355, wt356);
__m512 tmp13462 = _mm512_unpackhi_ps(wt355, wt356);
__m512 tmp13463 = _mm512_unpacklo_ps(wt357, wt358);
__m512 tmp13464 = _mm512_unpackhi_ps(wt357, wt358);
__m512 tmp13465 = _mm512_unpacklo_ps(wt359, wt360);
__m512 tmp13466 = _mm512_unpackhi_ps(wt359, wt360);
__m512 tmp13467 = _mm512_unpacklo_ps(wt361, wt362);
__m512 tmp13468 = _mm512_unpackhi_ps(wt361, wt362);
__m512 tmp13469 = _mm512_unpacklo_ps(wt363, wt364);
__m512 tmp13470 = _mm512_unpackhi_ps(wt363, wt364);
__m512 tmp13471 = _mm512_unpacklo_ps(wt365, wt366);
__m512 tmp13472 = _mm512_unpackhi_ps(wt365, wt366);
__m512 tmp13473 = _mm512_unpacklo_ps(wt367, wt368);
__m512 tmp13474 = _mm512_unpackhi_ps(wt367, wt368);
__m512 tmp13475 = _mm512_shuffle_ps(tmp13459, tmp13461, 68);
__m512 tmp13476 = _mm512_shuffle_ps(tmp13459, tmp13461, 238);
__m512 tmp13477 = _mm512_shuffle_ps(tmp13460, tmp13462, 68);
__m512 tmp13478 = _mm512_shuffle_ps(tmp13460, tmp13462, 238);
__m512 tmp13479 = _mm512_shuffle_ps(tmp13463, tmp13465, 68);
__m512 tmp13480 = _mm512_shuffle_ps(tmp13463, tmp13465, 238);
__m512 tmp13481 = _mm512_shuffle_ps(tmp13464, tmp13466, 68);
__m512 tmp13482 = _mm512_shuffle_ps(tmp13464, tmp13466, 238);
__m512 tmp13483 = _mm512_shuffle_ps(tmp13467, tmp13469, 68);
__m512 tmp13484 = _mm512_shuffle_ps(tmp13467, tmp13469, 238);
__m512 tmp13485 = _mm512_shuffle_ps(tmp13468, tmp13470, 68);
__m512 tmp13486 = _mm512_shuffle_ps(tmp13468, tmp13470, 238);
__m512 tmp13487 = _mm512_shuffle_ps(tmp13471, tmp13473, 68);
__m512 tmp13488 = _mm512_shuffle_ps(tmp13471, tmp13473, 238);
__m512 tmp13489 = _mm512_shuffle_ps(tmp13472, tmp13474, 68);
__m512 tmp13490 = _mm512_shuffle_ps(tmp13472, tmp13474, 238);
__m512 tmp13491 = _mm512_shuffle_f32x4(tmp13475, tmp13479, 136);
__m512 tmp13492 = _mm512_shuffle_f32x4(tmp13475, tmp13479, 221);
__m512 tmp13493 = _mm512_shuffle_f32x4(tmp13476, tmp13480, 136);
__m512 tmp13494 = _mm512_shuffle_f32x4(tmp13476, tmp13480, 221);
__m512 tmp13495 = _mm512_shuffle_f32x4(tmp13477, tmp13481, 136);
__m512 tmp13496 = _mm512_shuffle_f32x4(tmp13477, tmp13481, 221);
__m512 tmp13497 = _mm512_shuffle_f32x4(tmp13478, tmp13482, 136);
__m512 tmp13498 = _mm512_shuffle_f32x4(tmp13478, tmp13482, 221);
__m512 tmp13499 = _mm512_shuffle_f32x4(tmp13483, tmp13487, 136);
__m512 tmp13500 = _mm512_shuffle_f32x4(tmp13483, tmp13487, 221);
__m512 tmp13501 = _mm512_shuffle_f32x4(tmp13484, tmp13488, 136);
__m512 tmp13502 = _mm512_shuffle_f32x4(tmp13484, tmp13488, 221);
__m512 tmp13503 = _mm512_shuffle_f32x4(tmp13485, tmp13489, 136);
__m512 tmp13504 = _mm512_shuffle_f32x4(tmp13485, tmp13489, 221);
__m512 tmp13505 = _mm512_shuffle_f32x4(tmp13486, tmp13490, 136);
__m512 tmp13506 = _mm512_shuffle_f32x4(tmp13486, tmp13490, 221);
wt353 = _mm512_shuffle_f32x4(tmp13491, tmp13499, 136);
wt361 = _mm512_shuffle_f32x4(tmp13491, tmp13499, 221);
wt354 = _mm512_shuffle_f32x4(tmp13493, tmp13501, 136);
wt362 = _mm512_shuffle_f32x4(tmp13493, tmp13501, 221);
wt355 = _mm512_shuffle_f32x4(tmp13495, tmp13503, 136);
wt363 = _mm512_shuffle_f32x4(tmp13495, tmp13503, 221);
wt356 = _mm512_shuffle_f32x4(tmp13497, tmp13505, 136);
wt364 = _mm512_shuffle_f32x4(tmp13497, tmp13505, 221);
wt357 = _mm512_shuffle_f32x4(tmp13492, tmp13500, 136);
wt365 = _mm512_shuffle_f32x4(tmp13492, tmp13500, 221);
wt358 = _mm512_shuffle_f32x4(tmp13494, tmp13502, 136);
wt366 = _mm512_shuffle_f32x4(tmp13494, tmp13502, 221);
wt359 = _mm512_shuffle_f32x4(tmp13496, tmp13504, 136);
wt367 = _mm512_shuffle_f32x4(tmp13496, tmp13504, 221);
wt360 = _mm512_shuffle_f32x4(tmp13498, tmp13506, 136);
wt368 = _mm512_shuffle_f32x4(tmp13498, tmp13506, 221);
wt353 = _mm512_mul_ps(wt353, postMul37);
wt354 = _mm512_mul_ps(wt354, postMul37);
wt355 = _mm512_mul_ps(wt355, postMul37);
wt356 = _mm512_mul_ps(wt356, postMul37);
wt357 = _mm512_mul_ps(wt357, postMul37);
wt358 = _mm512_mul_ps(wt358, postMul37);
wt359 = _mm512_mul_ps(wt359, postMul37);
wt360 = _mm512_mul_ps(wt360, postMul37);
wt361 = _mm512_mul_ps(wt361, postMul37);
wt362 = _mm512_mul_ps(wt362, postMul37);
wt363 = _mm512_mul_ps(wt363, postMul37);
wt364 = _mm512_mul_ps(wt364, postMul37);
wt365 = _mm512_mul_ps(wt365, postMul37);
wt366 = _mm512_mul_ps(wt366, postMul37);
wt367 = _mm512_mul_ps(wt367, postMul37);
wt368 = _mm512_mul_ps(wt368, postMul37);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(1+16*c30)+(ptrdiff_t)0, 63>>cut15, wt353);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(2+16*c30)+(ptrdiff_t)0, 63>>cut15, wt354);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(3+16*c30)+(ptrdiff_t)0, 63>>cut15, wt355);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(4+16*c30)+(ptrdiff_t)0, 63>>cut15, wt356);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(5+16*c30)+(ptrdiff_t)0, 63>>cut15, wt357);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(6+16*c30)+(ptrdiff_t)0, 63>>cut15, wt358);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(7+16*c30)+(ptrdiff_t)0, 63>>cut15, wt359);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(8+16*c30)+(ptrdiff_t)0, 63>>cut15, wt360);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(9+16*c30)+(ptrdiff_t)0, 63>>cut15, wt361);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(10+16*c30)+(ptrdiff_t)0, 63>>cut15, wt362);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(11+16*c30)+(ptrdiff_t)0, 63>>cut15, wt363);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(12+16*c30)+(ptrdiff_t)0, 63>>cut15, wt364);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(13+16*c30)+(ptrdiff_t)0, 63>>cut15, wt365);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(14+16*c30)+(ptrdiff_t)0, 63>>cut15, wt366);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(15+16*c30)+(ptrdiff_t)0, 63>>cut15, wt367);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(16+16*c30)+(ptrdiff_t)0, 63>>cut15, wt368);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(1+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt353);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(2+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt354);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(3+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt355);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(4+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt356);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(5+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt357);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(6+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt358);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(7+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt359);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(8+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt360);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(9+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt361);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(10+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt362);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(11+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt363);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(12+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt364);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(13+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt365);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(14+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt366);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(15+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt367);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(16+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt368);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(1+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt353);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(2+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt354);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(3+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt355);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(4+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt356);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(5+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt357);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(6+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt358);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(7+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt359);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(8+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt360);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(9+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt361);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(10+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt362);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(11+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt363);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(12+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt364);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(13+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt365);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(14+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt366);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(15+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt367);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(16+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt368);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(1+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt353);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(2+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt354);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(3+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt355);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(4+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt356);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(5+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt357);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(6+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt358);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(7+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt359);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(8+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt360);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(9+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt361);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(10+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt362);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(11+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt363);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(12+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt364);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(13+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt365);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(14+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt366);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(15+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt367);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(16+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt368);
}
}
}
} else {
ptrdiff_t k116 = 496;
ptrdiff_t l48 = (size_t)(0+k116)/6;
ptrdiff_t cut14 = (size_t)(0+k116)%6;
__m512 sum278 = _mm512_maskz_loadu_ps(65535, biasPtr11+2048*i38+4*k116);
__m512i pmMul23 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd23 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo19 = _mm512_loadu_ps(bnPtr11+(ptrdiff_t)8*(k116+512*i38));
__m512 masHi19 = _mm512_maskz_loadu_ps(65535, bnPtr11+(ptrdiff_t)8*(k116+512*i38)+(ptrdiff_t)64);
__m512 postMul35 = _mm512_permutex2var_ps(masLo19, pmMul23, masHi19);
__m512 postAdd21 = _mm512_permutex2var_ps(masLo19, pmAdd23, masHi19);
sum278 = _mm512_fmadd_ps(sum278, postMul35, postAdd21);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*0+(ptrdiff_t)0, 63>>cut14, sum278);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*0+(ptrdiff_t)3072, 4032>>cut14, sum278);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*0+(ptrdiff_t)6144, 258048>>cut14, sum278);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*0+(ptrdiff_t)9216, 65535-(262143>>cut14), sum278);
ptrdiff_t c28 = 0;
for (; c28 != 8; ++c28) {
__m512 wt321 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)0);
__m512 wt322 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)512);
__m512 wt323 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)1024);
__m512 wt324 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)1536);
__m512 wt325 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)2048);
__m512 wt326 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)2560);
__m512 wt327 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)3072);
__m512 wt328 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)3584);
__m512 wt329 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)4096);
__m512 wt330 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)4608);
__m512 wt331 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)5120);
__m512 wt332 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)5632);
__m512 wt333 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)6144);
__m512 wt334 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)6656);
__m512 wt335 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)7168);
__m512 wt336 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)7680);
__m512 tmp13507 = _mm512_unpacklo_ps(wt321, wt322);
__m512 tmp13508 = _mm512_unpackhi_ps(wt321, wt322);
__m512 tmp13509 = _mm512_unpacklo_ps(wt323, wt324);
__m512 tmp13510 = _mm512_unpackhi_ps(wt323, wt324);
__m512 tmp13511 = _mm512_unpacklo_ps(wt325, wt326);
__m512 tmp13512 = _mm512_unpackhi_ps(wt325, wt326);
__m512 tmp13513 = _mm512_unpacklo_ps(wt327, wt328);
__m512 tmp13514 = _mm512_unpackhi_ps(wt327, wt328);
__m512 tmp13515 = _mm512_unpacklo_ps(wt329, wt330);
__m512 tmp13516 = _mm512_unpackhi_ps(wt329, wt330);
__m512 tmp13517 = _mm512_unpacklo_ps(wt331, wt332);
__m512 tmp13518 = _mm512_unpackhi_ps(wt331, wt332);
__m512 tmp13519 = _mm512_unpacklo_ps(wt333, wt334);
__m512 tmp13520 = _mm512_unpackhi_ps(wt333, wt334);
__m512 tmp13521 = _mm512_unpacklo_ps(wt335, wt336);
__m512 tmp13522 = _mm512_unpackhi_ps(wt335, wt336);
__m512 tmp13523 = _mm512_shuffle_ps(tmp13507, tmp13509, 68);
__m512 tmp13524 = _mm512_shuffle_ps(tmp13507, tmp13509, 238);
__m512 tmp13525 = _mm512_shuffle_ps(tmp13508, tmp13510, 68);
__m512 tmp13526 = _mm512_shuffle_ps(tmp13508, tmp13510, 238);
__m512 tmp13527 = _mm512_shuffle_ps(tmp13511, tmp13513, 68);
__m512 tmp13528 = _mm512_shuffle_ps(tmp13511, tmp13513, 238);
__m512 tmp13529 = _mm512_shuffle_ps(tmp13512, tmp13514, 68);
__m512 tmp13530 = _mm512_shuffle_ps(tmp13512, tmp13514, 238);
__m512 tmp13531 = _mm512_shuffle_ps(tmp13515, tmp13517, 68);
__m512 tmp13532 = _mm512_shuffle_ps(tmp13515, tmp13517, 238);
__m512 tmp13533 = _mm512_shuffle_ps(tmp13516, tmp13518, 68);
__m512 tmp13534 = _mm512_shuffle_ps(tmp13516, tmp13518, 238);
__m512 tmp13535 = _mm512_shuffle_ps(tmp13519, tmp13521, 68);
__m512 tmp13536 = _mm512_shuffle_ps(tmp13519, tmp13521, 238);
__m512 tmp13537 = _mm512_shuffle_ps(tmp13520, tmp13522, 68);
__m512 tmp13538 = _mm512_shuffle_ps(tmp13520, tmp13522, 238);
__m512 tmp13539 = _mm512_shuffle_f32x4(tmp13523, tmp13527, 136);
__m512 tmp13540 = _mm512_shuffle_f32x4(tmp13523, tmp13527, 221);
__m512 tmp13541 = _mm512_shuffle_f32x4(tmp13524, tmp13528, 136);
__m512 tmp13542 = _mm512_shuffle_f32x4(tmp13524, tmp13528, 221);
__m512 tmp13543 = _mm512_shuffle_f32x4(tmp13525, tmp13529, 136);
__m512 tmp13544 = _mm512_shuffle_f32x4(tmp13525, tmp13529, 221);
__m512 tmp13545 = _mm512_shuffle_f32x4(tmp13526, tmp13530, 136);
__m512 tmp13546 = _mm512_shuffle_f32x4(tmp13526, tmp13530, 221);
__m512 tmp13547 = _mm512_shuffle_f32x4(tmp13531, tmp13535, 136);
__m512 tmp13548 = _mm512_shuffle_f32x4(tmp13531, tmp13535, 221);
__m512 tmp13549 = _mm512_shuffle_f32x4(tmp13532, tmp13536, 136);
__m512 tmp13550 = _mm512_shuffle_f32x4(tmp13532, tmp13536, 221);
__m512 tmp13551 = _mm512_shuffle_f32x4(tmp13533, tmp13537, 136);
__m512 tmp13552 = _mm512_shuffle_f32x4(tmp13533, tmp13537, 221);
__m512 tmp13553 = _mm512_shuffle_f32x4(tmp13534, tmp13538, 136);
__m512 tmp13554 = _mm512_shuffle_f32x4(tmp13534, tmp13538, 221);
wt321 = _mm512_shuffle_f32x4(tmp13539, tmp13547, 136);
wt329 = _mm512_shuffle_f32x4(tmp13539, tmp13547, 221);
wt322 = _mm512_shuffle_f32x4(tmp13541, tmp13549, 136);
wt330 = _mm512_shuffle_f32x4(tmp13541, tmp13549, 221);
wt323 = _mm512_shuffle_f32x4(tmp13543, tmp13551, 136);
wt331 = _mm512_shuffle_f32x4(tmp13543, tmp13551, 221);
wt324 = _mm512_shuffle_f32x4(tmp13545, tmp13553, 136);
wt332 = _mm512_shuffle_f32x4(tmp13545, tmp13553, 221);
wt325 = _mm512_shuffle_f32x4(tmp13540, tmp13548, 136);
wt333 = _mm512_shuffle_f32x4(tmp13540, tmp13548, 221);
wt326 = _mm512_shuffle_f32x4(tmp13542, tmp13550, 136);
wt334 = _mm512_shuffle_f32x4(tmp13542, tmp13550, 221);
wt327 = _mm512_shuffle_f32x4(tmp13544, tmp13552, 136);
wt335 = _mm512_shuffle_f32x4(tmp13544, tmp13552, 221);
wt328 = _mm512_shuffle_f32x4(tmp13546, tmp13554, 136);
wt336 = _mm512_shuffle_f32x4(tmp13546, tmp13554, 221);
wt321 = _mm512_mul_ps(wt321, postMul35);
wt322 = _mm512_mul_ps(wt322, postMul35);
wt323 = _mm512_mul_ps(wt323, postMul35);
wt324 = _mm512_mul_ps(wt324, postMul35);
wt325 = _mm512_mul_ps(wt325, postMul35);
wt326 = _mm512_mul_ps(wt326, postMul35);
wt327 = _mm512_mul_ps(wt327, postMul35);
wt328 = _mm512_mul_ps(wt328, postMul35);
wt329 = _mm512_mul_ps(wt329, postMul35);
wt330 = _mm512_mul_ps(wt330, postMul35);
wt331 = _mm512_mul_ps(wt331, postMul35);
wt332 = _mm512_mul_ps(wt332, postMul35);
wt333 = _mm512_mul_ps(wt333, postMul35);
wt334 = _mm512_mul_ps(wt334, postMul35);
wt335 = _mm512_mul_ps(wt335, postMul35);
wt336 = _mm512_mul_ps(wt336, postMul35);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(1+16*c28)+(ptrdiff_t)0, 63>>cut14, wt321);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(2+16*c28)+(ptrdiff_t)0, 63>>cut14, wt322);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(3+16*c28)+(ptrdiff_t)0, 63>>cut14, wt323);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(4+16*c28)+(ptrdiff_t)0, 63>>cut14, wt324);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(5+16*c28)+(ptrdiff_t)0, 63>>cut14, wt325);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(6+16*c28)+(ptrdiff_t)0, 63>>cut14, wt326);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(7+16*c28)+(ptrdiff_t)0, 63>>cut14, wt327);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(8+16*c28)+(ptrdiff_t)0, 63>>cut14, wt328);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(9+16*c28)+(ptrdiff_t)0, 63>>cut14, wt329);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(10+16*c28)+(ptrdiff_t)0, 63>>cut14, wt330);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(11+16*c28)+(ptrdiff_t)0, 63>>cut14, wt331);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(12+16*c28)+(ptrdiff_t)0, 63>>cut14, wt332);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(13+16*c28)+(ptrdiff_t)0, 63>>cut14, wt333);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(14+16*c28)+(ptrdiff_t)0, 63>>cut14, wt334);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(15+16*c28)+(ptrdiff_t)0, 63>>cut14, wt335);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(16+16*c28)+(ptrdiff_t)0, 63>>cut14, wt336);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(1+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt321);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(2+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt322);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(3+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt323);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(4+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt324);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(5+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt325);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(6+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt326);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(7+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt327);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(8+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt328);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(9+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt329);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(10+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt330);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(11+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt331);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(12+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt332);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(13+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt333);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(14+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt334);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(15+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt335);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(16+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt336);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(1+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt321);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(2+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt322);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(3+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt323);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(4+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt324);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(5+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt325);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(6+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt326);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(7+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt327);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(8+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt328);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(9+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt329);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(10+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt330);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(11+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt331);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(12+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt332);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(13+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt333);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(14+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt334);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(15+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt335);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(16+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt336);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(1+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt321);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(2+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt322);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(3+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt323);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(4+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt324);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(5+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt325);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(6+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt326);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(7+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt327);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(8+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt328);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(9+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt329);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(10+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt330);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(11+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt331);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(12+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt332);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(13+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt333);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(14+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt334);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(15+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt335);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(16+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt336);
}
}
}
}
}

/*
 * Builds a threader task around ResNet50OneArrangeWts5Callee1 and passes it,
 * together with the team, to ResNet50ThreaderDo1.
 *
 * team43    - team handle, forwarded unchanged to ResNet50ThreaderDo1.
 * tensors59 - pointer table stored in the task's `any1` field for the callee.
 *
 * The task declares 3 dimensions with extents 8, 1 and 1.
 */
static void ResNet50OneArrangeWts5(ResNet50ThreaderTeam1* team43, char** tensors59) {
	/* Extent of each dimension of the task's index space. */
	static const int extents[3] = {8, 1, 1};
	ResNet50ThreaderTask1 wtsJob;
	wtsJob.nd1 = 3;
	for (int dim = 0; dim < 3; ++dim) {
		wtsJob.hull1[dim] = extents[dim];
	}
	wtsJob.any1 = tensors59;
	wtsJob.callee1 = ResNet50OneArrangeWts5Callee1;
	ResNet50ThreaderDo1(team43, &wtsJob);
}

/*
 * Copies one piece of the source buffer into the "arranged" buffer.
 *
 * task64 - task record; its `any1` field is read as a char* table where
 *          entry 0 is the source base and entry 1 is the destination base.
 * pt37   - task coordinates; only pt37[1] is used and selects the piece.
 *
 * Source addressing:      256 bytes per piece, 3136 bytes per iteration.
 * Destination addressing: 32768 bytes per piece; within a piece each of the
 *                         128 iterations occupies 256 bytes (wide case) or
 *                         64 bytes (narrow case).
 *
 * Any piece index other than 12 moves four 64-byte vectors per iteration;
 * piece 12 moves a single 64-byte vector per iteration.
 */
static void ResNet50OneArrangeDats5Callee1(ResNet50ThreaderTask1* task64, int64_t* pt37) {
	char** bufs = task64->any1;
	const ptrdiff_t piece = pt37[1];
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];

	if (piece != 12) {
		/* Wide case: 4 vectors (256 bytes) per iteration. */
		for (ptrdiff_t k = 0; k < 128; ++k) {
			const char* from = src+256*piece+3136*k;
			char* to = dst+32768*piece+256*k;
			__m512 lane[4];
			/* All loads are issued before any store. */
			for (int v = 0; v < 4; ++v) {
				lane[v] = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)64*v);
			}
			for (int v = 0; v < 4; ++v) {
				_mm512_mask_storeu_ps(to+(ptrdiff_t)64*v, 65535, lane[v]);
			}
		}
		return;
	}

	/* Narrow case (piece 12): one vector (64 bytes) per iteration. */
	for (ptrdiff_t k = 0; k < 128; ++k) {
		__m512 lane = _mm512_maskz_loadu_ps(65535, src+256*piece+3136*k);
		_mm512_mask_storeu_ps(dst+32768*piece+64*k, 65535, lane);
	}
}

/*
 * Builds a threader task around ResNet50OneArrangeDats5Callee1 and passes it,
 * together with the team, to ResNet50ThreaderDo1.
 *
 * team44    - team handle, forwarded unchanged to ResNet50ThreaderDo1.
 * tensors61 - pointer table stored in the task's `any1` field for the callee.
 *
 * The task declares 4 dimensions with extents 1, 13, 1 and 1.
 */
static void ResNet50OneArrangeDats5(ResNet50ThreaderTeam1* team44, char** tensors61) {
	/* Extent of each dimension of the task's index space. */
	static const int extents[4] = {1, 13, 1, 1};
	ResNet50ThreaderTask1 datsJob;
	datsJob.nd1 = 4;
	for (int dim = 0; dim < 4; ++dim) {
		datsJob.hull1[dim] = extents[dim];
	}
	datsJob.any1 = tensors61;
	datsJob.callee1 = ResNet50OneArrangeDats5Callee1;
	ResNet50ThreaderDo1(team44, &datsJob);
}

static void ResNet50OneApply5Callee1(ResNet50ThreaderTask1* task66, int64_t* pt38) {
void** pair16 = task66->any1;
char** tensors64 = pair16[0];
ptrdiff_t e19 = 0;
ptrdiff_t g22 = 0;
ptrdiff_t d13 = pt38[1];
ptrdiff_t w55 = pt38[0];
char*restrict arrangedWts5 = tensors64[0]+1712128*e19+(ptrdiff_t)264192*1*g22;
char*restrict arrangedDats5 = tensors64[1]+2618560*e19+(ptrdiff_t)401408*1*g22;
char*restrict datPtr19 = tensors64[2]+(ptrdiff_t)1605632*1*g22;
char*restrict datPtr20 = tensors64[3]+(ptrdiff_t)1605632*1*g22;
ptrdiff_t ii15 = 1;
for (ptrdiff_t i40 = 0; i40 < ii15; ++i40) {
ptrdiff_t j33 = 1*d13;
ptrdiff_t jj36 = j33+0;
for (; j33 != 12; ++j33) {
ptrdiff_t k120 = 4*w55;
ptrdiff_t kk37 = k120+(w55 < 20 ? 3 : 5);
for (; k120 != 85; ++k120) {
ptrdiff_t s26 = -1;
__m512 sum281 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)24));
__m512 sum285 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)28));
__m512 sum289 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)32));
__m512 sum293 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)36));
__m512 sum297 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)40));
__m512 sum301 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)44));
__m512 sum282 = sum281;
__m512 sum283 = sum281;
__m512 sum284 = sum281;
__m512 sum286 = sum285;
__m512 sum287 = sum285;
__m512 sum288 = sum285;
__m512 sum290 = sum289;
__m512 sum291 = sum289;
__m512 sum292 = sum289;
__m512 sum294 = sum293;
__m512 sum295 = sum293;
__m512 sum296 = sum293;
__m512 sum298 = sum297;
__m512 sum299 = sum297;
__m512 sum300 = sum297;
__m512 sum302 = sum301;
__m512 sum303 = sum301;
__m512 sum304 = sum301;
for (s26 = 0; s26 < 128; ++s26) {
__m512 dat1916 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+256*s26+(ptrdiff_t)0);
__m512 dat1917 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+256*s26+(ptrdiff_t)64);
__m512 dat1918 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+256*s26+(ptrdiff_t)128);
__m512 dat1919 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+256*s26+(ptrdiff_t)192);
__m512 wt369 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)24));
sum281 = _mm512_fmadd_ps(wt369, dat1916, sum281);
sum282 = _mm512_fmadd_ps(wt369, dat1917, sum282);
sum283 = _mm512_fmadd_ps(wt369, dat1918, sum283);
sum284 = _mm512_fmadd_ps(wt369, dat1919, sum284);
__m512 wt370 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)28));
sum285 = _mm512_fmadd_ps(wt370, dat1916, sum285);
sum286 = _mm512_fmadd_ps(wt370, dat1917, sum286);
sum287 = _mm512_fmadd_ps(wt370, dat1918, sum287);
sum288 = _mm512_fmadd_ps(wt370, dat1919, sum288);
__m512 wt371 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)32));
sum289 = _mm512_fmadd_ps(wt371, dat1916, sum289);
sum290 = _mm512_fmadd_ps(wt371, dat1917, sum290);
sum291 = _mm512_fmadd_ps(wt371, dat1918, sum291);
sum292 = _mm512_fmadd_ps(wt371, dat1919, sum292);
__m512 wt372 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)36));
sum293 = _mm512_fmadd_ps(wt372, dat1916, sum293);
sum294 = _mm512_fmadd_ps(wt372, dat1917, sum294);
sum295 = _mm512_fmadd_ps(wt372, dat1918, sum295);
sum296 = _mm512_fmadd_ps(wt372, dat1919, sum296);
__m512 wt373 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)40));
sum297 = _mm512_fmadd_ps(wt373, dat1916, sum297);
sum298 = _mm512_fmadd_ps(wt373, dat1917, sum298);
sum299 = _mm512_fmadd_ps(wt373, dat1918, sum299);
sum300 = _mm512_fmadd_ps(wt373, dat1919, sum300);
__m512 wt374 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)44));
sum301 = _mm512_fmadd_ps(wt374, dat1916, sum301);
sum302 = _mm512_fmadd_ps(wt374, dat1917, sum302);
sum303 = _mm512_fmadd_ps(wt374, dat1918, sum303);
sum304 = _mm512_fmadd_ps(wt374, dat1919, sum304);
}
sum281 = _mm512_add_ps(sum281, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)0));
sum282 = _mm512_add_ps(sum282, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)64));
sum283 = _mm512_add_ps(sum283, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)128));
sum284 = _mm512_add_ps(sum284, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)192));
sum281 = _mm512_max_ps(_mm512_setzero_ps(), sum281);
sum282 = _mm512_max_ps(_mm512_setzero_ps(), sum282);
sum283 = _mm512_max_ps(_mm512_setzero_ps(), sum283);
sum284 = _mm512_max_ps(_mm512_setzero_ps(), sum284);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)0, 65535, sum281);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)64, 65535, sum282);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)128, 65535, sum283);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)192, 65535, sum284);
sum285 = _mm512_add_ps(sum285, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3136));
sum286 = _mm512_add_ps(sum286, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3200));
sum287 = _mm512_add_ps(sum287, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3264));
sum288 = _mm512_add_ps(sum288, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3328));
sum285 = _mm512_max_ps(_mm512_setzero_ps(), sum285);
sum286 = _mm512_max_ps(_mm512_setzero_ps(), sum286);
sum287 = _mm512_max_ps(_mm512_setzero_ps(), sum287);
sum288 = _mm512_max_ps(_mm512_setzero_ps(), sum288);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3136, 65535, sum285);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3200, 65535, sum286);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3264, 65535, sum287);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3328, 65535, sum288);
sum289 = _mm512_add_ps(sum289, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)6272));
sum290 = _mm512_add_ps(sum290, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)6336));
sum291 = _mm512_add_ps(sum291, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)6400));
sum292 = _mm512_add_ps(sum292, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)6464));
sum289 = _mm512_max_ps(_mm512_setzero_ps(), sum289);
sum290 = _mm512_max_ps(_mm512_setzero_ps(), sum290);
sum291 = _mm512_max_ps(_mm512_setzero_ps(), sum291);
sum292 = _mm512_max_ps(_mm512_setzero_ps(), sum292);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)6272, 65535, sum289);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)6336, 65535, sum290);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)6400, 65535, sum291);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)6464, 65535, sum292);
sum293 = _mm512_add_ps(sum293, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)9408));
sum294 = _mm512_add_ps(sum294, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)9472));
sum295 = _mm512_add_ps(sum295, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)9536));
sum296 = _mm512_add_ps(sum296, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)9600));
sum293 = _mm512_max_ps(_mm512_setzero_ps(), sum293);
sum294 = _mm512_max_ps(_mm512_setzero_ps(), sum294);
sum295 = _mm512_max_ps(_mm512_setzero_ps(), sum295);
sum296 = _mm512_max_ps(_mm512_setzero_ps(), sum296);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)9408, 65535, sum293);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)9472, 65535, sum294);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)9536, 65535, sum295);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)9600, 65535, sum296);
sum297 = _mm512_add_ps(sum297, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)12544));
sum298 = _mm512_add_ps(sum298, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)12608));
sum299 = _mm512_add_ps(sum299, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)12672));
sum300 = _mm512_add_ps(sum300, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)12736));
sum297 = _mm512_max_ps(_mm512_setzero_ps(), sum297);
sum298 = _mm512_max_ps(_mm512_setzero_ps(), sum298);
sum299 = _mm512_max_ps(_mm512_setzero_ps(), sum299);
sum300 = _mm512_max_ps(_mm512_setzero_ps(), sum300);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)12544, 65535, sum297);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)12608, 65535, sum298);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)12672, 65535, sum299);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)12736, 65535, sum300);
sum301 = _mm512_add_ps(sum301, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)15680));
sum302 = _mm512_add_ps(sum302, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)15744));
sum303 = _mm512_add_ps(sum303, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)15808));
sum304 = _mm512_add_ps(sum304, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)15872));
sum301 = _mm512_max_ps(_mm512_setzero_ps(), sum301);
sum302 = _mm512_max_ps(_mm512_setzero_ps(), sum302);
sum303 = _mm512_max_ps(_mm512_setzero_ps(), sum303);
sum304 = _mm512_max_ps(_mm512_setzero_ps(), sum304);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)15680, 65535, sum301);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)15744, 65535, sum302);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)15808, 65535, sum303);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)15872, 65535, sum304);
if (k120 >= kk37) return;
}
ptrdiff_t s27 = -1;
__m512 sum305 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+8*s27+(ptrdiff_t)8));
__m512 sum309 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+8*s27+(ptrdiff_t)12));
__m512 sum306 = sum305;
__m512 sum307 = sum305;
__m512 sum308 = sum305;
__m512 sum310 = sum309;
__m512 sum311 = sum309;
__m512 sum312 = sum309;
for (s27 = 0; s27 < 128; ++s27) {
__m512 dat1920 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+256*s27+(ptrdiff_t)0);
__m512 dat1921 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+256*s27+(ptrdiff_t)64);
__m512 dat1922 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+256*s27+(ptrdiff_t)128);
__m512 dat1923 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+256*s27+(ptrdiff_t)192);
__m512 wt375 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+8*s27+(ptrdiff_t)8));
sum305 = _mm512_fmadd_ps(wt375, dat1920, sum305);
sum306 = _mm512_fmadd_ps(wt375, dat1921, sum306);
sum307 = _mm512_fmadd_ps(wt375, dat1922, sum307);
sum308 = _mm512_fmadd_ps(wt375, dat1923, sum308);
__m512 wt376 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+8*s27+(ptrdiff_t)12));
sum309 = _mm512_fmadd_ps(wt376, dat1920, sum309);
sum310 = _mm512_fmadd_ps(wt376, dat1921, sum310);
sum311 = _mm512_fmadd_ps(wt376, dat1922, sum311);
sum312 = _mm512_fmadd_ps(wt376, dat1923, sum312);
}
sum305 = _mm512_add_ps(sum305, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)0));
sum306 = _mm512_add_ps(sum306, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)64));
sum307 = _mm512_add_ps(sum307, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)128));
sum308 = _mm512_add_ps(sum308, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)192));
sum305 = _mm512_max_ps(_mm512_setzero_ps(), sum305);
sum306 = _mm512_max_ps(_mm512_setzero_ps(), sum306);
sum307 = _mm512_max_ps(_mm512_setzero_ps(), sum307);
sum308 = _mm512_max_ps(_mm512_setzero_ps(), sum308);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)0, 65535, sum305);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)64, 65535, sum306);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)128, 65535, sum307);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)192, 65535, sum308);
sum309 = _mm512_add_ps(sum309, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3136));
sum310 = _mm512_add_ps(sum310, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3200));
sum311 = _mm512_add_ps(sum311, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3264));
sum312 = _mm512_add_ps(sum312, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3328));
sum309 = _mm512_max_ps(_mm512_setzero_ps(), sum309);
sum310 = _mm512_max_ps(_mm512_setzero_ps(), sum310);
sum311 = _mm512_max_ps(_mm512_setzero_ps(), sum311);
sum312 = _mm512_max_ps(_mm512_setzero_ps(), sum312);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3136, 65535, sum309);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3200, 65535, sum310);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3264, 65535, sum311);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3328, 65535, sum312);
if (j33 >= jj36) return;
}
ptrdiff_t k121 = 4*w55;
ptrdiff_t kk38 = k121+(w55 < 20 ? 3 : 5);
for (; k121 != 85; ++k121) {
ptrdiff_t s28 = -1;
__m512 sum313 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)24));
__m512 sum314 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)28));
__m512 sum315 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)32));
__m512 sum316 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)36));
__m512 sum317 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)40));
__m512 sum318 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)44));
for (s28 = 0; s28 < 128; ++s28) {
__m512 dat1924 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+64*s28+(ptrdiff_t)0);
__m512 wt377 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)24));
sum313 = _mm512_fmadd_ps(wt377, dat1924, sum313);
__m512 wt378 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)28));
sum314 = _mm512_fmadd_ps(wt378, dat1924, sum314);
__m512 wt379 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)32));
sum315 = _mm512_fmadd_ps(wt379, dat1924, sum315);
__m512 wt380 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)36));
sum316 = _mm512_fmadd_ps(wt380, dat1924, sum316);
__m512 wt381 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)40));
sum317 = _mm512_fmadd_ps(wt381, dat1924, sum317);
__m512 wt382 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)44));
sum318 = _mm512_fmadd_ps(wt382, dat1924, sum318);
}
sum313 = _mm512_add_ps(sum313, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k121+(ptrdiff_t)0));
sum313 = _mm512_max_ps(_mm512_setzero_ps(), sum313);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k121+(ptrdiff_t)0, 65535, sum313);
sum314 = _mm512_add_ps(sum314, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k121+(ptrdiff_t)3136));
sum314 = _mm512_max_ps(_mm512_setzero_ps(), sum314);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k121+(ptrdiff_t)3136, 65535, sum314);
sum315 = _mm512_add_ps(sum315, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k121+(ptrdiff_t)6272));
sum315 = _mm512_max_ps(_mm512_setzero_ps(), sum315);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k121+(ptrdiff_t)6272, 65535, sum315);
sum316 = _mm512_add_ps(sum316, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k121+(ptrdiff_t)9408));
sum316 = _mm512_max_ps(_mm512_setzero_ps(), sum316);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k121+(ptrdiff_t)9408, 65535, sum316);
sum317 = _mm512_add_ps(sum317, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k121+(ptrdiff_t)12544));
sum317 = _mm512_max_ps(_mm512_setzero_ps(), sum317);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k121+(ptrdiff_t)12544, 65535, sum317);
sum318 = _mm512_add_ps(sum318, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k121+(ptrdiff_t)15680));
sum318 = _mm512_max_ps(_mm512_setzero_ps(), sum318);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k121+(ptrdiff_t)15680, 65535, sum318);
if (k121 >= kk38) return;
}
ptrdiff_t s29 = -1;
__m512 sum319 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+8*s29+(ptrdiff_t)8));
__m512 sum320 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+8*s29+(ptrdiff_t)12));
for (s29 = 0; s29 < 128; ++s29) {
__m512 dat1925 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+64*s29+(ptrdiff_t)0);
__m512 wt383 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+8*s29+(ptrdiff_t)8));
sum319 = _mm512_fmadd_ps(wt383, dat1925, sum319);
__m512 wt384 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+8*s29+(ptrdiff_t)12));
sum320 = _mm512_fmadd_ps(wt384, dat1925, sum320);
}
sum319 = _mm512_add_ps(sum319, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k121+(ptrdiff_t)0));
sum319 = _mm512_max_ps(_mm512_setzero_ps(), sum319);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k121+(ptrdiff_t)0, 65535, sum319);
sum320 = _mm512_add_ps(sum320, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k121+(ptrdiff_t)3136));
sum320 = _mm512_max_ps(_mm512_setzero_ps(), sum320);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k121+(ptrdiff_t)3136, 65535, sum320);
}
}

/*
 * ResNet50OneApply5: wrap the tensor array in a threader task and run it
 * on the supplied team.
 *
 * team45    - threader team passed straight through to ResNet50ThreaderDo1.
 * tensors63 - tensor pointer array; placed in slot 0 of a two-entry context
 *             array (slot 1 is 0) that the task exposes through any1.
 *
 * The task is declared with nd1 = 3 and extents 21, 13, 1 in hull1[0..2];
 * its callee is ResNet50OneApply5Callee1.
 */
static void ResNet50OneApply5(ResNet50ThreaderTeam1* team45, char** tensors63) {
    void* ctx[2];
    ResNet50ThreaderTask1 job;

    /* The context array is a local of this function; the task only holds a
     * pointer to it.
     * NOTE(review): presumably ResNet50ThreaderDo1 finishes all work before
     * returning, otherwise this pointer would dangle -- confirm. */
    ctx[0] = tensors63;
    ctx[1] = 0;

    /* Iteration space: three dimensions with extents 21 x 13 x 1. */
    job.nd1 = 3;
    job.hull1[0] = 21;
    job.hull1[1] = 13;
    job.hull1[2] = 1;

    /* Worker entry point and its opaque argument. */
    job.any1 = ctx;
    job.callee1 = ResNet50OneApply5Callee1;

    ResNet50ThreaderDo1(team45, &job);
}

static void ResNet50OneArrangeWts6Callee1(ResNet50ThreaderTask1* task68, int64_t* pt39) {
char** tensors66 = task68->any1;
ptrdiff_t b58 = pt39[0];
char*restrict wtPtr12 = tensors66[0]+(ptrdiff_t)3340*0+(ptrdiff_t)262144*0;
char*restrict biasPtr12 = tensors66[1]+(ptrdiff_t)512*0;
char*restrict bnPtr12 = tensors66[2]+(ptrdiff_t)8*128*0;
char*restrict arranged11 = tensors66[3]+(ptrdiff_t)428032*0+(ptrdiff_t)262656*0;
ptrdiff_t ii16 = 1;
for (ptrdiff_t i41 = 0; i41 < ii16; ++i41) {
ptrdiff_t j34 = 1*b58;
ptrdiff_t jj37 = j34+1;
for (; j34 < jj37; ++j34) {
if (j34 < 7) {
ptrdiff_t k123 = 0+16*(j34-0);
ptrdiff_t l51 = (size_t)(0+k123)/6;
ptrdiff_t cut17 = (size_t)(0+k123)%6;
switch (cut17) {
case 0:;
case 2: {
__m512 sum322 = _mm512_maskz_loadu_ps(65535, biasPtr12+512*i41+4*k123);
__m512i pmMul24 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd24 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo20 = _mm512_loadu_ps(bnPtr12+(ptrdiff_t)8*(k123+128*i41));
__m512 masHi20 = _mm512_maskz_loadu_ps(65535, bnPtr12+(ptrdiff_t)8*(k123+128*i41)+(ptrdiff_t)64);
__m512 postMul39 = _mm512_permutex2var_ps(masLo20, pmMul24, masHi20);
__m512 postAdd25 = _mm512_permutex2var_ps(masLo20, pmAdd24, masHi20);
sum322 = _mm512_fmadd_ps(sum322, postMul39, postAdd25);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*0+(ptrdiff_t)0, 63>>cut17, sum322);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*0+(ptrdiff_t)12288, 4032>>cut17, sum322);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*0+(ptrdiff_t)24576, 65535-(4095>>cut17), sum322);
ptrdiff_t c33 = 0;
for (; c33 != 32; ++c33) {
__m512 wt401 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)0);
__m512 wt402 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)2048);
__m512 wt403 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)4096);
__m512 wt404 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)6144);
__m512 wt405 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)8192);
__m512 wt406 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)10240);
__m512 wt407 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)12288);
__m512 wt408 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)14336);
__m512 wt409 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)16384);
__m512 wt410 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)18432);
__m512 wt411 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)20480);
__m512 wt412 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)22528);
__m512 wt413 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)24576);
__m512 wt414 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)26624);
__m512 wt415 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)28672);
__m512 wt416 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)30720);
__m512 tmp13555 = _mm512_unpacklo_ps(wt401, wt402);
__m512 tmp13556 = _mm512_unpackhi_ps(wt401, wt402);
__m512 tmp13557 = _mm512_unpacklo_ps(wt403, wt404);
__m512 tmp13558 = _mm512_unpackhi_ps(wt403, wt404);
__m512 tmp13559 = _mm512_unpacklo_ps(wt405, wt406);
__m512 tmp13560 = _mm512_unpackhi_ps(wt405, wt406);
__m512 tmp13561 = _mm512_unpacklo_ps(wt407, wt408);
__m512 tmp13562 = _mm512_unpackhi_ps(wt407, wt408);
__m512 tmp13563 = _mm512_unpacklo_ps(wt409, wt410);
__m512 tmp13564 = _mm512_unpackhi_ps(wt409, wt410);
__m512 tmp13565 = _mm512_unpacklo_ps(wt411, wt412);
__m512 tmp13566 = _mm512_unpackhi_ps(wt411, wt412);
__m512 tmp13567 = _mm512_unpacklo_ps(wt413, wt414);
__m512 tmp13568 = _mm512_unpackhi_ps(wt413, wt414);
__m512 tmp13569 = _mm512_unpacklo_ps(wt415, wt416);
__m512 tmp13570 = _mm512_unpackhi_ps(wt415, wt416);
__m512 tmp13571 = _mm512_shuffle_ps(tmp13555, tmp13557, 68);
__m512 tmp13572 = _mm512_shuffle_ps(tmp13555, tmp13557, 238);
__m512 tmp13573 = _mm512_shuffle_ps(tmp13556, tmp13558, 68);
__m512 tmp13574 = _mm512_shuffle_ps(tmp13556, tmp13558, 238);
__m512 tmp13575 = _mm512_shuffle_ps(tmp13559, tmp13561, 68);
__m512 tmp13576 = _mm512_shuffle_ps(tmp13559, tmp13561, 238);
__m512 tmp13577 = _mm512_shuffle_ps(tmp13560, tmp13562, 68);
__m512 tmp13578 = _mm512_shuffle_ps(tmp13560, tmp13562, 238);
__m512 tmp13579 = _mm512_shuffle_ps(tmp13563, tmp13565, 68);
__m512 tmp13580 = _mm512_shuffle_ps(tmp13563, tmp13565, 238);
__m512 tmp13581 = _mm512_shuffle_ps(tmp13564, tmp13566, 68);
__m512 tmp13582 = _mm512_shuffle_ps(tmp13564, tmp13566, 238);
__m512 tmp13583 = _mm512_shuffle_ps(tmp13567, tmp13569, 68);
__m512 tmp13584 = _mm512_shuffle_ps(tmp13567, tmp13569, 238);
__m512 tmp13585 = _mm512_shuffle_ps(tmp13568, tmp13570, 68);
__m512 tmp13586 = _mm512_shuffle_ps(tmp13568, tmp13570, 238);
__m512 tmp13587 = _mm512_shuffle_f32x4(tmp13571, tmp13575, 136);
__m512 tmp13588 = _mm512_shuffle_f32x4(tmp13571, tmp13575, 221);
__m512 tmp13589 = _mm512_shuffle_f32x4(tmp13572, tmp13576, 136);
__m512 tmp13590 = _mm512_shuffle_f32x4(tmp13572, tmp13576, 221);
__m512 tmp13591 = _mm512_shuffle_f32x4(tmp13573, tmp13577, 136);
__m512 tmp13592 = _mm512_shuffle_f32x4(tmp13573, tmp13577, 221);
__m512 tmp13593 = _mm512_shuffle_f32x4(tmp13574, tmp13578, 136);
__m512 tmp13594 = _mm512_shuffle_f32x4(tmp13574, tmp13578, 221);
__m512 tmp13595 = _mm512_shuffle_f32x4(tmp13579, tmp13583, 136);
__m512 tmp13596 = _mm512_shuffle_f32x4(tmp13579, tmp13583, 221);
__m512 tmp13597 = _mm512_shuffle_f32x4(tmp13580, tmp13584, 136);
__m512 tmp13598 = _mm512_shuffle_f32x4(tmp13580, tmp13584, 221);
__m512 tmp13599 = _mm512_shuffle_f32x4(tmp13581, tmp13585, 136);
__m512 tmp13600 = _mm512_shuffle_f32x4(tmp13581, tmp13585, 221);
__m512 tmp13601 = _mm512_shuffle_f32x4(tmp13582, tmp13586, 136);
__m512 tmp13602 = _mm512_shuffle_f32x4(tmp13582, tmp13586, 221);
wt401 = _mm512_shuffle_f32x4(tmp13587, tmp13595, 136);
wt409 = _mm512_shuffle_f32x4(tmp13587, tmp13595, 221);
wt402 = _mm512_shuffle_f32x4(tmp13589, tmp13597, 136);
wt410 = _mm512_shuffle_f32x4(tmp13589, tmp13597, 221);
wt403 = _mm512_shuffle_f32x4(tmp13591, tmp13599, 136);
wt411 = _mm512_shuffle_f32x4(tmp13591, tmp13599, 221);
wt404 = _mm512_shuffle_f32x4(tmp13593, tmp13601, 136);
wt412 = _mm512_shuffle_f32x4(tmp13593, tmp13601, 221);
wt405 = _mm512_shuffle_f32x4(tmp13588, tmp13596, 136);
wt413 = _mm512_shuffle_f32x4(tmp13588, tmp13596, 221);
wt406 = _mm512_shuffle_f32x4(tmp13590, tmp13598, 136);
wt414 = _mm512_shuffle_f32x4(tmp13590, tmp13598, 221);
wt407 = _mm512_shuffle_f32x4(tmp13592, tmp13600, 136);
wt415 = _mm512_shuffle_f32x4(tmp13592, tmp13600, 221);
wt408 = _mm512_shuffle_f32x4(tmp13594, tmp13602, 136);
wt416 = _mm512_shuffle_f32x4(tmp13594, tmp13602, 221);
wt401 = _mm512_mul_ps(wt401, postMul39);
wt402 = _mm512_mul_ps(wt402, postMul39);
wt403 = _mm512_mul_ps(wt403, postMul39);
wt404 = _mm512_mul_ps(wt404, postMul39);
wt405 = _mm512_mul_ps(wt405, postMul39);
wt406 = _mm512_mul_ps(wt406, postMul39);
wt407 = _mm512_mul_ps(wt407, postMul39);
wt408 = _mm512_mul_ps(wt408, postMul39);
wt409 = _mm512_mul_ps(wt409, postMul39);
wt410 = _mm512_mul_ps(wt410, postMul39);
wt411 = _mm512_mul_ps(wt411, postMul39);
wt412 = _mm512_mul_ps(wt412, postMul39);
wt413 = _mm512_mul_ps(wt413, postMul39);
wt414 = _mm512_mul_ps(wt414, postMul39);
wt415 = _mm512_mul_ps(wt415, postMul39);
wt416 = _mm512_mul_ps(wt416, postMul39);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(1+16*c33)+(ptrdiff_t)0, 63>>cut17, wt401);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(2+16*c33)+(ptrdiff_t)0, 63>>cut17, wt402);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(3+16*c33)+(ptrdiff_t)0, 63>>cut17, wt403);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(4+16*c33)+(ptrdiff_t)0, 63>>cut17, wt404);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(5+16*c33)+(ptrdiff_t)0, 63>>cut17, wt405);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(6+16*c33)+(ptrdiff_t)0, 63>>cut17, wt406);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(7+16*c33)+(ptrdiff_t)0, 63>>cut17, wt407);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(8+16*c33)+(ptrdiff_t)0, 63>>cut17, wt408);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(9+16*c33)+(ptrdiff_t)0, 63>>cut17, wt409);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(10+16*c33)+(ptrdiff_t)0, 63>>cut17, wt410);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(11+16*c33)+(ptrdiff_t)0, 63>>cut17, wt411);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(12+16*c33)+(ptrdiff_t)0, 63>>cut17, wt412);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(13+16*c33)+(ptrdiff_t)0, 63>>cut17, wt413);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(14+16*c33)+(ptrdiff_t)0, 63>>cut17, wt414);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(15+16*c33)+(ptrdiff_t)0, 63>>cut17, wt415);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(16+16*c33)+(ptrdiff_t)0, 63>>cut17, wt416);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(1+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt401);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(2+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt402);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(3+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt403);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(4+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt404);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(5+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt405);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(6+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt406);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(7+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt407);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(8+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt408);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(9+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt409);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(10+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt410);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(11+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt411);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(12+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt412);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(13+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt413);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(14+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt414);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(15+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt415);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(16+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt416);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(1+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt401);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(2+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt402);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(3+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt403);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(4+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt404);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(5+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt405);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(6+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt406);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(7+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt407);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(8+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt408);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(9+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt409);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(10+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt410);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(11+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt411);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(12+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt412);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(13+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt413);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(14+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt414);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(15+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt415);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(16+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt416);
}
break;
}
default: {
cut17 = 4;
__m512 sum323 = _mm512_maskz_loadu_ps(65535, biasPtr12+512*i41+4*k123);
__m512i pmMul25 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd25 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo21 = _mm512_loadu_ps(bnPtr12+(ptrdiff_t)8*(k123+128*i41));
__m512 masHi21 = _mm512_maskz_loadu_ps(65535, bnPtr12+(ptrdiff_t)8*(k123+128*i41)+(ptrdiff_t)64);
__m512 postMul40 = _mm512_permutex2var_ps(masLo21, pmMul25, masHi21);
__m512 postAdd26 = _mm512_permutex2var_ps(masLo21, pmAdd25, masHi21);
sum323 = _mm512_fmadd_ps(sum323, postMul40, postAdd26);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*0+(ptrdiff_t)0, 63>>cut17, sum323);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*0+(ptrdiff_t)12288, 4032>>cut17, sum323);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*0+(ptrdiff_t)24576, 258048>>cut17, sum323);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*0+(ptrdiff_t)36864, 65535-(262143>>cut17), sum323);
ptrdiff_t c34 = 0;
for (; c34 != 32; ++c34) {
__m512 wt417 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)0);
__m512 wt418 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)2048);
__m512 wt419 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)4096);
__m512 wt420 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)6144);
__m512 wt421 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)8192);
__m512 wt422 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)10240);
__m512 wt423 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)12288);
__m512 wt424 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)14336);
__m512 wt425 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)16384);
__m512 wt426 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)18432);
__m512 wt427 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)20480);
__m512 wt428 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)22528);
__m512 wt429 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)24576);
__m512 wt430 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)26624);
__m512 wt431 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)28672);
__m512 wt432 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)30720);
__m512 tmp13603 = _mm512_unpacklo_ps(wt417, wt418);
__m512 tmp13604 = _mm512_unpackhi_ps(wt417, wt418);
__m512 tmp13605 = _mm512_unpacklo_ps(wt419, wt420);
__m512 tmp13606 = _mm512_unpackhi_ps(wt419, wt420);
__m512 tmp13607 = _mm512_unpacklo_ps(wt421, wt422);
__m512 tmp13608 = _mm512_unpackhi_ps(wt421, wt422);
__m512 tmp13609 = _mm512_unpacklo_ps(wt423, wt424);
__m512 tmp13610 = _mm512_unpackhi_ps(wt423, wt424);
__m512 tmp13611 = _mm512_unpacklo_ps(wt425, wt426);
__m512 tmp13612 = _mm512_unpackhi_ps(wt425, wt426);
__m512 tmp13613 = _mm512_unpacklo_ps(wt427, wt428);
__m512 tmp13614 = _mm512_unpackhi_ps(wt427, wt428);
__m512 tmp13615 = _mm512_unpacklo_ps(wt429, wt430);
__m512 tmp13616 = _mm512_unpackhi_ps(wt429, wt430);
__m512 tmp13617 = _mm512_unpacklo_ps(wt431, wt432);
__m512 tmp13618 = _mm512_unpackhi_ps(wt431, wt432);
__m512 tmp13619 = _mm512_shuffle_ps(tmp13603, tmp13605, 68);
__m512 tmp13620 = _mm512_shuffle_ps(tmp13603, tmp13605, 238);
__m512 tmp13621 = _mm512_shuffle_ps(tmp13604, tmp13606, 68);
__m512 tmp13622 = _mm512_shuffle_ps(tmp13604, tmp13606, 238);
__m512 tmp13623 = _mm512_shuffle_ps(tmp13607, tmp13609, 68);
__m512 tmp13624 = _mm512_shuffle_ps(tmp13607, tmp13609, 238);
__m512 tmp13625 = _mm512_shuffle_ps(tmp13608, tmp13610, 68);
__m512 tmp13626 = _mm512_shuffle_ps(tmp13608, tmp13610, 238);
__m512 tmp13627 = _mm512_shuffle_ps(tmp13611, tmp13613, 68);
__m512 tmp13628 = _mm512_shuffle_ps(tmp13611, tmp13613, 238);
__m512 tmp13629 = _mm512_shuffle_ps(tmp13612, tmp13614, 68);
__m512 tmp13630 = _mm512_shuffle_ps(tmp13612, tmp13614, 238);
__m512 tmp13631 = _mm512_shuffle_ps(tmp13615, tmp13617, 68);
__m512 tmp13632 = _mm512_shuffle_ps(tmp13615, tmp13617, 238);
__m512 tmp13633 = _mm512_shuffle_ps(tmp13616, tmp13618, 68);
__m512 tmp13634 = _mm512_shuffle_ps(tmp13616, tmp13618, 238);
__m512 tmp13635 = _mm512_shuffle_f32x4(tmp13619, tmp13623, 136);
__m512 tmp13636 = _mm512_shuffle_f32x4(tmp13619, tmp13623, 221);
__m512 tmp13637 = _mm512_shuffle_f32x4(tmp13620, tmp13624, 136);
__m512 tmp13638 = _mm512_shuffle_f32x4(tmp13620, tmp13624, 221);
__m512 tmp13639 = _mm512_shuffle_f32x4(tmp13621, tmp13625, 136);
__m512 tmp13640 = _mm512_shuffle_f32x4(tmp13621, tmp13625, 221);
__m512 tmp13641 = _mm512_shuffle_f32x4(tmp13622, tmp13626, 136);
__m512 tmp13642 = _mm512_shuffle_f32x4(tmp13622, tmp13626, 221);
__m512 tmp13643 = _mm512_shuffle_f32x4(tmp13627, tmp13631, 136);
__m512 tmp13644 = _mm512_shuffle_f32x4(tmp13627, tmp13631, 221);
__m512 tmp13645 = _mm512_shuffle_f32x4(tmp13628, tmp13632, 136);
__m512 tmp13646 = _mm512_shuffle_f32x4(tmp13628, tmp13632, 221);
__m512 tmp13647 = _mm512_shuffle_f32x4(tmp13629, tmp13633, 136);
__m512 tmp13648 = _mm512_shuffle_f32x4(tmp13629, tmp13633, 221);
__m512 tmp13649 = _mm512_shuffle_f32x4(tmp13630, tmp13634, 136);
__m512 tmp13650 = _mm512_shuffle_f32x4(tmp13630, tmp13634, 221);
wt417 = _mm512_shuffle_f32x4(tmp13635, tmp13643, 136);
wt425 = _mm512_shuffle_f32x4(tmp13635, tmp13643, 221);
wt418 = _mm512_shuffle_f32x4(tmp13637, tmp13645, 136);
wt426 = _mm512_shuffle_f32x4(tmp13637, tmp13645, 221);
wt419 = _mm512_shuffle_f32x4(tmp13639, tmp13647, 136);
wt427 = _mm512_shuffle_f32x4(tmp13639, tmp13647, 221);
wt420 = _mm512_shuffle_f32x4(tmp13641, tmp13649, 136);
wt428 = _mm512_shuffle_f32x4(tmp13641, tmp13649, 221);
wt421 = _mm512_shuffle_f32x4(tmp13636, tmp13644, 136);
wt429 = _mm512_shuffle_f32x4(tmp13636, tmp13644, 221);
wt422 = _mm512_shuffle_f32x4(tmp13638, tmp13646, 136);
wt430 = _mm512_shuffle_f32x4(tmp13638, tmp13646, 221);
wt423 = _mm512_shuffle_f32x4(tmp13640, tmp13648, 136);
wt431 = _mm512_shuffle_f32x4(tmp13640, tmp13648, 221);
wt424 = _mm512_shuffle_f32x4(tmp13642, tmp13650, 136);
wt432 = _mm512_shuffle_f32x4(tmp13642, tmp13650, 221);
wt417 = _mm512_mul_ps(wt417, postMul40);
wt418 = _mm512_mul_ps(wt418, postMul40);
wt419 = _mm512_mul_ps(wt419, postMul40);
wt420 = _mm512_mul_ps(wt420, postMul40);
wt421 = _mm512_mul_ps(wt421, postMul40);
wt422 = _mm512_mul_ps(wt422, postMul40);
wt423 = _mm512_mul_ps(wt423, postMul40);
wt424 = _mm512_mul_ps(wt424, postMul40);
wt425 = _mm512_mul_ps(wt425, postMul40);
wt426 = _mm512_mul_ps(wt426, postMul40);
wt427 = _mm512_mul_ps(wt427, postMul40);
wt428 = _mm512_mul_ps(wt428, postMul40);
wt429 = _mm512_mul_ps(wt429, postMul40);
wt430 = _mm512_mul_ps(wt430, postMul40);
wt431 = _mm512_mul_ps(wt431, postMul40);
wt432 = _mm512_mul_ps(wt432, postMul40);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(1+16*c34)+(ptrdiff_t)0, 63>>cut17, wt417);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(2+16*c34)+(ptrdiff_t)0, 63>>cut17, wt418);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(3+16*c34)+(ptrdiff_t)0, 63>>cut17, wt419);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(4+16*c34)+(ptrdiff_t)0, 63>>cut17, wt420);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(5+16*c34)+(ptrdiff_t)0, 63>>cut17, wt421);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(6+16*c34)+(ptrdiff_t)0, 63>>cut17, wt422);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(7+16*c34)+(ptrdiff_t)0, 63>>cut17, wt423);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(8+16*c34)+(ptrdiff_t)0, 63>>cut17, wt424);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(9+16*c34)+(ptrdiff_t)0, 63>>cut17, wt425);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(10+16*c34)+(ptrdiff_t)0, 63>>cut17, wt426);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(11+16*c34)+(ptrdiff_t)0, 63>>cut17, wt427);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(12+16*c34)+(ptrdiff_t)0, 63>>cut17, wt428);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(13+16*c34)+(ptrdiff_t)0, 63>>cut17, wt429);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(14+16*c34)+(ptrdiff_t)0, 63>>cut17, wt430);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(15+16*c34)+(ptrdiff_t)0, 63>>cut17, wt431);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(16+16*c34)+(ptrdiff_t)0, 63>>cut17, wt432);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(1+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt417);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(2+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt418);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(3+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt419);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(4+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt420);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(5+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt421);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(6+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt422);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(7+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt423);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(8+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt424);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(9+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt425);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(10+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt426);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(11+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt427);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(12+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt428);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(13+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt429);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(14+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt430);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(15+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt431);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(16+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt432);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(1+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt417);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(2+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt418);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(3+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt419);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(4+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt420);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(5+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt421);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(6+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt422);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(7+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt423);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(8+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt424);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(9+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt425);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(10+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt426);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(11+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt427);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(12+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt428);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(13+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt429);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(14+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt430);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(15+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt431);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(16+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt432);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(1+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt417);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(2+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt418);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(3+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt419);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(4+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt420);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(5+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt421);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(6+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt422);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(7+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt423);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(8+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt424);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(9+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt425);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(10+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt426);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(11+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt427);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(12+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt428);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(13+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt429);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(14+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt430);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(15+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt431);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(16+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt432);
}
}
}
} else {
ptrdiff_t k122 = 112;
ptrdiff_t l50 = (size_t)(0+k122)/6;
ptrdiff_t cut16 = (size_t)(0+k122)%6;
__m512 sum321 = _mm512_maskz_loadu_ps(65535, biasPtr12+512*i41+4*k122);
__m512i pmMul26 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd26 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo22 = _mm512_loadu_ps(bnPtr12+(ptrdiff_t)8*(k122+128*i41));
__m512 masHi22 = _mm512_maskz_loadu_ps(65535, bnPtr12+(ptrdiff_t)8*(k122+128*i41)+(ptrdiff_t)64);
__m512 postMul38 = _mm512_permutex2var_ps(masLo22, pmMul26, masHi22);
__m512 postAdd24 = _mm512_permutex2var_ps(masLo22, pmAdd26, masHi22);
sum321 = _mm512_fmadd_ps(sum321, postMul38, postAdd24);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*0+(ptrdiff_t)0, 63>>cut16, sum321);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*0+(ptrdiff_t)12288, 4032>>cut16, sum321);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*0+(ptrdiff_t)24576, 258048>>cut16, sum321);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*0+(ptrdiff_t)36864, 65535-(262143>>cut16), sum321);
ptrdiff_t c32 = 0;
for (; c32 != 32; ++c32) {
__m512 wt385 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)0);
__m512 wt386 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)2048);
__m512 wt387 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)4096);
__m512 wt388 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)6144);
__m512 wt389 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)8192);
__m512 wt390 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)10240);
__m512 wt391 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)12288);
__m512 wt392 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)14336);
__m512 wt393 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)16384);
__m512 wt394 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)18432);
__m512 wt395 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)20480);
__m512 wt396 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)22528);
__m512 wt397 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)24576);
__m512 wt398 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)26624);
__m512 wt399 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)28672);
__m512 wt400 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)30720);
__m512 tmp13651 = _mm512_unpacklo_ps(wt385, wt386);
__m512 tmp13652 = _mm512_unpackhi_ps(wt385, wt386);
__m512 tmp13653 = _mm512_unpacklo_ps(wt387, wt388);
__m512 tmp13654 = _mm512_unpackhi_ps(wt387, wt388);
__m512 tmp13655 = _mm512_unpacklo_ps(wt389, wt390);
__m512 tmp13656 = _mm512_unpackhi_ps(wt389, wt390);
__m512 tmp13657 = _mm512_unpacklo_ps(wt391, wt392);
__m512 tmp13658 = _mm512_unpackhi_ps(wt391, wt392);
__m512 tmp13659 = _mm512_unpacklo_ps(wt393, wt394);
__m512 tmp13660 = _mm512_unpackhi_ps(wt393, wt394);
__m512 tmp13661 = _mm512_unpacklo_ps(wt395, wt396);
__m512 tmp13662 = _mm512_unpackhi_ps(wt395, wt396);
__m512 tmp13663 = _mm512_unpacklo_ps(wt397, wt398);
__m512 tmp13664 = _mm512_unpackhi_ps(wt397, wt398);
__m512 tmp13665 = _mm512_unpacklo_ps(wt399, wt400);
__m512 tmp13666 = _mm512_unpackhi_ps(wt399, wt400);
__m512 tmp13667 = _mm512_shuffle_ps(tmp13651, tmp13653, 68);
__m512 tmp13668 = _mm512_shuffle_ps(tmp13651, tmp13653, 238);
__m512 tmp13669 = _mm512_shuffle_ps(tmp13652, tmp13654, 68);
__m512 tmp13670 = _mm512_shuffle_ps(tmp13652, tmp13654, 238);
__m512 tmp13671 = _mm512_shuffle_ps(tmp13655, tmp13657, 68);
__m512 tmp13672 = _mm512_shuffle_ps(tmp13655, tmp13657, 238);
__m512 tmp13673 = _mm512_shuffle_ps(tmp13656, tmp13658, 68);
__m512 tmp13674 = _mm512_shuffle_ps(tmp13656, tmp13658, 238);
__m512 tmp13675 = _mm512_shuffle_ps(tmp13659, tmp13661, 68);
__m512 tmp13676 = _mm512_shuffle_ps(tmp13659, tmp13661, 238);
__m512 tmp13677 = _mm512_shuffle_ps(tmp13660, tmp13662, 68);
__m512 tmp13678 = _mm512_shuffle_ps(tmp13660, tmp13662, 238);
__m512 tmp13679 = _mm512_shuffle_ps(tmp13663, tmp13665, 68);
__m512 tmp13680 = _mm512_shuffle_ps(tmp13663, tmp13665, 238);
__m512 tmp13681 = _mm512_shuffle_ps(tmp13664, tmp13666, 68);
__m512 tmp13682 = _mm512_shuffle_ps(tmp13664, tmp13666, 238);
__m512 tmp13683 = _mm512_shuffle_f32x4(tmp13667, tmp13671, 136);
__m512 tmp13684 = _mm512_shuffle_f32x4(tmp13667, tmp13671, 221);
__m512 tmp13685 = _mm512_shuffle_f32x4(tmp13668, tmp13672, 136);
__m512 tmp13686 = _mm512_shuffle_f32x4(tmp13668, tmp13672, 221);
__m512 tmp13687 = _mm512_shuffle_f32x4(tmp13669, tmp13673, 136);
__m512 tmp13688 = _mm512_shuffle_f32x4(tmp13669, tmp13673, 221);
__m512 tmp13689 = _mm512_shuffle_f32x4(tmp13670, tmp13674, 136);
__m512 tmp13690 = _mm512_shuffle_f32x4(tmp13670, tmp13674, 221);
__m512 tmp13691 = _mm512_shuffle_f32x4(tmp13675, tmp13679, 136);
__m512 tmp13692 = _mm512_shuffle_f32x4(tmp13675, tmp13679, 221);
__m512 tmp13693 = _mm512_shuffle_f32x4(tmp13676, tmp13680, 136);
__m512 tmp13694 = _mm512_shuffle_f32x4(tmp13676, tmp13680, 221);
__m512 tmp13695 = _mm512_shuffle_f32x4(tmp13677, tmp13681, 136);
__m512 tmp13696 = _mm512_shuffle_f32x4(tmp13677, tmp13681, 221);
__m512 tmp13697 = _mm512_shuffle_f32x4(tmp13678, tmp13682, 136);
__m512 tmp13698 = _mm512_shuffle_f32x4(tmp13678, tmp13682, 221);
wt385 = _mm512_shuffle_f32x4(tmp13683, tmp13691, 136);
wt393 = _mm512_shuffle_f32x4(tmp13683, tmp13691, 221);
wt386 = _mm512_shuffle_f32x4(tmp13685, tmp13693, 136);
wt394 = _mm512_shuffle_f32x4(tmp13685, tmp13693, 221);
wt387 = _mm512_shuffle_f32x4(tmp13687, tmp13695, 136);
wt395 = _mm512_shuffle_f32x4(tmp13687, tmp13695, 221);
wt388 = _mm512_shuffle_f32x4(tmp13689, tmp13697, 136);
wt396 = _mm512_shuffle_f32x4(tmp13689, tmp13697, 221);
wt389 = _mm512_shuffle_f32x4(tmp13684, tmp13692, 136);
wt397 = _mm512_shuffle_f32x4(tmp13684, tmp13692, 221);
wt390 = _mm512_shuffle_f32x4(tmp13686, tmp13694, 136);
wt398 = _mm512_shuffle_f32x4(tmp13686, tmp13694, 221);
wt391 = _mm512_shuffle_f32x4(tmp13688, tmp13696, 136);
wt399 = _mm512_shuffle_f32x4(tmp13688, tmp13696, 221);
wt392 = _mm512_shuffle_f32x4(tmp13690, tmp13698, 136);
wt400 = _mm512_shuffle_f32x4(tmp13690, tmp13698, 221);
wt385 = _mm512_mul_ps(wt385, postMul38);
wt386 = _mm512_mul_ps(wt386, postMul38);
wt387 = _mm512_mul_ps(wt387, postMul38);
wt388 = _mm512_mul_ps(wt388, postMul38);
wt389 = _mm512_mul_ps(wt389, postMul38);
wt390 = _mm512_mul_ps(wt390, postMul38);
wt391 = _mm512_mul_ps(wt391, postMul38);
wt392 = _mm512_mul_ps(wt392, postMul38);
wt393 = _mm512_mul_ps(wt393, postMul38);
wt394 = _mm512_mul_ps(wt394, postMul38);
wt395 = _mm512_mul_ps(wt395, postMul38);
wt396 = _mm512_mul_ps(wt396, postMul38);
wt397 = _mm512_mul_ps(wt397, postMul38);
wt398 = _mm512_mul_ps(wt398, postMul38);
wt399 = _mm512_mul_ps(wt399, postMul38);
wt400 = _mm512_mul_ps(wt400, postMul38);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(1+16*c32)+(ptrdiff_t)0, 63>>cut16, wt385);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(2+16*c32)+(ptrdiff_t)0, 63>>cut16, wt386);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(3+16*c32)+(ptrdiff_t)0, 63>>cut16, wt387);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(4+16*c32)+(ptrdiff_t)0, 63>>cut16, wt388);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(5+16*c32)+(ptrdiff_t)0, 63>>cut16, wt389);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(6+16*c32)+(ptrdiff_t)0, 63>>cut16, wt390);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(7+16*c32)+(ptrdiff_t)0, 63>>cut16, wt391);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(8+16*c32)+(ptrdiff_t)0, 63>>cut16, wt392);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(9+16*c32)+(ptrdiff_t)0, 63>>cut16, wt393);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(10+16*c32)+(ptrdiff_t)0, 63>>cut16, wt394);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(11+16*c32)+(ptrdiff_t)0, 63>>cut16, wt395);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(12+16*c32)+(ptrdiff_t)0, 63>>cut16, wt396);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(13+16*c32)+(ptrdiff_t)0, 63>>cut16, wt397);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(14+16*c32)+(ptrdiff_t)0, 63>>cut16, wt398);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(15+16*c32)+(ptrdiff_t)0, 63>>cut16, wt399);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(16+16*c32)+(ptrdiff_t)0, 63>>cut16, wt400);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(1+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt385);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(2+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt386);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(3+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt387);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(4+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt388);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(5+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt389);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(6+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt390);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(7+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt391);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(8+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt392);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(9+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt393);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(10+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt394);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(11+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt395);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(12+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt396);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(13+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt397);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(14+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt398);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(15+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt399);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(16+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt400);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(1+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt385);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(2+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt386);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(3+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt387);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(4+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt388);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(5+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt389);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(6+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt390);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(7+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt391);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(8+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt392);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(9+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt393);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(10+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt394);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(11+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt395);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(12+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt396);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(13+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt397);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(14+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt398);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(15+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt399);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(16+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt400);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(1+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt385);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(2+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt386);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(3+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt387);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(4+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt388);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(5+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt389);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(6+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt390);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(7+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt391);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(8+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt392);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(9+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt393);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(10+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt394);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(11+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt395);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(12+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt396);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(13+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt397);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(14+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt398);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(15+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt399);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(16+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt400);
}
}
}
}
}

// Builds a threader task for ResNet50OneArrangeWts6Callee1 and submits it.
//
// team46:    threader team forwarded unchanged to ResNet50ThreaderDo1.
// tensors65: array of tensor base pointers, forwarded to the callee via any1.
//
// The task describes a 3-dimensional hull with extents 8 x 1 x 1.
// NOTE(review): ResNet50ThreaderDo1 is defined elsewhere; presumably it
// invokes the callee once per point of the hull and returns only after all
// points have completed -- confirm against the threader implementation.
static void ResNet50OneArrangeWts6(ResNet50ThreaderTeam1* team46, char** tensors65) {
	ResNet50ThreaderTask1 job;
	// Shape of the iteration space.
	job.nd1 = 3;
	job.hull1[0] = 8;
	job.hull1[1] = 1;
	job.hull1[2] = 1;
	// Payload and entry point.
	job.any1 = tensors65;
	job.callee1 = ResNet50OneArrangeWts6Callee1;
	ResNet50ThreaderDo1(team46, &job);
}

// Copies one (k-block, j-chunk) tile from the source tensor into the
// arranged data buffer, re-striding it on the way.
//
// task70: task whose any1 field is read as a char** of tensor base pointers;
//         element 0 is the source, element 1 is the arranged destination.
// pt40:   task coordinates. pt40[0] picks a run of 128 consecutive k indices
//         starting at 128*pt40[0]; pt40[1] picks the j chunk.
//
// Source layout (bytes):      256 per j, 3136 per k.
// Destination layout (bytes): 131072 per j, then 256 per k for j != 12
//                             and 64 per k for j == 12.
//
// For j != 12 each k moves four 64-byte vectors (256 bytes). For j == 12
// each k moves a single 64-byte vector: 3136 - 12*256 = 64 bytes remain.
// NOTE(review): 3136 bytes = 784 floats, presumably one 28x28 plane per k
// (channel) -- confirm against the layer shape.
static void ResNet50OneArrangeDats6Callee1(ResNet50ThreaderTask1* task70, int64_t* pt40) {
	char** bufs = task70->any1;
	ptrdiff_t kBlock = pt40[0];
	ptrdiff_t chunk = pt40[1];
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];
	ptrdiff_t kFirst = 128*kBlock;
	ptrdiff_t kLimit = kFirst+128;

	if (chunk != 12) {
		// Full-width chunk: four whole vectors per k.
		for (ptrdiff_t k = kFirst; k < kLimit; ++k) {
			char* from = src+256*chunk+3136*k;
			char* to = dst+131072*chunk+256*k;
			// All loads are issued before any store, as in the generated code.
			__m512 v0 = _mm512_maskz_loadu_ps(0xFFFF, from);
			__m512 v1 = _mm512_maskz_loadu_ps(0xFFFF, from+64);
			__m512 v2 = _mm512_maskz_loadu_ps(0xFFFF, from+128);
			__m512 v3 = _mm512_maskz_loadu_ps(0xFFFF, from+192);
			_mm512_mask_storeu_ps(to, 0xFFFF, v0);
			_mm512_mask_storeu_ps(to+64, 0xFFFF, v1);
			_mm512_mask_storeu_ps(to+128, 0xFFFF, v2);
			_mm512_mask_storeu_ps(to+192, 0xFFFF, v3);
		}
		return;
	}

	// Remainder chunk (j == 12): one vector per k, packed at a 64-byte stride.
	for (ptrdiff_t k = kFirst; k < kLimit; ++k) {
		__m512 v = _mm512_maskz_loadu_ps(0xFFFF, src+256*chunk+3136*k);
		_mm512_mask_storeu_ps(dst+131072*chunk+64*k, 0xFFFF, v);
	}
}

// Builds a threader task for ResNet50OneArrangeDats6Callee1 and submits it.
//
// team47:    threader team forwarded unchanged to ResNet50ThreaderDo1.
// tensors67: array of tensor base pointers, forwarded to the callee via any1.
//
// The task describes a 4-dimensional hull with extents 4 x 13 x 1 x 1. The
// callee reads coordinate 0 as its k-block index and coordinate 1 as its
// j-chunk index.
// NOTE(review): ResNet50ThreaderDo1 is defined elsewhere; presumably it
// invokes the callee once per point of the hull -- confirm against the
// threader implementation.
static void ResNet50OneArrangeDats6(ResNet50ThreaderTeam1* team47, char** tensors67) {
	ResNet50ThreaderTask1 job;
	// Shape of the iteration space.
	job.nd1 = 4;
	job.hull1[0] = 4;
	job.hull1[1] = 13;
	job.hull1[2] = 1;
	job.hull1[3] = 1;
	// Payload and entry point.
	job.any1 = tensors67;
	job.callee1 = ResNet50OneArrangeDats6Callee1;
	ResNet50ThreaderDo1(team47, &job);
}

static void ResNet50OneApply6Callee1(ResNet50ThreaderTask1* task72, int64_t* pt41) {
void** pair18 = task72->any1;
char** tensors70 = pair18[0];
ptrdiff_t e20 = 0;
ptrdiff_t g23 = 0;
ptrdiff_t d14 = pt41[1];
ptrdiff_t w56 = pt41[0];
char*restrict arrangedWts6 = tensors70[0]+428032*e20+(ptrdiff_t)262656*1*g23;
char*restrict arrangedDats6 = tensors70[1]+2618560*e20+(ptrdiff_t)1605632*1*g23;
char*restrict datPtr22 = tensors70[2]+(ptrdiff_t)401408*1*g23;
ptrdiff_t ii18 = 1;
for (ptrdiff_t i43 = 0; i43 < ii18; ++i43) {
ptrdiff_t j36 = 1*d14;
ptrdiff_t jj39 = j36+0;
for (; j36 != 12; ++j36) {
ptrdiff_t k126 = 1*w56;
ptrdiff_t kk41 = k126+0;
for (; k126 != 21; ++k126) {
ptrdiff_t s31 = -1;
__m512 sum324 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)24));
__m512 sum328 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)28));
__m512 sum332 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)32));
__m512 sum336 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)36));
__m512 sum340 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)40));
__m512 sum344 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)44));
__m512 sum325 = sum324;
__m512 sum326 = sum324;
__m512 sum327 = sum324;
__m512 sum329 = sum328;
__m512 sum330 = sum328;
__m512 sum331 = sum328;
__m512 sum333 = sum332;
__m512 sum334 = sum332;
__m512 sum335 = sum332;
__m512 sum337 = sum336;
__m512 sum338 = sum336;
__m512 sum339 = sum336;
__m512 sum341 = sum340;
__m512 sum342 = sum340;
__m512 sum343 = sum340;
__m512 sum345 = sum344;
__m512 sum346 = sum344;
__m512 sum347 = sum344;
for (s31 = 0; s31 < 512; ++s31) {
__m512 dat1931 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+256*s31+(ptrdiff_t)0);
__m512 dat1932 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+256*s31+(ptrdiff_t)64);
__m512 dat1933 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+256*s31+(ptrdiff_t)128);
__m512 dat1934 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+256*s31+(ptrdiff_t)192);
__m512 wt433 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)24));
sum324 = _mm512_fmadd_ps(wt433, dat1931, sum324);
sum325 = _mm512_fmadd_ps(wt433, dat1932, sum325);
sum326 = _mm512_fmadd_ps(wt433, dat1933, sum326);
sum327 = _mm512_fmadd_ps(wt433, dat1934, sum327);
__m512 wt434 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)28));
sum328 = _mm512_fmadd_ps(wt434, dat1931, sum328);
sum329 = _mm512_fmadd_ps(wt434, dat1932, sum329);
sum330 = _mm512_fmadd_ps(wt434, dat1933, sum330);
sum331 = _mm512_fmadd_ps(wt434, dat1934, sum331);
__m512 wt435 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)32));
sum332 = _mm512_fmadd_ps(wt435, dat1931, sum332);
sum333 = _mm512_fmadd_ps(wt435, dat1932, sum333);
sum334 = _mm512_fmadd_ps(wt435, dat1933, sum334);
sum335 = _mm512_fmadd_ps(wt435, dat1934, sum335);
__m512 wt436 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)36));
sum336 = _mm512_fmadd_ps(wt436, dat1931, sum336);
sum337 = _mm512_fmadd_ps(wt436, dat1932, sum337);
sum338 = _mm512_fmadd_ps(wt436, dat1933, sum338);
sum339 = _mm512_fmadd_ps(wt436, dat1934, sum339);
__m512 wt437 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)40));
sum340 = _mm512_fmadd_ps(wt437, dat1931, sum340);
sum341 = _mm512_fmadd_ps(wt437, dat1932, sum341);
sum342 = _mm512_fmadd_ps(wt437, dat1933, sum342);
sum343 = _mm512_fmadd_ps(wt437, dat1934, sum343);
__m512 wt438 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)44));
sum344 = _mm512_fmadd_ps(wt438, dat1931, sum344);
sum345 = _mm512_fmadd_ps(wt438, dat1932, sum345);
sum346 = _mm512_fmadd_ps(wt438, dat1933, sum346);
sum347 = _mm512_fmadd_ps(wt438, dat1934, sum347);
}
sum324 = _mm512_max_ps(_mm512_setzero_ps(), sum324);
sum325 = _mm512_max_ps(_mm512_setzero_ps(), sum325);
sum326 = _mm512_max_ps(_mm512_setzero_ps(), sum326);
sum327 = _mm512_max_ps(_mm512_setzero_ps(), sum327);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)0, 65535, sum324);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)64, 65535, sum325);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)128, 65535, sum326);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)192, 65535, sum327);
sum328 = _mm512_max_ps(_mm512_setzero_ps(), sum328);
sum329 = _mm512_max_ps(_mm512_setzero_ps(), sum329);
sum330 = _mm512_max_ps(_mm512_setzero_ps(), sum330);
sum331 = _mm512_max_ps(_mm512_setzero_ps(), sum331);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)3136, 65535, sum328);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)3200, 65535, sum329);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)3264, 65535, sum330);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)3328, 65535, sum331);
sum332 = _mm512_max_ps(_mm512_setzero_ps(), sum332);
sum333 = _mm512_max_ps(_mm512_setzero_ps(), sum333);
sum334 = _mm512_max_ps(_mm512_setzero_ps(), sum334);
sum335 = _mm512_max_ps(_mm512_setzero_ps(), sum335);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)6272, 65535, sum332);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)6336, 65535, sum333);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)6400, 65535, sum334);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)6464, 65535, sum335);
sum336 = _mm512_max_ps(_mm512_setzero_ps(), sum336);
sum337 = _mm512_max_ps(_mm512_setzero_ps(), sum337);
sum338 = _mm512_max_ps(_mm512_setzero_ps(), sum338);
sum339 = _mm512_max_ps(_mm512_setzero_ps(), sum339);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)9408, 65535, sum336);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)9472, 65535, sum337);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)9536, 65535, sum338);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)9600, 65535, sum339);
sum340 = _mm512_max_ps(_mm512_setzero_ps(), sum340);
sum341 = _mm512_max_ps(_mm512_setzero_ps(), sum341);
sum342 = _mm512_max_ps(_mm512_setzero_ps(), sum342);
sum343 = _mm512_max_ps(_mm512_setzero_ps(), sum343);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)12544, 65535, sum340);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)12608, 65535, sum341);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)12672, 65535, sum342);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)12736, 65535, sum343);
sum344 = _mm512_max_ps(_mm512_setzero_ps(), sum344);
sum345 = _mm512_max_ps(_mm512_setzero_ps(), sum345);
sum346 = _mm512_max_ps(_mm512_setzero_ps(), sum346);
sum347 = _mm512_max_ps(_mm512_setzero_ps(), sum347);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)15680, 65535, sum344);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)15744, 65535, sum345);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)15808, 65535, sum346);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)15872, 65535, sum347);
if (k126 >= kk41) return;
}
ptrdiff_t s32 = -1;
__m512 sum348 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+8*s32+(ptrdiff_t)8));
__m512 sum352 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+8*s32+(ptrdiff_t)12));
__m512 sum349 = sum348;
__m512 sum350 = sum348;
__m512 sum351 = sum348;
__m512 sum353 = sum352;
__m512 sum354 = sum352;
__m512 sum355 = sum352;
for (s32 = 0; s32 < 512; ++s32) {
__m512 dat1935 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+256*s32+(ptrdiff_t)0);
__m512 dat1936 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+256*s32+(ptrdiff_t)64);
__m512 dat1937 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+256*s32+(ptrdiff_t)128);
__m512 dat1938 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+256*s32+(ptrdiff_t)192);
__m512 wt439 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+8*s32+(ptrdiff_t)8));
sum348 = _mm512_fmadd_ps(wt439, dat1935, sum348);
sum349 = _mm512_fmadd_ps(wt439, dat1936, sum349);
sum350 = _mm512_fmadd_ps(wt439, dat1937, sum350);
sum351 = _mm512_fmadd_ps(wt439, dat1938, sum351);
__m512 wt440 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+8*s32+(ptrdiff_t)12));
sum352 = _mm512_fmadd_ps(wt440, dat1935, sum352);
sum353 = _mm512_fmadd_ps(wt440, dat1936, sum353);
sum354 = _mm512_fmadd_ps(wt440, dat1937, sum354);
sum355 = _mm512_fmadd_ps(wt440, dat1938, sum355);
}
sum348 = _mm512_max_ps(_mm512_setzero_ps(), sum348);
sum349 = _mm512_max_ps(_mm512_setzero_ps(), sum349);
sum350 = _mm512_max_ps(_mm512_setzero_ps(), sum350);
sum351 = _mm512_max_ps(_mm512_setzero_ps(), sum351);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)0, 65535, sum348);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)64, 65535, sum349);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)128, 65535, sum350);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)192, 65535, sum351);
sum352 = _mm512_max_ps(_mm512_setzero_ps(), sum352);
sum353 = _mm512_max_ps(_mm512_setzero_ps(), sum353);
sum354 = _mm512_max_ps(_mm512_setzero_ps(), sum354);
sum355 = _mm512_max_ps(_mm512_setzero_ps(), sum355);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)3136, 65535, sum352);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)3200, 65535, sum353);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)3264, 65535, sum354);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)3328, 65535, sum355);
if (j36 >= jj39) return;
}
ptrdiff_t k127 = 1*w56;
ptrdiff_t kk42 = k127+0;
for (; k127 != 21; ++k127) {
ptrdiff_t s33 = -1;
__m512 sum356 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)24));
__m512 sum357 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)28));
__m512 sum358 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)32));
__m512 sum359 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)36));
__m512 sum360 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)40));
__m512 sum361 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)44));
for (s33 = 0; s33 < 512; ++s33) {
__m512 dat1939 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+64*s33+(ptrdiff_t)0);
__m512 wt441 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)24));
sum356 = _mm512_fmadd_ps(wt441, dat1939, sum356);
__m512 wt442 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)28));
sum357 = _mm512_fmadd_ps(wt442, dat1939, sum357);
__m512 wt443 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)32));
sum358 = _mm512_fmadd_ps(wt443, dat1939, sum358);
__m512 wt444 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)36));
sum359 = _mm512_fmadd_ps(wt444, dat1939, sum359);
__m512 wt445 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)40));
sum360 = _mm512_fmadd_ps(wt445, dat1939, sum360);
__m512 wt446 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)44));
sum361 = _mm512_fmadd_ps(wt446, dat1939, sum361);
}
sum356 = _mm512_max_ps(_mm512_setzero_ps(), sum356);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k127+(ptrdiff_t)0, 65535, sum356);
sum357 = _mm512_max_ps(_mm512_setzero_ps(), sum357);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k127+(ptrdiff_t)3136, 65535, sum357);
sum358 = _mm512_max_ps(_mm512_setzero_ps(), sum358);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k127+(ptrdiff_t)6272, 65535, sum358);
sum359 = _mm512_max_ps(_mm512_setzero_ps(), sum359);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k127+(ptrdiff_t)9408, 65535, sum359);
sum360 = _mm512_max_ps(_mm512_setzero_ps(), sum360);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k127+(ptrdiff_t)12544, 65535, sum360);
sum361 = _mm512_max_ps(_mm512_setzero_ps(), sum361);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k127+(ptrdiff_t)15680, 65535, sum361);
if (k127 >= kk42) return;
}
ptrdiff_t s34 = -1;
__m512 sum362 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+8*s34+(ptrdiff_t)8));
__m512 sum363 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+8*s34+(ptrdiff_t)12));
for (s34 = 0; s34 < 512; ++s34) {
__m512 dat1940 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+64*s34+(ptrdiff_t)0);
__m512 wt447 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+8*s34+(ptrdiff_t)8));
sum362 = _mm512_fmadd_ps(wt447, dat1940, sum362);
__m512 wt448 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+8*s34+(ptrdiff_t)12));
sum363 = _mm512_fmadd_ps(wt448, dat1940, sum363);
}
sum362 = _mm512_max_ps(_mm512_setzero_ps(), sum362);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k127+(ptrdiff_t)0, 65535, sum362);
sum363 = _mm512_max_ps(_mm512_setzero_ps(), sum363);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k127+(ptrdiff_t)3136, 65535, sum363);
}
}

/*
 * Dispatch ResNet50OneApply6Callee1 through the threader.
 *
 * team48    - threader team handed straight to ResNet50ThreaderDo1.
 * tensors69 - tensor pointer table; placed in slot 0 of the payload array
 *             that the callee receives through the task's any1 field.
 *
 * The task is described as 3-dimensional with hull {22, 13, 1}.
 * NOTE(review): the hull entries are presumably the per-dimension iteration
 * extents consumed by ResNet50ThreaderDo1 -- confirm against the threader.
 */
static void ResNet50OneApply6(ResNet50ThreaderTeam1* team48, char** tensors69) {
    /* Payload for the callee: slot 0 is the tensor table, slot 1 is zero. */
    void* payload[2];
    payload[0] = tensors69;
    payload[1] = 0;

    /* Hull values, copied into the task one dimension at a time. */
    const int extents[3] = {22, 13, 1};

    ResNet50ThreaderTask1 job;
    job.nd1 = 3;
    for (int dim = 0; dim < 3; ++dim) {
        job.hull1[dim] = extents[dim];
    }
    job.any1 = payload;
    job.callee1 = ResNet50OneApply6Callee1;

    /* payload and job are locals of this frame and are passed by address. */
    ResNet50ThreaderDo1(team48, &job);
}

static void ResNet50OneArrangeWts7Callee1(ResNet50ThreaderTask1* task82, int64_t* pt46) {
char** tensors80 = task82->any1;
ptrdiff_t b62 = pt46[0];
char*restrict wtPtr14 = tensors80[0]+(ptrdiff_t)3340*0+(ptrdiff_t)2621440*0;
char*restrict biasPtr14 = tensors80[1]+(ptrdiff_t)5120*0;
char*restrict bnPtr14 = tensors80[2]+(ptrdiff_t)8*1280*0;
char*restrict wtPtr15 = tensors80[3]+(ptrdiff_t)3340*0+(ptrdiff_t)2621440*0;
char*restrict biasPtr15 = tensors80[4]+(ptrdiff_t)5120*0;
char*restrict bnPtr15 = tensors80[5]+(ptrdiff_t)8*1280*0;
char*restrict arranged13 = tensors80[6]+(ptrdiff_t)4280320*0+(ptrdiff_t)2626560*0;
ptrdiff_t ii19 = 1;
for (ptrdiff_t i49 = 0; i49 < ii19; ++i49) {
ptrdiff_t j41 = 1*b62;
ptrdiff_t jj41 = j41+1;
for (; j41 < jj41; ++j41) {
if (j41 < 64) {
ptrdiff_t k140 = 0+16*(j41-0);
ptrdiff_t l59 = (size_t)(0+k140)/6;
ptrdiff_t cut19 = (size_t)(0+k140)%6;
switch (cut19) {
case 0:;
case 2: {
__m512 sum392 = _mm512_maskz_loadu_ps(65535, biasPtr14+5120*i49+4*k140);
__m512i pmMul28 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd28 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo23 = _mm512_loadu_ps(bnPtr14+(ptrdiff_t)8*(k140+1280*i49));
__m512 masHi23 = _mm512_maskz_loadu_ps(65535, bnPtr14+(ptrdiff_t)8*(k140+1280*i49)+(ptrdiff_t)64);
__m512 postMul46 = _mm512_permutex2var_ps(masLo23, pmMul28, masHi23);
__m512 postAdd28 = _mm512_permutex2var_ps(masLo23, pmAdd28, masHi23);
sum392 = _mm512_fmadd_ps(sum392, postMul46, postAdd28);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*0+(ptrdiff_t)0, 63>>cut19, sum392);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*0+(ptrdiff_t)12288, 4032>>cut19, sum392);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*0+(ptrdiff_t)24576, 65535-(4095>>cut19), sum392);
ptrdiff_t c37 = 0;
for (; c37 != 32; ++c37) {
__m512 wt453 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)0);
__m512 wt454 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)2048);
__m512 wt455 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)4096);
__m512 wt456 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)6144);
__m512 wt457 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)8192);
__m512 wt458 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)10240);
__m512 wt459 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)12288);
__m512 wt460 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)14336);
__m512 wt461 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)16384);
__m512 wt462 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)18432);
__m512 wt463 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)20480);
__m512 wt464 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)22528);
__m512 wt465 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)24576);
__m512 wt466 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)26624);
__m512 wt467 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)28672);
__m512 wt468 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)30720);
__m512 tmp16317 = _mm512_unpacklo_ps(wt453, wt454);
__m512 tmp16318 = _mm512_unpackhi_ps(wt453, wt454);
__m512 tmp16319 = _mm512_unpacklo_ps(wt455, wt456);
__m512 tmp16320 = _mm512_unpackhi_ps(wt455, wt456);
__m512 tmp16321 = _mm512_unpacklo_ps(wt457, wt458);
__m512 tmp16322 = _mm512_unpackhi_ps(wt457, wt458);
__m512 tmp16323 = _mm512_unpacklo_ps(wt459, wt460);
__m512 tmp16324 = _mm512_unpackhi_ps(wt459, wt460);
__m512 tmp16325 = _mm512_unpacklo_ps(wt461, wt462);
__m512 tmp16326 = _mm512_unpackhi_ps(wt461, wt462);
__m512 tmp16327 = _mm512_unpacklo_ps(wt463, wt464);
__m512 tmp16328 = _mm512_unpackhi_ps(wt463, wt464);
__m512 tmp16329 = _mm512_unpacklo_ps(wt465, wt466);
__m512 tmp16330 = _mm512_unpackhi_ps(wt465, wt466);
__m512 tmp16331 = _mm512_unpacklo_ps(wt467, wt468);
__m512 tmp16332 = _mm512_unpackhi_ps(wt467, wt468);
__m512 tmp16333 = _mm512_shuffle_ps(tmp16317, tmp16319, 68);
__m512 tmp16334 = _mm512_shuffle_ps(tmp16317, tmp16319, 238);
__m512 tmp16335 = _mm512_shuffle_ps(tmp16318, tmp16320, 68);
__m512 tmp16336 = _mm512_shuffle_ps(tmp16318, tmp16320, 238);
__m512 tmp16337 = _mm512_shuffle_ps(tmp16321, tmp16323, 68);
__m512 tmp16338 = _mm512_shuffle_ps(tmp16321, tmp16323, 238);
__m512 tmp16339 = _mm512_shuffle_ps(tmp16322, tmp16324, 68);
__m512 tmp16340 = _mm512_shuffle_ps(tmp16322, tmp16324, 238);
__m512 tmp16341 = _mm512_shuffle_ps(tmp16325, tmp16327, 68);
__m512 tmp16342 = _mm512_shuffle_ps(tmp16325, tmp16327, 238);
__m512 tmp16343 = _mm512_shuffle_ps(tmp16326, tmp16328, 68);
__m512 tmp16344 = _mm512_shuffle_ps(tmp16326, tmp16328, 238);
__m512 tmp16345 = _mm512_shuffle_ps(tmp16329, tmp16331, 68);
__m512 tmp16346 = _mm512_shuffle_ps(tmp16329, tmp16331, 238);
__m512 tmp16347 = _mm512_shuffle_ps(tmp16330, tmp16332, 68);
__m512 tmp16348 = _mm512_shuffle_ps(tmp16330, tmp16332, 238);
__m512 tmp16349 = _mm512_shuffle_f32x4(tmp16333, tmp16337, 136);
__m512 tmp16350 = _mm512_shuffle_f32x4(tmp16333, tmp16337, 221);
__m512 tmp16351 = _mm512_shuffle_f32x4(tmp16334, tmp16338, 136);
__m512 tmp16352 = _mm512_shuffle_f32x4(tmp16334, tmp16338, 221);
__m512 tmp16353 = _mm512_shuffle_f32x4(tmp16335, tmp16339, 136);
__m512 tmp16354 = _mm512_shuffle_f32x4(tmp16335, tmp16339, 221);
__m512 tmp16355 = _mm512_shuffle_f32x4(tmp16336, tmp16340, 136);
__m512 tmp16356 = _mm512_shuffle_f32x4(tmp16336, tmp16340, 221);
__m512 tmp16357 = _mm512_shuffle_f32x4(tmp16341, tmp16345, 136);
__m512 tmp16358 = _mm512_shuffle_f32x4(tmp16341, tmp16345, 221);
__m512 tmp16359 = _mm512_shuffle_f32x4(tmp16342, tmp16346, 136);
__m512 tmp16360 = _mm512_shuffle_f32x4(tmp16342, tmp16346, 221);
__m512 tmp16361 = _mm512_shuffle_f32x4(tmp16343, tmp16347, 136);
__m512 tmp16362 = _mm512_shuffle_f32x4(tmp16343, tmp16347, 221);
__m512 tmp16363 = _mm512_shuffle_f32x4(tmp16344, tmp16348, 136);
__m512 tmp16364 = _mm512_shuffle_f32x4(tmp16344, tmp16348, 221);
wt453 = _mm512_shuffle_f32x4(tmp16349, tmp16357, 136);
wt461 = _mm512_shuffle_f32x4(tmp16349, tmp16357, 221);
wt454 = _mm512_shuffle_f32x4(tmp16351, tmp16359, 136);
wt462 = _mm512_shuffle_f32x4(tmp16351, tmp16359, 221);
wt455 = _mm512_shuffle_f32x4(tmp16353, tmp16361, 136);
wt463 = _mm512_shuffle_f32x4(tmp16353, tmp16361, 221);
wt456 = _mm512_shuffle_f32x4(tmp16355, tmp16363, 136);
wt464 = _mm512_shuffle_f32x4(tmp16355, tmp16363, 221);
wt457 = _mm512_shuffle_f32x4(tmp16350, tmp16358, 136);
wt465 = _mm512_shuffle_f32x4(tmp16350, tmp16358, 221);
wt458 = _mm512_shuffle_f32x4(tmp16352, tmp16360, 136);
wt466 = _mm512_shuffle_f32x4(tmp16352, tmp16360, 221);
wt459 = _mm512_shuffle_f32x4(tmp16354, tmp16362, 136);
wt467 = _mm512_shuffle_f32x4(tmp16354, tmp16362, 221);
wt460 = _mm512_shuffle_f32x4(tmp16356, tmp16364, 136);
wt468 = _mm512_shuffle_f32x4(tmp16356, tmp16364, 221);
wt453 = _mm512_mul_ps(wt453, postMul46);
wt454 = _mm512_mul_ps(wt454, postMul46);
wt455 = _mm512_mul_ps(wt455, postMul46);
wt456 = _mm512_mul_ps(wt456, postMul46);
wt457 = _mm512_mul_ps(wt457, postMul46);
wt458 = _mm512_mul_ps(wt458, postMul46);
wt459 = _mm512_mul_ps(wt459, postMul46);
wt460 = _mm512_mul_ps(wt460, postMul46);
wt461 = _mm512_mul_ps(wt461, postMul46);
wt462 = _mm512_mul_ps(wt462, postMul46);
wt463 = _mm512_mul_ps(wt463, postMul46);
wt464 = _mm512_mul_ps(wt464, postMul46);
wt465 = _mm512_mul_ps(wt465, postMul46);
wt466 = _mm512_mul_ps(wt466, postMul46);
wt467 = _mm512_mul_ps(wt467, postMul46);
wt468 = _mm512_mul_ps(wt468, postMul46);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(1+16*c37)+(ptrdiff_t)0, 63>>cut19, wt453);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(2+16*c37)+(ptrdiff_t)0, 63>>cut19, wt454);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(3+16*c37)+(ptrdiff_t)0, 63>>cut19, wt455);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(4+16*c37)+(ptrdiff_t)0, 63>>cut19, wt456);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(5+16*c37)+(ptrdiff_t)0, 63>>cut19, wt457);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(6+16*c37)+(ptrdiff_t)0, 63>>cut19, wt458);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(7+16*c37)+(ptrdiff_t)0, 63>>cut19, wt459);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(8+16*c37)+(ptrdiff_t)0, 63>>cut19, wt460);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(9+16*c37)+(ptrdiff_t)0, 63>>cut19, wt461);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(10+16*c37)+(ptrdiff_t)0, 63>>cut19, wt462);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(11+16*c37)+(ptrdiff_t)0, 63>>cut19, wt463);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(12+16*c37)+(ptrdiff_t)0, 63>>cut19, wt464);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(13+16*c37)+(ptrdiff_t)0, 63>>cut19, wt465);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(14+16*c37)+(ptrdiff_t)0, 63>>cut19, wt466);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(15+16*c37)+(ptrdiff_t)0, 63>>cut19, wt467);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(16+16*c37)+(ptrdiff_t)0, 63>>cut19, wt468);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(1+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt453);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(2+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt454);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(3+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt455);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(4+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt456);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(5+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt457);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(6+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt458);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(7+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt459);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(8+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt460);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(9+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt461);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(10+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt462);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(11+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt463);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(12+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt464);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(13+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt465);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(14+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt466);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(15+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt467);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(16+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt468);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(1+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt453);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(2+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt454);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(3+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt455);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(4+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt456);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(5+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt457);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(6+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt458);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(7+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt459);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(8+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt460);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(9+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt461);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(10+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt462);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(11+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt463);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(12+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt464);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(13+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt465);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(14+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt466);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(15+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt467);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(16+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt468);
}
break;
}
default: {
cut19 = 4;
__m512 sum393 = _mm512_maskz_loadu_ps(65535, biasPtr14+5120*i49+4*k140);
__m512i pmMul29 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd29 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo24 = _mm512_loadu_ps(bnPtr14+(ptrdiff_t)8*(k140+1280*i49));
__m512 masHi24 = _mm512_maskz_loadu_ps(65535, bnPtr14+(ptrdiff_t)8*(k140+1280*i49)+(ptrdiff_t)64);
__m512 postMul47 = _mm512_permutex2var_ps(masLo24, pmMul29, masHi24);
__m512 postAdd29 = _mm512_permutex2var_ps(masLo24, pmAdd29, masHi24);
sum393 = _mm512_fmadd_ps(sum393, postMul47, postAdd29);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*0+(ptrdiff_t)0, 63>>cut19, sum393);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*0+(ptrdiff_t)12288, 4032>>cut19, sum393);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*0+(ptrdiff_t)24576, 258048>>cut19, sum393);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*0+(ptrdiff_t)36864, 65535-(262143>>cut19), sum393);
ptrdiff_t c38 = 0;
for (; c38 != 32; ++c38) {
__m512 wt469 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)0);
__m512 wt470 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)2048);
__m512 wt471 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)4096);
__m512 wt472 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)6144);
__m512 wt473 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)8192);
__m512 wt474 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)10240);
__m512 wt475 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)12288);
__m512 wt476 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)14336);
__m512 wt477 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)16384);
__m512 wt478 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)18432);
__m512 wt479 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)20480);
__m512 wt480 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)22528);
__m512 wt481 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)24576);
__m512 wt482 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)26624);
__m512 wt483 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)28672);
__m512 wt484 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)30720);
__m512 tmp16365 = _mm512_unpacklo_ps(wt469, wt470);
__m512 tmp16366 = _mm512_unpackhi_ps(wt469, wt470);
__m512 tmp16367 = _mm512_unpacklo_ps(wt471, wt472);
__m512 tmp16368 = _mm512_unpackhi_ps(wt471, wt472);
__m512 tmp16369 = _mm512_unpacklo_ps(wt473, wt474);
__m512 tmp16370 = _mm512_unpackhi_ps(wt473, wt474);
__m512 tmp16371 = _mm512_unpacklo_ps(wt475, wt476);
__m512 tmp16372 = _mm512_unpackhi_ps(wt475, wt476);
__m512 tmp16373 = _mm512_unpacklo_ps(wt477, wt478);
__m512 tmp16374 = _mm512_unpackhi_ps(wt477, wt478);
__m512 tmp16375 = _mm512_unpacklo_ps(wt479, wt480);
__m512 tmp16376 = _mm512_unpackhi_ps(wt479, wt480);
__m512 tmp16377 = _mm512_unpacklo_ps(wt481, wt482);
__m512 tmp16378 = _mm512_unpackhi_ps(wt481, wt482);
__m512 tmp16379 = _mm512_unpacklo_ps(wt483, wt484);
__m512 tmp16380 = _mm512_unpackhi_ps(wt483, wt484);
__m512 tmp16381 = _mm512_shuffle_ps(tmp16365, tmp16367, 68);
__m512 tmp16382 = _mm512_shuffle_ps(tmp16365, tmp16367, 238);
__m512 tmp16383 = _mm512_shuffle_ps(tmp16366, tmp16368, 68);
__m512 tmp16384 = _mm512_shuffle_ps(tmp16366, tmp16368, 238);
__m512 tmp16385 = _mm512_shuffle_ps(tmp16369, tmp16371, 68);
__m512 tmp16386 = _mm512_shuffle_ps(tmp16369, tmp16371, 238);
__m512 tmp16387 = _mm512_shuffle_ps(tmp16370, tmp16372, 68);
__m512 tmp16388 = _mm512_shuffle_ps(tmp16370, tmp16372, 238);
__m512 tmp16389 = _mm512_shuffle_ps(tmp16373, tmp16375, 68);
__m512 tmp16390 = _mm512_shuffle_ps(tmp16373, tmp16375, 238);
__m512 tmp16391 = _mm512_shuffle_ps(tmp16374, tmp16376, 68);
__m512 tmp16392 = _mm512_shuffle_ps(tmp16374, tmp16376, 238);
__m512 tmp16393 = _mm512_shuffle_ps(tmp16377, tmp16379, 68);
__m512 tmp16394 = _mm512_shuffle_ps(tmp16377, tmp16379, 238);
__m512 tmp16395 = _mm512_shuffle_ps(tmp16378, tmp16380, 68);
__m512 tmp16396 = _mm512_shuffle_ps(tmp16378, tmp16380, 238);
__m512 tmp16397 = _mm512_shuffle_f32x4(tmp16381, tmp16385, 136);
__m512 tmp16398 = _mm512_shuffle_f32x4(tmp16381, tmp16385, 221);
__m512 tmp16399 = _mm512_shuffle_f32x4(tmp16382, tmp16386, 136);
__m512 tmp16400 = _mm512_shuffle_f32x4(tmp16382, tmp16386, 221);
__m512 tmp16401 = _mm512_shuffle_f32x4(tmp16383, tmp16387, 136);
__m512 tmp16402 = _mm512_shuffle_f32x4(tmp16383, tmp16387, 221);
__m512 tmp16403 = _mm512_shuffle_f32x4(tmp16384, tmp16388, 136);
__m512 tmp16404 = _mm512_shuffle_f32x4(tmp16384, tmp16388, 221);
__m512 tmp16405 = _mm512_shuffle_f32x4(tmp16389, tmp16393, 136);
__m512 tmp16406 = _mm512_shuffle_f32x4(tmp16389, tmp16393, 221);
__m512 tmp16407 = _mm512_shuffle_f32x4(tmp16390, tmp16394, 136);
__m512 tmp16408 = _mm512_shuffle_f32x4(tmp16390, tmp16394, 221);
__m512 tmp16409 = _mm512_shuffle_f32x4(tmp16391, tmp16395, 136);
__m512 tmp16410 = _mm512_shuffle_f32x4(tmp16391, tmp16395, 221);
__m512 tmp16411 = _mm512_shuffle_f32x4(tmp16392, tmp16396, 136);
__m512 tmp16412 = _mm512_shuffle_f32x4(tmp16392, tmp16396, 221);
wt469 = _mm512_shuffle_f32x4(tmp16397, tmp16405, 136);
wt477 = _mm512_shuffle_f32x4(tmp16397, tmp16405, 221);
wt470 = _mm512_shuffle_f32x4(tmp16399, tmp16407, 136);
wt478 = _mm512_shuffle_f32x4(tmp16399, tmp16407, 221);
wt471 = _mm512_shuffle_f32x4(tmp16401, tmp16409, 136);
wt479 = _mm512_shuffle_f32x4(tmp16401, tmp16409, 221);
wt472 = _mm512_shuffle_f32x4(tmp16403, tmp16411, 136);
wt480 = _mm512_shuffle_f32x4(tmp16403, tmp16411, 221);
wt473 = _mm512_shuffle_f32x4(tmp16398, tmp16406, 136);
wt481 = _mm512_shuffle_f32x4(tmp16398, tmp16406, 221);
wt474 = _mm512_shuffle_f32x4(tmp16400, tmp16408, 136);
wt482 = _mm512_shuffle_f32x4(tmp16400, tmp16408, 221);
wt475 = _mm512_shuffle_f32x4(tmp16402, tmp16410, 136);
wt483 = _mm512_shuffle_f32x4(tmp16402, tmp16410, 221);
wt476 = _mm512_shuffle_f32x4(tmp16404, tmp16412, 136);
wt484 = _mm512_shuffle_f32x4(tmp16404, tmp16412, 221);
wt469 = _mm512_mul_ps(wt469, postMul47);
wt470 = _mm512_mul_ps(wt470, postMul47);
wt471 = _mm512_mul_ps(wt471, postMul47);
wt472 = _mm512_mul_ps(wt472, postMul47);
wt473 = _mm512_mul_ps(wt473, postMul47);
wt474 = _mm512_mul_ps(wt474, postMul47);
wt475 = _mm512_mul_ps(wt475, postMul47);
wt476 = _mm512_mul_ps(wt476, postMul47);
wt477 = _mm512_mul_ps(wt477, postMul47);
wt478 = _mm512_mul_ps(wt478, postMul47);
wt479 = _mm512_mul_ps(wt479, postMul47);
wt480 = _mm512_mul_ps(wt480, postMul47);
wt481 = _mm512_mul_ps(wt481, postMul47);
wt482 = _mm512_mul_ps(wt482, postMul47);
wt483 = _mm512_mul_ps(wt483, postMul47);
wt484 = _mm512_mul_ps(wt484, postMul47);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(1+16*c38)+(ptrdiff_t)0, 63>>cut19, wt469);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(2+16*c38)+(ptrdiff_t)0, 63>>cut19, wt470);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(3+16*c38)+(ptrdiff_t)0, 63>>cut19, wt471);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(4+16*c38)+(ptrdiff_t)0, 63>>cut19, wt472);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(5+16*c38)+(ptrdiff_t)0, 63>>cut19, wt473);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(6+16*c38)+(ptrdiff_t)0, 63>>cut19, wt474);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(7+16*c38)+(ptrdiff_t)0, 63>>cut19, wt475);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(8+16*c38)+(ptrdiff_t)0, 63>>cut19, wt476);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(9+16*c38)+(ptrdiff_t)0, 63>>cut19, wt477);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(10+16*c38)+(ptrdiff_t)0, 63>>cut19, wt478);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(11+16*c38)+(ptrdiff_t)0, 63>>cut19, wt479);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(12+16*c38)+(ptrdiff_t)0, 63>>cut19, wt480);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(13+16*c38)+(ptrdiff_t)0, 63>>cut19, wt481);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(14+16*c38)+(ptrdiff_t)0, 63>>cut19, wt482);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(15+16*c38)+(ptrdiff_t)0, 63>>cut19, wt483);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(16+16*c38)+(ptrdiff_t)0, 63>>cut19, wt484);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(1+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt469);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(2+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt470);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(3+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt471);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(4+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt472);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(5+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt473);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(6+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt474);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(7+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt475);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(8+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt476);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(9+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt477);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(10+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt478);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(11+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt479);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(12+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt480);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(13+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt481);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(14+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt482);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(15+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt483);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(16+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt484);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(1+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt469);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(2+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt470);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(3+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt471);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(4+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt472);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(5+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt473);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(6+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt474);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(7+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt475);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(8+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt476);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(9+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt477);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(10+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt478);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(11+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt479);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(12+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt480);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(13+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt481);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(14+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt482);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(15+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt483);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(16+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt484);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(1+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt469);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(2+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt470);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(3+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt471);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(4+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt472);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(5+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt473);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(6+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt474);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(7+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt475);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(8+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt476);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(9+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt477);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(10+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt478);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(11+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt479);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(12+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt480);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(13+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt481);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(14+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt482);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(15+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt483);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(16+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt484);
}
}
}
} else if (j41 < 79) {
// This branch is taken when j41 < 79 (earlier branches are outside this view;
// the "-64" suggests j41 >= 64 here -- TODO confirm against the preceding guard).
// k142 is the first index of the 16-wide slice handled by this j41 step.
ptrdiff_t k142 = 0+16*(j41-64);
// Position 1024+k142 is split into a group-of-6 index (l61) and the offset
// inside that group (cut21). cut21 decides how the 16 lanes of each vector are
// distributed over consecutive groups by the masked stores below.
ptrdiff_t l61 = (size_t)(1024+k142)/6;
ptrdiff_t cut21 = (size_t)(1024+k142)%6;
switch (cut21) {
// cut21 == 0 or cut21 == 2: each 16-lane vector is written with three masked
// stores. The masks are 6-bit lane windows (63 = lanes 0-5, 4032 = lanes 6-11)
// shifted down by cut21, plus every remaining upper lane.
// NOTE(review): offsets look like byte offsets (4 per float lane, 24 per row of
// 6 floats, 12312 = 513*24 per l61 step); the pointer declarations are outside
// this view -- TODO confirm they are byte-addressed.
case 0:;
case 2: {
// 16 floats from biasPtr15 for indices k142..k142+15.
__m512 sum395 = _mm512_maskz_loadu_ps(65535, biasPtr15+5120*i49+4*k142);
// Permutation indices: pmMul30 picks the even elements, pmAdd30 the odd elements
// of the 32 floats formed by masLo25 (indices 0-15) and masHi25 (indices 16-31).
__m512i pmMul30 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd30 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// 32 consecutive floats from bnPtr15: two per index, starting at index k142.
__m512 masLo25 = _mm512_loadu_ps(bnPtr15+(ptrdiff_t)8*(k142+1280*i49));
__m512 masHi25 = _mm512_maskz_loadu_ps(65535, bnPtr15+(ptrdiff_t)8*(k142+1280*i49)+(ptrdiff_t)64);
// De-interleave into one multiplier and one addend per lane.
__m512 postMul49 = _mm512_permutex2var_ps(masLo25, pmMul30, masHi25);
__m512 postAdd31 = _mm512_permutex2var_ps(masLo25, pmAdd30, masHi25);
// sum395 = sum395 * postMul49 + postAdd31 (per lane).
sum395 = _mm512_fmadd_ps(sum395, postMul49, postAdd31);
// Row 0 ("24*0") of the destination receives the combined value, split over
// three consecutive l61 slots: the +12288 / +24576 offsets together with the
// first enabled lane of each mask advance the write by 12312 per slot.
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*0+(ptrdiff_t)0, 63>>cut21, sum395);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*0+(ptrdiff_t)12288, 4032>>cut21, sum395);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*0+(ptrdiff_t)24576, 65535-(4095>>cut21), sum395);
// 32 iterations; each one consumes a 16x16 tile of floats from wtPtr15
// (16 rows spaced 2048 apart, 16 consecutive floats per row at 64*c40).
ptrdiff_t c40 = 0;
for (; c40 != 32; ++c40) {
// Load the 16 rows of the tile (row r at offset 2048*r).
__m512 wt501 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)0);
__m512 wt502 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)2048);
__m512 wt503 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)4096);
__m512 wt504 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)6144);
__m512 wt505 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)8192);
__m512 wt506 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)10240);
__m512 wt507 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)12288);
__m512 wt508 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)14336);
__m512 wt509 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)16384);
__m512 wt510 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)18432);
__m512 wt511 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)20480);
__m512 wt512 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)22528);
__m512 wt513 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)24576);
__m512 wt514 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)26624);
__m512 wt515 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)28672);
__m512 wt516 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)30720);
// 16x16 transpose, stage 1: interleave 32-bit elements of adjacent row pairs.
__m512 tmp16413 = _mm512_unpacklo_ps(wt501, wt502);
__m512 tmp16414 = _mm512_unpackhi_ps(wt501, wt502);
__m512 tmp16415 = _mm512_unpacklo_ps(wt503, wt504);
__m512 tmp16416 = _mm512_unpackhi_ps(wt503, wt504);
__m512 tmp16417 = _mm512_unpacklo_ps(wt505, wt506);
__m512 tmp16418 = _mm512_unpackhi_ps(wt505, wt506);
__m512 tmp16419 = _mm512_unpacklo_ps(wt507, wt508);
__m512 tmp16420 = _mm512_unpackhi_ps(wt507, wt508);
__m512 tmp16421 = _mm512_unpacklo_ps(wt509, wt510);
__m512 tmp16422 = _mm512_unpackhi_ps(wt509, wt510);
__m512 tmp16423 = _mm512_unpacklo_ps(wt511, wt512);
__m512 tmp16424 = _mm512_unpackhi_ps(wt511, wt512);
__m512 tmp16425 = _mm512_unpacklo_ps(wt513, wt514);
__m512 tmp16426 = _mm512_unpackhi_ps(wt513, wt514);
__m512 tmp16427 = _mm512_unpacklo_ps(wt515, wt516);
__m512 tmp16428 = _mm512_unpackhi_ps(wt515, wt516);
// Stage 2: combine 64-bit pairs (68 = low halves, 238 = high halves) so each
// 128-bit lane holds one column entry from four rows.
__m512 tmp16429 = _mm512_shuffle_ps(tmp16413, tmp16415, 68);
__m512 tmp16430 = _mm512_shuffle_ps(tmp16413, tmp16415, 238);
__m512 tmp16431 = _mm512_shuffle_ps(tmp16414, tmp16416, 68);
__m512 tmp16432 = _mm512_shuffle_ps(tmp16414, tmp16416, 238);
__m512 tmp16433 = _mm512_shuffle_ps(tmp16417, tmp16419, 68);
__m512 tmp16434 = _mm512_shuffle_ps(tmp16417, tmp16419, 238);
__m512 tmp16435 = _mm512_shuffle_ps(tmp16418, tmp16420, 68);
__m512 tmp16436 = _mm512_shuffle_ps(tmp16418, tmp16420, 238);
__m512 tmp16437 = _mm512_shuffle_ps(tmp16421, tmp16423, 68);
__m512 tmp16438 = _mm512_shuffle_ps(tmp16421, tmp16423, 238);
__m512 tmp16439 = _mm512_shuffle_ps(tmp16422, tmp16424, 68);
__m512 tmp16440 = _mm512_shuffle_ps(tmp16422, tmp16424, 238);
__m512 tmp16441 = _mm512_shuffle_ps(tmp16425, tmp16427, 68);
__m512 tmp16442 = _mm512_shuffle_ps(tmp16425, tmp16427, 238);
__m512 tmp16443 = _mm512_shuffle_ps(tmp16426, tmp16428, 68);
__m512 tmp16444 = _mm512_shuffle_ps(tmp16426, tmp16428, 238);
// Stage 3: gather 128-bit lanes across registers (136 = lanes 0 and 2,
// 221 = lanes 1 and 3 of each operand).
__m512 tmp16445 = _mm512_shuffle_f32x4(tmp16429, tmp16433, 136);
__m512 tmp16446 = _mm512_shuffle_f32x4(tmp16429, tmp16433, 221);
__m512 tmp16447 = _mm512_shuffle_f32x4(tmp16430, tmp16434, 136);
__m512 tmp16448 = _mm512_shuffle_f32x4(tmp16430, tmp16434, 221);
__m512 tmp16449 = _mm512_shuffle_f32x4(tmp16431, tmp16435, 136);
__m512 tmp16450 = _mm512_shuffle_f32x4(tmp16431, tmp16435, 221);
__m512 tmp16451 = _mm512_shuffle_f32x4(tmp16432, tmp16436, 136);
__m512 tmp16452 = _mm512_shuffle_f32x4(tmp16432, tmp16436, 221);
__m512 tmp16453 = _mm512_shuffle_f32x4(tmp16437, tmp16441, 136);
__m512 tmp16454 = _mm512_shuffle_f32x4(tmp16437, tmp16441, 221);
__m512 tmp16455 = _mm512_shuffle_f32x4(tmp16438, tmp16442, 136);
__m512 tmp16456 = _mm512_shuffle_f32x4(tmp16438, tmp16442, 221);
__m512 tmp16457 = _mm512_shuffle_f32x4(tmp16439, tmp16443, 136);
__m512 tmp16458 = _mm512_shuffle_f32x4(tmp16439, tmp16443, 221);
__m512 tmp16459 = _mm512_shuffle_f32x4(tmp16440, tmp16444, 136);
__m512 tmp16460 = _mm512_shuffle_f32x4(tmp16440, tmp16444, 221);
// Stage 4: final lane gather. Afterwards wt501..wt516 hold columns 0..15 of
// the tile; lane r of each vector comes from loaded row r.
wt501 = _mm512_shuffle_f32x4(tmp16445, tmp16453, 136);
wt509 = _mm512_shuffle_f32x4(tmp16445, tmp16453, 221);
wt502 = _mm512_shuffle_f32x4(tmp16447, tmp16455, 136);
wt510 = _mm512_shuffle_f32x4(tmp16447, tmp16455, 221);
wt503 = _mm512_shuffle_f32x4(tmp16449, tmp16457, 136);
wt511 = _mm512_shuffle_f32x4(tmp16449, tmp16457, 221);
wt504 = _mm512_shuffle_f32x4(tmp16451, tmp16459, 136);
wt512 = _mm512_shuffle_f32x4(tmp16451, tmp16459, 221);
wt505 = _mm512_shuffle_f32x4(tmp16446, tmp16454, 136);
wt513 = _mm512_shuffle_f32x4(tmp16446, tmp16454, 221);
wt506 = _mm512_shuffle_f32x4(tmp16448, tmp16456, 136);
wt514 = _mm512_shuffle_f32x4(tmp16448, tmp16456, 221);
wt507 = _mm512_shuffle_f32x4(tmp16450, tmp16458, 136);
wt515 = _mm512_shuffle_f32x4(tmp16450, tmp16458, 221);
wt508 = _mm512_shuffle_f32x4(tmp16452, tmp16460, 136);
wt516 = _mm512_shuffle_f32x4(tmp16452, tmp16460, 221);
// Scale every column by the per-lane multiplier computed above.
wt501 = _mm512_mul_ps(wt501, postMul49);
wt502 = _mm512_mul_ps(wt502, postMul49);
wt503 = _mm512_mul_ps(wt503, postMul49);
wt504 = _mm512_mul_ps(wt504, postMul49);
wt505 = _mm512_mul_ps(wt505, postMul49);
wt506 = _mm512_mul_ps(wt506, postMul49);
wt507 = _mm512_mul_ps(wt507, postMul49);
wt508 = _mm512_mul_ps(wt508, postMul49);
wt509 = _mm512_mul_ps(wt509, postMul49);
wt510 = _mm512_mul_ps(wt510, postMul49);
wt511 = _mm512_mul_ps(wt511, postMul49);
wt512 = _mm512_mul_ps(wt512, postMul49);
wt513 = _mm512_mul_ps(wt513, postMul49);
wt514 = _mm512_mul_ps(wt514, postMul49);
wt515 = _mm512_mul_ps(wt515, postMul49);
wt516 = _mm512_mul_ps(wt516, postMul49);
// Store columns into rows 1+16*c40 .. 16+16*c40 (row 0 was written before the loop).
// First lane window: lanes 0..(5-cut21) into the l61 slot.
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(1+16*c40)+(ptrdiff_t)0, 63>>cut21, wt501);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(2+16*c40)+(ptrdiff_t)0, 63>>cut21, wt502);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(3+16*c40)+(ptrdiff_t)0, 63>>cut21, wt503);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(4+16*c40)+(ptrdiff_t)0, 63>>cut21, wt504);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(5+16*c40)+(ptrdiff_t)0, 63>>cut21, wt505);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(6+16*c40)+(ptrdiff_t)0, 63>>cut21, wt506);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(7+16*c40)+(ptrdiff_t)0, 63>>cut21, wt507);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(8+16*c40)+(ptrdiff_t)0, 63>>cut21, wt508);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(9+16*c40)+(ptrdiff_t)0, 63>>cut21, wt509);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(10+16*c40)+(ptrdiff_t)0, 63>>cut21, wt510);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(11+16*c40)+(ptrdiff_t)0, 63>>cut21, wt511);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(12+16*c40)+(ptrdiff_t)0, 63>>cut21, wt512);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(13+16*c40)+(ptrdiff_t)0, 63>>cut21, wt513);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(14+16*c40)+(ptrdiff_t)0, 63>>cut21, wt514);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(15+16*c40)+(ptrdiff_t)0, 63>>cut21, wt515);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(16+16*c40)+(ptrdiff_t)0, 63>>cut21, wt516);
// Second lane window: the next six lanes, landing one slot (12312) further on.
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(1+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt501);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(2+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt502);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(3+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt503);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(4+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt504);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(5+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt505);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(6+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt506);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(7+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt507);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(8+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt508);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(9+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt509);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(10+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt510);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(11+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt511);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(12+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt512);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(13+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt513);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(14+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt514);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(15+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt515);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(16+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt516);
// Third window: every lane above the first two windows (4 lanes for cut21 == 0,
// 6 lanes for cut21 == 2), landing two slots further on.
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(1+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt501);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(2+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt502);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(3+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt503);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(4+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt504);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(5+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt505);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(6+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt506);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(7+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt507);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(8+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt508);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(9+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt509);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(10+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt510);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(11+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt511);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(12+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt512);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(13+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt513);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(14+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt514);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(15+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt515);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(16+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt516);
}
break;
}
// Remaining remainder: handled as cut21 == 4 (the value is overwritten with the
// constant below). With k142 a multiple of 16, (1024+k142)%6 can only be 0, 2
// or 4, so this is the remainder-4 case. Here the 16 lanes span four
// consecutive l61 slots (2 + 6 + 6 + 2 lanes), hence four masked stores per vector.
// NOTE(review): offsets look like byte offsets (4 per float lane, 24 per row of
// 6 floats, 12312 = 513*24 per l61 step); the pointer declarations are outside
// this view -- TODO confirm they are byte-addressed.
default: {
cut21 = 4;
// 16 floats from biasPtr15 for indices k142..k142+15.
__m512 sum396 = _mm512_maskz_loadu_ps(65535, biasPtr15+5120*i49+4*k142);
// Permutation indices: pmMul31 picks the even elements, pmAdd31 the odd elements
// of the 32 floats formed by masLo26 (indices 0-15) and masHi26 (indices 16-31).
__m512i pmMul31 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd31 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// 32 consecutive floats from bnPtr15: two per index, starting at index k142.
__m512 masLo26 = _mm512_loadu_ps(bnPtr15+(ptrdiff_t)8*(k142+1280*i49));
__m512 masHi26 = _mm512_maskz_loadu_ps(65535, bnPtr15+(ptrdiff_t)8*(k142+1280*i49)+(ptrdiff_t)64);
// De-interleave into one multiplier and one addend per lane.
__m512 postMul50 = _mm512_permutex2var_ps(masLo26, pmMul31, masHi26);
__m512 postAdd32 = _mm512_permutex2var_ps(masLo26, pmAdd31, masHi26);
// sum396 = sum396 * postMul50 + postAdd32 (per lane).
sum396 = _mm512_fmadd_ps(sum396, postMul50, postAdd32);
// Row 0 ("24*0") of the destination: lanes 0-1, 2-7, 8-13 and 14-15 go to four
// consecutive l61 slots (masks 63, 4032, 258048 shifted by cut21, then the rest).
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*0+(ptrdiff_t)0, 63>>cut21, sum396);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*0+(ptrdiff_t)12288, 4032>>cut21, sum396);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*0+(ptrdiff_t)24576, 258048>>cut21, sum396);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*0+(ptrdiff_t)36864, 65535-(262143>>cut21), sum396);
// 32 iterations; each one consumes a 16x16 tile of floats from wtPtr15
// (16 rows spaced 2048 apart, 16 consecutive floats per row at 64*c41).
ptrdiff_t c41 = 0;
for (; c41 != 32; ++c41) {
// Load the 16 rows of the tile (row r at offset 2048*r).
__m512 wt517 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)0);
__m512 wt518 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)2048);
__m512 wt519 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)4096);
__m512 wt520 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)6144);
__m512 wt521 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)8192);
__m512 wt522 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)10240);
__m512 wt523 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)12288);
__m512 wt524 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)14336);
__m512 wt525 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)16384);
__m512 wt526 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)18432);
__m512 wt527 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)20480);
__m512 wt528 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)22528);
__m512 wt529 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)24576);
__m512 wt530 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)26624);
__m512 wt531 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)28672);
__m512 wt532 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)30720);
// 16x16 transpose, stage 1: interleave 32-bit elements of adjacent row pairs.
__m512 tmp16461 = _mm512_unpacklo_ps(wt517, wt518);
__m512 tmp16462 = _mm512_unpackhi_ps(wt517, wt518);
__m512 tmp16463 = _mm512_unpacklo_ps(wt519, wt520);
__m512 tmp16464 = _mm512_unpackhi_ps(wt519, wt520);
__m512 tmp16465 = _mm512_unpacklo_ps(wt521, wt522);
__m512 tmp16466 = _mm512_unpackhi_ps(wt521, wt522);
__m512 tmp16467 = _mm512_unpacklo_ps(wt523, wt524);
__m512 tmp16468 = _mm512_unpackhi_ps(wt523, wt524);
__m512 tmp16469 = _mm512_unpacklo_ps(wt525, wt526);
__m512 tmp16470 = _mm512_unpackhi_ps(wt525, wt526);
__m512 tmp16471 = _mm512_unpacklo_ps(wt527, wt528);
__m512 tmp16472 = _mm512_unpackhi_ps(wt527, wt528);
__m512 tmp16473 = _mm512_unpacklo_ps(wt529, wt530);
__m512 tmp16474 = _mm512_unpackhi_ps(wt529, wt530);
__m512 tmp16475 = _mm512_unpacklo_ps(wt531, wt532);
__m512 tmp16476 = _mm512_unpackhi_ps(wt531, wt532);
// Stage 2: combine 64-bit pairs (68 = low halves, 238 = high halves) so each
// 128-bit lane holds one column entry from four rows.
__m512 tmp16477 = _mm512_shuffle_ps(tmp16461, tmp16463, 68);
__m512 tmp16478 = _mm512_shuffle_ps(tmp16461, tmp16463, 238);
__m512 tmp16479 = _mm512_shuffle_ps(tmp16462, tmp16464, 68);
__m512 tmp16480 = _mm512_shuffle_ps(tmp16462, tmp16464, 238);
__m512 tmp16481 = _mm512_shuffle_ps(tmp16465, tmp16467, 68);
__m512 tmp16482 = _mm512_shuffle_ps(tmp16465, tmp16467, 238);
__m512 tmp16483 = _mm512_shuffle_ps(tmp16466, tmp16468, 68);
__m512 tmp16484 = _mm512_shuffle_ps(tmp16466, tmp16468, 238);
__m512 tmp16485 = _mm512_shuffle_ps(tmp16469, tmp16471, 68);
__m512 tmp16486 = _mm512_shuffle_ps(tmp16469, tmp16471, 238);
__m512 tmp16487 = _mm512_shuffle_ps(tmp16470, tmp16472, 68);
__m512 tmp16488 = _mm512_shuffle_ps(tmp16470, tmp16472, 238);
__m512 tmp16489 = _mm512_shuffle_ps(tmp16473, tmp16475, 68);
__m512 tmp16490 = _mm512_shuffle_ps(tmp16473, tmp16475, 238);
__m512 tmp16491 = _mm512_shuffle_ps(tmp16474, tmp16476, 68);
__m512 tmp16492 = _mm512_shuffle_ps(tmp16474, tmp16476, 238);
// Stage 3: gather 128-bit lanes across registers (136 = lanes 0 and 2,
// 221 = lanes 1 and 3 of each operand).
__m512 tmp16493 = _mm512_shuffle_f32x4(tmp16477, tmp16481, 136);
__m512 tmp16494 = _mm512_shuffle_f32x4(tmp16477, tmp16481, 221);
__m512 tmp16495 = _mm512_shuffle_f32x4(tmp16478, tmp16482, 136);
__m512 tmp16496 = _mm512_shuffle_f32x4(tmp16478, tmp16482, 221);
__m512 tmp16497 = _mm512_shuffle_f32x4(tmp16479, tmp16483, 136);
__m512 tmp16498 = _mm512_shuffle_f32x4(tmp16479, tmp16483, 221);
__m512 tmp16499 = _mm512_shuffle_f32x4(tmp16480, tmp16484, 136);
__m512 tmp16500 = _mm512_shuffle_f32x4(tmp16480, tmp16484, 221);
__m512 tmp16501 = _mm512_shuffle_f32x4(tmp16485, tmp16489, 136);
__m512 tmp16502 = _mm512_shuffle_f32x4(tmp16485, tmp16489, 221);
__m512 tmp16503 = _mm512_shuffle_f32x4(tmp16486, tmp16490, 136);
__m512 tmp16504 = _mm512_shuffle_f32x4(tmp16486, tmp16490, 221);
__m512 tmp16505 = _mm512_shuffle_f32x4(tmp16487, tmp16491, 136);
__m512 tmp16506 = _mm512_shuffle_f32x4(tmp16487, tmp16491, 221);
__m512 tmp16507 = _mm512_shuffle_f32x4(tmp16488, tmp16492, 136);
__m512 tmp16508 = _mm512_shuffle_f32x4(tmp16488, tmp16492, 221);
// Stage 4: final lane gather. Afterwards wt517..wt532 hold columns 0..15 of
// the tile; lane r of each vector comes from loaded row r.
wt517 = _mm512_shuffle_f32x4(tmp16493, tmp16501, 136);
wt525 = _mm512_shuffle_f32x4(tmp16493, tmp16501, 221);
wt518 = _mm512_shuffle_f32x4(tmp16495, tmp16503, 136);
wt526 = _mm512_shuffle_f32x4(tmp16495, tmp16503, 221);
wt519 = _mm512_shuffle_f32x4(tmp16497, tmp16505, 136);
wt527 = _mm512_shuffle_f32x4(tmp16497, tmp16505, 221);
wt520 = _mm512_shuffle_f32x4(tmp16499, tmp16507, 136);
wt528 = _mm512_shuffle_f32x4(tmp16499, tmp16507, 221);
wt521 = _mm512_shuffle_f32x4(tmp16494, tmp16502, 136);
wt529 = _mm512_shuffle_f32x4(tmp16494, tmp16502, 221);
wt522 = _mm512_shuffle_f32x4(tmp16496, tmp16504, 136);
wt530 = _mm512_shuffle_f32x4(tmp16496, tmp16504, 221);
wt523 = _mm512_shuffle_f32x4(tmp16498, tmp16506, 136);
wt531 = _mm512_shuffle_f32x4(tmp16498, tmp16506, 221);
wt524 = _mm512_shuffle_f32x4(tmp16500, tmp16508, 136);
wt532 = _mm512_shuffle_f32x4(tmp16500, tmp16508, 221);
// Scale every column by the per-lane multiplier computed above.
wt517 = _mm512_mul_ps(wt517, postMul50);
wt518 = _mm512_mul_ps(wt518, postMul50);
wt519 = _mm512_mul_ps(wt519, postMul50);
wt520 = _mm512_mul_ps(wt520, postMul50);
wt521 = _mm512_mul_ps(wt521, postMul50);
wt522 = _mm512_mul_ps(wt522, postMul50);
wt523 = _mm512_mul_ps(wt523, postMul50);
wt524 = _mm512_mul_ps(wt524, postMul50);
wt525 = _mm512_mul_ps(wt525, postMul50);
wt526 = _mm512_mul_ps(wt526, postMul50);
wt527 = _mm512_mul_ps(wt527, postMul50);
wt528 = _mm512_mul_ps(wt528, postMul50);
wt529 = _mm512_mul_ps(wt529, postMul50);
wt530 = _mm512_mul_ps(wt530, postMul50);
wt531 = _mm512_mul_ps(wt531, postMul50);
wt532 = _mm512_mul_ps(wt532, postMul50);
// Store columns into rows 1+16*c41 .. 16+16*c41 (row 0 was written before the loop).
// First window: lanes 0-1 into the l61 slot.
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(1+16*c41)+(ptrdiff_t)0, 63>>cut21, wt517);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(2+16*c41)+(ptrdiff_t)0, 63>>cut21, wt518);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(3+16*c41)+(ptrdiff_t)0, 63>>cut21, wt519);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(4+16*c41)+(ptrdiff_t)0, 63>>cut21, wt520);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(5+16*c41)+(ptrdiff_t)0, 63>>cut21, wt521);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(6+16*c41)+(ptrdiff_t)0, 63>>cut21, wt522);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(7+16*c41)+(ptrdiff_t)0, 63>>cut21, wt523);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(8+16*c41)+(ptrdiff_t)0, 63>>cut21, wt524);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(9+16*c41)+(ptrdiff_t)0, 63>>cut21, wt525);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(10+16*c41)+(ptrdiff_t)0, 63>>cut21, wt526);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(11+16*c41)+(ptrdiff_t)0, 63>>cut21, wt527);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(12+16*c41)+(ptrdiff_t)0, 63>>cut21, wt528);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(13+16*c41)+(ptrdiff_t)0, 63>>cut21, wt529);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(14+16*c41)+(ptrdiff_t)0, 63>>cut21, wt530);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(15+16*c41)+(ptrdiff_t)0, 63>>cut21, wt531);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(16+16*c41)+(ptrdiff_t)0, 63>>cut21, wt532);
// Second window: lanes 2-7, landing one slot (12312) further on.
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(1+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt517);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(2+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt518);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(3+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt519);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(4+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt520);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(5+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt521);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(6+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt522);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(7+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt523);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(8+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt524);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(9+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt525);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(10+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt526);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(11+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt527);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(12+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt528);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(13+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt529);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(14+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt530);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(15+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt531);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(16+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt532);
// Third window: lanes 8-13, landing two slots further on.
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(1+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt517);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(2+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt518);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(3+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt519);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(4+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt520);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(5+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt521);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(6+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt522);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(7+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt523);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(8+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt524);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(9+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt525);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(10+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt526);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(11+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt527);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(12+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt528);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(13+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt529);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(14+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt530);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(15+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt531);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(16+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt532);
// Fourth window: the remaining lanes 14-15, landing three slots further on.
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(1+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt517);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(2+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt518);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(3+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt519);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(4+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt520);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(5+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt521);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(6+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt522);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(7+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt523);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(8+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt524);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(9+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt525);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(10+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt526);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(11+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt527);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(12+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt528);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(13+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt529);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(14+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt530);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(15+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt531);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(16+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt532);
}
}
}
} else {
ptrdiff_t k141 = 240;
ptrdiff_t l60 = (size_t)(1024+k141)/6;
ptrdiff_t cut20 = (size_t)(1024+k141)%6;
__m512 sum394 = _mm512_maskz_loadu_ps(65535, biasPtr15+5120*i49+4*k141);
__m512i pmMul32 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd32 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo27 = _mm512_loadu_ps(bnPtr15+(ptrdiff_t)8*(k141+1280*i49));
__m512 masHi27 = _mm512_maskz_loadu_ps(65535, bnPtr15+(ptrdiff_t)8*(k141+1280*i49)+(ptrdiff_t)64);
__m512 postMul48 = _mm512_permutex2var_ps(masLo27, pmMul32, masHi27);
__m512 postAdd30 = _mm512_permutex2var_ps(masLo27, pmAdd32, masHi27);
sum394 = _mm512_fmadd_ps(sum394, postMul48, postAdd30);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*0+(ptrdiff_t)0, 63>>cut20, sum394);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*0+(ptrdiff_t)12288, 4032>>cut20, sum394);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*0+(ptrdiff_t)24576, 258048>>cut20, sum394);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*0+(ptrdiff_t)36864, 65535-(262143>>cut20), sum394);
ptrdiff_t c39 = 0;
for (; c39 != 32; ++c39) {
__m512 wt485 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)0);
__m512 wt486 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)2048);
__m512 wt487 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)4096);
__m512 wt488 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)6144);
__m512 wt489 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)8192);
__m512 wt490 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)10240);
__m512 wt491 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)12288);
__m512 wt492 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)14336);
__m512 wt493 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)16384);
__m512 wt494 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)18432);
__m512 wt495 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)20480);
__m512 wt496 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)22528);
__m512 wt497 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)24576);
__m512 wt498 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)26624);
__m512 wt499 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)28672);
__m512 wt500 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)30720);
__m512 tmp16509 = _mm512_unpacklo_ps(wt485, wt486);
__m512 tmp16510 = _mm512_unpackhi_ps(wt485, wt486);
__m512 tmp16511 = _mm512_unpacklo_ps(wt487, wt488);
__m512 tmp16512 = _mm512_unpackhi_ps(wt487, wt488);
__m512 tmp16513 = _mm512_unpacklo_ps(wt489, wt490);
__m512 tmp16514 = _mm512_unpackhi_ps(wt489, wt490);
__m512 tmp16515 = _mm512_unpacklo_ps(wt491, wt492);
__m512 tmp16516 = _mm512_unpackhi_ps(wt491, wt492);
__m512 tmp16517 = _mm512_unpacklo_ps(wt493, wt494);
__m512 tmp16518 = _mm512_unpackhi_ps(wt493, wt494);
__m512 tmp16519 = _mm512_unpacklo_ps(wt495, wt496);
__m512 tmp16520 = _mm512_unpackhi_ps(wt495, wt496);
__m512 tmp16521 = _mm512_unpacklo_ps(wt497, wt498);
__m512 tmp16522 = _mm512_unpackhi_ps(wt497, wt498);
__m512 tmp16523 = _mm512_unpacklo_ps(wt499, wt500);
__m512 tmp16524 = _mm512_unpackhi_ps(wt499, wt500);
__m512 tmp16525 = _mm512_shuffle_ps(tmp16509, tmp16511, 68);
__m512 tmp16526 = _mm512_shuffle_ps(tmp16509, tmp16511, 238);
__m512 tmp16527 = _mm512_shuffle_ps(tmp16510, tmp16512, 68);
__m512 tmp16528 = _mm512_shuffle_ps(tmp16510, tmp16512, 238);
__m512 tmp16529 = _mm512_shuffle_ps(tmp16513, tmp16515, 68);
__m512 tmp16530 = _mm512_shuffle_ps(tmp16513, tmp16515, 238);
__m512 tmp16531 = _mm512_shuffle_ps(tmp16514, tmp16516, 68);
__m512 tmp16532 = _mm512_shuffle_ps(tmp16514, tmp16516, 238);
__m512 tmp16533 = _mm512_shuffle_ps(tmp16517, tmp16519, 68);
__m512 tmp16534 = _mm512_shuffle_ps(tmp16517, tmp16519, 238);
__m512 tmp16535 = _mm512_shuffle_ps(tmp16518, tmp16520, 68);
__m512 tmp16536 = _mm512_shuffle_ps(tmp16518, tmp16520, 238);
__m512 tmp16537 = _mm512_shuffle_ps(tmp16521, tmp16523, 68);
__m512 tmp16538 = _mm512_shuffle_ps(tmp16521, tmp16523, 238);
__m512 tmp16539 = _mm512_shuffle_ps(tmp16522, tmp16524, 68);
__m512 tmp16540 = _mm512_shuffle_ps(tmp16522, tmp16524, 238);
__m512 tmp16541 = _mm512_shuffle_f32x4(tmp16525, tmp16529, 136);
__m512 tmp16542 = _mm512_shuffle_f32x4(tmp16525, tmp16529, 221);
__m512 tmp16543 = _mm512_shuffle_f32x4(tmp16526, tmp16530, 136);
__m512 tmp16544 = _mm512_shuffle_f32x4(tmp16526, tmp16530, 221);
__m512 tmp16545 = _mm512_shuffle_f32x4(tmp16527, tmp16531, 136);
__m512 tmp16546 = _mm512_shuffle_f32x4(tmp16527, tmp16531, 221);
__m512 tmp16547 = _mm512_shuffle_f32x4(tmp16528, tmp16532, 136);
__m512 tmp16548 = _mm512_shuffle_f32x4(tmp16528, tmp16532, 221);
__m512 tmp16549 = _mm512_shuffle_f32x4(tmp16533, tmp16537, 136);
__m512 tmp16550 = _mm512_shuffle_f32x4(tmp16533, tmp16537, 221);
__m512 tmp16551 = _mm512_shuffle_f32x4(tmp16534, tmp16538, 136);
__m512 tmp16552 = _mm512_shuffle_f32x4(tmp16534, tmp16538, 221);
__m512 tmp16553 = _mm512_shuffle_f32x4(tmp16535, tmp16539, 136);
__m512 tmp16554 = _mm512_shuffle_f32x4(tmp16535, tmp16539, 221);
__m512 tmp16555 = _mm512_shuffle_f32x4(tmp16536, tmp16540, 136);
__m512 tmp16556 = _mm512_shuffle_f32x4(tmp16536, tmp16540, 221);
wt485 = _mm512_shuffle_f32x4(tmp16541, tmp16549, 136);
wt493 = _mm512_shuffle_f32x4(tmp16541, tmp16549, 221);
wt486 = _mm512_shuffle_f32x4(tmp16543, tmp16551, 136);
wt494 = _mm512_shuffle_f32x4(tmp16543, tmp16551, 221);
wt487 = _mm512_shuffle_f32x4(tmp16545, tmp16553, 136);
wt495 = _mm512_shuffle_f32x4(tmp16545, tmp16553, 221);
wt488 = _mm512_shuffle_f32x4(tmp16547, tmp16555, 136);
wt496 = _mm512_shuffle_f32x4(tmp16547, tmp16555, 221);
wt489 = _mm512_shuffle_f32x4(tmp16542, tmp16550, 136);
wt497 = _mm512_shuffle_f32x4(tmp16542, tmp16550, 221);
wt490 = _mm512_shuffle_f32x4(tmp16544, tmp16552, 136);
wt498 = _mm512_shuffle_f32x4(tmp16544, tmp16552, 221);
wt491 = _mm512_shuffle_f32x4(tmp16546, tmp16554, 136);
wt499 = _mm512_shuffle_f32x4(tmp16546, tmp16554, 221);
wt492 = _mm512_shuffle_f32x4(tmp16548, tmp16556, 136);
wt500 = _mm512_shuffle_f32x4(tmp16548, tmp16556, 221);
wt485 = _mm512_mul_ps(wt485, postMul48);
wt486 = _mm512_mul_ps(wt486, postMul48);
wt487 = _mm512_mul_ps(wt487, postMul48);
wt488 = _mm512_mul_ps(wt488, postMul48);
wt489 = _mm512_mul_ps(wt489, postMul48);
wt490 = _mm512_mul_ps(wt490, postMul48);
wt491 = _mm512_mul_ps(wt491, postMul48);
wt492 = _mm512_mul_ps(wt492, postMul48);
wt493 = _mm512_mul_ps(wt493, postMul48);
wt494 = _mm512_mul_ps(wt494, postMul48);
wt495 = _mm512_mul_ps(wt495, postMul48);
wt496 = _mm512_mul_ps(wt496, postMul48);
wt497 = _mm512_mul_ps(wt497, postMul48);
wt498 = _mm512_mul_ps(wt498, postMul48);
wt499 = _mm512_mul_ps(wt499, postMul48);
wt500 = _mm512_mul_ps(wt500, postMul48);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(1+16*c39)+(ptrdiff_t)0, 63>>cut20, wt485);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(2+16*c39)+(ptrdiff_t)0, 63>>cut20, wt486);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(3+16*c39)+(ptrdiff_t)0, 63>>cut20, wt487);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(4+16*c39)+(ptrdiff_t)0, 63>>cut20, wt488);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(5+16*c39)+(ptrdiff_t)0, 63>>cut20, wt489);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(6+16*c39)+(ptrdiff_t)0, 63>>cut20, wt490);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(7+16*c39)+(ptrdiff_t)0, 63>>cut20, wt491);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(8+16*c39)+(ptrdiff_t)0, 63>>cut20, wt492);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(9+16*c39)+(ptrdiff_t)0, 63>>cut20, wt493);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(10+16*c39)+(ptrdiff_t)0, 63>>cut20, wt494);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(11+16*c39)+(ptrdiff_t)0, 63>>cut20, wt495);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(12+16*c39)+(ptrdiff_t)0, 63>>cut20, wt496);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(13+16*c39)+(ptrdiff_t)0, 63>>cut20, wt497);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(14+16*c39)+(ptrdiff_t)0, 63>>cut20, wt498);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(15+16*c39)+(ptrdiff_t)0, 63>>cut20, wt499);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(16+16*c39)+(ptrdiff_t)0, 63>>cut20, wt500);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(1+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt485);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(2+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt486);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(3+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt487);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(4+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt488);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(5+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt489);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(6+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt490);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(7+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt491);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(8+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt492);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(9+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt493);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(10+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt494);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(11+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt495);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(12+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt496);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(13+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt497);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(14+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt498);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(15+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt499);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(16+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt500);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(1+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt485);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(2+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt486);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(3+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt487);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(4+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt488);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(5+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt489);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(6+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt490);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(7+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt491);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(8+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt492);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(9+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt493);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(10+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt494);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(11+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt495);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(12+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt496);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(13+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt497);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(14+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt498);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(15+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt499);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(16+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt500);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(1+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt485);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(2+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt486);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(3+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt487);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(4+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt488);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(5+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt489);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(6+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt490);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(7+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt491);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(8+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt492);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(9+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt493);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(10+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt494);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(11+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt495);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(12+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt496);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(13+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt497);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(14+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt498);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(15+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt499);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(16+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt500);
}
}
}
}
}

/*
 * Hands ResNet50OneArrangeWts7Callee1 to the thread team.
 *
 * team53    - team passed straight to ResNet50ThreaderDo1.
 * tensors79 - tensor table, forwarded untouched as the task payload (any1).
 *
 * The task describes a 3-dimensional index space with extents 80 x 1 x 1.
 */
static void ResNet50OneArrangeWts7(ResNet50ThreaderTeam1* team53, char** tensors79) {
    ResNet50ThreaderTask1 job = {
        .callee1 = ResNet50OneArrangeWts7Callee1,
        .any1 = tensors79,
        .nd1 = 3,
        .hull1 = {80, 1, 1},
    };
    ResNet50ThreaderDo1(team53, &job);
}

/*
 * Data-arrangement worker for Apply stage 7.
 *
 * task84->any1 is the tensor table: entry 0 is the source, entry 1 the
 * arranged destination.
 * pt47[0] picks a slice of 128 consecutive channels (channel pitch in the
 * source is 3136 bytes); pt47[1] picks a band of rows.
 *
 * For every channel of the slice, each selected source row is read as two
 * masked vectors (15 floats at +0, 11 floats at +64), the even-indexed
 * floats of that 32-lane pair are gathered into one vector, and the vector
 * is written to the destination.
 *
 *   band < 3  : 4 rows, starting at row 8*band, taking every second
 *               112-byte row (offsets 0, 224, 448, 672);
 *               destination pitch is 256 bytes per channel.
 *   otherwise : 2 rows starting at row 24 (offsets 0, 224);
 *               destination pitch is 128 bytes per channel and the
 *               band slot is fixed at 3.
 */
static void ResNet50OneArrangeDats7Callee1(ResNet50ThreaderTask1* task84, int64_t* pt47) {
    char** bufs = task84->any1;
    const ptrdiff_t chanSlice = pt47[0];
    const ptrdiff_t band = pt47[1];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];

    /* Selects lanes 0, 2, 4, ..., 30 out of the concatenation (lo, hi). */
    const __m512i evenLanes = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);

    const ptrdiff_t chanBegin = 128*chanSlice;
    const ptrdiff_t chanEnd = chanBegin+128;

    if (band < 3) {
        /* Unsigned multiply mirrors the wrap-around arithmetic of the generated form. */
        const ptrdiff_t firstRow = (ptrdiff_t)((size_t)band*8);
        for (ptrdiff_t ch = chanBegin; ch < chanEnd; ++ch) {
            const char* in = src+112*firstRow+3136*ch;
            char* out = dst+131072*band+256*ch;
            __m512 packed[4];
            /* Read and compact all four rows first, then write them out. */
            for (int r = 0; r < 4; ++r) {
                const __m512 lo = _mm512_maskz_loadu_ps(32767, in+224*r);
                const __m512 hi = _mm512_maskz_loadu_ps(2047, in+224*r+64);
                packed[r] = _mm512_permutex2var_ps(lo, evenLanes, hi);
            }
            for (int r = 0; r < 4; ++r) {
                _mm512_storeu_ps(out+64*r, packed[r]);
            }
        }
        return;
    }

    /* Last band: only two rows remain, beginning at row 24. */
    for (ptrdiff_t ch = chanBegin; ch < chanEnd; ++ch) {
        const char* in = src+112*(ptrdiff_t)24+3136*ch;
        char* out = dst+131072*(ptrdiff_t)3+128*ch;
        __m512 packed[2];
        for (int r = 0; r < 2; ++r) {
            const __m512 lo = _mm512_maskz_loadu_ps(32767, in+224*r);
            const __m512 hi = _mm512_maskz_loadu_ps(2047, in+224*r+64);
            packed[r] = _mm512_permutex2var_ps(lo, evenLanes, hi);
        }
        for (int r = 0; r < 2; ++r) {
            _mm512_storeu_ps(out+64*r, packed[r]);
        }
    }
}

/*
 * Hands ResNet50OneArrangeDats7Callee1 to the thread team.
 *
 * team54    - team passed straight to ResNet50ThreaderDo1.
 * tensors81 - tensor table, forwarded untouched as the task payload (any1).
 *
 * The task describes a 4-dimensional index space with extents 4 x 4 x 1 x 1.
 * The callee reads coordinate 0 as its 128-channel slice and coordinate 1
 * as its row band.
 */
static void ResNet50OneArrangeDats7(ResNet50ThreaderTeam1* team54, char** tensors81) {
    ResNet50ThreaderTask1 job = {
        .callee1 = ResNet50OneArrangeDats7Callee1,
        .any1 = tensors81,
        .nd1 = 4,
        .hull1 = {4, 4, 1, 1},
    };
    ResNet50ThreaderDo1(team54, &job);
}

// Compute worker for Apply stage 7: combines arranged weights with arranged
// data using broadcast + FMA and writes 14-float output rows.
//
// task86->any1 points to a pair; pair[0] is the tensor table:
//   [0] arranged weights, [1] arranged data, [2] output.
// pt48[0] (w64) selects one output-channel group, pt48[1] (d17) selects one
// band of output rows.
//   - Groups other than 213 hold 6 channels (24 bytes of weights per
//     reduction step); group 213 holds 2 channels (8 bytes per step).
//   - Bands below 3 cover 4 output rows starting at row 4*band; any band
//     >= 3 is handled as band 3 and covers 2 rows starting at row 12.
// Every call processes exactly one (group, band) combination and returns.
// The accumulators are seeded from the leading floats of the group's
// 12312-byte weight record (presumably the bias terms - confirm against the
// arrange-weights step), followed by 512 reduction steps.
static void ResNet50OneApply7Callee1(ResNet50ThreaderTask1* task86, int64_t* pt48) {
void** pair22 = task86->any1;
char** tensors84 = pair22[0];
// e24 and g28 are fixed at 0, so the offsets below all evaluate to zero.
ptrdiff_t e24 = 0;
ptrdiff_t g28 = 0;
ptrdiff_t d17 = pt48[1];
ptrdiff_t w64 = pt48[0];
char*restrict arrangedWts7 = tensors84[0]+4280320*e24+(ptrdiff_t)2626560*1*g28;
char*restrict arrangedDats7 = tensors84[1]+748160*e24+(ptrdiff_t)458752*1*g28;
char*restrict datPtr26 = tensors84[2]+(ptrdiff_t)1064960*1*g28;
ptrdiff_t ii21 = 1;
for (ptrdiff_t i51 = 0; i51 < ii21; ++i51) {
ptrdiff_t j43 = 1*d17;
// jj43 equals j43, so the "j43 >= jj43" checks further down always return.
ptrdiff_t jj43 = j43+0;
if (j43 < 3) {
// ---- 4-row bands: first output row of this band is 4*j43 ----
ptrdiff_t h51 = 0+((size_t)j43-0)/1*4;
switch (((size_t)j43-0)%1) {
default: {
wrap6:;
ptrdiff_t k145 = 1*w64;
// kk46 equals the starting k145, so the loop below runs at most once:
// its final statement returns on the first pass.
ptrdiff_t kk46 = k145+0;
for (; k145 != 213; ++k145) {
// 6-channel group x 4 rows: 24 accumulators.
// With s38 == -1 the offsets 24*s38+24.. resolve to bytes 0..20 of the record.
ptrdiff_t s38 = -1;
__m512 sum397 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)24));
__m512 sum401 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)28));
__m512 sum405 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)32));
__m512 sum409 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)36));
__m512 sum413 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)40));
__m512 sum417 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)44));
// Each channel's seed value is shared by its four row accumulators.
__m512 sum398 = sum397;
__m512 sum399 = sum397;
__m512 sum400 = sum397;
__m512 sum402 = sum401;
__m512 sum403 = sum401;
__m512 sum404 = sum401;
__m512 sum406 = sum405;
__m512 sum407 = sum405;
__m512 sum408 = sum405;
__m512 sum410 = sum409;
__m512 sum411 = sum409;
__m512 sum412 = sum409;
__m512 sum414 = sum413;
__m512 sum415 = sum413;
__m512 sum416 = sum413;
__m512 sum418 = sum417;
__m512 sum419 = sum417;
__m512 sum420 = sum417;
// Reduction: per step, 4 data vectors (256 bytes) and 6 broadcast weights.
for (s38 = 0; s38 < 512; ++s38) {
__m512 dat2185 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+256*s38+(ptrdiff_t)0);
__m512 dat2186 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+256*s38+(ptrdiff_t)64);
__m512 dat2187 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+256*s38+(ptrdiff_t)128);
__m512 dat2188 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+256*s38+(ptrdiff_t)192);
__m512 wt533 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)24));
sum397 = _mm512_fmadd_ps(wt533, dat2185, sum397);
sum398 = _mm512_fmadd_ps(wt533, dat2186, sum398);
sum399 = _mm512_fmadd_ps(wt533, dat2187, sum399);
sum400 = _mm512_fmadd_ps(wt533, dat2188, sum400);
__m512 wt534 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)28));
sum401 = _mm512_fmadd_ps(wt534, dat2185, sum401);
sum402 = _mm512_fmadd_ps(wt534, dat2186, sum402);
sum403 = _mm512_fmadd_ps(wt534, dat2187, sum403);
sum404 = _mm512_fmadd_ps(wt534, dat2188, sum404);
__m512 wt535 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)32));
sum405 = _mm512_fmadd_ps(wt535, dat2185, sum405);
sum406 = _mm512_fmadd_ps(wt535, dat2186, sum406);
sum407 = _mm512_fmadd_ps(wt535, dat2187, sum407);
sum408 = _mm512_fmadd_ps(wt535, dat2188, sum408);
__m512 wt536 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)36));
sum409 = _mm512_fmadd_ps(wt536, dat2185, sum409);
sum410 = _mm512_fmadd_ps(wt536, dat2186, sum410);
sum411 = _mm512_fmadd_ps(wt536, dat2187, sum411);
sum412 = _mm512_fmadd_ps(wt536, dat2188, sum412);
__m512 wt537 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)40));
sum413 = _mm512_fmadd_ps(wt537, dat2185, sum413);
sum414 = _mm512_fmadd_ps(wt537, dat2186, sum414);
sum415 = _mm512_fmadd_ps(wt537, dat2187, sum415);
sum416 = _mm512_fmadd_ps(wt537, dat2188, sum416);
__m512 wt538 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)44));
sum417 = _mm512_fmadd_ps(wt538, dat2185, sum417);
sum418 = _mm512_fmadd_ps(wt538, dat2186, sum418);
sum419 = _mm512_fmadd_ps(wt538, dat2187, sum419);
sum420 = _mm512_fmadd_ps(wt538, dat2188, sum420);
}
// Write-back: mask 16383 keeps the low 14 lanes; rows are 56 bytes apart,
// channels 832 bytes apart, channel groups 4992 bytes apart.
__m512 dat2189 = sum397;
__m512 dat2190 = sum398;
__m512 dat2191 = sum399;
__m512 dat2192 = sum400;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)0, 16383, dat2189);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)56, 16383, dat2190);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)112, 16383, dat2191);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)168, 16383, dat2192);
__m512 dat2193 = sum401;
__m512 dat2194 = sum402;
__m512 dat2195 = sum403;
__m512 dat2196 = sum404;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)832, 16383, dat2193);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)888, 16383, dat2194);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)944, 16383, dat2195);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)1000, 16383, dat2196);
__m512 dat2197 = sum405;
__m512 dat2198 = sum406;
__m512 dat2199 = sum407;
__m512 dat2200 = sum408;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)1664, 16383, dat2197);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)1720, 16383, dat2198);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)1776, 16383, dat2199);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)1832, 16383, dat2200);
__m512 dat2201 = sum409;
__m512 dat2202 = sum410;
__m512 dat2203 = sum411;
__m512 dat2204 = sum412;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)2496, 16383, dat2201);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)2552, 16383, dat2202);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)2608, 16383, dat2203);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)2664, 16383, dat2204);
__m512 dat2205 = sum413;
__m512 dat2206 = sum414;
__m512 dat2207 = sum415;
__m512 dat2208 = sum416;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)3328, 16383, dat2205);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)3384, 16383, dat2206);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)3440, 16383, dat2207);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)3496, 16383, dat2208);
__m512 dat2209 = sum417;
__m512 dat2210 = sum418;
__m512 dat2211 = sum419;
__m512 dat2212 = sum420;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)4160, 16383, dat2209);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)4216, 16383, dat2210);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)4272, 16383, dat2211);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)4328, 16383, dat2212);
// Always true on the first pass (kk46 is the starting k145): one group per call.
if (k145 >= kk46) return;
}
// Reached only when k145 == 213: the trailing 2-channel group x 4 rows.
// Weight stride per reduction step is 8 bytes here instead of 24.
ptrdiff_t s39 = -1;
__m512 sum421 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+8*s39+(ptrdiff_t)8));
__m512 sum425 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+8*s39+(ptrdiff_t)12));
__m512 sum422 = sum421;
__m512 sum423 = sum421;
__m512 sum424 = sum421;
__m512 sum426 = sum425;
__m512 sum427 = sum425;
__m512 sum428 = sum425;
for (s39 = 0; s39 < 512; ++s39) {
__m512 dat2213 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+256*s39+(ptrdiff_t)0);
__m512 dat2214 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+256*s39+(ptrdiff_t)64);
__m512 dat2215 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+256*s39+(ptrdiff_t)128);
__m512 dat2216 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+256*s39+(ptrdiff_t)192);
__m512 wt539 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+8*s39+(ptrdiff_t)8));
sum421 = _mm512_fmadd_ps(wt539, dat2213, sum421);
sum422 = _mm512_fmadd_ps(wt539, dat2214, sum422);
sum423 = _mm512_fmadd_ps(wt539, dat2215, sum423);
sum424 = _mm512_fmadd_ps(wt539, dat2216, sum424);
__m512 wt540 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+8*s39+(ptrdiff_t)12));
sum425 = _mm512_fmadd_ps(wt540, dat2213, sum425);
sum426 = _mm512_fmadd_ps(wt540, dat2214, sum426);
sum427 = _mm512_fmadd_ps(wt540, dat2215, sum427);
sum428 = _mm512_fmadd_ps(wt540, dat2216, sum428);
}
__m512 dat2217 = sum421;
__m512 dat2218 = sum422;
__m512 dat2219 = sum423;
__m512 dat2220 = sum424;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)0, 16383, dat2217);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)56, 16383, dat2218);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)112, 16383, dat2219);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)168, 16383, dat2220);
__m512 dat2221 = sum425;
__m512 dat2222 = sum426;
__m512 dat2223 = sum427;
__m512 dat2224 = sum428;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)832, 16383, dat2221);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)888, 16383, dat2222);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)944, 16383, dat2223);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)1000, 16383, dat2224);
// Always true (jj43 == j43), so the wrap-around statements below never run.
if (j43 >= jj43) return;
if (j43 >= 2) break;
++j43;
h51 += 4;
goto wrap6;
}
}
j43 = 3;
}
// ---- final band: 2 output rows starting at row 12 ----
ptrdiff_t h52 = 12;
switch (j43) {
default: {
j43 = 3;
ptrdiff_t k146 = 1*w64;
// Same single-pass pattern as above: kk47 equals the starting k146.
ptrdiff_t kk47 = k146+0;
for (; k146 != 213; ++k146) {
// 6-channel group x 2 rows: 12 accumulators.
ptrdiff_t s40 = -1;
__m512 sum429 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)24));
__m512 sum431 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)28));
__m512 sum433 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)32));
__m512 sum435 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)36));
__m512 sum437 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)40));
__m512 sum439 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)44));
__m512 sum430 = sum429;
__m512 sum432 = sum431;
__m512 sum434 = sum433;
__m512 sum436 = sum435;
__m512 sum438 = sum437;
__m512 sum440 = sum439;
// Arranged data for this band uses 128 bytes (2 vectors) per reduction step.
for (s40 = 0; s40 < 512; ++s40) {
__m512 dat2225 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+128*s40+(ptrdiff_t)0);
__m512 dat2226 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+128*s40+(ptrdiff_t)64);
__m512 wt541 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)24));
sum429 = _mm512_fmadd_ps(wt541, dat2225, sum429);
sum430 = _mm512_fmadd_ps(wt541, dat2226, sum430);
__m512 wt542 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)28));
sum431 = _mm512_fmadd_ps(wt542, dat2225, sum431);
sum432 = _mm512_fmadd_ps(wt542, dat2226, sum432);
__m512 wt543 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)32));
sum433 = _mm512_fmadd_ps(wt543, dat2225, sum433);
sum434 = _mm512_fmadd_ps(wt543, dat2226, sum434);
__m512 wt544 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)36));
sum435 = _mm512_fmadd_ps(wt544, dat2225, sum435);
sum436 = _mm512_fmadd_ps(wt544, dat2226, sum436);
__m512 wt545 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)40));
sum437 = _mm512_fmadd_ps(wt545, dat2225, sum437);
sum438 = _mm512_fmadd_ps(wt545, dat2226, sum438);
__m512 wt546 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)44));
sum439 = _mm512_fmadd_ps(wt546, dat2225, sum439);
sum440 = _mm512_fmadd_ps(wt546, dat2226, sum440);
}
__m512 dat2227 = sum429;
__m512 dat2228 = sum430;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)0, 16383, dat2227);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)56, 16383, dat2228);
__m512 dat2229 = sum431;
__m512 dat2230 = sum432;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)832, 16383, dat2229);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)888, 16383, dat2230);
__m512 dat2231 = sum433;
__m512 dat2232 = sum434;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)1664, 16383, dat2231);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)1720, 16383, dat2232);
__m512 dat2233 = sum435;
__m512 dat2234 = sum436;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)2496, 16383, dat2233);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)2552, 16383, dat2234);
__m512 dat2235 = sum437;
__m512 dat2236 = sum438;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)3328, 16383, dat2235);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)3384, 16383, dat2236);
__m512 dat2237 = sum439;
__m512 dat2238 = sum440;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)4160, 16383, dat2237);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)4216, 16383, dat2238);
if (k146 >= kk47) return;
}
// k146 == 213: trailing 2-channel group x 2 rows.
ptrdiff_t s41 = -1;
__m512 sum441 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+8*s41+(ptrdiff_t)8));
__m512 sum443 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+8*s41+(ptrdiff_t)12));
__m512 sum442 = sum441;
__m512 sum444 = sum443;
for (s41 = 0; s41 < 512; ++s41) {
__m512 dat2239 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+128*s41+(ptrdiff_t)0);
__m512 dat2240 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+128*s41+(ptrdiff_t)64);
__m512 wt547 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+8*s41+(ptrdiff_t)8));
sum441 = _mm512_fmadd_ps(wt547, dat2239, sum441);
sum442 = _mm512_fmadd_ps(wt547, dat2240, sum442);
__m512 wt548 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+8*s41+(ptrdiff_t)12));
sum443 = _mm512_fmadd_ps(wt548, dat2239, sum443);
sum444 = _mm512_fmadd_ps(wt548, dat2240, sum444);
}
__m512 dat2241 = sum441;
__m512 dat2242 = sum442;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)0, 16383, dat2241);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)56, 16383, dat2242);
__m512 dat2243 = sum443;
__m512 dat2244 = sum444;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)832, 16383, dat2243);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)888, 16383, dat2244);
if (j43 >= jj43) return;
}
}
j43 = 4;
}
}

/*
 * Hands ResNet50OneApply7Callee1 to the thread team.
 *
 * team55    - team passed straight to ResNet50ThreaderDo1.
 * tensors83 - tensor table; it becomes element 0 of a two-entry payload
 *             whose second entry is null.
 *
 * The task describes a 3-dimensional index space with extents 214 x 4 x 1.
 * The callee reads coordinate 0 as its output-channel group and coordinate 1
 * as its row band.
 */
static void ResNet50OneApply7(ResNet50ThreaderTeam1* team55, char** tensors83) {
    void* payload[2];
    payload[0] = tensors83;
    payload[1] = 0;
    ResNet50ThreaderTask1 job = {
        .callee1 = ResNet50OneApply7Callee1,
        .any1 = payload,
        .nd1 = 3,
        .hull1 = {214, 4, 1},
    };
    ResNet50ThreaderDo1(team55, &job);
}

static void ResNet50OneArrangeWts8Callee1(ResNet50ThreaderTask1* task96, int64_t* pt53) {
char** tensors94 = task96->any1;
ptrdiff_t b66 = pt53[0];
char*restrict wtPtr17 = tensors94[0]+(ptrdiff_t)3340*0+(ptrdiff_t)1048576*0;
char*restrict biasPtr17 = tensors94[1]+(ptrdiff_t)4096*0;
char*restrict bnPtr17 = tensors94[2]+(ptrdiff_t)8*1024*0;
char*restrict arranged15 = tensors94[3]+(ptrdiff_t)3424256*0+(ptrdiff_t)1052672*0;
ptrdiff_t ii22 = 1;
for (ptrdiff_t i56 = 0; i56 < ii22; ++i56) {
ptrdiff_t j48 = 2*b66;
ptrdiff_t jj45 = j48+2;
for (; j48 < jj45; ++j48) {
if (j48 < 63) {
ptrdiff_t k154 = 0+16*(j48-0);
ptrdiff_t l67 = (size_t)(0+k154)/6;
ptrdiff_t cut24 = (size_t)(0+k154)%6;
switch (cut24) {
case 0:;
case 2: {
__m512 sum482 = _mm512_maskz_loadu_ps(65535, biasPtr17+4096*i56+4*k154);
__m512i pmMul34 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd34 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo28 = _mm512_loadu_ps(bnPtr17+(ptrdiff_t)8*(k154+1024*i56));
__m512 masHi28 = _mm512_maskz_loadu_ps(65535, bnPtr17+(ptrdiff_t)8*(k154+1024*i56)+(ptrdiff_t)64);
__m512 postMul57 = _mm512_permutex2var_ps(masLo28, pmMul34, masHi28);
__m512 postAdd35 = _mm512_permutex2var_ps(masLo28, pmAdd34, masHi28);
sum482 = _mm512_fmadd_ps(sum482, postMul57, postAdd35);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*0+(ptrdiff_t)0, 63>>cut24, sum482);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*0+(ptrdiff_t)6144, 4032>>cut24, sum482);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*0+(ptrdiff_t)12288, 65535-(4095>>cut24), sum482);
ptrdiff_t c45 = 0;
for (; c45 != 16; ++c45) {
__m512 wt569 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)0);
__m512 wt570 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)1024);
__m512 wt571 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)2048);
__m512 wt572 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)3072);
__m512 wt573 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)4096);
__m512 wt574 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)5120);
__m512 wt575 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)6144);
__m512 wt576 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)7168);
__m512 wt577 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)8192);
__m512 wt578 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)9216);
__m512 wt579 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)10240);
__m512 wt580 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)11264);
__m512 wt581 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)12288);
__m512 wt582 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)13312);
__m512 wt583 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)14336);
__m512 wt584 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)15360);
__m512 tmp17773 = _mm512_unpacklo_ps(wt569, wt570);
__m512 tmp17774 = _mm512_unpackhi_ps(wt569, wt570);
__m512 tmp17775 = _mm512_unpacklo_ps(wt571, wt572);
__m512 tmp17776 = _mm512_unpackhi_ps(wt571, wt572);
__m512 tmp17777 = _mm512_unpacklo_ps(wt573, wt574);
__m512 tmp17778 = _mm512_unpackhi_ps(wt573, wt574);
__m512 tmp17779 = _mm512_unpacklo_ps(wt575, wt576);
__m512 tmp17780 = _mm512_unpackhi_ps(wt575, wt576);
__m512 tmp17781 = _mm512_unpacklo_ps(wt577, wt578);
__m512 tmp17782 = _mm512_unpackhi_ps(wt577, wt578);
__m512 tmp17783 = _mm512_unpacklo_ps(wt579, wt580);
__m512 tmp17784 = _mm512_unpackhi_ps(wt579, wt580);
__m512 tmp17785 = _mm512_unpacklo_ps(wt581, wt582);
__m512 tmp17786 = _mm512_unpackhi_ps(wt581, wt582);
__m512 tmp17787 = _mm512_unpacklo_ps(wt583, wt584);
__m512 tmp17788 = _mm512_unpackhi_ps(wt583, wt584);
__m512 tmp17789 = _mm512_shuffle_ps(tmp17773, tmp17775, 68);
__m512 tmp17790 = _mm512_shuffle_ps(tmp17773, tmp17775, 238);
__m512 tmp17791 = _mm512_shuffle_ps(tmp17774, tmp17776, 68);
__m512 tmp17792 = _mm512_shuffle_ps(tmp17774, tmp17776, 238);
__m512 tmp17793 = _mm512_shuffle_ps(tmp17777, tmp17779, 68);
__m512 tmp17794 = _mm512_shuffle_ps(tmp17777, tmp17779, 238);
__m512 tmp17795 = _mm512_shuffle_ps(tmp17778, tmp17780, 68);
__m512 tmp17796 = _mm512_shuffle_ps(tmp17778, tmp17780, 238);
__m512 tmp17797 = _mm512_shuffle_ps(tmp17781, tmp17783, 68);
__m512 tmp17798 = _mm512_shuffle_ps(tmp17781, tmp17783, 238);
__m512 tmp17799 = _mm512_shuffle_ps(tmp17782, tmp17784, 68);
__m512 tmp17800 = _mm512_shuffle_ps(tmp17782, tmp17784, 238);
__m512 tmp17801 = _mm512_shuffle_ps(tmp17785, tmp17787, 68);
__m512 tmp17802 = _mm512_shuffle_ps(tmp17785, tmp17787, 238);
__m512 tmp17803 = _mm512_shuffle_ps(tmp17786, tmp17788, 68);
__m512 tmp17804 = _mm512_shuffle_ps(tmp17786, tmp17788, 238);
__m512 tmp17805 = _mm512_shuffle_f32x4(tmp17789, tmp17793, 136);
__m512 tmp17806 = _mm512_shuffle_f32x4(tmp17789, tmp17793, 221);
__m512 tmp17807 = _mm512_shuffle_f32x4(tmp17790, tmp17794, 136);
__m512 tmp17808 = _mm512_shuffle_f32x4(tmp17790, tmp17794, 221);
__m512 tmp17809 = _mm512_shuffle_f32x4(tmp17791, tmp17795, 136);
__m512 tmp17810 = _mm512_shuffle_f32x4(tmp17791, tmp17795, 221);
__m512 tmp17811 = _mm512_shuffle_f32x4(tmp17792, tmp17796, 136);
__m512 tmp17812 = _mm512_shuffle_f32x4(tmp17792, tmp17796, 221);
__m512 tmp17813 = _mm512_shuffle_f32x4(tmp17797, tmp17801, 136);
__m512 tmp17814 = _mm512_shuffle_f32x4(tmp17797, tmp17801, 221);
__m512 tmp17815 = _mm512_shuffle_f32x4(tmp17798, tmp17802, 136);
__m512 tmp17816 = _mm512_shuffle_f32x4(tmp17798, tmp17802, 221);
__m512 tmp17817 = _mm512_shuffle_f32x4(tmp17799, tmp17803, 136);
__m512 tmp17818 = _mm512_shuffle_f32x4(tmp17799, tmp17803, 221);
__m512 tmp17819 = _mm512_shuffle_f32x4(tmp17800, tmp17804, 136);
__m512 tmp17820 = _mm512_shuffle_f32x4(tmp17800, tmp17804, 221);
wt569 = _mm512_shuffle_f32x4(tmp17805, tmp17813, 136);
wt577 = _mm512_shuffle_f32x4(tmp17805, tmp17813, 221);
wt570 = _mm512_shuffle_f32x4(tmp17807, tmp17815, 136);
wt578 = _mm512_shuffle_f32x4(tmp17807, tmp17815, 221);
wt571 = _mm512_shuffle_f32x4(tmp17809, tmp17817, 136);
wt579 = _mm512_shuffle_f32x4(tmp17809, tmp17817, 221);
wt572 = _mm512_shuffle_f32x4(tmp17811, tmp17819, 136);
wt580 = _mm512_shuffle_f32x4(tmp17811, tmp17819, 221);
wt573 = _mm512_shuffle_f32x4(tmp17806, tmp17814, 136);
wt581 = _mm512_shuffle_f32x4(tmp17806, tmp17814, 221);
wt574 = _mm512_shuffle_f32x4(tmp17808, tmp17816, 136);
wt582 = _mm512_shuffle_f32x4(tmp17808, tmp17816, 221);
wt575 = _mm512_shuffle_f32x4(tmp17810, tmp17818, 136);
wt583 = _mm512_shuffle_f32x4(tmp17810, tmp17818, 221);
wt576 = _mm512_shuffle_f32x4(tmp17812, tmp17820, 136);
wt584 = _mm512_shuffle_f32x4(tmp17812, tmp17820, 221);
wt569 = _mm512_mul_ps(wt569, postMul57);
wt570 = _mm512_mul_ps(wt570, postMul57);
wt571 = _mm512_mul_ps(wt571, postMul57);
wt572 = _mm512_mul_ps(wt572, postMul57);
wt573 = _mm512_mul_ps(wt573, postMul57);
wt574 = _mm512_mul_ps(wt574, postMul57);
wt575 = _mm512_mul_ps(wt575, postMul57);
wt576 = _mm512_mul_ps(wt576, postMul57);
wt577 = _mm512_mul_ps(wt577, postMul57);
wt578 = _mm512_mul_ps(wt578, postMul57);
wt579 = _mm512_mul_ps(wt579, postMul57);
wt580 = _mm512_mul_ps(wt580, postMul57);
wt581 = _mm512_mul_ps(wt581, postMul57);
wt582 = _mm512_mul_ps(wt582, postMul57);
wt583 = _mm512_mul_ps(wt583, postMul57);
wt584 = _mm512_mul_ps(wt584, postMul57);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(1+16*c45)+(ptrdiff_t)0, 63>>cut24, wt569);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(2+16*c45)+(ptrdiff_t)0, 63>>cut24, wt570);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(3+16*c45)+(ptrdiff_t)0, 63>>cut24, wt571);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(4+16*c45)+(ptrdiff_t)0, 63>>cut24, wt572);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(5+16*c45)+(ptrdiff_t)0, 63>>cut24, wt573);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(6+16*c45)+(ptrdiff_t)0, 63>>cut24, wt574);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(7+16*c45)+(ptrdiff_t)0, 63>>cut24, wt575);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(8+16*c45)+(ptrdiff_t)0, 63>>cut24, wt576);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(9+16*c45)+(ptrdiff_t)0, 63>>cut24, wt577);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(10+16*c45)+(ptrdiff_t)0, 63>>cut24, wt578);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(11+16*c45)+(ptrdiff_t)0, 63>>cut24, wt579);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(12+16*c45)+(ptrdiff_t)0, 63>>cut24, wt580);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(13+16*c45)+(ptrdiff_t)0, 63>>cut24, wt581);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(14+16*c45)+(ptrdiff_t)0, 63>>cut24, wt582);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(15+16*c45)+(ptrdiff_t)0, 63>>cut24, wt583);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(16+16*c45)+(ptrdiff_t)0, 63>>cut24, wt584);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(1+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt569);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(2+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt570);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(3+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt571);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(4+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt572);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(5+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt573);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(6+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt574);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(7+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt575);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(8+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt576);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(9+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt577);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(10+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt578);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(11+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt579);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(12+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt580);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(13+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt581);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(14+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt582);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(15+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt583);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(16+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt584);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(1+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt569);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(2+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt570);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(3+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt571);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(4+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt572);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(5+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt573);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(6+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt574);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(7+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt575);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(8+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt576);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(9+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt577);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(10+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt578);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(11+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt579);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(12+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt580);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(13+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt581);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(14+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt582);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(15+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt583);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(16+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt584);
}
break;
}
default: {
cut24 = 4;
__m512 sum483 = _mm512_maskz_loadu_ps(65535, biasPtr17+4096*i56+4*k154);
__m512i pmMul35 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd35 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo29 = _mm512_loadu_ps(bnPtr17+(ptrdiff_t)8*(k154+1024*i56));
__m512 masHi29 = _mm512_maskz_loadu_ps(65535, bnPtr17+(ptrdiff_t)8*(k154+1024*i56)+(ptrdiff_t)64);
__m512 postMul58 = _mm512_permutex2var_ps(masLo29, pmMul35, masHi29);
__m512 postAdd36 = _mm512_permutex2var_ps(masLo29, pmAdd35, masHi29);
sum483 = _mm512_fmadd_ps(sum483, postMul58, postAdd36);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*0+(ptrdiff_t)0, 63>>cut24, sum483);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*0+(ptrdiff_t)6144, 4032>>cut24, sum483);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*0+(ptrdiff_t)12288, 258048>>cut24, sum483);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*0+(ptrdiff_t)18432, 65535-(262143>>cut24), sum483);
ptrdiff_t c46 = 0;
for (; c46 != 16; ++c46) {
__m512 wt585 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)0);
__m512 wt586 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)1024);
__m512 wt587 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)2048);
__m512 wt588 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)3072);
__m512 wt589 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)4096);
__m512 wt590 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)5120);
__m512 wt591 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)6144);
__m512 wt592 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)7168);
__m512 wt593 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)8192);
__m512 wt594 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)9216);
__m512 wt595 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)10240);
__m512 wt596 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)11264);
__m512 wt597 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)12288);
__m512 wt598 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)13312);
__m512 wt599 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)14336);
__m512 wt600 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)15360);
__m512 tmp17821 = _mm512_unpacklo_ps(wt585, wt586);
__m512 tmp17822 = _mm512_unpackhi_ps(wt585, wt586);
__m512 tmp17823 = _mm512_unpacklo_ps(wt587, wt588);
__m512 tmp17824 = _mm512_unpackhi_ps(wt587, wt588);
__m512 tmp17825 = _mm512_unpacklo_ps(wt589, wt590);
__m512 tmp17826 = _mm512_unpackhi_ps(wt589, wt590);
__m512 tmp17827 = _mm512_unpacklo_ps(wt591, wt592);
__m512 tmp17828 = _mm512_unpackhi_ps(wt591, wt592);
__m512 tmp17829 = _mm512_unpacklo_ps(wt593, wt594);
__m512 tmp17830 = _mm512_unpackhi_ps(wt593, wt594);
__m512 tmp17831 = _mm512_unpacklo_ps(wt595, wt596);
__m512 tmp17832 = _mm512_unpackhi_ps(wt595, wt596);
__m512 tmp17833 = _mm512_unpacklo_ps(wt597, wt598);
__m512 tmp17834 = _mm512_unpackhi_ps(wt597, wt598);
__m512 tmp17835 = _mm512_unpacklo_ps(wt599, wt600);
__m512 tmp17836 = _mm512_unpackhi_ps(wt599, wt600);
__m512 tmp17837 = _mm512_shuffle_ps(tmp17821, tmp17823, 68);
__m512 tmp17838 = _mm512_shuffle_ps(tmp17821, tmp17823, 238);
__m512 tmp17839 = _mm512_shuffle_ps(tmp17822, tmp17824, 68);
__m512 tmp17840 = _mm512_shuffle_ps(tmp17822, tmp17824, 238);
__m512 tmp17841 = _mm512_shuffle_ps(tmp17825, tmp17827, 68);
__m512 tmp17842 = _mm512_shuffle_ps(tmp17825, tmp17827, 238);
__m512 tmp17843 = _mm512_shuffle_ps(tmp17826, tmp17828, 68);
__m512 tmp17844 = _mm512_shuffle_ps(tmp17826, tmp17828, 238);
__m512 tmp17845 = _mm512_shuffle_ps(tmp17829, tmp17831, 68);
__m512 tmp17846 = _mm512_shuffle_ps(tmp17829, tmp17831, 238);
__m512 tmp17847 = _mm512_shuffle_ps(tmp17830, tmp17832, 68);
__m512 tmp17848 = _mm512_shuffle_ps(tmp17830, tmp17832, 238);
__m512 tmp17849 = _mm512_shuffle_ps(tmp17833, tmp17835, 68);
__m512 tmp17850 = _mm512_shuffle_ps(tmp17833, tmp17835, 238);
__m512 tmp17851 = _mm512_shuffle_ps(tmp17834, tmp17836, 68);
__m512 tmp17852 = _mm512_shuffle_ps(tmp17834, tmp17836, 238);
__m512 tmp17853 = _mm512_shuffle_f32x4(tmp17837, tmp17841, 136);
__m512 tmp17854 = _mm512_shuffle_f32x4(tmp17837, tmp17841, 221);
__m512 tmp17855 = _mm512_shuffle_f32x4(tmp17838, tmp17842, 136);
__m512 tmp17856 = _mm512_shuffle_f32x4(tmp17838, tmp17842, 221);
__m512 tmp17857 = _mm512_shuffle_f32x4(tmp17839, tmp17843, 136);
__m512 tmp17858 = _mm512_shuffle_f32x4(tmp17839, tmp17843, 221);
__m512 tmp17859 = _mm512_shuffle_f32x4(tmp17840, tmp17844, 136);
__m512 tmp17860 = _mm512_shuffle_f32x4(tmp17840, tmp17844, 221);
__m512 tmp17861 = _mm512_shuffle_f32x4(tmp17845, tmp17849, 136);
__m512 tmp17862 = _mm512_shuffle_f32x4(tmp17845, tmp17849, 221);
__m512 tmp17863 = _mm512_shuffle_f32x4(tmp17846, tmp17850, 136);
__m512 tmp17864 = _mm512_shuffle_f32x4(tmp17846, tmp17850, 221);
__m512 tmp17865 = _mm512_shuffle_f32x4(tmp17847, tmp17851, 136);
__m512 tmp17866 = _mm512_shuffle_f32x4(tmp17847, tmp17851, 221);
__m512 tmp17867 = _mm512_shuffle_f32x4(tmp17848, tmp17852, 136);
__m512 tmp17868 = _mm512_shuffle_f32x4(tmp17848, tmp17852, 221);
wt585 = _mm512_shuffle_f32x4(tmp17853, tmp17861, 136);
wt593 = _mm512_shuffle_f32x4(tmp17853, tmp17861, 221);
wt586 = _mm512_shuffle_f32x4(tmp17855, tmp17863, 136);
wt594 = _mm512_shuffle_f32x4(tmp17855, tmp17863, 221);
wt587 = _mm512_shuffle_f32x4(tmp17857, tmp17865, 136);
wt595 = _mm512_shuffle_f32x4(tmp17857, tmp17865, 221);
wt588 = _mm512_shuffle_f32x4(tmp17859, tmp17867, 136);
wt596 = _mm512_shuffle_f32x4(tmp17859, tmp17867, 221);
wt589 = _mm512_shuffle_f32x4(tmp17854, tmp17862, 136);
wt597 = _mm512_shuffle_f32x4(tmp17854, tmp17862, 221);
wt590 = _mm512_shuffle_f32x4(tmp17856, tmp17864, 136);
wt598 = _mm512_shuffle_f32x4(tmp17856, tmp17864, 221);
wt591 = _mm512_shuffle_f32x4(tmp17858, tmp17866, 136);
wt599 = _mm512_shuffle_f32x4(tmp17858, tmp17866, 221);
wt592 = _mm512_shuffle_f32x4(tmp17860, tmp17868, 136);
wt600 = _mm512_shuffle_f32x4(tmp17860, tmp17868, 221);
wt585 = _mm512_mul_ps(wt585, postMul58);
wt586 = _mm512_mul_ps(wt586, postMul58);
wt587 = _mm512_mul_ps(wt587, postMul58);
wt588 = _mm512_mul_ps(wt588, postMul58);
wt589 = _mm512_mul_ps(wt589, postMul58);
wt590 = _mm512_mul_ps(wt590, postMul58);
wt591 = _mm512_mul_ps(wt591, postMul58);
wt592 = _mm512_mul_ps(wt592, postMul58);
wt593 = _mm512_mul_ps(wt593, postMul58);
wt594 = _mm512_mul_ps(wt594, postMul58);
wt595 = _mm512_mul_ps(wt595, postMul58);
wt596 = _mm512_mul_ps(wt596, postMul58);
wt597 = _mm512_mul_ps(wt597, postMul58);
wt598 = _mm512_mul_ps(wt598, postMul58);
wt599 = _mm512_mul_ps(wt599, postMul58);
wt600 = _mm512_mul_ps(wt600, postMul58);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(1+16*c46)+(ptrdiff_t)0, 63>>cut24, wt585);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(2+16*c46)+(ptrdiff_t)0, 63>>cut24, wt586);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(3+16*c46)+(ptrdiff_t)0, 63>>cut24, wt587);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(4+16*c46)+(ptrdiff_t)0, 63>>cut24, wt588);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(5+16*c46)+(ptrdiff_t)0, 63>>cut24, wt589);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(6+16*c46)+(ptrdiff_t)0, 63>>cut24, wt590);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(7+16*c46)+(ptrdiff_t)0, 63>>cut24, wt591);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(8+16*c46)+(ptrdiff_t)0, 63>>cut24, wt592);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(9+16*c46)+(ptrdiff_t)0, 63>>cut24, wt593);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(10+16*c46)+(ptrdiff_t)0, 63>>cut24, wt594);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(11+16*c46)+(ptrdiff_t)0, 63>>cut24, wt595);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(12+16*c46)+(ptrdiff_t)0, 63>>cut24, wt596);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(13+16*c46)+(ptrdiff_t)0, 63>>cut24, wt597);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(14+16*c46)+(ptrdiff_t)0, 63>>cut24, wt598);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(15+16*c46)+(ptrdiff_t)0, 63>>cut24, wt599);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(16+16*c46)+(ptrdiff_t)0, 63>>cut24, wt600);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(1+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt585);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(2+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt586);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(3+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt587);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(4+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt588);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(5+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt589);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(6+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt590);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(7+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt591);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(8+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt592);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(9+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt593);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(10+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt594);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(11+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt595);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(12+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt596);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(13+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt597);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(14+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt598);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(15+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt599);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(16+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt600);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(1+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt585);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(2+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt586);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(3+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt587);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(4+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt588);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(5+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt589);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(6+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt590);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(7+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt591);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(8+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt592);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(9+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt593);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(10+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt594);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(11+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt595);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(12+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt596);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(13+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt597);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(14+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt598);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(15+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt599);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(16+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt600);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(1+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt585);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(2+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt586);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(3+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt587);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(4+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt588);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(5+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt589);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(6+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt590);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(7+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt591);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(8+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt592);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(9+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt593);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(10+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt594);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(11+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt595);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(12+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt596);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(13+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt597);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(14+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt598);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(15+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt599);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(16+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt600);
}
}
}
} else {
ptrdiff_t k153 = 1008;
ptrdiff_t l66 = (size_t)(0+k153)/6;
ptrdiff_t cut23 = (size_t)(0+k153)%6;
__m512 sum481 = _mm512_maskz_loadu_ps(65535, biasPtr17+4096*i56+4*k153);
__m512i pmMul36 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd36 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo30 = _mm512_loadu_ps(bnPtr17+(ptrdiff_t)8*(k153+1024*i56));
__m512 masHi30 = _mm512_maskz_loadu_ps(65535, bnPtr17+(ptrdiff_t)8*(k153+1024*i56)+(ptrdiff_t)64);
__m512 postMul56 = _mm512_permutex2var_ps(masLo30, pmMul36, masHi30);
__m512 postAdd34 = _mm512_permutex2var_ps(masLo30, pmAdd36, masHi30);
sum481 = _mm512_fmadd_ps(sum481, postMul56, postAdd34);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*0+(ptrdiff_t)0, 63>>cut23, sum481);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*0+(ptrdiff_t)6144, 4032>>cut23, sum481);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*0+(ptrdiff_t)12288, 65535-(4095>>cut23), sum481);
ptrdiff_t c44 = 0;
for (; c44 != 16; ++c44) {
__m512 wt553 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)0);
__m512 wt554 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)1024);
__m512 wt555 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)2048);
__m512 wt556 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)3072);
__m512 wt557 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)4096);
__m512 wt558 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)5120);
__m512 wt559 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)6144);
__m512 wt560 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)7168);
__m512 wt561 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)8192);
__m512 wt562 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)9216);
__m512 wt563 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)10240);
__m512 wt564 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)11264);
__m512 wt565 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)12288);
__m512 wt566 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)13312);
__m512 wt567 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)14336);
__m512 wt568 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)15360);
__m512 tmp17869 = _mm512_unpacklo_ps(wt553, wt554);
__m512 tmp17870 = _mm512_unpackhi_ps(wt553, wt554);
__m512 tmp17871 = _mm512_unpacklo_ps(wt555, wt556);
__m512 tmp17872 = _mm512_unpackhi_ps(wt555, wt556);
__m512 tmp17873 = _mm512_unpacklo_ps(wt557, wt558);
__m512 tmp17874 = _mm512_unpackhi_ps(wt557, wt558);
__m512 tmp17875 = _mm512_unpacklo_ps(wt559, wt560);
__m512 tmp17876 = _mm512_unpackhi_ps(wt559, wt560);
__m512 tmp17877 = _mm512_unpacklo_ps(wt561, wt562);
__m512 tmp17878 = _mm512_unpackhi_ps(wt561, wt562);
__m512 tmp17879 = _mm512_unpacklo_ps(wt563, wt564);
__m512 tmp17880 = _mm512_unpackhi_ps(wt563, wt564);
__m512 tmp17881 = _mm512_unpacklo_ps(wt565, wt566);
__m512 tmp17882 = _mm512_unpackhi_ps(wt565, wt566);
__m512 tmp17883 = _mm512_unpacklo_ps(wt567, wt568);
__m512 tmp17884 = _mm512_unpackhi_ps(wt567, wt568);
__m512 tmp17885 = _mm512_shuffle_ps(tmp17869, tmp17871, 68);
__m512 tmp17886 = _mm512_shuffle_ps(tmp17869, tmp17871, 238);
__m512 tmp17887 = _mm512_shuffle_ps(tmp17870, tmp17872, 68);
__m512 tmp17888 = _mm512_shuffle_ps(tmp17870, tmp17872, 238);
__m512 tmp17889 = _mm512_shuffle_ps(tmp17873, tmp17875, 68);
__m512 tmp17890 = _mm512_shuffle_ps(tmp17873, tmp17875, 238);
__m512 tmp17891 = _mm512_shuffle_ps(tmp17874, tmp17876, 68);
__m512 tmp17892 = _mm512_shuffle_ps(tmp17874, tmp17876, 238);
__m512 tmp17893 = _mm512_shuffle_ps(tmp17877, tmp17879, 68);
__m512 tmp17894 = _mm512_shuffle_ps(tmp17877, tmp17879, 238);
__m512 tmp17895 = _mm512_shuffle_ps(tmp17878, tmp17880, 68);
__m512 tmp17896 = _mm512_shuffle_ps(tmp17878, tmp17880, 238);
__m512 tmp17897 = _mm512_shuffle_ps(tmp17881, tmp17883, 68);
__m512 tmp17898 = _mm512_shuffle_ps(tmp17881, tmp17883, 238);
__m512 tmp17899 = _mm512_shuffle_ps(tmp17882, tmp17884, 68);
__m512 tmp17900 = _mm512_shuffle_ps(tmp17882, tmp17884, 238);
__m512 tmp17901 = _mm512_shuffle_f32x4(tmp17885, tmp17889, 136);
__m512 tmp17902 = _mm512_shuffle_f32x4(tmp17885, tmp17889, 221);
__m512 tmp17903 = _mm512_shuffle_f32x4(tmp17886, tmp17890, 136);
__m512 tmp17904 = _mm512_shuffle_f32x4(tmp17886, tmp17890, 221);
__m512 tmp17905 = _mm512_shuffle_f32x4(tmp17887, tmp17891, 136);
__m512 tmp17906 = _mm512_shuffle_f32x4(tmp17887, tmp17891, 221);
__m512 tmp17907 = _mm512_shuffle_f32x4(tmp17888, tmp17892, 136);
__m512 tmp17908 = _mm512_shuffle_f32x4(tmp17888, tmp17892, 221);
__m512 tmp17909 = _mm512_shuffle_f32x4(tmp17893, tmp17897, 136);
__m512 tmp17910 = _mm512_shuffle_f32x4(tmp17893, tmp17897, 221);
__m512 tmp17911 = _mm512_shuffle_f32x4(tmp17894, tmp17898, 136);
__m512 tmp17912 = _mm512_shuffle_f32x4(tmp17894, tmp17898, 221);
__m512 tmp17913 = _mm512_shuffle_f32x4(tmp17895, tmp17899, 136);
__m512 tmp17914 = _mm512_shuffle_f32x4(tmp17895, tmp17899, 221);
__m512 tmp17915 = _mm512_shuffle_f32x4(tmp17896, tmp17900, 136);
__m512 tmp17916 = _mm512_shuffle_f32x4(tmp17896, tmp17900, 221);
wt553 = _mm512_shuffle_f32x4(tmp17901, tmp17909, 136);
wt561 = _mm512_shuffle_f32x4(tmp17901, tmp17909, 221);
wt554 = _mm512_shuffle_f32x4(tmp17903, tmp17911, 136);
wt562 = _mm512_shuffle_f32x4(tmp17903, tmp17911, 221);
wt555 = _mm512_shuffle_f32x4(tmp17905, tmp17913, 136);
wt563 = _mm512_shuffle_f32x4(tmp17905, tmp17913, 221);
wt556 = _mm512_shuffle_f32x4(tmp17907, tmp17915, 136);
wt564 = _mm512_shuffle_f32x4(tmp17907, tmp17915, 221);
wt557 = _mm512_shuffle_f32x4(tmp17902, tmp17910, 136);
wt565 = _mm512_shuffle_f32x4(tmp17902, tmp17910, 221);
wt558 = _mm512_shuffle_f32x4(tmp17904, tmp17912, 136);
wt566 = _mm512_shuffle_f32x4(tmp17904, tmp17912, 221);
wt559 = _mm512_shuffle_f32x4(tmp17906, tmp17914, 136);
wt567 = _mm512_shuffle_f32x4(tmp17906, tmp17914, 221);
wt560 = _mm512_shuffle_f32x4(tmp17908, tmp17916, 136);
wt568 = _mm512_shuffle_f32x4(tmp17908, tmp17916, 221);
wt553 = _mm512_mul_ps(wt553, postMul56);
wt554 = _mm512_mul_ps(wt554, postMul56);
wt555 = _mm512_mul_ps(wt555, postMul56);
wt556 = _mm512_mul_ps(wt556, postMul56);
wt557 = _mm512_mul_ps(wt557, postMul56);
wt558 = _mm512_mul_ps(wt558, postMul56);
wt559 = _mm512_mul_ps(wt559, postMul56);
wt560 = _mm512_mul_ps(wt560, postMul56);
wt561 = _mm512_mul_ps(wt561, postMul56);
wt562 = _mm512_mul_ps(wt562, postMul56);
wt563 = _mm512_mul_ps(wt563, postMul56);
wt564 = _mm512_mul_ps(wt564, postMul56);
wt565 = _mm512_mul_ps(wt565, postMul56);
wt566 = _mm512_mul_ps(wt566, postMul56);
wt567 = _mm512_mul_ps(wt567, postMul56);
wt568 = _mm512_mul_ps(wt568, postMul56);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(1+16*c44)+(ptrdiff_t)0, 63>>cut23, wt553);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(2+16*c44)+(ptrdiff_t)0, 63>>cut23, wt554);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(3+16*c44)+(ptrdiff_t)0, 63>>cut23, wt555);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(4+16*c44)+(ptrdiff_t)0, 63>>cut23, wt556);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(5+16*c44)+(ptrdiff_t)0, 63>>cut23, wt557);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(6+16*c44)+(ptrdiff_t)0, 63>>cut23, wt558);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(7+16*c44)+(ptrdiff_t)0, 63>>cut23, wt559);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(8+16*c44)+(ptrdiff_t)0, 63>>cut23, wt560);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(9+16*c44)+(ptrdiff_t)0, 63>>cut23, wt561);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(10+16*c44)+(ptrdiff_t)0, 63>>cut23, wt562);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(11+16*c44)+(ptrdiff_t)0, 63>>cut23, wt563);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(12+16*c44)+(ptrdiff_t)0, 63>>cut23, wt564);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(13+16*c44)+(ptrdiff_t)0, 63>>cut23, wt565);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(14+16*c44)+(ptrdiff_t)0, 63>>cut23, wt566);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(15+16*c44)+(ptrdiff_t)0, 63>>cut23, wt567);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(16+16*c44)+(ptrdiff_t)0, 63>>cut23, wt568);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(1+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt553);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(2+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt554);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(3+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt555);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(4+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt556);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(5+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt557);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(6+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt558);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(7+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt559);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(8+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt560);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(9+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt561);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(10+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt562);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(11+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt563);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(12+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt564);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(13+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt565);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(14+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt566);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(15+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt567);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(16+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt568);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(1+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt553);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(2+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt554);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(3+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt555);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(4+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt556);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(5+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt557);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(6+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt558);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(7+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt559);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(8+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt560);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(9+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt561);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(10+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt562);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(11+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt563);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(12+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt564);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(13+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt565);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(14+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt566);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(15+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt567);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(16+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt568);
}
}
}
}
}

// Launcher for the weight-arrangement pass: fills in a 3-dimensional task
// descriptor with hull 32 x 1 x 1, points it at
// ResNet50OneArrangeWts8Callee1, and passes it to ResNet50ThreaderDo1.
// tensors93 is forwarded untouched through the task's any1 field.
// NOTE(review): hull1[] is presumably the extent of each task-grid
// dimension -- confirm against the threader implementation.
static void ResNet50OneArrangeWts8(ResNet50ThreaderTeam1* team60, char** tensors93) {
	static const int extents[3] = {32, 1, 1};
	ResNet50ThreaderTask1 job;
	job.nd1 = 3;
	for (int dim = 0; dim < 3; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	job.any1 = tensors93;
	job.callee1 = ResNet50OneArrangeWts8Callee1;
	ResNet50ThreaderDo1(team60, &job);
}

// Task body of the data-arrangement pass.
//
// task98->any1 is an array of byte pointers: [0] is the source buffer,
// [1] is the arranged destination buffer.
// pt54[0] selects a slab of 128 consecutive source rows (rows are 832 bytes
// apart in the source); pt54[1] selects a 256-byte column block inside
// each row.
//
// For every column block except 3, the 64 floats (4 vectors of 16) of each
// row in the slab are copied to dst+65536*block+256*row. Column block 3 is
// the narrow tail: only the low 4 floats of each row are copied, and the
// destination rows are packed 64 bytes apart instead of 256.
static void ResNet50OneArrangeDats8Callee1(ResNet50ThreaderTask1* task98, int64_t* pt54) {
	char** bufs = task98->any1;
	const ptrdiff_t slab = pt54[0];
	const ptrdiff_t blk = pt54[1];
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];
	const ptrdiff_t rowBegin = 128*slab;
	const ptrdiff_t rowEnd = rowBegin+128;
	if (blk == 3) {
		// Narrow tail block: 4 floats per row (mask 15 = low 4 lanes).
		for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
			const __m512 v = _mm512_maskz_loadu_ps(15, src+256*blk+832*row);
			_mm512_mask_storeu_ps(dst+65536*blk+64*row, 15, v);
		}
		return;
	}
	// Full-width block: 4 whole vectors per row, all loads before all stores.
	for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
		const char* in = src+256*blk+832*row;
		char* out = dst+65536*blk+256*row;
		const __m512 v0 = _mm512_maskz_loadu_ps(65535, in);
		const __m512 v1 = _mm512_maskz_loadu_ps(65535, in+64);
		const __m512 v2 = _mm512_maskz_loadu_ps(65535, in+128);
		const __m512 v3 = _mm512_maskz_loadu_ps(65535, in+192);
		_mm512_mask_storeu_ps(out, 65535, v0);
		_mm512_mask_storeu_ps(out+64, 65535, v1);
		_mm512_mask_storeu_ps(out+128, 65535, v2);
		_mm512_mask_storeu_ps(out+192, 65535, v3);
	}
}

// Launcher for the data-arrangement pass: builds a 4-dimensional task
// descriptor with hull 2 x 4 x 1 x 1 around ResNet50OneArrangeDats8Callee1
// and passes it to ResNet50ThreaderDo1. tensors95 travels to the callee via
// the task's any1 field.
// NOTE(review): hull1[] is presumably the extent of each task-grid
// dimension (the callee reads coordinates 0 and 1) -- confirm against the
// threader implementation.
static void ResNet50OneArrangeDats8(ResNet50ThreaderTeam1* team61, char** tensors95) {
	static const int extents[4] = {2, 4, 1, 1};
	ResNet50ThreaderTask1 job;
	job.nd1 = 4;
	for (int dim = 0; dim < 4; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	job.any1 = tensors95;
	job.callee1 = ResNet50OneArrangeDats8Callee1;
	ResNet50ThreaderDo1(team61, &job);
}

// Task body of the "apply" pass: multiplies arranged weights by arranged
// data, adds a second input tensor, clamps at zero, and stores the result.
//
// task100->any1 points to a pair whose element [0] is an array of byte
// pointers:
//   [0] arranged weights  (read)
//   [1] arranged data     (read)
//   [2] tensor added to each accumulated sum before the clamp (read)
//   [3] output tensor     (written)
// pt55[0] (w69) selects which groups of output rows this task computes;
// pt55[1] (d20) selects the 256-byte column block (j50) it works on.
//
// Arranged-weight layout as read here: each group k occupies 6168 bytes,
// starting with the values used to seed the accumulators (read with s = -1),
// followed by 256 steps of per-row scalars. Groups 0..169 carry 6 rows
// (24 bytes per step); group 170 carries 4 rows (16 bytes per step).
// NOTE(review): the seed values are presumably biases and the 256 steps
// presumably input channels of a 1x1 convolution -- confirm against the
// generator.
static void ResNet50OneApply8Callee1(ResNet50ThreaderTask1* task100, int64_t* pt55) {
void** pair26 = task100->any1;
char** tensors98 = pair26[0];
// e28 and g33 are fixed at zero here, so every base offset below is zero.
ptrdiff_t e28 = 0;
ptrdiff_t g33 = 0;
ptrdiff_t d20 = pt55[1];
ptrdiff_t w69 = pt55[0];
char*restrict arrangedWts8 = tensors98[0]+3424256*e28+(ptrdiff_t)1052672*1*g33;
char*restrict arrangedDats8 = tensors98[1]+694720*e28+(ptrdiff_t)212992*1*g33;
char*restrict datPtr30 = tensors98[2]+(ptrdiff_t)851968*1*g33;
char*restrict datPtr31 = tensors98[3]+(ptrdiff_t)851968*1*g33;
// Single-trip loop: i58 is always 0.
ptrdiff_t ii24 = 1;
for (ptrdiff_t i58 = 0; i58 < ii24; ++i58) {
ptrdiff_t j50 = 1*d20;
ptrdiff_t jj47 = j50+0;
// Column blocks other than 3 are full width (4 vectors of 16 floats).
// Because jj47 == j50 on entry, the "return" at the bottom of this loop body
// fires after exactly one column block. When j50 starts at 3 the loop is
// skipped and the narrow-column code after it runs instead.
for (; j50 != 3; ++j50) {
// Each task covers groups 2*w69 and 2*w69+1. For w69 == 84 the bound kk53 is
// 170, which is never reached inside the loop, so control falls through to
// the 4-row tail group after the loop.
ptrdiff_t k157 = 2*w69;
ptrdiff_t kk53 = k157+(w69 < 84 ? 1 : 2);
for (; k157 != 170; ++k157) {
// With s45 == -1 the expression 24*s45+24 is 0: these six broadcasts read
// the 6 seed floats at the start of group k157.
ptrdiff_t s45 = -1;
__m512 sum484 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)24));
__m512 sum488 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)28));
__m512 sum492 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)32));
__m512 sum496 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)36));
__m512 sum500 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)40));
__m512 sum504 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)44));
// 6 rows x 4 vectors = 24 accumulators; every vector of a row starts from
// that row's seed value.
__m512 sum485 = sum484;
__m512 sum486 = sum484;
__m512 sum487 = sum484;
__m512 sum489 = sum488;
__m512 sum490 = sum488;
__m512 sum491 = sum488;
__m512 sum493 = sum492;
__m512 sum494 = sum492;
__m512 sum495 = sum492;
__m512 sum497 = sum496;
__m512 sum498 = sum496;
__m512 sum499 = sum496;
__m512 sum501 = sum500;
__m512 sum502 = sum500;
__m512 sum503 = sum500;
__m512 sum505 = sum504;
__m512 sum506 = sum504;
__m512 sum507 = sum504;
// Reduction over 256 steps: load 4 data vectors once, then for each of the
// 6 rows broadcast one weight scalar and FMA it into that row's 4
// accumulators.
for (s45 = 0; s45 < 256; ++s45) {
__m512 dat2313 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+256*s45+(ptrdiff_t)0);
__m512 dat2314 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+256*s45+(ptrdiff_t)64);
__m512 dat2315 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+256*s45+(ptrdiff_t)128);
__m512 dat2316 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+256*s45+(ptrdiff_t)192);
__m512 wt601 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)24));
sum484 = _mm512_fmadd_ps(wt601, dat2313, sum484);
sum485 = _mm512_fmadd_ps(wt601, dat2314, sum485);
sum486 = _mm512_fmadd_ps(wt601, dat2315, sum486);
sum487 = _mm512_fmadd_ps(wt601, dat2316, sum487);
__m512 wt602 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)28));
sum488 = _mm512_fmadd_ps(wt602, dat2313, sum488);
sum489 = _mm512_fmadd_ps(wt602, dat2314, sum489);
sum490 = _mm512_fmadd_ps(wt602, dat2315, sum490);
sum491 = _mm512_fmadd_ps(wt602, dat2316, sum491);
__m512 wt603 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)32));
sum492 = _mm512_fmadd_ps(wt603, dat2313, sum492);
sum493 = _mm512_fmadd_ps(wt603, dat2314, sum493);
sum494 = _mm512_fmadd_ps(wt603, dat2315, sum494);
sum495 = _mm512_fmadd_ps(wt603, dat2316, sum495);
__m512 wt604 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)36));
sum496 = _mm512_fmadd_ps(wt604, dat2313, sum496);
sum497 = _mm512_fmadd_ps(wt604, dat2314, sum497);
sum498 = _mm512_fmadd_ps(wt604, dat2315, sum498);
sum499 = _mm512_fmadd_ps(wt604, dat2316, sum499);
__m512 wt605 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)40));
sum500 = _mm512_fmadd_ps(wt605, dat2313, sum500);
sum501 = _mm512_fmadd_ps(wt605, dat2314, sum501);
sum502 = _mm512_fmadd_ps(wt605, dat2315, sum502);
sum503 = _mm512_fmadd_ps(wt605, dat2316, sum503);
__m512 wt606 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)44));
sum504 = _mm512_fmadd_ps(wt606, dat2313, sum504);
sum505 = _mm512_fmadd_ps(wt606, dat2314, sum505);
sum506 = _mm512_fmadd_ps(wt606, dat2315, sum506);
sum507 = _mm512_fmadd_ps(wt606, dat2316, sum507);
}
// Epilogue, one row at a time: add the matching vectors from datPtr30,
// clamp at zero (max with 0), and store to datPtr31 at the same offset.
// Rows are 832 bytes apart; a group of 6 rows spans 4992 bytes.
// Row 0 of the group.
sum484 = _mm512_add_ps(sum484, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)0));
sum485 = _mm512_add_ps(sum485, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)64));
sum486 = _mm512_add_ps(sum486, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)128));
sum487 = _mm512_add_ps(sum487, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)192));
sum484 = _mm512_max_ps(_mm512_setzero_ps(), sum484);
sum485 = _mm512_max_ps(_mm512_setzero_ps(), sum485);
sum486 = _mm512_max_ps(_mm512_setzero_ps(), sum486);
sum487 = _mm512_max_ps(_mm512_setzero_ps(), sum487);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)0, 65535, sum484);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)64, 65535, sum485);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)128, 65535, sum486);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)192, 65535, sum487);
// Row 1 of the group.
sum488 = _mm512_add_ps(sum488, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)832));
sum489 = _mm512_add_ps(sum489, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)896));
sum490 = _mm512_add_ps(sum490, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)960));
sum491 = _mm512_add_ps(sum491, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1024));
sum488 = _mm512_max_ps(_mm512_setzero_ps(), sum488);
sum489 = _mm512_max_ps(_mm512_setzero_ps(), sum489);
sum490 = _mm512_max_ps(_mm512_setzero_ps(), sum490);
sum491 = _mm512_max_ps(_mm512_setzero_ps(), sum491);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)832, 65535, sum488);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)896, 65535, sum489);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)960, 65535, sum490);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1024, 65535, sum491);
// Row 2 of the group.
sum492 = _mm512_add_ps(sum492, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1664));
sum493 = _mm512_add_ps(sum493, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1728));
sum494 = _mm512_add_ps(sum494, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1792));
sum495 = _mm512_add_ps(sum495, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1856));
sum492 = _mm512_max_ps(_mm512_setzero_ps(), sum492);
sum493 = _mm512_max_ps(_mm512_setzero_ps(), sum493);
sum494 = _mm512_max_ps(_mm512_setzero_ps(), sum494);
sum495 = _mm512_max_ps(_mm512_setzero_ps(), sum495);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1664, 65535, sum492);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1728, 65535, sum493);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1792, 65535, sum494);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1856, 65535, sum495);
// Row 3 of the group.
sum496 = _mm512_add_ps(sum496, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)2496));
sum497 = _mm512_add_ps(sum497, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)2560));
sum498 = _mm512_add_ps(sum498, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)2624));
sum499 = _mm512_add_ps(sum499, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)2688));
sum496 = _mm512_max_ps(_mm512_setzero_ps(), sum496);
sum497 = _mm512_max_ps(_mm512_setzero_ps(), sum497);
sum498 = _mm512_max_ps(_mm512_setzero_ps(), sum498);
sum499 = _mm512_max_ps(_mm512_setzero_ps(), sum499);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)2496, 65535, sum496);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)2560, 65535, sum497);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)2624, 65535, sum498);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)2688, 65535, sum499);
// Row 4 of the group.
sum500 = _mm512_add_ps(sum500, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)3328));
sum501 = _mm512_add_ps(sum501, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)3392));
sum502 = _mm512_add_ps(sum502, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)3456));
sum503 = _mm512_add_ps(sum503, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)3520));
sum500 = _mm512_max_ps(_mm512_setzero_ps(), sum500);
sum501 = _mm512_max_ps(_mm512_setzero_ps(), sum501);
sum502 = _mm512_max_ps(_mm512_setzero_ps(), sum502);
sum503 = _mm512_max_ps(_mm512_setzero_ps(), sum503);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)3328, 65535, sum500);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)3392, 65535, sum501);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)3456, 65535, sum502);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)3520, 65535, sum503);
// Row 5 of the group.
sum504 = _mm512_add_ps(sum504, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)4160));
sum505 = _mm512_add_ps(sum505, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)4224));
sum506 = _mm512_add_ps(sum506, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)4288));
sum507 = _mm512_add_ps(sum507, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)4352));
sum504 = _mm512_max_ps(_mm512_setzero_ps(), sum504);
sum505 = _mm512_max_ps(_mm512_setzero_ps(), sum505);
sum506 = _mm512_max_ps(_mm512_setzero_ps(), sum506);
sum507 = _mm512_max_ps(_mm512_setzero_ps(), sum507);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)4160, 65535, sum504);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)4224, 65535, sum505);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)4288, 65535, sum506);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)4352, 65535, sum507);
// This task's share of groups is finished once k157 reaches kk53.
if (k157 >= kk53) return;
}
// Reached only with k157 == 170: the tail group, which has 4 rows instead of
// 6. Weight steps are therefore 16 bytes wide and 16*s46+16 is 0 for the
// seed read.
ptrdiff_t s46 = -1;
__m512 sum508 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+16*s46+(ptrdiff_t)16));
__m512 sum512 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+16*s46+(ptrdiff_t)20));
__m512 sum516 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+16*s46+(ptrdiff_t)24));
__m512 sum520 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+16*s46+(ptrdiff_t)28));
__m512 sum509 = sum508;
__m512 sum510 = sum508;
__m512 sum511 = sum508;
__m512 sum513 = sum512;
__m512 sum514 = sum512;
__m512 sum515 = sum512;
__m512 sum517 = sum516;
__m512 sum518 = sum516;
__m512 sum519 = sum516;
__m512 sum521 = sum520;
__m512 sum522 = sum520;
__m512 sum523 = sum520;
// Same reduction as above, with 4 rows x 4 vectors = 16 accumulators.
for (s46 = 0; s46 < 256; ++s46) {
__m512 dat2317 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+256*s46+(ptrdiff_t)0);
__m512 dat2318 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+256*s46+(ptrdiff_t)64);
__m512 dat2319 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+256*s46+(ptrdiff_t)128);
__m512 dat2320 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+256*s46+(ptrdiff_t)192);
__m512 wt607 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+16*s46+(ptrdiff_t)16));
sum508 = _mm512_fmadd_ps(wt607, dat2317, sum508);
sum509 = _mm512_fmadd_ps(wt607, dat2318, sum509);
sum510 = _mm512_fmadd_ps(wt607, dat2319, sum510);
sum511 = _mm512_fmadd_ps(wt607, dat2320, sum511);
__m512 wt608 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+16*s46+(ptrdiff_t)20));
sum512 = _mm512_fmadd_ps(wt608, dat2317, sum512);
sum513 = _mm512_fmadd_ps(wt608, dat2318, sum513);
sum514 = _mm512_fmadd_ps(wt608, dat2319, sum514);
sum515 = _mm512_fmadd_ps(wt608, dat2320, sum515);
__m512 wt609 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+16*s46+(ptrdiff_t)24));
sum516 = _mm512_fmadd_ps(wt609, dat2317, sum516);
sum517 = _mm512_fmadd_ps(wt609, dat2318, sum517);
sum518 = _mm512_fmadd_ps(wt609, dat2319, sum518);
sum519 = _mm512_fmadd_ps(wt609, dat2320, sum519);
__m512 wt610 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+16*s46+(ptrdiff_t)28));
sum520 = _mm512_fmadd_ps(wt610, dat2317, sum520);
sum521 = _mm512_fmadd_ps(wt610, dat2318, sum521);
sum522 = _mm512_fmadd_ps(wt610, dat2319, sum522);
sum523 = _mm512_fmadd_ps(wt610, dat2320, sum523);
}
// Epilogue for the 4 tail rows: add datPtr30, clamp at zero, store.
sum508 = _mm512_add_ps(sum508, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)0));
sum509 = _mm512_add_ps(sum509, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)64));
sum510 = _mm512_add_ps(sum510, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)128));
sum511 = _mm512_add_ps(sum511, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)192));
sum508 = _mm512_max_ps(_mm512_setzero_ps(), sum508);
sum509 = _mm512_max_ps(_mm512_setzero_ps(), sum509);
sum510 = _mm512_max_ps(_mm512_setzero_ps(), sum510);
sum511 = _mm512_max_ps(_mm512_setzero_ps(), sum511);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)0, 65535, sum508);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)64, 65535, sum509);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)128, 65535, sum510);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)192, 65535, sum511);
sum512 = _mm512_add_ps(sum512, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)832));
sum513 = _mm512_add_ps(sum513, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)896));
sum514 = _mm512_add_ps(sum514, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)960));
sum515 = _mm512_add_ps(sum515, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1024));
sum512 = _mm512_max_ps(_mm512_setzero_ps(), sum512);
sum513 = _mm512_max_ps(_mm512_setzero_ps(), sum513);
sum514 = _mm512_max_ps(_mm512_setzero_ps(), sum514);
sum515 = _mm512_max_ps(_mm512_setzero_ps(), sum515);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)832, 65535, sum512);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)896, 65535, sum513);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)960, 65535, sum514);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1024, 65535, sum515);
sum516 = _mm512_add_ps(sum516, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1664));
sum517 = _mm512_add_ps(sum517, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1728));
sum518 = _mm512_add_ps(sum518, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1792));
sum519 = _mm512_add_ps(sum519, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1856));
sum516 = _mm512_max_ps(_mm512_setzero_ps(), sum516);
sum517 = _mm512_max_ps(_mm512_setzero_ps(), sum517);
sum518 = _mm512_max_ps(_mm512_setzero_ps(), sum518);
sum519 = _mm512_max_ps(_mm512_setzero_ps(), sum519);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1664, 65535, sum516);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1728, 65535, sum517);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1792, 65535, sum518);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1856, 65535, sum519);
sum520 = _mm512_add_ps(sum520, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)2496));
sum521 = _mm512_add_ps(sum521, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)2560));
sum522 = _mm512_add_ps(sum522, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)2624));
sum523 = _mm512_add_ps(sum523, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)2688));
sum520 = _mm512_max_ps(_mm512_setzero_ps(), sum520);
sum521 = _mm512_max_ps(_mm512_setzero_ps(), sum521);
sum522 = _mm512_max_ps(_mm512_setzero_ps(), sum522);
sum523 = _mm512_max_ps(_mm512_setzero_ps(), sum523);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)2496, 65535, sum520);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)2560, 65535, sum521);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)2624, 65535, sum522);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)2688, 65535, sum523);
// jj47 equals the starting j50, so this returns after the first column
// block handled by this task.
if (j50 >= jj47) return;
}
// Reached only with j50 == 3: the narrow column block. One data vector per
// step (arranged rows are 64 bytes apart here), and only the low 4 lanes
// (mask 15) of each result are combined with datPtr30 and stored.
ptrdiff_t k158 = 2*w69;
ptrdiff_t kk54 = k158+(w69 < 84 ? 1 : 2);
for (; k158 != 170; ++k158) {
// Seed the 6 row accumulators from the start of group k158 (offset 0).
ptrdiff_t s47 = -1;
__m512 sum524 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)24));
__m512 sum525 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)28));
__m512 sum526 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)32));
__m512 sum527 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)36));
__m512 sum528 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)40));
__m512 sum529 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)44));
for (s47 = 0; s47 < 256; ++s47) {
__m512 dat2321 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+64*s47+(ptrdiff_t)0);
__m512 wt611 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)24));
sum524 = _mm512_fmadd_ps(wt611, dat2321, sum524);
__m512 wt612 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)28));
sum525 = _mm512_fmadd_ps(wt612, dat2321, sum525);
__m512 wt613 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)32));
sum526 = _mm512_fmadd_ps(wt613, dat2321, sum526);
__m512 wt614 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)36));
sum527 = _mm512_fmadd_ps(wt614, dat2321, sum527);
__m512 wt615 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)40));
sum528 = _mm512_fmadd_ps(wt615, dat2321, sum528);
__m512 wt616 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)44));
sum529 = _mm512_fmadd_ps(wt616, dat2321, sum529);
}
// Per row: add datPtr30 (4 lanes), clamp at zero, store 4 lanes.
sum524 = _mm512_add_ps(sum524, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)0));
sum524 = _mm512_max_ps(_mm512_setzero_ps(), sum524);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)0, 15, sum524);
sum525 = _mm512_add_ps(sum525, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)832));
sum525 = _mm512_max_ps(_mm512_setzero_ps(), sum525);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)832, 15, sum525);
sum526 = _mm512_add_ps(sum526, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)1664));
sum526 = _mm512_max_ps(_mm512_setzero_ps(), sum526);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)1664, 15, sum526);
sum527 = _mm512_add_ps(sum527, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)2496));
sum527 = _mm512_max_ps(_mm512_setzero_ps(), sum527);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)2496, 15, sum527);
sum528 = _mm512_add_ps(sum528, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)3328));
sum528 = _mm512_max_ps(_mm512_setzero_ps(), sum528);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)3328, 15, sum528);
sum529 = _mm512_add_ps(sum529, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)4160));
sum529 = _mm512_max_ps(_mm512_setzero_ps(), sum529);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)4160, 15, sum529);
// This task's share of groups is finished once k158 reaches kk54.
if (k158 >= kk54) return;
}
// Reached only with k158 == 170 and j50 == 3: the 4-row tail group on the
// narrow column block.
ptrdiff_t s48 = -1;
__m512 sum530 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+16*s48+(ptrdiff_t)16));
__m512 sum531 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+16*s48+(ptrdiff_t)20));
__m512 sum532 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+16*s48+(ptrdiff_t)24));
__m512 sum533 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+16*s48+(ptrdiff_t)28));
for (s48 = 0; s48 < 256; ++s48) {
__m512 dat2322 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+64*s48+(ptrdiff_t)0);
__m512 wt617 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+16*s48+(ptrdiff_t)16));
sum530 = _mm512_fmadd_ps(wt617, dat2322, sum530);
__m512 wt618 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+16*s48+(ptrdiff_t)20));
sum531 = _mm512_fmadd_ps(wt618, dat2322, sum531);
__m512 wt619 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+16*s48+(ptrdiff_t)24));
sum532 = _mm512_fmadd_ps(wt619, dat2322, sum532);
__m512 wt620 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+16*s48+(ptrdiff_t)28));
sum533 = _mm512_fmadd_ps(wt620, dat2322, sum533);
}
sum530 = _mm512_add_ps(sum530, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)0));
sum530 = _mm512_max_ps(_mm512_setzero_ps(), sum530);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)0, 15, sum530);
sum531 = _mm512_add_ps(sum531, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)832));
sum531 = _mm512_max_ps(_mm512_setzero_ps(), sum531);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)832, 15, sum531);
sum532 = _mm512_add_ps(sum532, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)1664));
sum532 = _mm512_max_ps(_mm512_setzero_ps(), sum532);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)1664, 15, sum532);
sum533 = _mm512_add_ps(sum533, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)2496));
sum533 = _mm512_max_ps(_mm512_setzero_ps(), sum533);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)2496, 15, sum533);
}
}

// Dispatches ResNet50OneApply8Callee1 through the threader team.
//
// team62    - threader team handed unchanged to ResNet50ThreaderDo1.
// tensors97 - tensor pointer table; it is placed in slot 0 of a two-entry
//             argument array (slot 1 is null) that the callee receives via
//             the task's any1 field.
//
// The task is described as a 3-dimensional hull of 85 x 4 x 1.
// NOTE(review): the argument array and the task live on this stack frame,
// so ResNet50ThreaderDo1 presumably does not return until the callee is
// finished with them -- confirm against the threader implementation.
static void ResNet50OneApply8(ResNet50ThreaderTeam1* team62, char** tensors97) {
    enum { kHullDims = 3 };
    static const int kHullExtent[kHullDims] = {85, 4, 1};

    void* calleeArgs[] = {tensors97, 0};

    ResNet50ThreaderTask1 job;
    job.nd1 = kHullDims;
    for (int dim = 0; dim < kHullDims; ++dim) {
        job.hull1[dim] = kHullExtent[dim];
    }
    job.any1 = calleeArgs;
    job.callee1 = ResNet50OneApply8Callee1;

    ResNet50ThreaderDo1(team62, &job);
}

static void ResNet50OneArrangeWts9Callee1(ResNet50ThreaderTask1* task102, int64_t* pt56) {
char** tensors100 = task102->any1;
ptrdiff_t b67 = pt56[0];
char*restrict wtPtr18 = tensors100[0]+(ptrdiff_t)3340*0+(ptrdiff_t)1048576*0;
char*restrict biasPtr18 = tensors100[1]+(ptrdiff_t)1024*0;
char*restrict bnPtr18 = tensors100[2]+(ptrdiff_t)8*256*0;
char*restrict arranged17 = tensors100[3]+(ptrdiff_t)856064*0+(ptrdiff_t)1049600*0;
ptrdiff_t ii25 = 1;
for (ptrdiff_t i59 = 0; i59 < ii25; ++i59) {
ptrdiff_t j51 = 1*b67;
ptrdiff_t jj48 = j51+1;
for (; j51 < jj48; ++j51) {
if (j51 < 15) {
ptrdiff_t k160 = 0+16*(j51-0);
ptrdiff_t l69 = (size_t)(0+k160)/6;
ptrdiff_t cut26 = (size_t)(0+k160)%6;
switch (cut26) {
case 0:;
case 2: {
__m512 sum535 = _mm512_maskz_loadu_ps(65535, biasPtr18+1024*i59+4*k160);
__m512i pmMul37 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd37 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo31 = _mm512_loadu_ps(bnPtr18+(ptrdiff_t)8*(k160+256*i59));
__m512 masHi31 = _mm512_maskz_loadu_ps(65535, bnPtr18+(ptrdiff_t)8*(k160+256*i59)+(ptrdiff_t)64);
__m512 postMul60 = _mm512_permutex2var_ps(masLo31, pmMul37, masHi31);
__m512 postAdd38 = _mm512_permutex2var_ps(masLo31, pmAdd37, masHi31);
sum535 = _mm512_fmadd_ps(sum535, postMul60, postAdd38);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*0+(ptrdiff_t)0, 63>>cut26, sum535);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*0+(ptrdiff_t)24576, 4032>>cut26, sum535);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*0+(ptrdiff_t)49152, 65535-(4095>>cut26), sum535);
ptrdiff_t c49 = 0;
for (; c49 != 64; ++c49) {
__m512 wt637 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)0);
__m512 wt638 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)4096);
__m512 wt639 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)8192);
__m512 wt640 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)12288);
__m512 wt641 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)16384);
__m512 wt642 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)20480);
__m512 wt643 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)24576);
__m512 wt644 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)28672);
__m512 wt645 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)32768);
__m512 wt646 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)36864);
__m512 wt647 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)40960);
__m512 wt648 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)45056);
__m512 wt649 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)49152);
__m512 wt650 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)53248);
__m512 wt651 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)57344);
__m512 wt652 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)61440);
__m512 tmp17917 = _mm512_unpacklo_ps(wt637, wt638);
__m512 tmp17918 = _mm512_unpackhi_ps(wt637, wt638);
__m512 tmp17919 = _mm512_unpacklo_ps(wt639, wt640);
__m512 tmp17920 = _mm512_unpackhi_ps(wt639, wt640);
__m512 tmp17921 = _mm512_unpacklo_ps(wt641, wt642);
__m512 tmp17922 = _mm512_unpackhi_ps(wt641, wt642);
__m512 tmp17923 = _mm512_unpacklo_ps(wt643, wt644);
__m512 tmp17924 = _mm512_unpackhi_ps(wt643, wt644);
__m512 tmp17925 = _mm512_unpacklo_ps(wt645, wt646);
__m512 tmp17926 = _mm512_unpackhi_ps(wt645, wt646);
__m512 tmp17927 = _mm512_unpacklo_ps(wt647, wt648);
__m512 tmp17928 = _mm512_unpackhi_ps(wt647, wt648);
__m512 tmp17929 = _mm512_unpacklo_ps(wt649, wt650);
__m512 tmp17930 = _mm512_unpackhi_ps(wt649, wt650);
__m512 tmp17931 = _mm512_unpacklo_ps(wt651, wt652);
__m512 tmp17932 = _mm512_unpackhi_ps(wt651, wt652);
__m512 tmp17933 = _mm512_shuffle_ps(tmp17917, tmp17919, 68);
__m512 tmp17934 = _mm512_shuffle_ps(tmp17917, tmp17919, 238);
__m512 tmp17935 = _mm512_shuffle_ps(tmp17918, tmp17920, 68);
__m512 tmp17936 = _mm512_shuffle_ps(tmp17918, tmp17920, 238);
__m512 tmp17937 = _mm512_shuffle_ps(tmp17921, tmp17923, 68);
__m512 tmp17938 = _mm512_shuffle_ps(tmp17921, tmp17923, 238);
__m512 tmp17939 = _mm512_shuffle_ps(tmp17922, tmp17924, 68);
__m512 tmp17940 = _mm512_shuffle_ps(tmp17922, tmp17924, 238);
__m512 tmp17941 = _mm512_shuffle_ps(tmp17925, tmp17927, 68);
__m512 tmp17942 = _mm512_shuffle_ps(tmp17925, tmp17927, 238);
__m512 tmp17943 = _mm512_shuffle_ps(tmp17926, tmp17928, 68);
__m512 tmp17944 = _mm512_shuffle_ps(tmp17926, tmp17928, 238);
__m512 tmp17945 = _mm512_shuffle_ps(tmp17929, tmp17931, 68);
__m512 tmp17946 = _mm512_shuffle_ps(tmp17929, tmp17931, 238);
__m512 tmp17947 = _mm512_shuffle_ps(tmp17930, tmp17932, 68);
__m512 tmp17948 = _mm512_shuffle_ps(tmp17930, tmp17932, 238);
__m512 tmp17949 = _mm512_shuffle_f32x4(tmp17933, tmp17937, 136);
__m512 tmp17950 = _mm512_shuffle_f32x4(tmp17933, tmp17937, 221);
__m512 tmp17951 = _mm512_shuffle_f32x4(tmp17934, tmp17938, 136);
__m512 tmp17952 = _mm512_shuffle_f32x4(tmp17934, tmp17938, 221);
__m512 tmp17953 = _mm512_shuffle_f32x4(tmp17935, tmp17939, 136);
__m512 tmp17954 = _mm512_shuffle_f32x4(tmp17935, tmp17939, 221);
__m512 tmp17955 = _mm512_shuffle_f32x4(tmp17936, tmp17940, 136);
__m512 tmp17956 = _mm512_shuffle_f32x4(tmp17936, tmp17940, 221);
__m512 tmp17957 = _mm512_shuffle_f32x4(tmp17941, tmp17945, 136);
__m512 tmp17958 = _mm512_shuffle_f32x4(tmp17941, tmp17945, 221);
__m512 tmp17959 = _mm512_shuffle_f32x4(tmp17942, tmp17946, 136);
__m512 tmp17960 = _mm512_shuffle_f32x4(tmp17942, tmp17946, 221);
__m512 tmp17961 = _mm512_shuffle_f32x4(tmp17943, tmp17947, 136);
__m512 tmp17962 = _mm512_shuffle_f32x4(tmp17943, tmp17947, 221);
__m512 tmp17963 = _mm512_shuffle_f32x4(tmp17944, tmp17948, 136);
__m512 tmp17964 = _mm512_shuffle_f32x4(tmp17944, tmp17948, 221);
wt637 = _mm512_shuffle_f32x4(tmp17949, tmp17957, 136);
wt645 = _mm512_shuffle_f32x4(tmp17949, tmp17957, 221);
wt638 = _mm512_shuffle_f32x4(tmp17951, tmp17959, 136);
wt646 = _mm512_shuffle_f32x4(tmp17951, tmp17959, 221);
wt639 = _mm512_shuffle_f32x4(tmp17953, tmp17961, 136);
wt647 = _mm512_shuffle_f32x4(tmp17953, tmp17961, 221);
wt640 = _mm512_shuffle_f32x4(tmp17955, tmp17963, 136);
wt648 = _mm512_shuffle_f32x4(tmp17955, tmp17963, 221);
wt641 = _mm512_shuffle_f32x4(tmp17950, tmp17958, 136);
wt649 = _mm512_shuffle_f32x4(tmp17950, tmp17958, 221);
wt642 = _mm512_shuffle_f32x4(tmp17952, tmp17960, 136);
wt650 = _mm512_shuffle_f32x4(tmp17952, tmp17960, 221);
wt643 = _mm512_shuffle_f32x4(tmp17954, tmp17962, 136);
wt651 = _mm512_shuffle_f32x4(tmp17954, tmp17962, 221);
wt644 = _mm512_shuffle_f32x4(tmp17956, tmp17964, 136);
wt652 = _mm512_shuffle_f32x4(tmp17956, tmp17964, 221);
wt637 = _mm512_mul_ps(wt637, postMul60);
wt638 = _mm512_mul_ps(wt638, postMul60);
wt639 = _mm512_mul_ps(wt639, postMul60);
wt640 = _mm512_mul_ps(wt640, postMul60);
wt641 = _mm512_mul_ps(wt641, postMul60);
wt642 = _mm512_mul_ps(wt642, postMul60);
wt643 = _mm512_mul_ps(wt643, postMul60);
wt644 = _mm512_mul_ps(wt644, postMul60);
wt645 = _mm512_mul_ps(wt645, postMul60);
wt646 = _mm512_mul_ps(wt646, postMul60);
wt647 = _mm512_mul_ps(wt647, postMul60);
wt648 = _mm512_mul_ps(wt648, postMul60);
wt649 = _mm512_mul_ps(wt649, postMul60);
wt650 = _mm512_mul_ps(wt650, postMul60);
wt651 = _mm512_mul_ps(wt651, postMul60);
wt652 = _mm512_mul_ps(wt652, postMul60);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(1+16*c49)+(ptrdiff_t)0, 63>>cut26, wt637);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(2+16*c49)+(ptrdiff_t)0, 63>>cut26, wt638);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(3+16*c49)+(ptrdiff_t)0, 63>>cut26, wt639);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(4+16*c49)+(ptrdiff_t)0, 63>>cut26, wt640);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(5+16*c49)+(ptrdiff_t)0, 63>>cut26, wt641);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(6+16*c49)+(ptrdiff_t)0, 63>>cut26, wt642);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(7+16*c49)+(ptrdiff_t)0, 63>>cut26, wt643);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(8+16*c49)+(ptrdiff_t)0, 63>>cut26, wt644);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(9+16*c49)+(ptrdiff_t)0, 63>>cut26, wt645);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(10+16*c49)+(ptrdiff_t)0, 63>>cut26, wt646);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(11+16*c49)+(ptrdiff_t)0, 63>>cut26, wt647);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(12+16*c49)+(ptrdiff_t)0, 63>>cut26, wt648);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(13+16*c49)+(ptrdiff_t)0, 63>>cut26, wt649);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(14+16*c49)+(ptrdiff_t)0, 63>>cut26, wt650);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(15+16*c49)+(ptrdiff_t)0, 63>>cut26, wt651);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(16+16*c49)+(ptrdiff_t)0, 63>>cut26, wt652);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(1+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt637);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(2+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt638);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(3+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt639);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(4+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt640);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(5+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt641);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(6+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt642);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(7+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt643);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(8+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt644);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(9+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt645);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(10+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt646);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(11+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt647);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(12+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt648);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(13+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt649);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(14+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt650);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(15+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt651);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(16+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt652);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(1+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt637);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(2+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt638);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(3+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt639);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(4+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt640);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(5+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt641);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(6+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt642);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(7+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt643);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(8+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt644);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(9+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt645);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(10+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt646);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(11+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt647);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(12+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt648);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(13+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt649);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(14+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt650);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(15+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt651);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(16+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt652);
}
break;
}
default: {
cut26 = 4;
__m512 sum536 = _mm512_maskz_loadu_ps(65535, biasPtr18+1024*i59+4*k160);
__m512i pmMul38 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd38 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo32 = _mm512_loadu_ps(bnPtr18+(ptrdiff_t)8*(k160+256*i59));
__m512 masHi32 = _mm512_maskz_loadu_ps(65535, bnPtr18+(ptrdiff_t)8*(k160+256*i59)+(ptrdiff_t)64);
__m512 postMul61 = _mm512_permutex2var_ps(masLo32, pmMul38, masHi32);
__m512 postAdd39 = _mm512_permutex2var_ps(masLo32, pmAdd38, masHi32);
sum536 = _mm512_fmadd_ps(sum536, postMul61, postAdd39);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*0+(ptrdiff_t)0, 63>>cut26, sum536);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*0+(ptrdiff_t)24576, 4032>>cut26, sum536);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*0+(ptrdiff_t)49152, 258048>>cut26, sum536);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*0+(ptrdiff_t)73728, 65535-(262143>>cut26), sum536);
ptrdiff_t c50 = 0;
for (; c50 != 64; ++c50) {
__m512 wt653 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)0);
__m512 wt654 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)4096);
__m512 wt655 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)8192);
__m512 wt656 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)12288);
__m512 wt657 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)16384);
__m512 wt658 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)20480);
__m512 wt659 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)24576);
__m512 wt660 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)28672);
__m512 wt661 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)32768);
__m512 wt662 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)36864);
__m512 wt663 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)40960);
__m512 wt664 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)45056);
__m512 wt665 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)49152);
__m512 wt666 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)53248);
__m512 wt667 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)57344);
__m512 wt668 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)61440);
__m512 tmp17965 = _mm512_unpacklo_ps(wt653, wt654);
__m512 tmp17966 = _mm512_unpackhi_ps(wt653, wt654);
__m512 tmp17967 = _mm512_unpacklo_ps(wt655, wt656);
__m512 tmp17968 = _mm512_unpackhi_ps(wt655, wt656);
__m512 tmp17969 = _mm512_unpacklo_ps(wt657, wt658);
__m512 tmp17970 = _mm512_unpackhi_ps(wt657, wt658);
__m512 tmp17971 = _mm512_unpacklo_ps(wt659, wt660);
__m512 tmp17972 = _mm512_unpackhi_ps(wt659, wt660);
__m512 tmp17973 = _mm512_unpacklo_ps(wt661, wt662);
__m512 tmp17974 = _mm512_unpackhi_ps(wt661, wt662);
__m512 tmp17975 = _mm512_unpacklo_ps(wt663, wt664);
__m512 tmp17976 = _mm512_unpackhi_ps(wt663, wt664);
__m512 tmp17977 = _mm512_unpacklo_ps(wt665, wt666);
__m512 tmp17978 = _mm512_unpackhi_ps(wt665, wt666);
__m512 tmp17979 = _mm512_unpacklo_ps(wt667, wt668);
__m512 tmp17980 = _mm512_unpackhi_ps(wt667, wt668);
__m512 tmp17981 = _mm512_shuffle_ps(tmp17965, tmp17967, 68);
__m512 tmp17982 = _mm512_shuffle_ps(tmp17965, tmp17967, 238);
__m512 tmp17983 = _mm512_shuffle_ps(tmp17966, tmp17968, 68);
__m512 tmp17984 = _mm512_shuffle_ps(tmp17966, tmp17968, 238);
__m512 tmp17985 = _mm512_shuffle_ps(tmp17969, tmp17971, 68);
__m512 tmp17986 = _mm512_shuffle_ps(tmp17969, tmp17971, 238);
__m512 tmp17987 = _mm512_shuffle_ps(tmp17970, tmp17972, 68);
__m512 tmp17988 = _mm512_shuffle_ps(tmp17970, tmp17972, 238);
__m512 tmp17989 = _mm512_shuffle_ps(tmp17973, tmp17975, 68);
__m512 tmp17990 = _mm512_shuffle_ps(tmp17973, tmp17975, 238);
__m512 tmp17991 = _mm512_shuffle_ps(tmp17974, tmp17976, 68);
__m512 tmp17992 = _mm512_shuffle_ps(tmp17974, tmp17976, 238);
__m512 tmp17993 = _mm512_shuffle_ps(tmp17977, tmp17979, 68);
__m512 tmp17994 = _mm512_shuffle_ps(tmp17977, tmp17979, 238);
__m512 tmp17995 = _mm512_shuffle_ps(tmp17978, tmp17980, 68);
__m512 tmp17996 = _mm512_shuffle_ps(tmp17978, tmp17980, 238);
__m512 tmp17997 = _mm512_shuffle_f32x4(tmp17981, tmp17985, 136);
__m512 tmp17998 = _mm512_shuffle_f32x4(tmp17981, tmp17985, 221);
__m512 tmp17999 = _mm512_shuffle_f32x4(tmp17982, tmp17986, 136);
__m512 tmp18000 = _mm512_shuffle_f32x4(tmp17982, tmp17986, 221);
__m512 tmp18001 = _mm512_shuffle_f32x4(tmp17983, tmp17987, 136);
__m512 tmp18002 = _mm512_shuffle_f32x4(tmp17983, tmp17987, 221);
__m512 tmp18003 = _mm512_shuffle_f32x4(tmp17984, tmp17988, 136);
__m512 tmp18004 = _mm512_shuffle_f32x4(tmp17984, tmp17988, 221);
__m512 tmp18005 = _mm512_shuffle_f32x4(tmp17989, tmp17993, 136);
__m512 tmp18006 = _mm512_shuffle_f32x4(tmp17989, tmp17993, 221);
__m512 tmp18007 = _mm512_shuffle_f32x4(tmp17990, tmp17994, 136);
__m512 tmp18008 = _mm512_shuffle_f32x4(tmp17990, tmp17994, 221);
__m512 tmp18009 = _mm512_shuffle_f32x4(tmp17991, tmp17995, 136);
__m512 tmp18010 = _mm512_shuffle_f32x4(tmp17991, tmp17995, 221);
__m512 tmp18011 = _mm512_shuffle_f32x4(tmp17992, tmp17996, 136);
__m512 tmp18012 = _mm512_shuffle_f32x4(tmp17992, tmp17996, 221);
wt653 = _mm512_shuffle_f32x4(tmp17997, tmp18005, 136);
wt661 = _mm512_shuffle_f32x4(tmp17997, tmp18005, 221);
wt654 = _mm512_shuffle_f32x4(tmp17999, tmp18007, 136);
wt662 = _mm512_shuffle_f32x4(tmp17999, tmp18007, 221);
wt655 = _mm512_shuffle_f32x4(tmp18001, tmp18009, 136);
wt663 = _mm512_shuffle_f32x4(tmp18001, tmp18009, 221);
wt656 = _mm512_shuffle_f32x4(tmp18003, tmp18011, 136);
wt664 = _mm512_shuffle_f32x4(tmp18003, tmp18011, 221);
wt657 = _mm512_shuffle_f32x4(tmp17998, tmp18006, 136);
wt665 = _mm512_shuffle_f32x4(tmp17998, tmp18006, 221);
wt658 = _mm512_shuffle_f32x4(tmp18000, tmp18008, 136);
wt666 = _mm512_shuffle_f32x4(tmp18000, tmp18008, 221);
wt659 = _mm512_shuffle_f32x4(tmp18002, tmp18010, 136);
wt667 = _mm512_shuffle_f32x4(tmp18002, tmp18010, 221);
wt660 = _mm512_shuffle_f32x4(tmp18004, tmp18012, 136);
wt668 = _mm512_shuffle_f32x4(tmp18004, tmp18012, 221);
wt653 = _mm512_mul_ps(wt653, postMul61);
wt654 = _mm512_mul_ps(wt654, postMul61);
wt655 = _mm512_mul_ps(wt655, postMul61);
wt656 = _mm512_mul_ps(wt656, postMul61);
wt657 = _mm512_mul_ps(wt657, postMul61);
wt658 = _mm512_mul_ps(wt658, postMul61);
wt659 = _mm512_mul_ps(wt659, postMul61);
wt660 = _mm512_mul_ps(wt660, postMul61);
wt661 = _mm512_mul_ps(wt661, postMul61);
wt662 = _mm512_mul_ps(wt662, postMul61);
wt663 = _mm512_mul_ps(wt663, postMul61);
wt664 = _mm512_mul_ps(wt664, postMul61);
wt665 = _mm512_mul_ps(wt665, postMul61);
wt666 = _mm512_mul_ps(wt666, postMul61);
wt667 = _mm512_mul_ps(wt667, postMul61);
wt668 = _mm512_mul_ps(wt668, postMul61);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(1+16*c50)+(ptrdiff_t)0, 63>>cut26, wt653);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(2+16*c50)+(ptrdiff_t)0, 63>>cut26, wt654);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(3+16*c50)+(ptrdiff_t)0, 63>>cut26, wt655);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(4+16*c50)+(ptrdiff_t)0, 63>>cut26, wt656);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(5+16*c50)+(ptrdiff_t)0, 63>>cut26, wt657);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(6+16*c50)+(ptrdiff_t)0, 63>>cut26, wt658);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(7+16*c50)+(ptrdiff_t)0, 63>>cut26, wt659);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(8+16*c50)+(ptrdiff_t)0, 63>>cut26, wt660);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(9+16*c50)+(ptrdiff_t)0, 63>>cut26, wt661);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(10+16*c50)+(ptrdiff_t)0, 63>>cut26, wt662);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(11+16*c50)+(ptrdiff_t)0, 63>>cut26, wt663);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(12+16*c50)+(ptrdiff_t)0, 63>>cut26, wt664);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(13+16*c50)+(ptrdiff_t)0, 63>>cut26, wt665);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(14+16*c50)+(ptrdiff_t)0, 63>>cut26, wt666);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(15+16*c50)+(ptrdiff_t)0, 63>>cut26, wt667);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(16+16*c50)+(ptrdiff_t)0, 63>>cut26, wt668);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(1+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt653);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(2+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt654);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(3+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt655);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(4+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt656);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(5+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt657);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(6+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt658);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(7+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt659);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(8+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt660);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(9+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt661);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(10+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt662);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(11+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt663);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(12+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt664);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(13+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt665);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(14+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt666);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(15+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt667);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(16+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt668);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(1+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt653);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(2+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt654);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(3+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt655);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(4+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt656);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(5+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt657);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(6+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt658);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(7+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt659);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(8+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt660);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(9+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt661);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(10+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt662);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(11+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt663);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(12+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt664);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(13+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt665);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(14+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt666);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(15+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt667);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(16+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt668);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(1+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt653);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(2+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt654);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(3+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt655);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(4+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt656);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(5+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt657);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(6+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt658);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(7+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt659);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(8+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt660);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(9+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt661);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(10+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt662);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(11+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt663);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(12+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt664);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(13+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt665);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(14+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt666);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(15+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt667);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(16+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt668);
}
}
}
} else {
ptrdiff_t k159 = 240;
ptrdiff_t l68 = (size_t)(0+k159)/6;
ptrdiff_t cut25 = (size_t)(0+k159)%6;
__m512 sum534 = _mm512_maskz_loadu_ps(65535, biasPtr18+1024*i59+4*k159);
__m512i pmMul39 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd39 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo33 = _mm512_loadu_ps(bnPtr18+(ptrdiff_t)8*(k159+256*i59));
__m512 masHi33 = _mm512_maskz_loadu_ps(65535, bnPtr18+(ptrdiff_t)8*(k159+256*i59)+(ptrdiff_t)64);
__m512 postMul59 = _mm512_permutex2var_ps(masLo33, pmMul39, masHi33);
__m512 postAdd37 = _mm512_permutex2var_ps(masLo33, pmAdd39, masHi33);
sum534 = _mm512_fmadd_ps(sum534, postMul59, postAdd37);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*0+(ptrdiff_t)0, 63>>cut25, sum534);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*0+(ptrdiff_t)24576, 4032>>cut25, sum534);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*0+(ptrdiff_t)49152, 65535-(4095>>cut25), sum534);
ptrdiff_t c48 = 0;
for (; c48 != 64; ++c48) {
__m512 wt621 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)0);
__m512 wt622 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)4096);
__m512 wt623 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)8192);
__m512 wt624 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)12288);
__m512 wt625 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)16384);
__m512 wt626 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)20480);
__m512 wt627 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)24576);
__m512 wt628 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)28672);
__m512 wt629 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)32768);
__m512 wt630 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)36864);
__m512 wt631 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)40960);
__m512 wt632 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)45056);
__m512 wt633 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)49152);
__m512 wt634 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)53248);
__m512 wt635 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)57344);
__m512 wt636 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)61440);
__m512 tmp18013 = _mm512_unpacklo_ps(wt621, wt622);
__m512 tmp18014 = _mm512_unpackhi_ps(wt621, wt622);
__m512 tmp18015 = _mm512_unpacklo_ps(wt623, wt624);
__m512 tmp18016 = _mm512_unpackhi_ps(wt623, wt624);
__m512 tmp18017 = _mm512_unpacklo_ps(wt625, wt626);
__m512 tmp18018 = _mm512_unpackhi_ps(wt625, wt626);
__m512 tmp18019 = _mm512_unpacklo_ps(wt627, wt628);
__m512 tmp18020 = _mm512_unpackhi_ps(wt627, wt628);
__m512 tmp18021 = _mm512_unpacklo_ps(wt629, wt630);
__m512 tmp18022 = _mm512_unpackhi_ps(wt629, wt630);
__m512 tmp18023 = _mm512_unpacklo_ps(wt631, wt632);
__m512 tmp18024 = _mm512_unpackhi_ps(wt631, wt632);
__m512 tmp18025 = _mm512_unpacklo_ps(wt633, wt634);
__m512 tmp18026 = _mm512_unpackhi_ps(wt633, wt634);
__m512 tmp18027 = _mm512_unpacklo_ps(wt635, wt636);
__m512 tmp18028 = _mm512_unpackhi_ps(wt635, wt636);
__m512 tmp18029 = _mm512_shuffle_ps(tmp18013, tmp18015, 68);
__m512 tmp18030 = _mm512_shuffle_ps(tmp18013, tmp18015, 238);
__m512 tmp18031 = _mm512_shuffle_ps(tmp18014, tmp18016, 68);
__m512 tmp18032 = _mm512_shuffle_ps(tmp18014, tmp18016, 238);
__m512 tmp18033 = _mm512_shuffle_ps(tmp18017, tmp18019, 68);
__m512 tmp18034 = _mm512_shuffle_ps(tmp18017, tmp18019, 238);
__m512 tmp18035 = _mm512_shuffle_ps(tmp18018, tmp18020, 68);
__m512 tmp18036 = _mm512_shuffle_ps(tmp18018, tmp18020, 238);
__m512 tmp18037 = _mm512_shuffle_ps(tmp18021, tmp18023, 68);
__m512 tmp18038 = _mm512_shuffle_ps(tmp18021, tmp18023, 238);
__m512 tmp18039 = _mm512_shuffle_ps(tmp18022, tmp18024, 68);
__m512 tmp18040 = _mm512_shuffle_ps(tmp18022, tmp18024, 238);
__m512 tmp18041 = _mm512_shuffle_ps(tmp18025, tmp18027, 68);
__m512 tmp18042 = _mm512_shuffle_ps(tmp18025, tmp18027, 238);
__m512 tmp18043 = _mm512_shuffle_ps(tmp18026, tmp18028, 68);
__m512 tmp18044 = _mm512_shuffle_ps(tmp18026, tmp18028, 238);
__m512 tmp18045 = _mm512_shuffle_f32x4(tmp18029, tmp18033, 136);
__m512 tmp18046 = _mm512_shuffle_f32x4(tmp18029, tmp18033, 221);
__m512 tmp18047 = _mm512_shuffle_f32x4(tmp18030, tmp18034, 136);
__m512 tmp18048 = _mm512_shuffle_f32x4(tmp18030, tmp18034, 221);
__m512 tmp18049 = _mm512_shuffle_f32x4(tmp18031, tmp18035, 136);
__m512 tmp18050 = _mm512_shuffle_f32x4(tmp18031, tmp18035, 221);
__m512 tmp18051 = _mm512_shuffle_f32x4(tmp18032, tmp18036, 136);
__m512 tmp18052 = _mm512_shuffle_f32x4(tmp18032, tmp18036, 221);
__m512 tmp18053 = _mm512_shuffle_f32x4(tmp18037, tmp18041, 136);
__m512 tmp18054 = _mm512_shuffle_f32x4(tmp18037, tmp18041, 221);
__m512 tmp18055 = _mm512_shuffle_f32x4(tmp18038, tmp18042, 136);
__m512 tmp18056 = _mm512_shuffle_f32x4(tmp18038, tmp18042, 221);
__m512 tmp18057 = _mm512_shuffle_f32x4(tmp18039, tmp18043, 136);
__m512 tmp18058 = _mm512_shuffle_f32x4(tmp18039, tmp18043, 221);
__m512 tmp18059 = _mm512_shuffle_f32x4(tmp18040, tmp18044, 136);
__m512 tmp18060 = _mm512_shuffle_f32x4(tmp18040, tmp18044, 221);
wt621 = _mm512_shuffle_f32x4(tmp18045, tmp18053, 136);
wt629 = _mm512_shuffle_f32x4(tmp18045, tmp18053, 221);
wt622 = _mm512_shuffle_f32x4(tmp18047, tmp18055, 136);
wt630 = _mm512_shuffle_f32x4(tmp18047, tmp18055, 221);
wt623 = _mm512_shuffle_f32x4(tmp18049, tmp18057, 136);
wt631 = _mm512_shuffle_f32x4(tmp18049, tmp18057, 221);
wt624 = _mm512_shuffle_f32x4(tmp18051, tmp18059, 136);
wt632 = _mm512_shuffle_f32x4(tmp18051, tmp18059, 221);
wt625 = _mm512_shuffle_f32x4(tmp18046, tmp18054, 136);
wt633 = _mm512_shuffle_f32x4(tmp18046, tmp18054, 221);
wt626 = _mm512_shuffle_f32x4(tmp18048, tmp18056, 136);
wt634 = _mm512_shuffle_f32x4(tmp18048, tmp18056, 221);
wt627 = _mm512_shuffle_f32x4(tmp18050, tmp18058, 136);
wt635 = _mm512_shuffle_f32x4(tmp18050, tmp18058, 221);
wt628 = _mm512_shuffle_f32x4(tmp18052, tmp18060, 136);
wt636 = _mm512_shuffle_f32x4(tmp18052, tmp18060, 221);
wt621 = _mm512_mul_ps(wt621, postMul59);
wt622 = _mm512_mul_ps(wt622, postMul59);
wt623 = _mm512_mul_ps(wt623, postMul59);
wt624 = _mm512_mul_ps(wt624, postMul59);
wt625 = _mm512_mul_ps(wt625, postMul59);
wt626 = _mm512_mul_ps(wt626, postMul59);
wt627 = _mm512_mul_ps(wt627, postMul59);
wt628 = _mm512_mul_ps(wt628, postMul59);
wt629 = _mm512_mul_ps(wt629, postMul59);
wt630 = _mm512_mul_ps(wt630, postMul59);
wt631 = _mm512_mul_ps(wt631, postMul59);
wt632 = _mm512_mul_ps(wt632, postMul59);
wt633 = _mm512_mul_ps(wt633, postMul59);
wt634 = _mm512_mul_ps(wt634, postMul59);
wt635 = _mm512_mul_ps(wt635, postMul59);
wt636 = _mm512_mul_ps(wt636, postMul59);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(1+16*c48)+(ptrdiff_t)0, 63>>cut25, wt621);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(2+16*c48)+(ptrdiff_t)0, 63>>cut25, wt622);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(3+16*c48)+(ptrdiff_t)0, 63>>cut25, wt623);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(4+16*c48)+(ptrdiff_t)0, 63>>cut25, wt624);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(5+16*c48)+(ptrdiff_t)0, 63>>cut25, wt625);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(6+16*c48)+(ptrdiff_t)0, 63>>cut25, wt626);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(7+16*c48)+(ptrdiff_t)0, 63>>cut25, wt627);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(8+16*c48)+(ptrdiff_t)0, 63>>cut25, wt628);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(9+16*c48)+(ptrdiff_t)0, 63>>cut25, wt629);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(10+16*c48)+(ptrdiff_t)0, 63>>cut25, wt630);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(11+16*c48)+(ptrdiff_t)0, 63>>cut25, wt631);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(12+16*c48)+(ptrdiff_t)0, 63>>cut25, wt632);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(13+16*c48)+(ptrdiff_t)0, 63>>cut25, wt633);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(14+16*c48)+(ptrdiff_t)0, 63>>cut25, wt634);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(15+16*c48)+(ptrdiff_t)0, 63>>cut25, wt635);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(16+16*c48)+(ptrdiff_t)0, 63>>cut25, wt636);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(1+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt621);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(2+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt622);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(3+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt623);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(4+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt624);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(5+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt625);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(6+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt626);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(7+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt627);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(8+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt628);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(9+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt629);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(10+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt630);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(11+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt631);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(12+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt632);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(13+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt633);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(14+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt634);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(15+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt635);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(16+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt636);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(1+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt621);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(2+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt622);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(3+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt623);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(4+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt624);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(5+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt625);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(6+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt626);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(7+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt627);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(8+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt628);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(9+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt629);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(10+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt630);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(11+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt631);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(12+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt632);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(13+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt633);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(14+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt634);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(15+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt635);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(16+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt636);
}
}
}
}
}

/*
 * Builds a threader task that runs ResNet50OneArrangeWts9Callee1 and hands it
 * to ResNet50ThreaderDo1 on the supplied team.
 *
 * team63    - threader team passed through to ResNet50ThreaderDo1.
 * tensors99 - tensor pointer table; stored in the task's any1 field so the
 *             callee can pick it up.
 *
 * The task is described by 3 dimensions with hull 16 x 1 x 1 (presumably the
 * extent of the index space the threader iterates over).
 */
static void ResNet50OneArrangeWts9(ResNet50ThreaderTeam1* team63, char** tensors99) {
	ResNet50ThreaderTask1 wtsJob;

	/* Shape of the index space. */
	wtsJob.nd1 = 3;
	wtsJob.hull1[0] = 16;
	wtsJob.hull1[1] = 1;
	wtsJob.hull1[2] = 1;

	/* Work function and its opaque argument. */
	wtsJob.any1 = tensors99;
	wtsJob.callee1 = ResNet50OneArrangeWts9Callee1;

	ResNet50ThreaderDo1(team63, &wtsJob);
}

/*
 * Worker for ResNet50OneArrangeDats9: copies one tile from the source tensor
 * (entry 0 of the tensor table in task104->any1) into the arranged buffer
 * (entry 1 of the same table).
 *
 * pt57[0] selects a run of 128 consecutive k indices; consecutive indices are
 *         832 bytes apart in the source.
 * pt57[1] selects the 256-byte chunk within each 832-byte source record.
 *
 * Chunks other than 3 are copied as four full 16-float vectors and packed
 * 256 bytes per index in the destination.  Chunk 3 carries only 4 floats
 * (mask 0x000F) and is packed 64 bytes per index.  Each chunk goes to its own
 * 262144-byte region of the destination.
 */
static void ResNet50OneArrangeDats9Callee1(ResNet50ThreaderTask1* task104, int64_t* pt57) {
	char** tensorTable = task104->any1;
	const ptrdiff_t slab = pt57[0];
	const ptrdiff_t chunk = pt57[1];
	char*restrict src = tensorTable[0];
	char*restrict dst = tensorTable[1];
	const ptrdiff_t first = 128*slab;
	const ptrdiff_t limit = first+128;

	if (chunk != 3) {
		/* Full-width chunk: four whole vectors per index. */
		for (ptrdiff_t idx = first; idx < limit; ++idx) {
			char* from = src+256*chunk+832*idx;
			char* to = dst+262144*chunk+256*idx;
			__m512 v0 = _mm512_maskz_loadu_ps(0xFFFF, from+(ptrdiff_t)0);
			__m512 v1 = _mm512_maskz_loadu_ps(0xFFFF, from+(ptrdiff_t)64);
			__m512 v2 = _mm512_maskz_loadu_ps(0xFFFF, from+(ptrdiff_t)128);
			__m512 v3 = _mm512_maskz_loadu_ps(0xFFFF, from+(ptrdiff_t)192);
			_mm512_mask_storeu_ps(to+(ptrdiff_t)0, 0xFFFF, v0);
			_mm512_mask_storeu_ps(to+(ptrdiff_t)64, 0xFFFF, v1);
			_mm512_mask_storeu_ps(to+(ptrdiff_t)128, 0xFFFF, v2);
			_mm512_mask_storeu_ps(to+(ptrdiff_t)192, 0xFFFF, v3);
		}
		return;
	}

	/* Last chunk: only the low four lanes are read and written. */
	for (ptrdiff_t idx = first; idx < limit; ++idx) {
		__m512 tail = _mm512_maskz_loadu_ps(0x000F, src+256*chunk+832*idx);
		_mm512_mask_storeu_ps(dst+262144*chunk+64*idx, 0x000F, tail);
	}
}

/*
 * Builds a threader task that runs ResNet50OneArrangeDats9Callee1 and hands
 * it to ResNet50ThreaderDo1 on the supplied team.
 *
 * team64     - threader team passed through to ResNet50ThreaderDo1.
 * tensors101 - tensor pointer table; stored in the task's any1 field so the
 *              callee can pick it up.
 *
 * The task is described by 4 dimensions with hull 8 x 4 x 1 x 1; the callee
 * reads only the first two coordinates.
 */
static void ResNet50OneArrangeDats9(ResNet50ThreaderTeam1* team64, char** tensors101) {
	ResNet50ThreaderTask1 datsJob;

	/* Shape of the index space. */
	datsJob.nd1 = 4;
	datsJob.hull1[0] = 8;
	datsJob.hull1[1] = 4;
	datsJob.hull1[2] = 1;
	datsJob.hull1[3] = 1;

	/* Work function and its opaque argument. */
	datsJob.any1 = tensors101;
	datsJob.callee1 = ResNet50OneArrangeDats9Callee1;

	ResNet50ThreaderDo1(team64, &datsJob);
}

// Worker for ResNet50OneApply9: computes one output tile as
//   initial value + sum over 1024 steps of (broadcast weight * data vector),
// clamps the result at zero with a max against 0, and stores it.
//
// task106->any1 is a pointer array whose first entry is the tensor table:
//   [0] arranged weights, [1] arranged data, [2] output.
// pt58[0] (w70) selects the 24600-byte weight group.
// pt58[1] (d21) selects the data chunk / 256-byte output column block.
//
// Every loop below leaves via `return` at the end of its first iteration
// (the bound jj/kk equals the loop's starting index), so one call handles
// exactly one (w70, d21) pair:
//   d21 != 3, w70 != 42 : 6 weights x 4 data vectors
//   d21 != 3, w70 == 42 : 4 weights x 4 data vectors
//   d21 == 3, w70 != 42 : 6 weights x 1 data vector, low 4 lanes stored
//   d21 == 3, w70 == 42 : 4 weights x 1 data vector, low 4 lanes stored
static void ResNet50OneApply9Callee1(ResNet50ThreaderTask1* task106, int64_t* pt58) {
void** pair28 = task106->any1;
char** tensors104 = pair28[0];
// e29 and g34 are fixed at 0 here, so the offsets scaled by them vanish.
ptrdiff_t e29 = 0;
ptrdiff_t g34 = 0;
ptrdiff_t d21 = pt58[1];
ptrdiff_t w70 = pt58[0];
char*restrict arrangedWts9 = tensors104[0]+856064*e29+(ptrdiff_t)1049600*1*g34;
char*restrict arrangedDats9 = tensors104[1]+694720*e29+(ptrdiff_t)851968*1*g34;
char*restrict datPtr33 = tensors104[2]+(ptrdiff_t)212992*1*g34;
ptrdiff_t ii27 = 1;
for (ptrdiff_t i61 = 0; i61 < ii27; ++i61) {
ptrdiff_t j53 = 1*d21;
ptrdiff_t jj50 = j53+0;
// Full-width data chunks (skipped entirely when d21 == 3).
for (; j53 != 3; ++j53) {
ptrdiff_t k163 = 1*w70;
ptrdiff_t kk57 = k163+0;
// Six-weight groups (skipped when w70 == 42).
for (; k163 != 42; ++k163) {
// With s50 == -1 the term 24*s50+24 is 0, so these six floats are read from
// the very start of the weight group (presumably the folded bias -- confirm
// against the weight-arranging code).
ptrdiff_t s50 = -1;
__m512 sum537 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)24));
__m512 sum541 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)28));
__m512 sum545 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)32));
__m512 sum549 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)36));
__m512 sum553 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)40));
__m512 sum557 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)44));
// Each of the six initial values seeds four accumulators, one per data vector.
__m512 sum538 = sum537;
__m512 sum539 = sum537;
__m512 sum540 = sum537;
__m512 sum542 = sum541;
__m512 sum543 = sum541;
__m512 sum544 = sum541;
__m512 sum546 = sum545;
__m512 sum547 = sum545;
__m512 sum548 = sum545;
__m512 sum550 = sum549;
__m512 sum551 = sum549;
__m512 sum552 = sum549;
__m512 sum554 = sum553;
__m512 sum555 = sum553;
__m512 sum556 = sum553;
__m512 sum558 = sum557;
__m512 sum559 = sum557;
__m512 sum560 = sum557;
// Accumulation: per step, four data vectors (256 bytes) are loaded once and
// combined with six broadcast weights (24 bytes of the weight group).
for (s50 = 0; s50 < 1024; ++s50) {
__m512 dat2328 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+256*s50+(ptrdiff_t)0);
__m512 dat2329 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+256*s50+(ptrdiff_t)64);
__m512 dat2330 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+256*s50+(ptrdiff_t)128);
__m512 dat2331 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+256*s50+(ptrdiff_t)192);
__m512 wt669 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)24));
sum537 = _mm512_fmadd_ps(wt669, dat2328, sum537);
sum538 = _mm512_fmadd_ps(wt669, dat2329, sum538);
sum539 = _mm512_fmadd_ps(wt669, dat2330, sum539);
sum540 = _mm512_fmadd_ps(wt669, dat2331, sum540);
__m512 wt670 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)28));
sum541 = _mm512_fmadd_ps(wt670, dat2328, sum541);
sum542 = _mm512_fmadd_ps(wt670, dat2329, sum542);
sum543 = _mm512_fmadd_ps(wt670, dat2330, sum543);
sum544 = _mm512_fmadd_ps(wt670, dat2331, sum544);
__m512 wt671 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)32));
sum545 = _mm512_fmadd_ps(wt671, dat2328, sum545);
sum546 = _mm512_fmadd_ps(wt671, dat2329, sum546);
sum547 = _mm512_fmadd_ps(wt671, dat2330, sum547);
sum548 = _mm512_fmadd_ps(wt671, dat2331, sum548);
__m512 wt672 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)36));
sum549 = _mm512_fmadd_ps(wt672, dat2328, sum549);
sum550 = _mm512_fmadd_ps(wt672, dat2329, sum550);
sum551 = _mm512_fmadd_ps(wt672, dat2330, sum551);
sum552 = _mm512_fmadd_ps(wt672, dat2331, sum552);
__m512 wt673 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)40));
sum553 = _mm512_fmadd_ps(wt673, dat2328, sum553);
sum554 = _mm512_fmadd_ps(wt673, dat2329, sum554);
sum555 = _mm512_fmadd_ps(wt673, dat2330, sum555);
sum556 = _mm512_fmadd_ps(wt673, dat2331, sum556);
__m512 wt674 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)44));
sum557 = _mm512_fmadd_ps(wt674, dat2328, sum557);
sum558 = _mm512_fmadd_ps(wt674, dat2329, sum558);
sum559 = _mm512_fmadd_ps(wt674, dat2330, sum559);
sum560 = _mm512_fmadd_ps(wt674, dat2331, sum560);
}
// Clamp at zero and store.  Results for successive weights are written
// 832 bytes apart in the output; the four vectors of one weight are contiguous.
sum537 = _mm512_max_ps(_mm512_setzero_ps(), sum537);
sum538 = _mm512_max_ps(_mm512_setzero_ps(), sum538);
sum539 = _mm512_max_ps(_mm512_setzero_ps(), sum539);
sum540 = _mm512_max_ps(_mm512_setzero_ps(), sum540);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)0, 65535, sum537);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)64, 65535, sum538);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)128, 65535, sum539);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)192, 65535, sum540);
sum541 = _mm512_max_ps(_mm512_setzero_ps(), sum541);
sum542 = _mm512_max_ps(_mm512_setzero_ps(), sum542);
sum543 = _mm512_max_ps(_mm512_setzero_ps(), sum543);
sum544 = _mm512_max_ps(_mm512_setzero_ps(), sum544);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)832, 65535, sum541);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)896, 65535, sum542);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)960, 65535, sum543);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1024, 65535, sum544);
sum545 = _mm512_max_ps(_mm512_setzero_ps(), sum545);
sum546 = _mm512_max_ps(_mm512_setzero_ps(), sum546);
sum547 = _mm512_max_ps(_mm512_setzero_ps(), sum547);
sum548 = _mm512_max_ps(_mm512_setzero_ps(), sum548);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1664, 65535, sum545);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1728, 65535, sum546);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1792, 65535, sum547);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1856, 65535, sum548);
sum549 = _mm512_max_ps(_mm512_setzero_ps(), sum549);
sum550 = _mm512_max_ps(_mm512_setzero_ps(), sum550);
sum551 = _mm512_max_ps(_mm512_setzero_ps(), sum551);
sum552 = _mm512_max_ps(_mm512_setzero_ps(), sum552);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)2496, 65535, sum549);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)2560, 65535, sum550);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)2624, 65535, sum551);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)2688, 65535, sum552);
sum553 = _mm512_max_ps(_mm512_setzero_ps(), sum553);
sum554 = _mm512_max_ps(_mm512_setzero_ps(), sum554);
sum555 = _mm512_max_ps(_mm512_setzero_ps(), sum555);
sum556 = _mm512_max_ps(_mm512_setzero_ps(), sum556);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)3328, 65535, sum553);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)3392, 65535, sum554);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)3456, 65535, sum555);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)3520, 65535, sum556);
sum557 = _mm512_max_ps(_mm512_setzero_ps(), sum557);
sum558 = _mm512_max_ps(_mm512_setzero_ps(), sum558);
sum559 = _mm512_max_ps(_mm512_setzero_ps(), sum559);
sum560 = _mm512_max_ps(_mm512_setzero_ps(), sum560);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)4160, 65535, sum557);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)4224, 65535, sum558);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)4288, 65535, sum559);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)4352, 65535, sum560);
// kk57 equals the starting k163, so this always fires: one group per call.
if (k163 >= kk57) return;
}
// Reached only with k163 == 42: the last weight group holds four weights per
// step (16-byte stride) instead of six.
ptrdiff_t s51 = -1;
__m512 sum561 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+16*s51+(ptrdiff_t)16));
__m512 sum565 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+16*s51+(ptrdiff_t)20));
__m512 sum569 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+16*s51+(ptrdiff_t)24));
__m512 sum573 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+16*s51+(ptrdiff_t)28));
__m512 sum562 = sum561;
__m512 sum563 = sum561;
__m512 sum564 = sum561;
__m512 sum566 = sum565;
__m512 sum567 = sum565;
__m512 sum568 = sum565;
__m512 sum570 = sum569;
__m512 sum571 = sum569;
__m512 sum572 = sum569;
__m512 sum574 = sum573;
__m512 sum575 = sum573;
__m512 sum576 = sum573;
for (s51 = 0; s51 < 1024; ++s51) {
__m512 dat2332 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+256*s51+(ptrdiff_t)0);
__m512 dat2333 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+256*s51+(ptrdiff_t)64);
__m512 dat2334 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+256*s51+(ptrdiff_t)128);
__m512 dat2335 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+256*s51+(ptrdiff_t)192);
__m512 wt675 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+16*s51+(ptrdiff_t)16));
sum561 = _mm512_fmadd_ps(wt675, dat2332, sum561);
sum562 = _mm512_fmadd_ps(wt675, dat2333, sum562);
sum563 = _mm512_fmadd_ps(wt675, dat2334, sum563);
sum564 = _mm512_fmadd_ps(wt675, dat2335, sum564);
__m512 wt676 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+16*s51+(ptrdiff_t)20));
sum565 = _mm512_fmadd_ps(wt676, dat2332, sum565);
sum566 = _mm512_fmadd_ps(wt676, dat2333, sum566);
sum567 = _mm512_fmadd_ps(wt676, dat2334, sum567);
sum568 = _mm512_fmadd_ps(wt676, dat2335, sum568);
__m512 wt677 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+16*s51+(ptrdiff_t)24));
sum569 = _mm512_fmadd_ps(wt677, dat2332, sum569);
sum570 = _mm512_fmadd_ps(wt677, dat2333, sum570);
sum571 = _mm512_fmadd_ps(wt677, dat2334, sum571);
sum572 = _mm512_fmadd_ps(wt677, dat2335, sum572);
__m512 wt678 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+16*s51+(ptrdiff_t)28));
sum573 = _mm512_fmadd_ps(wt678, dat2332, sum573);
sum574 = _mm512_fmadd_ps(wt678, dat2333, sum574);
sum575 = _mm512_fmadd_ps(wt678, dat2334, sum575);
sum576 = _mm512_fmadd_ps(wt678, dat2335, sum576);
}
sum561 = _mm512_max_ps(_mm512_setzero_ps(), sum561);
sum562 = _mm512_max_ps(_mm512_setzero_ps(), sum562);
sum563 = _mm512_max_ps(_mm512_setzero_ps(), sum563);
sum564 = _mm512_max_ps(_mm512_setzero_ps(), sum564);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)0, 65535, sum561);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)64, 65535, sum562);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)128, 65535, sum563);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)192, 65535, sum564);
sum565 = _mm512_max_ps(_mm512_setzero_ps(), sum565);
sum566 = _mm512_max_ps(_mm512_setzero_ps(), sum566);
sum567 = _mm512_max_ps(_mm512_setzero_ps(), sum567);
sum568 = _mm512_max_ps(_mm512_setzero_ps(), sum568);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)832, 65535, sum565);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)896, 65535, sum566);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)960, 65535, sum567);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1024, 65535, sum568);
sum569 = _mm512_max_ps(_mm512_setzero_ps(), sum569);
sum570 = _mm512_max_ps(_mm512_setzero_ps(), sum570);
sum571 = _mm512_max_ps(_mm512_setzero_ps(), sum571);
sum572 = _mm512_max_ps(_mm512_setzero_ps(), sum572);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1664, 65535, sum569);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1728, 65535, sum570);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1792, 65535, sum571);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1856, 65535, sum572);
sum573 = _mm512_max_ps(_mm512_setzero_ps(), sum573);
sum574 = _mm512_max_ps(_mm512_setzero_ps(), sum574);
sum575 = _mm512_max_ps(_mm512_setzero_ps(), sum575);
sum576 = _mm512_max_ps(_mm512_setzero_ps(), sum576);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)2496, 65535, sum573);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)2560, 65535, sum574);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)2624, 65535, sum575);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)2688, 65535, sum576);
// jj50 equals the starting j53, so this always fires: one chunk per call.
if (j53 >= jj50) return;
}
// Reached only with j53 == 3: the last data chunk is one vector per step
// (64-byte stride) and only the low four lanes of each result are stored.
ptrdiff_t k164 = 1*w70;
ptrdiff_t kk58 = k164+0;
for (; k164 != 42; ++k164) {
ptrdiff_t s52 = -1;
__m512 sum577 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)24));
__m512 sum578 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)28));
__m512 sum579 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)32));
__m512 sum580 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)36));
__m512 sum581 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)40));
__m512 sum582 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)44));
for (s52 = 0; s52 < 1024; ++s52) {
__m512 dat2336 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+64*s52+(ptrdiff_t)0);
__m512 wt679 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)24));
sum577 = _mm512_fmadd_ps(wt679, dat2336, sum577);
__m512 wt680 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)28));
sum578 = _mm512_fmadd_ps(wt680, dat2336, sum578);
__m512 wt681 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)32));
sum579 = _mm512_fmadd_ps(wt681, dat2336, sum579);
__m512 wt682 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)36));
sum580 = _mm512_fmadd_ps(wt682, dat2336, sum580);
__m512 wt683 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)40));
sum581 = _mm512_fmadd_ps(wt683, dat2336, sum581);
__m512 wt684 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)44));
sum582 = _mm512_fmadd_ps(wt684, dat2336, sum582);
}
// Mask 15 writes lanes 0..3 only.
sum577 = _mm512_max_ps(_mm512_setzero_ps(), sum577);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)0, 15, sum577);
sum578 = _mm512_max_ps(_mm512_setzero_ps(), sum578);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)832, 15, sum578);
sum579 = _mm512_max_ps(_mm512_setzero_ps(), sum579);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)1664, 15, sum579);
sum580 = _mm512_max_ps(_mm512_setzero_ps(), sum580);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)2496, 15, sum580);
sum581 = _mm512_max_ps(_mm512_setzero_ps(), sum581);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)3328, 15, sum581);
sum582 = _mm512_max_ps(_mm512_setzero_ps(), sum582);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)4160, 15, sum582);
// kk58 equals the starting k164, so this always fires: one group per call.
if (k164 >= kk58) return;
}
// Reached only with j53 == 3 and k164 == 42: four weights, one data vector.
ptrdiff_t s53 = -1;
__m512 sum583 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+16*s53+(ptrdiff_t)16));
__m512 sum584 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+16*s53+(ptrdiff_t)20));
__m512 sum585 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+16*s53+(ptrdiff_t)24));
__m512 sum586 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+16*s53+(ptrdiff_t)28));
for (s53 = 0; s53 < 1024; ++s53) {
__m512 dat2337 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+64*s53+(ptrdiff_t)0);
__m512 wt685 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+16*s53+(ptrdiff_t)16));
sum583 = _mm512_fmadd_ps(wt685, dat2337, sum583);
__m512 wt686 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+16*s53+(ptrdiff_t)20));
sum584 = _mm512_fmadd_ps(wt686, dat2337, sum584);
__m512 wt687 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+16*s53+(ptrdiff_t)24));
sum585 = _mm512_fmadd_ps(wt687, dat2337, sum585);
__m512 wt688 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+16*s53+(ptrdiff_t)28));
sum586 = _mm512_fmadd_ps(wt688, dat2337, sum586);
}
sum583 = _mm512_max_ps(_mm512_setzero_ps(), sum583);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)0, 15, sum583);
sum584 = _mm512_max_ps(_mm512_setzero_ps(), sum584);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)832, 15, sum584);
sum585 = _mm512_max_ps(_mm512_setzero_ps(), sum585);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)1664, 15, sum585);
sum586 = _mm512_max_ps(_mm512_setzero_ps(), sum586);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)2496, 15, sum586);
}
}

/*
 * Builds a threader task that runs ResNet50OneApply9Callee1 and hands it to
 * ResNet50ThreaderDo1 on the supplied team.
 *
 * team65     - threader team passed through to ResNet50ThreaderDo1.
 * tensors103 - tensor pointer table; wrapped as the first slot of a
 *              two-entry pointer array (second slot is null) that becomes
 *              the task's any1 field.
 *
 * The task is described by 3 dimensions with hull 43 x 4 x 1.
 */
static void ResNet50OneApply9(ResNet50ThreaderTeam1* team65, char** tensors103) {
	void* calleeArgs[2];
	ResNet50ThreaderTask1 applyJob;

	calleeArgs[0] = tensors103;
	calleeArgs[1] = 0;

	/* Shape of the index space. */
	applyJob.nd1 = 3;
	applyJob.hull1[0] = 43;
	applyJob.hull1[1] = 4;
	applyJob.hull1[2] = 1;

	/* Work function and its opaque argument. */
	applyJob.any1 = calleeArgs;
	applyJob.callee1 = ResNet50OneApply9Callee1;

	ResNet50ThreaderDo1(team65, &applyJob);
}

static void ResNet50OneArrangeWts10Callee1(ResNet50ThreaderTask1* task116, int64_t* pt63) {
char** tensors114 = task116->any1;
ptrdiff_t b71 = pt63[0];
char*restrict wtPtr20 = tensors114[0]+(ptrdiff_t)3340*0+(ptrdiff_t)10485760*0;
char*restrict biasPtr20 = tensors114[1]+(ptrdiff_t)10240*0;
char*restrict bnPtr20 = tensors114[2]+(ptrdiff_t)8*2560*0;
char*restrict wtPtr21 = tensors114[3]+(ptrdiff_t)3340*0+(ptrdiff_t)10485760*0;
char*restrict biasPtr21 = tensors114[4]+(ptrdiff_t)10240*0;
char*restrict bnPtr21 = tensors114[5]+(ptrdiff_t)8*2560*0;
char*restrict arranged19 = tensors114[6]+(ptrdiff_t)8560640*0+(ptrdiff_t)10496000*0;
ptrdiff_t ii28 = 1;
for (ptrdiff_t i67 = 0; i67 < ii28; ++i67) {
ptrdiff_t j58 = 1*b71;
ptrdiff_t jj52 = j58+1;
for (; j58 < jj52; ++j58) {
if (j58 < 128) {
ptrdiff_t k171 = 0+16*(j58-0);
ptrdiff_t l74 = (size_t)(0+k171)/6;
ptrdiff_t cut28 = (size_t)(0+k171)%6;
switch (cut28) {
case 0:;
case 2: {
__m512 sum623 = _mm512_maskz_loadu_ps(65535, biasPtr20+10240*i67+4*k171);
__m512i pmMul41 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd41 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo34 = _mm512_loadu_ps(bnPtr20+(ptrdiff_t)8*(k171+2560*i67));
__m512 masHi34 = _mm512_maskz_loadu_ps(65535, bnPtr20+(ptrdiff_t)8*(k171+2560*i67)+(ptrdiff_t)64);
__m512 postMul67 = _mm512_permutex2var_ps(masLo34, pmMul41, masHi34);
__m512 postAdd41 = _mm512_permutex2var_ps(masLo34, pmAdd41, masHi34);
sum623 = _mm512_fmadd_ps(sum623, postMul67, postAdd41);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*0+(ptrdiff_t)0, 63>>cut28, sum623);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*0+(ptrdiff_t)24576, 4032>>cut28, sum623);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*0+(ptrdiff_t)49152, 65535-(4095>>cut28), sum623);
ptrdiff_t c53 = 0;
for (; c53 != 64; ++c53) {
__m512 wt693 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)0);
__m512 wt694 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)4096);
__m512 wt695 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)8192);
__m512 wt696 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)12288);
__m512 wt697 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)16384);
__m512 wt698 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)20480);
__m512 wt699 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)24576);
__m512 wt700 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)28672);
__m512 wt701 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)32768);
__m512 wt702 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)36864);
__m512 wt703 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)40960);
__m512 wt704 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)45056);
__m512 wt705 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)49152);
__m512 wt706 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)53248);
__m512 wt707 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)57344);
__m512 wt708 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)61440);
__m512 tmp19277 = _mm512_unpacklo_ps(wt693, wt694);
__m512 tmp19278 = _mm512_unpackhi_ps(wt693, wt694);
__m512 tmp19279 = _mm512_unpacklo_ps(wt695, wt696);
__m512 tmp19280 = _mm512_unpackhi_ps(wt695, wt696);
__m512 tmp19281 = _mm512_unpacklo_ps(wt697, wt698);
__m512 tmp19282 = _mm512_unpackhi_ps(wt697, wt698);
__m512 tmp19283 = _mm512_unpacklo_ps(wt699, wt700);
__m512 tmp19284 = _mm512_unpackhi_ps(wt699, wt700);
__m512 tmp19285 = _mm512_unpacklo_ps(wt701, wt702);
__m512 tmp19286 = _mm512_unpackhi_ps(wt701, wt702);
__m512 tmp19287 = _mm512_unpacklo_ps(wt703, wt704);
__m512 tmp19288 = _mm512_unpackhi_ps(wt703, wt704);
__m512 tmp19289 = _mm512_unpacklo_ps(wt705, wt706);
__m512 tmp19290 = _mm512_unpackhi_ps(wt705, wt706);
__m512 tmp19291 = _mm512_unpacklo_ps(wt707, wt708);
__m512 tmp19292 = _mm512_unpackhi_ps(wt707, wt708);
__m512 tmp19293 = _mm512_shuffle_ps(tmp19277, tmp19279, 68);
__m512 tmp19294 = _mm512_shuffle_ps(tmp19277, tmp19279, 238);
__m512 tmp19295 = _mm512_shuffle_ps(tmp19278, tmp19280, 68);
__m512 tmp19296 = _mm512_shuffle_ps(tmp19278, tmp19280, 238);
__m512 tmp19297 = _mm512_shuffle_ps(tmp19281, tmp19283, 68);
__m512 tmp19298 = _mm512_shuffle_ps(tmp19281, tmp19283, 238);
__m512 tmp19299 = _mm512_shuffle_ps(tmp19282, tmp19284, 68);
__m512 tmp19300 = _mm512_shuffle_ps(tmp19282, tmp19284, 238);
__m512 tmp19301 = _mm512_shuffle_ps(tmp19285, tmp19287, 68);
__m512 tmp19302 = _mm512_shuffle_ps(tmp19285, tmp19287, 238);
__m512 tmp19303 = _mm512_shuffle_ps(tmp19286, tmp19288, 68);
__m512 tmp19304 = _mm512_shuffle_ps(tmp19286, tmp19288, 238);
__m512 tmp19305 = _mm512_shuffle_ps(tmp19289, tmp19291, 68);
__m512 tmp19306 = _mm512_shuffle_ps(tmp19289, tmp19291, 238);
__m512 tmp19307 = _mm512_shuffle_ps(tmp19290, tmp19292, 68);
__m512 tmp19308 = _mm512_shuffle_ps(tmp19290, tmp19292, 238);
__m512 tmp19309 = _mm512_shuffle_f32x4(tmp19293, tmp19297, 136);
__m512 tmp19310 = _mm512_shuffle_f32x4(tmp19293, tmp19297, 221);
__m512 tmp19311 = _mm512_shuffle_f32x4(tmp19294, tmp19298, 136);
__m512 tmp19312 = _mm512_shuffle_f32x4(tmp19294, tmp19298, 221);
__m512 tmp19313 = _mm512_shuffle_f32x4(tmp19295, tmp19299, 136);
__m512 tmp19314 = _mm512_shuffle_f32x4(tmp19295, tmp19299, 221);
__m512 tmp19315 = _mm512_shuffle_f32x4(tmp19296, tmp19300, 136);
__m512 tmp19316 = _mm512_shuffle_f32x4(tmp19296, tmp19300, 221);
__m512 tmp19317 = _mm512_shuffle_f32x4(tmp19301, tmp19305, 136);
__m512 tmp19318 = _mm512_shuffle_f32x4(tmp19301, tmp19305, 221);
__m512 tmp19319 = _mm512_shuffle_f32x4(tmp19302, tmp19306, 136);
__m512 tmp19320 = _mm512_shuffle_f32x4(tmp19302, tmp19306, 221);
__m512 tmp19321 = _mm512_shuffle_f32x4(tmp19303, tmp19307, 136);
__m512 tmp19322 = _mm512_shuffle_f32x4(tmp19303, tmp19307, 221);
__m512 tmp19323 = _mm512_shuffle_f32x4(tmp19304, tmp19308, 136);
__m512 tmp19324 = _mm512_shuffle_f32x4(tmp19304, tmp19308, 221);
wt693 = _mm512_shuffle_f32x4(tmp19309, tmp19317, 136);
wt701 = _mm512_shuffle_f32x4(tmp19309, tmp19317, 221);
wt694 = _mm512_shuffle_f32x4(tmp19311, tmp19319, 136);
wt702 = _mm512_shuffle_f32x4(tmp19311, tmp19319, 221);
wt695 = _mm512_shuffle_f32x4(tmp19313, tmp19321, 136);
wt703 = _mm512_shuffle_f32x4(tmp19313, tmp19321, 221);
wt696 = _mm512_shuffle_f32x4(tmp19315, tmp19323, 136);
wt704 = _mm512_shuffle_f32x4(tmp19315, tmp19323, 221);
wt697 = _mm512_shuffle_f32x4(tmp19310, tmp19318, 136);
wt705 = _mm512_shuffle_f32x4(tmp19310, tmp19318, 221);
wt698 = _mm512_shuffle_f32x4(tmp19312, tmp19320, 136);
wt706 = _mm512_shuffle_f32x4(tmp19312, tmp19320, 221);
wt699 = _mm512_shuffle_f32x4(tmp19314, tmp19322, 136);
wt707 = _mm512_shuffle_f32x4(tmp19314, tmp19322, 221);
wt700 = _mm512_shuffle_f32x4(tmp19316, tmp19324, 136);
wt708 = _mm512_shuffle_f32x4(tmp19316, tmp19324, 221);
wt693 = _mm512_mul_ps(wt693, postMul67);
wt694 = _mm512_mul_ps(wt694, postMul67);
wt695 = _mm512_mul_ps(wt695, postMul67);
wt696 = _mm512_mul_ps(wt696, postMul67);
wt697 = _mm512_mul_ps(wt697, postMul67);
wt698 = _mm512_mul_ps(wt698, postMul67);
wt699 = _mm512_mul_ps(wt699, postMul67);
wt700 = _mm512_mul_ps(wt700, postMul67);
wt701 = _mm512_mul_ps(wt701, postMul67);
wt702 = _mm512_mul_ps(wt702, postMul67);
wt703 = _mm512_mul_ps(wt703, postMul67);
wt704 = _mm512_mul_ps(wt704, postMul67);
wt705 = _mm512_mul_ps(wt705, postMul67);
wt706 = _mm512_mul_ps(wt706, postMul67);
wt707 = _mm512_mul_ps(wt707, postMul67);
wt708 = _mm512_mul_ps(wt708, postMul67);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(1+16*c53)+(ptrdiff_t)0, 63>>cut28, wt693);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(2+16*c53)+(ptrdiff_t)0, 63>>cut28, wt694);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(3+16*c53)+(ptrdiff_t)0, 63>>cut28, wt695);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(4+16*c53)+(ptrdiff_t)0, 63>>cut28, wt696);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(5+16*c53)+(ptrdiff_t)0, 63>>cut28, wt697);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(6+16*c53)+(ptrdiff_t)0, 63>>cut28, wt698);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(7+16*c53)+(ptrdiff_t)0, 63>>cut28, wt699);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(8+16*c53)+(ptrdiff_t)0, 63>>cut28, wt700);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(9+16*c53)+(ptrdiff_t)0, 63>>cut28, wt701);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(10+16*c53)+(ptrdiff_t)0, 63>>cut28, wt702);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(11+16*c53)+(ptrdiff_t)0, 63>>cut28, wt703);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(12+16*c53)+(ptrdiff_t)0, 63>>cut28, wt704);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(13+16*c53)+(ptrdiff_t)0, 63>>cut28, wt705);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(14+16*c53)+(ptrdiff_t)0, 63>>cut28, wt706);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(15+16*c53)+(ptrdiff_t)0, 63>>cut28, wt707);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(16+16*c53)+(ptrdiff_t)0, 63>>cut28, wt708);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(1+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt693);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(2+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt694);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(3+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt695);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(4+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt696);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(5+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt697);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(6+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt698);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(7+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt699);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(8+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt700);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(9+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt701);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(10+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt702);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(11+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt703);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(12+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt704);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(13+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt705);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(14+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt706);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(15+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt707);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(16+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt708);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(1+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt693);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(2+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt694);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(3+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt695);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(4+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt696);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(5+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt697);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(6+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt698);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(7+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt699);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(8+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt700);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(9+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt701);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(10+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt702);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(11+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt703);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(12+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt704);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(13+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt705);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(14+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt706);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(15+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt707);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(16+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt708);
}
break;
}
default: {
cut28 = 4;
__m512 sum624 = _mm512_maskz_loadu_ps(65535, biasPtr20+10240*i67+4*k171);
__m512i pmMul42 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd42 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo35 = _mm512_loadu_ps(bnPtr20+(ptrdiff_t)8*(k171+2560*i67));
__m512 masHi35 = _mm512_maskz_loadu_ps(65535, bnPtr20+(ptrdiff_t)8*(k171+2560*i67)+(ptrdiff_t)64);
__m512 postMul68 = _mm512_permutex2var_ps(masLo35, pmMul42, masHi35);
__m512 postAdd42 = _mm512_permutex2var_ps(masLo35, pmAdd42, masHi35);
sum624 = _mm512_fmadd_ps(sum624, postMul68, postAdd42);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*0+(ptrdiff_t)0, 63>>cut28, sum624);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*0+(ptrdiff_t)24576, 4032>>cut28, sum624);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*0+(ptrdiff_t)49152, 258048>>cut28, sum624);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*0+(ptrdiff_t)73728, 65535-(262143>>cut28), sum624);
ptrdiff_t c54 = 0;
for (; c54 != 64; ++c54) {
__m512 wt709 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)0);
__m512 wt710 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)4096);
__m512 wt711 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)8192);
__m512 wt712 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)12288);
__m512 wt713 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)16384);
__m512 wt714 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)20480);
__m512 wt715 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)24576);
__m512 wt716 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)28672);
__m512 wt717 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)32768);
__m512 wt718 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)36864);
__m512 wt719 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)40960);
__m512 wt720 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)45056);
__m512 wt721 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)49152);
__m512 wt722 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)53248);
__m512 wt723 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)57344);
__m512 wt724 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)61440);
__m512 tmp19325 = _mm512_unpacklo_ps(wt709, wt710);
__m512 tmp19326 = _mm512_unpackhi_ps(wt709, wt710);
__m512 tmp19327 = _mm512_unpacklo_ps(wt711, wt712);
__m512 tmp19328 = _mm512_unpackhi_ps(wt711, wt712);
__m512 tmp19329 = _mm512_unpacklo_ps(wt713, wt714);
__m512 tmp19330 = _mm512_unpackhi_ps(wt713, wt714);
__m512 tmp19331 = _mm512_unpacklo_ps(wt715, wt716);
__m512 tmp19332 = _mm512_unpackhi_ps(wt715, wt716);
__m512 tmp19333 = _mm512_unpacklo_ps(wt717, wt718);
__m512 tmp19334 = _mm512_unpackhi_ps(wt717, wt718);
__m512 tmp19335 = _mm512_unpacklo_ps(wt719, wt720);
__m512 tmp19336 = _mm512_unpackhi_ps(wt719, wt720);
__m512 tmp19337 = _mm512_unpacklo_ps(wt721, wt722);
__m512 tmp19338 = _mm512_unpackhi_ps(wt721, wt722);
__m512 tmp19339 = _mm512_unpacklo_ps(wt723, wt724);
__m512 tmp19340 = _mm512_unpackhi_ps(wt723, wt724);
__m512 tmp19341 = _mm512_shuffle_ps(tmp19325, tmp19327, 68);
__m512 tmp19342 = _mm512_shuffle_ps(tmp19325, tmp19327, 238);
__m512 tmp19343 = _mm512_shuffle_ps(tmp19326, tmp19328, 68);
__m512 tmp19344 = _mm512_shuffle_ps(tmp19326, tmp19328, 238);
__m512 tmp19345 = _mm512_shuffle_ps(tmp19329, tmp19331, 68);
__m512 tmp19346 = _mm512_shuffle_ps(tmp19329, tmp19331, 238);
__m512 tmp19347 = _mm512_shuffle_ps(tmp19330, tmp19332, 68);
__m512 tmp19348 = _mm512_shuffle_ps(tmp19330, tmp19332, 238);
__m512 tmp19349 = _mm512_shuffle_ps(tmp19333, tmp19335, 68);
__m512 tmp19350 = _mm512_shuffle_ps(tmp19333, tmp19335, 238);
__m512 tmp19351 = _mm512_shuffle_ps(tmp19334, tmp19336, 68);
__m512 tmp19352 = _mm512_shuffle_ps(tmp19334, tmp19336, 238);
__m512 tmp19353 = _mm512_shuffle_ps(tmp19337, tmp19339, 68);
__m512 tmp19354 = _mm512_shuffle_ps(tmp19337, tmp19339, 238);
__m512 tmp19355 = _mm512_shuffle_ps(tmp19338, tmp19340, 68);
__m512 tmp19356 = _mm512_shuffle_ps(tmp19338, tmp19340, 238);
__m512 tmp19357 = _mm512_shuffle_f32x4(tmp19341, tmp19345, 136);
__m512 tmp19358 = _mm512_shuffle_f32x4(tmp19341, tmp19345, 221);
__m512 tmp19359 = _mm512_shuffle_f32x4(tmp19342, tmp19346, 136);
__m512 tmp19360 = _mm512_shuffle_f32x4(tmp19342, tmp19346, 221);
__m512 tmp19361 = _mm512_shuffle_f32x4(tmp19343, tmp19347, 136);
__m512 tmp19362 = _mm512_shuffle_f32x4(tmp19343, tmp19347, 221);
__m512 tmp19363 = _mm512_shuffle_f32x4(tmp19344, tmp19348, 136);
__m512 tmp19364 = _mm512_shuffle_f32x4(tmp19344, tmp19348, 221);
__m512 tmp19365 = _mm512_shuffle_f32x4(tmp19349, tmp19353, 136);
__m512 tmp19366 = _mm512_shuffle_f32x4(tmp19349, tmp19353, 221);
__m512 tmp19367 = _mm512_shuffle_f32x4(tmp19350, tmp19354, 136);
__m512 tmp19368 = _mm512_shuffle_f32x4(tmp19350, tmp19354, 221);
__m512 tmp19369 = _mm512_shuffle_f32x4(tmp19351, tmp19355, 136);
__m512 tmp19370 = _mm512_shuffle_f32x4(tmp19351, tmp19355, 221);
__m512 tmp19371 = _mm512_shuffle_f32x4(tmp19352, tmp19356, 136);
__m512 tmp19372 = _mm512_shuffle_f32x4(tmp19352, tmp19356, 221);
wt709 = _mm512_shuffle_f32x4(tmp19357, tmp19365, 136);
wt717 = _mm512_shuffle_f32x4(tmp19357, tmp19365, 221);
wt710 = _mm512_shuffle_f32x4(tmp19359, tmp19367, 136);
wt718 = _mm512_shuffle_f32x4(tmp19359, tmp19367, 221);
wt711 = _mm512_shuffle_f32x4(tmp19361, tmp19369, 136);
wt719 = _mm512_shuffle_f32x4(tmp19361, tmp19369, 221);
wt712 = _mm512_shuffle_f32x4(tmp19363, tmp19371, 136);
wt720 = _mm512_shuffle_f32x4(tmp19363, tmp19371, 221);
wt713 = _mm512_shuffle_f32x4(tmp19358, tmp19366, 136);
wt721 = _mm512_shuffle_f32x4(tmp19358, tmp19366, 221);
wt714 = _mm512_shuffle_f32x4(tmp19360, tmp19368, 136);
wt722 = _mm512_shuffle_f32x4(tmp19360, tmp19368, 221);
wt715 = _mm512_shuffle_f32x4(tmp19362, tmp19370, 136);
wt723 = _mm512_shuffle_f32x4(tmp19362, tmp19370, 221);
wt716 = _mm512_shuffle_f32x4(tmp19364, tmp19372, 136);
wt724 = _mm512_shuffle_f32x4(tmp19364, tmp19372, 221);
wt709 = _mm512_mul_ps(wt709, postMul68);
wt710 = _mm512_mul_ps(wt710, postMul68);
wt711 = _mm512_mul_ps(wt711, postMul68);
wt712 = _mm512_mul_ps(wt712, postMul68);
wt713 = _mm512_mul_ps(wt713, postMul68);
wt714 = _mm512_mul_ps(wt714, postMul68);
wt715 = _mm512_mul_ps(wt715, postMul68);
wt716 = _mm512_mul_ps(wt716, postMul68);
wt717 = _mm512_mul_ps(wt717, postMul68);
wt718 = _mm512_mul_ps(wt718, postMul68);
wt719 = _mm512_mul_ps(wt719, postMul68);
wt720 = _mm512_mul_ps(wt720, postMul68);
wt721 = _mm512_mul_ps(wt721, postMul68);
wt722 = _mm512_mul_ps(wt722, postMul68);
wt723 = _mm512_mul_ps(wt723, postMul68);
wt724 = _mm512_mul_ps(wt724, postMul68);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(1+16*c54)+(ptrdiff_t)0, 63>>cut28, wt709);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(2+16*c54)+(ptrdiff_t)0, 63>>cut28, wt710);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(3+16*c54)+(ptrdiff_t)0, 63>>cut28, wt711);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(4+16*c54)+(ptrdiff_t)0, 63>>cut28, wt712);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(5+16*c54)+(ptrdiff_t)0, 63>>cut28, wt713);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(6+16*c54)+(ptrdiff_t)0, 63>>cut28, wt714);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(7+16*c54)+(ptrdiff_t)0, 63>>cut28, wt715);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(8+16*c54)+(ptrdiff_t)0, 63>>cut28, wt716);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(9+16*c54)+(ptrdiff_t)0, 63>>cut28, wt717);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(10+16*c54)+(ptrdiff_t)0, 63>>cut28, wt718);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(11+16*c54)+(ptrdiff_t)0, 63>>cut28, wt719);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(12+16*c54)+(ptrdiff_t)0, 63>>cut28, wt720);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(13+16*c54)+(ptrdiff_t)0, 63>>cut28, wt721);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(14+16*c54)+(ptrdiff_t)0, 63>>cut28, wt722);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(15+16*c54)+(ptrdiff_t)0, 63>>cut28, wt723);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(16+16*c54)+(ptrdiff_t)0, 63>>cut28, wt724);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(1+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt709);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(2+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt710);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(3+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt711);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(4+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt712);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(5+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt713);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(6+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt714);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(7+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt715);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(8+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt716);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(9+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt717);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(10+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt718);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(11+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt719);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(12+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt720);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(13+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt721);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(14+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt722);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(15+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt723);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(16+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt724);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(1+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt709);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(2+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt710);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(3+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt711);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(4+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt712);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(5+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt713);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(6+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt714);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(7+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt715);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(8+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt716);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(9+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt717);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(10+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt718);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(11+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt719);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(12+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt720);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(13+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt721);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(14+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt722);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(15+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt723);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(16+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt724);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(1+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt709);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(2+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt710);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(3+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt711);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(4+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt712);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(5+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt713);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(6+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt714);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(7+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt715);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(8+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt716);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(9+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt717);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(10+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt718);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(11+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt719);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(12+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt720);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(13+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt721);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(14+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt722);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(15+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt723);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(16+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt724);
}
}
}
} else if (j58 < 159) {
ptrdiff_t k173 = 0+16*(j58-128);
ptrdiff_t l76 = (size_t)(2048+k173)/6;
ptrdiff_t cut30 = (size_t)(2048+k173)%6;
switch (cut30) {
case 0:;
case 2: {
__m512 sum626 = _mm512_maskz_loadu_ps(65535, biasPtr21+10240*i67+4*k173);
__m512i pmMul43 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd43 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo36 = _mm512_loadu_ps(bnPtr21+(ptrdiff_t)8*(k173+2560*i67));
__m512 masHi36 = _mm512_maskz_loadu_ps(65535, bnPtr21+(ptrdiff_t)8*(k173+2560*i67)+(ptrdiff_t)64);
__m512 postMul70 = _mm512_permutex2var_ps(masLo36, pmMul43, masHi36);
__m512 postAdd44 = _mm512_permutex2var_ps(masLo36, pmAdd43, masHi36);
sum626 = _mm512_fmadd_ps(sum626, postMul70, postAdd44);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*0+(ptrdiff_t)0, 63>>cut30, sum626);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*0+(ptrdiff_t)24576, 4032>>cut30, sum626);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*0+(ptrdiff_t)49152, 65535-(4095>>cut30), sum626);
ptrdiff_t c56 = 0;
for (; c56 != 64; ++c56) {
__m512 wt741 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)0);
__m512 wt742 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)4096);
__m512 wt743 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)8192);
__m512 wt744 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)12288);
// Remaining row loads for this iteration of the c56 loop. wt741..wt744,
// postMul70 and cut30 are defined before this excerpt. Mask 65535 (0xFFFF)
// enables all 16 lanes, so each load reads 16 consecutive floats. Successive
// rows are 4096 apart and c56 advances the address by 64.
// NOTE(review): the offsets read like byte offsets (64 bytes = 16 floats),
// which presumes wtPtr21 is a char-typed pointer -- declaration not in view.
__m512 wt745 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)16384);
__m512 wt746 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)20480);
__m512 wt747 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)24576);
__m512 wt748 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)28672);
__m512 wt749 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)32768);
__m512 wt750 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)36864);
__m512 wt751 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)40960);
__m512 wt752 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)45056);
__m512 wt753 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)49152);
__m512 wt754 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)53248);
__m512 wt755 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)57344);
__m512 wt756 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)61440);
// 16x16 transpose of wt741..wt756.
// Stage 1: interleave the 32-bit elements of adjacent row pairs inside each
// 128-bit lane (unpacklo takes elements 0-1 of the lane, unpackhi elements 2-3).
__m512 tmp19373 = _mm512_unpacklo_ps(wt741, wt742);
__m512 tmp19374 = _mm512_unpackhi_ps(wt741, wt742);
__m512 tmp19375 = _mm512_unpacklo_ps(wt743, wt744);
__m512 tmp19376 = _mm512_unpackhi_ps(wt743, wt744);
__m512 tmp19377 = _mm512_unpacklo_ps(wt745, wt746);
__m512 tmp19378 = _mm512_unpackhi_ps(wt745, wt746);
__m512 tmp19379 = _mm512_unpacklo_ps(wt747, wt748);
__m512 tmp19380 = _mm512_unpackhi_ps(wt747, wt748);
__m512 tmp19381 = _mm512_unpacklo_ps(wt749, wt750);
__m512 tmp19382 = _mm512_unpackhi_ps(wt749, wt750);
__m512 tmp19383 = _mm512_unpacklo_ps(wt751, wt752);
__m512 tmp19384 = _mm512_unpackhi_ps(wt751, wt752);
__m512 tmp19385 = _mm512_unpacklo_ps(wt753, wt754);
__m512 tmp19386 = _mm512_unpackhi_ps(wt753, wt754);
__m512 tmp19387 = _mm512_unpacklo_ps(wt755, wt756);
__m512 tmp19388 = _mm512_unpackhi_ps(wt755, wt756);
// Stage 2: merge pairs of stage-1 results. Immediate 68 (0x44) keeps the low
// two elements of each source per 128-bit lane, 238 (0xEE) keeps the high two,
// so every 128-bit lane now holds one element position from four rows.
__m512 tmp19389 = _mm512_shuffle_ps(tmp19373, tmp19375, 68);
__m512 tmp19390 = _mm512_shuffle_ps(tmp19373, tmp19375, 238);
__m512 tmp19391 = _mm512_shuffle_ps(tmp19374, tmp19376, 68);
__m512 tmp19392 = _mm512_shuffle_ps(tmp19374, tmp19376, 238);
__m512 tmp19393 = _mm512_shuffle_ps(tmp19377, tmp19379, 68);
__m512 tmp19394 = _mm512_shuffle_ps(tmp19377, tmp19379, 238);
__m512 tmp19395 = _mm512_shuffle_ps(tmp19378, tmp19380, 68);
__m512 tmp19396 = _mm512_shuffle_ps(tmp19378, tmp19380, 238);
__m512 tmp19397 = _mm512_shuffle_ps(tmp19381, tmp19383, 68);
__m512 tmp19398 = _mm512_shuffle_ps(tmp19381, tmp19383, 238);
__m512 tmp19399 = _mm512_shuffle_ps(tmp19382, tmp19384, 68);
__m512 tmp19400 = _mm512_shuffle_ps(tmp19382, tmp19384, 238);
__m512 tmp19401 = _mm512_shuffle_ps(tmp19385, tmp19387, 68);
__m512 tmp19402 = _mm512_shuffle_ps(tmp19385, tmp19387, 238);
__m512 tmp19403 = _mm512_shuffle_ps(tmp19386, tmp19388, 68);
__m512 tmp19404 = _mm512_shuffle_ps(tmp19386, tmp19388, 238);
// Stage 3: move whole 128-bit lanes. Immediate 136 (0x88) gathers lanes 0 and 2
// of each source, 221 (0xDD) gathers lanes 1 and 3.
__m512 tmp19405 = _mm512_shuffle_f32x4(tmp19389, tmp19393, 136);
__m512 tmp19406 = _mm512_shuffle_f32x4(tmp19389, tmp19393, 221);
__m512 tmp19407 = _mm512_shuffle_f32x4(tmp19390, tmp19394, 136);
__m512 tmp19408 = _mm512_shuffle_f32x4(tmp19390, tmp19394, 221);
__m512 tmp19409 = _mm512_shuffle_f32x4(tmp19391, tmp19395, 136);
__m512 tmp19410 = _mm512_shuffle_f32x4(tmp19391, tmp19395, 221);
__m512 tmp19411 = _mm512_shuffle_f32x4(tmp19392, tmp19396, 136);
__m512 tmp19412 = _mm512_shuffle_f32x4(tmp19392, tmp19396, 221);
__m512 tmp19413 = _mm512_shuffle_f32x4(tmp19397, tmp19401, 136);
__m512 tmp19414 = _mm512_shuffle_f32x4(tmp19397, tmp19401, 221);
__m512 tmp19415 = _mm512_shuffle_f32x4(tmp19398, tmp19402, 136);
__m512 tmp19416 = _mm512_shuffle_f32x4(tmp19398, tmp19402, 221);
__m512 tmp19417 = _mm512_shuffle_f32x4(tmp19399, tmp19403, 136);
__m512 tmp19418 = _mm512_shuffle_f32x4(tmp19399, tmp19403, 221);
__m512 tmp19419 = _mm512_shuffle_f32x4(tmp19400, tmp19404, 136);
__m512 tmp19420 = _mm512_shuffle_f32x4(tmp19400, tmp19404, 221);
// Stage 4: final 128-bit lane shuffle, written back into the row registers.
// Afterwards wt(741+n) holds element n of every loaded row, with lane m taken
// from row m (the row loaded at offset 4096*m).
wt741 = _mm512_shuffle_f32x4(tmp19405, tmp19413, 136);
wt749 = _mm512_shuffle_f32x4(tmp19405, tmp19413, 221);
wt742 = _mm512_shuffle_f32x4(tmp19407, tmp19415, 136);
wt750 = _mm512_shuffle_f32x4(tmp19407, tmp19415, 221);
wt743 = _mm512_shuffle_f32x4(tmp19409, tmp19417, 136);
wt751 = _mm512_shuffle_f32x4(tmp19409, tmp19417, 221);
wt744 = _mm512_shuffle_f32x4(tmp19411, tmp19419, 136);
wt752 = _mm512_shuffle_f32x4(tmp19411, tmp19419, 221);
wt745 = _mm512_shuffle_f32x4(tmp19406, tmp19414, 136);
wt753 = _mm512_shuffle_f32x4(tmp19406, tmp19414, 221);
wt746 = _mm512_shuffle_f32x4(tmp19408, tmp19416, 136);
wt754 = _mm512_shuffle_f32x4(tmp19408, tmp19416, 221);
wt747 = _mm512_shuffle_f32x4(tmp19410, tmp19418, 136);
wt755 = _mm512_shuffle_f32x4(tmp19410, tmp19418, 221);
wt748 = _mm512_shuffle_f32x4(tmp19412, tmp19420, 136);
wt756 = _mm512_shuffle_f32x4(tmp19412, tmp19420, 221);
// Scale each transposed vector lane-wise by postMul70 (set before this excerpt).
wt741 = _mm512_mul_ps(wt741, postMul70);
wt742 = _mm512_mul_ps(wt742, postMul70);
wt743 = _mm512_mul_ps(wt743, postMul70);
wt744 = _mm512_mul_ps(wt744, postMul70);
wt745 = _mm512_mul_ps(wt745, postMul70);
wt746 = _mm512_mul_ps(wt746, postMul70);
wt747 = _mm512_mul_ps(wt747, postMul70);
wt748 = _mm512_mul_ps(wt748, postMul70);
wt749 = _mm512_mul_ps(wt749, postMul70);
wt750 = _mm512_mul_ps(wt750, postMul70);
wt751 = _mm512_mul_ps(wt751, postMul70);
wt752 = _mm512_mul_ps(wt752, postMul70);
wt753 = _mm512_mul_ps(wt753, postMul70);
wt754 = _mm512_mul_ps(wt754, postMul70);
wt755 = _mm512_mul_ps(wt755, postMul70);
wt756 = _mm512_mul_ps(wt756, postMul70);
// Scatter every vector into arranged19 in three masked pieces. Vector number n
// (1..16) uses the 24-wide slot n+16*c56. For any cut30 in 0..6 the three masks
// 63>>cut30, 4032>>cut30 and 65535-(4095>>cut30) are disjoint and together
// cover all 16 lanes.
// Piece 1: lanes 0..(5-cut30), written starting 4*cut30 into the slot.
// NOTE(review): offsets look like bytes (24 = 6 floats per slot); this presumes
// arranged19 is a char-typed pointer -- declaration not in view.
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(1+16*c56)+(ptrdiff_t)0, 63>>cut30, wt741);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(2+16*c56)+(ptrdiff_t)0, 63>>cut30, wt742);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(3+16*c56)+(ptrdiff_t)0, 63>>cut30, wt743);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(4+16*c56)+(ptrdiff_t)0, 63>>cut30, wt744);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(5+16*c56)+(ptrdiff_t)0, 63>>cut30, wt745);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(6+16*c56)+(ptrdiff_t)0, 63>>cut30, wt746);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(7+16*c56)+(ptrdiff_t)0, 63>>cut30, wt747);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(8+16*c56)+(ptrdiff_t)0, 63>>cut30, wt748);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(9+16*c56)+(ptrdiff_t)0, 63>>cut30, wt749);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(10+16*c56)+(ptrdiff_t)0, 63>>cut30, wt750);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(11+16*c56)+(ptrdiff_t)0, 63>>cut30, wt751);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(12+16*c56)+(ptrdiff_t)0, 63>>cut30, wt752);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(13+16*c56)+(ptrdiff_t)0, 63>>cut30, wt753);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(14+16*c56)+(ptrdiff_t)0, 63>>cut30, wt754);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(15+16*c56)+(ptrdiff_t)0, 63>>cut30, wt755);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(16+16*c56)+(ptrdiff_t)0, 63>>cut30, wt756);
// Piece 2: lanes (6-cut30)..(11-cut30). The first lane written sits at
// 4*cut30 + 24576 + 4*(6-cut30) = 24600 past the slot start, which equals the
// l76 multiplier, i.e. the same slot one l76 step further on.
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(1+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt741);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(2+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt742);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(3+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt743);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(4+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt744);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(5+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt745);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(6+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt746);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(7+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt747);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(8+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt748);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(9+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt749);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(10+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt750);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(11+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt751);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(12+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt752);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(13+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt753);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(14+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt754);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(15+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt755);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(16+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt756);
// Piece 3: the remaining lanes (12-cut30)..15. The first lane written sits at
// 4*cut30 + 49152 + 4*(12-cut30) = 49200 = 2*24600 past the slot start.
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(1+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt741);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(2+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt742);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(3+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt743);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(4+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt744);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(5+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt745);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(6+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt746);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(7+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt747);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(8+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt748);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(9+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt749);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(10+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt750);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(11+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt751);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(12+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt752);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(13+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt753);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(14+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt754);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(15+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt755);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(16+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt756);
}
break;
}
default: {
// Default case prologue: fold the bias through the per-lane scale/shift and
// write it to slot 0 of arranged19, then start the c57 loop counter.
// This case fixes the lane offset inside the first 6-wide slot at 4.
cut30 = 4;
// 16 bias values (mask 65535 = all lanes).
__m512 sum627 = _mm512_maskz_loadu_ps(65535, biasPtr21+10240*i67+4*k173);
// Index vectors for _mm512_permutex2var_ps. _mm512_set_epi32 lists lanes from
// 15 down to 0, so lane j of pmMul44 is 2*j and lane j of pmAdd44 is 2*j+1.
__m512i pmMul44 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd44 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// Two 16-float loads from bnPtr21, the second 64 further on.
// NOTE(review): this reads as 32 consecutive floats, i.e. byte offsets on a
// char-typed pointer -- bnPtr21's declaration is not in view, confirm.
__m512 masLo37 = _mm512_loadu_ps(bnPtr21+(ptrdiff_t)8*(k173+2560*i67));
__m512 masHi37 = _mm512_maskz_loadu_ps(65535, bnPtr21+(ptrdiff_t)8*(k173+2560*i67)+(ptrdiff_t)64);
// De-interleave the 32 loaded values: even positions become the multiplier,
// odd positions the addend (indices 16..31 select from masHi37).
__m512 postMul71 = _mm512_permutex2var_ps(masLo37, pmMul44, masHi37);
__m512 postAdd45 = _mm512_permutex2var_ps(masLo37, pmAdd44, masHi37);
// sum627 = bias * postMul71 + postAdd45, lane by lane.
sum627 = _mm512_fmadd_ps(sum627, postMul71, postAdd45);
// Scatter the 16 lanes to slot 0 (24*0) in four masked pieces. With cut30 == 4
// the masks are 0x0003, 0x00FC, 0x3F00 and 0xC000: lanes 0-1, 2-7, 8-13 and
// 14-15, disjoint and covering every lane once. The first lane of pieces 2..4
// lands 24600, 49200 and 73800 past the slot start, i.e. 1, 2 and 3 times the
// l76 multiplier.
// NOTE(review): assumes arranged19 is byte-addressed -- declaration not in view.
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*0+(ptrdiff_t)0, 63>>cut30, sum627);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*0+(ptrdiff_t)24576, 4032>>cut30, sum627);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*0+(ptrdiff_t)49152, 258048>>cut30, sum627);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*0+(ptrdiff_t)73728, 65535-(262143>>cut30), sum627);
// Counter for the loop that follows (runs c57 = 0..63).
ptrdiff_t c57 = 0;
for (; c57 != 64; ++c57) {
// One iteration of the c57 loop: load a 16x16 tile, transpose it, scale it by
// postMul71 and scatter it into arranged19.
// Load 16 rows of 16 floats (mask 65535 = all lanes). Successive rows are 4096
// apart and c57 advances the address by 64.
// NOTE(review): the offsets read like byte offsets (64 bytes = 16 floats),
// which presumes wtPtr21 is a char-typed pointer -- declaration not in view.
__m512 wt757 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)0);
__m512 wt758 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)4096);
__m512 wt759 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)8192);
__m512 wt760 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)12288);
__m512 wt761 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)16384);
__m512 wt762 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)20480);
__m512 wt763 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)24576);
__m512 wt764 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)28672);
__m512 wt765 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)32768);
__m512 wt766 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)36864);
__m512 wt767 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)40960);
__m512 wt768 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)45056);
__m512 wt769 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)49152);
__m512 wt770 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)53248);
__m512 wt771 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)57344);
__m512 wt772 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)61440);
// 16x16 transpose of wt757..wt772.
// Stage 1: interleave the 32-bit elements of adjacent row pairs inside each
// 128-bit lane (unpacklo takes elements 0-1 of the lane, unpackhi elements 2-3).
__m512 tmp19421 = _mm512_unpacklo_ps(wt757, wt758);
__m512 tmp19422 = _mm512_unpackhi_ps(wt757, wt758);
__m512 tmp19423 = _mm512_unpacklo_ps(wt759, wt760);
__m512 tmp19424 = _mm512_unpackhi_ps(wt759, wt760);
__m512 tmp19425 = _mm512_unpacklo_ps(wt761, wt762);
__m512 tmp19426 = _mm512_unpackhi_ps(wt761, wt762);
__m512 tmp19427 = _mm512_unpacklo_ps(wt763, wt764);
__m512 tmp19428 = _mm512_unpackhi_ps(wt763, wt764);
__m512 tmp19429 = _mm512_unpacklo_ps(wt765, wt766);
__m512 tmp19430 = _mm512_unpackhi_ps(wt765, wt766);
__m512 tmp19431 = _mm512_unpacklo_ps(wt767, wt768);
__m512 tmp19432 = _mm512_unpackhi_ps(wt767, wt768);
__m512 tmp19433 = _mm512_unpacklo_ps(wt769, wt770);
__m512 tmp19434 = _mm512_unpackhi_ps(wt769, wt770);
__m512 tmp19435 = _mm512_unpacklo_ps(wt771, wt772);
__m512 tmp19436 = _mm512_unpackhi_ps(wt771, wt772);
// Stage 2: merge pairs of stage-1 results. Immediate 68 (0x44) keeps the low
// two elements of each source per 128-bit lane, 238 (0xEE) keeps the high two,
// so every 128-bit lane now holds one element position from four rows.
__m512 tmp19437 = _mm512_shuffle_ps(tmp19421, tmp19423, 68);
__m512 tmp19438 = _mm512_shuffle_ps(tmp19421, tmp19423, 238);
__m512 tmp19439 = _mm512_shuffle_ps(tmp19422, tmp19424, 68);
__m512 tmp19440 = _mm512_shuffle_ps(tmp19422, tmp19424, 238);
__m512 tmp19441 = _mm512_shuffle_ps(tmp19425, tmp19427, 68);
__m512 tmp19442 = _mm512_shuffle_ps(tmp19425, tmp19427, 238);
__m512 tmp19443 = _mm512_shuffle_ps(tmp19426, tmp19428, 68);
__m512 tmp19444 = _mm512_shuffle_ps(tmp19426, tmp19428, 238);
__m512 tmp19445 = _mm512_shuffle_ps(tmp19429, tmp19431, 68);
__m512 tmp19446 = _mm512_shuffle_ps(tmp19429, tmp19431, 238);
__m512 tmp19447 = _mm512_shuffle_ps(tmp19430, tmp19432, 68);
__m512 tmp19448 = _mm512_shuffle_ps(tmp19430, tmp19432, 238);
__m512 tmp19449 = _mm512_shuffle_ps(tmp19433, tmp19435, 68);
__m512 tmp19450 = _mm512_shuffle_ps(tmp19433, tmp19435, 238);
__m512 tmp19451 = _mm512_shuffle_ps(tmp19434, tmp19436, 68);
__m512 tmp19452 = _mm512_shuffle_ps(tmp19434, tmp19436, 238);
// Stage 3: move whole 128-bit lanes. Immediate 136 (0x88) gathers lanes 0 and 2
// of each source, 221 (0xDD) gathers lanes 1 and 3.
__m512 tmp19453 = _mm512_shuffle_f32x4(tmp19437, tmp19441, 136);
__m512 tmp19454 = _mm512_shuffle_f32x4(tmp19437, tmp19441, 221);
__m512 tmp19455 = _mm512_shuffle_f32x4(tmp19438, tmp19442, 136);
__m512 tmp19456 = _mm512_shuffle_f32x4(tmp19438, tmp19442, 221);
__m512 tmp19457 = _mm512_shuffle_f32x4(tmp19439, tmp19443, 136);
__m512 tmp19458 = _mm512_shuffle_f32x4(tmp19439, tmp19443, 221);
__m512 tmp19459 = _mm512_shuffle_f32x4(tmp19440, tmp19444, 136);
__m512 tmp19460 = _mm512_shuffle_f32x4(tmp19440, tmp19444, 221);
__m512 tmp19461 = _mm512_shuffle_f32x4(tmp19445, tmp19449, 136);
__m512 tmp19462 = _mm512_shuffle_f32x4(tmp19445, tmp19449, 221);
__m512 tmp19463 = _mm512_shuffle_f32x4(tmp19446, tmp19450, 136);
__m512 tmp19464 = _mm512_shuffle_f32x4(tmp19446, tmp19450, 221);
__m512 tmp19465 = _mm512_shuffle_f32x4(tmp19447, tmp19451, 136);
__m512 tmp19466 = _mm512_shuffle_f32x4(tmp19447, tmp19451, 221);
__m512 tmp19467 = _mm512_shuffle_f32x4(tmp19448, tmp19452, 136);
__m512 tmp19468 = _mm512_shuffle_f32x4(tmp19448, tmp19452, 221);
// Stage 4: final 128-bit lane shuffle, written back into the row registers.
// Afterwards wt(757+n) holds element n of every loaded row, with lane m taken
// from row m (the row loaded at offset 4096*m).
wt757 = _mm512_shuffle_f32x4(tmp19453, tmp19461, 136);
wt765 = _mm512_shuffle_f32x4(tmp19453, tmp19461, 221);
wt758 = _mm512_shuffle_f32x4(tmp19455, tmp19463, 136);
wt766 = _mm512_shuffle_f32x4(tmp19455, tmp19463, 221);
wt759 = _mm512_shuffle_f32x4(tmp19457, tmp19465, 136);
wt767 = _mm512_shuffle_f32x4(tmp19457, tmp19465, 221);
wt760 = _mm512_shuffle_f32x4(tmp19459, tmp19467, 136);
wt768 = _mm512_shuffle_f32x4(tmp19459, tmp19467, 221);
wt761 = _mm512_shuffle_f32x4(tmp19454, tmp19462, 136);
wt769 = _mm512_shuffle_f32x4(tmp19454, tmp19462, 221);
wt762 = _mm512_shuffle_f32x4(tmp19456, tmp19464, 136);
wt770 = _mm512_shuffle_f32x4(tmp19456, tmp19464, 221);
wt763 = _mm512_shuffle_f32x4(tmp19458, tmp19466, 136);
wt771 = _mm512_shuffle_f32x4(tmp19458, tmp19466, 221);
wt764 = _mm512_shuffle_f32x4(tmp19460, tmp19468, 136);
wt772 = _mm512_shuffle_f32x4(tmp19460, tmp19468, 221);
// Scale each transposed vector lane-wise by postMul71 (computed before the loop).
wt757 = _mm512_mul_ps(wt757, postMul71);
wt758 = _mm512_mul_ps(wt758, postMul71);
wt759 = _mm512_mul_ps(wt759, postMul71);
wt760 = _mm512_mul_ps(wt760, postMul71);
wt761 = _mm512_mul_ps(wt761, postMul71);
wt762 = _mm512_mul_ps(wt762, postMul71);
wt763 = _mm512_mul_ps(wt763, postMul71);
wt764 = _mm512_mul_ps(wt764, postMul71);
wt765 = _mm512_mul_ps(wt765, postMul71);
wt766 = _mm512_mul_ps(wt766, postMul71);
wt767 = _mm512_mul_ps(wt767, postMul71);
wt768 = _mm512_mul_ps(wt768, postMul71);
wt769 = _mm512_mul_ps(wt769, postMul71);
wt770 = _mm512_mul_ps(wt770, postMul71);
wt771 = _mm512_mul_ps(wt771, postMul71);
wt772 = _mm512_mul_ps(wt772, postMul71);
// Scatter into arranged19. Vector number n (1..16) uses the 24-wide slot
// n+16*c57; slot 0 is written before the loop with sum627.
// Piece 1: with cut30 == 4 (assigned at the top of this case) the mask
// 63>>cut30 is 0x0003, so lanes 0-1 are written starting 16 into the slot.
// NOTE(review): assumes arranged19 is byte-addressed -- declaration not in view.
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(1+16*c57)+(ptrdiff_t)0, 63>>cut30, wt757);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(2+16*c57)+(ptrdiff_t)0, 63>>cut30, wt758);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(3+16*c57)+(ptrdiff_t)0, 63>>cut30, wt759);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(4+16*c57)+(ptrdiff_t)0, 63>>cut30, wt760);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(5+16*c57)+(ptrdiff_t)0, 63>>cut30, wt761);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(6+16*c57)+(ptrdiff_t)0, 63>>cut30, wt762);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(7+16*c57)+(ptrdiff_t)0, 63>>cut30, wt763);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(8+16*c57)+(ptrdiff_t)0, 63>>cut30, wt764);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(9+16*c57)+(ptrdiff_t)0, 63>>cut30, wt765);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(10+16*c57)+(ptrdiff_t)0, 63>>cut30, wt766);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(11+16*c57)+(ptrdiff_t)0, 63>>cut30, wt767);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(12+16*c57)+(ptrdiff_t)0, 63>>cut30, wt768);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(13+16*c57)+(ptrdiff_t)0, 63>>cut30, wt769);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(14+16*c57)+(ptrdiff_t)0, 63>>cut30, wt770);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(15+16*c57)+(ptrdiff_t)0, 63>>cut30, wt771);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(16+16*c57)+(ptrdiff_t)0, 63>>cut30, wt772);
// Piece 2: mask 4032>>cut30 is 0x00FC, i.e. lanes 2-7. The first lane written
// sits at 16 + 24576 + 8 = 24600 past the slot start, which equals the l76
// multiplier. The rest of this group continues below.
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(1+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt757);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(2+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt758);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(3+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt759);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(4+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt760);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(5+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt761);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(6+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt762);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(7+16*c57)+(ptrdiff_t)24576, 4032>&g